diff --git a/src/3rd.git/Simd b/src/3rd.git/Simd deleted file mode 160000 index b4ae8db4..00000000 --- a/src/3rd.git/Simd +++ /dev/null @@ -1 +0,0 @@ -Subproject commit b4ae8db4be881cb8bf4569a4a95407ec47c8c85b diff --git a/src/3rd.git/libwebp b/src/3rd.git/libwebp deleted file mode 160000 index f9b30586..00000000 --- a/src/3rd.git/libwebp +++ /dev/null @@ -1 +0,0 @@ -Subproject commit f9b30586eb2ad4b1466d4b58fb6720a096697752 diff --git a/src/3rd/LibOpenJpeg/LibOpenJpeg32.vcxproj b/src/3rd/LibOpenJpeg/LibOpenJpeg32.vcxproj deleted file mode 100644 index 32a33319..00000000 --- a/src/3rd/LibOpenJpeg/LibOpenJpeg32.vcxproj +++ /dev/null @@ -1,100 +0,0 @@ [~100 deleted lines of MSBuild project XML; the markup was stripped in extraction and only these values survive: Debug|Win32 and Release|Win32 configurations, {1622C4EF-06A4-4DAA-9631-5D71B32858A4}, Win32Proj, LibOpenJpeg32, StaticLibrary, v143, _WINDOWS;_LIB;_CRT_SECURE_NO_DEPRECATE;OPJ_STATIC;%(PreprocessorDefinitions), NoExtensions, Windows, MachineX86] \ No newline at end of file diff --git a/src/3rd/LibOpenJpeg/LibOpenJpeg64.vcxproj b/src/3rd/LibOpenJpeg/LibOpenJpeg64.vcxproj deleted file mode 100644 index 6ab8ebc4..00000000 --- a/src/3rd/LibOpenJpeg/LibOpenJpeg64.vcxproj +++ /dev/null @@ -1,100 +0,0 @@ [~100 deleted lines of MSBuild project XML; the markup was stripped in extraction and only these values survive: Debug|x64 and Release|x64 configurations, {B6080B9D-6F8C-417C-AF11-9853C47BA665}, Win32Proj, LibOpenJpeg64, StaticLibrary, v143, _WINDOWS;_LIB;_CRT_SECURE_NO_DEPRECATE;OPJ_STATIC;%(PreprocessorDefinitions), NotSet, Windows, MachineX64] \ No newline at end of file diff --git a/src/3rd/LibOpenJpeg/bench_dwt.c b/src/3rd/LibOpenJpeg/bench_dwt.c deleted file mode 100644 index 8cb64d06..00000000 --- a/src/3rd/LibOpenJpeg/bench_dwt.c +++ /dev/null @@ -1,273 +0,0 @@ -/* - * The copyright in this software is being made available under the 2-clauses - * BSD License, included below. This software may be subject to other third - * party and contributor rights, including patent rights, and no such rights - * are granted under this license. - * - * Copyright (c) 2017, IntoPix SA - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS' - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#include "opj_includes.h" - -#ifdef _WIN32 -#include <windows.h> -#else -#include <sys/time.h> -#include <sys/resource.h> -#include <sys/times.h> -#endif /* _WIN32 */ - -OPJ_INT32 getValue(OPJ_UINT32 i) -{ - return ((OPJ_INT32)i % 511) - 256; -} - -void init_tilec(opj_tcd_tilecomp_t * l_tilec, - OPJ_INT32 x0, - OPJ_INT32 y0, - OPJ_INT32 x1, - OPJ_INT32 y1, - OPJ_UINT32 numresolutions) -{ - opj_tcd_resolution_t* l_res; - OPJ_UINT32 resno, l_level_no; - size_t i, nValues; - - memset(l_tilec, 0, sizeof(*l_tilec)); - l_tilec->x0 = x0; - l_tilec->y0 = y0; - l_tilec->x1 = x1; - l_tilec->y1 = y1; - nValues = (size_t)(l_tilec->x1 - l_tilec->x0) * - (size_t)(l_tilec->y1 - l_tilec->y0); - l_tilec->data = (OPJ_INT32*) opj_malloc(sizeof(OPJ_INT32) * nValues); - for (i = 0; i < nValues; i++) { - l_tilec->data[i] = getValue((OPJ_UINT32)i); - } - l_tilec->numresolutions = numresolutions; - l_tilec->resolutions = (opj_tcd_resolution_t*) opj_calloc( - l_tilec->numresolutions, - sizeof(opj_tcd_resolution_t)); - - l_level_no = l_tilec->numresolutions; - l_res = l_tilec->resolutions; - - /* Adapted from opj_tcd_init_tile() */ - for (resno = 0; resno < l_tilec->numresolutions; ++resno) { - - --l_level_no; - - /* border for each resolution level (global) */ - l_res->x0 = opj_int_ceildivpow2(l_tilec->x0, (OPJ_INT32)l_level_no); - l_res->y0 = opj_int_ceildivpow2(l_tilec->y0, (OPJ_INT32)l_level_no); - l_res->x1 = opj_int_ceildivpow2(l_tilec->x1, (OPJ_INT32)l_level_no); - l_res->y1 = opj_int_ceildivpow2(l_tilec->y1, (OPJ_INT32)l_level_no); - - ++l_res; - } -} - -void free_tilec(opj_tcd_tilecomp_t * l_tilec) -{ - opj_free(l_tilec->data); - opj_free(l_tilec->resolutions); -} - -void usage(void) -{ - printf( - "bench_dwt [-size value] [-check] [-display] [-num_resolutions val]\n"); - printf( - " [-offset x y] [-num_threads val]\n"); - exit(1); -} - - -OPJ_FLOAT64 opj_clock(void) -{ -#ifdef _WIN32 - /* _WIN32: use QueryPerformance (very accurate) */ - LARGE_INTEGER freq, t ; - /* freq is the clock speed of the CPU */ - QueryPerformanceFrequency(&freq) ; - /* cout << "freq = " << ((double) freq.QuadPart) << endl; */ - /* t is the high resolution performance counter (see MSDN) */ - QueryPerformanceCounter(& t) ; - return freq.QuadPart ? (t.QuadPart / (OPJ_FLOAT64) freq.QuadPart) : 0 ; -#else - /* Unix or Linux: use resource usage */ - struct rusage t; - OPJ_FLOAT64 procTime; - /* (1) Get the rusage data structure at this moment (man getrusage) */ - getrusage(0, &t); - /* (2) What is the elapsed time ? - CPU time = User time + System time */ - /* (2a) Get the seconds */ - procTime = (OPJ_FLOAT64)(t.ru_utime.tv_sec + t.ru_stime.tv_sec); - /* (2b) More precisely! Get the microseconds part ! 
*/ - return (procTime + (OPJ_FLOAT64)(t.ru_utime.tv_usec + t.ru_stime.tv_usec) * - 1e-6) ; -#endif -} - -int main(int argc, char** argv) -{ - int num_threads = 0; - opj_tcd_t tcd; - opj_tcd_image_t tcd_image; - opj_tcd_tile_t tcd_tile; - opj_tcd_tilecomp_t tilec; - opj_image_t image; - opj_image_comp_t image_comp; - opj_thread_pool_t* tp; - OPJ_INT32 i, j, k; - OPJ_BOOL display = OPJ_FALSE; - OPJ_BOOL check = OPJ_FALSE; - OPJ_INT32 size = 16384 - 1; - OPJ_FLOAT64 start, stop; - OPJ_UINT32 offset_x = ((OPJ_UINT32)size + 1) / 2 - 1; - OPJ_UINT32 offset_y = ((OPJ_UINT32)size + 1) / 2 - 1; - OPJ_UINT32 num_resolutions = 6; - - for (i = 1; i < argc; i++) { - if (strcmp(argv[i], "-display") == 0) { - display = OPJ_TRUE; - check = OPJ_TRUE; - } else if (strcmp(argv[i], "-check") == 0) { - check = OPJ_TRUE; - } else if (strcmp(argv[i], "-size") == 0 && i + 1 < argc) { - size = atoi(argv[i + 1]); - i ++; - } else if (strcmp(argv[i], "-num_threads") == 0 && i + 1 < argc) { - num_threads = atoi(argv[i + 1]); - i ++; - } else if (strcmp(argv[i], "-num_resolutions") == 0 && i + 1 < argc) { - num_resolutions = (OPJ_UINT32)atoi(argv[i + 1]); - if (num_resolutions == 0 || num_resolutions > 32) { - fprintf(stderr, - "Invalid value for num_resolutions. Should be >= 1 and <= 32\n"); - exit(1); - } - i ++; - } else if (strcmp(argv[i], "-offset") == 0 && i + 2 < argc) { - offset_x = (OPJ_UINT32)atoi(argv[i + 1]); - offset_y = (OPJ_UINT32)atoi(argv[i + 2]); - i += 2; - } else { - usage(); - } - } - - tp = opj_thread_pool_create(num_threads); - - init_tilec(&tilec, (OPJ_INT32)offset_x, (OPJ_INT32)offset_y, - (OPJ_INT32)offset_x + size, (OPJ_INT32)offset_y + size, - num_resolutions); - - if (display) { - printf("Before\n"); - k = 0; - for (j = 0; j < tilec.y1 - tilec.y0; j++) { - for (i = 0; i < tilec.x1 - tilec.x0; i++) { - printf("%d ", tilec.data[k]); - k ++; - } - printf("\n"); - } - } - - memset(&tcd, 0, sizeof(tcd)); - tcd.thread_pool = tp; - tcd.whole_tile_decoding = OPJ_TRUE; - tcd.win_x0 = (OPJ_UINT32)tilec.x0; - tcd.win_y0 = (OPJ_UINT32)tilec.y0; - tcd.win_x1 = (OPJ_UINT32)tilec.x1; - tcd.win_y1 = (OPJ_UINT32)tilec.y1; - tcd.tcd_image = &tcd_image; - memset(&tcd_image, 0, sizeof(tcd_image)); - tcd_image.tiles = &tcd_tile; - memset(&tcd_tile, 0, sizeof(tcd_tile)); - tcd_tile.x0 = tilec.x0; - tcd_tile.y0 = tilec.y0; - tcd_tile.x1 = tilec.x1; - tcd_tile.y1 = tilec.y1; - tcd_tile.numcomps = 1; - tcd_tile.comps = &tilec; - tcd.image = ℑ - memset(&image, 0, sizeof(image)); - image.numcomps = 1; - image.comps = &image_comp; - memset(&image_comp, 0, sizeof(image_comp)); - image_comp.dx = 1; - image_comp.dy = 1; - - start = opj_clock(); - opj_dwt_decode(&tcd, &tilec, tilec.numresolutions); - stop = opj_clock(); - printf("time for dwt_decode: %.03f s\n", stop - start); - - if (display || check) { - if (display) { - printf("After IDWT\n"); - k = 0; - for (j = 0; j < tilec.y1 - tilec.y0; j++) { - for (i = 0; i < tilec.x1 - tilec.x0; i++) { - printf("%d ", tilec.data[k]); - k ++; - } - printf("\n"); - } - } - - opj_dwt_encode(&tilec); - if (display) { - printf("After FDWT\n"); - k = 0; - for (j = 0; j < tilec.y1 - tilec.y0; j++) { - for (i = 0; i < tilec.x1 - tilec.x0; i++) { - printf("%d ", tilec.data[k]); - k ++; - } - printf("\n"); - } - } - - if (check) { - size_t idx; - size_t nValues = (size_t)(tilec.x1 - tilec.x0) * - (size_t)(tilec.y1 - tilec.y0); - for (idx = 0; idx < nValues; idx++) { - if (tilec.data[idx] != getValue((OPJ_UINT32)idx)) { - printf("Difference found at idx = %u\n", (OPJ_UINT32)idx); - 
exit(1); - } - } - } - } - - free_tilec(&tilec); - - opj_thread_pool_destroy(tp); - return 0; -} diff --git a/src/3rd/LibOpenJpeg/bio.c b/src/3rd/LibOpenJpeg/bio.c deleted file mode 100644 index 09dcd7f5..00000000 --- a/src/3rd/LibOpenJpeg/bio.c +++ /dev/null @@ -1,217 +0,0 @@ -/* - * The copyright in this software is being made available under the 2-clauses - * BSD License, included below. This software may be subject to other third - * party and contributor rights, including patent rights, and no such rights - * are granted under this license. - * - * Copyright (c) 2002-2014, Universite catholique de Louvain (UCL), Belgium - * Copyright (c) 2002-2014, Professor Benoit Macq - * Copyright (c) 2001-2003, David Janssens - * Copyright (c) 2002-2003, Yannick Verschueren - * Copyright (c) 2003-2007, Francois-Olivier Devaux - * Copyright (c) 2003-2014, Antonin Descampe - * Copyright (c) 2005, Herve Drolon, FreeImage Team - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS' - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#include "opj_includes.h" - -/** @defgroup BIO BIO - Individual bit input-output stream */ -/*@{*/ - -/** @name Local static functions */ -/*@{*/ - -/** -Write a bit -@param bio BIO handle -@param b Bit to write (0 or 1) -*/ -static void opj_bio_putbit(opj_bio_t *bio, OPJ_UINT32 b); -/** -Read a bit -@param bio BIO handle -@return Returns the read bit -*/ -static OPJ_UINT32 opj_bio_getbit(opj_bio_t *bio); -/** -Write a byte -@param bio BIO handle -@return Returns OPJ_TRUE if successful, returns OPJ_FALSE otherwise -*/ -static OPJ_BOOL opj_bio_byteout(opj_bio_t *bio); -/** -Read a byte -@param bio BIO handle -@return Returns OPJ_TRUE if successful, returns OPJ_FALSE otherwise -*/ -static OPJ_BOOL opj_bio_bytein(opj_bio_t *bio); - -/*@}*/ - -/*@}*/ - -/* -========================================================== - local functions -========================================================== -*/ - -static OPJ_BOOL opj_bio_byteout(opj_bio_t *bio) -{ - bio->buf = (bio->buf << 8) & 0xffff; - bio->ct = bio->buf == 0xff00 ? 
7 : 8; - if ((OPJ_SIZE_T)bio->bp >= (OPJ_SIZE_T)bio->end) { - return OPJ_FALSE; - } - *bio->bp++ = (OPJ_BYTE)(bio->buf >> 8); - return OPJ_TRUE; -} - -static OPJ_BOOL opj_bio_bytein(opj_bio_t *bio) -{ - bio->buf = (bio->buf << 8) & 0xffff; - bio->ct = bio->buf == 0xff00 ? 7 : 8; - if ((OPJ_SIZE_T)bio->bp >= (OPJ_SIZE_T)bio->end) { - return OPJ_FALSE; - } - bio->buf |= *bio->bp++; - return OPJ_TRUE; -} - -static void opj_bio_putbit(opj_bio_t *bio, OPJ_UINT32 b) -{ - if (bio->ct == 0) { - opj_bio_byteout( - bio); /* MSD: why not check the return value of this function ? */ - } - bio->ct--; - bio->buf |= b << bio->ct; -} - -static OPJ_UINT32 opj_bio_getbit(opj_bio_t *bio) -{ - if (bio->ct == 0) { - opj_bio_bytein( - bio); /* MSD: why not check the return value of this function ? */ - } - bio->ct--; - return (bio->buf >> bio->ct) & 1; -} - -/* -========================================================== - Bit Input/Output interface -========================================================== -*/ - -opj_bio_t* opj_bio_create(void) -{ - opj_bio_t *bio = (opj_bio_t*)opj_malloc(sizeof(opj_bio_t)); - return bio; -} - -void opj_bio_destroy(opj_bio_t *bio) -{ - if (bio) { - opj_free(bio); - } -} - -ptrdiff_t opj_bio_numbytes(opj_bio_t *bio) -{ - return (bio->bp - bio->start); -} - -void opj_bio_init_enc(opj_bio_t *bio, OPJ_BYTE *bp, OPJ_UINT32 len) -{ - bio->start = bp; - bio->end = bp + len; - bio->bp = bp; - bio->buf = 0; - bio->ct = 8; -} - -void opj_bio_init_dec(opj_bio_t *bio, OPJ_BYTE *bp, OPJ_UINT32 len) -{ - bio->start = bp; - bio->end = bp + len; - bio->bp = bp; - bio->buf = 0; - bio->ct = 0; -} - -void opj_bio_write(opj_bio_t *bio, OPJ_UINT32 v, OPJ_UINT32 n) -{ - OPJ_INT32 i; - - assert((n > 0U) && (n <= 32U)); - for (i = (OPJ_INT32)n - 1; i >= 0; i--) { - opj_bio_putbit(bio, (v >> i) & 1); - } -} - -OPJ_UINT32 opj_bio_read(opj_bio_t *bio, OPJ_UINT32 n) -{ - OPJ_INT32 i; - OPJ_UINT32 v; - - assert((n > 0U) /* && (n <= 32U)*/); -#ifdef OPJ_UBSAN_BUILD - /* This assert fails for some corrupted images which are gracefully rejected */ - /* Add this assert only for ubsan build. */ - /* This is the condition for overflow not to occur below which is needed because of OPJ_NOSANITIZE */ - assert(n <= 32U); -#endif - v = 0U; - for (i = (OPJ_INT32)n - 1; i >= 0; i--) { - v |= opj_bio_getbit(bio) << - i; /* can't overflow, opj_bio_getbit returns 0 or 1 */ - } - return v; -} - -OPJ_BOOL opj_bio_flush(opj_bio_t *bio) -{ - if (! opj_bio_byteout(bio)) { - return OPJ_FALSE; - } - if (bio->ct == 7) { - if (! opj_bio_byteout(bio)) { - return OPJ_FALSE; - } - } - return OPJ_TRUE; -} - -OPJ_BOOL opj_bio_inalign(opj_bio_t *bio) -{ - if ((bio->buf & 0xff) == 0xff) { - if (! opj_bio_bytein(bio)) { - return OPJ_FALSE; - } - } - bio->ct = 0; - return OPJ_TRUE; -} diff --git a/src/3rd/LibOpenJpeg/bio.h b/src/3rd/LibOpenJpeg/bio.h deleted file mode 100644 index 448fdda2..00000000 --- a/src/3rd/LibOpenJpeg/bio.h +++ /dev/null @@ -1,134 +0,0 @@ -/* - * The copyright in this software is being made available under the 2-clauses - * BSD License, included below. This software may be subject to other third - * party and contributor rights, including patent rights, and no such rights - * are granted under this license. 
- * - * Copyright (c) 2002-2014, Universite catholique de Louvain (UCL), Belgium - * Copyright (c) 2002-2014, Professor Benoit Macq - * Copyright (c) 2001-2003, David Janssens - * Copyright (c) 2002-2003, Yannick Verschueren - * Copyright (c) 2003-2007, Francois-Olivier Devaux - * Copyright (c) 2003-2014, Antonin Descampe - * Copyright (c) 2005, Herve Drolon, FreeImage Team - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS' - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef OPJ_BIO_H -#define OPJ_BIO_H - -#include <stddef.h> /* ptrdiff_t */ - -/** -@file bio.h -@brief Implementation of an individual bit input-output (BIO) - -The functions in BIO.C have for goal to realize an individual bit input - output. -*/ - -/** @defgroup BIO BIO - Individual bit input-output stream */ -/*@{*/ - -/** -Individual bit input-output stream (BIO) -*/ -typedef struct opj_bio { - /** pointer to the start of the buffer */ - OPJ_BYTE *start; - /** pointer to the end of the buffer */ - OPJ_BYTE *end; - /** pointer to the present position in the buffer */ - OPJ_BYTE *bp; - /** temporary place where each byte is read or written */ - OPJ_UINT32 buf; - /** coder : number of bits free to write. decoder : number of bits read */ - OPJ_UINT32 ct; -} opj_bio_t; - -/** @name Exported functions */ -/*@{*/ -/* ----------------------------------------------------------------------- */ -/** -Create a new BIO handle -@return Returns a new BIO handle if successful, returns NULL otherwise -*/ -opj_bio_t* opj_bio_create(void); -/** -Destroy a previously created BIO handle -@param bio BIO handle to destroy -*/ -void opj_bio_destroy(opj_bio_t *bio); -/** -Number of bytes written. 
-@param bio BIO handle -@return Returns the number of bytes written -*/ -ptrdiff_t opj_bio_numbytes(opj_bio_t *bio); -/** -Init encoder -@param bio BIO handle -@param bp Output buffer -@param len Output buffer length -*/ -void opj_bio_init_enc(opj_bio_t *bio, OPJ_BYTE *bp, OPJ_UINT32 len); -/** -Init decoder -@param bio BIO handle -@param bp Input buffer -@param len Input buffer length -*/ -void opj_bio_init_dec(opj_bio_t *bio, OPJ_BYTE *bp, OPJ_UINT32 len); -/** -Write bits -@param bio BIO handle -@param v Value of bits -@param n Number of bits to write -*/ -void opj_bio_write(opj_bio_t *bio, OPJ_UINT32 v, OPJ_UINT32 n); -/** -Read bits -@param bio BIO handle -@param n Number of bits to read -@return Returns the corresponding read number -*/ -OPJ_UINT32 opj_bio_read(opj_bio_t *bio, OPJ_UINT32 n); -/** -Flush bits -@param bio BIO handle -@return Returns OPJ_TRUE if successful, returns OPJ_FALSE otherwise -*/ -OPJ_BOOL opj_bio_flush(opj_bio_t *bio); -/** -Passes the ending bits (coming from flushing) -@param bio BIO handle -@return Returns OPJ_TRUE if successful, returns OPJ_FALSE otherwise -*/ -OPJ_BOOL opj_bio_inalign(opj_bio_t *bio); -/* ----------------------------------------------------------------------- */ -/*@}*/ - -/*@}*/ - -#endif /* OPJ_BIO_H */ - diff --git a/src/3rd/LibOpenJpeg/cio.c b/src/3rd/LibOpenJpeg/cio.c deleted file mode 100644 index 4fde9fe2..00000000 --- a/src/3rd/LibOpenJpeg/cio.c +++ /dev/null @@ -1,683 +0,0 @@ -/* - * The copyright in this software is being made available under the 2-clauses - * BSD License, included below. This software may be subject to other third - * party and contributor rights, including patent rights, and no such rights - * are granted under this license. - * - * Copyright (c) 2002-2014, Universite catholique de Louvain (UCL), Belgium - * Copyright (c) 2002-2014, Professor Benoit Macq - * Copyright (c) 2001-2003, David Janssens - * Copyright (c) 2002-2003, Yannick Verschueren - * Copyright (c) 2003-2007, Francois-Olivier Devaux - * Copyright (c) 2003-2014, Antonin Descampe - * Copyright (c) 2005, Herve Drolon, FreeImage Team - * Copyright (c) 2008, 2011-2012, Centre National d'Etudes Spatiales (CNES), FR - * Copyright (c) 2012, CS Systemes d'Information, France - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS' - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#include "opj_includes.h" - -/* ----------------------------------------------------------------------- */ - - -/* ----------------------------------------------------------------------- */ - -void opj_write_bytes_BE(OPJ_BYTE * p_buffer, OPJ_UINT32 p_value, - OPJ_UINT32 p_nb_bytes) -{ - const OPJ_BYTE * l_data_ptr = ((const OPJ_BYTE *) &p_value) + sizeof( - OPJ_UINT32) - p_nb_bytes; - - assert(p_nb_bytes > 0 && p_nb_bytes <= sizeof(OPJ_UINT32)); - - memcpy(p_buffer, l_data_ptr, p_nb_bytes); -} - -void opj_write_bytes_LE(OPJ_BYTE * p_buffer, OPJ_UINT32 p_value, - OPJ_UINT32 p_nb_bytes) -{ - const OPJ_BYTE * l_data_ptr = ((const OPJ_BYTE *) &p_value) + p_nb_bytes - 1; - OPJ_UINT32 i; - - assert(p_nb_bytes > 0 && p_nb_bytes <= sizeof(OPJ_UINT32)); - - for (i = 0; i < p_nb_bytes; ++i) { - *(p_buffer++) = *(l_data_ptr--); - } -} - -void opj_read_bytes_BE(const OPJ_BYTE * p_buffer, OPJ_UINT32 * p_value, - OPJ_UINT32 p_nb_bytes) -{ - OPJ_BYTE * l_data_ptr = ((OPJ_BYTE *) p_value); - - assert(p_nb_bytes > 0 && p_nb_bytes <= sizeof(OPJ_UINT32)); - - *p_value = 0; - memcpy(l_data_ptr + sizeof(OPJ_UINT32) - p_nb_bytes, p_buffer, p_nb_bytes); -} - -void opj_read_bytes_LE(const OPJ_BYTE * p_buffer, OPJ_UINT32 * p_value, - OPJ_UINT32 p_nb_bytes) -{ - OPJ_BYTE * l_data_ptr = ((OPJ_BYTE *) p_value) + p_nb_bytes - 1; - OPJ_UINT32 i; - - assert(p_nb_bytes > 0 && p_nb_bytes <= sizeof(OPJ_UINT32)); - - *p_value = 0; - for (i = 0; i < p_nb_bytes; ++i) { - *(l_data_ptr--) = *(p_buffer++); - } -} - -void opj_write_double_BE(OPJ_BYTE * p_buffer, OPJ_FLOAT64 p_value) -{ - const OPJ_BYTE * l_data_ptr = ((const OPJ_BYTE *) &p_value); - memcpy(p_buffer, l_data_ptr, sizeof(OPJ_FLOAT64)); -} - -void opj_write_double_LE(OPJ_BYTE * p_buffer, OPJ_FLOAT64 p_value) -{ - const OPJ_BYTE * l_data_ptr = ((const OPJ_BYTE *) &p_value) + sizeof( - OPJ_FLOAT64) - 1; - OPJ_UINT32 i; - for (i = 0; i < sizeof(OPJ_FLOAT64); ++i) { - *(p_buffer++) = *(l_data_ptr--); - } -} - -void opj_read_double_BE(const OPJ_BYTE * p_buffer, OPJ_FLOAT64 * p_value) -{ - OPJ_BYTE * l_data_ptr = ((OPJ_BYTE *) p_value); - memcpy(l_data_ptr, p_buffer, sizeof(OPJ_FLOAT64)); -} - -void opj_read_double_LE(const OPJ_BYTE * p_buffer, OPJ_FLOAT64 * p_value) -{ - OPJ_BYTE * l_data_ptr = ((OPJ_BYTE *) p_value) + sizeof(OPJ_FLOAT64) - 1; - OPJ_UINT32 i; - for (i = 0; i < sizeof(OPJ_FLOAT64); ++i) { - *(l_data_ptr--) = *(p_buffer++); - } -} - -void opj_write_float_BE(OPJ_BYTE * p_buffer, OPJ_FLOAT32 p_value) -{ - const OPJ_BYTE * l_data_ptr = ((const OPJ_BYTE *) &p_value); - memcpy(p_buffer, l_data_ptr, sizeof(OPJ_FLOAT32)); -} - -void opj_write_float_LE(OPJ_BYTE * p_buffer, OPJ_FLOAT32 p_value) -{ - const OPJ_BYTE * l_data_ptr = ((const OPJ_BYTE *) &p_value) + sizeof( - OPJ_FLOAT32) - 1; - OPJ_UINT32 i; - for (i = 0; i < sizeof(OPJ_FLOAT32); ++i) { - *(p_buffer++) = *(l_data_ptr--); - } -} - -void opj_read_float_BE(const OPJ_BYTE * p_buffer, OPJ_FLOAT32 * p_value) -{ - OPJ_BYTE * l_data_ptr = ((OPJ_BYTE *) p_value); - 
memcpy(l_data_ptr, p_buffer, sizeof(OPJ_FLOAT32)); -} - -void opj_read_float_LE(const OPJ_BYTE * p_buffer, OPJ_FLOAT32 * p_value) -{ - OPJ_BYTE * l_data_ptr = ((OPJ_BYTE *) p_value) + sizeof(OPJ_FLOAT32) - 1; - OPJ_UINT32 i; - for (i = 0; i < sizeof(OPJ_FLOAT32); ++i) { - *(l_data_ptr--) = *(p_buffer++); - } -} - -opj_stream_t* OPJ_CALLCONV opj_stream_create(OPJ_SIZE_T p_buffer_size, - OPJ_BOOL l_is_input) -{ - opj_stream_private_t * l_stream = 00; - l_stream = (opj_stream_private_t*) opj_calloc(1, sizeof(opj_stream_private_t)); - if (! l_stream) { - return 00; - } - - l_stream->m_buffer_size = p_buffer_size; - l_stream->m_stored_data = (OPJ_BYTE *) opj_malloc(p_buffer_size); - if (! l_stream->m_stored_data) { - opj_free(l_stream); - return 00; - } - - l_stream->m_current_data = l_stream->m_stored_data; - - if (l_is_input) { - l_stream->m_status |= OPJ_STREAM_STATUS_INPUT; - l_stream->m_opj_skip = opj_stream_read_skip; - l_stream->m_opj_seek = opj_stream_read_seek; - } else { - l_stream->m_status |= OPJ_STREAM_STATUS_OUTPUT; - l_stream->m_opj_skip = opj_stream_write_skip; - l_stream->m_opj_seek = opj_stream_write_seek; - } - - l_stream->m_read_fn = opj_stream_default_read; - l_stream->m_write_fn = opj_stream_default_write; - l_stream->m_skip_fn = opj_stream_default_skip; - l_stream->m_seek_fn = opj_stream_default_seek; - - return (opj_stream_t *) l_stream; -} - -opj_stream_t* OPJ_CALLCONV opj_stream_default_create(OPJ_BOOL l_is_input) -{ - return opj_stream_create(OPJ_J2K_STREAM_CHUNK_SIZE, l_is_input); -} - -void OPJ_CALLCONV opj_stream_destroy(opj_stream_t* p_stream) -{ - opj_stream_private_t* l_stream = (opj_stream_private_t*) p_stream; - - if (l_stream) { - if (l_stream->m_free_user_data_fn) { - l_stream->m_free_user_data_fn(l_stream->m_user_data); - } - opj_free(l_stream->m_stored_data); - l_stream->m_stored_data = 00; - opj_free(l_stream); - } -} - -void OPJ_CALLCONV opj_stream_set_read_function(opj_stream_t* p_stream, - opj_stream_read_fn p_function) -{ - opj_stream_private_t* l_stream = (opj_stream_private_t*) p_stream; - - if ((!l_stream) || (!(l_stream->m_status & OPJ_STREAM_STATUS_INPUT))) { - return; - } - - l_stream->m_read_fn = p_function; -} - -void OPJ_CALLCONV opj_stream_set_seek_function(opj_stream_t* p_stream, - opj_stream_seek_fn p_function) -{ - opj_stream_private_t* l_stream = (opj_stream_private_t*) p_stream; - - if (!l_stream) { - return; - } - l_stream->m_seek_fn = p_function; -} - -void OPJ_CALLCONV opj_stream_set_write_function(opj_stream_t* p_stream, - opj_stream_write_fn p_function) -{ - opj_stream_private_t* l_stream = (opj_stream_private_t*) p_stream; - - if ((!l_stream) || (!(l_stream->m_status & OPJ_STREAM_STATUS_OUTPUT))) { - return; - } - - l_stream->m_write_fn = p_function; -} - -void OPJ_CALLCONV opj_stream_set_skip_function(opj_stream_t* p_stream, - opj_stream_skip_fn p_function) -{ - opj_stream_private_t* l_stream = (opj_stream_private_t*) p_stream; - - if (! 
l_stream) { - return; - } - - l_stream->m_skip_fn = p_function; -} - -void OPJ_CALLCONV opj_stream_set_user_data(opj_stream_t* p_stream, - void * p_data, opj_stream_free_user_data_fn p_function) -{ - opj_stream_private_t* l_stream = (opj_stream_private_t*) p_stream; - if (!l_stream) { - return; - } - l_stream->m_user_data = p_data; - l_stream->m_free_user_data_fn = p_function; -} - -void OPJ_CALLCONV opj_stream_set_user_data_length(opj_stream_t* p_stream, - OPJ_UINT64 data_length) -{ - opj_stream_private_t* l_stream = (opj_stream_private_t*) p_stream; - if (!l_stream) { - return; - } - l_stream->m_user_data_length = data_length; -} - -OPJ_SIZE_T opj_stream_read_data(opj_stream_private_t * p_stream, - OPJ_BYTE * p_buffer, OPJ_SIZE_T p_size, opj_event_mgr_t * p_event_mgr) -{ - OPJ_SIZE_T l_read_nb_bytes = 0; - if (p_stream->m_bytes_in_buffer >= p_size) { - memcpy(p_buffer, p_stream->m_current_data, p_size); - p_stream->m_current_data += p_size; - p_stream->m_bytes_in_buffer -= p_size; - l_read_nb_bytes += p_size; - p_stream->m_byte_offset += (OPJ_OFF_T)p_size; - return l_read_nb_bytes; - } - - /* we are now in the case when the remaining data if not sufficient */ - if (p_stream->m_status & OPJ_STREAM_STATUS_END) { - l_read_nb_bytes += p_stream->m_bytes_in_buffer; - memcpy(p_buffer, p_stream->m_current_data, p_stream->m_bytes_in_buffer); - p_stream->m_current_data += p_stream->m_bytes_in_buffer; - p_stream->m_byte_offset += (OPJ_OFF_T)p_stream->m_bytes_in_buffer; - p_stream->m_bytes_in_buffer = 0; - return l_read_nb_bytes ? l_read_nb_bytes : (OPJ_SIZE_T) - 1; - } - - /* the flag is not set, we copy data and then do an actual read on the stream */ - if (p_stream->m_bytes_in_buffer) { - l_read_nb_bytes += p_stream->m_bytes_in_buffer; - memcpy(p_buffer, p_stream->m_current_data, p_stream->m_bytes_in_buffer); - p_stream->m_current_data = p_stream->m_stored_data; - p_buffer += p_stream->m_bytes_in_buffer; - p_size -= p_stream->m_bytes_in_buffer; - p_stream->m_byte_offset += (OPJ_OFF_T)p_stream->m_bytes_in_buffer; - p_stream->m_bytes_in_buffer = 0; - } else { - /* case where we are already at the end of the buffer - so reset the m_current_data to point to the start of the - stored buffer to get ready to read from disk*/ - p_stream->m_current_data = p_stream->m_stored_data; - } - - for (;;) { - /* we should read less than a chunk -> read a chunk */ - if (p_size < p_stream->m_buffer_size) { - /* we should do an actual read on the media */ - p_stream->m_bytes_in_buffer = p_stream->m_read_fn(p_stream->m_stored_data, - p_stream->m_buffer_size, p_stream->m_user_data); - - if (p_stream->m_bytes_in_buffer == (OPJ_SIZE_T) - 1) { - /* end of stream */ - opj_event_msg(p_event_mgr, EVT_INFO, "Stream reached its end !\n"); - - p_stream->m_bytes_in_buffer = 0; - p_stream->m_status |= OPJ_STREAM_STATUS_END; - /* end of stream */ - return l_read_nb_bytes ? 
l_read_nb_bytes : (OPJ_SIZE_T) - 1; - } else if (p_stream->m_bytes_in_buffer < p_size) { - /* not enough data */ - l_read_nb_bytes += p_stream->m_bytes_in_buffer; - memcpy(p_buffer, p_stream->m_current_data, p_stream->m_bytes_in_buffer); - p_stream->m_current_data = p_stream->m_stored_data; - p_buffer += p_stream->m_bytes_in_buffer; - p_size -= p_stream->m_bytes_in_buffer; - p_stream->m_byte_offset += (OPJ_OFF_T)p_stream->m_bytes_in_buffer; - p_stream->m_bytes_in_buffer = 0; - } else { - l_read_nb_bytes += p_size; - memcpy(p_buffer, p_stream->m_current_data, p_size); - p_stream->m_current_data += p_size; - p_stream->m_bytes_in_buffer -= p_size; - p_stream->m_byte_offset += (OPJ_OFF_T)p_size; - return l_read_nb_bytes; - } - } else { - /* direct read on the dest buffer */ - p_stream->m_bytes_in_buffer = p_stream->m_read_fn(p_buffer, p_size, - p_stream->m_user_data); - - if (p_stream->m_bytes_in_buffer == (OPJ_SIZE_T) - 1) { - /* end of stream */ - opj_event_msg(p_event_mgr, EVT_INFO, "Stream reached its end !\n"); - - p_stream->m_bytes_in_buffer = 0; - p_stream->m_status |= OPJ_STREAM_STATUS_END; - /* end of stream */ - return l_read_nb_bytes ? l_read_nb_bytes : (OPJ_SIZE_T) - 1; - } else if (p_stream->m_bytes_in_buffer < p_size) { - /* not enough data */ - l_read_nb_bytes += p_stream->m_bytes_in_buffer; - p_stream->m_current_data = p_stream->m_stored_data; - p_buffer += p_stream->m_bytes_in_buffer; - p_size -= p_stream->m_bytes_in_buffer; - p_stream->m_byte_offset += (OPJ_OFF_T)p_stream->m_bytes_in_buffer; - p_stream->m_bytes_in_buffer = 0; - } else { - /* we have read the exact size */ - l_read_nb_bytes += p_stream->m_bytes_in_buffer; - p_stream->m_byte_offset += (OPJ_OFF_T)p_stream->m_bytes_in_buffer; - p_stream->m_current_data = p_stream->m_stored_data; - p_stream->m_bytes_in_buffer = 0; - return l_read_nb_bytes; - } - } - } -} - -OPJ_SIZE_T opj_stream_write_data(opj_stream_private_t * p_stream, - const OPJ_BYTE * p_buffer, - OPJ_SIZE_T p_size, - opj_event_mgr_t * p_event_mgr) -{ - OPJ_SIZE_T l_remaining_bytes = 0; - OPJ_SIZE_T l_write_nb_bytes = 0; - - if (p_stream->m_status & OPJ_STREAM_STATUS_ERROR) { - return (OPJ_SIZE_T) - 1; - } - - for (;;) { - l_remaining_bytes = p_stream->m_buffer_size - p_stream->m_bytes_in_buffer; - - /* we have more memory than required */ - if (l_remaining_bytes >= p_size) { - memcpy(p_stream->m_current_data, p_buffer, p_size); - - p_stream->m_current_data += p_size; - p_stream->m_bytes_in_buffer += p_size; - l_write_nb_bytes += p_size; - p_stream->m_byte_offset += (OPJ_OFF_T)p_size; - - return l_write_nb_bytes; - } - - /* we copy data and then do an actual read on the stream */ - if (l_remaining_bytes) { - l_write_nb_bytes += l_remaining_bytes; - - memcpy(p_stream->m_current_data, p_buffer, l_remaining_bytes); - - p_stream->m_current_data = p_stream->m_stored_data; - - p_buffer += l_remaining_bytes; - p_size -= l_remaining_bytes; - p_stream->m_bytes_in_buffer += l_remaining_bytes; - p_stream->m_byte_offset += (OPJ_OFF_T)l_remaining_bytes; - } - - if (! opj_stream_flush(p_stream, p_event_mgr)) { - return (OPJ_SIZE_T) - 1; - } - } - -} - -OPJ_BOOL opj_stream_flush(opj_stream_private_t * p_stream, - opj_event_mgr_t * p_event_mgr) -{ - /* the number of bytes written on the media. 
*/ - OPJ_SIZE_T l_current_write_nb_bytes = 0; - - p_stream->m_current_data = p_stream->m_stored_data; - - while (p_stream->m_bytes_in_buffer) { - /* we should do an actual write on the media */ - l_current_write_nb_bytes = p_stream->m_write_fn(p_stream->m_current_data, - p_stream->m_bytes_in_buffer, - p_stream->m_user_data); - - if (l_current_write_nb_bytes == (OPJ_SIZE_T) - 1) { - p_stream->m_status |= OPJ_STREAM_STATUS_ERROR; - opj_event_msg(p_event_mgr, EVT_INFO, "Error on writing stream!\n"); - - return OPJ_FALSE; - } - - p_stream->m_current_data += l_current_write_nb_bytes; - p_stream->m_bytes_in_buffer -= l_current_write_nb_bytes; - } - - p_stream->m_current_data = p_stream->m_stored_data; - - return OPJ_TRUE; -} - -OPJ_OFF_T opj_stream_read_skip(opj_stream_private_t * p_stream, - OPJ_OFF_T p_size, opj_event_mgr_t * p_event_mgr) -{ - OPJ_OFF_T l_skip_nb_bytes = 0; - OPJ_OFF_T l_current_skip_nb_bytes = 0; - - assert(p_size >= 0); - - if (p_stream->m_bytes_in_buffer >= (OPJ_SIZE_T)p_size) { - p_stream->m_current_data += p_size; - /* it is safe to cast p_size to OPJ_SIZE_T since it is <= m_bytes_in_buffer - which is of type OPJ_SIZE_T */ - p_stream->m_bytes_in_buffer -= (OPJ_SIZE_T)p_size; - l_skip_nb_bytes += p_size; - p_stream->m_byte_offset += l_skip_nb_bytes; - return l_skip_nb_bytes; - } - - /* we are now in the case when the remaining data if not sufficient */ - if (p_stream->m_status & OPJ_STREAM_STATUS_END) { - l_skip_nb_bytes += (OPJ_OFF_T)p_stream->m_bytes_in_buffer; - p_stream->m_current_data += p_stream->m_bytes_in_buffer; - p_stream->m_bytes_in_buffer = 0; - p_stream->m_byte_offset += l_skip_nb_bytes; - return l_skip_nb_bytes ? l_skip_nb_bytes : (OPJ_OFF_T) - 1; - } - - /* the flag is not set, we copy data and then do an actual skip on the stream */ - if (p_stream->m_bytes_in_buffer) { - l_skip_nb_bytes += (OPJ_OFF_T)p_stream->m_bytes_in_buffer; - p_stream->m_current_data = p_stream->m_stored_data; - p_size -= (OPJ_OFF_T)p_stream->m_bytes_in_buffer; - p_stream->m_bytes_in_buffer = 0; - } - - while (p_size > 0) { - /* Check if we are going beyond the end of file. Most skip_fn do not */ - /* check that, but we must be careful not to advance m_byte_offset */ - /* beyond m_user_data_length, otherwise */ - /* opj_stream_get_number_byte_left() will assert. */ - if ((OPJ_UINT64)(p_stream->m_byte_offset + l_skip_nb_bytes + p_size) > - p_stream->m_user_data_length) { - opj_event_msg(p_event_mgr, EVT_INFO, "Stream reached its end !\n"); - - p_stream->m_byte_offset += l_skip_nb_bytes; - l_skip_nb_bytes = (OPJ_OFF_T)(p_stream->m_user_data_length - - (OPJ_UINT64)p_stream->m_byte_offset); - - opj_stream_read_seek(p_stream, (OPJ_OFF_T)p_stream->m_user_data_length, - p_event_mgr); - p_stream->m_status |= OPJ_STREAM_STATUS_END; - - /* end if stream */ - return l_skip_nb_bytes ? l_skip_nb_bytes : (OPJ_OFF_T) - 1; - } - - /* we should do an actual skip on the media */ - l_current_skip_nb_bytes = p_stream->m_skip_fn(p_size, p_stream->m_user_data); - if (l_current_skip_nb_bytes == (OPJ_OFF_T) - 1) { - opj_event_msg(p_event_mgr, EVT_INFO, "Stream reached its end !\n"); - - p_stream->m_status |= OPJ_STREAM_STATUS_END; - p_stream->m_byte_offset += l_skip_nb_bytes; - /* end if stream */ - return l_skip_nb_bytes ? 
l_skip_nb_bytes : (OPJ_OFF_T) - 1; - } - p_size -= l_current_skip_nb_bytes; - l_skip_nb_bytes += l_current_skip_nb_bytes; - } - - p_stream->m_byte_offset += l_skip_nb_bytes; - - return l_skip_nb_bytes; -} - -OPJ_OFF_T opj_stream_write_skip(opj_stream_private_t * p_stream, - OPJ_OFF_T p_size, opj_event_mgr_t * p_event_mgr) -{ - OPJ_BOOL l_is_written = 0; - OPJ_OFF_T l_current_skip_nb_bytes = 0; - OPJ_OFF_T l_skip_nb_bytes = 0; - - if (p_stream->m_status & OPJ_STREAM_STATUS_ERROR) { - return (OPJ_OFF_T) - 1; - } - - /* we should flush data */ - l_is_written = opj_stream_flush(p_stream, p_event_mgr); - if (! l_is_written) { - p_stream->m_status |= OPJ_STREAM_STATUS_ERROR; - p_stream->m_bytes_in_buffer = 0; - return (OPJ_OFF_T) - 1; - } - /* then skip */ - - while (p_size > 0) { - /* we should do an actual skip on the media */ - l_current_skip_nb_bytes = p_stream->m_skip_fn(p_size, p_stream->m_user_data); - - if (l_current_skip_nb_bytes == (OPJ_OFF_T) - 1) { - opj_event_msg(p_event_mgr, EVT_INFO, "Stream error!\n"); - - p_stream->m_status |= OPJ_STREAM_STATUS_ERROR; - p_stream->m_byte_offset += l_skip_nb_bytes; - /* end if stream */ - return l_skip_nb_bytes ? l_skip_nb_bytes : (OPJ_OFF_T) - 1; - } - p_size -= l_current_skip_nb_bytes; - l_skip_nb_bytes += l_current_skip_nb_bytes; - } - - p_stream->m_byte_offset += l_skip_nb_bytes; - - return l_skip_nb_bytes; -} - -OPJ_OFF_T opj_stream_tell(const opj_stream_private_t * p_stream) -{ - return p_stream->m_byte_offset; -} - -OPJ_OFF_T opj_stream_get_number_byte_left(const opj_stream_private_t * p_stream) -{ - assert(p_stream->m_byte_offset >= 0); - assert(p_stream->m_user_data_length >= (OPJ_UINT64)p_stream->m_byte_offset); - return p_stream->m_user_data_length ? - (OPJ_OFF_T)(p_stream->m_user_data_length) - p_stream->m_byte_offset : - 0; -} - -OPJ_OFF_T opj_stream_skip(opj_stream_private_t * p_stream, OPJ_OFF_T p_size, - opj_event_mgr_t * p_event_mgr) -{ - assert(p_size >= 0); - return p_stream->m_opj_skip(p_stream, p_size, p_event_mgr); -} - -OPJ_BOOL opj_stream_read_seek(opj_stream_private_t * p_stream, OPJ_OFF_T p_size, - opj_event_mgr_t * p_event_mgr) -{ - OPJ_ARG_NOT_USED(p_event_mgr); - p_stream->m_current_data = p_stream->m_stored_data; - p_stream->m_bytes_in_buffer = 0; - - if (!(p_stream->m_seek_fn(p_size, p_stream->m_user_data))) { - p_stream->m_status |= OPJ_STREAM_STATUS_END; - return OPJ_FALSE; - } else { - /* reset stream status */ - p_stream->m_status &= (~OPJ_STREAM_STATUS_END); - p_stream->m_byte_offset = p_size; - - } - - return OPJ_TRUE; -} - -OPJ_BOOL opj_stream_write_seek(opj_stream_private_t * p_stream, - OPJ_OFF_T p_size, opj_event_mgr_t * p_event_mgr) -{ - if (! opj_stream_flush(p_stream, p_event_mgr)) { - p_stream->m_status |= OPJ_STREAM_STATUS_ERROR; - return OPJ_FALSE; - } - - p_stream->m_current_data = p_stream->m_stored_data; - p_stream->m_bytes_in_buffer = 0; - - if (! 
p_stream->m_seek_fn(p_size, p_stream->m_user_data)) { - p_stream->m_status |= OPJ_STREAM_STATUS_ERROR; - return OPJ_FALSE; - } else { - p_stream->m_byte_offset = p_size; - } - - return OPJ_TRUE; -} - -OPJ_BOOL opj_stream_seek(opj_stream_private_t * p_stream, OPJ_OFF_T p_size, - struct opj_event_mgr * p_event_mgr) -{ - assert(p_size >= 0); - return p_stream->m_opj_seek(p_stream, p_size, p_event_mgr); -} - -OPJ_BOOL opj_stream_has_seek(const opj_stream_private_t * p_stream) -{ - return p_stream->m_seek_fn != opj_stream_default_seek; -} - -OPJ_SIZE_T opj_stream_default_read(void * p_buffer, OPJ_SIZE_T p_nb_bytes, - void * p_user_data) -{ - OPJ_ARG_NOT_USED(p_buffer); - OPJ_ARG_NOT_USED(p_nb_bytes); - OPJ_ARG_NOT_USED(p_user_data); - return (OPJ_SIZE_T) - 1; -} - -OPJ_SIZE_T opj_stream_default_write(void * p_buffer, OPJ_SIZE_T p_nb_bytes, - void * p_user_data) -{ - OPJ_ARG_NOT_USED(p_buffer); - OPJ_ARG_NOT_USED(p_nb_bytes); - OPJ_ARG_NOT_USED(p_user_data); - return (OPJ_SIZE_T) - 1; -} - -OPJ_OFF_T opj_stream_default_skip(OPJ_OFF_T p_nb_bytes, void * p_user_data) -{ - OPJ_ARG_NOT_USED(p_nb_bytes); - OPJ_ARG_NOT_USED(p_user_data); - return (OPJ_OFF_T) - 1; -} - -OPJ_BOOL opj_stream_default_seek(OPJ_OFF_T p_nb_bytes, void * p_user_data) -{ - OPJ_ARG_NOT_USED(p_nb_bytes); - OPJ_ARG_NOT_USED(p_user_data); - return OPJ_FALSE; -} diff --git a/src/3rd/LibOpenJpeg/cio.h b/src/3rd/LibOpenJpeg/cio.h deleted file mode 100644 index 6996a9a0..00000000 --- a/src/3rd/LibOpenJpeg/cio.h +++ /dev/null @@ -1,412 +0,0 @@ -/* - * The copyright in this software is being made available under the 2-clauses - * BSD License, included below. This software may be subject to other third - * party and contributor rights, including patent rights, and no such rights - * are granted under this license. - * - * Copyright (c) 2002-2014, Universite catholique de Louvain (UCL), Belgium - * Copyright (c) 2002-2014, Professor Benoit Macq - * Copyright (c) 2001-2003, David Janssens - * Copyright (c) 2002-2003, Yannick Verschueren - * Copyright (c) 2003-2007, Francois-Olivier Devaux - * Copyright (c) 2003-2014, Antonin Descampe - * Copyright (c) 2005, Herve Drolon, FreeImage Team - * Copyright (c) 2008, 2011-2012, Centre National d'Etudes Spatiales (CNES), FR - * Copyright (c) 2012, CS Systemes d'Information, France - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS' - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef OPJ_CIO_H -#define OPJ_CIO_H -/** -@file cio.h -@brief Implementation of a byte input-output process (CIO) - -The functions in CIO.C have for goal to realize a byte input / output process. -*/ - -/** @defgroup CIO CIO - byte input-output stream */ -/*@{*/ - -#include "opj_config_private.h" - -/* ----------------------------------------------------------------------- */ - -#if defined(OPJ_BIG_ENDIAN) -#define opj_write_bytes opj_write_bytes_BE -#define opj_read_bytes opj_read_bytes_BE -#define opj_write_double opj_write_double_BE -#define opj_read_double opj_read_double_BE -#define opj_write_float opj_write_float_BE -#define opj_read_float opj_read_float_BE -#else -#define opj_write_bytes opj_write_bytes_LE -#define opj_read_bytes opj_read_bytes_LE -#define opj_write_double opj_write_double_LE -#define opj_read_double opj_read_double_LE -#define opj_write_float opj_write_float_LE -#define opj_read_float opj_read_float_LE -#endif - - -#define OPJ_STREAM_STATUS_OUTPUT 0x1U -#define OPJ_STREAM_STATUS_INPUT 0x2U -#define OPJ_STREAM_STATUS_END 0x4U -#define OPJ_STREAM_STATUS_ERROR 0x8U - -/** -Byte input-output stream. -*/ -typedef struct opj_stream_private { - /** - * User data, be it files, ... The actual data depends on the type of the stream. - */ - void * m_user_data; - - /** - * Pointer to function to free m_user_data (NULL at initialization) - * when destroying the stream. If pointer is NULL the function is not - * called and the m_user_data is not freed (even if non-NULL). - */ - opj_stream_free_user_data_fn m_free_user_data_fn; - - /** - * User data length - */ - OPJ_UINT64 m_user_data_length; - - /** - * Pointer to actual read function (NULL at the initialization of the cio. - */ - opj_stream_read_fn m_read_fn; - - /** - * Pointer to actual write function (NULL at the initialization of the cio. - */ - opj_stream_write_fn m_write_fn; - - /** - * Pointer to actual skip function (NULL at the initialization of the cio. - * There is no seek function to prevent from back and forth slow procedures. - */ - opj_stream_skip_fn m_skip_fn; - - /** - * Pointer to actual seek function (if available). - */ - opj_stream_seek_fn m_seek_fn; - - /** - * Actual data stored into the stream if readed from. Data is read by chunk of fixed size. - * you should never access this data directly. - */ - OPJ_BYTE * m_stored_data; - - /** - * Pointer to the current read data. - */ - OPJ_BYTE * m_current_data; - - /** - * FIXME DOC. - */ - OPJ_OFF_T(* m_opj_skip)(struct opj_stream_private *, OPJ_OFF_T, - struct opj_event_mgr *); - - /** - * FIXME DOC. - */ - OPJ_BOOL(* m_opj_seek)(struct opj_stream_private *, OPJ_OFF_T, - struct opj_event_mgr *); - - /** - * number of bytes containing in the buffer. - */ - OPJ_SIZE_T m_bytes_in_buffer; - - /** - * The number of bytes read/written from the beginning of the stream - */ - OPJ_OFF_T m_byte_offset; - - /** - * The size of the buffer. - */ - OPJ_SIZE_T m_buffer_size; - - /** - * Flags to tell the status of the stream. 
- * Used with OPJ_STREAM_STATUS_* defines. - */ - OPJ_UINT32 m_status; - -} -opj_stream_private_t; - -/** @name Exported functions (see also openjpeg.h) */ -/*@{*/ -/* ----------------------------------------------------------------------- */ -/** - * Write some bytes to the given data buffer, this function is used in Big Endian cpus. - * @param p_buffer pointer the data buffer to write data to. - * @param p_value the value to write - * @param p_nb_bytes the number of bytes to write -*/ -void opj_write_bytes_BE(OPJ_BYTE * p_buffer, OPJ_UINT32 p_value, - OPJ_UINT32 p_nb_bytes); - -/** - * Reads some bytes from the given data buffer, this function is used in Big Endian cpus. - * @param p_buffer pointer the data buffer to read data from. - * @param p_value pointer to the value that will store the data. - * @param p_nb_bytes the nb bytes to read. - * @return the number of bytes read or -1 if an error occurred. - */ -void opj_read_bytes_BE(const OPJ_BYTE * p_buffer, OPJ_UINT32 * p_value, - OPJ_UINT32 p_nb_bytes); - -/** - * Write some bytes to the given data buffer, this function is used in Little Endian cpus. - * @param p_buffer pointer the data buffer to write data to. - * @param p_value the value to write - * @param p_nb_bytes the number of bytes to write - * @return the number of bytes written or -1 if an error occurred -*/ -void opj_write_bytes_LE(OPJ_BYTE * p_buffer, OPJ_UINT32 p_value, - OPJ_UINT32 p_nb_bytes); - -/** - * Reads some bytes from the given data buffer, this function is used in Little Endian cpus. - * @param p_buffer pointer the data buffer to read data from. - * @param p_value pointer to the value that will store the data. - * @param p_nb_bytes the nb bytes to read. - * @return the number of bytes read or -1 if an error occurred. - */ -void opj_read_bytes_LE(const OPJ_BYTE * p_buffer, OPJ_UINT32 * p_value, - OPJ_UINT32 p_nb_bytes); - - -/** - * Write some bytes to the given data buffer, this function is used in Little Endian cpus. - * @param p_buffer pointer the data buffer to write data to. - * @param p_value the value to write - */ -void opj_write_double_LE(OPJ_BYTE * p_buffer, OPJ_FLOAT64 p_value); - -/*** - * Write some bytes to the given data buffer, this function is used in Big Endian cpus. - * @param p_buffer pointer the data buffer to write data to. - * @param p_value the value to write - */ -void opj_write_double_BE(OPJ_BYTE * p_buffer, OPJ_FLOAT64 p_value); - -/** - * Reads some bytes from the given data buffer, this function is used in Little Endian cpus. - * @param p_buffer pointer the data buffer to read data from. - * @param p_value pointer to the value that will store the data. - */ -void opj_read_double_LE(const OPJ_BYTE * p_buffer, OPJ_FLOAT64 * p_value); - -/** - * Reads some bytes from the given data buffer, this function is used in Big Endian cpus. - * @param p_buffer pointer the data buffer to read data from. - * @param p_value pointer to the value that will store the data. - */ -void opj_read_double_BE(const OPJ_BYTE * p_buffer, OPJ_FLOAT64 * p_value); - -/** - * Reads some bytes from the given data buffer, this function is used in Little Endian cpus. - * @param p_buffer pointer the data buffer to read data from. - * @param p_value pointer to the value that will store the data. - */ -void opj_read_float_LE(const OPJ_BYTE * p_buffer, OPJ_FLOAT32 * p_value); - -/** - * Reads some bytes from the given data buffer, this function is used in Big Endian cpus. - * @param p_buffer pointer the data buffer to read data from. 
- * @param p_value pointer to the value that will store the data. - */ -void opj_read_float_BE(const OPJ_BYTE * p_buffer, OPJ_FLOAT32 * p_value); - -/** - * Write some bytes to the given data buffer, this function is used in Little Endian cpus. - * @param p_buffer pointer the data buffer to write data to. - * @param p_value the value to write - */ -void opj_write_float_LE(OPJ_BYTE * p_buffer, OPJ_FLOAT32 p_value); - -/*** - * Write some bytes to the given data buffer, this function is used in Big Endian cpus. - * @param p_buffer pointer the data buffer to write data to. - * @param p_value the value to write - */ -void opj_write_float_BE(OPJ_BYTE * p_buffer, OPJ_FLOAT32 p_value); - -/** - * Reads some bytes from the stream. - * @param p_stream the stream to read data from. - * @param p_buffer pointer to the data buffer that will receive the data. - * @param p_size number of bytes to read. - * @param p_event_mgr the user event manager to be notified of special events. - * @return the number of bytes read, or -1 if an error occurred or if the stream is at the end. - */ -OPJ_SIZE_T opj_stream_read_data(opj_stream_private_t * p_stream, - OPJ_BYTE * p_buffer, OPJ_SIZE_T p_size, struct opj_event_mgr * p_event_mgr); - -/** - * Writes some bytes to the stream. - * @param p_stream the stream to write data to. - * @param p_buffer pointer to the data buffer holds the data to be writtent. - * @param p_size number of bytes to write. - * @param p_event_mgr the user event manager to be notified of special events. - * @return the number of bytes writtent, or -1 if an error occurred. - */ -OPJ_SIZE_T opj_stream_write_data(opj_stream_private_t * p_stream, - const OPJ_BYTE * p_buffer, OPJ_SIZE_T p_size, - struct opj_event_mgr * p_event_mgr); - -/** - * Writes the content of the stream buffer to the stream. - * @param p_stream the stream to write data to. - * @param p_event_mgr the user event manager to be notified of special events. - * @return true if the data could be flushed, false else. - */ -OPJ_BOOL opj_stream_flush(opj_stream_private_t * p_stream, - struct opj_event_mgr * p_event_mgr); - -/** - * Skips a number of bytes from the stream. - * @param p_stream the stream to skip data from. - * @param p_size the number of bytes to skip. - * @param p_event_mgr the user event manager to be notified of special events. - * @return the number of bytes skipped, or -1 if an error occurred. - */ -OPJ_OFF_T opj_stream_skip(opj_stream_private_t * p_stream, OPJ_OFF_T p_size, - struct opj_event_mgr * p_event_mgr); - -/** - * Tells the byte offset on the stream (similar to ftell). - * - * @param p_stream the stream to get the information from. - * - * @return the current position o fthe stream. - */ -OPJ_OFF_T opj_stream_tell(const opj_stream_private_t * p_stream); - - -/** - * Get the number of bytes left before the end of the stream (similar to cio_numbytesleft). - * - * @param p_stream the stream to get the information from. - * - * @return Number of bytes left before the end of the stream. - */ -OPJ_OFF_T opj_stream_get_number_byte_left(const opj_stream_private_t * - p_stream); - -/** - * Skips a number of bytes from the stream. - * @param p_stream the stream to skip data from. - * @param p_size the number of bytes to skip. - * @param p_event_mgr the user event manager to be notified of special events. - * @return the number of bytes skipped, or -1 if an error occurred. 
- */ -OPJ_OFF_T opj_stream_write_skip(opj_stream_private_t * p_stream, - OPJ_OFF_T p_size, struct opj_event_mgr * p_event_mgr); - -/** - * Skips a number of bytes from the stream. - * @param p_stream the stream to skip data from. - * @param p_size the number of bytes to skip. - * @param p_event_mgr the user event manager to be notified of special events. - * @return the number of bytes skipped, or -1 if an error occurred. - */ -OPJ_OFF_T opj_stream_read_skip(opj_stream_private_t * p_stream, - OPJ_OFF_T p_size, struct opj_event_mgr * p_event_mgr); - -/** - * Skips a number of bytes from the stream. - * @param p_stream the stream to skip data from. - * @param p_size the number of bytes to skip. - * @param p_event_mgr the user event manager to be notified of special events. - * @return OPJ_TRUE if success, or OPJ_FALSE if an error occurred. - */ -OPJ_BOOL opj_stream_read_seek(opj_stream_private_t * p_stream, OPJ_OFF_T p_size, - struct opj_event_mgr * p_event_mgr); - -/** - * Skips a number of bytes from the stream. - * @param p_stream the stream to skip data from. - * @param p_size the number of bytes to skip. - * @param p_event_mgr the user event manager to be notified of special events. - * @return the number of bytes skipped, or -1 if an error occurred. - */ -OPJ_BOOL opj_stream_write_seek(opj_stream_private_t * p_stream, - OPJ_OFF_T p_size, struct opj_event_mgr * p_event_mgr); - -/** - * Seeks a number of bytes from the stream. - * @param p_stream the stream to skip data from. - * @param p_size the number of bytes to skip. - * @param p_event_mgr the user event manager to be notified of special events. - * @return true if the stream is seekable. - */ -OPJ_BOOL opj_stream_seek(opj_stream_private_t * p_stream, OPJ_OFF_T p_size, - struct opj_event_mgr * p_event_mgr); - -/** - * Tells if the given stream is seekable. - */ -OPJ_BOOL opj_stream_has_seek(const opj_stream_private_t * p_stream); - -/** - * FIXME DOC. - */ -OPJ_SIZE_T opj_stream_default_read(void * p_buffer, OPJ_SIZE_T p_nb_bytes, - void * p_user_data); - -/** - * FIXME DOC. - */ -OPJ_SIZE_T opj_stream_default_write(void * p_buffer, OPJ_SIZE_T p_nb_bytes, - void * p_user_data); - -/** - * FIXME DOC. - */ -OPJ_OFF_T opj_stream_default_skip(OPJ_OFF_T p_nb_bytes, void * p_user_data); - -/** - * FIXME DOC. - */ -OPJ_BOOL opj_stream_default_seek(OPJ_OFF_T p_nb_bytes, void * p_user_data); - -/* ----------------------------------------------------------------------- */ -/*@}*/ - -/*@}*/ - - -#endif /* OPJ_CIO_H */ - diff --git a/src/3rd/LibOpenJpeg/dwt.c b/src/3rd/LibOpenJpeg/dwt.c deleted file mode 100644 index 5b98d2b3..00000000 --- a/src/3rd/LibOpenJpeg/dwt.c +++ /dev/null @@ -1,2889 +0,0 @@ -/* - * The copyright in this software is being made available under the 2-clauses - * BSD License, included below. This software may be subject to other third - * party and contributor rights, including patent rights, and no such rights - * are granted under this license. - * - * Copyright (c) 2002-2014, Universite catholique de Louvain (UCL), Belgium - * Copyright (c) 2002-2014, Professor Benoit Macq - * Copyright (c) 2001-2003, David Janssens - * Copyright (c) 2002-2003, Yannick Verschueren - * Copyright (c) 2003-2007, Francois-Olivier Devaux - * Copyright (c) 2003-2014, Antonin Descampe - * Copyright (c) 2005, Herve Drolon, FreeImage Team - * Copyright (c) 2007, Jonathan Ballard - * Copyright (c) 2007, Callum Lerwick - * Copyright (c) 2017, IntoPIX SA - * All rights reserved. 
diff --git a/src/3rd/LibOpenJpeg/dwt.c b/src/3rd/LibOpenJpeg/dwt.c
deleted file mode 100644
index 5b98d2b3..00000000
--- a/src/3rd/LibOpenJpeg/dwt.c
+++ /dev/null
@@ -1,2889 +0,0 @@
-/*
- * The copyright in this software is being made available under the 2-clauses
- * BSD License, included below. This software may be subject to other third
- * party and contributor rights, including patent rights, and no such rights
- * are granted under this license.
- *
- * Copyright (c) 2002-2014, Universite catholique de Louvain (UCL), Belgium
- * Copyright (c) 2002-2014, Professor Benoit Macq
- * Copyright (c) 2001-2003, David Janssens
- * Copyright (c) 2002-2003, Yannick Verschueren
- * Copyright (c) 2003-2007, Francois-Olivier Devaux
- * Copyright (c) 2003-2014, Antonin Descampe
- * Copyright (c) 2005, Herve Drolon, FreeImage Team
- * Copyright (c) 2007, Jonathan Ballard
- * Copyright (c) 2007, Callum Lerwick
- * Copyright (c) 2017, IntoPIX SA
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS'
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <assert.h>
-
-#define OPJ_SKIP_POISON
-#include "opj_includes.h"
-
-#ifdef __SSE__
-#include <xmmintrin.h>
-#endif
-#ifdef __SSE2__
-#include <emmintrin.h>
-#endif
-#ifdef __SSSE3__
-#include <tmmintrin.h>
-#endif
-#ifdef __AVX2__
-#include <immintrin.h>
-#endif
-
-#if defined(__GNUC__)
-#pragma GCC poison malloc calloc realloc free
-#endif
-
-/** @defgroup DWT DWT - Implementation of a discrete wavelet transform */
-/*@{*/
-
-#define OPJ_WS(i) v->mem[(i)*2]
-#define OPJ_WD(i) v->mem[(1+(i)*2)]
-
-#ifdef __AVX2__
-/** Number of int32 values in an AVX2 register */
-#define VREG_INT_COUNT 8
-#else
-/** Number of int32 values in an SSE2 register */
-#define VREG_INT_COUNT 4
-#endif
-
-/** Number of columns that we can process in parallel in the vertical pass */
-#define PARALLEL_COLS_53 (2*VREG_INT_COUNT)
-
-/** @name Local data structures */
-/*@{*/
-
-typedef struct dwt_local {
-    OPJ_INT32* mem;
-    OPJ_INT32 dn;   /* number of elements in high pass band */
-    OPJ_INT32 sn;   /* number of elements in low pass band */
-    OPJ_INT32 cas;  /* 0 = start on even coord, 1 = start on odd coord */
-} opj_dwt_t;
-
-typedef union {
-    OPJ_FLOAT32 f[4];
-} opj_v4_t;
-
-typedef struct v4dwt_local {
-    opj_v4_t* wavelet;
-    OPJ_INT32 dn;  /* number of elements in high pass band */
-    OPJ_INT32 sn;  /* number of elements in low pass band */
-    OPJ_INT32 cas; /* 0 = start on even coord, 1 = start on odd coord */
-    OPJ_UINT32 win_l_x0; /* start coord in low pass band */
-    OPJ_UINT32 win_l_x1; /* end coord in low pass band */
-    OPJ_UINT32 win_h_x0; /* start coord in high pass band */
-    OPJ_UINT32 win_h_x1; /* end coord in high pass band */
-} opj_v4dwt_t;
-
-static const OPJ_FLOAT32 opj_dwt_alpha =  1.586134342f; /*  12994 */
-static const OPJ_FLOAT32 opj_dwt_beta  =  0.052980118f; /*    434 */
-static const OPJ_FLOAT32 opj_dwt_gamma = -0.882911075f; /*  -7233 */
-static const OPJ_FLOAT32 opj_dwt_delta = -0.443506852f; /*  -3633 */
-
-static const OPJ_FLOAT32 opj_K = 1.230174105f; /*  10078 */
-static const OPJ_FLOAT32 opj_c13318 = 1.625732422f;
-
-/*@}*/
-
-/**
-Virtual function type for wavelet transform in 1-D
-*/
-typedef void (*DWT1DFN)(const opj_dwt_t* v);
-
-/** @name Local static functions */
-/*@{*/
-
-/**
-Forward lazy transform (horizontal)
-*/
-static void
opj_dwt_deinterleave_h(OPJ_INT32 *a, OPJ_INT32 *b, OPJ_INT32 dn, - OPJ_INT32 sn, OPJ_INT32 cas); -/** -Forward lazy transform (vertical) -*/ -static void opj_dwt_deinterleave_v(OPJ_INT32 *a, OPJ_INT32 *b, OPJ_INT32 dn, - OPJ_INT32 sn, OPJ_INT32 x, OPJ_INT32 cas); -/** -Forward 5-3 wavelet transform in 1-D -*/ -static void opj_dwt_encode_1(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn, - OPJ_INT32 cas); -/** -Forward 9-7 wavelet transform in 1-D -*/ -static void opj_dwt_encode_1_real(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn, - OPJ_INT32 cas); -/** -Explicit calculation of the Quantization Stepsizes -*/ -static void opj_dwt_encode_stepsize(OPJ_INT32 stepsize, OPJ_INT32 numbps, - opj_stepsize_t *bandno_stepsize); -/** -Inverse wavelet transform in 2-D. -*/ -static OPJ_BOOL opj_dwt_decode_tile(opj_thread_pool_t* tp, - opj_tcd_tilecomp_t* tilec, OPJ_UINT32 i); - -static OPJ_BOOL opj_dwt_decode_partial_tile( - opj_tcd_tilecomp_t* tilec, - OPJ_UINT32 numres); - -static OPJ_BOOL opj_dwt_encode_procedure(opj_tcd_tilecomp_t * tilec, - void (*p_function)(OPJ_INT32 *, OPJ_INT32, OPJ_INT32, OPJ_INT32)); - -static OPJ_UINT32 opj_dwt_max_resolution(opj_tcd_resolution_t* OPJ_RESTRICT r, - OPJ_UINT32 i); - -/* */ -/* Inverse 9-7 wavelet transform in 1-D. */ -/* */ -static void opj_v4dwt_decode(opj_v4dwt_t* OPJ_RESTRICT dwt); - -static void opj_v4dwt_interleave_h(opj_v4dwt_t* OPJ_RESTRICT dwt, - OPJ_FLOAT32* OPJ_RESTRICT a, - OPJ_UINT32 width, - OPJ_UINT32 remaining_height); - -static void opj_v4dwt_interleave_v(opj_v4dwt_t* OPJ_RESTRICT dwt, - OPJ_FLOAT32* OPJ_RESTRICT a, - OPJ_UINT32 width, - OPJ_UINT32 nb_elts_read); - -#ifdef __SSE__ -static void opj_v4dwt_decode_step1_sse(opj_v4_t* w, - OPJ_UINT32 start, - OPJ_UINT32 end, - const __m128 c); - -static void opj_v4dwt_decode_step2_sse(opj_v4_t* l, opj_v4_t* w, - OPJ_UINT32 start, - OPJ_UINT32 end, - OPJ_UINT32 m, __m128 c); - -#else -static void opj_v4dwt_decode_step1(opj_v4_t* w, - OPJ_UINT32 start, - OPJ_UINT32 end, - const OPJ_FLOAT32 c); - -static void opj_v4dwt_decode_step2(opj_v4_t* l, opj_v4_t* w, - OPJ_UINT32 start, - OPJ_UINT32 end, - OPJ_UINT32 m, - OPJ_FLOAT32 c); - -#endif - -/*@}*/ - -/*@}*/ - -#define OPJ_S(i) a[(i)*2] -#define OPJ_D(i) a[(1+(i)*2)] -#define OPJ_S_(i) ((i)<0?OPJ_S(0):((i)>=sn?OPJ_S(sn-1):OPJ_S(i))) -#define OPJ_D_(i) ((i)<0?OPJ_D(0):((i)>=dn?OPJ_D(dn-1):OPJ_D(i))) -/* new */ -#define OPJ_SS_(i) ((i)<0?OPJ_S(0):((i)>=dn?OPJ_S(dn-1):OPJ_S(i))) -#define OPJ_DD_(i) ((i)<0?OPJ_D(0):((i)>=sn?OPJ_D(sn-1):OPJ_D(i))) - -/* */ -/* This table contains the norms of the 5-3 wavelets for different bands. */ -/* */ -/* FIXME! the array should really be extended up to 33 resolution levels */ -/* See https://github.com/uclouvain/openjpeg/issues/493 */ -static const OPJ_FLOAT64 opj_dwt_norms[4][10] = { - {1.000, 1.500, 2.750, 5.375, 10.68, 21.34, 42.67, 85.33, 170.7, 341.3}, - {1.038, 1.592, 2.919, 5.703, 11.33, 22.64, 45.25, 90.48, 180.9}, - {1.038, 1.592, 2.919, 5.703, 11.33, 22.64, 45.25, 90.48, 180.9}, - {.7186, .9218, 1.586, 3.043, 6.019, 12.01, 24.00, 47.97, 95.93} -}; - -/* */ -/* This table contains the norms of the 9-7 wavelets for different bands. */ -/* */ -/* FIXME! 
the array should really be extended up to 33 resolution levels */ -/* See https://github.com/uclouvain/openjpeg/issues/493 */ -static const OPJ_FLOAT64 opj_dwt_norms_real[4][10] = { - {1.000, 1.965, 4.177, 8.403, 16.90, 33.84, 67.69, 135.3, 270.6, 540.9}, - {2.022, 3.989, 8.355, 17.04, 34.27, 68.63, 137.3, 274.6, 549.0}, - {2.022, 3.989, 8.355, 17.04, 34.27, 68.63, 137.3, 274.6, 549.0}, - {2.080, 3.865, 8.307, 17.18, 34.71, 69.59, 139.3, 278.6, 557.2} -}; - -/* -========================================================== - local functions -========================================================== -*/ - -/* */ -/* Forward lazy transform (horizontal). */ -/* */ -static void opj_dwt_deinterleave_h(OPJ_INT32 *a, OPJ_INT32 *b, OPJ_INT32 dn, - OPJ_INT32 sn, OPJ_INT32 cas) -{ - OPJ_INT32 i; - OPJ_INT32 * l_dest = b; - OPJ_INT32 * l_src = a + cas; - - for (i = 0; i < sn; ++i) { - *l_dest++ = *l_src; - l_src += 2; - } - - l_dest = b + sn; - l_src = a + 1 - cas; - - for (i = 0; i < dn; ++i) { - *l_dest++ = *l_src; - l_src += 2; - } -} - -/* */ -/* Forward lazy transform (vertical). */ -/* */ -static void opj_dwt_deinterleave_v(OPJ_INT32 *a, OPJ_INT32 *b, OPJ_INT32 dn, - OPJ_INT32 sn, OPJ_INT32 x, OPJ_INT32 cas) -{ - OPJ_INT32 i = sn; - OPJ_INT32 * l_dest = b; - OPJ_INT32 * l_src = a + cas; - - while (i--) { - *l_dest = *l_src; - l_dest += x; - l_src += 2; - } /* b[i*x]=a[2*i+cas]; */ - - l_dest = b + (OPJ_SIZE_T)sn * (OPJ_SIZE_T)x; - l_src = a + 1 - cas; - - i = dn; - while (i--) { - *l_dest = *l_src; - l_dest += x; - l_src += 2; - } /*b[(sn+i)*x]=a[(2*i+1-cas)];*/ -} - -#ifdef STANDARD_SLOW_VERSION -/* */ -/* Inverse lazy transform (horizontal). */ -/* */ -static void opj_dwt_interleave_h(const opj_dwt_t* h, OPJ_INT32 *a) -{ - OPJ_INT32 *ai = a; - OPJ_INT32 *bi = h->mem + h->cas; - OPJ_INT32 i = h->sn; - while (i--) { - *bi = *(ai++); - bi += 2; - } - ai = a + h->sn; - bi = h->mem + 1 - h->cas; - i = h->dn ; - while (i--) { - *bi = *(ai++); - bi += 2; - } -} - -/* */ -/* Inverse lazy transform (vertical). */ -/* */ -static void opj_dwt_interleave_v(const opj_dwt_t* v, OPJ_INT32 *a, OPJ_INT32 x) -{ - OPJ_INT32 *ai = a; - OPJ_INT32 *bi = v->mem + v->cas; - OPJ_INT32 i = v->sn; - while (i--) { - *bi = *ai; - bi += 2; - ai += x; - } - ai = a + (v->sn * (OPJ_SIZE_T)x); - bi = v->mem + 1 - v->cas; - i = v->dn ; - while (i--) { - *bi = *ai; - bi += 2; - ai += x; - } -} - -#endif /* STANDARD_SLOW_VERSION */ - -/* */ -/* Forward 5-3 wavelet transform in 1-D. */ -/* */ -static void opj_dwt_encode_1(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn, - OPJ_INT32 cas) -{ - OPJ_INT32 i; - - if (!cas) { - if ((dn > 0) || (sn > 1)) { /* NEW : CASE ONE ELEMENT */ - for (i = 0; i < dn; i++) { - OPJ_D(i) -= (OPJ_S_(i) + OPJ_S_(i + 1)) >> 1; - } - for (i = 0; i < sn; i++) { - OPJ_S(i) += (OPJ_D_(i - 1) + OPJ_D_(i) + 2) >> 2; - } - } - } else { - if (!sn && dn == 1) { /* NEW : CASE ONE ELEMENT */ - OPJ_S(0) *= 2; - } else { - for (i = 0; i < dn; i++) { - OPJ_S(i) -= (OPJ_DD_(i) + OPJ_DD_(i - 1)) >> 1; - } - for (i = 0; i < sn; i++) { - OPJ_D(i) += (OPJ_SS_(i) + OPJ_SS_(i + 1) + 2) >> 2; - } - } - } -} - -#ifdef STANDARD_SLOW_VERSION -/* */ -/* Inverse 5-3 wavelet transform in 1-D. 
*/ -/* */ -static void opj_dwt_decode_1_(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn, - OPJ_INT32 cas) -{ - OPJ_INT32 i; - - if (!cas) { - if ((dn > 0) || (sn > 1)) { /* NEW : CASE ONE ELEMENT */ - for (i = 0; i < sn; i++) { - OPJ_S(i) -= (OPJ_D_(i - 1) + OPJ_D_(i) + 2) >> 2; - } - for (i = 0; i < dn; i++) { - OPJ_D(i) += (OPJ_S_(i) + OPJ_S_(i + 1)) >> 1; - } - } - } else { - if (!sn && dn == 1) { /* NEW : CASE ONE ELEMENT */ - OPJ_S(0) /= 2; - } else { - for (i = 0; i < sn; i++) { - OPJ_D(i) -= (OPJ_SS_(i) + OPJ_SS_(i + 1) + 2) >> 2; - } - for (i = 0; i < dn; i++) { - OPJ_S(i) += (OPJ_DD_(i) + OPJ_DD_(i - 1)) >> 1; - } - } - } -} - -static void opj_dwt_decode_1(const opj_dwt_t *v) -{ - opj_dwt_decode_1_(v->mem, v->dn, v->sn, v->cas); -} - -#endif /* STANDARD_SLOW_VERSION */ - -#if !defined(STANDARD_SLOW_VERSION) -static void opj_idwt53_h_cas0(OPJ_INT32* tmp, - const OPJ_INT32 sn, - const OPJ_INT32 len, - OPJ_INT32* tiledp) -{ - OPJ_INT32 i, j; - const OPJ_INT32* in_even = &tiledp[0]; - const OPJ_INT32* in_odd = &tiledp[sn]; - -#ifdef TWO_PASS_VERSION - /* For documentation purpose: performs lifting in two iterations, */ - /* but without explicit interleaving */ - - assert(len > 1); - - /* Even */ - tmp[0] = in_even[0] - ((in_odd[0] + 1) >> 1); - for (i = 2, j = 0; i <= len - 2; i += 2, j++) { - tmp[i] = in_even[j + 1] - ((in_odd[j] + in_odd[j + 1] + 2) >> 2); - } - if (len & 1) { /* if len is odd */ - tmp[len - 1] = in_even[(len - 1) / 2] - ((in_odd[(len - 2) / 2] + 1) >> 1); - } - - /* Odd */ - for (i = 1, j = 0; i < len - 1; i += 2, j++) { - tmp[i] = in_odd[j] + ((tmp[i - 1] + tmp[i + 1]) >> 1); - } - if (!(len & 1)) { /* if len is even */ - tmp[len - 1] = in_odd[(len - 1) / 2] + tmp[len - 2]; - } -#else - OPJ_INT32 d1c, d1n, s1n, s0c, s0n; - - assert(len > 1); - - /* Improved version of the TWO_PASS_VERSION: */ - /* Performs lifting in one single iteration. Saves memory */ - /* accesses and explicit interleaving. */ - s1n = in_even[0]; - d1n = in_odd[0]; - s0n = s1n - ((d1n + 1) >> 1); - - for (i = 0, j = 1; i < (len - 3); i += 2, j++) { - d1c = d1n; - s0c = s0n; - - s1n = in_even[j]; - d1n = in_odd[j]; - - s0n = s1n - ((d1c + d1n + 2) >> 2); - - tmp[i ] = s0c; - tmp[i + 1] = d1c + ((s0c + s0n) >> 1); - } - - tmp[i] = s0n; - - if (len & 1) { - tmp[len - 1] = in_even[(len - 1) / 2] - ((d1n + 1) >> 1); - tmp[len - 2] = d1n + ((s0n + tmp[len - 1]) >> 1); - } else { - tmp[len - 1] = d1n + s0n; - } -#endif - memcpy(tiledp, tmp, (OPJ_UINT32)len * sizeof(OPJ_INT32)); -} - -static void opj_idwt53_h_cas1(OPJ_INT32* tmp, - const OPJ_INT32 sn, - const OPJ_INT32 len, - OPJ_INT32* tiledp) -{ - OPJ_INT32 i, j; - const OPJ_INT32* in_even = &tiledp[sn]; - const OPJ_INT32* in_odd = &tiledp[0]; - -#ifdef TWO_PASS_VERSION - /* For documentation purpose: performs lifting in two iterations, */ - /* but without explicit interleaving */ - - assert(len > 2); - - /* Odd */ - for (i = 1, j = 0; i < len - 1; i += 2, j++) { - tmp[i] = in_odd[j] - ((in_even[j] + in_even[j + 1] + 2) >> 2); - } - if (!(len & 1)) { - tmp[len - 1] = in_odd[len / 2 - 1] - ((in_even[len / 2 - 1] + 1) >> 1); - } - - /* Even */ - tmp[0] = in_even[0] + tmp[1]; - for (i = 2, j = 1; i < len - 1; i += 2, j++) { - tmp[i] = in_even[j] + ((tmp[i + 1] + tmp[i - 1]) >> 1); - } - if (len & 1) { - tmp[len - 1] = in_even[len / 2] + tmp[len - 2]; - } -#else - OPJ_INT32 s1, s2, dc, dn; - - assert(len > 2); - - /* Improved version of the TWO_PASS_VERSION: */ - /* Performs lifting in one single iteration. 
Saves memory */
-    /* accesses and explicit interleaving. */
-
-    s1 = in_even[1];
-    dc = in_odd[0] - ((in_even[0] + s1 + 2) >> 2);
-    tmp[0] = in_even[0] + dc;
-
-    for (i = 1, j = 1; i < (len - 2 - !(len & 1)); i += 2, j++) {
-
-        s2 = in_even[j + 1];
-
-        dn = in_odd[j] - ((s1 + s2 + 2) >> 2);
-        tmp[i  ] = dc;
-        tmp[i + 1] = s1 + ((dn + dc) >> 1);
-
-        dc = dn;
-        s1 = s2;
-    }
-
-    tmp[i] = dc;
-
-    if (!(len & 1)) {
-        dn = in_odd[len / 2 - 1] - ((s1 + 1) >> 1);
-        tmp[len - 2] = s1 + ((dn + dc) >> 1);
-        tmp[len - 1] = dn;
-    } else {
-        tmp[len - 1] = s1 + dc;
-    }
-#endif
-    memcpy(tiledp, tmp, (OPJ_UINT32)len * sizeof(OPJ_INT32));
-}
-
-
-#endif /* !defined(STANDARD_SLOW_VERSION) */
-
-/* */
-/* Inverse 5-3 wavelet transform in 1-D for one row. */
-/* */
-/* Performs interleave, inverse wavelet transform and copy back to buffer */
-static void opj_idwt53_h(const opj_dwt_t *dwt,
-                         OPJ_INT32* tiledp)
-{
-#ifdef STANDARD_SLOW_VERSION
-    /* For documentation purposes */
-    opj_dwt_interleave_h(dwt, tiledp);
-    opj_dwt_decode_1(dwt);
-    memcpy(tiledp, dwt->mem, (OPJ_UINT32)(dwt->sn + dwt->dn) * sizeof(OPJ_INT32));
-#else
-    const OPJ_INT32 sn = dwt->sn;
-    const OPJ_INT32 len = sn + dwt->dn;
-    if (dwt->cas == 0) { /* Left-most sample is on even coordinate */
-        if (len > 1) {
-            opj_idwt53_h_cas0(dwt->mem, sn, len, tiledp);
-        } else {
-            /* Unmodified value */
-        }
-    } else { /* Left-most sample is on odd coordinate */
-        if (len == 1) {
-            tiledp[0] /= 2;
-        } else if (len == 2) {
-            OPJ_INT32* out = dwt->mem;
-            const OPJ_INT32* in_even = &tiledp[sn];
-            const OPJ_INT32* in_odd = &tiledp[0];
-            out[1] = in_odd[0] - ((in_even[0] + 1) >> 1);
-            out[0] = in_even[0] + out[1];
-            memcpy(tiledp, dwt->mem, (OPJ_UINT32)len * sizeof(OPJ_INT32));
-        } else if (len > 2) {
-            opj_idwt53_h_cas1(dwt->mem, sn, len, tiledp);
-        }
-    }
-#endif
-}
-
-#if (defined(__SSE2__) || defined(__AVX2__)) && !defined(STANDARD_SLOW_VERSION)
-
-/* Convenience macros to improve the readability of the formulas */
-#if __AVX2__
-#define VREG __m256i
-#define LOAD_CST(x) _mm256_set1_epi32(x)
-#define LOAD(x) _mm256_load_si256((const VREG*)(x))
-#define LOADU(x) _mm256_loadu_si256((const VREG*)(x))
-#define STORE(x,y) _mm256_store_si256((VREG*)(x),(y))
-#define STOREU(x,y) _mm256_storeu_si256((VREG*)(x),(y))
-#define ADD(x,y) _mm256_add_epi32((x),(y))
-#define SUB(x,y) _mm256_sub_epi32((x),(y))
-#define SAR(x,y) _mm256_srai_epi32((x),(y))
-#else
-#define VREG __m128i
-#define LOAD_CST(x) _mm_set1_epi32(x)
-#define LOAD(x) _mm_load_si128((const VREG*)(x))
-#define LOADU(x) _mm_loadu_si128((const VREG*)(x))
-#define STORE(x,y) _mm_store_si128((VREG*)(x),(y))
-#define STOREU(x,y) _mm_storeu_si128((VREG*)(x),(y))
-#define ADD(x,y) _mm_add_epi32((x),(y))
-#define SUB(x,y) _mm_sub_epi32((x),(y))
-#define SAR(x,y) _mm_srai_epi32((x),(y))
-#endif
-#define ADD3(x,y,z) ADD(ADD(x,y),z)
-
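An aside on the macro layer just defined (illustrative only; the helper below is hypothetical and not part of dwt.c): the ADD3/SAR pair is how the scalar rounding update (a + b + 2) >> 2 of the 5/3 lifting step is expressed on a whole register of lanes at once. A minimal stand-alone sketch, assuming the SSE2 branch of the macros:

#include <emmintrin.h> /* SSE2 */

/* Hypothetical demo of the pattern SAR(ADD3(a, b, two), 2): computes
 * (a + b + 2) >> 2 independently on four int32 lanes. */
static __m128i demo_round_add_shift(__m128i a, __m128i b)
{
    const __m128i two = _mm_set1_epi32(2);
    return _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(a, b), two), 2);
}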
-static
-void opj_idwt53_v_final_memcpy(OPJ_INT32* tiledp_col,
-                               const OPJ_INT32* tmp,
-                               OPJ_INT32 len,
-                               OPJ_SIZE_T stride)
-{
-    OPJ_INT32 i;
-    for (i = 0; i < len; ++i) {
-        /* A memcpy(&tiledp_col[i * stride + 0],
-                    &tmp[PARALLEL_COLS_53 * i + 0],
-                    PARALLEL_COLS_53 * sizeof(OPJ_INT32))
-           would do but would be a tiny bit slower.
-           We can take advantage here of our knowledge of alignment */
-        STOREU(&tiledp_col[(OPJ_SIZE_T)i * stride + 0],
-               LOAD(&tmp[PARALLEL_COLS_53 * i + 0]));
-        STOREU(&tiledp_col[(OPJ_SIZE_T)i * stride + VREG_INT_COUNT],
-               LOAD(&tmp[PARALLEL_COLS_53 * i + VREG_INT_COUNT]));
-    }
-}
-
-/** Vertical inverse 5x3 wavelet transform for 8 columns in SSE2, or
- * 16 in AVX2, when top-most pixel is on even coordinate */
-static void opj_idwt53_v_cas0_mcols_SSE2_OR_AVX2(
-    OPJ_INT32* tmp,
-    const OPJ_INT32 sn,
-    const OPJ_INT32 len,
-    OPJ_INT32* tiledp_col,
-    const OPJ_SIZE_T stride)
-{
-    const OPJ_INT32* in_even = &tiledp_col[0];
-    const OPJ_INT32* in_odd = &tiledp_col[(OPJ_SIZE_T)sn * stride];
-
-    OPJ_INT32 i;
-    OPJ_SIZE_T j;
-    VREG d1c_0, d1n_0, s1n_0, s0c_0, s0n_0;
-    VREG d1c_1, d1n_1, s1n_1, s0c_1, s0n_1;
-    const VREG two = LOAD_CST(2);
-
-    assert(len > 1);
-#if __AVX2__
-    assert(PARALLEL_COLS_53 == 16);
-    assert(VREG_INT_COUNT == 8);
-#else
-    assert(PARALLEL_COLS_53 == 8);
-    assert(VREG_INT_COUNT == 4);
-#endif
-
-    /* Note: loads of input even/odd values must be done in an unaligned */
-    /* fashion. But stores in tmp can be done with aligned store, since */
-    /* the temporary buffer is properly aligned */
-    assert((OPJ_SIZE_T)tmp % (sizeof(OPJ_INT32) * VREG_INT_COUNT) == 0);
-
-    s1n_0 = LOADU(in_even + 0);
-    s1n_1 = LOADU(in_even + VREG_INT_COUNT);
-    d1n_0 = LOADU(in_odd);
-    d1n_1 = LOADU(in_odd + VREG_INT_COUNT);
-
-    /* s0n = s1n - ((d1n + 1) >> 1); <==> */
-    /* s0n = s1n - ((d1n + d1n + 2) >> 2); */
-    s0n_0 = SUB(s1n_0, SAR(ADD3(d1n_0, d1n_0, two), 2));
-    s0n_1 = SUB(s1n_1, SAR(ADD3(d1n_1, d1n_1, two), 2));
-
-    for (i = 0, j = 1; i < (len - 3); i += 2, j++) {
-        d1c_0 = d1n_0;
-        s0c_0 = s0n_0;
-        d1c_1 = d1n_1;
-        s0c_1 = s0n_1;
-
-        s1n_0 = LOADU(in_even + j * stride);
-        s1n_1 = LOADU(in_even + j * stride + VREG_INT_COUNT);
-        d1n_0 = LOADU(in_odd + j * stride);
-        d1n_1 = LOADU(in_odd + j * stride + VREG_INT_COUNT);
-
-        /*s0n = s1n - ((d1c + d1n + 2) >> 2);*/
-        s0n_0 = SUB(s1n_0, SAR(ADD3(d1c_0, d1n_0, two), 2));
-        s0n_1 = SUB(s1n_1, SAR(ADD3(d1c_1, d1n_1, two), 2));
-
-        STORE(tmp + PARALLEL_COLS_53 * (i + 0), s0c_0);
-        STORE(tmp + PARALLEL_COLS_53 * (i + 0) + VREG_INT_COUNT, s0c_1);
-
-        /* d1c + ((s0c + s0n) >> 1) */
-        STORE(tmp + PARALLEL_COLS_53 * (i + 1) + 0,
-              ADD(d1c_0, SAR(ADD(s0c_0, s0n_0), 1)));
-        STORE(tmp + PARALLEL_COLS_53 * (i + 1) + VREG_INT_COUNT,
-              ADD(d1c_1, SAR(ADD(s0c_1, s0n_1), 1)));
-    }
-
-    STORE(tmp + PARALLEL_COLS_53 * (i + 0) + 0, s0n_0);
-    STORE(tmp + PARALLEL_COLS_53 * (i + 0) + VREG_INT_COUNT, s0n_1);
-
-    if (len & 1) {
-        VREG tmp_len_minus_1;
-        s1n_0 = LOADU(in_even + (OPJ_SIZE_T)((len - 1) / 2) * stride);
-        /* tmp_len_minus_1 = s1n - ((d1n + 1) >> 1); */
-        tmp_len_minus_1 = SUB(s1n_0, SAR(ADD3(d1n_0, d1n_0, two), 2));
-        STORE(tmp + PARALLEL_COLS_53 * (len - 1), tmp_len_minus_1);
-        /* d1n + ((s0n + tmp_len_minus_1) >> 1) */
-        STORE(tmp + PARALLEL_COLS_53 * (len - 2),
-              ADD(d1n_0, SAR(ADD(s0n_0, tmp_len_minus_1), 1)));
-
-        s1n_1 = LOADU(in_even + (OPJ_SIZE_T)((len - 1) / 2) * stride + VREG_INT_COUNT);
-        /* tmp_len_minus_1 = s1n - ((d1n + 1) >> 1); */
-        tmp_len_minus_1 = SUB(s1n_1, SAR(ADD3(d1n_1, d1n_1, two), 2));
-        STORE(tmp + PARALLEL_COLS_53 * (len - 1) + VREG_INT_COUNT,
-              tmp_len_minus_1);
-        /* d1n + ((s0n + tmp_len_minus_1) >> 1) */
-        STORE(tmp + PARALLEL_COLS_53 * (len - 2) + VREG_INT_COUNT,
-              ADD(d1n_1, SAR(ADD(s0n_1, tmp_len_minus_1), 1)));
-
-
-    } else {
-        STORE(tmp + PARALLEL_COLS_53 * (len - 1) + 0,
-              ADD(d1n_0, s0n_0));
-        STORE(tmp + PARALLEL_COLS_53 * (len - 1) + VREG_INT_COUNT,
-              ADD(d1n_1, s0n_1));
-    }
-
-    opj_idwt53_v_final_memcpy(tiledp_col, tmp, len, stride);
-}
-
-
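Worth noting between the two multi-column variants: both assert that tmp is register-aligned so the aligned STORE()s are legal. Inside the library that buffer comes from opj_aligned_32_malloc; a stand-alone sketch of an equivalent allocation (hypothetical helper using C11 aligned_alloc, which requires the size to be a multiple of the alignment):

#include <stdint.h>
#include <stdlib.h>

/* Hypothetical: allocate the PARALLEL_COLS_53 * len int32 scratch buffer
 * with 32-byte alignment (enough for both the SSE2 and AVX2 builds). */
static int32_t *demo_alloc_tmp(size_t parallel_cols, size_t len)
{
    size_t n = parallel_cols * len * sizeof(int32_t);
    return (int32_t *)aligned_alloc(32, (n + 31u) & ~(size_t)31u); /* round size up */
}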
-/** Vertical inverse 5x3 wavelet transform for 8 columns in SSE2, or
- * 16 in AVX2, when top-most pixel is on odd coordinate */
-static void opj_idwt53_v_cas1_mcols_SSE2_OR_AVX2(
-    OPJ_INT32* tmp,
-    const OPJ_INT32 sn,
-    const OPJ_INT32 len,
-    OPJ_INT32* tiledp_col,
-    const OPJ_SIZE_T stride)
-{
-    OPJ_INT32 i;
-    OPJ_SIZE_T j;
-
-    VREG s1_0, s2_0, dc_0, dn_0;
-    VREG s1_1, s2_1, dc_1, dn_1;
-    const VREG two = LOAD_CST(2);
-
-    const OPJ_INT32* in_even = &tiledp_col[(OPJ_SIZE_T)sn * stride];
-    const OPJ_INT32* in_odd = &tiledp_col[0];
-
-    assert(len > 2);
-#if __AVX2__
-    assert(PARALLEL_COLS_53 == 16);
-    assert(VREG_INT_COUNT == 8);
-#else
-    assert(PARALLEL_COLS_53 == 8);
-    assert(VREG_INT_COUNT == 4);
-#endif
-
-    /* Note: loads of input even/odd values must be done in an unaligned */
-    /* fashion. But stores in tmp can be done with aligned store, since */
-    /* the temporary buffer is properly aligned */
-    assert((OPJ_SIZE_T)tmp % (sizeof(OPJ_INT32) * VREG_INT_COUNT) == 0);
-
-    s1_0 = LOADU(in_even + stride);
-    /* in_odd[0] - ((in_even[0] + s1 + 2) >> 2); */
-    dc_0 = SUB(LOADU(in_odd + 0),
-               SAR(ADD3(LOADU(in_even + 0), s1_0, two), 2));
-    STORE(tmp + PARALLEL_COLS_53 * 0, ADD(LOADU(in_even + 0), dc_0));
-
-    s1_1 = LOADU(in_even + stride + VREG_INT_COUNT);
-    /* in_odd[0] - ((in_even[0] + s1 + 2) >> 2); */
-    dc_1 = SUB(LOADU(in_odd + VREG_INT_COUNT),
-               SAR(ADD3(LOADU(in_even + VREG_INT_COUNT), s1_1, two), 2));
-    STORE(tmp + PARALLEL_COLS_53 * 0 + VREG_INT_COUNT,
-          ADD(LOADU(in_even + VREG_INT_COUNT), dc_1));
-
-    for (i = 1, j = 1; i < (len - 2 - !(len & 1)); i += 2, j++) {
-
-        s2_0 = LOADU(in_even + (j + 1) * stride);
-        s2_1 = LOADU(in_even + (j + 1) * stride + VREG_INT_COUNT);
-
-        /* dn = in_odd[j * stride] - ((s1 + s2 + 2) >> 2); */
-        dn_0 = SUB(LOADU(in_odd + j * stride),
-                   SAR(ADD3(s1_0, s2_0, two), 2));
-        dn_1 = SUB(LOADU(in_odd + j * stride + VREG_INT_COUNT),
-                   SAR(ADD3(s1_1, s2_1, two), 2));
-
-        STORE(tmp + PARALLEL_COLS_53 * i, dc_0);
-        STORE(tmp + PARALLEL_COLS_53 * i + VREG_INT_COUNT, dc_1);
-
-        /* tmp[i + 1] = s1 + ((dn + dc) >> 1); */
-        STORE(tmp + PARALLEL_COLS_53 * (i + 1) + 0,
-              ADD(s1_0, SAR(ADD(dn_0, dc_0), 1)));
-        STORE(tmp + PARALLEL_COLS_53 * (i + 1) + VREG_INT_COUNT,
-              ADD(s1_1, SAR(ADD(dn_1, dc_1), 1)));
-
-        dc_0 = dn_0;
-        s1_0 = s2_0;
-        dc_1 = dn_1;
-        s1_1 = s2_1;
-    }
-    STORE(tmp + PARALLEL_COLS_53 * i, dc_0);
-    STORE(tmp + PARALLEL_COLS_53 * i + VREG_INT_COUNT, dc_1);
-
-    if (!(len & 1)) {
-        /*dn = in_odd[(len / 2 - 1) * stride] - ((s1 + 1) >> 1); */
-        dn_0 = SUB(LOADU(in_odd + (OPJ_SIZE_T)(len / 2 - 1) * stride),
-                   SAR(ADD3(s1_0, s1_0, two), 2));
-        dn_1 = SUB(LOADU(in_odd + (OPJ_SIZE_T)(len / 2 - 1) * stride + VREG_INT_COUNT),
-                   SAR(ADD3(s1_1, s1_1, two), 2));
-
-        /* tmp[len - 2] = s1 + ((dn + dc) >> 1); */
-        STORE(tmp + PARALLEL_COLS_53 * (len - 2) + 0,
-              ADD(s1_0, SAR(ADD(dn_0, dc_0), 1)));
-        STORE(tmp + PARALLEL_COLS_53 * (len - 2) + VREG_INT_COUNT,
-              ADD(s1_1, SAR(ADD(dn_1, dc_1), 1)));
-
-        STORE(tmp + PARALLEL_COLS_53 * (len - 1) + 0, dn_0);
-        STORE(tmp + PARALLEL_COLS_53 * (len - 1) + VREG_INT_COUNT, dn_1);
-    } else {
-        STORE(tmp + PARALLEL_COLS_53 * (len - 1) + 0, ADD(s1_0, dc_0));
-        STORE(tmp + PARALLEL_COLS_53 * (len - 1) + VREG_INT_COUNT,
-              ADD(s1_1, dc_1));
-    }
-
-    opj_idwt53_v_final_memcpy(tiledp_col, tmp, len, stride);
-}
-
-#undef VREG
-#undef LOAD_CST
-#undef LOADU
-#undef LOAD
-#undef STORE
-#undef STOREU
-#undef ADD
-#undef ADD3
-#undef SUB
-#undef SAR - -#endif /* (defined(__SSE2__) || defined(__AVX2__)) && !defined(STANDARD_SLOW_VERSION) */ - -#if !defined(STANDARD_SLOW_VERSION) -/** Vertical inverse 5x3 wavelet transform for one column, when top-most - * pixel is on even coordinate */ -static void opj_idwt3_v_cas0(OPJ_INT32* tmp, - const OPJ_INT32 sn, - const OPJ_INT32 len, - OPJ_INT32* tiledp_col, - const OPJ_SIZE_T stride) -{ - OPJ_INT32 i, j; - OPJ_INT32 d1c, d1n, s1n, s0c, s0n; - - assert(len > 1); - - /* Performs lifting in one single iteration. Saves memory */ - /* accesses and explicit interleaving. */ - - s1n = tiledp_col[0]; - d1n = tiledp_col[(OPJ_SIZE_T)sn * stride]; - s0n = s1n - ((d1n + 1) >> 1); - - for (i = 0, j = 0; i < (len - 3); i += 2, j++) { - d1c = d1n; - s0c = s0n; - - s1n = tiledp_col[(OPJ_SIZE_T)(j + 1) * stride]; - d1n = tiledp_col[(OPJ_SIZE_T)(sn + j + 1) * stride]; - - s0n = s1n - ((d1c + d1n + 2) >> 2); - - tmp[i ] = s0c; - tmp[i + 1] = d1c + ((s0c + s0n) >> 1); - } - - tmp[i] = s0n; - - if (len & 1) { - tmp[len - 1] = - tiledp_col[(OPJ_SIZE_T)((len - 1) / 2) * stride] - - ((d1n + 1) >> 1); - tmp[len - 2] = d1n + ((s0n + tmp[len - 1]) >> 1); - } else { - tmp[len - 1] = d1n + s0n; - } - - for (i = 0; i < len; ++i) { - tiledp_col[(OPJ_SIZE_T)i * stride] = tmp[i]; - } -} - - -/** Vertical inverse 5x3 wavelet transform for one column, when top-most - * pixel is on odd coordinate */ -static void opj_idwt3_v_cas1(OPJ_INT32* tmp, - const OPJ_INT32 sn, - const OPJ_INT32 len, - OPJ_INT32* tiledp_col, - const OPJ_SIZE_T stride) -{ - OPJ_INT32 i, j; - OPJ_INT32 s1, s2, dc, dn; - const OPJ_INT32* in_even = &tiledp_col[(OPJ_SIZE_T)sn * stride]; - const OPJ_INT32* in_odd = &tiledp_col[0]; - - assert(len > 2); - - /* Performs lifting in one single iteration. Saves memory */ - /* accesses and explicit interleaving. */ - - s1 = in_even[stride]; - dc = in_odd[0] - ((in_even[0] + s1 + 2) >> 2); - tmp[0] = in_even[0] + dc; - for (i = 1, j = 1; i < (len - 2 - !(len & 1)); i += 2, j++) { - - s2 = in_even[(OPJ_SIZE_T)(j + 1) * stride]; - - dn = in_odd[(OPJ_SIZE_T)j * stride] - ((s1 + s2 + 2) >> 2); - tmp[i ] = dc; - tmp[i + 1] = s1 + ((dn + dc) >> 1); - - dc = dn; - s1 = s2; - } - tmp[i] = dc; - if (!(len & 1)) { - dn = in_odd[(OPJ_SIZE_T)(len / 2 - 1) * stride] - ((s1 + 1) >> 1); - tmp[len - 2] = s1 + ((dn + dc) >> 1); - tmp[len - 1] = dn; - } else { - tmp[len - 1] = s1 + dc; - } - - for (i = 0; i < len; ++i) { - tiledp_col[(OPJ_SIZE_T)i * stride] = tmp[i]; - } -} -#endif /* !defined(STANDARD_SLOW_VERSION) */ - -/* */ -/* Inverse vertical 5-3 wavelet transform in 1-D for several columns. 
*/
-/* */
-/* Performs interleave, inverse wavelet transform and copy back to buffer */
-static void opj_idwt53_v(const opj_dwt_t *dwt,
-                         OPJ_INT32* tiledp_col,
-                         OPJ_SIZE_T stride,
-                         OPJ_INT32 nb_cols)
-{
-#ifdef STANDARD_SLOW_VERSION
-    /* For documentation purposes */
-    OPJ_INT32 k, c;
-    for (c = 0; c < nb_cols; c ++) {
-        opj_dwt_interleave_v(dwt, tiledp_col + c, stride);
-        opj_dwt_decode_1(dwt);
-        for (k = 0; k < dwt->sn + dwt->dn; ++k) {
-            tiledp_col[c + k * stride] = dwt->mem[k];
-        }
-    }
-#else
-    const OPJ_INT32 sn = dwt->sn;
-    const OPJ_INT32 len = sn + dwt->dn;
-    if (dwt->cas == 0) {
-        /* If len == 1, unmodified value */
-
-#if (defined(__SSE2__) || defined(__AVX2__))
-        if (len > 1 && nb_cols == PARALLEL_COLS_53) {
-            /* Same as below general case, except that thanks to SSE2/AVX2 */
-            /* we can efficiently process 8/16 columns in parallel */
-            opj_idwt53_v_cas0_mcols_SSE2_OR_AVX2(dwt->mem, sn, len, tiledp_col, stride);
-            return;
-        }
-#endif
-        if (len > 1) {
-            OPJ_INT32 c;
-            for (c = 0; c < nb_cols; c++, tiledp_col++) {
-                opj_idwt3_v_cas0(dwt->mem, sn, len, tiledp_col, stride);
-            }
-            return;
-        }
-    } else {
-        if (len == 1) {
-            OPJ_INT32 c;
-            for (c = 0; c < nb_cols; c++, tiledp_col++) {
-                tiledp_col[0] /= 2;
-            }
-            return;
-        }
-
-        if (len == 2) {
-            OPJ_INT32 c;
-            OPJ_INT32* out = dwt->mem;
-            for (c = 0; c < nb_cols; c++, tiledp_col++) {
-                OPJ_INT32 i;
-                const OPJ_INT32* in_even = &tiledp_col[(OPJ_SIZE_T)sn * stride];
-                const OPJ_INT32* in_odd = &tiledp_col[0];
-
-                out[1] = in_odd[0] - ((in_even[0] + 1) >> 1);
-                out[0] = in_even[0] + out[1];
-
-                for (i = 0; i < len; ++i) {
-                    tiledp_col[(OPJ_SIZE_T)i * stride] = out[i];
-                }
-            }
-
-            return;
-        }
-
-#if (defined(__SSE2__) || defined(__AVX2__))
-        if (len > 2 && nb_cols == PARALLEL_COLS_53) {
-            /* Same as below general case, except that thanks to SSE2/AVX2 */
-            /* we can efficiently process 8/16 columns in parallel */
-            opj_idwt53_v_cas1_mcols_SSE2_OR_AVX2(dwt->mem, sn, len, tiledp_col, stride);
-            return;
-        }
-#endif
-        if (len > 2) {
-            OPJ_INT32 c;
-            for (c = 0; c < nb_cols; c++, tiledp_col++) {
-                opj_idwt3_v_cas1(dwt->mem, sn, len, tiledp_col, stride);
-            }
-            return;
-        }
-    }
-#endif
-}
-
-
-/* */
-/* Forward 9-7 wavelet transform in 1-D.
*/ -/* */ -static void opj_dwt_encode_1_real(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn, - OPJ_INT32 cas) -{ - OPJ_INT32 i; - if (!cas) { - if ((dn > 0) || (sn > 1)) { /* NEW : CASE ONE ELEMENT */ - for (i = 0; i < dn; i++) { - OPJ_D(i) -= opj_int_fix_mul(OPJ_S_(i) + OPJ_S_(i + 1), 12993); - } - for (i = 0; i < sn; i++) { - OPJ_S(i) -= opj_int_fix_mul(OPJ_D_(i - 1) + OPJ_D_(i), 434); - } - for (i = 0; i < dn; i++) { - OPJ_D(i) += opj_int_fix_mul(OPJ_S_(i) + OPJ_S_(i + 1), 7233); - } - for (i = 0; i < sn; i++) { - OPJ_S(i) += opj_int_fix_mul(OPJ_D_(i - 1) + OPJ_D_(i), 3633); - } - for (i = 0; i < dn; i++) { - OPJ_D(i) = opj_int_fix_mul(OPJ_D(i), 5038); /*5038 */ - } - for (i = 0; i < sn; i++) { - OPJ_S(i) = opj_int_fix_mul(OPJ_S(i), 6659); /*6660 */ - } - } - } else { - if ((sn > 0) || (dn > 1)) { /* NEW : CASE ONE ELEMENT */ - for (i = 0; i < dn; i++) { - OPJ_S(i) -= opj_int_fix_mul(OPJ_DD_(i) + OPJ_DD_(i - 1), 12993); - } - for (i = 0; i < sn; i++) { - OPJ_D(i) -= opj_int_fix_mul(OPJ_SS_(i) + OPJ_SS_(i + 1), 434); - } - for (i = 0; i < dn; i++) { - OPJ_S(i) += opj_int_fix_mul(OPJ_DD_(i) + OPJ_DD_(i - 1), 7233); - } - for (i = 0; i < sn; i++) { - OPJ_D(i) += opj_int_fix_mul(OPJ_SS_(i) + OPJ_SS_(i + 1), 3633); - } - for (i = 0; i < dn; i++) { - OPJ_S(i) = opj_int_fix_mul(OPJ_S(i), 5038); /*5038 */ - } - for (i = 0; i < sn; i++) { - OPJ_D(i) = opj_int_fix_mul(OPJ_D(i), 6659); /*6660 */ - } - } - } -} - -static void opj_dwt_encode_stepsize(OPJ_INT32 stepsize, OPJ_INT32 numbps, - opj_stepsize_t *bandno_stepsize) -{ - OPJ_INT32 p, n; - p = opj_int_floorlog2(stepsize) - 13; - n = 11 - opj_int_floorlog2(stepsize); - bandno_stepsize->mant = (n < 0 ? stepsize >> -n : stepsize << n) & 0x7ff; - bandno_stepsize->expn = numbps - p; -} - -/* -========================================================== - DWT interface -========================================================== -*/ - - -/* */ -/* Forward 5-3 wavelet transform in 2-D. */ -/* */ -static INLINE OPJ_BOOL opj_dwt_encode_procedure(opj_tcd_tilecomp_t * tilec, - void (*p_function)(OPJ_INT32 *, OPJ_INT32, OPJ_INT32, OPJ_INT32)) -{ - OPJ_INT32 i, j, k; - OPJ_INT32 *a = 00; - OPJ_INT32 *aj = 00; - OPJ_INT32 *bj = 00; - OPJ_INT32 w, l; - - OPJ_INT32 rw; /* width of the resolution level computed */ - OPJ_INT32 rh; /* height of the resolution level computed */ - OPJ_SIZE_T l_data_size; - - opj_tcd_resolution_t * l_cur_res = 0; - opj_tcd_resolution_t * l_last_res = 0; - - w = tilec->x1 - tilec->x0; - l = (OPJ_INT32)tilec->numresolutions - 1; - a = tilec->data; - - l_cur_res = tilec->resolutions + l; - l_last_res = l_cur_res - 1; - - l_data_size = opj_dwt_max_resolution(tilec->resolutions, tilec->numresolutions); - /* overflow check */ - if (l_data_size > (SIZE_MAX / sizeof(OPJ_INT32))) { - /* FIXME event manager error callback */ - return OPJ_FALSE; - } - l_data_size *= sizeof(OPJ_INT32); - bj = (OPJ_INT32*)opj_malloc(l_data_size); - /* l_data_size is equal to 0 when numresolutions == 1 but bj is not used */ - /* in that case, so do not error out */ - if (l_data_size != 0 && ! 
bj) { - return OPJ_FALSE; - } - i = l; - - while (i--) { - OPJ_INT32 rw1; /* width of the resolution level once lower than computed one */ - OPJ_INT32 rh1; /* height of the resolution level once lower than computed one */ - OPJ_INT32 cas_col; /* 0 = non inversion on horizontal filtering 1 = inversion between low-pass and high-pass filtering */ - OPJ_INT32 cas_row; /* 0 = non inversion on vertical filtering 1 = inversion between low-pass and high-pass filtering */ - OPJ_INT32 dn, sn; - - rw = l_cur_res->x1 - l_cur_res->x0; - rh = l_cur_res->y1 - l_cur_res->y0; - rw1 = l_last_res->x1 - l_last_res->x0; - rh1 = l_last_res->y1 - l_last_res->y0; - - cas_row = l_cur_res->x0 & 1; - cas_col = l_cur_res->y0 & 1; - - sn = rh1; - dn = rh - rh1; - for (j = 0; j < rw; ++j) { - aj = a + j; - for (k = 0; k < rh; ++k) { - bj[k] = aj[k * w]; - } - - (*p_function)(bj, dn, sn, cas_col); - - opj_dwt_deinterleave_v(bj, aj, dn, sn, w, cas_col); - } - - sn = rw1; - dn = rw - rw1; - - for (j = 0; j < rh; j++) { - aj = a + j * w; - for (k = 0; k < rw; k++) { - bj[k] = aj[k]; - } - (*p_function)(bj, dn, sn, cas_row); - opj_dwt_deinterleave_h(bj, aj, dn, sn, cas_row); - } - - l_cur_res = l_last_res; - - --l_last_res; - } - - opj_free(bj); - return OPJ_TRUE; -} - -/* Forward 5-3 wavelet transform in 2-D. */ -/* */ -OPJ_BOOL opj_dwt_encode(opj_tcd_tilecomp_t * tilec) -{ - return opj_dwt_encode_procedure(tilec, opj_dwt_encode_1); -} - -/* */ -/* Inverse 5-3 wavelet transform in 2-D. */ -/* */ -OPJ_BOOL opj_dwt_decode(opj_tcd_t *p_tcd, opj_tcd_tilecomp_t* tilec, - OPJ_UINT32 numres) -{ - if (p_tcd->whole_tile_decoding) { - return opj_dwt_decode_tile(p_tcd->thread_pool, tilec, numres); - } else { - return opj_dwt_decode_partial_tile(tilec, numres); - } -} - - -/* */ -/* Get gain of 5-3 wavelet transform. */ -/* */ -OPJ_UINT32 opj_dwt_getgain(OPJ_UINT32 orient) -{ - if (orient == 0) { - return 0; - } - if (orient == 1 || orient == 2) { - return 1; - } - return 2; -} - -/* */ -/* Get norm of 5-3 wavelet. */ -/* */ -OPJ_FLOAT64 opj_dwt_getnorm(OPJ_UINT32 level, OPJ_UINT32 orient) -{ - /* FIXME ! This is just a band-aid to avoid a buffer overflow */ - /* but the array should really be extended up to 33 resolution levels */ - /* See https://github.com/uclouvain/openjpeg/issues/493 */ - if (orient == 0 && level >= 10) { - level = 9; - } else if (orient > 0 && level >= 9) { - level = 8; - } - return opj_dwt_norms[orient][level]; -} - -/* */ -/* Forward 9-7 wavelet transform in 2-D. */ -/* */ -OPJ_BOOL opj_dwt_encode_real(opj_tcd_tilecomp_t * tilec) -{ - return opj_dwt_encode_procedure(tilec, opj_dwt_encode_1_real); -} - -/* */ -/* Get gain of 9-7 wavelet transform. */ -/* */ -OPJ_UINT32 opj_dwt_getgain_real(OPJ_UINT32 orient) -{ - (void)orient; - return 0; -} - -/* */ -/* Get norm of 9-7 wavelet. */ -/* */ -OPJ_FLOAT64 opj_dwt_getnorm_real(OPJ_UINT32 level, OPJ_UINT32 orient) -{ - /* FIXME ! This is just a band-aid to avoid a buffer overflow */ - /* but the array should really be extended up to 33 resolution levels */ - /* See https://github.com/uclouvain/openjpeg/issues/493 */ - if (orient == 0 && level >= 10) { - level = 9; - } else if (orient > 0 && level >= 9) { - level = 8; - } - return opj_dwt_norms_real[orient][level]; -} - -void opj_dwt_calc_explicit_stepsizes(opj_tccp_t * tccp, OPJ_UINT32 prec) -{ - OPJ_UINT32 numbands, bandno; - numbands = 3 * tccp->numresolutions - 2; - for (bandno = 0; bandno < numbands; bandno++) { - OPJ_FLOAT64 stepsize; - OPJ_UINT32 resno, level, orient, gain; - - resno = (bandno == 0) ? 
0 : ((bandno - 1) / 3 + 1); - orient = (bandno == 0) ? 0 : ((bandno - 1) % 3 + 1); - level = tccp->numresolutions - 1 - resno; - gain = (tccp->qmfbid == 0) ? 0 : ((orient == 0) ? 0 : (((orient == 1) || - (orient == 2)) ? 1 : 2)); - if (tccp->qntsty == J2K_CCP_QNTSTY_NOQNT) { - stepsize = 1.0; - } else { - OPJ_FLOAT64 norm = opj_dwt_norms_real[orient][level]; - stepsize = (1 << (gain)) / norm; - } - opj_dwt_encode_stepsize((OPJ_INT32) floor(stepsize * 8192.0), - (OPJ_INT32)(prec + gain), &tccp->stepsizes[bandno]); - } -} - -/* */ -/* Determine maximum computed resolution level for inverse wavelet transform */ -/* */ -static OPJ_UINT32 opj_dwt_max_resolution(opj_tcd_resolution_t* OPJ_RESTRICT r, - OPJ_UINT32 i) -{ - OPJ_UINT32 mr = 0; - OPJ_UINT32 w; - while (--i) { - ++r; - if (mr < (w = (OPJ_UINT32)(r->x1 - r->x0))) { - mr = w ; - } - if (mr < (w = (OPJ_UINT32)(r->y1 - r->y0))) { - mr = w ; - } - } - return mr ; -} - -typedef struct { - opj_dwt_t h; - OPJ_UINT32 rw; - OPJ_UINT32 w; - OPJ_INT32 * OPJ_RESTRICT tiledp; - OPJ_UINT32 min_j; - OPJ_UINT32 max_j; -} opj_dwd_decode_h_job_t; - -static void opj_dwt_decode_h_func(void* user_data, opj_tls_t* tls) -{ - OPJ_UINT32 j; - opj_dwd_decode_h_job_t* job; - (void)tls; - - job = (opj_dwd_decode_h_job_t*)user_data; - for (j = job->min_j; j < job->max_j; j++) { - opj_idwt53_h(&job->h, &job->tiledp[j * job->w]); - } - - opj_aligned_free(job->h.mem); - opj_free(job); -} - -typedef struct { - opj_dwt_t v; - OPJ_UINT32 rh; - OPJ_UINT32 w; - OPJ_INT32 * OPJ_RESTRICT tiledp; - OPJ_UINT32 min_j; - OPJ_UINT32 max_j; -} opj_dwd_decode_v_job_t; - -static void opj_dwt_decode_v_func(void* user_data, opj_tls_t* tls) -{ - OPJ_UINT32 j; - opj_dwd_decode_v_job_t* job; - (void)tls; - - job = (opj_dwd_decode_v_job_t*)user_data; - for (j = job->min_j; j + PARALLEL_COLS_53 <= job->max_j; - j += PARALLEL_COLS_53) { - opj_idwt53_v(&job->v, &job->tiledp[j], (OPJ_SIZE_T)job->w, - PARALLEL_COLS_53); - } - if (j < job->max_j) - opj_idwt53_v(&job->v, &job->tiledp[j], (OPJ_SIZE_T)job->w, - (OPJ_INT32)(job->max_j - j)); - - opj_aligned_free(job->v.mem); - opj_free(job); -} - - -/* */ -/* Inverse wavelet transform in 2-D. */ -/* */ -static OPJ_BOOL opj_dwt_decode_tile(opj_thread_pool_t* tp, - opj_tcd_tilecomp_t* tilec, OPJ_UINT32 numres) -{ - opj_dwt_t h; - opj_dwt_t v; - - opj_tcd_resolution_t* tr = tilec->resolutions; - - OPJ_UINT32 rw = (OPJ_UINT32)(tr->x1 - - tr->x0); /* width of the resolution level computed */ - OPJ_UINT32 rh = (OPJ_UINT32)(tr->y1 - - tr->y0); /* height of the resolution level computed */ - - OPJ_UINT32 w = (OPJ_UINT32)(tilec->resolutions[tilec->minimum_num_resolutions - - 1].x1 - - tilec->resolutions[tilec->minimum_num_resolutions - 1].x0); - OPJ_SIZE_T h_mem_size; - int num_threads; - - if (numres == 1U) { - return OPJ_TRUE; - } - num_threads = opj_thread_pool_get_thread_count(tp); - h_mem_size = opj_dwt_max_resolution(tr, numres); - /* overflow check */ - if (h_mem_size > (SIZE_MAX / PARALLEL_COLS_53 / sizeof(OPJ_INT32))) { - /* FIXME event manager error callback */ - return OPJ_FALSE; - } - /* We need PARALLEL_COLS_53 times the height of the array, */ - /* since for the vertical pass */ - /* we process PARALLEL_COLS_53 columns at a time */ - h_mem_size *= PARALLEL_COLS_53 * sizeof(OPJ_INT32); - h.mem = (OPJ_INT32*)opj_aligned_32_malloc(h_mem_size); - if (! 
h.mem) {
-        /* FIXME event manager error callback */
-        return OPJ_FALSE;
-    }
-
-    v.mem = h.mem;
-
-    while (--numres) {
-        OPJ_INT32 * OPJ_RESTRICT tiledp = tilec->data;
-        OPJ_UINT32 j;
-
-        ++tr;
-        h.sn = (OPJ_INT32)rw;
-        v.sn = (OPJ_INT32)rh;
-
-        rw = (OPJ_UINT32)(tr->x1 - tr->x0);
-        rh = (OPJ_UINT32)(tr->y1 - tr->y0);
-
-        h.dn = (OPJ_INT32)(rw - (OPJ_UINT32)h.sn);
-        h.cas = tr->x0 % 2;
-
-        if (num_threads <= 1 || rh <= 1) {
-            for (j = 0; j < rh; ++j) {
-                opj_idwt53_h(&h, &tiledp[(OPJ_SIZE_T)j * w]);
-            }
-        } else {
-            OPJ_UINT32 num_jobs = (OPJ_UINT32)num_threads;
-            OPJ_UINT32 step_j;
-
-            if (rh < num_jobs) {
-                num_jobs = rh;
-            }
-            step_j = (rh / num_jobs);
-
-            for (j = 0; j < num_jobs; j++) {
-                opj_dwd_decode_h_job_t* job;
-
-                job = (opj_dwd_decode_h_job_t*) opj_malloc(sizeof(opj_dwd_decode_h_job_t));
-                if (!job) {
-                    /* It would be nice to fall back to the single-thread case, but */
-                    /* unfortunately some jobs may be launched and have modified */
-                    /* tiledp, so it is not practical to recover from that error */
-                    /* FIXME event manager error callback */
-                    opj_thread_pool_wait_completion(tp, 0);
-                    opj_aligned_free(h.mem);
-                    return OPJ_FALSE;
-                }
-                job->h = h;
-                job->rw = rw;
-                job->w = w;
-                job->tiledp = tiledp;
-                job->min_j = j * step_j;
-                job->max_j = (j + 1U) * step_j; /* this can overflow */
-                if (j == (num_jobs - 1U)) {  /* this will take care of the overflow */
-                    job->max_j = rh;
-                }
-                job->h.mem = (OPJ_INT32*)opj_aligned_32_malloc(h_mem_size);
-                if (!job->h.mem) {
-                    /* FIXME event manager error callback */
-                    opj_thread_pool_wait_completion(tp, 0);
-                    opj_free(job);
-                    opj_aligned_free(h.mem);
-                    return OPJ_FALSE;
-                }
-                opj_thread_pool_submit_job(tp, opj_dwt_decode_h_func, job);
-            }
-            opj_thread_pool_wait_completion(tp, 0);
-        }
-
-        v.dn = (OPJ_INT32)(rh - (OPJ_UINT32)v.sn);
-        v.cas = tr->y0 % 2;
-
-        if (num_threads <= 1 || rw <= 1) {
-            for (j = 0; j + PARALLEL_COLS_53 <= rw;
-                    j += PARALLEL_COLS_53) {
-                opj_idwt53_v(&v, &tiledp[j], (OPJ_SIZE_T)w, PARALLEL_COLS_53);
-            }
-            if (j < rw) {
-                opj_idwt53_v(&v, &tiledp[j], (OPJ_SIZE_T)w, (OPJ_INT32)(rw - j));
-            }
-        } else {
-            OPJ_UINT32 num_jobs = (OPJ_UINT32)num_threads;
-            OPJ_UINT32 step_j;
-
-            if (rw < num_jobs) {
-                num_jobs = rw;
-            }
-            step_j = (rw / num_jobs);
-
-            for (j = 0; j < num_jobs; j++) {
-                opj_dwd_decode_v_job_t* job;
-
-                job = (opj_dwd_decode_v_job_t*) opj_malloc(sizeof(opj_dwd_decode_v_job_t));
-                if (!job) {
-                    /* It would be nice to fall back to the single-thread case, but */
-                    /* unfortunately some jobs may be launched and have modified */
-                    /* tiledp, so it is not practical to recover from that error */
-                    /* FIXME event manager error callback */
-                    opj_thread_pool_wait_completion(tp, 0);
-                    opj_aligned_free(v.mem);
-                    return OPJ_FALSE;
-                }
-                job->v = v;
-                job->rh = rh;
-                job->w = w;
-                job->tiledp = tiledp;
-                job->min_j = j * step_j;
-                job->max_j = (j + 1U) * step_j; /* this can overflow */
-                if (j == (num_jobs - 1U)) {  /* this will take care of the overflow */
-                    job->max_j = rw;
-                }
-                job->v.mem = (OPJ_INT32*)opj_aligned_32_malloc(h_mem_size);
-                if (!job->v.mem) {
-                    /* FIXME event manager error callback */
-                    opj_thread_pool_wait_completion(tp, 0);
-                    opj_free(job);
-                    opj_aligned_free(v.mem);
-                    return OPJ_FALSE;
-                }
-                opj_thread_pool_submit_job(tp, opj_dwt_decode_v_func, job);
-            }
-            opj_thread_pool_wait_completion(tp, 0);
-        }
-    }
-    opj_aligned_free(h.mem);
-    return OPJ_TRUE;
-}
-
-static void opj_dwt_interleave_partial_h(OPJ_INT32 *dest,
-        OPJ_INT32 cas,
-        opj_sparse_array_int32_t* sa,
-        OPJ_UINT32 sa_line,
-        OPJ_UINT32 sn,
-        OPJ_UINT32
win_l_x0, - OPJ_UINT32 win_l_x1, - OPJ_UINT32 win_h_x0, - OPJ_UINT32 win_h_x1) -{ - OPJ_BOOL ret; - ret = opj_sparse_array_int32_read(sa, - win_l_x0, sa_line, - win_l_x1, sa_line + 1, - dest + cas + 2 * win_l_x0, - 2, 0, OPJ_TRUE); - assert(ret); - ret = opj_sparse_array_int32_read(sa, - sn + win_h_x0, sa_line, - sn + win_h_x1, sa_line + 1, - dest + 1 - cas + 2 * win_h_x0, - 2, 0, OPJ_TRUE); - assert(ret); - OPJ_UNUSED(ret); -} - - -static void opj_dwt_interleave_partial_v(OPJ_INT32 *dest, - OPJ_INT32 cas, - opj_sparse_array_int32_t* sa, - OPJ_UINT32 sa_col, - OPJ_UINT32 nb_cols, - OPJ_UINT32 sn, - OPJ_UINT32 win_l_y0, - OPJ_UINT32 win_l_y1, - OPJ_UINT32 win_h_y0, - OPJ_UINT32 win_h_y1) -{ - OPJ_BOOL ret; - ret = opj_sparse_array_int32_read(sa, - sa_col, win_l_y0, - sa_col + nb_cols, win_l_y1, - dest + cas * 4 + 2 * 4 * win_l_y0, - 1, 2 * 4, OPJ_TRUE); - assert(ret); - ret = opj_sparse_array_int32_read(sa, - sa_col, sn + win_h_y0, - sa_col + nb_cols, sn + win_h_y1, - dest + (1 - cas) * 4 + 2 * 4 * win_h_y0, - 1, 2 * 4, OPJ_TRUE); - assert(ret); - OPJ_UNUSED(ret); -} - -static void opj_dwt_decode_partial_1(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn, - OPJ_INT32 cas, - OPJ_INT32 win_l_x0, - OPJ_INT32 win_l_x1, - OPJ_INT32 win_h_x0, - OPJ_INT32 win_h_x1) -{ - OPJ_INT32 i; - - if (!cas) { - if ((dn > 0) || (sn > 1)) { /* NEW : CASE ONE ELEMENT */ - - /* Naive version is : - for (i = win_l_x0; i < i_max; i++) { - OPJ_S(i) -= (OPJ_D_(i - 1) + OPJ_D_(i) + 2) >> 2; - } - for (i = win_h_x0; i < win_h_x1; i++) { - OPJ_D(i) += (OPJ_S_(i) + OPJ_S_(i + 1)) >> 1; - } - but the compiler doesn't manage to unroll it to avoid bound - checking in OPJ_S_ and OPJ_D_ macros - */ - - i = win_l_x0; - if (i < win_l_x1) { - OPJ_INT32 i_max; - - /* Left-most case */ - OPJ_S(i) -= (OPJ_D_(i - 1) + OPJ_D_(i) + 2) >> 2; - i ++; - - i_max = win_l_x1; - if (i_max > dn) { - i_max = dn; - } - for (; i < i_max; i++) { - /* No bound checking */ - OPJ_S(i) -= (OPJ_D(i - 1) + OPJ_D(i) + 2) >> 2; - } - for (; i < win_l_x1; i++) { - /* Right-most case */ - OPJ_S(i) -= (OPJ_D_(i - 1) + OPJ_D_(i) + 2) >> 2; - } - } - - i = win_h_x0; - if (i < win_h_x1) { - OPJ_INT32 i_max = win_h_x1; - if (i_max >= sn) { - i_max = sn - 1; - } - for (; i < i_max; i++) { - /* No bound checking */ - OPJ_D(i) += (OPJ_S(i) + OPJ_S(i + 1)) >> 1; - } - for (; i < win_h_x1; i++) { - /* Right-most case */ - OPJ_D(i) += (OPJ_S_(i) + OPJ_S_(i + 1)) >> 1; - } - } - } - } else { - if (!sn && dn == 1) { /* NEW : CASE ONE ELEMENT */ - OPJ_S(0) /= 2; - } else { - for (i = win_l_x0; i < win_l_x1; i++) { - OPJ_D(i) -= (OPJ_SS_(i) + OPJ_SS_(i + 1) + 2) >> 2; - } - for (i = win_h_x0; i < win_h_x1; i++) { - OPJ_S(i) += (OPJ_DD_(i) + OPJ_DD_(i - 1)) >> 1; - } - } - } -} - -#define OPJ_S_off(i,off) a[(OPJ_UINT32)(i)*2*4+off] -#define OPJ_D_off(i,off) a[(1+(OPJ_UINT32)(i)*2)*4+off] -#define OPJ_S__off(i,off) ((i)<0?OPJ_S_off(0,off):((i)>=sn?OPJ_S_off(sn-1,off):OPJ_S_off(i,off))) -#define OPJ_D__off(i,off) ((i)<0?OPJ_D_off(0,off):((i)>=dn?OPJ_D_off(dn-1,off):OPJ_D_off(i,off))) -#define OPJ_SS__off(i,off) ((i)<0?OPJ_S_off(0,off):((i)>=dn?OPJ_S_off(dn-1,off):OPJ_S_off(i,off))) -#define OPJ_DD__off(i,off) ((i)<0?OPJ_D_off(0,off):((i)>=sn?OPJ_D_off(sn-1,off):OPJ_D_off(i,off))) - -static void opj_dwt_decode_partial_1_parallel(OPJ_INT32 *a, - OPJ_UINT32 nb_cols, - OPJ_INT32 dn, OPJ_INT32 sn, - OPJ_INT32 cas, - OPJ_INT32 win_l_x0, - OPJ_INT32 win_l_x1, - OPJ_INT32 win_h_x0, - OPJ_INT32 win_h_x1) -{ - OPJ_INT32 i; - OPJ_UINT32 off; - - (void)nb_cols; - - if (!cas) { - if ((dn > 0) 
|| (sn > 1)) { /* NEW : CASE ONE ELEMENT */ - - /* Naive version is : - for (i = win_l_x0; i < i_max; i++) { - OPJ_S(i) -= (OPJ_D_(i - 1) + OPJ_D_(i) + 2) >> 2; - } - for (i = win_h_x0; i < win_h_x1; i++) { - OPJ_D(i) += (OPJ_S_(i) + OPJ_S_(i + 1)) >> 1; - } - but the compiler doesn't manage to unroll it to avoid bound - checking in OPJ_S_ and OPJ_D_ macros - */ - - i = win_l_x0; - if (i < win_l_x1) { - OPJ_INT32 i_max; - - /* Left-most case */ - for (off = 0; off < 4; off++) { - OPJ_S_off(i, off) -= (OPJ_D__off(i - 1, off) + OPJ_D__off(i, off) + 2) >> 2; - } - i ++; - - i_max = win_l_x1; - if (i_max > dn) { - i_max = dn; - } - -#ifdef __SSE2__ - if (i + 1 < i_max) { - const __m128i two = _mm_set1_epi32(2); - __m128i Dm1 = _mm_load_si128((__m128i * const)(a + 4 + (i - 1) * 8)); - for (; i + 1 < i_max; i += 2) { - /* No bound checking */ - __m128i S = _mm_load_si128((__m128i * const)(a + i * 8)); - __m128i D = _mm_load_si128((__m128i * const)(a + 4 + i * 8)); - __m128i S1 = _mm_load_si128((__m128i * const)(a + (i + 1) * 8)); - __m128i D1 = _mm_load_si128((__m128i * const)(a + 4 + (i + 1) * 8)); - S = _mm_sub_epi32(S, - _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(Dm1, D), two), 2)); - S1 = _mm_sub_epi32(S1, - _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(D, D1), two), 2)); - _mm_store_si128((__m128i*)(a + i * 8), S); - _mm_store_si128((__m128i*)(a + (i + 1) * 8), S1); - Dm1 = D1; - } - } -#endif - - for (; i < i_max; i++) { - /* No bound checking */ - for (off = 0; off < 4; off++) { - OPJ_S_off(i, off) -= (OPJ_D_off(i - 1, off) + OPJ_D_off(i, off) + 2) >> 2; - } - } - for (; i < win_l_x1; i++) { - /* Right-most case */ - for (off = 0; off < 4; off++) { - OPJ_S_off(i, off) -= (OPJ_D__off(i - 1, off) + OPJ_D__off(i, off) + 2) >> 2; - } - } - } - - i = win_h_x0; - if (i < win_h_x1) { - OPJ_INT32 i_max = win_h_x1; - if (i_max >= sn) { - i_max = sn - 1; - } - -#ifdef __SSE2__ - if (i + 1 < i_max) { - __m128i S = _mm_load_si128((__m128i * const)(a + i * 8)); - for (; i + 1 < i_max; i += 2) { - /* No bound checking */ - __m128i D = _mm_load_si128((__m128i * const)(a + 4 + i * 8)); - __m128i S1 = _mm_load_si128((__m128i * const)(a + (i + 1) * 8)); - __m128i D1 = _mm_load_si128((__m128i * const)(a + 4 + (i + 1) * 8)); - __m128i S2 = _mm_load_si128((__m128i * const)(a + (i + 2) * 8)); - D = _mm_add_epi32(D, _mm_srai_epi32(_mm_add_epi32(S, S1), 1)); - D1 = _mm_add_epi32(D1, _mm_srai_epi32(_mm_add_epi32(S1, S2), 1)); - _mm_store_si128((__m128i*)(a + 4 + i * 8), D); - _mm_store_si128((__m128i*)(a + 4 + (i + 1) * 8), D1); - S = S2; - } - } -#endif - - for (; i < i_max; i++) { - /* No bound checking */ - for (off = 0; off < 4; off++) { - OPJ_D_off(i, off) += (OPJ_S_off(i, off) + OPJ_S_off(i + 1, off)) >> 1; - } - } - for (; i < win_h_x1; i++) { - /* Right-most case */ - for (off = 0; off < 4; off++) { - OPJ_D_off(i, off) += (OPJ_S__off(i, off) + OPJ_S__off(i + 1, off)) >> 1; - } - } - } - } - } else { - if (!sn && dn == 1) { /* NEW : CASE ONE ELEMENT */ - for (off = 0; off < 4; off++) { - OPJ_S_off(0, off) /= 2; - } - } else { - for (i = win_l_x0; i < win_l_x1; i++) { - for (off = 0; off < 4; off++) { - OPJ_D_off(i, off) -= (OPJ_SS__off(i, off) + OPJ_SS__off(i + 1, off) + 2) >> 2; - } - } - for (i = win_h_x0; i < win_h_x1; i++) { - for (off = 0; off < 4; off++) { - OPJ_S_off(i, off) += (OPJ_DD__off(i, off) + OPJ_DD__off(i - 1, off)) >> 1; - } - } - } - } -} - -static void opj_dwt_get_band_coordinates(opj_tcd_tilecomp_t* tilec, - OPJ_UINT32 resno, - OPJ_UINT32 bandno, - OPJ_UINT32 tcx0, - OPJ_UINT32 tcy0, - 
OPJ_UINT32 tcx1, - OPJ_UINT32 tcy1, - OPJ_UINT32* tbx0, - OPJ_UINT32* tby0, - OPJ_UINT32* tbx1, - OPJ_UINT32* tby1) -{ - /* Compute number of decomposition for this band. See table F-1 */ - OPJ_UINT32 nb = (resno == 0) ? - tilec->numresolutions - 1 : - tilec->numresolutions - resno; - /* Map above tile-based coordinates to sub-band-based coordinates per */ - /* equation B-15 of the standard */ - OPJ_UINT32 x0b = bandno & 1; - OPJ_UINT32 y0b = bandno >> 1; - if (tbx0) { - *tbx0 = (nb == 0) ? tcx0 : - (tcx0 <= (1U << (nb - 1)) * x0b) ? 0 : - opj_uint_ceildivpow2(tcx0 - (1U << (nb - 1)) * x0b, nb); - } - if (tby0) { - *tby0 = (nb == 0) ? tcy0 : - (tcy0 <= (1U << (nb - 1)) * y0b) ? 0 : - opj_uint_ceildivpow2(tcy0 - (1U << (nb - 1)) * y0b, nb); - } - if (tbx1) { - *tbx1 = (nb == 0) ? tcx1 : - (tcx1 <= (1U << (nb - 1)) * x0b) ? 0 : - opj_uint_ceildivpow2(tcx1 - (1U << (nb - 1)) * x0b, nb); - } - if (tby1) { - *tby1 = (nb == 0) ? tcy1 : - (tcy1 <= (1U << (nb - 1)) * y0b) ? 0 : - opj_uint_ceildivpow2(tcy1 - (1U << (nb - 1)) * y0b, nb); - } -} - -static void opj_dwt_segment_grow(OPJ_UINT32 filter_width, - OPJ_UINT32 max_size, - OPJ_UINT32* start, - OPJ_UINT32* end) -{ - *start = opj_uint_subs(*start, filter_width); - *end = opj_uint_adds(*end, filter_width); - *end = opj_uint_min(*end, max_size); -} - - -static opj_sparse_array_int32_t* opj_dwt_init_sparse_array( - opj_tcd_tilecomp_t* tilec, - OPJ_UINT32 numres) -{ - opj_tcd_resolution_t* tr_max = &(tilec->resolutions[numres - 1]); - OPJ_UINT32 w = (OPJ_UINT32)(tr_max->x1 - tr_max->x0); - OPJ_UINT32 h = (OPJ_UINT32)(tr_max->y1 - tr_max->y0); - OPJ_UINT32 resno, bandno, precno, cblkno; - opj_sparse_array_int32_t* sa = opj_sparse_array_int32_create( - w, h, opj_uint_min(w, 64), opj_uint_min(h, 64)); - if (sa == NULL) { - return NULL; - } - - for (resno = 0; resno < numres; ++resno) { - opj_tcd_resolution_t* res = &tilec->resolutions[resno]; - - for (bandno = 0; bandno < res->numbands; ++bandno) { - opj_tcd_band_t* band = &res->bands[bandno]; - - for (precno = 0; precno < res->pw * res->ph; ++precno) { - opj_tcd_precinct_t* precinct = &band->precincts[precno]; - for (cblkno = 0; cblkno < precinct->cw * precinct->ch; ++cblkno) { - opj_tcd_cblk_dec_t* cblk = &precinct->cblks.dec[cblkno]; - if (cblk->decoded_data != NULL) { - OPJ_UINT32 x = (OPJ_UINT32)(cblk->x0 - band->x0); - OPJ_UINT32 y = (OPJ_UINT32)(cblk->y0 - band->y0); - OPJ_UINT32 cblk_w = (OPJ_UINT32)(cblk->x1 - cblk->x0); - OPJ_UINT32 cblk_h = (OPJ_UINT32)(cblk->y1 - cblk->y0); - - if (band->bandno & 1) { - opj_tcd_resolution_t* pres = &tilec->resolutions[resno - 1]; - x += (OPJ_UINT32)(pres->x1 - pres->x0); - } - if (band->bandno & 2) { - opj_tcd_resolution_t* pres = &tilec->resolutions[resno - 1]; - y += (OPJ_UINT32)(pres->y1 - pres->y0); - } - - if (!opj_sparse_array_int32_write(sa, x, y, - x + cblk_w, y + cblk_h, - cblk->decoded_data, - 1, cblk_w, OPJ_TRUE)) { - opj_sparse_array_int32_free(sa); - return NULL; - } - } - } - } - } - } - - return sa; -} - - -static OPJ_BOOL opj_dwt_decode_partial_tile( - opj_tcd_tilecomp_t* tilec, - OPJ_UINT32 numres) -{ - opj_sparse_array_int32_t* sa; - opj_dwt_t h; - opj_dwt_t v; - OPJ_UINT32 resno; - /* This value matches the maximum left/right extension given in tables */ - /* F.2 and F.3 of the standard. 
*/
-    const OPJ_UINT32 filter_width = 2U;
-
-    opj_tcd_resolution_t* tr = tilec->resolutions;
-    opj_tcd_resolution_t* tr_max = &(tilec->resolutions[numres - 1]);
-
-    OPJ_UINT32 rw = (OPJ_UINT32)(tr->x1 -
-                                 tr->x0);    /* width of the resolution level computed */
-    OPJ_UINT32 rh = (OPJ_UINT32)(tr->y1 -
-                                 tr->y0);    /* height of the resolution level computed */
-
-    OPJ_SIZE_T h_mem_size;
-
-    /* Compute the intersection of the area of interest, expressed in tile coordinates */
-    /* with the tile coordinates */
-    OPJ_UINT32 win_tcx0 = tilec->win_x0;
-    OPJ_UINT32 win_tcy0 = tilec->win_y0;
-    OPJ_UINT32 win_tcx1 = tilec->win_x1;
-    OPJ_UINT32 win_tcy1 = tilec->win_y1;
-
-    if (tr_max->x0 == tr_max->x1 || tr_max->y0 == tr_max->y1) {
-        return OPJ_TRUE;
-    }
-
-    sa = opj_dwt_init_sparse_array(tilec, numres);
-    if (sa == NULL) {
-        return OPJ_FALSE;
-    }
-
-    if (numres == 1U) {
-        OPJ_BOOL ret = opj_sparse_array_int32_read(sa,
-                       tr_max->win_x0 - (OPJ_UINT32)tr_max->x0,
-                       tr_max->win_y0 - (OPJ_UINT32)tr_max->y0,
-                       tr_max->win_x1 - (OPJ_UINT32)tr_max->x0,
-                       tr_max->win_y1 - (OPJ_UINT32)tr_max->y0,
-                       tilec->data_win,
-                       1, tr_max->win_x1 - tr_max->win_x0,
-                       OPJ_TRUE);
-        assert(ret);
-        OPJ_UNUSED(ret);
-        opj_sparse_array_int32_free(sa);
-        return OPJ_TRUE;
-    }
-    h_mem_size = opj_dwt_max_resolution(tr, numres);
-    /* overflow check */
-    /* in vertical pass, we process 4 columns at a time */
-    if (h_mem_size > (SIZE_MAX / (4 * sizeof(OPJ_INT32)))) {
-        /* FIXME event manager error callback */
-        opj_sparse_array_int32_free(sa);
-        return OPJ_FALSE;
-    }
-
-    h_mem_size *= 4 * sizeof(OPJ_INT32);
-    h.mem = (OPJ_INT32*)opj_aligned_32_malloc(h_mem_size);
-    if (! h.mem) {
-        /* FIXME event manager error callback */
-        opj_sparse_array_int32_free(sa);
-        return OPJ_FALSE;
-    }
-
-    v.mem = h.mem;
-
-    for (resno = 1; resno < numres; resno ++) {
-        OPJ_UINT32 i, j;
-        /* Window of interest subband-based coordinates */
-        OPJ_UINT32 win_ll_x0, win_ll_y0, win_ll_x1, win_ll_y1;
-        OPJ_UINT32 win_hl_x0, win_hl_x1;
-        OPJ_UINT32 win_lh_y0, win_lh_y1;
-        /* Window of interest tile-resolution-based coordinates */
-        OPJ_UINT32 win_tr_x0, win_tr_x1, win_tr_y0, win_tr_y1;
-        /* Tile-resolution subband-based coordinates */
-        OPJ_UINT32 tr_ll_x0, tr_ll_y0, tr_hl_x0, tr_lh_y0;
-
-        ++tr;
-
-        h.sn = (OPJ_INT32)rw;
-        v.sn = (OPJ_INT32)rh;
-
-        rw = (OPJ_UINT32)(tr->x1 - tr->x0);
-        rh = (OPJ_UINT32)(tr->y1 - tr->y0);
-
-        h.dn = (OPJ_INT32)(rw - (OPJ_UINT32)h.sn);
-        h.cas = tr->x0 % 2;
-
-        v.dn = (OPJ_INT32)(rh - (OPJ_UINT32)v.sn);
-        v.cas = tr->y0 % 2;
-
-        /* Get the subband coordinates for the window of interest */
-        /* LL band */
-        opj_dwt_get_band_coordinates(tilec, resno, 0,
-                                     win_tcx0, win_tcy0, win_tcx1, win_tcy1,
-                                     &win_ll_x0, &win_ll_y0,
-                                     &win_ll_x1, &win_ll_y1);
-
-        /* HL band */
-        opj_dwt_get_band_coordinates(tilec, resno, 1,
-                                     win_tcx0, win_tcy0, win_tcx1, win_tcy1,
-                                     &win_hl_x0, NULL, &win_hl_x1, NULL);
-
-        /* LH band */
-        opj_dwt_get_band_coordinates(tilec, resno, 2,
-                                     win_tcx0, win_tcy0, win_tcx1, win_tcy1,
-                                     NULL, &win_lh_y0, NULL, &win_lh_y1);
-
-        /* Beware: band indices for non-LL0 resolutions are 0=HL, 1=LH and 2=HH */
-        tr_ll_x0 = (OPJ_UINT32)tr->bands[1].x0;
-        tr_ll_y0 = (OPJ_UINT32)tr->bands[0].y0;
-        tr_hl_x0 = (OPJ_UINT32)tr->bands[0].x0;
-        tr_lh_y0 = (OPJ_UINT32)tr->bands[1].y0;
-
-        /* Subtract the origin of the bands for this tile from the subwindow */
-        /* of interest band coordinates, so as to get them relative to the */
-        /* tile */
-        win_ll_x0 = opj_uint_subs(win_ll_x0, tr_ll_x0);
-        win_ll_y0 = opj_uint_subs(win_ll_y0, tr_ll_y0);
-
win_ll_x1 = opj_uint_subs(win_ll_x1, tr_ll_x0); - win_ll_y1 = opj_uint_subs(win_ll_y1, tr_ll_y0); - win_hl_x0 = opj_uint_subs(win_hl_x0, tr_hl_x0); - win_hl_x1 = opj_uint_subs(win_hl_x1, tr_hl_x0); - win_lh_y0 = opj_uint_subs(win_lh_y0, tr_lh_y0); - win_lh_y1 = opj_uint_subs(win_lh_y1, tr_lh_y0); - - opj_dwt_segment_grow(filter_width, (OPJ_UINT32)h.sn, &win_ll_x0, &win_ll_x1); - opj_dwt_segment_grow(filter_width, (OPJ_UINT32)h.dn, &win_hl_x0, &win_hl_x1); - - opj_dwt_segment_grow(filter_width, (OPJ_UINT32)v.sn, &win_ll_y0, &win_ll_y1); - opj_dwt_segment_grow(filter_width, (OPJ_UINT32)v.dn, &win_lh_y0, &win_lh_y1); - - /* Compute the tile-resolution-based coordinates for the window of interest */ - if (h.cas == 0) { - win_tr_x0 = opj_uint_min(2 * win_ll_x0, 2 * win_hl_x0 + 1); - win_tr_x1 = opj_uint_min(opj_uint_max(2 * win_ll_x1, 2 * win_hl_x1 + 1), rw); - } else { - win_tr_x0 = opj_uint_min(2 * win_hl_x0, 2 * win_ll_x0 + 1); - win_tr_x1 = opj_uint_min(opj_uint_max(2 * win_hl_x1, 2 * win_ll_x1 + 1), rw); - } - - if (v.cas == 0) { - win_tr_y0 = opj_uint_min(2 * win_ll_y0, 2 * win_lh_y0 + 1); - win_tr_y1 = opj_uint_min(opj_uint_max(2 * win_ll_y1, 2 * win_lh_y1 + 1), rh); - } else { - win_tr_y0 = opj_uint_min(2 * win_lh_y0, 2 * win_ll_y0 + 1); - win_tr_y1 = opj_uint_min(opj_uint_max(2 * win_lh_y1, 2 * win_ll_y1 + 1), rh); - } - - for (j = 0; j < rh; ++j) { - if ((j >= win_ll_y0 && j < win_ll_y1) || - (j >= win_lh_y0 + (OPJ_UINT32)v.sn && j < win_lh_y1 + (OPJ_UINT32)v.sn)) { - - /* Avoids dwt.c:1584:44 (in opj_dwt_decode_partial_1): runtime error: */ - /* signed integer overflow: -1094795586 + -1094795586 cannot be represented in type 'int' */ - /* on opj_decompress -i ../../openjpeg/MAPA.jp2 -o out.tif -d 0,0,256,256 */ - /* This is less extreme than memsetting the whole buffer to 0 */ - /* although we could potentially do better with better handling of edge conditions */ - if (win_tr_x1 >= 1 && win_tr_x1 < rw) { - h.mem[win_tr_x1 - 1] = 0; - } - if (win_tr_x1 < rw) { - h.mem[win_tr_x1] = 0; - } - - opj_dwt_interleave_partial_h(h.mem, - h.cas, - sa, - j, - (OPJ_UINT32)h.sn, - win_ll_x0, - win_ll_x1, - win_hl_x0, - win_hl_x1); - opj_dwt_decode_partial_1(h.mem, h.dn, h.sn, h.cas, - (OPJ_INT32)win_ll_x0, - (OPJ_INT32)win_ll_x1, - (OPJ_INT32)win_hl_x0, - (OPJ_INT32)win_hl_x1); - if (!opj_sparse_array_int32_write(sa, - win_tr_x0, j, - win_tr_x1, j + 1, - h.mem + win_tr_x0, - 1, 0, OPJ_TRUE)) { - /* FIXME event manager error callback */ - opj_sparse_array_int32_free(sa); - opj_aligned_free(h.mem); - return OPJ_FALSE; - } - } - } - - for (i = win_tr_x0; i < win_tr_x1;) { - OPJ_UINT32 nb_cols = opj_uint_min(4U, win_tr_x1 - i); - opj_dwt_interleave_partial_v(v.mem, - v.cas, - sa, - i, - nb_cols, - (OPJ_UINT32)v.sn, - win_ll_y0, - win_ll_y1, - win_lh_y0, - win_lh_y1); - opj_dwt_decode_partial_1_parallel(v.mem, nb_cols, v.dn, v.sn, v.cas, - (OPJ_INT32)win_ll_y0, - (OPJ_INT32)win_ll_y1, - (OPJ_INT32)win_lh_y0, - (OPJ_INT32)win_lh_y1); - if (!opj_sparse_array_int32_write(sa, - i, win_tr_y0, - i + nb_cols, win_tr_y1, - v.mem + 4 * win_tr_y0, - 1, 4, OPJ_TRUE)) { - /* FIXME event manager error callback */ - opj_sparse_array_int32_free(sa); - opj_aligned_free(h.mem); - return OPJ_FALSE; - } - - i += nb_cols; - } - } - opj_aligned_free(h.mem); - - { - OPJ_BOOL ret = opj_sparse_array_int32_read(sa, - tr_max->win_x0 - (OPJ_UINT32)tr_max->x0, - tr_max->win_y0 - (OPJ_UINT32)tr_max->y0, - tr_max->win_x1 - (OPJ_UINT32)tr_max->x0, - tr_max->win_y1 - (OPJ_UINT32)tr_max->y0, - tilec->data_win, - 1, tr_max->win_x1 - 
tr_max->win_x0, - OPJ_TRUE); - assert(ret); - OPJ_UNUSED(ret); - } - opj_sparse_array_int32_free(sa); - return OPJ_TRUE; -} - -static void opj_v4dwt_interleave_h(opj_v4dwt_t* OPJ_RESTRICT dwt, - OPJ_FLOAT32* OPJ_RESTRICT a, - OPJ_UINT32 width, - OPJ_UINT32 remaining_height) -{ - OPJ_FLOAT32* OPJ_RESTRICT bi = (OPJ_FLOAT32*)(dwt->wavelet + dwt->cas); - OPJ_UINT32 i, k; - OPJ_UINT32 x0 = dwt->win_l_x0; - OPJ_UINT32 x1 = dwt->win_l_x1; - - for (k = 0; k < 2; ++k) { - if (remaining_height >= 4 && ((OPJ_SIZE_T) a & 0x0f) == 0 && - ((OPJ_SIZE_T) bi & 0x0f) == 0 && (width & 0x0f) == 0) { - /* Fast code path */ - for (i = x0; i < x1; ++i) { - OPJ_UINT32 j = i; - bi[i * 8 ] = a[j]; - j += width; - bi[i * 8 + 1] = a[j]; - j += width; - bi[i * 8 + 2] = a[j]; - j += width; - bi[i * 8 + 3] = a[j]; - } - } else { - /* Slow code path */ - for (i = x0; i < x1; ++i) { - OPJ_UINT32 j = i; - bi[i * 8 ] = a[j]; - j += width; - if (remaining_height == 1) { - continue; - } - bi[i * 8 + 1] = a[j]; - j += width; - if (remaining_height == 2) { - continue; - } - bi[i * 8 + 2] = a[j]; - j += width; - if (remaining_height == 3) { - continue; - } - bi[i * 8 + 3] = a[j]; /* This one*/ - } - } - - bi = (OPJ_FLOAT32*)(dwt->wavelet + 1 - dwt->cas); - a += dwt->sn; - x0 = dwt->win_h_x0; - x1 = dwt->win_h_x1; - } -} - -static void opj_v4dwt_interleave_partial_h(opj_v4dwt_t* dwt, - opj_sparse_array_int32_t* sa, - OPJ_UINT32 sa_line, - OPJ_UINT32 remaining_height) -{ - OPJ_UINT32 i; - for (i = 0; i < remaining_height; i++) { - OPJ_BOOL ret; - ret = opj_sparse_array_int32_read(sa, - dwt->win_l_x0, sa_line + i, - dwt->win_l_x1, sa_line + i + 1, - /* Nasty cast from float* to int32* */ - (OPJ_INT32*)(dwt->wavelet + dwt->cas + 2 * dwt->win_l_x0) + i, - 8, 0, OPJ_TRUE); - assert(ret); - ret = opj_sparse_array_int32_read(sa, - (OPJ_UINT32)dwt->sn + dwt->win_h_x0, sa_line + i, - (OPJ_UINT32)dwt->sn + dwt->win_h_x1, sa_line + i + 1, - /* Nasty cast from float* to int32* */ - (OPJ_INT32*)(dwt->wavelet + 1 - dwt->cas + 2 * dwt->win_h_x0) + i, - 8, 0, OPJ_TRUE); - assert(ret); - OPJ_UNUSED(ret); - } -} - -static void opj_v4dwt_interleave_v(opj_v4dwt_t* OPJ_RESTRICT dwt, - OPJ_FLOAT32* OPJ_RESTRICT a, - OPJ_UINT32 width, - OPJ_UINT32 nb_elts_read) -{ - opj_v4_t* OPJ_RESTRICT bi = dwt->wavelet + dwt->cas; - OPJ_UINT32 i; - - for (i = dwt->win_l_x0; i < dwt->win_l_x1; ++i) { - memcpy(&bi[i * 2], &a[i * (OPJ_SIZE_T)width], - (OPJ_SIZE_T)nb_elts_read * sizeof(OPJ_FLOAT32)); - } - - a += (OPJ_UINT32)dwt->sn * (OPJ_SIZE_T)width; - bi = dwt->wavelet + 1 - dwt->cas; - - for (i = dwt->win_h_x0; i < dwt->win_h_x1; ++i) { - memcpy(&bi[i * 2], &a[i * (OPJ_SIZE_T)width], - (OPJ_SIZE_T)nb_elts_read * sizeof(OPJ_FLOAT32)); - } -} - -static void opj_v4dwt_interleave_partial_v(opj_v4dwt_t* OPJ_RESTRICT dwt, - opj_sparse_array_int32_t* sa, - OPJ_UINT32 sa_col, - OPJ_UINT32 nb_elts_read) -{ - OPJ_BOOL ret; - ret = opj_sparse_array_int32_read(sa, - sa_col, dwt->win_l_x0, - sa_col + nb_elts_read, dwt->win_l_x1, - (OPJ_INT32*)(dwt->wavelet + dwt->cas + 2 * dwt->win_l_x0), - 1, 8, OPJ_TRUE); - assert(ret); - ret = opj_sparse_array_int32_read(sa, - sa_col, (OPJ_UINT32)dwt->sn + dwt->win_h_x0, - sa_col + nb_elts_read, (OPJ_UINT32)dwt->sn + dwt->win_h_x1, - (OPJ_INT32*)(dwt->wavelet + 1 - dwt->cas + 2 * dwt->win_h_x0), - 1, 8, OPJ_TRUE); - assert(ret); - OPJ_UNUSED(ret); -} - -#ifdef __SSE__ - -static void opj_v4dwt_decode_step1_sse(opj_v4_t* w, - OPJ_UINT32 start, - OPJ_UINT32 end, - const __m128 c) -{ - __m128* OPJ_RESTRICT vw = (__m128*) w; - OPJ_UINT32 i; - 
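For readers following the stride-8 float indexing in the interleave helpers above: the wavelet buffer packs four image columns per element and alternates low-pass and high-pass coefficients, so logical sample i of one phase starts at float offset i * 8. A layout sketch consistent with those access patterns (the actual opj_v4_t is declared elsewhere in this file):

/* Layout sketch: one element carries the same coefficient for 4 columns. */
typedef union {
    OPJ_FLOAT32 f[4];
} v4_sketch_t;

/* With cas == 0, low-pass sample i of column c sits at float index
 * i * 8 + c, and the matching high-pass sample at i * 8 + 4 + c. */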
/* 4x unrolled loop */ - vw += 2 * start; - for (i = start; i + 3 < end; i += 4, vw += 8) { - __m128 xmm0 = _mm_mul_ps(vw[0], c); - __m128 xmm2 = _mm_mul_ps(vw[2], c); - __m128 xmm4 = _mm_mul_ps(vw[4], c); - __m128 xmm6 = _mm_mul_ps(vw[6], c); - vw[0] = xmm0; - vw[2] = xmm2; - vw[4] = xmm4; - vw[6] = xmm6; - } - for (; i < end; ++i, vw += 2) { - vw[0] = _mm_mul_ps(vw[0], c); - } -} - -static void opj_v4dwt_decode_step2_sse(opj_v4_t* l, opj_v4_t* w, - OPJ_UINT32 start, - OPJ_UINT32 end, - OPJ_UINT32 m, - __m128 c) -{ - __m128* OPJ_RESTRICT vl = (__m128*) l; - __m128* OPJ_RESTRICT vw = (__m128*) w; - OPJ_UINT32 i; - OPJ_UINT32 imax = opj_uint_min(end, m); - __m128 tmp1, tmp2, tmp3; - if (start == 0) { - tmp1 = vl[0]; - } else { - vw += start * 2; - tmp1 = vw[-3]; - } - - i = start; - - /* 4x loop unrolling */ - for (; i + 3 < imax; i += 4) { - __m128 tmp4, tmp5, tmp6, tmp7, tmp8, tmp9; - tmp2 = vw[-1]; - tmp3 = vw[ 0]; - tmp4 = vw[ 1]; - tmp5 = vw[ 2]; - tmp6 = vw[ 3]; - tmp7 = vw[ 4]; - tmp8 = vw[ 5]; - tmp9 = vw[ 6]; - vw[-1] = _mm_add_ps(tmp2, _mm_mul_ps(_mm_add_ps(tmp1, tmp3), c)); - vw[ 1] = _mm_add_ps(tmp4, _mm_mul_ps(_mm_add_ps(tmp3, tmp5), c)); - vw[ 3] = _mm_add_ps(tmp6, _mm_mul_ps(_mm_add_ps(tmp5, tmp7), c)); - vw[ 5] = _mm_add_ps(tmp8, _mm_mul_ps(_mm_add_ps(tmp7, tmp9), c)); - tmp1 = tmp9; - vw += 8; - } - - for (; i < imax; ++i) { - tmp2 = vw[-1]; - tmp3 = vw[ 0]; - vw[-1] = _mm_add_ps(tmp2, _mm_mul_ps(_mm_add_ps(tmp1, tmp3), c)); - tmp1 = tmp3; - vw += 2; - } - if (m < end) { - assert(m + 1 == end); - c = _mm_add_ps(c, c); - c = _mm_mul_ps(c, vw[-2]); - vw[-1] = _mm_add_ps(vw[-1], c); - } -} - -#else - -static void opj_v4dwt_decode_step1(opj_v4_t* w, - OPJ_UINT32 start, - OPJ_UINT32 end, - const OPJ_FLOAT32 c) -{ - OPJ_FLOAT32* OPJ_RESTRICT fw = (OPJ_FLOAT32*) w; - OPJ_UINT32 i; - for (i = start; i < end; ++i) { - OPJ_FLOAT32 tmp1 = fw[i * 8 ]; - OPJ_FLOAT32 tmp2 = fw[i * 8 + 1]; - OPJ_FLOAT32 tmp3 = fw[i * 8 + 2]; - OPJ_FLOAT32 tmp4 = fw[i * 8 + 3]; - fw[i * 8 ] = tmp1 * c; - fw[i * 8 + 1] = tmp2 * c; - fw[i * 8 + 2] = tmp3 * c; - fw[i * 8 + 3] = tmp4 * c; - } -} - -static void opj_v4dwt_decode_step2(opj_v4_t* l, opj_v4_t* w, - OPJ_UINT32 start, - OPJ_UINT32 end, - OPJ_UINT32 m, - OPJ_FLOAT32 c) -{ - OPJ_FLOAT32* fl = (OPJ_FLOAT32*) l; - OPJ_FLOAT32* fw = (OPJ_FLOAT32*) w; - OPJ_UINT32 i; - OPJ_UINT32 imax = opj_uint_min(end, m); - if (start > 0) { - fw += 8 * start; - fl = fw - 8; - } - for (i = start; i < imax; ++i) { - OPJ_FLOAT32 tmp1_1 = fl[0]; - OPJ_FLOAT32 tmp1_2 = fl[1]; - OPJ_FLOAT32 tmp1_3 = fl[2]; - OPJ_FLOAT32 tmp1_4 = fl[3]; - OPJ_FLOAT32 tmp2_1 = fw[-4]; - OPJ_FLOAT32 tmp2_2 = fw[-3]; - OPJ_FLOAT32 tmp2_3 = fw[-2]; - OPJ_FLOAT32 tmp2_4 = fw[-1]; - OPJ_FLOAT32 tmp3_1 = fw[0]; - OPJ_FLOAT32 tmp3_2 = fw[1]; - OPJ_FLOAT32 tmp3_3 = fw[2]; - OPJ_FLOAT32 tmp3_4 = fw[3]; - fw[-4] = tmp2_1 + ((tmp1_1 + tmp3_1) * c); - fw[-3] = tmp2_2 + ((tmp1_2 + tmp3_2) * c); - fw[-2] = tmp2_3 + ((tmp1_3 + tmp3_3) * c); - fw[-1] = tmp2_4 + ((tmp1_4 + tmp3_4) * c); - fl = fw; - fw += 8; - } - if (m < end) { - assert(m + 1 == end); - c += c; - fw[-4] = fw[-4] + fl[0] * c; - fw[-3] = fw[-3] + fl[1] * c; - fw[-2] = fw[-2] + fl[2] * c; - fw[-1] = fw[-1] + fl[3] * c; - } -} - -#endif - -/* */ -/* Inverse 9-7 wavelet transform in 1-D. 
*/ -/* */ -static void opj_v4dwt_decode(opj_v4dwt_t* OPJ_RESTRICT dwt) -{ - OPJ_INT32 a, b; - if (dwt->cas == 0) { - if (!((dwt->dn > 0) || (dwt->sn > 1))) { - return; - } - a = 0; - b = 1; - } else { - if (!((dwt->sn > 0) || (dwt->dn > 1))) { - return; - } - a = 1; - b = 0; - } -#ifdef __SSE__ - opj_v4dwt_decode_step1_sse(dwt->wavelet + a, dwt->win_l_x0, dwt->win_l_x1, - _mm_set1_ps(opj_K)); - opj_v4dwt_decode_step1_sse(dwt->wavelet + b, dwt->win_h_x0, dwt->win_h_x1, - _mm_set1_ps(opj_c13318)); - opj_v4dwt_decode_step2_sse(dwt->wavelet + b, dwt->wavelet + a + 1, - dwt->win_l_x0, dwt->win_l_x1, - (OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a), - _mm_set1_ps(opj_dwt_delta)); - opj_v4dwt_decode_step2_sse(dwt->wavelet + a, dwt->wavelet + b + 1, - dwt->win_h_x0, dwt->win_h_x1, - (OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b), - _mm_set1_ps(opj_dwt_gamma)); - opj_v4dwt_decode_step2_sse(dwt->wavelet + b, dwt->wavelet + a + 1, - dwt->win_l_x0, dwt->win_l_x1, - (OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a), - _mm_set1_ps(opj_dwt_beta)); - opj_v4dwt_decode_step2_sse(dwt->wavelet + a, dwt->wavelet + b + 1, - dwt->win_h_x0, dwt->win_h_x1, - (OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b), - _mm_set1_ps(opj_dwt_alpha)); -#else - opj_v4dwt_decode_step1(dwt->wavelet + a, dwt->win_l_x0, dwt->win_l_x1, - opj_K); - opj_v4dwt_decode_step1(dwt->wavelet + b, dwt->win_h_x0, dwt->win_h_x1, - opj_c13318); - opj_v4dwt_decode_step2(dwt->wavelet + b, dwt->wavelet + a + 1, - dwt->win_l_x0, dwt->win_l_x1, - (OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a), - opj_dwt_delta); - opj_v4dwt_decode_step2(dwt->wavelet + a, dwt->wavelet + b + 1, - dwt->win_h_x0, dwt->win_h_x1, - (OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b), - opj_dwt_gamma); - opj_v4dwt_decode_step2(dwt->wavelet + b, dwt->wavelet + a + 1, - dwt->win_l_x0, dwt->win_l_x1, - (OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a), - opj_dwt_beta); - opj_v4dwt_decode_step2(dwt->wavelet + a, dwt->wavelet + b + 1, - dwt->win_h_x0, dwt->win_h_x1, - (OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b), - opj_dwt_alpha); -#endif -} - - -/* */ -/* Inverse 9-7 wavelet transform in 2-D. 
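Before the 2-D driver below, it may help to see what the four step2 passes in opj_v4dwt_decode amount to: each is a standard 9-7 lifting update in which one phase of the interleaved signal is corrected from its two neighbours in the other phase. A scalar sketch (boundary handling and the cas offset omitted; c stands for one of the alpha/beta/gamma/delta lifting constants):

/* Scalar sketch of a single lifting pass; not the library's code. */
static void lifting_pass_sketch(OPJ_FLOAT32* even, const OPJ_FLOAT32* odd,
                                OPJ_UINT32 n, OPJ_FLOAT32 c)
{
    OPJ_UINT32 i;
    for (i = 1; i < n; i++) {
        /* even[i] lies between odd[i - 1] and odd[i] in the signal */
        even[i] += c * (odd[i - 1] + odd[i]);
    }
}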
*/ -/* */ -static -OPJ_BOOL opj_dwt_decode_tile_97(opj_tcd_tilecomp_t* OPJ_RESTRICT tilec, - OPJ_UINT32 numres) -{ - opj_v4dwt_t h; - opj_v4dwt_t v; - - opj_tcd_resolution_t* res = tilec->resolutions; - - OPJ_UINT32 rw = (OPJ_UINT32)(res->x1 - - res->x0); /* width of the resolution level computed */ - OPJ_UINT32 rh = (OPJ_UINT32)(res->y1 - - res->y0); /* height of the resolution level computed */ - - OPJ_UINT32 w = (OPJ_UINT32)(tilec->resolutions[tilec->minimum_num_resolutions - - 1].x1 - - tilec->resolutions[tilec->minimum_num_resolutions - 1].x0); - - OPJ_SIZE_T l_data_size; - - l_data_size = opj_dwt_max_resolution(res, numres); - /* overflow check */ - if (l_data_size > (SIZE_MAX - 5U)) { - /* FIXME event manager error callback */ - return OPJ_FALSE; - } - l_data_size += 5U; - /* overflow check */ - if (l_data_size > (SIZE_MAX / sizeof(opj_v4_t))) { - /* FIXME event manager error callback */ - return OPJ_FALSE; - } - h.wavelet = (opj_v4_t*) opj_aligned_malloc(l_data_size * sizeof(opj_v4_t)); - if (!h.wavelet) { - /* FIXME event manager error callback */ - return OPJ_FALSE; - } - v.wavelet = h.wavelet; - - while (--numres) { - OPJ_FLOAT32 * OPJ_RESTRICT aj = (OPJ_FLOAT32*) tilec->data; - OPJ_UINT32 j; - - h.sn = (OPJ_INT32)rw; - v.sn = (OPJ_INT32)rh; - - ++res; - - rw = (OPJ_UINT32)(res->x1 - - res->x0); /* width of the resolution level computed */ - rh = (OPJ_UINT32)(res->y1 - - res->y0); /* height of the resolution level computed */ - - h.dn = (OPJ_INT32)(rw - (OPJ_UINT32)h.sn); - h.cas = res->x0 % 2; - - h.win_l_x0 = 0; - h.win_l_x1 = (OPJ_UINT32)h.sn; - h.win_h_x0 = 0; - h.win_h_x1 = (OPJ_UINT32)h.dn; - for (j = 0; j + 3 < rh; j += 4) { - OPJ_UINT32 k; - opj_v4dwt_interleave_h(&h, aj, w, rh - j); - opj_v4dwt_decode(&h); - - for (k = 0; k < rw; k++) { - aj[k ] = h.wavelet[k].f[0]; - aj[k + (OPJ_SIZE_T)w ] = h.wavelet[k].f[1]; - aj[k + (OPJ_SIZE_T)w * 2] = h.wavelet[k].f[2]; - aj[k + (OPJ_SIZE_T)w * 3] = h.wavelet[k].f[3]; - } - - aj += w * 4; - } - - if (j < rh) { - OPJ_UINT32 k; - opj_v4dwt_interleave_h(&h, aj, w, rh - j); - opj_v4dwt_decode(&h); - for (k = 0; k < rw; k++) { - switch (rh - j) { - case 3: - aj[k + (OPJ_SIZE_T)w * 2] = h.wavelet[k].f[2]; - /* FALLTHRU */ - case 2: - aj[k + (OPJ_SIZE_T)w ] = h.wavelet[k].f[1]; - /* FALLTHRU */ - case 1: - aj[k] = h.wavelet[k].f[0]; - } - } - } - - v.dn = (OPJ_INT32)(rh - (OPJ_UINT32)v.sn); - v.cas = res->y0 % 2; - v.win_l_x0 = 0; - v.win_l_x1 = (OPJ_UINT32)v.sn; - v.win_h_x0 = 0; - v.win_h_x1 = (OPJ_UINT32)v.dn; - - aj = (OPJ_FLOAT32*) tilec->data; - for (j = rw; j > 3; j -= 4) { - OPJ_UINT32 k; - - opj_v4dwt_interleave_v(&v, aj, w, 4); - opj_v4dwt_decode(&v); - - for (k = 0; k < rh; ++k) { - memcpy(&aj[k * (OPJ_SIZE_T)w], &v.wavelet[k], 4 * sizeof(OPJ_FLOAT32)); - } - aj += 4; - } - - if (rw & 0x03) { - OPJ_UINT32 k; - - j = rw & 0x03; - - opj_v4dwt_interleave_v(&v, aj, w, j); - opj_v4dwt_decode(&v); - - for (k = 0; k < rh; ++k) { - memcpy(&aj[k * (OPJ_SIZE_T)w], &v.wavelet[k], - (OPJ_SIZE_T)j * sizeof(OPJ_FLOAT32)); - } - } - } - - opj_aligned_free(h.wavelet); - return OPJ_TRUE; -} - -static -OPJ_BOOL opj_dwt_decode_partial_97(opj_tcd_tilecomp_t* OPJ_RESTRICT tilec, - OPJ_UINT32 numres) -{ - opj_sparse_array_int32_t* sa; - opj_v4dwt_t h; - opj_v4dwt_t v; - OPJ_UINT32 resno; - /* This value matches the maximum left/right extension given in tables */ - /* F.2 and F.3 of the standard. Note: in opj_tcd_is_subband_area_of_interest() */ - /* we currently use 3. 
*/ - const OPJ_UINT32 filter_width = 4U; - - opj_tcd_resolution_t* tr = tilec->resolutions; - opj_tcd_resolution_t* tr_max = &(tilec->resolutions[numres - 1]); - - OPJ_UINT32 rw = (OPJ_UINT32)(tr->x1 - - tr->x0); /* width of the resolution level computed */ - OPJ_UINT32 rh = (OPJ_UINT32)(tr->y1 - - tr->y0); /* height of the resolution level computed */ - - OPJ_SIZE_T l_data_size; - - /* Compute the intersection of the area of interest, expressed in tile coordinates */ - /* with the tile coordinates */ - OPJ_UINT32 win_tcx0 = tilec->win_x0; - OPJ_UINT32 win_tcy0 = tilec->win_y0; - OPJ_UINT32 win_tcx1 = tilec->win_x1; - OPJ_UINT32 win_tcy1 = tilec->win_y1; - - if (tr_max->x0 == tr_max->x1 || tr_max->y0 == tr_max->y1) { - return OPJ_TRUE; - } - - sa = opj_dwt_init_sparse_array(tilec, numres); - if (sa == NULL) { - return OPJ_FALSE; - } - - if (numres == 1U) { - OPJ_BOOL ret = opj_sparse_array_int32_read(sa, - tr_max->win_x0 - (OPJ_UINT32)tr_max->x0, - tr_max->win_y0 - (OPJ_UINT32)tr_max->y0, - tr_max->win_x1 - (OPJ_UINT32)tr_max->x0, - tr_max->win_y1 - (OPJ_UINT32)tr_max->y0, - tilec->data_win, - 1, tr_max->win_x1 - tr_max->win_x0, - OPJ_TRUE); - assert(ret); - OPJ_UNUSED(ret); - opj_sparse_array_int32_free(sa); - return OPJ_TRUE; - } - - l_data_size = opj_dwt_max_resolution(tr, numres); - /* overflow check */ - if (l_data_size > (SIZE_MAX - 5U)) { - /* FIXME event manager error callback */ - return OPJ_FALSE; - } - l_data_size += 5U; - /* overflow check */ - if (l_data_size > (SIZE_MAX / sizeof(opj_v4_t))) { - /* FIXME event manager error callback */ - return OPJ_FALSE; - } - h.wavelet = (opj_v4_t*) opj_aligned_malloc(l_data_size * sizeof(opj_v4_t)); - if (!h.wavelet) { - /* FIXME event manager error callback */ - return OPJ_FALSE; - } - v.wavelet = h.wavelet; - - for (resno = 1; resno < numres; resno ++) { - OPJ_UINT32 j; - /* Window of interest subband-based coordinates */ - OPJ_UINT32 win_ll_x0, win_ll_y0, win_ll_x1, win_ll_y1; - OPJ_UINT32 win_hl_x0, win_hl_x1; - OPJ_UINT32 win_lh_y0, win_lh_y1; - /* Window of interest tile-resolution-based coordinates */ - OPJ_UINT32 win_tr_x0, win_tr_x1, win_tr_y0, win_tr_y1; - /* Tile-resolution subband-based coordinates */ - OPJ_UINT32 tr_ll_x0, tr_ll_y0, tr_hl_x0, tr_lh_y0; - - ++tr; - - h.sn = (OPJ_INT32)rw; - v.sn = (OPJ_INT32)rh; - - rw = (OPJ_UINT32)(tr->x1 - tr->x0); - rh = (OPJ_UINT32)(tr->y1 - tr->y0); - - h.dn = (OPJ_INT32)(rw - (OPJ_UINT32)h.sn); - h.cas = tr->x0 % 2; - - v.dn = (OPJ_INT32)(rh - (OPJ_UINT32)v.sn); - v.cas = tr->y0 % 2; - - /* Get the subband coordinates for the window of interest */ - /* LL band */ - opj_dwt_get_band_coordinates(tilec, resno, 0, - win_tcx0, win_tcy0, win_tcx1, win_tcy1, - &win_ll_x0, &win_ll_y0, - &win_ll_x1, &win_ll_y1); - - /* HL band */ - opj_dwt_get_band_coordinates(tilec, resno, 1, - win_tcx0, win_tcy0, win_tcx1, win_tcy1, - &win_hl_x0, NULL, &win_hl_x1, NULL); - - /* LH band */ - opj_dwt_get_band_coordinates(tilec, resno, 2, - win_tcx0, win_tcy0, win_tcx1, win_tcy1, - NULL, &win_lh_y0, NULL, &win_lh_y1); - - /* Beware: band index for non-LL0 resolution are 0=HL, 1=LH and 2=HH */ - tr_ll_x0 = (OPJ_UINT32)tr->bands[1].x0; - tr_ll_y0 = (OPJ_UINT32)tr->bands[0].y0; - tr_hl_x0 = (OPJ_UINT32)tr->bands[0].x0; - tr_lh_y0 = (OPJ_UINT32)tr->bands[1].y0; - - /* Substract the origin of the bands for this tile, to the subwindow */ - /* of interest band coordinates, so as to get them relative to the */ - /* tile */ - win_ll_x0 = opj_uint_subs(win_ll_x0, tr_ll_x0); - win_ll_y0 = opj_uint_subs(win_ll_y0, tr_ll_y0); - 
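The opj_dwt_segment_grow calls a few statements below widen each 1-D window by the filter support and clamp it to the band size. A sketch of the contract those call sites assume:

/* Sketch: grow [*start, *end) by filter_width on both sides, saturating
 * at 0 on the left and clamping to max_size on the right. */
static void segment_grow_sketch(OPJ_UINT32 filter_width, OPJ_UINT32 max_size,
                                OPJ_UINT32* start, OPJ_UINT32* end)
{
    OPJ_UINT64 e = (OPJ_UINT64)(*end) + filter_width; /* no 32-bit wrap */
    *start = (*start >= filter_width) ? (*start - filter_width) : 0U;
    *end   = (e > max_size) ? max_size : (OPJ_UINT32)e;
}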
win_ll_x1 = opj_uint_subs(win_ll_x1, tr_ll_x0); - win_ll_y1 = opj_uint_subs(win_ll_y1, tr_ll_y0); - win_hl_x0 = opj_uint_subs(win_hl_x0, tr_hl_x0); - win_hl_x1 = opj_uint_subs(win_hl_x1, tr_hl_x0); - win_lh_y0 = opj_uint_subs(win_lh_y0, tr_lh_y0); - win_lh_y1 = opj_uint_subs(win_lh_y1, tr_lh_y0); - - opj_dwt_segment_grow(filter_width, (OPJ_UINT32)h.sn, &win_ll_x0, &win_ll_x1); - opj_dwt_segment_grow(filter_width, (OPJ_UINT32)h.dn, &win_hl_x0, &win_hl_x1); - - opj_dwt_segment_grow(filter_width, (OPJ_UINT32)v.sn, &win_ll_y0, &win_ll_y1); - opj_dwt_segment_grow(filter_width, (OPJ_UINT32)v.dn, &win_lh_y0, &win_lh_y1); - - /* Compute the tile-resolution-based coordinates for the window of interest */ - if (h.cas == 0) { - win_tr_x0 = opj_uint_min(2 * win_ll_x0, 2 * win_hl_x0 + 1); - win_tr_x1 = opj_uint_min(opj_uint_max(2 * win_ll_x1, 2 * win_hl_x1 + 1), rw); - } else { - win_tr_x0 = opj_uint_min(2 * win_hl_x0, 2 * win_ll_x0 + 1); - win_tr_x1 = opj_uint_min(opj_uint_max(2 * win_hl_x1, 2 * win_ll_x1 + 1), rw); - } - - if (v.cas == 0) { - win_tr_y0 = opj_uint_min(2 * win_ll_y0, 2 * win_lh_y0 + 1); - win_tr_y1 = opj_uint_min(opj_uint_max(2 * win_ll_y1, 2 * win_lh_y1 + 1), rh); - } else { - win_tr_y0 = opj_uint_min(2 * win_lh_y0, 2 * win_ll_y0 + 1); - win_tr_y1 = opj_uint_min(opj_uint_max(2 * win_lh_y1, 2 * win_ll_y1 + 1), rh); - } - - h.win_l_x0 = win_ll_x0; - h.win_l_x1 = win_ll_x1; - h.win_h_x0 = win_hl_x0; - h.win_h_x1 = win_hl_x1; - for (j = 0; j + 3 < rh; j += 4) { - if ((j + 3 >= win_ll_y0 && j < win_ll_y1) || - (j + 3 >= win_lh_y0 + (OPJ_UINT32)v.sn && - j < win_lh_y1 + (OPJ_UINT32)v.sn)) { - opj_v4dwt_interleave_partial_h(&h, sa, j, opj_uint_min(4U, rh - j)); - opj_v4dwt_decode(&h); - if (!opj_sparse_array_int32_write(sa, - win_tr_x0, j, - win_tr_x1, j + 4, - (OPJ_INT32*)&h.wavelet[win_tr_x0].f[0], - 4, 1, OPJ_TRUE)) { - /* FIXME event manager error callback */ - opj_sparse_array_int32_free(sa); - opj_aligned_free(h.wavelet); - return OPJ_FALSE; - } - } - } - - if (j < rh && - ((j + 3 >= win_ll_y0 && j < win_ll_y1) || - (j + 3 >= win_lh_y0 + (OPJ_UINT32)v.sn && - j < win_lh_y1 + (OPJ_UINT32)v.sn))) { - opj_v4dwt_interleave_partial_h(&h, sa, j, rh - j); - opj_v4dwt_decode(&h); - if (!opj_sparse_array_int32_write(sa, - win_tr_x0, j, - win_tr_x1, rh, - (OPJ_INT32*)&h.wavelet[win_tr_x0].f[0], - 4, 1, OPJ_TRUE)) { - /* FIXME event manager error callback */ - opj_sparse_array_int32_free(sa); - opj_aligned_free(h.wavelet); - return OPJ_FALSE; - } - } - - v.win_l_x0 = win_ll_y0; - v.win_l_x1 = win_ll_y1; - v.win_h_x0 = win_lh_y0; - v.win_h_x1 = win_lh_y1; - for (j = win_tr_x0; j < win_tr_x1; j += 4) { - OPJ_UINT32 nb_elts = opj_uint_min(4U, win_tr_x1 - j); - - opj_v4dwt_interleave_partial_v(&v, sa, j, nb_elts); - opj_v4dwt_decode(&v); - - if (!opj_sparse_array_int32_write(sa, - j, win_tr_y0, - j + nb_elts, win_tr_y1, - (OPJ_INT32*)&h.wavelet[win_tr_y0].f[0], - 1, 4, OPJ_TRUE)) { - /* FIXME event manager error callback */ - opj_sparse_array_int32_free(sa); - opj_aligned_free(h.wavelet); - return OPJ_FALSE; - } - } - } - - { - OPJ_BOOL ret = opj_sparse_array_int32_read(sa, - tr_max->win_x0 - (OPJ_UINT32)tr_max->x0, - tr_max->win_y0 - (OPJ_UINT32)tr_max->y0, - tr_max->win_x1 - (OPJ_UINT32)tr_max->x0, - tr_max->win_y1 - (OPJ_UINT32)tr_max->y0, - tilec->data_win, - 1, tr_max->win_x1 - tr_max->win_x0, - OPJ_TRUE); - assert(ret); - OPJ_UNUSED(ret); - } - opj_sparse_array_int32_free(sa); - - opj_aligned_free(h.wavelet); - return OPJ_TRUE; -} - - -OPJ_BOOL opj_dwt_decode_real(opj_tcd_t *p_tcd, - 
opj_tcd_tilecomp_t* OPJ_RESTRICT tilec, - OPJ_UINT32 numres) -{ - if (p_tcd->whole_tile_decoding) { - return opj_dwt_decode_tile_97(tilec, numres); - } else { - return opj_dwt_decode_partial_97(tilec, numres); - } -} diff --git a/src/3rd/LibOpenJpeg/dwt.h b/src/3rd/LibOpenJpeg/dwt.h deleted file mode 100644 index 4f63e524..00000000 --- a/src/3rd/LibOpenJpeg/dwt.h +++ /dev/null @@ -1,128 +0,0 @@ -/* - * The copyright in this software is being made available under the 2-clauses - * BSD License, included below. This software may be subject to other third - * party and contributor rights, including patent rights, and no such rights - * are granted under this license. - * - * Copyright (c) 2002-2014, Universite catholique de Louvain (UCL), Belgium - * Copyright (c) 2002-2014, Professor Benoit Macq - * Copyright (c) 2001-2003, David Janssens - * Copyright (c) 2002-2003, Yannick Verschueren - * Copyright (c) 2003-2007, Francois-Olivier Devaux - * Copyright (c) 2003-2014, Antonin Descampe - * Copyright (c) 2005, Herve Drolon, FreeImage Team - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS' - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef OPJ_DWT_H -#define OPJ_DWT_H -/** -@file dwt.h -@brief Implementation of a discrete wavelet transform (DWT) - -The functions in DWT.C have for goal to realize forward and inverse discret wavelet -transform with filter 5-3 (reversible) and filter 9-7 (irreversible). The functions in -DWT.C are used by some function in TCD.C. -*/ - -/** @defgroup DWT DWT - Implementation of a discrete wavelet transform */ -/*@{*/ - - -/** @name Exported functions */ -/*@{*/ -/* ----------------------------------------------------------------------- */ -/** -Forward 5-3 wavelet transform in 2-D. -Apply a reversible DWT transform to a component of an image. -@param tilec Tile component information (current tile) -*/ -OPJ_BOOL opj_dwt_encode(opj_tcd_tilecomp_t * tilec); - -/** -Inverse 5-3 wavelet transform in 2-D. -Apply a reversible inverse DWT transform to a component of an image. 
-@param p_tcd TCD handle -@param tilec Tile component information (current tile) -@param numres Number of resolution levels to decode -*/ -OPJ_BOOL opj_dwt_decode(opj_tcd_t *p_tcd, - opj_tcd_tilecomp_t* tilec, - OPJ_UINT32 numres); - -/** -Get the gain of a subband for the reversible 5-3 DWT. -@param orient Number that identifies the subband (0->LL, 1->HL, 2->LH, 3->HH) -@return Returns 0 if orient = 0, returns 1 if orient = 1 or 2, returns 2 otherwise -*/ -OPJ_UINT32 opj_dwt_getgain(OPJ_UINT32 orient) ; -/** -Get the norm of a wavelet function of a subband at a specified level for the reversible 5-3 DWT. -@param level Level of the wavelet function -@param orient Band of the wavelet function -@return Returns the norm of the wavelet function -*/ -OPJ_FLOAT64 opj_dwt_getnorm(OPJ_UINT32 level, OPJ_UINT32 orient); -/** -Forward 9-7 wavelet transform in 2-D. -Apply an irreversible DWT transform to a component of an image. -@param tilec Tile component information (current tile) -*/ -OPJ_BOOL opj_dwt_encode_real(opj_tcd_tilecomp_t * tilec); -/** -Inverse 9-7 wavelet transform in 2-D. -Apply an irreversible inverse DWT transform to a component of an image. -@param p_tcd TCD handle -@param tilec Tile component information (current tile) -@param numres Number of resolution levels to decode -*/ -OPJ_BOOL opj_dwt_decode_real(opj_tcd_t *p_tcd, - opj_tcd_tilecomp_t* OPJ_RESTRICT tilec, - OPJ_UINT32 numres); - -/** -Get the gain of a subband for the irreversible 9-7 DWT. -@param orient Number that identifies the subband (0->LL, 1->HL, 2->LH, 3->HH) -@return Returns the gain of the 9-7 wavelet transform -*/ -OPJ_UINT32 opj_dwt_getgain_real(OPJ_UINT32 orient); -/** -Get the norm of a wavelet function of a subband at a specified level for the irreversible 9-7 DWT -@param level Level of the wavelet function -@param orient Band of the wavelet function -@return Returns the norm of the 9-7 wavelet -*/ -OPJ_FLOAT64 opj_dwt_getnorm_real(OPJ_UINT32 level, OPJ_UINT32 orient); -/** -Explicit calculation of the Quantization Stepsizes -@param tccp Tile-component coding parameters -@param prec Precint analyzed -*/ -void opj_dwt_calc_explicit_stepsizes(opj_tccp_t * tccp, OPJ_UINT32 prec); -/* ----------------------------------------------------------------------- */ -/*@}*/ - -/*@}*/ - -#endif /* OPJ_DWT_H */ diff --git a/src/3rd/LibOpenJpeg/event.c b/src/3rd/LibOpenJpeg/event.c deleted file mode 100644 index aad9d76c..00000000 --- a/src/3rd/LibOpenJpeg/event.c +++ /dev/null @@ -1,151 +0,0 @@ -/* - * The copyright in this software is being made available under the 2-clauses - * BSD License, included below. This software may be subject to other third - * party and contributor rights, including patent rights, and no such rights - * are granted under this license. - * - * Copyright (c) 2005, Herve Drolon, FreeImage Team - * Copyright (c) 2008, 2011-2012, Centre National d'Etudes Spatiales (CNES), FR - * Copyright (c) 2012, CS Systemes d'Information, France - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
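Returning briefly to the gain helpers declared in dwt.h above: the documented 5-3 contract is simple enough to restate as code. A sketch matching the documented return values, not copied from the library's source:

/* 5-3 subband gain per the doc comment: LL -> 0, HL/LH -> 1, HH -> 2. */
static OPJ_UINT32 getgain_53_sketch(OPJ_UINT32 orient)
{
    if (orient == 0) {
        return 0;  /* LL */
    }
    if (orient == 1 || orient == 2) {
        return 1;  /* HL or LH */
    }
    return 2;      /* HH */
}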
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS' - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#include "opj_includes.h" - -/* ========================================================== - Utility functions - ==========================================================*/ - -#ifdef OPJ_CODE_NOT_USED -#ifndef _WIN32 -static char* -i2a(unsigned i, char *a, unsigned r) -{ - if (i / r > 0) { - a = i2a(i / r, a, r); - } - *a = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"[i % r]; - return a + 1; -} - -/** - Transforms integer i into an ascii string and stores the result in a; - string is encoded in the base indicated by r. - @param i Number to be converted - @param a String result - @param r Base of value; must be in the range 2 - 36 - @return Returns a -*/ -static char * -_itoa(int i, char *a, int r) -{ - r = ((r < 2) || (r > 36)) ? 10 : r; - if (i < 0) { - *a = '-'; - *i2a(-i, a + 1, r) = 0; - } else { - *i2a(i, a, r) = 0; - } - return a; -} - -#endif /* !_WIN32 */ -#endif - -/* ----------------------------------------------------------------------- */ -/** - * Default callback function. - * Do nothing. - */ -static void opj_default_callback(const char *msg, void *client_data) -{ - OPJ_ARG_NOT_USED(msg); - OPJ_ARG_NOT_USED(client_data); -} - -/* ----------------------------------------------------------------------- */ - - -/* ----------------------------------------------------------------------- */ -OPJ_BOOL opj_event_msg(opj_event_mgr_t* p_event_mgr, OPJ_INT32 event_type, - const char *fmt, ...) 
-{ -#define OPJ_MSG_SIZE 512 /* 512 bytes should be more than enough for a short message */ - opj_msg_callback msg_handler = 00; - void * l_data = 00; - - if (p_event_mgr != 00) { - switch (event_type) { - case EVT_ERROR: - msg_handler = p_event_mgr->error_handler; - l_data = p_event_mgr->m_error_data; - break; - case EVT_WARNING: - msg_handler = p_event_mgr->warning_handler; - l_data = p_event_mgr->m_warning_data; - break; - case EVT_INFO: - msg_handler = p_event_mgr->info_handler; - l_data = p_event_mgr->m_info_data; - break; - default: - break; - } - if (msg_handler == 00) { - return OPJ_FALSE; - } - } else { - return OPJ_FALSE; - } - - if ((fmt != 00) && (p_event_mgr != 00)) { - va_list arg; - char message[OPJ_MSG_SIZE]; - memset(message, 0, OPJ_MSG_SIZE); - /* initialize the optional parameter list */ - va_start(arg, fmt); - /* parse the format string and put the result in 'message' */ - vsnprintf(message, OPJ_MSG_SIZE, fmt, arg); - /* force zero termination for Windows _vsnprintf() of old MSVC */ - message[OPJ_MSG_SIZE - 1] = '\0'; - /* deinitialize the optional parameter list */ - va_end(arg); - - /* output the message to the user program */ - msg_handler(message, l_data); - } - - return OPJ_TRUE; -} - -void opj_set_default_event_handler(opj_event_mgr_t * p_manager) -{ - p_manager->m_error_data = 00; - p_manager->m_warning_data = 00; - p_manager->m_info_data = 00; - p_manager->error_handler = opj_default_callback; - p_manager->info_handler = opj_default_callback; - p_manager->warning_handler = opj_default_callback; -} - diff --git a/src/3rd/LibOpenJpeg/event.h b/src/3rd/LibOpenJpeg/event.h deleted file mode 100644 index d880388d..00000000 --- a/src/3rd/LibOpenJpeg/event.h +++ /dev/null @@ -1,108 +0,0 @@ -/* - * The copyright in this software is being made available under the 2-clauses - * BSD License, included below. This software may be subject to other third - * party and contributor rights, including patent rights, and no such rights - * are granted under this license. - * - * Copyright (c) 2005, Herve Drolon, FreeImage Team - * Copyright (c) 2008, 2011-2012, Centre National d'Etudes Spatiales (CNES), FR - * Copyright (c) 2012, CS Systemes d'Information, France - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS' - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
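Given the opj_event_msg implementation above and the manager structure declared in this header, a short usage sketch may be useful. The my_log callback and setup_events are hypothetical names; the field names, opj_set_default_event_handler and EVT_WARNING are those defined in event.c/event.h:

#include <stdio.h>

/* Hypothetical callback matching the (const char*, void*) shape used above. */
static void my_log(const char *msg, void *client_data)
{
    fprintf((FILE *)client_data, "%s", msg);
}

static void setup_events(opj_event_mgr_t *mgr)
{
    opj_set_default_event_handler(mgr);  /* start from no-op callbacks */
    mgr->error_handler   = my_log;
    mgr->warning_handler = my_log;
    mgr->m_error_data    = stderr;
    mgr->m_warning_data  = stderr;
    /* later: opj_event_msg(mgr, EVT_WARNING, "tile %d looks odd\n", 3); */
}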
 - */ -#ifndef OPJ_EVENT_H -#define OPJ_EVENT_H -/** -@file event.h -@brief Implementation of an event callback system - -The functions in EVENT.C aim to send output messages (errors, warnings, debug) to the user. -*/ -/** -Message handler object -used for -<ul> -<li>Error messages -<li>Warning messages -<li>Debugging messages -</ul>
-*/ -typedef struct opj_event_mgr { - /** Data to call the event manager upon */ - void * m_error_data; - /** Data to call the event manager upon */ - void * m_warning_data; - /** Data to call the event manager upon */ - void * m_info_data; - /** Error message callback if available, NULL otherwise */ - opj_msg_callback error_handler; - /** Warning message callback if available, NULL otherwise */ - opj_msg_callback warning_handler; - /** Debug message callback if available, NULL otherwise */ - opj_msg_callback info_handler; -} opj_event_mgr_t; - - -#define EVT_ERROR 1 /**< Error event type */ -#define EVT_WARNING 2 /**< Warning event type */ -#define EVT_INFO 4 /**< Debug event type */ - -/** @defgroup EVENT EVENT - Implementation of a event callback system */ -/*@{*/ - -/** @name Exported functions (see also openjpeg.h) */ -/*@{*/ -/* ----------------------------------------------------------------------- */ - - -/* ----------------------------------------------------------------------- */ - -/** - * Write formatted data to a string and send the string to a user callback. - * - * @param event_mgr Event handler - * @param event_type Event type or callback to use to send the message - * @param fmt Format-control string (plus optional arguments) - * - * @return Returns true if successful, returns false otherwise - */ -OPJ_BOOL opj_event_msg(opj_event_mgr_t* event_mgr, OPJ_INT32 event_type, - const char *fmt, ...); -/* ----------------------------------------------------------------------- */ - -/** - * Set the event manager with the default callback function for the 3 levels. - */ -void opj_set_default_event_handler(opj_event_mgr_t * p_manager); - -/* -#ifdef __GNUC__ -#pragma GCC poison printf fprintf -#endif -*/ - -/*@}*/ - -/*@}*/ - -#endif /* OPJ_EVENT_H */ diff --git a/src/3rd/LibOpenJpeg/function_list.c b/src/3rd/LibOpenJpeg/function_list.c deleted file mode 100644 index e1c1af38..00000000 --- a/src/3rd/LibOpenJpeg/function_list.c +++ /dev/null @@ -1,117 +0,0 @@ -/* - * The copyright in this software is being made available under the 2-clauses - * BSD License, included below. This software may be subject to other third - * party and contributor rights, including patent rights, and no such rights - * are granted under this license. - * - * Copyright (c) 2008, Jerome Fimes, Communications & Systemes - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS' - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#include "opj_includes.h" - -/** - * Default size of the validation list, if not sufficient, data will be reallocated with a double size. - */ -#define OPJ_VALIDATION_SIZE 10 - -opj_procedure_list_t * opj_procedure_list_create() -{ - /* memory allocation */ - opj_procedure_list_t * l_validation = (opj_procedure_list_t *) opj_calloc(1, - sizeof(opj_procedure_list_t)); - if (! l_validation) { - return 00; - } - /* initialization */ - l_validation->m_nb_max_procedures = OPJ_VALIDATION_SIZE; - l_validation->m_procedures = (opj_procedure*)opj_calloc(OPJ_VALIDATION_SIZE, - sizeof(opj_procedure)); - if (! l_validation->m_procedures) { - opj_free(l_validation); - return 00; - } - return l_validation; -} - -void opj_procedure_list_destroy(opj_procedure_list_t * p_list) -{ - if (! p_list) { - return; - } - /* initialization */ - if (p_list->m_procedures) { - opj_free(p_list->m_procedures); - } - opj_free(p_list); -} - -OPJ_BOOL opj_procedure_list_add_procedure(opj_procedure_list_t * - p_validation_list, opj_procedure p_procedure, opj_event_mgr_t* p_manager) -{ - - assert(p_manager != NULL); - - if (p_validation_list->m_nb_max_procedures == - p_validation_list->m_nb_procedures) { - opj_procedure * new_procedures; - - p_validation_list->m_nb_max_procedures += OPJ_VALIDATION_SIZE; - new_procedures = (opj_procedure*)opj_realloc( - p_validation_list->m_procedures, - p_validation_list->m_nb_max_procedures * sizeof(opj_procedure)); - if (! new_procedures) { - opj_free(p_validation_list->m_procedures); - p_validation_list->m_nb_max_procedures = 0; - p_validation_list->m_nb_procedures = 0; - opj_event_msg(p_manager, EVT_ERROR, - "Not enough memory to add a new validation procedure\n"); - return OPJ_FALSE; - } else { - p_validation_list->m_procedures = new_procedures; - } - } - p_validation_list->m_procedures[p_validation_list->m_nb_procedures] = - p_procedure; - ++p_validation_list->m_nb_procedures; - - return OPJ_TRUE; -} - -OPJ_UINT32 opj_procedure_list_get_nb_procedures(opj_procedure_list_t * - p_validation_list) -{ - return p_validation_list->m_nb_procedures; -} - -opj_procedure* opj_procedure_list_get_first_procedure(opj_procedure_list_t * - p_validation_list) -{ - return p_validation_list->m_procedures; -} - -void opj_procedure_list_clear(opj_procedure_list_t * p_validation_list) -{ - p_validation_list->m_nb_procedures = 0; -} diff --git a/src/3rd/LibOpenJpeg/function_list.h b/src/3rd/LibOpenJpeg/function_list.h deleted file mode 100644 index 81a3954a..00000000 --- a/src/3rd/LibOpenJpeg/function_list.h +++ /dev/null @@ -1,134 +0,0 @@ -/* - * The copyright in this software is being made available under the 2-clauses - * BSD License, included below. This software may be subject to other third - * party and contributor rights, including patent rights, and no such rights - * are granted under this license. - * - * Copyright (c) 2008, Jerome Fimes, Communications & Systemes - * All rights reserved. 
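A usage sketch for the procedure-list API implemented in function_list.c above; demo_step and run_procedures_sketch are hypothetical names, while every call is one defined there:

static void demo_step(void)
{
    /* one validation step */
}

static OPJ_BOOL run_procedures_sketch(opj_event_mgr_t *p_manager)
{
    OPJ_UINT32 i, n;
    opj_procedure *proc;
    opj_procedure_list_t *list = opj_procedure_list_create();
    if (!list) {
        return OPJ_FALSE;
    }
    if (!opj_procedure_list_add_procedure(list, demo_step, p_manager)) {
        opj_procedure_list_destroy(list);
        return OPJ_FALSE;
    }
    n = opj_procedure_list_get_nb_procedures(list);
    proc = opj_procedure_list_get_first_procedure(list);
    for (i = 0; i < n; ++i) {
        (*proc++)();                    /* run each registered procedure */
    }
    opj_procedure_list_destroy(list);   /* frees the array and the list */
    return OPJ_TRUE;
}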
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS' - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef OPJ_FUNCTION_LIST_H -#define OPJ_FUNCTION_LIST_H - -/** - * @file function_list.h - * @brief Implementation of a list of procedures. - - * The functions in validation.c aims to have access to a list of procedures. -*/ - -/** @defgroup VAL VAL - validation procedure*/ -/*@{*/ - -/************************************************************************************************** - ***************************************** FORWARD DECLARATION ************************************ - **************************************************************************************************/ - -/** - * declare a function pointer - */ -typedef void (*opj_procedure)(void); - -/** - * A list of procedures. -*/ -typedef struct opj_procedure_list { - /** - * The number of validation procedures. - */ - OPJ_UINT32 m_nb_procedures; - /** - * The number of the array of validation procedures. - */ - OPJ_UINT32 m_nb_max_procedures; - /** - * The array of procedures. - */ - opj_procedure * m_procedures; - -} opj_procedure_list_t; - -/* ----------------------------------------------------------------------- */ - -/** - * Creates a validation list. - * - * @return the newly created validation list. - */ -opj_procedure_list_t * opj_procedure_list_create(void); - -/** - * Destroys a validation list. - * - * @param p_list the list to destroy. - */ -void opj_procedure_list_destroy(opj_procedure_list_t * p_list); - -/** - * Adds a new validation procedure. - * - * @param p_validation_list the list of procedure to modify. - * @param p_procedure the procedure to add. - * @param p_manager the user event manager. - * - * @return OPJ_TRUE if the procedure could be added. - */ -OPJ_BOOL opj_procedure_list_add_procedure(opj_procedure_list_t * - p_validation_list, opj_procedure p_procedure, opj_event_mgr_t* p_manager); - -/** - * Gets the number of validation procedures. - * - * @param p_validation_list the list of procedure to modify. - * - * @return the number of validation procedures. - */ -OPJ_UINT32 opj_procedure_list_get_nb_procedures(opj_procedure_list_t * - p_validation_list); - -/** - * Gets the pointer on the first validation procedure. 
This function is similar to the C++ - * iterator class to iterate through all the procedures inside the validation list. - * the caller does not take ownership of the pointer. - * - * @param p_validation_list the list of procedure to get the first procedure from. - * - * @return a pointer to the first procedure. - */ -opj_procedure* opj_procedure_list_get_first_procedure(opj_procedure_list_t * - p_validation_list); - - -/** - * Clears the list of validation procedures. - * - * @param p_validation_list the list of procedure to clear. - * - */ -void opj_procedure_list_clear(opj_procedure_list_t * p_validation_list); -/*@}*/ - -#endif /* OPJ_FUNCTION_LIST_H */ - diff --git a/src/3rd/LibOpenJpeg/image.c b/src/3rd/LibOpenJpeg/image.c deleted file mode 100644 index fe373905..00000000 --- a/src/3rd/LibOpenJpeg/image.c +++ /dev/null @@ -1,264 +0,0 @@ -/* - * The copyright in this software is being made available under the 2-clauses - * BSD License, included below. This software may be subject to other third - * party and contributor rights, including patent rights, and no such rights - * are granted under this license. - * - * Copyright (c) 2005, Herve Drolon, FreeImage Team - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS' - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- */ - -#include "opj_includes.h" - -opj_image_t* opj_image_create0(void) -{ - opj_image_t *image = (opj_image_t*)opj_calloc(1, sizeof(opj_image_t)); - return image; -} - -opj_image_t* OPJ_CALLCONV opj_image_create(OPJ_UINT32 numcmpts, - opj_image_cmptparm_t *cmptparms, OPJ_COLOR_SPACE clrspc) -{ - OPJ_UINT32 compno; - opj_image_t *image = NULL; - - image = (opj_image_t*) opj_calloc(1, sizeof(opj_image_t)); - if (image) { - image->color_space = clrspc; - image->numcomps = numcmpts; - /* allocate memory for the per-component information */ - image->comps = (opj_image_comp_t*)opj_calloc(image->numcomps, - sizeof(opj_image_comp_t)); - if (!image->comps) { - /* TODO replace with event manager, breaks API */ - /* fprintf(stderr,"Unable to allocate memory for image.\n"); */ - opj_image_destroy(image); - return NULL; - } - /* create the individual image components */ - for (compno = 0; compno < numcmpts; compno++) { - opj_image_comp_t *comp = &image->comps[compno]; - comp->dx = cmptparms[compno].dx; - comp->dy = cmptparms[compno].dy; - comp->w = cmptparms[compno].w; - comp->h = cmptparms[compno].h; - comp->x0 = cmptparms[compno].x0; - comp->y0 = cmptparms[compno].y0; - comp->prec = cmptparms[compno].prec; - comp->bpp = cmptparms[compno].bpp; - comp->sgnd = cmptparms[compno].sgnd; - if (comp->h != 0 && - (OPJ_SIZE_T)comp->w > SIZE_MAX / comp->h / sizeof(OPJ_INT32)) { - /* TODO event manager */ - opj_image_destroy(image); - return NULL; - } - comp->data = (OPJ_INT32*) opj_image_data_alloc( - (size_t)comp->w * comp->h * sizeof(OPJ_INT32)); - if (!comp->data) { - /* TODO replace with event manager, breaks API */ - /* fprintf(stderr,"Unable to allocate memory for image.\n"); */ - opj_image_destroy(image); - return NULL; - } - memset(comp->data, 0, (size_t)comp->w * comp->h * sizeof(OPJ_INT32)); - } - } - - return image; -} - -void OPJ_CALLCONV opj_image_destroy(opj_image_t *image) -{ - if (image) { - if (image->comps) { - OPJ_UINT32 compno; - - /* image components */ - for (compno = 0; compno < image->numcomps; compno++) { - opj_image_comp_t *image_comp = &(image->comps[compno]); - if (image_comp->data) { - opj_image_data_free(image_comp->data); - } - } - opj_free(image->comps); - } - - if (image->icc_profile_buf) { - opj_free(image->icc_profile_buf); - } - - opj_free(image); - } -} - -/** - * Updates the components characteristics of the image from the coding parameters. - * - * @param p_image_header the image header to update. - * @param p_cp the coding parameters from which to update the image. - */ -void opj_image_comp_header_update(opj_image_t * p_image_header, - const struct opj_cp * p_cp) -{ - OPJ_UINT32 i, l_width, l_height; - OPJ_UINT32 l_x0, l_y0, l_x1, l_y1; - OPJ_UINT32 l_comp_x0, l_comp_y0, l_comp_x1, l_comp_y1; - opj_image_comp_t* l_img_comp = NULL; - - l_x0 = opj_uint_max(p_cp->tx0, p_image_header->x0); - l_y0 = opj_uint_max(p_cp->ty0, p_image_header->y0); - l_x1 = p_cp->tx0 + (p_cp->tw - 1U) * - p_cp->tdx; /* validity of p_cp members used here checked in opj_j2k_read_siz. Can't overflow. 
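The opj_uint_adds calls just below clamp at the top of the 32-bit range rather than wrapping, which is what makes the "add saturated" comments hold. A sketch of the assumed helper (the real one lives elsewhere in the library):

/* Sketch only: unsigned add clamped at 0xFFFFFFFF instead of wrapping. */
static OPJ_UINT32 uint_adds_sketch(OPJ_UINT32 a, OPJ_UINT32 b)
{
    OPJ_UINT64 sum = (OPJ_UINT64)a + (OPJ_UINT64)b;
    return (sum > 0xFFFFFFFFU) ? 0xFFFFFFFFU : (OPJ_UINT32)sum;
}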
*/ - l_y1 = p_cp->ty0 + (p_cp->th - 1U) * p_cp->tdy; /* can't overflow */ - l_x1 = opj_uint_min(opj_uint_adds(l_x1, p_cp->tdx), - p_image_header->x1); /* use add saturated to prevent overflow */ - l_y1 = opj_uint_min(opj_uint_adds(l_y1, p_cp->tdy), - p_image_header->y1); /* use add saturated to prevent overflow */ - - l_img_comp = p_image_header->comps; - for (i = 0; i < p_image_header->numcomps; ++i) { - l_comp_x0 = opj_uint_ceildiv(l_x0, l_img_comp->dx); - l_comp_y0 = opj_uint_ceildiv(l_y0, l_img_comp->dy); - l_comp_x1 = opj_uint_ceildiv(l_x1, l_img_comp->dx); - l_comp_y1 = opj_uint_ceildiv(l_y1, l_img_comp->dy); - l_width = opj_uint_ceildivpow2(l_comp_x1 - l_comp_x0, l_img_comp->factor); - l_height = opj_uint_ceildivpow2(l_comp_y1 - l_comp_y0, l_img_comp->factor); - l_img_comp->w = l_width; - l_img_comp->h = l_height; - l_img_comp->x0 = l_comp_x0; - l_img_comp->y0 = l_comp_y0; - ++l_img_comp; - } -} - - -/** - * Copy only header of image and its component header (no data are copied) - * if dest image have data, they will be freed - * - * @param p_image_src the src image - * @param p_image_dest the dest image - * - */ -void opj_copy_image_header(const opj_image_t* p_image_src, - opj_image_t* p_image_dest) -{ - OPJ_UINT32 compno; - - /* preconditions */ - assert(p_image_src != 00); - assert(p_image_dest != 00); - - p_image_dest->x0 = p_image_src->x0; - p_image_dest->y0 = p_image_src->y0; - p_image_dest->x1 = p_image_src->x1; - p_image_dest->y1 = p_image_src->y1; - - if (p_image_dest->comps) { - for (compno = 0; compno < p_image_dest->numcomps; compno++) { - opj_image_comp_t *image_comp = &(p_image_dest->comps[compno]); - if (image_comp->data) { - opj_image_data_free(image_comp->data); - } - } - opj_free(p_image_dest->comps); - p_image_dest->comps = NULL; - } - - p_image_dest->numcomps = p_image_src->numcomps; - - p_image_dest->comps = (opj_image_comp_t*) opj_malloc(p_image_dest->numcomps * - sizeof(opj_image_comp_t)); - if (!p_image_dest->comps) { - p_image_dest->comps = NULL; - p_image_dest->numcomps = 0; - return; - } - - for (compno = 0; compno < p_image_dest->numcomps; compno++) { - memcpy(&(p_image_dest->comps[compno]), - &(p_image_src->comps[compno]), - sizeof(opj_image_comp_t)); - p_image_dest->comps[compno].data = NULL; - } - - p_image_dest->color_space = p_image_src->color_space; - p_image_dest->icc_profile_len = p_image_src->icc_profile_len; - - if (p_image_dest->icc_profile_len) { - p_image_dest->icc_profile_buf = (OPJ_BYTE*)opj_malloc( - p_image_dest->icc_profile_len); - if (!p_image_dest->icc_profile_buf) { - p_image_dest->icc_profile_buf = NULL; - p_image_dest->icc_profile_len = 0; - return; - } - memcpy(p_image_dest->icc_profile_buf, - p_image_src->icc_profile_buf, - p_image_src->icc_profile_len); - } else { - p_image_dest->icc_profile_buf = NULL; - } - - return; -} - -opj_image_t* OPJ_CALLCONV opj_image_tile_create(OPJ_UINT32 numcmpts, - opj_image_cmptparm_t *cmptparms, OPJ_COLOR_SPACE clrspc) -{ - OPJ_UINT32 compno; - opj_image_t *image = 00; - - image = (opj_image_t*) opj_calloc(1, sizeof(opj_image_t)); - if (image) { - - image->color_space = clrspc; - image->numcomps = numcmpts; - - /* allocate memory for the per-component information */ - image->comps = (opj_image_comp_t*)opj_calloc(image->numcomps, - sizeof(opj_image_comp_t)); - if (!image->comps) { - opj_image_destroy(image); - return 00; - } - - /* create the individual image components */ - for (compno = 0; compno < numcmpts; compno++) { - opj_image_comp_t *comp = &image->comps[compno]; - comp->dx = 
cmptparms[compno].dx; - comp->dy = cmptparms[compno].dy; - comp->w = cmptparms[compno].w; - comp->h = cmptparms[compno].h; - comp->x0 = cmptparms[compno].x0; - comp->y0 = cmptparms[compno].y0; - comp->prec = cmptparms[compno].prec; - comp->sgnd = cmptparms[compno].sgnd; - comp->data = 0; - } - } - - return image; -} diff --git a/src/3rd/LibOpenJpeg/image.h b/src/3rd/LibOpenJpeg/image.h deleted file mode 100644 index bad83c61..00000000 --- a/src/3rd/LibOpenJpeg/image.h +++ /dev/null @@ -1,70 +0,0 @@ -/* - * The copyright in this software is being made available under the 2-clauses - * BSD License, included below. This software may be subject to other third - * party and contributor rights, including patent rights, and no such rights - * are granted under this license. - * - * Copyright (c) 2005, Herve Drolon, FreeImage Team - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS' - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ -#ifndef OPJ_IMAGE_H -#define OPJ_IMAGE_H -/** -@file image.h -@brief Implementation of operations on images (IMAGE) - -The functions in IMAGE.C have for goal to realize operations on images. -*/ - -struct opj_image; -struct opj_cp; - -/** @defgroup IMAGE IMAGE - Implementation of operations on images */ -/*@{*/ - -/** - * Create an empty image - * - * @return returns an empty image if successful, returns NULL otherwise - */ -opj_image_t* opj_image_create0(void); - - - -/** - * Updates the components characteristics of the image from the coding parameters. - * - * @param p_image_header the image header to update. - * @param p_cp the coding parameters from which to update the image. - */ -void opj_image_comp_header_update(opj_image_t * p_image, - const struct opj_cp* p_cp); - -void opj_copy_image_header(const opj_image_t* p_image_src, - opj_image_t* p_image_dest); - -/*@}*/ - -#endif /* OPJ_IMAGE_H */ - diff --git a/src/3rd/LibOpenJpeg/invert.c b/src/3rd/LibOpenJpeg/invert.c deleted file mode 100644 index 89f60715..00000000 --- a/src/3rd/LibOpenJpeg/invert.c +++ /dev/null @@ -1,295 +0,0 @@ -/* - * The copyright in this software is being made available under the 2-clauses - * BSD License, included below. 
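Before moving on to invert.c: a usage sketch for the image constructor in image.c above, building a single 8-bit greyscale component. OPJ_CLRSPC_GRAY is assumed from openjpeg.h; the parameter fields are the ones the constructor reads:

/* Sketch: create (and later destroy) a one-component greyscale image. */
static opj_image_t* make_gray_image_sketch(OPJ_UINT32 w, OPJ_UINT32 h)
{
    opj_image_cmptparm_t parm = { 0 };  /* zero all fields first */
    parm.dx = 1;
    parm.dy = 1;
    parm.w = w;
    parm.h = h;
    parm.prec = 8;   /* 8-bit samples */
    parm.sgnd = 0;   /* unsigned */
    /* caller releases the result with opj_image_destroy() */
    return opj_image_create(1, &parm, OPJ_CLRSPC_GRAY);
}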
This software may be subject to other third - * party and contributor rights, including patent rights, and no such rights - * are granted under this license. - * - * Copyright (c) 2008, Jerome Fimes, Communications & Systemes - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS' - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#include "opj_includes.h" - -/** - * LUP decomposition - */ -static OPJ_BOOL opj_lupDecompose(OPJ_FLOAT32 * matrix, - OPJ_UINT32 * permutations, - OPJ_FLOAT32 * p_swap_area, - OPJ_UINT32 nb_compo); -/** - * LUP solving - */ -static void opj_lupSolve(OPJ_FLOAT32 * pResult, - OPJ_FLOAT32* pMatrix, - OPJ_FLOAT32* pVector, - OPJ_UINT32* pPermutations, - OPJ_UINT32 nb_compo, - OPJ_FLOAT32 * p_intermediate_data); - -/** - *LUP inversion (call with the result of lupDecompose) - */ -static void opj_lupInvert(OPJ_FLOAT32 * pSrcMatrix, - OPJ_FLOAT32 * pDestMatrix, - OPJ_UINT32 nb_compo, - OPJ_UINT32 * pPermutations, - OPJ_FLOAT32 * p_src_temp, - OPJ_FLOAT32 * p_dest_temp, - OPJ_FLOAT32 * p_swap_area); - -/* -========================================================== - Matric inversion interface -========================================================== -*/ -/** - * Matrix inversion. - */ -OPJ_BOOL opj_matrix_inversion_f(OPJ_FLOAT32 * pSrcMatrix, - OPJ_FLOAT32 * pDestMatrix, - OPJ_UINT32 nb_compo) -{ - OPJ_BYTE * l_data = 00; - OPJ_UINT32 l_permutation_size = nb_compo * (OPJ_UINT32)sizeof(OPJ_UINT32); - OPJ_UINT32 l_swap_size = nb_compo * (OPJ_UINT32)sizeof(OPJ_FLOAT32); - OPJ_UINT32 l_total_size = l_permutation_size + 3 * l_swap_size; - OPJ_UINT32 * lPermutations = 00; - OPJ_FLOAT32 * l_double_data = 00; - - l_data = (OPJ_BYTE *) opj_malloc(l_total_size); - if (l_data == 0) { - return OPJ_FALSE; - } - lPermutations = (OPJ_UINT32 *) l_data; - l_double_data = (OPJ_FLOAT32 *)(l_data + l_permutation_size); - memset(lPermutations, 0, l_permutation_size); - - if (! 
opj_lupDecompose(pSrcMatrix, lPermutations, l_double_data, nb_compo)) { - opj_free(l_data); - return OPJ_FALSE; - } - - opj_lupInvert(pSrcMatrix, pDestMatrix, nb_compo, lPermutations, l_double_data, - l_double_data + nb_compo, l_double_data + 2 * nb_compo); - opj_free(l_data); - - return OPJ_TRUE; -} - - -/* -========================================================== - Local functions -========================================================== -*/ -static OPJ_BOOL opj_lupDecompose(OPJ_FLOAT32 * matrix, - OPJ_UINT32 * permutations, - OPJ_FLOAT32 * p_swap_area, - OPJ_UINT32 nb_compo) -{ - OPJ_UINT32 * tmpPermutations = permutations; - OPJ_UINT32 * dstPermutations; - OPJ_UINT32 k2 = 0, t; - OPJ_FLOAT32 temp; - OPJ_UINT32 i, j, k; - OPJ_FLOAT32 p; - OPJ_UINT32 lLastColum = nb_compo - 1; - OPJ_UINT32 lSwapSize = nb_compo * (OPJ_UINT32)sizeof(OPJ_FLOAT32); - OPJ_FLOAT32 * lTmpMatrix = matrix; - OPJ_FLOAT32 * lColumnMatrix, * lDestMatrix; - OPJ_UINT32 offset = 1; - OPJ_UINT32 lStride = nb_compo - 1; - - /* initialize permutations */ - for (i = 0; i < nb_compo; ++i) { - *tmpPermutations++ = i; - } - /* now make a pivot with column switch */ - tmpPermutations = permutations; - for (k = 0; k < lLastColum; ++k) { - p = 0.0; - - /* take the middle element */ - lColumnMatrix = lTmpMatrix + k; - - /* make permutation with the biggest value in the column */ - for (i = k; i < nb_compo; ++i) { - temp = ((*lColumnMatrix > 0) ? *lColumnMatrix : -(*lColumnMatrix)); - if (temp > p) { - p = temp; - k2 = i; - } - /* next line */ - lColumnMatrix += nb_compo; - } - - /* the rest of the column is all 0 -> the matrix is singular */ - if (p == 0.0) { - return OPJ_FALSE; - } - - /* should we permute ? */ - if (k2 != k) { - /* exchange of line */ - /* k2 > k */ - dstPermutations = tmpPermutations + k2 - k; - /* swap indices */ - t = *tmpPermutations; - *tmpPermutations = *dstPermutations; - *dstPermutations = t; - - /* and swap entire line. */ - lColumnMatrix = lTmpMatrix + (k2 - k) * nb_compo; - memcpy(p_swap_area, lColumnMatrix, lSwapSize); - memcpy(lColumnMatrix, lTmpMatrix, lSwapSize); - memcpy(lTmpMatrix, p_swap_area, lSwapSize); - } - - /* now update data in the rest of the line and line after */ - lDestMatrix = lTmpMatrix + k; - lColumnMatrix = lDestMatrix + nb_compo; - /* take the middle element */ - temp = *(lDestMatrix++); - - /* now eliminate below the diagonal (i.e. update the rows under the pivot).
*/ - for (i = offset; i < nb_compo; ++i) { - /*lColumnMatrix; */ - /* divide the lower column elements by the diagonal value */ - - /* matrix[i][k] /= matrix[k][k]; */ - /* p = matrix[i][k] */ - p = *lColumnMatrix / temp; - *(lColumnMatrix++) = p; - - for (j = /* k + 1 */ offset; j < nb_compo; ++j) { - /* matrix[i][j] -= matrix[i][k] * matrix[k][j]; */ - *(lColumnMatrix++) -= p * (*(lDestMatrix++)); - } - /* come back to the k+1th element */ - lDestMatrix -= lStride; - /* go to kth element of the next line */ - lColumnMatrix += k; - } - - /* offset is now k+2 */ - ++offset; - /* 1 element less for stride */ - --lStride; - /* next line */ - lTmpMatrix += nb_compo; - /* next permutation element */ - ++tmpPermutations; - } - return OPJ_TRUE; -} - -static void opj_lupSolve(OPJ_FLOAT32 * pResult, - OPJ_FLOAT32 * pMatrix, - OPJ_FLOAT32 * pVector, - OPJ_UINT32* pPermutations, - OPJ_UINT32 nb_compo, OPJ_FLOAT32 * p_intermediate_data) -{ - OPJ_INT32 k; - OPJ_UINT32 i, j; - OPJ_FLOAT32 sum; - OPJ_FLOAT32 u; - OPJ_UINT32 lStride = nb_compo + 1; - OPJ_FLOAT32 * lCurrentPtr; - OPJ_FLOAT32 * lIntermediatePtr; - OPJ_FLOAT32 * lDestPtr; - OPJ_FLOAT32 * lTmpMatrix; - OPJ_FLOAT32 * lLineMatrix = pMatrix; - OPJ_FLOAT32 * lBeginPtr = pResult + nb_compo - 1; - OPJ_FLOAT32 * lGeneratedData; - OPJ_UINT32 * lCurrentPermutationPtr = pPermutations; - - - lIntermediatePtr = p_intermediate_data; - lGeneratedData = p_intermediate_data + nb_compo - 1; - - for (i = 0; i < nb_compo; ++i) { - sum = 0.0; - lCurrentPtr = p_intermediate_data; - lTmpMatrix = lLineMatrix; - for (j = 1; j <= i; ++j) { - /* sum += matrix[i][j-1] * y[j-1]; */ - sum += (*(lTmpMatrix++)) * (*(lCurrentPtr++)); - } - /*y[i] = pVector[pPermutations[i]] - sum; */ - *(lIntermediatePtr++) = pVector[*(lCurrentPermutationPtr++)] - sum; - lLineMatrix += nb_compo; - } - - /* we take the last point of the matrix */ - lLineMatrix = pMatrix + nb_compo * nb_compo - 1; - - /* and we take after the last point of the destination vector */ - lDestPtr = pResult + nb_compo; - - - assert(nb_compo != 0); - for (k = (OPJ_INT32)nb_compo - 1; k != -1 ; --k) { - sum = 0.0; - lTmpMatrix = lLineMatrix; - u = *(lTmpMatrix++); - lCurrentPtr = lDestPtr--; - for (j = (OPJ_UINT32)(k + 1); j < nb_compo; ++j) { - /* sum += matrix[k][j] * x[j] */ - sum += (*(lTmpMatrix++)) * (*(lCurrentPtr++)); - } - /*x[k] = (y[k] - sum) / u; */ - *(lBeginPtr--) = (*(lGeneratedData--) - sum) / u; - lLineMatrix -= lStride; - } -} - - -static void opj_lupInvert(OPJ_FLOAT32 * pSrcMatrix, - OPJ_FLOAT32 * pDestMatrix, - OPJ_UINT32 nb_compo, - OPJ_UINT32 * pPermutations, - OPJ_FLOAT32 * p_src_temp, - OPJ_FLOAT32 * p_dest_temp, - OPJ_FLOAT32 * p_swap_area) -{ - OPJ_UINT32 j, i; - OPJ_FLOAT32 * lCurrentPtr; - OPJ_FLOAT32 * lLineMatrix = pDestMatrix; - OPJ_UINT32 lSwapSize = nb_compo * (OPJ_UINT32)sizeof(OPJ_FLOAT32); - - for (j = 0; j < nb_compo; ++j) { - lCurrentPtr = lLineMatrix++; - memset(p_src_temp, 0, lSwapSize); - p_src_temp[j] = 1.0; - opj_lupSolve(p_dest_temp, pSrcMatrix, p_src_temp, pPermutations, nb_compo, - p_swap_area); - - for (i = 0; i < nb_compo; ++i) { - *(lCurrentPtr) = p_dest_temp[i]; - lCurrentPtr += nb_compo; - } - } -} - diff --git a/src/3rd/LibOpenJpeg/invert.h b/src/3rd/LibOpenJpeg/invert.h deleted file mode 100644 index 70402135..00000000 --- a/src/3rd/LibOpenJpeg/invert.h +++ /dev/null @@ -1,64 +0,0 @@ -/* - * The copyright in this software is being made available under the 2-clauses - * BSD License, included below. 
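The three static helpers above form the classic invert-via-LUP pipeline: decompose the matrix in place, then solve one unit vector per column to build the inverse. A minimal caller sketch, hedged: opj_matrix_inversion_f is an internal symbol, so this only links against the library's internals with opj_includes.h on the include path, and the 2x2 values are arbitrary.

```c
#include <stdio.h>
#include "opj_includes.h"   /* OPJ_FLOAT32, OPJ_BOOL, opj_matrix_inversion_f */

int main(void)
{
    /* Row-major 2x2 source; the LUP decomposition clobbers it,
     * so keep a copy if the original values are still needed. */
    OPJ_FLOAT32 src[4]  = { 4.0f, 7.0f, 2.0f, 6.0f };
    OPJ_FLOAT32 dest[4] = { 0.0f };

    if (!opj_matrix_inversion_f(src, dest, 2)) {
        fprintf(stderr, "singular matrix\n");
        return 1;
    }
    /* det = 10, so the inverse is { 0.6, -0.7, -0.2, 0.4 } */
    printf("%g %g\n%g %g\n", dest[0], dest[1], dest[2], dest[3]);
    return 0;
}
```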
This software may be subject to other third - * party and contributor rights, including patent rights, and no such rights - * are granted under this license. - * - * Copyright (c) 2008, Jerome Fimes, Communications & Systemes - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS' - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef OPJ_INVERT_H -#define OPJ_INVERT_H -/** -@file invert.h -@brief Implementation of the matrix inversion - -The function declared in invert.h computes a matrix inversion using a LUP method -*/ - -/** @defgroup INVERT INVERT - Implementation of a matrix inversion */ -/*@{*/ -/** @name Exported functions */ -/*@{*/ -/* ----------------------------------------------------------------------- */ - -/** - * Calculates the inverse of an n x n float matrix using a LUP method. Data is stored contiguously, row after row (or column after column). - * The function does not take ownership of any memory block, data must be freed by the user. - * - * @param pSrcMatrix the matrix to invert. - * @param pDestMatrix data to store the inverted matrix. - * @param nb_compo size of the matrix - * @return OPJ_TRUE if the inversion is successful, OPJ_FALSE if the matrix is singular. - */ -OPJ_BOOL opj_matrix_inversion_f(OPJ_FLOAT32 * pSrcMatrix, - OPJ_FLOAT32 * pDestMatrix, - OPJ_UINT32 nb_compo); -/* ----------------------------------------------------------------------- */ -/*@}*/ - -/*@}*/ - -#endif /* OPJ_INVERT_H */ diff --git a/src/3rd/LibOpenJpeg/j2k.c b/src/3rd/LibOpenJpeg/j2k.c deleted file mode 100644 index 6e9cf8ce..00000000 --- a/src/3rd/LibOpenJpeg/j2k.c +++ /dev/null @@ -1,12126 +0,0 @@ -/* - * The copyright in this software is being made available under the 2-clauses - * BSD License, included below. This software may be subject to other third - * party and contributor rights, including patent rights, and no such rights - * are granted under this license.
- * - * Copyright (c) 2002-2014, Universite catholique de Louvain (UCL), Belgium - * Copyright (c) 2002-2014, Professor Benoit Macq - * Copyright (c) 2001-2003, David Janssens - * Copyright (c) 2002-2003, Yannick Verschueren - * Copyright (c) 2003-2007, Francois-Olivier Devaux - * Copyright (c) 2003-2014, Antonin Descampe - * Copyright (c) 2005, Herve Drolon, FreeImage Team - * Copyright (c) 2008, Jerome Fimes, Communications & Systemes - * Copyright (c) 2006-2007, Parvatha Elangovan - * Copyright (c) 2010-2011, Kaori Hagihara - * Copyright (c) 2011-2012, Centre National d'Etudes Spatiales (CNES), France - * Copyright (c) 2012, CS Systemes d'Information, France - * Copyright (c) 2017, IntoPIX SA - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS' - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#include "opj_includes.h" - -/** @defgroup J2K J2K - JPEG-2000 codestream reader/writer */ -/*@{*/ - -/** @name Local static functions */ -/*@{*/ - -/** - * Sets up the procedures to run when reading the header. Developers wanting to extend the library can add their own reading procedures. - */ -static OPJ_BOOL opj_j2k_setup_header_reading(opj_j2k_t *p_j2k, - opj_event_mgr_t * p_manager); - -/** - * The read header procedure. - */ -static OPJ_BOOL opj_j2k_read_header_procedure(opj_j2k_t *p_j2k, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager); - -/** - * The default encoding validation procedure without any extension. - * - * @param p_j2k the jpeg2000 codec to validate. - * @param p_stream the input stream to validate. - * @param p_manager the user event manager. - * - * @return true if the parameters are correct. - */ -static OPJ_BOOL opj_j2k_encoding_validation(opj_j2k_t * p_j2k, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager); - -/** - * The default decoding validation procedure without any extension. - * - * @param p_j2k the jpeg2000 codec to validate. - * @param p_stream the input stream to validate. - * @param p_manager the user event manager. - * - * @return true if the parameters are correct. - */ -static OPJ_BOOL opj_j2k_decoding_validation(opj_j2k_t * p_j2k, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager); - -/** - * Sets up the validation, i.e. adds the procedures to launch to make sure the codec parameters - * are valid.
Developers wanting to extend the library can add their own validation procedures. - */ -static OPJ_BOOL opj_j2k_setup_encoding_validation(opj_j2k_t *p_j2k, - opj_event_mgr_t * p_manager); - -/** - * Sets up the validation, i.e. adds the procedures to launch to make sure the codec parameters - * are valid. Developers wanting to extend the library can add their own validation procedures. - */ -static OPJ_BOOL opj_j2k_setup_decoding_validation(opj_j2k_t *p_j2k, - opj_event_mgr_t * p_manager); - -/** - * Sets up the end-of-compression procedures, i.e. adds the procedures to launch when the compression is finished. Developers wanting to extend the library can add their own procedures. - */ -static OPJ_BOOL opj_j2k_setup_end_compress(opj_j2k_t *p_j2k, - opj_event_mgr_t * p_manager); - -/** - * The mct encoding validation procedure. - * - * @param p_j2k the jpeg2000 codec to validate. - * @param p_stream the input stream to validate. - * @param p_manager the user event manager. - * - * @return true if the parameters are correct. - */ -static OPJ_BOOL opj_j2k_mct_validation(opj_j2k_t * p_j2k, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager); - -/** - * Builds the tcd decoder to use to decode tile. - */ -static OPJ_BOOL opj_j2k_build_decoder(opj_j2k_t * p_j2k, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager); -/** - * Builds the tcd encoder to use to encode tile. - */ -static OPJ_BOOL opj_j2k_build_encoder(opj_j2k_t * p_j2k, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager); - -/** - * Creates a tile-coder encoder. - * - * @param p_stream the stream to write data to. - * @param p_j2k J2K codec. - * @param p_manager the user event manager. -*/ -static OPJ_BOOL opj_j2k_create_tcd(opj_j2k_t *p_j2k, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager); - -/** - * Executes the given procedures on the given codec. - * - * @param p_procedure_list the list of procedures to execute - * @param p_j2k the jpeg2000 codec to execute the procedures on. - * @param p_stream the stream to execute the procedures on. - * @param p_manager the user manager. - * - * @return true if all the procedures were successfully executed. - */ -static OPJ_BOOL opj_j2k_exec(opj_j2k_t * p_j2k, - opj_procedure_list_t * p_procedure_list, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager); - -/** - * Updates the rates of the tcp. - * - * @param p_stream the stream to write data to. - * @param p_j2k J2K codec. - * @param p_manager the user event manager. -*/ -static OPJ_BOOL opj_j2k_update_rates(opj_j2k_t *p_j2k, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager); - -/** - * Copies the decoding tile parameters onto all the tile parameters. - * Also creates the tile decoder. - */ -static OPJ_BOOL opj_j2k_copy_default_tcp_and_create_tcd(opj_j2k_t * p_j2k, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager); - -/** - * Destroys the memory associated with the decoding of headers. - */ -static OPJ_BOOL opj_j2k_destroy_header_memory(opj_j2k_t * p_j2k, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager); - -/** - * Searches the lookup table containing all the markers, states and actions, and returns the handler associated - * with the marker value. - * @param p_id Marker value to look up - * - * @return the handler associated with the id. -*/ -static const struct opj_dec_memory_marker_handler * opj_j2k_get_marker_handler( - OPJ_UINT32 p_id); - -/** - * Destroys a tile coding parameter structure.
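The setup_*/exec pairs above all follow one pattern: each setup routine queues OPJ_BOOL-returning steps on a procedure list, and opj_j2k_exec() later runs them in order, stopping at the first failure. A generic sketch of that pattern (not the OpenJPEG types; the names below are invented for illustration):

```c
#include <stddef.h>

typedef int (*j2k_step_fn)(void *ctx);   /* stand-in for one queued procedure */

/* Run every queued procedure in order; report failure as soon as one fails. */
static int run_procedures(j2k_step_fn *steps, size_t n, void *ctx)
{
    for (size_t i = 0; i < n; ++i) {
        if (!steps[i](ctx)) {
            return 0;   /* a validation/encoding step failed */
        }
    }
    return 1;           /* every procedure succeeded */
}
```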
- * - * @param p_tcp the tile coding parameter to destroy. - */ -static void opj_j2k_tcp_destroy(opj_tcp_t *p_tcp); - -/** - * Destroys the data inside a tile coding parameter structure. - * - * @param p_tcp the tile coding parameter which contains data to destroy. - */ -static void opj_j2k_tcp_data_destroy(opj_tcp_t *p_tcp); - -/** - * Destroys a coding parameter structure. - * - * @param p_cp the coding parameter to destroy. - */ -static void opj_j2k_cp_destroy(opj_cp_t *p_cp); - -/** - * Compares 2 SPCod/SPCoc elements, i.e. the coding style of a given component of a tile. - * - * @param p_j2k J2K codec. - * @param p_tile_no Tile number - * @param p_first_comp_no The 1st component number to compare. - * @param p_second_comp_no The 2nd component number to compare. - * - * @return OPJ_TRUE if the SPCod/SPCoc elements are equal. - */ -static OPJ_BOOL opj_j2k_compare_SPCod_SPCoc(opj_j2k_t *p_j2k, - OPJ_UINT32 p_tile_no, OPJ_UINT32 p_first_comp_no, OPJ_UINT32 p_second_comp_no); - -/** - * Writes a SPCod or SPCoc element, i.e. the coding style of a given component of a tile. - * - * @param p_j2k J2K codec. - * @param p_tile_no FIXME DOC - * @param p_comp_no the component number to output. - * @param p_data FIXME DOC - * @param p_header_size FIXME DOC - * @param p_manager the user event manager. - * - * @return FIXME DOC -*/ -static OPJ_BOOL opj_j2k_write_SPCod_SPCoc(opj_j2k_t *p_j2k, - OPJ_UINT32 p_tile_no, - OPJ_UINT32 p_comp_no, - OPJ_BYTE * p_data, - OPJ_UINT32 * p_header_size, - opj_event_mgr_t * p_manager); - -/** - * Gets the size taken by writing a SPCod or SPCoc for the given tile and component. - * - * @param p_j2k the J2K codec. - * @param p_tile_no the tile index. - * @param p_comp_no the component being output. - * - * @return the number of bytes taken by the SPCod element. - */ -static OPJ_UINT32 opj_j2k_get_SPCod_SPCoc_size(opj_j2k_t *p_j2k, - OPJ_UINT32 p_tile_no, - OPJ_UINT32 p_comp_no); - -/** - * Reads a SPCod or SPCoc element, i.e. the coding style of a given component of a tile. - * @param p_j2k the jpeg2000 codec. - * @param compno FIXME DOC - * @param p_header_data the data contained in the COD/COC box. - * @param p_header_size the size of the data contained in the COD/COC marker. - * @param p_manager the user event manager. -*/ -static OPJ_BOOL opj_j2k_read_SPCod_SPCoc(opj_j2k_t *p_j2k, - OPJ_UINT32 compno, - OPJ_BYTE * p_header_data, - OPJ_UINT32 * p_header_size, - opj_event_mgr_t * p_manager); - -/** - * Gets the size taken by writing SQcd or SQcc element, i.e. the quantization values of a band in the QCD or QCC. - * - * @param p_tile_no the tile index. - * @param p_comp_no the component being output. - * @param p_j2k the J2K codec. - * - * @return the number of bytes taken by the SQcd element. - */ -static OPJ_UINT32 opj_j2k_get_SQcd_SQcc_size(opj_j2k_t *p_j2k, - OPJ_UINT32 p_tile_no, - OPJ_UINT32 p_comp_no); - -/** - * Compares 2 SQcd or SQcc elements, i.e. the quantization values of a band in the QCD or QCC. - * - * @param p_j2k J2K codec. - * @param p_tile_no the tile to output. - * @param p_first_comp_no the first component number to compare. - * @param p_second_comp_no the second component number to compare. - * - * @return OPJ_TRUE if equal. - */ -static OPJ_BOOL opj_j2k_compare_SQcd_SQcc(opj_j2k_t *p_j2k, - OPJ_UINT32 p_tile_no, OPJ_UINT32 p_first_comp_no, OPJ_UINT32 p_second_comp_no); - - -/** - * Writes a SQcd or SQcc element, i.e. the quantization values of a band in the QCD or QCC. - * - * @param p_tile_no the tile to output.
- * @param p_comp_no the component number to output. - * @param p_data the data buffer. - * @param p_header_size pointer to the size of the data buffer; it is changed by the function. - * @param p_j2k J2K codec. - * @param p_manager the user event manager. - * -*/ -static OPJ_BOOL opj_j2k_write_SQcd_SQcc(opj_j2k_t *p_j2k, - OPJ_UINT32 p_tile_no, - OPJ_UINT32 p_comp_no, - OPJ_BYTE * p_data, - OPJ_UINT32 * p_header_size, - opj_event_mgr_t * p_manager); - -/** - * Updates the Tile Length Marker. - */ -static void opj_j2k_update_tlm(opj_j2k_t * p_j2k, OPJ_UINT32 p_tile_part_size); - -/** - * Reads a SQcd or SQcc element, i.e. the quantization values of a band in the QCD or QCC. - * - * @param p_j2k J2K codec. - * @param compno the component number to output. - * @param p_header_data the data buffer. - * @param p_header_size pointer to the size of the data buffer; it is changed by the function. - * @param p_manager the user event manager. - * -*/ -static OPJ_BOOL opj_j2k_read_SQcd_SQcc(opj_j2k_t *p_j2k, - OPJ_UINT32 compno, - OPJ_BYTE * p_header_data, - OPJ_UINT32 * p_header_size, - opj_event_mgr_t * p_manager); - -/** - * Copies the tile component parameters of all the components from the first tile component. - * - * @param p_j2k the J2k codec. - */ -static void opj_j2k_copy_tile_component_parameters(opj_j2k_t *p_j2k); - -/** - * Copies the tile quantization parameters of all the components from the first tile component. - * - * @param p_j2k the J2k codec. - */ -static void opj_j2k_copy_tile_quantization_parameters(opj_j2k_t *p_j2k); - -/** - * Reads the tiles. - */ -static OPJ_BOOL opj_j2k_decode_tiles(opj_j2k_t *p_j2k, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager); - -static OPJ_BOOL opj_j2k_pre_write_tile(opj_j2k_t * p_j2k, - OPJ_UINT32 p_tile_index, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager); - -static OPJ_BOOL opj_j2k_update_image_data(opj_tcd_t * p_tcd, - opj_image_t* p_output_image); - -static void opj_get_tile_dimensions(opj_image_t * l_image, - opj_tcd_tilecomp_t * l_tilec, - opj_image_comp_t * l_img_comp, - OPJ_UINT32* l_size_comp, - OPJ_UINT32* l_width, - OPJ_UINT32* l_height, - OPJ_UINT32* l_offset_x, - OPJ_UINT32* l_offset_y, - OPJ_UINT32* l_image_width, - OPJ_UINT32* l_stride, - OPJ_UINT32* l_tile_offset); - -static void opj_j2k_get_tile_data(opj_tcd_t * p_tcd, OPJ_BYTE * p_data); - -static OPJ_BOOL opj_j2k_post_write_tile(opj_j2k_t * p_j2k, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager); - -/** - * Sets up the procedures to run when writing the header. - * Developers wanting to extend the library can add their own writing procedures. - */ -static OPJ_BOOL opj_j2k_setup_header_writing(opj_j2k_t *p_j2k, - opj_event_mgr_t * p_manager); - -static OPJ_BOOL opj_j2k_write_first_tile_part(opj_j2k_t *p_j2k, - OPJ_BYTE * p_data, - OPJ_UINT32 * p_data_written, - OPJ_UINT32 p_total_data_size, - opj_stream_private_t *p_stream, - struct opj_event_mgr * p_manager); - -static OPJ_BOOL opj_j2k_write_all_tile_parts(opj_j2k_t *p_j2k, - OPJ_BYTE * p_data, - OPJ_UINT32 * p_data_written, - OPJ_UINT32 p_total_data_size, - opj_stream_private_t *p_stream, - struct opj_event_mgr * p_manager); - -/** - * Gets the offset of the header. - * - * @param p_stream the stream to write data to. - * @param p_j2k J2K codec. - * @param p_manager the user event manager.
-*/ -static OPJ_BOOL opj_j2k_get_end_header(opj_j2k_t *p_j2k, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager); - -static OPJ_BOOL opj_j2k_allocate_tile_element_cstr_index(opj_j2k_t *p_j2k); - -/* - * ----------------------------------------------------------------------- - * ----------------------------------------------------------------------- - * ----------------------------------------------------------------------- - */ - -/** - * Writes the SOC marker (Start Of Codestream) - * - * @param p_stream the stream to write data to. - * @param p_j2k J2K codec. - * @param p_manager the user event manager. -*/ -static OPJ_BOOL opj_j2k_write_soc(opj_j2k_t *p_j2k, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager); - -/** - * Reads a SOC marker (Start of Codestream) - * @param p_j2k the jpeg2000 file codec. - * @param p_stream XXX needs data - * @param p_manager the user event manager. -*/ -static OPJ_BOOL opj_j2k_read_soc(opj_j2k_t *p_j2k, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager); - -/** - * Writes the SIZ marker (image and tile size) - * - * @param p_j2k J2K codec. - * @param p_stream the stream to write data to. - * @param p_manager the user event manager. -*/ -static OPJ_BOOL opj_j2k_write_siz(opj_j2k_t *p_j2k, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager); - -/** - * Reads a SIZ marker (image and tile size) - * @param p_j2k the jpeg2000 file codec. - * @param p_header_data the data contained in the SIZ box. - * @param p_header_size the size of the data contained in the SIZ marker. - * @param p_manager the user event manager. -*/ -static OPJ_BOOL opj_j2k_read_siz(opj_j2k_t *p_j2k, - OPJ_BYTE * p_header_data, - OPJ_UINT32 p_header_size, - opj_event_mgr_t * p_manager); - -/** - * Writes the COM marker (comment) - * - * @param p_stream the stream to write data to. - * @param p_j2k J2K codec. - * @param p_manager the user event manager. -*/ -static OPJ_BOOL opj_j2k_write_com(opj_j2k_t *p_j2k, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager); - -/** - * Reads a COM marker (comments) - * @param p_j2k the jpeg2000 file codec. - * @param p_header_data the data contained in the COM box. - * @param p_header_size the size of the data contained in the COM marker. - * @param p_manager the user event manager. -*/ -static OPJ_BOOL opj_j2k_read_com(opj_j2k_t *p_j2k, - OPJ_BYTE * p_header_data, - OPJ_UINT32 p_header_size, - opj_event_mgr_t * p_manager); -/** - * Writes the COD marker (Coding style default) - * - * @param p_stream the stream to write data to. - * @param p_j2k J2K codec. - * @param p_manager the user event manager. -*/ -static OPJ_BOOL opj_j2k_write_cod(opj_j2k_t *p_j2k, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager); - -/** - * Reads a COD marker (Coding style defaults) - * @param p_header_data the data contained in the COD box. - * @param p_j2k the jpeg2000 codec. - * @param p_header_size the size of the data contained in the COD marker. - * @param p_manager the user event manager. -*/ -static OPJ_BOOL opj_j2k_read_cod(opj_j2k_t *p_j2k, - OPJ_BYTE * p_header_data, - OPJ_UINT32 p_header_size, - opj_event_mgr_t * p_manager); - -/** - * Compares 2 COC markers (Coding style component) - * - * @param p_j2k J2K codec. - * @param p_first_comp_no the index of the first component to compare. - * @param p_second_comp_no the index of the second component to compare. 
- * - * @return OPJ_TRUE if equals - */ -static OPJ_BOOL opj_j2k_compare_coc(opj_j2k_t *p_j2k, - OPJ_UINT32 p_first_comp_no, OPJ_UINT32 p_second_comp_no); - -/** - * Writes the COC marker (Coding style component) - * - * @param p_j2k J2K codec. - * @param p_comp_no the index of the component to output. - * @param p_stream the stream to write data to. - * @param p_manager the user event manager. -*/ -static OPJ_BOOL opj_j2k_write_coc(opj_j2k_t *p_j2k, - OPJ_UINT32 p_comp_no, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager); - -/** - * Writes the COC marker (Coding style component) - * - * @param p_j2k J2K codec. - * @param p_comp_no the index of the component to output. - * @param p_data FIXME DOC - * @param p_data_written FIXME DOC - * @param p_manager the user event manager. -*/ -static void opj_j2k_write_coc_in_memory(opj_j2k_t *p_j2k, - OPJ_UINT32 p_comp_no, - OPJ_BYTE * p_data, - OPJ_UINT32 * p_data_written, - opj_event_mgr_t * p_manager); - -/** - * Gets the maximum size taken by a coc. - * - * @param p_j2k the jpeg2000 codec to use. - */ -static OPJ_UINT32 opj_j2k_get_max_coc_size(opj_j2k_t *p_j2k); - -/** - * Reads a COC marker (Coding Style Component) - * @param p_header_data the data contained in the COC box. - * @param p_j2k the jpeg2000 codec. - * @param p_header_size the size of the data contained in the COC marker. - * @param p_manager the user event manager. -*/ -static OPJ_BOOL opj_j2k_read_coc(opj_j2k_t *p_j2k, - OPJ_BYTE * p_header_data, - OPJ_UINT32 p_header_size, - opj_event_mgr_t * p_manager); - -/** - * Writes the QCD marker (quantization default) - * - * @param p_j2k J2K codec. - * @param p_stream the stream to write data to. - * @param p_manager the user event manager. -*/ -static OPJ_BOOL opj_j2k_write_qcd(opj_j2k_t *p_j2k, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager); - -/** - * Reads a QCD marker (Quantization defaults) - * @param p_header_data the data contained in the QCD box. - * @param p_j2k the jpeg2000 codec. - * @param p_header_size the size of the data contained in the QCD marker. - * @param p_manager the user event manager. -*/ -static OPJ_BOOL opj_j2k_read_qcd(opj_j2k_t *p_j2k, - OPJ_BYTE * p_header_data, - OPJ_UINT32 p_header_size, - opj_event_mgr_t * p_manager); - -/** - * Compare QCC markers (quantization component) - * - * @param p_j2k J2K codec. - * @param p_first_comp_no the index of the first component to compare. - * @param p_second_comp_no the index of the second component to compare. - * - * @return OPJ_TRUE if equals. - */ -static OPJ_BOOL opj_j2k_compare_qcc(opj_j2k_t *p_j2k, - OPJ_UINT32 p_first_comp_no, OPJ_UINT32 p_second_comp_no); - -/** - * Writes the QCC marker (quantization component) - * - * @param p_comp_no the index of the component to output. - * @param p_stream the stream to write data to. - * @param p_j2k J2K codec. - * @param p_manager the user event manager. -*/ -static OPJ_BOOL opj_j2k_write_qcc(opj_j2k_t *p_j2k, - OPJ_UINT32 p_comp_no, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager); - -/** - * Writes the QCC marker (quantization component) - * - * @param p_j2k J2K codec. - * @param p_comp_no the index of the component to output. - * @param p_data FIXME DOC - * @param p_data_written the stream to write data to. - * @param p_manager the user event manager. -*/ -static void opj_j2k_write_qcc_in_memory(opj_j2k_t *p_j2k, - OPJ_UINT32 p_comp_no, - OPJ_BYTE * p_data, - OPJ_UINT32 * p_data_written, - opj_event_mgr_t * p_manager); - -/** - * Gets the maximum size taken by a qcc. 
- */ -static OPJ_UINT32 opj_j2k_get_max_qcc_size(opj_j2k_t *p_j2k); - -/** - * Reads a QCC marker (Quantization component) - * @param p_header_data the data contained in the QCC box. - * @param p_j2k the jpeg2000 codec. - * @param p_header_size the size of the data contained in the QCC marker. - * @param p_manager the user event manager. -*/ -static OPJ_BOOL opj_j2k_read_qcc(opj_j2k_t *p_j2k, - OPJ_BYTE * p_header_data, - OPJ_UINT32 p_header_size, - opj_event_mgr_t * p_manager); -/** - * Writes the POC marker (Progression Order Change) - * - * @param p_stream the stream to write data to. - * @param p_j2k J2K codec. - * @param p_manager the user event manager. -*/ -static OPJ_BOOL opj_j2k_write_poc(opj_j2k_t *p_j2k, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager); -/** - * Writes the POC marker (Progression Order Change) - * - * @param p_j2k J2K codec. - * @param p_data FIXME DOC - * @param p_data_written the stream to write data to. - * @param p_manager the user event manager. - */ -static void opj_j2k_write_poc_in_memory(opj_j2k_t *p_j2k, - OPJ_BYTE * p_data, - OPJ_UINT32 * p_data_written, - opj_event_mgr_t * p_manager); -/** - * Gets the maximum size taken by the writing of a POC. - */ -static OPJ_UINT32 opj_j2k_get_max_poc_size(opj_j2k_t *p_j2k); - -/** - * Reads a POC marker (Progression Order Change) - * - * @param p_header_data the data contained in the POC box. - * @param p_j2k the jpeg2000 codec. - * @param p_header_size the size of the data contained in the POC marker. - * @param p_manager the user event manager. -*/ -static OPJ_BOOL opj_j2k_read_poc(opj_j2k_t *p_j2k, - OPJ_BYTE * p_header_data, - OPJ_UINT32 p_header_size, - opj_event_mgr_t * p_manager); - -/** - * Gets the maximum size taken by the toc headers of all the tile parts of any given tile. - */ -static OPJ_UINT32 opj_j2k_get_max_toc_size(opj_j2k_t *p_j2k); - -/** - * Gets the maximum size taken by the headers of the SOT. - * - * @param p_j2k the jpeg2000 codec to use. - */ -static OPJ_UINT32 opj_j2k_get_specific_header_sizes(opj_j2k_t *p_j2k); - -/** - * Reads a CRG marker (Component registration) - * - * @param p_header_data the data contained in the TLM box. - * @param p_j2k the jpeg2000 codec. - * @param p_header_size the size of the data contained in the TLM marker. - * @param p_manager the user event manager. -*/ -static OPJ_BOOL opj_j2k_read_crg(opj_j2k_t *p_j2k, - OPJ_BYTE * p_header_data, - OPJ_UINT32 p_header_size, - opj_event_mgr_t * p_manager); -/** - * Reads a TLM marker (Tile Length Marker) - * - * @param p_header_data the data contained in the TLM box. - * @param p_j2k the jpeg2000 codec. - * @param p_header_size the size of the data contained in the TLM marker. - * @param p_manager the user event manager. -*/ -static OPJ_BOOL opj_j2k_read_tlm(opj_j2k_t *p_j2k, - OPJ_BYTE * p_header_data, - OPJ_UINT32 p_header_size, - opj_event_mgr_t * p_manager); - -/** - * Writes the updated tlm. - * - * @param p_stream the stream to write data to. - * @param p_j2k J2K codec. - * @param p_manager the user event manager. -*/ -static OPJ_BOOL opj_j2k_write_updated_tlm(opj_j2k_t *p_j2k, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager); - -/** - * Reads a PLM marker (Packet length, main header marker) - * - * @param p_header_data the data contained in the TLM box. - * @param p_j2k the jpeg2000 codec. - * @param p_header_size the size of the data contained in the TLM marker. - * @param p_manager the user event manager. 
-*/ -static OPJ_BOOL opj_j2k_read_plm(opj_j2k_t *p_j2k, - OPJ_BYTE * p_header_data, - OPJ_UINT32 p_header_size, - opj_event_mgr_t * p_manager); -/** - * Reads a PLT marker (Packet length, tile-part header) - * - * @param p_header_data the data contained in the PLT box. - * @param p_j2k the jpeg2000 codec. - * @param p_header_size the size of the data contained in the PLT marker. - * @param p_manager the user event manager. -*/ -static OPJ_BOOL opj_j2k_read_plt(opj_j2k_t *p_j2k, - OPJ_BYTE * p_header_data, - OPJ_UINT32 p_header_size, - opj_event_mgr_t * p_manager); - -/** - * Reads a PPM marker (Packed headers, main header) - * - * @param p_header_data the data contained in the POC box. - * @param p_j2k the jpeg2000 codec. - * @param p_header_size the size of the data contained in the POC marker. - * @param p_manager the user event manager. - */ - -static OPJ_BOOL opj_j2k_read_ppm( - opj_j2k_t *p_j2k, - OPJ_BYTE * p_header_data, - OPJ_UINT32 p_header_size, - opj_event_mgr_t * p_manager); - -/** - * Merges all PPM markers read (Packed headers, main header) - * - * @param p_cp main coding parameters. - * @param p_manager the user event manager. - */ -static OPJ_BOOL opj_j2k_merge_ppm(opj_cp_t *p_cp, opj_event_mgr_t * p_manager); - -/** - * Reads a PPT marker (Packed packet headers, tile-part header) - * - * @param p_header_data the data contained in the PPT box. - * @param p_j2k the jpeg2000 codec. - * @param p_header_size the size of the data contained in the PPT marker. - * @param p_manager the user event manager. -*/ -static OPJ_BOOL opj_j2k_read_ppt(opj_j2k_t *p_j2k, - OPJ_BYTE * p_header_data, - OPJ_UINT32 p_header_size, - opj_event_mgr_t * p_manager); - -/** - * Merges all PPT markers read (Packed headers, tile-part header) - * - * @param p_tcp the tile. - * @param p_manager the user event manager. - */ -static OPJ_BOOL opj_j2k_merge_ppt(opj_tcp_t *p_tcp, - opj_event_mgr_t * p_manager); - - -/** - * Writes the TLM marker (Tile Length Marker) - * - * @param p_stream the stream to write data to. - * @param p_j2k J2K codec. - * @param p_manager the user event manager. -*/ -static OPJ_BOOL opj_j2k_write_tlm(opj_j2k_t *p_j2k, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager); - -/** - * Writes the SOT marker (Start of tile-part) - * - * @param p_j2k J2K codec. - * @param p_data Output buffer - * @param p_total_data_size Output buffer size - * @param p_data_written Number of bytes written into stream - * @param p_stream the stream to write data to. - * @param p_manager the user event manager. -*/ -static OPJ_BOOL opj_j2k_write_sot(opj_j2k_t *p_j2k, - OPJ_BYTE * p_data, - OPJ_UINT32 p_total_data_size, - OPJ_UINT32 * p_data_written, - const opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager); - -/** - * Reads values from a SOT marker (Start of tile-part) - * - * the j2k decoder state is not affected. No side effects, no checks except for p_header_size. - * - * @param p_header_data the data contained in the SOT marker. - * @param p_header_size the size of the data contained in the SOT marker. - * @param p_tile_no Isot. - * @param p_tot_len Psot. - * @param p_current_part TPsot. - * @param p_num_parts TNsot. - * @param p_manager the user event manager. 
- */ -static OPJ_BOOL opj_j2k_get_sot_values(OPJ_BYTE * p_header_data, - OPJ_UINT32 p_header_size, - OPJ_UINT32* p_tile_no, - OPJ_UINT32* p_tot_len, - OPJ_UINT32* p_current_part, - OPJ_UINT32* p_num_parts, - opj_event_mgr_t * p_manager); -/** - * Reads a SOT marker (Start of tile-part) - * - * @param p_header_data the data contained in the SOT marker. - * @param p_j2k the jpeg2000 codec. - * @param p_header_size the size of the data contained in the PPT marker. - * @param p_manager the user event manager. -*/ -static OPJ_BOOL opj_j2k_read_sot(opj_j2k_t *p_j2k, - OPJ_BYTE * p_header_data, - OPJ_UINT32 p_header_size, - opj_event_mgr_t * p_manager); -/** - * Writes the SOD marker (Start of data) - * - * @param p_j2k J2K codec. - * @param p_tile_coder FIXME DOC - * @param p_data FIXME DOC - * @param p_data_written FIXME DOC - * @param p_total_data_size FIXME DOC - * @param p_stream the stream to write data to. - * @param p_manager the user event manager. -*/ -static OPJ_BOOL opj_j2k_write_sod(opj_j2k_t *p_j2k, - opj_tcd_t * p_tile_coder, - OPJ_BYTE * p_data, - OPJ_UINT32 * p_data_written, - OPJ_UINT32 p_total_data_size, - const opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager); - -/** - * Reads a SOD marker (Start Of Data) - * - * @param p_j2k the jpeg2000 codec. - * @param p_stream FIXME DOC - * @param p_manager the user event manager. -*/ -static OPJ_BOOL opj_j2k_read_sod(opj_j2k_t *p_j2k, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager); - -static void opj_j2k_update_tlm(opj_j2k_t * p_j2k, OPJ_UINT32 p_tile_part_size) -{ - opj_write_bytes(p_j2k->m_specific_param.m_encoder.m_tlm_sot_offsets_current, - p_j2k->m_current_tile_number, 1); /* PSOT */ - ++p_j2k->m_specific_param.m_encoder.m_tlm_sot_offsets_current; - - opj_write_bytes(p_j2k->m_specific_param.m_encoder.m_tlm_sot_offsets_current, - p_tile_part_size, 4); /* PSOT */ - p_j2k->m_specific_param.m_encoder.m_tlm_sot_offsets_current += 4; -} - -/** - * Writes the RGN marker (Region Of Interest) - * - * @param p_tile_no the tile to output - * @param p_comp_no the component to output - * @param nb_comps the number of components - * @param p_stream the stream to write data to. - * @param p_j2k J2K codec. - * @param p_manager the user event manager. -*/ -static OPJ_BOOL opj_j2k_write_rgn(opj_j2k_t *p_j2k, - OPJ_UINT32 p_tile_no, - OPJ_UINT32 p_comp_no, - OPJ_UINT32 nb_comps, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager); - -/** - * Reads a RGN marker (Region Of Interest) - * - * @param p_header_data the data contained in the POC box. - * @param p_j2k the jpeg2000 codec. - * @param p_header_size the size of the data contained in the POC marker. - * @param p_manager the user event manager. -*/ -static OPJ_BOOL opj_j2k_read_rgn(opj_j2k_t *p_j2k, - OPJ_BYTE * p_header_data, - OPJ_UINT32 p_header_size, - opj_event_mgr_t * p_manager); - -/** - * Writes the EOC marker (End of Codestream) - * - * @param p_stream the stream to write data to. - * @param p_j2k J2K codec. - * @param p_manager the user event manager. -*/ -static OPJ_BOOL opj_j2k_write_eoc(opj_j2k_t *p_j2k, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager); - -#if 0 -/** - * Reads a EOC marker (End Of Codestream) - * - * @param p_j2k the jpeg2000 codec. - * @param p_stream FIXME DOC - * @param p_manager the user event manager. 
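opj_j2k_update_tlm() above appends one record to the in-memory TLM segment per tile-part: a 1-byte tile index followed by a 4-byte tile-part length (opj_write_bytes emits big-endian; note the first "/* PSOT */" comment in the original is a copy-paste slip, as that byte is the tile index). A standalone restatement of that layout, hedged: the Ttlm/Ptlm field names follow the JPEG 2000 spec, and the helper below is illustrative, not library code.

```c
#include <stdint.h>

/* Append one TLM entry: Ttlm (1-byte tile index) then Ptlm (4-byte
 * big-endian tile-part length), mirroring the two opj_write_bytes
 * calls above. Returns the advanced cursor. */
static uint8_t *tlm_append(uint8_t *cur, uint8_t tile_no, uint32_t part_size)
{
    *cur++ = tile_no;                      /* Ttlm */
    *cur++ = (uint8_t)(part_size >> 24);   /* Ptlm, most significant byte first */
    *cur++ = (uint8_t)(part_size >> 16);
    *cur++ = (uint8_t)(part_size >> 8);
    *cur++ = (uint8_t)part_size;
    return cur;
}
```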
-*/ -static OPJ_BOOL opj_j2k_read_eoc(opj_j2k_t *p_j2k, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager); -#endif - -/** - * Writes the CBD-MCT-MCC-MCO markers (Multi components transform) - * - * @param p_stream the stream to write data to. - * @param p_j2k J2K codec. - * @param p_manager the user event manager. -*/ -static OPJ_BOOL opj_j2k_write_mct_data_group(opj_j2k_t *p_j2k, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager); - -/** - * Inits the Info - * - * @param p_stream the stream to write data to. - * @param p_j2k J2K codec. - * @param p_manager the user event manager. -*/ -static OPJ_BOOL opj_j2k_init_info(opj_j2k_t *p_j2k, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager); - -/** -Add main header marker information -@param cstr_index Codestream information structure -@param type marker type -@param pos byte offset of marker segment -@param len length of marker segment - */ -static OPJ_BOOL opj_j2k_add_mhmarker(opj_codestream_index_t *cstr_index, - OPJ_UINT32 type, OPJ_OFF_T pos, OPJ_UINT32 len) ; -/** -Add tile header marker information -@param tileno tile index number -@param cstr_index Codestream information structure -@param type marker type -@param pos byte offset of marker segment -@param len length of marker segment - */ -static OPJ_BOOL opj_j2k_add_tlmarker(OPJ_UINT32 tileno, - opj_codestream_index_t *cstr_index, OPJ_UINT32 type, OPJ_OFF_T pos, - OPJ_UINT32 len); - -/** - * Reads an unknown marker - * - * @param p_j2k the jpeg2000 codec. - * @param p_stream the stream object to read from. - * @param output_marker FIXME DOC - * @param p_manager the user event manager. - * - * @return true if the marker could be deduced. -*/ -static OPJ_BOOL opj_j2k_read_unk(opj_j2k_t *p_j2k, - opj_stream_private_t *p_stream, - OPJ_UINT32 *output_marker, - opj_event_mgr_t * p_manager); - -/** - * Writes the MCT marker (Multiple Component Transform) - * - * @param p_j2k J2K codec. - * @param p_mct_record FIXME DOC - * @param p_stream the stream to write data to. - * @param p_manager the user event manager. -*/ -static OPJ_BOOL opj_j2k_write_mct_record(opj_j2k_t *p_j2k, - opj_mct_data_t * p_mct_record, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager); - -/** - * Reads a MCT marker (Multiple Component Transform) - * - * @param p_header_data the data contained in the MCT box. - * @param p_j2k the jpeg2000 codec. - * @param p_header_size the size of the data contained in the MCT marker. - * @param p_manager the user event manager. -*/ -static OPJ_BOOL opj_j2k_read_mct(opj_j2k_t *p_j2k, - OPJ_BYTE * p_header_data, - OPJ_UINT32 p_header_size, - opj_event_mgr_t * p_manager); - -/** - * Writes the MCC marker (Multiple Component Collection) - * - * @param p_j2k J2K codec. - * @param p_mcc_record FIXME DOC - * @param p_stream the stream to write data to. - * @param p_manager the user event manager. -*/ -static OPJ_BOOL opj_j2k_write_mcc_record(opj_j2k_t *p_j2k, - opj_simple_mcc_decorrelation_data_t * p_mcc_record, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager); - -/** - * Reads a MCC marker (Multiple Component Collection) - * - * @param p_header_data the data contained in the MCC box. - * @param p_j2k the jpeg2000 codec. - * @param p_header_size the size of the data contained in the MCC marker. - * @param p_manager the user event manager. 
-*/ -static OPJ_BOOL opj_j2k_read_mcc(opj_j2k_t *p_j2k, - OPJ_BYTE * p_header_data, - OPJ_UINT32 p_header_size, - opj_event_mgr_t * p_manager); - -/** - * Writes the MCO marker (Multiple component transformation ordering) - * - * @param p_stream the stream to write data to. - * @param p_j2k J2K codec. - * @param p_manager the user event manager. -*/ -static OPJ_BOOL opj_j2k_write_mco(opj_j2k_t *p_j2k, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager); - -/** - * Reads a MCO marker (Multiple Component Transform Ordering) - * - * @param p_header_data the data contained in the MCO box. - * @param p_j2k the jpeg2000 codec. - * @param p_header_size the size of the data contained in the MCO marker. - * @param p_manager the user event manager. -*/ -static OPJ_BOOL opj_j2k_read_mco(opj_j2k_t *p_j2k, - OPJ_BYTE * p_header_data, - OPJ_UINT32 p_header_size, - opj_event_mgr_t * p_manager); - -static OPJ_BOOL opj_j2k_add_mct(opj_tcp_t * p_tcp, opj_image_t * p_image, - OPJ_UINT32 p_index); - -static void opj_j2k_read_int16_to_float(const void * p_src_data, - void * p_dest_data, OPJ_UINT32 p_nb_elem); -static void opj_j2k_read_int32_to_float(const void * p_src_data, - void * p_dest_data, OPJ_UINT32 p_nb_elem); -static void opj_j2k_read_float32_to_float(const void * p_src_data, - void * p_dest_data, OPJ_UINT32 p_nb_elem); -static void opj_j2k_read_float64_to_float(const void * p_src_data, - void * p_dest_data, OPJ_UINT32 p_nb_elem); - -static void opj_j2k_read_int16_to_int32(const void * p_src_data, - void * p_dest_data, OPJ_UINT32 p_nb_elem); -static void opj_j2k_read_int32_to_int32(const void * p_src_data, - void * p_dest_data, OPJ_UINT32 p_nb_elem); -static void opj_j2k_read_float32_to_int32(const void * p_src_data, - void * p_dest_data, OPJ_UINT32 p_nb_elem); -static void opj_j2k_read_float64_to_int32(const void * p_src_data, - void * p_dest_data, OPJ_UINT32 p_nb_elem); - -static void opj_j2k_write_float_to_int16(const void * p_src_data, - void * p_dest_data, OPJ_UINT32 p_nb_elem); -static void opj_j2k_write_float_to_int32(const void * p_src_data, - void * p_dest_data, OPJ_UINT32 p_nb_elem); -static void opj_j2k_write_float_to_float(const void * p_src_data, - void * p_dest_data, OPJ_UINT32 p_nb_elem); -static void opj_j2k_write_float_to_float64(const void * p_src_data, - void * p_dest_data, OPJ_UINT32 p_nb_elem); - -/** - * Ends the encoding, i.e. frees memory. - * - * @param p_stream the stream to write data to. - * @param p_j2k J2K codec. - * @param p_manager the user event manager. -*/ -static OPJ_BOOL opj_j2k_end_encoding(opj_j2k_t *p_j2k, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager); - -/** - * Writes the CBD marker (Component bit depth definition) - * - * @param p_stream the stream to write data to. - * @param p_j2k J2K codec. - * @param p_manager the user event manager. -*/ -static OPJ_BOOL opj_j2k_write_cbd(opj_j2k_t *p_j2k, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager); - -/** - * Reads a CBD marker (Component bit depth definition) - * @param p_header_data the data contained in the CBD box. - * @param p_j2k the jpeg2000 codec. - * @param p_header_size the size of the data contained in the CBD marker. - * @param p_manager the user event manager. -*/ -static OPJ_BOOL opj_j2k_read_cbd(opj_j2k_t *p_j2k, - OPJ_BYTE * p_header_data, - OPJ_UINT32 p_header_size, - opj_event_mgr_t * p_manager); - - -/** - * Writes COC marker for each component. - * - * @param p_stream the stream to write data to. - * @param p_j2k J2K codec. 
- * @param p_manager the user event manager. -*/ -static OPJ_BOOL opj_j2k_write_all_coc(opj_j2k_t *p_j2k, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager); - -/** - * Writes QCC marker for each component. - * - * @param p_stream the stream to write data to. - * @param p_j2k J2K codec. - * @param p_manager the user event manager. -*/ -static OPJ_BOOL opj_j2k_write_all_qcc(opj_j2k_t *p_j2k, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager); - -/** - * Writes regions of interest. - * - * @param p_stream the stream to write data to. - * @param p_j2k J2K codec. - * @param p_manager the user event manager. -*/ -static OPJ_BOOL opj_j2k_write_regions(opj_j2k_t *p_j2k, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager); - -/** - * Writes EPC ???? - * - * @param p_stream the stream to write data to. - * @param p_j2k J2K codec. - * @param p_manager the user event manager. -*/ -static OPJ_BOOL opj_j2k_write_epc(opj_j2k_t *p_j2k, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager); - -/** - * Checks the progression order change values and tells whether the pocs given as input are valid. - * A clear message is output on error. - * - * @param p_pocs the progression order changes. - * @param p_nb_pocs the number of progression order changes. - * @param p_nb_resolutions the number of resolutions. - * @param numcomps the number of components - * @param numlayers the number of layers. - * @param p_manager the user event manager. - * - * @return true if the pocs are valid. - */ -static OPJ_BOOL opj_j2k_check_poc_val(const opj_poc_t *p_pocs, - OPJ_UINT32 p_nb_pocs, - OPJ_UINT32 p_nb_resolutions, - OPJ_UINT32 numcomps, - OPJ_UINT32 numlayers, - opj_event_mgr_t * p_manager); - -/** - * Gets the number of tile parts used for the given change of progression (if any) and the given tile. - * - * @param cp the coding parameters. - * @param pino the offset of the given poc (i.e. its position in the coding parameter). - * @param tileno the given tile. - * - * @return the number of tile parts. - */ -static OPJ_UINT32 opj_j2k_get_num_tp(opj_cp_t *cp, OPJ_UINT32 pino, - OPJ_UINT32 tileno); - -/** - * Calculates the total number of tile parts needed by the encoder to - * encode such an image. If not enough memory is available, then the function returns false. - * - * @param p_nb_tiles pointer that will hold the number of tile parts. - * @param cp the coding parameters for the image. - * @param image the image to encode. - * @param p_j2k the p_j2k encoder. - * @param p_manager the user event manager. - * - * @return true if the function was successful, false otherwise. - */ -static OPJ_BOOL opj_j2k_calculate_tp(opj_j2k_t *p_j2k, - opj_cp_t *cp, - OPJ_UINT32 * p_nb_tiles, - opj_image_t *image, - opj_event_mgr_t * p_manager); - -static void opj_j2k_dump_MH_info(opj_j2k_t* p_j2k, FILE* out_stream); - -static void opj_j2k_dump_MH_index(opj_j2k_t* p_j2k, FILE* out_stream); - -static opj_codestream_index_t* opj_j2k_create_cstr_index(void); - -static OPJ_FLOAT32 opj_j2k_get_tp_stride(opj_tcp_t * p_tcp); - -static OPJ_FLOAT32 opj_j2k_get_default_stride(opj_tcp_t * p_tcp); - -static int opj_j2k_initialise_4K_poc(opj_poc_t *POC, int numres); - -static void opj_j2k_set_cinema_parameters(opj_cparameters_t *parameters, - opj_image_t *image, opj_event_mgr_t *p_manager); - -static OPJ_BOOL opj_j2k_is_cinema_compliant(opj_image_t *image, OPJ_UINT16 rsiz, - opj_event_mgr_t *p_manager); - -/** - * Checks for invalid number of tile-parts in SOT marker (TPsot==TNsot). See issue 254.
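The helper declared next guards against the codestreams described in issue 254, where the SOT marker's tile-part counters are inconsistent. The core invariant is simple; the sketch below is a hedged simplification, since the real routine also scans ahead in the stream before deciding whether a TNsot correction is needed.

```c
#include <stdint.h>

/* TPsot indexes tile-parts from 0, so a conformant SOT marker always
 * has TPsot < TNsot whenever TNsot is non-zero. Simplified check: */
static int sot_tile_part_invalid(uint32_t tpsot, uint32_t tnsot)
{
    return tnsot != 0 && tpsot >= tnsot;
}
```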
- * - * @param p_stream the stream to read data from. - * @param tile_no tile number we're looking for. - * @param p_correction_needed output value; if true, the non-conformant codestream needs TNsot correction. - * @param p_manager the user event manager. - * - * @return true if the function was successful, false otherwise. - */ -static OPJ_BOOL opj_j2k_need_nb_tile_parts_correction(opj_stream_private_t - *p_stream, OPJ_UINT32 tile_no, OPJ_BOOL* p_correction_needed, - opj_event_mgr_t * p_manager); - -/*@}*/ - -/*@}*/ - -/* ----------------------------------------------------------------------- */ -typedef struct j2k_prog_order { - OPJ_PROG_ORDER enum_prog; - char str_prog[5]; -} j2k_prog_order_t; - -static const j2k_prog_order_t j2k_prog_order_list[] = { - {OPJ_CPRL, "CPRL"}, - {OPJ_LRCP, "LRCP"}, - {OPJ_PCRL, "PCRL"}, - {OPJ_RLCP, "RLCP"}, - {OPJ_RPCL, "RPCL"}, - {(OPJ_PROG_ORDER) - 1, ""} -}; - -/** - * Size in bytes of one MCT matrix element, indexed by element type (int16, int32, float32, float64). - */ -static const OPJ_UINT32 MCT_ELEMENT_SIZE [] = { - 2, - 4, - 4, - 8 -}; - -typedef void (* opj_j2k_mct_function)(const void * p_src_data, - void * p_dest_data, OPJ_UINT32 p_nb_elem); - -static const opj_j2k_mct_function j2k_mct_read_functions_to_float [] = { - opj_j2k_read_int16_to_float, - opj_j2k_read_int32_to_float, - opj_j2k_read_float32_to_float, - opj_j2k_read_float64_to_float -}; - -static const opj_j2k_mct_function j2k_mct_read_functions_to_int32 [] = { - opj_j2k_read_int16_to_int32, - opj_j2k_read_int32_to_int32, - opj_j2k_read_float32_to_int32, - opj_j2k_read_float64_to_int32 -}; - -static const opj_j2k_mct_function j2k_mct_write_functions_from_float [] = { - opj_j2k_write_float_to_int16, - opj_j2k_write_float_to_int32, - opj_j2k_write_float_to_float, - opj_j2k_write_float_to_float64 -}; - -typedef struct opj_dec_memory_marker_handler { - /** marker value */ - OPJ_UINT32 id; - /** value of the state when the marker can appear */ - OPJ_UINT32 states; - /** action linked to the marker */ - OPJ_BOOL(*handler)(opj_j2k_t *p_j2k, - OPJ_BYTE * p_header_data, - OPJ_UINT32 p_header_size, - opj_event_mgr_t * p_manager); -} -opj_dec_memory_marker_handler_t; - -static const opj_dec_memory_marker_handler_t j2k_memory_marker_handler_tab [] = -{ - {J2K_MS_SOT, J2K_STATE_MH | J2K_STATE_TPHSOT, opj_j2k_read_sot}, - {J2K_MS_COD, J2K_STATE_MH | J2K_STATE_TPH, opj_j2k_read_cod}, - {J2K_MS_COC, J2K_STATE_MH | J2K_STATE_TPH, opj_j2k_read_coc}, - {J2K_MS_RGN, J2K_STATE_MH | J2K_STATE_TPH, opj_j2k_read_rgn}, - {J2K_MS_QCD, J2K_STATE_MH | J2K_STATE_TPH, opj_j2k_read_qcd}, - {J2K_MS_QCC, J2K_STATE_MH | J2K_STATE_TPH, opj_j2k_read_qcc}, - {J2K_MS_POC, J2K_STATE_MH | J2K_STATE_TPH, opj_j2k_read_poc}, - {J2K_MS_SIZ, J2K_STATE_MHSIZ, opj_j2k_read_siz}, - {J2K_MS_TLM, J2K_STATE_MH, opj_j2k_read_tlm}, - {J2K_MS_PLM, J2K_STATE_MH, opj_j2k_read_plm}, - {J2K_MS_PLT, J2K_STATE_TPH, opj_j2k_read_plt}, - {J2K_MS_PPM, J2K_STATE_MH, opj_j2k_read_ppm}, - {J2K_MS_PPT, J2K_STATE_TPH, opj_j2k_read_ppt}, - {J2K_MS_SOP, 0, 0}, - {J2K_MS_CRG, J2K_STATE_MH, opj_j2k_read_crg}, - {J2K_MS_COM, J2K_STATE_MH | J2K_STATE_TPH, opj_j2k_read_com}, - {J2K_MS_MCT, J2K_STATE_MH | J2K_STATE_TPH, opj_j2k_read_mct}, - {J2K_MS_CBD, J2K_STATE_MH, opj_j2k_read_cbd}, - {J2K_MS_MCC, J2K_STATE_MH | J2K_STATE_TPH, opj_j2k_read_mcc}, - {J2K_MS_MCO, J2K_STATE_MH | J2K_STATE_TPH, opj_j2k_read_mco}, -#ifdef USE_JPWL -#ifdef TODO_MS /* remove these functions which are not compatible with the v2 API */ - {J2K_MS_EPC, J2K_STATE_MH | J2K_STATE_TPH, j2k_read_epc}, - {J2K_MS_EPB, J2K_STATE_MH | J2K_STATE_TPH, j2k_read_epb}, - {J2K_MS_ESD,
J2K_STATE_MH | J2K_STATE_TPH, j2k_read_esd}, - {J2K_MS_RED, J2K_STATE_MH | J2K_STATE_TPH, j2k_read_red}, -#endif -#endif /* USE_JPWL */ -#ifdef USE_JPSEC - {J2K_MS_SEC, J2K_DEC_STATE_MH, j2k_read_sec}, - {J2K_MS_INSEC, 0, j2k_read_insec} -#endif /* USE_JPSEC */ - {J2K_MS_UNK, J2K_STATE_MH | J2K_STATE_TPH, 0}/*opj_j2k_read_unk is directly used*/ -}; - -static void opj_j2k_read_int16_to_float(const void * p_src_data, - void * p_dest_data, OPJ_UINT32 p_nb_elem) -{ - OPJ_BYTE * l_src_data = (OPJ_BYTE *) p_src_data; - OPJ_FLOAT32 * l_dest_data = (OPJ_FLOAT32 *) p_dest_data; - OPJ_UINT32 i; - OPJ_UINT32 l_temp; - - for (i = 0; i < p_nb_elem; ++i) { - opj_read_bytes(l_src_data, &l_temp, 2); - - l_src_data += sizeof(OPJ_INT16); - - *(l_dest_data++) = (OPJ_FLOAT32) l_temp; - } -} - -static void opj_j2k_read_int32_to_float(const void * p_src_data, - void * p_dest_data, OPJ_UINT32 p_nb_elem) -{ - OPJ_BYTE * l_src_data = (OPJ_BYTE *) p_src_data; - OPJ_FLOAT32 * l_dest_data = (OPJ_FLOAT32 *) p_dest_data; - OPJ_UINT32 i; - OPJ_UINT32 l_temp; - - for (i = 0; i < p_nb_elem; ++i) { - opj_read_bytes(l_src_data, &l_temp, 4); - - l_src_data += sizeof(OPJ_INT32); - - *(l_dest_data++) = (OPJ_FLOAT32) l_temp; - } -} - -static void opj_j2k_read_float32_to_float(const void * p_src_data, - void * p_dest_data, OPJ_UINT32 p_nb_elem) -{ - OPJ_BYTE * l_src_data = (OPJ_BYTE *) p_src_data; - OPJ_FLOAT32 * l_dest_data = (OPJ_FLOAT32 *) p_dest_data; - OPJ_UINT32 i; - OPJ_FLOAT32 l_temp; - - for (i = 0; i < p_nb_elem; ++i) { - opj_read_float(l_src_data, &l_temp); - - l_src_data += sizeof(OPJ_FLOAT32); - - *(l_dest_data++) = l_temp; - } -} - -static void opj_j2k_read_float64_to_float(const void * p_src_data, - void * p_dest_data, OPJ_UINT32 p_nb_elem) -{ - OPJ_BYTE * l_src_data = (OPJ_BYTE *) p_src_data; - OPJ_FLOAT32 * l_dest_data = (OPJ_FLOAT32 *) p_dest_data; - OPJ_UINT32 i; - OPJ_FLOAT64 l_temp; - - for (i = 0; i < p_nb_elem; ++i) { - opj_read_double(l_src_data, &l_temp); - - l_src_data += sizeof(OPJ_FLOAT64); - - *(l_dest_data++) = (OPJ_FLOAT32) l_temp; - } -} - -static void opj_j2k_read_int16_to_int32(const void * p_src_data, - void * p_dest_data, OPJ_UINT32 p_nb_elem) -{ - OPJ_BYTE * l_src_data = (OPJ_BYTE *) p_src_data; - OPJ_INT32 * l_dest_data = (OPJ_INT32 *) p_dest_data; - OPJ_UINT32 i; - OPJ_UINT32 l_temp; - - for (i = 0; i < p_nb_elem; ++i) { - opj_read_bytes(l_src_data, &l_temp, 2); - - l_src_data += sizeof(OPJ_INT16); - - *(l_dest_data++) = (OPJ_INT32) l_temp; - } -} - -static void opj_j2k_read_int32_to_int32(const void * p_src_data, - void * p_dest_data, OPJ_UINT32 p_nb_elem) -{ - OPJ_BYTE * l_src_data = (OPJ_BYTE *) p_src_data; - OPJ_INT32 * l_dest_data = (OPJ_INT32 *) p_dest_data; - OPJ_UINT32 i; - OPJ_UINT32 l_temp; - - for (i = 0; i < p_nb_elem; ++i) { - opj_read_bytes(l_src_data, &l_temp, 4); - - l_src_data += sizeof(OPJ_INT32); - - *(l_dest_data++) = (OPJ_INT32) l_temp; - } -} - -static void opj_j2k_read_float32_to_int32(const void * p_src_data, - void * p_dest_data, OPJ_UINT32 p_nb_elem) -{ - OPJ_BYTE * l_src_data = (OPJ_BYTE *) p_src_data; - OPJ_INT32 * l_dest_data = (OPJ_INT32 *) p_dest_data; - OPJ_UINT32 i; - OPJ_FLOAT32 l_temp; - - for (i = 0; i < p_nb_elem; ++i) { - opj_read_float(l_src_data, &l_temp); - - l_src_data += sizeof(OPJ_FLOAT32); - - *(l_dest_data++) = (OPJ_INT32) l_temp; - } -} - -static void opj_j2k_read_float64_to_int32(const void * p_src_data, - void * p_dest_data, OPJ_UINT32 p_nb_elem) -{ - OPJ_BYTE * l_src_data = (OPJ_BYTE *) p_src_data; - OPJ_INT32 * l_dest_data = (OPJ_INT32 *) 
p_dest_data; - OPJ_UINT32 i; - OPJ_FLOAT64 l_temp; - - for (i = 0; i < p_nb_elem; ++i) { - opj_read_double(l_src_data, &l_temp); - - l_src_data += sizeof(OPJ_FLOAT64); - - *(l_dest_data++) = (OPJ_INT32) l_temp; - } -} - -static void opj_j2k_write_float_to_int16(const void * p_src_data, - void * p_dest_data, OPJ_UINT32 p_nb_elem) -{ - OPJ_BYTE * l_dest_data = (OPJ_BYTE *) p_dest_data; - OPJ_FLOAT32 * l_src_data = (OPJ_FLOAT32 *) p_src_data; - OPJ_UINT32 i; - OPJ_UINT32 l_temp; - - for (i = 0; i < p_nb_elem; ++i) { - l_temp = (OPJ_UINT32) * (l_src_data++); - - opj_write_bytes(l_dest_data, l_temp, sizeof(OPJ_INT16)); - - l_dest_data += sizeof(OPJ_INT16); - } -} - -static void opj_j2k_write_float_to_int32(const void * p_src_data, - void * p_dest_data, OPJ_UINT32 p_nb_elem) -{ - OPJ_BYTE * l_dest_data = (OPJ_BYTE *) p_dest_data; - OPJ_FLOAT32 * l_src_data = (OPJ_FLOAT32 *) p_src_data; - OPJ_UINT32 i; - OPJ_UINT32 l_temp; - - for (i = 0; i < p_nb_elem; ++i) { - l_temp = (OPJ_UINT32) * (l_src_data++); - - opj_write_bytes(l_dest_data, l_temp, sizeof(OPJ_INT32)); - - l_dest_data += sizeof(OPJ_INT32); - } -} - -static void opj_j2k_write_float_to_float(const void * p_src_data, - void * p_dest_data, OPJ_UINT32 p_nb_elem) -{ - OPJ_BYTE * l_dest_data = (OPJ_BYTE *) p_dest_data; - OPJ_FLOAT32 * l_src_data = (OPJ_FLOAT32 *) p_src_data; - OPJ_UINT32 i; - OPJ_FLOAT32 l_temp; - - for (i = 0; i < p_nb_elem; ++i) { - l_temp = (OPJ_FLOAT32) * (l_src_data++); - - opj_write_float(l_dest_data, l_temp); - - l_dest_data += sizeof(OPJ_FLOAT32); - } -} - -static void opj_j2k_write_float_to_float64(const void * p_src_data, - void * p_dest_data, OPJ_UINT32 p_nb_elem) -{ - OPJ_BYTE * l_dest_data = (OPJ_BYTE *) p_dest_data; - OPJ_FLOAT32 * l_src_data = (OPJ_FLOAT32 *) p_src_data; - OPJ_UINT32 i; - OPJ_FLOAT64 l_temp; - - for (i = 0; i < p_nb_elem; ++i) { - l_temp = (OPJ_FLOAT64) * (l_src_data++); - - opj_write_double(l_dest_data, l_temp); - - l_dest_data += sizeof(OPJ_FLOAT64); - } -} - -const char *opj_j2k_convert_progression_order(OPJ_PROG_ORDER prg_order) -{ - const j2k_prog_order_t *po; - for (po = j2k_prog_order_list; po->enum_prog != -1; po++) { - if (po->enum_prog == prg_order) { - return po->str_prog; - } - } - return po->str_prog; -} - -static OPJ_BOOL opj_j2k_check_poc_val(const opj_poc_t *p_pocs, - OPJ_UINT32 p_nb_pocs, - OPJ_UINT32 p_nb_resolutions, - OPJ_UINT32 p_num_comps, - OPJ_UINT32 p_num_layers, - opj_event_mgr_t * p_manager) -{ - OPJ_UINT32* packet_array; - OPJ_UINT32 index, resno, compno, layno; - OPJ_UINT32 i; - OPJ_UINT32 step_c = 1; - OPJ_UINT32 step_r = p_num_comps * step_c; - OPJ_UINT32 step_l = p_nb_resolutions * step_r; - OPJ_BOOL loss = OPJ_FALSE; - OPJ_UINT32 layno0 = 0; - - packet_array = (OPJ_UINT32*) opj_calloc(step_l * p_num_layers, - sizeof(OPJ_UINT32)); - if (packet_array == 00) { - opj_event_msg(p_manager, EVT_ERROR, - "Not enough memory for checking the poc values.\n"); - return OPJ_FALSE; - } - - if (p_nb_pocs == 0) { - opj_free(packet_array); - return OPJ_TRUE; - } - - index = step_r * p_pocs->resno0; - /* take each resolution for each poc */ - for (resno = p_pocs->resno0 ; resno < p_pocs->resno1 ; ++resno) { - OPJ_UINT32 res_index = index + p_pocs->compno0 * step_c; - - /* take each comp of each resolution for each poc */ - for (compno = p_pocs->compno0 ; compno < p_pocs->compno1 ; ++compno) { - OPJ_UINT32 comp_index = res_index + layno0 * step_l; - - /* and finally take each layer of each res of ... 
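[Editor's note] Every MCT conversion helper deleted above funnels its byte shuffling through opj_read_bytes / opj_write_bytes / opj_read_float / opj_write_float, which move the most significant byte first (JPEG 2000 codestreams are big-endian throughout). A minimal stand-alone equivalent of that primitive, with hypothetical names be_write/be_read:

#include <stdio.h>

/* Write the 'nb' low-order bytes of 'value', most significant first. */
static void be_write(unsigned char *p, unsigned long value, unsigned nb)
{
    unsigned i;
    for (i = 0; i < nb; ++i) {
        p[i] = (unsigned char)(value >> (8 * (nb - 1 - i)));
    }
}

/* Read 'nb' bytes, most significant first, into an unsigned value. */
static unsigned long be_read(const unsigned char *p, unsigned nb)
{
    unsigned long v = 0;
    unsigned i;
    for (i = 0; i < nb; ++i) {
        v = (v << 8) | p[i];
    }
    return v;
}

int main(void)
{
    unsigned char buf[2];

    be_write(buf, 0xFF51, 2);                 /* the SIZ marker id      */
    printf("%02X %02X -> 0x%04lX\n",
           buf[0], buf[1], be_read(buf, 2));  /* FF 51 -> 0xFF51        */
    return 0;
}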
*/ - for (layno = layno0; layno < p_pocs->layno1 ; ++layno) { - /*index = step_r * resno + step_c * compno + step_l * layno;*/ - packet_array[comp_index] = 1; - comp_index += step_l; - } - - res_index += step_c; - } - - index += step_r; - } - ++p_pocs; - - /* iterate through all the pocs */ - for (i = 1; i < p_nb_pocs ; ++i) { - OPJ_UINT32 l_last_layno1 = (p_pocs - 1)->layno1 ; - - layno0 = (p_pocs->layno1 > l_last_layno1) ? l_last_layno1 : 0; - index = step_r * p_pocs->resno0; - - /* take each resolution for each poc */ - for (resno = p_pocs->resno0 ; resno < p_pocs->resno1 ; ++resno) { - OPJ_UINT32 res_index = index + p_pocs->compno0 * step_c; - - /* take each comp of each resolution for each poc */ - for (compno = p_pocs->compno0 ; compno < p_pocs->compno1 ; ++compno) { - OPJ_UINT32 comp_index = res_index + layno0 * step_l; - - /* and finally take each layer of each res of ... */ - for (layno = layno0; layno < p_pocs->layno1 ; ++layno) { - /*index = step_r * resno + step_c * compno + step_l * layno;*/ - packet_array[comp_index] = 1; - comp_index += step_l; - } - - res_index += step_c; - } - - index += step_r; - } - - ++p_pocs; - } - - index = 0; - for (layno = 0; layno < p_num_layers ; ++layno) { - for (resno = 0; resno < p_nb_resolutions; ++resno) { - for (compno = 0; compno < p_num_comps; ++compno) { - loss |= (packet_array[index] != 1); - /*index = step_r * resno + step_c * compno + step_l * layno;*/ - index += step_c; - } - } - } - - if (loss) { - opj_event_msg(p_manager, EVT_ERROR, "Missing packets possible loss of data\n"); - } - - opj_free(packet_array); - - return !loss; -} - -/* ----------------------------------------------------------------------- */ - -static OPJ_UINT32 opj_j2k_get_num_tp(opj_cp_t *cp, OPJ_UINT32 pino, - OPJ_UINT32 tileno) -{ - const OPJ_CHAR *prog = 00; - OPJ_INT32 i; - OPJ_UINT32 tpnum = 1; - opj_tcp_t *tcp = 00; - opj_poc_t * l_current_poc = 00; - - /* preconditions */ - assert(tileno < (cp->tw * cp->th)); - assert(pino < (cp->tcps[tileno].numpocs + 1)); - - /* get the given tile coding parameter */ - tcp = &cp->tcps[tileno]; - assert(tcp != 00); - - l_current_poc = &(tcp->pocs[pino]); - assert(l_current_poc != 0); - - /* get the progression order as a character string */ - prog = opj_j2k_convert_progression_order(tcp->prg); - assert(strlen(prog) > 0); - - if (cp->m_specific_param.m_enc.m_tp_on == 1) { - for (i = 0; i < 4; ++i) { - switch (prog[i]) { - /* component wise */ - case 'C': - tpnum *= l_current_poc->compE; - break; - /* resolution wise */ - case 'R': - tpnum *= l_current_poc->resE; - break; - /* precinct wise */ - case 'P': - tpnum *= l_current_poc->prcE; - break; - /* layer wise */ - case 'L': - tpnum *= l_current_poc->layE; - break; - } - /* whould we split here ? 
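[Editor's note] The POC validity check deleted above linearizes the 3-D (layer, resolution, component) space into one flat array using the strides step_c = 1, step_r = num_comps and step_l = nb_resolutions * step_r, then verifies every cell is covered by at least one progression-order change. A small stand-alone illustration of that indexing, with made-up extents:

#include <stdio.h>

int main(void)
{
    /* Hypothetical extents: 3 components, 4 resolutions, 2 layers. */
    const unsigned num_comps = 3, nb_res = 4, num_layers = 2;

    /* Strides as computed by the deleted opj_j2k_check_poc_val(): */
    const unsigned step_c = 1;
    const unsigned step_r = num_comps * step_c;   /* 3  */
    const unsigned step_l = nb_res * step_r;      /* 12 */

    unsigned layno, resno, compno;

    /* Each (layer,res,comp) triple maps to a unique slot in
     * [0, step_l * num_layers): c*step_c + r*step_r + l*step_l */
    for (layno = 0; layno < num_layers; ++layno)
        for (resno = 0; resno < nb_res; ++resno)
            for (compno = 0; compno < num_comps; ++compno)
                printf("l=%u r=%u c=%u -> %2u\n", layno, resno, compno,
                       compno * step_c + resno * step_r + layno * step_l);
    return 0;
}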
*/ - if (cp->m_specific_param.m_enc.m_tp_flag == prog[i]) { - cp->m_specific_param.m_enc.m_tp_pos = i; - break; - } - } - } else { - tpnum = 1; - } - - return tpnum; -} - -static OPJ_BOOL opj_j2k_calculate_tp(opj_j2k_t *p_j2k, - opj_cp_t *cp, - OPJ_UINT32 * p_nb_tiles, - opj_image_t *image, - opj_event_mgr_t * p_manager - ) -{ - OPJ_UINT32 pino, tileno; - OPJ_UINT32 l_nb_tiles; - opj_tcp_t *tcp; - - /* preconditions */ - assert(p_nb_tiles != 00); - assert(cp != 00); - assert(image != 00); - assert(p_j2k != 00); - assert(p_manager != 00); - - OPJ_UNUSED(p_j2k); - OPJ_UNUSED(p_manager); - - l_nb_tiles = cp->tw * cp->th; - * p_nb_tiles = 0; - tcp = cp->tcps; - - /* INDEX >> */ - /* TODO mergeV2: check this part which use cstr_info */ - /*if (p_j2k->cstr_info) { - opj_tile_info_t * l_info_tile_ptr = p_j2k->cstr_info->tile; - - for (tileno = 0; tileno < l_nb_tiles; ++tileno) { - OPJ_UINT32 cur_totnum_tp = 0; - - opj_pi_update_encoding_parameters(image,cp,tileno); - - for (pino = 0; pino <= tcp->numpocs; ++pino) - { - OPJ_UINT32 tp_num = opj_j2k_get_num_tp(cp,pino,tileno); - - *p_nb_tiles = *p_nb_tiles + tp_num; - - cur_totnum_tp += tp_num; - } - - tcp->m_nb_tile_parts = cur_totnum_tp; - - l_info_tile_ptr->tp = (opj_tp_info_t *) opj_malloc(cur_totnum_tp * sizeof(opj_tp_info_t)); - if (l_info_tile_ptr->tp == 00) { - return OPJ_FALSE; - } - - memset(l_info_tile_ptr->tp,0,cur_totnum_tp * sizeof(opj_tp_info_t)); - - l_info_tile_ptr->num_tps = cur_totnum_tp; - - ++l_info_tile_ptr; - ++tcp; - } - } - else */{ - for (tileno = 0; tileno < l_nb_tiles; ++tileno) { - OPJ_UINT32 cur_totnum_tp = 0; - - opj_pi_update_encoding_parameters(image, cp, tileno); - - for (pino = 0; pino <= tcp->numpocs; ++pino) { - OPJ_UINT32 tp_num = opj_j2k_get_num_tp(cp, pino, tileno); - - *p_nb_tiles = *p_nb_tiles + tp_num; - - cur_totnum_tp += tp_num; - } - tcp->m_nb_tile_parts = cur_totnum_tp; - - ++tcp; - } - } - - return OPJ_TRUE; -} - -static OPJ_BOOL opj_j2k_write_soc(opj_j2k_t *p_j2k, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager) -{ - /* 2 bytes will be written */ - OPJ_BYTE * l_start_stream = 00; - - /* preconditions */ - assert(p_stream != 00); - assert(p_j2k != 00); - assert(p_manager != 00); - - l_start_stream = p_j2k->m_specific_param.m_encoder.m_header_tile_data; - - /* write SOC identifier */ - opj_write_bytes(l_start_stream, J2K_MS_SOC, 2); - - if (opj_stream_write_data(p_stream, l_start_stream, 2, p_manager) != 2) { - return OPJ_FALSE; - } - - /* UniPG>> */ -#ifdef USE_JPWL - /* update markers struct */ - /* - OPJ_BOOL res = j2k_add_marker(p_j2k->cstr_info, J2K_MS_SOC, p_stream_tell(p_stream) - 2, 2); - */ - assert(0 && "TODO"); -#endif /* USE_JPWL */ - /* <m_specific_param.m_decoder.m_state = J2K_STATE_MHSIZ; - - /* FIXME move it in a index structure included in p_j2k*/ - p_j2k->cstr_index->main_head_start = opj_stream_tell(p_stream) - 2; - - opj_event_msg(p_manager, EVT_INFO, "Start to read j2k main header (%d).\n", - p_j2k->cstr_index->main_head_start); - - /* Add the marker to the codestream index*/ - if (OPJ_FALSE == opj_j2k_add_mhmarker(p_j2k->cstr_index, J2K_MS_SOC, - p_j2k->cstr_index->main_head_start, 2)) { - opj_event_msg(p_manager, EVT_ERROR, "Not enough memory to add mh marker\n"); - return OPJ_FALSE; - } - return OPJ_TRUE; -} - -static OPJ_BOOL opj_j2k_write_siz(opj_j2k_t *p_j2k, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager) -{ - OPJ_UINT32 i; - OPJ_UINT32 l_size_len; - OPJ_BYTE * l_current_ptr; - opj_image_t * l_image = 00; - opj_cp_t *cp = 00; - 
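[Editor's note] When tile parts are enabled, the opj_j2k_get_num_tp() logic deleted earlier in this hunk walks the four-letter progression string, multiplying the extent of each dimension until it reaches the letter chosen as the tile-part split flag; opj_j2k_calculate_tp() then just sums that count over all POCs and tiles. A stand-alone rendering of the per-POC computation, with hypothetical extents:

#include <stdio.h>

/* Multiply dimension extents for each letter of the progression order
 * until the tile-part flag letter has been consumed (the same walk as
 * the deleted opj_j2k_get_num_tp). compE/resE/prcE/layE are extents. */
static unsigned num_tile_parts(const char prog[4], char tp_flag,
                               unsigned compE, unsigned resE,
                               unsigned prcE, unsigned layE)
{
    unsigned tpnum = 1;
    int i;
    for (i = 0; i < 4; ++i) {
        switch (prog[i]) {
        case 'C': tpnum *= compE; break;
        case 'R': tpnum *= resE;  break;
        case 'P': tpnum *= prcE;  break;
        case 'L': tpnum *= layE;  break;
        }
        if (prog[i] == tp_flag) {
            break;   /* split into tile parts at this level */
        }
    }
    return tpnum;
}

int main(void)
{
    /* "RLCP" split on 'L': 6 resolutions * 3 layers = 18 tile parts. */
    printf("%u\n", num_tile_parts("RLCP", 'L', 3, 6, 4, 3));
    return 0;
}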
opj_image_comp_t * l_img_comp = 00; - - /* preconditions */ - assert(p_stream != 00); - assert(p_j2k != 00); - assert(p_manager != 00); - - l_image = p_j2k->m_private_image; - cp = &(p_j2k->m_cp); - l_size_len = 40 + 3 * l_image->numcomps; - l_img_comp = l_image->comps; - - if (l_size_len > p_j2k->m_specific_param.m_encoder.m_header_tile_data_size) { - - OPJ_BYTE *new_header_tile_data = (OPJ_BYTE *) opj_realloc( - p_j2k->m_specific_param.m_encoder.m_header_tile_data, l_size_len); - if (! new_header_tile_data) { - opj_free(p_j2k->m_specific_param.m_encoder.m_header_tile_data); - p_j2k->m_specific_param.m_encoder.m_header_tile_data = NULL; - p_j2k->m_specific_param.m_encoder.m_header_tile_data_size = 0; - opj_event_msg(p_manager, EVT_ERROR, "Not enough memory for the SIZ marker\n"); - return OPJ_FALSE; - } - p_j2k->m_specific_param.m_encoder.m_header_tile_data = new_header_tile_data; - p_j2k->m_specific_param.m_encoder.m_header_tile_data_size = l_size_len; - } - - l_current_ptr = p_j2k->m_specific_param.m_encoder.m_header_tile_data; - - /* write SOC identifier */ - opj_write_bytes(l_current_ptr, J2K_MS_SIZ, 2); /* SIZ */ - l_current_ptr += 2; - - opj_write_bytes(l_current_ptr, l_size_len - 2, 2); /* L_SIZ */ - l_current_ptr += 2; - - opj_write_bytes(l_current_ptr, cp->rsiz, 2); /* Rsiz (capabilities) */ - l_current_ptr += 2; - - opj_write_bytes(l_current_ptr, l_image->x1, 4); /* Xsiz */ - l_current_ptr += 4; - - opj_write_bytes(l_current_ptr, l_image->y1, 4); /* Ysiz */ - l_current_ptr += 4; - - opj_write_bytes(l_current_ptr, l_image->x0, 4); /* X0siz */ - l_current_ptr += 4; - - opj_write_bytes(l_current_ptr, l_image->y0, 4); /* Y0siz */ - l_current_ptr += 4; - - opj_write_bytes(l_current_ptr, cp->tdx, 4); /* XTsiz */ - l_current_ptr += 4; - - opj_write_bytes(l_current_ptr, cp->tdy, 4); /* YTsiz */ - l_current_ptr += 4; - - opj_write_bytes(l_current_ptr, cp->tx0, 4); /* XT0siz */ - l_current_ptr += 4; - - opj_write_bytes(l_current_ptr, cp->ty0, 4); /* YT0siz */ - l_current_ptr += 4; - - opj_write_bytes(l_current_ptr, l_image->numcomps, 2); /* Csiz */ - l_current_ptr += 2; - - for (i = 0; i < l_image->numcomps; ++i) { - /* TODO here with MCT ? */ - opj_write_bytes(l_current_ptr, l_img_comp->prec - 1 + (l_img_comp->sgnd << 7), - 1); /* Ssiz_i */ - ++l_current_ptr; - - opj_write_bytes(l_current_ptr, l_img_comp->dx, 1); /* XRsiz_i */ - ++l_current_ptr; - - opj_write_bytes(l_current_ptr, l_img_comp->dy, 1); /* YRsiz_i */ - ++l_current_ptr; - - ++l_img_comp; - } - - if (opj_stream_write_data(p_stream, - p_j2k->m_specific_param.m_encoder.m_header_tile_data, l_size_len, - p_manager) != l_size_len) { - return OPJ_FALSE; - } - - return OPJ_TRUE; -} - -/** - * Reads a SIZ marker (image and tile size) - * @param p_j2k the jpeg2000 file codec. - * @param p_header_data the data contained in the SIZ box. - * @param p_header_size the size of the data contained in the SIZ marker. - * @param p_manager the user event manager. 
-*/ -static OPJ_BOOL opj_j2k_read_siz(opj_j2k_t *p_j2k, - OPJ_BYTE * p_header_data, - OPJ_UINT32 p_header_size, - opj_event_mgr_t * p_manager - ) -{ - OPJ_UINT32 i; - OPJ_UINT32 l_nb_comp; - OPJ_UINT32 l_nb_comp_remain; - OPJ_UINT32 l_remaining_size; - OPJ_UINT32 l_nb_tiles; - OPJ_UINT32 l_tmp, l_tx1, l_ty1; - OPJ_UINT32 l_prec0, l_sgnd0; - opj_image_t *l_image = 00; - opj_cp_t *l_cp = 00; - opj_image_comp_t * l_img_comp = 00; - opj_tcp_t * l_current_tile_param = 00; - - /* preconditions */ - assert(p_j2k != 00); - assert(p_manager != 00); - assert(p_header_data != 00); - - l_image = p_j2k->m_private_image; - l_cp = &(p_j2k->m_cp); - - /* minimum size == 39 - 3 (= minimum component parameter) */ - if (p_header_size < 36) { - opj_event_msg(p_manager, EVT_ERROR, "Error with SIZ marker size\n"); - return OPJ_FALSE; - } - - l_remaining_size = p_header_size - 36; - l_nb_comp = l_remaining_size / 3; - l_nb_comp_remain = l_remaining_size % 3; - if (l_nb_comp_remain != 0) { - opj_event_msg(p_manager, EVT_ERROR, "Error with SIZ marker size\n"); - return OPJ_FALSE; - } - - opj_read_bytes(p_header_data, &l_tmp, - 2); /* Rsiz (capabilities) */ - p_header_data += 2; - l_cp->rsiz = (OPJ_UINT16) l_tmp; - opj_read_bytes(p_header_data, (OPJ_UINT32*) &l_image->x1, 4); /* Xsiz */ - p_header_data += 4; - opj_read_bytes(p_header_data, (OPJ_UINT32*) &l_image->y1, 4); /* Ysiz */ - p_header_data += 4; - opj_read_bytes(p_header_data, (OPJ_UINT32*) &l_image->x0, 4); /* X0siz */ - p_header_data += 4; - opj_read_bytes(p_header_data, (OPJ_UINT32*) &l_image->y0, 4); /* Y0siz */ - p_header_data += 4; - opj_read_bytes(p_header_data, (OPJ_UINT32*) &l_cp->tdx, - 4); /* XTsiz */ - p_header_data += 4; - opj_read_bytes(p_header_data, (OPJ_UINT32*) &l_cp->tdy, - 4); /* YTsiz */ - p_header_data += 4; - opj_read_bytes(p_header_data, (OPJ_UINT32*) &l_cp->tx0, - 4); /* XT0siz */ - p_header_data += 4; - opj_read_bytes(p_header_data, (OPJ_UINT32*) &l_cp->ty0, - 4); /* YT0siz */ - p_header_data += 4; - opj_read_bytes(p_header_data, (OPJ_UINT32*) &l_tmp, - 2); /* Csiz */ - p_header_data += 2; - if (l_tmp < 16385) { - l_image->numcomps = (OPJ_UINT16) l_tmp; - } else { - opj_event_msg(p_manager, EVT_ERROR, - "Error with SIZ marker: number of component is illegal -> %d\n", l_tmp); - return OPJ_FALSE; - } - - if (l_image->numcomps != l_nb_comp) { - opj_event_msg(p_manager, EVT_ERROR, - "Error with SIZ marker: number of component is not compatible with the remaining number of parameters ( %d vs %d)\n", - l_image->numcomps, l_nb_comp); - return OPJ_FALSE; - } - - /* testcase 4035.pdf.SIGSEGV.d8b.3375 */ - /* testcase issue427-null-image-size.jp2 */ - if ((l_image->x0 >= l_image->x1) || (l_image->y0 >= l_image->y1)) { - opj_event_msg(p_manager, EVT_ERROR, - "Error with SIZ marker: negative or zero image size (%" PRId64 " x %" PRId64 - ")\n", (OPJ_INT64)l_image->x1 - l_image->x0, - (OPJ_INT64)l_image->y1 - l_image->y0); - return OPJ_FALSE; - } - /* testcase 2539.pdf.SIGFPE.706.1712 (also 3622.pdf.SIGFPE.706.2916 and 4008.pdf.SIGFPE.706.3345 and maybe more) */ - if ((l_cp->tdx == 0U) || (l_cp->tdy == 0U)) { - opj_event_msg(p_manager, EVT_ERROR, - "Error with SIZ marker: invalid tile size (tdx: %d, tdy: %d)\n", l_cp->tdx, - l_cp->tdy); - return OPJ_FALSE; - } - - /* testcase issue427-illegal-tile-offset.jp2 */ - l_tx1 = opj_uint_adds(l_cp->tx0, l_cp->tdx); /* manage overflow */ - l_ty1 = opj_uint_adds(l_cp->ty0, l_cp->tdy); /* manage overflow */ - if ((l_cp->tx0 > l_image->x0) || (l_cp->ty0 > l_image->y0) || - (l_tx1 <= l_image->x0) || (l_ty1 
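[Editor's note] A useful cross-check on the size handling in the SIZ reader above: the segment body is a fixed 36 bytes (Rsiz through Csiz; equivalently Lsiz = 38 + 3*Csiz minus the two length bytes) followed by exactly three bytes per component, so the component count is (p_header_size - 36) / 3 and any remainder is rejected. A tiny stand-alone version of that check:

#include <stdio.h>

/* Derive the component count from the SIZ body size the way the
 * deleted opj_j2k_read_siz() does; returns -1 on a malformed length. */
static int siz_num_components(unsigned header_size)
{
    if (header_size < 36) {            /* fixed part: Rsiz..Csiz        */
        return -1;
    }
    if ((header_size - 36) % 3 != 0) { /* Ssiz,XRsiz,YRsiz per comp     */
        return -1;
    }
    return (int)((header_size - 36) / 3);
}

int main(void)
{
    printf("%d\n", siz_num_components(36 + 3 * 3)); /* 3 components     */
    printf("%d\n", siz_num_components(44));         /* -1: bad length   */
    return 0;
}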
<= l_image->y0)) { - opj_event_msg(p_manager, EVT_ERROR, - "Error with SIZ marker: illegal tile offset\n"); - return OPJ_FALSE; - } - if (!p_j2k->dump_state) { - OPJ_UINT32 siz_w, siz_h; - - siz_w = l_image->x1 - l_image->x0; - siz_h = l_image->y1 - l_image->y0; - - if (p_j2k->ihdr_w > 0 && p_j2k->ihdr_h > 0 - && (p_j2k->ihdr_w != siz_w || p_j2k->ihdr_h != siz_h)) { - opj_event_msg(p_manager, EVT_ERROR, - "Error with SIZ marker: IHDR w(%u) h(%u) vs. SIZ w(%u) h(%u)\n", p_j2k->ihdr_w, - p_j2k->ihdr_h, siz_w, siz_h); - return OPJ_FALSE; - } - } -#ifdef USE_JPWL - if (l_cp->correct) { - /* if JPWL is on, we check whether TX errors have damaged - too much the SIZ parameters */ - if (!(l_image->x1 * l_image->y1)) { - opj_event_msg(p_manager, EVT_ERROR, - "JPWL: bad image size (%d x %d)\n", - l_image->x1, l_image->y1); - if (!JPWL_ASSUME) { - opj_event_msg(p_manager, EVT_ERROR, "JPWL: giving up\n"); - return OPJ_FALSE; - } - } - - /* FIXME check previously in the function so why keep this piece of code ? Need by the norm ? - if (l_image->numcomps != ((len - 38) / 3)) { - opj_event_msg(p_manager, JPWL_ASSUME ? EVT_WARNING : EVT_ERROR, - "JPWL: Csiz is %d => space in SIZ only for %d comps.!!!\n", - l_image->numcomps, ((len - 38) / 3)); - if (!JPWL_ASSUME) { - opj_event_msg(p_manager, EVT_ERROR, "JPWL: giving up\n"); - return OPJ_FALSE; - } - */ /* we try to correct */ - /* opj_event_msg(p_manager, EVT_WARNING, "- trying to adjust this\n"); - if (l_image->numcomps < ((len - 38) / 3)) { - len = 38 + 3 * l_image->numcomps; - opj_event_msg(p_manager, EVT_WARNING, "- setting Lsiz to %d => HYPOTHESIS!!!\n", - len); - } else { - l_image->numcomps = ((len - 38) / 3); - opj_event_msg(p_manager, EVT_WARNING, "- setting Csiz to %d => HYPOTHESIS!!!\n", - l_image->numcomps); - } - } - */ - - /* update components number in the jpwl_exp_comps filed */ - l_cp->exp_comps = l_image->numcomps; - } -#endif /* USE_JPWL */ - - /* Allocate the resulting image components */ - l_image->comps = (opj_image_comp_t*) opj_calloc(l_image->numcomps, - sizeof(opj_image_comp_t)); - if (l_image->comps == 00) { - l_image->numcomps = 0; - opj_event_msg(p_manager, EVT_ERROR, - "Not enough memory to take in charge SIZ marker\n"); - return OPJ_FALSE; - } - - l_img_comp = l_image->comps; - - l_prec0 = 0; - l_sgnd0 = 0; - /* Read the component information */ - for (i = 0; i < l_image->numcomps; ++i) { - OPJ_UINT32 tmp; - opj_read_bytes(p_header_data, &tmp, 1); /* Ssiz_i */ - ++p_header_data; - l_img_comp->prec = (tmp & 0x7f) + 1; - l_img_comp->sgnd = tmp >> 7; - - if (p_j2k->dump_state == 0) { - if (i == 0) { - l_prec0 = l_img_comp->prec; - l_sgnd0 = l_img_comp->sgnd; - } else if (!l_cp->allow_different_bit_depth_sign - && (l_img_comp->prec != l_prec0 || l_img_comp->sgnd != l_sgnd0)) { - opj_event_msg(p_manager, EVT_WARNING, - "Despite JP2 BPC!=255, precision and/or sgnd values for comp[%d] is different than comp[0]:\n" - " [0] prec(%d) sgnd(%d) [%d] prec(%d) sgnd(%d)\n", i, l_prec0, l_sgnd0, - i, l_img_comp->prec, l_img_comp->sgnd); - } - /* TODO: we should perhaps also check against JP2 BPCC values */ - } - opj_read_bytes(p_header_data, &tmp, 1); /* XRsiz_i */ - ++p_header_data; - l_img_comp->dx = (OPJ_UINT32)tmp; /* should be between 1 and 255 */ - opj_read_bytes(p_header_data, &tmp, 1); /* YRsiz_i */ - ++p_header_data; - l_img_comp->dy = (OPJ_UINT32)tmp; /* should be between 1 and 255 */ - if (l_img_comp->dx < 1 || l_img_comp->dx > 255 || - l_img_comp->dy < 1 || l_img_comp->dy > 255) { - opj_event_msg(p_manager, EVT_ERROR, - "Invalid 
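[Editor's note] The Ssiz byte decoded above packs two fields: bit 7 is the signedness flag and the low seven bits hold precision minus one. A stand-alone pack/unpack sketch mirroring that layout:

#include <stdio.h>

/* Ssiz layout: bit 7 = signed flag, bits 0-6 = precision - 1. */
static unsigned char ssiz_pack(unsigned prec, unsigned sgnd)
{
    return (unsigned char)((prec - 1) | (sgnd << 7));
}

static void ssiz_unpack(unsigned char ssiz, unsigned *prec, unsigned *sgnd)
{
    *prec = (ssiz & 0x7f) + 1;   /* same expressions as the deleted reader */
    *sgnd = ssiz >> 7;
}

int main(void)
{
    unsigned prec, sgnd;

    ssiz_unpack(ssiz_pack(8, 0), &prec, &sgnd);   /* 8-bit unsigned -> 0x07 */
    printf("0x%02X -> prec=%u sgnd=%u\n", ssiz_pack(8, 0), prec, sgnd);

    ssiz_unpack(ssiz_pack(12, 1), &prec, &sgnd);  /* 12-bit signed -> 0x8B  */
    printf("0x%02X -> prec=%u sgnd=%u\n", ssiz_pack(12, 1), prec, sgnd);
    return 0;
}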
values for comp = %d : dx=%u dy=%u (should be between 1 and 255 according to the JPEG2000 norm)\n", - i, l_img_comp->dx, l_img_comp->dy); - return OPJ_FALSE; - } - /* Avoids later undefined shift in computation of */ - /* p_j2k->m_specific_param.m_decoder.m_default_tcp->tccps[i].m_dc_level_shift = 1 - << (l_image->comps[i].prec - 1); */ - if (l_img_comp->prec > 31) { - opj_event_msg(p_manager, EVT_ERROR, - "Invalid values for comp = %d : prec=%u (should be between 1 and 38 according to the JPEG2000 norm. OpenJpeg only supports up to 31)\n", - i, l_img_comp->prec); - return OPJ_FALSE; - } -#ifdef USE_JPWL - if (l_cp->correct) { - /* if JPWL is on, we check whether TX errors have damaged - too much the SIZ parameters, again */ - if (!(l_image->comps[i].dx * l_image->comps[i].dy)) { - opj_event_msg(p_manager, JPWL_ASSUME ? EVT_WARNING : EVT_ERROR, - "JPWL: bad XRsiz_%d/YRsiz_%d (%d x %d)\n", - i, i, l_image->comps[i].dx, l_image->comps[i].dy); - if (!JPWL_ASSUME) { - opj_event_msg(p_manager, EVT_ERROR, "JPWL: giving up\n"); - return OPJ_FALSE; - } - /* we try to correct */ - opj_event_msg(p_manager, EVT_WARNING, "- trying to adjust them\n"); - if (!l_image->comps[i].dx) { - l_image->comps[i].dx = 1; - opj_event_msg(p_manager, EVT_WARNING, - "- setting XRsiz_%d to %d => HYPOTHESIS!!!\n", - i, l_image->comps[i].dx); - } - if (!l_image->comps[i].dy) { - l_image->comps[i].dy = 1; - opj_event_msg(p_manager, EVT_WARNING, - "- setting YRsiz_%d to %d => HYPOTHESIS!!!\n", - i, l_image->comps[i].dy); - } - } - } -#endif /* USE_JPWL */ - l_img_comp->resno_decoded = - 0; /* number of resolution decoded */ - l_img_comp->factor = - l_cp->m_specific_param.m_dec.m_reduce; /* reducing factor per component */ - ++l_img_comp; - } - - if (l_cp->tdx == 0 || l_cp->tdy == 0) { - return OPJ_FALSE; - } - - /* Compute the number of tiles */ - l_cp->tw = (OPJ_UINT32)opj_int_ceildiv((OPJ_INT32)(l_image->x1 - l_cp->tx0), - (OPJ_INT32)l_cp->tdx); - l_cp->th = (OPJ_UINT32)opj_int_ceildiv((OPJ_INT32)(l_image->y1 - l_cp->ty0), - (OPJ_INT32)l_cp->tdy); - - /* Check that the number of tiles is valid */ - if (l_cp->tw == 0 || l_cp->th == 0 || l_cp->tw > 65535 / l_cp->th) { - opj_event_msg(p_manager, EVT_ERROR, - "Invalid number of tiles : %u x %u (maximum fixed by jpeg2000 norm is 65535 tiles)\n", - l_cp->tw, l_cp->th); - return OPJ_FALSE; - } - l_nb_tiles = l_cp->tw * l_cp->th; - - /* Define the tiles which will be decoded */ - if (p_j2k->m_specific_param.m_decoder.m_discard_tiles) { - p_j2k->m_specific_param.m_decoder.m_start_tile_x = - (p_j2k->m_specific_param.m_decoder.m_start_tile_x - l_cp->tx0) / l_cp->tdx; - p_j2k->m_specific_param.m_decoder.m_start_tile_y = - (p_j2k->m_specific_param.m_decoder.m_start_tile_y - l_cp->ty0) / l_cp->tdy; - p_j2k->m_specific_param.m_decoder.m_end_tile_x = (OPJ_UINT32)opj_int_ceildiv(( - OPJ_INT32)(p_j2k->m_specific_param.m_decoder.m_end_tile_x - l_cp->tx0), - (OPJ_INT32)l_cp->tdx); - p_j2k->m_specific_param.m_decoder.m_end_tile_y = (OPJ_UINT32)opj_int_ceildiv(( - OPJ_INT32)(p_j2k->m_specific_param.m_decoder.m_end_tile_y - l_cp->ty0), - (OPJ_INT32)l_cp->tdy); - } else { - p_j2k->m_specific_param.m_decoder.m_start_tile_x = 0; - p_j2k->m_specific_param.m_decoder.m_start_tile_y = 0; - p_j2k->m_specific_param.m_decoder.m_end_tile_x = l_cp->tw; - p_j2k->m_specific_param.m_decoder.m_end_tile_y = l_cp->th; - } - -#ifdef USE_JPWL - if (l_cp->correct) { - /* if JPWL is on, we check whether TX errors have damaged - too much the SIZ parameters */ - if ((l_cp->tw < 1) || (l_cp->th < 1) || (l_cp->tw > 
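[Editor's note] The tile-grid computation above is a ceiling division of the image extent by the tile size, anchored at the tile origin, with the product tw*th capped at 65535. A worked stand-alone version with hypothetical geometry:

#include <stdio.h>

/* Ceiling division for positive operands, as used for the tile grid. */
static unsigned ceildiv(unsigned a, unsigned b)
{
    return (a + b - 1) / b;
}

int main(void)
{
    /* Hypothetical geometry: 1000x600 image, origin 0, 256x256 tiles. */
    unsigned x1 = 1000, y1 = 600, tx0 = 0, ty0 = 0, tdx = 256, tdy = 256;

    unsigned tw = ceildiv(x1 - tx0, tdx);  /* 4 */
    unsigned th = ceildiv(y1 - ty0, tdy);  /* 3 */

    if (tw == 0 || th == 0 || tw > 65535 / th) {
        fprintf(stderr, "invalid number of tiles\n");
        return 1;
    }
    printf("%u x %u = %u tiles\n", tw, th, tw * th);  /* 4 x 3 = 12 */
    return 0;
}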
l_cp->max_tiles) || - (l_cp->th > l_cp->max_tiles)) { - opj_event_msg(p_manager, JPWL_ASSUME ? EVT_WARNING : EVT_ERROR, - "JPWL: bad number of tiles (%d x %d)\n", - l_cp->tw, l_cp->th); - if (!JPWL_ASSUME) { - opj_event_msg(p_manager, EVT_ERROR, "JPWL: giving up\n"); - return OPJ_FALSE; - } - /* we try to correct */ - opj_event_msg(p_manager, EVT_WARNING, "- trying to adjust them\n"); - if (l_cp->tw < 1) { - l_cp->tw = 1; - opj_event_msg(p_manager, EVT_WARNING, - "- setting %d tiles in x => HYPOTHESIS!!!\n", - l_cp->tw); - } - if (l_cp->tw > l_cp->max_tiles) { - l_cp->tw = 1; - opj_event_msg(p_manager, EVT_WARNING, - "- too large x, increase expectance of %d\n" - "- setting %d tiles in x => HYPOTHESIS!!!\n", - l_cp->max_tiles, l_cp->tw); - } - if (l_cp->th < 1) { - l_cp->th = 1; - opj_event_msg(p_manager, EVT_WARNING, - "- setting %d tiles in y => HYPOTHESIS!!!\n", - l_cp->th); - } - if (l_cp->th > l_cp->max_tiles) { - l_cp->th = 1; - opj_event_msg(p_manager, EVT_WARNING, - "- too large y, increase expectance of %d to continue\n", - "- setting %d tiles in y => HYPOTHESIS!!!\n", - l_cp->max_tiles, l_cp->th); - } - } - } -#endif /* USE_JPWL */ - - /* memory allocations */ - l_cp->tcps = (opj_tcp_t*) opj_calloc(l_nb_tiles, sizeof(opj_tcp_t)); - if (l_cp->tcps == 00) { - opj_event_msg(p_manager, EVT_ERROR, - "Not enough memory to take in charge SIZ marker\n"); - return OPJ_FALSE; - } - -#ifdef USE_JPWL - if (l_cp->correct) { - if (!l_cp->tcps) { - opj_event_msg(p_manager, JPWL_ASSUME ? EVT_WARNING : EVT_ERROR, - "JPWL: could not alloc tcps field of cp\n"); - if (!JPWL_ASSUME) { - opj_event_msg(p_manager, EVT_ERROR, "JPWL: giving up\n"); - return OPJ_FALSE; - } - } - } -#endif /* USE_JPWL */ - - p_j2k->m_specific_param.m_decoder.m_default_tcp->tccps = - (opj_tccp_t*) opj_calloc(l_image->numcomps, sizeof(opj_tccp_t)); - if (p_j2k->m_specific_param.m_decoder.m_default_tcp->tccps == 00) { - opj_event_msg(p_manager, EVT_ERROR, - "Not enough memory to take in charge SIZ marker\n"); - return OPJ_FALSE; - } - - p_j2k->m_specific_param.m_decoder.m_default_tcp->m_mct_records = - (opj_mct_data_t*)opj_calloc(OPJ_J2K_MCT_DEFAULT_NB_RECORDS, - sizeof(opj_mct_data_t)); - - if (! p_j2k->m_specific_param.m_decoder.m_default_tcp->m_mct_records) { - opj_event_msg(p_manager, EVT_ERROR, - "Not enough memory to take in charge SIZ marker\n"); - return OPJ_FALSE; - } - p_j2k->m_specific_param.m_decoder.m_default_tcp->m_nb_max_mct_records = - OPJ_J2K_MCT_DEFAULT_NB_RECORDS; - - p_j2k->m_specific_param.m_decoder.m_default_tcp->m_mcc_records = - (opj_simple_mcc_decorrelation_data_t*) - opj_calloc(OPJ_J2K_MCC_DEFAULT_NB_RECORDS, - sizeof(opj_simple_mcc_decorrelation_data_t)); - - if (! p_j2k->m_specific_param.m_decoder.m_default_tcp->m_mcc_records) { - opj_event_msg(p_manager, EVT_ERROR, - "Not enough memory to take in charge SIZ marker\n"); - return OPJ_FALSE; - } - p_j2k->m_specific_param.m_decoder.m_default_tcp->m_nb_max_mcc_records = - OPJ_J2K_MCC_DEFAULT_NB_RECORDS; - - /* set up default dc level shift */ - for (i = 0; i < l_image->numcomps; ++i) { - if (! 
l_image->comps[i].sgnd) { - p_j2k->m_specific_param.m_decoder.m_default_tcp->tccps[i].m_dc_level_shift = 1 - << (l_image->comps[i].prec - 1); - } - } - - l_current_tile_param = l_cp->tcps; - for (i = 0; i < l_nb_tiles; ++i) { - l_current_tile_param->tccps = (opj_tccp_t*) opj_calloc(l_image->numcomps, - sizeof(opj_tccp_t)); - if (l_current_tile_param->tccps == 00) { - opj_event_msg(p_manager, EVT_ERROR, - "Not enough memory to take in charge SIZ marker\n"); - return OPJ_FALSE; - } - - ++l_current_tile_param; - } - - p_j2k->m_specific_param.m_decoder.m_state = J2K_STATE_MH; - opj_image_comp_header_update(l_image, l_cp); - - return OPJ_TRUE; -} - -static OPJ_BOOL opj_j2k_write_com(opj_j2k_t *p_j2k, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager - ) -{ - OPJ_UINT32 l_comment_size; - OPJ_UINT32 l_total_com_size; - const OPJ_CHAR *l_comment; - OPJ_BYTE * l_current_ptr = 00; - - /* preconditions */ - assert(p_j2k != 00); - assert(p_stream != 00); - assert(p_manager != 00); - - l_comment = p_j2k->m_cp.comment; - l_comment_size = (OPJ_UINT32)strlen(l_comment); - l_total_com_size = l_comment_size + 6; - - if (l_total_com_size > - p_j2k->m_specific_param.m_encoder.m_header_tile_data_size) { - OPJ_BYTE *new_header_tile_data = (OPJ_BYTE *) opj_realloc( - p_j2k->m_specific_param.m_encoder.m_header_tile_data, l_total_com_size); - if (! new_header_tile_data) { - opj_free(p_j2k->m_specific_param.m_encoder.m_header_tile_data); - p_j2k->m_specific_param.m_encoder.m_header_tile_data = NULL; - p_j2k->m_specific_param.m_encoder.m_header_tile_data_size = 0; - opj_event_msg(p_manager, EVT_ERROR, - "Not enough memory to write the COM marker\n"); - return OPJ_FALSE; - } - p_j2k->m_specific_param.m_encoder.m_header_tile_data = new_header_tile_data; - p_j2k->m_specific_param.m_encoder.m_header_tile_data_size = l_total_com_size; - } - - l_current_ptr = p_j2k->m_specific_param.m_encoder.m_header_tile_data; - - opj_write_bytes(l_current_ptr, J2K_MS_COM, 2); /* COM */ - l_current_ptr += 2; - - opj_write_bytes(l_current_ptr, l_total_com_size - 2, 2); /* L_COM */ - l_current_ptr += 2; - - opj_write_bytes(l_current_ptr, 1, - 2); /* General use (IS 8859-15:1999 (Latin) values) */ - l_current_ptr += 2; - - memcpy(l_current_ptr, l_comment, l_comment_size); - - if (opj_stream_write_data(p_stream, - p_j2k->m_specific_param.m_encoder.m_header_tile_data, l_total_com_size, - p_manager) != l_total_com_size) { - return OPJ_FALSE; - } - - return OPJ_TRUE; -} - -/** - * Reads a COM marker (comments) - * @param p_j2k the jpeg2000 file codec. - * @param p_header_data the data contained in the COM box. - * @param p_header_size the size of the data contained in the COM marker. - * @param p_manager the user event manager. 
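[Editor's note] The COM writer above reserves strlen(comment) + 6 bytes: two for the marker, two for the length field (which covers everything after the marker, hence the written value of total - 2), and two for the registration value (1 = general use, Latin values, per the deleted writer's comment). A stand-alone sketch of the same layout; com_write is a hypothetical helper:

#include <stdio.h>
#include <string.h>

/* Serialize a COM segment: marker, Lcom, Rcom, then the raw text.
 * Returns the total number of bytes written (strlen(text) + 6). */
static size_t com_write(unsigned char *p, const char *text)
{
    size_t len   = strlen(text);
    size_t total = len + 6;
    size_t lcom  = total - 2;       /* length field excludes the marker */

    p[0] = 0xFF; p[1] = 0x64;       /* COM marker                       */
    p[2] = (unsigned char)(lcom >> 8);
    p[3] = (unsigned char)(lcom & 0xFF);
    p[4] = 0x00; p[5] = 0x01;       /* Rcom = 1: Latin text             */
    memcpy(p + 6, text, len);
    return total;
}

int main(void)
{
    unsigned char buf[64];
    size_t n = com_write(buf, "Hello");
    size_t i;

    for (i = 0; i < n; ++i) {
        printf("%02X ", buf[i]);
    }
    printf("\n");   /* FF 64 00 09 00 01 48 65 6C 6C 6F */
    return 0;
}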
-*/ -static OPJ_BOOL opj_j2k_read_com(opj_j2k_t *p_j2k, - OPJ_BYTE * p_header_data, - OPJ_UINT32 p_header_size, - opj_event_mgr_t * p_manager - ) -{ - /* preconditions */ - assert(p_j2k != 00); - assert(p_manager != 00); - assert(p_header_data != 00); - - OPJ_UNUSED(p_j2k); - OPJ_UNUSED(p_header_data); - OPJ_UNUSED(p_header_size); - OPJ_UNUSED(p_manager); - - return OPJ_TRUE; -} - -static OPJ_BOOL opj_j2k_write_cod(opj_j2k_t *p_j2k, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager) -{ - opj_cp_t *l_cp = 00; - opj_tcp_t *l_tcp = 00; - OPJ_UINT32 l_code_size, l_remaining_size; - OPJ_BYTE * l_current_data = 00; - - /* preconditions */ - assert(p_j2k != 00); - assert(p_manager != 00); - assert(p_stream != 00); - - l_cp = &(p_j2k->m_cp); - l_tcp = &l_cp->tcps[p_j2k->m_current_tile_number]; - l_code_size = 9 + opj_j2k_get_SPCod_SPCoc_size(p_j2k, - p_j2k->m_current_tile_number, 0); - l_remaining_size = l_code_size; - - if (l_code_size > p_j2k->m_specific_param.m_encoder.m_header_tile_data_size) { - OPJ_BYTE *new_header_tile_data = (OPJ_BYTE *) opj_realloc( - p_j2k->m_specific_param.m_encoder.m_header_tile_data, l_code_size); - if (! new_header_tile_data) { - opj_free(p_j2k->m_specific_param.m_encoder.m_header_tile_data); - p_j2k->m_specific_param.m_encoder.m_header_tile_data = NULL; - p_j2k->m_specific_param.m_encoder.m_header_tile_data_size = 0; - opj_event_msg(p_manager, EVT_ERROR, "Not enough memory to write COD marker\n"); - return OPJ_FALSE; - } - p_j2k->m_specific_param.m_encoder.m_header_tile_data = new_header_tile_data; - p_j2k->m_specific_param.m_encoder.m_header_tile_data_size = l_code_size; - } - - l_current_data = p_j2k->m_specific_param.m_encoder.m_header_tile_data; - - opj_write_bytes(l_current_data, J2K_MS_COD, 2); /* COD */ - l_current_data += 2; - - opj_write_bytes(l_current_data, l_code_size - 2, 2); /* L_COD */ - l_current_data += 2; - - opj_write_bytes(l_current_data, l_tcp->csty, 1); /* Scod */ - ++l_current_data; - - opj_write_bytes(l_current_data, (OPJ_UINT32)l_tcp->prg, 1); /* SGcod (A) */ - ++l_current_data; - - opj_write_bytes(l_current_data, l_tcp->numlayers, 2); /* SGcod (B) */ - l_current_data += 2; - - opj_write_bytes(l_current_data, l_tcp->mct, 1); /* SGcod (C) */ - ++l_current_data; - - l_remaining_size -= 9; - - if (! opj_j2k_write_SPCod_SPCoc(p_j2k, p_j2k->m_current_tile_number, 0, - l_current_data, &l_remaining_size, p_manager)) { - opj_event_msg(p_manager, EVT_ERROR, "Error writing COD marker\n"); - return OPJ_FALSE; - } - - if (l_remaining_size != 0) { - opj_event_msg(p_manager, EVT_ERROR, "Error writing COD marker\n"); - return OPJ_FALSE; - } - - if (opj_stream_write_data(p_stream, - p_j2k->m_specific_param.m_encoder.m_header_tile_data, l_code_size, - p_manager) != l_code_size) { - return OPJ_FALSE; - } - - return OPJ_TRUE; -} - -/** - * Reads a COD marker (Coding style defaults) - * @param p_header_data the data contained in the COD box. - * @param p_j2k the jpeg2000 codec. - * @param p_header_size the size of the data contained in the COD marker. - * @param p_manager the user event manager. 
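[Editor's note] The COD body written above begins with one byte of Scod flags followed by the SGcod group: one byte of progression order (A), a two-byte layer count (B), and one byte selecting the multiple-component transform (C); the per-component SPCod parameters follow, which is why the writer budgets 9 bytes plus the SPCod size. A stand-alone parse of that five-byte prefix; the struct and sample bytes are hypothetical:

#include <stdio.h>

/* Hypothetical holder for the fixed five-byte prefix of a COD body. */
typedef struct {
    unsigned scod;        /* coding style flags             */
    unsigned prog;        /* SGcod (A): progression order   */
    unsigned num_layers;  /* SGcod (B): quality layers      */
    unsigned mct;         /* SGcod (C): component transform */
} cod_prefix_t;

static int cod_parse_prefix(const unsigned char *p, unsigned size,
                            cod_prefix_t *out)
{
    if (size < 5) {
        return 0;                       /* same minimum the reader enforces */
    }
    out->scod       = p[0];
    out->prog       = p[1];
    out->num_layers = ((unsigned)p[2] << 8) | p[3];  /* big-endian */
    out->mct        = p[4];
    return out->num_layers >= 1;        /* zero layers is invalid */
}

int main(void)
{
    /* Scod=0, LRCP(0), 12 layers, MCT on. */
    const unsigned char body[5] = { 0x00, 0x00, 0x00, 0x0C, 0x01 };
    cod_prefix_t c;

    if (cod_parse_prefix(body, sizeof body, &c)) {
        printf("scod=%u prog=%u layers=%u mct=%u\n",
               c.scod, c.prog, c.num_layers, c.mct);
    }
    return 0;
}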
-*/ -static OPJ_BOOL opj_j2k_read_cod(opj_j2k_t *p_j2k, - OPJ_BYTE * p_header_data, - OPJ_UINT32 p_header_size, - opj_event_mgr_t * p_manager - ) -{ - /* loop */ - OPJ_UINT32 i; - OPJ_UINT32 l_tmp; - opj_cp_t *l_cp = 00; - opj_tcp_t *l_tcp = 00; - opj_image_t *l_image = 00; - - /* preconditions */ - assert(p_header_data != 00); - assert(p_j2k != 00); - assert(p_manager != 00); - - l_image = p_j2k->m_private_image; - l_cp = &(p_j2k->m_cp); - - /* If we are in the first tile-part header of the current tile */ - l_tcp = (p_j2k->m_specific_param.m_decoder.m_state == J2K_STATE_TPH) ? - &l_cp->tcps[p_j2k->m_current_tile_number] : - p_j2k->m_specific_param.m_decoder.m_default_tcp; - -#if 0 - /* This check was added per https://github.com/uclouvain/openjpeg/commit/daed8cc9195555e101ab708a501af2dfe6d5e001 */ - /* but this is no longer necessary to handle issue476.jp2 */ - /* and this actually cause issues on legit files. See https://github.com/uclouvain/openjpeg/issues/1043 */ - /* Only one COD per tile */ - if (l_tcp->cod) { - opj_event_msg(p_manager, EVT_ERROR, - "COD marker already read. No more than one COD marker per tile.\n"); - return OPJ_FALSE; - } -#endif - l_tcp->cod = 1; - - /* Make sure room is sufficient */ - if (p_header_size < 5) { - opj_event_msg(p_manager, EVT_ERROR, "Error reading COD marker\n"); - return OPJ_FALSE; - } - - opj_read_bytes(p_header_data, &l_tcp->csty, 1); /* Scod */ - ++p_header_data; - /* Make sure we know how to decode this */ - if ((l_tcp->csty & ~(OPJ_UINT32)(J2K_CP_CSTY_PRT | J2K_CP_CSTY_SOP | - J2K_CP_CSTY_EPH)) != 0U) { - opj_event_msg(p_manager, EVT_ERROR, "Unknown Scod value in COD marker\n"); - return OPJ_FALSE; - } - opj_read_bytes(p_header_data, &l_tmp, 1); /* SGcod (A) */ - ++p_header_data; - l_tcp->prg = (OPJ_PROG_ORDER) l_tmp; - /* Make sure progression order is valid */ - if (l_tcp->prg > OPJ_CPRL) { - opj_event_msg(p_manager, EVT_ERROR, - "Unknown progression order in COD marker\n"); - l_tcp->prg = OPJ_PROG_UNKNOWN; - } - opj_read_bytes(p_header_data, &l_tcp->numlayers, 2); /* SGcod (B) */ - p_header_data += 2; - - if ((l_tcp->numlayers < 1U) || (l_tcp->numlayers > 65535U)) { - opj_event_msg(p_manager, EVT_ERROR, - "Invalid number of layers in COD marker : %d not in range [1-65535]\n", - l_tcp->numlayers); - return OPJ_FALSE; - } - - /* If user didn't set a number layer to decode take the max specify in the codestream. */ - if (l_cp->m_specific_param.m_dec.m_layer) { - l_tcp->num_layers_to_decode = l_cp->m_specific_param.m_dec.m_layer; - } else { - l_tcp->num_layers_to_decode = l_tcp->numlayers; - } - - opj_read_bytes(p_header_data, &l_tcp->mct, 1); /* SGcod (C) */ - ++p_header_data; - - p_header_size -= 5; - for (i = 0; i < l_image->numcomps; ++i) { - l_tcp->tccps[i].csty = l_tcp->csty & J2K_CCP_CSTY_PRT; - } - - if (! 
opj_j2k_read_SPCod_SPCoc(p_j2k, 0, p_header_data, &p_header_size, - p_manager)) { - opj_event_msg(p_manager, EVT_ERROR, "Error reading COD marker\n"); - return OPJ_FALSE; - } - - if (p_header_size != 0) { - opj_event_msg(p_manager, EVT_ERROR, "Error reading COD marker\n"); - return OPJ_FALSE; - } - - /* Apply the coding style to other components of the current tile or the m_default_tcp*/ - opj_j2k_copy_tile_component_parameters(p_j2k); - - /* Index */ -#ifdef WIP_REMOVE_MSD - if (p_j2k->cstr_info) { - /*opj_codestream_info_t *l_cstr_info = p_j2k->cstr_info;*/ - p_j2k->cstr_info->prog = l_tcp->prg; - p_j2k->cstr_info->numlayers = l_tcp->numlayers; - p_j2k->cstr_info->numdecompos = (OPJ_INT32*) opj_malloc( - l_image->numcomps * sizeof(OPJ_UINT32)); - if (!p_j2k->cstr_info->numdecompos) { - return OPJ_FALSE; - } - for (i = 0; i < l_image->numcomps; ++i) { - p_j2k->cstr_info->numdecompos[i] = l_tcp->tccps[i].numresolutions - 1; - } - } -#endif - - return OPJ_TRUE; -} - -static OPJ_BOOL opj_j2k_write_coc(opj_j2k_t *p_j2k, - OPJ_UINT32 p_comp_no, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager) -{ - OPJ_UINT32 l_coc_size, l_remaining_size; - OPJ_UINT32 l_comp_room; - - /* preconditions */ - assert(p_j2k != 00); - assert(p_manager != 00); - assert(p_stream != 00); - - l_comp_room = (p_j2k->m_private_image->numcomps <= 256) ? 1 : 2; - - l_coc_size = 5 + l_comp_room + opj_j2k_get_SPCod_SPCoc_size(p_j2k, - p_j2k->m_current_tile_number, p_comp_no); - - if (l_coc_size > p_j2k->m_specific_param.m_encoder.m_header_tile_data_size) { - OPJ_BYTE *new_header_tile_data; - /*p_j2k->m_specific_param.m_encoder.m_header_tile_data - = (OPJ_BYTE*)opj_realloc( - p_j2k->m_specific_param.m_encoder.m_header_tile_data, - l_coc_size);*/ - - new_header_tile_data = (OPJ_BYTE *) opj_realloc( - p_j2k->m_specific_param.m_encoder.m_header_tile_data, l_coc_size); - if (! 
new_header_tile_data) { - opj_free(p_j2k->m_specific_param.m_encoder.m_header_tile_data); - p_j2k->m_specific_param.m_encoder.m_header_tile_data = NULL; - p_j2k->m_specific_param.m_encoder.m_header_tile_data_size = 0; - opj_event_msg(p_manager, EVT_ERROR, "Not enough memory to write COC marker\n"); - return OPJ_FALSE; - } - p_j2k->m_specific_param.m_encoder.m_header_tile_data = new_header_tile_data; - p_j2k->m_specific_param.m_encoder.m_header_tile_data_size = l_coc_size; - } - - opj_j2k_write_coc_in_memory(p_j2k, p_comp_no, - p_j2k->m_specific_param.m_encoder.m_header_tile_data, &l_remaining_size, - p_manager); - - if (opj_stream_write_data(p_stream, - p_j2k->m_specific_param.m_encoder.m_header_tile_data, l_coc_size, - p_manager) != l_coc_size) { - return OPJ_FALSE; - } - - return OPJ_TRUE; -} - -static OPJ_BOOL opj_j2k_compare_coc(opj_j2k_t *p_j2k, - OPJ_UINT32 p_first_comp_no, OPJ_UINT32 p_second_comp_no) -{ - opj_cp_t *l_cp = NULL; - opj_tcp_t *l_tcp = NULL; - - /* preconditions */ - assert(p_j2k != 00); - - l_cp = &(p_j2k->m_cp); - l_tcp = &l_cp->tcps[p_j2k->m_current_tile_number]; - - if (l_tcp->tccps[p_first_comp_no].csty != l_tcp->tccps[p_second_comp_no].csty) { - return OPJ_FALSE; - } - - - return opj_j2k_compare_SPCod_SPCoc(p_j2k, p_j2k->m_current_tile_number, - p_first_comp_no, p_second_comp_no); -} - -static void opj_j2k_write_coc_in_memory(opj_j2k_t *p_j2k, - OPJ_UINT32 p_comp_no, - OPJ_BYTE * p_data, - OPJ_UINT32 * p_data_written, - opj_event_mgr_t * p_manager - ) -{ - opj_cp_t *l_cp = 00; - opj_tcp_t *l_tcp = 00; - OPJ_UINT32 l_coc_size, l_remaining_size; - OPJ_BYTE * l_current_data = 00; - opj_image_t *l_image = 00; - OPJ_UINT32 l_comp_room; - - /* preconditions */ - assert(p_j2k != 00); - assert(p_manager != 00); - - l_cp = &(p_j2k->m_cp); - l_tcp = &l_cp->tcps[p_j2k->m_current_tile_number]; - l_image = p_j2k->m_private_image; - l_comp_room = (l_image->numcomps <= 256) ? 1 : 2; - - l_coc_size = 5 + l_comp_room + opj_j2k_get_SPCod_SPCoc_size(p_j2k, - p_j2k->m_current_tile_number, p_comp_no); - l_remaining_size = l_coc_size; - - l_current_data = p_data; - - opj_write_bytes(l_current_data, J2K_MS_COC, - 2); /* COC */ - l_current_data += 2; - - opj_write_bytes(l_current_data, l_coc_size - 2, - 2); /* L_COC */ - l_current_data += 2; - - opj_write_bytes(l_current_data, p_comp_no, l_comp_room); /* Ccoc */ - l_current_data += l_comp_room; - - opj_write_bytes(l_current_data, l_tcp->tccps[p_comp_no].csty, - 1); /* Scoc */ - ++l_current_data; - - l_remaining_size -= (5 + l_comp_room); - opj_j2k_write_SPCod_SPCoc(p_j2k, p_j2k->m_current_tile_number, 0, - l_current_data, &l_remaining_size, p_manager); - * p_data_written = l_coc_size; -} - -static OPJ_UINT32 opj_j2k_get_max_coc_size(opj_j2k_t *p_j2k) -{ - OPJ_UINT32 i, j; - OPJ_UINT32 l_nb_comp; - OPJ_UINT32 l_nb_tiles; - OPJ_UINT32 l_max = 0; - - /* preconditions */ - - l_nb_tiles = p_j2k->m_cp.tw * p_j2k->m_cp.th ; - l_nb_comp = p_j2k->m_private_image->numcomps; - - for (i = 0; i < l_nb_tiles; ++i) { - for (j = 0; j < l_nb_comp; ++j) { - l_max = opj_uint_max(l_max, opj_j2k_get_SPCod_SPCoc_size(p_j2k, i, j)); - } - } - - return 6 + l_max; -} - -/** - * Reads a COC marker (Coding Style Component) - * @param p_header_data the data contained in the COC box. - * @param p_j2k the jpeg2000 codec. - * @param p_header_size the size of the data contained in the COC marker. - * @param p_manager the user event manager. 
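[Editor's note] A detail worth calling out in the COC sizing above: the component index field (Ccoc) occupies one byte when the image has at most 256 components and two bytes otherwise, so the total segment length is 5 + comp_room + the SPCoc parameter size. A stand-alone sketch of the sizing rule; the 9-byte SPCoc payload is a made-up example value:

#include <stdio.h>

/* Bytes used by the Ccoc/Cqcc component index field. */
static unsigned comp_room(unsigned numcomps)
{
    return (numcomps <= 256) ? 1 : 2;
}

/* COC size: marker(2) + Lcoc(2) + Ccoc(comp_room) + Scoc(1) + SPCoc. */
static unsigned coc_size(unsigned numcomps, unsigned spcoc_size)
{
    return 5 + comp_room(numcomps) + spcoc_size;
}

int main(void)
{
    printf("3 comps:   %u bytes\n", coc_size(3, 9));    /* 15 */
    printf("300 comps: %u bytes\n", coc_size(300, 9));  /* 16 */
    return 0;
}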
-*/ -static OPJ_BOOL opj_j2k_read_coc(opj_j2k_t *p_j2k, - OPJ_BYTE * p_header_data, - OPJ_UINT32 p_header_size, - opj_event_mgr_t * p_manager - ) -{ - opj_cp_t *l_cp = NULL; - opj_tcp_t *l_tcp = NULL; - opj_image_t *l_image = NULL; - OPJ_UINT32 l_comp_room; - OPJ_UINT32 l_comp_no; - - /* preconditions */ - assert(p_header_data != 00); - assert(p_j2k != 00); - assert(p_manager != 00); - - l_cp = &(p_j2k->m_cp); - l_tcp = (p_j2k->m_specific_param.m_decoder.m_state == J2K_STATE_TPH) - ? - &l_cp->tcps[p_j2k->m_current_tile_number] : - p_j2k->m_specific_param.m_decoder.m_default_tcp; - l_image = p_j2k->m_private_image; - - l_comp_room = l_image->numcomps <= 256 ? 1 : 2; - - /* make sure room is sufficient*/ - if (p_header_size < l_comp_room + 1) { - opj_event_msg(p_manager, EVT_ERROR, "Error reading COC marker\n"); - return OPJ_FALSE; - } - p_header_size -= l_comp_room + 1; - - opj_read_bytes(p_header_data, &l_comp_no, - l_comp_room); /* Ccoc */ - p_header_data += l_comp_room; - if (l_comp_no >= l_image->numcomps) { - opj_event_msg(p_manager, EVT_ERROR, - "Error reading COC marker (bad number of components)\n"); - return OPJ_FALSE; - } - - opj_read_bytes(p_header_data, &l_tcp->tccps[l_comp_no].csty, - 1); /* Scoc */ - ++p_header_data ; - - if (! opj_j2k_read_SPCod_SPCoc(p_j2k, l_comp_no, p_header_data, &p_header_size, - p_manager)) { - opj_event_msg(p_manager, EVT_ERROR, "Error reading COC marker\n"); - return OPJ_FALSE; - } - - if (p_header_size != 0) { - opj_event_msg(p_manager, EVT_ERROR, "Error reading COC marker\n"); - return OPJ_FALSE; - } - return OPJ_TRUE; -} - -static OPJ_BOOL opj_j2k_write_qcd(opj_j2k_t *p_j2k, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager - ) -{ - OPJ_UINT32 l_qcd_size, l_remaining_size; - OPJ_BYTE * l_current_data = 00; - - /* preconditions */ - assert(p_j2k != 00); - assert(p_manager != 00); - assert(p_stream != 00); - - l_qcd_size = 4 + opj_j2k_get_SQcd_SQcc_size(p_j2k, p_j2k->m_current_tile_number, - 0); - l_remaining_size = l_qcd_size; - - if (l_qcd_size > p_j2k->m_specific_param.m_encoder.m_header_tile_data_size) { - OPJ_BYTE *new_header_tile_data = (OPJ_BYTE *) opj_realloc( - p_j2k->m_specific_param.m_encoder.m_header_tile_data, l_qcd_size); - if (! new_header_tile_data) { - opj_free(p_j2k->m_specific_param.m_encoder.m_header_tile_data); - p_j2k->m_specific_param.m_encoder.m_header_tile_data = NULL; - p_j2k->m_specific_param.m_encoder.m_header_tile_data_size = 0; - opj_event_msg(p_manager, EVT_ERROR, "Not enough memory to write QCD marker\n"); - return OPJ_FALSE; - } - p_j2k->m_specific_param.m_encoder.m_header_tile_data = new_header_tile_data; - p_j2k->m_specific_param.m_encoder.m_header_tile_data_size = l_qcd_size; - } - - l_current_data = p_j2k->m_specific_param.m_encoder.m_header_tile_data; - - opj_write_bytes(l_current_data, J2K_MS_QCD, 2); /* QCD */ - l_current_data += 2; - - opj_write_bytes(l_current_data, l_qcd_size - 2, 2); /* L_QCD */ - l_current_data += 2; - - l_remaining_size -= 4; - - if (! 
opj_j2k_write_SQcd_SQcc(p_j2k, p_j2k->m_current_tile_number, 0, - l_current_data, &l_remaining_size, p_manager)) { - opj_event_msg(p_manager, EVT_ERROR, "Error writing QCD marker\n"); - return OPJ_FALSE; - } - - if (l_remaining_size != 0) { - opj_event_msg(p_manager, EVT_ERROR, "Error writing QCD marker\n"); - return OPJ_FALSE; - } - - if (opj_stream_write_data(p_stream, - p_j2k->m_specific_param.m_encoder.m_header_tile_data, l_qcd_size, - p_manager) != l_qcd_size) { - return OPJ_FALSE; - } - - return OPJ_TRUE; -} - -/** - * Reads a QCD marker (Quantization defaults) - * @param p_header_data the data contained in the QCD box. - * @param p_j2k the jpeg2000 codec. - * @param p_header_size the size of the data contained in the QCD marker. - * @param p_manager the user event manager. -*/ -static OPJ_BOOL opj_j2k_read_qcd(opj_j2k_t *p_j2k, - OPJ_BYTE * p_header_data, - OPJ_UINT32 p_header_size, - opj_event_mgr_t * p_manager - ) -{ - /* preconditions */ - assert(p_header_data != 00); - assert(p_j2k != 00); - assert(p_manager != 00); - - if (! opj_j2k_read_SQcd_SQcc(p_j2k, 0, p_header_data, &p_header_size, - p_manager)) { - opj_event_msg(p_manager, EVT_ERROR, "Error reading QCD marker\n"); - return OPJ_FALSE; - } - - if (p_header_size != 0) { - opj_event_msg(p_manager, EVT_ERROR, "Error reading QCD marker\n"); - return OPJ_FALSE; - } - - /* Apply the quantization parameters to other components of the current tile or the m_default_tcp */ - opj_j2k_copy_tile_quantization_parameters(p_j2k); - - return OPJ_TRUE; -} - -static OPJ_BOOL opj_j2k_write_qcc(opj_j2k_t *p_j2k, - OPJ_UINT32 p_comp_no, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager - ) -{ - OPJ_UINT32 l_qcc_size, l_remaining_size; - - /* preconditions */ - assert(p_j2k != 00); - assert(p_manager != 00); - assert(p_stream != 00); - - l_qcc_size = 5 + opj_j2k_get_SQcd_SQcc_size(p_j2k, p_j2k->m_current_tile_number, - p_comp_no); - l_qcc_size += p_j2k->m_private_image->numcomps <= 256 ? 0 : 1; - l_remaining_size = l_qcc_size; - - if (l_qcc_size > p_j2k->m_specific_param.m_encoder.m_header_tile_data_size) { - OPJ_BYTE *new_header_tile_data = (OPJ_BYTE *) opj_realloc( - p_j2k->m_specific_param.m_encoder.m_header_tile_data, l_qcc_size); - if (! 
new_header_tile_data) { - opj_free(p_j2k->m_specific_param.m_encoder.m_header_tile_data); - p_j2k->m_specific_param.m_encoder.m_header_tile_data = NULL; - p_j2k->m_specific_param.m_encoder.m_header_tile_data_size = 0; - opj_event_msg(p_manager, EVT_ERROR, "Not enough memory to write QCC marker\n"); - return OPJ_FALSE; - } - p_j2k->m_specific_param.m_encoder.m_header_tile_data = new_header_tile_data; - p_j2k->m_specific_param.m_encoder.m_header_tile_data_size = l_qcc_size; - } - - opj_j2k_write_qcc_in_memory(p_j2k, p_comp_no, - p_j2k->m_specific_param.m_encoder.m_header_tile_data, &l_remaining_size, - p_manager); - - if (opj_stream_write_data(p_stream, - p_j2k->m_specific_param.m_encoder.m_header_tile_data, l_qcc_size, - p_manager) != l_qcc_size) { - return OPJ_FALSE; - } - - return OPJ_TRUE; -} - -static OPJ_BOOL opj_j2k_compare_qcc(opj_j2k_t *p_j2k, - OPJ_UINT32 p_first_comp_no, OPJ_UINT32 p_second_comp_no) -{ - return opj_j2k_compare_SQcd_SQcc(p_j2k, p_j2k->m_current_tile_number, - p_first_comp_no, p_second_comp_no); -} - -static void opj_j2k_write_qcc_in_memory(opj_j2k_t *p_j2k, - OPJ_UINT32 p_comp_no, - OPJ_BYTE * p_data, - OPJ_UINT32 * p_data_written, - opj_event_mgr_t * p_manager - ) -{ - OPJ_UINT32 l_qcc_size, l_remaining_size; - OPJ_BYTE * l_current_data = 00; - - /* preconditions */ - assert(p_j2k != 00); - assert(p_manager != 00); - - l_qcc_size = 6 + opj_j2k_get_SQcd_SQcc_size(p_j2k, p_j2k->m_current_tile_number, - p_comp_no); - l_remaining_size = l_qcc_size; - - l_current_data = p_data; - - opj_write_bytes(l_current_data, J2K_MS_QCC, 2); /* QCC */ - l_current_data += 2; - - if (p_j2k->m_private_image->numcomps <= 256) { - --l_qcc_size; - - opj_write_bytes(l_current_data, l_qcc_size - 2, 2); /* L_QCC */ - l_current_data += 2; - - opj_write_bytes(l_current_data, p_comp_no, 1); /* Cqcc */ - ++l_current_data; - - /* in the case only one byte is sufficient the last byte allocated is useless -> still do -6 for available */ - l_remaining_size -= 6; - } else { - opj_write_bytes(l_current_data, l_qcc_size - 2, 2); /* L_QCC */ - l_current_data += 2; - - opj_write_bytes(l_current_data, p_comp_no, 2); /* Cqcc */ - l_current_data += 2; - - l_remaining_size -= 6; - } - - opj_j2k_write_SQcd_SQcc(p_j2k, p_j2k->m_current_tile_number, p_comp_no, - l_current_data, &l_remaining_size, p_manager); - - *p_data_written = l_qcc_size; -} - -static OPJ_UINT32 opj_j2k_get_max_qcc_size(opj_j2k_t *p_j2k) -{ - return opj_j2k_get_max_coc_size(p_j2k); -} - -/** - * Reads a QCC marker (Quantization component) - * @param p_header_data the data contained in the QCC box. - * @param p_j2k the jpeg2000 codec. - * @param p_header_size the size of the data contained in the QCC marker. - * @param p_manager the user event manager. 
-*/ -static OPJ_BOOL opj_j2k_read_qcc(opj_j2k_t *p_j2k, - OPJ_BYTE * p_header_data, - OPJ_UINT32 p_header_size, - opj_event_mgr_t * p_manager - ) -{ - OPJ_UINT32 l_num_comp, l_comp_no; - - /* preconditions */ - assert(p_header_data != 00); - assert(p_j2k != 00); - assert(p_manager != 00); - - l_num_comp = p_j2k->m_private_image->numcomps; - - if (l_num_comp <= 256) { - if (p_header_size < 1) { - opj_event_msg(p_manager, EVT_ERROR, "Error reading QCC marker\n"); - return OPJ_FALSE; - } - opj_read_bytes(p_header_data, &l_comp_no, 1); - ++p_header_data; - --p_header_size; - } else { - if (p_header_size < 2) { - opj_event_msg(p_manager, EVT_ERROR, "Error reading QCC marker\n"); - return OPJ_FALSE; - } - opj_read_bytes(p_header_data, &l_comp_no, 2); - p_header_data += 2; - p_header_size -= 2; - } - -#ifdef USE_JPWL - if (p_j2k->m_cp.correct) { - - static OPJ_UINT32 backup_compno = 0; - - /* compno is negative or larger than the number of components!!! */ - if (/*(l_comp_no < 0) ||*/ (l_comp_no >= l_num_comp)) { - opj_event_msg(p_manager, EVT_ERROR, - "JPWL: bad component number in QCC (%d out of a maximum of %d)\n", - l_comp_no, l_num_comp); - if (!JPWL_ASSUME) { - opj_event_msg(p_manager, EVT_ERROR, "JPWL: giving up\n"); - return OPJ_FALSE; - } - /* we try to correct */ - l_comp_no = backup_compno % l_num_comp; - opj_event_msg(p_manager, EVT_WARNING, "- trying to adjust this\n" - "- setting component number to %d\n", - l_comp_no); - } - - /* keep your private count of tiles */ - backup_compno++; - }; -#endif /* USE_JPWL */ - - if (l_comp_no >= p_j2k->m_private_image->numcomps) { - opj_event_msg(p_manager, EVT_ERROR, - "Invalid component number: %d, regarding the number of components %d\n", - l_comp_no, p_j2k->m_private_image->numcomps); - return OPJ_FALSE; - } - - if (! opj_j2k_read_SQcd_SQcc(p_j2k, l_comp_no, p_header_data, &p_header_size, - p_manager)) { - opj_event_msg(p_manager, EVT_ERROR, "Error reading QCC marker\n"); - return OPJ_FALSE; - } - - if (p_header_size != 0) { - opj_event_msg(p_manager, EVT_ERROR, "Error reading QCC marker\n"); - return OPJ_FALSE; - } - - return OPJ_TRUE; -} - -static OPJ_BOOL opj_j2k_write_poc(opj_j2k_t *p_j2k, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager - ) -{ - OPJ_UINT32 l_nb_comp; - OPJ_UINT32 l_nb_poc; - OPJ_UINT32 l_poc_size; - OPJ_UINT32 l_written_size = 0; - opj_tcp_t *l_tcp = 00; - OPJ_UINT32 l_poc_room; - - /* preconditions */ - assert(p_j2k != 00); - assert(p_manager != 00); - assert(p_stream != 00); - - l_tcp = &p_j2k->m_cp.tcps[p_j2k->m_current_tile_number]; - l_nb_comp = p_j2k->m_private_image->numcomps; - l_nb_poc = 1 + l_tcp->numpocs; - - if (l_nb_comp <= 256) { - l_poc_room = 1; - } else { - l_poc_room = 2; - } - l_poc_size = 4 + (5 + 2 * l_poc_room) * l_nb_poc; - - if (l_poc_size > p_j2k->m_specific_param.m_encoder.m_header_tile_data_size) { - OPJ_BYTE *new_header_tile_data = (OPJ_BYTE *) opj_realloc( - p_j2k->m_specific_param.m_encoder.m_header_tile_data, l_poc_size); - if (! 
new_header_tile_data) { - opj_free(p_j2k->m_specific_param.m_encoder.m_header_tile_data); - p_j2k->m_specific_param.m_encoder.m_header_tile_data = NULL; - p_j2k->m_specific_param.m_encoder.m_header_tile_data_size = 0; - opj_event_msg(p_manager, EVT_ERROR, "Not enough memory to write POC marker\n"); - return OPJ_FALSE; - } - p_j2k->m_specific_param.m_encoder.m_header_tile_data = new_header_tile_data; - p_j2k->m_specific_param.m_encoder.m_header_tile_data_size = l_poc_size; - } - - opj_j2k_write_poc_in_memory(p_j2k, - p_j2k->m_specific_param.m_encoder.m_header_tile_data, &l_written_size, - p_manager); - - if (opj_stream_write_data(p_stream, - p_j2k->m_specific_param.m_encoder.m_header_tile_data, l_poc_size, - p_manager) != l_poc_size) { - return OPJ_FALSE; - } - - return OPJ_TRUE; -} - -static void opj_j2k_write_poc_in_memory(opj_j2k_t *p_j2k, - OPJ_BYTE * p_data, - OPJ_UINT32 * p_data_written, - opj_event_mgr_t * p_manager - ) -{ - OPJ_UINT32 i; - OPJ_BYTE * l_current_data = 00; - OPJ_UINT32 l_nb_comp; - OPJ_UINT32 l_nb_poc; - OPJ_UINT32 l_poc_size; - opj_image_t *l_image = 00; - opj_tcp_t *l_tcp = 00; - opj_tccp_t *l_tccp = 00; - opj_poc_t *l_current_poc = 00; - OPJ_UINT32 l_poc_room; - - /* preconditions */ - assert(p_j2k != 00); - assert(p_manager != 00); - - OPJ_UNUSED(p_manager); - - l_tcp = &p_j2k->m_cp.tcps[p_j2k->m_current_tile_number]; - l_tccp = &l_tcp->tccps[0]; - l_image = p_j2k->m_private_image; - l_nb_comp = l_image->numcomps; - l_nb_poc = 1 + l_tcp->numpocs; - - if (l_nb_comp <= 256) { - l_poc_room = 1; - } else { - l_poc_room = 2; - } - - l_poc_size = 4 + (5 + 2 * l_poc_room) * l_nb_poc; - - l_current_data = p_data; - - opj_write_bytes(l_current_data, J2K_MS_POC, - 2); /* POC */ - l_current_data += 2; - - opj_write_bytes(l_current_data, l_poc_size - 2, - 2); /* Lpoc */ - l_current_data += 2; - - l_current_poc = l_tcp->pocs; - for (i = 0; i < l_nb_poc; ++i) { - opj_write_bytes(l_current_data, l_current_poc->resno0, - 1); /* RSpoc_i */ - ++l_current_data; - - opj_write_bytes(l_current_data, l_current_poc->compno0, - l_poc_room); /* CSpoc_i */ - l_current_data += l_poc_room; - - opj_write_bytes(l_current_data, l_current_poc->layno1, - 2); /* LYEpoc_i */ - l_current_data += 2; - - opj_write_bytes(l_current_data, l_current_poc->resno1, - 1); /* REpoc_i */ - ++l_current_data; - - opj_write_bytes(l_current_data, l_current_poc->compno1, - l_poc_room); /* CEpoc_i */ - l_current_data += l_poc_room; - - opj_write_bytes(l_current_data, (OPJ_UINT32)l_current_poc->prg, - 1); /* Ppoc_i */ - ++l_current_data; - - /* change the value of the max layer according to the actual number of layers in the file, components and resolutions*/ - l_current_poc->layno1 = (OPJ_UINT32)opj_int_min((OPJ_INT32) - l_current_poc->layno1, (OPJ_INT32)l_tcp->numlayers); - l_current_poc->resno1 = (OPJ_UINT32)opj_int_min((OPJ_INT32) - l_current_poc->resno1, (OPJ_INT32)l_tccp->numresolutions); - l_current_poc->compno1 = (OPJ_UINT32)opj_int_min((OPJ_INT32) - l_current_poc->compno1, (OPJ_INT32)l_nb_comp); - - ++l_current_poc; - } - - *p_data_written = l_poc_size; -} - -static OPJ_UINT32 opj_j2k_get_max_poc_size(opj_j2k_t *p_j2k) -{ - opj_tcp_t * l_tcp = 00; - OPJ_UINT32 l_nb_tiles = 0; - OPJ_UINT32 l_max_poc = 0; - OPJ_UINT32 i; - - l_tcp = p_j2k->m_cp.tcps; - l_nb_tiles = p_j2k->m_cp.th * p_j2k->m_cp.tw; - - for (i = 0; i < l_nb_tiles; ++i) { - l_max_poc = opj_uint_max(l_max_poc, l_tcp->numpocs); - ++l_tcp; - } - - ++l_max_poc; - - return 4 + 9 * l_max_poc; -} - -static OPJ_UINT32 opj_j2k_get_max_toc_size(opj_j2k_t 
*p_j2k) -{ - OPJ_UINT32 i; - OPJ_UINT32 l_nb_tiles; - OPJ_UINT32 l_max = 0; - opj_tcp_t * l_tcp = 00; - - l_tcp = p_j2k->m_cp.tcps; - l_nb_tiles = p_j2k->m_cp.tw * p_j2k->m_cp.th ; - - for (i = 0; i < l_nb_tiles; ++i) { - l_max = opj_uint_max(l_max, l_tcp->m_nb_tile_parts); - - ++l_tcp; - } - - return 12 * l_max; -} - -static OPJ_UINT32 opj_j2k_get_specific_header_sizes(opj_j2k_t *p_j2k) -{ - OPJ_UINT32 l_nb_bytes = 0; - OPJ_UINT32 l_nb_comps; - OPJ_UINT32 l_coc_bytes, l_qcc_bytes; - - l_nb_comps = p_j2k->m_private_image->numcomps - 1; - l_nb_bytes += opj_j2k_get_max_toc_size(p_j2k); - - if (!(OPJ_IS_CINEMA(p_j2k->m_cp.rsiz))) { - l_coc_bytes = opj_j2k_get_max_coc_size(p_j2k); - l_nb_bytes += l_nb_comps * l_coc_bytes; - - l_qcc_bytes = opj_j2k_get_max_qcc_size(p_j2k); - l_nb_bytes += l_nb_comps * l_qcc_bytes; - } - - l_nb_bytes += opj_j2k_get_max_poc_size(p_j2k); - - /*** DEVELOPER CORNER, Add room for your headers ***/ - - return l_nb_bytes; -} - -/** - * Reads a POC marker (Progression Order Change) - * - * @param p_header_data the data contained in the POC box. - * @param p_j2k the jpeg2000 codec. - * @param p_header_size the size of the data contained in the POC marker. - * @param p_manager the user event manager. -*/ -static OPJ_BOOL opj_j2k_read_poc(opj_j2k_t *p_j2k, - OPJ_BYTE * p_header_data, - OPJ_UINT32 p_header_size, - opj_event_mgr_t * p_manager - ) -{ - OPJ_UINT32 i, l_nb_comp, l_tmp; - opj_image_t * l_image = 00; - OPJ_UINT32 l_old_poc_nb, l_current_poc_nb, l_current_poc_remaining; - OPJ_UINT32 l_chunk_size, l_comp_room; - - opj_cp_t *l_cp = 00; - opj_tcp_t *l_tcp = 00; - opj_poc_t *l_current_poc = 00; - - /* preconditions */ - assert(p_header_data != 00); - assert(p_j2k != 00); - assert(p_manager != 00); - - l_image = p_j2k->m_private_image; - l_nb_comp = l_image->numcomps; - if (l_nb_comp <= 256) { - l_comp_room = 1; - } else { - l_comp_room = 2; - } - l_chunk_size = 5 + 2 * l_comp_room; - l_current_poc_nb = p_header_size / l_chunk_size; - l_current_poc_remaining = p_header_size % l_chunk_size; - - if ((l_current_poc_nb <= 0) || (l_current_poc_remaining != 0)) { - opj_event_msg(p_manager, EVT_ERROR, "Error reading POC marker\n"); - return OPJ_FALSE; - } - - l_cp = &(p_j2k->m_cp); - l_tcp = (p_j2k->m_specific_param.m_decoder.m_state == J2K_STATE_TPH) ? - &l_cp->tcps[p_j2k->m_current_tile_number] : - p_j2k->m_specific_param.m_decoder.m_default_tcp; - l_old_poc_nb = l_tcp->POC ? 
l_tcp->numpocs + 1 : 0; - l_current_poc_nb += l_old_poc_nb; - - if (l_current_poc_nb >= J2K_MAX_POCS) { - opj_event_msg(p_manager, EVT_ERROR, "Too many POCs %d\n", l_current_poc_nb); - return OPJ_FALSE; - } - - /* now poc is in use.*/ - l_tcp->POC = 1; - - l_current_poc = &l_tcp->pocs[l_old_poc_nb]; - for (i = l_old_poc_nb; i < l_current_poc_nb; ++i) { - opj_read_bytes(p_header_data, &(l_current_poc->resno0), - 1); /* RSpoc_i */ - ++p_header_data; - opj_read_bytes(p_header_data, &(l_current_poc->compno0), - l_comp_room); /* CSpoc_i */ - p_header_data += l_comp_room; - opj_read_bytes(p_header_data, &(l_current_poc->layno1), - 2); /* LYEpoc_i */ - /* make sure layer end is in acceptable bounds */ - l_current_poc->layno1 = opj_uint_min(l_current_poc->layno1, l_tcp->numlayers); - p_header_data += 2; - opj_read_bytes(p_header_data, &(l_current_poc->resno1), - 1); /* REpoc_i */ - ++p_header_data; - opj_read_bytes(p_header_data, &(l_current_poc->compno1), - l_comp_room); /* CEpoc_i */ - p_header_data += l_comp_room; - opj_read_bytes(p_header_data, &l_tmp, - 1); /* Ppoc_i */ - ++p_header_data; - l_current_poc->prg = (OPJ_PROG_ORDER) l_tmp; - /* make sure comp is in acceptable bounds */ - l_current_poc->compno1 = opj_uint_min(l_current_poc->compno1, l_nb_comp); - ++l_current_poc; - } - - l_tcp->numpocs = l_current_poc_nb - 1; - return OPJ_TRUE; -} - -/** - * Reads a CRG marker (Component registration) - * - * @param p_header_data the data contained in the TLM box. - * @param p_j2k the jpeg2000 codec. - * @param p_header_size the size of the data contained in the TLM marker. - * @param p_manager the user event manager. -*/ -static OPJ_BOOL opj_j2k_read_crg(opj_j2k_t *p_j2k, - OPJ_BYTE * p_header_data, - OPJ_UINT32 p_header_size, - opj_event_mgr_t * p_manager - ) -{ - OPJ_UINT32 l_nb_comp; - /* preconditions */ - assert(p_header_data != 00); - assert(p_j2k != 00); - assert(p_manager != 00); - - OPJ_UNUSED(p_header_data); - - l_nb_comp = p_j2k->m_private_image->numcomps; - - if (p_header_size != l_nb_comp * 4) { - opj_event_msg(p_manager, EVT_ERROR, "Error reading CRG marker\n"); - return OPJ_FALSE; - } - /* Do not care of this at the moment since only local variables are set here */ - /* - for - (i = 0; i < l_nb_comp; ++i) - { - opj_read_bytes(p_header_data,&l_Xcrg_i,2); // Xcrg_i - p_header_data+=2; - opj_read_bytes(p_header_data,&l_Ycrg_i,2); // Xcrg_i - p_header_data+=2; - } - */ - return OPJ_TRUE; -} - -/** - * Reads a TLM marker (Tile Length Marker) - * - * @param p_header_data the data contained in the TLM box. - * @param p_j2k the jpeg2000 codec. - * @param p_header_size the size of the data contained in the TLM marker. - * @param p_manager the user event manager. 
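opj_j2k_read_poc above only accepts a payload that divides evenly into (5 + 2 * l_comp_room)-byte chunks. A small standalone sketch of that sizing rule; names are illustrative:

#include <stdio.h>

/* Number of progression changes in a POC payload, or 0 if the payload
 * is empty or ragged (the decoder's rejection above). */
static unsigned poc_chunks(unsigned payload_size, unsigned numcomps)
{
    unsigned comp_room = (numcomps <= 256) ? 1 : 2;
    unsigned chunk = 5 + 2 * comp_room;  /* RSpoc..Ppoc for one change */
    return (payload_size % chunk == 0) ? payload_size / chunk : 0;
}

int main(void)
{
    printf("%u\n", poc_chunks(14, 3));   /* two 7-byte chunks -> 2 */
    printf("%u\n", poc_chunks(15, 3));   /* ragged -> 0, rejected  */
    return 0;
}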
-*/ -static OPJ_BOOL opj_j2k_read_tlm(opj_j2k_t *p_j2k, - OPJ_BYTE * p_header_data, - OPJ_UINT32 p_header_size, - opj_event_mgr_t * p_manager - ) -{ - OPJ_UINT32 l_Ztlm, l_Stlm, l_ST, l_SP, l_tot_num_tp_remaining, l_quotient, - l_Ptlm_size; - /* preconditions */ - assert(p_header_data != 00); - assert(p_j2k != 00); - assert(p_manager != 00); - - OPJ_UNUSED(p_j2k); - - if (p_header_size < 2) { - opj_event_msg(p_manager, EVT_ERROR, "Error reading TLM marker\n"); - return OPJ_FALSE; - } - p_header_size -= 2; - - opj_read_bytes(p_header_data, &l_Ztlm, - 1); /* Ztlm */ - ++p_header_data; - opj_read_bytes(p_header_data, &l_Stlm, - 1); /* Stlm */ - ++p_header_data; - - l_ST = ((l_Stlm >> 4) & 0x3); - l_SP = (l_Stlm >> 6) & 0x1; - - l_Ptlm_size = (l_SP + 1) * 2; - l_quotient = l_Ptlm_size + l_ST; - - l_tot_num_tp_remaining = p_header_size % l_quotient; - - if (l_tot_num_tp_remaining != 0) { - opj_event_msg(p_manager, EVT_ERROR, "Error reading TLM marker\n"); - return OPJ_FALSE; - } - /* FIXME Do not care of this at the moment since only local variables are set here */ - /* - for - (i = 0; i < l_tot_num_tp; ++i) - { - opj_read_bytes(p_header_data,&l_Ttlm_i,l_ST); // Ttlm_i - p_header_data += l_ST; - opj_read_bytes(p_header_data,&l_Ptlm_i,l_Ptlm_size); // Ptlm_i - p_header_data += l_Ptlm_size; - }*/ - return OPJ_TRUE; -} - -/** - * Reads a PLM marker (Packet length, main header marker) - * - * @param p_header_data the data contained in the TLM box. - * @param p_j2k the jpeg2000 codec. - * @param p_header_size the size of the data contained in the TLM marker. - * @param p_manager the user event manager. -*/ -static OPJ_BOOL opj_j2k_read_plm(opj_j2k_t *p_j2k, - OPJ_BYTE * p_header_data, - OPJ_UINT32 p_header_size, - opj_event_mgr_t * p_manager - ) -{ - /* preconditions */ - assert(p_header_data != 00); - assert(p_j2k != 00); - assert(p_manager != 00); - - OPJ_UNUSED(p_j2k); - OPJ_UNUSED(p_header_data); - - if (p_header_size < 1) { - opj_event_msg(p_manager, EVT_ERROR, "Error reading PLM marker\n"); - return OPJ_FALSE; - } - /* Do not care of this at the moment since only local variables are set here */ - /* - opj_read_bytes(p_header_data,&l_Zplm,1); // Zplm - ++p_header_data; - --p_header_size; - - while - (p_header_size > 0) - { - opj_read_bytes(p_header_data,&l_Nplm,1); // Nplm - ++p_header_data; - p_header_size -= (1+l_Nplm); - if - (p_header_size < 0) - { - opj_event_msg(p_manager, EVT_ERROR, "Error reading PLM marker\n"); - return false; - } - for - (i = 0; i < l_Nplm; ++i) - { - opj_read_bytes(p_header_data,&l_tmp,1); // Iplm_ij - ++p_header_data; - // take only the last seven bytes - l_packet_len |= (l_tmp & 0x7f); - if - (l_tmp & 0x80) - { - l_packet_len <<= 7; - } - else - { - // store packet length and proceed to next packet - l_packet_len = 0; - } - } - if - (l_packet_len != 0) - { - opj_event_msg(p_manager, EVT_ERROR, "Error reading PLM marker\n"); - return false; - } - } - */ - return OPJ_TRUE; -} - -/** - * Reads a PLT marker (Packet length, tile-part header) - * - * @param p_header_data the data contained in the PLT box. - * @param p_j2k the jpeg2000 codec. - * @param p_header_size the size of the data contained in the PLT marker. - * @param p_manager the user event manager. 
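The Stlm byte parsed above packs two size selectors: ST (bits 4-5) gives the width in bytes of each Ttlm tile index and SP (bit 6) chooses 16- or 32-bit Ptlm lengths, so each TLM record occupies ST + (SP + 1) * 2 bytes. A minimal sketch of the decode, using the 0x50 value that opj_j2k_write_tlm emits:

#include <stdio.h>

int main(void)
{
    unsigned stlm = 0x50;
    unsigned st = (stlm >> 4) & 0x3;       /* Ttlm index width (bytes)  */
    unsigned sp = (stlm >> 6) & 0x1;       /* Ptlm width: 16 or 32 bits */
    unsigned ptlm_size = (sp + 1) * 2;
    unsigned record = st + ptlm_size;      /* bytes per tile-part entry */
    printf("ST=%u SP=%u -> %u bytes/entry\n", st, sp, record);   /* 5  */
    return 0;
}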
-*/ -static OPJ_BOOL opj_j2k_read_plt(opj_j2k_t *p_j2k, - OPJ_BYTE * p_header_data, - OPJ_UINT32 p_header_size, - opj_event_mgr_t * p_manager - ) -{ - OPJ_UINT32 l_Zplt, l_tmp, l_packet_len = 0, i; - - /* preconditions */ - assert(p_header_data != 00); - assert(p_j2k != 00); - assert(p_manager != 00); - - OPJ_UNUSED(p_j2k); - - if (p_header_size < 1) { - opj_event_msg(p_manager, EVT_ERROR, "Error reading PLT marker\n"); - return OPJ_FALSE; - } - - opj_read_bytes(p_header_data, &l_Zplt, 1); /* Zplt */ - ++p_header_data; - --p_header_size; - - for (i = 0; i < p_header_size; ++i) { - opj_read_bytes(p_header_data, &l_tmp, 1); /* Iplt_ij */ - ++p_header_data; - /* take only the last seven bytes */ - l_packet_len |= (l_tmp & 0x7f); - if (l_tmp & 0x80) { - l_packet_len <<= 7; - } else { - /* store packet length and proceed to next packet */ - l_packet_len = 0; - } - } - - if (l_packet_len != 0) { - opj_event_msg(p_manager, EVT_ERROR, "Error reading PLT marker\n"); - return OPJ_FALSE; - } - - return OPJ_TRUE; -} - -/** - * Reads a PPM marker (Packed packet headers, main header) - * - * @param p_header_data the data contained in the POC box. - * @param p_j2k the jpeg2000 codec. - * @param p_header_size the size of the data contained in the POC marker. - * @param p_manager the user event manager. - */ - -static OPJ_BOOL opj_j2k_read_ppm( - opj_j2k_t *p_j2k, - OPJ_BYTE * p_header_data, - OPJ_UINT32 p_header_size, - opj_event_mgr_t * p_manager) -{ - opj_cp_t *l_cp = 00; - OPJ_UINT32 l_Z_ppm; - - /* preconditions */ - assert(p_header_data != 00); - assert(p_j2k != 00); - assert(p_manager != 00); - - /* We need to have the Z_ppm element + 1 byte of Nppm/Ippm at minimum */ - if (p_header_size < 2) { - opj_event_msg(p_manager, EVT_ERROR, "Error reading PPM marker\n"); - return OPJ_FALSE; - } - - l_cp = &(p_j2k->m_cp); - l_cp->ppm = 1; - - opj_read_bytes(p_header_data, &l_Z_ppm, 1); /* Z_ppm */ - ++p_header_data; - --p_header_size; - - /* check allocation needed */ - if (l_cp->ppm_markers == NULL) { /* first PPM marker */ - OPJ_UINT32 l_newCount = l_Z_ppm + 1U; /* can't overflow, l_Z_ppm is UINT8 */ - assert(l_cp->ppm_markers_count == 0U); - - l_cp->ppm_markers = (opj_ppx *) opj_calloc(l_newCount, sizeof(opj_ppx)); - if (l_cp->ppm_markers == NULL) { - opj_event_msg(p_manager, EVT_ERROR, "Not enough memory to read PPM marker\n"); - return OPJ_FALSE; - } - l_cp->ppm_markers_count = l_newCount; - } else if (l_cp->ppm_markers_count <= l_Z_ppm) { - OPJ_UINT32 l_newCount = l_Z_ppm + 1U; /* can't overflow, l_Z_ppm is UINT8 */ - opj_ppx *new_ppm_markers; - new_ppm_markers = (opj_ppx *) opj_realloc(l_cp->ppm_markers, - l_newCount * sizeof(opj_ppx)); - if (new_ppm_markers == NULL) { - /* clean up to be done on l_cp destruction */ - opj_event_msg(p_manager, EVT_ERROR, "Not enough memory to read PPM marker\n"); - return OPJ_FALSE; - } - l_cp->ppm_markers = new_ppm_markers; - memset(l_cp->ppm_markers + l_cp->ppm_markers_count, 0, - (l_newCount - l_cp->ppm_markers_count) * sizeof(opj_ppx)); - l_cp->ppm_markers_count = l_newCount; - } - - if (l_cp->ppm_markers[l_Z_ppm].m_data != NULL) { - /* clean up to be done on l_cp destruction */ - opj_event_msg(p_manager, EVT_ERROR, "Zppm %u already read\n", l_Z_ppm); - return OPJ_FALSE; - } - - l_cp->ppm_markers[l_Z_ppm].m_data = (OPJ_BYTE *) opj_malloc(p_header_size); - if (l_cp->ppm_markers[l_Z_ppm].m_data == NULL) { - /* clean up to be done on l_cp destruction */ - opj_event_msg(p_manager, EVT_ERROR, "Not enough memory to read PPM marker\n"); - return OPJ_FALSE; - } - 
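The loop in opj_j2k_read_plt above treats Iplt bytes as a 7-bits-per-byte length encoding, with the top bit set on every byte except the last of each length. A standalone sketch of the decode; the sample bytes are made up:

#include <stddef.h>
#include <stdio.h>

int main(void)
{
    /* 0x87 0x04 -> (0x07 << 7) | 0x04 = 900; 0x05 -> 5 */
    const unsigned char iplt[] = { 0x87, 0x04, 0x05 };
    unsigned len = 0;

    for (size_t i = 0; i < sizeof(iplt); ++i) {
        len = (len << 7) | (iplt[i] & 0x7fu);   /* low seven bits      */
        if ((iplt[i] & 0x80u) == 0) {           /* top bit clear: done */
            printf("packet length %u\n", len);
            len = 0;
        }
    }
    return 0;
}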
l_cp->ppm_markers[l_Z_ppm].m_data_size = p_header_size; - memcpy(l_cp->ppm_markers[l_Z_ppm].m_data, p_header_data, p_header_size); - - return OPJ_TRUE; -} - -/** - * Merges all PPM markers read (Packed headers, main header) - * - * @param p_cp main coding parameters. - * @param p_manager the user event manager. - */ -static OPJ_BOOL opj_j2k_merge_ppm(opj_cp_t *p_cp, opj_event_mgr_t * p_manager) -{ - OPJ_UINT32 i, l_ppm_data_size, l_N_ppm_remaining; - - /* preconditions */ - assert(p_cp != 00); - assert(p_manager != 00); - assert(p_cp->ppm_buffer == NULL); - - if (p_cp->ppm == 0U) { - return OPJ_TRUE; - } - - l_ppm_data_size = 0U; - l_N_ppm_remaining = 0U; - for (i = 0U; i < p_cp->ppm_markers_count; ++i) { - if (p_cp->ppm_markers[i].m_data != - NULL) { /* standard doesn't seem to require contiguous Zppm */ - OPJ_UINT32 l_N_ppm; - OPJ_UINT32 l_data_size = p_cp->ppm_markers[i].m_data_size; - const OPJ_BYTE* l_data = p_cp->ppm_markers[i].m_data; - - if (l_N_ppm_remaining >= l_data_size) { - l_N_ppm_remaining -= l_data_size; - l_data_size = 0U; - } else { - l_data += l_N_ppm_remaining; - l_data_size -= l_N_ppm_remaining; - l_N_ppm_remaining = 0U; - } - - if (l_data_size > 0U) { - do { - /* read Nppm */ - if (l_data_size < 4U) { - /* clean up to be done on l_cp destruction */ - opj_event_msg(p_manager, EVT_ERROR, "Not enough bytes to read Nppm\n"); - return OPJ_FALSE; - } - opj_read_bytes(l_data, &l_N_ppm, 4); - l_data += 4; - l_data_size -= 4; - l_ppm_data_size += - l_N_ppm; /* can't overflow, max 256 markers of max 65536 bytes, that is when PPM markers are not corrupted which is checked elsewhere */ - - if (l_data_size >= l_N_ppm) { - l_data_size -= l_N_ppm; - l_data += l_N_ppm; - } else { - l_N_ppm_remaining = l_N_ppm - l_data_size; - l_data_size = 0U; - } - } while (l_data_size > 0U); - } - } - } - - if (l_N_ppm_remaining != 0U) { - /* clean up to be done on l_cp destruction */ - opj_event_msg(p_manager, EVT_ERROR, "Corrupted PPM markers\n"); - return OPJ_FALSE; - } - - p_cp->ppm_buffer = (OPJ_BYTE *) opj_malloc(l_ppm_data_size); - if (p_cp->ppm_buffer == 00) { - opj_event_msg(p_manager, EVT_ERROR, "Not enough memory to read PPM marker\n"); - return OPJ_FALSE; - } - p_cp->ppm_len = l_ppm_data_size; - l_ppm_data_size = 0U; - l_N_ppm_remaining = 0U; - for (i = 0U; i < p_cp->ppm_markers_count; ++i) { - if (p_cp->ppm_markers[i].m_data != - NULL) { /* standard doesn't seem to require contiguous Zppm */ - OPJ_UINT32 l_N_ppm; - OPJ_UINT32 l_data_size = p_cp->ppm_markers[i].m_data_size; - const OPJ_BYTE* l_data = p_cp->ppm_markers[i].m_data; - - if (l_N_ppm_remaining >= l_data_size) { - memcpy(p_cp->ppm_buffer + l_ppm_data_size, l_data, l_data_size); - l_ppm_data_size += l_data_size; - l_N_ppm_remaining -= l_data_size; - l_data_size = 0U; - } else { - memcpy(p_cp->ppm_buffer + l_ppm_data_size, l_data, l_N_ppm_remaining); - l_ppm_data_size += l_N_ppm_remaining; - l_data += l_N_ppm_remaining; - l_data_size -= l_N_ppm_remaining; - l_N_ppm_remaining = 0U; - } - - if (l_data_size > 0U) { - do { - /* read Nppm */ - if (l_data_size < 4U) { - /* clean up to be done on l_cp destruction */ - opj_event_msg(p_manager, EVT_ERROR, "Not enough bytes to read Nppm\n"); - return OPJ_FALSE; - } - opj_read_bytes(l_data, &l_N_ppm, 4); - l_data += 4; - l_data_size -= 4; - - if (l_data_size >= l_N_ppm) { - memcpy(p_cp->ppm_buffer + l_ppm_data_size, l_data, l_N_ppm); - l_ppm_data_size += l_N_ppm; - l_data_size -= l_N_ppm; - l_data += l_N_ppm; - } else { - memcpy(p_cp->ppm_buffer + l_ppm_data_size, l_data, l_data_size); - 
l_ppm_data_size += l_data_size; - l_N_ppm_remaining = l_N_ppm - l_data_size; - l_data_size = 0U; - } - } while (l_data_size > 0U); - } - opj_free(p_cp->ppm_markers[i].m_data); - p_cp->ppm_markers[i].m_data = NULL; - p_cp->ppm_markers[i].m_data_size = 0U; - } - } - - p_cp->ppm_data = p_cp->ppm_buffer; - p_cp->ppm_data_size = p_cp->ppm_len; - - p_cp->ppm_markers_count = 0U; - opj_free(p_cp->ppm_markers); - p_cp->ppm_markers = NULL; - - return OPJ_TRUE; -} - -/** - * Reads a PPT marker (Packed packet headers, tile-part header) - * - * @param p_header_data the data contained in the PPT box. - * @param p_j2k the jpeg2000 codec. - * @param p_header_size the size of the data contained in the PPT marker. - * @param p_manager the user event manager. -*/ -static OPJ_BOOL opj_j2k_read_ppt(opj_j2k_t *p_j2k, - OPJ_BYTE * p_header_data, - OPJ_UINT32 p_header_size, - opj_event_mgr_t * p_manager - ) -{ - opj_cp_t *l_cp = 00; - opj_tcp_t *l_tcp = 00; - OPJ_UINT32 l_Z_ppt; - - /* preconditions */ - assert(p_header_data != 00); - assert(p_j2k != 00); - assert(p_manager != 00); - - /* We need to have the Z_ppt element + 1 byte of Ippt at minimum */ - if (p_header_size < 2) { - opj_event_msg(p_manager, EVT_ERROR, "Error reading PPT marker\n"); - return OPJ_FALSE; - } - - l_cp = &(p_j2k->m_cp); - if (l_cp->ppm) { - opj_event_msg(p_manager, EVT_ERROR, - "Error reading PPT marker: packet header have been previously found in the main header (PPM marker).\n"); - return OPJ_FALSE; - } - - l_tcp = &(l_cp->tcps[p_j2k->m_current_tile_number]); - l_tcp->ppt = 1; - - opj_read_bytes(p_header_data, &l_Z_ppt, 1); /* Z_ppt */ - ++p_header_data; - --p_header_size; - - /* check allocation needed */ - if (l_tcp->ppt_markers == NULL) { /* first PPT marker */ - OPJ_UINT32 l_newCount = l_Z_ppt + 1U; /* can't overflow, l_Z_ppt is UINT8 */ - assert(l_tcp->ppt_markers_count == 0U); - - l_tcp->ppt_markers = (opj_ppx *) opj_calloc(l_newCount, sizeof(opj_ppx)); - if (l_tcp->ppt_markers == NULL) { - opj_event_msg(p_manager, EVT_ERROR, "Not enough memory to read PPT marker\n"); - return OPJ_FALSE; - } - l_tcp->ppt_markers_count = l_newCount; - } else if (l_tcp->ppt_markers_count <= l_Z_ppt) { - OPJ_UINT32 l_newCount = l_Z_ppt + 1U; /* can't overflow, l_Z_ppt is UINT8 */ - opj_ppx *new_ppt_markers; - new_ppt_markers = (opj_ppx *) opj_realloc(l_tcp->ppt_markers, - l_newCount * sizeof(opj_ppx)); - if (new_ppt_markers == NULL) { - /* clean up to be done on l_tcp destruction */ - opj_event_msg(p_manager, EVT_ERROR, "Not enough memory to read PPT marker\n"); - return OPJ_FALSE; - } - l_tcp->ppt_markers = new_ppt_markers; - memset(l_tcp->ppt_markers + l_tcp->ppt_markers_count, 0, - (l_newCount - l_tcp->ppt_markers_count) * sizeof(opj_ppx)); - l_tcp->ppt_markers_count = l_newCount; - } - - if (l_tcp->ppt_markers[l_Z_ppt].m_data != NULL) { - /* clean up to be done on l_tcp destruction */ - opj_event_msg(p_manager, EVT_ERROR, "Zppt %u already read\n", l_Z_ppt); - return OPJ_FALSE; - } - - l_tcp->ppt_markers[l_Z_ppt].m_data = (OPJ_BYTE *) opj_malloc(p_header_size); - if (l_tcp->ppt_markers[l_Z_ppt].m_data == NULL) { - /* clean up to be done on l_tcp destruction */ - opj_event_msg(p_manager, EVT_ERROR, "Not enough memory to read PPT marker\n"); - return OPJ_FALSE; - } - l_tcp->ppt_markers[l_Z_ppt].m_data_size = p_header_size; - memcpy(l_tcp->ppt_markers[l_Z_ppt].m_data, p_header_data, p_header_size); - return OPJ_TRUE; -} - -/** - * Merges all PPT markers read (Packed packet headers, tile-part header) - * - * @param p_tcp the tile. 
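opj_j2k_merge_ppm above is a size-then-copy gather: the first pass walks the per-marker buffers only to total the Nppm payloads (carrying lengths that straddle marker boundaries in l_N_ppm_remaining), the second pass copies everything into one allocation. Stripped of the Nppm framing, the underlying pattern looks like this; chunk_t and the function names are hypothetical:

#include <stdlib.h>
#include <string.h>

typedef struct { const unsigned char *data; size_t size; } chunk_t;

/* Concatenate n chunks into one buffer; returns NULL on allocation
 * failure.  Pass 1 sizes, pass 2 copies: one allocation, no growth. */
static unsigned char *merge_chunks(const chunk_t *c, size_t n, size_t *out)
{
    size_t total = 0, off = 0;
    for (size_t i = 0; i < n; ++i) {
        total += c[i].size;
    }
    unsigned char *buf = (unsigned char *)malloc(total ? total : 1);
    if (buf == NULL) {
        return NULL;
    }
    for (size_t i = 0; i < n; ++i) {
        memcpy(buf + off, c[i].data, c[i].size);
        off += c[i].size;
    }
    *out = total;
    return buf;
}

int main(void)
{
    const chunk_t parts[] = { { (const unsigned char *)"ab", 2 },
                              { (const unsigned char *)"cde", 3 } };
    size_t n = 0;
    unsigned char *all = merge_chunks(parts, 2, &n);  /* "abcde", n == 5 */
    free(all);
    return 0;
}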
- * @param p_manager the user event manager. - */ -static OPJ_BOOL opj_j2k_merge_ppt(opj_tcp_t *p_tcp, opj_event_mgr_t * p_manager) -{ - OPJ_UINT32 i, l_ppt_data_size; - /* preconditions */ - assert(p_tcp != 00); - assert(p_manager != 00); - assert(p_tcp->ppt_buffer == NULL); - - if (p_tcp->ppt == 0U) { - return OPJ_TRUE; - } - - l_ppt_data_size = 0U; - for (i = 0U; i < p_tcp->ppt_markers_count; ++i) { - l_ppt_data_size += - p_tcp->ppt_markers[i].m_data_size; /* can't overflow, max 256 markers of max 65536 bytes */ - } - - p_tcp->ppt_buffer = (OPJ_BYTE *) opj_malloc(l_ppt_data_size); - if (p_tcp->ppt_buffer == 00) { - opj_event_msg(p_manager, EVT_ERROR, "Not enough memory to read PPT marker\n"); - return OPJ_FALSE; - } - p_tcp->ppt_len = l_ppt_data_size; - l_ppt_data_size = 0U; - for (i = 0U; i < p_tcp->ppt_markers_count; ++i) { - if (p_tcp->ppt_markers[i].m_data != - NULL) { /* standard doesn't seem to require contiguous Zppt */ - memcpy(p_tcp->ppt_buffer + l_ppt_data_size, p_tcp->ppt_markers[i].m_data, - p_tcp->ppt_markers[i].m_data_size); - l_ppt_data_size += - p_tcp->ppt_markers[i].m_data_size; /* can't overflow, max 256 markers of max 65536 bytes */ - - opj_free(p_tcp->ppt_markers[i].m_data); - p_tcp->ppt_markers[i].m_data = NULL; - p_tcp->ppt_markers[i].m_data_size = 0U; - } - } - - p_tcp->ppt_markers_count = 0U; - opj_free(p_tcp->ppt_markers); - p_tcp->ppt_markers = NULL; - - p_tcp->ppt_data = p_tcp->ppt_buffer; - p_tcp->ppt_data_size = p_tcp->ppt_len; - return OPJ_TRUE; -} - -static OPJ_BOOL opj_j2k_write_tlm(opj_j2k_t *p_j2k, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager - ) -{ - OPJ_BYTE * l_current_data = 00; - OPJ_UINT32 l_tlm_size; - - /* preconditions */ - assert(p_j2k != 00); - assert(p_manager != 00); - assert(p_stream != 00); - - l_tlm_size = 6 + (5 * p_j2k->m_specific_param.m_encoder.m_total_tile_parts); - - if (l_tlm_size > p_j2k->m_specific_param.m_encoder.m_header_tile_data_size) { - OPJ_BYTE *new_header_tile_data = (OPJ_BYTE *) opj_realloc( - p_j2k->m_specific_param.m_encoder.m_header_tile_data, l_tlm_size); - if (! 
new_header_tile_data) { - opj_free(p_j2k->m_specific_param.m_encoder.m_header_tile_data); - p_j2k->m_specific_param.m_encoder.m_header_tile_data = NULL; - p_j2k->m_specific_param.m_encoder.m_header_tile_data_size = 0; - opj_event_msg(p_manager, EVT_ERROR, "Not enough memory to write TLM marker\n"); - return OPJ_FALSE; - } - p_j2k->m_specific_param.m_encoder.m_header_tile_data = new_header_tile_data; - p_j2k->m_specific_param.m_encoder.m_header_tile_data_size = l_tlm_size; - } - - l_current_data = p_j2k->m_specific_param.m_encoder.m_header_tile_data; - - /* change the way data is written to avoid seeking if possible */ - /* TODO */ - p_j2k->m_specific_param.m_encoder.m_tlm_start = opj_stream_tell(p_stream); - - opj_write_bytes(l_current_data, J2K_MS_TLM, - 2); /* TLM */ - l_current_data += 2; - - opj_write_bytes(l_current_data, l_tlm_size - 2, - 2); /* Lpoc */ - l_current_data += 2; - - opj_write_bytes(l_current_data, 0, - 1); /* Ztlm=0*/ - ++l_current_data; - - opj_write_bytes(l_current_data, 0x50, - 1); /* Stlm ST=1(8bits-255 tiles max),SP=1(Ptlm=32bits) */ - ++l_current_data; - - /* do nothing on the 5 * l_j2k->m_specific_param.m_encoder.m_total_tile_parts remaining data */ - if (opj_stream_write_data(p_stream, - p_j2k->m_specific_param.m_encoder.m_header_tile_data, l_tlm_size, - p_manager) != l_tlm_size) { - return OPJ_FALSE; - } - - return OPJ_TRUE; -} - -static OPJ_BOOL opj_j2k_write_sot(opj_j2k_t *p_j2k, - OPJ_BYTE * p_data, - OPJ_UINT32 p_total_data_size, - OPJ_UINT32 * p_data_written, - const opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager - ) -{ - /* preconditions */ - assert(p_j2k != 00); - assert(p_manager != 00); - assert(p_stream != 00); - - OPJ_UNUSED(p_stream); - - if (p_total_data_size < 12) { - opj_event_msg(p_manager, EVT_ERROR, - "Not enough bytes in output buffer to write SOT marker\n"); - return OPJ_FALSE; - } - - opj_write_bytes(p_data, J2K_MS_SOT, - 2); /* SOT */ - p_data += 2; - - opj_write_bytes(p_data, 10, - 2); /* Lsot */ - p_data += 2; - - opj_write_bytes(p_data, p_j2k->m_current_tile_number, - 2); /* Isot */ - p_data += 2; - - /* Psot */ - p_data += 4; - - opj_write_bytes(p_data, - p_j2k->m_specific_param.m_encoder.m_current_tile_part_number, - 1); /* TPsot */ - ++p_data; - - opj_write_bytes(p_data, - p_j2k->m_cp.tcps[p_j2k->m_current_tile_number].m_nb_tile_parts, - 1); /* TNsot */ - ++p_data; - - /* UniPG>> */ -#ifdef USE_JPWL - /* update markers struct */ - /* - OPJ_BOOL res = j2k_add_marker(p_j2k->cstr_info, J2K_MS_SOT, p_j2k->sot_start, len + 2); - */ - assert(0 && "TODO"); -#endif /* USE_JPWL */ - - * p_data_written = 12; - - return OPJ_TRUE; -} - -static OPJ_BOOL opj_j2k_get_sot_values(OPJ_BYTE * p_header_data, - OPJ_UINT32 p_header_size, - OPJ_UINT32* p_tile_no, - OPJ_UINT32* p_tot_len, - OPJ_UINT32* p_current_part, - OPJ_UINT32* p_num_parts, - opj_event_mgr_t * p_manager) -{ - /* preconditions */ - assert(p_header_data != 00); - assert(p_manager != 00); - - /* Size of this marker is fixed = 12 (we have already read marker and its size)*/ - if (p_header_size != 8) { - opj_event_msg(p_manager, EVT_ERROR, "Error reading SOT marker\n"); - return OPJ_FALSE; - } - - opj_read_bytes(p_header_data, p_tile_no, 2); /* Isot */ - p_header_data += 2; - opj_read_bytes(p_header_data, p_tot_len, 4); /* Psot */ - p_header_data += 4; - opj_read_bytes(p_header_data, p_current_part, 1); /* TPsot */ - ++p_header_data; - opj_read_bytes(p_header_data, p_num_parts, 1); /* TNsot */ - ++p_header_data; - return OPJ_TRUE; -} - -static OPJ_BOOL 
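opj_j2k_get_sot_values above leans on SOT having a fixed layout: Lsot is always 10, leaving an 8-byte payload of Isot(2), Psot(4), TPsot(1), TNsot(1), all big-endian. A standalone parse of a made-up payload; read_be stands in for opj_read_bytes:

#include <stdio.h>

/* Hypothetical stand-in for opj_read_bytes(): big-endian read. */
static unsigned read_be(const unsigned char *p, unsigned nb)
{
    unsigned v = 0;
    while (nb--) {
        v = (v << 8) | *p++;
    }
    return v;
}

int main(void)
{
    const unsigned char sot[8] = { 0x00, 0x03,              /* Isot  */
                                   0x00, 0x00, 0x12, 0x34,  /* Psot  */
                                   0x00,                    /* TPsot */
                                   0x02 };                  /* TNsot */
    printf("tile %u, tile-part length %u, part %u of %u\n",
           read_be(sot, 2), read_be(sot + 2, 4),
           read_be(sot + 6, 1), read_be(sot + 7, 1));
    return 0;
}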
opj_j2k_read_sot(opj_j2k_t *p_j2k, - OPJ_BYTE * p_header_data, - OPJ_UINT32 p_header_size, - opj_event_mgr_t * p_manager) -{ - opj_cp_t *l_cp = 00; - opj_tcp_t *l_tcp = 00; - OPJ_UINT32 l_tot_len, l_num_parts = 0; - OPJ_UINT32 l_current_part; - OPJ_UINT32 l_tile_x, l_tile_y; - - /* preconditions */ - - assert(p_j2k != 00); - assert(p_manager != 00); - - if (! opj_j2k_get_sot_values(p_header_data, p_header_size, - &(p_j2k->m_current_tile_number), &l_tot_len, &l_current_part, &l_num_parts, - p_manager)) { - opj_event_msg(p_manager, EVT_ERROR, "Error reading SOT marker\n"); - return OPJ_FALSE; - } -#ifdef DEBUG_VERBOSE - fprintf(stderr, "SOT %d %d %d %d\n", - p_j2k->m_current_tile_number, l_tot_len, l_current_part, l_num_parts); -#endif - - l_cp = &(p_j2k->m_cp); - - /* testcase 2.pdf.SIGFPE.706.1112 */ - if (p_j2k->m_current_tile_number >= l_cp->tw * l_cp->th) { - opj_event_msg(p_manager, EVT_ERROR, "Invalid tile number %d\n", - p_j2k->m_current_tile_number); - return OPJ_FALSE; - } - - l_tcp = &l_cp->tcps[p_j2k->m_current_tile_number]; - l_tile_x = p_j2k->m_current_tile_number % l_cp->tw; - l_tile_y = p_j2k->m_current_tile_number / l_cp->tw; - - if (p_j2k->m_specific_param.m_decoder.m_tile_ind_to_dec < 0 || - p_j2k->m_current_tile_number == (OPJ_UINT32) - p_j2k->m_specific_param.m_decoder.m_tile_ind_to_dec) { - /* Do only this check if we decode all tile part headers, or if */ - /* we decode one precise tile. Otherwise the m_current_tile_part_number */ - /* might not be valid */ - /* Fixes issue with id_000020,sig_06,src_001958,op_flip4,pos_149 */ - /* of https://github.com/uclouvain/openjpeg/issues/939 */ - /* We must avoid reading twice the same tile part number for a given tile */ - /* so as to avoid various issues, like opj_j2k_merge_ppt being called */ - /* several times. */ - /* ISO 15444-1 A.4.2 Start of tile-part (SOT) mandates that tile parts */ - /* should appear in increasing order. */ - if (l_tcp->m_current_tile_part_number + 1 != (OPJ_INT32)l_current_part) { - opj_event_msg(p_manager, EVT_ERROR, - "Invalid tile part index for tile number %d. " - "Got %d, expected %d\n", - p_j2k->m_current_tile_number, - l_current_part, - l_tcp->m_current_tile_part_number + 1); - return OPJ_FALSE; - } - } - - l_tcp->m_current_tile_part_number = (OPJ_INT32) l_current_part; - -#ifdef USE_JPWL - if (l_cp->correct) { - - OPJ_UINT32 tileno = p_j2k->m_current_tile_number; - static OPJ_UINT32 backup_tileno = 0; - - /* tileno is negative or larger than the number of tiles!!! */ - if (tileno > (l_cp->tw * l_cp->th)) { - opj_event_msg(p_manager, EVT_ERROR, - "JPWL: bad tile number (%d out of a maximum of %d)\n", - tileno, (l_cp->tw * l_cp->th)); - if (!JPWL_ASSUME) { - opj_event_msg(p_manager, EVT_ERROR, "JPWL: giving up\n"); - return OPJ_FALSE; - } - /* we try to correct */ - tileno = backup_tileno; - opj_event_msg(p_manager, EVT_WARNING, "- trying to adjust this\n" - "- setting tile number to %d\n", - tileno); - } - - /* keep your private count of tiles */ - backup_tileno++; - }; -#endif /* USE_JPWL */ - - /* look for the tile in the list of already processed tile (in parts). 
*/ - /* Optimization possible here with a more complex data structure and with the removing of tiles */ - /* since the time taken by this function can only grow at the time */ - - /* PSot should be equal to zero or >=14 or <= 2^32-1 */ - if ((l_tot_len != 0) && (l_tot_len < 14)) { - if (l_tot_len == - 12) { /* MSD: Special case for the PHR data which are read by kakadu*/ - opj_event_msg(p_manager, EVT_WARNING, "Empty SOT marker detected: Psot=%d.\n", - l_tot_len); - } else { - opj_event_msg(p_manager, EVT_ERROR, - "Psot value is not correct regards to the JPEG2000 norm: %d.\n", l_tot_len); - return OPJ_FALSE; - } - } - -#ifdef USE_JPWL - if (l_cp->correct) { - - /* totlen is negative or larger than the bytes left!!! */ - if (/*(l_tot_len < 0) ||*/ (l_tot_len > - p_header_size)) { /* FIXME it seems correct; for info in V1 -> (p_stream_numbytesleft(p_stream) + 8))) { */ - opj_event_msg(p_manager, EVT_ERROR, - "JPWL: bad tile byte size (%d bytes against %d bytes left)\n", - l_tot_len, - p_header_size); /* FIXME it seems correct; for info in V1 -> p_stream_numbytesleft(p_stream) + 8); */ - if (!JPWL_ASSUME) { - opj_event_msg(p_manager, EVT_ERROR, "JPWL: giving up\n"); - return OPJ_FALSE; - } - /* we try to correct */ - l_tot_len = 0; - opj_event_msg(p_manager, EVT_WARNING, "- trying to adjust this\n" - "- setting Psot to %d => assuming it is the last tile\n", - l_tot_len); - } - }; -#endif /* USE_JPWL */ - - /* Ref A.4.2: Psot could be equal zero if it is the last tile-part of the codestream.*/ - if (!l_tot_len) { - opj_event_msg(p_manager, EVT_INFO, - "Psot value of the current tile-part is equal to zero, " - "we assuming it is the last tile-part of the codestream.\n"); - p_j2k->m_specific_param.m_decoder.m_last_tile_part = 1; - } - - if (l_tcp->m_nb_tile_parts != 0 && l_current_part >= l_tcp->m_nb_tile_parts) { - /* Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=2851 */ - opj_event_msg(p_manager, EVT_ERROR, - "In SOT marker, TPSot (%d) is not valid regards to the previous " - "number of tile-part (%d), giving up\n", l_current_part, - l_tcp->m_nb_tile_parts); - p_j2k->m_specific_param.m_decoder.m_last_tile_part = 1; - return OPJ_FALSE; - } - - if (l_num_parts != - 0) { /* Number of tile-part header is provided by this tile-part header */ - l_num_parts += p_j2k->m_specific_param.m_decoder.m_nb_tile_parts_correction; - /* Useful to manage the case of textGBR.jp2 file because two values of TNSot are allowed: the correct numbers of - * tile-parts for that tile and zero (A.4.2 of 15444-1 : 2002). 
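The Psot handling here reduces to a small decision table: zero means the tile-part runs to the end of the codestream, 12 is tolerated as the empty tile-part some encoders emit, and any other value below 14 cannot frame a real tile-part. A sketch; the returned strings are illustrative:

#include <stdio.h>

/* Condensed view of the Psot checks above: SOT segment (12) plus the
 * SOD marker (2) already take 14 bytes, hence the lower bound. */
static const char *classify_psot(unsigned long psot)
{
    if (psot == 0)  return "last tile-part, runs to EOC";
    if (psot == 12) return "empty tile-part (warned, accepted)";
    if (psot < 14)  return "invalid";
    return "normal";
}

int main(void)
{
    printf("%s\n", classify_psot(0));
    printf("%s\n", classify_psot(13));
    printf("%s\n", classify_psot(4096));
    return 0;
}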
*/ - if (l_tcp->m_nb_tile_parts) { - if (l_current_part >= l_tcp->m_nb_tile_parts) { - opj_event_msg(p_manager, EVT_ERROR, - "In SOT marker, TPSot (%d) is not valid regards to the current " - "number of tile-part (%d), giving up\n", l_current_part, - l_tcp->m_nb_tile_parts); - p_j2k->m_specific_param.m_decoder.m_last_tile_part = 1; - return OPJ_FALSE; - } - } - if (l_current_part >= l_num_parts) { - /* testcase 451.pdf.SIGSEGV.ce9.3723 */ - opj_event_msg(p_manager, EVT_ERROR, - "In SOT marker, TPSot (%d) is not valid regards to the current " - "number of tile-part (header) (%d), giving up\n", l_current_part, l_num_parts); - p_j2k->m_specific_param.m_decoder.m_last_tile_part = 1; - return OPJ_FALSE; - } - l_tcp->m_nb_tile_parts = l_num_parts; - } - - /* If know the number of tile part header we will check if we didn't read the last*/ - if (l_tcp->m_nb_tile_parts) { - if (l_tcp->m_nb_tile_parts == (l_current_part + 1)) { - p_j2k->m_specific_param.m_decoder.m_can_decode = - 1; /* Process the last tile-part header*/ - } - } - - if (!p_j2k->m_specific_param.m_decoder.m_last_tile_part) { - /* Keep the size of data to skip after this marker */ - p_j2k->m_specific_param.m_decoder.m_sot_length = l_tot_len - - 12; /* SOT_marker_size = 12 */ - } else { - /* FIXME: need to be computed from the number of bytes remaining in the codestream */ - p_j2k->m_specific_param.m_decoder.m_sot_length = 0; - } - - p_j2k->m_specific_param.m_decoder.m_state = J2K_STATE_TPH; - - /* Check if the current tile is outside the area we want decode or not corresponding to the tile index*/ - if (p_j2k->m_specific_param.m_decoder.m_tile_ind_to_dec == -1) { - p_j2k->m_specific_param.m_decoder.m_skip_data = - (l_tile_x < p_j2k->m_specific_param.m_decoder.m_start_tile_x) - || (l_tile_x >= p_j2k->m_specific_param.m_decoder.m_end_tile_x) - || (l_tile_y < p_j2k->m_specific_param.m_decoder.m_start_tile_y) - || (l_tile_y >= p_j2k->m_specific_param.m_decoder.m_end_tile_y); - } else { - assert(p_j2k->m_specific_param.m_decoder.m_tile_ind_to_dec >= 0); - p_j2k->m_specific_param.m_decoder.m_skip_data = - (p_j2k->m_current_tile_number != (OPJ_UINT32) - p_j2k->m_specific_param.m_decoder.m_tile_ind_to_dec); - } - - /* Index */ - if (p_j2k->cstr_index) { - assert(p_j2k->cstr_index->tile_index != 00); - p_j2k->cstr_index->tile_index[p_j2k->m_current_tile_number].tileno = - p_j2k->m_current_tile_number; - p_j2k->cstr_index->tile_index[p_j2k->m_current_tile_number].current_tpsno = - l_current_part; - - if (l_num_parts != 0) { - p_j2k->cstr_index->tile_index[p_j2k->m_current_tile_number].nb_tps = - l_num_parts; - p_j2k->cstr_index->tile_index[p_j2k->m_current_tile_number].current_nb_tps = - l_num_parts; - - if (!p_j2k->cstr_index->tile_index[p_j2k->m_current_tile_number].tp_index) { - p_j2k->cstr_index->tile_index[p_j2k->m_current_tile_number].tp_index = - (opj_tp_index_t*)opj_calloc(l_num_parts, sizeof(opj_tp_index_t)); - if (!p_j2k->cstr_index->tile_index[p_j2k->m_current_tile_number].tp_index) { - opj_event_msg(p_manager, EVT_ERROR, - "Not enough memory to read SOT marker. Tile index allocation failed\n"); - return OPJ_FALSE; - } - } else { - opj_tp_index_t *new_tp_index = (opj_tp_index_t *) opj_realloc( - p_j2k->cstr_index->tile_index[p_j2k->m_current_tile_number].tp_index, - l_num_parts * sizeof(opj_tp_index_t)); - if (! 
new_tp_index) { - opj_free(p_j2k->cstr_index->tile_index[p_j2k->m_current_tile_number].tp_index); - p_j2k->cstr_index->tile_index[p_j2k->m_current_tile_number].tp_index = NULL; - opj_event_msg(p_manager, EVT_ERROR, - "Not enough memory to read SOT marker. Tile index allocation failed\n"); - return OPJ_FALSE; - } - p_j2k->cstr_index->tile_index[p_j2k->m_current_tile_number].tp_index = - new_tp_index; - } - } else { - /*if (!p_j2k->cstr_index->tile_index[p_j2k->m_current_tile_number].tp_index)*/ { - - if (!p_j2k->cstr_index->tile_index[p_j2k->m_current_tile_number].tp_index) { - p_j2k->cstr_index->tile_index[p_j2k->m_current_tile_number].current_nb_tps = 10; - p_j2k->cstr_index->tile_index[p_j2k->m_current_tile_number].tp_index = - (opj_tp_index_t*)opj_calloc( - p_j2k->cstr_index->tile_index[p_j2k->m_current_tile_number].current_nb_tps, - sizeof(opj_tp_index_t)); - if (!p_j2k->cstr_index->tile_index[p_j2k->m_current_tile_number].tp_index) { - p_j2k->cstr_index->tile_index[p_j2k->m_current_tile_number].current_nb_tps = 0; - opj_event_msg(p_manager, EVT_ERROR, - "Not enough memory to read SOT marker. Tile index allocation failed\n"); - return OPJ_FALSE; - } - } - - if (l_current_part >= - p_j2k->cstr_index->tile_index[p_j2k->m_current_tile_number].current_nb_tps) { - opj_tp_index_t *new_tp_index; - p_j2k->cstr_index->tile_index[p_j2k->m_current_tile_number].current_nb_tps = - l_current_part + 1; - new_tp_index = (opj_tp_index_t *) opj_realloc( - p_j2k->cstr_index->tile_index[p_j2k->m_current_tile_number].tp_index, - p_j2k->cstr_index->tile_index[p_j2k->m_current_tile_number].current_nb_tps * - sizeof(opj_tp_index_t)); - if (! new_tp_index) { - opj_free(p_j2k->cstr_index->tile_index[p_j2k->m_current_tile_number].tp_index); - p_j2k->cstr_index->tile_index[p_j2k->m_current_tile_number].tp_index = NULL; - p_j2k->cstr_index->tile_index[p_j2k->m_current_tile_number].current_nb_tps = 0; - opj_event_msg(p_manager, EVT_ERROR, - "Not enough memory to read SOT marker. 
Tile index allocation failed\n"); - return OPJ_FALSE; - } - p_j2k->cstr_index->tile_index[p_j2k->m_current_tile_number].tp_index = - new_tp_index; - } - } - - } - - } - - /* FIXME move this onto a separate method to call before reading any SOT, remove part about main_end header, use a index struct inside p_j2k */ - /* if (p_j2k->cstr_info) { - if (l_tcp->first) { - if (tileno == 0) { - p_j2k->cstr_info->main_head_end = p_stream_tell(p_stream) - 13; - } - - p_j2k->cstr_info->tile[tileno].tileno = tileno; - p_j2k->cstr_info->tile[tileno].start_pos = p_stream_tell(p_stream) - 12; - p_j2k->cstr_info->tile[tileno].end_pos = p_j2k->cstr_info->tile[tileno].start_pos + totlen - 1; - p_j2k->cstr_info->tile[tileno].num_tps = numparts; - - if (numparts) { - p_j2k->cstr_info->tile[tileno].tp = (opj_tp_info_t *) opj_malloc(numparts * sizeof(opj_tp_info_t)); - } - else { - p_j2k->cstr_info->tile[tileno].tp = (opj_tp_info_t *) opj_malloc(10 * sizeof(opj_tp_info_t)); // Fixme (10) - } - } - else { - p_j2k->cstr_info->tile[tileno].end_pos += totlen; - } - - p_j2k->cstr_info->tile[tileno].tp[partno].tp_start_pos = p_stream_tell(p_stream) - 12; - p_j2k->cstr_info->tile[tileno].tp[partno].tp_end_pos = - p_j2k->cstr_info->tile[tileno].tp[partno].tp_start_pos + totlen - 1; - }*/ - return OPJ_TRUE; -} - -static OPJ_BOOL opj_j2k_write_sod(opj_j2k_t *p_j2k, - opj_tcd_t * p_tile_coder, - OPJ_BYTE * p_data, - OPJ_UINT32 * p_data_written, - OPJ_UINT32 p_total_data_size, - const opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager - ) -{ - opj_codestream_info_t *l_cstr_info = 00; - OPJ_UINT32 l_remaining_data; - - /* preconditions */ - assert(p_j2k != 00); - assert(p_manager != 00); - assert(p_stream != 00); - - OPJ_UNUSED(p_stream); - - if (p_total_data_size < 4) { - opj_event_msg(p_manager, EVT_ERROR, - "Not enough bytes in output buffer to write SOD marker\n"); - return OPJ_FALSE; - } - - opj_write_bytes(p_data, J2K_MS_SOD, - 2); /* SOD */ - p_data += 2; - - /* make room for the EOF marker */ - l_remaining_data = p_total_data_size - 4; - - /* update tile coder */ - p_tile_coder->tp_num = - p_j2k->m_specific_param.m_encoder.m_current_poc_tile_part_number ; - p_tile_coder->cur_tp_num = - p_j2k->m_specific_param.m_encoder.m_current_tile_part_number; - - /* INDEX >> */ - /* TODO mergeV2: check this part which use cstr_info */ - /*l_cstr_info = p_j2k->cstr_info; - if (l_cstr_info) { - if (!p_j2k->m_specific_param.m_encoder.m_current_tile_part_number ) { - //TODO cstr_info->tile[p_j2k->m_current_tile_number].end_header = p_stream_tell(p_stream) + p_j2k->pos_correction - 1; - l_cstr_info->tile[p_j2k->m_current_tile_number].tileno = p_j2k->m_current_tile_number; - } - else {*/ - /* - TODO - if - (cstr_info->tile[p_j2k->m_current_tile_number].packet[cstr_info->packno - 1].end_pos < p_stream_tell(p_stream)) - { - cstr_info->tile[p_j2k->m_current_tile_number].packet[cstr_info->packno].start_pos = p_stream_tell(p_stream); - }*/ - /*}*/ - /* UniPG>> */ -#ifdef USE_JPWL - /* update markers struct */ - /*OPJ_BOOL res = j2k_add_marker(p_j2k->cstr_info, J2K_MS_SOD, p_j2k->sod_start, 2); - */ - assert(0 && "TODO"); -#endif /* USE_JPWL */ - /* <<UniPG */ - - if (p_j2k->m_specific_param.m_encoder.m_current_tile_part_number == 0) { - p_tile_coder->tcd_image->tiles->packno = 0; -#ifdef deadcode - if (l_cstr_info) { - l_cstr_info->packno = 0; - } -#endif - } - - *p_data_written = 0; - - if (!
opj_tcd_encode_tile(p_tile_coder, p_j2k->m_current_tile_number, p_data, - p_data_written, l_remaining_data, l_cstr_info, - p_manager)) { - opj_event_msg(p_manager, EVT_ERROR, "Cannot encode tile\n"); - return OPJ_FALSE; - } - - *p_data_written += 2; - - return OPJ_TRUE; -} - -static OPJ_BOOL opj_j2k_read_sod(opj_j2k_t *p_j2k, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager - ) -{ - OPJ_SIZE_T l_current_read_size; - opj_codestream_index_t * l_cstr_index = 00; - OPJ_BYTE ** l_current_data = 00; - opj_tcp_t * l_tcp = 00; - OPJ_UINT32 * l_tile_len = 00; - OPJ_BOOL l_sot_length_pb_detected = OPJ_FALSE; - - /* preconditions */ - assert(p_j2k != 00); - assert(p_manager != 00); - assert(p_stream != 00); - - l_tcp = &(p_j2k->m_cp.tcps[p_j2k->m_current_tile_number]); - - if (p_j2k->m_specific_param.m_decoder.m_last_tile_part) { - /* opj_stream_get_number_byte_left returns OPJ_OFF_T - // but we are in the last tile part, - // so its result will fit on OPJ_UINT32 unless we find - // a file with a single tile part of more than 4 GB...*/ - p_j2k->m_specific_param.m_decoder.m_sot_length = (OPJ_UINT32)( - opj_stream_get_number_byte_left(p_stream) - 2); - } else { - /* Check to avoid pass the limit of OPJ_UINT32 */ - if (p_j2k->m_specific_param.m_decoder.m_sot_length >= 2) { - p_j2k->m_specific_param.m_decoder.m_sot_length -= 2; - } else { - /* MSD: case commented to support empty SOT marker (PHR data) */ - } - } - - l_current_data = &(l_tcp->m_data); - l_tile_len = &l_tcp->m_data_size; - - /* Patch to support new PHR data */ - if (p_j2k->m_specific_param.m_decoder.m_sot_length) { - /* If we are here, we'll try to read the data after allocation */ - /* Check enough bytes left in stream before allocation */ - if ((OPJ_OFF_T)p_j2k->m_specific_param.m_decoder.m_sot_length > - opj_stream_get_number_byte_left(p_stream)) { - opj_event_msg(p_manager, EVT_ERROR, - "Tile part length size inconsistent with stream length\n"); - return OPJ_FALSE; - } - if (p_j2k->m_specific_param.m_decoder.m_sot_length > - UINT_MAX - OPJ_COMMON_CBLK_DATA_EXTRA) { - opj_event_msg(p_manager, EVT_ERROR, - "p_j2k->m_specific_param.m_decoder.m_sot_length > " - "UINT_MAX - OPJ_COMMON_CBLK_DATA_EXTRA"); - return OPJ_FALSE; - } - /* Add a margin of OPJ_COMMON_CBLK_DATA_EXTRA to the allocation we */ - /* do so that opj_mqc_init_dec_common() can safely add a synthetic */ - /* 0xFFFF marker. */ - if (! *l_current_data) { - /* LH: oddly enough, in this path, l_tile_len!=0. - * TODO: If this was consistent, we could simplify the code to only use realloc(), as realloc(0,...) default to malloc(0,...). - */ - *l_current_data = (OPJ_BYTE*) opj_malloc( - p_j2k->m_specific_param.m_decoder.m_sot_length + OPJ_COMMON_CBLK_DATA_EXTRA); - } else { - OPJ_BYTE *l_new_current_data; - if (*l_tile_len > UINT_MAX - OPJ_COMMON_CBLK_DATA_EXTRA - - p_j2k->m_specific_param.m_decoder.m_sot_length) { - opj_event_msg(p_manager, EVT_ERROR, - "*l_tile_len > UINT_MAX - OPJ_COMMON_CBLK_DATA_EXTRA - " - "p_j2k->m_specific_param.m_decoder.m_sot_length"); - return OPJ_FALSE; - } - - l_new_current_data = (OPJ_BYTE *) opj_realloc(*l_current_data, - *l_tile_len + p_j2k->m_specific_param.m_decoder.m_sot_length + - OPJ_COMMON_CBLK_DATA_EXTRA); - if (! l_new_current_data) { - opj_free(*l_current_data); - /*nothing more is done as l_current_data will be set to null, and just - afterward we enter in the error path - and the actual tile_len is updated (committed) at the end of the - function. 
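The allocation dance in opj_j2k_read_sod above appends each tile-part payload to one growing buffer while reserving a few spare bytes (OPJ_COMMON_CBLK_DATA_EXTRA) for a synthetic terminator, guarding every size computation against wrap-around. A condensed sketch under those assumptions; EXTRA and append_part are illustrative, and realloc(NULL, n) behaving like malloc(n) is the simplification the LH comment above alludes to:

#include <limits.h>
#include <stdlib.h>
#include <string.h>

#define EXTRA 2u   /* illustrative stand-in for OPJ_COMMON_CBLK_DATA_EXTRA */

/* Append one tile-part payload to a growing tile buffer, keeping EXTRA
 * spare bytes for a synthetic marker.  Returns 0 on failure, in which
 * case the caller still owns and frees *buf. */
static int append_part(unsigned char **buf, unsigned *len,
                       const unsigned char *part, unsigned part_len)
{
    if (part_len > UINT_MAX - EXTRA - *len) {
        return 0;                              /* size math would wrap */
    }
    unsigned char *grown =
        (unsigned char *)realloc(*buf, *len + part_len + EXTRA);
    if (grown == NULL) {
        return 0;
    }
    memcpy(grown + *len, part, part_len);
    *buf = grown;
    *len += part_len;
    return 1;
}

int main(void)
{
    unsigned char *tile = NULL;
    unsigned tile_len = 0;
    const unsigned char part[] = { 0xDE, 0xAD };
    int ok = append_part(&tile, &tile_len, part, 2u);
    free(tile);
    return ok ? 0 : 1;
}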
*/ - } - *l_current_data = l_new_current_data; - } - - if (*l_current_data == 00) { - opj_event_msg(p_manager, EVT_ERROR, "Not enough memory to decode tile\n"); - return OPJ_FALSE; - } - } else { - l_sot_length_pb_detected = OPJ_TRUE; - } - - /* Index */ - l_cstr_index = p_j2k->cstr_index; - if (l_cstr_index) { - OPJ_OFF_T l_current_pos = opj_stream_tell(p_stream) - 2; - - OPJ_UINT32 l_current_tile_part = - l_cstr_index->tile_index[p_j2k->m_current_tile_number].current_tpsno; - l_cstr_index->tile_index[p_j2k->m_current_tile_number].tp_index[l_current_tile_part].end_header - = - l_current_pos; - l_cstr_index->tile_index[p_j2k->m_current_tile_number].tp_index[l_current_tile_part].end_pos - = - l_current_pos + p_j2k->m_specific_param.m_decoder.m_sot_length + 2; - - if (OPJ_FALSE == opj_j2k_add_tlmarker(p_j2k->m_current_tile_number, - l_cstr_index, - J2K_MS_SOD, - l_current_pos, - p_j2k->m_specific_param.m_decoder.m_sot_length + 2)) { - opj_event_msg(p_manager, EVT_ERROR, "Not enough memory to add tl marker\n"); - return OPJ_FALSE; - } - - /*l_cstr_index->packno = 0;*/ - } - - /* Patch to support new PHR data */ - if (!l_sot_length_pb_detected) { - l_current_read_size = opj_stream_read_data( - p_stream, - *l_current_data + *l_tile_len, - p_j2k->m_specific_param.m_decoder.m_sot_length, - p_manager); - } else { - l_current_read_size = 0; - } - - if (l_current_read_size != p_j2k->m_specific_param.m_decoder.m_sot_length) { - p_j2k->m_specific_param.m_decoder.m_state = J2K_STATE_NEOC; - } else { - p_j2k->m_specific_param.m_decoder.m_state = J2K_STATE_TPHSOT; - } - - *l_tile_len += (OPJ_UINT32)l_current_read_size; - - return OPJ_TRUE; -} - -static OPJ_BOOL opj_j2k_write_rgn(opj_j2k_t *p_j2k, - OPJ_UINT32 p_tile_no, - OPJ_UINT32 p_comp_no, - OPJ_UINT32 nb_comps, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager - ) -{ - OPJ_BYTE * l_current_data = 00; - OPJ_UINT32 l_rgn_size; - opj_cp_t *l_cp = 00; - opj_tcp_t *l_tcp = 00; - opj_tccp_t *l_tccp = 00; - OPJ_UINT32 l_comp_room; - - /* preconditions */ - assert(p_j2k != 00); - assert(p_manager != 00); - assert(p_stream != 00); - - l_cp = &(p_j2k->m_cp); - l_tcp = &l_cp->tcps[p_tile_no]; - l_tccp = &l_tcp->tccps[p_comp_no]; - - if (nb_comps <= 256) { - l_comp_room = 1; - } else { - l_comp_room = 2; - } - - l_rgn_size = 6 + l_comp_room; - - l_current_data = p_j2k->m_specific_param.m_encoder.m_header_tile_data; - - opj_write_bytes(l_current_data, J2K_MS_RGN, - 2); /* RGN */ - l_current_data += 2; - - opj_write_bytes(l_current_data, l_rgn_size - 2, - 2); /* Lrgn */ - l_current_data += 2; - - opj_write_bytes(l_current_data, p_comp_no, - l_comp_room); /* Crgn */ - l_current_data += l_comp_room; - - opj_write_bytes(l_current_data, 0, - 1); /* Srgn */ - ++l_current_data; - - opj_write_bytes(l_current_data, (OPJ_UINT32)l_tccp->roishift, - 1); /* SPrgn */ - ++l_current_data; - - if (opj_stream_write_data(p_stream, - p_j2k->m_specific_param.m_encoder.m_header_tile_data, l_rgn_size, - p_manager) != l_rgn_size) { - return OPJ_FALSE; - } - - return OPJ_TRUE; -} - -static OPJ_BOOL opj_j2k_write_eoc(opj_j2k_t *p_j2k, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager - ) -{ - /* preconditions */ - assert(p_j2k != 00); - assert(p_manager != 00); - assert(p_stream != 00); - - opj_write_bytes(p_j2k->m_specific_param.m_encoder.m_header_tile_data, - J2K_MS_EOC, 2); /* EOC */ - - /* UniPG>> */ -#ifdef USE_JPWL - /* update markers struct */ - /* - OPJ_BOOL res = j2k_add_marker(p_j2k->cstr_info, J2K_MS_EOC, p_stream_tell(p_stream) - 2, 2); - */ 
-#endif /* USE_JPWL */ - - if (opj_stream_write_data(p_stream, - p_j2k->m_specific_param.m_encoder.m_header_tile_data, 2, p_manager) != 2) { - return OPJ_FALSE; - } - - if (! opj_stream_flush(p_stream, p_manager)) { - return OPJ_FALSE; - } - - return OPJ_TRUE; -} - -/** - * Reads a RGN marker (Region Of Interest) - * - * @param p_header_data the data contained in the POC box. - * @param p_j2k the jpeg2000 codec. - * @param p_header_size the size of the data contained in the POC marker. - * @param p_manager the user event manager. -*/ -static OPJ_BOOL opj_j2k_read_rgn(opj_j2k_t *p_j2k, - OPJ_BYTE * p_header_data, - OPJ_UINT32 p_header_size, - opj_event_mgr_t * p_manager - ) -{ - OPJ_UINT32 l_nb_comp; - opj_image_t * l_image = 00; - - opj_cp_t *l_cp = 00; - opj_tcp_t *l_tcp = 00; - OPJ_UINT32 l_comp_room, l_comp_no, l_roi_sty; - - /* preconditions*/ - assert(p_header_data != 00); - assert(p_j2k != 00); - assert(p_manager != 00); - - l_image = p_j2k->m_private_image; - l_nb_comp = l_image->numcomps; - - if (l_nb_comp <= 256) { - l_comp_room = 1; - } else { - l_comp_room = 2; - } - - if (p_header_size != 2 + l_comp_room) { - opj_event_msg(p_manager, EVT_ERROR, "Error reading RGN marker\n"); - return OPJ_FALSE; - } - - l_cp = &(p_j2k->m_cp); - l_tcp = (p_j2k->m_specific_param.m_decoder.m_state == J2K_STATE_TPH) ? - &l_cp->tcps[p_j2k->m_current_tile_number] : - p_j2k->m_specific_param.m_decoder.m_default_tcp; - - opj_read_bytes(p_header_data, &l_comp_no, l_comp_room); /* Crgn */ - p_header_data += l_comp_room; - opj_read_bytes(p_header_data, &l_roi_sty, - 1); /* Srgn */ - ++p_header_data; - -#ifdef USE_JPWL - if (l_cp->correct) { - /* totlen is negative or larger than the bytes left!!! */ - if (l_comp_room >= l_nb_comp) { - opj_event_msg(p_manager, EVT_ERROR, - "JPWL: bad component number in RGN (%d when there are only %d)\n", - l_comp_room, l_nb_comp); - if (!JPWL_ASSUME) { - opj_event_msg(p_manager, EVT_ERROR, "JPWL: giving up\n"); - return OPJ_FALSE; - } - } - }; -#endif /* USE_JPWL */ - - /* testcase 3635.pdf.asan.77.2930 */ - if (l_comp_no >= l_nb_comp) { - opj_event_msg(p_manager, EVT_ERROR, - "bad component number in RGN (%d when there are only %d)\n", - l_comp_no, l_nb_comp); - return OPJ_FALSE; - } - - opj_read_bytes(p_header_data, - (OPJ_UINT32 *)(&(l_tcp->tccps[l_comp_no].roishift)), 1); /* SPrgn */ - ++p_header_data; - - return OPJ_TRUE; - -} - -static OPJ_FLOAT32 opj_j2k_get_tp_stride(opj_tcp_t * p_tcp) -{ - return (OPJ_FLOAT32)((p_tcp->m_nb_tile_parts - 1) * 14); -} - -static OPJ_FLOAT32 opj_j2k_get_default_stride(opj_tcp_t * p_tcp) -{ - (void)p_tcp; - return 0; -} - -static OPJ_BOOL opj_j2k_update_rates(opj_j2k_t *p_j2k, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager) -{ - opj_cp_t * l_cp = 00; - opj_image_t * l_image = 00; - opj_tcp_t * l_tcp = 00; - opj_image_comp_t * l_img_comp = 00; - - OPJ_UINT32 i, j, k; - OPJ_INT32 l_x0, l_y0, l_x1, l_y1; - OPJ_FLOAT32 * l_rates = 0; - OPJ_FLOAT32 l_sot_remove; - OPJ_UINT32 l_bits_empty, l_size_pixel; - OPJ_UINT32 l_tile_size = 0; - OPJ_UINT32 l_last_res; - OPJ_FLOAT32(* l_tp_stride_func)(opj_tcp_t *) = 00; - - /* preconditions */ - assert(p_j2k != 00); - assert(p_manager != 00); - assert(p_stream != 00); - - OPJ_UNUSED(p_manager); - - l_cp = &(p_j2k->m_cp); - l_image = p_j2k->m_private_image; - l_tcp = l_cp->tcps; - - l_bits_empty = 8 * l_image->comps->dx * l_image->comps->dy; - l_size_pixel = l_image->numcomps * l_image->comps->prec; - l_sot_remove = (OPJ_FLOAT32) opj_stream_tell(p_stream) / (OPJ_FLOAT32)( - l_cp->th * 
l_cp->tw); - - if (l_cp->m_specific_param.m_enc.m_tp_on) { - l_tp_stride_func = opj_j2k_get_tp_stride; - } else { - l_tp_stride_func = opj_j2k_get_default_stride; - } - - for (i = 0; i < l_cp->th; ++i) { - for (j = 0; j < l_cp->tw; ++j) { - OPJ_FLOAT32 l_offset = (OPJ_FLOAT32)(*l_tp_stride_func)(l_tcp) / - (OPJ_FLOAT32)l_tcp->numlayers; - - /* 4 borders of the tile rescale on the image if necessary */ - l_x0 = opj_int_max((OPJ_INT32)(l_cp->tx0 + j * l_cp->tdx), - (OPJ_INT32)l_image->x0); - l_y0 = opj_int_max((OPJ_INT32)(l_cp->ty0 + i * l_cp->tdy), - (OPJ_INT32)l_image->y0); - l_x1 = opj_int_min((OPJ_INT32)(l_cp->tx0 + (j + 1) * l_cp->tdx), - (OPJ_INT32)l_image->x1); - l_y1 = opj_int_min((OPJ_INT32)(l_cp->ty0 + (i + 1) * l_cp->tdy), - (OPJ_INT32)l_image->y1); - - l_rates = l_tcp->rates; - - /* Modification of the RATE >> */ - if (*l_rates > 0.0f) { - *l_rates = (((OPJ_FLOAT32)(l_size_pixel * (OPJ_UINT32)(l_x1 - l_x0) * - (OPJ_UINT32)(l_y1 - l_y0))) - / - ((*l_rates) * (OPJ_FLOAT32)l_bits_empty) - ) - - - l_offset; - } - - ++l_rates; - - for (k = 1; k < l_tcp->numlayers; ++k) { - if (*l_rates > 0.0f) { - *l_rates = (((OPJ_FLOAT32)(l_size_pixel * (OPJ_UINT32)(l_x1 - l_x0) * - (OPJ_UINT32)(l_y1 - l_y0))) - / - ((*l_rates) * (OPJ_FLOAT32)l_bits_empty) - ) - - - l_offset; - } - - ++l_rates; - } - - ++l_tcp; - - } - } - - l_tcp = l_cp->tcps; - - for (i = 0; i < l_cp->th; ++i) { - for (j = 0; j < l_cp->tw; ++j) { - l_rates = l_tcp->rates; - - if (*l_rates > 0.0f) { - *l_rates -= l_sot_remove; - - if (*l_rates < 30.0f) { - *l_rates = 30.0f; - } - } - - ++l_rates; - - l_last_res = l_tcp->numlayers - 1; - - for (k = 1; k < l_last_res; ++k) { - - if (*l_rates > 0.0f) { - *l_rates -= l_sot_remove; - - if (*l_rates < * (l_rates - 1) + 10.0f) { - *l_rates = (*(l_rates - 1)) + 20.0f; - } - } - - ++l_rates; - } - - if (*l_rates > 0.0f) { - *l_rates -= (l_sot_remove + 2.f); - - if (*l_rates < * (l_rates - 1) + 10.0f) { - *l_rates = (*(l_rates - 1)) + 20.0f; - } - } - - ++l_tcp; - } - } - - l_img_comp = l_image->comps; - l_tile_size = 0; - - for (i = 0; i < l_image->numcomps; ++i) { - l_tile_size += (opj_uint_ceildiv(l_cp->tdx, l_img_comp->dx) - * - opj_uint_ceildiv(l_cp->tdy, l_img_comp->dy) - * - l_img_comp->prec - ); - - ++l_img_comp; - } - - /* TODO: where does this magic value come from ? */ - /* This used to be 1.3 / 8, but with random data and very small code */ - /* block sizes, this is not enough. For example with */ - /* bin/test_tile_encoder 1 256 256 32 32 8 0 reversible_with_precinct.j2k 4 4 3 0 0 1 16 16 */ - /* TODO revise this to take into account the overhead linked to the */ - /* number of packets and number of code blocks in packets */ - l_tile_size = (OPJ_UINT32)(l_tile_size * 1.4 / 8); - - /* Arbitrary amount to make the following work: */ - /* bin/test_tile_encoder 1 256 256 17 16 8 0 reversible_no_precinct.j2k 4 4 3 0 0 1 */ - l_tile_size += 500; - - l_tile_size += opj_j2k_get_specific_header_sizes(p_j2k); - - p_j2k->m_specific_param.m_encoder.m_encoded_tile_size = l_tile_size; - p_j2k->m_specific_param.m_encoder.m_encoded_tile_data = - (OPJ_BYTE *) opj_malloc(p_j2k->m_specific_param.m_encoder.m_encoded_tile_size); - if (p_j2k->m_specific_param.m_encoder.m_encoded_tile_data == 00) { - return OPJ_FALSE; - } - - if (OPJ_IS_CINEMA(l_cp->rsiz)) { - p_j2k->m_specific_param.m_encoder.m_tlm_sot_offsets_buffer = - (OPJ_BYTE *) opj_malloc(5 * - p_j2k->m_specific_param.m_encoder.m_total_tile_parts); - if (! 
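The rate expression above converts a requested compression ratio into a per-tile byte budget: (numcomps * prec * width * height) / (ratio * 8 * dx * dy), from which the tile-part overhead offset is then subtracted. A worked instance under simple assumptions (one 8-bit component, no subsampling, a 256x256 tile):

#include <stdio.h>

int main(void)
{
    float numcomps = 1.0f, prec = 8.0f;   /* l_size_pixel = 8 bits */
    float dx = 1.0f, dy = 1.0f;           /* l_bits_empty = 8      */
    float w = 256.0f, h = 256.0f;
    float ratio = 20.0f;                  /* requested 20:1        */

    float bytes = (numcomps * prec * w * h) / (ratio * 8.0f * dx * dy);
    printf("budget = %.1f bytes\n", bytes);   /* 65536/20 = 3276.8 */
    return 0;
}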
p_j2k->m_specific_param.m_encoder.m_tlm_sot_offsets_buffer) { - return OPJ_FALSE; - } - - p_j2k->m_specific_param.m_encoder.m_tlm_sot_offsets_current = - p_j2k->m_specific_param.m_encoder.m_tlm_sot_offsets_buffer; - } - - return OPJ_TRUE; -} - -#if 0 -static OPJ_BOOL opj_j2k_read_eoc(opj_j2k_t *p_j2k, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager) -{ - OPJ_UINT32 i; - opj_tcd_t * l_tcd = 00; - OPJ_UINT32 l_nb_tiles; - opj_tcp_t * l_tcp = 00; - OPJ_BOOL l_success; - - /* preconditions */ - assert(p_j2k != 00); - assert(p_manager != 00); - assert(p_stream != 00); - - l_nb_tiles = p_j2k->m_cp.th * p_j2k->m_cp.tw; - l_tcp = p_j2k->m_cp.tcps; - - l_tcd = opj_tcd_create(OPJ_TRUE); - if (l_tcd == 00) { - opj_event_msg(p_manager, EVT_ERROR, "Cannot decode tile, memory error\n"); - return OPJ_FALSE; - } - - for (i = 0; i < l_nb_tiles; ++i) { - if (l_tcp->m_data) { - if (! opj_tcd_init_decode_tile(l_tcd, i)) { - opj_tcd_destroy(l_tcd); - opj_event_msg(p_manager, EVT_ERROR, "Cannot decode tile, memory error\n"); - return OPJ_FALSE; - } - - l_success = opj_tcd_decode_tile(l_tcd, l_tcp->m_data, l_tcp->m_data_size, i, - p_j2k->cstr_index); - /* cleanup */ - - if (! l_success) { - p_j2k->m_specific_param.m_decoder.m_state |= J2K_STATE_ERR; - break; - } - } - - opj_j2k_tcp_destroy(l_tcp); - ++l_tcp; - } - - opj_tcd_destroy(l_tcd); - return OPJ_TRUE; -} -#endif - -static OPJ_BOOL opj_j2k_get_end_header(opj_j2k_t *p_j2k, - struct opj_stream_private *p_stream, - struct opj_event_mgr * p_manager) -{ - /* preconditions */ - assert(p_j2k != 00); - assert(p_manager != 00); - assert(p_stream != 00); - - OPJ_UNUSED(p_manager); - - p_j2k->cstr_index->main_head_end = opj_stream_tell(p_stream); - - return OPJ_TRUE; -} - -static OPJ_BOOL opj_j2k_write_mct_data_group(opj_j2k_t *p_j2k, - struct opj_stream_private *p_stream, - struct opj_event_mgr * p_manager) -{ - OPJ_UINT32 i; - opj_simple_mcc_decorrelation_data_t * l_mcc_record; - opj_mct_data_t * l_mct_record; - opj_tcp_t * l_tcp; - - /* preconditions */ - assert(p_j2k != 00); - assert(p_stream != 00); - assert(p_manager != 00); - - if (! opj_j2k_write_cbd(p_j2k, p_stream, p_manager)) { - return OPJ_FALSE; - } - - l_tcp = &(p_j2k->m_cp.tcps[p_j2k->m_current_tile_number]); - l_mct_record = l_tcp->m_mct_records; - - for (i = 0; i < l_tcp->m_nb_mct_records; ++i) { - - if (! opj_j2k_write_mct_record(p_j2k, l_mct_record, p_stream, p_manager)) { - return OPJ_FALSE; - } - - ++l_mct_record; - } - - l_mcc_record = l_tcp->m_mcc_records; - - for (i = 0; i < l_tcp->m_nb_mcc_records; ++i) { - - if (! opj_j2k_write_mcc_record(p_j2k, l_mcc_record, p_stream, p_manager)) { - return OPJ_FALSE; - } - - ++l_mcc_record; - } - - if (! opj_j2k_write_mco(p_j2k, p_stream, p_manager)) { - return OPJ_FALSE; - } - - return OPJ_TRUE; -} - -static OPJ_BOOL opj_j2k_write_all_coc( - opj_j2k_t *p_j2k, - struct opj_stream_private *p_stream, - struct opj_event_mgr * p_manager) -{ - OPJ_UINT32 compno; - - /* preconditions */ - assert(p_j2k != 00); - assert(p_manager != 00); - assert(p_stream != 00); - - for (compno = 1; compno < p_j2k->m_private_image->numcomps; ++compno) { - /* cod is first component of first tile */ - if (! opj_j2k_compare_coc(p_j2k, 0, compno)) { - if (! 
opj_j2k_write_coc(p_j2k, compno, p_stream, p_manager)) { - return OPJ_FALSE; - } - } - } - - return OPJ_TRUE; -} - -static OPJ_BOOL opj_j2k_write_all_qcc( - opj_j2k_t *p_j2k, - struct opj_stream_private *p_stream, - struct opj_event_mgr * p_manager) -{ - OPJ_UINT32 compno; - - /* preconditions */ - assert(p_j2k != 00); - assert(p_manager != 00); - assert(p_stream != 00); - - for (compno = 1; compno < p_j2k->m_private_image->numcomps; ++compno) { - /* qcd is first component of first tile */ - if (! opj_j2k_compare_qcc(p_j2k, 0, compno)) { - if (! opj_j2k_write_qcc(p_j2k, compno, p_stream, p_manager)) { - return OPJ_FALSE; - } - } - } - return OPJ_TRUE; -} - -static OPJ_BOOL opj_j2k_write_regions(opj_j2k_t *p_j2k, - struct opj_stream_private *p_stream, - struct opj_event_mgr * p_manager) -{ - OPJ_UINT32 compno; - const opj_tccp_t *l_tccp = 00; - - /* preconditions */ - assert(p_j2k != 00); - assert(p_manager != 00); - assert(p_stream != 00); - - l_tccp = p_j2k->m_cp.tcps->tccps; - - for (compno = 0; compno < p_j2k->m_private_image->numcomps; ++compno) { - if (l_tccp->roishift) { - - if (! opj_j2k_write_rgn(p_j2k, 0, compno, p_j2k->m_private_image->numcomps, - p_stream, p_manager)) { - return OPJ_FALSE; - } - } - - ++l_tccp; - } - - return OPJ_TRUE; -} - -static OPJ_BOOL opj_j2k_write_epc(opj_j2k_t *p_j2k, - struct opj_stream_private *p_stream, - struct opj_event_mgr * p_manager) -{ - opj_codestream_index_t * l_cstr_index = 00; - - /* preconditions */ - assert(p_j2k != 00); - assert(p_manager != 00); - assert(p_stream != 00); - - OPJ_UNUSED(p_manager); - - l_cstr_index = p_j2k->cstr_index; - if (l_cstr_index) { - l_cstr_index->codestream_size = (OPJ_UINT64)opj_stream_tell(p_stream); - /* UniPG>> */ - /* The following adjustment is done to adjust the codestream size */ - /* if SOD is not at 0 in the buffer. 
Useful in case of JP2, where */ - /* the first bunch of bytes is not in the codestream */ - l_cstr_index->codestream_size -= (OPJ_UINT64)l_cstr_index->main_head_start; - /* <<UniPG */ - } - - /* UniPG>> */ -#ifdef USE_JPWL - /* preparation of JPWL marker segments */ -#if 0 - if (cp->epc_on) { - - /* encode according to JPWL */ - jpwl_encode(p_j2k, p_stream, image); - - } -#endif - assert(0 && "TODO"); -#endif /* USE_JPWL */ - - return OPJ_TRUE; -} - -static OPJ_BOOL opj_j2k_read_unk(opj_j2k_t *p_j2k, - opj_stream_private_t *p_stream, - OPJ_UINT32 *output_marker, - opj_event_mgr_t * p_manager - ) -{ - OPJ_UINT32 l_unknown_marker; - const opj_dec_memory_marker_handler_t * l_marker_handler; - OPJ_UINT32 l_size_unk = 2; - - /* preconditions*/ - assert(p_j2k != 00); - assert(p_manager != 00); - assert(p_stream != 00); - - opj_event_msg(p_manager, EVT_WARNING, "Unknown marker\n"); - - for (;;) { - /* Try to read 2 bytes (the next marker ID) from stream and copy them into the buffer*/ - if (opj_stream_read_data(p_stream, - p_j2k->m_specific_param.m_decoder.m_header_data, 2, p_manager) != 2) { - opj_event_msg(p_manager, EVT_ERROR, "Stream too short\n"); - return OPJ_FALSE; - } - - /* read 2 bytes as the new marker ID*/ - opj_read_bytes(p_j2k->m_specific_param.m_decoder.m_header_data, - &l_unknown_marker, 2); - - if (!(l_unknown_marker < 0xff00)) { - - /* Get the marker handler from the marker ID*/ - l_marker_handler = opj_j2k_get_marker_handler(l_unknown_marker); - - if (!(p_j2k->m_specific_param.m_decoder.m_state & l_marker_handler->states)) { - opj_event_msg(p_manager, EVT_ERROR, - "Marker is not compliant with its position\n"); - return OPJ_FALSE; - } else { - if (l_marker_handler->id != J2K_MS_UNK) { - /* Add the marker to the codestream index*/ - if (l_marker_handler->id != J2K_MS_SOT) { - OPJ_BOOL res = opj_j2k_add_mhmarker(p_j2k->cstr_index, J2K_MS_UNK, - (OPJ_UINT32) opj_stream_tell(p_stream) - l_size_unk, - l_size_unk); - if (res == OPJ_FALSE) { - opj_event_msg(p_manager, EVT_ERROR, "Not enough memory to add mh marker\n"); - return OPJ_FALSE; - } - } - break; /* next marker is known and well located */ - } else { - l_size_unk += 2; - } - } - } - } - - *output_marker = l_marker_handler->id ; - - return OPJ_TRUE; -} - -static OPJ_BOOL opj_j2k_write_mct_record(opj_j2k_t *p_j2k, - opj_mct_data_t * p_mct_record, - struct opj_stream_private *p_stream, - struct opj_event_mgr * p_manager) -{ - OPJ_UINT32 l_mct_size; - OPJ_BYTE * l_current_data = 00; - OPJ_UINT32 l_tmp; - - /* preconditions */ - assert(p_j2k != 00); - assert(p_manager != 00); - assert(p_stream != 00); - - l_mct_size = 10 + p_mct_record->m_data_size; - - if (l_mct_size > p_j2k->m_specific_param.m_encoder.m_header_tile_data_size) { - OPJ_BYTE *new_header_tile_data = (OPJ_BYTE *) opj_realloc( - p_j2k->m_specific_param.m_encoder.m_header_tile_data, l_mct_size); - if (!
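opj_j2k_read_unk above resynchronises by consuming two bytes at a time until the value looks like a marker ID again (at least 0xFF00) whose handler is legal in the current decoder state. The core scan, on a made-up byte sequence:

#include <stddef.h>
#include <stdio.h>

int main(void)
{
    const unsigned char bytes[] = { 0x12, 0x34, 0x00, 0x01, 0xFF, 0x90 };
    unsigned size_unk = 2;      /* the unknown marker itself, as above */

    for (size_t i = 0; i + 1 < sizeof(bytes); i += 2) {
        unsigned id = ((unsigned)bytes[i] << 8) | bytes[i + 1];
        if (id >= 0xff00u) {    /* plausible marker ID: stop skipping */
            printf("resynced on 0x%04X after %u unknown bytes\n",
                   id, size_unk);
            break;
        }
        size_unk += 2;          /* still payload of the unknown marker */
    }
    return 0;
}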
new_header_tile_data) { - opj_free(p_j2k->m_specific_param.m_encoder.m_header_tile_data); - p_j2k->m_specific_param.m_encoder.m_header_tile_data = NULL; - p_j2k->m_specific_param.m_encoder.m_header_tile_data_size = 0; - opj_event_msg(p_manager, EVT_ERROR, "Not enough memory to write MCT marker\n"); - return OPJ_FALSE; - } - p_j2k->m_specific_param.m_encoder.m_header_tile_data = new_header_tile_data; - p_j2k->m_specific_param.m_encoder.m_header_tile_data_size = l_mct_size; - } - - l_current_data = p_j2k->m_specific_param.m_encoder.m_header_tile_data; - - opj_write_bytes(l_current_data, J2K_MS_MCT, - 2); /* MCT */ - l_current_data += 2; - - opj_write_bytes(l_current_data, l_mct_size - 2, - 2); /* Lmct */ - l_current_data += 2; - - opj_write_bytes(l_current_data, 0, - 2); /* Zmct */ - l_current_data += 2; - - /* only one marker atm */ - l_tmp = (p_mct_record->m_index & 0xff) | (p_mct_record->m_array_type << 8) | - (p_mct_record->m_element_type << 10); - - opj_write_bytes(l_current_data, l_tmp, 2); - l_current_data += 2; - - opj_write_bytes(l_current_data, 0, - 2); /* Ymct */ - l_current_data += 2; - - memcpy(l_current_data, p_mct_record->m_data, p_mct_record->m_data_size); - - if (opj_stream_write_data(p_stream, - p_j2k->m_specific_param.m_encoder.m_header_tile_data, l_mct_size, - p_manager) != l_mct_size) { - return OPJ_FALSE; - } - - return OPJ_TRUE; -} - -/** - * Reads a MCT marker (Multiple Component Transform) - * - * @param p_header_data the data contained in the MCT box. - * @param p_j2k the jpeg2000 codec. - * @param p_header_size the size of the data contained in the MCT marker. - * @param p_manager the user event manager. -*/ -static OPJ_BOOL opj_j2k_read_mct(opj_j2k_t *p_j2k, - OPJ_BYTE * p_header_data, - OPJ_UINT32 p_header_size, - opj_event_mgr_t * p_manager - ) -{ - OPJ_UINT32 i; - opj_tcp_t *l_tcp = 00; - OPJ_UINT32 l_tmp; - OPJ_UINT32 l_indix; - opj_mct_data_t * l_mct_data; - - /* preconditions */ - assert(p_header_data != 00); - assert(p_j2k != 00); - - l_tcp = p_j2k->m_specific_param.m_decoder.m_state == J2K_STATE_TPH ? - &p_j2k->m_cp.tcps[p_j2k->m_current_tile_number] : - p_j2k->m_specific_param.m_decoder.m_default_tcp; - - if (p_header_size < 2) { - opj_event_msg(p_manager, EVT_ERROR, "Error reading MCT marker\n"); - return OPJ_FALSE; - } - - /* first marker */ - opj_read_bytes(p_header_data, &l_tmp, 2); /* Zmct */ - p_header_data += 2; - if (l_tmp != 0) { - opj_event_msg(p_manager, EVT_WARNING, - "Cannot take in charge mct data within multiple MCT records\n"); - return OPJ_TRUE; - } - - if (p_header_size <= 6) { - opj_event_msg(p_manager, EVT_ERROR, "Error reading MCT marker\n"); - return OPJ_FALSE; - } - - /* Imct -> no need for other values, take the first, type is double with decorrelation x0000 1101 0000 0000*/ - opj_read_bytes(p_header_data, &l_tmp, 2); /* Imct */ - p_header_data += 2; - - l_indix = l_tmp & 0xff; - l_mct_data = l_tcp->m_mct_records; - - for (i = 0; i < l_tcp->m_nb_mct_records; ++i) { - if (l_mct_data->m_index == l_indix) { - break; - } - ++l_mct_data; - } - - /* NOT FOUND */ - if (i == l_tcp->m_nb_mct_records) { - if (l_tcp->m_nb_mct_records == l_tcp->m_nb_max_mct_records) { - opj_mct_data_t *new_mct_records; - l_tcp->m_nb_max_mct_records += OPJ_J2K_MCT_DEFAULT_NB_RECORDS; - - new_mct_records = (opj_mct_data_t *) opj_realloc(l_tcp->m_mct_records, - l_tcp->m_nb_max_mct_records * sizeof(opj_mct_data_t)); - if (! 
new_mct_records) { - opj_free(l_tcp->m_mct_records); - l_tcp->m_mct_records = NULL; - l_tcp->m_nb_max_mct_records = 0; - l_tcp->m_nb_mct_records = 0; - opj_event_msg(p_manager, EVT_ERROR, "Not enough memory to read MCT marker\n"); - return OPJ_FALSE; - } - - /* Update m_mcc_records[].m_offset_array and m_decorrelation_array - * to point to the new addresses */ - if (new_mct_records != l_tcp->m_mct_records) { - for (i = 0; i < l_tcp->m_nb_mcc_records; ++i) { - opj_simple_mcc_decorrelation_data_t* l_mcc_record = - &(l_tcp->m_mcc_records[i]); - if (l_mcc_record->m_decorrelation_array) { - l_mcc_record->m_decorrelation_array = - new_mct_records + - (l_mcc_record->m_decorrelation_array - - l_tcp->m_mct_records); - } - if (l_mcc_record->m_offset_array) { - l_mcc_record->m_offset_array = - new_mct_records + - (l_mcc_record->m_offset_array - - l_tcp->m_mct_records); - } - } - } - - l_tcp->m_mct_records = new_mct_records; - l_mct_data = l_tcp->m_mct_records + l_tcp->m_nb_mct_records; - memset(l_mct_data, 0, (l_tcp->m_nb_max_mct_records - l_tcp->m_nb_mct_records) * - sizeof(opj_mct_data_t)); - } - - l_mct_data = l_tcp->m_mct_records + l_tcp->m_nb_mct_records; - ++l_tcp->m_nb_mct_records; - } - - if (l_mct_data->m_data) { - opj_free(l_mct_data->m_data); - l_mct_data->m_data = 00; - l_mct_data->m_data_size = 0; - } - - l_mct_data->m_index = l_indix; - l_mct_data->m_array_type = (J2K_MCT_ARRAY_TYPE)((l_tmp >> 8) & 3); - l_mct_data->m_element_type = (J2K_MCT_ELEMENT_TYPE)((l_tmp >> 10) & 3); - - opj_read_bytes(p_header_data, &l_tmp, 2); /* Ymct */ - p_header_data += 2; - if (l_tmp != 0) { - opj_event_msg(p_manager, EVT_WARNING, - "Cannot take in charge multiple MCT markers\n"); - return OPJ_TRUE; - } - - p_header_size -= 6; - - l_mct_data->m_data = (OPJ_BYTE*)opj_malloc(p_header_size); - if (! l_mct_data->m_data) { - opj_event_msg(p_manager, EVT_ERROR, "Error reading MCT marker\n"); - return OPJ_FALSE; - } - memcpy(l_mct_data->m_data, p_header_data, p_header_size); - - l_mct_data->m_data_size = p_header_size; - - return OPJ_TRUE; -} - -static OPJ_BOOL opj_j2k_write_mcc_record(opj_j2k_t *p_j2k, - struct opj_simple_mcc_decorrelation_data * p_mcc_record, - struct opj_stream_private *p_stream, - struct opj_event_mgr * p_manager) -{ - OPJ_UINT32 i; - OPJ_UINT32 l_mcc_size; - OPJ_BYTE * l_current_data = 00; - OPJ_UINT32 l_nb_bytes_for_comp; - OPJ_UINT32 l_mask; - OPJ_UINT32 l_tmcc; - - /* preconditions */ - assert(p_j2k != 00); - assert(p_manager != 00); - assert(p_stream != 00); - - if (p_mcc_record->m_nb_comps > 255) { - l_nb_bytes_for_comp = 2; - l_mask = 0x8000; - } else { - l_nb_bytes_for_comp = 1; - l_mask = 0; - } - - l_mcc_size = p_mcc_record->m_nb_comps * 2 * l_nb_bytes_for_comp + 19; - if (l_mcc_size > p_j2k->m_specific_param.m_encoder.m_header_tile_data_size) { - OPJ_BYTE *new_header_tile_data = (OPJ_BYTE *) opj_realloc( - p_j2k->m_specific_param.m_encoder.m_header_tile_data, l_mcc_size); - if (! 
new_header_tile_data) { - opj_free(p_j2k->m_specific_param.m_encoder.m_header_tile_data); - p_j2k->m_specific_param.m_encoder.m_header_tile_data = NULL; - p_j2k->m_specific_param.m_encoder.m_header_tile_data_size = 0; - opj_event_msg(p_manager, EVT_ERROR, "Not enough memory to write MCC marker\n"); - return OPJ_FALSE; - } - p_j2k->m_specific_param.m_encoder.m_header_tile_data = new_header_tile_data; - p_j2k->m_specific_param.m_encoder.m_header_tile_data_size = l_mcc_size; - } - - l_current_data = p_j2k->m_specific_param.m_encoder.m_header_tile_data; - - opj_write_bytes(l_current_data, J2K_MS_MCC, - 2); /* MCC */ - l_current_data += 2; - - opj_write_bytes(l_current_data, l_mcc_size - 2, - 2); /* Lmcc */ - l_current_data += 2; - - /* first marker */ - opj_write_bytes(l_current_data, 0, - 2); /* Zmcc */ - l_current_data += 2; - - opj_write_bytes(l_current_data, p_mcc_record->m_index, - 1); /* Imcc -> no need for other values, take the first */ - ++l_current_data; - - /* only one marker atm */ - opj_write_bytes(l_current_data, 0, - 2); /* Ymcc */ - l_current_data += 2; - - opj_write_bytes(l_current_data, 1, - 2); /* Qmcc -> number of collections -> 1 */ - l_current_data += 2; - - opj_write_bytes(l_current_data, 0x1, - 1); /* Xmcci type of component transformation -> array based decorrelation */ - ++l_current_data; - - opj_write_bytes(l_current_data, p_mcc_record->m_nb_comps | l_mask, - 2); /* Nmcci number of input components involved and size for each component offset = 8 bits */ - l_current_data += 2; - - for (i = 0; i < p_mcc_record->m_nb_comps; ++i) { - opj_write_bytes(l_current_data, i, - l_nb_bytes_for_comp); /* Cmccij Component offset*/ - l_current_data += l_nb_bytes_for_comp; - } - - opj_write_bytes(l_current_data, p_mcc_record->m_nb_comps | l_mask, - 2); /* Mmcci number of output components involved and size for each component offset = 8 bits */ - l_current_data += 2; - - for (i = 0; i < p_mcc_record->m_nb_comps; ++i) { - opj_write_bytes(l_current_data, i, - l_nb_bytes_for_comp); /* Wmccij Component offset*/ - l_current_data += l_nb_bytes_for_comp; - } - - l_tmcc = ((!p_mcc_record->m_is_irreversible) & 1U) << 16; - - if (p_mcc_record->m_decorrelation_array) { - l_tmcc |= p_mcc_record->m_decorrelation_array->m_index; - } - - if (p_mcc_record->m_offset_array) { - l_tmcc |= ((p_mcc_record->m_offset_array->m_index) << 8); - } - - opj_write_bytes(l_current_data, l_tmcc, - 3); /* Tmcci : use MCT defined as number 1 and irreversible array based. */ - l_current_data += 3; - - if (opj_stream_write_data(p_stream, - p_j2k->m_specific_param.m_encoder.m_header_tile_data, l_mcc_size, - p_manager) != l_mcc_size) { - return OPJ_FALSE; - } - - return OPJ_TRUE; -} - -static OPJ_BOOL opj_j2k_read_mcc(opj_j2k_t *p_j2k, - OPJ_BYTE * p_header_data, - OPJ_UINT32 p_header_size, - opj_event_mgr_t * p_manager) -{ - OPJ_UINT32 i, j; - OPJ_UINT32 l_tmp; - OPJ_UINT32 l_indix; - opj_tcp_t * l_tcp; - opj_simple_mcc_decorrelation_data_t * l_mcc_record; - opj_mct_data_t * l_mct_data; - OPJ_UINT32 l_nb_collections; - OPJ_UINT32 l_nb_comps; - OPJ_UINT32 l_nb_bytes_by_comp; - OPJ_BOOL l_new_mcc = OPJ_FALSE; - - /* preconditions */ - assert(p_header_data != 00); - assert(p_j2k != 00); - assert(p_manager != 00); - - l_tcp = p_j2k->m_specific_param.m_decoder.m_state == J2K_STATE_TPH ? 
- &p_j2k->m_cp.tcps[p_j2k->m_current_tile_number] : - p_j2k->m_specific_param.m_decoder.m_default_tcp; - - if (p_header_size < 2) { - opj_event_msg(p_manager, EVT_ERROR, "Error reading MCC marker\n"); - return OPJ_FALSE; - } - - /* first marker */ - opj_read_bytes(p_header_data, &l_tmp, 2); /* Zmcc */ - p_header_data += 2; - if (l_tmp != 0) { - opj_event_msg(p_manager, EVT_WARNING, - "Cannot take in charge multiple data spanning\n"); - return OPJ_TRUE; - } - - if (p_header_size < 7) { - opj_event_msg(p_manager, EVT_ERROR, "Error reading MCC marker\n"); - return OPJ_FALSE; - } - - opj_read_bytes(p_header_data, &l_indix, - 1); /* Imcc -> no need for other values, take the first */ - ++p_header_data; - - l_mcc_record = l_tcp->m_mcc_records; - - for (i = 0; i < l_tcp->m_nb_mcc_records; ++i) { - if (l_mcc_record->m_index == l_indix) { - break; - } - ++l_mcc_record; - } - - /** NOT FOUND */ - if (i == l_tcp->m_nb_mcc_records) { - if (l_tcp->m_nb_mcc_records == l_tcp->m_nb_max_mcc_records) { - opj_simple_mcc_decorrelation_data_t *new_mcc_records; - l_tcp->m_nb_max_mcc_records += OPJ_J2K_MCC_DEFAULT_NB_RECORDS; - - new_mcc_records = (opj_simple_mcc_decorrelation_data_t *) opj_realloc( - l_tcp->m_mcc_records, l_tcp->m_nb_max_mcc_records * sizeof( - opj_simple_mcc_decorrelation_data_t)); - if (! new_mcc_records) { - opj_free(l_tcp->m_mcc_records); - l_tcp->m_mcc_records = NULL; - l_tcp->m_nb_max_mcc_records = 0; - l_tcp->m_nb_mcc_records = 0; - opj_event_msg(p_manager, EVT_ERROR, "Not enough memory to read MCC marker\n"); - return OPJ_FALSE; - } - l_tcp->m_mcc_records = new_mcc_records; - l_mcc_record = l_tcp->m_mcc_records + l_tcp->m_nb_mcc_records; - memset(l_mcc_record, 0, (l_tcp->m_nb_max_mcc_records - l_tcp->m_nb_mcc_records) - * sizeof(opj_simple_mcc_decorrelation_data_t)); - } - l_mcc_record = l_tcp->m_mcc_records + l_tcp->m_nb_mcc_records; - l_new_mcc = OPJ_TRUE; - } - l_mcc_record->m_index = l_indix; - - /* only one marker atm */ - opj_read_bytes(p_header_data, &l_tmp, 2); /* Ymcc */ - p_header_data += 2; - if (l_tmp != 0) { - opj_event_msg(p_manager, EVT_WARNING, - "Cannot take in charge multiple data spanning\n"); - return OPJ_TRUE; - } - - opj_read_bytes(p_header_data, &l_nb_collections, - 2); /* Qmcc -> number of collections -> 1 */ - p_header_data += 2; - - if (l_nb_collections > 1) { - opj_event_msg(p_manager, EVT_WARNING, - "Cannot take in charge multiple collections\n"); - return OPJ_TRUE; - } - - p_header_size -= 7; - - for (i = 0; i < l_nb_collections; ++i) { - if (p_header_size < 3) { - opj_event_msg(p_manager, EVT_ERROR, "Error reading MCC marker\n"); - return OPJ_FALSE; - } - - opj_read_bytes(p_header_data, &l_tmp, - 1); /* Xmcci type of component transformation -> array based decorrelation */ - ++p_header_data; - - if (l_tmp != 1) { - opj_event_msg(p_manager, EVT_WARNING, - "Cannot take in charge collections other than array decorrelation\n"); - return OPJ_TRUE; - } - - opj_read_bytes(p_header_data, &l_nb_comps, 2); - - p_header_data += 2; - p_header_size -= 3; - - l_nb_bytes_by_comp = 1 + (l_nb_comps >> 15); - l_mcc_record->m_nb_comps = l_nb_comps & 0x7fff; - - if (p_header_size < (l_nb_bytes_by_comp * l_mcc_record->m_nb_comps + 2)) { - opj_event_msg(p_manager, EVT_ERROR, "Error reading MCC marker\n"); - return OPJ_FALSE; - } - - p_header_size -= (l_nb_bytes_by_comp * l_mcc_record->m_nb_comps + 2); - - for (j = 0; j < l_mcc_record->m_nb_comps; ++j) { - opj_read_bytes(p_header_data, &l_tmp, - l_nb_bytes_by_comp); /* Cmccij Component offset*/ - p_header_data += 
l_nb_bytes_by_comp; - - if (l_tmp != j) { - opj_event_msg(p_manager, EVT_WARNING, - "Cannot take in charge collections with indix shuffle\n"); - return OPJ_TRUE; - } - } - - opj_read_bytes(p_header_data, &l_nb_comps, 2); - p_header_data += 2; - - l_nb_bytes_by_comp = 1 + (l_nb_comps >> 15); - l_nb_comps &= 0x7fff; - - if (l_nb_comps != l_mcc_record->m_nb_comps) { - opj_event_msg(p_manager, EVT_WARNING, - "Cannot take in charge collections without same number of indixes\n"); - return OPJ_TRUE; - } - - if (p_header_size < (l_nb_bytes_by_comp * l_mcc_record->m_nb_comps + 3)) { - opj_event_msg(p_manager, EVT_ERROR, "Error reading MCC marker\n"); - return OPJ_FALSE; - } - - p_header_size -= (l_nb_bytes_by_comp * l_mcc_record->m_nb_comps + 3); - - for (j = 0; j < l_mcc_record->m_nb_comps; ++j) { - opj_read_bytes(p_header_data, &l_tmp, - l_nb_bytes_by_comp); /* Wmccij Component offset*/ - p_header_data += l_nb_bytes_by_comp; - - if (l_tmp != j) { - opj_event_msg(p_manager, EVT_WARNING, - "Cannot take in charge collections with indix shuffle\n"); - return OPJ_TRUE; - } - } - - opj_read_bytes(p_header_data, &l_tmp, 3); /* Wmccij Component offset*/ - p_header_data += 3; - - l_mcc_record->m_is_irreversible = !((l_tmp >> 16) & 1); - l_mcc_record->m_decorrelation_array = 00; - l_mcc_record->m_offset_array = 00; - - l_indix = l_tmp & 0xff; - if (l_indix != 0) { - l_mct_data = l_tcp->m_mct_records; - for (j = 0; j < l_tcp->m_nb_mct_records; ++j) { - if (l_mct_data->m_index == l_indix) { - l_mcc_record->m_decorrelation_array = l_mct_data; - break; - } - ++l_mct_data; - } - - if (l_mcc_record->m_decorrelation_array == 00) { - opj_event_msg(p_manager, EVT_ERROR, "Error reading MCC marker\n"); - return OPJ_FALSE; - } - } - - l_indix = (l_tmp >> 8) & 0xff; - if (l_indix != 0) { - l_mct_data = l_tcp->m_mct_records; - for (j = 0; j < l_tcp->m_nb_mct_records; ++j) { - if (l_mct_data->m_index == l_indix) { - l_mcc_record->m_offset_array = l_mct_data; - break; - } - ++l_mct_data; - } - - if (l_mcc_record->m_offset_array == 00) { - opj_event_msg(p_manager, EVT_ERROR, "Error reading MCC marker\n"); - return OPJ_FALSE; - } - } - } - - if (p_header_size != 0) { - opj_event_msg(p_manager, EVT_ERROR, "Error reading MCC marker\n"); - return OPJ_FALSE; - } - - if (l_new_mcc) { - ++l_tcp->m_nb_mcc_records; - } - - return OPJ_TRUE; -} - -static OPJ_BOOL opj_j2k_write_mco(opj_j2k_t *p_j2k, - struct opj_stream_private *p_stream, - struct opj_event_mgr * p_manager - ) -{ - OPJ_BYTE * l_current_data = 00; - OPJ_UINT32 l_mco_size; - opj_tcp_t * l_tcp = 00; - opj_simple_mcc_decorrelation_data_t * l_mcc_record; - OPJ_UINT32 i; - - /* preconditions */ - assert(p_j2k != 00); - assert(p_manager != 00); - assert(p_stream != 00); - - l_tcp = &(p_j2k->m_cp.tcps[p_j2k->m_current_tile_number]); - - l_mco_size = 5 + l_tcp->m_nb_mcc_records; - if (l_mco_size > p_j2k->m_specific_param.m_encoder.m_header_tile_data_size) { - - OPJ_BYTE *new_header_tile_data = (OPJ_BYTE *) opj_realloc( - p_j2k->m_specific_param.m_encoder.m_header_tile_data, l_mco_size); - if (! 
new_header_tile_data) { - opj_free(p_j2k->m_specific_param.m_encoder.m_header_tile_data); - p_j2k->m_specific_param.m_encoder.m_header_tile_data = NULL; - p_j2k->m_specific_param.m_encoder.m_header_tile_data_size = 0; - opj_event_msg(p_manager, EVT_ERROR, "Not enough memory to write MCO marker\n"); - return OPJ_FALSE; - } - p_j2k->m_specific_param.m_encoder.m_header_tile_data = new_header_tile_data; - p_j2k->m_specific_param.m_encoder.m_header_tile_data_size = l_mco_size; - } - l_current_data = p_j2k->m_specific_param.m_encoder.m_header_tile_data; - - - opj_write_bytes(l_current_data, J2K_MS_MCO, 2); /* MCO */ - l_current_data += 2; - - opj_write_bytes(l_current_data, l_mco_size - 2, 2); /* Lmco */ - l_current_data += 2; - - opj_write_bytes(l_current_data, l_tcp->m_nb_mcc_records, - 1); /* Nmco : only one transform stage*/ - ++l_current_data; - - l_mcc_record = l_tcp->m_mcc_records; - for (i = 0; i < l_tcp->m_nb_mcc_records; ++i) { - opj_write_bytes(l_current_data, l_mcc_record->m_index, - 1); /* Imco -> use the mcc indicated by 1*/ - ++l_current_data; - ++l_mcc_record; - } - - if (opj_stream_write_data(p_stream, - p_j2k->m_specific_param.m_encoder.m_header_tile_data, l_mco_size, - p_manager) != l_mco_size) { - return OPJ_FALSE; - } - - return OPJ_TRUE; -} - -/** - * Reads a MCO marker (Multiple Component Transform Ordering) - * - * @param p_header_data the data contained in the MCO box. - * @param p_j2k the jpeg2000 codec. - * @param p_header_size the size of the data contained in the MCO marker. - * @param p_manager the user event manager. -*/ -static OPJ_BOOL opj_j2k_read_mco(opj_j2k_t *p_j2k, - OPJ_BYTE * p_header_data, - OPJ_UINT32 p_header_size, - opj_event_mgr_t * p_manager - ) -{ - OPJ_UINT32 l_tmp, i; - OPJ_UINT32 l_nb_stages; - opj_tcp_t * l_tcp; - opj_tccp_t * l_tccp; - opj_image_t * l_image; - - /* preconditions */ - assert(p_header_data != 00); - assert(p_j2k != 00); - assert(p_manager != 00); - - l_image = p_j2k->m_private_image; - l_tcp = p_j2k->m_specific_param.m_decoder.m_state == J2K_STATE_TPH ? - &p_j2k->m_cp.tcps[p_j2k->m_current_tile_number] : - p_j2k->m_specific_param.m_decoder.m_default_tcp; - - if (p_header_size < 1) { - opj_event_msg(p_manager, EVT_ERROR, "Error reading MCO marker\n"); - return OPJ_FALSE; - } - - opj_read_bytes(p_header_data, &l_nb_stages, - 1); /* Nmco : only one transform stage*/ - ++p_header_data; - - if (l_nb_stages > 1) { - opj_event_msg(p_manager, EVT_WARNING, - "Cannot take in charge multiple transformation stages.\n"); - return OPJ_TRUE; - } - - if (p_header_size != l_nb_stages + 1) { - opj_event_msg(p_manager, EVT_WARNING, "Error reading MCO marker\n"); - return OPJ_FALSE; - } - - l_tccp = l_tcp->tccps; - - for (i = 0; i < l_image->numcomps; ++i) { - l_tccp->m_dc_level_shift = 0; - ++l_tccp; - } - - if (l_tcp->m_mct_decoding_matrix) { - opj_free(l_tcp->m_mct_decoding_matrix); - l_tcp->m_mct_decoding_matrix = 00; - } - - for (i = 0; i < l_nb_stages; ++i) { - opj_read_bytes(p_header_data, &l_tmp, 1); - ++p_header_data; - - if (! 
opj_j2k_add_mct(l_tcp, p_j2k->m_private_image, l_tmp)) { - return OPJ_FALSE; - } - } - - return OPJ_TRUE; -} - -static OPJ_BOOL opj_j2k_add_mct(opj_tcp_t * p_tcp, opj_image_t * p_image, - OPJ_UINT32 p_index) -{ - OPJ_UINT32 i; - opj_simple_mcc_decorrelation_data_t * l_mcc_record; - opj_mct_data_t * l_deco_array, * l_offset_array; - OPJ_UINT32 l_data_size, l_mct_size, l_offset_size; - OPJ_UINT32 l_nb_elem; - OPJ_UINT32 * l_offset_data, * l_current_offset_data; - opj_tccp_t * l_tccp; - - /* preconditions */ - assert(p_tcp != 00); - - l_mcc_record = p_tcp->m_mcc_records; - - for (i = 0; i < p_tcp->m_nb_mcc_records; ++i) { - if (l_mcc_record->m_index == p_index) { - break; - } - ++l_mcc_record; - } - - if (i == p_tcp->m_nb_mcc_records) { - /** element discarded **/ - return OPJ_TRUE; - } - - if (l_mcc_record->m_nb_comps != p_image->numcomps) { - /** do not support number of comps != image */ - return OPJ_TRUE; - } - - l_deco_array = l_mcc_record->m_decorrelation_array; - - if (l_deco_array) { - l_data_size = MCT_ELEMENT_SIZE[l_deco_array->m_element_type] * p_image->numcomps - * p_image->numcomps; - if (l_deco_array->m_data_size != l_data_size) { - return OPJ_FALSE; - } - - l_nb_elem = p_image->numcomps * p_image->numcomps; - l_mct_size = l_nb_elem * (OPJ_UINT32)sizeof(OPJ_FLOAT32); - p_tcp->m_mct_decoding_matrix = (OPJ_FLOAT32*)opj_malloc(l_mct_size); - - if (! p_tcp->m_mct_decoding_matrix) { - return OPJ_FALSE; - } - - j2k_mct_read_functions_to_float[l_deco_array->m_element_type]( - l_deco_array->m_data, p_tcp->m_mct_decoding_matrix, l_nb_elem); - } - - l_offset_array = l_mcc_record->m_offset_array; - - if (l_offset_array) { - l_data_size = MCT_ELEMENT_SIZE[l_offset_array->m_element_type] * - p_image->numcomps; - if (l_offset_array->m_data_size != l_data_size) { - return OPJ_FALSE; - } - - l_nb_elem = p_image->numcomps; - l_offset_size = l_nb_elem * (OPJ_UINT32)sizeof(OPJ_UINT32); - l_offset_data = (OPJ_UINT32*)opj_malloc(l_offset_size); - - if (! l_offset_data) { - return OPJ_FALSE; - } - - j2k_mct_read_functions_to_int32[l_offset_array->m_element_type]( - l_offset_array->m_data, l_offset_data, l_nb_elem); - - l_tccp = p_tcp->tccps; - l_current_offset_data = l_offset_data; - - for (i = 0; i < p_image->numcomps; ++i) { - l_tccp->m_dc_level_shift = (OPJ_INT32) * (l_current_offset_data++); - ++l_tccp; - } - - opj_free(l_offset_data); - } - - return OPJ_TRUE; -} - -static OPJ_BOOL opj_j2k_write_cbd(opj_j2k_t *p_j2k, - struct opj_stream_private *p_stream, - struct opj_event_mgr * p_manager) -{ - OPJ_UINT32 i; - OPJ_UINT32 l_cbd_size; - OPJ_BYTE * l_current_data = 00; - opj_image_t *l_image = 00; - opj_image_comp_t * l_comp = 00; - - /* preconditions */ - assert(p_j2k != 00); - assert(p_manager != 00); - assert(p_stream != 00); - - l_image = p_j2k->m_private_image; - l_cbd_size = 6 + p_j2k->m_private_image->numcomps; - - if (l_cbd_size > p_j2k->m_specific_param.m_encoder.m_header_tile_data_size) { - OPJ_BYTE *new_header_tile_data = (OPJ_BYTE *) opj_realloc( - p_j2k->m_specific_param.m_encoder.m_header_tile_data, l_cbd_size); - if (!
new_header_tile_data) { - opj_free(p_j2k->m_specific_param.m_encoder.m_header_tile_data); - p_j2k->m_specific_param.m_encoder.m_header_tile_data = NULL; - p_j2k->m_specific_param.m_encoder.m_header_tile_data_size = 0; - opj_event_msg(p_manager, EVT_ERROR, "Not enough memory to write CBD marker\n"); - return OPJ_FALSE; - } - p_j2k->m_specific_param.m_encoder.m_header_tile_data = new_header_tile_data; - p_j2k->m_specific_param.m_encoder.m_header_tile_data_size = l_cbd_size; - } - - l_current_data = p_j2k->m_specific_param.m_encoder.m_header_tile_data; - - opj_write_bytes(l_current_data, J2K_MS_CBD, 2); /* CBD */ - l_current_data += 2; - - opj_write_bytes(l_current_data, l_cbd_size - 2, 2); /* L_CBD */ - l_current_data += 2; - - opj_write_bytes(l_current_data, l_image->numcomps, 2); /* Ncbd */ - l_current_data += 2; - - l_comp = l_image->comps; - - for (i = 0; i < l_image->numcomps; ++i) { - opj_write_bytes(l_current_data, (l_comp->sgnd << 7) | (l_comp->prec - 1), - 1); /* Component bit depth */ - ++l_current_data; - - ++l_comp; - } - - if (opj_stream_write_data(p_stream, - p_j2k->m_specific_param.m_encoder.m_header_tile_data, l_cbd_size, - p_manager) != l_cbd_size) { - return OPJ_FALSE; - } - - return OPJ_TRUE; -} - -/** - * Reads a CBD marker (Component bit depth definition) - * @param p_header_data the data contained in the CBD box. - * @param p_j2k the jpeg2000 codec. - * @param p_header_size the size of the data contained in the CBD marker. - * @param p_manager the user event manager. -*/ -static OPJ_BOOL opj_j2k_read_cbd(opj_j2k_t *p_j2k, - OPJ_BYTE * p_header_data, - OPJ_UINT32 p_header_size, - opj_event_mgr_t * p_manager - ) -{ - OPJ_UINT32 l_nb_comp, l_num_comp; - OPJ_UINT32 l_comp_def; - OPJ_UINT32 i; - opj_image_comp_t * l_comp = 00; - - /* preconditions */ - assert(p_header_data != 00); - assert(p_j2k != 00); - assert(p_manager != 00); - - l_num_comp = p_j2k->m_private_image->numcomps; - - if (p_header_size != (p_j2k->m_private_image->numcomps + 2)) { - opj_event_msg(p_manager, EVT_ERROR, "Error reading CBD marker\n"); - return OPJ_FALSE; - } - - opj_read_bytes(p_header_data, &l_nb_comp, - 2); /* Ncbd */ - p_header_data += 2; - - if (l_nb_comp != l_num_comp) { - opj_event_msg(p_manager, EVT_ERROR, "Error reading CBD marker\n"); - return OPJ_FALSE; - } - - l_comp = p_j2k->m_private_image->comps; - for (i = 0; i < l_num_comp; ++i) { - opj_read_bytes(p_header_data, &l_comp_def, - 1); /* Component bit depth */ - ++p_header_data; - l_comp->sgnd = (l_comp_def >> 7) & 1; - l_comp->prec = (l_comp_def & 0x7f) + 1; - - if (l_comp->prec > 31) { - opj_event_msg(p_manager, EVT_ERROR, - "Invalid values for comp = %d : prec=%u (should be between 1 and 38 according to the JPEG2000 norm.
OpenJpeg only supports up to 31)\n", - i, l_comp->prec); - return OPJ_FALSE; - } - ++l_comp; - } - - return OPJ_TRUE; -} - -/* ----------------------------------------------------------------------- */ -/* J2K / JPT decoder interface */ -/* ----------------------------------------------------------------------- */ - -void opj_j2k_setup_decoder(opj_j2k_t *j2k, opj_dparameters_t *parameters) -{ - if (j2k && parameters) { - j2k->m_cp.m_specific_param.m_dec.m_layer = parameters->cp_layer; - j2k->m_cp.m_specific_param.m_dec.m_reduce = parameters->cp_reduce; - - j2k->dump_state = (parameters->flags & OPJ_DPARAMETERS_DUMP_FLAG); -#ifdef USE_JPWL - j2k->m_cp.correct = parameters->jpwl_correct; - j2k->m_cp.exp_comps = parameters->jpwl_exp_comps; - j2k->m_cp.max_tiles = parameters->jpwl_max_tiles; -#endif /* USE_JPWL */ - } -} - -OPJ_BOOL opj_j2k_set_threads(opj_j2k_t *j2k, OPJ_UINT32 num_threads) -{ - /* Currently we pass the thread-pool to the tcd, so we cannot re-set it */ - /* afterwards */ - if (opj_has_thread_support() && j2k->m_tcd == NULL) { - opj_thread_pool_destroy(j2k->m_tp); - j2k->m_tp = NULL; - if (num_threads <= (OPJ_UINT32)INT_MAX) { - j2k->m_tp = opj_thread_pool_create((int)num_threads); - } - if (j2k->m_tp == NULL) { - j2k->m_tp = opj_thread_pool_create(0); - return OPJ_FALSE; - } - return OPJ_TRUE; - } - return OPJ_FALSE; -} - -static int opj_j2k_get_default_thread_count() -{ - const char* num_threads_str = getenv("OPJ_NUM_THREADS"); - int num_cpus; - int num_threads; - - if (num_threads_str == NULL || !opj_has_thread_support()) { - return 0; - } - num_cpus = opj_get_num_cpus(); - if (strcmp(num_threads_str, "ALL_CPUS") == 0) { - return num_cpus; - } - if (num_cpus == 0) { - num_cpus = 32; - } - num_threads = atoi(num_threads_str); - if (num_threads < 0) { - num_threads = 0; - } else if (num_threads > 2 * num_cpus) { - num_threads = 2 * num_cpus; - } - return num_threads; -} - -/* ----------------------------------------------------------------------- */ -/* J2K encoder interface */ -/* ----------------------------------------------------------------------- */ - -opj_j2k_t* opj_j2k_create_compress(void) -{ - opj_j2k_t *l_j2k = (opj_j2k_t*) opj_calloc(1, sizeof(opj_j2k_t)); - if (!l_j2k) { - return NULL; - } - - - l_j2k->m_is_decoder = 0; - l_j2k->m_cp.m_is_decoder = 0; - - l_j2k->m_specific_param.m_encoder.m_header_tile_data = (OPJ_BYTE *) opj_malloc( - OPJ_J2K_DEFAULT_HEADER_SIZE); - if (! l_j2k->m_specific_param.m_encoder.m_header_tile_data) { - opj_j2k_destroy(l_j2k); - return NULL; - } - - l_j2k->m_specific_param.m_encoder.m_header_tile_data_size = - OPJ_J2K_DEFAULT_HEADER_SIZE; - - /* validation list creation*/ - l_j2k->m_validation_list = opj_procedure_list_create(); - if (! l_j2k->m_validation_list) { - opj_j2k_destroy(l_j2k); - return NULL; - } - - /* execution list creation*/ - l_j2k->m_procedure_list = opj_procedure_list_create(); - if (! 
l_j2k->m_procedure_list) { - opj_j2k_destroy(l_j2k); - return NULL; - } - - l_j2k->m_tp = opj_thread_pool_create(opj_j2k_get_default_thread_count()); - if (!l_j2k->m_tp) { - l_j2k->m_tp = opj_thread_pool_create(0); - } - if (!l_j2k->m_tp) { - opj_j2k_destroy(l_j2k); - return NULL; - } - - return l_j2k; -} - -static int opj_j2k_initialise_4K_poc(opj_poc_t *POC, int numres) -{ - POC[0].tile = 1; - POC[0].resno0 = 0; - POC[0].compno0 = 0; - POC[0].layno1 = 1; - POC[0].resno1 = (OPJ_UINT32)(numres - 1); - POC[0].compno1 = 3; - POC[0].prg1 = OPJ_CPRL; - POC[1].tile = 1; - POC[1].resno0 = (OPJ_UINT32)(numres - 1); - POC[1].compno0 = 0; - POC[1].layno1 = 1; - POC[1].resno1 = (OPJ_UINT32)numres; - POC[1].compno1 = 3; - POC[1].prg1 = OPJ_CPRL; - return 2; -} - -static void opj_j2k_set_cinema_parameters(opj_cparameters_t *parameters, - opj_image_t *image, opj_event_mgr_t *p_manager) -{ - /* Configure cinema parameters */ - int i; - - /* No tiling */ - parameters->tile_size_on = OPJ_FALSE; - parameters->cp_tdx = 1; - parameters->cp_tdy = 1; - - /* One tile part for each component */ - parameters->tp_flag = 'C'; - parameters->tp_on = 1; - - /* Tile and Image shall be at (0,0) */ - parameters->cp_tx0 = 0; - parameters->cp_ty0 = 0; - parameters->image_offset_x0 = 0; - parameters->image_offset_y0 = 0; - - /* Codeblock size= 32*32 */ - parameters->cblockw_init = 32; - parameters->cblockh_init = 32; - - /* Codeblock style: no mode switch enabled */ - parameters->mode = 0; - - /* No ROI */ - parameters->roi_compno = -1; - - /* No subsampling */ - parameters->subsampling_dx = 1; - parameters->subsampling_dy = 1; - - /* 9-7 transform */ - parameters->irreversible = 1; - - /* Number of layers */ - if (parameters->tcp_numlayers > 1) { - opj_event_msg(p_manager, EVT_WARNING, - "JPEG 2000 Profile-3 and 4 (2k/4k dc profile) requires:\n" - "1 single quality layer" - "-> Number of layers forced to 1 (rather than %d)\n" - "-> Rate of the last layer (%3.1f) will be used", - parameters->tcp_numlayers, - parameters->tcp_rates[parameters->tcp_numlayers - 1]); - parameters->tcp_rates[0] = parameters->tcp_rates[parameters->tcp_numlayers - 1]; - parameters->tcp_numlayers = 1; - } - - /* Resolution levels */ - switch (parameters->rsiz) { - case OPJ_PROFILE_CINEMA_2K: - if (parameters->numresolution > 6) { - opj_event_msg(p_manager, EVT_WARNING, - "JPEG 2000 Profile-3 (2k dc profile) requires:\n" - "Number of decomposition levels <= 5\n" - "-> Number of decomposition levels forced to 5 (rather than %d)\n", - parameters->numresolution + 1); - parameters->numresolution = 6; - } - break; - case OPJ_PROFILE_CINEMA_4K: - if (parameters->numresolution < 2) { - opj_event_msg(p_manager, EVT_WARNING, - "JPEG 2000 Profile-4 (4k dc profile) requires:\n" - "Number of decomposition levels >= 1 && <= 6\n" - "-> Number of decomposition levels forced to 1 (rather than %d)\n", - parameters->numresolution + 1); - parameters->numresolution = 1; - } else if (parameters->numresolution > 7) { - opj_event_msg(p_manager, EVT_WARNING, - "JPEG 2000 Profile-4 (4k dc profile) requires:\n" - "Number of decomposition levels >= 1 && <= 6\n" - "-> Number of decomposition levels forced to 6 (rather than %d)\n", - parameters->numresolution + 1); - parameters->numresolution = 7; - } - break; - default : - break; - } - - /* Precincts */ - parameters->csty |= 0x01; - if (parameters->numresolution == 1) { - parameters->res_spec = 1; - parameters->prcw_init[0] = 128; - parameters->prch_init[0] = 128; - } else { - parameters->res_spec = parameters->numresolution - 
1; - for (i = 0; i < parameters->res_spec; i++) { - parameters->prcw_init[i] = 256; - parameters->prch_init[i] = 256; - } - } - - /* The progression order shall be CPRL */ - parameters->prog_order = OPJ_CPRL; - - /* Progression order changes for 4K, disallowed for 2K */ - if (parameters->rsiz == OPJ_PROFILE_CINEMA_4K) { - parameters->numpocs = (OPJ_UINT32)opj_j2k_initialise_4K_poc(parameters->POC, - parameters->numresolution); - } else { - parameters->numpocs = 0; - } - - /* Limited bit-rate */ - parameters->cp_disto_alloc = 1; - if (parameters->max_cs_size <= 0) { - /* No rate has been introduced, 24 fps is assumed */ - parameters->max_cs_size = OPJ_CINEMA_24_CS; - opj_event_msg(p_manager, EVT_WARNING, - "JPEG 2000 Profile-3 and 4 (2k/4k dc profile) requires:\n" - "Maximum 1302083 compressed bytes @ 24fps\n" - "As no rate has been given, this limit will be used.\n"); - } else if (parameters->max_cs_size > OPJ_CINEMA_24_CS) { - opj_event_msg(p_manager, EVT_WARNING, - "JPEG 2000 Profile-3 and 4 (2k/4k dc profile) requires:\n" - "Maximum 1302083 compressed bytes @ 24fps\n" - "-> Specified rate exceeds this limit. Rate will be forced to 1302083 bytes.\n"); - parameters->max_cs_size = OPJ_CINEMA_24_CS; - } - - if (parameters->max_comp_size <= 0) { - /* No rate has been introduced, 24 fps is assumed */ - parameters->max_comp_size = OPJ_CINEMA_24_COMP; - opj_event_msg(p_manager, EVT_WARNING, - "JPEG 2000 Profile-3 and 4 (2k/4k dc profile) requires:\n" - "Maximum 1041666 compressed bytes @ 24fps\n" - "As no rate has been given, this limit will be used.\n"); - } else if (parameters->max_comp_size > OPJ_CINEMA_24_COMP) { - opj_event_msg(p_manager, EVT_WARNING, - "JPEG 2000 Profile-3 and 4 (2k/4k dc profile) requires:\n" - "Maximum 1041666 compressed bytes @ 24fps\n" - "-> Specified rate exceeds this limit. Rate will be forced to 1041666 bytes.\n"); - parameters->max_comp_size = OPJ_CINEMA_24_COMP; - } - - parameters->tcp_rates[0] = (OPJ_FLOAT32)(image->numcomps * image->comps[0].w * - image->comps[0].h * image->comps[0].prec) / - (OPJ_FLOAT32)(((OPJ_UINT32)parameters->max_cs_size) * 8 * image->comps[0].dx * - image->comps[0].dy); - -} - -static OPJ_BOOL opj_j2k_is_cinema_compliant(opj_image_t *image, OPJ_UINT16 rsiz, - opj_event_mgr_t *p_manager) -{ - OPJ_UINT32 i; - - /* Number of components */ - if (image->numcomps != 3) { - opj_event_msg(p_manager, EVT_WARNING, - "JPEG 2000 Profile-3 (2k dc profile) requires:\n" - "3 components" - "-> Number of components of input image (%d) is not compliant\n" - "-> Non-profile-3 codestream will be generated\n", - image->numcomps); - return OPJ_FALSE; - } - - /* Bitdepth */ - for (i = 0; i < image->numcomps; i++) { - if ((image->comps[i].bpp != 12) | (image->comps[i].sgnd)) { - char signed_str[] = "signed"; - char unsigned_str[] = "unsigned"; - char *tmp_str = image->comps[i].sgnd ? 
signed_str : unsigned_str; - opj_event_msg(p_manager, EVT_WARNING, - "JPEG 2000 Profile-3 (2k dc profile) requires:\n" - "Precision of each component shall be 12 bits unsigned" - "-> At least component %d of input image (%d bits, %s) is not compliant\n" - "-> Non-profile-3 codestream will be generated\n", - i, image->comps[i].bpp, tmp_str); - return OPJ_FALSE; - } - } - - /* Image size */ - switch (rsiz) { - case OPJ_PROFILE_CINEMA_2K: - if (((image->comps[0].w > 2048) | (image->comps[0].h > 1080))) { - opj_event_msg(p_manager, EVT_WARNING, - "JPEG 2000 Profile-3 (2k dc profile) requires:\n" - "width <= 2048 and height <= 1080\n" - "-> Input image size %d x %d is not compliant\n" - "-> Non-profile-3 codestream will be generated\n", - image->comps[0].w, image->comps[0].h); - return OPJ_FALSE; - } - break; - case OPJ_PROFILE_CINEMA_4K: - if (((image->comps[0].w > 4096) | (image->comps[0].h > 2160))) { - opj_event_msg(p_manager, EVT_WARNING, - "JPEG 2000 Profile-4 (4k dc profile) requires:\n" - "width <= 4096 and height <= 2160\n" - "-> Image size %d x %d is not compliant\n" - "-> Non-profile-4 codestream will be generated\n", - image->comps[0].w, image->comps[0].h); - return OPJ_FALSE; - } - break; - default : - break; - } - - return OPJ_TRUE; -} - -OPJ_BOOL opj_j2k_setup_encoder(opj_j2k_t *p_j2k, - opj_cparameters_t *parameters, - opj_image_t *image, - opj_event_mgr_t * p_manager) -{ - OPJ_UINT32 i, j, tileno, numpocs_tile; - opj_cp_t *cp = 00; - OPJ_UINT32 cblkw, cblkh; - - if (!p_j2k || !parameters || ! image) { - return OPJ_FALSE; - } - - if ((parameters->numresolution <= 0) || - (parameters->numresolution > OPJ_J2K_MAXRLVLS)) { - opj_event_msg(p_manager, EVT_ERROR, - "Invalid number of resolutions : %d not in range [1,%d]\n", - parameters->numresolution, OPJ_J2K_MAXRLVLS); - return OPJ_FALSE; - } - - if (parameters->cblockw_init < 4 || parameters->cblockw_init > 1024) { - opj_event_msg(p_manager, EVT_ERROR, - "Invalid value for cblockw_init: %d not a power of 2 in range [4,1024]\n", - parameters->cblockw_init); - return OPJ_FALSE; - } - if (parameters->cblockh_init < 4 || parameters->cblockh_init > 1024) { - opj_event_msg(p_manager, EVT_ERROR, - "Invalid value for cblockh_init: %d not a power of 2 not in range [4,1024]\n", - parameters->cblockh_init); - return OPJ_FALSE; - } - if (parameters->cblockw_init * parameters->cblockh_init > 4096) { - opj_event_msg(p_manager, EVT_ERROR, - "Invalid value for cblockw_init * cblockh_init: should be <= 4096\n"); - return OPJ_FALSE; - } - cblkw = (OPJ_UINT32)opj_int_floorlog2(parameters->cblockw_init); - cblkh = (OPJ_UINT32)opj_int_floorlog2(parameters->cblockh_init); - if (parameters->cblockw_init != (1 << cblkw)) { - opj_event_msg(p_manager, EVT_ERROR, - "Invalid value for cblockw_init: %d not a power of 2 in range [4,1024]\n", - parameters->cblockw_init); - return OPJ_FALSE; - } - if (parameters->cblockh_init != (1 << cblkh)) { - opj_event_msg(p_manager, EVT_ERROR, - "Invalid value for cblockw_init: %d not a power of 2 in range [4,1024]\n", - parameters->cblockh_init); - return OPJ_FALSE; - } - - /* keep a link to cp so that we can destroy it later in j2k_destroy_compress */ - cp = &(p_j2k->m_cp); - - /* set default values for cp */ - cp->tw = 1; - cp->th = 1; - - /* FIXME ADE: to be removed once deprecated cp_cinema and cp_rsiz have been removed */ - if (parameters->rsiz == - OPJ_PROFILE_NONE) { /* consider deprecated fields only if RSIZ has not been set */ - OPJ_BOOL deprecated_used = OPJ_FALSE; - switch (parameters->cp_cinema) { - case 
OPJ_CINEMA2K_24: - parameters->rsiz = OPJ_PROFILE_CINEMA_2K; - parameters->max_cs_size = OPJ_CINEMA_24_CS; - parameters->max_comp_size = OPJ_CINEMA_24_COMP; - deprecated_used = OPJ_TRUE; - break; - case OPJ_CINEMA2K_48: - parameters->rsiz = OPJ_PROFILE_CINEMA_2K; - parameters->max_cs_size = OPJ_CINEMA_48_CS; - parameters->max_comp_size = OPJ_CINEMA_48_COMP; - deprecated_used = OPJ_TRUE; - break; - case OPJ_CINEMA4K_24: - parameters->rsiz = OPJ_PROFILE_CINEMA_4K; - parameters->max_cs_size = OPJ_CINEMA_24_CS; - parameters->max_comp_size = OPJ_CINEMA_24_COMP; - deprecated_used = OPJ_TRUE; - break; - case OPJ_OFF: - default: - break; - } - switch (parameters->cp_rsiz) { - case OPJ_CINEMA2K: - parameters->rsiz = OPJ_PROFILE_CINEMA_2K; - deprecated_used = OPJ_TRUE; - break; - case OPJ_CINEMA4K: - parameters->rsiz = OPJ_PROFILE_CINEMA_4K; - deprecated_used = OPJ_TRUE; - break; - case OPJ_MCT: - parameters->rsiz = OPJ_PROFILE_PART2 | OPJ_EXTENSION_MCT; - deprecated_used = OPJ_TRUE; - case OPJ_STD_RSIZ: - default: - break; - } - if (deprecated_used) { - opj_event_msg(p_manager, EVT_WARNING, - "Deprecated fields cp_cinema or cp_rsiz are used\n" - "Please consider using only the rsiz field\n" - "See openjpeg.h documentation for more details\n"); - } - } - - /* If no explicit layers are provided, use lossless settings */ - if (parameters->tcp_numlayers == 0) { - parameters->tcp_numlayers = 1; - parameters->cp_disto_alloc = 1; - parameters->tcp_rates[0] = 0; - } - - if (parameters->cp_disto_alloc) { - /* Emit warnings if tcp_rates are not decreasing */ - for (i = 1; i < (OPJ_UINT32) parameters->tcp_numlayers; i++) { - OPJ_FLOAT32 rate_i_corr = parameters->tcp_rates[i]; - OPJ_FLOAT32 rate_i_m_1_corr = parameters->tcp_rates[i - 1]; - if (rate_i_corr <= 1.0) { - rate_i_corr = 1.0; - } - if (rate_i_m_1_corr <= 1.0) { - rate_i_m_1_corr = 1.0; - } - if (rate_i_corr >= rate_i_m_1_corr) { - if (rate_i_corr != parameters->tcp_rates[i] && - rate_i_m_1_corr != parameters->tcp_rates[i - 1]) { - opj_event_msg(p_manager, EVT_WARNING, - "tcp_rates[%d]=%f (corrected as %f) should be strictly lesser " - "than tcp_rates[%d]=%f (corrected as %f)\n", - i, parameters->tcp_rates[i], rate_i_corr, - i - 1, parameters->tcp_rates[i - 1], rate_i_m_1_corr); - } else if (rate_i_corr != parameters->tcp_rates[i]) { - opj_event_msg(p_manager, EVT_WARNING, - "tcp_rates[%d]=%f (corrected as %f) should be strictly lesser " - "than tcp_rates[%d]=%f\n", - i, parameters->tcp_rates[i], rate_i_corr, - i - 1, parameters->tcp_rates[i - 1]); - } else if (rate_i_m_1_corr != parameters->tcp_rates[i - 1]) { - opj_event_msg(p_manager, EVT_WARNING, - "tcp_rates[%d]=%f should be strictly lesser " - "than tcp_rates[%d]=%f (corrected as %f)\n", - i, parameters->tcp_rates[i], - i - 1, parameters->tcp_rates[i - 1], rate_i_m_1_corr); - } else { - opj_event_msg(p_manager, EVT_WARNING, - "tcp_rates[%d]=%f should be strictly lesser " - "than tcp_rates[%d]=%f\n", - i, parameters->tcp_rates[i], - i - 1, parameters->tcp_rates[i - 1]); - } - } - } - } else if (parameters->cp_fixed_quality) { - /* Emit warnings if tcp_distoratio are not increasing */ - for (i = 1; i < (OPJ_UINT32) parameters->tcp_numlayers; i++) { - if (parameters->tcp_distoratio[i] < parameters->tcp_distoratio[i - 1] && - !(i == (OPJ_UINT32)parameters->tcp_numlayers - 1 && - parameters->tcp_distoratio[i] == 0)) { - opj_event_msg(p_manager, EVT_WARNING, - "tcp_distoratio[%d]=%f should be strictly greater " - "than tcp_distoratio[%d]=%f\n", - i, parameters->tcp_distoratio[i], i - 1, - 
parameters->tcp_distoratio[i - 1]); - } - } - } - - /* see if max_codestream_size does limit input rate */ - if (parameters->max_cs_size <= 0) { - if (parameters->tcp_rates[parameters->tcp_numlayers - 1] > 0) { - OPJ_FLOAT32 temp_size; - temp_size = (OPJ_FLOAT32)(((double)image->numcomps * image->comps[0].w * - image->comps[0].h * image->comps[0].prec) / - ((double)parameters->tcp_rates[parameters->tcp_numlayers - 1] * 8 * - image->comps[0].dx * image->comps[0].dy)); - if (temp_size > INT_MAX) { - parameters->max_cs_size = INT_MAX; - } else { - parameters->max_cs_size = (int) floor(temp_size); - } - } else { - parameters->max_cs_size = 0; - } - } else { - OPJ_FLOAT32 temp_rate; - OPJ_BOOL cap = OPJ_FALSE; - temp_rate = (OPJ_FLOAT32)(((double)image->numcomps * image->comps[0].w * - image->comps[0].h * image->comps[0].prec) / - (((double)parameters->max_cs_size) * 8 * image->comps[0].dx * - image->comps[0].dy)); - for (i = 0; i < (OPJ_UINT32) parameters->tcp_numlayers; i++) { - if (parameters->tcp_rates[i] < temp_rate) { - parameters->tcp_rates[i] = temp_rate; - cap = OPJ_TRUE; - } - } - if (cap) { - opj_event_msg(p_manager, EVT_WARNING, - "The desired maximum codestream size has limited\n" - "at least one of the desired quality layers\n"); - } - } - - /* Manage profiles and applications and set RSIZ */ - /* set cinema parameters if required */ - if (OPJ_IS_CINEMA(parameters->rsiz)) { - if ((parameters->rsiz == OPJ_PROFILE_CINEMA_S2K) - || (parameters->rsiz == OPJ_PROFILE_CINEMA_S4K)) { - opj_event_msg(p_manager, EVT_WARNING, - "JPEG 2000 Scalable Digital Cinema profiles not yet supported\n"); - parameters->rsiz = OPJ_PROFILE_NONE; - } else { - opj_j2k_set_cinema_parameters(parameters, image, p_manager); - if (!opj_j2k_is_cinema_compliant(image, parameters->rsiz, p_manager)) { - parameters->rsiz = OPJ_PROFILE_NONE; - } - } - } else if (OPJ_IS_STORAGE(parameters->rsiz)) { - opj_event_msg(p_manager, EVT_WARNING, - "JPEG 2000 Long Term Storage profile not yet supported\n"); - parameters->rsiz = OPJ_PROFILE_NONE; - } else if (OPJ_IS_BROADCAST(parameters->rsiz)) { - opj_event_msg(p_manager, EVT_WARNING, - "JPEG 2000 Broadcast profiles not yet supported\n"); - parameters->rsiz = OPJ_PROFILE_NONE; - } else if (OPJ_IS_IMF(parameters->rsiz)) { - opj_event_msg(p_manager, EVT_WARNING, - "JPEG 2000 IMF profiles not yet supported\n"); - parameters->rsiz = OPJ_PROFILE_NONE; - } else if (OPJ_IS_PART2(parameters->rsiz)) { - if (parameters->rsiz == ((OPJ_PROFILE_PART2) | (OPJ_EXTENSION_NONE))) { - opj_event_msg(p_manager, EVT_WARNING, - "JPEG 2000 Part-2 profile defined\n" - "but no Part-2 extension enabled.\n" - "Profile set to NONE.\n"); - parameters->rsiz = OPJ_PROFILE_NONE; - } else if (parameters->rsiz != ((OPJ_PROFILE_PART2) | (OPJ_EXTENSION_MCT))) { - opj_event_msg(p_manager, EVT_WARNING, - "Unsupported Part-2 extension enabled\n" - "Profile set to NONE.\n"); - parameters->rsiz = OPJ_PROFILE_NONE; - } - } - - /* - copy user encoding parameters - */ - cp->m_specific_param.m_enc.m_max_comp_size = (OPJ_UINT32) - parameters->max_comp_size; - cp->rsiz = parameters->rsiz; - cp->m_specific_param.m_enc.m_disto_alloc = (OPJ_UINT32) - parameters->cp_disto_alloc & 1u; - cp->m_specific_param.m_enc.m_fixed_alloc = (OPJ_UINT32) - parameters->cp_fixed_alloc & 1u; - cp->m_specific_param.m_enc.m_fixed_quality = (OPJ_UINT32) - parameters->cp_fixed_quality & 1u; - - /* mod fixed_quality */ - if (parameters->cp_fixed_alloc && parameters->cp_matrice) { - size_t array_size = (size_t)parameters->tcp_numlayers * - 
(size_t)parameters->numresolution * 3 * sizeof(OPJ_INT32); - cp->m_specific_param.m_enc.m_matrice = (OPJ_INT32 *) opj_malloc(array_size); - if (!cp->m_specific_param.m_enc.m_matrice) { - opj_event_msg(p_manager, EVT_ERROR, - "Not enough memory to allocate copy of user encoding parameters matrix \n"); - return OPJ_FALSE; - } - memcpy(cp->m_specific_param.m_enc.m_matrice, parameters->cp_matrice, - array_size); - } - - /* tiles */ - cp->tdx = (OPJ_UINT32)parameters->cp_tdx; - cp->tdy = (OPJ_UINT32)parameters->cp_tdy; - - /* tile offset */ - cp->tx0 = (OPJ_UINT32)parameters->cp_tx0; - cp->ty0 = (OPJ_UINT32)parameters->cp_ty0; - - /* comment string */ - if (parameters->cp_comment) { - cp->comment = (char*)opj_malloc(strlen(parameters->cp_comment) + 1U); - if (!cp->comment) { - opj_event_msg(p_manager, EVT_ERROR, - "Not enough memory to allocate copy of comment string\n"); - return OPJ_FALSE; - } - strcpy(cp->comment, parameters->cp_comment); - } else { - /* Create default comment for codestream */ - const char comment[] = "Created by OpenJPEG version "; - const size_t clen = strlen(comment); - const char *version = opj_version(); - - /* UniPG>> */ -#ifdef USE_JPWL - cp->comment = (char*)opj_malloc(clen + strlen(version) + 11); - if (!cp->comment) { - opj_event_msg(p_manager, EVT_ERROR, - "Not enough memory to allocate comment string\n"); - return OPJ_FALSE; - } - sprintf(cp->comment, "%s%s with JPWL", comment, version); -#else - cp->comment = (char*)opj_malloc(clen + strlen(version) + 1); - if (!cp->comment) { - opj_event_msg(p_manager, EVT_ERROR, - "Not enough memory to allocate comment string\n"); - return OPJ_FALSE; - } - sprintf(cp->comment, "%s%s", comment, version); -#endif - /* <tile_size_on) { - cp->tw = (OPJ_UINT32)opj_int_ceildiv((OPJ_INT32)(image->x1 - cp->tx0), - (OPJ_INT32)cp->tdx); - cp->th = (OPJ_UINT32)opj_int_ceildiv((OPJ_INT32)(image->y1 - cp->ty0), - (OPJ_INT32)cp->tdy); - } else { - cp->tdx = image->x1 - cp->tx0; - cp->tdy = image->y1 - cp->ty0; - } - - if (parameters->tp_on) { - cp->m_specific_param.m_enc.m_tp_flag = (OPJ_BYTE)parameters->tp_flag; - cp->m_specific_param.m_enc.m_tp_on = 1; - } - -#ifdef USE_JPWL - /* - calculate JPWL encoding parameters - */ - - if (parameters->jpwl_epc_on) { - OPJ_INT32 i; - - /* set JPWL on */ - cp->epc_on = OPJ_TRUE; - cp->info_on = OPJ_FALSE; /* no informative technique */ - - /* set EPB on */ - if ((parameters->jpwl_hprot_MH > 0) || (parameters->jpwl_hprot_TPH[0] > 0)) { - cp->epb_on = OPJ_TRUE; - - cp->hprot_MH = parameters->jpwl_hprot_MH; - for (i = 0; i < JPWL_MAX_NO_TILESPECS; i++) { - cp->hprot_TPH_tileno[i] = parameters->jpwl_hprot_TPH_tileno[i]; - cp->hprot_TPH[i] = parameters->jpwl_hprot_TPH[i]; - } - /* if tile specs are not specified, copy MH specs */ - if (cp->hprot_TPH[0] == -1) { - cp->hprot_TPH_tileno[0] = 0; - cp->hprot_TPH[0] = parameters->jpwl_hprot_MH; - } - for (i = 0; i < JPWL_MAX_NO_PACKSPECS; i++) { - cp->pprot_tileno[i] = parameters->jpwl_pprot_tileno[i]; - cp->pprot_packno[i] = parameters->jpwl_pprot_packno[i]; - cp->pprot[i] = parameters->jpwl_pprot[i]; - } - } - - /* set ESD writing */ - if ((parameters->jpwl_sens_size == 1) || (parameters->jpwl_sens_size == 2)) { - cp->esd_on = OPJ_TRUE; - - cp->sens_size = parameters->jpwl_sens_size; - cp->sens_addr = parameters->jpwl_sens_addr; - cp->sens_range = parameters->jpwl_sens_range; - - cp->sens_MH = parameters->jpwl_sens_MH; - for (i = 0; i < JPWL_MAX_NO_TILESPECS; i++) { - cp->sens_TPH_tileno[i] = parameters->jpwl_sens_TPH_tileno[i]; - cp->sens_TPH[i] = 
parameters->jpwl_sens_TPH[i]; - } - } - - /* always set RED writing to false: we are at the encoder */ - cp->red_on = OPJ_FALSE; - - } else { - cp->epc_on = OPJ_FALSE; - } -#endif /* USE_JPWL */ - - /* initialize the multiple tiles */ - /* ---------------------------- */ - cp->tcps = (opj_tcp_t*) opj_calloc(cp->tw * cp->th, sizeof(opj_tcp_t)); - if (!cp->tcps) { - opj_event_msg(p_manager, EVT_ERROR, - "Not enough memory to allocate tile coding parameters\n"); - return OPJ_FALSE; - } - if (parameters->numpocs) { - /* initialisation of POC */ - opj_j2k_check_poc_val(parameters->POC, parameters->numpocs, - (OPJ_UINT32)parameters->numresolution, image->numcomps, - (OPJ_UINT32)parameters->tcp_numlayers, p_manager); - /* TODO MSD use the return value*/ - } - - for (tileno = 0; tileno < cp->tw * cp->th; tileno++) { - opj_tcp_t *tcp = &cp->tcps[tileno]; - tcp->numlayers = (OPJ_UINT32)parameters->tcp_numlayers; - - for (j = 0; j < tcp->numlayers; j++) { - if (OPJ_IS_CINEMA(cp->rsiz)) { - if (cp->m_specific_param.m_enc.m_fixed_quality) { - tcp->distoratio[j] = parameters->tcp_distoratio[j]; - } - tcp->rates[j] = parameters->tcp_rates[j]; - } else { - if (cp->m_specific_param.m_enc.m_fixed_quality) { /* add fixed_quality */ - tcp->distoratio[j] = parameters->tcp_distoratio[j]; - } else { - tcp->rates[j] = parameters->tcp_rates[j]; - } - } - if (!cp->m_specific_param.m_enc.m_fixed_quality && - tcp->rates[j] <= 1.0) { - tcp->rates[j] = 0.0; /* force lossless */ - } - } - - tcp->csty = (OPJ_UINT32)parameters->csty; - tcp->prg = parameters->prog_order; - tcp->mct = (OPJ_UINT32)parameters->tcp_mct; - - numpocs_tile = 0; - tcp->POC = 0; - - if (parameters->numpocs) { - /* initialisation of POC */ - tcp->POC = 1; - for (i = 0; i < parameters->numpocs; i++) { - if (tileno + 1 == parameters->POC[i].tile) { - opj_poc_t *tcp_poc = &tcp->pocs[numpocs_tile]; - - tcp_poc->resno0 = parameters->POC[numpocs_tile].resno0; - tcp_poc->compno0 = parameters->POC[numpocs_tile].compno0; - tcp_poc->layno1 = parameters->POC[numpocs_tile].layno1; - tcp_poc->resno1 = parameters->POC[numpocs_tile].resno1; - tcp_poc->compno1 = parameters->POC[numpocs_tile].compno1; - tcp_poc->prg1 = parameters->POC[numpocs_tile].prg1; - tcp_poc->tile = parameters->POC[numpocs_tile].tile; - - numpocs_tile++; - } - } - - tcp->numpocs = numpocs_tile - 1 ; - } else { - tcp->numpocs = 0; - } - - tcp->tccps = (opj_tccp_t*) opj_calloc(image->numcomps, sizeof(opj_tccp_t)); - if (!tcp->tccps) { - opj_event_msg(p_manager, EVT_ERROR, - "Not enough memory to allocate tile component coding parameters\n"); - return OPJ_FALSE; - } - if (parameters->mct_data) { - - OPJ_UINT32 lMctSize = image->numcomps * image->numcomps * (OPJ_UINT32)sizeof( - OPJ_FLOAT32); - OPJ_FLOAT32 * lTmpBuf = (OPJ_FLOAT32*)opj_malloc(lMctSize); - OPJ_INT32 * l_dc_shift = (OPJ_INT32 *)((OPJ_BYTE *) parameters->mct_data + - lMctSize); - - if (!lTmpBuf) { - opj_event_msg(p_manager, EVT_ERROR, - "Not enough memory to allocate temp buffer\n"); - return OPJ_FALSE; - } - - tcp->mct = 2; - tcp->m_mct_coding_matrix = (OPJ_FLOAT32*)opj_malloc(lMctSize); - if (! tcp->m_mct_coding_matrix) { - opj_free(lTmpBuf); - lTmpBuf = NULL; - opj_event_msg(p_manager, EVT_ERROR, - "Not enough memory to allocate encoder MCT coding matrix \n"); - return OPJ_FALSE; - } - memcpy(tcp->m_mct_coding_matrix, parameters->mct_data, lMctSize); - memcpy(lTmpBuf, parameters->mct_data, lMctSize); - - tcp->m_mct_decoding_matrix = (OPJ_FLOAT32*)opj_malloc(lMctSize); - if (!
tcp->m_mct_decoding_matrix) { - opj_free(lTmpBuf); - lTmpBuf = NULL; - opj_event_msg(p_manager, EVT_ERROR, - "Not enough memory to allocate encoder MCT decoding matrix \n"); - return OPJ_FALSE; - } - if (opj_matrix_inversion_f(lTmpBuf, (tcp->m_mct_decoding_matrix), - image->numcomps) == OPJ_FALSE) { - opj_free(lTmpBuf); - lTmpBuf = NULL; - opj_event_msg(p_manager, EVT_ERROR, - "Failed to inverse encoder MCT decoding matrix \n"); - return OPJ_FALSE; - } - - tcp->mct_norms = (OPJ_FLOAT64*) - opj_malloc(image->numcomps * sizeof(OPJ_FLOAT64)); - if (! tcp->mct_norms) { - opj_free(lTmpBuf); - lTmpBuf = NULL; - opj_event_msg(p_manager, EVT_ERROR, - "Not enough memory to allocate encoder MCT norms \n"); - return OPJ_FALSE; - } - opj_calculate_norms(tcp->mct_norms, image->numcomps, - tcp->m_mct_decoding_matrix); - opj_free(lTmpBuf); - - for (i = 0; i < image->numcomps; i++) { - opj_tccp_t *tccp = &tcp->tccps[i]; - tccp->m_dc_level_shift = l_dc_shift[i]; - } - - if (opj_j2k_setup_mct_encoding(tcp, image) == OPJ_FALSE) { - /* free will be handled by opj_j2k_destroy */ - opj_event_msg(p_manager, EVT_ERROR, "Failed to setup j2k mct encoding\n"); - return OPJ_FALSE; - } - } else { - if (tcp->mct == 1 && image->numcomps >= 3) { /* RGB->YCC MCT is enabled */ - if ((image->comps[0].dx != image->comps[1].dx) || - (image->comps[0].dx != image->comps[2].dx) || - (image->comps[0].dy != image->comps[1].dy) || - (image->comps[0].dy != image->comps[2].dy)) { - opj_event_msg(p_manager, EVT_WARNING, - "Cannot perform MCT on components with different sizes. Disabling MCT.\n"); - tcp->mct = 0; - } - } - for (i = 0; i < image->numcomps; i++) { - opj_tccp_t *tccp = &tcp->tccps[i]; - opj_image_comp_t * l_comp = &(image->comps[i]); - - if (! l_comp->sgnd) { - tccp->m_dc_level_shift = 1 << (l_comp->prec - 1); - } - } - } - - for (i = 0; i < image->numcomps; i++) { - opj_tccp_t *tccp = &tcp->tccps[i]; - - tccp->csty = parameters->csty & - 0x01; /* 0 => one precinct || 1 => custom precinct */ - tccp->numresolutions = (OPJ_UINT32)parameters->numresolution; - tccp->cblkw = (OPJ_UINT32)opj_int_floorlog2(parameters->cblockw_init); - tccp->cblkh = (OPJ_UINT32)opj_int_floorlog2(parameters->cblockh_init); - tccp->cblksty = (OPJ_UINT32)parameters->mode; - tccp->qmfbid = parameters->irreversible ? 0 : 1; - tccp->qntsty = parameters->irreversible ? 
J2K_CCP_QNTSTY_SEQNT : - J2K_CCP_QNTSTY_NOQNT; - tccp->numgbits = 2; - - if ((OPJ_INT32)i == parameters->roi_compno) { - tccp->roishift = parameters->roi_shift; - } else { - tccp->roishift = 0; - } - - if (parameters->csty & J2K_CCP_CSTY_PRT) { - OPJ_INT32 p = 0, it_res; - assert(tccp->numresolutions > 0); - for (it_res = (OPJ_INT32)tccp->numresolutions - 1; it_res >= 0; it_res--) { - if (p < parameters->res_spec) { - - if (parameters->prcw_init[p] < 1) { - tccp->prcw[it_res] = 1; - } else { - tccp->prcw[it_res] = (OPJ_UINT32)opj_int_floorlog2(parameters->prcw_init[p]); - } - - if (parameters->prch_init[p] < 1) { - tccp->prch[it_res] = 1; - } else { - tccp->prch[it_res] = (OPJ_UINT32)opj_int_floorlog2(parameters->prch_init[p]); - } - - } else { - OPJ_INT32 res_spec = parameters->res_spec; - OPJ_INT32 size_prcw = 0; - OPJ_INT32 size_prch = 0; - - assert(res_spec > 0); /* issue 189 */ - size_prcw = parameters->prcw_init[res_spec - 1] >> (p - (res_spec - 1)); - size_prch = parameters->prch_init[res_spec - 1] >> (p - (res_spec - 1)); - - - if (size_prcw < 1) { - tccp->prcw[it_res] = 1; - } else { - tccp->prcw[it_res] = (OPJ_UINT32)opj_int_floorlog2(size_prcw); - } - - if (size_prch < 1) { - tccp->prch[it_res] = 1; - } else { - tccp->prch[it_res] = (OPJ_UINT32)opj_int_floorlog2(size_prch); - } - } - p++; - /*printf("\nsize precinct for level %d : %d,%d\n", it_res,tccp->prcw[it_res], tccp->prch[it_res]); */ - } /*end for*/ - } else { - for (j = 0; j < tccp->numresolutions; j++) { - tccp->prcw[j] = 15; - tccp->prch[j] = 15; - } - } - - opj_dwt_calc_explicit_stepsizes(tccp, image->comps[i].prec); - } - } - - if (parameters->mct_data) { - opj_free(parameters->mct_data); - parameters->mct_data = 00; - } - return OPJ_TRUE; -} - -static OPJ_BOOL opj_j2k_add_mhmarker(opj_codestream_index_t *cstr_index, - OPJ_UINT32 type, OPJ_OFF_T pos, OPJ_UINT32 len) -{ - assert(cstr_index != 00); - - /* expand the list? */ - if ((cstr_index->marknum + 1) > cstr_index->maxmarknum) { - opj_marker_info_t *new_marker; - cstr_index->maxmarknum = (OPJ_UINT32)(100 + (OPJ_FLOAT32) - cstr_index->maxmarknum); - new_marker = (opj_marker_info_t *) opj_realloc(cstr_index->marker, - cstr_index->maxmarknum * sizeof(opj_marker_info_t)); - if (! new_marker) { - opj_free(cstr_index->marker); - cstr_index->marker = NULL; - cstr_index->maxmarknum = 0; - cstr_index->marknum = 0; - /* opj_event_msg(p_manager, EVT_ERROR, "Not enough memory to add mh marker\n"); */ - return OPJ_FALSE; - } - cstr_index->marker = new_marker; - } - - /* add the marker */ - cstr_index->marker[cstr_index->marknum].type = (OPJ_UINT16)type; - cstr_index->marker[cstr_index->marknum].pos = (OPJ_INT32)pos; - cstr_index->marker[cstr_index->marknum].len = (OPJ_INT32)len; - cstr_index->marknum++; - return OPJ_TRUE; -} - -static OPJ_BOOL opj_j2k_add_tlmarker(OPJ_UINT32 tileno, - opj_codestream_index_t *cstr_index, OPJ_UINT32 type, OPJ_OFF_T pos, - OPJ_UINT32 len) -{ - assert(cstr_index != 00); - assert(cstr_index->tile_index != 00); - - /* expand the list? */ - if ((cstr_index->tile_index[tileno].marknum + 1) > - cstr_index->tile_index[tileno].maxmarknum) { - opj_marker_info_t *new_marker; - cstr_index->tile_index[tileno].maxmarknum = (OPJ_UINT32)(100 + - (OPJ_FLOAT32) cstr_index->tile_index[tileno].maxmarknum); - new_marker = (opj_marker_info_t *) opj_realloc( - cstr_index->tile_index[tileno].marker, - cstr_index->tile_index[tileno].maxmarknum * sizeof(opj_marker_info_t)); - if (! 
new_marker) { - opj_free(cstr_index->tile_index[tileno].marker); - cstr_index->tile_index[tileno].marker = NULL; - cstr_index->tile_index[tileno].maxmarknum = 0; - cstr_index->tile_index[tileno].marknum = 0; - /* opj_event_msg(p_manager, EVT_ERROR, "Not enough memory to add tl marker\n"); */ - return OPJ_FALSE; - } - cstr_index->tile_index[tileno].marker = new_marker; - } - - /* add the marker */ - cstr_index->tile_index[tileno].marker[cstr_index->tile_index[tileno].marknum].type - = (OPJ_UINT16)type; - cstr_index->tile_index[tileno].marker[cstr_index->tile_index[tileno].marknum].pos - = (OPJ_INT32)pos; - cstr_index->tile_index[tileno].marker[cstr_index->tile_index[tileno].marknum].len - = (OPJ_INT32)len; - cstr_index->tile_index[tileno].marknum++; - - if (type == J2K_MS_SOT) { - OPJ_UINT32 l_current_tile_part = cstr_index->tile_index[tileno].current_tpsno; - - if (cstr_index->tile_index[tileno].tp_index) { - cstr_index->tile_index[tileno].tp_index[l_current_tile_part].start_pos = pos; - } - - } - return OPJ_TRUE; -} - -/* - * ----------------------------------------------------------------------- - * ----------------------------------------------------------------------- - * ----------------------------------------------------------------------- - */ - -OPJ_BOOL opj_j2k_end_decompress(opj_j2k_t *p_j2k, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager - ) -{ - (void)p_j2k; - (void)p_stream; - (void)p_manager; - return OPJ_TRUE; -} - -OPJ_BOOL opj_j2k_read_header(opj_stream_private_t *p_stream, - opj_j2k_t* p_j2k, - opj_image_t** p_image, - opj_event_mgr_t* p_manager) -{ - /* preconditions */ - assert(p_j2k != 00); - assert(p_stream != 00); - assert(p_manager != 00); - - /* create an empty image header */ - p_j2k->m_private_image = opj_image_create0(); - if (! p_j2k->m_private_image) { - return OPJ_FALSE; - } - - /* customization of the validation */ - if (! opj_j2k_setup_decoding_validation(p_j2k, p_manager)) { - opj_image_destroy(p_j2k->m_private_image); - p_j2k->m_private_image = NULL; - return OPJ_FALSE; - } - - /* validation of the parameters codec */ - if (! opj_j2k_exec(p_j2k, p_j2k->m_validation_list, p_stream, p_manager)) { - opj_image_destroy(p_j2k->m_private_image); - p_j2k->m_private_image = NULL; - return OPJ_FALSE; - } - - /* customization of the encoding */ - if (! opj_j2k_setup_header_reading(p_j2k, p_manager)) { - opj_image_destroy(p_j2k->m_private_image); - p_j2k->m_private_image = NULL; - return OPJ_FALSE; - } - - /* read header */ - if (! opj_j2k_exec(p_j2k, p_j2k->m_procedure_list, p_stream, p_manager)) { - opj_image_destroy(p_j2k->m_private_image); - p_j2k->m_private_image = NULL; - return OPJ_FALSE; - } - - *p_image = opj_image_create0(); - if (!(*p_image)) { - return OPJ_FALSE; - } - - /* Copy codestream image information to the output image */ - opj_copy_image_header(p_j2k->m_private_image, *p_image); - - /*Allocate and initialize some elements of codestrem index*/ - if (!opj_j2k_allocate_tile_element_cstr_index(p_j2k)) { - return OPJ_FALSE; - } - - return OPJ_TRUE; -} - -static OPJ_BOOL opj_j2k_setup_header_reading(opj_j2k_t *p_j2k, - opj_event_mgr_t * p_manager) -{ - /* preconditions*/ - assert(p_j2k != 00); - assert(p_manager != 00); - - if (! opj_procedure_list_add_procedure(p_j2k->m_procedure_list, - (opj_procedure)opj_j2k_read_header_procedure, p_manager)) { - return OPJ_FALSE; - } - - /* DEVELOPER CORNER, add your custom procedures */ - if (! 
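/*
 * [sketch] opj_j2k_read_header above never hard-codes its stages: each one
 * is queued on a procedure list and run through opj_j2k_exec, which ANDs
 * the results and clears the list afterwards. A generic standalone
 * rendering of that pattern (step_fn and run_steps are illustrative names):
 */
#include <stddef.h>

typedef int (*step_fn)(void *ctx);

/* Run queued steps in order; '&&' short-circuits, so after the first
 * failure the remaining steps are skipped, as in opj_j2k_exec. */
static int run_steps(step_fn *steps, size_t n, void *ctx)
{
    int ok = 1;
    size_t i;
    for (i = 0; i < n; ++i) {
        ok = ok && steps[i](ctx);
    }
    return ok;
}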
opj_procedure_list_add_procedure(p_j2k->m_procedure_list, - (opj_procedure)opj_j2k_copy_default_tcp_and_create_tcd, p_manager)) { - return OPJ_FALSE; - } - - return OPJ_TRUE; -} - -static OPJ_BOOL opj_j2k_setup_decoding_validation(opj_j2k_t *p_j2k, - opj_event_mgr_t * p_manager) -{ - /* preconditions*/ - assert(p_j2k != 00); - assert(p_manager != 00); - - if (! opj_procedure_list_add_procedure(p_j2k->m_validation_list, - (opj_procedure)opj_j2k_build_decoder, p_manager)) { - return OPJ_FALSE; - } - if (! opj_procedure_list_add_procedure(p_j2k->m_validation_list, - (opj_procedure)opj_j2k_decoding_validation, p_manager)) { - return OPJ_FALSE; - } - - /* DEVELOPER CORNER, add your custom validation procedure */ - return OPJ_TRUE; -} - -static OPJ_BOOL opj_j2k_mct_validation(opj_j2k_t * p_j2k, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager) -{ - OPJ_BOOL l_is_valid = OPJ_TRUE; - OPJ_UINT32 i, j; - - /* preconditions */ - assert(p_j2k != 00); - assert(p_stream != 00); - assert(p_manager != 00); - - OPJ_UNUSED(p_stream); - OPJ_UNUSED(p_manager); - - if ((p_j2k->m_cp.rsiz & 0x8200) == 0x8200) { - OPJ_UINT32 l_nb_tiles = p_j2k->m_cp.th * p_j2k->m_cp.tw; - opj_tcp_t * l_tcp = p_j2k->m_cp.tcps; - - for (i = 0; i < l_nb_tiles; ++i) { - if (l_tcp->mct == 2) { - opj_tccp_t * l_tccp = l_tcp->tccps; - l_is_valid &= (l_tcp->m_mct_coding_matrix != 00); - - for (j = 0; j < p_j2k->m_private_image->numcomps; ++j) { - l_is_valid &= !(l_tccp->qmfbid & 1); - ++l_tccp; - } - } - ++l_tcp; - } - } - - return l_is_valid; -} - -OPJ_BOOL opj_j2k_setup_mct_encoding(opj_tcp_t * p_tcp, opj_image_t * p_image) -{ - OPJ_UINT32 i; - OPJ_UINT32 l_indix = 1; - opj_mct_data_t * l_mct_deco_data = 00, * l_mct_offset_data = 00; - opj_simple_mcc_decorrelation_data_t * l_mcc_data; - OPJ_UINT32 l_mct_size, l_nb_elem; - OPJ_FLOAT32 * l_data, * l_current_data; - opj_tccp_t * l_tccp; - - /* preconditions */ - assert(p_tcp != 00); - - if (p_tcp->mct != 2) { - return OPJ_TRUE; - } - - if (p_tcp->m_mct_decoding_matrix) { - if (p_tcp->m_nb_mct_records == p_tcp->m_nb_max_mct_records) { - opj_mct_data_t *new_mct_records; - p_tcp->m_nb_max_mct_records += OPJ_J2K_MCT_DEFAULT_NB_RECORDS; - - new_mct_records = (opj_mct_data_t *) opj_realloc(p_tcp->m_mct_records, - p_tcp->m_nb_max_mct_records * sizeof(opj_mct_data_t)); - if (! new_mct_records) { - opj_free(p_tcp->m_mct_records); - p_tcp->m_mct_records = NULL; - p_tcp->m_nb_max_mct_records = 0; - p_tcp->m_nb_mct_records = 0; - /* opj_event_msg(p_manager, EVT_ERROR, "Not enough memory to setup mct encoding\n"); */ - return OPJ_FALSE; - } - p_tcp->m_mct_records = new_mct_records; - l_mct_deco_data = p_tcp->m_mct_records + p_tcp->m_nb_mct_records; - - memset(l_mct_deco_data, 0, - (p_tcp->m_nb_max_mct_records - p_tcp->m_nb_mct_records) * sizeof( - opj_mct_data_t)); - } - l_mct_deco_data = p_tcp->m_mct_records + p_tcp->m_nb_mct_records; - - if (l_mct_deco_data->m_data) { - opj_free(l_mct_deco_data->m_data); - l_mct_deco_data->m_data = 00; - } - - l_mct_deco_data->m_index = l_indix++; - l_mct_deco_data->m_array_type = MCT_TYPE_DECORRELATION; - l_mct_deco_data->m_element_type = MCT_TYPE_FLOAT; - l_nb_elem = p_image->numcomps * p_image->numcomps; - l_mct_size = l_nb_elem * MCT_ELEMENT_SIZE[l_mct_deco_data->m_element_type]; - l_mct_deco_data->m_data = (OPJ_BYTE*)opj_malloc(l_mct_size); - - if (! 
l_mct_deco_data->m_data) { - return OPJ_FALSE; - } - - j2k_mct_write_functions_from_float[l_mct_deco_data->m_element_type]( - p_tcp->m_mct_decoding_matrix, l_mct_deco_data->m_data, l_nb_elem); - - l_mct_deco_data->m_data_size = l_mct_size; - ++p_tcp->m_nb_mct_records; - } - - if (p_tcp->m_nb_mct_records == p_tcp->m_nb_max_mct_records) { - opj_mct_data_t *new_mct_records; - p_tcp->m_nb_max_mct_records += OPJ_J2K_MCT_DEFAULT_NB_RECORDS; - new_mct_records = (opj_mct_data_t *) opj_realloc(p_tcp->m_mct_records, - p_tcp->m_nb_max_mct_records * sizeof(opj_mct_data_t)); - if (! new_mct_records) { - opj_free(p_tcp->m_mct_records); - p_tcp->m_mct_records = NULL; - p_tcp->m_nb_max_mct_records = 0; - p_tcp->m_nb_mct_records = 0; - /* opj_event_msg(p_manager, EVT_ERROR, "Not enough memory to setup mct encoding\n"); */ - return OPJ_FALSE; - } - p_tcp->m_mct_records = new_mct_records; - l_mct_offset_data = p_tcp->m_mct_records + p_tcp->m_nb_mct_records; - - memset(l_mct_offset_data, 0, - (p_tcp->m_nb_max_mct_records - p_tcp->m_nb_mct_records) * sizeof( - opj_mct_data_t)); - - if (l_mct_deco_data) { - l_mct_deco_data = l_mct_offset_data - 1; - } - } - - l_mct_offset_data = p_tcp->m_mct_records + p_tcp->m_nb_mct_records; - - if (l_mct_offset_data->m_data) { - opj_free(l_mct_offset_data->m_data); - l_mct_offset_data->m_data = 00; - } - - l_mct_offset_data->m_index = l_indix++; - l_mct_offset_data->m_array_type = MCT_TYPE_OFFSET; - l_mct_offset_data->m_element_type = MCT_TYPE_FLOAT; - l_nb_elem = p_image->numcomps; - l_mct_size = l_nb_elem * MCT_ELEMENT_SIZE[l_mct_offset_data->m_element_type]; - l_mct_offset_data->m_data = (OPJ_BYTE*)opj_malloc(l_mct_size); - - if (! l_mct_offset_data->m_data) { - return OPJ_FALSE; - } - - l_data = (OPJ_FLOAT32*)opj_malloc(l_nb_elem * sizeof(OPJ_FLOAT32)); - if (! l_data) { - opj_free(l_mct_offset_data->m_data); - l_mct_offset_data->m_data = 00; - return OPJ_FALSE; - } - - l_tccp = p_tcp->tccps; - l_current_data = l_data; - - for (i = 0; i < l_nb_elem; ++i) { - *(l_current_data++) = (OPJ_FLOAT32)(l_tccp->m_dc_level_shift); - ++l_tccp; - } - - j2k_mct_write_functions_from_float[l_mct_offset_data->m_element_type](l_data, - l_mct_offset_data->m_data, l_nb_elem); - - opj_free(l_data); - - l_mct_offset_data->m_data_size = l_mct_size; - - ++p_tcp->m_nb_mct_records; - - if (p_tcp->m_nb_mcc_records == p_tcp->m_nb_max_mcc_records) { - opj_simple_mcc_decorrelation_data_t *new_mcc_records; - p_tcp->m_nb_max_mcc_records += OPJ_J2K_MCT_DEFAULT_NB_RECORDS; - new_mcc_records = (opj_simple_mcc_decorrelation_data_t *) opj_realloc( - p_tcp->m_mcc_records, p_tcp->m_nb_max_mcc_records * sizeof( - opj_simple_mcc_decorrelation_data_t)); - if (! 
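/*
 * [sketch] Note the rebase above: after opj_realloc grows m_mct_records,
 * the earlier l_mct_deco_data pointer would dangle, so it is recomputed
 * against the new base (l_mct_offset_data - 1). A standalone illustration
 * of the hazard; grow_and_rebase is a hypothetical helper.
 */
#include <stdlib.h>

typedef struct { int index; } record_t;

/* Grow an array while keeping a pointer into it usable: save the pointer
 * as an element offset first, then re-apply it to the new base. */
static record_t *grow_and_rebase(record_t **base, size_t *cap,
                                 record_t *into_old /* points into *base */)
{
    size_t off = (size_t)(into_old - *base);
    record_t *nb = (record_t *)realloc(*base, (*cap + 10) * sizeof(record_t));
    if (nb == NULL) {
        return NULL;          /* *base is still valid; caller cleans up */
    }
    *base = nb;
    *cap += 10;
    return nb + off;          /* same logical element, new address */
}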
new_mcc_records) { - opj_free(p_tcp->m_mcc_records); - p_tcp->m_mcc_records = NULL; - p_tcp->m_nb_max_mcc_records = 0; - p_tcp->m_nb_mcc_records = 0; - /* opj_event_msg(p_manager, EVT_ERROR, "Not enough memory to setup mct encoding\n"); */ - return OPJ_FALSE; - } - p_tcp->m_mcc_records = new_mcc_records; - l_mcc_data = p_tcp->m_mcc_records + p_tcp->m_nb_mcc_records; - memset(l_mcc_data, 0, (p_tcp->m_nb_max_mcc_records - p_tcp->m_nb_mcc_records) * - sizeof(opj_simple_mcc_decorrelation_data_t)); - - } - - l_mcc_data = p_tcp->m_mcc_records + p_tcp->m_nb_mcc_records; - l_mcc_data->m_decorrelation_array = l_mct_deco_data; - l_mcc_data->m_is_irreversible = 1; - l_mcc_data->m_nb_comps = p_image->numcomps; - l_mcc_data->m_index = l_indix++; - l_mcc_data->m_offset_array = l_mct_offset_data; - ++p_tcp->m_nb_mcc_records; - - return OPJ_TRUE; -} - -static OPJ_BOOL opj_j2k_build_decoder(opj_j2k_t * p_j2k, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager) -{ - /* add here initialization of cp - copy paste of setup_decoder */ - (void)p_j2k; - (void)p_stream; - (void)p_manager; - return OPJ_TRUE; -} - -static OPJ_BOOL opj_j2k_build_encoder(opj_j2k_t * p_j2k, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager) -{ - /* add here initialization of cp - copy paste of setup_encoder */ - (void)p_j2k; - (void)p_stream; - (void)p_manager; - return OPJ_TRUE; -} - -static OPJ_BOOL opj_j2k_encoding_validation(opj_j2k_t * p_j2k, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager) -{ - OPJ_BOOL l_is_valid = OPJ_TRUE; - - /* preconditions */ - assert(p_j2k != 00); - assert(p_stream != 00); - assert(p_manager != 00); - - OPJ_UNUSED(p_stream); - - /* STATE checking */ - /* make sure the state is at 0 */ - l_is_valid &= (p_j2k->m_specific_param.m_decoder.m_state == J2K_STATE_NONE); - - /* POINTER validation */ - /* make sure a p_j2k codec is present */ - l_is_valid &= (p_j2k->m_procedure_list != 00); - /* make sure a validation list is present */ - l_is_valid &= (p_j2k->m_validation_list != 00); - - /* ISO 15444-1:2004 states between 1 & 33 (0 -> 32) */ - /* 33 (32) would always fail the check below (if a cast to 64bits was done) */ - /* FIXME Shall we change OPJ_J2K_MAXRLVLS to 32 ? 
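/*
 * [sketch] The comment above discusses the 1..32 bound on resolution
 * counts; the checks just below additionally require each tile dimension
 * to survive numresolutions-1 halvings (tdx, tdy >= 2^(nres-1)). Both
 * conditions folded into one standalone predicate (hypothetical name):
 */
#include <stdint.h>

static int tile_supports_resolutions(uint32_t tdx, uint32_t tdy,
                                     uint32_t nres)
{
    if (nres < 1 || nres > 32) {
        return 0;                       /* outside the codestream limit */
    }
    /* the smallest level must still be at least one sample per axis */
    return tdx >= (1u << (nres - 1)) && tdy >= (1u << (nres - 1));
}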
*/ - if ((p_j2k->m_cp.tcps->tccps->numresolutions <= 0) || - (p_j2k->m_cp.tcps->tccps->numresolutions > 32)) { - opj_event_msg(p_manager, EVT_ERROR, - "Number of resolutions is too high in comparison to the size of tiles\n"); - return OPJ_FALSE; - } - - if ((p_j2k->m_cp.tdx) < (OPJ_UINT32)(1 << - (p_j2k->m_cp.tcps->tccps->numresolutions - 1U))) { - opj_event_msg(p_manager, EVT_ERROR, - "Number of resolutions is too high in comparison to the size of tiles\n"); - return OPJ_FALSE; - } - - if ((p_j2k->m_cp.tdy) < (OPJ_UINT32)(1 << - (p_j2k->m_cp.tcps->tccps->numresolutions - 1U))) { - opj_event_msg(p_manager, EVT_ERROR, - "Number of resolutions is too high in comparison to the size of tiles\n"); - return OPJ_FALSE; - } - - /* PARAMETER VALIDATION */ - return l_is_valid; -} - -static OPJ_BOOL opj_j2k_decoding_validation(opj_j2k_t *p_j2k, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager - ) -{ - OPJ_BOOL l_is_valid = OPJ_TRUE; - - /* preconditions*/ - assert(p_j2k != 00); - assert(p_stream != 00); - assert(p_manager != 00); - - OPJ_UNUSED(p_stream); - OPJ_UNUSED(p_manager); - - /* STATE checking */ - /* make sure the state is at 0 */ -#ifdef TODO_MSD - l_is_valid &= (p_j2k->m_specific_param.m_decoder.m_state == J2K_DEC_STATE_NONE); -#endif - l_is_valid &= (p_j2k->m_specific_param.m_decoder.m_state == 0x0000); - - /* POINTER validation */ - /* make sure a p_j2k codec is present */ - /* make sure a procedure list is present */ - l_is_valid &= (p_j2k->m_procedure_list != 00); - /* make sure a validation list is present */ - l_is_valid &= (p_j2k->m_validation_list != 00); - - /* PARAMETER VALIDATION */ - return l_is_valid; -} - -static OPJ_BOOL opj_j2k_read_header_procedure(opj_j2k_t *p_j2k, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager) -{ - OPJ_UINT32 l_current_marker; - OPJ_UINT32 l_marker_size; - const opj_dec_memory_marker_handler_t * l_marker_handler = 00; - OPJ_BOOL l_has_siz = 0; - OPJ_BOOL l_has_cod = 0; - OPJ_BOOL l_has_qcd = 0; - - /* preconditions */ - assert(p_stream != 00); - assert(p_j2k != 00); - assert(p_manager != 00); - - /* We enter in the main header */ - p_j2k->m_specific_param.m_decoder.m_state = J2K_STATE_MHSOC; - - /* Try to read the SOC marker, the codestream must begin with SOC marker */ - if (! opj_j2k_read_soc(p_j2k, p_stream, p_manager)) { - opj_event_msg(p_manager, EVT_ERROR, "Expected a SOC marker \n"); - return OPJ_FALSE; - } - - /* Try to read 2 bytes (the next marker ID) from stream and copy them into the buffer */ - if (opj_stream_read_data(p_stream, - p_j2k->m_specific_param.m_decoder.m_header_data, 2, p_manager) != 2) { - opj_event_msg(p_manager, EVT_ERROR, "Stream too short\n"); - return OPJ_FALSE; - } - - /* Read 2 bytes as the new marker ID */ - opj_read_bytes(p_j2k->m_specific_param.m_decoder.m_header_data, - &l_current_marker, 2); - - /* Try to read until the SOT is detected */ - while (l_current_marker != J2K_MS_SOT) { - - /* Check if the current marker ID is valid */ - if (l_current_marker < 0xff00) { - opj_event_msg(p_manager, EVT_ERROR, - "A marker ID was expected (0xff--) instead of %.8x\n", l_current_marker); - return OPJ_FALSE; - } - - /* Get the marker handler from the marker ID */ - l_marker_handler = opj_j2k_get_marker_handler(l_current_marker); - - /* Manage case where marker is unknown */ - if (l_marker_handler->id == J2K_MS_UNK) { - if (! 
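/*
 * [sketch] All marker IDs and length fields above are consumed two bytes
 * at a time via opj_read_bytes, big-endian per the codestream layout, and
 * anything below 0xff00 is rejected because every marker ID has 0xFF as
 * its first byte. Portable standalone equivalents (names illustrative):
 */
#include <stdint.h>

static uint32_t read_be16(const uint8_t *p)
{
    return ((uint32_t)p[0] << 8) | p[1];   /* MSB first */
}

static int is_marker_id(uint32_t v)
{
    return v >= 0xff00u;                   /* first byte must be 0xFF */
}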
opj_j2k_read_unk(p_j2k, p_stream, &l_current_marker, p_manager)) { - opj_event_msg(p_manager, EVT_ERROR, - "Unknow marker have been detected and generated error.\n"); - return OPJ_FALSE; - } - - if (l_current_marker == J2K_MS_SOT) { - break; /* SOT marker is detected main header is completely read */ - } else { /* Get the marker handler from the marker ID */ - l_marker_handler = opj_j2k_get_marker_handler(l_current_marker); - } - } - - if (l_marker_handler->id == J2K_MS_SIZ) { - /* Mark required SIZ marker as found */ - l_has_siz = 1; - } - if (l_marker_handler->id == J2K_MS_COD) { - /* Mark required COD marker as found */ - l_has_cod = 1; - } - if (l_marker_handler->id == J2K_MS_QCD) { - /* Mark required QCD marker as found */ - l_has_qcd = 1; - } - - /* Check if the marker is known and if it is the right place to find it */ - if (!(p_j2k->m_specific_param.m_decoder.m_state & l_marker_handler->states)) { - opj_event_msg(p_manager, EVT_ERROR, - "Marker is not compliant with its position\n"); - return OPJ_FALSE; - } - - /* Try to read 2 bytes (the marker size) from stream and copy them into the buffer */ - if (opj_stream_read_data(p_stream, - p_j2k->m_specific_param.m_decoder.m_header_data, 2, p_manager) != 2) { - opj_event_msg(p_manager, EVT_ERROR, "Stream too short\n"); - return OPJ_FALSE; - } - - /* read 2 bytes as the marker size */ - opj_read_bytes(p_j2k->m_specific_param.m_decoder.m_header_data, &l_marker_size, - 2); - if (l_marker_size < 2) { - opj_event_msg(p_manager, EVT_ERROR, "Invalid marker size\n"); - return OPJ_FALSE; - } - l_marker_size -= 2; /* Subtract the size of the marker ID already read */ - - /* Check if the marker size is compatible with the header data size */ - if (l_marker_size > p_j2k->m_specific_param.m_decoder.m_header_data_size) { - OPJ_BYTE *new_header_data = (OPJ_BYTE *) opj_realloc( - p_j2k->m_specific_param.m_decoder.m_header_data, l_marker_size); - if (! 
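/*
 * [sketch] A segment's length field counts its own two bytes but not the
 * marker ID, so the payload still to read is L-2, and the scratch buffer
 * is grown only when that payload exceeds the current capacity. A
 * condensed standalone skeleton using FILE* as a stand-in for the
 * opj_stream machinery (read_segment is not a library call):
 */
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

static int read_segment(FILE *f, uint32_t len, uint8_t **buf, uint32_t *cap)
{
    uint32_t payload;
    if (len < 2) {
        return 0;                       /* invalid length field */
    }
    payload = len - 2;
    if (payload > *cap) {
        uint8_t *nb = (uint8_t *)realloc(*buf, payload);
        if (nb == NULL) {
            return 0;                   /* *buf still owned by the caller */
        }
        *buf = nb;
        *cap = payload;
    }
    return fread(*buf, 1, payload, f) == payload;
}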
new_header_data) { - opj_free(p_j2k->m_specific_param.m_decoder.m_header_data); - p_j2k->m_specific_param.m_decoder.m_header_data = NULL; - p_j2k->m_specific_param.m_decoder.m_header_data_size = 0; - opj_event_msg(p_manager, EVT_ERROR, "Not enough memory to read header\n"); - return OPJ_FALSE; - } - p_j2k->m_specific_param.m_decoder.m_header_data = new_header_data; - p_j2k->m_specific_param.m_decoder.m_header_data_size = l_marker_size; - } - - /* Try to read the rest of the marker segment from stream and copy them into the buffer */ - if (opj_stream_read_data(p_stream, - p_j2k->m_specific_param.m_decoder.m_header_data, l_marker_size, - p_manager) != l_marker_size) { - opj_event_msg(p_manager, EVT_ERROR, "Stream too short\n"); - return OPJ_FALSE; - } - - /* Read the marker segment with the correct marker handler */ - if (!(*(l_marker_handler->handler))(p_j2k, - p_j2k->m_specific_param.m_decoder.m_header_data, l_marker_size, p_manager)) { - opj_event_msg(p_manager, EVT_ERROR, - "Marker handler function failed to read the marker segment\n"); - return OPJ_FALSE; - } - - /* Add the marker to the codestream index*/ - if (OPJ_FALSE == opj_j2k_add_mhmarker( - p_j2k->cstr_index, - l_marker_handler->id, - (OPJ_UINT32) opj_stream_tell(p_stream) - l_marker_size - 4, - l_marker_size + 4)) { - opj_event_msg(p_manager, EVT_ERROR, "Not enough memory to add mh marker\n"); - return OPJ_FALSE; - } - - /* Try to read 2 bytes (the next marker ID) from stream and copy them into the buffer */ - if (opj_stream_read_data(p_stream, - p_j2k->m_specific_param.m_decoder.m_header_data, 2, p_manager) != 2) { - opj_event_msg(p_manager, EVT_ERROR, "Stream too short\n"); - return OPJ_FALSE; - } - - /* read 2 bytes as the new marker ID */ - opj_read_bytes(p_j2k->m_specific_param.m_decoder.m_header_data, - &l_current_marker, 2); - } - - if (l_has_siz == 0) { - opj_event_msg(p_manager, EVT_ERROR, - "required SIZ marker not found in main header\n"); - return OPJ_FALSE; - } - if (l_has_cod == 0) { - opj_event_msg(p_manager, EVT_ERROR, - "required COD marker not found in main header\n"); - return OPJ_FALSE; - } - if (l_has_qcd == 0) { - opj_event_msg(p_manager, EVT_ERROR, - "required QCD marker not found in main header\n"); - return OPJ_FALSE; - } - - if (! 
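/*
 * [sketch] The index entry above recovers a segment's absolute start from
 * the position reached after its payload: 2 bytes of ID, 2 of length field
 * and payload bytes (the length minus 2) have gone by, so the segment
 * began at tell - payload - 4 and spans payload + 4 bytes on disk.
 * Standalone restatement (hypothetical helper):
 */
#include <stdint.h>

static void segment_extent(int64_t pos_after_payload, uint32_t payload,
                           int64_t *start, uint32_t *total_len)
{
    *start     = pos_after_payload - (int64_t)payload - 4; /* ID + Lmar  */
    *total_len = payload + 4;                              /* whole span */
}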
opj_j2k_merge_ppm(&(p_j2k->m_cp), p_manager)) { - opj_event_msg(p_manager, EVT_ERROR, "Failed to merge PPM data\n"); - return OPJ_FALSE; - } - - opj_event_msg(p_manager, EVT_INFO, "Main header has been correctly decoded.\n"); - - /* Position of the last element if the main header */ - p_j2k->cstr_index->main_head_end = (OPJ_UINT32) opj_stream_tell(p_stream) - 2; - - /* Next step: read a tile-part header */ - p_j2k->m_specific_param.m_decoder.m_state = J2K_STATE_TPHSOT; - - return OPJ_TRUE; -} - -static OPJ_BOOL opj_j2k_exec(opj_j2k_t * p_j2k, - opj_procedure_list_t * p_procedure_list, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager) -{ - OPJ_BOOL(** l_procedure)(opj_j2k_t *, opj_stream_private_t *, - opj_event_mgr_t *) = 00; - OPJ_BOOL l_result = OPJ_TRUE; - OPJ_UINT32 l_nb_proc, i; - - /* preconditions*/ - assert(p_procedure_list != 00); - assert(p_j2k != 00); - assert(p_stream != 00); - assert(p_manager != 00); - - l_nb_proc = opj_procedure_list_get_nb_procedures(p_procedure_list); - l_procedure = (OPJ_BOOL(**)(opj_j2k_t *, opj_stream_private_t *, - opj_event_mgr_t *)) opj_procedure_list_get_first_procedure(p_procedure_list); - - for (i = 0; i < l_nb_proc; ++i) { - l_result = l_result && ((*l_procedure)(p_j2k, p_stream, p_manager)); - ++l_procedure; - } - - /* and clear the procedure list at the end.*/ - opj_procedure_list_clear(p_procedure_list); - return l_result; -} - -/* FIXME DOC*/ -static OPJ_BOOL opj_j2k_copy_default_tcp_and_create_tcd(opj_j2k_t * p_j2k, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager - ) -{ - opj_tcp_t * l_tcp = 00; - opj_tcp_t * l_default_tcp = 00; - OPJ_UINT32 l_nb_tiles; - OPJ_UINT32 i, j; - opj_tccp_t *l_current_tccp = 00; - OPJ_UINT32 l_tccp_size; - OPJ_UINT32 l_mct_size; - opj_image_t * l_image; - OPJ_UINT32 l_mcc_records_size, l_mct_records_size; - opj_mct_data_t * l_src_mct_rec, *l_dest_mct_rec; - opj_simple_mcc_decorrelation_data_t * l_src_mcc_rec, *l_dest_mcc_rec; - OPJ_UINT32 l_offset; - - /* preconditions */ - assert(p_j2k != 00); - assert(p_stream != 00); - assert(p_manager != 00); - - OPJ_UNUSED(p_stream); - - l_image = p_j2k->m_private_image; - l_nb_tiles = p_j2k->m_cp.th * p_j2k->m_cp.tw; - l_tcp = p_j2k->m_cp.tcps; - l_tccp_size = l_image->numcomps * (OPJ_UINT32)sizeof(opj_tccp_t); - l_default_tcp = p_j2k->m_specific_param.m_decoder.m_default_tcp; - l_mct_size = l_image->numcomps * l_image->numcomps * (OPJ_UINT32)sizeof( - OPJ_FLOAT32); - - /* For each tile */ - for (i = 0; i < l_nb_tiles; ++i) { - /* keep the tile-compo coding parameters pointer of the current tile coding parameters*/ - l_current_tccp = l_tcp->tccps; - /*Copy default coding parameters into the current tile coding parameters*/ - memcpy(l_tcp, l_default_tcp, sizeof(opj_tcp_t)); - /* Initialize some values of the current tile coding parameters*/ - l_tcp->cod = 0; - l_tcp->ppt = 0; - l_tcp->ppt_data = 00; - l_tcp->m_current_tile_part_number = -1; - /* Remove memory not owned by this tile in case of early error return. */ - l_tcp->m_mct_decoding_matrix = 00; - l_tcp->m_nb_max_mct_records = 0; - l_tcp->m_mct_records = 00; - l_tcp->m_nb_max_mcc_records = 0; - l_tcp->m_mcc_records = 00; - /* Reconnect the tile-compo coding parameters pointer to the current tile coding parameters*/ - l_tcp->tccps = l_current_tccp; - - /* Get the mct_decoding_matrix of the dflt_tile_cp and copy them into the current tile cp*/ - if (l_default_tcp->m_mct_decoding_matrix) { - l_tcp->m_mct_decoding_matrix = (OPJ_FLOAT32*)opj_malloc(l_mct_size); - if (! 
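/*
 * [sketch] The per-tile copy above memcpy()s the whole default opj_tcp_t
 * and then immediately nulls every pointer the copy does not own yet, so
 * an early error return can free each tile without double-freeing the
 * defaults. The idiom in isolation (params_t/clone_params are invented):
 */
#include <string.h>
#include <stdlib.h>

typedef struct {
    float *matrix;   /* owned allocation */
    int    n;
} params_t;

static int clone_params(params_t *dst, const params_t *src)
{
    memcpy(dst, src, sizeof(*dst));
    dst->matrix = NULL;                  /* detach before duplicating */
    if (src->matrix != NULL) {
        dst->matrix = (float *)malloc((size_t)src->n * sizeof(float));
        if (dst->matrix == NULL) {
            return 0;                    /* dst is still safely freeable */
        }
        memcpy(dst->matrix, src->matrix, (size_t)src->n * sizeof(float));
    }
    return 1;
}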
l_tcp->m_mct_decoding_matrix) { - return OPJ_FALSE; - } - memcpy(l_tcp->m_mct_decoding_matrix, l_default_tcp->m_mct_decoding_matrix, - l_mct_size); - } - - /* Get the mct_record of the dflt_tile_cp and copy them into the current tile cp*/ - l_mct_records_size = l_default_tcp->m_nb_max_mct_records * (OPJ_UINT32)sizeof( - opj_mct_data_t); - l_tcp->m_mct_records = (opj_mct_data_t*)opj_malloc(l_mct_records_size); - if (! l_tcp->m_mct_records) { - return OPJ_FALSE; - } - memcpy(l_tcp->m_mct_records, l_default_tcp->m_mct_records, l_mct_records_size); - - /* Copy the mct record data from dflt_tile_cp to the current tile*/ - l_src_mct_rec = l_default_tcp->m_mct_records; - l_dest_mct_rec = l_tcp->m_mct_records; - - for (j = 0; j < l_default_tcp->m_nb_mct_records; ++j) { - - if (l_src_mct_rec->m_data) { - - l_dest_mct_rec->m_data = (OPJ_BYTE*) opj_malloc(l_src_mct_rec->m_data_size); - if (! l_dest_mct_rec->m_data) { - return OPJ_FALSE; - } - memcpy(l_dest_mct_rec->m_data, l_src_mct_rec->m_data, - l_src_mct_rec->m_data_size); - } - - ++l_src_mct_rec; - ++l_dest_mct_rec; - /* Update with each pass to free exactly what has been allocated on early return. */ - l_tcp->m_nb_max_mct_records += 1; - } - - /* Get the mcc_record of the dflt_tile_cp and copy them into the current tile cp*/ - l_mcc_records_size = l_default_tcp->m_nb_max_mcc_records * (OPJ_UINT32)sizeof( - opj_simple_mcc_decorrelation_data_t); - l_tcp->m_mcc_records = (opj_simple_mcc_decorrelation_data_t*) opj_malloc( - l_mcc_records_size); - if (! l_tcp->m_mcc_records) { - return OPJ_FALSE; - } - memcpy(l_tcp->m_mcc_records, l_default_tcp->m_mcc_records, l_mcc_records_size); - l_tcp->m_nb_max_mcc_records = l_default_tcp->m_nb_max_mcc_records; - - /* Copy the mcc record data from dflt_tile_cp to the current tile*/ - l_src_mcc_rec = l_default_tcp->m_mcc_records; - l_dest_mcc_rec = l_tcp->m_mcc_records; - - for (j = 0; j < l_default_tcp->m_nb_max_mcc_records; ++j) { - - if (l_src_mcc_rec->m_decorrelation_array) { - l_offset = (OPJ_UINT32)(l_src_mcc_rec->m_decorrelation_array - - l_default_tcp->m_mct_records); - l_dest_mcc_rec->m_decorrelation_array = l_tcp->m_mct_records + l_offset; - } - - if (l_src_mcc_rec->m_offset_array) { - l_offset = (OPJ_UINT32)(l_src_mcc_rec->m_offset_array - - l_default_tcp->m_mct_records); - l_dest_mcc_rec->m_offset_array = l_tcp->m_mct_records + l_offset; - } - - ++l_src_mcc_rec; - ++l_dest_mcc_rec; - } - - /* Copy all the dflt_tile_compo_cp to the current tile cp */ - memcpy(l_current_tccp, l_default_tcp->tccps, l_tccp_size); - - /* Move to next tile cp*/ - ++l_tcp; - } - - /* Create the current tile decoder*/ - p_j2k->m_tcd = opj_tcd_create(OPJ_TRUE); - if (! 
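/*
 * [sketch] The mcc records hold pointers into the mct records array; when
 * both arrays are duplicated, each pointer is turned into an element
 * offset against the source base and re-applied to the destination base,
 * exactly as l_offset is used above. Generic form (rebase is invented):
 */
#include <stddef.h>

typedef struct { int v; } rec_t;

/* Translate a pointer into src_base[] to the matching element of the
 * copied array dst_base[], preserving NULL. */
static rec_t *rebase(const rec_t *p, const rec_t *src_base, rec_t *dst_base)
{
    return (p == NULL) ? NULL : dst_base + (p - src_base);
}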
p_j2k->m_tcd) { - return OPJ_FALSE; - } - - if (!opj_tcd_init(p_j2k->m_tcd, l_image, &(p_j2k->m_cp), p_j2k->m_tp)) { - opj_tcd_destroy(p_j2k->m_tcd); - p_j2k->m_tcd = 00; - opj_event_msg(p_manager, EVT_ERROR, "Cannot decode tile, memory error\n"); - return OPJ_FALSE; - } - - return OPJ_TRUE; -} - -static const opj_dec_memory_marker_handler_t * opj_j2k_get_marker_handler( - OPJ_UINT32 p_id) -{ - const opj_dec_memory_marker_handler_t *e; - for (e = j2k_memory_marker_handler_tab; e->id != 0; ++e) { - if (e->id == p_id) { - break; /* we find a handler corresponding to the marker ID*/ - } - } - return e; -} - -void opj_j2k_destroy(opj_j2k_t *p_j2k) -{ - if (p_j2k == 00) { - return; - } - - if (p_j2k->m_is_decoder) { - - if (p_j2k->m_specific_param.m_decoder.m_default_tcp != 00) { - opj_j2k_tcp_destroy(p_j2k->m_specific_param.m_decoder.m_default_tcp); - opj_free(p_j2k->m_specific_param.m_decoder.m_default_tcp); - p_j2k->m_specific_param.m_decoder.m_default_tcp = 00; - } - - if (p_j2k->m_specific_param.m_decoder.m_header_data != 00) { - opj_free(p_j2k->m_specific_param.m_decoder.m_header_data); - p_j2k->m_specific_param.m_decoder.m_header_data = 00; - p_j2k->m_specific_param.m_decoder.m_header_data_size = 0; - } - - opj_free(p_j2k->m_specific_param.m_decoder.m_comps_indices_to_decode); - p_j2k->m_specific_param.m_decoder.m_comps_indices_to_decode = 00; - p_j2k->m_specific_param.m_decoder.m_numcomps_to_decode = 0; - - } else { - - if (p_j2k->m_specific_param.m_encoder.m_encoded_tile_data) { - opj_free(p_j2k->m_specific_param.m_encoder.m_encoded_tile_data); - p_j2k->m_specific_param.m_encoder.m_encoded_tile_data = 00; - } - - if (p_j2k->m_specific_param.m_encoder.m_tlm_sot_offsets_buffer) { - opj_free(p_j2k->m_specific_param.m_encoder.m_tlm_sot_offsets_buffer); - p_j2k->m_specific_param.m_encoder.m_tlm_sot_offsets_buffer = 00; - p_j2k->m_specific_param.m_encoder.m_tlm_sot_offsets_current = 00; - } - - if (p_j2k->m_specific_param.m_encoder.m_header_tile_data) { - opj_free(p_j2k->m_specific_param.m_encoder.m_header_tile_data); - p_j2k->m_specific_param.m_encoder.m_header_tile_data = 00; - p_j2k->m_specific_param.m_encoder.m_header_tile_data_size = 0; - } - } - - opj_tcd_destroy(p_j2k->m_tcd); - - opj_j2k_cp_destroy(&(p_j2k->m_cp)); - memset(&(p_j2k->m_cp), 0, sizeof(opj_cp_t)); - - opj_procedure_list_destroy(p_j2k->m_procedure_list); - p_j2k->m_procedure_list = 00; - - opj_procedure_list_destroy(p_j2k->m_validation_list); - p_j2k->m_procedure_list = 00; - - j2k_destroy_cstr_index(p_j2k->cstr_index); - p_j2k->cstr_index = NULL; - - opj_image_destroy(p_j2k->m_private_image); - p_j2k->m_private_image = NULL; - - opj_image_destroy(p_j2k->m_output_image); - p_j2k->m_output_image = NULL; - - opj_thread_pool_destroy(p_j2k->m_tp); - p_j2k->m_tp = NULL; - - opj_free(p_j2k); -} - -void j2k_destroy_cstr_index(opj_codestream_index_t *p_cstr_ind) -{ - if (p_cstr_ind) { - - if (p_cstr_ind->marker) { - opj_free(p_cstr_ind->marker); - p_cstr_ind->marker = NULL; - } - - if (p_cstr_ind->tile_index) { - OPJ_UINT32 it_tile = 0; - - for (it_tile = 0; it_tile < p_cstr_ind->nb_of_tiles; it_tile++) { - - if (p_cstr_ind->tile_index[it_tile].packet_index) { - opj_free(p_cstr_ind->tile_index[it_tile].packet_index); - p_cstr_ind->tile_index[it_tile].packet_index = NULL; - } - - if (p_cstr_ind->tile_index[it_tile].tp_index) { - opj_free(p_cstr_ind->tile_index[it_tile].tp_index); - p_cstr_ind->tile_index[it_tile].tp_index = NULL; - } - - if (p_cstr_ind->tile_index[it_tile].marker) { - 
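/*
 * [sketch] opj_j2k_get_marker_handler above scans a table terminated by an
 * id == 0 sentinel, so an unknown marker falls through to that terminator
 * (whose handler covers J2K_MS_UNK) instead of yielding NULL. The pattern
 * on its own (handler_t and find_handler are illustrative names):
 */
#include <stdint.h>

typedef struct {
    uint32_t id;
    int (*handler)(void *ctx);
} handler_t;

/* Always returns a valid entry: either the match or the sentinel, which
 * doubles as the "unknown marker" fallback. */
static const handler_t *find_handler(const handler_t *tab, uint32_t id)
{
    const handler_t *e;
    for (e = tab; e->id != 0; ++e) {
        if (e->id == id) {
            break;
        }
    }
    return e;
}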
opj_free(p_cstr_ind->tile_index[it_tile].marker); - p_cstr_ind->tile_index[it_tile].marker = NULL; - - } - } - - opj_free(p_cstr_ind->tile_index); - p_cstr_ind->tile_index = NULL; - } - - opj_free(p_cstr_ind); - } -} - -static void opj_j2k_tcp_destroy(opj_tcp_t *p_tcp) -{ - if (p_tcp == 00) { - return; - } - - if (p_tcp->ppt_markers != 00) { - OPJ_UINT32 i; - for (i = 0U; i < p_tcp->ppt_markers_count; ++i) { - if (p_tcp->ppt_markers[i].m_data != NULL) { - opj_free(p_tcp->ppt_markers[i].m_data); - } - } - p_tcp->ppt_markers_count = 0U; - opj_free(p_tcp->ppt_markers); - p_tcp->ppt_markers = NULL; - } - - if (p_tcp->ppt_buffer != 00) { - opj_free(p_tcp->ppt_buffer); - p_tcp->ppt_buffer = 00; - } - - if (p_tcp->tccps != 00) { - opj_free(p_tcp->tccps); - p_tcp->tccps = 00; - } - - if (p_tcp->m_mct_coding_matrix != 00) { - opj_free(p_tcp->m_mct_coding_matrix); - p_tcp->m_mct_coding_matrix = 00; - } - - if (p_tcp->m_mct_decoding_matrix != 00) { - opj_free(p_tcp->m_mct_decoding_matrix); - p_tcp->m_mct_decoding_matrix = 00; - } - - if (p_tcp->m_mcc_records) { - opj_free(p_tcp->m_mcc_records); - p_tcp->m_mcc_records = 00; - p_tcp->m_nb_max_mcc_records = 0; - p_tcp->m_nb_mcc_records = 0; - } - - if (p_tcp->m_mct_records) { - opj_mct_data_t * l_mct_data = p_tcp->m_mct_records; - OPJ_UINT32 i; - - for (i = 0; i < p_tcp->m_nb_mct_records; ++i) { - if (l_mct_data->m_data) { - opj_free(l_mct_data->m_data); - l_mct_data->m_data = 00; - } - - ++l_mct_data; - } - - opj_free(p_tcp->m_mct_records); - p_tcp->m_mct_records = 00; - } - - if (p_tcp->mct_norms != 00) { - opj_free(p_tcp->mct_norms); - p_tcp->mct_norms = 00; - } - - opj_j2k_tcp_data_destroy(p_tcp); - -} - -static void opj_j2k_tcp_data_destroy(opj_tcp_t *p_tcp) -{ - if (p_tcp->m_data) { - opj_free(p_tcp->m_data); - p_tcp->m_data = NULL; - p_tcp->m_data_size = 0; - } -} - -static void opj_j2k_cp_destroy(opj_cp_t *p_cp) -{ - OPJ_UINT32 l_nb_tiles; - opj_tcp_t * l_current_tile = 00; - - if (p_cp == 00) { - return; - } - if (p_cp->tcps != 00) { - OPJ_UINT32 i; - l_current_tile = p_cp->tcps; - l_nb_tiles = p_cp->th * p_cp->tw; - - for (i = 0U; i < l_nb_tiles; ++i) { - opj_j2k_tcp_destroy(l_current_tile); - ++l_current_tile; - } - opj_free(p_cp->tcps); - p_cp->tcps = 00; - } - if (p_cp->ppm_markers != 00) { - OPJ_UINT32 i; - for (i = 0U; i < p_cp->ppm_markers_count; ++i) { - if (p_cp->ppm_markers[i].m_data != NULL) { - opj_free(p_cp->ppm_markers[i].m_data); - } - } - p_cp->ppm_markers_count = 0U; - opj_free(p_cp->ppm_markers); - p_cp->ppm_markers = NULL; - } - opj_free(p_cp->ppm_buffer); - p_cp->ppm_buffer = 00; - p_cp->ppm_data = - NULL; /* ppm_data belongs to the allocated buffer pointed by ppm_buffer */ - opj_free(p_cp->comment); - p_cp->comment = 00; - if (! 
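/*
 * [sketch] Ownership nuance from the teardown above: ppm_data is only a
 * cursor into the allocation owned by ppm_buffer, so destruction frees the
 * buffer once and merely nulls the alias. Isolated (struct name invented):
 */
#include <stdlib.h>

typedef struct {
    unsigned char *buffer;   /* owns the allocation */
    unsigned char *cursor;   /* aliases buffer; never freed directly */
} ppm_like_t;

static void ppm_like_destroy(ppm_like_t *p)
{
    free(p->buffer);         /* exactly one free for the allocation */
    p->buffer = NULL;
    p->cursor = NULL;        /* clear the alias so it cannot dangle */
}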
p_cp->m_is_decoder) { - opj_free(p_cp->m_specific_param.m_enc.m_matrice); - p_cp->m_specific_param.m_enc.m_matrice = 00; - } -} - -static OPJ_BOOL opj_j2k_need_nb_tile_parts_correction(opj_stream_private_t - *p_stream, OPJ_UINT32 tile_no, OPJ_BOOL* p_correction_needed, - opj_event_mgr_t * p_manager) -{ - OPJ_BYTE l_header_data[10]; - OPJ_OFF_T l_stream_pos_backup; - OPJ_UINT32 l_current_marker; - OPJ_UINT32 l_marker_size; - OPJ_UINT32 l_tile_no, l_tot_len, l_current_part, l_num_parts; - - /* initialize to no correction needed */ - *p_correction_needed = OPJ_FALSE; - - if (!opj_stream_has_seek(p_stream)) { - /* We can't do much in this case, seek is needed */ - return OPJ_TRUE; - } - - l_stream_pos_backup = opj_stream_tell(p_stream); - if (l_stream_pos_backup == -1) { - /* let's do nothing */ - return OPJ_TRUE; - } - - for (;;) { - /* Try to read 2 bytes (the next marker ID) from stream and copy them into the buffer */ - if (opj_stream_read_data(p_stream, l_header_data, 2, p_manager) != 2) { - /* assume all is OK */ - if (! opj_stream_seek(p_stream, l_stream_pos_backup, p_manager)) { - return OPJ_FALSE; - } - return OPJ_TRUE; - } - - /* Read 2 bytes from buffer as the new marker ID */ - opj_read_bytes(l_header_data, &l_current_marker, 2); - - if (l_current_marker != J2K_MS_SOT) { - /* assume all is OK */ - if (! opj_stream_seek(p_stream, l_stream_pos_backup, p_manager)) { - return OPJ_FALSE; - } - return OPJ_TRUE; - } - - /* Try to read 2 bytes (the marker size) from stream and copy them into the buffer */ - if (opj_stream_read_data(p_stream, l_header_data, 2, p_manager) != 2) { - opj_event_msg(p_manager, EVT_ERROR, "Stream too short\n"); - return OPJ_FALSE; - } - - /* Read 2 bytes from the buffer as the marker size */ - opj_read_bytes(l_header_data, &l_marker_size, 2); - - /* Check marker size for SOT Marker */ - if (l_marker_size != 10) { - opj_event_msg(p_manager, EVT_ERROR, "Inconsistent marker size\n"); - return OPJ_FALSE; - } - l_marker_size -= 2; - - if (opj_stream_read_data(p_stream, l_header_data, l_marker_size, - p_manager) != l_marker_size) { - opj_event_msg(p_manager, EVT_ERROR, "Stream too short\n"); - return OPJ_FALSE; - } - - if (! opj_j2k_get_sot_values(l_header_data, l_marker_size, &l_tile_no, - &l_tot_len, &l_current_part, &l_num_parts, p_manager)) { - return OPJ_FALSE; - } - - if (l_tile_no == tile_no) { - /* we found what we were looking for */ - break; - } - - if (l_tot_len < 14U) { - /* last SOT until EOC or invalid Psot value */ - /* assume all is OK */ - if (! opj_stream_seek(p_stream, l_stream_pos_backup, p_manager)) { - return OPJ_FALSE; - } - return OPJ_TRUE; - } - l_tot_len -= 12U; - /* look for next SOT marker */ - if (opj_stream_skip(p_stream, (OPJ_OFF_T)(l_tot_len), - p_manager) != (OPJ_OFF_T)(l_tot_len)) { - /* assume all is OK */ - if (! opj_stream_seek(p_stream, l_stream_pos_backup, p_manager)) { - return OPJ_FALSE; - } - return OPJ_TRUE; - } - } - - /* check for correction */ - if (l_current_part == l_num_parts) { - *p_correction_needed = OPJ_TRUE; - } - - if (! 
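/*
 * [sketch] The scan above hops from SOT to SOT by skipping Psot minus the
 * 12 bytes already consumed (2 ID + 2 length + 8 payload). The SOT payload
 * itself is: Isot(2) tile index, Psot(4) tile-part length counted from the
 * marker, TPsot(1) tile-part index, TNsot(1) number of tile-parts (0 when
 * not yet known). Standalone decode of those 8 bytes (sot_t is invented):
 */
#include <stdint.h>

typedef struct {
    uint32_t isot, psot, tpsot, tnsot;
} sot_t;

static void parse_sot_payload(const uint8_t *p, sot_t *s)
{
    s->isot  = ((uint32_t)p[0] << 8)  | p[1];
    s->psot  = ((uint32_t)p[2] << 24) | ((uint32_t)p[3] << 16) |
               ((uint32_t)p[4] << 8)  | p[5];
    s->tpsot = p[6];
    s->tnsot = p[7];
}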
opj_stream_seek(p_stream, l_stream_pos_backup, p_manager)) { - return OPJ_FALSE; - } - return OPJ_TRUE; -} - -OPJ_BOOL opj_j2k_read_tile_header(opj_j2k_t * p_j2k, - OPJ_UINT32 * p_tile_index, - OPJ_UINT32 * p_data_size, - OPJ_INT32 * p_tile_x0, OPJ_INT32 * p_tile_y0, - OPJ_INT32 * p_tile_x1, OPJ_INT32 * p_tile_y1, - OPJ_UINT32 * p_nb_comps, - OPJ_BOOL * p_go_on, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager) -{ - OPJ_UINT32 l_current_marker = J2K_MS_SOT; - OPJ_UINT32 l_marker_size; - const opj_dec_memory_marker_handler_t * l_marker_handler = 00; - opj_tcp_t * l_tcp = NULL; - - /* preconditions */ - assert(p_stream != 00); - assert(p_j2k != 00); - assert(p_manager != 00); - - /* Reach the End Of Codestream ?*/ - if (p_j2k->m_specific_param.m_decoder.m_state == J2K_STATE_EOC) { - l_current_marker = J2K_MS_EOC; - } - /* We need to encounter a SOT marker (a new tile-part header) */ - else if (p_j2k->m_specific_param.m_decoder.m_state != J2K_STATE_TPHSOT) { - return OPJ_FALSE; - } - - /* Read into the codestream until reach the EOC or ! can_decode ??? FIXME */ - while ((!p_j2k->m_specific_param.m_decoder.m_can_decode) && - (l_current_marker != J2K_MS_EOC)) { - - /* Try to read until the Start Of Data is detected */ - while (l_current_marker != J2K_MS_SOD) { - - if (opj_stream_get_number_byte_left(p_stream) == 0) { - p_j2k->m_specific_param.m_decoder.m_state = J2K_STATE_NEOC; - break; - } - - /* Try to read 2 bytes (the marker size) from stream and copy them into the buffer */ - if (opj_stream_read_data(p_stream, - p_j2k->m_specific_param.m_decoder.m_header_data, 2, p_manager) != 2) { - opj_event_msg(p_manager, EVT_ERROR, "Stream too short\n"); - return OPJ_FALSE; - } - - /* Read 2 bytes from the buffer as the marker size */ - opj_read_bytes(p_j2k->m_specific_param.m_decoder.m_header_data, &l_marker_size, - 2); - - /* Check marker size (does not include marker ID but includes marker size) */ - if (l_marker_size < 2) { - opj_event_msg(p_manager, EVT_ERROR, "Inconsistent marker size\n"); - return OPJ_FALSE; - } - - /* cf. https://code.google.com/p/openjpeg/issues/detail?id=226 */ - if (l_current_marker == 0x8080 && - opj_stream_get_number_byte_left(p_stream) == 0) { - p_j2k->m_specific_param.m_decoder.m_state = J2K_STATE_NEOC; - break; - } - - /* Why this condition? FIXME */ - if (p_j2k->m_specific_param.m_decoder.m_state & J2K_STATE_TPH) { - p_j2k->m_specific_param.m_decoder.m_sot_length -= (l_marker_size + 2); - } - l_marker_size -= 2; /* Subtract the size of the marker ID already read */ - - /* Get the marker handler from the marker ID */ - l_marker_handler = opj_j2k_get_marker_handler(l_current_marker); - - /* Check if the marker is known and if it is the right place to find it */ - if (!(p_j2k->m_specific_param.m_decoder.m_state & l_marker_handler->states)) { - opj_event_msg(p_manager, EVT_ERROR, - "Marker is not compliant with its position\n"); - return OPJ_FALSE; - } - /* FIXME manage case of unknown marker as in the main header ? 
*/ - - /* Check if the marker size is compatible with the header data size */ - if (l_marker_size > p_j2k->m_specific_param.m_decoder.m_header_data_size) { - OPJ_BYTE *new_header_data = NULL; - /* If we are here, this means we consider this marker as known & we will read it */ - /* Check enough bytes left in stream before allocation */ - if ((OPJ_OFF_T)l_marker_size > opj_stream_get_number_byte_left(p_stream)) { - opj_event_msg(p_manager, EVT_ERROR, - "Marker size inconsistent with stream length\n"); - return OPJ_FALSE; - } - new_header_data = (OPJ_BYTE *) opj_realloc( - p_j2k->m_specific_param.m_decoder.m_header_data, l_marker_size); - if (! new_header_data) { - opj_free(p_j2k->m_specific_param.m_decoder.m_header_data); - p_j2k->m_specific_param.m_decoder.m_header_data = NULL; - p_j2k->m_specific_param.m_decoder.m_header_data_size = 0; - opj_event_msg(p_manager, EVT_ERROR, "Not enough memory to read header\n"); - return OPJ_FALSE; - } - p_j2k->m_specific_param.m_decoder.m_header_data = new_header_data; - p_j2k->m_specific_param.m_decoder.m_header_data_size = l_marker_size; - } - - /* Try to read the rest of the marker segment from stream and copy them into the buffer */ - if (opj_stream_read_data(p_stream, - p_j2k->m_specific_param.m_decoder.m_header_data, l_marker_size, - p_manager) != l_marker_size) { - opj_event_msg(p_manager, EVT_ERROR, "Stream too short\n"); - return OPJ_FALSE; - } - - if (!l_marker_handler->handler) { - /* See issue #175 */ - opj_event_msg(p_manager, EVT_ERROR, "Not sure how that happened.\n"); - return OPJ_FALSE; - } - /* Read the marker segment with the correct marker handler */ - if (!(*(l_marker_handler->handler))(p_j2k, - p_j2k->m_specific_param.m_decoder.m_header_data, l_marker_size, p_manager)) { - opj_event_msg(p_manager, EVT_ERROR, - "Fail to read the current marker segment (%#x)\n", l_current_marker); - return OPJ_FALSE; - } - - /* Add the marker to the codestream index*/ - if (OPJ_FALSE == opj_j2k_add_tlmarker(p_j2k->m_current_tile_number, - p_j2k->cstr_index, - l_marker_handler->id, - (OPJ_UINT32) opj_stream_tell(p_stream) - l_marker_size - 4, - l_marker_size + 4)) { - opj_event_msg(p_manager, EVT_ERROR, "Not enough memory to add tl marker\n"); - return OPJ_FALSE; - } - - /* Keep the position of the last SOT marker read */ - if (l_marker_handler->id == J2K_MS_SOT) { - OPJ_UINT32 sot_pos = (OPJ_UINT32) opj_stream_tell(p_stream) - l_marker_size - 4 - ; - if (sot_pos > p_j2k->m_specific_param.m_decoder.m_last_sot_read_pos) { - p_j2k->m_specific_param.m_decoder.m_last_sot_read_pos = sot_pos; - } - } - - if (p_j2k->m_specific_param.m_decoder.m_skip_data) { - /* Skip the rest of the tile part header*/ - if (opj_stream_skip(p_stream, p_j2k->m_specific_param.m_decoder.m_sot_length, - p_manager) != p_j2k->m_specific_param.m_decoder.m_sot_length) { - opj_event_msg(p_manager, EVT_ERROR, "Stream too short\n"); - return OPJ_FALSE; - } - l_current_marker = J2K_MS_SOD; /* Normally we reached a SOD */ - } else { - /* Try to read 2 bytes (the next marker ID) from stream and copy them into the buffer*/ - if (opj_stream_read_data(p_stream, - p_j2k->m_specific_param.m_decoder.m_header_data, 2, p_manager) != 2) { - opj_event_msg(p_manager, EVT_ERROR, "Stream too short\n"); - return OPJ_FALSE; - } - /* Read 2 bytes from the buffer as the new marker ID */ - opj_read_bytes(p_j2k->m_specific_param.m_decoder.m_header_data, - &l_current_marker, 2); - } - } - if (opj_stream_get_number_byte_left(p_stream) == 0 - && p_j2k->m_specific_param.m_decoder.m_state == J2K_STATE_NEOC) { - 
break; - } - - /* If we didn't skip data before, we need to read the SOD marker*/ - if (! p_j2k->m_specific_param.m_decoder.m_skip_data) { - /* Try to read the SOD marker and skip data ? FIXME */ - if (! opj_j2k_read_sod(p_j2k, p_stream, p_manager)) { - return OPJ_FALSE; - } - if (p_j2k->m_specific_param.m_decoder.m_can_decode && - !p_j2k->m_specific_param.m_decoder.m_nb_tile_parts_correction_checked) { - /* Issue 254 */ - OPJ_BOOL l_correction_needed; - - p_j2k->m_specific_param.m_decoder.m_nb_tile_parts_correction_checked = 1; - if (!opj_j2k_need_nb_tile_parts_correction(p_stream, - p_j2k->m_current_tile_number, &l_correction_needed, p_manager)) { - opj_event_msg(p_manager, EVT_ERROR, - "opj_j2k_apply_nb_tile_parts_correction error\n"); - return OPJ_FALSE; - } - if (l_correction_needed) { - OPJ_UINT32 l_nb_tiles = p_j2k->m_cp.tw * p_j2k->m_cp.th; - OPJ_UINT32 l_tile_no; - - p_j2k->m_specific_param.m_decoder.m_can_decode = 0; - p_j2k->m_specific_param.m_decoder.m_nb_tile_parts_correction = 1; - /* correct tiles */ - for (l_tile_no = 0U; l_tile_no < l_nb_tiles; ++l_tile_no) { - if (p_j2k->m_cp.tcps[l_tile_no].m_nb_tile_parts != 0U) { - p_j2k->m_cp.tcps[l_tile_no].m_nb_tile_parts += 1; - } - } - opj_event_msg(p_manager, EVT_WARNING, - "Non conformant codestream TPsot==TNsot.\n"); - } - } - if (! p_j2k->m_specific_param.m_decoder.m_can_decode) { - /* Try to read 2 bytes (the next marker ID) from stream and copy them into the buffer */ - if (opj_stream_read_data(p_stream, - p_j2k->m_specific_param.m_decoder.m_header_data, 2, p_manager) != 2) { - opj_event_msg(p_manager, EVT_ERROR, "Stream too short\n"); - return OPJ_FALSE; - } - - /* Read 2 bytes from buffer as the new marker ID */ - opj_read_bytes(p_j2k->m_specific_param.m_decoder.m_header_data, - &l_current_marker, 2); - } - } else { - /* Indicate we will try to read a new tile-part header*/ - p_j2k->m_specific_param.m_decoder.m_skip_data = 0; - p_j2k->m_specific_param.m_decoder.m_can_decode = 0; - p_j2k->m_specific_param.m_decoder.m_state = J2K_STATE_TPHSOT; - - /* Try to read 2 bytes (the next marker ID) from stream and copy them into the buffer */ - if (opj_stream_read_data(p_stream, - p_j2k->m_specific_param.m_decoder.m_header_data, 2, p_manager) != 2) { - opj_event_msg(p_manager, EVT_ERROR, "Stream too short\n"); - return OPJ_FALSE; - } - - /* Read 2 bytes from buffer as the new marker ID */ - opj_read_bytes(p_j2k->m_specific_param.m_decoder.m_header_data, - &l_current_marker, 2); - } - } - - /* Current marker is the EOC marker ?*/ - if (l_current_marker == J2K_MS_EOC) { - p_j2k->m_specific_param.m_decoder.m_state = J2K_STATE_EOC; - } - - /* FIXME DOC ???*/ - if (! p_j2k->m_specific_param.m_decoder.m_can_decode) { - OPJ_UINT32 l_nb_tiles = p_j2k->m_cp.th * p_j2k->m_cp.tw; - l_tcp = p_j2k->m_cp.tcps + p_j2k->m_current_tile_number; - - while ((p_j2k->m_current_tile_number < l_nb_tiles) && (l_tcp->m_data == 00)) { - ++p_j2k->m_current_tile_number; - ++l_tcp; - } - - if (p_j2k->m_current_tile_number == l_nb_tiles) { - *p_go_on = OPJ_FALSE; - return OPJ_TRUE; - } - } - - if (! opj_j2k_merge_ppt(p_j2k->m_cp.tcps + p_j2k->m_current_tile_number, - p_manager)) { - opj_event_msg(p_manager, EVT_ERROR, "Failed to merge PPT data\n"); - return OPJ_FALSE; - } - /*FIXME ???*/ - if (! 
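/*
 * [sketch] The issue-254 workaround above: a conformant stream numbers
 * tile-parts 0..TNsot-1, but some writers emit TPsot == TNsot for the last
 * part; when the look-ahead detects that, every tile's expected part count
 * is bumped by one. The detection condition in isolation (hypothetical
 * helper, mirroring the l_current_part == l_num_parts test):
 */
static int tpsot_correction_needed(unsigned int tpsot, unsigned int tnsot)
{
    return tnsot != 0 && tpsot == tnsot;
}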
opj_tcd_init_decode_tile(p_j2k->m_tcd, p_j2k->m_current_tile_number, - p_manager)) { - opj_event_msg(p_manager, EVT_ERROR, "Cannot decode tile, memory error\n"); - return OPJ_FALSE; - } - - opj_event_msg(p_manager, EVT_INFO, "Header of tile %d / %d has been read.\n", - p_j2k->m_current_tile_number + 1, (p_j2k->m_cp.th * p_j2k->m_cp.tw)); - - *p_tile_index = p_j2k->m_current_tile_number; - *p_go_on = OPJ_TRUE; - if (p_data_size) { - /* For internal use in j2k.c, we don't need this */ - /* This is just needed for folks using the opj_read_tile_header() / opj_decode_tile_data() combo */ - *p_data_size = opj_tcd_get_decoded_tile_size(p_j2k->m_tcd, OPJ_FALSE); - if (*p_data_size == UINT_MAX) { - return OPJ_FALSE; - } - } - *p_tile_x0 = p_j2k->m_tcd->tcd_image->tiles->x0; - *p_tile_y0 = p_j2k->m_tcd->tcd_image->tiles->y0; - *p_tile_x1 = p_j2k->m_tcd->tcd_image->tiles->x1; - *p_tile_y1 = p_j2k->m_tcd->tcd_image->tiles->y1; - *p_nb_comps = p_j2k->m_tcd->tcd_image->tiles->numcomps; - - p_j2k->m_specific_param.m_decoder.m_state |= J2K_STATE_DATA; - - return OPJ_TRUE; -} - -OPJ_BOOL opj_j2k_decode_tile(opj_j2k_t * p_j2k, - OPJ_UINT32 p_tile_index, - OPJ_BYTE * p_data, - OPJ_UINT32 p_data_size, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager) -{ - OPJ_UINT32 l_current_marker; - OPJ_BYTE l_data [2]; - opj_tcp_t * l_tcp; - opj_image_t* l_image_for_bounds; - - /* preconditions */ - assert(p_stream != 00); - assert(p_j2k != 00); - assert(p_manager != 00); - - if (!(p_j2k->m_specific_param.m_decoder.m_state & J2K_STATE_DATA) - || (p_tile_index != p_j2k->m_current_tile_number)) { - return OPJ_FALSE; - } - - l_tcp = &(p_j2k->m_cp.tcps[p_tile_index]); - if (! l_tcp->m_data) { - opj_j2k_tcp_destroy(l_tcp); - return OPJ_FALSE; - } - - /* When using the opj_read_tile_header / opj_decode_tile_data API */ - /* such as in test_tile_decoder, m_output_image is NULL, so fall back */ - /* to the full image dimension. This is a bit surprising that */ - /* opj_set_decode_area() is only used to determinte intersecting tiles, */ - /* but full tile decoding is done */ - l_image_for_bounds = p_j2k->m_output_image ? p_j2k->m_output_image : - p_j2k->m_private_image; - if (! opj_tcd_decode_tile(p_j2k->m_tcd, - l_image_for_bounds->x0, - l_image_for_bounds->y0, - l_image_for_bounds->x1, - l_image_for_bounds->y1, - p_j2k->m_specific_param.m_decoder.m_numcomps_to_decode, - p_j2k->m_specific_param.m_decoder.m_comps_indices_to_decode, - l_tcp->m_data, - l_tcp->m_data_size, - p_tile_index, - p_j2k->cstr_index, p_manager)) { - opj_j2k_tcp_destroy(l_tcp); - p_j2k->m_specific_param.m_decoder.m_state |= J2K_STATE_ERR; - opj_event_msg(p_manager, EVT_ERROR, "Failed to decode.\n"); - return OPJ_FALSE; - } - - /* p_data can be set to NULL when the call will take care of using */ - /* itself the TCD data. This is typically the case for whole single */ - /* tile decoding optimization. */ - if (p_data != NULL) { - if (! 
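/*
 * [sketch] The comment above names the public pairing this code serves:
 * opj_read_tile_header() / opj_decode_tile_data(). A hedged usage loop
 * against the public API; codec and stream are assumed already created,
 * configured and past opj_read_header(), and real code would consume the
 * decoded samples where the final free() sits.
 */
#include <openjpeg.h>
#include <stdlib.h>

static OPJ_BOOL drain_tiles(opj_codec_t *codec, opj_stream_t *stream)
{
    OPJ_BOOL go_on = OPJ_TRUE;
    while (go_on) {
        OPJ_UINT32 tile_index, data_size, nb_comps;
        OPJ_INT32 x0, y0, x1, y1;
        OPJ_BYTE *data;

        if (!opj_read_tile_header(codec, stream, &tile_index, &data_size,
                                  &x0, &y0, &x1, &y1, &nb_comps, &go_on)) {
            return OPJ_FALSE;
        }
        if (!go_on) {
            break;                          /* all tiles consumed */
        }
        data = (OPJ_BYTE *)malloc(data_size);
        if (data == NULL) {
            return OPJ_FALSE;
        }
        if (!opj_decode_tile_data(codec, tile_index, data, data_size,
                                  stream)) {
            free(data);
            return OPJ_FALSE;
        }
        free(data);
    }
    return OPJ_TRUE;
}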
opj_tcd_update_tile_data(p_j2k->m_tcd, p_data, p_data_size)) { - return OPJ_FALSE; - } - - /* To avoid to destroy the tcp which can be useful when we try to decode a tile decoded before (cf j2k_random_tile_access) - * we destroy just the data which will be re-read in read_tile_header*/ - /*opj_j2k_tcp_destroy(l_tcp); - p_j2k->m_tcd->tcp = 0;*/ - opj_j2k_tcp_data_destroy(l_tcp); - } - - p_j2k->m_specific_param.m_decoder.m_can_decode = 0; - p_j2k->m_specific_param.m_decoder.m_state &= (~(OPJ_UINT32)J2K_STATE_DATA); - - if (opj_stream_get_number_byte_left(p_stream) == 0 - && p_j2k->m_specific_param.m_decoder.m_state == J2K_STATE_NEOC) { - return OPJ_TRUE; - } - - if (p_j2k->m_specific_param.m_decoder.m_state != J2K_STATE_EOC) { - if (opj_stream_read_data(p_stream, l_data, 2, p_manager) != 2) { - opj_event_msg(p_manager, EVT_ERROR, "Stream too short\n"); - return OPJ_FALSE; - } - - opj_read_bytes(l_data, &l_current_marker, 2); - - if (l_current_marker == J2K_MS_EOC) { - p_j2k->m_current_tile_number = 0; - p_j2k->m_specific_param.m_decoder.m_state = J2K_STATE_EOC; - } else if (l_current_marker != J2K_MS_SOT) { - if (opj_stream_get_number_byte_left(p_stream) == 0) { - p_j2k->m_specific_param.m_decoder.m_state = J2K_STATE_NEOC; - opj_event_msg(p_manager, EVT_WARNING, "Stream does not end with EOC\n"); - return OPJ_TRUE; - } - opj_event_msg(p_manager, EVT_ERROR, "Stream too short, expected SOT\n"); - return OPJ_FALSE; - } - } - - return OPJ_TRUE; -} - -static OPJ_BOOL opj_j2k_update_image_data(opj_tcd_t * p_tcd, - opj_image_t* p_output_image) -{ - OPJ_UINT32 i, j; - OPJ_UINT32 l_width_src, l_height_src; - OPJ_UINT32 l_width_dest, l_height_dest; - OPJ_INT32 l_offset_x0_src, l_offset_y0_src, l_offset_x1_src, l_offset_y1_src; - OPJ_SIZE_T l_start_offset_src; - OPJ_UINT32 l_start_x_dest, l_start_y_dest; - OPJ_UINT32 l_x0_dest, l_y0_dest, l_x1_dest, l_y1_dest; - OPJ_SIZE_T l_start_offset_dest; - - opj_image_comp_t * l_img_comp_src = 00; - opj_image_comp_t * l_img_comp_dest = 00; - - opj_tcd_tilecomp_t * l_tilec = 00; - opj_image_t * l_image_src = 00; - OPJ_INT32 * l_dest_ptr; - - l_tilec = p_tcd->tcd_image->tiles->comps; - l_image_src = p_tcd->image; - l_img_comp_src = l_image_src->comps; - - l_img_comp_dest = p_output_image->comps; - - for (i = 0; i < l_image_src->numcomps; - i++, ++l_img_comp_dest, ++l_img_comp_src, ++l_tilec) { - OPJ_INT32 res_x0, res_x1, res_y0, res_y1; - OPJ_UINT32 src_data_stride; - const OPJ_INT32* p_src_data; - - /* Copy info from decoded comp image to output image */ - l_img_comp_dest->resno_decoded = l_img_comp_src->resno_decoded; - - if (p_tcd->whole_tile_decoding) { - opj_tcd_resolution_t* l_res = l_tilec->resolutions + - l_img_comp_src->resno_decoded; - res_x0 = l_res->x0; - res_y0 = l_res->y0; - res_x1 = l_res->x1; - res_y1 = l_res->y1; - src_data_stride = (OPJ_UINT32)( - l_tilec->resolutions[l_tilec->minimum_num_resolutions - 1].x1 - - l_tilec->resolutions[l_tilec->minimum_num_resolutions - 1].x0); - p_src_data = l_tilec->data; - } else { - opj_tcd_resolution_t* l_res = l_tilec->resolutions + - l_img_comp_src->resno_decoded; - res_x0 = (OPJ_INT32)l_res->win_x0; - res_y0 = (OPJ_INT32)l_res->win_y0; - res_x1 = (OPJ_INT32)l_res->win_x1; - res_y1 = (OPJ_INT32)l_res->win_y1; - src_data_stride = l_res->win_x1 - l_res->win_x0; - p_src_data = l_tilec->data_win; - } - - if (p_src_data == NULL) { - /* Happens for partial component decoding */ - continue; - } - - l_width_src = (OPJ_UINT32)(res_x1 - res_x0); - l_height_src = (OPJ_UINT32)(res_y1 - res_y0); - - - /* Current tile 
component size*/ - /*if (i == 0) { - fprintf(stdout, "SRC: l_res_x0=%d, l_res_x1=%d, l_res_y0=%d, l_res_y1=%d\n", - res_x0, res_x1, res_y0, res_y1); - }*/ - - - /* Border of the current output component*/ - l_x0_dest = opj_uint_ceildivpow2(l_img_comp_dest->x0, l_img_comp_dest->factor); - l_y0_dest = opj_uint_ceildivpow2(l_img_comp_dest->y0, l_img_comp_dest->factor); - l_x1_dest = l_x0_dest + - l_img_comp_dest->w; /* can't overflow given that image->x1 is uint32 */ - l_y1_dest = l_y0_dest + l_img_comp_dest->h; - - /*if (i == 0) { - fprintf(stdout, "DEST: l_x0_dest=%d, l_x1_dest=%d, l_y0_dest=%d, l_y1_dest=%d (%d)\n", - l_x0_dest, l_x1_dest, l_y0_dest, l_y1_dest, l_img_comp_dest->factor ); - }*/ - - /*-----*/ - /* Compute the area (l_offset_x0_src, l_offset_y0_src, l_offset_x1_src, l_offset_y1_src) - * of the input buffer (decoded tile component) which will be move - * in the output buffer. Compute the area of the output buffer (l_start_x_dest, - * l_start_y_dest, l_width_dest, l_height_dest) which will be modified - * by this input area. - * */ - assert(res_x0 >= 0); - assert(res_x1 >= 0); - if (l_x0_dest < (OPJ_UINT32)res_x0) { - l_start_x_dest = (OPJ_UINT32)res_x0 - l_x0_dest; - l_offset_x0_src = 0; - - if (l_x1_dest >= (OPJ_UINT32)res_x1) { - l_width_dest = l_width_src; - l_offset_x1_src = 0; - } else { - l_width_dest = l_x1_dest - (OPJ_UINT32)res_x0 ; - l_offset_x1_src = (OPJ_INT32)(l_width_src - l_width_dest); - } - } else { - l_start_x_dest = 0U; - l_offset_x0_src = (OPJ_INT32)l_x0_dest - res_x0; - - if (l_x1_dest >= (OPJ_UINT32)res_x1) { - l_width_dest = l_width_src - (OPJ_UINT32)l_offset_x0_src; - l_offset_x1_src = 0; - } else { - l_width_dest = l_img_comp_dest->w ; - l_offset_x1_src = res_x1 - (OPJ_INT32)l_x1_dest; - } - } - - if (l_y0_dest < (OPJ_UINT32)res_y0) { - l_start_y_dest = (OPJ_UINT32)res_y0 - l_y0_dest; - l_offset_y0_src = 0; - - if (l_y1_dest >= (OPJ_UINT32)res_y1) { - l_height_dest = l_height_src; - l_offset_y1_src = 0; - } else { - l_height_dest = l_y1_dest - (OPJ_UINT32)res_y0 ; - l_offset_y1_src = (OPJ_INT32)(l_height_src - l_height_dest); - } - } else { - l_start_y_dest = 0U; - l_offset_y0_src = (OPJ_INT32)l_y0_dest - res_y0; - - if (l_y1_dest >= (OPJ_UINT32)res_y1) { - l_height_dest = l_height_src - (OPJ_UINT32)l_offset_y0_src; - l_offset_y1_src = 0; - } else { - l_height_dest = l_img_comp_dest->h ; - l_offset_y1_src = res_y1 - (OPJ_INT32)l_y1_dest; - } - } - - if ((l_offset_x0_src < 0) || (l_offset_y0_src < 0) || (l_offset_x1_src < 0) || - (l_offset_y1_src < 0)) { - return OPJ_FALSE; - } - /* testcase 2977.pdf.asan.67.2198 */ - if ((OPJ_INT32)l_width_dest < 0 || (OPJ_INT32)l_height_dest < 0) { - return OPJ_FALSE; - } - /*-----*/ - - /* Compute the input buffer offset */ - l_start_offset_src = (OPJ_SIZE_T)l_offset_x0_src + (OPJ_SIZE_T)l_offset_y0_src - * (OPJ_SIZE_T)src_data_stride; - - /* Compute the output buffer offset */ - l_start_offset_dest = (OPJ_SIZE_T)l_start_x_dest + (OPJ_SIZE_T)l_start_y_dest - * (OPJ_SIZE_T)l_img_comp_dest->w; - - /* Allocate output component buffer if necessary */ - if (l_img_comp_dest->data == NULL && - l_start_offset_src == 0 && l_start_offset_dest == 0 && - src_data_stride == l_img_comp_dest->w && - l_width_dest == l_img_comp_dest->w && - l_height_dest == l_img_comp_dest->h) { - /* If the final image matches the tile buffer, then borrow it */ - /* directly to save a copy */ - if (p_tcd->whole_tile_decoding) { - l_img_comp_dest->data = l_tilec->data; - l_tilec->data = NULL; - } else { - l_img_comp_dest->data = l_tilec->data_win; - 
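/*
 * [sketch] The four-branch window arithmetic above is 1-D interval
 * clipping applied once per axis: intersect the destination interval with
 * the source, recording where the write starts in destination coordinates
 * and how much of the source row to skip. One axis, condensed (clip_axis
 * is an invented name; the original keeps the branches explicit):
 */
#include <stdint.h>

static void clip_axis(int32_t s0, int32_t s1,   /* source interval */
                      int32_t d0, int32_t d1,   /* dest interval   */
                      int32_t *dst_start, int32_t *width,
                      int32_t *src_skip)
{
    int32_t lo = (d0 > s0) ? d0 : s0;           /* max of the starts */
    int32_t hi = (d1 < s1) ? d1 : s1;           /* min of the ends   */
    *dst_start = lo - d0;
    *src_skip  = lo - s0;
    *width     = (hi > lo) ? (hi - lo) : 0;     /* empty if disjoint */
}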
l_tilec->data_win = NULL; - } - continue; - } else if (l_img_comp_dest->data == NULL) { - OPJ_SIZE_T l_width = l_img_comp_dest->w; - OPJ_SIZE_T l_height = l_img_comp_dest->h; - - if ((l_height == 0U) || (l_width > (SIZE_MAX / l_height)) || - l_width * l_height > SIZE_MAX / sizeof(OPJ_INT32)) { - /* would overflow */ - return OPJ_FALSE; - } - l_img_comp_dest->data = (OPJ_INT32*) opj_image_data_alloc(l_width * l_height * - sizeof(OPJ_INT32)); - if (! l_img_comp_dest->data) { - return OPJ_FALSE; - } - - if (l_img_comp_dest->w != l_width_dest || - l_img_comp_dest->h != l_height_dest) { - memset(l_img_comp_dest->data, 0, - (OPJ_SIZE_T)l_img_comp_dest->w * l_img_comp_dest->h * sizeof(OPJ_INT32)); - } - } - - /* Move the output buffer to the first place where we will write*/ - l_dest_ptr = l_img_comp_dest->data + l_start_offset_dest; - - { - const OPJ_INT32 * l_src_ptr = p_src_data; - l_src_ptr += l_start_offset_src; - - for (j = 0; j < l_height_dest; ++j) { - memcpy(l_dest_ptr, l_src_ptr, l_width_dest * sizeof(OPJ_INT32)); - l_dest_ptr += l_img_comp_dest->w; - l_src_ptr += src_data_stride; - } - } - - - } - - return OPJ_TRUE; -} - -static OPJ_BOOL opj_j2k_update_image_dimensions(opj_image_t* p_image, - opj_event_mgr_t * p_manager) -{ - OPJ_UINT32 it_comp; - OPJ_INT32 l_comp_x1, l_comp_y1; - opj_image_comp_t* l_img_comp = NULL; - - l_img_comp = p_image->comps; - for (it_comp = 0; it_comp < p_image->numcomps; ++it_comp) { - OPJ_INT32 l_h, l_w; - - l_img_comp->x0 = (OPJ_UINT32)opj_int_ceildiv((OPJ_INT32)p_image->x0, - (OPJ_INT32)l_img_comp->dx); - l_img_comp->y0 = (OPJ_UINT32)opj_int_ceildiv((OPJ_INT32)p_image->y0, - (OPJ_INT32)l_img_comp->dy); - l_comp_x1 = opj_int_ceildiv((OPJ_INT32)p_image->x1, (OPJ_INT32)l_img_comp->dx); - l_comp_y1 = opj_int_ceildiv((OPJ_INT32)p_image->y1, (OPJ_INT32)l_img_comp->dy); - - l_w = opj_int_ceildivpow2(l_comp_x1, (OPJ_INT32)l_img_comp->factor) - - opj_int_ceildivpow2((OPJ_INT32)l_img_comp->x0, (OPJ_INT32)l_img_comp->factor); - if (l_w < 0) { - opj_event_msg(p_manager, EVT_ERROR, - "Size x of the decoded component image is incorrect (comp[%d].w=%d).\n", - it_comp, l_w); - return OPJ_FALSE; - } - l_img_comp->w = (OPJ_UINT32)l_w; - - l_h = opj_int_ceildivpow2(l_comp_y1, (OPJ_INT32)l_img_comp->factor) - - opj_int_ceildivpow2((OPJ_INT32)l_img_comp->y0, (OPJ_INT32)l_img_comp->factor); - if (l_h < 0) { - opj_event_msg(p_manager, EVT_ERROR, - "Size y of the decoded component image is incorrect (comp[%d].h=%d).\n", - it_comp, l_h); - return OPJ_FALSE; - } - l_img_comp->h = (OPJ_UINT32)l_h; - - l_img_comp++; - } - - return OPJ_TRUE; -} - -OPJ_BOOL opj_j2k_set_decoded_components(opj_j2k_t *p_j2k, - OPJ_UINT32 numcomps, - const OPJ_UINT32* comps_indices, - opj_event_mgr_t * p_manager) -{ - OPJ_UINT32 i; - OPJ_BOOL* already_mapped; - - if (p_j2k->m_private_image == NULL) { - opj_event_msg(p_manager, EVT_ERROR, - "opj_read_header() should be called before " - "opj_set_decoded_components().\n"); - return OPJ_FALSE; - } - - already_mapped = (OPJ_BOOL*) opj_calloc(sizeof(OPJ_BOOL), - p_j2k->m_private_image->numcomps); - if (already_mapped == NULL) { - return OPJ_FALSE; - } - - for (i = 0; i < numcomps; i++) { - if (comps_indices[i] >= p_j2k->m_private_image->numcomps) { - opj_event_msg(p_manager, EVT_ERROR, - "Invalid component index: %u\n", - comps_indices[i]); - opj_free(already_mapped); - return OPJ_FALSE; - } - if (already_mapped[comps_indices[i]]) { - opj_event_msg(p_manager, EVT_ERROR, - "Component index %u used several times\n", - comps_indices[i]); - 
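/*
 * [sketch] Two defensive details from the allocation above, isolated: the
 * w*h*sizeof(OPJ_INT32) product is proven to fit in size_t before malloc,
 * and component dimensions come from ceiling division by the subsampling
 * step. Standalone versions (names invented; ceildiv_u32 assumes
 * a + b - 1 does not wrap, which the library guards elsewhere):
 */
#include <stdint.h>
#include <stdlib.h>

static uint32_t ceildiv_u32(uint32_t a, uint32_t b)
{
    return (a + b - 1) / b;
}

static int32_t *alloc_samples(size_t w, size_t h)
{
    if (h == 0 || w > SIZE_MAX / h ||
        w * h > SIZE_MAX / sizeof(int32_t)) {
        return NULL;                     /* product would overflow */
    }
    return (int32_t *)malloc(w * h * sizeof(int32_t));
}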
-            opj_free(already_mapped);
-            return OPJ_FALSE;
-        }
-        already_mapped[comps_indices[i]] = OPJ_TRUE;
-    }
-    opj_free(already_mapped);
-
-    opj_free(p_j2k->m_specific_param.m_decoder.m_comps_indices_to_decode);
-    if (numcomps) {
-        p_j2k->m_specific_param.m_decoder.m_comps_indices_to_decode =
-            (OPJ_UINT32*) opj_malloc(numcomps * sizeof(OPJ_UINT32));
-        if (p_j2k->m_specific_param.m_decoder.m_comps_indices_to_decode == NULL) {
-            p_j2k->m_specific_param.m_decoder.m_numcomps_to_decode = 0;
-            return OPJ_FALSE;
-        }
-        memcpy(p_j2k->m_specific_param.m_decoder.m_comps_indices_to_decode,
-               comps_indices,
-               numcomps * sizeof(OPJ_UINT32));
-    } else {
-        p_j2k->m_specific_param.m_decoder.m_comps_indices_to_decode = NULL;
-    }
-    p_j2k->m_specific_param.m_decoder.m_numcomps_to_decode = numcomps;
-
-    return OPJ_TRUE;
-}
-
-
-OPJ_BOOL opj_j2k_set_decode_area(opj_j2k_t *p_j2k,
-                                 opj_image_t* p_image,
-                                 OPJ_INT32 p_start_x, OPJ_INT32 p_start_y,
-                                 OPJ_INT32 p_end_x, OPJ_INT32 p_end_y,
-                                 opj_event_mgr_t * p_manager)
-{
-    opj_cp_t * l_cp = &(p_j2k->m_cp);
-    opj_image_t * l_image = p_j2k->m_private_image;
-    OPJ_BOOL ret;
-    OPJ_UINT32 it_comp;
-
-    if (p_j2k->m_cp.tw == 1 && p_j2k->m_cp.th == 1 &&
-            p_j2k->m_cp.tcps[0].m_data != NULL) {
-        /* In the case of a single-tiled image whose codestream we have already */
-        /* ingested, go on */
-    }
-    /* Check whether the main header has been read */
-    else if (p_j2k->m_specific_param.m_decoder.m_state != J2K_STATE_TPHSOT) {
-        opj_event_msg(p_manager, EVT_ERROR,
-                      "Need to decode the main header before beginning to decode the remaining codestream.\n");
-        return OPJ_FALSE;
-    }
-
-    /* Update the comps[].factor member of the output image with the one */
-    /* of m_reduce */
-    for (it_comp = 0; it_comp < p_image->numcomps; ++it_comp) {
-        p_image->comps[it_comp].factor = p_j2k->m_cp.m_specific_param.m_dec.m_reduce;
-    }
-
-    if (!p_start_x && !p_start_y && !p_end_x && !p_end_y) {
-        opj_event_msg(p_manager, EVT_INFO,
-                      "No decoded area parameters, set the decoded area to the whole image\n");
-
-        p_j2k->m_specific_param.m_decoder.m_start_tile_x = 0;
-        p_j2k->m_specific_param.m_decoder.m_start_tile_y = 0;
-        p_j2k->m_specific_param.m_decoder.m_end_tile_x = l_cp->tw;
-        p_j2k->m_specific_param.m_decoder.m_end_tile_y = l_cp->th;
-
-        p_image->x0 = l_image->x0;
-        p_image->y0 = l_image->y0;
-        p_image->x1 = l_image->x1;
-        p_image->y1 = l_image->y1;
-
-        return opj_j2k_update_image_dimensions(p_image, p_manager);
-    }
-
-    /* ----- */
-    /* Check if the positions provided by the user are correct */
-
-    /* Left */
-    if (p_start_x < 0) {
-        opj_event_msg(p_manager, EVT_ERROR,
-                      "Left position of the decoded area (region_x0=%d) should be >= 0.\n",
-                      p_start_x);
-        return OPJ_FALSE;
-    } else if ((OPJ_UINT32)p_start_x > l_image->x1) {
-        opj_event_msg(p_manager, EVT_ERROR,
-                      "Left position of the decoded area (region_x0=%d) is outside the image area (Xsiz=%d).\n",
-                      p_start_x, l_image->x1);
-        return OPJ_FALSE;
-    } else if ((OPJ_UINT32)p_start_x < l_image->x0) {
-        opj_event_msg(p_manager, EVT_WARNING,
-                      "Left position of the decoded area (region_x0=%d) is outside the image area (XOsiz=%d).\n",
-                      p_start_x, l_image->x0);
-        p_j2k->m_specific_param.m_decoder.m_start_tile_x = 0;
-        p_image->x0 = l_image->x0;
-    } else {
-        p_j2k->m_specific_param.m_decoder.m_start_tile_x = ((OPJ_UINT32)p_start_x -
-                l_cp->tx0) / l_cp->tdx;
-        p_image->x0 = (OPJ_UINT32)p_start_x;
-    }
-
-    /* Up */
-    if (p_start_y < 0) {
-        opj_event_msg(p_manager, EVT_ERROR,
-                      "Up position of the decoded area (region_y0=%d) should be >= 0.\n",
-                      p_start_y);
-        return
OPJ_FALSE; - } else if ((OPJ_UINT32)p_start_y > l_image->y1) { - opj_event_msg(p_manager, EVT_ERROR, - "Up position of the decoded area (region_y0=%d) is outside the image area (Ysiz=%d).\n", - p_start_y, l_image->y1); - return OPJ_FALSE; - } else if ((OPJ_UINT32)p_start_y < l_image->y0) { - opj_event_msg(p_manager, EVT_WARNING, - "Up position of the decoded area (region_y0=%d) is outside the image area (YOsiz=%d).\n", - p_start_y, l_image->y0); - p_j2k->m_specific_param.m_decoder.m_start_tile_y = 0; - p_image->y0 = l_image->y0; - } else { - p_j2k->m_specific_param.m_decoder.m_start_tile_y = ((OPJ_UINT32)p_start_y - - l_cp->ty0) / l_cp->tdy; - p_image->y0 = (OPJ_UINT32)p_start_y; - } - - /* Right */ - if (p_end_x <= 0) { - opj_event_msg(p_manager, EVT_ERROR, - "Right position of the decoded area (region_x1=%d) should be > 0.\n", - p_end_x); - return OPJ_FALSE; - } else if ((OPJ_UINT32)p_end_x < l_image->x0) { - opj_event_msg(p_manager, EVT_ERROR, - "Right position of the decoded area (region_x1=%d) is outside the image area (XOsiz=%d).\n", - p_end_x, l_image->x0); - return OPJ_FALSE; - } else if ((OPJ_UINT32)p_end_x > l_image->x1) { - opj_event_msg(p_manager, EVT_WARNING, - "Right position of the decoded area (region_x1=%d) is outside the image area (Xsiz=%d).\n", - p_end_x, l_image->x1); - p_j2k->m_specific_param.m_decoder.m_end_tile_x = l_cp->tw; - p_image->x1 = l_image->x1; - } else { - p_j2k->m_specific_param.m_decoder.m_end_tile_x = (OPJ_UINT32)opj_int_ceildiv( - p_end_x - (OPJ_INT32)l_cp->tx0, (OPJ_INT32)l_cp->tdx); - p_image->x1 = (OPJ_UINT32)p_end_x; - } - - /* Bottom */ - if (p_end_y <= 0) { - opj_event_msg(p_manager, EVT_ERROR, - "Bottom position of the decoded area (region_y1=%d) should be > 0.\n", - p_end_y); - return OPJ_FALSE; - } else if ((OPJ_UINT32)p_end_y < l_image->y0) { - opj_event_msg(p_manager, EVT_ERROR, - "Bottom position of the decoded area (region_y1=%d) is outside the image area (YOsiz=%d).\n", - p_end_y, l_image->y0); - return OPJ_FALSE; - } - if ((OPJ_UINT32)p_end_y > l_image->y1) { - opj_event_msg(p_manager, EVT_WARNING, - "Bottom position of the decoded area (region_y1=%d) is outside the image area (Ysiz=%d).\n", - p_end_y, l_image->y1); - p_j2k->m_specific_param.m_decoder.m_end_tile_y = l_cp->th; - p_image->y1 = l_image->y1; - } else { - p_j2k->m_specific_param.m_decoder.m_end_tile_y = (OPJ_UINT32)opj_int_ceildiv( - p_end_y - (OPJ_INT32)l_cp->ty0, (OPJ_INT32)l_cp->tdy); - p_image->y1 = (OPJ_UINT32)p_end_y; - } - /* ----- */ - - p_j2k->m_specific_param.m_decoder.m_discard_tiles = 1; - - ret = opj_j2k_update_image_dimensions(p_image, p_manager); - - if (ret) { - opj_event_msg(p_manager, EVT_INFO, "Setting decoding area to %d,%d,%d,%d\n", - p_image->x0, p_image->y0, p_image->x1, p_image->y1); - } - - return ret; -} - -opj_j2k_t* opj_j2k_create_decompress(void) -{ - opj_j2k_t *l_j2k = (opj_j2k_t*) opj_calloc(1, sizeof(opj_j2k_t)); - if (!l_j2k) { - return 00; - } - - l_j2k->m_is_decoder = 1; - l_j2k->m_cp.m_is_decoder = 1; - /* in the absence of JP2 boxes, consider different bit depth / sign */ - /* per component is allowed */ - l_j2k->m_cp.allow_different_bit_depth_sign = 1; - -#ifdef OPJ_DISABLE_TPSOT_FIX - l_j2k->m_specific_param.m_decoder.m_nb_tile_parts_correction_checked = 1; -#endif - - l_j2k->m_specific_param.m_decoder.m_default_tcp = (opj_tcp_t*) opj_calloc(1, - sizeof(opj_tcp_t)); - if (!l_j2k->m_specific_param.m_decoder.m_default_tcp) { - opj_j2k_destroy(l_j2k); - return 00; - } - - l_j2k->m_specific_param.m_decoder.m_header_data = (OPJ_BYTE *) 
opj_calloc(1, - OPJ_J2K_DEFAULT_HEADER_SIZE); - if (! l_j2k->m_specific_param.m_decoder.m_header_data) { - opj_j2k_destroy(l_j2k); - return 00; - } - - l_j2k->m_specific_param.m_decoder.m_header_data_size = - OPJ_J2K_DEFAULT_HEADER_SIZE; - - l_j2k->m_specific_param.m_decoder.m_tile_ind_to_dec = -1 ; - - l_j2k->m_specific_param.m_decoder.m_last_sot_read_pos = 0 ; - - /* codestream index creation */ - l_j2k->cstr_index = opj_j2k_create_cstr_index(); - if (!l_j2k->cstr_index) { - opj_j2k_destroy(l_j2k); - return 00; - } - - /* validation list creation */ - l_j2k->m_validation_list = opj_procedure_list_create(); - if (! l_j2k->m_validation_list) { - opj_j2k_destroy(l_j2k); - return 00; - } - - /* execution list creation */ - l_j2k->m_procedure_list = opj_procedure_list_create(); - if (! l_j2k->m_procedure_list) { - opj_j2k_destroy(l_j2k); - return 00; - } - - l_j2k->m_tp = opj_thread_pool_create(opj_j2k_get_default_thread_count()); - if (!l_j2k->m_tp) { - l_j2k->m_tp = opj_thread_pool_create(0); - } - if (!l_j2k->m_tp) { - opj_j2k_destroy(l_j2k); - return NULL; - } - - return l_j2k; -} - -static opj_codestream_index_t* opj_j2k_create_cstr_index(void) -{ - opj_codestream_index_t* cstr_index = (opj_codestream_index_t*) - opj_calloc(1, sizeof(opj_codestream_index_t)); - if (!cstr_index) { - return NULL; - } - - cstr_index->maxmarknum = 100; - cstr_index->marknum = 0; - cstr_index->marker = (opj_marker_info_t*) - opj_calloc(cstr_index->maxmarknum, sizeof(opj_marker_info_t)); - if (!cstr_index-> marker) { - opj_free(cstr_index); - return NULL; - } - - cstr_index->tile_index = NULL; - - return cstr_index; -} - -static OPJ_UINT32 opj_j2k_get_SPCod_SPCoc_size(opj_j2k_t *p_j2k, - OPJ_UINT32 p_tile_no, - OPJ_UINT32 p_comp_no) -{ - opj_cp_t *l_cp = 00; - opj_tcp_t *l_tcp = 00; - opj_tccp_t *l_tccp = 00; - - /* preconditions */ - assert(p_j2k != 00); - - l_cp = &(p_j2k->m_cp); - l_tcp = &l_cp->tcps[p_tile_no]; - l_tccp = &l_tcp->tccps[p_comp_no]; - - /* preconditions again */ - assert(p_tile_no < (l_cp->tw * l_cp->th)); - assert(p_comp_no < p_j2k->m_private_image->numcomps); - - if (l_tccp->csty & J2K_CCP_CSTY_PRT) { - return 5 + l_tccp->numresolutions; - } else { - return 5; - } -} - -static OPJ_BOOL opj_j2k_compare_SPCod_SPCoc(opj_j2k_t *p_j2k, - OPJ_UINT32 p_tile_no, OPJ_UINT32 p_first_comp_no, OPJ_UINT32 p_second_comp_no) -{ - OPJ_UINT32 i; - opj_cp_t *l_cp = NULL; - opj_tcp_t *l_tcp = NULL; - opj_tccp_t *l_tccp0 = NULL; - opj_tccp_t *l_tccp1 = NULL; - - /* preconditions */ - assert(p_j2k != 00); - - l_cp = &(p_j2k->m_cp); - l_tcp = &l_cp->tcps[p_tile_no]; - l_tccp0 = &l_tcp->tccps[p_first_comp_no]; - l_tccp1 = &l_tcp->tccps[p_second_comp_no]; - - if (l_tccp0->numresolutions != l_tccp1->numresolutions) { - return OPJ_FALSE; - } - if (l_tccp0->cblkw != l_tccp1->cblkw) { - return OPJ_FALSE; - } - if (l_tccp0->cblkh != l_tccp1->cblkh) { - return OPJ_FALSE; - } - if (l_tccp0->cblksty != l_tccp1->cblksty) { - return OPJ_FALSE; - } - if (l_tccp0->qmfbid != l_tccp1->qmfbid) { - return OPJ_FALSE; - } - if ((l_tccp0->csty & J2K_CCP_CSTY_PRT) != (l_tccp1->csty & J2K_CCP_CSTY_PRT)) { - return OPJ_FALSE; - } - - for (i = 0U; i < l_tccp0->numresolutions; ++i) { - if (l_tccp0->prcw[i] != l_tccp1->prcw[i]) { - return OPJ_FALSE; - } - if (l_tccp0->prch[i] != l_tccp1->prch[i]) { - return OPJ_FALSE; - } - } - return OPJ_TRUE; -} - -static OPJ_BOOL opj_j2k_write_SPCod_SPCoc(opj_j2k_t *p_j2k, - OPJ_UINT32 p_tile_no, - OPJ_UINT32 p_comp_no, - OPJ_BYTE * p_data, - OPJ_UINT32 * p_header_size, - struct opj_event_mgr * 
p_manager) -{ - OPJ_UINT32 i; - opj_cp_t *l_cp = 00; - opj_tcp_t *l_tcp = 00; - opj_tccp_t *l_tccp = 00; - - /* preconditions */ - assert(p_j2k != 00); - assert(p_header_size != 00); - assert(p_manager != 00); - assert(p_data != 00); - - l_cp = &(p_j2k->m_cp); - l_tcp = &l_cp->tcps[p_tile_no]; - l_tccp = &l_tcp->tccps[p_comp_no]; - - /* preconditions again */ - assert(p_tile_no < (l_cp->tw * l_cp->th)); - assert(p_comp_no < (p_j2k->m_private_image->numcomps)); - - if (*p_header_size < 5) { - opj_event_msg(p_manager, EVT_ERROR, "Error writing SPCod SPCoc element\n"); - return OPJ_FALSE; - } - - opj_write_bytes(p_data, l_tccp->numresolutions - 1, 1); /* SPcoc (D) */ - ++p_data; - - opj_write_bytes(p_data, l_tccp->cblkw - 2, 1); /* SPcoc (E) */ - ++p_data; - - opj_write_bytes(p_data, l_tccp->cblkh - 2, 1); /* SPcoc (F) */ - ++p_data; - - opj_write_bytes(p_data, l_tccp->cblksty, - 1); /* SPcoc (G) */ - ++p_data; - - opj_write_bytes(p_data, l_tccp->qmfbid, - 1); /* SPcoc (H) */ - ++p_data; - - *p_header_size = *p_header_size - 5; - - if (l_tccp->csty & J2K_CCP_CSTY_PRT) { - - if (*p_header_size < l_tccp->numresolutions) { - opj_event_msg(p_manager, EVT_ERROR, "Error writing SPCod SPCoc element\n"); - return OPJ_FALSE; - } - - for (i = 0; i < l_tccp->numresolutions; ++i) { - opj_write_bytes(p_data, l_tccp->prcw[i] + (l_tccp->prch[i] << 4), - 1); /* SPcoc (I_i) */ - ++p_data; - } - - *p_header_size = *p_header_size - l_tccp->numresolutions; - } - - return OPJ_TRUE; -} - -static OPJ_BOOL opj_j2k_read_SPCod_SPCoc(opj_j2k_t *p_j2k, - OPJ_UINT32 compno, - OPJ_BYTE * p_header_data, - OPJ_UINT32 * p_header_size, - opj_event_mgr_t * p_manager) -{ - OPJ_UINT32 i, l_tmp; - opj_cp_t *l_cp = NULL; - opj_tcp_t *l_tcp = NULL; - opj_tccp_t *l_tccp = NULL; - OPJ_BYTE * l_current_ptr = NULL; - - /* preconditions */ - assert(p_j2k != 00); - assert(p_manager != 00); - assert(p_header_data != 00); - - l_cp = &(p_j2k->m_cp); - l_tcp = (p_j2k->m_specific_param.m_decoder.m_state == J2K_STATE_TPH) ? 
- &l_cp->tcps[p_j2k->m_current_tile_number] : - p_j2k->m_specific_param.m_decoder.m_default_tcp; - - /* precondition again */ - assert(compno < p_j2k->m_private_image->numcomps); - - l_tccp = &l_tcp->tccps[compno]; - l_current_ptr = p_header_data; - - /* make sure room is sufficient */ - if (*p_header_size < 5) { - opj_event_msg(p_manager, EVT_ERROR, "Error reading SPCod SPCoc element\n"); - return OPJ_FALSE; - } - - opj_read_bytes(l_current_ptr, &l_tccp->numresolutions, - 1); /* SPcox (D) */ - ++l_tccp->numresolutions; /* tccp->numresolutions = read() + 1 */ - if (l_tccp->numresolutions > OPJ_J2K_MAXRLVLS) { - opj_event_msg(p_manager, EVT_ERROR, - "Invalid value for numresolutions : %d, max value is set in openjpeg.h at %d\n", - l_tccp->numresolutions, OPJ_J2K_MAXRLVLS); - return OPJ_FALSE; - } - ++l_current_ptr; - - /* If user wants to remove more resolutions than the codestream contains, return error */ - if (l_cp->m_specific_param.m_dec.m_reduce >= l_tccp->numresolutions) { - opj_event_msg(p_manager, EVT_ERROR, - "Error decoding component %d.\nThe number of resolutions " - "to remove (%d) is greater or equal than the number " - "of resolutions of this component (%d)\nModify the cp_reduce parameter.\n\n", - compno, l_cp->m_specific_param.m_dec.m_reduce, l_tccp->numresolutions); - p_j2k->m_specific_param.m_decoder.m_state |= - 0x8000;/* FIXME J2K_DEC_STATE_ERR;*/ - return OPJ_FALSE; - } - - opj_read_bytes(l_current_ptr, &l_tccp->cblkw, 1); /* SPcoc (E) */ - ++l_current_ptr; - l_tccp->cblkw += 2; - - opj_read_bytes(l_current_ptr, &l_tccp->cblkh, 1); /* SPcoc (F) */ - ++l_current_ptr; - l_tccp->cblkh += 2; - - if ((l_tccp->cblkw > 10) || (l_tccp->cblkh > 10) || - ((l_tccp->cblkw + l_tccp->cblkh) > 12)) { - opj_event_msg(p_manager, EVT_ERROR, - "Error reading SPCod SPCoc element, Invalid cblkw/cblkh combination\n"); - return OPJ_FALSE; - } - - - opj_read_bytes(l_current_ptr, &l_tccp->cblksty, 1); /* SPcoc (G) */ - ++l_current_ptr; - if (l_tccp->cblksty & 0xC0U) { /* 2 msb are reserved, assume we can't read */ - opj_event_msg(p_manager, EVT_ERROR, - "Error reading SPCod SPCoc element, Invalid code-block style found\n"); - return OPJ_FALSE; - } - - opj_read_bytes(l_current_ptr, &l_tccp->qmfbid, 1); /* SPcoc (H) */ - ++l_current_ptr; - - *p_header_size = *p_header_size - 5; - - /* use custom precinct size ? 
*/ - if (l_tccp->csty & J2K_CCP_CSTY_PRT) { - if (*p_header_size < l_tccp->numresolutions) { - opj_event_msg(p_manager, EVT_ERROR, "Error reading SPCod SPCoc element\n"); - return OPJ_FALSE; - } - - for (i = 0; i < l_tccp->numresolutions; ++i) { - opj_read_bytes(l_current_ptr, &l_tmp, 1); /* SPcoc (I_i) */ - ++l_current_ptr; - /* Precinct exponent 0 is only allowed for lowest resolution level (Table A.21) */ - if ((i != 0) && (((l_tmp & 0xf) == 0) || ((l_tmp >> 4) == 0))) { - opj_event_msg(p_manager, EVT_ERROR, "Invalid precinct size\n"); - return OPJ_FALSE; - } - l_tccp->prcw[i] = l_tmp & 0xf; - l_tccp->prch[i] = l_tmp >> 4; - } - - *p_header_size = *p_header_size - l_tccp->numresolutions; - } else { - /* set default size for the precinct width and height */ - for (i = 0; i < l_tccp->numresolutions; ++i) { - l_tccp->prcw[i] = 15; - l_tccp->prch[i] = 15; - } - } - -#ifdef WIP_REMOVE_MSD - /* INDEX >> */ - if (p_j2k->cstr_info && compno == 0) { - OPJ_UINT32 l_data_size = l_tccp->numresolutions * sizeof(OPJ_UINT32); - - p_j2k->cstr_info->tile[p_j2k->m_current_tile_number].tccp_info[compno].cblkh = - l_tccp->cblkh; - p_j2k->cstr_info->tile[p_j2k->m_current_tile_number].tccp_info[compno].cblkw = - l_tccp->cblkw; - p_j2k->cstr_info->tile[p_j2k->m_current_tile_number].tccp_info[compno].numresolutions - = l_tccp->numresolutions; - p_j2k->cstr_info->tile[p_j2k->m_current_tile_number].tccp_info[compno].cblksty = - l_tccp->cblksty; - p_j2k->cstr_info->tile[p_j2k->m_current_tile_number].tccp_info[compno].qmfbid = - l_tccp->qmfbid; - - memcpy(p_j2k->cstr_info->tile[p_j2k->m_current_tile_number].pdx, l_tccp->prcw, - l_data_size); - memcpy(p_j2k->cstr_info->tile[p_j2k->m_current_tile_number].pdy, l_tccp->prch, - l_data_size); - } - /* << INDEX */ -#endif - - return OPJ_TRUE; -} - -static void opj_j2k_copy_tile_component_parameters(opj_j2k_t *p_j2k) -{ - /* loop */ - OPJ_UINT32 i; - opj_cp_t *l_cp = NULL; - opj_tcp_t *l_tcp = NULL; - opj_tccp_t *l_ref_tccp = NULL, *l_copied_tccp = NULL; - OPJ_UINT32 l_prc_size; - - /* preconditions */ - assert(p_j2k != 00); - - l_cp = &(p_j2k->m_cp); - l_tcp = (p_j2k->m_specific_param.m_decoder.m_state == J2K_STATE_TPH) - ? - &l_cp->tcps[p_j2k->m_current_tile_number] : - p_j2k->m_specific_param.m_decoder.m_default_tcp; - - l_ref_tccp = &l_tcp->tccps[0]; - l_copied_tccp = l_ref_tccp + 1; - l_prc_size = l_ref_tccp->numresolutions * (OPJ_UINT32)sizeof(OPJ_UINT32); - - for (i = 1; i < p_j2k->m_private_image->numcomps; ++i) { - l_copied_tccp->numresolutions = l_ref_tccp->numresolutions; - l_copied_tccp->cblkw = l_ref_tccp->cblkw; - l_copied_tccp->cblkh = l_ref_tccp->cblkh; - l_copied_tccp->cblksty = l_ref_tccp->cblksty; - l_copied_tccp->qmfbid = l_ref_tccp->qmfbid; - memcpy(l_copied_tccp->prcw, l_ref_tccp->prcw, l_prc_size); - memcpy(l_copied_tccp->prch, l_ref_tccp->prch, l_prc_size); - ++l_copied_tccp; - } -} - -static OPJ_UINT32 opj_j2k_get_SQcd_SQcc_size(opj_j2k_t *p_j2k, - OPJ_UINT32 p_tile_no, - OPJ_UINT32 p_comp_no) -{ - OPJ_UINT32 l_num_bands; - - opj_cp_t *l_cp = 00; - opj_tcp_t *l_tcp = 00; - opj_tccp_t *l_tccp = 00; - - /* preconditions */ - assert(p_j2k != 00); - - l_cp = &(p_j2k->m_cp); - l_tcp = &l_cp->tcps[p_tile_no]; - l_tccp = &l_tcp->tccps[p_comp_no]; - - /* preconditions again */ - assert(p_tile_no < l_cp->tw * l_cp->th); - assert(p_comp_no < p_j2k->m_private_image->numcomps); - - l_num_bands = (l_tccp->qntsty == J2K_CCP_QNTSTY_SIQNT) ? 
1 : - (l_tccp->numresolutions * 3 - 2); - - if (l_tccp->qntsty == J2K_CCP_QNTSTY_NOQNT) { - return 1 + l_num_bands; - } else { - return 1 + 2 * l_num_bands; - } -} - -static OPJ_BOOL opj_j2k_compare_SQcd_SQcc(opj_j2k_t *p_j2k, - OPJ_UINT32 p_tile_no, OPJ_UINT32 p_first_comp_no, OPJ_UINT32 p_second_comp_no) -{ - opj_cp_t *l_cp = NULL; - opj_tcp_t *l_tcp = NULL; - opj_tccp_t *l_tccp0 = NULL; - opj_tccp_t *l_tccp1 = NULL; - OPJ_UINT32 l_band_no, l_num_bands; - - /* preconditions */ - assert(p_j2k != 00); - - l_cp = &(p_j2k->m_cp); - l_tcp = &l_cp->tcps[p_tile_no]; - l_tccp0 = &l_tcp->tccps[p_first_comp_no]; - l_tccp1 = &l_tcp->tccps[p_second_comp_no]; - - if (l_tccp0->qntsty != l_tccp1->qntsty) { - return OPJ_FALSE; - } - if (l_tccp0->numgbits != l_tccp1->numgbits) { - return OPJ_FALSE; - } - if (l_tccp0->qntsty == J2K_CCP_QNTSTY_SIQNT) { - l_num_bands = 1U; - } else { - l_num_bands = l_tccp0->numresolutions * 3U - 2U; - if (l_num_bands != (l_tccp1->numresolutions * 3U - 2U)) { - return OPJ_FALSE; - } - } - - for (l_band_no = 0; l_band_no < l_num_bands; ++l_band_no) { - if (l_tccp0->stepsizes[l_band_no].expn != l_tccp1->stepsizes[l_band_no].expn) { - return OPJ_FALSE; - } - } - if (l_tccp0->qntsty != J2K_CCP_QNTSTY_NOQNT) { - for (l_band_no = 0; l_band_no < l_num_bands; ++l_band_no) { - if (l_tccp0->stepsizes[l_band_no].mant != l_tccp1->stepsizes[l_band_no].mant) { - return OPJ_FALSE; - } - } - } - return OPJ_TRUE; -} - - -static OPJ_BOOL opj_j2k_write_SQcd_SQcc(opj_j2k_t *p_j2k, - OPJ_UINT32 p_tile_no, - OPJ_UINT32 p_comp_no, - OPJ_BYTE * p_data, - OPJ_UINT32 * p_header_size, - struct opj_event_mgr * p_manager) -{ - OPJ_UINT32 l_header_size; - OPJ_UINT32 l_band_no, l_num_bands; - OPJ_UINT32 l_expn, l_mant; - - opj_cp_t *l_cp = 00; - opj_tcp_t *l_tcp = 00; - opj_tccp_t *l_tccp = 00; - - /* preconditions */ - assert(p_j2k != 00); - assert(p_header_size != 00); - assert(p_manager != 00); - assert(p_data != 00); - - l_cp = &(p_j2k->m_cp); - l_tcp = &l_cp->tcps[p_tile_no]; - l_tccp = &l_tcp->tccps[p_comp_no]; - - /* preconditions again */ - assert(p_tile_no < l_cp->tw * l_cp->th); - assert(p_comp_no < p_j2k->m_private_image->numcomps); - - l_num_bands = (l_tccp->qntsty == J2K_CCP_QNTSTY_SIQNT) ? 
1 : - (l_tccp->numresolutions * 3 - 2); - - if (l_tccp->qntsty == J2K_CCP_QNTSTY_NOQNT) { - l_header_size = 1 + l_num_bands; - - if (*p_header_size < l_header_size) { - opj_event_msg(p_manager, EVT_ERROR, "Error writing SQcd SQcc element\n"); - return OPJ_FALSE; - } - - opj_write_bytes(p_data, l_tccp->qntsty + (l_tccp->numgbits << 5), - 1); /* Sqcx */ - ++p_data; - - for (l_band_no = 0; l_band_no < l_num_bands; ++l_band_no) { - l_expn = (OPJ_UINT32)l_tccp->stepsizes[l_band_no].expn; - opj_write_bytes(p_data, l_expn << 3, 1); /* SPqcx_i */ - ++p_data; - } - } else { - l_header_size = 1 + 2 * l_num_bands; - - if (*p_header_size < l_header_size) { - opj_event_msg(p_manager, EVT_ERROR, "Error writing SQcd SQcc element\n"); - return OPJ_FALSE; - } - - opj_write_bytes(p_data, l_tccp->qntsty + (l_tccp->numgbits << 5), - 1); /* Sqcx */ - ++p_data; - - for (l_band_no = 0; l_band_no < l_num_bands; ++l_band_no) { - l_expn = (OPJ_UINT32)l_tccp->stepsizes[l_band_no].expn; - l_mant = (OPJ_UINT32)l_tccp->stepsizes[l_band_no].mant; - - opj_write_bytes(p_data, (l_expn << 11) + l_mant, 2); /* SPqcx_i */ - p_data += 2; - } - } - - *p_header_size = *p_header_size - l_header_size; - - return OPJ_TRUE; -} - -static OPJ_BOOL opj_j2k_read_SQcd_SQcc(opj_j2k_t *p_j2k, - OPJ_UINT32 p_comp_no, - OPJ_BYTE* p_header_data, - OPJ_UINT32 * p_header_size, - opj_event_mgr_t * p_manager - ) -{ - /* loop*/ - OPJ_UINT32 l_band_no; - opj_cp_t *l_cp = 00; - opj_tcp_t *l_tcp = 00; - opj_tccp_t *l_tccp = 00; - OPJ_BYTE * l_current_ptr = 00; - OPJ_UINT32 l_tmp, l_num_band; - - /* preconditions*/ - assert(p_j2k != 00); - assert(p_manager != 00); - assert(p_header_data != 00); - - l_cp = &(p_j2k->m_cp); - /* come from tile part header or main header ?*/ - l_tcp = (p_j2k->m_specific_param.m_decoder.m_state == J2K_STATE_TPH) - ? - &l_cp->tcps[p_j2k->m_current_tile_number] : - p_j2k->m_specific_param.m_decoder.m_default_tcp; - - /* precondition again*/ - assert(p_comp_no < p_j2k->m_private_image->numcomps); - - l_tccp = &l_tcp->tccps[p_comp_no]; - l_current_ptr = p_header_data; - - if (*p_header_size < 1) { - opj_event_msg(p_manager, EVT_ERROR, "Error reading SQcd or SQcc element\n"); - return OPJ_FALSE; - } - *p_header_size -= 1; - - opj_read_bytes(l_current_ptr, &l_tmp, 1); /* Sqcx */ - ++l_current_ptr; - - l_tccp->qntsty = l_tmp & 0x1f; - l_tccp->numgbits = l_tmp >> 5; - if (l_tccp->qntsty == J2K_CCP_QNTSTY_SIQNT) { - l_num_band = 1; - } else { - l_num_band = (l_tccp->qntsty == J2K_CCP_QNTSTY_NOQNT) ? - (*p_header_size) : - (*p_header_size) / 2; - - if (l_num_band > OPJ_J2K_MAXBANDS) { - opj_event_msg(p_manager, EVT_WARNING, - "While reading CCP_QNTSTY element inside QCD or QCC marker segment, " - "number of subbands (%d) is greater to OPJ_J2K_MAXBANDS (%d). So we limit the number of elements stored to " - "OPJ_J2K_MAXBANDS (%d) and skip the rest. \n", l_num_band, OPJ_J2K_MAXBANDS, - OPJ_J2K_MAXBANDS); - /*return OPJ_FALSE;*/ - } - } - -#ifdef USE_JPWL - if (l_cp->correct) { - - /* if JPWL is on, we check whether there are too many subbands */ - if (/*(l_num_band < 0) ||*/ (l_num_band >= OPJ_J2K_MAXBANDS)) { - opj_event_msg(p_manager, JPWL_ASSUME ? 
EVT_WARNING : EVT_ERROR, - "JPWL: bad number of subbands in Sqcx (%d)\n", - l_num_band); - if (!JPWL_ASSUME) { - opj_event_msg(p_manager, EVT_ERROR, "JPWL: giving up\n"); - return OPJ_FALSE; - } - /* we try to correct */ - l_num_band = 1; - opj_event_msg(p_manager, EVT_WARNING, "- trying to adjust them\n" - "- setting number of bands to %d => HYPOTHESIS!!!\n", - l_num_band); - }; - - }; -#endif /* USE_JPWL */ - - if (l_tccp->qntsty == J2K_CCP_QNTSTY_NOQNT) { - for (l_band_no = 0; l_band_no < l_num_band; l_band_no++) { - opj_read_bytes(l_current_ptr, &l_tmp, 1); /* SPqcx_i */ - ++l_current_ptr; - if (l_band_no < OPJ_J2K_MAXBANDS) { - l_tccp->stepsizes[l_band_no].expn = (OPJ_INT32)(l_tmp >> 3); - l_tccp->stepsizes[l_band_no].mant = 0; - } - } - *p_header_size = *p_header_size - l_num_band; - } else { - for (l_band_no = 0; l_band_no < l_num_band; l_band_no++) { - opj_read_bytes(l_current_ptr, &l_tmp, 2); /* SPqcx_i */ - l_current_ptr += 2; - if (l_band_no < OPJ_J2K_MAXBANDS) { - l_tccp->stepsizes[l_band_no].expn = (OPJ_INT32)(l_tmp >> 11); - l_tccp->stepsizes[l_band_no].mant = l_tmp & 0x7ff; - } - } - *p_header_size = *p_header_size - 2 * l_num_band; - } - - /* Add Antonin : if scalar_derived -> compute other stepsizes */ - if (l_tccp->qntsty == J2K_CCP_QNTSTY_SIQNT) { - for (l_band_no = 1; l_band_no < OPJ_J2K_MAXBANDS; l_band_no++) { - l_tccp->stepsizes[l_band_no].expn = - ((OPJ_INT32)(l_tccp->stepsizes[0].expn) - (OPJ_INT32)((l_band_no - 1) / 3) > 0) - ? - (OPJ_INT32)(l_tccp->stepsizes[0].expn) - (OPJ_INT32)((l_band_no - 1) / 3) : 0; - l_tccp->stepsizes[l_band_no].mant = l_tccp->stepsizes[0].mant; - } - } - - return OPJ_TRUE; -} - -static void opj_j2k_copy_tile_quantization_parameters(opj_j2k_t *p_j2k) -{ - OPJ_UINT32 i; - opj_cp_t *l_cp = NULL; - opj_tcp_t *l_tcp = NULL; - opj_tccp_t *l_ref_tccp = NULL; - opj_tccp_t *l_copied_tccp = NULL; - OPJ_UINT32 l_size; - - /* preconditions */ - assert(p_j2k != 00); - - l_cp = &(p_j2k->m_cp); - l_tcp = p_j2k->m_specific_param.m_decoder.m_state == J2K_STATE_TPH ? 
- &l_cp->tcps[p_j2k->m_current_tile_number] : - p_j2k->m_specific_param.m_decoder.m_default_tcp; - - l_ref_tccp = &l_tcp->tccps[0]; - l_copied_tccp = l_ref_tccp + 1; - l_size = OPJ_J2K_MAXBANDS * sizeof(opj_stepsize_t); - - for (i = 1; i < p_j2k->m_private_image->numcomps; ++i) { - l_copied_tccp->qntsty = l_ref_tccp->qntsty; - l_copied_tccp->numgbits = l_ref_tccp->numgbits; - memcpy(l_copied_tccp->stepsizes, l_ref_tccp->stepsizes, l_size); - ++l_copied_tccp; - } -} - -static void opj_j2k_dump_tile_info(opj_tcp_t * l_default_tile, - OPJ_INT32 numcomps, FILE* out_stream) -{ - if (l_default_tile) { - OPJ_INT32 compno; - - fprintf(out_stream, "\t default tile {\n"); - fprintf(out_stream, "\t\t csty=%#x\n", l_default_tile->csty); - fprintf(out_stream, "\t\t prg=%#x\n", l_default_tile->prg); - fprintf(out_stream, "\t\t numlayers=%d\n", l_default_tile->numlayers); - fprintf(out_stream, "\t\t mct=%x\n", l_default_tile->mct); - - for (compno = 0; compno < numcomps; compno++) { - opj_tccp_t *l_tccp = &(l_default_tile->tccps[compno]); - OPJ_UINT32 resno; - OPJ_INT32 bandno, numbands; - - /* coding style*/ - fprintf(out_stream, "\t\t comp %d {\n", compno); - fprintf(out_stream, "\t\t\t csty=%#x\n", l_tccp->csty); - fprintf(out_stream, "\t\t\t numresolutions=%d\n", l_tccp->numresolutions); - fprintf(out_stream, "\t\t\t cblkw=2^%d\n", l_tccp->cblkw); - fprintf(out_stream, "\t\t\t cblkh=2^%d\n", l_tccp->cblkh); - fprintf(out_stream, "\t\t\t cblksty=%#x\n", l_tccp->cblksty); - fprintf(out_stream, "\t\t\t qmfbid=%d\n", l_tccp->qmfbid); - - fprintf(out_stream, "\t\t\t preccintsize (w,h)="); - for (resno = 0; resno < l_tccp->numresolutions; resno++) { - fprintf(out_stream, "(%d,%d) ", l_tccp->prcw[resno], l_tccp->prch[resno]); - } - fprintf(out_stream, "\n"); - - /* quantization style*/ - fprintf(out_stream, "\t\t\t qntsty=%d\n", l_tccp->qntsty); - fprintf(out_stream, "\t\t\t numgbits=%d\n", l_tccp->numgbits); - fprintf(out_stream, "\t\t\t stepsizes (m,e)="); - numbands = (l_tccp->qntsty == J2K_CCP_QNTSTY_SIQNT) ? 
1 : - (OPJ_INT32)l_tccp->numresolutions * 3 - 2; - for (bandno = 0; bandno < numbands; bandno++) { - fprintf(out_stream, "(%d,%d) ", l_tccp->stepsizes[bandno].mant, - l_tccp->stepsizes[bandno].expn); - } - fprintf(out_stream, "\n"); - - /* RGN value*/ - fprintf(out_stream, "\t\t\t roishift=%d\n", l_tccp->roishift); - - fprintf(out_stream, "\t\t }\n"); - } /*end of component of default tile*/ - fprintf(out_stream, "\t }\n"); /*end of default tile*/ - } -} - -void j2k_dump(opj_j2k_t* p_j2k, OPJ_INT32 flag, FILE* out_stream) -{ - /* Check if the flag is compatible with j2k file*/ - if ((flag & OPJ_JP2_INFO) || (flag & OPJ_JP2_IND)) { - fprintf(out_stream, "Wrong flag\n"); - return; - } - - /* Dump the image_header */ - if (flag & OPJ_IMG_INFO) { - if (p_j2k->m_private_image) { - j2k_dump_image_header(p_j2k->m_private_image, 0, out_stream); - } - } - - /* Dump the codestream info from main header */ - if (flag & OPJ_J2K_MH_INFO) { - if (p_j2k->m_private_image) { - opj_j2k_dump_MH_info(p_j2k, out_stream); - } - } - /* Dump all tile/codestream info */ - if (flag & OPJ_J2K_TCH_INFO) { - OPJ_UINT32 l_nb_tiles = p_j2k->m_cp.th * p_j2k->m_cp.tw; - OPJ_UINT32 i; - opj_tcp_t * l_tcp = p_j2k->m_cp.tcps; - if (p_j2k->m_private_image) { - for (i = 0; i < l_nb_tiles; ++i) { - opj_j2k_dump_tile_info(l_tcp, (OPJ_INT32)p_j2k->m_private_image->numcomps, - out_stream); - ++l_tcp; - } - } - } - - /* Dump the codestream info of the current tile */ - if (flag & OPJ_J2K_TH_INFO) { - - } - - /* Dump the codestream index from main header */ - if (flag & OPJ_J2K_MH_IND) { - opj_j2k_dump_MH_index(p_j2k, out_stream); - } - - /* Dump the codestream index of the current tile */ - if (flag & OPJ_J2K_TH_IND) { - - } - -} - -static void opj_j2k_dump_MH_index(opj_j2k_t* p_j2k, FILE* out_stream) -{ - opj_codestream_index_t* cstr_index = p_j2k->cstr_index; - OPJ_UINT32 it_marker, it_tile, it_tile_part; - - fprintf(out_stream, "Codestream index from main header: {\n"); - - fprintf(out_stream, "\t Main header start position=%" PRIi64 "\n" - "\t Main header end position=%" PRIi64 "\n", - cstr_index->main_head_start, cstr_index->main_head_end); - - fprintf(out_stream, "\t Marker list: {\n"); - - if (cstr_index->marker) { - for (it_marker = 0; it_marker < cstr_index->marknum ; it_marker++) { - fprintf(out_stream, "\t\t type=%#x, pos=%" PRIi64 ", len=%d\n", - cstr_index->marker[it_marker].type, - cstr_index->marker[it_marker].pos, - cstr_index->marker[it_marker].len); - } - } - - fprintf(out_stream, "\t }\n"); - - if (cstr_index->tile_index) { - - /* Simple test to avoid to write empty information*/ - OPJ_UINT32 l_acc_nb_of_tile_part = 0; - for (it_tile = 0; it_tile < cstr_index->nb_of_tiles ; it_tile++) { - l_acc_nb_of_tile_part += cstr_index->tile_index[it_tile].nb_tps; - } - - if (l_acc_nb_of_tile_part) { - fprintf(out_stream, "\t Tile index: {\n"); - - for (it_tile = 0; it_tile < cstr_index->nb_of_tiles ; it_tile++) { - OPJ_UINT32 nb_of_tile_part = cstr_index->tile_index[it_tile].nb_tps; - - fprintf(out_stream, "\t\t nb of tile-part in tile [%d]=%d\n", it_tile, - nb_of_tile_part); - - if (cstr_index->tile_index[it_tile].tp_index) { - for (it_tile_part = 0; it_tile_part < nb_of_tile_part; it_tile_part++) { - fprintf(out_stream, "\t\t\t tile-part[%d]: star_pos=%" PRIi64 ", end_header=%" - PRIi64 ", end_pos=%" PRIi64 ".\n", - it_tile_part, - cstr_index->tile_index[it_tile].tp_index[it_tile_part].start_pos, - cstr_index->tile_index[it_tile].tp_index[it_tile_part].end_header, - 
cstr_index->tile_index[it_tile].tp_index[it_tile_part].end_pos); - } - } - - if (cstr_index->tile_index[it_tile].marker) { - for (it_marker = 0; it_marker < cstr_index->tile_index[it_tile].marknum ; - it_marker++) { - fprintf(out_stream, "\t\t type=%#x, pos=%" PRIi64 ", len=%d\n", - cstr_index->tile_index[it_tile].marker[it_marker].type, - cstr_index->tile_index[it_tile].marker[it_marker].pos, - cstr_index->tile_index[it_tile].marker[it_marker].len); - } - } - } - fprintf(out_stream, "\t }\n"); - } - } - - fprintf(out_stream, "}\n"); - -} - - -static void opj_j2k_dump_MH_info(opj_j2k_t* p_j2k, FILE* out_stream) -{ - - fprintf(out_stream, "Codestream info from main header: {\n"); - - fprintf(out_stream, "\t tx0=%d, ty0=%d\n", p_j2k->m_cp.tx0, p_j2k->m_cp.ty0); - fprintf(out_stream, "\t tdx=%d, tdy=%d\n", p_j2k->m_cp.tdx, p_j2k->m_cp.tdy); - fprintf(out_stream, "\t tw=%d, th=%d\n", p_j2k->m_cp.tw, p_j2k->m_cp.th); - opj_j2k_dump_tile_info(p_j2k->m_specific_param.m_decoder.m_default_tcp, - (OPJ_INT32)p_j2k->m_private_image->numcomps, out_stream); - fprintf(out_stream, "}\n"); -} - -void j2k_dump_image_header(opj_image_t* img_header, OPJ_BOOL dev_dump_flag, - FILE* out_stream) -{ - char tab[2]; - - if (dev_dump_flag) { - fprintf(stdout, "[DEV] Dump an image_header struct {\n"); - tab[0] = '\0'; - } else { - fprintf(out_stream, "Image info {\n"); - tab[0] = '\t'; - tab[1] = '\0'; - } - - fprintf(out_stream, "%s x0=%d, y0=%d\n", tab, img_header->x0, img_header->y0); - fprintf(out_stream, "%s x1=%d, y1=%d\n", tab, img_header->x1, - img_header->y1); - fprintf(out_stream, "%s numcomps=%d\n", tab, img_header->numcomps); - - if (img_header->comps) { - OPJ_UINT32 compno; - for (compno = 0; compno < img_header->numcomps; compno++) { - fprintf(out_stream, "%s\t component %d {\n", tab, compno); - j2k_dump_image_comp_header(&(img_header->comps[compno]), dev_dump_flag, - out_stream); - fprintf(out_stream, "%s}\n", tab); - } - } - - fprintf(out_stream, "}\n"); -} - -void j2k_dump_image_comp_header(opj_image_comp_t* comp_header, - OPJ_BOOL dev_dump_flag, FILE* out_stream) -{ - char tab[3]; - - if (dev_dump_flag) { - fprintf(stdout, "[DEV] Dump an image_comp_header struct {\n"); - tab[0] = '\0'; - } else { - tab[0] = '\t'; - tab[1] = '\t'; - tab[2] = '\0'; - } - - fprintf(out_stream, "%s dx=%d, dy=%d\n", tab, comp_header->dx, comp_header->dy); - fprintf(out_stream, "%s prec=%d\n", tab, comp_header->prec); - fprintf(out_stream, "%s sgnd=%d\n", tab, comp_header->sgnd); - - if (dev_dump_flag) { - fprintf(out_stream, "}\n"); - } -} - -opj_codestream_info_v2_t* j2k_get_cstr_info(opj_j2k_t* p_j2k) -{ - OPJ_UINT32 compno; - OPJ_UINT32 numcomps = p_j2k->m_private_image->numcomps; - opj_tcp_t *l_default_tile; - opj_codestream_info_v2_t* cstr_info = (opj_codestream_info_v2_t*) opj_calloc(1, - sizeof(opj_codestream_info_v2_t)); - if (!cstr_info) { - return NULL; - } - - cstr_info->nbcomps = p_j2k->m_private_image->numcomps; - - cstr_info->tx0 = p_j2k->m_cp.tx0; - cstr_info->ty0 = p_j2k->m_cp.ty0; - cstr_info->tdx = p_j2k->m_cp.tdx; - cstr_info->tdy = p_j2k->m_cp.tdy; - cstr_info->tw = p_j2k->m_cp.tw; - cstr_info->th = p_j2k->m_cp.th; - - cstr_info->tile_info = NULL; /* Not fill from the main header*/ - - l_default_tile = p_j2k->m_specific_param.m_decoder.m_default_tcp; - - cstr_info->m_default_tile_info.csty = l_default_tile->csty; - cstr_info->m_default_tile_info.prg = l_default_tile->prg; - cstr_info->m_default_tile_info.numlayers = l_default_tile->numlayers; - cstr_info->m_default_tile_info.mct = l_default_tile->mct; 
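/* What follows snapshots the default tile's per-component parameters
 * into the public opj_tccp_info_t records: coding style, resolution
 * count, code-block geometry and precinct sizes (bounded by
 * OPJ_J2K_MAXRLVLS), plus the quantization step sizes, which are
 * stored as separate mantissa and exponent arrays (bounded by
 * OPJ_J2K_MAXBANDS) rather than as opj_stepsize_t pairs. */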
- - cstr_info->m_default_tile_info.tccp_info = (opj_tccp_info_t*) opj_calloc( - cstr_info->nbcomps, sizeof(opj_tccp_info_t)); - if (!cstr_info->m_default_tile_info.tccp_info) { - opj_destroy_cstr_info(&cstr_info); - return NULL; - } - - for (compno = 0; compno < numcomps; compno++) { - opj_tccp_t *l_tccp = &(l_default_tile->tccps[compno]); - opj_tccp_info_t *l_tccp_info = & - (cstr_info->m_default_tile_info.tccp_info[compno]); - OPJ_INT32 bandno, numbands; - - /* coding style*/ - l_tccp_info->csty = l_tccp->csty; - l_tccp_info->numresolutions = l_tccp->numresolutions; - l_tccp_info->cblkw = l_tccp->cblkw; - l_tccp_info->cblkh = l_tccp->cblkh; - l_tccp_info->cblksty = l_tccp->cblksty; - l_tccp_info->qmfbid = l_tccp->qmfbid; - if (l_tccp->numresolutions < OPJ_J2K_MAXRLVLS) { - memcpy(l_tccp_info->prch, l_tccp->prch, l_tccp->numresolutions); - memcpy(l_tccp_info->prcw, l_tccp->prcw, l_tccp->numresolutions); - } - - /* quantization style*/ - l_tccp_info->qntsty = l_tccp->qntsty; - l_tccp_info->numgbits = l_tccp->numgbits; - - numbands = (l_tccp->qntsty == J2K_CCP_QNTSTY_SIQNT) ? 1 : - (OPJ_INT32)l_tccp->numresolutions * 3 - 2; - if (numbands < OPJ_J2K_MAXBANDS) { - for (bandno = 0; bandno < numbands; bandno++) { - l_tccp_info->stepsizes_mant[bandno] = (OPJ_UINT32) - l_tccp->stepsizes[bandno].mant; - l_tccp_info->stepsizes_expn[bandno] = (OPJ_UINT32) - l_tccp->stepsizes[bandno].expn; - } - } - - /* RGN value*/ - l_tccp_info->roishift = l_tccp->roishift; - } - - return cstr_info; -} - -opj_codestream_index_t* j2k_get_cstr_index(opj_j2k_t* p_j2k) -{ - opj_codestream_index_t* l_cstr_index = (opj_codestream_index_t*) - opj_calloc(1, sizeof(opj_codestream_index_t)); - if (!l_cstr_index) { - return NULL; - } - - l_cstr_index->main_head_start = p_j2k->cstr_index->main_head_start; - l_cstr_index->main_head_end = p_j2k->cstr_index->main_head_end; - l_cstr_index->codestream_size = p_j2k->cstr_index->codestream_size; - - l_cstr_index->marknum = p_j2k->cstr_index->marknum; - l_cstr_index->marker = (opj_marker_info_t*)opj_malloc(l_cstr_index->marknum * - sizeof(opj_marker_info_t)); - if (!l_cstr_index->marker) { - opj_free(l_cstr_index); - return NULL; - } - - if (p_j2k->cstr_index->marker) { - memcpy(l_cstr_index->marker, p_j2k->cstr_index->marker, - l_cstr_index->marknum * sizeof(opj_marker_info_t)); - } else { - opj_free(l_cstr_index->marker); - l_cstr_index->marker = NULL; - } - - l_cstr_index->nb_of_tiles = p_j2k->cstr_index->nb_of_tiles; - l_cstr_index->tile_index = (opj_tile_index_t*)opj_calloc( - l_cstr_index->nb_of_tiles, sizeof(opj_tile_index_t)); - if (!l_cstr_index->tile_index) { - opj_free(l_cstr_index->marker); - opj_free(l_cstr_index); - return NULL; - } - - if (!p_j2k->cstr_index->tile_index) { - opj_free(l_cstr_index->tile_index); - l_cstr_index->tile_index = NULL; - } else { - OPJ_UINT32 it_tile = 0; - for (it_tile = 0; it_tile < l_cstr_index->nb_of_tiles; it_tile++) { - - /* Tile Marker*/ - l_cstr_index->tile_index[it_tile].marknum = - p_j2k->cstr_index->tile_index[it_tile].marknum; - - l_cstr_index->tile_index[it_tile].marker = - (opj_marker_info_t*)opj_malloc(l_cstr_index->tile_index[it_tile].marknum * - sizeof(opj_marker_info_t)); - - if (!l_cstr_index->tile_index[it_tile].marker) { - OPJ_UINT32 it_tile_free; - - for (it_tile_free = 0; it_tile_free < it_tile; it_tile_free++) { - opj_free(l_cstr_index->tile_index[it_tile_free].marker); - } - - opj_free(l_cstr_index->tile_index); - opj_free(l_cstr_index->marker); - opj_free(l_cstr_index); - return NULL; - } - - if 
(p_j2k->cstr_index->tile_index[it_tile].marker) - memcpy(l_cstr_index->tile_index[it_tile].marker, - p_j2k->cstr_index->tile_index[it_tile].marker, - l_cstr_index->tile_index[it_tile].marknum * sizeof(opj_marker_info_t)); - else { - opj_free(l_cstr_index->tile_index[it_tile].marker); - l_cstr_index->tile_index[it_tile].marker = NULL; - } - - /* Tile part index*/ - l_cstr_index->tile_index[it_tile].nb_tps = - p_j2k->cstr_index->tile_index[it_tile].nb_tps; - - l_cstr_index->tile_index[it_tile].tp_index = - (opj_tp_index_t*)opj_malloc(l_cstr_index->tile_index[it_tile].nb_tps * sizeof( - opj_tp_index_t)); - - if (!l_cstr_index->tile_index[it_tile].tp_index) { - OPJ_UINT32 it_tile_free; - - for (it_tile_free = 0; it_tile_free < it_tile; it_tile_free++) { - opj_free(l_cstr_index->tile_index[it_tile_free].marker); - opj_free(l_cstr_index->tile_index[it_tile_free].tp_index); - } - - opj_free(l_cstr_index->tile_index); - opj_free(l_cstr_index->marker); - opj_free(l_cstr_index); - return NULL; - } - - if (p_j2k->cstr_index->tile_index[it_tile].tp_index) { - memcpy(l_cstr_index->tile_index[it_tile].tp_index, - p_j2k->cstr_index->tile_index[it_tile].tp_index, - l_cstr_index->tile_index[it_tile].nb_tps * sizeof(opj_tp_index_t)); - } else { - opj_free(l_cstr_index->tile_index[it_tile].tp_index); - l_cstr_index->tile_index[it_tile].tp_index = NULL; - } - - /* Packet index (NOT USED)*/ - l_cstr_index->tile_index[it_tile].nb_packet = 0; - l_cstr_index->tile_index[it_tile].packet_index = NULL; - - } - } - - return l_cstr_index; -} - -static OPJ_BOOL opj_j2k_allocate_tile_element_cstr_index(opj_j2k_t *p_j2k) -{ - OPJ_UINT32 it_tile = 0; - - p_j2k->cstr_index->nb_of_tiles = p_j2k->m_cp.tw * p_j2k->m_cp.th; - p_j2k->cstr_index->tile_index = (opj_tile_index_t*)opj_calloc( - p_j2k->cstr_index->nb_of_tiles, sizeof(opj_tile_index_t)); - if (!p_j2k->cstr_index->tile_index) { - return OPJ_FALSE; - } - - for (it_tile = 0; it_tile < p_j2k->cstr_index->nb_of_tiles; it_tile++) { - p_j2k->cstr_index->tile_index[it_tile].maxmarknum = 100; - p_j2k->cstr_index->tile_index[it_tile].marknum = 0; - p_j2k->cstr_index->tile_index[it_tile].marker = (opj_marker_info_t*) - opj_calloc(p_j2k->cstr_index->tile_index[it_tile].maxmarknum, - sizeof(opj_marker_info_t)); - if (!p_j2k->cstr_index->tile_index[it_tile].marker) { - return OPJ_FALSE; - } - } - - return OPJ_TRUE; -} - -static OPJ_BOOL opj_j2k_decode_tiles(opj_j2k_t *p_j2k, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager) -{ - OPJ_BOOL l_go_on = OPJ_TRUE; - OPJ_UINT32 l_current_tile_no; - OPJ_INT32 l_tile_x0, l_tile_y0, l_tile_x1, l_tile_y1; - OPJ_UINT32 l_nb_comps; - OPJ_UINT32 nr_tiles = 0; - - /* Particular case for whole single tile decoding */ - /* We can avoid allocating intermediate tile buffers */ - if (p_j2k->m_cp.tw == 1 && p_j2k->m_cp.th == 1 && - p_j2k->m_cp.tx0 == 0 && p_j2k->m_cp.ty0 == 0 && - p_j2k->m_output_image->x0 == 0 && - p_j2k->m_output_image->y0 == 0 && - p_j2k->m_output_image->x1 == p_j2k->m_cp.tdx && - p_j2k->m_output_image->y1 == p_j2k->m_cp.tdy) { - OPJ_UINT32 i; - if (! opj_j2k_read_tile_header(p_j2k, - &l_current_tile_no, - NULL, - &l_tile_x0, &l_tile_y0, - &l_tile_x1, &l_tile_y1, - &l_nb_comps, - &l_go_on, - p_stream, - p_manager)) { - return OPJ_FALSE; - } - - if (! 
opj_j2k_decode_tile(p_j2k, l_current_tile_no, NULL, 0, - p_stream, p_manager)) { - opj_event_msg(p_manager, EVT_ERROR, "Failed to decode tile 1/1\n"); - return OPJ_FALSE; - } - - /* Transfer TCD data to output image data */ - for (i = 0; i < p_j2k->m_output_image->numcomps; i++) { - opj_image_data_free(p_j2k->m_output_image->comps[i].data); - p_j2k->m_output_image->comps[i].data = - p_j2k->m_tcd->tcd_image->tiles->comps[i].data; - p_j2k->m_output_image->comps[i].resno_decoded = - p_j2k->m_tcd->image->comps[i].resno_decoded; - p_j2k->m_tcd->tcd_image->tiles->comps[i].data = NULL; - } - - return OPJ_TRUE; - } - - for (;;) { - if (p_j2k->m_cp.tw == 1 && p_j2k->m_cp.th == 1 && - p_j2k->m_cp.tcps[0].m_data != NULL) { - l_current_tile_no = 0; - p_j2k->m_current_tile_number = 0; - p_j2k->m_specific_param.m_decoder.m_state |= J2K_STATE_DATA; - } else { - if (! opj_j2k_read_tile_header(p_j2k, - &l_current_tile_no, - NULL, - &l_tile_x0, &l_tile_y0, - &l_tile_x1, &l_tile_y1, - &l_nb_comps, - &l_go_on, - p_stream, - p_manager)) { - return OPJ_FALSE; - } - - if (! l_go_on) { - break; - } - } - - if (! opj_j2k_decode_tile(p_j2k, l_current_tile_no, NULL, 0, - p_stream, p_manager)) { - opj_event_msg(p_manager, EVT_ERROR, "Failed to decode tile %d/%d\n", - l_current_tile_no + 1, p_j2k->m_cp.th * p_j2k->m_cp.tw); - return OPJ_FALSE; - } - - opj_event_msg(p_manager, EVT_INFO, "Tile %d/%d has been decoded.\n", - l_current_tile_no + 1, p_j2k->m_cp.th * p_j2k->m_cp.tw); - - if (! opj_j2k_update_image_data(p_j2k->m_tcd, - p_j2k->m_output_image)) { - return OPJ_FALSE; - } - - if (p_j2k->m_cp.tw == 1 && p_j2k->m_cp.th == 1 && - !(p_j2k->m_output_image->x0 == p_j2k->m_private_image->x0 && - p_j2k->m_output_image->y0 == p_j2k->m_private_image->y0 && - p_j2k->m_output_image->x1 == p_j2k->m_private_image->x1 && - p_j2k->m_output_image->y1 == p_j2k->m_private_image->y1)) { - /* Keep current tcp data */ - } else { - opj_j2k_tcp_data_destroy(&p_j2k->m_cp.tcps[l_current_tile_no]); - } - - opj_event_msg(p_manager, EVT_INFO, - "Image data has been updated with tile %d.\n\n", l_current_tile_no + 1); - - if (opj_stream_get_number_byte_left(p_stream) == 0 - && p_j2k->m_specific_param.m_decoder.m_state == J2K_STATE_NEOC) { - break; - } - if (++nr_tiles == p_j2k->m_cp.th * p_j2k->m_cp.tw) { - break; - } - } - - return OPJ_TRUE; -} - -/** - * Sets up the procedures to do on decoding data. Developpers wanting to extend the library can add their own reading procedures. - */ -static OPJ_BOOL opj_j2k_setup_decoding(opj_j2k_t *p_j2k, - opj_event_mgr_t * p_manager) -{ - /* preconditions*/ - assert(p_j2k != 00); - assert(p_manager != 00); - - if (! opj_procedure_list_add_procedure(p_j2k->m_procedure_list, - (opj_procedure)opj_j2k_decode_tiles, p_manager)) { - return OPJ_FALSE; - } - /* DEVELOPER CORNER, add your custom procedures */ - - return OPJ_TRUE; -} - -/* - * Read and decode one tile. 
- */ -static OPJ_BOOL opj_j2k_decode_one_tile(opj_j2k_t *p_j2k, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager) -{ - OPJ_BOOL l_go_on = OPJ_TRUE; - OPJ_UINT32 l_current_tile_no; - OPJ_UINT32 l_tile_no_to_dec; - OPJ_INT32 l_tile_x0, l_tile_y0, l_tile_x1, l_tile_y1; - OPJ_UINT32 l_nb_comps; - OPJ_UINT32 l_nb_tiles; - OPJ_UINT32 i; - - /*Allocate and initialize some elements of codestrem index if not already done*/ - if (!p_j2k->cstr_index->tile_index) { - if (!opj_j2k_allocate_tile_element_cstr_index(p_j2k)) { - return OPJ_FALSE; - } - } - /* Move into the codestream to the first SOT used to decode the desired tile */ - l_tile_no_to_dec = (OPJ_UINT32) - p_j2k->m_specific_param.m_decoder.m_tile_ind_to_dec; - if (p_j2k->cstr_index->tile_index) - if (p_j2k->cstr_index->tile_index->tp_index) { - if (! p_j2k->cstr_index->tile_index[l_tile_no_to_dec].nb_tps) { - /* the index for this tile has not been built, - * so move to the last SOT read */ - if (!(opj_stream_read_seek(p_stream, - p_j2k->m_specific_param.m_decoder.m_last_sot_read_pos + 2, p_manager))) { - opj_event_msg(p_manager, EVT_ERROR, "Problem with seek function\n"); - return OPJ_FALSE; - } - } else { - if (!(opj_stream_read_seek(p_stream, - p_j2k->cstr_index->tile_index[l_tile_no_to_dec].tp_index[0].start_pos + 2, - p_manager))) { - opj_event_msg(p_manager, EVT_ERROR, "Problem with seek function\n"); - return OPJ_FALSE; - } - } - /* Special case if we have previously read the EOC marker (if the previous tile getted is the last ) */ - if (p_j2k->m_specific_param.m_decoder.m_state == J2K_STATE_EOC) { - p_j2k->m_specific_param.m_decoder.m_state = J2K_STATE_TPHSOT; - } - } - - /* Reset current tile part number for all tiles, and not only the one */ - /* of interest. */ - /* Not completely sure this is always correct but required for */ - /* ./build/bin/j2k_random_tile_access ./build/tests/tte1.j2k */ - l_nb_tiles = p_j2k->m_cp.tw * p_j2k->m_cp.th; - for (i = 0; i < l_nb_tiles; ++i) { - p_j2k->m_cp.tcps[i].m_current_tile_part_number = -1; - } - - for (;;) { - if (! opj_j2k_read_tile_header(p_j2k, - &l_current_tile_no, - NULL, - &l_tile_x0, &l_tile_y0, - &l_tile_x1, &l_tile_y1, - &l_nb_comps, - &l_go_on, - p_stream, - p_manager)) { - return OPJ_FALSE; - } - - if (! l_go_on) { - break; - } - - if (! opj_j2k_decode_tile(p_j2k, l_current_tile_no, NULL, 0, - p_stream, p_manager)) { - return OPJ_FALSE; - } - opj_event_msg(p_manager, EVT_INFO, "Tile %d/%d has been decoded.\n", - l_current_tile_no + 1, p_j2k->m_cp.th * p_j2k->m_cp.tw); - - if (! opj_j2k_update_image_data(p_j2k->m_tcd, - p_j2k->m_output_image)) { - return OPJ_FALSE; - } - opj_j2k_tcp_data_destroy(&p_j2k->m_cp.tcps[l_current_tile_no]); - - opj_event_msg(p_manager, EVT_INFO, - "Image data has been updated with tile %d.\n\n", l_current_tile_no + 1); - - if (l_current_tile_no == l_tile_no_to_dec) { - /* move into the codestream to the first SOT (FIXME or not move?)*/ - if (!(opj_stream_read_seek(p_stream, p_j2k->cstr_index->main_head_end + 2, - p_manager))) { - opj_event_msg(p_manager, EVT_ERROR, "Problem with seek function\n"); - return OPJ_FALSE; - } - break; - } else { - opj_event_msg(p_manager, EVT_WARNING, - "Tile read, decoded and updated is not the desired one (%d vs %d).\n", - l_current_tile_no + 1, l_tile_no_to_dec + 1); - } - - } - - return OPJ_TRUE; -} - -/** - * Sets up the procedures to do on decoding one tile. Developpers wanting to extend the library can add their own reading procedures. 
- */ -static OPJ_BOOL opj_j2k_setup_decoding_tile(opj_j2k_t *p_j2k, - opj_event_mgr_t * p_manager) -{ - /* preconditions*/ - assert(p_j2k != 00); - assert(p_manager != 00); - - if (! opj_procedure_list_add_procedure(p_j2k->m_procedure_list, - (opj_procedure)opj_j2k_decode_one_tile, p_manager)) { - return OPJ_FALSE; - } - /* DEVELOPER CORNER, add your custom procedures */ - - return OPJ_TRUE; -} - -static OPJ_BOOL opj_j2k_move_data_from_codec_to_output_image(opj_j2k_t * p_j2k, - opj_image_t * p_image) -{ - OPJ_UINT32 compno; - - /* Move data and copy one information from codec to output image*/ - if (p_j2k->m_specific_param.m_decoder.m_numcomps_to_decode > 0) { - opj_image_comp_t* newcomps = - (opj_image_comp_t*) opj_malloc( - p_j2k->m_specific_param.m_decoder.m_numcomps_to_decode * - sizeof(opj_image_comp_t)); - if (newcomps == NULL) { - opj_image_destroy(p_j2k->m_private_image); - p_j2k->m_private_image = NULL; - return OPJ_FALSE; - } - for (compno = 0; compno < p_image->numcomps; compno++) { - opj_image_data_free(p_image->comps[compno].data); - p_image->comps[compno].data = NULL; - } - for (compno = 0; - compno < p_j2k->m_specific_param.m_decoder.m_numcomps_to_decode; compno++) { - OPJ_UINT32 src_compno = - p_j2k->m_specific_param.m_decoder.m_comps_indices_to_decode[compno]; - memcpy(&(newcomps[compno]), - &(p_j2k->m_output_image->comps[src_compno]), - sizeof(opj_image_comp_t)); - newcomps[compno].resno_decoded = - p_j2k->m_output_image->comps[src_compno].resno_decoded; - newcomps[compno].data = p_j2k->m_output_image->comps[src_compno].data; - p_j2k->m_output_image->comps[src_compno].data = NULL; - } - for (compno = 0; compno < p_image->numcomps; compno++) { - assert(p_j2k->m_output_image->comps[compno].data == NULL); - opj_image_data_free(p_j2k->m_output_image->comps[compno].data); - p_j2k->m_output_image->comps[compno].data = NULL; - } - p_image->numcomps = p_j2k->m_specific_param.m_decoder.m_numcomps_to_decode; - opj_free(p_image->comps); - p_image->comps = newcomps; - } else { - for (compno = 0; compno < p_image->numcomps; compno++) { - p_image->comps[compno].resno_decoded = - p_j2k->m_output_image->comps[compno].resno_decoded; - opj_image_data_free(p_image->comps[compno].data); - p_image->comps[compno].data = p_j2k->m_output_image->comps[compno].data; -#if 0 - char fn[256]; - sprintf(fn, "/tmp/%d.raw", compno); - FILE *debug = fopen(fn, "wb"); - fwrite(p_image->comps[compno].data, sizeof(OPJ_INT32), - p_image->comps[compno].w * p_image->comps[compno].h, debug); - fclose(debug); -#endif - p_j2k->m_output_image->comps[compno].data = NULL; - } - } - return OPJ_TRUE; -} - -OPJ_BOOL opj_j2k_decode(opj_j2k_t * p_j2k, - opj_stream_private_t * p_stream, - opj_image_t * p_image, - opj_event_mgr_t * p_manager) -{ - if (!p_image) { - return OPJ_FALSE; - } - - /* Heuristics to detect sequence opj_read_header(), opj_set_decoded_resolution_factor() */ - /* and finally opj_decode_image() without manual setting of comps[].factor */ - /* We could potentially always execute it, if we don't allow people to do */ - /* opj_read_header(), modify x0,y0,x1,y1 of returned image an call opj_decode_image() */ - if (p_j2k->m_cp.m_specific_param.m_dec.m_reduce > 0 && - p_j2k->m_private_image != NULL && - p_j2k->m_private_image->numcomps > 0 && - p_j2k->m_private_image->comps[0].factor == - p_j2k->m_cp.m_specific_param.m_dec.m_reduce && - p_image->numcomps > 0 && - p_image->comps[0].factor == 0 && - /* Don't mess with image dimension if the user has allocated it */ - p_image->comps[0].data == NULL) { - 
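/* This branch covers a caller that configured a resolution reduction on
 * the codec but reuses the image returned by opj_read_header() without
 * adjusting comps[].factor itself: m_reduce is propagated to the output
 * image and the component dimensions are recomputed. A minimal sketch of
 * the public API sequence this heuristic is meant to support:
 *
 *     opj_read_header(l_stream, l_codec, &l_image);
 *     opj_set_decoded_resolution_factor(l_codec, 2);
 *     opj_decode(l_codec, l_stream, l_image);
 *
 * With a factor of 2, each component dimension is divided by 2^2,
 * rounded up per component, by the ceildivpow2 arithmetic in
 * opj_j2k_update_image_dimensions(). */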
OPJ_UINT32 it_comp; - - /* Update the comps[].factor member of the output image with the one */ - /* of m_reduce */ - for (it_comp = 0; it_comp < p_image->numcomps; ++it_comp) { - p_image->comps[it_comp].factor = p_j2k->m_cp.m_specific_param.m_dec.m_reduce; - } - if (!opj_j2k_update_image_dimensions(p_image, p_manager)) { - return OPJ_FALSE; - } - } - - if (p_j2k->m_output_image == NULL) { - p_j2k->m_output_image = opj_image_create0(); - if (!(p_j2k->m_output_image)) { - return OPJ_FALSE; - } - } - opj_copy_image_header(p_image, p_j2k->m_output_image); - - /* customization of the decoding */ - if (!opj_j2k_setup_decoding(p_j2k, p_manager)) { - return OPJ_FALSE; - } - - /* Decode the codestream */ - if (! opj_j2k_exec(p_j2k, p_j2k->m_procedure_list, p_stream, p_manager)) { - opj_image_destroy(p_j2k->m_private_image); - p_j2k->m_private_image = NULL; - return OPJ_FALSE; - } - - /* Move data and copy one information from codec to output image*/ - return opj_j2k_move_data_from_codec_to_output_image(p_j2k, p_image); -} - -OPJ_BOOL opj_j2k_get_tile(opj_j2k_t *p_j2k, - opj_stream_private_t *p_stream, - opj_image_t* p_image, - opj_event_mgr_t * p_manager, - OPJ_UINT32 tile_index) -{ - OPJ_UINT32 compno; - OPJ_UINT32 l_tile_x, l_tile_y; - opj_image_comp_t* l_img_comp; - - if (!p_image) { - opj_event_msg(p_manager, EVT_ERROR, "We need an image previously created.\n"); - return OPJ_FALSE; - } - - if (p_image->numcomps < p_j2k->m_private_image->numcomps) { - opj_event_msg(p_manager, EVT_ERROR, - "Image has less components than codestream.\n"); - return OPJ_FALSE; - } - - if (/*(tile_index < 0) &&*/ (tile_index >= p_j2k->m_cp.tw * p_j2k->m_cp.th)) { - opj_event_msg(p_manager, EVT_ERROR, - "Tile index provided by the user is incorrect %d (max = %d) \n", tile_index, - (p_j2k->m_cp.tw * p_j2k->m_cp.th) - 1); - return OPJ_FALSE; - } - - /* Compute the dimension of the desired tile*/ - l_tile_x = tile_index % p_j2k->m_cp.tw; - l_tile_y = tile_index / p_j2k->m_cp.tw; - - p_image->x0 = l_tile_x * p_j2k->m_cp.tdx + p_j2k->m_cp.tx0; - if (p_image->x0 < p_j2k->m_private_image->x0) { - p_image->x0 = p_j2k->m_private_image->x0; - } - p_image->x1 = (l_tile_x + 1) * p_j2k->m_cp.tdx + p_j2k->m_cp.tx0; - if (p_image->x1 > p_j2k->m_private_image->x1) { - p_image->x1 = p_j2k->m_private_image->x1; - } - - p_image->y0 = l_tile_y * p_j2k->m_cp.tdy + p_j2k->m_cp.ty0; - if (p_image->y0 < p_j2k->m_private_image->y0) { - p_image->y0 = p_j2k->m_private_image->y0; - } - p_image->y1 = (l_tile_y + 1) * p_j2k->m_cp.tdy + p_j2k->m_cp.ty0; - if (p_image->y1 > p_j2k->m_private_image->y1) { - p_image->y1 = p_j2k->m_private_image->y1; - } - - l_img_comp = p_image->comps; - for (compno = 0; compno < p_j2k->m_private_image->numcomps; ++compno) { - OPJ_INT32 l_comp_x1, l_comp_y1; - - l_img_comp->factor = p_j2k->m_private_image->comps[compno].factor; - - l_img_comp->x0 = (OPJ_UINT32)opj_int_ceildiv((OPJ_INT32)p_image->x0, - (OPJ_INT32)l_img_comp->dx); - l_img_comp->y0 = (OPJ_UINT32)opj_int_ceildiv((OPJ_INT32)p_image->y0, - (OPJ_INT32)l_img_comp->dy); - l_comp_x1 = opj_int_ceildiv((OPJ_INT32)p_image->x1, (OPJ_INT32)l_img_comp->dx); - l_comp_y1 = opj_int_ceildiv((OPJ_INT32)p_image->y1, (OPJ_INT32)l_img_comp->dy); - - l_img_comp->w = (OPJ_UINT32)(opj_int_ceildivpow2(l_comp_x1, - (OPJ_INT32)l_img_comp->factor) - opj_int_ceildivpow2((OPJ_INT32)l_img_comp->x0, - (OPJ_INT32)l_img_comp->factor)); - l_img_comp->h = (OPJ_UINT32)(opj_int_ceildivpow2(l_comp_y1, - (OPJ_INT32)l_img_comp->factor) - opj_int_ceildivpow2((OPJ_INT32)l_img_comp->y0, - 
(OPJ_INT32)l_img_comp->factor)); - - l_img_comp++; - } - - if (p_image->numcomps > p_j2k->m_private_image->numcomps) { - /* Can happen when calling repeatdly opj_get_decoded_tile() on an - * image with a color palette, where color palette expansion is done - * later in jp2.c */ - for (compno = p_j2k->m_private_image->numcomps; compno < p_image->numcomps; - ++compno) { - opj_image_data_free(p_image->comps[compno].data); - p_image->comps[compno].data = NULL; - } - p_image->numcomps = p_j2k->m_private_image->numcomps; - } - - /* Destroy the previous output image*/ - if (p_j2k->m_output_image) { - opj_image_destroy(p_j2k->m_output_image); - } - - /* Create the ouput image from the information previously computed*/ - p_j2k->m_output_image = opj_image_create0(); - if (!(p_j2k->m_output_image)) { - return OPJ_FALSE; - } - opj_copy_image_header(p_image, p_j2k->m_output_image); - - p_j2k->m_specific_param.m_decoder.m_tile_ind_to_dec = (OPJ_INT32)tile_index; - - /* customization of the decoding */ - if (!opj_j2k_setup_decoding_tile(p_j2k, p_manager)) { - return OPJ_FALSE; - } - - /* Decode the codestream */ - if (! opj_j2k_exec(p_j2k, p_j2k->m_procedure_list, p_stream, p_manager)) { - opj_image_destroy(p_j2k->m_private_image); - p_j2k->m_private_image = NULL; - return OPJ_FALSE; - } - - /* Move data and copy one information from codec to output image*/ - return opj_j2k_move_data_from_codec_to_output_image(p_j2k, p_image); -} - -OPJ_BOOL opj_j2k_set_decoded_resolution_factor(opj_j2k_t *p_j2k, - OPJ_UINT32 res_factor, - opj_event_mgr_t * p_manager) -{ - OPJ_UINT32 it_comp; - - p_j2k->m_cp.m_specific_param.m_dec.m_reduce = res_factor; - - if (p_j2k->m_private_image) { - if (p_j2k->m_private_image->comps) { - if (p_j2k->m_specific_param.m_decoder.m_default_tcp) { - if (p_j2k->m_specific_param.m_decoder.m_default_tcp->tccps) { - for (it_comp = 0 ; it_comp < p_j2k->m_private_image->numcomps; it_comp++) { - OPJ_UINT32 max_res = - p_j2k->m_specific_param.m_decoder.m_default_tcp->tccps[it_comp].numresolutions; - if (res_factor >= max_res) { - opj_event_msg(p_manager, EVT_ERROR, - "Resolution factor is greater than the maximum resolution in the component.\n"); - return OPJ_FALSE; - } - p_j2k->m_private_image->comps[it_comp].factor = res_factor; - } - return OPJ_TRUE; - } - } - } - } - - return OPJ_FALSE; -} - -OPJ_BOOL opj_j2k_encode(opj_j2k_t * p_j2k, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager) -{ - OPJ_UINT32 i, j; - OPJ_UINT32 l_nb_tiles; - OPJ_SIZE_T l_max_tile_size = 0, l_current_tile_size; - OPJ_BYTE * l_current_data = 00; - OPJ_BOOL l_reuse_data = OPJ_FALSE; - opj_tcd_t* p_tcd = 00; - - /* preconditions */ - assert(p_j2k != 00); - assert(p_stream != 00); - assert(p_manager != 00); - - p_tcd = p_j2k->m_tcd; - - l_nb_tiles = p_j2k->m_cp.th * p_j2k->m_cp.tw; - if (l_nb_tiles == 1) { - l_reuse_data = OPJ_TRUE; -#ifdef __SSE__ - for (j = 0; j < p_j2k->m_tcd->image->numcomps; ++j) { - opj_image_comp_t * l_img_comp = p_tcd->image->comps + j; - if (((size_t)l_img_comp->data & 0xFU) != - 0U) { /* tile data shall be aligned on 16 bytes */ - l_reuse_data = OPJ_FALSE; - } - } -#endif - } - for (i = 0; i < l_nb_tiles; ++i) { - if (! 
opj_j2k_pre_write_tile(p_j2k, i, p_stream, p_manager)) { - if (l_current_data) { - opj_free(l_current_data); - } - return OPJ_FALSE; - } - - /* if we only have one tile, then simply set tile component data equal to image component data */ - /* otherwise, allocate the data */ - for (j = 0; j < p_j2k->m_tcd->image->numcomps; ++j) { - opj_tcd_tilecomp_t* l_tilec = p_tcd->tcd_image->tiles->comps + j; - if (l_reuse_data) { - opj_image_comp_t * l_img_comp = p_tcd->image->comps + j; - l_tilec->data = l_img_comp->data; - l_tilec->ownsData = OPJ_FALSE; - } else { - if (! opj_alloc_tile_component_data(l_tilec)) { - opj_event_msg(p_manager, EVT_ERROR, "Error allocating tile component data."); - if (l_current_data) { - opj_free(l_current_data); - } - return OPJ_FALSE; - } - } - } - l_current_tile_size = opj_tcd_get_encoded_tile_size(p_j2k->m_tcd); - if (!l_reuse_data) { - if (l_current_tile_size > l_max_tile_size) { - OPJ_BYTE *l_new_current_data = (OPJ_BYTE *) opj_realloc(l_current_data, - l_current_tile_size); - if (! l_new_current_data) { - if (l_current_data) { - opj_free(l_current_data); - } - opj_event_msg(p_manager, EVT_ERROR, "Not enough memory to encode all tiles\n"); - return OPJ_FALSE; - } - l_current_data = l_new_current_data; - l_max_tile_size = l_current_tile_size; - } - if (l_current_data == NULL) { - /* Should not happen in practice, but will avoid Coverity to */ - /* complain about a null pointer dereference */ - assert(0); - return OPJ_FALSE; - } - - /* copy image data (32 bit) to l_current_data as contiguous, all-component, zero offset buffer */ - /* 32 bit components @ 8 bit precision get converted to 8 bit */ - /* 32 bit components @ 16 bit precision get converted to 16 bit */ - opj_j2k_get_tile_data(p_j2k->m_tcd, l_current_data); - - /* now copy this data into the tile component */ - if (! opj_tcd_copy_tile_data(p_j2k->m_tcd, l_current_data, - l_current_tile_size)) { - opj_event_msg(p_manager, EVT_ERROR, - "Size mismatch between tile data and sent data."); - opj_free(l_current_data); - return OPJ_FALSE; - } - } - - if (! opj_j2k_post_write_tile(p_j2k, p_stream, p_manager)) { - if (l_current_data) { - opj_free(l_current_data); - } - return OPJ_FALSE; - } - } - - if (l_current_data) { - opj_free(l_current_data); - } - return OPJ_TRUE; -} - -OPJ_BOOL opj_j2k_end_compress(opj_j2k_t *p_j2k, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager) -{ - /* customization of the encoding */ - if (! opj_j2k_setup_end_compress(p_j2k, p_manager)) { - return OPJ_FALSE; - } - - if (! opj_j2k_exec(p_j2k, p_j2k->m_procedure_list, p_stream, p_manager)) { - return OPJ_FALSE; - } - - return OPJ_TRUE; -} - -OPJ_BOOL opj_j2k_start_compress(opj_j2k_t *p_j2k, - opj_stream_private_t *p_stream, - opj_image_t * p_image, - opj_event_mgr_t * p_manager) -{ - /* preconditions */ - assert(p_j2k != 00); - assert(p_stream != 00); - assert(p_manager != 00); - - p_j2k->m_private_image = opj_image_create0(); - if (! p_j2k->m_private_image) { - opj_event_msg(p_manager, EVT_ERROR, "Failed to allocate image header."); - return OPJ_FALSE; - } - opj_copy_image_header(p_image, p_j2k->m_private_image); - - /* TODO_MSD: Find a better way */ - if (p_image->comps) { - OPJ_UINT32 it_comp; - for (it_comp = 0 ; it_comp < p_image->numcomps; it_comp++) { - if (p_image->comps[it_comp].data) { - p_j2k->m_private_image->comps[it_comp].data = p_image->comps[it_comp].data; - p_image->comps[it_comp].data = NULL; - - } - } - } - - /* customization of the validation */ - if (! 
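/*
 * Illustrative sketch, editor's addition: the zero-copy test that
 * opj_j2k_encode() performs above for single-tile images. Tile components
 * may point straight at the image buffers, but only if SSE code paths can
 * assume 16-byte alignment, hence the low-four-bits check.
 *
 *   if (((size_t)comp->data & (size_t)0xF) == 0) {  // 16-byte aligned?
 *       tilec->data = comp->data;       // share the buffer, no copy
 *       tilec->ownsData = OPJ_FALSE;    // the tile must not free it
 *   } else {
 *       // fall back to opj_alloc_tile_component_data() plus a copy
 *   }
 */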
opj_j2k_setup_encoding_validation(p_j2k, p_manager)) { - return OPJ_FALSE; - } - - /* validation of the parameters codec */ - if (! opj_j2k_exec(p_j2k, p_j2k->m_validation_list, p_stream, p_manager)) { - return OPJ_FALSE; - } - - /* customization of the encoding */ - if (! opj_j2k_setup_header_writing(p_j2k, p_manager)) { - return OPJ_FALSE; - } - - /* write header */ - if (! opj_j2k_exec(p_j2k, p_j2k->m_procedure_list, p_stream, p_manager)) { - return OPJ_FALSE; - } - - return OPJ_TRUE; -} - -static OPJ_BOOL opj_j2k_pre_write_tile(opj_j2k_t * p_j2k, - OPJ_UINT32 p_tile_index, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager) -{ - (void)p_stream; - if (p_tile_index != p_j2k->m_current_tile_number) { - opj_event_msg(p_manager, EVT_ERROR, "The given tile index does not match."); - return OPJ_FALSE; - } - - opj_event_msg(p_manager, EVT_INFO, "tile number %d / %d\n", - p_j2k->m_current_tile_number + 1, p_j2k->m_cp.tw * p_j2k->m_cp.th); - - p_j2k->m_specific_param.m_encoder.m_current_tile_part_number = 0; - p_j2k->m_tcd->cur_totnum_tp = p_j2k->m_cp.tcps[p_tile_index].m_nb_tile_parts; - p_j2k->m_specific_param.m_encoder.m_current_poc_tile_part_number = 0; - - /* initialisation before tile encoding */ - if (! opj_tcd_init_encode_tile(p_j2k->m_tcd, p_j2k->m_current_tile_number, - p_manager)) { - return OPJ_FALSE; - } - - return OPJ_TRUE; -} - -static void opj_get_tile_dimensions(opj_image_t * l_image, - opj_tcd_tilecomp_t * l_tilec, - opj_image_comp_t * l_img_comp, - OPJ_UINT32* l_size_comp, - OPJ_UINT32* l_width, - OPJ_UINT32* l_height, - OPJ_UINT32* l_offset_x, - OPJ_UINT32* l_offset_y, - OPJ_UINT32* l_image_width, - OPJ_UINT32* l_stride, - OPJ_UINT32* l_tile_offset) -{ - OPJ_UINT32 l_remaining; - *l_size_comp = l_img_comp->prec >> 3; /* (/8) */ - l_remaining = l_img_comp->prec & 7; /* (%8) */ - if (l_remaining) { - *l_size_comp += 1; - } - - if (*l_size_comp == 3) { - *l_size_comp = 4; - } - - *l_width = (OPJ_UINT32)(l_tilec->x1 - l_tilec->x0); - *l_height = (OPJ_UINT32)(l_tilec->y1 - l_tilec->y0); - *l_offset_x = (OPJ_UINT32)opj_int_ceildiv((OPJ_INT32)l_image->x0, - (OPJ_INT32)l_img_comp->dx); - *l_offset_y = (OPJ_UINT32)opj_int_ceildiv((OPJ_INT32)l_image->y0, - (OPJ_INT32)l_img_comp->dy); - *l_image_width = (OPJ_UINT32)opj_int_ceildiv((OPJ_INT32)l_image->x1 - - (OPJ_INT32)l_image->x0, (OPJ_INT32)l_img_comp->dx); - *l_stride = *l_image_width - *l_width; - *l_tile_offset = ((OPJ_UINT32)l_tilec->x0 - *l_offset_x) + (( - OPJ_UINT32)l_tilec->y0 - *l_offset_y) * *l_image_width; -} - -static void opj_j2k_get_tile_data(opj_tcd_t * p_tcd, OPJ_BYTE * p_data) -{ - OPJ_UINT32 i, j, k = 0; - - for (i = 0; i < p_tcd->image->numcomps; ++i) { - opj_image_t * l_image = p_tcd->image; - OPJ_INT32 * l_src_ptr; - opj_tcd_tilecomp_t * l_tilec = p_tcd->tcd_image->tiles->comps + i; - opj_image_comp_t * l_img_comp = l_image->comps + i; - OPJ_UINT32 l_size_comp, l_width, l_height, l_offset_x, l_offset_y, - l_image_width, l_stride, l_tile_offset; - - opj_get_tile_dimensions(l_image, - l_tilec, - l_img_comp, - &l_size_comp, - &l_width, - &l_height, - &l_offset_x, - &l_offset_y, - &l_image_width, - &l_stride, - &l_tile_offset); - - l_src_ptr = l_img_comp->data + l_tile_offset; - - switch (l_size_comp) { - case 1: { - OPJ_CHAR * l_dest_ptr = (OPJ_CHAR*) p_data; - if (l_img_comp->sgnd) { - for (j = 0; j < l_height; ++j) { - for (k = 0; k < l_width; ++k) { - *(l_dest_ptr) = (OPJ_CHAR)(*l_src_ptr); - ++l_dest_ptr; - ++l_src_ptr; - } - l_src_ptr += l_stride; - } - } else { - for (j = 0; j < l_height; ++j) { - 
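/*
 * Illustrative sketch, editor's addition: the stride/offset scheme computed
 * by opj_get_tile_dimensions() above. The component buffer holds the whole
 * image; the tile is copied row by row, skipping "stride" samples (the
 * non-tile remainder of each image row) after every copied row.
 *
 *   OPJ_INT32 *src = comp_data + tile_offset;   // first sample of the tile
 *   OPJ_UINT32 j, k;
 *   for (j = 0; j < height; ++j) {
 *       for (k = 0; k < width; ++k) {
 *           *dst++ = *src++;                    // one tile row
 *       }
 *       src += stride;                          // stride = image_width - width
 *   }
 */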
for (k = 0; k < l_width; ++k) { - *(l_dest_ptr) = (OPJ_CHAR)((*l_src_ptr) & 0xff); - ++l_dest_ptr; - ++l_src_ptr; - } - l_src_ptr += l_stride; - } - } - - p_data = (OPJ_BYTE*) l_dest_ptr; - } - break; - case 2: { - OPJ_INT16 * l_dest_ptr = (OPJ_INT16 *) p_data; - if (l_img_comp->sgnd) { - for (j = 0; j < l_height; ++j) { - for (k = 0; k < l_width; ++k) { - *(l_dest_ptr++) = (OPJ_INT16)(*(l_src_ptr++)); - } - l_src_ptr += l_stride; - } - } else { - for (j = 0; j < l_height; ++j) { - for (k = 0; k < l_width; ++k) { - *(l_dest_ptr++) = (OPJ_INT16)((*(l_src_ptr++)) & 0xffff); - } - l_src_ptr += l_stride; - } - } - - p_data = (OPJ_BYTE*) l_dest_ptr; - } - break; - case 4: { - OPJ_INT32 * l_dest_ptr = (OPJ_INT32 *) p_data; - for (j = 0; j < l_height; ++j) { - for (k = 0; k < l_width; ++k) { - *(l_dest_ptr++) = *(l_src_ptr++); - } - l_src_ptr += l_stride; - } - - p_data = (OPJ_BYTE*) l_dest_ptr; - } - break; - } - } -} - -static OPJ_BOOL opj_j2k_post_write_tile(opj_j2k_t * p_j2k, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager) -{ - OPJ_UINT32 l_nb_bytes_written; - OPJ_BYTE * l_current_data = 00; - OPJ_UINT32 l_tile_size = 0; - OPJ_UINT32 l_available_data; - - /* preconditions */ - assert(p_j2k->m_specific_param.m_encoder.m_encoded_tile_data); - - l_tile_size = p_j2k->m_specific_param.m_encoder.m_encoded_tile_size; - l_available_data = l_tile_size; - l_current_data = p_j2k->m_specific_param.m_encoder.m_encoded_tile_data; - - l_nb_bytes_written = 0; - if (! opj_j2k_write_first_tile_part(p_j2k, l_current_data, &l_nb_bytes_written, - l_available_data, p_stream, p_manager)) { - return OPJ_FALSE; - } - l_current_data += l_nb_bytes_written; - l_available_data -= l_nb_bytes_written; - - l_nb_bytes_written = 0; - if (! opj_j2k_write_all_tile_parts(p_j2k, l_current_data, &l_nb_bytes_written, - l_available_data, p_stream, p_manager)) { - return OPJ_FALSE; - } - - l_available_data -= l_nb_bytes_written; - l_nb_bytes_written = l_tile_size - l_available_data; - - if (opj_stream_write_data(p_stream, - p_j2k->m_specific_param.m_encoder.m_encoded_tile_data, - l_nb_bytes_written, p_manager) != l_nb_bytes_written) { - return OPJ_FALSE; - } - - ++p_j2k->m_current_tile_number; - - return OPJ_TRUE; -} - -static OPJ_BOOL opj_j2k_setup_end_compress(opj_j2k_t *p_j2k, - opj_event_mgr_t * p_manager) -{ - /* preconditions */ - assert(p_j2k != 00); - assert(p_manager != 00); - - /* DEVELOPER CORNER, insert your custom procedures */ - if (! opj_procedure_list_add_procedure(p_j2k->m_procedure_list, - (opj_procedure)opj_j2k_write_eoc, p_manager)) { - return OPJ_FALSE; - } - - if (OPJ_IS_CINEMA(p_j2k->m_cp.rsiz)) { - if (! opj_procedure_list_add_procedure(p_j2k->m_procedure_list, - (opj_procedure)opj_j2k_write_updated_tlm, p_manager)) { - return OPJ_FALSE; - } - } - - if (! opj_procedure_list_add_procedure(p_j2k->m_procedure_list, - (opj_procedure)opj_j2k_write_epc, p_manager)) { - return OPJ_FALSE; - } - if (! opj_procedure_list_add_procedure(p_j2k->m_procedure_list, - (opj_procedure)opj_j2k_end_encoding, p_manager)) { - return OPJ_FALSE; - } - if (! opj_procedure_list_add_procedure(p_j2k->m_procedure_list, - (opj_procedure)opj_j2k_destroy_header_memory, p_manager)) { - return OPJ_FALSE; - } - return OPJ_TRUE; -} - -static OPJ_BOOL opj_j2k_setup_encoding_validation(opj_j2k_t *p_j2k, - opj_event_mgr_t * p_manager) -{ - /* preconditions */ - assert(p_j2k != 00); - assert(p_manager != 00); - - if (! 
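/*
 * Illustrative sketch, editor's addition: the precision-to-storage mapping
 * used by opj_get_tile_dimensions() and opj_j2k_get_tile_data() above.
 * Samples are packed at 1, 2 or 4 bytes each; a 3-byte size is promoted to
 * 4 so the copy switch only needs cases 1, 2 and 4.
 *
 *   OPJ_UINT32 size_comp = prec >> 3;        // whole bytes (prec / 8)
 *   if (prec & 7) {                          // partial byte left over?
 *       size_comp += 1;                      // round up
 *   }
 *   if (size_comp == 3) {
 *       size_comp = 4;                       // no 3-byte code path
 *   }
 *   // prec = 8 -> 1 byte;  prec = 12 -> 2 bytes;  prec = 17..32 -> 4 bytes
 */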
opj_procedure_list_add_procedure(p_j2k->m_validation_list, - (opj_procedure)opj_j2k_build_encoder, p_manager)) { - return OPJ_FALSE; - } - if (! opj_procedure_list_add_procedure(p_j2k->m_validation_list, - (opj_procedure)opj_j2k_encoding_validation, p_manager)) { - return OPJ_FALSE; - } - - /* DEVELOPER CORNER, add your custom validation procedure */ - if (! opj_procedure_list_add_procedure(p_j2k->m_validation_list, - (opj_procedure)opj_j2k_mct_validation, p_manager)) { - return OPJ_FALSE; - } - - return OPJ_TRUE; -} - -static OPJ_BOOL opj_j2k_setup_header_writing(opj_j2k_t *p_j2k, - opj_event_mgr_t * p_manager) -{ - /* preconditions */ - assert(p_j2k != 00); - assert(p_manager != 00); - - if (! opj_procedure_list_add_procedure(p_j2k->m_procedure_list, - (opj_procedure)opj_j2k_init_info, p_manager)) { - return OPJ_FALSE; - } - if (! opj_procedure_list_add_procedure(p_j2k->m_procedure_list, - (opj_procedure)opj_j2k_write_soc, p_manager)) { - return OPJ_FALSE; - } - if (! opj_procedure_list_add_procedure(p_j2k->m_procedure_list, - (opj_procedure)opj_j2k_write_siz, p_manager)) { - return OPJ_FALSE; - } - if (! opj_procedure_list_add_procedure(p_j2k->m_procedure_list, - (opj_procedure)opj_j2k_write_cod, p_manager)) { - return OPJ_FALSE; - } - if (! opj_procedure_list_add_procedure(p_j2k->m_procedure_list, - (opj_procedure)opj_j2k_write_qcd, p_manager)) { - return OPJ_FALSE; - } - if (! opj_procedure_list_add_procedure(p_j2k->m_procedure_list, - (opj_procedure)opj_j2k_write_all_coc, p_manager)) { - return OPJ_FALSE; - } - if (! opj_procedure_list_add_procedure(p_j2k->m_procedure_list, - (opj_procedure)opj_j2k_write_all_qcc, p_manager)) { - return OPJ_FALSE; - } - - if (OPJ_IS_CINEMA(p_j2k->m_cp.rsiz)) { - if (! opj_procedure_list_add_procedure(p_j2k->m_procedure_list, - (opj_procedure)opj_j2k_write_tlm, p_manager)) { - return OPJ_FALSE; - } - - if (p_j2k->m_cp.rsiz == OPJ_PROFILE_CINEMA_4K) { - if (! opj_procedure_list_add_procedure(p_j2k->m_procedure_list, - (opj_procedure)opj_j2k_write_poc, p_manager)) { - return OPJ_FALSE; - } - } - } - - if (! opj_procedure_list_add_procedure(p_j2k->m_procedure_list, - (opj_procedure)opj_j2k_write_regions, p_manager)) { - return OPJ_FALSE; - } - - if (p_j2k->m_cp.comment != 00) { - if (! opj_procedure_list_add_procedure(p_j2k->m_procedure_list, - (opj_procedure)opj_j2k_write_com, p_manager)) { - return OPJ_FALSE; - } - } - - /* DEVELOPER CORNER, insert your custom procedures */ - if (p_j2k->m_cp.rsiz & OPJ_EXTENSION_MCT) { - if (! opj_procedure_list_add_procedure(p_j2k->m_procedure_list, - (opj_procedure)opj_j2k_write_mct_data_group, p_manager)) { - return OPJ_FALSE; - } - } - /* End of Developer Corner */ - - if (p_j2k->cstr_index) { - if (! opj_procedure_list_add_procedure(p_j2k->m_procedure_list, - (opj_procedure)opj_j2k_get_end_header, p_manager)) { - return OPJ_FALSE; - } - } - - if (! opj_procedure_list_add_procedure(p_j2k->m_procedure_list, - (opj_procedure)opj_j2k_create_tcd, p_manager)) { - return OPJ_FALSE; - } - if (! 
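/*
 * Illustrative sketch, editor's addition: the procedure-list pattern behind
 * opj_j2k_setup_header_writing() above. Marker writers are queued in
 * codestream order (SOC, SIZ, COD, QCD, ...) and opj_j2k_exec() later runs
 * the list, stopping at the first writer that fails. A simplified shape of
 * that execution loop (the real opj_j2k_exec() is not shown in this file):
 *
 *   OPJ_UINT32 i;
 *   for (i = 0; i < nb_procedures; ++i) {         // in queue order
 *       if (!procedures[i](p_j2k, p_stream, p_manager)) {
 *           return OPJ_FALSE;                     // abort on first failure
 *       }
 *   }
 *   return OPJ_TRUE;
 */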
opj_procedure_list_add_procedure(p_j2k->m_procedure_list, - (opj_procedure)opj_j2k_update_rates, p_manager)) { - return OPJ_FALSE; - } - - return OPJ_TRUE; -} - -static OPJ_BOOL opj_j2k_write_first_tile_part(opj_j2k_t *p_j2k, - OPJ_BYTE * p_data, - OPJ_UINT32 * p_data_written, - OPJ_UINT32 p_total_data_size, - opj_stream_private_t *p_stream, - struct opj_event_mgr * p_manager) -{ - OPJ_UINT32 l_nb_bytes_written = 0; - OPJ_UINT32 l_current_nb_bytes_written; - OPJ_BYTE * l_begin_data = 00; - - opj_tcd_t * l_tcd = 00; - opj_cp_t * l_cp = 00; - - l_tcd = p_j2k->m_tcd; - l_cp = &(p_j2k->m_cp); - - l_tcd->cur_pino = 0; - - /*Get number of tile parts*/ - p_j2k->m_specific_param.m_encoder.m_current_poc_tile_part_number = 0; - - /* INDEX >> */ - /* << INDEX */ - - l_current_nb_bytes_written = 0; - l_begin_data = p_data; - if (! opj_j2k_write_sot(p_j2k, p_data, p_total_data_size, - &l_current_nb_bytes_written, p_stream, - p_manager)) { - return OPJ_FALSE; - } - - l_nb_bytes_written += l_current_nb_bytes_written; - p_data += l_current_nb_bytes_written; - p_total_data_size -= l_current_nb_bytes_written; - - if (!OPJ_IS_CINEMA(l_cp->rsiz)) { -#if 0 - for (compno = 1; compno < p_j2k->m_private_image->numcomps; compno++) { - l_current_nb_bytes_written = 0; - opj_j2k_write_coc_in_memory(p_j2k, compno, p_data, &l_current_nb_bytes_written, - p_manager); - l_nb_bytes_written += l_current_nb_bytes_written; - p_data += l_current_nb_bytes_written; - p_total_data_size -= l_current_nb_bytes_written; - - l_current_nb_bytes_written = 0; - opj_j2k_write_qcc_in_memory(p_j2k, compno, p_data, &l_current_nb_bytes_written, - p_manager); - l_nb_bytes_written += l_current_nb_bytes_written; - p_data += l_current_nb_bytes_written; - p_total_data_size -= l_current_nb_bytes_written; - } -#endif - if (l_cp->tcps[p_j2k->m_current_tile_number].numpocs) { - l_current_nb_bytes_written = 0; - opj_j2k_write_poc_in_memory(p_j2k, p_data, &l_current_nb_bytes_written, - p_manager); - l_nb_bytes_written += l_current_nb_bytes_written; - p_data += l_current_nb_bytes_written; - p_total_data_size -= l_current_nb_bytes_written; - } - } - - l_current_nb_bytes_written = 0; - if (! 
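/*
 * Illustrative sketch, editor's addition: the byte-budget bookkeeping used
 * throughout opj_j2k_write_first_tile_part() above. Each marker writer
 * reports how many bytes it produced, and the cursor/budget pair is advanced
 * so the next writer sees only the remaining room. write_marker() is a
 * hypothetical stand-in for opj_j2k_write_sot() and friends.
 *
 *   OPJ_UINT32 written = 0;
 *   if (!write_marker(p_data, p_total_data_size, &written)) {
 *       return OPJ_FALSE;
 *   }
 *   l_nb_bytes_written += written;   // grand total, later patched into Psot
 *   p_data             += written;   // next marker starts right after
 *   p_total_data_size  -= written;   // shrink the remaining budget
 */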
opj_j2k_write_sod(p_j2k, l_tcd, p_data, &l_current_nb_bytes_written, - p_total_data_size, p_stream, p_manager)) { - return OPJ_FALSE; - } - - l_nb_bytes_written += l_current_nb_bytes_written; - * p_data_written = l_nb_bytes_written; - - /* Writing Psot in SOT marker */ - opj_write_bytes(l_begin_data + 6, l_nb_bytes_written, - 4); /* PSOT */ - - if (OPJ_IS_CINEMA(l_cp->rsiz)) { - opj_j2k_update_tlm(p_j2k, l_nb_bytes_written); - } - - return OPJ_TRUE; -} - -static OPJ_BOOL opj_j2k_write_all_tile_parts(opj_j2k_t *p_j2k, - OPJ_BYTE * p_data, - OPJ_UINT32 * p_data_written, - OPJ_UINT32 p_total_data_size, - opj_stream_private_t *p_stream, - struct opj_event_mgr * p_manager - ) -{ - OPJ_UINT32 tilepartno = 0; - OPJ_UINT32 l_nb_bytes_written = 0; - OPJ_UINT32 l_current_nb_bytes_written; - OPJ_UINT32 l_part_tile_size; - OPJ_UINT32 tot_num_tp; - OPJ_UINT32 pino; - - OPJ_BYTE * l_begin_data; - opj_tcp_t *l_tcp = 00; - opj_tcd_t * l_tcd = 00; - opj_cp_t * l_cp = 00; - - l_tcd = p_j2k->m_tcd; - l_cp = &(p_j2k->m_cp); - l_tcp = l_cp->tcps + p_j2k->m_current_tile_number; - - /*Get number of tile parts*/ - tot_num_tp = opj_j2k_get_num_tp(l_cp, 0, p_j2k->m_current_tile_number); - - /* start writing remaining tile parts */ - ++p_j2k->m_specific_param.m_encoder.m_current_tile_part_number; - for (tilepartno = 1; tilepartno < tot_num_tp ; ++tilepartno) { - p_j2k->m_specific_param.m_encoder.m_current_poc_tile_part_number = tilepartno; - l_current_nb_bytes_written = 0; - l_part_tile_size = 0; - l_begin_data = p_data; - - if (! opj_j2k_write_sot(p_j2k, p_data, - p_total_data_size, - &l_current_nb_bytes_written, - p_stream, - p_manager)) { - return OPJ_FALSE; - } - - l_nb_bytes_written += l_current_nb_bytes_written; - p_data += l_current_nb_bytes_written; - p_total_data_size -= l_current_nb_bytes_written; - l_part_tile_size += l_current_nb_bytes_written; - - l_current_nb_bytes_written = 0; - if (! opj_j2k_write_sod(p_j2k, l_tcd, p_data, &l_current_nb_bytes_written, - p_total_data_size, p_stream, p_manager)) { - return OPJ_FALSE; - } - - p_data += l_current_nb_bytes_written; - l_nb_bytes_written += l_current_nb_bytes_written; - p_total_data_size -= l_current_nb_bytes_written; - l_part_tile_size += l_current_nb_bytes_written; - - /* Writing Psot in SOT marker */ - opj_write_bytes(l_begin_data + 6, l_part_tile_size, - 4); /* PSOT */ - - if (OPJ_IS_CINEMA(l_cp->rsiz)) { - opj_j2k_update_tlm(p_j2k, l_part_tile_size); - } - - ++p_j2k->m_specific_param.m_encoder.m_current_tile_part_number; - } - - for (pino = 1; pino <= l_tcp->numpocs; ++pino) { - l_tcd->cur_pino = pino; - - /*Get number of tile parts*/ - tot_num_tp = opj_j2k_get_num_tp(l_cp, pino, p_j2k->m_current_tile_number); - for (tilepartno = 0; tilepartno < tot_num_tp ; ++tilepartno) { - p_j2k->m_specific_param.m_encoder.m_current_poc_tile_part_number = tilepartno; - l_current_nb_bytes_written = 0; - l_part_tile_size = 0; - l_begin_data = p_data; - - if (! opj_j2k_write_sot(p_j2k, p_data, - p_total_data_size, - &l_current_nb_bytes_written, p_stream, - p_manager)) { - return OPJ_FALSE; - } - - l_nb_bytes_written += l_current_nb_bytes_written; - p_data += l_current_nb_bytes_written; - p_total_data_size -= l_current_nb_bytes_written; - l_part_tile_size += l_current_nb_bytes_written; - - l_current_nb_bytes_written = 0; - - if (! 
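/*
 * Illustrative sketch, editor's addition: the Psot back-patch performed in
 * both tile-part loops above. The SOT marker is written before the tile-part
 * length is known, so l_begin_data remembers where the tile part starts and
 * the 4-byte Psot field is overwritten once SOD and the packet data are out.
 * Offset 6 = 2-byte SOT marker + 2-byte Lsot + 2-byte Isot.
 *
 *   OPJ_BYTE *tp_start = p_data;                  // before opj_j2k_write_sot()
 *   // ... write SOT, then SOD and the compressed packet data ...
 *   opj_write_bytes(tp_start + 6, part_size, 4);  // patch Psot (big-endian)
 */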
opj_j2k_write_sod(p_j2k, l_tcd, p_data, &l_current_nb_bytes_written, - p_total_data_size, p_stream, p_manager)) { - return OPJ_FALSE; - } - - l_nb_bytes_written += l_current_nb_bytes_written; - p_data += l_current_nb_bytes_written; - p_total_data_size -= l_current_nb_bytes_written; - l_part_tile_size += l_current_nb_bytes_written; - - /* Writing Psot in SOT marker */ - opj_write_bytes(l_begin_data + 6, l_part_tile_size, - 4); /* PSOT */ - - if (OPJ_IS_CINEMA(l_cp->rsiz)) { - opj_j2k_update_tlm(p_j2k, l_part_tile_size); - } - - ++p_j2k->m_specific_param.m_encoder.m_current_tile_part_number; - } - } - - *p_data_written = l_nb_bytes_written; - - return OPJ_TRUE; -} - -static OPJ_BOOL opj_j2k_write_updated_tlm(opj_j2k_t *p_j2k, - struct opj_stream_private *p_stream, - struct opj_event_mgr * p_manager) -{ - OPJ_UINT32 l_tlm_size; - OPJ_OFF_T l_tlm_position, l_current_position; - - /* preconditions */ - assert(p_j2k != 00); - assert(p_manager != 00); - assert(p_stream != 00); - - l_tlm_size = 5 * p_j2k->m_specific_param.m_encoder.m_total_tile_parts; - l_tlm_position = 6 + p_j2k->m_specific_param.m_encoder.m_tlm_start; - l_current_position = opj_stream_tell(p_stream); - - if (! opj_stream_seek(p_stream, l_tlm_position, p_manager)) { - return OPJ_FALSE; - } - - if (opj_stream_write_data(p_stream, - p_j2k->m_specific_param.m_encoder.m_tlm_sot_offsets_buffer, l_tlm_size, - p_manager) != l_tlm_size) { - return OPJ_FALSE; - } - - if (! opj_stream_seek(p_stream, l_current_position, p_manager)) { - return OPJ_FALSE; - } - - return OPJ_TRUE; -} - -static OPJ_BOOL opj_j2k_end_encoding(opj_j2k_t *p_j2k, - struct opj_stream_private *p_stream, - struct opj_event_mgr * p_manager) -{ - /* preconditions */ - assert(p_j2k != 00); - assert(p_manager != 00); - assert(p_stream != 00); - - OPJ_UNUSED(p_stream); - OPJ_UNUSED(p_manager); - - opj_tcd_destroy(p_j2k->m_tcd); - p_j2k->m_tcd = 00; - - if (p_j2k->m_specific_param.m_encoder.m_tlm_sot_offsets_buffer) { - opj_free(p_j2k->m_specific_param.m_encoder.m_tlm_sot_offsets_buffer); - p_j2k->m_specific_param.m_encoder.m_tlm_sot_offsets_buffer = 0; - p_j2k->m_specific_param.m_encoder.m_tlm_sot_offsets_current = 0; - } - - if (p_j2k->m_specific_param.m_encoder.m_encoded_tile_data) { - opj_free(p_j2k->m_specific_param.m_encoder.m_encoded_tile_data); - p_j2k->m_specific_param.m_encoder.m_encoded_tile_data = 0; - } - - p_j2k->m_specific_param.m_encoder.m_encoded_tile_size = 0; - - return OPJ_TRUE; -} - -/** - * Destroys the memory associated with the decoding of headers. 
- */ -static OPJ_BOOL opj_j2k_destroy_header_memory(opj_j2k_t * p_j2k, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager - ) -{ - /* preconditions */ - assert(p_j2k != 00); - assert(p_stream != 00); - assert(p_manager != 00); - - OPJ_UNUSED(p_stream); - OPJ_UNUSED(p_manager); - - if (p_j2k->m_specific_param.m_encoder.m_header_tile_data) { - opj_free(p_j2k->m_specific_param.m_encoder.m_header_tile_data); - p_j2k->m_specific_param.m_encoder.m_header_tile_data = 0; - } - - p_j2k->m_specific_param.m_encoder.m_header_tile_data_size = 0; - - return OPJ_TRUE; -} - -static OPJ_BOOL opj_j2k_init_info(opj_j2k_t *p_j2k, - struct opj_stream_private *p_stream, - struct opj_event_mgr * p_manager) -{ - opj_codestream_info_t * l_cstr_info = 00; - - /* preconditions */ - assert(p_j2k != 00); - assert(p_manager != 00); - assert(p_stream != 00); - (void)l_cstr_info; - - OPJ_UNUSED(p_stream); - - /* TODO mergeV2: check this part which use cstr_info */ - /*l_cstr_info = p_j2k->cstr_info; - - if (l_cstr_info) { - OPJ_UINT32 compno; - l_cstr_info->tile = (opj_tile_info_t *) opj_malloc(p_j2k->m_cp.tw * p_j2k->m_cp.th * sizeof(opj_tile_info_t)); - - l_cstr_info->image_w = p_j2k->m_image->x1 - p_j2k->m_image->x0; - l_cstr_info->image_h = p_j2k->m_image->y1 - p_j2k->m_image->y0; - - l_cstr_info->prog = (&p_j2k->m_cp.tcps[0])->prg; - - l_cstr_info->tw = p_j2k->m_cp.tw; - l_cstr_info->th = p_j2k->m_cp.th; - - l_cstr_info->tile_x = p_j2k->m_cp.tdx;*/ /* new version parser */ - /*l_cstr_info->tile_y = p_j2k->m_cp.tdy;*/ /* new version parser */ - /*l_cstr_info->tile_Ox = p_j2k->m_cp.tx0;*/ /* new version parser */ - /*l_cstr_info->tile_Oy = p_j2k->m_cp.ty0;*/ /* new version parser */ - - /*l_cstr_info->numcomps = p_j2k->m_image->numcomps; - - l_cstr_info->numlayers = (&p_j2k->m_cp.tcps[0])->numlayers; - - l_cstr_info->numdecompos = (OPJ_INT32*) opj_malloc(p_j2k->m_image->numcomps * sizeof(OPJ_INT32)); - - for (compno=0; compno < p_j2k->m_image->numcomps; compno++) { - l_cstr_info->numdecompos[compno] = (&p_j2k->m_cp.tcps[0])->tccps->numresolutions - 1; - } - - l_cstr_info->D_max = 0.0; */ /* ADD Marcela */ - - /*l_cstr_info->main_head_start = opj_stream_tell(p_stream);*/ /* position of SOC */ - - /*l_cstr_info->maxmarknum = 100; - l_cstr_info->marker = (opj_marker_info_t *) opj_malloc(l_cstr_info->maxmarknum * sizeof(opj_marker_info_t)); - l_cstr_info->marknum = 0; - }*/ - - return opj_j2k_calculate_tp(p_j2k, &(p_j2k->m_cp), - &p_j2k->m_specific_param.m_encoder.m_total_tile_parts, p_j2k->m_private_image, - p_manager); -} - -/** - * Creates a tile-coder encoder. - * - * @param p_stream the stream to write data to. - * @param p_j2k J2K codec. - * @param p_manager the user event manager. -*/ -static OPJ_BOOL opj_j2k_create_tcd(opj_j2k_t *p_j2k, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager - ) -{ - /* preconditions */ - assert(p_j2k != 00); - assert(p_manager != 00); - assert(p_stream != 00); - - OPJ_UNUSED(p_stream); - - p_j2k->m_tcd = opj_tcd_create(OPJ_FALSE); - - if (! p_j2k->m_tcd) { - opj_event_msg(p_manager, EVT_ERROR, "Not enough memory to create Tile Coder\n"); - return OPJ_FALSE; - } - - if (!opj_tcd_init(p_j2k->m_tcd, p_j2k->m_private_image, &p_j2k->m_cp, - p_j2k->m_tp)) { - opj_tcd_destroy(p_j2k->m_tcd); - p_j2k->m_tcd = 00; - return OPJ_FALSE; - } - - return OPJ_TRUE; -} - -OPJ_BOOL opj_j2k_write_tile(opj_j2k_t * p_j2k, - OPJ_UINT32 p_tile_index, - OPJ_BYTE * p_data, - OPJ_UINT32 p_data_size, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager) -{ - if (! 
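/*
 * Illustrative sketch, editor's addition: the seek-back that
 * opj_j2k_write_updated_tlm() performs above. Each TLM entry is 5 bytes
 * (assumed here to be a 1-byte Ttlm plus a 4-byte Ptlm, consistent with the
 * 5 * m_total_tile_parts size in the code) and the entries start 6 bytes
 * after the marker, hence l_tlm_position = 6 + m_tlm_start.
 *
 *   OPJ_OFF_T saved = opj_stream_tell(p_stream);          // current position
 *   if (!opj_stream_seek(p_stream, tlm_start + 6, p_manager)) {
 *       return OPJ_FALSE;                                 // back to 1st entry
 *   }
 *   opj_stream_write_data(p_stream, tlm_entries,          // rewrite entries
 *                         5 * nb_tile_parts, p_manager);
 *   return opj_stream_seek(p_stream, saved, p_manager);   // resume at the end
 */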
opj_j2k_pre_write_tile(p_j2k, p_tile_index, p_stream, p_manager)) { - opj_event_msg(p_manager, EVT_ERROR, - "Error while opj_j2k_pre_write_tile with tile index = %d\n", p_tile_index); - return OPJ_FALSE; - } else { - OPJ_UINT32 j; - /* Allocate data */ - for (j = 0; j < p_j2k->m_tcd->image->numcomps; ++j) { - opj_tcd_tilecomp_t* l_tilec = p_j2k->m_tcd->tcd_image->tiles->comps + j; - - if (! opj_alloc_tile_component_data(l_tilec)) { - opj_event_msg(p_manager, EVT_ERROR, "Error allocating tile component data."); - return OPJ_FALSE; - } - } - - /* now copy data into the tile component */ - if (! opj_tcd_copy_tile_data(p_j2k->m_tcd, p_data, p_data_size)) { - opj_event_msg(p_manager, EVT_ERROR, - "Size mismatch between tile data and sent data."); - return OPJ_FALSE; - } - if (! opj_j2k_post_write_tile(p_j2k, p_stream, p_manager)) { - opj_event_msg(p_manager, EVT_ERROR, - "Error while opj_j2k_post_write_tile with tile index = %d\n", p_tile_index); - return OPJ_FALSE; - } - } - - return OPJ_TRUE; -} diff --git a/src/3rd/LibOpenJpeg/j2k.h b/src/3rd/LibOpenJpeg/j2k.h deleted file mode 100644 index 5d393c98..00000000 --- a/src/3rd/LibOpenJpeg/j2k.h +++ /dev/null @@ -1,880 +0,0 @@ -/* - * The copyright in this software is being made available under the 2-clauses - * BSD License, included below. This software may be subject to other third - * party and contributor rights, including patent rights, and no such rights - * are granted under this license. - * - * Copyright (c) 2002-2014, Universite catholique de Louvain (UCL), Belgium - * Copyright (c) 2002-2014, Professor Benoit Macq - * Copyright (c) 2001-2003, David Janssens - * Copyright (c) 2002-2003, Yannick Verschueren - * Copyright (c) 2003-2007, Francois-Olivier Devaux - * Copyright (c) 2003-2014, Antonin Descampe - * Copyright (c) 2005, Herve Drolon, FreeImage Team - * Copyright (c) 2006-2007, Parvatha Elangovan - * Copyright (c) 2008, Jerome Fimes, Communications & Systemes - * Copyright (c) 2011-2012, Centre National d'Etudes Spatiales (CNES), France - * Copyright (c) 2012, CS Systemes d'Information, France - * - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS' - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- */ -#ifndef OPJ_J2K_H -#define OPJ_J2K_H -/** -@file j2k.h -@brief The JPEG-2000 Codestream Reader/Writer (J2K) - -The functions in J2K.C have for goal to read/write the several parts of the codestream: markers and data. -*/ - -/** @defgroup J2K J2K - JPEG-2000 codestream reader/writer */ -/*@{*/ - -#define J2K_CP_CSTY_PRT 0x01 -#define J2K_CP_CSTY_SOP 0x02 -#define J2K_CP_CSTY_EPH 0x04 -#define J2K_CCP_CSTY_PRT 0x01 -#define J2K_CCP_CBLKSTY_LAZY 0x01 /**< Selective arithmetic coding bypass */ -#define J2K_CCP_CBLKSTY_RESET 0x02 /**< Reset context probabilities on coding pass boundaries */ -#define J2K_CCP_CBLKSTY_TERMALL 0x04 /**< Termination on each coding pass */ -#define J2K_CCP_CBLKSTY_VSC 0x08 /**< Vertically stripe causal context */ -#define J2K_CCP_CBLKSTY_PTERM 0x10 /**< Predictable termination */ -#define J2K_CCP_CBLKSTY_SEGSYM 0x20 /**< Segmentation symbols are used */ -#define J2K_CCP_QNTSTY_NOQNT 0 -#define J2K_CCP_QNTSTY_SIQNT 1 -#define J2K_CCP_QNTSTY_SEQNT 2 - -/* ----------------------------------------------------------------------- */ - -#define J2K_MS_SOC 0xff4f /**< SOC marker value */ -#define J2K_MS_SOT 0xff90 /**< SOT marker value */ -#define J2K_MS_SOD 0xff93 /**< SOD marker value */ -#define J2K_MS_EOC 0xffd9 /**< EOC marker value */ -#define J2K_MS_SIZ 0xff51 /**< SIZ marker value */ -#define J2K_MS_COD 0xff52 /**< COD marker value */ -#define J2K_MS_COC 0xff53 /**< COC marker value */ -#define J2K_MS_RGN 0xff5e /**< RGN marker value */ -#define J2K_MS_QCD 0xff5c /**< QCD marker value */ -#define J2K_MS_QCC 0xff5d /**< QCC marker value */ -#define J2K_MS_POC 0xff5f /**< POC marker value */ -#define J2K_MS_TLM 0xff55 /**< TLM marker value */ -#define J2K_MS_PLM 0xff57 /**< PLM marker value */ -#define J2K_MS_PLT 0xff58 /**< PLT marker value */ -#define J2K_MS_PPM 0xff60 /**< PPM marker value */ -#define J2K_MS_PPT 0xff61 /**< PPT marker value */ -#define J2K_MS_SOP 0xff91 /**< SOP marker value */ -#define J2K_MS_EPH 0xff92 /**< EPH marker value */ -#define J2K_MS_CRG 0xff63 /**< CRG marker value */ -#define J2K_MS_COM 0xff64 /**< COM marker value */ -#define J2K_MS_CBD 0xff78 /**< CBD marker value */ -#define J2K_MS_MCC 0xff75 /**< MCC marker value */ -#define J2K_MS_MCT 0xff74 /**< MCT marker value */ -#define J2K_MS_MCO 0xff77 /**< MCO marker value */ - -#define J2K_MS_UNK 0 /**< UNKNOWN marker value */ - -/* UniPG>> */ -#ifdef USE_JPWL -#define J2K_MS_EPC 0xff68 /**< EPC marker value (Part 11: JPEG 2000 for Wireless) */ -#define J2K_MS_EPB 0xff66 /**< EPB marker value (Part 11: JPEG 2000 for Wireless) */ -#define J2K_MS_ESD 0xff67 /**< ESD marker value (Part 11: JPEG 2000 for Wireless) */ -#define J2K_MS_RED 0xff69 /**< RED marker value (Part 11: JPEG 2000 for Wireless) */ -#endif /* USE_JPWL */ -#ifdef USE_JPSEC -#define J2K_MS_SEC 0xff65 /**< SEC marker value (Part 8: Secure JPEG 2000) */ -#define J2K_MS_INSEC 0xff94 /**< INSEC marker value (Part 8: Secure JPEG 2000) */ -#endif /* USE_JPSEC */ -/* < Zppx not read yet */ - OPJ_UINT32 m_data_size; -} opj_ppx; - -/** -Tile coding parameters : -this structure is used to store coding/decoding parameters common to all -tiles (information like COD, COC in main header) -*/ -typedef struct opj_tcp { - /** coding style */ - OPJ_UINT32 csty; - /** progression order */ - OPJ_PROG_ORDER prg; - /** number of layers */ - OPJ_UINT32 numlayers; - OPJ_UINT32 num_layers_to_decode; - /** multi-component transform identifier */ - OPJ_UINT32 mct; - /** rates of layers */ - OPJ_FLOAT32 rates[100]; - /** number of progression 
order changes */ - OPJ_UINT32 numpocs; - /** progression order changes */ - opj_poc_t pocs[J2K_MAX_POCS]; - - /** number of ppt markers (reserved size) */ - OPJ_UINT32 ppt_markers_count; - /** ppt markers data (table indexed by Zppt) */ - opj_ppx* ppt_markers; - - /** packet headers stored there for future use in t2_decode_packet */ - OPJ_BYTE *ppt_data; - /** used to keep track of the allocated memory */ - OPJ_BYTE *ppt_buffer; - /** Number of bytes stored inside ppt_data*/ - OPJ_UINT32 ppt_data_size; - /** size of ppt_data*/ - OPJ_UINT32 ppt_len; - /** add fixed_quality */ - OPJ_FLOAT32 distoratio[100]; - /** tile-component coding parameters */ - opj_tccp_t *tccps; - /** current tile part number or -1 if first time into this tile */ - OPJ_INT32 m_current_tile_part_number; - /** number of tile parts for the tile. */ - OPJ_UINT32 m_nb_tile_parts; - /** data for the tile */ - OPJ_BYTE * m_data; - /** size of data */ - OPJ_UINT32 m_data_size; - /** encoding norms */ - OPJ_FLOAT64 * mct_norms; - /** the mct decoding matrix */ - OPJ_FLOAT32 * m_mct_decoding_matrix; - /** the mct coding matrix */ - OPJ_FLOAT32 * m_mct_coding_matrix; - /** mct records */ - opj_mct_data_t * m_mct_records; - /** the number of mct records. */ - OPJ_UINT32 m_nb_mct_records; - /** the max number of mct records. */ - OPJ_UINT32 m_nb_max_mct_records; - /** mcc records */ - opj_simple_mcc_decorrelation_data_t * m_mcc_records; - /** the number of mcc records. */ - OPJ_UINT32 m_nb_mcc_records; - /** the max number of mcc records. */ - OPJ_UINT32 m_nb_max_mcc_records; - - - /***** FLAGS *******/ - /** If cod == 1 --> there was a COD marker for the present tile */ - OPJ_BITFIELD cod : 1; - /** If ppt == 1 --> there was a PPT marker for the present tile */ - OPJ_BITFIELD ppt : 1; - /** indicates if a POC marker has been used 0:NO, 1:YES */ - OPJ_BITFIELD POC : 1; -} opj_tcp_t; - - - - -typedef struct opj_encoding_param { - /** Maximum rate for each component.
If == 0, component size limitation is not considered */ - OPJ_UINT32 m_max_comp_size; - /** Position of tile part flag in progression order*/ - OPJ_INT32 m_tp_pos; - /** fixed layer */ - OPJ_INT32 *m_matrice; - /** Flag determining tile part generation*/ - OPJ_BYTE m_tp_flag; - /** allocation by rate/distortion */ - OPJ_BITFIELD m_disto_alloc : 1; - /** allocation by fixed layer */ - OPJ_BITFIELD m_fixed_alloc : 1; - /** add fixed_quality */ - OPJ_BITFIELD m_fixed_quality : 1; - /** Enabling Tile part generation*/ - OPJ_BITFIELD m_tp_on : 1; -} -opj_encoding_param_t; - -typedef struct opj_decoding_param { - /** if != 0, then original dimension divided by 2^(reduce); if == 0 or not used, image is decoded to the full resolution */ - OPJ_UINT32 m_reduce; - /** if != 0, then only the first "layer" layers are decoded; if == 0 or not used, all the quality layers are decoded */ - OPJ_UINT32 m_layer; -} -opj_decoding_param_t; - - -/** - * Coding parameters - */ -typedef struct opj_cp { - /** Size of the image in bits*/ - /*int img_size;*/ - /** Rsiz*/ - OPJ_UINT16 rsiz; - /** XTOsiz */ - OPJ_UINT32 tx0; /* MSD see norm */ - /** YTOsiz */ - OPJ_UINT32 ty0; /* MSD see norm */ - /** XTsiz */ - OPJ_UINT32 tdx; - /** YTsiz */ - OPJ_UINT32 tdy; - /** comment */ - OPJ_CHAR *comment; - /** number of tiles in width */ - OPJ_UINT32 tw; - /** number of tiles in height */ - OPJ_UINT32 th; - - /** number of ppm markers (reserved size) */ - OPJ_UINT32 ppm_markers_count; - /** ppm markers data (table indexed by Zppm) */ - opj_ppx* ppm_markers; - - /** packet headers stored there for future use in t2_decode_packet */ - OPJ_BYTE *ppm_data; - /** size of the ppm_data*/ - OPJ_UINT32 ppm_len; - /** number of bytes of ppm_data already read */ - OPJ_UINT32 ppm_data_read; - - OPJ_BYTE *ppm_data_current; - - /** packet header storage original buffer */ - OPJ_BYTE *ppm_buffer; - /** pointer remaining on the first byte of the first header if ppm is used */ - OPJ_BYTE *ppm_data_first; - /** Number of bytes actually stored inside the ppm_data */ - OPJ_UINT32 ppm_data_size; - /** used in case of multiple PPM markers (number of info entries already stored) */ - OPJ_INT32 ppm_store; - /** used in case of multiple PPM markers (case of a non-finished previous info entry) */ - OPJ_INT32 ppm_previous; - - /** tile coding parameters */ - opj_tcp_t *tcps; - - union { - opj_decoding_param_t m_dec; - opj_encoding_param_t m_enc; - } - m_specific_param; - - - /* UniPG>> */ -#ifdef USE_JPWL - /** enables writing of EPC in MH, thus activating JPWL */ - OPJ_BOOL epc_on; - /** enables writing of EPB, in case of activated JPWL */ - OPJ_BOOL epb_on; - /** enables writing of ESD, in case of activated JPWL */ - OPJ_BOOL esd_on; - /** enables writing of informative techniques of ESD, in case of activated JPWL */ - OPJ_BOOL info_on; - /** enables writing of RED, in case of activated JPWL */ - OPJ_BOOL red_on; - /** error protection method for MH (0,1,16,32,37-128) */ - int hprot_MH; - /** tile number of header protection specification (>=0) */ - int hprot_TPH_tileno[JPWL_MAX_NO_TILESPECS]; - /** error protection methods for TPHs (0,1,16,32,37-128) */ - int hprot_TPH[JPWL_MAX_NO_TILESPECS]; - /** tile number of packet protection specification (>=0) */ - int pprot_tileno[JPWL_MAX_NO_PACKSPECS]; - /** packet number of packet protection specification (>=0) */ - int pprot_packno[JPWL_MAX_NO_PACKSPECS]; - /** error protection methods for packets (0,1,16,32,37-128) */ - int pprot[JPWL_MAX_NO_PACKSPECS]; - /** enables writing of ESD, (0/2/4 bytes) */ - int sens_size; - /** sensitivity addressing
size (0=auto/2/4 bytes) */ - int sens_addr; - /** sensitivity range (0-3) */ - int sens_range; - /** sensitivity method for MH (-1,0-7) */ - int sens_MH; - /** tile number of sensitivity specification (>=0) */ - int sens_TPH_tileno[JPWL_MAX_NO_TILESPECS]; - /** sensitivity methods for TPHs (-1,0-7) */ - int sens_TPH[JPWL_MAX_NO_TILESPECS]; - /** enables JPWL correction at the decoder */ - OPJ_BOOL correct; - /** expected number of components at the decoder */ - int exp_comps; - /** maximum number of tiles at the decoder */ - OPJ_UINT32 max_tiles; -#endif /* USE_JPWL */ - - /******** FLAGS *********/ - /** if ppm == 1 --> there was a PPM marker*/ - OPJ_BITFIELD ppm : 1; - /** tells if the parameter is a coding or decoding one */ - OPJ_BITFIELD m_is_decoder : 1; - /** whether different bit depth or sign per component is allowed. Decoder only for now */ - OPJ_BITFIELD allow_different_bit_depth_sign : 1; - /* <cp. -@param j2k J2K decompressor handle -@param parameters decompression parameters -*/ -void opj_j2k_setup_decoder(opj_j2k_t *j2k, opj_dparameters_t *parameters); - -OPJ_BOOL opj_j2k_set_threads(opj_j2k_t *j2k, OPJ_UINT32 num_threads); - -/** - * Creates a J2K compression structure - * - * @return Returns a handle to a J2K compressor if successful, returns NULL otherwise -*/ -opj_j2k_t* opj_j2k_create_compress(void); - - -OPJ_BOOL opj_j2k_setup_encoder(opj_j2k_t *p_j2k, - opj_cparameters_t *parameters, - opj_image_t *image, - opj_event_mgr_t * p_manager); - -/** -Converts an enum type progression order to string type -*/ -const char *opj_j2k_convert_progression_order(OPJ_PROG_ORDER prg_order); - -/* ----------------------------------------------------------------------- */ -/*@}*/ - -/*@}*/ - -/** - * Ends the decompression procedures and possibly adds data to be read after the - * codestream. - */ -OPJ_BOOL opj_j2k_end_decompress(opj_j2k_t *j2k, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager); - -/** - * Reads a jpeg2000 codestream header structure. - * - * @param p_stream the stream to read data from. - * @param p_j2k the jpeg2000 codec. - * @param p_image FIXME DOC - * @param p_manager the user event manager. - * - * @return true if the header is valid. - */ -OPJ_BOOL opj_j2k_read_header(opj_stream_private_t *p_stream, - opj_j2k_t* p_j2k, - opj_image_t** p_image, - opj_event_mgr_t* p_manager); - - -/** - * Destroys a jpeg2000 codec. - * - * @param p_j2k the jpeg2000 structure to destroy. - */ -void opj_j2k_destroy(opj_j2k_t *p_j2k); - -/** - * Destroys a codestream index structure. - * - * @param p_cstr_ind the codestream index parameter to destroy. - */ -void j2k_destroy_cstr_index(opj_codestream_index_t *p_cstr_ind); - -/** - * Decode tile data. - * @param p_j2k the jpeg2000 codec. - * @param p_tile_index - * @param p_data FIXME DOC - * @param p_data_size FIXME DOC - * @param p_stream the stream to write data to. - * @param p_manager the user event manager. - */ -OPJ_BOOL opj_j2k_decode_tile(opj_j2k_t * p_j2k, - OPJ_UINT32 p_tile_index, - OPJ_BYTE * p_data, - OPJ_UINT32 p_data_size, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager); - -/** - * Reads a tile header. - * @param p_j2k the jpeg2000 codec. - * @param p_tile_index FIXME DOC - * @param p_data_size FIXME DOC - * @param p_tile_x0 FIXME DOC - * @param p_tile_y0 FIXME DOC - * @param p_tile_x1 FIXME DOC - * @param p_tile_y1 FIXME DOC - * @param p_nb_comps FIXME DOC - * @param p_go_on FIXME DOC - * @param p_stream the stream to write data to. - * @param p_manager the user event manager.
- */ -OPJ_BOOL opj_j2k_read_tile_header(opj_j2k_t * p_j2k, - OPJ_UINT32 * p_tile_index, - OPJ_UINT32 * p_data_size, - OPJ_INT32 * p_tile_x0, - OPJ_INT32 * p_tile_y0, - OPJ_INT32 * p_tile_x1, - OPJ_INT32 * p_tile_y1, - OPJ_UINT32 * p_nb_comps, - OPJ_BOOL * p_go_on, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager); - - -/** Sets the indices of the components to decode. - * - * @param p_j2k the jpeg2000 codec. - * @param numcomps Number of components to decode. - * @param comps_indices Array of num_compts indices (numbering starting at 0) - * corresponding to the components to decode. - * @param p_manager Event manager - * - * @return OPJ_TRUE in case of success. - */ -OPJ_BOOL opj_j2k_set_decoded_components(opj_j2k_t *p_j2k, - OPJ_UINT32 numcomps, - const OPJ_UINT32* comps_indices, - opj_event_mgr_t * p_manager); - -/** - * Sets the given area to be decoded. This function should be called right after opj_read_header and before any tile header reading. - * - * @param p_j2k the jpeg2000 codec. - * @param p_image FIXME DOC - * @param p_start_x the left position of the rectangle to decode (in image coordinates). - * @param p_start_y the up position of the rectangle to decode (in image coordinates). - * @param p_end_x the right position of the rectangle to decode (in image coordinates). - * @param p_end_y the bottom position of the rectangle to decode (in image coordinates). - * @param p_manager the user event manager - * - * @return true if the area could be set. - */ -OPJ_BOOL opj_j2k_set_decode_area(opj_j2k_t *p_j2k, - opj_image_t* p_image, - OPJ_INT32 p_start_x, OPJ_INT32 p_start_y, - OPJ_INT32 p_end_x, OPJ_INT32 p_end_y, - opj_event_mgr_t * p_manager); - -/** - * Creates a J2K decompression structure. - * - * @return a handle to a J2K decompressor if successful, NULL otherwise. - */ -opj_j2k_t* opj_j2k_create_decompress(void); - - -/** - * Dump some elements from the J2K decompression structure . - * - *@param p_j2k the jpeg2000 codec. - *@param flag flag to describe what elements are dump. - *@param out_stream output stream where dump the elements. - * -*/ -void j2k_dump(opj_j2k_t* p_j2k, OPJ_INT32 flag, FILE* out_stream); - - - -/** - * Dump an image header structure. - * - *@param image the image header to dump. - *@param dev_dump_flag flag to describe if we are in the case of this function is use outside j2k_dump function - *@param out_stream output stream where dump the elements. - */ -void j2k_dump_image_header(opj_image_t* image, OPJ_BOOL dev_dump_flag, - FILE* out_stream); - -/** - * Dump a component image header structure. - * - *@param comp the component image header to dump. - *@param dev_dump_flag flag to describe if we are in the case of this function is use outside j2k_dump function - *@param out_stream output stream where dump the elements. - */ -void j2k_dump_image_comp_header(opj_image_comp_t* comp, OPJ_BOOL dev_dump_flag, - FILE* out_stream); - -/** - * Get the codestream info from a JPEG2000 codec. - * - *@param p_j2k the component image header to dump. - * - *@return the codestream information extract from the jpg2000 codec - */ -opj_codestream_info_v2_t* j2k_get_cstr_info(opj_j2k_t* p_j2k); - -/** - * Get the codestream index from a JPEG2000 codec. - * - *@param p_j2k the component image header to dump. 
- * - *@return the codestream index extract from the jpg2000 codec - */ -opj_codestream_index_t* j2k_get_cstr_index(opj_j2k_t* p_j2k); - -/** - * Decode an image from a JPEG-2000 codestream - * @param j2k J2K decompressor handle - * @param p_stream FIXME DOC - * @param p_image FIXME DOC - * @param p_manager FIXME DOC - * @return FIXME DOC -*/ -OPJ_BOOL opj_j2k_decode(opj_j2k_t *j2k, - opj_stream_private_t *p_stream, - opj_image_t *p_image, - opj_event_mgr_t *p_manager); - - -OPJ_BOOL opj_j2k_get_tile(opj_j2k_t *p_j2k, - opj_stream_private_t *p_stream, - opj_image_t* p_image, - opj_event_mgr_t * p_manager, - OPJ_UINT32 tile_index); - -OPJ_BOOL opj_j2k_set_decoded_resolution_factor(opj_j2k_t *p_j2k, - OPJ_UINT32 res_factor, - opj_event_mgr_t * p_manager); - - -/** - * Writes a tile. - * @param p_j2k the jpeg2000 codec. - * @param p_tile_index FIXME DOC - * @param p_data FIXME DOC - * @param p_data_size FIXME DOC - * @param p_stream the stream to write data to. - * @param p_manager the user event manager. - */ -OPJ_BOOL opj_j2k_write_tile(opj_j2k_t * p_j2k, - OPJ_UINT32 p_tile_index, - OPJ_BYTE * p_data, - OPJ_UINT32 p_data_size, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager); - -/** - * Encodes an image into a JPEG-2000 codestream - */ -OPJ_BOOL opj_j2k_encode(opj_j2k_t * p_j2k, - opj_stream_private_t *cio, - opj_event_mgr_t * p_manager); - -/** - * Starts a compression scheme, i.e. validates the codec parameters, writes the header. - * - * @param p_j2k the jpeg2000 codec. - * @param p_stream the stream object. - * @param p_image FIXME DOC - * @param p_manager the user event manager. - * - * @return true if the codec is valid. - */ -OPJ_BOOL opj_j2k_start_compress(opj_j2k_t *p_j2k, - opj_stream_private_t *p_stream, - opj_image_t * p_image, - opj_event_mgr_t * p_manager); - -/** - * Ends the compression procedures and possibiliy add data to be read after the - * codestream. - */ -OPJ_BOOL opj_j2k_end_compress(opj_j2k_t *p_j2k, - opj_stream_private_t *cio, - opj_event_mgr_t * p_manager); - -OPJ_BOOL opj_j2k_setup_mct_encoding(opj_tcp_t * p_tcp, opj_image_t * p_image); - - -#endif /* OPJ_J2K_H */ diff --git a/src/3rd/LibOpenJpeg/jp2.c b/src/3rd/LibOpenJpeg/jp2.c deleted file mode 100644 index 34007652..00000000 --- a/src/3rd/LibOpenJpeg/jp2.c +++ /dev/null @@ -1,3428 +0,0 @@ -/* - * The copyright in this software is being made available under the 2-clauses - * BSD License, included below. This software may be subject to other third - * party and contributor rights, including patent rights, and no such rights - * are granted under this license. - * - * Copyright (c) 2002-2014, Universite catholique de Louvain (UCL), Belgium - * Copyright (c) 2002-2014, Professor Benoit Macq - * Copyright (c) 2001-2003, David Janssens - * Copyright (c) 2002-2003, Yannick Verschueren - * Copyright (c) 2003-2007, Francois-Olivier Devaux - * Copyright (c) 2003-2014, Antonin Descampe - * Copyright (c) 2005, Herve Drolon, FreeImage Team - * Copyright (c) 2010-2011, Kaori Hagihara - * Copyright (c) 2008, 2011-2012, Centre National d'Etudes Spatiales (CNES), FR - * Copyright (c) 2012, CS Systemes d'Information, France - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. 
Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS' - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ -#include "opj_includes.h" - -/** @defgroup JP2 JP2 - JPEG-2000 file format reader/writer */ -/*@{*/ - -#define OPJ_BOX_SIZE 1024 - -#define OPJ_UNUSED(x) (void)x - -/** @name Local static functions */ -/*@{*/ - -/*static void jp2_write_url(opj_cio_t *cio, char *Idx_file);*/ - -/** - * Reads a IHDR box - Image Header box - * - * @param p_image_header_data pointer to actual data (already read from file) - * @param jp2 the jpeg2000 file codec. - * @param p_image_header_size the size of the image header - * @param p_manager the user event manager. - * - * @return true if the image header is valid, false else. - */ -static OPJ_BOOL opj_jp2_read_ihdr(opj_jp2_t *jp2, - OPJ_BYTE *p_image_header_data, - OPJ_UINT32 p_image_header_size, - opj_event_mgr_t * p_manager); - -/** - * Writes the Image Header box - Image Header box. - * - * @param jp2 jpeg2000 file codec. - * @param p_nb_bytes_written pointer to store the nb of bytes written by the function. - * - * @return the data being copied. -*/ -static OPJ_BYTE * opj_jp2_write_ihdr(opj_jp2_t *jp2, - OPJ_UINT32 * p_nb_bytes_written); - -/** - * Writes the Bit per Component box. - * - * @param jp2 jpeg2000 file codec. - * @param p_nb_bytes_written pointer to store the nb of bytes written by the function. - * - * @return the data being copied. -*/ -static OPJ_BYTE * opj_jp2_write_bpcc(opj_jp2_t *jp2, - OPJ_UINT32 * p_nb_bytes_written); - -/** - * Reads a Bit per Component box. - * - * @param p_bpc_header_data pointer to actual data (already read from file) - * @param jp2 the jpeg2000 file codec. - * @param p_bpc_header_size the size of the bpc header - * @param p_manager the user event manager. - * - * @return true if the bpc header is valid, false else. - */ -static OPJ_BOOL opj_jp2_read_bpcc(opj_jp2_t *jp2, - OPJ_BYTE * p_bpc_header_data, - OPJ_UINT32 p_bpc_header_size, - opj_event_mgr_t * p_manager); - -static OPJ_BOOL opj_jp2_read_cdef(opj_jp2_t * jp2, - OPJ_BYTE * p_cdef_header_data, - OPJ_UINT32 p_cdef_header_size, - opj_event_mgr_t * p_manager); - -static void opj_jp2_apply_cdef(opj_image_t *image, opj_jp2_color_t *color, - opj_event_mgr_t *); - -/** - * Writes the Channel Definition box. - * - * @param jp2 jpeg2000 file codec. - * @param p_nb_bytes_written pointer to store the nb of bytes written by the function. - * - * @return the data being copied. - */ -static OPJ_BYTE * opj_jp2_write_cdef(opj_jp2_t *jp2, - OPJ_UINT32 * p_nb_bytes_written); - -/** - * Writes the Colour Specification box. - * - * @param jp2 jpeg2000 file codec. 
- * @param p_nb_bytes_written pointer to store the nb of bytes written by the function. - * - * @return the data being copied. -*/ -static OPJ_BYTE * opj_jp2_write_colr(opj_jp2_t *jp2, - OPJ_UINT32 * p_nb_bytes_written); - -/** - * Writes a FTYP box - File type box - * - * @param cio the stream to write data to. - * @param jp2 the jpeg2000 file codec. - * @param p_manager the user event manager. - * - * @return true if writing was successful. - */ -static OPJ_BOOL opj_jp2_write_ftyp(opj_jp2_t *jp2, - opj_stream_private_t *cio, - opj_event_mgr_t * p_manager); - -/** - * Reads a FTYP box - File type box - * - * @param p_header_data the data contained in the FTYP box. - * @param jp2 the jpeg2000 file codec. - * @param p_header_size the size of the data contained in the FTYP box. - * @param p_manager the user event manager. - * - * @return true if the FTYP box is valid. - */ -static OPJ_BOOL opj_jp2_read_ftyp(opj_jp2_t *jp2, - OPJ_BYTE * p_header_data, - OPJ_UINT32 p_header_size, - opj_event_mgr_t * p_manager); - -static OPJ_BOOL opj_jp2_skip_jp2c(opj_jp2_t *jp2, - opj_stream_private_t *stream, - opj_event_mgr_t * p_manager); - -/** - * Reads the Jpeg2000 file Header box - JP2 Header box (warning, this is a super box). - * - * @param p_header_data the data contained in the file header box. - * @param jp2 the jpeg2000 file codec. - * @param p_header_size the size of the data contained in the file header box. - * @param p_manager the user event manager. - * - * @return true if the JP2 Header box was successfully recognized. -*/ -static OPJ_BOOL opj_jp2_read_jp2h(opj_jp2_t *jp2, - OPJ_BYTE *p_header_data, - OPJ_UINT32 p_header_size, - opj_event_mgr_t * p_manager); - -/** - * Writes the Jpeg2000 file Header box - JP2 Header box (warning, this is a super box). - * - * @param jp2 the jpeg2000 file codec. - * @param stream the stream to write data to. - * @param p_manager user event manager. - * - * @return true if writing was successful. - */ -static OPJ_BOOL opj_jp2_write_jp2h(opj_jp2_t *jp2, - opj_stream_private_t *stream, - opj_event_mgr_t * p_manager); - -/** - * Writes the Jpeg2000 codestream Header box - JP2C Header box. This function must be called AFTER the coding has been done. - * - * @param cio the stream to write data to. - * @param jp2 the jpeg2000 file codec. - * @param p_manager user event manager. - * - * @return true if writing was successful. -*/ -static OPJ_BOOL opj_jp2_write_jp2c(opj_jp2_t *jp2, - opj_stream_private_t *cio, - opj_event_mgr_t * p_manager); - -#ifdef USE_JPIP -/** - * Write index Finder box - * @param cio the stream to write to. - * @param jp2 the jpeg2000 file codec. - * @param p_manager user event manager. -*/ -static OPJ_BOOL opj_jpip_write_iptr(opj_jp2_t *jp2, - opj_stream_private_t *cio, - opj_event_mgr_t * p_manager); - -/** - * Write codestream Index box - * @param cio the stream to write to. - * @param jp2 the jpeg2000 file codec. - * @param p_manager user event manager. - */ -static OPJ_BOOL opj_jpip_write_cidx(opj_jp2_t *jp2, - opj_stream_private_t *cio, - opj_event_mgr_t * p_manager); - -/** - * Write file Index (superbox) - * @param cio the stream to write to. - * @param jp2 the jpeg2000 file codec. - * @param p_manager user event manager. - */ -static OPJ_BOOL opj_jpip_write_fidx(opj_jp2_t *jp2, - opj_stream_private_t *cio, - opj_event_mgr_t * p_manager); -#endif /* USE_JPIP */ - -/** - * Reads a jpeg2000 file signature box. - * - * @param p_header_data the data contained in the signature box. - * @param jp2 the jpeg2000 file codec.
- * @param p_header_size the size of the data contained in the signature box. - * @param p_manager the user event manager. - * - * @return true if the file signature box is valid. - */ -static OPJ_BOOL opj_jp2_read_jp(opj_jp2_t *jp2, - OPJ_BYTE * p_header_data, - OPJ_UINT32 p_header_size, - opj_event_mgr_t * p_manager); - -/** - * Writes a jpeg2000 file signature box. - * - * @param cio the stream to write data to. - * @param jp2 the jpeg2000 file codec. - * @param p_manager the user event manager. - * - * @return true if writing was successful. - */ -static OPJ_BOOL opj_jp2_write_jp(opj_jp2_t *jp2, - opj_stream_private_t *cio, - opj_event_mgr_t * p_manager); - -/** -Apply collected palette data -@param image Image. -@param color Collector for profile, cdef and pclr data. -@param p_manager the user event manager. -@return true in case of success -*/ -static OPJ_BOOL opj_jp2_apply_pclr(opj_image_t *image, - opj_jp2_color_t *color, - opj_event_mgr_t * p_manager); - -static void opj_jp2_free_pclr(opj_jp2_color_t *color); - -/** - * Collect palette data - * - * @param jp2 JP2 handle - * @param p_pclr_header_data FIXME DOC - * @param p_pclr_header_size FIXME DOC - * @param p_manager - * - * @return Returns true if successful, returns false otherwise -*/ -static OPJ_BOOL opj_jp2_read_pclr(opj_jp2_t *jp2, - OPJ_BYTE * p_pclr_header_data, - OPJ_UINT32 p_pclr_header_size, - opj_event_mgr_t * p_manager); - -/** - * Collect component mapping data - * - * @param jp2 JP2 handle - * @param p_cmap_header_data FIXME DOC - * @param p_cmap_header_size FIXME DOC - * @param p_manager FIXME DOC - * - * @return Returns true if successful, returns false otherwise -*/ - -static OPJ_BOOL opj_jp2_read_cmap(opj_jp2_t * jp2, - OPJ_BYTE * p_cmap_header_data, - OPJ_UINT32 p_cmap_header_size, - opj_event_mgr_t * p_manager); - -/** - * Reads the Color Specification box. - * - * @param p_colr_header_data pointer to actual data (already read from file) - * @param jp2 the jpeg2000 file codec. - * @param p_colr_header_size the size of the color header - * @param p_manager the user event manager. - * - * @return true if the colr header is valid, false otherwise. -*/ -static OPJ_BOOL opj_jp2_read_colr(opj_jp2_t *jp2, - OPJ_BYTE * p_colr_header_data, - OPJ_UINT32 p_colr_header_size, - opj_event_mgr_t * p_manager); - -/*@}*/ - -/*@}*/ - -/** - * Sets up the procedures to do on writing header after the codestream. - * Developers wanting to extend the library can add their own writing procedures. - */ -static OPJ_BOOL opj_jp2_setup_end_header_writing(opj_jp2_t *jp2, - opj_event_mgr_t * p_manager); - -/** - * Sets up the procedures to do on reading header after the codestream. - * Developers wanting to extend the library can add their own writing procedures. - */ -static OPJ_BOOL opj_jp2_setup_end_header_reading(opj_jp2_t *jp2, - opj_event_mgr_t * p_manager); - -/** - * Reads a jpeg2000 file header structure. - * - * @param jp2 the jpeg2000 file header structure. - * @param stream the stream to read data from. - * @param p_manager the user event manager. - * - * @return true if the box is valid. - */ -static OPJ_BOOL opj_jp2_read_header_procedure(opj_jp2_t *jp2, - opj_stream_private_t *stream, - opj_event_mgr_t * p_manager); - -/** - * Executes the given procedures on the given codec. - * - * @param p_procedure_list the list of procedures to execute - * @param jp2 the jpeg2000 file codec to execute the procedures on. - * @param stream the stream to execute the procedures on. - * @param p_manager the user manager.
- *
- * @return true if all the procedures were successfully executed.
- */
-static OPJ_BOOL opj_jp2_exec(opj_jp2_t * jp2,
-                             opj_procedure_list_t * p_procedure_list,
-                             opj_stream_private_t *stream,
-                             opj_event_mgr_t * p_manager);
-
-/**
- * Reads a box header. The box is the way data is packed inside a jpeg2000 file structure.
- *
- * @param cio the input stream to read data from.
- * @param box the box structure to fill.
- * @param p_number_bytes_read pointer to an int that will store the number of bytes read from the stream (should usually be 8).
- * @param p_manager user event manager.
- *
- * @return true if the box is recognized, false otherwise
-*/
-static OPJ_BOOL opj_jp2_read_boxhdr(opj_jp2_box_t *box,
-                                    OPJ_UINT32 * p_number_bytes_read,
-                                    opj_stream_private_t *cio,
-                                    opj_event_mgr_t * p_manager);
-
-/**
- * Sets up the validation, i.e. adds the procedures to launch to make sure the codec parameters
- * are valid. Developers wanting to extend the library can add their own validation procedures.
- */
-static OPJ_BOOL opj_jp2_setup_encoding_validation(opj_jp2_t *jp2,
-                                                  opj_event_mgr_t * p_manager);
-
-/**
- * Sets up the procedures to be done when writing the header. Developers wanting to extend the library can add their own writing procedures.
- */
-static OPJ_BOOL opj_jp2_setup_header_writing(opj_jp2_t *jp2,
-                                             opj_event_mgr_t * p_manager);
-
-static OPJ_BOOL opj_jp2_default_validation(opj_jp2_t * jp2,
-                                           opj_stream_private_t *cio,
-                                           opj_event_mgr_t * p_manager);
-
-/**
- * Finds the image execution function related to the given box id.
- *
- * @param p_id the id of the handler to fetch.
- *
- * @return the given handler or NULL if it could not be found.
- */
-static const opj_jp2_header_handler_t * opj_jp2_img_find_handler(
-    OPJ_UINT32 p_id);
-
-/**
- * Finds the execution function related to the given box id.
- *
- * @param p_id the id of the handler to fetch.
- *
- * @return the given handler or NULL if it could not be found.
- */
-static const opj_jp2_header_handler_t * opj_jp2_find_handler(OPJ_UINT32 p_id);
-
-static const opj_jp2_header_handler_t jp2_header [] = {
-    {JP2_JP, opj_jp2_read_jp},
-    {JP2_FTYP, opj_jp2_read_ftyp},
-    {JP2_JP2H, opj_jp2_read_jp2h}
-};
-
-static const opj_jp2_header_handler_t jp2_img_header [] = {
-    {JP2_IHDR, opj_jp2_read_ihdr},
-    {JP2_COLR, opj_jp2_read_colr},
-    {JP2_BPCC, opj_jp2_read_bpcc},
-    {JP2_PCLR, opj_jp2_read_pclr},
-    {JP2_CMAP, opj_jp2_read_cmap},
-    {JP2_CDEF, opj_jp2_read_cdef}
-
-};
-
-/**
- * Reads a box header. The box is the way data is packed inside a jpeg2000 file structure. Data is read from a character string
- *
- * @param box the box structure to fill.
- * @param p_data the character string to read data from.
- * @param p_number_bytes_read pointer to an int that will store the number of bytes read from the stream (should usually be 8).
- * @param p_box_max_size the maximum number of bytes in the box.
- * @param p_manager the user event manager.
- *
- * @return true if the box is recognized, false otherwise
-*/
-static OPJ_BOOL opj_jp2_read_boxhdr_char(opj_jp2_box_t *box,
-                                         OPJ_BYTE * p_data,
-                                         OPJ_UINT32 * p_number_bytes_read,
-                                         OPJ_UINT32 p_box_max_size,
-                                         opj_event_mgr_t * p_manager);
-
-/**
- * Sets up the validation, i.e. adds the procedures to launch to make sure the codec parameters
- * are valid. Developers wanting to extend the library can add their own validation procedures.
- */
-static OPJ_BOOL opj_jp2_setup_decoding_validation(opj_jp2_t *jp2,
-                                                  opj_event_mgr_t * p_manager);
-
-/**
- * Sets up the procedures to be done when reading the header.
- * Developers wanting to extend the library can add their own reading procedures.
- */
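Throughout this file the setup_* functions only register work: they append function pointers to a procedure list that opj_jp2_exec later runs in order, AND-ing the results. A minimal sketch of that pattern under assumed types (proc_fn and proc_list are illustrative names, not OpenJPEG API):

#include <stdbool.h>
#include <stddef.h>

typedef bool (*proc_fn)(void *codec, void *stream);

typedef struct {
    proc_fn procs[8];
    size_t  count;
} proc_list;

static bool proc_list_add(proc_list *l, proc_fn fn)
{
    if (l->count >= sizeof(l->procs) / sizeof(l->procs[0])) {
        return false;                      /* list full */
    }
    l->procs[l->count++] = fn;
    return true;
}

static bool proc_list_exec(proc_list *l, void *codec, void *stream)
{
    bool ok = true;
    for (size_t i = 0; i < l->count; ++i) {
        /* && short-circuits: once one procedure fails, the remaining ones
           are skipped, mirroring the accumulation in opj_jp2_exec */
        ok = ok && l->procs[i](codec, stream);
    }
    l->count = 0;                          /* list is cleared after execution */
    return ok;
}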
-static OPJ_BOOL opj_jp2_setup_header_reading(opj_jp2_t *jp2,
-                                             opj_event_mgr_t * p_manager);
-
-/* ----------------------------------------------------------------------- */
-static OPJ_BOOL opj_jp2_read_boxhdr(opj_jp2_box_t *box,
-                                    OPJ_UINT32 * p_number_bytes_read,
-                                    opj_stream_private_t *cio,
-                                    opj_event_mgr_t * p_manager)
-{
-    /* read header from file */
-    OPJ_BYTE l_data_header [8];
-
-    /* preconditions */
-    assert(cio != 00);
-    assert(box != 00);
-    assert(p_number_bytes_read != 00);
-    assert(p_manager != 00);
-
-    *p_number_bytes_read = (OPJ_UINT32)opj_stream_read_data(cio, l_data_header, 8,
-                           p_manager);
-    if (*p_number_bytes_read != 8) {
-        return OPJ_FALSE;
-    }
-
-    /* process read data */
-    opj_read_bytes(l_data_header, &(box->length), 4);
-    opj_read_bytes(l_data_header + 4, &(box->type), 4);
-
-    if (box->length == 0) { /* last box */
-        const OPJ_OFF_T bleft = opj_stream_get_number_byte_left(cio);
-        if (bleft > (OPJ_OFF_T)(0xFFFFFFFFU - 8U)) {
-            opj_event_msg(p_manager, EVT_ERROR,
-                          "Cannot handle box sizes higher than 2^32\n");
-            return OPJ_FALSE;
-        }
-        box->length = (OPJ_UINT32)bleft + 8U;
-        assert((OPJ_OFF_T)box->length == bleft + 8);
-        return OPJ_TRUE;
-    }
-
-    /* do we have a "special very large box"? */
-    /* if so, read the XLBox */
-    if (box->length == 1) {
-        OPJ_UINT32 l_xl_part_size;
-
-        OPJ_UINT32 l_nb_bytes_read = (OPJ_UINT32)opj_stream_read_data(cio,
-                                     l_data_header, 8, p_manager);
-        if (l_nb_bytes_read != 8) {
-            if (l_nb_bytes_read > 0) {
-                *p_number_bytes_read += l_nb_bytes_read;
-            }
-
-            return OPJ_FALSE;
-        }
-
-        *p_number_bytes_read = 16;
-        opj_read_bytes(l_data_header, &l_xl_part_size, 4);
-        if (l_xl_part_size != 0) {
-            opj_event_msg(p_manager, EVT_ERROR,
-                          "Cannot handle box sizes higher than 2^32\n");
-            return OPJ_FALSE;
-        }
-        opj_read_bytes(l_data_header + 4, &(box->length), 4);
-    }
-    return OPJ_TRUE;
-}
-
-#if 0
-static void jp2_write_url(opj_cio_t *cio, char *Idx_file)
-{
-    OPJ_UINT32 i;
-    opj_jp2_box_t box;
-
-    box.init_pos = cio_tell(cio);
-    cio_skip(cio, 4);
-    cio_write(cio, JP2_URL, 4); /* DBTL */
-    cio_write(cio, 0, 1); /* VERS */
-    cio_write(cio, 0, 3); /* FLAG */
-
-    if (Idx_file) {
-        for (i = 0; i < strlen(Idx_file); i++) {
-            cio_write(cio, Idx_file[i], 1);
-        }
-    }
-
-    box.length = cio_tell(cio) - box.init_pos;
-    cio_seek(cio, box.init_pos);
-    cio_write(cio, box.length, 4); /* L */
-    cio_seek(cio, box.init_pos + box.length);
-}
-#endif
-
-static OPJ_BOOL opj_jp2_read_ihdr(opj_jp2_t *jp2,
-                                  OPJ_BYTE *p_image_header_data,
-                                  OPJ_UINT32 p_image_header_size,
-                                  opj_event_mgr_t * p_manager)
-{
-    /* preconditions */
-    assert(p_image_header_data != 00);
-    assert(jp2 != 00);
-    assert(p_manager != 00);
-
-    if (jp2->comps != NULL) {
-        opj_event_msg(p_manager, EVT_WARNING,
-                      "Ignoring ihdr box. 
First ihdr box already read\n"); - return OPJ_TRUE; - } - - if (p_image_header_size != 14) { - opj_event_msg(p_manager, EVT_ERROR, "Bad image header box (bad size)\n"); - return OPJ_FALSE; - } - - opj_read_bytes(p_image_header_data, &(jp2->h), 4); /* HEIGHT */ - p_image_header_data += 4; - opj_read_bytes(p_image_header_data, &(jp2->w), 4); /* WIDTH */ - p_image_header_data += 4; - opj_read_bytes(p_image_header_data, &(jp2->numcomps), 2); /* NC */ - p_image_header_data += 2; - - if ((jp2->numcomps - 1U) >= - 16384U) { /* unsigned underflow is well defined: 1U <= jp2->numcomps <= 16384U */ - opj_event_msg(p_manager, EVT_ERROR, "Invalid number of components (ihdr)\n"); - return OPJ_FALSE; - } - - /* allocate memory for components */ - jp2->comps = (opj_jp2_comps_t*) opj_calloc(jp2->numcomps, - sizeof(opj_jp2_comps_t)); - if (jp2->comps == 0) { - opj_event_msg(p_manager, EVT_ERROR, - "Not enough memory to handle image header (ihdr)\n"); - return OPJ_FALSE; - } - - opj_read_bytes(p_image_header_data, &(jp2->bpc), 1); /* BPC */ - ++ p_image_header_data; - - opj_read_bytes(p_image_header_data, &(jp2->C), 1); /* C */ - ++ p_image_header_data; - - /* Should be equal to 7 cf. chapter about image header box of the norm */ - if (jp2->C != 7) { - opj_event_msg(p_manager, EVT_INFO, - "JP2 IHDR box: compression type indicate that the file is not a conforming JP2 file (%d) \n", - jp2->C); - } - - opj_read_bytes(p_image_header_data, &(jp2->UnkC), 1); /* UnkC */ - ++ p_image_header_data; - opj_read_bytes(p_image_header_data, &(jp2->IPR), 1); /* IPR */ - ++ p_image_header_data; - - jp2->j2k->m_cp.allow_different_bit_depth_sign = (jp2->bpc == 255); - jp2->j2k->ihdr_w = jp2->w; - jp2->j2k->ihdr_h = jp2->h; - jp2->has_ihdr = 1; - - return OPJ_TRUE; -} - -static OPJ_BYTE * opj_jp2_write_ihdr(opj_jp2_t *jp2, - OPJ_UINT32 * p_nb_bytes_written - ) -{ - OPJ_BYTE * l_ihdr_data, * l_current_ihdr_ptr; - - /* preconditions */ - assert(jp2 != 00); - assert(p_nb_bytes_written != 00); - - /* default image header is 22 bytes wide */ - l_ihdr_data = (OPJ_BYTE *) opj_calloc(1, 22); - if (l_ihdr_data == 00) { - return 00; - } - - l_current_ihdr_ptr = l_ihdr_data; - - opj_write_bytes(l_current_ihdr_ptr, 22, 4); /* write box size */ - l_current_ihdr_ptr += 4; - - opj_write_bytes(l_current_ihdr_ptr, JP2_IHDR, 4); /* IHDR */ - l_current_ihdr_ptr += 4; - - opj_write_bytes(l_current_ihdr_ptr, jp2->h, 4); /* HEIGHT */ - l_current_ihdr_ptr += 4; - - opj_write_bytes(l_current_ihdr_ptr, jp2->w, 4); /* WIDTH */ - l_current_ihdr_ptr += 4; - - opj_write_bytes(l_current_ihdr_ptr, jp2->numcomps, 2); /* NC */ - l_current_ihdr_ptr += 2; - - opj_write_bytes(l_current_ihdr_ptr, jp2->bpc, 1); /* BPC */ - ++l_current_ihdr_ptr; - - opj_write_bytes(l_current_ihdr_ptr, jp2->C, 1); /* C : Always 7 */ - ++l_current_ihdr_ptr; - - opj_write_bytes(l_current_ihdr_ptr, jp2->UnkC, - 1); /* UnkC, colorspace unknown */ - ++l_current_ihdr_ptr; - - opj_write_bytes(l_current_ihdr_ptr, jp2->IPR, - 1); /* IPR, no intellectual property */ - ++l_current_ihdr_ptr; - - *p_nb_bytes_written = 22; - - return l_ihdr_data; -} - -static OPJ_BYTE * opj_jp2_write_bpcc(opj_jp2_t *jp2, - OPJ_UINT32 * p_nb_bytes_written - ) -{ - OPJ_UINT32 i; - /* room for 8 bytes for box and 1 byte for each component */ - OPJ_UINT32 l_bpcc_size; - OPJ_BYTE * l_bpcc_data, * l_current_bpcc_ptr; - - /* preconditions */ - assert(jp2 != 00); - assert(p_nb_bytes_written != 00); - l_bpcc_size = 8 + jp2->numcomps; - - l_bpcc_data = (OPJ_BYTE *) opj_calloc(1, l_bpcc_size); - if (l_bpcc_data == 00) { - 
return 00; - } - - l_current_bpcc_ptr = l_bpcc_data; - - opj_write_bytes(l_current_bpcc_ptr, l_bpcc_size, - 4); /* write box size */ - l_current_bpcc_ptr += 4; - - opj_write_bytes(l_current_bpcc_ptr, JP2_BPCC, 4); /* BPCC */ - l_current_bpcc_ptr += 4; - - for (i = 0; i < jp2->numcomps; ++i) { - opj_write_bytes(l_current_bpcc_ptr, jp2->comps[i].bpcc, - 1); /* write each component information */ - ++l_current_bpcc_ptr; - } - - *p_nb_bytes_written = l_bpcc_size; - - return l_bpcc_data; -} - -static OPJ_BOOL opj_jp2_read_bpcc(opj_jp2_t *jp2, - OPJ_BYTE * p_bpc_header_data, - OPJ_UINT32 p_bpc_header_size, - opj_event_mgr_t * p_manager - ) -{ - OPJ_UINT32 i; - - /* preconditions */ - assert(p_bpc_header_data != 00); - assert(jp2 != 00); - assert(p_manager != 00); - - - if (jp2->bpc != 255) { - opj_event_msg(p_manager, EVT_WARNING, - "A BPCC header box is available although BPC given by the IHDR box (%d) indicate components bit depth is constant\n", - jp2->bpc); - } - - /* and length is relevant */ - if (p_bpc_header_size != jp2->numcomps) { - opj_event_msg(p_manager, EVT_ERROR, "Bad BPCC header box (bad size)\n"); - return OPJ_FALSE; - } - - /* read info for each component */ - for (i = 0; i < jp2->numcomps; ++i) { - opj_read_bytes(p_bpc_header_data, &jp2->comps[i].bpcc, - 1); /* read each BPCC component */ - ++p_bpc_header_data; - } - - return OPJ_TRUE; -} -static OPJ_BYTE * opj_jp2_write_cdef(opj_jp2_t *jp2, - OPJ_UINT32 * p_nb_bytes_written) -{ - /* room for 8 bytes for box, 2 for n */ - OPJ_UINT32 l_cdef_size = 10; - OPJ_BYTE * l_cdef_data, * l_current_cdef_ptr; - OPJ_UINT32 l_value; - OPJ_UINT16 i; - - /* preconditions */ - assert(jp2 != 00); - assert(p_nb_bytes_written != 00); - assert(jp2->color.jp2_cdef != 00); - assert(jp2->color.jp2_cdef->info != 00); - assert(jp2->color.jp2_cdef->n > 0U); - - l_cdef_size += 6U * jp2->color.jp2_cdef->n; - - l_cdef_data = (OPJ_BYTE *) opj_malloc(l_cdef_size); - if (l_cdef_data == 00) { - return 00; - } - - l_current_cdef_ptr = l_cdef_data; - - opj_write_bytes(l_current_cdef_ptr, l_cdef_size, 4); /* write box size */ - l_current_cdef_ptr += 4; - - opj_write_bytes(l_current_cdef_ptr, JP2_CDEF, 4); /* BPCC */ - l_current_cdef_ptr += 4; - - l_value = jp2->color.jp2_cdef->n; - opj_write_bytes(l_current_cdef_ptr, l_value, 2); /* N */ - l_current_cdef_ptr += 2; - - for (i = 0U; i < jp2->color.jp2_cdef->n; ++i) { - l_value = jp2->color.jp2_cdef->info[i].cn; - opj_write_bytes(l_current_cdef_ptr, l_value, 2); /* Cni */ - l_current_cdef_ptr += 2; - l_value = jp2->color.jp2_cdef->info[i].typ; - opj_write_bytes(l_current_cdef_ptr, l_value, 2); /* Typi */ - l_current_cdef_ptr += 2; - l_value = jp2->color.jp2_cdef->info[i].asoc; - opj_write_bytes(l_current_cdef_ptr, l_value, 2); /* Asoci */ - l_current_cdef_ptr += 2; - } - *p_nb_bytes_written = l_cdef_size; - - return l_cdef_data; -} - -static OPJ_BYTE * opj_jp2_write_colr(opj_jp2_t *jp2, - OPJ_UINT32 * p_nb_bytes_written - ) -{ - /* room for 8 bytes for box 3 for common data and variable upon profile*/ - OPJ_UINT32 l_colr_size = 11; - OPJ_BYTE * l_colr_data, * l_current_colr_ptr; - - /* preconditions */ - assert(jp2 != 00); - assert(p_nb_bytes_written != 00); - assert(jp2->meth == 1 || jp2->meth == 2); - - switch (jp2->meth) { - case 1 : - l_colr_size += 4; /* EnumCS */ - break; - case 2 : - assert(jp2->color.icc_profile_len); /* ICC profile */ - l_colr_size += jp2->color.icc_profile_len; - break; - default : - return 00; - } - - l_colr_data = (OPJ_BYTE *) opj_calloc(1, l_colr_size); - if (l_colr_data == 00) { - 
return 00; - } - - l_current_colr_ptr = l_colr_data; - - opj_write_bytes(l_current_colr_ptr, l_colr_size, - 4); /* write box size */ - l_current_colr_ptr += 4; - - opj_write_bytes(l_current_colr_ptr, JP2_COLR, 4); /* BPCC */ - l_current_colr_ptr += 4; - - opj_write_bytes(l_current_colr_ptr, jp2->meth, 1); /* METH */ - ++l_current_colr_ptr; - - opj_write_bytes(l_current_colr_ptr, jp2->precedence, 1); /* PRECEDENCE */ - ++l_current_colr_ptr; - - opj_write_bytes(l_current_colr_ptr, jp2->approx, 1); /* APPROX */ - ++l_current_colr_ptr; - - if (jp2->meth == - 1) { /* Meth value is restricted to 1 or 2 (Table I.9 of part 1) */ - opj_write_bytes(l_current_colr_ptr, jp2->enumcs, 4); - } /* EnumCS */ - else { - if (jp2->meth == 2) { /* ICC profile */ - OPJ_UINT32 i; - for (i = 0; i < jp2->color.icc_profile_len; ++i) { - opj_write_bytes(l_current_colr_ptr, jp2->color.icc_profile_buf[i], 1); - ++l_current_colr_ptr; - } - } - } - - *p_nb_bytes_written = l_colr_size; - - return l_colr_data; -} - -static void opj_jp2_free_pclr(opj_jp2_color_t *color) -{ - opj_free(color->jp2_pclr->channel_sign); - opj_free(color->jp2_pclr->channel_size); - opj_free(color->jp2_pclr->entries); - - if (color->jp2_pclr->cmap) { - opj_free(color->jp2_pclr->cmap); - } - - opj_free(color->jp2_pclr); - color->jp2_pclr = NULL; -} - -static OPJ_BOOL opj_jp2_check_color(opj_image_t *image, opj_jp2_color_t *color, - opj_event_mgr_t *p_manager) -{ - OPJ_UINT16 i; - - /* testcase 4149.pdf.SIGSEGV.cf7.3501 */ - if (color->jp2_cdef) { - opj_jp2_cdef_info_t *info = color->jp2_cdef->info; - OPJ_UINT16 n = color->jp2_cdef->n; - OPJ_UINT32 nr_channels = - image->numcomps; /* FIXME image->numcomps == jp2->numcomps before color is applied ??? */ - - /* cdef applies to cmap channels if any */ - if (color->jp2_pclr && color->jp2_pclr->cmap) { - nr_channels = (OPJ_UINT32)color->jp2_pclr->nr_channels; - } - - for (i = 0; i < n; i++) { - if (info[i].cn >= nr_channels) { - opj_event_msg(p_manager, EVT_ERROR, "Invalid component index %d (>= %d).\n", - info[i].cn, nr_channels); - return OPJ_FALSE; - } - if (info[i].asoc == 65535U) { - continue; - } - - if (info[i].asoc > 0 && (OPJ_UINT32)(info[i].asoc - 1) >= nr_channels) { - opj_event_msg(p_manager, EVT_ERROR, "Invalid component index %d (>= %d).\n", - info[i].asoc - 1, nr_channels); - return OPJ_FALSE; - } - } - - /* issue 397 */ - /* ISO 15444-1 states that if cdef is present, it shall contain a complete list of channel definitions. 
*/ - while (nr_channels > 0) { - for (i = 0; i < n; ++i) { - if ((OPJ_UINT32)info[i].cn == (nr_channels - 1U)) { - break; - } - } - if (i == n) { - opj_event_msg(p_manager, EVT_ERROR, "Incomplete channel definitions.\n"); - return OPJ_FALSE; - } - --nr_channels; - } - } - - /* testcases 451.pdf.SIGSEGV.f4c.3723, 451.pdf.SIGSEGV.5b5.3723 and - 66ea31acbb0f23a2bbc91f64d69a03f5_signal_sigsegv_13937c0_7030_5725.pdf */ - if (color->jp2_pclr && color->jp2_pclr->cmap) { - OPJ_UINT16 nr_channels = color->jp2_pclr->nr_channels; - opj_jp2_cmap_comp_t *cmap = color->jp2_pclr->cmap; - OPJ_BOOL *pcol_usage, is_sane = OPJ_TRUE; - - /* verify that all original components match an existing one */ - for (i = 0; i < nr_channels; i++) { - if (cmap[i].cmp >= image->numcomps) { - opj_event_msg(p_manager, EVT_ERROR, "Invalid component index %d (>= %d).\n", - cmap[i].cmp, image->numcomps); - is_sane = OPJ_FALSE; - } - } - - pcol_usage = (OPJ_BOOL *) opj_calloc(nr_channels, sizeof(OPJ_BOOL)); - if (!pcol_usage) { - opj_event_msg(p_manager, EVT_ERROR, "Unexpected OOM.\n"); - return OPJ_FALSE; - } - /* verify that no component is targeted more than once */ - for (i = 0; i < nr_channels; i++) { - OPJ_BYTE mtyp = cmap[i].mtyp; - OPJ_BYTE pcol = cmap[i].pcol; - /* See ISO 15444-1 Table I.14 – MTYPi field values */ - if (mtyp != 0 && mtyp != 1) { - opj_event_msg(p_manager, EVT_ERROR, - "Invalid value for cmap[%d].mtyp = %d.\n", i, - mtyp); - is_sane = OPJ_FALSE; - } else if (pcol >= nr_channels) { - opj_event_msg(p_manager, EVT_ERROR, - "Invalid component/palette index for direct mapping %d.\n", pcol); - is_sane = OPJ_FALSE; - } else if (pcol_usage[pcol] && mtyp == 1) { - opj_event_msg(p_manager, EVT_ERROR, "Component %d is mapped twice.\n", pcol); - is_sane = OPJ_FALSE; - } else if (mtyp == 0 && pcol != 0) { - /* I.5.3.5 PCOL: If the value of the MTYP field for this channel is 0, then - * the value of this field shall be 0. */ - opj_event_msg(p_manager, EVT_ERROR, "Direct use at #%d however pcol=%d.\n", i, - pcol); - is_sane = OPJ_FALSE; - } else if (mtyp == 1 && pcol != i) { - /* OpenJPEG implementation limitation. See assert(i == pcol); */ - /* in opj_jp2_apply_pclr() */ - opj_event_msg(p_manager, EVT_ERROR, - "Implementation limitation: for palette mapping, " - "pcol[%d] should be equal to %d, but is equal " - "to %d.\n", i, i, pcol); - is_sane = OPJ_FALSE; - } else { - pcol_usage[pcol] = OPJ_TRUE; - } - } - /* verify that all components are targeted at least once */ - for (i = 0; i < nr_channels; i++) { - if (!pcol_usage[i] && cmap[i].mtyp != 0) { - opj_event_msg(p_manager, EVT_ERROR, "Component %d doesn't have a mapping.\n", - i); - is_sane = OPJ_FALSE; - } - } - /* Issue 235/447 weird cmap */ - if (1 && is_sane && (image->numcomps == 1U)) { - for (i = 0; i < nr_channels; i++) { - if (!pcol_usage[i]) { - is_sane = 0U; - opj_event_msg(p_manager, EVT_WARNING, - "Component mapping seems wrong. 
Trying to correct.\n", i); - break; - } - } - if (!is_sane) { - is_sane = OPJ_TRUE; - for (i = 0; i < nr_channels; i++) { - cmap[i].mtyp = 1U; - cmap[i].pcol = (OPJ_BYTE) i; - } - } - } - opj_free(pcol_usage); - if (!is_sane) { - return OPJ_FALSE; - } - } - - return OPJ_TRUE; -} - -/* file9.jp2 */ -static OPJ_BOOL opj_jp2_apply_pclr(opj_image_t *image, - opj_jp2_color_t *color, - opj_event_mgr_t * p_manager) -{ - opj_image_comp_t *old_comps, *new_comps; - OPJ_BYTE *channel_size, *channel_sign; - OPJ_UINT32 *entries; - opj_jp2_cmap_comp_t *cmap; - OPJ_INT32 *src, *dst; - OPJ_UINT32 j, max; - OPJ_UINT16 i, nr_channels, cmp, pcol; - OPJ_INT32 k, top_k; - - channel_size = color->jp2_pclr->channel_size; - channel_sign = color->jp2_pclr->channel_sign; - entries = color->jp2_pclr->entries; - cmap = color->jp2_pclr->cmap; - nr_channels = color->jp2_pclr->nr_channels; - - for (i = 0; i < nr_channels; ++i) { - /* Palette mapping: */ - cmp = cmap[i].cmp; - if (image->comps[cmp].data == NULL) { - opj_event_msg(p_manager, EVT_ERROR, - "image->comps[%d].data == NULL in opj_jp2_apply_pclr().\n", i); - return OPJ_FALSE; - } - } - - old_comps = image->comps; - new_comps = (opj_image_comp_t*) - opj_malloc(nr_channels * sizeof(opj_image_comp_t)); - if (!new_comps) { - opj_event_msg(p_manager, EVT_ERROR, - "Memory allocation failure in opj_jp2_apply_pclr().\n"); - return OPJ_FALSE; - } - for (i = 0; i < nr_channels; ++i) { - pcol = cmap[i].pcol; - cmp = cmap[i].cmp; - - /* Direct use */ - if (cmap[i].mtyp == 0) { - assert(pcol == 0); - new_comps[i] = old_comps[cmp]; - } else { - assert(i == pcol); - new_comps[pcol] = old_comps[cmp]; - } - - /* Palette mapping: */ - new_comps[i].data = (OPJ_INT32*) - opj_image_data_alloc(old_comps[cmp].w * old_comps[cmp].h * sizeof(OPJ_INT32)); - if (!new_comps[i].data) { - while (i > 0) { - -- i; - opj_image_data_free(new_comps[i].data); - } - opj_free(new_comps); - opj_event_msg(p_manager, EVT_ERROR, - "Memory allocation failure in opj_jp2_apply_pclr().\n"); - return OPJ_FALSE; - } - new_comps[i].prec = channel_size[i]; - new_comps[i].sgnd = channel_sign[i]; - } - - top_k = color->jp2_pclr->nr_entries - 1; - - for (i = 0; i < nr_channels; ++i) { - /* Palette mapping: */ - cmp = cmap[i].cmp; - pcol = cmap[i].pcol; - src = old_comps[cmp].data; - assert(src); /* verified above */ - max = new_comps[pcol].w * new_comps[pcol].h; - - /* Direct use: */ - if (cmap[i].mtyp == 0) { - assert(cmp == 0); - dst = new_comps[i].data; - assert(dst); - for (j = 0; j < max; ++j) { - dst[j] = src[j]; - } - } else { - assert(i == pcol); - dst = new_comps[pcol].data; - assert(dst); - for (j = 0; j < max; ++j) { - /* The index */ - if ((k = src[j]) < 0) { - k = 0; - } else if (k > top_k) { - k = top_k; - } - - /* The colour */ - dst[j] = (OPJ_INT32)entries[k * nr_channels + pcol]; - } - } - } - - max = image->numcomps; - for (i = 0; i < max; ++i) { - if (old_comps[i].data) { - opj_image_data_free(old_comps[i].data); - } - } - - opj_free(old_comps); - image->comps = new_comps; - image->numcomps = nr_channels; - - return OPJ_TRUE; -}/* apply_pclr() */ - -static OPJ_BOOL opj_jp2_read_pclr(opj_jp2_t *jp2, - OPJ_BYTE * p_pclr_header_data, - OPJ_UINT32 p_pclr_header_size, - opj_event_mgr_t * p_manager - ) -{ - opj_jp2_pclr_t *jp2_pclr; - OPJ_BYTE *channel_size, *channel_sign; - OPJ_UINT32 *entries; - OPJ_UINT16 nr_entries, nr_channels; - OPJ_UINT16 i, j; - OPJ_UINT32 l_value; - OPJ_BYTE *orig_header_data = p_pclr_header_data; - - /* preconditions */ - assert(p_pclr_header_data != 00); - assert(jp2 != 
00); - assert(p_manager != 00); - (void)p_pclr_header_size; - - if (jp2->color.jp2_pclr) { - return OPJ_FALSE; - } - - if (p_pclr_header_size < 3) { - return OPJ_FALSE; - } - - opj_read_bytes(p_pclr_header_data, &l_value, 2); /* NE */ - p_pclr_header_data += 2; - nr_entries = (OPJ_UINT16) l_value; - if ((nr_entries == 0U) || (nr_entries > 1024U)) { - opj_event_msg(p_manager, EVT_ERROR, "Invalid PCLR box. Reports %d entries\n", - (int)nr_entries); - return OPJ_FALSE; - } - - opj_read_bytes(p_pclr_header_data, &l_value, 1); /* NPC */ - ++p_pclr_header_data; - nr_channels = (OPJ_UINT16) l_value; - if (nr_channels == 0U) { - opj_event_msg(p_manager, EVT_ERROR, - "Invalid PCLR box. Reports 0 palette columns\n"); - return OPJ_FALSE; - } - - if (p_pclr_header_size < 3 + (OPJ_UINT32)nr_channels) { - return OPJ_FALSE; - } - - entries = (OPJ_UINT32*) opj_malloc((size_t)nr_channels * nr_entries * sizeof( - OPJ_UINT32)); - if (!entries) { - return OPJ_FALSE; - } - channel_size = (OPJ_BYTE*) opj_malloc(nr_channels); - if (!channel_size) { - opj_free(entries); - return OPJ_FALSE; - } - channel_sign = (OPJ_BYTE*) opj_malloc(nr_channels); - if (!channel_sign) { - opj_free(entries); - opj_free(channel_size); - return OPJ_FALSE; - } - - jp2_pclr = (opj_jp2_pclr_t*)opj_malloc(sizeof(opj_jp2_pclr_t)); - if (!jp2_pclr) { - opj_free(entries); - opj_free(channel_size); - opj_free(channel_sign); - return OPJ_FALSE; - } - - jp2_pclr->channel_sign = channel_sign; - jp2_pclr->channel_size = channel_size; - jp2_pclr->entries = entries; - jp2_pclr->nr_entries = nr_entries; - jp2_pclr->nr_channels = (OPJ_BYTE) l_value; - jp2_pclr->cmap = NULL; - - jp2->color.jp2_pclr = jp2_pclr; - - for (i = 0; i < nr_channels; ++i) { - opj_read_bytes(p_pclr_header_data, &l_value, 1); /* Bi */ - ++p_pclr_header_data; - - channel_size[i] = (OPJ_BYTE)((l_value & 0x7f) + 1); - channel_sign[i] = (l_value & 0x80) ? 
1 : 0; - } - - for (j = 0; j < nr_entries; ++j) { - for (i = 0; i < nr_channels; ++i) { - OPJ_UINT32 bytes_to_read = (OPJ_UINT32)((channel_size[i] + 7) >> 3); - - if (bytes_to_read > sizeof(OPJ_UINT32)) { - bytes_to_read = sizeof(OPJ_UINT32); - } - if ((ptrdiff_t)p_pclr_header_size < (ptrdiff_t)(p_pclr_header_data - - orig_header_data) + (ptrdiff_t)bytes_to_read) { - return OPJ_FALSE; - } - - opj_read_bytes(p_pclr_header_data, &l_value, bytes_to_read); /* Cji */ - p_pclr_header_data += bytes_to_read; - *entries = (OPJ_UINT32) l_value; - entries++; - } - } - - return OPJ_TRUE; -} - -static OPJ_BOOL opj_jp2_read_cmap(opj_jp2_t * jp2, - OPJ_BYTE * p_cmap_header_data, - OPJ_UINT32 p_cmap_header_size, - opj_event_mgr_t * p_manager - ) -{ - opj_jp2_cmap_comp_t *cmap; - OPJ_BYTE i, nr_channels; - OPJ_UINT32 l_value; - - /* preconditions */ - assert(jp2 != 00); - assert(p_cmap_header_data != 00); - assert(p_manager != 00); - (void)p_cmap_header_size; - - /* Need nr_channels: */ - if (jp2->color.jp2_pclr == NULL) { - opj_event_msg(p_manager, EVT_ERROR, - "Need to read a PCLR box before the CMAP box.\n"); - return OPJ_FALSE; - } - - /* Part 1, I.5.3.5: 'There shall be at most one Component Mapping box - * inside a JP2 Header box' : - */ - if (jp2->color.jp2_pclr->cmap) { - opj_event_msg(p_manager, EVT_ERROR, "Only one CMAP box is allowed.\n"); - return OPJ_FALSE; - } - - nr_channels = jp2->color.jp2_pclr->nr_channels; - if (p_cmap_header_size < (OPJ_UINT32)nr_channels * 4) { - opj_event_msg(p_manager, EVT_ERROR, "Insufficient data for CMAP box.\n"); - return OPJ_FALSE; - } - - cmap = (opj_jp2_cmap_comp_t*) opj_malloc(nr_channels * sizeof( - opj_jp2_cmap_comp_t)); - if (!cmap) { - return OPJ_FALSE; - } - - - for (i = 0; i < nr_channels; ++i) { - opj_read_bytes(p_cmap_header_data, &l_value, 2); /* CMP^i */ - p_cmap_header_data += 2; - cmap[i].cmp = (OPJ_UINT16) l_value; - - opj_read_bytes(p_cmap_header_data, &l_value, 1); /* MTYP^i */ - ++p_cmap_header_data; - cmap[i].mtyp = (OPJ_BYTE) l_value; - - opj_read_bytes(p_cmap_header_data, &l_value, 1); /* PCOL^i */ - ++p_cmap_header_data; - cmap[i].pcol = (OPJ_BYTE) l_value; - } - - jp2->color.jp2_pclr->cmap = cmap; - - return OPJ_TRUE; -} - -static void opj_jp2_apply_cdef(opj_image_t *image, opj_jp2_color_t *color, - opj_event_mgr_t *manager) -{ - opj_jp2_cdef_info_t *info; - OPJ_UINT16 i, n, cn, asoc, acn; - - info = color->jp2_cdef->info; - n = color->jp2_cdef->n; - - for (i = 0; i < n; ++i) { - /* WATCH: acn = asoc - 1 ! 
*/ - asoc = info[i].asoc; - cn = info[i].cn; - - if (cn >= image->numcomps) { - opj_event_msg(manager, EVT_WARNING, "opj_jp2_apply_cdef: cn=%d, numcomps=%d\n", - cn, image->numcomps); - continue; - } - if (asoc == 0 || asoc == 65535) { - image->comps[cn].alpha = info[i].typ; - continue; - } - - acn = (OPJ_UINT16)(asoc - 1); - if (acn >= image->numcomps) { - opj_event_msg(manager, EVT_WARNING, "opj_jp2_apply_cdef: acn=%d, numcomps=%d\n", - acn, image->numcomps); - continue; - } - - /* Swap only if color channel */ - if ((cn != acn) && (info[i].typ == 0)) { - opj_image_comp_t saved; - OPJ_UINT16 j; - - memcpy(&saved, &image->comps[cn], sizeof(opj_image_comp_t)); - memcpy(&image->comps[cn], &image->comps[acn], sizeof(opj_image_comp_t)); - memcpy(&image->comps[acn], &saved, sizeof(opj_image_comp_t)); - - /* Swap channels in following channel definitions, don't bother with j <= i that are already processed */ - for (j = (OPJ_UINT16)(i + 1U); j < n ; ++j) { - if (info[j].cn == cn) { - info[j].cn = acn; - } else if (info[j].cn == acn) { - info[j].cn = cn; - } - /* asoc is related to color index. Do not update. */ - } - } - - image->comps[cn].alpha = info[i].typ; - } - - if (color->jp2_cdef->info) { - opj_free(color->jp2_cdef->info); - } - - opj_free(color->jp2_cdef); - color->jp2_cdef = NULL; - -}/* jp2_apply_cdef() */ - -static OPJ_BOOL opj_jp2_read_cdef(opj_jp2_t * jp2, - OPJ_BYTE * p_cdef_header_data, - OPJ_UINT32 p_cdef_header_size, - opj_event_mgr_t * p_manager - ) -{ - opj_jp2_cdef_info_t *cdef_info; - OPJ_UINT16 i; - OPJ_UINT32 l_value; - - /* preconditions */ - assert(jp2 != 00); - assert(p_cdef_header_data != 00); - assert(p_manager != 00); - (void)p_cdef_header_size; - - /* Part 1, I.5.3.6: 'The shall be at most one Channel Definition box - * inside a JP2 Header box.'*/ - if (jp2->color.jp2_cdef) { - return OPJ_FALSE; - } - - if (p_cdef_header_size < 2) { - opj_event_msg(p_manager, EVT_ERROR, "Insufficient data for CDEF box.\n"); - return OPJ_FALSE; - } - - opj_read_bytes(p_cdef_header_data, &l_value, 2); /* N */ - p_cdef_header_data += 2; - - if ((OPJ_UINT16)l_value == 0) { /* szukw000: FIXME */ - opj_event_msg(p_manager, EVT_ERROR, - "Number of channel description is equal to zero in CDEF box.\n"); - return OPJ_FALSE; - } - - if (p_cdef_header_size < 2 + (OPJ_UINT32)(OPJ_UINT16)l_value * 6) { - opj_event_msg(p_manager, EVT_ERROR, "Insufficient data for CDEF box.\n"); - return OPJ_FALSE; - } - - cdef_info = (opj_jp2_cdef_info_t*) opj_malloc(l_value * sizeof( - opj_jp2_cdef_info_t)); - if (!cdef_info) { - return OPJ_FALSE; - } - - jp2->color.jp2_cdef = (opj_jp2_cdef_t*)opj_malloc(sizeof(opj_jp2_cdef_t)); - if (!jp2->color.jp2_cdef) { - opj_free(cdef_info); - return OPJ_FALSE; - } - jp2->color.jp2_cdef->info = cdef_info; - jp2->color.jp2_cdef->n = (OPJ_UINT16) l_value; - - for (i = 0; i < jp2->color.jp2_cdef->n; ++i) { - opj_read_bytes(p_cdef_header_data, &l_value, 2); /* Cn^i */ - p_cdef_header_data += 2; - cdef_info[i].cn = (OPJ_UINT16) l_value; - - opj_read_bytes(p_cdef_header_data, &l_value, 2); /* Typ^i */ - p_cdef_header_data += 2; - cdef_info[i].typ = (OPJ_UINT16) l_value; - - opj_read_bytes(p_cdef_header_data, &l_value, 2); /* Asoc^i */ - p_cdef_header_data += 2; - cdef_info[i].asoc = (OPJ_UINT16) l_value; - } - - return OPJ_TRUE; -} - -static OPJ_BOOL opj_jp2_read_colr(opj_jp2_t *jp2, - OPJ_BYTE * p_colr_header_data, - OPJ_UINT32 p_colr_header_size, - opj_event_mgr_t * p_manager - ) -{ - OPJ_UINT32 l_value; - - /* preconditions */ - assert(jp2 != 00); - assert(p_colr_header_data 
!= 00); - assert(p_manager != 00); - - if (p_colr_header_size < 3) { - opj_event_msg(p_manager, EVT_ERROR, "Bad COLR header box (bad size)\n"); - return OPJ_FALSE; - } - - /* Part 1, I.5.3.3 : 'A conforming JP2 reader shall ignore all Colour - * Specification boxes after the first.' - */ - if (jp2->color.jp2_has_colr) { - opj_event_msg(p_manager, EVT_INFO, - "A conforming JP2 reader shall ignore all Colour Specification boxes after the first, so we ignore this one.\n"); - p_colr_header_data += p_colr_header_size; - return OPJ_TRUE; - } - - opj_read_bytes(p_colr_header_data, &jp2->meth, 1); /* METH */ - ++p_colr_header_data; - - opj_read_bytes(p_colr_header_data, &jp2->precedence, 1); /* PRECEDENCE */ - ++p_colr_header_data; - - opj_read_bytes(p_colr_header_data, &jp2->approx, 1); /* APPROX */ - ++p_colr_header_data; - - if (jp2->meth == 1) { - if (p_colr_header_size < 7) { - opj_event_msg(p_manager, EVT_ERROR, "Bad COLR header box (bad size: %d)\n", - p_colr_header_size); - return OPJ_FALSE; - } - if ((p_colr_header_size > 7) && - (jp2->enumcs != 14)) { /* handled below for CIELab) */ - /* testcase Altona_Technical_v20_x4.pdf */ - opj_event_msg(p_manager, EVT_WARNING, "Bad COLR header box (bad size: %d)\n", - p_colr_header_size); - } - - opj_read_bytes(p_colr_header_data, &jp2->enumcs, 4); /* EnumCS */ - - p_colr_header_data += 4; - - if (jp2->enumcs == 14) { /* CIELab */ - OPJ_UINT32 *cielab; - OPJ_UINT32 rl, ol, ra, oa, rb, ob, il; - - cielab = (OPJ_UINT32*)opj_malloc(9 * sizeof(OPJ_UINT32)); - if (cielab == NULL) { - opj_event_msg(p_manager, EVT_ERROR, "Not enough memory for cielab\n"); - return OPJ_FALSE; - } - cielab[0] = 14; /* enumcs */ - - /* default values */ - rl = ra = rb = ol = oa = ob = 0; - il = 0x00443530; /* D50 */ - cielab[1] = 0x44454600;/* DEF */ - - if (p_colr_header_size == 35) { - opj_read_bytes(p_colr_header_data, &rl, 4); - p_colr_header_data += 4; - opj_read_bytes(p_colr_header_data, &ol, 4); - p_colr_header_data += 4; - opj_read_bytes(p_colr_header_data, &ra, 4); - p_colr_header_data += 4; - opj_read_bytes(p_colr_header_data, &oa, 4); - p_colr_header_data += 4; - opj_read_bytes(p_colr_header_data, &rb, 4); - p_colr_header_data += 4; - opj_read_bytes(p_colr_header_data, &ob, 4); - p_colr_header_data += 4; - opj_read_bytes(p_colr_header_data, &il, 4); - p_colr_header_data += 4; - - cielab[1] = 0; - } else if (p_colr_header_size != 7) { - opj_event_msg(p_manager, EVT_WARNING, - "Bad COLR header box (CIELab, bad size: %d)\n", p_colr_header_size); - } - cielab[2] = rl; - cielab[4] = ra; - cielab[6] = rb; - cielab[3] = ol; - cielab[5] = oa; - cielab[7] = ob; - cielab[8] = il; - - jp2->color.icc_profile_buf = (OPJ_BYTE*)cielab; - jp2->color.icc_profile_len = 0; - } - jp2->color.jp2_has_colr = 1; - } else if (jp2->meth == 2) { - /* ICC profile */ - OPJ_INT32 it_icc_value = 0; - OPJ_INT32 icc_len = (OPJ_INT32)p_colr_header_size - 3; - - jp2->color.icc_profile_len = (OPJ_UINT32)icc_len; - jp2->color.icc_profile_buf = (OPJ_BYTE*) opj_calloc(1, (size_t)icc_len); - if (!jp2->color.icc_profile_buf) { - jp2->color.icc_profile_len = 0; - return OPJ_FALSE; - } - - for (it_icc_value = 0; it_icc_value < icc_len; ++it_icc_value) { - opj_read_bytes(p_colr_header_data, &l_value, 1); /* icc values */ - ++p_colr_header_data; - jp2->color.icc_profile_buf[it_icc_value] = (OPJ_BYTE) l_value; - } - - jp2->color.jp2_has_colr = 1; - } else if (jp2->meth > 2) { - /* ISO/IEC 15444-1:2004 (E), Table I.9 Legal METH values: - conforming JP2 reader shall ignore the entire Colour Specification box.*/ 
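For reference while reading opj_jp2_read_colr, a sketch of the COLR payload it walks: one byte each of METH, PREC and APPROX, then for METH = 1 a big-endian EnumCS word, while METH = 2 instead carries an ICC profile in the remaining bytes. Simplified (no CIELab EP fields); the names are illustrative, not OpenJPEG API:

#include <stdint.h>
#include <stddef.h>

typedef struct { uint8_t meth, prec, approx; uint32_t enumcs; } colr_info;

/* Returns 0 on success, -1 on a malformed payload. */
static int parse_colr(const uint8_t *p, size_t n, colr_info *out)
{
    if (n < 3) return -1;
    out->meth   = p[0];
    out->prec   = p[1];
    out->approx = p[2];
    out->enumcs = 0;
    if (out->meth == 1) {              /* enumerated colourspace */
        if (n < 7) return -1;
        out->enumcs = ((uint32_t)p[3] << 24) | ((uint32_t)p[4] << 16) |
                      ((uint32_t)p[5] << 8)  |  (uint32_t)p[6];
    }                                   /* meth == 2: bytes 3..n-1 are an ICC profile */
    return 0;
}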
- opj_event_msg(p_manager, EVT_INFO, - "COLR BOX meth value is not a regular value (%d), " - "so we will ignore the entire Colour Specification box. \n", jp2->meth); - } - if (jp2->color.jp2_has_colr) { - jp2->j2k->enumcs = jp2->enumcs; - } - return OPJ_TRUE; -} - -OPJ_BOOL opj_jp2_decode(opj_jp2_t *jp2, - opj_stream_private_t *p_stream, - opj_image_t* p_image, - opj_event_mgr_t * p_manager) -{ - if (!p_image) { - return OPJ_FALSE; - } - - /* J2K decoding */ - if (! opj_j2k_decode(jp2->j2k, p_stream, p_image, p_manager)) { - opj_event_msg(p_manager, EVT_ERROR, - "Failed to decode the codestream in the JP2 file\n"); - return OPJ_FALSE; - } - - if (jp2->j2k->m_specific_param.m_decoder.m_numcomps_to_decode) { - /* Bypass all JP2 component transforms */ - return OPJ_TRUE; - } - - if (!jp2->ignore_pclr_cmap_cdef) { - if (!opj_jp2_check_color(p_image, &(jp2->color), p_manager)) { - return OPJ_FALSE; - } - - /* Set Image Color Space */ - if (jp2->enumcs == 16) { - p_image->color_space = OPJ_CLRSPC_SRGB; - } else if (jp2->enumcs == 17) { - p_image->color_space = OPJ_CLRSPC_GRAY; - } else if (jp2->enumcs == 18) { - p_image->color_space = OPJ_CLRSPC_SYCC; - } else if (jp2->enumcs == 24) { - p_image->color_space = OPJ_CLRSPC_EYCC; - } else if (jp2->enumcs == 12) { - p_image->color_space = OPJ_CLRSPC_CMYK; - } else { - p_image->color_space = OPJ_CLRSPC_UNKNOWN; - } - - if (jp2->color.jp2_pclr) { - /* Part 1, I.5.3.4: Either both or none : */ - if (!jp2->color.jp2_pclr->cmap) { - opj_jp2_free_pclr(&(jp2->color)); - } else { - if (!opj_jp2_apply_pclr(p_image, &(jp2->color), p_manager)) { - return OPJ_FALSE; - } - } - } - - /* Apply the color space if needed */ - if (jp2->color.jp2_cdef) { - opj_jp2_apply_cdef(p_image, &(jp2->color), p_manager); - } - - if (jp2->color.icc_profile_buf) { - p_image->icc_profile_buf = jp2->color.icc_profile_buf; - p_image->icc_profile_len = jp2->color.icc_profile_len; - jp2->color.icc_profile_buf = NULL; - } - } - - return OPJ_TRUE; -} - -static OPJ_BOOL opj_jp2_write_jp2h(opj_jp2_t *jp2, - opj_stream_private_t *stream, - opj_event_mgr_t * p_manager - ) -{ - opj_jp2_img_header_writer_handler_t l_writers [4]; - opj_jp2_img_header_writer_handler_t * l_current_writer; - - OPJ_INT32 i, l_nb_pass; - /* size of data for super box*/ - OPJ_UINT32 l_jp2h_size = 8; - OPJ_BOOL l_result = OPJ_TRUE; - - /* to store the data of the super box */ - OPJ_BYTE l_jp2h_data [8]; - - /* preconditions */ - assert(stream != 00); - assert(jp2 != 00); - assert(p_manager != 00); - - memset(l_writers, 0, sizeof(l_writers)); - - if (jp2->bpc == 255) { - l_nb_pass = 3; - l_writers[0].handler = opj_jp2_write_ihdr; - l_writers[1].handler = opj_jp2_write_bpcc; - l_writers[2].handler = opj_jp2_write_colr; - } else { - l_nb_pass = 2; - l_writers[0].handler = opj_jp2_write_ihdr; - l_writers[1].handler = opj_jp2_write_colr; - } - - if (jp2->color.jp2_cdef != NULL) { - l_writers[l_nb_pass].handler = opj_jp2_write_cdef; - l_nb_pass++; - } - - /* write box header */ - /* write JP2H type */ - opj_write_bytes(l_jp2h_data + 4, JP2_JP2H, 4); - - l_current_writer = l_writers; - for (i = 0; i < l_nb_pass; ++i) { - l_current_writer->m_data = l_current_writer->handler(jp2, - &(l_current_writer->m_size)); - if (l_current_writer->m_data == 00) { - opj_event_msg(p_manager, EVT_ERROR, - "Not enough memory to hold JP2 Header data\n"); - l_result = OPJ_FALSE; - break; - } - - l_jp2h_size += l_current_writer->m_size; - ++l_current_writer; - } - - if (! 
l_result) { - l_current_writer = l_writers; - for (i = 0; i < l_nb_pass; ++i) { - if (l_current_writer->m_data != 00) { - opj_free(l_current_writer->m_data); - } - ++l_current_writer; - } - - return OPJ_FALSE; - } - - /* write super box size */ - opj_write_bytes(l_jp2h_data, l_jp2h_size, 4); - - /* write super box data on stream */ - if (opj_stream_write_data(stream, l_jp2h_data, 8, p_manager) != 8) { - opj_event_msg(p_manager, EVT_ERROR, - "Stream error while writing JP2 Header box\n"); - l_result = OPJ_FALSE; - } - - if (l_result) { - l_current_writer = l_writers; - for (i = 0; i < l_nb_pass; ++i) { - if (opj_stream_write_data(stream, l_current_writer->m_data, - l_current_writer->m_size, p_manager) != l_current_writer->m_size) { - opj_event_msg(p_manager, EVT_ERROR, - "Stream error while writing JP2 Header box\n"); - l_result = OPJ_FALSE; - break; - } - ++l_current_writer; - } - } - - l_current_writer = l_writers; - - /* cleanup */ - for (i = 0; i < l_nb_pass; ++i) { - if (l_current_writer->m_data != 00) { - opj_free(l_current_writer->m_data); - } - ++l_current_writer; - } - - return l_result; -} - -static OPJ_BOOL opj_jp2_write_ftyp(opj_jp2_t *jp2, - opj_stream_private_t *cio, - opj_event_mgr_t * p_manager) -{ - OPJ_UINT32 i; - OPJ_UINT32 l_ftyp_size; - OPJ_BYTE * l_ftyp_data, * l_current_data_ptr; - OPJ_BOOL l_result; - - /* preconditions */ - assert(cio != 00); - assert(jp2 != 00); - assert(p_manager != 00); - l_ftyp_size = 16 + 4 * jp2->numcl; - - l_ftyp_data = (OPJ_BYTE *) opj_calloc(1, l_ftyp_size); - - if (l_ftyp_data == 00) { - opj_event_msg(p_manager, EVT_ERROR, "Not enough memory to handle ftyp data\n"); - return OPJ_FALSE; - } - - l_current_data_ptr = l_ftyp_data; - - opj_write_bytes(l_current_data_ptr, l_ftyp_size, 4); /* box size */ - l_current_data_ptr += 4; - - opj_write_bytes(l_current_data_ptr, JP2_FTYP, 4); /* FTYP */ - l_current_data_ptr += 4; - - opj_write_bytes(l_current_data_ptr, jp2->brand, 4); /* BR */ - l_current_data_ptr += 4; - - opj_write_bytes(l_current_data_ptr, jp2->minversion, 4); /* MinV */ - l_current_data_ptr += 4; - - for (i = 0; i < jp2->numcl; i++) { - opj_write_bytes(l_current_data_ptr, jp2->cl[i], 4); /* CL */ - } - - l_result = (opj_stream_write_data(cio, l_ftyp_data, l_ftyp_size, - p_manager) == l_ftyp_size); - if (! l_result) { - opj_event_msg(p_manager, EVT_ERROR, - "Error while writing ftyp data to stream\n"); - } - - opj_free(l_ftyp_data); - - return l_result; -} - -static OPJ_BOOL opj_jp2_write_jp2c(opj_jp2_t *jp2, - opj_stream_private_t *cio, - opj_event_mgr_t * p_manager) -{ - OPJ_OFF_T j2k_codestream_exit; - OPJ_BYTE l_data_header [8]; - - /* preconditions */ - assert(jp2 != 00); - assert(cio != 00); - assert(p_manager != 00); - assert(opj_stream_has_seek(cio)); - - j2k_codestream_exit = opj_stream_tell(cio); - opj_write_bytes(l_data_header, - (OPJ_UINT32)(j2k_codestream_exit - jp2->j2k_codestream_offset), - 4); /* size of codestream */ - opj_write_bytes(l_data_header + 4, JP2_JP2C, - 4); /* JP2C */ - - if (! opj_stream_seek(cio, jp2->j2k_codestream_offset, p_manager)) { - opj_event_msg(p_manager, EVT_ERROR, "Failed to seek in the stream.\n"); - return OPJ_FALSE; - } - - if (opj_stream_write_data(cio, l_data_header, 8, p_manager) != 8) { - opj_event_msg(p_manager, EVT_ERROR, "Failed to seek in the stream.\n"); - return OPJ_FALSE; - } - - if (! 
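opj_jp2_write_jp2c relies on a classic back-patching pattern: the JP2C box length cannot be known until the codestream has been written, so the writer records where the box started and afterwards seeks back to patch the 4-byte LBox. A stdio stand-in for the same idea (patch_box_length is an illustrative name):

#include <stdio.h>
#include <stdint.h>

static int patch_box_length(FILE *f, long box_start)
{
    long end = ftell(f);                        /* current end of the codestream */
    if (end < 0) {
        return -1;
    }
    uint32_t len = (uint32_t)(end - box_start);
    uint8_t be[4] = {
        (uint8_t)(len >> 24), (uint8_t)(len >> 16),
        (uint8_t)(len >> 8),  (uint8_t)len      /* big-endian LBox */
    };
    if (fseek(f, box_start, SEEK_SET) != 0 || fwrite(be, 1, 4, f) != 4) {
        return -1;
    }
    return fseek(f, end, SEEK_SET);             /* restore the write position */
}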
opj_stream_seek(cio, j2k_codestream_exit, p_manager)) { - opj_event_msg(p_manager, EVT_ERROR, "Failed to seek in the stream.\n"); - return OPJ_FALSE; - } - - return OPJ_TRUE; -} - -static OPJ_BOOL opj_jp2_write_jp(opj_jp2_t *jp2, - opj_stream_private_t *cio, - opj_event_mgr_t * p_manager) -{ - /* 12 bytes will be read */ - OPJ_BYTE l_signature_data [12]; - - /* preconditions */ - assert(cio != 00); - assert(jp2 != 00); - assert(p_manager != 00); - - OPJ_UNUSED(jp2); - - /* write box length */ - opj_write_bytes(l_signature_data, 12, 4); - /* writes box type */ - opj_write_bytes(l_signature_data + 4, JP2_JP, 4); - /* writes magic number*/ - opj_write_bytes(l_signature_data + 8, 0x0d0a870a, 4); - - if (opj_stream_write_data(cio, l_signature_data, 12, p_manager) != 12) { - return OPJ_FALSE; - } - - return OPJ_TRUE; -} - -/* ----------------------------------------------------------------------- */ -/* JP2 decoder interface */ -/* ----------------------------------------------------------------------- */ - -void opj_jp2_setup_decoder(opj_jp2_t *jp2, opj_dparameters_t *parameters) -{ - /* setup the J2K codec */ - opj_j2k_setup_decoder(jp2->j2k, parameters); - - /* further JP2 initializations go here */ - jp2->color.jp2_has_colr = 0; - jp2->ignore_pclr_cmap_cdef = parameters->flags & - OPJ_DPARAMETERS_IGNORE_PCLR_CMAP_CDEF_FLAG; -} - -OPJ_BOOL opj_jp2_set_threads(opj_jp2_t *jp2, OPJ_UINT32 num_threads) -{ - return opj_j2k_set_threads(jp2->j2k, num_threads); -} - -/* ----------------------------------------------------------------------- */ -/* JP2 encoder interface */ -/* ----------------------------------------------------------------------- */ - -OPJ_BOOL opj_jp2_setup_encoder(opj_jp2_t *jp2, - opj_cparameters_t *parameters, - opj_image_t *image, - opj_event_mgr_t * p_manager) -{ - OPJ_UINT32 i; - OPJ_UINT32 depth_0; - OPJ_UINT32 sign; - OPJ_UINT32 alpha_count; - OPJ_UINT32 color_channels = 0U; - OPJ_UINT32 alpha_channel = 0U; - - - if (!jp2 || !parameters || !image) { - return OPJ_FALSE; - } - - /* setup the J2K codec */ - /* ------------------- */ - - /* Check if number of components respects standard */ - if (image->numcomps < 1 || image->numcomps > 16384) { - opj_event_msg(p_manager, EVT_ERROR, - "Invalid number of components specified while setting up JP2 encoder\n"); - return OPJ_FALSE; - } - - if (opj_j2k_setup_encoder(jp2->j2k, parameters, image, - p_manager) == OPJ_FALSE) { - return OPJ_FALSE; - } - - /* setup the JP2 codec */ - /* ------------------- */ - - /* Profile box */ - - jp2->brand = JP2_JP2; /* BR */ - jp2->minversion = 0; /* MinV */ - jp2->numcl = 1; - jp2->cl = (OPJ_UINT32*) opj_malloc(jp2->numcl * sizeof(OPJ_UINT32)); - if (!jp2->cl) { - opj_event_msg(p_manager, EVT_ERROR, - "Not enough memory when setup the JP2 encoder\n"); - return OPJ_FALSE; - } - jp2->cl[0] = JP2_JP2; /* CL0 : JP2 */ - - /* Image Header box */ - - jp2->numcomps = image->numcomps; /* NC */ - jp2->comps = (opj_jp2_comps_t*) opj_malloc(jp2->numcomps * sizeof( - opj_jp2_comps_t)); - if (!jp2->comps) { - opj_event_msg(p_manager, EVT_ERROR, - "Not enough memory when setup the JP2 encoder\n"); - /* Memory of jp2->cl will be freed by opj_jp2_destroy */ - return OPJ_FALSE; - } - - jp2->h = image->y1 - image->y0; /* HEIGHT */ - jp2->w = image->x1 - image->x0; /* WIDTH */ - /* BPC */ - depth_0 = image->comps[0].prec - 1; - sign = image->comps[0].sgnd; - jp2->bpc = depth_0 + (sign << 7); - for (i = 1; i < image->numcomps; i++) { - OPJ_UINT32 depth = image->comps[i].prec - 1; - sign = image->comps[i].sgnd; - if 
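The loop above computes the IHDR BPC byte: precision minus one in the low seven bits, the sign flag in bit 7, and the sentinel 255 when components differ, which is what forces the separate BPCC box. A worked sketch, extended, per the IHDR field definition, to treat a sign mismatch the same way (types and names are illustrative):

#include <stdint.h>
#include <stddef.h>

typedef struct { uint32_t prec; uint32_t sgnd; } comp_info;

static uint8_t ihdr_bpc(const comp_info *c, size_t n)
{
    uint8_t bpc = (uint8_t)((c[0].prec - 1) | (c[0].sgnd << 7));
    for (size_t i = 1; i < n; ++i) {
        if (c[i].prec != c[0].prec || c[i].sgnd != c[0].sgnd) {
            return 255;               /* depth/sign varies: signal "see BPCC" */
        }
    }
    return bpc;                       /* e.g. 8-bit unsigned components give 7 */
}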
(depth_0 != depth) { - jp2->bpc = 255; - } - } - jp2->C = 7; /* C : Always 7 */ - jp2->UnkC = 0; /* UnkC, colorspace specified in colr box */ - jp2->IPR = 0; /* IPR, no intellectual property */ - - /* BitsPerComponent box */ - for (i = 0; i < image->numcomps; i++) { - jp2->comps[i].bpcc = image->comps[i].prec - 1 + (image->comps[i].sgnd << 7); - } - - /* Colour Specification box */ - if (image->icc_profile_len) { - jp2->meth = 2; - jp2->enumcs = 0; - } else { - jp2->meth = 1; - if (image->color_space == 1) { - jp2->enumcs = 16; /* sRGB as defined by IEC 61966-2-1 */ - } else if (image->color_space == 2) { - jp2->enumcs = 17; /* greyscale */ - } else if (image->color_space == 3) { - jp2->enumcs = 18; /* YUV */ - } - } - - /* Channel Definition box */ - /* FIXME not provided by parameters */ - /* We try to do what we can... */ - alpha_count = 0U; - for (i = 0; i < image->numcomps; i++) { - if (image->comps[i].alpha != 0) { - alpha_count++; - alpha_channel = i; - } - } - if (alpha_count == 1U) { /* no way to deal with more than 1 alpha channel */ - switch (jp2->enumcs) { - case 16: - case 18: - color_channels = 3; - break; - case 17: - color_channels = 1; - break; - default: - alpha_count = 0U; - break; - } - if (alpha_count == 0U) { - opj_event_msg(p_manager, EVT_WARNING, - "Alpha channel specified but unknown enumcs. No cdef box will be created.\n"); - } else if (image->numcomps < (color_channels + 1)) { - opj_event_msg(p_manager, EVT_WARNING, - "Alpha channel specified but not enough image components for an automatic cdef box creation.\n"); - alpha_count = 0U; - } else if ((OPJ_UINT32)alpha_channel < color_channels) { - opj_event_msg(p_manager, EVT_WARNING, - "Alpha channel position conflicts with color channel. No cdef box will be created.\n"); - alpha_count = 0U; - } - } else if (alpha_count > 1) { - opj_event_msg(p_manager, EVT_WARNING, - "Multiple alpha channels specified. 
No cdef box will be created.\n"); - } - if (alpha_count == 1U) { /* if here, we know what we can do */ - jp2->color.jp2_cdef = (opj_jp2_cdef_t*)opj_malloc(sizeof(opj_jp2_cdef_t)); - if (!jp2->color.jp2_cdef) { - opj_event_msg(p_manager, EVT_ERROR, - "Not enough memory to setup the JP2 encoder\n"); - return OPJ_FALSE; - } - /* no memset needed, all values will be overwritten except if jp2->color.jp2_cdef->info allocation fails, */ - /* in which case jp2->color.jp2_cdef->info will be NULL => valid for destruction */ - jp2->color.jp2_cdef->info = (opj_jp2_cdef_info_t*) opj_malloc( - image->numcomps * sizeof(opj_jp2_cdef_info_t)); - if (!jp2->color.jp2_cdef->info) { - /* memory will be freed by opj_jp2_destroy */ - opj_event_msg(p_manager, EVT_ERROR, - "Not enough memory to setup the JP2 encoder\n"); - return OPJ_FALSE; - } - jp2->color.jp2_cdef->n = (OPJ_UINT16) - image->numcomps; /* cast is valid : image->numcomps [1,16384] */ - for (i = 0U; i < color_channels; i++) { - jp2->color.jp2_cdef->info[i].cn = (OPJ_UINT16) - i; /* cast is valid : image->numcomps [1,16384] */ - jp2->color.jp2_cdef->info[i].typ = 0U; - jp2->color.jp2_cdef->info[i].asoc = (OPJ_UINT16)(i + - 1U); /* No overflow + cast is valid : image->numcomps [1,16384] */ - } - for (; i < image->numcomps; i++) { - if (image->comps[i].alpha != 0) { /* we'll be here exactly once */ - jp2->color.jp2_cdef->info[i].cn = (OPJ_UINT16) - i; /* cast is valid : image->numcomps [1,16384] */ - jp2->color.jp2_cdef->info[i].typ = 1U; /* Opacity channel */ - jp2->color.jp2_cdef->info[i].asoc = - 0U; /* Apply alpha channel to the whole image */ - } else { - /* Unknown channel */ - jp2->color.jp2_cdef->info[i].cn = (OPJ_UINT16) - i; /* cast is valid : image->numcomps [1,16384] */ - jp2->color.jp2_cdef->info[i].typ = 65535U; - jp2->color.jp2_cdef->info[i].asoc = 65535U; - } - } - } - - jp2->precedence = 0; /* PRECEDENCE */ - jp2->approx = 0; /* APPROX */ - - jp2->jpip_on = parameters->jpip_on; - - return OPJ_TRUE; -} - -OPJ_BOOL opj_jp2_encode(opj_jp2_t *jp2, - opj_stream_private_t *stream, - opj_event_mgr_t * p_manager) -{ - return opj_j2k_encode(jp2->j2k, stream, p_manager); -} - -OPJ_BOOL opj_jp2_end_decompress(opj_jp2_t *jp2, - opj_stream_private_t *cio, - opj_event_mgr_t * p_manager - ) -{ - /* preconditions */ - assert(jp2 != 00); - assert(cio != 00); - assert(p_manager != 00); - - /* customization of the end encoding */ - if (! opj_jp2_setup_end_header_reading(jp2, p_manager)) { - return OPJ_FALSE; - } - - /* write header */ - if (! opj_jp2_exec(jp2, jp2->m_procedure_list, cio, p_manager)) { - return OPJ_FALSE; - } - - return opj_j2k_end_decompress(jp2->j2k, cio, p_manager); -} - -OPJ_BOOL opj_jp2_end_compress(opj_jp2_t *jp2, - opj_stream_private_t *cio, - opj_event_mgr_t * p_manager - ) -{ - /* preconditions */ - assert(jp2 != 00); - assert(cio != 00); - assert(p_manager != 00); - - /* customization of the end encoding */ - if (! opj_jp2_setup_end_header_writing(jp2, p_manager)) { - return OPJ_FALSE; - } - - if (! opj_j2k_end_compress(jp2->j2k, cio, p_manager)) { - return OPJ_FALSE; - } - - /* write header */ - return opj_jp2_exec(jp2, jp2->m_procedure_list, cio, p_manager); -} - -static OPJ_BOOL opj_jp2_setup_end_header_writing(opj_jp2_t *jp2, - opj_event_mgr_t * p_manager) -{ - /* preconditions */ - assert(jp2 != 00); - assert(p_manager != 00); - -#ifdef USE_JPIP - if (jp2->jpip_on) { - if (! opj_procedure_list_add_procedure(jp2->m_procedure_list, - (opj_procedure)opj_jpip_write_iptr, p_manager)) { - return OPJ_FALSE; - } - } -#endif - if (! 
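The channel-definition table built above for the one-alpha case follows a fixed recipe: colour channels get typ = 0 and asoc = 1..N, the alpha channel gets typ = 1 and asoc = 0 (applies to the whole image), and anything else is marked 65535/65535 ("unknown"). A sketch under the same assumptions (exactly one alpha channel, positioned after the colour channels; fill_cdef is an illustrative name):

#include <stdint.h>

typedef struct { uint16_t cn, typ, asoc; } cdef_entry;

/* nc = total components, colors = leading colour channels, alpha = alpha index */
static void fill_cdef(cdef_entry *info, uint16_t nc, uint16_t colors, uint16_t alpha)
{
    for (uint16_t i = 0; i < nc; ++i) {
        info[i].cn = i;
        if (i < colors) {
            info[i].typ = 0;                   /* colour channel */
            info[i].asoc = (uint16_t)(i + 1);  /* associated with colour i+1 */
        } else if (i == alpha) {
            info[i].typ = 1;                   /* opacity channel */
            info[i].asoc = 0;                  /* applies to the whole image */
        } else {
            info[i].typ = 65535;               /* unknown channel */
            info[i].asoc = 65535;
        }
    }
}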
opj_procedure_list_add_procedure(jp2->m_procedure_list, - (opj_procedure)opj_jp2_write_jp2c, p_manager)) { - return OPJ_FALSE; - } - /* DEVELOPER CORNER, add your custom procedures */ -#ifdef USE_JPIP - if (jp2->jpip_on) { - if (! opj_procedure_list_add_procedure(jp2->m_procedure_list, - (opj_procedure)opj_jpip_write_cidx, p_manager)) { - return OPJ_FALSE; - } - if (! opj_procedure_list_add_procedure(jp2->m_procedure_list, - (opj_procedure)opj_jpip_write_fidx, p_manager)) { - return OPJ_FALSE; - } - } -#endif - return OPJ_TRUE; -} - -static OPJ_BOOL opj_jp2_setup_end_header_reading(opj_jp2_t *jp2, - opj_event_mgr_t * p_manager) -{ - /* preconditions */ - assert(jp2 != 00); - assert(p_manager != 00); - - if (! opj_procedure_list_add_procedure(jp2->m_procedure_list, - (opj_procedure)opj_jp2_read_header_procedure, p_manager)) { - return OPJ_FALSE; - } - /* DEVELOPER CORNER, add your custom procedures */ - - return OPJ_TRUE; -} - -static OPJ_BOOL opj_jp2_default_validation(opj_jp2_t * jp2, - opj_stream_private_t *cio, - opj_event_mgr_t * p_manager - ) -{ - OPJ_BOOL l_is_valid = OPJ_TRUE; - OPJ_UINT32 i; - - /* preconditions */ - assert(jp2 != 00); - assert(cio != 00); - assert(p_manager != 00); - - OPJ_UNUSED(p_manager); - - /* JPEG2000 codec validation */ - - /* STATE checking */ - /* make sure the state is at 0 */ - l_is_valid &= (jp2->jp2_state == JP2_STATE_NONE); - - /* make sure not reading a jp2h ???? WEIRD */ - l_is_valid &= (jp2->jp2_img_state == JP2_IMG_STATE_NONE); - - /* POINTER validation */ - /* make sure a j2k codec is present */ - l_is_valid &= (jp2->j2k != 00); - - /* make sure a procedure list is present */ - l_is_valid &= (jp2->m_procedure_list != 00); - - /* make sure a validation list is present */ - l_is_valid &= (jp2->m_validation_list != 00); - - /* PARAMETER VALIDATION */ - /* number of components */ - l_is_valid &= (jp2->numcl > 0); - /* width */ - l_is_valid &= (jp2->h > 0); - /* height */ - l_is_valid &= (jp2->w > 0); - /* precision */ - for (i = 0; i < jp2->numcomps; ++i) { - l_is_valid &= ((jp2->comps[i].bpcc & 0x7FU) < - 38U); /* 0 is valid, ignore sign for check */ - } - - /* METH */ - l_is_valid &= ((jp2->meth > 0) && (jp2->meth < 3)); - - /* stream validation */ - /* back and forth is needed */ - l_is_valid &= opj_stream_has_seek(cio); - - return l_is_valid; -} - -static OPJ_BOOL opj_jp2_read_header_procedure(opj_jp2_t *jp2, - opj_stream_private_t *stream, - opj_event_mgr_t * p_manager - ) -{ - opj_jp2_box_t box; - OPJ_UINT32 l_nb_bytes_read; - const opj_jp2_header_handler_t * l_current_handler; - const opj_jp2_header_handler_t * l_current_handler_misplaced; - OPJ_UINT32 l_last_data_size = OPJ_BOX_SIZE; - OPJ_UINT32 l_current_data_size; - OPJ_BYTE * l_current_data = 00; - - /* preconditions */ - assert(stream != 00); - assert(jp2 != 00); - assert(p_manager != 00); - - l_current_data = (OPJ_BYTE*)opj_calloc(1, l_last_data_size); - - if (l_current_data == 00) { - opj_event_msg(p_manager, EVT_ERROR, - "Not enough memory to handle jpeg2000 file header\n"); - return OPJ_FALSE; - } - - while (opj_jp2_read_boxhdr(&box, &l_nb_bytes_read, stream, p_manager)) { - /* is it the codestream box ? 
*/ - if (box.type == JP2_JP2C) { - if (jp2->jp2_state & JP2_STATE_HEADER) { - jp2->jp2_state |= JP2_STATE_CODESTREAM; - opj_free(l_current_data); - return OPJ_TRUE; - } else { - opj_event_msg(p_manager, EVT_ERROR, "bad placed jpeg codestream\n"); - opj_free(l_current_data); - return OPJ_FALSE; - } - } else if (box.length == 0) { - opj_event_msg(p_manager, EVT_ERROR, "Cannot handle box of undefined sizes\n"); - opj_free(l_current_data); - return OPJ_FALSE; - } - /* testcase 1851.pdf.SIGSEGV.ce9.948 */ - else if (box.length < l_nb_bytes_read) { - opj_event_msg(p_manager, EVT_ERROR, "invalid box size %d (%x)\n", box.length, - box.type); - opj_free(l_current_data); - return OPJ_FALSE; - } - - l_current_handler = opj_jp2_find_handler(box.type); - l_current_handler_misplaced = opj_jp2_img_find_handler(box.type); - l_current_data_size = box.length - l_nb_bytes_read; - - if ((l_current_handler != 00) || (l_current_handler_misplaced != 00)) { - if (l_current_handler == 00) { - opj_event_msg(p_manager, EVT_WARNING, - "Found a misplaced '%c%c%c%c' box outside jp2h box\n", - (OPJ_BYTE)(box.type >> 24), (OPJ_BYTE)(box.type >> 16), - (OPJ_BYTE)(box.type >> 8), (OPJ_BYTE)(box.type >> 0)); - if (jp2->jp2_state & JP2_STATE_HEADER) { - /* read anyway, we already have jp2h */ - l_current_handler = l_current_handler_misplaced; - } else { - opj_event_msg(p_manager, EVT_WARNING, - "JPEG2000 Header box not read yet, '%c%c%c%c' box will be ignored\n", - (OPJ_BYTE)(box.type >> 24), (OPJ_BYTE)(box.type >> 16), - (OPJ_BYTE)(box.type >> 8), (OPJ_BYTE)(box.type >> 0)); - jp2->jp2_state |= JP2_STATE_UNKNOWN; - if (opj_stream_skip(stream, l_current_data_size, - p_manager) != l_current_data_size) { - opj_event_msg(p_manager, EVT_ERROR, - "Problem with skipping JPEG2000 box, stream error\n"); - opj_free(l_current_data); - return OPJ_FALSE; - } - continue; - } - } - if ((OPJ_OFF_T)l_current_data_size > opj_stream_get_number_byte_left(stream)) { - /* do not even try to malloc if we can't read */ - opj_event_msg(p_manager, EVT_ERROR, - "Invalid box size %d for box '%c%c%c%c'. Need %d bytes, %d bytes remaining \n", - box.length, (OPJ_BYTE)(box.type >> 24), (OPJ_BYTE)(box.type >> 16), - (OPJ_BYTE)(box.type >> 8), (OPJ_BYTE)(box.type >> 0), l_current_data_size, - (OPJ_UINT32)opj_stream_get_number_byte_left(stream)); - opj_free(l_current_data); - return OPJ_FALSE; - } - if (l_current_data_size > l_last_data_size) { - OPJ_BYTE* new_current_data = (OPJ_BYTE*)opj_realloc(l_current_data, - l_current_data_size); - if (!new_current_data) { - opj_free(l_current_data); - opj_event_msg(p_manager, EVT_ERROR, - "Not enough memory to handle jpeg2000 box\n"); - return OPJ_FALSE; - } - l_current_data = new_current_data; - l_last_data_size = l_current_data_size; - } - - l_nb_bytes_read = (OPJ_UINT32)opj_stream_read_data(stream, l_current_data, - l_current_data_size, p_manager); - if (l_nb_bytes_read != l_current_data_size) { - opj_event_msg(p_manager, EVT_ERROR, - "Problem with reading JPEG2000 box, stream error\n"); - opj_free(l_current_data); - return OPJ_FALSE; - } - - if (! 
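Box dispatch in this reader is a static table from 32-bit box type to handler plus a linear scan: exactly the shape of jp2_header[], jp2_img_header[] and the two find_handler helpers. A reduced sketch (the types and the stub reader are illustrative):

#include <stdint.h>
#include <stddef.h>

typedef int (*box_reader)(const uint8_t *data, uint32_t size);

typedef struct { uint32_t id; box_reader fn; } box_handler;

static int read_ihdr_stub(const uint8_t *d, uint32_t n) { (void)d; (void)n; return 1; }

static const box_handler handlers[] = {
    { 0x69686472u /* 'ihdr' */, read_ihdr_stub },
};

/* Linear scan; returns NULL for unknown box types, as the callers expect. */
static box_reader find_handler(uint32_t id)
{
    for (size_t i = 0; i < sizeof(handlers) / sizeof(handlers[0]); ++i) {
        if (handlers[i].id == id) {
            return handlers[i].fn;
        }
    }
    return NULL;
}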
l_current_handler->handler(jp2, l_current_data, l_current_data_size, - p_manager)) { - opj_free(l_current_data); - return OPJ_FALSE; - } - } else { - if (!(jp2->jp2_state & JP2_STATE_SIGNATURE)) { - opj_event_msg(p_manager, EVT_ERROR, - "Malformed JP2 file format: first box must be JPEG 2000 signature box\n"); - opj_free(l_current_data); - return OPJ_FALSE; - } - if (!(jp2->jp2_state & JP2_STATE_FILE_TYPE)) { - opj_event_msg(p_manager, EVT_ERROR, - "Malformed JP2 file format: second box must be file type box\n"); - opj_free(l_current_data); - return OPJ_FALSE; - } - jp2->jp2_state |= JP2_STATE_UNKNOWN; - if (opj_stream_skip(stream, l_current_data_size, - p_manager) != l_current_data_size) { - if (jp2->jp2_state & JP2_STATE_CODESTREAM) { - /* If we already read the codestream, do not error out */ - /* Needed for data/input/nonregression/issue254.jp2 */ - opj_event_msg(p_manager, EVT_WARNING, - "Problem with skipping JPEG2000 box, stream error\n"); - opj_free(l_current_data); - return OPJ_TRUE; - } else { - opj_event_msg(p_manager, EVT_ERROR, - "Problem with skipping JPEG2000 box, stream error\n"); - opj_free(l_current_data); - return OPJ_FALSE; - } - } - } - } - - opj_free(l_current_data); - - return OPJ_TRUE; -} - -/** - * Executes the given procedures on the given codec. - * - * @param p_procedure_list the list of procedures to execute - * @param jp2 the jpeg2000 file codec to execute the procedures on. - * @param stream the stream to execute the procedures on. - * @param p_manager the user manager. - * - * @return true if all the procedures were successfully executed. - */ -static OPJ_BOOL opj_jp2_exec(opj_jp2_t * jp2, - opj_procedure_list_t * p_procedure_list, - opj_stream_private_t *stream, - opj_event_mgr_t * p_manager - ) - -{ - OPJ_BOOL(** l_procedure)(opj_jp2_t * jp2, opj_stream_private_t *, - opj_event_mgr_t *) = 00; - OPJ_BOOL l_result = OPJ_TRUE; - OPJ_UINT32 l_nb_proc, i; - - /* preconditions */ - assert(p_procedure_list != 00); - assert(jp2 != 00); - assert(stream != 00); - assert(p_manager != 00); - - l_nb_proc = opj_procedure_list_get_nb_procedures(p_procedure_list); - l_procedure = (OPJ_BOOL(**)(opj_jp2_t * jp2, opj_stream_private_t *, - opj_event_mgr_t *)) opj_procedure_list_get_first_procedure(p_procedure_list); - - for (i = 0; i < l_nb_proc; ++i) { - l_result = l_result && (*l_procedure)(jp2, stream, p_manager); - ++l_procedure; - } - - /* and clear the procedure list at the end. */ - opj_procedure_list_clear(p_procedure_list); - return l_result; -} - -OPJ_BOOL opj_jp2_start_compress(opj_jp2_t *jp2, - opj_stream_private_t *stream, - opj_image_t * p_image, - opj_event_mgr_t * p_manager - ) -{ - /* preconditions */ - assert(jp2 != 00); - assert(stream != 00); - assert(p_manager != 00); - - /* customization of the validation */ - if (! opj_jp2_setup_encoding_validation(jp2, p_manager)) { - return OPJ_FALSE; - } - - /* validation of the parameters codec */ - if (! opj_jp2_exec(jp2, jp2->m_validation_list, stream, p_manager)) { - return OPJ_FALSE; - } - - /* customization of the encoding */ - if (! opj_jp2_setup_header_writing(jp2, p_manager)) { - return OPJ_FALSE; - } - - /* write header */ - if (! 
opj_jp2_exec(jp2, jp2->m_procedure_list, stream, p_manager)) { - return OPJ_FALSE; - } - - return opj_j2k_start_compress(jp2->j2k, stream, p_image, p_manager); -} - -static const opj_jp2_header_handler_t * opj_jp2_find_handler(OPJ_UINT32 p_id) -{ - OPJ_UINT32 i, l_handler_size = sizeof(jp2_header) / sizeof( - opj_jp2_header_handler_t); - - for (i = 0; i < l_handler_size; ++i) { - if (jp2_header[i].id == p_id) { - return &jp2_header[i]; - } - } - return NULL; -} - -/** - * Finds the image execution function related to the given box id. - * - * @param p_id the id of the handler to fetch. - * - * @return the given handler or 00 if it could not be found. - */ -static const opj_jp2_header_handler_t * opj_jp2_img_find_handler( - OPJ_UINT32 p_id) -{ - OPJ_UINT32 i, l_handler_size = sizeof(jp2_img_header) / sizeof( - opj_jp2_header_handler_t); - for (i = 0; i < l_handler_size; ++i) { - if (jp2_img_header[i].id == p_id) { - return &jp2_img_header[i]; - } - } - - return NULL; -} - -/** - * Reads a jpeg2000 file signature box. - * - * @param p_header_data the data contained in the signature box. - * @param jp2 the jpeg2000 file codec. - * @param p_header_size the size of the data contained in the signature box. - * @param p_manager the user event manager. - * - * @return true if the file signature box is valid. - */ -static OPJ_BOOL opj_jp2_read_jp(opj_jp2_t *jp2, - OPJ_BYTE * p_header_data, - OPJ_UINT32 p_header_size, - opj_event_mgr_t * p_manager - ) - -{ - OPJ_UINT32 l_magic_number; - - /* preconditions */ - assert(p_header_data != 00); - assert(jp2 != 00); - assert(p_manager != 00); - - if (jp2->jp2_state != JP2_STATE_NONE) { - opj_event_msg(p_manager, EVT_ERROR, - "The signature box must be the first box in the file.\n"); - return OPJ_FALSE; - } - - /* assure length of data is correct (4 -> magic number) */ - if (p_header_size != 4) { - opj_event_msg(p_manager, EVT_ERROR, "Error with JP signature Box size\n"); - return OPJ_FALSE; - } - - /* rearrange data */ - opj_read_bytes(p_header_data, &l_magic_number, 4); - if (l_magic_number != 0x0d0a870a) { - opj_event_msg(p_manager, EVT_ERROR, - "Error with JP Signature : bad magic number\n"); - return OPJ_FALSE; - } - - jp2->jp2_state |= JP2_STATE_SIGNATURE; - - return OPJ_TRUE; -} - -/** - * Reads a a FTYP box - File type box - * - * @param p_header_data the data contained in the FTYP box. - * @param jp2 the jpeg2000 file codec. - * @param p_header_size the size of the data contained in the FTYP box. - * @param p_manager the user event manager. - * - * @return true if the FTYP box is valid. 
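 *
 * For orientation, the payload this reader expects, sketched from the
 * opj_read_bytes calls below (not a normative layout):
 *   BR    4 bytes          brand               -> jp2->brand
 *   MinV  4 bytes          minor version       -> jp2->minversion
 *   CLi   4 bytes each     compatibility list  -> jp2->cl[0..numcl-1]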
- */ -static OPJ_BOOL opj_jp2_read_ftyp(opj_jp2_t *jp2, - OPJ_BYTE * p_header_data, - OPJ_UINT32 p_header_size, - opj_event_mgr_t * p_manager - ) -{ - OPJ_UINT32 i, l_remaining_bytes; - - /* preconditions */ - assert(p_header_data != 00); - assert(jp2 != 00); - assert(p_manager != 00); - - if (jp2->jp2_state != JP2_STATE_SIGNATURE) { - opj_event_msg(p_manager, EVT_ERROR, - "The ftyp box must be the second box in the file.\n"); - return OPJ_FALSE; - } - - /* assure length of data is correct */ - if (p_header_size < 8) { - opj_event_msg(p_manager, EVT_ERROR, "Error with FTYP signature Box size\n"); - return OPJ_FALSE; - } - - opj_read_bytes(p_header_data, &jp2->brand, 4); /* BR */ - p_header_data += 4; - - opj_read_bytes(p_header_data, &jp2->minversion, 4); /* MinV */ - p_header_data += 4; - - l_remaining_bytes = p_header_size - 8; - - /* the number of remaining bytes should be a multiple of 4 */ - if ((l_remaining_bytes & 0x3) != 0) { - opj_event_msg(p_manager, EVT_ERROR, "Error with FTYP signature Box size\n"); - return OPJ_FALSE; - } - - /* div by 4 */ - jp2->numcl = l_remaining_bytes >> 2; - if (jp2->numcl) { - jp2->cl = (OPJ_UINT32 *) opj_calloc(jp2->numcl, sizeof(OPJ_UINT32)); - if (jp2->cl == 00) { - opj_event_msg(p_manager, EVT_ERROR, "Not enough memory with FTYP Box\n"); - return OPJ_FALSE; - } - } - - for (i = 0; i < jp2->numcl; ++i) { - opj_read_bytes(p_header_data, &jp2->cl[i], 4); /* CLi */ - p_header_data += 4; - } - - jp2->jp2_state |= JP2_STATE_FILE_TYPE; - - return OPJ_TRUE; -} - -static OPJ_BOOL opj_jp2_skip_jp2c(opj_jp2_t *jp2, - opj_stream_private_t *stream, - opj_event_mgr_t * p_manager) -{ - /* preconditions */ - assert(jp2 != 00); - assert(stream != 00); - assert(p_manager != 00); - - jp2->j2k_codestream_offset = opj_stream_tell(stream); - - if (opj_stream_skip(stream, 8, p_manager) != 8) { - return OPJ_FALSE; - } - - return OPJ_TRUE; -} - -static OPJ_BOOL opj_jpip_skip_iptr(opj_jp2_t *jp2, - opj_stream_private_t *stream, - opj_event_mgr_t * p_manager) -{ - /* preconditions */ - assert(jp2 != 00); - assert(stream != 00); - assert(p_manager != 00); - - jp2->jpip_iptr_offset = opj_stream_tell(stream); - - if (opj_stream_skip(stream, 24, p_manager) != 24) { - return OPJ_FALSE; - } - - return OPJ_TRUE; -} - -/** - * Reads the Jpeg2000 file Header box - JP2 Header box (warning, this is a super box). - * - * @param p_header_data the data contained in the file header box. - * @param jp2 the jpeg2000 file codec. - * @param p_header_size the size of the data contained in the file header box. - * @param p_manager the user event manager. - * - * @return true if the JP2 Header box was successfully recognized. -*/ -static OPJ_BOOL opj_jp2_read_jp2h(opj_jp2_t *jp2, - OPJ_BYTE *p_header_data, - OPJ_UINT32 p_header_size, - opj_event_mgr_t * p_manager - ) -{ - OPJ_UINT32 l_box_size = 0, l_current_data_size = 0; - opj_jp2_box_t box; - const opj_jp2_header_handler_t * l_current_handler; - OPJ_BOOL l_has_ihdr = 0; - - /* preconditions */ - assert(p_header_data != 00); - assert(jp2 != 00); - assert(p_manager != 00); - - /* make sure the box is well placed */ - if ((jp2->jp2_state & JP2_STATE_FILE_TYPE) != JP2_STATE_FILE_TYPE) { - opj_event_msg(p_manager, EVT_ERROR, - "The box must be the first box in the file.\n"); - return OPJ_FALSE; - } - - jp2->jp2_img_state = JP2_IMG_STATE_NONE; - - /* iterate while remaining data */ - while (p_header_size > 0) { - - if (! 
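/* peel the next sub-box header off the in-memory jp2h payload; unlike
   the top-level box walk, this parses from a byte buffer rather than
   from the stream */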
opj_jp2_read_boxhdr_char(&box, p_header_data, &l_box_size, p_header_size, - p_manager)) { - opj_event_msg(p_manager, EVT_ERROR, - "Stream error while reading JP2 Header box\n"); - return OPJ_FALSE; - } - - if (box.length > p_header_size) { - opj_event_msg(p_manager, EVT_ERROR, - "Stream error while reading JP2 Header box: box length is inconsistent.\n"); - return OPJ_FALSE; - } - - l_current_handler = opj_jp2_img_find_handler(box.type); - l_current_data_size = box.length - l_box_size; - p_header_data += l_box_size; - - if (l_current_handler != 00) { - if (! l_current_handler->handler(jp2, p_header_data, l_current_data_size, - p_manager)) { - return OPJ_FALSE; - } - } else { - jp2->jp2_img_state |= JP2_IMG_STATE_UNKNOWN; - } - - if (box.type == JP2_IHDR) { - l_has_ihdr = 1; - } - - p_header_data += l_current_data_size; - p_header_size -= box.length; - } - - if (l_has_ihdr == 0) { - opj_event_msg(p_manager, EVT_ERROR, - "Stream error while reading JP2 Header box: no 'ihdr' box.\n"); - return OPJ_FALSE; - } - - jp2->jp2_state |= JP2_STATE_HEADER; - jp2->has_jp2h = 1; - - return OPJ_TRUE; -} - -static OPJ_BOOL opj_jp2_read_boxhdr_char(opj_jp2_box_t *box, - OPJ_BYTE * p_data, - OPJ_UINT32 * p_number_bytes_read, - OPJ_UINT32 p_box_max_size, - opj_event_mgr_t * p_manager - ) -{ - OPJ_UINT32 l_value; - - /* preconditions */ - assert(p_data != 00); - assert(box != 00); - assert(p_number_bytes_read != 00); - assert(p_manager != 00); - - if (p_box_max_size < 8) { - opj_event_msg(p_manager, EVT_ERROR, "Cannot handle box of less than 8 bytes\n"); - return OPJ_FALSE; - } - - /* process read data */ - opj_read_bytes(p_data, &l_value, 4); - p_data += 4; - box->length = (OPJ_UINT32)(l_value); - - opj_read_bytes(p_data, &l_value, 4); - p_data += 4; - box->type = (OPJ_UINT32)(l_value); - - *p_number_bytes_read = 8; - - /* do we have a "special very large box ?" */ - /* read then the XLBox */ - if (box->length == 1) { - OPJ_UINT32 l_xl_part_size; - - if (p_box_max_size < 16) { - opj_event_msg(p_manager, EVT_ERROR, - "Cannot handle XL box of less than 16 bytes\n"); - return OPJ_FALSE; - } - - opj_read_bytes(p_data, &l_xl_part_size, 4); - p_data += 4; - *p_number_bytes_read += 4; - - if (l_xl_part_size != 0) { - opj_event_msg(p_manager, EVT_ERROR, - "Cannot handle box sizes higher than 2^32\n"); - return OPJ_FALSE; - } - - opj_read_bytes(p_data, &l_value, 4); - *p_number_bytes_read += 4; - box->length = (OPJ_UINT32)(l_value); - - if (box->length == 0) { - opj_event_msg(p_manager, EVT_ERROR, "Cannot handle box of undefined sizes\n"); - return OPJ_FALSE; - } - } else if (box->length == 0) { - opj_event_msg(p_manager, EVT_ERROR, "Cannot handle box of undefined sizes\n"); - return OPJ_FALSE; - } - if (box->length < *p_number_bytes_read) { - opj_event_msg(p_manager, EVT_ERROR, "Box length is inconsistent.\n"); - return OPJ_FALSE; - } - return OPJ_TRUE; -} - -OPJ_BOOL opj_jp2_read_header(opj_stream_private_t *p_stream, - opj_jp2_t *jp2, - opj_image_t ** p_image, - opj_event_mgr_t * p_manager - ) -{ - /* preconditions */ - assert(jp2 != 00); - assert(p_stream != 00); - assert(p_manager != 00); - - /* customization of the validation */ - if (! opj_jp2_setup_decoding_validation(jp2, p_manager)) { - return OPJ_FALSE; - } - - /* customization of the encoding */ - if (! opj_jp2_setup_header_reading(jp2, p_manager)) { - return OPJ_FALSE; - } - - /* validation of the parameters codec */ - if (! opj_jp2_exec(jp2, jp2->m_validation_list, p_stream, p_manager)) { - return OPJ_FALSE; - } - - /* read header */ - if (! 
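/* this runs the procedures registered by opj_jp2_setup_header_reading,
   i.e. the full box walk of opj_jp2_read_header_procedure */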
opj_jp2_exec(jp2, jp2->m_procedure_list, p_stream, p_manager)) { - return OPJ_FALSE; - } - if (jp2->has_jp2h == 0) { - opj_event_msg(p_manager, EVT_ERROR, "JP2H box missing. Required.\n"); - return OPJ_FALSE; - } - if (jp2->has_ihdr == 0) { - opj_event_msg(p_manager, EVT_ERROR, "IHDR box_missing. Required.\n"); - return OPJ_FALSE; - } - - return opj_j2k_read_header(p_stream, - jp2->j2k, - p_image, - p_manager); -} - -static OPJ_BOOL opj_jp2_setup_encoding_validation(opj_jp2_t *jp2, - opj_event_mgr_t * p_manager) -{ - /* preconditions */ - assert(jp2 != 00); - assert(p_manager != 00); - - if (! opj_procedure_list_add_procedure(jp2->m_validation_list, - (opj_procedure)opj_jp2_default_validation, p_manager)) { - return OPJ_FALSE; - } - /* DEVELOPER CORNER, add your custom validation procedure */ - - return OPJ_TRUE; -} - -static OPJ_BOOL opj_jp2_setup_decoding_validation(opj_jp2_t *jp2, - opj_event_mgr_t * p_manager) -{ - /* preconditions */ - assert(jp2 != 00); - assert(p_manager != 00); - - OPJ_UNUSED(jp2); - OPJ_UNUSED(p_manager); - - /* DEVELOPER CORNER, add your custom validation procedure */ - - return OPJ_TRUE; -} - -static OPJ_BOOL opj_jp2_setup_header_writing(opj_jp2_t *jp2, - opj_event_mgr_t * p_manager) -{ - /* preconditions */ - assert(jp2 != 00); - assert(p_manager != 00); - - if (! opj_procedure_list_add_procedure(jp2->m_procedure_list, - (opj_procedure)opj_jp2_write_jp, p_manager)) { - return OPJ_FALSE; - } - if (! opj_procedure_list_add_procedure(jp2->m_procedure_list, - (opj_procedure)opj_jp2_write_ftyp, p_manager)) { - return OPJ_FALSE; - } - if (! opj_procedure_list_add_procedure(jp2->m_procedure_list, - (opj_procedure)opj_jp2_write_jp2h, p_manager)) { - return OPJ_FALSE; - } - if (jp2->jpip_on) { - if (! opj_procedure_list_add_procedure(jp2->m_procedure_list, - (opj_procedure)opj_jpip_skip_iptr, p_manager)) { - return OPJ_FALSE; - } - } - if (! opj_procedure_list_add_procedure(jp2->m_procedure_list, - (opj_procedure)opj_jp2_skip_jp2c, p_manager)) { - return OPJ_FALSE; - } - - /* DEVELOPER CORNER, insert your custom procedures */ - - return OPJ_TRUE; -} - -static OPJ_BOOL opj_jp2_setup_header_reading(opj_jp2_t *jp2, - opj_event_mgr_t * p_manager) -{ - /* preconditions */ - assert(jp2 != 00); - assert(p_manager != 00); - - if (! 
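/* the whole JP2 box walk is registered as a single procedure here */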
opj_procedure_list_add_procedure(jp2->m_procedure_list, - (opj_procedure)opj_jp2_read_header_procedure, p_manager)) { - return OPJ_FALSE; - } - - /* DEVELOPER CORNER, add your custom procedures */ - - return OPJ_TRUE; -} - -OPJ_BOOL opj_jp2_read_tile_header(opj_jp2_t * p_jp2, - OPJ_UINT32 * p_tile_index, - OPJ_UINT32 * p_data_size, - OPJ_INT32 * p_tile_x0, - OPJ_INT32 * p_tile_y0, - OPJ_INT32 * p_tile_x1, - OPJ_INT32 * p_tile_y1, - OPJ_UINT32 * p_nb_comps, - OPJ_BOOL * p_go_on, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager - ) -{ - return opj_j2k_read_tile_header(p_jp2->j2k, - p_tile_index, - p_data_size, - p_tile_x0, p_tile_y0, - p_tile_x1, p_tile_y1, - p_nb_comps, - p_go_on, - p_stream, - p_manager); -} - -OPJ_BOOL opj_jp2_write_tile(opj_jp2_t *p_jp2, - OPJ_UINT32 p_tile_index, - OPJ_BYTE * p_data, - OPJ_UINT32 p_data_size, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager - ) - -{ - return opj_j2k_write_tile(p_jp2->j2k, p_tile_index, p_data, p_data_size, - p_stream, p_manager); -} - -OPJ_BOOL opj_jp2_decode_tile(opj_jp2_t * p_jp2, - OPJ_UINT32 p_tile_index, - OPJ_BYTE * p_data, - OPJ_UINT32 p_data_size, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager - ) -{ - return opj_j2k_decode_tile(p_jp2->j2k, p_tile_index, p_data, p_data_size, - p_stream, p_manager); -} - -void opj_jp2_destroy(opj_jp2_t *jp2) -{ - if (jp2) { - /* destroy the J2K codec */ - opj_j2k_destroy(jp2->j2k); - jp2->j2k = 00; - - if (jp2->comps) { - opj_free(jp2->comps); - jp2->comps = 00; - } - - if (jp2->cl) { - opj_free(jp2->cl); - jp2->cl = 00; - } - - if (jp2->color.icc_profile_buf) { - opj_free(jp2->color.icc_profile_buf); - jp2->color.icc_profile_buf = 00; - } - - if (jp2->color.jp2_cdef) { - if (jp2->color.jp2_cdef->info) { - opj_free(jp2->color.jp2_cdef->info); - jp2->color.jp2_cdef->info = NULL; - } - - opj_free(jp2->color.jp2_cdef); - jp2->color.jp2_cdef = 00; - } - - if (jp2->color.jp2_pclr) { - if (jp2->color.jp2_pclr->cmap) { - opj_free(jp2->color.jp2_pclr->cmap); - jp2->color.jp2_pclr->cmap = NULL; - } - if (jp2->color.jp2_pclr->channel_sign) { - opj_free(jp2->color.jp2_pclr->channel_sign); - jp2->color.jp2_pclr->channel_sign = NULL; - } - if (jp2->color.jp2_pclr->channel_size) { - opj_free(jp2->color.jp2_pclr->channel_size); - jp2->color.jp2_pclr->channel_size = NULL; - } - if (jp2->color.jp2_pclr->entries) { - opj_free(jp2->color.jp2_pclr->entries); - jp2->color.jp2_pclr->entries = NULL; - } - - opj_free(jp2->color.jp2_pclr); - jp2->color.jp2_pclr = 00; - } - - if (jp2->m_validation_list) { - opj_procedure_list_destroy(jp2->m_validation_list); - jp2->m_validation_list = 00; - } - - if (jp2->m_procedure_list) { - opj_procedure_list_destroy(jp2->m_procedure_list); - jp2->m_procedure_list = 00; - } - - opj_free(jp2); - } -} - -OPJ_BOOL opj_jp2_set_decoded_components(opj_jp2_t *p_jp2, - OPJ_UINT32 numcomps, - const OPJ_UINT32* comps_indices, - opj_event_mgr_t * p_manager) -{ - return opj_j2k_set_decoded_components(p_jp2->j2k, - numcomps, comps_indices, - p_manager); -} - -OPJ_BOOL opj_jp2_set_decode_area(opj_jp2_t *p_jp2, - opj_image_t* p_image, - OPJ_INT32 p_start_x, OPJ_INT32 p_start_y, - OPJ_INT32 p_end_x, OPJ_INT32 p_end_y, - opj_event_mgr_t * p_manager - ) -{ - return opj_j2k_set_decode_area(p_jp2->j2k, p_image, p_start_x, p_start_y, - p_end_x, p_end_y, p_manager); -} - -OPJ_BOOL opj_jp2_get_tile(opj_jp2_t *p_jp2, - opj_stream_private_t *p_stream, - opj_image_t* p_image, - opj_event_mgr_t * p_manager, - OPJ_UINT32 tile_index - ) -{ - if (!p_image) { - return 
OPJ_FALSE; - } - - opj_event_msg(p_manager, EVT_WARNING, - "JP2 box which are after the codestream will not be read by this function.\n"); - - if (! opj_j2k_get_tile(p_jp2->j2k, p_stream, p_image, p_manager, tile_index)) { - opj_event_msg(p_manager, EVT_ERROR, - "Failed to decode the codestream in the JP2 file\n"); - return OPJ_FALSE; - } - - if (p_jp2->j2k->m_specific_param.m_decoder.m_numcomps_to_decode) { - /* Bypass all JP2 component transforms */ - return OPJ_TRUE; - } - - if (!opj_jp2_check_color(p_image, &(p_jp2->color), p_manager)) { - return OPJ_FALSE; - } - - /* Set Image Color Space */ - if (p_jp2->enumcs == 16) { - p_image->color_space = OPJ_CLRSPC_SRGB; - } else if (p_jp2->enumcs == 17) { - p_image->color_space = OPJ_CLRSPC_GRAY; - } else if (p_jp2->enumcs == 18) { - p_image->color_space = OPJ_CLRSPC_SYCC; - } else if (p_jp2->enumcs == 24) { - p_image->color_space = OPJ_CLRSPC_EYCC; - } else if (p_jp2->enumcs == 12) { - p_image->color_space = OPJ_CLRSPC_CMYK; - } else { - p_image->color_space = OPJ_CLRSPC_UNKNOWN; - } - - if (p_jp2->color.jp2_pclr) { - /* Part 1, I.5.3.4: Either both or none : */ - if (!p_jp2->color.jp2_pclr->cmap) { - opj_jp2_free_pclr(&(p_jp2->color)); - } else { - if (!opj_jp2_apply_pclr(p_image, &(p_jp2->color), p_manager)) { - return OPJ_FALSE; - } - } - } - - /* Apply the color space if needed */ - if (p_jp2->color.jp2_cdef) { - opj_jp2_apply_cdef(p_image, &(p_jp2->color), p_manager); - } - - if (p_jp2->color.icc_profile_buf) { - p_image->icc_profile_buf = p_jp2->color.icc_profile_buf; - p_image->icc_profile_len = p_jp2->color.icc_profile_len; - p_jp2->color.icc_profile_buf = NULL; - } - - return OPJ_TRUE; -} - -/* ----------------------------------------------------------------------- */ -/* JP2 encoder interface */ -/* ----------------------------------------------------------------------- */ - -opj_jp2_t* opj_jp2_create(OPJ_BOOL p_is_decoder) -{ - opj_jp2_t *jp2 = (opj_jp2_t*)opj_calloc(1, sizeof(opj_jp2_t)); - if (jp2) { - - /* create the J2K codec */ - if (! p_is_decoder) { - jp2->j2k = opj_j2k_create_compress(); - } else { - jp2->j2k = opj_j2k_create_decompress(); - } - - if (jp2->j2k == 00) { - opj_jp2_destroy(jp2); - return 00; - } - - /* Color structure */ - jp2->color.icc_profile_buf = NULL; - jp2->color.icc_profile_len = 0; - jp2->color.jp2_cdef = NULL; - jp2->color.jp2_pclr = NULL; - jp2->color.jp2_has_colr = 0; - - /* validation list creation */ - jp2->m_validation_list = opj_procedure_list_create(); - if (! jp2->m_validation_list) { - opj_jp2_destroy(jp2); - return 00; - } - - /* execution list creation */ - jp2->m_procedure_list = opj_procedure_list_create(); - if (! 
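/* on allocation failure, opj_jp2_destroy below releases everything
   built so far */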
jp2->m_procedure_list) { - opj_jp2_destroy(jp2); - return 00; - } - } - - return jp2; -} - -void jp2_dump(opj_jp2_t* p_jp2, OPJ_INT32 flag, FILE* out_stream) -{ - /* preconditions */ - assert(p_jp2 != 00); - - j2k_dump(p_jp2->j2k, - flag, - out_stream); -} - -opj_codestream_index_t* jp2_get_cstr_index(opj_jp2_t* p_jp2) -{ - return j2k_get_cstr_index(p_jp2->j2k); -} - -opj_codestream_info_v2_t* jp2_get_cstr_info(opj_jp2_t* p_jp2) -{ - return j2k_get_cstr_info(p_jp2->j2k); -} - -OPJ_BOOL opj_jp2_set_decoded_resolution_factor(opj_jp2_t *p_jp2, - OPJ_UINT32 res_factor, - opj_event_mgr_t * p_manager) -{ - return opj_j2k_set_decoded_resolution_factor(p_jp2->j2k, res_factor, p_manager); -} - -/* JPIP specific */ - -#ifdef USE_JPIP -static OPJ_BOOL opj_jpip_write_iptr(opj_jp2_t *jp2, - opj_stream_private_t *cio, - opj_event_mgr_t * p_manager) -{ - OPJ_OFF_T j2k_codestream_exit; - OPJ_BYTE l_data_header [24]; - - /* preconditions */ - assert(jp2 != 00); - assert(cio != 00); - assert(p_manager != 00); - assert(opj_stream_has_seek(cio)); - - j2k_codestream_exit = opj_stream_tell(cio); - opj_write_bytes(l_data_header, 24, 4); /* size of iptr */ - opj_write_bytes(l_data_header + 4, JPIP_IPTR, - 4); /* IPTR */ -#if 0 - opj_write_bytes(l_data_header + 4 + 4, 0, 8); /* offset */ - opj_write_bytes(l_data_header + 8 + 8, 0, 8); /* length */ -#else - opj_write_double(l_data_header + 4 + 4, 0); /* offset */ - opj_write_double(l_data_header + 8 + 8, 0); /* length */ -#endif - - if (! opj_stream_seek(cio, jp2->jpip_iptr_offset, p_manager)) { - opj_event_msg(p_manager, EVT_ERROR, "Failed to seek in the stream.\n"); - return OPJ_FALSE; - } - - if (opj_stream_write_data(cio, l_data_header, 24, p_manager) != 24) { - opj_event_msg(p_manager, EVT_ERROR, "Failed to seek in the stream.\n"); - return OPJ_FALSE; - } - - if (! opj_stream_seek(cio, j2k_codestream_exit, p_manager)) { - opj_event_msg(p_manager, EVT_ERROR, "Failed to seek in the stream.\n"); - return OPJ_FALSE; - } - - return OPJ_TRUE; -} - -static OPJ_BOOL opj_jpip_write_fidx(opj_jp2_t *jp2, - opj_stream_private_t *cio, - opj_event_mgr_t * p_manager) -{ - OPJ_OFF_T j2k_codestream_exit; - OPJ_BYTE l_data_header [24]; - - OPJ_UNUSED(jp2); - - /* preconditions */ - assert(jp2 != 00); - assert(cio != 00); - assert(p_manager != 00); - assert(opj_stream_has_seek(cio)); - - opj_write_bytes(l_data_header, 24, 4); /* size of iptr */ - opj_write_bytes(l_data_header + 4, JPIP_FIDX, - 4); /* IPTR */ - opj_write_double(l_data_header + 4 + 4, 0); /* offset */ - opj_write_double(l_data_header + 8 + 8, 0); /* length */ - - if (opj_stream_write_data(cio, l_data_header, 24, p_manager) != 24) { - opj_event_msg(p_manager, EVT_ERROR, "Failed to seek in the stream.\n"); - return OPJ_FALSE; - } - - j2k_codestream_exit = opj_stream_tell(cio); - if (! 
opj_stream_seek(cio, j2k_codestream_exit, p_manager)) { - opj_event_msg(p_manager, EVT_ERROR, "Failed to seek in the stream.\n"); - return OPJ_FALSE; - } - - return OPJ_TRUE; -} - -static OPJ_BOOL opj_jpip_write_cidx(opj_jp2_t *jp2, - opj_stream_private_t *cio, - opj_event_mgr_t * p_manager) -{ - OPJ_OFF_T j2k_codestream_exit; - OPJ_BYTE l_data_header [24]; - - OPJ_UNUSED(jp2); - - /* preconditions */ - assert(jp2 != 00); - assert(cio != 00); - assert(p_manager != 00); - assert(opj_stream_has_seek(cio)); - - j2k_codestream_exit = opj_stream_tell(cio); - opj_write_bytes(l_data_header, 24, 4); /* size of iptr */ - opj_write_bytes(l_data_header + 4, JPIP_CIDX, - 4); /* IPTR */ -#if 0 - opj_write_bytes(l_data_header + 4 + 4, 0, 8); /* offset */ - opj_write_bytes(l_data_header + 8 + 8, 0, 8); /* length */ -#else - opj_write_double(l_data_header + 4 + 4, 0); /* offset */ - opj_write_double(l_data_header + 8 + 8, 0); /* length */ -#endif - - if (! opj_stream_seek(cio, j2k_codestream_exit, p_manager)) { - opj_event_msg(p_manager, EVT_ERROR, "Failed to seek in the stream.\n"); - return OPJ_FALSE; - } - - if (opj_stream_write_data(cio, l_data_header, 24, p_manager) != 24) { - opj_event_msg(p_manager, EVT_ERROR, "Failed to seek in the stream.\n"); - return OPJ_FALSE; - } - - j2k_codestream_exit = opj_stream_tell(cio); - if (! opj_stream_seek(cio, j2k_codestream_exit, p_manager)) { - opj_event_msg(p_manager, EVT_ERROR, "Failed to seek in the stream.\n"); - return OPJ_FALSE; - } - - return OPJ_TRUE; -} - -#if 0 -static void write_prxy(int offset_jp2c, int length_jp2c, int offset_idx, - int length_idx, opj_stream_private_t *cio, - opj_event_mgr_t * p_manager) -{ - OPJ_BYTE l_data_header [8]; - OPJ_OFF_T len, lenp; - - lenp = opj_stream_tell(cio); - opj_stream_skip(cio, 4, p_manager); /* L [at the end] */ - opj_write_bytes(l_data_header, JPIP_PRXY, 4); /* IPTR */ - opj_stream_write_data(cio, l_data_header, 4, p_manager); - - opj_write_bytes(l_data_header, offset_jp2c, 8); /* OOFF */ - opj_stream_write_data(cio, l_data_header, 8, p_manager); - opj_write_bytes(l_data_header, length_jp2c, 4); /* OBH part 1 */ - opj_write_bytes(l_data_header + 4, JP2_JP2C, 4); /* OBH part 2 */ - opj_stream_write_data(cio, l_data_header, 8, p_manager); - - opj_write_bytes(l_data_header, 1, 1); /* NI */ - opj_stream_write_data(cio, l_data_header, 1, p_manager); - - opj_write_bytes(l_data_header, offset_idx, 8); /* IOFF */ - opj_stream_write_data(cio, l_data_header, 8, p_manager); - opj_write_bytes(l_data_header, length_idx, 4); /* IBH part 1 */ - opj_write_bytes(l_data_header + 4, JPIP_CIDX, 4); /* IBH part 2 */ - opj_stream_write_data(cio, l_data_header, 8, p_manager); - - len = opj_stream_tell(cio) - lenp; - opj_stream_skip(cio, lenp, p_manager); - opj_write_bytes(l_data_header, len, 4); /* L */ - opj_stream_write_data(cio, l_data_header, 4, p_manager); - opj_stream_seek(cio, lenp + len, p_manager); -} -#endif - - -#if 0 -static int write_fidx(int offset_jp2c, int length_jp2c, int offset_idx, - int length_idx, opj_stream_private_t *cio, - opj_event_mgr_t * p_manager) -{ - OPJ_BYTE l_data_header [4]; - OPJ_OFF_T len, lenp; - - lenp = opj_stream_tell(cio); - opj_stream_skip(cio, 4, p_manager); - opj_write_bytes(l_data_header, JPIP_FIDX, 4); /* FIDX */ - opj_stream_write_data(cio, l_data_header, 4, p_manager); - - write_prxy(offset_jp2c, length_jp2c, offset_idx, length_idx, cio, p_manager); - - len = opj_stream_tell(cio) - lenp; - opj_stream_skip(cio, lenp, p_manager); - opj_write_bytes(l_data_header, len, 4); /* L */ - 
opj_stream_write_data(cio, l_data_header, 4, p_manager); - opj_stream_seek(cio, lenp + len, p_manager); - - return len; -} -#endif -#endif /* USE_JPIP */ diff --git a/src/3rd/LibOpenJpeg/jp2.h b/src/3rd/LibOpenJpeg/jp2.h deleted file mode 100644 index 34abd511..00000000 --- a/src/3rd/LibOpenJpeg/jp2.h +++ /dev/null @@ -1,498 +0,0 @@ -/* - * The copyright in this software is being made available under the 2-clauses - * BSD License, included below. This software may be subject to other third - * party and contributor rights, including patent rights, and no such rights - * are granted under this license. - * - * Copyright (c) 2002-2014, Universite catholique de Louvain (UCL), Belgium - * Copyright (c) 2002-2014, Professor Benoit Macq - * Copyright (c) 2002-2003, Yannick Verschueren - * Copyright (c) 2005, Herve Drolon, FreeImage Team - * Copyright (c) 2008, 2011-2012, Centre National d'Etudes Spatiales (CNES), FR - * Copyright (c) 2012, CS Systemes d'Information, France - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS' - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- */ -#ifndef OPJ_JP2_H -#define OPJ_JP2_H -/** -@file jp2.h -@brief The JPEG-2000 file format Reader/Writer (JP2) - -*/ - -/** @defgroup JP2 JP2 - JPEG-2000 file format reader/writer */ -/*@{*/ - -/*#define JPIP_JPIP 0x6a706970*/ - -#define JP2_JP 0x6a502020 /**< JPEG 2000 signature box */ -#define JP2_FTYP 0x66747970 /**< File type box */ -#define JP2_JP2H 0x6a703268 /**< JP2 header box (super-box) */ -#define JP2_IHDR 0x69686472 /**< Image header box */ -#define JP2_COLR 0x636f6c72 /**< Colour specification box */ -#define JP2_JP2C 0x6a703263 /**< Contiguous codestream box */ -#define JP2_URL 0x75726c20 /**< Data entry URL box */ -#define JP2_PCLR 0x70636c72 /**< Palette box */ -#define JP2_CMAP 0x636d6170 /**< Component Mapping box */ -#define JP2_CDEF 0x63646566 /**< Channel Definition box */ -#define JP2_DTBL 0x6474626c /**< Data Reference box */ -#define JP2_BPCC 0x62706363 /**< Bits per component box */ -#define JP2_JP2 0x6a703220 /**< File type fields */ - -/* For the future */ -/* #define JP2_RES 0x72657320 */ /**< Resolution box (super-box) */ -/* #define JP2_JP2I 0x6a703269 */ /**< Intellectual property box */ -/* #define JP2_XML 0x786d6c20 */ /**< XML box */ -/* #define JP2_UUID 0x75756994 */ /**< UUID box */ -/* #define JP2_UINF 0x75696e66 */ /**< UUID info box (super-box) */ -/* #define JP2_ULST 0x756c7374 */ /**< UUID list box */ - -/* ----------------------------------------------------------------------- */ - -typedef enum { - JP2_STATE_NONE = 0x0, - JP2_STATE_SIGNATURE = 0x1, - JP2_STATE_FILE_TYPE = 0x2, - JP2_STATE_HEADER = 0x4, - JP2_STATE_CODESTREAM = 0x8, - JP2_STATE_END_CODESTREAM = 0x10, - JP2_STATE_UNKNOWN = 0x7fffffff /* ISO C restricts enumerator values to range of 'int' */ -} -JP2_STATE; - -typedef enum { - JP2_IMG_STATE_NONE = 0x0, - JP2_IMG_STATE_UNKNOWN = 0x7fffffff -} -JP2_IMG_STATE; - -/** -Channel description: channel index, type, association -*/ -typedef struct opj_jp2_cdef_info { - OPJ_UINT16 cn, typ, asoc; -} opj_jp2_cdef_info_t; - -/** -Channel descriptions and number of descriptions -*/ -typedef struct opj_jp2_cdef { - opj_jp2_cdef_info_t *info; - OPJ_UINT16 n; -} opj_jp2_cdef_t; - -/** -Component mappings: channel index, mapping type, palette index -*/ -typedef struct opj_jp2_cmap_comp { - OPJ_UINT16 cmp; - OPJ_BYTE mtyp, pcol; -} opj_jp2_cmap_comp_t; - -/** -Palette data: table entries, palette columns -*/ -typedef struct opj_jp2_pclr { - OPJ_UINT32 *entries; - OPJ_BYTE *channel_sign; - OPJ_BYTE *channel_size; - opj_jp2_cmap_comp_t *cmap; - OPJ_UINT16 nr_entries; - OPJ_BYTE nr_channels; -} opj_jp2_pclr_t; - -/** -Collector for ICC profile, palette, component mapping, channel description -*/ -typedef struct opj_jp2_color { - OPJ_BYTE *icc_profile_buf; - OPJ_UINT32 icc_profile_len; - - opj_jp2_cdef_t *jp2_cdef; - opj_jp2_pclr_t *jp2_pclr; - OPJ_BYTE jp2_has_colr; -} opj_jp2_color_t; - -/** -JP2 component -*/ -typedef struct opj_jp2_comps { - OPJ_UINT32 depth; - OPJ_UINT32 sgnd; - OPJ_UINT32 bpcc; -} opj_jp2_comps_t; - -/** -JPEG-2000 file format reader/writer -*/ -typedef struct opj_jp2 { - /** handle to the J2K codec */ - opj_j2k_t *j2k; - /** list of validation procedures */ - struct opj_procedure_list * m_validation_list; - /** list of execution procedures */ - struct opj_procedure_list * m_procedure_list; - - /* width of image */ - OPJ_UINT32 w; - /* height of image */ - OPJ_UINT32 h; - /* number of components in the image */ - OPJ_UINT32 numcomps; - OPJ_UINT32 bpc; - OPJ_UINT32 C; - OPJ_UINT32 UnkC; - OPJ_UINT32 IPR; - OPJ_UINT32 meth; - 
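    /* meth, approx, enumcs and precedence carry the METH, APPROX, EnumCS
       and PREC fields of the colour specification (colr) box */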
OPJ_UINT32 approx; - OPJ_UINT32 enumcs; - OPJ_UINT32 precedence; - OPJ_UINT32 brand; - OPJ_UINT32 minversion; - OPJ_UINT32 numcl; - OPJ_UINT32 *cl; - opj_jp2_comps_t *comps; - /* FIXME: The following two variables are used to save offset - as we write out a JP2 file to disk. This mechanism is not flexible - as codec writers will need to extand those fields as new part - of the standard are implemented. - */ - OPJ_OFF_T j2k_codestream_offset; - OPJ_OFF_T jpip_iptr_offset; - OPJ_BOOL jpip_on; - OPJ_UINT32 jp2_state; - OPJ_UINT32 jp2_img_state; - - opj_jp2_color_t color; - - OPJ_BOOL ignore_pclr_cmap_cdef; - OPJ_BYTE has_jp2h; - OPJ_BYTE has_ihdr; -} -opj_jp2_t; - -/** -JP2 Box -*/ -typedef struct opj_jp2_box { - OPJ_UINT32 length; - OPJ_UINT32 type; - OPJ_INT32 init_pos; -} opj_jp2_box_t; - -typedef struct opj_jp2_header_handler { - /* marker value */ - OPJ_UINT32 id; - /* action linked to the marker */ - OPJ_BOOL(*handler)(opj_jp2_t *jp2, - OPJ_BYTE *p_header_data, - OPJ_UINT32 p_header_size, - opj_event_mgr_t * p_manager); -} -opj_jp2_header_handler_t; - - -typedef struct opj_jp2_img_header_writer_handler { - /* action to perform */ - OPJ_BYTE* (*handler)(opj_jp2_t *jp2, OPJ_UINT32 * p_data_size); - /* result of the action : data */ - OPJ_BYTE* m_data; - /* size of data */ - OPJ_UINT32 m_size; -} -opj_jp2_img_header_writer_handler_t; - -/** @name Exported functions */ -/*@{*/ -/* ----------------------------------------------------------------------- */ - -/** -Setup the decoder decoding parameters using user parameters. -Decoding parameters are returned in jp2->j2k->cp. -@param jp2 JP2 decompressor handle -@param parameters decompression parameters -*/ -void opj_jp2_setup_decoder(opj_jp2_t *jp2, opj_dparameters_t *parameters); - -/** Allocates worker threads for the compressor/decompressor. - * - * @param jp2 JP2 decompressor handle - * @param num_threads Number of threads. - * @return OPJ_TRUE in case of success. - */ -OPJ_BOOL opj_jp2_set_threads(opj_jp2_t *jp2, OPJ_UINT32 num_threads); - -/** - * Decode an image from a JPEG-2000 file stream - * @param jp2 JP2 decompressor handle - * @param p_stream FIXME DOC - * @param p_image FIXME DOC - * @param p_manager FIXME DOC - * - * @return Returns a decoded image if successful, returns NULL otherwise -*/ -OPJ_BOOL opj_jp2_decode(opj_jp2_t *jp2, - opj_stream_private_t *p_stream, - opj_image_t* p_image, - opj_event_mgr_t * p_manager); - -/** - * Setup the encoder parameters using the current image and using user parameters. - * Coding parameters are returned in jp2->j2k->cp. - * - * @param jp2 JP2 compressor handle - * @param parameters compression parameters - * @param image input filled image - * @param p_manager FIXME DOC - * @return OPJ_TRUE if successful, OPJ_FALSE otherwise -*/ -OPJ_BOOL opj_jp2_setup_encoder(opj_jp2_t *jp2, - opj_cparameters_t *parameters, - opj_image_t *image, - opj_event_mgr_t * p_manager); - -/** -Encode an image into a JPEG-2000 file stream -@param jp2 JP2 compressor handle -@param stream Output buffer stream -@param p_manager event manager -@return Returns true if successful, returns false otherwise -*/ -OPJ_BOOL opj_jp2_encode(opj_jp2_t *jp2, - opj_stream_private_t *stream, - opj_event_mgr_t * p_manager); - - -/** - * Starts a compression scheme, i.e. validates the codec parameters, writes the header. - * - * @param jp2 the jpeg2000 file codec. - * @param stream the stream object. - * @param p_image FIXME DOC - * @param p_manager FIXME DOC - * - * @return true if the codec is valid. 
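 *
 * A typical compression sequence over this internal API, sketched
 * (error handling omitted; all functions are declared in this header):
 *
 *   jp2 = opj_jp2_create(OPJ_FALSE);
 *   opj_jp2_setup_encoder(jp2, &parameters, image, p_manager);
 *   opj_jp2_start_compress(jp2, stream, image, p_manager);
 *   opj_jp2_encode(jp2, stream, p_manager);
 *   opj_jp2_end_compress(jp2, stream, p_manager);
 *   opj_jp2_destroy(jp2);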
- */ -OPJ_BOOL opj_jp2_start_compress(opj_jp2_t *jp2, - opj_stream_private_t *stream, - opj_image_t * p_image, - opj_event_mgr_t * p_manager); - - -/** - * Ends the compression procedures and possibiliy add data to be read after the - * codestream. - */ -OPJ_BOOL opj_jp2_end_compress(opj_jp2_t *jp2, - opj_stream_private_t *cio, - opj_event_mgr_t * p_manager); - -/* ----------------------------------------------------------------------- */ - -/** - * Ends the decompression procedures and possibiliy add data to be read after the - * codestream. - */ -OPJ_BOOL opj_jp2_end_decompress(opj_jp2_t *jp2, - opj_stream_private_t *cio, - opj_event_mgr_t * p_manager); - -/** - * Reads a jpeg2000 file header structure. - * - * @param p_stream the stream to read data from. - * @param jp2 the jpeg2000 file header structure. - * @param p_image FIXME DOC - * @param p_manager the user event manager. - * - * @return true if the box is valid. - */ -OPJ_BOOL opj_jp2_read_header(opj_stream_private_t *p_stream, - opj_jp2_t *jp2, - opj_image_t ** p_image, - opj_event_mgr_t * p_manager); - -/** Sets the indices of the components to decode. - * - * @param jp2 JP2 decompressor handle - * @param numcomps Number of components to decode. - * @param comps_indices Array of num_compts indices (numbering starting at 0) - * corresponding to the components to decode. - * @param p_manager Event manager; - * - * @return OPJ_TRUE in case of success. - */ -OPJ_BOOL opj_jp2_set_decoded_components(opj_jp2_t *jp2, - OPJ_UINT32 numcomps, - const OPJ_UINT32* comps_indices, - opj_event_mgr_t * p_manager); - -/** - * Reads a tile header. - * @param p_jp2 the jpeg2000 codec. - * @param p_tile_index FIXME DOC - * @param p_data_size FIXME DOC - * @param p_tile_x0 FIXME DOC - * @param p_tile_y0 FIXME DOC - * @param p_tile_x1 FIXME DOC - * @param p_tile_y1 FIXME DOC - * @param p_nb_comps FIXME DOC - * @param p_go_on FIXME DOC - * @param p_stream the stream to write data to. - * @param p_manager the user event manager. - */ -OPJ_BOOL opj_jp2_read_tile_header(opj_jp2_t * p_jp2, - OPJ_UINT32 * p_tile_index, - OPJ_UINT32 * p_data_size, - OPJ_INT32 * p_tile_x0, - OPJ_INT32 * p_tile_y0, - OPJ_INT32 * p_tile_x1, - OPJ_INT32 * p_tile_y1, - OPJ_UINT32 * p_nb_comps, - OPJ_BOOL * p_go_on, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager); - -/** - * Writes a tile. - * - * @param p_jp2 the jpeg2000 codec. - * @param p_tile_index FIXME DOC - * @param p_data FIXME DOC - * @param p_data_size FIXME DOC - * @param p_stream the stream to write data to. - * @param p_manager the user event manager. - */ -OPJ_BOOL opj_jp2_write_tile(opj_jp2_t *p_jp2, - OPJ_UINT32 p_tile_index, - OPJ_BYTE * p_data, - OPJ_UINT32 p_data_size, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager); - -/** - * Decode tile data. - * @param p_jp2 the jpeg2000 codec. - * @param p_tile_index FIXME DOC - * @param p_data FIXME DOC - * @param p_data_size FIXME DOC - * @param p_stream the stream to write data to. - * @param p_manager the user event manager. - * - * @return FIXME DOC - */ -OPJ_BOOL opj_jp2_decode_tile(opj_jp2_t * p_jp2, - OPJ_UINT32 p_tile_index, - OPJ_BYTE * p_data, - OPJ_UINT32 p_data_size, - opj_stream_private_t *p_stream, - opj_event_mgr_t * p_manager); - -/** - * Creates a jpeg2000 file decompressor. - * - * @return an empty jpeg2000 file codec. 
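 *
 * Decompression counterpart, sketched (pass OPJ_TRUE for a decoder;
 * error handling omitted):
 *
 *   jp2 = opj_jp2_create(OPJ_TRUE);
 *   opj_jp2_read_header(p_stream, jp2, &p_image, p_manager);
 *   opj_jp2_decode(jp2, p_stream, p_image, p_manager);
 *   opj_jp2_end_decompress(jp2, p_stream, p_manager);
 *   opj_jp2_destroy(jp2);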
- */ -opj_jp2_t* opj_jp2_create(OPJ_BOOL p_is_decoder); - -/** -Destroy a JP2 decompressor handle -@param jp2 JP2 decompressor handle to destroy -*/ -void opj_jp2_destroy(opj_jp2_t *jp2); - - -/** - * Sets the given area to be decoded. This function should be called right after opj_read_header and before any tile header reading. - * - * @param p_jp2 the jpeg2000 codec. - * @param p_image FIXME DOC - * @param p_start_x the left position of the rectangle to decode (in image coordinates). - * @param p_start_y the up position of the rectangle to decode (in image coordinates). - * @param p_end_x the right position of the rectangle to decode (in image coordinates). - * @param p_end_y the bottom position of the rectangle to decode (in image coordinates). - * @param p_manager the user event manager - * - * @return true if the area could be set. - */ -OPJ_BOOL opj_jp2_set_decode_area(opj_jp2_t *p_jp2, - opj_image_t* p_image, - OPJ_INT32 p_start_x, OPJ_INT32 p_start_y, - OPJ_INT32 p_end_x, OPJ_INT32 p_end_y, - opj_event_mgr_t * p_manager); - -/** -* -*/ -OPJ_BOOL opj_jp2_get_tile(opj_jp2_t *p_jp2, - opj_stream_private_t *p_stream, - opj_image_t* p_image, - opj_event_mgr_t * p_manager, - OPJ_UINT32 tile_index); - - -/** - * - */ -OPJ_BOOL opj_jp2_set_decoded_resolution_factor(opj_jp2_t *p_jp2, - OPJ_UINT32 res_factor, - opj_event_mgr_t * p_manager); - - -/* TODO MSD: clean these 3 functions */ -/** - * Dump some elements from the JP2 decompression structure . - * - *@param p_jp2 the jp2 codec. - *@param flag flag to describe what elements are dump. - *@param out_stream output stream where dump the elements. - * -*/ -void jp2_dump(opj_jp2_t* p_jp2, OPJ_INT32 flag, FILE* out_stream); - -/** - * Get the codestream info from a JPEG2000 codec. - * - *@param p_jp2 jp2 codec. - * - *@return the codestream information extract from the jpg2000 codec - */ -opj_codestream_info_v2_t* jp2_get_cstr_info(opj_jp2_t* p_jp2); - -/** - * Get the codestream index from a JPEG2000 codec. - * - *@param p_jp2 jp2 codec. - * - *@return the codestream index extract from the jpg2000 codec - */ -opj_codestream_index_t* jp2_get_cstr_index(opj_jp2_t* p_jp2); - - -/*@}*/ - -/*@}*/ - -#endif /* OPJ_JP2_H */ - diff --git a/src/3rd/LibOpenJpeg/mct.c b/src/3rd/LibOpenJpeg/mct.c deleted file mode 100644 index b79d4b87..00000000 --- a/src/3rd/LibOpenJpeg/mct.c +++ /dev/null @@ -1,567 +0,0 @@ -/* - * The copyright in this software is being made available under the 2-clauses - * BSD License, included below. This software may be subject to other third - * party and contributor rights, including patent rights, and no such rights - * are granted under this license. - * - * Copyright (c) 2002-2014, Universite catholique de Louvain (UCL), Belgium - * Copyright (c) 2002-2014, Professor Benoit Macq - * Copyright (c) 2001-2003, David Janssens - * Copyright (c) 2002-2003, Yannick Verschueren - * Copyright (c) 2003-2007, Francois-Olivier Devaux - * Copyright (c) 2003-2014, Antonin Descampe - * Copyright (c) 2005, Herve Drolon, FreeImage Team - * Copyright (c) 2008, 2011-2012, Centre National d'Etudes Spatiales (CNES), FR - * Copyright (c) 2012, CS Systemes d'Information, France - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. 
Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS' - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#ifdef __SSE__ -#include -#endif -#ifdef __SSE2__ -#include -#endif -#ifdef __SSE4_1__ -#include -#endif - -#include "opj_includes.h" - -/* */ -/* This table contains the norms of the basis function of the reversible MCT. */ -/* */ -static const OPJ_FLOAT64 opj_mct_norms[3] = { 1.732, .8292, .8292 }; - -/* */ -/* This table contains the norms of the basis function of the irreversible MCT. */ -/* */ -static const OPJ_FLOAT64 opj_mct_norms_real[3] = { 1.732, 1.805, 1.573 }; - -const OPJ_FLOAT64 * opj_mct_get_mct_norms() -{ - return opj_mct_norms; -} - -const OPJ_FLOAT64 * opj_mct_get_mct_norms_real() -{ - return opj_mct_norms_real; -} - -/* */ -/* Forward reversible MCT. */ -/* */ -#ifdef __SSE2__ -void opj_mct_encode( - OPJ_INT32* OPJ_RESTRICT c0, - OPJ_INT32* OPJ_RESTRICT c1, - OPJ_INT32* OPJ_RESTRICT c2, - OPJ_SIZE_T n) -{ - OPJ_SIZE_T i; - const OPJ_SIZE_T len = n; - /* buffer are aligned on 16 bytes */ - assert(((size_t)c0 & 0xf) == 0); - assert(((size_t)c1 & 0xf) == 0); - assert(((size_t)c2 & 0xf) == 0); - - for (i = 0; i < (len & ~3U); i += 4) { - __m128i y, u, v; - __m128i r = _mm_load_si128((const __m128i *) & (c0[i])); - __m128i g = _mm_load_si128((const __m128i *) & (c1[i])); - __m128i b = _mm_load_si128((const __m128i *) & (c2[i])); - y = _mm_add_epi32(g, g); - y = _mm_add_epi32(y, b); - y = _mm_add_epi32(y, r); - y = _mm_srai_epi32(y, 2); - u = _mm_sub_epi32(b, g); - v = _mm_sub_epi32(r, g); - _mm_store_si128((__m128i *) & (c0[i]), y); - _mm_store_si128((__m128i *) & (c1[i]), u); - _mm_store_si128((__m128i *) & (c2[i]), v); - } - - for (; i < len; ++i) { - OPJ_INT32 r = c0[i]; - OPJ_INT32 g = c1[i]; - OPJ_INT32 b = c2[i]; - OPJ_INT32 y = (r + (g * 2) + b) >> 2; - OPJ_INT32 u = b - g; - OPJ_INT32 v = r - g; - c0[i] = y; - c1[i] = u; - c2[i] = v; - } -} -#else -void opj_mct_encode( - OPJ_INT32* OPJ_RESTRICT c0, - OPJ_INT32* OPJ_RESTRICT c1, - OPJ_INT32* OPJ_RESTRICT c2, - OPJ_SIZE_T n) -{ - OPJ_SIZE_T i; - const OPJ_SIZE_T len = n; - - for (i = 0; i < len; ++i) { - OPJ_INT32 r = c0[i]; - OPJ_INT32 g = c1[i]; - OPJ_INT32 b = c2[i]; - OPJ_INT32 y = (r + (g * 2) + b) >> 2; - OPJ_INT32 u = b - g; - OPJ_INT32 v = r - g; - c0[i] = y; - c1[i] = u; - c2[i] = v; - } -} -#endif - -/* */ -/* Inverse reversible MCT. 
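   Undoes the forward transform above: given Y = (R + 2G + B) >> 2,
   U = B - G, V = R - G, the decoder recovers G = Y - ((U + V) >> 2),
   then R = V + G and B = U + G, exactly, hence "reversible".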
*/ -/* */ -#ifdef __SSE2__ -void opj_mct_decode( - OPJ_INT32* OPJ_RESTRICT c0, - OPJ_INT32* OPJ_RESTRICT c1, - OPJ_INT32* OPJ_RESTRICT c2, - OPJ_SIZE_T n) -{ - OPJ_SIZE_T i; - const OPJ_SIZE_T len = n; - - for (i = 0; i < (len & ~3U); i += 4) { - __m128i r, g, b; - __m128i y = _mm_load_si128((const __m128i *) & (c0[i])); - __m128i u = _mm_load_si128((const __m128i *) & (c1[i])); - __m128i v = _mm_load_si128((const __m128i *) & (c2[i])); - g = y; - g = _mm_sub_epi32(g, _mm_srai_epi32(_mm_add_epi32(u, v), 2)); - r = _mm_add_epi32(v, g); - b = _mm_add_epi32(u, g); - _mm_store_si128((__m128i *) & (c0[i]), r); - _mm_store_si128((__m128i *) & (c1[i]), g); - _mm_store_si128((__m128i *) & (c2[i]), b); - } - for (; i < len; ++i) { - OPJ_INT32 y = c0[i]; - OPJ_INT32 u = c1[i]; - OPJ_INT32 v = c2[i]; - OPJ_INT32 g = y - ((u + v) >> 2); - OPJ_INT32 r = v + g; - OPJ_INT32 b = u + g; - c0[i] = r; - c1[i] = g; - c2[i] = b; - } -} -#else -void opj_mct_decode( - OPJ_INT32* OPJ_RESTRICT c0, - OPJ_INT32* OPJ_RESTRICT c1, - OPJ_INT32* OPJ_RESTRICT c2, - OPJ_SIZE_T n) -{ - OPJ_UINT32 i; - for (i = 0; i < n; ++i) { - OPJ_INT32 y = c0[i]; - OPJ_INT32 u = c1[i]; - OPJ_INT32 v = c2[i]; - OPJ_INT32 g = y - ((u + v) >> 2); - OPJ_INT32 r = v + g; - OPJ_INT32 b = u + g; - c0[i] = r; - c1[i] = g; - c2[i] = b; - } -} -#endif - -/* */ -/* Get norm of basis function of reversible MCT. */ -/* */ -OPJ_FLOAT64 opj_mct_getnorm(OPJ_UINT32 compno) -{ - return opj_mct_norms[compno]; -} - -/* */ -/* Forward irreversible MCT. */ -/* */ -#ifdef __SSE4_1__ -void opj_mct_encode_real( - OPJ_INT32* OPJ_RESTRICT c0, - OPJ_INT32* OPJ_RESTRICT c1, - OPJ_INT32* OPJ_RESTRICT c2, - OPJ_SIZE_T n) -{ - OPJ_SIZE_T i; - const OPJ_SIZE_T len = n; - - const __m128i ry = _mm_set1_epi32(2449); - const __m128i gy = _mm_set1_epi32(4809); - const __m128i by = _mm_set1_epi32(934); - const __m128i ru = _mm_set1_epi32(1382); - const __m128i gu = _mm_set1_epi32(2714); - /* const __m128i bu = _mm_set1_epi32(4096); */ - /* const __m128i rv = _mm_set1_epi32(4096); */ - const __m128i gv = _mm_set1_epi32(3430); - const __m128i bv = _mm_set1_epi32(666); - const __m128i mulround = _mm_shuffle_epi32(_mm_cvtsi32_si128(4096), - _MM_SHUFFLE(1, 0, 1, 0)); - - for (i = 0; i < (len & ~3U); i += 4) { - __m128i lo, hi; - __m128i y, u, v; - __m128i r = _mm_load_si128((const __m128i *) & (c0[i])); - __m128i g = _mm_load_si128((const __m128i *) & (c1[i])); - __m128i b = _mm_load_si128((const __m128i *) & (c2[i])); - - lo = r; - hi = _mm_shuffle_epi32(r, _MM_SHUFFLE(3, 3, 1, 1)); - lo = _mm_mul_epi32(lo, ry); - hi = _mm_mul_epi32(hi, ry); - lo = _mm_add_epi64(lo, mulround); - hi = _mm_add_epi64(hi, mulround); - lo = _mm_srli_epi64(lo, 13); - hi = _mm_slli_epi64(hi, 32 - 13); - y = _mm_blend_epi16(lo, hi, 0xCC); - - lo = g; - hi = _mm_shuffle_epi32(g, _MM_SHUFFLE(3, 3, 1, 1)); - lo = _mm_mul_epi32(lo, gy); - hi = _mm_mul_epi32(hi, gy); - lo = _mm_add_epi64(lo, mulround); - hi = _mm_add_epi64(hi, mulround); - lo = _mm_srli_epi64(lo, 13); - hi = _mm_slli_epi64(hi, 32 - 13); - y = _mm_add_epi32(y, _mm_blend_epi16(lo, hi, 0xCC)); - - lo = b; - hi = _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 3, 1, 1)); - lo = _mm_mul_epi32(lo, by); - hi = _mm_mul_epi32(hi, by); - lo = _mm_add_epi64(lo, mulround); - hi = _mm_add_epi64(hi, mulround); - lo = _mm_srli_epi64(lo, 13); - hi = _mm_slli_epi64(hi, 32 - 13); - y = _mm_add_epi32(y, _mm_blend_epi16(lo, hi, 0xCC)); - _mm_store_si128((__m128i *) & (c0[i]), y); - - /*lo = b; - hi = _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 3, 1, 1)); - lo = _mm_mul_epi32(lo, 
mulround); - hi = _mm_mul_epi32(hi, mulround);*/ - lo = _mm_cvtepi32_epi64(_mm_shuffle_epi32(b, _MM_SHUFFLE(3, 2, 2, 0))); - hi = _mm_cvtepi32_epi64(_mm_shuffle_epi32(b, _MM_SHUFFLE(3, 2, 3, 1))); - lo = _mm_slli_epi64(lo, 12); - hi = _mm_slli_epi64(hi, 12); - lo = _mm_add_epi64(lo, mulround); - hi = _mm_add_epi64(hi, mulround); - lo = _mm_srli_epi64(lo, 13); - hi = _mm_slli_epi64(hi, 32 - 13); - u = _mm_blend_epi16(lo, hi, 0xCC); - - lo = r; - hi = _mm_shuffle_epi32(r, _MM_SHUFFLE(3, 3, 1, 1)); - lo = _mm_mul_epi32(lo, ru); - hi = _mm_mul_epi32(hi, ru); - lo = _mm_add_epi64(lo, mulround); - hi = _mm_add_epi64(hi, mulround); - lo = _mm_srli_epi64(lo, 13); - hi = _mm_slli_epi64(hi, 32 - 13); - u = _mm_sub_epi32(u, _mm_blend_epi16(lo, hi, 0xCC)); - - lo = g; - hi = _mm_shuffle_epi32(g, _MM_SHUFFLE(3, 3, 1, 1)); - lo = _mm_mul_epi32(lo, gu); - hi = _mm_mul_epi32(hi, gu); - lo = _mm_add_epi64(lo, mulround); - hi = _mm_add_epi64(hi, mulround); - lo = _mm_srli_epi64(lo, 13); - hi = _mm_slli_epi64(hi, 32 - 13); - u = _mm_sub_epi32(u, _mm_blend_epi16(lo, hi, 0xCC)); - _mm_store_si128((__m128i *) & (c1[i]), u); - - /*lo = r; - hi = _mm_shuffle_epi32(r, _MM_SHUFFLE(3, 3, 1, 1)); - lo = _mm_mul_epi32(lo, mulround); - hi = _mm_mul_epi32(hi, mulround);*/ - lo = _mm_cvtepi32_epi64(_mm_shuffle_epi32(r, _MM_SHUFFLE(3, 2, 2, 0))); - hi = _mm_cvtepi32_epi64(_mm_shuffle_epi32(r, _MM_SHUFFLE(3, 2, 3, 1))); - lo = _mm_slli_epi64(lo, 12); - hi = _mm_slli_epi64(hi, 12); - lo = _mm_add_epi64(lo, mulround); - hi = _mm_add_epi64(hi, mulround); - lo = _mm_srli_epi64(lo, 13); - hi = _mm_slli_epi64(hi, 32 - 13); - v = _mm_blend_epi16(lo, hi, 0xCC); - - lo = g; - hi = _mm_shuffle_epi32(g, _MM_SHUFFLE(3, 3, 1, 1)); - lo = _mm_mul_epi32(lo, gv); - hi = _mm_mul_epi32(hi, gv); - lo = _mm_add_epi64(lo, mulround); - hi = _mm_add_epi64(hi, mulround); - lo = _mm_srli_epi64(lo, 13); - hi = _mm_slli_epi64(hi, 32 - 13); - v = _mm_sub_epi32(v, _mm_blend_epi16(lo, hi, 0xCC)); - - lo = b; - hi = _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 3, 1, 1)); - lo = _mm_mul_epi32(lo, bv); - hi = _mm_mul_epi32(hi, bv); - lo = _mm_add_epi64(lo, mulround); - hi = _mm_add_epi64(hi, mulround); - lo = _mm_srli_epi64(lo, 13); - hi = _mm_slli_epi64(hi, 32 - 13); - v = _mm_sub_epi32(v, _mm_blend_epi16(lo, hi, 0xCC)); - _mm_store_si128((__m128i *) & (c2[i]), v); - } - for (; i < len; ++i) { - OPJ_INT32 r = c0[i]; - OPJ_INT32 g = c1[i]; - OPJ_INT32 b = c2[i]; - OPJ_INT32 y = opj_int_fix_mul(r, 2449) + opj_int_fix_mul(g, - 4809) + opj_int_fix_mul(b, 934); - OPJ_INT32 u = -opj_int_fix_mul(r, 1382) - opj_int_fix_mul(g, - 2714) + opj_int_fix_mul(b, 4096); - OPJ_INT32 v = opj_int_fix_mul(r, 4096) - opj_int_fix_mul(g, - 3430) - opj_int_fix_mul(b, 666); - c0[i] = y; - c1[i] = u; - c2[i] = v; - } -} -#else -void opj_mct_encode_real( - OPJ_INT32* OPJ_RESTRICT c0, - OPJ_INT32* OPJ_RESTRICT c1, - OPJ_INT32* OPJ_RESTRICT c2, - OPJ_SIZE_T n) -{ - OPJ_UINT32 i; - for (i = 0; i < n; ++i) { - OPJ_INT32 r = c0[i]; - OPJ_INT32 g = c1[i]; - OPJ_INT32 b = c2[i]; - OPJ_INT32 y = opj_int_fix_mul(r, 2449) + opj_int_fix_mul(g, - 4809) + opj_int_fix_mul(b, 934); - OPJ_INT32 u = -opj_int_fix_mul(r, 1382) - opj_int_fix_mul(g, - 2714) + opj_int_fix_mul(b, 4096); - OPJ_INT32 v = opj_int_fix_mul(r, 4096) - opj_int_fix_mul(g, - 3430) - opj_int_fix_mul(b, 666); - c0[i] = y; - c1[i] = u; - c2[i] = v; - } -} -#endif - -/* */ -/* Inverse irreversible MCT. 
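   Undoes the floating-point ICT: R = Y + 1.402 V,
   G = Y - 0.34413 U - 0.71414 V, B = Y + 1.772 U, matching the
   vrv/vgu/vgv/vbu constants of the SSE path and the scalar tail below.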
*/ -/* */ -void opj_mct_decode_real( - OPJ_FLOAT32* OPJ_RESTRICT c0, - OPJ_FLOAT32* OPJ_RESTRICT c1, - OPJ_FLOAT32* OPJ_RESTRICT c2, - OPJ_SIZE_T n) -{ - OPJ_UINT32 i; -#ifdef __SSE__ - __m128 vrv, vgu, vgv, vbu; - vrv = _mm_set1_ps(1.402f); - vgu = _mm_set1_ps(0.34413f); - vgv = _mm_set1_ps(0.71414f); - vbu = _mm_set1_ps(1.772f); - for (i = 0; i < (n >> 3); ++i) { - __m128 vy, vu, vv; - __m128 vr, vg, vb; - - vy = _mm_load_ps(c0); - vu = _mm_load_ps(c1); - vv = _mm_load_ps(c2); - vr = _mm_add_ps(vy, _mm_mul_ps(vv, vrv)); - vg = _mm_sub_ps(_mm_sub_ps(vy, _mm_mul_ps(vu, vgu)), _mm_mul_ps(vv, vgv)); - vb = _mm_add_ps(vy, _mm_mul_ps(vu, vbu)); - _mm_store_ps(c0, vr); - _mm_store_ps(c1, vg); - _mm_store_ps(c2, vb); - c0 += 4; - c1 += 4; - c2 += 4; - - vy = _mm_load_ps(c0); - vu = _mm_load_ps(c1); - vv = _mm_load_ps(c2); - vr = _mm_add_ps(vy, _mm_mul_ps(vv, vrv)); - vg = _mm_sub_ps(_mm_sub_ps(vy, _mm_mul_ps(vu, vgu)), _mm_mul_ps(vv, vgv)); - vb = _mm_add_ps(vy, _mm_mul_ps(vu, vbu)); - _mm_store_ps(c0, vr); - _mm_store_ps(c1, vg); - _mm_store_ps(c2, vb); - c0 += 4; - c1 += 4; - c2 += 4; - } - n &= 7; -#endif - for (i = 0; i < n; ++i) { - OPJ_FLOAT32 y = c0[i]; - OPJ_FLOAT32 u = c1[i]; - OPJ_FLOAT32 v = c2[i]; - OPJ_FLOAT32 r = y + (v * 1.402f); - OPJ_FLOAT32 g = y - (u * 0.34413f) - (v * (0.71414f)); - OPJ_FLOAT32 b = y + (u * 1.772f); - c0[i] = r; - c1[i] = g; - c2[i] = b; - } -} - -/* */ -/* Get norm of basis function of irreversible MCT. */ -/* */ -OPJ_FLOAT64 opj_mct_getnorm_real(OPJ_UINT32 compno) -{ - return opj_mct_norms_real[compno]; -} - - -OPJ_BOOL opj_mct_encode_custom( - OPJ_BYTE * pCodingdata, - OPJ_SIZE_T n, - OPJ_BYTE ** pData, - OPJ_UINT32 pNbComp, - OPJ_UINT32 isSigned) -{ - OPJ_FLOAT32 * lMct = (OPJ_FLOAT32 *) pCodingdata; - OPJ_SIZE_T i; - OPJ_UINT32 j; - OPJ_UINT32 k; - OPJ_UINT32 lNbMatCoeff = pNbComp * pNbComp; - OPJ_INT32 * lCurrentData = 00; - OPJ_INT32 * lCurrentMatrix = 00; - OPJ_INT32 ** lData = (OPJ_INT32 **) pData; - OPJ_UINT32 lMultiplicator = 1 << 13; - OPJ_INT32 * lMctPtr; - - OPJ_ARG_NOT_USED(isSigned); - - lCurrentData = (OPJ_INT32 *) opj_malloc((pNbComp + lNbMatCoeff) * sizeof( - OPJ_INT32)); - if (! lCurrentData) { - return OPJ_FALSE; - } - - lCurrentMatrix = lCurrentData + pNbComp; - - for (i = 0; i < lNbMatCoeff; ++i) { - lCurrentMatrix[i] = (OPJ_INT32)(*(lMct++) * (OPJ_FLOAT32)lMultiplicator); - } - - for (i = 0; i < n; ++i) { - lMctPtr = lCurrentMatrix; - for (j = 0; j < pNbComp; ++j) { - lCurrentData[j] = (*(lData[j])); - } - - for (j = 0; j < pNbComp; ++j) { - *(lData[j]) = 0; - for (k = 0; k < pNbComp; ++k) { - *(lData[j]) += opj_int_fix_mul(*lMctPtr, lCurrentData[k]); - ++lMctPtr; - } - - ++lData[j]; - } - } - - opj_free(lCurrentData); - - return OPJ_TRUE; -} - -OPJ_BOOL opj_mct_decode_custom( - OPJ_BYTE * pDecodingData, - OPJ_SIZE_T n, - OPJ_BYTE ** pData, - OPJ_UINT32 pNbComp, - OPJ_UINT32 isSigned) -{ - OPJ_FLOAT32 * lMct; - OPJ_SIZE_T i; - OPJ_UINT32 j; - OPJ_UINT32 k; - - OPJ_FLOAT32 * lCurrentData = 00; - OPJ_FLOAT32 * lCurrentResult = 00; - OPJ_FLOAT32 ** lData = (OPJ_FLOAT32 **) pData; - - OPJ_ARG_NOT_USED(isSigned); - - lCurrentData = (OPJ_FLOAT32 *) opj_malloc(2 * pNbComp * sizeof(OPJ_FLOAT32)); - if (! 
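/* one scratch allocation: pNbComp current samples followed by pNbComp
   accumulators for the matrix product */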
lCurrentData) { - return OPJ_FALSE; - } - lCurrentResult = lCurrentData + pNbComp; - - for (i = 0; i < n; ++i) { - lMct = (OPJ_FLOAT32 *) pDecodingData; - for (j = 0; j < pNbComp; ++j) { - lCurrentData[j] = (OPJ_FLOAT32)(*(lData[j])); - } - for (j = 0; j < pNbComp; ++j) { - lCurrentResult[j] = 0; - for (k = 0; k < pNbComp; ++k) { - lCurrentResult[j] += *(lMct++) * lCurrentData[k]; - } - *(lData[j]++) = (OPJ_FLOAT32)(lCurrentResult[j]); - } - } - opj_free(lCurrentData); - return OPJ_TRUE; -} - -void opj_calculate_norms(OPJ_FLOAT64 * pNorms, - OPJ_UINT32 pNbComps, - OPJ_FLOAT32 * pMatrix) -{ - OPJ_UINT32 i, j, lIndex; - OPJ_FLOAT32 lCurrentValue; - OPJ_FLOAT64 * lNorms = (OPJ_FLOAT64 *) pNorms; - OPJ_FLOAT32 * lMatrix = (OPJ_FLOAT32 *) pMatrix; - - for (i = 0; i < pNbComps; ++i) { - lNorms[i] = 0; - lIndex = i; - - for (j = 0; j < pNbComps; ++j) { - lCurrentValue = lMatrix[lIndex]; - lIndex += pNbComps; - lNorms[i] += lCurrentValue * lCurrentValue; - } - lNorms[i] = sqrt(lNorms[i]); - } -} diff --git a/src/3rd/LibOpenJpeg/mct.h b/src/3rd/LibOpenJpeg/mct.h deleted file mode 100644 index 2e37ce73..00000000 --- a/src/3rd/LibOpenJpeg/mct.h +++ /dev/null @@ -1,159 +0,0 @@ -/* - * The copyright in this software is being made available under the 2-clauses - * BSD License, included below. This software may be subject to other third - * party and contributor rights, including patent rights, and no such rights - * are granted under this license. - * - * Copyright (c) 2002-2014, Universite catholique de Louvain (UCL), Belgium - * Copyright (c) 2002-2014, Professor Benoit Macq - * Copyright (c) 2001-2003, David Janssens - * Copyright (c) 2002-2003, Yannick Verschueren - * Copyright (c) 2003-2007, Francois-Olivier Devaux - * Copyright (c) 2003-2014, Antonin Descampe - * Copyright (c) 2005, Herve Drolon, FreeImage Team - * Copyright (c) 2008, 2011-2012, Centre National d'Etudes Spatiales (CNES), FR - * Copyright (c) 2012, CS Systemes d'Information, France - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS' - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef OPJ_MCT_H -#define OPJ_MCT_H -/** -@file mct.h -@brief Implementation of a multi-component transforms (MCT) - -The functions in MCT.C have for goal to realize reversible and irreversible multicomponent -transform. 
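For reference, the two forward transforms implemented there, sketched: the
reversible RCT, Y = (R + 2G + B) >> 2, U = B - G, V = R - G, and the
irreversible ICT, a fixed-point RGB->YCbCr conversion whose coefficients
(0.299, 0.587, 0.114 and friends) are scaled by 2^13 in mct.c.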
The functions in MCT.C are used by some function in TCD.C. -*/ - -/** @defgroup MCT MCT - Implementation of a multi-component transform */ -/*@{*/ - -/** @name Exported functions */ -/*@{*/ -/* ----------------------------------------------------------------------- */ -/** -Apply a reversible multi-component transform to an image -@param c0 Samples for red component -@param c1 Samples for green component -@param c2 Samples blue component -@param n Number of samples for each component -*/ -void opj_mct_encode(OPJ_INT32* OPJ_RESTRICT c0, OPJ_INT32* OPJ_RESTRICT c1, - OPJ_INT32* OPJ_RESTRICT c2, OPJ_SIZE_T n); -/** -Apply a reversible multi-component inverse transform to an image -@param c0 Samples for luminance component -@param c1 Samples for red chrominance component -@param c2 Samples for blue chrominance component -@param n Number of samples for each component -*/ -void opj_mct_decode(OPJ_INT32* OPJ_RESTRICT c0, OPJ_INT32* OPJ_RESTRICT c1, - OPJ_INT32* OPJ_RESTRICT c2, OPJ_SIZE_T n); -/** -Get norm of the basis function used for the reversible multi-component transform -@param compno Number of the component (0->Y, 1->U, 2->V) -@return -*/ -OPJ_FLOAT64 opj_mct_getnorm(OPJ_UINT32 compno); - -/** -Apply an irreversible multi-component transform to an image -@param c0 Samples for red component -@param c1 Samples for green component -@param c2 Samples blue component -@param n Number of samples for each component -*/ -void opj_mct_encode_real(OPJ_INT32* OPJ_RESTRICT c0, OPJ_INT32* OPJ_RESTRICT c1, - OPJ_INT32* OPJ_RESTRICT c2, OPJ_SIZE_T n); -/** -Apply an irreversible multi-component inverse transform to an image -@param c0 Samples for luminance component -@param c1 Samples for red chrominance component -@param c2 Samples for blue chrominance component -@param n Number of samples for each component -*/ -void opj_mct_decode_real(OPJ_FLOAT32* OPJ_RESTRICT c0, - OPJ_FLOAT32* OPJ_RESTRICT c1, OPJ_FLOAT32* OPJ_RESTRICT c2, OPJ_SIZE_T n); -/** -Get norm of the basis function used for the irreversible multi-component transform -@param compno Number of the component (0->Y, 1->U, 2->V) -@return -*/ -OPJ_FLOAT64 opj_mct_getnorm_real(OPJ_UINT32 compno); - -/** -FIXME DOC -@param p_coding_data MCT data -@param n size of components -@param p_data components -@param p_nb_comp nb of components (i.e. size of p_data) -@param is_signed tells if the data is signed -@return OPJ_FALSE if function encounter a problem, OPJ_TRUE otherwise -*/ -OPJ_BOOL opj_mct_encode_custom( - OPJ_BYTE * p_coding_data, - OPJ_SIZE_T n, - OPJ_BYTE ** p_data, - OPJ_UINT32 p_nb_comp, - OPJ_UINT32 is_signed); -/** -FIXME DOC -@param pDecodingData MCT data -@param n size of components -@param pData components -@param pNbComp nb of components (i.e. 
size of p_data) -@param isSigned tells if the data is signed -@return OPJ_FALSE if function encounter a problem, OPJ_TRUE otherwise -*/ -OPJ_BOOL opj_mct_decode_custom( - OPJ_BYTE * pDecodingData, - OPJ_SIZE_T n, - OPJ_BYTE ** pData, - OPJ_UINT32 pNbComp, - OPJ_UINT32 isSigned); -/** -FIXME DOC -@param pNorms MCT data -@param p_nb_comps size of components -@param pMatrix components -@return -*/ -void opj_calculate_norms(OPJ_FLOAT64 * pNorms, - OPJ_UINT32 p_nb_comps, - OPJ_FLOAT32 * pMatrix); -/** -FIXME DOC -*/ -const OPJ_FLOAT64 * opj_mct_get_mct_norms(void); -/** -FIXME DOC -*/ -const OPJ_FLOAT64 * opj_mct_get_mct_norms_real(void); -/* ----------------------------------------------------------------------- */ -/*@}*/ - -/*@}*/ - -#endif /* OPJ_MCT_H */ diff --git a/src/3rd/LibOpenJpeg/mqc.c b/src/3rd/LibOpenJpeg/mqc.c deleted file mode 100644 index 7ca44329..00000000 --- a/src/3rd/LibOpenJpeg/mqc.c +++ /dev/null @@ -1,560 +0,0 @@ -/* - * The copyright in this software is being made available under the 2-clauses - * BSD License, included below. This software may be subject to other third - * party and contributor rights, including patent rights, and no such rights - * are granted under this license. - * - * Copyright (c) 2002-2014, Universite catholique de Louvain (UCL), Belgium - * Copyright (c) 2002-2014, Professor Benoit Macq - * Copyright (c) 2001-2003, David Janssens - * Copyright (c) 2002-2003, Yannick Verschueren - * Copyright (c) 2003-2007, Francois-Olivier Devaux - * Copyright (c) 2003-2014, Antonin Descampe - * Copyright (c) 2005, Herve Drolon, FreeImage Team - * Copyright (c) 2008, Jerome Fimes, Communications & Systemes - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS' - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#include "opj_includes.h" - -#include - -/** @defgroup MQC MQC - Implementation of an MQ-Coder */ -/*@{*/ - -/** @name Local static functions */ -/*@{*/ - -/** -Output a byte, doing bit-stuffing if necessary. -After a 0xff byte, the next byte must be smaller than 0x90. 
-@param mqc MQC handle -*/ -static void opj_mqc_byteout(opj_mqc_t *mqc); -/** -Renormalize mqc->a and mqc->c while encoding, so that mqc->a stays between 0x8000 and 0x10000 -@param mqc MQC handle -*/ -static void opj_mqc_renorme(opj_mqc_t *mqc); -/** -Encode the most probable symbol -@param mqc MQC handle -*/ -static void opj_mqc_codemps(opj_mqc_t *mqc); -/** -Encode the most least symbol -@param mqc MQC handle -*/ -static void opj_mqc_codelps(opj_mqc_t *mqc); -/** -Fill mqc->c with 1's for flushing -@param mqc MQC handle -*/ -static void opj_mqc_setbits(opj_mqc_t *mqc); -/*@}*/ - -/*@}*/ - -/* */ -/* This array defines all the possible states for a context. */ -/* */ -static const opj_mqc_state_t mqc_states[47 * 2] = { - {0x5601, 0, &mqc_states[2], &mqc_states[3]}, - {0x5601, 1, &mqc_states[3], &mqc_states[2]}, - {0x3401, 0, &mqc_states[4], &mqc_states[12]}, - {0x3401, 1, &mqc_states[5], &mqc_states[13]}, - {0x1801, 0, &mqc_states[6], &mqc_states[18]}, - {0x1801, 1, &mqc_states[7], &mqc_states[19]}, - {0x0ac1, 0, &mqc_states[8], &mqc_states[24]}, - {0x0ac1, 1, &mqc_states[9], &mqc_states[25]}, - {0x0521, 0, &mqc_states[10], &mqc_states[58]}, - {0x0521, 1, &mqc_states[11], &mqc_states[59]}, - {0x0221, 0, &mqc_states[76], &mqc_states[66]}, - {0x0221, 1, &mqc_states[77], &mqc_states[67]}, - {0x5601, 0, &mqc_states[14], &mqc_states[13]}, - {0x5601, 1, &mqc_states[15], &mqc_states[12]}, - {0x5401, 0, &mqc_states[16], &mqc_states[28]}, - {0x5401, 1, &mqc_states[17], &mqc_states[29]}, - {0x4801, 0, &mqc_states[18], &mqc_states[28]}, - {0x4801, 1, &mqc_states[19], &mqc_states[29]}, - {0x3801, 0, &mqc_states[20], &mqc_states[28]}, - {0x3801, 1, &mqc_states[21], &mqc_states[29]}, - {0x3001, 0, &mqc_states[22], &mqc_states[34]}, - {0x3001, 1, &mqc_states[23], &mqc_states[35]}, - {0x2401, 0, &mqc_states[24], &mqc_states[36]}, - {0x2401, 1, &mqc_states[25], &mqc_states[37]}, - {0x1c01, 0, &mqc_states[26], &mqc_states[40]}, - {0x1c01, 1, &mqc_states[27], &mqc_states[41]}, - {0x1601, 0, &mqc_states[58], &mqc_states[42]}, - {0x1601, 1, &mqc_states[59], &mqc_states[43]}, - {0x5601, 0, &mqc_states[30], &mqc_states[29]}, - {0x5601, 1, &mqc_states[31], &mqc_states[28]}, - {0x5401, 0, &mqc_states[32], &mqc_states[28]}, - {0x5401, 1, &mqc_states[33], &mqc_states[29]}, - {0x5101, 0, &mqc_states[34], &mqc_states[30]}, - {0x5101, 1, &mqc_states[35], &mqc_states[31]}, - {0x4801, 0, &mqc_states[36], &mqc_states[32]}, - {0x4801, 1, &mqc_states[37], &mqc_states[33]}, - {0x3801, 0, &mqc_states[38], &mqc_states[34]}, - {0x3801, 1, &mqc_states[39], &mqc_states[35]}, - {0x3401, 0, &mqc_states[40], &mqc_states[36]}, - {0x3401, 1, &mqc_states[41], &mqc_states[37]}, - {0x3001, 0, &mqc_states[42], &mqc_states[38]}, - {0x3001, 1, &mqc_states[43], &mqc_states[39]}, - {0x2801, 0, &mqc_states[44], &mqc_states[38]}, - {0x2801, 1, &mqc_states[45], &mqc_states[39]}, - {0x2401, 0, &mqc_states[46], &mqc_states[40]}, - {0x2401, 1, &mqc_states[47], &mqc_states[41]}, - {0x2201, 0, &mqc_states[48], &mqc_states[42]}, - {0x2201, 1, &mqc_states[49], &mqc_states[43]}, - {0x1c01, 0, &mqc_states[50], &mqc_states[44]}, - {0x1c01, 1, &mqc_states[51], &mqc_states[45]}, - {0x1801, 0, &mqc_states[52], &mqc_states[46]}, - {0x1801, 1, &mqc_states[53], &mqc_states[47]}, - {0x1601, 0, &mqc_states[54], &mqc_states[48]}, - {0x1601, 1, &mqc_states[55], &mqc_states[49]}, - {0x1401, 0, &mqc_states[56], &mqc_states[50]}, - {0x1401, 1, &mqc_states[57], &mqc_states[51]}, - {0x1201, 0, &mqc_states[58], &mqc_states[52]}, - {0x1201, 1, &mqc_states[59], 
&mqc_states[53]}, - {0x1101, 0, &mqc_states[60], &mqc_states[54]}, - {0x1101, 1, &mqc_states[61], &mqc_states[55]}, - {0x0ac1, 0, &mqc_states[62], &mqc_states[56]}, - {0x0ac1, 1, &mqc_states[63], &mqc_states[57]}, - {0x09c1, 0, &mqc_states[64], &mqc_states[58]}, - {0x09c1, 1, &mqc_states[65], &mqc_states[59]}, - {0x08a1, 0, &mqc_states[66], &mqc_states[60]}, - {0x08a1, 1, &mqc_states[67], &mqc_states[61]}, - {0x0521, 0, &mqc_states[68], &mqc_states[62]}, - {0x0521, 1, &mqc_states[69], &mqc_states[63]}, - {0x0441, 0, &mqc_states[70], &mqc_states[64]}, - {0x0441, 1, &mqc_states[71], &mqc_states[65]}, - {0x02a1, 0, &mqc_states[72], &mqc_states[66]}, - {0x02a1, 1, &mqc_states[73], &mqc_states[67]}, - {0x0221, 0, &mqc_states[74], &mqc_states[68]}, - {0x0221, 1, &mqc_states[75], &mqc_states[69]}, - {0x0141, 0, &mqc_states[76], &mqc_states[70]}, - {0x0141, 1, &mqc_states[77], &mqc_states[71]}, - {0x0111, 0, &mqc_states[78], &mqc_states[72]}, - {0x0111, 1, &mqc_states[79], &mqc_states[73]}, - {0x0085, 0, &mqc_states[80], &mqc_states[74]}, - {0x0085, 1, &mqc_states[81], &mqc_states[75]}, - {0x0049, 0, &mqc_states[82], &mqc_states[76]}, - {0x0049, 1, &mqc_states[83], &mqc_states[77]}, - {0x0025, 0, &mqc_states[84], &mqc_states[78]}, - {0x0025, 1, &mqc_states[85], &mqc_states[79]}, - {0x0015, 0, &mqc_states[86], &mqc_states[80]}, - {0x0015, 1, &mqc_states[87], &mqc_states[81]}, - {0x0009, 0, &mqc_states[88], &mqc_states[82]}, - {0x0009, 1, &mqc_states[89], &mqc_states[83]}, - {0x0005, 0, &mqc_states[90], &mqc_states[84]}, - {0x0005, 1, &mqc_states[91], &mqc_states[85]}, - {0x0001, 0, &mqc_states[90], &mqc_states[86]}, - {0x0001, 1, &mqc_states[91], &mqc_states[87]}, - {0x5601, 0, &mqc_states[92], &mqc_states[92]}, - {0x5601, 1, &mqc_states[93], &mqc_states[93]}, -}; - -/* -========================================================== - local functions -========================================================== -*/ - -static void opj_mqc_byteout(opj_mqc_t *mqc) -{ - /* bp is initialized to start - 1 in opj_mqc_init_enc() */ - /* but this is safe, see opj_tcd_code_block_enc_allocate_data() */ - assert(mqc->bp >= mqc->start - 1); - if (*mqc->bp == 0xff) { - mqc->bp++; - *mqc->bp = (OPJ_BYTE)(mqc->c >> 20); - mqc->c &= 0xfffff; - mqc->ct = 7; - } else { - if ((mqc->c & 0x8000000) == 0) { - mqc->bp++; - *mqc->bp = (OPJ_BYTE)(mqc->c >> 19); - mqc->c &= 0x7ffff; - mqc->ct = 8; - } else { - (*mqc->bp)++; - if (*mqc->bp == 0xff) { - mqc->c &= 0x7ffffff; - mqc->bp++; - *mqc->bp = (OPJ_BYTE)(mqc->c >> 20); - mqc->c &= 0xfffff; - mqc->ct = 7; - } else { - mqc->bp++; - *mqc->bp = (OPJ_BYTE)(mqc->c >> 19); - mqc->c &= 0x7ffff; - mqc->ct = 8; - } - } - } -} - -static void opj_mqc_renorme(opj_mqc_t *mqc) -{ - do { - mqc->a <<= 1; - mqc->c <<= 1; - mqc->ct--; - if (mqc->ct == 0) { - opj_mqc_byteout(mqc); - } - } while ((mqc->a & 0x8000) == 0); -} - -static void opj_mqc_codemps(opj_mqc_t *mqc) -{ - mqc->a -= (*mqc->curctx)->qeval; - if ((mqc->a & 0x8000) == 0) { - if (mqc->a < (*mqc->curctx)->qeval) { - mqc->a = (*mqc->curctx)->qeval; - } else { - mqc->c += (*mqc->curctx)->qeval; - } - *mqc->curctx = (*mqc->curctx)->nmps; - opj_mqc_renorme(mqc); - } else { - mqc->c += (*mqc->curctx)->qeval; - } -} - -static void opj_mqc_codelps(opj_mqc_t *mqc) -{ - mqc->a -= (*mqc->curctx)->qeval; - if (mqc->a < (*mqc->curctx)->qeval) { - mqc->c += (*mqc->curctx)->qeval; - } else { - mqc->a = (*mqc->curctx)->qeval; - } - *mqc->curctx = (*mqc->curctx)->nlps; - opj_mqc_renorme(mqc); -} - -static void opj_mqc_setbits(opj_mqc_t *mqc) -{ - 
OPJ_UINT32 tempc = mqc->c + mqc->a; - mqc->c |= 0xffff; - if (mqc->c >= tempc) { - mqc->c -= 0x8000; - } -} - -/* -========================================================== - MQ-Coder interface -========================================================== -*/ - -OPJ_UINT32 opj_mqc_numbytes(opj_mqc_t *mqc) -{ - const ptrdiff_t diff = mqc->bp - mqc->start; -#if 0 - assert(diff <= 0xffffffff && diff >= 0); /* UINT32_MAX */ -#endif - return (OPJ_UINT32)diff; -} - -void opj_mqc_init_enc(opj_mqc_t *mqc, OPJ_BYTE *bp) -{ - /* To avoid the curctx pointer to be dangling, but not strictly */ - /* required as the current context is always set before encoding */ - opj_mqc_setcurctx(mqc, 0); - - /* As specified in Figure C.10 - Initialization of the encoder */ - /* (C.2.8 Initialization of the encoder (INITENC)) */ - mqc->a = 0x8000; - mqc->c = 0; - /* Yes, we point before the start of the buffer, but this is safe */ - /* given opj_tcd_code_block_enc_allocate_data() */ - mqc->bp = bp - 1; - mqc->ct = 12; - /* At this point we should test *(mqc->bp) against 0xFF, but this is not */ - /* necessary, as this is only used at the beginning of the code block */ - /* and our initial fake byte is set at 0 */ - assert(*(mqc->bp) != 0xff); - - mqc->start = bp; - mqc->end_of_byte_stream_counter = 0; -} - -void opj_mqc_encode(opj_mqc_t *mqc, OPJ_UINT32 d) -{ - if ((*mqc->curctx)->mps == d) { - opj_mqc_codemps(mqc); - } else { - opj_mqc_codelps(mqc); - } -} - -void opj_mqc_flush(opj_mqc_t *mqc) -{ - /* C.2.9 Termination of coding (FLUSH) */ - /* Figure C.11 – FLUSH procedure */ - opj_mqc_setbits(mqc); - mqc->c <<= mqc->ct; - opj_mqc_byteout(mqc); - mqc->c <<= mqc->ct; - opj_mqc_byteout(mqc); - - /* It is forbidden that a coding pass ends with 0xff */ - if (*mqc->bp != 0xff) { - /* Advance pointer so that opj_mqc_numbytes() returns a valid value */ - mqc->bp++; - } -} - -#define BYPASS_CT_INIT 0xDEADBEEF - -void opj_mqc_bypass_init_enc(opj_mqc_t *mqc) -{ - /* This function is normally called after at least one opj_mqc_flush() */ - /* which will have advance mqc->bp by at least 2 bytes beyond its */ - /* initial position */ - assert(mqc->bp >= mqc->start); - mqc->c = 0; - /* in theory we should initialize to 8, but use this special value */ - /* as a hint that opj_mqc_bypass_enc() has never been called, so */ - /* as to avoid the 0xff 0x7f elimination trick in opj_mqc_bypass_flush_enc() */ - /* to trigger when we don't have output any bit during this bypass sequence */ - /* Any value > 8 will do */ - mqc->ct = BYPASS_CT_INIT; - /* Given that we are called after opj_mqc_flush(), the previous byte */ - /* cannot be 0xff. */ - assert(mqc->bp[-1] != 0xff); -} - -void opj_mqc_bypass_enc(opj_mqc_t *mqc, OPJ_UINT32 d) -{ - if (mqc->ct == BYPASS_CT_INIT) { - mqc->ct = 8; - } - mqc->ct--; - mqc->c = mqc->c + (d << mqc->ct); - if (mqc->ct == 0) { - *mqc->bp = (OPJ_BYTE)mqc->c; - mqc->ct = 8; - /* If the previous byte was 0xff, make sure that the next msb is 0 */ - if (*mqc->bp == 0xff) { - mqc->ct = 7; - } - mqc->bp++; - mqc->c = 0; - } -} - -OPJ_UINT32 opj_mqc_bypass_get_extra_bytes(opj_mqc_t *mqc, OPJ_BOOL erterm) -{ - return (mqc->ct < 7 || - (mqc->ct == 7 && (erterm || mqc->bp[-1] != 0xff))) ? 1 : 0; -} - -void opj_mqc_bypass_flush_enc(opj_mqc_t *mqc, OPJ_BOOL erterm) -{ - /* Is there any bit remaining to be flushed ? 
*/ - /* If the last output byte is 0xff, we can discard it, unless */ - /* erterm is required (I'm not completely sure why in erterm */ - /* we must output 0xff 0x2a if the last byte was 0xff instead of */ - /* discarding it, but Kakadu requires it when decoding */ - /* in -fussy mode) */ - if (mqc->ct < 7 || (mqc->ct == 7 && (erterm || mqc->bp[-1] != 0xff))) { - OPJ_BYTE bit_value = 0; - /* If so, fill the remaining lsbs with an alternating sequence of */ - /* 0,1,... */ - /* Note: it seems the standard only requires that for a ERTERM flush */ - /* and doesn't specify what to do for a regular BYPASS flush */ - while (mqc->ct > 0) { - mqc->ct--; - mqc->c += (OPJ_UINT32)(bit_value << mqc->ct); - bit_value = (OPJ_BYTE)(1U - bit_value); - } - *mqc->bp = (OPJ_BYTE)mqc->c; - /* Advance pointer so that opj_mqc_numbytes() returns a valid value */ - mqc->bp++; - } else if (mqc->ct == 7 && mqc->bp[-1] == 0xff) { - /* Discard last 0xff */ - assert(!erterm); - mqc->bp --; - } else if (mqc->ct == 8 && !erterm && - mqc->bp[-1] == 0x7f && mqc->bp[-2] == 0xff) { - /* Tiny optimization: discard terminating 0xff 0x7f since it is */ - /* interpreted as 0xff 0x7f [0xff 0xff] by the decoder, and given */ - /* the bit stuffing, in fact as 0xff 0xff [0xff ..] */ - /* Happens once on opj_compress -i ../MAPA.tif -o MAPA.j2k -M 1 */ - mqc->bp -= 2; - } - - assert(mqc->bp[-1] != 0xff); -} - -void opj_mqc_reset_enc(opj_mqc_t *mqc) -{ - opj_mqc_resetstates(mqc); - opj_mqc_setstate(mqc, T1_CTXNO_UNI, 0, 46); - opj_mqc_setstate(mqc, T1_CTXNO_AGG, 0, 3); - opj_mqc_setstate(mqc, T1_CTXNO_ZC, 0, 4); -} - -#ifdef notdef -OPJ_UINT32 opj_mqc_restart_enc(opj_mqc_t *mqc) -{ - OPJ_UINT32 correction = 1; - - /* */ - OPJ_INT32 n = (OPJ_INT32)(27 - 15 - mqc->ct); - mqc->c <<= mqc->ct; - while (n > 0) { - opj_mqc_byteout(mqc); - n -= (OPJ_INT32)mqc->ct; - mqc->c <<= mqc->ct; - } - opj_mqc_byteout(mqc); - - return correction; -} -#endif - -void opj_mqc_restart_init_enc(opj_mqc_t *mqc) -{ - /* */ - - /* As specified in Figure C.10 - Initialization of the encoder */ - /* (C.2.8 Initialization of the encoder (INITENC)) */ - mqc->a = 0x8000; - mqc->c = 0; - mqc->ct = 12; - /* This function is normally called after at least one opj_mqc_flush() */ - /* which will have advance mqc->bp by at least 2 bytes beyond its */ - /* initial position */ - mqc->bp --; - assert(mqc->bp >= mqc->start - 1); - assert(*mqc->bp != 0xff); - if (*mqc->bp == 0xff) { - mqc->ct = 13; - } -} - -void opj_mqc_erterm_enc(opj_mqc_t *mqc) -{ - OPJ_INT32 k = (OPJ_INT32)(11 - mqc->ct + 1); - - while (k > 0) { - mqc->c <<= mqc->ct; - mqc->ct = 0; - opj_mqc_byteout(mqc); - k -= (OPJ_INT32)mqc->ct; - } - - if (*mqc->bp != 0xff) { - opj_mqc_byteout(mqc); - } -} - -void opj_mqc_segmark_enc(opj_mqc_t *mqc) -{ - OPJ_UINT32 i; - opj_mqc_setcurctx(mqc, 18); - - for (i = 1; i < 5; i++) { - opj_mqc_encode(mqc, i % 2); - } -} - -static void opj_mqc_init_dec_common(opj_mqc_t *mqc, - OPJ_BYTE *bp, - OPJ_UINT32 len, - OPJ_UINT32 extra_writable_bytes) -{ - (void)extra_writable_bytes; - - assert(extra_writable_bytes >= OPJ_COMMON_CBLK_DATA_EXTRA); - mqc->start = bp; - mqc->end = bp + len; - /* Insert an artificial 0xFF 0xFF marker at end of the code block */ - /* data so that the bytein routines stop on it. 
This saves us comparing */ - /* the bp and end pointers */ - /* But before inserting it, backup th bytes we will overwrite */ - memcpy(mqc->backup, mqc->end, OPJ_COMMON_CBLK_DATA_EXTRA); - mqc->end[0] = 0xFF; - mqc->end[1] = 0xFF; - mqc->bp = bp; -} -void opj_mqc_init_dec(opj_mqc_t *mqc, OPJ_BYTE *bp, OPJ_UINT32 len, - OPJ_UINT32 extra_writable_bytes) -{ - /* Implements ISO 15444-1 C.3.5 Initialization of the decoder (INITDEC) */ - /* Note: alternate "J.1 - Initialization of the software-conventions */ - /* decoder" has been tried, but does */ - /* not bring any improvement. */ - /* See https://github.com/uclouvain/openjpeg/issues/921 */ - opj_mqc_init_dec_common(mqc, bp, len, extra_writable_bytes); - opj_mqc_setcurctx(mqc, 0); - mqc->end_of_byte_stream_counter = 0; - if (len == 0) { - mqc->c = 0xff << 16; - } else { - mqc->c = (OPJ_UINT32)(*mqc->bp << 16); - } - - opj_mqc_bytein(mqc); - mqc->c <<= 7; - mqc->ct -= 7; - mqc->a = 0x8000; -} - - -void opj_mqc_raw_init_dec(opj_mqc_t *mqc, OPJ_BYTE *bp, OPJ_UINT32 len, - OPJ_UINT32 extra_writable_bytes) -{ - opj_mqc_init_dec_common(mqc, bp, len, extra_writable_bytes); - mqc->c = 0; - mqc->ct = 0; -} - - -void opq_mqc_finish_dec(opj_mqc_t *mqc) -{ - /* Restore the bytes overwritten by opj_mqc_init_dec_common() */ - memcpy(mqc->end, mqc->backup, OPJ_COMMON_CBLK_DATA_EXTRA); -} - -void opj_mqc_resetstates(opj_mqc_t *mqc) -{ - OPJ_UINT32 i; - for (i = 0; i < MQC_NUMCTXS; i++) { - mqc->ctxs[i] = mqc_states; - } -} - -void opj_mqc_setstate(opj_mqc_t *mqc, OPJ_UINT32 ctxno, OPJ_UINT32 msb, - OPJ_INT32 prob) -{ - mqc->ctxs[ctxno] = &mqc_states[msb + (OPJ_UINT32)(prob << 1)]; -} - - diff --git a/src/3rd/LibOpenJpeg/mqc.h b/src/3rd/LibOpenJpeg/mqc.h deleted file mode 100644 index 7bc2d14e..00000000 --- a/src/3rd/LibOpenJpeg/mqc.h +++ /dev/null @@ -1,271 +0,0 @@ -/* - * The copyright in this software is being made available under the 2-clauses - * BSD License, included below. This software may be subject to other third - * party and contributor rights, including patent rights, and no such rights - * are granted under this license. - * - * Copyright (c) 2002-2014, Universite catholique de Louvain (UCL), Belgium - * Copyright (c) 2002-2014, Professor Benoit Macq - * Copyright (c) 2001-2003, David Janssens - * Copyright (c) 2002-2003, Yannick Verschueren - * Copyright (c) 2003-2007, Francois-Olivier Devaux - * Copyright (c) 2003-2014, Antonin Descampe - * Copyright (c) 2005, Herve Drolon, FreeImage Team - * Copyright (c) 2008, Jerome Fimes, Communications & Systemes - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS' - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef OPJ_MQC_H -#define OPJ_MQC_H - -#include "opj_common.h" - -/** -@file mqc.h -@brief Implementation of an MQ-Coder (MQC) - -The functions in MQC.C have for goal to realize the MQ-coder operations. The functions -in MQC.C are used by some function in T1.C. -*/ - -/** @defgroup MQC MQC - Implementation of an MQ-Coder */ -/*@{*/ - -/** -This struct defines the state of a context. -*/ -typedef struct opj_mqc_state { - /** the probability of the Least Probable Symbol (0.75->0x8000, 1.5->0xffff) */ - OPJ_UINT32 qeval; - /** the Most Probable Symbol (0 or 1) */ - OPJ_UINT32 mps; - /** next state if the next encoded symbol is the MPS */ - const struct opj_mqc_state *nmps; - /** next state if the next encoded symbol is the LPS */ - const struct opj_mqc_state *nlps; -} opj_mqc_state_t; - -#define MQC_NUMCTXS 19 - -/** -MQ coder -*/ -typedef struct opj_mqc { - /** temporary buffer where bits are coded or decoded */ - OPJ_UINT32 c; - /** only used by MQ decoder */ - OPJ_UINT32 a; - /** number of bits already read or free to write */ - OPJ_UINT32 ct; - /* only used by decoder, to count the number of times a terminating 0xFF >0x8F marker is read */ - OPJ_UINT32 end_of_byte_stream_counter; - /** pointer to the current position in the buffer */ - OPJ_BYTE *bp; - /** pointer to the start of the buffer */ - OPJ_BYTE *start; - /** pointer to the end of the buffer */ - OPJ_BYTE *end; - /** Array of contexts */ - const opj_mqc_state_t *ctxs[MQC_NUMCTXS]; - /** Active context */ - const opj_mqc_state_t **curctx; - /* lut_ctxno_zc shifted by (1 << 9) * bandno */ - const OPJ_BYTE* lut_ctxno_zc_orient; - /** Original value of the 2 bytes at end[0] and end[1] */ - OPJ_BYTE backup[OPJ_COMMON_CBLK_DATA_EXTRA]; -} opj_mqc_t; - -#include "mqc_inl.h" - -/** @name Exported functions */ -/*@{*/ -/* ----------------------------------------------------------------------- */ - -/** -Return the number of bytes written/read since initialisation -@param mqc MQC handle -@return Returns the number of bytes already encoded -*/ -OPJ_UINT32 opj_mqc_numbytes(opj_mqc_t *mqc); -/** -Reset the states of all the context of the coder/decoder -(each context is set to a state where 0 and 1 are more or less equiprobable) -@param mqc MQC handle -*/ -void opj_mqc_resetstates(opj_mqc_t *mqc); -/** -Set the state of a particular context -@param mqc MQC handle -@param ctxno Number that identifies the context -@param msb The MSB of the new state of the context -@param prob Number that identifies the probability of the symbols for the new state of the context -*/ -void opj_mqc_setstate(opj_mqc_t *mqc, OPJ_UINT32 ctxno, OPJ_UINT32 msb, - OPJ_INT32 prob); -/** -Initialize the encoder -@param mqc MQC handle -@param bp Pointer to the start of the buffer where the bytes will be written -*/ -void opj_mqc_init_enc(opj_mqc_t *mqc, OPJ_BYTE *bp); -/** -Set the current context used for coding/decoding -@param mqc MQC handle -@param ctxno Number that identifies the context -*/ -#define opj_mqc_setcurctx(mqc, 
ctxno) (mqc)->curctx = &(mqc)->ctxs[(OPJ_UINT32)(ctxno)] -/** -Encode a symbol using the MQ-coder -@param mqc MQC handle -@param d The symbol to be encoded (0 or 1) -*/ -void opj_mqc_encode(opj_mqc_t *mqc, OPJ_UINT32 d); -/** -Flush the encoder, so that all remaining data is written -@param mqc MQC handle -*/ -void opj_mqc_flush(opj_mqc_t *mqc); -/** -BYPASS mode switch, initialization operation. -JPEG 2000 p 505. -@param mqc MQC handle -*/ -void opj_mqc_bypass_init_enc(opj_mqc_t *mqc); - -/** Return number of extra bytes to add to opj_mqc_numbytes() for the² - size of a non-terminating BYPASS pass -@param mqc MQC handle -@param erterm 1 if ERTERM is enabled, 0 otherwise -*/ -OPJ_UINT32 opj_mqc_bypass_get_extra_bytes(opj_mqc_t *mqc, OPJ_BOOL erterm); - -/** -BYPASS mode switch, coding operation. -JPEG 2000 p 505. -@param mqc MQC handle -@param d The symbol to be encoded (0 or 1) -*/ -void opj_mqc_bypass_enc(opj_mqc_t *mqc, OPJ_UINT32 d); -/** -BYPASS mode switch, flush operation -@param mqc MQC handle -@param erterm 1 if ERTERM is enabled, 0 otherwise -*/ -void opj_mqc_bypass_flush_enc(opj_mqc_t *mqc, OPJ_BOOL erterm); -/** -RESET mode switch -@param mqc MQC handle -*/ -void opj_mqc_reset_enc(opj_mqc_t *mqc); - -#ifdef notdef -/** -RESTART mode switch (TERMALL) -@param mqc MQC handle -@return Returns 1 (always) -*/ -OPJ_UINT32 opj_mqc_restart_enc(opj_mqc_t *mqc); -#endif - -/** -RESTART mode switch (TERMALL) reinitialisation -@param mqc MQC handle -*/ -void opj_mqc_restart_init_enc(opj_mqc_t *mqc); -/** -ERTERM mode switch (PTERM) -@param mqc MQC handle -*/ -void opj_mqc_erterm_enc(opj_mqc_t *mqc); -/** -SEGMARK mode switch (SEGSYM) -@param mqc MQC handle -*/ -void opj_mqc_segmark_enc(opj_mqc_t *mqc); - -/** -Initialize the decoder for MQ decoding. - -opj_mqc_finish_dec() must be absolutely called after finishing the decoding -passes, so as to restore the bytes temporarily overwritten. - -@param mqc MQC handle -@param bp Pointer to the start of the buffer from which the bytes will be read - Note that OPJ_COMMON_CBLK_DATA_EXTRA bytes at the end of the buffer - will be temporarily overwritten with an artificial 0xFF 0xFF marker. - (they will be backuped in the mqc structure to be restored later) - So bp must be at least len + OPJ_COMMON_CBLK_DATA_EXTRA large, and - writable. -@param len Length of the input buffer -@param extra_writable_bytes Indicate how many bytes after len are writable. - This is to indicate your consent that bp must be - large enough. -*/ -void opj_mqc_init_dec(opj_mqc_t *mqc, OPJ_BYTE *bp, OPJ_UINT32 len, - OPJ_UINT32 extra_writable_bytes); - -/** -Initialize the decoder for RAW decoding. - -opj_mqc_finish_dec() must be absolutely called after finishing the decoding -passes, so as to restore the bytes temporarily overwritten. - -@param mqc MQC handle -@param bp Pointer to the start of the buffer from which the bytes will be read - Note that OPJ_COMMON_CBLK_DATA_EXTRA bytes at the end of the buffer - will be temporarily overwritten with an artificial 0xFF 0xFF marker. - (they will be backuped in the mqc structure to be restored later) - So bp must be at least len + OPJ_COMMON_CBLK_DATA_EXTRA large, and - writable. -@param len Length of the input buffer -@param extra_writable_bytes Indicate how many bytes after len are writable. - This is to indicate your consent that bp must be - large enough. 
-*/ -void opj_mqc_raw_init_dec(opj_mqc_t *mqc, OPJ_BYTE *bp, OPJ_UINT32 len, - OPJ_UINT32 extra_writable_bytes); - - -/** -Terminate RAW/MQC decoding - -This restores the bytes temporarily overwritten by opj_mqc_init_dec()/ -opj_mqc_raw_init_dec() - -@param mqc MQC handle -*/ -void opq_mqc_finish_dec(opj_mqc_t *mqc); - -/** -Decode a symbol -@param mqc MQC handle -@return Returns the decoded symbol (0 or 1) -*/ -/*static INLINE OPJ_UINT32 opj_mqc_decode(opj_mqc_t * const mqc);*/ -/* ----------------------------------------------------------------------- */ -/*@}*/ - -/*@}*/ - -#endif /* OPJ_MQC_H */ diff --git a/src/3rd/LibOpenJpeg/mqc_inl.h b/src/3rd/LibOpenJpeg/mqc_inl.h deleted file mode 100644 index 310a3287..00000000 --- a/src/3rd/LibOpenJpeg/mqc_inl.h +++ /dev/null @@ -1,196 +0,0 @@ -/* - * The copyright in this software is being made available under the 2-clauses - * BSD License, included below. This software may be subject to other third - * party and contributor rights, including patent rights, and no such rights - * are granted under this license. - * - * Copyright (c) 2002-2014, Universite catholique de Louvain (UCL), Belgium - * Copyright (c) 2002-2014, Professor Benoit Macq - * Copyright (c) 2001-2003, David Janssens - * Copyright (c) 2002-2003, Yannick Verschueren - * Copyright (c) 2003-2007, Francois-Olivier Devaux - * Copyright (c) 2003-2014, Antonin Descampe - * Copyright (c) 2005, Herve Drolon, FreeImage Team - * Copyright (c) 2008, Jerome Fimes, Communications & Systemes - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS' - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- */ - -#ifndef OPJ_MQC_INL_H -#define OPJ_MQC_INL_H - -/* For internal use of opj_mqc_decode_macro() */ -#define opj_mqc_mpsexchange_macro(d, curctx, a) \ -{ \ - if (a < (*curctx)->qeval) { \ - d = !((*curctx)->mps); \ - *curctx = (*curctx)->nlps; \ - } else { \ - d = (*curctx)->mps; \ - *curctx = (*curctx)->nmps; \ - } \ -} - -/* For internal use of opj_mqc_decode_macro() */ -#define opj_mqc_lpsexchange_macro(d, curctx, a) \ -{ \ - if (a < (*curctx)->qeval) { \ - a = (*curctx)->qeval; \ - d = (*curctx)->mps; \ - *curctx = (*curctx)->nmps; \ - } else { \ - a = (*curctx)->qeval; \ - d = !((*curctx)->mps); \ - *curctx = (*curctx)->nlps; \ - } \ -} - - -/** -Decode a symbol using raw-decoder. Cfr p.506 TAUBMAN -@param mqc MQC handle -@return Returns the decoded symbol (0 or 1) -*/ -static INLINE OPJ_UINT32 opj_mqc_raw_decode(opj_mqc_t *mqc) -{ - OPJ_UINT32 d; - if (mqc->ct == 0) { - /* Given opj_mqc_raw_init_dec() we know that at some point we will */ - /* have a 0xFF 0xFF artificial marker */ - if (mqc->c == 0xff) { - if (*mqc->bp > 0x8f) { - mqc->c = 0xff; - mqc->ct = 8; - } else { - mqc->c = *mqc->bp; - mqc->bp ++; - mqc->ct = 7; - } - } else { - mqc->c = *mqc->bp; - mqc->bp ++; - mqc->ct = 8; - } - } - mqc->ct--; - d = ((OPJ_UINT32)mqc->c >> mqc->ct) & 0x01U; - - return d; -} - - -#define opj_mqc_bytein_macro(mqc, c, ct) \ -{ \ - OPJ_UINT32 l_c; \ - /* Given opj_mqc_init_dec() we know that at some point we will */ \ - /* have a 0xFF 0xFF artificial marker */ \ - l_c = *(mqc->bp + 1); \ - if (*mqc->bp == 0xff) { \ - if (l_c > 0x8f) { \ - c += 0xff00; \ - ct = 8; \ - mqc->end_of_byte_stream_counter ++; \ - } else { \ - mqc->bp++; \ - c += l_c << 9; \ - ct = 7; \ - } \ - } else { \ - mqc->bp++; \ - c += l_c << 8; \ - ct = 8; \ - } \ -} - -/* For internal use of opj_mqc_decode_macro() */ -#define opj_mqc_renormd_macro(mqc, a, c, ct) \ -{ \ - do { \ - if (ct == 0) { \ - opj_mqc_bytein_macro(mqc, c, ct); \ - } \ - a <<= 1; \ - c <<= 1; \ - ct--; \ - } while (a < 0x8000); \ -} - -#define opj_mqc_decode_macro(d, mqc, curctx, a, c, ct) \ -{ \ - /* Implements ISO 15444-1 C.3.2 Decoding a decision (DECODE) */ \ - /* Note: alternate "J.2 - Decoding an MPS or an LPS in the */ \ - /* software-conventions decoder" has been tried, but does not bring any */ \ - /* improvement. 
See https://github.com/uclouvain/openjpeg/issues/921 */ \ - a -= (*curctx)->qeval; \ - if ((c >> 16) < (*curctx)->qeval) { \ - opj_mqc_lpsexchange_macro(d, curctx, a); \ - opj_mqc_renormd_macro(mqc, a, c, ct); \ - } else { \ - c -= (*curctx)->qeval << 16; \ - if ((a & 0x8000) == 0) { \ - opj_mqc_mpsexchange_macro(d, curctx, a); \ - opj_mqc_renormd_macro(mqc, a, c, ct); \ - } else { \ - d = (*curctx)->mps; \ - } \ - } \ -} - -#define DOWNLOAD_MQC_VARIABLES(mqc, curctx, c, a, ct) \ - register const opj_mqc_state_t **curctx = mqc->curctx; \ - register OPJ_UINT32 c = mqc->c; \ - register OPJ_UINT32 a = mqc->a; \ - register OPJ_UINT32 ct = mqc->ct - -#define UPLOAD_MQC_VARIABLES(mqc, curctx, c, a, ct) \ - mqc->curctx = curctx; \ - mqc->c = c; \ - mqc->a = a; \ - mqc->ct = ct; - -/** -Input a byte -@param mqc MQC handle -*/ -static INLINE void opj_mqc_bytein(opj_mqc_t *const mqc) -{ - opj_mqc_bytein_macro(mqc, mqc->c, mqc->ct); -} - -/** -Renormalize mqc->a and mqc->c while decoding -@param mqc MQC handle -*/ -#define opj_mqc_renormd(mqc) \ - opj_mqc_renormd_macro(mqc, mqc->a, mqc->c, mqc->ct) - -/** -Decode a symbol -@param d OPJ_UINT32 value where to store the decoded symbol -@param mqc MQC handle -@return Returns the decoded symbol (0 or 1) in d -*/ -#define opj_mqc_decode(d, mqc) \ - opj_mqc_decode_macro(d, mqc, mqc->curctx, mqc->a, mqc->c, mqc->ct) - -#endif /* OPJ_MQC_INL_H */ diff --git a/src/3rd/LibOpenJpeg/openjpeg.c b/src/3rd/LibOpenJpeg/openjpeg.c deleted file mode 100644 index 7b123034..00000000 --- a/src/3rd/LibOpenJpeg/openjpeg.c +++ /dev/null @@ -1,1065 +0,0 @@ -/* - * The copyright in this software is being made available under the 2-clauses - * BSD License, included below. This software may be subject to other third - * party and contributor rights, including patent rights, and no such rights - * are granted under this license. - * - * Copyright (c) 2005, Herve Drolon, FreeImage Team - * Copyright (c) 2008, 2011-2012, Centre National d'Etudes Spatiales (CNES), FR - * Copyright (c) 2012, CS Systemes d'Information, France - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS' - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- */ - -#ifdef _WIN32 -#include -#endif /* _WIN32 */ - -#include "opj_includes.h" - - -/* ---------------------------------------------------------------------- */ -/* Functions to set the message handlers */ - -OPJ_BOOL OPJ_CALLCONV opj_set_info_handler(opj_codec_t * p_codec, - opj_msg_callback p_callback, - void * p_user_data) -{ - opj_codec_private_t * l_codec = (opj_codec_private_t *) p_codec; - if (! l_codec) { - return OPJ_FALSE; - } - - l_codec->m_event_mgr.info_handler = p_callback; - l_codec->m_event_mgr.m_info_data = p_user_data; - - return OPJ_TRUE; -} - -OPJ_BOOL OPJ_CALLCONV opj_set_warning_handler(opj_codec_t * p_codec, - opj_msg_callback p_callback, - void * p_user_data) -{ - opj_codec_private_t * l_codec = (opj_codec_private_t *) p_codec; - if (! l_codec) { - return OPJ_FALSE; - } - - l_codec->m_event_mgr.warning_handler = p_callback; - l_codec->m_event_mgr.m_warning_data = p_user_data; - - return OPJ_TRUE; -} - -OPJ_BOOL OPJ_CALLCONV opj_set_error_handler(opj_codec_t * p_codec, - opj_msg_callback p_callback, - void * p_user_data) -{ - opj_codec_private_t * l_codec = (opj_codec_private_t *) p_codec; - if (! l_codec) { - return OPJ_FALSE; - } - - l_codec->m_event_mgr.error_handler = p_callback; - l_codec->m_event_mgr.m_error_data = p_user_data; - - return OPJ_TRUE; -} - -/* ---------------------------------------------------------------------- */ - -static OPJ_SIZE_T opj_read_from_file(void * p_buffer, OPJ_SIZE_T p_nb_bytes, - FILE * p_file) -{ - OPJ_SIZE_T l_nb_read = fread(p_buffer, 1, p_nb_bytes, p_file); - return l_nb_read ? l_nb_read : (OPJ_SIZE_T) - 1; -} - -static OPJ_UINT64 opj_get_data_length_from_file(FILE * p_file) -{ - OPJ_OFF_T file_length = 0; - - OPJ_FSEEK(p_file, 0, SEEK_END); - file_length = (OPJ_OFF_T)OPJ_FTELL(p_file); - OPJ_FSEEK(p_file, 0, SEEK_SET); - - return (OPJ_UINT64)file_length; -} - -static OPJ_SIZE_T opj_write_from_file(void * p_buffer, OPJ_SIZE_T p_nb_bytes, - FILE * p_file) -{ - return fwrite(p_buffer, 1, p_nb_bytes, p_file); -} - -static OPJ_OFF_T opj_skip_from_file(OPJ_OFF_T p_nb_bytes, FILE * p_user_data) -{ - if (OPJ_FSEEK(p_user_data, p_nb_bytes, SEEK_CUR)) { - return -1; - } - - return p_nb_bytes; -} - -static OPJ_BOOL opj_seek_from_file(OPJ_OFF_T p_nb_bytes, FILE * p_user_data) -{ - if (OPJ_FSEEK(p_user_data, p_nb_bytes, SEEK_SET)) { - return OPJ_FALSE; - } - - return OPJ_TRUE; -} - -/* ---------------------------------------------------------------------- */ -#ifdef _WIN32 -#ifndef OPJ_STATIC -BOOL APIENTRY -DllMain(HINSTANCE hModule, DWORD ul_reason_for_call, LPVOID lpReserved) -{ - - OPJ_ARG_NOT_USED(lpReserved); - OPJ_ARG_NOT_USED(hModule); - - switch (ul_reason_for_call) { - case DLL_PROCESS_ATTACH : - break; - case DLL_PROCESS_DETACH : - break; - case DLL_THREAD_ATTACH : - case DLL_THREAD_DETACH : - break; - } - - return TRUE; -} -#endif /* OPJ_STATIC */ -#endif /* _WIN32 */ - -/* ---------------------------------------------------------------------- */ - -const char* OPJ_CALLCONV opj_version(void) -{ - return OPJ_PACKAGE_VERSION; -} - -/* ---------------------------------------------------------------------- */ -/* DECOMPRESSION FUNCTIONS*/ - -opj_codec_t* OPJ_CALLCONV opj_create_decompress(OPJ_CODEC_FORMAT p_format) -{ - opj_codec_private_t *l_codec = 00; - - l_codec = (opj_codec_private_t*) opj_calloc(1, sizeof(opj_codec_private_t)); - if (!l_codec) { - return 00; - } - - l_codec->is_decompressor = 1; - - switch (p_format) { - case OPJ_CODEC_J2K: - l_codec->opj_dump_codec = (void (*)(void*, OPJ_INT32, FILE*)) j2k_dump; - - 
l_codec->opj_get_codec_info = (opj_codestream_info_v2_t* (*)( - void*)) j2k_get_cstr_info; - - l_codec->opj_get_codec_index = (opj_codestream_index_t* (*)( - void*)) j2k_get_cstr_index; - - l_codec->m_codec_data.m_decompression.opj_decode = - (OPJ_BOOL(*)(void *, - struct opj_stream_private *, - opj_image_t*, struct opj_event_mgr *)) opj_j2k_decode; - - l_codec->m_codec_data.m_decompression.opj_end_decompress = - (OPJ_BOOL(*)(void *, - struct opj_stream_private *, - struct opj_event_mgr *)) opj_j2k_end_decompress; - - l_codec->m_codec_data.m_decompression.opj_read_header = - (OPJ_BOOL(*)(struct opj_stream_private *, - void *, - opj_image_t **, - struct opj_event_mgr *)) opj_j2k_read_header; - - l_codec->m_codec_data.m_decompression.opj_destroy = - (void (*)(void *))opj_j2k_destroy; - - l_codec->m_codec_data.m_decompression.opj_setup_decoder = - (void (*)(void *, opj_dparameters_t *)) opj_j2k_setup_decoder; - - l_codec->m_codec_data.m_decompression.opj_read_tile_header = - (OPJ_BOOL(*)(void *, - OPJ_UINT32*, - OPJ_UINT32*, - OPJ_INT32*, OPJ_INT32*, - OPJ_INT32*, OPJ_INT32*, - OPJ_UINT32*, - OPJ_BOOL*, - struct opj_stream_private *, - struct opj_event_mgr *)) opj_j2k_read_tile_header; - - l_codec->m_codec_data.m_decompression.opj_decode_tile_data = - (OPJ_BOOL(*)(void *, - OPJ_UINT32, - OPJ_BYTE*, - OPJ_UINT32, - struct opj_stream_private *, - struct opj_event_mgr *)) opj_j2k_decode_tile; - - l_codec->m_codec_data.m_decompression.opj_set_decode_area = - (OPJ_BOOL(*)(void *, - opj_image_t*, - OPJ_INT32, OPJ_INT32, OPJ_INT32, OPJ_INT32, - struct opj_event_mgr *)) opj_j2k_set_decode_area; - - l_codec->m_codec_data.m_decompression.opj_get_decoded_tile = - (OPJ_BOOL(*)(void *p_codec, - opj_stream_private_t *p_cio, - opj_image_t *p_image, - struct opj_event_mgr * p_manager, - OPJ_UINT32 tile_index)) opj_j2k_get_tile; - - l_codec->m_codec_data.m_decompression.opj_set_decoded_resolution_factor = - (OPJ_BOOL(*)(void * p_codec, - OPJ_UINT32 res_factor, - struct opj_event_mgr * p_manager)) opj_j2k_set_decoded_resolution_factor; - - l_codec->m_codec_data.m_decompression.opj_set_decoded_components = - (OPJ_BOOL(*)(void * p_codec, - OPJ_UINT32 numcomps, - const OPJ_UINT32 * comps_indices, - struct opj_event_mgr * p_manager)) opj_j2k_set_decoded_components; - - l_codec->opj_set_threads = - (OPJ_BOOL(*)(void * p_codec, OPJ_UINT32 num_threads)) opj_j2k_set_threads; - - l_codec->m_codec = opj_j2k_create_decompress(); - - if (! 
l_codec->m_codec) { - opj_free(l_codec); - return NULL; - } - - break; - - case OPJ_CODEC_JP2: - /* get a JP2 decoder handle */ - l_codec->opj_dump_codec = (void (*)(void*, OPJ_INT32, FILE*)) jp2_dump; - - l_codec->opj_get_codec_info = (opj_codestream_info_v2_t* (*)( - void*)) jp2_get_cstr_info; - - l_codec->opj_get_codec_index = (opj_codestream_index_t* (*)( - void*)) jp2_get_cstr_index; - - l_codec->m_codec_data.m_decompression.opj_decode = - (OPJ_BOOL(*)(void *, - struct opj_stream_private *, - opj_image_t*, - struct opj_event_mgr *)) opj_jp2_decode; - - l_codec->m_codec_data.m_decompression.opj_end_decompress = - (OPJ_BOOL(*)(void *, - struct opj_stream_private *, - struct opj_event_mgr *)) opj_jp2_end_decompress; - - l_codec->m_codec_data.m_decompression.opj_read_header = - (OPJ_BOOL(*)(struct opj_stream_private *, - void *, - opj_image_t **, - struct opj_event_mgr *)) opj_jp2_read_header; - - l_codec->m_codec_data.m_decompression.opj_read_tile_header = - (OPJ_BOOL(*)(void *, - OPJ_UINT32*, - OPJ_UINT32*, - OPJ_INT32*, - OPJ_INT32*, - OPJ_INT32 *, - OPJ_INT32 *, - OPJ_UINT32 *, - OPJ_BOOL *, - struct opj_stream_private *, - struct opj_event_mgr *)) opj_jp2_read_tile_header; - - l_codec->m_codec_data.m_decompression.opj_decode_tile_data = - (OPJ_BOOL(*)(void *, - OPJ_UINT32, OPJ_BYTE*, OPJ_UINT32, - struct opj_stream_private *, - struct opj_event_mgr *)) opj_jp2_decode_tile; - - l_codec->m_codec_data.m_decompression.opj_destroy = (void (*)( - void *))opj_jp2_destroy; - - l_codec->m_codec_data.m_decompression.opj_setup_decoder = - (void (*)(void *, opj_dparameters_t *)) opj_jp2_setup_decoder; - - l_codec->m_codec_data.m_decompression.opj_set_decode_area = - (OPJ_BOOL(*)(void *, - opj_image_t*, - OPJ_INT32, OPJ_INT32, OPJ_INT32, OPJ_INT32, - struct opj_event_mgr *)) opj_jp2_set_decode_area; - - l_codec->m_codec_data.m_decompression.opj_get_decoded_tile = - (OPJ_BOOL(*)(void *p_codec, - opj_stream_private_t *p_cio, - opj_image_t *p_image, - struct opj_event_mgr * p_manager, - OPJ_UINT32 tile_index)) opj_jp2_get_tile; - - l_codec->m_codec_data.m_decompression.opj_set_decoded_resolution_factor = - (OPJ_BOOL(*)(void * p_codec, - OPJ_UINT32 res_factor, - opj_event_mgr_t * p_manager)) opj_jp2_set_decoded_resolution_factor; - - l_codec->m_codec_data.m_decompression.opj_set_decoded_components = - (OPJ_BOOL(*)(void * p_codec, - OPJ_UINT32 numcomps, - const OPJ_UINT32 * comps_indices, - struct opj_event_mgr * p_manager)) opj_jp2_set_decoded_components; - - l_codec->opj_set_threads = - (OPJ_BOOL(*)(void * p_codec, OPJ_UINT32 num_threads)) opj_jp2_set_threads; - - l_codec->m_codec = opj_jp2_create(OPJ_TRUE); - - if (! 
l_codec->m_codec) { - opj_free(l_codec); - return 00; - } - - break; - case OPJ_CODEC_UNKNOWN: - case OPJ_CODEC_JPT: - default: - opj_free(l_codec); - return 00; - } - - opj_set_default_event_handler(&(l_codec->m_event_mgr)); - return (opj_codec_t*) l_codec; -} - -void OPJ_CALLCONV opj_set_default_decoder_parameters(opj_dparameters_t - *parameters) -{ - if (parameters) { - memset(parameters, 0, sizeof(opj_dparameters_t)); - /* default decoding parameters */ - parameters->cp_layer = 0; - parameters->cp_reduce = 0; - - parameters->decod_format = -1; - parameters->cod_format = -1; - parameters->flags = 0; - /* UniPG>> */ -#ifdef USE_JPWL - parameters->jpwl_correct = OPJ_FALSE; - parameters->jpwl_exp_comps = JPWL_EXPECTED_COMPONENTS; - parameters->jpwl_max_tiles = JPWL_MAXIMUM_TILES; -#endif /* USE_JPWL */ - /* <= 0)) { - opj_codec_private_t * l_codec = (opj_codec_private_t *) p_codec; - - return l_codec->opj_set_threads(l_codec->m_codec, (OPJ_UINT32)num_threads); - } - return OPJ_FALSE; -} - -OPJ_BOOL OPJ_CALLCONV opj_setup_decoder(opj_codec_t *p_codec, - opj_dparameters_t *parameters - ) -{ - if (p_codec && parameters) { - opj_codec_private_t * l_codec = (opj_codec_private_t *) p_codec; - - if (! l_codec->is_decompressor) { - opj_event_msg(&(l_codec->m_event_mgr), EVT_ERROR, - "Codec provided to the opj_setup_decoder function is not a decompressor handler.\n"); - return OPJ_FALSE; - } - - l_codec->m_codec_data.m_decompression.opj_setup_decoder(l_codec->m_codec, - parameters); - return OPJ_TRUE; - } - return OPJ_FALSE; -} - -OPJ_BOOL OPJ_CALLCONV opj_read_header(opj_stream_t *p_stream, - opj_codec_t *p_codec, - opj_image_t **p_image) -{ - if (p_codec && p_stream) { - opj_codec_private_t* l_codec = (opj_codec_private_t*) p_codec; - opj_stream_private_t* l_stream = (opj_stream_private_t*) p_stream; - - if (! l_codec->is_decompressor) { - opj_event_msg(&(l_codec->m_event_mgr), EVT_ERROR, - "Codec provided to the opj_read_header function is not a decompressor handler.\n"); - return OPJ_FALSE; - } - - return l_codec->m_codec_data.m_decompression.opj_read_header(l_stream, - l_codec->m_codec, - p_image, - &(l_codec->m_event_mgr)); - } - - return OPJ_FALSE; -} - - -OPJ_BOOL OPJ_CALLCONV opj_set_decoded_components(opj_codec_t *p_codec, - OPJ_UINT32 numcomps, - const OPJ_UINT32* comps_indices, - OPJ_BOOL apply_color_transforms) -{ - if (p_codec) { - opj_codec_private_t * l_codec = (opj_codec_private_t *) p_codec; - - if (! l_codec->is_decompressor) { - opj_event_msg(&(l_codec->m_event_mgr), EVT_ERROR, - "Codec provided to the opj_set_decoded_components function is not a decompressor handler.\n"); - return OPJ_FALSE; - } - - if (apply_color_transforms) { - opj_event_msg(&(l_codec->m_event_mgr), EVT_ERROR, - "apply_color_transforms = OPJ_TRUE is not supported.\n"); - return OPJ_FALSE; - } - - return l_codec->m_codec_data.m_decompression.opj_set_decoded_components( - l_codec->m_codec, - numcomps, - comps_indices, - &(l_codec->m_event_mgr)); - } - return OPJ_FALSE; -} - -OPJ_BOOL OPJ_CALLCONV opj_decode(opj_codec_t *p_codec, - opj_stream_t *p_stream, - opj_image_t* p_image) -{ - if (p_codec && p_stream) { - opj_codec_private_t * l_codec = (opj_codec_private_t *) p_codec; - opj_stream_private_t * l_stream = (opj_stream_private_t *) p_stream; - - if (! 
l_codec->is_decompressor) { - return OPJ_FALSE; - } - - return l_codec->m_codec_data.m_decompression.opj_decode(l_codec->m_codec, - l_stream, - p_image, - &(l_codec->m_event_mgr)); - } - - return OPJ_FALSE; -} - -OPJ_BOOL OPJ_CALLCONV opj_set_decode_area(opj_codec_t *p_codec, - opj_image_t* p_image, - OPJ_INT32 p_start_x, OPJ_INT32 p_start_y, - OPJ_INT32 p_end_x, OPJ_INT32 p_end_y - ) -{ - if (p_codec) { - opj_codec_private_t * l_codec = (opj_codec_private_t *) p_codec; - - if (! l_codec->is_decompressor) { - return OPJ_FALSE; - } - - return l_codec->m_codec_data.m_decompression.opj_set_decode_area( - l_codec->m_codec, - p_image, - p_start_x, p_start_y, - p_end_x, p_end_y, - &(l_codec->m_event_mgr)); - } - return OPJ_FALSE; -} - -OPJ_BOOL OPJ_CALLCONV opj_read_tile_header(opj_codec_t *p_codec, - opj_stream_t * p_stream, - OPJ_UINT32 * p_tile_index, - OPJ_UINT32 * p_data_size, - OPJ_INT32 * p_tile_x0, OPJ_INT32 * p_tile_y0, - OPJ_INT32 * p_tile_x1, OPJ_INT32 * p_tile_y1, - OPJ_UINT32 * p_nb_comps, - OPJ_BOOL * p_should_go_on) -{ - if (p_codec && p_stream && p_data_size && p_tile_index) { - opj_codec_private_t * l_codec = (opj_codec_private_t *) p_codec; - opj_stream_private_t * l_stream = (opj_stream_private_t *) p_stream; - - if (! l_codec->is_decompressor) { - return OPJ_FALSE; - } - - return l_codec->m_codec_data.m_decompression.opj_read_tile_header( - l_codec->m_codec, - p_tile_index, - p_data_size, - p_tile_x0, p_tile_y0, - p_tile_x1, p_tile_y1, - p_nb_comps, - p_should_go_on, - l_stream, - &(l_codec->m_event_mgr)); - } - return OPJ_FALSE; -} - -OPJ_BOOL OPJ_CALLCONV opj_decode_tile_data(opj_codec_t *p_codec, - OPJ_UINT32 p_tile_index, - OPJ_BYTE * p_data, - OPJ_UINT32 p_data_size, - opj_stream_t *p_stream - ) -{ - if (p_codec && p_data && p_stream) { - opj_codec_private_t * l_codec = (opj_codec_private_t *) p_codec; - opj_stream_private_t * l_stream = (opj_stream_private_t *) p_stream; - - if (! l_codec->is_decompressor) { - return OPJ_FALSE; - } - - return l_codec->m_codec_data.m_decompression.opj_decode_tile_data( - l_codec->m_codec, - p_tile_index, - p_data, - p_data_size, - l_stream, - &(l_codec->m_event_mgr)); - } - return OPJ_FALSE; -} - -OPJ_BOOL OPJ_CALLCONV opj_get_decoded_tile(opj_codec_t *p_codec, - opj_stream_t *p_stream, - opj_image_t *p_image, - OPJ_UINT32 tile_index) -{ - if (p_codec && p_stream) { - opj_codec_private_t * l_codec = (opj_codec_private_t *) p_codec; - opj_stream_private_t * l_stream = (opj_stream_private_t *) p_stream; - - if (! 
l_codec->is_decompressor) { - return OPJ_FALSE; - } - - return l_codec->m_codec_data.m_decompression.opj_get_decoded_tile( - l_codec->m_codec, - l_stream, - p_image, - &(l_codec->m_event_mgr), - tile_index); - } - - return OPJ_FALSE; -} - -OPJ_BOOL OPJ_CALLCONV opj_set_decoded_resolution_factor(opj_codec_t *p_codec, - OPJ_UINT32 res_factor) -{ - opj_codec_private_t * l_codec = (opj_codec_private_t *) p_codec; - - if (!l_codec) { - return OPJ_FALSE; - } - - return l_codec->m_codec_data.m_decompression.opj_set_decoded_resolution_factor( - l_codec->m_codec, - res_factor, - &(l_codec->m_event_mgr)); -} - -/* ---------------------------------------------------------------------- */ -/* COMPRESSION FUNCTIONS*/ - -opj_codec_t* OPJ_CALLCONV opj_create_compress(OPJ_CODEC_FORMAT p_format) -{ - opj_codec_private_t *l_codec = 00; - - l_codec = (opj_codec_private_t*)opj_calloc(1, sizeof(opj_codec_private_t)); - if (!l_codec) { - return 00; - } - - l_codec->is_decompressor = 0; - - switch (p_format) { - case OPJ_CODEC_J2K: - l_codec->m_codec_data.m_compression.opj_encode = (OPJ_BOOL(*)(void *, - struct opj_stream_private *, - struct opj_event_mgr *)) opj_j2k_encode; - - l_codec->m_codec_data.m_compression.opj_end_compress = (OPJ_BOOL(*)(void *, - struct opj_stream_private *, - struct opj_event_mgr *)) opj_j2k_end_compress; - - l_codec->m_codec_data.m_compression.opj_start_compress = (OPJ_BOOL(*)(void *, - struct opj_stream_private *, - struct opj_image *, - struct opj_event_mgr *)) opj_j2k_start_compress; - - l_codec->m_codec_data.m_compression.opj_write_tile = (OPJ_BOOL(*)(void *, - OPJ_UINT32, - OPJ_BYTE*, - OPJ_UINT32, - struct opj_stream_private *, - struct opj_event_mgr *)) opj_j2k_write_tile; - - l_codec->m_codec_data.m_compression.opj_destroy = (void (*)( - void *)) opj_j2k_destroy; - - l_codec->m_codec_data.m_compression.opj_setup_encoder = (OPJ_BOOL(*)(void *, - opj_cparameters_t *, - struct opj_image *, - struct opj_event_mgr *)) opj_j2k_setup_encoder; - - l_codec->m_codec = opj_j2k_create_compress(); - if (! l_codec->m_codec) { - opj_free(l_codec); - return 00; - } - - break; - - case OPJ_CODEC_JP2: - /* get a JP2 decoder handle */ - l_codec->m_codec_data.m_compression.opj_encode = (OPJ_BOOL(*)(void *, - struct opj_stream_private *, - struct opj_event_mgr *)) opj_jp2_encode; - - l_codec->m_codec_data.m_compression.opj_end_compress = (OPJ_BOOL(*)(void *, - struct opj_stream_private *, - struct opj_event_mgr *)) opj_jp2_end_compress; - - l_codec->m_codec_data.m_compression.opj_start_compress = (OPJ_BOOL(*)(void *, - struct opj_stream_private *, - struct opj_image *, - struct opj_event_mgr *)) opj_jp2_start_compress; - - l_codec->m_codec_data.m_compression.opj_write_tile = (OPJ_BOOL(*)(void *, - OPJ_UINT32, - OPJ_BYTE*, - OPJ_UINT32, - struct opj_stream_private *, - struct opj_event_mgr *)) opj_jp2_write_tile; - - l_codec->m_codec_data.m_compression.opj_destroy = (void (*)( - void *)) opj_jp2_destroy; - - l_codec->m_codec_data.m_compression.opj_setup_encoder = (OPJ_BOOL(*)(void *, - opj_cparameters_t *, - struct opj_image *, - struct opj_event_mgr *)) opj_jp2_setup_encoder; - - l_codec->m_codec = opj_jp2_create(OPJ_FALSE); - if (! 
l_codec->m_codec) { - opj_free(l_codec); - return 00; - } - - break; - - case OPJ_CODEC_UNKNOWN: - case OPJ_CODEC_JPT: - default: - opj_free(l_codec); - return 00; - } - - opj_set_default_event_handler(&(l_codec->m_event_mgr)); - return (opj_codec_t*) l_codec; -} - -void OPJ_CALLCONV opj_set_default_encoder_parameters(opj_cparameters_t - *parameters) -{ - if (parameters) { - memset(parameters, 0, sizeof(opj_cparameters_t)); - /* default coding parameters */ - parameters->cp_cinema = OPJ_OFF; /* DEPRECATED */ - parameters->rsiz = OPJ_PROFILE_NONE; - parameters->max_comp_size = 0; - parameters->numresolution = 6; - parameters->cp_rsiz = OPJ_STD_RSIZ; /* DEPRECATED */ - parameters->cblockw_init = 64; - parameters->cblockh_init = 64; - parameters->prog_order = OPJ_LRCP; - parameters->roi_compno = -1; /* no ROI */ - parameters->subsampling_dx = 1; - parameters->subsampling_dy = 1; - parameters->tp_on = 0; - parameters->decod_format = -1; - parameters->cod_format = -1; - parameters->tcp_rates[0] = 0; - parameters->tcp_numlayers = 0; - parameters->cp_disto_alloc = 0; - parameters->cp_fixed_alloc = 0; - parameters->cp_fixed_quality = 0; - parameters->jpip_on = OPJ_FALSE; - /* UniPG>> */ -#ifdef USE_JPWL - parameters->jpwl_epc_on = OPJ_FALSE; - parameters->jpwl_hprot_MH = -1; /* -1 means unassigned */ - { - int i; - for (i = 0; i < JPWL_MAX_NO_TILESPECS; i++) { - parameters->jpwl_hprot_TPH_tileno[i] = -1; /* unassigned */ - parameters->jpwl_hprot_TPH[i] = 0; /* absent */ - } - }; - { - int i; - for (i = 0; i < JPWL_MAX_NO_PACKSPECS; i++) { - parameters->jpwl_pprot_tileno[i] = -1; /* unassigned */ - parameters->jpwl_pprot_packno[i] = -1; /* unassigned */ - parameters->jpwl_pprot[i] = 0; /* absent */ - } - }; - parameters->jpwl_sens_size = 0; /* 0 means no ESD */ - parameters->jpwl_sens_addr = 0; /* 0 means auto */ - parameters->jpwl_sens_range = 0; /* 0 means packet */ - parameters->jpwl_sens_MH = -1; /* -1 means unassigned */ - { - int i; - for (i = 0; i < JPWL_MAX_NO_TILESPECS; i++) { - parameters->jpwl_sens_TPH_tileno[i] = -1; /* unassigned */ - parameters->jpwl_sens_TPH[i] = -1; /* absent */ - } - }; -#endif /* USE_JPWL */ - /* <is_decompressor) { - return l_codec->m_codec_data.m_compression.opj_setup_encoder(l_codec->m_codec, - parameters, - p_image, - &(l_codec->m_event_mgr)); - } - } - - return OPJ_FALSE; -} - -OPJ_BOOL OPJ_CALLCONV opj_start_compress(opj_codec_t *p_codec, - opj_image_t * p_image, - opj_stream_t *p_stream) -{ - if (p_codec && p_stream) { - opj_codec_private_t * l_codec = (opj_codec_private_t *) p_codec; - opj_stream_private_t * l_stream = (opj_stream_private_t *) p_stream; - - if (! l_codec->is_decompressor) { - return l_codec->m_codec_data.m_compression.opj_start_compress(l_codec->m_codec, - l_stream, - p_image, - &(l_codec->m_event_mgr)); - } - } - - return OPJ_FALSE; -} - -OPJ_BOOL OPJ_CALLCONV opj_encode(opj_codec_t *p_info, opj_stream_t *p_stream) -{ - if (p_info && p_stream) { - opj_codec_private_t * l_codec = (opj_codec_private_t *) p_info; - opj_stream_private_t * l_stream = (opj_stream_private_t *) p_stream; - - if (! l_codec->is_decompressor) { - return l_codec->m_codec_data.m_compression.opj_encode(l_codec->m_codec, - l_stream, - &(l_codec->m_event_mgr)); - } - } - - return OPJ_FALSE; - -} - -OPJ_BOOL OPJ_CALLCONV opj_end_compress(opj_codec_t *p_codec, - opj_stream_t *p_stream) -{ - if (p_codec && p_stream) { - opj_codec_private_t * l_codec = (opj_codec_private_t *) p_codec; - opj_stream_private_t * l_stream = (opj_stream_private_t *) p_stream; - - if (! 
l_codec->is_decompressor) { - return l_codec->m_codec_data.m_compression.opj_end_compress(l_codec->m_codec, - l_stream, - &(l_codec->m_event_mgr)); - } - } - return OPJ_FALSE; - -} - -OPJ_BOOL OPJ_CALLCONV opj_end_decompress(opj_codec_t *p_codec, - opj_stream_t *p_stream) -{ - if (p_codec && p_stream) { - opj_codec_private_t * l_codec = (opj_codec_private_t *) p_codec; - opj_stream_private_t * l_stream = (opj_stream_private_t *) p_stream; - - if (! l_codec->is_decompressor) { - return OPJ_FALSE; - } - - return l_codec->m_codec_data.m_decompression.opj_end_decompress( - l_codec->m_codec, - l_stream, - &(l_codec->m_event_mgr)); - } - - return OPJ_FALSE; -} - -OPJ_BOOL OPJ_CALLCONV opj_set_MCT(opj_cparameters_t *parameters, - OPJ_FLOAT32 * pEncodingMatrix, - OPJ_INT32 * p_dc_shift, OPJ_UINT32 pNbComp) -{ - OPJ_UINT32 l_matrix_size = pNbComp * pNbComp * (OPJ_UINT32)sizeof(OPJ_FLOAT32); - OPJ_UINT32 l_dc_shift_size = pNbComp * (OPJ_UINT32)sizeof(OPJ_INT32); - OPJ_UINT32 l_mct_total_size = l_matrix_size + l_dc_shift_size; - - /* add MCT capability */ - if (OPJ_IS_PART2(parameters->rsiz)) { - parameters->rsiz |= OPJ_EXTENSION_MCT; - } else { - parameters->rsiz = ((OPJ_PROFILE_PART2) | (OPJ_EXTENSION_MCT)); - } - parameters->irreversible = 1; - - /* use array based MCT */ - parameters->tcp_mct = 2; - parameters->mct_data = opj_malloc(l_mct_total_size); - if (! parameters->mct_data) { - return OPJ_FALSE; - } - - memcpy(parameters->mct_data, pEncodingMatrix, l_matrix_size); - memcpy(((OPJ_BYTE *) parameters->mct_data) + l_matrix_size, p_dc_shift, - l_dc_shift_size); - - return OPJ_TRUE; -} - -OPJ_BOOL OPJ_CALLCONV opj_write_tile(opj_codec_t *p_codec, - OPJ_UINT32 p_tile_index, - OPJ_BYTE * p_data, - OPJ_UINT32 p_data_size, - opj_stream_t *p_stream) -{ - if (p_codec && p_stream && p_data) { - opj_codec_private_t * l_codec = (opj_codec_private_t *) p_codec; - opj_stream_private_t * l_stream = (opj_stream_private_t *) p_stream; - - if (l_codec->is_decompressor) { - return OPJ_FALSE; - } - - return l_codec->m_codec_data.m_compression.opj_write_tile(l_codec->m_codec, - p_tile_index, - p_data, - p_data_size, - l_stream, - &(l_codec->m_event_mgr)); - } - - return OPJ_FALSE; -} - -/* ---------------------------------------------------------------------- */ - -void OPJ_CALLCONV opj_destroy_codec(opj_codec_t *p_codec) -{ - if (p_codec) { - opj_codec_private_t * l_codec = (opj_codec_private_t *) p_codec; - - if (l_codec->is_decompressor) { - l_codec->m_codec_data.m_decompression.opj_destroy(l_codec->m_codec); - } else { - l_codec->m_codec_data.m_compression.opj_destroy(l_codec->m_codec); - } - - l_codec->m_codec = 00; - opj_free(l_codec); - } -} - -/* ---------------------------------------------------------------------- */ - -void OPJ_CALLCONV opj_dump_codec(opj_codec_t *p_codec, - OPJ_INT32 info_flag, - FILE* output_stream) -{ - if (p_codec) { - opj_codec_private_t* l_codec = (opj_codec_private_t*) p_codec; - - l_codec->opj_dump_codec(l_codec->m_codec, info_flag, output_stream); - return; - } - - /* TODO return error */ - /* fprintf(stderr, "[ERROR] Input parameter of the dump_codec function are incorrect.\n"); */ - return; -} - -opj_codestream_info_v2_t* OPJ_CALLCONV opj_get_cstr_info(opj_codec_t *p_codec) -{ - if (p_codec) { - opj_codec_private_t* l_codec = (opj_codec_private_t*) p_codec; - - return l_codec->opj_get_codec_info(l_codec->m_codec); - } - - return NULL; -} - -void OPJ_CALLCONV opj_destroy_cstr_info(opj_codestream_info_v2_t **cstr_info) -{ - if (cstr_info) { - - if 
((*cstr_info)->m_default_tile_info.tccp_info) { - opj_free((*cstr_info)->m_default_tile_info.tccp_info); - } - - if ((*cstr_info)->tile_info) { - /* FIXME not used for the moment*/ - } - - opj_free((*cstr_info)); - (*cstr_info) = NULL; - } -} - -opj_codestream_index_t * OPJ_CALLCONV opj_get_cstr_index(opj_codec_t *p_codec) -{ - if (p_codec) { - opj_codec_private_t* l_codec = (opj_codec_private_t*) p_codec; - - return l_codec->opj_get_codec_index(l_codec->m_codec); - } - - return NULL; -} - -void OPJ_CALLCONV opj_destroy_cstr_index(opj_codestream_index_t **p_cstr_index) -{ - if (*p_cstr_index) { - j2k_destroy_cstr_index(*p_cstr_index); - (*p_cstr_index) = NULL; - } -} - -opj_stream_t* OPJ_CALLCONV opj_stream_create_default_file_stream( - const char *fname, OPJ_BOOL p_is_read_stream) -{ - return opj_stream_create_file_stream(fname, OPJ_J2K_STREAM_CHUNK_SIZE, - p_is_read_stream); -} - -opj_stream_t* OPJ_CALLCONV opj_stream_create_file_stream( - const char *fname, - OPJ_SIZE_T p_size, - OPJ_BOOL p_is_read_stream) -{ - opj_stream_t* l_stream = 00; - FILE *p_file; - const char *mode; - - if (! fname) { - return NULL; - } - - if (p_is_read_stream) { - mode = "rb"; - } else { - mode = "wb"; - } - - p_file = fopen(fname, mode); - - if (! p_file) { - return NULL; - } - - l_stream = opj_stream_create(p_size, p_is_read_stream); - if (! l_stream) { - fclose(p_file); - return NULL; - } - - opj_stream_set_user_data(l_stream, p_file, - (opj_stream_free_user_data_fn) fclose); - opj_stream_set_user_data_length(l_stream, - opj_get_data_length_from_file(p_file)); - opj_stream_set_read_function(l_stream, (opj_stream_read_fn) opj_read_from_file); - opj_stream_set_write_function(l_stream, - (opj_stream_write_fn) opj_write_from_file); - opj_stream_set_skip_function(l_stream, (opj_stream_skip_fn) opj_skip_from_file); - opj_stream_set_seek_function(l_stream, (opj_stream_seek_fn) opj_seek_from_file); - - return l_stream; -} - - -void* OPJ_CALLCONV opj_image_data_alloc(OPJ_SIZE_T size) -{ - void* ret = opj_aligned_malloc(size); - /* printf("opj_image_data_alloc %p\n", ret); */ - return ret; -} - -void OPJ_CALLCONV opj_image_data_free(void* ptr) -{ - /* printf("opj_image_data_free %p\n", ptr); */ - opj_aligned_free(ptr); -} diff --git a/src/3rd/LibOpenJpeg/openjpeg.h b/src/3rd/LibOpenJpeg/openjpeg.h deleted file mode 100644 index f36286eb..00000000 --- a/src/3rd/LibOpenJpeg/openjpeg.h +++ /dev/null @@ -1,1687 +0,0 @@ -/* -* The copyright in this software is being made available under the 2-clauses -* BSD License, included below. This software may be subject to other third -* party and contributor rights, including patent rights, and no such rights -* are granted under this license. -* -* Copyright (c) 2002-2014, Universite catholique de Louvain (UCL), Belgium -* Copyright (c) 2002-2014, Professor Benoit Macq -* Copyright (c) 2001-2003, David Janssens -* Copyright (c) 2002-2003, Yannick Verschueren -* Copyright (c) 2003-2007, Francois-Olivier Devaux -* Copyright (c) 2003-2014, Antonin Descampe -* Copyright (c) 2005, Herve Drolon, FreeImage Team -* Copyright (c) 2006-2007, Parvatha Elangovan -* Copyright (c) 2008, Jerome Fimes, Communications & Systemes -* Copyright (c) 2010-2011, Kaori Hagihara -* Copyright (c) 2011-2012, Centre National d'Etudes Spatiales (CNES), France -* Copyright (c) 2012, CS Systemes d'Information, France -* All rights reserved. -* -* Redistribution and use in source and binary forms, with or without -* modification, are permitted provided that the following conditions -* are met: -* 1. 
Redistributions of source code must retain the above copyright -* notice, this list of conditions and the following disclaimer. -* 2. Redistributions in binary form must reproduce the above copyright -* notice, this list of conditions and the following disclaimer in the -* documentation and/or other materials provided with the distribution. -* -* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS' -* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -* POSSIBILITY OF SUCH DAMAGE. -*/ -#ifndef OPENJPEG_H -#define OPENJPEG_H - - -/* -========================================================== - Compiler directives -========================================================== -*/ - -/* -The inline keyword is supported by C99 but not by C90. -Most compilers implement their own version of this keyword ... -*/ -#ifndef INLINE -#if defined(_MSC_VER) -#define INLINE __forceinline -#elif defined(__GNUC__) -#define INLINE __inline__ -#elif defined(__MWERKS__) -#define INLINE inline -#else -/* add other compilers here ... */ -#define INLINE -#endif /* defined() */ -#endif /* INLINE */ - -/* deprecated attribute */ -#ifdef __GNUC__ -#define OPJ_DEPRECATED(func) func __attribute__ ((deprecated)) -#elif defined(_MSC_VER) -#define OPJ_DEPRECATED(func) __declspec(deprecated) func -#else -#pragma message("WARNING: You need to implement DEPRECATED for this compiler") -#define OPJ_DEPRECATED(func) func -#endif - -#if defined(OPJ_STATIC) || !defined(_WIN32) -/* http://gcc.gnu.org/wiki/Visibility */ -# if __GNUC__ >= 4 -# if defined(OPJ_STATIC) /* static library uses "hidden" */ -# define OPJ_API __attribute__ ((visibility ("hidden"))) -# else -# define OPJ_API __attribute__ ((visibility ("default"))) -# endif -# define OPJ_LOCAL __attribute__ ((visibility ("hidden"))) -# else -# define OPJ_API -# define OPJ_LOCAL -# endif -# define OPJ_CALLCONV -#else -# define OPJ_CALLCONV __stdcall -/* -The following ifdef block is the standard way of creating macros which make exporting -from a DLL simpler. All files within this DLL are compiled with the OPJ_EXPORTS -symbol defined on the command line. this symbol should not be defined on any project -that uses this DLL. This way any other project whose source files include this file see -OPJ_API functions as being imported from a DLL, whereas this DLL sees symbols -defined with this macro as being exported. 
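
A minimal illustration of the two configurations this macro distinguishes (the compiler invocations below are hypothetical, not taken from this project's build files):

    cl /c /DOPJ_EXPORTS openjpeg.c   -> OPJ_API expands to __declspec(dllexport)
    cl /c my_client.c                -> OPJ_API expands to __declspec(dllimport)

With OPJ_STATIC defined, or on non-Windows platforms, the GCC visibility attributes above are used instead and OPJ_CALLCONV is empty.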
-*/ -# if defined(OPJ_EXPORTS) || defined(DLL_EXPORT) -# define OPJ_API __declspec(dllexport) -# else -# define OPJ_API __declspec(dllimport) -# endif /* OPJ_EXPORTS */ -#endif /* !OPJ_STATIC || !_WIN32 */ - -typedef int OPJ_BOOL; -#define OPJ_TRUE 1 -#define OPJ_FALSE 0 - -typedef char OPJ_CHAR; -typedef float OPJ_FLOAT32; -typedef double OPJ_FLOAT64; -typedef unsigned char OPJ_BYTE; - -#include "opj_stdint.h" - -typedef int8_t OPJ_INT8; -typedef uint8_t OPJ_UINT8; -typedef int16_t OPJ_INT16; -typedef uint16_t OPJ_UINT16; -typedef int32_t OPJ_INT32; -typedef uint32_t OPJ_UINT32; -typedef int64_t OPJ_INT64; -typedef uint64_t OPJ_UINT64; - -typedef int64_t OPJ_OFF_T; /* 64-bit file offset type */ - -#include <stddef.h> -typedef size_t OPJ_SIZE_T; - -/* Avoid compile-time warning because parameter is not used */ -#define OPJ_ARG_NOT_USED(x) (void)(x) - -/* -========================================================== - Useful constant definitions -========================================================== -*/ - -#define OPJ_PATH_LEN 4096 /**< Maximum allowed size for filenames */ - -#define OPJ_J2K_MAXRLVLS 33 /**< Number of maximum resolution level authorized */ -#define OPJ_J2K_MAXBANDS (3*OPJ_J2K_MAXRLVLS-2) /**< Number of maximum sub-band linked to number of resolution level */ - -#define OPJ_J2K_DEFAULT_NB_SEGS 10 -#define OPJ_J2K_STREAM_CHUNK_SIZE 0x100000 /** 1 MB by default */ -#define OPJ_J2K_DEFAULT_HEADER_SIZE 1000 -#define OPJ_J2K_MCC_DEFAULT_NB_RECORDS 10 -#define OPJ_J2K_MCT_DEFAULT_NB_RECORDS 10 - -/* UniPG>> */ /* NOT YET USED IN THE V2 VERSION OF OPENJPEG */ -#define JPWL_MAX_NO_TILESPECS 16 /**< Maximum number of tile parts expected by JPWL: increase at your will */ -#define JPWL_MAX_NO_PACKSPECS 16 /**< Maximum number of packet parts expected by JPWL: increase at your will */ -#define JPWL_MAX_NO_MARKERS 512 /**< Maximum number of JPWL markers: increase at your will */ -#define JPWL_PRIVATEINDEX_NAME "jpwl_index_privatefilename" /**< index file name used when JPWL is on */ -#define JPWL_EXPECTED_COMPONENTS 3 /**< Expect this number of components, so you'll find better the first EPB */ -#define JPWL_MAXIMUM_TILES 8192 /**< Expect this maximum number of tiles, to avoid some crashes */ -#define JPWL_MAXIMUM_HAMMING 2 /**< Expect this maximum number of bit errors in marker id's */ -#define JPWL_MAXIMUM_EPB_ROOM 65450 /**< Expect this maximum number of bytes for composition of EPBs */ -/* <<UniPG */ - -/** - * JPEG 2000 Profiles, see Table A.10 from 15444-1 (updated in various AMD). - * These values help choosing the RSIZ value for the J2K codestream. - * If OPJ_PROFILE_PART2 is chosen, it has to be combined with one or more of the - * OPJ_EXTENSION_* values below, e.g. rsiz = OPJ_PROFILE_PART2 | OPJ_EXTENSION_MCT; - * For broadcast profiles, the OPJ_PROFILE value has to be combined with the targeted - * mainlevel (3-0 LSB, value between 0 and 11), e.g. rsiz = OPJ_PROFILE_BC_MULTI | 0x0005; - * For IMF profiles, the OPJ_PROFILE value has to be combined with the targeted mainlevel - * (3-0 LSB) and sublevel (7-4 LSB), e.g. rsiz = OPJ_PROFILE_IMF_2K | 0x0040 | 0x0005; - * */ -#define OPJ_PROFILE_NONE 0x0000 /** no profile, conform to 15444-1 */ -#define OPJ_PROFILE_0 0x0001 /** Profile 0 as described in 15444-1,Table A.45 */ -#define OPJ_PROFILE_1 0x0002 /** Profile 1 as described in 15444-1,Table A.45 */ -#define OPJ_PROFILE_PART2 0x8000 /** At least 1 extension defined in 15444-2 (Part-2) */ -#define OPJ_PROFILE_CINEMA_2K 0x0003 /** 2K cinema profile defined in 15444-1 AMD1 */ -#define OPJ_PROFILE_CINEMA_4K 0x0004 /** 4K cinema profile defined in 15444-1 AMD1 */ -#define OPJ_PROFILE_CINEMA_S2K 0x0005 /** Scalable 2K cinema profile defined in 15444-1 AMD2 */ -#define OPJ_PROFILE_CINEMA_S4K 0x0006 /** Scalable 4K cinema profile defined in 15444-1 AMD2 */ -#define OPJ_PROFILE_CINEMA_LTS 0x0007 /** Long term storage cinema profile defined in 15444-1 AMD2 */ -#define OPJ_PROFILE_BC_SINGLE 0x0100 /** Single Tile Broadcast profile defined in 15444-1 AMD3 */ -#define OPJ_PROFILE_BC_MULTI 0x0200 /** Multi Tile Broadcast profile defined in 15444-1 AMD3 */ -#define OPJ_PROFILE_BC_MULTI_R 0x0300 /** Multi Tile Reversible Broadcast profile defined in 15444-1 AMD3 */ -#define OPJ_PROFILE_IMF_2K 0x0400 /** 2K Single Tile Lossy IMF profile defined in 15444-1 AMD 8 */ -#define OPJ_PROFILE_IMF_4K 0x0500 /** 4K Single Tile Lossy IMF profile defined in 15444-1 AMD 8 */ -#define OPJ_PROFILE_IMF_8K 0x0600 /** 8K Single Tile Lossy IMF profile defined in 15444-1 AMD 8 */ -#define OPJ_PROFILE_IMF_2K_R 0x0700 /** 2K Single/Multi Tile Reversible IMF profile defined in 15444-1 AMD 8 */ -#define OPJ_PROFILE_IMF_4K_R 0x0800 /** 4K Single/Multi Tile Reversible IMF profile defined in 15444-1 AMD 8 */ -#define OPJ_PROFILE_IMF_8K_R 0x0900 /** 8K Single/Multi Tile Reversible IMF profile defined in 15444-1 AMD 8 */ - -/** - * JPEG 2000 Part-2 extensions - * */ -#define OPJ_EXTENSION_NONE 0x0000 /** No Part-2 extension */ -#define OPJ_EXTENSION_MCT 0x0100 /** Custom MCT support */ - -/** - * JPEG 2000 profile macros - * */ -#define OPJ_IS_CINEMA(v) (((v) >= OPJ_PROFILE_CINEMA_2K)&&((v) <= OPJ_PROFILE_CINEMA_S4K)) -#define OPJ_IS_STORAGE(v) ((v) == OPJ_PROFILE_CINEMA_LTS) -#define OPJ_IS_BROADCAST(v) (((v) >= OPJ_PROFILE_BC_SINGLE)&&((v) <= ((OPJ_PROFILE_BC_MULTI_R) | (0x000b)))) -#define OPJ_IS_IMF(v) (((v) >= OPJ_PROFILE_IMF_2K)&&((v) <= ((OPJ_PROFILE_IMF_8K_R) | (0x009b)))) -#define OPJ_IS_PART2(v) ((v) & OPJ_PROFILE_PART2) - -/** - * JPEG 2000 codestream and component size limits in cinema profiles - * */ -#define OPJ_CINEMA_24_CS 1302083 /** Maximum codestream length for 24fps */ -#define OPJ_CINEMA_48_CS 651041 /** Maximum codestream length for 48fps */ -#define OPJ_CINEMA_24_COMP 1041666 /** Maximum size per color component for 2K & 4K @ 24fps */ -#define OPJ_CINEMA_48_COMP 520833 /** Maximum size per color component for 2K @ 48fps */ - -/* -========================================================== - enum definitions -========================================================== -*/ - -/** - * DEPRECATED: use RSIZ, OPJ_PROFILE_* and OPJ_EXTENSION_* instead - * Rsiz Capabilities - * */ -typedef enum RSIZ_CAPABILITIES { - OPJ_STD_RSIZ = 0, /** Standard
JPEG2000 profile*/ - OPJ_CINEMA2K = 3, /** Profile name for a 2K image*/ - OPJ_CINEMA4K = 4, /** Profile name for a 4K image*/ - OPJ_MCT = 0x8100 -} OPJ_RSIZ_CAPABILITIES; - -/** - * DEPRECATED: use RSIZ, OPJ_PROFILE_* and OPJ_EXTENSION_* instead - * Digital cinema operation mode - * */ -typedef enum CINEMA_MODE { - OPJ_OFF = 0, /** Not Digital Cinema*/ - OPJ_CINEMA2K_24 = 1, /** 2K Digital Cinema at 24 fps*/ - OPJ_CINEMA2K_48 = 2, /** 2K Digital Cinema at 48 fps*/ - OPJ_CINEMA4K_24 = 3 /** 4K Digital Cinema at 24 fps*/ -} OPJ_CINEMA_MODE; - -/** - * Progression order - * */ -typedef enum PROG_ORDER { - OPJ_PROG_UNKNOWN = -1, /**< place-holder */ - OPJ_LRCP = 0, /**< layer-resolution-component-precinct order */ - OPJ_RLCP = 1, /**< resolution-layer-component-precinct order */ - OPJ_RPCL = 2, /**< resolution-precinct-component-layer order */ - OPJ_PCRL = 3, /**< precinct-component-resolution-layer order */ - OPJ_CPRL = 4 /**< component-precinct-resolution-layer order */ -} OPJ_PROG_ORDER; - -/** - * Supported image color spaces -*/ -typedef enum COLOR_SPACE { - OPJ_CLRSPC_UNKNOWN = -1, /**< not supported by the library */ - OPJ_CLRSPC_UNSPECIFIED = 0, /**< not specified in the codestream */ - OPJ_CLRSPC_SRGB = 1, /**< sRGB */ - OPJ_CLRSPC_GRAY = 2, /**< grayscale */ - OPJ_CLRSPC_SYCC = 3, /**< YUV */ - OPJ_CLRSPC_EYCC = 4, /**< e-YCC */ - OPJ_CLRSPC_CMYK = 5 /**< CMYK */ -} OPJ_COLOR_SPACE; - -/** - * Supported codec -*/ -typedef enum CODEC_FORMAT { - OPJ_CODEC_UNKNOWN = -1, /**< place-holder */ - OPJ_CODEC_J2K = 0, /**< JPEG-2000 codestream : read/write */ - OPJ_CODEC_JPT = 1, /**< JPT-stream (JPEG 2000, JPIP) : read only */ - OPJ_CODEC_JP2 = 2, /**< JP2 file format : read/write */ - OPJ_CODEC_JPP = 3, /**< JPP-stream (JPEG 2000, JPIP) : to be coded */ - OPJ_CODEC_JPX = 4 /**< JPX file format (JPEG 2000 Part-2) : to be coded */ -} OPJ_CODEC_FORMAT; - - -/* -========================================================== - event manager typedef definitions -========================================================== -*/ - -/** - * Callback function prototype for events - * @param msg Event message - * @param client_data Client object where will be return the event message - * */ -typedef void (*opj_msg_callback)(const char *msg, void *client_data); - -/* -========================================================== - codec typedef definitions -========================================================== -*/ - -/** - * Progression order changes - * - */ -typedef struct opj_poc { - /** Resolution num start, Component num start, given by POC */ - OPJ_UINT32 resno0, compno0; - /** Layer num end,Resolution num end, Component num end, given by POC */ - OPJ_UINT32 layno1, resno1, compno1; - /** Layer num start,Precinct num start, Precinct num end */ - OPJ_UINT32 layno0, precno0, precno1; - /** Progression order enum*/ - OPJ_PROG_ORDER prg1, prg; - /** Progression order string*/ - OPJ_CHAR progorder[5]; - /** Tile number */ - OPJ_UINT32 tile; - /** Start and end values for Tile width and height*/ - OPJ_INT32 tx0, tx1, ty0, ty1; - /** Start value, initialised in pi_initialise_encode*/ - OPJ_UINT32 layS, resS, compS, prcS; - /** End value, initialised in pi_initialise_encode */ - OPJ_UINT32 layE, resE, compE, prcE; - /** Start and end values of Tile width and height, initialised in pi_initialise_encode*/ - OPJ_UINT32 txS, txE, tyS, tyE, dx, dy; - /** Temporary values for Tile parts, initialised in pi_create_encode */ - OPJ_UINT32 lay_t, res_t, comp_t, prc_t, tx0_t, ty0_t; -} opj_poc_t; - -/** - * Compression 
parameters - * */ -typedef struct opj_cparameters { - /** size of tile: tile_size_on = false (not in argument) or = true (in argument) */ - OPJ_BOOL tile_size_on; - /** XTOsiz */ - int cp_tx0; - /** YTOsiz */ - int cp_ty0; - /** XTsiz */ - int cp_tdx; - /** YTsiz */ - int cp_tdy; - /** allocation by rate/distortion */ - int cp_disto_alloc; - /** allocation by fixed layer */ - int cp_fixed_alloc; - /** add fixed_quality */ - int cp_fixed_quality; - /** fixed layer */ - int *cp_matrice; - /** comment for coding */ - char *cp_comment; - /** csty : coding style */ - int csty; - /** progression order (default OPJ_LRCP) */ - OPJ_PROG_ORDER prog_order; - /** progression order changes */ - opj_poc_t POC[32]; - /** number of progression order changes (POC), default to 0 */ - OPJ_UINT32 numpocs; - /** number of layers */ - int tcp_numlayers; - /** rates of layers - might be subsequently limited by the max_cs_size field. - * Should be decreasing. 1 can be - * used as last value to indicate the last layer is lossless. */ - float tcp_rates[100]; - /** different psnr for successive layers. Should be increasing. 0 can be - * used as last value to indicate the last layer is lossless. */ - float tcp_distoratio[100]; - /** number of resolutions */ - int numresolution; - /** initial code block width, default to 64 */ - int cblockw_init; - /** initial code block height, default to 64 */ - int cblockh_init; - /** mode switch (cblk_style) */ - int mode; - /** 1 : use the irreversible DWT 9-7, 0 : use lossless compression (default) */ - int irreversible; - /** region of interest: affected component in [0..3], -1 means no ROI */ - int roi_compno; - /** region of interest: upshift value */ - int roi_shift; - /* number of precinct size specifications */ - int res_spec; - /** initial precinct width */ - int prcw_init[OPJ_J2K_MAXRLVLS]; - /** initial precinct height */ - int prch_init[OPJ_J2K_MAXRLVLS]; - - /**@name command line encoder parameters (not used inside the library) */ - /*@{*/ - /** input file name */ - char infile[OPJ_PATH_LEN]; - /** output file name */ - char outfile[OPJ_PATH_LEN]; - /** DEPRECATED. Index generation is now handled with the opj_encode_with_info() function. Set to NULL */ - int index_on; - /** DEPRECATED. Index generation is now handled with the opj_encode_with_info() function.
Set to NULL */ - char index[OPJ_PATH_LEN]; - /** subimage encoding: origin image offset in x direction */ - int image_offset_x0; - /** subimage encoding: origin image offset in y direction */ - int image_offset_y0; - /** subsampling value for dx */ - int subsampling_dx; - /** subsampling value for dy */ - int subsampling_dy; - /** input file format 0: PGX, 1: PxM, 2: BMP 3:TIF*/ - int decod_format; - /** output file format 0: J2K, 1: JP2, 2: JPT */ - int cod_format; - /*@}*/ - - /* UniPG>> */ /* NOT YET USED IN THE V2 VERSION OF OPENJPEG */ - /**@name JPWL encoding parameters */ - /*@{*/ - /** enables writing of EPC in MH, thus activating JPWL */ - OPJ_BOOL jpwl_epc_on; - /** error protection method for MH (0,1,16,32,37-128) */ - int jpwl_hprot_MH; - /** tile number of header protection specification (>=0) */ - int jpwl_hprot_TPH_tileno[JPWL_MAX_NO_TILESPECS]; - /** error protection methods for TPHs (0,1,16,32,37-128) */ - int jpwl_hprot_TPH[JPWL_MAX_NO_TILESPECS]; - /** tile number of packet protection specification (>=0) */ - int jpwl_pprot_tileno[JPWL_MAX_NO_PACKSPECS]; - /** packet number of packet protection specification (>=0) */ - int jpwl_pprot_packno[JPWL_MAX_NO_PACKSPECS]; - /** error protection methods for packets (0,1,16,32,37-128) */ - int jpwl_pprot[JPWL_MAX_NO_PACKSPECS]; - /** enables writing of ESD, (0=no/1/2 bytes) */ - int jpwl_sens_size; - /** sensitivity addressing size (0=auto/2/4 bytes) */ - int jpwl_sens_addr; - /** sensitivity range (0-3) */ - int jpwl_sens_range; - /** sensitivity method for MH (-1=no,0-7) */ - int jpwl_sens_MH; - /** tile number of sensitivity specification (>=0) */ - int jpwl_sens_TPH_tileno[JPWL_MAX_NO_TILESPECS]; - /** sensitivity methods for TPHs (-1=no,0-7) */ - int jpwl_sens_TPH[JPWL_MAX_NO_TILESPECS]; - /*@}*/ - /* <> */ /* NOT YET USED IN THE V2 VERSION OF OPENJPEG */ - /**@name JPWL decoding parameters */ - /*@{*/ - /** activates the JPWL correction capabilities */ - OPJ_BOOL jpwl_correct; - /** expected number of components */ - int jpwl_exp_comps; - /** maximum number of tiles */ - int jpwl_max_tiles; - /*@}*/ - /* <> */ -/** - * Marker structure - * */ -typedef struct opj_marker_info { - /** marker type */ - unsigned short int type; - /** position in codestream */ - OPJ_OFF_T pos; - /** length, marker val included */ - int len; -} opj_marker_info_t; -/* <> */ - /** number of markers */ - int marknum; - /** list of markers */ - opj_marker_info_t *marker; - /** actual size of markers array */ - int maxmarknum; - /* <> */ /* NOT USED FOR THE MOMENT IN THE V2 VERSION */ - /** number of markers */ - OPJ_UINT32 marknum; - /** list of markers */ - opj_marker_info_t *marker; - /** actual size of markers array */ - OPJ_UINT32 maxmarknum; - /* <> */ /* NOT USED FOR THE MOMENT IN THE V2 VERSION */ - /** number of markers */ - OPJ_UINT32 marknum; - /** list of markers */ - opj_marker_info_t *marker; - /** actual size of markers array */ - OPJ_UINT32 maxmarknum; - /* < */ - -/* -========================================================== - Metadata from the JP2file -========================================================== -*/ - -/** - * Info structure of the JP2 file - * EXPERIMENTAL FOR THE MOMENT - */ -typedef struct opj_jp2_metadata { - /** */ - OPJ_INT32 not_used; - -} opj_jp2_metadata_t; - -/** - * Index structure of the JP2 file - * EXPERIMENTAL FOR THE MOMENT - */ -typedef struct opj_jp2_index { - /** */ - OPJ_INT32 not_used; - -} opj_jp2_index_t; - - -#ifdef __cplusplus -extern "C" { -#endif - - -/* 
-========================================================== - openjpeg version -========================================================== -*/ - -/* Get the version of the openjpeg library*/ -OPJ_API const char * OPJ_CALLCONV opj_version(void); - -/* -========================================================== - image functions definitions -========================================================== -*/ - -/** - * Create an image - * - * @param numcmpts number of components - * @param cmptparms components parameters - * @param clrspc image color space - * @return returns a new image structure if successful, returns NULL otherwise - * */ -OPJ_API opj_image_t* OPJ_CALLCONV opj_image_create(OPJ_UINT32 numcmpts, - opj_image_cmptparm_t *cmptparms, OPJ_COLOR_SPACE clrspc); - -/** - * Deallocate any resources associated with an image - * - * @param image image to be destroyed - */ -OPJ_API void OPJ_CALLCONV opj_image_destroy(opj_image_t *image); - -/** - * Creates an image without allocating memory for the image (used in the new version of the library). - * - * @param numcmpts the number of components - * @param cmptparms the components parameters - * @param clrspc the image color space - * - * @return a new image structure if successful, NULL otherwise. -*/ -OPJ_API opj_image_t* OPJ_CALLCONV opj_image_tile_create(OPJ_UINT32 numcmpts, - opj_image_cmptparm_t *cmptparms, OPJ_COLOR_SPACE clrspc); - -/** - * Allocator for opj_image_t->comps[].data - * To be paired with opj_image_data_free. - * - * @param size number of bytes to allocate - * - * @return a new pointer if successful, NULL otherwise. - * @since 2.2.0 -*/ -OPJ_API void* OPJ_CALLCONV opj_image_data_alloc(OPJ_SIZE_T size); - -/** - * Destructor for opj_image_t->comps[].data - * To be paired with opj_image_data_alloc. - * - * @param ptr Pointer to free - * - * @since 2.2.0 -*/ -OPJ_API void OPJ_CALLCONV opj_image_data_free(void* ptr); - -/* -========================================================== - stream functions definitions -========================================================== -*/ - -/** - * Creates an abstract stream. This function does nothing except allocating memory and initializing the abstract stream. - * - * @param p_is_input if set to true then the stream will be an input stream, an output stream else. - * - * @return a stream object. -*/ -OPJ_API opj_stream_t* OPJ_CALLCONV opj_stream_default_create( - OPJ_BOOL p_is_input); - -/** - * Creates an abstract stream. This function does nothing except allocating memory and initializing the abstract stream. - * - * @param p_buffer_size FIXME DOC - * @param p_is_input if set to true then the stream will be an input stream, an output stream else. - * - * @return a stream object. -*/ -OPJ_API opj_stream_t* OPJ_CALLCONV opj_stream_create(OPJ_SIZE_T p_buffer_size, - OPJ_BOOL p_is_input); - -/** - * Destroys a stream created by opj_create_stream. This function does NOT close the abstract stream. If needed the user must - * close its own implementation of the stream. - * - * @param p_stream the stream to destroy. - */ -OPJ_API void OPJ_CALLCONV opj_stream_destroy(opj_stream_t* p_stream); - -/** - * Sets the given function to be used as a read function. - * @param p_stream the stream to modify - * @param p_function the function to use a read function. -*/ -OPJ_API void OPJ_CALLCONV opj_stream_set_read_function(opj_stream_t* p_stream, - opj_stream_read_fn p_function); - -/** - * Sets the given function to be used as a write function. 
- * @param p_stream the stream to modify - * @param p_function the function to use as the write function. -*/ -OPJ_API void OPJ_CALLCONV opj_stream_set_write_function(opj_stream_t* p_stream, - opj_stream_write_fn p_function); - -/** - * Sets the given function to be used as a skip function. - * @param p_stream the stream to modify - * @param p_function the function to use as the skip function. -*/ -OPJ_API void OPJ_CALLCONV opj_stream_set_skip_function(opj_stream_t* p_stream, - opj_stream_skip_fn p_function); - -/** - * Sets the given function to be used as a seek function, the stream is then seekable. - * @param p_stream the stream to modify - * @param p_function the function to use as the seek function. -*/ -OPJ_API void OPJ_CALLCONV opj_stream_set_seek_function(opj_stream_t* p_stream, - opj_stream_seek_fn p_function); - -/** - * Sets the given data to be used as the user data for the stream. - * @param p_stream the stream to modify - * @param p_data the data to set. - * @param p_function the function to free p_data when opj_stream_destroy() is called. -*/ -OPJ_API void OPJ_CALLCONV opj_stream_set_user_data(opj_stream_t* p_stream, - void * p_data, opj_stream_free_user_data_fn p_function); - -/** - * Sets the length of the user data for the stream. - * - * @param p_stream the stream to modify - * @param data_length length of the user_data. -*/ -OPJ_API void OPJ_CALLCONV opj_stream_set_user_data_length( - opj_stream_t* p_stream, OPJ_UINT64 data_length); - -/** - * Create a stream from a file identified with its filename with default parameters (helper function) - * @param fname the filename of the file to stream - * @param p_is_read_stream whether the stream is a read stream (true) or not (false) -*/ -OPJ_API opj_stream_t* OPJ_CALLCONV opj_stream_create_default_file_stream( - const char *fname, OPJ_BOOL p_is_read_stream); - -/** Create a stream from a file identified with its filename with a specific buffer size - * @param fname the filename of the file to stream - * @param p_buffer_size size of the chunk used to stream - * @param p_is_read_stream whether the stream is a read stream (true) or not (false) -*/ -OPJ_API opj_stream_t* OPJ_CALLCONV opj_stream_create_file_stream( - const char *fname, - OPJ_SIZE_T p_buffer_size, - OPJ_BOOL p_is_read_stream); - -/* -========================================================== - event manager functions definitions -========================================================== -*/ -/** - * Set the info handler used by openjpeg. - * @param p_codec the codec previously initialised - * @param p_callback the callback function which will be used - * @param p_user_data client object to which the message will be returned -*/ -OPJ_API OPJ_BOOL OPJ_CALLCONV opj_set_info_handler(opj_codec_t * p_codec, - opj_msg_callback p_callback, - void * p_user_data); -/** - * Set the warning handler used by openjpeg. - * @param p_codec the codec previously initialised - * @param p_callback the callback function which will be used - * @param p_user_data client object to which the message will be returned -*/ -OPJ_API OPJ_BOOL OPJ_CALLCONV opj_set_warning_handler(opj_codec_t * p_codec, - opj_msg_callback p_callback, - void * p_user_data); -/** - * Set the error handler used by openjpeg.
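 * A minimal sketch of a handler and its registration (the callback name and the
 * use of stderr are illustrative; l_codec stands for any handle returned by
 * opj_create_decompress() or opj_create_compress()):
 * @code
 * static void my_error_callback(const char *msg, void *client_data)
 * {
 *     (void)client_data;                /* unused here */
 *     fprintf(stderr, "[ERROR] %s", msg);
 * }
 *
 * opj_set_error_handler(l_codec, my_error_callback, NULL);
 * @endcode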
- * @param p_codec the codec previously initialised - * @param p_callback the callback function which will be used - * @param p_user_data client object to which the message will be returned -*/ -OPJ_API OPJ_BOOL OPJ_CALLCONV opj_set_error_handler(opj_codec_t * p_codec, - opj_msg_callback p_callback, - void * p_user_data); - -/* -========================================================== - codec functions definitions -========================================================== -*/ - -/** - * Creates a J2K/JP2 decompression structure - * @param format Decoder to select - * - * @return Returns a handle to a decompressor if successful, returns NULL otherwise - * */ -OPJ_API opj_codec_t* OPJ_CALLCONV opj_create_decompress( - OPJ_CODEC_FORMAT format); - -/** - * Destroy a decompressor handle - * - * @param p_codec decompressor handle to destroy - */ -OPJ_API void OPJ_CALLCONV opj_destroy_codec(opj_codec_t * p_codec); - -/** - * Reads after the codestream if necessary. - * @param p_codec the JPEG2000 codec to read. - * @param p_stream the JPEG2000 stream. - */ -OPJ_API OPJ_BOOL OPJ_CALLCONV opj_end_decompress(opj_codec_t *p_codec, - opj_stream_t *p_stream); - - -/** - * Set decoding parameters to default values - * @param parameters Decompression parameters - */ -OPJ_API void OPJ_CALLCONV opj_set_default_decoder_parameters( - opj_dparameters_t *parameters); - -/** - * Set up the decoder with the decompression parameters and message handler - * provided by the user. - * - * @param p_codec decompressor handle - * @param parameters decompression parameters - * - * @return true if the decoder is correctly set - */ -OPJ_API OPJ_BOOL OPJ_CALLCONV opj_setup_decoder(opj_codec_t *p_codec, - opj_dparameters_t *parameters); - -/** - * Allocates worker threads for the compressor/decompressor. - * - * By default, only the main thread is used. If this function is not used, - * but the OPJ_NUM_THREADS environment variable is set, its value will be - * used to initialize the number of threads. The value can be either an integer - * number, or "ALL_CPUS". If OPJ_NUM_THREADS is set and this function is called, - * this function will override the behaviour of the environment variable. - * - * Note: currently only has effect on the decompressor. - * - * @param p_codec decompressor handle - * @param num_threads number of threads. - * - * @return OPJ_TRUE if the decoder is correctly set - */ -OPJ_API OPJ_BOOL OPJ_CALLCONV opj_codec_set_threads(opj_codec_t *p_codec, - int num_threads); - -/** - * Decodes an image header. - * - * @param p_stream the jpeg2000 stream. - * @param p_codec the jpeg2000 codec to read. - * @param p_image the image structure initialized with the characteristics of encoded image. - * - * @return true if the main header of the codestream and the JP2 header is correctly read. - */ -OPJ_API OPJ_BOOL OPJ_CALLCONV opj_read_header(opj_stream_t *p_stream, - opj_codec_t *p_codec, - opj_image_t **p_image); - - -/** Restrict the number of components to decode. - * - * This function should be called after opj_read_header(). - * - * This function makes it possible to restrict the set of decoded components to the - * specified indices. - * Note that the current implementation (apply_color_transforms == OPJ_FALSE) - * is such that neither the multi-component transform at codestream level, - * nor JP2 channel transformations will be applied. - * Consequently the indices are relative to the codestream. - * - * Note: opj_decode_tile_data() should not be used together with opj_set_decoded_components().
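 * A minimal sketch (l_codec is any decompressor handle on which opj_read_header()
 * has already succeeded; the choice of component is illustrative):
 * @code
 * const OPJ_UINT32 l_comps_indices[] = { 0 };  /* keep only the first codestream component */
 * if (!opj_set_decoded_components(l_codec, 1, l_comps_indices, OPJ_FALSE)) {
 *     /* handle the error */
 * }
 * @endcode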
- * - * @param p_codec the jpeg2000 codec to read. - * @param numcomps Size of the comps_indices array. - * @param comps_indices Array of numcomps values representing the indices - * of the components to decode (relative to the - * codestream, starting at 0) - * @param apply_color_transforms Whether multi-component transform at codestream level - * or JP2 channel transformations should be applied. - * Currently this parameter should be set to OPJ_FALSE. - * Setting it to OPJ_TRUE will result in an error. - * - * @return OPJ_TRUE in case of success. - */ -OPJ_API OPJ_BOOL OPJ_CALLCONV opj_set_decoded_components(opj_codec_t *p_codec, - OPJ_UINT32 numcomps, - const OPJ_UINT32* comps_indices, - OPJ_BOOL apply_color_transforms); - -/** - * Sets the given area to be decoded. This function should be called right after opj_read_header and before any tile header reading. - * - * The coordinates passed to this function should be expressed in the reference grid, - * that is to say at the highest resolution level, even if requesting the image at lower - * resolution levels. - * - * Generally opj_set_decode_area() should be followed by opj_decode(), and the - * codec cannot be re-used. - * In the particular case of an image made of a single tile, several sequences of - * calls to opj_set_decode_area() and opj_decode() are allowed, and will bring - * performance improvements when reading an image by chunks. - * - * @param p_codec the jpeg2000 codec. - * @param p_image the decoded image previously set by opj_read_header - * @param p_start_x the left position of the rectangle to decode (in image coordinates). - * @param p_end_x the right position of the rectangle to decode (in image coordinates). - * @param p_start_y the top position of the rectangle to decode (in image coordinates). - * @param p_end_y the bottom position of the rectangle to decode (in image coordinates). - * - * @return true if the area could be set. - */ -OPJ_API OPJ_BOOL OPJ_CALLCONV opj_set_decode_area(opj_codec_t *p_codec, - opj_image_t* p_image, - OPJ_INT32 p_start_x, OPJ_INT32 p_start_y, - OPJ_INT32 p_end_x, OPJ_INT32 p_end_y); - -/** - * Decode an image from a JPEG-2000 codestream - * - * @param p_decompressor decompressor handle - * @param p_stream Input buffer stream - * @param p_image the decoded image - * @return true if success, otherwise false - * */ -OPJ_API OPJ_BOOL OPJ_CALLCONV opj_decode(opj_codec_t *p_decompressor, - opj_stream_t *p_stream, - opj_image_t *p_image); - -/** - * Get the decoded tile from the codec - * - * @param p_codec the jpeg2000 codec. - * @param p_stream input stream - * @param p_image output image - * @param tile_index index of the tile which will be decoded - * - * @return true if success, otherwise false - */ -OPJ_API OPJ_BOOL OPJ_CALLCONV opj_get_decoded_tile(opj_codec_t *p_codec, - opj_stream_t *p_stream, - opj_image_t *p_image, - OPJ_UINT32 tile_index); - -/** - * Set the resolution factor of the decoded image - * @param p_codec the jpeg2000 codec. - * @param res_factor resolution factor to set - * - * @return true if success, otherwise false - */ -OPJ_API OPJ_BOOL OPJ_CALLCONV opj_set_decoded_resolution_factor( - opj_codec_t *p_codec, OPJ_UINT32 res_factor); - -/** - * Writes a tile with the given data. - * - * @param p_codec the jpeg2000 codec. - * @param p_tile_index the index of the tile to write. At the moment, the tiles must be written from 0 to n-1 in sequence. - * @param p_data pointer to the data to write. Data is arranged in sequence, data_comp0, then data_comp1, then ...
NO INTERLEAVING should be set. - * @param p_data_size this value is used to make sure the data being written is correct. The size must be equal to the sum for each component of - * tile_width * tile_height * component_size. component_size can be 1, 2 or 4 bytes, depending on the precision of the given component. - * @param p_stream the stream to write data to. - * - * @return true if the data could be written. - */ -OPJ_API OPJ_BOOL OPJ_CALLCONV opj_write_tile(opj_codec_t *p_codec, - OPJ_UINT32 p_tile_index, - OPJ_BYTE * p_data, - OPJ_UINT32 p_data_size, - opj_stream_t *p_stream); - -/** - * Reads a tile header. This function is compulsory and allows one to know the size of the tile that will be decoded. - * The user may need to refer to the image obtained with opj_read_header to understand the size being taken by the tile. - * - * @param p_codec the jpeg2000 codec. - * @param p_tile_index pointer to a value that will hold the index of the tile being decoded, in case of success. - * @param p_data_size pointer to a value that will hold the maximum size of the decoded data, in case of success. In case - * of truncated codestreams, the actual number of bytes decoded may be lower. The computation of the size is the same - * as depicted in opj_write_tile. - * @param p_tile_x0 pointer to a value that will hold the x0 pos of the tile (in the image). - * @param p_tile_y0 pointer to a value that will hold the y0 pos of the tile (in the image). - * @param p_tile_x1 pointer to a value that will hold the x1 pos of the tile (in the image). - * @param p_tile_y1 pointer to a value that will hold the y1 pos of the tile (in the image). - * @param p_nb_comps pointer to a value that will hold the number of components in the tile. - * @param p_should_go_on pointer to a boolean that will hold the fact that the decoding should go on. In case the - * codestream is over at the time of the call, the value will be set to false. The user should then stop - * the decoding. - * @param p_stream the stream to decode. - * @return true if the tile header could be decoded. In case the decoding should end, the returned value is still true. - * returning false may be the result of a shortage of memory or an internal error. - */ -OPJ_API OPJ_BOOL OPJ_CALLCONV opj_read_tile_header(opj_codec_t *p_codec, - opj_stream_t * p_stream, - OPJ_UINT32 * p_tile_index, - OPJ_UINT32 * p_data_size, - OPJ_INT32 * p_tile_x0, OPJ_INT32 * p_tile_y0, - OPJ_INT32 * p_tile_x1, OPJ_INT32 * p_tile_y1, - OPJ_UINT32 * p_nb_comps, - OPJ_BOOL * p_should_go_on); - -/** - * Reads tile data. This function is compulsory and allows one to decode tile data. opj_read_tile_header should be called before. - * The user may need to refer to the image obtained with opj_read_header to understand the size being taken by the tile. - * - * Note: opj_decode_tile_data() should not be used together with opj_set_decoded_components(). - * - * @param p_codec the jpeg2000 codec. - * @param p_tile_index the index of the tile being decoded, this should be the value set by opj_read_tile_header. - * @param p_data pointer to a memory block that will hold the decoded data. - * @param p_data_size size of p_data. p_data_size should be greater than or equal to the value set by opj_read_tile_header. - * @param p_stream the stream to decode. - * - * @return true if the data could be decoded.
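 * A minimal sketch of the loop these two functions form (variable names and the
 * allocation strategy are illustrative; error paths are abbreviated):
 * @code
 * OPJ_UINT32 l_tile_index, l_data_size, l_nb_comps;
 * OPJ_INT32 l_x0, l_y0, l_x1, l_y1;
 * OPJ_BOOL l_go_on = OPJ_TRUE;
 *
 * while (opj_read_tile_header(l_codec, l_stream, &l_tile_index, &l_data_size,
 *                             &l_x0, &l_y0, &l_x1, &l_y1,
 *                             &l_nb_comps, &l_go_on) && l_go_on) {
 *     OPJ_BYTE *l_data = (OPJ_BYTE *) malloc(l_data_size);
 *     if (!l_data ||
 *         !opj_decode_tile_data(l_codec, l_tile_index, l_data, l_data_size, l_stream)) {
 *         free(l_data);
 *         break;                        /* shortage of memory or decode error */
 *     }
 *     /* consume l_data: component planes in sequence, no interleaving */
 *     free(l_data);
 * }
 * @endcode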
- */ -OPJ_API OPJ_BOOL OPJ_CALLCONV opj_decode_tile_data(opj_codec_t *p_codec, - OPJ_UINT32 p_tile_index, - OPJ_BYTE * p_data, - OPJ_UINT32 p_data_size, - opj_stream_t *p_stream); - -/* COMPRESSION FUNCTIONS */ - -/** - * Creates a J2K/JP2 compression structure - * @param format Coder to select - * @return Returns a handle to a compressor if successful, returns NULL otherwise - */ -OPJ_API opj_codec_t* OPJ_CALLCONV opj_create_compress(OPJ_CODEC_FORMAT format); - -/** -Set encoding parameters to default values (a minimal usage sketch follows the list), that means: -
    -
  • Lossless -
  • 1 tile -
  • Size of precinct : 2^15 x 2^15 (means 1 precinct) -
  • Size of code-block : 64 x 64 -
  • Number of resolutions: 6 -
  • No SOP marker in the codestream -
  • No EPH marker in the codestream -
  • No sub-sampling in x or y direction -
  • No mode switch activated -
  • Progression order: LRCP -
  • No index file -
  • No ROI upshifted -
  • No offset of the origin of the image -
  • No offset of the origin of the tiles -
  • Reversible DWT 5-3 -
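A minimal usage sketch (l_image stands for a filled opj_image_t, e.g. from opj_image_create(); "out.j2k" is an illustrative path; error handling omitted):
@code
opj_cparameters_t l_params;
opj_set_default_encoder_parameters(&l_params);

opj_codec_t *l_codec = opj_create_compress(OPJ_CODEC_J2K);
opj_setup_encoder(l_codec, &l_params, l_image);

opj_stream_t *l_stream = opj_stream_create_default_file_stream("out.j2k", OPJ_FALSE);
opj_start_compress(l_codec, l_image, l_stream);
opj_encode(l_codec, l_stream);
opj_end_compress(l_codec, l_stream);

opj_stream_destroy(l_stream);
opj_destroy_codec(l_codec);
@endcode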
-@param parameters Compression parameters -*/ -OPJ_API void OPJ_CALLCONV opj_set_default_encoder_parameters( - opj_cparameters_t *parameters); - -/** - * Setup the encoder parameters using the current image and using user parameters. - * @param p_codec Compressor handle - * @param parameters Compression parameters - * @param image Input filled image - */ -OPJ_API OPJ_BOOL OPJ_CALLCONV opj_setup_encoder(opj_codec_t *p_codec, - opj_cparameters_t *parameters, - opj_image_t *image); - -/** - * Start compressing the current image. - * @param p_codec Compressor handle - * @param p_image Input filled image - * @param p_stream Input stream - */ -OPJ_API OPJ_BOOL OPJ_CALLCONV opj_start_compress(opj_codec_t *p_codec, - opj_image_t * p_image, - opj_stream_t *p_stream); - -/** - * Finish compressing the current image. - * @param p_codec Compressor handle - * @param p_stream Input stream - */ -OPJ_API OPJ_BOOL OPJ_CALLCONV opj_end_compress(opj_codec_t *p_codec, - opj_stream_t *p_stream); - -/** - * Encode an image into a JPEG-2000 codestream - * @param p_codec compressor handle - * @param p_stream Output buffer stream - * - * @return Returns true if successful, returns false otherwise - */ -OPJ_API OPJ_BOOL OPJ_CALLCONV opj_encode(opj_codec_t *p_codec, - opj_stream_t *p_stream); -/* -========================================================== - codec output functions definitions -========================================================== -*/ -/* EXPERIMENTAL FUNCTIONS FOR NOW, USED ONLY IN J2K_DUMP*/ - -/** -Destroy Codestream information after compression or decompression -@param cstr_info Codestream information structure -*/ -OPJ_API void OPJ_CALLCONV opj_destroy_cstr_info(opj_codestream_info_v2_t - **cstr_info); - - -/** - * Dump the codec information into the output stream - * - * @param p_codec the jpeg2000 codec. - * @param info_flag type of information dump. - * @param output_stream output stream to which the information obtained from the codec is dumped. - * - */ -OPJ_API void OPJ_CALLCONV opj_dump_codec(opj_codec_t *p_codec, - OPJ_INT32 info_flag, - FILE* output_stream); - -/** - * Get the codestream information from the codec - * - * @param p_codec the jpeg2000 codec. - * - * @return a pointer to a codestream information structure. - * - */ -OPJ_API opj_codestream_info_v2_t* OPJ_CALLCONV opj_get_cstr_info( - opj_codec_t *p_codec); - -/** - * Get the codestream index from the codec - * - * @param p_codec the jpeg2000 codec. - * - * @return a pointer to a codestream index structure. - * - */ -OPJ_API opj_codestream_index_t * OPJ_CALLCONV opj_get_cstr_index( - opj_codec_t *p_codec); - -OPJ_API void OPJ_CALLCONV opj_destroy_cstr_index(opj_codestream_index_t - **p_cstr_index); - - -/** - * Get the JP2 file information from the codec FIXME - * - * @param p_codec the jpeg2000 codec. - * - * @return a pointer to a JP2 metadata structure. - * - */ -OPJ_API opj_jp2_metadata_t* OPJ_CALLCONV opj_get_jp2_metadata( - opj_codec_t *p_codec); - -/** - * Get the JP2 file index from the codec FIXME - * - * @param p_codec the jpeg2000 codec. - * - * @return a pointer to a JP2 index structure. - * - */ -OPJ_API opj_jp2_index_t* OPJ_CALLCONV opj_get_jp2_index(opj_codec_t *p_codec); - - -/* -========================================================== - MCT functions -========================================================== -*/ - -/** - * Sets the MCT matrix to use. - * - * @param parameters the parameters to change. - * @param pEncodingMatrix the encoding matrix. - * @param p_dc_shift the dc shift coefficients to use.
- * @param pNbComp the number of components of the image. - * - * @return true if the parameters could be set. - */ -OPJ_API OPJ_BOOL OPJ_CALLCONV opj_set_MCT(opj_cparameters_t *parameters, - OPJ_FLOAT32 * pEncodingMatrix, - OPJ_INT32 * p_dc_shift, - OPJ_UINT32 pNbComp); - -/* -========================================================== - Thread functions -========================================================== -*/ - -/** Returns if the library is built with thread support. - * OPJ_TRUE if mutex, condition, thread, thread pool are available. - */ -OPJ_API OPJ_BOOL OPJ_CALLCONV opj_has_thread_support(void); - -/** Return the number of virtual CPUs */ -OPJ_API int OPJ_CALLCONV opj_get_num_cpus(void); - - -#ifdef __cplusplus -} -#endif - -#endif /* OPENJPEG_H */ diff --git a/src/3rd/LibOpenJpeg/opj_clock.c b/src/3rd/LibOpenJpeg/opj_clock.c deleted file mode 100644 index 24f79a9a..00000000 --- a/src/3rd/LibOpenJpeg/opj_clock.c +++ /dev/null @@ -1,67 +0,0 @@ -/* - * The copyright in this software is being made available under the 2-clauses - * BSD License, included below. This software may be subject to other third - * party and contributor rights, including patent rights, and no such rights - * are granted under this license. - * - * Copyright (c) 2005, Herve Drolon, FreeImage Team - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS' - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#include "opj_includes.h" - -#ifdef _WIN32 -#include -#else -#include -#include -#include -#endif /* _WIN32 */ - -OPJ_FLOAT64 opj_clock(void) -{ -#ifdef _WIN32 - /* _WIN32: use QueryPerformance (very accurate) */ - LARGE_INTEGER freq, t ; - /* freq is the clock speed of the CPU */ - QueryPerformanceFrequency(&freq) ; - /* cout << "freq = " << ((double) freq.QuadPart) << endl; */ - /* t is the high resolution performance counter (see MSDN) */ - QueryPerformanceCounter(& t) ; - return ((OPJ_FLOAT64) t.QuadPart / (OPJ_FLOAT64) freq.QuadPart) ; -#else - /* Unix or Linux: use resource usage */ - struct rusage t; - OPJ_FLOAT64 procTime; - /* (1) Get the rusage data structure at this moment (man getrusage) */ - getrusage(0, &t); - /* (2) What is the elapsed time ? 
- CPU time = User time + System time */ - /* (2a) Get the seconds */ - procTime = (OPJ_FLOAT64)(t.ru_utime.tv_sec + t.ru_stime.tv_sec); - /* (2b) More precisely! Get the microseconds part ! */ - return (procTime + (OPJ_FLOAT64)(t.ru_utime.tv_usec + t.ru_stime.tv_usec) * - 1e-6) ; -#endif -} - diff --git a/src/3rd/LibOpenJpeg/opj_clock.h b/src/3rd/LibOpenJpeg/opj_clock.h deleted file mode 100644 index 76366f53..00000000 --- a/src/3rd/LibOpenJpeg/opj_clock.h +++ /dev/null @@ -1,59 +0,0 @@ -/* - * The copyright in this software is being made available under the 2-clauses - * BSD License, included below. This software may be subject to other third - * party and contributor rights, including patent rights, and no such rights - * are granted under this license. - * - * Copyright (c) 2005, Herve Drolon, FreeImage Team - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS' - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ -#ifndef OPJ_CLOCK_H -#define OPJ_CLOCK_H -/** -@file opj_clock.h -@brief Internal function for timing - -The functions in OPJ_CLOCK.C are internal utilities mainly used for timing. -*/ - -/** @defgroup MISC MISC - Miscellaneous internal functions */ -/*@{*/ - -/** @name Exported functions */ -/*@{*/ -/* ----------------------------------------------------------------------- */ - -/** -Difference in successive opj_clock() calls tells you the elapsed time -@return Returns time in seconds -*/ -OPJ_FLOAT64 opj_clock(void); - -/* ----------------------------------------------------------------------- */ -/*@}*/ - -/*@}*/ - -#endif /* OPJ_CLOCK_H */ - diff --git a/src/3rd/LibOpenJpeg/opj_codec.h b/src/3rd/LibOpenJpeg/opj_codec.h deleted file mode 100644 index b962b121..00000000 --- a/src/3rd/LibOpenJpeg/opj_codec.h +++ /dev/null @@ -1,171 +0,0 @@ -/* - * The copyright in this software is being made available under the 2-clauses - * BSD License, included below. This software may be subject to other third - * party and contributor rights, including patent rights, and no such rights - * are granted under this license. - * - * Copyright (c) 2002-2014, Universite catholique de Louvain (UCL), Belgium - * Copyright (c) 2002-2014, Professor Benoit Macq - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS' - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ -#ifndef OPJ_CODEC_H -#define OPJ_CODEC_H -/** -@file opj_codec.h -*/ - - -/** - * Main codec handler used for compression or decompression. - */ -typedef struct opj_codec_private { - /** FIXME DOC */ - union { - /** - * Decompression handler. - */ - struct opj_decompression { - /** Main header reading function handler */ - OPJ_BOOL(*opj_read_header)(struct opj_stream_private * cio, - void * p_codec, - opj_image_t **p_image, - struct opj_event_mgr * p_manager); - - /** Decoding function */ - OPJ_BOOL(*opj_decode)(void * p_codec, - struct opj_stream_private * p_cio, - opj_image_t * p_image, - struct opj_event_mgr * p_manager); - - /** FIXME DOC */ - OPJ_BOOL(*opj_read_tile_header)(void * p_codec, - OPJ_UINT32 * p_tile_index, - OPJ_UINT32 * p_data_size, - OPJ_INT32 * p_tile_x0, - OPJ_INT32 * p_tile_y0, - OPJ_INT32 * p_tile_x1, - OPJ_INT32 * p_tile_y1, - OPJ_UINT32 * p_nb_comps, - OPJ_BOOL * p_should_go_on, - struct opj_stream_private * p_cio, - struct opj_event_mgr * p_manager); - - /** FIXME DOC */ - OPJ_BOOL(*opj_decode_tile_data)(void * p_codec, - OPJ_UINT32 p_tile_index, - OPJ_BYTE * p_data, - OPJ_UINT32 p_data_size, - struct opj_stream_private * p_cio, - struct opj_event_mgr * p_manager); - - /** Reading function used after codestream if necessary */ - OPJ_BOOL(* opj_end_decompress)(void *p_codec, - struct opj_stream_private * cio, - struct opj_event_mgr * p_manager); - - /** Codec destroy function handler */ - void (*opj_destroy)(void * p_codec); - - /** Setup decoder function handler */ - void (*opj_setup_decoder)(void * p_codec, opj_dparameters_t * p_param); - - /** Set decode area function handler */ - OPJ_BOOL(*opj_set_decode_area)(void * p_codec, - opj_image_t * p_image, - OPJ_INT32 p_start_x, - OPJ_INT32 p_end_x, - OPJ_INT32 p_start_y, - OPJ_INT32 p_end_y, - struct opj_event_mgr * p_manager); - - /** Get tile function */ - OPJ_BOOL(*opj_get_decoded_tile)(void *p_codec, - opj_stream_private_t * p_cio, - opj_image_t *p_image, - struct opj_event_mgr * p_manager, - OPJ_UINT32 tile_index); - - /** Set the decoded resolution factor */ - OPJ_BOOL(*opj_set_decoded_resolution_factor)(void * p_codec, - OPJ_UINT32 res_factor, - opj_event_mgr_t * p_manager); - - /** Set the decoded components */ 
- OPJ_BOOL(*opj_set_decoded_components)(void * p_codec, - OPJ_UINT32 num_comps, - const OPJ_UINT32* comps_indices, - opj_event_mgr_t * p_manager); - } m_decompression; - - /** - * Compression handler. FIXME DOC - */ - struct opj_compression { - OPJ_BOOL(* opj_start_compress)(void *p_codec, - struct opj_stream_private * cio, - struct opj_image * p_image, - struct opj_event_mgr * p_manager); - - OPJ_BOOL(* opj_encode)(void * p_codec, - struct opj_stream_private *p_cio, - struct opj_event_mgr * p_manager); - - OPJ_BOOL(* opj_write_tile)(void * p_codec, - OPJ_UINT32 p_tile_index, - OPJ_BYTE * p_data, - OPJ_UINT32 p_data_size, - struct opj_stream_private * p_cio, - struct opj_event_mgr * p_manager); - - OPJ_BOOL(* opj_end_compress)(void * p_codec, - struct opj_stream_private * p_cio, - struct opj_event_mgr * p_manager); - - void (* opj_destroy)(void * p_codec); - - OPJ_BOOL(* opj_setup_encoder)(void * p_codec, - opj_cparameters_t * p_param, - struct opj_image * p_image, - struct opj_event_mgr * p_manager); - } m_compression; - } m_codec_data; - /** FIXME DOC*/ - void * m_codec; - /** Event handler */ - opj_event_mgr_t m_event_mgr; - /** Flag to indicate if the codec is used to decode or encode*/ - OPJ_BOOL is_decompressor; - void (*opj_dump_codec)(void * p_codec, OPJ_INT32 info_flag, - FILE* output_stream); - opj_codestream_info_v2_t* (*opj_get_codec_info)(void* p_codec); - opj_codestream_index_t* (*opj_get_codec_index)(void* p_codec); - - /** Set number of threads */ - OPJ_BOOL(*opj_set_threads)(void * p_codec, OPJ_UINT32 num_threads); -} -opj_codec_private_t; - - -#endif /* OPJ_CODEC_H */ - diff --git a/src/3rd/LibOpenJpeg/opj_common.h b/src/3rd/LibOpenJpeg/opj_common.h deleted file mode 100644 index a0513391..00000000 --- a/src/3rd/LibOpenJpeg/opj_common.h +++ /dev/null @@ -1,41 +0,0 @@ -/* - * The copyright in this software is being made available under the 2-clauses - * BSD License, included below. This software may be subject to other third - * party and contributor rights, including patent rights, and no such rights - * are granted under this license. - * - * Copyright (c) 2017, IntoPIX SA - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS' - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- */ -#ifndef OPJ_COMMMON_H -#define OPJ_COMMMON_H - -/* - ========================================================== - Common constants shared among several modules - ========================================================== -*/ -#define OPJ_COMMON_CBLK_DATA_EXTRA 2 /**< Margin for a fake FFFF marker */ - -#endif /* OPJ_COMMMON_H */ diff --git a/src/3rd/LibOpenJpeg/opj_config.h b/src/3rd/LibOpenJpeg/opj_config.h deleted file mode 100644 index 5a62baac..00000000 --- a/src/3rd/LibOpenJpeg/opj_config.h +++ /dev/null @@ -1,10 +0,0 @@ -/* create opj_config.h for CMake */ -#define OPJ_HAVE_STDINT_H 1 - -/*--------------------------------------------------------------------------*/ -/* OpenJPEG Versioning */ - -/* Version number. */ -#define OPJ_VERSION_MAJOR 2 -#define OPJ_VERSION_MINOR 3 -#define OPJ_VERSION_BUILD 0 diff --git a/src/3rd/LibOpenJpeg/opj_config_private.h b/src/3rd/LibOpenJpeg/opj_config_private.h deleted file mode 100644 index 44d3997f..00000000 --- a/src/3rd/LibOpenJpeg/opj_config_private.h +++ /dev/null @@ -1,49 +0,0 @@ -/* create opj_config_private.h for CMake */ -#define OPJ_HAVE_INTTYPES_H 1 - -#define OPJ_PACKAGE_VERSION "2.3.0" - -/* Not used by openjp2*/ -/*#define HAVE_MEMORY_H 1*/ -/*#define HAVE_STDLIB_H 1*/ -/* #undef HAVE_STRINGS_H */ -/*#define HAVE_STRING_H 1*/ -/*#define HAVE_SYS_STAT_H 1*/ -/*#define HAVE_SYS_TYPES_H 1 */ -/* #undef HAVE_UNISTD_H */ - -/* #undef _LARGEFILE_SOURCE */ -/* #undef _LARGE_FILES */ -/* #undef _FILE_OFFSET_BITS */ -/* #undef OPJ_HAVE_FSEEKO */ - -/* find whether or not have */ -#define OPJ_HAVE_MALLOC_H -/* check if function `aligned_alloc` exists */ -/* #undef OPJ_HAVE_ALIGNED_ALLOC */ -/* check if function `_aligned_malloc` exists */ -#define OPJ_HAVE__ALIGNED_MALLOC -/* check if function `memalign` exists */ -/* #undef OPJ_HAVE_MEMALIGN */ -/* check if function `posix_memalign` exists */ -/* #undef OPJ_HAVE_POSIX_MEMALIGN */ - -#if !defined(_POSIX_C_SOURCE) -#if defined(OPJ_HAVE_FSEEKO) || defined(OPJ_HAVE_POSIX_MEMALIGN) -/* Get declarations of fseeko, ftello, posix_memalign. */ -#define _POSIX_C_SOURCE 200112L -#endif -#endif - -/* Byte order. */ -/* All compilers that support Mac OS X define either __BIG_ENDIAN__ or -__LITTLE_ENDIAN__ to match the endianness of the architecture being -compiled for. This is not necessarily the same as the architecture of the -machine doing the building. In order to support Universal Binaries on -Mac OS X, we prefer those defines to decide the endianness. -On other platforms we use the result of the TRY_RUN. */ -#if !defined(__APPLE__) -/* #undef OPJ_BIG_ENDIAN */ -#elif defined(__BIG_ENDIAN__) -# define OPJ_BIG_ENDIAN -#endif diff --git a/src/3rd/LibOpenJpeg/opj_includes.h b/src/3rd/LibOpenJpeg/opj_includes.h deleted file mode 100644 index 0a8628c9..00000000 --- a/src/3rd/LibOpenJpeg/opj_includes.h +++ /dev/null @@ -1,265 +0,0 @@ -/* - * The copyright in this software is being made available under the 2-clauses - * BSD License, included below. This software may be subject to other third - * party and contributor rights, including patent rights, and no such rights - * are granted under this license. - * - * Copyright (c) 2005, Herve Drolon, FreeImage Team - * Copyright (c) 2008, 2011-2012, Centre National d'Etudes Spatiales (CNES), FR - * Copyright (c) 2012, CS Systemes d'Information, France - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. 
Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS' - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ -#ifndef OPJ_INCLUDES_H -#define OPJ_INCLUDES_H - -/* - * This must be included before any system headers, - * since they can react to macro defined there - */ -#include "opj_config_private.h" - -/* - ========================================================== - Standard includes used by the library - ========================================================== -*/ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - Use fseeko() and ftello() if they are available since they use - 'off_t' rather than 'long'. It is wrong to use fseeko() and - ftello() only on systems with special LFS support since some systems - (e.g. FreeBSD) support a 64-bit off_t by default. -*/ -#if defined(OPJ_HAVE_FSEEKO) && !defined(fseek) -# define fseek fseeko -# define ftell ftello -#endif - - -#if defined(WIN32) && !defined(Windows95) && !defined(__BORLANDC__) && \ - !(defined(_MSC_VER) && _MSC_VER < 1400) && \ - !(defined(__MINGW32__) && __MSVCRT_VERSION__ < 0x800) -/* - Windows '95 and Borland C do not support _lseeki64 - Visual Studio does not support _fseeki64 and _ftelli64 until the 2005 release. - Without these interfaces, files over 2GB in size are not supported for Windows. -*/ -# define OPJ_FSEEK(stream,offset,whence) _fseeki64(stream,/* __int64 */ offset,whence) -# define OPJ_FSTAT(fildes,stat_buff) _fstati64(fildes,/* struct _stati64 */ stat_buff) -# define OPJ_FTELL(stream) /* __int64 */ _ftelli64(stream) -# define OPJ_STAT_STRUCT_T struct _stati64 -# define OPJ_STAT(path,stat_buff) _stati64(path,/* struct _stati64 */ stat_buff) -#else -# define OPJ_FSEEK(stream,offset,whence) fseek(stream,offset,whence) -# define OPJ_FSTAT(fildes,stat_buff) fstat(fildes,stat_buff) -# define OPJ_FTELL(stream) ftell(stream) -# define OPJ_STAT_STRUCT_T struct stat -# define OPJ_STAT(path,stat_buff) stat(path,stat_buff) -#endif - - -/* - ========================================================== - OpenJPEG interface - ========================================================== - */ -#include "openjpeg.h" - -/* - ========================================================== - OpenJPEG modules - ========================================================== -*/ - -/* Are restricted pointers available? 
(C99) */ -#if (__STDC_VERSION__ >= 199901L) -#define OPJ_RESTRICT restrict -#else -/* Not a C99 compiler */ -#if defined(__GNUC__) -#define OPJ_RESTRICT __restrict__ - -/* - vc14 (2015) outputs wrong results. - Need to check OPJ_RESTRICT usage (or a bug in vc14) - #elif defined(_MSC_VER) && (_MSC_VER >= 1400) - #define OPJ_RESTRICT __restrict -*/ -#else -#define OPJ_RESTRICT /* restrict */ -#endif -#endif - -#ifdef __has_attribute -#if __has_attribute(no_sanitize) -#define OPJ_NOSANITIZE(kind) __attribute__((no_sanitize(kind))) -#endif -#endif -#ifndef OPJ_NOSANITIZE -#define OPJ_NOSANITIZE(kind) -#endif - - -/* MSVC before 2013 and Borland C do not have lrintf */ -#if defined(_MSC_VER) -#include -static INLINE long opj_lrintf(float f) -{ -#ifdef _M_X64 - return _mm_cvt_ss2si(_mm_load_ss(&f)); - - /* commented out line breaks many tests */ - /* return (long)((f>0.0f) ? (f + 0.5f):(f -0.5f)); */ -#elif defined(_M_IX86) - int i; - _asm{ - fld f - fistp i - }; - - return i; -#else - return (long)((f>0.0f) ? (f + 0.5f) : (f - 0.5f)); -#endif -} -#elif defined(__BORLANDC__) -static INLINE long opj_lrintf(float f) -{ -#ifdef _M_X64 - return (long)((f > 0.0f) ? (f + 0.5f) : (f - 0.5f)); -#else - int i; - - _asm { - fld f - fistp i - }; - - return i; -#endif -} -#else -static INLINE long opj_lrintf(float f) -{ - return lrintf(f); -} -#endif - -#if defined(_MSC_VER) && (_MSC_VER < 1400) -#define vsnprintf _vsnprintf -#endif - -/* MSVC x86 is really bad at doing int64 = int32 * int32 on its own. Use intrinsic. */ -#if defined(_MSC_VER) && (_MSC_VER >= 1400) && !defined(__INTEL_COMPILER) && defined(_M_IX86) -# include -# pragma intrinsic(__emul) -#endif - -/* Apparently Visual Studio doesn't define __SSE__ / __SSE2__ macros */ -#if defined(_M_X64) -/* Intel 64bit support SSE and SSE2 */ -# ifndef __SSE__ -# define __SSE__ 1 -# endif -# ifndef __SSE2__ -# define __SSE2__ 1 -# endif -#endif - -/* For x86, test the value of the _M_IX86_FP macro. */ -/* See https://msdn.microsoft.com/en-us/library/b0084kay.aspx */ -#if defined(_M_IX86_FP) -# if _M_IX86_FP >= 1 -# ifndef __SSE__ -# define __SSE__ 1 -# endif -# endif -# if _M_IX86_FP >= 2 -# ifndef __SSE2__ -# define __SSE2__ 1 -# endif -# endif -#endif - -/* Type to use for bit-fields in internal headers */ -typedef unsigned int OPJ_BITFIELD; - -#define OPJ_UNUSED(x) (void)x - -#include "opj_inttypes.h" -#include "opj_clock.h" -#include "opj_malloc.h" -#include "event.h" -#include "function_list.h" -#include "bio.h" -#include "cio.h" - -#include "thread.h" -#include "tls_keys.h" - -#include "image.h" -#include "invert.h" -#include "j2k.h" -#include "jp2.h" - -#include "mqc.h" -#include "bio.h" - -#include "pi.h" -#include "tgt.h" -#include "tcd.h" -#include "t1.h" -#include "dwt.h" -#include "t2.h" -#include "mct.h" -#include "opj_intmath.h" -#include "sparse_array.h" - -#ifdef USE_JPIP -#include "cidx_manager.h" -#include "indexbox_manager.h" -#endif - -/* JPWL>> */ -#ifdef USE_JPWL -#include "openjpwl/jpwl.h" -#endif /* USE_JPWL */ -/* < b else b -*/ -static INLINE OPJ_INT32 opj_int_max(OPJ_INT32 a, OPJ_INT32 b) -{ - return (a > b) ? a : b; -} - -/** -Get the maximum of two integers -@return Returns a if a > b else b -*/ -static INLINE OPJ_UINT32 opj_uint_max(OPJ_UINT32 a, OPJ_UINT32 b) -{ - return (a > b) ? 
a : b; -} - -/** - Get the saturated sum of two unsigned integers - @return Returns saturated sum of a+b - */ -static INLINE OPJ_UINT32 opj_uint_adds(OPJ_UINT32 a, OPJ_UINT32 b) -{ - OPJ_UINT64 sum = (OPJ_UINT64)a + (OPJ_UINT64)b; - return (OPJ_UINT32)(-(OPJ_INT32)(sum >> 32)) | (OPJ_UINT32)sum; -} - -/** - Get the saturated difference of two unsigned integers - @return Returns saturated sum of a-b - */ -static INLINE OPJ_UINT32 opj_uint_subs(OPJ_UINT32 a, OPJ_UINT32 b) -{ - return (a >= b) ? a - b : 0; -} - -/** -Clamp an integer inside an interval -@return -
-<ul>
-<li>Returns a if (min < a < max)
-<li>Returns max if (a > max)
-<li>Returns min if (a < min)
-</ul>
-*/ -static INLINE OPJ_INT32 opj_int_clamp(OPJ_INT32 a, OPJ_INT32 min, - OPJ_INT32 max) -{ - if (a < min) { - return min; - } - if (a > max) { - return max; - } - return a; -} - -/** -Clamp an integer inside an interval -@return -
-<ul>
-<li>Returns a if (min < a < max)
-<li>Returns max if (a > max)
-<li>Returns min if (a < min)
-</ul>
-*/ -static INLINE OPJ_INT64 opj_int64_clamp(OPJ_INT64 a, OPJ_INT64 min, - OPJ_INT64 max) -{ - if (a < min) { - return min; - } - if (a > max) { - return max; - } - return a; -} - -/** -@return Get absolute value of integer -*/ -static INLINE OPJ_INT32 opj_int_abs(OPJ_INT32 a) -{ - return a < 0 ? -a : a; -} -/** -Divide an integer and round upwards -@return Returns a divided by b -*/ -static INLINE OPJ_INT32 opj_int_ceildiv(OPJ_INT32 a, OPJ_INT32 b) -{ - assert(b); - return (OPJ_INT32)(((OPJ_INT64)a + b - 1) / b); -} - -/** -Divide an integer and round upwards -@return Returns a divided by b -*/ -static INLINE OPJ_UINT32 opj_uint_ceildiv(OPJ_UINT32 a, OPJ_UINT32 b) -{ - assert(b); - return (a + b - 1) / b; -} - -/** -Divide an integer by a power of 2 and round upwards -@return Returns a divided by 2^b -*/ -static INLINE OPJ_INT32 opj_int_ceildivpow2(OPJ_INT32 a, OPJ_INT32 b) -{ - return (OPJ_INT32)((a + ((OPJ_INT64)1 << b) - 1) >> b); -} - -/** - Divide a 64bits integer by a power of 2 and round upwards - @return Returns a divided by 2^b - */ -static INLINE OPJ_INT32 opj_int64_ceildivpow2(OPJ_INT64 a, OPJ_INT32 b) -{ - return (OPJ_INT32)((a + ((OPJ_INT64)1 << b) - 1) >> b); -} - -/** - Divide an integer by a power of 2 and round upwards - @return Returns a divided by 2^b - */ -static INLINE OPJ_UINT32 opj_uint_ceildivpow2(OPJ_UINT32 a, OPJ_UINT32 b) -{ - return (OPJ_UINT32)((a + ((OPJ_UINT64)1U << b) - 1U) >> b); -} - -/** -Divide an integer by a power of 2 and round downwards -@return Returns a divided by 2^b -*/ -static INLINE OPJ_INT32 opj_int_floordivpow2(OPJ_INT32 a, OPJ_INT32 b) -{ - return a >> b; -} -/** -Get logarithm of an integer and round downwards -@return Returns log2(a) -*/ -static INLINE OPJ_INT32 opj_int_floorlog2(OPJ_INT32 a) -{ - OPJ_INT32 l; - for (l = 0; a > 1; l++) { - a >>= 1; - } - return l; -} -/** -Get logarithm of an integer and round downwards -@return Returns log2(a) -*/ -static INLINE OPJ_UINT32 opj_uint_floorlog2(OPJ_UINT32 a) -{ - OPJ_UINT32 l; - for (l = 0; a > 1; ++l) { - a >>= 1; - } - return l; -} - -/** -Multiply two fixed-precision rational numbers. 
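[Editor's note] Two of the helpers above rely on branchless idioms that are easy to misread: opj_uint_adds widens to 64 bits so that any carry out of bit 31 becomes an all-ones mask, and the ceildivpow2 family adds 2^b - 1 before shifting. A small self-contained check of both idioms (plain C, not library code):

    #include <assert.h>
    #include <stdint.h>

    /* Same trick as opj_uint_adds: on 32-bit overflow, (sum >> 32) is 1 and
     * -(int32_t)1 ORs in an all-ones saturation mask. */
    static uint32_t uint_adds(uint32_t a, uint32_t b)
    {
        uint64_t sum = (uint64_t)a + (uint64_t)b;
        return (uint32_t)(-(int32_t)(sum >> 32)) | (uint32_t)sum;
    }

    /* Same trick as opj_int_ceildivpow2: add 2^b - 1, then shift right. */
    static int32_t ceildivpow2(int32_t a, int32_t b)
    {
        return (int32_t)(((int64_t)a + ((int64_t)1 << b) - 1) >> b);
    }

    int main(void)
    {
        assert(uint_adds(10u, 20u) == 30u);                /* no carry: plain sum */
        assert(uint_adds(0xFFFFFFFFu, 1u) == 0xFFFFFFFFu); /* carry: saturates */
        assert(ceildivpow2(5, 1) == 3);                    /* ceil(5 / 2) */
        assert(ceildivpow2(4, 2) == 1);                    /* ceil(4 / 4) */
        return 0;
    }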
-@param a -@param b -@return Returns a * b -*/ -static INLINE OPJ_INT32 opj_int_fix_mul(OPJ_INT32 a, OPJ_INT32 b) -{ -#if defined(_MSC_VER) && (_MSC_VER >= 1400) && !defined(__INTEL_COMPILER) && defined(_M_IX86) - OPJ_INT64 temp = __emul(a, b); -#else - OPJ_INT64 temp = (OPJ_INT64) a * (OPJ_INT64) b ; -#endif - temp += 4096; - assert((temp >> 13) <= (OPJ_INT64)0x7FFFFFFF); - assert((temp >> 13) >= (-(OPJ_INT64)0x7FFFFFFF - (OPJ_INT64)1)); - return (OPJ_INT32)(temp >> 13); -} - -static INLINE OPJ_INT32 opj_int_fix_mul_t1(OPJ_INT32 a, OPJ_INT32 b) -{ -#if defined(_MSC_VER) && (_MSC_VER >= 1400) && !defined(__INTEL_COMPILER) && defined(_M_IX86) - OPJ_INT64 temp = __emul(a, b); -#else - OPJ_INT64 temp = (OPJ_INT64) a * (OPJ_INT64) b ; -#endif - temp += 4096; - assert((temp >> (13 + 11 - T1_NMSEDEC_FRACBITS)) <= (OPJ_INT64)0x7FFFFFFF); - assert((temp >> (13 + 11 - T1_NMSEDEC_FRACBITS)) >= (-(OPJ_INT64)0x7FFFFFFF - - (OPJ_INT64)1)); - return (OPJ_INT32)(temp >> (13 + 11 - T1_NMSEDEC_FRACBITS)) ; -} - -/* ----------------------------------------------------------------------- */ -/*@}*/ - -/*@}*/ - -#endif /* OPJ_INTMATH_H */ diff --git a/src/3rd/LibOpenJpeg/opj_inttypes.h b/src/3rd/LibOpenJpeg/opj_inttypes.h deleted file mode 100644 index 2c9749a1..00000000 --- a/src/3rd/LibOpenJpeg/opj_inttypes.h +++ /dev/null @@ -1,48 +0,0 @@ -/* - * The copyright in this software is being made available under the 2-clauses - * BSD License, included below. This software may be subject to other third - * party and contributor rights, including patent rights, and no such rights - * are granted under this license. - * - * Copyright (c) 2012, Mathieu Malaterre - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS' - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
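[Editor's note] opj_int_fix_mul above is a Q13 fixed-point multiply: the 64-bit product of two Q13 values carries 26 fractional bits, and adding 4096 (2^12, half an ulp of the result) before the right shift by 13 rounds to nearest. A hedged standalone sketch of that arithmetic:

    #include <assert.h>
    #include <stdint.h>

    #define FRAC_BITS 13                  /* matches the >> 13 in opj_int_fix_mul */
    #define ONE ((int32_t)1 << FRAC_BITS) /* 1.0 in Q13 */

    /* The product of two Q13 numbers is Q26; +4096 rounds to nearest before
     * dropping 13 fractional bits to get back to Q13. */
    static int32_t fix_mul(int32_t a, int32_t b)
    {
        int64_t temp = (int64_t)a * (int64_t)b;
        temp += 4096;
        return (int32_t)(temp >> FRAC_BITS);
    }

    int main(void)
    {
        assert(fix_mul(ONE, ONE) == ONE);             /* 1.0 * 1.0 == 1.0  */
        assert(fix_mul(ONE / 2, ONE / 2) == ONE / 4); /* 0.5 * 0.5 == 0.25 */
        return 0;
    }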
- */ -#ifndef OPJ_INTTYPES_H -#define OPJ_INTTYPES_H - -#include "opj_config_private.h" -#ifdef OPJ_HAVE_INTTYPES_H -#include -#else -#if defined(_WIN32) -#define PRId64 "I64d" -#define PRIi64 "I64i" -#define PRIu64 "I64u" -#define PRIx64 "I64x" -#else -#error unsupported platform -#endif -#endif - -#endif /* OPJ_INTTYPES_H */ diff --git a/src/3rd/LibOpenJpeg/opj_malloc.c b/src/3rd/LibOpenJpeg/opj_malloc.c deleted file mode 100644 index dca91bfc..00000000 --- a/src/3rd/LibOpenJpeg/opj_malloc.c +++ /dev/null @@ -1,249 +0,0 @@ -/* - * The copyright in this software is being made available under the 2-clauses - * BSD License, included below. This software may be subject to other third - * party and contributor rights, including patent rights, and no such rights - * are granted under this license. - * - * Copyright (c) 2015, Mathieu Malaterre - * Copyright (c) 2015, Matthieu Darbois - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS' - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ -#define OPJ_SKIP_POISON -#include "opj_includes.h" - -#if defined(OPJ_HAVE_MALLOC_H) && defined(OPJ_HAVE_MEMALIGN) -# include -#endif - -#ifndef SIZE_MAX -# define SIZE_MAX ((size_t) -1) -#endif - -static INLINE void *opj_aligned_alloc_n(size_t alignment, size_t size) -{ - void* ptr; - - /* alignment shall be power of 2 */ - assert((alignment != 0U) && ((alignment & (alignment - 1U)) == 0U)); - /* alignment shall be at least sizeof(void*) */ - assert(alignment >= sizeof(void*)); - - if (size == 0U) { /* prevent implementation defined behavior of realloc */ - return NULL; - } - -#if defined(OPJ_HAVE_POSIX_MEMALIGN) - /* aligned_alloc requires c11, restrict to posix_memalign for now. Quote: - * This function was introduced in POSIX 1003.1d. Although this function is - * superseded by aligned_alloc, it is more portable to older POSIX systems - * that do not support ISO C11. */ - if (posix_memalign(&ptr, alignment, size)) { - ptr = NULL; - } - /* older linux */ -#elif defined(OPJ_HAVE_MEMALIGN) - ptr = memalign(alignment, size); - /* _MSC_VER */ -#elif defined(OPJ_HAVE__ALIGNED_MALLOC) - ptr = _aligned_malloc(size, alignment); -#else - /* - * Generic aligned malloc implementation. 
- * Uses size_t offset for the integer manipulation of the pointer, - * as uintptr_t is not available in C89 to do - * bitwise operations on the pointer itself. - */ - alignment--; - { - size_t offset; - OPJ_UINT8 *mem; - - /* Room for padding and extra pointer stored in front of allocated area */ - size_t overhead = alignment + sizeof(void *); - - /* let's be extra careful */ - assert(alignment <= (SIZE_MAX - sizeof(void *))); - - /* Avoid integer overflow */ - if (size > (SIZE_MAX - overhead)) { - return NULL; - } - - mem = (OPJ_UINT8*)malloc(size + overhead); - if (mem == NULL) { - return mem; - } - /* offset = ((alignment + 1U) - ((size_t)(mem + sizeof(void*)) & alignment)) & alignment; */ - /* Use the fact that alignment + 1U is a power of 2 */ - offset = ((alignment ^ ((size_t)(mem + sizeof(void*)) & alignment)) + 1U) & - alignment; - ptr = (void *)(mem + sizeof(void*) + offset); - ((void**) ptr)[-1] = mem; - } -#endif - return ptr; -} -static INLINE void *opj_aligned_realloc_n(void *ptr, size_t alignment, - size_t new_size) -{ - void *r_ptr; - - /* alignment shall be power of 2 */ - assert((alignment != 0U) && ((alignment & (alignment - 1U)) == 0U)); - /* alignment shall be at least sizeof(void*) */ - assert(alignment >= sizeof(void*)); - - if (new_size == 0U) { /* prevent implementation defined behavior of realloc */ - return NULL; - } - - /* no portable aligned realloc */ -#if defined(OPJ_HAVE_POSIX_MEMALIGN) || defined(OPJ_HAVE_MEMALIGN) - /* glibc doc states one can mix aligned malloc with realloc */ - r_ptr = realloc(ptr, new_size); /* fast path */ - /* we simply use `size_t` to cast, since we are only interest in binary AND - * operator */ - if (((size_t)r_ptr & (alignment - 1U)) != 0U) { - /* this is non-trivial to implement a portable aligned realloc, so use a - * simple approach where we do not need a function that return the size of an - * allocated array (eg. 
_msize on Windows, malloc_size on MacOS, - * malloc_usable_size on systems with glibc) */ - void *a_ptr = opj_aligned_alloc_n(alignment, new_size); - if (a_ptr != NULL) { - memcpy(a_ptr, r_ptr, new_size); - } - free(r_ptr); - r_ptr = a_ptr; - } - /* _MSC_VER */ -#elif defined(OPJ_HAVE__ALIGNED_MALLOC) - r_ptr = _aligned_realloc(ptr, new_size, alignment); -#else - if (ptr == NULL) { - return opj_aligned_alloc_n(alignment, new_size); - } - alignment--; - { - void *oldmem; - OPJ_UINT8 *newmem; - size_t overhead = alignment + sizeof(void *); - - /* let's be extra careful */ - assert(alignment <= (SIZE_MAX - sizeof(void *))); - - /* Avoid integer overflow */ - if (new_size > SIZE_MAX - overhead) { - return NULL; - } - - oldmem = ((void**) ptr)[-1]; - newmem = (OPJ_UINT8*)realloc(oldmem, new_size + overhead); - if (newmem == NULL) { - return newmem; - } - - if (newmem == oldmem) { - r_ptr = ptr; - } else { - size_t old_offset; - size_t new_offset; - - /* realloc created a new copy, realign the copied memory block */ - old_offset = (size_t)((OPJ_UINT8*)ptr - (OPJ_UINT8*)oldmem); - - /* offset = ((alignment + 1U) - ((size_t)(mem + sizeof(void*)) & alignment)) & alignment; */ - /* Use the fact that alignment + 1U is a power of 2 */ - new_offset = ((alignment ^ ((size_t)(newmem + sizeof(void*)) & alignment)) + - 1U) & alignment; - new_offset += sizeof(void*); - r_ptr = (void *)(newmem + new_offset); - - if (new_offset != old_offset) { - memmove(newmem + new_offset, newmem + old_offset, new_size); - } - ((void**) r_ptr)[-1] = newmem; - } - } -#endif - return r_ptr; -} -void * opj_malloc(size_t size) -{ - if (size == 0U) { /* prevent implementation defined behavior of realloc */ - return NULL; - } - return malloc(size); -} -void * opj_calloc(size_t num, size_t size) -{ - if (num == 0 || size == 0) { - /* prevent implementation defined behavior of realloc */ - return NULL; - } - return calloc(num, size); -} - -void *opj_aligned_malloc(size_t size) -{ - return opj_aligned_alloc_n(16U, size); -} -void * opj_aligned_realloc(void *ptr, size_t size) -{ - return opj_aligned_realloc_n(ptr, 16U, size); -} - -void *opj_aligned_32_malloc(size_t size) -{ - return opj_aligned_alloc_n(32U, size); -} -void * opj_aligned_32_realloc(void *ptr, size_t size) -{ - return opj_aligned_realloc_n(ptr, 32U, size); -} - -void opj_aligned_free(void* ptr) -{ -#if defined(OPJ_HAVE_POSIX_MEMALIGN) || defined(OPJ_HAVE_MEMALIGN) - free(ptr); -#elif defined(OPJ_HAVE__ALIGNED_MALLOC) - _aligned_free(ptr); -#else - /* Generic implementation has malloced pointer stored in front of used area */ - if (ptr != NULL) { - free(((void**) ptr)[-1]); - } -#endif -} - -void * opj_realloc(void *ptr, size_t new_size) -{ - if (new_size == 0U) { /* prevent implementation defined behavior of realloc */ - return NULL; - } - return realloc(ptr, new_size); -} -void opj_free(void *ptr) -{ - free(ptr); -} diff --git a/src/3rd/LibOpenJpeg/opj_malloc.h b/src/3rd/LibOpenJpeg/opj_malloc.h deleted file mode 100644 index cbc4106c..00000000 --- a/src/3rd/LibOpenJpeg/opj_malloc.h +++ /dev/null @@ -1,106 +0,0 @@ -/* - * The copyright in this software is being made available under the 2-clauses - * BSD License, included below. This software may be subject to other third - * party and contributor rights, including patent rights, and no such rights - * are granted under this license. - * - * Copyright (c) 2005, Herve Drolon, FreeImage Team - * Copyright (c) 2007, Callum Lerwick - * All rights reserved. 
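[Editor's note] The generic fallback in opj_aligned_alloc_n above over-allocates by alignment - 1 + sizeof(void *), rounds the first usable address up, and stashes the raw malloc() result in the slot just in front of the aligned block so the matching free can recover it. A simplified standalone version of the same technique (not the library's exact code; alignment must be a power of two):

    #include <stdint.h>
    #include <stdlib.h>

    static void *aligned_alloc_generic(size_t alignment, size_t size)
    {
        size_t overhead = alignment - 1 + sizeof(void *);
        size_t offset;
        unsigned char *mem, *aligned;

        if (size == 0 || size > SIZE_MAX - overhead) {
            return NULL;
        }
        mem = (unsigned char *)malloc(size + overhead);
        if (mem == NULL) {
            return NULL;
        }
        /* Distance from the first candidate address up to the next multiple
         * of alignment (0 if it is already aligned). */
        offset = (alignment - ((size_t)(mem + sizeof(void *)) & (alignment - 1)))
                 & (alignment - 1);
        aligned = mem + sizeof(void *) + offset;
        ((void **)aligned)[-1] = mem; /* remember what to hand back to free() */
        return aligned;
    }

    static void aligned_free_generic(void *ptr)
    {
        if (ptr != NULL) {
            free(((void **)ptr)[-1]);
        }
    }

    int main(void)
    {
        void *p = aligned_alloc_generic(32, 100);
        aligned_free_generic(p); /* safe on NULL as well */
        return 0;
    }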
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS' - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ -#ifndef OPJ_MALLOC_H -#define OPJ_MALLOC_H - -#include -/** -@file opj_malloc.h -@brief Internal functions - -The functions in opj_malloc.h are internal utilities used for memory management. -*/ - -/** @defgroup MISC MISC - Miscellaneous internal functions */ -/*@{*/ - -/** @name Exported functions */ -/*@{*/ -/* ----------------------------------------------------------------------- */ - -/** -Allocate an uninitialized memory block -@param size Bytes to allocate -@return Returns a void pointer to the allocated space, or NULL if there is insufficient memory available -*/ -void * opj_malloc(size_t size); - -/** -Allocate a memory block with elements initialized to 0 -@param numOfElements Blocks to allocate -@param sizeOfElements Bytes per block to allocate -@return Returns a void pointer to the allocated space, or NULL if there is insufficient memory available -*/ -void * opj_calloc(size_t numOfElements, size_t sizeOfElements); - -/** -Allocate memory aligned to a 16 byte boundary -@param size Bytes to allocate -@return Returns a void pointer to the allocated space, or NULL if there is insufficient memory available -*/ -void * opj_aligned_malloc(size_t size); -void * opj_aligned_realloc(void *ptr, size_t size); -void opj_aligned_free(void* ptr); - -/** -Allocate memory aligned to a 32 byte boundary -@param size Bytes to allocate -@return Returns a void pointer to the allocated space, or NULL if there is insufficient memory available -*/ -void * opj_aligned_32_malloc(size_t size); -void * opj_aligned_32_realloc(void *ptr, size_t size); - -/** -Reallocate memory blocks. -@param m Pointer to previously allocated memory block -@param s New size in bytes -@return Returns a void pointer to the reallocated (and possibly moved) memory block -*/ -void * opj_realloc(void * m, size_t s); - -/** -Deallocates or frees a memory block. 
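[Editor's note] A short usage sketch for the wrappers declared in this header (a hypothetical caller, not library code). Note the #pragma GCC poison further below, which turns any direct malloc/calloc/realloc/free call inside the library into a compile error, so every allocation is forced through these functions:

    /* Hypothetical caller; assumes openjpeg.h has defined OPJ_INT32. */
    #include <stdint.h>
    #include "opj_malloc.h"

    static OPJ_INT32 *make_scratch(size_t n)
    {
        /* 16-byte alignment suits the SSE code paths; the buffer must be
         * released with opj_aligned_free, never plain free(). */
        if (n > SIZE_MAX / sizeof(OPJ_INT32)) {
            return NULL; /* avoid overflow in the size computation */
        }
        return (OPJ_INT32 *)opj_aligned_malloc(n * sizeof(OPJ_INT32));
    }

    static void drop_scratch(OPJ_INT32 *buf)
    {
        opj_aligned_free(buf); /* NULL-safe */
    }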
-@param m Previously allocated memory block to be freed -*/ -void opj_free(void * m); - -#if defined(__GNUC__) && !defined(OPJ_SKIP_POISON) -#pragma GCC poison malloc calloc realloc free -#endif - -/* ----------------------------------------------------------------------- */ -/*@}*/ - -/*@}*/ - -#endif /* OPJ_MALLOC_H */ - diff --git a/src/3rd/LibOpenJpeg/opj_stdint.h b/src/3rd/LibOpenJpeg/opj_stdint.h deleted file mode 100644 index f26c921c..00000000 --- a/src/3rd/LibOpenJpeg/opj_stdint.h +++ /dev/null @@ -1,52 +0,0 @@ -/* - * The copyright in this software is being made available under the 2-clauses - * BSD License, included below. This software may be subject to other third - * party and contributor rights, including patent rights, and no such rights - * are granted under this license. - * - * Copyright (c) 2012, Mathieu Malaterre - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS' - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ -#ifndef OPJ_STDINT_H -#define OPJ_STDINT_H - -#include "opj_config.h" -#ifdef OPJ_HAVE_STDINT_H -#include -#else -#if defined(_WIN32) -typedef signed __int8 int8_t; -typedef unsigned __int8 uint8_t; -typedef signed __int16 int16_t; -typedef unsigned __int16 uint16_t; -typedef signed __int32 int32_t; -typedef unsigned __int32 uint32_t; -typedef signed __int64 int64_t; -typedef unsigned __int64 uint64_t; -#else -#error unsupported platform -#endif -#endif - -#endif /* OPJ_STDINT_H */ diff --git a/src/3rd/LibOpenJpeg/pi.c b/src/3rd/LibOpenJpeg/pi.c deleted file mode 100644 index 91642ee4..00000000 --- a/src/3rd/LibOpenJpeg/pi.c +++ /dev/null @@ -1,2086 +0,0 @@ -/* - * The copyright in this software is being made available under the 2-clauses - * BSD License, included below. This software may be subject to other third - * party and contributor rights, including patent rights, and no such rights - * are granted under this license. 
- * - * Copyright (c) 2002-2014, Universite catholique de Louvain (UCL), Belgium - * Copyright (c) 2002-2014, Professor Benoit Macq - * Copyright (c) 2001-2003, David Janssens - * Copyright (c) 2002-2003, Yannick Verschueren - * Copyright (c) 2003-2007, Francois-Olivier Devaux - * Copyright (c) 2003-2014, Antonin Descampe - * Copyright (c) 2005, Herve Drolon, FreeImage Team - * Copyright (c) 2006-2007, Parvatha Elangovan - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS' - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#include "opj_includes.h" - -/** @defgroup PI PI - Implementation of a packet iterator */ -/*@{*/ - -/** @name Local static functions */ -/*@{*/ - -/** -Get next packet in layer-resolution-component-precinct order. -@param pi packet iterator to modify -@return returns false if pi pointed to the last packet or else returns true -*/ -static OPJ_BOOL opj_pi_next_lrcp(opj_pi_iterator_t * pi); -/** -Get next packet in resolution-layer-component-precinct order. -@param pi packet iterator to modify -@return returns false if pi pointed to the last packet or else returns true -*/ -static OPJ_BOOL opj_pi_next_rlcp(opj_pi_iterator_t * pi); -/** -Get next packet in resolution-precinct-component-layer order. -@param pi packet iterator to modify -@return returns false if pi pointed to the last packet or else returns true -*/ -static OPJ_BOOL opj_pi_next_rpcl(opj_pi_iterator_t * pi); -/** -Get next packet in precinct-component-resolution-layer order. -@param pi packet iterator to modify -@return returns false if pi pointed to the last packet or else returns true -*/ -static OPJ_BOOL opj_pi_next_pcrl(opj_pi_iterator_t * pi); -/** -Get next packet in component-precinct-resolution-layer order. -@param pi packet iterator to modify -@return returns false if pi pointed to the last packet or else returns true -*/ -static OPJ_BOOL opj_pi_next_cprl(opj_pi_iterator_t * pi); - -/** - * Updates the coding parameters if the encoding is used with Progression order changes and final (or cinema parameters are used). - * - * @param p_cp the coding parameters to modify - * @param p_tileno the tile index being concerned. 
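[Editor's note] The five opj_pi_next_* variants declared above differ only in how the four packet loops nest; the order's initials name the nesting from outermost to innermost. For LRCP (layer-resolution-component-precinct) the skeleton is as below; the real iterator resumes mid-loop through a goto so it can yield one packet per call, but the nesting order is exactly this (emit_packet is a placeholder, not a library function):

    static void iterate_lrcp(unsigned layers, unsigned resolutions,
                             unsigned comps, unsigned precincts,
                             void (*emit_packet)(unsigned layno, unsigned resno,
                                                 unsigned compno, unsigned precno))
    {
        unsigned layno, resno, compno, precno;
        for (layno = 0; layno < layers; layno++)              /* L: quality layers */
            for (resno = 0; resno < resolutions; resno++)     /* R: resolutions */
                for (compno = 0; compno < comps; compno++)    /* C: components */
                    for (precno = 0; precno < precincts; precno++) /* P: precincts */
                        emit_packet(layno, resno, compno, precno);
    }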
- * @param p_tx0 X0 parameter for the tile - * @param p_tx1 X1 parameter for the tile - * @param p_ty0 Y0 parameter for the tile - * @param p_ty1 Y1 parameter for the tile - * @param p_max_prec the maximum precision for all the bands of the tile - * @param p_max_res the maximum number of resolutions for all the poc inside the tile. - * @param p_dx_min the minimum dx of all the components of all the resolutions for the tile. - * @param p_dy_min the minimum dy of all the components of all the resolutions for the tile. - */ -static void opj_pi_update_encode_poc_and_final(opj_cp_t *p_cp, - OPJ_UINT32 p_tileno, - OPJ_INT32 p_tx0, - OPJ_INT32 p_tx1, - OPJ_INT32 p_ty0, - OPJ_INT32 p_ty1, - OPJ_UINT32 p_max_prec, - OPJ_UINT32 p_max_res, - OPJ_UINT32 p_dx_min, - OPJ_UINT32 p_dy_min); - -/** - * Updates the coding parameters if the encoding is not used with Progression order changes and final (and cinema parameters are used). - * - * @param p_cp the coding parameters to modify - * @param p_num_comps the number of components - * @param p_tileno the tile index being concerned. - * @param p_tx0 X0 parameter for the tile - * @param p_tx1 X1 parameter for the tile - * @param p_ty0 Y0 parameter for the tile - * @param p_ty1 Y1 parameter for the tile - * @param p_max_prec the maximum precision for all the bands of the tile - * @param p_max_res the maximum number of resolutions for all the poc inside the tile. - * @param p_dx_min the minimum dx of all the components of all the resolutions for the tile. - * @param p_dy_min the minimum dy of all the components of all the resolutions for the tile. - */ -static void opj_pi_update_encode_not_poc(opj_cp_t *p_cp, - OPJ_UINT32 p_num_comps, - OPJ_UINT32 p_tileno, - OPJ_INT32 p_tx0, - OPJ_INT32 p_tx1, - OPJ_INT32 p_ty0, - OPJ_INT32 p_ty1, - OPJ_UINT32 p_max_prec, - OPJ_UINT32 p_max_res, - OPJ_UINT32 p_dx_min, - OPJ_UINT32 p_dy_min); -/** - * Gets the encoding parameters needed to update the coding parameters and all the pocs. - * - * @param p_image the image being encoded. - * @param p_cp the coding parameters. - * @param tileno the tile index of the tile being encoded. - * @param p_tx0 pointer that will hold the X0 parameter for the tile - * @param p_tx1 pointer that will hold the X1 parameter for the tile - * @param p_ty0 pointer that will hold the Y0 parameter for the tile - * @param p_ty1 pointer that will hold the Y1 parameter for the tile - * @param p_max_prec pointer that will hold the maximum precision for all the bands of the tile - * @param p_max_res pointer that will hold the maximum number of resolutions for all the poc inside the tile. - * @param p_dx_min pointer that will hold the minimum dx of all the components of all the resolutions for the tile. - * @param p_dy_min pointer that will hold the minimum dy of all the components of all the resolutions for the tile. - */ -static void opj_get_encoding_parameters(const opj_image_t *p_image, - const opj_cp_t *p_cp, - OPJ_UINT32 tileno, - OPJ_INT32 * p_tx0, - OPJ_INT32 * p_tx1, - OPJ_INT32 * p_ty0, - OPJ_INT32 * p_ty1, - OPJ_UINT32 * p_dx_min, - OPJ_UINT32 * p_dy_min, - OPJ_UINT32 * p_max_prec, - OPJ_UINT32 * p_max_res); - -/** - * Gets the encoding parameters needed to update the coding parameters and all the pocs. - * The precinct widths, heights, dx and dy for each component at each resolution will be stored as well. - * the last parameter of the function should be an array of pointers of size nb components, each pointer leading - * to an area of size 4 * max_res. 
The data is stored inside this area with the following pattern : - * dx_compi_res0 , dy_compi_res0 , w_compi_res0, h_compi_res0 , dx_compi_res1 , dy_compi_res1 , w_compi_res1, h_compi_res1 , ... - * - * @param p_image the image being encoded. - * @param p_cp the coding parameters. - * @param tileno the tile index of the tile being encoded. - * @param p_tx0 pointer that will hold the X0 parameter for the tile - * @param p_tx1 pointer that will hold the X1 parameter for the tile - * @param p_ty0 pointer that will hold the Y0 parameter for the tile - * @param p_ty1 pointer that will hold the Y1 parameter for the tile - * @param p_max_prec pointer that will hold the maximum precision for all the bands of the tile - * @param p_max_res pointer that will hold the maximum number of resolutions for all the poc inside the tile. - * @param p_dx_min pointer that will hold the minimum dx of all the components of all the resolutions for the tile. - * @param p_dy_min pointer that will hold the minimum dy of all the components of all the resolutions for the tile. - * @param p_resolutions pointer to an area corresponding to the one described above. - */ -static void opj_get_all_encoding_parameters(const opj_image_t *p_image, - const opj_cp_t *p_cp, - OPJ_UINT32 tileno, - OPJ_INT32 * p_tx0, - OPJ_INT32 * p_tx1, - OPJ_INT32 * p_ty0, - OPJ_INT32 * p_ty1, - OPJ_UINT32 * p_dx_min, - OPJ_UINT32 * p_dy_min, - OPJ_UINT32 * p_max_prec, - OPJ_UINT32 * p_max_res, - OPJ_UINT32 ** p_resolutions); -/** - * Allocates memory for a packet iterator. Data and data sizes are set by this operation. - * No other data is set. The include section of the packet iterator is not allocated. - * - * @param p_image the image used to initialize the packet iterator (in fact only the number of components is relevant. - * @param p_cp the coding parameters. - * @param tileno the index of the tile from which creating the packet iterator. 
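[Editor's note] The p_resolutions scratch layout described above is one array of pointers per component, each pointing at 4 * max_res words filled as (dx, dy, w, h) tuples, one tuple per resolution. A hedged sketch of allocating and indexing such an area (error handling trimmed; names are illustrative):

    #include <stdlib.h>

    /* One pointer per component, 4 words (dx, dy, w, h) per resolution. */
    static unsigned **alloc_resolutions(unsigned numcomps, unsigned max_res)
    {
        unsigned **res = (unsigned **)calloc(numcomps, sizeof(unsigned *));
        unsigned i;
        if (res == NULL) {
            return NULL;
        }
        for (i = 0; i < numcomps; i++) {
            res[i] = (unsigned *)calloc(4 * (size_t)max_res, sizeof(unsigned));
        }
        return res;
    }

    /* dx of component c at resolution r sits at index 4*r + 0; dy, precinct
     * grid width and height follow at 4*r + 1, 4*r + 2 and 4*r + 3. */
    static unsigned res_dx(unsigned **res, unsigned c, unsigned r)
    {
        return res[c][4 * r + 0];
    }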
- */ -static opj_pi_iterator_t * opj_pi_create(const opj_image_t *p_image, - const opj_cp_t *p_cp, - OPJ_UINT32 tileno); -/** - * FIXME DOC - */ -static void opj_pi_update_decode_not_poc(opj_pi_iterator_t * p_pi, - opj_tcp_t * p_tcp, - OPJ_UINT32 p_max_precision, - OPJ_UINT32 p_max_res); -/** - * FIXME DOC - */ -static void opj_pi_update_decode_poc(opj_pi_iterator_t * p_pi, - opj_tcp_t * p_tcp, - OPJ_UINT32 p_max_precision, - OPJ_UINT32 p_max_res); - -/** - * FIXME DOC - */ -static OPJ_BOOL opj_pi_check_next_level(OPJ_INT32 pos, - opj_cp_t *cp, - OPJ_UINT32 tileno, - OPJ_UINT32 pino, - const OPJ_CHAR *prog); - -/*@}*/ - -/*@}*/ - -/* -========================================================== - local functions -========================================================== -*/ - -static void opj_pi_emit_error(opj_pi_iterator_t * pi, const char* msg) -{ - (void)pi; - (void)msg; -} - -static OPJ_BOOL opj_pi_next_lrcp(opj_pi_iterator_t * pi) -{ - opj_pi_comp_t *comp = NULL; - opj_pi_resolution_t *res = NULL; - OPJ_UINT32 index = 0; - - if (!pi->first) { - comp = &pi->comps[pi->compno]; - res = &comp->resolutions[pi->resno]; - goto LABEL_SKIP; - } else { - pi->first = 0; - } - - for (pi->layno = pi->poc.layno0; pi->layno < pi->poc.layno1; pi->layno++) { - for (pi->resno = pi->poc.resno0; pi->resno < pi->poc.resno1; - pi->resno++) { - for (pi->compno = pi->poc.compno0; pi->compno < pi->poc.compno1; pi->compno++) { - comp = &pi->comps[pi->compno]; - if (pi->resno >= comp->numresolutions) { - continue; - } - res = &comp->resolutions[pi->resno]; - if (!pi->tp_on) { - pi->poc.precno1 = res->pw * res->ph; - } - for (pi->precno = pi->poc.precno0; pi->precno < pi->poc.precno1; pi->precno++) { - index = pi->layno * pi->step_l + pi->resno * pi->step_r + pi->compno * - pi->step_c + pi->precno * pi->step_p; - /* Avoids index out of bounds access with */ - /* id_000098,sig_11,src_005411,op_havoc,rep_2 of */ - /* https://github.com/uclouvain/openjpeg/issues/938 */ - /* Not sure if this is the most clever fix. 
Perhaps */ - /* include should be resized when a POC arises, or */ - /* the POC should be rejected */ - if (index >= pi->include_size) { - opj_pi_emit_error(pi, "Invalid access to pi->include"); - return OPJ_FALSE; - } - if (!pi->include[index]) { - pi->include[index] = 1; - return OPJ_TRUE; - } -LABEL_SKIP: - ; - } - } - } - } - - return OPJ_FALSE; -} - -static OPJ_BOOL opj_pi_next_rlcp(opj_pi_iterator_t * pi) -{ - opj_pi_comp_t *comp = NULL; - opj_pi_resolution_t *res = NULL; - OPJ_UINT32 index = 0; - - if (!pi->first) { - comp = &pi->comps[pi->compno]; - res = &comp->resolutions[pi->resno]; - goto LABEL_SKIP; - } else { - pi->first = 0; - } - - for (pi->resno = pi->poc.resno0; pi->resno < pi->poc.resno1; pi->resno++) { - for (pi->layno = pi->poc.layno0; pi->layno < pi->poc.layno1; pi->layno++) { - for (pi->compno = pi->poc.compno0; pi->compno < pi->poc.compno1; pi->compno++) { - comp = &pi->comps[pi->compno]; - if (pi->resno >= comp->numresolutions) { - continue; - } - res = &comp->resolutions[pi->resno]; - if (!pi->tp_on) { - pi->poc.precno1 = res->pw * res->ph; - } - for (pi->precno = pi->poc.precno0; pi->precno < pi->poc.precno1; pi->precno++) { - index = pi->layno * pi->step_l + pi->resno * pi->step_r + pi->compno * - pi->step_c + pi->precno * pi->step_p; - if (index >= pi->include_size) { - opj_pi_emit_error(pi, "Invalid access to pi->include"); - return OPJ_FALSE; - } - if (!pi->include[index]) { - pi->include[index] = 1; - return OPJ_TRUE; - } -LABEL_SKIP: - ; - } - } - } - } - - return OPJ_FALSE; -} - -static OPJ_BOOL opj_pi_next_rpcl(opj_pi_iterator_t * pi) -{ - opj_pi_comp_t *comp = NULL; - opj_pi_resolution_t *res = NULL; - OPJ_UINT32 index = 0; - - if (!pi->first) { - goto LABEL_SKIP; - } else { - OPJ_UINT32 compno, resno; - pi->first = 0; - pi->dx = 0; - pi->dy = 0; - for (compno = 0; compno < pi->numcomps; compno++) { - comp = &pi->comps[compno]; - for (resno = 0; resno < comp->numresolutions; resno++) { - OPJ_UINT32 dx, dy; - res = &comp->resolutions[resno]; - if (res->pdx + comp->numresolutions - 1 - resno < 32 && - comp->dx <= UINT_MAX / (1u << (res->pdx + comp->numresolutions - 1 - resno))) { - dx = comp->dx * (1u << (res->pdx + comp->numresolutions - 1 - resno)); - pi->dx = !pi->dx ? dx : opj_uint_min(pi->dx, dx); - } - if (res->pdy + comp->numresolutions - 1 - resno < 32 && - comp->dy <= UINT_MAX / (1u << (res->pdy + comp->numresolutions - 1 - resno))) { - dy = comp->dy * (1u << (res->pdy + comp->numresolutions - 1 - resno)); - pi->dy = !pi->dy ? 
dy : opj_uint_min(pi->dy, dy); - } - } - } - if (pi->dx == 0 || pi->dy == 0) { - return OPJ_FALSE; - } - } - if (!pi->tp_on) { - pi->poc.ty0 = pi->ty0; - pi->poc.tx0 = pi->tx0; - pi->poc.ty1 = pi->ty1; - pi->poc.tx1 = pi->tx1; - } - for (pi->resno = pi->poc.resno0; pi->resno < pi->poc.resno1; pi->resno++) { - for (pi->y = pi->poc.ty0; pi->y < pi->poc.ty1; - pi->y += (OPJ_INT32)(pi->dy - (OPJ_UINT32)(pi->y % (OPJ_INT32)pi->dy))) { - for (pi->x = pi->poc.tx0; pi->x < pi->poc.tx1; - pi->x += (OPJ_INT32)(pi->dx - (OPJ_UINT32)(pi->x % (OPJ_INT32)pi->dx))) { - for (pi->compno = pi->poc.compno0; pi->compno < pi->poc.compno1; pi->compno++) { - OPJ_UINT32 levelno; - OPJ_INT32 trx0, try0; - OPJ_INT32 trx1, try1; - OPJ_UINT32 rpx, rpy; - OPJ_INT32 prci, prcj; - comp = &pi->comps[pi->compno]; - if (pi->resno >= comp->numresolutions) { - continue; - } - res = &comp->resolutions[pi->resno]; - levelno = comp->numresolutions - 1 - pi->resno; - /* Avoids division by zero */ - /* Relates to id_000004,sig_06,src_000679,op_arith8,pos_49,val_-17 */ - /* of https://github.com/uclouvain/openjpeg/issues/938 */ - if (levelno >= 32 || - ((comp->dx << levelno) >> levelno) != comp->dx || - ((comp->dy << levelno) >> levelno) != comp->dy) { - continue; - } - if ((comp->dx << levelno) > INT_MAX || - (comp->dy << levelno) > INT_MAX) { - continue; - } - trx0 = opj_int_ceildiv(pi->tx0, (OPJ_INT32)(comp->dx << levelno)); - try0 = opj_int_ceildiv(pi->ty0, (OPJ_INT32)(comp->dy << levelno)); - trx1 = opj_int_ceildiv(pi->tx1, (OPJ_INT32)(comp->dx << levelno)); - try1 = opj_int_ceildiv(pi->ty1, (OPJ_INT32)(comp->dy << levelno)); - rpx = res->pdx + levelno; - rpy = res->pdy + levelno; - - /* To avoid divisions by zero / undefined behaviour on shift */ - /* in below tests */ - /* Fixes reading id:000026,sig:08,src:002419,op:int32,pos:60,val:+32 */ - /* of https://github.com/uclouvain/openjpeg/issues/938 */ - if (rpx >= 31 || ((comp->dx << rpx) >> rpx) != comp->dx || - rpy >= 31 || ((comp->dy << rpy) >> rpy) != comp->dy) { - continue; - } - - /* See ISO-15441. 
B.12.1.3 Resolution level-position-component-layer progression */ - if (!((pi->y % (OPJ_INT32)(comp->dy << rpy) == 0) || ((pi->y == pi->ty0) && - ((try0 << levelno) % (1 << rpy))))) { - continue; - } - if (!((pi->x % (OPJ_INT32)(comp->dx << rpx) == 0) || ((pi->x == pi->tx0) && - ((trx0 << levelno) % (1 << rpx))))) { - continue; - } - - if ((res->pw == 0) || (res->ph == 0)) { - continue; - } - - if ((trx0 == trx1) || (try0 == try1)) { - continue; - } - - prci = opj_int_floordivpow2(opj_int_ceildiv(pi->x, - (OPJ_INT32)(comp->dx << levelno)), (OPJ_INT32)res->pdx) - - opj_int_floordivpow2(trx0, (OPJ_INT32)res->pdx); - prcj = opj_int_floordivpow2(opj_int_ceildiv(pi->y, - (OPJ_INT32)(comp->dy << levelno)), (OPJ_INT32)res->pdy) - - opj_int_floordivpow2(try0, (OPJ_INT32)res->pdy); - pi->precno = (OPJ_UINT32)(prci + prcj * (OPJ_INT32)res->pw); - for (pi->layno = pi->poc.layno0; pi->layno < pi->poc.layno1; pi->layno++) { - index = pi->layno * pi->step_l + pi->resno * pi->step_r + pi->compno * - pi->step_c + pi->precno * pi->step_p; - if (index >= pi->include_size) { - opj_pi_emit_error(pi, "Invalid access to pi->include"); - return OPJ_FALSE; - } - if (!pi->include[index]) { - pi->include[index] = 1; - return OPJ_TRUE; - } -LABEL_SKIP: - ; - } - } - } - } - } - - return OPJ_FALSE; -} - -static OPJ_BOOL opj_pi_next_pcrl(opj_pi_iterator_t * pi) -{ - opj_pi_comp_t *comp = NULL; - opj_pi_resolution_t *res = NULL; - OPJ_UINT32 index = 0; - - if (!pi->first) { - comp = &pi->comps[pi->compno]; - goto LABEL_SKIP; - } else { - OPJ_UINT32 compno, resno; - pi->first = 0; - pi->dx = 0; - pi->dy = 0; - for (compno = 0; compno < pi->numcomps; compno++) { - comp = &pi->comps[compno]; - for (resno = 0; resno < comp->numresolutions; resno++) { - OPJ_UINT32 dx, dy; - res = &comp->resolutions[resno]; - if (res->pdx + comp->numresolutions - 1 - resno < 32 && - comp->dx <= UINT_MAX / (1u << (res->pdx + comp->numresolutions - 1 - resno))) { - dx = comp->dx * (1u << (res->pdx + comp->numresolutions - 1 - resno)); - pi->dx = !pi->dx ? dx : opj_uint_min(pi->dx, dx); - } - if (res->pdy + comp->numresolutions - 1 - resno < 32 && - comp->dy <= UINT_MAX / (1u << (res->pdy + comp->numresolutions - 1 - resno))) { - dy = comp->dy * (1u << (res->pdy + comp->numresolutions - 1 - resno)); - pi->dy = !pi->dy ? 
dy : opj_uint_min(pi->dy, dy); - } - } - } - if (pi->dx == 0 || pi->dy == 0) { - return OPJ_FALSE; - } - } - if (!pi->tp_on) { - pi->poc.ty0 = pi->ty0; - pi->poc.tx0 = pi->tx0; - pi->poc.ty1 = pi->ty1; - pi->poc.tx1 = pi->tx1; - } - for (pi->y = pi->poc.ty0; pi->y < pi->poc.ty1; - pi->y += (OPJ_INT32)(pi->dy - (OPJ_UINT32)(pi->y % (OPJ_INT32)pi->dy))) { - for (pi->x = pi->poc.tx0; pi->x < pi->poc.tx1; - pi->x += (OPJ_INT32)(pi->dx - (OPJ_UINT32)(pi->x % (OPJ_INT32)pi->dx))) { - for (pi->compno = pi->poc.compno0; pi->compno < pi->poc.compno1; pi->compno++) { - comp = &pi->comps[pi->compno]; - for (pi->resno = pi->poc.resno0; - pi->resno < opj_uint_min(pi->poc.resno1, comp->numresolutions); pi->resno++) { - OPJ_UINT32 levelno; - OPJ_INT32 trx0, try0; - OPJ_INT32 trx1, try1; - OPJ_UINT32 rpx, rpy; - OPJ_INT32 prci, prcj; - res = &comp->resolutions[pi->resno]; - levelno = comp->numresolutions - 1 - pi->resno; - /* Avoids division by zero */ - /* Relates to id_000004,sig_06,src_000679,op_arith8,pos_49,val_-17 */ - /* of https://github.com/uclouvain/openjpeg/issues/938 */ - if (levelno >= 32 || - ((comp->dx << levelno) >> levelno) != comp->dx || - ((comp->dy << levelno) >> levelno) != comp->dy) { - continue; - } - if ((comp->dx << levelno) > INT_MAX || - (comp->dy << levelno) > INT_MAX) { - continue; - } - trx0 = opj_int_ceildiv(pi->tx0, (OPJ_INT32)(comp->dx << levelno)); - try0 = opj_int_ceildiv(pi->ty0, (OPJ_INT32)(comp->dy << levelno)); - trx1 = opj_int_ceildiv(pi->tx1, (OPJ_INT32)(comp->dx << levelno)); - try1 = opj_int_ceildiv(pi->ty1, (OPJ_INT32)(comp->dy << levelno)); - rpx = res->pdx + levelno; - rpy = res->pdy + levelno; - - /* To avoid divisions by zero / undefined behaviour on shift */ - /* in below tests */ - /* Relates to id:000019,sig:08,src:001098,op:flip1,pos:49 */ - /* of https://github.com/uclouvain/openjpeg/issues/938 */ - if (rpx >= 31 || ((comp->dx << rpx) >> rpx) != comp->dx || - rpy >= 31 || ((comp->dy << rpy) >> rpy) != comp->dy) { - continue; - } - - /* See ISO-15441. 
B.12.1.4 Position-component-resolution level-layer progression */ - if (!((pi->y % (OPJ_INT32)(comp->dy << rpy) == 0) || ((pi->y == pi->ty0) && - ((try0 << levelno) % (1 << rpy))))) { - continue; - } - if (!((pi->x % (OPJ_INT32)(comp->dx << rpx) == 0) || ((pi->x == pi->tx0) && - ((trx0 << levelno) % (1 << rpx))))) { - continue; - } - - if ((res->pw == 0) || (res->ph == 0)) { - continue; - } - - if ((trx0 == trx1) || (try0 == try1)) { - continue; - } - - prci = opj_int_floordivpow2(opj_int_ceildiv(pi->x, - (OPJ_INT32)(comp->dx << levelno)), (OPJ_INT32)res->pdx) - - opj_int_floordivpow2(trx0, (OPJ_INT32)res->pdx); - prcj = opj_int_floordivpow2(opj_int_ceildiv(pi->y, - (OPJ_INT32)(comp->dy << levelno)), (OPJ_INT32)res->pdy) - - opj_int_floordivpow2(try0, (OPJ_INT32)res->pdy); - pi->precno = (OPJ_UINT32)(prci + prcj * (OPJ_INT32)res->pw); - for (pi->layno = pi->poc.layno0; pi->layno < pi->poc.layno1; pi->layno++) { - index = pi->layno * pi->step_l + pi->resno * pi->step_r + pi->compno * - pi->step_c + pi->precno * pi->step_p; - if (index >= pi->include_size) { - opj_pi_emit_error(pi, "Invalid access to pi->include"); - return OPJ_FALSE; - } - if (!pi->include[index]) { - pi->include[index] = 1; - return OPJ_TRUE; - } -LABEL_SKIP: - ; - } - } - } - } - } - - return OPJ_FALSE; -} - -static OPJ_BOOL opj_pi_next_cprl(opj_pi_iterator_t * pi) -{ - opj_pi_comp_t *comp = NULL; - opj_pi_resolution_t *res = NULL; - OPJ_UINT32 index = 0; - - if (!pi->first) { - comp = &pi->comps[pi->compno]; - goto LABEL_SKIP; - } else { - pi->first = 0; - } - - for (pi->compno = pi->poc.compno0; pi->compno < pi->poc.compno1; pi->compno++) { - OPJ_UINT32 resno; - comp = &pi->comps[pi->compno]; - pi->dx = 0; - pi->dy = 0; - for (resno = 0; resno < comp->numresolutions; resno++) { - OPJ_UINT32 dx, dy; - res = &comp->resolutions[resno]; - if (res->pdx + comp->numresolutions - 1 - resno < 32 && - comp->dx <= UINT_MAX / (1u << (res->pdx + comp->numresolutions - 1 - resno))) { - dx = comp->dx * (1u << (res->pdx + comp->numresolutions - 1 - resno)); - pi->dx = !pi->dx ? dx : opj_uint_min(pi->dx, dx); - } - if (res->pdy + comp->numresolutions - 1 - resno < 32 && - comp->dy <= UINT_MAX / (1u << (res->pdy + comp->numresolutions - 1 - resno))) { - dy = comp->dy * (1u << (res->pdy + comp->numresolutions - 1 - resno)); - pi->dy = !pi->dy ? 
dy : opj_uint_min(pi->dy, dy); - } - } - if (pi->dx == 0 || pi->dy == 0) { - return OPJ_FALSE; - } - if (!pi->tp_on) { - pi->poc.ty0 = pi->ty0; - pi->poc.tx0 = pi->tx0; - pi->poc.ty1 = pi->ty1; - pi->poc.tx1 = pi->tx1; - } - for (pi->y = pi->poc.ty0; pi->y < pi->poc.ty1; - pi->y += (OPJ_INT32)(pi->dy - (OPJ_UINT32)(pi->y % (OPJ_INT32)pi->dy))) { - for (pi->x = pi->poc.tx0; pi->x < pi->poc.tx1; - pi->x += (OPJ_INT32)(pi->dx - (OPJ_UINT32)(pi->x % (OPJ_INT32)pi->dx))) { - for (pi->resno = pi->poc.resno0; - pi->resno < opj_uint_min(pi->poc.resno1, comp->numresolutions); pi->resno++) { - OPJ_UINT32 levelno; - OPJ_INT32 trx0, try0; - OPJ_INT32 trx1, try1; - OPJ_UINT32 rpx, rpy; - OPJ_INT32 prci, prcj; - res = &comp->resolutions[pi->resno]; - levelno = comp->numresolutions - 1 - pi->resno; - /* Avoids division by zero on id_000004,sig_06,src_000679,op_arith8,pos_49,val_-17 */ - /* of https://github.com/uclouvain/openjpeg/issues/938 */ - if (levelno >= 32 || - ((comp->dx << levelno) >> levelno) != comp->dx || - ((comp->dy << levelno) >> levelno) != comp->dy) { - continue; - } - if ((comp->dx << levelno) > INT_MAX || - (comp->dy << levelno) > INT_MAX) { - continue; - } - trx0 = opj_int_ceildiv(pi->tx0, (OPJ_INT32)(comp->dx << levelno)); - try0 = opj_int_ceildiv(pi->ty0, (OPJ_INT32)(comp->dy << levelno)); - trx1 = opj_int_ceildiv(pi->tx1, (OPJ_INT32)(comp->dx << levelno)); - try1 = opj_int_ceildiv(pi->ty1, (OPJ_INT32)(comp->dy << levelno)); - rpx = res->pdx + levelno; - rpy = res->pdy + levelno; - - /* To avoid divisions by zero / undefined behaviour on shift */ - /* in below tests */ - /* Fixes reading id:000019,sig:08,src:001098,op:flip1,pos:49 */ - /* of https://github.com/uclouvain/openjpeg/issues/938 */ - if (rpx >= 31 || ((comp->dx << rpx) >> rpx) != comp->dx || - rpy >= 31 || ((comp->dy << rpy) >> rpy) != comp->dy) { - continue; - } - - /* See ISO-15441. 
B.12.1.5 Component-position-resolution level-layer progression */ - if (!((pi->y % (OPJ_INT32)(comp->dy << rpy) == 0) || ((pi->y == pi->ty0) && - ((try0 << levelno) % (1 << rpy))))) { - continue; - } - if (!((pi->x % (OPJ_INT32)(comp->dx << rpx) == 0) || ((pi->x == pi->tx0) && - ((trx0 << levelno) % (1 << rpx))))) { - continue; - } - - if ((res->pw == 0) || (res->ph == 0)) { - continue; - } - - if ((trx0 == trx1) || (try0 == try1)) { - continue; - } - - prci = opj_int_floordivpow2(opj_int_ceildiv(pi->x, - (OPJ_INT32)(comp->dx << levelno)), (OPJ_INT32)res->pdx) - - opj_int_floordivpow2(trx0, (OPJ_INT32)res->pdx); - prcj = opj_int_floordivpow2(opj_int_ceildiv(pi->y, - (OPJ_INT32)(comp->dy << levelno)), (OPJ_INT32)res->pdy) - - opj_int_floordivpow2(try0, (OPJ_INT32)res->pdy); - pi->precno = (OPJ_UINT32)(prci + prcj * (OPJ_INT32)res->pw); - for (pi->layno = pi->poc.layno0; pi->layno < pi->poc.layno1; pi->layno++) { - index = pi->layno * pi->step_l + pi->resno * pi->step_r + pi->compno * - pi->step_c + pi->precno * pi->step_p; - if (index >= pi->include_size) { - opj_pi_emit_error(pi, "Invalid access to pi->include"); - return OPJ_FALSE; - } - if (!pi->include[index]) { - pi->include[index] = 1; - return OPJ_TRUE; - } -LABEL_SKIP: - ; - } - } - } - } - } - - return OPJ_FALSE; -} - -static void opj_get_encoding_parameters(const opj_image_t *p_image, - const opj_cp_t *p_cp, - OPJ_UINT32 p_tileno, - OPJ_INT32 * p_tx0, - OPJ_INT32 * p_tx1, - OPJ_INT32 * p_ty0, - OPJ_INT32 * p_ty1, - OPJ_UINT32 * p_dx_min, - OPJ_UINT32 * p_dy_min, - OPJ_UINT32 * p_max_prec, - OPJ_UINT32 * p_max_res) -{ - /* loop */ - OPJ_UINT32 compno, resno; - /* pointers */ - const opj_tcp_t *l_tcp = 00; - const opj_tccp_t * l_tccp = 00; - const opj_image_comp_t * l_img_comp = 00; - - /* position in x and y of tile */ - OPJ_UINT32 p, q; - - /* preconditions */ - assert(p_cp != 00); - assert(p_image != 00); - assert(p_tileno < p_cp->tw * p_cp->th); - - /* initializations */ - l_tcp = &p_cp->tcps [p_tileno]; - l_img_comp = p_image->comps; - l_tccp = l_tcp->tccps; - - /* here calculation of tx0, tx1, ty0, ty1, maxprec, dx and dy */ - p = p_tileno % p_cp->tw; - q = p_tileno / p_cp->tw; - - /* find extent of tile */ - *p_tx0 = opj_int_max((OPJ_INT32)(p_cp->tx0 + p * p_cp->tdx), - (OPJ_INT32)p_image->x0); - *p_tx1 = opj_int_min((OPJ_INT32)(p_cp->tx0 + (p + 1) * p_cp->tdx), - (OPJ_INT32)p_image->x1); - *p_ty0 = opj_int_max((OPJ_INT32)(p_cp->ty0 + q * p_cp->tdy), - (OPJ_INT32)p_image->y0); - *p_ty1 = opj_int_min((OPJ_INT32)(p_cp->ty0 + (q + 1) * p_cp->tdy), - (OPJ_INT32)p_image->y1); - - /* max precision is 0 (can only grow) */ - *p_max_prec = 0; - *p_max_res = 0; - - /* take the largest value for dx_min and dy_min */ - *p_dx_min = 0x7fffffff; - *p_dy_min = 0x7fffffff; - - for (compno = 0; compno < p_image->numcomps; ++compno) { - /* arithmetic variables to calculate */ - OPJ_UINT32 l_level_no; - OPJ_INT32 l_rx0, l_ry0, l_rx1, l_ry1; - OPJ_INT32 l_px0, l_py0, l_px1, py1; - OPJ_UINT32 l_pdx, l_pdy; - OPJ_UINT32 l_pw, l_ph; - OPJ_UINT32 l_product; - OPJ_INT32 l_tcx0, l_tcy0, l_tcx1, l_tcy1; - - l_tcx0 = opj_int_ceildiv(*p_tx0, (OPJ_INT32)l_img_comp->dx); - l_tcy0 = opj_int_ceildiv(*p_ty0, (OPJ_INT32)l_img_comp->dy); - l_tcx1 = opj_int_ceildiv(*p_tx1, (OPJ_INT32)l_img_comp->dx); - l_tcy1 = opj_int_ceildiv(*p_ty1, (OPJ_INT32)l_img_comp->dy); - - if (l_tccp->numresolutions > *p_max_res) { - *p_max_res = l_tccp->numresolutions; - } - - /* use custom size for precincts */ - for (resno = 0; resno < l_tccp->numresolutions; ++resno) { - OPJ_UINT32 
l_dx, l_dy; - - /* precinct width and height */ - l_pdx = l_tccp->prcw[resno]; - l_pdy = l_tccp->prch[resno]; - - l_dx = l_img_comp->dx * (1u << (l_pdx + l_tccp->numresolutions - 1 - resno)); - l_dy = l_img_comp->dy * (1u << (l_pdy + l_tccp->numresolutions - 1 - resno)); - - /* take the minimum size for dx for each comp and resolution */ - *p_dx_min = opj_uint_min(*p_dx_min, l_dx); - *p_dy_min = opj_uint_min(*p_dy_min, l_dy); - - /* various calculations of extents */ - l_level_no = l_tccp->numresolutions - 1 - resno; - - l_rx0 = opj_int_ceildivpow2(l_tcx0, (OPJ_INT32)l_level_no); - l_ry0 = opj_int_ceildivpow2(l_tcy0, (OPJ_INT32)l_level_no); - l_rx1 = opj_int_ceildivpow2(l_tcx1, (OPJ_INT32)l_level_no); - l_ry1 = opj_int_ceildivpow2(l_tcy1, (OPJ_INT32)l_level_no); - - l_px0 = opj_int_floordivpow2(l_rx0, (OPJ_INT32)l_pdx) << l_pdx; - l_py0 = opj_int_floordivpow2(l_ry0, (OPJ_INT32)l_pdy) << l_pdy; - l_px1 = opj_int_ceildivpow2(l_rx1, (OPJ_INT32)l_pdx) << l_pdx; - - py1 = opj_int_ceildivpow2(l_ry1, (OPJ_INT32)l_pdy) << l_pdy; - - l_pw = (l_rx0 == l_rx1) ? 0 : (OPJ_UINT32)((l_px1 - l_px0) >> l_pdx); - l_ph = (l_ry0 == l_ry1) ? 0 : (OPJ_UINT32)((py1 - l_py0) >> l_pdy); - - l_product = l_pw * l_ph; - - /* update precision */ - if (l_product > *p_max_prec) { - *p_max_prec = l_product; - } - } - ++l_img_comp; - ++l_tccp; - } -} - - -static void opj_get_all_encoding_parameters(const opj_image_t *p_image, - const opj_cp_t *p_cp, - OPJ_UINT32 tileno, - OPJ_INT32 * p_tx0, - OPJ_INT32 * p_tx1, - OPJ_INT32 * p_ty0, - OPJ_INT32 * p_ty1, - OPJ_UINT32 * p_dx_min, - OPJ_UINT32 * p_dy_min, - OPJ_UINT32 * p_max_prec, - OPJ_UINT32 * p_max_res, - OPJ_UINT32 ** p_resolutions) -{ - /* loop*/ - OPJ_UINT32 compno, resno; - - /* pointers*/ - const opj_tcp_t *tcp = 00; - const opj_tccp_t * l_tccp = 00; - const opj_image_comp_t * l_img_comp = 00; - - /* to store l_dx, l_dy, w and h for each resolution and component.*/ - OPJ_UINT32 * lResolutionPtr; - - /* position in x and y of tile*/ - OPJ_UINT32 p, q; - - /* non-corrected (in regard to image offset) tile offset */ - OPJ_UINT32 l_tx0, l_ty0; - - /* preconditions in debug*/ - assert(p_cp != 00); - assert(p_image != 00); - assert(tileno < p_cp->tw * p_cp->th); - - /* initializations*/ - tcp = &p_cp->tcps [tileno]; - l_tccp = tcp->tccps; - l_img_comp = p_image->comps; - - /* position in x and y of tile*/ - p = tileno % p_cp->tw; - q = tileno / p_cp->tw; - - /* here calculation of tx0, tx1, ty0, ty1, maxprec, l_dx and l_dy */ - l_tx0 = p_cp->tx0 + p * - p_cp->tdx; /* can't be greater than p_image->x1 so won't overflow */ - *p_tx0 = (OPJ_INT32)opj_uint_max(l_tx0, p_image->x0); - *p_tx1 = (OPJ_INT32)opj_uint_min(opj_uint_adds(l_tx0, p_cp->tdx), p_image->x1); - l_ty0 = p_cp->ty0 + q * - p_cp->tdy; /* can't be greater than p_image->y1 so won't overflow */ - *p_ty0 = (OPJ_INT32)opj_uint_max(l_ty0, p_image->y0); - *p_ty1 = (OPJ_INT32)opj_uint_min(opj_uint_adds(l_ty0, p_cp->tdy), p_image->y1); - - /* max precision and resolution is 0 (can only grow)*/ - *p_max_prec = 0; - *p_max_res = 0; - - /* take the largest value for dx_min and dy_min*/ - *p_dx_min = 0x7fffffff; - *p_dy_min = 0x7fffffff; - - for (compno = 0; compno < p_image->numcomps; ++compno) { - /* aritmetic variables to calculate*/ - OPJ_UINT32 l_level_no; - OPJ_INT32 l_rx0, l_ry0, l_rx1, l_ry1; - OPJ_INT32 l_px0, l_py0, l_px1, py1; - OPJ_UINT32 l_product; - OPJ_INT32 l_tcx0, l_tcy0, l_tcx1, l_tcy1; - OPJ_UINT32 l_pdx, l_pdy, l_pw, l_ph; - - lResolutionPtr = p_resolutions[compno]; - - l_tcx0 = opj_int_ceildiv(*p_tx0, 
(OPJ_INT32)l_img_comp->dx); - l_tcy0 = opj_int_ceildiv(*p_ty0, (OPJ_INT32)l_img_comp->dy); - l_tcx1 = opj_int_ceildiv(*p_tx1, (OPJ_INT32)l_img_comp->dx); - l_tcy1 = opj_int_ceildiv(*p_ty1, (OPJ_INT32)l_img_comp->dy); - - if (l_tccp->numresolutions > *p_max_res) { - *p_max_res = l_tccp->numresolutions; - } - - /* use custom size for precincts*/ - l_level_no = l_tccp->numresolutions; - for (resno = 0; resno < l_tccp->numresolutions; ++resno) { - OPJ_UINT32 l_dx, l_dy; - - --l_level_no; - - /* precinct width and height*/ - l_pdx = l_tccp->prcw[resno]; - l_pdy = l_tccp->prch[resno]; - *lResolutionPtr++ = l_pdx; - *lResolutionPtr++ = l_pdy; - if (l_pdx + l_level_no < 32 && - l_img_comp->dx <= UINT_MAX / (1u << (l_pdx + l_level_no))) { - l_dx = l_img_comp->dx * (1u << (l_pdx + l_level_no)); - /* take the minimum size for l_dx for each comp and resolution*/ - *p_dx_min = (OPJ_UINT32)opj_int_min((OPJ_INT32) * p_dx_min, (OPJ_INT32)l_dx); - } - if (l_pdy + l_level_no < 32 && - l_img_comp->dy <= UINT_MAX / (1u << (l_pdy + l_level_no))) { - l_dy = l_img_comp->dy * (1u << (l_pdy + l_level_no)); - *p_dy_min = (OPJ_UINT32)opj_int_min((OPJ_INT32) * p_dy_min, (OPJ_INT32)l_dy); - } - - /* various calculations of extents*/ - l_rx0 = opj_int_ceildivpow2(l_tcx0, (OPJ_INT32)l_level_no); - l_ry0 = opj_int_ceildivpow2(l_tcy0, (OPJ_INT32)l_level_no); - l_rx1 = opj_int_ceildivpow2(l_tcx1, (OPJ_INT32)l_level_no); - l_ry1 = opj_int_ceildivpow2(l_tcy1, (OPJ_INT32)l_level_no); - l_px0 = opj_int_floordivpow2(l_rx0, (OPJ_INT32)l_pdx) << l_pdx; - l_py0 = opj_int_floordivpow2(l_ry0, (OPJ_INT32)l_pdy) << l_pdy; - l_px1 = opj_int_ceildivpow2(l_rx1, (OPJ_INT32)l_pdx) << l_pdx; - py1 = opj_int_ceildivpow2(l_ry1, (OPJ_INT32)l_pdy) << l_pdy; - l_pw = (l_rx0 == l_rx1) ? 0 : (OPJ_UINT32)((l_px1 - l_px0) >> l_pdx); - l_ph = (l_ry0 == l_ry1) ? 0 : (OPJ_UINT32)((py1 - l_py0) >> l_pdy); - *lResolutionPtr++ = l_pw; - *lResolutionPtr++ = l_ph; - l_product = l_pw * l_ph; - - /* update precision*/ - if (l_product > *p_max_prec) { - *p_max_prec = l_product; - } - - } - ++l_tccp; - ++l_img_comp; - } -} - -static opj_pi_iterator_t * opj_pi_create(const opj_image_t *image, - const opj_cp_t *cp, - OPJ_UINT32 tileno) -{ - /* loop*/ - OPJ_UINT32 pino, compno; - /* number of poc in the p_pi*/ - OPJ_UINT32 l_poc_bound; - - /* pointers to tile coding parameters and components.*/ - opj_pi_iterator_t *l_pi = 00; - opj_tcp_t *tcp = 00; - const opj_tccp_t *tccp = 00; - - /* current packet iterator being allocated*/ - opj_pi_iterator_t *l_current_pi = 00; - - /* preconditions in debug*/ - assert(cp != 00); - assert(image != 00); - assert(tileno < cp->tw * cp->th); - - /* initializations*/ - tcp = &cp->tcps[tileno]; - l_poc_bound = tcp->numpocs + 1; - - /* memory allocations*/ - l_pi = (opj_pi_iterator_t*) opj_calloc((l_poc_bound), - sizeof(opj_pi_iterator_t)); - if (!l_pi) { - return NULL; - } - - l_current_pi = l_pi; - for (pino = 0; pino < l_poc_bound ; ++pino) { - - l_current_pi->comps = (opj_pi_comp_t*) opj_calloc(image->numcomps, - sizeof(opj_pi_comp_t)); - if (! 
l_current_pi->comps) { - opj_pi_destroy(l_pi, l_poc_bound); - return NULL; - } - - l_current_pi->numcomps = image->numcomps; - - for (compno = 0; compno < image->numcomps; ++compno) { - opj_pi_comp_t *comp = &l_current_pi->comps[compno]; - - tccp = &tcp->tccps[compno]; - - comp->resolutions = (opj_pi_resolution_t*) opj_calloc(tccp->numresolutions, - sizeof(opj_pi_resolution_t)); - if (!comp->resolutions) { - opj_pi_destroy(l_pi, l_poc_bound); - return 00; - } - - comp->numresolutions = tccp->numresolutions; - } - ++l_current_pi; - } - return l_pi; -} - -static void opj_pi_update_encode_poc_and_final(opj_cp_t *p_cp, - OPJ_UINT32 p_tileno, - OPJ_INT32 p_tx0, - OPJ_INT32 p_tx1, - OPJ_INT32 p_ty0, - OPJ_INT32 p_ty1, - OPJ_UINT32 p_max_prec, - OPJ_UINT32 p_max_res, - OPJ_UINT32 p_dx_min, - OPJ_UINT32 p_dy_min) -{ - /* loop*/ - OPJ_UINT32 pino; - /* tile coding parameter*/ - opj_tcp_t *l_tcp = 00; - /* current poc being updated*/ - opj_poc_t * l_current_poc = 00; - - /* number of pocs*/ - OPJ_UINT32 l_poc_bound; - - OPJ_ARG_NOT_USED(p_max_res); - - /* preconditions in debug*/ - assert(p_cp != 00); - assert(p_tileno < p_cp->tw * p_cp->th); - - /* initializations*/ - l_tcp = &p_cp->tcps [p_tileno]; - /* number of iterations in the loop */ - l_poc_bound = l_tcp->numpocs + 1; - - /* start at first element, and to make sure the compiler will not make a calculation each time in the loop - store a pointer to the current element to modify rather than l_tcp->pocs[i]*/ - l_current_poc = l_tcp->pocs; - - l_current_poc->compS = l_current_poc->compno0; - l_current_poc->compE = l_current_poc->compno1; - l_current_poc->resS = l_current_poc->resno0; - l_current_poc->resE = l_current_poc->resno1; - l_current_poc->layE = l_current_poc->layno1; - - /* special treatment for the first element*/ - l_current_poc->layS = 0; - l_current_poc->prg = l_current_poc->prg1; - l_current_poc->prcS = 0; - - l_current_poc->prcE = p_max_prec; - l_current_poc->txS = (OPJ_UINT32)p_tx0; - l_current_poc->txE = (OPJ_UINT32)p_tx1; - l_current_poc->tyS = (OPJ_UINT32)p_ty0; - l_current_poc->tyE = (OPJ_UINT32)p_ty1; - l_current_poc->dx = p_dx_min; - l_current_poc->dy = p_dy_min; - - ++ l_current_poc; - for (pino = 1; pino < l_poc_bound ; ++pino) { - l_current_poc->compS = l_current_poc->compno0; - l_current_poc->compE = l_current_poc->compno1; - l_current_poc->resS = l_current_poc->resno0; - l_current_poc->resE = l_current_poc->resno1; - l_current_poc->layE = l_current_poc->layno1; - l_current_poc->prg = l_current_poc->prg1; - l_current_poc->prcS = 0; - /* special treatment here different from the first element*/ - l_current_poc->layS = (l_current_poc->layE > (l_current_poc - 1)->layE) ? 
- l_current_poc->layE : 0; - - l_current_poc->prcE = p_max_prec; - l_current_poc->txS = (OPJ_UINT32)p_tx0; - l_current_poc->txE = (OPJ_UINT32)p_tx1; - l_current_poc->tyS = (OPJ_UINT32)p_ty0; - l_current_poc->tyE = (OPJ_UINT32)p_ty1; - l_current_poc->dx = p_dx_min; - l_current_poc->dy = p_dy_min; - ++ l_current_poc; - } -} - -static void opj_pi_update_encode_not_poc(opj_cp_t *p_cp, - OPJ_UINT32 p_num_comps, - OPJ_UINT32 p_tileno, - OPJ_INT32 p_tx0, - OPJ_INT32 p_tx1, - OPJ_INT32 p_ty0, - OPJ_INT32 p_ty1, - OPJ_UINT32 p_max_prec, - OPJ_UINT32 p_max_res, - OPJ_UINT32 p_dx_min, - OPJ_UINT32 p_dy_min) -{ - /* loop*/ - OPJ_UINT32 pino; - /* tile coding parameter*/ - opj_tcp_t *l_tcp = 00; - /* current poc being updated*/ - opj_poc_t * l_current_poc = 00; - /* number of pocs*/ - OPJ_UINT32 l_poc_bound; - - /* preconditions in debug*/ - assert(p_cp != 00); - assert(p_tileno < p_cp->tw * p_cp->th); - - /* initializations*/ - l_tcp = &p_cp->tcps [p_tileno]; - - /* number of iterations in the loop */ - l_poc_bound = l_tcp->numpocs + 1; - - /* start at first element, and to make sure the compiler will not make a calculation each time in the loop - store a pointer to the current element to modify rather than l_tcp->pocs[i]*/ - l_current_poc = l_tcp->pocs; - - for (pino = 0; pino < l_poc_bound ; ++pino) { - l_current_poc->compS = 0; - l_current_poc->compE = p_num_comps;/*p_image->numcomps;*/ - l_current_poc->resS = 0; - l_current_poc->resE = p_max_res; - l_current_poc->layS = 0; - l_current_poc->layE = l_tcp->numlayers; - l_current_poc->prg = l_tcp->prg; - l_current_poc->prcS = 0; - l_current_poc->prcE = p_max_prec; - l_current_poc->txS = (OPJ_UINT32)p_tx0; - l_current_poc->txE = (OPJ_UINT32)p_tx1; - l_current_poc->tyS = (OPJ_UINT32)p_ty0; - l_current_poc->tyE = (OPJ_UINT32)p_ty1; - l_current_poc->dx = p_dx_min; - l_current_poc->dy = p_dy_min; - ++ l_current_poc; - } -} - -static void opj_pi_update_decode_poc(opj_pi_iterator_t * p_pi, - opj_tcp_t * p_tcp, - OPJ_UINT32 p_max_precision, - OPJ_UINT32 p_max_res) -{ - /* loop*/ - OPJ_UINT32 pino; - - /* encoding prameters to set*/ - OPJ_UINT32 l_bound; - - opj_pi_iterator_t * l_current_pi = 00; - opj_poc_t* l_current_poc = 0; - - OPJ_ARG_NOT_USED(p_max_res); - - /* preconditions in debug*/ - assert(p_pi != 00); - assert(p_tcp != 00); - - /* initializations*/ - l_bound = p_tcp->numpocs + 1; - l_current_pi = p_pi; - l_current_poc = p_tcp->pocs; - - for (pino = 0; pino < l_bound; ++pino) { - l_current_pi->poc.prg = l_current_poc->prg; /* Progression Order #0 */ - l_current_pi->first = 1; - - l_current_pi->poc.resno0 = - l_current_poc->resno0; /* Resolution Level Index #0 (Start) */ - l_current_pi->poc.compno0 = - l_current_poc->compno0; /* Component Index #0 (Start) */ - l_current_pi->poc.layno0 = 0; - l_current_pi->poc.precno0 = 0; - l_current_pi->poc.resno1 = - l_current_poc->resno1; /* Resolution Level Index #0 (End) */ - l_current_pi->poc.compno1 = - l_current_poc->compno1; /* Component Index #0 (End) */ - l_current_pi->poc.layno1 = opj_uint_min(l_current_poc->layno1, - p_tcp->numlayers); /* Layer Index #0 (End) */ - l_current_pi->poc.precno1 = p_max_precision; - ++l_current_pi; - ++l_current_poc; - } -} - -static void opj_pi_update_decode_not_poc(opj_pi_iterator_t * p_pi, - opj_tcp_t * p_tcp, - OPJ_UINT32 p_max_precision, - OPJ_UINT32 p_max_res) -{ - /* loop*/ - OPJ_UINT32 pino; - - /* encoding prameters to set*/ - OPJ_UINT32 l_bound; - - opj_pi_iterator_t * l_current_pi = 00; - /* preconditions in debug*/ - assert(p_tcp != 00); - assert(p_pi != 00); - - 
/* initializations*/ - l_bound = p_tcp->numpocs + 1; - l_current_pi = p_pi; - - for (pino = 0; pino < l_bound; ++pino) { - l_current_pi->poc.prg = p_tcp->prg; - l_current_pi->first = 1; - l_current_pi->poc.resno0 = 0; - l_current_pi->poc.compno0 = 0; - l_current_pi->poc.layno0 = 0; - l_current_pi->poc.precno0 = 0; - l_current_pi->poc.resno1 = p_max_res; - l_current_pi->poc.compno1 = l_current_pi->numcomps; - l_current_pi->poc.layno1 = p_tcp->numlayers; - l_current_pi->poc.precno1 = p_max_precision; - ++l_current_pi; - } -} - - - -static OPJ_BOOL opj_pi_check_next_level(OPJ_INT32 pos, - opj_cp_t *cp, - OPJ_UINT32 tileno, - OPJ_UINT32 pino, - const OPJ_CHAR *prog) -{ - OPJ_INT32 i; - opj_tcp_t *tcps = &cp->tcps[tileno]; - opj_poc_t *tcp = &tcps->pocs[pino]; - - if (pos >= 0) { - for (i = pos; pos >= 0; i--) { - switch (prog[i]) { - case 'R': - if (tcp->res_t == tcp->resE) { - if (opj_pi_check_next_level(pos - 1, cp, tileno, pino, prog)) { - return OPJ_TRUE; - } else { - return OPJ_FALSE; - } - } else { - return OPJ_TRUE; - } - break; - case 'C': - if (tcp->comp_t == tcp->compE) { - if (opj_pi_check_next_level(pos - 1, cp, tileno, pino, prog)) { - return OPJ_TRUE; - } else { - return OPJ_FALSE; - } - } else { - return OPJ_TRUE; - } - break; - case 'L': - if (tcp->lay_t == tcp->layE) { - if (opj_pi_check_next_level(pos - 1, cp, tileno, pino, prog)) { - return OPJ_TRUE; - } else { - return OPJ_FALSE; - } - } else { - return OPJ_TRUE; - } - break; - case 'P': - switch (tcp->prg) { - case OPJ_LRCP: /* fall through */ - case OPJ_RLCP: - if (tcp->prc_t == tcp->prcE) { - if (opj_pi_check_next_level(i - 1, cp, tileno, pino, prog)) { - return OPJ_TRUE; - } else { - return OPJ_FALSE; - } - } else { - return OPJ_TRUE; - } - break; - default: - if (tcp->tx0_t == tcp->txE) { - /*TY*/ - if (tcp->ty0_t == tcp->tyE) { - if (opj_pi_check_next_level(i - 1, cp, tileno, pino, prog)) { - return OPJ_TRUE; - } else { - return OPJ_FALSE; - } - } else { - return OPJ_TRUE; - }/*TY*/ - } else { - return OPJ_TRUE; - } - break; - }/*end case P*/ - }/*end switch*/ - }/*end for*/ - }/*end if*/ - return OPJ_FALSE; -} - - -/* -========================================================== - Packet iterator interface -========================================================== -*/ -opj_pi_iterator_t *opj_pi_create_decode(opj_image_t *p_image, - opj_cp_t *p_cp, - OPJ_UINT32 p_tile_no) -{ - OPJ_UINT32 numcomps = p_image->numcomps; - - /* loop */ - OPJ_UINT32 pino; - OPJ_UINT32 compno, resno; - - /* to store w, h, dx and dy fro all components and resolutions */ - OPJ_UINT32 * l_tmp_data; - OPJ_UINT32 ** l_tmp_ptr; - - /* encoding prameters to set */ - OPJ_UINT32 l_max_res; - OPJ_UINT32 l_max_prec; - OPJ_INT32 l_tx0, l_tx1, l_ty0, l_ty1; - OPJ_UINT32 l_dx_min, l_dy_min; - OPJ_UINT32 l_bound; - OPJ_UINT32 l_step_p, l_step_c, l_step_r, l_step_l ; - OPJ_UINT32 l_data_stride; - - /* pointers */ - opj_pi_iterator_t *l_pi = 00; - opj_tcp_t *l_tcp = 00; - const opj_tccp_t *l_tccp = 00; - opj_pi_comp_t *l_current_comp = 00; - opj_image_comp_t * l_img_comp = 00; - opj_pi_iterator_t * l_current_pi = 00; - OPJ_UINT32 * l_encoding_value_ptr = 00; - - /* preconditions in debug */ - assert(p_cp != 00); - assert(p_image != 00); - assert(p_tile_no < p_cp->tw * p_cp->th); - - /* initializations */ - l_tcp = &p_cp->tcps[p_tile_no]; - l_bound = l_tcp->numpocs + 1; - - l_data_stride = 4 * OPJ_J2K_MAXRLVLS; - l_tmp_data = (OPJ_UINT32*)opj_malloc( - l_data_stride * numcomps * sizeof(OPJ_UINT32)); - if - (! 
l_tmp_data) { - return 00; - } - l_tmp_ptr = (OPJ_UINT32**)opj_malloc( - numcomps * sizeof(OPJ_UINT32 *)); - if - (! l_tmp_ptr) { - opj_free(l_tmp_data); - return 00; - } - - /* memory allocation for pi */ - l_pi = opj_pi_create(p_image, p_cp, p_tile_no); - if (!l_pi) { - opj_free(l_tmp_data); - opj_free(l_tmp_ptr); - return 00; - } - - l_encoding_value_ptr = l_tmp_data; - /* update pointer array */ - for - (compno = 0; compno < numcomps; ++compno) { - l_tmp_ptr[compno] = l_encoding_value_ptr; - l_encoding_value_ptr += l_data_stride; - } - /* get encoding parameters */ - opj_get_all_encoding_parameters(p_image, p_cp, p_tile_no, &l_tx0, &l_tx1, - &l_ty0, &l_ty1, &l_dx_min, &l_dy_min, &l_max_prec, &l_max_res, l_tmp_ptr); - - /* step calculations */ - l_step_p = 1; - l_step_c = l_max_prec * l_step_p; - l_step_r = numcomps * l_step_c; - l_step_l = l_max_res * l_step_r; - - /* set values for first packet iterator */ - l_current_pi = l_pi; - - /* memory allocation for include */ - /* prevent an integer overflow issue */ - /* 0 < l_tcp->numlayers < 65536 c.f. opj_j2k_read_cod in j2k.c */ - l_current_pi->include = 00; - if (l_step_l <= (UINT_MAX / (l_tcp->numlayers + 1U))) { - l_current_pi->include_size = (l_tcp->numlayers + 1U) * l_step_l; - l_current_pi->include = (OPJ_INT16*) opj_calloc( - l_current_pi->include_size, sizeof(OPJ_INT16)); - } - - if (!l_current_pi->include) { - opj_free(l_tmp_data); - opj_free(l_tmp_ptr); - opj_pi_destroy(l_pi, l_bound); - return 00; - } - - /* special treatment for the first packet iterator */ - l_current_comp = l_current_pi->comps; - l_img_comp = p_image->comps; - l_tccp = l_tcp->tccps; - - l_current_pi->tx0 = l_tx0; - l_current_pi->ty0 = l_ty0; - l_current_pi->tx1 = l_tx1; - l_current_pi->ty1 = l_ty1; - - /*l_current_pi->dx = l_img_comp->dx;*/ - /*l_current_pi->dy = l_img_comp->dy;*/ - - l_current_pi->step_p = l_step_p; - l_current_pi->step_c = l_step_c; - l_current_pi->step_r = l_step_r; - l_current_pi->step_l = l_step_l; - - /* allocation for components and number of components has already been calculated by opj_pi_create */ - for - (compno = 0; compno < numcomps; ++compno) { - opj_pi_resolution_t *l_res = l_current_comp->resolutions; - l_encoding_value_ptr = l_tmp_ptr[compno]; - - l_current_comp->dx = l_img_comp->dx; - l_current_comp->dy = l_img_comp->dy; - /* resolutions have already been initialized */ - for - (resno = 0; resno < l_current_comp->numresolutions; resno++) { - l_res->pdx = *(l_encoding_value_ptr++); - l_res->pdy = *(l_encoding_value_ptr++); - l_res->pw = *(l_encoding_value_ptr++); - l_res->ph = *(l_encoding_value_ptr++); - ++l_res; - } - ++l_current_comp; - ++l_img_comp; - ++l_tccp; - } - ++l_current_pi; - - for (pino = 1 ; pino < l_bound ; ++pino) { - l_current_comp = l_current_pi->comps; - l_img_comp = p_image->comps; - l_tccp = l_tcp->tccps; - - l_current_pi->tx0 = l_tx0; - l_current_pi->ty0 = l_ty0; - l_current_pi->tx1 = l_tx1; - l_current_pi->ty1 = l_ty1; - /*l_current_pi->dx = l_dx_min;*/ - /*l_current_pi->dy = l_dy_min;*/ - l_current_pi->step_p = l_step_p; - l_current_pi->step_c = l_step_c; - l_current_pi->step_r = l_step_r; - l_current_pi->step_l = l_step_l; - - /* allocation for components and number of components has already been calculated by opj_pi_create */ - for - (compno = 0; compno < numcomps; ++compno) { - opj_pi_resolution_t *l_res = l_current_comp->resolutions; - l_encoding_value_ptr = l_tmp_ptr[compno]; - - l_current_comp->dx = l_img_comp->dx; - l_current_comp->dy = l_img_comp->dy; - /* resolutions have already been 
initialized */ - for - (resno = 0; resno < l_current_comp->numresolutions; resno++) { - l_res->pdx = *(l_encoding_value_ptr++); - l_res->pdy = *(l_encoding_value_ptr++); - l_res->pw = *(l_encoding_value_ptr++); - l_res->ph = *(l_encoding_value_ptr++); - ++l_res; - } - ++l_current_comp; - ++l_img_comp; - ++l_tccp; - } - /* special treatment*/ - l_current_pi->include = (l_current_pi - 1)->include; - l_current_pi->include_size = (l_current_pi - 1)->include_size; - ++l_current_pi; - } - opj_free(l_tmp_data); - l_tmp_data = 00; - opj_free(l_tmp_ptr); - l_tmp_ptr = 00; - if - (l_tcp->POC) { - opj_pi_update_decode_poc(l_pi, l_tcp, l_max_prec, l_max_res); - } else { - opj_pi_update_decode_not_poc(l_pi, l_tcp, l_max_prec, l_max_res); - } - return l_pi; -} - - - -opj_pi_iterator_t *opj_pi_initialise_encode(const opj_image_t *p_image, - opj_cp_t *p_cp, - OPJ_UINT32 p_tile_no, - J2K_T2_MODE p_t2_mode) -{ - OPJ_UINT32 numcomps = p_image->numcomps; - - /* loop*/ - OPJ_UINT32 pino; - OPJ_UINT32 compno, resno; - - /* to store w, h, dx and dy fro all components and resolutions*/ - OPJ_UINT32 * l_tmp_data; - OPJ_UINT32 ** l_tmp_ptr; - - /* encoding prameters to set*/ - OPJ_UINT32 l_max_res; - OPJ_UINT32 l_max_prec; - OPJ_INT32 l_tx0, l_tx1, l_ty0, l_ty1; - OPJ_UINT32 l_dx_min, l_dy_min; - OPJ_UINT32 l_bound; - OPJ_UINT32 l_step_p, l_step_c, l_step_r, l_step_l ; - OPJ_UINT32 l_data_stride; - - /* pointers*/ - opj_pi_iterator_t *l_pi = 00; - opj_tcp_t *l_tcp = 00; - const opj_tccp_t *l_tccp = 00; - opj_pi_comp_t *l_current_comp = 00; - opj_image_comp_t * l_img_comp = 00; - opj_pi_iterator_t * l_current_pi = 00; - OPJ_UINT32 * l_encoding_value_ptr = 00; - - /* preconditions in debug*/ - assert(p_cp != 00); - assert(p_image != 00); - assert(p_tile_no < p_cp->tw * p_cp->th); - - /* initializations*/ - l_tcp = &p_cp->tcps[p_tile_no]; - l_bound = l_tcp->numpocs + 1; - - l_data_stride = 4 * OPJ_J2K_MAXRLVLS; - l_tmp_data = (OPJ_UINT32*)opj_malloc( - l_data_stride * numcomps * sizeof(OPJ_UINT32)); - if (! l_tmp_data) { - return 00; - } - - l_tmp_ptr = (OPJ_UINT32**)opj_malloc( - numcomps * sizeof(OPJ_UINT32 *)); - if (! 
l_tmp_ptr) { - opj_free(l_tmp_data); - return 00; - } - - /* memory allocation for pi*/ - l_pi = opj_pi_create(p_image, p_cp, p_tile_no); - if (!l_pi) { - opj_free(l_tmp_data); - opj_free(l_tmp_ptr); - return 00; - } - - l_encoding_value_ptr = l_tmp_data; - /* update pointer array*/ - for (compno = 0; compno < numcomps; ++compno) { - l_tmp_ptr[compno] = l_encoding_value_ptr; - l_encoding_value_ptr += l_data_stride; - } - - /* get encoding parameters*/ - opj_get_all_encoding_parameters(p_image, p_cp, p_tile_no, &l_tx0, &l_tx1, - &l_ty0, &l_ty1, &l_dx_min, &l_dy_min, &l_max_prec, &l_max_res, l_tmp_ptr); - - /* step calculations*/ - l_step_p = 1; - l_step_c = l_max_prec * l_step_p; - l_step_r = numcomps * l_step_c; - l_step_l = l_max_res * l_step_r; - - /* set values for first packet iterator*/ - l_pi->tp_on = (OPJ_BYTE)p_cp->m_specific_param.m_enc.m_tp_on; - l_current_pi = l_pi; - - /* memory allocation for include*/ - l_current_pi->include_size = l_tcp->numlayers * l_step_l; - l_current_pi->include = (OPJ_INT16*) opj_calloc(l_current_pi->include_size, - sizeof(OPJ_INT16)); - if (!l_current_pi->include) { - opj_free(l_tmp_data); - opj_free(l_tmp_ptr); - opj_pi_destroy(l_pi, l_bound); - return 00; - } - - /* special treatment for the first packet iterator*/ - l_current_comp = l_current_pi->comps; - l_img_comp = p_image->comps; - l_tccp = l_tcp->tccps; - l_current_pi->tx0 = l_tx0; - l_current_pi->ty0 = l_ty0; - l_current_pi->tx1 = l_tx1; - l_current_pi->ty1 = l_ty1; - l_current_pi->dx = l_dx_min; - l_current_pi->dy = l_dy_min; - l_current_pi->step_p = l_step_p; - l_current_pi->step_c = l_step_c; - l_current_pi->step_r = l_step_r; - l_current_pi->step_l = l_step_l; - - /* allocation for components and number of components has already been calculated by opj_pi_create */ - for (compno = 0; compno < numcomps; ++compno) { - opj_pi_resolution_t *l_res = l_current_comp->resolutions; - l_encoding_value_ptr = l_tmp_ptr[compno]; - - l_current_comp->dx = l_img_comp->dx; - l_current_comp->dy = l_img_comp->dy; - - /* resolutions have already been initialized */ - for (resno = 0; resno < l_current_comp->numresolutions; resno++) { - l_res->pdx = *(l_encoding_value_ptr++); - l_res->pdy = *(l_encoding_value_ptr++); - l_res->pw = *(l_encoding_value_ptr++); - l_res->ph = *(l_encoding_value_ptr++); - ++l_res; - } - - ++l_current_comp; - ++l_img_comp; - ++l_tccp; - } - ++l_current_pi; - - for (pino = 1 ; pino < l_bound ; ++pino) { - l_current_comp = l_current_pi->comps; - l_img_comp = p_image->comps; - l_tccp = l_tcp->tccps; - - l_current_pi->tx0 = l_tx0; - l_current_pi->ty0 = l_ty0; - l_current_pi->tx1 = l_tx1; - l_current_pi->ty1 = l_ty1; - l_current_pi->dx = l_dx_min; - l_current_pi->dy = l_dy_min; - l_current_pi->step_p = l_step_p; - l_current_pi->step_c = l_step_c; - l_current_pi->step_r = l_step_r; - l_current_pi->step_l = l_step_l; - - /* allocation for components and number of components has already been calculated by opj_pi_create */ - for (compno = 0; compno < numcomps; ++compno) { - opj_pi_resolution_t *l_res = l_current_comp->resolutions; - l_encoding_value_ptr = l_tmp_ptr[compno]; - - l_current_comp->dx = l_img_comp->dx; - l_current_comp->dy = l_img_comp->dy; - /* resolutions have already been initialized */ - for (resno = 0; resno < l_current_comp->numresolutions; resno++) { - l_res->pdx = *(l_encoding_value_ptr++); - l_res->pdy = *(l_encoding_value_ptr++); - l_res->pw = *(l_encoding_value_ptr++); - l_res->ph = *(l_encoding_value_ptr++); - ++l_res; - } - ++l_current_comp; - ++l_img_comp; - ++l_tccp; 
- } - - /* special treatment*/ - l_current_pi->include = (l_current_pi - 1)->include; - l_current_pi->include_size = (l_current_pi - 1)->include_size; - ++l_current_pi; - } - - opj_free(l_tmp_data); - l_tmp_data = 00; - opj_free(l_tmp_ptr); - l_tmp_ptr = 00; - - if (l_tcp->POC && (OPJ_IS_CINEMA(p_cp->rsiz) || p_t2_mode == FINAL_PASS)) { - opj_pi_update_encode_poc_and_final(p_cp, p_tile_no, l_tx0, l_tx1, l_ty0, l_ty1, - l_max_prec, l_max_res, l_dx_min, l_dy_min); - } else { - opj_pi_update_encode_not_poc(p_cp, numcomps, p_tile_no, l_tx0, l_tx1, - l_ty0, l_ty1, l_max_prec, l_max_res, l_dx_min, l_dy_min); - } - - return l_pi; -} - -void opj_pi_create_encode(opj_pi_iterator_t *pi, - opj_cp_t *cp, - OPJ_UINT32 tileno, - OPJ_UINT32 pino, - OPJ_UINT32 tpnum, - OPJ_INT32 tppos, - J2K_T2_MODE t2_mode) -{ - const OPJ_CHAR *prog; - OPJ_INT32 i; - OPJ_UINT32 incr_top = 1, resetX = 0; - opj_tcp_t *tcps = &cp->tcps[tileno]; - opj_poc_t *tcp = &tcps->pocs[pino]; - - prog = opj_j2k_convert_progression_order(tcp->prg); - - pi[pino].first = 1; - pi[pino].poc.prg = tcp->prg; - - if (!(cp->m_specific_param.m_enc.m_tp_on && ((!OPJ_IS_CINEMA(cp->rsiz) && - (t2_mode == FINAL_PASS)) || OPJ_IS_CINEMA(cp->rsiz)))) { - pi[pino].poc.resno0 = tcp->resS; - pi[pino].poc.resno1 = tcp->resE; - pi[pino].poc.compno0 = tcp->compS; - pi[pino].poc.compno1 = tcp->compE; - pi[pino].poc.layno0 = tcp->layS; - pi[pino].poc.layno1 = tcp->layE; - pi[pino].poc.precno0 = tcp->prcS; - pi[pino].poc.precno1 = tcp->prcE; - pi[pino].poc.tx0 = (OPJ_INT32)tcp->txS; - pi[pino].poc.ty0 = (OPJ_INT32)tcp->tyS; - pi[pino].poc.tx1 = (OPJ_INT32)tcp->txE; - pi[pino].poc.ty1 = (OPJ_INT32)tcp->tyE; - } else { - for (i = tppos + 1; i < 4; i++) { - switch (prog[i]) { - case 'R': - pi[pino].poc.resno0 = tcp->resS; - pi[pino].poc.resno1 = tcp->resE; - break; - case 'C': - pi[pino].poc.compno0 = tcp->compS; - pi[pino].poc.compno1 = tcp->compE; - break; - case 'L': - pi[pino].poc.layno0 = tcp->layS; - pi[pino].poc.layno1 = tcp->layE; - break; - case 'P': - switch (tcp->prg) { - case OPJ_LRCP: - case OPJ_RLCP: - pi[pino].poc.precno0 = tcp->prcS; - pi[pino].poc.precno1 = tcp->prcE; - break; - default: - pi[pino].poc.tx0 = (OPJ_INT32)tcp->txS; - pi[pino].poc.ty0 = (OPJ_INT32)tcp->tyS; - pi[pino].poc.tx1 = (OPJ_INT32)tcp->txE; - pi[pino].poc.ty1 = (OPJ_INT32)tcp->tyE; - break; - } - break; - } - } - - if (tpnum == 0) { - for (i = tppos; i >= 0; i--) { - switch (prog[i]) { - case 'C': - tcp->comp_t = tcp->compS; - pi[pino].poc.compno0 = tcp->comp_t; - pi[pino].poc.compno1 = tcp->comp_t + 1; - tcp->comp_t += 1; - break; - case 'R': - tcp->res_t = tcp->resS; - pi[pino].poc.resno0 = tcp->res_t; - pi[pino].poc.resno1 = tcp->res_t + 1; - tcp->res_t += 1; - break; - case 'L': - tcp->lay_t = tcp->layS; - pi[pino].poc.layno0 = tcp->lay_t; - pi[pino].poc.layno1 = tcp->lay_t + 1; - tcp->lay_t += 1; - break; - case 'P': - switch (tcp->prg) { - case OPJ_LRCP: - case OPJ_RLCP: - tcp->prc_t = tcp->prcS; - pi[pino].poc.precno0 = tcp->prc_t; - pi[pino].poc.precno1 = tcp->prc_t + 1; - tcp->prc_t += 1; - break; - default: - tcp->tx0_t = tcp->txS; - tcp->ty0_t = tcp->tyS; - pi[pino].poc.tx0 = (OPJ_INT32)tcp->tx0_t; - pi[pino].poc.tx1 = (OPJ_INT32)(tcp->tx0_t + tcp->dx - (tcp->tx0_t % tcp->dx)); - pi[pino].poc.ty0 = (OPJ_INT32)tcp->ty0_t; - pi[pino].poc.ty1 = (OPJ_INT32)(tcp->ty0_t + tcp->dy - (tcp->ty0_t % tcp->dy)); - tcp->tx0_t = (OPJ_UINT32)pi[pino].poc.tx1; - tcp->ty0_t = (OPJ_UINT32)pi[pino].poc.ty1; - break; - } - break; - } - } - incr_top = 1; - } else { - for (i = tppos; i 
>= 0; i--) { - switch (prog[i]) { - case 'C': - pi[pino].poc.compno0 = tcp->comp_t - 1; - pi[pino].poc.compno1 = tcp->comp_t; - break; - case 'R': - pi[pino].poc.resno0 = tcp->res_t - 1; - pi[pino].poc.resno1 = tcp->res_t; - break; - case 'L': - pi[pino].poc.layno0 = tcp->lay_t - 1; - pi[pino].poc.layno1 = tcp->lay_t; - break; - case 'P': - switch (tcp->prg) { - case OPJ_LRCP: - case OPJ_RLCP: - pi[pino].poc.precno0 = tcp->prc_t - 1; - pi[pino].poc.precno1 = tcp->prc_t; - break; - default: - pi[pino].poc.tx0 = (OPJ_INT32)(tcp->tx0_t - tcp->dx - (tcp->tx0_t % tcp->dx)); - pi[pino].poc.tx1 = (OPJ_INT32)tcp->tx0_t ; - pi[pino].poc.ty0 = (OPJ_INT32)(tcp->ty0_t - tcp->dy - (tcp->ty0_t % tcp->dy)); - pi[pino].poc.ty1 = (OPJ_INT32)tcp->ty0_t ; - break; - } - break; - } - if (incr_top == 1) { - switch (prog[i]) { - case 'R': - if (tcp->res_t == tcp->resE) { - if (opj_pi_check_next_level(i - 1, cp, tileno, pino, prog)) { - tcp->res_t = tcp->resS; - pi[pino].poc.resno0 = tcp->res_t; - pi[pino].poc.resno1 = tcp->res_t + 1; - tcp->res_t += 1; - incr_top = 1; - } else { - incr_top = 0; - } - } else { - pi[pino].poc.resno0 = tcp->res_t; - pi[pino].poc.resno1 = tcp->res_t + 1; - tcp->res_t += 1; - incr_top = 0; - } - break; - case 'C': - if (tcp->comp_t == tcp->compE) { - if (opj_pi_check_next_level(i - 1, cp, tileno, pino, prog)) { - tcp->comp_t = tcp->compS; - pi[pino].poc.compno0 = tcp->comp_t; - pi[pino].poc.compno1 = tcp->comp_t + 1; - tcp->comp_t += 1; - incr_top = 1; - } else { - incr_top = 0; - } - } else { - pi[pino].poc.compno0 = tcp->comp_t; - pi[pino].poc.compno1 = tcp->comp_t + 1; - tcp->comp_t += 1; - incr_top = 0; - } - break; - case 'L': - if (tcp->lay_t == tcp->layE) { - if (opj_pi_check_next_level(i - 1, cp, tileno, pino, prog)) { - tcp->lay_t = tcp->layS; - pi[pino].poc.layno0 = tcp->lay_t; - pi[pino].poc.layno1 = tcp->lay_t + 1; - tcp->lay_t += 1; - incr_top = 1; - } else { - incr_top = 0; - } - } else { - pi[pino].poc.layno0 = tcp->lay_t; - pi[pino].poc.layno1 = tcp->lay_t + 1; - tcp->lay_t += 1; - incr_top = 0; - } - break; - case 'P': - switch (tcp->prg) { - case OPJ_LRCP: - case OPJ_RLCP: - if (tcp->prc_t == tcp->prcE) { - if (opj_pi_check_next_level(i - 1, cp, tileno, pino, prog)) { - tcp->prc_t = tcp->prcS; - pi[pino].poc.precno0 = tcp->prc_t; - pi[pino].poc.precno1 = tcp->prc_t + 1; - tcp->prc_t += 1; - incr_top = 1; - } else { - incr_top = 0; - } - } else { - pi[pino].poc.precno0 = tcp->prc_t; - pi[pino].poc.precno1 = tcp->prc_t + 1; - tcp->prc_t += 1; - incr_top = 0; - } - break; - default: - if (tcp->tx0_t >= tcp->txE) { - if (tcp->ty0_t >= tcp->tyE) { - if (opj_pi_check_next_level(i - 1, cp, tileno, pino, prog)) { - tcp->ty0_t = tcp->tyS; - pi[pino].poc.ty0 = (OPJ_INT32)tcp->ty0_t; - pi[pino].poc.ty1 = (OPJ_INT32)(tcp->ty0_t + tcp->dy - (tcp->ty0_t % tcp->dy)); - tcp->ty0_t = (OPJ_UINT32)pi[pino].poc.ty1; - incr_top = 1; - resetX = 1; - } else { - incr_top = 0; - resetX = 0; - } - } else { - pi[pino].poc.ty0 = (OPJ_INT32)tcp->ty0_t; - pi[pino].poc.ty1 = (OPJ_INT32)(tcp->ty0_t + tcp->dy - (tcp->ty0_t % tcp->dy)); - tcp->ty0_t = (OPJ_UINT32)pi[pino].poc.ty1; - incr_top = 0; - resetX = 1; - } - if (resetX == 1) { - tcp->tx0_t = tcp->txS; - pi[pino].poc.tx0 = (OPJ_INT32)tcp->tx0_t; - pi[pino].poc.tx1 = (OPJ_INT32)(tcp->tx0_t + tcp->dx - (tcp->tx0_t % tcp->dx)); - tcp->tx0_t = (OPJ_UINT32)pi[pino].poc.tx1; - } - } else { - pi[pino].poc.tx0 = (OPJ_INT32)tcp->tx0_t; - pi[pino].poc.tx1 = (OPJ_INT32)(tcp->tx0_t + tcp->dx - (tcp->tx0_t % tcp->dx)); - tcp->tx0_t = 
(OPJ_UINT32)pi[pino].poc.tx1; - incr_top = 0; - } - break; - } - break; - } - } - } - } - } -} - -void opj_pi_destroy(opj_pi_iterator_t *p_pi, - OPJ_UINT32 p_nb_elements) -{ - OPJ_UINT32 compno, pino; - opj_pi_iterator_t *l_current_pi = p_pi; - if (p_pi) { - if (p_pi->include) { - opj_free(p_pi->include); - p_pi->include = 00; - } - for (pino = 0; pino < p_nb_elements; ++pino) { - if (l_current_pi->comps) { - opj_pi_comp_t *l_current_component = l_current_pi->comps; - for (compno = 0; compno < l_current_pi->numcomps; compno++) { - if (l_current_component->resolutions) { - opj_free(l_current_component->resolutions); - l_current_component->resolutions = 00; - } - - ++l_current_component; - } - opj_free(l_current_pi->comps); - l_current_pi->comps = 0; - } - ++l_current_pi; - } - opj_free(p_pi); - } -} - - - -void opj_pi_update_encoding_parameters(const opj_image_t *p_image, - opj_cp_t *p_cp, - OPJ_UINT32 p_tile_no) -{ - /* encoding parameters to set */ - OPJ_UINT32 l_max_res; - OPJ_UINT32 l_max_prec; - OPJ_INT32 l_tx0, l_tx1, l_ty0, l_ty1; - OPJ_UINT32 l_dx_min, l_dy_min; - - /* pointers */ - opj_tcp_t *l_tcp = 00; - - /* preconditions */ - assert(p_cp != 00); - assert(p_image != 00); - assert(p_tile_no < p_cp->tw * p_cp->th); - - l_tcp = &(p_cp->tcps[p_tile_no]); - - /* get encoding parameters */ - opj_get_encoding_parameters(p_image, p_cp, p_tile_no, &l_tx0, &l_tx1, &l_ty0, - &l_ty1, &l_dx_min, &l_dy_min, &l_max_prec, &l_max_res); - - if (l_tcp->POC) { - opj_pi_update_encode_poc_and_final(p_cp, p_tile_no, l_tx0, l_tx1, l_ty0, l_ty1, - l_max_prec, l_max_res, l_dx_min, l_dy_min); - } else { - opj_pi_update_encode_not_poc(p_cp, p_image->numcomps, p_tile_no, l_tx0, l_tx1, - l_ty0, l_ty1, l_max_prec, l_max_res, l_dx_min, l_dy_min); - } -} - -OPJ_BOOL opj_pi_next(opj_pi_iterator_t * pi) -{ - switch (pi->poc.prg) { - case OPJ_LRCP: - return opj_pi_next_lrcp(pi); - case OPJ_RLCP: - return opj_pi_next_rlcp(pi); - case OPJ_RPCL: - return opj_pi_next_rpcl(pi); - case OPJ_PCRL: - return opj_pi_next_pcrl(pi); - case OPJ_CPRL: - return opj_pi_next_cprl(pi); - case OPJ_PROG_UNKNOWN: - return OPJ_FALSE; - } - - return OPJ_FALSE; -} diff --git a/src/3rd/LibOpenJpeg/pi.h b/src/3rd/LibOpenJpeg/pi.h deleted file mode 100644 index 8c0dc25c..00000000 --- a/src/3rd/LibOpenJpeg/pi.h +++ /dev/null @@ -1,190 +0,0 @@ -/* - * The copyright in this software is being made available under the 2-clauses - * BSD License, included below. This software may be subject to other third - * party and contributor rights, including patent rights, and no such rights - * are granted under this license. - * - * Copyright (c) 2002-2014, Universite catholique de Louvain (UCL), Belgium - * Copyright (c) 2002-2014, Professor Benoit Macq - * Copyright (c) 2001-2003, David Janssens - * Copyright (c) 2002-2003, Yannick Verschueren - * Copyright (c) 2003-2007, Francois-Olivier Devaux - * Copyright (c) 2003-2014, Antonin Descampe - * Copyright (c) 2005, Herve Drolon, FreeImage Team - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS' - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef OPJ_PI_H -#define OPJ_PI_H -/** -@file pi.h -@brief Implementation of a packet iterator (PI) - -The functions in PI.C have for goal to realize a packet iterator that permits to get the next -packet following the progression order and change of it. The functions in PI.C are used -by some function in T2.C. -*/ - -/** @defgroup PI PI - Implementation of a packet iterator */ -/*@{*/ - -/** -FIXME DOC -*/ -typedef struct opj_pi_resolution { - OPJ_UINT32 pdx, pdy; - OPJ_UINT32 pw, ph; -} opj_pi_resolution_t; - -/** -FIXME DOC -*/ -typedef struct opj_pi_comp { - OPJ_UINT32 dx, dy; - /** number of resolution levels */ - OPJ_UINT32 numresolutions; - opj_pi_resolution_t *resolutions; -} opj_pi_comp_t; - -/** -Packet iterator -*/ -typedef struct opj_pi_iterator { - /** Enabling Tile part generation*/ - OPJ_BYTE tp_on; - /** precise if the packet has been already used (useful for progression order change) */ - OPJ_INT16 *include; - /** Number of elements in include array */ - OPJ_UINT32 include_size; - /** layer step used to localize the packet in the include vector */ - OPJ_UINT32 step_l; - /** resolution step used to localize the packet in the include vector */ - OPJ_UINT32 step_r; - /** component step used to localize the packet in the include vector */ - OPJ_UINT32 step_c; - /** precinct step used to localize the packet in the include vector */ - OPJ_UINT32 step_p; - /** component that identify the packet */ - OPJ_UINT32 compno; - /** resolution that identify the packet */ - OPJ_UINT32 resno; - /** precinct that identify the packet */ - OPJ_UINT32 precno; - /** layer that identify the packet */ - OPJ_UINT32 layno; - /** 0 if the first packet */ - OPJ_BOOL first; - /** progression order change information */ - opj_poc_t poc; - /** number of components in the image */ - OPJ_UINT32 numcomps; - /** Components*/ - opj_pi_comp_t *comps; - /** FIXME DOC*/ - OPJ_INT32 tx0, ty0, tx1, ty1; - /** FIXME DOC*/ - OPJ_INT32 x, y; - /** FIXME DOC*/ - OPJ_UINT32 dx, dy; -} opj_pi_iterator_t; - -/** @name Exported functions */ -/*@{*/ -/* ----------------------------------------------------------------------- */ -/** - * Creates a packet iterator for encoding. - * - * @param image the image being encoded. - * @param cp the coding parameters. - * @param tileno index of the tile being encoded. - * @param t2_mode the type of pass for generating the packet iterator - * - * @return a list of packet iterator that points to the first packet of the tile (not true). -*/ -opj_pi_iterator_t *opj_pi_initialise_encode(const opj_image_t *image, - opj_cp_t *cp, - OPJ_UINT32 tileno, - J2K_T2_MODE t2_mode); - -/** - * Updates the encoding parameters of the codec. - * - * @param p_image the image being encoded. 
- * @param p_cp the coding parameters. - * @param p_tile_no index of the tile being encoded. -*/ -void opj_pi_update_encoding_parameters(const opj_image_t *p_image, - opj_cp_t *p_cp, - OPJ_UINT32 p_tile_no); - -/** -Modify the packet iterator for enabling tile part generation -@param pi Handle to the packet iterator generated in pi_initialise_encode -@param cp Coding parameters -@param tileno Number that identifies the tile for which to list the packets -@param pino FIXME DOC -@param tpnum Tile part number of the current tile -@param tppos The position of the tile part flag in the progression order -@param t2_mode FIXME DOC -*/ -void opj_pi_create_encode(opj_pi_iterator_t *pi, - opj_cp_t *cp, - OPJ_UINT32 tileno, - OPJ_UINT32 pino, - OPJ_UINT32 tpnum, - OPJ_INT32 tppos, - J2K_T2_MODE t2_mode); - -/** -Create a packet iterator for Decoder -@param image Raw image for which the packets will be listed -@param cp Coding parameters -@param tileno Number that identifies the tile for which to list the packets -@return Returns a packet iterator that points to the first packet of the tile -@see opj_pi_destroy -*/ -opj_pi_iterator_t *opj_pi_create_decode(opj_image_t * image, - opj_cp_t * cp, - OPJ_UINT32 tileno); -/** - * Destroys a packet iterator array. - * - * @param p_pi the packet iterator array to destroy. - * @param p_nb_elements the number of elements in the array. - */ -void opj_pi_destroy(opj_pi_iterator_t *p_pi, - OPJ_UINT32 p_nb_elements); - -/** -Modify the packet iterator to point to the next packet -@param pi Packet iterator to modify -@return Returns false if pi pointed to the last packet or else returns true -*/ -OPJ_BOOL opj_pi_next(opj_pi_iterator_t * pi); -/* ----------------------------------------------------------------------- */ -/*@}*/ - -/*@}*/ - -#endif /* OPJ_PI_H */ diff --git a/src/3rd/LibOpenJpeg/sparse_array.c b/src/3rd/LibOpenJpeg/sparse_array.c deleted file mode 100644 index 73192924..00000000 --- a/src/3rd/LibOpenJpeg/sparse_array.c +++ /dev/null @@ -1,346 +0,0 @@ -/* - * The copyright in this software is being made available under the 2-clauses - * BSD License, included below. This software may be subject to other third - * party and contributor rights, including patent rights, and no such rights - * are granted under this license. - * - * Copyright (c) 2017, IntoPix SA - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS' - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
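
The pi.h declarations above define the whole packet-iterator life cycle: opj_pi_create_decode returns an array of iterators (one per progression-order change, numpocs + 1 in total), opj_pi_next advances an iterator to the next packet and returns OPJ_FALSE after the last one, and opj_pi_destroy releases the array. A minimal sketch of that calling pattern, assuming image, cp and tileno come from surrounding codec state and showing only the first iterator (the real caller in t2.c walks all of them):

    opj_pi_iterator_t *pi = opj_pi_create_decode(image, cp, tileno);
    if (pi != NULL) {
        /* opj_pi_next() steps pi[0] through the packets of the tile in
         * progression order; it returns OPJ_FALSE after the last packet. */
        while (opj_pi_next(&pi[0])) {
            /* pi[0].compno, pi[0].resno, pi[0].precno and pi[0].layno now
             * identify the packet to process. */
        }
        /* opj_pi_create() allocated numpocs + 1 iterators for the tile */
        opj_pi_destroy(pi, cp->tcps[tileno].numpocs + 1);
    }
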
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#include "opj_includes.h" - - -struct opj_sparse_array_int32 { - OPJ_UINT32 width; - OPJ_UINT32 height; - OPJ_UINT32 block_width; - OPJ_UINT32 block_height; - OPJ_UINT32 block_count_hor; - OPJ_UINT32 block_count_ver; - OPJ_INT32** data_blocks; -}; - -opj_sparse_array_int32_t* opj_sparse_array_int32_create(OPJ_UINT32 width, - OPJ_UINT32 height, - OPJ_UINT32 block_width, - OPJ_UINT32 block_height) -{ - opj_sparse_array_int32_t* sa; - - if (width == 0 || height == 0 || block_width == 0 || block_height == 0) { - return NULL; - } - if (block_width > ((OPJ_UINT32)~0U) / block_height / sizeof(OPJ_INT32)) { - return NULL; - } - - sa = (opj_sparse_array_int32_t*) opj_calloc(1, - sizeof(opj_sparse_array_int32_t)); - sa->width = width; - sa->height = height; - sa->block_width = block_width; - sa->block_height = block_height; - sa->block_count_hor = opj_uint_ceildiv(width, block_width); - sa->block_count_ver = opj_uint_ceildiv(height, block_height); - if (sa->block_count_hor > ((OPJ_UINT32)~0U) / sa->block_count_ver) { - opj_free(sa); - return NULL; - } - sa->data_blocks = (OPJ_INT32**) opj_calloc(sizeof(OPJ_INT32*), - sa->block_count_hor * sa->block_count_ver); - if (sa->data_blocks == NULL) { - opj_free(sa); - return NULL; - } - - return sa; -} - -void opj_sparse_array_int32_free(opj_sparse_array_int32_t* sa) -{ - if (sa) { - OPJ_UINT32 i; - for (i = 0; i < sa->block_count_hor * sa->block_count_ver; i++) { - if (sa->data_blocks[i]) { - opj_free(sa->data_blocks[i]); - } - } - opj_free(sa->data_blocks); - opj_free(sa); - } -} - -OPJ_BOOL opj_sparse_array_is_region_valid(const opj_sparse_array_int32_t* sa, - OPJ_UINT32 x0, - OPJ_UINT32 y0, - OPJ_UINT32 x1, - OPJ_UINT32 y1) -{ - return !(x0 >= sa->width || x1 <= x0 || x1 > sa->width || - y0 >= sa->height || y1 <= y0 || y1 > sa->height); -} - -static OPJ_BOOL opj_sparse_array_int32_read_or_write( - const opj_sparse_array_int32_t* sa, - OPJ_UINT32 x0, - OPJ_UINT32 y0, - OPJ_UINT32 x1, - OPJ_UINT32 y1, - OPJ_INT32* buf, - OPJ_UINT32 buf_col_stride, - OPJ_UINT32 buf_line_stride, - OPJ_BOOL forgiving, - OPJ_BOOL is_read_op) -{ - OPJ_UINT32 y, block_y; - OPJ_UINT32 y_incr = 0; - const OPJ_UINT32 block_width = sa->block_width; - - if (!opj_sparse_array_is_region_valid(sa, x0, y0, x1, y1)) { - return forgiving; - } - - block_y = y0 / sa->block_height; - for (y = y0; y < y1; block_y ++, y += y_incr) { - OPJ_UINT32 x, block_x; - OPJ_UINT32 x_incr = 0; - OPJ_UINT32 block_y_offset; - y_incr = (y == y0) ? sa->block_height - (y0 % sa->block_height) : - sa->block_height; - block_y_offset = sa->block_height - y_incr; - y_incr = opj_uint_min(y_incr, y1 - y); - block_x = x0 / block_width; - for (x = x0; x < x1; block_x ++, x += x_incr) { - OPJ_UINT32 j; - OPJ_UINT32 block_x_offset; - OPJ_INT32* src_block; - x_incr = (x == x0) ? 
block_width - (x0 % block_width) : block_width; - block_x_offset = block_width - x_incr; - x_incr = opj_uint_min(x_incr, x1 - x); - src_block = sa->data_blocks[block_y * sa->block_count_hor + block_x]; - if (is_read_op) { - if (src_block == NULL) { - if (buf_col_stride == 1) { - OPJ_INT32* dest_ptr = buf + (y - y0) * (OPJ_SIZE_T)buf_line_stride + - (x - x0) * buf_col_stride; - for (j = 0; j < y_incr; j++) { - memset(dest_ptr, 0, sizeof(OPJ_INT32) * x_incr); - dest_ptr += buf_line_stride; - } - } else { - OPJ_INT32* dest_ptr = buf + (y - y0) * (OPJ_SIZE_T)buf_line_stride + - (x - x0) * buf_col_stride; - for (j = 0; j < y_incr; j++) { - OPJ_UINT32 k; - for (k = 0; k < x_incr; k++) { - dest_ptr[k * buf_col_stride] = 0; - } - dest_ptr += buf_line_stride; - } - } - } else { - const OPJ_INT32* OPJ_RESTRICT src_ptr = src_block + block_y_offset * - (OPJ_SIZE_T)block_width + block_x_offset; - if (buf_col_stride == 1) { - OPJ_INT32* OPJ_RESTRICT dest_ptr = buf + (y - y0) * (OPJ_SIZE_T)buf_line_stride - + - (x - x0) * buf_col_stride; - if (x_incr == 4) { - /* Same code as general branch, but the compiler */ - /* can have an efficient memcpy() */ - (void)(x_incr); /* trick to silent cppcheck duplicateBranch warning */ - for (j = 0; j < y_incr; j++) { - memcpy(dest_ptr, src_ptr, sizeof(OPJ_INT32) * x_incr); - dest_ptr += buf_line_stride; - src_ptr += block_width; - } - } else { - for (j = 0; j < y_incr; j++) { - memcpy(dest_ptr, src_ptr, sizeof(OPJ_INT32) * x_incr); - dest_ptr += buf_line_stride; - src_ptr += block_width; - } - } - } else { - OPJ_INT32* OPJ_RESTRICT dest_ptr = buf + (y - y0) * (OPJ_SIZE_T)buf_line_stride - + - (x - x0) * buf_col_stride; - if (x_incr == 1) { - for (j = 0; j < y_incr; j++) { - *dest_ptr = *src_ptr; - dest_ptr += buf_line_stride; - src_ptr += block_width; - } - } else if (y_incr == 1 && buf_col_stride == 2) { - OPJ_UINT32 k; - for (k = 0; k < (x_incr & ~3U); k += 4) { - dest_ptr[k * buf_col_stride] = src_ptr[k]; - dest_ptr[(k + 1) * buf_col_stride] = src_ptr[k + 1]; - dest_ptr[(k + 2) * buf_col_stride] = src_ptr[k + 2]; - dest_ptr[(k + 3) * buf_col_stride] = src_ptr[k + 3]; - } - for (; k < x_incr; k++) { - dest_ptr[k * buf_col_stride] = src_ptr[k]; - } - } else if (x_incr >= 8 && buf_col_stride == 8) { - for (j = 0; j < y_incr; j++) { - OPJ_UINT32 k; - for (k = 0; k < (x_incr & ~3U); k += 4) { - dest_ptr[k * buf_col_stride] = src_ptr[k]; - dest_ptr[(k + 1) * buf_col_stride] = src_ptr[k + 1]; - dest_ptr[(k + 2) * buf_col_stride] = src_ptr[k + 2]; - dest_ptr[(k + 3) * buf_col_stride] = src_ptr[k + 3]; - } - for (; k < x_incr; k++) { - dest_ptr[k * buf_col_stride] = src_ptr[k]; - } - dest_ptr += buf_line_stride; - src_ptr += block_width; - } - } else { - /* General case */ - for (j = 0; j < y_incr; j++) { - OPJ_UINT32 k; - for (k = 0; k < x_incr; k++) { - dest_ptr[k * buf_col_stride] = src_ptr[k]; - } - dest_ptr += buf_line_stride; - src_ptr += block_width; - } - } - } - } - } else { - if (src_block == NULL) { - src_block = (OPJ_INT32*) opj_calloc(1, - sa->block_width * sa->block_height * sizeof(OPJ_INT32)); - if (src_block == NULL) { - return OPJ_FALSE; - } - sa->data_blocks[block_y * sa->block_count_hor + block_x] = src_block; - } - - if (buf_col_stride == 1) { - OPJ_INT32* OPJ_RESTRICT dest_ptr = src_block + block_y_offset * - (OPJ_SIZE_T)block_width + block_x_offset; - const OPJ_INT32* OPJ_RESTRICT src_ptr = buf + (y - y0) * - (OPJ_SIZE_T)buf_line_stride + (x - x0) * buf_col_stride; - if (x_incr == 4) { - /* Same code as general branch, but the compiler */ - /* can have 
an efficient memcpy() */ - (void)(x_incr); /* trick to silent cppcheck duplicateBranch warning */ - for (j = 0; j < y_incr; j++) { - memcpy(dest_ptr, src_ptr, sizeof(OPJ_INT32) * x_incr); - dest_ptr += block_width; - src_ptr += buf_line_stride; - } - } else { - for (j = 0; j < y_incr; j++) { - memcpy(dest_ptr, src_ptr, sizeof(OPJ_INT32) * x_incr); - dest_ptr += block_width; - src_ptr += buf_line_stride; - } - } - } else { - OPJ_INT32* OPJ_RESTRICT dest_ptr = src_block + block_y_offset * - (OPJ_SIZE_T)block_width + block_x_offset; - const OPJ_INT32* OPJ_RESTRICT src_ptr = buf + (y - y0) * - (OPJ_SIZE_T)buf_line_stride + (x - x0) * buf_col_stride; - if (x_incr == 1) { - for (j = 0; j < y_incr; j++) { - *dest_ptr = *src_ptr; - src_ptr += buf_line_stride; - dest_ptr += block_width; - } - } else if (x_incr >= 8 && buf_col_stride == 8) { - for (j = 0; j < y_incr; j++) { - OPJ_UINT32 k; - for (k = 0; k < (x_incr & ~3U); k += 4) { - dest_ptr[k] = src_ptr[k * buf_col_stride]; - dest_ptr[k + 1] = src_ptr[(k + 1) * buf_col_stride]; - dest_ptr[k + 2] = src_ptr[(k + 2) * buf_col_stride]; - dest_ptr[k + 3] = src_ptr[(k + 3) * buf_col_stride]; - } - for (; k < x_incr; k++) { - dest_ptr[k] = src_ptr[k * buf_col_stride]; - } - src_ptr += buf_line_stride; - dest_ptr += block_width; - } - } else { - /* General case */ - for (j = 0; j < y_incr; j++) { - OPJ_UINT32 k; - for (k = 0; k < x_incr; k++) { - dest_ptr[k] = src_ptr[k * buf_col_stride]; - } - src_ptr += buf_line_stride; - dest_ptr += block_width; - } - } - } - } - } - } - - return OPJ_TRUE; -} - -OPJ_BOOL opj_sparse_array_int32_read(const opj_sparse_array_int32_t* sa, - OPJ_UINT32 x0, - OPJ_UINT32 y0, - OPJ_UINT32 x1, - OPJ_UINT32 y1, - OPJ_INT32* dest, - OPJ_UINT32 dest_col_stride, - OPJ_UINT32 dest_line_stride, - OPJ_BOOL forgiving) -{ - return opj_sparse_array_int32_read_or_write( - (opj_sparse_array_int32_t*)sa, x0, y0, x1, y1, - dest, - dest_col_stride, - dest_line_stride, - forgiving, - OPJ_TRUE); -} - -OPJ_BOOL opj_sparse_array_int32_write(opj_sparse_array_int32_t* sa, - OPJ_UINT32 x0, - OPJ_UINT32 y0, - OPJ_UINT32 x1, - OPJ_UINT32 y1, - const OPJ_INT32* src, - OPJ_UINT32 src_col_stride, - OPJ_UINT32 src_line_stride, - OPJ_BOOL forgiving) -{ - return opj_sparse_array_int32_read_or_write(sa, x0, y0, x1, y1, - (OPJ_INT32*)src, - src_col_stride, - src_line_stride, - forgiving, - OPJ_FALSE); -} diff --git a/src/3rd/LibOpenJpeg/sparse_array.h b/src/3rd/LibOpenJpeg/sparse_array.h deleted file mode 100644 index fd927eaa..00000000 --- a/src/3rd/LibOpenJpeg/sparse_array.h +++ /dev/null @@ -1,141 +0,0 @@ -/* - * The copyright in this software is being made available under the 2-clauses - * BSD License, included below. This software may be subject to other third - * party and contributor rights, including patent rights, and no such rights - * are granted under this license. - * - * Copyright (c) 2017, IntoPix SA - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS' - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#include "opj_includes.h" - -#ifndef OPJ_SPARSE_ARRAY_H -#define OPJ_SPARSE_ARRAY_H -/** -@file sparse_array.h -@brief Sparse array management - -The functions in this file manage sparse arrays. Sparse arrays are arrays with -potential big dimensions, but with very few samples actually set. Such sparse -arrays require allocating a low amount of memory, by just allocating memory -for blocks of the array that are set. The minimum memory allocation unit is a -a block. There is a trade-off to pick up an appropriate dimension for blocks. -If it is too big, and pixels set are far from each other, too much memory will -be used. If blocks are too small, the book-keeping costs of blocks will raise. -*/ - -/** @defgroup SPARSE_ARRAY SPARSE ARRAYS - Sparse arrays */ -/*@{*/ - -/** Opaque type for sparse arrays that contain int32 values */ -typedef struct opj_sparse_array_int32 opj_sparse_array_int32_t; - -/** Creates a new sparse array. - * @param width total width of the array. - * @param height total height of the array - * @param block_width width of a block. - * @param block_height height of a block. - * @return a new sparse array instance, or NULL in case of failure. - */ -opj_sparse_array_int32_t* opj_sparse_array_int32_create(OPJ_UINT32 width, - OPJ_UINT32 height, - OPJ_UINT32 block_width, - OPJ_UINT32 block_height); - -/** Frees a sparse array. - * @param sa sparse array instance. - */ -void opj_sparse_array_int32_free(opj_sparse_array_int32_t* sa); - -/** Returns whether region bounds are valid (non empty and within array bounds) - * @param sa sparse array instance. - * @param x0 left x coordinate of the region. - * @param y0 top x coordinate of the region. - * @param x1 right x coordinate (not included) of the region. Must be greater than x0. - * @param y1 bottom y coordinate (not included) of the region. Must be greater than y0. - * @return OPJ_TRUE or OPJ_FALSE. - */ -OPJ_BOOL opj_sparse_array_is_region_valid(const opj_sparse_array_int32_t* sa, - OPJ_UINT32 x0, - OPJ_UINT32 y0, - OPJ_UINT32 x1, - OPJ_UINT32 y1); - -/** Read the content of a rectangular region of the sparse array into a - * user buffer. - * - * Regions not written with opj_sparse_array_int32_write() are read as 0. - * - * @param sa sparse array instance. - * @param x0 left x coordinate of the region to read in the sparse array. - * @param y0 top x coordinate of the region to read in the sparse array. - * @param x1 right x coordinate (not included) of the region to read in the sparse array. Must be greater than x0. - * @param y1 bottom y coordinate (not included) of the region to read in the sparse array. Must be greater than y0. - * @param dest user buffer to fill. 
Must be at least sizeof(int32) * ( (y1 - y0 - 1) * dest_line_stride + (x1 - x0 - 1) * dest_col_stride + 1) bytes large. - * @param dest_col_stride spacing (in elements, not in bytes) in x dimension between consecutive elements of the user buffer. - * @param dest_line_stride spacing (in elements, not in bytes) in y dimension between consecutive elements of the user buffer. - * @param forgiving if set to TRUE and the region is invalid, OPJ_TRUE will still be returned. - * @return OPJ_TRUE in case of success. - */ -OPJ_BOOL opj_sparse_array_int32_read(const opj_sparse_array_int32_t* sa, - OPJ_UINT32 x0, - OPJ_UINT32 y0, - OPJ_UINT32 x1, - OPJ_UINT32 y1, - OPJ_INT32* dest, - OPJ_UINT32 dest_col_stride, - OPJ_UINT32 dest_line_stride, - OPJ_BOOL forgiving); - - -/** Write the content of a rectangular region into the sparse array from a - * user buffer. - * - * Blocks intersecting the region are allocated, if not already done. - * - * @param sa sparse array instance. - * @param x0 left x coordinate of the region to write into the sparse array. - * @param y0 top x coordinate of the region to write into the sparse array. - * @param x1 right x coordinate (not included) of the region to write into the sparse array. Must be greater than x0. - * @param y1 bottom y coordinate (not included) of the region to write into the sparse array. Must be greater than y0. - * @param src user buffer to fill. Must be at least sizeof(int32) * ( (y1 - y0 - 1) * src_line_stride + (x1 - x0 - 1) * src_col_stride + 1) bytes large. - * @param src_col_stride spacing (in elements, not in bytes) in x dimension between consecutive elements of the user buffer. - * @param src_line_stride spacing (in elements, not in bytes) in y dimension between consecutive elements of the user buffer. - * @param forgiving if set to TRUE and the region is invalid, OPJ_TRUE will still be returned. - * @return OPJ_TRUE in case of success. - */ -OPJ_BOOL opj_sparse_array_int32_write(opj_sparse_array_int32_t* sa, - OPJ_UINT32 x0, - OPJ_UINT32 y0, - OPJ_UINT32 x1, - OPJ_UINT32 y1, - const OPJ_INT32* src, - OPJ_UINT32 src_col_stride, - OPJ_UINT32 src_line_stride, - OPJ_BOOL forgiving); - -/*@}*/ - -#endif /* OPJ_SPARSE_ARRAY_H */ diff --git a/src/3rd/LibOpenJpeg/t1.c b/src/3rd/LibOpenJpeg/t1.c deleted file mode 100644 index 76744380..00000000 --- a/src/3rd/LibOpenJpeg/t1.c +++ /dev/null @@ -1,2419 +0,0 @@ -/* - * The copyright in this software is being made available under the 2-clauses - * BSD License, included below. This software may be subject to other third - * party and contributor rights, including patent rights, and no such rights - * are granted under this license. - * - * Copyright (c) 2002-2014, Universite catholique de Louvain (UCL), Belgium - * Copyright (c) 2002-2014, Professor Benoit Macq - * Copyright (c) 2001-2003, David Janssens - * Copyright (c) 2002-2003, Yannick Verschueren - * Copyright (c) 2003-2007, Francois-Olivier Devaux - * Copyright (c) 2003-2014, Antonin Descampe - * Copyright (c) 2005, Herve Drolon, FreeImage Team - * Copyright (c) 2007, Callum Lerwick - * Copyright (c) 2012, Carl Hetherington - * Copyright (c) 2017, IntoPIX SA - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. 
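
The sparse_array.h comments above fully specify the int32 sparse-array API, so a short end-to-end sketch is enough to show how the pieces fit together; the sizes and coordinates below are illustrative only:

    /* A 1024x1024 virtual array backed by 64x64 blocks; only blocks that
     * intersect a written region are ever allocated. */
    OPJ_INT32 in[4] = { 1, 2, 3, 4 };
    OPJ_INT32 out[4] = { 0, 0, 0, 0 };
    opj_sparse_array_int32_t *sa =
        opj_sparse_array_int32_create(1024, 1024, 64, 64);
    if (sa != NULL &&
        /* 2x2 region [10,12)x[20,22): column stride 1, line stride 2 */
        opj_sparse_array_int32_write(sa, 10, 20, 12, 22, in, 1, 2, OPJ_FALSE) &&
        opj_sparse_array_int32_read(sa, 10, 20, 12, 22, out, 1, 2, OPJ_FALSE)) {
        /* out now holds 1, 2, 3, 4; regions never written read back as 0 */
    }
    opj_sparse_array_int32_free(sa); /* NULL-safe, mirrors the create call */
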
Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS' - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#define OPJ_SKIP_POISON -#include "opj_includes.h" - -#ifdef __SSE__ -#include -#endif -#ifdef __SSE2__ -#include -#endif - -#if defined(__GNUC__) -#pragma GCC poison malloc calloc realloc free -#endif - -#include "t1_luts.h" - -/** @defgroup T1 T1 - Implementation of the tier-1 coding */ -/*@{*/ - -#define T1_FLAGS(x, y) (t1->flags[x + 1 + ((y / 4) + 1) * (t1->w+2)]) - -#define opj_t1_setcurctx(curctx, ctxno) curctx = &(mqc)->ctxs[(OPJ_UINT32)(ctxno)] - -/** @name Local static functions */ -/*@{*/ - -static INLINE OPJ_BYTE opj_t1_getctxno_zc(opj_mqc_t *mqc, OPJ_UINT32 f); -static INLINE OPJ_UINT32 opj_t1_getctxno_mag(OPJ_UINT32 f); -static OPJ_INT16 opj_t1_getnmsedec_sig(OPJ_UINT32 x, OPJ_UINT32 bitpos); -static OPJ_INT16 opj_t1_getnmsedec_ref(OPJ_UINT32 x, OPJ_UINT32 bitpos); -static INLINE void opj_t1_update_flags(opj_flag_t *flagsp, OPJ_UINT32 ci, - OPJ_UINT32 s, OPJ_UINT32 stride, - OPJ_UINT32 vsc); - - -/** -Decode significant pass -*/ - -static INLINE void opj_t1_dec_sigpass_step_raw( - opj_t1_t *t1, - opj_flag_t *flagsp, - OPJ_INT32 *datap, - OPJ_INT32 oneplushalf, - OPJ_UINT32 vsc, - OPJ_UINT32 row); -static INLINE void opj_t1_dec_sigpass_step_mqc( - opj_t1_t *t1, - opj_flag_t *flagsp, - OPJ_INT32 *datap, - OPJ_INT32 oneplushalf, - OPJ_UINT32 row, - OPJ_UINT32 flags_stride, - OPJ_UINT32 vsc); - -/** -Encode significant pass -*/ -static void opj_t1_enc_sigpass(opj_t1_t *t1, - OPJ_INT32 bpno, - OPJ_INT32 *nmsedec, - OPJ_BYTE type, - OPJ_UINT32 cblksty); - -/** -Decode significant pass -*/ -static void opj_t1_dec_sigpass_raw( - opj_t1_t *t1, - OPJ_INT32 bpno, - OPJ_INT32 cblksty); - -/** -Encode refinement pass -*/ -static void opj_t1_enc_refpass(opj_t1_t *t1, - OPJ_INT32 bpno, - OPJ_INT32 *nmsedec, - OPJ_BYTE type); - -/** -Decode refinement pass -*/ -static void opj_t1_dec_refpass_raw( - opj_t1_t *t1, - OPJ_INT32 bpno); - - -/** -Decode refinement pass -*/ - -static INLINE void opj_t1_dec_refpass_step_raw( - opj_t1_t *t1, - opj_flag_t *flagsp, - OPJ_INT32 *datap, - OPJ_INT32 poshalf, - OPJ_UINT32 row); -static INLINE void opj_t1_dec_refpass_step_mqc( - opj_t1_t *t1, - opj_flag_t *flagsp, - OPJ_INT32 *datap, - OPJ_INT32 poshalf, - OPJ_UINT32 row); - - -/** -Decode clean-up pass -*/ - -static void opj_t1_dec_clnpass_step( - opj_t1_t *t1, - opj_flag_t *flagsp, - OPJ_INT32 *datap, - OPJ_INT32 oneplushalf, - OPJ_UINT32 row, - OPJ_UINT32 vsc); - -/** -Encode clean-up pass -*/ -static void opj_t1_enc_clnpass( - opj_t1_t *t1, - OPJ_INT32 bpno, - OPJ_INT32 *nmsedec, - OPJ_UINT32 cblksty); 
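/*
 * Illustrative annotation, not part of the original t1.c: the significance,
 * refinement and cleanup passes declared above are applied per bit-plane,
 * from the most significant bit-plane down. This minimal sketch mirrors the
 * passtype/bpno bookkeeping used by opj_t1_decode_cblk() and
 * opj_t1_encode_cblk() later in this file; the function name is hypothetical.
 */
static void t1_pass_schedule_sketch(int numbps)
{
    int bpno = numbps - 1;  /* start at the most significant bit-plane */
    int passtype = 2;       /* the very first pass is a cleanup pass */

    while (bpno >= 0) {
        switch (passtype) {
        case 0: /* significance propagation pass (sigpass) */
            break;
        case 1: /* magnitude refinement pass (refpass) */
            break;
        case 2: /* cleanup pass (clnpass) */
            break;
        }
        if (++passtype == 3) {  /* after a cleanup pass, ...      */
            passtype = 0;       /* ...restart the pass cycle...   */
            bpno--;             /* ...one bit-plane lower         */
        }
    }
}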
- -static OPJ_FLOAT64 opj_t1_getwmsedec( - OPJ_INT32 nmsedec, - OPJ_UINT32 compno, - OPJ_UINT32 level, - OPJ_UINT32 orient, - OPJ_INT32 bpno, - OPJ_UINT32 qmfbid, - OPJ_FLOAT64 stepsize, - OPJ_UINT32 numcomps, - const OPJ_FLOAT64 * mct_norms, - OPJ_UINT32 mct_numcomps); - -static void opj_t1_encode_cblk(opj_t1_t *t1, - opj_tcd_cblk_enc_t* cblk, - OPJ_UINT32 orient, - OPJ_UINT32 compno, - OPJ_UINT32 level, - OPJ_UINT32 qmfbid, - OPJ_FLOAT64 stepsize, - OPJ_UINT32 cblksty, - OPJ_UINT32 numcomps, - opj_tcd_tile_t * tile, - const OPJ_FLOAT64 * mct_norms, - OPJ_UINT32 mct_numcomps); - -/** -Decode 1 code-block -@param t1 T1 handle -@param cblk Code-block coding parameters -@param orient -@param roishift Region of interest shifting value -@param cblksty Code-block style -@param p_manager the event manager -@param p_manager_mutex mutex for the event manager -@param check_pterm whether PTERM correct termination should be checked -*/ -static OPJ_BOOL opj_t1_decode_cblk(opj_t1_t *t1, - opj_tcd_cblk_dec_t* cblk, - OPJ_UINT32 orient, - OPJ_UINT32 roishift, - OPJ_UINT32 cblksty, - opj_event_mgr_t *p_manager, - opj_mutex_t* p_manager_mutex, - OPJ_BOOL check_pterm); - -static OPJ_BOOL opj_t1_allocate_buffers(opj_t1_t *t1, - OPJ_UINT32 w, - OPJ_UINT32 h); - -/*@}*/ - -/*@}*/ - -/* ----------------------------------------------------------------------- */ - -static INLINE OPJ_BYTE opj_t1_getctxno_zc(opj_mqc_t *mqc, OPJ_UINT32 f) -{ - return mqc->lut_ctxno_zc_orient[(f & T1_SIGMA_NEIGHBOURS)]; -} - -static INLINE OPJ_UINT32 opj_t1_getctxtno_sc_or_spb_index(OPJ_UINT32 fX, - OPJ_UINT32 pfX, - OPJ_UINT32 nfX, - OPJ_UINT32 ci) -{ - /* - 0 pfX T1_CHI_THIS T1_LUT_SGN_W - 1 tfX T1_SIGMA_1 T1_LUT_SIG_N - 2 nfX T1_CHI_THIS T1_LUT_SGN_E - 3 tfX T1_SIGMA_3 T1_LUT_SIG_W - 4 fX T1_CHI_(THIS - 1) T1_LUT_SGN_N - 5 tfX T1_SIGMA_5 T1_LUT_SIG_E - 6 fX T1_CHI_(THIS + 1) T1_LUT_SGN_S - 7 tfX T1_SIGMA_7 T1_LUT_SIG_S - */ - - OPJ_UINT32 lu = (fX >> (ci * 3U)) & (T1_SIGMA_1 | T1_SIGMA_3 | T1_SIGMA_5 | - T1_SIGMA_7); - - lu |= (pfX >> (T1_CHI_THIS_I + (ci * 3U))) & (1U << 0); - lu |= (nfX >> (T1_CHI_THIS_I - 2U + (ci * 3U))) & (1U << 2); - if (ci == 0U) { - lu |= (fX >> (T1_CHI_0_I - 4U)) & (1U << 4); - } else { - lu |= (fX >> (T1_CHI_1_I - 4U + ((ci - 1U) * 3U))) & (1U << 4); - } - lu |= (fX >> (T1_CHI_2_I - 6U + (ci * 3U))) & (1U << 6); - return lu; -} - -static INLINE OPJ_BYTE opj_t1_getctxno_sc(OPJ_UINT32 lu) -{ - return lut_ctxno_sc[lu]; -} - -static INLINE OPJ_UINT32 opj_t1_getctxno_mag(OPJ_UINT32 f) -{ - OPJ_UINT32 tmp = (f & T1_SIGMA_NEIGHBOURS) ? T1_CTXNO_MAG + 1 : T1_CTXNO_MAG; - OPJ_UINT32 tmp2 = (f & T1_MU_0) ? 
T1_CTXNO_MAG + 2 : tmp; - return tmp2; -} - -static INLINE OPJ_BYTE opj_t1_getspb(OPJ_UINT32 lu) -{ - return lut_spb[lu]; -} - -static OPJ_INT16 opj_t1_getnmsedec_sig(OPJ_UINT32 x, OPJ_UINT32 bitpos) -{ - if (bitpos > 0) { - return lut_nmsedec_sig[(x >> (bitpos)) & ((1 << T1_NMSEDEC_BITS) - 1)]; - } - - return lut_nmsedec_sig0[x & ((1 << T1_NMSEDEC_BITS) - 1)]; -} - -static OPJ_INT16 opj_t1_getnmsedec_ref(OPJ_UINT32 x, OPJ_UINT32 bitpos) -{ - if (bitpos > 0) { - return lut_nmsedec_ref[(x >> (bitpos)) & ((1 << T1_NMSEDEC_BITS) - 1)]; - } - - return lut_nmsedec_ref0[x & ((1 << T1_NMSEDEC_BITS) - 1)]; -} - -#define opj_t1_update_flags_macro(flags, flagsp, ci, s, stride, vsc) \ -{ \ - /* east */ \ - flagsp[-1] |= T1_SIGMA_5 << (3U * ci); \ - \ - /* mark target as significant */ \ - flags |= ((s << T1_CHI_1_I) | T1_SIGMA_4) << (3U * ci); \ - \ - /* west */ \ - flagsp[1] |= T1_SIGMA_3 << (3U * ci); \ - \ - /* north-west, north, north-east */ \ - if (ci == 0U && !(vsc)) { \ - opj_flag_t* north = flagsp - (stride); \ - *north |= (s << T1_CHI_5_I) | T1_SIGMA_16; \ - north[-1] |= T1_SIGMA_17; \ - north[1] |= T1_SIGMA_15; \ - } \ - \ - /* south-west, south, south-east */ \ - if (ci == 3U) { \ - opj_flag_t* south = flagsp + (stride); \ - *south |= (s << T1_CHI_0_I) | T1_SIGMA_1; \ - south[-1] |= T1_SIGMA_2; \ - south[1] |= T1_SIGMA_0; \ - } \ -} - - -static INLINE void opj_t1_update_flags(opj_flag_t *flagsp, OPJ_UINT32 ci, - OPJ_UINT32 s, OPJ_UINT32 stride, - OPJ_UINT32 vsc) -{ - opj_t1_update_flags_macro(*flagsp, flagsp, ci, s, stride, vsc); -} - -/** -Encode significant pass -*/ -static INLINE void opj_t1_enc_sigpass_step(opj_t1_t *t1, - opj_flag_t *flagsp, - OPJ_INT32 *datap, - OPJ_INT32 bpno, - OPJ_INT32 one, - OPJ_INT32 *nmsedec, - OPJ_BYTE type, - OPJ_UINT32 ci, - OPJ_UINT32 vsc) -{ - OPJ_UINT32 v; - - opj_mqc_t *mqc = &(t1->mqc); /* MQC component */ - - OPJ_UINT32 const flags = *flagsp; - - if ((flags & ((T1_SIGMA_THIS | T1_PI_THIS) << (ci * 3U))) == 0U && - (flags & (T1_SIGMA_NEIGHBOURS << (ci * 3U))) != 0U) { - OPJ_UINT32 ctxt1 = opj_t1_getctxno_zc(mqc, flags >> (ci * 3U)); - v = (opj_int_abs(*datap) & one) ? 1 : 0; -#ifdef DEBUG_ENC_SIG - fprintf(stderr, " ctxt1=%d\n", ctxt1); -#endif - opj_mqc_setcurctx(mqc, ctxt1); - if (type == T1_TYPE_RAW) { /* BYPASS/LAZY MODE */ - opj_mqc_bypass_enc(mqc, v); - } else { - opj_mqc_encode(mqc, v); - } - if (v) { - OPJ_UINT32 lu = opj_t1_getctxtno_sc_or_spb_index( - *flagsp, - flagsp[-1], flagsp[1], - ci); - OPJ_UINT32 ctxt2 = opj_t1_getctxno_sc(lu); - v = *datap < 0 ? 
1U : 0U; - *nmsedec += opj_t1_getnmsedec_sig((OPJ_UINT32)opj_int_abs(*datap), - (OPJ_UINT32)bpno); -#ifdef DEBUG_ENC_SIG - fprintf(stderr, " ctxt2=%d\n", ctxt2); -#endif - opj_mqc_setcurctx(mqc, ctxt2); - if (type == T1_TYPE_RAW) { /* BYPASS/LAZY MODE */ - opj_mqc_bypass_enc(mqc, v); - } else { - OPJ_UINT32 spb = opj_t1_getspb(lu); -#ifdef DEBUG_ENC_SIG - fprintf(stderr, " spb=%d\n", spb); -#endif - opj_mqc_encode(mqc, v ^ spb); - } - opj_t1_update_flags(flagsp, ci, v, t1->w + 2, vsc); - } - *flagsp |= T1_PI_THIS << (ci * 3U); - } -} - -static INLINE void opj_t1_dec_sigpass_step_raw( - opj_t1_t *t1, - opj_flag_t *flagsp, - OPJ_INT32 *datap, - OPJ_INT32 oneplushalf, - OPJ_UINT32 vsc, - OPJ_UINT32 ci) -{ - OPJ_UINT32 v; - opj_mqc_t *mqc = &(t1->mqc); /* RAW component */ - - OPJ_UINT32 const flags = *flagsp; - - if ((flags & ((T1_SIGMA_THIS | T1_PI_THIS) << (ci * 3U))) == 0U && - (flags & (T1_SIGMA_NEIGHBOURS << (ci * 3U))) != 0U) { - if (opj_mqc_raw_decode(mqc)) { - v = opj_mqc_raw_decode(mqc); - *datap = v ? -oneplushalf : oneplushalf; - opj_t1_update_flags(flagsp, ci, v, t1->w + 2, vsc); - } - *flagsp |= T1_PI_THIS << (ci * 3U); - } -} - -#define opj_t1_dec_sigpass_step_mqc_macro(flags, flagsp, flags_stride, data, \ - data_stride, ci, mqc, curctx, \ - v, a, c, ct, oneplushalf, vsc) \ -{ \ - if ((flags & ((T1_SIGMA_THIS | T1_PI_THIS) << (ci * 3U))) == 0U && \ - (flags & (T1_SIGMA_NEIGHBOURS << (ci * 3U))) != 0U) { \ - OPJ_UINT32 ctxt1 = opj_t1_getctxno_zc(mqc, flags >> (ci * 3U)); \ - opj_t1_setcurctx(curctx, ctxt1); \ - opj_mqc_decode_macro(v, mqc, curctx, a, c, ct); \ - if (v) { \ - OPJ_UINT32 lu = opj_t1_getctxtno_sc_or_spb_index( \ - flags, \ - flagsp[-1], flagsp[1], \ - ci); \ - OPJ_UINT32 ctxt2 = opj_t1_getctxno_sc(lu); \ - OPJ_UINT32 spb = opj_t1_getspb(lu); \ - opj_t1_setcurctx(curctx, ctxt2); \ - opj_mqc_decode_macro(v, mqc, curctx, a, c, ct); \ - v = v ^ spb; \ - data[ci*data_stride] = v ? 
-oneplushalf : oneplushalf; \ - opj_t1_update_flags_macro(flags, flagsp, ci, v, flags_stride, vsc); \ - } \ - flags |= T1_PI_THIS << (ci * 3U); \ - } \ -} - -static INLINE void opj_t1_dec_sigpass_step_mqc( - opj_t1_t *t1, - opj_flag_t *flagsp, - OPJ_INT32 *datap, - OPJ_INT32 oneplushalf, - OPJ_UINT32 ci, - OPJ_UINT32 flags_stride, - OPJ_UINT32 vsc) -{ - OPJ_UINT32 v; - - opj_mqc_t *mqc = &(t1->mqc); /* MQC component */ - opj_t1_dec_sigpass_step_mqc_macro(*flagsp, flagsp, flags_stride, datap, - 0, ci, mqc, mqc->curctx, - v, mqc->a, mqc->c, mqc->ct, oneplushalf, vsc); -} - -static void opj_t1_enc_sigpass(opj_t1_t *t1, - OPJ_INT32 bpno, - OPJ_INT32 *nmsedec, - OPJ_BYTE type, - OPJ_UINT32 cblksty - ) -{ - OPJ_UINT32 i, k; - OPJ_INT32 const one = 1 << (bpno + T1_NMSEDEC_FRACBITS); - opj_flag_t* f = &T1_FLAGS(0, 0); - OPJ_UINT32 const extra = 2; - - *nmsedec = 0; -#ifdef DEBUG_ENC_SIG - fprintf(stderr, "enc_sigpass: bpno=%d\n", bpno); -#endif - for (k = 0; k < (t1->h & ~3U); k += 4) { -#ifdef DEBUG_ENC_SIG - fprintf(stderr, " k=%d\n", k); -#endif - for (i = 0; i < t1->w; ++i) { -#ifdef DEBUG_ENC_SIG - fprintf(stderr, " i=%d\n", i); -#endif - if (*f == 0U) { - /* Nothing to do for any of the 4 data points */ - f++; - continue; - } - opj_t1_enc_sigpass_step( - t1, - f, - &t1->data[((k + 0) * t1->data_stride) + i], - bpno, - one, - nmsedec, - type, - 0, cblksty & J2K_CCP_CBLKSTY_VSC); - opj_t1_enc_sigpass_step( - t1, - f, - &t1->data[((k + 1) * t1->data_stride) + i], - bpno, - one, - nmsedec, - type, - 1, 0); - opj_t1_enc_sigpass_step( - t1, - f, - &t1->data[((k + 2) * t1->data_stride) + i], - bpno, - one, - nmsedec, - type, - 2, 0); - opj_t1_enc_sigpass_step( - t1, - f, - &t1->data[((k + 3) * t1->data_stride) + i], - bpno, - one, - nmsedec, - type, - 3, 0); - ++f; - } - f += extra; - } - - if (k < t1->h) { - OPJ_UINT32 j; -#ifdef DEBUG_ENC_SIG - fprintf(stderr, " k=%d\n", k); -#endif - for (i = 0; i < t1->w; ++i) { -#ifdef DEBUG_ENC_SIG - fprintf(stderr, " i=%d\n", i); -#endif - if (*f == 0U) { - /* Nothing to do for any of the 4 data points */ - f++; - continue; - } - for (j = k; j < t1->h; ++j) { - opj_t1_enc_sigpass_step( - t1, - f, - &t1->data[(j * t1->data_stride) + i], - bpno, - one, - nmsedec, - type, - j - k, - (j == k && (cblksty & J2K_CCP_CBLKSTY_VSC) != 0)); - } - ++f; - } - } -} - -static void opj_t1_dec_sigpass_raw( - opj_t1_t *t1, - OPJ_INT32 bpno, - OPJ_INT32 cblksty) -{ - OPJ_INT32 one, half, oneplushalf; - OPJ_UINT32 i, j, k; - OPJ_INT32 *data = t1->data; - opj_flag_t *flagsp = &T1_FLAGS(0, 0); - const OPJ_UINT32 l_w = t1->w; - one = 1 << bpno; - half = one >> 1; - oneplushalf = one | half; - - for (k = 0; k < (t1->h & ~3U); k += 4, flagsp += 2, data += 3 * l_w) { - for (i = 0; i < l_w; ++i, ++flagsp, ++data) { - opj_flag_t flags = *flagsp; - if (flags != 0) { - opj_t1_dec_sigpass_step_raw( - t1, - flagsp, - data, - oneplushalf, - cblksty & J2K_CCP_CBLKSTY_VSC, /* vsc */ - 0U); - opj_t1_dec_sigpass_step_raw( - t1, - flagsp, - data + l_w, - oneplushalf, - OPJ_FALSE, /* vsc */ - 1U); - opj_t1_dec_sigpass_step_raw( - t1, - flagsp, - data + 2 * l_w, - oneplushalf, - OPJ_FALSE, /* vsc */ - 2U); - opj_t1_dec_sigpass_step_raw( - t1, - flagsp, - data + 3 * l_w, - oneplushalf, - OPJ_FALSE, /* vsc */ - 3U); - } - } - } - if (k < t1->h) { - for (i = 0; i < l_w; ++i, ++flagsp, ++data) { - for (j = 0; j < t1->h - k; ++j) { - opj_t1_dec_sigpass_step_raw( - t1, - flagsp, - data + j * l_w, - oneplushalf, - cblksty & J2K_CCP_CBLKSTY_VSC, /* vsc */ - j); - } - } - } -} - -#define 
opj_t1_dec_sigpass_mqc_internal(t1, bpno, vsc, w, h, flags_stride) \ -{ \ - OPJ_INT32 one, half, oneplushalf; \ - OPJ_UINT32 i, j, k; \ - register OPJ_INT32 *data = t1->data; \ - register opj_flag_t *flagsp = &t1->flags[(flags_stride) + 1]; \ - const OPJ_UINT32 l_w = w; \ - opj_mqc_t* mqc = &(t1->mqc); \ - DOWNLOAD_MQC_VARIABLES(mqc, curctx, c, a, ct); \ - register OPJ_UINT32 v; \ - one = 1 << bpno; \ - half = one >> 1; \ - oneplushalf = one | half; \ - for (k = 0; k < (h & ~3u); k += 4, data += 3*l_w, flagsp += 2) { \ - for (i = 0; i < l_w; ++i, ++data, ++flagsp) { \ - opj_flag_t flags = *flagsp; \ - if( flags != 0 ) { \ - opj_t1_dec_sigpass_step_mqc_macro( \ - flags, flagsp, flags_stride, data, \ - l_w, 0, mqc, curctx, v, a, c, ct, oneplushalf, vsc); \ - opj_t1_dec_sigpass_step_mqc_macro( \ - flags, flagsp, flags_stride, data, \ - l_w, 1, mqc, curctx, v, a, c, ct, oneplushalf, OPJ_FALSE); \ - opj_t1_dec_sigpass_step_mqc_macro( \ - flags, flagsp, flags_stride, data, \ - l_w, 2, mqc, curctx, v, a, c, ct, oneplushalf, OPJ_FALSE); \ - opj_t1_dec_sigpass_step_mqc_macro( \ - flags, flagsp, flags_stride, data, \ - l_w, 3, mqc, curctx, v, a, c, ct, oneplushalf, OPJ_FALSE); \ - *flagsp = flags; \ - } \ - } \ - } \ - UPLOAD_MQC_VARIABLES(mqc, curctx, c, a, ct); \ - if( k < h ) { \ - for (i = 0; i < l_w; ++i, ++data, ++flagsp) { \ - for (j = 0; j < h - k; ++j) { \ - opj_t1_dec_sigpass_step_mqc(t1, flagsp, \ - data + j * l_w, oneplushalf, j, flags_stride, vsc); \ - } \ - } \ - } \ -} - -static void opj_t1_dec_sigpass_mqc_64x64_novsc( - opj_t1_t *t1, - OPJ_INT32 bpno) -{ - opj_t1_dec_sigpass_mqc_internal(t1, bpno, OPJ_FALSE, 64, 64, 66); -} - -static void opj_t1_dec_sigpass_mqc_64x64_vsc( - opj_t1_t *t1, - OPJ_INT32 bpno) -{ - opj_t1_dec_sigpass_mqc_internal(t1, bpno, OPJ_TRUE, 64, 64, 66); -} - -static void opj_t1_dec_sigpass_mqc_generic_novsc( - opj_t1_t *t1, - OPJ_INT32 bpno) -{ - opj_t1_dec_sigpass_mqc_internal(t1, bpno, OPJ_FALSE, t1->w, t1->h, - t1->w + 2U); -} - -static void opj_t1_dec_sigpass_mqc_generic_vsc( - opj_t1_t *t1, - OPJ_INT32 bpno) -{ - opj_t1_dec_sigpass_mqc_internal(t1, bpno, OPJ_TRUE, t1->w, t1->h, - t1->w + 2U); -} - -static void opj_t1_dec_sigpass_mqc( - opj_t1_t *t1, - OPJ_INT32 bpno, - OPJ_INT32 cblksty) -{ - if (t1->w == 64 && t1->h == 64) { - if (cblksty & J2K_CCP_CBLKSTY_VSC) { - opj_t1_dec_sigpass_mqc_64x64_vsc(t1, bpno); - } else { - opj_t1_dec_sigpass_mqc_64x64_novsc(t1, bpno); - } - } else { - if (cblksty & J2K_CCP_CBLKSTY_VSC) { - opj_t1_dec_sigpass_mqc_generic_vsc(t1, bpno); - } else { - opj_t1_dec_sigpass_mqc_generic_novsc(t1, bpno); - } - } -} - -/** -Encode refinement pass step -*/ -static INLINE void opj_t1_enc_refpass_step(opj_t1_t *t1, - opj_flag_t *flagsp, - OPJ_INT32 *datap, - OPJ_INT32 bpno, - OPJ_INT32 one, - OPJ_INT32 *nmsedec, - OPJ_BYTE type, - OPJ_UINT32 ci) -{ - OPJ_UINT32 v; - - opj_mqc_t *mqc = &(t1->mqc); /* MQC component */ - - OPJ_UINT32 const shift_flags = - (*flagsp >> (ci * 3U)); - - if ((shift_flags & (T1_SIGMA_THIS | T1_PI_THIS)) == T1_SIGMA_THIS) { - OPJ_UINT32 ctxt = opj_t1_getctxno_mag(shift_flags); - *nmsedec += opj_t1_getnmsedec_ref((OPJ_UINT32)opj_int_abs(*datap), - (OPJ_UINT32)bpno); - v = (opj_int_abs(*datap) & one) ? 
1 : 0; -#ifdef DEBUG_ENC_REF - fprintf(stderr, " ctxt=%d\n", ctxt); -#endif - opj_mqc_setcurctx(mqc, ctxt); - if (type == T1_TYPE_RAW) { /* BYPASS/LAZY MODE */ - opj_mqc_bypass_enc(mqc, v); - } else { - opj_mqc_encode(mqc, v); - } - *flagsp |= T1_MU_THIS << (ci * 3U); - } -} - - -static INLINE void opj_t1_dec_refpass_step_raw( - opj_t1_t *t1, - opj_flag_t *flagsp, - OPJ_INT32 *datap, - OPJ_INT32 poshalf, - OPJ_UINT32 ci) -{ - OPJ_UINT32 v; - - opj_mqc_t *mqc = &(t1->mqc); /* RAW component */ - - if ((*flagsp & ((T1_SIGMA_THIS | T1_PI_THIS) << (ci * 3U))) == - (T1_SIGMA_THIS << (ci * 3U))) { - v = opj_mqc_raw_decode(mqc); - *datap += (v ^ (*datap < 0)) ? poshalf : -poshalf; - *flagsp |= T1_MU_THIS << (ci * 3U); - } -} - -#define opj_t1_dec_refpass_step_mqc_macro(flags, data, data_stride, ci, \ - mqc, curctx, v, a, c, ct, poshalf) \ -{ \ - if ((flags & ((T1_SIGMA_THIS | T1_PI_THIS) << (ci * 3U))) == \ - (T1_SIGMA_THIS << (ci * 3U))) { \ - OPJ_UINT32 ctxt = opj_t1_getctxno_mag(flags >> (ci * 3U)); \ - opj_t1_setcurctx(curctx, ctxt); \ - opj_mqc_decode_macro(v, mqc, curctx, a, c, ct); \ - data[ci*data_stride] += (v ^ (data[ci*data_stride] < 0)) ? poshalf : -poshalf; \ - flags |= T1_MU_THIS << (ci * 3U); \ - } \ -} - -static INLINE void opj_t1_dec_refpass_step_mqc( - opj_t1_t *t1, - opj_flag_t *flagsp, - OPJ_INT32 *datap, - OPJ_INT32 poshalf, - OPJ_UINT32 ci) -{ - OPJ_UINT32 v; - - opj_mqc_t *mqc = &(t1->mqc); /* MQC component */ - opj_t1_dec_refpass_step_mqc_macro(*flagsp, datap, 0, ci, - mqc, mqc->curctx, v, mqc->a, mqc->c, - mqc->ct, poshalf); -} - -static void opj_t1_enc_refpass( - opj_t1_t *t1, - OPJ_INT32 bpno, - OPJ_INT32 *nmsedec, - OPJ_BYTE type) -{ - OPJ_UINT32 i, k; - const OPJ_INT32 one = 1 << (bpno + T1_NMSEDEC_FRACBITS); - opj_flag_t* f = &T1_FLAGS(0, 0); - const OPJ_UINT32 extra = 2U; - - *nmsedec = 0; -#ifdef DEBUG_ENC_REF - fprintf(stderr, "enc_refpass: bpno=%d\n", bpno); -#endif - for (k = 0; k < (t1->h & ~3U); k += 4) { -#ifdef DEBUG_ENC_REF - fprintf(stderr, " k=%d\n", k); -#endif - for (i = 0; i < t1->w; ++i) { -#ifdef DEBUG_ENC_REF - fprintf(stderr, " i=%d\n", i); -#endif - if ((*f & (T1_SIGMA_4 | T1_SIGMA_7 | T1_SIGMA_10 | T1_SIGMA_13)) == 0) { - /* none significant */ - f++; - continue; - } - if ((*f & (T1_PI_0 | T1_PI_1 | T1_PI_2 | T1_PI_3)) == - (T1_PI_0 | T1_PI_1 | T1_PI_2 | T1_PI_3)) { - /* all processed by sigpass */ - f++; - continue; - } - - opj_t1_enc_refpass_step( - t1, - f, - &t1->data[((k + 0) * t1->data_stride) + i], - bpno, - one, - nmsedec, - type, - 0); - opj_t1_enc_refpass_step( - t1, - f, - &t1->data[((k + 1) * t1->data_stride) + i], - bpno, - one, - nmsedec, - type, - 1); - opj_t1_enc_refpass_step( - t1, - f, - &t1->data[((k + 2) * t1->data_stride) + i], - bpno, - one, - nmsedec, - type, - 2); - opj_t1_enc_refpass_step( - t1, - f, - &t1->data[((k + 3) * t1->data_stride) + i], - bpno, - one, - nmsedec, - type, - 3); - ++f; - } - f += extra; - } - - if (k < t1->h) { - OPJ_UINT32 j; -#ifdef DEBUG_ENC_REF - fprintf(stderr, " k=%d\n", k); -#endif - for (i = 0; i < t1->w; ++i) { -#ifdef DEBUG_ENC_REF - fprintf(stderr, " i=%d\n", i); -#endif - if ((*f & (T1_SIGMA_4 | T1_SIGMA_7 | T1_SIGMA_10 | T1_SIGMA_13)) == 0) { - /* none significant */ - f++; - continue; - } - for (j = k; j < t1->h; ++j) { - opj_t1_enc_refpass_step( - t1, - f, - &t1->data[(j * t1->data_stride) + i], - bpno, - one, - nmsedec, - type, - j - k); - } - ++f; - } - } -} - - -static void opj_t1_dec_refpass_raw( - opj_t1_t *t1, - OPJ_INT32 bpno) -{ - OPJ_INT32 one, poshalf; - OPJ_UINT32 i, j, k; - 
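/* Worked trace of the refinement update performed by the step functions
 * above, *datap += (v ^ (*datap < 0)) ? poshalf : -poshalf;
 * with bpno = 3 (so poshalf = 4): +13 becomes +17 when the decoded bit v
 * is 1 and +9 when v is 0, while -13 becomes -17 and -9 respectively.
 * A decoded 1 therefore always grows the magnitude by half the current
 * bit-plane weight, and a decoded 0 shrinks it by the same amount. */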
OPJ_INT32 *data = t1->data; - opj_flag_t *flagsp = &T1_FLAGS(0, 0); - const OPJ_UINT32 l_w = t1->w; - one = 1 << bpno; - poshalf = one >> 1; - for (k = 0; k < (t1->h & ~3U); k += 4, flagsp += 2, data += 3 * l_w) { - for (i = 0; i < l_w; ++i, ++flagsp, ++data) { - opj_flag_t flags = *flagsp; - if (flags != 0) { - opj_t1_dec_refpass_step_raw( - t1, - flagsp, - data, - poshalf, - 0U); - opj_t1_dec_refpass_step_raw( - t1, - flagsp, - data + l_w, - poshalf, - 1U); - opj_t1_dec_refpass_step_raw( - t1, - flagsp, - data + 2 * l_w, - poshalf, - 2U); - opj_t1_dec_refpass_step_raw( - t1, - flagsp, - data + 3 * l_w, - poshalf, - 3U); - } - } - } - if (k < t1->h) { - for (i = 0; i < l_w; ++i, ++flagsp, ++data) { - for (j = 0; j < t1->h - k; ++j) { - opj_t1_dec_refpass_step_raw( - t1, - flagsp, - data + j * l_w, - poshalf, - j); - } - } - } -} - -#define opj_t1_dec_refpass_mqc_internal(t1, bpno, w, h, flags_stride) \ -{ \ - OPJ_INT32 one, poshalf; \ - OPJ_UINT32 i, j, k; \ - register OPJ_INT32 *data = t1->data; \ - register opj_flag_t *flagsp = &t1->flags[flags_stride + 1]; \ - const OPJ_UINT32 l_w = w; \ - opj_mqc_t* mqc = &(t1->mqc); \ - DOWNLOAD_MQC_VARIABLES(mqc, curctx, c, a, ct); \ - register OPJ_UINT32 v; \ - one = 1 << bpno; \ - poshalf = one >> 1; \ - for (k = 0; k < (h & ~3u); k += 4, data += 3*l_w, flagsp += 2) { \ - for (i = 0; i < l_w; ++i, ++data, ++flagsp) { \ - opj_flag_t flags = *flagsp; \ - if( flags != 0 ) { \ - opj_t1_dec_refpass_step_mqc_macro( \ - flags, data, l_w, 0, \ - mqc, curctx, v, a, c, ct, poshalf); \ - opj_t1_dec_refpass_step_mqc_macro( \ - flags, data, l_w, 1, \ - mqc, curctx, v, a, c, ct, poshalf); \ - opj_t1_dec_refpass_step_mqc_macro( \ - flags, data, l_w, 2, \ - mqc, curctx, v, a, c, ct, poshalf); \ - opj_t1_dec_refpass_step_mqc_macro( \ - flags, data, l_w, 3, \ - mqc, curctx, v, a, c, ct, poshalf); \ - *flagsp = flags; \ - } \ - } \ - } \ - UPLOAD_MQC_VARIABLES(mqc, curctx, c, a, ct); \ - if( k < h ) { \ - for (i = 0; i < l_w; ++i, ++data, ++flagsp) { \ - for (j = 0; j < h - k; ++j) { \ - opj_t1_dec_refpass_step_mqc(t1, flagsp, data + j * l_w, poshalf, j); \ - } \ - } \ - } \ -} - -static void opj_t1_dec_refpass_mqc_64x64( - opj_t1_t *t1, - OPJ_INT32 bpno) -{ - opj_t1_dec_refpass_mqc_internal(t1, bpno, 64, 64, 66); -} - -static void opj_t1_dec_refpass_mqc_generic( - opj_t1_t *t1, - OPJ_INT32 bpno) -{ - opj_t1_dec_refpass_mqc_internal(t1, bpno, t1->w, t1->h, t1->w + 2U); -} - -static void opj_t1_dec_refpass_mqc( - opj_t1_t *t1, - OPJ_INT32 bpno) -{ - if (t1->w == 64 && t1->h == 64) { - opj_t1_dec_refpass_mqc_64x64(t1, bpno); - } else { - opj_t1_dec_refpass_mqc_generic(t1, bpno); - } -} - -/** -Encode clean-up pass step -*/ -static void opj_t1_enc_clnpass_step( - opj_t1_t *t1, - opj_flag_t *flagsp, - OPJ_INT32 *datap, - OPJ_INT32 bpno, - OPJ_INT32 one, - OPJ_INT32 *nmsedec, - OPJ_UINT32 agg, - OPJ_UINT32 runlen, - OPJ_UINT32 lim, - OPJ_UINT32 cblksty) -{ - OPJ_UINT32 v; - OPJ_UINT32 ci; - opj_mqc_t *mqc = &(t1->mqc); /* MQC component */ - - const OPJ_UINT32 check = (T1_SIGMA_4 | T1_SIGMA_7 | T1_SIGMA_10 | T1_SIGMA_13 | - T1_PI_0 | T1_PI_1 | T1_PI_2 | T1_PI_3); - - if ((*flagsp & check) == check) { - if (runlen == 0) { - *flagsp &= ~(T1_PI_0 | T1_PI_1 | T1_PI_2 | T1_PI_3); - } else if (runlen == 1) { - *flagsp &= ~(T1_PI_1 | T1_PI_2 | T1_PI_3); - } else if (runlen == 2) { - *flagsp &= ~(T1_PI_2 | T1_PI_3); - } else if (runlen == 3) { - *flagsp &= ~(T1_PI_3); - } - return; - } - - for (ci = runlen; ci < lim; ++ci) { - OPJ_UINT32 vsc; - opj_flag_t flags; - OPJ_UINT32 
ctxt1; - - flags = *flagsp; - - if ((agg != 0) && (ci == runlen)) { - goto LABEL_PARTIAL; - } - - if (!(flags & ((T1_SIGMA_THIS | T1_PI_THIS) << (ci * 3U)))) { - ctxt1 = opj_t1_getctxno_zc(mqc, flags >> (ci * 3U)); -#ifdef DEBUG_ENC_CLN - printf(" ctxt1=%d\n", ctxt1); -#endif - opj_mqc_setcurctx(mqc, ctxt1); - v = (opj_int_abs(*datap) & one) ? 1 : 0; - opj_mqc_encode(mqc, v); - if (v) { - OPJ_UINT32 ctxt2, spb; - OPJ_UINT32 lu; -LABEL_PARTIAL: - lu = opj_t1_getctxtno_sc_or_spb_index( - *flagsp, - flagsp[-1], flagsp[1], - ci); - *nmsedec += opj_t1_getnmsedec_sig((OPJ_UINT32)opj_int_abs(*datap), - (OPJ_UINT32)bpno); - ctxt2 = opj_t1_getctxno_sc(lu); -#ifdef DEBUG_ENC_CLN - printf(" ctxt2=%d\n", ctxt2); -#endif - opj_mqc_setcurctx(mqc, ctxt2); - - v = *datap < 0 ? 1U : 0U; - spb = opj_t1_getspb(lu); -#ifdef DEBUG_ENC_CLN - printf(" spb=%d\n", spb); -#endif - opj_mqc_encode(mqc, v ^ spb); - vsc = ((cblksty & J2K_CCP_CBLKSTY_VSC) && (ci == 0)) ? 1 : 0; - opj_t1_update_flags(flagsp, ci, v, t1->w + 2U, vsc); - } - } - *flagsp &= ~(T1_PI_THIS << (3U * ci)); - datap += t1->data_stride; - } -} - -#define opj_t1_dec_clnpass_step_macro(check_flags, partial, \ - flags, flagsp, flags_stride, data, \ - data_stride, ci, mqc, curctx, \ - v, a, c, ct, oneplushalf, vsc) \ -{ \ - if ( !check_flags || !(flags & ((T1_SIGMA_THIS | T1_PI_THIS) << (ci * 3U)))) {\ - do { \ - if( !partial ) { \ - OPJ_UINT32 ctxt1 = opj_t1_getctxno_zc(mqc, flags >> (ci * 3U)); \ - opj_t1_setcurctx(curctx, ctxt1); \ - opj_mqc_decode_macro(v, mqc, curctx, a, c, ct); \ - if( !v ) \ - break; \ - } \ - { \ - OPJ_UINT32 lu = opj_t1_getctxtno_sc_or_spb_index( \ - flags, flagsp[-1], flagsp[1], \ - ci); \ - opj_t1_setcurctx(curctx, opj_t1_getctxno_sc(lu)); \ - opj_mqc_decode_macro(v, mqc, curctx, a, c, ct); \ - v = v ^ opj_t1_getspb(lu); \ - data[ci*data_stride] = v ? 
-oneplushalf : oneplushalf; \ - opj_t1_update_flags_macro(flags, flagsp, ci, v, flags_stride, vsc); \ - } \ - } while(0); \ - } \ -} - -static void opj_t1_dec_clnpass_step( - opj_t1_t *t1, - opj_flag_t *flagsp, - OPJ_INT32 *datap, - OPJ_INT32 oneplushalf, - OPJ_UINT32 ci, - OPJ_UINT32 vsc) -{ - OPJ_UINT32 v; - - opj_mqc_t *mqc = &(t1->mqc); /* MQC component */ - opj_t1_dec_clnpass_step_macro(OPJ_TRUE, OPJ_FALSE, - *flagsp, flagsp, t1->w + 2U, datap, - 0, ci, mqc, mqc->curctx, - v, mqc->a, mqc->c, mqc->ct, oneplushalf, vsc); -} - -static void opj_t1_enc_clnpass( - opj_t1_t *t1, - OPJ_INT32 bpno, - OPJ_INT32 *nmsedec, - OPJ_UINT32 cblksty) -{ - OPJ_UINT32 i, k; - const OPJ_INT32 one = 1 << (bpno + T1_NMSEDEC_FRACBITS); - OPJ_UINT32 agg, runlen; - - opj_mqc_t *mqc = &(t1->mqc); /* MQC component */ - - *nmsedec = 0; -#ifdef DEBUG_ENC_CLN - printf("enc_clnpass: bpno=%d\n", bpno); -#endif - for (k = 0; k < (t1->h & ~3U); k += 4) { -#ifdef DEBUG_ENC_CLN - printf(" k=%d\n", k); -#endif - for (i = 0; i < t1->w; ++i) { -#ifdef DEBUG_ENC_CLN - printf(" i=%d\n", i); -#endif - agg = !(T1_FLAGS(i, k)); -#ifdef DEBUG_ENC_CLN - printf(" agg=%d\n", agg); -#endif - if (agg) { - for (runlen = 0; runlen < 4; ++runlen) { - if (opj_int_abs(t1->data[((k + runlen)*t1->data_stride) + i]) & one) { - break; - } - } - opj_mqc_setcurctx(mqc, T1_CTXNO_AGG); - opj_mqc_encode(mqc, runlen != 4); - if (runlen == 4) { - continue; - } - opj_mqc_setcurctx(mqc, T1_CTXNO_UNI); - opj_mqc_encode(mqc, runlen >> 1); - opj_mqc_encode(mqc, runlen & 1); - } else { - runlen = 0; - } - opj_t1_enc_clnpass_step( - t1, - &T1_FLAGS(i, k), - &t1->data[((k + runlen) * t1->data_stride) + i], - bpno, - one, - nmsedec, - agg, - runlen, - 4U, - cblksty); - } - } - if (k < t1->h) { - agg = 0; - runlen = 0; -#ifdef DEBUG_ENC_CLN - printf(" k=%d\n", k); -#endif - for (i = 0; i < t1->w; ++i) { -#ifdef DEBUG_ENC_CLN - printf(" i=%d\n", i); - printf(" agg=%d\n", agg); -#endif - opj_t1_enc_clnpass_step( - t1, - &T1_FLAGS(i, k), - &t1->data[((k + runlen) * t1->data_stride) + i], - bpno, - one, - nmsedec, - agg, - runlen, - t1->h - k, - cblksty); - } - } -} - -#define opj_t1_dec_clnpass_internal(t1, bpno, vsc, w, h, flags_stride) \ -{ \ - OPJ_INT32 one, half, oneplushalf; \ - OPJ_UINT32 runlen; \ - OPJ_UINT32 i, j, k; \ - const OPJ_UINT32 l_w = w; \ - opj_mqc_t* mqc = &(t1->mqc); \ - register OPJ_INT32 *data = t1->data; \ - register opj_flag_t *flagsp = &t1->flags[flags_stride + 1]; \ - DOWNLOAD_MQC_VARIABLES(mqc, curctx, c, a, ct); \ - register OPJ_UINT32 v; \ - one = 1 << bpno; \ - half = one >> 1; \ - oneplushalf = one | half; \ - for (k = 0; k < (h & ~3u); k += 4, data += 3*l_w, flagsp += 2) { \ - for (i = 0; i < l_w; ++i, ++data, ++flagsp) { \ - opj_flag_t flags = *flagsp; \ - if (flags == 0) { \ - OPJ_UINT32 partial = OPJ_TRUE; \ - opj_t1_setcurctx(curctx, T1_CTXNO_AGG); \ - opj_mqc_decode_macro(v, mqc, curctx, a, c, ct); \ - if (!v) { \ - continue; \ - } \ - opj_t1_setcurctx(curctx, T1_CTXNO_UNI); \ - opj_mqc_decode_macro(runlen, mqc, curctx, a, c, ct); \ - opj_mqc_decode_macro(v, mqc, curctx, a, c, ct); \ - runlen = (runlen << 1) | v; \ - switch(runlen) { \ - case 0: \ - opj_t1_dec_clnpass_step_macro(OPJ_FALSE, OPJ_TRUE,\ - flags, flagsp, flags_stride, data, \ - l_w, 0, mqc, curctx, \ - v, a, c, ct, oneplushalf, vsc); \ - partial = OPJ_FALSE; \ - /* FALLTHRU */ \ - case 1: \ - opj_t1_dec_clnpass_step_macro(OPJ_FALSE, partial,\ - flags, flagsp, flags_stride, data, \ - l_w, 1, mqc, curctx, \ - v, a, c, ct, oneplushalf, OPJ_FALSE); \ - partial = 
OPJ_FALSE; \ - /* FALLTHRU */ \ - case 2: \ - opj_t1_dec_clnpass_step_macro(OPJ_FALSE, partial,\ - flags, flagsp, flags_stride, data, \ - l_w, 2, mqc, curctx, \ - v, a, c, ct, oneplushalf, OPJ_FALSE); \ - partial = OPJ_FALSE; \ - /* FALLTHRU */ \ - case 3: \ - opj_t1_dec_clnpass_step_macro(OPJ_FALSE, partial,\ - flags, flagsp, flags_stride, data, \ - l_w, 3, mqc, curctx, \ - v, a, c, ct, oneplushalf, OPJ_FALSE); \ - break; \ - } \ - } else { \ - opj_t1_dec_clnpass_step_macro(OPJ_TRUE, OPJ_FALSE, \ - flags, flagsp, flags_stride, data, \ - l_w, 0, mqc, curctx, \ - v, a, c, ct, oneplushalf, vsc); \ - opj_t1_dec_clnpass_step_macro(OPJ_TRUE, OPJ_FALSE, \ - flags, flagsp, flags_stride, data, \ - l_w, 1, mqc, curctx, \ - v, a, c, ct, oneplushalf, OPJ_FALSE); \ - opj_t1_dec_clnpass_step_macro(OPJ_TRUE, OPJ_FALSE, \ - flags, flagsp, flags_stride, data, \ - l_w, 2, mqc, curctx, \ - v, a, c, ct, oneplushalf, OPJ_FALSE); \ - opj_t1_dec_clnpass_step_macro(OPJ_TRUE, OPJ_FALSE, \ - flags, flagsp, flags_stride, data, \ - l_w, 3, mqc, curctx, \ - v, a, c, ct, oneplushalf, OPJ_FALSE); \ - } \ - *flagsp = flags & ~(T1_PI_0 | T1_PI_1 | T1_PI_2 | T1_PI_3); \ - } \ - } \ - UPLOAD_MQC_VARIABLES(mqc, curctx, c, a, ct); \ - if( k < h ) { \ - for (i = 0; i < l_w; ++i, ++flagsp, ++data) { \ - for (j = 0; j < h - k; ++j) { \ - opj_t1_dec_clnpass_step(t1, flagsp, data + j * l_w, oneplushalf, j, vsc); \ - } \ - *flagsp &= ~(T1_PI_0 | T1_PI_1 | T1_PI_2 | T1_PI_3); \ - } \ - } \ -} - -static void opj_t1_dec_clnpass_check_segsym(opj_t1_t *t1, OPJ_INT32 cblksty) -{ - if (cblksty & J2K_CCP_CBLKSTY_SEGSYM) { - opj_mqc_t* mqc = &(t1->mqc); - OPJ_UINT32 v, v2; - opj_mqc_setcurctx(mqc, T1_CTXNO_UNI); - opj_mqc_decode(v, mqc); - opj_mqc_decode(v2, mqc); - v = (v << 1) | v2; - opj_mqc_decode(v2, mqc); - v = (v << 1) | v2; - opj_mqc_decode(v2, mqc); - v = (v << 1) | v2; - /* - if (v!=0xa) { - opj_event_msg(t1->cinfo, EVT_WARNING, "Bad segmentation symbol %x\n", v); - } - */ - } -} - -static void opj_t1_dec_clnpass_64x64_novsc( - opj_t1_t *t1, - OPJ_INT32 bpno) -{ - opj_t1_dec_clnpass_internal(t1, bpno, OPJ_FALSE, 64, 64, 66); -} - -static void opj_t1_dec_clnpass_64x64_vsc( - opj_t1_t *t1, - OPJ_INT32 bpno) -{ - opj_t1_dec_clnpass_internal(t1, bpno, OPJ_TRUE, 64, 64, 66); -} - -static void opj_t1_dec_clnpass_generic_novsc( - opj_t1_t *t1, - OPJ_INT32 bpno) -{ - opj_t1_dec_clnpass_internal(t1, bpno, OPJ_FALSE, t1->w, t1->h, - t1->w + 2U); -} - -static void opj_t1_dec_clnpass_generic_vsc( - opj_t1_t *t1, - OPJ_INT32 bpno) -{ - opj_t1_dec_clnpass_internal(t1, bpno, OPJ_TRUE, t1->w, t1->h, - t1->w + 2U); -} - -static void opj_t1_dec_clnpass( - opj_t1_t *t1, - OPJ_INT32 bpno, - OPJ_INT32 cblksty) -{ - if (t1->w == 64 && t1->h == 64) { - if (cblksty & J2K_CCP_CBLKSTY_VSC) { - opj_t1_dec_clnpass_64x64_vsc(t1, bpno); - } else { - opj_t1_dec_clnpass_64x64_novsc(t1, bpno); - } - } else { - if (cblksty & J2K_CCP_CBLKSTY_VSC) { - opj_t1_dec_clnpass_generic_vsc(t1, bpno); - } else { - opj_t1_dec_clnpass_generic_novsc(t1, bpno); - } - } - opj_t1_dec_clnpass_check_segsym(t1, cblksty); -} - - -/** mod fixed_quality */ -static OPJ_FLOAT64 opj_t1_getwmsedec( - OPJ_INT32 nmsedec, - OPJ_UINT32 compno, - OPJ_UINT32 level, - OPJ_UINT32 orient, - OPJ_INT32 bpno, - OPJ_UINT32 qmfbid, - OPJ_FLOAT64 stepsize, - OPJ_UINT32 numcomps, - const OPJ_FLOAT64 * mct_norms, - OPJ_UINT32 mct_numcomps) -{ - OPJ_FLOAT64 w1 = 1, w2, wmsedec; - OPJ_ARG_NOT_USED(numcomps); - - if (mct_norms && (compno < mct_numcomps)) { - w1 = mct_norms[compno]; - } - - if (qmfbid == 1) { 
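/* Annotation: qmfbid == 1 is the reversible 5/3 wavelet path and
 * qmfbid == 0 the irreversible 9/7 path; w2 is the corresponding band
 * norm. Expanding the arithmetic below, the returned estimate is
 * wmsedec = (w1 * w2 * stepsize * 2^bpno)^2 * nmsedec / 8192. */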
- w2 = opj_dwt_getnorm(level, orient); - } else { /* if (qmfbid == 0) */ - w2 = opj_dwt_getnorm_real(level, orient); - } - - wmsedec = w1 * w2 * stepsize * (1 << bpno); - wmsedec *= wmsedec * nmsedec / 8192.0; - - return wmsedec; -} - -static OPJ_BOOL opj_t1_allocate_buffers( - opj_t1_t *t1, - OPJ_UINT32 w, - OPJ_UINT32 h) -{ - OPJ_UINT32 flagssize; - OPJ_UINT32 flags_stride; - - /* No risk of overflow. Prior checks ensure those assert are met */ - /* They are per the specification */ - assert(w <= 1024); - assert(h <= 1024); - assert(w * h <= 4096); - - /* encoder uses tile buffer, so no need to allocate */ - if (!t1->encoder) { - OPJ_UINT32 datasize = w * h; - - if (datasize > t1->datasize) { - opj_aligned_free(t1->data); - t1->data = (OPJ_INT32*) opj_aligned_malloc(datasize * sizeof(OPJ_INT32)); - if (!t1->data) { - /* FIXME event manager error callback */ - return OPJ_FALSE; - } - t1->datasize = datasize; - } - /* memset first arg is declared to never be null by gcc */ - if (t1->data != NULL) { - memset(t1->data, 0, datasize * sizeof(OPJ_INT32)); - } - } - - flags_stride = w + 2U; /* can't be 0U */ - - flagssize = (h + 3U) / 4U + 2U; - - flagssize *= flags_stride; - { - opj_flag_t* p; - OPJ_UINT32 x; - OPJ_UINT32 flags_height = (h + 3U) / 4U; - - if (flagssize > t1->flagssize) { - - opj_aligned_free(t1->flags); - t1->flags = (opj_flag_t*) opj_aligned_malloc(flagssize * sizeof( - opj_flag_t)); - if (!t1->flags) { - /* FIXME event manager error callback */ - return OPJ_FALSE; - } - } - t1->flagssize = flagssize; - - memset(t1->flags, 0, flagssize * sizeof(opj_flag_t)); - - p = &t1->flags[0]; - for (x = 0; x < flags_stride; ++x) { - /* magic value to hopefully stop any passes being interested in this entry */ - *p++ = (T1_PI_0 | T1_PI_1 | T1_PI_2 | T1_PI_3); - } - - p = &t1->flags[((flags_height + 1) * flags_stride)]; - for (x = 0; x < flags_stride; ++x) { - /* magic value to hopefully stop any passes being interested in this entry */ - *p++ = (T1_PI_0 | T1_PI_1 | T1_PI_2 | T1_PI_3); - } - - if (h % 4) { - OPJ_UINT32 v = 0; - p = &t1->flags[((flags_height) * flags_stride)]; - if (h % 4 == 1) { - v |= T1_PI_1 | T1_PI_2 | T1_PI_3; - } else if (h % 4 == 2) { - v |= T1_PI_2 | T1_PI_3; - } else if (h % 4 == 3) { - v |= T1_PI_3; - } - for (x = 0; x < flags_stride; ++x) { - *p++ = v; - } - } - } - - t1->w = w; - t1->h = h; - - return OPJ_TRUE; -} - -/* ----------------------------------------------------------------------- */ - -/* ----------------------------------------------------------------------- */ -/** - * Creates a new Tier 1 handle - * and initializes the look-up tables of the Tier-1 coder/decoder - * @return a new T1 handle if successful, returns NULL otherwise -*/ -opj_t1_t* opj_t1_create(OPJ_BOOL isEncoder) -{ - opj_t1_t *l_t1 = 00; - - l_t1 = (opj_t1_t*) opj_calloc(1, sizeof(opj_t1_t)); - if (!l_t1) { - return 00; - } - - l_t1->encoder = isEncoder; - - return l_t1; -} - - -/** - * Destroys a previously created T1 handle - * - * @param p_t1 Tier 1 handle to destroy -*/ -void opj_t1_destroy(opj_t1_t *p_t1) -{ - if (! 
p_t1) { - return; - } - - /* encoder uses tile buffer, so no need to free */ - if (!p_t1->encoder && p_t1->data) { - opj_aligned_free(p_t1->data); - p_t1->data = 00; - } - - if (p_t1->flags) { - opj_aligned_free(p_t1->flags); - p_t1->flags = 00; - } - - opj_free(p_t1->cblkdatabuffer); - - opj_free(p_t1); -} - -typedef struct { - OPJ_BOOL whole_tile_decoding; - OPJ_UINT32 resno; - opj_tcd_cblk_dec_t* cblk; - opj_tcd_band_t* band; - opj_tcd_tilecomp_t* tilec; - opj_tccp_t* tccp; - OPJ_BOOL mustuse_cblkdatabuffer; - volatile OPJ_BOOL* pret; - opj_event_mgr_t *p_manager; - opj_mutex_t* p_manager_mutex; - OPJ_BOOL check_pterm; -} opj_t1_cblk_decode_processing_job_t; - -static void opj_t1_destroy_wrapper(void* t1) -{ - opj_t1_destroy((opj_t1_t*) t1); -} - -static void opj_t1_clbl_decode_processor(void* user_data, opj_tls_t* tls) -{ - opj_tcd_cblk_dec_t* cblk; - opj_tcd_band_t* band; - opj_tcd_tilecomp_t* tilec; - opj_tccp_t* tccp; - OPJ_INT32* OPJ_RESTRICT datap; - OPJ_UINT32 cblk_w, cblk_h; - OPJ_INT32 x, y; - OPJ_UINT32 i, j; - opj_t1_cblk_decode_processing_job_t* job; - opj_t1_t* t1; - OPJ_UINT32 resno; - OPJ_UINT32 tile_w; - - job = (opj_t1_cblk_decode_processing_job_t*) user_data; - - cblk = job->cblk; - - if (!job->whole_tile_decoding) { - cblk_w = (OPJ_UINT32)(cblk->x1 - cblk->x0); - cblk_h = (OPJ_UINT32)(cblk->y1 - cblk->y0); - - cblk->decoded_data = (OPJ_INT32*)opj_aligned_malloc(cblk_w * cblk_h * sizeof( - OPJ_INT32)); - if (cblk->decoded_data == NULL) { - if (job->p_manager_mutex) { - opj_mutex_lock(job->p_manager_mutex); - } - opj_event_msg(job->p_manager, EVT_ERROR, - "Cannot allocate cblk->decoded_data\n"); - if (job->p_manager_mutex) { - opj_mutex_unlock(job->p_manager_mutex); - } - *(job->pret) = OPJ_FALSE; - opj_free(job); - return; - } - /* Zero-init required */ - memset(cblk->decoded_data, 0, cblk_w * cblk_h * sizeof(OPJ_INT32)); - } else if (cblk->decoded_data) { - /* Not sure if that code path can happen, but better be */ - /* safe than sorry */ - opj_aligned_free(cblk->decoded_data); - cblk->decoded_data = NULL; - } - - resno = job->resno; - band = job->band; - tilec = job->tilec; - tccp = job->tccp; - tile_w = (OPJ_UINT32)(tilec->resolutions[tilec->minimum_num_resolutions - 1].x1 - - - tilec->resolutions[tilec->minimum_num_resolutions - 1].x0); - - if (!*(job->pret)) { - opj_free(job); - return; - } - - t1 = (opj_t1_t*) opj_tls_get(tls, OPJ_TLS_KEY_T1); - if (t1 == NULL) { - t1 = opj_t1_create(OPJ_FALSE); - opj_tls_set(tls, OPJ_TLS_KEY_T1, t1, opj_t1_destroy_wrapper); - } - t1->mustuse_cblkdatabuffer = job->mustuse_cblkdatabuffer; - - if (OPJ_FALSE == opj_t1_decode_cblk( - t1, - cblk, - band->bandno, - (OPJ_UINT32)tccp->roishift, - tccp->cblksty, - job->p_manager, - job->p_manager_mutex, - job->check_pterm)) { - *(job->pret) = OPJ_FALSE; - opj_free(job); - return; - } - - x = cblk->x0 - band->x0; - y = cblk->y0 - band->y0; - if (band->bandno & 1) { - opj_tcd_resolution_t* pres = &tilec->resolutions[resno - 1]; - x += pres->x1 - pres->x0; - } - if (band->bandno & 2) { - opj_tcd_resolution_t* pres = &tilec->resolutions[resno - 1]; - y += pres->y1 - pres->y0; - } - - datap = cblk->decoded_data ? 
cblk->decoded_data : t1->data; - cblk_w = t1->w; - cblk_h = t1->h; - - if (tccp->roishift) { - if (tccp->roishift >= 31) { - for (j = 0; j < cblk_h; ++j) { - for (i = 0; i < cblk_w; ++i) { - datap[(j * cblk_w) + i] = 0; - } - } - } else { - OPJ_INT32 thresh = 1 << tccp->roishift; - for (j = 0; j < cblk_h; ++j) { - for (i = 0; i < cblk_w; ++i) { - OPJ_INT32 val = datap[(j * cblk_w) + i]; - OPJ_INT32 mag = abs(val); - if (mag >= thresh) { - mag >>= tccp->roishift; - datap[(j * cblk_w) + i] = val < 0 ? -mag : mag; - } - } - } - } - } - - /* Both can be non NULL if for example decoding a full tile and then */ - /* partially a tile. In which case partial decoding should be the */ - /* priority */ - assert((cblk->decoded_data != NULL) || (tilec->data != NULL)); - - if (cblk->decoded_data) { - OPJ_UINT32 cblk_size = cblk_w * cblk_h; - if (tccp->qmfbid == 1) { - for (i = 0; i < cblk_size; ++i) { - datap[i] /= 2; - } - } else { /* if (tccp->qmfbid == 0) */ - i = 0; -#ifdef __SSE2__ - { - const __m128 xmm_stepsize = _mm_set1_ps(band->stepsize); - for (; i < (cblk_size & ~15U); i += 16) { - __m128 xmm0_data = _mm_cvtepi32_ps(_mm_load_si128((__m128i * const)( - datap + 0))); - __m128 xmm1_data = _mm_cvtepi32_ps(_mm_load_si128((__m128i * const)( - datap + 4))); - __m128 xmm2_data = _mm_cvtepi32_ps(_mm_load_si128((__m128i * const)( - datap + 8))); - __m128 xmm3_data = _mm_cvtepi32_ps(_mm_load_si128((__m128i * const)( - datap + 12))); - _mm_store_ps((float*)(datap + 0), _mm_mul_ps(xmm0_data, xmm_stepsize)); - _mm_store_ps((float*)(datap + 4), _mm_mul_ps(xmm1_data, xmm_stepsize)); - _mm_store_ps((float*)(datap + 8), _mm_mul_ps(xmm2_data, xmm_stepsize)); - _mm_store_ps((float*)(datap + 12), _mm_mul_ps(xmm3_data, xmm_stepsize)); - datap += 16; - } - } -#endif - for (; i < cblk_size; ++i) { - OPJ_FLOAT32 tmp = ((OPJ_FLOAT32)(*datap)) * band->stepsize; - memcpy(datap, &tmp, sizeof(tmp)); - datap++; - } - } - } else if (tccp->qmfbid == 1) { - OPJ_INT32* OPJ_RESTRICT tiledp = &tilec->data[(OPJ_SIZE_T)y * tile_w + - (OPJ_SIZE_T)x]; - for (j = 0; j < cblk_h; ++j) { - i = 0; - for (; i < (cblk_w & ~(OPJ_UINT32)3U); i += 4U) { - OPJ_INT32 tmp0 = datap[(j * cblk_w) + i + 0U]; - OPJ_INT32 tmp1 = datap[(j * cblk_w) + i + 1U]; - OPJ_INT32 tmp2 = datap[(j * cblk_w) + i + 2U]; - OPJ_INT32 tmp3 = datap[(j * cblk_w) + i + 3U]; - ((OPJ_INT32*)tiledp)[(j * (OPJ_SIZE_T)tile_w) + i + 0U] = tmp0 / 2; - ((OPJ_INT32*)tiledp)[(j * (OPJ_SIZE_T)tile_w) + i + 1U] = tmp1 / 2; - ((OPJ_INT32*)tiledp)[(j * (OPJ_SIZE_T)tile_w) + i + 2U] = tmp2 / 2; - ((OPJ_INT32*)tiledp)[(j * (OPJ_SIZE_T)tile_w) + i + 3U] = tmp3 / 2; - } - for (; i < cblk_w; ++i) { - OPJ_INT32 tmp = datap[(j * cblk_w) + i]; - ((OPJ_INT32*)tiledp)[(j * (OPJ_SIZE_T)tile_w) + i] = tmp / 2; - } - } - } else { /* if (tccp->qmfbid == 0) */ - OPJ_FLOAT32* OPJ_RESTRICT tiledp = (OPJ_FLOAT32*) &tilec->data[(OPJ_SIZE_T)y * - tile_w + (OPJ_SIZE_T)x]; - for (j = 0; j < cblk_h; ++j) { - OPJ_FLOAT32* OPJ_RESTRICT tiledp2 = tiledp; - for (i = 0; i < cblk_w; ++i) { - OPJ_FLOAT32 tmp = (OPJ_FLOAT32) * datap * band->stepsize; - *tiledp2 = tmp; - datap++; - tiledp2++; - } - tiledp += tile_w; - } - } - - opj_free(job); -} - - -void opj_t1_decode_cblks(opj_tcd_t* tcd, - volatile OPJ_BOOL* pret, - opj_tcd_tilecomp_t* tilec, - opj_tccp_t* tccp, - opj_event_mgr_t *p_manager, - opj_mutex_t* p_manager_mutex, - OPJ_BOOL check_pterm - ) -{ - opj_thread_pool_t* tp = tcd->thread_pool; - OPJ_UINT32 resno, bandno, precno, cblkno; - -#ifdef DEBUG_VERBOSE - OPJ_UINT32 codeblocks_decoded = 0; - 
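/* Annotation: each selected code-block below is wrapped in an
 * opj_t1_cblk_decode_processing_job_t and submitted to tcd->thread_pool;
 * opj_t1_clbl_decode_processor() then runs the tier-1 decoding proper on a
 * per-thread opj_t1_t instance kept in thread-local storage, and any
 * failure is reported through the shared *pret flag. */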
printf("Enter opj_t1_decode_cblks()\n"); -#endif - - for (resno = 0; resno < tilec->minimum_num_resolutions; ++resno) { - opj_tcd_resolution_t* res = &tilec->resolutions[resno]; - - for (bandno = 0; bandno < res->numbands; ++bandno) { - opj_tcd_band_t* OPJ_RESTRICT band = &res->bands[bandno]; - - for (precno = 0; precno < res->pw * res->ph; ++precno) { - opj_tcd_precinct_t* precinct = &band->precincts[precno]; - - if (!opj_tcd_is_subband_area_of_interest(tcd, - tilec->compno, - resno, - band->bandno, - (OPJ_UINT32)precinct->x0, - (OPJ_UINT32)precinct->y0, - (OPJ_UINT32)precinct->x1, - (OPJ_UINT32)precinct->y1)) { - for (cblkno = 0; cblkno < precinct->cw * precinct->ch; ++cblkno) { - opj_tcd_cblk_dec_t* cblk = &precinct->cblks.dec[cblkno]; - if (cblk->decoded_data) { -#ifdef DEBUG_VERBOSE - printf("Discarding codeblock %d,%d at resno=%d, bandno=%d\n", - cblk->x0, cblk->y0, resno, bandno); -#endif - opj_aligned_free(cblk->decoded_data); - cblk->decoded_data = NULL; - } - } - continue; - } - - for (cblkno = 0; cblkno < precinct->cw * precinct->ch; ++cblkno) { - opj_tcd_cblk_dec_t* cblk = &precinct->cblks.dec[cblkno]; - opj_t1_cblk_decode_processing_job_t* job; - - if (!opj_tcd_is_subband_area_of_interest(tcd, - tilec->compno, - resno, - band->bandno, - (OPJ_UINT32)cblk->x0, - (OPJ_UINT32)cblk->y0, - (OPJ_UINT32)cblk->x1, - (OPJ_UINT32)cblk->y1)) { - if (cblk->decoded_data) { -#ifdef DEBUG_VERBOSE - printf("Discarding codeblock %d,%d at resno=%d, bandno=%d\n", - cblk->x0, cblk->y0, resno, bandno); -#endif - opj_aligned_free(cblk->decoded_data); - cblk->decoded_data = NULL; - } - continue; - } - - if (!tcd->whole_tile_decoding) { - OPJ_UINT32 cblk_w = (OPJ_UINT32)(cblk->x1 - cblk->x0); - OPJ_UINT32 cblk_h = (OPJ_UINT32)(cblk->y1 - cblk->y0); - if (cblk->decoded_data != NULL) { -#ifdef DEBUG_VERBOSE - printf("Reusing codeblock %d,%d at resno=%d, bandno=%d\n", - cblk->x0, cblk->y0, resno, bandno); -#endif - continue; - } - if (cblk_w == 0 || cblk_h == 0) { - continue; - } -#ifdef DEBUG_VERBOSE - printf("Decoding codeblock %d,%d at resno=%d, bandno=%d\n", - cblk->x0, cblk->y0, resno, bandno); -#endif - } - - job = (opj_t1_cblk_decode_processing_job_t*) opj_calloc(1, - sizeof(opj_t1_cblk_decode_processing_job_t)); - if (!job) { - *pret = OPJ_FALSE; - return; - } - job->whole_tile_decoding = tcd->whole_tile_decoding; - job->resno = resno; - job->cblk = cblk; - job->band = band; - job->tilec = tilec; - job->tccp = tccp; - job->pret = pret; - job->p_manager_mutex = p_manager_mutex; - job->p_manager = p_manager; - job->check_pterm = check_pterm; - job->mustuse_cblkdatabuffer = opj_thread_pool_get_thread_count(tp) > 1; - opj_thread_pool_submit_job(tp, opj_t1_clbl_decode_processor, job); -#ifdef DEBUG_VERBOSE - codeblocks_decoded ++; -#endif - if (!(*pret)) { - return; - } - } /* cblkno */ - } /* precno */ - } /* bandno */ - } /* resno */ - -#ifdef DEBUG_VERBOSE - printf("Leave opj_t1_decode_cblks(). 
Number decoded: %d\n", codeblocks_decoded); -#endif - return; -} - - -static OPJ_BOOL opj_t1_decode_cblk(opj_t1_t *t1, - opj_tcd_cblk_dec_t* cblk, - OPJ_UINT32 orient, - OPJ_UINT32 roishift, - OPJ_UINT32 cblksty, - opj_event_mgr_t *p_manager, - opj_mutex_t* p_manager_mutex, - OPJ_BOOL check_pterm) -{ - opj_mqc_t *mqc = &(t1->mqc); /* MQC component */ - - OPJ_INT32 bpno_plus_one; - OPJ_UINT32 passtype; - OPJ_UINT32 segno, passno; - OPJ_BYTE* cblkdata = NULL; - OPJ_UINT32 cblkdataindex = 0; - OPJ_BYTE type = T1_TYPE_MQ; /* BYPASS mode */ - OPJ_INT32* original_t1_data = NULL; - - mqc->lut_ctxno_zc_orient = lut_ctxno_zc + (orient << 9); - - if (!opj_t1_allocate_buffers( - t1, - (OPJ_UINT32)(cblk->x1 - cblk->x0), - (OPJ_UINT32)(cblk->y1 - cblk->y0))) { - return OPJ_FALSE; - } - - bpno_plus_one = (OPJ_INT32)(roishift + cblk->numbps); - if (bpno_plus_one >= 31) { - if (p_manager_mutex) { - opj_mutex_lock(p_manager_mutex); - } - opj_event_msg(p_manager, EVT_WARNING, - "opj_t1_decode_cblk(): unsupported bpno_plus_one = %d >= 31\n", - bpno_plus_one); - if (p_manager_mutex) { - opj_mutex_unlock(p_manager_mutex); - } - return OPJ_FALSE; - } - passtype = 2; - - opj_mqc_resetstates(mqc); - opj_mqc_setstate(mqc, T1_CTXNO_UNI, 0, 46); - opj_mqc_setstate(mqc, T1_CTXNO_AGG, 0, 3); - opj_mqc_setstate(mqc, T1_CTXNO_ZC, 0, 4); - - /* Even if we have a single chunk, in multi-threaded decoding */ - /* the insertion of our synthetic marker might potentially override */ - /* valid codestream of other codeblocks decoded in parallel. */ - if (cblk->numchunks > 1 || t1->mustuse_cblkdatabuffer) { - OPJ_UINT32 i; - OPJ_UINT32 cblk_len; - - /* Compute whole codeblock length from chunk lengths */ - cblk_len = 0; - for (i = 0; i < cblk->numchunks; i++) { - cblk_len += cblk->chunks[i].len; - } - - /* Allocate temporary memory if needed */ - if (cblk_len + OPJ_COMMON_CBLK_DATA_EXTRA > t1->cblkdatabuffersize) { - cblkdata = (OPJ_BYTE*)opj_realloc(t1->cblkdatabuffer, - cblk_len + OPJ_COMMON_CBLK_DATA_EXTRA); - if (cblkdata == NULL) { - return OPJ_FALSE; - } - t1->cblkdatabuffer = cblkdata; - memset(t1->cblkdatabuffer + cblk_len, 0, OPJ_COMMON_CBLK_DATA_EXTRA); - t1->cblkdatabuffersize = cblk_len + OPJ_COMMON_CBLK_DATA_EXTRA; - } - - /* Concatenate all chunks */ - cblkdata = t1->cblkdatabuffer; - cblk_len = 0; - for (i = 0; i < cblk->numchunks; i++) { - memcpy(cblkdata + cblk_len, cblk->chunks[i].data, cblk->chunks[i].len); - cblk_len += cblk->chunks[i].len; - } - } else if (cblk->numchunks == 1) { - cblkdata = cblk->chunks[0].data; - } else { - /* Not sure if that can happen in practice, but avoid Coverity to */ - /* think we will dereference a null cblkdta pointer */ - return OPJ_TRUE; - } - - /* For subtile decoding, directly decode in the decoded_data buffer of */ - /* the code-block. Hack t1->data to point to it, and restore it later */ - if (cblk->decoded_data) { - original_t1_data = t1->data; - t1->data = cblk->decoded_data; - } - - for (segno = 0; segno < cblk->real_num_segs; ++segno) { - opj_tcd_seg_t *seg = &cblk->segs[segno]; - - /* BYPASS mode */ - type = ((bpno_plus_one <= ((OPJ_INT32)(cblk->numbps)) - 4) && (passtype < 2) && - (cblksty & J2K_CCP_CBLKSTY_LAZY)) ? 
T1_TYPE_RAW : T1_TYPE_MQ; - - if (type == T1_TYPE_RAW) { - opj_mqc_raw_init_dec(mqc, cblkdata + cblkdataindex, seg->len, - OPJ_COMMON_CBLK_DATA_EXTRA); - } else { - opj_mqc_init_dec(mqc, cblkdata + cblkdataindex, seg->len, - OPJ_COMMON_CBLK_DATA_EXTRA); - } - cblkdataindex += seg->len; - - for (passno = 0; (passno < seg->real_num_passes) && - (bpno_plus_one >= 1); ++passno) { - switch (passtype) { - case 0: - if (type == T1_TYPE_RAW) { - opj_t1_dec_sigpass_raw(t1, bpno_plus_one, (OPJ_INT32)cblksty); - } else { - opj_t1_dec_sigpass_mqc(t1, bpno_plus_one, (OPJ_INT32)cblksty); - } - break; - case 1: - if (type == T1_TYPE_RAW) { - opj_t1_dec_refpass_raw(t1, bpno_plus_one); - } else { - opj_t1_dec_refpass_mqc(t1, bpno_plus_one); - } - break; - case 2: - opj_t1_dec_clnpass(t1, bpno_plus_one, (OPJ_INT32)cblksty); - break; - } - - if ((cblksty & J2K_CCP_CBLKSTY_RESET) && type == T1_TYPE_MQ) { - opj_mqc_resetstates(mqc); - opj_mqc_setstate(mqc, T1_CTXNO_UNI, 0, 46); - opj_mqc_setstate(mqc, T1_CTXNO_AGG, 0, 3); - opj_mqc_setstate(mqc, T1_CTXNO_ZC, 0, 4); - } - if (++passtype == 3) { - passtype = 0; - bpno_plus_one--; - } - } - - opq_mqc_finish_dec(mqc); - } - - if (check_pterm) { - if (mqc->bp + 2 < mqc->end) { - if (p_manager_mutex) { - opj_mutex_lock(p_manager_mutex); - } - opj_event_msg(p_manager, EVT_WARNING, - "PTERM check failure: %d remaining bytes in code block (%d used / %d)\n", - (int)(mqc->end - mqc->bp) - 2, - (int)(mqc->bp - mqc->start), - (int)(mqc->end - mqc->start)); - if (p_manager_mutex) { - opj_mutex_unlock(p_manager_mutex); - } - } else if (mqc->end_of_byte_stream_counter > 2) { - if (p_manager_mutex) { - opj_mutex_lock(p_manager_mutex); - } - opj_event_msg(p_manager, EVT_WARNING, - "PTERM check failure: %d synthetized 0xFF markers read\n", - mqc->end_of_byte_stream_counter); - if (p_manager_mutex) { - opj_mutex_unlock(p_manager_mutex); - } - } - } - - /* Restore original t1->data is needed */ - if (cblk->decoded_data) { - t1->data = original_t1_data; - } - - return OPJ_TRUE; -} - - - - -OPJ_BOOL opj_t1_encode_cblks(opj_t1_t *t1, - opj_tcd_tile_t *tile, - opj_tcp_t *tcp, - const OPJ_FLOAT64 * mct_norms, - OPJ_UINT32 mct_numcomps - ) -{ - OPJ_UINT32 compno, resno, bandno, precno, cblkno; - - tile->distotile = 0; /* fixed_quality */ - - for (compno = 0; compno < tile->numcomps; ++compno) { - opj_tcd_tilecomp_t* tilec = &tile->comps[compno]; - opj_tccp_t* tccp = &tcp->tccps[compno]; - OPJ_UINT32 tile_w = (OPJ_UINT32)(tilec->x1 - tilec->x0); - - for (resno = 0; resno < tilec->numresolutions; ++resno) { - opj_tcd_resolution_t *res = &tilec->resolutions[resno]; - - for (bandno = 0; bandno < res->numbands; ++bandno) { - opj_tcd_band_t* OPJ_RESTRICT band = &res->bands[bandno]; - OPJ_INT32 bandconst; - - /* Skip empty bands */ - if (opj_tcd_is_band_empty(band)) { - continue; - } - - bandconst = 8192 * 8192 / ((OPJ_INT32) floor(band->stepsize * 8192)); - for (precno = 0; precno < res->pw * res->ph; ++precno) { - opj_tcd_precinct_t *prc = &band->precincts[precno]; - - for (cblkno = 0; cblkno < prc->cw * prc->ch; ++cblkno) { - opj_tcd_cblk_enc_t* cblk = &prc->cblks.enc[cblkno]; - OPJ_INT32* OPJ_RESTRICT tiledp; - OPJ_UINT32 cblk_w; - OPJ_UINT32 cblk_h; - OPJ_UINT32 i, j, tileLineAdvance; - OPJ_SIZE_T tileIndex = 0; - - OPJ_INT32 x = cblk->x0 - band->x0; - OPJ_INT32 y = cblk->y0 - band->y0; - if (band->bandno & 1) { - opj_tcd_resolution_t *pres = &tilec->resolutions[resno - 1]; - x += pres->x1 - pres->x0; - } - if (band->bandno & 2) { - opj_tcd_resolution_t *pres = &tilec->resolutions[resno - 
1]; - y += pres->y1 - pres->y0; - } - - if (!opj_t1_allocate_buffers( - t1, - (OPJ_UINT32)(cblk->x1 - cblk->x0), - (OPJ_UINT32)(cblk->y1 - cblk->y0))) { - return OPJ_FALSE; - } - - cblk_w = t1->w; - cblk_h = t1->h; - tileLineAdvance = tile_w - cblk_w; - - tiledp = &tilec->data[(OPJ_SIZE_T)y * tile_w + (OPJ_SIZE_T)x]; - t1->data = tiledp; - t1->data_stride = tile_w; - if (tccp->qmfbid == 1) { - for (j = 0; j < cblk_h; ++j) { - for (i = 0; i < cblk_w; ++i) { - tiledp[tileIndex] *= (1 << T1_NMSEDEC_FRACBITS); - tileIndex++; - } - tileIndex += tileLineAdvance; - } - } else { /* if (tccp->qmfbid == 0) */ - for (j = 0; j < cblk_h; ++j) { - for (i = 0; i < cblk_w; ++i) { - OPJ_INT32 tmp = tiledp[tileIndex]; - tiledp[tileIndex] = - opj_int_fix_mul_t1( - tmp, - bandconst); - tileIndex++; - } - tileIndex += tileLineAdvance; - } - } - - opj_t1_encode_cblk( - t1, - cblk, - band->bandno, - compno, - tilec->numresolutions - 1 - resno, - tccp->qmfbid, - band->stepsize, - tccp->cblksty, - tile->numcomps, - tile, - mct_norms, - mct_numcomps); - - } /* cblkno */ - } /* precno */ - } /* bandno */ - } /* resno */ - } /* compno */ - return OPJ_TRUE; -} - -/* Returns whether the pass (bpno, passtype) is terminated */ -static int opj_t1_enc_is_term_pass(opj_tcd_cblk_enc_t* cblk, - OPJ_UINT32 cblksty, - OPJ_INT32 bpno, - OPJ_UINT32 passtype) -{ - /* Is it the last cleanup pass ? */ - if (passtype == 2 && bpno == 0) { - return OPJ_TRUE; - } - - if (cblksty & J2K_CCP_CBLKSTY_TERMALL) { - return OPJ_TRUE; - } - - if ((cblksty & J2K_CCP_CBLKSTY_LAZY)) { - /* For bypass arithmetic bypass, terminate the 4th cleanup pass */ - if ((bpno == ((OPJ_INT32)cblk->numbps - 4)) && (passtype == 2)) { - return OPJ_TRUE; - } - /* and beyond terminate all the magnitude refinement passes (in raw) */ - /* and cleanup passes (in MQC) */ - if ((bpno < ((OPJ_INT32)(cblk->numbps) - 4)) && (passtype > 0)) { - return OPJ_TRUE; - } - } - - return OPJ_FALSE; -} - - -/** mod fixed_quality */ -static void opj_t1_encode_cblk(opj_t1_t *t1, - opj_tcd_cblk_enc_t* cblk, - OPJ_UINT32 orient, - OPJ_UINT32 compno, - OPJ_UINT32 level, - OPJ_UINT32 qmfbid, - OPJ_FLOAT64 stepsize, - OPJ_UINT32 cblksty, - OPJ_UINT32 numcomps, - opj_tcd_tile_t * tile, - const OPJ_FLOAT64 * mct_norms, - OPJ_UINT32 mct_numcomps) -{ - OPJ_FLOAT64 cumwmsedec = 0.0; - - opj_mqc_t *mqc = &(t1->mqc); /* MQC component */ - - OPJ_UINT32 passno; - OPJ_INT32 bpno; - OPJ_UINT32 passtype; - OPJ_INT32 nmsedec = 0; - OPJ_INT32 max; - OPJ_UINT32 i, j; - OPJ_BYTE type = T1_TYPE_MQ; - OPJ_FLOAT64 tempwmsedec; - -#ifdef EXTRA_DEBUG - printf("encode_cblk(x=%d,y=%d,x1=%d,y1=%d,orient=%d,compno=%d,level=%d\n", - cblk->x0, cblk->y0, cblk->x1, cblk->y1, orient, compno, level); -#endif - - mqc->lut_ctxno_zc_orient = lut_ctxno_zc + (orient << 9); - - max = 0; - for (i = 0; i < t1->w; ++i) { - for (j = 0; j < t1->h; ++j) { - OPJ_INT32 tmp = abs(t1->data[i + j * t1->data_stride]); - max = opj_int_max(max, tmp); - } - } - - cblk->numbps = max ? 
(OPJ_UINT32)((opj_int_floorlog2(max) + 1) - - T1_NMSEDEC_FRACBITS) : 0; - if (cblk->numbps == 0) { - cblk->totalpasses = 0; - return; - } - - bpno = (OPJ_INT32)(cblk->numbps - 1); - passtype = 2; - - opj_mqc_resetstates(mqc); - opj_mqc_setstate(mqc, T1_CTXNO_UNI, 0, 46); - opj_mqc_setstate(mqc, T1_CTXNO_AGG, 0, 3); - opj_mqc_setstate(mqc, T1_CTXNO_ZC, 0, 4); - opj_mqc_init_enc(mqc, cblk->data); - - for (passno = 0; bpno >= 0; ++passno) { - opj_tcd_pass_t *pass = &cblk->passes[passno]; - type = ((bpno < ((OPJ_INT32)(cblk->numbps) - 4)) && (passtype < 2) && - (cblksty & J2K_CCP_CBLKSTY_LAZY)) ? T1_TYPE_RAW : T1_TYPE_MQ; - - /* If the previous pass was terminating, we need to reset the encoder */ - if (passno > 0 && cblk->passes[passno - 1].term) { - if (type == T1_TYPE_RAW) { - opj_mqc_bypass_init_enc(mqc); - } else { - opj_mqc_restart_init_enc(mqc); - } - } - - switch (passtype) { - case 0: - opj_t1_enc_sigpass(t1, bpno, &nmsedec, type, cblksty); - break; - case 1: - opj_t1_enc_refpass(t1, bpno, &nmsedec, type); - break; - case 2: - opj_t1_enc_clnpass(t1, bpno, &nmsedec, cblksty); - /* code switch SEGMARK (i.e. SEGSYM) */ - if (cblksty & J2K_CCP_CBLKSTY_SEGSYM) { - opj_mqc_segmark_enc(mqc); - } - break; - } - - /* fixed_quality */ - tempwmsedec = opj_t1_getwmsedec(nmsedec, compno, level, orient, bpno, qmfbid, - stepsize, numcomps, mct_norms, mct_numcomps) ; - cumwmsedec += tempwmsedec; - tile->distotile += tempwmsedec; - pass->distortiondec = cumwmsedec; - - if (opj_t1_enc_is_term_pass(cblk, cblksty, bpno, passtype)) { - /* If it is a terminated pass, terminate it */ - if (type == T1_TYPE_RAW) { - opj_mqc_bypass_flush_enc(mqc, cblksty & J2K_CCP_CBLKSTY_PTERM); - } else { - if (cblksty & J2K_CCP_CBLKSTY_PTERM) { - opj_mqc_erterm_enc(mqc); - } else { - opj_mqc_flush(mqc); - } - } - pass->term = 1; - pass->rate = opj_mqc_numbytes(mqc); - } else { - /* Non terminated pass */ - OPJ_UINT32 rate_extra_bytes; - if (type == T1_TYPE_RAW) { - rate_extra_bytes = opj_mqc_bypass_get_extra_bytes( - mqc, (cblksty & J2K_CCP_CBLKSTY_PTERM)); - } else { - rate_extra_bytes = 3; - } - pass->term = 0; - pass->rate = opj_mqc_numbytes(mqc) + rate_extra_bytes; - } - - if (++passtype == 3) { - passtype = 0; - bpno--; - } - - /* Code-switch "RESET" */ - if (cblksty & J2K_CCP_CBLKSTY_RESET) { - opj_mqc_reset_enc(mqc); - } - } - - cblk->totalpasses = passno; - - if (cblk->totalpasses) { - /* Make sure that pass rates are increasing */ - OPJ_UINT32 last_pass_rate = opj_mqc_numbytes(mqc); - for (passno = cblk->totalpasses; passno > 0;) { - opj_tcd_pass_t *pass = &cblk->passes[--passno]; - if (pass->rate > last_pass_rate) { - pass->rate = last_pass_rate; - } else { - last_pass_rate = pass->rate; - } - } - } - - for (passno = 0; passno < cblk->totalpasses; passno++) { - opj_tcd_pass_t *pass = &cblk->passes[passno]; - - /* Prevent generation of FF as last data byte of a pass*/ - /* For terminating passes, the flushing procedure ensured this already */ - assert(pass->rate > 0); - if (cblk->data[pass->rate - 1] == 0xFF) { - pass->rate--; - } - pass->len = pass->rate - (passno == 0 ? 0 : cblk->passes[passno - 1].rate); - } - -#ifdef EXTRA_DEBUG - printf(" len=%d\n", (cblk->totalpasses) ? 
opj_mqc_numbytes(mqc) : 0); - - /* Check that there are no 0xff >= 0x90 sequences */ - if (cblk->totalpasses) { - OPJ_UINT32 i; - OPJ_UINT32 len = opj_mqc_numbytes(mqc); - for (i = 1; i < len; ++i) { - if (cblk->data[i - 1] == 0xff && cblk->data[i] >= 0x90) { - printf("0xff %02x at offset %d\n", cblk->data[i], i - 1); - abort(); - } - } - } -#endif -} diff --git a/src/3rd/LibOpenJpeg/t1.h b/src/3rd/LibOpenJpeg/t1.h deleted file mode 100644 index 171dfb0a..00000000 --- a/src/3rd/LibOpenJpeg/t1.h +++ /dev/null @@ -1,269 +0,0 @@ -/* - * The copyright in this software is being made available under the 2-clauses - * BSD License, included below. This software may be subject to other third - * party and contributor rights, including patent rights, and no such rights - * are granted under this license. - * - * Copyright (c) 2002-2014, Universite catholique de Louvain (UCL), Belgium - * Copyright (c) 2002-2014, Professor Benoit Macq - * Copyright (c) 2001-2003, David Janssens - * Copyright (c) 2002-2003, Yannick Verschueren - * Copyright (c) 2003-2007, Francois-Olivier Devaux - * Copyright (c) 2003-2014, Antonin Descampe - * Copyright (c) 2005, Herve Drolon, FreeImage Team - * Copyright (c) 2012, Carl Hetherington - * Copyright (c) 2017, IntoPIX SA - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS' - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ -#ifndef OPJ_T1_H -#define OPJ_T1_H -/** -@file t1.h -@brief Implementation of the tier-1 coding (coding of code-block coefficients) (T1) - -The functions in T1.C implement the tier-1 coding operation. The functions -in T1.C are used by some functions in TCD.C.
-*/ - -/** @defgroup T1 T1 - Implementation of the tier-1 coding */ -/*@{*/ - -/* ----------------------------------------------------------------------- */ -#define T1_NMSEDEC_BITS 7 - -#define T1_NUMCTXS_ZC 9 -#define T1_NUMCTXS_SC 5 -#define T1_NUMCTXS_MAG 3 -#define T1_NUMCTXS_AGG 1 -#define T1_NUMCTXS_UNI 1 - -#define T1_CTXNO_ZC 0 -#define T1_CTXNO_SC (T1_CTXNO_ZC+T1_NUMCTXS_ZC) -#define T1_CTXNO_MAG (T1_CTXNO_SC+T1_NUMCTXS_SC) -#define T1_CTXNO_AGG (T1_CTXNO_MAG+T1_NUMCTXS_MAG) -#define T1_CTXNO_UNI (T1_CTXNO_AGG+T1_NUMCTXS_AGG) -#define T1_NUMCTXS (T1_CTXNO_UNI+T1_NUMCTXS_UNI) - -#define T1_NMSEDEC_FRACBITS (T1_NMSEDEC_BITS-1) - -#define T1_TYPE_MQ 0 /**< Normal coding using entropy coder */ -#define T1_TYPE_RAW 1 /**< No encoding: the information is stored in raw format in the codestream (mode switch RAW) */ - -/* BEGINNING of flags that apply to opj_flag_t */ -/** We hold the state of individual data points for the T1 encoder using - * a single 32-bit flags word to hold the state of 4 data points. This corresponds - * to the 4-point-high columns that the data is processed in. - * - * These \#defines declare the layout of a 32-bit flags word. - * - * This is currently done for encoding only. - * The values must NOT be changed, otherwise this is going to break a lot of - * assumptions. - */ - -/* SIGMA: significance state (3 cols x 6 rows) - * CHI: state for negative sample value (1 col x 6 rows) - * MU: state for visited in refinement pass (1 col x 4 rows) - * PI: state for visited in significance pass (1 col * 4 rows) - */ - -#define T1_SIGMA_0 (1U << 0) -#define T1_SIGMA_1 (1U << 1) -#define T1_SIGMA_2 (1U << 2) -#define T1_SIGMA_3 (1U << 3) -#define T1_SIGMA_4 (1U << 4) -#define T1_SIGMA_5 (1U << 5) -#define T1_SIGMA_6 (1U << 6) -#define T1_SIGMA_7 (1U << 7) -#define T1_SIGMA_8 (1U << 8) -#define T1_SIGMA_9 (1U << 9) -#define T1_SIGMA_10 (1U << 10) -#define T1_SIGMA_11 (1U << 11) -#define T1_SIGMA_12 (1U << 12) -#define T1_SIGMA_13 (1U << 13) -#define T1_SIGMA_14 (1U << 14) -#define T1_SIGMA_15 (1U << 15) -#define T1_SIGMA_16 (1U << 16) -#define T1_SIGMA_17 (1U << 17) - -#define T1_CHI_0 (1U << 18) -#define T1_CHI_0_I 18 -#define T1_CHI_1 (1U << 19) -#define T1_CHI_1_I 19 -#define T1_MU_0 (1U << 20) -#define T1_PI_0 (1U << 21) -#define T1_CHI_2 (1U << 22) -#define T1_CHI_2_I 22 -#define T1_MU_1 (1U << 23) -#define T1_PI_1 (1U << 24) -#define T1_CHI_3 (1U << 25) -#define T1_MU_2 (1U << 26) -#define T1_PI_2 (1U << 27) -#define T1_CHI_4 (1U << 28) -#define T1_MU_3 (1U << 29) -#define T1_PI_3 (1U << 30) -#define T1_CHI_5 (1U << 31) -#define T1_CHI_5_I 31 - -/** As an example, the bits T1_SIGMA_3, T1_SIGMA_4 and T1_SIGMA_5 - * indicate the significance state of the west neighbour of data point zero - * of our four, the point itself, and its east neighbour respectively. - * Many of the bits are arranged so that given a flags word, you can - * look at the values for the data point 0, then shift the flags - * word right by 3 bits and look at the same bit positions to see the - * values for data point 1. - * - * The \#defines below help a bit with this; say you have a flags word - * f, you can do things like - * - * (f & T1_SIGMA_THIS) - * - * to see the significance bit of data point 0, then do - * - * ((f >> 3) & T1_SIGMA_THIS) - * - * to see the significance bit of data point 1.
- */ - -#define T1_SIGMA_NW T1_SIGMA_0 -#define T1_SIGMA_N T1_SIGMA_1 -#define T1_SIGMA_NE T1_SIGMA_2 -#define T1_SIGMA_W T1_SIGMA_3 -#define T1_SIGMA_THIS T1_SIGMA_4 -#define T1_SIGMA_E T1_SIGMA_5 -#define T1_SIGMA_SW T1_SIGMA_6 -#define T1_SIGMA_S T1_SIGMA_7 -#define T1_SIGMA_SE T1_SIGMA_8 -#define T1_SIGMA_NEIGHBOURS (T1_SIGMA_NW | T1_SIGMA_N | T1_SIGMA_NE | T1_SIGMA_W | T1_SIGMA_E | T1_SIGMA_SW | T1_SIGMA_S | T1_SIGMA_SE) - -#define T1_CHI_THIS T1_CHI_1 -#define T1_CHI_THIS_I T1_CHI_1_I -#define T1_MU_THIS T1_MU_0 -#define T1_PI_THIS T1_PI_0 -#define T1_CHI_S T1_CHI_2 - -#define T1_LUT_SGN_W (1U << 0) -#define T1_LUT_SIG_N (1U << 1) -#define T1_LUT_SGN_E (1U << 2) -#define T1_LUT_SIG_W (1U << 3) -#define T1_LUT_SGN_N (1U << 4) -#define T1_LUT_SIG_E (1U << 5) -#define T1_LUT_SGN_S (1U << 6) -#define T1_LUT_SIG_S (1U << 7) -/* END of flags that apply to opj_flag_t */ - -/* ----------------------------------------------------------------------- */ - -/** Flags for 4 consecutive rows of a column */ -typedef OPJ_UINT32 opj_flag_t; - -/** -Tier-1 coding (coding of code-block coefficients) -*/ -typedef struct opj_t1 { - - /** MQC component */ - opj_mqc_t mqc; - - OPJ_INT32 *data; - /** Flags used by decoder and encoder. - * Such that flags[1+0] is for state of col=0,row=0..3, - flags[1+1] for col=1, row=0..3, flags[1+flags_stride] for col=0,row=4..7, ... - This array avoids too much cache thrashing when processing by 4 vertical samples - as done in the various decoding steps. */ - opj_flag_t *flags; - - OPJ_UINT32 w; - OPJ_UINT32 h; - OPJ_UINT32 datasize; - OPJ_UINT32 flagssize; - OPJ_UINT32 data_stride; - OPJ_BOOL encoder; - - /* The 3 variables below are only used by the decoder */ - /* set to TRUE in multithreaded context */ - OPJ_BOOL mustuse_cblkdatabuffer; - /* Temporary buffer to concatenate all chunks of a code-block */ - OPJ_BYTE *cblkdatabuffer; - /* Maximum size available in cblkdatabuffer */ - OPJ_UINT32 cblkdatabuffersize; -} opj_t1_t; - -/** @name Exported functions */ -/*@{*/ -/* ----------------------------------------------------------------------- */ - -/** -Encode the code-blocks of a tile -@param t1 T1 handle -@param tile The tile to encode -@param tcp Tile coding parameters -@param mct_norms FIXME DOC -@param mct_numcomps Number of components used for MCT -*/ -OPJ_BOOL opj_t1_encode_cblks(opj_t1_t *t1, - opj_tcd_tile_t *tile, - opj_tcp_t *tcp, - const OPJ_FLOAT64 * mct_norms, - OPJ_UINT32 mct_numcomps); - -/** -Decode the code-blocks of a tile -@param tcd TCD handle -@param pret Pointer to return value -@param tilec The tile to decode -@param tccp Tile coding parameters -@param p_manager the event manager -@param p_manager_mutex mutex for the event manager -@param check_pterm whether PTERM correct termination should be checked -*/ -void opj_t1_decode_cblks(opj_tcd_t* tcd, - volatile OPJ_BOOL* pret, - opj_tcd_tilecomp_t* tilec, - opj_tccp_t* tccp, - opj_event_mgr_t *p_manager, - opj_mutex_t* p_manager_mutex, - OPJ_BOOL check_pterm); - - - -/** - * Creates a new Tier 1 handle - * and initializes the look-up tables of the Tier-1 coder/decoder - * @return a new T1 handle if successful, returns NULL otherwise -*/ -opj_t1_t* opj_t1_create(OPJ_BOOL isEncoder); - -/** - * Destroys a previously created T1 handle - * - * @param p_t1 Tier 1 handle to destroy -*/ -void opj_t1_destroy(opj_t1_t *p_t1); -/* ----------------------------------------------------------------------- */ -/*@}*/ - -/*@}*/ - -#endif /* OPJ_T1_H */ diff --git a/src/3rd/LibOpenJpeg/t1_generate_luts.c
b/src/3rd/LibOpenJpeg/t1_generate_luts.c deleted file mode 100644 index 9ad6f200..00000000 --- a/src/3rd/LibOpenJpeg/t1_generate_luts.c +++ /dev/null @@ -1,311 +0,0 @@ -/* - * The copyright in this software is being made available under the 2-clauses - * BSD License, included below. This software may be subject to other third - * party and contributor rights, including patent rights, and no such rights - * are granted under this license. - * - * Copyright (c) 2002-2014, Universite catholique de Louvain (UCL), Belgium - * Copyright (c) 2002-2014, Professor Benoit Macq - * Copyright (c) 2001-2003, David Janssens - * Copyright (c) 2002-2003, Yannick Verschueren - * Copyright (c) 2003-2007, Francois-Olivier Devaux - * Copyright (c) 2003-2014, Antonin Descampe - * Copyright (c) 2005, Herve Drolon, FreeImage Team - * Copyright (c) 2007, Callum Lerwick - * Copyright (c) 2012, Carl Hetherington - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS' - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- */ - -#include "opj_includes.h" - -static int t1_init_ctxno_zc(OPJ_UINT32 f, OPJ_UINT32 orient) -{ - int h, v, d, n, t, hv; - n = 0; - h = ((f & T1_SIGMA_3) != 0) + ((f & T1_SIGMA_5) != 0); - v = ((f & T1_SIGMA_1) != 0) + ((f & T1_SIGMA_7) != 0); - d = ((f & T1_SIGMA_0) != 0) + ((f & T1_SIGMA_2) != 0) + (( - f & T1_SIGMA_8) != 0) + ((f & T1_SIGMA_6) != 0); - - switch (orient) { - case 2: - t = h; - h = v; - v = t; - case 0: - case 1: - if (!h) { - if (!v) { - if (!d) { - n = 0; - } else if (d == 1) { - n = 1; - } else { - n = 2; - } - } else if (v == 1) { - n = 3; - } else { - n = 4; - } - } else if (h == 1) { - if (!v) { - if (!d) { - n = 5; - } else { - n = 6; - } - } else { - n = 7; - } - } else { - n = 8; - } - break; - case 3: - hv = h + v; - if (!d) { - if (!hv) { - n = 0; - } else if (hv == 1) { - n = 1; - } else { - n = 2; - } - } else if (d == 1) { - if (!hv) { - n = 3; - } else if (hv == 1) { - n = 4; - } else { - n = 5; - } - } else if (d == 2) { - if (!hv) { - n = 6; - } else { - n = 7; - } - } else { - n = 8; - } - break; - } - - return (T1_CTXNO_ZC + n); -} - -static int t1_init_ctxno_sc(OPJ_UINT32 f) -{ - int hc, vc, n; - n = 0; - - hc = opj_int_min(((f & (T1_LUT_SIG_E | T1_LUT_SGN_E)) == - T1_LUT_SIG_E) + ((f & (T1_LUT_SIG_W | T1_LUT_SGN_W)) == T1_LUT_SIG_W), - 1) - opj_int_min(((f & (T1_LUT_SIG_E | T1_LUT_SGN_E)) == - (T1_LUT_SIG_E | T1_LUT_SGN_E)) + - ((f & (T1_LUT_SIG_W | T1_LUT_SGN_W)) == - (T1_LUT_SIG_W | T1_LUT_SGN_W)), 1); - - vc = opj_int_min(((f & (T1_LUT_SIG_N | T1_LUT_SGN_N)) == - T1_LUT_SIG_N) + ((f & (T1_LUT_SIG_S | T1_LUT_SGN_S)) == T1_LUT_SIG_S), - 1) - opj_int_min(((f & (T1_LUT_SIG_N | T1_LUT_SGN_N)) == - (T1_LUT_SIG_N | T1_LUT_SGN_N)) + - ((f & (T1_LUT_SIG_S | T1_LUT_SGN_S)) == - (T1_LUT_SIG_S | T1_LUT_SGN_S)), 1); - - if (hc < 0) { - hc = -hc; - vc = -vc; - } - if (!hc) { - if (vc == -1) { - n = 1; - } else if (!vc) { - n = 0; - } else { - n = 1; - } - } else if (hc == 1) { - if (vc == -1) { - n = 2; - } else if (!vc) { - n = 3; - } else { - n = 4; - } - } - - return (T1_CTXNO_SC + n); -} - -static int t1_init_spb(OPJ_UINT32 f) -{ - int hc, vc, n; - - hc = opj_int_min(((f & (T1_LUT_SIG_E | T1_LUT_SGN_E)) == - T1_LUT_SIG_E) + ((f & (T1_LUT_SIG_W | T1_LUT_SGN_W)) == T1_LUT_SIG_W), - 1) - opj_int_min(((f & (T1_LUT_SIG_E | T1_LUT_SGN_E)) == - (T1_LUT_SIG_E | T1_LUT_SGN_E)) + - ((f & (T1_LUT_SIG_W | T1_LUT_SGN_W)) == - (T1_LUT_SIG_W | T1_LUT_SGN_W)), 1); - - vc = opj_int_min(((f & (T1_LUT_SIG_N | T1_LUT_SGN_N)) == - T1_LUT_SIG_N) + ((f & (T1_LUT_SIG_S | T1_LUT_SGN_S)) == T1_LUT_SIG_S), - 1) - opj_int_min(((f & (T1_LUT_SIG_N | T1_LUT_SGN_N)) == - (T1_LUT_SIG_N | T1_LUT_SGN_N)) + - ((f & (T1_LUT_SIG_S | T1_LUT_SGN_S)) == - (T1_LUT_SIG_S | T1_LUT_SGN_S)), 1); - - if (!hc && !vc) { - n = 0; - } else { - n = (!(hc > 0 || (!hc && vc > 0))); - } - - return n; -} - -static void dump_array16(int array[], int size) -{ - int i; - --size; - for (i = 0; i < size; ++i) { - printf("0x%04x,", array[i]); - if (!((i + 1) & 0x7)) { - printf("\n "); - } else { - printf(" "); - } - } - printf("0x%04x\n};\n\n", array[size]); -} - -int main(int argc, char **argv) -{ - unsigned int i, j; - double u, v, t; - - int lut_ctxno_zc[2048]; - int lut_nmsedec_sig[1 << T1_NMSEDEC_BITS]; - int lut_nmsedec_sig0[1 << T1_NMSEDEC_BITS]; - int lut_nmsedec_ref[1 << T1_NMSEDEC_BITS]; - int lut_nmsedec_ref0[1 << T1_NMSEDEC_BITS]; - (void)argc; - (void)argv; - - printf("/* This file was automatically generated by t1_generate_luts.c */\n\n"); - - /* lut_ctxno_zc */ - for (j = 0; j < 4; ++j) { - for (i = 
0; i < 512; ++i) { - OPJ_UINT32 orient = j; - if (orient == 2) { - orient = 1; - } else if (orient == 1) { - orient = 2; - } - lut_ctxno_zc[(orient << 9) | i] = t1_init_ctxno_zc(i, j); - } - } - - printf("static const OPJ_BYTE lut_ctxno_zc[2048] = {\n "); - for (i = 0; i < 2047; ++i) { - printf("%i,", lut_ctxno_zc[i]); - if (!((i + 1) & 0x1f)) { - printf("\n "); - } else { - printf(" "); - } - } - printf("%i\n};\n\n", lut_ctxno_zc[2047]); - - /* lut_ctxno_sc */ - printf("static const OPJ_BYTE lut_ctxno_sc[256] = {\n "); - for (i = 0; i < 255; ++i) { - printf("0x%x,", t1_init_ctxno_sc(i)); - if (!((i + 1) & 0xf)) { - printf("\n "); - } else { - printf(" "); - } - } - printf("0x%x\n};\n\n", t1_init_ctxno_sc(255)); - - /* lut_spb */ - printf("static const OPJ_BYTE lut_spb[256] = {\n "); - for (i = 0; i < 255; ++i) { - printf("%i,", t1_init_spb(i)); - if (!((i + 1) & 0x1f)) { - printf("\n "); - } else { - printf(" "); - } - } - printf("%i\n};\n\n", t1_init_spb(255)); - - /* FIXME FIXME FIXME */ - /* fprintf(stdout,"nmsedec luts:\n"); */ - for (i = 0U; i < (1U << T1_NMSEDEC_BITS); ++i) { - t = i / pow(2, T1_NMSEDEC_FRACBITS); - u = t; - v = t - 1.5; - lut_nmsedec_sig[i] = - opj_int_max(0, - (int)(floor((u * u - v * v) * pow(2, T1_NMSEDEC_FRACBITS) + 0.5) / pow(2, - T1_NMSEDEC_FRACBITS) * 8192.0)); - lut_nmsedec_sig0[i] = - opj_int_max(0, - (int)(floor((u * u) * pow(2, T1_NMSEDEC_FRACBITS) + 0.5) / pow(2, - T1_NMSEDEC_FRACBITS) * 8192.0)); - u = t - 1.0; - if (i & (1 << (T1_NMSEDEC_BITS - 1))) { - v = t - 1.5; - } else { - v = t - 0.5; - } - lut_nmsedec_ref[i] = - opj_int_max(0, - (int)(floor((u * u - v * v) * pow(2, T1_NMSEDEC_FRACBITS) + 0.5) / pow(2, - T1_NMSEDEC_FRACBITS) * 8192.0)); - lut_nmsedec_ref0[i] = - opj_int_max(0, - (int)(floor((u * u) * pow(2, T1_NMSEDEC_FRACBITS) + 0.5) / pow(2, - T1_NMSEDEC_FRACBITS) * 8192.0)); - } - - printf("static const OPJ_INT16 lut_nmsedec_sig[1U << T1_NMSEDEC_BITS] = {\n "); - dump_array16(lut_nmsedec_sig, 1U << T1_NMSEDEC_BITS); - - printf("static const OPJ_INT16 lut_nmsedec_sig0[1U << T1_NMSEDEC_BITS] = {\n "); - dump_array16(lut_nmsedec_sig0, 1U << T1_NMSEDEC_BITS); - - printf("static const OPJ_INT16 lut_nmsedec_ref[1U << T1_NMSEDEC_BITS] = {\n "); - dump_array16(lut_nmsedec_ref, 1U << T1_NMSEDEC_BITS); - - printf("static const OPJ_INT16 lut_nmsedec_ref0[1U << T1_NMSEDEC_BITS] = {\n "); - dump_array16(lut_nmsedec_ref0, 1U << T1_NMSEDEC_BITS); - - return 0; -} diff --git a/src/3rd/LibOpenJpeg/t1_luts.h b/src/3rd/LibOpenJpeg/t1_luts.h deleted file mode 100644 index 1a5e7844..00000000 --- a/src/3rd/LibOpenJpeg/t1_luts.h +++ /dev/null @@ -1,175 +0,0 @@ -/* This file was automatically generated by t1_generate_luts.c */ - -static const OPJ_BYTE lut_ctxno_zc[2048] = { - 0, 1, 3, 3, 1, 2, 3, 3, 5, 6, 7, 7, 6, 6, 7, 7, 0, 1, 3, 3, 1, 2, 3, 3, 5, 6, 7, 7, 6, 6, 7, 7, - 5, 6, 7, 7, 6, 6, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 5, 6, 7, 7, 6, 6, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, - 1, 2, 3, 3, 2, 2, 3, 3, 6, 6, 7, 7, 6, 6, 7, 7, 1, 2, 3, 3, 2, 2, 3, 3, 6, 6, 7, 7, 6, 6, 7, 7, - 6, 6, 7, 7, 6, 6, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 6, 6, 7, 7, 6, 6, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, - 3, 3, 4, 4, 3, 3, 4, 4, 7, 7, 7, 7, 7, 7, 7, 7, 3, 3, 4, 4, 3, 3, 4, 4, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, - 3, 3, 4, 4, 3, 3, 4, 4, 7, 7, 7, 7, 7, 7, 7, 7, 3, 3, 4, 4, 3, 3, 4, 4, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, - 1, 2, 3, 3, 2, 2, 3, 
3, 6, 6, 7, 7, 6, 6, 7, 7, 1, 2, 3, 3, 2, 2, 3, 3, 6, 6, 7, 7, 6, 6, 7, 7, - 6, 6, 7, 7, 6, 6, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 6, 6, 7, 7, 6, 6, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, - 2, 2, 3, 3, 2, 2, 3, 3, 6, 6, 7, 7, 6, 6, 7, 7, 2, 2, 3, 3, 2, 2, 3, 3, 6, 6, 7, 7, 6, 6, 7, 7, - 6, 6, 7, 7, 6, 6, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 6, 6, 7, 7, 6, 6, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, - 3, 3, 4, 4, 3, 3, 4, 4, 7, 7, 7, 7, 7, 7, 7, 7, 3, 3, 4, 4, 3, 3, 4, 4, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, - 3, 3, 4, 4, 3, 3, 4, 4, 7, 7, 7, 7, 7, 7, 7, 7, 3, 3, 4, 4, 3, 3, 4, 4, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, - 0, 1, 5, 6, 1, 2, 6, 6, 3, 3, 7, 7, 3, 3, 7, 7, 0, 1, 5, 6, 1, 2, 6, 6, 3, 3, 7, 7, 3, 3, 7, 7, - 3, 3, 7, 7, 3, 3, 7, 7, 4, 4, 7, 7, 4, 4, 7, 7, 3, 3, 7, 7, 3, 3, 7, 7, 4, 4, 7, 7, 4, 4, 7, 7, - 1, 2, 6, 6, 2, 2, 6, 6, 3, 3, 7, 7, 3, 3, 7, 7, 1, 2, 6, 6, 2, 2, 6, 6, 3, 3, 7, 7, 3, 3, 7, 7, - 3, 3, 7, 7, 3, 3, 7, 7, 4, 4, 7, 7, 4, 4, 7, 7, 3, 3, 7, 7, 3, 3, 7, 7, 4, 4, 7, 7, 4, 4, 7, 7, - 5, 6, 8, 8, 6, 6, 8, 8, 7, 7, 8, 8, 7, 7, 8, 8, 5, 6, 8, 8, 6, 6, 8, 8, 7, 7, 8, 8, 7, 7, 8, 8, - 7, 7, 8, 8, 7, 7, 8, 8, 7, 7, 8, 8, 7, 7, 8, 8, 7, 7, 8, 8, 7, 7, 8, 8, 7, 7, 8, 8, 7, 7, 8, 8, - 6, 6, 8, 8, 6, 6, 8, 8, 7, 7, 8, 8, 7, 7, 8, 8, 6, 6, 8, 8, 6, 6, 8, 8, 7, 7, 8, 8, 7, 7, 8, 8, - 7, 7, 8, 8, 7, 7, 8, 8, 7, 7, 8, 8, 7, 7, 8, 8, 7, 7, 8, 8, 7, 7, 8, 8, 7, 7, 8, 8, 7, 7, 8, 8, - 1, 2, 6, 6, 2, 2, 6, 6, 3, 3, 7, 7, 3, 3, 7, 7, 1, 2, 6, 6, 2, 2, 6, 6, 3, 3, 7, 7, 3, 3, 7, 7, - 3, 3, 7, 7, 3, 3, 7, 7, 4, 4, 7, 7, 4, 4, 7, 7, 3, 3, 7, 7, 3, 3, 7, 7, 4, 4, 7, 7, 4, 4, 7, 7, - 2, 2, 6, 6, 2, 2, 6, 6, 3, 3, 7, 7, 3, 3, 7, 7, 2, 2, 6, 6, 2, 2, 6, 6, 3, 3, 7, 7, 3, 3, 7, 7, - 3, 3, 7, 7, 3, 3, 7, 7, 4, 4, 7, 7, 4, 4, 7, 7, 3, 3, 7, 7, 3, 3, 7, 7, 4, 4, 7, 7, 4, 4, 7, 7, - 6, 6, 8, 8, 6, 6, 8, 8, 7, 7, 8, 8, 7, 7, 8, 8, 6, 6, 8, 8, 6, 6, 8, 8, 7, 7, 8, 8, 7, 7, 8, 8, - 7, 7, 8, 8, 7, 7, 8, 8, 7, 7, 8, 8, 7, 7, 8, 8, 7, 7, 8, 8, 7, 7, 8, 8, 7, 7, 8, 8, 7, 7, 8, 8, - 6, 6, 8, 8, 6, 6, 8, 8, 7, 7, 8, 8, 7, 7, 8, 8, 6, 6, 8, 8, 6, 6, 8, 8, 7, 7, 8, 8, 7, 7, 8, 8, - 7, 7, 8, 8, 7, 7, 8, 8, 7, 7, 8, 8, 7, 7, 8, 8, 7, 7, 8, 8, 7, 7, 8, 8, 7, 7, 8, 8, 7, 7, 8, 8, - 0, 1, 3, 3, 1, 2, 3, 3, 5, 6, 7, 7, 6, 6, 7, 7, 0, 1, 3, 3, 1, 2, 3, 3, 5, 6, 7, 7, 6, 6, 7, 7, - 5, 6, 7, 7, 6, 6, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 5, 6, 7, 7, 6, 6, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, - 1, 2, 3, 3, 2, 2, 3, 3, 6, 6, 7, 7, 6, 6, 7, 7, 1, 2, 3, 3, 2, 2, 3, 3, 6, 6, 7, 7, 6, 6, 7, 7, - 6, 6, 7, 7, 6, 6, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 6, 6, 7, 7, 6, 6, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, - 3, 3, 4, 4, 3, 3, 4, 4, 7, 7, 7, 7, 7, 7, 7, 7, 3, 3, 4, 4, 3, 3, 4, 4, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, - 3, 3, 4, 4, 3, 3, 4, 4, 7, 7, 7, 7, 7, 7, 7, 7, 3, 3, 4, 4, 3, 3, 4, 4, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, - 1, 2, 3, 3, 2, 2, 3, 3, 6, 6, 7, 7, 6, 6, 7, 7, 1, 2, 3, 3, 2, 2, 3, 3, 6, 6, 7, 7, 6, 6, 7, 7, - 6, 6, 7, 7, 6, 6, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 6, 6, 7, 7, 6, 6, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, - 2, 2, 3, 3, 2, 2, 3, 3, 6, 6, 7, 7, 6, 6, 7, 7, 2, 2, 3, 3, 2, 2, 3, 3, 6, 6, 7, 7, 6, 6, 7, 7, - 6, 6, 7, 7, 6, 6, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 6, 6, 7, 7, 6, 6, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, - 3, 3, 4, 4, 3, 3, 4, 4, 7, 7, 7, 7, 7, 7, 7, 7, 
3, 3, 4, 4, 3, 3, 4, 4, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, - 3, 3, 4, 4, 3, 3, 4, 4, 7, 7, 7, 7, 7, 7, 7, 7, 3, 3, 4, 4, 3, 3, 4, 4, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, - 0, 3, 1, 4, 3, 6, 4, 7, 1, 4, 2, 5, 4, 7, 5, 7, 0, 3, 1, 4, 3, 6, 4, 7, 1, 4, 2, 5, 4, 7, 5, 7, - 1, 4, 2, 5, 4, 7, 5, 7, 2, 5, 2, 5, 5, 7, 5, 7, 1, 4, 2, 5, 4, 7, 5, 7, 2, 5, 2, 5, 5, 7, 5, 7, - 3, 6, 4, 7, 6, 8, 7, 8, 4, 7, 5, 7, 7, 8, 7, 8, 3, 6, 4, 7, 6, 8, 7, 8, 4, 7, 5, 7, 7, 8, 7, 8, - 4, 7, 5, 7, 7, 8, 7, 8, 5, 7, 5, 7, 7, 8, 7, 8, 4, 7, 5, 7, 7, 8, 7, 8, 5, 7, 5, 7, 7, 8, 7, 8, - 1, 4, 2, 5, 4, 7, 5, 7, 2, 5, 2, 5, 5, 7, 5, 7, 1, 4, 2, 5, 4, 7, 5, 7, 2, 5, 2, 5, 5, 7, 5, 7, - 2, 5, 2, 5, 5, 7, 5, 7, 2, 5, 2, 5, 5, 7, 5, 7, 2, 5, 2, 5, 5, 7, 5, 7, 2, 5, 2, 5, 5, 7, 5, 7, - 4, 7, 5, 7, 7, 8, 7, 8, 5, 7, 5, 7, 7, 8, 7, 8, 4, 7, 5, 7, 7, 8, 7, 8, 5, 7, 5, 7, 7, 8, 7, 8, - 5, 7, 5, 7, 7, 8, 7, 8, 5, 7, 5, 7, 7, 8, 7, 8, 5, 7, 5, 7, 7, 8, 7, 8, 5, 7, 5, 7, 7, 8, 7, 8, - 3, 6, 4, 7, 6, 8, 7, 8, 4, 7, 5, 7, 7, 8, 7, 8, 3, 6, 4, 7, 6, 8, 7, 8, 4, 7, 5, 7, 7, 8, 7, 8, - 4, 7, 5, 7, 7, 8, 7, 8, 5, 7, 5, 7, 7, 8, 7, 8, 4, 7, 5, 7, 7, 8, 7, 8, 5, 7, 5, 7, 7, 8, 7, 8, - 6, 8, 7, 8, 8, 8, 8, 8, 7, 8, 7, 8, 8, 8, 8, 8, 6, 8, 7, 8, 8, 8, 8, 8, 7, 8, 7, 8, 8, 8, 8, 8, - 7, 8, 7, 8, 8, 8, 8, 8, 7, 8, 7, 8, 8, 8, 8, 8, 7, 8, 7, 8, 8, 8, 8, 8, 7, 8, 7, 8, 8, 8, 8, 8, - 4, 7, 5, 7, 7, 8, 7, 8, 5, 7, 5, 7, 7, 8, 7, 8, 4, 7, 5, 7, 7, 8, 7, 8, 5, 7, 5, 7, 7, 8, 7, 8, - 5, 7, 5, 7, 7, 8, 7, 8, 5, 7, 5, 7, 7, 8, 7, 8, 5, 7, 5, 7, 7, 8, 7, 8, 5, 7, 5, 7, 7, 8, 7, 8, - 7, 8, 7, 8, 8, 8, 8, 8, 7, 8, 7, 8, 8, 8, 8, 8, 7, 8, 7, 8, 8, 8, 8, 8, 7, 8, 7, 8, 8, 8, 8, 8, - 7, 8, 7, 8, 8, 8, 8, 8, 7, 8, 7, 8, 8, 8, 8, 8, 7, 8, 7, 8, 8, 8, 8, 8, 7, 8, 7, 8, 8, 8, 8, 8 -}; - -static const OPJ_BYTE lut_ctxno_sc[256] = { - 0x9, 0x9, 0xa, 0xa, 0x9, 0x9, 0xa, 0xa, 0xc, 0xc, 0xd, 0xb, 0xc, 0xc, 0xd, 0xb, - 0x9, 0x9, 0xa, 0xa, 0x9, 0x9, 0xa, 0xa, 0xc, 0xc, 0xb, 0xd, 0xc, 0xc, 0xb, 0xd, - 0xc, 0xc, 0xd, 0xd, 0xc, 0xc, 0xb, 0xb, 0xc, 0x9, 0xd, 0xa, 0x9, 0xc, 0xa, 0xb, - 0xc, 0xc, 0xb, 0xb, 0xc, 0xc, 0xd, 0xd, 0xc, 0x9, 0xb, 0xa, 0x9, 0xc, 0xa, 0xd, - 0x9, 0x9, 0xa, 0xa, 0x9, 0x9, 0xa, 0xa, 0xc, 0xc, 0xd, 0xb, 0xc, 0xc, 0xd, 0xb, - 0x9, 0x9, 0xa, 0xa, 0x9, 0x9, 0xa, 0xa, 0xc, 0xc, 0xb, 0xd, 0xc, 0xc, 0xb, 0xd, - 0xc, 0xc, 0xd, 0xd, 0xc, 0xc, 0xb, 0xb, 0xc, 0x9, 0xd, 0xa, 0x9, 0xc, 0xa, 0xb, - 0xc, 0xc, 0xb, 0xb, 0xc, 0xc, 0xd, 0xd, 0xc, 0x9, 0xb, 0xa, 0x9, 0xc, 0xa, 0xd, - 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xd, 0xb, 0xd, 0xb, 0xd, 0xb, 0xd, 0xb, - 0xa, 0xa, 0x9, 0x9, 0xa, 0xa, 0x9, 0x9, 0xd, 0xb, 0xc, 0xc, 0xd, 0xb, 0xc, 0xc, - 0xd, 0xd, 0xd, 0xd, 0xb, 0xb, 0xb, 0xb, 0xd, 0xa, 0xd, 0xa, 0xa, 0xb, 0xa, 0xb, - 0xd, 0xd, 0xc, 0xc, 0xb, 0xb, 0xc, 0xc, 0xd, 0xa, 0xc, 0x9, 0xa, 0xb, 0x9, 0xc, - 0xa, 0xa, 0x9, 0x9, 0xa, 0xa, 0x9, 0x9, 0xb, 0xd, 0xc, 0xc, 0xb, 0xd, 0xc, 0xc, - 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xb, 0xd, 0xb, 0xd, 0xb, 0xd, 0xb, 0xd, - 0xb, 0xb, 0xc, 0xc, 0xd, 0xd, 0xc, 0xc, 0xb, 0xa, 0xc, 0x9, 0xa, 0xd, 0x9, 0xc, - 0xb, 0xb, 0xb, 0xb, 0xd, 0xd, 0xd, 0xd, 0xb, 0xa, 0xb, 0xa, 0xa, 0xd, 0xa, 0xd -}; - -static const OPJ_BYTE lut_spb[256] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, - 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 
1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, - 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, - 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, - 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, - 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1 -}; - -static const OPJ_INT16 lut_nmsedec_sig[1U << T1_NMSEDEC_BITS] = { - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0180, 0x0300, 0x0480, 0x0600, 0x0780, 0x0900, 0x0a80, - 0x0c00, 0x0d80, 0x0f00, 0x1080, 0x1200, 0x1380, 0x1500, 0x1680, - 0x1800, 0x1980, 0x1b00, 0x1c80, 0x1e00, 0x1f80, 0x2100, 0x2280, - 0x2400, 0x2580, 0x2700, 0x2880, 0x2a00, 0x2b80, 0x2d00, 0x2e80, - 0x3000, 0x3180, 0x3300, 0x3480, 0x3600, 0x3780, 0x3900, 0x3a80, - 0x3c00, 0x3d80, 0x3f00, 0x4080, 0x4200, 0x4380, 0x4500, 0x4680, - 0x4800, 0x4980, 0x4b00, 0x4c80, 0x4e00, 0x4f80, 0x5100, 0x5280, - 0x5400, 0x5580, 0x5700, 0x5880, 0x5a00, 0x5b80, 0x5d00, 0x5e80, - 0x6000, 0x6180, 0x6300, 0x6480, 0x6600, 0x6780, 0x6900, 0x6a80, - 0x6c00, 0x6d80, 0x6f00, 0x7080, 0x7200, 0x7380, 0x7500, 0x7680 -}; - -static const OPJ_INT16 lut_nmsedec_sig0[1U << T1_NMSEDEC_BITS] = { - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0080, 0x0080, - 0x0080, 0x0080, 0x0100, 0x0100, 0x0100, 0x0180, 0x0180, 0x0200, - 0x0200, 0x0280, 0x0280, 0x0300, 0x0300, 0x0380, 0x0400, 0x0400, - 0x0480, 0x0500, 0x0580, 0x0580, 0x0600, 0x0680, 0x0700, 0x0780, - 0x0800, 0x0880, 0x0900, 0x0980, 0x0a00, 0x0a80, 0x0b80, 0x0c00, - 0x0c80, 0x0d00, 0x0e00, 0x0e80, 0x0f00, 0x1000, 0x1080, 0x1180, - 0x1200, 0x1300, 0x1380, 0x1480, 0x1500, 0x1600, 0x1700, 0x1780, - 0x1880, 0x1980, 0x1a80, 0x1b00, 0x1c00, 0x1d00, 0x1e00, 0x1f00, - 0x2000, 0x2100, 0x2200, 0x2300, 0x2400, 0x2500, 0x2680, 0x2780, - 0x2880, 0x2980, 0x2b00, 0x2c00, 0x2d00, 0x2e80, 0x2f80, 0x3100, - 0x3200, 0x3380, 0x3480, 0x3600, 0x3700, 0x3880, 0x3a00, 0x3b00, - 0x3c80, 0x3e00, 0x3f80, 0x4080, 0x4200, 0x4380, 0x4500, 0x4680, - 0x4800, 0x4980, 0x4b00, 0x4c80, 0x4e00, 0x4f80, 0x5180, 0x5300, - 0x5480, 0x5600, 0x5800, 0x5980, 0x5b00, 0x5d00, 0x5e80, 0x6080, - 0x6200, 0x6400, 0x6580, 0x6780, 0x6900, 0x6b00, 0x6d00, 0x6e80, - 0x7080, 0x7280, 0x7480, 0x7600, 0x7800, 0x7a00, 0x7c00, 0x7e00 -}; - -static const OPJ_INT16 lut_nmsedec_ref[1U << T1_NMSEDEC_BITS] = { - 0x1800, 0x1780, 0x1700, 0x1680, 0x1600, 0x1580, 0x1500, 0x1480, - 0x1400, 0x1380, 0x1300, 0x1280, 0x1200, 0x1180, 0x1100, 0x1080, - 0x1000, 0x0f80, 0x0f00, 0x0e80, 0x0e00, 0x0d80, 0x0d00, 0x0c80, - 0x0c00, 0x0b80, 0x0b00, 0x0a80, 0x0a00, 0x0980, 0x0900, 0x0880, - 0x0800, 0x0780, 0x0700, 0x0680, 0x0600, 0x0580, 0x0500, 0x0480, - 0x0400, 0x0380, 0x0300, 0x0280, 0x0200, 0x0180, 0x0100, 0x0080, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0080, 
0x0100, 0x0180, 0x0200, 0x0280, 0x0300, 0x0380, - 0x0400, 0x0480, 0x0500, 0x0580, 0x0600, 0x0680, 0x0700, 0x0780, - 0x0800, 0x0880, 0x0900, 0x0980, 0x0a00, 0x0a80, 0x0b00, 0x0b80, - 0x0c00, 0x0c80, 0x0d00, 0x0d80, 0x0e00, 0x0e80, 0x0f00, 0x0f80, - 0x1000, 0x1080, 0x1100, 0x1180, 0x1200, 0x1280, 0x1300, 0x1380, - 0x1400, 0x1480, 0x1500, 0x1580, 0x1600, 0x1680, 0x1700, 0x1780 -}; - -static const OPJ_INT16 lut_nmsedec_ref0[1U << T1_NMSEDEC_BITS] = { - 0x2000, 0x1f00, 0x1e00, 0x1d00, 0x1c00, 0x1b00, 0x1a80, 0x1980, - 0x1880, 0x1780, 0x1700, 0x1600, 0x1500, 0x1480, 0x1380, 0x1300, - 0x1200, 0x1180, 0x1080, 0x1000, 0x0f00, 0x0e80, 0x0e00, 0x0d00, - 0x0c80, 0x0c00, 0x0b80, 0x0a80, 0x0a00, 0x0980, 0x0900, 0x0880, - 0x0800, 0x0780, 0x0700, 0x0680, 0x0600, 0x0580, 0x0580, 0x0500, - 0x0480, 0x0400, 0x0400, 0x0380, 0x0300, 0x0300, 0x0280, 0x0280, - 0x0200, 0x0200, 0x0180, 0x0180, 0x0100, 0x0100, 0x0100, 0x0080, - 0x0080, 0x0080, 0x0080, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0080, 0x0080, - 0x0080, 0x0080, 0x0100, 0x0100, 0x0100, 0x0180, 0x0180, 0x0200, - 0x0200, 0x0280, 0x0280, 0x0300, 0x0300, 0x0380, 0x0400, 0x0400, - 0x0480, 0x0500, 0x0580, 0x0580, 0x0600, 0x0680, 0x0700, 0x0780, - 0x0800, 0x0880, 0x0900, 0x0980, 0x0a00, 0x0a80, 0x0b80, 0x0c00, - 0x0c80, 0x0d00, 0x0e00, 0x0e80, 0x0f00, 0x1000, 0x1080, 0x1180, - 0x1200, 0x1300, 0x1380, 0x1480, 0x1500, 0x1600, 0x1700, 0x1780, - 0x1880, 0x1980, 0x1a80, 0x1b00, 0x1c00, 0x1d00, 0x1e00, 0x1f00 -}; - diff --git a/src/3rd/LibOpenJpeg/t2.c b/src/3rd/LibOpenJpeg/t2.c deleted file mode 100644 index 9825118c..00000000 --- a/src/3rd/LibOpenJpeg/t2.c +++ /dev/null @@ -1,1571 +0,0 @@ -/* - * The copyright in this software is being made available under the 2-clauses - * BSD License, included below. This software may be subject to other third - * party and contributor rights, including patent rights, and no such rights - * are granted under this license. - * - * Copyright (c) 2002-2014, Universite catholique de Louvain (UCL), Belgium - * Copyright (c) 2002-2014, Professor Benoit Macq - * Copyright (c) 2001-2003, David Janssens - * Copyright (c) 2002-2003, Yannick Verschueren - * Copyright (c) 2003-2007, Francois-Olivier Devaux - * Copyright (c) 2003-2014, Antonin Descampe - * Copyright (c) 2005, Herve Drolon, FreeImage Team - * Copyright (c) 2008, 2011-2012, Centre National d'Etudes Spatiales (CNES), FR - * Copyright (c) 2012, CS Systemes d'Information, France - * Copyright (c) 2017, IntoPIX SA - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS' - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#include "opj_includes.h" -#include "opj_common.h" - - -/** @defgroup T2 T2 - Implementation of a tier-2 coding */ -/*@{*/ - -/** @name Local static functions */ -/*@{*/ - -static void opj_t2_putcommacode(opj_bio_t *bio, OPJ_INT32 n); - -static OPJ_UINT32 opj_t2_getcommacode(opj_bio_t *bio); -/** -Variable length code for signalling delta Zil (truncation point) -@param bio Bit Input/Output component -@param n delta Zil -*/ -static void opj_t2_putnumpasses(opj_bio_t *bio, OPJ_UINT32 n); -static OPJ_UINT32 opj_t2_getnumpasses(opj_bio_t *bio); - -/** -Encode a packet of a tile to a destination buffer -@param tileno Number of the tile encoded -@param tile Tile for which to write the packets -@param tcp Tile coding parameters -@param pi Packet identity -@param dest Destination buffer -@param p_data_written FIXME DOC -@param len Length of the destination buffer -@param cstr_info Codestream information structure -@param p_t2_mode If == THRESH_CALC In Threshold calculation ,If == FINAL_PASS Final pass -@param p_manager the user event manager -@return -*/ -static OPJ_BOOL opj_t2_encode_packet(OPJ_UINT32 tileno, - opj_tcd_tile_t *tile, - opj_tcp_t *tcp, - opj_pi_iterator_t *pi, - OPJ_BYTE *dest, - OPJ_UINT32 * p_data_written, - OPJ_UINT32 len, - opj_codestream_info_t *cstr_info, - J2K_T2_MODE p_t2_mode, - opj_event_mgr_t *p_manager); - -/** -Decode a packet of a tile from a source buffer -@param t2 T2 handle -@param tile Tile for which to write the packets -@param tcp Tile coding parameters -@param pi Packet identity -@param src Source buffer -@param data_read FIXME DOC -@param max_length FIXME DOC -@param pack_info Packet information -@param p_manager the user event manager - -@return FIXME DOC -*/ -static OPJ_BOOL opj_t2_decode_packet(opj_t2_t* t2, - opj_tcd_tile_t *tile, - opj_tcp_t *tcp, - opj_pi_iterator_t *pi, - OPJ_BYTE *src, - OPJ_UINT32 * data_read, - OPJ_UINT32 max_length, - opj_packet_info_t *pack_info, - opj_event_mgr_t *p_manager); - -static OPJ_BOOL opj_t2_skip_packet(opj_t2_t* p_t2, - opj_tcd_tile_t *p_tile, - opj_tcp_t *p_tcp, - opj_pi_iterator_t *p_pi, - OPJ_BYTE *p_src, - OPJ_UINT32 * p_data_read, - OPJ_UINT32 p_max_length, - opj_packet_info_t *p_pack_info, - opj_event_mgr_t *p_manager); - -static OPJ_BOOL opj_t2_read_packet_header(opj_t2_t* p_t2, - opj_tcd_tile_t *p_tile, - opj_tcp_t *p_tcp, - opj_pi_iterator_t *p_pi, - OPJ_BOOL * p_is_data_present, - OPJ_BYTE *p_src_data, - OPJ_UINT32 * p_data_read, - OPJ_UINT32 p_max_length, - opj_packet_info_t *p_pack_info, - opj_event_mgr_t *p_manager); - -static OPJ_BOOL opj_t2_read_packet_data(opj_t2_t* p_t2, - opj_tcd_tile_t *p_tile, - opj_pi_iterator_t *p_pi, - OPJ_BYTE *p_src_data, - OPJ_UINT32 * p_data_read, - OPJ_UINT32 p_max_length, - opj_packet_info_t *pack_info, - opj_event_mgr_t *p_manager); - -static OPJ_BOOL opj_t2_skip_packet_data(opj_t2_t* p_t2, - opj_tcd_tile_t *p_tile, - opj_pi_iterator_t *p_pi, - OPJ_UINT32 * p_data_read, - OPJ_UINT32 p_max_length, - opj_packet_info_t *pack_info, - 
opj_event_mgr_t *p_manager); - -/** -@param cblk -@param index -@param cblksty -@param first -*/ -static OPJ_BOOL opj_t2_init_seg(opj_tcd_cblk_dec_t* cblk, - OPJ_UINT32 index, - OPJ_UINT32 cblksty, - OPJ_UINT32 first); - -/*@}*/ - -/*@}*/ - -/* ----------------------------------------------------------------------- */ - -/* #define RESTART 0x04 */ -static void opj_t2_putcommacode(opj_bio_t *bio, OPJ_INT32 n) -{ - while (--n >= 0) { - opj_bio_write(bio, 1, 1); - } - opj_bio_write(bio, 0, 1); -} - -static OPJ_UINT32 opj_t2_getcommacode(opj_bio_t *bio) -{ - OPJ_UINT32 n = 0; - while (opj_bio_read(bio, 1)) { - ++n; - } - return n; -} - -static void opj_t2_putnumpasses(opj_bio_t *bio, OPJ_UINT32 n) -{ - if (n == 1) { - opj_bio_write(bio, 0, 1); - } else if (n == 2) { - opj_bio_write(bio, 2, 2); - } else if (n <= 5) { - opj_bio_write(bio, 0xc | (n - 3), 4); - } else if (n <= 36) { - opj_bio_write(bio, 0x1e0 | (n - 6), 9); - } else if (n <= 164) { - opj_bio_write(bio, 0xff80 | (n - 37), 16); - } -} - -static OPJ_UINT32 opj_t2_getnumpasses(opj_bio_t *bio) -{ - OPJ_UINT32 n; - if (!opj_bio_read(bio, 1)) { - return 1; - } - if (!opj_bio_read(bio, 1)) { - return 2; - } - if ((n = opj_bio_read(bio, 2)) != 3) { - return (3 + n); - } - if ((n = opj_bio_read(bio, 5)) != 31) { - return (6 + n); - } - return (37 + opj_bio_read(bio, 7)); -} - -/* ----------------------------------------------------------------------- */ - -OPJ_BOOL opj_t2_encode_packets(opj_t2_t* p_t2, - OPJ_UINT32 p_tile_no, - opj_tcd_tile_t *p_tile, - OPJ_UINT32 p_maxlayers, - OPJ_BYTE *p_dest, - OPJ_UINT32 * p_data_written, - OPJ_UINT32 p_max_len, - opj_codestream_info_t *cstr_info, - OPJ_UINT32 p_tp_num, - OPJ_INT32 p_tp_pos, - OPJ_UINT32 p_pino, - J2K_T2_MODE p_t2_mode, - opj_event_mgr_t *p_manager) -{ - OPJ_BYTE *l_current_data = p_dest; - OPJ_UINT32 l_nb_bytes = 0; - OPJ_UINT32 compno; - OPJ_UINT32 poc; - opj_pi_iterator_t *l_pi = 00; - opj_pi_iterator_t *l_current_pi = 00; - opj_image_t *l_image = p_t2->image; - opj_cp_t *l_cp = p_t2->cp; - opj_tcp_t *l_tcp = &l_cp->tcps[p_tile_no]; - OPJ_UINT32 pocno = (l_cp->rsiz == OPJ_PROFILE_CINEMA_4K) ? 2 : 1; - OPJ_UINT32 l_max_comp = l_cp->m_specific_param.m_enc.m_max_comp_size > 0 ? - l_image->numcomps : 1; - OPJ_UINT32 l_nb_pocs = l_tcp->numpocs + 1; - - l_pi = opj_pi_initialise_encode(l_image, l_cp, p_tile_no, p_t2_mode); - if (!l_pi) { - return OPJ_FALSE; - } - - * p_data_written = 0; - - if (p_t2_mode == THRESH_CALC) { /* Calculating threshold */ - l_current_pi = l_pi; - - for (compno = 0; compno < l_max_comp; ++compno) { - OPJ_UINT32 l_comp_len = 0; - l_current_pi = l_pi; - - for (poc = 0; poc < pocno ; ++poc) { - OPJ_UINT32 l_tp_num = compno; - - /* TODO MSD : check why this function cannot fail (cf. v1) */ - opj_pi_create_encode(l_pi, l_cp, p_tile_no, poc, l_tp_num, p_tp_pos, p_t2_mode); - - if (l_current_pi->poc.prg == OPJ_PROG_UNKNOWN) { - /* TODO ADE : add an error */ - opj_pi_destroy(l_pi, l_nb_pocs); - return OPJ_FALSE; - } - while (opj_pi_next(l_current_pi)) { - if (l_current_pi->layno < p_maxlayers) { - l_nb_bytes = 0; - - if (! 
opj_t2_encode_packet(p_tile_no, p_tile, l_tcp, l_current_pi, - l_current_data, &l_nb_bytes, - p_max_len, cstr_info, - p_t2_mode, - p_manager)) { - opj_pi_destroy(l_pi, l_nb_pocs); - return OPJ_FALSE; - } - - l_comp_len += l_nb_bytes; - l_current_data += l_nb_bytes; - p_max_len -= l_nb_bytes; - - * p_data_written += l_nb_bytes; - } - } - - if (l_cp->m_specific_param.m_enc.m_max_comp_size) { - if (l_comp_len > l_cp->m_specific_param.m_enc.m_max_comp_size) { - opj_pi_destroy(l_pi, l_nb_pocs); - return OPJ_FALSE; - } - } - - ++l_current_pi; - } - } - } else { /* t2_mode == FINAL_PASS */ - opj_pi_create_encode(l_pi, l_cp, p_tile_no, p_pino, p_tp_num, p_tp_pos, - p_t2_mode); - - l_current_pi = &l_pi[p_pino]; - if (l_current_pi->poc.prg == OPJ_PROG_UNKNOWN) { - /* TODO ADE : add an error */ - opj_pi_destroy(l_pi, l_nb_pocs); - return OPJ_FALSE; - } - while (opj_pi_next(l_current_pi)) { - if (l_current_pi->layno < p_maxlayers) { - l_nb_bytes = 0; - - if (! opj_t2_encode_packet(p_tile_no, p_tile, l_tcp, l_current_pi, - l_current_data, &l_nb_bytes, p_max_len, - cstr_info, p_t2_mode, p_manager)) { - opj_pi_destroy(l_pi, l_nb_pocs); - return OPJ_FALSE; - } - - l_current_data += l_nb_bytes; - p_max_len -= l_nb_bytes; - - * p_data_written += l_nb_bytes; - - /* INDEX >> */ - if (cstr_info) { - if (cstr_info->index_write) { - opj_tile_info_t *info_TL = &cstr_info->tile[p_tile_no]; - opj_packet_info_t *info_PK = &info_TL->packet[cstr_info->packno]; - if (!cstr_info->packno) { - info_PK->start_pos = info_TL->end_header + 1; - } else { - info_PK->start_pos = ((l_cp->m_specific_param.m_enc.m_tp_on | l_tcp->POC) && - info_PK->start_pos) ? info_PK->start_pos : info_TL->packet[cstr_info->packno - - 1].end_pos + 1; - } - info_PK->end_pos = info_PK->start_pos + l_nb_bytes - 1; - info_PK->end_ph_pos += info_PK->start_pos - - 1; /* End of packet header which now only represents the distance - to start of packet is incremented by value of start of packet*/ - } - - cstr_info->packno++; - } - /* << INDEX */ - ++p_tile->packno; - } - } - } - - opj_pi_destroy(l_pi, l_nb_pocs); - - return OPJ_TRUE; -} - -/* see issue 80 */ -#if 0 -#define JAS_FPRINTF fprintf -#else -/* issue 290 */ -static void opj_null_jas_fprintf(FILE* file, const char * format, ...) 
-{ - (void)file; - (void)format; -} -#define JAS_FPRINTF opj_null_jas_fprintf -#endif - -OPJ_BOOL opj_t2_decode_packets(opj_tcd_t* tcd, - opj_t2_t *p_t2, - OPJ_UINT32 p_tile_no, - opj_tcd_tile_t *p_tile, - OPJ_BYTE *p_src, - OPJ_UINT32 * p_data_read, - OPJ_UINT32 p_max_len, - opj_codestream_index_t *p_cstr_index, - opj_event_mgr_t *p_manager) -{ - OPJ_BYTE *l_current_data = p_src; - opj_pi_iterator_t *l_pi = 00; - OPJ_UINT32 pino; - opj_image_t *l_image = p_t2->image; - opj_cp_t *l_cp = p_t2->cp; - opj_tcp_t *l_tcp = &(p_t2->cp->tcps[p_tile_no]); - OPJ_UINT32 l_nb_bytes_read; - OPJ_UINT32 l_nb_pocs = l_tcp->numpocs + 1; - opj_pi_iterator_t *l_current_pi = 00; -#ifdef TODO_MSD - OPJ_UINT32 curtp = 0; - OPJ_UINT32 tp_start_packno; -#endif - opj_packet_info_t *l_pack_info = 00; - opj_image_comp_t* l_img_comp = 00; - - OPJ_ARG_NOT_USED(p_cstr_index); - -#ifdef TODO_MSD - if (p_cstr_index) { - l_pack_info = p_cstr_index->tile_index[p_tile_no].packet; - } -#endif - - /* create a packet iterator */ - l_pi = opj_pi_create_decode(l_image, l_cp, p_tile_no); - if (!l_pi) { - return OPJ_FALSE; - } - - - l_current_pi = l_pi; - - for (pino = 0; pino <= l_tcp->numpocs; ++pino) { - - /* if the resolution needed is too low, one dim of the tilec could be equal to zero - * and no packets are used to decode this resolution and - * l_current_pi->resno is always >= p_tile->comps[l_current_pi->compno].minimum_num_resolutions - * and no l_img_comp->resno_decoded are computed - */ - OPJ_BOOL* first_pass_failed = NULL; - - if (l_current_pi->poc.prg == OPJ_PROG_UNKNOWN) { - /* TODO ADE : add an error */ - opj_pi_destroy(l_pi, l_nb_pocs); - return OPJ_FALSE; - } - - first_pass_failed = (OPJ_BOOL*)opj_malloc(l_image->numcomps * sizeof(OPJ_BOOL)); - if (!first_pass_failed) { - opj_pi_destroy(l_pi, l_nb_pocs); - return OPJ_FALSE; - } - memset(first_pass_failed, OPJ_TRUE, l_image->numcomps * sizeof(OPJ_BOOL)); - - while (opj_pi_next(l_current_pi)) { - OPJ_BOOL skip_packet = OPJ_FALSE; - JAS_FPRINTF(stderr, - "packet offset=00000166 prg=%d cmptno=%02d rlvlno=%02d prcno=%03d lyrno=%02d\n\n", - l_current_pi->poc.prg1, l_current_pi->compno, l_current_pi->resno, - l_current_pi->precno, l_current_pi->layno); - - /* If the packet layer is greater or equal than the maximum */ - /* number of layers, skip the packet */ - if (l_current_pi->layno >= l_tcp->num_layers_to_decode) { - skip_packet = OPJ_TRUE; - } - /* If the packet resolution number is greater than the minimum */ - /* number of resolution allowed, skip the packet */ - else if (l_current_pi->resno >= - p_tile->comps[l_current_pi->compno].minimum_num_resolutions) { - skip_packet = OPJ_TRUE; - } else { - /* If no precincts of any band intersects the area of interest, */ - /* skip the packet */ - OPJ_UINT32 bandno; - opj_tcd_tilecomp_t *tilec = &p_tile->comps[l_current_pi->compno]; - opj_tcd_resolution_t *res = &tilec->resolutions[l_current_pi->resno]; - - skip_packet = OPJ_TRUE; - for (bandno = 0; bandno < res->numbands; ++bandno) { - opj_tcd_band_t* band = &res->bands[bandno]; - opj_tcd_precinct_t* prec = &band->precincts[l_current_pi->precno]; - - if (opj_tcd_is_subband_area_of_interest(tcd, - l_current_pi->compno, - l_current_pi->resno, - band->bandno, - (OPJ_UINT32)prec->x0, - (OPJ_UINT32)prec->y0, - (OPJ_UINT32)prec->x1, - (OPJ_UINT32)prec->y1)) { - skip_packet = OPJ_FALSE; - break; - } - } - /* - printf("packet cmptno=%02d rlvlno=%02d prcno=%03d lyrno=%02d -> %s\n", - l_current_pi->compno, l_current_pi->resno, - l_current_pi->precno, l_current_pi->layno, skip_packet 
? "skipped" : "kept"); - */ - } - - if (!skip_packet) { - l_nb_bytes_read = 0; - - first_pass_failed[l_current_pi->compno] = OPJ_FALSE; - - if (! opj_t2_decode_packet(p_t2, p_tile, l_tcp, l_current_pi, l_current_data, - &l_nb_bytes_read, p_max_len, l_pack_info, p_manager)) { - opj_pi_destroy(l_pi, l_nb_pocs); - opj_free(first_pass_failed); - return OPJ_FALSE; - } - - l_img_comp = &(l_image->comps[l_current_pi->compno]); - l_img_comp->resno_decoded = opj_uint_max(l_current_pi->resno, - l_img_comp->resno_decoded); - } else { - l_nb_bytes_read = 0; - if (! opj_t2_skip_packet(p_t2, p_tile, l_tcp, l_current_pi, l_current_data, - &l_nb_bytes_read, p_max_len, l_pack_info, p_manager)) { - opj_pi_destroy(l_pi, l_nb_pocs); - opj_free(first_pass_failed); - return OPJ_FALSE; - } - } - - if (first_pass_failed[l_current_pi->compno]) { - l_img_comp = &(l_image->comps[l_current_pi->compno]); - if (l_img_comp->resno_decoded == 0) { - l_img_comp->resno_decoded = - p_tile->comps[l_current_pi->compno].minimum_num_resolutions - 1; - } - } - - l_current_data += l_nb_bytes_read; - p_max_len -= l_nb_bytes_read; - - /* INDEX >> */ -#ifdef TODO_MSD - if (p_cstr_info) { - opj_tile_info_v2_t *info_TL = &p_cstr_info->tile[p_tile_no]; - opj_packet_info_t *info_PK = &info_TL->packet[p_cstr_info->packno]; - tp_start_packno = 0; - if (!p_cstr_info->packno) { - info_PK->start_pos = info_TL->end_header + 1; - } else if (info_TL->packet[p_cstr_info->packno - 1].end_pos >= - (OPJ_INT32) - p_cstr_info->tile[p_tile_no].tp[curtp].tp_end_pos) { /* New tile part */ - info_TL->tp[curtp].tp_numpacks = p_cstr_info->packno - - tp_start_packno; /* Number of packets in previous tile-part */ - tp_start_packno = p_cstr_info->packno; - curtp++; - info_PK->start_pos = p_cstr_info->tile[p_tile_no].tp[curtp].tp_end_header + 1; - } else { - info_PK->start_pos = (l_cp->m_specific_param.m_enc.m_tp_on && - info_PK->start_pos) ? info_PK->start_pos : info_TL->packet[p_cstr_info->packno - - 1].end_pos + 1; - } - info_PK->end_pos = info_PK->start_pos + l_nb_bytes_read - 1; - info_PK->end_ph_pos += info_PK->start_pos - - 1; /* End of packet header which now only represents the distance */ - ++p_cstr_info->packno; - } -#endif - /* << INDEX */ - } - ++l_current_pi; - - opj_free(first_pass_failed); - } - /* INDEX >> */ -#ifdef TODO_MSD - if - (p_cstr_info) { - p_cstr_info->tile[p_tile_no].tp[curtp].tp_numpacks = p_cstr_info->packno - - tp_start_packno; /* Number of packets in last tile-part */ - } -#endif - /* << INDEX */ - - /* don't forget to release pi */ - opj_pi_destroy(l_pi, l_nb_pocs); - *p_data_read = (OPJ_UINT32)(l_current_data - p_src); - return OPJ_TRUE; -} - -/* ----------------------------------------------------------------------- */ - -/** - * Creates a Tier 2 handle - * - * @param p_image Source or destination image - * @param p_cp Image coding parameters. - * @return a new T2 handle if successful, NULL otherwise. 
-*/ -opj_t2_t* opj_t2_create(opj_image_t *p_image, opj_cp_t *p_cp) -{ - /* create the t2 structure */ - opj_t2_t *l_t2 = (opj_t2_t*)opj_calloc(1, sizeof(opj_t2_t)); - if (!l_t2) { - return NULL; - } - - l_t2->image = p_image; - l_t2->cp = p_cp; - - return l_t2; -} - -void opj_t2_destroy(opj_t2_t *t2) -{ - if (t2) { - opj_free(t2); - } -} - -static OPJ_BOOL opj_t2_decode_packet(opj_t2_t* p_t2, - opj_tcd_tile_t *p_tile, - opj_tcp_t *p_tcp, - opj_pi_iterator_t *p_pi, - OPJ_BYTE *p_src, - OPJ_UINT32 * p_data_read, - OPJ_UINT32 p_max_length, - opj_packet_info_t *p_pack_info, - opj_event_mgr_t *p_manager) -{ - OPJ_BOOL l_read_data; - OPJ_UINT32 l_nb_bytes_read = 0; - OPJ_UINT32 l_nb_total_bytes_read = 0; - - *p_data_read = 0; - - if (! opj_t2_read_packet_header(p_t2, p_tile, p_tcp, p_pi, &l_read_data, p_src, - &l_nb_bytes_read, p_max_length, p_pack_info, p_manager)) { - return OPJ_FALSE; - } - - p_src += l_nb_bytes_read; - l_nb_total_bytes_read += l_nb_bytes_read; - p_max_length -= l_nb_bytes_read; - - /* we should read data for the packet */ - if (l_read_data) { - l_nb_bytes_read = 0; - - if (! opj_t2_read_packet_data(p_t2, p_tile, p_pi, p_src, &l_nb_bytes_read, - p_max_length, p_pack_info, p_manager)) { - return OPJ_FALSE; - } - - l_nb_total_bytes_read += l_nb_bytes_read; - } - - *p_data_read = l_nb_total_bytes_read; - - return OPJ_TRUE; -} - -static OPJ_BOOL opj_t2_encode_packet(OPJ_UINT32 tileno, - opj_tcd_tile_t * tile, - opj_tcp_t * tcp, - opj_pi_iterator_t *pi, - OPJ_BYTE *dest, - OPJ_UINT32 * p_data_written, - OPJ_UINT32 length, - opj_codestream_info_t *cstr_info, - J2K_T2_MODE p_t2_mode, - opj_event_mgr_t *p_manager) -{ - OPJ_UINT32 bandno, cblkno; - OPJ_BYTE* c = dest; - OPJ_UINT32 l_nb_bytes; - OPJ_UINT32 compno = pi->compno; /* component value */ - OPJ_UINT32 resno = pi->resno; /* resolution level value */ - OPJ_UINT32 precno = pi->precno; /* precinct value */ - OPJ_UINT32 layno = pi->layno; /* quality layer value */ - OPJ_UINT32 l_nb_blocks; - opj_tcd_band_t *band = 00; - opj_tcd_cblk_enc_t* cblk = 00; - opj_tcd_pass_t *pass = 00; - - opj_tcd_tilecomp_t *tilec = &tile->comps[compno]; - opj_tcd_resolution_t *res = &tilec->resolutions[resno]; - - opj_bio_t *bio = 00; /* BIO component */ -#ifdef ENABLE_EMPTY_PACKET_OPTIMIZATION - OPJ_BOOL packet_empty = OPJ_TRUE; -#else - OPJ_BOOL packet_empty = OPJ_FALSE; -#endif - - /* */ - if (tcp->csty & J2K_CP_CSTY_SOP) { - if (length < 6) { - if (p_t2_mode == FINAL_PASS) { - opj_event_msg(p_manager, EVT_ERROR, - "opj_t2_encode_packet(): only %u bytes remaining in " - "output buffer. 
%u needed.\n", - length, 6); - } - return OPJ_FALSE; - } - c[0] = 255; - c[1] = 145; - c[2] = 0; - c[3] = 4; -#if 0 - c[4] = (tile->packno % 65536) / 256; - c[5] = (tile->packno % 65536) % 256; -#else - c[4] = (tile->packno >> 8) & 0xff; /* packno is uint32_t */ - c[5] = tile->packno & 0xff; -#endif - c += 6; - length -= 6; - } - /* */ - - if (!layno) { - band = res->bands; - - for (bandno = 0; bandno < res->numbands; ++bandno, ++band) { - opj_tcd_precinct_t *prc; - - /* Skip empty bands */ - if (opj_tcd_is_band_empty(band)) { - continue; - } - - prc = &band->precincts[precno]; - opj_tgt_reset(prc->incltree); - opj_tgt_reset(prc->imsbtree); - - l_nb_blocks = prc->cw * prc->ch; - for (cblkno = 0; cblkno < l_nb_blocks; ++cblkno) { - cblk = &prc->cblks.enc[cblkno]; - - cblk->numpasses = 0; - opj_tgt_setvalue(prc->imsbtree, cblkno, band->numbps - (OPJ_INT32)cblk->numbps); - } - } - } - - bio = opj_bio_create(); - if (!bio) { - /* FIXME event manager error callback */ - return OPJ_FALSE; - } - opj_bio_init_enc(bio, c, length); - -#ifdef ENABLE_EMPTY_PACKET_OPTIMIZATION - /* WARNING: this code branch is disabled, since it has been reported that */ - /* such packets cause decoding issues with cinema J2K hardware */ - /* decoders: https://groups.google.com/forum/#!topic/openjpeg/M7M_fLX_Bco */ - - /* Check if the packet is empty */ - /* Note: we could also skip that step and always write a packet header */ - band = res->bands; - for (bandno = 0; bandno < res->numbands; ++bandno, ++band) { - opj_tcd_precinct_t *prc; - /* Skip empty bands */ - if (opj_tcd_is_band_empty(band)) { - continue; - } - - prc = &band->precincts[precno]; - l_nb_blocks = prc->cw * prc->ch; - cblk = prc->cblks.enc; - for (cblkno = 0; cblkno < l_nb_blocks; cblkno++, ++cblk) { - opj_tcd_layer_t *layer = &cblk->layers[layno]; - - /* if cblk not included, go to the next cblk */ - if (!layer->numpasses) { - continue; - } - packet_empty = OPJ_FALSE; - break; - } - if (!packet_empty) { - break; - } - } -#endif - opj_bio_write(bio, packet_empty ? 
0 : 1, 1); /* Empty header bit */ - - /* Writing Packet header */ - band = res->bands; - for (bandno = 0; !packet_empty && - bandno < res->numbands; ++bandno, ++band) { - opj_tcd_precinct_t *prc; - - /* Skip empty bands */ - if (opj_tcd_is_band_empty(band)) { - continue; - } - - prc = &band->precincts[precno]; - l_nb_blocks = prc->cw * prc->ch; - cblk = prc->cblks.enc; - - for (cblkno = 0; cblkno < l_nb_blocks; ++cblkno) { - opj_tcd_layer_t *layer = &cblk->layers[layno]; - - if (!cblk->numpasses && layer->numpasses) { - opj_tgt_setvalue(prc->incltree, cblkno, (OPJ_INT32)layno); - } - - ++cblk; - } - - cblk = prc->cblks.enc; - for (cblkno = 0; cblkno < l_nb_blocks; cblkno++) { - opj_tcd_layer_t *layer = &cblk->layers[layno]; - OPJ_UINT32 increment = 0; - OPJ_UINT32 nump = 0; - OPJ_UINT32 len = 0, passno; - OPJ_UINT32 l_nb_passes; - - /* cblk inclusion bits */ - if (!cblk->numpasses) { - opj_tgt_encode(bio, prc->incltree, cblkno, (OPJ_INT32)(layno + 1)); - } else { - opj_bio_write(bio, layer->numpasses != 0, 1); - } - - /* if cblk not included, go to the next cblk */ - if (!layer->numpasses) { - ++cblk; - continue; - } - - /* if first instance of cblk --> zero bit-planes information */ - if (!cblk->numpasses) { - cblk->numlenbits = 3; - opj_tgt_encode(bio, prc->imsbtree, cblkno, 999); - } - - /* number of coding passes included */ - opj_t2_putnumpasses(bio, layer->numpasses); - l_nb_passes = cblk->numpasses + layer->numpasses; - pass = cblk->passes + cblk->numpasses; - - /* computation of the increase of the length indicator and insertion in the header */ - for (passno = cblk->numpasses; passno < l_nb_passes; ++passno) { - ++nump; - len += pass->len; - - if (pass->term || passno == (cblk->numpasses + layer->numpasses) - 1) { - increment = (OPJ_UINT32)opj_int_max((OPJ_INT32)increment, - opj_int_floorlog2((OPJ_INT32)len) + 1 - - ((OPJ_INT32)cblk->numlenbits + opj_int_floorlog2((OPJ_INT32)nump))); - len = 0; - nump = 0; - } - - ++pass; - } - opj_t2_putcommacode(bio, (OPJ_INT32)increment); - - /* computation of the new Length indicator */ - cblk->numlenbits += increment; - - pass = cblk->passes + cblk->numpasses; - /* insertion of the codeword segment length */ - for (passno = cblk->numpasses; passno < l_nb_passes; ++passno) { - nump++; - len += pass->len; - - if (pass->term || passno == (cblk->numpasses + layer->numpasses) - 1) { - opj_bio_write(bio, (OPJ_UINT32)len, - cblk->numlenbits + (OPJ_UINT32)opj_int_floorlog2((OPJ_INT32)nump)); - len = 0; - nump = 0; - } - ++pass; - } - - ++cblk; - } - } - - if (!opj_bio_flush(bio)) { - opj_bio_destroy(bio); - return OPJ_FALSE; /* modified to eliminate longjmp !! */ - } - - l_nb_bytes = (OPJ_UINT32)opj_bio_numbytes(bio); - c += l_nb_bytes; - length -= l_nb_bytes; - - opj_bio_destroy(bio); - - /* */ - if (tcp->csty & J2K_CP_CSTY_EPH) { - if (length < 2) { - if (p_t2_mode == FINAL_PASS) { - opj_event_msg(p_manager, EVT_ERROR, - "opj_t2_encode_packet(): only %u bytes remaining in " - "output buffer. %u needed.\n", - length, 2); - } - return OPJ_FALSE; - } - c[0] = 255; - c[1] = 146; - c += 2; - length -= 2; - } - /* */ - - /* << INDEX */ - /* End of packet header position. 
Currently only represents the distance to start of packet - Will be updated later by incrementing with packet start value*/ - if (cstr_info && cstr_info->index_write) { - opj_packet_info_t *info_PK = &cstr_info->tile[tileno].packet[cstr_info->packno]; - info_PK->end_ph_pos = (OPJ_INT32)(c - dest); - } - /* INDEX >> */ - - /* Writing the packet body */ - band = res->bands; - for (bandno = 0; !packet_empty && bandno < res->numbands; bandno++, ++band) { - opj_tcd_precinct_t *prc; - - /* Skip empty bands */ - if (opj_tcd_is_band_empty(band)) { - continue; - } - - prc = &band->precincts[precno]; - l_nb_blocks = prc->cw * prc->ch; - cblk = prc->cblks.enc; - - for (cblkno = 0; cblkno < l_nb_blocks; ++cblkno) { - opj_tcd_layer_t *layer = &cblk->layers[layno]; - - if (!layer->numpasses) { - ++cblk; - continue; - } - - if (layer->len > length) { - if (p_t2_mode == FINAL_PASS) { - opj_event_msg(p_manager, EVT_ERROR, - "opj_t2_encode_packet(): only %u bytes remaining in " - "output buffer. %u needed.\n", - length, layer->len); - } - return OPJ_FALSE; - } - - memcpy(c, layer->data, layer->len); - cblk->numpasses += layer->numpasses; - c += layer->len; - length -= layer->len; - - /* << INDEX */ - if (cstr_info && cstr_info->index_write) { - opj_packet_info_t *info_PK = &cstr_info->tile[tileno].packet[cstr_info->packno]; - info_PK->disto += layer->disto; - if (cstr_info->D_max < info_PK->disto) { - cstr_info->D_max = info_PK->disto; - } - } - - ++cblk; - /* INDEX >> */ - } - } - - assert(c >= dest); - * p_data_written += (OPJ_UINT32)(c - dest); - - return OPJ_TRUE; -} - -static OPJ_BOOL opj_t2_skip_packet(opj_t2_t* p_t2, - opj_tcd_tile_t *p_tile, - opj_tcp_t *p_tcp, - opj_pi_iterator_t *p_pi, - OPJ_BYTE *p_src, - OPJ_UINT32 * p_data_read, - OPJ_UINT32 p_max_length, - opj_packet_info_t *p_pack_info, - opj_event_mgr_t *p_manager) -{ - OPJ_BOOL l_read_data; - OPJ_UINT32 l_nb_bytes_read = 0; - OPJ_UINT32 l_nb_total_bytes_read = 0; - - *p_data_read = 0; - - if (! opj_t2_read_packet_header(p_t2, p_tile, p_tcp, p_pi, &l_read_data, p_src, - &l_nb_bytes_read, p_max_length, p_pack_info, p_manager)) { - return OPJ_FALSE; - } - - p_src += l_nb_bytes_read; - l_nb_total_bytes_read += l_nb_bytes_read; - p_max_length -= l_nb_bytes_read; - - /* we should read data for the packet */ - if (l_read_data) { - l_nb_bytes_read = 0; - - if (! 
opj_t2_skip_packet_data(p_t2, p_tile, p_pi, &l_nb_bytes_read, - p_max_length, p_pack_info, p_manager)) { - return OPJ_FALSE; - } - - l_nb_total_bytes_read += l_nb_bytes_read; - } - *p_data_read = l_nb_total_bytes_read; - - return OPJ_TRUE; -} - - -static OPJ_BOOL opj_t2_read_packet_header(opj_t2_t* p_t2, - opj_tcd_tile_t *p_tile, - opj_tcp_t *p_tcp, - opj_pi_iterator_t *p_pi, - OPJ_BOOL * p_is_data_present, - OPJ_BYTE *p_src_data, - OPJ_UINT32 * p_data_read, - OPJ_UINT32 p_max_length, - opj_packet_info_t *p_pack_info, - opj_event_mgr_t *p_manager) - -{ - /* loop */ - OPJ_UINT32 bandno, cblkno; - OPJ_UINT32 l_nb_code_blocks; - OPJ_UINT32 l_remaining_length; - OPJ_UINT32 l_header_length; - OPJ_UINT32 * l_modified_length_ptr = 00; - OPJ_BYTE *l_current_data = p_src_data; - opj_cp_t *l_cp = p_t2->cp; - opj_bio_t *l_bio = 00; /* BIO component */ - opj_tcd_band_t *l_band = 00; - opj_tcd_cblk_dec_t* l_cblk = 00; - opj_tcd_resolution_t* l_res = - &p_tile->comps[p_pi->compno].resolutions[p_pi->resno]; - - OPJ_BYTE *l_header_data = 00; - OPJ_BYTE **l_header_data_start = 00; - - OPJ_UINT32 l_present; - - if (p_pi->layno == 0) { - l_band = l_res->bands; - - /* reset tagtrees */ - for (bandno = 0; bandno < l_res->numbands; ++bandno) { - if (!opj_tcd_is_band_empty(l_band)) { - opj_tcd_precinct_t *l_prc = &l_band->precincts[p_pi->precno]; - if (!(p_pi->precno < (l_band->precincts_data_size / sizeof( - opj_tcd_precinct_t)))) { - opj_event_msg(p_manager, EVT_ERROR, "Invalid precinct\n"); - return OPJ_FALSE; - } - - - opj_tgt_reset(l_prc->incltree); - opj_tgt_reset(l_prc->imsbtree); - l_cblk = l_prc->cblks.dec; - - l_nb_code_blocks = l_prc->cw * l_prc->ch; - for (cblkno = 0; cblkno < l_nb_code_blocks; ++cblkno) { - l_cblk->numsegs = 0; - l_cblk->real_num_segs = 0; - ++l_cblk; - } - } - - ++l_band; - } - } - - /* SOP markers */ - - if (p_tcp->csty & J2K_CP_CSTY_SOP) { - if (p_max_length < 6) { - opj_event_msg(p_manager, EVT_WARNING, - "Not enough space for expected SOP marker\n"); - } else if ((*l_current_data) != 0xff || (*(l_current_data + 1) != 0x91)) { - opj_event_msg(p_manager, EVT_WARNING, "Expected SOP marker\n"); - } else { - l_current_data += 6; - } - - /** TODO : check the Nsop value */ - } - - /* - When the marker PPT/PPM is used the packet header are store in PPT/PPM marker - This part deal with this caracteristic - step 1: Read packet header in the saved structure - step 2: Return to codestream for decoding - */ - - l_bio = opj_bio_create(); - if (! 
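/* l_bio is the bit-level reader for the packet header; the branch that
 * follows this NULL check picks where those header bits actually live:
 * the main header (PPM marker), the tile-part header (PPT marker), or
 * inline in the codestream (the normal case). */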
l_bio) { - return OPJ_FALSE; - } - - if (l_cp->ppm == 1) { /* PPM */ - l_header_data_start = &l_cp->ppm_data; - l_header_data = *l_header_data_start; - l_modified_length_ptr = &(l_cp->ppm_len); - - } else if (p_tcp->ppt == 1) { /* PPT */ - l_header_data_start = &(p_tcp->ppt_data); - l_header_data = *l_header_data_start; - l_modified_length_ptr = &(p_tcp->ppt_len); - } else { /* Normal Case */ - l_header_data_start = &(l_current_data); - l_header_data = *l_header_data_start; - l_remaining_length = (OPJ_UINT32)(p_src_data + p_max_length - l_header_data); - l_modified_length_ptr = &(l_remaining_length); - } - - opj_bio_init_dec(l_bio, l_header_data, *l_modified_length_ptr); - - l_present = opj_bio_read(l_bio, 1); - JAS_FPRINTF(stderr, "present=%d \n", l_present); - if (!l_present) { - /* TODO MSD: no test to control the output of this function*/ - opj_bio_inalign(l_bio); - l_header_data += opj_bio_numbytes(l_bio); - opj_bio_destroy(l_bio); - - /* EPH markers */ - if (p_tcp->csty & J2K_CP_CSTY_EPH) { - if ((*l_modified_length_ptr - (OPJ_UINT32)(l_header_data - - *l_header_data_start)) < 2U) { - opj_event_msg(p_manager, EVT_WARNING, - "Not enough space for expected EPH marker\n"); - } else if ((*l_header_data) != 0xff || (*(l_header_data + 1) != 0x92)) { - opj_event_msg(p_manager, EVT_WARNING, "Expected EPH marker\n"); - } else { - l_header_data += 2; - } - } - - l_header_length = (OPJ_UINT32)(l_header_data - *l_header_data_start); - *l_modified_length_ptr -= l_header_length; - *l_header_data_start += l_header_length; - - /* << INDEX */ - /* End of packet header position. Currently only represents the distance to start of packet - Will be updated later by incrementing with packet start value */ - if (p_pack_info) { - p_pack_info->end_ph_pos = (OPJ_INT32)(l_current_data - p_src_data); - } - /* INDEX >> */ - - * p_is_data_present = OPJ_FALSE; - *p_data_read = (OPJ_UINT32)(l_current_data - p_src_data); - return OPJ_TRUE; - } - - l_band = l_res->bands; - for (bandno = 0; bandno < l_res->numbands; ++bandno, ++l_band) { - opj_tcd_precinct_t *l_prc = &(l_band->precincts[p_pi->precno]); - - if (opj_tcd_is_band_empty(l_band)) { - continue; - } - - l_nb_code_blocks = l_prc->cw * l_prc->ch; - l_cblk = l_prc->cblks.dec; - for (cblkno = 0; cblkno < l_nb_code_blocks; cblkno++) { - OPJ_UINT32 l_included, l_increment, l_segno; - OPJ_INT32 n; - - /* if cblk not yet included before --> inclusion tagtree */ - if (!l_cblk->numsegs) { - l_included = opj_tgt_decode(l_bio, l_prc->incltree, cblkno, - (OPJ_INT32)(p_pi->layno + 1)); - /* else one bit */ - } else { - l_included = opj_bio_read(l_bio, 1); - } - - /* if cblk not included */ - if (!l_included) { - l_cblk->numnewpasses = 0; - ++l_cblk; - JAS_FPRINTF(stderr, "included=%d \n", l_included); - continue; - } - - /* if cblk not yet included --> zero-bitplane tagtree */ - if (!l_cblk->numsegs) { - OPJ_UINT32 i = 0; - - while (!opj_tgt_decode(l_bio, l_prc->imsbtree, cblkno, (OPJ_INT32)i)) { - ++i; - } - - l_cblk->numbps = (OPJ_UINT32)l_band->numbps + 1 - i; - l_cblk->numlenbits = 3; - } - - /* number of coding passes */ - l_cblk->numnewpasses = opj_t2_getnumpasses(l_bio); - l_increment = opj_t2_getcommacode(l_bio); - - /* length indicator increment */ - l_cblk->numlenbits += l_increment; - l_segno = 0; - - if (!l_cblk->numsegs) { - if (! 
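/* first codeword segment of this code-block in the tile: allocate and
 * reset segs[0]; the trailing "first" argument (1 here) only influences
 * the maxpasses rule applied for the arithmetic-bypass (LAZY) mode in
 * opj_t2_init_seg(). */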
opj_t2_init_seg(l_cblk, l_segno, p_tcp->tccps[p_pi->compno].cblksty, 1)) { - opj_bio_destroy(l_bio); - return OPJ_FALSE; - } - } else { - l_segno = l_cblk->numsegs - 1; - if (l_cblk->segs[l_segno].numpasses == l_cblk->segs[l_segno].maxpasses) { - ++l_segno; - if (! opj_t2_init_seg(l_cblk, l_segno, p_tcp->tccps[p_pi->compno].cblksty, 0)) { - opj_bio_destroy(l_bio); - return OPJ_FALSE; - } - } - } - n = (OPJ_INT32)l_cblk->numnewpasses; - - do { - OPJ_UINT32 bit_number; - l_cblk->segs[l_segno].numnewpasses = (OPJ_UINT32)opj_int_min((OPJ_INT32)( - l_cblk->segs[l_segno].maxpasses - l_cblk->segs[l_segno].numpasses), n); - bit_number = l_cblk->numlenbits + opj_uint_floorlog2( - l_cblk->segs[l_segno].numnewpasses); - if (bit_number > 32) { - opj_event_msg(p_manager, EVT_ERROR, - "Invalid bit number %d in opj_t2_read_packet_header()\n", - bit_number); - opj_bio_destroy(l_bio); - return OPJ_FALSE; - } - l_cblk->segs[l_segno].newlen = opj_bio_read(l_bio, bit_number); - JAS_FPRINTF(stderr, "included=%d numnewpasses=%d increment=%d len=%d \n", - l_included, l_cblk->segs[l_segno].numnewpasses, l_increment, - l_cblk->segs[l_segno].newlen); - - n -= (OPJ_INT32)l_cblk->segs[l_segno].numnewpasses; - if (n > 0) { - ++l_segno; - - if (! opj_t2_init_seg(l_cblk, l_segno, p_tcp->tccps[p_pi->compno].cblksty, 0)) { - opj_bio_destroy(l_bio); - return OPJ_FALSE; - } - } - } while (n > 0); - - ++l_cblk; - } - } - - if (!opj_bio_inalign(l_bio)) { - opj_bio_destroy(l_bio); - return OPJ_FALSE; - } - - l_header_data += opj_bio_numbytes(l_bio); - opj_bio_destroy(l_bio); - - /* EPH markers */ - if (p_tcp->csty & J2K_CP_CSTY_EPH) { - if ((*l_modified_length_ptr - (OPJ_UINT32)(l_header_data - - *l_header_data_start)) < 2U) { - opj_event_msg(p_manager, EVT_WARNING, - "Not enough space for expected EPH marker\n"); - } else if ((*l_header_data) != 0xff || (*(l_header_data + 1) != 0x92)) { - opj_event_msg(p_manager, EVT_WARNING, "Expected EPH marker\n"); - } else { - l_header_data += 2; - } - } - - l_header_length = (OPJ_UINT32)(l_header_data - *l_header_data_start); - JAS_FPRINTF(stderr, "hdrlen=%d \n", l_header_length); - JAS_FPRINTF(stderr, "packet body\n"); - *l_modified_length_ptr -= l_header_length; - *l_header_data_start += l_header_length; - - /* << INDEX */ - /* End of packet header position. 
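(The l_header_length bytes just consumed were charged to the PPM/PPT or in-stream buffer through *l_modified_length_ptr above.)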
Currently only represents the distance to start of packet - Will be updated later by incrementing with packet start value */ - if (p_pack_info) { - p_pack_info->end_ph_pos = (OPJ_INT32)(l_current_data - p_src_data); - } - /* INDEX >> */ - - *p_is_data_present = OPJ_TRUE; - *p_data_read = (OPJ_UINT32)(l_current_data - p_src_data); - - return OPJ_TRUE; -} - -static OPJ_BOOL opj_t2_read_packet_data(opj_t2_t* p_t2, - opj_tcd_tile_t *p_tile, - opj_pi_iterator_t *p_pi, - OPJ_BYTE *p_src_data, - OPJ_UINT32 * p_data_read, - OPJ_UINT32 p_max_length, - opj_packet_info_t *pack_info, - opj_event_mgr_t* p_manager) -{ - OPJ_UINT32 bandno, cblkno; - OPJ_UINT32 l_nb_code_blocks; - OPJ_BYTE *l_current_data = p_src_data; - opj_tcd_band_t *l_band = 00; - opj_tcd_cblk_dec_t* l_cblk = 00; - opj_tcd_resolution_t* l_res = - &p_tile->comps[p_pi->compno].resolutions[p_pi->resno]; - - OPJ_ARG_NOT_USED(p_t2); - OPJ_ARG_NOT_USED(pack_info); - - l_band = l_res->bands; - for (bandno = 0; bandno < l_res->numbands; ++bandno) { - opj_tcd_precinct_t *l_prc = &l_band->precincts[p_pi->precno]; - - if ((l_band->x1 - l_band->x0 == 0) || (l_band->y1 - l_band->y0 == 0)) { - ++l_band; - continue; - } - - l_nb_code_blocks = l_prc->cw * l_prc->ch; - l_cblk = l_prc->cblks.dec; - - for (cblkno = 0; cblkno < l_nb_code_blocks; ++cblkno) { - opj_tcd_seg_t *l_seg = 00; - - if (!l_cblk->numnewpasses) { - /* nothing to do */ - ++l_cblk; - continue; - } - - if (!l_cblk->numsegs) { - l_seg = l_cblk->segs; - ++l_cblk->numsegs; - } else { - l_seg = &l_cblk->segs[l_cblk->numsegs - 1]; - - if (l_seg->numpasses == l_seg->maxpasses) { - ++l_seg; - ++l_cblk->numsegs; - } - } - - do { - /* Check possible overflow (on l_current_data only, assumes input args already checked) then size */ - if ((((OPJ_SIZE_T)l_current_data + (OPJ_SIZE_T)l_seg->newlen) < - (OPJ_SIZE_T)l_current_data) || - (l_current_data + l_seg->newlen > p_src_data + p_max_length)) { - opj_event_msg(p_manager, EVT_ERROR, - "read: segment too long (%d) with max (%d) for codeblock %d (p=%d, b=%d, r=%d, c=%d)\n", - l_seg->newlen, p_max_length, cblkno, p_pi->precno, bandno, p_pi->resno, - p_pi->compno); - return OPJ_FALSE; - } - -#ifdef USE_JPWL - /* we need here a j2k handle to verify if making a check to - the validity of cblocks parameters is selected from user (-W) */ - - /* let's check that we are not exceeding */ - if ((l_cblk->len + l_seg->newlen) > 8192) { - opj_event_msg(p_manager, EVT_WARNING, - "JPWL: segment too long (%d) for codeblock %d (p=%d, b=%d, r=%d, c=%d)\n", - l_seg->newlen, cblkno, p_pi->precno, bandno, p_pi->resno, p_pi->compno); - if (!JPWL_ASSUME) { - opj_event_msg(p_manager, EVT_ERROR, "JPWL: giving up\n"); - return OPJ_FALSE; - } - l_seg->newlen = 8192 - l_cblk->len; - opj_event_msg(p_manager, EVT_WARNING, " - truncating segment to %d\n", - l_seg->newlen); - break; - }; - -#endif /* USE_JPWL */ - - if (l_cblk->numchunks == l_cblk->numchunksalloc) { - OPJ_UINT32 l_numchunksalloc = l_cblk->numchunksalloc * 2 + 1; - opj_tcd_seg_data_chunk_t* l_chunks = - (opj_tcd_seg_data_chunk_t*)opj_realloc(l_cblk->chunks, - l_numchunksalloc * sizeof(opj_tcd_seg_data_chunk_t)); - if (l_chunks == NULL) { - opj_event_msg(p_manager, EVT_ERROR, - "cannot allocate opj_tcd_seg_data_chunk_t* array"); - return OPJ_FALSE; - } - l_cblk->chunks = l_chunks; - l_cblk->numchunksalloc = l_numchunksalloc; - } - - l_cblk->chunks[l_cblk->numchunks].data = l_current_data; - l_cblk->chunks[l_cblk->numchunks].len = l_seg->newlen; - l_cblk->numchunks ++; - - l_current_data += l_seg->newlen; - 
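/* Nothing was copied: the chunk recorded above points straight into the
 * caller's source buffer, and l_cblk->chunks accumulates one (data, len)
 * pair per codeword segment for later entropy decoding. The updates
 * below fold the newly signalled passes into the segment totals. */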
l_seg->len += l_seg->newlen; - l_seg->numpasses += l_seg->numnewpasses; - l_cblk->numnewpasses -= l_seg->numnewpasses; - - l_seg->real_num_passes = l_seg->numpasses; - - if (l_cblk->numnewpasses > 0) { - ++l_seg; - ++l_cblk->numsegs; - } - } while (l_cblk->numnewpasses > 0); - - l_cblk->real_num_segs = l_cblk->numsegs; - ++l_cblk; - } /* next code_block */ - - ++l_band; - } - - *(p_data_read) = (OPJ_UINT32)(l_current_data - p_src_data); - - - return OPJ_TRUE; -} - -static OPJ_BOOL opj_t2_skip_packet_data(opj_t2_t* p_t2, - opj_tcd_tile_t *p_tile, - opj_pi_iterator_t *p_pi, - OPJ_UINT32 * p_data_read, - OPJ_UINT32 p_max_length, - opj_packet_info_t *pack_info, - opj_event_mgr_t *p_manager) -{ - OPJ_UINT32 bandno, cblkno; - OPJ_UINT32 l_nb_code_blocks; - opj_tcd_band_t *l_band = 00; - opj_tcd_cblk_dec_t* l_cblk = 00; - opj_tcd_resolution_t* l_res = - &p_tile->comps[p_pi->compno].resolutions[p_pi->resno]; - - OPJ_ARG_NOT_USED(p_t2); - OPJ_ARG_NOT_USED(pack_info); - - *p_data_read = 0; - l_band = l_res->bands; - - for (bandno = 0; bandno < l_res->numbands; ++bandno) { - opj_tcd_precinct_t *l_prc = &l_band->precincts[p_pi->precno]; - - if ((l_band->x1 - l_band->x0 == 0) || (l_band->y1 - l_band->y0 == 0)) { - ++l_band; - continue; - } - - l_nb_code_blocks = l_prc->cw * l_prc->ch; - l_cblk = l_prc->cblks.dec; - - for (cblkno = 0; cblkno < l_nb_code_blocks; ++cblkno) { - opj_tcd_seg_t *l_seg = 00; - - if (!l_cblk->numnewpasses) { - /* nothing to do */ - ++l_cblk; - continue; - } - - if (!l_cblk->numsegs) { - l_seg = l_cblk->segs; - ++l_cblk->numsegs; - } else { - l_seg = &l_cblk->segs[l_cblk->numsegs - 1]; - - if (l_seg->numpasses == l_seg->maxpasses) { - ++l_seg; - ++l_cblk->numsegs; - } - } - - do { - /* Check possible overflow then size */ - if (((*p_data_read + l_seg->newlen) < (*p_data_read)) || - ((*p_data_read + l_seg->newlen) > p_max_length)) { - opj_event_msg(p_manager, EVT_ERROR, - "skip: segment too long (%d) with max (%d) for codeblock %d (p=%d, b=%d, r=%d, c=%d)\n", - l_seg->newlen, p_max_length, cblkno, p_pi->precno, bandno, p_pi->resno, - p_pi->compno); - return OPJ_FALSE; - } - -#ifdef USE_JPWL - /* we need here a j2k handle to verify if making a check to - the validity of cblocks parameters is selected from user (-W) */ - - /* let's check that we are not exceeding */ - if ((l_cblk->len + l_seg->newlen) > 8192) { - opj_event_msg(p_manager, EVT_WARNING, - "JPWL: segment too long (%d) for codeblock %d (p=%d, b=%d, r=%d, c=%d)\n", - l_seg->newlen, cblkno, p_pi->precno, bandno, p_pi->resno, p_pi->compno); - if (!JPWL_ASSUME) { - opj_event_msg(p_manager, EVT_ERROR, "JPWL: giving up\n"); - return -999; - } - l_seg->newlen = 8192 - l_cblk->len; - opj_event_msg(p_manager, EVT_WARNING, " - truncating segment to %d\n", - l_seg->newlen); - break; - }; - -#endif /* USE_JPWL */ - JAS_FPRINTF(stderr, "p_data_read (%d) newlen (%d) \n", *p_data_read, - l_seg->newlen); - *(p_data_read) += l_seg->newlen; - - l_seg->numpasses += l_seg->numnewpasses; - l_cblk->numnewpasses -= l_seg->numnewpasses; - if (l_cblk->numnewpasses > 0) { - ++l_seg; - ++l_cblk->numsegs; - } - } while (l_cblk->numnewpasses > 0); - - ++l_cblk; - } - - ++l_band; - } - - return OPJ_TRUE; -} - - -static OPJ_BOOL opj_t2_init_seg(opj_tcd_cblk_dec_t* cblk, - OPJ_UINT32 index, - OPJ_UINT32 cblksty, - OPJ_UINT32 first) -{ - opj_tcd_seg_t* seg = 00; - OPJ_UINT32 l_nb_segs = index + 1; - - if (l_nb_segs > cblk->m_current_max_segs) { - opj_tcd_seg_t* new_segs; - OPJ_UINT32 l_m_current_max_segs = cblk->m_current_max_segs + - 
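/* grow the segment array by a fixed step; the realloc below preserves
 * the existing segments and the memset zeroes only the new tail: */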
OPJ_J2K_DEFAULT_NB_SEGS; - - new_segs = (opj_tcd_seg_t*) opj_realloc(cblk->segs, - l_m_current_max_segs * sizeof(opj_tcd_seg_t)); - if (! new_segs) { - /* opj_event_msg(p_manager, EVT_ERROR, "Not enough memory to initialize segment %d\n", l_nb_segs); */ - return OPJ_FALSE; - } - cblk->segs = new_segs; - memset(new_segs + cblk->m_current_max_segs, - 0, OPJ_J2K_DEFAULT_NB_SEGS * sizeof(opj_tcd_seg_t)); - cblk->m_current_max_segs = l_m_current_max_segs; - } - - seg = &cblk->segs[index]; - opj_tcd_reinit_segment(seg); - - if (cblksty & J2K_CCP_CBLKSTY_TERMALL) { - seg->maxpasses = 1; - } else if (cblksty & J2K_CCP_CBLKSTY_LAZY) { - if (first) { - seg->maxpasses = 10; - } else { - seg->maxpasses = (((seg - 1)->maxpasses == 1) || - ((seg - 1)->maxpasses == 10)) ? 2 : 1; - } - } else { - /* See paragraph "B.10.6 Number of coding passes" of the standard. - * Probably that 109 must be interpreted a (Mb-1)*3 + 1 with Mb=37, - * Mb being the maximum number of bit-planes available for the - * representation of coefficients in the sub-band */ - seg->maxpasses = 109; - } - - return OPJ_TRUE; -} diff --git a/src/3rd/LibOpenJpeg/t2.h b/src/3rd/LibOpenJpeg/t2.h deleted file mode 100644 index 66500b16..00000000 --- a/src/3rd/LibOpenJpeg/t2.h +++ /dev/null @@ -1,140 +0,0 @@ -/* - * The copyright in this software is being made available under the 2-clauses - * BSD License, included below. This software may be subject to other third - * party and contributor rights, including patent rights, and no such rights - * are granted under this license. - * - * Copyright (c) 2002-2014, Universite catholique de Louvain (UCL), Belgium - * Copyright (c) 2002-2014, Professor Benoit Macq - * Copyright (c) 2001-2003, David Janssens - * Copyright (c) 2002-2003, Yannick Verschueren - * Copyright (c) 2003-2007, Francois-Olivier Devaux - * Copyright (c) 2003-2014, Antonin Descampe - * Copyright (c) 2005, Herve Drolon, FreeImage Team - * Copyright (c) 2008, 2011-2012, Centre National d'Etudes Spatiales (CNES), FR - * Copyright (c) 2012, CS Systemes d'Information, France - * Copyright (c) 2017, IntoPIX SA - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS' - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- */ -#ifndef OPJ_T2_H -#define OPJ_T2_H -/** -@file t2.h -@brief Implementation of a tier-2 coding (packetization of code-block data) (T2) - -*/ - -/** @defgroup T2 T2 - Implementation of a tier-2 coding */ -/*@{*/ - -/** -Tier-2 coding -*/ -typedef struct opj_t2 { - - /** Encoding: pointer to the src image. Decoding: pointer to the dst image. */ - opj_image_t *image; - /** pointer to the image coding parameters */ - opj_cp_t *cp; -} opj_t2_t; - -/** @name Exported functions */ -/*@{*/ -/* ----------------------------------------------------------------------- */ - -/** -Encode the packets of a tile to a destination buffer -@param t2 T2 handle -@param tileno number of the tile encoded -@param tile the tile for which to write the packets -@param maxlayers maximum number of layers -@param dest the destination buffer -@param p_data_written FIXME DOC -@param len the length of the destination buffer -@param cstr_info Codestream information structure -@param tpnum Tile part number of the current tile -@param tppos The position of the tile part flag in the progression order -@param pino FIXME DOC -@param t2_mode If == THRESH_CALC In Threshold calculation ,If == FINAL_PASS Final pass -@param p_manager the user event manager -*/ -OPJ_BOOL opj_t2_encode_packets(opj_t2_t* t2, - OPJ_UINT32 tileno, - opj_tcd_tile_t *tile, - OPJ_UINT32 maxlayers, - OPJ_BYTE *dest, - OPJ_UINT32 * p_data_written, - OPJ_UINT32 len, - opj_codestream_info_t *cstr_info, - OPJ_UINT32 tpnum, - OPJ_INT32 tppos, - OPJ_UINT32 pino, - J2K_T2_MODE t2_mode, - opj_event_mgr_t *p_manager); - -/** -Decode the packets of a tile from a source buffer -@param tcd TCD handle -@param t2 T2 handle -@param tileno number that identifies the tile for which to decode the packets -@param tile tile for which to decode the packets -@param src FIXME DOC -@param p_data_read the source buffer -@param len length of the source buffer -@param cstr_info FIXME DOC -@param p_manager the user event manager - -@return FIXME DOC - */ -OPJ_BOOL opj_t2_decode_packets(opj_tcd_t* tcd, - opj_t2_t *t2, - OPJ_UINT32 tileno, - opj_tcd_tile_t *tile, - OPJ_BYTE *src, - OPJ_UINT32 * p_data_read, - OPJ_UINT32 len, - opj_codestream_index_t *cstr_info, - opj_event_mgr_t *p_manager); - -/** - * Creates a Tier 2 handle - * - * @param p_image Source or destination image - * @param p_cp Image coding parameters. - * @return a new T2 handle if successful, NULL otherwise. -*/ -opj_t2_t* opj_t2_create(opj_image_t *p_image, opj_cp_t *p_cp); - -/** -Destroy a T2 handle -@param t2 T2 handle to destroy -*/ -void opj_t2_destroy(opj_t2_t *t2); - -/* ----------------------------------------------------------------------- */ -/*@}*/ - -/*@}*/ - -#endif /* OPJ_T2_H */ diff --git a/src/3rd/LibOpenJpeg/tcd.c b/src/3rd/LibOpenJpeg/tcd.c deleted file mode 100644 index be3b8436..00000000 --- a/src/3rd/LibOpenJpeg/tcd.c +++ /dev/null @@ -1,2804 +0,0 @@ -/* - * The copyright in this software is being made available under the 2-clauses - * BSD License, included below. This software may be subject to other third - * party and contributor rights, including patent rights, and no such rights - * are granted under this license. 
- * - * Copyright (c) 2002-2014, Universite catholique de Louvain (UCL), Belgium - * Copyright (c) 2002-2014, Professor Benoit Macq - * Copyright (c) 2001-2003, David Janssens - * Copyright (c) 2002-2003, Yannick Verschueren - * Copyright (c) 2003-2007, Francois-Olivier Devaux - * Copyright (c) 2003-2014, Antonin Descampe - * Copyright (c) 2005, Herve Drolon, FreeImage Team - * Copyright (c) 2006-2007, Parvatha Elangovan - * Copyright (c) 2008, 2011-2012, Centre National d'Etudes Spatiales (CNES), FR - * Copyright (c) 2012, CS Systemes d'Information, France - * Copyright (c) 2017, IntoPIX SA - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS' - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- */ - -#include "opj_includes.h" -#include "opj_common.h" - -/* ----------------------------------------------------------------------- */ - -/* TODO MSD: */ -#ifdef TODO_MSD -void tcd_dump(FILE *fd, opj_tcd_t *tcd, opj_tcd_image_t * img) -{ - int tileno, compno, resno, bandno, precno;/*, cblkno;*/ - - fprintf(fd, "image {\n"); - fprintf(fd, " tw=%d, th=%d x0=%d x1=%d y0=%d y1=%d\n", - img->tw, img->th, tcd->image->x0, tcd->image->x1, tcd->image->y0, - tcd->image->y1); - - for (tileno = 0; tileno < img->th * img->tw; tileno++) { - opj_tcd_tile_t *tile = &tcd->tcd_image->tiles[tileno]; - fprintf(fd, " tile {\n"); - fprintf(fd, " x0=%d, y0=%d, x1=%d, y1=%d, numcomps=%d\n", - tile->x0, tile->y0, tile->x1, tile->y1, tile->numcomps); - for (compno = 0; compno < tile->numcomps; compno++) { - opj_tcd_tilecomp_t *tilec = &tile->comps[compno]; - fprintf(fd, " tilec {\n"); - fprintf(fd, - " x0=%d, y0=%d, x1=%d, y1=%d, numresolutions=%d\n", - tilec->x0, tilec->y0, tilec->x1, tilec->y1, tilec->numresolutions); - for (resno = 0; resno < tilec->numresolutions; resno++) { - opj_tcd_resolution_t *res = &tilec->resolutions[resno]; - fprintf(fd, "\n res {\n"); - fprintf(fd, - " x0=%d, y0=%d, x1=%d, y1=%d, pw=%d, ph=%d, numbands=%d\n", - res->x0, res->y0, res->x1, res->y1, res->pw, res->ph, res->numbands); - for (bandno = 0; bandno < res->numbands; bandno++) { - opj_tcd_band_t *band = &res->bands[bandno]; - fprintf(fd, " band {\n"); - fprintf(fd, - " x0=%d, y0=%d, x1=%d, y1=%d, stepsize=%f, numbps=%d\n", - band->x0, band->y0, band->x1, band->y1, band->stepsize, band->numbps); - for (precno = 0; precno < res->pw * res->ph; precno++) { - opj_tcd_precinct_t *prec = &band->precincts[precno]; - fprintf(fd, " prec {\n"); - fprintf(fd, - " x0=%d, y0=%d, x1=%d, y1=%d, cw=%d, ch=%d\n", - prec->x0, prec->y0, prec->x1, prec->y1, prec->cw, prec->ch); - /* - for (cblkno = 0; cblkno < prec->cw * prec->ch; cblkno++) { - opj_tcd_cblk_t *cblk = &prec->cblks[cblkno]; - fprintf(fd, " cblk {\n"); - fprintf(fd, - " x0=%d, y0=%d, x1=%d, y1=%d\n", - cblk->x0, cblk->y0, cblk->x1, cblk->y1); - fprintf(fd, " }\n"); - } - */ - fprintf(fd, " }\n"); - } - fprintf(fd, " }\n"); - } - fprintf(fd, " }\n"); - } - fprintf(fd, " }\n"); - } - fprintf(fd, " }\n"); - } - fprintf(fd, "}\n"); -} -#endif - -/** - * Initializes tile coding/decoding - */ -static INLINE OPJ_BOOL opj_tcd_init_tile(opj_tcd_t *p_tcd, OPJ_UINT32 p_tile_no, - OPJ_BOOL isEncoder, OPJ_FLOAT32 fraction, OPJ_SIZE_T sizeof_block, - opj_event_mgr_t* manager); - -/** -* Allocates memory for a decoding code block. -*/ -static OPJ_BOOL opj_tcd_code_block_dec_allocate(opj_tcd_cblk_dec_t * - p_code_block); - -/** - * Deallocates the decoding data of the given precinct. - */ -static void opj_tcd_code_block_dec_deallocate(opj_tcd_precinct_t * p_precinct); - -/** - * Allocates memory for an encoding code block (but not data). - */ -static OPJ_BOOL opj_tcd_code_block_enc_allocate(opj_tcd_cblk_enc_t * - p_code_block); - -/** - * Allocates data for an encoding code block - */ -static OPJ_BOOL opj_tcd_code_block_enc_allocate_data(opj_tcd_cblk_enc_t * - p_code_block); - -/** - * Deallocates the encoding data of the given precinct. 
- */ -static void opj_tcd_code_block_enc_deallocate(opj_tcd_precinct_t * p_precinct); - - -/** -Free the memory allocated for encoding -@param tcd TCD handle -*/ -static void opj_tcd_free_tile(opj_tcd_t *tcd); - - -static OPJ_BOOL opj_tcd_t2_decode(opj_tcd_t *p_tcd, - OPJ_BYTE * p_src_data, - OPJ_UINT32 * p_data_read, - OPJ_UINT32 p_max_src_size, - opj_codestream_index_t *p_cstr_index, - opj_event_mgr_t *p_manager); - -static OPJ_BOOL opj_tcd_t1_decode(opj_tcd_t *p_tcd, - opj_event_mgr_t *p_manager); - -static OPJ_BOOL opj_tcd_dwt_decode(opj_tcd_t *p_tcd); - -static OPJ_BOOL opj_tcd_mct_decode(opj_tcd_t *p_tcd, - opj_event_mgr_t *p_manager); - -static OPJ_BOOL opj_tcd_dc_level_shift_decode(opj_tcd_t *p_tcd); - - -static OPJ_BOOL opj_tcd_dc_level_shift_encode(opj_tcd_t *p_tcd); - -static OPJ_BOOL opj_tcd_mct_encode(opj_tcd_t *p_tcd); - -static OPJ_BOOL opj_tcd_dwt_encode(opj_tcd_t *p_tcd); - -static OPJ_BOOL opj_tcd_t1_encode(opj_tcd_t *p_tcd); - -static OPJ_BOOL opj_tcd_t2_encode(opj_tcd_t *p_tcd, - OPJ_BYTE * p_dest_data, - OPJ_UINT32 * p_data_written, - OPJ_UINT32 p_max_dest_size, - opj_codestream_info_t *p_cstr_info, - opj_event_mgr_t *p_manager); - -static OPJ_BOOL opj_tcd_rate_allocate_encode(opj_tcd_t *p_tcd, - OPJ_BYTE * p_dest_data, - OPJ_UINT32 p_max_dest_size, - opj_codestream_info_t *p_cstr_info, - opj_event_mgr_t *p_manager); - - -static OPJ_BOOL opj_tcd_is_whole_tilecomp_decoding(opj_tcd_t *tcd, - OPJ_UINT32 compno); - -/* ----------------------------------------------------------------------- */ - -/** -Create a new TCD handle -*/ -opj_tcd_t* opj_tcd_create(OPJ_BOOL p_is_decoder) -{ - opj_tcd_t *l_tcd = 00; - - /* create the tcd structure */ - l_tcd = (opj_tcd_t*) opj_calloc(1, sizeof(opj_tcd_t)); - if (!l_tcd) { - return 00; - } - - l_tcd->m_is_decoder = p_is_decoder ? 
1 : 0; - - l_tcd->tcd_image = (opj_tcd_image_t*)opj_calloc(1, sizeof(opj_tcd_image_t)); - if (!l_tcd->tcd_image) { - opj_free(l_tcd); - return 00; - } - - return l_tcd; -} - - -/* ----------------------------------------------------------------------- */ - -void opj_tcd_rateallocate_fixed(opj_tcd_t *tcd) -{ - OPJ_UINT32 layno; - - for (layno = 0; layno < tcd->tcp->numlayers; layno++) { - opj_tcd_makelayer_fixed(tcd, layno, 1); - } -} - - -void opj_tcd_makelayer(opj_tcd_t *tcd, - OPJ_UINT32 layno, - OPJ_FLOAT64 thresh, - OPJ_UINT32 final) -{ - OPJ_UINT32 compno, resno, bandno, precno, cblkno; - OPJ_UINT32 passno; - - opj_tcd_tile_t *tcd_tile = tcd->tcd_image->tiles; - - tcd_tile->distolayer[layno] = 0; /* fixed_quality */ - - for (compno = 0; compno < tcd_tile->numcomps; compno++) { - opj_tcd_tilecomp_t *tilec = &tcd_tile->comps[compno]; - - for (resno = 0; resno < tilec->numresolutions; resno++) { - opj_tcd_resolution_t *res = &tilec->resolutions[resno]; - - for (bandno = 0; bandno < res->numbands; bandno++) { - opj_tcd_band_t *band = &res->bands[bandno]; - - /* Skip empty bands */ - if (opj_tcd_is_band_empty(band)) { - continue; - } - - for (precno = 0; precno < res->pw * res->ph; precno++) { - opj_tcd_precinct_t *prc = &band->precincts[precno]; - - for (cblkno = 0; cblkno < prc->cw * prc->ch; cblkno++) { - opj_tcd_cblk_enc_t *cblk = &prc->cblks.enc[cblkno]; - opj_tcd_layer_t *layer = &cblk->layers[layno]; - OPJ_UINT32 n; - - if (layno == 0) { - cblk->numpassesinlayers = 0; - } - - n = cblk->numpassesinlayers; - - if (thresh < 0) { - /* Special value to indicate to use all passes */ - n = cblk->totalpasses; - } else { - for (passno = cblk->numpassesinlayers; passno < cblk->totalpasses; passno++) { - OPJ_UINT32 dr; - OPJ_FLOAT64 dd; - opj_tcd_pass_t *pass = &cblk->passes[passno]; - - if (n == 0) { - dr = pass->rate; - dd = pass->distortiondec; - } else { - dr = pass->rate - cblk->passes[n - 1].rate; - dd = pass->distortiondec - cblk->passes[n - 1].distortiondec; - } - - if (!dr) { - if (dd != 0) { - n = passno + 1; - } - continue; - } - if (thresh - (dd / dr) < - DBL_EPSILON) { /* do not rely on float equality, check with DBL_EPSILON margin */ - n = passno + 1; - } - } - } - - layer->numpasses = n - cblk->numpassesinlayers; - - if (!layer->numpasses) { - layer->disto = 0; - continue; - } - - if (cblk->numpassesinlayers == 0) { - layer->len = cblk->passes[n - 1].rate; - layer->data = cblk->data; - layer->disto = cblk->passes[n - 1].distortiondec; - } else { - layer->len = cblk->passes[n - 1].rate - cblk->passes[cblk->numpassesinlayers - - 1].rate; - layer->data = cblk->data + cblk->passes[cblk->numpassesinlayers - 1].rate; - layer->disto = cblk->passes[n - 1].distortiondec - - cblk->passes[cblk->numpassesinlayers - 1].distortiondec; - } - - tcd_tile->distolayer[layno] += layer->disto; /* fixed_quality */ - - if (final) { - cblk->numpassesinlayers = n; - } - } - } - } - } - } -} - -void opj_tcd_makelayer_fixed(opj_tcd_t *tcd, OPJ_UINT32 layno, - OPJ_UINT32 final) -{ - OPJ_UINT32 compno, resno, bandno, precno, cblkno; - OPJ_INT32 value; /*, matrice[tcd_tcp->numlayers][tcd_tile->comps[0].numresolutions][3]; */ - OPJ_INT32 matrice[10][10][3]; - OPJ_UINT32 i, j, k; - - opj_cp_t *cp = tcd->cp; - opj_tcd_tile_t *tcd_tile = tcd->tcd_image->tiles; - opj_tcp_t *tcd_tcp = tcd->tcp; - - for (compno = 0; compno < tcd_tile->numcomps; compno++) { - opj_tcd_tilecomp_t *tilec = &tcd_tile->comps[compno]; - - for (i = 0; i < tcd_tcp->numlayers; i++) { - for (j = 0; j < tilec->numresolutions; j++) { - for (k = 
0; k < 3; k++) { - matrice[i][j][k] = - (OPJ_INT32)((OPJ_FLOAT32)cp->m_specific_param.m_enc.m_matrice[i * - tilec->numresolutions * 3 + j * 3 + k] - * (OPJ_FLOAT32)(tcd->image->comps[compno].prec / 16.0)); - } - } - } - - for (resno = 0; resno < tilec->numresolutions; resno++) { - opj_tcd_resolution_t *res = &tilec->resolutions[resno]; - - for (bandno = 0; bandno < res->numbands; bandno++) { - opj_tcd_band_t *band = &res->bands[bandno]; - - /* Skip empty bands */ - if (opj_tcd_is_band_empty(band)) { - continue; - } - - for (precno = 0; precno < res->pw * res->ph; precno++) { - opj_tcd_precinct_t *prc = &band->precincts[precno]; - - for (cblkno = 0; cblkno < prc->cw * prc->ch; cblkno++) { - opj_tcd_cblk_enc_t *cblk = &prc->cblks.enc[cblkno]; - opj_tcd_layer_t *layer = &cblk->layers[layno]; - OPJ_UINT32 n; - OPJ_INT32 imsb = (OPJ_INT32)(tcd->image->comps[compno].prec - - cblk->numbps); /* number of bit-plan equal to zero */ - - /* Correction of the matrix of coefficient to include the IMSB information */ - if (layno == 0) { - value = matrice[layno][resno][bandno]; - if (imsb >= value) { - value = 0; - } else { - value -= imsb; - } - } else { - value = matrice[layno][resno][bandno] - matrice[layno - 1][resno][bandno]; - if (imsb >= matrice[layno - 1][resno][bandno]) { - value -= (imsb - matrice[layno - 1][resno][bandno]); - if (value < 0) { - value = 0; - } - } - } - - if (layno == 0) { - cblk->numpassesinlayers = 0; - } - - n = cblk->numpassesinlayers; - if (cblk->numpassesinlayers == 0) { - if (value != 0) { - n = 3 * (OPJ_UINT32)value - 2 + cblk->numpassesinlayers; - } else { - n = cblk->numpassesinlayers; - } - } else { - n = 3 * (OPJ_UINT32)value + cblk->numpassesinlayers; - } - - layer->numpasses = n - cblk->numpassesinlayers; - - if (!layer->numpasses) { - continue; - } - - if (cblk->numpassesinlayers == 0) { - layer->len = cblk->passes[n - 1].rate; - layer->data = cblk->data; - } else { - layer->len = cblk->passes[n - 1].rate - cblk->passes[cblk->numpassesinlayers - - 1].rate; - layer->data = cblk->data + cblk->passes[cblk->numpassesinlayers - 1].rate; - } - - if (final) { - cblk->numpassesinlayers = n; - } - } - } - } - } - } -} - -OPJ_BOOL opj_tcd_rateallocate(opj_tcd_t *tcd, - OPJ_BYTE *dest, - OPJ_UINT32 * p_data_written, - OPJ_UINT32 len, - opj_codestream_info_t *cstr_info, - opj_event_mgr_t *p_manager) -{ - OPJ_UINT32 compno, resno, bandno, precno, cblkno, layno; - OPJ_UINT32 passno; - OPJ_FLOAT64 min, max; - OPJ_FLOAT64 cumdisto[100]; /* fixed_quality */ - const OPJ_FLOAT64 K = 1; /* 1.1; fixed_quality */ - OPJ_FLOAT64 maxSE = 0; - - opj_cp_t *cp = tcd->cp; - opj_tcd_tile_t *tcd_tile = tcd->tcd_image->tiles; - opj_tcp_t *tcd_tcp = tcd->tcp; - - min = DBL_MAX; - max = 0; - - tcd_tile->numpix = 0; /* fixed_quality */ - - for (compno = 0; compno < tcd_tile->numcomps; compno++) { - opj_tcd_tilecomp_t *tilec = &tcd_tile->comps[compno]; - tilec->numpix = 0; - - for (resno = 0; resno < tilec->numresolutions; resno++) { - opj_tcd_resolution_t *res = &tilec->resolutions[resno]; - - for (bandno = 0; bandno < res->numbands; bandno++) { - opj_tcd_band_t *band = &res->bands[bandno]; - - /* Skip empty bands */ - if (opj_tcd_is_band_empty(band)) { - continue; - } - - for (precno = 0; precno < res->pw * res->ph; precno++) { - opj_tcd_precinct_t *prc = &band->precincts[precno]; - - for (cblkno = 0; cblkno < prc->cw * prc->ch; cblkno++) { - opj_tcd_cblk_enc_t *cblk = &prc->cblks.enc[cblkno]; - - for (passno = 0; passno < cblk->totalpasses; passno++) { - opj_tcd_pass_t *pass = 
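/* every coding pass is a rate/distortion point: the deltas computed
 * below give the slope dd/dr between consecutive passes, and [min, max]
 * brackets all observed slopes so that the per-layer loop further down
 * can bisect it (thresh = (lo + hi) / 2, at most 128 iterations) towards
 * the rate or distortion target. */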
&cblk->passes[passno]; - OPJ_INT32 dr; - OPJ_FLOAT64 dd, rdslope; - - if (passno == 0) { - dr = (OPJ_INT32)pass->rate; - dd = pass->distortiondec; - } else { - dr = (OPJ_INT32)(pass->rate - cblk->passes[passno - 1].rate); - dd = pass->distortiondec - cblk->passes[passno - 1].distortiondec; - } - - if (dr == 0) { - continue; - } - - rdslope = dd / dr; - if (rdslope < min) { - min = rdslope; - } - - if (rdslope > max) { - max = rdslope; - } - } /* passno */ - - /* fixed_quality */ - tcd_tile->numpix += ((cblk->x1 - cblk->x0) * (cblk->y1 - cblk->y0)); - tilec->numpix += ((cblk->x1 - cblk->x0) * (cblk->y1 - cblk->y0)); - } /* cbklno */ - } /* precno */ - } /* bandno */ - } /* resno */ - - maxSE += (((OPJ_FLOAT64)(1 << tcd->image->comps[compno].prec) - 1.0) - * ((OPJ_FLOAT64)(1 << tcd->image->comps[compno].prec) - 1.0)) - * ((OPJ_FLOAT64)(tilec->numpix)); - } /* compno */ - - /* index file */ - if (cstr_info) { - opj_tile_info_t *tile_info = &cstr_info->tile[tcd->tcd_tileno]; - tile_info->numpix = tcd_tile->numpix; - tile_info->distotile = tcd_tile->distotile; - tile_info->thresh = (OPJ_FLOAT64 *) opj_malloc(tcd_tcp->numlayers * sizeof( - OPJ_FLOAT64)); - if (!tile_info->thresh) { - /* FIXME event manager error callback */ - return OPJ_FALSE; - } - } - - for (layno = 0; layno < tcd_tcp->numlayers; layno++) { - OPJ_FLOAT64 lo = min; - OPJ_FLOAT64 hi = max; - OPJ_UINT32 maxlen = tcd_tcp->rates[layno] > 0.0f ? opj_uint_min((( - OPJ_UINT32) ceil(tcd_tcp->rates[layno])), len) : len; - OPJ_FLOAT64 goodthresh = 0; - OPJ_FLOAT64 stable_thresh = 0; - OPJ_UINT32 i; - OPJ_FLOAT64 distotarget; /* fixed_quality */ - - /* fixed_quality */ - distotarget = tcd_tile->distotile - ((K * maxSE) / pow((OPJ_FLOAT32)10, - tcd_tcp->distoratio[layno] / 10)); - - /* Don't try to find an optimal threshold but rather take everything not included yet, if - -r xx,yy,zz,0 (disto_alloc == 1 and rates == 0) - -q xx,yy,zz,0 (fixed_quality == 1 and distoratio == 0) - ==> possible to have some lossy layers and the last layer for sure lossless */ - if (((cp->m_specific_param.m_enc.m_disto_alloc == 1) && - (tcd_tcp->rates[layno] > 0.0f)) || - ((cp->m_specific_param.m_enc.m_fixed_quality == 1) && - (tcd_tcp->distoratio[layno] > 0.0))) { - opj_t2_t*t2 = opj_t2_create(tcd->image, cp); - OPJ_FLOAT64 thresh = 0; - - if (t2 == 00) { - return OPJ_FALSE; - } - - for (i = 0; i < 128; ++i) { - OPJ_FLOAT64 distoachieved = 0; /* fixed_quality */ - - thresh = (lo + hi) / 2; - - opj_tcd_makelayer(tcd, layno, thresh, 0); - - if (cp->m_specific_param.m_enc.m_fixed_quality) { /* fixed_quality */ - if (OPJ_IS_CINEMA(cp->rsiz)) { - if (! opj_t2_encode_packets(t2, tcd->tcd_tileno, tcd_tile, layno + 1, dest, - p_data_written, maxlen, cstr_info, tcd->cur_tp_num, tcd->tp_pos, tcd->cur_pino, - THRESH_CALC, p_manager)) { - - lo = thresh; - continue; - } else { - distoachieved = layno == 0 ? - tcd_tile->distolayer[0] : cumdisto[layno - 1] + tcd_tile->distolayer[layno]; - - if (distoachieved < distotarget) { - hi = thresh; - stable_thresh = thresh; - continue; - } else { - lo = thresh; - } - } - } else { - distoachieved = (layno == 0) ? - tcd_tile->distolayer[0] : (cumdisto[layno - 1] + tcd_tile->distolayer[layno]); - - if (distoachieved < distotarget) { - hi = thresh; - stable_thresh = thresh; - continue; - } - lo = thresh; - } - } else { - if (! opj_t2_encode_packets(t2, tcd->tcd_tileno, tcd_tile, layno + 1, dest, - p_data_written, maxlen, cstr_info, tcd->cur_tp_num, tcd->tp_pos, tcd->cur_pino, - THRESH_CALC, p_manager)) { - /* TODO: what to do with l ??? 
seek / tell ??? */ - /* opj_event_msg(tcd->cinfo, EVT_INFO, "rate alloc: len=%d, max=%d\n", l, maxlen); */ - lo = thresh; - continue; - } - - hi = thresh; - stable_thresh = thresh; - } - } - - goodthresh = stable_thresh == 0 ? thresh : stable_thresh; - - opj_t2_destroy(t2); - } else { - /* Special value to indicate to use all passes */ - goodthresh = -1; - } - - if (cstr_info) { /* Threshold for Marcela Index */ - cstr_info->tile[tcd->tcd_tileno].thresh[layno] = goodthresh; - } - - opj_tcd_makelayer(tcd, layno, goodthresh, 1); - - /* fixed_quality */ - cumdisto[layno] = (layno == 0) ? tcd_tile->distolayer[0] : - (cumdisto[layno - 1] + tcd_tile->distolayer[layno]); - } - - return OPJ_TRUE; -} - -OPJ_BOOL opj_tcd_init(opj_tcd_t *p_tcd, - opj_image_t * p_image, - opj_cp_t * p_cp, - opj_thread_pool_t* p_tp) -{ - p_tcd->image = p_image; - p_tcd->cp = p_cp; - - p_tcd->tcd_image->tiles = (opj_tcd_tile_t *) opj_calloc(1, - sizeof(opj_tcd_tile_t)); - if (! p_tcd->tcd_image->tiles) { - return OPJ_FALSE; - } - - p_tcd->tcd_image->tiles->comps = (opj_tcd_tilecomp_t *) opj_calloc( - p_image->numcomps, sizeof(opj_tcd_tilecomp_t)); - if (! p_tcd->tcd_image->tiles->comps) { - return OPJ_FALSE; - } - - p_tcd->tcd_image->tiles->numcomps = p_image->numcomps; - p_tcd->tp_pos = p_cp->m_specific_param.m_enc.m_tp_pos; - p_tcd->thread_pool = p_tp; - - return OPJ_TRUE; -} - -/** -Destroy a previously created TCD handle -*/ -void opj_tcd_destroy(opj_tcd_t *tcd) -{ - if (tcd) { - opj_tcd_free_tile(tcd); - - if (tcd->tcd_image) { - opj_free(tcd->tcd_image); - tcd->tcd_image = 00; - } - - opj_free(tcd->used_component); - - opj_free(tcd); - } -} - -OPJ_BOOL opj_alloc_tile_component_data(opj_tcd_tilecomp_t *l_tilec) -{ - if ((l_tilec->data == 00) || - ((l_tilec->data_size_needed > l_tilec->data_size) && - (l_tilec->ownsData == OPJ_FALSE))) { - l_tilec->data = (OPJ_INT32 *) opj_image_data_alloc(l_tilec->data_size_needed); - if (!l_tilec->data && l_tilec->data_size_needed != 0) { - return OPJ_FALSE; - } - /*fprintf(stderr, "tAllocate data of tilec (int): %d x OPJ_UINT32n",l_data_size);*/ - l_tilec->data_size = l_tilec->data_size_needed; - l_tilec->ownsData = OPJ_TRUE; - } else if (l_tilec->data_size_needed > l_tilec->data_size) { - /* We don't need to keep old data */ - opj_image_data_free(l_tilec->data); - l_tilec->data = (OPJ_INT32 *) opj_image_data_alloc(l_tilec->data_size_needed); - if (! 
l_tilec->data) { - l_tilec->data_size = 0; - l_tilec->data_size_needed = 0; - l_tilec->ownsData = OPJ_FALSE; - return OPJ_FALSE; - } - /*fprintf(stderr, "tReallocate data of tilec (int): from %d to %d x OPJ_UINT32n", l_tilec->data_size, l_data_size);*/ - l_tilec->data_size = l_tilec->data_size_needed; - l_tilec->ownsData = OPJ_TRUE; - } - return OPJ_TRUE; -} - -/* ----------------------------------------------------------------------- */ - -static INLINE OPJ_BOOL opj_tcd_init_tile(opj_tcd_t *p_tcd, OPJ_UINT32 p_tile_no, - OPJ_BOOL isEncoder, OPJ_FLOAT32 fraction, OPJ_SIZE_T sizeof_block, - opj_event_mgr_t* manager) -{ - OPJ_UINT32(*l_gain_ptr)(OPJ_UINT32) = 00; - OPJ_UINT32 compno, resno, bandno, precno, cblkno; - opj_tcp_t * l_tcp = 00; - opj_cp_t * l_cp = 00; - opj_tcd_tile_t * l_tile = 00; - opj_tccp_t *l_tccp = 00; - opj_tcd_tilecomp_t *l_tilec = 00; - opj_image_comp_t * l_image_comp = 00; - opj_tcd_resolution_t *l_res = 00; - opj_tcd_band_t *l_band = 00; - opj_stepsize_t * l_step_size = 00; - opj_tcd_precinct_t *l_current_precinct = 00; - opj_image_t *l_image = 00; - OPJ_UINT32 p, q; - OPJ_UINT32 l_level_no; - OPJ_UINT32 l_pdx, l_pdy; - OPJ_UINT32 l_gain; - OPJ_INT32 l_x0b, l_y0b; - OPJ_UINT32 l_tx0, l_ty0; - /* extent of precincts , top left, bottom right**/ - OPJ_INT32 l_tl_prc_x_start, l_tl_prc_y_start, l_br_prc_x_end, l_br_prc_y_end; - /* number of precinct for a resolution */ - OPJ_UINT32 l_nb_precincts; - /* room needed to store l_nb_precinct precinct for a resolution */ - OPJ_UINT32 l_nb_precinct_size; - /* number of code blocks for a precinct*/ - OPJ_UINT32 l_nb_code_blocks; - /* room needed to store l_nb_code_blocks code blocks for a precinct*/ - OPJ_UINT32 l_nb_code_blocks_size; - /* size of data for a tile */ - OPJ_UINT32 l_data_size; - - l_cp = p_tcd->cp; - l_tcp = &(l_cp->tcps[p_tile_no]); - l_tile = p_tcd->tcd_image->tiles; - l_tccp = l_tcp->tccps; - l_tilec = l_tile->comps; - l_image = p_tcd->image; - l_image_comp = p_tcd->image->comps; - - p = p_tile_no % l_cp->tw; /* tile coordinates */ - q = p_tile_no / l_cp->tw; - /*fprintf(stderr, "Tile coordinate = %d,%d\n", p, q);*/ - - /* 4 borders of the tile rescale on the image if necessary */ - l_tx0 = l_cp->tx0 + p * - l_cp->tdx; /* can't be greater than l_image->x1 so won't overflow */ - l_tile->x0 = (OPJ_INT32)opj_uint_max(l_tx0, l_image->x0); - l_tile->x1 = (OPJ_INT32)opj_uint_min(opj_uint_adds(l_tx0, l_cp->tdx), - l_image->x1); - /* all those OPJ_UINT32 are casted to OPJ_INT32, let's do some sanity check */ - if ((l_tile->x0 < 0) || (l_tile->x1 <= l_tile->x0)) { - opj_event_msg(manager, EVT_ERROR, "Tile X coordinates are not supported\n"); - return OPJ_FALSE; - } - l_ty0 = l_cp->ty0 + q * - l_cp->tdy; /* can't be greater than l_image->y1 so won't overflow */ - l_tile->y0 = (OPJ_INT32)opj_uint_max(l_ty0, l_image->y0); - l_tile->y1 = (OPJ_INT32)opj_uint_min(opj_uint_adds(l_ty0, l_cp->tdy), - l_image->y1); - /* all those OPJ_UINT32 are casted to OPJ_INT32, let's do some sanity check */ - if ((l_tile->y0 < 0) || (l_tile->y1 <= l_tile->y0)) { - opj_event_msg(manager, EVT_ERROR, "Tile Y coordinates are not supported\n"); - return OPJ_FALSE; - } - - - /* testcase 1888.pdf.asan.35.988 */ - if (l_tccp->numresolutions == 0) { - opj_event_msg(manager, EVT_ERROR, "tiles require at least one resolution\n"); - return OPJ_FALSE; - } - /*fprintf(stderr, "Tile border = %d,%d,%d,%d\n", l_tile->x0, l_tile->y0,l_tile->x1,l_tile->y1);*/ - - /*tile->numcomps = image->numcomps; */ - for (compno = 0; compno < l_tile->numcomps; ++compno) { - 
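/* Per-component geometry: the tile rectangle is divided by the component
 * subsampling factors with the ceiling divisions below, e.g. a 512x512
 * tile with dx = dy = 2 yields a 256x256 tile-component; resolution
 * resno then covers that rectangle scaled down by
 * 2^(numresolutions - 1 - resno) via opj_int_ceildivpow2() further on. */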
/*fprintf(stderr, "compno = %d/%d\n", compno, l_tile->numcomps);*/ - l_image_comp->resno_decoded = 0; - /* border of each l_tile component (global) */ - l_tilec->x0 = opj_int_ceildiv(l_tile->x0, (OPJ_INT32)l_image_comp->dx); - l_tilec->y0 = opj_int_ceildiv(l_tile->y0, (OPJ_INT32)l_image_comp->dy); - l_tilec->x1 = opj_int_ceildiv(l_tile->x1, (OPJ_INT32)l_image_comp->dx); - l_tilec->y1 = opj_int_ceildiv(l_tile->y1, (OPJ_INT32)l_image_comp->dy); - l_tilec->compno = compno; - /*fprintf(stderr, "\tTile compo border = %d,%d,%d,%d\n", l_tilec->x0, l_tilec->y0,l_tilec->x1,l_tilec->y1);*/ - - l_tilec->numresolutions = l_tccp->numresolutions; - if (l_tccp->numresolutions < l_cp->m_specific_param.m_dec.m_reduce) { - l_tilec->minimum_num_resolutions = 1; - } else { - l_tilec->minimum_num_resolutions = l_tccp->numresolutions - - l_cp->m_specific_param.m_dec.m_reduce; - } - - if (isEncoder) { - OPJ_SIZE_T l_tile_data_size; - - /* compute l_data_size with overflow check */ - OPJ_SIZE_T w = (OPJ_SIZE_T)(l_tilec->x1 - l_tilec->x0); - OPJ_SIZE_T h = (OPJ_SIZE_T)(l_tilec->y1 - l_tilec->y0); - - /* issue 733, l_data_size == 0U, probably something wrong should be checked before getting here */ - if (h > 0 && w > SIZE_MAX / h) { - opj_event_msg(manager, EVT_ERROR, "Size of tile data exceeds system limits\n"); - return OPJ_FALSE; - } - l_tile_data_size = w * h; - - if (SIZE_MAX / sizeof(OPJ_UINT32) < l_tile_data_size) { - opj_event_msg(manager, EVT_ERROR, "Size of tile data exceeds system limits\n"); - return OPJ_FALSE; - } - l_tile_data_size = l_tile_data_size * sizeof(OPJ_UINT32); - - l_tilec->data_size_needed = l_tile_data_size; - } - - l_data_size = l_tilec->numresolutions * (OPJ_UINT32)sizeof( - opj_tcd_resolution_t); - - opj_image_data_free(l_tilec->data_win); - l_tilec->data_win = NULL; - l_tilec->win_x0 = 0; - l_tilec->win_y0 = 0; - l_tilec->win_x1 = 0; - l_tilec->win_y1 = 0; - - if (l_tilec->resolutions == 00) { - l_tilec->resolutions = (opj_tcd_resolution_t *) opj_malloc(l_data_size); - if (! l_tilec->resolutions) { - return OPJ_FALSE; - } - /*fprintf(stderr, "\tAllocate resolutions of tilec (opj_tcd_resolution_t): %d\n",l_data_size);*/ - l_tilec->resolutions_size = l_data_size; - memset(l_tilec->resolutions, 0, l_data_size); - } else if (l_data_size > l_tilec->resolutions_size) { - opj_tcd_resolution_t* new_resolutions = (opj_tcd_resolution_t *) opj_realloc( - l_tilec->resolutions, l_data_size); - if (! 
new_resolutions) { - opj_event_msg(manager, EVT_ERROR, "Not enough memory for tile resolutions\n"); - opj_free(l_tilec->resolutions); - l_tilec->resolutions = NULL; - l_tilec->resolutions_size = 0; - return OPJ_FALSE; - } - l_tilec->resolutions = new_resolutions; - /*fprintf(stderr, "\tReallocate data of tilec (int): from %d to %d x OPJ_UINT32\n", l_tilec->resolutions_size, l_data_size);*/ - memset(((OPJ_BYTE*) l_tilec->resolutions) + l_tilec->resolutions_size, 0, - l_data_size - l_tilec->resolutions_size); - l_tilec->resolutions_size = l_data_size; - } - - l_level_no = l_tilec->numresolutions; - l_res = l_tilec->resolutions; - l_step_size = l_tccp->stepsizes; - if (l_tccp->qmfbid == 0) { - l_gain_ptr = &opj_dwt_getgain_real; - } else { - l_gain_ptr = &opj_dwt_getgain; - } - /*fprintf(stderr, "\tlevel_no=%d\n",l_level_no);*/ - - for (resno = 0; resno < l_tilec->numresolutions; ++resno) { - /*fprintf(stderr, "\t\tresno = %d/%d\n", resno, l_tilec->numresolutions);*/ - OPJ_INT32 tlcbgxstart, tlcbgystart /*, brcbgxend, brcbgyend*/; - OPJ_UINT32 cbgwidthexpn, cbgheightexpn; - OPJ_UINT32 cblkwidthexpn, cblkheightexpn; - - --l_level_no; - - /* border for each resolution level (global) */ - l_res->x0 = opj_int_ceildivpow2(l_tilec->x0, (OPJ_INT32)l_level_no); - l_res->y0 = opj_int_ceildivpow2(l_tilec->y0, (OPJ_INT32)l_level_no); - l_res->x1 = opj_int_ceildivpow2(l_tilec->x1, (OPJ_INT32)l_level_no); - l_res->y1 = opj_int_ceildivpow2(l_tilec->y1, (OPJ_INT32)l_level_no); - - /*fprintf(stderr, "\t\t\tres_x0= %d, res_y0 =%d, res_x1=%d, res_y1=%d\n", l_res->x0, l_res->y0, l_res->x1, l_res->y1);*/ - /* p. 35, table A-23, ISO/IEC FDIS154444-1 : 2000 (18 august 2000) */ - l_pdx = l_tccp->prcw[resno]; - l_pdy = l_tccp->prch[resno]; - /*fprintf(stderr, "\t\t\tpdx=%d, pdy=%d\n", l_pdx, l_pdy);*/ - /* p. 64, B.6, ISO/IEC FDIS15444-1 : 2000 (18 august 2000) */ - l_tl_prc_x_start = opj_int_floordivpow2(l_res->x0, (OPJ_INT32)l_pdx) << l_pdx; - l_tl_prc_y_start = opj_int_floordivpow2(l_res->y0, (OPJ_INT32)l_pdy) << l_pdy; - l_br_prc_x_end = opj_int_ceildivpow2(l_res->x1, (OPJ_INT32)l_pdx) << l_pdx; - l_br_prc_y_end = opj_int_ceildivpow2(l_res->y1, (OPJ_INT32)l_pdy) << l_pdy; - /*fprintf(stderr, "\t\t\tprc_x_start=%d, prc_y_start=%d, br_prc_x_end=%d, br_prc_y_end=%d \n", l_tl_prc_x_start, l_tl_prc_y_start, l_br_prc_x_end ,l_br_prc_y_end );*/ - - l_res->pw = (l_res->x0 == l_res->x1) ? 0U : (OPJ_UINT32)(( - l_br_prc_x_end - l_tl_prc_x_start) >> l_pdx); - l_res->ph = (l_res->y0 == l_res->y1) ? 
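/* same formula as l_res->pw just above: a zero-area resolution has no
 * precincts, otherwise the count is the precinct-aligned span divided by
 * the precinct size, e.g. x0 = 0, x1 = 260, l_pdx = 7 (128-wide
 * precincts) gives br = opj_int_ceildivpow2(260, 7) << 7 = 384 and
 * pw = (384 - 0) >> 7 = 3. */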
0U : (OPJ_UINT32)(( - l_br_prc_y_end - l_tl_prc_y_start) >> l_pdy); - /*fprintf(stderr, "\t\t\tres_pw=%d, res_ph=%d\n", l_res->pw, l_res->ph );*/ - - if ((l_res->pw != 0U) && ((((OPJ_UINT32) - 1) / l_res->pw) < l_res->ph)) { - opj_event_msg(manager, EVT_ERROR, "Size of tile data exceeds system limits\n"); - return OPJ_FALSE; - } - l_nb_precincts = l_res->pw * l_res->ph; - - if ((((OPJ_UINT32) - 1) / (OPJ_UINT32)sizeof(opj_tcd_precinct_t)) < - l_nb_precincts) { - opj_event_msg(manager, EVT_ERROR, "Size of tile data exceeds system limits\n"); - return OPJ_FALSE; - } - l_nb_precinct_size = l_nb_precincts * (OPJ_UINT32)sizeof(opj_tcd_precinct_t); - - if (resno == 0) { - tlcbgxstart = l_tl_prc_x_start; - tlcbgystart = l_tl_prc_y_start; - /*brcbgxend = l_br_prc_x_end;*/ - /* brcbgyend = l_br_prc_y_end;*/ - cbgwidthexpn = l_pdx; - cbgheightexpn = l_pdy; - l_res->numbands = 1; - } else { - tlcbgxstart = opj_int_ceildivpow2(l_tl_prc_x_start, 1); - tlcbgystart = opj_int_ceildivpow2(l_tl_prc_y_start, 1); - /*brcbgxend = opj_int_ceildivpow2(l_br_prc_x_end, 1);*/ - /*brcbgyend = opj_int_ceildivpow2(l_br_prc_y_end, 1);*/ - cbgwidthexpn = l_pdx - 1; - cbgheightexpn = l_pdy - 1; - l_res->numbands = 3; - } - - cblkwidthexpn = opj_uint_min(l_tccp->cblkw, cbgwidthexpn); - cblkheightexpn = opj_uint_min(l_tccp->cblkh, cbgheightexpn); - l_band = l_res->bands; - - for (bandno = 0; bandno < l_res->numbands; ++bandno, ++l_band, ++l_step_size) { - OPJ_INT32 numbps; - /*fprintf(stderr, "\t\t\tband_no=%d/%d\n", bandno, l_res->numbands );*/ - - if (resno == 0) { - l_band->bandno = 0 ; - l_band->x0 = opj_int_ceildivpow2(l_tilec->x0, (OPJ_INT32)l_level_no); - l_band->y0 = opj_int_ceildivpow2(l_tilec->y0, (OPJ_INT32)l_level_no); - l_band->x1 = opj_int_ceildivpow2(l_tilec->x1, (OPJ_INT32)l_level_no); - l_band->y1 = opj_int_ceildivpow2(l_tilec->y1, (OPJ_INT32)l_level_no); - } else { - l_band->bandno = bandno + 1; - /* x0b = 1 if bandno = 1 or 3 */ - l_x0b = l_band->bandno & 1; - /* y0b = 1 if bandno = 2 or 3 */ - l_y0b = (OPJ_INT32)((l_band->bandno) >> 1); - /* l_band border (global) */ - l_band->x0 = opj_int64_ceildivpow2(l_tilec->x0 - ((OPJ_INT64)l_x0b << - l_level_no), (OPJ_INT32)(l_level_no + 1)); - l_band->y0 = opj_int64_ceildivpow2(l_tilec->y0 - ((OPJ_INT64)l_y0b << - l_level_no), (OPJ_INT32)(l_level_no + 1)); - l_band->x1 = opj_int64_ceildivpow2(l_tilec->x1 - ((OPJ_INT64)l_x0b << - l_level_no), (OPJ_INT32)(l_level_no + 1)); - l_band->y1 = opj_int64_ceildivpow2(l_tilec->y1 - ((OPJ_INT64)l_y0b << - l_level_no), (OPJ_INT32)(l_level_no + 1)); - } - - if (isEncoder) { - /* Skip empty bands */ - if (opj_tcd_is_band_empty(l_band)) { - /* Do not zero l_band->precints to avoid leaks */ - /* but make sure we don't use it later, since */ - /* it will point to precincts of previous bands... */ - continue; - } - } - - /** avoid an if with storing function pointer */ - l_gain = (*l_gain_ptr)(l_band->bandno); - numbps = (OPJ_INT32)(l_image_comp->prec + l_gain); - l_band->stepsize = (OPJ_FLOAT32)(((1.0 + l_step_size->mant / 2048.0) * pow(2.0, - (OPJ_INT32)(numbps - l_step_size->expn)))) * fraction; - /* Mb value of Equation E-2 in "E.1 Inverse quantization - * procedure" of the standard */ - l_band->numbps = l_step_size->expn + (OPJ_INT32)l_tccp->numgbits - - 1; - - if (!l_band->precincts && (l_nb_precincts > 0U)) { - l_band->precincts = (opj_tcd_precinct_t *) opj_malloc(/*3 * */ - l_nb_precinct_size); - if (! 
l_band->precincts) { - opj_event_msg(manager, EVT_ERROR, - "Not enough memory to handle band precints\n"); - return OPJ_FALSE; - } - /*fprintf(stderr, "\t\t\t\tAllocate precincts of a band (opj_tcd_precinct_t): %d\n",l_nb_precinct_size); */ - memset(l_band->precincts, 0, l_nb_precinct_size); - l_band->precincts_data_size = l_nb_precinct_size; - } else if (l_band->precincts_data_size < l_nb_precinct_size) { - - opj_tcd_precinct_t * new_precincts = (opj_tcd_precinct_t *) opj_realloc( - l_band->precincts,/*3 * */ l_nb_precinct_size); - if (! new_precincts) { - opj_event_msg(manager, EVT_ERROR, - "Not enough memory to handle band precints\n"); - opj_free(l_band->precincts); - l_band->precincts = NULL; - l_band->precincts_data_size = 0; - return OPJ_FALSE; - } - l_band->precincts = new_precincts; - /*fprintf(stderr, "\t\t\t\tReallocate precincts of a band (opj_tcd_precinct_t): from %d to %d\n",l_band->precincts_data_size, l_nb_precinct_size);*/ - memset(((OPJ_BYTE *) l_band->precincts) + l_band->precincts_data_size, 0, - l_nb_precinct_size - l_band->precincts_data_size); - l_band->precincts_data_size = l_nb_precinct_size; - } - - l_current_precinct = l_band->precincts; - for (precno = 0; precno < l_nb_precincts; ++precno) { - OPJ_INT32 tlcblkxstart, tlcblkystart, brcblkxend, brcblkyend; - OPJ_INT32 cbgxstart = tlcbgxstart + (OPJ_INT32)(precno % l_res->pw) * - (1 << cbgwidthexpn); - OPJ_INT32 cbgystart = tlcbgystart + (OPJ_INT32)(precno / l_res->pw) * - (1 << cbgheightexpn); - OPJ_INT32 cbgxend = cbgxstart + (1 << cbgwidthexpn); - OPJ_INT32 cbgyend = cbgystart + (1 << cbgheightexpn); - /*fprintf(stderr, "\t precno=%d; bandno=%d, resno=%d; compno=%d\n", precno, bandno , resno, compno);*/ - /*fprintf(stderr, "\t tlcbgxstart(=%d) + (precno(=%d) percent res->pw(=%d)) * (1 << cbgwidthexpn(=%d)) \n",tlcbgxstart,precno,l_res->pw,cbgwidthexpn);*/ - - /* precinct size (global) */ - /*fprintf(stderr, "\t cbgxstart=%d, l_band->x0 = %d \n",cbgxstart, l_band->x0);*/ - - l_current_precinct->x0 = opj_int_max(cbgxstart, l_band->x0); - l_current_precinct->y0 = opj_int_max(cbgystart, l_band->y0); - l_current_precinct->x1 = opj_int_min(cbgxend, l_band->x1); - l_current_precinct->y1 = opj_int_min(cbgyend, l_band->y1); - /*fprintf(stderr, "\t prc_x0=%d; prc_y0=%d, prc_x1=%d; prc_y1=%d\n",l_current_precinct->x0, l_current_precinct->y0 ,l_current_precinct->x1, l_current_precinct->y1);*/ - - tlcblkxstart = opj_int_floordivpow2(l_current_precinct->x0, - (OPJ_INT32)cblkwidthexpn) << cblkwidthexpn; - /*fprintf(stderr, "\t tlcblkxstart =%d\n",tlcblkxstart );*/ - tlcblkystart = opj_int_floordivpow2(l_current_precinct->y0, - (OPJ_INT32)cblkheightexpn) << cblkheightexpn; - /*fprintf(stderr, "\t tlcblkystart =%d\n",tlcblkystart );*/ - brcblkxend = opj_int_ceildivpow2(l_current_precinct->x1, - (OPJ_INT32)cblkwidthexpn) << cblkwidthexpn; - /*fprintf(stderr, "\t brcblkxend =%d\n",brcblkxend );*/ - brcblkyend = opj_int_ceildivpow2(l_current_precinct->y1, - (OPJ_INT32)cblkheightexpn) << cblkheightexpn; - /*fprintf(stderr, "\t brcblkyend =%d\n",brcblkyend );*/ - l_current_precinct->cw = (OPJ_UINT32)((brcblkxend - tlcblkxstart) >> - cblkwidthexpn); - l_current_precinct->ch = (OPJ_UINT32)((brcblkyend - tlcblkystart) >> - cblkheightexpn); - - l_nb_code_blocks = l_current_precinct->cw * l_current_precinct->ch; - /*fprintf(stderr, "\t\t\t\t precinct_cw = %d x recinct_ch = %d\n",l_current_precinct->cw, l_current_precinct->ch); */ - if ((((OPJ_UINT32) - 1) / (OPJ_UINT32)sizeof_block) < - l_nb_code_blocks) { - opj_event_msg(manager, 
EVT_ERROR, - "Size of code block data exceeds system limits\n"); - return OPJ_FALSE; - } - l_nb_code_blocks_size = l_nb_code_blocks * (OPJ_UINT32)sizeof_block; - - if (!l_current_precinct->cblks.blocks && (l_nb_code_blocks > 0U)) { - l_current_precinct->cblks.blocks = opj_malloc(l_nb_code_blocks_size); - if (! l_current_precinct->cblks.blocks) { - return OPJ_FALSE; - } - /*fprintf(stderr, "\t\t\t\tAllocate cblks of a precinct (opj_tcd_cblk_dec_t): %d\n",l_nb_code_blocks_size);*/ - - memset(l_current_precinct->cblks.blocks, 0, l_nb_code_blocks_size); - - l_current_precinct->block_size = l_nb_code_blocks_size; - } else if (l_nb_code_blocks_size > l_current_precinct->block_size) { - void *new_blocks = opj_realloc(l_current_precinct->cblks.blocks, - l_nb_code_blocks_size); - if (! new_blocks) { - opj_free(l_current_precinct->cblks.blocks); - l_current_precinct->cblks.blocks = NULL; - l_current_precinct->block_size = 0; - opj_event_msg(manager, EVT_ERROR, - "Not enough memory for current precinct codeblock element\n"); - return OPJ_FALSE; - } - l_current_precinct->cblks.blocks = new_blocks; - /*fprintf(stderr, "\t\t\t\tReallocate cblks of a precinct (opj_tcd_cblk_dec_t): from %d to %d\n",l_current_precinct->block_size, l_nb_code_blocks_size); */ - - memset(((OPJ_BYTE *) l_current_precinct->cblks.blocks) + - l_current_precinct->block_size - , 0 - , l_nb_code_blocks_size - l_current_precinct->block_size); - - l_current_precinct->block_size = l_nb_code_blocks_size; - } - - if (! l_current_precinct->incltree) { - l_current_precinct->incltree = opj_tgt_create(l_current_precinct->cw, - l_current_precinct->ch, manager); - } else { - l_current_precinct->incltree = opj_tgt_init(l_current_precinct->incltree, - l_current_precinct->cw, l_current_precinct->ch, manager); - } - - if (! l_current_precinct->imsbtree) { - l_current_precinct->imsbtree = opj_tgt_create(l_current_precinct->cw, - l_current_precinct->ch, manager); - } else { - l_current_precinct->imsbtree = opj_tgt_init(l_current_precinct->imsbtree, - l_current_precinct->cw, l_current_precinct->ch, manager); - } - - for (cblkno = 0; cblkno < l_nb_code_blocks; ++cblkno) { - OPJ_INT32 cblkxstart = tlcblkxstart + (OPJ_INT32)(cblkno % - l_current_precinct->cw) * (1 << cblkwidthexpn); - OPJ_INT32 cblkystart = tlcblkystart + (OPJ_INT32)(cblkno / - l_current_precinct->cw) * (1 << cblkheightexpn); - OPJ_INT32 cblkxend = cblkxstart + (1 << cblkwidthexpn); - OPJ_INT32 cblkyend = cblkystart + (1 << cblkheightexpn); - - if (isEncoder) { - opj_tcd_cblk_enc_t* l_code_block = l_current_precinct->cblks.enc + cblkno; - - if (! opj_tcd_code_block_enc_allocate(l_code_block)) { - return OPJ_FALSE; - } - /* code-block size (global) */ - l_code_block->x0 = opj_int_max(cblkxstart, l_current_precinct->x0); - l_code_block->y0 = opj_int_max(cblkystart, l_current_precinct->y0); - l_code_block->x1 = opj_int_min(cblkxend, l_current_precinct->x1); - l_code_block->y1 = opj_int_min(cblkyend, l_current_precinct->y1); - - if (! opj_tcd_code_block_enc_allocate_data(l_code_block)) { - return OPJ_FALSE; - } - } else { - opj_tcd_cblk_dec_t* l_code_block = l_current_precinct->cblks.dec + cblkno; - - if (! 
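/* Each precinct carries two tag trees over its cw x ch code-block
 * grid: incltree (layer inclusion) and imsbtree (number of missing
 * most-significant bit-planes), both read by the Tier-2 packet
 * parser. The create-or-reinit pattern above reduces to:
 *
 *   if (!l_current_precinct->incltree)
 *       l_current_precinct->incltree =
 *           opj_tgt_create(l_current_precinct->cw,
 *                          l_current_precinct->ch, manager);
 *   else
 *       l_current_precinct->incltree =
 *           opj_tgt_init(l_current_precinct->incltree,
 *                        l_current_precinct->cw,
 *                        l_current_precinct->ch, manager);
 *
 * so trees are reused across tiles when the precinct geometry allows.
 */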
opj_tcd_code_block_dec_allocate(l_code_block)) { - return OPJ_FALSE; - } - /* code-block size (global) */ - l_code_block->x0 = opj_int_max(cblkxstart, l_current_precinct->x0); - l_code_block->y0 = opj_int_max(cblkystart, l_current_precinct->y0); - l_code_block->x1 = opj_int_min(cblkxend, l_current_precinct->x1); - l_code_block->y1 = opj_int_min(cblkyend, l_current_precinct->y1); - } - } - ++l_current_precinct; - } /* precno */ - } /* bandno */ - ++l_res; - } /* resno */ - ++l_tccp; - ++l_tilec; - ++l_image_comp; - } /* compno */ - return OPJ_TRUE; -} - -OPJ_BOOL opj_tcd_init_encode_tile(opj_tcd_t *p_tcd, OPJ_UINT32 p_tile_no, - opj_event_mgr_t* p_manager) -{ - return opj_tcd_init_tile(p_tcd, p_tile_no, OPJ_TRUE, 1.0F, - sizeof(opj_tcd_cblk_enc_t), p_manager); -} - -OPJ_BOOL opj_tcd_init_decode_tile(opj_tcd_t *p_tcd, OPJ_UINT32 p_tile_no, - opj_event_mgr_t* p_manager) -{ - return opj_tcd_init_tile(p_tcd, p_tile_no, OPJ_FALSE, 0.5F, - sizeof(opj_tcd_cblk_dec_t), p_manager); -} - -/** - * Allocates memory for an encoding code block (but not data memory). - */ -static OPJ_BOOL opj_tcd_code_block_enc_allocate(opj_tcd_cblk_enc_t * - p_code_block) -{ - if (! p_code_block->layers) { - /* no memset since data */ - p_code_block->layers = (opj_tcd_layer_t*) opj_calloc(100, - sizeof(opj_tcd_layer_t)); - if (! p_code_block->layers) { - return OPJ_FALSE; - } - } - if (! p_code_block->passes) { - p_code_block->passes = (opj_tcd_pass_t*) opj_calloc(100, - sizeof(opj_tcd_pass_t)); - if (! p_code_block->passes) { - return OPJ_FALSE; - } - } - return OPJ_TRUE; -} - -/** - * Allocates data memory for an encoding code block. - */ -static OPJ_BOOL opj_tcd_code_block_enc_allocate_data(opj_tcd_cblk_enc_t * - p_code_block) -{ - OPJ_UINT32 l_data_size; - - /* +1 is needed for https://github.com/uclouvain/openjpeg/issues/835 */ - /* and actually +2 required for https://github.com/uclouvain/openjpeg/issues/982 */ - /* TODO: is there a theoretical upper-bound for the compressed code */ - /* block size ? */ - l_data_size = 2 + (OPJ_UINT32)((p_code_block->x1 - p_code_block->x0) * - (p_code_block->y1 - p_code_block->y0) * (OPJ_INT32)sizeof(OPJ_UINT32)); - - if (l_data_size > p_code_block->data_size) { - if (p_code_block->data) { - /* We refer to data - 1 since below we incremented it */ - opj_free(p_code_block->data - 1); - } - p_code_block->data = (OPJ_BYTE*) opj_malloc(l_data_size + 1); - if (! p_code_block->data) { - p_code_block->data_size = 0U; - return OPJ_FALSE; - } - p_code_block->data_size = l_data_size; - - /* We reserve the initial byte as a fake byte to a non-FF value */ - /* and increment the data pointer, so that opj_mqc_init_enc() */ - /* can do bp = data - 1, and opj_mqc_byteout() can safely dereference */ - /* it. */ - p_code_block->data[0] = 0; - p_code_block->data += 1; /*why +1 ?*/ - } - return OPJ_TRUE; -} - - -void opj_tcd_reinit_segment(opj_tcd_seg_t* seg) -{ - memset(seg, 0, sizeof(opj_tcd_seg_t)); -} - -/** - * Allocates memory for a decoding code block. - */ -static OPJ_BOOL opj_tcd_code_block_dec_allocate(opj_tcd_cblk_dec_t * - p_code_block) -{ - if (! p_code_block->segs) { - - p_code_block->segs = (opj_tcd_seg_t *) opj_calloc(OPJ_J2K_DEFAULT_NB_SEGS, - sizeof(opj_tcd_seg_t)); - if (! 
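/* opj_tcd_code_block_enc_allocate_data() above reserves one guard
 * byte in front of the logical buffer: it allocates l_data_size + 1,
 * stores a non-0xFF byte in data[0], then publishes data + 1. A
 * simplified sketch of the convention this imposes:
 *
 *   OPJ_BYTE* raw = (OPJ_BYTE*) opj_malloc(l_data_size + 1);
 *   raw[0] = 0;                  a fake non-FF marker byte
 *   cblk->data = raw + 1;        callers see the shifted pointer
 *   ...
 *   opj_free(cblk->data - 1);    always free the original pointer
 *
 * This lets opj_mqc_init_enc() set bp = data - 1 and lets
 * opj_mqc_byteout() dereference it safely;
 * opj_tcd_code_block_enc_deallocate() below follows the same
 * data - 1 rule when freeing.
 */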
p_code_block->segs) { - return OPJ_FALSE; - } - /*fprintf(stderr, "Allocate %d elements of code_block->data\n", OPJ_J2K_DEFAULT_NB_SEGS * sizeof(opj_tcd_seg_t));*/ - - p_code_block->m_current_max_segs = OPJ_J2K_DEFAULT_NB_SEGS; - /*fprintf(stderr, "m_current_max_segs of code_block->data = %d\n", p_code_block->m_current_max_segs);*/ - } else { - /* sanitize */ - opj_tcd_seg_t * l_segs = p_code_block->segs; - OPJ_UINT32 l_current_max_segs = p_code_block->m_current_max_segs; - opj_tcd_seg_data_chunk_t* l_chunks = p_code_block->chunks; - OPJ_UINT32 l_numchunksalloc = p_code_block->numchunksalloc; - OPJ_UINT32 i; - - opj_aligned_free(p_code_block->decoded_data); - p_code_block->decoded_data = 00; - - memset(p_code_block, 0, sizeof(opj_tcd_cblk_dec_t)); - p_code_block->segs = l_segs; - p_code_block->m_current_max_segs = l_current_max_segs; - for (i = 0; i < l_current_max_segs; ++i) { - opj_tcd_reinit_segment(&l_segs[i]); - } - p_code_block->chunks = l_chunks; - p_code_block->numchunksalloc = l_numchunksalloc; - } - - return OPJ_TRUE; -} - -OPJ_UINT32 opj_tcd_get_decoded_tile_size(opj_tcd_t *p_tcd, - OPJ_BOOL take_into_account_partial_decoding) -{ - OPJ_UINT32 i; - OPJ_UINT32 l_data_size = 0; - opj_image_comp_t * l_img_comp = 00; - opj_tcd_tilecomp_t * l_tile_comp = 00; - opj_tcd_resolution_t * l_res = 00; - OPJ_UINT32 l_size_comp, l_remaining; - OPJ_UINT32 l_temp; - - l_tile_comp = p_tcd->tcd_image->tiles->comps; - l_img_comp = p_tcd->image->comps; - - for (i = 0; i < p_tcd->image->numcomps; ++i) { - OPJ_UINT32 w, h; - l_size_comp = l_img_comp->prec >> 3; /*(/ 8)*/ - l_remaining = l_img_comp->prec & 7; /* (%8) */ - - if (l_remaining) { - ++l_size_comp; - } - - if (l_size_comp == 3) { - l_size_comp = 4; - } - - l_res = l_tile_comp->resolutions + l_tile_comp->minimum_num_resolutions - 1; - if (take_into_account_partial_decoding && !p_tcd->whole_tile_decoding) { - w = l_res->win_x1 - l_res->win_x0; - h = l_res->win_y1 - l_res->win_y0; - } else { - w = (OPJ_UINT32)(l_res->x1 - l_res->x0); - h = (OPJ_UINT32)(l_res->y1 - l_res->y0); - } - if (h > 0 && UINT_MAX / w < h) { - return UINT_MAX; - } - l_temp = w * h; - if (l_size_comp && UINT_MAX / l_size_comp < l_temp) { - return UINT_MAX; - } - l_temp *= l_size_comp; - - if (l_temp > UINT_MAX - l_data_size) { - return UINT_MAX; - } - l_data_size += l_temp; - ++l_img_comp; - ++l_tile_comp; - } - - return l_data_size; -} - -OPJ_BOOL opj_tcd_encode_tile(opj_tcd_t *p_tcd, - OPJ_UINT32 p_tile_no, - OPJ_BYTE *p_dest, - OPJ_UINT32 * p_data_written, - OPJ_UINT32 p_max_length, - opj_codestream_info_t *p_cstr_info, - opj_event_mgr_t *p_manager) -{ - - if (p_tcd->cur_tp_num == 0) { - - p_tcd->tcd_tileno = p_tile_no; - p_tcd->tcp = &p_tcd->cp->tcps[p_tile_no]; - - /* INDEX >> "Precinct_nb_X et Precinct_nb_Y" */ - if (p_cstr_info) { - OPJ_UINT32 l_num_packs = 0; - OPJ_UINT32 i; - opj_tcd_tilecomp_t *l_tilec_idx = - &p_tcd->tcd_image->tiles->comps[0]; /* based on component 0 */ - opj_tccp_t *l_tccp = p_tcd->tcp->tccps; /* based on component 0 */ - - for (i = 0; i < l_tilec_idx->numresolutions; i++) { - opj_tcd_resolution_t *l_res_idx = &l_tilec_idx->resolutions[i]; - - p_cstr_info->tile[p_tile_no].pw[i] = (int)l_res_idx->pw; - p_cstr_info->tile[p_tile_no].ph[i] = (int)l_res_idx->ph; - - l_num_packs += l_res_idx->pw * l_res_idx->ph; - p_cstr_info->tile[p_tile_no].pdx[i] = (int)l_tccp->prcw[i]; - p_cstr_info->tile[p_tile_no].pdy[i] = (int)l_tccp->prch[i]; - } - p_cstr_info->tile[p_tile_no].packet = (opj_packet_info_t*) opj_calloc(( - OPJ_SIZE_T)p_cstr_info->numcomps * 
(OPJ_SIZE_T)p_cstr_info->numlayers * - l_num_packs, - sizeof(opj_packet_info_t)); - if (!p_cstr_info->tile[p_tile_no].packet) { - /* FIXME event manager error callback */ - return OPJ_FALSE; - } - } - /* << INDEX */ - - /* FIXME _ProfStart(PGROUP_DC_SHIFT); */ - /*---------------TILE-------------------*/ - if (! opj_tcd_dc_level_shift_encode(p_tcd)) { - return OPJ_FALSE; - } - /* FIXME _ProfStop(PGROUP_DC_SHIFT); */ - - /* FIXME _ProfStart(PGROUP_MCT); */ - if (! opj_tcd_mct_encode(p_tcd)) { - return OPJ_FALSE; - } - /* FIXME _ProfStop(PGROUP_MCT); */ - - /* FIXME _ProfStart(PGROUP_DWT); */ - if (! opj_tcd_dwt_encode(p_tcd)) { - return OPJ_FALSE; - } - /* FIXME _ProfStop(PGROUP_DWT); */ - - /* FIXME _ProfStart(PGROUP_T1); */ - if (! opj_tcd_t1_encode(p_tcd)) { - return OPJ_FALSE; - } - /* FIXME _ProfStop(PGROUP_T1); */ - - /* FIXME _ProfStart(PGROUP_RATE); */ - if (! opj_tcd_rate_allocate_encode(p_tcd, p_dest, p_max_length, - p_cstr_info, p_manager)) { - return OPJ_FALSE; - } - /* FIXME _ProfStop(PGROUP_RATE); */ - - } - /*--------------TIER2------------------*/ - - /* INDEX */ - if (p_cstr_info) { - p_cstr_info->index_write = 1; - } - /* FIXME _ProfStart(PGROUP_T2); */ - - if (! opj_tcd_t2_encode(p_tcd, p_dest, p_data_written, p_max_length, - p_cstr_info, p_manager)) { - return OPJ_FALSE; - } - /* FIXME _ProfStop(PGROUP_T2); */ - - /*---------------CLEAN-------------------*/ - - return OPJ_TRUE; -} - -OPJ_BOOL opj_tcd_decode_tile(opj_tcd_t *p_tcd, - OPJ_UINT32 win_x0, - OPJ_UINT32 win_y0, - OPJ_UINT32 win_x1, - OPJ_UINT32 win_y1, - OPJ_UINT32 numcomps_to_decode, - const OPJ_UINT32 *comps_indices, - OPJ_BYTE *p_src, - OPJ_UINT32 p_max_length, - OPJ_UINT32 p_tile_no, - opj_codestream_index_t *p_cstr_index, - opj_event_mgr_t *p_manager - ) -{ - OPJ_UINT32 l_data_read; - OPJ_UINT32 compno; - - p_tcd->tcd_tileno = p_tile_no; - p_tcd->tcp = &(p_tcd->cp->tcps[p_tile_no]); - p_tcd->win_x0 = win_x0; - p_tcd->win_y0 = win_y0; - p_tcd->win_x1 = win_x1; - p_tcd->win_y1 = win_y1; - p_tcd->whole_tile_decoding = OPJ_TRUE; - - opj_free(p_tcd->used_component); - p_tcd->used_component = NULL; - - if (numcomps_to_decode) { - OPJ_BOOL* used_component = (OPJ_BOOL*) opj_calloc(sizeof(OPJ_BOOL), - p_tcd->image->numcomps); - if (used_component == NULL) { - return OPJ_FALSE; - } - for (compno = 0; compno < numcomps_to_decode; compno++) { - used_component[ comps_indices[compno] ] = OPJ_TRUE; - } - - p_tcd->used_component = used_component; - } - - for (compno = 0; compno < p_tcd->image->numcomps; compno++) { - if (p_tcd->used_component != NULL && !p_tcd->used_component[compno]) { - continue; - } - - if (!opj_tcd_is_whole_tilecomp_decoding(p_tcd, compno)) { - p_tcd->whole_tile_decoding = OPJ_FALSE; - break; - } - } - - if (p_tcd->whole_tile_decoding) { - for (compno = 0; compno < p_tcd->image->numcomps; compno++) { - opj_tcd_tilecomp_t* tilec = &(p_tcd->tcd_image->tiles->comps[compno]); - opj_tcd_resolution_t *l_res = & - (tilec->resolutions[tilec->minimum_num_resolutions - 1]); - OPJ_SIZE_T l_data_size; - - /* compute l_data_size with overflow check */ - OPJ_SIZE_T res_w = (OPJ_SIZE_T)(l_res->x1 - l_res->x0); - OPJ_SIZE_T res_h = (OPJ_SIZE_T)(l_res->y1 - l_res->y0); - - if (p_tcd->used_component != NULL && !p_tcd->used_component[compno]) { - continue; - } - - /* issue 733, l_data_size == 0U, probably something wrong should be checked before getting here */ - if (res_h > 0 && res_w > SIZE_MAX / res_h) { - opj_event_msg(p_manager, EVT_ERROR, - "Size of tile data exceeds system limits\n"); - return OPJ_FALSE; - } - 
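/* used_component acts as a per-component decode filter: NULL means
 * "decode everything", otherwise only flagged indices are processed.
 * The test repeated throughout the decode path is equivalent to this
 * sketch (helper name hypothetical):
 *
 *   static OPJ_BOOL component_requested(const opj_tcd_t* tcd,
 *                                       OPJ_UINT32 compno)
 *   {
 *       return tcd->used_component == NULL ||
 *              tcd->used_component[compno];
 *   }
 *
 * The same guard reappears in the T1, DWT, MCT and DC-level-shift
 * stages below.
 */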
l_data_size = res_w * res_h; - - if (SIZE_MAX / sizeof(OPJ_UINT32) < l_data_size) { - opj_event_msg(p_manager, EVT_ERROR, - "Size of tile data exceeds system limits\n"); - return OPJ_FALSE; - } - l_data_size *= sizeof(OPJ_UINT32); - - tilec->data_size_needed = l_data_size; - - if (!opj_alloc_tile_component_data(tilec)) { - opj_event_msg(p_manager, EVT_ERROR, - "Size of tile data exceeds system limits\n"); - return OPJ_FALSE; - } - } - } else { - /* Compute restricted tile-component and tile-resolution coordinates */ - /* of the window of interest, but defer the memory allocation until */ - /* we know the resno_decoded */ - for (compno = 0; compno < p_tcd->image->numcomps; compno++) { - OPJ_UINT32 resno; - opj_tcd_tilecomp_t* tilec = &(p_tcd->tcd_image->tiles->comps[compno]); - opj_image_comp_t* image_comp = &(p_tcd->image->comps[compno]); - - if (p_tcd->used_component != NULL && !p_tcd->used_component[compno]) { - continue; - } - - /* Compute the intersection of the area of interest, expressed in tile coordinates */ - /* with the tile coordinates */ - tilec->win_x0 = opj_uint_max( - (OPJ_UINT32)tilec->x0, - opj_uint_ceildiv(p_tcd->win_x0, image_comp->dx)); - tilec->win_y0 = opj_uint_max( - (OPJ_UINT32)tilec->y0, - opj_uint_ceildiv(p_tcd->win_y0, image_comp->dy)); - tilec->win_x1 = opj_uint_min( - (OPJ_UINT32)tilec->x1, - opj_uint_ceildiv(p_tcd->win_x1, image_comp->dx)); - tilec->win_y1 = opj_uint_min( - (OPJ_UINT32)tilec->y1, - opj_uint_ceildiv(p_tcd->win_y1, image_comp->dy)); - if (tilec->win_x1 < tilec->win_x0 || - tilec->win_y1 < tilec->win_y0) { - /* We should not normally go there. The circumstance is when */ - /* the tile coordinates do not intersect the area of interest */ - /* Upper level logic should not even try to decode that tile */ - opj_event_msg(p_manager, EVT_ERROR, - "Invalid tilec->win_xxx values\n"); - return OPJ_FALSE; - } - - for (resno = 0; resno < tilec->numresolutions; ++resno) { - opj_tcd_resolution_t *res = tilec->resolutions + resno; - res->win_x0 = opj_uint_ceildivpow2(tilec->win_x0, - tilec->numresolutions - 1 - resno); - res->win_y0 = opj_uint_ceildivpow2(tilec->win_y0, - tilec->numresolutions - 1 - resno); - res->win_x1 = opj_uint_ceildivpow2(tilec->win_x1, - tilec->numresolutions - 1 - resno); - res->win_y1 = opj_uint_ceildivpow2(tilec->win_y1, - tilec->numresolutions - 1 - resno); - } - } - } - -#ifdef TODO_MSD /* FIXME */ - /* INDEX >> */ - if (p_cstr_info) { - OPJ_UINT32 resno, compno, numprec = 0; - for (compno = 0; compno < (OPJ_UINT32) p_cstr_info->numcomps; compno++) { - opj_tcp_t *tcp = &p_tcd->cp->tcps[0]; - opj_tccp_t *tccp = &tcp->tccps[compno]; - opj_tcd_tilecomp_t *tilec_idx = &p_tcd->tcd_image->tiles->comps[compno]; - for (resno = 0; resno < tilec_idx->numresolutions; resno++) { - opj_tcd_resolution_t *res_idx = &tilec_idx->resolutions[resno]; - p_cstr_info->tile[p_tile_no].pw[resno] = res_idx->pw; - p_cstr_info->tile[p_tile_no].ph[resno] = res_idx->ph; - numprec += res_idx->pw * res_idx->ph; - p_cstr_info->tile[p_tile_no].pdx[resno] = tccp->prcw[resno]; - p_cstr_info->tile[p_tile_no].pdy[resno] = tccp->prch[resno]; - } - } - p_cstr_info->tile[p_tile_no].packet = (opj_packet_info_t *) opj_malloc( - p_cstr_info->numlayers * numprec * sizeof(opj_packet_info_t)); - p_cstr_info->packno = 0; - } - /* << INDEX */ -#endif - - /*--------------TIER2------------------*/ - /* FIXME _ProfStart(PGROUP_T2); */ - l_data_read = 0; - if (! 
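/* The per-resolution window computed above shrinks by a ceiling
 * divide per decomposition level. Minimal sketch of one coordinate:
 *
 *   OPJ_UINT32 shift = tilec->numresolutions - 1 - resno;
 *   res->win_x0 = opj_uint_ceildivpow2(tilec->win_x0, shift);
 *
 * where opj_uint_ceildivpow2(a, b) computes (a + (1 << b) - 1) >> b
 * in 64-bit arithmetic, so the finest resolution keeps the window
 * unchanged and each coarser level roughly halves it. The matching
 * buffers are allocated only after Tier-1, once resno_decoded is
 * known.
 */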
opj_tcd_t2_decode(p_tcd, p_src, &l_data_read, p_max_length, p_cstr_index, - p_manager)) { - return OPJ_FALSE; - } - /* FIXME _ProfStop(PGROUP_T2); */ - - /*------------------TIER1-----------------*/ - - /* FIXME _ProfStart(PGROUP_T1); */ - if (! opj_tcd_t1_decode(p_tcd, p_manager)) { - return OPJ_FALSE; - } - /* FIXME _ProfStop(PGROUP_T1); */ - - - /* For subtile decoding, now we know the resno_decoded, we can allocate */ - /* the tile data buffer */ - if (!p_tcd->whole_tile_decoding) { - for (compno = 0; compno < p_tcd->image->numcomps; compno++) { - opj_tcd_tilecomp_t* tilec = &(p_tcd->tcd_image->tiles->comps[compno]); - opj_image_comp_t* image_comp = &(p_tcd->image->comps[compno]); - opj_tcd_resolution_t *res = tilec->resolutions + image_comp->resno_decoded; - OPJ_SIZE_T w = res->win_x1 - res->win_x0; - OPJ_SIZE_T h = res->win_y1 - res->win_y0; - OPJ_SIZE_T l_data_size; - - opj_image_data_free(tilec->data_win); - tilec->data_win = NULL; - - if (p_tcd->used_component != NULL && !p_tcd->used_component[compno]) { - continue; - } - - if (w > 0 && h > 0) { - if (w > SIZE_MAX / h) { - opj_event_msg(p_manager, EVT_ERROR, - "Size of tile data exceeds system limits\n"); - return OPJ_FALSE; - } - l_data_size = w * h; - if (l_data_size > SIZE_MAX / sizeof(OPJ_INT32)) { - opj_event_msg(p_manager, EVT_ERROR, - "Size of tile data exceeds system limits\n"); - return OPJ_FALSE; - } - l_data_size *= sizeof(OPJ_INT32); - - tilec->data_win = (OPJ_INT32*) opj_image_data_alloc(l_data_size); - if (tilec->data_win == NULL) { - opj_event_msg(p_manager, EVT_ERROR, - "Size of tile data exceeds system limits\n"); - return OPJ_FALSE; - } - } - } - } - - /*----------------DWT---------------------*/ - - /* FIXME _ProfStart(PGROUP_DWT); */ - if - (! opj_tcd_dwt_decode(p_tcd)) { - return OPJ_FALSE; - } - /* FIXME _ProfStop(PGROUP_DWT); */ - - /*----------------MCT-------------------*/ - /* FIXME _ProfStart(PGROUP_MCT); */ - if - (! opj_tcd_mct_decode(p_tcd, p_manager)) { - return OPJ_FALSE; - } - /* FIXME _ProfStop(PGROUP_MCT); */ - - /* FIXME _ProfStart(PGROUP_DC_SHIFT); */ - if - (! 
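/* Order of the decode stages driven by opj_tcd_decode_tile():
 *
 *   opj_tcd_t2_decode()              packet headers/bodies (Tier-2)
 *   opj_tcd_t1_decode()              code-block entropy decoding (Tier-1)
 *   data_win allocation              subtile mode only, once
 *                                    resno_decoded is known
 *   opj_tcd_dwt_decode()             inverse wavelet transform
 *   opj_tcd_mct_decode()             inverse multi-component transform
 *   opj_tcd_dc_level_shift_decode()  DC shift and clamping
 *
 * Any stage returning OPJ_FALSE aborts the whole tile decode.
 */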
opj_tcd_dc_level_shift_decode(p_tcd)) { - return OPJ_FALSE; - } - /* FIXME _ProfStop(PGROUP_DC_SHIFT); */ - - - /*---------------TILE-------------------*/ - return OPJ_TRUE; -} - -OPJ_BOOL opj_tcd_update_tile_data(opj_tcd_t *p_tcd, - OPJ_BYTE * p_dest, - OPJ_UINT32 p_dest_length - ) -{ - OPJ_UINT32 i, j, k, l_data_size = 0; - opj_image_comp_t * l_img_comp = 00; - opj_tcd_tilecomp_t * l_tilec = 00; - opj_tcd_resolution_t * l_res; - OPJ_UINT32 l_size_comp, l_remaining; - OPJ_UINT32 l_stride, l_width, l_height; - - l_data_size = opj_tcd_get_decoded_tile_size(p_tcd, OPJ_TRUE); - if (l_data_size == UINT_MAX || l_data_size > p_dest_length) { - return OPJ_FALSE; - } - - l_tilec = p_tcd->tcd_image->tiles->comps; - l_img_comp = p_tcd->image->comps; - - for (i = 0; i < p_tcd->image->numcomps; ++i) { - const OPJ_INT32* l_src_data; - l_size_comp = l_img_comp->prec >> 3; /*(/ 8)*/ - l_remaining = l_img_comp->prec & 7; /* (%8) */ - l_res = l_tilec->resolutions + l_img_comp->resno_decoded; - if (p_tcd->whole_tile_decoding) { - l_width = (OPJ_UINT32)(l_res->x1 - l_res->x0); - l_height = (OPJ_UINT32)(l_res->y1 - l_res->y0); - l_stride = (OPJ_UINT32)(l_tilec->resolutions[l_tilec->minimum_num_resolutions - - 1].x1 - - l_tilec->resolutions[l_tilec->minimum_num_resolutions - 1].x0) - l_width; - l_src_data = l_tilec->data; - } else { - l_width = l_res->win_x1 - l_res->win_x0; - l_height = l_res->win_y1 - l_res->win_y0; - l_stride = 0; - l_src_data = l_tilec->data_win; - } - - if (l_remaining) { - ++l_size_comp; - } - - if (l_size_comp == 3) { - l_size_comp = 4; - } - - switch (l_size_comp) { - case 1: { - OPJ_CHAR * l_dest_ptr = (OPJ_CHAR *) p_dest; - const OPJ_INT32 * l_src_ptr = l_src_data; - - if (l_img_comp->sgnd) { - for (j = 0; j < l_height; ++j) { - for (k = 0; k < l_width; ++k) { - *(l_dest_ptr++) = (OPJ_CHAR)(*(l_src_ptr++)); - } - l_src_ptr += l_stride; - } - } else { - for (j = 0; j < l_height; ++j) { - for (k = 0; k < l_width; ++k) { - *(l_dest_ptr++) = (OPJ_CHAR)((*(l_src_ptr++)) & 0xff); - } - l_src_ptr += l_stride; - } - } - - p_dest = (OPJ_BYTE *)l_dest_ptr; - } - break; - case 2: { - const OPJ_INT32 * l_src_ptr = l_src_data; - OPJ_INT16 * l_dest_ptr = (OPJ_INT16 *) p_dest; - - if (l_img_comp->sgnd) { - for (j = 0; j < l_height; ++j) { - for (k = 0; k < l_width; ++k) { - OPJ_INT16 val = (OPJ_INT16)(*(l_src_ptr++)); - memcpy(l_dest_ptr, &val, sizeof(val)); - l_dest_ptr ++; - } - l_src_ptr += l_stride; - } - } else { - for (j = 0; j < l_height; ++j) { - for (k = 0; k < l_width; ++k) { - OPJ_INT16 val = (OPJ_INT16)((*(l_src_ptr++)) & 0xffff); - memcpy(l_dest_ptr, &val, sizeof(val)); - l_dest_ptr ++; - } - l_src_ptr += l_stride; - } - } - - p_dest = (OPJ_BYTE*) l_dest_ptr; - } - break; - case 4: { - OPJ_INT32 * l_dest_ptr = (OPJ_INT32 *) p_dest; - const OPJ_INT32 * l_src_ptr = l_src_data; - - for (j = 0; j < l_height; ++j) { - memcpy(l_dest_ptr, l_src_ptr, l_width * sizeof(OPJ_INT32)); - l_dest_ptr += l_width; - l_src_ptr += l_width + l_stride; - } - - p_dest = (OPJ_BYTE*) l_dest_ptr; - } - break; - } - - ++l_img_comp; - ++l_tilec; - } - - return OPJ_TRUE; -} - - - - -static void opj_tcd_free_tile(opj_tcd_t *p_tcd) -{ - OPJ_UINT32 compno, resno, bandno, precno; - opj_tcd_tile_t *l_tile = 00; - opj_tcd_tilecomp_t *l_tile_comp = 00; - opj_tcd_resolution_t *l_res = 00; - opj_tcd_band_t *l_band = 00; - opj_tcd_precinct_t *l_precinct = 00; - OPJ_UINT32 l_nb_resolutions, l_nb_precincts; - void (* l_tcd_code_block_deallocate)(opj_tcd_precinct_t *) = 00; - - if (! p_tcd) { - return; - } - - if (! 
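/* In opj_tcd_update_tile_data() above, the sample width in bytes is
 * derived from the component precision as ceil(prec / 8), with the
 * unsupported 3-byte case promoted to 4; the prec >> 3 plus
 * prec & 7 pair is equivalent to:
 *
 *   OPJ_UINT32 size_comp = (l_img_comp->prec + 7u) >> 3;
 *   if (size_comp == 3) {
 *       size_comp = 4;
 *   }
 *
 * which selects the 8-, 16- or 32-bit copy loop in the switch.
 */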
p_tcd->tcd_image) { - return; - } - - if (p_tcd->m_is_decoder) { - l_tcd_code_block_deallocate = opj_tcd_code_block_dec_deallocate; - } else { - l_tcd_code_block_deallocate = opj_tcd_code_block_enc_deallocate; - } - - l_tile = p_tcd->tcd_image->tiles; - if (! l_tile) { - return; - } - - l_tile_comp = l_tile->comps; - - for (compno = 0; compno < l_tile->numcomps; ++compno) { - l_res = l_tile_comp->resolutions; - if (l_res) { - - l_nb_resolutions = l_tile_comp->resolutions_size / (OPJ_UINT32)sizeof( - opj_tcd_resolution_t); - for (resno = 0; resno < l_nb_resolutions; ++resno) { - l_band = l_res->bands; - for (bandno = 0; bandno < 3; ++bandno) { - l_precinct = l_band->precincts; - if (l_precinct) { - - l_nb_precincts = l_band->precincts_data_size / (OPJ_UINT32)sizeof( - opj_tcd_precinct_t); - for (precno = 0; precno < l_nb_precincts; ++precno) { - opj_tgt_destroy(l_precinct->incltree); - l_precinct->incltree = 00; - opj_tgt_destroy(l_precinct->imsbtree); - l_precinct->imsbtree = 00; - (*l_tcd_code_block_deallocate)(l_precinct); - ++l_precinct; - } - - opj_free(l_band->precincts); - l_band->precincts = 00; - } - ++l_band; - } /* for (resno */ - ++l_res; - } - - opj_free(l_tile_comp->resolutions); - l_tile_comp->resolutions = 00; - } - - if (l_tile_comp->ownsData && l_tile_comp->data) { - opj_image_data_free(l_tile_comp->data); - l_tile_comp->data = 00; - l_tile_comp->ownsData = 0; - l_tile_comp->data_size = 0; - l_tile_comp->data_size_needed = 0; - } - - opj_image_data_free(l_tile_comp->data_win); - - ++l_tile_comp; - } - - opj_free(l_tile->comps); - l_tile->comps = 00; - opj_free(p_tcd->tcd_image->tiles); - p_tcd->tcd_image->tiles = 00; -} - - -static OPJ_BOOL opj_tcd_t2_decode(opj_tcd_t *p_tcd, - OPJ_BYTE * p_src_data, - OPJ_UINT32 * p_data_read, - OPJ_UINT32 p_max_src_size, - opj_codestream_index_t *p_cstr_index, - opj_event_mgr_t *p_manager - ) -{ - opj_t2_t * l_t2; - - l_t2 = opj_t2_create(p_tcd->image, p_tcd->cp); - if (l_t2 == 00) { - return OPJ_FALSE; - } - - if (! 
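/* opj_tcd_free_tile() recovers element counts from the stored byte
 * sizes instead of keeping separate counters, e.g.:
 *
 *   l_nb_resolutions = l_tile_comp->resolutions_size
 *                    / (OPJ_UINT32)sizeof(opj_tcd_resolution_t);
 *   l_nb_precincts   = l_band->precincts_data_size
 *                    / (OPJ_UINT32)sizeof(opj_tcd_precinct_t);
 *
 * which is why opj_tcd_init_tile() updates precincts_data_size and
 * block_size whenever the corresponding buffers grow.
 */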
opj_t2_decode_packets( - p_tcd, - l_t2, - p_tcd->tcd_tileno, - p_tcd->tcd_image->tiles, - p_src_data, - p_data_read, - p_max_src_size, - p_cstr_index, - p_manager)) { - opj_t2_destroy(l_t2); - return OPJ_FALSE; - } - - opj_t2_destroy(l_t2); - - /*---------------CLEAN-------------------*/ - return OPJ_TRUE; -} - -static OPJ_BOOL opj_tcd_t1_decode(opj_tcd_t *p_tcd, opj_event_mgr_t *p_manager) -{ - OPJ_UINT32 compno; - opj_tcd_tile_t * l_tile = p_tcd->tcd_image->tiles; - opj_tcd_tilecomp_t* l_tile_comp = l_tile->comps; - opj_tccp_t * l_tccp = p_tcd->tcp->tccps; - volatile OPJ_BOOL ret = OPJ_TRUE; - OPJ_BOOL check_pterm = OPJ_FALSE; - opj_mutex_t* p_manager_mutex = NULL; - - p_manager_mutex = opj_mutex_create(); - - /* Only enable PTERM check if we decode all layers */ - if (p_tcd->tcp->num_layers_to_decode == p_tcd->tcp->numlayers && - (l_tccp->cblksty & J2K_CCP_CBLKSTY_PTERM) != 0) { - check_pterm = OPJ_TRUE; - } - - for (compno = 0; compno < l_tile->numcomps; - ++compno, ++l_tile_comp, ++l_tccp) { - if (p_tcd->used_component != NULL && !p_tcd->used_component[compno]) { - continue; - } - - opj_t1_decode_cblks(p_tcd, &ret, l_tile_comp, l_tccp, - p_manager, p_manager_mutex, check_pterm); - if (!ret) { - break; - } - } - - opj_thread_pool_wait_completion(p_tcd->thread_pool, 0); - if (p_manager_mutex) { - opj_mutex_destroy(p_manager_mutex); - } - return ret; -} - - -static OPJ_BOOL opj_tcd_dwt_decode(opj_tcd_t *p_tcd) -{ - OPJ_UINT32 compno; - opj_tcd_tile_t * l_tile = p_tcd->tcd_image->tiles; - opj_tcd_tilecomp_t * l_tile_comp = l_tile->comps; - opj_tccp_t * l_tccp = p_tcd->tcp->tccps; - opj_image_comp_t * l_img_comp = p_tcd->image->comps; - - for (compno = 0; compno < l_tile->numcomps; - compno++, ++l_tile_comp, ++l_img_comp, ++l_tccp) { - if (p_tcd->used_component != NULL && !p_tcd->used_component[compno]) { - continue; - } - - if (l_tccp->qmfbid == 1) { - if (! opj_dwt_decode(p_tcd, l_tile_comp, - l_img_comp->resno_decoded + 1)) { - return OPJ_FALSE; - } - } else { - if (! opj_dwt_decode_real(p_tcd, l_tile_comp, - l_img_comp->resno_decoded + 1)) { - return OPJ_FALSE; - } - } - - } - - return OPJ_TRUE; -} - -static OPJ_BOOL opj_tcd_mct_decode(opj_tcd_t *p_tcd, opj_event_mgr_t *p_manager) -{ - opj_tcd_tile_t * l_tile = p_tcd->tcd_image->tiles; - opj_tcp_t * l_tcp = p_tcd->tcp; - opj_tcd_tilecomp_t * l_tile_comp = l_tile->comps; - OPJ_UINT32 l_samples, i; - - if (l_tcp->mct == 0 || p_tcd->used_component != NULL) { - return OPJ_TRUE; - } - - if (p_tcd->whole_tile_decoding) { - opj_tcd_resolution_t* res_comp0 = l_tile->comps[0].resolutions + - l_tile_comp->minimum_num_resolutions - 1; - - /* A bit inefficient: we process more data than needed if */ - /* resno_decoded < l_tile_comp->minimum_num_resolutions-1, */ - /* but we would need to take into account a stride then */ - l_samples = (OPJ_UINT32)((res_comp0->x1 - res_comp0->x0) * - (res_comp0->y1 - res_comp0->y0)); - if (l_tile->numcomps >= 3) { - if (l_tile_comp->minimum_num_resolutions != - l_tile->comps[1].minimum_num_resolutions || - l_tile_comp->minimum_num_resolutions != - l_tile->comps[2].minimum_num_resolutions) { - opj_event_msg(p_manager, EVT_ERROR, - "Tiles don't all have the same dimension. 
Skip the MCT step.\n"); - return OPJ_FALSE; - } - } - if (l_tile->numcomps >= 3) { - opj_tcd_resolution_t* res_comp1 = l_tile->comps[1].resolutions + - l_tile_comp->minimum_num_resolutions - 1; - opj_tcd_resolution_t* res_comp2 = l_tile->comps[2].resolutions + - l_tile_comp->minimum_num_resolutions - 1; - /* testcase 1336.pdf.asan.47.376 */ - if (p_tcd->image->comps[0].resno_decoded != - p_tcd->image->comps[1].resno_decoded || - p_tcd->image->comps[0].resno_decoded != - p_tcd->image->comps[2].resno_decoded || - (OPJ_SIZE_T)(res_comp1->x1 - res_comp1->x0) * - (OPJ_SIZE_T)(res_comp1->y1 - res_comp1->y0) != l_samples || - (OPJ_SIZE_T)(res_comp2->x1 - res_comp2->x0) * - (OPJ_SIZE_T)(res_comp2->y1 - res_comp2->y0) != l_samples) { - opj_event_msg(p_manager, EVT_ERROR, - "Tiles don't all have the same dimension. Skip the MCT step.\n"); - return OPJ_FALSE; - } - } - } else { - opj_tcd_resolution_t* res_comp0 = l_tile->comps[0].resolutions + - p_tcd->image->comps[0].resno_decoded; - - l_samples = (res_comp0->win_x1 - res_comp0->win_x0) * - (res_comp0->win_y1 - res_comp0->win_y0); - if (l_tile->numcomps >= 3) { - opj_tcd_resolution_t* res_comp1 = l_tile->comps[1].resolutions + - p_tcd->image->comps[1].resno_decoded; - opj_tcd_resolution_t* res_comp2 = l_tile->comps[2].resolutions + - p_tcd->image->comps[2].resno_decoded; - /* testcase 1336.pdf.asan.47.376 */ - if (p_tcd->image->comps[0].resno_decoded != - p_tcd->image->comps[1].resno_decoded || - p_tcd->image->comps[0].resno_decoded != - p_tcd->image->comps[2].resno_decoded || - (OPJ_SIZE_T)(res_comp1->win_x1 - res_comp1->win_x0) * - (OPJ_SIZE_T)(res_comp1->win_y1 - res_comp1->win_y0) != l_samples || - (OPJ_SIZE_T)(res_comp2->win_x1 - res_comp2->win_x0) * - (OPJ_SIZE_T)(res_comp2->win_y1 - res_comp2->win_y0) != l_samples) { - opj_event_msg(p_manager, EVT_ERROR, - "Tiles don't all have the same dimension. Skip the MCT step.\n"); - return OPJ_FALSE; - } - } - } - - if (l_tile->numcomps >= 3) { - if (l_tcp->mct == 2) { - OPJ_BYTE ** l_data; - - if (! l_tcp->m_mct_decoding_matrix) { - return OPJ_TRUE; - } - - l_data = (OPJ_BYTE **) opj_malloc(l_tile->numcomps * sizeof(OPJ_BYTE*)); - if (! l_data) { - return OPJ_FALSE; - } - - for (i = 0; i < l_tile->numcomps; ++i) { - if (p_tcd->whole_tile_decoding) { - l_data[i] = (OPJ_BYTE*) l_tile_comp->data; - } else { - l_data[i] = (OPJ_BYTE*) l_tile_comp->data_win; - } - ++l_tile_comp; - } - - if (! opj_mct_decode_custom(/* MCT data */ - (OPJ_BYTE*) l_tcp->m_mct_decoding_matrix, - /* size of components */ - l_samples, - /* components */ - l_data, - /* nb of components (i.e. 
size of pData) */ - l_tile->numcomps, - /* tells if the data is signed */ - p_tcd->image->comps->sgnd)) { - opj_free(l_data); - return OPJ_FALSE; - } - - opj_free(l_data); - } else { - if (l_tcp->tccps->qmfbid == 1) { - if (p_tcd->whole_tile_decoding) { - opj_mct_decode(l_tile->comps[0].data, - l_tile->comps[1].data, - l_tile->comps[2].data, - l_samples); - } else { - opj_mct_decode(l_tile->comps[0].data_win, - l_tile->comps[1].data_win, - l_tile->comps[2].data_win, - l_samples); - } - } else { - if (p_tcd->whole_tile_decoding) { - opj_mct_decode_real((OPJ_FLOAT32*)l_tile->comps[0].data, - (OPJ_FLOAT32*)l_tile->comps[1].data, - (OPJ_FLOAT32*)l_tile->comps[2].data, - l_samples); - } else { - opj_mct_decode_real((OPJ_FLOAT32*)l_tile->comps[0].data_win, - (OPJ_FLOAT32*)l_tile->comps[1].data_win, - (OPJ_FLOAT32*)l_tile->comps[2].data_win, - l_samples); - } - } - } - } else { - opj_event_msg(p_manager, EVT_ERROR, - "Number of components (%d) is inconsistent with a MCT. Skip the MCT step.\n", - l_tile->numcomps); - } - - return OPJ_TRUE; -} - - -static OPJ_BOOL opj_tcd_dc_level_shift_decode(opj_tcd_t *p_tcd) -{ - OPJ_UINT32 compno; - opj_tcd_tilecomp_t * l_tile_comp = 00; - opj_tccp_t * l_tccp = 00; - opj_image_comp_t * l_img_comp = 00; - opj_tcd_resolution_t* l_res = 00; - opj_tcd_tile_t * l_tile; - OPJ_UINT32 l_width, l_height, i, j; - OPJ_INT32 * l_current_ptr; - OPJ_INT32 l_min, l_max; - OPJ_UINT32 l_stride; - - l_tile = p_tcd->tcd_image->tiles; - l_tile_comp = l_tile->comps; - l_tccp = p_tcd->tcp->tccps; - l_img_comp = p_tcd->image->comps; - - for (compno = 0; compno < l_tile->numcomps; - compno++, ++l_img_comp, ++l_tccp, ++l_tile_comp) { - - if (p_tcd->used_component != NULL && !p_tcd->used_component[compno]) { - continue; - } - - l_res = l_tile_comp->resolutions + l_img_comp->resno_decoded; - - if (!p_tcd->whole_tile_decoding) { - l_width = l_res->win_x1 - l_res->win_x0; - l_height = l_res->win_y1 - l_res->win_y0; - l_stride = 0; - l_current_ptr = l_tile_comp->data_win; - } else { - l_width = (OPJ_UINT32)(l_res->x1 - l_res->x0); - l_height = (OPJ_UINT32)(l_res->y1 - l_res->y0); - l_stride = (OPJ_UINT32)( - l_tile_comp->resolutions[l_tile_comp->minimum_num_resolutions - 1].x1 - - l_tile_comp->resolutions[l_tile_comp->minimum_num_resolutions - 1].x0) - - l_width; - l_current_ptr = l_tile_comp->data; - - assert(l_height == 0 || - l_width + l_stride <= l_tile_comp->data_size / l_height); /*MUPDF*/ - } - - if (l_img_comp->sgnd) { - l_min = -(1 << (l_img_comp->prec - 1)); - l_max = (1 << (l_img_comp->prec - 1)) - 1; - } else { - l_min = 0; - l_max = (OPJ_INT32)((1U << l_img_comp->prec) - 1); - } - - - if (l_tccp->qmfbid == 1) { - for (j = 0; j < l_height; ++j) { - for (i = 0; i < l_width; ++i) { - /* TODO: do addition on int64 ? 
*/
-                    *l_current_ptr = opj_int_clamp(*l_current_ptr + l_tccp->m_dc_level_shift, l_min,
-                                                   l_max);
-                    ++l_current_ptr;
-                }
-                l_current_ptr += l_stride;
-            }
-        } else {
-            for (j = 0; j < l_height; ++j) {
-                for (i = 0; i < l_width; ++i) {
-                    OPJ_FLOAT32 l_value = *((OPJ_FLOAT32 *) l_current_ptr);
-                    if (l_value > INT_MAX) {
-                        *l_current_ptr = l_max;
-                    } else if (l_value < INT_MIN) {
-                        *l_current_ptr = l_min;
-                    } else {
-                        /* Do addition on int64 to avoid overflows */
-                        OPJ_INT64 l_value_int = (OPJ_INT64)opj_lrintf(l_value);
-                        *l_current_ptr = (OPJ_INT32)opj_int64_clamp(
-                                             l_value_int + l_tccp->m_dc_level_shift, l_min, l_max);
-                    }
-                    ++l_current_ptr;
-                }
-                l_current_ptr += l_stride;
-            }
-        }
-    }
-
-    return OPJ_TRUE;
-}
-
-
-
-/**
- * Deallocates the decoding data of the given precinct.
- */
-static void opj_tcd_code_block_dec_deallocate(opj_tcd_precinct_t * p_precinct)
-{
-    OPJ_UINT32 cblkno, l_nb_code_blocks;
-
-    opj_tcd_cblk_dec_t * l_code_block = p_precinct->cblks.dec;
-    if (l_code_block) {
-        /*fprintf(stderr,"deallocate codeblock:{\n");*/
-        /*fprintf(stderr,"\t x0=%d, y0=%d, x1=%d, y1=%d\n",l_code_block->x0, l_code_block->y0, l_code_block->x1, l_code_block->y1);*/
-        /*fprintf(stderr,"\t numbps=%d, numlenbits=%d, len=%d, numnewpasses=%d, real_num_segs=%d, m_current_max_segs=%d\n ",
-                 l_code_block->numbps, l_code_block->numlenbits, l_code_block->len, l_code_block->numnewpasses, l_code_block->real_num_segs, l_code_block->m_current_max_segs );*/
-
-
-        l_nb_code_blocks = p_precinct->block_size / (OPJ_UINT32)sizeof(
-                               opj_tcd_cblk_dec_t);
-        /*fprintf(stderr,"nb_code_blocks =%d\t}\n", l_nb_code_blocks);*/
-
-        for (cblkno = 0; cblkno < l_nb_code_blocks; ++cblkno) {
-
-            if (l_code_block->segs) {
-                opj_free(l_code_block->segs);
-                l_code_block->segs = 00;
-            }
-
-            if (l_code_block->chunks) {
-                opj_free(l_code_block->chunks);
-                l_code_block->chunks = 00;
-            }
-
-            opj_aligned_free(l_code_block->decoded_data);
-            l_code_block->decoded_data = NULL;
-
-            ++l_code_block;
-        }
-
-        opj_free(p_precinct->cblks.dec);
-        p_precinct->cblks.dec = 00;
-    }
-}
-
-/**
- * Deallocates the encoding data of the given precinct.
- */ -static void opj_tcd_code_block_enc_deallocate(opj_tcd_precinct_t * p_precinct) -{ - OPJ_UINT32 cblkno, l_nb_code_blocks; - - opj_tcd_cblk_enc_t * l_code_block = p_precinct->cblks.enc; - if (l_code_block) { - l_nb_code_blocks = p_precinct->block_size / (OPJ_UINT32)sizeof( - opj_tcd_cblk_enc_t); - - for (cblkno = 0; cblkno < l_nb_code_blocks; ++cblkno) { - if (l_code_block->data) { - /* We refer to data - 1 since below we incremented it */ - /* in opj_tcd_code_block_enc_allocate_data() */ - opj_free(l_code_block->data - 1); - l_code_block->data = 00; - } - - if (l_code_block->layers) { - opj_free(l_code_block->layers); - l_code_block->layers = 00; - } - - if (l_code_block->passes) { - opj_free(l_code_block->passes); - l_code_block->passes = 00; - } - ++l_code_block; - } - - opj_free(p_precinct->cblks.enc); - - p_precinct->cblks.enc = 00; - } -} - -OPJ_SIZE_T opj_tcd_get_encoded_tile_size(opj_tcd_t *p_tcd) -{ - OPJ_UINT32 i; - OPJ_SIZE_T l_data_size = 0; - opj_image_comp_t * l_img_comp = 00; - opj_tcd_tilecomp_t * l_tilec = 00; - OPJ_UINT32 l_size_comp, l_remaining; - - l_tilec = p_tcd->tcd_image->tiles->comps; - l_img_comp = p_tcd->image->comps; - for (i = 0; i < p_tcd->image->numcomps; ++i) { - l_size_comp = l_img_comp->prec >> 3; /*(/ 8)*/ - l_remaining = l_img_comp->prec & 7; /* (%8) */ - - if (l_remaining) { - ++l_size_comp; - } - - if (l_size_comp == 3) { - l_size_comp = 4; - } - - l_data_size += l_size_comp * ((OPJ_SIZE_T)(l_tilec->x1 - l_tilec->x0) * - (OPJ_SIZE_T)(l_tilec->y1 - l_tilec->y0)); - ++l_img_comp; - ++l_tilec; - } - - return l_data_size; -} - -static OPJ_BOOL opj_tcd_dc_level_shift_encode(opj_tcd_t *p_tcd) -{ - OPJ_UINT32 compno; - opj_tcd_tilecomp_t * l_tile_comp = 00; - opj_tccp_t * l_tccp = 00; - opj_image_comp_t * l_img_comp = 00; - opj_tcd_tile_t * l_tile; - OPJ_SIZE_T l_nb_elem, i; - OPJ_INT32 * l_current_ptr; - - l_tile = p_tcd->tcd_image->tiles; - l_tile_comp = l_tile->comps; - l_tccp = p_tcd->tcp->tccps; - l_img_comp = p_tcd->image->comps; - - for (compno = 0; compno < l_tile->numcomps; compno++) { - l_current_ptr = l_tile_comp->data; - l_nb_elem = (OPJ_SIZE_T)(l_tile_comp->x1 - l_tile_comp->x0) * - (OPJ_SIZE_T)(l_tile_comp->y1 - l_tile_comp->y0); - - if (l_tccp->qmfbid == 1) { - for (i = 0; i < l_nb_elem; ++i) { - *l_current_ptr -= l_tccp->m_dc_level_shift ; - ++l_current_ptr; - } - } else { - for (i = 0; i < l_nb_elem; ++i) { - *l_current_ptr = (*l_current_ptr - l_tccp->m_dc_level_shift) * (1 << 11); - ++l_current_ptr; - } - } - - ++l_img_comp; - ++l_tccp; - ++l_tile_comp; - } - - return OPJ_TRUE; -} - -static OPJ_BOOL opj_tcd_mct_encode(opj_tcd_t *p_tcd) -{ - opj_tcd_tile_t * l_tile = p_tcd->tcd_image->tiles; - opj_tcd_tilecomp_t * l_tile_comp = p_tcd->tcd_image->tiles->comps; - OPJ_SIZE_T samples = (OPJ_SIZE_T)(l_tile_comp->x1 - l_tile_comp->x0) * - (OPJ_SIZE_T)(l_tile_comp->y1 - l_tile_comp->y0); - OPJ_UINT32 i; - OPJ_BYTE ** l_data = 00; - opj_tcp_t * l_tcp = p_tcd->tcp; - - if (!p_tcd->tcp->mct) { - return OPJ_TRUE; - } - - if (p_tcd->tcp->mct == 2) { - if (! p_tcd->tcp->m_mct_coding_matrix) { - return OPJ_TRUE; - } - - l_data = (OPJ_BYTE **) opj_malloc(l_tile->numcomps * sizeof(OPJ_BYTE*)); - if (! l_data) { - return OPJ_FALSE; - } - - for (i = 0; i < l_tile->numcomps; ++i) { - l_data[i] = (OPJ_BYTE*) l_tile_comp->data; - ++l_tile_comp; - } - - if (! opj_mct_encode_custom(/* MCT data */ - (OPJ_BYTE*) p_tcd->tcp->m_mct_coding_matrix, - /* size of components */ - samples, - /* components */ - l_data, - /* nb of components (i.e. 
size of pData) */ - l_tile->numcomps, - /* tells if the data is signed */ - p_tcd->image->comps->sgnd)) { - opj_free(l_data); - return OPJ_FALSE; - } - - opj_free(l_data); - } else if (l_tcp->tccps->qmfbid == 0) { - opj_mct_encode_real(l_tile->comps[0].data, l_tile->comps[1].data, - l_tile->comps[2].data, samples); - } else { - opj_mct_encode(l_tile->comps[0].data, l_tile->comps[1].data, - l_tile->comps[2].data, samples); - } - - return OPJ_TRUE; -} - -static OPJ_BOOL opj_tcd_dwt_encode(opj_tcd_t *p_tcd) -{ - opj_tcd_tile_t * l_tile = p_tcd->tcd_image->tiles; - opj_tcd_tilecomp_t * l_tile_comp = p_tcd->tcd_image->tiles->comps; - opj_tccp_t * l_tccp = p_tcd->tcp->tccps; - OPJ_UINT32 compno; - - for (compno = 0; compno < l_tile->numcomps; ++compno) { - if (l_tccp->qmfbid == 1) { - if (! opj_dwt_encode(l_tile_comp)) { - return OPJ_FALSE; - } - } else if (l_tccp->qmfbid == 0) { - if (! opj_dwt_encode_real(l_tile_comp)) { - return OPJ_FALSE; - } - } - - ++l_tile_comp; - ++l_tccp; - } - - return OPJ_TRUE; -} - -static OPJ_BOOL opj_tcd_t1_encode(opj_tcd_t *p_tcd) -{ - opj_t1_t * l_t1; - const OPJ_FLOAT64 * l_mct_norms; - OPJ_UINT32 l_mct_numcomps = 0U; - opj_tcp_t * l_tcp = p_tcd->tcp; - - l_t1 = opj_t1_create(OPJ_TRUE); - if (l_t1 == 00) { - return OPJ_FALSE; - } - - if (l_tcp->mct == 1) { - l_mct_numcomps = 3U; - /* irreversible encoding */ - if (l_tcp->tccps->qmfbid == 0) { - l_mct_norms = opj_mct_get_mct_norms_real(); - } else { - l_mct_norms = opj_mct_get_mct_norms(); - } - } else { - l_mct_numcomps = p_tcd->image->numcomps; - l_mct_norms = (const OPJ_FLOAT64 *)(l_tcp->mct_norms); - } - - if (! opj_t1_encode_cblks(l_t1, p_tcd->tcd_image->tiles, l_tcp, l_mct_norms, - l_mct_numcomps)) { - opj_t1_destroy(l_t1); - return OPJ_FALSE; - } - - opj_t1_destroy(l_t1); - - return OPJ_TRUE; -} - -static OPJ_BOOL opj_tcd_t2_encode(opj_tcd_t *p_tcd, - OPJ_BYTE * p_dest_data, - OPJ_UINT32 * p_data_written, - OPJ_UINT32 p_max_dest_size, - opj_codestream_info_t *p_cstr_info, - opj_event_mgr_t *p_manager) -{ - opj_t2_t * l_t2; - - l_t2 = opj_t2_create(p_tcd->image, p_tcd->cp); - if (l_t2 == 00) { - return OPJ_FALSE; - } - - if (! opj_t2_encode_packets( - l_t2, - p_tcd->tcd_tileno, - p_tcd->tcd_image->tiles, - p_tcd->tcp->numlayers, - p_dest_data, - p_data_written, - p_max_dest_size, - p_cstr_info, - p_tcd->tp_num, - p_tcd->tp_pos, - p_tcd->cur_pino, - FINAL_PASS, - p_manager)) { - opj_t2_destroy(l_t2); - return OPJ_FALSE; - } - - opj_t2_destroy(l_t2); - - /*---------------CLEAN-------------------*/ - return OPJ_TRUE; -} - - -static OPJ_BOOL opj_tcd_rate_allocate_encode(opj_tcd_t *p_tcd, - OPJ_BYTE * p_dest_data, - OPJ_UINT32 p_max_dest_size, - opj_codestream_info_t *p_cstr_info, - opj_event_mgr_t *p_manager) -{ - opj_cp_t * l_cp = p_tcd->cp; - OPJ_UINT32 l_nb_written = 0; - - if (p_cstr_info) { - p_cstr_info->index_write = 0; - } - - if (l_cp->m_specific_param.m_enc.m_disto_alloc || - l_cp->m_specific_param.m_enc.m_fixed_quality) { - /* fixed_quality */ - /* Normal Rate/distortion allocation */ - if (! 
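/* These static helpers implement the encode pipeline in the order
 * opj_tcd_encode_tile() invokes them:
 *
 *   opj_tcd_dc_level_shift_encode()   DC shift (plus << 11 fixed-point
 *                                     scaling on the irreversible path)
 *   opj_tcd_mct_encode()              forward component transform
 *   opj_tcd_dwt_encode()              forward wavelet transform
 *   opj_tcd_t1_encode()               Tier-1 entropy coding
 *   opj_tcd_rate_allocate_encode()    rate/distortion allocation
 *   opj_tcd_t2_encode()               Tier-2 packet writing
 */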
opj_tcd_rateallocate(p_tcd, p_dest_data, &l_nb_written, p_max_dest_size, - p_cstr_info, p_manager)) { - return OPJ_FALSE; - } - } else { - /* Fixed layer allocation */ - opj_tcd_rateallocate_fixed(p_tcd); - } - - return OPJ_TRUE; -} - - -OPJ_BOOL opj_tcd_copy_tile_data(opj_tcd_t *p_tcd, - OPJ_BYTE * p_src, - OPJ_SIZE_T p_src_length) -{ - OPJ_UINT32 i; - OPJ_SIZE_T j; - OPJ_SIZE_T l_data_size = 0; - opj_image_comp_t * l_img_comp = 00; - opj_tcd_tilecomp_t * l_tilec = 00; - OPJ_UINT32 l_size_comp, l_remaining; - OPJ_SIZE_T l_nb_elem; - - l_data_size = opj_tcd_get_encoded_tile_size(p_tcd); - if (l_data_size != p_src_length) { - return OPJ_FALSE; - } - - l_tilec = p_tcd->tcd_image->tiles->comps; - l_img_comp = p_tcd->image->comps; - for (i = 0; i < p_tcd->image->numcomps; ++i) { - l_size_comp = l_img_comp->prec >> 3; /*(/ 8)*/ - l_remaining = l_img_comp->prec & 7; /* (%8) */ - l_nb_elem = (OPJ_SIZE_T)(l_tilec->x1 - l_tilec->x0) * - (OPJ_SIZE_T)(l_tilec->y1 - l_tilec->y0); - - if (l_remaining) { - ++l_size_comp; - } - - if (l_size_comp == 3) { - l_size_comp = 4; - } - - switch (l_size_comp) { - case 1: { - OPJ_CHAR * l_src_ptr = (OPJ_CHAR *) p_src; - OPJ_INT32 * l_dest_ptr = l_tilec->data; - - if (l_img_comp->sgnd) { - for (j = 0; j < l_nb_elem; ++j) { - *(l_dest_ptr++) = (OPJ_INT32)(*(l_src_ptr++)); - } - } else { - for (j = 0; j < l_nb_elem; ++j) { - *(l_dest_ptr++) = (*(l_src_ptr++)) & 0xff; - } - } - - p_src = (OPJ_BYTE*) l_src_ptr; - } - break; - case 2: { - OPJ_INT32 * l_dest_ptr = l_tilec->data; - OPJ_INT16 * l_src_ptr = (OPJ_INT16 *) p_src; - - if (l_img_comp->sgnd) { - for (j = 0; j < l_nb_elem; ++j) { - *(l_dest_ptr++) = (OPJ_INT32)(*(l_src_ptr++)); - } - } else { - for (j = 0; j < l_nb_elem; ++j) { - *(l_dest_ptr++) = (*(l_src_ptr++)) & 0xffff; - } - } - - p_src = (OPJ_BYTE*) l_src_ptr; - } - break; - case 4: { - OPJ_INT32 * l_src_ptr = (OPJ_INT32 *) p_src; - OPJ_INT32 * l_dest_ptr = l_tilec->data; - - for (j = 0; j < l_nb_elem; ++j) { - *(l_dest_ptr++) = (OPJ_INT32)(*(l_src_ptr++)); - } - - p_src = (OPJ_BYTE*) l_src_ptr; - } - break; - } - - ++l_img_comp; - ++l_tilec; - } - - return OPJ_TRUE; -} - -OPJ_BOOL opj_tcd_is_band_empty(opj_tcd_band_t* band) -{ - return (band->x1 - band->x0 == 0) || (band->y1 - band->y0 == 0); -} - -OPJ_BOOL opj_tcd_is_subband_area_of_interest(opj_tcd_t *tcd, - OPJ_UINT32 compno, - OPJ_UINT32 resno, - OPJ_UINT32 bandno, - OPJ_UINT32 band_x0, - OPJ_UINT32 band_y0, - OPJ_UINT32 band_x1, - OPJ_UINT32 band_y1) -{ - /* Note: those values for filter_margin are in part the result of */ - /* experimentation. The value 2 for QMFBID=1 (5x3 filter) can be linked */ - /* to the maximum left/right extension given in tables F.2 and F.3 of the */ - /* standard. The value 3 for QMFBID=0 (9x7 filter) is more suspicious, */ - /* since F.2 and F.3 would lead to 4 instead, so the current 3 might be */ - /* needed to be bumped to 4, in case inconsistencies are found while */ - /* decoding parts of irreversible coded images. */ - /* See opj_dwt_decode_partial_53 and opj_dwt_decode_partial_97 as well */ - OPJ_UINT32 filter_margin = (tcd->tcp->tccps[compno].qmfbid == 1) ? 
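/* opj_tcd_copy_tile_data() above widens source samples to OPJ_INT32:
 * signed components rely on C's integral promotion to sign-extend,
 * unsigned ones mask instead. For the 16-bit case:
 *
 *   signed:    *l_dest_ptr++ = (OPJ_INT32)(*l_src_ptr++);
 *   unsigned:  *l_dest_ptr++ = (*l_src_ptr++) & 0xffff;
 *
 * opj_tcd_update_tile_data() performs the matching narrowing on the
 * way back out.
 */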
2 : 3; - opj_tcd_tilecomp_t *tilec = &(tcd->tcd_image->tiles->comps[compno]); - opj_image_comp_t* image_comp = &(tcd->image->comps[compno]); - /* Compute the intersection of the area of interest, expressed in tile coordinates */ - /* with the tile coordinates */ - OPJ_UINT32 tcx0 = opj_uint_max( - (OPJ_UINT32)tilec->x0, - opj_uint_ceildiv(tcd->win_x0, image_comp->dx)); - OPJ_UINT32 tcy0 = opj_uint_max( - (OPJ_UINT32)tilec->y0, - opj_uint_ceildiv(tcd->win_y0, image_comp->dy)); - OPJ_UINT32 tcx1 = opj_uint_min( - (OPJ_UINT32)tilec->x1, - opj_uint_ceildiv(tcd->win_x1, image_comp->dx)); - OPJ_UINT32 tcy1 = opj_uint_min( - (OPJ_UINT32)tilec->y1, - opj_uint_ceildiv(tcd->win_y1, image_comp->dy)); - /* Compute number of decomposition for this band. See table F-1 */ - OPJ_UINT32 nb = (resno == 0) ? - tilec->numresolutions - 1 : - tilec->numresolutions - resno; - /* Map above tile-based coordinates to sub-band-based coordinates per */ - /* equation B-15 of the standard */ - OPJ_UINT32 x0b = bandno & 1; - OPJ_UINT32 y0b = bandno >> 1; - OPJ_UINT32 tbx0 = (nb == 0) ? tcx0 : - (tcx0 <= (1U << (nb - 1)) * x0b) ? 0 : - opj_uint_ceildivpow2(tcx0 - (1U << (nb - 1)) * x0b, nb); - OPJ_UINT32 tby0 = (nb == 0) ? tcy0 : - (tcy0 <= (1U << (nb - 1)) * y0b) ? 0 : - opj_uint_ceildivpow2(tcy0 - (1U << (nb - 1)) * y0b, nb); - OPJ_UINT32 tbx1 = (nb == 0) ? tcx1 : - (tcx1 <= (1U << (nb - 1)) * x0b) ? 0 : - opj_uint_ceildivpow2(tcx1 - (1U << (nb - 1)) * x0b, nb); - OPJ_UINT32 tby1 = (nb == 0) ? tcy1 : - (tcy1 <= (1U << (nb - 1)) * y0b) ? 0 : - opj_uint_ceildivpow2(tcy1 - (1U << (nb - 1)) * y0b, nb); - OPJ_BOOL intersects; - - if (tbx0 < filter_margin) { - tbx0 = 0; - } else { - tbx0 -= filter_margin; - } - if (tby0 < filter_margin) { - tby0 = 0; - } else { - tby0 -= filter_margin; - } - tbx1 = opj_uint_adds(tbx1, filter_margin); - tby1 = opj_uint_adds(tby1, filter_margin); - - intersects = band_x0 < tbx1 && band_y0 < tby1 && band_x1 > tbx0 && - band_y1 > tby0; - -#ifdef DEBUG_VERBOSE - printf("compno=%u resno=%u nb=%u bandno=%u x0b=%u y0b=%u band=%u,%u,%u,%u tb=%u,%u,%u,%u -> %u\n", - compno, resno, nb, bandno, x0b, y0b, - band_x0, band_y0, band_x1, band_y1, - tbx0, tby0, tbx1, tby1, intersects); -#endif - return intersects; -} - -/** Returns whether a tile componenent is fully decoded, taking into account - * p_tcd->win_* members. - * - * @param p_tcd TCD handle. 
- * @param compno Component number - * @return OPJ_TRUE whether the tile componenent is fully decoded - */ -static OPJ_BOOL opj_tcd_is_whole_tilecomp_decoding(opj_tcd_t *p_tcd, - OPJ_UINT32 compno) -{ - opj_tcd_tilecomp_t* tilec = &(p_tcd->tcd_image->tiles->comps[compno]); - opj_image_comp_t* image_comp = &(p_tcd->image->comps[compno]); - /* Compute the intersection of the area of interest, expressed in tile coordinates */ - /* with the tile coordinates */ - OPJ_UINT32 tcx0 = opj_uint_max( - (OPJ_UINT32)tilec->x0, - opj_uint_ceildiv(p_tcd->win_x0, image_comp->dx)); - OPJ_UINT32 tcy0 = opj_uint_max( - (OPJ_UINT32)tilec->y0, - opj_uint_ceildiv(p_tcd->win_y0, image_comp->dy)); - OPJ_UINT32 tcx1 = opj_uint_min( - (OPJ_UINT32)tilec->x1, - opj_uint_ceildiv(p_tcd->win_x1, image_comp->dx)); - OPJ_UINT32 tcy1 = opj_uint_min( - (OPJ_UINT32)tilec->y1, - opj_uint_ceildiv(p_tcd->win_y1, image_comp->dy)); - - OPJ_UINT32 shift = tilec->numresolutions - tilec->minimum_num_resolutions; - /* Tolerate small margin within the reduced resolution factor to consider if */ - /* the whole tile path must be taken */ - return (tcx0 >= (OPJ_UINT32)tilec->x0 && - tcy0 >= (OPJ_UINT32)tilec->y0 && - tcx1 <= (OPJ_UINT32)tilec->x1 && - tcy1 <= (OPJ_UINT32)tilec->y1 && - (shift >= 32 || - (((tcx0 - (OPJ_UINT32)tilec->x0) >> shift) == 0 && - ((tcy0 - (OPJ_UINT32)tilec->y0) >> shift) == 0 && - (((OPJ_UINT32)tilec->x1 - tcx1) >> shift) == 0 && - (((OPJ_UINT32)tilec->y1 - tcy1) >> shift) == 0))); -} diff --git a/src/3rd/LibOpenJpeg/tcd.h b/src/3rd/LibOpenJpeg/tcd.h deleted file mode 100644 index e3214c1d..00000000 --- a/src/3rd/LibOpenJpeg/tcd.h +++ /dev/null @@ -1,486 +0,0 @@ -/* - * The copyright in this software is being made available under the 2-clauses - * BSD License, included below. This software may be subject to other third - * party and contributor rights, including patent rights, and no such rights - * are granted under this license. - * - * Copyright (c) 2002-2014, Universite catholique de Louvain (UCL), Belgium - * Copyright (c) 2002-2014, Professor Benoit Macq - * Copyright (c) 2001-2003, David Janssens - * Copyright (c) 2002-2003, Yannick Verschueren - * Copyright (c) 2003-2007, Francois-Olivier Devaux - * Copyright (c) 2003-2014, Antonin Descampe - * Copyright (c) 2005, Herve Drolon, FreeImage Team - * Copyright (c) 2008, 2011-2012, Centre National d'Etudes Spatiales (CNES), FR - * Copyright (c) 2012, CS Systemes d'Information, France - * Copyright (c) 2017, IntoPIX SA - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS' - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ -#ifndef OPJ_TCD_H -#define OPJ_TCD_H -/** -@file tcd.h -@brief Implementation of a tile coder/decoder (TCD) - -The functions in TCD.C encode or decode each tile independently from -each other. The functions in TCD.C are used by other functions in J2K.C. -*/ - -/** @defgroup TCD TCD - Implementation of a tile coder/decoder */ -/*@{*/ - - -/** -FIXME DOC -*/ -typedef struct opj_tcd_pass { - OPJ_UINT32 rate; - OPJ_FLOAT64 distortiondec; - OPJ_UINT32 len; - OPJ_BITFIELD term : 1; -} opj_tcd_pass_t; - -/** -FIXME DOC -*/ -typedef struct opj_tcd_layer { - OPJ_UINT32 numpasses; /* Number of passes in the layer */ - OPJ_UINT32 len; /* len of information */ - OPJ_FLOAT64 disto; /* add for index (Cfr. Marcela) */ - OPJ_BYTE *data; /* data */ -} opj_tcd_layer_t; - -/** -FIXME DOC -*/ -typedef struct opj_tcd_cblk_enc { - OPJ_BYTE* data; /* Data */ - opj_tcd_layer_t* layers; /* layer information */ - opj_tcd_pass_t* passes; /* information about the passes */ - OPJ_INT32 x0, y0, x1, - y1; /* dimension of the code-blocks : left upper corner (x0, y0) right low corner (x1,y1) */ - OPJ_UINT32 numbps; - OPJ_UINT32 numlenbits; - OPJ_UINT32 data_size; /* Size of allocated data buffer */ - OPJ_UINT32 - numpasses; /* number of pass already done for the code-blocks */ - OPJ_UINT32 numpassesinlayers; /* number of passes in the layer */ - OPJ_UINT32 totalpasses; /* total number of passes */ -} opj_tcd_cblk_enc_t; - - -/** Chunk of codestream data that is part of a code block */ -typedef struct opj_tcd_seg_data_chunk { - /* Point to tilepart buffer. We don't make a copy ! - So the tilepart buffer must be kept alive - as long as we need to decode the codeblocks */ - OPJ_BYTE * data; - OPJ_UINT32 len; /* Usable length of data */ -} opj_tcd_seg_data_chunk_t; - -/** Segment of a code-block. - * A segment represent a number of consecutive coding passes, without termination - * of MQC or RAW between them. */ -typedef struct opj_tcd_seg { - OPJ_UINT32 len; /* Size of data related to this segment */ - /* Number of passes decoded. Including those that we skip */ - OPJ_UINT32 numpasses; - /* Number of passes actually to be decoded. To be used for code-block decoding */ - OPJ_UINT32 real_num_passes; - /* Maximum number of passes for this segment */ - OPJ_UINT32 maxpasses; - /* Number of new passes for current packed. Transitory value */ - OPJ_UINT32 numnewpasses; - /* Codestream length for this segment for current packed. Transitory value */ - OPJ_UINT32 newlen; -} opj_tcd_seg_t; - -/** Code-block for decoding */ -typedef struct opj_tcd_cblk_dec { - opj_tcd_seg_t* segs; /* segments information */ - opj_tcd_seg_data_chunk_t* chunks; /* Array of chunks */ - /* position of the code-blocks : left upper corner (x0, y0) right low corner (x1,y1) */ - OPJ_INT32 x0, y0, x1, y1; - OPJ_UINT32 numbps; - /* number of bits for len, for the current packet. Transitory value */ - OPJ_UINT32 numlenbits; - /* number of pass added to the code-blocks, for the current packet. 
Transitory value */ - OPJ_UINT32 numnewpasses; - /* number of segments, including those of packet we skip */ - OPJ_UINT32 numsegs; - /* number of segments, to be used for code block decoding */ - OPJ_UINT32 real_num_segs; - OPJ_UINT32 m_current_max_segs; /* allocated number of segs[] items */ - OPJ_UINT32 numchunks; /* Number of valid chunks items */ - OPJ_UINT32 numchunksalloc; /* Number of chunks item allocated */ - /* Decoded code-block. Only used for subtile decoding. Otherwise tilec->data is directly updated */ - OPJ_INT32* decoded_data; -} opj_tcd_cblk_dec_t; - -/** Precinct structure */ -typedef struct opj_tcd_precinct { - /* dimension of the precinct : left upper corner (x0, y0) right low corner (x1,y1) */ - OPJ_INT32 x0, y0, x1, y1; - OPJ_UINT32 cw, ch; /* number of code-blocks, in width and height */ - union { /* code-blocks information */ - opj_tcd_cblk_enc_t* enc; - opj_tcd_cblk_dec_t* dec; - void* blocks; - } cblks; - OPJ_UINT32 block_size; /* size taken by cblks (in bytes) */ - opj_tgt_tree_t *incltree; /* inclusion tree */ - opj_tgt_tree_t *imsbtree; /* IMSB tree */ -} opj_tcd_precinct_t; - -/** Sub-band structure */ -typedef struct opj_tcd_band { - /* dimension of the subband : left upper corner (x0, y0) right low corner (x1,y1) */ - OPJ_INT32 x0, y0, x1, y1; - /* band number: for lowest resolution level (0=LL), otherwise (1=HL, 2=LH, 3=HH) */ - OPJ_UINT32 bandno; - /* precinct information */ - opj_tcd_precinct_t *precincts; - /* size of data taken by precincts */ - OPJ_UINT32 precincts_data_size; - OPJ_INT32 numbps; - OPJ_FLOAT32 stepsize; -} opj_tcd_band_t; - -/** Tile-component resolution structure */ -typedef struct opj_tcd_resolution { - /* dimension of the resolution level : left upper corner (x0, y0) right low corner (x1,y1) */ - OPJ_INT32 x0, y0, x1, y1; - /* number of precincts, in width and height, for this resolution level */ - OPJ_UINT32 pw, ph; - /* number of sub-bands for the resolution level (1 for lowest resolution level, 3 otherwise) */ - OPJ_UINT32 numbands; - /* subband information */ - opj_tcd_band_t bands[3]; - - /* dimension of the resolution limited to window of interest. Only valid if tcd->whole_tile_decoding is set */ - OPJ_UINT32 win_x0; - OPJ_UINT32 win_y0; - OPJ_UINT32 win_x1; - OPJ_UINT32 win_y1; -} opj_tcd_resolution_t; - -/** Tile-component structure */ -typedef struct opj_tcd_tilecomp { - /* dimension of component : left upper corner (x0, y0) right low corner (x1,y1) */ - OPJ_INT32 x0, y0, x1, y1; - /* component number */ - OPJ_UINT32 compno; - /* number of resolutions level */ - OPJ_UINT32 numresolutions; - /* number of resolutions level to decode (at max)*/ - OPJ_UINT32 minimum_num_resolutions; - /* resolutions information */ - opj_tcd_resolution_t *resolutions; - /* size of data for resolutions (in bytes) */ - OPJ_UINT32 resolutions_size; - - /* data of the component. For decoding, only valid if tcd->whole_tile_decoding is set (so exclusive of data_win member) */ - OPJ_INT32 *data; - /* if true, then need to free after usage, otherwise do not free */ - OPJ_BOOL ownsData; - /* we may either need to allocate this amount of data, or re-use image data and ignore this value */ - size_t data_size_needed; - /* size of the data of the component */ - size_t data_size; - - /** data of the component limited to window of interest. Only valid for decoding and if tcd->whole_tile_decoding is NOT set (so exclusive of data member) */ - OPJ_INT32 *data_win; - /* dimension of the component limited to window of interest. 
Only valid for decoding and if tcd->whole_tile_decoding is NOT set */ - OPJ_UINT32 win_x0; - OPJ_UINT32 win_y0; - OPJ_UINT32 win_x1; - OPJ_UINT32 win_y1; - - /* add fixed_quality */ - OPJ_INT32 numpix; -} opj_tcd_tilecomp_t; - - -/** -FIXME DOC -*/ -typedef struct opj_tcd_tile { - /* dimension of the tile : left upper corner (x0, y0) right low corner (x1,y1) */ - OPJ_INT32 x0, y0, x1, y1; - OPJ_UINT32 numcomps; /* number of components in tile */ - opj_tcd_tilecomp_t *comps; /* Components information */ - OPJ_INT32 numpix; /* add fixed_quality */ - OPJ_FLOAT64 distotile; /* add fixed_quality */ - OPJ_FLOAT64 distolayer[100]; /* add fixed_quality */ - OPJ_UINT32 packno; /* packet number */ -} opj_tcd_tile_t; - -/** -FIXME DOC -*/ -typedef struct opj_tcd_image { - opj_tcd_tile_t *tiles; /* Tiles information */ -} -opj_tcd_image_t; - - -/** -Tile coder/decoder -*/ -typedef struct opj_tcd { - /** Position of the tilepart flag in Progression order*/ - OPJ_INT32 tp_pos; - /** Tile part number*/ - OPJ_UINT32 tp_num; - /** Current tile part number*/ - OPJ_UINT32 cur_tp_num; - /** Total number of tileparts of the current tile*/ - OPJ_UINT32 cur_totnum_tp; - /** Current Packet iterator number */ - OPJ_UINT32 cur_pino; - /** info on each image tile */ - opj_tcd_image_t *tcd_image; - /** image header */ - opj_image_t *image; - /** coding parameters */ - opj_cp_t *cp; - /** coding/decoding parameters common to all tiles */ - opj_tcp_t *tcp; - /** current encoded/decoded tile */ - OPJ_UINT32 tcd_tileno; - /** tell if the tcd is a decoder. */ - OPJ_BITFIELD m_is_decoder : 1; - /** Thread pool */ - opj_thread_pool_t* thread_pool; - /** Coordinates of the window of interest, in grid reference space */ - OPJ_UINT32 win_x0; - OPJ_UINT32 win_y0; - OPJ_UINT32 win_x1; - OPJ_UINT32 win_y1; - /** Only valid for decoding. Whether the whole tile is decoded, or just the region in win_x0/win_y0/win_x1/win_y1 */ - OPJ_BOOL whole_tile_decoding; - /* Array of size image->numcomps indicating if a component must be decoded. NULL if all components must be decoded */ - OPJ_BOOL* used_component; -} opj_tcd_t; - -/** @name Exported functions */ -/*@{*/ -/* ----------------------------------------------------------------------- */ - -/** -Dump the content of a tcd structure -*/ -/*void tcd_dump(FILE *fd, opj_tcd_t *tcd, opj_tcd_image_t *img);*/ /* TODO MSD shoul use the new v2 structures */ - -/** -Create a new TCD handle -@param p_is_decoder FIXME DOC -@return Returns a new TCD handle if successful returns NULL otherwise -*/ -opj_tcd_t* opj_tcd_create(OPJ_BOOL p_is_decoder); - -/** -Destroy a previously created TCD handle -@param tcd TCD handle to destroy -*/ -void opj_tcd_destroy(opj_tcd_t *tcd); - -/** - * Initialize the tile coder and may reuse some memory. - * @param p_tcd TCD handle. - * @param p_image raw image. - * @param p_cp coding parameters. - * @param p_tp thread pool - * - * @return true if the encoding values could be set (false otherwise). -*/ -OPJ_BOOL opj_tcd_init(opj_tcd_t *p_tcd, - opj_image_t * p_image, - opj_cp_t * p_cp, - opj_thread_pool_t* p_tp); - -/** - * Allocates memory for decoding a specific tile. - * - * @param p_tcd the tile decoder. - * @param p_tile_no the index of the tile received in sequence. This not necessarily lead to the - * tile at index p_tile_no. - * @param p_manager the event manager. - * - * @return true if the remaining data is sufficient. 
- */ -OPJ_BOOL opj_tcd_init_decode_tile(opj_tcd_t *p_tcd, OPJ_UINT32 p_tile_no, - opj_event_mgr_t* p_manager); - -void opj_tcd_makelayer_fixed(opj_tcd_t *tcd, OPJ_UINT32 layno, - OPJ_UINT32 final); - -void opj_tcd_rateallocate_fixed(opj_tcd_t *tcd); - -void opj_tcd_makelayer(opj_tcd_t *tcd, - OPJ_UINT32 layno, - OPJ_FLOAT64 thresh, - OPJ_UINT32 final); - -OPJ_BOOL opj_tcd_rateallocate(opj_tcd_t *tcd, - OPJ_BYTE *dest, - OPJ_UINT32 * p_data_written, - OPJ_UINT32 len, - opj_codestream_info_t *cstr_info, - opj_event_mgr_t *p_manager); - -/** - * Gets the maximum tile size that will be taken by the tile once decoded. - */ -OPJ_UINT32 opj_tcd_get_decoded_tile_size(opj_tcd_t *p_tcd, - OPJ_BOOL take_into_account_partial_decoding); - -/** - * Encodes a tile from the raw image into the given buffer. - * @param p_tcd Tile Coder handle - * @param p_tile_no Index of the tile to encode. - * @param p_dest Destination buffer - * @param p_data_written pointer to an int that is incremented by the number of bytes really written on p_dest - * @param p_len Maximum length of the destination buffer - * @param p_cstr_info Codestream information structure - * @param p_manager the user event manager - * @return true if the coding is successful. -*/ -OPJ_BOOL opj_tcd_encode_tile(opj_tcd_t *p_tcd, - OPJ_UINT32 p_tile_no, - OPJ_BYTE *p_dest, - OPJ_UINT32 * p_data_written, - OPJ_UINT32 p_len, - struct opj_codestream_info *p_cstr_info, - opj_event_mgr_t *p_manager); - - -/** -Decode a tile from a buffer into a raw image -@param tcd TCD handle -@param win_x0 Upper left x of region to decode (in grid coordinates) -@param win_y0 Upper left y of region to decode (in grid coordinates) -@param win_x1 Lower right x of region to decode (in grid coordinates) -@param win_y1 Lower right y of region to decode (in grid coordinates) -@param numcomps_to_decode Size of the comps_indices array, or 0 if decoding all components. -@param comps_indices Array of numcomps values representing the indices - of the components to decode (relative to the - codestream, starting at 0). Or NULL if decoding all components. -@param src Source buffer -@param len Length of source buffer -@param tileno Number that identifies one of the tiles to be decoded -@param cstr_info FIXME DOC -@param manager the event manager. -*/ -OPJ_BOOL opj_tcd_decode_tile(opj_tcd_t *tcd, - OPJ_UINT32 win_x0, - OPJ_UINT32 win_y0, - OPJ_UINT32 win_x1, - OPJ_UINT32 win_y1, - OPJ_UINT32 numcomps_to_decode, - const OPJ_UINT32 *comps_indices, - OPJ_BYTE *src, - OPJ_UINT32 len, - OPJ_UINT32 tileno, - opj_codestream_index_t *cstr_info, - opj_event_mgr_t *manager); - - -/** - * Copies tile data from the system onto the given memory block. - */ -OPJ_BOOL opj_tcd_update_tile_data(opj_tcd_t *p_tcd, - OPJ_BYTE * p_dest, - OPJ_UINT32 p_dest_length); - -/** - * - */ -OPJ_SIZE_T opj_tcd_get_encoded_tile_size(opj_tcd_t *p_tcd); - -/** - * Initialize the tile coder and may reuse some memory. - * - * @param p_tcd TCD handle. - * @param p_tile_no current tile index to encode. - * @param p_manager the event manager. - * - * @return true if the encoding values could be set (false otherwise). -*/ -OPJ_BOOL opj_tcd_init_encode_tile(opj_tcd_t *p_tcd, - OPJ_UINT32 p_tile_no, opj_event_mgr_t* p_manager); - -/** - * Copies tile data from the given memory block onto the system.
- */ -OPJ_BOOL opj_tcd_copy_tile_data(opj_tcd_t *p_tcd, - OPJ_BYTE * p_src, - OPJ_SIZE_T p_src_length); - -/** - * Allocates tile component data - * - * - */ -OPJ_BOOL opj_alloc_tile_component_data(opj_tcd_tilecomp_t *l_tilec); - -/** Returns whether a sub-band is empty (i.e. whether it has a null area) - * @param band Sub-band handle. - * @return OPJ_TRUE if the sub-band is empty. - */ -OPJ_BOOL opj_tcd_is_band_empty(opj_tcd_band_t* band); - -/** Reinitialize a segment */ -void opj_tcd_reinit_segment(opj_tcd_seg_t* seg); - - -/** Returns whether a sub-band region contributes to the area of interest - * tcd->win_x0,tcd->win_y0,tcd->win_x1,tcd->win_y1. - * - * @param tcd TCD handle. - * @param compno Component number - * @param resno Resolution number - * @param bandno Band number (*not* band index, i.e. 0, 1, 2 or 3) - * @param x0 Upper left x in subband coordinates - * @param y0 Upper left y in subband coordinates - * @param x1 Lower right x in subband coordinates - * @param y1 Lower right y in subband coordinates - * @return OPJ_TRUE if the sub-band region contributes to the area of - * interest. - */ -OPJ_BOOL opj_tcd_is_subband_area_of_interest(opj_tcd_t *tcd, - OPJ_UINT32 compno, - OPJ_UINT32 resno, - OPJ_UINT32 bandno, - OPJ_UINT32 x0, - OPJ_UINT32 y0, - OPJ_UINT32 x1, - OPJ_UINT32 y1); - -/* ----------------------------------------------------------------------- */ -/*@}*/ - -/*@}*/ - -#endif /* OPJ_TCD_H */ diff --git a/src/3rd/LibOpenJpeg/test_sparse_array.c b/src/3rd/LibOpenJpeg/test_sparse_array.c deleted file mode 100644 index 8e136451..00000000 --- a/src/3rd/LibOpenJpeg/test_sparse_array.c +++ /dev/null @@ -1,174 +0,0 @@ -/* - * The copyright in this software is being made available under the 2-clauses - * BSD License, included below. This software may be subject to other third - * party and contributor rights, including patent rights, and no such rights - * are granted under this license. - * - * Copyright (c) 2017, IntoPix SA - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS' - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE.
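/* A call-order sketch (not from the original sources) for the decode side of
 * the tcd.h interface above. The inputs are assumed to come from the J2K
 * layer, which owns the parsed image header, the coding parameters and the
 * codestream buffer; here the decoding window is simply assumed to be the
 * full image grid. */
static OPJ_BOOL decode_one_tile_sketch(opj_image_t *image, opj_cp_t *cp,
                                       opj_thread_pool_t *tp,
                                       OPJ_BYTE *src, OPJ_UINT32 len,
                                       OPJ_UINT32 tile_no,
                                       opj_codestream_index_t *cstr_index,
                                       opj_event_mgr_t *manager)
{
    OPJ_BOOL ok = OPJ_FALSE;
    opj_tcd_t *tcd = opj_tcd_create(OPJ_TRUE);   /* OPJ_TRUE => decoder */
    if (!tcd) {
        return OPJ_FALSE;
    }
    if (opj_tcd_init(tcd, image, cp, tp) &&
        opj_tcd_init_decode_tile(tcd, tile_no, manager)) {
        ok = opj_tcd_decode_tile(tcd,
                                 image->x0, image->y0, image->x1, image->y1,
                                 0, NULL,        /* decode every component */
                                 src, len, tile_no, cstr_index, manager);
    }
    opj_tcd_destroy(tcd);
    return ok;
}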
- */ - -#undef NDEBUG - -#include "opj_includes.h" - -int main() -{ - OPJ_UINT32 i, j, w, h; - OPJ_INT32 buffer[ 99 * 101 ]; - OPJ_BOOL ret; - opj_sparse_array_int32_t* sa; - - sa = opj_sparse_array_int32_create(0, 1, 1, 1); - assert(sa == NULL); - opj_sparse_array_int32_free(sa); - - sa = opj_sparse_array_int32_create(1, 0, 1, 1); - assert(sa == NULL); - - sa = opj_sparse_array_int32_create(1, 1, 0, 1); - assert(sa == NULL); - - sa = opj_sparse_array_int32_create(1, 1, 1, 0); - assert(sa == NULL); - - sa = opj_sparse_array_int32_create(99, 101, ~0U, ~0U); - assert(sa == NULL); - - sa = opj_sparse_array_int32_create(99, 101, 15, 17); - opj_sparse_array_int32_free(sa); - - sa = opj_sparse_array_int32_create(99, 101, 15, 17); - ret = opj_sparse_array_int32_read(sa, 0, 0, 0, 1, buffer, 1, 1, OPJ_FALSE); - assert(!ret); - ret = opj_sparse_array_int32_read(sa, 0, 0, 1, 0, buffer, 1, 1, OPJ_FALSE); - assert(!ret); - ret = opj_sparse_array_int32_read(sa, 0, 0, 100, 1, buffer, 1, 1, OPJ_FALSE); - assert(!ret); - ret = opj_sparse_array_int32_read(sa, 0, 0, 1, 102, buffer, 1, 1, OPJ_FALSE); - assert(!ret); - ret = opj_sparse_array_int32_read(sa, 1, 0, 0, 1, buffer, 1, 1, OPJ_FALSE); - assert(!ret); - ret = opj_sparse_array_int32_read(sa, 0, 1, 1, 0, buffer, 1, 1, OPJ_FALSE); - assert(!ret); - ret = opj_sparse_array_int32_read(sa, 99, 101, 99, 101, buffer, 1, 1, - OPJ_FALSE); - assert(!ret); - - buffer[0] = 1; - ret = opj_sparse_array_int32_read(sa, 0, 0, 1, 1, buffer, 1, 1, OPJ_FALSE); - assert(ret); - assert(buffer[0] == 0); - - memset(buffer, 0xFF, sizeof(buffer)); - ret = opj_sparse_array_int32_read(sa, 0, 0, 99, 101, buffer, 1, 99, OPJ_FALSE); - assert(ret); - for (i = 0; i < 99 * 101; i++) { - assert(buffer[i] == 0); - } - - buffer[0] = 1; - ret = opj_sparse_array_int32_write(sa, 4, 5, 4 + 1, 5 + 1, buffer, 1, 1, - OPJ_FALSE); - assert(ret); - - buffer[0] = 2; - ret = opj_sparse_array_int32_write(sa, 4, 5, 4 + 1, 5 + 1, buffer, 1, 1, - OPJ_FALSE); - assert(ret); - - buffer[0] = 0; - buffer[1] = 0xFF; - ret = opj_sparse_array_int32_read(sa, 4, 5, 4 + 1, 5 + 1, buffer, 1, 1, - OPJ_FALSE); - assert(ret); - assert(buffer[0] == 2); - assert(buffer[1] == 0xFF); - - buffer[0] = 0xFF; - buffer[1] = 0xFF; - buffer[2] = 0xFF; - ret = opj_sparse_array_int32_read(sa, 4, 5, 4 + 1, 5 + 2, buffer, 0, 1, - OPJ_FALSE); - assert(ret); - assert(buffer[0] == 2); - assert(buffer[1] == 0); - assert(buffer[2] == 0xFF); - - buffer[0] = 3; - ret = opj_sparse_array_int32_write(sa, 4, 5, 4 + 1, 5 + 1, buffer, 0, 1, - OPJ_FALSE); - assert(ret); - - buffer[0] = 0; - buffer[1] = 0xFF; - ret = opj_sparse_array_int32_read(sa, 4, 5, 4 + 1, 5 + 1, buffer, 1, 1, - OPJ_FALSE); - assert(ret); - assert(buffer[0] == 3); - assert(buffer[1] == 0xFF); - - w = 15 + 1; - h = 17 + 1; - memset(buffer, 0xFF, sizeof(buffer)); - ret = opj_sparse_array_int32_read(sa, 2, 1, 2 + w, 1 + h, buffer, 1, w, - OPJ_FALSE); - assert(ret); - for (j = 0; j < h; j++) { - for (i = 0; i < w; i++) { - if (i == 4 - 2 && j == 5 - 1) { - assert(buffer[ j * w + i ] == 3); - } else { - assert(buffer[ j * w + i ] == 0); - } - } - } - - opj_sparse_array_int32_free(sa); - - - sa = opj_sparse_array_int32_create(99, 101, 15, 17); - memset(buffer, 0xFF, sizeof(buffer)); - ret = opj_sparse_array_int32_read(sa, 0, 0, 2, 1, buffer, 2, 4, OPJ_FALSE); - assert(ret); - assert(buffer[0] == 0); - assert(buffer[1] == -1); - assert(buffer[2] == 0); - - buffer[0] = 1; - buffer[2] = 3; - ret = opj_sparse_array_int32_write(sa, 0, 0, 2, 1, buffer, 2, 4, OPJ_FALSE); - assert(ret); - - 
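/* The write above used a column stride of 2: buffer[0] and buffer[2] went to
   grid positions (0,0) and (1,0), and buffer[1] was skipped. The strided read
   below must therefore return the two stored values at the same buffer
   offsets while leaving the hole, buffer[1], at its sentinel value (-1). */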
memset(buffer, 0xFF, sizeof(buffer)); - ret = opj_sparse_array_int32_read(sa, 0, 0, 2, 1, buffer, 2, 4, OPJ_FALSE); - assert(ret); - assert(buffer[0] == 1); - assert(buffer[1] == -1); - assert(buffer[2] == 3); - - opj_sparse_array_int32_free(sa); - - return 0; -} diff --git a/src/3rd/LibOpenJpeg/tgt.c b/src/3rd/LibOpenJpeg/tgt.c deleted file mode 100644 index 0cbad12c..00000000 --- a/src/3rd/LibOpenJpeg/tgt.c +++ /dev/null @@ -1,344 +0,0 @@ -/* - * The copyright in this software is being made available under the 2-clauses - * BSD License, included below. This software may be subject to other third - * party and contributor rights, including patent rights, and no such rights - * are granted under this license. - * - * Copyright (c) 2002-2014, Universite catholique de Louvain (UCL), Belgium - * Copyright (c) 2002-2014, Professor Benoit Macq - * Copyright (c) 2001-2003, David Janssens - * Copyright (c) 2002-2003, Yannick Verschueren - * Copyright (c) 2003-2007, Francois-Olivier Devaux - * Copyright (c) 2003-2014, Antonin Descampe - * Copyright (c) 2005, Herve Drolon, FreeImage Team - * Copyright (c) 2008, 2011-2012, Centre National d'Etudes Spatiales (CNES), FR - * Copyright (c) 2012, CS Systemes d'Information, France - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS' - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- */ - -#include "opj_includes.h" - -/* -========================================================== - Tag-tree coder interface -========================================================== -*/ - -opj_tgt_tree_t *opj_tgt_create(OPJ_UINT32 numleafsh, OPJ_UINT32 numleafsv, - opj_event_mgr_t *p_manager) -{ - OPJ_INT32 nplh[32]; - OPJ_INT32 nplv[32]; - opj_tgt_node_t *node = 00; - opj_tgt_node_t *l_parent_node = 00; - opj_tgt_node_t *l_parent_node0 = 00; - opj_tgt_tree_t *tree = 00; - OPJ_UINT32 i; - OPJ_INT32 j, k; - OPJ_UINT32 numlvls; - OPJ_UINT32 n; - - tree = (opj_tgt_tree_t *) opj_calloc(1, sizeof(opj_tgt_tree_t)); - if (!tree) { - opj_event_msg(p_manager, EVT_ERROR, "Not enough memory to create Tag-tree\n"); - return 00; - } - - tree->numleafsh = numleafsh; - tree->numleafsv = numleafsv; - - numlvls = 0; - nplh[0] = (OPJ_INT32)numleafsh; - nplv[0] = (OPJ_INT32)numleafsv; - tree->numnodes = 0; - do { - n = (OPJ_UINT32)(nplh[numlvls] * nplv[numlvls]); - nplh[numlvls + 1] = (nplh[numlvls] + 1) / 2; - nplv[numlvls + 1] = (nplv[numlvls] + 1) / 2; - tree->numnodes += n; - ++numlvls; - } while (n > 1); - - /* ADD */ - if (tree->numnodes == 0) { - opj_free(tree); - return 00; - } - - tree->nodes = (opj_tgt_node_t*) opj_calloc(tree->numnodes, - sizeof(opj_tgt_node_t)); - if (!tree->nodes) { - opj_event_msg(p_manager, EVT_ERROR, - "Not enough memory to create Tag-tree nodes\n"); - opj_free(tree); - return 00; - } - tree->nodes_size = tree->numnodes * (OPJ_UINT32)sizeof(opj_tgt_node_t); - - node = tree->nodes; - l_parent_node = &tree->nodes[tree->numleafsh * tree->numleafsv]; - l_parent_node0 = l_parent_node; - - for (i = 0; i < numlvls - 1; ++i) { - for (j = 0; j < nplv[i]; ++j) { - k = nplh[i]; - while (--k >= 0) { - node->parent = l_parent_node; - ++node; - if (--k >= 0) { - node->parent = l_parent_node; - ++node; - } - ++l_parent_node; - } - if ((j & 1) || j == nplv[i] - 1) { - l_parent_node0 = l_parent_node; - } else { - l_parent_node = l_parent_node0; - l_parent_node0 += nplh[i]; - } - } - } - node->parent = 0; - opj_tgt_reset(tree); - return tree; -} - -/** - * Reinitialises a tag-tree from an existing one. - * - * @param p_tree the tree to reinitialize. - * @param p_num_leafs_h the width of the array of leafs of the tree - * @param p_num_leafs_v the height of the array of leafs of the tree - * @return a new tag-tree if successful, NULL otherwise -*/ -opj_tgt_tree_t *opj_tgt_init(opj_tgt_tree_t * p_tree, OPJ_UINT32 p_num_leafs_h, - OPJ_UINT32 p_num_leafs_v, opj_event_mgr_t *p_manager) -{ - OPJ_INT32 l_nplh[32]; - OPJ_INT32 l_nplv[32]; - opj_tgt_node_t *l_node = 00; - opj_tgt_node_t *l_parent_node = 00; - opj_tgt_node_t *l_parent_node0 = 00; - OPJ_UINT32 i; - OPJ_INT32 j, k; - OPJ_UINT32 l_num_levels; - OPJ_UINT32 n; - OPJ_UINT32 l_node_size; - - if (! 
p_tree) { - return 00; - } - - if ((p_tree->numleafsh != p_num_leafs_h) || - (p_tree->numleafsv != p_num_leafs_v)) { - p_tree->numleafsh = p_num_leafs_h; - p_tree->numleafsv = p_num_leafs_v; - - l_num_levels = 0; - l_nplh[0] = (OPJ_INT32)p_num_leafs_h; - l_nplv[0] = (OPJ_INT32)p_num_leafs_v; - p_tree->numnodes = 0; - do { - n = (OPJ_UINT32)(l_nplh[l_num_levels] * l_nplv[l_num_levels]); - l_nplh[l_num_levels + 1] = (l_nplh[l_num_levels] + 1) / 2; - l_nplv[l_num_levels + 1] = (l_nplv[l_num_levels] + 1) / 2; - p_tree->numnodes += n; - ++l_num_levels; - } while (n > 1); - - /* ADD */ - if (p_tree->numnodes == 0) { - opj_tgt_destroy(p_tree); - return 00; - } - l_node_size = p_tree->numnodes * (OPJ_UINT32)sizeof(opj_tgt_node_t); - - if (l_node_size > p_tree->nodes_size) { - opj_tgt_node_t* new_nodes = (opj_tgt_node_t*) opj_realloc(p_tree->nodes, - l_node_size); - if (! new_nodes) { - opj_event_msg(p_manager, EVT_ERROR, - "Not enough memory to reinitialize the tag tree\n"); - opj_tgt_destroy(p_tree); - return 00; - } - p_tree->nodes = new_nodes; - memset(((char *) p_tree->nodes) + p_tree->nodes_size, 0, - l_node_size - p_tree->nodes_size); - p_tree->nodes_size = l_node_size; - } - l_node = p_tree->nodes; - l_parent_node = &p_tree->nodes[p_tree->numleafsh * p_tree->numleafsv]; - l_parent_node0 = l_parent_node; - - for (i = 0; i < l_num_levels - 1; ++i) { - for (j = 0; j < l_nplv[i]; ++j) { - k = l_nplh[i]; - while (--k >= 0) { - l_node->parent = l_parent_node; - ++l_node; - if (--k >= 0) { - l_node->parent = l_parent_node; - ++l_node; - } - ++l_parent_node; - } - if ((j & 1) || j == l_nplv[i] - 1) { - l_parent_node0 = l_parent_node; - } else { - l_parent_node = l_parent_node0; - l_parent_node0 += l_nplh[i]; - } - } - } - l_node->parent = 0; - } - opj_tgt_reset(p_tree); - - return p_tree; -} - -void opj_tgt_destroy(opj_tgt_tree_t *p_tree) -{ - if (! p_tree) { - return; - } - - if (p_tree->nodes) { - opj_free(p_tree->nodes); - p_tree->nodes = 00; - } - opj_free(p_tree); -} - -void opj_tgt_reset(opj_tgt_tree_t *p_tree) -{ - OPJ_UINT32 i; - opj_tgt_node_t * l_current_node = 00;; - - if (! 
p_tree) { - return; - } - - l_current_node = p_tree->nodes; - for (i = 0; i < p_tree->numnodes; ++i) { - l_current_node->value = 999; - l_current_node->low = 0; - l_current_node->known = 0; - ++l_current_node; - } -} - -void opj_tgt_setvalue(opj_tgt_tree_t *tree, OPJ_UINT32 leafno, OPJ_INT32 value) -{ - opj_tgt_node_t *node; - node = &tree->nodes[leafno]; - while (node && node->value > value) { - node->value = value; - node = node->parent; - } -} - -void opj_tgt_encode(opj_bio_t *bio, opj_tgt_tree_t *tree, OPJ_UINT32 leafno, - OPJ_INT32 threshold) -{ - opj_tgt_node_t *stk[31]; - opj_tgt_node_t **stkptr; - opj_tgt_node_t *node; - OPJ_INT32 low; - - stkptr = stk; - node = &tree->nodes[leafno]; - while (node->parent) { - *stkptr++ = node; - node = node->parent; - } - - low = 0; - for (;;) { - if (low > node->low) { - node->low = low; - } else { - low = node->low; - } - - while (low < threshold) { - if (low >= node->value) { - if (!node->known) { - opj_bio_write(bio, 1, 1); - node->known = 1; - } - break; - } - opj_bio_write(bio, 0, 1); - ++low; - } - - node->low = low; - if (stkptr == stk) { - break; - } - node = *--stkptr; - } -} - -OPJ_UINT32 opj_tgt_decode(opj_bio_t *bio, opj_tgt_tree_t *tree, - OPJ_UINT32 leafno, OPJ_INT32 threshold) -{ - opj_tgt_node_t *stk[31]; - opj_tgt_node_t **stkptr; - opj_tgt_node_t *node; - OPJ_INT32 low; - - stkptr = stk; - node = &tree->nodes[leafno]; - while (node->parent) { - *stkptr++ = node; - node = node->parent; - } - - low = 0; - for (;;) { - if (low > node->low) { - node->low = low; - } else { - low = node->low; - } - while (low < threshold && low < node->value) { - if (opj_bio_read(bio, 1)) { - node->value = low; - } else { - ++low; - } - } - node->low = low; - if (stkptr == stk) { - break; - } - node = *--stkptr; - } - - return (node->value < threshold) ? 1 : 0; -} diff --git a/src/3rd/LibOpenJpeg/tgt.h b/src/3rd/LibOpenJpeg/tgt.h deleted file mode 100644 index 9818208b..00000000 --- a/src/3rd/LibOpenJpeg/tgt.h +++ /dev/null @@ -1,148 +0,0 @@ -/* - * The copyright in this software is being made available under the 2-clauses - * BSD License, included below. This software may be subject to other third - * party and contributor rights, including patent rights, and no such rights - * are granted under this license. - * - * Copyright (c) 2002-2014, Universite catholique de Louvain (UCL), Belgium - * Copyright (c) 2002-2014, Professor Benoit Macq - * Copyright (c) 2001-2003, David Janssens - * Copyright (c) 2002-2003, Yannick Verschueren - * Copyright (c) 2003-2007, Francois-Olivier Devaux - * Copyright (c) 2003-2014, Antonin Descampe - * Copyright (c) 2005, Herve Drolon, FreeImage Team - * Copyright (c) 2008, Jerome Fimes, Communications & Systemes - * Copyright (c) 2011-2012, Centre National d'Etudes Spatiales (CNES), France - * Copyright (c) 2012, CS Systemes d'Information, France - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS' - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef OPJ_TGT_H -#define OPJ_TGT_H -/** -@file tgt.h -@brief Implementation of a tag-tree coder (TGT) - -The functions in TGT.C implement a tag-tree coder. The functions in TGT.C -are used by some functions in T2.C. -*/ - -/** @defgroup TGT TGT - Implementation of a tag-tree coder */ -/*@{*/ - -/** -Tag node -*/ -typedef struct opj_tgt_node { - struct opj_tgt_node *parent; - OPJ_INT32 value; - OPJ_INT32 low; - OPJ_UINT32 known; -} opj_tgt_node_t; - -/** -Tag tree -*/ -typedef struct opj_tgt_tree { - OPJ_UINT32 numleafsh; - OPJ_UINT32 numleafsv; - OPJ_UINT32 numnodes; - opj_tgt_node_t *nodes; - OPJ_UINT32 nodes_size; /* maximum size taken by nodes */ -} opj_tgt_tree_t; - - -/** @name Exported functions */ -/*@{*/ -/* ----------------------------------------------------------------------- */ -/** -Create a tag-tree -@param numleafsh Width of the array of leafs of the tree -@param numleafsv Height of the array of leafs of the tree -@param p_manager the event manager -@return Returns a new tag-tree if successful, returns NULL otherwise -*/ -opj_tgt_tree_t *opj_tgt_create(OPJ_UINT32 numleafsh, OPJ_UINT32 numleafsv, - opj_event_mgr_t *p_manager); - -/** - * Reinitialises a tag-tree from an existing one. - * - * @param p_tree the tree to reinitialize.
- * @param p_num_leafs_h the width of the array of leafs of the tree - * @param p_num_leafs_v the height of the array of leafs of the tree - * @param p_manager the event manager - * @return a new tag-tree if successful, NULL otherwise -*/ -opj_tgt_tree_t *opj_tgt_init(opj_tgt_tree_t * p_tree, - OPJ_UINT32 p_num_leafs_h, - OPJ_UINT32 p_num_leafs_v, opj_event_mgr_t *p_manager); -/** -Destroy a tag-tree, liberating memory -@param tree Tag-tree to destroy -*/ -void opj_tgt_destroy(opj_tgt_tree_t *tree); -/** -Reset a tag-tree (set all leaves to 0) -@param tree Tag-tree to reset -*/ -void opj_tgt_reset(opj_tgt_tree_t *tree); -/** -Set the value of a leaf of a tag-tree -@param tree Tag-tree to modify -@param leafno Number that identifies the leaf to modify -@param value New value of the leaf -*/ -void opj_tgt_setvalue(opj_tgt_tree_t *tree, - OPJ_UINT32 leafno, - OPJ_INT32 value); -/** -Encode the value of a leaf of the tag-tree up to a given threshold -@param bio Pointer to a BIO handle -@param tree Tag-tree to modify -@param leafno Number that identifies the leaf to encode -@param threshold Threshold to use when encoding value of the leaf -*/ -void opj_tgt_encode(opj_bio_t *bio, - opj_tgt_tree_t *tree, - OPJ_UINT32 leafno, - OPJ_INT32 threshold); -/** -Decode the value of a leaf of the tag-tree up to a given threshold -@param bio Pointer to a BIO handle -@param tree Tag-tree to decode -@param leafno Number that identifies the leaf to decode -@param threshold Threshold to use when decoding value of the leaf -@return Returns 1 if the node's value < threshold, returns 0 otherwise -*/ -OPJ_UINT32 opj_tgt_decode(opj_bio_t *bio, - opj_tgt_tree_t *tree, - OPJ_UINT32 leafno, - OPJ_INT32 threshold); -/* ----------------------------------------------------------------------- */ -/*@}*/ - -/*@}*/ - -#endif /* OPJ_TGT_H */ diff --git a/src/3rd/LibOpenJpeg/thread.c b/src/3rd/LibOpenJpeg/thread.c deleted file mode 100644 index af33c2c8..00000000 --- a/src/3rd/LibOpenJpeg/thread.c +++ /dev/null @@ -1,952 +0,0 @@ -/* - * The copyright in this software is being made available under the 2-clauses - * BSD License, included below. This software may be subject to other third - * party and contributor rights, including patent rights, and no such rights - * are granted under this license. - * - * Copyright (c) 2016, Even Rouault - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS' - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
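/* A round-trip sketch (not from the original sources) for the tag-tree coder
 * declared above. It leans on the internal bit-IO helpers from bio.h
 * (opj_bio_create, opj_bio_init_enc, opj_bio_flush, opj_bio_destroy), which
 * are not part of this diff, so treat those calls as assumptions. */
static void tgt_roundtrip_sketch(opj_event_mgr_t *manager)
{
    OPJ_BYTE buf[64];
    OPJ_UINT32 leafno;
    opj_tgt_tree_t *tree = opj_tgt_create(2, 2, manager); /* 2x2 leafs */
    opj_bio_t *bio = opj_bio_create();

    if (!tree || !bio) {
        opj_tgt_destroy(tree);
        opj_bio_destroy(bio);
        return;
    }
    /* Record a value per leaf; opj_tgt_setvalue() propagates the minimum
       towards the root, which is what opj_tgt_encode() exploits. */
    for (leafno = 0; leafno < 4; ++leafno) {
        opj_tgt_setvalue(tree, leafno, (OPJ_INT32)leafno);
    }
    /* Emit, for each leaf, the bits needed to locate its value below a
       threshold of 4; a decoder would mirror this with opj_tgt_decode(). */
    opj_bio_init_enc(bio, buf, (OPJ_UINT32)sizeof(buf));
    for (leafno = 0; leafno < 4; ++leafno) {
        opj_tgt_encode(bio, tree, leafno, 4);
    }
    opj_bio_flush(bio);

    opj_bio_destroy(bio);
    opj_tgt_destroy(tree);
}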
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#include - -#ifdef MUTEX_win32 - -/* Some versions of x86_64-w64-mingw32-gc -m32 resolve InterlockedCompareExchange() */ -/* as __sync_val_compare_and_swap_4 but fails to link it. As this protects against */ -/* a rather unlikely race, skip it */ -#if !(defined(__MINGW32__) && defined(__i386__)) -#define HAVE_INTERLOCKED_COMPARE_EXCHANGE 1 -#endif - -#include -#include - -#include "opj_includes.h" - -OPJ_BOOL OPJ_CALLCONV opj_has_thread_support(void) -{ - return OPJ_TRUE; -} - -int OPJ_CALLCONV opj_get_num_cpus(void) -{ - SYSTEM_INFO info; - DWORD dwNum; - GetSystemInfo(&info); - dwNum = info.dwNumberOfProcessors; - if (dwNum < 1) { - return 1; - } - return (int)dwNum; -} - -struct opj_mutex_t { - CRITICAL_SECTION cs; -}; - -opj_mutex_t* opj_mutex_create(void) -{ - opj_mutex_t* mutex = (opj_mutex_t*) opj_malloc(sizeof(opj_mutex_t)); - if (!mutex) { - return NULL; - } - InitializeCriticalSectionAndSpinCount(&(mutex->cs), 4000); - return mutex; -} - -void opj_mutex_lock(opj_mutex_t* mutex) -{ - EnterCriticalSection(&(mutex->cs)); -} - -void opj_mutex_unlock(opj_mutex_t* mutex) -{ - LeaveCriticalSection(&(mutex->cs)); -} - -void opj_mutex_destroy(opj_mutex_t* mutex) -{ - if (!mutex) { - return; - } - DeleteCriticalSection(&(mutex->cs)); - opj_free(mutex); -} - -struct opj_cond_waiter_list_t { - HANDLE hEvent; - struct opj_cond_waiter_list_t* next; -}; -typedef struct opj_cond_waiter_list_t opj_cond_waiter_list_t; - -struct opj_cond_t { - opj_mutex_t *internal_mutex; - opj_cond_waiter_list_t *waiter_list; -}; - -static DWORD TLSKey = 0; -static volatile LONG inTLSLockedSection = 0; -static volatile int TLSKeyInit = OPJ_FALSE; - -opj_cond_t* opj_cond_create(void) -{ - opj_cond_t* cond = (opj_cond_t*) opj_malloc(sizeof(opj_cond_t)); - if (!cond) { - return NULL; - } - - /* Make sure that the TLS key is allocated in a thread-safe way */ - /* We cannot use a global mutex/critical section since its creation itself would not be */ - /* thread-safe, so use InterlockedCompareExchange trick */ - while (OPJ_TRUE) { - -#if HAVE_INTERLOCKED_COMPARE_EXCHANGE - if (InterlockedCompareExchange(&inTLSLockedSection, 1, 0) == 0) -#endif - { - if (!TLSKeyInit) { - TLSKey = TlsAlloc(); - TLSKeyInit = OPJ_TRUE; - } -#if HAVE_INTERLOCKED_COMPARE_EXCHANGE - InterlockedCompareExchange(&inTLSLockedSection, 0, 1); -#endif - break; - } - } - - if (TLSKey == TLS_OUT_OF_INDEXES) { - opj_free(cond); - return NULL; - } - cond->internal_mutex = opj_mutex_create(); - if (cond->internal_mutex == NULL) { - opj_free(cond); - return NULL; - } - cond->waiter_list = NULL; - return cond; -} - -void opj_cond_wait(opj_cond_t* cond, opj_mutex_t* mutex) -{ - opj_cond_waiter_list_t* item; - HANDLE hEvent = (HANDLE) TlsGetValue(TLSKey); - if (hEvent == NULL) { - hEvent = CreateEvent(NULL, /* security attributes */ - 0, /* manual reset = no */ - 0, /* initial state = unsignaled */ - NULL /* no name */); - assert(hEvent); - - TlsSetValue(TLSKey, hEvent); - } - - /* Insert the waiter into the waiter 
list of the condition */ - opj_mutex_lock(cond->internal_mutex); - - item = (opj_cond_waiter_list_t*)opj_malloc(sizeof(opj_cond_waiter_list_t)); - assert(item != NULL); - - item->hEvent = hEvent; - item->next = cond->waiter_list; - - cond->waiter_list = item; - - opj_mutex_unlock(cond->internal_mutex); - - /* Release the client mutex before waiting for the event being signaled */ - opj_mutex_unlock(mutex); - - /* Ideally we would check that we do not get WAIT_FAILED but it is hard */ - /* to report a failure. */ - WaitForSingleObject(hEvent, INFINITE); - - /* Reacquire the client mutex */ - opj_mutex_lock(mutex); -} - -void opj_cond_signal(opj_cond_t* cond) -{ - opj_cond_waiter_list_t* psIter; - - /* Signal the first registered event, and remove it from the list */ - opj_mutex_lock(cond->internal_mutex); - - psIter = cond->waiter_list; - if (psIter != NULL) { - SetEvent(psIter->hEvent); - cond->waiter_list = psIter->next; - opj_free(psIter); - } - - opj_mutex_unlock(cond->internal_mutex); -} - -void opj_cond_destroy(opj_cond_t* cond) -{ - if (!cond) { - return; - } - opj_mutex_destroy(cond->internal_mutex); - assert(cond->waiter_list == NULL); - opj_free(cond); -} - -struct opj_thread_t { - opj_thread_fn thread_fn; - void* user_data; - HANDLE hThread; -}; - -unsigned int __stdcall opj_thread_callback_adapter(void *info) -{ - opj_thread_t* thread = (opj_thread_t*) info; - HANDLE hEvent = NULL; - - thread->thread_fn(thread->user_data); - - /* Free the handle possible allocated by a cond */ - while (OPJ_TRUE) { - /* Make sure TLSKey is not being created just at that moment... */ -#if HAVE_INTERLOCKED_COMPARE_EXCHANGE - if (InterlockedCompareExchange(&inTLSLockedSection, 1, 0) == 0) -#endif - { - if (TLSKeyInit) { - hEvent = (HANDLE) TlsGetValue(TLSKey); - } -#if HAVE_INTERLOCKED_COMPARE_EXCHANGE - InterlockedCompareExchange(&inTLSLockedSection, 0, 1); -#endif - break; - } - } - if (hEvent) { - CloseHandle(hEvent); - } - - return 0; -} - -opj_thread_t* opj_thread_create(opj_thread_fn thread_fn, void* user_data) -{ - opj_thread_t* thread; - - assert(thread_fn); - - thread = (opj_thread_t*) opj_malloc(sizeof(opj_thread_t)); - if (!thread) { - return NULL; - } - thread->thread_fn = thread_fn; - thread->user_data = user_data; - - thread->hThread = (HANDLE)_beginthreadex(NULL, 0, - opj_thread_callback_adapter, thread, 0, NULL); - - if (thread->hThread == NULL) { - opj_free(thread); - return NULL; - } - return thread; -} - -void opj_thread_join(opj_thread_t* thread) -{ - WaitForSingleObject(thread->hThread, INFINITE); - CloseHandle(thread->hThread); - - opj_free(thread); -} - -#elif MUTEX_pthread - -#include -#include -#include - -/* Moved after all system includes, and in particular pthread.h, so as to */ -/* avoid poisoning issuing with malloc() use in pthread.h with ulibc (#1013) */ -#include "opj_includes.h" - -OPJ_BOOL OPJ_CALLCONV opj_has_thread_support(void) -{ - return OPJ_TRUE; -} - -int OPJ_CALLCONV opj_get_num_cpus(void) -{ -#ifdef _SC_NPROCESSORS_ONLN - return (int)sysconf(_SC_NPROCESSORS_ONLN); -#else - return 1; -#endif -} - -struct opj_mutex_t { - pthread_mutex_t mutex; -}; - -opj_mutex_t* opj_mutex_create(void) -{ - opj_mutex_t* mutex = (opj_mutex_t*) opj_calloc(1U, sizeof(opj_mutex_t)); - if (mutex != NULL) { - if (pthread_mutex_init(&mutex->mutex, NULL) != 0) { - opj_free(mutex); - mutex = NULL; - } - } - return mutex; -} - -void opj_mutex_lock(opj_mutex_t* mutex) -{ - pthread_mutex_lock(&(mutex->mutex)); -} - -void opj_mutex_unlock(opj_mutex_t* mutex) -{ - 
pthread_mutex_unlock(&(mutex->mutex)); -} - -void opj_mutex_destroy(opj_mutex_t* mutex) -{ - if (!mutex) { - return; - } - pthread_mutex_destroy(&(mutex->mutex)); - opj_free(mutex); -} - -struct opj_cond_t { - pthread_cond_t cond; -}; - -opj_cond_t* opj_cond_create(void) -{ - opj_cond_t* cond = (opj_cond_t*) opj_malloc(sizeof(opj_cond_t)); - if (!cond) { - return NULL; - } - if (pthread_cond_init(&(cond->cond), NULL) != 0) { - opj_free(cond); - return NULL; - } - return cond; -} - -void opj_cond_wait(opj_cond_t* cond, opj_mutex_t* mutex) -{ - pthread_cond_wait(&(cond->cond), &(mutex->mutex)); -} - -void opj_cond_signal(opj_cond_t* cond) -{ - int ret = pthread_cond_signal(&(cond->cond)); - (void)ret; - assert(ret == 0); -} - -void opj_cond_destroy(opj_cond_t* cond) -{ - if (!cond) { - return; - } - pthread_cond_destroy(&(cond->cond)); - opj_free(cond); -} - - -struct opj_thread_t { - opj_thread_fn thread_fn; - void* user_data; - pthread_t thread; -}; - -static void* opj_thread_callback_adapter(void* info) -{ - opj_thread_t* thread = (opj_thread_t*) info; - thread->thread_fn(thread->user_data); - return NULL; -} - -opj_thread_t* opj_thread_create(opj_thread_fn thread_fn, void* user_data) -{ - pthread_attr_t attr; - opj_thread_t* thread; - - assert(thread_fn); - - thread = (opj_thread_t*) opj_malloc(sizeof(opj_thread_t)); - if (!thread) { - return NULL; - } - thread->thread_fn = thread_fn; - thread->user_data = user_data; - - pthread_attr_init(&attr); - pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); - if (pthread_create(&(thread->thread), &attr, - opj_thread_callback_adapter, (void *) thread) != 0) { - opj_free(thread); - return NULL; - } - return thread; -} - -void opj_thread_join(opj_thread_t* thread) -{ - void* status; - pthread_join(thread->thread, &status); - - opj_free(thread); -} - -#else -/* Stub implementation */ - -#include "opj_includes.h" - -OPJ_BOOL OPJ_CALLCONV opj_has_thread_support(void) -{ - return OPJ_FALSE; -} - -int OPJ_CALLCONV opj_get_num_cpus(void) -{ - return 1; -} - -opj_mutex_t* opj_mutex_create(void) -{ - return NULL; -} - -void opj_mutex_lock(opj_mutex_t* mutex) -{ - (void) mutex; -} - -void opj_mutex_unlock(opj_mutex_t* mutex) -{ - (void) mutex; -} - -void opj_mutex_destroy(opj_mutex_t* mutex) -{ - (void) mutex; -} - -opj_cond_t* opj_cond_create(void) -{ - return NULL; -} - -void opj_cond_wait(opj_cond_t* cond, opj_mutex_t* mutex) -{ - (void) cond; - (void) mutex; -} - -void opj_cond_signal(opj_cond_t* cond) -{ - (void) cond; -} - -void opj_cond_destroy(opj_cond_t* cond) -{ - (void) cond; -} - -opj_thread_t* opj_thread_create(opj_thread_fn thread_fn, void* user_data) -{ - (void) thread_fn; - (void) user_data; - return NULL; -} - -void opj_thread_join(opj_thread_t* thread) -{ - (void) thread; -} - -#endif - -typedef struct { - int key; - void* value; - opj_tls_free_func opj_free_func; -} opj_tls_key_val_t; - -struct opj_tls_t { - opj_tls_key_val_t* key_val; - int key_val_count; -}; - -static opj_tls_t* opj_tls_new(void) -{ - return (opj_tls_t*) opj_calloc(1, sizeof(opj_tls_t)); -} - -static void opj_tls_destroy(opj_tls_t* tls) -{ - int i; - if (!tls) { - return; - } - for (i = 0; i < tls->key_val_count; i++) { - if (tls->key_val[i].opj_free_func) { - tls->key_val[i].opj_free_func(tls->key_val[i].value); - } - } - opj_free(tls->key_val); - opj_free(tls); -} - -void* opj_tls_get(opj_tls_t* tls, int key) -{ - int i; - for (i = 0; i < tls->key_val_count; i++) { - if (tls->key_val[i].key == key) { - return tls->key_val[i].value; - } - } - return NULL; -} - 
-OPJ_BOOL opj_tls_set(opj_tls_t* tls, int key, void* value, - opj_tls_free_func opj_free_func) -{ - opj_tls_key_val_t* new_key_val; - int i; - - if (tls->key_val_count == INT_MAX) { - return OPJ_FALSE; - } - for (i = 0; i < tls->key_val_count; i++) { - if (tls->key_val[i].key == key) { - if (tls->key_val[i].opj_free_func) { - tls->key_val[i].opj_free_func(tls->key_val[i].value); - } - tls->key_val[i].value = value; - tls->key_val[i].opj_free_func = opj_free_func; - return OPJ_TRUE; - } - } - new_key_val = (opj_tls_key_val_t*) opj_realloc(tls->key_val, - ((size_t)tls->key_val_count + 1U) * sizeof(opj_tls_key_val_t)); - if (!new_key_val) { - return OPJ_FALSE; - } - tls->key_val = new_key_val; - new_key_val[tls->key_val_count].key = key; - new_key_val[tls->key_val_count].value = value; - new_key_val[tls->key_val_count].opj_free_func = opj_free_func; - tls->key_val_count ++; - return OPJ_TRUE; -} - - -typedef struct { - opj_job_fn job_fn; - void *user_data; -} opj_worker_thread_job_t; - -typedef struct { - opj_thread_pool_t *tp; - opj_thread_t *thread; - int marked_as_waiting; - - opj_mutex_t *mutex; - opj_cond_t *cond; -} opj_worker_thread_t; - -typedef enum { - OPJWTS_OK, - OPJWTS_STOP, - OPJWTS_ERROR -} opj_worker_thread_state; - -struct opj_job_list_t { - opj_worker_thread_job_t* job; - struct opj_job_list_t* next; -}; -typedef struct opj_job_list_t opj_job_list_t; - -struct opj_worker_thread_list_t { - opj_worker_thread_t* worker_thread; - struct opj_worker_thread_list_t* next; -}; -typedef struct opj_worker_thread_list_t opj_worker_thread_list_t; - -struct opj_thread_pool_t { - opj_worker_thread_t* worker_threads; - int worker_threads_count; - opj_cond_t* cond; - opj_mutex_t* mutex; - volatile opj_worker_thread_state state; - opj_job_list_t* job_queue; - volatile int pending_jobs_count; - opj_worker_thread_list_t* waiting_worker_thread_list; - int waiting_worker_thread_count; - opj_tls_t* tls; - int signaling_threshold; -}; - -static OPJ_BOOL opj_thread_pool_setup(opj_thread_pool_t* tp, int num_threads); -static opj_worker_thread_job_t* opj_thread_pool_get_next_job( - opj_thread_pool_t* tp, - opj_worker_thread_t* worker_thread, - OPJ_BOOL signal_job_finished); - -opj_thread_pool_t* opj_thread_pool_create(int num_threads) -{ - opj_thread_pool_t* tp; - - tp = (opj_thread_pool_t*) opj_calloc(1, sizeof(opj_thread_pool_t)); - if (!tp) { - return NULL; - } - tp->state = OPJWTS_OK; - - if (num_threads <= 0) { - tp->tls = opj_tls_new(); - if (!tp->tls) { - opj_free(tp); - tp = NULL; - } - return tp; - } - - tp->mutex = opj_mutex_create(); - if (!tp->mutex) { - opj_free(tp); - return NULL; - } - if (!opj_thread_pool_setup(tp, num_threads)) { - opj_thread_pool_destroy(tp); - return NULL; - } - return tp; -} - -static void opj_worker_thread_function(void* user_data) -{ - opj_worker_thread_t* worker_thread; - opj_thread_pool_t* tp; - opj_tls_t* tls; - OPJ_BOOL job_finished = OPJ_FALSE; - - worker_thread = (opj_worker_thread_t*) user_data; - tp = worker_thread->tp; - tls = opj_tls_new(); - - while (OPJ_TRUE) { - opj_worker_thread_job_t* job = opj_thread_pool_get_next_job(tp, worker_thread, - job_finished); - if (job == NULL) { - break; - } - - if (job->job_fn) { - job->job_fn(job->user_data, tls); - } - opj_free(job); - job_finished = OPJ_TRUE; - } - - opj_tls_destroy(tls); -} - -static OPJ_BOOL opj_thread_pool_setup(opj_thread_pool_t* tp, int num_threads) -{ - int i; - OPJ_BOOL bRet = OPJ_TRUE; - - assert(num_threads > 0); - - tp->cond = opj_cond_create(); - if (tp->cond == NULL) { - return 
OPJ_FALSE; - } - - tp->worker_threads = (opj_worker_thread_t*) opj_calloc((size_t)num_threads, - sizeof(opj_worker_thread_t)); - if (tp->worker_threads == NULL) { - return OPJ_FALSE; - } - tp->worker_threads_count = num_threads; - - for (i = 0; i < num_threads; i++) { - tp->worker_threads[i].tp = tp; - - tp->worker_threads[i].mutex = opj_mutex_create(); - if (tp->worker_threads[i].mutex == NULL) { - tp->worker_threads_count = i; - bRet = OPJ_FALSE; - break; - } - - tp->worker_threads[i].cond = opj_cond_create(); - if (tp->worker_threads[i].cond == NULL) { - opj_mutex_destroy(tp->worker_threads[i].mutex); - tp->worker_threads_count = i; - bRet = OPJ_FALSE; - break; - } - - tp->worker_threads[i].marked_as_waiting = OPJ_FALSE; - - tp->worker_threads[i].thread = opj_thread_create(opj_worker_thread_function, - &(tp->worker_threads[i])); - if (tp->worker_threads[i].thread == NULL) { - tp->worker_threads_count = i; - bRet = OPJ_FALSE; - break; - } - } - - /* Wait all threads to be started */ - /* printf("waiting for all threads to be started\n"); */ - opj_mutex_lock(tp->mutex); - while (tp->waiting_worker_thread_count < num_threads) { - opj_cond_wait(tp->cond, tp->mutex); - } - opj_mutex_unlock(tp->mutex); - /* printf("all threads started\n"); */ - - if (tp->state == OPJWTS_ERROR) { - bRet = OPJ_FALSE; - } - - return bRet; -} - -/* -void opj_waiting() -{ - printf("waiting!\n"); -} -*/ - -static opj_worker_thread_job_t* opj_thread_pool_get_next_job( - opj_thread_pool_t* tp, - opj_worker_thread_t* worker_thread, - OPJ_BOOL signal_job_finished) -{ - while (OPJ_TRUE) { - opj_job_list_t* top_job_iter; - - opj_mutex_lock(tp->mutex); - - if (signal_job_finished) { - signal_job_finished = OPJ_FALSE; - tp->pending_jobs_count --; - /*printf("tp=%p, remaining jobs: %d\n", tp, tp->pending_jobs_count);*/ - if (tp->pending_jobs_count <= tp->signaling_threshold) { - opj_cond_signal(tp->cond); - } - } - - if (tp->state == OPJWTS_STOP) { - opj_mutex_unlock(tp->mutex); - return NULL; - } - top_job_iter = tp->job_queue; - if (top_job_iter) { - opj_worker_thread_job_t* job; - tp->job_queue = top_job_iter->next; - - job = top_job_iter->job; - opj_mutex_unlock(tp->mutex); - opj_free(top_job_iter); - return job; - } - - /* opj_waiting(); */ - if (!worker_thread->marked_as_waiting) { - opj_worker_thread_list_t* item; - - worker_thread->marked_as_waiting = OPJ_TRUE; - tp->waiting_worker_thread_count ++; - assert(tp->waiting_worker_thread_count <= tp->worker_threads_count); - - item = (opj_worker_thread_list_t*) opj_malloc(sizeof(opj_worker_thread_list_t)); - if (item == NULL) { - tp->state = OPJWTS_ERROR; - opj_cond_signal(tp->cond); - - opj_mutex_unlock(tp->mutex); - return NULL; - } - - item->worker_thread = worker_thread; - item->next = tp->waiting_worker_thread_list; - tp->waiting_worker_thread_list = item; - } - - /* printf("signaling that worker thread is ready\n"); */ - opj_cond_signal(tp->cond); - - opj_mutex_lock(worker_thread->mutex); - opj_mutex_unlock(tp->mutex); - - /* printf("waiting for job\n"); */ - opj_cond_wait(worker_thread->cond, worker_thread->mutex); - - opj_mutex_unlock(worker_thread->mutex); - /* printf("got job\n"); */ - } -} - -OPJ_BOOL opj_thread_pool_submit_job(opj_thread_pool_t* tp, - opj_job_fn job_fn, - void* user_data) -{ - opj_worker_thread_job_t* job; - opj_job_list_t* item; - - if (tp->mutex == NULL) { - job_fn(user_data, tp->tls); - return OPJ_TRUE; - } - - job = (opj_worker_thread_job_t*)opj_malloc(sizeof(opj_worker_thread_job_t)); - if (job == NULL) { - return OPJ_FALSE; - } - 
job->job_fn = job_fn; - job->user_data = user_data; - - item = (opj_job_list_t*) opj_malloc(sizeof(opj_job_list_t)); - if (item == NULL) { - opj_free(job); - return OPJ_FALSE; - } - item->job = job; - - opj_mutex_lock(tp->mutex); - - tp->signaling_threshold = 100 * tp->worker_threads_count; - while (tp->pending_jobs_count > tp->signaling_threshold) { - /* printf("%d jobs enqueued. Waiting\n", tp->pending_jobs_count); */ - opj_cond_wait(tp->cond, tp->mutex); - /* printf("...%d jobs enqueued.\n", tp->pending_jobs_count); */ - } - - item->next = tp->job_queue; - tp->job_queue = item; - tp->pending_jobs_count ++; - - if (tp->waiting_worker_thread_list) { - opj_worker_thread_t* worker_thread; - opj_worker_thread_list_t* next; - opj_worker_thread_list_t* to_opj_free; - - worker_thread = tp->waiting_worker_thread_list->worker_thread; - - assert(worker_thread->marked_as_waiting); - worker_thread->marked_as_waiting = OPJ_FALSE; - - next = tp->waiting_worker_thread_list->next; - to_opj_free = tp->waiting_worker_thread_list; - tp->waiting_worker_thread_list = next; - tp->waiting_worker_thread_count --; - - opj_mutex_lock(worker_thread->mutex); - opj_mutex_unlock(tp->mutex); - opj_cond_signal(worker_thread->cond); - opj_mutex_unlock(worker_thread->mutex); - - opj_free(to_opj_free); - } else { - opj_mutex_unlock(tp->mutex); - } - - return OPJ_TRUE; -} - -void opj_thread_pool_wait_completion(opj_thread_pool_t* tp, - int max_remaining_jobs) -{ - if (tp->mutex == NULL) { - return; - } - - if (max_remaining_jobs < 0) { - max_remaining_jobs = 0; - } - opj_mutex_lock(tp->mutex); - tp->signaling_threshold = max_remaining_jobs; - while (tp->pending_jobs_count > max_remaining_jobs) { - /*printf("tp=%p, jobs before wait = %d, max_remaining_jobs = %d\n", tp, tp->pending_jobs_count, max_remaining_jobs);*/ - opj_cond_wait(tp->cond, tp->mutex); - /*printf("tp=%p, jobs after wait = %d\n", tp, tp->pending_jobs_count);*/ - } - opj_mutex_unlock(tp->mutex); -} - -int opj_thread_pool_get_thread_count(opj_thread_pool_t* tp) -{ - return tp->worker_threads_count; -} - -void opj_thread_pool_destroy(opj_thread_pool_t* tp) -{ - if (!tp) { - return; - } - if (tp->cond) { - int i; - opj_thread_pool_wait_completion(tp, 0); - - opj_mutex_lock(tp->mutex); - tp->state = OPJWTS_STOP; - opj_mutex_unlock(tp->mutex); - - for (i = 0; i < tp->worker_threads_count; i++) { - opj_mutex_lock(tp->worker_threads[i].mutex); - opj_cond_signal(tp->worker_threads[i].cond); - opj_mutex_unlock(tp->worker_threads[i].mutex); - opj_thread_join(tp->worker_threads[i].thread); - opj_cond_destroy(tp->worker_threads[i].cond); - opj_mutex_destroy(tp->worker_threads[i].mutex); - } - - opj_free(tp->worker_threads); - - while (tp->waiting_worker_thread_list != NULL) { - opj_worker_thread_list_t* next = tp->waiting_worker_thread_list->next; - opj_free(tp->waiting_worker_thread_list); - tp->waiting_worker_thread_list = next; - } - - opj_cond_destroy(tp->cond); - } - opj_mutex_destroy(tp->mutex); - opj_tls_destroy(tp->tls); - opj_free(tp); -} diff --git a/src/3rd/LibOpenJpeg/thread.h b/src/3rd/LibOpenJpeg/thread.h deleted file mode 100644 index c89e19b4..00000000 --- a/src/3rd/LibOpenJpeg/thread.h +++ /dev/null @@ -1,256 +0,0 @@ -/* - * The copyright in this software is being made available under the 2-clauses - * BSD License, included below. This software may be subject to other third - * party and contributor rights, including patent rights, and no such rights - * are granted under this license. - * - * Copyright (c) 2016, Even Rouault - * All rights reserved. 
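/* A minimal end-to-end sketch (not part of the original sources) of the pool
 * implemented above: submit one job per array element, then drain the queue.
 * With num_threads <= 0, opj_thread_pool_create() returns a dummy pool that
 * runs each job synchronously in the caller, so the same code also covers
 * builds without thread support. */
static void square_job(void *user_data, opj_tls_t *tls)
{
    int *value = (int *)user_data;
    (void)tls;                      /* per-worker TLS, unused here */
    *value = *value * *value;
}

static OPJ_BOOL square_all(int *values, int count)
{
    int i;
    opj_thread_pool_t *tp = opj_thread_pool_create(opj_get_num_cpus());
    if (!tp) {
        return OPJ_FALSE;
    }
    for (i = 0; i < count; i++) {
        if (!opj_thread_pool_submit_job(tp, square_job, &values[i])) {
            opj_thread_pool_destroy(tp);  /* destroy drains pending jobs */
            return OPJ_FALSE;
        }
    }
    /* Wait until no jobs remain, then tear the pool down. */
    opj_thread_pool_wait_completion(tp, 0);
    opj_thread_pool_destroy(tp);
    return OPJ_TRUE;
}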
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS' - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef THREAD_H -#define THREAD_H - -#include "openjpeg.h" - -/** -@file thread.h -@brief Thread API - -The functions in thread.c have for goal to manage mutex, conditions, thread -creation and thread pools that accept jobs. -*/ - -/** @defgroup THREAD THREAD - Mutex, conditions, threads and thread pools */ -/*@{*/ - -/** @name Mutex */ -/*@{*/ - -/** Opaque type for a mutex */ -typedef struct opj_mutex_t opj_mutex_t; - -/** Creates a mutex. - * @return the mutex or NULL in case of error (can for example happen if the library - * is built without thread support) - */ -opj_mutex_t* opj_mutex_create(void); - -/** Lock/acquire the mutex. - * @param mutex the mutex to acquire. - */ -void opj_mutex_lock(opj_mutex_t* mutex); - -/** Unlock/release the mutex. - * @param mutex the mutex to release. - */ -void opj_mutex_unlock(opj_mutex_t* mutex); - -/** Destroy a mutex - * @param mutex the mutex to destroy. - */ -void opj_mutex_destroy(opj_mutex_t* mutex); - -/*@}*/ - -/** @name Condition */ -/*@{*/ - -/** Opaque type for a condition */ -typedef struct opj_cond_t opj_cond_t; - -/** Creates a condition. - * @return the condition or NULL in case of error (can for example happen if the library - * is built without thread support) - */ -opj_cond_t* opj_cond_create(void); - -/** Wait for the condition to be signaled. - * The semantics is the same as the POSIX pthread_cond_wait. - * The provided mutex *must* be acquired before calling this function, and - * released afterwards. - * The mutex will be released by this function while it must wait for the condition - * and reacquired afterwards. - * In some particular situations, the function might return even if the condition is not signaled - * with opj_cond_signal(), hence the need to check with an application level - * mechanism. - * - * Waiting thread : - * \code - * opj_mutex_lock(mutex); - * while( !some_application_level_condition ) - * { - * opj_cond_wait(cond, mutex); - * } - * opj_mutex_unlock(mutex); - * \endcode - * - * Signaling thread : - * \code - * opj_mutex_lock(mutex); - * some_application_level_condition = TRUE; - * opj_cond_signal(cond); - * opj_mutex_unlock(mutex); - * \endcode - * - * @param cond the condition to wait. 
- * @param mutex the mutex (in acquired state before calling this function) - */ -void opj_cond_wait(opj_cond_t* cond, opj_mutex_t* mutex); - -/** Signal waiting threads on a condition. - * One of the thread waiting with opj_cond_wait() will be waken up. - * It is strongly advised that this call is done with the mutex that is used - * by opj_cond_wait(), in a acquired state. - * @param cond the condition to signal. - */ -void opj_cond_signal(opj_cond_t* cond); - -/** Destroy a condition - * @param cond the condition to destroy. - */ -void opj_cond_destroy(opj_cond_t* cond); - -/*@}*/ - -/** @name Thread */ -/*@{*/ - -/** Opaque type for a thread handle */ -typedef struct opj_thread_t opj_thread_t; - -/** User function to execute in a thread - * @param user_data user data provided with opj_thread_create() - */ -typedef void (*opj_thread_fn)(void* user_data); - -/** Creates a new thread. - * @param thread_fn Function to run in the new thread. - * @param user_data user data provided to the thread function. Might be NULL. - * @return a thread handle or NULL in case of failure (can for example happen if the library - * is built without thread support) - */ -opj_thread_t* opj_thread_create(opj_thread_fn thread_fn, void* user_data); - -/** Wait for a thread to be finished and release associated resources to the - * thread handle. - * @param thread the thread to wait for being finished. - */ -void opj_thread_join(opj_thread_t* thread); - -/*@}*/ - -/** @name Thread local storage */ -/*@{*/ -/** Opaque type for a thread local storage */ -typedef struct opj_tls_t opj_tls_t; - -/** Get a thread local value corresponding to the provided key. - * @param tls thread local storage handle - * @param key key whose value to retrieve. - * @return value associated with the key, or NULL is missing. - */ -void* opj_tls_get(opj_tls_t* tls, int key); - -/** Type of the function used to free a TLS value */ -typedef void (*opj_tls_free_func)(void* value); - -/** Set a thread local value corresponding to the provided key. - * @param tls thread local storage handle - * @param key key whose value to set. - * @param value value to set (may be NULL). - * @param free_func function to call currently installed value. - * @return OPJ_TRUE if successful. - */ -OPJ_BOOL opj_tls_set(opj_tls_t* tls, int key, void* value, - opj_tls_free_func free_func); - -/*@}*/ - -/** @name Thread pool */ -/*@{*/ - -/** Opaque type for a thread pool */ -typedef struct opj_thread_pool_t opj_thread_pool_t; - -/** Create a new thread pool. - * num_thread must nominally be >= 1 to create a real thread pool. If num_threads - * is negative or null, then a dummy thread pool will be created. All functions - * operating on the thread pool will work, but job submission will be run - * synchronously in the calling thread. - * - * @param num_threads the number of threads to allocate for this thread pool. - * @return a thread pool handle, or NULL in case of failure (can for example happen if the library - * is built without thread support) - */ -opj_thread_pool_t* opj_thread_pool_create(int num_threads); - -/** User function to execute in a thread - * @param user_data user data provided with opj_thread_create() - * @param tls handle to thread local storage - */ -typedef void (*opj_job_fn)(void* user_data, opj_tls_t* tls); - - -/** Submit a new job to be run by one of the thread in the thread pool. - * The job ( thread_fn, user_data ) will be added in the queue of jobs managed - * by the thread pool, and run by the first thread that is no longer busy. 
-
-/** @name Thread pool */
-/*@{*/
-
-/** Opaque type for a thread pool */
-typedef struct opj_thread_pool_t opj_thread_pool_t;
-
-/** Creates a new thread pool.
- * num_threads must nominally be >= 1 to create a real thread pool. If
- * num_threads is zero or negative, a dummy thread pool will be created: all
- * functions operating on the thread pool will still work, but submitted jobs
- * will be run synchronously in the calling thread.
- *
- * @param num_threads the number of threads to allocate for this thread pool.
- * @return a thread pool handle, or NULL in case of failure (which can happen,
- * for example, if the library is built without thread support).
- */
-opj_thread_pool_t* opj_thread_pool_create(int num_threads);
-
-/** User function to execute as a job in the thread pool.
- * @param user_data user data provided with opj_thread_pool_submit_job().
- * @param tls handle to thread local storage.
- */
-typedef void (*opj_job_fn)(void* user_data, opj_tls_t* tls);
-
-/** Submits a new job to be run by one of the threads in the thread pool.
- * The job (job_fn, user_data) will be added to the queue of jobs managed
- * by the thread pool, and run by the first thread that is no longer busy.
- *
- * @param tp the thread pool handle.
- * @param job_fn Function to run. Must not be NULL.
- * @param user_data User data provided to job_fn.
- * @return OPJ_TRUE if the job was successfully submitted.
- */
-OPJ_BOOL opj_thread_pool_submit_job(opj_thread_pool_t* tp, opj_job_fn job_fn,
-                                    void* user_data);
-
-/** Waits until no more than max_remaining_jobs jobs remain in the queue of
- * the thread pool. The aim of this function is to avoid submitting jobs
- * faster than the thread pool can process them, which could otherwise lead
- * to out-of-memory situations from too many queued job descriptions.
- *
- * @param tp the thread pool handle.
- * @param max_remaining_jobs maximum number of jobs allowed to be queued
- * without waiting.
- */
-void opj_thread_pool_wait_completion(opj_thread_pool_t* tp,
-                                     int max_remaining_jobs);
-
-/** Returns the number of threads associated with the thread pool.
- *
- * @param tp the thread pool handle.
- * @return number of threads associated with the thread pool.
- */
-int opj_thread_pool_get_thread_count(opj_thread_pool_t* tp);
-
-/** Destroys a thread pool.
- * @param tp the thread pool handle.
- */
-void opj_thread_pool_destroy(opj_thread_pool_t* tp);
-
-/*@}*/
-
-/*@}*/
-
-#endif /* THREAD_H */
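A minimal usage sketch of the pool API above. The job body, the job_data array and its length n are hypothetical, and the thread count of 4 is arbitrary:

    static void job(void* user_data, opj_tls_t* tls)
    {
        (void)tls;
        do_work(user_data);   /* hypothetical per-job work */
    }

    void run_jobs(void* job_data[], int n)
    {
        opj_thread_pool_t* tp = opj_thread_pool_create(4);
        if (tp == NULL) {
            return;   /* e.g. the library was built without thread support */
        }
        for (int i = 0; i < n; i++) {
            opj_thread_pool_submit_job(tp, job, job_data[i]);
        }
        opj_thread_pool_wait_completion(tp, 0);   /* 0: wait until the queue is drained */
        opj_thread_pool_destroy(tp);
    }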
diff --git a/src/3rd/LibOpenJpeg/tls_keys.h b/src/3rd/LibOpenJpeg/tls_keys.h
deleted file mode 100644
index 23f84754..00000000
--- a/src/3rd/LibOpenJpeg/tls_keys.h
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * The copyright in this software is being made available under the 2-clauses
- * BSD License, included below. This software may be subject to other third
- * party and contributor rights, including patent rights, and no such rights
- * are granted under this license.
- *
- * Copyright (c) 2016, Even Rouault
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS'
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef OPJ_TLS_KEYS_H
-#define OPJ_TLS_KEYS_H
-
-#define OPJ_TLS_KEY_T1 0
-
-#endif /* OPJ_TLS_KEYS_H */
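A sketch of how such a key is used with the TLS functions from thread.h inside a pool job. Here my_t1_t, my_t1_create and my_t1_destroy are hypothetical stand-ins for the per-thread coder state this key identifies:

    static void decode_job(void* user_data, opj_tls_t* tls)
    {
        my_t1_t* t1 = (my_t1_t*)opj_tls_get(tls, OPJ_TLS_KEY_T1);
        if (t1 == NULL) {
            t1 = my_t1_create();   /* created once per worker thread, then cached */
            opj_tls_set(tls, OPJ_TLS_KEY_T1, t1, (opj_tls_free_func)my_t1_destroy);
        }
        /* ... decode user_data with the cached t1 ... */
    }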
diff --git a/src/3rd/Simd/Avx1_32.vcxproj b/src/3rd/Simd/Avx1_32.vcxproj
deleted file mode 100644
index 6fbe2bbf..00000000
--- a/src/3rd/Simd/Avx1_32.vcxproj
+++ /dev/null
@@ -1,90 +0,0 @@
-[90 deleted lines: MSBuild static-library project Avx1_32 {D6E29B9B-EE2D-4339-8417-DCADC283884F}, Debug/Release Win32, defines _LIB, instruction set AdvancedVectorExtensions; XML markup lost in extraction]
\ No newline at end of file
diff --git a/src/3rd/Simd/Avx1_32.vcxproj.filters b/src/3rd/Simd/Avx1_32.vcxproj.filters
deleted file mode 100644
index 10c41368..00000000
--- a/src/3rd/Simd/Avx1_32.vcxproj.filters
+++ /dev/null
@@ -1,174 +0,0 @@
-[174 deleted lines: Visual Studio filter file grouping sources under "Avx1" and "Inc", filter GUIDs {347f368f-bb75-4710-9eeb-1d95bc36549e} and {58836233-75a5-429d-ba88-137d5581600a}; file entries lost in extraction]
\ No newline at end of file
diff --git a/src/3rd/Simd/Avx1_64.vcxproj b/src/3rd/Simd/Avx1_64.vcxproj
deleted file mode 100644
index 20f19f7d..00000000
--- a/src/3rd/Simd/Avx1_64.vcxproj
+++ /dev/null
@@ -1,90 +0,0 @@
-[90 deleted lines: MSBuild static-library project Avx1_64 {810B8E84-7F94-488A-A04A-820F598813B3}, Debug/Release x64, defines _LIB, instruction set AdvancedVectorExtensions; XML markup lost in extraction]
\ No newline at end of file
diff --git a/src/3rd/Simd/Avx1_64.vcxproj.filters b/src/3rd/Simd/Avx1_64.vcxproj.filters
deleted file mode 100644
index 10c41368..00000000
--- a/src/3rd/Simd/Avx1_64.vcxproj.filters
+++ /dev/null
@@ -1,174 +0,0 @@
-[174 deleted lines: same "Avx1"/"Inc" filter file as Avx1_32.vcxproj.filters (identical blob 10c41368)]
\ No newline at end of file
diff --git a/src/3rd/Simd/Avx2_32.vcxproj b/src/3rd/Simd/Avx2_32.vcxproj
deleted file mode 100644
index 82a6d0b9..00000000
--- a/src/3rd/Simd/Avx2_32.vcxproj
+++ /dev/null
@@ -1,155 +0,0 @@
-[155 deleted lines: MSBuild static-library project Avx2_32 {4C82474F-3B9A-4B92-AC74-2434CA1D5064}, Debug/Release Win32, defines _LIB, instruction set AdvancedVectorExtensions2; XML markup lost in extraction]
\ No newline at end of file
diff --git a/src/3rd/Simd/Avx2_32.vcxproj.filters b/src/3rd/Simd/Avx2_32.vcxproj.filters
deleted file mode 100644
index abf39be6..00000000
--- a/src/3rd/Simd/Avx2_32.vcxproj.filters
+++ /dev/null
@@ -1,369 +0,0 @@
-[369 deleted lines: Visual Studio filter file grouping sources under "Avx2" and "Inc", filter GUIDs {203c7d5c-0ce6-441c-8d15-65177604f3b2} and {d10e2ea2-1b27-485c-af9d-7912073dffaf}; file entries lost in extraction]
\ No newline at end of file
diff --git a/src/3rd/Simd/Avx2_64.vcxproj b/src/3rd/Simd/Avx2_64.vcxproj
deleted file mode 100644
index 28b50e65..00000000
--- a/src/3rd/Simd/Avx2_64.vcxproj
+++ /dev/null
@@ -1,155 +0,0 @@
-[155 deleted lines: MSBuild static-library project Avx2_64 {AC3D5666-4510-4C2C-9C53-2AD9A5B5F365}, Debug/Release x64, defines _LIB, instruction set AdvancedVectorExtensions2; XML markup lost in extraction]
\ No newline at end of file
diff --git a/src/3rd/Simd/Avx2_64.vcxproj.filters b/src/3rd/Simd/Avx2_64.vcxproj.filters
deleted file mode 100644
index abf39be6..00000000
--- a/src/3rd/Simd/Avx2_64.vcxproj.filters
+++ /dev/null
@@ -1,369 +0,0 @@
-[369 deleted lines: same "Avx2"/"Inc" filter file as Avx2_32.vcxproj.filters (identical blob abf39be6)]
\ No newline at end of file
diff --git a/src/3rd/Simd/Avx512bw_32.vcxproj b/src/3rd/Simd/Avx512bw_32.vcxproj
deleted file mode 100644
index fe3f8fcf..00000000
--- a/src/3rd/Simd/Avx512bw_32.vcxproj
+++ /dev/null
@@ -1,136 +0,0 @@
-[136 deleted lines: MSBuild static-library project Avx512bw_32 {3E24DA78-24E3-41D2-9066-54C75BE13B90}, Debug/Release Win32, defines _LIB, instruction set AdvancedVectorExtensions512; XML markup lost in extraction]
\ No newline at end of file
diff --git a/src/3rd/Simd/Avx512bw_32.vcxproj.filters b/src/3rd/Simd/Avx512bw_32.vcxproj.filters
deleted file mode 100644
index c02abe42..00000000
--- a/src/3rd/Simd/Avx512bw_32.vcxproj.filters
+++ /dev/null
@@ -1,312 +0,0 @@
-[312 deleted lines: Visual Studio filter file grouping sources under "Avx512bw" and "Inc", filter GUIDs {f3dc6602-6c7b-4f27-a975-ee218315a949} and {7b9dfadb-70b2-434d-8cce-95985b7cca91}; file entries lost in extraction]
\ No newline at end of file
diff --git a/src/3rd/Simd/Avx512bw_64.vcxproj b/src/3rd/Simd/Avx512bw_64.vcxproj
deleted file mode 100644
index f6161fd2..00000000
--- a/src/3rd/Simd/Avx512bw_64.vcxproj
+++ /dev/null
@@ -1,136 +0,0 @@
-[136 deleted lines: MSBuild static-library project Avx512bw_64 {7A379C6B-B694-40DE-8DE3-F23F5811F4DB}, Debug/Release x64, defines _LIB, instruction set AdvancedVectorExtensions512; XML markup lost in extraction]
\ No newline at end of file
diff --git a/src/3rd/Simd/Avx512bw_64.vcxproj.filters b/src/3rd/Simd/Avx512bw_64.vcxproj.filters
deleted file mode 100644
index c02abe42..00000000
--- a/src/3rd/Simd/Avx512bw_64.vcxproj.filters
+++ /dev/null
@@ -1,312 +0,0 @@
-[312 deleted lines: same "Avx512bw"/"Inc" filter file as Avx512bw_32.vcxproj.filters (identical blob c02abe42)]
\ No newline at end of file
diff --git a/src/3rd/Simd/Avx512f_32.vcxproj b/src/3rd/Simd/Avx512f_32.vcxproj
deleted file mode 100644
index 2219102d..00000000
--- a/src/3rd/Simd/Avx512f_32.vcxproj
+++ /dev/null
@@ -1,91 +0,0 @@
-[91 deleted lines: MSBuild static-library project Avx512f_32 {EAEF875C-436D-4760-8E30-87B334AFD979}, Debug/Release Win32, defines _LIB, instruction set AdvancedVectorExtensions512; XML markup lost in extraction]
\ No newline at end of file
diff --git a/src/3rd/Simd/Avx512f_32.vcxproj.filters b/src/3rd/Simd/Avx512f_32.vcxproj.filters
deleted file mode 100644
index 6d62d545..00000000
--- a/src/3rd/Simd/Avx512f_32.vcxproj.filters
+++ /dev/null
@@ -1,177 +0,0 @@
-[177 deleted lines: Visual Studio filter file grouping sources under "Avx512f" and "Inc", filter GUIDs {beb8718a-05f3-4410-81c5-c99a8ba4948f} and {2c2adfce-5e1a-4262-b985-daa3cb998619}; file entries lost in extraction]
\ No newline at end of file
diff --git a/src/3rd/Simd/Avx512f_64.vcxproj b/src/3rd/Simd/Avx512f_64.vcxproj
deleted file mode 100644
index 7224cc59..00000000
--- a/src/3rd/Simd/Avx512f_64.vcxproj
+++ /dev/null
@@ -1,91 +0,0 @@
-[91 deleted lines: MSBuild static-library project Avx512f_64 {EC9747D9-601D-4858-A3C2-5427BC6A75A1}, Debug/Release x64, defines _LIB, instruction set AdvancedVectorExtensions512; XML markup lost in extraction]
\ No newline at end of file
diff --git a/src/3rd/Simd/Avx512f_64.vcxproj.filters b/src/3rd/Simd/Avx512f_64.vcxproj.filters
deleted file mode 100644
index 6d62d545..00000000
--- a/src/3rd/Simd/Avx512f_64.vcxproj.filters
+++ /dev/null
@@ -1,177 +0,0 @@
-[177 deleted lines: same "Avx512f"/"Inc" filter file as Avx512f_32.vcxproj.filters (identical blob 6d62d545)]
\ No newline at end of file
diff --git a/src/3rd/Simd/Avx512vnni_32.vcxproj b/src/3rd/Simd/Avx512vnni_32.vcxproj
deleted file mode 100644
index 6e6cbf1c..00000000
--- a/src/3rd/Simd/Avx512vnni_32.vcxproj
+++ /dev/null
@@ -1,60 +0,0 @@
-[60 deleted lines: MSBuild static-library project Avx512vnni_32 {E89969DE-D5F1-44C5-81AF-A4283851090B}, Debug/Release Win32, defines _LIB, instruction set AdvancedVectorExtensions512; XML markup lost in extraction]
\ No newline at end of file
diff --git a/src/3rd/Simd/Avx512vnni_32.vcxproj.filters b/src/3rd/Simd/Avx512vnni_32.vcxproj.filters
deleted file mode 100644
index 90e2d9bd..00000000
--- a/src/3rd/Simd/Avx512vnni_32.vcxproj.filters
+++ /dev/null
@@ -1,84 +0,0 @@
-[84 deleted lines: Visual Studio filter file grouping sources under "Avx512vnni" and "Inc", filter GUIDs {caa88b94-eaf3-477c-bbfe-b544c73751d6} and {72188850-ff72-458f-a213-aebed33115b1}; file entries lost in extraction]
\ No newline at end of file
diff --git a/src/3rd/Simd/Avx512vnni_64.vcxproj b/src/3rd/Simd/Avx512vnni_64.vcxproj
deleted file mode 100644
index 3ad3ac4a..00000000
--- a/src/3rd/Simd/Avx512vnni_64.vcxproj
+++ /dev/null
@@ -1,60 +0,0 @@
-[60 deleted lines: MSBuild static-library project Avx512vnni_64 {54217234-7CFF-4E48-873F-37B4EF50C914}, Debug/Release x64, defines _LIB, instruction set AdvancedVectorExtensions512; XML markup lost in extraction]
\ No newline at end of file
diff --git a/src/3rd/Simd/Avx512vnni_64.vcxproj.filters b/src/3rd/Simd/Avx512vnni_64.vcxproj.filters
deleted file mode 100644
index 90e2d9bd..00000000
--- a/src/3rd/Simd/Avx512vnni_64.vcxproj.filters
+++ /dev/null
@@ -1,84 +0,0 @@
-[84 deleted lines: same "Avx512vnni"/"Inc" filter file as Avx512vnni_32.vcxproj.filters (identical blob 90e2d9bd)]
\ No newline at end of file
diff --git a/src/3rd/Simd/Base_32.vcxproj b/src/3rd/Simd/Base_32.vcxproj
deleted file mode 100644
index ce50d82f..00000000
--- a/src/3rd/Simd/Base_32.vcxproj
+++ /dev/null
@@ -1,159 +0,0 @@
-[159 deleted lines: MSBuild static-library project Base_32 {1622C4EF-06A4-4DAA-9631-5D71B32858A2}, Debug/Release Win32, defines _LIB, instruction set NoExtensions; XML markup lost in extraction]
\ No newline at end of file
diff --git a/src/3rd/Simd/Base_32.vcxproj.filters b/src/3rd/Simd/Base_32.vcxproj.filters
deleted file mode 100644
index 73f1c058..00000000
--- a/src/3rd/Simd/Base_32.vcxproj.filters
+++ /dev/null
@@ -1,381 +0,0 @@
-[381 deleted lines: Visual Studio filter file grouping sources under "Base" and "Inc", filter GUIDs {00b9149f-e127-44ae-b89f-a866edf8cf11} and {e965be5d-215b-4037-8839-9ae912bbb6fd}; file entries lost in extraction]
\ No newline at end of file
diff --git a/src/3rd/Simd/Base_64.vcxproj b/src/3rd/Simd/Base_64.vcxproj
deleted file mode 100644
index 61b30487..00000000
--- a/src/3rd/Simd/Base_64.vcxproj
+++ /dev/null
@@ -1,159 +0,0 @@
-[159 deleted lines: MSBuild static-library project Base_64 {627967C1-4623-479F-9F4E-83AF4F61FBCD}, Debug/Release x64, defines _LIB, instruction set NotSet; XML markup lost in extraction]
\ No newline at end of file
diff --git a/src/3rd/Simd/Base_64.vcxproj.filters b/src/3rd/Simd/Base_64.vcxproj.filters
deleted file mode 100644
index 73f1c058..00000000
--- a/src/3rd/Simd/Base_64.vcxproj.filters
+++ /dev/null
@@ -1,381 +0,0 @@
-[381 deleted lines: same "Base"/"Inc" filter file as Base_32.vcxproj.filters (identical blob 73f1c058)]
\ No newline at end of file
diff --git a/src/3rd/Simd/Msa_32.vcxproj b/src/3rd/Simd/Msa_32.vcxproj
deleted file mode 100644
index 17a9a7a6..00000000
--- a/src/3rd/Simd/Msa_32.vcxproj
+++ /dev/null
@@ -1,51 +0,0 @@
-[51 deleted lines: MSBuild static-library project Msa_32 {AF08F27A-49FE-4A7F-84CC-3DD0005863EF}, Debug/Release Win32, defines _LIB, instruction set NoExtensions; XML markup lost in extraction]
\ No newline at end of file
diff --git a/src/3rd/Simd/Msa_32.vcxproj.filters b/src/3rd/Simd/Msa_32.vcxproj.filters
deleted file mode 100644
index 011c11bc..00000000
--- a/src/3rd/Simd/Msa_32.vcxproj.filters
+++ /dev/null
@@ -1,57 +0,0 @@
-[57 deleted lines: Visual Studio filter file grouping sources under "Msa" and "Inc", filter GUIDs {bd884b7b-7ffd-4acc-b116-053af5c8ca50} and {e6e1621d-febc-48ae-9087-4b9f4f5842c8}; file entries lost in extraction]
\ No newline at end of file
diff --git a/src/3rd/Simd/Msa_64.vcxproj b/src/3rd/Simd/Msa_64.vcxproj
deleted file mode 100644
index 8a657f12..00000000
--- a/src/3rd/Simd/Msa_64.vcxproj
+++ /dev/null
@@ -1,51 +0,0 @@
-[51 deleted lines: MSBuild static-library project Msa_64 {B0558ECB-DF7F-4F5F-B6ED-9B11893AE476}, Debug/Release x64, defines _LIB, instruction set NotSet; XML markup lost in extraction]
\ No newline at end of file
diff --git a/src/3rd/Simd/Msa_64.vcxproj.filters b/src/3rd/Simd/Msa_64.vcxproj.filters
deleted file mode 100644
index 011c11bc..00000000
--- a/src/3rd/Simd/Msa_64.vcxproj.filters
+++ /dev/null
@@ -1,57 +0,0 @@
-[57 deleted lines: same "Msa"/"Inc" filter file as Msa_32.vcxproj.filters (identical blob 011c11bc)]
\ No newline at end of file
diff --git a/src/3rd/Simd/Neon_32.vcxproj b/src/3rd/Simd/Neon_32.vcxproj
deleted file mode 100644
index eef37ac9..00000000
--- a/src/3rd/Simd/Neon_32.vcxproj
+++ /dev/null
@@ -1,155 +0,0 @@
-[155 deleted lines: MSBuild static-library project Neon_32 {271C2D02-B19D-4193-8D83-CC0F06E75F3D}, Debug/Release Win32, defines _LIB, instruction set NoExtensions; XML markup lost in extraction]
\ No newline at end of file
diff --git a/src/3rd/Simd/Neon_32.vcxproj.filters b/src/3rd/Simd/Neon_32.vcxproj.filters
deleted file mode 100644
index 95970c3f..00000000
--- a/src/3rd/Simd/Neon_32.vcxproj.filters
+++ /dev/null
@@ -1,369 +0,0 @@
-[369 deleted lines: Visual Studio filter file grouping sources under "Neon" and "Inc", filter GUIDs {51c3a96e-009e-4de0-9c30-329e21957b9c} and {e31a5615-a021-4e0f-b8c4-b542d164bc82}; file entries lost in extraction]
\ No newline at end of file
diff --git a/src/3rd/Simd/Neon_64.vcxproj b/src/3rd/Simd/Neon_64.vcxproj
deleted file mode 100644
index 9404598a..00000000
--- a/src/3rd/Simd/Neon_64.vcxproj
+++ /dev/null
@@ -1,155 +0,0 @@
-[155 deleted lines: MSBuild static-library project Neon_64 {96467C7E-991C-4406-8304-4E08076A1A70}, Debug/Release x64, defines _LIB, instruction set NotSet; XML markup lost in extraction]
\ No newline at end of file
diff --git a/src/3rd/Simd/Neon_64.vcxproj.filters b/src/3rd/Simd/Neon_64.vcxproj.filters
deleted file mode 100644
index 95970c3f..00000000
--- a/src/3rd/Simd/Neon_64.vcxproj.filters
+++ /dev/null
@@ -1,369 +0,0 @@
-[369 deleted lines: same "Neon"/"Inc" filter file as Neon_32.vcxproj.filters (identical blob 95970c3f)]
\ No newline at end of file
diff --git a/src/3rd/Simd/Simd/SimdAllocator.hpp b/src/3rd/Simd/Simd/SimdAllocator.hpp
deleted file mode 100644
index fbb207db..00000000
--- a/src/3rd/Simd/Simd/SimdAllocator.hpp
+++ /dev/null
@@ -1,227 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#ifndef __SimdAllocator_hpp__
-#define __SimdAllocator_hpp__
-
-#include "Simd/SimdLib.h"
-
-#include <memory>
-
-namespace Simd
-{
-    /*! @ingroup cpp_allocator
-
-        \short Aligned memory allocator.
-
-        Performs allocation and deletion of aligned memory.
-
-        \note It can also be used as an allocator for STL containers.
-    */
-    template <class T> struct Allocator
-    {
-        /*!
-            \fn void * Allocate(size_t size, size_t align);
-
-            \short Allocates an aligned memory block.
-
-            \note The memory allocated by this function must be deleted by the function Simd::Allocator::Free.
-
-            \param [in] size - a size of the required memory block.
-            \param [in] align - an alignment of the allocated memory address.
-            \return a pointer to the allocated memory.
-        */
-        static SIMD_INLINE void * Allocate(size_t size, size_t align)
-        {
-#ifdef __SimdMemory_h__
-            return Simd::Allocate(size, align);
-#else
-            return SimdAllocate(size, align);
-#endif
-        }
-
-        /*!
-            \fn void Free(void * ptr);
-
-            \short Frees an aligned memory block.
-
-            \note This function frees memory allocated by the function Simd::Allocator::Allocate.
-
-            \param [in] ptr - a pointer to the memory to be deleted.
-        */
-        static SIMD_INLINE void Free(void * ptr)
-        {
-#ifdef __SimdMemory_h__
-            Simd::Free(ptr);
-#else
-            SimdFree(ptr);
-#endif
-        }
-
-        /*!
-            \fn size_t Align(size_t size, size_t align);
-
-            \short Gets an aligned size.
-
-            \param [in] size - an original size.
-            \param [in] align - a required alignment.
-
-            \return an aligned size.
-        */
-        static SIMD_INLINE size_t Align(size_t size, size_t align)
-        {
-#ifdef __SimdMemory_h__
-            return Simd::AlignHi(size, align);
-#else
-            return SimdAlign(size, align);
-#endif
-        }
-
-        /*!
-            \fn void * Align(void * ptr, size_t align);
-
-            \short Gets an aligned address.
-
-            \param [in] ptr - an original pointer.
-            \param [in] align - a required alignment.
-
-            \return an aligned address.
-        */
-        static SIMD_INLINE void * Align(void * ptr, size_t align)
-        {
-#ifdef __SimdMemory_h__
-            return Simd::AlignHi(ptr, align);
-#else
-            return (void *)SimdAlign((size_t)ptr, align);
-#endif
-        }
-
-        /*!
-            \fn size_t Alignment();
-
-            \short Gets the memory alignment required for the most productive work.
-
-            \return a required memory alignment.
-        */
-        static SIMD_INLINE size_t Alignment()
-        {
-#if defined(__SimdEnable_h__) && defined(WIN32)
-            return Simd::ALIGNMENT;
-#else
-            return SimdAlignment();
-#endif
-        }
-
-        //---------------------------------------------------------------------
-        // STL allocator interface implementation:
-
-        typedef T value_type;
-        typedef T * pointer;
-        typedef std::size_t size_type;
-        typedef std::ptrdiff_t difference_type;
-        typedef T & reference;
-        typedef const T & const_reference;
-        typedef const T * const_pointer;
-
-        template <typename U> struct rebind
-        {
-            typedef Allocator<U> other;
-        };
-
-        SIMD_INLINE Allocator()
-        {
-        }
-
-        template <typename U> SIMD_INLINE Allocator(const Allocator<U> & a)
-        {
-        }
-
-        SIMD_INLINE const_pointer address(const_reference value) const
-        {
-#if defined(SIMD_CPP_2011_ENABLE)
-            return std::addressof(value);
-#else
-            return (reinterpret_cast<const_pointer>(&const_cast<char&>(reinterpret_cast<const volatile char&>(value))));
-#endif
-        }
-
-        SIMD_INLINE pointer address(reference value) const
-        {
-#if defined(SIMD_CPP_2011_ENABLE)
-            return std::addressof(value);
-#else
-            return (reinterpret_cast<pointer>(&const_cast<char&>(reinterpret_cast<const volatile char&>(value))));
-#endif
-        }
-
-        SIMD_INLINE pointer allocate(size_type size, const void * ptr = NULL)
-        {
-            return static_cast<pointer>(Allocate(size * sizeof(T), Alignment()));
-        }
-
-        SIMD_INLINE size_type max_size() const
-        {
-            return ~static_cast<size_type>(0) / sizeof(T);
-        }
-
-        SIMD_INLINE void deallocate(pointer ptr, size_type size)
-        {
-            Free(ptr);
-        }
-
-        template <class U, class V> SIMD_INLINE void construct(U * ptr, const V & value)
-        {
-            ::new((void*)ptr) U(value);
-        }
-
-#if defined(SIMD_CPP_2011_ENABLE)
-        template <class U, class... Args> SIMD_INLINE void construct(U * ptr, Args &&... args)
-        {
-            ::new((void*)ptr) U(std::forward<Args>(args)...);
-        }
-#endif
-
-        template <class U> SIMD_INLINE void construct(U * ptr)
-        {
-            ::new((void*)ptr) U();
-        }
-
-        template <class U> SIMD_INLINE void destroy(U * ptr)
-        {
-            ptr->~U();
-        }
-    };
-
-    template <typename T1, typename T2> SIMD_INLINE bool operator == (const Allocator<T1> & a1, const Allocator<T2> & a2)
-    {
-        return true;
-    }
-
-    template <typename T1, typename T2> SIMD_INLINE bool operator != (const Allocator<T1> & a1, const Allocator<T2> & a2)
-    {
-        return false;
-    }
-}
-
-#endif//__SimdAllocator_hpp__
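Two usage sketches for the allocator above (the buffer length of 256 is arbitrary): direct aligned allocation paired with the matching Free(), and use as an STL container allocator so the container's storage is suitably aligned for SIMD loads and stores.

    #include <vector>
    #include "Simd/SimdAllocator.hpp"

    void allocator_demo()
    {
        // Direct use of the static helpers.
        size_t align = Simd::Allocator<float>::Alignment();
        float* p = (float*)Simd::Allocator<float>::Allocate(256 * sizeof(float), align);
        // ... use p ...
        Simd::Allocator<float>::Free(p);

        // As an STL allocator: element storage is aligned for SIMD.
        std::vector<float, Simd::Allocator<float> > v(256, 0.0f);
    }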
diff --git a/src/3rd/Simd/Simd/SimdArray.h b/src/3rd/Simd/Simd/SimdArray.h
deleted file mode 100644
index 95b8bd50..00000000
--- a/src/3rd/Simd/Simd/SimdArray.h
+++ /dev/null
@@ -1,134 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2020 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#ifndef __SimdArray_h__
-#define __SimdArray_h__
-
-#include "Simd/SimdMemory.h"
-#include "Simd/SimdMath.h"
-
-namespace Simd
-{
-    template <class T> struct Array
-    {
-        /* data and size are const to the outside; Resize/Swap update them via casts */
-        T * const data;
-        size_t const size;
-
-        SIMD_INLINE Array(size_t size_ = 0, bool clear = false, size_t align = SIMD_ALIGN)
-            : data(0)
-            , size(0)
-        {
-            Resize(size_, clear, align);
-        }
-
-        SIMD_INLINE ~Array()
-        {
-            if (data)
-                Simd::Free(data);
-        }
-
-        SIMD_INLINE void Resize(size_t size_, bool clear = false, size_t align = SIMD_ALIGN)
-        {
-            if (size_ != size)
-            {
-                if (data)
-                {
-                    Simd::Free(data);
-                    *(T**)&data = 0;
-                }
-                *(size_t*)&size = size_;
-                if (size_)
-                    *(T**)&data = (T*)Simd::Allocate(size * sizeof(T), align);
-            }
-            if (clear)
-                Clear();
-        }
-
-        SIMD_INLINE void Clear()
-        {
-            ::memset(data, 0, size * sizeof(T));
-        }
-
-        SIMD_INLINE void Swap(const Array & array)
-        {
-            Simd::Swap((T*&)data, (T*&)(array.data));
-            Simd::Swap((size_t&)size, (size_t&)(array.size));
-        }
-
-        SIMD_INLINE T & operator[] (size_t i)
-        {
-            return data[i];
-        }
-
-        SIMD_INLINE const T & operator[] (size_t i) const
-        {
-            return data[i];
-        }
-    };
-
-    typedef Array<int8_t> Array8i;
-    typedef Array<uint8_t> Array8u;
-    typedef Array<int16_t> Array16i;
-    typedef Array<uint16_t> Array16u;
-    typedef Array<int32_t> Array32i;
-    typedef Array<float> Array32f;
-
-#if defined(__GNUC__) && __GNUC__ >= 6
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wignored-attributes"
-#endif
-
-#ifdef SIMD_SSE_ENABLE
-    namespace Sse
-    {
-        typedef Array<__m128> Array128f;
-    }
-#endif
-
-#ifdef SIMD_AVX_ENABLE
-    namespace Avx
-    {
-        typedef Array<__m256> Array256f;
-    }
-#endif
-
-#ifdef SIMD_AVX512F_ENABLE
-    namespace Avx512f
-    {
-        typedef Array<__m512> Array512f;
-    }
-#endif
-
-#ifdef SIMD_NEON_ENABLE
-    namespace Neon
-    {
-        typedef Array<float32x4_t> Array128f;
-    }
-#endif
-
-#if defined(__GNUC__) && __GNUC__ >= 6
-#pragma GCC diagnostic pop
-#endif
-}
-
-#endif//__SimdArray_h__
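A usage sketch for this helper (the size values are arbitrary; note that Resize only reallocates when the requested size differs from the current one):

    Simd::Array32f sums(64, true);   // 64 floats, zeroed, SIMD_ALIGN-aligned
    sums[0] += 1.0f;
    sums.Resize(128, true);          // size changed: reallocates, then clears
    Simd::Array32f other;
    other.Swap(sums);                // swaps data pointer and size, no copying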
diff --git a/src/3rd/Simd/Simd/SimdAvx1.h b/src/3rd/Simd/Simd/SimdAvx1.h
deleted file mode 100644
index f2c56ff9..00000000
--- a/src/3rd/Simd/Simd/SimdAvx1.h
+++ /dev/null
@@ -1,203 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2020 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#ifndef __SimdAvx_h__
-#define __SimdAvx_h__
-
-#include "Simd/SimdDefs.h"
-
-namespace Simd
-{
-#ifdef SIMD_AVX_ENABLE
-    namespace Avx
-    {
-        void Fill32f(float * dst, size_t size, const float * value);
-
-        void CosineDistance32f(const float * a, const float * b, size_t size, float * distance);
-
-        void Gemm32fNN(size_t M, size_t N, size_t K, const float * alpha, const float * A, size_t lda, const float * B, size_t ldb, const float * beta, float * C, size_t ldc);
-
-        void Gemm32fNT(size_t M, size_t N, size_t K, const float * alpha, const float * A, size_t lda, const float * B, size_t ldb, const float * beta, float * C, size_t ldc);
-
-        void HogLiteFilterFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride);
-
-        void HogLiteResizeFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, float * dst, size_t dstStride, size_t dstWidth, size_t dstHeight);
-
-        void HogLiteCompressFeatures(const float * src, size_t srcStride, size_t width, size_t height, const float * pca, float * dst, size_t dstStride);
-
-        void HogLiteFilterSeparable(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * hFilter, size_t hSize, const float * vFilter, size_t vSize, float * dst, size_t dstStride, int add);
-
-        void NeuralProductSum(const float * a, const float * b, size_t size, float * sum);
-
-        void NeuralAddVectorMultipliedByValue(const float * src, size_t size, const float * value, float * dst);
-
-        void NeuralAddVector(const float * src, size_t size, float * dst);
-
-        void NeuralAddValue(const float * value, float * dst, size_t size);
-
-        void NeuralRoughSigmoid(const float * src, size_t size, const float * slope, float * dst);
-
-        void NeuralRoughSigmoid2(const float * src, size_t size, const float * slope, float * dst);
-
-        void NeuralDerivativeSigmoid(const float * src, size_t size, const float * slope, float * dst);
-
-        void NeuralRoughTanh(const float * src, size_t size, const float * slope, float * dst);
-
-        void NeuralDerivativeTanh(const float * src, size_t size, const float * slope, float * dst);
-
-        void NeuralDerivativeRelu(const float * src, size_t size, const float * slope, float * dst);
-
-        void NeuralUpdateWeights(const float * x, size_t size, const float * a, const float * b, float * d, float * w);
-
-        void NeuralAdaptiveGradientUpdate(const float * delta, size_t size, size_t batch, const float * alpha, const float * epsilon, float * gradient, float * weight);
-
-        void NeuralAddConvolution2x2Forward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride);
-
-        void NeuralAddConvolution3x3Forward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride);
-
-        void NeuralAddConvolution4x4Forward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride);
-
-        void NeuralAddConvolution5x5Forward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride);
-
-        void NeuralAddConvolution2x2Backward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride);
-
-        void NeuralAddConvolution3x3Backward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride);
-
-        void NeuralAddConvolution4x4Backward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride);
-
-        void NeuralAddConvolution5x5Backward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride);
-
-        void NeuralAddConvolution2x2Sum(const float * src, size_t srcStride, const float * dst, size_t dstStride, size_t width, size_t height, float * sums);
-
-        void NeuralAddConvolution3x3Sum(const float * src, size_t srcStride, const float * dst, size_t dstStride, size_t width, size_t height, float * sums);
-
-        void NeuralAddConvolution4x4Sum(const float * src, size_t srcStride, const float * dst, size_t dstStride, size_t width, size_t height, float * sums);
-
-        void NeuralAddConvolution5x5Sum(const float * src, size_t srcStride, const float * dst, size_t dstStride, size_t width, size_t height, float * sums);
-
-        void NeuralPooling2x2Max2x2(const float * src, size_t srcStride, size_t width, size_t height, float * dst, size_t dstStride);
-
-        void NeuralConvolutionForward(const float * src, size_t srcWidth, size_t srcHeight, size_t srcDepth, const float * weight,
-            size_t kernelX, size_t kernelY, size_t padX, size_t padY, size_t strideX, size_t strideY, size_t dilationX, size_t dilationY,
-            void * buffer, size_t * size, float * dst, size_t dstWidth, size_t dstHeight, size_t dstDepth, int add);
-
-        void SquaredDifferenceSum32f(const float * a, const float * b, size_t size, float * sum);
-
-        void SquaredDifferenceKahanSum32f(const float * a, const float * b, size_t size, float * sum);
-
-        void SvmSumLinear(const float * x, const float * svs, const float * weights, size_t length, size_t count, float * sum);
-
-        void SynetAddBias(const float * bias, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format);
-
-        void SynetReorderImage(size_t batch, size_t channels, size_t spatial, const float * src, SimdTensorFormatType srcFormat, float * dst, SimdTensorFormatType dstFormat);
-
-        void SynetReorderFilter(size_t output, size_t input, size_t kernel, const float * src, SimdTensorFormatType srcFormat, float * dst, SimdTensorFormatType dstFormat);
-
-        void SynetEltwiseLayerForward(float const * const * src, const float * weight, size_t count, size_t size, SimdSynetEltwiseOperationType type, float * dst);
-
-        void SynetFusedLayerForward0(const float * src, const float * bias, const float * scale, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format);
-
-        void SynetFusedLayerForward1(const float * src, const float * bias0, const float * scale1, const float * bias1, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format);
-
-        void SynetFusedLayerForward2(const float * src, const float * scale, const float * bias, size_t channels, size_t spatial, const float * slope, float * dst, SimdTensorFormatType format);
-
-        void SynetFusedLayerForward3(const float * src, const float * bias, const float * scale, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format);
-
-        void SynetFusedLayerForward4(const float * src, const float * bias0, const float * scale1, const float * bias1, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format);
-
-        void SynetFusedLayerForward8(const float * src0, const float * src1, const float * src2, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format);
-
-        void SynetFusedLayerForward9(const float * src0, const float * src1, const float * scale, const float * bias, size_t channels0, size_t channels1, size_t spatial, float * dst0, float * dst1, SimdTensorFormatType format);
-
-        void SynetHswish32f(const float * src, size_t size, const float * shift, const float * scale, float * dst);
-
-        void SynetInnerProductLayerForward(const float * src, const float * weight, const float * bias, size_t count, size_t size, float * dst);
-
-        void SynetPoolingForwardAverage(const float* src, size_t srcC, size_t srcH, size_t srcW, size_t kernelY, size_t kernelX,
-            size_t strideY, size_t strideX, size_t padY, size_t padX, float* dst, size_t dstH, size_t dstW, SimdBool excludePad, SimdTensorFormatType format);
-
-        void SynetPoolingForwardMax32f(const float * src, size_t srcC, size_t srcH, size_t srcW, size_t kernelY, size_t kernelX,
-            size_t strideY, size_t strideX, size_t padY, size_t padX, float * dst, size_t dstH, size_t dstW, SimdTensorFormatType format);
-
-        void SynetPreluLayerForward(const float * src, const float * slope, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format);
-
-        void SynetRelu32f(const float* src, size_t size, const float* slope, float* dst);
-
-        void SynetRestrictRange32f(const float * src, size_t size, const float * lower, const float * upper, float * dst);
-
-        void SynetScaleLayerForward(const float* src, const float* scale, const float* bias, size_t channels, size_t height, size_t width, float* dst, SimdTensorFormatType format, SimdSynetCompatibilityType compatibility);
-
-        void SynetShuffleLayerForward(const float* src0, const float* src1, size_t channels0, size_t channels1, size_t spatial, float* dst0, float* dst1, SimdTensorFormatType format, int type);
-
-        void WinogradKernel1x3Block1x4SetFilter(const float* src, size_t size, float* dst, SimdBool trans);
-
-        void WinogradKernel1x3Block1x4SetInput(const float* src, size_t srcChannels, size_t srcHeight, size_t srcWidth,
-            size_t padY, size_t padX, size_t padH, size_t padW, float* dst, size_t dstStride, SimdBool trans);
-
-        void WinogradKernel1x3Block1x4SetOutput(const float* src, size_t srcStride, float* dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans);
-
-        void WinogradKernel1x5Block1x4SetFilter(const float* src, size_t size, float* dst, SimdBool trans);
-
-        void WinogradKernel1x5Block1x4SetInput(const float* src, size_t srcChannels, size_t srcHeight, size_t srcWidth,
-            size_t padY, size_t padX, size_t padH, size_t padW, float* dst, size_t dstStride, SimdBool trans);
-
-        void WinogradKernel1x5Block1x4SetOutput(const float* src, size_t srcStride, float* dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans);
-
-        void WinogradKernel2x2Block2x2SetFilter(const float* src, size_t size, float* dst, SimdBool trans);
-
-        void WinogradKernel2x2Block2x2SetInput(const float* src, size_t srcChannels, size_t srcHeight, size_t srcWidth,
-            size_t padY, size_t padX, size_t padH, size_t padW, float* dst, size_t dstStride, SimdBool trans);
-
-        void WinogradKernel2x2Block2x2SetOutput(const float* src, size_t srcStride, float* dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans);
-
-        void WinogradKernel2x2Block4x4SetFilter(const float* src, size_t size, float* dst, SimdBool trans);
-
-        void WinogradKernel2x2Block4x4SetInput(const float* src, size_t srcChannels, size_t srcHeight, size_t srcWidth,
-            size_t padY, size_t padX, size_t padH, size_t padW, float* dst, size_t dstStride, SimdBool trans);
-
-        void WinogradKernel2x2Block4x4SetOutput(const float* src, size_t srcStride, float* dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans);
-
-        void WinogradKernel3x3Block2x2SetFilter(const float * src, size_t size, float * dst, SimdBool trans);
-
-        void WinogradKernel3x3Block2x2SetInput(const float* src, size_t srcChannels, size_t srcHeight, size_t srcWidth,
-            size_t padY, size_t padX, size_t padH, size_t padW, float* dst, size_t dstStride, SimdBool trans);
-
-        void WinogradKernel3x3Block2x2SetOutput(const float * src, size_t srcStride, float * dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans);
-
-        void WinogradKernel3x3Block3x3SetFilter(const float * src, size_t size, float * dst, SimdBool trans);
-
-        void WinogradKernel3x3Block3x3SetInput(const float* src, size_t srcChannels, size_t srcHeight, size_t srcWidth,
-            size_t padY, size_t padX, size_t padH, size_t padW, float* dst, size_t dstStride, SimdBool trans);
-
-        void WinogradKernel3x3Block3x3SetOutput(const float * src, size_t srcStride, float * dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans);
-
-        void WinogradKernel3x3Block4x4SetFilter(const float * src, size_t size, float * dst, SimdBool trans);
-
-        void WinogradKernel3x3Block4x4SetInput(const float* src, size_t srcChannels, size_t srcHeight, size_t srcWidth,
-            size_t padY, size_t padX, size_t padH, size_t padW, float* dst, size_t dstStride, SimdBool trans);
-
-        void WinogradKernel3x3Block4x4SetOutput(const float * src, size_t srcStride, float * dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans);
-    }
-#endif// SIMD_AVX_ENABLE
-}
-#endif//__SimdAvx_h__
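As illustration, a sketch of invoking one of these kernels directly: a 64x64x64 single-precision GEMM. This assumes row-major layout with leading dimensions equal to the row widths, as the NN suffix suggests; in normal use these Avx-namespace functions are reached through the library's runtime dispatch rather than called directly.

    #include <vector>
    #include "Simd/SimdAvx1.h"

    void gemm_demo()
    {
        const size_t M = 64, N = 64, K = 64;
        std::vector<float> A(M * K, 1.0f), B(K * N, 1.0f), C(M * N, 0.0f);
        const float alpha = 1.0f, beta = 0.0f;
        // C = alpha * A * B + beta * C
        Simd::Avx::Gemm32fNN(M, N, K, &alpha, A.data(), K, B.data(), N, &beta, C.data(), N);
        // with all-ones inputs, every element of C is now K = 64.0f
    }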
diff --git a/src/3rd/Simd/Simd/SimdAvx1Fill.cpp b/src/3rd/Simd/Simd/SimdAvx1Fill.cpp
deleted file mode 100644
index 09578374..00000000
--- a/src/3rd/Simd/Simd/SimdAvx1Fill.cpp
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#include "Simd/SimdMemory.h"
-#include "Simd/SimdStore.h"
-
-namespace Simd
-{
-#ifdef SIMD_AVX_ENABLE
    namespace Avx
-    {
-        void Fill32f(float * dst, size_t size, const float * value)
-        {
-            if (value == 0 || value[0] == 0)
-                memset(dst, 0, size * sizeof(float));
-            else
-            {
-                float v = value[0];
-                /* scalar head up to the first F-float (32-byte) aligned address */
-                const float * nose = (float*)AlignHi(dst, F * sizeof(float));
-                for (; dst < nose && size; --size)
-                    *dst++ = v;
-                const float * end = dst + size;
-                const float * endF = dst + AlignLo(size, F);
-                const float * endQF = dst + AlignLo(size, QF);
-                __m256 _v = _mm256_set1_ps(v);
-                /* unrolled body: four 8-float stores per iteration */
-                for (; dst < endQF; dst += QF)
-                {
-                    _mm256_storeu_ps(dst + 0 * F, _v);
-                    _mm256_storeu_ps(dst + 1 * F, _v);
-                    _mm256_storeu_ps(dst + 2 * F, _v);
-                    _mm256_storeu_ps(dst + 3 * F, _v);
-                }
-                for (; dst < endF; dst += F)
-                    _mm256_storeu_ps(dst, _v);
-                /* scalar tail */
-                for (; dst < end;)
-                    *dst++ = v;
-            }
-        }
-    }
-#endif// SIMD_AVX_ENABLE
-}
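A quick sketch of calling it (the buffer size of 100 is arbitrary; passing NULL for value takes the zero-fill path):

    float buf[100];
    const float v = 3.5f;
    Simd::Avx::Fill32f(buf, 100, &v);    // buf[0..99] == 3.5f
    Simd::Avx::Fill32f(buf, 100, NULL);  // zero-fill (memset) path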
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdExtract.h" - -namespace Simd -{ -#ifdef SIMD_AVX_ENABLE - namespace Avx - { - template void CosineDistance32f(const float * a, const float * b, size_t size, float * distance) - { - if (align) - assert(Aligned(a) && Aligned(b)); - - size_t partialAlignedSize = AlignLo(size, F); - size_t fullAlignedSize = AlignLo(size, DF); - size_t i = 0; - __m256 _aa[2] = { _mm256_setzero_ps(), _mm256_setzero_ps() }; - __m256 _ab[2] = { _mm256_setzero_ps(), _mm256_setzero_ps() }; - __m256 _bb[2] = { _mm256_setzero_ps(), _mm256_setzero_ps() }; - if (fullAlignedSize) - { - for (; i < fullAlignedSize; i += DF) - { - __m256 a0 = Load(a + i + 0 * F); - __m256 b0 = Load(b + i + 0 * F); - _aa[0] = _mm256_add_ps(_aa[0], _mm256_mul_ps(a0, a0)); - _ab[0] = _mm256_add_ps(_ab[0], _mm256_mul_ps(a0, b0)); - _bb[0] = _mm256_add_ps(_bb[0], _mm256_mul_ps(b0, b0)); - __m256 a1 = Load(a + i + 1 * F); - __m256 b1 = Load(b + i + 1 * F); - _aa[1] = _mm256_add_ps(_aa[1], _mm256_mul_ps(a1, a1)); - _ab[1] = _mm256_add_ps(_ab[1], _mm256_mul_ps(a1, b1)); - _bb[1] = _mm256_add_ps(_bb[1], _mm256_mul_ps(b1, b1)); - } - _aa[0] = _mm256_add_ps(_aa[0], _aa[1]); - _ab[0] = _mm256_add_ps(_ab[0], _ab[1]); - _bb[0] = _mm256_add_ps(_bb[0], _bb[1]); - } - for (; i < partialAlignedSize; i += F) - { - __m256 a0 = Load(a + i); - __m256 b0 = Load(b + i); - _aa[0] = _mm256_add_ps(_aa[0], _mm256_mul_ps(a0, a0)); - _ab[0] = _mm256_add_ps(_ab[0], _mm256_mul_ps(a0, b0)); - _bb[0] = _mm256_add_ps(_bb[0], _mm256_mul_ps(b0, b0)); - } - float aa = ExtractSum(_aa[0]), ab = ExtractSum(_ab[0]), bb = ExtractSum(_bb[0]); - for (; i < size; ++i) - { - float _a = a[i]; - float _b = b[i]; - aa += _a * _a; - ab += _a * _b; - bb += _b * _b; - } - *distance = 1.0f - ab / ::sqrt(aa*bb); - } - - void CosineDistance32f(const float * a, const float * b, size_t size, float * distance) - { - if (Aligned(a) && Aligned(b)) - CosineDistance32f(a, b, size, distance); - else - CosineDistance32f(a, b, size, distance); - } - } -#endif// SIMD_AVX_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx1Gemm32f.cpp b/src/3rd/Simd/Simd/SimdAvx1Gemm32f.cpp deleted file mode 100644 index 95551e9a..00000000 --- a/src/3rd/Simd/Simd/SimdAvx1Gemm32f.cpp +++ /dev/null @@ -1,1214 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdStore.h" -#include "Simd/SimdExtract.h" -#include "Simd/SimdGemm.h" -#include "Simd/SimdCpu.h" - -namespace Simd -{ -#ifdef SIMD_AVX_ENABLE - namespace Avx - { - SIMD_INLINE void AddProduct(float * ptr, __m256 value, __m256 alpha) - { - _mm256_storeu_ps(ptr, _mm256_add_ps(_mm256_mul_ps(value, alpha), _mm256_loadu_ps(ptr))); - } - - SIMD_INLINE void AddProduct(float * ptr, __m256 value, __m256 alpha, size_t tail) - { - if (tail == F) - AddProduct(ptr, value, alpha); - else - { - float tmp[F]; - _mm256_storeu_ps(tmp, _mm256_add_ps(_mm256_mul_ps(value, alpha), _mm256_loadu_ps(ptr))); - for (size_t i = 0; i < tail; ++i) - ptr[i] = tmp[i]; - } - } - - void GemmKernel4x24nn(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, size_t tail) - { - __m256 c00 = _mm256_setzero_ps(); - __m256 c10 = _mm256_setzero_ps(); - __m256 c20 = _mm256_setzero_ps(); - __m256 c30 = _mm256_setzero_ps(); - __m256 c01 = _mm256_setzero_ps(); - __m256 c11 = _mm256_setzero_ps(); - __m256 c21 = _mm256_setzero_ps(); - __m256 c31 = _mm256_setzero_ps(); - __m256 c02 = _mm256_setzero_ps(); - __m256 c12 = _mm256_setzero_ps(); - __m256 c22 = _mm256_setzero_ps(); - __m256 c32 = _mm256_setzero_ps(); - const size_t oa0 = lda * 0; - const size_t oa1 = lda * 1; - const size_t oa2 = lda * 2; - const size_t oa3 = lda * 3; - const size_t sa = lda == 1 ? 4 : 1; - const size_t ob0 = ldb * 0; - const size_t ob1 = ldb * 1; - const size_t ob2 = ldb * 2; - __m256 b0, b1, b2, a0; - for (size_t k = 0; k < K; k++) - { - b0 = _mm256_loadu_ps(B + ob0); - b1 = _mm256_loadu_ps(B + ob1); - b2 = _mm256_loadu_ps(B + ob2); - a0 = _mm256_set1_ps(A[oa0]); - c00 = _mm256_add_ps(_mm256_mul_ps(a0, b0), c00); - c01 = _mm256_add_ps(_mm256_mul_ps(a0, b1), c01); - c02 = _mm256_add_ps(_mm256_mul_ps(a0, b2), c02); - a0 = _mm256_set1_ps(A[oa1]); - c10 = _mm256_add_ps(_mm256_mul_ps(a0, b0), c10); - c11 = _mm256_add_ps(_mm256_mul_ps(a0, b1), c11); - c12 = _mm256_add_ps(_mm256_mul_ps(a0, b2), c12); - a0 = _mm256_set1_ps(A[oa2]); - c20 = _mm256_add_ps(_mm256_mul_ps(a0, b0), c20); - c21 = _mm256_add_ps(_mm256_mul_ps(a0, b1), c21); - c22 = _mm256_add_ps(_mm256_mul_ps(a0, b2), c22); - a0 = _mm256_set1_ps(A[oa3]); - c30 = _mm256_add_ps(_mm256_mul_ps(a0, b0), c30); - c31 = _mm256_add_ps(_mm256_mul_ps(a0, b1), c31); - c32 = _mm256_add_ps(_mm256_mul_ps(a0, b2), c32); - B += sb; - A += sa; - } - __m256 _alpha = _mm256_set1_ps(alpha); - AddProduct(C + 0 * F, _alpha, c00); - AddProduct(C + 1 * F, _alpha, c01); - AddProduct(C + 2 * F, _alpha, c02, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c10); - AddProduct(C + 1 * F, _alpha, c11); - AddProduct(C + 2 * F, _alpha, c12, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c20); - AddProduct(C + 1 * F, _alpha, c21); - AddProduct(C + 2 * F, _alpha, c22, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c30); - AddProduct(C + 1 * F, _alpha, c31); - AddProduct(C + 2 * F, _alpha, c32, tail); - } - - void GemmKernel4x16nn(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, size_t tail) - { - __m256 c00 = _mm256_setzero_ps(); - __m256 c10 = _mm256_setzero_ps(); - __m256 c20 = _mm256_setzero_ps(); - __m256 c30 = _mm256_setzero_ps(); - __m256 c01 = _mm256_setzero_ps(); - __m256 c11 = _mm256_setzero_ps(); - __m256 c21 = _mm256_setzero_ps(); - __m256 c31 = _mm256_setzero_ps(); - const size_t oa0 = lda * 0; - const size_t oa1 = lda * 1; - const size_t oa2 = lda * 2; - const size_t oa3 = 
lda * 3; - const size_t sa = lda == 1 ? 4 : 1; - const size_t ob0 = ldb * 0; - const size_t ob1 = ldb * 1; - __m256 b0, b1, a0; - for (size_t k = 0; k < K; k++) - { - b0 = _mm256_loadu_ps(B + ob0); - b1 = _mm256_loadu_ps(B + ob1); - a0 = _mm256_set1_ps(A[oa0]); - c00 = _mm256_add_ps(_mm256_mul_ps(a0, b0), c00); - c01 = _mm256_add_ps(_mm256_mul_ps(a0, b1), c01); - a0 = _mm256_set1_ps(A[oa1]); - c10 = _mm256_add_ps(_mm256_mul_ps(a0, b0), c10); - c11 = _mm256_add_ps(_mm256_mul_ps(a0, b1), c11); - a0 = _mm256_set1_ps(A[oa2]); - c20 = _mm256_add_ps(_mm256_mul_ps(a0, b0), c20); - c21 = _mm256_add_ps(_mm256_mul_ps(a0, b1), c21); - a0 = _mm256_set1_ps(A[oa3]); - c30 = _mm256_add_ps(_mm256_mul_ps(a0, b0), c30); - c31 = _mm256_add_ps(_mm256_mul_ps(a0, b1), c31); - B += sb; - A += sa; - } - __m256 _alpha = _mm256_set1_ps(alpha); - AddProduct(C + 0 * F, _alpha, c00); - AddProduct(C + 1 * F, _alpha, c01, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c10); - AddProduct(C + 1 * F, _alpha, c11, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c20); - AddProduct(C + 1 * F, _alpha, c21, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c30); - AddProduct(C + 1 * F, _alpha, c31, tail); - } - - void GemmKernel4x8nn(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, size_t tail) - { - __m256 c0 = _mm256_setzero_ps(); - __m256 c1 = _mm256_setzero_ps(); - __m256 c2 = _mm256_setzero_ps(); - __m256 c3 = _mm256_setzero_ps(); - const size_t oa0 = lda * 0; - const size_t oa1 = lda * 1; - const size_t oa2 = lda * 2; - const size_t oa3 = lda * 3; - const size_t sa = lda == 1 ? 4 : 1; - const size_t ob0 = ldb * 0; - __m256 b0; - for (size_t k = 0; k < K; k++) - { - b0 = _mm256_loadu_ps(B + ob0); - c0 = _mm256_add_ps(_mm256_mul_ps(b0, _mm256_set1_ps(A[oa0])), c0); - c1 = _mm256_add_ps(_mm256_mul_ps(b0, _mm256_set1_ps(A[oa1])), c1); - c2 = _mm256_add_ps(_mm256_mul_ps(b0, _mm256_set1_ps(A[oa2])), c2); - c3 = _mm256_add_ps(_mm256_mul_ps(b0, _mm256_set1_ps(A[oa3])), c3); - B += sb; - A += sa; - } - __m256 _alpha = _mm256_set1_ps(alpha); - AddProduct(C + 0 * ldc, _alpha, c0, tail); - AddProduct(C + 1 * ldc, _alpha, c1, tail); - AddProduct(C + 2 * ldc, _alpha, c2, tail); - AddProduct(C + 3 * ldc, _alpha, c3, tail); - } - - void GemmKernel6x16nn(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, size_t tail) - { - __m256 c00 = _mm256_setzero_ps(); - __m256 c10 = _mm256_setzero_ps(); - __m256 c20 = _mm256_setzero_ps(); - __m256 c30 = _mm256_setzero_ps(); - __m256 c40 = _mm256_setzero_ps(); - __m256 c50 = _mm256_setzero_ps(); - __m256 c01 = _mm256_setzero_ps(); - __m256 c11 = _mm256_setzero_ps(); - __m256 c21 = _mm256_setzero_ps(); - __m256 c31 = _mm256_setzero_ps(); - __m256 c41 = _mm256_setzero_ps(); - __m256 c51 = _mm256_setzero_ps(); - const size_t oa0 = lda * 0; - const size_t oa1 = lda * 1; - const size_t oa2 = lda * 2; - const size_t oa3 = lda * 3; - const size_t oa4 = lda * 4; - const size_t oa5 = lda * 5; - const size_t sa = lda == 1 ? 
6 : 1; - const size_t ob0 = ldb * 0; - const size_t ob1 = ldb * 1; - __m256 b0, b1, a0; - for (size_t k = 0; k < K; k++) - { - b0 = _mm256_loadu_ps(B + ob0); - b1 = _mm256_loadu_ps(B + ob1); - a0 = _mm256_set1_ps(A[oa0]); - c00 = _mm256_add_ps(_mm256_mul_ps(a0, b0), c00); - c01 = _mm256_add_ps(_mm256_mul_ps(a0, b1), c01); - a0 = _mm256_set1_ps(A[oa1]); - c10 = _mm256_add_ps(_mm256_mul_ps(a0, b0), c10); - c11 = _mm256_add_ps(_mm256_mul_ps(a0, b1), c11); - a0 = _mm256_set1_ps(A[oa2]); - c20 = _mm256_add_ps(_mm256_mul_ps(a0, b0), c20); - c21 = _mm256_add_ps(_mm256_mul_ps(a0, b1), c21); - a0 = _mm256_set1_ps(A[oa3]); - c30 = _mm256_add_ps(_mm256_mul_ps(a0, b0), c30); - c31 = _mm256_add_ps(_mm256_mul_ps(a0, b1), c31); - a0 = _mm256_set1_ps(A[oa4]); - c40 = _mm256_add_ps(_mm256_mul_ps(a0, b0), c40); - c41 = _mm256_add_ps(_mm256_mul_ps(a0, b1), c41); - a0 = _mm256_set1_ps(A[oa5]); - c50 = _mm256_add_ps(_mm256_mul_ps(a0, b0), c50); - c51 = _mm256_add_ps(_mm256_mul_ps(a0, b1), c51); - B += sb; - A += sa; - } - __m256 _alpha = _mm256_set1_ps(alpha); - AddProduct(C + 0 * F, _alpha, c00); - AddProduct(C + 1 * F, _alpha, c01, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c10); - AddProduct(C + 1 * F, _alpha, c11, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c20); - AddProduct(C + 1 * F, _alpha, c21, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c30); - AddProduct(C + 1 * F, _alpha, c31, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c40); - AddProduct(C + 1 * F, _alpha, c41, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c50); - AddProduct(C + 1 * F, _alpha, c51, tail); - } - - void GemmKernel6x8nn(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, size_t tail) - { - __m256 c00 = _mm256_setzero_ps(); - __m256 c10 = _mm256_setzero_ps(); - __m256 c20 = _mm256_setzero_ps(); - __m256 c30 = _mm256_setzero_ps(); - __m256 c40 = _mm256_setzero_ps(); - __m256 c50 = _mm256_setzero_ps(); - const size_t oa0 = lda * 0; - const size_t oa1 = lda * 1; - const size_t oa2 = lda * 2; - const size_t oa3 = lda * 3; - const size_t oa4 = lda * 4; - const size_t oa5 = lda * 5; - const size_t sa = lda == 1 ? 6 : 1; - const size_t ob0 = ldb * 0; - __m256 b0, a0; - for (size_t k = 0; k < K; k++) - { - b0 = _mm256_loadu_ps(B + ob0); - a0 = _mm256_set1_ps(A[oa0]); - c00 = _mm256_add_ps(_mm256_mul_ps(a0, b0), c00); - a0 = _mm256_set1_ps(A[oa1]); - c10 = _mm256_add_ps(_mm256_mul_ps(a0, b0), c10); - a0 = _mm256_set1_ps(A[oa2]); - c20 = _mm256_add_ps(_mm256_mul_ps(a0, b0), c20); - a0 = _mm256_set1_ps(A[oa3]); - c30 = _mm256_add_ps(_mm256_mul_ps(a0, b0), c30); - a0 = _mm256_set1_ps(A[oa4]); - c40 = _mm256_add_ps(_mm256_mul_ps(a0, b0), c40); - a0 = _mm256_set1_ps(A[oa5]); - c50 = _mm256_add_ps(_mm256_mul_ps(a0, b0), c50); - B += sb; - A += sa; - } - __m256 _alpha = _mm256_set1_ps(alpha); - AddProduct(C + 0 * F, _alpha, c00, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c10, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c20, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c30, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c40, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c50, tail); - } - - void GemmKernelMx24nn(size_t M, size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, size_t tail) - { - __m256 c[4][3]; - size_t oa[4]; - const size_t sa = lda == 1 ? 
M : 1; - const size_t ob0 = ldb * 0; - const size_t ob1 = ldb * 1; - const size_t ob2 = ldb * 2; - for (size_t i = 0; i < M; ++i) - { - c[i][0] = _mm256_setzero_ps(); - c[i][1] = _mm256_setzero_ps(); - c[i][2] = _mm256_setzero_ps(); - oa[i] = lda * i; - } - __m256 b0, b1, b2, a0; - for (size_t k = 0; k < K; k++) - { - b0 = _mm256_loadu_ps(B + ob0); - b1 = _mm256_loadu_ps(B + ob1); - b2 = _mm256_loadu_ps(B + ob2); - for (size_t i = 0; i < M; ++i) - { - a0 = _mm256_set1_ps(A[oa[i]]); - c[i][0] = _mm256_add_ps(_mm256_mul_ps(b0, a0), c[i][0]); - c[i][1] = _mm256_add_ps(_mm256_mul_ps(b1, a0), c[i][1]); - c[i][2] = _mm256_add_ps(_mm256_mul_ps(b2, a0), c[i][2]); - } - B += sb; - A += sa; - } - __m256 _alpha = _mm256_set1_ps(alpha); - for (size_t i = 0; i < M; ++i) - { - AddProduct(C + 0 * F, _alpha, c[i][0]); - AddProduct(C + 1 * F, _alpha, c[i][1]); - AddProduct(C + 2 * F, _alpha, c[i][2], tail); - C += ldc; - } - } - - void GemmKernelMx16nn(size_t M, size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, size_t tail) - { - __m256 c[6][2]; - size_t oa[6]; - const size_t sa = lda == 1 ? M : 1; - const size_t ob0 = ldb * 0; - const size_t ob1 = ldb * 1; - for (size_t i = 0; i < M; ++i) - { - c[i][0] = _mm256_setzero_ps(); - c[i][1] = _mm256_setzero_ps(); - oa[i] = lda * i; - } - __m256 b0, b1, a0; - for (size_t k = 0; k < K; k++) - { - b0 = _mm256_loadu_ps(B + ob0); - b1 = _mm256_loadu_ps(B + ob1); - for (size_t i = 0; i < M; ++i) - { - a0 = _mm256_set1_ps(A[oa[i]]); - c[i][0] = _mm256_add_ps(_mm256_mul_ps(b0, a0), c[i][0]); - c[i][1] = _mm256_add_ps(_mm256_mul_ps(b1, a0), c[i][1]); - } - B += sb; - A += sa; - } - __m256 _alpha = _mm256_set1_ps(alpha); - for (size_t i = 0; i < M; ++i) - { - AddProduct(C + 0 * F, _alpha, c[i][0]); - AddProduct(C + 1 * F, _alpha, c[i][1], tail); - C += ldc; - } - } - - void GemmKernelMx8nn(size_t M, size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, size_t tail) - { -#ifdef SIMD_X64_ENABLE - __m256 c[6]; - size_t oa[6]; -#else - __m256 c[4]; - size_t oa[4]; -#endif - const size_t sa = lda == 1 ? M : 1; - const size_t ob0 = ldb * 0; - for (size_t i = 0; i < M; ++i) - { - c[i] = _mm256_setzero_ps(); - oa[i] = lda * i; - } - __m256 b0, a0; - for (size_t k = 0; k < K; k++) - { - b0 = _mm256_loadu_ps(B + ob0); - for (size_t i = 0; i < M; ++i) - { - a0 = _mm256_set1_ps(A[oa[i]]); - c[i] = _mm256_add_ps(_mm256_mul_ps(b0, a0), c[i]); - } - B += sb; - A += sa; - } - __m256 _alpha = _mm256_set1_ps(alpha); - for (size_t i = 0; i < M; ++i) - AddProduct(C + i * ldc, _alpha, c[i], tail); - } - - template void GemmKernelMx24nnT(size_t, size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, size_t tail) - { - __m256 c00, c01, c02, c03, c10, c11, c12, c13, c20, c21, c22, c23, b0, b1, b2, a0; - if (M > 0) c00 = _mm256_setzero_ps(), c10 = _mm256_setzero_ps(), c20 = _mm256_setzero_ps(); - if (M > 1) c01 = _mm256_setzero_ps(), c11 = _mm256_setzero_ps(), c21 = _mm256_setzero_ps(); - if (M > 2) c02 = _mm256_setzero_ps(), c12 = _mm256_setzero_ps(), c22 = _mm256_setzero_ps(); - if (M > 3) c03 = _mm256_setzero_ps(), c13 = _mm256_setzero_ps(), c23 = _mm256_setzero_ps(); - size_t oa0, oa1, oa2, oa3; - if (M > 0) oa0 = lda * 0; - if (M > 1) oa1 = lda * 1; - if (M > 2) oa2 = lda * 2; - if (M > 3) oa3 = lda * 3; - const size_t sa = lda == 1 ? 
M : 1; - const size_t ob0 = ldb * 0; - const size_t ob1 = ldb * 1; - const size_t ob2 = ldb * 2; - for (size_t k = 0; k < K; k++) - { - b0 = _mm256_loadu_ps(B + ob0); - b1 = _mm256_loadu_ps(B + ob1); - b2 = _mm256_loadu_ps(B + ob2); - if (M > 0) a0 = _mm256_set1_ps(A[oa0]), c00 = _mm256_add_ps(_mm256_mul_ps(b0, a0), c00), c10 = _mm256_add_ps(_mm256_mul_ps(b1, a0), c10), c20 = _mm256_add_ps(_mm256_mul_ps(b2, a0), c20); - if (M > 1) a0 = _mm256_set1_ps(A[oa1]), c01 = _mm256_add_ps(_mm256_mul_ps(b0, a0), c01), c11 = _mm256_add_ps(_mm256_mul_ps(b1, a0), c11), c21 = _mm256_add_ps(_mm256_mul_ps(b2, a0), c21); - if (M > 2) a0 = _mm256_set1_ps(A[oa2]), c02 = _mm256_add_ps(_mm256_mul_ps(b0, a0), c02), c12 = _mm256_add_ps(_mm256_mul_ps(b1, a0), c12), c22 = _mm256_add_ps(_mm256_mul_ps(b2, a0), c22); - if (M > 3) a0 = _mm256_set1_ps(A[oa3]), c03 = _mm256_add_ps(_mm256_mul_ps(b0, a0), c03), c13 = _mm256_add_ps(_mm256_mul_ps(b1, a0), c13), c23 = _mm256_add_ps(_mm256_mul_ps(b2, a0), c23); - B += sb; - A += sa; - } - __m256 _alpha = _mm256_set1_ps(alpha); - if (M > 0) AddProduct(C + 0 * F, _alpha, c00), AddProduct(C + 1 * F, _alpha, c10), AddProduct(C + 2 * F, _alpha, c20, tail), C += ldc; - if (M > 1) AddProduct(C + 0 * F, _alpha, c01), AddProduct(C + 1 * F, _alpha, c11), AddProduct(C + 2 * F, _alpha, c21, tail), C += ldc; - if (M > 2) AddProduct(C + 0 * F, _alpha, c02), AddProduct(C + 1 * F, _alpha, c12), AddProduct(C + 2 * F, _alpha, c22, tail), C += ldc; - if (M > 3) AddProduct(C + 0 * F, _alpha, c03), AddProduct(C + 1 * F, _alpha, c13), AddProduct(C + 2 * F, _alpha, c23, tail), C += ldc; - } - - template void GemmKernelMx16nnT(size_t, size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, size_t tail) - { - __m256 c00, c01, c02, c03, c04, c05, c10, c11, c12, c13, c14, c15, b0, b1, a0; - if (M > 0) c00 = _mm256_setzero_ps(), c10 = _mm256_setzero_ps(); - if (M > 1) c01 = _mm256_setzero_ps(), c11 = _mm256_setzero_ps(); - if (M > 2) c02 = _mm256_setzero_ps(), c12 = _mm256_setzero_ps(); - if (M > 3) c03 = _mm256_setzero_ps(), c13 = _mm256_setzero_ps(); - if (M > 4) c04 = _mm256_setzero_ps(), c14 = _mm256_setzero_ps(); - if (M > 5) c05 = _mm256_setzero_ps(), c15 = _mm256_setzero_ps(); - size_t oa0, oa1, oa2, oa3, oa4, oa5; - if (M > 0) oa0 = lda * 0; - if (M > 1) oa1 = lda * 1; - if (M > 2) oa2 = lda * 2; - if (M > 3) oa3 = lda * 3; - if (M > 4) oa4 = lda * 4; - if (M > 5) oa5 = lda * 5; - const size_t sa = lda == 1 ? 
M : 1; - const size_t ob0 = ldb * 0; - const size_t ob1 = ldb * 1; - for (size_t k = 0; k < K; k++) - { - b0 = _mm256_loadu_ps(B + ob0); - b1 = _mm256_loadu_ps(B + ob1); - if (M > 0) a0 = _mm256_set1_ps(A[oa0]), c00 = _mm256_add_ps(_mm256_mul_ps(b0, a0), c00), c10 = _mm256_add_ps(_mm256_mul_ps(b1, a0), c10); - if (M > 1) a0 = _mm256_set1_ps(A[oa1]), c01 = _mm256_add_ps(_mm256_mul_ps(b0, a0), c01), c11 = _mm256_add_ps(_mm256_mul_ps(b1, a0), c11); - if (M > 2) a0 = _mm256_set1_ps(A[oa2]), c02 = _mm256_add_ps(_mm256_mul_ps(b0, a0), c02), c12 = _mm256_add_ps(_mm256_mul_ps(b1, a0), c12); - if (M > 3) a0 = _mm256_set1_ps(A[oa3]), c03 = _mm256_add_ps(_mm256_mul_ps(b0, a0), c03), c13 = _mm256_add_ps(_mm256_mul_ps(b1, a0), c13); - if (M > 4) a0 = _mm256_set1_ps(A[oa4]), c04 = _mm256_add_ps(_mm256_mul_ps(b0, a0), c04), c14 = _mm256_add_ps(_mm256_mul_ps(b1, a0), c14); - if (M > 5) a0 = _mm256_set1_ps(A[oa5]), c05 = _mm256_add_ps(_mm256_mul_ps(b0, a0), c05), c15 = _mm256_add_ps(_mm256_mul_ps(b1, a0), c15); - B += sb; - A += sa; - } - __m256 _alpha = _mm256_set1_ps(alpha); - if (M > 0) AddProduct(C + 0 * F, _alpha, c00), AddProduct(C + 1 * F, _alpha, c10, tail), C += ldc; - if (M > 1) AddProduct(C + 0 * F, _alpha, c01), AddProduct(C + 1 * F, _alpha, c11, tail), C += ldc; - if (M > 2) AddProduct(C + 0 * F, _alpha, c02), AddProduct(C + 1 * F, _alpha, c12, tail), C += ldc; - if (M > 3) AddProduct(C + 0 * F, _alpha, c03), AddProduct(C + 1 * F, _alpha, c13, tail), C += ldc; - if (M > 4) AddProduct(C + 0 * F, _alpha, c04), AddProduct(C + 1 * F, _alpha, c14, tail), C += ldc; - if (M > 5) AddProduct(C + 0 * F, _alpha, c05), AddProduct(C + 1 * F, _alpha, c15, tail), C += ldc; - } - - template void GemmKernelMx8nnT(size_t, size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, size_t tail) - { - __m256 c00, c01, c02, c03, c04, c05, b0; - if (M > 0) c00 = _mm256_setzero_ps(); - if (M > 1) c01 = _mm256_setzero_ps(); - if (M > 2) c02 = _mm256_setzero_ps(); - if (M > 3) c03 = _mm256_setzero_ps(); - if (M > 4) c04 = _mm256_setzero_ps(); - if (M > 5) c05 = _mm256_setzero_ps(); - size_t oa0, oa1, oa2, oa3, oa4, oa5; - if (M > 0) oa0 = lda * 0; - if (M > 1) oa1 = lda * 1; - if (M > 2) oa2 = lda * 2; - if (M > 3) oa3 = lda * 3; - if (M > 4) oa4 = lda * 4; - if (M > 5) oa5 = lda * 5; - const size_t sa = lda == 1 ? 
M : 1; - const size_t ob0 = ldb * 0; - for (size_t k = 0; k < K; k++) - { - b0 = _mm256_loadu_ps(B + ob0); - if (M > 0) c00 = _mm256_add_ps(_mm256_mul_ps(b0, _mm256_set1_ps(A[oa0])), c00); - if (M > 1) c01 = _mm256_add_ps(_mm256_mul_ps(b0, _mm256_set1_ps(A[oa1])), c01); - if (M > 2) c02 = _mm256_add_ps(_mm256_mul_ps(b0, _mm256_set1_ps(A[oa2])), c02); - if (M > 3) c03 = _mm256_add_ps(_mm256_mul_ps(b0, _mm256_set1_ps(A[oa3])), c03); - if (M > 4) c04 = _mm256_add_ps(_mm256_mul_ps(b0, _mm256_set1_ps(A[oa4])), c04); - if (M > 5) c05 = _mm256_add_ps(_mm256_mul_ps(b0, _mm256_set1_ps(A[oa5])), c05); - B += sb; - A += sa; - } - __m256 _alpha = _mm256_set1_ps(alpha); - if (M > 0) AddProduct(C + 0 * ldc, _alpha, c00, tail); - if (M > 1) AddProduct(C + 1 * ldc, _alpha, c01, tail); - if (M > 2) AddProduct(C + 2 * ldc, _alpha, c02, tail); - if (M > 3) AddProduct(C + 3 * ldc, _alpha, c03, tail); - if (M > 4) AddProduct(C + 4 * ldc, _alpha, c04, tail); - if (M > 5) AddProduct(C + 5 * ldc, _alpha, c05, tail); - } - - SIMD_INLINE Simd::GemmNN::Tail GetGemmTail(size_t M, size_t N) - { - if (N <= 8) - { - switch (M) - { - case 0: return GemmKernelMx8nnT<0>; - case 1: return GemmKernelMx8nnT<1>; - case 2: return GemmKernelMx8nnT<2>; - case 3: return GemmKernelMx8nnT<3>; - case 4: return GemmKernelMx8nnT<4>; - case 5: return GemmKernelMx8nnT<5>; - } - } - else if (N <= 16) - { - switch (M) - { - case 0: return GemmKernelMx16nnT<0>; - case 1: return GemmKernelMx16nnT<1>; - case 2: return GemmKernelMx16nnT<2>; - case 3: return GemmKernelMx16nnT<3>; - case 4: return GemmKernelMx16nnT<4>; - case 5: return GemmKernelMx16nnT<5>; - } - } - else if (N <= 24) - { - switch (M) - { - case 0: return GemmKernelMx24nnT<0>; - case 1: return GemmKernelMx24nnT<1>; - case 2: return GemmKernelMx24nnT<2>; - case 3: return GemmKernelMx24nnT<3>; - } - } - assert(0); - return NULL; - } - - SIMD_INLINE void GemmPackA_4x8(const float* src, size_t stride, float* dst) - { - __m256 s0 = _mm256_loadu_ps(src + 0 * stride); - __m256 s1 = _mm256_loadu_ps(src + 1 * stride); - __m256 s2 = _mm256_loadu_ps(src + 2 * stride); - __m256 s3 = _mm256_loadu_ps(src + 3 * stride); - __m256 s00 = _mm256_unpacklo_ps(s0, s2); - __m256 s01 = _mm256_unpacklo_ps(s1, s3); - __m256 s10 = _mm256_unpackhi_ps(s0, s2); - __m256 s11 = _mm256_unpackhi_ps(s1, s3); - __m256 d0 = _mm256_unpacklo_ps(s00, s01); - __m256 d1 = _mm256_unpackhi_ps(s00, s01); - __m256 d2 = _mm256_unpacklo_ps(s10, s11); - __m256 d3 = _mm256_unpackhi_ps(s10, s11); - _mm256_storeu_ps(dst + 0x00, _mm256_permute2f128_ps(d0, d1, 0x20)); - _mm256_storeu_ps(dst + 0x08, _mm256_permute2f128_ps(d2, d3, 0x20)); - _mm256_storeu_ps(dst + 0x10, _mm256_permute2f128_ps(d0, d1, 0x31)); - _mm256_storeu_ps(dst + 0x18, _mm256_permute2f128_ps(d2, d3, 0x31)); - } - - SIMD_INLINE void GemmPackA_4x4(const float* src, size_t stride, float* dst) - { - __m128 s0 = _mm_loadu_ps(src + 0 * stride); - __m128 s1 = _mm_loadu_ps(src + 1 * stride); - __m128 s2 = _mm_loadu_ps(src + 2 * stride); - __m128 s3 = _mm_loadu_ps(src + 3 * stride); - __m128 s00 = _mm_unpacklo_ps(s0, s2); - __m128 s01 = _mm_unpacklo_ps(s1, s3); - __m128 s10 = _mm_unpackhi_ps(s0, s2); - __m128 s11 = _mm_unpackhi_ps(s1, s3); - _mm_storeu_ps(dst + 0, _mm_unpacklo_ps(s00, s01)); - _mm_storeu_ps(dst + 4, _mm_unpackhi_ps(s00, s01)); - _mm_storeu_ps(dst + 8, _mm_unpacklo_ps(s10, s11)); - _mm_storeu_ps(dst + 12, _mm_unpackhi_ps(s10, s11)); - } - - SIMD_INLINE void GemmPackA_6x4(const float* src, size_t stride, float* dst) - { - __m128 s0 = _mm_loadu_ps(src + 0 * 
stride); - __m128 s1 = _mm_loadu_ps(src + 1 * stride); - __m128 s2 = _mm_loadu_ps(src + 2 * stride); - __m128 s3 = _mm_loadu_ps(src + 3 * stride); - __m128 s4 = _mm_loadu_ps(src + 4 * stride); - __m128 s5 = _mm_loadu_ps(src + 5 * stride); - __m128 s00 = _mm_unpacklo_ps(s0, s2); - __m128 s01 = _mm_unpacklo_ps(s1, s3); - __m128 s10 = _mm_unpackhi_ps(s0, s2); - __m128 s11 = _mm_unpackhi_ps(s1, s3); - __m128 s20 = _mm_unpacklo_ps(s4, s5); - __m128 s21 = _mm_unpackhi_ps(s4, s5); - _mm_storeu_ps(dst + 0, _mm_unpacklo_ps(s00, s01)); - _mm_storel_pi((__m64*)(dst + 4), s20); - _mm_storeu_ps(dst + 6, _mm_unpackhi_ps(s00, s01)); - _mm_storeh_pi((__m64*)(dst + 10), s20); - _mm_storeu_ps(dst + 12, _mm_unpacklo_ps(s10, s11)); - _mm_storel_pi((__m64*)(dst + 16), s21); - _mm_storeu_ps(dst + 18, _mm_unpackhi_ps(s10, s11)); - _mm_storeh_pi((__m64*)(dst + 22), s21); - } - - void GemmPackA(const float * src, size_t stride, size_t M, size_t K, size_t cell, float * dst) - { - size_t K4 = AlignLo(K, 4), K8 = AlignLo(K, 8); - for (size_t i = 0; i < M; i += cell) - { - size_t m = Simd::Min(cell, M - i), k = 0; - if (cell == 4 && m == 4) - { - for (; k < K8; k += 8, dst += 32) - GemmPackA_4x8(src + k, stride, dst); - for (; k < K4; k += 4, dst += 16) - GemmPackA_4x4(src + k, stride, dst); - } - else if (cell == 6 && m == 6) - { - for (; k < K4; k += 4, dst += 24) - GemmPackA_6x4(src + k, stride, dst); - } - for (; k < K; ++k) - { - for (size_t c = 0; c < m; ++c) - *(dst++) = src[c*stride + k]; - } - src += cell * stride; - } - } - - void GemmPackB(const float * B, size_t ldb, size_t K, size_t N, size_t microN, float * pB) - { - for (size_t j = 0; j < N; j += microN) - { - size_t n = Simd::Min(microN, N - j); - size_t k = 0; - if (microN == 1 * F) - { - if (n == microN) - { - for (; k < K; ++k) - { - const float * b = B + k * ldb; - _mm256_storeu_ps(pB + 0 * F, _mm256_loadu_ps(b + 0 * F)); - pB += microN; - } - } - else - { - __m256 mask0 = Avx::LeftNotZero32f(n - 0 * F); - for (; k < K - 1; ++k) - { - const float * b = B + k * ldb; - _mm256_storeu_ps(pB + 0 * F, _mm256_and_ps(mask0, _mm256_loadu_ps(b + 0 * F))); - pB += microN; - } - } - } - else if (microN == 2 * F) - { - if (n == microN) - { - for (; k < K; ++k) - { - const float * b = B + k * ldb; - _mm256_storeu_ps(pB + 0 * F, _mm256_loadu_ps(b + 0 * F)); - _mm256_storeu_ps(pB + 1 * F, _mm256_loadu_ps(b + 1 * F)); - pB += microN; - } - } - else - { - __m256 mask0 = Avx::LeftNotZero32f(n - 0 * F); - __m256 mask1 = Avx::LeftNotZero32f(n - 1 * F); - for (; k < K - 1; ++k) - { - const float * b = B + k * ldb; - _mm256_storeu_ps(pB + 0 * F, _mm256_and_ps(mask0, _mm256_loadu_ps(b + 0 * F))); - _mm256_storeu_ps(pB + 1 * F, _mm256_and_ps(mask1, _mm256_loadu_ps(b + 1 * F))); - pB += microN; - } - } - } - else if (microN == 3 * F) - { - if (n == microN) - { - for (; k < K; ++k) - { - const float * b = B + k * ldb; - _mm256_storeu_ps(pB + 0 * F, _mm256_loadu_ps(b + 0 * F)); - _mm256_storeu_ps(pB + 1 * F, _mm256_loadu_ps(b + 1 * F)); - _mm256_storeu_ps(pB + 2 * F, _mm256_loadu_ps(b + 2 * F)); - pB += microN; - } - } - else - { - __m256 mask0 = Avx::LeftNotZero32f(n - 0 * F); - __m256 mask1 = Avx::LeftNotZero32f(n - 1 * F); - __m256 mask2 = Avx::LeftNotZero32f(n - 2 * F); - for (; k < K - 1; ++k) - { - const float * b = B + k * ldb; - _mm256_storeu_ps(pB + 0 * F, _mm256_and_ps(mask0, _mm256_loadu_ps(b + 0 * F))); - _mm256_storeu_ps(pB + 1 * F, _mm256_and_ps(mask1, _mm256_loadu_ps(b + 1 * F))); - _mm256_storeu_ps(pB + 2 * F, _mm256_and_ps(mask2, _mm256_loadu_ps(b + 2 * F))); 
- pB += microN; - } - } - } - for (; k < K; ++k) - { - const float * b = B + k * ldb; - size_t c = 0; - for (; c < n; ++c) - *(pB++) = *(b++); - for (; c < microN; ++c) - *(pB++) = 0; - } - B += microN; - } - } - - SIMD_INLINE void ScaleC(float * C, __m256 beta) - { - _mm256_storeu_ps(C, _mm256_mul_ps(_mm256_loadu_ps(C), beta)); - } - - void GemmScaleC(size_t M, size_t N, float beta, float * C, size_t ldc) - { - if (beta == 1.0f) - return; - else if (beta == 0.0f) - { - for (size_t i = 0; i < M; ++i) - memset(C + i * ldc, 0, N * sizeof(float)); - } - else - { - size_t NQF = AlignLo(N, QF); - size_t NF = AlignLo(N, F); - __m256 _beta = _mm256_set1_ps(beta); - for (size_t i = 0; i < M; ++i) - { - size_t j = 0; - for (; j < NQF; j += QF) - { - ScaleC(C + j + F * 0, _beta); - ScaleC(C + j + F * 1, _beta); - ScaleC(C + j + F * 2, _beta); - ScaleC(C + j + F * 3, _beta); - } - for (; j < NF; j += F) - ScaleC(C + j, _beta); - for (; j < N; ++j) - C[j] *= beta; - C += ldc; - } - } - } - - void Gemm32fNN(size_t M, size_t N, size_t K, const float * alpha, const float * A, size_t lda, const float * B, size_t ldb, const float * beta, float * C, size_t ldc) - { - typedef Simd::GemmNN GemmNN; - GemmNN::Main kernelMM, kernelMT; - GemmNN::Tail kernelTM, kernelTT; - size_t microM, microN, L1, L2; -#ifdef SIMD_X64_ENABLE - if (N < K) - { - microM = 6; - microN = 16; - size_t tail = N - AlignLoAny(N, microN); - kernelMM = GemmKernel6x16nn; - kernelMT = tail > F ? GemmKernel6x16nn : GemmKernel6x8nn; - kernelTM = GemmKernelMx16nn; - kernelTT = tail > F ? GemmKernelMx16nn : GemmKernelMx8nn; - } - else - { - microM = 4; - microN = 24; - size_t tail = N - AlignLoAny(N, microN); - kernelMM = GemmKernel4x24nn; - kernelMT = tail > DF ? GemmKernel4x24nn : (tail > F ? GemmKernel4x16nn : GemmKernel4x8nn); - kernelTM = GemmKernelMx24nn; - kernelTT = tail > DF ? GemmKernelMx24nn : (tail > F ? GemmKernelMx16nn : GemmKernelMx8nn); - } -#else - microM = 4; - microN = 8; - kernelMM = GemmKernel4x8nn; - kernelMT = GemmKernel4x8nn; - kernelTM = GemmKernelMx8nn; - kernelTT = GemmKernelMx8nn; -#endif - GemmNN::PackA packA = NULL; - L1 = N > 4096 ? Base::AlgCacheL2() : Base::AlgCacheL1(); - L2 = N > 4096 ? Base::AlgCacheL3() : Base::AlgCacheL2(); - GemmNN gemmNN(M, N, K, microM, microN, L1, L2, Base::AlgCacheL3(), F, - kernelMM, kernelMT, kernelTM, kernelTT, packA, Avx::GemmPackB, Avx::GemmScaleC, NULL); - gemmNN.Run(alpha, A, lda, B, ldb, beta, C, ldc); - } - - //--------------------------------------------------------------------- - - typedef Simd::GemmNNcb Gemm32fNNcb; - - SIMD_INLINE Gemm32fNNcb CreateGemm32fNNcb(size_t M, size_t N, size_t K, GemmKernelType type, bool compatibility) - { - Gemm32fNNcb::Main kernelMM, kernelMT; - Gemm32fNNcb::Tail kernelTM, kernelTT; - size_t microM, microN; -#ifdef SIMD_X64_ENABLE - if (type == GemmKernelF3 || (type == GemmKernelAny && (M == 4 || M == 8 || M == 16) && N > 16)) - { - microM = 4; - microN = 24; - size_t tail = N - AlignLoAny(N, microN); - kernelMM = Avx::GemmKernel4x24nn; - kernelMT = tail > DF ? Avx::GemmKernel4x24nn : (tail > F ? Avx::GemmKernel4x16nn : Avx::GemmKernel4x8nn); - kernelTM = Avx::GetGemmTail(M%microM, microN); - kernelTT = Avx::GetGemmTail(M%microM, tail); - type = GemmKernelF3; - } - if (type == GemmKernelF2 || (type == GemmKernelF3 && N <= 16) || (type == GemmKernelAny && N > 8)) - { - microM = 6; - microN = 16; - size_t tail = N - AlignLoAny(N, microN); - kernelMM = Avx::GemmKernel6x16nn; - kernelMT = tail > F ? 
Avx::GemmKernel6x16nn : Avx::GemmKernel6x8nn; - kernelTM = Avx::GetGemmTail(M%microM, microN); - kernelTT = Avx::GetGemmTail(M%microM, tail); - type = GemmKernelF2; - } - if (type == GemmKernelF1 || (type == GemmKernelF2 && N <= 8) || type == GemmKernelAny) - { - microM = 6; - microN = 8; - kernelMM = Avx::GemmKernel6x8nn; - kernelMT = Avx::GemmKernel6x8nn; - kernelTM = Avx::GetGemmTail(M%microM, microN); - kernelTT = Avx::GetGemmTail(M%microM, microN); - type = GemmKernelF1; - } -#else - microM = 4; - microN = 8; - kernelMM = Avx::GemmKernel4x8nn; - kernelMT = Avx::GemmKernel4x8nn; - kernelTM = Avx::GetGemmTail(M%microM, microN); - kernelTT = Avx::GetGemmTail(M%microM, microN); -#endif - return Gemm32fNNcb(M, N, K, microM, microN, Base::AlgCacheL1(), Base::AlgCacheL2(), Base::AlgCacheL3(), F, - kernelMM, kernelMT, kernelTM, kernelTT, NULL, Avx::GemmPackB, Avx::GemmScaleC, NULL, compatibility); - } - - size_t Gemm32fNNcbBufferSize(size_t M, size_t N, size_t K, GemmKernelType type, bool compatibility) - { - Gemm32fNNcb gemm = CreateGemm32fNNcb(M, N, K, type, compatibility); - return gemm.BufferSize(); - } - - void Gemm32fNNcbReorderB(size_t M, size_t N, size_t K, const float * B, float * pB, GemmKernelType type, bool compatibility) - { - Gemm32fNNcb gemm = CreateGemm32fNNcb(M, N, K, type, compatibility); - gemm.ReorderB(B, N, pB); - } - - void Gemm32fNNcbRun(size_t M, size_t N, size_t K, const float * A, const float * pB, float * C, GemmKernelType type, bool compatibility) - { - Gemm32fNNcb gemm = CreateGemm32fNNcb(M, N, K, type, compatibility); - gemm.Run(A, K, pB, C, N); - } - - //--------------------------------------------------------------------- - - SIMD_INLINE __m256 Tail(size_t tail) - { - const int32_t mask[DF] = { 0, 0, 0, 0, 0, 0, 0, 0 , -1, -1, -1, -1, -1, -1, -1, -1 }; - return _mm256_loadu_ps((float*)(mask + tail)); - } - - SIMD_INLINE void Add4ExtractedSums(const __m256 & sum0, const __m256 & sum1, const __m256 & sum2, const __m256 & sum3, const __m128 & alpha, float * dst) - { - __m256 sum256 = _mm256_hadd_ps(_mm256_hadd_ps(sum0, sum1), _mm256_hadd_ps(sum2, sum3)); - __m128 sum128 = _mm_add_ps(_mm256_extractf128_ps(sum256, 0), _mm256_extractf128_ps(sum256, 1)); - _mm_storeu_ps(dst, _mm_add_ps(_mm_loadu_ps(dst), _mm_mul_ps(alpha, sum128))); - } - - static void Kernel1x1x8nt(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc) - { - size_t K8 = K & (~7); - const float * A0 = A + 0 * lda; - const float * B0 = B + 0 * ldb; - __m256 c00 = _mm256_setzero_ps(); - __m256 a0, b0; - for (size_t k = 0; k < K8; k += 8) - { - a0 = _mm256_loadu_ps(A0 + k); - b0 = _mm256_loadu_ps(B0 + k); - c00 = _mm256_add_ps(c00, _mm256_mul_ps(a0, b0)); - } - if (K8 < K) - { - size_t k = K - 8; - __m256 tail = Tail(K - K8); - a0 = _mm256_and_ps(tail, _mm256_loadu_ps(A0 + k)); - b0 = _mm256_loadu_ps(B0 + k); - c00 = _mm256_add_ps(c00, _mm256_mul_ps(a0, b0)); - } - C[0] += alpha * Avx::ExtractSum(c00); - } - - static void Kernel1x4x8nt(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc) - { - size_t K8 = K & (~7); - const float * A0 = A + 0 * lda; - const float * B0 = B + 0 * ldb; - const float * B1 = B + 1 * ldb; - const float * B2 = B + 2 * ldb; - const float * B3 = B + 3 * ldb; - __m256 c00 = _mm256_setzero_ps(); - __m256 c01 = _mm256_setzero_ps(); - __m256 c02 = _mm256_setzero_ps(); - __m256 c03 = _mm256_setzero_ps(); - __m256 a0, b0; - for (size_t k = 0; k < K8; k += 8) - { - a0 = _mm256_loadu_ps(A0 
+ k); - b0 = _mm256_loadu_ps(B0 + k); - c00 = _mm256_add_ps(c00, _mm256_mul_ps(a0, b0)); - b0 = _mm256_loadu_ps(B1 + k); - c01 = _mm256_add_ps(c01, _mm256_mul_ps(a0, b0)); - b0 = _mm256_loadu_ps(B2 + k); - c02 = _mm256_add_ps(c02, _mm256_mul_ps(a0, b0)); - b0 = _mm256_loadu_ps(B3 + k); - c03 = _mm256_add_ps(c03, _mm256_mul_ps(a0, b0)); - } - if (K8 < K) - { - size_t k = K - 8; - __m256 tail = Tail(K - K8); - a0 = _mm256_and_ps(tail, _mm256_loadu_ps(A0 + k)); - b0 = _mm256_loadu_ps(B0 + k); - c00 = _mm256_add_ps(c00, _mm256_mul_ps(a0, b0)); - b0 = _mm256_loadu_ps(B1 + k); - c01 = _mm256_add_ps(c01, _mm256_mul_ps(a0, b0)); - b0 = _mm256_loadu_ps(B2 + k); - c02 = _mm256_add_ps(c02, _mm256_mul_ps(a0, b0)); - b0 = _mm256_loadu_ps(B3 + k); - c03 = _mm256_add_ps(c03, _mm256_mul_ps(a0, b0)); - } - __m128 _alpha = _mm_set1_ps(alpha); - Add4ExtractedSums(c00, c01, c02, c03, _alpha, C + 0 * ldc); - } - - static void Kernel2x1x8nt(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc) - { - size_t K8 = K & (~7); - const float * A0 = A + 0 * lda; - const float * A1 = A + 1 * lda; - const float * B0 = B + 0 * ldb; - __m256 c00 = _mm256_setzero_ps(); - __m256 c10 = _mm256_setzero_ps(); - __m256 a0, a1, b0; - for (size_t k = 0; k < K8; k += 8) - { - a0 = _mm256_loadu_ps(A0 + k); - a1 = _mm256_loadu_ps(A1 + k); - b0 = _mm256_loadu_ps(B0 + k); - c00 = _mm256_add_ps(c00, _mm256_mul_ps(a0, b0)); - c10 = _mm256_add_ps(c10, _mm256_mul_ps(a1, b0)); - } - if (K8 < K) - { - size_t k = K - 8; - __m256 tail = Tail(K - K8); - a0 = _mm256_and_ps(tail, _mm256_loadu_ps(A0 + k)); - a1 = _mm256_and_ps(tail, _mm256_loadu_ps(A1 + k)); - b0 = _mm256_loadu_ps(B0 + k); - c00 = _mm256_add_ps(c00, _mm256_mul_ps(a0, b0)); - c10 = _mm256_add_ps(c10, _mm256_mul_ps(a1, b0)); - } - C[0 * ldc] += alpha * Avx::ExtractSum(c00); - C[1 * ldc] += alpha * Avx::ExtractSum(c10); - } - - static void Kernel2x4x8nt(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc) - { - size_t K8 = K & (~7); - const float * A0 = A + 0 * lda; - const float * A1 = A + 1 * lda; - const float * B0 = B + 0 * ldb; - const float * B1 = B + 1 * ldb; - const float * B2 = B + 2 * ldb; - const float * B3 = B + 3 * ldb; - __m256 c00 = _mm256_setzero_ps(); - __m256 c01 = _mm256_setzero_ps(); - __m256 c02 = _mm256_setzero_ps(); - __m256 c03 = _mm256_setzero_ps(); - __m256 c10 = _mm256_setzero_ps(); - __m256 c11 = _mm256_setzero_ps(); - __m256 c12 = _mm256_setzero_ps(); - __m256 c13 = _mm256_setzero_ps(); - __m256 a0, a1, b0; - for (size_t k = 0; k < K8; k += 8) - { - a0 = _mm256_loadu_ps(A0 + k); - a1 = _mm256_loadu_ps(A1 + k); - b0 = _mm256_loadu_ps(B0 + k); - c00 = _mm256_add_ps(c00, _mm256_mul_ps(a0, b0)); - c10 = _mm256_add_ps(c10, _mm256_mul_ps(a1, b0)); - b0 = _mm256_loadu_ps(B1 + k); - c01 = _mm256_add_ps(c01, _mm256_mul_ps(a0, b0)); - c11 = _mm256_add_ps(c11, _mm256_mul_ps(a1, b0)); - b0 = _mm256_loadu_ps(B2 + k); - c02 = _mm256_add_ps(c02, _mm256_mul_ps(a0, b0)); - c12 = _mm256_add_ps(c12, _mm256_mul_ps(a1, b0)); - b0 = _mm256_loadu_ps(B3 + k); - c03 = _mm256_add_ps(c03, _mm256_mul_ps(a0, b0)); - c13 = _mm256_add_ps(c13, _mm256_mul_ps(a1, b0)); - } - if (K8 < K) - { - size_t k = K - 8; - __m256 tail = Tail(K - K8); - a0 = _mm256_and_ps(tail, _mm256_loadu_ps(A0 + k)); - a1 = _mm256_and_ps(tail, _mm256_loadu_ps(A1 + k)); - b0 = _mm256_loadu_ps(B0 + k); - c00 = _mm256_add_ps(c00, _mm256_mul_ps(a0, b0)); - c10 = _mm256_add_ps(c10, _mm256_mul_ps(a1, b0)); - b0 = 
_mm256_loadu_ps(B1 + k); - c01 = _mm256_add_ps(c01, _mm256_mul_ps(a0, b0)); - c11 = _mm256_add_ps(c11, _mm256_mul_ps(a1, b0)); - b0 = _mm256_loadu_ps(B2 + k); - c02 = _mm256_add_ps(c02, _mm256_mul_ps(a0, b0)); - c12 = _mm256_add_ps(c12, _mm256_mul_ps(a1, b0)); - b0 = _mm256_loadu_ps(B3 + k); - c03 = _mm256_add_ps(c03, _mm256_mul_ps(a0, b0)); - c13 = _mm256_add_ps(c13, _mm256_mul_ps(a1, b0)); - } - __m128 _alpha = _mm_set1_ps(alpha); - Add4ExtractedSums(c00, c01, c02, c03, _alpha, C + 0 * ldc); - Add4ExtractedSums(c10, c11, c12, c13, _alpha, C + 1 * ldc); - } - - static void Kernel3x1x8nt(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc) - { - size_t K8 = K & (~7); - const float * A0 = A + 0 * lda; - const float * A1 = A + 1 * lda; - const float * A2 = A + 2 * lda; - const float * B0 = B + 0 * ldb; - __m256 c00 = _mm256_setzero_ps(); - __m256 c10 = _mm256_setzero_ps(); - __m256 c20 = _mm256_setzero_ps(); - __m256 a0, a1, a2, b0; - for (size_t k = 0; k < K8; k += 8) - { - a0 = _mm256_loadu_ps(A0 + k); - a1 = _mm256_loadu_ps(A1 + k); - a2 = _mm256_loadu_ps(A2 + k); - b0 = _mm256_loadu_ps(B0 + k); - c00 = _mm256_add_ps(c00, _mm256_mul_ps(a0, b0)); - c10 = _mm256_add_ps(c10, _mm256_mul_ps(a1, b0)); - c20 = _mm256_add_ps(c20, _mm256_mul_ps(a2, b0)); - } - if (K8 < K) - { - size_t k = K - 8; - __m256 tail = Tail(K - K8); - a0 = _mm256_and_ps(tail, _mm256_loadu_ps(A0 + k)); - a1 = _mm256_and_ps(tail, _mm256_loadu_ps(A1 + k)); - a2 = _mm256_and_ps(tail, _mm256_loadu_ps(A2 + k)); - b0 = _mm256_loadu_ps(B0 + k); - c00 = _mm256_add_ps(c00, _mm256_mul_ps(a0, b0)); - c10 = _mm256_add_ps(c10, _mm256_mul_ps(a1, b0)); - c20 = _mm256_add_ps(c20, _mm256_mul_ps(a2, b0)); - } - C[0 * ldc] += alpha * Avx::ExtractSum(c00); - C[1 * ldc] += alpha * Avx::ExtractSum(c10); - C[2 * ldc] += alpha * Avx::ExtractSum(c20); - } - - static void Kernel3x4x8nt(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc) - { - size_t K8 = K & (~7); - const float * A0 = A + 0 * lda; - const float * A1 = A + 1 * lda; - const float * A2 = A + 2 * lda; - const float * B0 = B + 0 * ldb; - const float * B1 = B + 1 * ldb; - const float * B2 = B + 2 * ldb; - const float * B3 = B + 3 * ldb; - __m256 c00 = _mm256_setzero_ps(); - __m256 c01 = _mm256_setzero_ps(); - __m256 c02 = _mm256_setzero_ps(); - __m256 c03 = _mm256_setzero_ps(); - __m256 c10 = _mm256_setzero_ps(); - __m256 c11 = _mm256_setzero_ps(); - __m256 c12 = _mm256_setzero_ps(); - __m256 c13 = _mm256_setzero_ps(); - __m256 c20 = _mm256_setzero_ps(); - __m256 c21 = _mm256_setzero_ps(); - __m256 c22 = _mm256_setzero_ps(); - __m256 c23 = _mm256_setzero_ps(); - __m256 a0, a1, a2, b0; - for (size_t k = 0; k < K8; k += 8) - { - a0 = _mm256_loadu_ps(A0 + k); - a1 = _mm256_loadu_ps(A1 + k); - a2 = _mm256_loadu_ps(A2 + k); - b0 = _mm256_loadu_ps(B0 + k); - c00 = _mm256_add_ps(c00, _mm256_mul_ps(a0, b0)); - c10 = _mm256_add_ps(c10, _mm256_mul_ps(a1, b0)); - c20 = _mm256_add_ps(c20, _mm256_mul_ps(a2, b0)); - b0 = _mm256_loadu_ps(B1 + k); - c01 = _mm256_add_ps(c01, _mm256_mul_ps(a0, b0)); - c11 = _mm256_add_ps(c11, _mm256_mul_ps(a1, b0)); - c21 = _mm256_add_ps(c21, _mm256_mul_ps(a2, b0)); - b0 = _mm256_loadu_ps(B2 + k); - c02 = _mm256_add_ps(c02, _mm256_mul_ps(a0, b0)); - c12 = _mm256_add_ps(c12, _mm256_mul_ps(a1, b0)); - c22 = _mm256_add_ps(c22, _mm256_mul_ps(a2, b0)); - b0 = _mm256_loadu_ps(B3 + k); - c03 = _mm256_add_ps(c03, _mm256_mul_ps(a0, b0)); - c13 = _mm256_add_ps(c13, 
_mm256_mul_ps(a1, b0)); - c23 = _mm256_add_ps(c23, _mm256_mul_ps(a2, b0)); - } - if (K8 < K) - { - size_t k = K - 8; - __m256 tail = Tail(K - K8); - a0 = _mm256_and_ps(tail, _mm256_loadu_ps(A0 + k)); - a1 = _mm256_and_ps(tail, _mm256_loadu_ps(A1 + k)); - a2 = _mm256_and_ps(tail, _mm256_loadu_ps(A2 + k)); - b0 = _mm256_loadu_ps(B0 + k); - c00 = _mm256_add_ps(c00, _mm256_mul_ps(a0, b0)); - c10 = _mm256_add_ps(c10, _mm256_mul_ps(a1, b0)); - c20 = _mm256_add_ps(c20, _mm256_mul_ps(a2, b0)); - b0 = _mm256_loadu_ps(B1 + k); - c01 = _mm256_add_ps(c01, _mm256_mul_ps(a0, b0)); - c11 = _mm256_add_ps(c11, _mm256_mul_ps(a1, b0)); - c21 = _mm256_add_ps(c21, _mm256_mul_ps(a2, b0)); - b0 = _mm256_loadu_ps(B2 + k); - c02 = _mm256_add_ps(c02, _mm256_mul_ps(a0, b0)); - c12 = _mm256_add_ps(c12, _mm256_mul_ps(a1, b0)); - c22 = _mm256_add_ps(c22, _mm256_mul_ps(a2, b0)); - b0 = _mm256_loadu_ps(B3 + k); - c03 = _mm256_add_ps(c03, _mm256_mul_ps(a0, b0)); - c13 = _mm256_add_ps(c13, _mm256_mul_ps(a1, b0)); - c23 = _mm256_add_ps(c23, _mm256_mul_ps(a2, b0)); - } - __m128 _alpha = _mm_set1_ps(alpha); - Add4ExtractedSums(c00, c01, c02, c03, _alpha, C + 0 * ldc); - Add4ExtractedSums(c10, c11, c12, c13, _alpha, C + 1 * ldc); - Add4ExtractedSums(c20, c21, c22, c23, _alpha, C + 2 * ldc); - } - - void Gemm32fNT(size_t M, size_t N, size_t K, const float * alpha, const float * A, size_t lda, const float * B, size_t ldb, const float * beta, float * C, size_t ldc) - { - typedef Simd::GemmNT GemmNT; -#ifdef SIMD_X64_ENABLE - GemmNT gemmNT(M, N, K, Base::AlgCacheL1(), Base::AlgCacheL2(), Base::AlgCacheL3(), F, Avx::GemmScaleC, - Kernel1x1x8nt, Kernel1x4x8nt, Kernel2x1x8nt, Kernel2x4x8nt, Kernel3x1x8nt, Kernel3x4x8nt, NULL, NULL); -#else - GemmNT gemmNT(M, N, K, Base::AlgCacheL1(), Base::AlgCacheL2(), Base::AlgCacheL3(), F, Sse::GemmScaleC, - Kernel1x1x8nt, Kernel1x4x8nt, NULL, NULL, NULL, NULL, NULL, NULL); -#endif - gemmNT.Run(alpha, A, lda, B, ldb, beta, C, ldc); - } - } -#endif// SIMD_AVX_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx1HogLite.cpp b/src/3rd/Simd/Simd/SimdAvx1HogLite.cpp deleted file mode 100644 index 0d19d02d..00000000 --- a/src/3rd/Simd/Simd/SimdAvx1HogLite.cpp +++ /dev/null @@ -1,495 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdStore.h" -#include "Simd/SimdBase.h" -#include "Simd/SimdCompare.h" -#include "Simd/SimdArray.h" -#include "Simd/SimdExtract.h" -#include "Simd/SimdUpdate.h" - -namespace Simd -{ -#ifdef SIMD_AVX_ENABLE - namespace Avx - { - class HogLiteFeatureFilter - { - template SIMD_INLINE void ProductSum1x1(const float * src, const float * filter, __m256 & sum) - { - __m256 _src = Load(src); - __m256 _filter = Load(filter); - sum = _mm256_add_ps(sum, _mm256_mul_ps(_src, _filter)); - } - - template SIMD_INLINE void ProductSum1x4(const float * src, const float * filter, __m256 * sums) - { - __m256 _filter = Load(filter); - sums[0] = _mm256_add_ps(sums[0], _mm256_mul_ps(Load(src + 0 * step), _filter)); - sums[1] = _mm256_add_ps(sums[1], _mm256_mul_ps(Load(src + 1 * step), _filter)); - sums[2] = _mm256_add_ps(sums[2], _mm256_mul_ps(Load(src + 2 * step), _filter)); - sums[3] = _mm256_add_ps(sums[3], _mm256_mul_ps(Load(src + 3 * step), _filter)); - } - - template void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, const float * filter, size_t filterWidth, size_t filterHeight, float * dst, size_t dstStride) - { - size_t filterStride = featureSize * filterWidth; - size_t alignedDstWidth = AlignLo(dstWidth, 4); - for (size_t dstRow = 0; dstRow < dstHeight; ++dstRow) - { - size_t dstCol = 0; - for (; dstCol < alignedDstWidth; dstCol += 4) - { - __m256 sums[4] = { _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps() }; - const float * pSrc = src + dstRow * srcStride + dstCol * featureSize; - const float * pFilter = filter; - for (size_t filterRow = 0; filterRow < filterHeight; ++filterRow) - { - size_t filterCol = 0; - for (; filterCol < filterStride; filterCol += F) - ProductSum1x4(pSrc + filterCol, pFilter + filterCol, sums); - pSrc += srcStride; - pFilter += filterStride; - } - _mm_storeu_ps(dst + dstCol, Avx::Extract4Sums(sums)); - } - for (; dstCol < dstWidth; ++dstCol) - { - __m256 sum = _mm256_setzero_ps(); - const float * pSrc = src + dstRow * srcStride + dstCol * featureSize; - const float * pFilter = filter; - for (size_t filterRow = 0; filterRow < filterHeight; ++filterRow) - { - for (size_t filterCol = 0; filterCol < filterStride; filterCol += F) - ProductSum1x1(pSrc + filterCol, pFilter + filterCol, sum); - pSrc += srcStride; - pFilter += filterStride; - } - dst[dstCol] = Avx::ExtractSum(sum); - } - dst += dstStride; - } - } - - template void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) - { - size_t filterStride = featureSize * filterWidth; - size_t alignedDstWidth = AlignLo(dstWidth, 4); - __m128 _min = _mm_set1_ps(-FLT_MAX); - for (size_t dstRow = 0; dstRow < dstHeight; ++dstRow) - { - size_t dstCol = 0; - for (; dstCol < alignedDstWidth; dstCol += 4) - { - __m128 _mask = _mm_castsi128_ps(_mm_loadu_si128((__m128i*)(mask + dstCol))); - if (Sse41::TestZ(_mask)) - _mm_storeu_ps(dst + dstCol, _min); - else - { - __m256 sums[4] = { _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps() }; - const float * pSrc = src + dstRow * srcStride + dstCol * featureSize; - const float * pFilter = filter; - for (size_t filterRow = 0; filterRow < filterHeight; ++filterRow) - { - size_t filterCol = 0; - for (; filterCol < filterStride; filterCol += F) - ProductSum1x4(pSrc + filterCol, pFilter + filterCol, sums); - pSrc += srcStride; - 
pFilter += filterStride; - } - _mm_storeu_ps(dst + dstCol, _mm_blendv_ps(_min, Avx::Extract4Sums(sums), _mask)); - } - } - for (; dstCol < dstWidth; ++dstCol) - { - if (mask[dstCol]) - { - __m256 sum = _mm256_setzero_ps(); - const float * pSrc = src + dstRow * srcStride + dstCol * featureSize; - const float * pFilter = filter; - for (size_t filterRow = 0; filterRow < filterHeight; ++filterRow) - { - for (size_t filterCol = 0; filterCol < filterStride; filterCol += F) - ProductSum1x1(pSrc + filterCol, pFilter + filterCol, sum); - pSrc += srcStride; - pFilter += filterStride; - } - dst[dstCol] = Avx::ExtractSum(sum); - } - else - dst[dstCol] = -FLT_MAX; - } - dst += dstStride; - mask += maskStride; - } - } - - template void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, float * dst, size_t dstStride) - { - if (featureSize == 16) - Filter(src, srcStride, dstWidth, dstHeight, filter, filterWidth, filterHeight, dst, dstStride); - else - Filter(src, srcStride, dstWidth, dstHeight, filter, filterWidth, filterHeight, dst, dstStride); - } - - template void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) - { - if (featureSize == 16) - Filter(src, srcStride, dstWidth, dstHeight, filter, filterWidth, filterHeight, mask, maskStride, dst, dstStride); - else - Filter(src, srcStride, dstWidth, dstHeight, filter, filterWidth, filterHeight, mask, maskStride, dst, dstStride); - } - - public: - - void Run(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) - { - assert(featureSize == 8 || featureSize == 16); - assert(srcWidth >= filterWidth && srcHeight >= filterHeight); - - size_t dstWidth = srcWidth - filterWidth + 1; - size_t dstHeight = srcHeight - filterHeight + 1; - - if (mask) - { - if (Aligned(src) && Aligned(srcStride) && Aligned(filter)) - Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterWidth, filterHeight, mask, maskStride, dst, dstStride); - else - Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterWidth, filterHeight, mask, maskStride, dst, dstStride); - } - else - { - if (Aligned(src) && Aligned(srcStride) && Aligned(filter)) - Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterWidth, filterHeight, dst, dstStride); - else - Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterWidth, filterHeight, dst, dstStride); - } - } - }; - - void HogLiteFilterFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) - { - HogLiteFeatureFilter featureFilter; - featureFilter.Run(src, srcStride, srcWidth, srcHeight, featureSize, filter, filterWidth, filterHeight, mask, maskStride, dst, dstStride); - } - - namespace HogLiteFeatureResizerDetail - { - template struct Feature - { - template static SIMD_INLINE void Interpolate(const float * src0, const float * src1, const __m256 k[2][2], float * dst); - }; - - template <> struct Feature<8> - { - template static SIMD_INLINE void 
Interpolate(const float * src0, const float * src1, const __m256 k[2][2], float * dst) - { - Store(dst + 0 * F, _mm256_add_ps( - _mm256_add_ps(_mm256_mul_ps(Load(src0 + 0 * F), k[0][0]), _mm256_mul_ps(Load(src0 + 1 * F), k[0][1])), - _mm256_add_ps(_mm256_mul_ps(Load(src1 + 0 * F), k[1][0]), _mm256_mul_ps(Load(src1 + 1 * F), k[1][1])))); - } - }; - - template <> struct Feature<16> - { - template static SIMD_INLINE void Interpolate(const float * src0, const float * src1, const __m256 k[2][2], float * dst) - { - Store(dst + 0 * F, _mm256_add_ps( - _mm256_add_ps(_mm256_mul_ps(Load(src0 + 0 * F), k[0][0]), _mm256_mul_ps(Load(src0 + 2 * F), k[0][1])), - _mm256_add_ps(_mm256_mul_ps(Load(src1 + 0 * F), k[1][0]), _mm256_mul_ps(Load(src1 + 2 * F), k[1][1])))); - Store(dst + 1 * F, _mm256_add_ps( - _mm256_add_ps(_mm256_mul_ps(Load(src0 + 1 * F), k[0][0]), _mm256_mul_ps(Load(src0 + 3 * F), k[0][1])), - _mm256_add_ps(_mm256_mul_ps(Load(src1 + 1 * F), k[1][0]), _mm256_mul_ps(Load(src1 + 3 * F), k[1][1])))); - } - }; - } - - - class HogLiteFeatureResizer - { - typedef Array Ints; - typedef Array Floats; - - Ints _iy, _ix; - Floats _ky, _kx; - - void InitIndexWeight(size_t srcSize, size_t dstSize, size_t dstStep, Ints & indexes, Floats & weights) - { - indexes.Resize(dstSize); - weights.Resize(dstSize); - - float scale = float(srcSize) / float(dstSize); - for (size_t i = 0; i < dstSize; ++i) - { - float weight = (float)((i + 0.5f)*scale - 0.5f); - int index = (int)::floor(weight); - weight -= index; - if (index < 0) - { - index = 0; - weight = 0.0f; - } - if (index > (int)srcSize - 2) - { - index = (int)srcSize - 2; - weight = 1.0f; - } - indexes[i] = int(index*dstStep); - weights[i] = weight; - } - } - - template void Resize(const float * src, size_t srcStride, float * dst, size_t dstStride, size_t dstWidth, size_t dstHeight) - { - __m256 _1 = _mm256_set1_ps(1.0f); - for (size_t rowDst = 0; rowDst < dstHeight; ++rowDst) - { - __m256 ky1 = _mm256_set1_ps(_ky[rowDst]); - __m256 ky0 = _mm256_sub_ps(_1, ky1); - const float * pSrc = src + _iy[rowDst]; - float * pDst = dst + rowDst * dstStride; - for (size_t colDst = 0; colDst < dstWidth; ++colDst, pDst += featureSize) - { - __m256 kx1 = _mm256_set1_ps(_kx[colDst]); - __m256 kx0 = _mm256_sub_ps(_1, kx1); - __m256 k[2][2]; - k[0][0] = _mm256_mul_ps(ky0, kx0); - k[0][1] = _mm256_mul_ps(ky0, kx1); - k[1][0] = _mm256_mul_ps(ky1, kx0); - k[1][1] = _mm256_mul_ps(ky1, kx1); - const float * pSrc0 = pSrc + _ix[colDst]; - const float * pSrc1 = pSrc0 + srcStride; - HogLiteFeatureResizerDetail::Feature:: template Interpolate(pSrc0, pSrc1, k, pDst); - } - } - } - - template void Resize(const float * src, size_t srcStride, size_t featureSize, float * dst, size_t dstStride, size_t dstWidth, size_t dstHeight) - { - if (featureSize == 8) - Resize(src, srcStride, dst, dstStride, dstWidth, dstHeight); - else - Resize(src, srcStride, dst, dstStride, dstWidth, dstHeight); - } - - public: - void Run(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, float * dst, size_t dstStride, size_t dstWidth, size_t dstHeight) - { - assert(featureSize == 8 || featureSize == 16); - - if (srcWidth == dstWidth && srcHeight == dstHeight) - { - size_t size = sizeof(float)*srcWidth*featureSize; - for (size_t row = 0; row < dstHeight; ++row) - memcpy(dst + row * dstStride, src + row * srcStride, size); - return; - } - - InitIndexWeight(srcWidth, dstWidth, featureSize, _ix, _kx); - InitIndexWeight(srcHeight, dstHeight, srcStride, _iy, _ky); - - if (Aligned(src) 
&& Aligned(dst)) - Resize(src, srcStride, featureSize, dst, dstStride, dstWidth, dstHeight); - else - Resize(src, srcStride, featureSize, dst, dstStride, dstWidth, dstHeight); - } - }; - - void HogLiteResizeFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, float * dst, size_t dstStride, size_t dstWidth, size_t dstHeight) - { - HogLiteFeatureResizer featureResizer; - featureResizer.Run(src, srcStride, srcWidth, srcHeight, featureSize, dst, dstStride, dstWidth, dstHeight); - } - - template void HogLiteCompressFeatures(const float * src, size_t srcStride, size_t width, size_t height, const float * pca, float * dst, size_t dstStride) - { - for (size_t row = 0; row < height; ++row) - { - const float * s = src; - float * d = dst; - for (size_t col = 0; col < width; ++col) - { - const float * p = pca; - for (size_t i = 0; i < 8; i += 4, p += 64) - { - __m256 sums[4] = { _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps() }; - for (size_t j = 0; j < 16; j += F) - { - __m256 _s = Load(s + j); - sums[0] = _mm256_add_ps(sums[0], _mm256_mul_ps(_s, Load(p + j + 00))); - sums[1] = _mm256_add_ps(sums[1], _mm256_mul_ps(_s, Load(p + j + 16))); - sums[2] = _mm256_add_ps(sums[2], _mm256_mul_ps(_s, Load(p + j + 32))); - sums[3] = _mm256_add_ps(sums[3], _mm256_mul_ps(_s, Load(p + j + 48))); - } - __m256 sum = _mm256_hadd_ps(_mm256_hadd_ps(sums[0], sums[1]), _mm256_hadd_ps(sums[2], sums[3])); - Sse::Store(d + i, _mm_add_ps(_mm256_castps256_ps128(sum), _mm256_extractf128_ps(sum, 1))); - } - s += 16; - d += 8; - } - src += srcStride; - dst += dstStride; - } - - } - - void HogLiteCompressFeatures(const float * src, size_t srcStride, size_t width, size_t height, const float * pca, float * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(pca) && Aligned(dst)) - HogLiteCompressFeatures(src, srcStride, width, height, pca, dst, dstStride); - else - HogLiteCompressFeatures(src, srcStride, width, height, pca, dst, dstStride); - } - - class HogLiteSeparableFilter - { - size_t _dstWidth, _dstHeight, _dstStride; - Array32f _buffer; - Array256f _filter; - - void Init(size_t srcWidth, size_t srcHeight, size_t hSize, size_t vSize) - { - _dstWidth = srcWidth - hSize + 1; - _dstStride = AlignHi(_dstWidth, F); - _dstHeight = srcHeight - vSize + 1; - _buffer.Resize(_dstStride*srcHeight); - } - - template static SIMD_INLINE void FilterHx1(const float * src, const float * filter, __m256 & sum) - { - __m256 _src = Avx::Load(src); - __m256 _filter = Avx::Load(filter); - sum = _mm256_add_ps(sum, _mm256_mul_ps(_src, _filter)); - } - - template static SIMD_INLINE void FilterHx4(const float * src, const float * filter, __m256 * sums) - { - __m256 _filter = Avx::Load(filter); - sums[0] = _mm256_add_ps(sums[0], _mm256_mul_ps(Avx::Load(src + 0 * step), _filter)); - sums[1] = _mm256_add_ps(sums[1], _mm256_mul_ps(Avx::Load(src + 1 * step), _filter)); - sums[2] = _mm256_add_ps(sums[2], _mm256_mul_ps(Avx::Load(src + 2 * step), _filter)); - sums[3] = _mm256_add_ps(sums[3], _mm256_mul_ps(Avx::Load(src + 3 * step), _filter)); - } - - template void FilterH(const float * src, size_t srcStride, size_t width, size_t height, const float * filter, size_t size, float * dst, size_t dstStride) - { - size_t alignedWidth = AlignLo(width, 4); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < alignedWidth; col += 4) - { - __m256 sums[4] = { _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps() }; - const float * s 
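// HogLiteCompressFeatures above projects every 16-float HOG cell onto eight PCA rows
// (pca is laid out as 8 rows x 16 columns); the AVX code evaluates four dot products
// per iteration and folds them with _mm256_hadd_ps. Its scalar meaning, as a sketch:
#include <cstddef>
inline void CompressCell(const float * src /* 16 values */, const float * pca /* 8 x 16 */, float * dst /* 8 values */)
{
    for (size_t i = 0; i < 8; ++i)
    {
        float sum = 0.0f;
        for (size_t j = 0; j < 16; ++j)
            sum += src[j] * pca[i * 16 + j];
        dst[i] = sum;
    }
}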
= src + col * step; - for (size_t i = 0; i < size; i += F) - FilterHx4(s + i, filter + i, sums); - Sse::Store(dst + col, Avx::Extract4Sums(sums)); - } - for (; col < width; ++col) - { - __m256 sum = _mm256_setzero_ps(); - const float * s = src + col * step; - for (size_t i = 0; i < size; i += F) - FilterHx1(s + i, filter + i, sum); - dst[col] = Avx::ExtractSum(sum); - } - src += srcStride; - dst += dstStride; - } - } - - template void FilterH(const float * src, size_t srcStride, size_t width, size_t height, size_t step, const float * filter, size_t size, float * dst, size_t dstStride) - { - if (step == 16) - FilterH(src, srcStride, width, height, filter, size, dst, dstStride); - else - FilterH(src, srcStride, width, height, filter, size, dst, dstStride); - } - - template static SIMD_INLINE void FilterV(const float * src, size_t stride, const __m256 * filter, size_t size, float * dst, const __m256 & mask) - { - __m256 sum = _mm256_setzero_ps(); - for (size_t i = 0; i < size; ++i, src += stride) - sum = _mm256_add_ps(sum, _mm256_mul_ps(Load(src), filter[i])); - Update(dst, Masked(sum, mask)); - } - - template void FilterV(const float * src, size_t srcStride, size_t width, size_t height, const float * filter, size_t size, float * dst, size_t dstStride) - { - _filter.Resize(size); - for (size_t i = 0; i < size; ++i) - _filter[i] = _mm256_set1_ps(filter[i]); - - size_t alignedWidth = AlignLo(width, F); - __m256 tailMask = RightNotZero32f(width - alignedWidth); - - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += F) - FilterV(src + col, srcStride, _filter.data, size, dst + col, tailMask); - if (alignedWidth != width) - FilterV(src + width - F, srcStride, _filter.data, size, dst + width - F, tailMask); - src += srcStride; - dst += dstStride; - } - } - - template void FilterV(const float * src, size_t srcStride, size_t width, size_t height, const float * filter, size_t size, float * dst, size_t dstStride) - { - if (Aligned(dst) && Aligned(dstStride)) - FilterV(src, srcStride, width, height, filter, size, dst, dstStride); - else - FilterV(src, srcStride, width, height, filter, size, dst, dstStride); - } - - public: - - void Run(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * hFilter, size_t hSize, const float * vFilter, size_t vSize, float * dst, size_t dstStride, int add) - { - assert(featureSize == 8 || featureSize == 16); - assert(srcWidth >= hSize && srcHeight >= vSize); - - Init(srcWidth, srcHeight, hSize, vSize); - - if (Aligned(src) && Aligned(srcStride) && Aligned(hFilter)) - FilterH(src, srcStride, _dstWidth, srcHeight, featureSize, hFilter, hSize*featureSize, _buffer.data, _dstStride); - else - FilterH(src, srcStride, _dstWidth, srcHeight, featureSize, hFilter, hSize*featureSize, _buffer.data, _dstStride); - - if (add) - FilterV(_buffer.data, _dstStride, _dstWidth, _dstHeight, vFilter, vSize, dst, dstStride); - else - FilterV(_buffer.data, _dstStride, _dstWidth, _dstHeight, vFilter, vSize, dst, dstStride); - } - }; - - void HogLiteFilterSeparable(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * hFilter, size_t hSize, const float * vFilter, size_t vSize, float * dst, size_t dstStride, int add) - { - HogLiteSeparableFilter filter; - filter.Run(src, srcStride, srcWidth, srcHeight, featureSize, hFilter, hSize, vFilter, vSize, dst, dstStride, add); - } - } -#endif// SIMD_AVX_ENABLE -} - - diff --git 
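// Closing remark on HogLiteFilterSeparable above, before the next deleted file: it
// correlates in two passes, FilterH writing each row convolved with the horizontal
// taps into _buffer, FilterV then combining vSize buffered rows with the vertical
// taps, so each output costs hSize + vSize multiplies instead of hSize * vSize.
// Single-channel sketch (the real code steps by featureSize and can also add into dst):
#include <cstddef>
inline void SeparableFilterScalar(const float * src, size_t srcStride, size_t width, size_t height,
                                  const float * hF, size_t hSize, const float * vF, size_t vSize,
                                  float * buf /* (width - hSize + 1) x height */, float * dst)
{
    size_t dstWidth = width - hSize + 1, dstHeight = height - vSize + 1;
    for (size_t y = 0; y < height; ++y)        // horizontal pass over all source rows
        for (size_t x = 0; x < dstWidth; ++x)
        {
            float sum = 0.0f;
            for (size_t i = 0; i < hSize; ++i)
                sum += src[y * srcStride + x + i] * hF[i];
            buf[y * dstWidth + x] = sum;
        }
    for (size_t y = 0; y < dstHeight; ++y)     // vertical pass over buffered rows
        for (size_t x = 0; x < dstWidth; ++x)
        {
            float sum = 0.0f;
            for (size_t i = 0; i < vSize; ++i)
                sum += buf[(y + i) * dstWidth + x] * vF[i];
            dst[y * dstWidth + x] = sum;
        }
}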
a/src/3rd/Simd/Simd/SimdAvx1Neural.cpp b/src/3rd/Simd/Simd/SimdAvx1Neural.cpp deleted file mode 100644 index 0526a626..00000000 --- a/src/3rd/Simd/Simd/SimdAvx1Neural.cpp +++ /dev/null @@ -1,1913 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdExtract.h" -#include "Simd/SimdBase.h" - -namespace Simd -{ -#ifdef SIMD_AVX_ENABLE - namespace Avx - { - template SIMD_INLINE void NeuralProductSum(const float * a, const float * b, size_t offset, __m256 & sum) - { - __m256 _a = Load(a + offset); - __m256 _b = Load(b + offset); - sum = _mm256_add_ps(sum, _mm256_mul_ps(_a, _b)); - } - - template SIMD_INLINE void NeuralProductSum(const float * a, const float * b, size_t size, float * sum) - { - if (align) - assert(Aligned(a) && Aligned(b)); - - *sum = 0; - size_t partialAlignedSize = AlignLo(size, F); - size_t fullAlignedSize = AlignLo(size, QF); - size_t i = 0; - if (partialAlignedSize) - { - __m256 sums[4] = { _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps() }; - if (fullAlignedSize) - { - for (; i < fullAlignedSize; i += QF) - { - NeuralProductSum(a, b, i + F * 0, sums[0]); - NeuralProductSum(a, b, i + F * 1, sums[1]); - NeuralProductSum(a, b, i + F * 2, sums[2]); - NeuralProductSum(a, b, i + F * 3, sums[3]); - } - sums[0] = _mm256_add_ps(_mm256_add_ps(sums[0], sums[1]), _mm256_add_ps(sums[2], sums[3])); - } - for (; i < partialAlignedSize; i += F) - NeuralProductSum(a, b, i, sums[0]); - *sum += ExtractSum(sums[0]); - } - for (; i < size; ++i) - *sum += a[i] * b[i]; - } - - void NeuralProductSum(const float * a, const float * b, size_t size, float * sum) - { - if (Aligned(a) && Aligned(b)) - NeuralProductSum(a, b, size, sum); - else - NeuralProductSum(a, b, size, sum); - } - - template SIMD_INLINE void AddMultiplied(const float * src, const __m256 & value, float * dst) - { - Store(dst, _mm256_add_ps(Load(dst), _mm256_mul_ps(value, Load(src)))); - } - - template SIMD_INLINE void AddMultiplied(const float * src, size_t aligned, size_t partial, size_t full, float value, float * dst) - { - size_t i = 0; - if (partial) - { - __m256 _value = _mm256_set1_ps(value); - for (; i < aligned; i += QF) - { - AddMultiplied(src + i + F * 0, _value, dst + i + F * 0); - AddMultiplied(src + i + F * 1, _value, dst + i + F * 1); - AddMultiplied(src + i + F * 2, _value, dst + i + F * 2); - 
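// NeuralProductSum above unrolls by QF = 4 * F floats and keeps four independent
// __m256 accumulators, so consecutive adds do not stall on a single dependency
// chain; the partial sums are merged only once at the end. The same idea in scalar form:
#include <cstddef>
inline float DotProduct4Acc(const float * a, const float * b, size_t size)
{
    float s0 = 0.0f, s1 = 0.0f, s2 = 0.0f, s3 = 0.0f;  // independent partial sums
    size_t i = 0, aligned = size & ~size_t(3);
    for (; i < aligned; i += 4)
    {
        s0 += a[i + 0] * b[i + 0];
        s1 += a[i + 1] * b[i + 1];
        s2 += a[i + 2] * b[i + 2];
        s3 += a[i + 3] * b[i + 3];
    }
    float sum = (s0 + s1) + (s2 + s3);
    for (; i < size; ++i)                              // scalar remainder
        sum += a[i] * b[i];
    return sum;
}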
AddMultiplied(src + i + F * 3, _value, dst + i + F * 3); - } - for (; i < partial; i += F) - AddMultiplied(src + i, _value, dst + i); - } - for (; i < full; ++i) - dst[i] += src[i] * value; - } - - void NeuralAddVectorMultipliedByValue(const float * src, size_t size, const float * value, float * dst) - { - size_t aligned = AlignLo(size, QF); - size_t partial = AlignLo(size, F); - if (Aligned(src) && Aligned(dst)) - AddMultiplied(src, aligned, partial, size, *value, dst); - else - AddMultiplied(src, aligned, partial, size, *value, dst); - } - - template SIMD_INLINE void AddVector(const float * src, float * dst) - { - Store(dst, _mm256_add_ps(Load(dst), Load(src))); - } - - template SIMD_INLINE void AddVector(const float * src, size_t aligned, size_t partial, size_t full, float * dst) - { - size_t i = 0; - for (; i < aligned; i += QF) - { - AddVector(src + i + F * 0, dst + i + F * 0); - AddVector(src + i + F * 1, dst + i + F * 1); - AddVector(src + i + F * 2, dst + i + F * 2); - AddVector(src + i + F * 3, dst + i + F * 3); - } - for (; i < partial; i += F) - AddVector(src + i, dst + i); - for (; i < full; ++i) - dst[i] += src[i]; - } - - void NeuralAddVector(const float * src, size_t size, float * dst) - { - size_t aligned = AlignLo(size, QF); - size_t partial = AlignLo(size, F); - if (Aligned(src) && Aligned(dst)) - AddVector(src, aligned, partial, size, dst); - else - AddVector(src, aligned, partial, size, dst); - } - - template SIMD_INLINE void AddValue(const __m256 & value, float * dst) - { - Store(dst, _mm256_add_ps(Load(dst), value)); - } - - template SIMD_INLINE void AddValue(const float * value, float * dst, size_t aligned, size_t partial, size_t full) - { - size_t i = 0; - if (partial) - { - __m256 _value = _mm256_set1_ps(value[0]); - for (; i < aligned; i += QF) - { - AddValue(_value, dst + i + F * 0); - AddValue(_value, dst + i + F * 1); - AddValue(_value, dst + i + F * 2); - AddValue(_value, dst + i + F * 3); - } - for (; i < partial; i += F) - AddValue(_value, dst + i); - } - for (; i < full; ++i) - dst[i] += value[0]; - } - - void NeuralAddValue(const float * value, float * dst, size_t size) - { - size_t aligned = AlignLo(size, QF); - size_t partial = AlignLo(size, F); - if (Aligned(dst)) - AddValue(value, dst, aligned, partial, size); - else - AddValue(value, dst, aligned, partial, size); - } - - template SIMD_INLINE void NeuralRoughSigmoid(const float * src, size_t size, const float * slope, float * dst) - { - size_t alignedSize = Simd::AlignLo(size, F); - __m256 _slope = _mm256_set1_ps(*slope); - __m256 _0 = _mm256_set1_ps(-0.0f); - __m256 _1 = _mm256_set1_ps(1.0f); - __m256 _a = _mm256_set1_ps(0.5417f); - __m256 _b = _mm256_set1_ps(0.1460f); - size_t i = 0; - for (; i < alignedSize; i += F) - { - __m256 _src = Load(src + i); - __m256 x = _mm256_andnot_ps(_0, _mm256_mul_ps(_src, _slope)); - __m256 x2 = _mm256_mul_ps(x, x); - __m256 x4 = _mm256_mul_ps(x2, x2); - __m256 series = _mm256_add_ps(_mm256_add_ps(_1, x), _mm256_add_ps(_mm256_mul_ps(x2, _a), _mm256_mul_ps(x4, _b))); - __m256 mask = _mm256_cmp_ps(_src, _0, _CMP_GT_OS); - __m256 exp = _mm256_blendv_ps(series, _mm256_rcp_ps(series), mask); - __m256 sigmoid = _mm256_rcp_ps(_mm256_add_ps(_1, exp)); - Store(dst + i, sigmoid); - } - for (; i < size; ++i) - dst[i] = Base::RoughSigmoid(src[i] * slope[0]); - } - - void NeuralRoughSigmoid(const float * src, size_t size, const float * slope, float * dst) - { - if (Aligned(src) && Aligned(dst)) - NeuralRoughSigmoid(src, size, slope, dst); - else - NeuralRoughSigmoid(src, size, 
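// NeuralRoughSigmoid evaluates sigmoid(t) = 1 / (1 + e^-t) with a polynomial
// approximation of the exponential, e^x ~ 1 + x + 0.5417 x^2 + 0.1460 x^4, applied
// to x = |t| and inverted with _mm256_rcp_ps on the positive branch. Scalar equivalent:
#include <cmath>
inline float RoughSigmoidScalar(float t)   // t = src[i] * slope
{
    float x = std::fabs(t);
    float x2 = x * x;
    float series = 1.0f + x + 0.5417f * x2 + 0.1460f * x2 * x2;  // ~ e^|t|
    float expNegT = (t > 0.0f) ? 1.0f / series : series;         // ~ e^-t
    return 1.0f / (1.0f + expNegT);
}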
slope, dst); - } - - template SIMD_INLINE void NeuralRoughSigmoid2(const float * src, const __m256 & k, const __m256 & o, const __m256 & m, float * dst) - { - __m256 _src = Load(src); - __m256 e1 = _mm256_max_ps(m, _mm256_sub_ps(o, _mm256_mul_ps(_src, k))); - __m256 e2 = _mm256_mul_ps(e1, e1); - __m256 e4 = _mm256_mul_ps(e2, e2); - __m256 e8 = _mm256_mul_ps(e4, e4); - __m256 e16 = _mm256_mul_ps(e8, e8); - __m256 e32 = _mm256_mul_ps(e16, e16); - __m256 e64 = _mm256_mul_ps(e32, e32); - __m256 sigmoid = _mm256_rcp_ps(_mm256_add_ps(o, _mm256_mul_ps(e64, e64))); - Store(dst, sigmoid); - } - - template SIMD_INLINE void NeuralRoughSigmoid2(const float * src, size_t size, const float * slope, float * dst) - { - size_t partialAlignedSize = Simd::AlignLo(size, F); - size_t fullAlignedSize = Simd::AlignLo(size, QF); - __m256 _k = _mm256_set1_ps((*slope)*0.0078125f); - __m256 _1 = _mm256_set1_ps(1.0f); - __m256 _05 = _mm256_set1_ps(0.5f); - size_t i = 0; - for (; i < fullAlignedSize; i += QF) - { - NeuralRoughSigmoid2(src + i + 0 * F, _k, _1, _05, dst + i + 0 * F); - NeuralRoughSigmoid2(src + i + 1 * F, _k, _1, _05, dst + i + 1 * F); - NeuralRoughSigmoid2(src + i + 2 * F, _k, _1, _05, dst + i + 2 * F); - NeuralRoughSigmoid2(src + i + 3 * F, _k, _1, _05, dst + i + 3 * F); - } - for (; i < partialAlignedSize; i += F) - NeuralRoughSigmoid2(src + i, _k, _1, _05, dst + i); - for (; i < size; ++i) - dst[i] = Base::RoughSigmoid2(src[i] * slope[0]); - } - - void NeuralRoughSigmoid2(const float * src, size_t size, const float * slope, float * dst) - { - if (Aligned(src) && Aligned(dst)) - NeuralRoughSigmoid2(src, size, slope, dst); - else - NeuralRoughSigmoid2(src, size, slope, dst); - } - - template SIMD_INLINE void NeuralDerivativeSigmoid(const float * src, size_t size, const float * slope, float * dst) - { - size_t alignedSize = Simd::AlignLo(size, F); - __m256 _slope = _mm256_set1_ps(*slope); - __m256 _1 = _mm256_set1_ps(1.0f); - size_t i = 0; - for (; i < alignedSize; i += F) - { - __m256 _src = Load(src + i); - __m256 _dst = Load(dst + i); - Store(dst + i, _mm256_mul_ps(_mm256_mul_ps(_dst, _slope), _mm256_mul_ps(_mm256_sub_ps(_1, _src), _src))); - } - for (; i < size; ++i) - dst[i] *= slope[0] * Base::DerivativeSigmoid(src[i]); - } - - void NeuralDerivativeSigmoid(const float * src, size_t size, const float * slope, float * dst) - { - if (Aligned(src) && Aligned(dst)) - NeuralDerivativeSigmoid(src, size, slope, dst); - else - NeuralDerivativeSigmoid(src, size, slope, dst); - } - - template SIMD_INLINE void NeuralRoughTanh(const float * src, size_t size, const float * slope, float * dst) - { - size_t alignedSize = Simd::AlignLo(size, F); - __m256 _slope = _mm256_set1_ps(*slope); - __m256 _0 = _mm256_set1_ps(-0.0f); - __m256 _1 = _mm256_set1_ps(1.0f); - __m256 _a = _mm256_set1_ps(0.5658f); - __m256 _b = _mm256_set1_ps(0.1430f); - size_t i = 0; - for (; i < alignedSize; i += F) - { - __m256 _src = Load(src + i); - __m256 x = _mm256_andnot_ps(_0, _mm256_mul_ps(_src, _slope)); - __m256 x2 = _mm256_mul_ps(x, x); - __m256 x4 = _mm256_mul_ps(x2, x2); - __m256 pe = _mm256_add_ps(_mm256_add_ps(_1, x), _mm256_add_ps(_mm256_mul_ps(x2, _a), _mm256_mul_ps(x4, _b))); - __m256 ne = _mm256_rcp_ps(pe); - __m256 absTanh = _mm256_mul_ps(_mm256_sub_ps(pe, ne), _mm256_rcp_ps(_mm256_add_ps(pe, ne))); - __m256 tanh = _mm256_xor_ps(absTanh, _mm256_and_ps(_0, _mm256_cmp_ps(_0, _src, _CMP_GT_OS))); - Store(dst + i, tanh); - } - for (; i < size; ++i) - dst[i] = Base::RoughTanh(src[i] * slope[0]); - } - - void NeuralRoughTanh(const 
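// NeuralRoughSigmoid2 relies on the limit (1 - y/n)^n -> e^-y with n = 128, which is
// where the constant 0.0078125 = 1/128 comes from: the base is clamped at 0.5 and
// then squared seven times (e2, e4, ..., e64 * e64 = base^128). Scalar equivalent:
inline float RoughSigmoid2Scalar(float y)  // y = src[i] * slope
{
    float e = 1.0f - y * 0.0078125f;       // 1 - y / 128
    if (e < 0.5f)
        e = 0.5f;                          // clamp keeps the base positive for large y
    for (int i = 0; i < 7; ++i)
        e *= e;                            // e = (1 - y/128)^128 ~ e^-y
    return 1.0f / (1.0f + e);
}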
float * src, size_t size, const float * slope, float * dst) - { - if (Aligned(src) && Aligned(dst)) - NeuralRoughTanh(src, size, slope, dst); - else - NeuralRoughTanh(src, size, slope, dst); - } - - template SIMD_INLINE void NeuralDerivativeTanh(const float * src, size_t size, const float * slope, float * dst) - { - size_t alignedSize = Simd::AlignLo(size, F); - __m256 _slope = _mm256_set1_ps(*slope); - __m256 _1 = _mm256_set1_ps(1.0f); - size_t i = 0; - for (; i < alignedSize; i += F) - { - __m256 _src = Load(src + i); - __m256 _dst = Load(dst + i); - Store(dst + i, _mm256_mul_ps(_mm256_mul_ps(_dst, _slope), _mm256_sub_ps(_1, _mm256_mul_ps(_src, _src)))); - } - for (; i < size; ++i) - dst[i] *= slope[0] * Base::DerivativeTanh(src[i]); - } - - void NeuralDerivativeTanh(const float * src, size_t size, const float * slope, float * dst) - { - if (Aligned(src) && Aligned(dst)) - NeuralDerivativeTanh(src, size, slope, dst); - else - NeuralDerivativeTanh(src, size, slope, dst); - } - - template void NeuralDerivativeRelu(const float * src, size_t size, const float * slope, float * dst) - { - float s = slope[0]; - __m256 _0 = _mm256_set1_ps(0.0f); - __m256 _1 = _mm256_set1_ps(1.0f); - __m256 _s = _mm256_set1_ps(s); - size_t alignedSize = Simd::AlignLo(size, F); - size_t i = 0; - for (; i < alignedSize; i += F) - { - __m256 mask = _mm256_cmp_ps(Load(src + i), _0, _CMP_GT_OS); - __m256 _dst = Load(dst + i); - Store(dst + i, _mm256_mul_ps(_mm256_blendv_ps(_s, _1, mask), _dst)); - } - for (; i < size; ++i) - dst[i] *= src[i] > 0 ? 1.0f : s; - } - - void NeuralDerivativeRelu(const float * src, size_t size, const float * slope, float * dst) - { - if (Aligned(src) && Aligned(dst)) - NeuralDerivativeRelu(src, size, slope, dst); - else - NeuralDerivativeRelu(src, size, slope, dst); - } - - template SIMD_INLINE void UpdateWeights(const float * x, const __m256 & a, const __m256 & b, float * d, float * w) - { - __m256 _d = _mm256_add_ps(_mm256_mul_ps(a, Load(d)), _mm256_mul_ps(b, Load(x))); - Store(d, _d); - Store(w, _mm256_add_ps(Load(w), _d)); - } - - template SIMD_INLINE void UpdateWeights(const float * x, size_t offset, const __m256 & a, const __m256 & b, float * d, float * w) - { - UpdateWeights(x + offset, a, b, d + offset, w + offset); - } - - template SIMD_INLINE void NeuralUpdateWeights(const float * x, size_t size, const float & a, const float & b, float * d, float * w) - { - if (align) - assert(Aligned(x) && Aligned(d) && Aligned(w)); - - size_t partialAlignedSize = AlignLo(size, F); - size_t fullAlignedSize = AlignLo(size, QF); - __m256 _a = _mm256_set1_ps(a); - __m256 _b = _mm256_set1_ps(b); - size_t i = 0; - if (partialAlignedSize) - { - if (fullAlignedSize) - { - for (; i < fullAlignedSize; i += QF) - { - UpdateWeights(x, i + F * 0, _a, _b, d, w); - UpdateWeights(x, i + F * 1, _a, _b, d, w); - UpdateWeights(x, i + F * 2, _a, _b, d, w); - UpdateWeights(x, i + F * 3, _a, _b, d, w); - } - } - for (; i < partialAlignedSize; i += F) - UpdateWeights(x, i, _a, _b, d, w); - } - for (; i < size; ++i) - Base::UpdateWeights(x, i, a, b, d, w); - } - - void NeuralUpdateWeights(const float * x, size_t size, const float * a, const float * b, float * d, float * w) - { - if (Aligned(x) && Aligned(d) && Aligned(w)) - NeuralUpdateWeights(x, size, *a, *b, d, w); - else - NeuralUpdateWeights(x, size, *a, *b, d, w); - } - - template SIMD_INLINE void AdaptiveGradientUpdate(const float * delta, const __m256 & norm, const __m256 & alpha, const __m256 & epsilon, float * gradient, float * weight) - { - __m256 d = 
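// NeuralUpdateWeights above applies the element-wise two-step rule d = a*d + b*x;
// w += d (reading a and b as a decay factor and a gradient scale is our
// interpretation; the code only fixes the arithmetic). One element:
inline void UpdateWeightScalar(float x, float a, float b, float & d, float & w)
{
    d = a * d + b * x;  // refresh the stored delta from the new gradient x
    w += d;             // and fold it into the weight
}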
_mm256_mul_ps(Load(delta), norm); - __m256 _gradient = _mm256_add_ps(Load(gradient), _mm256_mul_ps(d, d)); - Store(gradient, _gradient); - Store(weight, _mm256_sub_ps(Load(weight), _mm256_mul_ps(_mm256_mul_ps(alpha, d), _mm256_rsqrt_ps(_mm256_add_ps(_gradient, epsilon))))); - } - - template SIMD_INLINE void AdaptiveGradientUpdate(const float * delta, size_t offset, const __m256 & norm, const __m256 & alpha, const __m256 & epsilon, float * gradient, float * weight) - { - AdaptiveGradientUpdate(delta + offset, norm, alpha, epsilon, gradient + offset, weight + offset); - } - - template void NeuralAdaptiveGradientUpdate(const float * delta, size_t size, size_t batch, const float * alpha, const float * epsilon, float * gradient, float * weight) - { - if (align) - assert(Aligned(delta) && Aligned(gradient) && Aligned(weight)); - - size_t partialAlignedSize = AlignLo(size, F); - size_t fullAlignedSize = AlignLo(size, QF); - const float norm = (float)(1.0 / batch); - __m256 _norm = _mm256_set1_ps(norm); - __m256 _alpha = _mm256_set1_ps(*alpha); - __m256 _epsilon = _mm256_set1_ps(*epsilon); - size_t i = 0; - if (partialAlignedSize) - { - if (fullAlignedSize) - { - for (; i < fullAlignedSize; i += QF) - { - AdaptiveGradientUpdate(delta, i + F * 0, _norm, _alpha, _epsilon, gradient, weight); - AdaptiveGradientUpdate(delta, i + F * 1, _norm, _alpha, _epsilon, gradient, weight); - AdaptiveGradientUpdate(delta, i + F * 2, _norm, _alpha, _epsilon, gradient, weight); - AdaptiveGradientUpdate(delta, i + F * 3, _norm, _alpha, _epsilon, gradient, weight); - } - } - for (; i < partialAlignedSize; i += F) - AdaptiveGradientUpdate(delta, i, _norm, _alpha, _epsilon, gradient, weight); - } - for (; i < size; ++i) - Base::AdaptiveGradientUpdate(delta, i, norm, *alpha, *epsilon, gradient, weight); - } - - void NeuralAdaptiveGradientUpdate(const float * delta, size_t size, size_t batch, const float * alpha, const float * epsilon, float * gradient, float * weight) - { - if (Aligned(delta) && Aligned(gradient) && Aligned(weight)) - NeuralAdaptiveGradientUpdate(delta, size, batch, alpha, epsilon, gradient, weight); - else - NeuralAdaptiveGradientUpdate(delta, size, batch, alpha, epsilon, gradient, weight); - } - - template SIMD_INLINE void LoadWeightsForward(const float * src, __m256 * dst) - { - for (size_t i = 0; i < size; ++i) - dst[i] = _mm256_set1_ps(src[i]); - } - - template SIMD_INLINE void LoadWeightsBackward(const float * src, __m256 * dst) - { - for (size_t i = 0; i < size; ++i) - dst[i] = _mm256_set1_ps(src[size - i - 1]); - } - - namespace - { - template struct Buffer - { - Buffer(size_t width) - { - _size = width * sizeof(float); - size_t stride = AlignHi(width + 2 * (count - 1), F); - size_t full = count*stride * sizeof(float); - _ptr = Allocate(full); - memset(_ptr, 0, full); - rows[0] = (float*)_ptr; - for (size_t i = 1; i < count; ++i) - rows[i] = rows[i - 1] + stride; - } - - void Update(const float * src) - { - float * tmp = rows[0]; - if (src == NULL) - memset(tmp + count - 1, 0, _size); - else - memcpy(tmp + count - 1, src, _size); - for (size_t i = 0; i < count - 1; ++i) - rows[i] = rows[i + 1]; - rows[count - 1] = tmp; - } - - ~Buffer() - { - Free(_ptr); - } - - float * rows[count]; - private: - size_t _size; - void * _ptr; - }; - } - - template struct Convolution - { - template static SIMD_INLINE __m256 Forward(const float * src, size_t stride, const __m256 * weights); - - template static SIMD_INLINE __m256 Backward(const Buffer & buffer, size_t offset, const __m256 * weights); - - template 
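// NeuralAdaptiveGradientUpdate is the AdaGrad rule: batch-normalized gradients have
// their squares accumulated per parameter, and each weight steps by alpha divided by
// the root of that running sum (the vector code uses the fast _mm256_rsqrt_ps
// approximation instead of a true square root). Scalar sketch for one element:
#include <cmath>
inline void AdaGradScalar(float delta, float norm, float alpha, float epsilon,
                          float & gradient, float & weight)
{
    float d = delta * norm;                              // norm = 1 / batch
    gradient += d * d;                                   // running sum of squared gradients
    weight -= alpha * d / std::sqrt(gradient + epsilon);
}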
static SIMD_INLINE void Sum(const float * src, size_t stride, const __m256 & dst, __m256 * sums); - }; - - template<> struct Convolution<2, 2> - { - template static SIMD_INLINE __m256 RowConvolution(const float * src, const __m256 * weights) - { - return _mm256_add_ps(_mm256_mul_ps(Load(src), weights[0]), - _mm256_mul_ps(Load(src + 1), weights[1])); - } - - template static SIMD_INLINE __m256 Forward(const float * src, size_t stride, const __m256 * weights) - { - return _mm256_add_ps(RowConvolution(src, weights), - RowConvolution(src + stride, weights + 2)); - } - - template static SIMD_INLINE __m256 Backward(const Buffer<2> & buffer, size_t offset, const __m256 * weights) - { - return _mm256_add_ps(RowConvolution(buffer.rows[0] + offset, weights), - RowConvolution(buffer.rows[1] + offset, weights + 2)); - } - - template static SIMD_INLINE void Sum(const float * src, const __m256 & dst, __m256 * sums) - { - sums[0] = _mm256_add_ps(sums[0], _mm256_mul_ps(dst, Load(src + 0))); - sums[1] = _mm256_add_ps(sums[1], _mm256_mul_ps(dst, Load(src + 1))); - } - - template static SIMD_INLINE void Sum(const float * src, size_t stride, const __m256 & dst, __m256 * sums) - { - Sum(src + stride * 0, dst, sums + 0); - Sum(src + stride * 1, dst, sums + 2); - } - }; - - template<> struct Convolution<3, 3> - { - template static SIMD_INLINE __m256 RowConvolution(const float * src, const __m256 * weights) - { - return _mm256_add_ps(_mm256_mul_ps(Load(src), weights[0]), - _mm256_add_ps(_mm256_mul_ps(Load(src + 1), weights[1]), - _mm256_mul_ps(Load(src + 2), weights[2]))); - } - - template static SIMD_INLINE __m256 Forward(const float * src, size_t stride, const __m256 * weights) - { - return _mm256_add_ps(RowConvolution(src, weights), - _mm256_add_ps(RowConvolution(src + stride, weights + 3), - RowConvolution(src + 2 * stride, weights + 6))); - } - - template static SIMD_INLINE __m256 Backward(const Buffer<3> & buffer, size_t offset, const __m256 * weights) - { - return _mm256_add_ps(RowConvolution(buffer.rows[0] + offset, weights), - _mm256_add_ps(RowConvolution(buffer.rows[1] + offset, weights + 3), - RowConvolution(buffer.rows[2] + offset, weights + 6))); - } - - template static SIMD_INLINE void Sum(const float * src, const __m256 & dst, __m256 * sums) - { - sums[0] = _mm256_add_ps(sums[0], _mm256_mul_ps(dst, Load(src + 0))); - sums[1] = _mm256_add_ps(sums[1], _mm256_mul_ps(dst, Load(src + 1))); - sums[2] = _mm256_add_ps(sums[2], _mm256_mul_ps(dst, Load(src + 2))); - } - - template static SIMD_INLINE void Sum(const float * src, size_t stride, const __m256 & dst, __m256 * sums) - { - Sum(src + stride * 0, dst, sums + 0); - Sum(src + stride * 1, dst, sums + 3); - Sum(src + stride * 2, dst, sums + 6); - } - }; - - template<> struct Convolution<4, 4> - { - template static SIMD_INLINE __m256 RowConvolution(const float * src, const __m256 * weights) - { - return _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(Load(src), weights[0]), _mm256_mul_ps(Load(src + 1), weights[1])), - _mm256_add_ps(_mm256_mul_ps(Load(src + 2), weights[2]), _mm256_mul_ps(Load(src + 3), weights[3]))); - } - - template static SIMD_INLINE __m256 Forward(const float * src, size_t stride, const __m256 * weights) - { - return _mm256_add_ps(_mm256_add_ps(RowConvolution(src, weights), - RowConvolution(src + stride, weights + 4)), - _mm256_add_ps(RowConvolution(src + 2 * stride, weights + 8), - RowConvolution(src + 3 * stride, weights + 12))); - } - - template static SIMD_INLINE __m256 Backward(const Buffer<4> & buffer, size_t offset, const __m256 * 
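// Pattern of the Convolution<X, Y> specializations above and below: RowConvolution
// produces F = 8 adjacent outputs from X unaligned loads (src + 0 .. src + X - 1)
// multiplied by X broadcast kernel taps, Forward/Backward stack Y such rows, and
// Sum() is the transposed form used for weight gradients. For the 3x3 case that is
//
//   dst[c .. c+8) += sum over (ky, kx) of w[ky*3 + kx] * src[(r + ky)*stride + c + kx .. +8),
//
// i.e. nine loads and nine multiply-adds per eight outputs.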
weights) - { - return _mm256_add_ps(_mm256_add_ps(RowConvolution(buffer.rows[0] + offset, weights), - RowConvolution(buffer.rows[1] + offset, weights + 4)), - _mm256_add_ps(RowConvolution(buffer.rows[2] + offset, weights + 8), - RowConvolution(buffer.rows[3] + offset, weights + 12))); - } - - template static SIMD_INLINE void Sum(const float * src, const __m256 & dst, __m256 * sums) - { - sums[0] = _mm256_add_ps(sums[0], _mm256_mul_ps(dst, Load(src + 0))); - sums[1] = _mm256_add_ps(sums[1], _mm256_mul_ps(dst, Load(src + 1))); - sums[2] = _mm256_add_ps(sums[2], _mm256_mul_ps(dst, Load(src + 2))); - sums[3] = _mm256_add_ps(sums[3], _mm256_mul_ps(dst, Load(src + 3))); - } - - template static SIMD_INLINE void Sum(const float * src, size_t stride, const __m256 & dst, __m256 * sums) - { - Sum(src + stride * 0, dst, sums + 0); - Sum(src + stride * 1, dst, sums + 4); - Sum(src + stride * 2, dst, sums + 8); - Sum(src + stride * 3, dst, sums + 12); - } - }; - - template<> struct Convolution<5, 5> - { - template static SIMD_INLINE __m256 RowConvolution(const float * src, const __m256 * weights) - { - return _mm256_add_ps(_mm256_mul_ps(Load(src), weights[0]), _mm256_add_ps( - _mm256_add_ps(_mm256_mul_ps(Load(src + 1), weights[1]), _mm256_mul_ps(Load(src + 2), weights[2])), - _mm256_add_ps(_mm256_mul_ps(Load(src + 3), weights[3]), _mm256_mul_ps(Load(src + 4), weights[4])))); - } - - template static SIMD_INLINE __m256 Forward(const float * src, size_t stride, const __m256 * weights) - { - return _mm256_add_ps(RowConvolution(src, weights), - _mm256_add_ps(_mm256_add_ps(RowConvolution(src + stride, weights + 5), - RowConvolution(src + 2 * stride, weights + 10)), - _mm256_add_ps(RowConvolution(src + 3 * stride, weights + 15), - RowConvolution(src + 4 * stride, weights + 20)))); - } - - template static SIMD_INLINE __m256 Backward(const Buffer<5> & buffer, size_t offset, const __m256 * weights) - { - return _mm256_add_ps(_mm256_add_ps(RowConvolution(buffer.rows[0] + offset, weights), - _mm256_add_ps(RowConvolution(buffer.rows[1] + offset, weights + 5), - RowConvolution(buffer.rows[2] + offset, weights + 10))), - _mm256_add_ps(RowConvolution(buffer.rows[3] + offset, weights + 15), - RowConvolution(buffer.rows[4] + offset, weights + 20))); - } - - template static SIMD_INLINE void Sum(const float * src, const __m256 & dst, __m256 * sums) - { - sums[0] = _mm256_add_ps(sums[0], _mm256_mul_ps(dst, Load(src + 0))); - sums[1] = _mm256_add_ps(sums[1], _mm256_mul_ps(dst, Load(src + 1))); - sums[2] = _mm256_add_ps(sums[2], _mm256_mul_ps(dst, Load(src + 2))); - sums[3] = _mm256_add_ps(sums[3], _mm256_mul_ps(dst, Load(src + 3))); - sums[4] = _mm256_add_ps(sums[4], _mm256_mul_ps(dst, Load(src + 4))); - } - - template static SIMD_INLINE void Sum(const float * src, size_t stride, const __m256 & dst, __m256 * sums) - { - Sum(src + stride * 0, dst, sums + 0); - Sum(src + stride * 1, dst, sums + 5); - Sum(src + stride * 2, dst, sums + 10); - Sum(src + stride * 3, dst, sums + 15); - Sum(src + stride * 4, dst, sums + 20); - } - }; - - template void NeuralAddConvolutionForward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride) - { - size_t alignedWidth = AlignLo(width, F); - __m256 tailMask = RightNotZero32f(width - alignedWidth); - __m256 _weights[coreX*coreY]; - LoadWeightsForward(weights, _weights); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < alignedWidth; col += F) - { - __m256 _dst = Load(dst + col); - _dst = 
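// The ragged right edge here is handled without a scalar epilogue: the last F
// columns are recomputed in one overlapping vector at col = width - F, and the
// result is ANDed with RightNotZero32f(width - alignedWidth) so lanes already
// written by the aligned loop contribute zero. A minimal sketch of the pattern
// (assumes width >= 8; the helper names are ours):
#include <immintrin.h>
#include <cstddef>
inline __m256 RightMaskSketch(size_t tail)  // tail rightmost lanes pass, others read zero
{
    static const int bits[16] = { 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1 };
    return _mm256_loadu_ps((const float *)(bits + tail));
}
inline void MaskedTailAdd(const float * src, float * dst, size_t width)
{
    size_t alignedWidth = width & ~size_t(7);
    for (size_t col = 0; col < alignedWidth; col += 8)
        _mm256_storeu_ps(dst + col,
            _mm256_add_ps(_mm256_loadu_ps(dst + col), _mm256_loadu_ps(src + col)));
    if (alignedWidth < width)  // one overlapping vector; already-covered lanes add zero
    {
        size_t col = width - 8;
        __m256 masked = _mm256_and_ps(RightMaskSketch(width - alignedWidth),
                                      _mm256_loadu_ps(src + col));
        _mm256_storeu_ps(dst + col, _mm256_add_ps(_mm256_loadu_ps(dst + col), masked));
    }
}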
_mm256_add_ps(_dst, Convolution::template Forward(src + col, srcStride, _weights)); - Store(dst + col, _dst); - } - if (width - alignedWidth) - { - size_t col = width - F; - __m256 _dst = Load(dst + col); - _dst = _mm256_add_ps(_dst, _mm256_and_ps(tailMask, Convolution::template Forward(src + col, srcStride, _weights))); - Store(dst + col, _dst); - } - src += srcStride; - dst += dstStride; - } - } - - void NeuralAddConvolution2x2Forward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride, F) && Aligned(dst) && Aligned(dstStride, F)) - NeuralAddConvolutionForward(src, srcStride, width, height, weights, dst, dstStride); - else - NeuralAddConvolutionForward(src, srcStride, width, height, weights, dst, dstStride); - } - - void NeuralAddConvolution3x3Forward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride, F) && Aligned(dst) && Aligned(dstStride, F)) - NeuralAddConvolutionForward(src, srcStride, width, height, weights, dst, dstStride); - else - NeuralAddConvolutionForward(src, srcStride, width, height, weights, dst, dstStride); - } - - void NeuralAddConvolution4x4Forward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride, F) && Aligned(dst) && Aligned(dstStride, F)) - NeuralAddConvolutionForward(src, srcStride, width, height, weights, dst, dstStride); - else - NeuralAddConvolutionForward(src, srcStride, width, height, weights, dst, dstStride); - } - - void NeuralAddConvolution5x5Forward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride, F) && Aligned(dst) && Aligned(dstStride, F)) - NeuralAddConvolutionForward(src, srcStride, width, height, weights, dst, dstStride); - else - NeuralAddConvolutionForward(src, srcStride, width, height, weights, dst, dstStride); - } - - template struct If - { - template static SIMD_INLINE void AddMultiplied(const float * src, size_t aligned, size_t partial, size_t full, float value, float * dst) - { - Avx::AddMultiplied(src, aligned, partial, full, value, dst); - } - }; - - template<> struct If - { - template static SIMD_INLINE void AddMultiplied(const float * src, size_t aligned, size_t partial, size_t full, float value, float * dst) - { - } - }; - - template void NeuralAddConvolutionBackwardSmall(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride) - { - size_t aligned = AlignLo(width, QF); - size_t partial = AlignLo(width, F); - for (size_t row = 0; row < height; ++row) - { - for (size_t dy = 0; dy < coreY; ++dy) - { - const float * w = weights + dy * coreX; - float * d = dst + dy*dstStride; - If < 0 < coreX > ::template AddMultiplied(src, aligned, partial, width, w[0], d + 0); - If < 1 < coreX > ::template AddMultiplied(src, aligned, partial, width, w[1], d + 1); - If < 2 < coreX > ::template AddMultiplied(src, aligned, partial, width, w[2], d + 2); - If < 3 < coreX > ::template AddMultiplied(src, aligned, partial, width, w[3], d + 3); - If < 4 < coreX > ::template AddMultiplied(src, aligned, partial, width, w[4], d + 4); - } - src += srcStride; - dst += dstStride; - } - } - - template void NeuralAddConvolutionBackwardLarge(const float * src, size_t 
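// The backward pass is the transposed convolution: each source element scatters into
// a coreX x coreY window of dst. NeuralAddConvolutionBackwardSmall expresses that as
// coreX * coreY shifted AXPY sweeps (AddMultiplied), with the If<condition> helper
// compiling away the taps beyond coreX so one five-call body serves every kernel
// size; the Large variant below instead gathers from a ring buffer of coreY
// zero-padded rows and applies the weights in reversed order (LoadWeightsBackward),
// touching each dst row only once.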
srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride) - { - Buffer buffer(width); - height += coreY - 1; - width += coreX - 1; - size_t alignedWidth = AlignLo(width, F); - __m256 tailMask = RightNotZero32f(width - alignedWidth); - __m256 _weights[coreX*coreY]; - LoadWeightsBackward(weights, _weights); - - for (size_t row = 0; row < height; ++row) - { - buffer.Update(row <= height - coreY ? src : NULL); - for (size_t col = 0; col < alignedWidth; col += F) - { - __m256 _dst = Load(dst + col); - _dst = _mm256_add_ps(_dst, Convolution::template Backward(buffer, col, _weights)); - Store(dst + col, _dst); - } - if (width - alignedWidth) - { - size_t col = width - F; - __m256 _dst = Load(dst + col); - _dst = _mm256_add_ps(_dst, _mm256_and_ps(tailMask, Convolution::template Backward(buffer, col, _weights))); - Store(dst + col, _dst); - } - src += srcStride; - dst += dstStride; - } - } - - template void NeuralAddConvolutionBackward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride) - { - if (width*height < 1024) - NeuralAddConvolutionBackwardSmall(src, srcStride, width, height, weights, dst, dstStride); - else - NeuralAddConvolutionBackwardLarge(src, srcStride, width, height, weights, dst, dstStride); - } - - void NeuralAddConvolution2x2Backward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride, F) && Aligned(dst) && Aligned(dstStride, F)) - NeuralAddConvolutionBackward(src, srcStride, width, height, weights, dst, dstStride); - else - NeuralAddConvolutionBackward(src, srcStride, width, height, weights, dst, dstStride); - } - - void NeuralAddConvolution3x3Backward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride, F) && Aligned(dst) && Aligned(dstStride, F)) - NeuralAddConvolutionBackward(src, srcStride, width, height, weights, dst, dstStride); - else - NeuralAddConvolutionBackward(src, srcStride, width, height, weights, dst, dstStride); - } - - void NeuralAddConvolution4x4Backward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride, F) && Aligned(dst) && Aligned(dstStride, F)) - NeuralAddConvolutionBackward(src, srcStride, width, height, weights, dst, dstStride); - else - NeuralAddConvolutionBackward(src, srcStride, width, height, weights, dst, dstStride); - } - - void NeuralAddConvolution5x5Backward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride, F) && Aligned(dst) && Aligned(dstStride, F)) - NeuralAddConvolutionBackward(src, srcStride, width, height, weights, dst, dstStride); - else - NeuralAddConvolutionBackward(src, srcStride, width, height, weights, dst, dstStride); - } - - template SIMD_INLINE void NeuralAddConvolutionSum(const float * src, size_t srcStride, const float * dst, size_t dstStride, size_t width, size_t height, float * sums) - { - size_t alignedWidth = Simd::AlignLo(width, F); - __m256 tailMask = RightNotZero32f(width - alignedWidth); - __m256 _sums[coreX*coreY]; - memset(_sums, 0, sizeof(_sums)); - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += F) - { - __m256 _dst = Load(dst 
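// NeuralAddConvolutionSum accumulates the gradient with respect to the weights: for
// every tap (dy, dx) it sums dst[r, c] * src[r + dy, c + dx] over the image, holding
// one __m256 accumulator per tap and reducing to scalars only once at the end.
// Scalar reference:
#include <cstddef>
inline void ConvWeightGradScalar(const float * src, size_t srcStride,
                                 const float * dst, size_t dstStride,
                                 size_t width, size_t height,
                                 size_t coreX, size_t coreY, float * sums)
{
    for (size_t dy = 0; dy < coreY; ++dy)
        for (size_t dx = 0; dx < coreX; ++dx)
        {
            float sum = 0.0f;
            for (size_t row = 0; row < height; ++row)
                for (size_t col = 0; col < width; ++col)
                    sum += dst[row * dstStride + col] * src[(row + dy) * srcStride + col + dx];
            sums[dy * coreX + dx] += sum;
        }
}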
+ col); - Convolution::template Sum(src + col, srcStride, _dst, _sums); - } - if (alignedWidth < width) - { - size_t col = width - F; - __m256 _dst = _mm256_and_ps(tailMask, Load(dst + col)); - Convolution::template Sum(src + col, srcStride, _dst, _sums); - } - src += srcStride; - dst += dstStride; - } - size_t i = 0, n = Simd::AlignLo(coreX*coreY, F); - for (; i < n; i += F) - Add8ExtractedSums(_sums + i, sums + i); - for (; i < coreX*coreY; ++i) - sums[i] += ExtractSum(_sums[i]); - } - - void NeuralAddConvolution2x2Sum(const float * src, size_t srcStride, const float * dst, size_t dstStride, size_t width, size_t height, float * sums) - { - if (Aligned(src) && Aligned(srcStride, F) && Aligned(dst) && Aligned(dstStride, F)) - NeuralAddConvolutionSum(src, srcStride, dst, dstStride, width, height, sums); - else - NeuralAddConvolutionSum(src, srcStride, dst, dstStride, width, height, sums); - } - - void NeuralAddConvolution3x3Sum(const float * src, size_t srcStride, const float * dst, size_t dstStride, size_t width, size_t height, float * sums) - { - if (Aligned(src) && Aligned(srcStride, F) && Aligned(dst) && Aligned(dstStride, F)) - NeuralAddConvolutionSum(src, srcStride, dst, dstStride, width, height, sums); - else - NeuralAddConvolutionSum(src, srcStride, dst, dstStride, width, height, sums); - } - - void NeuralAddConvolution4x4Sum(const float * src, size_t srcStride, const float * dst, size_t dstStride, size_t width, size_t height, float * sums) - { - if (Aligned(src) && Aligned(srcStride, F) && Aligned(dst) && Aligned(dstStride, F)) - NeuralAddConvolutionSum(src, srcStride, dst, dstStride, width, height, sums); - else - NeuralAddConvolutionSum(src, srcStride, dst, dstStride, width, height, sums); - } - - void NeuralAddConvolution5x5Sum(const float * src, size_t srcStride, const float * dst, size_t dstStride, size_t width, size_t height, float * sums) - { - if (Aligned(src) && Aligned(srcStride, F) && Aligned(dst) && Aligned(dstStride, F)) - NeuralAddConvolutionSum(src, srcStride, dst, dstStride, width, height, sums); - else - NeuralAddConvolutionSum(src, srcStride, dst, dstStride, width, height, sums); - } - - template SIMD_INLINE __m256 Pooling2x2Max2x2(const float * src, size_t stride) - { - __m256 lo = _mm256_max_ps(Load(src + 0), Load(src + stride + 0)); - __m256 hi = _mm256_max_ps(Load(src + F), Load(src + stride + F)); - __m256 _lo = _mm256_permute2f128_ps(lo, hi, 0x20); - __m256 _hi = _mm256_permute2f128_ps(lo, hi, 0x31); - return _mm256_max_ps(_mm256_shuffle_ps(_lo, _hi, 0x88), _mm256_shuffle_ps(_lo, _hi, 0xDD)); - } - - template SIMD_INLINE __m256 Pooling2x2Max2(const float * src) - { - __m256 lo = Load(src + 0); - __m256 hi = Load(src + F); - __m256 _lo = _mm256_permute2f128_ps(lo, hi, 0x20); - __m256 _hi = _mm256_permute2f128_ps(lo, hi, 0x31); - return _mm256_max_ps(_mm256_shuffle_ps(_lo, _hi, 0x88), _mm256_shuffle_ps(_lo, _hi, 0xDD)); - } - - template void NeuralPooling2x2Max2x2(const float * src, size_t srcStride, size_t width, size_t height, float * dst, size_t dstStride) - { - size_t heightEven = Simd::AlignLo(height, 2); - size_t widthEven = Simd::AlignLo(width, 2); - size_t alignedWidth = AlignLo(width, DF); - for (size_t row = 0; row < heightEven; row += 2) - { - for (size_t col = 0; col < alignedWidth; col += DF) - Store(dst + (col >> 1), Pooling2x2Max2x2(src + col, srcStride)); - if (widthEven - alignedWidth) - { - size_t col = widthEven - DF; - Store(dst + (col >> 1), Pooling2x2Max2x2(src + col, srcStride)); - } - if (width - widthEven) - dst[widthEven >> 1] = 
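// NeuralPooling2x2Max2x2 halves both dimensions, keeping the maximum of each 2x2
// block; the AVX version takes the vertical max of two rows first, then separates
// even and odd columns with _mm256_permute2f128_ps / _mm256_shuffle_ps for the
// horizontal max. Scalar reference for the even part (the code also handles odd tails):
#include <algorithm>
#include <cstddef>
inline void MaxPool2x2Scalar(const float * src, size_t srcStride, size_t width, size_t height,
                             float * dst, size_t dstStride)
{
    for (size_t row = 0; row + 1 < height; row += 2, src += 2 * srcStride, dst += dstStride)
        for (size_t col = 0; col + 1 < width; col += 2)
            dst[col >> 1] = std::max(std::max(src[col], src[col + 1]),
                                     std::max(src[srcStride + col], src[srcStride + col + 1]));
}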
Simd::Max(src[widthEven], src[widthEven + srcStride]); - src += 2 * srcStride; - dst += dstStride; - } - if (height - heightEven) - { - for (size_t col = 0; col < alignedWidth; col += DF) - Store(dst + (col >> 1), Pooling2x2Max2(src + col)); - if (widthEven - alignedWidth) - { - size_t col = widthEven - DF; - Store(dst + (col >> 1), Pooling2x2Max2(src + col)); - } - if (width - widthEven) - dst[widthEven >> 1] = src[widthEven]; - } - } - - void NeuralPooling2x2Max2x2(const float * src, size_t srcStride, size_t width, size_t height, float * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride, F) && Aligned(dst) && Aligned(dstStride, F)) - NeuralPooling2x2Max2x2(src, srcStride, width, height, dst, dstStride); - else - NeuralPooling2x2Max2x2(src, srcStride, width, height, dst, dstStride); - } - - namespace Ncf - { - namespace Ver0 - { - void PrepareB(const float * src, size_t srcWidth, size_t srcHeight, size_t srcDepth, size_t kernelX, size_t kernelY, - size_t padX, size_t padY, size_t strideX, size_t strideY, size_t dilationX, size_t dilationY, size_t dstWidth, size_t dstHeight, float * dst) - { - const size_t K = kernelX*kernelY*srcDepth, N = dstHeight*dstWidth; - if (dilationX*dilationY*strideX*strideY != 1) - { - for (size_t dstRow = 0; dstRow < dstHeight; ++dstRow) - { - size_t srcRow0 = dstRow*strideY - padY; - for (size_t dstCol = 0; dstCol < dstWidth; ++dstCol) - { - size_t srcCol0 = dstCol*strideX - padX; - for (size_t channel = 0; channel < srcDepth; ++channel) - { - for (size_t kernelRow = 0; kernelRow < kernelY; ++kernelRow) - { - size_t srcRow = srcRow0 + kernelRow*dilationY; - if (srcRow < srcHeight) - { - const float * psrc = src + (channel*srcHeight + srcRow)*srcWidth; - for (size_t kernelCol = 0; kernelCol < kernelX; ++kernelCol) - { - size_t srcCol = srcCol0 + kernelCol*dilationX; - if (srcCol < srcWidth) - *(dst++) = psrc[srcCol]; - else - *(dst++) = 0; - } - } - else - { - for (size_t kernelCol = 0; kernelCol < kernelX; ++kernelCol) - *(dst++) = 0; - } - } - } - } - } - } - else if (kernelX*kernelY != 1) - { - for (size_t dstRow = 0; dstRow < dstHeight; ++dstRow) - { - size_t srcRow0 = dstRow - padY; - for (size_t dstCol = 0; dstCol < dstWidth; ++dstCol) - { - size_t srcCol0 = dstCol - padX; - for (size_t channel = 0; channel < srcDepth; ++channel) - { - for (size_t kernelRow = 0; kernelRow < kernelY; ++kernelRow) - { - size_t srcRow = srcRow0 + kernelRow; - if (srcRow < srcHeight) - { - const float * psrc = src + (channel*srcHeight + srcRow)*srcWidth; - for (size_t kernelCol = 0; kernelCol < kernelX; ++kernelCol) - { - size_t srcCol = srcCol0 + kernelCol; - if (srcCol < srcWidth) - *(dst++) = psrc[srcCol]; - else - *(dst++) = 0; - } - } - else - { - for (size_t kernelCol = 0; kernelCol < kernelX; ++kernelCol) - *(dst++) = 0; - } - } - } - } - } - } - else - { - for (size_t i = 0; i < N; ++i) - { - for (size_t k = 0; k < K; ++k) - *(dst++) = src[k*N + i]; - } - } - } - - template static SIMD_INLINE void Kernel1x4x8(const __m256 & a, size_t K, const float * b, __m256 * sums) - { - sums[0] = _mm256_add_ps(sums[0], _mm256_mul_ps(a, Load(b + 0 * K))); - sums[1] = _mm256_add_ps(sums[1], _mm256_mul_ps(a, Load(b + 1 * K))); - sums[2] = _mm256_add_ps(sums[2], _mm256_mul_ps(a, Load(b + 2 * K))); - sums[3] = _mm256_add_ps(sums[3], _mm256_mul_ps(a, Load(b + 3 * K))); - } - - template static SIMD_INLINE void Kernel1x1x8(const __m256 & a, const float * b, __m256 & sum) - { - sum = _mm256_add_ps(sum, _mm256_mul_ps(a, Load(b))); - } - - SIMD_INLINE void 
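// Ncf::Ver0::PrepareB is a standard im2col: every kernelX * kernelY * srcDepth input
// patch becomes one contiguous K-long column of B, so the convolution collapses into
// a single matrix product with M = dstDepth, K = kernelX*kernelY*srcDepth and
// N = dstWidth * dstHeight. A sketch of the simplest branch (unit stride and
// dilation, no padding):
#include <cstddef>
inline void Im2ColSketch(const float * src, size_t srcWidth, size_t srcHeight, size_t srcDepth,
                         size_t kernelX, size_t kernelY, float * dst)
{
    size_t dstWidth = srcWidth - kernelX + 1, dstHeight = srcHeight - kernelY + 1;
    for (size_t y = 0; y < dstHeight; ++y)
        for (size_t x = 0; x < dstWidth; ++x)          // one output pixel ...
            for (size_t c = 0; c < srcDepth; ++c)      // ... emits one K-long patch
                for (size_t ky = 0; ky < kernelY; ++ky)
                    for (size_t kx = 0; kx < kernelX; ++kx)
                        *dst++ = src[(c * srcHeight + y + ky) * srcWidth + x + kx];
}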
Add4ExtractedSums(const __m256 * src, float * dst) - { - __m256 sum256 = _mm256_hadd_ps(_mm256_hadd_ps(src[0], src[1]), _mm256_hadd_ps(src[2], src[3])); - __m128 sum128 = _mm_add_ps(_mm256_extractf128_ps(sum256, 0), _mm256_extractf128_ps(sum256, 1)); - _mm_storeu_ps(dst, _mm_add_ps(_mm_loadu_ps(dst), sum128)); - } - - template static SIMD_INLINE void Kernel3x4x8(const __m256 * a, size_t K, const float * b, __m256 * sums) - { - __m256 _b; - _b = Avx::Load(b + 0 * K); - sums[0x0] = _mm256_add_ps(sums[0x0], _mm256_mul_ps(a[0], _b)); - sums[0x4] = _mm256_add_ps(sums[0x4], _mm256_mul_ps(a[1], _b)); - sums[0x8] = _mm256_add_ps(sums[0x8], _mm256_mul_ps(a[2], _b)); - _b = Avx::Load(b + 1 * K); - sums[0x1] = _mm256_add_ps(sums[0x1], _mm256_mul_ps(a[0], _b)); - sums[0x5] = _mm256_add_ps(sums[0x5], _mm256_mul_ps(a[1], _b)); - sums[0x9] = _mm256_add_ps(sums[0x9], _mm256_mul_ps(a[2], _b)); - _b = Avx::Load(b + 2 * K); - sums[0x2] = _mm256_add_ps(sums[0x2], _mm256_mul_ps(a[0], _b)); - sums[0x6] = _mm256_add_ps(sums[0x6], _mm256_mul_ps(a[1], _b)); - sums[0xA] = _mm256_add_ps(sums[0xA], _mm256_mul_ps(a[2], _b)); - _b = Avx::Load(b + 3 * K); - sums[0x3] = _mm256_add_ps(sums[0x3], _mm256_mul_ps(a[0], _b)); - sums[0x7] = _mm256_add_ps(sums[0x7], _mm256_mul_ps(a[1], _b)); - sums[0xB] = _mm256_add_ps(sums[0xB], _mm256_mul_ps(a[2], _b)); - } - - template static SIMD_INLINE void Kernel3x1x8(const __m256 * a, const float * b, __m256 * sums) - { - __m256 _b = Avx::Load(b); - sums[0x0] = _mm256_add_ps(sums[0x0], _mm256_mul_ps(a[0], _b)); - sums[0x1] = _mm256_add_ps(sums[0x1], _mm256_mul_ps(a[1], _b)); - sums[0x2] = _mm256_add_ps(sums[0x2], _mm256_mul_ps(a[2], _b)); - } - - template void Execute(size_t M, size_t N, size_t K, const float * a, const float * b, float * c) - { - size_t M3 = M / 3 * 3; - size_t N4 = Simd::AlignLo(N, 4); - size_t K8 = Simd::AlignLo(K, 8); - __m256 tailMask = RightNotZero32f(K - K8); - size_t i = 0; - for (; i < M3; i += 3) - { - const float * pa = a + i * K; - float * pc = c + i * N; - size_t j = 0; - for (; j < N4; j += 4) - { - const float * pb = b + j * K; - __m256 sums[12] = { - _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps(), - _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps(), - _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps() }; - __m256 _a[3]; - for (size_t k = 0; k < K8; k += 8) - { - _a[0] = Avx::Load(pa + k + 0 * K); - _a[1] = Avx::Load(pa + k + 1 * K); - _a[2] = Avx::Load(pa + k + 2 * K); - Kernel3x4x8(_a, K, pb + k, sums); - } - if (K8 < K) - { - size_t k = K - 8; - _a[0] = _mm256_and_ps(tailMask, Avx::Load(pa + k + 0 * K)); - _a[1] = _mm256_and_ps(tailMask, Avx::Load(pa + k + 1 * K)); - _a[2] = _mm256_and_ps(tailMask, Avx::Load(pa + k + 2 * K)); - Kernel3x4x8(_a, K, pb + k, sums); - } - Add4ExtractedSums(sums + 0, pc + j + 0 * N); - Add4ExtractedSums(sums + 4, pc + j + 1 * N); - Add4ExtractedSums(sums + 8, pc + j + 2 * N); - } - for (; j < N; ++j) - { - const float * pb = b + j * K; - __m256 sums[3] = { _mm256_setzero_ps(), _mm256_setzero_ps() , _mm256_setzero_ps() }; - __m256 _a[3]; - for (size_t k = 0; k < K8; k += 8) - { - _a[0] = Avx::Load(pa + k + 0 * K); - _a[1] = Avx::Load(pa + k + 1 * K); - _a[2] = Avx::Load(pa + k + 2 * K); - Kernel3x1x8(_a, pb + k, sums); - } - if (K8 < K) - { - size_t k = K - 8; - _a[0] = _mm256_and_ps(tailMask, Avx::Load(pa + k + 0 * K)); - _a[1] = _mm256_and_ps(tailMask, Avx::Load(pa + k + 1 * K)); - _a[2] = _mm256_and_ps(tailMask, Avx::Load(pa 
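// Execute above multiplies in 3 x 4 register blocks: three rows of A against four
// packed K-long columns of B give twelve __m256 accumulators, which together with
// three A registers and one B register nearly fill the sixteen ymm registers AVX
// provides; Add4ExtractedSums then collapses four accumulators into four scalars
// with two hadd steps. The block in scalar form (c points at the C[i, j] block,
// B stored column-contiguous as PrepareB produces it):
#include <cstddef>
inline void Gemm3x4BlockScalar(const float * a, const float * b, size_t K, size_t N, float * c)
{
    float sums[3][4] = { { 0.0f } };
    for (size_t k = 0; k < K; ++k)           // a single pass over K feeds all 12 sums
        for (size_t i = 0; i < 3; ++i)
            for (size_t j = 0; j < 4; ++j)
                sums[i][j] += a[i * K + k] * b[j * K + k];
    for (size_t i = 0; i < 3; ++i)
        for (size_t j = 0; j < 4; ++j)
            c[i * N + j] += sums[i][j];
}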
+ k + 2 * K)); - Kernel3x1x8(_a, pb + k, sums); - } - pc[j + 0 * N] += Avx::ExtractSum(sums[0]); - pc[j + 1 * N] += Avx::ExtractSum(sums[1]); - pc[j + 2 * N] += Avx::ExtractSum(sums[2]); - } - } - for (; i < M; ++i) - { - const float * pa = a + i*K; - float * pc = c + i*N; - size_t j = 0; - for (; j < N4; j += 4) - { - const float * pb = b + j*K; - __m256 sums[4] = { _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps() }; - for (size_t k = 0; k < K8; k += 8) - { - __m256 _a = Avx::Load(pa + k); - Kernel1x4x8(_a, K, pb + k, sums); - } - if (K8 < K) - { - size_t k = K - 8; - __m256 _a = _mm256_and_ps(tailMask, Avx::Load(pa + k)); - Kernel1x4x8(_a, K, pb + k, sums); - } - Add4ExtractedSums(sums + 0, pc + j); - } - for (; j < N; ++j) - { - const float * pb = b + j*K; - __m256 sum = _mm256_setzero_ps(); - for (size_t k = 0; k < K8; k += 8) - { - __m256 _a = Avx::Load(pa + k); - Kernel1x1x8(_a, pb + k, sum); - } - if (K8 < K) - { - size_t k = K - 8; - __m256 _a = _mm256_and_ps(tailMask, Avx::Load(pa + k)); - Kernel1x1x8(_a, pb + k, sum); - } - pc[j] += Avx::ExtractSum(sum); - } - } - } - - void Execute(size_t M, size_t N, size_t K, const float * a, const float * b, float * c) - { - if (Aligned(K, F)) - Execute(M, N, K, a, b, c); - else - Execute(M, N, K, a, b, c); - } - } - - namespace Ver1 - { - void PrepareA(const float * src, size_t M, size_t K, size_t cell, float * dst) - { - size_t K4 = AlignLo(K, 4), K8 = AlignLo(K, 8); - for (size_t i = 0; i < M; i += cell) - { - size_t n = Simd::Min(cell, M - i), k = 0; - if (cell == 4 && n == 4) - { - for (; k < K8; k += 8) - { - const float * ps = src + k; - __m256 s0 = Avx::Load(ps + 0 * K); - __m256 s1 = Avx::Load(ps + 1 * K); - __m256 s2 = Avx::Load(ps + 2 * K); - __m256 s3 = Avx::Load(ps + 3 * K); - __m256 s00 = _mm256_unpacklo_ps(s0, s2); - __m256 s01 = _mm256_unpacklo_ps(s1, s3); - __m256 s10 = _mm256_unpackhi_ps(s0, s2); - __m256 s11 = _mm256_unpackhi_ps(s1, s3); - __m256 d0 = _mm256_unpacklo_ps(s00, s01); - __m256 d1 = _mm256_unpackhi_ps(s00, s01); - __m256 d2 = _mm256_unpacklo_ps(s10, s11); - __m256 d3 = _mm256_unpackhi_ps(s10, s11); - Avx::Store(dst + 0, _mm256_permute2f128_ps(d0, d1, 0x20)); - Avx::Store(dst + 8, _mm256_permute2f128_ps(d2, d3, 0x20)); - Avx::Store(dst + 16, _mm256_permute2f128_ps(d0, d1, 0x31)); - Avx::Store(dst + 24, _mm256_permute2f128_ps(d2, d3, 0x31)); - dst += 32; - } - for (; k < K4; k += 4) - { - const float * ps = src + k; - __m128 s0 = Sse::Load(ps + 0 * K); - __m128 s1 = Sse::Load(ps + 1 * K); - __m128 s2 = Sse::Load(ps + 2 * K); - __m128 s3 = Sse::Load(ps + 3 * K); - __m128 s00 = _mm_unpacklo_ps(s0, s2); - __m128 s01 = _mm_unpacklo_ps(s1, s3); - __m128 s10 = _mm_unpackhi_ps(s0, s2); - __m128 s11 = _mm_unpackhi_ps(s1, s3); - Sse::Store(dst + 0, _mm_unpacklo_ps(s00, s01)); - Sse::Store(dst + 4, _mm_unpackhi_ps(s00, s01)); - Sse::Store(dst + 8, _mm_unpacklo_ps(s10, s11)); - Sse::Store(dst + 12, _mm_unpackhi_ps(s10, s11)); - dst += 16; - } - } - for (; k < K; ++k) - { - for (size_t c = 0; c < n; ++c) - *(dst++) = src[c*K + k]; - } - src += cell*K; - } - } - - void PrepareB(const float * src, size_t srcWidth, size_t srcHeight, size_t srcDepth, size_t kernelX, size_t kernelY, size_t padX, size_t padY, - size_t strideX, size_t strideY, size_t dilationX, size_t dilationY, size_t dstWidth, size_t dstHeight, size_t cell, float * tmp, float * dst) - { - const size_t K = kernelX*kernelY*srcDepth, N = dstHeight*dstWidth; - if (kernelX*kernelY != 1) - { - float * dst = tmp; - size_t channelSize = 
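// Ver1::PrepareA repacks A into cells of four rows interleaved element by element
// (the AVX path performs an 8 x 4 transpose with unpacklo/unpackhi + permute2f128),
// so the multiplication kernels can fetch the four A values belonging to one k from
// consecutive addresses. Scalar meaning for one full cell:
#include <cstddef>
inline void PackA4Scalar(const float * src, size_t K, float * dst)  // src: 4 rows of length K
{
    for (size_t k = 0; k < K; ++k)
        for (size_t r = 0; r < 4; ++r)
            *dst++ = src[r * K + k];
}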
srcHeight * srcWidth; - if (dilationX*dilationY*strideX*strideY != 1) - { - for (size_t channel = 0, k = 0; channel < srcDepth; ++channel, src += channelSize) - { - for (size_t kernelRow = 0; kernelRow < kernelY; ++kernelRow) - { - for (size_t kernelCol = 0; kernelCol < kernelX; ++kernelCol, ++k) - { - size_t srcRow = kernelRow*dilationY - padY; - for (size_t dstRow = 0; dstRow < dstHeight; ++dstRow) - { - if (srcRow < srcHeight) - { - size_t srcCol = kernelCol*dilationX - padX; - for (size_t dstCol = 0; dstCol < dstWidth; ++dstCol) - { - if (srcCol < srcWidth) - *(dst++) = src[srcRow*srcWidth + srcCol]; - else - *(dst++) = 0; - srcCol += strideX; - } - } - else - { - for (size_t dstCol = 0; dstCol < dstWidth; ++dstCol) - *(dst++) = 0; - } - srcRow += strideY; - } - } - } - } - } - else - { - const size_t bodySize = dstWidth - padX * 2; - for (size_t channel = 0, k = 0; channel < srcDepth; ++channel, src += channelSize) - { - for (size_t kernelRow = 0; kernelRow < kernelY; ++kernelRow) - { - for (size_t kernelCol = 0; kernelCol < kernelX; ++kernelCol, ++k) - { - size_t srcRow = kernelRow - padY; - for (size_t dstRow = 0; dstRow < dstHeight; ++dstRow, ++srcRow) - { - if (srcRow < srcHeight) - { - size_t srcCol = kernelCol - padX, dstCol = 0; - const float * psrc = src + srcRow*srcWidth; - for (; dstCol < padX; ++dstCol, ++srcCol) - { - if (srcCol < srcWidth) - *(dst++) = psrc[srcCol]; - else - *(dst++) = 0; - } - memcpy(dst, psrc + srcCol, bodySize * 4); - dst += bodySize; - dstCol += bodySize; - srcCol += bodySize; - for (; dstCol < dstWidth; ++dstCol, ++srcCol) - { - if (srcCol < srcWidth) - *(dst++) = psrc[srcCol]; - else - *(dst++) = 0; - } - } - else - { - memset(dst, 0, dstWidth * 4); - dst += dstWidth; - } - } - } - } - } - } - src = tmp; - } - if (cell == 16) - { - for (size_t j = 0; j < N; j += cell) - { - size_t n = Simd::Min(cell, N - j); - if (n == cell) - { - for (size_t k = 0; k < K; ++k) - { - const float * psrc = src + k*N; - Store(dst + 0, Load(psrc + 0)); - Store(dst + 8, Load(psrc + 8)); - dst += 16; - } - } - else - { - for (size_t k = 0; k < K; ++k) - { - const float * psrc = src + k*N; - size_t c = 0; - for (; c < n; ++c) - *(dst++) = *(psrc++); - for (; c < cell; ++c) - *(dst++) = 0; - } - } - src += cell; - } - } - else - { - for (size_t j = 0; j < N; j += cell) - { - size_t n = Simd::Min(cell, N - j); - for (size_t k = 0; k < K; ++k) - { - const float * psrc = src + k*N; - size_t c = 0; - for (; c < n; ++c) - *(dst++) = *(psrc++); - for (; c < cell; ++c) - *(dst++) = 0; - } - src += cell; - } - } - } - - SIMD_INLINE void AddSum(const __m256 & sum, float * dst) - { - Store(dst, _mm256_add_ps(Load(dst), sum)); - } - - SIMD_INLINE void AddSums8(const __m256 * sums, size_t size, const float * mask, float * dst, size_t stride) - { - if (mask) - { - __m256 _mask = _mm256_loadu_ps(mask); - for (size_t i = 0; i < size; ++i, dst += stride) - AddSum(_mm256_and_ps(_mask, sums[i]), dst); - } - else - { - for (size_t i = 0; i < size; ++i, dst += stride) - AddSum(sums[i], dst); - } - } - - template SIMD_INLINE void KernelMx8(size_t N, size_t K, const float * a, const float * b, float * c, const float * mask, size_t m) - { - __m256 sums[4] = { _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps() }; - for (size_t k = 0; k < K; ++k) - { - __m256 b0 = Load(b); - for (size_t s = 0; s < m; ++s) - sums[s] = _mm256_add_ps(sums[s], _mm256_mul_ps(_mm256_set1_ps(a[s]), b0)); - b += 8; - a += m; - } - AddSums8(sums, m, mask, c, N); - } - - template SIMD_INLINE 
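// The Ver1 kernels invert the reduction direction of Ver0: instead of summing across
// a register and extracting, one element of the packed A panel is broadcast with
// _mm256_set1_ps and multiplied by an 8- or 16-wide slice of packed B, so every
// accumulator lane is a distinct output column and C is updated directly (masked on
// the ragged edge). Kernel4x8 in scalar form, assuming the packed layouts produced
// by PrepareA / PrepareB above:
#include <cstddef>
inline void Kernel4x8Scalar(size_t N, size_t K, const float * a, const float * b, float * c)
{
    float sums[4][8] = { { 0.0f } };
    for (size_t k = 0; k < K; ++k, a += 4, b += 8)  // a: 4 rows interleaved; b: 8 columns interleaved
        for (size_t i = 0; i < 4; ++i)
            for (size_t j = 0; j < 8; ++j)
                sums[i][j] += a[i] * b[j];
    for (size_t i = 0; i < 4; ++i)
        for (size_t j = 0; j < 8; ++j)
            c[i * N + j] += sums[i][j];
}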
void Kernel4x8(size_t N, size_t K, const float * a, const float * b, float * c, const float * mask) - { - __m256 sums[4] = { _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps() }; - for (size_t k = 0; k < K; ++k) - { - __m256 b0 = Load(b); - sums[0] = _mm256_add_ps(sums[0], _mm256_mul_ps(_mm256_set1_ps(a[0]), b0)); - sums[1] = _mm256_add_ps(sums[1], _mm256_mul_ps(_mm256_set1_ps(a[1]), b0)); - sums[2] = _mm256_add_ps(sums[2], _mm256_mul_ps(_mm256_set1_ps(a[2]), b0)); - sums[3] = _mm256_add_ps(sums[3], _mm256_mul_ps(_mm256_set1_ps(a[3]), b0)); - b += 8; - a += 4; - } - AddSums8(sums, 4, mask, c, N); - } - - template void Execute4x8(size_t M, size_t N, size_t K, const float * a, const float * b, float * c) - { - size_t M4 = Simd::AlignLo(M, 4); - size_t N8 = Simd::AlignLo(N, 8); - const int32_t mask[16] = { -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0 }; - const float * tail = (float*)mask + 8 - N + N8; - size_t i = 0; - for (; i < M4; i += 4) - { - size_t j = 0; - for (; j < N8; j += 8) - Kernel4x8(N, K, a + i*K, b + j*K, c + i*N + j, NULL); - if (N8 < N) - Kernel4x8(N, K, a + i*K, b + j*K, c + i*N + j, tail); - } - if (M4 < M) - { - size_t j = 0; - for (; j < N8; j += 8) - KernelMx8(N, K, a + i*K, b + j*K, c + i*N + j, NULL, M - M4); - if (N8 < N) - KernelMx8(N, K, a + i*K, b + j*K, c + i*N + j, tail, M - M4); - } - } - - SIMD_INLINE void AddSums16(const __m256 * sums, size_t size, const float * mask, float * dst, size_t stride) - { - if (mask) - { - __m256 mask0 = _mm256_loadu_ps(mask + 0); - __m256 mask1 = _mm256_loadu_ps(mask + 8); - for (size_t i = 0; i < size; ++i, dst += stride) - { - AddSum(_mm256_and_ps(mask0, sums[i + 0]), dst + 0); - AddSum(_mm256_and_ps(mask1, sums[i + 4]), dst + 8); - } - } - else - { - for (size_t i = 0; i < size; ++i, dst += stride) - { - AddSum(sums[i + 0], dst + 0); - AddSum(sums[i + 4], dst + 8); - } - } - } - - template SIMD_INLINE void KernelMx16(size_t N, size_t K, const float * a, const float * b, float * c, const float * mask, size_t m) - { - __m256 sums[8] = { _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps(), - _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps() }; - for (size_t k = 0; k < K; ++k) - { - __m256 b0 = Load(b + 0); - __m256 b1 = Load(b + 8); - for (size_t s = 0; s < m; ++s) - { - __m256 a0 = _mm256_set1_ps(a[s]); - sums[s + 0] = _mm256_add_ps(sums[s + 0], _mm256_mul_ps(b0, a0)); - sums[s + 4] = _mm256_add_ps(sums[s + 4], _mm256_mul_ps(b1, a0)); - } - b += 16; - a += m; - } - AddSums16(sums, m, mask, c, N); - } - - template SIMD_INLINE void Kernel4x16(size_t N, size_t K, const float * a, const float * b, float * c, const float * mask) - { - __m256 sums[8] = { _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps(), - _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps() }; - for (size_t k = 0; k < K; ++k) - { - __m256 b0 = Load(b + 0); - __m256 b1 = Load(b + 8); - __m256 a0 = _mm256_set1_ps(a[0]); - sums[0] = _mm256_add_ps(sums[0], _mm256_mul_ps(b0, a0)); - sums[4] = _mm256_add_ps(sums[4], _mm256_mul_ps(b1, a0)); - __m256 a1 = _mm256_set1_ps(a[1]); - sums[1] = _mm256_add_ps(sums[1], _mm256_mul_ps(b0, a1)); - sums[5] = _mm256_add_ps(sums[5], _mm256_mul_ps(b1, a1)); - __m256 a2 = _mm256_set1_ps(a[2]); - sums[2] = _mm256_add_ps(sums[2], _mm256_mul_ps(b0, a2)); - sums[6] = _mm256_add_ps(sums[6], _mm256_mul_ps(b1, a2)); - __m256 a3 = _mm256_set1_ps(a[3]); - sums[3] = 
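// Note on the ragged N edge in Execute4x8 / Execute4x16: PrepareB pads B to a whole
// cell, the kernel always computes the full 8 (or 16) columns, and AddSums8 /
// AddSums16 AND the result with a pointer aimed into a { -1, ..., -1, 0, ..., 0 }
// table (tail = (float*)mask + 8 - N + N8), so exactly the N - N8 valid columns
// reach C while the padding lanes are discarded.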
_mm256_add_ps(sums[3], _mm256_mul_ps(b0, a3)); - sums[7] = _mm256_add_ps(sums[7], _mm256_mul_ps(b1, a3)); - b += 16; - a += 4; - } - AddSums16(sums, 4, mask, c, N); - } - - template void Execute4x16(size_t M, size_t N, size_t K, const float * a, const float * b, float * c) - { - size_t M4 = Simd::AlignLo(M, 4); - size_t N16 = Simd::AlignLo(N, 16); - const int32_t mask[32] = { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; - const float * tail = (float*)mask + 16 - N + N16; - size_t i = 0; - for (; i < M4; i += 4) - { - size_t j = 0; - for (; j < N16; j += 16) - Kernel4x16(N, K, a + i*K, b + j*K, c + i*N + j, NULL); - if (N16 < N) - Kernel4x16(N, K, a + i*K, b + j*K, c + i*N + j, tail); - } - if (M4 < M) - { - size_t j = 0; - for (; j < N16; j += 16) - KernelMx16(N, K, a + i*K, b + j*K, c + i*N + j, NULL, M - M4); - if (N16 < N) - KernelMx16(N, K, a + i*K, b + j*K, c + i*N + j, tail, M - M4); - } - } - - void Execute(size_t M, size_t N, size_t K, const float * a, const float * b, float * c, size_t cellA, size_t cellB) - { - if (cellA == 4) - { - if (cellB == 8) - Execute4x8(M, N, K, a, b, c); - if (cellB == 16) - Execute4x16(M, N, K, a, b, c); - } - } - } - - namespace Ver2 - { - void PrepareB(const float * src, size_t srcWidth, size_t srcHeight, size_t srcDepth, size_t padX, size_t padY, float * dst, size_t dstWidth, size_t dstHeight) - { - for (size_t channel = 0; channel < srcDepth; ++channel) - { - const float * s = src; - float * d = dst; - memset(d, 0, padY*dstWidth * 4); - d += padY*dstWidth; - for (size_t row = padY; row < dstHeight - padY; ++row) - { - memset(d, 0, padX * 4); - memcpy(d + padX, s, srcWidth * 4); - memset(d + padX + srcWidth, 0, padX * 4); - d += dstWidth; - s += srcWidth; - } - memset(d, 0, padY*dstWidth * 4); - src += srcWidth*srcHeight; - dst += dstWidth*dstHeight; - } - } - - template void AddConvolution8x8(const float * src, size_t srcWidth, size_t srcHeight, size_t srcDepth, - const float * weight, float * dst, size_t dstDepth) - { - __m256 _weight[kernelX*kernelY]; - for (size_t dstChannel = 0; dstChannel < dstDepth; ++dstChannel) - { - __m256 _dst[8]; - float * pdst = dst; - for (size_t row = 0; row < 8; ++row, pdst += 8) - _dst[row] = Avx::Load(pdst); - if (kernelY < 4) - { - for (size_t srcChannel = 0; srcChannel < srcDepth; ++srcChannel) - { - const float * psrc = src + srcWidth*srcHeight*srcChannel; - LoadWeightsForward(weight, _weight); - for (size_t row = 0; row < 8; ++row) - { - _dst[row] = _mm256_add_ps(_dst[row], Convolution::template Forward(psrc, srcWidth, _weight)); - psrc += srcWidth; - } - weight += kernelX*kernelY; - } - } - else - { - for (size_t srcChannel = 0; srcChannel < srcDepth; ++srcChannel) - { - const float * psrc = src + srcWidth*srcHeight*srcChannel; - for (size_t dy = 0; dy < kernelY; dy++) - { - const float * ps = psrc + dy*srcWidth; - LoadWeightsForward(weight, _weight); - for (size_t row = 0; row < 8; ++row) - { - _dst[row] = _mm256_add_ps(_dst[row], Convolution::template RowConvolution(ps, _weight)); - ps += srcWidth; - } - weight += kernelX; - } - } - } - for (size_t row = 0; row < 8; ++row, dst += 8) - Avx::Store(dst, _dst[row]); - } - } - - template void AddConvolution(const float * src, size_t srcWidth, size_t srcHeight, size_t srcDepth, - const float * weight, float * dst, size_t dstWidth, size_t dstHeight, size_t dstDepth) - { - if (dstWidth == 8 && dstHeight == 8) - { - AddConvolution8x8(src, srcWidth, srcHeight, srcDepth, weight, dst, dstDepth); - 
return; - } - size_t alignedWidth = AlignLo(dstWidth, F); - __m256 tailMask = RightNotZero32f(dstWidth - alignedWidth); - __m256 _weight[kernelX*kernelY]; - for (size_t srcChannel = 0; srcChannel < srcDepth; ++srcChannel) - { - for (size_t dstChannel = 0; dstChannel < dstDepth; ++dstChannel) - { - const float * psrc = src + srcWidth*srcHeight*srcChannel; - const float * pweight = weight + (dstChannel*srcDepth + srcChannel)*kernelX*kernelY; - float * pdst = dst + dstWidth*dstHeight*dstChannel; - LoadWeightsForward(pweight, _weight); - for (size_t row = 0; row < dstHeight; ++row) - { - size_t col = 0; - for (; col < alignedWidth; col += F) - { - __m256 _dst = Load(pdst + col); - _dst = _mm256_add_ps(_dst, Convolution::template Forward(psrc + col, srcWidth, _weight)); - Store(pdst + col, _dst); - } - if (dstWidth - alignedWidth) - { - size_t col = dstWidth - F; - __m256 _dst = Load(pdst + col); - _dst = _mm256_add_ps(_dst, _mm256_and_ps(tailMask, Convolution::template Forward(psrc + col, srcWidth, _weight))); - Store(pdst + col, _dst); - } - psrc += srcWidth; - pdst += dstWidth; - } - } - } - } - - void Execute(const float * src, size_t srcWidth, size_t srcHeight, size_t srcDepth, - const float * weight, size_t kernelX, size_t kernelY, float * dst, size_t dstWidth, size_t dstHeight, size_t dstDepth) - { - assert(kernelX == kernelY); - if (kernelX == 2) - AddConvolution(src, srcWidth, srcHeight, srcDepth, weight, dst, dstWidth, dstHeight, dstDepth); - else if (kernelX == 3) - AddConvolution(src, srcWidth, srcHeight, srcDepth, weight, dst, dstWidth, dstHeight, dstDepth); - else if (kernelX == 4) - AddConvolution(src, srcWidth, srcHeight, srcDepth, weight, dst, dstWidth, dstHeight, dstDepth); - else if (kernelX == 5) - AddConvolution(src, srcWidth, srcHeight, srcDepth, weight, dst, dstWidth, dstHeight, dstDepth); - else - assert(0); - } - - bool Preferable(size_t srcDepth, size_t kernelX, size_t kernelY, size_t strideX, size_t strideY, size_t dilationX, size_t dilationY, size_t dstWidth, size_t dstHeight, size_t dstDepth) - { - if (kernelX == kernelY && kernelX >= 2 && kernelX <= 5 && strideX*strideY*dilationX*dilationY == 1 && dstWidth >= F) - { - if (dstWidth*dstHeight*kernelX*kernelY >= 8 * 8 * 3 * 3) - return true; - } - return false; - } - } - - struct Opt - { - enum Alg - { - None, - Ver0, - Ver1, - Ver2, - } alg; - - size_t sizeA; - size_t sizeB; - size_t sizeT; - - size_t cellA; - size_t cellB; - - size_t M, N, K; - size_t strideB; - size_t paddedW; - size_t paddedH; - - Opt(size_t srcWidth, size_t srcHeight, size_t srcDepth, size_t kernelX, size_t kernelY, size_t padX, size_t padY, size_t strideX, size_t strideY, size_t dilationX, size_t dilationY, size_t dstWidth, size_t dstHeight, size_t dstDepth) - { - alg = None; - sizeA = 0; - sizeB = 0; - sizeT = 0; - cellA = 1; - cellB = 1; - - M = dstDepth; - N = dstHeight*dstWidth; - K = kernelX*kernelY*srcDepth; - - if (dstWidth*dstHeight / kernelX <= 2000) - alg = Ver0; - else - alg = Ver1; - if (Ver2::Preferable(srcDepth, kernelX, kernelY, strideX, strideY, dilationX, dilationY, dstWidth, dstHeight, dstDepth)) - alg = Ver2; - - switch (alg) - { - case Ver0: - sizeB = N*K; - break; - case Ver1: - cellA = 4; - cellB = 16; - sizeA = M*K; - strideB = Simd::AlignHi(N, cellB); - sizeB = strideB*K; - if (kernelX*kernelY > 1) - sizeT = sizeB; - break; - case Ver2: - if (padX > 0 || padY > 0) - { - paddedW = Simd::AlignHi(srcWidth + 2 * padX, F); - paddedH = srcHeight + 2 * padY; - sizeB = paddedW*paddedH*srcDepth; - } - else - { - paddedW = 
srcWidth; - paddedH = srcHeight; - } - break; - default: - assert(0); - break; - } - } - }; - - struct Data - { - float * a; - float * b; - float * t; - - Data(size_t sizeA, size_t sizeB, size_t sizeT, void * externalData, size_t * externalSize) - : a(0) - , b(0) - , _data(0) - { - sizeA = AlignHi(sizeA, F); - sizeB = AlignHi(sizeB, F); - sizeT = AlignHi(sizeT, F); - size_t size = (sizeA + sizeB + sizeT) * sizeof(float); - if (size == 0) - return; - if (externalData != AlignHi(externalData, SIMD_ALIGN)) - size += SIMD_ALIGN; - float * data = NULL; - if (externalData == NULL || externalSize == NULL || *externalSize < size) - { - _data = Simd::Allocate(size); - if (externalSize) - *externalSize = size; - data = (float*)_data; - } - else - data = (float*)AlignHi(externalData, SIMD_ALIGN); - if (sizeA) - a = data; - if (sizeB) - b = data + sizeA; - if (sizeT) - t = data + sizeA + sizeB; - } - - ~Data() - { - if (_data) - Simd::Free(_data); - } - - private: - void * _data; - }; - } - - void NeuralConvolutionForward(const float * src, size_t srcWidth, size_t srcHeight, size_t srcDepth, - const float * weight, size_t kernelX, size_t kernelY, size_t padX, size_t padY, size_t strideX, size_t strideY, size_t dilationX, size_t dilationY, - void * buffer, size_t * size, float * dst, size_t dstWidth, size_t dstHeight, size_t dstDepth, int add) - { - using namespace Ncf; - - assert(dstWidth == (srcWidth + 2 * padX - (dilationX * (kernelX - 1) + 1)) / strideX + 1); - assert(dstHeight == (srcHeight + 2 * padY - (dilationY * (kernelY - 1) + 1)) / strideY + 1); - - if (!add) - memset(dst, 0, dstWidth*dstHeight*dstDepth * sizeof(float)); - - Opt opt(srcWidth, srcHeight, srcDepth, kernelX, kernelY, padX, padY, strideX, strideY, dilationX, dilationY, dstWidth, dstHeight, dstDepth); - - Data data(opt.sizeA, opt.sizeB, opt.sizeT, buffer, size); - - if (opt.sizeA) - { - switch (opt.alg) - { - case Opt::Ver1: Ver1::PrepareA(weight, opt.M, opt.K, opt.cellA, data.a); - default: - break; - } - } - else - data.a = (float*)weight; - - if (opt.sizeB) - { - switch (opt.alg) - { - case Opt::Ver0: Ver0::PrepareB(src, srcWidth, srcHeight, srcDepth, kernelX, kernelY, padX, padY, strideX, strideY, dilationX, dilationY, dstWidth, dstHeight, data.b); break; - case Opt::Ver1: Ver1::PrepareB(src, srcWidth, srcHeight, srcDepth, kernelX, kernelY, padX, padY, strideX, strideY, dilationX, dilationY, dstWidth, dstHeight, opt.cellB, data.t, data.b); break; - case Opt::Ver2: Ver2::PrepareB(src, srcWidth, srcHeight, srcDepth, padX, padY, data.b, opt.paddedW, opt.paddedH); break; - default: break; - } - } - else - data.b = (float*)src; - - switch (opt.alg) - { - case Opt::Ver0: Ver0::Execute(opt.M, opt.N, opt.K, data.a, data.b, dst); break; - case Opt::Ver1: Ver1::Execute(opt.M, opt.N, opt.K, data.a, data.b, dst, opt.cellA, opt.cellB); break; - case Opt::Ver2: Ver2::Execute(data.b, opt.paddedW, opt.paddedH, srcDepth, weight, kernelX, kernelY, dst, dstWidth, dstHeight, dstDepth); break; - default: break; - } - } - } -#endif// SIMD_AVX_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx1Resizer.cpp b/src/3rd/Simd/Simd/SimdAvx1Resizer.cpp deleted file mode 100644 index 0eb6dd35..00000000 --- a/src/3rd/Simd/Simd/SimdAvx1Resizer.cpp +++ /dev/null @@ -1,155 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdResizer.h" -#include "Simd/SimdStore.h" - -namespace Simd -{ -#ifdef SIMD_AVX_ENABLE - namespace Avx - { - ResizerFloatBilinear::ResizerFloatBilinear(const ResParam & param) - : Base::ResizerFloatBilinear(param) - { - } - - void ResizerFloatBilinear::Run(const float * src, size_t srcStride, float * dst, size_t dstStride) - { - size_t cn = _param.channels; - size_t rs = _param.dstW * cn; - float * pbx[2] = { _bx[0].data, _bx[1].data }; - int32_t prev = -2; - size_t rsa = AlignLo(rs, Avx::F); - size_t rsh = AlignLo(rs, Sse::F); - for (size_t dy = 0; dy < _param.dstH; dy++, dst += dstStride) - { - float fy1 = _ay[dy]; - float fy0 = 1.0f - fy1; - int32_t sy = _iy[dy]; - int32_t k = 0; - - if (sy == prev) - k = 2; - else if (sy == prev + 1) - { - Swap(pbx[0], pbx[1]); - k = 1; - } - - prev = sy; - - for (; k < 2; k++) - { - float * pb = pbx[k]; - const float * ps = src + (sy + k)*srcStride; - size_t dx = 0; - if (cn == 1) - { - __m256 _1 = _mm256_set1_ps(1.0f); - for (; dx < rsa; dx += Avx::F) - { - __m256 s0145 = Avx::Load(ps + _ix[dx + 0], ps + _ix[dx + 1], ps + _ix[dx + 4], ps + _ix[dx + 5]); - __m256 s2367 = Avx::Load(ps + _ix[dx + 2], ps + _ix[dx + 3], ps + _ix[dx + 6], ps + _ix[dx + 7]); - __m256 fx1 = _mm256_load_ps(_ax.data + dx); - __m256 fx0 = _mm256_sub_ps(_1, fx1); - __m256 m0 = _mm256_mul_ps(fx0, _mm256_shuffle_ps(s0145, s2367, 0x88)); - __m256 m1 = _mm256_mul_ps(fx1, _mm256_shuffle_ps(s0145, s2367, 0xDD)); - _mm256_store_ps(pb + dx, _mm256_add_ps(m0, m1)); - } - for (; dx < rsh; dx += Sse::F) - { - __m128 s01 = Sse::Load(ps + _ix[dx + 0], ps + _ix[dx + 1]); - __m128 s23 = Sse::Load(ps + _ix[dx + 2], ps + _ix[dx + 3]); - __m128 fx1 = _mm_load_ps(_ax.data + dx); - __m128 fx0 = _mm_sub_ps(_mm256_castps256_ps128(_1), fx1); - __m128 m0 = _mm_mul_ps(fx0, _mm_shuffle_ps(s01, s23, 0x88)); - __m128 m1 = _mm_mul_ps(fx1, _mm_shuffle_ps(s01, s23, 0xDD)); - _mm_store_ps(pb + dx, _mm_add_ps(m0, m1)); - } - } - if (cn == 3 && rs > 3) - { - __m256 _1 = _mm256_set1_ps(1.0f); - size_t rs3 = rs - 3; - size_t rs6 = AlignLoAny(rs3, 6); - for (; dx < rs6; dx += 6) - { - __m256 s0 = Load(ps + _ix[dx + 0] + 0, ps + _ix[dx + 3] + 0); - __m256 s1 = Load(ps + _ix[dx + 0] + 3, ps + _ix[dx + 3] + 3); - __m256 fx1 = Load(_ax.data + dx + 0, _ax.data + dx + 3); - __m256 fx0 = _mm256_sub_ps(_1, fx1); - Store(pb + dx + 0, pb + dx + 3, _mm256_add_ps(_mm256_mul_ps(fx0, s0), _mm256_mul_ps(fx1, s1))); - } - for 
(; dx < rs3; dx += 3) - { - __m128 s0 = _mm_loadu_ps(ps + _ix[dx] + 0); - __m128 s1 = _mm_loadu_ps(ps + _ix[dx] + 3); - __m128 fx1 = _mm_set1_ps(_ax.data[dx]); - __m128 fx0 = _mm_sub_ps(_mm256_castps256_ps128(_1), fx1); - _mm_storeu_ps(pb + dx, _mm_add_ps(_mm_mul_ps(fx0, s0), _mm_mul_ps(fx1, s1))); - } - } - for (; dx < rs; dx++) - { - int32_t sx = _ix[dx]; - float fx = _ax[dx]; - pb[dx] = ps[sx] * (1.0f - fx) + ps[sx + cn] * fx; - } - } - - size_t dx = 0; - __m256 _fy0 = _mm256_set1_ps(fy0); - __m256 _fy1 = _mm256_set1_ps(fy1); - for (; dx < rsa; dx += Avx::F) - { - __m256 m0 = _mm256_mul_ps(_mm256_load_ps(pbx[0] + dx), _fy0); - __m256 m1 = _mm256_mul_ps(_mm256_load_ps(pbx[1] + dx), _fy1); - _mm256_storeu_ps(dst + dx, _mm256_add_ps(m0, m1)); - } - for (; dx < rsh; dx += Sse::F) - { - __m128 m0 = _mm_mul_ps(_mm_load_ps(pbx[0] + dx), _mm256_castps256_ps128(_fy0)); - __m128 m1 = _mm_mul_ps(_mm_load_ps(pbx[1] + dx), _mm256_castps256_ps128(_fy1)); - _mm_storeu_ps(dst + dx, _mm_add_ps(m0, m1)); - } - for (; dx < rs; dx++) - dst[dx] = pbx[0][dx] * fy0 + pbx[1][dx] * fy1; - } - } - - //--------------------------------------------------------------------- - - void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method) - { - ResParam param(srcX, srcY, dstX, dstY, channels, type, method, sizeof(__m256)); - if (param.IsFloatBilinear()) - return new ResizerFloatBilinear(param); - else - return Sse41::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method); - } - } -#endif //SIMD_AVX_ENABLE -} - diff --git a/src/3rd/Simd/Simd/SimdAvx1SquaredDifferenceSum.cpp b/src/3rd/Simd/Simd/SimdAvx1SquaredDifferenceSum.cpp deleted file mode 100644 index 32d90638..00000000 --- a/src/3rd/Simd/Simd/SimdAvx1SquaredDifferenceSum.cpp +++ /dev/null @@ -1,131 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdExtract.h" - -namespace Simd -{ -#ifdef SIMD_AVX_ENABLE - namespace Avx - { - template SIMD_INLINE void SquaredDifferenceSum32f(const float * a, const float * b, size_t offset, __m256 & sum) - { - __m256 _a = Load(a + offset); - __m256 _b = Load(b + offset); - __m256 _d = _mm256_sub_ps(_a, _b); - sum = _mm256_add_ps(sum, _mm256_mul_ps(_d, _d)); - } - - template SIMD_INLINE void SquaredDifferenceSum32f(const float * a, const float * b, size_t size, float * sum) - { - if (align) - assert(Aligned(a) && Aligned(b)); - - *sum = 0; - size_t partialAlignedSize = AlignLo(size, 8); - size_t fullAlignedSize = AlignLo(size, 32); - size_t i = 0; - if (partialAlignedSize) - { - __m256 sums[4] = { _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps() }; - if (fullAlignedSize) - { - for (; i < fullAlignedSize; i += 32) - { - SquaredDifferenceSum32f(a, b, i, sums[0]); - SquaredDifferenceSum32f(a, b, i + 8, sums[1]); - SquaredDifferenceSum32f(a, b, i + 16, sums[2]); - SquaredDifferenceSum32f(a, b, i + 24, sums[3]); - } - sums[0] = _mm256_add_ps(_mm256_add_ps(sums[0], sums[1]), _mm256_add_ps(sums[2], sums[3])); - } - for (; i < partialAlignedSize; i += 8) - SquaredDifferenceSum32f(a, b, i, sums[0]); - *sum += ExtractSum(sums[0]); - } - for (; i < size; ++i) - *sum += Simd::Square(a[i] - b[i]); - } - - void SquaredDifferenceSum32f(const float * a, const float * b, size_t size, float * sum) - { - if (Aligned(a) && Aligned(b)) - SquaredDifferenceSum32f(a, b, size, sum); - else - SquaredDifferenceSum32f(a, b, size, sum); - } - - template SIMD_INLINE void SquaredDifferenceKahanSum32f(const float * a, const float * b, size_t offset, __m256 & sum, __m256 & correction) - { - __m256 _a = Load(a + offset); - __m256 _b = Load(b + offset); - __m256 _d = _mm256_sub_ps(_a, _b); - __m256 term = _mm256_sub_ps(_mm256_mul_ps(_d, _d), correction); - __m256 temp = _mm256_add_ps(sum, term); - correction = _mm256_sub_ps(_mm256_sub_ps(temp, sum), term); - sum = temp; - } - - template SIMD_INLINE void SquaredDifferenceKahanSum32f(const float * a, const float * b, size_t size, float * sum) - { - if (align) - assert(Aligned(a) && Aligned(b)); - - *sum = 0; - size_t partialAlignedSize = AlignLo(size, 8); - size_t fullAlignedSize = AlignLo(size, 32); - size_t i = 0; - if (partialAlignedSize) - { - __m256 sums[4] = { _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps() }; - __m256 corrections[4] = { _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps() }; - if (fullAlignedSize) - { - for (; i < fullAlignedSize; i += 32) - { - SquaredDifferenceKahanSum32f(a, b, i, sums[0], corrections[0]); - SquaredDifferenceKahanSum32f(a, b, i + 8, sums[1], corrections[1]); - SquaredDifferenceKahanSum32f(a, b, i + 16, sums[2], corrections[2]); - SquaredDifferenceKahanSum32f(a, b, i + 24, sums[3], corrections[3]); - } - } - for (; i < partialAlignedSize; i += 8) - SquaredDifferenceKahanSum32f(a, b, i, sums[0], corrections[0]); - *sum += ExtractSum(_mm256_add_ps(_mm256_add_ps(sums[0], sums[1]), _mm256_add_ps(sums[2], sums[3]))); - } - for (; i < size; ++i) - *sum += Simd::Square(a[i] - b[i]); - } - - void SquaredDifferenceKahanSum32f(const float * a, const float * b, size_t size, float * sum) - { - if (Aligned(a) && Aligned(b)) - SquaredDifferenceKahanSum32f(a, b, size, sum); - else - SquaredDifferenceKahanSum32f(a, b, size, sum); - } - } -#endif// SIMD_AVX_ENABLE -} diff --git 
a/src/3rd/Simd/Simd/SimdAvx1Svm.cpp b/src/3rd/Simd/Simd/SimdAvx1Svm.cpp deleted file mode 100644 index a2109f5d..00000000 --- a/src/3rd/Simd/Simd/SimdAvx1Svm.cpp +++ /dev/null @@ -1,91 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2018 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdExtract.h" - -namespace Simd -{ -#ifdef SIMD_AVX_ENABLE - namespace Avx - { - namespace - { - struct Buffer - { - Buffer(size_t count) - { - size_t size = sizeof(float)*count; - _p = Allocate(size); - memset(_p, 0, size); - sums = (float*)_p; - } - - ~Buffer() - { - Free(_p); - } - - float * sums; - private: - void *_p; - }; - } - - void SvmSumLinear(const float * x, const float * svs, const float * weights, size_t length, size_t count, float * sum) - { - Buffer buffer(count); - size_t alignedCount = AlignLo(count, F); - - for (size_t j = 0; j < length; ++j) - { - size_t i = 0; - float v = x[j]; - __m256 _v = _mm256_set1_ps(v); - for (; i < alignedCount; i += F) - { - __m256 sums = Load<true>(buffer.sums + i); - __m256 _svs = Load<false>(svs + i); - Store<true>(buffer.sums + i, _mm256_add_ps(sums, _mm256_mul_ps(_v, _svs))); - } - for (; i < count; ++i) - buffer.sums[i] += v*svs[i]; - svs += count; - } - - size_t i = 0; - __m256 _sum = _mm256_setzero_ps(); - for (; i < alignedCount; i += F) - { - __m256 sums = Load<true>(buffer.sums + i); - __m256 _weights = Load<false>(weights + i); - _sum = _mm256_add_ps(_sum, _mm256_mul_ps(sums, _weights)); - } - *sum = ExtractSum(_sum); - for (; i < count; ++i) - *sum += buffer.sums[i] * weights[i]; - } - } -#endif// SIMD_AVX_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx1Synet.cpp b/src/3rd/Simd/Simd/SimdAvx1Synet.cpp deleted file mode 100644 index 42908c48..00000000 --- a/src/3rd/Simd/Simd/SimdAvx1Synet.cpp +++ /dev/null @@ -1,885 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar.
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdExtract.h" -#include "Simd/SimdSynet.h" -#include "Simd/SimdBase.h" -#include "Simd/SimdSse1.h" -#include "Simd/SimdAvx1.h" - -namespace Simd -{ -#ifdef SIMD_AVX_ENABLE - namespace Avx - { - template SIMD_INLINE void SynetAddBias(const float * bias, float * dst) - { - Store(dst, _mm256_add_ps(Load(dst), Load(bias))); - } - - template SIMD_INLINE void SynetAddBias(__m256 bias, float * dst) - { - Store(dst, _mm256_add_ps(Load(dst), bias)); - } - - template void SynetAddBiasNchw(const float * bias, size_t channels, size_t spatial, float * dst) - { - if (align) - assert(Aligned(spatial, F) && Aligned(dst)); - - size_t aligned = AlignLo(spatial, QF); - size_t partial = AlignLo(spatial, F); - for (size_t c = 0; c < channels; ++c) - { - size_t s = 0; - if (partial) - { - __m256 _bias = _mm256_set1_ps(bias[c]); - for (; s < aligned; s += QF) - { - SynetAddBias(_bias, dst + s + F * 0); - SynetAddBias(_bias, dst + s + F * 1); - SynetAddBias(_bias, dst + s + F * 2); - SynetAddBias(_bias, dst + s + F * 3); - } - for (; s < partial; s += F) - SynetAddBias(_bias, dst + s); - } - for (; s < spatial; ++s) - dst[s] += bias[c]; - dst += spatial; - } - } - - SIMD_INLINE void SynetAddBiasNchw(const float * bias, size_t channels, size_t spatial, float * dst) - { - if (Aligned(spatial, F) && Aligned(dst)) - SynetAddBiasNchw(bias, channels, spatial, dst); - else - SynetAddBiasNchw(bias, channels, spatial, dst); - } - - template void SynetAddBiasNhwc(const float * bias, size_t channels, size_t spatial, float * dst) - { - if (align) - assert(Aligned(channels, F) && Aligned(bias) && Aligned(dst)); - - size_t aligned = AlignLo(channels, QF); - size_t partial = AlignLo(channels, F); - for (size_t s = 0; s < spatial; ++s) - { - size_t c = 0; - if (partial) - { - for (; c < aligned; c += QF) - { - SynetAddBias(bias + c + F * 0, dst + c + F * 0); - SynetAddBias(bias + c + F * 1, dst + c + F * 1); - SynetAddBias(bias + c + F * 2, dst + c + F * 2); - SynetAddBias(bias + c + F * 3, dst + c + F * 3); - } - for (; c < partial; c += F) - SynetAddBias(bias + c, dst + c); - } - for (; c < channels; ++c) - dst[c] += bias[c]; - dst += channels; - } - } - - SIMD_INLINE void SynetAddBiasNhwc(const float * bias, size_t channels, size_t spatial, float * dst) - { - if (Aligned(bias) && Aligned(channels, F) && Aligned(dst)) - SynetAddBiasNhwc(bias, channels, spatial, dst); - else - 
SynetAddBiasNhwc(bias, channels, spatial, dst); - } - - template void SynetAddBiasNchw8c(const float * bias, size_t channels, size_t spatial, float * dst) - { - if (align) - assert(Aligned(dst)); - - size_t spatial4 = AlignLo(spatial, 4); - for (size_t c = 0; c < channels; c += F) - { - __m256 _bias = Load(bias + c); - size_t s = 0; - for (; s < spatial4; s += 4, dst += 4 * F) - { - SynetAddBias(_bias, dst + 0 * F); - SynetAddBias(_bias, dst + 1 * F); - SynetAddBias(_bias, dst + 2 * F); - SynetAddBias(_bias, dst + 3 * F); - } - for (; s < spatial; ++s, dst += F) - SynetAddBias(_bias, dst); - } - } - - SIMD_INLINE void SynetAddBiasNchw8c(const float * bias, size_t channels, size_t spatial, float * dst) - { - if (Aligned(dst)) - SynetAddBiasNchw8c(bias, channels, spatial, dst); - else - SynetAddBiasNchw8c(bias, channels, spatial, dst); - } - - void SynetAddBias(const float * bias, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format) - { - if (Base::NchwCompatible(channels, spatial, format)) - SynetAddBiasNchw(bias, channels, spatial, dst); - else if (Base::NhwcCompatible(channels, spatial, format)) - SynetAddBiasNhwc(bias, channels, spatial, dst); - else if (format == SimdTensorFormatNchw4c) - Sse::SynetAddBias(bias, channels, spatial, dst, format); - else if (format == SimdTensorFormatNchw8c) - SynetAddBiasNchw8c(bias, channels, spatial, dst); - else - Base::SynetAddBias(bias, channels, spatial, dst, format); - } - - //--------------------------------------------------------------------- - - template __m256 SynetEltwiseLayerForward(__m256 src0, __m256 src1); - - template <> SIMD_INLINE __m256 SynetEltwiseLayerForward(__m256 src0, __m256 src1) - { - return _mm256_mul_ps(src0, src1); - } - - template <> SIMD_INLINE __m256 SynetEltwiseLayerForward(__m256 src0, __m256 src1) - { - return _mm256_max_ps(src0, src1); - } - - template <> SIMD_INLINE __m256 SynetEltwiseLayerForward(__m256 src0, __m256 src1) - { - return _mm256_min_ps(src0, src1); - } - - template SIMD_INLINE void SynetEltwiseLayerForward(const float * src0, const float * src1, float * dst, size_t offset) - { - Store(dst + offset, SynetEltwiseLayerForward(Load(src0 + offset), Load(src1 + offset))); - } - - template void SynetEltwiseLayerForward(float const * const * src, size_t count, size_t size, float * dst) - { - size_t aligned = AlignLo(size, QF); - size_t partial = AlignLo(size, F); - const float * src0 = src[0]; - const float * src1 = src[1]; - size_t j = 0; - if (partial) - { - for (; j < aligned; j += QF) - { - SynetEltwiseLayerForward(src0, src1, dst, j + F * 0); - SynetEltwiseLayerForward(src0, src1, dst, j + F * 1); - SynetEltwiseLayerForward(src0, src1, dst, j + F * 2); - SynetEltwiseLayerForward(src0, src1, dst, j + F * 3); - } - for (; j < partial; j += F) - SynetEltwiseLayerForward(src0, src1, dst, j); - } - for (; j < size; ++j) - dst[j] = Base::SynetEltwiseLayerForward(src0[j], src1[j]); - for (size_t i = 2; i < count; ++i) - { - const float * srci = src[i]; - size_t j = 0; - if (partial) - { - for (; j < aligned; j += QF) - { - SynetEltwiseLayerForward(dst, srci, dst, j + F * 0); - SynetEltwiseLayerForward(dst, srci, dst, j + F * 1); - SynetEltwiseLayerForward(dst, srci, dst, j + F * 2); - SynetEltwiseLayerForward(dst, srci, dst, j + F * 3); - } - for (; j < partial; j += F) - SynetEltwiseLayerForward(dst, srci, dst, j); - } - for (; j < size; ++j) - dst[j] = Base::SynetEltwiseLayerForward(dst[j], srci[j]); - } - } - - template void SynetEltwiseLayerForwardSum(const float * src0, const __m256 & 
weight0, const float * src1, const __m256 & weight1, float * dst, size_t offset) - { - Store(dst + offset, _mm256_add_ps(_mm256_mul_ps(Load(src0 + offset), weight0), _mm256_mul_ps(Load(src1 + offset), weight1))); - } - - template void SynetEltwiseLayerForwardSum(const float * src, const __m256 & weight, float * dst, size_t offset) - { - Store(dst + offset, _mm256_add_ps(_mm256_mul_ps(Load(src + offset), weight), Load(dst + offset))); - } - - template void SynetEltwiseLayerForwardSum(float const * const * src, const float * weight, size_t count, size_t size, float * dst) - { - size_t aligned = AlignLo(size, QF); - size_t partial = AlignLo(size, F); - const float * src0 = src[0]; - const float * src1 = src[1]; - __m256 weight0 = _mm256_set1_ps(weight[0]); - __m256 weight1 = _mm256_set1_ps(weight[1]); - size_t j = 0; - if (partial) - { - for (; j < aligned; j += QF) - { - SynetEltwiseLayerForwardSum(src0, weight0, src1, weight1, dst, j + F * 0); - SynetEltwiseLayerForwardSum(src0, weight0, src1, weight1, dst, j + F * 1); - SynetEltwiseLayerForwardSum(src0, weight0, src1, weight1, dst, j + F * 2); - SynetEltwiseLayerForwardSum(src0, weight0, src1, weight1, dst, j + F * 3); - } - for (; j < partial; j += F) - SynetEltwiseLayerForwardSum(src0, weight0, src1, weight1, dst, j); - } - for (; j < size; ++j) - dst[j] = src0[j] * weight[0] + src1[j] * weight[1]; - for (size_t i = 2; i < count; ++i) - { - const float * srci = src[i]; - __m256 weighti = _mm256_set1_ps(weight[i]); - size_t j = 0; - if (partial) - { - for (; j < aligned; j += QF) - { - SynetEltwiseLayerForwardSum(srci, weighti, dst, j + F * 0); - SynetEltwiseLayerForwardSum(srci, weighti, dst, j + F * 1); - SynetEltwiseLayerForwardSum(srci, weighti, dst, j + F * 2); - SynetEltwiseLayerForwardSum(srci, weighti, dst, j + F * 3); - } - for (; j < partial; j += F) - SynetEltwiseLayerForwardSum(srci, weighti, dst, j); - } - for (; j < size; ++j) - dst[j] += srci[j] * weight[i]; - } - } - - template void SynetEltwiseLayerForward(float const * const * src, const float * weight, size_t count, size_t size, SimdSynetEltwiseOperationType type, float * dst) - { - switch (type) - { - case SimdSynetEltwiseOperationProduct: - SynetEltwiseLayerForward(src, count, size, dst); - break; - case SimdSynetEltwiseOperationSum: - SynetEltwiseLayerForwardSum(src, weight, count, size, dst); - break; - case SimdSynetEltwiseOperationMax: - SynetEltwiseLayerForward(src, count, size, dst); - break; - case SimdSynetEltwiseOperationMin: - SynetEltwiseLayerForward(src, count, size, dst); - break; - default: - assert(0); - } - } - - void SynetEltwiseLayerForward(float const * const * src, const float * weight, size_t count, size_t size, SimdSynetEltwiseOperationType type, float * dst) - { - assert(count >= 2); - bool aligned = Aligned(dst) && Aligned(src[0]) && Aligned(src[1]); - for (size_t i = 2; i < count; ++i) - aligned = aligned && Aligned(src[i]); - if (aligned) - SynetEltwiseLayerForward(src, weight, count, size, type, dst); - else - SynetEltwiseLayerForward(src, weight, count, size, type, dst); - } - - //--------------------------------------------------------------------- - - SIMD_INLINE __m256 Tail(size_t tail) - { - const int32_t mask[DF] = { 0, 0, 0, 0, 0, 0, 0, 0 , -1, -1, -1, -1, -1, -1, -1, -1 }; - return _mm256_loadu_ps((float*)(mask + tail)); - } - - void SynetInnerProductLayerForward1(const float * S0, const float * W, const float * B, size_t K, float * D) - { - size_t K8 = K & (~7); - size_t K32 = K & (~31); - const float * W0 = W + 0 * K; - __m256 d00, 
d01, d02, d03; - __m256 s0, s1, s2, s3, w0, w1, w2, w3; - size_t k = 0; - d00 = _mm256_setzero_ps(); - if (K32) - { - d01 = _mm256_setzero_ps(); - d02 = _mm256_setzero_ps(); - d03 = _mm256_setzero_ps(); - for (; k < K32; k += 32) - { - s0 = _mm256_loadu_ps(S0 + k + 0 * F); - s1 = _mm256_loadu_ps(S0 + k + 1 * F); - w0 = _mm256_loadu_ps(W0 + k + 0 * F); - w1 = _mm256_loadu_ps(W0 + k + 1 * F); - d00 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d00); - d01 = _mm256_add_ps(_mm256_mul_ps(s1, w1), d01); - s2 = _mm256_loadu_ps(S0 + k + 2 * F); - s3 = _mm256_loadu_ps(S0 + k + 3 * F); - w2 = _mm256_loadu_ps(W0 + k + 2 * F); - w3 = _mm256_loadu_ps(W0 + k + 3 * F); - d02 = _mm256_add_ps(_mm256_mul_ps(s2, w2), d02); - d03 = _mm256_add_ps(_mm256_mul_ps(s3, w3), d03); - } - d00 = _mm256_add_ps(_mm256_add_ps(d00, d01), _mm256_add_ps(d02, d03)); - } - for (; k < K8; k += 8) - { - s0 = _mm256_loadu_ps(S0 + k); - w0 = _mm256_loadu_ps(W0 + k); - d00 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d00); - } - if (K8 < K) - { - size_t k = K - 8; - __m256 tail = Tail(K - K8); - s0 = _mm256_and_ps(tail, _mm256_loadu_ps(S0 + k)); - w0 = _mm256_loadu_ps(W0 + k); - d00 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d00); - } - D[0] = Avx::ExtractSum(d00) + B[0]; - } - - void SynetInnerProductLayerForward4(const float * S0, const float * W, const float * B, size_t K, float * D) - { - size_t K8 = K & (~7); - size_t K16 = K & (~15); - const float * W0 = W + 0 * K; - const float * W1 = W + 1 * K; - const float * W2 = W + 2 * K; - const float * W3 = W + 3 * K; - __m256 d00, d01, d10, d11, d20, d21, d30, d31; - __m256 s0, s1, w0, w1; - size_t k = 0; - d00 = _mm256_setzero_ps(); - d10 = _mm256_setzero_ps(); - d20 = _mm256_setzero_ps(); - d30 = _mm256_setzero_ps(); - if (K16) - { - d01 = _mm256_setzero_ps(); - d11 = _mm256_setzero_ps(); - d21 = _mm256_setzero_ps(); - d31 = _mm256_setzero_ps(); - for (; k < K16; k += 16) - { - s0 = _mm256_loadu_ps(S0 + k + 0 * F); - s1 = _mm256_loadu_ps(S0 + k + 1 * F); - w0 = _mm256_loadu_ps(W0 + k + 0 * F); - w1 = _mm256_loadu_ps(W0 + k + 1 * F); - d00 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d00); - d01 = _mm256_add_ps(_mm256_mul_ps(s1, w1), d01); - w0 = _mm256_loadu_ps(W1 + k + 0 * F); - w1 = _mm256_loadu_ps(W1 + k + 1 * F); - d10 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d10); - d11 = _mm256_add_ps(_mm256_mul_ps(s1, w1), d11); - w0 = _mm256_loadu_ps(W2 + k + 0 * F); - w1 = _mm256_loadu_ps(W2 + k + 1 * F); - d20 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d20); - d21 = _mm256_add_ps(_mm256_mul_ps(s1, w1), d21); - w0 = _mm256_loadu_ps(W3 + k + 0 * F); - w1 = _mm256_loadu_ps(W3 + k + 1 * F); - d30 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d30); - d31 = _mm256_add_ps(_mm256_mul_ps(s1, w1), d31); - } - d00 = _mm256_add_ps(d00, d01); - d10 = _mm256_add_ps(d10, d11); - d20 = _mm256_add_ps(d20, d21); - d30 = _mm256_add_ps(d30, d31); - } - for (; k < K8; k += 8) - { - s0 = _mm256_loadu_ps(S0 + k + 0 * F); - w0 = _mm256_loadu_ps(W0 + k + 0 * F); - d00 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d00); - w0 = _mm256_loadu_ps(W1 + k + 0 * F); - d10 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d10); - w0 = _mm256_loadu_ps(W2 + k + 0 * F); - d20 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d20); - w0 = _mm256_loadu_ps(W3 + k + 0 * F); - d30 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d30); - } - if (K8 < K) - { - size_t k = K - 8; - __m256 tail = Tail(K - K8); - s0 = _mm256_and_ps(tail, _mm256_loadu_ps(S0 + k)); - w0 = _mm256_loadu_ps(W0 + k + 0 * F); - d00 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d00); - w0 = _mm256_loadu_ps(W1 + k + 0 * F); - d10 = 
_mm256_add_ps(_mm256_mul_ps(s0, w0), d10); - w0 = _mm256_loadu_ps(W2 + k + 0 * F); - d20 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d20); - w0 = _mm256_loadu_ps(W3 + k + 0 * F); - d30 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d30); - } - _mm_storeu_ps(D, _mm_add_ps(Extract4Sums(d00, d10, d20, d30), _mm_loadu_ps(B))); - } - - void SynetInnerProductLayerForward(const float * src, const float * weight, const float * bias, size_t count, size_t size, float * dst) - { - float _bias[4] = { 0, 0, 0, 0 }; - size_t count4 = AlignLo(count, 4); - size_t i = 0; - for (; i < count4; i += 4) - SynetInnerProductLayerForward4(src, weight + i * size, (bias ? bias + i : _bias), size, dst + i); - for (; i < count; ++i) - SynetInnerProductLayerForward1(src, weight + i * size, (bias ? bias + i : _bias), size, dst + i); - } - - //--------------------------------------------------------------------- - - template SIMD_INLINE void SynetScaleLayerForward(const float * src, const float * scale, const float * bias, float * dst, size_t offset) - { - Store(dst + offset, _mm256_add_ps(_mm256_mul_ps(Load(src + offset), Load(scale + offset)), Load(bias + offset))); - } - - template SIMD_INLINE void SynetScaleLayerForward(const float * src, const float * scale, float * dst, size_t offset) - { - Store(dst + offset, _mm256_mul_ps(Load(src + offset), Load(scale + offset))); - } - - template SIMD_INLINE void SynetScaleLayerForward(const float * src, const __m256 & scale, const __m256 & bias, float * dst, size_t offset) - { - Store(dst + offset, _mm256_add_ps(_mm256_mul_ps(Load(src + offset), scale), bias)); - } - - template SIMD_INLINE void SynetScaleLayerForward(const float * src, const __m256 & scale, float * dst, size_t offset) - { - Store(dst + offset, _mm256_mul_ps(Load(src + offset), scale)); - } - - template void SynetScaleLayerForwardNchw(const float * src, const float * scale, const float * bias, size_t channels, size_t spatial, float * dst) - { - if (align) - assert(Aligned(src) && Aligned(spatial, F) && Aligned(dst)); - - size_t aligned = AlignLo(spatial, QF); - size_t partial = AlignLo(spatial, F); - if (bias) - { - for (size_t c = 0; c < channels; ++c) - { - size_t s = 0; - if (partial) - { - __m256 _scale = _mm256_set1_ps(scale[c]); - __m256 _bias = _mm256_set1_ps(bias[c]); - for (; s < aligned; s += QF) - { - SynetScaleLayerForward(src, _scale, _bias, dst, s + F * 0); - SynetScaleLayerForward(src, _scale, _bias, dst, s + F * 1); - SynetScaleLayerForward(src, _scale, _bias, dst, s + F * 2); - SynetScaleLayerForward(src, _scale, _bias, dst, s + F * 3); - } - for (; s < partial; s += F) - SynetScaleLayerForward(src, _scale, _bias, dst, s); - } - for (; s < spatial; ++s) - dst[s] = src[s] * scale[c] + bias[c]; - src += spatial; - dst += spatial; - } - } - else - { - for (size_t c = 0; c < channels; ++c) - { - size_t s = 0; - if (partial) - { - __m256 _scale = _mm256_set1_ps(scale[c]); - for (; s < aligned; s += QF) - { - SynetScaleLayerForward(src, _scale, dst, s + F * 0); - SynetScaleLayerForward(src, _scale, dst, s + F * 1); - SynetScaleLayerForward(src, _scale, dst, s + F * 2); - SynetScaleLayerForward(src, _scale, dst, s + F * 3); - } - for (; s < partial; s += F) - SynetScaleLayerForward(src, _scale, dst, s); - } - for (; s < spatial; ++s) - dst[s] = src[s] * scale[c]; - src += spatial; - dst += spatial; - } - } - } - - SIMD_INLINE void SynetScaleLayerForwardNchw(const float * src, const float * scale, const float * bias, size_t channels, size_t spatial, float * dst) - { - if (Aligned(src) && Aligned(spatial, F) && 
Aligned(dst)) - SynetScaleLayerForwardNchw(src, scale, bias, channels, spatial, dst); - else - SynetScaleLayerForwardNchw(src, scale, bias, channels, spatial, dst); - } - - template void SynetScaleLayerForwardNhwc(const float * src, const float * scale, const float * bias, size_t channels, size_t spatial, float * dst) - { - if (align) - assert(Aligned(src) && Aligned(scale) && Aligned(bias) && Aligned(channels, F) && Aligned(dst)); - - size_t aligned = AlignLo(channels, QF); - size_t partial = AlignLo(channels, F); - if (bias) - { - for (size_t s = 0; s < spatial; ++s) - { - size_t c = 0; - if (partial) - { - for (; c < aligned; c += QF) - { - SynetScaleLayerForward(src, scale, bias, dst, c + F * 0); - SynetScaleLayerForward(src, scale, bias, dst, c + F * 1); - SynetScaleLayerForward(src, scale, bias, dst, c + F * 2); - SynetScaleLayerForward(src, scale, bias, dst, c + F * 3); - } - for (; c < partial; c += F) - SynetScaleLayerForward(src, scale, bias, dst, c); - } - for (; c < channels; ++c) - dst[c] = src[c] * scale[c] + bias[c]; - src += channels; - dst += channels; - } - } - else - { - for (size_t s = 0; s < spatial; ++s) - { - size_t c = 0; - if (partial) - { - for (; c < aligned; c += QF) - { - SynetScaleLayerForward(src, scale, dst, c + F * 0); - SynetScaleLayerForward(src, scale, dst, c + F * 1); - SynetScaleLayerForward(src, scale, dst, c + F * 2); - SynetScaleLayerForward(src, scale, dst, c + F * 3); - } - for (; c < partial; c += F) - SynetScaleLayerForward(src, scale, dst, c); - } - for (; c < channels; ++c) - dst[c] = src[c] * scale[c]; - src += channels; - dst += channels; - } - } - } - - template void SynetScaleLayerForwardNhwc3(const float * src, const float * scale, const float * bias, size_t spatial, float * dst) - { - if (align) - assert(Aligned(src) && Aligned(dst)); - - size_t spatial3 = spatial * 3; - size_t spatialF3 = AlignLo(spatial, F) * 3; - if (bias) - { - size_t s = 0; - if (spatialF3) - { - float _scale[F * 3], _bias[F * 3]; - for (size_t i = 0; i < F; ++i) - for (size_t c = 0; c < 3; ++c) - _scale[i * 3 + c] = scale[c], _bias[i * 3 + c] = bias[c]; - __m256 _scale0 = Load(_scale + 0 * F); - __m256 _scale1 = Load(_scale + 1 * F); - __m256 _scale2 = Load(_scale + 2 * F); - __m256 _bias0 = Load(_bias + 0 * F); - __m256 _bias1 = Load(_bias + 1 * F); - __m256 _bias2 = Load(_bias + 2 * F); - for (; s < spatialF3; s += F * 3) - { - SynetScaleLayerForward(src, _scale0, _bias0, dst, s + F * 0); - SynetScaleLayerForward(src, _scale1, _bias1, dst, s + F * 1); - SynetScaleLayerForward(src, _scale2, _bias2, dst, s + F * 2); - } - } - for (; s < spatial3; s += 3) - { - dst[s + 0] = src[s + 0] * scale[0] + bias[0]; - dst[s + 1] = src[s + 1] * scale[1] + bias[1]; - dst[s + 2] = src[s + 2] * scale[2] + bias[2]; - } - } - else - { - size_t s = 0; - if (spatialF3) - { - float _scale[F * 3]; - for (size_t i = 0; i < F; ++i) - for (size_t c = 0; c < 3; ++c) - _scale[i * 3 + c] = scale[c]; - __m256 _scale0 = Load(_scale + 0 * F); - __m256 _scale1 = Load(_scale + 1 * F); - __m256 _scale2 = Load(_scale + 2 * F); - for (; s < spatialF3; s += F * 3) - { - SynetScaleLayerForward(src, _scale0, dst, s + F * 0); - SynetScaleLayerForward(src, _scale1, dst, s + F * 1); - SynetScaleLayerForward(src, _scale2, dst, s + F * 2); - } - } - for (; s < spatial3; s += 3) - { - dst[s + 0] = src[s + 0] * scale[0]; - dst[s + 1] = src[s + 1] * scale[1]; - dst[s + 2] = src[s + 2] * scale[2]; - } - } - } - - SIMD_INLINE void SynetScaleLayerForwardNhwc(const float * src, const float * scale, const float * 
bias, size_t channels, size_t spatial, float * dst) - { - if (channels == 3) - { - if (Aligned(src) && Aligned(dst)) - SynetScaleLayerForwardNhwc3(src, scale, bias, spatial, dst); - else - SynetScaleLayerForwardNhwc3(src, scale, bias, spatial, dst); - } - else - { - if (Aligned(src) && Aligned(scale) && Aligned(bias) && Aligned(channels, F) && Aligned(dst)) - SynetScaleLayerForwardNhwc(src, scale, bias, channels, spatial, dst); - else - SynetScaleLayerForwardNhwc(src, scale, bias, channels, spatial, dst); - } - } - - template void SynetScaleLayerForwardNchw8c(const float * src, const float * scale, const float * bias, size_t channels, size_t spatial, float * dst) - { - if (align) - assert(Aligned(src) && Aligned(dst)); - - size_t spatialF = spatial * F; - size_t spatial4F = AlignLo(spatial, 4)*F; - if (bias) - { - for (size_t c = 0; c < channels; c += F) - { - __m256 _scale = Load(scale + c); - __m256 _bias = Load(bias + c); - size_t s = 0; - for (; s < spatial4F; s += 4 * F) - { - SynetScaleLayerForward(src, _scale, _bias, dst, s + F * 0); - SynetScaleLayerForward(src, _scale, _bias, dst, s + F * 1); - SynetScaleLayerForward(src, _scale, _bias, dst, s + F * 2); - SynetScaleLayerForward(src, _scale, _bias, dst, s + F * 3); - } - for (; s < spatialF; s += F) - SynetScaleLayerForward(src, _scale, _bias, dst, s); - src += spatialF; - dst += spatialF; - } - } - else - { - for (size_t c = 0; c < channels; c += F) - { - __m256 _scale = Load(scale + c); - size_t s = 0; - for (; s < spatial4F; s += 4 * F) - { - SynetScaleLayerForward(src, _scale, dst, s + F * 0); - SynetScaleLayerForward(src, _scale, dst, s + F * 1); - SynetScaleLayerForward(src, _scale, dst, s + F * 2); - SynetScaleLayerForward(src, _scale, dst, s + F * 3); - } - for (; s < spatialF; s += F) - SynetScaleLayerForward(src, _scale, dst, s); - src += spatialF; - dst += spatialF; - } - } - } - - SIMD_INLINE void SynetScaleLayerForwardNchw8c(const float * src, const float * scale, const float * bias, size_t channels, size_t spatial, float * dst) - { - if (Aligned(src) && Aligned(dst)) - SynetScaleLayerForwardNchw8c(src, scale, bias, channels, spatial, dst); - else - SynetScaleLayerForwardNchw8c(src, scale, bias, channels, spatial, dst); - } - - void SynetScaleLayerForward(const float* src, const float* scale, const float* bias, size_t channels, size_t height, size_t width, float* dst, SimdTensorFormatType format, SimdSynetCompatibilityType compatibility) - { - size_t spatial = height * width; - if (Base::NchwCompatible(channels, spatial, format)) - SynetScaleLayerForwardNchw(src, scale, bias, channels, spatial, dst); - else if (Base::NhwcCompatible(channels, spatial, format)) - SynetScaleLayerForwardNhwc(src, scale, bias, channels, spatial, dst); - else if (format == SimdTensorFormatNchw4c) - Sse::SynetScaleLayerForward(src, scale, bias, channels, height, width, dst, format, compatibility); - else if (format == SimdTensorFormatNchw8c) - SynetScaleLayerForwardNchw8c(src, scale, bias, channels, spatial, dst); - else - Base::SynetScaleLayerForward(src, scale, bias, channels, height, width, dst, format, compatibility); - } - - //--------------------------------------------------------------------- - - - void SynetShuffleLayerForward(const float* src0, const float* src1, size_t channels0, size_t channels1, size_t spatial, float* dst0, float* dst1, SimdTensorFormatType format, int type) - { - if (format == SimdTensorFormatNchw) - Base::SynetShuffleLayerForward(src0, src1, channels0, channels1, spatial, dst0, dst1, format, type); - else if 
(format == SimdTensorFormatNhwc) - { - size_t channels = (channels0 + channels1) / 2; - size_t channels0F = AlignLo(channels0, F); - size_t channels0DF = AlignLo(channels0, DF); - size_t channels1F = AlignLo(channels1, F); - size_t channels1DF = AlignLo(channels1, DF); - if (type == 0) - { - for (size_t s = 0; s < spatial; ++s) - { - size_t cd = 0, cs0 = 0, cs1 = 0; - for (; cs0 < channels0DF; cs0 += DF, cd += F) - { - __m256 s0 = _mm256_loadu_ps(src0 + cs0 + 0); - __m256 s1 = _mm256_loadu_ps(src0 + cs0 + F); - __m256 p0 = _mm256_permute2f128_ps(s0, s1, 0x20); - __m256 p1 = _mm256_permute2f128_ps(s0, s1, 0x31); - _mm256_storeu_ps(dst0 + cd, _mm256_shuffle_ps(p0, p1, 0x88)); - _mm256_storeu_ps(dst1 + cd, _mm256_shuffle_ps(p0, p1, 0xDD)); - } - for (; cs0 < channels0F; cs0 += F, cd += HF) - { - __m128 s0 = _mm_loadu_ps(src0 + cs0 + 00); - __m128 s1 = _mm_loadu_ps(src0 + cs0 + HF); - _mm_storeu_ps(dst0 + cd, _mm_shuffle_ps(s0, s1, 0x88)); - _mm_storeu_ps(dst1 + cd, _mm_shuffle_ps(s0, s1, 0xDD)); - } - for (; cs0 < channels0; cs0 += 2, cd += 1) - { - dst0[cd] = src0[cs0 + 0]; - dst1[cd] = src0[cs0 + 1]; - } - for (; cs1 < channels1DF; cs1 += DF, cd += F) - { - __m256 s0 = _mm256_loadu_ps(src1 + cs1 + 0); - __m256 s1 = _mm256_loadu_ps(src1 + cs1 + F); - __m256 p0 = _mm256_permute2f128_ps(s0, s1, 0x20); - __m256 p1 = _mm256_permute2f128_ps(s0, s1, 0x31); - _mm256_storeu_ps(dst0 + cd, _mm256_shuffle_ps(p0, p1, 0x88)); - _mm256_storeu_ps(dst1 + cd, _mm256_shuffle_ps(p0, p1, 0xDD)); - } - for (; cs1 < channels1F; cs1 += F, cd += HF) - { - __m128 s0 = _mm_loadu_ps(src1 + cs1 + 00); - __m128 s1 = _mm_loadu_ps(src1 + cs1 + HF); - _mm_storeu_ps(dst0 + cd, _mm_shuffle_ps(s0, s1, 0x88)); - _mm_storeu_ps(dst1 + cd, _mm_shuffle_ps(s0, s1, 0xDD)); - } - for (; cs1 < channels1; cs1 += 2, cd += 1) - { - dst0[cd] = src1[cs1 + 0]; - dst1[cd] = src1[cs1 + 1]; - } - src0 += channels0; - src1 += channels1; - dst0 += channels; - dst1 += channels; - } - } - else if (type == 1) - { - for (size_t s = 0; s < spatial; ++s) - { - size_t cs = 0, cd0 = 0, cd1 = 0; - for (; cd0 < channels0DF; cd0 += DF, cs += F) - { - __m256 s0 = _mm256_loadu_ps(src0 + cs); - __m256 s1 = _mm256_loadu_ps(src1 + cs); - __m256 u0 = _mm256_unpacklo_ps(s0, s1); - __m256 u1 = _mm256_unpackhi_ps(s0, s1); - _mm256_storeu_ps(dst0 + cd0 + 0, _mm256_permute2f128_ps(u0, u1, 0x20)); - _mm256_storeu_ps(dst0 + cd0 + F, _mm256_permute2f128_ps(u0, u1, 0x31)); - } - for (; cd0 < channels0F; cd0 += F, cs += HF) - { - __m128 s0 = _mm_loadu_ps(src0 + cs); - __m128 s1 = _mm_loadu_ps(src1 + cs); - _mm_storeu_ps(dst0 + cd0 + 00, _mm_unpacklo_ps(s0, s1)); - _mm_storeu_ps(dst0 + cd0 + HF, _mm_unpackhi_ps(s0, s1)); - } - for (; cd0 < channels0; cd0 += 2, cs += 1) - { - dst0[cd0 + 0] = src0[cs]; - dst0[cd0 + 1] = src1[cs]; - } - for (; cd1 < channels1DF; cd1 += DF, cs += F) - { - __m256 s0 = _mm256_loadu_ps(src0 + cs); - __m256 s1 = _mm256_loadu_ps(src1 + cs); - __m256 u0 = _mm256_unpacklo_ps(s0, s1); - __m256 u1 = _mm256_unpackhi_ps(s0, s1); - _mm256_storeu_ps(dst1 + cd1 + 0, _mm256_permute2f128_ps(u0, u1, 0x20)); - _mm256_storeu_ps(dst1 + cd1 + F, _mm256_permute2f128_ps(u0, u1, 0x31)); - } - for (; cd1 < channels1F; cd1 += F, cs += HF) - { - __m128 s0 = _mm_loadu_ps(src0 + cs); - __m128 s1 = _mm_loadu_ps(src1 + cs); - _mm_storeu_ps(dst1 + cd1 + 00, _mm_unpacklo_ps(s0, s1)); - _mm_storeu_ps(dst1 + cd1 + HF, _mm_unpackhi_ps(s0, s1)); - } - for (; cd1 < channels1; cd1 += 2, cs += 1) - { - dst1[cd1 + 0] = src0[cs]; - dst1[cd1 + 1] = src1[cs]; - } - src0 += channels; - 
src1 += channels; - dst0 += channels0; - dst1 += channels1; - } - } - else - assert(0); - } - else - assert(0); - } - } -#endif// SIMD_AVX_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx1SynetActivation.cpp b/src/3rd/Simd/Simd/SimdAvx1SynetActivation.cpp deleted file mode 100644 index c81f4b1f..00000000 --- a/src/3rd/Simd/Simd/SimdAvx1SynetActivation.cpp +++ /dev/null @@ -1,282 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2019 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdBase.h" -#include "Simd/SimdSse1.h" -#include "Simd/SimdSynet.h" - -namespace Simd -{ -#ifdef SIMD_AVX_ENABLE - namespace Avx - { - template <bool align> SIMD_INLINE void SynetHswish32f(const float * src, __m256 shift, __m256 scale, float * dst, size_t offset) - { - __m256 value = Load<align>(src + offset); - Store<align>(dst + offset, _mm256_mul_ps(_mm256_mul_ps(_mm256_max_ps(_mm256_add_ps(_mm256_min_ps(value, shift), shift), _mm256_setzero_ps()), scale), value)); - } - - template <bool align> void SynetHswish32f(const float * src, size_t size, const float * shift, const float * scale, float * dst) - { - __m256 _shift = _mm256_set1_ps(shift[0]); - __m256 _scale = _mm256_set1_ps(scale[0]); - size_t sizeF = AlignLo(size, F); - size_t sizeQF = AlignLo(size, QF); - size_t i = 0; - for (; i < sizeQF; i += QF) - { - SynetHswish32f<align>(src, _shift, _scale, dst, i + 0 * F); - SynetHswish32f<align>(src, _shift, _scale, dst, i + 1 * F); - SynetHswish32f<align>(src, _shift, _scale, dst, i + 2 * F); - SynetHswish32f<align>(src, _shift, _scale, dst, i + 3 * F); - } - for (; i < sizeF; i += F) - SynetHswish32f<align>(src, _shift, _scale, dst, i); - for (; i < size; ++i) - dst[i] = Base::SynetHswish32f(src[i], shift[0], scale[0]); - } - - void SynetHswish32f(const float * src, size_t size, const float * shift, const float * scale, float * dst) - { - if (Aligned(src) && Aligned(dst)) - SynetHswish32f<true>(src, size, shift, scale, dst); - else - SynetHswish32f<false>(src, size, shift, scale, dst); - } - //--------------------------------------------------------------------- - - template <bool align> SIMD_INLINE void SynetPreluLayerForward(const float* src, const float* slope, float* dst, size_t offset) - { - __m256 _src = Load<align>(src + offset); - __m256 _slope = Load<align>(slope + offset); - __m256 pos = _mm256_max_ps(_mm256_setzero_ps(), _src); - __m256 neg = _mm256_min_ps(_mm256_setzero_ps(), _src); - Store<align>(dst + offset, _mm256_add_ps(pos, _mm256_mul_ps(_slope, neg))); - } - - template <bool align> SIMD_INLINE void SynetPreluLayerForward(const float* src, __m256 slope, float* dst, size_t offset) - { - __m256 _src = Load<align>(src + offset); - __m256 pos = _mm256_max_ps(_mm256_setzero_ps(), _src); - __m256 neg = _mm256_min_ps(_mm256_setzero_ps(), _src); - Store<align>(dst + offset, _mm256_add_ps(pos, _mm256_mul_ps(slope, neg))); - } - - template <bool align> void SynetPreluLayerForwardNchw(const float* src, const float* slope, size_t channels, size_t spatial, float* dst) - { - if (align) - assert(Aligned(src) && Aligned(spatial, F) && Aligned(dst)); - - size_t aligned = AlignLo(spatial, QF); - size_t partial = AlignLo(spatial, F); - for (size_t c = 0; c < channels; ++c) - { - size_t s = 0; - if (partial) - { - __m256 _slope = _mm256_set1_ps(slope[c]); - for (; s < aligned; s += QF) - { - SynetPreluLayerForward<align>(src, _slope, dst, s + F * 0); - SynetPreluLayerForward<align>(src, _slope, dst, s + F * 1); - SynetPreluLayerForward<align>(src, _slope, dst, s + F * 2); - SynetPreluLayerForward<align>(src, _slope, dst, s + F * 3); - } - for (; s < partial; s += F) - SynetPreluLayerForward<align>(src, _slope, dst, s); - } - for (; s < spatial; ++s) - dst[s] = Base::SynetRelu32f(src[s], slope[c]); - src += spatial; - dst += spatial; - } - } - - SIMD_INLINE void SynetPreluLayerForwardNchw(const float* src, const float* slope, size_t channels, size_t spatial, float* dst) - { - if (Aligned(src) && Aligned(spatial, F) && Aligned(dst)) - SynetPreluLayerForwardNchw<true>(src, slope, channels, spatial, dst); - else - SynetPreluLayerForwardNchw<false>(src, slope, channels, spatial, dst); - } - - template <bool align> void SynetPreluLayerForwardNhwc(const float* src, const float* slope, size_t channels, size_t spatial, float* dst) - { - if (align) - assert(Aligned(src) && Aligned(slope) && Aligned(channels, F) && Aligned(dst)); - - size_t aligned = AlignLo(channels, QF); - size_t partial = AlignLo(channels, F); - for (size_t s = 0; s < spatial; ++s) - { - size_t c = 0; - if (partial) - { - for (; c < aligned; c += QF) - { - SynetPreluLayerForward<align>(src, slope, dst, c + F * 0); - SynetPreluLayerForward<align>(src, slope, dst, c + F * 1); - SynetPreluLayerForward<align>(src, slope, dst, c + F * 2); - SynetPreluLayerForward<align>(src, slope, dst, c + F * 3); - } - for (; c < partial; c += F) - SynetPreluLayerForward<align>(src, slope, dst, c); - } - for (; c < channels; ++c) - dst[c] = Base::SynetRelu32f(src[c], slope[c]); - src += channels; - dst += channels; - } - } - - SIMD_INLINE void SynetPreluLayerForwardNhwc(const float* src, const float* slope, size_t channels, size_t spatial, float* dst) - { - if (Aligned(src) && Aligned(slope) && Aligned(channels, F) && Aligned(dst)) - SynetPreluLayerForwardNhwc<true>(src, slope, channels, spatial, dst); - else - SynetPreluLayerForwardNhwc<false>(src, slope, channels, spatial, dst); - } - - template <bool align> void SynetPreluLayerForwardNchw8c(const float* src, const float* slope, size_t channels, size_t spatial, float* dst) - { - if (align) - assert(Aligned(src) && Aligned(dst)); - - size_t spatialF = spatial * F; - size_t spatial4F = AlignLo(spatial, 4) * F; - for (size_t c = 0; c < channels; c += F) - { - __m256 _slope = Load<false>(slope + c); - size_t s = 0; - for (; s < spatial4F; s += 4 * F) - { - SynetPreluLayerForward<align>(src, _slope, dst, s + F * 0); - SynetPreluLayerForward<align>(src, _slope, dst, s + F * 1); - SynetPreluLayerForward<align>(src, _slope, dst, s + F * 2); - SynetPreluLayerForward<align>(src, _slope, dst, s + F * 3); - } - for (; s < spatialF; s += F) - SynetPreluLayerForward<align>(src, _slope, dst, s); - src += spatialF; - dst += spatialF; - } - } - - SIMD_INLINE void SynetPreluLayerForwardNchw8c(const float* src, const float* slope, size_t channels, size_t spatial, float* dst) - { - if (Aligned(src) && Aligned(dst)) - SynetPreluLayerForwardNchw8c<true>(src, slope, channels, spatial, dst); - else - SynetPreluLayerForwardNchw8c<false>(src, slope, channels, spatial, dst); - } - - void SynetPreluLayerForward(const float* src, const float* slope, size_t channels, size_t spatial, float* dst, SimdTensorFormatType format) - { - if (Base::NchwCompatible(channels, spatial, format)) - SynetPreluLayerForwardNchw(src, slope, channels, spatial, dst); - else if (Base::NhwcCompatible(channels, spatial, format)) - SynetPreluLayerForwardNhwc(src, slope, channels, spatial, dst); - else if (format == SimdTensorFormatNchw4c) - Sse::SynetPreluLayerForward(src, slope, channels, spatial, dst, format); - else if (format == SimdTensorFormatNchw8c) - SynetPreluLayerForwardNchw8c(src, slope, channels, spatial, dst); - else - Base::SynetPreluLayerForward(src, slope, channels, spatial, dst, format); - } - - //--------------------------------------------------------------------- - - template <bool align> SIMD_INLINE void SynetRelu32f(const float* src, __m256 slope, float* dst, size_t offset) - { - Store<align>(dst + offset, SynetRelu32f(Load<align>(src + offset), slope)); - } - - template <bool align> void SynetRelu32f(const float* src, size_t size, const float* slope, float* dst) - { - if (align) - assert(Aligned(src) && Aligned(dst)); - - __m256 _slope = _mm256_set1_ps(slope[0]); - size_t sizeF = AlignLo(size, F); - size_t sizeQF = AlignLo(size, QF); - size_t i = 0; - for (; i < sizeQF; i += QF) - { - SynetRelu32f<align>(src, _slope, dst, i + 0 * F); - SynetRelu32f<align>(src, _slope, dst, i + 1 * F); - SynetRelu32f<align>(src, _slope, dst, i + 2 * F); - SynetRelu32f<align>(src, _slope, dst, i + 3 * F); - } - for (; i < sizeF; i += F) - SynetRelu32f<align>(src, _slope, dst, i); - for (; i < size; ++i) - dst[i] = Base::SynetRelu32f(src[i], slope[0]); - } - - void SynetRelu32f(const float* src, size_t size, const float* slope, float* dst) - { - if (Aligned(src) && Aligned(dst)) - SynetRelu32f<true>(src, size, slope, dst); - else - SynetRelu32f<false>(src, size, slope, dst); - } - - //--------------------------------------------------------------------- - - template <bool align> void SynetRestrictRange32f(const float * src, size_t size, const float * lower, const float * upper, float * dst) - { - assert(lower[0] <= upper[0]); - if (align) - assert(Aligned(src) && Aligned(dst)); - float min = *lower; - float max = *upper; - __m256 _min = _mm256_set1_ps(min); - __m256 _max = _mm256_set1_ps(max); - size_t sizeF = Simd::AlignLo(size, F); - size_t sizeQF = Simd::AlignLo(size, QF); - size_t i = 0; - for (; i < sizeQF; i += QF) - { - Store<align>(dst + i + 0 * F, _mm256_min_ps(_mm256_max_ps(_min, Load<align>(src + i + 0 * F)), _max)); - Store<align>(dst + i + 1 * F, _mm256_min_ps(_mm256_max_ps(_min, Load<align>(src + i + 1 * F)), _max)); - Store<align>(dst + i + 2 * F, _mm256_min_ps(_mm256_max_ps(_min, Load<align>(src + i + 2 * F)), _max)); - Store<align>(dst + i + 3 * F, _mm256_min_ps(_mm256_max_ps(_min, Load<align>(src + i + 3 * F)), _max)); - } - for (; i < sizeF; i += F) - Store<align>(dst + i, _mm256_min_ps(_mm256_max_ps(_min, Load<align>(src + i)), _max)); - for (; i < size; ++i) - dst[i] = Simd::RestrictRange(src[i], min, max); - } - - void SynetRestrictRange32f(const float * src, size_t size, const float * lower, const float * upper, float * dst) - { - if (Aligned(src) && Aligned(dst)) - SynetRestrictRange32f<true>(src, size, lower, upper, dst); - else - SynetRestrictRange32f<false>(src, size, lower, upper, dst); - } - } -#endif// SIMD_AVX_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx1SynetConversion.cpp
b/src/3rd/Simd/Simd/SimdAvx1SynetConversion.cpp deleted file mode 100644 index 52b4ab4f..00000000 --- a/src/3rd/Simd/Simd/SimdAvx1SynetConversion.cpp +++ /dev/null @@ -1,567 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdTranspose.h" -#include "Simd/SimdBase.h" -#include "Simd/SimdSse1.h" - -namespace Simd -{ -#ifdef SIMD_AVX_ENABLE - namespace Avx - { - template void SynetReorderImage_Chw_Hwc(size_t channels, size_t spatial, const float * src, float * dst) - { - size_t spatial4 = AlignLo(spatial, 4); - size_t channels8 = AlignLo(channels, 8); - size_t spatial8 = AlignLo(spatial, 8); - size_t s = 0; - for (; s < spatial8; s += 8, src += 8, dst += 8 * channels) - { - size_t c = 0; - const float * ps = src; - float * pd = dst; - for (; c < channels8; c += 8, ps += 8 * spatial, pd += 8) - Transpose8x8(ps, spatial, pd, channels); - for (; c < channels; ++c, ps += spatial, pd += 1) - { - pd[0 * channels] = ps[0]; - pd[1 * channels] = ps[1]; - pd[2 * channels] = ps[2]; - pd[3 * channels] = ps[3]; - pd[4 * channels] = ps[4]; - pd[5 * channels] = ps[5]; - pd[6 * channels] = ps[6]; - pd[7 * channels] = ps[7]; - } - } - for (; s < spatial4; s += 4, src += 4, dst += 4 * channels) - { - size_t c = 0; - const float * ps = src; - float * pd = dst; - for (; c < channels8; c += 8, ps += 8 * spatial, pd += 8) - Transpose4x8(ps, spatial, pd, channels); - for (; c < channels; ++c, ps += spatial, pd += 1) - { - pd[0 * channels] = ps[0]; - pd[1 * channels] = ps[1]; - pd[2 * channels] = ps[2]; - pd[3 * channels] = ps[3]; - } - } - for (; s < spatial; ++s, src += 1, dst += channels) - for (size_t c = 0; c < channels; ++c) - dst[c] = src[c*spatial]; - } - - template void SynetReorderImage_Chw_Chw8c(size_t channels, size_t spatial, const float * src, float * dst) - { - size_t channels8 = AlignLo(channels, 8); - size_t spatial8 = AlignLo(spatial, 8); - size_t tail = channels - channels8; - size_t c = 0; - for (; c < channels8; c += 8, src += 8 * spatial) - { - size_t s = 0; - const float * ps = src; - for (; s < spatial8; s += 8, dst += 8 * F, ps += 8) - Transpose8x8(ps, spatial, dst, 8); - for (; s < spatial; ++s, dst += F, ps += 1) - { - dst[0] = ps[0 * spatial]; - dst[1] = ps[1 * spatial]; - dst[2] = ps[2 * spatial]; - dst[3] = ps[3 * spatial]; - dst[4] = ps[4 * spatial]; - dst[5] = ps[5 * spatial]; - dst[6] = ps[6 * spatial]; - dst[7] = ps[7 
* spatial]; - } - } - if (tail) - { - const float * ps = src; - for (size_t s = 0; s < spatial; ++s, dst += F, ps += 1) - { - size_t i = 0; - for (; i < tail; ++i) - dst[i] = ps[i*spatial]; - for (; i < F; ++i) - dst[i] = 0; - } - } - } - - template void SynetReorderImage_Hwc_Chw(size_t channels, size_t spatial, const float * src, float * dst) - { - SynetReorderImage_Chw_Hwc(spatial, channels, src, dst); - } - - template void SynetReorderImage_Hwc_Chw8c(size_t channels, size_t spatial, const float * src, float * dst) - { - size_t channelsF = AlignLo(channels, F); - size_t channelsF4 = AlignLo(channels, 4 * F); - size_t tail = channels - channelsF; - size_t spatial4 = AlignLo(spatial, 4); - size_t stride = spatial * F; - size_t c = 0; - for (; c < channelsF4; c += 4 * F, src += 4 * F) - { - const float * ps = src; - float * pd = dst; - size_t i = 0; - for (; i < spatial4; i += 4, pd += 4 * F, ps += 4 * channels) - Transpose4x4xF(ps, channels, pd, stride); - for (; i < spatial; ++i, pd += F, ps += channels) - { - Copy(ps + 0 * F, pd + 0 * stride); - Copy(ps + 1 * F, pd + 1 * stride); - Copy(ps + 2 * F, pd + 2 * stride); - Copy(ps + 3 * F, pd + 3 * stride); - } - dst += 4 * stride; - } - for (; c < channelsF; c += F, src += F) - { - const float * ps = src; - for (size_t s = 0; s < spatial; ++s, ps += channels, dst += F) - Copy(ps, dst); - } - if (tail) - { - const float * psrc = src; - for (size_t s = 0; s < spatial; ++s, psrc += channels, dst += F) - { - size_t i = 0; - for (; i < tail; ++i) - dst[i] = psrc[i]; - for (; i < F; ++i) - dst[i] = 0; - } - } - } - - template void SynetReorderImage_Chw8c_Chw(size_t channels, size_t spatial, const float * src, float * dst) - { - size_t channels8 = AlignLo(channels, 8); - size_t spatial8 = AlignLo(spatial, 8); - size_t tail = channels - channels8; - size_t c = 0; - for (; c < channels8; c += 8, dst += 8 * spatial, src += 8 * spatial) - { - const float * ps = src; - size_t s = 0; - for (; s < spatial8; s += 8, ps += 8 * F) - Transpose8x8(ps, 8, dst + s, spatial); - for (; s < spatial; ++s, ps += 8) - { - dst[s + 0 * spatial] = ps[0]; - dst[s + 1 * spatial] = ps[1]; - dst[s + 2 * spatial] = ps[2]; - dst[s + 3 * spatial] = ps[3]; - dst[s + 4 * spatial] = ps[4]; - dst[s + 5 * spatial] = ps[5]; - dst[s + 6 * spatial] = ps[6]; - dst[s + 7 * spatial] = ps[7]; - } - } - if (tail) - { - const float * ps = src; - for (size_t i = 0; i < tail; ++i, ps += 1, dst += spatial) - { - for (size_t s = 0; s < spatial; ++s) - dst[s] = ps[s*F]; - } - } - } - - template void SynetReorderImage_Chw8c_Hwc(size_t channels, size_t spatial, const float * src, float * dst) - { - size_t stride = F * spatial; - size_t channelsF = AlignLo(channels, F); - size_t channelsF4 = AlignLo(channels, 4 * F); - size_t tail = channels - channelsF; - size_t spatial4 = AlignLo(spatial, 4); - size_t s = 0; - for (; s < spatial4; s += 4, src += 4 * F, dst += 4 * channels) - { - const float * ps = src; - float * pd = dst; - size_t c = 0; - for (; c < channelsF4; c += 4 * F, ps += 4 * stride, pd += 4 * F) - Transpose4x4xF(ps, stride, pd, channels); - for (; c < channelsF; c += F, ps += stride, pd += F) - { - Copy(ps + 0 * F, pd + 0 * channels); - Copy(ps + 1 * F, pd + 1 * channels); - Copy(ps + 2 * F, pd + 2 * channels); - Copy(ps + 3 * F, pd + 3 * channels); - } - if (tail) - { - for (size_t i = 0; i < tail; ++i) - { - pd[i + 0 * channels] = ps[i + 0 * F]; - pd[i + 1 * channels] = ps[i + 1 * F]; - pd[i + 2 * channels] = ps[i + 2 * F]; - pd[i + 3 * channels] = ps[i + 3 * F]; - } - } - } - for (; s 
< spatial; ++s, src += F) - { - const float * ps = src; - for (size_t c = 0; c < channelsF; c += F, ps += stride, dst += F) - Copy(ps, dst); - if (tail) - { - for (size_t i = 0; i < tail; ++i) - *(dst++) = ps[i]; - } - } - } - - typedef void(*SynetImageConverterPtr)(size_t channels, size_t spatial, const float * src, float * dst); - SynetImageConverterPtr GetImageConverter(SimdTensorFormatType src, SimdTensorFormatType dst) - { - if (src == SimdTensorFormatNchw) - { - if (dst == SimdTensorFormatNhwc) - return SynetReorderImage_Chw_Hwc; - if (dst == SimdTensorFormatNchw8c) - return SynetReorderImage_Chw_Chw8c; - } - if (src == SimdTensorFormatNhwc) - { - if (dst == SimdTensorFormatNchw) - return SynetReorderImage_Hwc_Chw; - if (dst == SimdTensorFormatNchw8c) - return SynetReorderImage_Hwc_Chw8c; - } - if (src == SimdTensorFormatNchw8c) - { - if (dst == SimdTensorFormatNchw) - return SynetReorderImage_Chw8c_Chw; - if (dst == SimdTensorFormatNhwc) - return SynetReorderImage_Chw8c_Hwc; - } - return NULL; - } - - void SynetReorderImage(size_t batch, size_t channels, size_t spatial, const float * src, SimdTensorFormatType srcFormat, float * dst, SimdTensorFormatType dstFormat) - { - SynetImageConverterPtr imageConverter = GetImageConverter(srcFormat, dstFormat); - if (imageConverter) - { - size_t srcStride = AlignHi(channels, Base::SynetTensorAlignment(srcFormat))*spatial; - size_t dstStride = AlignHi(channels, Base::SynetTensorAlignment(dstFormat))*spatial; - for (size_t n = 0; n < batch; ++n) - { - imageConverter(channels, spatial, src, dst); - src += srcStride; - dst += dstStride; - } - } - else - return Sse::SynetReorderImage(batch, channels, spatial, src, srcFormat, dst, dstFormat); - } - - template void SynetReorderFilter_Oiyx_Yxio(size_t output, size_t input, size_t kernel, const float * src, float * dst) - { - if (kernel == 1) - { - SynetReorderImage_Chw_Hwc(output, input, src, dst); - return; - } - size_t output8 = AlignLo(output, 8); - size_t kernel8 = AlignLo(kernel, 8); - size_t ik = input * kernel, oi = output * input; - for (size_t i = 0; i < input; ++i, src += kernel, dst += output) - { - const float * ps = src; - float * pd = dst; - size_t k = 0; - for (; k < kernel8; k += 8, ps += 8, pd += 8 * oi) - { - size_t o = 0; - for (; o < output8; o += 8) - Transpose8x8(ps + o * ik, ik, pd + o, oi); - for (; o < output; ++o) - { - pd[0 * oi + o] = ps[o * ik + 0]; - pd[1 * oi + o] = ps[o * ik + 1]; - pd[2 * oi + o] = ps[o * ik + 2]; - pd[3 * oi + o] = ps[o * ik + 3]; - pd[4 * oi + o] = ps[o * ik + 4]; - pd[5 * oi + o] = ps[o * ik + 5]; - pd[6 * oi + o] = ps[o * ik + 6]; - pd[7 * oi + o] = ps[o * ik + 7]; - } - } - for (; k < kernel; ++k, ps += 1, pd += oi) - for (size_t o = 0; o < output; ++o) - pd[o] = ps[o*ik]; - } - } - - template void SynetReorderFilter_Oiyx_Oyxi8o(size_t output, size_t input, size_t kernel, const float * src, float * dst) - { - if (kernel == 1) - { - SynetReorderImage_Chw_Chw8c(output, input, src, dst); - return; - } - size_t outputF = AlignLo(output, F); - size_t kernelF = AlignLo(kernel, F); - size_t tail = output - outputF; - size_t ik = input * kernel; - size_t stride = input * F; - for (size_t o = 0; o < outputF; o += F) - { - for (size_t i = 0; i < input; ++i) - { - const float * ps = src + o * ik + i * kernel; - float * pd = dst + o * ik + i * F; - size_t k = 0; - for (; k < kernelF; k += F, ps += F, pd += F * stride) - Transpose8x8(ps, ik, pd, stride); - for (; k < kernel; ++k, ps += 1, pd += stride) - for (size_t j = 0; j < F; ++j) - pd[j] = ps[j*ik]; - } - 
} - if (tail) - { - for (size_t i = 0; i < input; ++i) - { - const float * ps = src + outputF * ik + i * kernel; - float * pd = dst + outputF * ik + i * F; - for (size_t k = 0; k < kernel; ++k, ps += 1, pd += stride) - { - size_t j = 0; - for (; j < tail; ++j) - pd[j] = ps[j*ik]; - for (; j < F; ++j) - pd[j] = 0; - } - } - } - } - - template void SynetReorderFilter_Yxio_Oiyx(size_t output, size_t input, size_t kernel, const float * src, float * dst) - { - if (kernel == 1) - { - SynetReorderImage_Chw_Hwc(input, output, src, dst); - return; - } - SynetReorderFilter_Oiyx_Yxio(kernel, input, output, src, dst); - } - - template void SynetReorderFilter_Yxio_Oyxi8o(size_t output, size_t input, size_t kernel, const float * src, float * dst) - { - size_t outputF = AlignLo(output, F); - size_t outputF4 = AlignLo(output, F * 4); - size_t ki = kernel * input; - size_t stride = ki * F; - size_t ki4 = AlignLo(ki, 4); - size_t o = 0; - for (; o < outputF4; o += 4 * F, src += 4 * F) - { - const float * ps = src; - float * pd = dst; - size_t i = 0; - for (; i < ki4; i += 4, pd += 4 * F, ps += 4 * output) - Transpose4x4xF(ps, output, pd, stride); - for (; i < ki; ++i, pd += F, ps += output) - { - Copy(ps + 0 * F, pd + 0 * stride); - Copy(ps + 1 * F, pd + 1 * stride); - Copy(ps + 2 * F, pd + 2 * stride); - Copy(ps + 3 * F, pd + 3 * stride); - } - dst += 4 * stride; - } - for (; o < outputF; o += F, src += F) - { - const float * ps = src; - float * pd = dst; - size_t i = 0; - for (; i < ki; ++i, pd += F, ps += output) - Copy(ps, pd); - dst += stride; - } - if (outputF < output) - { - size_t tail = output - outputF; - for (size_t k = 0; k < kernel; ++k) - { - for (size_t i = 0; i < input; ++i, src += output) - { - size_t j = 0; - for (; j < tail; ++j) - *(dst++) = src[j]; - for (; j < F; ++j) - *(dst++) = 0; - } - } - } - } - - template void SynetReorderFilter_Oyxi8o_Oiyx(size_t output, size_t input, size_t kernel, const float * src, float * dst) - { - if (kernel == 1) - { - SynetReorderImage_Chw8c_Chw(output, input, src, dst); - return; - } - size_t outputF = AlignLo(output, F); - size_t tail = output - outputF; - size_t kernelF = AlignLo(kernel, F); - size_t ik = input * kernel; - size_t stride = F * input; - size_t o = 0; - for (; o < outputF; o += F, src += F * ik) - { - const float * ps = src; - float * pd = dst; - for (size_t i = 0; i < input; ++i, ps += F) - { - size_t k = 0; - for (; k < kernelF; k += F, pd += F) - Transpose8x8(ps + k * stride, stride, pd, ik); - for (; k < kernel; ++k, pd++) - { - pd[0 * ik] = ps[k*stride + 0]; - pd[1 * ik] = ps[k*stride + 1]; - pd[2 * ik] = ps[k*stride + 2]; - pd[3 * ik] = ps[k*stride + 3]; - pd[4 * ik] = ps[k*stride + 4]; - pd[5 * ik] = ps[k*stride + 5]; - pd[6 * ik] = ps[k*stride + 6]; - pd[7 * ik] = ps[k*stride + 7]; - } - } - dst += F * ik; - } - if (tail) - { - for (size_t j = 0; j < tail; ++j) - { - const float * ps = src + j; - for (size_t i = 0; i < input; ++i, ps += F) - for (size_t k = 0; k < kernel; ++k) - *(dst++) = ps[k*stride]; - } - } - } - - template void SynetReorderFilter_Oyxi8o_Yxio(size_t output, size_t input, size_t kernel, const float * src, float * dst) - { - size_t outputF = AlignLo(output, F); - size_t outputF4 = AlignLo(output, 4 * F); - size_t tail = output - outputF; - size_t ki = kernel * input; - size_t ki4 = AlignLo(ki, 4); - size_t stride = ki * F; - size_t i = 0; - for (; i < ki4; i += 4, src += 4 * F) - { - const float * ps = src; - float * pd = dst; - size_t o = 0; - for (; o < outputF4; o += 4 * F, ps += 4 * stride, pd += 4 * F) - 
Transpose4x4xF(ps, stride, pd, output); - for (; o < outputF; o += F, ps += stride, pd += F) - { - Copy(ps + 0 * F, pd + 0 * output); - Copy(ps + 1 * F, pd + 1 * output); - Copy(ps + 2 * F, pd + 2 * output); - Copy(ps + 3 * F, pd + 3 * output); - } - if (tail) - { - for (size_t j = 0; j < tail; ++j) - { - pd[j + 0 * output] = ps[j + 0 * F]; - pd[j + 1 * output] = ps[j + 1 * F]; - pd[j + 2 * output] = ps[j + 2 * F]; - pd[j + 3 * output] = ps[j + 3 * F]; - } - } - dst += 4 * output; - } - for (; i < ki; ++i, src += F) - { - const float * ps = src; - for (size_t o = 0; o < outputF; o += F, ps += stride, dst += F) - Copy(ps, dst); - if (tail) - { - for (size_t j = 0; j < tail; ++j) - *(dst++) = ps[j]; - } - } - } - - typedef void(*SynetFilterConverterPtr)(size_t output, size_t input, size_t kernel, const float * src, float * dst); - SynetFilterConverterPtr GetFilterConverter(SimdTensorFormatType src, SimdTensorFormatType dst) - { - if (src == SimdTensorFormatOiyx) - { - if (dst == SimdTensorFormatYxio) - return SynetReorderFilter_Oiyx_Yxio; - if (dst == SimdTensorFormatOyxi8o) - return SynetReorderFilter_Oiyx_Oyxi8o; - } - if (src == SimdTensorFormatYxio) - { - if (dst == SimdTensorFormatOiyx) - return SynetReorderFilter_Yxio_Oiyx; - if (dst == SimdTensorFormatOyxi8o) - return SynetReorderFilter_Yxio_Oyxi8o; - } - if (src == SimdTensorFormatOyxi8o) - { - if (dst == SimdTensorFormatOiyx) - return SynetReorderFilter_Oyxi8o_Oiyx; - if (dst == SimdTensorFormatYxio) - return SynetReorderFilter_Oyxi8o_Yxio; - } - return NULL; - } - - void SynetReorderFilter(size_t output, size_t input, size_t kernel, const float * src, SimdTensorFormatType srcFormat, float * dst, SimdTensorFormatType dstFormat) - { - SynetFilterConverterPtr filterConverter = GetFilterConverter(srcFormat, dstFormat); - if (filterConverter) - filterConverter(output, input, kernel, src, dst); - else - Sse::SynetReorderFilter(output, input, kernel, src, srcFormat, dst, dstFormat); - } - } -#endif// SIMD_AVX_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx1SynetConvolution32f.cpp b/src/3rd/Simd/Simd/SimdAvx1SynetConvolution32f.cpp deleted file mode 100644 index da01a66b..00000000 --- a/src/3rd/Simd/Simd/SimdAvx1SynetConvolution32f.cpp +++ /dev/null @@ -1,2104 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdSynetConvolution32f.h" -#include "Simd/SimdExtract.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdSynet.h" -#include "Simd/SimdAvx1.h" -#include "Simd/SimdGemm.h" - -namespace Simd -{ -#ifdef SIMD_AVX_ENABLE - namespace Avx - { - void ConvolutionBiasAndActivation(const float * bias, size_t count, size_t size, ::SimdConvolutionActivationType activation, const float * params, ::SimdBool trans, float * dst) - { - size_t aligned = trans ? AlignLo(count, F) : AlignLo(size, F); - if (activation == ::SimdConvolutionActivationIdentity) - { - if (bias) - SynetAddBias(bias, count, size, dst, (SimdTensorFormatType)trans); - } - else if (activation == ::SimdConvolutionActivationRelu) - { - if (bias) - { - __m256 _0 = _mm256_set1_ps(0.0f); - if (trans) - { - for (size_t j = 0; j < size; ++j) - { - size_t i = 0; - for (; i < aligned; i += F) - { - __m256 _dst = _mm256_loadu_ps(dst + i); - __m256 _bias = _mm256_loadu_ps(bias + i); - _mm256_storeu_ps(dst + i, _mm256_max_ps(_0, _mm256_add_ps(_dst, _bias))); - } - for (; i < count; ++i) - dst[i] = Simd::Max(0.0f, dst[i] + bias[i]); - dst += count; - } - } - else - { - for (size_t i = 0; i < count; ++i) - { - __m256 _bias = _mm256_set1_ps(bias[i]); - size_t j = 0; - for (; j < aligned; j += F) - { - __m256 _dst = _mm256_loadu_ps(dst + j); - _mm256_storeu_ps(dst + j, _mm256_max_ps(_0, _mm256_add_ps(_dst, _bias))); - } - for (; j < size; ++j) - dst[j] = Simd::Max(0.0f, dst[j] + bias[i]); - dst += size; - } - } - } - else - { - float slope = 0; - SynetRelu32f(dst, size*count, &slope, dst); - } - } - else if (activation == ::SimdConvolutionActivationLeakyRelu) - { - float slope = params[0]; - if (bias) - { - __m256 _slope = _mm256_set1_ps(slope); - if (trans) - { - for (size_t j = 0; j < size; ++j) - { - size_t i = 0; - for (; i < aligned; i += F) - { - __m256 value = _mm256_add_ps(_mm256_loadu_ps(dst + i), _mm256_loadu_ps(bias + i)); - _mm256_storeu_ps(dst + i, SynetRelu32f(value, _slope)); - } - for (; i < count; ++i) - dst[i] = Base::SynetRelu32f(dst[i] + bias[i], slope); - dst += count; - } - } - else - { - for (size_t i = 0; i < count; ++i) - { - __m256 _bias = _mm256_set1_ps(bias[i]); - size_t j = 0; - for (; j < aligned; j += F) - { - __m256 value = _mm256_add_ps(_mm256_loadu_ps(dst + j), _bias); - _mm256_storeu_ps(dst + j, SynetRelu32f(value, _slope)); - } - for (; j < size; ++j) - dst[j] = Base::SynetRelu32f(dst[j] + bias[i], slope); - dst += size; - } - } - } - else - SynetRelu32f(dst, size*count, &slope, dst); - } - else if (activation == ::SimdConvolutionActivationRestrictRange) - { - float lower = params[0]; - float upper = params[1]; - if (bias) - { - __m256 _lower = _mm256_set1_ps(lower); - __m256 _upper = _mm256_set1_ps(upper); - if (trans) - { - for (size_t j = 0; j < size; ++j) - { - size_t i = 0; - for (; i < aligned; i += F) - { - __m256 value = _mm256_add_ps(_mm256_loadu_ps(dst + i), _mm256_loadu_ps(bias + i)); - _mm256_storeu_ps(dst + i, _mm256_min_ps(_mm256_max_ps(_lower, value), _upper)); - } - for (; i < count; ++i) - dst[i] = Simd::RestrictRange(dst[i] + bias[i], lower, upper); - dst += count; - } - } - else - { - for (size_t i = 0; i < count; ++i) - { - __m256 _bias = _mm256_set1_ps(bias[i]); - size_t j = 0; - for (; j < aligned; j += F) - { - __m256 value = _mm256_add_ps(_mm256_loadu_ps(dst + j), _bias); - _mm256_storeu_ps(dst + j, _mm256_min_ps(_mm256_max_ps(_lower, value), _upper)); - } - for (; j < size; ++j) - dst[j] = Simd::RestrictRange(dst[j] + bias[i], lower, upper); - dst += size; - } - } - } - 
else - SynetRestrictRange32f(dst, size*count, &lower, &upper, dst); - } - else if (activation == ::SimdConvolutionActivationPrelu) - { - if (bias) - { - if (trans) - { - if (count == 1 || count == 2 || count == 4 || count == 8) - { - __m256 _bias, _slope; - if (count == 1) - { - _bias = _mm256_set1_ps(bias[0]); - _slope = _mm256_set1_ps(params[0]); - } - else if (count == 2) - { - _bias = _mm256_setr_ps(bias[0], bias[1], bias[0], bias[1], bias[0], bias[1], bias[0], bias[1]); - _slope = _mm256_setr_ps(params[0], params[1], params[0], params[1], params[0], params[1], params[0], params[1]); - } - else if (count == 4) - { - _bias = _mm256_setr_ps(bias[0], bias[1], bias[2], bias[3], bias[0], bias[1], bias[2], bias[3]); - _slope = _mm256_setr_ps(params[0], params[1], params[2], params[3], params[0], params[1], params[2], params[3]); - } - else if (count == 8) - { - _bias = _mm256_setr_ps(bias[0], bias[1], bias[2], bias[3], bias[4], bias[5], bias[6], bias[7]); - _slope = _mm256_setr_ps(params[0], params[1], params[2], params[3], params[4], params[5], params[6], params[7]); - } - else - assert(0); - size_t n = size * count, nF = AlignLo(n, F), i = 0; - for (; i < nF; i += F) - { - __m256 value = _mm256_add_ps(_mm256_loadu_ps(dst + i), _bias); - _mm256_storeu_ps(dst + i, SynetRelu32f(value, _slope)); - } - dst += nF; - for (size_t j = nF/count; j < size; ++j) - { - for (size_t i = 0; i < count; ++i) - dst[i] = Base::SynetRelu32f(dst[i] + bias[i], params[i]); - dst += count; - } - } - else - { - for (size_t j = 0; j < size; ++j) - { - size_t i = 0; - for (; i < aligned; i += F) - { - __m256 value = _mm256_add_ps(_mm256_loadu_ps(dst + i), _mm256_loadu_ps(bias + i)); - _mm256_storeu_ps(dst + i, SynetRelu32f(value, _mm256_loadu_ps(params + i))); - } - for (; i < count; ++i) - dst[i] = Base::SynetRelu32f(dst[i] + bias[i], params[i]); - dst += count; - } - } - } - else - { - for (size_t i = 0; i < count; ++i) - { - __m256 _bias = _mm256_set1_ps(bias[i]); - __m256 _slope = _mm256_set1_ps(params[i]); - size_t j = 0; - for (; j < aligned; j += F) - { - __m256 value = _mm256_add_ps(_mm256_loadu_ps(dst + j), _bias); - _mm256_storeu_ps(dst + j, SynetRelu32f(value, _slope)); - } - for (; j < size; ++j) - dst[j] = Base::SynetRelu32f(dst[j] + bias[i], params[i]); - dst += size; - } - } - } - else - Avx::SynetPreluLayerForward(dst, params, count, size, dst, (SimdTensorFormatType)trans); - } - else if (activation == ::SimdConvolutionActivationHswish) - { - float shift = params[0]; - float scale = params[1]; - if (bias) - { - __m256 _shift = _mm256_set1_ps(shift); - __m256 _scale = _mm256_set1_ps(scale); - if (trans) - { - for (size_t j = 0; j < size; ++j) - { - size_t i = 0; - for (; i < aligned; i += F) - { - __m256 value = _mm256_add_ps(Load(dst + i), Load(bias + i)); - Store(dst + i, SynetHswish32f(value, _shift, _scale)); - } - for (; i < count; ++i) - dst[i] = Base::SynetHswish32f(dst[i] + bias[i], shift, scale); - dst += count; - } - } - else - { - for (size_t i = 0; i < count; ++i) - { - __m256 _bias = _mm256_set1_ps(bias[i]); - size_t j = 0; - for (; j < aligned; j += F) - { - __m256 value = _mm256_add_ps(Load(dst + j), _bias); - Store(dst + j, SynetHswish32f(value, _shift, _scale)); - } - for (; j < size; ++j) - dst[j] = Base::SynetHswish32f(dst[j] + bias[i], shift, scale); - dst += size; - } - } - } - } - else - { - Sse2::ConvolutionBiasAndActivation(bias, count, size, activation, params, trans, dst); - } - } - - //--------------------------------------------------------------------- - - 
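// ConvolutionBiasAndActivation above fuses the bias add with the activation so
// each output element is loaded and stored exactly once. A scalar sketch of the
// same contract (illustrative only: BiasAndActivationRef and the activation
// callback are stand-ins, not part of the library API). trans != 0 means NHWC,
// where bias is indexed per channel inside each row; trans == 0 means NCHW,
// where one bias value covers a whole plane of 'size' elements.
static void BiasAndActivationRef(const float* bias, size_t count, size_t size,
    float (*activation)(float), int trans, float* dst)
{
    if (trans)
    {
        for (size_t j = 0; j < size; ++j, dst += count)     // 'size' pixels ...
            for (size_t i = 0; i < count; ++i)              // ... of 'count' channels
                dst[i] = activation(dst[i] + (bias ? bias[i] : 0.0f));
    }
    else
    {
        for (size_t i = 0; i < count; ++i, dst += size)     // 'count' planes ...
            for (size_t j = 0; j < size; ++j)               // ... of 'size' elements
                dst[j] = activation(dst[j] + (bias ? bias[i] : 0.0f));
    }
}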
SynetConvolution32fGemmNN::SynetConvolution32fGemmNN(const ConvParam32f & p) - : Sse2::SynetConvolution32fGemmNN(p) - { - _gemm.Init(InitGemmFuncs(Avx::Gemm32fNN, "Avx", p.gemm, "Ext")); - if (_param.trans && _param.group == 1) - { - if (NHWC_GEMM_RUNTIME) - { - _gemmCb.Init(InitGemmCbFuncs(Avx::Gemm32fNNcbBufferSize, Avx::Gemm32fNNcbReorderB, Avx::Gemm32fNNcbRun, "Avx", GemmKernelF2, GemmKernelF3)); - _nhwcWeight.Resize(_gemmCb.At(0).BufferSize(_M*_merge, _N, _K)); - } - else - _nhwcWeight.Resize(Avx::Gemm32fNNcbBufferSize(_M*_merge, _N, _K, GemmKernelAny, NHWC_GEMM_COMPATIBLE)); - _nhwcRun = Avx::Gemm32fNNcbRun; - _nhwcReorderB = Avx::Gemm32fNNcbReorderB; - } - _biasAndActivation = Avx::ConvolutionBiasAndActivation; - } - - template SIMD_INLINE void Copy(const float * src, float * dst) - { - memcpy(dst, src, size * sizeof(float)); - } - - template SIMD_INLINE void Zero(float * dst) - { - memset(dst, 0, size * sizeof(float)); - } - - template<> SIMD_INLINE void Copy<16>(const float * src, float * dst) - { - _mm256_stream_ps(dst + 0 * F, _mm256_loadu_ps(src + 0 * F)); - _mm256_stream_ps(dst + 1 * F, _mm256_loadu_ps(src + 1 * F)); - } - - template<> SIMD_INLINE void Zero<16>(float * dst) - { - _mm256_stream_ps(dst + 0 * F, _mm256_setzero_ps()); - _mm256_stream_ps(dst + 1 * F, _mm256_setzero_ps()); - } - - template<> SIMD_INLINE void Copy<24>(const float * src, float * dst) - { - _mm256_stream_ps(dst + 0 * F, _mm256_loadu_ps(src + 0 * F)); - _mm256_stream_ps(dst + 1 * F, _mm256_loadu_ps(src + 1 * F)); - _mm256_stream_ps(dst + 2 * F, _mm256_loadu_ps(src + 2 * F)); - } - - template<> SIMD_INLINE void Zero<24>(float * dst) - { - _mm256_stream_ps(dst + 0 * F, _mm256_setzero_ps()); - _mm256_stream_ps(dst + 1 * F, _mm256_setzero_ps()); - _mm256_stream_ps(dst + 2 * F, _mm256_setzero_ps()); - } - - template<> SIMD_INLINE void Copy<32>(const float * src, float * dst) - { - _mm256_stream_ps(dst + 0 * F, _mm256_loadu_ps(src + 0 * F)); - _mm256_stream_ps(dst + 1 * F, _mm256_loadu_ps(src + 1 * F)); - _mm256_stream_ps(dst + 2 * F, _mm256_loadu_ps(src + 2 * F)); - _mm256_stream_ps(dst + 3 * F, _mm256_loadu_ps(src + 3 * F)); - } - - template<> SIMD_INLINE void Zero<32>(float * dst) - { - _mm256_stream_ps(dst + 0 * F, _mm256_setzero_ps()); - _mm256_stream_ps(dst + 1 * F, _mm256_setzero_ps()); - _mm256_stream_ps(dst + 2 * F, _mm256_setzero_ps()); - _mm256_stream_ps(dst + 3 * F, _mm256_setzero_ps()); - } - - template<> SIMD_INLINE void Copy<48>(const float * src, float * dst) - { - _mm256_stream_ps(dst + 0 * F, _mm256_loadu_ps(src + 0 * F)); - _mm256_stream_ps(dst + 1 * F, _mm256_loadu_ps(src + 1 * F)); - _mm256_stream_ps(dst + 2 * F, _mm256_loadu_ps(src + 2 * F)); - _mm256_stream_ps(dst + 3 * F, _mm256_loadu_ps(src + 3 * F)); - _mm256_stream_ps(dst + 4 * F, _mm256_loadu_ps(src + 4 * F)); - _mm256_stream_ps(dst + 5 * F, _mm256_loadu_ps(src + 5 * F)); - } - - template<> SIMD_INLINE void Zero<48>(float * dst) - { - _mm256_stream_ps(dst + 0 * F, _mm256_setzero_ps()); - _mm256_stream_ps(dst + 1 * F, _mm256_setzero_ps()); - _mm256_stream_ps(dst + 2 * F, _mm256_setzero_ps()); - _mm256_stream_ps(dst + 3 * F, _mm256_setzero_ps()); - _mm256_stream_ps(dst + 4 * F, _mm256_setzero_ps()); - _mm256_stream_ps(dst + 5 * F, _mm256_setzero_ps()); - } - - template<> SIMD_INLINE void Copy<64>(const float * src, float * dst) - { - _mm256_stream_ps(dst + 0 * F, _mm256_loadu_ps(src + 0 * F)); - _mm256_stream_ps(dst + 1 * F, _mm256_loadu_ps(src + 1 * F)); - _mm256_stream_ps(dst + 2 * F, _mm256_loadu_ps(src + 2 * F)); - 
_mm256_stream_ps(dst + 3 * F, _mm256_loadu_ps(src + 3 * F)); - _mm256_stream_ps(dst + 4 * F, _mm256_loadu_ps(src + 4 * F)); - _mm256_stream_ps(dst + 5 * F, _mm256_loadu_ps(src + 5 * F)); - _mm256_stream_ps(dst + 6 * F, _mm256_loadu_ps(src + 6 * F)); - _mm256_stream_ps(dst + 7 * F, _mm256_loadu_ps(src + 7 * F)); - } - - template<> SIMD_INLINE void Zero<64>(float * dst) - { - _mm256_stream_ps(dst + 0 * F, _mm256_setzero_ps()); - _mm256_stream_ps(dst + 1 * F, _mm256_setzero_ps()); - _mm256_stream_ps(dst + 2 * F, _mm256_setzero_ps()); - _mm256_stream_ps(dst + 3 * F, _mm256_setzero_ps()); - _mm256_stream_ps(dst + 4 * F, _mm256_setzero_ps()); - _mm256_stream_ps(dst + 5 * F, _mm256_setzero_ps()); - _mm256_stream_ps(dst + 6 * F, _mm256_setzero_ps()); - _mm256_stream_ps(dst + 7 * F, _mm256_setzero_ps()); - } - - template void ImgToCol(const ConvParam32f & p, const float * src, float * dst) - { - for (size_t g = 0; g < p.group; ++g) - { - for (size_t dy = 0; dy < p.dstH; ++dy) - { - for (size_t dx = 0; dx < p.dstW; ++dx) - { - for (size_t ky = 0; ky < p.kernelY; ky++) - { - size_t sy = dy * p.strideY + ky * p.dilationY - p.padY; - if (sy < p.srcH) - { - for (size_t kx = 0; kx < p.kernelX; kx++) - { - size_t sx = dx * p.strideX + kx * p.dilationX - p.padX; - if (sx < p.srcW) - { - Copy(src + (sy * p.srcW + sx)*p.srcC, dst); - dst += size; - } - else - { - Zero(dst); - dst += size; - } - } - } - else - { - for (size_t kx = 0; kx < p.kernelX; kx++) - Zero(dst), dst += size; - } - } - } - } - src += size; - } - } - - void SynetConvolution32fGemmNN::ImgToRow(const float * src, float * dst) - { - const ConvParam32f & p = _param; - assert(p.trans); - size_t size = p.srcC / p.group; - if (size*p.dstH*p.dstW*p.kernelY*p.kernelX >= 1024 * 512 && Aligned(dst)) - { - if (size == 16) - { - Avx::ImgToCol<16>(p, src, dst); - return; - } - if (size == 24) - { - Avx::ImgToCol<24>(p, src, dst); - return; - } - if (size == 32) - { - Avx::ImgToCol<32>(p, src, dst); - return; - } - if (size == 48) - { - Avx::ImgToCol<48>(p, src, dst); - return; - } - if (size == 64) - { - Avx::ImgToCol<64>(p, src, dst); - return; - } - } - for (size_t g = 0; g < p.group; ++g) - { - for (size_t dy = 0; dy < p.dstH; ++dy) - { - for (size_t dx = 0; dx < p.dstW; ++dx) - { - for (size_t ky = 0; ky < p.kernelY; ky++) - { - size_t sy = dy * p.strideY + ky * p.dilationY - p.padY; - if (sy < p.srcH) - { - for (size_t kx = 0; kx < p.kernelX; kx++) - { - size_t sx = dx * p.strideX + kx * p.dilationX - p.padX; - if (sx < p.srcW) - { - memcpy(dst, src + (sy * p.srcW + sx)*p.srcC, size * sizeof(float)); - dst += size; - } - else - { - memset(dst, 0, size * sizeof(float)); - dst += size; - } - } - } - else - { - memset(dst, 0, p.kernelX * size * sizeof(float)); - dst += p.kernelX * size; - } - } - } - } - src += size; - } - } - - //--------------------------------------------------------------------- - - SynetConvolution32fGemmNT::SynetConvolution32fGemmNT(const ConvParam32f & p) - : Sse3::SynetConvolution32fGemmNT(p) - { - _gemm.Init(InitGemmFuncs(Avx::Gemm32fNT, "Avx")); - _biasAndActivation = Avx::ConvolutionBiasAndActivation; - } - - //--------------------------------------------------------------------- - - SynetConvolution32fWinograd::SynetConvolution32fWinograd(const ConvParam32f & p) - : Sse2::SynetConvolution32fWinograd(p) - { - if (p.kernelY == 1 && p.kernelX == 3) - { - { - SetBlock(1, 4); - _setFilter = Avx::WinogradKernel1x3Block1x4SetFilter; - _setInput = Avx::WinogradKernel1x3Block1x4SetInput; - _setOutput = 
Avx::WinogradKernel1x3Block1x4SetOutput; - } - } - else if (p.kernelY == 1 && p.kernelX == 5) - { - { - SetBlock(1, 4); - _setFilter = Avx::WinogradKernel1x5Block1x4SetFilter; - _setInput = Avx::WinogradKernel1x5Block1x4SetInput; - _setOutput = Avx::WinogradKernel1x5Block1x4SetOutput; - } - } - else if (p.kernelY == 2 && p.kernelX == 2) - { - if (_blockY == 4 && _blockX == 4) - { - SetBlock(4, 4); - _setFilter = Avx::WinogradKernel2x2Block4x4SetFilter; - _setInput = Avx::WinogradKernel2x2Block4x4SetInput; - _setOutput = Avx::WinogradKernel2x2Block4x4SetOutput; - } - else if (_blockY == 2 && _blockX == 2) - { - SetBlock(2, 2); - _setFilter = Avx::WinogradKernel2x2Block2x2SetFilter; - _setInput = Avx::WinogradKernel2x2Block2x2SetInput; - _setOutput = Avx::WinogradKernel2x2Block2x2SetOutput; - } - else - assert(0); - } - else if (p.kernelY == 3 && p.kernelX == 3) - { - if (_blockY == 4 && _blockX == 4) - { - _setFilter = Avx::WinogradKernel3x3Block4x4SetFilter; - _setInput = Avx::WinogradKernel3x3Block4x4SetInput; - _setOutput = Avx::WinogradKernel3x3Block4x4SetOutput; - } - else if (_blockY == 3 && _blockX == 3) - { - _setFilter = Avx::WinogradKernel3x3Block3x3SetFilter; - _setInput = Avx::WinogradKernel3x3Block3x3SetInput; - _setOutput = Avx::WinogradKernel3x3Block3x3SetOutput; - } - else if (_blockY == 2 && _blockX == 2) - { - _setFilter = Avx::WinogradKernel3x3Block2x2SetFilter; - _setInput = Avx::WinogradKernel3x3Block2x2SetInput; - _setOutput = Avx::WinogradKernel3x3Block2x2SetOutput; - } - else - assert(0); - } - else - assert(0); - _gemm.Init(InitGemmFuncs(Avx::Gemm32fNN, "Avx", p.gemm, "Ext")); - if (_param.trans) - { - if (NHWC_GEMM_RUNTIME) - { - _gemmCb.Init(InitGemmCbFuncs(Avx::Gemm32fNNcbBufferSize, Avx::Gemm32fNNcbReorderB, Avx::Gemm32fNNcbRun, "Avx", GemmKernelF2, GemmKernelF3)); - _nhwcStrideW = _gemmCb.At(0).BufferSize(_M*_merge, _N, _K); - } - else - _nhwcStrideW = Avx::Gemm32fNNcbBufferSize(_M*_merge, _N, _K, GemmKernelAny, NHWC_GEMM_COMPATIBLE); - _nhwcWeight.Resize(_nhwcStrideW*_count); - _nhwcRun = Avx::Gemm32fNNcbRun; - _nhwcReorderB = Avx::Gemm32fNNcbReorderB; - } - _biasAndActivation = Avx::ConvolutionBiasAndActivation; - } - - //--------------------------------------------------------------------- - - SynetConvolution32fDirectNchw::SynetConvolution32fDirectNchw(const ConvParam32f & p) - : Sse2::SynetConvolution32fDirectNchw(p) - { - _convolutionBiasActivation = SetConvolutionBiasActivation(); - } - - template SIMD_INLINE void LoadWeight(const float * src, __m256 * dst) - { - for (size_t i = 0; i < size; ++i) - dst[i] = _mm256_set1_ps(src[i]); - } - - template struct Kernel - { - static __m256 SynetConvolution32f(const float * src, size_t step, const __m256 * weight); - }; - - template<> struct Kernel<1, 1> - { - static SIMD_INLINE __m256 SynetConvolution32f(const float * src, size_t step, const __m256 * weight) - { - return _mm256_mul_ps(_mm256_loadu_ps(src), weight[0]); - } - }; - - template<> struct Kernel<2, 1> - { - static SIMD_INLINE __m256 RowConv(const float * src, const __m256 * weight) - { - return _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(src + 0), weight[0]), _mm256_mul_ps(_mm256_loadu_ps(src + 1), weight[1])); - } - - static SIMD_INLINE __m256 SynetConvolution32f(const float * src, size_t step, const __m256 * weight) - { - return _mm256_add_ps(RowConv(src, weight), RowConv(src + step, weight + 2)); - } - }; - - template<> struct Kernel<3, 1> - { - static SIMD_INLINE __m256 RowConv(const float * src, const __m256 * weight) - { - return 
_mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(src), weight[0]), - _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(src + 1), weight[1]), - _mm256_mul_ps(_mm256_loadu_ps(src + 2), weight[2]))); - } - - static SIMD_INLINE __m256 SynetConvolution32f(const float * src, size_t step, const __m256 * weight) - { - return _mm256_add_ps(RowConv(src, weight), - _mm256_add_ps(RowConv(src + step, weight + 3), - RowConv(src + 2 * step, weight + 6))); - } - }; - - template<::SimdConvolutionActivationType type> SIMD_INLINE __m256 Activate(__m256 value, const __m256 * params); - - template<> SIMD_INLINE __m256 Activate<::SimdConvolutionActivationIdentity>(__m256 value, const __m256 * params) - { - return value; - } - - template<> SIMD_INLINE __m256 Activate<::SimdConvolutionActivationRelu>(__m256 value, const __m256 * params) - { - return _mm256_max_ps(_mm256_setzero_ps(), value); - } - - template<> SIMD_INLINE __m256 Activate<::SimdConvolutionActivationLeakyRelu>(__m256 value, const __m256 * params) - { - return _mm256_add_ps(_mm256_max_ps(_mm256_setzero_ps(), value), _mm256_mul_ps(params[0], _mm256_min_ps(_mm256_setzero_ps(), value))); - } - - template<> SIMD_INLINE __m256 Activate<::SimdConvolutionActivationRestrictRange>(__m256 value, const __m256 * params) - { - return _mm256_min_ps(_mm256_max_ps(params[0], value), params[1]); - } - - template<> SIMD_INLINE __m256 Activate<::SimdConvolutionActivationPrelu>(__m256 value, const __m256 * params) - { - return _mm256_add_ps(_mm256_max_ps(_mm256_setzero_ps(), value), _mm256_mul_ps(params[0], _mm256_min_ps(_mm256_setzero_ps(), value))); - } - - template<> SIMD_INLINE __m256 Activate<::SimdConvolutionActivationHswish>(__m256 value, const __m256 * params) - { - return Avx::SynetHswish32f(value, params[0], params[1]); - } - - template - void ConvolutionBiasActivation(const float * src, size_t srcC, size_t srcH, size_t srcW, const float * weight, - const float * bias, const float * params, float * dst, size_t dstC, size_t dstH, size_t dstW) - { - __m256 _weight[kernel*kernel]; - __m256 _params[2]; - _params[0] = _mm256_set1_ps(params[0]); - if (type == ::SimdConvolutionActivationRestrictRange || type == ::SimdConvolutionActivationHswish) - _params[1] = _mm256_set1_ps(params[1]); - size_t dstWF = Simd::AlignLo(dstW, F); - __m256 tail = RightNotZero32f(dstW - dstWF); - for (size_t dc = 0; dc < dstC; ++dc) - { - if (type == ::SimdConvolutionActivationPrelu) - _params[0] = _mm256_set1_ps(params[dc]); - if (srcC == 1) - { - const float * ps = src; - float * pd = dst; - LoadWeight(weight, _weight); - __m256 _bias = bias ? _mm256_set1_ps(bias[dc]) : _mm256_setzero_ps(); - for (size_t y = 0; y < dstH; ++y) - { - for (size_t x = 0; x < dstWF; x += F) - { - __m256 conv = Kernel::SynetConvolution32f(ps + x * stride, srcW, _weight); - _mm256_storeu_ps(pd + x, Activate(_mm256_add_ps(_bias, conv), _params)); - } - if (dstWF < dstW) - { - size_t x = dstW - F; - __m256 _dst = _mm256_loadu_ps(pd + x); - __m256 conv = Kernel::SynetConvolution32f(ps + x * stride, srcW, _weight); - _mm256_storeu_ps(pd + x, _mm256_blendv_ps(_dst, Activate(_mm256_add_ps(_bias, conv), _params), tail)); - } - ps += srcW * stride; - pd += dstW; - } - weight += kernel * kernel; - } - else - { - size_t sc = 0; - for (; sc < 1; ++sc) - { - const float * ps = src; - float * pd = dst; - LoadWeight(weight, _weight); - __m256 _bias = bias ? 
_mm256_set1_ps(bias[dc]) : _mm256_setzero_ps(); - for (size_t y = 0; y < dstH; ++y) - { - for (size_t x = 0; x < dstWF; x += F) - { - __m256 conv = Kernel::SynetConvolution32f(ps + x * stride, srcW, _weight); - _mm256_storeu_ps(pd + x, _mm256_add_ps(_bias, conv)); - } - if (dstWF < dstW) - { - size_t x = dstW - F; - __m256 _dst = _mm256_loadu_ps(pd + x); - __m256 conv = Kernel::SynetConvolution32f(ps + x * stride, srcW, _weight); - _mm256_storeu_ps(pd + x, _mm256_blendv_ps(_dst, _mm256_add_ps(_bias, conv), tail)); - } - ps += srcW * stride; - pd += dstW; - } - weight += kernel * kernel; - } - for (; sc < srcC - 1; ++sc) - { - const float * ps = src + sc * srcW * srcH; - float * pd = dst; - LoadWeight(weight, _weight); - for (size_t y = 0; y < dstH; ++y) - { - for (size_t x = 0; x < dstWF; x += F) - { - __m256 _dst = _mm256_loadu_ps(pd + x); - __m256 conv = Kernel::SynetConvolution32f(ps + x * stride, srcW, _weight); - _mm256_storeu_ps(pd + x, _mm256_add_ps(_dst, conv)); - } - if (dstWF < dstW) - { - size_t x = dstW - F; - __m256 _dst = _mm256_loadu_ps(pd + x); - __m256 conv = Kernel::SynetConvolution32f(ps + x * stride, srcW, _weight); - _mm256_storeu_ps(pd + x, _mm256_add_ps(_dst, _mm256_and_ps(conv, tail))); - } - ps += srcW * stride; - pd += dstW; - } - weight += kernel * kernel; - } - for (; sc < srcC; ++sc) - { - const float * ps = src + sc * srcW * srcH; - float * pd = dst; - LoadWeight(weight, _weight); - for (size_t y = 0; y < dstH; ++y) - { - for (size_t x = 0; x < dstWF; x += F) - { - __m256 _dst = _mm256_loadu_ps(pd + x); - __m256 conv = Kernel::SynetConvolution32f(ps + x * stride, srcW, _weight); - _mm256_storeu_ps(pd + x, Activate(_mm256_add_ps(_dst, conv), _params)); - } - if (dstWF < dstW) - { - size_t x = dstW - F; - __m256 _dst = _mm256_loadu_ps(pd + x); - __m256 conv = Kernel::SynetConvolution32f(ps + x * stride, srcW, _weight); - _mm256_storeu_ps(pd + x, _mm256_blendv_ps(_dst, Activate(_mm256_add_ps(_dst, conv), _params), tail)); - } - ps += srcW * stride; - pd += dstW; - } - weight += kernel * kernel; - } - } - dst += dstH * dstW; - } - } - - template SynetConvolution32fDirectNchw::ConvolutionBiasActivationPtr SetConvolutionBiasActivation(::SimdConvolutionActivationType type) - { - switch (type) - { - case ::SimdConvolutionActivationIdentity: return ConvolutionBiasActivation; - case ::SimdConvolutionActivationRelu: return ConvolutionBiasActivation; - case ::SimdConvolutionActivationLeakyRelu: return ConvolutionBiasActivation; - case ::SimdConvolutionActivationRestrictRange: return ConvolutionBiasActivation; - case ::SimdConvolutionActivationPrelu: return ConvolutionBiasActivation; - case ::SimdConvolutionActivationHswish: return ConvolutionBiasActivation; - default: - assert(0); - return NULL; - } - } - - SynetConvolution32fDirectNchw::ConvolutionBiasActivationPtr SynetConvolution32fDirectNchw::SetConvolutionBiasActivation() - { - const ConvParam32f & p = _param; - if (p.dstW < F) - return Sse2::SynetConvolution32fDirectNchw::SetConvolutionBiasActivation(); - switch (p.strideX) - { - case 1: - if (p.kernelX == 1) - return Avx::SetConvolutionBiasActivation<1, 1>(p.activation); - if (p.kernelX == 2) - return Avx::SetConvolutionBiasActivation<2, 1>(p.activation); - if (p.kernelX == 3) - return Avx::SetConvolutionBiasActivation<3, 1>(p.activation); - break; - } - return Sse2::SynetConvolution32fDirectNchw::SetConvolutionBiasActivation(); - } - - //--------------------------------------------------------------------- - - 
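// The direct NCHW kernels above split the loop over input channels into three
// phases so the activation runs exactly once per output element: the first
// channel seeds the output with bias plus its convolution, middle channels
// accumulate raw partial sums, and the last channel accumulates and then
// activates. A scalar sketch of that control flow (illustrative only; Conv and
// Activate are placeholder callables for one kernel-window dot product and the
// chosen activation, not library types):
template<class Conv, class Activate>
void DirectNchwRef(size_t srcC, size_t dstHW, float bias,
    Conv conv, Activate activate, float* dst)
{
    for (size_t sc = 0; sc < srcC; ++sc)
    {
        for (size_t i = 0; i < dstHW; ++i)
        {
            float sum = conv(sc, i);                          // one kernel window, channel sc
            sum += (sc == 0 ? bias : dst[i]);                 // bias enters once, then accumulate
            dst[i] = (sc + 1 == srcC) ? activate(sum) : sum;  // activate on the last channel only
        }
    }
}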
SynetConvolution32fDirectNhwc::SynetConvolution32fDirectNhwc(const ConvParam32f & p) - : Sse2::SynetConvolution32fDirectNhwc(p) - { - _convolutionBiasActivation = SetConvolutionBiasActivation(); - } - - bool SynetConvolution32fDirectNhwc::Preferable(const ConvParam32f & p) - { - if (!p.IsDilation(1) || p.trans == 0) - return false; - if (p.group == 1) - { - if (p.kernelY > p.srcH || p.kernelX > p.srcW) - return false; - if (p.trans && p.IsKernel(1) && p.dstC < Sse::F) - return false; - return p.srcC <= 16 || (p.IsKernel(1) && p.srcC*p.dstC <= 8 * 1024 && p.dstC >= F && p.dstC > p.srcC); - } - else if (p.IsDepthwise()) - { - return true; - } - return false; - } - - template<::SimdConvolutionActivationType type> SIMD_INLINE __m256 Activate(__m256 value, const float * params, size_t offset); - - template<> SIMD_INLINE __m256 Activate<::SimdConvolutionActivationIdentity>(__m256 value, const float * params, size_t offset) - { - return value; - } - - template<> SIMD_INLINE __m256 Activate<::SimdConvolutionActivationRelu>(__m256 value, const float * params, size_t offset) - { - return _mm256_max_ps(_mm256_setzero_ps(), value); - } - - template<> SIMD_INLINE __m256 Activate<::SimdConvolutionActivationLeakyRelu>(__m256 value, const float * params, size_t offset) - { - return _mm256_add_ps(_mm256_max_ps(_mm256_setzero_ps(), value), _mm256_mul_ps(_mm256_set1_ps(params[0]), _mm256_min_ps(_mm256_setzero_ps(), value))); - } - - template<> SIMD_INLINE __m256 Activate<::SimdConvolutionActivationRestrictRange>(__m256 value, const float * params, size_t offset) - { - return _mm256_min_ps(_mm256_max_ps(_mm256_set1_ps(params[0]), value), _mm256_set1_ps(params[1])); - } - - template<> SIMD_INLINE __m256 Activate<::SimdConvolutionActivationPrelu>(__m256 value, const float * params, size_t offset) - { - return _mm256_add_ps(_mm256_max_ps(_mm256_setzero_ps(), value), _mm256_mul_ps(_mm256_loadu_ps(params + offset), _mm256_min_ps(_mm256_setzero_ps(), value))); - } - - template<> SIMD_INLINE __m256 Activate<::SimdConvolutionActivationHswish>(__m256 value, const float * params, size_t offset) - { - return Avx::SynetHswish32f(value, _mm256_set1_ps(params[0]), _mm256_set1_ps(params[1])); - } - - SIMD_INLINE void KernelHwcDefaultEdge(const float * src, const ConvParam32f & p, size_t kH, size_t kW, const float * weight, __m256 & sum) - { - size_t size = kW * p.srcC, tail = (p.kernelX - kW)*p.srcC*p.dstC, dstC = p.dstC, stride = p.srcW * p.srcC; - for (size_t ky = 0; ky < kH; ++ky) - { - for (size_t i = 0; i < size; ++i, weight += dstC) - sum = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(src[i]), _mm256_loadu_ps(weight)), sum); - weight += tail; - src += stride; - } - } - - template<::SimdConvolutionActivationType type> - SIMD_INLINE void KernelHwcDefaultEdge(const float * src, const ConvParam32f & p, size_t kH, size_t kW, const float * weight, const float * bias, const float * params, float * dst) - { - size_t dstC = p.dstC; - size_t dstCF = AlignLo(dstC, F); - size_t dc = 0; - for (; dc < dstCF; dc += F) - { - __m256 conv = bias ? _mm256_loadu_ps(bias + dc) : _mm256_setzero_ps(); - KernelHwcDefaultEdge(src, p, kH, kW, weight + dc, conv); - _mm256_storeu_ps(dst + dc, Activate(conv, params, dc)); - } - if (dc < dstC) - { - dc = dstC - F; - __m256 conv = bias ? 
_mm256_loadu_ps(bias + dc) : _mm256_setzero_ps(); - KernelHwcDefaultEdge(src, p, kH, kW, weight + dc, conv); - _mm256_storeu_ps(dst + dc, Activate(conv, params, dc)); - } - } - - SIMD_INLINE void KernelHwcDefaultBody2x2(const float * src, const ConvParam32f & p, const float * weight, __m256 sums[2][2]) - { - size_t size = p.kernelX * p.srcC, dstC = p.dstC, stride = p.srcW * p.srcC, step = p.srcC * p.strideX; - const float * src0 = src + 0 * step; - const float * src1 = src + 1 * step; - __m256 w0, w1, s0; - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t offset = ky * stride; - for (size_t end = offset + size; offset < end; ++offset) - { - w0 = _mm256_loadu_ps(weight + 0 * F); - w1 = _mm256_loadu_ps(weight + 1 * F); - s0 = _mm256_set1_ps(src0[offset]); - sums[0][0] = _mm256_add_ps(_mm256_mul_ps(s0, w0), sums[0][0]); - sums[0][1] = _mm256_add_ps(_mm256_mul_ps(s0, w1), sums[0][1]); - s0 = _mm256_set1_ps(src1[offset]); - sums[1][0] = _mm256_add_ps(_mm256_mul_ps(s0, w0), sums[1][0]); - sums[1][1] = _mm256_add_ps(_mm256_mul_ps(s0, w1), sums[1][1]); - weight += dstC; - } - } - } - - SIMD_INLINE void KernelHwcDefaultBody2x1(const float * src, const ConvParam32f & p, const float * weight, __m256 sums[2][1]) - { - size_t size = p.kernelX * p.srcC, dstC = p.dstC, stride = p.srcW * p.srcC, step = p.srcC * p.strideX; - const float * src0 = src + 0 * step; - const float * src1 = src + 1 * step; - __m256 w0, s0; - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t offset = ky * stride; - for (size_t end = offset + size; offset < end; ++offset) - { - w0 = _mm256_loadu_ps(weight + 0 * F); - s0 = _mm256_set1_ps(src0[offset]); - sums[0][0] = _mm256_add_ps(_mm256_mul_ps(s0, w0), sums[0][0]); - s0 = _mm256_set1_ps(src1[offset]); - sums[1][0] = _mm256_add_ps(_mm256_mul_ps(s0, w0), sums[1][0]); - weight += dstC; - } - } - } - - template<::SimdConvolutionActivationType type> - SIMD_INLINE void KernelHwcDefaultBody2(const float * src, const ConvParam32f & p, const float * weight, const float * bias, const float * params, float * dst) - { - size_t dstC = p.dstC; - size_t dstCF1 = AlignLo(dstC, 1 * F); - size_t dstCF2 = AlignLo(dstC, 2 * F); - size_t dc = 0; - for (; dc < dstCF2; dc += 2 * F) - { - __m256 sums[2][2]; - __m256 bias0 = bias ? _mm256_loadu_ps(bias + dc + 0 * F) : _mm256_setzero_ps(); - __m256 bias1 = bias ? _mm256_loadu_ps(bias + dc + 1 * F) : _mm256_setzero_ps(); - sums[0][0] = bias0; - sums[0][1] = bias1; - sums[1][0] = bias0; - sums[1][1] = bias1; - KernelHwcDefaultBody2x2(src, p, weight + dc, sums); - _mm256_storeu_ps(dst + dc + 0 * dstC + 0 * F, Activate(sums[0][0], params, dc + 0 * F)); - _mm256_storeu_ps(dst + dc + 0 * dstC + 1 * F, Activate(sums[0][1], params, dc + 1 * F)); - _mm256_storeu_ps(dst + dc + 1 * dstC + 0 * F, Activate(sums[1][0], params, dc + 0 * F)); - _mm256_storeu_ps(dst + dc + 1 * dstC + 1 * F, Activate(sums[1][1], params, dc + 1 * F)); - } - for (; dc < dstCF1; dc += 1 * F) - { - __m256 sums[2][1]; - __m256 bias0 = bias ? _mm256_loadu_ps(bias + dc) : _mm256_setzero_ps(); - sums[0][0] = bias0; - sums[1][0] = bias0; - KernelHwcDefaultBody2x1(src, p, weight + dc, sums); - _mm256_storeu_ps(dst + dc + 0 * dstC, Activate(sums[0][0], params, dc)); - _mm256_storeu_ps(dst + dc + 1 * dstC, Activate(sums[1][0], params, dc)); - } - if (dc < dstC) - { - dc = dstC - F; - __m256 sums[2][1]; - __m256 bias0 = bias ? 
_mm256_loadu_ps(bias + dc) : _mm256_setzero_ps(); - sums[0][0] = bias0; - sums[1][0] = bias0; - KernelHwcDefaultBody2x1(src, p, weight + dc, sums); - _mm256_storeu_ps(dst + dc + 0 * dstC, Activate(sums[0][0], params, dc)); - _mm256_storeu_ps(dst + dc + 1 * dstC, Activate(sums[1][0], params, dc)); - } - } - - SIMD_INLINE void KernelHwcDefaultBody6x2(const float * src, const ConvParam32f & p, const float * weight, __m256 sums[6][2]) - { - size_t size = p.kernelX * p.srcC, dstC = p.dstC, stride = p.srcW * p.srcC, step = p.srcC * p.strideX; - const float * src0 = src + 0 * step; - const float * src1 = src + 1 * step; - const float * src2 = src + 2 * step; - const float * src3 = src + 3 * step; - const float * src4 = src + 4 * step; - const float * src5 = src + 5 * step; - __m256 w0, w1, s0; - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t offset = ky * stride; - for (size_t end = offset + size; offset < end; ++offset) - { - w0 = _mm256_loadu_ps(weight + 0 * F); - w1 = _mm256_loadu_ps(weight + 1 * F); - s0 = _mm256_set1_ps(src0[offset]); - sums[0][0] = _mm256_add_ps(_mm256_mul_ps(s0, w0), sums[0][0]); - sums[0][1] = _mm256_add_ps(_mm256_mul_ps(s0, w1), sums[0][1]); - s0 = _mm256_set1_ps(src1[offset]); - sums[1][0] = _mm256_add_ps(_mm256_mul_ps(s0, w0), sums[1][0]); - sums[1][1] = _mm256_add_ps(_mm256_mul_ps(s0, w1), sums[1][1]); - s0 = _mm256_set1_ps(src2[offset]); - sums[2][0] = _mm256_add_ps(_mm256_mul_ps(s0, w0), sums[2][0]); - sums[2][1] = _mm256_add_ps(_mm256_mul_ps(s0, w1), sums[2][1]); - s0 = _mm256_set1_ps(src3[offset]); - sums[3][0] = _mm256_add_ps(_mm256_mul_ps(s0, w0), sums[3][0]); - sums[3][1] = _mm256_add_ps(_mm256_mul_ps(s0, w1), sums[3][1]); - s0 = _mm256_set1_ps(src4[offset]); - sums[4][0] = _mm256_add_ps(_mm256_mul_ps(s0, w0), sums[4][0]); - sums[4][1] = _mm256_add_ps(_mm256_mul_ps(s0, w1), sums[4][1]); - s0 = _mm256_set1_ps(src5[offset]); - sums[5][0] = _mm256_add_ps(_mm256_mul_ps(s0, w0), sums[5][0]); - sums[5][1] = _mm256_add_ps(_mm256_mul_ps(s0, w1), sums[5][1]); - weight += dstC; - } - } - } - - SIMD_INLINE void KernelHwcDefaultBody6x1(const float * src, const ConvParam32f & p, const float * weight, __m256 sums[6][1]) - { - size_t size = p.kernelX * p.srcC, dstC = p.dstC, stride = p.srcW * p.srcC, step = p.srcC * p.strideX; - const float * src0 = src + 0 * step; - const float * src1 = src + 1 * step; - const float * src2 = src + 2 * step; - const float * src3 = src + 3 * step; - const float * src4 = src + 4 * step; - const float * src5 = src + 5 * step; - __m256 w0, s0; - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t offset = ky * stride; - for (size_t end = offset + size; offset < end; ++offset) - { - w0 = _mm256_loadu_ps(weight + 0 * F); - s0 = _mm256_set1_ps(src0[offset]); - sums[0][0] = _mm256_add_ps(_mm256_mul_ps(s0, w0), sums[0][0]); - s0 = _mm256_set1_ps(src1[offset]); - sums[1][0] = _mm256_add_ps(_mm256_mul_ps(s0, w0), sums[1][0]); - s0 = _mm256_set1_ps(src2[offset]); - sums[2][0] = _mm256_add_ps(_mm256_mul_ps(s0, w0), sums[2][0]); - s0 = _mm256_set1_ps(src3[offset]); - sums[3][0] = _mm256_add_ps(_mm256_mul_ps(s0, w0), sums[3][0]); - s0 = _mm256_set1_ps(src4[offset]); - sums[4][0] = _mm256_add_ps(_mm256_mul_ps(s0, w0), sums[4][0]); - s0 = _mm256_set1_ps(src5[offset]); - sums[5][0] = _mm256_add_ps(_mm256_mul_ps(s0, w0), sums[5][0]); - weight += dstC; - } - } - } - - template<::SimdConvolutionActivationType type> - SIMD_INLINE void KernelHwcDefaultBody6(const float * src, const ConvParam32f & p, const float * weight, const float * bias, const float * 
params, float * dst) - { - size_t dstC = p.dstC; - size_t dstCF1 = AlignLo(dstC, 1 * F); - size_t dstCF2 = AlignLo(dstC, 2 * F); - size_t dc = 0; - for (; dc < dstCF2; dc += 2 * F) - { - __m256 sums[6][2]; - __m256 bias0 = bias ? _mm256_loadu_ps(bias + dc + 0 * F) : _mm256_setzero_ps(); - __m256 bias1 = bias ? _mm256_loadu_ps(bias + dc + 1 * F) : _mm256_setzero_ps(); - sums[0][0] = bias0; - sums[0][1] = bias1; - sums[1][0] = bias0; - sums[1][1] = bias1; - sums[2][0] = bias0; - sums[2][1] = bias1; - sums[3][0] = bias0; - sums[3][1] = bias1; - sums[4][0] = bias0; - sums[4][1] = bias1; - sums[5][0] = bias0; - sums[5][1] = bias1; - KernelHwcDefaultBody6x2(src, p, weight + dc, sums); - _mm256_storeu_ps(dst + dc + 0 * dstC + 0 * F, Activate(sums[0][0], params, dc + 0 * F)); - _mm256_storeu_ps(dst + dc + 0 * dstC + 1 * F, Activate(sums[0][1], params, dc + 1 * F)); - _mm256_storeu_ps(dst + dc + 1 * dstC + 0 * F, Activate(sums[1][0], params, dc + 0 * F)); - _mm256_storeu_ps(dst + dc + 1 * dstC + 1 * F, Activate(sums[1][1], params, dc + 1 * F)); - _mm256_storeu_ps(dst + dc + 2 * dstC + 0 * F, Activate(sums[2][0], params, dc + 0 * F)); - _mm256_storeu_ps(dst + dc + 2 * dstC + 1 * F, Activate(sums[2][1], params, dc + 1 * F)); - _mm256_storeu_ps(dst + dc + 3 * dstC + 0 * F, Activate(sums[3][0], params, dc + 0 * F)); - _mm256_storeu_ps(dst + dc + 3 * dstC + 1 * F, Activate(sums[3][1], params, dc + 1 * F)); - _mm256_storeu_ps(dst + dc + 4 * dstC + 0 * F, Activate(sums[4][0], params, dc + 0 * F)); - _mm256_storeu_ps(dst + dc + 4 * dstC + 1 * F, Activate(sums[4][1], params, dc + 1 * F)); - _mm256_storeu_ps(dst + dc + 5 * dstC + 0 * F, Activate(sums[5][0], params, dc + 0 * F)); - _mm256_storeu_ps(dst + dc + 5 * dstC + 1 * F, Activate(sums[5][1], params, dc + 1 * F)); - } - for (; dc < dstCF1; dc += 1 * F) - { - __m256 sums[6][1]; - __m256 bias0 = bias ? _mm256_loadu_ps(bias + dc) : _mm256_setzero_ps(); - sums[0][0] = bias0; - sums[1][0] = bias0; - sums[2][0] = bias0; - sums[3][0] = bias0; - sums[4][0] = bias0; - sums[5][0] = bias0; - KernelHwcDefaultBody6x1(src, p, weight + dc, sums); - _mm256_storeu_ps(dst + dc + 0 * dstC, Activate(sums[0][0], params, dc)); - _mm256_storeu_ps(dst + dc + 1 * dstC, Activate(sums[1][0], params, dc)); - _mm256_storeu_ps(dst + dc + 2 * dstC, Activate(sums[2][0], params, dc)); - _mm256_storeu_ps(dst + dc + 3 * dstC, Activate(sums[3][0], params, dc)); - _mm256_storeu_ps(dst + dc + 4 * dstC, Activate(sums[4][0], params, dc)); - _mm256_storeu_ps(dst + dc + 5 * dstC, Activate(sums[5][0], params, dc)); - } - if (dc < dstC) - { - dc = dstC - F; - __m256 sums[6][1]; - __m256 bias0 = bias ? 
_mm256_loadu_ps(bias + dc) : _mm256_setzero_ps(); - sums[0][0] = bias0; - sums[1][0] = bias0; - sums[2][0] = bias0; - sums[3][0] = bias0; - sums[4][0] = bias0; - sums[5][0] = bias0; - KernelHwcDefaultBody6x1(src, p, weight + dc, sums); - _mm256_storeu_ps(dst + dc + 0 * dstC, Activate(sums[0][0], params, dc)); - _mm256_storeu_ps(dst + dc + 1 * dstC, Activate(sums[1][0], params, dc)); - _mm256_storeu_ps(dst + dc + 2 * dstC, Activate(sums[2][0], params, dc)); - _mm256_storeu_ps(dst + dc + 3 * dstC, Activate(sums[3][0], params, dc)); - _mm256_storeu_ps(dst + dc + 4 * dstC, Activate(sums[4][0], params, dc)); - _mm256_storeu_ps(dst + dc + 5 * dstC, Activate(sums[5][0], params, dc)); - } - } - - SIMD_INLINE void KernelHwcDefaultBody4x3(const float * src, const ConvParam32f & p, const float * weight, __m256 sums[4][3]) - { - size_t size = p.kernelX * p.srcC, dstC = p.dstC, stride = p.srcW * p.srcC, step = p.srcC * p.strideX; - const float * src0 = src + 0 * step; - const float * src1 = src + 1 * step; - const float * src2 = src + 2 * step; - const float * src3 = src + 3 * step; - __m256 w0, w1, w2, s0; - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t offset = ky * stride; - for (size_t end = offset + size; offset < end; ++offset) - { - w0 = _mm256_loadu_ps(weight + 0 * F); - w1 = _mm256_loadu_ps(weight + 1 * F); - w2 = _mm256_loadu_ps(weight + 2 * F); - s0 = _mm256_set1_ps(src0[offset]); - sums[0][0] = _mm256_add_ps(_mm256_mul_ps(s0, w0), sums[0][0]); - sums[0][1] = _mm256_add_ps(_mm256_mul_ps(s0, w1), sums[0][1]); - sums[0][2] = _mm256_add_ps(_mm256_mul_ps(s0, w2), sums[0][2]); - s0 = _mm256_set1_ps(src1[offset]); - sums[1][0] = _mm256_add_ps(_mm256_mul_ps(s0, w0), sums[1][0]); - sums[1][1] = _mm256_add_ps(_mm256_mul_ps(s0, w1), sums[1][1]); - sums[1][2] = _mm256_add_ps(_mm256_mul_ps(s0, w2), sums[1][2]); - s0 = _mm256_set1_ps(src2[offset]); - sums[2][0] = _mm256_add_ps(_mm256_mul_ps(s0, w0), sums[2][0]); - sums[2][1] = _mm256_add_ps(_mm256_mul_ps(s0, w1), sums[2][1]); - sums[2][2] = _mm256_add_ps(_mm256_mul_ps(s0, w2), sums[2][2]); - s0 = _mm256_set1_ps(src3[offset]); - sums[3][0] = _mm256_add_ps(_mm256_mul_ps(s0, w0), sums[3][0]); - sums[3][1] = _mm256_add_ps(_mm256_mul_ps(s0, w1), sums[3][1]); - sums[3][2] = _mm256_add_ps(_mm256_mul_ps(s0, w2), sums[3][2]); - weight += dstC; - } - } - } - - SIMD_INLINE void KernelHwcDefaultBody4x1(const float * src, const ConvParam32f & p, const float * weight, __m256 sums[4][1]) - { - size_t size = p.kernelX * p.srcC, dstC = p.dstC, stride = p.srcW * p.srcC, step = p.srcC * p.strideX; - const float * src0 = src + 0 * step; - const float * src1 = src + 1 * step; - const float * src2 = src + 2 * step; - const float * src3 = src + 3 * step; - __m256 w0, s0; - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t offset = ky * stride; - for (size_t end = offset + size; offset < end; ++offset) - { - w0 = _mm256_loadu_ps(weight + 0 * F); - s0 = _mm256_set1_ps(src0[offset]); - sums[0][0] = _mm256_add_ps(_mm256_mul_ps(s0, w0), sums[0][0]); - s0 = _mm256_set1_ps(src1[offset]); - sums[1][0] = _mm256_add_ps(_mm256_mul_ps(s0, w0), sums[1][0]); - s0 = _mm256_set1_ps(src2[offset]); - sums[2][0] = _mm256_add_ps(_mm256_mul_ps(s0, w0), sums[2][0]); - s0 = _mm256_set1_ps(src3[offset]); - sums[3][0] = _mm256_add_ps(_mm256_mul_ps(s0, w0), sums[3][0]); - weight += dstC; - } - } - } - - template<::SimdConvolutionActivationType type> - SIMD_INLINE void KernelHwcDefaultBody4(const float * src, const ConvParam32f & p, const float * weight, const float * bias, const float * 
params, float * dst) - { - size_t dstC = p.dstC; - size_t dstCF1 = AlignLo(dstC, 1 * F); - size_t dstCF3 = AlignLoAny(dstC, 3 * F); - size_t dc = 0; - for (; dc < dstCF3; dc += 3 * F) - { - __m256 sums[4][3]; - __m256 bias0 = bias ? _mm256_loadu_ps(bias + dc + 0 * F) : _mm256_setzero_ps(); - __m256 bias1 = bias ? _mm256_loadu_ps(bias + dc + 1 * F) : _mm256_setzero_ps(); - __m256 bias2 = bias ? _mm256_loadu_ps(bias + dc + 2 * F) : _mm256_setzero_ps(); - sums[0][0] = bias0; - sums[0][1] = bias1; - sums[0][2] = bias2; - sums[1][0] = bias0; - sums[1][1] = bias1; - sums[1][2] = bias2; - sums[2][0] = bias0; - sums[2][1] = bias1; - sums[2][2] = bias2; - sums[3][0] = bias0; - sums[3][1] = bias1; - sums[3][2] = bias2; - KernelHwcDefaultBody4x3(src, p, weight + dc, sums); - _mm256_storeu_ps(dst + dc + 0 * dstC + 0 * F, Activate(sums[0][0], params, dc + 0 * F)); - _mm256_storeu_ps(dst + dc + 0 * dstC + 1 * F, Activate(sums[0][1], params, dc + 1 * F)); - _mm256_storeu_ps(dst + dc + 0 * dstC + 2 * F, Activate(sums[0][2], params, dc + 2 * F)); - _mm256_storeu_ps(dst + dc + 1 * dstC + 0 * F, Activate(sums[1][0], params, dc + 0 * F)); - _mm256_storeu_ps(dst + dc + 1 * dstC + 1 * F, Activate(sums[1][1], params, dc + 1 * F)); - _mm256_storeu_ps(dst + dc + 1 * dstC + 2 * F, Activate(sums[1][2], params, dc + 2 * F)); - _mm256_storeu_ps(dst + dc + 2 * dstC + 0 * F, Activate(sums[2][0], params, dc + 0 * F)); - _mm256_storeu_ps(dst + dc + 2 * dstC + 1 * F, Activate(sums[2][1], params, dc + 1 * F)); - _mm256_storeu_ps(dst + dc + 2 * dstC + 2 * F, Activate(sums[2][2], params, dc + 2 * F)); - _mm256_storeu_ps(dst + dc + 3 * dstC + 0 * F, Activate(sums[3][0], params, dc + 0 * F)); - _mm256_storeu_ps(dst + dc + 3 * dstC + 1 * F, Activate(sums[3][1], params, dc + 1 * F)); - _mm256_storeu_ps(dst + dc + 3 * dstC + 2 * F, Activate(sums[3][2], params, dc + 2 * F)); - } - for (; dc < dstCF1; dc += 1 * F) - { - __m256 sums[4][1]; - __m256 bias0 = bias ? _mm256_loadu_ps(bias + dc) : _mm256_setzero_ps(); - sums[0][0] = bias0; - sums[1][0] = bias0; - sums[2][0] = bias0; - sums[3][0] = bias0; - KernelHwcDefaultBody4x1(src, p, weight + dc, sums); - _mm256_storeu_ps(dst + dc + 0 * dstC, Activate(sums[0][0], params, dc)); - _mm256_storeu_ps(dst + dc + 1 * dstC, Activate(sums[1][0], params, dc)); - _mm256_storeu_ps(dst + dc + 2 * dstC, Activate(sums[2][0], params, dc)); - _mm256_storeu_ps(dst + dc + 3 * dstC, Activate(sums[3][0], params, dc)); - } - if (dc < dstC) - { - dc = dstC - F; - __m256 sums[4][1]; - __m256 bias0 = bias ? 
_mm256_loadu_ps(bias + dc) : _mm256_setzero_ps(); - sums[0][0] = bias0; - sums[1][0] = bias0; - sums[2][0] = bias0; - sums[3][0] = bias0; - KernelHwcDefaultBody4x1(src, p, weight + dc, sums); - _mm256_storeu_ps(dst + dc + 0 * dstC, Activate<type>(sums[0][0], params, dc)); - _mm256_storeu_ps(dst + dc + 1 * dstC, Activate<type>(sums[1][0], params, dc)); - _mm256_storeu_ps(dst + dc + 2 * dstC, Activate<type>(sums[2][0], params, dc)); - _mm256_storeu_ps(dst + dc + 3 * dstC, Activate<type>(sums[3][0], params, dc)); - } - } - - template<::SimdConvolutionActivationType type> void ConvolutionDirectNhwcConvolutionBiasActivationDefault(const float * src, const ConvParam32f & p, const float * weight, const float * bias, const float * params, float * dst) - { - size_t noseH = p.padY, noseW = p.padX; - size_t bodyH = p.srcH - p.kernelY + 1 + noseH, bodyW = p.srcW - p.kernelX + 1 + noseW; - size_t tailH = bodyH + p.padH, tailW = bodyW + p.padW; - size_t bodyW2 = AlignLoAny(bodyW - noseW, 2 * p.strideX) + noseW; - size_t bodyW4 = AlignLoAny(bodyW - noseW, 4 * p.strideX) + noseW; - size_t bodyW6 = AlignLoAny(bodyW - noseW, 6 * p.strideX) + noseW; - size_t wS = p.srcC*p.dstC; - size_t kY = p.kernelY - noseH, kX = p.kernelX - noseW, kH = bodyH + p.kernelY - 1, kW = bodyW + p.kernelX - 1; - size_t sy = 0; - for (; sy < noseH; sy += p.strideY) - { - size_t sx = 0; - const float * w = weight + (noseH - sy) * p.kernelY * wS; - for (; sx < noseW; sx += p.strideX, dst += p.dstC) - KernelHwcDefaultEdge<type>(src, p, kY + sy, kX + sx, w + (noseW - sx)*wS, bias, params, dst); - for (; sx < bodyW; sx += p.strideX, dst += p.dstC) - KernelHwcDefaultEdge<type>(src + (sx - noseW) * p.srcC, p, kY + sy, p.kernelX, w, bias, params, dst); - for (; sx < tailW; sx += p.strideX, dst += p.dstC) - KernelHwcDefaultEdge<type>(src + (sx - noseW) * p.srcC, p, kY + sy, kW - sx, w, bias, params, dst); - } - src += (sy - noseH)*p.srcW*p.srcC; - for (; sy < bodyH; sy += p.strideY) - { - size_t sx = 0; - for (; sx < noseW; sx += p.strideX, dst += p.dstC) - KernelHwcDefaultEdge<type>(src, p, p.kernelY, kX + sx, weight + (noseW - sx)*wS, bias, params, dst); - if (p.dstC == 24) - { - for (; sx < bodyW4; sx += 4 * p.strideX, dst += 4 * p.dstC) - KernelHwcDefaultBody4<type>(src + (sx - noseW) * p.srcC, p, weight, bias, params, dst); - } - else - { - for (; sx < bodyW6; sx += 6 * p.strideX, dst += 6 * p.dstC) - KernelHwcDefaultBody6<type>(src + (sx - noseW) * p.srcC, p, weight, bias, params, dst); - } - for (; sx < bodyW2; sx += 2 * p.strideX, dst += 2 * p.dstC) - KernelHwcDefaultBody2<type>(src + (sx - noseW) * p.srcC, p, weight, bias, params, dst); - for (; sx < bodyW; sx += p.strideX, dst += p.dstC) - KernelHwcDefaultEdge<type>(src + (sx - noseW) * p.srcC, p, p.kernelY, p.kernelX, weight, bias, params, dst); - for (; sx < tailW; sx += p.strideX, dst += p.dstC) - KernelHwcDefaultEdge<type>(src + (sx - noseW) * p.srcC, p, p.kernelY, kW - sx, weight, bias, params, dst); - src += p.strideY*p.srcW*p.srcC; - } - for (; sy < tailH; sy += p.strideY) - { - size_t sx = 0; - for (; sx < noseW; sx += p.strideX, dst += p.dstC) - KernelHwcDefaultEdge<type>(src, p, kH - sy, kX + sx, weight + (noseW - sx)*wS, bias, params, dst); - for (; sx < bodyW; sx += p.strideX, dst += p.dstC) - KernelHwcDefaultEdge<type>(src + (sx - noseW) * p.srcC, p, kH - sy, p.kernelX, weight, bias, params, dst); - for (; sx < tailW; sx += p.strideX, dst += p.dstC) - KernelHwcDefaultEdge<type>(src + (sx - noseW) * p.srcC, p, kH - sy, kW - sx, weight, bias, params, dst); - src += p.strideY*p.srcW*p.srcC; - } - } - - template<::SimdConvolutionActivationType type> void 
ConvolutionDirectNhwcConvolutionBiasActivationDepthwise(const float * src, const ConvParam32f & p, const float * weight, const float * bias, const float * params, float * dst) - { - size_t size = p.group; - size_t sizeF = AlignLo(size, F); - size_t size2F = AlignLo(size, 2 * F); - size_t size4F = AlignLo(size, 4 * F); - size_t size8F = AlignLo(size, 8 * F); - for (size_t dy = 0; dy < p.dstH; ++dy) - { - for (size_t dx = 0; dx < p.dstW; ++dx) - { - size_t i = 0; - for (; i < size8F; i += 8 * F) - { - __m256 sums[8]; - if (bias) - { - sums[0] = _mm256_loadu_ps(bias + i + 0 * F); - sums[1] = _mm256_loadu_ps(bias + i + 1 * F); - sums[2] = _mm256_loadu_ps(bias + i + 2 * F); - sums[3] = _mm256_loadu_ps(bias + i + 3 * F); - sums[4] = _mm256_loadu_ps(bias + i + 4 * F); - sums[5] = _mm256_loadu_ps(bias + i + 5 * F); - sums[6] = _mm256_loadu_ps(bias + i + 6 * F); - sums[7] = _mm256_loadu_ps(bias + i + 7 * F); - } - else - { - sums[0] = _mm256_setzero_ps(); - sums[1] = _mm256_setzero_ps(); - sums[2] = _mm256_setzero_ps(); - sums[3] = _mm256_setzero_ps(); - sums[4] = _mm256_setzero_ps(); - sums[5] = _mm256_setzero_ps(); - sums[6] = _mm256_setzero_ps(); - sums[7] = _mm256_setzero_ps(); - } - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t sy = dy * p.strideY + ky * p.dilationY - p.padY; - if (sy < p.srcH) - { - for (size_t kx = 0; kx < p.kernelX; ++kx) - { - size_t sx = dx * p.strideX + kx * p.dilationX - p.padX; - if (sx < p.srcW) - { - const float * pw = weight + (ky*p.kernelX + kx)*size + i; - const float * ps = src + (sy*p.srcW + sx)*size + i; - sums[0] = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps + 0 * F), _mm256_loadu_ps(pw + 0 * F)), sums[0]); - sums[1] = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps + 1 * F), _mm256_loadu_ps(pw + 1 * F)), sums[1]); - sums[2] = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps + 2 * F), _mm256_loadu_ps(pw + 2 * F)), sums[2]); - sums[3] = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps + 3 * F), _mm256_loadu_ps(pw + 3 * F)), sums[3]); - sums[4] = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps + 4 * F), _mm256_loadu_ps(pw + 4 * F)), sums[4]); - sums[5] = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps + 5 * F), _mm256_loadu_ps(pw + 5 * F)), sums[5]); - sums[6] = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps + 6 * F), _mm256_loadu_ps(pw + 6 * F)), sums[6]); - sums[7] = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps + 7 * F), _mm256_loadu_ps(pw + 7 * F)), sums[7]); - } - } - } - } - _mm256_storeu_ps(dst + i + 0 * F, Activate(sums[0], params, i + 0 * F)); - _mm256_storeu_ps(dst + i + 1 * F, Activate(sums[1], params, i + 1 * F)); - _mm256_storeu_ps(dst + i + 2 * F, Activate(sums[2], params, i + 2 * F)); - _mm256_storeu_ps(dst + i + 3 * F, Activate(sums[3], params, i + 3 * F)); - _mm256_storeu_ps(dst + i + 4 * F, Activate(sums[4], params, i + 4 * F)); - _mm256_storeu_ps(dst + i + 5 * F, Activate(sums[5], params, i + 5 * F)); - _mm256_storeu_ps(dst + i + 6 * F, Activate(sums[6], params, i + 6 * F)); - _mm256_storeu_ps(dst + i + 7 * F, Activate(sums[7], params, i + 7 * F)); - } - for (; i < size4F; i += 4 * F) - { - __m256 sums[4]; - if (bias) - { - sums[0] = _mm256_loadu_ps(bias + i + 0 * F); - sums[1] = _mm256_loadu_ps(bias + i + 1 * F); - sums[2] = _mm256_loadu_ps(bias + i + 2 * F); - sums[3] = _mm256_loadu_ps(bias + i + 3 * F); - } - else - { - sums[0] = _mm256_setzero_ps(); - sums[1] = _mm256_setzero_ps(); - sums[2] = _mm256_setzero_ps(); - sums[3] = _mm256_setzero_ps(); - } - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t sy = dy * p.strideY + ky * 
p.dilationY - p.padY; - if (sy < p.srcH) - { - for (size_t kx = 0; kx < p.kernelX; ++kx) - { - size_t sx = dx * p.strideX + kx * p.dilationX - p.padX; - if (sx < p.srcW) - { - const float * pw = weight + (ky*p.kernelX + kx)*size + i; - const float * ps = src + (sy*p.srcW + sx)*size + i; - sums[0] = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps + 0 * F), _mm256_loadu_ps(pw + 0 * F)), sums[0]); - sums[1] = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps + 1 * F), _mm256_loadu_ps(pw + 1 * F)), sums[1]); - sums[2] = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps + 2 * F), _mm256_loadu_ps(pw + 2 * F)), sums[2]); - sums[3] = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps + 3 * F), _mm256_loadu_ps(pw + 3 * F)), sums[3]); - } - } - } - } - _mm256_storeu_ps(dst + i + 0 * F, Activate(sums[0], params, i + 0 * F)); - _mm256_storeu_ps(dst + i + 1 * F, Activate(sums[1], params, i + 1 * F)); - _mm256_storeu_ps(dst + i + 2 * F, Activate(sums[2], params, i + 2 * F)); - _mm256_storeu_ps(dst + i + 3 * F, Activate(sums[3], params, i + 3 * F)); - } - for (; i < size2F; i += 2 * F) - { - __m256 sums[2]; - if (bias) - { - sums[0] = _mm256_loadu_ps(bias + i + 0 * F); - sums[1] = _mm256_loadu_ps(bias + i + 1 * F); - } - else - { - sums[0] = _mm256_setzero_ps(); - sums[1] = _mm256_setzero_ps(); - } - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t sy = dy * p.strideY + ky * p.dilationY - p.padY; - if (sy < p.srcH) - { - for (size_t kx = 0; kx < p.kernelX; ++kx) - { - size_t sx = dx * p.strideX + kx * p.dilationX - p.padX; - if (sx < p.srcW) - { - const float * pw = weight + (ky*p.kernelX + kx)*size + i; - const float * ps = src + (sy*p.srcW + sx)*size + i; - sums[0] = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps + 0 * F), _mm256_loadu_ps(pw + 0 * F)), sums[0]); - sums[1] = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps + 1 * F), _mm256_loadu_ps(pw + 1 * F)), sums[1]); - } - } - } - } - _mm256_storeu_ps(dst + i + 0 * F, Activate(sums[0], params, i + 0 * F)); - _mm256_storeu_ps(dst + i + 1 * F, Activate(sums[1], params, i + 1 * F)); - } - for (; i < size; i += F) - { - size_t ci = i >= sizeF ? size - F : i; - __m256 sum = bias ? _mm256_loadu_ps(bias + ci) : _mm256_setzero_ps(); - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t sy = dy * p.strideY + ky * p.dilationY - p.padY; - if (sy < p.srcH) - { - for (size_t kx = 0; kx < p.kernelX; ++kx) - { - size_t sx = dx * p.strideX + kx * p.dilationX - p.padX; - if (sx < p.srcW) - { - const float * pw = weight + (ky*p.kernelX + kx)*size + ci; - const float * ps = src + (sy*p.srcW + sx)*size + ci; - sum = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps), _mm256_loadu_ps(pw)), sum); - } - } - } - } - _mm256_storeu_ps(dst + ci, Activate(sum, params, ci)); - } - dst += p.dstC; - } - } - } - - template<::SimdConvolutionActivationType type> - SIMD_INLINE void ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge(const float * src, const ConvParam32f & p, size_t dy, size_t dx, const float * weight, const float * bias, const float * params, float * dst) - { - size_t srcC = p.srcC; - size_t srcCF = AlignLo(srcC, F); - size_t c = 0; - for (; c < srcCF; c += F) - { - __m256 sum = bias ? 
_mm256_loadu_ps(bias + c) : _mm256_setzero_ps(); - for (size_t ky = 0; ky < 3; ++ky) - { - size_t sy = dy * p.strideY + ky - p.padY; - if (sy < p.srcH) - { - for (size_t kx = 0; kx < 3; ++kx) - { - size_t sx = dx * p.strideX + kx - p.padX; - if (sx < p.srcW) - { - const float * pw = weight + (ky * 3 + kx) * srcC; - const float * ps = src + (sy*p.srcW + sx) * srcC; - sum = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps), _mm256_loadu_ps(pw)), sum); - } - } - } - } - _mm256_storeu_ps(dst + c, Activate(sum, params, c)); - src += F; - weight += F; - } - if (c < srcC) - { - c = p.srcC - F; - src -= srcCF - c; - weight -= srcCF - c; - __m256 sum = bias ? _mm256_loadu_ps(bias + c) : _mm256_setzero_ps(); - for (size_t ky = 0; ky < 3; ++ky) - { - size_t sy = dy * p.strideY + ky - p.padY; - if (sy < p.srcH) - { - for (size_t kx = 0; kx < 3; ++kx) - { - size_t sx = dx * p.strideX + kx - p.padX; - if (sx < p.srcW) - { - const float * pw = weight + (ky * 3 + kx) * srcC; - const float * ps = src + (sy*p.srcW + sx) * srcC; - sum = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps), _mm256_loadu_ps(pw)), sum); - } - } - } - } - _mm256_storeu_ps(dst + c, Activate(sum, params, c)); - } - } - - template<::SimdConvolutionActivationType type> - SIMD_INLINE void ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main1(const float * src, size_t srcS, size_t srcC, const float * weight, const float * bias, const float * params, float * dst) - { - size_t srcCF = AlignLo(srcC, F); - size_t c = 0; - for (; c < srcCF; c += F) - { - __m256 sum = bias ? _mm256_loadu_ps(bias + c) : _mm256_setzero_ps(); - for (size_t ky = 0; ky < 3; ++ky) - { - const float * ps = src + ky * srcS; - const float * pw = weight + ky * 3 * srcC; - sum = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps + 0 * srcC), _mm256_loadu_ps(pw + 0 * srcC)), sum); - sum = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps + 1 * srcC), _mm256_loadu_ps(pw + 1 * srcC)), sum); - sum = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps + 2 * srcC), _mm256_loadu_ps(pw + 2 * srcC)), sum); - } - _mm256_storeu_ps(dst + c, Activate(sum, params, c)); - src += F; - weight += F; - } - if (c < srcC) - { - c = srcC - F; - src -= srcCF - c; - weight -= srcCF - c; - __m256 sum = bias ? _mm256_loadu_ps(bias + c) : _mm256_setzero_ps(); - for (size_t ky = 0; ky < 3; ++ky) - { - const float * ps = src + ky * srcS; - const float * pw = weight + ky * 3 * srcC; - sum = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps + 0 * srcC), _mm256_loadu_ps(pw + 0 * srcC)), sum); - sum = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps + 1 * srcC), _mm256_loadu_ps(pw + 1 * srcC)), sum); - sum = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps + 2 * srcC), _mm256_loadu_ps(pw + 2 * srcC)), sum); - } - _mm256_storeu_ps(dst + c, Activate(sum, params, c)); - } - } - - template<::SimdConvolutionActivationType type> - SIMD_INLINE void ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main2(const float * src, size_t srcS, size_t srcX, size_t srcC, const float * weight, const float * bias, const float * params, float * dst) - { - size_t srcCF = AlignLo(srcC, F); - size_t c = 0; - __m256 sum0, sum1, w0; - for (; c < srcCF; c += F) - { - sum0 = bias ? 
_mm256_loadu_ps(bias + c) : _mm256_setzero_ps(); - sum1 = sum0; - const float * pw = weight + c; - for (size_t ky = 0; ky < 3; ++ky) - { - const float * ps0 = src + ky * srcS; - const float * ps1 = ps0 + srcX; - w0 = _mm256_loadu_ps(pw); - sum0 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps0 + 0 * srcC), w0), sum0); - sum1 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps1 + 0 * srcC), w0), sum1); - pw += srcC; - w0 = _mm256_loadu_ps(pw); - sum0 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps0 + 1 * srcC), w0), sum0); - sum1 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps1 + 1 * srcC), w0), sum1); - pw += srcC; - w0 = _mm256_loadu_ps(pw); - sum0 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps0 + 2 * srcC), w0), sum0); - sum1 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps1 + 2 * srcC), w0), sum1); - pw += srcC; - } - _mm256_storeu_ps(dst + c, Activate(sum0, params, c)); - _mm256_storeu_ps(dst + c + srcC, Activate(sum1, params, c)); - src += F; - } - if (c < srcC) - { - c = srcC - F; - src -= srcCF - c; - sum0 = bias ? _mm256_loadu_ps(bias + c) : _mm256_setzero_ps(); - sum1 = sum0; - const float * pw = weight + c; - for (size_t ky = 0; ky < 3; ++ky) - { - const float * ps0 = src + ky * srcS; - const float * ps1 = ps0 + srcX; - w0 = _mm256_loadu_ps(pw); - sum0 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps0 + 0 * srcC), w0), sum0); - sum1 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps1 + 0 * srcC), w0), sum1); - pw += srcC; - w0 = _mm256_loadu_ps(pw); - sum0 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps0 + 1 * srcC), w0), sum0); - sum1 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps1 + 1 * srcC), w0), sum1); - pw += srcC; - w0 = _mm256_loadu_ps(pw); - sum0 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps0 + 2 * srcC), w0), sum0); - sum1 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps1 + 2 * srcC), w0), sum1); - pw += srcC; - } - _mm256_storeu_ps(dst + c, Activate(sum0, params, c)); - _mm256_storeu_ps(dst + c + srcC, Activate(sum1, params, c)); - } - } - - template<::SimdConvolutionActivationType type> - SIMD_INLINE void ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4(const float * src, size_t srcS, size_t srcX, size_t srcC, const float * weight, const float * bias, const float * params, float * dst) - { - size_t srcCF = AlignLo(srcC, F); - size_t c = 0; - for (; c < srcCF; c += F) - { - __m256 sum0, sum1, sum2, sum3, w0; - sum0 = bias ? 
_mm256_loadu_ps(bias + c) : _mm256_setzero_ps(); - sum1 = sum0; - sum2 = sum0; - sum3 = sum0; - const float * pw = weight + c; - const float * ps0 = src + 0 * srcX; - const float * ps1 = src + 1 * srcX; - const float * ps2 = src + 2 * srcX; - const float * ps3 = src + 3 * srcX; - for (size_t ky = 0; ky < 3; ++ky) - { - size_t offset = ky * srcS; - w0 = _mm256_loadu_ps(pw); - sum0 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps0 + offset), w0), sum0); - sum1 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps1 + offset), w0), sum1); - sum2 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps2 + offset), w0), sum2); - sum3 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps3 + offset), w0), sum3); - pw += srcC, offset += srcC; - w0 = _mm256_loadu_ps(pw); - sum0 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps0 + offset), w0), sum0); - sum1 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps1 + offset), w0), sum1); - sum2 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps2 + offset), w0), sum2); - sum3 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps3 + offset), w0), sum3); - pw += srcC, offset += srcC; - w0 = _mm256_loadu_ps(pw); - sum0 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps0 + offset), w0), sum0); - sum1 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps1 + offset), w0), sum1); - sum2 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps2 + offset), w0), sum2); - sum3 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps3 + offset), w0), sum3); - pw += srcC, offset += srcC; - } - _mm256_storeu_ps(dst + 0 * srcC, Activate(sum0, params, c)); - _mm256_storeu_ps(dst + 1 * srcC, Activate(sum1, params, c)); - _mm256_storeu_ps(dst + 2 * srcC, Activate(sum2, params, c)); - _mm256_storeu_ps(dst + 3 * srcC, Activate(sum3, params, c)); - src += F; - dst += F; - } - if (c < srcC) - { - c = srcC - F; - src -= srcCF - c; - dst -= srcCF - c; - __m256 sum0, sum1, sum2, sum3, w0; - sum0 = bias ? 
_mm256_loadu_ps(bias + c) : _mm256_setzero_ps(); - sum1 = sum0; - sum2 = sum0; - sum3 = sum0; - const float * pw = weight + c; - const float * ps0 = src + 0 * srcX; - const float * ps1 = src + 1 * srcX; - const float * ps2 = src + 2 * srcX; - const float * ps3 = src + 3 * srcX; - for (size_t ky = 0; ky < 3; ++ky) - { - size_t offset = ky * srcS; - w0 = _mm256_loadu_ps(pw); - sum0 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps0 + offset), w0), sum0); - sum1 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps1 + offset), w0), sum1); - sum2 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps2 + offset), w0), sum2); - sum3 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps3 + offset), w0), sum3); - pw += srcC, offset += srcC; - w0 = _mm256_loadu_ps(pw); - sum0 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps0 + offset), w0), sum0); - sum1 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps1 + offset), w0), sum1); - sum2 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps2 + offset), w0), sum2); - sum3 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps3 + offset), w0), sum3); - pw += srcC, offset += srcC; - w0 = _mm256_loadu_ps(pw); - sum0 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps0 + offset), w0), sum0); - sum1 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps1 + offset), w0), sum1); - sum2 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps2 + offset), w0), sum2); - sum3 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps3 + offset), w0), sum3); - pw += srcC, offset += srcC; - } - _mm256_storeu_ps(dst + 0 * srcC, Activate(sum0, params, c)); - _mm256_storeu_ps(dst + 1 * srcC, Activate(sum1, params, c)); - _mm256_storeu_ps(dst + 2 * srcC, Activate(sum2, params, c)); - _mm256_storeu_ps(dst + 3 * srcC, Activate(sum3, params, c)); - } - } - - template<::SimdConvolutionActivationType type> - SIMD_INLINE void ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge8(const float * src, const ConvParam32f & p, size_t dy, size_t dx, const __m256 * weight, __m256 bias, const float * params, float * dst) - { - __m256 sum = bias; - for (size_t ky = 0; ky < 3; ++ky) - { - size_t sy = dy * p.strideY + ky - p.padY; - if (sy < p.srcH) - { - for (size_t kx = 0; kx < 3; ++kx) - { - size_t sx = dx * p.strideX + kx - p.padX; - if (sx < p.srcW) - { - const float * ps = src + (sy*p.srcW + sx) * F; - sum = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps), weight[ky * 3 + kx]), sum); - } - } - } - } - _mm256_storeu_ps(dst, Activate(sum, params, 0)); - } - - template<::SimdConvolutionActivationType type> - SIMD_INLINE void ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main8x1(const float * src, size_t srcS, const __m256 * weight, __m256 bias, const float * params, float * dst) - { - __m256 sum = bias; - sum = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(src + 0 * F), weight[0]), sum); - sum = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(src + 1 * F), weight[1]), sum); - sum = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(src + 2 * F), weight[2]), sum); - src += srcS; - sum = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(src + 0 * F), weight[3]), sum); - sum = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(src + 1 * F), weight[4]), sum); - sum = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(src + 2 * F), weight[5]), sum); - src += srcS; - sum = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(src + 0 * F), weight[6]), sum); - sum = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(src + 1 * F), weight[7]), sum); - sum = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(src + 2 * F), weight[8]), sum); - _mm256_storeu_ps(dst, Activate(sum, 
params, 0)); - } - - template<::SimdConvolutionActivationType type> - SIMD_INLINE void ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main8x2(const float * src, size_t srcS, const __m256 * weight, __m256 bias, const float * params, float * dst) - { - __m256 sum0 = bias; - __m256 sum1 = bias; - for (size_t ky = 0; ky < 3; ++ky) - { - __m256 s0 = _mm256_loadu_ps(src + 0 * F); - __m256 s1 = _mm256_loadu_ps(src + 1 * F); - __m256 s2 = _mm256_loadu_ps(src + 2 * F); - __m256 s3 = _mm256_loadu_ps(src + 3 * F); - sum0 = _mm256_add_ps(_mm256_mul_ps(s0, weight[0]), sum0); - sum1 = _mm256_add_ps(_mm256_mul_ps(s1, weight[0]), sum1); - sum0 = _mm256_add_ps(_mm256_mul_ps(s1, weight[1]), sum0); - sum1 = _mm256_add_ps(_mm256_mul_ps(s2, weight[1]), sum1); - sum0 = _mm256_add_ps(_mm256_mul_ps(s2, weight[2]), sum0); - sum1 = _mm256_add_ps(_mm256_mul_ps(s3, weight[2]), sum1); - src += srcS; - weight += 3; - } - _mm256_storeu_ps(dst + 0, Activate(sum0, params, 0)); - _mm256_storeu_ps(dst + F, Activate(sum1, params, 0)); - } - - template<::SimdConvolutionActivationType type> void ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3(const float * src, const ConvParam32f & p, const float * weight, const float * bias, const float * params, float * dst) - { - size_t srcS = p.srcC*p.srcW; - size_t srcX = p.srcC*p.strideX; - size_t dstH = p.dstH - p.padH; - size_t dstW = p.dstW - p.padW; - size_t dstW2 = AlignLo(dstW - p.padX, 2) + p.padX; - size_t dstW4 = AlignLo(dstW - p.padX, 4) + p.padX; - if (p.dstC == F && p.strideX == 1) - { - __m256 _weight[9]; - for (size_t i = 0; i < 9; ++i) - _weight[i] = _mm256_loadu_ps(weight + i * F); - __m256 _bias = bias ? _mm256_loadu_ps(bias) : _mm256_setzero_ps(); - size_t dy = 0; - for (; dy < p.padY; ++dy) - for (size_t dx = 0; dx < p.dstW; ++dx) - ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge8(src, p, dy, dx, _weight, _bias, params, dst), dst += F; - for (; dy < dstH; ++dy) - { - size_t dx = 0; - for (; dx < p.padX; ++dx) - ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge8(src, p, dy, dx, _weight, _bias, params, dst), dst += F; - size_t offset = ((dy * p.strideY - p.padY)*p.srcW + dx * p.strideX - p.padX)*p.srcC; - for (; dx < dstW2; dx += 2) - ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main8x2(src + offset, srcS, _weight, _bias, params, dst), offset += 2 * F, dst += 2 * F; - for (; dx < dstW; ++dx) - ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main8x1(src + offset, srcS, _weight, _bias, params, dst), offset += F, dst += F; - for (; dx < p.dstW; ++dx) - ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge8(src, p, dy, dx, _weight, _bias, params, dst), dst += F; - } - for (; dy < p.dstH; ++dy) - for (size_t dx = 0; dx < p.dstW; ++dx) - ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge8(src, p, dy, dx, _weight, _bias, params, dst), dst += F; - } - else - { - size_t dy = 0; - for (; dy < p.padY; ++dy) - for (size_t dx = 0; dx < p.dstW; ++dx) - ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge(src, p, dy, dx, weight, bias, params, dst), dst += p.dstC; - for (; dy < dstH; ++dy) - { - size_t dx = 0; - for (; dx < p.padX; ++dx) - ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge(src, p, dy, dx, weight, bias, params, dst), dst += p.dstC; - size_t offset = ((dy * p.strideY - p.padY)*p.srcW + dx * p.strideX - p.padX)*p.srcC; - for (; dx < dstW4; dx += 4) - ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4(src + offset, srcS, srcX, p.srcC, weight, 
bias, params, dst), dst += 4 * p.dstC, offset += 4 * srcX; - for (; dx < dstW2; dx += 2) - ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main2<type>(src + offset, srcS, srcX, p.srcC, weight, bias, params, dst), dst += 2 * p.dstC, offset += 2 * srcX; - for (; dx < dstW; ++dx) - ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main1<type>(src + offset, srcS, p.srcC, weight, bias, params, dst), dst += p.dstC, offset += srcX; - for (; dx < p.dstW; ++dx) - ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge<type>(src, p, dy, dx, weight, bias, params, dst), dst += p.dstC; - } - for (; dy < p.dstH; ++dy) - for (size_t dx = 0; dx < p.dstW; ++dx) - ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge<type>(src, p, dy, dx, weight, bias, params, dst), dst += p.dstC; - } - } - - template <::SimdConvolutionActivationType type> SynetConvolution32fDirectNhwc::ConvolutionBiasActivationPtr GetConvolutionBiasActivation(const ConvParam32f & p) - { - if (p.group == 1) - return ConvolutionDirectNhwcConvolutionBiasActivationDefault<type>; - else if (p.IsDepthwise()) - { - if (p.IsKernel(3) && p.IsDilation(1)) - return ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3<type>; - else - return ConvolutionDirectNhwcConvolutionBiasActivationDepthwise<type>; - } - return NULL; - } - - SynetConvolution32fDirectNhwc::ConvolutionBiasActivationPtr SynetConvolution32fDirectNhwc::SetConvolutionBiasActivation() - { - const ConvParam32f & p = _param; - SynetConvolution32fDirectNhwc::ConvolutionBiasActivationPtr func = NULL; - if (p.dstC >= F && p.dstH >= p.padY + p.padH && p.dstW >= p.padX + p.padW) - { - switch (p.activation) - { - case ::SimdConvolutionActivationIdentity: func = GetConvolutionBiasActivation<::SimdConvolutionActivationIdentity>(p); break; - case ::SimdConvolutionActivationRelu: func = GetConvolutionBiasActivation<::SimdConvolutionActivationRelu>(p); break; - case ::SimdConvolutionActivationLeakyRelu: func = GetConvolutionBiasActivation<::SimdConvolutionActivationLeakyRelu>(p); break; - case ::SimdConvolutionActivationRestrictRange: func = GetConvolutionBiasActivation<::SimdConvolutionActivationRestrictRange>(p); break; - case ::SimdConvolutionActivationPrelu: func = GetConvolutionBiasActivation<::SimdConvolutionActivationPrelu>(p); break; - case ::SimdConvolutionActivationHswish: func = GetConvolutionBiasActivation<::SimdConvolutionActivationHswish>(p); break; - } - } - return func ? 
func : Sse2::SynetConvolution32fDirectNhwc::SetConvolutionBiasActivation(); - }; - - //--------------------------------------------------------------------- - - SynetConvolution32fDepthwiseDotProduct::SynetConvolution32fDepthwiseDotProduct(const ConvParam32f & p) - : Sse2::SynetConvolution32fDepthwiseDotProduct(p) - { - } - - SIMD_INLINE void DotProduct(const float * a, const float * b, size_t offset, __m256 & sum) - { - __m256 _a = _mm256_loadu_ps(a + offset); - __m256 _b = _mm256_loadu_ps(b + offset); - sum = _mm256_add_ps(_mm256_mul_ps(_a, _b), sum); - } - - SIMD_INLINE float DotProduct(const float * a, const float * b, size_t size) - { - float sum = 0; - size_t partialAlignedSize = AlignLo(size, F); - size_t fullAlignedSize = AlignLo(size, QF); - size_t i = 0; - if (partialAlignedSize) - { - __m256 sums[4] = { _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps() }; - if (fullAlignedSize) - { - for (; i < fullAlignedSize; i += QF) - { - DotProduct(a, b, i + F * 0, sums[0]); - DotProduct(a, b, i + F * 1, sums[1]); - DotProduct(a, b, i + F * 2, sums[2]); - DotProduct(a, b, i + F * 3, sums[3]); - } - sums[0] = _mm256_add_ps(_mm256_add_ps(sums[0], sums[1]), _mm256_add_ps(sums[2], sums[3])); - } - for (; i < partialAlignedSize; i += F) - DotProduct(a, b, i, sums[0]); - sum += ExtractSum(sums[0]); - } - for (; i < size; ++i) - sum += a[i] * b[i]; - return sum; - } - - void SynetConvolution32fDepthwiseDotProduct::Forward(const float * src, float * buf, float * dst) - { - for (size_t b = 0; b < _batch; ++b) - { - if (_bias) - { - for (size_t i = 0; i < _count; ++i) - dst[i] = DotProduct(src + i * _size, _weight + i * _size, _size) + _bias[i]; - } - else - { - for (size_t i = 0; i < _count; ++i) - dst[i] = DotProduct(src + i * _size, _weight + i * _size, _size); - } - if (_param.activation) - ConvolutionBiasAndActivation(NULL, _count, 1, _param.activation, _params, ::SimdFalse, dst); - src += _sizeS; - dst += _sizeD; - } - } - - //--------------------------------------------------------------------- - - SynetConvolution32fNhwcDirect::SynetConvolution32fNhwcDirect(const ConvParam32f& p) - : Sse2::SynetConvolution32fNhwcDirect(p) - { - if (p.dstC <= Sse::F) - return; -#ifdef SIMD_SYNET_CONVOLUTION_NHWC_DIRECT_OLD - //_old.enable = true; - if (_old.enable) - { - if (Set2f(p, _old.convolution)) - OldSetAlgParam(F); - } - else -#endif - { - RunFuncs funcs; - for (size_t n = 2; n <= 3; ++n) - { - funcs.push_back(RunFunc(Ext() + "-" + ToStr(n))); - SetAlgParam(F, n, funcs.back().alg); - if (!SetRt(p, funcs.back().alg)) - return; - } - _run.Init(funcs); - } - } - - bool SynetConvolution32fNhwcDirect::SetRt(const ConvParam32f& p, AlgParam& a) - { - switch (a.microD) - { - case 2 * F: return Set2r(p, a); - case 3 * F: return Set3r(p, a); - default: - return false; - } - } - - //--------------------------------------------------------------------- - - void * SynetConvolution32fInit(size_t batch, const SimdConvolutionParameters * conv, SimdGemm32fNNPtr gemm) - { - if (conv->activation == SimdConvolutionActivationElu) - return Sse2::SynetConvolution32fInit(batch, conv, gemm); - ConvParam32f param(batch, conv, gemm); - if (!param.Valid()) - return NULL; - else if (SynetConvolution32fDepthwiseDotProduct::Preferable(param)) - return new SynetConvolution32fDepthwiseDotProduct(param); - else if (SynetConvolution32fWinograd::Preferable(param)) - return new SynetConvolution32fWinograd(param); - else if (SynetConvolution32fGemmNT::Preferable(param)) - return new 
SynetConvolution32fGemmNT(param); - else if (SynetConvolution32fDirectNchw::Preferable(param)) - return new Avx::SynetConvolution32fDirectNchw(param); - else if (SynetConvolution32fNhwcDirect::Preferable(param)) - return new SynetConvolution32fNhwcDirect(param); - else if (SynetConvolution32fDirectNhwc::Preferable(param)) - return new SynetConvolution32fDirectNhwc(param); - else - return new SynetConvolution32fGemmNN(param); - } - } -#endif//SIMD_AVX_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx1SynetConvolution32fNhwcDirect2f.cpp b/src/3rd/Simd/Simd/SimdAvx1SynetConvolution32fNhwcDirect2f.cpp deleted file mode 100644 index f1d7631f..00000000 --- a/src/3rd/Simd/Simd/SimdAvx1SynetConvolution32fNhwcDirect2f.cpp +++ /dev/null @@ -1,796 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE.
-*/ -#include "Simd/SimdSynetConvolution32f.h" -#include "Simd/SimdSynetConvolution32fCommon.h" -#include "Simd/SimdCpu.h" - -namespace Simd -{ -#ifdef SIMD_AVX_ENABLE - namespace Avx - { - using AlgParam = SynetConvolution32fNhwcDirect::AlgParam; - - template<TermType term, SimdConvolutionActivationType type> void ConvolutionNhwcDirect_2x6(const float* src0, const ConvParam32f& p, - size_t kernelH, size_t kernelW, size_t srcC, size_t dstC, const float* weight, const __m256* bias, const __m256* params, float* dst) - { - __m256 d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, s0, w0, w1; - size_t dS = p.srcC * p.strideX, dW = DF * (p.kernelX - kernelW) * srcC, dY = p.srcW * p.srcC, dX = p.srcC, dD = p.dstC; - const float* src1 = src0 + 1 * dS; - const float* src2 = src0 + 2 * dS; - const float* src3 = src0 + 3 * dS; - const float* src4 = src0 + 4 * dS; - const float* src5 = src0 + 5 * dS; - if (dstC > F) - { - d00 = _mm256_setzero_ps(); d01 = _mm256_setzero_ps(); - d10 = _mm256_setzero_ps(); d11 = _mm256_setzero_ps(); - d20 = _mm256_setzero_ps(); d21 = _mm256_setzero_ps(); - d30 = _mm256_setzero_ps(); d31 = _mm256_setzero_ps(); - d40 = _mm256_setzero_ps(); d41 = _mm256_setzero_ps(); - d50 = _mm256_setzero_ps(); d51 = _mm256_setzero_ps(); - for (size_t ky = 0; ky < kernelH; ++ky) - { - for (size_t kx = 0; kx < kernelW; ++kx) - { - for (size_t offset = ky * dY + kx * dX, end = offset + srcC; offset < end; ++offset) - { - w0 = _mm256_loadu_ps(weight + 0); - w1 = _mm256_loadu_ps(weight + F); - s0 = _mm256_set1_ps(src0[offset]); - d00 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d00); - d01 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d01); - s0 = _mm256_set1_ps(src1[offset]); - d10 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d10); - d11 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d11); - s0 = _mm256_set1_ps(src2[offset]); - d20 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d20); - d21 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d21); - s0 = _mm256_set1_ps(src3[offset]); - d30 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d30); - d31 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d31); - s0 = _mm256_set1_ps(src4[offset]); - d40 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d40); - d41 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d41); - s0 = _mm256_set1_ps(src5[offset]); - d50 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d50); - d51 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d51); - weight += DF; - } - } - weight += dW; - } - if (dstC == DF) - { - Term<term>::template Save<type, 0>(dst + 0, d00, bias, params); - Term<term>::template Save<type, 1>(dst + F, d01, bias, params); - dst += dD; - Term<term>::template Save<type, 0>(dst + 0, d10, bias, params); - Term<term>::template Save<type, 1>(dst + F, d11, bias, params); - dst += dD; - Term<term>::template Save<type, 0>(dst + 0, d20, bias, params); - Term<term>::template Save<type, 1>(dst + F, d21, bias, params); - dst += dD; - Term<term>::template Save<type, 0>(dst + 0, d30, bias, params); - Term<term>::template Save<type, 1>(dst + F, d31, bias, params); - dst += dD; - Term<term>::template Save<type, 0>(dst + 0, d40, bias, params); - Term<term>::template Save<type, 1>(dst + F, d41, bias, params); - dst += dD; - Term<term>::template Save<type, 0>(dst + 0, d50, bias, params); - Term<term>::template Save<type, 1>(dst + F, d51, bias, params); - } - else - { - dstC -= F; - Term<term>::template Save<type, 0>(dst + 0, d00, bias, params); - Term<term>::template Save<type, 1>(dst + F, d01, bias, params, dstC); - dst += dD; - Term<term>::template Save<type, 0>(dst + 0, d10, bias, params); - Term<term>::template Save<type, 1>(dst + F, d11, bias, params, dstC); - dst += dD; - Term<term>::template Save<type, 0>(dst + 0, d20, bias, params); - Term<term>::template Save<type, 1>(dst + F, d21, bias, params, dstC); - dst += dD; - Term<term>::template Save<type, 0>(dst + 0, d30, bias, params); - Term<term>::template Save<type, 1>(dst + F, d31, bias, params, dstC); - dst 
+= dD; - Term::template Save(dst + 0, d40, bias, params); - Term::template Save(dst + F, d41, bias, params, dstC); - dst += dD; - Term::template Save(dst + 0, d50, bias, params); - Term::template Save(dst + F, d51, bias, params, dstC); - } - } - else - { - d00 = _mm256_setzero_ps(); - d10 = _mm256_setzero_ps(); - d20 = _mm256_setzero_ps(); - d30 = _mm256_setzero_ps(); - d40 = _mm256_setzero_ps(); - d50 = _mm256_setzero_ps(); - for (size_t ky = 0; ky < kernelH; ++ky) - { - for (size_t kx = 0; kx < kernelW; ++kx) - { - for (size_t offset = ky * dY + kx * dX, end = offset + srcC; offset < end; ++offset) - { - w0 = _mm256_loadu_ps(weight + 0); - s0 = _mm256_set1_ps(src0[offset]); - d00 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d00); - s0 = _mm256_set1_ps(src1[offset]); - d10 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d10); - s0 = _mm256_set1_ps(src2[offset]); - d20 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d20); - s0 = _mm256_set1_ps(src3[offset]); - d30 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d30); - s0 = _mm256_set1_ps(src4[offset]); - d40 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d40); - s0 = _mm256_set1_ps(src5[offset]); - d50 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d50); - weight += DF; - } - } - weight += dW; - } - if (dstC == F) - { - Term::template Save(dst + 0, d00, bias, params); - dst += dD; - Term::template Save(dst + 0, d10, bias, params); - dst += dD; - Term::template Save(dst + 0, d20, bias, params); - dst += dD; - Term::template Save(dst + 0, d30, bias, params); - dst += dD; - Term::template Save(dst + 0, d40, bias, params); - dst += dD; - Term::template Save(dst + 0, d50, bias, params); - } - else - { - Term::template Save(dst + 0, d00, bias, params, dstC); - dst += dD; - Term::template Save(dst + 0, d10, bias, params, dstC); - dst += dD; - Term::template Save(dst + 0, d20, bias, params, dstC); - dst += dD; - Term::template Save(dst + 0, d30, bias, params, dstC); - dst += dD; - Term::template Save(dst + 0, d40, bias, params, dstC); - dst += dD; - Term::template Save(dst + 0, d50, bias, params, dstC); - } - } - } - - template void ConvolutionNhwcDirect_2x3(const float* src0, const ConvParam32f& p, - size_t kernelH, size_t kernelW, size_t srcC, size_t dstC, const float* weight, const __m256* bias, const __m256* params, float* dst) - { - __m256 d00, d01, d10, d11, d20, d21, s0, w0, w1; - size_t dS = p.srcC * p.strideX, dW = DF * (p.kernelX - kernelW) * srcC, dY = p.srcW * p.srcC, dX = p.srcC, dD = p.dstC; - const float* src1 = src0 + 1 * dS; - const float* src2 = src0 + 2 * dS; - if (dstC > F) - { - d00 = _mm256_setzero_ps(); d01 = _mm256_setzero_ps(); - d10 = _mm256_setzero_ps(); d11 = _mm256_setzero_ps(); - d20 = _mm256_setzero_ps(); d21 = _mm256_setzero_ps(); - for (size_t ky = 0; ky < kernelH; ++ky) - { - for (size_t kx = 0; kx < kernelW; ++kx) - { - for (size_t offset = ky * dY + kx * dX, end = offset + srcC; offset < end; ++offset) - { - w0 = _mm256_loadu_ps(weight + 0); - w1 = _mm256_loadu_ps(weight + F); - s0 = _mm256_set1_ps(src0[offset]); - d00 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d00); - d01 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d01); - s0 = _mm256_set1_ps(src1[offset]); - d10 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d10); - d11 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d11); - s0 = _mm256_set1_ps(src2[offset]); - d20 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d20); - d21 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d21); - weight += DF; - } - } - weight += dW; - } - if (dstC == DF) - { - Term::template Save(dst + 0, d00, bias, params); - Term::template Save(dst + F, d01, bias, params); - dst += 
dD; - Term::template Save(dst + 0, d10, bias, params); - Term::template Save(dst + F, d11, bias, params); - dst += dD; - Term::template Save(dst + 0, d20, bias, params); - Term::template Save(dst + F, d21, bias, params); - } - else - { - dstC -= F; - Term::template Save(dst + 0, d00, bias, params); - Term::template Save(dst + F, d01, bias, params, dstC); - dst += dD; - Term::template Save(dst + 0, d10, bias, params); - Term::template Save(dst + F, d11, bias, params, dstC); - dst += dD; - Term::template Save(dst + 0, d20, bias, params); - Term::template Save(dst + F, d21, bias, params, dstC); - } - } - else - { - d00 = _mm256_setzero_ps(); - d10 = _mm256_setzero_ps(); - d20 = _mm256_setzero_ps(); - for (size_t ky = 0; ky < kernelH; ++ky) - { - for (size_t kx = 0; kx < kernelW; ++kx) - { - for (size_t offset = ky * dY + kx * dX, end = offset + srcC; offset < end; ++offset) - { - w0 = _mm256_loadu_ps(weight + 0); - s0 = _mm256_set1_ps(src0[offset]); - d00 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d00); - s0 = _mm256_set1_ps(src1[offset]); - d10 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d10); - s0 = _mm256_set1_ps(src2[offset]); - d20 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d20); - weight += DF; - } - } - weight += dW; - } - if (dstC == F) - { - Term::template Save(dst + 0, d00, bias, params); - dst += dD; - Term::template Save(dst + 0, d10, bias, params); - dst += dD; - Term::template Save(dst + 0, d20, bias, params); - } - else - { - Term::template Save(dst + 0, d00, bias, params, dstC); - dst += dD; - Term::template Save(dst + 0, d10, bias, params, dstC); - dst += dD; - Term::template Save(dst + 0, d20, bias, params, dstC); - } - } - } - - template void ConvolutionNhwcDirect_2x1(const float* src0, const ConvParam32f& p, - size_t kernelH, size_t kernelW, size_t srcC, size_t dstC, const float* weight, const __m256* bias, const __m256* params, float* dst) - { - __m256 d00, d01, s0, w0, w1; - size_t dW = DF * (p.kernelX - kernelW) * srcC, dY = p.srcW * p.srcC, dX = p.srcC; - if (dstC > F) - { - d00 = _mm256_setzero_ps(); - d01 = _mm256_setzero_ps(); - for (size_t ky = 0; ky < kernelH; ++ky) - { - for (size_t kx = 0; kx < kernelW; ++kx) - { - for (size_t offset = ky * dY + kx * dX, end = offset + srcC; offset < end; ++offset) - { - w0 = _mm256_loadu_ps(weight + 0); - w1 = _mm256_loadu_ps(weight + F); - s0 = _mm256_set1_ps(src0[offset]); - d00 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d00); - d01 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d01); - weight += DF; - } - } - weight += dW; - } - if (dstC == DF) - { - Term::template Save(dst + 0, d00, bias, params); - Term::template Save(dst + F, d01, bias, params); - } - else - { - Term::template Save(dst + 0, d00, bias, params); - Term::template Save(dst + F, d01, bias, params, dstC - F); - } - } - else - { - d00 = _mm256_setzero_ps(); - for (size_t ky = 0; ky < kernelH; ++ky) - { - for (size_t kx = 0; kx < kernelW; ++kx) - { - for (size_t offset = ky * dY + kx * dX, end = offset + srcC; offset < end; ++offset) - { - w0 = _mm256_loadu_ps(weight + 0); - s0 = _mm256_set1_ps(src0[offset]); - d00 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d00); - weight += DF; - } - } - weight += dW; - } - if (dstC == F) - Term::template Save(dst + 0, d00, bias, params); - else - Term::template Save(dst + 0, d00, bias, params, dstC); - } - } - - template void ConvolutionNhwcDirect_2(const float* src, const ConvParam32f& p, - size_t dstC, size_t yBeg, size_t yEnd, size_t srcC, const float* weight, const float* bias, const float* params, float* dst) - { - size_t noseH = p.padY, noseW = 
p.padX; - size_t bodyH = p.srcH - p.kernelY + 1 + noseH, bodyW = p.srcW - p.kernelX + 1 + noseW; - size_t bodyW3 = AlignLoAny(bodyW - noseW, 3 * p.strideX) + noseW; - size_t bodyW6 = AlignLoAny(bodyW - noseW, 6 * p.strideX) + noseW; - size_t tailH = bodyH + p.padH, tailW = bodyW + p.padW; - size_t kY = p.kernelY - noseH, kX = p.kernelX - noseW, kH = bodyH + p.kernelY - 1, kW = bodyW + p.kernelX - 1; - - __m256 _params[2], _bias[2]; - _params[0] = _mm256_set1_ps(params[0]); - if (type == ::SimdConvolutionActivationRestrictRange || type == ::SimdConvolutionActivationHswish) - _params[1] = _mm256_set1_ps(params[1]); - - for (size_t dc = 0; dc < dstC; dc += DF) - { - size_t dC = Simd::Min(DF, dstC - dc); - _bias[0] = _mm256_loadu_ps(bias + dc + 0); - _bias[1] = _mm256_loadu_ps(bias + dc + F); - if (type == ::SimdConvolutionActivationPrelu) - { - _params[0] = _mm256_loadu_ps(params + dc + 0); - _params[1] = _mm256_loadu_ps(params + dc + F); - } - float* d = dst + dc + yBeg * p.dstW * p.dstC; - size_t dy = yBeg, sy = dy * p.strideY; - for (; sy < noseH && dy < yEnd; sy += p.strideY, dy++) - { - size_t sx = 0; - const float* s = src; - const float* w = weight + (noseH - sy) * p.kernelX * srcC * DF; - for (; sx < noseW; sx += p.strideX, d += p.dstC) - ConvolutionNhwcDirect_2x1(s, p, kY + sy, kX + sx, srcC, dC, w + (noseW - sx) * srcC * DF, _bias, _params, d); - for (; sx < bodyW6; sx += 6 * p.strideX, d += 6 * p.dstC) - ConvolutionNhwcDirect_2x6(s + (sx - noseW) * p.srcC, p, kY + sy, p.kernelX, srcC, dC, w, _bias, _params, d); - for (; sx < bodyW3; sx += 3 * p.strideX, d += 3 * p.dstC) - ConvolutionNhwcDirect_2x3(s + (sx - noseW) * p.srcC, p, kY + sy, p.kernelX, srcC, dC, w, _bias, _params, d); - for (; sx < bodyW; sx += p.strideX, d += p.dstC) - ConvolutionNhwcDirect_2x1(s + (sx - noseW) * p.srcC, p, kY + sy, p.kernelX, srcC, dC, w, _bias, _params, d); - for (; sx < tailW; sx += p.strideX, d += p.dstC) - ConvolutionNhwcDirect_2x1(s + (sx - noseW) * p.srcC, p, kY + sy, kW - sx, srcC, dC, w, _bias, _params, d); - } - for (; sy < bodyH && dy < yEnd; sy += p.strideY, dy++) - { - size_t sx = 0; - const float* s = src + (sy - noseH) * p.srcW * p.srcC; - const float* w = weight; - for (; sx < noseW; sx += p.strideX, d += p.dstC) - ConvolutionNhwcDirect_2x1(s, p, p.kernelY, kX + sx, srcC, dC, w + (noseW - sx) * srcC * DF, _bias, _params, d); - for (; sx < bodyW6; sx += 6 * p.strideX, d += 6 * p.dstC) - ConvolutionNhwcDirect_2x6(s + (sx - noseW) * p.srcC, p, p.kernelY, p.kernelX, srcC, dC, w, _bias, _params, d); - for (; sx < bodyW3; sx += 3 * p.strideX, d += 3 * p.dstC) - ConvolutionNhwcDirect_2x3(s + (sx - noseW) * p.srcC, p, p.kernelY, p.kernelX, srcC, dC, w, _bias, _params, d); - for (; sx < bodyW; sx += p.strideX, d += p.dstC) - ConvolutionNhwcDirect_2x1(s + (sx - noseW) * p.srcC, p, p.kernelY, p.kernelX, srcC, dC, w, _bias, _params, d); - for (; sx < tailW; sx += p.strideX, d += p.dstC) - ConvolutionNhwcDirect_2x1(s + (sx - noseW) * p.srcC, p, p.kernelY, kW - sx, srcC, dC, w, _bias, _params, d); - } - for (; sy < tailH && dy < yEnd; sy += p.strideY, dy++) - { - size_t sx = 0; - const float* s = src + (sy - noseH) * p.srcW * p.srcC; - const float* w = weight; - for (; sx < noseW; sx += p.strideX, d += p.dstC) - ConvolutionNhwcDirect_2x1(s, p, kH - sy, kX + sx, srcC, dC, w + (noseW - sx) * srcC * DF, _bias, _params, d); - for (; sx < bodyW6; sx += 6 * p.strideX, d += 6 * p.dstC) - ConvolutionNhwcDirect_2x6(s + (sx - noseW) * p.srcC, p, kH - sy, p.kernelX, srcC, dC, w, _bias, _params, d); - for (; sx < 
bodyW3; sx += 3 * p.strideX, d += 3 * p.dstC) - ConvolutionNhwcDirect_2x3<term, type>(s + (sx - noseW) * p.srcC, p, kH - sy, p.kernelX, srcC, dC, w, _bias, _params, d); - for (; sx < bodyW; sx += p.strideX, d += p.dstC) - ConvolutionNhwcDirect_2x1<term, type>(s + (sx - noseW) * p.srcC, p, kH - sy, p.kernelX, srcC, dC, w, _bias, _params, d); - for (; sx < tailW; sx += p.strideX, d += p.dstC) - ConvolutionNhwcDirect_2x1<term, type>(s + (sx - noseW) * p.srcC, p, kH - sy, kW - sx, srcC, dC, w, _bias, _params, d); - } - weight += p.kernelY * p.kernelX * srcC * DF; - } - } - - template<SimdConvolutionActivationType type> void ConvolutionNhwcDirect_2(const float* src, const ConvParam32f& p, - const SynetConvolution32fNhwcDirect::AlgParam& a, const float* weight, const float* bias, const float* params, float* dst) - { - for (size_t dc = 0; dc < p.dstC; dc += a.macroD) - { - size_t macroD = Simd::Min(p.dstC, dc + a.macroD) - dc; - for (size_t sc = 0; sc < p.srcC; sc += a.macroC) - { - size_t macroC = Simd::Min(p.srcC, sc + a.macroC) - sc; - size_t macroK = p.kernelY * p.kernelX * macroC; - for (size_t yBeg = 0; yBeg < p.dstH;) - { - size_t yEnd = Simd::Min(yBeg + a.macroH, p.dstH); - if (a.macroC == p.srcC) - ConvolutionNhwcDirect_2<TermSingle, type>(src + sc, p, macroD, yBeg, yEnd, macroC, weight, bias + dc, params, dst + dc); - else if (sc == 0) - ConvolutionNhwcDirect_2<TermFirst, SimdConvolutionActivationIdentity>(src + sc, p, macroD, yBeg, yEnd, macroC, weight, bias + dc, params, dst + dc); - else if (sc + macroC == p.srcC) - ConvolutionNhwcDirect_2<TermLast, type>(src + sc, p, macroD, yBeg, yEnd, macroC, weight, bias + dc, params, dst + dc); - else - ConvolutionNhwcDirect_2<TermIterim, SimdConvolutionActivationIdentity>(src + sc, p, macroD, yBeg, yEnd, macroC, weight, bias + dc, params, dst + dc); - yBeg = yEnd; - } - weight += AlignHiAny(macroD, a.microD) * macroK; - } - if (type == ::SimdConvolutionActivationPrelu) - params += macroD; - } - } - - //--------------------------------------------------------------------- - - template<TermType term, SimdConvolutionActivationType type> void ConvolutionNhwcDirect1x1_2x6(const float* src0, const ConvParam32f& p, - size_t srcC, size_t dstC, const float* weight, const __m256* bias, const __m256* params, float* dst) - { - __m256 d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, s0, w0, w1; - size_t dS = p.srcC, dD = p.dstC; - const float* src1 = src0 + 1 * dS; - const float* src2 = src0 + 2 * dS; - const float* src3 = src0 + 3 * dS; - const float* src4 = src0 + 4 * dS; - const float* src5 = src0 + 5 * dS; - if (dstC > F) - { - d00 = _mm256_setzero_ps(); d01 = _mm256_setzero_ps(); - d10 = _mm256_setzero_ps(); d11 = _mm256_setzero_ps(); - d20 = _mm256_setzero_ps(); d21 = _mm256_setzero_ps(); - d30 = _mm256_setzero_ps(); d31 = _mm256_setzero_ps(); - d40 = _mm256_setzero_ps(); d41 = _mm256_setzero_ps(); - d50 = _mm256_setzero_ps(); d51 = _mm256_setzero_ps(); - for (size_t offset = 0; offset < srcC; ++offset) - { - w0 = _mm256_loadu_ps(weight + 0); - w1 = _mm256_loadu_ps(weight + F); - s0 = _mm256_set1_ps(src0[offset]); - d00 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d00); - d01 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d01); - s0 = _mm256_set1_ps(src1[offset]); - d10 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d10); - d11 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d11); - s0 = _mm256_set1_ps(src2[offset]); - d20 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d20); - d21 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d21); - s0 = _mm256_set1_ps(src3[offset]); - d30 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d30); - d31 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d31); - s0 = _mm256_set1_ps(src4[offset]); - d40 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d40); - d41 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d41); - s0 = 
_mm256_set1_ps(src5[offset]); - d50 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d50); - d51 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d51); - weight += DF; - } - if (dstC == DF) - { - Term::template Save(dst + 0, d00, bias, params); - Term::template Save(dst + F, d01, bias, params); - dst += dD; - Term::template Save(dst + 0, d10, bias, params); - Term::template Save(dst + F, d11, bias, params); - dst += dD; - Term::template Save(dst + 0, d20, bias, params); - Term::template Save(dst + F, d21, bias, params); - dst += dD; - Term::template Save(dst + 0, d30, bias, params); - Term::template Save(dst + F, d31, bias, params); - dst += dD; - Term::template Save(dst + 0, d40, bias, params); - Term::template Save(dst + F, d41, bias, params); - dst += dD; - Term::template Save(dst + 0, d50, bias, params); - Term::template Save(dst + F, d51, bias, params); - } - else - { - dstC -= F; - Term::template Save(dst + 0, d00, bias, params); - Term::template Save(dst + F, d01, bias, params, dstC); - dst += dD; - Term::template Save(dst + 0, d10, bias, params); - Term::template Save(dst + F, d11, bias, params, dstC); - dst += dD; - Term::template Save(dst + 0, d20, bias, params); - Term::template Save(dst + F, d21, bias, params, dstC); - dst += dD; - Term::template Save(dst + 0, d30, bias, params); - Term::template Save(dst + F, d31, bias, params, dstC); - dst += dD; - Term::template Save(dst + 0, d40, bias, params); - Term::template Save(dst + F, d41, bias, params, dstC); - dst += dD; - Term::template Save(dst + 0, d50, bias, params); - Term::template Save(dst + F, d51, bias, params, dstC); - } - } - else - { - d00 = _mm256_setzero_ps(); - d10 = _mm256_setzero_ps(); - d20 = _mm256_setzero_ps(); - d30 = _mm256_setzero_ps(); - d40 = _mm256_setzero_ps(); - d50 = _mm256_setzero_ps(); - for (size_t offset = 0; offset < srcC; ++offset) - { - w0 = _mm256_loadu_ps(weight + 0); - s0 = _mm256_set1_ps(src0[offset]); - d00 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d00); - s0 = _mm256_set1_ps(src1[offset]); - d10 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d10); - s0 = _mm256_set1_ps(src2[offset]); - d20 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d20); - s0 = _mm256_set1_ps(src3[offset]); - d30 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d30); - s0 = _mm256_set1_ps(src4[offset]); - d40 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d40); - s0 = _mm256_set1_ps(src5[offset]); - d50 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d50); - weight += DF; - } - if (dstC == F) - { - Term::template Save(dst + 0, d00, bias, params); - dst += dD; - Term::template Save(dst + 0, d10, bias, params); - dst += dD; - Term::template Save(dst + 0, d20, bias, params); - dst += dD; - Term::template Save(dst + 0, d30, bias, params); - dst += dD; - Term::template Save(dst + 0, d40, bias, params); - dst += dD; - Term::template Save(dst + 0, d50, bias, params); - } - else - { - Term::template Save(dst + 0, d00, bias, params, dstC); - dst += dD; - Term::template Save(dst + 0, d10, bias, params, dstC); - dst += dD; - Term::template Save(dst + 0, d20, bias, params, dstC); - dst += dD; - Term::template Save(dst + 0, d30, bias, params, dstC); - dst += dD; - Term::template Save(dst + 0, d40, bias, params, dstC); - dst += dD; - Term::template Save(dst + 0, d50, bias, params, dstC); - } - } - } - - template void ConvolutionNhwcDirect1x1_2xM(const float* src0, const ConvParam32f& p, - size_t srcC, size_t dstC, const float* weight, const __m256* bias, const __m256* params, float* dst) - { - __m256 d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, s0, w0, w1; - size_t dS = p.srcC, dD = 
p.dstC; - const float* src1 = src0 + 1 * dS; - const float* src2 = src0 + 2 * dS; - const float* src3 = src0 + 3 * dS; - const float* src4 = src0 + 4 * dS; - const float* src5 = src0 + 5 * dS; - if (dstC > F) - { - if (M > 0) d00 = _mm256_setzero_ps(), d01 = _mm256_setzero_ps(); - if (M > 1) d10 = _mm256_setzero_ps(), d11 = _mm256_setzero_ps(); - if (M > 2) d20 = _mm256_setzero_ps(), d21 = _mm256_setzero_ps(); - if (M > 3) d30 = _mm256_setzero_ps(), d31 = _mm256_setzero_ps(); - if (M > 4) d40 = _mm256_setzero_ps(), d41 = _mm256_setzero_ps(); - if (M > 5) d50 = _mm256_setzero_ps(), d51 = _mm256_setzero_ps(); - for (size_t offset = 0; offset < srcC; ++offset) - { - w0 = _mm256_loadu_ps(weight + 0); - w1 = _mm256_loadu_ps(weight + F); - if (M > 0) s0 = _mm256_set1_ps(src0[offset]), d00 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d00), d01 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d01); - if (M > 1) s0 = _mm256_set1_ps(src1[offset]), d10 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d10), d11 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d11); - if (M > 2) s0 = _mm256_set1_ps(src2[offset]), d20 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d20), d21 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d21); - if (M > 3) s0 = _mm256_set1_ps(src3[offset]), d30 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d30), d31 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d31); - if (M > 4) s0 = _mm256_set1_ps(src4[offset]), d40 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d40), d41 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d41); - if (M > 5) s0 = _mm256_set1_ps(src5[offset]), d50 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d50), d51 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d51); - weight += DF; - } - if (dstC == DF) - { - if (M > 0) Term::template Save(dst + 0, d00, bias, params), Term::template Save(dst + F, d01, bias, params), dst += dD; - if (M > 1) Term::template Save(dst + 0, d10, bias, params), Term::template Save(dst + F, d11, bias, params), dst += dD; - if (M > 2) Term::template Save(dst + 0, d20, bias, params), Term::template Save(dst + F, d21, bias, params), dst += dD; - if (M > 3) Term::template Save(dst + 0, d30, bias, params), Term::template Save(dst + F, d31, bias, params), dst += dD; - if (M > 4) Term::template Save(dst + 0, d40, bias, params), Term::template Save(dst + F, d41, bias, params), dst += dD; - if (M > 5) Term::template Save(dst + 0, d50, bias, params), Term::template Save(dst + F, d51, bias, params), dst += dD; - } - else - { - dstC -= F; - if (M > 0) Term::template Save(dst + 0, d00, bias, params), Term::template Save(dst + F, d01, bias, params, dstC), dst += dD; - if (M > 1) Term::template Save(dst + 0, d10, bias, params), Term::template Save(dst + F, d11, bias, params, dstC), dst += dD; - if (M > 2) Term::template Save(dst + 0, d20, bias, params), Term::template Save(dst + F, d21, bias, params, dstC), dst += dD; - if (M > 3) Term::template Save(dst + 0, d30, bias, params), Term::template Save(dst + F, d31, bias, params, dstC), dst += dD; - if (M > 4) Term::template Save(dst + 0, d40, bias, params), Term::template Save(dst + F, d41, bias, params, dstC), dst += dD; - if (M > 5) Term::template Save(dst + 0, d50, bias, params), Term::template Save(dst + F, d51, bias, params, dstC), dst += dD; - } - } - else - { - if (M > 0) d00 = _mm256_setzero_ps(); - if (M > 1) d10 = _mm256_setzero_ps(); - if (M > 2) d20 = _mm256_setzero_ps(); - if (M > 3) d30 = _mm256_setzero_ps(); - if (M > 4) d40 = _mm256_setzero_ps(); - if (M > 5) d50 = _mm256_setzero_ps(); - for (size_t offset = 0; offset < srcC; ++offset) - { - w0 = _mm256_loadu_ps(weight + 0); - if (M > 0) s0 
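/* M is a compile-time template argument, so each "if (M > i)" guard is resolved when
   ConvolutionNhwcDirect1x1_2xM is instantiated: only the first M rows' broadcasts,
   multiplies and stores are emitted, letting one source body cover every tail of
   1..5 remaining pixels without runtime branching. */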
= _mm256_set1_ps(src0[offset]), d00 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d00); - if (M > 1) s0 = _mm256_set1_ps(src1[offset]), d10 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d10); - if (M > 2) s0 = _mm256_set1_ps(src2[offset]), d20 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d20); - if (M > 3) s0 = _mm256_set1_ps(src3[offset]), d30 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d30); - if (M > 4) s0 = _mm256_set1_ps(src4[offset]), d40 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d40); - if (M > 5) s0 = _mm256_set1_ps(src5[offset]), d50 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d50); - weight += DF; - } - if (dstC == F) - { - if (M > 0) Term<term>::template Save<type, 0>(dst + 0, d00, bias, params), dst += dD; - if (M > 1) Term<term>::template Save<type, 0>(dst + 0, d10, bias, params), dst += dD; - if (M > 2) Term<term>::template Save<type, 0>(dst + 0, d20, bias, params), dst += dD; - if (M > 3) Term<term>::template Save<type, 0>(dst + 0, d30, bias, params), dst += dD; - if (M > 4) Term<term>::template Save<type, 0>(dst + 0, d40, bias, params), dst += dD; - if (M > 5) Term<term>::template Save<type, 0>(dst + 0, d50, bias, params), dst += dD; - } - else - { - if (M > 0) Term<term>::template Save<type, 0>(dst + 0, d00, bias, params, dstC), dst += dD; - if (M > 1) Term<term>::template Save<type, 0>(dst + 0, d10, bias, params, dstC), dst += dD; - if (M > 2) Term<term>::template Save<type, 0>(dst + 0, d20, bias, params, dstC), dst += dD; - if (M > 3) Term<term>::template Save<type, 0>(dst + 0, d30, bias, params, dstC), dst += dD; - if (M > 4) Term<term>::template Save<type, 0>(dst + 0, d40, bias, params, dstC), dst += dD; - if (M > 5) Term<term>::template Save<type, 0>(dst + 0, d50, bias, params, dstC), dst += dD; - } - } - } - - typedef void(*ConvolutionNhwcDirect1x1_2xM_Ptr)(const float* src0, const ConvParam32f& p, size_t srcC, size_t dstC, const float* weight, const __m256* bias, const __m256* params, float* dst); - - template<TermType term, SimdConvolutionActivationType type> ConvolutionNhwcDirect1x1_2xM_Ptr GetConvolutionNhwcDirect1x1_2xM(size_t M) - { - switch (M) - { - case 0: return ConvolutionNhwcDirect1x1_2xM<term, type, 0>; - case 1: return ConvolutionNhwcDirect1x1_2xM<term, type, 1>; - case 2: return ConvolutionNhwcDirect1x1_2xM<term, type, 2>; - case 3: return ConvolutionNhwcDirect1x1_2xM<term, type, 3>; - case 4: return ConvolutionNhwcDirect1x1_2xM<term, type, 4>; - case 5: return ConvolutionNhwcDirect1x1_2xM<term, type, 5>; - } - assert(0); - return NULL; - } - - template<TermType term, SimdConvolutionActivationType type> void ConvolutionNhwcDirect1x1_2(const float* src, const ConvParam32f& p, - size_t dstC, size_t yBeg, size_t yEnd, size_t srcC, const float* weight, const float* bias, const float* params, float* dst) - { - size_t n1 = (yEnd - yBeg) * p.dstW; - size_t n6 = AlignLoAny(n1, 6); - size_t nTail = n1 - n6; - ConvolutionNhwcDirect1x1_2xM_Ptr tailN = GetConvolutionNhwcDirect1x1_2xM<term, type>(nTail); - - __m256 _params[2], _bias[2]; - _params[0] = _mm256_set1_ps(params[0]); - if (type == ::SimdConvolutionActivationRestrictRange || type == ::SimdConvolutionActivationHswish) - _params[1] = _mm256_set1_ps(params[1]); - - for (size_t dc = 0; dc < dstC; dc += DF) - { - size_t dC = Simd::Min(DF, dstC - dc); - _bias[0] = _mm256_loadu_ps(bias + dc + 0); - _bias[1] = _mm256_loadu_ps(bias + dc + F); - if (type == ::SimdConvolutionActivationPrelu) - { - _params[0] = _mm256_loadu_ps(params + dc + 0); - _params[1] = _mm256_loadu_ps(params + dc + F); - } - const float* ps = src + yBeg * p.srcW * p.srcC; - float* pd = dst + dc + yBeg * p.dstW * p.dstC; - size_t i = 0; - for (; i < n6; i += 6, ps += 6 * p.srcC, pd += 6 * p.dstC) - ConvolutionNhwcDirect1x1_2x6<term, type>(ps, p, srcC, dC, weight, _bias, _params, pd); - if (nTail) - tailN(ps, p, srcC, dC, weight, _bias, _params, pd), ps += nTail * p.srcC, pd += nTail * p.dstC; - weight += srcC * DF; - } - } - - template<SimdConvolutionActivationType type> void 
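/* The driver below adds cache blocking on top of the micro-kernels: dstC is tiled by
   a.macroD output channels, srcC by a.macroC input channels and rows by a.macroH. When
   srcC needs several passes, the first pass writes raw sums, middle passes accumulate
   into dst, and only the last adds bias and applies the activation; that choice is what
   the term template argument selects. Simplified structure (a sketch, not the exact code):
       for (dc = 0; dc < dstC; dc += macroD)        // output-channel tile
           for (sc = 0; sc < srcC; sc += macroC)    // input-channel tile
               for (y = 0; y < dstH; y += macroH)   // strip of rows
                   kernel(src + sc, ..., dst + dc); // single/first/interim/last variant
*/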
ConvolutionNhwcDirect1x1_2(const float* src, const ConvParam32f& p, - const SynetConvolution32fNhwcDirect::AlgParam& a, const float* weight, const float* bias, const float* params, float* dst) - { - for (size_t dc = 0; dc < p.dstC; dc += a.macroD) - { - size_t macroD = Simd::Min(p.dstC, dc + a.macroD) - dc; - for (size_t sc = 0; sc < p.srcC; sc += a.macroC) - { - size_t macroC = Simd::Min(p.srcC, sc + a.macroC) - sc; - for (size_t yBeg = 0; yBeg < p.dstH;) - { - size_t yEnd = Simd::Min(yBeg + a.macroH, p.dstH); - if (a.macroC == p.srcC) - ConvolutionNhwcDirect1x1_2<TermSingle, type>(src + sc, p, macroD, yBeg, yEnd, macroC, weight, bias + dc, params, dst + dc); - else if (sc == 0) - ConvolutionNhwcDirect1x1_2<TermFirst, SimdConvolutionActivationIdentity>(src + sc, p, macroD, yBeg, yEnd, macroC, weight, bias + dc, params, dst + dc); - else if (sc + macroC == p.srcC) - ConvolutionNhwcDirect1x1_2<TermLast, type>(src + sc, p, macroD, yBeg, yEnd, macroC, weight, bias + dc, params, dst + dc); - else - ConvolutionNhwcDirect1x1_2<TermIterim, SimdConvolutionActivationIdentity>(src + sc, p, macroD, yBeg, yEnd, macroC, weight, bias + dc, params, dst + dc); - yBeg = yEnd; - } - weight += AlignHiAny(macroD, a.microD) * macroC; - } - if (type == ::SimdConvolutionActivationPrelu) - params += macroD; - } - } - - //--------------------------------------------------------------------- - - template<SimdConvolutionActivationType type> SIMD_INLINE void Set(const ConvParam32f& p, SynetConvolution32fNhwcDirect::OldConvolutionPtr& convolution) - { - if (p.Is1x1()) - convolution = ConvolutionNhwcDirect1x1_2<type>; - else - convolution = ConvolutionNhwcDirect_2<type>; - } - - bool SynetConvolution32fNhwcDirect::Set2f(const ConvParam32f& p, OldConvolutionPtr& convolution) - { - switch (p.activation) - { - case SimdConvolutionActivationIdentity: Set<SimdConvolutionActivationIdentity>(p, convolution); break; - case SimdConvolutionActivationRelu: Set<SimdConvolutionActivationRelu>(p, convolution); break; - case SimdConvolutionActivationLeakyRelu: Set<SimdConvolutionActivationLeakyRelu>(p, convolution); break; - case SimdConvolutionActivationRestrictRange: Set<SimdConvolutionActivationRestrictRange>(p, convolution); break; - case SimdConvolutionActivationPrelu: Set<SimdConvolutionActivationPrelu>(p, convolution); break; - case SimdConvolutionActivationHswish: Set<SimdConvolutionActivationHswish>(p, convolution); break; - default: return false; - } - return true; - } - } -#endif//SIMD_AVX_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx1SynetConvolution32fNhwcDirect2r.cpp b/src/3rd/Simd/Simd/SimdAvx1SynetConvolution32fNhwcDirect2r.cpp deleted file mode 100644 index f9ec7607..00000000 --- a/src/3rd/Simd/Simd/SimdAvx1SynetConvolution32fNhwcDirect2r.cpp +++ /dev/null @@ -1,689 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdSynetConvolution32f.h" -#include "Simd/SimdSynetConvolution32fCommon.h" -#include "Simd/SimdCpu.h" - -namespace Simd -{ -#ifdef SIMD_AVX_ENABLE - namespace Avx - { - using AlgParam = SynetConvolution32fNhwcDirect::AlgParam; - - typedef void(*ConvolutionNhwcDirect_NxM_Ptr)(const float* src0, const ConvParam32f& p, const AlgParam& a, size_t dy, size_t dx, size_t srcC, size_t dstC, const float* weight0, const __m256* bias, const __m256* params, float* dst); - typedef void(*ConvolutionNhwcDirect1x1_NxM_Ptr)(const float* src0, const ConvParam32f& p, const AlgParam& a, size_t srcC, size_t dstC, const float* weight0, const __m256* bias, const __m256* params, float* dst); - - //--------------------------------------------------------------------- - - template void ConvolutionNhwcDirect_2x1(const float* src0, const ConvParam32f& p, - const AlgParam& a, size_t dy, size_t dx, size_t srcC, size_t dstC, const float* weight0, const __m256* bias, const __m256* params, float* dst) - { - __m256 d00, d01, s0, w0, w1; - size_t srcH = p.srcH, srcW = p.srcW, dilY = p.dilationY, dilX = p.dilationX; - size_t dY = p.srcW * p.srcC, dX = p.srcC, dS = p.srcC * p.strideX, dW = p.srcC * F; - size_t sy = dy * p.strideY - p.padY, sx = dx * p.strideX - p.padX; - size_t kY = p.kernelY * p.dilationY, kX = p.kernelX * p.dilationX; - const float* weight1 = weight0 + a.stepW; - if (dstC > F) - { - d00 = _mm256_setzero_ps(), d01 = _mm256_setzero_ps(); - for (size_t ky = 0; ky < kY; ky += dilY) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - if (sy + ky < srcH && sx + kx < srcW) - { - size_t offs = beg + kx * dX, end = offs + srcC, offw = 0; - for (; offs < end; ++offs, offw += F) - { - w0 = _mm256_loadu_ps(weight0 + offw); - w1 = _mm256_loadu_ps(weight1 + offw); - s0 = _mm256_set1_ps(src0[offs]), d00 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d00), d01 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d01); - } - } - weight0 += dW, weight1 += dW; - } - } - if (dstC == DF) - Save2(dst, d00, d01, bias, params); - else - Save2(dst, d00, d01, bias, params, dstC - F); - } - else - { - d00 = _mm256_setzero_ps(); - for (size_t ky = 0; ky < kY; ky += dilY) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - if (sy + ky < srcH && sx + kx < srcW) - { - size_t offs = beg + kx * dX, end = offs + srcC, offw = 0; - for (; offs < end; ++offs, offw += F) - { - w0 = _mm256_loadu_ps(weight0 + offw); - s0 = _mm256_set1_ps(src0[offs]), d00 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d00); - } - } - weight0 += dW; - } - } - if (dstC == F) - Save1(dst, d00, bias, params); - else - Save1(dst, d00, bias, params, dstC); - } - } - - template void ConvolutionNhwcDirect_2x6(const float* src0, const ConvParam32f& p, - const AlgParam& a, size_t dy, size_t dx, size_t srcC, size_t dstC, const float* weight0, const __m256* bias, const __m256* params, float* dst) - { - __m256 d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, s0, w0, w1; - size_t srcH = p.srcH, srcW = p.srcW, dilY = p.dilationY, dilX = p.dilationX; - size_t dY = p.srcW * p.srcC, dX = p.srcC, dS = p.srcC * p.strideX, dW = p.srcC * F, dWz = p.kernelX * p.srcC * F, dD = p.dstC; - size_t sy = dy * p.strideY - p.padY, sx = dx * 
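/* sy and sx locate the receptive field in the conceptually padded source, computed in
   unsigned arithmetic, so "negative" coordinates wrap to huge values and the checks
   sy + ky < srcH and sx + kx < srcW reject both out-of-bounds sides at once. For kernel
   rows that miss the image entirely the weight pointers are still advanced (weight0 += dWz
   below), keeping weights and pixels in step without multiplying by zero padding. */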
p.strideX - p.padX; - size_t kY = p.kernelY * p.dilationY, kX = p.kernelX * p.dilationX; - const float* weight1 = weight0 + a.stepW; - const float* src1 = src0 + 1 * dS; - const float* src2 = src0 + 2 * dS; - const float* src3 = src0 + 3 * dS; - const float* src4 = src0 + 4 * dS; - const float* src5 = src0 + 5 * dS; - if (dstC > F) - { - d00 = _mm256_setzero_ps(), d01 = _mm256_setzero_ps(); - d10 = _mm256_setzero_ps(), d11 = _mm256_setzero_ps(); - d20 = _mm256_setzero_ps(), d21 = _mm256_setzero_ps(); - d30 = _mm256_setzero_ps(), d31 = _mm256_setzero_ps(); - d40 = _mm256_setzero_ps(), d41 = _mm256_setzero_ps(); - d50 = _mm256_setzero_ps(), d51 = _mm256_setzero_ps(); - for (size_t ky = 0; ky < kY; ky += dilY) - { - if (sy + ky < srcH) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - assert(sx + kx < srcW && sx + kx + 6 <= srcW); - size_t offs = beg + kx * dX, end = offs + srcC, offw = 0; - for (; offs < end; ++offs, offw += F) - { - w0 = _mm256_loadu_ps(weight0 + offw); - w1 = _mm256_loadu_ps(weight1 + offw); - s0 = _mm256_set1_ps(src0[offs]), d00 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d00), d01 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d01); - s0 = _mm256_set1_ps(src1[offs]), d10 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d10), d11 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d11); - s0 = _mm256_set1_ps(src2[offs]), d20 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d20), d21 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d21); - s0 = _mm256_set1_ps(src3[offs]), d30 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d30), d31 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d31); - s0 = _mm256_set1_ps(src4[offs]), d40 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d40), d41 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d41); - s0 = _mm256_set1_ps(src5[offs]), d50 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d50), d51 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d51); - } - weight0 += dW, weight1 += dW; - } - } - else - weight0 += dWz, weight1 += dWz; - } - if (dstC == DF) - { - Save2(dst, d00, d01, bias, params), dst += dD; - Save2(dst, d10, d11, bias, params), dst += dD; - Save2(dst, d20, d21, bias, params), dst += dD; - Save2(dst, d30, d31, bias, params), dst += dD; - Save2(dst, d40, d41, bias, params), dst += dD; - Save2(dst, d50, d51, bias, params), dst += dD; - } - else - { - dstC -= F; - Save2(dst, d00, d01, bias, params, dstC), dst += dD; - Save2(dst, d10, d11, bias, params, dstC), dst += dD; - Save2(dst, d20, d21, bias, params, dstC), dst += dD; - Save2(dst, d30, d31, bias, params, dstC), dst += dD; - Save2(dst, d40, d41, bias, params, dstC), dst += dD; - Save2(dst, d50, d51, bias, params, dstC), dst += dD; - } - } - else - { - d00 = _mm256_setzero_ps(); - d10 = _mm256_setzero_ps(); - d20 = _mm256_setzero_ps(); - d30 = _mm256_setzero_ps(); - d40 = _mm256_setzero_ps(); - d50 = _mm256_setzero_ps(); - for (size_t ky = 0; ky < kY; ky += dilY) - { - if (sy + ky < srcH) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - assert(sx + kx < srcW && sx + kx + 6 <= srcW); - size_t offs = beg + kx * dX, end = offs + srcC, offw = 0; - for (; offs < end; ++offs, offw += F) - { - w0 = _mm256_loadu_ps(weight0 + offw); - s0 = _mm256_set1_ps(src0[offs]), d00 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d00); - s0 = _mm256_set1_ps(src1[offs]), d10 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d10); - s0 = _mm256_set1_ps(src2[offs]), d20 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d20); - s0 = _mm256_set1_ps(src3[offs]), d30 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d30); - s0 = _mm256_set1_ps(src4[offs]), d40 = 
_mm256_add_ps(_mm256_mul_ps(s0, w0), d40); - s0 = _mm256_set1_ps(src5[offs]), d50 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d50); - } - weight0 += dW; - } - } - else - weight0 += dWz; - } - if (dstC == F) - { - Save1(dst, d00, bias, params), dst += dD; - Save1(dst, d10, bias, params), dst += dD; - Save1(dst, d20, bias, params), dst += dD; - Save1(dst, d30, bias, params), dst += dD; - Save1(dst, d40, bias, params), dst += dD; - Save1(dst, d50, bias, params), dst += dD; - } - else - { - Save1(dst, d00, bias, params, dstC), dst += dD; - Save1(dst, d10, bias, params, dstC), dst += dD; - Save1(dst, d20, bias, params, dstC), dst += dD; - Save1(dst, d30, bias, params, dstC), dst += dD; - Save1(dst, d40, bias, params, dstC), dst += dD; - Save1(dst, d50, bias, params, dstC), dst += dD; - } - } - } - - template void ConvolutionNhwcDirect_2xM(const float* src0, const ConvParam32f& p, - const AlgParam& a, size_t dy, size_t dx, size_t srcC, size_t dstC, const float* weight0, const __m256* bias, const __m256* params, float* dst) - { - __m256 d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, s0, w0, w1; - size_t srcH = p.srcH, srcW = p.srcW, dilY = p.dilationY, dilX = p.dilationX; - size_t dY = p.srcW * p.srcC, dX = p.srcC, dS = p.srcC * p.strideX, dW = p.srcC * F, dWz = p.kernelX * p.srcC * F, dD = p.dstC; - size_t sy = dy * p.strideY - p.padY, sx = dx * p.strideX - p.padX; - size_t kY = p.kernelY * p.dilationY, kX = p.kernelX * p.dilationX; - const float* weight1 = weight0 + a.stepW; - const float* src1 = src0 + 1 * dS; - const float* src2 = src0 + 2 * dS; - const float* src3 = src0 + 3 * dS; - const float* src4 = src0 + 4 * dS; - const float* src5 = src0 + 5 * dS; - if (dstC > F) - { - if (M > 0) d00 = _mm256_setzero_ps(), d01 = _mm256_setzero_ps(); - if (M > 1) d10 = _mm256_setzero_ps(), d11 = _mm256_setzero_ps(); - if (M > 2) d20 = _mm256_setzero_ps(), d21 = _mm256_setzero_ps(); - if (M > 3) d30 = _mm256_setzero_ps(), d31 = _mm256_setzero_ps(); - if (M > 4) d40 = _mm256_setzero_ps(), d41 = _mm256_setzero_ps(); - if (M > 5) d50 = _mm256_setzero_ps(), d51 = _mm256_setzero_ps(); - for (size_t ky = 0; ky < kY; ky += dilY) - { - if (sy + ky < srcH) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - assert(sx + kx < srcW && sx + kx + M <= srcW); - size_t offs = beg + kx * dX, end = offs + srcC, offw = 0; - for (; offs < end; ++offs, offw += F) - { - w0 = _mm256_loadu_ps(weight0 + offw); - w1 = _mm256_loadu_ps(weight1 + offw); - if (M > 0) s0 = _mm256_set1_ps(src0[offs]), d00 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d00), d01 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d01); - if (M > 1) s0 = _mm256_set1_ps(src1[offs]), d10 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d10), d11 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d11); - if (M > 2) s0 = _mm256_set1_ps(src2[offs]), d20 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d20), d21 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d21); - if (M > 3) s0 = _mm256_set1_ps(src3[offs]), d30 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d30), d31 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d31); - if (M > 4) s0 = _mm256_set1_ps(src4[offs]), d40 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d40), d41 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d41); - if (M > 5) s0 = _mm256_set1_ps(src5[offs]), d50 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d50), d51 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d51); - } - weight0 += dW, weight1 += dW; - } - } - else - weight0 += dWz, weight1 += dWz; - } - if (dstC == DF) - { - if (M > 0) Save2(dst, d00, d01, bias, params), dst += dD; - if (M > 1) 
Save2(dst, d10, d11, bias, params), dst += dD; - if (M > 2) Save2(dst, d20, d21, bias, params), dst += dD; - if (M > 3) Save2(dst, d30, d31, bias, params), dst += dD; - if (M > 4) Save2(dst, d40, d41, bias, params), dst += dD; - if (M > 5) Save2(dst, d50, d51, bias, params), dst += dD; - } - else - { - dstC -= F; - if (M > 0) Save2(dst, d00, d01, bias, params, dstC), dst += dD; - if (M > 1) Save2(dst, d10, d11, bias, params, dstC), dst += dD; - if (M > 2) Save2(dst, d20, d21, bias, params, dstC), dst += dD; - if (M > 3) Save2(dst, d30, d31, bias, params, dstC), dst += dD; - if (M > 4) Save2(dst, d40, d41, bias, params, dstC), dst += dD; - if (M > 5) Save2(dst, d50, d51, bias, params, dstC), dst += dD; - } - } - else - { - if (M > 0) d00 = _mm256_setzero_ps(); - if (M > 1) d10 = _mm256_setzero_ps(); - if (M > 2) d20 = _mm256_setzero_ps(); - if (M > 3) d30 = _mm256_setzero_ps(); - if (M > 4) d40 = _mm256_setzero_ps(); - if (M > 5) d50 = _mm256_setzero_ps(); - for (size_t ky = 0; ky < kY; ky += dilY) - { - if (sy + ky < srcH) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - assert(sx + kx < srcW && sx + kx + M <= srcW); - size_t offs = beg + kx * dX, end = offs + srcC, offw = 0; - for (; offs < end; ++offs, offw += F) - { - w0 = _mm256_loadu_ps(weight0 + offw); - if (M > 0) s0 = _mm256_set1_ps(src0[offs]), d00 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d00); - if (M > 1) s0 = _mm256_set1_ps(src1[offs]), d10 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d10); - if (M > 2) s0 = _mm256_set1_ps(src2[offs]), d20 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d20); - if (M > 3) s0 = _mm256_set1_ps(src3[offs]), d30 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d30); - if (M > 4) s0 = _mm256_set1_ps(src4[offs]), d40 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d40); - if (M > 5) s0 = _mm256_set1_ps(src5[offs]), d50 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d50); - } - weight0 += dW; - } - } - else - weight0 += dWz; - } - if (dstC == F) - { - if (M > 0) Save1(dst, d00, bias, params), dst += dD; - if (M > 1) Save1(dst, d10, bias, params), dst += dD; - if (M > 2) Save1(dst, d20, bias, params), dst += dD; - if (M > 3) Save1(dst, d30, bias, params), dst += dD; - if (M > 4) Save1(dst, d40, bias, params), dst += dD; - if (M > 5) Save1(dst, d50, bias, params), dst += dD; - } - else - { - if (M > 0) Save1(dst, d00, bias, params, dstC), dst += dD; - if (M > 1) Save1(dst, d10, bias, params, dstC), dst += dD; - if (M > 2) Save1(dst, d20, bias, params, dstC), dst += dD; - if (M > 3) Save1(dst, d30, bias, params, dstC), dst += dD; - if (M > 4) Save1(dst, d40, bias, params, dstC), dst += dD; - if (M > 5) Save1(dst, d50, bias, params, dstC), dst += dD; - } - } - } - - template ConvolutionNhwcDirect_NxM_Ptr GetConvolutionNhwcDirect_2xM(size_t M) - { - switch (M) - { - case 0: return NULL; - case 1: return ConvolutionNhwcDirect_2xM; - case 2: return ConvolutionNhwcDirect_2xM; - case 3: return ConvolutionNhwcDirect_2xM; - case 4: return ConvolutionNhwcDirect_2xM; - case 5: return ConvolutionNhwcDirect_2xM; - } - assert(0); - return NULL; - } - - template void ConvolutionNhwcDirect_2(const float* src, const ConvParam32f& p, const AlgParam& a, - size_t dstC, size_t yBeg, size_t yEnd, size_t srcC, const float* weight, const float* bias, const float* params, float* dst) - { - size_t noseH = p.NoseH(), noseW = p.NoseW(), bodyH = p.BodyH(), bodyW = p.BodyW(); - size_t n = 6, bodyWn = AlignLoAny(bodyW - noseW, n) + noseW, m = bodyW - bodyWn; - ConvolutionNhwcDirect_NxM_Ptr convolutionNhwcDirect_2x1 = 
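/* Each output row is computed in three phases: single-pixel calls (2x1) where the
   receptive field overlaps the left or right padding, 6-pixel calls (2x6) over the
   interior where every load is in bounds, and an M-pixel call for the interior
   remainder; the noseH/bodyH/tailH loops apply the same split vertically. */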
ConvolutionNhwcDirect_2x1; - ConvolutionNhwcDirect_NxM_Ptr convolutionNhwcDirect_2xN = ConvolutionNhwcDirect_2x6; - ConvolutionNhwcDirect_NxM_Ptr convolutionNhwcDirect_2xM = GetConvolutionNhwcDirect_2xM(m); - size_t tailH = p.dstH, tailW = p.dstW; - size_t kY = p.kernelY - noseH, kX = p.kernelX - noseW, kH = bodyH + p.kernelY - 1, kW = bodyW + p.kernelX - 1; - - __m256 _params[2], _bias[2]; - _params[0] = _mm256_set1_ps(params[0]); - if (type == ::SimdConvolutionActivationRestrictRange || type == ::SimdConvolutionActivationHswish) - _params[1] = _mm256_set1_ps(params[1]); - - for (size_t dc = 0; dc < dstC; dc += a.microD) - { - size_t dC = Simd::Min(a.microD, dstC - dc); - if (dC > 0 * F) _bias[0] = _mm256_loadu_ps(bias + dc + 0 * F); - if (dC > 1 * F) _bias[1] = _mm256_loadu_ps(bias + dc + 1 * F); - if (type == ::SimdConvolutionActivationPrelu) - { - if (dC > 0 * F) _params[0] = _mm256_loadu_ps(params + dc + 0 * F); - if (dC > 1 * F) _params[1] = _mm256_loadu_ps(params + dc + 1 * F); - } - float* d = dst + dc + yBeg * p.dstW * p.dstC; - size_t dy = yBeg; - for (; dy < noseH && dy < yEnd; dy++) - { - size_t dx = 0; - for (; dx < noseW; dx++, d += p.dstC) - convolutionNhwcDirect_2x1(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d); - for (; dx < bodyWn; dx += n, d += p.dstC * n) - convolutionNhwcDirect_2xN(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d); - for (; dx < bodyW; dx += m, d += p.dstC * m) - convolutionNhwcDirect_2xM(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d); - for (; dx < tailW; dx++, d += p.dstC) - convolutionNhwcDirect_2x1(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d); - } - for (; dy < bodyH && dy < yEnd; dy++) - { - size_t dx = 0; - for (; dx < noseW; dx++, d += p.dstC) - convolutionNhwcDirect_2x1(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d); - for (; dx < bodyWn; dx += n, d += p.dstC * n) - convolutionNhwcDirect_2xN(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d); - for (; dx < bodyW; dx += m, d += p.dstC * m) - convolutionNhwcDirect_2xM(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d); - for (; dx < tailW; dx++, d += p.dstC) - convolutionNhwcDirect_2x1(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d); - } - for (; dy < tailH && dy < yEnd; dy++) - { - size_t dx = 0; - for (; dx < noseW; dx++, d += p.dstC) - convolutionNhwcDirect_2x1(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d); - for (; dx < bodyWn; dx += n, d += p.dstC * n) - convolutionNhwcDirect_2xN(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d); - for (; dx < bodyW; dx += m, d += p.dstC * m) - convolutionNhwcDirect_2xM(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d); - for (; dx < tailW; dx++, d += p.dstC) - convolutionNhwcDirect_2x1(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d); - } - weight += p.kernelY * p.kernelX * p.srcC * a.microD; - } - } - - //--------------------------------------------------------------------- - - template void ConvolutionNhwcDirect1x1_2x6(const float* src0, const ConvParam32f& p, - const AlgParam& a, size_t srcC, size_t dstC, const float* weight0, const __m256* bias, const __m256* params, float* dst) - { - __m256 d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, s0, w0, w1; - size_t dS = p.srcC, dD = p.dstC; - const float* weight1 = weight0 + a.stepW; - const float* src1 = src0 + 1 * dS; - const float* src2 = src0 + 2 * dS; - const float* src3 = src0 + 3 * dS; - const float* src4 = src0 + 4 * dS; - const float* src5 = src0 + 5 * dS; - if (dstC > F) - { - d00 = 
_mm256_setzero_ps(), d01 = _mm256_setzero_ps(); - d10 = _mm256_setzero_ps(), d11 = _mm256_setzero_ps(); - d20 = _mm256_setzero_ps(), d21 = _mm256_setzero_ps(); - d30 = _mm256_setzero_ps(), d31 = _mm256_setzero_ps(); - d40 = _mm256_setzero_ps(), d41 = _mm256_setzero_ps(); - d50 = _mm256_setzero_ps(), d51 = _mm256_setzero_ps(); - for (size_t offs = 0, offw = 0; offs < srcC; ++offs, offw += F) - { - w0 = _mm256_loadu_ps(weight0 + offw); - w1 = _mm256_loadu_ps(weight1 + offw); - s0 = _mm256_set1_ps(src0[offs]), d00 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d00), d01 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d01); - s0 = _mm256_set1_ps(src1[offs]), d10 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d10), d11 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d11); - s0 = _mm256_set1_ps(src2[offs]), d20 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d20), d21 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d21); - s0 = _mm256_set1_ps(src3[offs]), d30 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d30), d31 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d31); - s0 = _mm256_set1_ps(src4[offs]), d40 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d40), d41 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d41); - s0 = _mm256_set1_ps(src5[offs]), d50 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d50), d51 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d51); - } - if (dstC == DF) - { - Save2(dst, d00, d01, bias, params), dst += dD; - Save2(dst, d10, d11, bias, params), dst += dD; - Save2(dst, d20, d21, bias, params), dst += dD; - Save2(dst, d30, d31, bias, params), dst += dD; - Save2(dst, d40, d41, bias, params), dst += dD; - Save2(dst, d50, d51, bias, params), dst += dD; - } - else - { - dstC -= F; - Save2(dst, d00, d01, bias, params, dstC), dst += dD; - Save2(dst, d10, d11, bias, params, dstC), dst += dD; - Save2(dst, d20, d21, bias, params, dstC), dst += dD; - Save2(dst, d30, d31, bias, params, dstC), dst += dD; - Save2(dst, d40, d41, bias, params, dstC), dst += dD; - Save2(dst, d50, d51, bias, params, dstC), dst += dD; - } - } - else - { - d00 = _mm256_setzero_ps(); - d10 = _mm256_setzero_ps(); - d20 = _mm256_setzero_ps(); - d30 = _mm256_setzero_ps(); - d40 = _mm256_setzero_ps(); - d50 = _mm256_setzero_ps(); - for (size_t offs = 0, offw = 0; offs < srcC; ++offs, offw += F) - { - w0 = _mm256_loadu_ps(weight0 + offw); - s0 = _mm256_set1_ps(src0[offs]), d00 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d00); - s0 = _mm256_set1_ps(src1[offs]), d10 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d10); - s0 = _mm256_set1_ps(src2[offs]), d20 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d20); - s0 = _mm256_set1_ps(src3[offs]), d30 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d30); - s0 = _mm256_set1_ps(src4[offs]), d40 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d40); - s0 = _mm256_set1_ps(src5[offs]), d50 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d50); - } - if (dstC == F) - { - Save1(dst, d00, bias, params), dst += dD; - Save1(dst, d10, bias, params), dst += dD; - Save1(dst, d20, bias, params), dst += dD; - Save1(dst, d30, bias, params), dst += dD; - Save1(dst, d40, bias, params), dst += dD; - Save1(dst, d50, bias, params), dst += dD; - } - else - { - Save1(dst, d00, bias, params, dstC), dst += dD; - Save1(dst, d10, bias, params, dstC), dst += dD; - Save1(dst, d20, bias, params, dstC), dst += dD; - Save1(dst, d30, bias, params, dstC), dst += dD; - Save1(dst, d40, bias, params, dstC), dst += dD; - Save1(dst, d50, bias, params, dstC), dst += dD; - } - } - } - - template void ConvolutionNhwcDirect1x1_2xM(const float* src0, const ConvParam32f& p, - const AlgParam& a, size_t srcC, size_t dstC, const float* weight0, const __m256* bias, 
const __m256* params, float* dst) - { - __m256 d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, s0, w0, w1; - size_t dS = p.srcC, dD = p.dstC; - const float* weight1 = weight0 + a.stepW; - const float* src1 = src0 + 1 * dS; - const float* src2 = src0 + 2 * dS; - const float* src3 = src0 + 3 * dS; - const float* src4 = src0 + 4 * dS; - const float* src5 = src0 + 5 * dS; - if (dstC > F) - { - if (M > 0) d00 = _mm256_setzero_ps(), d01 = _mm256_setzero_ps(); - if (M > 1) d10 = _mm256_setzero_ps(), d11 = _mm256_setzero_ps(); - if (M > 2) d20 = _mm256_setzero_ps(), d21 = _mm256_setzero_ps(); - if (M > 3) d30 = _mm256_setzero_ps(), d31 = _mm256_setzero_ps(); - if (M > 4) d40 = _mm256_setzero_ps(), d41 = _mm256_setzero_ps(); - if (M > 5) d50 = _mm256_setzero_ps(), d51 = _mm256_setzero_ps(); - for (size_t offs = 0, offw = 0; offs < srcC; ++offs, offw += F) - { - w0 = _mm256_loadu_ps(weight0 + offw); - w1 = _mm256_loadu_ps(weight1 + offw); - if (M > 0) s0 = _mm256_set1_ps(src0[offs]), d00 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d00), d01 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d01); - if (M > 1) s0 = _mm256_set1_ps(src1[offs]), d10 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d10), d11 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d11); - if (M > 2) s0 = _mm256_set1_ps(src2[offs]), d20 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d20), d21 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d21); - if (M > 3) s0 = _mm256_set1_ps(src3[offs]), d30 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d30), d31 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d31); - if (M > 4) s0 = _mm256_set1_ps(src4[offs]), d40 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d40), d41 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d41); - if (M > 5) s0 = _mm256_set1_ps(src5[offs]), d50 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d50), d51 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d51); - } - if (dstC == DF) - { - if (M > 0) Save2(dst, d00, d01, bias, params), dst += dD; - if (M > 1) Save2(dst, d10, d11, bias, params), dst += dD; - if (M > 2) Save2(dst, d20, d21, bias, params), dst += dD; - if (M > 3) Save2(dst, d30, d31, bias, params), dst += dD; - if (M > 4) Save2(dst, d40, d41, bias, params), dst += dD; - if (M > 5) Save2(dst, d50, d51, bias, params), dst += dD; - } - else - { - dstC -= F; - if (M > 0) Save2(dst, d00, d01, bias, params, dstC), dst += dD; - if (M > 1) Save2(dst, d10, d11, bias, params, dstC), dst += dD; - if (M > 2) Save2(dst, d20, d21, bias, params, dstC), dst += dD; - if (M > 3) Save2(dst, d30, d31, bias, params, dstC), dst += dD; - if (M > 4) Save2(dst, d40, d41, bias, params, dstC), dst += dD; - if (M > 5) Save2(dst, d50, d51, bias, params, dstC), dst += dD; - } - } - else - { - if (M > 0) d00 = _mm256_setzero_ps(); - if (M > 1) d10 = _mm256_setzero_ps(); - if (M > 2) d20 = _mm256_setzero_ps(); - if (M > 3) d30 = _mm256_setzero_ps(); - if (M > 4) d40 = _mm256_setzero_ps(); - if (M > 5) d50 = _mm256_setzero_ps(); - for (size_t offs = 0, offw = 0; offs < srcC; ++offs, offw += F) - { - w0 = _mm256_loadu_ps(weight0 + offw); - if (M > 0) s0 = _mm256_set1_ps(src0[offs]), d00 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d00); - if (M > 1) s0 = _mm256_set1_ps(src1[offs]), d10 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d10); - if (M > 2) s0 = _mm256_set1_ps(src2[offs]), d20 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d20); - if (M > 3) s0 = _mm256_set1_ps(src3[offs]), d30 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d30); - if (M > 4) s0 = _mm256_set1_ps(src4[offs]), d40 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d40); - if (M > 5) s0 = _mm256_set1_ps(src5[offs]), d50 = 
_mm256_add_ps(_mm256_mul_ps(s0, w0), d50); - } - if (dstC == F) - { - if (M > 0) Save1(dst, d00, bias, params), dst += dD; - if (M > 1) Save1(dst, d10, bias, params), dst += dD; - if (M > 2) Save1(dst, d20, bias, params), dst += dD; - if (M > 3) Save1(dst, d30, bias, params), dst += dD; - if (M > 4) Save1(dst, d40, bias, params), dst += dD; - if (M > 5) Save1(dst, d50, bias, params), dst += dD; - } - else - { - if (M > 0) Save1(dst, d00, bias, params, dstC), dst += dD; - if (M > 1) Save1(dst, d10, bias, params, dstC), dst += dD; - if (M > 2) Save1(dst, d20, bias, params, dstC), dst += dD; - if (M > 3) Save1(dst, d30, bias, params, dstC), dst += dD; - if (M > 4) Save1(dst, d40, bias, params, dstC), dst += dD; - if (M > 5) Save1(dst, d50, bias, params, dstC), dst += dD; - } - } - } - - template ConvolutionNhwcDirect1x1_NxM_Ptr GetConvolutionNhwcDirect1x1_2xM(size_t M) - { - switch (M) - { - case 0: return NULL; - case 1: return ConvolutionNhwcDirect1x1_2xM; - case 2: return ConvolutionNhwcDirect1x1_2xM; - case 3: return ConvolutionNhwcDirect1x1_2xM; - case 4: return ConvolutionNhwcDirect1x1_2xM; - case 5: return ConvolutionNhwcDirect1x1_2xM; - } - assert(0); - return NULL; - } - - template void ConvolutionNhwcDirect1x1_2(const float* src, const ConvParam32f& p, const AlgParam& a, - size_t dstC, size_t yBeg, size_t yEnd, size_t srcC, const float* weight, const float* bias, const float* params, float* dst) - { - size_t n = 6, n1 = (yEnd - yBeg) * p.dstW, nn = AlignLoAny(n1, n), m = n1 - nn; - ConvolutionNhwcDirect1x1_NxM_Ptr convolutionNhwcDirect1x1_2xN = ConvolutionNhwcDirect1x1_2x6; - ConvolutionNhwcDirect1x1_NxM_Ptr convolutionNhwcDirect1x1_2xM = GetConvolutionNhwcDirect1x1_2xM(m); - - __m256 _params[2], _bias[2]; - _params[0] = _mm256_set1_ps(params[0]); - if (type == ::SimdConvolutionActivationRestrictRange || type == ::SimdConvolutionActivationHswish) - _params[1] = _mm256_set1_ps(params[1]); - - for (size_t dc = 0; dc < dstC; dc += a.microD) - { - size_t dC = Simd::Min(a.microD, dstC - dc); - if (dC > 0 * F) _bias[0] = _mm256_loadu_ps(bias + dc + 0 * F); - if (dC > 1 * F) _bias[1] = _mm256_loadu_ps(bias + dc + 1 * F); - if (type == ::SimdConvolutionActivationPrelu) - { - if (dC > 0 * F) _params[0] = _mm256_loadu_ps(params + dc + 0 * F); - if (dC > 1 * F) _params[1] = _mm256_loadu_ps(params + dc + 1 * F); - } - const float* ps = src + yBeg * p.srcW * p.srcC; - float* pd = dst + dc + yBeg * p.dstW * p.dstC; - size_t i = 0; - for (; i < nn; i += n, ps += n * p.srcC, pd += n * p.dstC) - convolutionNhwcDirect1x1_2xN(ps, p, a, srcC, dC, weight, _bias, _params, pd); - for (; i < n1; i += m, ps += m * p.srcC, pd += m * p.dstC) - convolutionNhwcDirect1x1_2xM(ps, p, a, srcC, dC, weight, _bias, _params, pd); - weight += p.srcC * a.microD; - } - } - - //--------------------------------------------------------------------- - - template static SIMD_INLINE void Set(const ConvParam32f& p, AlgParam& a) - { - a.convolutions[term] = p.Is1x1() ? 
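/* For 1x1 convolutions the ky/kx kernel loops degenerate to a single tap, so a dedicated
   implementation that walks the source as a dense matrix is registered instead of the
   general direct kernel. */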
ConvolutionNhwcDirect1x1_2<term, type> : ConvolutionNhwcDirect_2<term, type>; - } - - template<SimdConvolutionActivationType type> static SIMD_INLINE void Set(const ConvParam32f& p, AlgParam& a) - { - Set<TermSingle, type>(p, a); - Set<TermFirst, SimdConvolutionActivationIdentity>(p, a); - Set<TermIterim, SimdConvolutionActivationIdentity>(p, a); - Set<TermLast, type>(p, a); - } - - bool SynetConvolution32fNhwcDirect::Set2r(const ConvParam32f& p, AlgParam& a) - { - assert(a.microD == 2 * F); - switch (p.activation) - { - case SimdConvolutionActivationIdentity: Set<SimdConvolutionActivationIdentity>(p, a); break; - case SimdConvolutionActivationRelu: Set<SimdConvolutionActivationRelu>(p, a); break; - case SimdConvolutionActivationLeakyRelu: Set<SimdConvolutionActivationLeakyRelu>(p, a); break; - case SimdConvolutionActivationRestrictRange: Set<SimdConvolutionActivationRestrictRange>(p, a); break; - case SimdConvolutionActivationPrelu: Set<SimdConvolutionActivationPrelu>(p, a); break; - case SimdConvolutionActivationHswish: Set<SimdConvolutionActivationHswish>(p, a); break; - default: return false; - } - return true; - } - } -#endif//SIMD_AVX_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx1SynetConvolution32fNhwcDirect3r.cpp b/src/3rd/Simd/Simd/SimdAvx1SynetConvolution32fNhwcDirect3r.cpp deleted file mode 100644 index 8af4f56a..00000000 --- a/src/3rd/Simd/Simd/SimdAvx1SynetConvolution32fNhwcDirect3r.cpp +++ /dev/null @@ -1,807 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE.
-*/ -#include "Simd/SimdSynetConvolution32f.h" -#include "Simd/SimdSynetConvolution32fCommon.h" -#include "Simd/SimdCpu.h" - -namespace Simd -{ -#ifdef SIMD_AVX_ENABLE - namespace Avx - { - using AlgParam = SynetConvolution32fNhwcDirect::AlgParam; - - typedef void(*ConvolutionNhwcDirect_NxM_Ptr)(const float* src0, const ConvParam32f& p, const AlgParam& a, size_t dy, size_t dx, size_t srcC, size_t dstC, const float* weight0, const __m256* bias, const __m256* params, float* dst); - typedef void(*ConvolutionNhwcDirect1x1_NxM_Ptr)(const float* src0, const ConvParam32f& p, const AlgParam& a, size_t srcC, size_t dstC, const float* weight0, const __m256* bias, const __m256* params, float* dst); - - //--------------------------------------------------------------------- - - template void ConvolutionNhwcDirect_3x1(const float* src0, const ConvParam32f& p, - const AlgParam& a, size_t dy, size_t dx, size_t srcC, size_t dstC, const float* weight0, const __m256* bias, const __m256* params, float* dst) - { - __m256 d00, d01, d02, s0, w0, w1, w2; - size_t srcH = p.srcH, srcW = p.srcW, dilY = p.dilationY, dilX = p.dilationX; - size_t dY = p.srcW * p.srcC, dX = p.srcC, dS = p.srcC * p.strideX, dW = p.srcC * F; - size_t sy = dy * p.strideY - p.padY, sx = dx * p.strideX - p.padX; - size_t kY = p.kernelY * p.dilationY, kX = p.kernelX * p.dilationX; - const float* weight1 = weight0 + a.stepW; - const float* weight2 = weight1 + a.stepW; - if (dstC > 2 * F) - { - d00 = _mm256_setzero_ps(), d01 = _mm256_setzero_ps(), d02 = _mm256_setzero_ps(); - for (size_t ky = 0; ky < kY; ky += dilY) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - if (sy + ky < srcH && sx + kx < srcW) - { - size_t offs = beg + kx * dX, end = offs + srcC, offw = 0; - for (; offs < end; ++offs, offw += F) - { - w0 = _mm256_loadu_ps(weight0 + offw); - w1 = _mm256_loadu_ps(weight1 + offw); - w2 = _mm256_loadu_ps(weight2 + offw); - s0 = _mm256_set1_ps(src0[offs]), d00 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d00), d01 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d01), d02 = _mm256_add_ps(_mm256_mul_ps(s0, w2), d02); - } - } - weight0 += dW, weight1 += dW, weight2 += dW; - } - } - if (dstC == 3 * F) - Save3(dst, d00, d01, d02, bias, params); - else - Save3(dst, d00, d01, d02, bias, params, dstC - 2 * F); - } - else if (dstC > F) - { - d00 = _mm256_setzero_ps(), d01 = _mm256_setzero_ps(); - for (size_t ky = 0; ky < kY; ky += dilY) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - if (sy + ky < srcH && sx + kx < srcW) - { - size_t offs = beg + kx * dX, end = offs + srcC, offw = 0; - for (; offs < end; ++offs, offw += F) - { - w0 = _mm256_loadu_ps(weight0 + offw); - w1 = _mm256_loadu_ps(weight1 + offw); - s0 = _mm256_set1_ps(src0[offs]), d00 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d00), d01 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d01); - } - } - weight0 += dW, weight1 += dW; - } - } - if (dstC == 2 * F) - Save2(dst, d00, d01, bias, params); - else - Save2(dst, d00, d01, bias, params, dstC - F); - } - else - { - d00 = _mm256_setzero_ps(); - for (size_t ky = 0; ky < kY; ky += dilY) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - if (sy + ky < srcH && sx + kx < srcW) - { - size_t offs = beg + kx * dX, end = offs + srcC, offw = 0; - for (; offs < end; ++offs, offw += F) - { - w0 = _mm256_loadu_ps(weight0 + offw); - s0 = _mm256_set1_ps(src0[offs]), d00 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d00); - } - } - weight0 += dW; - } - } - if 
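/* This translation unit widens the blocking to three output vectors per pixel (up to
   3 * F = 24 channels); to stay within 16 YMM registers the pixel count per call drops
   from 6 to 4, giving 3 x 4 = 12 accumulators plus three weight vectors and the
   broadcast s0. */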
(dstC == F) - Save1(dst, d00, bias, params); - else - Save1(dst, d00, bias, params, dstC); - } - } - - template void ConvolutionNhwcDirect_3x4(const float* src0, const ConvParam32f& p, - const AlgParam& a, size_t dy, size_t dx, size_t srcC, size_t dstC, const float* weight0, const __m256* bias, const __m256* params, float* dst) - { - __m256 d00, d01, d02, d10, d11, d12, d20, d21, d22, d30, d31, d32, s0, w0, w1, w2; - size_t srcH = p.srcH, srcW = p.srcW, dilY = p.dilationY, dilX = p.dilationX; - size_t dY = p.srcW * p.srcC, dX = p.srcC, dS = p.srcC * p.strideX, dW = p.srcC * F, dWz = p.kernelX * p.srcC * F, dD = p.dstC; - size_t sy = dy * p.strideY - p.padY, sx = dx * p.strideX - p.padX; - size_t kY = p.kernelY * p.dilationY, kX = p.kernelX * p.dilationX; - const float* weight1 = weight0 + a.stepW; - const float* weight2 = weight1 + a.stepW; - const float* src1 = src0 + 1 * dS; - const float* src2 = src0 + 2 * dS; - const float* src3 = src0 + 3 * dS; - if (dstC > 2 * F) - { - d00 = _mm256_setzero_ps(), d01 = _mm256_setzero_ps(), d02 = _mm256_setzero_ps(); - d10 = _mm256_setzero_ps(), d11 = _mm256_setzero_ps(), d12 = _mm256_setzero_ps(); - d20 = _mm256_setzero_ps(), d21 = _mm256_setzero_ps(), d22 = _mm256_setzero_ps(); - d30 = _mm256_setzero_ps(), d31 = _mm256_setzero_ps(), d32 = _mm256_setzero_ps(); - for (size_t ky = 0; ky < kY; ky += dilY) - { - if (sy + ky < srcH) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - assert(sx + kx < srcW && sx + kx + 4 <= srcW); - size_t offs = beg + kx * dX, end = offs + srcC, offw = 0; - for (; offs < end; ++offs, offw += F) - { - w0 = _mm256_loadu_ps(weight0 + offw); - w1 = _mm256_loadu_ps(weight1 + offw); - w2 = _mm256_loadu_ps(weight2 + offw); - s0 = _mm256_set1_ps(src0[offs]), d00 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d00), d01 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d01), d02 = _mm256_add_ps(_mm256_mul_ps(s0, w2), d02); - s0 = _mm256_set1_ps(src1[offs]), d10 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d10), d11 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d11), d12 = _mm256_add_ps(_mm256_mul_ps(s0, w2), d12); - s0 = _mm256_set1_ps(src2[offs]), d20 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d20), d21 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d21), d22 = _mm256_add_ps(_mm256_mul_ps(s0, w2), d22); - s0 = _mm256_set1_ps(src3[offs]), d30 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d30), d31 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d31), d32 = _mm256_add_ps(_mm256_mul_ps(s0, w2), d32); - } - weight0 += dW, weight1 += dW, weight2 += dW; - } - } - else - weight0 += dWz, weight1 += dWz, weight2 += dWz; - } - if (dstC == 3 * F) - { - Save3(dst, d00, d01, d02, bias, params), dst += dD; - Save3(dst, d10, d11, d12, bias, params), dst += dD; - Save3(dst, d20, d21, d22, bias, params), dst += dD; - Save3(dst, d30, d31, d32, bias, params), dst += dD; - } - else - { - dstC -= 2 * F; - Save3(dst, d00, d01, d02, bias, params, dstC), dst += dD; - Save3(dst, d10, d11, d12, bias, params, dstC), dst += dD; - Save3(dst, d20, d21, d22, bias, params, dstC), dst += dD; - Save3(dst, d30, d31, d32, bias, params, dstC), dst += dD; - } - } - else if (dstC > F) - { - d00 = _mm256_setzero_ps(), d01 = _mm256_setzero_ps(); - d10 = _mm256_setzero_ps(), d11 = _mm256_setzero_ps(); - d20 = _mm256_setzero_ps(), d21 = _mm256_setzero_ps(); - d30 = _mm256_setzero_ps(), d31 = _mm256_setzero_ps(); - for (size_t ky = 0; ky < kY; ky += dilY) - { - if (sy + ky < srcH) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - assert(sx + kx < 
srcW && sx + kx + 4 <= srcW); - size_t offs = beg + kx * dX, end = offs + srcC, offw = 0; - for (; offs < end; ++offs, offw += F) - { - w0 = _mm256_loadu_ps(weight0 + offw); - w1 = _mm256_loadu_ps(weight1 + offw); - s0 = _mm256_set1_ps(src0[offs]), d00 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d00), d01 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d01); - s0 = _mm256_set1_ps(src1[offs]), d10 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d10), d11 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d11); - s0 = _mm256_set1_ps(src2[offs]), d20 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d20), d21 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d21); - s0 = _mm256_set1_ps(src3[offs]), d30 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d30), d31 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d31); - } - weight0 += dW, weight1 += dW; - } - } - else - weight0 += dWz, weight1 += dWz; - } - if (dstC == 2 * F) - { - Save2(dst, d00, d01, bias, params), dst += dD; - Save2(dst, d10, d11, bias, params), dst += dD; - Save2(dst, d20, d21, bias, params), dst += dD; - Save2(dst, d30, d31, bias, params), dst += dD; - } - else - { - dstC -= 1 * F; - Save2(dst, d00, d01, bias, params, dstC), dst += dD; - Save2(dst, d10, d11, bias, params, dstC), dst += dD; - Save2(dst, d20, d21, bias, params, dstC), dst += dD; - Save2(dst, d30, d31, bias, params, dstC), dst += dD; - } - } - else - { - d00 = _mm256_setzero_ps(); - d10 = _mm256_setzero_ps(); - d20 = _mm256_setzero_ps(); - d30 = _mm256_setzero_ps(); - for (size_t ky = 0; ky < kY; ky += dilY) - { - if (sy + ky < srcH) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - assert(sx + kx < srcW && sx + kx + 4 <= srcW); - size_t offs = beg + kx * dX, end = offs + srcC, offw = 0; - for (; offs < end; ++offs, offw += F) - { - w0 = _mm256_loadu_ps(weight0 + offw); - s0 = _mm256_set1_ps(src0[offs]), d00 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d00); - s0 = _mm256_set1_ps(src1[offs]), d10 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d10); - s0 = _mm256_set1_ps(src2[offs]), d20 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d20); - s0 = _mm256_set1_ps(src3[offs]), d30 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d30); - } - weight0 += dW; - } - } - else - weight0 += dWz; - } - if (dstC == F) - { - Save1(dst, d00, bias, params), dst += dD; - Save1(dst, d10, bias, params), dst += dD; - Save1(dst, d20, bias, params), dst += dD; - Save1(dst, d30, bias, params), dst += dD; - } - else - { - Save1(dst, d00, bias, params, dstC), dst += dD; - Save1(dst, d10, bias, params, dstC), dst += dD; - Save1(dst, d20, bias, params, dstC), dst += dD; - Save1(dst, d30, bias, params, dstC), dst += dD; - } - } - } - - template void ConvolutionNhwcDirect_3xM(const float* src0, const ConvParam32f& p, - const AlgParam& a, size_t dy, size_t dx, size_t srcC, size_t dstC, const float* weight0, const __m256* bias, const __m256* params, float* dst) - { - __m256 d00, d01, d02, d10, d11, d12, d20, d21, d22, d30, d31, d32, s0, w0, w1, w2; - size_t srcH = p.srcH, srcW = p.srcW, dilY = p.dilationY, dilX = p.dilationX; - size_t dY = p.srcW * p.srcC, dX = p.srcC, dS = p.srcC * p.strideX, dW = p.srcC * F, dWz = p.kernelX * p.srcC * F, dD = p.dstC; - size_t sy = dy * p.strideY - p.padY, sx = dx * p.strideX - p.padX; - size_t kY = p.kernelY * p.dilationY, kX = p.kernelX * p.dilationX; - const float* weight1 = weight0 + a.stepW; - const float* weight2 = weight1 + a.stepW; - const float* src1 = src0 + 1 * dS; - const float* src2 = src0 + 2 * dS; - const float* src3 = src0 + 3 * dS; - if (dstC > 2 * F) - { - if (M > 0) d00 = _mm256_setzero_ps(), d01 = 
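/* As in the two-vector file, M is a compile-time row count (the tail dispatcher
   GetConvolutionNhwcDirect_3xM below instantiates M = 1..3), and the dstC > 2 * F,
   dstC > F and final branches choose how many of the three per-pixel accumulators
   are live for the last, possibly partial, block of output channels. */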
_mm256_setzero_ps(), d02 = _mm256_setzero_ps(); - if (M > 1) d10 = _mm256_setzero_ps(), d11 = _mm256_setzero_ps(), d12 = _mm256_setzero_ps(); - if (M > 2) d20 = _mm256_setzero_ps(), d21 = _mm256_setzero_ps(), d22 = _mm256_setzero_ps(); - if (M > 3) d30 = _mm256_setzero_ps(), d31 = _mm256_setzero_ps(), d32 = _mm256_setzero_ps(); - for (size_t ky = 0; ky < kY; ky += dilY) - { - if (sy + ky < srcH) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - assert(sx + kx < srcW && sx + kx + M <= srcW); - size_t offs = beg + kx * dX, end = offs + srcC, offw = 0; - for (; offs < end; ++offs, offw += F) - { - w0 = _mm256_loadu_ps(weight0 + offw); - w1 = _mm256_loadu_ps(weight1 + offw); - w2 = _mm256_loadu_ps(weight2 + offw); - if (M > 0) s0 = _mm256_set1_ps(src0[offs]), d00 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d00), d01 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d01), d02 = _mm256_add_ps(_mm256_mul_ps(s0, w2), d02); - if (M > 1) s0 = _mm256_set1_ps(src1[offs]), d10 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d10), d11 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d11), d12 = _mm256_add_ps(_mm256_mul_ps(s0, w2), d12); - if (M > 2) s0 = _mm256_set1_ps(src2[offs]), d20 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d20), d21 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d21), d22 = _mm256_add_ps(_mm256_mul_ps(s0, w2), d22); - if (M > 3) s0 = _mm256_set1_ps(src3[offs]), d30 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d30), d31 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d31), d32 = _mm256_add_ps(_mm256_mul_ps(s0, w2), d32); - } - weight0 += dW, weight1 += dW, weight2 += dW; - } - } - else - weight0 += dWz, weight1 += dWz, weight2 += dWz; - } - if (dstC == 3 * F) - { - if (M > 0) Save3(dst, d00, d01, d02, bias, params), dst += dD; - if (M > 1) Save3(dst, d10, d11, d12, bias, params), dst += dD; - if (M > 2) Save3(dst, d20, d21, d22, bias, params), dst += dD; - if (M > 3) Save3(dst, d30, d31, d32, bias, params), dst += dD; - } - else - { - dstC -= 2 * F; - if (M > 0) Save3(dst, d00, d01, d02, bias, params, dstC), dst += dD; - if (M > 1) Save3(dst, d10, d11, d12, bias, params, dstC), dst += dD; - if (M > 2) Save3(dst, d20, d21, d22, bias, params, dstC), dst += dD; - if (M > 3) Save3(dst, d30, d31, d32, bias, params, dstC), dst += dD; - } - } - else if (dstC > F) - { - if (M > 0) d00 = _mm256_setzero_ps(), d01 = _mm256_setzero_ps(); - if (M > 1) d10 = _mm256_setzero_ps(), d11 = _mm256_setzero_ps(); - if (M > 2) d20 = _mm256_setzero_ps(), d21 = _mm256_setzero_ps(); - if (M > 3) d30 = _mm256_setzero_ps(), d31 = _mm256_setzero_ps(); - for (size_t ky = 0; ky < kY; ky += dilY) - { - if (sy + ky < srcH) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - assert(sx + kx < srcW && sx + kx + M <= srcW); - size_t offs = beg + kx * dX, end = offs + srcC, offw = 0; - for (; offs < end; ++offs, offw += F) - { - w0 = _mm256_loadu_ps(weight0 + offw); - w1 = _mm256_loadu_ps(weight1 + offw); - if (M > 0) s0 = _mm256_set1_ps(src0[offs]), d00 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d00), d01 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d01); - if (M > 1) s0 = _mm256_set1_ps(src1[offs]), d10 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d10), d11 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d11); - if (M > 2) s0 = _mm256_set1_ps(src2[offs]), d20 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d20), d21 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d21); - if (M > 3) s0 = _mm256_set1_ps(src3[offs]), d30 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d30), d31 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d31); - } - weight0 += dW, 
weight1 += dW; - } - } - else - weight0 += dWz, weight1 += dWz; - } - if (dstC == 2 * F) - { - if (M > 0) Save2(dst, d00, d01, bias, params), dst += dD; - if (M > 1) Save2(dst, d10, d11, bias, params), dst += dD; - if (M > 2) Save2(dst, d20, d21, bias, params), dst += dD; - if (M > 3) Save2(dst, d30, d31, bias, params), dst += dD; - } - else - { - dstC -= 1 * F; - if (M > 0) Save2(dst, d00, d01, bias, params, dstC), dst += dD; - if (M > 1) Save2(dst, d10, d11, bias, params, dstC), dst += dD; - if (M > 2) Save2(dst, d20, d21, bias, params, dstC), dst += dD; - if (M > 3) Save2(dst, d30, d31, bias, params, dstC), dst += dD; - } - } - else - { - if (M > 0) d00 = _mm256_setzero_ps(); - if (M > 1) d10 = _mm256_setzero_ps(); - if (M > 2) d20 = _mm256_setzero_ps(); - if (M > 3) d30 = _mm256_setzero_ps(); - for (size_t ky = 0; ky < kY; ky += dilY) - { - if (sy + ky < srcH) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - assert(sx + kx < srcW && sx + kx + M <= srcW); - size_t offs = beg + kx * dX, end = offs + srcC, offw = 0; - for (; offs < end; ++offs, offw += F) - { - w0 = _mm256_loadu_ps(weight0 + offw); - if (M > 0) s0 = _mm256_set1_ps(src0[offs]), d00 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d00); - if (M > 1) s0 = _mm256_set1_ps(src1[offs]), d10 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d10); - if (M > 2) s0 = _mm256_set1_ps(src2[offs]), d20 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d20); - if (M > 3) s0 = _mm256_set1_ps(src3[offs]), d30 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d30); - } - weight0 += dW; - } - } - else - weight0 += dWz; - } - if (dstC == F) - { - if (M > 0) Save1(dst, d00, bias, params), dst += dD; - if (M > 1) Save1(dst, d10, bias, params), dst += dD; - if (M > 2) Save1(dst, d20, bias, params), dst += dD; - if (M > 3) Save1(dst, d30, bias, params), dst += dD; - } - else - { - if (M > 0) Save1(dst, d00, bias, params, dstC), dst += dD; - if (M > 1) Save1(dst, d10, bias, params, dstC), dst += dD; - if (M > 2) Save1(dst, d20, bias, params, dstC), dst += dD; - if (M > 3) Save1(dst, d30, bias, params, dstC), dst += dD; - } - } - } - - template ConvolutionNhwcDirect_NxM_Ptr GetConvolutionNhwcDirect_3xM(size_t M) - { - switch (M) - { - case 0: return NULL; - case 1: return ConvolutionNhwcDirect_3xM; - case 2: return ConvolutionNhwcDirect_3xM; - case 3: return ConvolutionNhwcDirect_3xM; - } - assert(0); - return NULL; - } - - template void ConvolutionNhwcDirect_3(const float* src, const ConvParam32f& p, const AlgParam& a, - size_t dstC, size_t yBeg, size_t yEnd, size_t srcC, const float* weight, const float* bias, const float* params, float* dst) - { - size_t noseH = p.NoseH(), noseW = p.NoseW(), bodyH = p.BodyH(), bodyW = p.BodyW(); - size_t n = 4, bodyWn = AlignLoAny(bodyW - noseW, n) + noseW, m = bodyW - bodyWn; - ConvolutionNhwcDirect_NxM_Ptr convolutionNhwcDirect_3x1 = ConvolutionNhwcDirect_3x1; - ConvolutionNhwcDirect_NxM_Ptr convolutionNhwcDirect_3xN = ConvolutionNhwcDirect_3x4; - ConvolutionNhwcDirect_NxM_Ptr convolutionNhwcDirect_3xM = GetConvolutionNhwcDirect_3xM(m); - size_t tailH = p.dstH, tailW = p.dstW; - size_t kY = p.kernelY - noseH, kX = p.kernelX - noseW, kH = bodyH + p.kernelY - 1, kW = bodyW + p.kernelX - 1; - - __m256 _params[3], _bias[3]; - _params[0] = _mm256_set1_ps(params[0]); - if (type == ::SimdConvolutionActivationRestrictRange || type == ::SimdConvolutionActivationHswish) - _params[1] = _mm256_set1_ps(params[1]); - - for (size_t dc = 0; dc < dstC; dc += a.microD) - { - size_t dC = Simd::Min(a.microD, dstC - dc); - if 
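/* Bias and PReLU parameter vectors are hoisted out of the spatial loops: one __m256 per
   F-wide slice of the current output-channel tile, loaded once per dc iteration and then
   reused by every Save in the strip. */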
(dC > 0 * F) _bias[0] = _mm256_loadu_ps(bias + dc + 0 * F); - if (dC > 1 * F) _bias[1] = _mm256_loadu_ps(bias + dc + 1 * F); - if (dC > 2 * F) _bias[2] = _mm256_loadu_ps(bias + dc + 2 * F); - if (type == ::SimdConvolutionActivationPrelu) - { - if (dC > 0 * F) _params[0] = _mm256_loadu_ps(params + dc + 0 * F); - if (dC > 1 * F) _params[1] = _mm256_loadu_ps(params + dc + 1 * F); - if (dC > 2 * F) _params[2] = _mm256_loadu_ps(params + dc + 2 * F); - } - float* d = dst + dc + yBeg * p.dstW * p.dstC; - size_t dy = yBeg; - for (; dy < noseH && dy < yEnd; dy++) - { - size_t dx = 0; - for (; dx < noseW; dx++, d += p.dstC) - convolutionNhwcDirect_3x1(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d); - for (; dx < bodyWn; dx += n, d += p.dstC * n) - convolutionNhwcDirect_3xN(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d); - for (; dx < bodyW; dx += m, d += p.dstC * m) - convolutionNhwcDirect_3xM(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d); - for (; dx < tailW; dx++, d += p.dstC) - convolutionNhwcDirect_3x1(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d); - } - for (; dy < bodyH && dy < yEnd; dy++) - { - size_t dx = 0; - for (; dx < noseW; dx++, d += p.dstC) - convolutionNhwcDirect_3x1(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d); - for (; dx < bodyWn; dx += n, d += p.dstC * n) - convolutionNhwcDirect_3xN(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d); - for (; dx < bodyW; dx += m, d += p.dstC * m) - convolutionNhwcDirect_3xM(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d); - for (; dx < tailW; dx++, d += p.dstC) - convolutionNhwcDirect_3x1(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d); - } - for (; dy < tailH && dy < yEnd; dy++) - { - size_t dx = 0; - for (; dx < noseW; dx++, d += p.dstC) - convolutionNhwcDirect_3x1(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d); - for (; dx < bodyWn; dx += n, d += p.dstC * n) - convolutionNhwcDirect_3xN(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d); - for (; dx < bodyW; dx += m, d += p.dstC * m) - convolutionNhwcDirect_3xM(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d); - for (; dx < tailW; dx++, d += p.dstC) - convolutionNhwcDirect_3x1(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d); - } - weight += p.kernelY * p.kernelX * p.srcC * a.microD; - } - } - - //--------------------------------------------------------------------- - - template void ConvolutionNhwcDirect1x1_3x4(const float* src0, const ConvParam32f& p, - const AlgParam& a, size_t srcC, size_t dstC, const float* weight0, const __m256* bias, const __m256* params, float* dst) - { - __m256 d00, d01, d02, d10, d11, d12, d20, d21, d22, d30, d31, d32, s0, w0, w1, w2; - size_t dS = p.srcC, dD = p.dstC; - const float* weight1 = weight0 + a.stepW; - const float* weight2 = weight1 + a.stepW; - const float* src1 = src0 + 1 * dS; - const float* src2 = src0 + 2 * dS; - const float* src3 = src0 + 3 * dS; - if (dstC > 2 * F) - { - d00 = _mm256_setzero_ps(), d01 = _mm256_setzero_ps(), d02 = _mm256_setzero_ps(); - d10 = _mm256_setzero_ps(), d11 = _mm256_setzero_ps(), d12 = _mm256_setzero_ps(); - d20 = _mm256_setzero_ps(), d21 = _mm256_setzero_ps(), d22 = _mm256_setzero_ps(); - d30 = _mm256_setzero_ps(), d31 = _mm256_setzero_ps(), d32 = _mm256_setzero_ps(); - for (size_t offs = 0, offw = 0; offs < srcC; ++offs, offw += F) - { - w0 = _mm256_loadu_ps(weight0 + offw); - w1 = _mm256_loadu_ps(weight1 + offw); - w2 = _mm256_loadu_ps(weight2 + offw); - s0 = _mm256_set1_ps(src0[offs]), d00 = _mm256_add_ps(_mm256_mul_ps(s0, 
w0), d00), d01 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d01), d02 = _mm256_add_ps(_mm256_mul_ps(s0, w2), d02); - s0 = _mm256_set1_ps(src1[offs]), d10 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d10), d11 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d11), d12 = _mm256_add_ps(_mm256_mul_ps(s0, w2), d12); - s0 = _mm256_set1_ps(src2[offs]), d20 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d20), d21 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d21), d22 = _mm256_add_ps(_mm256_mul_ps(s0, w2), d22); - s0 = _mm256_set1_ps(src3[offs]), d30 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d30), d31 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d31), d32 = _mm256_add_ps(_mm256_mul_ps(s0, w2), d32); - } - if (dstC == 3 * F) - { - Save3(dst, d00, d01, d02, bias, params), dst += dD; - Save3(dst, d10, d11, d12, bias, params), dst += dD; - Save3(dst, d20, d21, d22, bias, params), dst += dD; - Save3(dst, d30, d31, d32, bias, params), dst += dD; - } - else - { - dstC -= 2 * F; - Save3(dst, d00, d01, d02, bias, params, dstC), dst += dD; - Save3(dst, d10, d11, d12, bias, params, dstC), dst += dD; - Save3(dst, d20, d21, d22, bias, params, dstC), dst += dD; - Save3(dst, d30, d31, d32, bias, params, dstC), dst += dD; - } - } - else if (dstC > F) - { - d00 = _mm256_setzero_ps(), d01 = _mm256_setzero_ps(); - d10 = _mm256_setzero_ps(), d11 = _mm256_setzero_ps(); - d20 = _mm256_setzero_ps(), d21 = _mm256_setzero_ps(); - d30 = _mm256_setzero_ps(), d31 = _mm256_setzero_ps(); - for (size_t offs = 0, offw = 0; offs < srcC; ++offs, offw += F) - { - w0 = _mm256_loadu_ps(weight0 + offw); - w1 = _mm256_loadu_ps(weight1 + offw); - s0 = _mm256_set1_ps(src0[offs]), d00 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d00), d01 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d01); - s0 = _mm256_set1_ps(src1[offs]), d10 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d10), d11 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d11); - s0 = _mm256_set1_ps(src2[offs]), d20 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d20), d21 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d21); - s0 = _mm256_set1_ps(src3[offs]), d30 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d30), d31 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d31); - } - if (dstC == 2 * F) - { - Save2(dst, d00, d01, bias, params), dst += dD; - Save2(dst, d10, d11, bias, params), dst += dD; - Save2(dst, d20, d21, bias, params), dst += dD; - Save2(dst, d30, d31, bias, params), dst += dD; - } - else - { - dstC -= 1 * F; - Save2(dst, d00, d01, bias, params, dstC), dst += dD; - Save2(dst, d10, d11, bias, params, dstC), dst += dD; - Save2(dst, d20, d21, bias, params, dstC), dst += dD; - Save2(dst, d30, d31, bias, params, dstC), dst += dD; - } - } - else - { - d00 = _mm256_setzero_ps(); - d10 = _mm256_setzero_ps(); - d20 = _mm256_setzero_ps(); - d30 = _mm256_setzero_ps(); - for (size_t offs = 0, offw = 0; offs < srcC; ++offs, offw += F) - { - w0 = _mm256_loadu_ps(weight0 + offw); - s0 = _mm256_set1_ps(src0[offs]), d00 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d00); - s0 = _mm256_set1_ps(src1[offs]), d10 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d10); - s0 = _mm256_set1_ps(src2[offs]), d20 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d20); - s0 = _mm256_set1_ps(src3[offs]), d30 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d30); - } - if (dstC == F) - { - Save1(dst, d00, bias, params), dst += dD; - Save1(dst, d10, bias, params), dst += dD; - Save1(dst, d20, bias, params), dst += dD; - Save1(dst, d30, bias, params), dst += dD; - } - else - { - Save1(dst, d00, bias, params, dstC), dst += dD; - Save1(dst, d10, bias, params, dstC), dst += dD; - Save1(dst, d20, bias, params, dstC), dst += dD; - Save1(dst, d30, 
bias, params, dstC), dst += dD; - } - } - } - - template void ConvolutionNhwcDirect1x1_3xM(const float* src0, const ConvParam32f& p, - const AlgParam& a, size_t srcC, size_t dstC, const float* weight0, const __m256* bias, const __m256* params, float* dst) - { - __m256 d00, d01, d02, d10, d11, d12, d20, d21, d22, d30, d31, d32, s0, w0, w1, w2; - size_t dS = p.srcC, dD = p.dstC; - const float* weight1 = weight0 + a.stepW; - const float* weight2 = weight1 + a.stepW; - const float* src1 = src0 + 1 * dS; - const float* src2 = src0 + 2 * dS; - const float* src3 = src0 + 3 * dS; - if (dstC > 2 * F) - { - if (M > 0) d00 = _mm256_setzero_ps(), d01 = _mm256_setzero_ps(), d02 = _mm256_setzero_ps(); - if (M > 1) d10 = _mm256_setzero_ps(), d11 = _mm256_setzero_ps(), d12 = _mm256_setzero_ps(); - if (M > 2) d20 = _mm256_setzero_ps(), d21 = _mm256_setzero_ps(), d22 = _mm256_setzero_ps(); - if (M > 3) d30 = _mm256_setzero_ps(), d31 = _mm256_setzero_ps(), d32 = _mm256_setzero_ps(); - for (size_t offs = 0, offw = 0; offs < srcC; ++offs, offw += F) - { - w0 = _mm256_loadu_ps(weight0 + offw); - w1 = _mm256_loadu_ps(weight1 + offw); - w2 = _mm256_loadu_ps(weight2 + offw); - if (M > 0) s0 = _mm256_set1_ps(src0[offs]), d00 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d00), d01 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d01), d02 = _mm256_add_ps(_mm256_mul_ps(s0, w2), d02); - if (M > 1) s0 = _mm256_set1_ps(src1[offs]), d10 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d10), d11 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d11), d12 = _mm256_add_ps(_mm256_mul_ps(s0, w2), d12); - if (M > 2) s0 = _mm256_set1_ps(src2[offs]), d20 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d20), d21 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d21), d22 = _mm256_add_ps(_mm256_mul_ps(s0, w2), d22); - if (M > 3) s0 = _mm256_set1_ps(src3[offs]), d30 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d30), d31 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d31), d32 = _mm256_add_ps(_mm256_mul_ps(s0, w2), d32); - } - if (dstC == 3 * F) - { - if (M > 0) Save3(dst, d00, d01, d02, bias, params), dst += dD; - if (M > 1) Save3(dst, d10, d11, d12, bias, params), dst += dD; - if (M > 2) Save3(dst, d20, d21, d22, bias, params), dst += dD; - if (M > 3) Save3(dst, d30, d31, d32, bias, params), dst += dD; - } - else - { - dstC -= 2 * F; - if (M > 0) Save3(dst, d00, d01, d02, bias, params, dstC), dst += dD; - if (M > 1) Save3(dst, d10, d11, d12, bias, params, dstC), dst += dD; - if (M > 2) Save3(dst, d20, d21, d22, bias, params, dstC), dst += dD; - if (M > 3) Save3(dst, d30, d31, d32, bias, params, dstC), dst += dD; - } - } - else if (dstC > F) - { - if (M > 0) d00 = _mm256_setzero_ps(), d01 = _mm256_setzero_ps(); - if (M > 1) d10 = _mm256_setzero_ps(), d11 = _mm256_setzero_ps(); - if (M > 2) d20 = _mm256_setzero_ps(), d21 = _mm256_setzero_ps(); - if (M > 3) d30 = _mm256_setzero_ps(), d31 = _mm256_setzero_ps(); - for (size_t offs = 0, offw = 0; offs < srcC; ++offs, offw += F) - { - w0 = _mm256_loadu_ps(weight0 + offw); - w1 = _mm256_loadu_ps(weight1 + offw); - if (M > 0) s0 = _mm256_set1_ps(src0[offs]), d00 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d00), d01 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d01); - if (M > 1) s0 = _mm256_set1_ps(src1[offs]), d10 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d10), d11 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d11); - if (M > 2) s0 = _mm256_set1_ps(src2[offs]), d20 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d20), d21 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d21); - if (M > 3) s0 = _mm256_set1_ps(src3[offs]), d30 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d30), d31 = 
_mm256_add_ps(_mm256_mul_ps(s0, w1), d31); - } - if (dstC == DF) - { - if (M > 0) Save2(dst, d00, d01, bias, params), dst += dD; - if (M > 1) Save2(dst, d10, d11, bias, params), dst += dD; - if (M > 2) Save2(dst, d20, d21, bias, params), dst += dD; - if (M > 3) Save2(dst, d30, d31, bias, params), dst += dD; - } - else - { - dstC -= F; - if (M > 0) Save2(dst, d00, d01, bias, params, dstC), dst += dD; - if (M > 1) Save2(dst, d10, d11, bias, params, dstC), dst += dD; - if (M > 2) Save2(dst, d20, d21, bias, params, dstC), dst += dD; - if (M > 3) Save2(dst, d30, d31, bias, params, dstC), dst += dD; - } - } - else - { - if (M > 0) d00 = _mm256_setzero_ps(); - if (M > 1) d10 = _mm256_setzero_ps(); - if (M > 2) d20 = _mm256_setzero_ps(); - if (M > 3) d30 = _mm256_setzero_ps(); - for (size_t offs = 0, offw = 0; offs < srcC; ++offs, offw += F) - { - w0 = _mm256_loadu_ps(weight0 + offw); - if (M > 0) s0 = _mm256_set1_ps(src0[offs]), d00 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d00); - if (M > 1) s0 = _mm256_set1_ps(src1[offs]), d10 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d10); - if (M > 2) s0 = _mm256_set1_ps(src2[offs]), d20 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d20); - if (M > 3) s0 = _mm256_set1_ps(src3[offs]), d30 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d30); - } - if (dstC == F) - { - if (M > 0) Save1(dst, d00, bias, params), dst += dD; - if (M > 1) Save1(dst, d10, bias, params), dst += dD; - if (M > 2) Save1(dst, d20, bias, params), dst += dD; - if (M > 3) Save1(dst, d30, bias, params), dst += dD; - } - else - { - if (M > 0) Save1(dst, d00, bias, params, dstC), dst += dD; - if (M > 1) Save1(dst, d10, bias, params, dstC), dst += dD; - if (M > 2) Save1(dst, d20, bias, params, dstC), dst += dD; - if (M > 3) Save1(dst, d30, bias, params, dstC), dst += dD; - } - } - } - - template ConvolutionNhwcDirect1x1_NxM_Ptr GetConvolutionNhwcDirect1x1_3xM(size_t M) - { - switch (M) - { - case 0: return NULL; - case 1: return ConvolutionNhwcDirect1x1_3xM; - case 2: return ConvolutionNhwcDirect1x1_3xM; - case 3: return ConvolutionNhwcDirect1x1_3xM; - } - assert(0); - return NULL; - } - - template void ConvolutionNhwcDirect1x1_3(const float* src, const ConvParam32f& p, const AlgParam& a, - size_t dstC, size_t yBeg, size_t yEnd, size_t srcC, const float* weight, const float* bias, const float* params, float* dst) - { - size_t n = 4, n1 = (yEnd - yBeg) * p.dstW, nn = AlignLoAny(n1, n), m = n1 - nn; - ConvolutionNhwcDirect1x1_NxM_Ptr convolutionNhwcDirect1x1_3xN = ConvolutionNhwcDirect1x1_3x4; - ConvolutionNhwcDirect1x1_NxM_Ptr convolutionNhwcDirect1x1_3xM = GetConvolutionNhwcDirect1x1_3xM(m); - - __m256 _params[3], _bias[3]; - _params[0] = _mm256_set1_ps(params[0]); - if (type == ::SimdConvolutionActivationRestrictRange || type == ::SimdConvolutionActivationHswish) - _params[1] = _mm256_set1_ps(params[1]); - - for (size_t dc = 0; dc < dstC; dc += a.microD) - { - size_t dC = Simd::Min(a.microD, dstC - dc); - if (dC > 0 * F) _bias[0] = _mm256_loadu_ps(bias + dc + 0 * F); - if (dC > 1 * F) _bias[1] = _mm256_loadu_ps(bias + dc + 1 * F); - if (dC > 2 * F) _bias[2] = _mm256_loadu_ps(bias + dc + 2 * F); - if (type == ::SimdConvolutionActivationPrelu) - { - if (dC > 0 * F) _params[0] = _mm256_loadu_ps(params + dc + 0 * F); - if (dC > 1 * F) _params[1] = _mm256_loadu_ps(params + dc + 1 * F); - if (dC > 2 * F) _params[2] = _mm256_loadu_ps(params + dc + 2 * F); - } - const float* ps = src + yBeg * p.srcW * p.srcC; - float* pd = dst + dc + yBeg * p.dstW * p.dstC; - size_t i = 0; - for (; i < nn; i += n, ps += n * p.srcC, pd += n 
* p.dstC)
-                    convolutionNhwcDirect1x1_3xN(ps, p, a, srcC, dC, weight, _bias, _params, pd);
-                for (; i < n1; i += m, ps += m * p.srcC, pd += m * p.dstC)
-                    convolutionNhwcDirect1x1_3xM(ps, p, a, srcC, dC, weight, _bias, _params, pd);
-                weight += p.srcC * a.microD;
-            }
-        }
-
-        //---------------------------------------------------------------------
-
-        template<TermType term, SimdConvolutionActivationType type> static SIMD_INLINE void Set(const ConvParam32f& p, AlgParam& a)
-        {
-            a.convolutions[term] = p.Is1x1() ? ConvolutionNhwcDirect1x1_3<term, type> : ConvolutionNhwcDirect_3<term, type>;
-        }
-
-        template<SimdConvolutionActivationType type> static SIMD_INLINE void Set(const ConvParam32f& p, AlgParam& a)
-        {
-            Set<TermSingle, type>(p, a);
-            Set<TermFirst, type>(p, a);
-            Set<TermIterim, type>(p, a);
-            Set<TermLast, type>(p, a);
-        }
-
-        bool SynetConvolution32fNhwcDirect::Set3r(const ConvParam32f& p, AlgParam& a)
-        {
-            assert(a.microD == 3 * F);
-            switch (p.activation)
-            {
-            case SimdConvolutionActivationIdentity: Set<SimdConvolutionActivationIdentity>(p, a); break;
-            case SimdConvolutionActivationRelu: Set<SimdConvolutionActivationRelu>(p, a); break;
-            case SimdConvolutionActivationLeakyRelu: Set<SimdConvolutionActivationLeakyRelu>(p, a); break;
-            case SimdConvolutionActivationRestrictRange: Set<SimdConvolutionActivationRestrictRange>(p, a); break;
-            case SimdConvolutionActivationPrelu: Set<SimdConvolutionActivationPrelu>(p, a); break;
-            case SimdConvolutionActivationHswish: Set<SimdConvolutionActivationHswish>(p, a); break;
-            default: return false;
-            }
-            return true;
-        }
-    }
-#endif//SIMD_AVX_ENABLE
-}
diff --git a/src/3rd/Simd/Simd/SimdAvx1SynetDeconvolution32f.cpp b/src/3rd/Simd/Simd/SimdAvx1SynetDeconvolution32f.cpp
deleted file mode 100644
index 35b0ff96..00000000
--- a/src/3rd/Simd/Simd/SimdAvx1SynetDeconvolution32f.cpp
+++ /dev/null
@@ -1,303 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/ -#include "Simd/SimdSynetDeconvolution32f.h" -#include "Simd/SimdSynetConvolution32f.h" -#include "Simd/SimdSynetConvolution32fCommon.h" -#include "Simd/SimdExtract.h" -#include "Simd/SimdSynet.h" -#include "Simd/SimdAvx1.h" -#include "Simd/SimdGemm.h" -#include "Simd/SimdExp.h" -#include "Simd/SimdCpu.h" - -namespace Simd -{ -#ifdef SIMD_AVX_ENABLE - namespace Avx - { - SynetDeconvolution32fGemmNN::SynetDeconvolution32fGemmNN(const DeconvParam32f & p) - : Sse2::SynetDeconvolution32fGemmNN(p) - { - _gemm.Init(InitGemmFuncs(Avx::Gemm32fNN, "Avx", p.gemm, "Ext")); - if (_param.trans && _param.group == 1) - { - if (NHWC_GEMM_RUNTIME) - { - _gemmCb.Init(InitGemmCbFuncs(Avx::Gemm32fNNcbBufferSize, Avx::Gemm32fNNcbReorderB, Avx::Gemm32fNNcbRun, "Avx", GemmKernelF2, GemmKernelF3)); - _nhwcWeight.Resize(_gemmCb.At(0).BufferSize(_M*_merge, _N, _K)); - } - else - _nhwcWeight.Resize(Avx::Gemm32fNNcbBufferSize(_M*_merge, _N, _K, GemmKernelAny, NHWC_GEMM_COMPATIBLE)); - _nhwcRun = Avx::Gemm32fNNcbRun; - _nhwcReorderB = Avx::Gemm32fNNcbReorderB; - } - _biasAndActivation = Avx::ConvolutionBiasAndActivation; - } - - //--------------------------------------------------------------------- - - typedef void(*DeconvolutionNhwcDirect2x2_Ptr) (const float * src0, const DeconvParam32f & p, size_t srcC, size_t dstC, const float * weight, const __m256 * bias, const __m256 * params, float * ds); - - template void DeconvolutionNhwcDirect2x2_6(const float * src0, - const DeconvParam32f & p, size_t srcC, size_t dstC, const float * weight0, const __m256 * bias, const __m256 * params, float * dst) - { - size_t dS = p.srcC, dD = p.dstC; - const float * weight1 = weight0 + srcC * F; - const float * src1 = src0 + 1 * dS; - const float * src2 = src0 + 2 * dS; - const float * src3 = src0 + 3 * dS; - const float * src4 = src0 + 4 * dS; - const float * src5 = src0 + 5 * dS; - __m256 d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, s0, w0, w1; - d00 = _mm256_setzero_ps(); d01 = _mm256_setzero_ps(); - d10 = _mm256_setzero_ps(); d11 = _mm256_setzero_ps(); - d20 = _mm256_setzero_ps(); d21 = _mm256_setzero_ps(); - d30 = _mm256_setzero_ps(); d31 = _mm256_setzero_ps(); - d40 = _mm256_setzero_ps(); d41 = _mm256_setzero_ps(); - d50 = _mm256_setzero_ps(); d51 = _mm256_setzero_ps(); - for (size_t sc = 0; sc < srcC; ++sc) - { - w0 = _mm256_loadu_ps(weight0); - w1 = _mm256_loadu_ps(weight1); - s0 = _mm256_set1_ps(src0[sc]); - d00 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d00); - d01 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d01); - s0 = _mm256_set1_ps(src1[sc]); - d10 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d10); - d11 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d11); - s0 = _mm256_set1_ps(src2[sc]); - d20 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d20); - d21 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d21); - s0 = _mm256_set1_ps(src3[sc]); - d30 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d30); - d31 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d31); - s0 = _mm256_set1_ps(src4[sc]); - d40 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d40); - d41 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d41); - s0 = _mm256_set1_ps(src5[sc]); - d50 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d50); - d51 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d51); - weight0 += F; - weight1 += F; - } - if (dstC == F) - { - Term::template Save(dst + 0x0 * dD, d00, bias, params); - Term::template Save(dst + 0x1 * dD, d01, bias, params); - Term::template Save(dst + 0x2 * dD, d10, bias, params); - Term::template Save(dst + 0x3 * dD, d11, bias, params); - Term::template Save(dst + 0x4 * dD, d20, bias, params); 
- Term::template Save(dst + 0x5 * dD, d21, bias, params); - Term::template Save(dst + 0x6 * dD, d30, bias, params); - Term::template Save(dst + 0x7 * dD, d31, bias, params); - Term::template Save(dst + 0x8 * dD, d40, bias, params); - Term::template Save(dst + 0x9 * dD, d41, bias, params); - Term::template Save(dst + 0xA * dD, d50, bias, params); - Term::template Save(dst + 0xB * dD, d51, bias, params); - } - else - { - Term::template Save(dst + 0x0 * dD, d00, bias, params, dstC); - Term::template Save(dst + 0x1 * dD, d01, bias, params, dstC); - Term::template Save(dst + 0x2 * dD, d10, bias, params, dstC); - Term::template Save(dst + 0x3 * dD, d11, bias, params, dstC); - Term::template Save(dst + 0x4 * dD, d20, bias, params, dstC); - Term::template Save(dst + 0x5 * dD, d21, bias, params, dstC); - Term::template Save(dst + 0x6 * dD, d30, bias, params, dstC); - Term::template Save(dst + 0x7 * dD, d31, bias, params, dstC); - Term::template Save(dst + 0x8 * dD, d40, bias, params, dstC); - Term::template Save(dst + 0x9 * dD, d41, bias, params, dstC); - Term::template Save(dst + 0xA * dD, d50, bias, params, dstC); - Term::template Save(dst + 0xB * dD, d51, bias, params, dstC); - } - } - - template void DeconvolutionNhwcDirect2x2_M(const float * src0, - const DeconvParam32f & p, size_t srcC, size_t dstC, const float * weight0, const __m256 * bias, const __m256 * params, float * dst) - { - size_t dS = p.srcC, dD = p.dstC; - const float * weight1 = weight0 + srcC * F, *src1, *src2, *src3, *src4, *src5; - if (tail > 1) src1 = src0 + 1 * dS; - if (tail > 2) src2 = src0 + 2 * dS; - if (tail > 3) src3 = src0 + 3 * dS; - if (tail > 4) src4 = src0 + 4 * dS; - if (tail > 5) src5 = src0 + 5 * dS; - __m256 d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, s0, w0, w1; - if (tail > 0) d00 = _mm256_setzero_ps(), d01 = _mm256_setzero_ps(); - if (tail > 1) d10 = _mm256_setzero_ps(), d11 = _mm256_setzero_ps(); - if (tail > 2) d20 = _mm256_setzero_ps(), d21 = _mm256_setzero_ps(); - if (tail > 3) d30 = _mm256_setzero_ps(), d31 = _mm256_setzero_ps(); - if (tail > 4) d40 = _mm256_setzero_ps(), d41 = _mm256_setzero_ps(); - if (tail > 5) d50 = _mm256_setzero_ps(), d51 = _mm256_setzero_ps(); - for (size_t sc = 0; sc < srcC; ++sc) - { - w0 = _mm256_loadu_ps(weight0); - w1 = _mm256_loadu_ps(weight1); - if (tail > 0) s0 = _mm256_set1_ps(src0[sc]), d00 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d00), d01 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d01); - if (tail > 1) s0 = _mm256_set1_ps(src1[sc]), d10 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d10), d11 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d11); - if (tail > 2) s0 = _mm256_set1_ps(src2[sc]), d20 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d20), d21 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d21); - if (tail > 3) s0 = _mm256_set1_ps(src3[sc]), d30 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d30), d31 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d31); - if (tail > 4) s0 = _mm256_set1_ps(src4[sc]), d40 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d40), d41 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d41); - if (tail > 5) s0 = _mm256_set1_ps(src5[sc]), d50 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d50), d51 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d51); - weight0 += F; - weight1 += F; - } - if (dstC == F) - { - if (tail > 0) Term::template Save(dst + 0x0 * dD, d00, bias, params), Term::template Save(dst + 0x1 * dD, d01, bias, params); - if (tail > 1) Term::template Save(dst + 0x2 * dD, d10, bias, params), Term::template Save(dst + 0x3 * dD, d11, bias, params); - if (tail > 2) Term::template Save(dst + 0x4 * dD, 
d20, bias, params), Term::template Save(dst + 0x5 * dD, d21, bias, params); - if (tail > 3) Term::template Save(dst + 0x6 * dD, d30, bias, params), Term::template Save(dst + 0x7 * dD, d31, bias, params); - if (tail > 4) Term::template Save(dst + 0x8 * dD, d40, bias, params), Term::template Save(dst + 0x9 * dD, d41, bias, params); - if (tail > 5) Term::template Save(dst + 0xA * dD, d50, bias, params), Term::template Save(dst + 0xB * dD, d51, bias, params); - } - else - { - if (tail > 0) Term::template Save(dst + 0x0 * dD, d00, bias, params, dstC), Term::template Save(dst + 0x1 * dD, d01, bias, params, dstC); - if (tail > 1) Term::template Save(dst + 0x2 * dD, d10, bias, params, dstC), Term::template Save(dst + 0x3 * dD, d11, bias, params, dstC); - if (tail > 2) Term::template Save(dst + 0x4 * dD, d20, bias, params, dstC), Term::template Save(dst + 0x5 * dD, d21, bias, params, dstC); - if (tail > 3) Term::template Save(dst + 0x6 * dD, d30, bias, params, dstC), Term::template Save(dst + 0x7 * dD, d31, bias, params, dstC); - if (tail > 4) Term::template Save(dst + 0x8 * dD, d40, bias, params, dstC), Term::template Save(dst + 0x9 * dD, d41, bias, params, dstC); - if (tail > 5) Term::template Save(dst + 0xA * dD, d50, bias, params, dstC), Term::template Save(dst + 0xB * dD, d51, bias, params, dstC); - } - } - - template SIMD_INLINE DeconvolutionNhwcDirect2x2_Ptr GetTailKernel(size_t tail) - { - switch (tail) - { - case 0: return DeconvolutionNhwcDirect2x2_M; - case 1: return DeconvolutionNhwcDirect2x2_M; - case 2: return DeconvolutionNhwcDirect2x2_M; - case 3: return DeconvolutionNhwcDirect2x2_M; - case 4: return DeconvolutionNhwcDirect2x2_M; - case 5: return DeconvolutionNhwcDirect2x2_M; - default: - assert(0); - return NULL; - } - } - - template void DeconvolutionNhwcDirect2x2(const float * src, const DeconvParam32f & p, - size_t dstC, size_t yBeg, size_t yEnd, size_t srcC, const float * weight, const float * bias, const float * params, float * dst) - { - size_t srcW6 = AlignLoAny(p.srcW, 6), tail = p.srcW - srcW6; - DeconvolutionNhwcDirect2x2_Ptr bodyKernel = DeconvolutionNhwcDirect2x2_6; - DeconvolutionNhwcDirect2x2_Ptr tailKernel = GetTailKernel(tail); - - __m256 _params[2], _bias[1]; - _params[0] = _mm256_set1_ps(params[0]); - if (type == ::SimdConvolutionActivationRestrictRange || type == ::SimdConvolutionActivationHswish) - _params[1] = _mm256_set1_ps(params[1]); - - for (size_t dc = 0; dc < dstC; dc += F) - { - size_t dC = Simd::Min(F, dstC - dc); - _bias[0] = _mm256_loadu_ps(bias + dc); - if (type == ::SimdConvolutionActivationPrelu) - _params[0] = _mm256_loadu_ps(params + dc); - const float * s = src + yBeg * p.srcW * p.srcC; - float * d = dst + yBeg * p.strideY * p.dstW * p.dstC; - const float * w0 = weight + 0 * p.kernelX * p.srcC * F; - const float * w1 = weight + 1 * p.kernelX * p.srcC * F; - for (size_t sy = yBeg; sy < yEnd; sy += 1, s += p.srcW * p.srcC) - { - for (size_t sx = 0; sx < srcW6; sx += 6) - bodyKernel(s + sx * p.srcC, p, srcC, dC, w0, _bias, _params, d), d += 6 * p.strideX * p.dstC; - if (tail) - tailKernel(s + srcW6 * p.srcC, p, srcC, dC, w0, _bias, _params, d), d += tail * p.strideX * p.dstC; - for (size_t sx = 0; sx < srcW6; sx += 6) - bodyKernel(s + sx * p.srcC, p, srcC, dC, w1, _bias, _params, d), d += 6 * p.strideX * p.dstC; - if (tail) - tailKernel(s + srcW6 * p.srcC, p, srcC, dC, w1, _bias, _params, d), d += tail * p.strideX * p.dstC; - } - weight += p.kernelY * p.kernelX*srcC*F; - dst += F; - } - } - - template void DeconvolutionNhwcDirect2x2(const float * 
src, const DeconvParam32f & p,
-            const SynetDeconvolution32fNhwcDirect2x2::AlgParam & a, const float * weight, const float * bias, const float * params, float * dst)
-        {
-            for (size_t dc = 0; dc < p.dstC; dc += a.macroD)
-            {
-                size_t macroD = Simd::Min(p.dstC, dc + a.macroD) - dc;
-                for (size_t sc = 0; sc < p.srcC; sc += a.macroC)
-                {
-                    size_t macroC = Simd::Min(p.srcC, sc + a.macroC) - sc;
-                    size_t macroK = p.kernelY * p.kernelX * macroC;
-                    for (size_t yBeg = 0; yBeg < p.srcH;)
-                    {
-                        size_t yEnd = Simd::Min(yBeg + a.macroH, p.srcH);
-                        if (a.macroC == p.srcC)
-                            DeconvolutionNhwcDirect2x2<TermSingle, type>(src + sc, p, macroD, yBeg, yEnd, macroC, weight, bias + dc, params, dst + dc);
-                        else if (sc == 0)
-                            DeconvolutionNhwcDirect2x2<TermFirst, type>(src + sc, p, macroD, yBeg, yEnd, macroC, weight, bias + dc, params, dst + dc);
-                        else if (sc + macroC == p.srcC)
-                            DeconvolutionNhwcDirect2x2<TermLast, type>(src + sc, p, macroD, yBeg, yEnd, macroC, weight, bias + dc, params, dst + dc);
-                        else
-                            DeconvolutionNhwcDirect2x2<TermIterim, type>(src + sc, p, macroD, yBeg, yEnd, macroC, weight, bias + dc, params, dst + dc);
-                        yBeg = yEnd;
-                    }
-                    weight += AlignHiAny(macroD, a.microD)*macroK;
-                }
-                if (type == ::SimdConvolutionActivationPrelu)
-                    params += macroD;
-            }
-        }
-
-        SynetDeconvolution32fNhwcDirect2x2::SynetDeconvolution32fNhwcDirect2x2(const DeconvParam32f & p)
-            : Sse2::SynetDeconvolution32fNhwcDirect2x2(p)
-        {
-            if (p.dstC > HF)
-            {
-                switch (p.activation)
-                {
-                case SimdConvolutionActivationIdentity: _deconvolution = DeconvolutionNhwcDirect2x2<SimdConvolutionActivationIdentity>; break;
-                case SimdConvolutionActivationRelu: _deconvolution = DeconvolutionNhwcDirect2x2<SimdConvolutionActivationRelu>; break;
-                case SimdConvolutionActivationLeakyRelu: _deconvolution = DeconvolutionNhwcDirect2x2<SimdConvolutionActivationLeakyRelu>; break;
-                case SimdConvolutionActivationRestrictRange: _deconvolution = DeconvolutionNhwcDirect2x2<SimdConvolutionActivationRestrictRange>; break;
-                case SimdConvolutionActivationPrelu: _deconvolution = DeconvolutionNhwcDirect2x2<SimdConvolutionActivationPrelu>; break;
-                case SimdConvolutionActivationHswish: _deconvolution = DeconvolutionNhwcDirect2x2<SimdConvolutionActivationHswish>; break;
-                default: return;
-                }
-                SetAlgParam(F, Base::AlgCacheL1(), Base::AlgCacheL2(), Base::AlgCacheL3());
-            }
-        }
-
-        //---------------------------------------------------------------------
-
-        void * SynetDeconvolution32fInit(size_t batch, const SimdConvolutionParameters * conv, SimdGemm32fNNPtr gemm)
-        {
-            DeconvParam32f param(batch, conv, gemm);
-            if (!param.Valid())
-                return NULL;
-            if (SynetDeconvolution32fNhwcDirect2x2::Preferable(param))
-                return new SynetDeconvolution32fNhwcDirect2x2(param);
-            else
-                return new SynetDeconvolution32fGemmNN(param);
-        }
-    }
-#endif//SIMD_AVX_ENABLE
-}
diff --git a/src/3rd/Simd/Simd/SimdAvx1SynetFused.cpp b/src/3rd/Simd/Simd/SimdAvx1SynetFused.cpp
deleted file mode 100644
index 8fff765a..00000000
--- a/src/3rd/Simd/Simd/SimdAvx1SynetFused.cpp
+++ /dev/null
@@ -1,1240 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdExtract.h" -#include "Simd/SimdSynet.h" -#include "Simd/SimdBase.h" -#include "Simd/SimdSse1.h" -#include "Simd/SimdAvx1.h" - -namespace Simd -{ -#ifdef SIMD_AVX_ENABLE - namespace Avx - { - template SIMD_INLINE void SynetFusedLayerForward0(const float * src, const float * bias, const float * scale, __m256 sign, float * dst, size_t offset) - { - __m256 _bias = Load(bias + offset); - __m256 x = _mm256_add_ps(Load(src + offset), _bias); - __m256 _scale = Load(scale + offset); - Store(dst + offset, _mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(x, _mm256_andnot_ps(sign, x)), _scale), _mm256_max_ps(_mm256_setzero_ps(), x))); - } - - template SIMD_INLINE void SynetFusedLayerForward0(const float * src, __m256 bias, __m256 scale, __m256 sign, float * dst, size_t offset) - { - __m256 x = _mm256_add_ps(Load(src + offset), bias); - Store(dst + offset, _mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(x, _mm256_andnot_ps(sign, x)), scale), _mm256_max_ps(_mm256_setzero_ps(), x))); - } - - template void SynetFusedLayerForward0Nchw(const float * src, const float * bias, const float * scale, size_t channels, size_t spatial, float * dst) - { - if (align) - assert(Aligned(src) && Aligned(spatial, F) && Aligned(dst)); - - size_t aligned = AlignLo(spatial, QF); - size_t partial = AlignLo(spatial, F); - __m256 sign = _mm256_set1_ps(-0.0f); - for (size_t c = 0; c < channels; ++c) - { - size_t s = 0; - if (partial) - { - __m256 _bias = _mm256_set1_ps(bias[c]); - __m256 _scale = _mm256_set1_ps(scale[c]); - for (; s < aligned; s += QF) - { - SynetFusedLayerForward0(src, _bias, _scale, sign, dst, s + F * 0); - SynetFusedLayerForward0(src, _bias, _scale, sign, dst, s + F * 1); - SynetFusedLayerForward0(src, _bias, _scale, sign, dst, s + F * 2); - SynetFusedLayerForward0(src, _bias, _scale, sign, dst, s + F * 3); - } - for (; s < partial; s += F) - SynetFusedLayerForward0(src, _bias, _scale, sign, dst, s); - } - for (; s < spatial; ++s) - dst[s] = Base::SynetFusedLayerForward0(src[s] + bias[c], scale[c]); - src += spatial; - dst += spatial; - } - } - - SIMD_INLINE void SynetFusedLayerForward0Nchw(const float * src, const float * bias, const float * scale, size_t channels, size_t spatial, float * dst) - { - if (Aligned(src) && Aligned(spatial, F) && Aligned(dst)) - SynetFusedLayerForward0Nchw(src, bias, scale, channels, spatial, dst); - else - SynetFusedLayerForward0Nchw(src, bias, scale, channels, spatial, dst); - } - - template void SynetFusedLayerForward0Nhwc(const float * src, const float * bias, const float * scale, size_t channels, size_t spatial, float * dst) - { - if (align) - assert(Aligned(src) && Aligned(bias) && Aligned(scale) && Aligned(channels, F) && Aligned(dst)); - - size_t aligned = AlignLo(channels, QF); - size_t partial = AlignLo(channels, F); - __m256 sign = _mm256_set1_ps(-0.0f); - for (size_t s = 0; s < spatial; ++s) - { - size_t c = 0; - if (partial) - { - for (; c < aligned; c += QF) - { - SynetFusedLayerForward0(src, bias, scale, sign, dst, c + F 
* 0); - SynetFusedLayerForward0(src, bias, scale, sign, dst, c + F * 1); - SynetFusedLayerForward0(src, bias, scale, sign, dst, c + F * 2); - SynetFusedLayerForward0(src, bias, scale, sign, dst, c + F * 3); - } - for (; c < partial; c += F) - SynetFusedLayerForward0(src, bias, scale, sign, dst, c); - } - for (; c < channels; ++c) - dst[c] = Base::SynetFusedLayerForward0(src[c] + bias[c], scale[c]); - src += channels; - dst += channels; - } - } - - SIMD_INLINE void SynetFusedLayerForward0Nhwc(const float * src, const float * bias, const float * scale, size_t channels, size_t spatial, float * dst) - { - if (Aligned(src) && Aligned(bias) && Aligned(scale) && Aligned(channels, F) && Aligned(dst)) - SynetFusedLayerForward0Nhwc(src, bias, scale, channels, spatial, dst); - else - SynetFusedLayerForward0Nhwc(src, bias, scale, channels, spatial, dst); - } - - template void SynetFusedLayerForward0Nchw8c(const float * src, const float * bias, const float * scale, size_t channels, size_t spatial, float * dst) - { - if (align) - assert(Aligned(src) && Aligned(dst)); - - size_t spatialF = spatial * F; - size_t spatial4F = AlignLo(spatial, 4)*F; - __m256 sign = _mm256_set1_ps(-0.0f); - for (size_t c = 0; c < channels; c += F) - { - __m256 _bias = Load(bias + c); - __m256 _scale = Load(scale + c); - size_t s = 0; - for (; s < spatial4F; s += 4 * F) - { - SynetFusedLayerForward0(src, _bias, _scale, sign, dst, s + F * 0); - SynetFusedLayerForward0(src, _bias, _scale, sign, dst, s + F * 1); - SynetFusedLayerForward0(src, _bias, _scale, sign, dst, s + F * 2); - SynetFusedLayerForward0(src, _bias, _scale, sign, dst, s + F * 3); - } - for (; s < spatialF; s += F) - SynetFusedLayerForward0(src, _bias, _scale, sign, dst, s); - src += spatialF; - dst += spatialF; - } - } - - SIMD_INLINE void SynetFusedLayerForward0Nchw8c(const float * src, const float * bias, const float * scale, size_t channels, size_t spatial, float * dst) - { - if (Aligned(src) && Aligned(dst)) - SynetFusedLayerForward0Nchw8c(src, bias, scale, channels, spatial, dst); - else - SynetFusedLayerForward0Nchw8c(src, bias, scale, channels, spatial, dst); - } - - void SynetFusedLayerForward0(const float * src, const float * bias, const float * scale, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format) - { - if (Base::NchwCompatible(channels, spatial, format)) - SynetFusedLayerForward0Nchw(src, bias, scale, channels, spatial, dst); - else if (Base::NhwcCompatible(channels, spatial, format)) - SynetFusedLayerForward0Nhwc(src, bias, scale, channels, spatial, dst); - else if (format == SimdTensorFormatNchw4c) - Sse::SynetFusedLayerForward0(src, bias, scale, channels, spatial, dst, format); - else if (format == SimdTensorFormatNchw8c) - SynetFusedLayerForward0Nchw8c(src, bias, scale, channels, spatial, dst); - else - Base::SynetFusedLayerForward0(src, bias, scale, channels, spatial, dst, format); - } - - //--------------------------------------------------------------------- - - template SIMD_INLINE void SynetFusedLayerForward1(const float * src, const float * bias0, const float * scale1, const float * bias1, float * dst, size_t offset) - { - __m256 _bias0 = Load(bias0 + offset); - __m256 x = _mm256_add_ps(Load(src + offset), _bias0); - __m256 _scale1 = Load(scale1 + offset); - __m256 _bias1 = Load(bias1 + offset); - Store(dst + offset, _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_max_ps(_mm256_setzero_ps(), _mm256_sub_ps(_mm256_setzero_ps(), x)), _scale1), _bias1), _mm256_max_ps(_mm256_setzero_ps(), x))); - } - - template 
SIMD_INLINE void SynetFusedLayerForward1(const float * src, __m256 bias0, __m256 scale1, __m256 bias1, float * dst, size_t offset) - { - __m256 x = _mm256_add_ps(Load(src + offset), bias0); - Store(dst + offset, _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_max_ps(_mm256_setzero_ps(), _mm256_sub_ps(_mm256_setzero_ps(), x)), scale1), bias1), _mm256_max_ps(_mm256_setzero_ps(), x))); - } - - template void SynetFusedLayerForward1Nchw(const float * src, const float * bias0, const float * scale1, const float * bias1, size_t channels, size_t spatial, float * dst) - { - if (align) - assert(Aligned(src) && Aligned(spatial, F) && Aligned(dst)); - - size_t aligned = AlignLo(spatial, QF); - size_t partial = AlignLo(spatial, F); - for (size_t c = 0; c < channels; ++c) - { - size_t s = 0; - if (partial) - { - __m256 _bias0 = _mm256_set1_ps(bias0[c]); - __m256 _scale1 = _mm256_set1_ps(scale1[c]); - __m256 _bias1 = _mm256_set1_ps(bias1[c]); - for (; s < aligned; s += QF) - { - SynetFusedLayerForward1(src, _bias0, _scale1, _bias1, dst, s + F * 0); - SynetFusedLayerForward1(src, _bias0, _scale1, _bias1, dst, s + F * 1); - SynetFusedLayerForward1(src, _bias0, _scale1, _bias1, dst, s + F * 2); - SynetFusedLayerForward1(src, _bias0, _scale1, _bias1, dst, s + F * 3); - } - for (; s < partial; s += F) - SynetFusedLayerForward1(src, _bias0, _scale1, _bias1, dst, s); - } - for (; s < spatial; ++s) - dst[s] = Base::SynetFusedLayerForward1(src[s] + bias0[c], scale1[c], bias1[c]); - src += spatial; - dst += spatial; - } - } - - SIMD_INLINE void SynetFusedLayerForward1Nchw(const float * src, const float * bias0, const float * scale1, const float * bias1, size_t channels, size_t spatial, float * dst) - { - if (Aligned(src) && Aligned(spatial, F) && Aligned(dst)) - SynetFusedLayerForward1Nchw(src, bias0, scale1, bias1, channels, spatial, dst); - else - SynetFusedLayerForward1Nchw(src, bias0, scale1, bias1, channels, spatial, dst); - } - - template void SynetFusedLayerForward1Nhwc(const float * src, const float * bias0, const float * scale1, const float * bias1, size_t channels, size_t spatial, float * dst) - { - if (align) - assert(Aligned(src) && Aligned(bias0) && Aligned(scale1) && Aligned(bias1) && Aligned(channels, F) && Aligned(dst)); - - size_t aligned = AlignLo(channels, QF); - size_t partial = AlignLo(channels, F); - for (size_t s = 0; s < spatial; ++s) - { - size_t c = 0; - if (partial) - { - for (; c < aligned; c += QF) - { - SynetFusedLayerForward1(src, bias0, scale1, bias1, dst, c + F * 0); - SynetFusedLayerForward1(src, bias0, scale1, bias1, dst, c + F * 1); - SynetFusedLayerForward1(src, bias0, scale1, bias1, dst, c + F * 2); - SynetFusedLayerForward1(src, bias0, scale1, bias1, dst, c + F * 3); - } - for (; c < partial; c += F) - SynetFusedLayerForward1(src, bias0, scale1, bias1, dst, c); - } - for (; c < channels; ++c) - dst[c] = Base::SynetFusedLayerForward1(src[c] + bias0[c], scale1[c], bias1[c]); - src += channels; - dst += channels; - } - } - - SIMD_INLINE void SynetFusedLayerForward1Nhwc(const float * src, const float * bias0, const float * scale1, const float * bias1, size_t channels, size_t spatial, float * dst) - { - if (Aligned(src) && Aligned(bias0) && Aligned(scale1) && Aligned(bias1) && Aligned(channels, F) && Aligned(dst)) - SynetFusedLayerForward1Nhwc(src, bias0, scale1, bias1, channels, spatial, dst); - else - SynetFusedLayerForward1Nhwc(src, bias0, scale1, bias1, channels, spatial, dst); - } - - template void SynetFusedLayerForward1Nchw8c(const float * src, const float * bias0, const 
float * scale1, const float * bias1, size_t channels, size_t spatial, float * dst) - { - if (align) - assert(Aligned(src) && Aligned(dst)); - - size_t spatialF = spatial * F; - size_t spatial4F = AlignLo(spatial, 4)*F; - for (size_t c = 0; c < channels; c += F) - { - __m256 _bias0 = Load(bias0 + c); - __m256 _scale1 = Load(scale1 + c); - __m256 _bias1 = Load(bias1 + c); - size_t s = 0; - for (; s < spatial4F; s += 4 * F) - { - SynetFusedLayerForward1(src, _bias0, _scale1, _bias1, dst, s + F * 0); - SynetFusedLayerForward1(src, _bias0, _scale1, _bias1, dst, s + F * 1); - SynetFusedLayerForward1(src, _bias0, _scale1, _bias1, dst, s + F * 2); - SynetFusedLayerForward1(src, _bias0, _scale1, _bias1, dst, s + F * 3); - } - for (; s < spatialF; s += F) - SynetFusedLayerForward1(src, _bias0, _scale1, _bias1, dst, s); - src += spatialF; - dst += spatialF; - } - } - - SIMD_INLINE void SynetFusedLayerForward1Nchw8c(const float * src, const float * bias0, const float * scale1, const float * bias1, size_t channels, size_t spatial, float * dst) - { - if (Aligned(src) && Aligned(dst)) - SynetFusedLayerForward1Nchw8c(src, bias0, scale1, bias1, channels, spatial, dst); - else - SynetFusedLayerForward1Nchw8c(src, bias0, scale1, bias1, channels, spatial, dst); - } - - void SynetFusedLayerForward1(const float * src, const float * bias0, const float * scale1, const float * bias1, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format) - { - if (Base::NchwCompatible(channels, spatial, format)) - SynetFusedLayerForward1Nchw(src, bias0, scale1, bias1, channels, spatial, dst); - else if (Base::NhwcCompatible(channels, spatial, format)) - SynetFusedLayerForward1Nhwc(src, bias0, scale1, bias1, channels, spatial, dst); - else if (format == SimdTensorFormatNchw4c) - Sse::SynetFusedLayerForward1(src, bias0, scale1, bias1, channels, spatial, dst, format); - else if (format == SimdTensorFormatNchw8c) - SynetFusedLayerForward1Nchw8c(src, bias0, scale1, bias1, channels, spatial, dst); - else - Base::SynetFusedLayerForward1(src, bias0, scale1, bias1, channels, spatial, dst, format); - } - - //--------------------------------------------------------------------- - - template SIMD_INLINE void SynetFusedLayerForward2(const float * src, const float * scale, const float * bias, __m256 slope, float * dst, size_t offset) - { - __m256 _src = Load(src + offset); - __m256 _scale = Load(scale + offset); - __m256 _bias = Load(bias + offset); - __m256 x = _mm256_add_ps(_mm256_mul_ps(_src, _scale), _bias); - Store(dst + offset, _mm256_add_ps(_mm256_max_ps(_mm256_setzero_ps(), x), _mm256_mul_ps(_mm256_min_ps(_mm256_setzero_ps(), x), slope))); - } - - template SIMD_INLINE void SynetFusedLayerForward2(const float * src, __m256 scale, __m256 bias, __m256 slope, float * dst, size_t offset) - { - __m256 _src = Load(src + offset); - __m256 x = _mm256_add_ps(_mm256_mul_ps(_src, scale), bias); - Store(dst + offset, _mm256_add_ps(_mm256_max_ps(_mm256_setzero_ps(), x), _mm256_mul_ps(_mm256_min_ps(_mm256_setzero_ps(), x), slope))); - } - - template void SynetFusedLayerForward2Nchw(const float * src, const float * scale, const float * bias, size_t channels, size_t spatial, const float * slope, float * dst) - { - if (align) - assert(Aligned(src) && Aligned(spatial, F) && Aligned(dst)); - - __m256 _slope = _mm256_set1_ps(slope[0]); - size_t aligned = AlignLo(spatial, QF); - size_t partial = AlignLo(spatial, F); - for (size_t c = 0; c < channels; ++c) - { - size_t s = 0; - if (partial) - { - __m256 _scale = _mm256_set1_ps(scale[c]); - 
__m256 _bias = _mm256_set1_ps(bias[c]); - for (; s < aligned; s += QF) - { - SynetFusedLayerForward2(src, _scale, _bias, _slope, dst, s + F * 0); - SynetFusedLayerForward2(src, _scale, _bias, _slope, dst, s + F * 1); - SynetFusedLayerForward2(src, _scale, _bias, _slope, dst, s + F * 2); - SynetFusedLayerForward2(src, _scale, _bias, _slope, dst, s + F * 3); - } - for (; s < partial; s += F) - SynetFusedLayerForward2(src, _scale, _bias, _slope, dst, s); - } - for (; s < spatial; ++s) - dst[s] = Base::SynetFusedLayerForward2(src[s], scale[c], bias[c], slope[0]); - src += spatial; - dst += spatial; - } - } - - SIMD_INLINE void SynetFusedLayerForward2Nchw(const float * src, const float * scale, const float * bias, size_t channels, size_t spatial, const float * slope, float * dst) - { - if (Aligned(src) && Aligned(spatial, F) && Aligned(dst)) - SynetFusedLayerForward2Nchw(src, scale, bias, channels, spatial, slope, dst); - else - SynetFusedLayerForward2Nchw(src, scale, bias, channels, spatial, slope, dst); - } - - template void SynetFusedLayerForward2Nhwc(const float * src, const float * scale, const float * bias, size_t channels, size_t spatial, const float * slope, float * dst) - { - if (align) - assert(Aligned(src) && Aligned(scale) && Aligned(bias) && Aligned(channels, F) && Aligned(dst)); - - __m256 _slope = _mm256_set1_ps(slope[0]); - size_t aligned = AlignLo(channels, QF); - size_t partial = AlignLo(channels, F); - for (size_t s = 0; s < spatial; ++s) - { - size_t c = 0; - if (partial) - { - for (; c < aligned; c += QF) - { - SynetFusedLayerForward2(src, scale, bias, _slope, dst, c + F * 0); - SynetFusedLayerForward2(src, scale, bias, _slope, dst, c + F * 1); - SynetFusedLayerForward2(src, scale, bias, _slope, dst, c + F * 2); - SynetFusedLayerForward2(src, scale, bias, _slope, dst, c + F * 3); - } - for (; c < partial; c += F) - SynetFusedLayerForward2(src, scale, bias, _slope, dst, c); - } - for (; c < channels; ++c) - dst[c] = Base::SynetFusedLayerForward2(src[c], scale[c], bias[c], slope[0]); - src += channels; - dst += channels; - } - } - - SIMD_INLINE void SynetFusedLayerForward2Nhwc(const float * src, const float * scale, const float * bias, size_t channels, size_t spatial, const float * slope, float * dst) - { - if (Aligned(src) && Aligned(scale) && Aligned(bias) && Aligned(channels, F) && Aligned(dst)) - SynetFusedLayerForward2Nhwc(src, scale, bias, channels, spatial, slope, dst); - else - SynetFusedLayerForward2Nhwc(src, scale, bias, channels, spatial, slope, dst); - } - - template void SynetFusedLayerForward2Nchw8c(const float * src, const float * scale, const float * bias, size_t channels, size_t spatial, const float * slope, float * dst) - { - if (align) - assert(Aligned(src) && Aligned(dst)); - - __m256 _slope = _mm256_set1_ps(slope[0]); - size_t spatialF = spatial * F; - size_t spatial4F = AlignLo(spatial, 4)*F; - for (size_t c = 0; c < channels; c += F) - { - __m256 _scale = Load(scale + c); - __m256 _bias = Load(bias + c); - size_t s = 0; - for (; s < spatial4F; s += 4 * F) - { - SynetFusedLayerForward2(src, _scale, _bias, _slope, dst, s + F * 0); - SynetFusedLayerForward2(src, _scale, _bias, _slope, dst, s + F * 1); - SynetFusedLayerForward2(src, _scale, _bias, _slope, dst, s + F * 2); - SynetFusedLayerForward2(src, _scale, _bias, _slope, dst, s + F * 3); - } - for (; s < spatialF; s += F) - SynetFusedLayerForward2(src, _scale, _bias, _slope, dst, s); - src += spatialF; - dst += spatialF; - } - } - - SIMD_INLINE void SynetFusedLayerForward2Nchw8c(const float * src, const 
float * scale, const float * bias, size_t channels, size_t spatial, const float * slope, float * dst) - { - if (Aligned(src) && Aligned(dst)) - SynetFusedLayerForward2Nchw8c(src, scale, bias, channels, spatial, slope, dst); - else - SynetFusedLayerForward2Nchw8c(src, scale, bias, channels, spatial, slope, dst); - } - - void SynetFusedLayerForward2(const float * src, const float * scale, const float * bias, size_t channels, size_t spatial, const float * slope, float * dst, SimdTensorFormatType format) - { - if (Base::NchwCompatible(channels, spatial, format)) - SynetFusedLayerForward2Nchw(src, scale, bias, channels, spatial, slope, dst); - else if (Base::NhwcCompatible(channels, spatial, format)) - SynetFusedLayerForward2Nhwc(src, scale, bias, channels, spatial, slope, dst); - else if (format == SimdTensorFormatNchw4c) - Sse::SynetFusedLayerForward2(src, scale, bias, channels, spatial, slope, dst, format); - else if (format == SimdTensorFormatNchw8c) - SynetFusedLayerForward2Nchw8c(src, scale, bias, channels, spatial, slope, dst); - else - Base::SynetFusedLayerForward2(src, scale, bias, channels, spatial, slope, dst, format); - } - - //--------------------------------------------------------------------- - - template SIMD_INLINE void SynetFusedLayerForward3(const float * src, const float * bias, const float * scale, float * dst, size_t offset) - { - __m256 _bias = Load(bias + offset); - __m256 x = _mm256_add_ps(Load(src + offset), _bias); - __m256 _scale = Load(scale + offset); - __m256 pos = _mm256_max_ps(_mm256_setzero_ps(), x); - __m256 neg = _mm256_min_ps(_mm256_setzero_ps(), x); - Store(dst + offset, _mm256_add_ps(pos, _mm256_mul_ps(_scale, neg))); - } - - template SIMD_INLINE void SynetFusedLayerForward3(const float * src, __m256 bias, __m256 scale, float * dst, size_t offset) - { - __m256 x = _mm256_add_ps(Load(src + offset), bias); - __m256 pos = _mm256_max_ps(_mm256_setzero_ps(), x); - __m256 neg = _mm256_min_ps(_mm256_setzero_ps(), x); - Store(dst + offset, _mm256_add_ps(pos, _mm256_mul_ps(scale, neg))); - } - - template void SynetFusedLayerForward3Nchw(const float * src, const float * bias, const float * scale, size_t channels, size_t spatial, float * dst) - { - if (align) - assert(Aligned(src) && Aligned(spatial, F) && Aligned(dst)); - - size_t aligned = AlignLo(spatial, QF); - size_t partial = AlignLo(spatial, F); - for (size_t c = 0; c < channels; ++c) - { - size_t s = 0; - if (partial) - { - __m256 _bias = _mm256_set1_ps(bias[c]); - __m256 _scale = _mm256_set1_ps(scale[c]); - for (; s < aligned; s += QF) - { - SynetFusedLayerForward3(src, _bias, _scale, dst, s + F * 0); - SynetFusedLayerForward3(src, _bias, _scale, dst, s + F * 1); - SynetFusedLayerForward3(src, _bias, _scale, dst, s + F * 2); - SynetFusedLayerForward3(src, _bias, _scale, dst, s + F * 3); - } - for (; s < partial; s += F) - SynetFusedLayerForward3(src, _bias, _scale, dst, s); - } - for (; s < spatial; ++s) - dst[s] = Base::SynetFusedLayerForward3(src[s] + bias[c], scale[c]); - src += spatial; - dst += spatial; - } - } - - SIMD_INLINE void SynetFusedLayerForward3Nchw(const float * src, const float * bias, const float * scale, size_t channels, size_t spatial, float * dst) - { - if (Aligned(src) && Aligned(spatial, F) && Aligned(dst)) - SynetFusedLayerForward3Nchw(src, bias, scale, channels, spatial, dst); - else - SynetFusedLayerForward3Nchw(src, bias, scale, channels, spatial, dst); - } - - template void SynetFusedLayerForward3Nhwc(const float * src, const float * bias, const float * scale, size_t channels, 
size_t spatial, float * dst) - { - if (align) - assert(Aligned(src) && Aligned(bias) && Aligned(scale) && Aligned(channels, F) && Aligned(dst)); - - size_t aligned = AlignLo(channels, QF); - size_t partial = AlignLo(channels, F); - for (size_t s = 0; s < spatial; ++s) - { - size_t c = 0; - if (partial) - { - for (; c < aligned; c += QF) - { - SynetFusedLayerForward3(src, bias, scale, dst, c + F * 0); - SynetFusedLayerForward3(src, bias, scale, dst, c + F * 1); - SynetFusedLayerForward3(src, bias, scale, dst, c + F * 2); - SynetFusedLayerForward3(src, bias, scale, dst, c + F * 3); - } - for (; c < partial; c += F) - SynetFusedLayerForward3(src, bias, scale, dst, c); - } - for (; c < channels; ++c) - dst[c] = Base::SynetFusedLayerForward3(src[c] + bias[c], scale[c]); - src += channels; - dst += channels; - } - } - - SIMD_INLINE void SynetFusedLayerForward3Nhwc(const float * src, const float * bias, const float * scale, size_t channels, size_t spatial, float * dst) - { - if (Aligned(src) && Aligned(bias) && Aligned(scale) && Aligned(channels, F) && Aligned(dst)) - SynetFusedLayerForward3Nhwc(src, bias, scale, channels, spatial, dst); - else - SynetFusedLayerForward3Nhwc(src, bias, scale, channels, spatial, dst); - } - - template void SynetFusedLayerForward3Nchw8c(const float * src, const float * bias, const float * scale, size_t channels, size_t spatial, float * dst) - { - if (align) - assert(Aligned(src) && Aligned(dst)); - - size_t spatialF = spatial * F; - size_t spatial4F = AlignLo(spatial, 4)*F; - for (size_t c = 0; c < channels; c += F) - { - __m256 _bias = Load(bias + c); - __m256 _scale = Load(scale + c); - size_t s = 0; - for (; s < spatial4F; s += 4 * F) - { - SynetFusedLayerForward3(src, _bias, _scale, dst, s + F * 0); - SynetFusedLayerForward3(src, _bias, _scale, dst, s + F * 1); - SynetFusedLayerForward3(src, _bias, _scale, dst, s + F * 2); - SynetFusedLayerForward3(src, _bias, _scale, dst, s + F * 3); - } - for (; s < spatialF; s += F) - SynetFusedLayerForward3(src, _bias, _scale, dst, s); - src += spatialF; - dst += spatialF; - } - } - - SIMD_INLINE void SynetFusedLayerForward3Nchw8c(const float * src, const float * bias, const float * scale, size_t channels, size_t spatial, float * dst) - { - if (Aligned(src) && Aligned(dst)) - SynetFusedLayerForward3Nchw8c(src, bias, scale, channels, spatial, dst); - else - SynetFusedLayerForward3Nchw8c(src, bias, scale, channels, spatial, dst); - } - - void SynetFusedLayerForward3(const float * src, const float * bias, const float * scale, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format) - { - if (Base::NchwCompatible(channels, spatial, format)) - SynetFusedLayerForward3Nchw(src, bias, scale, channels, spatial, dst); - else if (Base::NhwcCompatible(channels, spatial, format)) - SynetFusedLayerForward3Nhwc(src, bias, scale, channels, spatial, dst); - else if (format == SimdTensorFormatNchw4c) - Sse::SynetFusedLayerForward3(src, bias, scale, channels, spatial, dst, format); - else if (format == SimdTensorFormatNchw8c) - SynetFusedLayerForward3Nchw8c(src, bias, scale, channels, spatial, dst); - else - Base::SynetFusedLayerForward3(src, bias, scale, channels, spatial, dst, format); - } - - //--------------------------------------------------------------------- - - template SIMD_INLINE void SynetFusedLayerForward4(const float * src, const float * bias0, __m256 scale1, __m256 bias1, float * dst0, float * dst1, size_t offset) - { - __m256 x = _mm256_add_ps(Load(src + offset), Load(bias0 + offset)); - Store(dst0 + offset, 
_mm256_max_ps(_mm256_setzero_ps(), x)); - Store(dst1 + offset, _mm256_max_ps(_mm256_setzero_ps(), _mm256_add_ps(bias1, _mm256_mul_ps(scale1, x)))); - } - - template SIMD_INLINE void SynetFusedLayerForward4(const float * src, __m256 bias0, __m256 scale1, __m256 bias1, float * dst0, float * dst1, size_t offset) - { - __m256 x = _mm256_add_ps(Load(src + offset), bias0); - Store(dst0 + offset, _mm256_max_ps(_mm256_setzero_ps(), x)); - Store(dst1 + offset, _mm256_max_ps(_mm256_setzero_ps(), _mm256_add_ps(bias1, _mm256_mul_ps(scale1, x)))); - } - - template void SynetFusedLayerForward4Nchw(const float * src, const float * bias0, const float * scale1, const float * bias1, size_t channels, size_t spatial, float * dst0) - { - if (align) - assert(Aligned(src) && Aligned(spatial, F) && Aligned(dst0)); - - __m256 _bias1 = _mm256_set1_ps(bias1[0]); - __m256 _scale1 = _mm256_set1_ps(scale1[0]); - size_t aligned = AlignLo(spatial, QF); - size_t partial = AlignLo(spatial, F); - float * dst1 = dst0 + channels * spatial; - for (size_t c = 0; c < channels; ++c) - { - size_t s = 0; - if (partial) - { - __m256 _bias0 = _mm256_set1_ps(bias0[c]); - for (; s < aligned; s += QF) - { - SynetFusedLayerForward4(src, _bias0, _scale1, _bias1, dst0, dst1, s + F * 0); - SynetFusedLayerForward4(src, _bias0, _scale1, _bias1, dst0, dst1, s + F * 1); - SynetFusedLayerForward4(src, _bias0, _scale1, _bias1, dst0, dst1, s + F * 2); - SynetFusedLayerForward4(src, _bias0, _scale1, _bias1, dst0, dst1, s + F * 3); - } - for (; s < partial; s += F) - SynetFusedLayerForward4(src, _bias0, _scale1, _bias1, dst0, dst1, s); - } - for (; s < spatial; ++s) - Base::SynetFusedLayerForward4(src[s], bias0[c], scale1[0], bias1[0], dst0 + s, dst1 + s); - src += spatial; - dst0 += spatial; - dst1 += spatial; - } - } - - SIMD_INLINE void SynetFusedLayerForward4Nchw(const float * src, const float * bias0, const float * scale1, const float * bias1, size_t channels, size_t spatial, float * dst) - { - if (Aligned(src) && Aligned(spatial, F) && Aligned(dst)) - SynetFusedLayerForward4Nchw(src, bias0, scale1, bias1, channels, spatial, dst); - else - SynetFusedLayerForward4Nchw(src, bias0, scale1, bias1, channels, spatial, dst); - } - - template void SynetFusedLayerForward4Nhwc(const float * src, const float * bias0, const float * scale1, const float * bias1, size_t channels, size_t spatial, float * dst0) - { - if (align) - assert(Aligned(src) && Aligned(bias0) && Aligned(channels, F) && Aligned(dst0)); - - __m256 _bias1 = _mm256_set1_ps(bias1[0]); - __m256 _scale1 = _mm256_set1_ps(scale1[0]); - size_t aligned = AlignLo(channels, QF); - size_t partial = AlignLo(channels, F); - float * dst1 = dst0 + channels; - for (size_t s = 0; s < spatial; ++s) - { - size_t c = 0; - if (partial) - { - for (; c < aligned; c += QF) - { - SynetFusedLayerForward4(src, bias0, _scale1, _bias1, dst0, dst1, c + F * 0); - SynetFusedLayerForward4(src, bias0, _scale1, _bias1, dst0, dst1, c + F * 1); - SynetFusedLayerForward4(src, bias0, _scale1, _bias1, dst0, dst1, c + F * 2); - SynetFusedLayerForward4(src, bias0, _scale1, _bias1, dst0, dst1, c + F * 3); - } - for (; c < partial; c += F) - SynetFusedLayerForward4(src, bias0, _scale1, _bias1, dst0, dst1, c); - } - for (; c < channels; ++c) - Base::SynetFusedLayerForward4(src[c], bias0[c], scale1[0], bias1[0], dst0 + c, dst1 + c); - src += channels; - dst0 += 2 * channels; - dst1 += 2 * channels; - } - } - - SIMD_INLINE void SynetFusedLayerForward4Nhwc(const float * src, const float * bias0, const float * scale1, const float * 
bias1, size_t channels, size_t spatial, float * dst) - { - if (Aligned(src) && Aligned(bias0) && Aligned(channels, F) && Aligned(dst)) - SynetFusedLayerForward4Nhwc<true>(src, bias0, scale1, bias1, channels, spatial, dst); - else - SynetFusedLayerForward4Nhwc<false>(src, bias0, scale1, bias1, channels, spatial, dst); - } - - template <bool align> void SynetFusedLayerForward4Nchw8cA(const float * src, const float * bias0, const float * scale1, const float * bias1, size_t channels, size_t spatial, float * dst0) - { - if (align) - assert(Aligned(src) && Aligned(dst0)); - - __m256 _bias1 = _mm256_set1_ps(bias1[0]); - __m256 _scale1 = _mm256_set1_ps(scale1[0]); - size_t spatialF = spatial * F; - size_t spatial4F = AlignLo(spatial, 4) * F; - float * dst1 = dst0 + channels * spatial; - for (size_t c = 0; c < channels; c += F) - { - __m256 _bias0 = Load<false>(bias0 + c); - size_t s = 0; - for (; s < spatial4F; s += 4 * F) - { - SynetFusedLayerForward4<align>(src, _bias0, _scale1, _bias1, dst0, dst1, s + F * 0); - SynetFusedLayerForward4<align>(src, _bias0, _scale1, _bias1, dst0, dst1, s + F * 1); - SynetFusedLayerForward4<align>(src, _bias0, _scale1, _bias1, dst0, dst1, s + F * 2); - SynetFusedLayerForward4<align>(src, _bias0, _scale1, _bias1, dst0, dst1, s + F * 3); - } - for (; s < spatialF; s += F) - SynetFusedLayerForward4<align>(src, _bias0, _scale1, _bias1, dst0, dst1, s); - src += spatialF; - dst0 += spatialF; - dst1 += spatialF; - } - } - - SIMD_INLINE void SynetFusedLayerForward4Nchw8cA(const float * src, const float * bias0, const float * scale1, const float * bias1, size_t channels, size_t spatial, float * dst) - { - assert(Aligned(channels, F)); - if (Aligned(src) && Aligned(dst)) - SynetFusedLayerForward4Nchw8cA<true>(src, bias0, scale1, bias1, channels, spatial, dst); - else - SynetFusedLayerForward4Nchw8cA<false>(src, bias0, scale1, bias1, channels, spatial, dst); - } - - void SynetFusedLayerForward4(const float * src, const float * bias0, const float * scale1, const float * bias1, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format) - { - if (Base::NchwCompatible(channels, spatial, format)) - SynetFusedLayerForward4Nchw(src, bias0, scale1, bias1, channels, spatial, dst); - else if (Base::NhwcCompatible(channels, spatial, format)) - SynetFusedLayerForward4Nhwc(src, bias0, scale1, bias1, channels, spatial, dst); - else if (format == SimdTensorFormatNchw4c) - Sse::SynetFusedLayerForward4(src, bias0, scale1, bias1, channels, spatial, dst, format); - else if (format == SimdTensorFormatNchw8c && Aligned(channels, F)) - SynetFusedLayerForward4Nchw8cA(src, bias0, scale1, bias1, channels, spatial, dst); - else - Base::SynetFusedLayerForward4(src, bias0, scale1, bias1, channels, spatial, dst, format); - } - - //--------------------------------------------------------------------- - - template <bool align> SIMD_INLINE void SynetFusedLayerForward8(const float * src0, const float * src1, const float * src2, float * dst, size_t offset) - { - Store<align>(dst + offset, _mm256_add_ps(Load<align>(src0 + offset), _mm256_mul_ps(Load<align>(src1 + offset), Load<align>(src2 + offset)))); - } - - template <bool align> SIMD_INLINE void SynetFusedLayerForward8(const float * src0, const float * src1, const __m256 & src2, float * dst, size_t offset) - { - Store<align>(dst + offset, _mm256_add_ps(Load<align>(src0 + offset), _mm256_mul_ps(Load<align>(src1 + offset), src2))); - } - - template <bool align> void SynetFusedLayerForward8Nchw(const float * src0, const float * src1, const float * src2, size_t channels, size_t spatial, float * dst) - { - if (align) - assert(Aligned(src0) && Aligned(src1) && Aligned(spatial, F) && Aligned(dst)); - - size_t 
aligned = AlignLo(spatial, QF); - size_t partial = AlignLo(spatial, F); - for (size_t c = 0; c < channels; ++c) - { - size_t s = 0; - if (partial) - { - __m256 _src2 = _mm256_set1_ps(src2[c]); - for (; s < aligned; s += QF) - { - SynetFusedLayerForward8<align>(src0, src1, _src2, dst, s + F * 0); - SynetFusedLayerForward8<align>(src0, src1, _src2, dst, s + F * 1); - SynetFusedLayerForward8<align>(src0, src1, _src2, dst, s + F * 2); - SynetFusedLayerForward8<align>(src0, src1, _src2, dst, s + F * 3); - } - for (; s < partial; s += F) - SynetFusedLayerForward8<align>(src0, src1, _src2, dst, s); - } - for (; s < spatial; ++s) - dst[s] = Base::SynetFusedLayerForward8(src0[s], src1[s], src2[c]); - src0 += spatial; - src1 += spatial; - dst += spatial; - } - } - - SIMD_INLINE void SynetFusedLayerForward8Nchw(const float * src0, const float * src1, const float * src2, size_t channels, size_t spatial, float * dst) - { - if (Aligned(src0) && Aligned(src1) && Aligned(spatial, F) && Aligned(dst)) - SynetFusedLayerForward8Nchw<true>(src0, src1, src2, channels, spatial, dst); - else - SynetFusedLayerForward8Nchw<false>(src0, src1, src2, channels, spatial, dst); - } - - template <bool align> void SynetFusedLayerForward8Nhwc(const float * src0, const float * src1, const float * src2, size_t channels, size_t spatial, float * dst) - { - if (align) - assert(Aligned(src0) && Aligned(src1) && Aligned(src2) && Aligned(channels, F) && Aligned(dst)); - - size_t aligned = AlignLo(channels, QF); - size_t partial = AlignLo(channels, F); - for (size_t s = 0; s < spatial; ++s) - { - size_t c = 0; - if (partial) - { - for (; c < aligned; c += QF) - { - SynetFusedLayerForward8<align>(src0, src1, src2, dst, c + F * 0); - SynetFusedLayerForward8<align>(src0, src1, src2, dst, c + F * 1); - SynetFusedLayerForward8<align>(src0, src1, src2, dst, c + F * 2); - SynetFusedLayerForward8<align>(src0, src1, src2, dst, c + F * 3); - } - for (; c < partial; c += F) - SynetFusedLayerForward8<align>(src0, src1, src2, dst, c); - } - for (; c < channels; ++c) - dst[c] = Base::SynetFusedLayerForward8(src0[c], src1[c], src2[c]); - src0 += channels; - src1 += channels; - dst += channels; - } - } - - SIMD_INLINE void SynetFusedLayerForward8Nhwc(const float * src0, const float * src1, const float * src2, size_t channels, size_t spatial, float * dst) - { - if (Aligned(src0) && Aligned(src1) && Aligned(src2) && Aligned(channels, F) && Aligned(dst)) - SynetFusedLayerForward8Nhwc<true>(src0, src1, src2, channels, spatial, dst); - else - SynetFusedLayerForward8Nhwc<false>(src0, src1, src2, channels, spatial, dst); - } - - template <bool align> void SynetFusedLayerForward8Nchw8c(const float * src0, const float * src1, const float * src2, size_t channels, size_t spatial, float * dst) - { - if (align) - assert(Aligned(src0) && Aligned(src1) && Aligned(dst)); - - size_t spatialF = spatial * F; - size_t spatial4F = AlignLo(spatial, 4)*F; - for (size_t c = 0; c < channels; c += F) - { - __m256 _src2 = Load<false>(src2 + c); - size_t s = 0; - for (; s < spatial4F; s += 4 * F) - { - SynetFusedLayerForward8<align>(src0, src1, _src2, dst, s + F * 0); - SynetFusedLayerForward8<align>(src0, src1, _src2, dst, s + F * 1); - SynetFusedLayerForward8<align>(src0, src1, _src2, dst, s + F * 2); - SynetFusedLayerForward8<align>(src0, src1, _src2, dst, s + F * 3); - } - for (; s < spatialF; s += F) - SynetFusedLayerForward8<align>(src0, src1, _src2, dst, s); - src0 += spatialF; - src1 += spatialF; - dst += spatialF; - } - } - - SIMD_INLINE void SynetFusedLayerForward8Nchw8c(const float * src0, const float * src1, const float * src2, size_t channels, size_t spatial, float * dst) - { - if (Aligned(src0) && Aligned(src1) && 
Aligned(dst)) - SynetFusedLayerForward8Nchw8c<true>(src0, src1, src2, channels, spatial, dst); - else - SynetFusedLayerForward8Nchw8c<false>(src0, src1, src2, channels, spatial, dst); - } - - void SynetFusedLayerForward8(const float * src0, const float * src1, const float * src2, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format) - { - if (Base::NchwCompatible(channels, spatial, format)) - SynetFusedLayerForward8Nchw(src0, src1, src2, channels, spatial, dst); - else if (Base::NhwcCompatible(channels, spatial, format)) - SynetFusedLayerForward8Nhwc(src0, src1, src2, channels, spatial, dst); - else if (format == SimdTensorFormatNchw4c) - Sse::SynetFusedLayerForward8(src0, src1, src2, channels, spatial, dst, format); - else if (format == SimdTensorFormatNchw8c) - SynetFusedLayerForward8Nchw8c(src0, src1, src2, channels, spatial, dst); - else - Base::SynetFusedLayerForward8(src0, src1, src2, channels, spatial, dst, format); - } - - //--------------------------------------------------------------------- - - template <bool align> SIMD_INLINE void SynetFusedLayerForward9(const float * src, const float * scale, const float * bias, float * dst0, float * dst1, size_t offset) - { - __m256 _src = Load<align>(src + offset); - Store<align>(dst0 + offset, _mm256_max_ps(_mm256_setzero_ps(), _mm256_add_ps(_mm256_mul_ps(_src, Load<align>(scale + offset)), Load<align>(bias + offset)))); - Store<align>(dst1 + offset, _src); - } - - template <bool align> SIMD_INLINE void SynetFusedLayerForward9(const float * src, const float * scale, const float * bias, float * dst0, size_t offset) - { - __m256 _src = Load<align>(src + offset); - Store<align>(dst0 + offset, _mm256_max_ps(_mm256_setzero_ps(), _mm256_add_ps(_mm256_mul_ps(_src, Load<align>(scale + offset)), Load<align>(bias + offset)))); - } - - template <bool align> SIMD_INLINE void SynetFusedLayerForward9(const float * src, const __m256 & scale, const __m256 & bias, float * dst0, float * dst1, size_t offset) - { - __m256 _src = Load<align>(src + offset); - Store<align>(dst0 + offset, _mm256_max_ps(_mm256_setzero_ps(), _mm256_add_ps(_mm256_mul_ps(_src, scale), bias))); - Store<align>(dst1 + offset, _src); - } - - template <bool align> SIMD_INLINE void SynetFusedLayerForward9(const float * src, const __m256 & scale, const __m256 & bias, float * dst0, size_t offset) - { - __m256 _src = Load<align>(src + offset); - Store<align>(dst0 + offset, _mm256_max_ps(_mm256_setzero_ps(), _mm256_add_ps(_mm256_mul_ps(_src, scale), bias))); - } - - template void SynetFusedLayerForward9Nchw(const float * src0, const float * src1, const float * scale0, const float * bias0, size_t channels0, size_t channels1, size_t spatial, float * dst0, float * dst1) - { - if (align) - assert(Aligned(src0) && Aligned(src1) && Aligned(spatial, F) && Aligned(dst0) && Aligned(dst1)); - const float * scale1 = scale0 + channels0; - const float * bias1 = bias0 + channels0; - size_t aligned = AlignLo(spatial, QF); - size_t partial = AlignLo(spatial, F); - if (dst1) - { - for (size_t c = 0; c < channels0; ++c) - { - size_t s = 0; - if (partial) - { - __m256 _scale0 = _mm256_set1_ps(scale0[c]); - __m256 _bias0 = _mm256_set1_ps(bias0[c]); - for (; s < aligned; s += QF) - { - SynetFusedLayerForward9(src0, _scale0, _bias0, dst0, dst1, s + 0 * F); - SynetFusedLayerForward9(src0, _scale0, _bias0, dst0, dst1, s + 1 * F); - SynetFusedLayerForward9(src0, _scale0, _bias0, dst0, dst1, s + 2 * F); - SynetFusedLayerForward9(src0, _scale0, _bias0, dst0, dst1, s + 3 * F); - } - for (; s < partial; s += F) - SynetFusedLayerForward9(src0, _scale0, _bias0, dst0, dst1, s); - } - for (; s < spatial; ++s) - dst0[s] = Base::SynetFusedLayerForward9(src0[s], 
scale0[c], bias0[c]), dst1[s] = src0[s]; - src0 += spatial; - dst0 += spatial; - dst1 += spatial; - } - for (size_t c = 0; c < channels1; ++c) - { - size_t s = 0; - if (partial) - { - __m256 _scale1 = _mm256_set1_ps(scale1[c]); - __m256 _bias1 = _mm256_set1_ps(bias1[c]); - for (; s < aligned; s += QF) - { - SynetFusedLayerForward9(src1, _scale1, _bias1, dst0, dst1, s + 0 * F); - SynetFusedLayerForward9(src1, _scale1, _bias1, dst0, dst1, s + 1 * F); - SynetFusedLayerForward9(src1, _scale1, _bias1, dst0, dst1, s + 2 * F); - SynetFusedLayerForward9(src1, _scale1, _bias1, dst0, dst1, s + 3 * F); - } - for (; s < partial; s += F) - SynetFusedLayerForward9(src1, _scale1, _bias1, dst0, dst1, s); - } - for (; s < spatial; ++s) - dst0[s] = Base::SynetFusedLayerForward9(src1[s], scale1[c], bias1[c]), dst1[s] = src1[s]; - src1 += spatial; - dst0 += spatial; - dst1 += spatial; - } - } - else - { - for (size_t c = 0; c < channels0; ++c) - { - size_t s = 0; - if (partial) - { - __m256 _scale0 = _mm256_set1_ps(scale0[c]); - __m256 _bias0 = _mm256_set1_ps(bias0[c]); - for (; s < aligned; s += QF) - { - SynetFusedLayerForward9(src0, _scale0, _bias0, dst0, s + 0 * F); - SynetFusedLayerForward9(src0, _scale0, _bias0, dst0, s + 1 * F); - SynetFusedLayerForward9(src0, _scale0, _bias0, dst0, s + 2 * F); - SynetFusedLayerForward9(src0, _scale0, _bias0, dst0, s + 3 * F); - } - for (; s < partial; s += F) - SynetFusedLayerForward9(src0, _scale0, _bias0, dst0, s); - } - for (; s < spatial; ++s) - dst0[s] = Base::SynetFusedLayerForward9(src0[s], scale0[c], bias0[c]); - src0 += spatial; - dst0 += spatial; - } - for (size_t c = 0; c < channels1; ++c) - { - size_t s = 0; - if (partial) - { - __m256 _scale1 = _mm256_set1_ps(scale1[c]); - __m256 _bias1 = _mm256_set1_ps(bias1[c]); - for (; s < aligned; s += QF) - { - SynetFusedLayerForward9(src1, _scale1, _bias1, dst0, s + 0 * F); - SynetFusedLayerForward9(src1, _scale1, _bias1, dst0, s + 1 * F); - SynetFusedLayerForward9(src1, _scale1, _bias1, dst0, s + 2 * F); - SynetFusedLayerForward9(src1, _scale1, _bias1, dst0, s + 3 * F); - } - for (; s < partial; s += F) - SynetFusedLayerForward9(src1, _scale1, _bias1, dst0, s); - } - for (; s < spatial; ++s) - dst0[s] = Base::SynetFusedLayerForward9(src1[s], scale1[c], bias1[c]); - src1 += spatial; - dst0 += spatial; - } - } - } - - SIMD_INLINE void SynetFusedLayerForward9Nchw(const float * src0, const float * src1, const float * scale, const float * bias, size_t channels0, size_t channels1, size_t spatial, float * dst0, float * dst1) - { - if (Aligned(src0) && Aligned(src1) && Aligned(spatial, F) && Aligned(dst0) && Aligned(dst1)) - SynetFusedLayerForward9Nchw(src0, src1, scale, bias, channels0, channels1, spatial, dst0, dst1); - else - SynetFusedLayerForward9Nchw(src0, src1, scale, bias, channels0, channels1, spatial, dst0, dst1); - } - - template void SynetFusedLayerForward9Nhwc(const float * src0, const float * src1, const float * scale0, const float * bias0, size_t channels0, size_t channels1, size_t spatial, float * dst0, float * dst1) - { - if (align) - assert(Aligned(src0) && Aligned(src1) && Aligned(scale0) && Aligned(bias0) && Aligned(channels0, F) && Aligned(channels1, F) && Aligned(dst0) && Aligned(dst1)); - const float * scale1 = scale0 + channels0; - const float * bias1 = bias0 + channels0; - size_t aligned0 = AlignLo(channels0, QF); - size_t partial0 = AlignLo(channels0, F); - size_t aligned1 = AlignLo(channels1, QF); - size_t partial1 = AlignLo(channels1, F); - if (dst1) - { - for (size_t s = 0; s < spatial; ++s) - 
{ - size_t c = 0; - for (; c < aligned0; c += QF) - { - SynetFusedLayerForward9(src0, scale0, bias0, dst0, dst1, c + 0 * F); - SynetFusedLayerForward9(src0, scale0, bias0, dst0, dst1, c + 1 * F); - SynetFusedLayerForward9(src0, scale0, bias0, dst0, dst1, c + 2 * F); - SynetFusedLayerForward9(src0, scale0, bias0, dst0, dst1, c + 3 * F); - } - for (; c < partial0; c += F) - SynetFusedLayerForward9(src0, scale0, bias0, dst0, dst1, c); - for (; c < channels0; ++c) - dst0[c] = Base::SynetFusedLayerForward9(src0[c], scale0[c], bias0[c]), dst1[c] = src0[c]; - src0 += channels0; - dst0 += channels0; - dst1 += channels0; - c = 0; - for (; c < aligned1; c += QF) - { - SynetFusedLayerForward9(src1, scale1, bias1, dst0, dst1, c + 0 * F); - SynetFusedLayerForward9(src1, scale1, bias1, dst0, dst1, c + 1 * F); - SynetFusedLayerForward9(src1, scale1, bias1, dst0, dst1, c + 2 * F); - SynetFusedLayerForward9(src1, scale1, bias1, dst0, dst1, c + 3 * F); - } - for (; c < partial1; c += F) - SynetFusedLayerForward9(src1, scale1, bias1, dst0, dst1, c); - for (; c < channels1; ++c) - dst0[c] = Base::SynetFusedLayerForward9(src1[c], scale1[c], bias1[c]), dst1[c] = src1[c]; - src1 += channels1; - dst0 += channels1; - dst1 += channels1; - } - } - else - { - for (size_t s = 0; s < spatial; ++s) - { - size_t c = 0; - for (; c < aligned0; c += QF) - { - SynetFusedLayerForward9(src0, scale0, bias0, dst0, c + 0 * F); - SynetFusedLayerForward9(src0, scale0, bias0, dst0, c + 1 * F); - SynetFusedLayerForward9(src0, scale0, bias0, dst0, c + 2 * F); - SynetFusedLayerForward9(src0, scale0, bias0, dst0, c + 3 * F); - } - for (; c < partial0; c += F) - SynetFusedLayerForward9(src0, scale0, bias0, dst0, c); - for (; c < channels0; ++c) - dst0[c] = Base::SynetFusedLayerForward9(src0[c], scale0[c], bias0[c]); - src0 += channels0; - dst0 += channels0; - c = 0; - for (; c < aligned1; c += QF) - { - SynetFusedLayerForward9(src1, scale1, bias1, dst0, c + 0 * F); - SynetFusedLayerForward9(src1, scale1, bias1, dst0, c + 1 * F); - SynetFusedLayerForward9(src1, scale1, bias1, dst0, c + 2 * F); - SynetFusedLayerForward9(src1, scale1, bias1, dst0, c + 3 * F); - } - for (; c < partial1; c += F) - SynetFusedLayerForward9(src1, scale1, bias1, dst0, c); - for (; c < channels1; ++c) - dst0[c] = Base::SynetFusedLayerForward9(src1[c], scale1[c], bias1[c]); - src1 += channels1; - dst0 += channels1; - } - } - } - - SIMD_INLINE void SynetFusedLayerForward9Nhwc(const float * src0, const float * src1, const float * scale, const float * bias, size_t channels0, size_t channels1, size_t spatial, float * dst0, float * dst1) - { - if (Aligned(src0) && Aligned(src1) && Aligned(scale) && Aligned(bias) && Aligned(channels0, F) && Aligned(channels1, F) && Aligned(dst0) && Aligned(dst1)) - SynetFusedLayerForward9Nhwc(src0, src1, scale, bias, channels0, channels1, spatial, dst0, dst1); - else - SynetFusedLayerForward9Nhwc(src0, src1, scale, bias, channels0, channels1, spatial, dst0, dst1); - } - - template void SynetFusedLayerForward9Nchw8cA(const float * src0, const float * src1, const float * scale0, const float * bias0, size_t channels0, size_t channels1, size_t spatial, float * dst0, float * dst1) - { - if (align) - assert(Aligned(src0) && Aligned(src1) && Aligned(dst0) && Aligned(dst1)); - const float * scale1 = scale0 + channels0; - const float * bias1 = bias0 + channels0; - size_t spatialF = spatial * F; - size_t spatial4F = AlignLo(spatial, 4)*F; - if (dst1) - { - for (size_t c = 0; c < channels0; c += F) - { - __m256 _scale0 = Load(scale0 + c); - __m256 
_bias0 = Load(bias0 + c); - size_t s = 0; - for (; s < spatial4F; s += 4 * F) - { - SynetFusedLayerForward9(src0, _scale0, _bias0, dst0, dst1, s + F * 0); - SynetFusedLayerForward9(src0, _scale0, _bias0, dst0, dst1, s + F * 1); - SynetFusedLayerForward9(src0, _scale0, _bias0, dst0, dst1, s + F * 2); - SynetFusedLayerForward9(src0, _scale0, _bias0, dst0, dst1, s + F * 3); - } - for (; s < spatialF; s += F) - SynetFusedLayerForward9(src0, _scale0, _bias0, dst0, dst1, s); - src0 += spatialF; - dst0 += spatialF; - dst1 += spatialF; - } - for (size_t c = 0; c < channels1; c += F) - { - __m256 _scale1 = Load(scale1 + c); - __m256 _bias1 = Load(bias1 + c); - size_t s = 0; - for (; s < spatial4F; s += 4 * F) - { - SynetFusedLayerForward9(src1, _scale1, _bias1, dst0, dst1, s + F * 0); - SynetFusedLayerForward9(src1, _scale1, _bias1, dst0, dst1, s + F * 1); - SynetFusedLayerForward9(src1, _scale1, _bias1, dst0, dst1, s + F * 2); - SynetFusedLayerForward9(src1, _scale1, _bias1, dst0, dst1, s + F * 3); - } - for (; s < spatialF; s += F) - SynetFusedLayerForward9(src1, _scale1, _bias1, dst0, dst1, s); - src1 += spatialF; - dst0 += spatialF; - dst1 += spatialF; - } - } - else - { - for (size_t c = 0; c < channels0; c += F) - { - __m256 _scale0 = Load(scale0 + c); - __m256 _bias0 = Load(bias0 + c); - size_t s = 0; - for (; s < spatial4F; s += 4 * F) - { - SynetFusedLayerForward9(src0, _scale0, _bias0, dst0, s + F * 0); - SynetFusedLayerForward9(src0, _scale0, _bias0, dst0, s + F * 1); - SynetFusedLayerForward9(src0, _scale0, _bias0, dst0, s + F * 2); - SynetFusedLayerForward9(src0, _scale0, _bias0, dst0, s + F * 3); - } - for (; s < spatialF; s += F) - SynetFusedLayerForward9(src0, _scale0, _bias0, dst0, s); - src0 += spatialF; - dst0 += spatialF; - } - for (size_t c = 0; c < channels1; c += F) - { - __m256 _scale1 = Load(scale1 + c); - __m256 _bias1 = Load(bias1 + c); - size_t s = 0; - for (; s < spatial4F; s += 4 * F) - { - SynetFusedLayerForward9(src1, _scale1, _bias1, dst0, s + F * 0); - SynetFusedLayerForward9(src1, _scale1, _bias1, dst0, s + F * 1); - SynetFusedLayerForward9(src1, _scale1, _bias1, dst0, s + F * 2); - SynetFusedLayerForward9(src1, _scale1, _bias1, dst0, s + F * 3); - } - for (; s < spatialF; s += F) - SynetFusedLayerForward9(src1, _scale1, _bias1, dst0, s); - src1 += spatialF; - dst0 += spatialF; - } - } - } - - SIMD_INLINE void SynetFusedLayerForward9Nchw8cA(const float * src0, const float * src1, const float * scale, const float * bias, size_t channels0, size_t channels1, size_t spatial, float * dst0, float * dst1) - { - assert(Aligned(channels0, F)); - if (Aligned(src0) && Aligned(src1) && Aligned(dst0) && Aligned(dst1)) - SynetFusedLayerForward9Nchw8cA(src0, src1, scale, bias, channels0, channels1, spatial, dst0, dst1); - else - SynetFusedLayerForward9Nchw8cA(src0, src1, scale, bias, channels0, channels1, spatial, dst0, dst1); - } - - void SynetFusedLayerForward9(const float * src0, const float * src1, const float * scale, const float * bias, size_t channels0, size_t channels1, size_t spatial, float * dst0, float * dst1, SimdTensorFormatType format) - { - if (Base::NchwCompatible(channels0 + channels1, spatial, format)) - SynetFusedLayerForward9Nchw(src0, src1, scale, bias, channels0, channels1, spatial, dst0, dst1); - else if (Base::NhwcCompatible(channels0 + channels1, spatial, format)) - SynetFusedLayerForward9Nhwc(src0, src1, scale, bias, channels0, channels1, spatial, dst0, dst1); - else if (format == SimdTensorFormatNchw4c) - Sse::SynetFusedLayerForward9(src0, src1, scale, 
bias, channels0, channels1, spatial, dst0, dst1, format); - else if (format == SimdTensorFormatNchw8c && Aligned(channels0, F)) - SynetFusedLayerForward9Nchw8cA(src0, src1, scale, bias, channels0, channels1, spatial, dst0, dst1); - else - Base::SynetFusedLayerForward9(src0, src1, scale, bias, channels0, channels1, spatial, dst0, dst1, format); - } - } -#endif// SIMD_AVX_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx1SynetMergedConvolution32f.cpp b/src/3rd/Simd/Simd/SimdAvx1SynetMergedConvolution32f.cpp deleted file mode 100644 index 0863ad34..00000000 --- a/src/3rd/Simd/Simd/SimdAvx1SynetMergedConvolution32f.cpp +++ /dev/null @@ -1,1436 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2019 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdSynetMergedConvolution32f.h" -#include "Simd/SimdSynetConvolution32fCommon.h" -#include "Simd/SimdUpdate.h" -#include "Simd/SimdCpu.h" - -namespace Simd -{ -#if defined(SIMD_AVX_ENABLE) - namespace Avx - { - template SIMD_INLINE void InputConvolution1x1_2x6(const float * src0, size_t srcC, - const float * weight, const __m256 * bias, const __m256 * params, float * dst0, float * dst1) - { - __m256 d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, s0, w0, w1; - d00 = bias[0], d01 = bias[1]; - d10 = bias[0], d11 = bias[1]; - d20 = bias[0], d21 = bias[1]; - d30 = bias[0], d31 = bias[1]; - d40 = bias[0], d41 = bias[1]; - d50 = bias[0], d51 = bias[1]; - const float * src1 = src0 + 1 * srcC; - const float * src2 = src0 + 2 * srcC; - const float * src3 = src0 + 3 * srcC; - const float * src4 = src0 + 4 * srcC; - const float * src5 = src0 + 5 * srcC; - for (size_t sc = 0; sc < srcC; ++sc) - { - w0 = _mm256_loadu_ps(weight + 0); - w1 = _mm256_loadu_ps(weight + F); - s0 = _mm256_set1_ps(src0[sc]); - d00 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d00); - d01 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d01); - s0 = _mm256_set1_ps(src1[sc]); - d10 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d10); - d11 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d11); - s0 = _mm256_set1_ps(src2[sc]); - d20 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d20); - d21 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d21); - s0 = _mm256_set1_ps(src3[sc]); - d30 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d30); - d31 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d31); - s0 = _mm256_set1_ps(src4[sc]); - d40 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d40); - d41 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d41); - s0 = _mm256_set1_ps(src5[sc]); - d50 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d50); - d51 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d51); - weight += DF; - } - _mm256_storeu_ps(dst0 + 0 * F, Activate(d00, params, 0)); - _mm256_storeu_ps(dst1 + 0 * F, Activate(d01, params, 1)); - _mm256_storeu_ps(dst0 + 1 * F, Activate(d10, params, 0)); - _mm256_storeu_ps(dst1 + 1 * F, Activate(d11, params, 1)); - _mm256_storeu_ps(dst0 + 2 * F, Activate(d20, params, 0)); - _mm256_storeu_ps(dst1 + 2 * F, Activate(d21, params, 1)); - _mm256_storeu_ps(dst0 + 3 * F, Activate(d30, params, 0)); - _mm256_storeu_ps(dst1 + 3 * F, Activate(d31, params, 1)); - _mm256_storeu_ps(dst0 + 4 * F, Activate(d40, params, 0)); - _mm256_storeu_ps(dst1 + 4 * F, Activate(d41, params, 1)); - _mm256_storeu_ps(dst0 + 5 * F, Activate(d50, params, 0)); - _mm256_storeu_ps(dst1 + 5 * F, Activate(d51, params, 1)); - } - - template SIMD_INLINE void InputConvolution1x1_2xM(const float * src0, size_t srcC, - const float * weight, const __m256 * bias, const __m256 * params, float * dst0, float * dst1) - { - __m256 d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, s0, w0, w1; - if (M > 0) d00 = bias[0], d01 = bias[1]; - if (M > 1) d10 = bias[0], d11 = bias[1]; - if (M > 2) d20 = bias[0], d21 = bias[1]; - if (M > 3) d30 = bias[0], d31 = bias[1]; - if (M > 4) d40 = bias[0], d41 = bias[1]; - if (M > 5) d50 = bias[0], d51 = bias[1]; - const float * src1 = src0 + 1 * srcC; - const float * src2 = src0 + 2 * srcC; - const float * src3 = src0 + 3 * srcC; - const float * src4 = src0 + 4 * srcC; - const float * src5 = src0 + 5 * srcC; - for (size_t sc = 0; sc < srcC; ++sc) - { - w0 = _mm256_loadu_ps(weight + 0); - w1 = _mm256_loadu_ps(weight + F); - if (M > 0) s0 = _mm256_set1_ps(src0[sc]), d00 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d00), d01 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d01); - if (M > 1) 
s0 = _mm256_set1_ps(src1[sc]), d10 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d10), d11 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d11); - if (M > 2) s0 = _mm256_set1_ps(src2[sc]), d20 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d20), d21 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d21); - if (M > 3) s0 = _mm256_set1_ps(src3[sc]), d30 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d30), d31 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d31); - if (M > 4) s0 = _mm256_set1_ps(src4[sc]), d40 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d40), d41 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d41); - if (M > 5) s0 = _mm256_set1_ps(src5[sc]), d50 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d50), d51 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d51); - weight += DF; - } - if (M > 0) _mm256_storeu_ps(dst0 + 0 * F, Activate<type>(d00, params, 0)), _mm256_storeu_ps(dst1 + 0 * F, Activate<type>(d01, params, 1)); - if (M > 1) _mm256_storeu_ps(dst0 + 1 * F, Activate<type>(d10, params, 0)), _mm256_storeu_ps(dst1 + 1 * F, Activate<type>(d11, params, 1)); - if (M > 2) _mm256_storeu_ps(dst0 + 2 * F, Activate<type>(d20, params, 0)), _mm256_storeu_ps(dst1 + 2 * F, Activate<type>(d21, params, 1)); - if (M > 3) _mm256_storeu_ps(dst0 + 3 * F, Activate<type>(d30, params, 0)), _mm256_storeu_ps(dst1 + 3 * F, Activate<type>(d31, params, 1)); - if (M > 4) _mm256_storeu_ps(dst0 + 4 * F, Activate<type>(d40, params, 0)), _mm256_storeu_ps(dst1 + 4 * F, Activate<type>(d41, params, 1)); - if (M > 5) _mm256_storeu_ps(dst0 + 5 * F, Activate<type>(d50, params, 0)), _mm256_storeu_ps(dst1 + 5 * F, Activate<type>(d51, params, 1)); - } - - typedef void(*InputConvolution1x1_2xM_Ptr)(const float * src0, size_t srcC, const float * weight, const __m256 * bias, const __m256 * params, float * dst0, float * dst1); - - template<SimdConvolutionActivationType type> InputConvolution1x1_2xM_Ptr GetInputConvolution1x1_2xM(size_t M) - { - switch (M) - { - case 0: return InputConvolution1x1_2xM<type, 0>; - case 1: return InputConvolution1x1_2xM<type, 1>; - case 2: return InputConvolution1x1_2xM<type, 2>; - case 3: return InputConvolution1x1_2xM<type, 3>; - case 4: return InputConvolution1x1_2xM<type, 4>; - case 5: return InputConvolution1x1_2xM<type, 5>; - } - assert(0); - return NULL; - } - - template<SimdConvolutionActivationType type> SIMD_INLINE void InputConvolution1x1_1x6(const float * src0, size_t srcC, - const float * weight, const __m256 * bias, const __m256 * params, float * dst0) - { - __m256 d00, d10, d20, d30, d40, d50, s0, w0; - d00 = bias[0]; - d10 = bias[0]; - d20 = bias[0]; - d30 = bias[0]; - d40 = bias[0]; - d50 = bias[0]; - const float * src1 = src0 + 1 * srcC; - const float * src2 = src0 + 2 * srcC; - const float * src3 = src0 + 3 * srcC; - const float * src4 = src0 + 4 * srcC; - const float * src5 = src0 + 5 * srcC; - for (size_t sc = 0; sc < srcC; ++sc) - { - w0 = _mm256_loadu_ps(weight + 0); - s0 = _mm256_set1_ps(src0[sc]); - d00 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d00); - s0 = _mm256_set1_ps(src1[sc]); - d10 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d10); - s0 = _mm256_set1_ps(src2[sc]); - d20 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d20); - s0 = _mm256_set1_ps(src3[sc]); - d30 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d30); - s0 = _mm256_set1_ps(src4[sc]); - d40 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d40); - s0 = _mm256_set1_ps(src5[sc]); - d50 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d50); - weight += DF; - } - _mm256_storeu_ps(dst0 + 0 * F, Activate<type>(d00, params, 0)); - _mm256_storeu_ps(dst0 + 1 * F, Activate<type>(d10, params, 0)); - _mm256_storeu_ps(dst0 + 2 * F, Activate<type>(d20, params, 0)); - _mm256_storeu_ps(dst0 + 3 * F, Activate<type>(d30, params, 0)); - _mm256_storeu_ps(dst0 + 4 * F, Activate<type>(d40, params, 0)); - _mm256_storeu_ps(dst0 + 5 * F, Activate<type>(d50, params, 0)); - } - - 
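// ------------------------------------------------------------------------
// Editorial sketch (not part of the original Simd sources). The 1x6/2x6
// kernels above tile a 1x1 NHWC convolution in registers: six consecutive
// output pixels times one or two F-wide output-channel blocks give up to
// twelve YMM accumulators, with w0/w1 and the broadcast s0 taking three
// more, so nearly all sixteen AVX registers stay busy per loop iteration.
// A minimal, hypothetical scalar reference of the same arithmetic, assuming
// the packed weight layout used above (each input channel sc owns a
// contiguous block of dstC output-channel weights); the Activate<type>()
// step applied by the kernels is omitted here:
static void InputConvolution1x1_Ref(const float * src, size_t srcC,
    const float * weight, const float * bias, size_t dstC, float * dst)
{
    for (size_t dc = 0; dc < dstC; ++dc)
    {
        float sum = bias ? bias[dc] : 0.0f;   // accumulators start from bias
        for (size_t sc = 0; sc < srcC; ++sc)  // dot product over input channels
            sum += src[sc] * weight[sc * dstC + dc];
        dst[dc] = sum;                        // activation would be applied on top
    }
}
// ------------------------------------------------------------------------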
template SIMD_INLINE void InputConvolution1x1_1xM(const float * src0, size_t srcC, - const float * weight, const __m256 * bias, const __m256 * params, float * dst0) - { - __m256 d00, d10, d20, d30, d40, d50, s0, w0; - if (M > 0) d00 = bias[0]; - if (M > 1) d10 = bias[0]; - if (M > 2) d20 = bias[0]; - if (M > 3) d30 = bias[0]; - if (M > 4) d40 = bias[0]; - if (M > 5) d50 = bias[0]; - const float * src1 = src0 + 1 * srcC; - const float * src2 = src0 + 2 * srcC; - const float * src3 = src0 + 3 * srcC; - const float * src4 = src0 + 4 * srcC; - const float * src5 = src0 + 5 * srcC; - for (size_t sc = 0; sc < srcC; ++sc) - { - w0 = _mm256_loadu_ps(weight + 0); - if (M > 0) s0 = _mm256_set1_ps(src0[sc]), d00 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d00); - if (M > 1) s0 = _mm256_set1_ps(src1[sc]), d10 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d10); - if (M > 2) s0 = _mm256_set1_ps(src2[sc]), d20 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d20); - if (M > 3) s0 = _mm256_set1_ps(src3[sc]), d30 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d30); - if (M > 4) s0 = _mm256_set1_ps(src4[sc]), d40 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d40); - if (M > 5) s0 = _mm256_set1_ps(src5[sc]), d50 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d50); - weight += DF; - } - if (M > 0) _mm256_storeu_ps(dst0 + 0 * F, Activate(d00, params, 0)); - if (M > 1) _mm256_storeu_ps(dst0 + 1 * F, Activate(d10, params, 0)); - if (M > 2) _mm256_storeu_ps(dst0 + 2 * F, Activate(d20, params, 0)); - if (M > 3) _mm256_storeu_ps(dst0 + 3 * F, Activate(d30, params, 0)); - if (M > 4) _mm256_storeu_ps(dst0 + 4 * F, Activate(d40, params, 0)); - if (M > 5) _mm256_storeu_ps(dst0 + 5 * F, Activate(d50, params, 0)); - } - - typedef void(*InputConvolution1x1_1xM_Ptr)(const float * src0, size_t srcC, const float * weight, const __m256 * bias, const __m256 * params, float * dst0); - - template InputConvolution1x1_1xM_Ptr GetInputConvolution1x1_1xM(size_t M) - { - switch (M) - { - case 0: return InputConvolution1x1_1xM; - case 1: return InputConvolution1x1_1xM; - case 2: return InputConvolution1x1_1xM; - case 3: return InputConvolution1x1_1xM; - case 4: return InputConvolution1x1_1xM; - case 5: return InputConvolution1x1_1xM; - } - assert(0); - return NULL; - } - - template void InputConvolution1x1(const float * src, const SimdConvolutionParameters & p, - size_t dstC, size_t yBeg, size_t yEnd, const size_t bufH[2], const float * weight, const float * bias, const float * params, float * dst) - { - size_t srcH = p.srcH, srcW = p.srcW, srcC = p.srcC, dstW = p.dstW; - size_t dstM = (bufH[0] - 1), dstS = bufH[0] * dstW *F; - size_t dstCDF = AlignLo(dstC, DF); - __m256 _params[2], _bias[2]; - _params[0] = _mm256_set1_ps(params[0]); - if (type == ::SimdConvolutionActivationRestrictRange || type == ::SimdConvolutionActivationHswish) - _params[1] = _mm256_set1_ps(params[1]); -#ifdef SIMD_MERGECONV_MERGE_INPUT_ROWS_1X1 - size_t yInt = Simd::Max(yBeg, yEnd&(~dstM)), nBeg = yBeg * dstW, nInt = yInt * dstW, nEnd = yEnd * dstW; - size_t nInt6 = AlignLoAny(nInt - nBeg, 6) + nBeg, nEnd6 = AlignLoAny(nEnd - nInt, 6) + nInt, nIntTail = nInt - nInt6, nEndTail = nEnd - nEnd6; - InputConvolution1x1_2xM_Ptr tailInt_2 = GetInputConvolution1x1_2xM(nIntTail); - InputConvolution1x1_2xM_Ptr tailEnd_2 = GetInputConvolution1x1_2xM(nEndTail); -#else - size_t dstW6 = AlignLoAny(dstW, 6), wTail = dstW - dstW6; - InputConvolution1x1_2xM_Ptr tailW_2 = GetInputConvolution1x1_2xM(wTail); - InputConvolution1x1_1xM_Ptr tailW_1 = GetInputConvolution1x1_1xM(wTail); -#endif - - size_t dc = 0; - for (; dc < 
dstC; dc += DF) - { - _bias[0] = bias ? _mm256_loadu_ps(bias + dc + 0) : _mm256_setzero_ps(); - _bias[1] = bias ? _mm256_loadu_ps(bias + dc + F) : _mm256_setzero_ps(); - if (type == ::SimdConvolutionActivationPrelu) - { - _params[0] = _mm256_loadu_ps(params + dc + 0); - _params[1] = _mm256_loadu_ps(params + dc + F); - } - const float * pS = src + yBeg * srcW*srcC; - const float * pW = weight + dc * srcC; - float * pD = dst + (dc / F)*dstS; -#ifdef SIMD_MERGECONV_MERGE_INPUT_ROWS_1X1 - float * dst0 = pD + (yBeg&dstM)*dstW*F; - float * dst1 = pD + (yInt&dstM)*dstW*F; - size_t dn = nBeg; - if (dstC - dc > F) - { - for (; dn < nInt6; dn += 6, pS += 6 * srcC, dst0 += 6 * F) - InputConvolution1x1_2x6(pS, srcC, pW, _bias, _params, dst0, dst0 + dstS); - if (nIntTail) - tailInt_2(pS, srcC, pW, _bias, _params, dst0, dst0 + dstS), pS += nIntTail * srcC, dn += nIntTail; - for (; dn < nEnd6; dn += 6, pS += 6 * srcC, dst1 += 6 * F) - InputConvolution1x1_2x6(pS, srcC, pW, _bias, _params, dst1, dst1 + dstS); - if (nEndTail) - tailEnd_2(pS, srcC, pW, _bias, _params, dst1, dst1 + dstS), pS += nEndTail * srcC, dn += nEndTail; - } - else - { - InputConvolution1x1_1xM_Ptr tailInt_1 = GetInputConvolution1x1_1xM(nIntTail); - InputConvolution1x1_1xM_Ptr tailEnd_1 = GetInputConvolution1x1_1xM(nEndTail); - for (; dn < nInt6; dn += 6, pS += 6 * srcC, dst0 += 6 * F) - InputConvolution1x1_1x6(pS, srcC, pW, _bias, _params, dst0); - if (nIntTail) - tailInt_1(pS, srcC, pW, _bias, _params, dst0), pS += nIntTail * srcC, dn += nIntTail; - for (; dn < nEnd6; dn += 6, pS += 6 * srcC, dst1 += 6 * F) - InputConvolution1x1_1x6(pS, srcC, pW, _bias, _params, dst1); - if (nEndTail) - tailEnd_1(pS, srcC, pW, _bias, _params, dst1), pS += nEndTail * srcC, dn += nEndTail; - } -#else - for (size_t dy = yBeg; dy < yEnd; ++dy) - { - float * dst0 = pD + (dy&dstM)*dstW*F; - size_t dx = 0; - if (dstC - dc > F) - { - for (; dx < dstW6; dx += 6, pS += 6 * srcC, dst0 += 6 * F) - InputConvolution1x1_2x6(pS, srcC, pW, _bias, _params, dst0, dst0 + dstS); - if (wTail) - tailW_2(pS, srcC, pW, _bias, _params, dst0, dst0 + dstS), pS += wTail * srcC, dx += wTail; - } - else - { - for (; dx < dstW6; dx += 6, pS += 6 * srcC, dst0 += 6 * F) - InputConvolution1x1_1x6(pS, srcC, pW, _bias, _params, dst0); - if (wTail) - tailW_1(pS, srcC, pW, _bias, _params, dst0), pS += wTail * srcC, dx += wTail; - } - } -#endif - } - } - - template SIMD_INLINE void InputConvolution_2x1(const float * src0, const SimdConvolutionParameters & p, - size_t kH, size_t kW, const float * weight, const __m256 * bias, const __m256 * params, float * dst0, float * dst1) - { - __m256 d00, d01, s0, w0, w1; - d00 = bias[0]; - d01 = bias[1]; - size_t size = kW * p.srcC, tail = DF * (p.kernelX - kW)*p.srcC, stride = p.srcW * p.srcC; - for (size_t ky = 0; ky < kH; ++ky) - { - for (size_t i = 0; i < size; ++i) - { - w0 = _mm256_loadu_ps(weight + 0); - w1 = _mm256_loadu_ps(weight + F); - s0 = _mm256_set1_ps(src0[i]); - d00 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d00); - d01 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d01); - weight += DF; - } - weight += tail; - src0 += stride; - } - _mm256_storeu_ps(dst0, Activate(d00, params, 0)); - _mm256_storeu_ps(dst1, Activate(d01, params, 1)); - } - - template SIMD_INLINE void InputConvolution_1x1(const float * src0, const SimdConvolutionParameters & p, - size_t kH, size_t kW, const float * weight, const __m256 * bias, const __m256 * params, float * dst0) - { - __m256 d00, s0, w0; - d00 = bias[0]; - size_t size = kW * p.srcC, tail = DF * (p.kernelX - 
kW)*p.srcC, stride = p.srcW * p.srcC; - for (size_t ky = 0; ky < kH; ++ky) - { - for (size_t i = 0; i < size; ++i) - { - w0 = _mm256_loadu_ps(weight + 0); - s0 = _mm256_set1_ps(src0[i]); - d00 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d00); - weight += DF; - } - weight += tail; - src0 += stride; - } - _mm256_storeu_ps(dst0, Activate(d00, params, 0)); - } - - template SIMD_INLINE void InputConvolution_2x6(const float * src0, const SimdConvolutionParameters & p, - size_t kH, size_t kW, const float * weight, const __m256 * bias, const __m256 * params, float * dst0, float * dst1) - { - __m256 d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, s0, w0, w1; - d00 = bias[0], d01 = bias[1]; - d10 = bias[0], d11 = bias[1]; - d20 = bias[0], d21 = bias[1]; - d30 = bias[0], d31 = bias[1]; - d40 = bias[0], d41 = bias[1]; - d50 = bias[0], d51 = bias[1]; - size_t size = kW * p.srcC, tail = DF * (p.kernelX - kW)*p.srcC, stride = p.srcW * p.srcC, step = p.srcC*p.strideX; - const float * src1 = src0 + 1 * step; - const float * src2 = src0 + 2 * step; - const float * src3 = src0 + 3 * step; - const float * src4 = src0 + 4 * step; - const float * src5 = src0 + 5 * step; - for (size_t ky = 0; ky < kH; ++ky) - { - size_t offset = ky * stride; - for (size_t end = offset + size; offset < end; ++offset) - { - w0 = _mm256_loadu_ps(weight + 0); - w1 = _mm256_loadu_ps(weight + F); - s0 = _mm256_set1_ps(src0[offset]); - d00 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d00); - d01 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d01); - s0 = _mm256_set1_ps(src1[offset]); - d10 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d10); - d11 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d11); - s0 = _mm256_set1_ps(src2[offset]); - d20 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d20); - d21 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d21); - s0 = _mm256_set1_ps(src3[offset]); - d30 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d30); - d31 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d31); - s0 = _mm256_set1_ps(src4[offset]); - d40 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d40); - d41 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d41); - s0 = _mm256_set1_ps(src5[offset]); - d50 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d50); - d51 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d51); - weight += DF; - } - weight += tail; - } - _mm256_storeu_ps(dst0 + 0 * F, Activate(d00, params, 0)); - _mm256_storeu_ps(dst1 + 0 * F, Activate(d01, params, 1)); - _mm256_storeu_ps(dst0 + 1 * F, Activate(d10, params, 0)); - _mm256_storeu_ps(dst1 + 1 * F, Activate(d11, params, 1)); - _mm256_storeu_ps(dst0 + 2 * F, Activate(d20, params, 0)); - _mm256_storeu_ps(dst1 + 2 * F, Activate(d21, params, 1)); - _mm256_storeu_ps(dst0 + 3 * F, Activate(d30, params, 0)); - _mm256_storeu_ps(dst1 + 3 * F, Activate(d31, params, 1)); - _mm256_storeu_ps(dst0 + 4 * F, Activate(d40, params, 0)); - _mm256_storeu_ps(dst1 + 4 * F, Activate(d41, params, 1)); - _mm256_storeu_ps(dst0 + 5 * F, Activate(d50, params, 0)); - _mm256_storeu_ps(dst1 + 5 * F, Activate(d51, params, 1)); - } - - template SIMD_INLINE void InputConvolution_1x6(const float * src0, const SimdConvolutionParameters & p, - size_t kH, size_t kW, const float * weight, const __m256 * bias, const __m256 * params, float * dst0) - { - __m256 d00, d10, d20, d30, d40, d50, s0, w0; - d00 = bias[0]; - d10 = bias[0]; - d20 = bias[0]; - d30 = bias[0]; - d40 = bias[0]; - d50 = bias[0]; - size_t size = kW * p.srcC, tail = DF * (p.kernelX - kW)*p.srcC, stride = p.srcW * p.srcC, step = p.srcC*p.strideX; - const float * src1 = src0 + 1 * step; - const float * src2 = src0 + 2 * step; - const 
float * src3 = src0 + 3 * step; - const float * src4 = src0 + 4 * step; - const float * src5 = src0 + 5 * step; - for (size_t ky = 0; ky < kH; ++ky) - { - size_t offset = ky * stride; - for (size_t end = offset + size; offset < end; ++offset) - { - w0 = _mm256_loadu_ps(weight + 0); - s0 = _mm256_set1_ps(src0[offset]); - d00 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d00); - s0 = _mm256_set1_ps(src1[offset]); - d10 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d10); - s0 = _mm256_set1_ps(src2[offset]); - d20 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d20); - s0 = _mm256_set1_ps(src3[offset]); - d30 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d30); - s0 = _mm256_set1_ps(src4[offset]); - d40 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d40); - s0 = _mm256_set1_ps(src5[offset]); - d50 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d50); - weight += DF; - } - weight += tail; - } - _mm256_storeu_ps(dst0 + 0 * F, Activate(d00, params, 0)); - _mm256_storeu_ps(dst0 + 1 * F, Activate(d10, params, 0)); - _mm256_storeu_ps(dst0 + 2 * F, Activate(d20, params, 0)); - _mm256_storeu_ps(dst0 + 3 * F, Activate(d30, params, 0)); - _mm256_storeu_ps(dst0 + 4 * F, Activate(d40, params, 0)); - _mm256_storeu_ps(dst0 + 5 * F, Activate(d50, params, 0)); - } - - template void InputConvolution(const float * src, const SimdConvolutionParameters & p, - size_t dstC, size_t yBeg, size_t yEnd, const size_t bufH[2], const float * weight, const float * bias, const float * params, float * dst) - { - size_t srcH = p.srcH, srcW = p.srcW, srcC = p.srcC, dstW = p.dstW; - size_t kernelY = p.kernelY, kernelX = p.kernelX, strideY = p.strideY, strideX = p.strideX; - size_t dstM = (bufH[0] - 1), dstS = bufH[0] * dstW * F; - size_t dstCDF = AlignLo(dstC, DF); - if (dstC - F > dstCDF) - dstCDF += DF; - - size_t noseH = p.padY, noseW = p.padX; - size_t bodyH = p.srcH - p.kernelY + 1 + noseH, bodyW = p.srcW - p.kernelX + 1 + noseW; - size_t bodyW6 = AlignLoAny(bodyW - noseW, 6 * p.strideX) + noseW; - size_t tailH = bodyH + p.padH, tailW = bodyW + p.padW; - size_t wS = p.srcC*p.dstC; - size_t kY = p.kernelY - noseH, kX = p.kernelX - noseW, kH = bodyH + p.kernelY - 1, kW = bodyW + p.kernelX - 1; - - __m256 _params[2], _bias[2]; - _params[0] = _mm256_set1_ps(params[0]); - if (type == ::SimdConvolutionActivationRestrictRange || type == ::SimdConvolutionActivationHswish) - _params[1] = _mm256_set1_ps(params[1]); - - size_t dc = 0; - for (; dc < dstCDF; dc += DF) - { - _bias[0] = bias ? _mm256_loadu_ps(bias + dc + 0) : _mm256_setzero_ps(); - _bias[1] = bias ? 
_mm256_loadu_ps(bias + dc + F) : _mm256_setzero_ps(); - if (type == ::SimdConvolutionActivationPrelu) - { - _params[0] = _mm256_loadu_ps(params + dc + 0); - _params[1] = _mm256_loadu_ps(params + dc + F); - } - size_t dy = yBeg, sy = dy * strideY; - for (; sy < noseH && dy < yEnd; sy += strideY, dy++) - { - float * dst0 = dst + (dy&dstM)*dstW*F + (dc / F)*dstS, *dst1 = dst0 + dstS; - size_t sx = 0; - const float * s = src; - const float * w = weight + (noseH - sy) * kernelX * DF * srcC; - for (; sx < noseW; sx += strideX, dst0 += F, dst1 += F) - InputConvolution_2x1(s, p, kY + sy, kX + sx, w + (noseW - sx)*srcC*DF, _bias, _params, dst0, dst1); - for (; sx < bodyW6; sx += 6 * strideX, dst0 += 6 * F, dst1 += 6 * F) - InputConvolution_2x6(s + (sx - noseW) * srcC, p, kY + sy, kernelX, w, _bias, _params, dst0, dst1); - for (; sx < bodyW; sx += strideX, dst0 += F, dst1 += F) - InputConvolution_2x1(s + (sx - noseW) * srcC, p, kY + sy, kernelX, w, _bias, _params, dst0, dst1); - for (; sx < tailW; sx += strideX, dst0 += F, dst1 += F) - InputConvolution_2x1(s + (sx - noseW) * srcC, p, kY + sy, kW - sx, w, _bias, _params, dst0, dst1); - } - for (; sy < bodyH && dy < yEnd; sy += strideY, dy++) - { - float * dst0 = dst + (dy&dstM)*dstW*F + (dc / F)*dstS, *dst1 = dst0 + dstS; - size_t sx = 0; - const float * s = src + (sy - noseH)*srcW*srcC; - const float * w = weight; - for (; sx < noseW; sx += strideX, dst0 += F, dst1 += F) - InputConvolution_2x1(s, p, kernelY, kX + sx, w + (noseW - sx)*srcC*DF, _bias, _params, dst0, dst1); - for (; sx < bodyW6; sx += 6 * strideX, dst0 += 6 * F, dst1 += 6 * F) - InputConvolution_2x6(s + (sx - noseW) * srcC, p, kernelY, kernelX, w, _bias, _params, dst0, dst1); - for (; sx < bodyW; sx += strideX, dst0 += F, dst1 += F) - InputConvolution_2x1(s + (sx - noseW) * srcC, p, kernelY, kernelX, w, _bias, _params, dst0, dst1); - for (; sx < tailW; sx += strideX, dst0 += F, dst1 += F) - InputConvolution_2x1(s + (sx - noseW) * srcC, p, kernelY, kW - sx, w, _bias, _params, dst0, dst1); - } - for (; sy < tailH && dy < yEnd; sy += strideY, dy++) - { - float * dst0 = dst + (dy&dstM)*dstW*F + (dc / F)*dstS, *dst1 = dst0 + dstS; - size_t sx = 0; - const float * s = src + (sy - noseH)*srcW*srcC; - const float * w = weight; - for (; sx < noseW; sx += strideX, dst0 += F, dst1 += F) - InputConvolution_2x1(s, p, kH - sy, kX + sx, w + (noseW - sx)*srcC*DF, _bias, _params, dst0, dst1); - for (; sx < bodyW6; sx += 6 * strideX, dst0 += 6 * F, dst1 += 6 * F) - InputConvolution_2x6(s + (sx - noseW) * srcC, p, kH - sy, kernelX, w, _bias, _params, dst0, dst1); - for (; sx < bodyW; sx += strideX, dst0 += F, dst1 += F) - InputConvolution_2x1(s + (sx - noseW) * srcC, p, kH - sy, kernelX, w, _bias, _params, dst0, dst1); - for (; sx < tailW; sx += strideX, dst0 += F, dst1 += F) - InputConvolution_2x1(s + (sx - noseW) * srcC, p, kH - sy, kW - sx, w, _bias, _params, dst0, dst1); - } - weight += kernelY * kernelX*srcC*DF; - } - if (dc < dstC) - { - _bias[0] = bias ? 
_mm256_loadu_ps(bias + dc) : _mm256_setzero_ps(); - if (type == ::SimdConvolutionActivationPrelu) - _params[0] = _mm256_loadu_ps(params + dc); - size_t dy = yBeg, sy = dy * strideY; - for (; sy < noseH && dy < yEnd; sy += strideY, dy++) - { - float * dst0 = dst + (dy&dstM)*dstW*F + (dc / F)*dstS; - size_t sx = 0; - const float * s = src; - const float * w = weight + (noseH - sy) * kernelX * DF * srcC; - for (; sx < noseW; sx += strideX, dst0 += F) - InputConvolution_1x1(s, p, kY + sy, kX + sx, w + (noseW - sx)*srcC*DF, _bias, _params, dst0); - for (; sx < bodyW6; sx += 6 * strideX, dst0 += 6 * F) - InputConvolution_1x6(s + (sx - noseW) * srcC, p, kY + sy, kernelX, w, _bias, _params, dst0); - for (; sx < bodyW; sx += strideX, dst0 += F) - InputConvolution_1x1(s + (sx - noseW) * srcC, p, kY + sy, kernelX, w, _bias, _params, dst0); - for (; sx < tailW; sx += strideX, dst0 += F) - InputConvolution_1x1(s + (sx - noseW) * srcC, p, kY + sy, kW - sx, w, _bias, _params, dst0); - } - for (; sy < bodyH && dy < yEnd; sy += strideY, dy++) - { - float * dst0 = dst + (dy&dstM)*dstW*F + (dc / F)*dstS; - size_t sx = 0; - const float * s = src + (sy - noseH)*srcW*srcC; - const float * w = weight; - for (; sx < noseW; sx += strideX, dst0 += F) - InputConvolution_1x1(s, p, kernelY, kX + sx, w + (noseW - sx)*srcC*DF, _bias, _params, dst0); - for (; sx < bodyW6; sx += 6 * strideX, dst0 += 6 * F) - InputConvolution_1x6(s + (sx - noseW) * srcC, p, kernelY, kernelX, w, _bias, _params, dst0); - for (; sx < bodyW; sx += strideX, dst0 += F) - InputConvolution_1x1(s + (sx - noseW) * srcC, p, kernelY, kernelX, w, _bias, _params, dst0); - for (; sx < tailW; sx += strideX, dst0 += F) - InputConvolution_1x1(s + (sx - noseW) * srcC, p, kernelY, kW - sx, w, _bias, _params, dst0); - } - for (; sy < tailH && dy < yEnd; sy += strideY, dy++) - { - float * dst0 = dst + (dy&dstM)*dstW*F + (dc / F)*dstS; - size_t sx = 0; - const float * s = src + (sy - noseH)*srcW*srcC; - const float * w = weight; - for (; sx < noseW; sx += strideX, dst0 += F) - InputConvolution_1x1(s, p, kH - sy, kX + sx, w + (noseW - sx)*srcC*DF, _bias, _params, dst0); - for (; sx < bodyW6; sx += 6 * strideX, dst0 += 6 * F) - InputConvolution_1x6(s + (sx - noseW) * srcC, p, kH - sy, kernelX, w, _bias, _params, dst0); - for (; sx < bodyW; sx += strideX, dst0 += F) - InputConvolution_1x1(s + (sx - noseW) * srcC, p, kH - sy, kernelX, w, _bias, _params, dst0); - for (; sx < tailW; sx += strideX, dst0 += F) - InputConvolution_1x1(s + (sx - noseW) * srcC, p, kH - sy, kW - sx, w, _bias, _params, dst0); - } - } - } - - template void DepthwiseConvolution(const float* src, const SimdConvolutionParameters& p, - size_t srcC, size_t yBeg, size_t yEnd, const size_t bufH[2], const float* weight, const float* bias, const float* params, float* dst) - { - size_t strideY = p.strideY, strideX = p.strideX, padY = p.padY, padX = p.padX, padH = p.padH, padW = p.padW; - size_t srcW = p.srcW * F, dstW = p.dstW * F, weightS = p.kernelY * p.kernelX * F, strideXF = strideX * F; - size_t srcM = (bufH[0] - 1), dstM = (bufH[1] - 1), srcS = bufH[0] * srcW, dstS = bufH[1] * dstW; - size_t noseY = (p.padY + p.strideY - 1) / p.strideY; - size_t bodyY = (p.srcH + p.padY + p.strideY - p.kernelY) / p.strideY; - size_t noseX = (p.padX + p.strideX - 1) / p.strideX; - size_t bodyX = (p.srcW + p.padX + p.strideX - p.kernelX) / p.strideX; - size_t bodyX2 = AlignLo(bodyX - noseX, 2) + noseX; - size_t bodyX4 = AlignLo(bodyX - noseX, 4) + noseX; - size_t bodyX8 = AlignLo(bodyX - noseX, 8) + noseX; - - 
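// Editorial note (assumptions, not original source comments): bufH[0] and
// bufH[1] appear to be power-of-two row counts of circular row buffers, so
// srcM = bufH[0] - 1 and dstM = bufH[1] - 1 act as cheap modulo masks: row y
// lives at slot (y & srcM); e.g. with bufH[0] == 4, rows 0..5 map to slots
// 0,1,2,3,0,1. noseX/bodyX split each output row into a left border whose
// taps need the sx < p.srcW check (sx is an unsigned size_t, so a logically
// negative index wraps and fails the test), an interior unrolled 8/4/2/1
// output pixels at a time below, and a right border handled by the final
// per-tap loop; noseY/bodyY play the same role for rows.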
__m256 _params[2]; - _params[0] = _mm256_set1_ps(params[0]); - if (type == ::SimdConvolutionActivationRestrictRange || type == ::SimdConvolutionActivationHswish) - _params[1] = _mm256_set1_ps(params[1]); - for (size_t c = 0; c < srcC; c += F) - { - __m256 _bias = bias ? _mm256_loadu_ps(bias + c) : _mm256_setzero_ps(); - if (type == ::SimdConvolutionActivationPrelu) - _params[0] = _mm256_loadu_ps(params + c); - - for (size_t dy = yBeg; dy < yEnd; ++dy) - { - float* pd = dst + (dy & dstM) * dstW; - if (dy >= noseY && dy < bodyY) - { - size_t dx = 0; - for (; dx < noseX; ++dx, pd += F) - { - __m256 sum = _bias; - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t sy = dy * p.strideY + ky - padY; - for (size_t kx = 0; kx < p.kernelX; ++kx) - { - size_t sx = dx * p.strideX + kx - padX; - if (sx < p.srcW) - { - const float* pw = weight + (ky * p.kernelX + kx) * F; - const float* ps = src + ((sy & srcM) * p.srcW + sx) * F; - sum = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps), _mm256_loadu_ps(pw)), sum); - } - } - } - _mm256_storeu_ps(pd, Activate(sum, _params, 0)); - } - for (; dx < bodyX8; dx += 8, pd += 8 * F) - { - __m256 sum0 = _bias; - __m256 sum1 = _bias; - __m256 sum2 = _bias; - __m256 sum3 = _bias; - __m256 sum4 = _bias; - __m256 sum5 = _bias; - __m256 sum6 = _bias; - __m256 sum7 = _bias; - const float* pw = weight; - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t sy = dy * strideY + ky - padY; - const float* ps = src + ((sy & srcM) * p.srcW + dx * strideX - padX) * F; - for (size_t kx = 0; kx < p.kernelX; ++kx, ps += F, pw += F) - { - __m256 w0 = _mm256_loadu_ps(pw); - sum0 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps + 0 * strideXF), w0), sum0); - sum1 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps + 1 * strideXF), w0), sum1); - sum2 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps + 2 * strideXF), w0), sum2); - sum3 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps + 3 * strideXF), w0), sum3); - sum4 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps + 4 * strideXF), w0), sum4); - sum5 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps + 5 * strideXF), w0), sum5); - sum6 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps + 6 * strideXF), w0), sum6); - sum7 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps + 7 * strideXF), w0), sum7); - } - } - _mm256_storeu_ps(pd + 0 * F, Activate(sum0, _params, 0)); - _mm256_storeu_ps(pd + 1 * F, Activate(sum1, _params, 0)); - _mm256_storeu_ps(pd + 2 * F, Activate(sum2, _params, 0)); - _mm256_storeu_ps(pd + 3 * F, Activate(sum3, _params, 0)); - _mm256_storeu_ps(pd + 4 * F, Activate(sum4, _params, 0)); - _mm256_storeu_ps(pd + 5 * F, Activate(sum5, _params, 0)); - _mm256_storeu_ps(pd + 6 * F, Activate(sum6, _params, 0)); - _mm256_storeu_ps(pd + 7 * F, Activate(sum7, _params, 0)); - } - for (; dx < bodyX4; dx += 4, pd += 4 * F) - { - __m256 sum0 = _bias; - __m256 sum1 = _bias; - __m256 sum2 = _bias; - __m256 sum3 = _bias; - const float* pw = weight; - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t sy = dy * strideY + ky - padY; - const float* ps = src + ((sy & srcM) * p.srcW + dx * strideX - padX) * F; - for (size_t kx = 0; kx < p.kernelX; ++kx, ps += F, pw += F) - { - __m256 w0 = _mm256_loadu_ps(pw); - sum0 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps + 0 * strideXF), w0), sum0); - sum1 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps + 1 * strideXF), w0), sum1); - sum2 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps + 2 * strideXF), w0), sum2); - sum3 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps + 3 * strideXF), w0), 
sum3); - } - } - _mm256_storeu_ps(pd + 0 * F, Activate(sum0, _params, 0)); - _mm256_storeu_ps(pd + 1 * F, Activate(sum1, _params, 0)); - _mm256_storeu_ps(pd + 2 * F, Activate(sum2, _params, 0)); - _mm256_storeu_ps(pd + 3 * F, Activate(sum3, _params, 0)); - } - for (; dx < bodyX2; dx += 2, pd += 2 * F) - { - __m256 sum0 = _bias; - __m256 sum1 = _bias; - const float* pw = weight; - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t sy = dy * strideY + ky - padY; - const float* ps = src + ((sy & srcM) * p.srcW + dx * strideX - padX) * F; - for (size_t kx = 0; kx < p.kernelX; ++kx, ps += F, pw += F) - { - __m256 w0 = _mm256_loadu_ps(pw); - sum0 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps + 0 * strideXF), w0), sum0); - sum1 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps + 1 * strideXF), w0), sum1); - } - } - _mm256_storeu_ps(pd + 0 * F, Activate(sum0, _params, 0)); - _mm256_storeu_ps(pd + 1 * F, Activate(sum1, _params, 0)); - } - for (; dx < bodyX; ++dx, pd += F) - { - __m256 sum = _bias; - const float* pw = weight; - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t sy = dy * strideY + ky - padY; - const float* ps = src + ((sy & srcM) * p.srcW + dx * strideX - padX) * F; - for (size_t kx = 0; kx < p.kernelX; ++kx, ps += F, pw += F) - { - __m256 w0 = _mm256_loadu_ps(pw); - sum = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps), w0), sum); - } - } - _mm256_storeu_ps(pd, Activate(sum, _params, 0)); - } - for (; dx < p.dstW; ++dx, pd += F) - { - __m256 sum = _bias; - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t sy = dy * strideY + ky - padY; - for (size_t kx = 0; kx < p.kernelX; ++kx) - { - size_t sx = dx * strideX + kx - padX; - if (sx < p.srcW) - { - const float* pw = weight + (ky * p.kernelX + kx) * F; - const float* ps = src + ((sy & srcM) * p.srcW + sx) * F; - sum = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps), _mm256_loadu_ps(pw)), sum); - } - } - } - _mm256_storeu_ps(pd, Activate(sum, _params, 0)); - } - } - else - { - for (size_t dx = 0; dx < p.dstW; ++dx, pd += F) - { - __m256 sum = _bias; - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t sy = dy * strideY + ky - padY; - if (sy < p.srcH) - { - for (size_t kx = 0; kx < p.kernelX; ++kx) - { - size_t sx = dx * strideX + kx - padX; - if (sx < p.srcW) - { - const float* pw = weight + (ky * p.kernelX + kx) * F; - const float* ps = src + ((sy & srcM) * p.srcW + sx) * F; - sum = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps), _mm256_loadu_ps(pw)), sum); - } - } - } - } - _mm256_storeu_ps(pd, Activate(sum, _params, 0)); - } - } - } - src += srcS; - dst += dstS; - weight += weightS; - } - } - - template SIMD_INLINE void ConvolutionDepthwise3x3Edge2x2( - const float * src0, const float * src1, const __m256 * weight, const __m256 & bias, const __m256 * params, float * dst) - { - __m256 sum0 = bias, sum1 = _mm256_setzero_ps(); - sum0 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(src0 + 0 * F), weight[0]), sum0); - sum1 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(src0 + 1 * F), weight[1]), sum1); - sum0 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(src1 + 0 * F), weight[3]), sum0); - sum1 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(src1 + 1 * F), weight[4]), sum1); - _mm256_storeu_ps(dst, Activate(_mm256_add_ps(sum0, sum1), params, 0)); - } - - template SIMD_INLINE void ConvolutionDepthwise3x3Edge2x3( - const float * src0, const float * src1, const __m256 * weight, const __m256 & bias, const __m256 * params, float * dst) - { - __m256 sum0 = bias, sum1 = _mm256_setzero_ps(), sum2 = _mm256_setzero_ps(); - sum0 = 
_mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(src0 + 0 * F), weight[0]), sum0); - sum1 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(src0 + 1 * F), weight[1]), sum1); - sum2 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(src0 + 2 * F), weight[2]), sum2); - sum0 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(src1 + 0 * F), weight[3]), sum0); - sum1 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(src1 + 1 * F), weight[4]), sum1); - sum2 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(src1 + 2 * F), weight[5]), sum2); - _mm256_storeu_ps(dst, Activate(_mm256_add_ps(_mm256_add_ps(sum0, sum1), sum2), params, 0)); - } - - template SIMD_INLINE void ConvolutionDepthwise3x3Edge3x2( - const float * src0, const float * src1, const float * src2, const __m256 * weight, const __m256 & bias, const __m256 * params, float * dst) - { - __m256 sum0 = bias, sum1 = _mm256_setzero_ps(); - sum0 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(src0 + 0 * F), weight[0]), sum0); - sum1 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(src0 + 1 * F), weight[1]), sum1); - sum0 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(src1 + 0 * F), weight[3]), sum0); - sum1 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(src1 + 1 * F), weight[4]), sum1); - sum0 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(src2 + 0 * F), weight[6]), sum0); - sum1 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(src2 + 1 * F), weight[7]), sum1); - _mm256_storeu_ps(dst, Activate(_mm256_add_ps(sum0, sum1), params, 0)); - } - - template SIMD_INLINE void ConvolutionDepthwise3x3Main1x1( - const float * src0, const float * src1, const float * src2, const __m256 * weight, const __m256 & bias, const __m256 * params, float * dst) - { - __m256 sum0 = bias, sum1 = _mm256_setzero_ps(), sum2 = _mm256_setzero_ps(); - sum0 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(src0 + 0 * F), weight[0]), sum0); - sum1 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(src0 + 1 * F), weight[1]), sum1); - sum2 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(src0 + 2 * F), weight[2]), sum2); - sum0 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(src1 + 0 * F), weight[3]), sum0); - sum1 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(src1 + 1 * F), weight[4]), sum1); - sum2 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(src1 + 2 * F), weight[5]), sum2); - sum0 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(src2 + 0 * F), weight[6]), sum0); - sum1 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(src2 + 1 * F), weight[7]), sum1); - sum2 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(src2 + 2 * F), weight[8]), sum2); - _mm256_storeu_ps(dst, Activate(_mm256_add_ps(_mm256_add_ps(sum0, sum1), sum2), params, 0)); - } - - template void DepthwiseConvolution3x3(const float * src, const SimdConvolutionParameters & p, - size_t srcC, size_t yBeg, size_t yEnd, const size_t bufH[2], const float * weight, const float * bias, const float * params, float * dst) - { - size_t strideY = p.strideY, padY = p.padY, padX = p.padX, padH = p.padH, padW = p.padW; - size_t srcW = p.srcW * F, dstW = p.dstW * F, weightS = p.kernelY * p.kernelX * F; - size_t srcM = (bufH[0] - 1), dstM = (bufH[1] - 1), srcS = bufH[0] * srcW, dstS = bufH[1] * dstW; - size_t xStep = F * p.strideX, xStep0 = (p.strideX - p.padX)*F; - size_t xMainEnd = p.dstW - p.padW, yMainEnd = yEnd == p.dstH && p.padH ? 
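/* peel off the last output row when it overlaps the bottom padding */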
yEnd - 1 : yEnd; - - __m256 _params[2]; - _params[0] = _mm256_set1_ps(params[0]); - if (type == ::SimdConvolutionActivationRestrictRange || type == ::SimdConvolutionActivationHswish) - _params[1] = _mm256_set1_ps(params[1]); - for (size_t c = 0; c < srcC; c += F) - { - __m256 _weight[9]; - for (size_t i = 0; i < 9; ++i) - _weight[i] = _mm256_loadu_ps(weight + i * F); - __m256 _bias = bias ? _mm256_loadu_ps(bias + c) : _mm256_setzero_ps(); - if (type == ::SimdConvolutionActivationPrelu) - _params[0] = _mm256_loadu_ps(params + c); - - size_t dy = yBeg; - if (yBeg == 0 && padY) - { - size_t sy = 0, dx = 0; - const float * src0 = src + ((sy + 0)&srcM)*srcW; - const float * src1 = src + ((sy + 1)&srcM)*srcW; - float * pDst = dst + (dy&dstM)*dstW; - if (padX) - ConvolutionDepthwise3x3Edge2x2(src0, src1, _weight + 4, _bias, _params, pDst), pDst += F, dx++, src0 += xStep0, src1 += xStep0; - for (; dx < xMainEnd; dx++, pDst += F, src0 += xStep, src1 += xStep) - ConvolutionDepthwise3x3Edge2x3(src0, src1, _weight + 3, _bias, _params, pDst); - if (padW) - ConvolutionDepthwise3x3Edge2x2(src0, src1, _weight + 3, _bias, _params, pDst); - dy++; - } - for (; dy < yMainEnd; ++dy) - { - size_t sy = dy * strideY - padY, dx = 0; - const float * src0 = src + ((sy + 0)&srcM)*srcW; - const float * src1 = src + ((sy + 1)&srcM)*srcW; - const float * src2 = src + ((sy + 2)&srcM)*srcW; - float * pDst = dst + (dy&dstM)*dstW; - if (padX) - ConvolutionDepthwise3x3Edge3x2(src0, src1, src2, _weight + 1, _bias, _params, pDst), pDst += F, dx++, src0 += xStep0, src1 += xStep0, src2 += xStep0; - for (; dx < xMainEnd; dx++, pDst += F, src0 += xStep, src1 += xStep, src2 += xStep) - ConvolutionDepthwise3x3Main1x1(src0, src1, src2, _weight + 0, _bias, _params, pDst); - if (padW) - ConvolutionDepthwise3x3Edge3x2(src0, src1, src2, _weight + 0, _bias, _params, pDst); - } - if (dy < yEnd) - { - size_t sy = dy * strideY - padY, dx = 0; - const float * src0 = src + ((sy + 0)&srcM)*srcW; - const float * src1 = src + ((sy + 1)&srcM)*srcW; - float * pDst = dst + (dy&dstM)*dstW; - if (padX) - ConvolutionDepthwise3x3Edge2x2(src0, src1, _weight + 1, _bias, _params, pDst), pDst += F, dx++, src0 += xStep0, src1 += xStep0; - for (; dx < xMainEnd; dx++, pDst += F, src0 += xStep, src1 += xStep) - ConvolutionDepthwise3x3Edge2x3(src0, src1, _weight + 0, _bias, _params, pDst); - if (padW) - ConvolutionDepthwise3x3Edge2x2(src0, src1, _weight + 0, _bias, _params, pDst); - } - src += srcS; - dst += dstS; - weight += weightS; - } - } - - template void OutputConvolution_2x6(const float * src, size_t srcC, size_t srcS, - const float * weight, const __m256 * bias, const __m256 * params, float * dst, size_t dstC, size_t tail) - { - __m256 d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, s0, w0, w1; - if (tail > F) - { - d00 = _mm256_setzero_ps(), d01 = _mm256_setzero_ps(); - d10 = _mm256_setzero_ps(), d11 = _mm256_setzero_ps(); - d20 = _mm256_setzero_ps(), d21 = _mm256_setzero_ps(); - d30 = _mm256_setzero_ps(), d31 = _mm256_setzero_ps(); - d40 = _mm256_setzero_ps(), d41 = _mm256_setzero_ps(); - d50 = _mm256_setzero_ps(), d51 = _mm256_setzero_ps(); - for (size_t c = 0; c < srcC; c += F) - { - size_t n = Simd::Min(F, srcC - c); - for (size_t i = 0; i < n; ++i, weight += DF) - { - w0 = _mm256_loadu_ps(weight + 0); - w1 = _mm256_loadu_ps(weight + F); - s0 = _mm256_set1_ps(src[i + 0 * F]); - d00 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d00); - d01 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d01); - s0 = _mm256_set1_ps(src[i + 1 * F]); - d10 = 
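/* OutputConvolution_2x6 is a register-blocked microkernel for the trailing 1x1
   convolution: six consecutive output pixels times two F-wide blocks of output
   channels give twelve live accumulators, which together with w0/w1 and the
   broadcast s0 nearly fills the sixteen ymm registers available in 64-bit mode.
   Each weight pair is loaded once per input channel, and every input value is
   broadcast once and reused for both output-channel blocks. */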
_mm256_add_ps(_mm256_mul_ps(s0, w0), d10); - d11 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d11); - s0 = _mm256_set1_ps(src[i + 2 * F]); - d20 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d20); - d21 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d21); - s0 = _mm256_set1_ps(src[i + 3 * F]); - d30 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d30); - d31 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d31); - s0 = _mm256_set1_ps(src[i + 4 * F]); - d40 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d40); - d41 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d41); - s0 = _mm256_set1_ps(src[i + 5 * F]); - d50 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d50); - d51 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d51); - } - src += srcS; - } - if (tail == DF) - { - Term::template Save(dst + 0, d00, bias, params); - Term::template Save(dst + F, d01, bias, params); - dst += dstC; - Term::template Save(dst + 0, d10, bias, params); - Term::template Save(dst + F, d11, bias, params); - dst += dstC; - Term::template Save(dst + 0, d20, bias, params); - Term::template Save(dst + F, d21, bias, params); - dst += dstC; - Term::template Save(dst + 0, d30, bias, params); - Term::template Save(dst + F, d31, bias, params); - dst += dstC; - Term::template Save(dst + 0, d40, bias, params); - Term::template Save(dst + F, d41, bias, params); - dst += dstC; - Term::template Save(dst + 0, d50, bias, params); - Term::template Save(dst + F, d51, bias, params); - } - else - { - tail -= F; - Term::template Save(dst + 0, d00, bias, params); - Term::template Save(dst + F, d01, bias, params, tail); - dst += dstC; - Term::template Save(dst + 0, d10, bias, params); - Term::template Save(dst + F, d11, bias, params, tail); - dst += dstC; - Term::template Save(dst + 0, d20, bias, params); - Term::template Save(dst + F, d21, bias, params, tail); - dst += dstC; - Term::template Save(dst + 0, d30, bias, params); - Term::template Save(dst + F, d31, bias, params, tail); - dst += dstC; - Term::template Save(dst + 0, d40, bias, params); - Term::template Save(dst + F, d41, bias, params, tail); - dst += dstC; - Term::template Save(dst + 0, d50, bias, params); - Term::template Save(dst + F, d51, bias, params, tail); - } - } - else - { - d00 = _mm256_setzero_ps(); - d10 = _mm256_setzero_ps(); - d20 = _mm256_setzero_ps(); - d30 = _mm256_setzero_ps(); - d40 = _mm256_setzero_ps(); - d50 = _mm256_setzero_ps(); - for (size_t c = 0; c < srcC; c += F) - { - size_t n = Simd::Min(F, srcC - c); - for (size_t i = 0; i < n; ++i, weight += DF) - { - w0 = _mm256_loadu_ps(weight + 0); - s0 = _mm256_set1_ps(src[i + 0 * F]); - d00 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d00); - s0 = _mm256_set1_ps(src[i + 1 * F]); - d10 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d10); - s0 = _mm256_set1_ps(src[i + 2 * F]); - d20 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d20); - s0 = _mm256_set1_ps(src[i + 3 * F]); - d30 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d30); - s0 = _mm256_set1_ps(src[i + 4 * F]); - d40 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d40); - s0 = _mm256_set1_ps(src[i + 5 * F]); - d50 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d50); - } - src += srcS; - } - if (tail == F) - { - Term::template Save(dst + 0, d00, bias, params); - dst += dstC; - Term::template Save(dst + 0, d10, bias, params); - dst += dstC; - Term::template Save(dst + 0, d20, bias, params); - dst += dstC; - Term::template Save(dst + 0, d30, bias, params); - dst += dstC; - Term::template Save(dst + 0, d40, bias, params); - dst += dstC; - Term::template Save(dst + 0, d50, bias, params); - } - else - { - Term::template Save(dst + 0, d00, bias, params, tail); - dst += dstC; - 
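/* Term::Save (its template arguments select the term kind and the activation)
   presumably adds the bias, applies the activation and stores the result; the
   overload taking a trailing 'tail' argument writes only the first 'tail'
   lanes, which is how the dstC % F channel remainder is stored without
   running past the end of the output row. */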
Term::template Save(dst + 0, d10, bias, params, tail); - dst += dstC; - Term::template Save(dst + 0, d20, bias, params, tail); - dst += dstC; - Term::template Save(dst + 0, d30, bias, params, tail); - dst += dstC; - Term::template Save(dst + 0, d40, bias, params, tail); - dst += dstC; - Term::template Save(dst + 0, d50, bias, params, tail); - } - } - } - - template void OutputConvolution_2x4(const float * src, size_t srcC, size_t srcS, - const float * weight, const __m256 * bias, const __m256 * params, float * dst, size_t dstC, size_t tail) - { - __m256 d00, d01, d10, d11, d20, d21, d30, d31, s0, w0, w1; - if (tail > F) - { - d00 = _mm256_setzero_ps(), d01 = _mm256_setzero_ps(); - d10 = _mm256_setzero_ps(), d11 = _mm256_setzero_ps(); - d20 = _mm256_setzero_ps(), d21 = _mm256_setzero_ps(); - d30 = _mm256_setzero_ps(), d31 = _mm256_setzero_ps(); - for (size_t c = 0; c < srcC; c += F) - { - size_t n = Simd::Min(F, srcC - c); - for (size_t i = 0; i < n; ++i, weight += DF) - { - w0 = _mm256_loadu_ps(weight + 0); - w1 = _mm256_loadu_ps(weight + F); - s0 = _mm256_set1_ps(src[i + 0 * F]); - d00 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d00); - d01 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d01); - s0 = _mm256_set1_ps(src[i + 1 * F]); - d10 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d10); - d11 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d11); - s0 = _mm256_set1_ps(src[i + 2 * F]); - d20 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d20); - d21 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d21); - s0 = _mm256_set1_ps(src[i + 3 * F]); - d30 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d30); - d31 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d31); - } - src += srcS; - } - if (tail == DF) - { - Term::template Save(dst + 0, d00, bias, params); - Term::template Save(dst + F, d01, bias, params); - dst += dstC; - Term::template Save(dst + 0, d10, bias, params); - Term::template Save(dst + F, d11, bias, params); - dst += dstC; - Term::template Save(dst + 0, d20, bias, params); - Term::template Save(dst + F, d21, bias, params); - dst += dstC; - Term::template Save(dst + 0, d30, bias, params); - Term::template Save(dst + F, d31, bias, params); - } - else - { - tail -= F; - Term::template Save(dst + 0, d00, bias, params); - Term::template Save(dst + F, d01, bias, params, tail); - dst += dstC; - Term::template Save(dst + 0, d10, bias, params); - Term::template Save(dst + F, d11, bias, params, tail); - dst += dstC; - Term::template Save(dst + 0, d20, bias, params); - Term::template Save(dst + F, d21, bias, params, tail); - dst += dstC; - Term::template Save(dst + 0, d30, bias, params); - Term::template Save(dst + F, d31, bias, params, tail); - } - } - else - { - d00 = _mm256_setzero_ps(); - d10 = _mm256_setzero_ps(); - d20 = _mm256_setzero_ps(); - d30 = _mm256_setzero_ps(); - for (size_t c = 0; c < srcC; c += F) - { - size_t n = Simd::Min(F, srcC - c); - for (size_t i = 0; i < n; ++i, weight += DF) - { - w0 = _mm256_loadu_ps(weight + 0); - s0 = _mm256_set1_ps(src[i + 0 * F]); - d00 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d00); - s0 = _mm256_set1_ps(src[i + 1 * F]); - d10 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d10); - s0 = _mm256_set1_ps(src[i + 2 * F]); - d20 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d20); - s0 = _mm256_set1_ps(src[i + 3 * F]); - d30 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d30); - } - src += srcS; - } - if (tail == F) - { - Term::template Save(dst + 0, d00, bias, params); - dst += dstC; - Term::template Save(dst + 0, d10, bias, params); - dst += dstC; - Term::template Save(dst + 0, d20, bias, params); - dst += dstC; - Term::template 
Save(dst + 0, d30, bias, params); - } - else - { - Term::template Save(dst + 0, d00, bias, params, tail); - dst += dstC; - Term::template Save(dst + 0, d10, bias, params, tail); - dst += dstC; - Term::template Save(dst + 0, d20, bias, params, tail); - dst += dstC; - Term::template Save(dst + 0, d30, bias, params, tail); - } - } - } - - template void OutputConvolution_2x3(const float * src, size_t srcC, size_t srcS, - const float * weight, const __m256 * bias, const __m256 * params, float * dst, size_t dstC, size_t tail) - { - __m256 d00, d01, d10, d11, d20, d21, s0, w0, w1; - if (tail > F) - { - d00 = _mm256_setzero_ps(), d01 = _mm256_setzero_ps(); - d10 = _mm256_setzero_ps(), d11 = _mm256_setzero_ps(); - d20 = _mm256_setzero_ps(), d21 = _mm256_setzero_ps(); - for (size_t c = 0; c < srcC; c += F) - { - size_t n = Simd::Min(F, srcC - c); - for (size_t i = 0; i < n; ++i, weight += DF) - { - w0 = _mm256_loadu_ps(weight + 0); - w1 = _mm256_loadu_ps(weight + F); - s0 = _mm256_set1_ps(src[i + 0 * F]); - d00 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d00); - d01 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d01); - s0 = _mm256_set1_ps(src[i + 1 * F]); - d10 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d10); - d11 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d11); - s0 = _mm256_set1_ps(src[i + 2 * F]); - d20 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d20); - d21 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d21); - } - src += srcS; - } - if (tail == DF) - { - Term::template Save(dst + 0, d00, bias, params); - Term::template Save(dst + F, d01, bias, params); - dst += dstC; - Term::template Save(dst + 0, d10, bias, params); - Term::template Save(dst + F, d11, bias, params); - dst += dstC; - Term::template Save(dst + 0, d20, bias, params); - Term::template Save(dst + F, d21, bias, params); - } - else - { - tail -= F; - Term::template Save(dst + 0, d00, bias, params); - Term::template Save(dst + F, d01, bias, params, tail); - dst += dstC; - Term::template Save(dst + 0, d10, bias, params); - Term::template Save(dst + F, d11, bias, params, tail); - dst += dstC; - Term::template Save(dst + 0, d20, bias, params); - Term::template Save(dst + F, d21, bias, params, tail); - } - } - else - { - d00 = _mm256_setzero_ps(); - d10 = _mm256_setzero_ps(); - d20 = _mm256_setzero_ps(); - for (size_t c = 0; c < srcC; c += F) - { - size_t n = Simd::Min(F, srcC - c); - for (size_t i = 0; i < n; ++i, weight += DF) - { - w0 = _mm256_loadu_ps(weight + 0); - s0 = _mm256_set1_ps(src[i + 0 * F]); - d00 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d00); - s0 = _mm256_set1_ps(src[i + 1 * F]); - d10 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d10); - s0 = _mm256_set1_ps(src[i + 2 * F]); - d20 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d20); - } - src += srcS; - } - if (tail == F) - { - Term::template Save(dst + 0, d00, bias, params); - dst += dstC; - Term::template Save(dst + 0, d10, bias, params); - dst += dstC; - Term::template Save(dst + 0, d20, bias, params); - } - else - { - Term::template Save(dst + 0, d00, bias, params, tail); - dst += dstC; - Term::template Save(dst + 0, d10, bias, params, tail); - dst += dstC; - Term::template Save(dst + 0, d20, bias, params, tail); - } - } - } - - template void OutputConvolution_2x1(const float * src, size_t srcC, size_t srcS, - const float * weight, const __m256 * bias, const __m256 * params, float * dst, size_t dstC, size_t tail) - { - __m256 d00, d01, s0, w0, w1; - if (tail > F) - { - d00 = _mm256_setzero_ps(), d01 = _mm256_setzero_ps(); - for (size_t c = 0; c < srcC; c += F) - { - size_t n = Simd::Min(F, srcC - c); - for (size_t i 
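/* OutputConvolution_2x4, _2x3 and _2x1 are narrower clones of the 2x6 kernel
   for the spatial remainder. The driver below tiles each output row as
   6+6+...+remainder: a remainder of exactly 4 goes to the 2x4 kernel, anything
   else is covered by 2x3 tiles plus single columns, so every pixel is written
   by the widest kernel that fits. */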
= 0; i < n; ++i, weight += DF) - { - w0 = _mm256_loadu_ps(weight + 0); - w1 = _mm256_loadu_ps(weight + F); - s0 = _mm256_set1_ps(src[i + 0 * F]); - d00 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d00); - d01 = _mm256_add_ps(_mm256_mul_ps(s0, w1), d01); - } - src += srcS; - } - if (tail == DF) - { - Term::template Save(dst + 0, d00, bias, params); - Term::template Save(dst + F, d01, bias, params); - } - else - { - Term::template Save(dst + 0, d00, bias, params); - Term::template Save(dst + F, d01, bias, params, tail - F); - } - } - else - { - d00 = _mm256_setzero_ps(); - for (size_t c = 0; c < srcC; c += F) - { - size_t n = Simd::Min(F, srcC - c); - for (size_t i = 0; i < n; ++i, weight += DF) - { - w0 = _mm256_loadu_ps(weight + 0); - s0 = _mm256_set1_ps(src[i + 0 * F]); - d00 = _mm256_add_ps(_mm256_mul_ps(s0, w0), d00); - } - src += srcS; - } - if (tail == F) - Term::template Save(dst + 0, d00, bias, params); - else - Term::template Save(dst + 0, d00, bias, params, tail); - } - } - - template void OutputConvolution(const float * src, const SimdConvolutionParameters & p, - size_t srcC, size_t yBeg, size_t yEnd, const size_t bufH[2], const float * weight, const float * bias, const float * params, float * dst) - { - assert(p.group == 1 && p.kernelY == 1 && p.strideY == 1); - size_t srcH = p.srcH, srcW = p.srcW, dstW = p.dstW, dstC = p.dstC; - size_t srcM = (bufH[1] - 1), srcS = bufH[1] * srcW*F; - size_t dstW3 = AlignLoAny(dstW, 3), dstW6 = AlignLoAny(dstW, 6); - __m256 _params[2], _bias[2]; - _params[0] = _mm256_set1_ps(params[0]); - if (type == ::SimdConvolutionActivationRestrictRange || type == ::SimdConvolutionActivationHswish) - _params[1] = _mm256_set1_ps(params[1]); - - dst += yBeg * p.dstW * p.dstC; - size_t dc = 0; - for (; dc < dstC; dc += DF) - { - size_t tail = Simd::Min(DF, dstC - dc); - _bias[0] = _mm256_loadu_ps(bias + dc + 0); - _bias[1] = _mm256_loadu_ps(bias + dc + F); - if (type == ::SimdConvolutionActivationPrelu) - { - _params[0] = _mm256_loadu_ps(params + dc + 0); - _params[1] = _mm256_loadu_ps(params + dc + F); - } - float * pDst = dst + dc; - for (size_t y = yBeg; y < yEnd; ++y) - { - const float * pSrc = src + (y&srcM)*srcW*F; - size_t x = 0; - for (; x < dstW6; x += 6, pDst += 6 * dstC, pSrc += 6 * F) - OutputConvolution_2x6(pSrc, srcC, srcS, weight, _bias, _params, pDst, dstC, tail); - if (dstW - dstW6 == 4) - OutputConvolution_2x4(pSrc, srcC, srcS, weight, _bias, _params, pDst, dstC, tail), pDst += 4 * dstC; - else - { - for (; x < dstW3; x += 3, pDst += 3 * dstC, pSrc += 3 * F) - OutputConvolution_2x3(pSrc, srcC, srcS, weight, _bias, _params, pDst, dstC, tail); - for (; x < dstW; ++x, pDst += dstC, pSrc += F) - OutputConvolution_2x1(pSrc, srcC, srcS, weight, _bias, _params, pDst, dstC, tail); - } - } - weight += srcC * DF; - } - } - - template void SetConvolutionPtr(const MergConvParam32f & p, size_t index, SynetMergedConvolution32f::ConvolutionPtr convolution[3]) - { - switch (index) - { - case 0: - if (p.conv[0].kernelY == 1 && p.conv[0].strideY == 1) - convolution[0] = InputConvolution1x1; - else - convolution[0] = InputConvolution; - break; - case 1: - if (p.conv[1].kernelY == 3) - convolution[1] = DepthwiseConvolution3x3; - else - convolution[1] = DepthwiseConvolution; - break; - case 2: - if (p.add) - { - convolution[2] = OutputConvolution; - convolution[3] = OutputConvolution; - convolution[4] = OutputConvolution; - convolution[5] = OutputConvolution; - } - else - { - convolution[2] = OutputConvolution; - convolution[3] = OutputConvolution; - convolution[4] = 
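/* Four output slots (convolution[2..5]) are filled so the runner can pick
   between body and tail variants of the last stage; when p.add is set the
   selected kernels presumably accumulate into dst (the residual-add path of
   the merged convolution) instead of overwriting it. */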
OutputConvolution; - convolution[5] = OutputConvolution; - } - break; - default: - assert(0); - } - } - - SynetMergedConvolution32f::SynetMergedConvolution32f(const MergConvParam32f & p) - : Sse2::SynetMergedConvolution32f(p) - { - for (size_t i = 0; i < _param.count; ++i) - if (p.conv[i].activation == SimdConvolutionActivationElu) - return; - for (size_t i = 0; i < _param.count; ++i) - { - switch (p.conv[i].activation) - { - case SimdConvolutionActivationIdentity: SetConvolutionPtr(_param, i, _convolution); break; - case SimdConvolutionActivationRelu: SetConvolutionPtr(_param, i, _convolution); break; - case SimdConvolutionActivationLeakyRelu: SetConvolutionPtr(_param, i, _convolution); break; - case SimdConvolutionActivationRestrictRange: SetConvolutionPtr(_param, i, _convolution); break; - case SimdConvolutionActivationPrelu: SetConvolutionPtr(_param, i, _convolution); break; - case SimdConvolutionActivationHswish: SetConvolutionPtr(_param, i, _convolution); break; - default: assert(0); - } - } - SetSize(Base::AlgCacheL1(), Base::AlgCacheL2(), Base::AlgCacheL3(), Avx::F); - } - - //--------------------------------------------------------------------- - - void * SynetMergedConvolution32fInit(size_t batch, const SimdConvolutionParameters * convs, size_t count, SimdBool add) - { - for(size_t i = 0; i < count; ++i) - if (convs[i].activation == SimdConvolutionActivationElu) - return Sse2::SynetMergedConvolution32fInit(batch, convs, count, add); - MergConvParam32f param(batch, convs, count, add); - if (!param.Valid()) - return NULL; - if (param.conv[2].dstC < F) - return new Sse2::SynetMergedConvolution32f(param); - else - return new Avx::SynetMergedConvolution32f(param); - } - } - #endif//SIMD_AVX_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx1SynetPooling.cpp b/src/3rd/Simd/Simd/SimdAvx1SynetPooling.cpp deleted file mode 100644 index 56d50487..00000000 --- a/src/3rd/Simd/Simd/SimdAvx1SynetPooling.cpp +++ /dev/null @@ -1,366 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdExtract.h" -#include "Simd/SimdSynet.h" -#include "Simd/SimdBase.h" -#include "Simd/SimdSse1.h" -#include "Simd/SimdAvx1.h" - -namespace Simd -{ -#ifdef SIMD_AVX_ENABLE - namespace Avx - { - SIMD_INLINE void PoolingAverageNhwc1(const float* src, size_t srcS, size_t srcC, size_t kH, size_t kW, const __m256& norm, float* dst) - { - __m256 sum0 = _mm256_setzero_ps(); - for (size_t h = 0; h < kH; ++h) - { - for (size_t w = 0; w < kW; ++w) - { - sum0 = _mm256_add_ps(sum0, _mm256_loadu_ps(src + w * srcC + 0 * F)); - } - src += srcS; - } - _mm256_storeu_ps(dst + 0 * F, _mm256_mul_ps(sum0, norm)); - } - - SIMD_INLINE void PoolingAverageNhwc2(const float* src, size_t srcS, size_t srcC, size_t kH, size_t kW, const __m256& norm, float* dst) - { - __m256 sum0 = _mm256_setzero_ps(); - __m256 sum1 = _mm256_setzero_ps(); - for (size_t h = 0; h < kH; ++h) - { - for (size_t w = 0; w < kW; ++w) - { - sum0 = _mm256_add_ps(sum0, _mm256_loadu_ps(src + w * srcC + 0 * F)); - sum1 = _mm256_add_ps(sum1, _mm256_loadu_ps(src + w * srcC + 1 * F)); - } - src += srcS; - } - _mm256_storeu_ps(dst + 0 * F, _mm256_mul_ps(sum0, norm)); - _mm256_storeu_ps(dst + 1 * F, _mm256_mul_ps(sum1, norm)); - } - - SIMD_INLINE void PoolingAverageNhwc4(const float* src, size_t srcS, size_t srcC, size_t kH, size_t kW, const __m256& norm, float* dst) - { - __m256 sum0 = _mm256_setzero_ps(); - __m256 sum1 = _mm256_setzero_ps(); - __m256 sum2 = _mm256_setzero_ps(); - __m256 sum3 = _mm256_setzero_ps(); - for (size_t h = 0; h < kH; ++h) - { - for (size_t w = 0; w < kW; ++w) - { - sum0 = _mm256_add_ps(sum0, _mm256_loadu_ps(src + w * srcC + 0 * F)); - sum1 = _mm256_add_ps(sum1, _mm256_loadu_ps(src + w * srcC + 1 * F)); - sum2 = _mm256_add_ps(sum2, _mm256_loadu_ps(src + w * srcC + 2 * F)); - sum3 = _mm256_add_ps(sum3, _mm256_loadu_ps(src + w * srcC + 3 * F)); - } - src += srcS; - } - _mm256_storeu_ps(dst + 0 * F, _mm256_mul_ps(sum0, norm)); - _mm256_storeu_ps(dst + 1 * F, _mm256_mul_ps(sum1, norm)); - _mm256_storeu_ps(dst + 2 * F, _mm256_mul_ps(sum2, norm)); - _mm256_storeu_ps(dst + 3 * F, _mm256_mul_ps(sum3, norm)); - } - - SIMD_INLINE void PoolingAverageNhwc8(const float* src, size_t srcS, size_t srcC, size_t kH, size_t kW, const __m256& norm, float* dst) - { - __m256 sum0 = _mm256_setzero_ps(); - __m256 sum1 = _mm256_setzero_ps(); - __m256 sum2 = _mm256_setzero_ps(); - __m256 sum3 = _mm256_setzero_ps(); - __m256 sum4 = _mm256_setzero_ps(); - __m256 sum5 = _mm256_setzero_ps(); - __m256 sum6 = _mm256_setzero_ps(); - __m256 sum7 = _mm256_setzero_ps(); - for (size_t h = 0; h < kH; ++h) - { - for (size_t w = 0; w < kW; ++w) - { - sum0 = _mm256_add_ps(sum0, _mm256_loadu_ps(src + w * srcC + 0 * F)); - sum1 = _mm256_add_ps(sum1, _mm256_loadu_ps(src + w * srcC + 1 * F)); - sum2 = _mm256_add_ps(sum2, _mm256_loadu_ps(src + w * srcC + 2 * F)); - sum3 = _mm256_add_ps(sum3, _mm256_loadu_ps(src + w * srcC + 3 * F)); - sum4 = _mm256_add_ps(sum4, _mm256_loadu_ps(src + w * srcC + 4 * F)); - sum5 = _mm256_add_ps(sum5, _mm256_loadu_ps(src + w * srcC + 5 * F)); - sum6 = _mm256_add_ps(sum6, _mm256_loadu_ps(src + w * srcC + 6 * F)); - sum7 = _mm256_add_ps(sum7, _mm256_loadu_ps(src + w * srcC + 7 * F)); - } - src += srcS; - } - _mm256_storeu_ps(dst + 0 * F, _mm256_mul_ps(sum0, norm)); - _mm256_storeu_ps(dst + 1 * F, _mm256_mul_ps(sum1, norm)); - _mm256_storeu_ps(dst + 2 * F, _mm256_mul_ps(sum2, norm)); - _mm256_storeu_ps(dst + 3 * F, _mm256_mul_ps(sum3, norm)); - _mm256_storeu_ps(dst + 
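/* NHWC average pooling: for each output pixel the kH x kW window is summed
   per F-channel block and scaled by norm = 1/(kH*kW). The channel loop is
   unrolled 8/4/2/1 vectors wide, and a channel count that is not a multiple
   of F is finished by re-running the 1-vector kernel on the last F channels
   (src + srcC - F), recomputing a few overlapping values instead of masking. */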
4 * F, _mm256_mul_ps(sum4, norm)); - _mm256_storeu_ps(dst + 5 * F, _mm256_mul_ps(sum5, norm)); - _mm256_storeu_ps(dst + 6 * F, _mm256_mul_ps(sum6, norm)); - _mm256_storeu_ps(dst + 7 * F, _mm256_mul_ps(sum7, norm)); - } - - SIMD_INLINE void PoolingAverageNhwc(const float* src, size_t srcS, size_t srcC, size_t srcCF1, - size_t srcCF2, size_t srcCF4, size_t srcCF8, size_t kernelY, size_t kernelX, const __m256& norm, float* dst) - { - size_t c = 0; - for (; c < srcCF8; c += 8 * F) - PoolingAverageNhwc8(src + c, srcS, srcC, kernelY, kernelX, norm, dst + c); - for (; c < srcCF4; c += 4 * F) - PoolingAverageNhwc4(src + c, srcS, srcC, kernelY, kernelX, norm, dst + c); - for (; c < srcCF2; c += 2 * F) - PoolingAverageNhwc2(src + c, srcS, srcC, kernelY, kernelX, norm, dst + c); - for (; c < srcCF1; c += 1 * F) - PoolingAverageNhwc1(src + c, srcS, srcC, kernelY, kernelX, norm, dst + c); - if (c < srcC) - PoolingAverageNhwc1(src + srcC - F, srcS, srcC, kernelY, kernelX, norm, dst + srcC - F); - } - - void SynetPoolingForwardAverage(const float* src, size_t srcC, size_t srcH, size_t srcW, size_t kernelY, size_t kernelX, - size_t strideY, size_t strideX, size_t padY, size_t padX, float* dst, size_t dstH, size_t dstW, SimdBool excludePad, SimdTensorFormatType format) - { - if (format == SimdTensorFormatNhwc) - { - if (srcC >= F) - { - size_t srcS = srcW * srcC; - size_t srcCF1 = AlignLo(srcC, 1 * F); - size_t srcCF2 = AlignLo(srcC, 2 * F); - size_t srcCF4 = AlignLo(srcC, 4 * F); - size_t srcCF8 = AlignLo(srcC, 8 * F); - if (padX == 0 && padY == 0 && (dstW - 1) * strideX + kernelX == srcW && (dstH - 1) * strideY + kernelY == srcH) - { - size_t stepY = srcW * srcC * strideY, stepX = strideX * srcC; - __m256 norm = _mm256_set1_ps(1.0f / (kernelY * kernelX)); - for (size_t ph = 0; ph < dstH; ++ph) - { - const float* ps = src + ph * stepY; - for (size_t pw = 0; pw < dstW; ++pw, ps += stepX, dst += srcC) - PoolingAverageNhwc(ps, srcS, srcC, srcCF1, srcCF2, srcCF4, srcCF8, kernelY, kernelX, norm, dst); - } - } - else if (excludePad) - { - for (size_t ph = 0; ph < dstH; ++ph) - { - size_t hStart = ph * strideY - padY; - size_t hEnd = Simd::Min(hStart + kernelY, srcH); - hStart = Simd::Max(0, hStart); - size_t kH = hEnd - hStart; - for (size_t pw = 0; pw < dstW; ++pw) - { - size_t wStart = pw * strideX - padX; - size_t wEnd = Simd::Min(wStart + kernelX, srcW); - wStart = Simd::Max(0, wStart); - size_t kW = wEnd - wStart; - const float* ps = src + hStart * srcS + wStart * srcC; - __m256 norm = _mm256_set1_ps(1.0f / (kH * kW)); - PoolingAverageNhwc(ps, srcS, srcC, srcCF1, srcCF2, srcCF4, srcCF8, kH, kW, norm, dst); - dst += srcC; - } - } - } - else - { - __m256 norm = _mm256_set1_ps(1.0f / (kernelY * kernelX)); - for (size_t ph = 0; ph < dstH; ++ph) - { - size_t hStart = ph * strideY - padY; - size_t hEnd = Simd::Min(hStart + kernelY, srcH); - hStart = Simd::Max(0, hStart); - size_t kH = hEnd - hStart; - for (size_t pw = 0; pw < dstW; ++pw) - { - size_t wStart = pw * strideX - padX; - size_t wEnd = Simd::Min(wStart + kernelX, srcW); - wStart = Simd::Max(0, wStart); - size_t kW = wEnd - wStart; - const float* ps = src + hStart * srcS + wStart * srcC; - PoolingAverageNhwc(ps, srcS, srcC, srcCF1, srcCF2, srcCF4, srcCF8, kH, kW, norm, dst); - dst += srcC; - } - } - } - return; - } - } - else if (format == SimdTensorFormatNchw) - { - } - Sse::SynetPoolingForwardAverage(src, srcC, srcH, srcW, kernelY, kernelX, strideY, strideX, padY, padX, dst, dstH, dstW, excludePad, format); - } - - 
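/* Note on the window clipping above: hStart = ph * strideY - padY is computed
   in size_t and deliberately wraps for the first rows; hStart + kernelY wraps
   back into range, so hEnd is still correct, and the Simd::Max(0, hStart)
   clamp (presumably instantiated for a signed type such as ptrdiff_t) then
   restores hStart itself to 0. With excludePad the norm is recomputed per
   output pixel from the clipped kH * kW, so border averages ignore padding. */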
//--------------------------------------------------------------------- - - SIMD_INLINE void PoolingMaxHwc1(const float * src, size_t srcS, size_t srcC, size_t kH, size_t kW, const __m256 & min, float * dst) - { - __m256 max0 = min; - for (size_t h = 0; h < kH; ++h) - { - for (size_t w = 0; w < kW; ++w) - { - const float * ps = src + w * srcC; - max0 = _mm256_max_ps(max0, _mm256_loadu_ps(ps + 0 * F)); - } - src += srcS; - } - _mm256_storeu_ps(dst + 0 * F, max0); - } - - SIMD_INLINE void PoolingMaxHwc2(const float * src, size_t srcS, size_t srcC, size_t kH, size_t kW, const __m256 & min, float * dst) - { - __m256 max0 = min; - __m256 max1 = min; - for (size_t h = 0; h < kH; ++h) - { - for (size_t w = 0; w < kW; ++w) - { - const float* ps = src + w * srcC; - max0 = _mm256_max_ps(max0, _mm256_loadu_ps(ps + 0 * F)); - max1 = _mm256_max_ps(max1, _mm256_loadu_ps(ps + 1 * F)); - } - src += srcS; - } - _mm256_storeu_ps(dst + 0 * F, max0); - _mm256_storeu_ps(dst + 1 * F, max1); - } - - SIMD_INLINE void PoolingMaxHwc4(const float * src, size_t srcS, size_t srcC, size_t kH, size_t kW, const __m256 & min, float * dst) - { - __m256 max0 = min; - __m256 max1 = min; - __m256 max2 = min; - __m256 max3 = min; - for (size_t h = 0; h < kH; ++h) - { - for (size_t w = 0; w < kW; ++w) - { - const float* ps = src + w * srcC; - max0 = _mm256_max_ps(max0, _mm256_loadu_ps(ps + 0 * F)); - max1 = _mm256_max_ps(max1, _mm256_loadu_ps(ps + 1 * F)); - max2 = _mm256_max_ps(max2, _mm256_loadu_ps(ps + 2 * F)); - max3 = _mm256_max_ps(max3, _mm256_loadu_ps(ps + 3 * F)); - } - src += srcS; - } - _mm256_storeu_ps(dst + 0 * F, max0); - _mm256_storeu_ps(dst + 1 * F, max1); - _mm256_storeu_ps(dst + 2 * F, max2); - _mm256_storeu_ps(dst + 3 * F, max3); - } - - SIMD_INLINE void PoolingMaxHwc8(const float * src, size_t srcS, size_t srcC, size_t kH, size_t kW, const __m256 & min, float * dst) - { - __m256 max0 = min; - __m256 max1 = min; - __m256 max2 = min; - __m256 max3 = min; - __m256 max4 = min; - __m256 max5 = min; - __m256 max6 = min; - __m256 max7 = min; - for (size_t h = 0; h < kH; ++h) - { - for (size_t w = 0; w < kW; ++w) - { - const float* ps = src + w * srcC; - max0 = _mm256_max_ps(max0, _mm256_loadu_ps(ps + 0 * F)); - max1 = _mm256_max_ps(max1, _mm256_loadu_ps(ps + 1 * F)); - max2 = _mm256_max_ps(max2, _mm256_loadu_ps(ps + 2 * F)); - max3 = _mm256_max_ps(max3, _mm256_loadu_ps(ps + 3 * F)); - max4 = _mm256_max_ps(max4, _mm256_loadu_ps(ps + 4 * F)); - max5 = _mm256_max_ps(max5, _mm256_loadu_ps(ps + 5 * F)); - max6 = _mm256_max_ps(max6, _mm256_loadu_ps(ps + 6 * F)); - max7 = _mm256_max_ps(max7, _mm256_loadu_ps(ps + 7 * F)); - } - src += srcS; - } - _mm256_storeu_ps(dst + 0 * F, max0); - _mm256_storeu_ps(dst + 1 * F, max1); - _mm256_storeu_ps(dst + 2 * F, max2); - _mm256_storeu_ps(dst + 3 * F, max3); - _mm256_storeu_ps(dst + 4 * F, max4); - _mm256_storeu_ps(dst + 5 * F, max5); - _mm256_storeu_ps(dst + 6 * F, max6); - _mm256_storeu_ps(dst + 7 * F, max7); - } - - void SynetPoolingForwardMax32f(const float * src, size_t srcC, size_t srcH, size_t srcW, size_t kernelY, size_t kernelX, - size_t strideY, size_t strideX, size_t padY, size_t padX, float * dst, size_t dstH, size_t dstW, SimdTensorFormatType format) - { - if (format == SimdTensorFormatNhwc) - { - if (srcC >= F) - { - size_t srcS = srcW * srcC; - size_t srcCF1 = AlignLo(srcC, 1 * F); - size_t srcCF2 = AlignLo(srcC, 2 * F); - size_t srcCF4 = AlignLo(srcC, 4 * F); - size_t srcCF8 = AlignLo(srcC, 8 * F); - __m256 min = _mm256_set1_ps(-FLT_MAX); - for (size_t ph = 0; ph < 
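/* max pooling mirrors the average kernels: accumulators seeded with -FLT_MAX, the same 8/4/2/1 channel unrolling, and the same overlapping last-F-channels remainder */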
dstH; ++ph) - { - size_t hStart = ph * strideY - padY; - size_t hEnd = Simd::Min(hStart + kernelY, srcH); - hStart = Simd::Max(0, hStart); - for (size_t pw = 0; pw < dstW; ++pw) - { - size_t wStart = pw * strideX - padX; - size_t wEnd = Simd::Min(wStart + kernelX, srcW); - wStart = Simd::Max(0, wStart); - const float* ps = src + hStart * srcS + wStart * srcC; - size_t c = 0; - for (; c < srcCF8; c += 8 * F) - PoolingMaxHwc8(ps + c, srcS, srcC, hEnd - hStart, wEnd - wStart, min, dst + c); - for (; c < srcCF4; c += 4 * F) - PoolingMaxHwc4(ps + c, srcS, srcC, hEnd - hStart, wEnd - wStart, min, dst + c); - for (; c < srcCF2; c += 2 * F) - PoolingMaxHwc2(ps + c, srcS, srcC, hEnd - hStart, wEnd - wStart, min, dst + c); - for (; c < srcCF1; c += 1 * F) - PoolingMaxHwc1(ps + c, srcS, srcC, hEnd - hStart, wEnd - wStart, min, dst + c); - if (c < srcC) - PoolingMaxHwc1(ps + srcC - F, srcS, srcC, hEnd - hStart, wEnd - wStart, min, dst + srcC - F); - dst += srcC; - } - } - } - } - else if (format == SimdTensorFormatNchw) - { - if (strideY == 2 && strideX == 2 && kernelY == 2 && kernelX == 2 && padY == 0 && padX == 0 && dstW >= F) - { - for (size_t c = 0; c < srcC; ++c, src += srcH * srcW, dst += dstH * dstW) - Avx::NeuralPooling2x2Max2x2(src, srcW, srcW, srcH, dst, dstW); - return; - } - Sse::SynetPoolingForwardMax32f(src, srcC, srcH, srcW, kernelY, kernelX, strideY, strideX, padY, padX, dst, dstH, dstW, format); - } - else - assert(0); - } - } -#endif// SIMD_AVX_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx1Winograd.cpp b/src/3rd/Simd/Simd/SimdAvx1Winograd.cpp deleted file mode 100644 index d37c8e20..00000000 --- a/src/3rd/Simd/Simd/SimdAvx1Winograd.cpp +++ /dev/null @@ -1,2802 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdWinograd.h" -#include "Simd/SimdSse1.h" -#include "Simd/SimdSet.h" -#include "Simd/SimdBase.h" - -namespace Simd -{ -#ifdef SIMD_AVX_ENABLE - namespace Avx - { - SIMD_INLINE void WinogradKernel1x3Block1x4SetFilter(const __m256* t, float* dst, size_t stride) - { - const __m256 r4 = _mm256_set1_ps(1.0f / 4.0f); - const __m256 r6 = _mm256_set1_ps(1.0f / 6.0f); - const __m256 mr6 = _mm256_set1_ps(-1.0f / 6.0f); - const __m256 r12 = _mm256_set1_ps(1.0f / 12.0f); - const __m256 r24 = _mm256_set1_ps(1.0f / 24.0f); - _mm256_storeu_ps(dst + 0 * stride, _mm256_mul_ps(r4, t[0])); - __m256 t0 = _mm256_add_ps(t[0], t[2]); - _mm256_storeu_ps(dst + 1 * stride, _mm256_mul_ps(mr6, _mm256_add_ps(t0, t[1]))); - _mm256_storeu_ps(dst + 2 * stride, _mm256_mul_ps(mr6, _mm256_sub_ps(t0, t[1]))); - __m256 t1 = _mm256_add_ps(_mm256_mul_ps(r24, t[0]), _mm256_mul_ps(r6, t[2])); - __m256 t2 = _mm256_mul_ps(r12, t[1]); - _mm256_storeu_ps(dst + 3 * stride, _mm256_add_ps(t1, t2)); - _mm256_storeu_ps(dst + 4 * stride, _mm256_sub_ps(t1, t2)); - _mm256_storeu_ps(dst + 5 * stride, t[2]); - } - - SIMD_INLINE void WinogradKernel1x3Block1x4SetFilter8t(const float* src, float* dst, size_t stride) - { - __m256 s[3]; - s[0] = _mm256_loadu_ps(src + 0 * stride); - s[1] = _mm256_loadu_ps(src + 1 * stride); - s[2] = _mm256_loadu_ps(src + 2 * stride); - WinogradKernel1x3Block1x4SetFilter(s, dst + 0 * stride, stride); - } - - void WinogradKernel1x3Block1x4SetFilter(const float* src, size_t size, float* dst, SimdBool trans) - { - size_t sizeF = AlignLo(size, F), i = 0; - if (trans) - { - for (; i < sizeF; i += F) - WinogradKernel1x3Block1x4SetFilter8t(src + i, dst + i, size); - for (; i < size; i += 1) - Base::WinogradKernel1x3Block1x4SetFilter1t(src + i, dst + i, size); - } - else - { - Sse::WinogradKernel1x3Block1x4SetFilter(src, size, dst, trans); - } - } - - //----------------------------------------------------------------------- - - SIMD_INLINE void WinogradKernel1x3Block1x4SetInput6Store(const __m256 src[6], float* dst, size_t stride) - { - __m256 _2 = _mm256_set1_ps(2.0f); - __m256 _4 = _mm256_set1_ps(4.0f); - __m256 _5 = _mm256_set1_ps(5.0f); - _mm256_storeu_ps(dst + 0 * stride, _mm256_add_ps(_mm256_sub_ps(_mm256_mul_ps(_4, src[0]), _mm256_mul_ps(_5, src[2])), src[4])); - _mm256_storeu_ps(dst + 1 * stride, _mm256_sub_ps(_mm256_add_ps(src[3], src[4]), _mm256_mul_ps(_4, _mm256_add_ps(src[1], src[2])))); - _mm256_storeu_ps(dst + 2 * stride, _mm256_add_ps(_mm256_mul_ps(_4, _mm256_sub_ps(src[1], src[2])), _mm256_sub_ps(src[4], src[3]))); - _mm256_storeu_ps(dst + 3 * stride, _mm256_add_ps(_mm256_mul_ps(_2, _mm256_sub_ps(src[3], src[1])), _mm256_sub_ps(src[4], src[2]))); - _mm256_storeu_ps(dst + 4 * stride, _mm256_add_ps(_mm256_mul_ps(_2, _mm256_sub_ps(src[1], src[3])), _mm256_sub_ps(src[4], src[2]))); - _mm256_storeu_ps(dst + 5 * stride, _mm256_add_ps(_mm256_sub_ps(_mm256_mul_ps(_4, src[1]), _mm256_mul_ps(_5, src[3])), src[5])); - } - - SIMD_INLINE void WinogradKernel1x3Block1x4SetInput8t(const float* src, size_t srcC, __m256 dst[6]) - { - dst[0] = _mm256_loadu_ps(src + 0 * srcC); - dst[1] = _mm256_loadu_ps(src + 1 * srcC); - dst[2] = _mm256_loadu_ps(src + 2 * srcC); - dst[3] = _mm256_loadu_ps(src + 3 * srcC); - dst[4] = _mm256_loadu_ps(src + 4 * srcC); - dst[5] = _mm256_loadu_ps(src + 5 * srcC); - } - - SIMD_INLINE void WinogradKernel1x3Block1x4SetInput8t(const float* src, size_t srcC, float* dst, size_t dstStride) - { - size_t srcCF = AlignLo(srcC, 
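/* These coefficients are the standard Winograd F(4,3) transform matrices.
   With g the 3-tap filter, d a 6-value input tile and "." elementwise:
   output = At * ((G*g) . (Bt*d)), where
     G  = [  1/4    0     0  ;
            -1/6  -1/6  -1/6 ;
            -1/6   1/6  -1/6 ;
            1/24  1/12   1/6 ;
            1/24 -1/12   1/6 ;
              0     0     1  ]          (SetFilter)
     Bt = [ 4  0 -5  0  1  0 ;
            0 -4 -4  1  1  0 ;
            0  4 -4 -1  1  0 ;
            0 -2 -1  2  1  0 ;
            0  2 -1 -2  1  0 ;
            0  4  0 -5  0  1 ]          (SetInput6Store)
     At = [ 1  1  1  1  1  0 ;
            0  1 -1  2 -2  0 ;
            0  1  1  4  4  0 ;
            0  1 -1  8 -8  1 ]          (SetOutputLoad6)
   so each 6-wide input tile yields 4 output values per row. */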
F); - for (size_t c = 0; c < srcCF; c += F) - { - __m256 tmp[6]; - WinogradKernel1x3Block1x4SetInput8t(src + c, srcC, tmp); - WinogradKernel1x3Block1x4SetInput6Store(tmp, dst + c, dstStride); - } - if (srcCF < srcC) - { - __m256 tmp[6]; - WinogradKernel1x3Block1x4SetInput8t(src + srcC - F, srcC, tmp); - WinogradKernel1x3Block1x4SetInput6Store(tmp, dst + srcC - F, dstStride); - } - } - - SIMD_INLINE void WinogradKernel1x3Block1x4SetInput8t(const float* src, size_t srcC, size_t colB, size_t colE, __m256 dst[6]) - { - for (size_t col = 0; col < colB; ++col) - dst[col] = _mm256_setzero_ps(); - for (size_t col = colB; col < colE; ++col) - dst[col] = _mm256_loadu_ps(src + col * srcC); - for (size_t col = colE; col < 6; ++col) - dst[col] = _mm256_setzero_ps(); - } - - SIMD_INLINE void WinogradKernel1x3Block1x4SetInput8t(const float* src, size_t srcC, size_t colB, size_t colE, float* dst, size_t dstStride) - { - size_t srcCF = AlignLo(srcC, F); - for (size_t c = 0; c < srcCF; c += F) - { - __m256 tmp[6]; - WinogradKernel1x3Block1x4SetInput8t(src + c, srcC, colB, colE, tmp); - WinogradKernel1x3Block1x4SetInput6Store(tmp, dst + c, dstStride); - } - if (srcCF < srcC) - { - __m256 tmp[6]; - WinogradKernel1x3Block1x4SetInput8t(src + srcC - F, srcC, colB, colE, tmp); - WinogradKernel1x3Block1x4SetInput6Store(tmp, dst + srcC - F, dstStride); - } - } - - void WinogradKernel1x3Block1x4SetInput(const float* src, size_t srcChannels, size_t srcHeight, size_t srcWidth, - size_t padY, size_t padX, size_t padH, size_t padW, float* dst, size_t dstStride, SimdBool trans) - { - assert(padX == padW && padY == 0 && padH == 0 && (padX == 0 || padX == 1)); - if (trans ? (srcChannels < F) : (srcWidth < 12)) - { - Sse::WinogradKernel1x3Block1x4SetInput(src, srcChannels, srcHeight, srcWidth, padY, padX, padH, padW, dst, dstStride, trans); - return; - } - size_t dstH = srcHeight; - size_t dstW = padX ? srcWidth : srcWidth - 2; - size_t tileW = (dstW + 3) / 4; - size_t dstW4 = AlignLo(dstW, 4); - if (trans) - { - size_t noseW = Simd::Min(6, dstW + 1); - size_t startX = padX ? 4 : 0; - if (padX) - { - if (dstW == dstW4) - dstW4 -= 4; - src -= srcChannels; - } - size_t tailW = dstW - dstW4 + (padX ? 
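/* tailW counts the valid columns of the last tile: with padding the source was
   shifted left by one pixel (src -= srcChannels), the nose tile zero-fills
   column 0 via colB = 1, interior tiles advance four output columns at a time,
   and the final tile zero-fills columns >= tailW via colE */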
1 : 2); - for (size_t row = 0; row < dstH; row += 1) - { - size_t col = 0; - if (padX) - WinogradKernel1x3Block1x4SetInput8t(src, srcChannels, 1, noseW, dst, dstStride), dst += srcChannels; - for (col = startX; col < dstW4; col += 4) - WinogradKernel1x3Block1x4SetInput8t(src + col * srcChannels, srcChannels, dst, dstStride), dst += srcChannels; - if (col < dstW) - WinogradKernel1x3Block1x4SetInput8t(src + col * srcChannels, srcChannels, 0, tailW, dst, dstStride), dst += srcChannels; - src += srcWidth * srcChannels; - } - } - else - { - Base::WinogradKernel1x3Block1x4SetInput(src, srcChannels, srcHeight, srcWidth, padY, padX, padH, padW, dst, dstStride, trans); - } - } - - //----------------------------------------------------------------------- - - SIMD_INLINE void WinogradKernel1x3Block1x4SetOutputLoad6(const float* src, size_t stride, __m256 dst[4]) - { - __m256 s[6]; - s[0] = _mm256_loadu_ps(src + 0 * stride); - s[1] = _mm256_loadu_ps(src + 1 * stride); - s[2] = _mm256_loadu_ps(src + 2 * stride); - s[3] = _mm256_loadu_ps(src + 3 * stride); - s[4] = _mm256_loadu_ps(src + 4 * stride); - s[5] = _mm256_loadu_ps(src + 5 * stride); - __m256 _2 = _mm256_set1_ps(2.0f); - __m256 _4 = _mm256_set1_ps(4.0f); - __m256 _8 = _mm256_set1_ps(8.0f); - dst[0] = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(s[0], s[1]), _mm256_add_ps(s[2], s[3])), s[4]); - dst[1] = _mm256_add_ps(_mm256_sub_ps(s[1], s[2]), _mm256_mul_ps(_2, _mm256_sub_ps(s[3], s[4]))); - dst[2] = _mm256_add_ps(_mm256_add_ps(s[1], s[2]), _mm256_mul_ps(_4, _mm256_add_ps(s[3], s[4]))); - dst[3] = _mm256_add_ps(_mm256_add_ps(_mm256_sub_ps(s[1], s[2]), _mm256_mul_ps(_8, _mm256_sub_ps(s[3], s[4]))), s[5]); - } - - SIMD_INLINE void WinogradKernel1x3Block1x4SetOutputStore4(const __m256 src[4], float* dst, size_t dstC) - { - _mm256_storeu_ps(dst + 0 * dstC, src[0]); - _mm256_storeu_ps(dst + 1 * dstC, src[1]); - _mm256_storeu_ps(dst + 2 * dstC, src[2]); - _mm256_storeu_ps(dst + 3 * dstC, src[3]); - } - - SIMD_INLINE void WinogradKernel1x3Block1x4SetOutput8t(const float* src, size_t srcStride, float* dst, size_t dstC) - { - size_t dstCF = AlignLo(dstC, F); - for (size_t d = 0; d < dstCF; d += F) - { - __m256 tmp[4]; - WinogradKernel1x3Block1x4SetOutputLoad6(src + d, srcStride, tmp); - WinogradKernel1x3Block1x4SetOutputStore4(tmp, dst + d, dstC); - } - if (dstCF < dstC) - { - __m256 tmp[4]; - WinogradKernel1x3Block1x4SetOutputLoad6(src + dstC - F, srcStride, tmp); - WinogradKernel1x3Block1x4SetOutputStore4(tmp, dst + dstC - F, dstC); - } - } - - SIMD_INLINE void WinogradKernel1x3Block1x4SetOutputStore4(const __m256 src[4], float* dst, size_t dstC, size_t colE) - { - for (size_t col = 0; col < colE; ++col) - _mm256_storeu_ps(dst + col * dstC, src[col]); - } - - SIMD_INLINE void WinogradKernel1x3Block1x4SetOutput8t(const float* src, size_t srcStride, float* dst, size_t dstC, size_t colE) - { - size_t dstCF = AlignLo(dstC, F); - for (size_t d = 0; d < dstCF; d += F) - { - __m256 tmp[4]; - WinogradKernel1x3Block1x4SetOutputLoad6(src + d, srcStride, tmp); - WinogradKernel1x3Block1x4SetOutputStore4(tmp, dst + d, dstC, colE); - } - if (dstCF < dstC) - { - __m256 tmp[4]; - WinogradKernel1x3Block1x4SetOutputLoad6(src + dstC - F, srcStride, tmp); - WinogradKernel1x3Block1x4SetOutputStore4(tmp, dst + dstC - F, dstC, colE); - } - } - - void WinogradKernel1x3Block1x4SetOutput(const float* src, size_t srcStride, float* dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans) - { - if (trans ? 
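/* tiles with fewer than F channels (or narrow images in the non-trans layout)
   fall back to the SSE implementation, since the overlapping channel-remainder
   trick used above needs at least one full F-wide block */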
(dstChannels < F) : (dstWidth < 16)) - { - Sse::WinogradKernel1x3Block1x4SetOutput(src, srcStride, dst, dstChannels, dstHeight, dstWidth, trans); - return; - } - size_t tileW = (dstWidth + 3) / 4; - size_t dstW4 = AlignLo(dstWidth, 4); - if (trans) - { - for (size_t row = 0; row < dstHeight; row += 1) - { - size_t col; - for (col = 0; col < dstW4; col += 4) - WinogradKernel1x3Block1x4SetOutput8t(src, srcStride, dst + col * dstChannels, dstChannels), src += dstChannels; - if (col < dstWidth) - WinogradKernel1x3Block1x4SetOutput8t(src, srcStride, dst + col * dstChannels, dstChannels, dstWidth - col), src += dstChannels; - dst += dstWidth * dstChannels; - } - } - else - { - Base::WinogradKernel1x3Block1x4SetOutput(src, srcStride, dst, dstChannels, dstHeight, dstWidth, trans); - } - } - - //----------------------------------------------------------------------- - - SIMD_INLINE void WinogradKernel1x5Block1x4SetFilter(const __m256* t, float* dst, size_t stride) - { - const __m256 r36 = _mm256_set1_ps(1.0f / 36.0f); - const __m256 r48 = _mm256_set1_ps(1.0f / 48.0f); - const __m256 mr120 = _mm256_set1_ps(-1.0f / 120.0f); - const __m256 r720 = _mm256_set1_ps(1.0f / 720.0f); - const __m256 _2 = _mm256_set1_ps(2.0f); - const __m256 _3 = _mm256_set1_ps(3.0f); - const __m256 _4 = _mm256_set1_ps(4.0f); - const __m256 _9 = _mm256_set1_ps(9.0f); - _mm256_storeu_ps(dst + 0 * stride, _mm256_mul_ps(r36, t[0])); - __m256 a[2]; - a[0] = _mm256_add_ps(_mm256_add_ps(t[0], t[2]), t[4]); - a[1] = _mm256_add_ps(t[1], t[3]); - _mm256_storeu_ps(dst + 1 * stride, _mm256_mul_ps(r48, _mm256_add_ps(a[0], a[1]))); - _mm256_storeu_ps(dst + 2 * stride, _mm256_mul_ps(r48, _mm256_sub_ps(a[0], a[1]))); - a[0] = _mm256_add_ps(t[0], _mm256_mul_ps(_4, _mm256_add_ps(t[2], _mm256_mul_ps(_4, t[4])))); - a[1] = _mm256_mul_ps(_2, _mm256_add_ps(t[1], _mm256_mul_ps(_4, t[3]))); - _mm256_storeu_ps(dst + 3 * stride, _mm256_mul_ps(mr120, _mm256_add_ps(a[0], a[1]))); - _mm256_storeu_ps(dst + 4 * stride, _mm256_mul_ps(mr120, _mm256_sub_ps(a[0], a[1]))); - a[0] = _mm256_add_ps(t[0], _mm256_mul_ps(_9, _mm256_add_ps(t[2], _mm256_mul_ps(_9, t[4])))); - a[1] = _mm256_mul_ps(_3, _mm256_add_ps(t[1], _mm256_mul_ps(_9, t[3]))); - _mm256_storeu_ps(dst + 5 * stride, _mm256_mul_ps(r720, _mm256_add_ps(a[0], a[1]))); - _mm256_storeu_ps(dst + 6 * stride, _mm256_mul_ps(r720, _mm256_sub_ps(a[0], a[1]))); - _mm256_storeu_ps(dst + 7 * stride, t[4]); - } - - SIMD_INLINE void WinogradKernel1x5Block1x4SetFilter8t(const float* src, float* dst, size_t stride) - { - __m256 s[5]; - s[0] = _mm256_loadu_ps(src + 0 * stride); - s[1] = _mm256_loadu_ps(src + 1 * stride); - s[2] = _mm256_loadu_ps(src + 2 * stride); - s[3] = _mm256_loadu_ps(src + 3 * stride); - s[4] = _mm256_loadu_ps(src + 4 * stride); - WinogradKernel1x5Block1x4SetFilter(s, dst, stride); - } - - void WinogradKernel1x5Block1x4SetFilter(const float* src, size_t size, float* dst, SimdBool trans) - { - size_t sizeF = AlignLo(size, F), i = 0; - if (trans) - { - for (; i < sizeF; i += F) - WinogradKernel1x5Block1x4SetFilter8t(src + i, dst + i, size); - for (; i < size; i += 1) - Base::WinogradKernel1x5Block1x4SetFilter1t(src + i, dst + i, size); - } - else - { - Sse::WinogradKernel1x5Block1x4SetFilter(src, size, dst, trans); - } - } - - //----------------------------------------------------------------------- - - SIMD_INLINE void WinogradKernel1x5Block1x4SetInput8Store(const __m256 src[8], float* dst, size_t stride) - { - __m256 _2 = _mm256_set1_ps(2.0f); - __m256 _3 = _mm256_set1_ps(3.0f); - __m256 _4 = 
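/* A 1x5 kernel with 1x4 output blocks is Winograd F(4,5): the tile width is
   m + r - 1 = 4 + 5 - 1 = 8, hence the eight-point transforms in this section;
   the filter-transform scales 1/36, 1/48, -1/120 and 1/720 appear to come from
   the larger interpolation node set (0, +-1, +-2, +-3, infinity) this scheme
   requires. */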
_mm256_set1_ps(4.0f); - __m256 _5 = _mm256_set1_ps(5.0f); - __m256 _9 = _mm256_set1_ps(9.0f); - __m256 _10 = _mm256_set1_ps(10.0f); - __m256 _13 = _mm256_set1_ps(13.0f); - __m256 _14 = _mm256_set1_ps(14.0f); - __m256 _36 = _mm256_set1_ps(36.0f); - __m256 _49 = _mm256_set1_ps(49.0f); - _mm256_storeu_ps(dst + 0 * stride, _mm256_add_ps(_mm256_sub_ps(_mm256_mul_ps(_36, src[0]), _mm256_mul_ps(_49, src[2])), _mm256_sub_ps(_mm256_mul_ps(_14, src[4]), src[6]))); - __m256 a[2]; - a[0] = _mm256_add_ps(_mm256_sub_ps(_mm256_mul_ps(_36, src[2]), _mm256_mul_ps(_13, src[4])), src[6]); - a[1] = _mm256_add_ps(_mm256_sub_ps(_mm256_mul_ps(_36, src[1]), _mm256_mul_ps(_13, src[3])), src[5]); - _mm256_storeu_ps(dst + 1 * stride, _mm256_add_ps(a[0], a[1])); - _mm256_storeu_ps(dst + 2 * stride, _mm256_sub_ps(a[0], a[1])); - a[0] = _mm256_add_ps(_mm256_sub_ps(_mm256_mul_ps(_9, src[2]), _mm256_mul_ps(_10, src[4])), src[6]); - a[1] = _mm256_mul_ps(_2, _mm256_add_ps(_mm256_sub_ps(_mm256_mul_ps(_9, src[1]), _mm256_mul_ps(_10, src[3])), src[5])); - _mm256_storeu_ps(dst + 3 * stride, _mm256_add_ps(a[0], a[1])); - _mm256_storeu_ps(dst + 4 * stride, _mm256_sub_ps(a[0], a[1])); - a[0] = _mm256_add_ps(_mm256_sub_ps(_mm256_mul_ps(_4, src[2]), _mm256_mul_ps(_5, src[4])), src[6]); - a[1] = _mm256_mul_ps(_3, _mm256_add_ps(_mm256_sub_ps(_mm256_mul_ps(_4, src[1]), _mm256_mul_ps(_5, src[3])), src[5])); - _mm256_storeu_ps(dst + 5 * stride, _mm256_add_ps(a[0], a[1])); - _mm256_storeu_ps(dst + 6 * stride, _mm256_sub_ps(a[0], a[1])); - _mm256_storeu_ps(dst + 7 * stride, _mm256_add_ps(_mm256_sub_ps(_mm256_mul_ps(_49, src[3]), _mm256_mul_ps(_36, src[1])), _mm256_sub_ps(src[7], _mm256_mul_ps(_14, src[5])))); - } - - SIMD_INLINE void WinogradKernel1x5Block1x4SetInput8t(const float* src, size_t srcC, __m256 dst[8]) - { - dst[0] = _mm256_loadu_ps(src + 0 * srcC); - dst[1] = _mm256_loadu_ps(src + 1 * srcC); - dst[2] = _mm256_loadu_ps(src + 2 * srcC); - dst[3] = _mm256_loadu_ps(src + 3 * srcC); - dst[4] = _mm256_loadu_ps(src + 4 * srcC); - dst[5] = _mm256_loadu_ps(src + 5 * srcC); - dst[6] = _mm256_loadu_ps(src + 6 * srcC); - dst[7] = _mm256_loadu_ps(src + 7 * srcC); - } - - SIMD_INLINE void WinogradKernel1x5Block1x4SetInput8t(const float* src, size_t srcC, float* dst, size_t dstStride) - { - size_t srcCF = AlignLo(srcC, F); - for (size_t c = 0; c < srcCF; c += F) - { - __m256 tmp[8]; - WinogradKernel1x5Block1x4SetInput8t(src + c, srcC, tmp); - WinogradKernel1x5Block1x4SetInput8Store(tmp, dst + c, dstStride); - } - if (srcCF < srcC) - { - __m256 tmp[8]; - WinogradKernel1x5Block1x4SetInput8t(src + srcC - F, srcC, tmp); - WinogradKernel1x5Block1x4SetInput8Store(tmp, dst + srcC - F, dstStride); - } - } - - SIMD_INLINE void WinogradKernel1x5Block1x4SetInput8t(const float* src, size_t srcC, size_t colB, size_t colE, __m256 dst[8]) - { - for (size_t col = 0; col < colB; ++col) - dst[col] = _mm256_setzero_ps(); - for (size_t col = colB; col < colE; ++col) - dst[col] = _mm256_loadu_ps(src + col * srcC); - for (size_t col = colE; col < 8; ++col) - dst[col] = _mm256_setzero_ps(); - } - - SIMD_INLINE void WinogradKernel1x5Block1x4SetInput8t(const float* src, size_t srcC, size_t colB, size_t colE, float* dst, size_t dstStride) - { - size_t srcCF = AlignLo(srcC, F); - for (size_t c = 0; c < srcCF; c += F) - { - __m256 tmp[8]; - WinogradKernel1x5Block1x4SetInput8t(src + c, srcC, colB, colE, tmp); - WinogradKernel1x5Block1x4SetInput8Store(tmp, dst + c, dstStride); - } - if (srcCF < srcC) - { - __m256 tmp[8]; - WinogradKernel1x5Block1x4SetInput8t(src + srcC 
- F, srcC, colB, colE, tmp); - WinogradKernel1x5Block1x4SetInput8Store(tmp, dst + srcC - F, dstStride); - } - } - - void WinogradKernel1x5Block1x4SetInput(const float* src, size_t srcChannels, size_t srcHeight, size_t srcWidth, - size_t padY, size_t padX, size_t padH, size_t padW, float* dst, size_t dstStride, SimdBool trans) - { - assert(padX == padW && padY == 0 && padH == 0 && (padX == 0 || padX == 2)); - if (trans ? (srcChannels < F) : true) - { - Sse::WinogradKernel1x5Block1x4SetInput(src, srcChannels, srcHeight, srcWidth, padY, padX, padH, padW, dst, dstStride, trans); - return; - } - size_t dstH = srcHeight; - size_t dstW = padX ? srcWidth : srcWidth - 4; - size_t tileW = (dstW + 3) / 4; - size_t dstW4 = AlignLo(dstW, 4); - size_t noseW = Simd::Min(8, dstW + 2); - size_t startX = padX ? 4 : 0; - if (padX) - { - if (dstW == dstW4 || dstW == dstW4 + 1) - dstW4 -= 4; - src -= 2 * srcChannels; - } - size_t tailW = dstW - dstW4 + (padX ? 2 : 4); - for (size_t row = 0; row < dstH; row += 1) - { - size_t col = 0; - if (padX) - WinogradKernel1x5Block1x4SetInput8t(src, srcChannels, 2, noseW, dst, dstStride), dst += srcChannels; - for (col = startX; col < dstW4; col += 4) - WinogradKernel1x5Block1x4SetInput8t(src + col * srcChannels, srcChannels, dst, dstStride), dst += srcChannels; - for (size_t tail = tailW; col < dstW; col += 4, tail -= 4) - WinogradKernel1x5Block1x4SetInput8t(src + col * srcChannels, srcChannels, 0, tail, dst, dstStride), dst += srcChannels; - src += srcWidth * srcChannels; - } - } - - //----------------------------------------------------------------------- - - SIMD_INLINE void WinogradKernel1x5Block1x4SetOutputLoad8(const float* src, size_t stride, __m256 dst[4]) - { - const __m256 _2 = _mm256_set1_ps(2.0f); - const __m256 _3 = _mm256_set1_ps(3.0f); - const __m256 _4 = _mm256_set1_ps(4.0f); - const __m256 _9 = _mm256_set1_ps(9.0f); - __m256 s[8]; - s[0] = _mm256_loadu_ps(src + 1 * stride); - s[7] = _mm256_loadu_ps(src + 2 * stride); - s[1] = _mm256_add_ps(s[0], s[7]); - s[2] = _mm256_sub_ps(s[0], s[7]); - s[0] = _mm256_loadu_ps(src + 3 * stride); - s[7] = _mm256_loadu_ps(src + 4 * stride); - s[3] = _mm256_add_ps(s[0], s[7]); - s[4] = _mm256_mul_ps(_2, _mm256_sub_ps(s[0], s[7])); - s[0] = _mm256_loadu_ps(src + 5 * stride); - s[7] = _mm256_loadu_ps(src + 6 * stride); - s[5] = _mm256_add_ps(s[0], s[7]); - s[6] = _mm256_mul_ps(_3, _mm256_sub_ps(s[0], s[7])); - dst[0] = _mm256_add_ps(_mm256_loadu_ps(src + 0 * stride), _mm256_add_ps(_mm256_add_ps(s[1], s[3]), s[5])); - dst[1] = _mm256_add_ps(s[2], _mm256_add_ps(s[4], s[6])); - dst[2] = _mm256_add_ps(s[1], _mm256_add_ps(_mm256_mul_ps(_4, s[3]), _mm256_mul_ps(_9, s[5]))); - dst[3] = _mm256_add_ps(_mm256_loadu_ps(src + 7 * stride), _mm256_add_ps(_mm256_add_ps(s[2], _mm256_mul_ps(_4, s[4])), _mm256_mul_ps(_9, s[6]))); - } - - SIMD_INLINE void WinogradKernel1x5Block1x4SetOutputStore4(const __m256 src[4], float* dst, size_t dstC) - { - _mm256_storeu_ps(dst + 0 * dstC, src[0]); - _mm256_storeu_ps(dst + 1 * dstC, src[1]); - _mm256_storeu_ps(dst + 2 * dstC, src[2]); - _mm256_storeu_ps(dst + 3 * dstC, src[3]); - } - - SIMD_INLINE void WinogradKernel1x5Block1x4SetOutput4t(const float* src, size_t srcStride, float* dst, size_t dstC) - { - size_t dstCF = AlignLo(dstC, F); - for (size_t d = 0; d < dstCF; d += F) - { - __m256 tmp[4]; - WinogradKernel1x5Block1x4SetOutputLoad8(src + d, srcStride, tmp); - WinogradKernel1x5Block1x4SetOutputStore4(tmp, dst + d, dstC); - } - if (dstCF < dstC) - { - __m256 tmp[4]; - 
WinogradKernel1x5Block1x4SetOutputLoad8(src + dstC - F, srcStride, tmp); - WinogradKernel1x5Block1x4SetOutputStore4(tmp, dst + dstC - F, dstC); - } - } - - SIMD_INLINE void WinogradKernel1x5Block1x4SetOutputStore4(const __m256 src[4], float* dst, size_t dstC, size_t colE) - { - for (size_t col = 0; col < colE; ++col) - _mm256_storeu_ps(dst + col * dstC, src[col]); - } - - SIMD_INLINE void WinogradKernel1x5Block1x4SetOutput4t(const float* src, size_t srcStride, float* dst, size_t dstC, size_t colE) - { - size_t dstCF = AlignLo(dstC, F); - for (size_t d = 0; d < dstCF; d += F) - { - __m256 tmp[4]; - WinogradKernel1x5Block1x4SetOutputLoad8(src + d, srcStride, tmp); - WinogradKernel1x5Block1x4SetOutputStore4(tmp, dst + d, dstC, colE); - } - if (dstCF < dstC) - { - __m256 tmp[4]; - WinogradKernel1x5Block1x4SetOutputLoad8(src + dstC - F, srcStride, tmp); - WinogradKernel1x5Block1x4SetOutputStore4(tmp, dst + dstC - F, dstC, colE); - } - } - - void WinogradKernel1x5Block1x4SetOutput(const float* src, size_t srcStride, float* dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans) - { - if (trans ? (dstChannels < F) : true) - { - Sse::WinogradKernel1x5Block1x4SetOutput(src, srcStride, dst, dstChannels, dstHeight, dstWidth, trans); - return; - } - size_t tileW = (dstWidth + 3) / 4; - size_t dstW4 = AlignLo(dstWidth, 4); - for (size_t row = 0; row < dstHeight; row += 1) - { - size_t col; - for (col = 0; col < dstW4; col += 4) - WinogradKernel1x5Block1x4SetOutput4t(src, srcStride, dst + col * dstChannels, dstChannels), src += dstChannels; - if (col < dstWidth) - WinogradKernel1x5Block1x4SetOutput4t(src, srcStride, dst + col * dstChannels, dstChannels, dstWidth - col), src += dstChannels; - dst += dstWidth * dstChannels; - } - } - - //----------------------------------------------------------------------- - - SIMD_INLINE void WinogradKernel2x2Block2x2SetFilter(const __m256 src[4], float* dst, size_t stride) - { - _mm256_storeu_ps(dst + 0 * stride, src[0]); - _mm256_storeu_ps(dst + 1 * stride, _mm256_add_ps(src[0], src[1])); - _mm256_storeu_ps(dst + 2 * stride, src[1]); - - _mm256_storeu_ps(dst + 3 * stride, _mm256_add_ps(src[0], src[2])); - _mm256_storeu_ps(dst + 4 * stride, _mm256_add_ps(_mm256_add_ps(src[0], src[1]), _mm256_add_ps(src[2], src[3]))); - _mm256_storeu_ps(dst + 5 * stride, _mm256_add_ps(src[1], src[3])); - - _mm256_storeu_ps(dst + 6 * stride, src[2]); - _mm256_storeu_ps(dst + 7 * stride, _mm256_add_ps(src[2], src[3])); - _mm256_storeu_ps(dst + 8 * stride, src[3]); - } - - SIMD_INLINE void WinogradKernel2x2Block2x2SetFilter8t(const float* src, float* dst, size_t stride) - { - __m256 _src[4]; - _src[0] = _mm256_loadu_ps(src + 0 * stride); - _src[1] = _mm256_loadu_ps(src + 1 * stride); - _src[2] = _mm256_loadu_ps(src + 2 * stride); - _src[3] = _mm256_loadu_ps(src + 3 * stride); - WinogradKernel2x2Block2x2SetFilter(_src, dst, stride); - } - - void WinogradKernel2x2Block2x2SetFilter(const float* src, size_t size, float* dst, SimdBool trans) - { - size_t size8 = AlignLo(size, 8), i = 0; - if (trans) - { - for (; i < size8; i += 8) - WinogradKernel2x2Block2x2SetFilter8t(src + i, dst + i, size); - for (; i < size; i += 1) - Base::WinogradKernel2x2Block2x2SetFilter1t(src + i, dst + i, size); - } - else - { - Sse::WinogradKernel2x2Block2x2SetFilter(src, size, dst, trans); - } - } - - //----------------------------------------------------------------------- - - SIMD_INLINE void WinogradKernel2x2Block2x2SetInput8Store(const __m256* src, float* dst, size_t stride) - { - 
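// Input transform for Winograd F(2x2, 2x2): src[0..8] holds a 3x3 input tile
// (row-major), 8 channels per vector, and the nine stores below compute
// X = B^T * d * B with
//     B^T = | 1 -1  0 |
//           | 0  1  0 |
//           | 0 -1  1 |
// applied per channel lane; e.g. X(0,0) = d00 - d01 - d10 + d11, which is the
// first store, (src[0] - src[1]) + (src[4] - src[3]).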
_mm256_storeu_ps(dst + 0 * stride, _mm256_add_ps(_mm256_sub_ps(src[0], src[1]), _mm256_sub_ps(src[4], src[3]))); - _mm256_storeu_ps(dst + 1 * stride, _mm256_sub_ps(src[1], src[4])); - _mm256_storeu_ps(dst + 2 * stride, _mm256_add_ps(_mm256_sub_ps(src[2], src[1]), _mm256_sub_ps(src[4], src[5]))); - _mm256_storeu_ps(dst + 3 * stride, _mm256_sub_ps(src[3], src[4])); - _mm256_storeu_ps(dst + 4 * stride, src[4]); - _mm256_storeu_ps(dst + 5 * stride, _mm256_sub_ps(src[5], src[4])); - _mm256_storeu_ps(dst + 6 * stride, _mm256_add_ps(_mm256_sub_ps(src[4], src[3]), _mm256_sub_ps(src[6], src[7]))); - _mm256_storeu_ps(dst + 7 * stride, _mm256_sub_ps(src[7], src[4])); - _mm256_storeu_ps(dst + 8 * stride, _mm256_add_ps(_mm256_sub_ps(src[4], src[5]), _mm256_sub_ps(src[8], src[7]))); - } - - SIMD_INLINE void WinogradKernel2x2Block2x2SetInput8t(const float* src, size_t srcS, size_t srcC, __m256 dst[9]) - { - dst[0] = _mm256_loadu_ps(src + 0 * srcS + 0 * srcC); - dst[1] = _mm256_loadu_ps(src + 0 * srcS + 1 * srcC); - dst[2] = _mm256_loadu_ps(src + 0 * srcS + 2 * srcC); - dst[3] = _mm256_loadu_ps(src + 1 * srcS + 0 * srcC); - dst[4] = _mm256_loadu_ps(src + 1 * srcS + 1 * srcC); - dst[5] = _mm256_loadu_ps(src + 1 * srcS + 2 * srcC); - dst[6] = _mm256_loadu_ps(src + 2 * srcS + 0 * srcC); - dst[7] = _mm256_loadu_ps(src + 2 * srcS + 1 * srcC); - dst[8] = _mm256_loadu_ps(src + 2 * srcS + 2 * srcC); - } - - SIMD_INLINE void WinogradKernel2x2Block2x2SetInput8t(const float* src, size_t srcW, size_t srcC, float* dst, size_t dstStride) - { - size_t srcS = srcW * srcC; - size_t srcCF = AlignLo(srcC, F); - for (size_t c = 0; c < srcCF; c += F) - { - __m256 tmp[9]; - WinogradKernel2x2Block2x2SetInput8t(src + c, srcS, srcC, tmp); - WinogradKernel2x2Block2x2SetInput8Store(tmp, dst + c, dstStride); - } - if (srcCF < srcC) - { - __m256 tmp[9]; - WinogradKernel2x2Block2x2SetInput8t(src + srcC - F, srcS, srcC, tmp); - WinogradKernel2x2Block2x2SetInput8Store(tmp, dst + srcC - F, dstStride); - } - } - - SIMD_INLINE void WinogradKernel2x2Block2x2SetInput8t(const float* src, size_t srcS, size_t srcC, size_t rowB, size_t rowE, size_t colB, size_t colE, __m256 dst[9]) - { - for (size_t row = 0; row < rowB; ++row) - { - for (size_t col = 0; col < 3; ++col) - dst[col] = _mm256_setzero_ps(); - dst += 3; - } - for (size_t row = rowB; row < rowE; ++row) - { - for (size_t col = 0; col < colB; ++col) - dst[col] = _mm256_setzero_ps(); - for (size_t col = colB; col < colE; ++col) - dst[col] = _mm256_loadu_ps(src + row * srcS + col * srcC); - for (size_t col = colE; col < 3; ++col) - dst[col] = _mm256_setzero_ps(); - dst += 3; - } - for (size_t row = rowE; row < 3; ++row) - { - for (size_t col = 0; col < 3; ++col) - dst[col] = _mm256_setzero_ps(); - dst += 3; - } - } - - SIMD_INLINE void WinogradKernel2x2Block2x2SetInput8t(const float* src, size_t srcW, size_t srcC, size_t rowB, size_t rowE, size_t colB, size_t colE, float* dst, size_t dstStride) - { - size_t srcS = srcW * srcC; - size_t srcCF = AlignLo(srcC, F); - for (size_t c = 0; c < srcCF; c += F) - { - __m256 tmp[9]; - WinogradKernel2x2Block2x2SetInput8t(src + c, srcS, srcC, rowB, rowE, colB, colE, tmp); - WinogradKernel2x2Block2x2SetInput8Store(tmp, dst + c, dstStride); - } - if (srcCF < srcC) - { - __m256 tmp[9]; - WinogradKernel2x2Block2x2SetInput8t(src + srcC - F, srcS, srcC, rowB, rowE, colB, colE, tmp); - WinogradKernel2x2Block2x2SetInput8Store(tmp, dst + srcC - F, dstStride); - } - } - - void WinogradKernel2x2Block2x2SetInput(const float* src, size_t srcChannels, size_t 
srcHeight, size_t srcWidth, - size_t padY, size_t padX, size_t padH, size_t padW, float* dst, size_t dstStride, SimdBool trans) - { - assert(padY == padX && padW == padH && (padY + padH == 0 || padY + padH == 1)); - if (trans ? (srcChannels < F) : true) - { - Sse::WinogradKernel2x2Block2x2SetInput(src, srcChannels, srcHeight, srcWidth, padY, padX, padH, padW, dst, dstStride, trans); - return; - } - size_t dstH = srcHeight - 1 + padY + padH; - size_t dstW = srcWidth - 1 + padX + padW; - size_t dstH2 = AlignLo(dstH, 2); - size_t dstW2 = AlignLo(dstW, 2); - size_t noseW = Simd::Min(3, dstW + 1); - size_t noseH = Simd::Min(3, dstH + 1); - size_t startY = padY ? 2 : 0; - size_t startX = padX ? 2 : 0; - if (padY || padH) - { - if (dstH == dstH2) - dstH2 -= 2; - if (dstW == dstW2) - dstW2 -= 2; - if (padY) - src -= (srcWidth + 1) * (trans ? srcChannels : 1); - } - size_t tailW = dstW - dstW2 + (padW ? 0 : 1); - size_t tailH = dstH - dstH2 + (padH ? 0 : 1); - size_t row = 0, col = 0; - if (padY) - { - if (padX) - WinogradKernel2x2Block2x2SetInput8t(src, srcWidth, srcChannels, 1, noseH, 1, noseW, dst, dstStride), dst += srcChannels; - for (col = startX; col < dstW2; col += 2) - WinogradKernel2x2Block2x2SetInput8t(src + col * srcChannels, srcWidth, srcChannels, 1, noseH, 0, 3, dst, dstStride), dst += srcChannels; - if (col < dstW) - WinogradKernel2x2Block2x2SetInput8t(src + col * srcChannels, srcWidth, srcChannels, 1, noseH, 0, tailW, dst, dstStride), dst += srcChannels; - } - for (row = startY; row < dstH2; row += 2) - { - if (padX) - WinogradKernel2x2Block2x2SetInput8t(src + row * srcWidth * srcChannels, srcWidth, srcChannels, 0, 3, 1, noseW, dst, dstStride), dst += srcChannels; - for (col = startX; col < dstW2; col += 2) - WinogradKernel2x2Block2x2SetInput8t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, dst, dstStride), dst += srcChannels; - if (col < dstW) - WinogradKernel2x2Block2x2SetInput8t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, 0, 3, 0, tailW, dst, dstStride), dst += srcChannels; - } - if (row < dstH) - { - if (padX) - WinogradKernel2x2Block2x2SetInput8t(src + row * srcWidth * srcChannels, srcWidth, srcChannels, 0, tailH, 1, noseW, dst, dstStride), dst += srcChannels; - for (col = startX; col < dstW2; col += 2) - WinogradKernel2x2Block2x2SetInput8t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, 0, tailH, 0, 3, dst, dstStride), dst += srcChannels; - if (col < dstW) - WinogradKernel2x2Block2x2SetInput8t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, 0, tailH, 0, tailW, dst, dstStride), dst += srcChannels; - } - } - - //----------------------------------------------------------------------- - - SIMD_INLINE void WinogradKernel2x2Block2x2SetOutputLoad9(const float* src, size_t stride, __m256* dst) - { - __m256 s[9]; - s[0] = _mm256_loadu_ps(src + 0 * stride); - s[1] = _mm256_loadu_ps(src + 1 * stride); - s[2] = _mm256_loadu_ps(src + 2 * stride); - s[3] = _mm256_loadu_ps(src + 3 * stride); - s[4] = _mm256_loadu_ps(src + 4 * stride); - s[5] = _mm256_loadu_ps(src + 5 * stride); - s[6] = _mm256_loadu_ps(src + 6 * stride); - s[7] = _mm256_loadu_ps(src + 7 * stride); - s[8] = _mm256_loadu_ps(src + 8 * stride); - dst[0] = _mm256_add_ps(_mm256_add_ps(s[0], s[1]), _mm256_add_ps(s[3], s[4])); - dst[1] = _mm256_add_ps(_mm256_add_ps(s[1], s[2]), _mm256_add_ps(s[4], s[5])); - dst[2] = _mm256_add_ps(_mm256_add_ps(s[3], s[4]), _mm256_add_ps(s[6], s[7])); - dst[3] = _mm256_add_ps(_mm256_add_ps(s[4], s[5]), 
_mm256_add_ps(s[7], s[8])); - } - - SIMD_INLINE void WinogradKernel2x2Block2x2SetOutputStore8(const __m256 src[4], float* dst, size_t dstS, size_t dstC) - { - _mm256_storeu_ps(dst + 0 * dstS + 0 * dstC, src[0]); - _mm256_storeu_ps(dst + 0 * dstS + 1 * dstC, src[1]); - _mm256_storeu_ps(dst + 1 * dstS + 0 * dstC, src[2]); - _mm256_storeu_ps(dst + 1 * dstS + 1 * dstC, src[3]); - } - - SIMD_INLINE void WinogradKernel2x2Block2x2SetOutput8t(const float* src, size_t srcStride, float* dst, size_t dstW, size_t dstC) - { - size_t dstS = dstW * dstC, dstCF = AlignLo(dstC, F); - for (size_t d = 0; d < dstCF; d += F) - { - __m256 tmp[4]; - WinogradKernel2x2Block2x2SetOutputLoad9(src + d, srcStride, tmp); - WinogradKernel2x2Block2x2SetOutputStore8(tmp, dst + d, dstS, dstC); - } - if (dstCF < dstC) - { - __m256 tmp[4]; - WinogradKernel2x2Block2x2SetOutputLoad9(src + dstC - F, srcStride, tmp); - WinogradKernel2x2Block2x2SetOutputStore8(tmp, dst + dstC - F, dstS, dstC); - } - } - - SIMD_INLINE void WinogradKernel2x2Block2x2SetOutputStore8(const __m256 src[4], float* dst, size_t dstS, size_t dstC, size_t rowE, size_t colE) - { - for (size_t row = 0; row < rowE; ++row) - for (size_t col = 0; col < colE; ++col) - _mm256_storeu_ps(dst + row * dstS + col * dstC, src[row * 2 + col]); - } - - SIMD_INLINE void WinogradKernel2x2Block2x2SetOutput8t(const float* src, size_t srcStride, float* dst, size_t dstW, size_t dstC, size_t rowE, size_t colE) - { - size_t dstS = dstW * dstC, dstCF = AlignLo(dstC, F); - for (size_t d = 0; d < dstCF; d += F) - { - __m256 tmp[4]; - WinogradKernel2x2Block2x2SetOutputLoad9(src + d, srcStride, tmp); - WinogradKernel2x2Block2x2SetOutputStore8(tmp, dst + d, dstS, dstC, rowE, colE); - } - if (dstCF < dstC) - { - __m256 tmp[4]; - WinogradKernel2x2Block2x2SetOutputLoad9(src + dstC - F, srcStride, tmp); - WinogradKernel2x2Block2x2SetOutputStore8(tmp, dst + dstC - F, dstS, dstC, rowE, colE); - } - } - - void WinogradKernel2x2Block2x2SetOutput(const float* src, size_t srcStride, float* dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans) - { - if (trans ? 
(dstChannels < F) : true) - { - Sse::WinogradKernel2x2Block2x2SetOutput(src, srcStride, dst, dstChannels, dstHeight, dstWidth, trans); - return; - } - size_t tileH = (dstHeight + 1) / 2; - size_t tileW = (dstWidth + 1) / 2; - size_t dstH2 = AlignLo(dstHeight, 2); - size_t dstW2 = AlignLo(dstWidth, 2); - size_t row, col; - for (row = 0; row < dstH2; row += 2) - { - for (col = 0; col < dstW2; col += 2) - WinogradKernel2x2Block2x2SetOutput8t(src, srcStride, dst + (row * dstWidth + col) * dstChannels, dstWidth, dstChannels), src += dstChannels; - if (col < dstWidth) - WinogradKernel2x2Block2x2SetOutput8t(src, srcStride, dst + (row * dstWidth + col) * dstChannels, dstWidth, dstChannels, 2, dstWidth - col), src += dstChannels; - } - if (row < dstHeight) - { - for (col = 0; col < dstW2; col += 2) - WinogradKernel2x2Block2x2SetOutput8t(src, srcStride, dst + (row * dstWidth + col) * dstChannels, dstWidth, dstChannels, dstHeight - row, 2), src += dstChannels; - if (col < dstWidth) - WinogradKernel2x2Block2x2SetOutput8t(src, srcStride, dst + (row * dstWidth + col) * dstChannels, dstWidth, dstChannels, dstHeight - row, dstWidth - col), src += dstChannels; - } - } - - //----------------------------------------------------------------------- - - SIMD_INLINE void WinogradKernel2x2Block4x4SetFilterRow(const __m256* t, float* dst, size_t stride) - { - const __m256 r2 = _mm256_set1_ps(1.0f / 2.0f); - const __m256 r3 = _mm256_set1_ps(1.0f / 3.0f); - const __m256 r6 = _mm256_set1_ps(1.0f / 6.0f); - const __m256 mr2 = _mm256_set1_ps(-1.0f / 2.0f); - - _mm256_storeu_ps(dst + 0 * stride, _mm256_mul_ps(r2, t[0])); - _mm256_storeu_ps(dst + 1 * stride, _mm256_mul_ps(mr2, _mm256_add_ps(t[0], t[1]))); - _mm256_storeu_ps(dst + 2 * stride, _mm256_mul_ps(r6, _mm256_sub_ps(t[1], t[0]))); - _mm256_storeu_ps(dst + 3 * stride, _mm256_add_ps(_mm256_mul_ps(r6, t[0]), _mm256_mul_ps(r3, t[1]))); - _mm256_storeu_ps(dst + 4 * stride, t[1]); - } - - SIMD_INLINE void WinogradKernel2x2Block4x4SetFilter(const __m256 src[4], float* dst, size_t stride) - { - const __m256 r2 = _mm256_set1_ps(1.0f / 2.0f); - const __m256 r3 = _mm256_set1_ps(1.0f / 3.0f); - const __m256 r6 = _mm256_set1_ps(1.0f / 6.0f); - const __m256 mr2 = _mm256_set1_ps(-1.0f / 2.0f); - - __m256 t[2]; - t[0] = _mm256_mul_ps(r2, src[0]); - t[1] = _mm256_mul_ps(r2, src[1]); - WinogradKernel2x2Block4x4SetFilterRow(t, dst + 0 * stride, stride); - - t[0] = _mm256_mul_ps(mr2, _mm256_add_ps(src[0], src[2])); - t[1] = _mm256_mul_ps(mr2, _mm256_add_ps(src[1], src[3])); - WinogradKernel2x2Block4x4SetFilterRow(t, dst + 5 * stride, stride); - - t[0] = _mm256_mul_ps(r6, _mm256_sub_ps(src[2], src[0])); - t[1] = _mm256_mul_ps(r6, _mm256_sub_ps(src[3], src[1])); - WinogradKernel2x2Block4x4SetFilterRow(t, dst + 10 * stride, stride); - - t[0] = _mm256_add_ps(_mm256_mul_ps(r6, src[0]), _mm256_mul_ps(r3, src[2])); - t[1] = _mm256_add_ps(_mm256_mul_ps(r6, src[1]), _mm256_mul_ps(r3, src[3])); - WinogradKernel2x2Block4x4SetFilterRow(t, dst + 15 * stride, stride); - - t[0] = src[2]; - t[1] = src[3]; - WinogradKernel2x2Block4x4SetFilterRow(t, dst + 20 * stride, stride); - } - - SIMD_INLINE void WinogradKernel2x2Block4x4SetFilter8t(const float* src, float* dst, size_t stride) - { - __m256 _src[4]; - _src[0] = _mm256_loadu_ps(src + 0 * stride); - _src[1] = _mm256_loadu_ps(src + 1 * stride); - _src[2] = _mm256_loadu_ps(src + 2 * stride); - _src[3] = _mm256_loadu_ps(src + 3 * stride); - WinogradKernel2x2Block4x4SetFilter(_src, dst, stride); - } - - void WinogradKernel2x2Block4x4SetFilter(const 
float* src, size_t size, float* dst, SimdBool trans) - { - size_t sizeF = AlignLo(size, F), i = 0; - if (trans) - { - for (; i < sizeF; i += F) - WinogradKernel2x2Block4x4SetFilter8t(src + i, dst + i, size); - for (; i < size; i += 1) - Base::WinogradKernel2x2Block4x4SetFilter1t(src + i, dst + i, size); - } - else - { - Sse::WinogradKernel2x2Block4x4SetFilter(src, size, dst, trans); - } - } - - //----------------------------------------------------------------------- - - SIMD_INLINE void WinogradKernel2x2Block4x4SetInputStoreRow(const __m256 tmp[5], float* dst, size_t stride) - { - const __m256 _2 = _mm256_set1_ps(2.0f); - const __m256 _3 = _mm256_set1_ps(3.0f); - _mm256_storeu_ps(dst + 0 * stride, _mm256_add_ps(_mm256_sub_ps(_mm256_mul_ps(_2, tmp[0]), tmp[1]), _mm256_sub_ps(tmp[3], _mm256_mul_ps(_2, tmp[2])))); - _mm256_storeu_ps(dst + 1 * stride, _mm256_sub_ps(tmp[3], _mm256_add_ps(_mm256_mul_ps(_2, tmp[1]), tmp[2]))); - _mm256_storeu_ps(dst + 2 * stride, _mm256_add_ps(_mm256_sub_ps(_mm256_mul_ps(_2, tmp[1]), _mm256_mul_ps(_3, tmp[2])), tmp[3])); - _mm256_storeu_ps(dst + 3 * stride, _mm256_sub_ps(tmp[3], tmp[1])); - _mm256_storeu_ps(dst + 4 * stride, _mm256_add_ps(_mm256_sub_ps(_mm256_mul_ps(_2, tmp[1]), tmp[2]), _mm256_sub_ps(tmp[4], _mm256_mul_ps(_2, tmp[3])))); - } - - SIMD_INLINE void WinogradKernel2x2Block4x4SetInputStore(const __m256* src, float* dst, size_t stride) - { - const __m256 _2 = _mm256_set1_ps(2.0f); - const __m256 _3 = _mm256_set1_ps(3.0f); - __m256 tmp[5]; - tmp[0] = _mm256_add_ps(_mm256_sub_ps(_mm256_mul_ps(_2, src[0]), src[5]), _mm256_sub_ps(src[15], _mm256_mul_ps(_2, src[10]))); - tmp[1] = _mm256_add_ps(_mm256_sub_ps(_mm256_mul_ps(_2, src[1]), src[6]), _mm256_sub_ps(src[16], _mm256_mul_ps(_2, src[11]))); - tmp[2] = _mm256_add_ps(_mm256_sub_ps(_mm256_mul_ps(_2, src[2]), src[7]), _mm256_sub_ps(src[17], _mm256_mul_ps(_2, src[12]))); - tmp[3] = _mm256_add_ps(_mm256_sub_ps(_mm256_mul_ps(_2, src[3]), src[8]), _mm256_sub_ps(src[18], _mm256_mul_ps(_2, src[13]))); - tmp[4] = _mm256_add_ps(_mm256_sub_ps(_mm256_mul_ps(_2, src[4]), src[9]), _mm256_sub_ps(src[19], _mm256_mul_ps(_2, src[14]))); - WinogradKernel2x2Block4x4SetInputStoreRow(tmp, dst + 0 * stride, stride); - - tmp[0] = _mm256_sub_ps(src[15], _mm256_add_ps(_mm256_mul_ps(_2, src[5]), src[10])); - tmp[1] = _mm256_sub_ps(src[16], _mm256_add_ps(_mm256_mul_ps(_2, src[6]), src[11])); - tmp[2] = _mm256_sub_ps(src[17], _mm256_add_ps(_mm256_mul_ps(_2, src[7]), src[12])); - tmp[3] = _mm256_sub_ps(src[18], _mm256_add_ps(_mm256_mul_ps(_2, src[8]), src[13])); - tmp[4] = _mm256_sub_ps(src[19], _mm256_add_ps(_mm256_mul_ps(_2, src[9]), src[14])); - WinogradKernel2x2Block4x4SetInputStoreRow(tmp, dst + 5 * stride, stride); - - tmp[0] = _mm256_add_ps(_mm256_sub_ps(_mm256_mul_ps(_2, src[5]), _mm256_mul_ps(_3, src[10])), src[15]); - tmp[1] = _mm256_add_ps(_mm256_sub_ps(_mm256_mul_ps(_2, src[6]), _mm256_mul_ps(_3, src[11])), src[16]); - tmp[2] = _mm256_add_ps(_mm256_sub_ps(_mm256_mul_ps(_2, src[7]), _mm256_mul_ps(_3, src[12])), src[17]); - tmp[3] = _mm256_add_ps(_mm256_sub_ps(_mm256_mul_ps(_2, src[8]), _mm256_mul_ps(_3, src[13])), src[18]); - tmp[4] = _mm256_add_ps(_mm256_sub_ps(_mm256_mul_ps(_2, src[9]), _mm256_mul_ps(_3, src[14])), src[19]); - WinogradKernel2x2Block4x4SetInputStoreRow(tmp, dst + 10 * stride, stride); - - tmp[0] = _mm256_sub_ps(src[15], src[5]); - tmp[1] = _mm256_sub_ps(src[16], src[6]); - tmp[2] = _mm256_sub_ps(src[17], src[7]); - tmp[3] = _mm256_sub_ps(src[18], src[8]); - tmp[4] = _mm256_sub_ps(src[19], src[9]); - 
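// Editor's note: each tmp[] block in this function applies one row of the
// F(4x4, 2x2) input transform B^T down the 5x5 tile src[0..24] (row-major),
// and WinogradKernel2x2Block4x4SetInputStoreRow() then applies the same 5-tap
// pattern across the columns. Read off the arithmetic, the rows are
//     B^T = |  2 -1 -2  1  0 |
//           |  0 -2 -1  1  0 |
//           |  0  2 -3  1  0 |
//           |  0 -1  0  1  0 |
//           |  0  2 -1 -2  1 |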
WinogradKernel2x2Block4x4SetInputStoreRow(tmp, dst + 15 * stride, stride); - - tmp[0] = _mm256_add_ps(_mm256_sub_ps(_mm256_mul_ps(_2, src[5]), src[10]), _mm256_sub_ps(src[20], _mm256_mul_ps(_2, src[15]))); - tmp[1] = _mm256_add_ps(_mm256_sub_ps(_mm256_mul_ps(_2, src[6]), src[11]), _mm256_sub_ps(src[21], _mm256_mul_ps(_2, src[16]))); - tmp[2] = _mm256_add_ps(_mm256_sub_ps(_mm256_mul_ps(_2, src[7]), src[12]), _mm256_sub_ps(src[22], _mm256_mul_ps(_2, src[17]))); - tmp[3] = _mm256_add_ps(_mm256_sub_ps(_mm256_mul_ps(_2, src[8]), src[13]), _mm256_sub_ps(src[23], _mm256_mul_ps(_2, src[18]))); - tmp[4] = _mm256_add_ps(_mm256_sub_ps(_mm256_mul_ps(_2, src[9]), src[14]), _mm256_sub_ps(src[24], _mm256_mul_ps(_2, src[19]))); - WinogradKernel2x2Block4x4SetInputStoreRow(tmp, dst + 20 * stride, stride); - } - - SIMD_INLINE void WinogradKernel2x2Block4x4SetInput8t(const float* src, size_t srcS, size_t srcC, __m256 dst[25]) - { - dst[0] = _mm256_loadu_ps(src + 0 * srcS + 0 * srcC); - dst[1] = _mm256_loadu_ps(src + 0 * srcS + 1 * srcC); - dst[2] = _mm256_loadu_ps(src + 0 * srcS + 2 * srcC); - dst[3] = _mm256_loadu_ps(src + 0 * srcS + 3 * srcC); - dst[4] = _mm256_loadu_ps(src + 0 * srcS + 4 * srcC); - dst[5] = _mm256_loadu_ps(src + 1 * srcS + 0 * srcC); - dst[6] = _mm256_loadu_ps(src + 1 * srcS + 1 * srcC); - dst[7] = _mm256_loadu_ps(src + 1 * srcS + 2 * srcC); - dst[8] = _mm256_loadu_ps(src + 1 * srcS + 3 * srcC); - dst[9] = _mm256_loadu_ps(src + 1 * srcS + 4 * srcC); - dst[10] = _mm256_loadu_ps(src + 2 * srcS + 0 * srcC); - dst[11] = _mm256_loadu_ps(src + 2 * srcS + 1 * srcC); - dst[12] = _mm256_loadu_ps(src + 2 * srcS + 2 * srcC); - dst[13] = _mm256_loadu_ps(src + 2 * srcS + 3 * srcC); - dst[14] = _mm256_loadu_ps(src + 2 * srcS + 4 * srcC); - dst[15] = _mm256_loadu_ps(src + 3 * srcS + 0 * srcC); - dst[16] = _mm256_loadu_ps(src + 3 * srcS + 1 * srcC); - dst[17] = _mm256_loadu_ps(src + 3 * srcS + 2 * srcC); - dst[18] = _mm256_loadu_ps(src + 3 * srcS + 3 * srcC); - dst[19] = _mm256_loadu_ps(src + 3 * srcS + 4 * srcC); - dst[20] = _mm256_loadu_ps(src + 4 * srcS + 0 * srcC); - dst[21] = _mm256_loadu_ps(src + 4 * srcS + 1 * srcC); - dst[22] = _mm256_loadu_ps(src + 4 * srcS + 2 * srcC); - dst[23] = _mm256_loadu_ps(src + 4 * srcS + 3 * srcC); - dst[24] = _mm256_loadu_ps(src + 4 * srcS + 4 * srcC); - } - - SIMD_INLINE void WinogradKernel2x2Block4x4SetInput8t(const float* src, size_t srcW, size_t srcC, float* dst, size_t dstStride) - { - size_t srcS = srcW * srcC; - size_t srcCF = AlignLo(srcC, F); - for (size_t c = 0; c < srcCF; c += F) - { - __m256 tmp[25]; - WinogradKernel2x2Block4x4SetInput8t(src + c, srcS, srcC, tmp); - WinogradKernel2x2Block4x4SetInputStore(tmp, dst + c, dstStride); - } - if (srcCF < srcC) - { - __m256 tmp[25]; - WinogradKernel2x2Block4x4SetInput8t(src + srcC - F, srcS, srcC, tmp); - WinogradKernel2x2Block4x4SetInputStore(tmp, dst + srcC - F, dstStride); - } - } - - SIMD_INLINE void WinogradKernel2x2Block4x4SetInput8t(const float* src, size_t srcS, size_t srcC, size_t rowB, size_t rowE, size_t colB, size_t colE, __m256 dst[25]) - { - for (size_t row = 0; row < rowB; ++row) - { - for (size_t col = 0; col < 5; ++col) - dst[col] = _mm256_setzero_ps(); - dst += 5; - } - for (size_t row = rowB; row < rowE; ++row) - { - for (size_t col = 0; col < colB; ++col) - dst[col] = _mm256_setzero_ps(); - for (size_t col = colB; col < colE; ++col) - dst[col] = _mm256_loadu_ps(src + row * srcS + col * srcC); - for (size_t col = colE; col < 5; ++col) - dst[col] = _mm256_setzero_ps(); - dst += 5; - } - for 
(size_t row = rowE; row < 5; ++row) - { - for (size_t col = 0; col < 5; ++col) - dst[col] = _mm256_setzero_ps(); - dst += 5; - } - } - - SIMD_INLINE void WinogradKernel2x2Block4x4SetInput8t(const float* src, size_t srcW, size_t srcC, size_t rowB, size_t rowE, size_t colB, size_t colE, float* dst, size_t dstStride) - { - size_t srcS = srcW * srcC; - size_t srcCF = AlignLo(srcC, F); - for (size_t c = 0; c < srcCF; c += F) - { - __m256 tmp[25]; - WinogradKernel2x2Block4x4SetInput8t(src + c, srcS, srcC, rowB, rowE, colB, colE, tmp); - WinogradKernel2x2Block4x4SetInputStore(tmp, dst + c, dstStride); - } - if (srcCF < srcC) - { - __m256 tmp[25]; - WinogradKernel2x2Block4x4SetInput8t(src + srcC - F, srcS, srcC, rowB, rowE, colB, colE, tmp); - WinogradKernel2x2Block4x4SetInputStore(tmp, dst + srcC - F, dstStride); - } - } - - void WinogradKernel2x2Block4x4SetInput(const float* src, size_t srcChannels, size_t srcHeight, size_t srcWidth, - size_t padY, size_t padX, size_t padH, size_t padW, float* dst, size_t dstStride, SimdBool trans) - { - assert(padY == padX && padW == padH && (padY + padH == 0 || padY + padH == 1)); - if (trans ? (srcChannels < F) : true) - { - Base::WinogradKernel2x2Block4x4SetInput(src, srcChannels, srcHeight, srcWidth, padY, padX, padH, padW, dst, dstStride, trans); - return; - } - size_t dstH = srcHeight - 1 + padY + padH; - size_t dstW = srcWidth - 1 + padX + padW; - size_t dstH4 = AlignLo(dstH, 4); - size_t dstW4 = AlignLo(dstW, 4); - size_t noseW = Simd::Min(5, dstW + 1); - size_t noseH = Simd::Min(5, dstH + 1); - size_t startY = padY ? 4 : 0; - size_t startX = padX ? 4 : 0; - if (padY || padH) - { - if (dstH == dstH4) - dstH4 -= 4; - if (dstW == dstW4) - dstW4 -= 4; - if (padY) - src -= (srcWidth + 1) * (trans ? srcChannels : 1); - } - size_t tailW = dstW - dstW4 + (padW ? 0 : 1); - size_t tailH = dstH - dstH4 + (padH ? 
0 : 1); - size_t row = 0, col = 0; - if (padY) - { - if (padX) - WinogradKernel2x2Block4x4SetInput8t(src, srcWidth, srcChannels, 1, noseH, 1, noseW, dst, dstStride), dst += srcChannels; - for (col = startX; col < dstW4; col += 4) - WinogradKernel2x2Block4x4SetInput8t(src + col * srcChannels, srcWidth, srcChannels, 1, noseH, 0, 5, dst, dstStride), dst += srcChannels; - if (col < dstW) - WinogradKernel2x2Block4x4SetInput8t(src + col * srcChannels, srcWidth, srcChannels, 1, noseH, 0, tailW, dst, dstStride), dst += srcChannels; - } - for (row = startY; row < dstH4; row += 4) - { - if (padX) - WinogradKernel2x2Block4x4SetInput8t(src + row * srcWidth * srcChannels, srcWidth, srcChannels, 0, 5, 1, noseW, dst, dstStride), dst += srcChannels; - for (col = startX; col < dstW4; col += 4) - WinogradKernel2x2Block4x4SetInput8t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, dst, dstStride), dst += srcChannels; - if (col < dstW) - WinogradKernel2x2Block4x4SetInput8t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, 0, 5, 0, tailW, dst, dstStride), dst += srcChannels; - } - if (row < dstH) - { - if (padX) - WinogradKernel2x2Block4x4SetInput8t(src + row * srcWidth * srcChannels, srcWidth, srcChannels, 0, tailH, 1, noseW, dst, dstStride), dst += srcChannels; - for (col = startX; col < dstW4; col += 4) - WinogradKernel2x2Block4x4SetInput8t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, 0, tailH, 0, 5, dst, dstStride), dst += srcChannels; - if (col < dstW) - WinogradKernel2x2Block4x4SetInput8t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, 0, tailH, 0, tailW, dst, dstStride), dst += srcChannels; - } - } - - //----------------------------------------------------------------------- - - SIMD_INLINE void WinogradKernel2x2Block4x4SetOutputGetRow(const __m256* s, __m256* d) - { - const __m256 _2 = _mm256_set1_ps(2.0f); - const __m256 _4 = _mm256_set1_ps(4.0f); - const __m256 _8 = _mm256_set1_ps(8.0f); - d[0] = _mm256_add_ps(_mm256_add_ps(s[0], s[1]), _mm256_add_ps(s[2], s[3])); - d[1] = _mm256_add_ps(_mm256_sub_ps(s[1], s[2]), _mm256_mul_ps(_2, s[3])); - d[2] = _mm256_add_ps(_mm256_add_ps(s[1], s[2]), _mm256_mul_ps(_4, s[3])); - d[3] = _mm256_add_ps(_mm256_sub_ps(s[1], s[2]), _mm256_add_ps(_mm256_mul_ps(_8, s[3]), s[4])); - } - - SIMD_INLINE void WinogradKernel2x2Block4x4SetOutputLoad25(const float* src, size_t stride, __m256* dst) - { - __m256 s[25]; - s[0] = _mm256_loadu_ps(src + 0 * stride); - s[1] = _mm256_loadu_ps(src + 1 * stride); - s[2] = _mm256_loadu_ps(src + 2 * stride); - s[3] = _mm256_loadu_ps(src + 3 * stride); - s[4] = _mm256_loadu_ps(src + 4 * stride); - s[5] = _mm256_loadu_ps(src + 5 * stride); - s[6] = _mm256_loadu_ps(src + 6 * stride); - s[7] = _mm256_loadu_ps(src + 7 * stride); - s[8] = _mm256_loadu_ps(src + 8 * stride); - s[9] = _mm256_loadu_ps(src + 9 * stride); - s[10] = _mm256_loadu_ps(src + 10 * stride); - s[11] = _mm256_loadu_ps(src + 11 * stride); - s[12] = _mm256_loadu_ps(src + 12 * stride); - s[13] = _mm256_loadu_ps(src + 13 * stride); - s[14] = _mm256_loadu_ps(src + 14 * stride); - s[15] = _mm256_loadu_ps(src + 15 * stride); - s[16] = _mm256_loadu_ps(src + 16 * stride); - s[17] = _mm256_loadu_ps(src + 17 * stride); - s[18] = _mm256_loadu_ps(src + 18 * stride); - s[19] = _mm256_loadu_ps(src + 19 * stride); - s[20] = _mm256_loadu_ps(src + 20 * stride); - s[21] = _mm256_loadu_ps(src + 21 * stride); - s[22] = _mm256_loadu_ps(src + 22 * stride); - s[23] = _mm256_loadu_ps(src + 23 * stride); - s[24] = 
_mm256_loadu_ps(src + 24 * stride); - - const __m256 _2 = _mm256_set1_ps(2.0f); - const __m256 _4 = _mm256_set1_ps(4.0f); - const __m256 _8 = _mm256_set1_ps(8.0f); - __m256 t[5]; - t[0] = _mm256_add_ps(_mm256_add_ps(s[0], s[5]), _mm256_add_ps(s[10], s[15])); - t[1] = _mm256_add_ps(_mm256_add_ps(s[1], s[6]), _mm256_add_ps(s[11], s[16])); - t[2] = _mm256_add_ps(_mm256_add_ps(s[2], s[7]), _mm256_add_ps(s[12], s[17])); - t[3] = _mm256_add_ps(_mm256_add_ps(s[3], s[8]), _mm256_add_ps(s[13], s[18])); - t[4] = _mm256_add_ps(_mm256_add_ps(s[4], s[9]), _mm256_add_ps(s[14], s[19])); - WinogradKernel2x2Block4x4SetOutputGetRow(t, dst + 0); - - t[0] = _mm256_add_ps(_mm256_sub_ps(s[5], s[10]), _mm256_mul_ps(_2, s[15])); - t[1] = _mm256_add_ps(_mm256_sub_ps(s[6], s[11]), _mm256_mul_ps(_2, s[16])); - t[2] = _mm256_add_ps(_mm256_sub_ps(s[7], s[12]), _mm256_mul_ps(_2, s[17])); - t[3] = _mm256_add_ps(_mm256_sub_ps(s[8], s[13]), _mm256_mul_ps(_2, s[18])); - t[4] = _mm256_add_ps(_mm256_sub_ps(s[9], s[14]), _mm256_mul_ps(_2, s[19])); - WinogradKernel2x2Block4x4SetOutputGetRow(t, dst + 4); - - t[0] = _mm256_add_ps(_mm256_add_ps(s[5], s[10]), _mm256_mul_ps(_4, s[15])); - t[1] = _mm256_add_ps(_mm256_add_ps(s[6], s[11]), _mm256_mul_ps(_4, s[16])); - t[2] = _mm256_add_ps(_mm256_add_ps(s[7], s[12]), _mm256_mul_ps(_4, s[17])); - t[3] = _mm256_add_ps(_mm256_add_ps(s[8], s[13]), _mm256_mul_ps(_4, s[18])); - t[4] = _mm256_add_ps(_mm256_add_ps(s[9], s[14]), _mm256_mul_ps(_4, s[19])); - WinogradKernel2x2Block4x4SetOutputGetRow(t, dst + 8); - - t[0] = _mm256_add_ps(_mm256_sub_ps(s[5], s[10]), _mm256_add_ps(_mm256_mul_ps(_8, s[15]), s[20])); - t[1] = _mm256_add_ps(_mm256_sub_ps(s[6], s[11]), _mm256_add_ps(_mm256_mul_ps(_8, s[16]), s[21])); - t[2] = _mm256_add_ps(_mm256_sub_ps(s[7], s[12]), _mm256_add_ps(_mm256_mul_ps(_8, s[17]), s[22])); - t[3] = _mm256_add_ps(_mm256_sub_ps(s[8], s[13]), _mm256_add_ps(_mm256_mul_ps(_8, s[18]), s[23])); - t[4] = _mm256_add_ps(_mm256_sub_ps(s[9], s[14]), _mm256_add_ps(_mm256_mul_ps(_8, s[19]), s[24])); - WinogradKernel2x2Block4x4SetOutputGetRow(t, dst + 12); - } - - SIMD_INLINE void WinogradKernel2x2Block4x4SetOutputStore16(const __m256 src[16], float* dst, size_t dstS, size_t dstC) - { - _mm256_storeu_ps(dst + 0 * dstS + 0 * dstC, src[0]); - _mm256_storeu_ps(dst + 0 * dstS + 1 * dstC, src[1]); - _mm256_storeu_ps(dst + 0 * dstS + 2 * dstC, src[2]); - _mm256_storeu_ps(dst + 0 * dstS + 3 * dstC, src[3]); - _mm256_storeu_ps(dst + 1 * dstS + 0 * dstC, src[4]); - _mm256_storeu_ps(dst + 1 * dstS + 1 * dstC, src[5]); - _mm256_storeu_ps(dst + 1 * dstS + 2 * dstC, src[6]); - _mm256_storeu_ps(dst + 1 * dstS + 3 * dstC, src[7]); - _mm256_storeu_ps(dst + 2 * dstS + 0 * dstC, src[8]); - _mm256_storeu_ps(dst + 2 * dstS + 1 * dstC, src[9]); - _mm256_storeu_ps(dst + 2 * dstS + 2 * dstC, src[10]); - _mm256_storeu_ps(dst + 2 * dstS + 3 * dstC, src[11]); - _mm256_storeu_ps(dst + 3 * dstS + 0 * dstC, src[12]); - _mm256_storeu_ps(dst + 3 * dstS + 1 * dstC, src[13]); - _mm256_storeu_ps(dst + 3 * dstS + 2 * dstC, src[14]); - _mm256_storeu_ps(dst + 3 * dstS + 3 * dstC, src[15]); - } - - SIMD_INLINE void WinogradKernel2x2Block4x4SetOutput8t(const float* src, size_t srcStride, float* dst, size_t dstW, size_t dstC) - { - size_t dstS = dstW * dstC, dstCF = AlignLo(dstC, F); - for (size_t d = 0; d < dstCF; d += F) - { - __m256 tmp[16]; - WinogradKernel2x2Block4x4SetOutputLoad25(src + d, srcStride, tmp); - WinogradKernel2x2Block4x4SetOutputStore16(tmp, dst + d, dstS, dstC); - } - if (dstCF < dstC) - { - __m256 tmp[16]; - 
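// Editor's note: WinogradKernel2x2Block4x4SetOutputLoad25() below reduces the
// 5x5 accumulator tile S to the 4x4 output block as Y = A^T * S * A; from
// WinogradKernel2x2Block4x4SetOutputGetRow() the rows are
//     A^T = | 1  1  1  1  0 |
//           | 0  1 -1  2  0 |
//           | 0  1  1  4  0 |
//           | 0  1 -1  8  1 |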
WinogradKernel2x2Block4x4SetOutputLoad25(src + dstC - F, srcStride, tmp); - WinogradKernel2x2Block4x4SetOutputStore16(tmp, dst + dstC - F, dstS, dstC); - } - } - - SIMD_INLINE void WinogradKernel2x2Block4x4SetOutputStore16(const __m256 src[16], float* dst, size_t dstS, size_t dstC, size_t rowE, size_t colE) - { - for (size_t row = 0; row < rowE; ++row) - for (size_t col = 0; col < colE; ++col) - _mm256_storeu_ps(dst + row * dstS + col * dstC, src[row * 4 + col]); - } - - SIMD_INLINE void WinogradKernel2x2Block4x4SetOutput8t(const float* src, size_t srcStride, float* dst, size_t dstW, size_t dstC, size_t rowE, size_t colE) - { - size_t dstS = dstW * dstC, dstCF = AlignLo(dstC, F); - for (size_t d = 0; d < dstCF; d += F) - { - __m256 tmp[16]; - WinogradKernel2x2Block4x4SetOutputLoad25(src + d, srcStride, tmp); - WinogradKernel2x2Block4x4SetOutputStore16(tmp, dst + d, dstS, dstC, rowE, colE); - } - if (dstCF < dstC) - { - __m256 tmp[16]; - WinogradKernel2x2Block4x4SetOutputLoad25(src + dstC - F, srcStride, tmp); - WinogradKernel2x2Block4x4SetOutputStore16(tmp, dst + dstC - F, dstS, dstC, rowE, colE); - } - } - - void WinogradKernel2x2Block4x4SetOutput(const float* src, size_t srcStride, float* dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans) - { - if (trans ? (dstChannels < F) : true) - { - Base::WinogradKernel2x2Block4x4SetOutput(src, srcStride, dst, dstChannels, dstHeight, dstWidth, trans); - return; - } - size_t tileH = (dstHeight + 3) / 4; - size_t tileW = (dstWidth + 3) / 4; - size_t dstH4 = AlignLo(dstHeight, 4); - size_t dstW4 = AlignLo(dstWidth, 4); - size_t row, col; - for (row = 0; row < dstH4; row += 4) - { - for (col = 0; col < dstW4; col += 4) - WinogradKernel2x2Block4x4SetOutput8t(src, srcStride, dst + (row * dstWidth + col) * dstChannels, dstWidth, dstChannels), src += dstChannels; - if (col < dstWidth) - WinogradKernel2x2Block4x4SetOutput8t(src, srcStride, dst + (row * dstWidth + col) * dstChannels, dstWidth, dstChannels, 4, dstWidth - col), src += dstChannels; - } - if (row < dstHeight) - { - for (col = 0; col < dstW4; col += 4) - WinogradKernel2x2Block4x4SetOutput8t(src, srcStride, dst + (row * dstWidth + col) * dstChannels, dstWidth, dstChannels, dstHeight - row, 4), src += dstChannels; - if (col < dstWidth) - WinogradKernel2x2Block4x4SetOutput8t(src, srcStride, dst + (row * dstWidth + col) * dstChannels, dstWidth, dstChannels, dstHeight - row, dstWidth - col), src += dstChannels; - } - } - - //----------------------------------------------------------------------- - - SIMD_INLINE void WinogradKernel3x3Block2x2SetFilter8t(const float * src, float * dst, size_t stride) - { - const __m256 r2 = _mm256_set1_ps(1.0f / 2.0f); - const __m256 r4 = _mm256_set1_ps(1.0f / 4.0f); - - __m256 s[9]; - s[0] = _mm256_loadu_ps(src + 0 * stride); - s[1] = _mm256_loadu_ps(src + 1 * stride); - s[2] = _mm256_loadu_ps(src + 2 * stride); - s[3] = _mm256_loadu_ps(src + 3 * stride); - s[4] = _mm256_loadu_ps(src + 4 * stride); - s[5] = _mm256_loadu_ps(src + 5 * stride); - s[6] = _mm256_loadu_ps(src + 6 * stride); - s[7] = _mm256_loadu_ps(src + 7 * stride); - s[8] = _mm256_loadu_ps(src + 8 * stride); - - _mm256_storeu_ps(dst + 0 * stride, s[0]); - __m256 _0a2 = _mm256_add_ps(s[0], s[2]); - _mm256_storeu_ps(dst + 1 * stride, _mm256_mul_ps(_mm256_add_ps(_0a2, s[1]), r2)); - _mm256_storeu_ps(dst + 2 * stride, _mm256_mul_ps(_mm256_sub_ps(_0a2, s[1]), r2)); - _mm256_storeu_ps(dst + 3 * stride, s[2]); - - __m256 _0a6a3 = _mm256_add_ps(_mm256_add_ps(s[0], s[6]), s[3]); - 
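// Filter transform for Winograd F(2x2, 3x3): s[0..8] is a 3x3 kernel, one
// channel per lane, and the sixteen stores compute G * g * G^T with the usual
//     G = |  1    0    0  |
//         | 1/2  1/2  1/2 |
//         | 1/2 -1/2  1/2 |
//         |  0    0    1  |
// The _0a6a3 / _2a8a5 / _1a7a4 temporaries are the column sums (g0+g6+g3,
// etc.) shared by the two middle output rows.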
_mm256_storeu_ps(dst + 4 * stride, _mm256_mul_ps(_0a6a3, r2)); - __m256 _2a8a5 = _mm256_add_ps(_mm256_add_ps(s[2], s[8]), s[5]); - __m256 _1a7a4 = _mm256_add_ps(_mm256_add_ps(s[1], s[7]), s[4]); - _mm256_storeu_ps(dst + 5 * stride, _mm256_mul_ps(_mm256_add_ps(_mm256_add_ps(_0a6a3, _2a8a5), _1a7a4), r4)); - _mm256_storeu_ps(dst + 6 * stride, _mm256_mul_ps(_mm256_sub_ps(_mm256_add_ps(_0a6a3, _2a8a5), _1a7a4), r4)); - _mm256_storeu_ps(dst + 7 * stride, _mm256_mul_ps(_2a8a5, r2)); - - __m256 _0a6s3 = _mm256_sub_ps(_mm256_add_ps(s[0], s[6]), s[3]); - _mm256_storeu_ps(dst + 8 * stride, _mm256_mul_ps(_0a6s3, r2)); - __m256 _2a8s5 = _mm256_sub_ps(_mm256_add_ps(s[2], s[8]), s[5]); - __m256 _1a7s4 = _mm256_sub_ps(_mm256_add_ps(s[1], s[7]), s[4]); - _mm256_storeu_ps(dst + 9 * stride, _mm256_mul_ps(_mm256_add_ps(_mm256_add_ps(_0a6s3, _2a8s5), _1a7s4), r4)); - _mm256_storeu_ps(dst + 10 * stride, _mm256_mul_ps(_mm256_sub_ps(_mm256_add_ps(_0a6s3, _2a8s5), _1a7s4), r4)); - _mm256_storeu_ps(dst + 11 * stride, _mm256_mul_ps(_2a8s5, r2)); - - _mm256_storeu_ps(dst + 12 * stride, s[6]); - __m256 _6a8 = _mm256_add_ps(s[6], s[8]); - _mm256_storeu_ps(dst + 13 * stride, _mm256_mul_ps(_mm256_add_ps(_6a8, s[7]), r2)); - _mm256_storeu_ps(dst + 14 * stride, _mm256_mul_ps(_mm256_sub_ps(_6a8, s[7]), r2)); - _mm256_storeu_ps(dst + 15 * stride, s[8]); - } - - void WinogradKernel3x3Block2x2SetFilter(const float * src, size_t size, float * dst, SimdBool trans) - { - if (trans) - { - size_t size8 = AlignLo(size, 8), i = 0; - for (; i < size8; i += 8) - WinogradKernel3x3Block2x2SetFilter8t(src + i, dst + i, size); - for (; i < size; i += 1) - Base::WinogradKernel3x3Block2x2SetFilter1t(src + i, dst + i, size); - } - else - { - Sse::WinogradKernel3x3Block2x2SetFilter(src, size, dst, trans); - } - } - - //----------------------------------------------------------------------- - - SIMD_INLINE void WinogradKernel3x3Block2x2SetInputLoad8n(const float * src, __m256 * dst) - { - __m256 a0 = Load(src + 0, src + 8); - __m256 a1 = Load(src + 2, src + 10); - __m256 a2 = Load(src + 4, src + 12); - __m256 a3 = Load(src + 6, src + 14); - dst[0] = _mm256_shuffle_ps(a0, a2, 0x88); - dst[1] = _mm256_shuffle_ps(a0, a2, 0xDD); - dst[2] = _mm256_shuffle_ps(a1, a3, 0x88); - dst[3] = _mm256_shuffle_ps(a1, a3, 0xDD); - } - - SIMD_INLINE void WinogradKernel3x3Block2x2SetInputLoad8n(const float * src, __m256 * dst, PadType pad) - { - __m256 a0 = Set(pad == PadNose1 ? Sse::LoadPadZeroNose1(src + 0) : _mm_loadu_ps(src + 0), _mm_loadu_ps(src + 8)); - __m256 a1 = Load(src + 2, src + 10); - __m256 a2 = Load(src + 4, src + 12); - __m256 a3 = Set(_mm_loadu_ps(src + 6), pad == PadTail2 ? Sse::LoadPadZeroTail2(src + 14) : (pad == PadTail1 ? 
Sse::LoadPadZeroTail1(src + 14) : _mm_loadu_ps(src + 14))); - dst[0] = _mm256_shuffle_ps(a0, a2, 0x88); - dst[1] = _mm256_shuffle_ps(a0, a2, 0xDD); - dst[2] = _mm256_shuffle_ps(a1, a3, 0x88); - dst[3] = _mm256_shuffle_ps(a1, a3, 0xDD); - } - - SIMD_INLINE void WinogradKernel3x3Block2x2SetInputLoad8z(__m256 * dst) - { - dst[0] = _mm256_setzero_ps(); - dst[1] = _mm256_setzero_ps(); - dst[2] = _mm256_setzero_ps(); - dst[3] = _mm256_setzero_ps(); - } - - SIMD_INLINE void WinogradKernel3x3Block2x2SetInput8Store(const __m256 * src, float * dst, size_t stride) - { - _mm256_storeu_ps(dst + 0 * stride, _mm256_sub_ps(_mm256_sub_ps(src[0], src[8]), _mm256_sub_ps(src[2], src[10]))); - _mm256_storeu_ps(dst + 1 * stride, _mm256_add_ps(_mm256_sub_ps(src[1], src[9]), _mm256_sub_ps(src[2], src[10]))); - _mm256_storeu_ps(dst + 2 * stride, _mm256_sub_ps(_mm256_sub_ps(src[2], src[10]), _mm256_sub_ps(src[1], src[9]))); - _mm256_storeu_ps(dst + 3 * stride, _mm256_sub_ps(_mm256_sub_ps(src[1], src[9]), _mm256_sub_ps(src[3], src[11]))); - _mm256_storeu_ps(dst + 4 * stride, _mm256_sub_ps(_mm256_add_ps(src[4], src[8]), _mm256_add_ps(src[6], src[10]))); - _mm256_storeu_ps(dst + 5 * stride, _mm256_add_ps(_mm256_add_ps(src[5], src[9]), _mm256_add_ps(src[6], src[10]))); - _mm256_storeu_ps(dst + 6 * stride, _mm256_sub_ps(_mm256_add_ps(src[6], src[10]), _mm256_add_ps(src[5], src[9]))); - _mm256_storeu_ps(dst + 7 * stride, _mm256_sub_ps(_mm256_add_ps(src[5], src[9]), _mm256_add_ps(src[7], src[11]))); - _mm256_storeu_ps(dst + 8 * stride, _mm256_sub_ps(_mm256_sub_ps(src[8], src[4]), _mm256_sub_ps(src[10], src[6]))); - _mm256_storeu_ps(dst + 9 * stride, _mm256_add_ps(_mm256_sub_ps(src[9], src[5]), _mm256_sub_ps(src[10], src[6]))); - _mm256_storeu_ps(dst + 10 * stride, _mm256_sub_ps(_mm256_sub_ps(src[10], src[6]), _mm256_sub_ps(src[9], src[5]))); - _mm256_storeu_ps(dst + 11 * stride, _mm256_sub_ps(_mm256_sub_ps(src[9], src[5]), _mm256_sub_ps(src[11], src[7]))); - _mm256_storeu_ps(dst + 12 * stride, _mm256_sub_ps(_mm256_sub_ps(src[4], src[12]), _mm256_sub_ps(src[6], src[14]))); - _mm256_storeu_ps(dst + 13 * stride, _mm256_add_ps(_mm256_sub_ps(src[5], src[13]), _mm256_sub_ps(src[6], src[14]))); - _mm256_storeu_ps(dst + 14 * stride, _mm256_sub_ps(_mm256_sub_ps(src[6], src[14]), _mm256_sub_ps(src[5], src[13]))); - _mm256_storeu_ps(dst + 15 * stride, _mm256_sub_ps(_mm256_sub_ps(src[5], src[13]), _mm256_sub_ps(src[7], src[15]))); - } - - SIMD_INLINE void WinogradKernel3x3Block2x2SetInput8n(const float * src, size_t srcStride, float * dst, size_t dstStride) - { - __m256 t[16]; - WinogradKernel3x3Block2x2SetInputLoad8n(src + 0 * srcStride, t + 0); - WinogradKernel3x3Block2x2SetInputLoad8n(src + 1 * srcStride, t + 4); - WinogradKernel3x3Block2x2SetInputLoad8n(src + 2 * srcStride, t + 8); - WinogradKernel3x3Block2x2SetInputLoad8n(src + 3 * srcStride, t + 12); - WinogradKernel3x3Block2x2SetInput8Store(t, dst, dstStride); - } - - SIMD_INLINE void WinogradKernel3x3Block2x2SetInput8n(const float * src, size_t srcStride, PadType rowPad, PadType colPad, float * dst, size_t dstStride) - { - __m256 t[16]; - if (rowPad == PadNose1) - WinogradKernel3x3Block2x2SetInputLoad8z(t + 0); - else - WinogradKernel3x3Block2x2SetInputLoad8n(src + 0 * srcStride, t + 0, colPad); - WinogradKernel3x3Block2x2SetInputLoad8n(src + 1 * srcStride, t + 4, colPad); - if (rowPad == PadTail2) - WinogradKernel3x3Block2x2SetInputLoad8z(t + 8); - else - WinogradKernel3x3Block2x2SetInputLoad8n(src + 2 * srcStride, t + 8, colPad); - if (rowPad >= PadTail1) - 
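// Tile rows that fall outside the (padded) image bottom are zero-filled
// instead of loaded: PadTail1 zeroes only the last of the four tile rows,
// PadTail2 the last two, mirroring how PadNose1 zeroes the first row at the
// top edge.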
WinogradKernel3x3Block2x2SetInputLoad8z(t + 12); - else - WinogradKernel3x3Block2x2SetInputLoad8n(src + 3 * srcStride, t + 12, colPad); - WinogradKernel3x3Block2x2SetInput8Store(t, dst, dstStride); - } - - SIMD_INLINE void WinogradKernel3x3Block2x2SetInput8t(const float * src, size_t srcS, size_t srcC, __m256 dst[16]) - { - dst[0] = _mm256_loadu_ps(src + 0 * srcS + 0 * srcC); - dst[1] = _mm256_loadu_ps(src + 0 * srcS + 1 * srcC); - dst[2] = _mm256_loadu_ps(src + 0 * srcS + 2 * srcC); - dst[3] = _mm256_loadu_ps(src + 0 * srcS + 3 * srcC); - dst[4] = _mm256_loadu_ps(src + 1 * srcS + 0 * srcC); - dst[5] = _mm256_loadu_ps(src + 1 * srcS + 1 * srcC); - dst[6] = _mm256_loadu_ps(src + 1 * srcS + 2 * srcC); - dst[7] = _mm256_loadu_ps(src + 1 * srcS + 3 * srcC); - dst[8] = _mm256_loadu_ps(src + 2 * srcS + 0 * srcC); - dst[9] = _mm256_loadu_ps(src + 2 * srcS + 1 * srcC); - dst[10] = _mm256_loadu_ps(src + 2 * srcS + 2 * srcC); - dst[11] = _mm256_loadu_ps(src + 2 * srcS + 3 * srcC); - dst[12] = _mm256_loadu_ps(src + 3 * srcS + 0 * srcC); - dst[13] = _mm256_loadu_ps(src + 3 * srcS + 1 * srcC); - dst[14] = _mm256_loadu_ps(src + 3 * srcS + 2 * srcC); - dst[15] = _mm256_loadu_ps(src + 3 * srcS + 3 * srcC); - } - - SIMD_INLINE void WinogradKernel3x3Block2x2SetInput8t(const float * src, size_t srcW, size_t srcC, float * dst, size_t dstStride) - { - size_t srcS = srcW * srcC; - size_t srcCF = AlignLo(srcC, F); - for (size_t c = 0; c < srcCF; c += F) - { - __m256 tmp[16]; - WinogradKernel3x3Block2x2SetInput8t(src + c, srcS, srcC, tmp); - WinogradKernel3x3Block2x2SetInput8Store(tmp, dst + c, dstStride); - } - if (srcCF < srcC) - { - __m256 tmp[16]; - WinogradKernel3x3Block2x2SetInput8t(src + srcC - F, srcS, srcC, tmp); - WinogradKernel3x3Block2x2SetInput8Store(tmp, dst + srcC - F, dstStride); - } - } - - SIMD_INLINE void WinogradKernel3x3Block2x2SetInput8t(const float * src, size_t srcS, size_t srcC, size_t rowB, size_t rowE, size_t colB, size_t colE, __m256 dst[16]) - { - for (size_t row = 0; row < rowB; ++row) - { - for (size_t col = 0; col < 4; ++col) - dst[col] = _mm256_setzero_ps(); - dst += 4; - } - for (size_t row = rowB; row < rowE; ++row) - { - for (size_t col = 0; col < colB; ++col) - dst[col] = _mm256_setzero_ps(); - for (size_t col = colB; col < colE; ++col) - dst[col] = _mm256_loadu_ps(src + row * srcS + col * srcC); - for (size_t col = colE; col < 4; ++col) - dst[col] = _mm256_setzero_ps(); - dst += 4; - } - for (size_t row = rowE; row < 4; ++row) - { - for (size_t col = 0; col < 4; ++col) - dst[col] = _mm256_setzero_ps(); - dst += 4; - } - } - - SIMD_INLINE void WinogradKernel3x3Block2x2SetInput8t(const float * src, size_t srcW, size_t srcC, size_t rowB, size_t rowE, size_t colB, size_t colE, float * dst, size_t dstStride) - { - size_t srcS = srcW * srcC; - size_t srcCF = AlignLo(srcC, F); - for (size_t c = 0; c < srcCF; c += F) - { - __m256 tmp[16]; - WinogradKernel3x3Block2x2SetInput8t(src + c, srcS, srcC, rowB, rowE, colB, colE, tmp); - WinogradKernel3x3Block2x2SetInput8Store(tmp, dst + c, dstStride); - } - if (srcCF < srcC) - { - __m256 tmp[16]; - WinogradKernel3x3Block2x2SetInput8t(src + srcC - F, srcS, srcC, rowB, rowE, colB, colE, tmp); - WinogradKernel3x3Block2x2SetInput8Store(tmp, dst + srcC - F, dstStride); - } - } - - void WinogradKernel3x3Block2x2SetInput(const float* src, size_t srcChannels, size_t srcHeight, size_t srcWidth, - size_t padY, size_t padX, size_t padH, size_t padW, float* dst, size_t dstStride, SimdBool trans) - { - assert(padY == padX && padY == padH && padY == padW 
&& (padY == 0 || padY == 1)); - SimdBool pad = padY > 0 ? SimdTrue : SimdFalse; - if (trans ? (srcChannels < 8) : (srcHeight < 4 || srcWidth < 18)) - { - Sse::WinogradKernel3x3Block2x2SetInput(src, srcChannels, srcHeight, srcWidth, padY, padX, padH, padW, dst, dstStride, trans); - return; - } - size_t dstH = pad ? srcHeight : srcHeight - 2; - size_t dstW = pad ? srcWidth : srcWidth - 2; - size_t tileH = (dstH + 1) / 2; - size_t tileW = (dstW + 1) / 2; - size_t dstH2 = AlignLo(dstH, 2); - size_t dstW2 = AlignLo(dstW, 2); - if (trans) - { - size_t noseW = Simd::Min(4, dstW + 1); - size_t noseH = Simd::Min(4, dstH + 1); - size_t start = pad ? 2 : 0; - if (pad) - { - if (dstH == dstH2) - dstH2 -= 2; - if (dstW == dstW2) - dstW2 -= 2; - src -= (srcWidth + 1)*srcChannels; - } - size_t tailW = dstW - dstW2 + (pad ? 1 : 2); - size_t tailH = dstH - dstH2 + (pad ? 1 : 2); - size_t row = 0, col = 0; - if (pad) - { - if (pad) - WinogradKernel3x3Block2x2SetInput8t(src, srcWidth, srcChannels, 1, noseH, 1, noseW, dst, dstStride), dst += srcChannels; - for (col = start; col < dstW2; col += 2) - WinogradKernel3x3Block2x2SetInput8t(src + col * srcChannels, srcWidth, srcChannels, 1, noseH, 0, 4, dst, dstStride), dst += srcChannels; - if (col < dstW) - WinogradKernel3x3Block2x2SetInput8t(src + col * srcChannels, srcWidth, srcChannels, 1, noseH, 0, tailW, dst, dstStride), dst += srcChannels; - } - for (row = start; row < dstH2; row += 2) - { - if (pad) - WinogradKernel3x3Block2x2SetInput8t(src + row * srcWidth * srcChannels, srcWidth, srcChannels, 0, 4, 1, noseW, dst, dstStride), dst += srcChannels; - for (col = start; col < dstW2; col += 2) - WinogradKernel3x3Block2x2SetInput8t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, dst, dstStride), dst += srcChannels; - if (col < dstW) - WinogradKernel3x3Block2x2SetInput8t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, 0, 4, 0, tailW, dst, dstStride), dst += srcChannels; - } - if (row < dstH) - { - if (pad) - WinogradKernel3x3Block2x2SetInput8t(src + row * srcWidth* srcChannels, srcWidth, srcChannels, 0, tailH, 1, noseW, dst, dstStride), dst += srcChannels; - for (col = start; col < dstW2; col += 2) - WinogradKernel3x3Block2x2SetInput8t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, 0, tailH, 0, 4, dst, dstStride), dst += srcChannels; - if (col < dstW) - WinogradKernel3x3Block2x2SetInput8t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, 0, tailH, 0, tailW, dst, dstStride), dst += srcChannels; - } - } - else - { - size_t dstW16 = AlignLo(dstW, 16); - if (pad && dstW16 == dstW) - dstW16 -= 16; - PadType rowPad = dstH2 < dstH ? PadTail1 : PadNone; - PadType colPad = dstW2 < dstW ? PadTail1 : PadNone; - size_t tailCol = dstW2 < dstW ? dstW - 15 : dstW - 16; - size_t tailRow = dstH2 < dstH ? dstH - 1 : dstH - 2; - bool specialColTail = dstW16 < dstW || pad; - bool specialRowTail = dstH2 < dstH || pad; - if (pad) - { - src -= srcWidth + 1; - rowPad = dstH2 < dstH ? PadTail2 : PadTail1; - colPad = dstW2 < dstW ? 
PadTail2 : PadTail1; - if (dstH2 == dstH) - dstH2 -= 2; - } - for (size_t c = 0; c < srcChannels; ++c) - { - size_t row = 0, tileY = 0; - if (pad) - { - size_t col = 0, tileX = 0; - const float * s = src + row * srcWidth; - float * d = dst + tileY * tileW; - if (pad) - WinogradKernel3x3Block2x2SetInput8n(s + col, srcWidth, PadNose1, PadNose1, d + tileX, dstStride), col += 16, tileX += 8; - for (; col < dstW16; col += 16, tileX += 8) - WinogradKernel3x3Block2x2SetInput8n(s + col, srcWidth, PadNose1, PadNone, d + tileX, dstStride); - if (specialColTail) - WinogradKernel3x3Block2x2SetInput8n(s + tailCol, srcWidth, PadNose1, colPad, d + tileW - 8, dstStride); - row += 2, tileY += 1; - } - for (; row < dstH2; row += 2, tileY += 1) - { - size_t col = 0, tileX = 0; - const float * s = src + row * srcWidth; - float * d = dst + tileY * tileW; - if (pad) - WinogradKernel3x3Block2x2SetInput8n(s + col, srcWidth, PadNone, PadNose1, d + tileX, dstStride), col += 16, tileX += 8; - for (; col < dstW16; col += 16, tileX += 8) - WinogradKernel3x3Block2x2SetInput8n(s + col, srcWidth, d + tileX, dstStride); - if (specialColTail) - WinogradKernel3x3Block2x2SetInput8n(s + tailCol, srcWidth, PadNone, colPad, d + tileW - 8, dstStride); - } - if (specialRowTail) - { - size_t col = 0, tileX = 0; - const float * s = src + tailRow * srcWidth; - float * d = dst + (tileH - 1) * tileW; - if (pad) - WinogradKernel3x3Block2x2SetInput8n(s + col, srcWidth, rowPad, PadNose1, d + tileX, dstStride), col += 16, tileX += 8; - for (; col < dstW16; col += 16, tileX += 8) - WinogradKernel3x3Block2x2SetInput8n(s + col, srcWidth, rowPad, PadNone, d + tileX, dstStride); - if (specialColTail) - WinogradKernel3x3Block2x2SetInput8n(s + tailCol, srcWidth, rowPad, colPad, d + tileW - 8, dstStride); - } - src += srcWidth * srcHeight; - dst += tileW * tileH; - } - } - } - - //----------------------------------------------------------------------- - - SIMD_INLINE void WinogradKernel3x3Block2x2SetOutputLoad4(const float * src, size_t stride, __m256 * dst) - { - __m256 s0 = _mm256_loadu_ps(src + 0 * stride); - __m256 s1 = _mm256_loadu_ps(src + 1 * stride); - __m256 s2 = _mm256_loadu_ps(src + 2 * stride); - __m256 s3 = _mm256_loadu_ps(src + 3 * stride); - dst[0] = _mm256_add_ps(_mm256_add_ps(s0, s1), s2); - dst[1] = _mm256_sub_ps(_mm256_sub_ps(s1, s2), s3); - } - - SIMD_INLINE void WinogradKernel3x3Block2x2SetOutputLoad16(const float * src, size_t stride, __m256 * dst) - { - __m256 tmp[8]; - WinogradKernel3x3Block2x2SetOutputLoad4(src + 0 * stride, stride, tmp + 0); - WinogradKernel3x3Block2x2SetOutputLoad4(src + 4 * stride, stride, tmp + 2); - WinogradKernel3x3Block2x2SetOutputLoad4(src + 8 * stride, stride, tmp + 4); - WinogradKernel3x3Block2x2SetOutputLoad4(src + 12 * stride, stride, tmp + 6); - dst[0] = _mm256_add_ps(_mm256_add_ps(tmp[0], tmp[2]), tmp[4]); - dst[1] = _mm256_add_ps(_mm256_add_ps(tmp[1], tmp[3]), tmp[5]); - dst[2] = _mm256_sub_ps(_mm256_sub_ps(tmp[2], tmp[4]), tmp[6]); - dst[3] = _mm256_sub_ps(_mm256_sub_ps(tmp[3], tmp[5]), tmp[7]); - } - - SIMD_INLINE void WinogradKernel3x3Block2x2SetOutput8n(const float * src, size_t stride, __m256 * dst) - { - __m256 d[4], u[4]; - WinogradKernel3x3Block2x2SetOutputLoad16(src, stride, d); - u[0] = _mm256_unpacklo_ps(d[0], d[1]); - u[1] = _mm256_unpackhi_ps(d[0], d[1]); - u[2] = _mm256_unpacklo_ps(d[2], d[3]); - u[3] = _mm256_unpackhi_ps(d[2], d[3]); - dst[0] = _mm256_permute2f128_ps(u[0], u[1], 0x20); - dst[1] = _mm256_permute2f128_ps(u[0], u[1], 0x31); - dst[2] = 
_mm256_permute2f128_ps(u[2], u[3], 0x20); - dst[3] = _mm256_permute2f128_ps(u[2], u[3], 0x31); - } - - SIMD_INLINE void WinogradKernel3x3Block2x2SetOutput8n(const float * src, size_t srcStride, float * dst, size_t dstStride) - { - __m256 d[4]; - WinogradKernel3x3Block2x2SetOutput8n(src, srcStride, d); - _mm256_storeu_ps(dst + 0 * dstStride + 0, d[0]); - _mm256_storeu_ps(dst + 0 * dstStride + 8, d[1]); - _mm256_storeu_ps(dst + 1 * dstStride + 0, d[2]); - _mm256_storeu_ps(dst + 1 * dstStride + 8, d[3]); - } - - SIMD_INLINE void WinogradKernel3x3Block2x2SetOutput8n(const float * src, size_t srcStride, float * dst, size_t dstStride, bool lastRow, bool lastCol, const __m256 & mask) - { - __m256 d[4]; - WinogradKernel3x3Block2x2SetOutput8n(src, srcStride, d); - _mm256_storeu_ps(dst + 0, d[0]); - if (lastCol) - _mm256_storeu_ps(dst + 8, d[1]); - else - StoreMasked(dst + 8, d[1], mask); - if (lastRow) - { - dst += dstStride; - _mm256_storeu_ps(dst + 0, d[2]); - if (lastCol) - _mm256_storeu_ps(dst + 8, d[3]); - else - StoreMasked(dst + 8, d[3], mask); - } - } - - SIMD_INLINE void WinogradKernel3x3Block2x2SetOutputStore4(const __m256 src[4], float * dst, size_t dstS, size_t dstC) - { - _mm256_storeu_ps(dst + 0 * dstS + 0 * dstC, src[0]); - _mm256_storeu_ps(dst + 0 * dstS + 1 * dstC, src[1]); - _mm256_storeu_ps(dst + 1 * dstS + 0 * dstC, src[2]); - _mm256_storeu_ps(dst + 1 * dstS + 1 * dstC, src[3]); - } - - SIMD_INLINE void WinogradKernel3x3Block2x2SetOutput8t(const float * src, size_t srcStride, float * dst, size_t dstW, size_t dstC) - { - size_t dstS = dstW * dstC, dstCF = AlignLo(dstC, F); - for (size_t d = 0; d < dstCF; d += F) - { - __m256 tmp[4]; - WinogradKernel3x3Block2x2SetOutputLoad16(src + d, srcStride, tmp); - WinogradKernel3x3Block2x2SetOutputStore4(tmp, dst + d, dstS, dstC); - } - if (dstCF < dstC) - { - __m256 tmp[4]; - WinogradKernel3x3Block2x2SetOutputLoad16(src + dstC - F, srcStride, tmp); - WinogradKernel3x3Block2x2SetOutputStore4(tmp, dst + dstC - F, dstS, dstC); - } - } - - SIMD_INLINE void WinogradKernel3x3Block2x2SetOutputStore4(const __m256 src[4], float * dst, size_t dstS, size_t dstC, size_t rowE, size_t colE) - { - for (size_t row = 0; row < rowE; ++row) - for (size_t col = 0; col < colE; ++col) - _mm256_storeu_ps(dst + row * dstS + col * dstC, src[row * 2 + col]); - } - - SIMD_INLINE void WinogradKernel3x3Block2x2SetOutput8t(const float * src, size_t srcStride, float * dst, size_t dstW, size_t dstC, size_t rowE, size_t colE) - { - size_t dstS = dstW * dstC, dstCF = AlignLo(dstC, F); - for (size_t d = 0; d < dstCF; d += F) - { - __m256 tmp[4]; - WinogradKernel3x3Block2x2SetOutputLoad16(src + d, srcStride, tmp); - WinogradKernel3x3Block2x2SetOutputStore4(tmp, dst + d, dstS, dstC, rowE, colE); - } - if (dstCF < dstC) - { - __m256 tmp[4]; - WinogradKernel3x3Block2x2SetOutputLoad16(src + dstC - F, srcStride, tmp); - WinogradKernel3x3Block2x2SetOutputStore4(tmp, dst + dstC - F, dstS, dstC, rowE, colE); - } - } - - void WinogradKernel3x3Block2x2SetOutput(const float * src, size_t srcStride, float * dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans) - { - if (trans ? 
(dstChannels < 8) : (dstHeight < 2 || dstWidth < 16)) - { - Sse::WinogradKernel3x3Block2x2SetOutput(src, srcStride, dst, dstChannels, dstHeight, dstWidth, trans); - return; - } - size_t tileH = (dstHeight + 1) / 2; - size_t tileW = (dstWidth + 1) / 2; - size_t dstH2 = AlignLo(dstHeight, 2); - size_t dstW2 = AlignLo(dstWidth, 2); - if (trans) - { - size_t row, col; - for (row = 0; row < dstH2; row += 2) - { - for (col = 0; col < dstW2; col += 2) - WinogradKernel3x3Block2x2SetOutput8t(src, srcStride, dst + (row * dstWidth + col)*dstChannels, dstWidth, dstChannels), src += dstChannels; - if (col < dstWidth) - WinogradKernel3x3Block2x2SetOutput8t(src, srcStride, dst + (row * dstWidth + col)*dstChannels, dstWidth, dstChannels, 2, dstWidth - col), src += dstChannels; - } - if (row < dstHeight) - { - for (col = 0; col < dstW2; col += 2) - WinogradKernel3x3Block2x2SetOutput8t(src, srcStride, dst + (row * dstWidth + col)*dstChannels, dstWidth, dstChannels, dstHeight - row, 2), src += dstChannels; - if (col < dstWidth) - WinogradKernel3x3Block2x2SetOutput8t(src, srcStride, dst + (row * dstWidth + col)*dstChannels, dstWidth, dstChannels, dstHeight - row, dstWidth - col), src += dstChannels; - } - } - else - { - size_t dstW16 = AlignLo(dstWidth, 16); - __m256 tailMask = LeftNotZero32f(8 + dstW2 - dstWidth); - size_t tailCol = dstW2 < dstWidth ? dstWidth - 15 : dstWidth - 16; - for (size_t c = 0; c < dstChannels; ++c) - { - size_t row = 0, tileY = 0; - for (; row < dstH2; row += 2, tileY += 1) - { - size_t col = 0, tileX = 0; - const float * s = src + tileY * tileW; - float * d = dst + row * dstWidth; - for (; col < dstW16; col += 16, tileX += 8) - WinogradKernel3x3Block2x2SetOutput8n(s + tileX, srcStride, d + col, dstWidth); - if (col < dstWidth) - WinogradKernel3x3Block2x2SetOutput8n(s + tileW - 8, srcStride, d + tailCol, dstWidth, true, false, tailMask); - } - if (row < dstHeight) - { - size_t col = 0, tileX = 0; - const float * s = src + (tileH - 1) * tileW; - float * d = dst + (dstHeight - 1) * dstWidth; - for (; col < dstW16; col += 16, tileX += 8) - WinogradKernel3x3Block2x2SetOutput8n(s + tileX, srcStride, d + col, dstWidth, false, true, tailMask); - if (col < dstWidth) - WinogradKernel3x3Block2x2SetOutput8n(s + tileW - 8, srcStride, d + tailCol, dstWidth, false, false, tailMask); - } - src += tileW * tileH; - dst += dstHeight * dstWidth; - } - } - } - - //----------------------------------------------------------------------- - - SIMD_INLINE void WinogradKernel3x3Block3x3SetFilter8Row(const __m256 * t, float * dst, size_t stride) - { - const __m256 r6 = _mm256_set1_ps(1.0f / 6.0f); - const __m256 r3 = _mm256_set1_ps(1.0f / 3.0f); - const __m256 r2 = _mm256_set1_ps(1.0f / 2.0f); - const __m256 f2_3 = _mm256_set1_ps(2.0f / 3.0f); - const __m256 mr2 = _mm256_set1_ps(-1.0f / 2.0f); - - _mm256_storeu_ps(dst + 0 * stride, _mm256_mul_ps(r2, t[0])); - __m256 t0 = _mm256_add_ps(t[0], t[2]); - _mm256_storeu_ps(dst + 1 * stride, _mm256_mul_ps(mr2, _mm256_add_ps(t0, t[1]))); - _mm256_storeu_ps(dst + 2 * stride, _mm256_mul_ps(r6, _mm256_sub_ps(t[1], t0))); - _mm256_storeu_ps(dst + 3 * stride, _mm256_add_ps(_mm256_mul_ps(r6, t[0]), _mm256_add_ps(_mm256_mul_ps(r3, t[1]), _mm256_mul_ps(f2_3, t[2])))); - _mm256_storeu_ps(dst + 4 * stride, t[2]); - } - - SIMD_INLINE void WinogradKernel3x3Block3x3SetFilter8All(const __m256 * s, float * dst, size_t stride) - { - const __m256 r6 = _mm256_set1_ps(1.0f / 6.0f); - const __m256 r3 = _mm256_set1_ps(1.0f / 3.0f); - const __m256 r2 = _mm256_set1_ps(1.0f / 2.0f); - const 
__m256 f2_3 = _mm256_set1_ps(2.0f / 3.0f); - const __m256 mr2 = _mm256_set1_ps(-1.0f / 2.0f); - - __m256 t[3]; - t[0] = _mm256_mul_ps(r2, s[0]); - t[1] = _mm256_mul_ps(r2, s[1]); - t[2] = _mm256_mul_ps(r2, s[2]); - WinogradKernel3x3Block3x3SetFilter8Row(t, dst + 0 * stride, stride); - - t[0] = _mm256_mul_ps(mr2, _mm256_add_ps(_mm256_add_ps(s[0], s[6]), s[3])); - t[1] = _mm256_mul_ps(mr2, _mm256_add_ps(_mm256_add_ps(s[1], s[7]), s[4])); - t[2] = _mm256_mul_ps(mr2, _mm256_add_ps(_mm256_add_ps(s[2], s[8]), s[5])); - WinogradKernel3x3Block3x3SetFilter8Row(t, dst + 5 * stride, stride); - - t[0] = _mm256_mul_ps(r6, _mm256_sub_ps(s[3], _mm256_add_ps(s[0], s[6]))); - t[1] = _mm256_mul_ps(r6, _mm256_sub_ps(s[4], _mm256_add_ps(s[1], s[7]))); - t[2] = _mm256_mul_ps(r6, _mm256_sub_ps(s[5], _mm256_add_ps(s[2], s[8]))); - WinogradKernel3x3Block3x3SetFilter8Row(t, dst + 10 * stride, stride); - - t[0] = _mm256_add_ps(_mm256_mul_ps(r6, s[0]), _mm256_add_ps(_mm256_mul_ps(r3, s[3]), _mm256_mul_ps(f2_3, s[6]))); - t[1] = _mm256_add_ps(_mm256_mul_ps(r6, s[1]), _mm256_add_ps(_mm256_mul_ps(r3, s[4]), _mm256_mul_ps(f2_3, s[7]))); - t[2] = _mm256_add_ps(_mm256_mul_ps(r6, s[2]), _mm256_add_ps(_mm256_mul_ps(r3, s[5]), _mm256_mul_ps(f2_3, s[8]))); - WinogradKernel3x3Block3x3SetFilter8Row(t, dst + 15 * stride, stride); - - WinogradKernel3x3Block3x3SetFilter8Row(s + 6, dst + 20 * stride, stride); - } - - SIMD_INLINE void WinogradKernel3x3Block3x3SetFilter8t(const float * src, float * dst, size_t stride) - { - __m256 s[9]; - s[0] = _mm256_loadu_ps(src + 0 * stride); - s[1] = _mm256_loadu_ps(src + 1 * stride); - s[2] = _mm256_loadu_ps(src + 2 * stride); - s[3] = _mm256_loadu_ps(src + 3 * stride); - s[4] = _mm256_loadu_ps(src + 4 * stride); - s[5] = _mm256_loadu_ps(src + 5 * stride); - s[6] = _mm256_loadu_ps(src + 6 * stride); - s[7] = _mm256_loadu_ps(src + 7 * stride); - s[8] = _mm256_loadu_ps(src + 8 * stride); - WinogradKernel3x3Block3x3SetFilter8All(s, dst + 0 * stride, stride); - } - - void WinogradKernel3x3Block3x3SetFilter(const float * src, size_t size, float * dst, SimdBool trans) - { - size_t size8 = AlignLo(size, 8), i = 0; - if (trans) - { - for (; i < size8; i += 8) - WinogradKernel3x3Block3x3SetFilter8t(src + i, dst + i, size); - for (; i < size; i += 1) - Base::WinogradKernel3x3Block3x3SetFilter1t(src + i, dst + i, size); - } - else - { - Sse::WinogradKernel3x3Block3x3SetFilter(src, size, dst, trans); - } - } - - //----------------------------------------------------------------------- - - SIMD_INLINE void WinogradKernel3x3Block3x3SetInput8Store(const __m256 src[25], float * dst, size_t stride) - { - __m256 _2 = _mm256_set1_ps(2.0f); - __m256 _3 = _mm256_set1_ps(3.0f); - __m256 tmp[5]; - - tmp[0] = _mm256_add_ps(_mm256_mul_ps(_2, _mm256_sub_ps(src[0], src[10])), _mm256_sub_ps(src[15], src[5])); - tmp[1] = _mm256_add_ps(_mm256_mul_ps(_2, _mm256_sub_ps(src[1], src[11])), _mm256_sub_ps(src[16], src[6])); - tmp[2] = _mm256_add_ps(_mm256_mul_ps(_2, _mm256_sub_ps(src[2], src[12])), _mm256_sub_ps(src[17], src[7])); - tmp[3] = _mm256_add_ps(_mm256_mul_ps(_2, _mm256_sub_ps(src[3], src[13])), _mm256_sub_ps(src[18], src[8])); - tmp[4] = _mm256_add_ps(_mm256_mul_ps(_2, _mm256_sub_ps(src[4], src[14])), _mm256_sub_ps(src[19], src[9])); - _mm256_storeu_ps(dst + 0 * stride, _mm256_add_ps(_mm256_mul_ps(_2, _mm256_sub_ps(tmp[0], tmp[2])), _mm256_sub_ps(tmp[3], tmp[1]))); - _mm256_storeu_ps(dst + 1 * stride, _mm256_sub_ps(_mm256_sub_ps(tmp[3], tmp[2]), _mm256_mul_ps(_2, tmp[1]))); - _mm256_storeu_ps(dst + 2 * stride, 
_mm256_add_ps(_mm256_mul_ps(_2, tmp[1]), _mm256_sub_ps(tmp[3], _mm256_mul_ps(_3, tmp[2])))); - _mm256_storeu_ps(dst + 3 * stride, _mm256_sub_ps(tmp[3], tmp[1])); - _mm256_storeu_ps(dst + 4 * stride, _mm256_add_ps(_mm256_mul_ps(_2, _mm256_sub_ps(tmp[1], tmp[3])), _mm256_sub_ps(tmp[4], tmp[2]))); - - tmp[0] = _mm256_sub_ps(_mm256_sub_ps(src[15], src[10]), _mm256_mul_ps(_2, src[5])); - tmp[1] = _mm256_sub_ps(_mm256_sub_ps(src[16], src[11]), _mm256_mul_ps(_2, src[6])); - tmp[2] = _mm256_sub_ps(_mm256_sub_ps(src[17], src[12]), _mm256_mul_ps(_2, src[7])); - tmp[3] = _mm256_sub_ps(_mm256_sub_ps(src[18], src[13]), _mm256_mul_ps(_2, src[8])); - tmp[4] = _mm256_sub_ps(_mm256_sub_ps(src[19], src[14]), _mm256_mul_ps(_2, src[9])); - _mm256_storeu_ps(dst + 5 * stride, _mm256_add_ps(_mm256_mul_ps(_2, _mm256_sub_ps(tmp[0], tmp[2])), _mm256_sub_ps(tmp[3], tmp[1]))); - _mm256_storeu_ps(dst + 6 * stride, _mm256_sub_ps(_mm256_sub_ps(tmp[3], tmp[2]), _mm256_mul_ps(_2, tmp[1]))); - _mm256_storeu_ps(dst + 7 * stride, _mm256_add_ps(_mm256_mul_ps(_2, tmp[1]), _mm256_sub_ps(tmp[3], _mm256_mul_ps(_3, tmp[2])))); - _mm256_storeu_ps(dst + 8 * stride, _mm256_sub_ps(tmp[3], tmp[1])); - _mm256_storeu_ps(dst + 9 * stride, _mm256_add_ps(_mm256_mul_ps(_2, _mm256_sub_ps(tmp[1], tmp[3])), _mm256_sub_ps(tmp[4], tmp[2]))); - - tmp[0] = _mm256_add_ps(_mm256_mul_ps(_2, src[5]), _mm256_sub_ps(src[15], _mm256_mul_ps(_3, src[10]))); - tmp[1] = _mm256_add_ps(_mm256_mul_ps(_2, src[6]), _mm256_sub_ps(src[16], _mm256_mul_ps(_3, src[11]))); - tmp[2] = _mm256_add_ps(_mm256_mul_ps(_2, src[7]), _mm256_sub_ps(src[17], _mm256_mul_ps(_3, src[12]))); - tmp[3] = _mm256_add_ps(_mm256_mul_ps(_2, src[8]), _mm256_sub_ps(src[18], _mm256_mul_ps(_3, src[13]))); - tmp[4] = _mm256_add_ps(_mm256_mul_ps(_2, src[9]), _mm256_sub_ps(src[19], _mm256_mul_ps(_3, src[14]))); - _mm256_storeu_ps(dst + 10 * stride, _mm256_add_ps(_mm256_mul_ps(_2, _mm256_sub_ps(tmp[0], tmp[2])), _mm256_sub_ps(tmp[3], tmp[1]))); - _mm256_storeu_ps(dst + 11 * stride, _mm256_sub_ps(_mm256_sub_ps(tmp[3], tmp[2]), _mm256_mul_ps(_2, tmp[1]))); - _mm256_storeu_ps(dst + 12 * stride, _mm256_add_ps(_mm256_mul_ps(_2, tmp[1]), _mm256_sub_ps(tmp[3], _mm256_mul_ps(_3, tmp[2])))); - _mm256_storeu_ps(dst + 13 * stride, _mm256_sub_ps(tmp[3], tmp[1])); - _mm256_storeu_ps(dst + 14 * stride, _mm256_add_ps(_mm256_mul_ps(_2, _mm256_sub_ps(tmp[1], tmp[3])), _mm256_sub_ps(tmp[4], tmp[2]))); - - tmp[0] = _mm256_sub_ps(src[15], src[5]); - tmp[1] = _mm256_sub_ps(src[16], src[6]); - tmp[2] = _mm256_sub_ps(src[17], src[7]); - tmp[3] = _mm256_sub_ps(src[18], src[8]); - tmp[4] = _mm256_sub_ps(src[19], src[9]); - _mm256_storeu_ps(dst + 15 * stride, _mm256_add_ps(_mm256_mul_ps(_2, _mm256_sub_ps(tmp[0], tmp[2])), _mm256_sub_ps(tmp[3], tmp[1]))); - _mm256_storeu_ps(dst + 16 * stride, _mm256_sub_ps(_mm256_sub_ps(tmp[3], tmp[2]), _mm256_mul_ps(_2, tmp[1]))); - _mm256_storeu_ps(dst + 17 * stride, _mm256_add_ps(_mm256_mul_ps(_2, tmp[1]), _mm256_sub_ps(tmp[3], _mm256_mul_ps(_3, tmp[2])))); - _mm256_storeu_ps(dst + 18 * stride, _mm256_sub_ps(tmp[3], tmp[1])); - _mm256_storeu_ps(dst + 19 * stride, _mm256_add_ps(_mm256_mul_ps(_2, _mm256_sub_ps(tmp[1], tmp[3])), _mm256_sub_ps(tmp[4], tmp[2]))); - - tmp[0] = _mm256_add_ps(_mm256_mul_ps(_2, _mm256_sub_ps(src[5], src[15])), _mm256_sub_ps(src[20], src[10])); - tmp[1] = _mm256_add_ps(_mm256_mul_ps(_2, _mm256_sub_ps(src[6], src[16])), _mm256_sub_ps(src[21], src[11])); - tmp[2] = _mm256_add_ps(_mm256_mul_ps(_2, _mm256_sub_ps(src[7], src[17])), _mm256_sub_ps(src[22], src[12])); - 
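            /* The tmp values in this final group take the column combination
               2*(r1 - r3) + (r4 - r2) of the 5x5 input tile rows; tmp[3] and tmp[4]
               below complete the group. Reading the coefficients off this function,
               the column pass (the tmp groups) and the row pass (the stores) apply
               the same five 5-tap filters, so the routine computes dst = BT * d * B,
               where d is the 5x5 input tile (eight channels per __m256 lane) and BT
               is the input transform of the F(3x3, 3x3) Winograd algorithm:

                   BT = [ 2 -1 -2  1  0 ]
                        [ 0 -2 -1  1  0 ]
                        [ 0  2 -3  1  0 ]
                        [ 0 -1  0  1  0 ]
                        [ 0  2 -1 -2  1 ]
            */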
tmp[3] = _mm256_add_ps(_mm256_mul_ps(_2, _mm256_sub_ps(src[8], src[18])), _mm256_sub_ps(src[23], src[13])); - tmp[4] = _mm256_add_ps(_mm256_mul_ps(_2, _mm256_sub_ps(src[9], src[19])), _mm256_sub_ps(src[24], src[14])); - _mm256_storeu_ps(dst + 20 * stride, _mm256_add_ps(_mm256_mul_ps(_2, _mm256_sub_ps(tmp[0], tmp[2])), _mm256_sub_ps(tmp[3], tmp[1]))); - _mm256_storeu_ps(dst + 21 * stride, _mm256_sub_ps(_mm256_sub_ps(tmp[3], tmp[2]), _mm256_mul_ps(_2, tmp[1]))); - _mm256_storeu_ps(dst + 22 * stride, _mm256_add_ps(_mm256_mul_ps(_2, tmp[1]), _mm256_sub_ps(tmp[3], _mm256_mul_ps(_3, tmp[2])))); - _mm256_storeu_ps(dst + 23 * stride, _mm256_sub_ps(tmp[3], tmp[1])); - _mm256_storeu_ps(dst + 24 * stride, _mm256_add_ps(_mm256_mul_ps(_2, _mm256_sub_ps(tmp[1], tmp[3])), _mm256_sub_ps(tmp[4], tmp[2]))); - } - - SIMD_INLINE void WinogradKernel3x3Block3x3SetInput8t(const float * src, size_t srcS, size_t srcC, __m256 dst[25]) - { - dst[0] = _mm256_loadu_ps(src + 0 * srcS + 0 * srcC); - dst[1] = _mm256_loadu_ps(src + 0 * srcS + 1 * srcC); - dst[2] = _mm256_loadu_ps(src + 0 * srcS + 2 * srcC); - dst[3] = _mm256_loadu_ps(src + 0 * srcS + 3 * srcC); - dst[4] = _mm256_loadu_ps(src + 0 * srcS + 4 * srcC); - dst[5] = _mm256_loadu_ps(src + 1 * srcS + 0 * srcC); - dst[6] = _mm256_loadu_ps(src + 1 * srcS + 1 * srcC); - dst[7] = _mm256_loadu_ps(src + 1 * srcS + 2 * srcC); - dst[8] = _mm256_loadu_ps(src + 1 * srcS + 3 * srcC); - dst[9] = _mm256_loadu_ps(src + 1 * srcS + 4 * srcC); - dst[10] = _mm256_loadu_ps(src + 2 * srcS + 0 * srcC); - dst[11] = _mm256_loadu_ps(src + 2 * srcS + 1 * srcC); - dst[12] = _mm256_loadu_ps(src + 2 * srcS + 2 * srcC); - dst[13] = _mm256_loadu_ps(src + 2 * srcS + 3 * srcC); - dst[14] = _mm256_loadu_ps(src + 2 * srcS + 4 * srcC); - dst[15] = _mm256_loadu_ps(src + 3 * srcS + 0 * srcC); - dst[16] = _mm256_loadu_ps(src + 3 * srcS + 1 * srcC); - dst[17] = _mm256_loadu_ps(src + 3 * srcS + 2 * srcC); - dst[18] = _mm256_loadu_ps(src + 3 * srcS + 3 * srcC); - dst[19] = _mm256_loadu_ps(src + 3 * srcS + 4 * srcC); - dst[20] = _mm256_loadu_ps(src + 4 * srcS + 0 * srcC); - dst[21] = _mm256_loadu_ps(src + 4 * srcS + 1 * srcC); - dst[22] = _mm256_loadu_ps(src + 4 * srcS + 2 * srcC); - dst[23] = _mm256_loadu_ps(src + 4 * srcS + 3 * srcC); - dst[24] = _mm256_loadu_ps(src + 4 * srcS + 4 * srcC); - } - - SIMD_INLINE void WinogradKernel3x3Block3x3SetInput8t(const float * src, size_t srcW, size_t srcC, float * dst, size_t dstStride) - { - size_t srcS = srcW * srcC; - size_t srcCF = AlignLo(srcC, F); - for (size_t c = 0; c < srcCF; c += F) - { - __m256 tmp[25]; - WinogradKernel3x3Block3x3SetInput8t(src + c, srcS, srcC, tmp); - WinogradKernel3x3Block3x3SetInput8Store(tmp, dst + c, dstStride); - } - if (srcCF < srcC) - { - __m256 tmp[25]; - WinogradKernel3x3Block3x3SetInput8t(src + srcC - F, srcS, srcC, tmp); - WinogradKernel3x3Block3x3SetInput8Store(tmp, dst + srcC - F, dstStride); - } - } - - SIMD_INLINE void WinogradKernel3x3Block3x3SetInput8t(const float * src, size_t srcS, size_t srcC, size_t rowB, size_t rowE, size_t colB, size_t colE, __m256 dst[25]) - { - for (size_t row = 0; row < rowB; ++row) - { - for (size_t col = 0; col < 5; ++col) - dst[col] = _mm256_setzero_ps(); - dst += 5; - } - for (size_t row = rowB; row < rowE; ++row) - { - for (size_t col = 0; col < colB; ++col) - dst[col] = _mm256_setzero_ps(); - for (size_t col = colB; col < colE; ++col) - dst[col] = _mm256_loadu_ps(src + row * srcS + col * srcC); - for (size_t col = colE; col < 5; ++col) - dst[col] = _mm256_setzero_ps(); - dst += 5; - } - 
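            /* Rows rowE..4 below, like rows 0..rowB above and the columns outside
               [colB, colE) in the loop just closed, are zero-filled. Border tiles
               that overhang the image therefore reuse the same unmodified 5x5
               transform in WinogradKernel3x3Block3x3SetInput8Store instead of
               needing a separate edge-case kernel. */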
for (size_t row = rowE; row < 5; ++row) - { - for (size_t col = 0; col < 5; ++col) - dst[col] = _mm256_setzero_ps(); - dst += 5; - } - } - - SIMD_INLINE void WinogradKernel3x3Block3x3SetInput8t(const float * src, size_t srcW, size_t srcC, size_t rowB, size_t rowE, size_t colB, size_t colE, float * dst, size_t dstStride) - { - size_t srcS = srcW * srcC; - size_t srcCF = AlignLo(srcC, F); - for (size_t c = 0; c < srcCF; c += F) - { - __m256 tmp[25]; - WinogradKernel3x3Block3x3SetInput8t(src + c, srcS, srcC, rowB, rowE, colB, colE, tmp); - WinogradKernel3x3Block3x3SetInput8Store(tmp, dst + c, dstStride); - } - if (srcCF < srcC) - { - __m256 tmp[25]; - WinogradKernel3x3Block3x3SetInput8t(src + srcC - F, srcS, srcC, rowB, rowE, colB, colE, tmp); - WinogradKernel3x3Block3x3SetInput8Store(tmp, dst + srcC - F, dstStride); - } - } - - void WinogradKernel3x3Block3x3SetInput(const float* src, size_t srcChannels, size_t srcHeight, size_t srcWidth, - size_t padY, size_t padX, size_t padH, size_t padW, float* dst, size_t dstStride, SimdBool trans) - { - assert(padY == padX && padY == padH && padY == padW && (padY == 0 || padY == 1)); - SimdBool pad = padY > 0 ? SimdTrue : SimdFalse; - if (trans ? (srcChannels < 8) : (srcHeight < 5 || srcWidth < 15)) - { - Sse::WinogradKernel3x3Block3x3SetInput(src, srcChannels, srcHeight, srcWidth, padY, padX, padH, padW, dst, dstStride, trans); - return; - } - size_t dstH = pad ? srcHeight : srcHeight - 2; - size_t dstW = pad ? srcWidth : srcWidth - 2; - size_t tileH = (dstH + 2) / 3; - size_t tileW = (dstW + 2) / 3; - size_t dstH3 = AlignLoAny(dstH, 3); - size_t dstW3 = AlignLoAny(dstW, 3); - if (trans) - { - size_t noseW = Simd::Min(5, dstW + 1); - size_t noseH = Simd::Min(5, dstH + 1); - size_t start = pad ? 3 : 0; - if (pad) - { - if (dstH == dstH3) - dstH3 -= 3; - if (dstW == dstW3) - dstW3 -= 3; - src -= (srcWidth + 1)*srcChannels; - } - size_t tailW = dstW - dstW3 + (pad ? 1 : 2); - size_t tailH = dstH - dstH3 + (pad ? 
1 : 2); - size_t row = 0, col = 0; - if (pad) - { - if (pad) - WinogradKernel3x3Block3x3SetInput8t(src, srcWidth, srcChannels, 1, noseH, 1, noseW, dst, dstStride), dst += srcChannels; - for (col = start; col < dstW3; col += 3) - WinogradKernel3x3Block3x3SetInput8t(src + col * srcChannels, srcWidth, srcChannels, 1, noseH, 0, 5, dst, dstStride), dst += srcChannels; - if (col < dstW) - WinogradKernel3x3Block3x3SetInput8t(src + col * srcChannels, srcWidth, srcChannels, 1, noseH, 0, tailW, dst, dstStride), dst += srcChannels; - } - for (row = start; row < dstH3; row += 3) - { - if (pad) - WinogradKernel3x3Block3x3SetInput8t(src + row * srcWidth * srcChannels, srcWidth, srcChannels, 0, 5, 1, noseW, dst, dstStride), dst += srcChannels; - for (col = start; col < dstW3; col += 3) - WinogradKernel3x3Block3x3SetInput8t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, dst, dstStride), dst += srcChannels; - if (col < dstW) - WinogradKernel3x3Block3x3SetInput8t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, 0, 5, 0, tailW, dst, dstStride), dst += srcChannels; - } - if (row < dstH) - { - if (pad) - WinogradKernel3x3Block3x3SetInput8t(src + row * srcWidth* srcChannels, srcWidth, srcChannels, 0, tailH, 1, noseW, dst, dstStride), dst += srcChannels; - for (col = start; col < dstW3; col += 3) - WinogradKernel3x3Block3x3SetInput8t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, 0, tailH, 0, 5, dst, dstStride), dst += srcChannels; - if (col < dstW) - WinogradKernel3x3Block3x3SetInput8t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, 0, tailH, 0, tailW, dst, dstStride), dst += srcChannels; - } - } - else - { - Base::WinogradKernel3x3Block3x3SetInput(src, srcChannels, srcHeight, srcWidth, padY, padX, padH, padW, dst, dstStride, trans); - } - } - - //----------------------------------------------------------------------- - - SIMD_INLINE void WinogradKernel3x3Block3x3SetOutputLoad25(const float * src, size_t stride, __m256 dst[9]) - { - __m256 s[25]; - s[0] = _mm256_loadu_ps(src + 0 * stride); - s[1] = _mm256_loadu_ps(src + 1 * stride); - s[2] = _mm256_loadu_ps(src + 2 * stride); - s[3] = _mm256_loadu_ps(src + 3 * stride); - s[4] = _mm256_loadu_ps(src + 4 * stride); - s[5] = _mm256_loadu_ps(src + 5 * stride); - s[6] = _mm256_loadu_ps(src + 6 * stride); - s[7] = _mm256_loadu_ps(src + 7 * stride); - s[8] = _mm256_loadu_ps(src + 8 * stride); - s[9] = _mm256_loadu_ps(src + 9 * stride); - s[10] = _mm256_loadu_ps(src + 10 * stride); - s[11] = _mm256_loadu_ps(src + 11 * stride); - s[12] = _mm256_loadu_ps(src + 12 * stride); - s[13] = _mm256_loadu_ps(src + 13 * stride); - s[14] = _mm256_loadu_ps(src + 14 * stride); - s[15] = _mm256_loadu_ps(src + 15 * stride); - s[16] = _mm256_loadu_ps(src + 16 * stride); - s[17] = _mm256_loadu_ps(src + 17 * stride); - s[18] = _mm256_loadu_ps(src + 18 * stride); - s[19] = _mm256_loadu_ps(src + 19 * stride); - s[20] = _mm256_loadu_ps(src + 20 * stride); - s[21] = _mm256_loadu_ps(src + 21 * stride); - s[22] = _mm256_loadu_ps(src + 22 * stride); - s[23] = _mm256_loadu_ps(src + 23 * stride); - s[24] = _mm256_loadu_ps(src + 24 * stride); - - __m256 _2 = _mm256_set1_ps(2.0f); - __m256 _4 = _mm256_set1_ps(4.0f); - __m256 t[5]; - t[0] = _mm256_add_ps(_mm256_add_ps(s[0], s[5]), _mm256_add_ps(s[10], s[15])); - t[1] = _mm256_add_ps(_mm256_add_ps(s[1], s[6]), _mm256_add_ps(s[11], s[16])); - t[2] = _mm256_add_ps(_mm256_add_ps(s[2], s[7]), _mm256_add_ps(s[12], s[17])); - t[3] = _mm256_add_ps(_mm256_add_ps(s[3], s[8]), 
_mm256_add_ps(s[13], s[18])); - t[4] = _mm256_add_ps(_mm256_add_ps(s[4], s[9]), _mm256_add_ps(s[14], s[19])); - dst[0] = _mm256_add_ps(_mm256_add_ps(t[0], t[1]), _mm256_add_ps(t[2], t[3])); - dst[1] = _mm256_add_ps(_mm256_sub_ps(t[1], t[2]), _mm256_mul_ps(_2, t[3])); - dst[2] = _mm256_add_ps(_mm256_add_ps(t[1], t[2]), _mm256_add_ps(_mm256_mul_ps(_4, t[3]), t[4])); - - t[0] = _mm256_add_ps(_mm256_sub_ps(s[5], s[10]), _mm256_mul_ps(_2, s[15])); - t[1] = _mm256_add_ps(_mm256_sub_ps(s[6], s[11]), _mm256_mul_ps(_2, s[16])); - t[2] = _mm256_add_ps(_mm256_sub_ps(s[7], s[12]), _mm256_mul_ps(_2, s[17])); - t[3] = _mm256_add_ps(_mm256_sub_ps(s[8], s[13]), _mm256_mul_ps(_2, s[18])); - t[4] = _mm256_add_ps(_mm256_sub_ps(s[9], s[14]), _mm256_mul_ps(_2, s[19])); - dst[3] = _mm256_add_ps(_mm256_add_ps(t[0], t[1]), _mm256_add_ps(t[2], t[3])); - dst[4] = _mm256_add_ps(_mm256_sub_ps(t[1], t[2]), _mm256_mul_ps(_2, t[3])); - dst[5] = _mm256_add_ps(_mm256_add_ps(t[1], t[2]), _mm256_add_ps(_mm256_mul_ps(_4, t[3]), t[4])); - - t[0] = _mm256_add_ps(_mm256_add_ps(s[5], s[10]), _mm256_add_ps(_mm256_mul_ps(_4, s[15]), s[20])); - t[1] = _mm256_add_ps(_mm256_add_ps(s[6], s[11]), _mm256_add_ps(_mm256_mul_ps(_4, s[16]), s[21])); - t[2] = _mm256_add_ps(_mm256_add_ps(s[7], s[12]), _mm256_add_ps(_mm256_mul_ps(_4, s[17]), s[22])); - t[3] = _mm256_add_ps(_mm256_add_ps(s[8], s[13]), _mm256_add_ps(_mm256_mul_ps(_4, s[18]), s[23])); - t[4] = _mm256_add_ps(_mm256_add_ps(s[9], s[14]), _mm256_add_ps(_mm256_mul_ps(_4, s[19]), s[24])); - dst[6] = _mm256_add_ps(_mm256_add_ps(t[0], t[1]), _mm256_add_ps(t[2], t[3])); - dst[7] = _mm256_add_ps(_mm256_sub_ps(t[1], t[2]), _mm256_mul_ps(_2, t[3])); - dst[8] = _mm256_add_ps(_mm256_add_ps(t[1], t[2]), _mm256_add_ps(_mm256_mul_ps(_4, t[3]), t[4])); - } - - SIMD_INLINE void WinogradKernel3x3Block3x3SetOutputStore9(const __m256 src[9], float * dst, size_t dstS, size_t dstC) - { - _mm256_storeu_ps(dst + 0 * dstS + 0 * dstC, src[0]); - _mm256_storeu_ps(dst + 0 * dstS + 1 * dstC, src[1]); - _mm256_storeu_ps(dst + 0 * dstS + 2 * dstC, src[2]); - _mm256_storeu_ps(dst + 1 * dstS + 0 * dstC, src[3]); - _mm256_storeu_ps(dst + 1 * dstS + 1 * dstC, src[4]); - _mm256_storeu_ps(dst + 1 * dstS + 2 * dstC, src[5]); - _mm256_storeu_ps(dst + 2 * dstS + 0 * dstC, src[6]); - _mm256_storeu_ps(dst + 2 * dstS + 1 * dstC, src[7]); - _mm256_storeu_ps(dst + 2 * dstS + 2 * dstC, src[8]); - } - - SIMD_INLINE void WinogradKernel3x3Block3x3SetOutput8t(const float * src, size_t srcStride, float * dst, size_t dstW, size_t dstC) - { - size_t dstS = dstW * dstC, dstCF = AlignLo(dstC, F); - for (size_t d = 0; d < dstCF; d += F) - { - __m256 tmp[9]; - WinogradKernel3x3Block3x3SetOutputLoad25(src + d, srcStride, tmp); - WinogradKernel3x3Block3x3SetOutputStore9(tmp, dst + d, dstS, dstC); - } - if (dstCF < dstC) - { - __m256 tmp[9]; - WinogradKernel3x3Block3x3SetOutputLoad25(src + dstC - F, srcStride, tmp); - WinogradKernel3x3Block3x3SetOutputStore9(tmp, dst + dstC - F, dstS, dstC); - } - } - - SIMD_INLINE void WinogradKernel3x3Block3x3SetOutputStore9(const __m256 src[16], float * dst, size_t dstS, size_t dstC, size_t rowE, size_t colE) - { - for (size_t row = 0; row < rowE; ++row) - for (size_t col = 0; col < colE; ++col) - _mm256_storeu_ps(dst + row * dstS + col * dstC, src[row * 3 + col]); - } - - SIMD_INLINE void WinogradKernel3x3Block3x3SetOutput8t(const float * src, size_t srcStride, float * dst, size_t dstW, size_t dstC, size_t rowE, size_t colE) - { - size_t dstS = dstW * dstC, dstCF = AlignLo(dstC, F); - for (size_t d = 0; 
d < dstCF; d += F) - { - __m256 tmp[9]; - WinogradKernel3x3Block3x3SetOutputLoad25(src + d, srcStride, tmp); - WinogradKernel3x3Block3x3SetOutputStore9(tmp, dst + d, dstS, dstC, rowE, colE); - } - if (dstCF < dstC) - { - __m256 tmp[9]; - WinogradKernel3x3Block3x3SetOutputLoad25(src + dstC - F, srcStride, tmp); - WinogradKernel3x3Block3x3SetOutputStore9(tmp, dst + dstC - F, dstS, dstC, rowE, colE); - } - } - - void WinogradKernel3x3Block3x3SetOutput(const float * src, size_t srcStride, float * dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans) - { - if (trans ? (dstChannels < 8) : (dstHeight < 3 || dstWidth < 15)) - { - Sse::WinogradKernel3x3Block3x3SetOutput(src, srcStride, dst, dstChannels, dstHeight, dstWidth, trans); - return; - } - size_t tileH = (dstHeight + 2) / 3; - size_t tileW = (dstWidth + 2) / 3; - size_t dstH3 = AlignLoAny(dstHeight, 3); - size_t dstW3 = AlignLoAny(dstWidth, 3); - if (trans) - { - size_t row, col; - for (row = 0; row < dstH3; row += 3) - { - for (col = 0; col < dstW3; col += 3) - WinogradKernel3x3Block3x3SetOutput8t(src, srcStride, dst + (row * dstWidth + col)*dstChannels, dstWidth, dstChannels), src += dstChannels; - if (col < dstWidth) - WinogradKernel3x3Block3x3SetOutput8t(src, srcStride, dst + (row * dstWidth + col)*dstChannels, dstWidth, dstChannels, 3, dstWidth - col), src += dstChannels; - } - if (row < dstHeight) - { - for (col = 0; col < dstW3; col += 3) - WinogradKernel3x3Block3x3SetOutput8t(src, srcStride, dst + (row * dstWidth + col)*dstChannels, dstWidth, dstChannels, dstHeight - row, 3), src += dstChannels; - if (col < dstWidth) - WinogradKernel3x3Block3x3SetOutput8t(src, srcStride, dst + (row * dstWidth + col)*dstChannels, dstWidth, dstChannels, dstHeight - row, dstWidth - col), src += dstChannels; - } - } - else - { - Base::WinogradKernel3x3Block3x3SetOutput(src, srcStride, dst, dstChannels, dstHeight, dstWidth, trans); - } - } - - //----------------------------------------------------------------------- - - SIMD_INLINE void WinogradKernel3x3Block4x4SetFilter8Row(const __m256 * t, float * dst, size_t stride) - { - const __m256 r4 = _mm256_set1_ps(1.0f / 4.0f); - const __m256 r6 = _mm256_set1_ps(1.0f / 6.0f); - const __m256 mr6 = _mm256_set1_ps(-1.0f / 6.0f); - const __m256 r12 = _mm256_set1_ps(1.0f / 12.0f); - const __m256 r24 = _mm256_set1_ps(1.0f / 24.0f); - _mm256_storeu_ps(dst + 0 * stride, _mm256_mul_ps(r4, t[0])); - __m256 t0 = _mm256_add_ps(t[0], t[2]); - _mm256_storeu_ps(dst + 1 * stride, _mm256_mul_ps(mr6, _mm256_add_ps(t0, t[1]))); - _mm256_storeu_ps(dst + 2 * stride, _mm256_mul_ps(mr6, _mm256_sub_ps(t0, t[1]))); - __m256 t1 = _mm256_add_ps(_mm256_mul_ps(r24, t[0]), _mm256_mul_ps(r6, t[2])); - __m256 t2 = _mm256_mul_ps(r12, t[1]); - _mm256_storeu_ps(dst + 3 * stride, _mm256_add_ps(t1, t2)); - _mm256_storeu_ps(dst + 4 * stride, _mm256_sub_ps(t1, t2)); - _mm256_storeu_ps(dst + 5 * stride, t[2]); - } - - SIMD_INLINE void WinogradKernel3x3Block4x4SetFilter8All(const __m256 * s, float * dst, size_t stride) - { - const __m256 r4 = _mm256_set1_ps(1.0f / 4.0f); - const __m256 r6 = _mm256_set1_ps(1.0f / 6.0f); - const __m256 mr6 = _mm256_set1_ps(-1.0f / 6.0f); - const __m256 r12 = _mm256_set1_ps(1.0f / 12.0f); - const __m256 r24 = _mm256_set1_ps(1.0f / 24.0f); - - __m256 t[3]; - t[0] = _mm256_mul_ps(r4, s[0]); - t[1] = _mm256_mul_ps(r4, s[1]); - t[2] = _mm256_mul_ps(r4, s[2]); - WinogradKernel3x3Block4x4SetFilter8Row(t, dst + 0 * stride, stride); - - t[0] = _mm256_mul_ps(mr6, _mm256_add_ps(_mm256_add_ps(s[0], s[3]), 
s[6])); - t[1] = _mm256_mul_ps(mr6, _mm256_add_ps(_mm256_add_ps(s[1], s[4]), s[7])); - t[2] = _mm256_mul_ps(mr6, _mm256_add_ps(_mm256_add_ps(s[2], s[5]), s[8])); - WinogradKernel3x3Block4x4SetFilter8Row(t, dst + 6 * stride, stride); - - t[0] = _mm256_mul_ps(mr6, _mm256_add_ps(_mm256_sub_ps(s[0], s[3]), s[6])); - t[1] = _mm256_mul_ps(mr6, _mm256_add_ps(_mm256_sub_ps(s[1], s[4]), s[7])); - t[2] = _mm256_mul_ps(mr6, _mm256_add_ps(_mm256_sub_ps(s[2], s[5]), s[8])); - WinogradKernel3x3Block4x4SetFilter8Row(t, dst + 12 * stride, stride); - - t[0] = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(r24, s[0]), _mm256_mul_ps(r12, s[3])), _mm256_mul_ps(r6, s[6])); - t[1] = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(r24, s[1]), _mm256_mul_ps(r12, s[4])), _mm256_mul_ps(r6, s[7])); - t[2] = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(r24, s[2]), _mm256_mul_ps(r12, s[5])), _mm256_mul_ps(r6, s[8])); - WinogradKernel3x3Block4x4SetFilter8Row(t, dst + 18 * stride, stride); - - t[0] = _mm256_add_ps(_mm256_sub_ps(_mm256_mul_ps(r24, s[0]), _mm256_mul_ps(r12, s[3])), _mm256_mul_ps(r6, s[6])); - t[1] = _mm256_add_ps(_mm256_sub_ps(_mm256_mul_ps(r24, s[1]), _mm256_mul_ps(r12, s[4])), _mm256_mul_ps(r6, s[7])); - t[2] = _mm256_add_ps(_mm256_sub_ps(_mm256_mul_ps(r24, s[2]), _mm256_mul_ps(r12, s[5])), _mm256_mul_ps(r6, s[8])); - WinogradKernel3x3Block4x4SetFilter8Row(t, dst + 24 * stride, stride); - - WinogradKernel3x3Block4x4SetFilter8Row(s + 6, dst + 30 * stride, stride); - } - - SIMD_INLINE void WinogradKernel3x3Block4x4SetFilter8t(const float * src, float * dst, size_t stride) - { - __m256 s[9]; - s[0] = _mm256_loadu_ps(src + 0 * stride); - s[1] = _mm256_loadu_ps(src + 1 * stride); - s[2] = _mm256_loadu_ps(src + 2 * stride); - s[3] = _mm256_loadu_ps(src + 3 * stride); - s[4] = _mm256_loadu_ps(src + 4 * stride); - s[5] = _mm256_loadu_ps(src + 5 * stride); - s[6] = _mm256_loadu_ps(src + 6 * stride); - s[7] = _mm256_loadu_ps(src + 7 * stride); - s[8] = _mm256_loadu_ps(src + 8 * stride); - WinogradKernel3x3Block4x4SetFilter8All(s, dst + 0 * stride, stride); - } - - void WinogradKernel3x3Block4x4SetFilter(const float * src, size_t size, float * dst, SimdBool trans) - { - if (trans) - { - size_t size8 = AlignLo(size, 8), i = 0; - for (; i < size8; i += 8) - WinogradKernel3x3Block4x4SetFilter8t(src + i, dst + i, size); - for (; i < size; i += 1) - Base::WinogradKernel3x3Block4x4SetFilter1t(src + i, dst + i, size); - } - else - { - Sse::WinogradKernel3x3Block4x4SetFilter(src, size, dst, trans); - } - } - - //----------------------------------------------------------------------- - - SIMD_INLINE void WinogradKernel3x3Block4x4SetInput8Store(const __m256 src[36], float * dst, size_t stride) - { - __m256 _2 = _mm256_set1_ps(2.0f); - __m256 _4 = _mm256_set1_ps(4.0f); - __m256 _5 = _mm256_set1_ps(5.0f); - __m256 tmp[36]; - tmp[0] = _mm256_add_ps(_mm256_sub_ps(_mm256_mul_ps(_4, src[0]), _mm256_mul_ps(_5, src[12])), src[24]); - tmp[1] = _mm256_add_ps(_mm256_sub_ps(_mm256_mul_ps(_4, src[1]), _mm256_mul_ps(_5, src[13])), src[25]); - tmp[2] = _mm256_add_ps(_mm256_sub_ps(_mm256_mul_ps(_4, src[2]), _mm256_mul_ps(_5, src[14])), src[26]); - tmp[3] = _mm256_add_ps(_mm256_sub_ps(_mm256_mul_ps(_4, src[3]), _mm256_mul_ps(_5, src[15])), src[27]); - tmp[4] = _mm256_add_ps(_mm256_sub_ps(_mm256_mul_ps(_4, src[4]), _mm256_mul_ps(_5, src[16])), src[28]); - tmp[5] = _mm256_add_ps(_mm256_sub_ps(_mm256_mul_ps(_4, src[5]), _mm256_mul_ps(_5, src[17])), src[29]); - tmp[6] = _mm256_sub_ps(_mm256_add_ps(src[18], src[24]), _mm256_mul_ps(_4, _mm256_add_ps(src[6], 
src[12]))); - tmp[7] = _mm256_sub_ps(_mm256_add_ps(src[19], src[25]), _mm256_mul_ps(_4, _mm256_add_ps(src[7], src[13]))); - tmp[8] = _mm256_sub_ps(_mm256_add_ps(src[20], src[26]), _mm256_mul_ps(_4, _mm256_add_ps(src[8], src[14]))); - tmp[9] = _mm256_sub_ps(_mm256_add_ps(src[21], src[27]), _mm256_mul_ps(_4, _mm256_add_ps(src[9], src[15]))); - tmp[10] = _mm256_sub_ps(_mm256_add_ps(src[22], src[28]), _mm256_mul_ps(_4, _mm256_add_ps(src[10], src[16]))); - tmp[11] = _mm256_sub_ps(_mm256_add_ps(src[23], src[29]), _mm256_mul_ps(_4, _mm256_add_ps(src[11], src[17]))); - tmp[12] = _mm256_add_ps(_mm256_mul_ps(_4, _mm256_sub_ps(src[6], src[12])), _mm256_sub_ps(src[24], src[18])); - tmp[13] = _mm256_add_ps(_mm256_mul_ps(_4, _mm256_sub_ps(src[7], src[13])), _mm256_sub_ps(src[25], src[19])); - tmp[14] = _mm256_add_ps(_mm256_mul_ps(_4, _mm256_sub_ps(src[8], src[14])), _mm256_sub_ps(src[26], src[20])); - tmp[15] = _mm256_add_ps(_mm256_mul_ps(_4, _mm256_sub_ps(src[9], src[15])), _mm256_sub_ps(src[27], src[21])); - tmp[16] = _mm256_add_ps(_mm256_mul_ps(_4, _mm256_sub_ps(src[10], src[16])), _mm256_sub_ps(src[28], src[22])); - tmp[17] = _mm256_add_ps(_mm256_mul_ps(_4, _mm256_sub_ps(src[11], src[17])), _mm256_sub_ps(src[29], src[23])); - tmp[18] = _mm256_add_ps(_mm256_mul_ps(_2, _mm256_sub_ps(src[18], src[6])), _mm256_sub_ps(src[24], src[12])); - tmp[19] = _mm256_add_ps(_mm256_mul_ps(_2, _mm256_sub_ps(src[19], src[7])), _mm256_sub_ps(src[25], src[13])); - tmp[20] = _mm256_add_ps(_mm256_mul_ps(_2, _mm256_sub_ps(src[20], src[8])), _mm256_sub_ps(src[26], src[14])); - tmp[21] = _mm256_add_ps(_mm256_mul_ps(_2, _mm256_sub_ps(src[21], src[9])), _mm256_sub_ps(src[27], src[15])); - tmp[22] = _mm256_add_ps(_mm256_mul_ps(_2, _mm256_sub_ps(src[22], src[10])), _mm256_sub_ps(src[28], src[16])); - tmp[23] = _mm256_add_ps(_mm256_mul_ps(_2, _mm256_sub_ps(src[23], src[11])), _mm256_sub_ps(src[29], src[17])); - tmp[24] = _mm256_add_ps(_mm256_mul_ps(_2, _mm256_sub_ps(src[6], src[18])), _mm256_sub_ps(src[24], src[12])); - tmp[25] = _mm256_add_ps(_mm256_mul_ps(_2, _mm256_sub_ps(src[7], src[19])), _mm256_sub_ps(src[25], src[13])); - tmp[26] = _mm256_add_ps(_mm256_mul_ps(_2, _mm256_sub_ps(src[8], src[20])), _mm256_sub_ps(src[26], src[14])); - tmp[27] = _mm256_add_ps(_mm256_mul_ps(_2, _mm256_sub_ps(src[9], src[21])), _mm256_sub_ps(src[27], src[15])); - tmp[28] = _mm256_add_ps(_mm256_mul_ps(_2, _mm256_sub_ps(src[10], src[22])), _mm256_sub_ps(src[28], src[16])); - tmp[29] = _mm256_add_ps(_mm256_mul_ps(_2, _mm256_sub_ps(src[11], src[23])), _mm256_sub_ps(src[29], src[17])); - tmp[30] = _mm256_add_ps(_mm256_sub_ps(_mm256_mul_ps(_4, src[6]), _mm256_mul_ps(_5, src[18])), src[30]); - tmp[31] = _mm256_add_ps(_mm256_sub_ps(_mm256_mul_ps(_4, src[7]), _mm256_mul_ps(_5, src[19])), src[31]); - tmp[32] = _mm256_add_ps(_mm256_sub_ps(_mm256_mul_ps(_4, src[8]), _mm256_mul_ps(_5, src[20])), src[32]); - tmp[33] = _mm256_add_ps(_mm256_sub_ps(_mm256_mul_ps(_4, src[9]), _mm256_mul_ps(_5, src[21])), src[33]); - tmp[34] = _mm256_add_ps(_mm256_sub_ps(_mm256_mul_ps(_4, src[10]), _mm256_mul_ps(_5, src[22])), src[34]); - tmp[35] = _mm256_add_ps(_mm256_sub_ps(_mm256_mul_ps(_4, src[11]), _mm256_mul_ps(_5, src[23])), src[35]); - - _mm256_storeu_ps(dst + 0 * stride, _mm256_add_ps(_mm256_sub_ps(_mm256_mul_ps(_4, tmp[0]), _mm256_mul_ps(_5, tmp[2])), tmp[4])); - _mm256_storeu_ps(dst + 1 * stride, _mm256_sub_ps(_mm256_add_ps(tmp[3], tmp[4]), _mm256_mul_ps(_4, _mm256_add_ps(tmp[1], tmp[2])))); - _mm256_storeu_ps(dst + 2 * stride, _mm256_add_ps(_mm256_mul_ps(_4, 
_mm256_sub_ps(tmp[1], tmp[2])), _mm256_sub_ps(tmp[4], tmp[3]))); - _mm256_storeu_ps(dst + 3 * stride, _mm256_add_ps(_mm256_mul_ps(_2, _mm256_sub_ps(tmp[3], tmp[1])), _mm256_sub_ps(tmp[4], tmp[2]))); - _mm256_storeu_ps(dst + 4 * stride, _mm256_add_ps(_mm256_mul_ps(_2, _mm256_sub_ps(tmp[1], tmp[3])), _mm256_sub_ps(tmp[4], tmp[2]))); - _mm256_storeu_ps(dst + 5 * stride, _mm256_add_ps(_mm256_sub_ps(_mm256_mul_ps(_4, tmp[1]), _mm256_mul_ps(_5, tmp[3])), tmp[5])); - _mm256_storeu_ps(dst + 6 * stride, _mm256_add_ps(_mm256_sub_ps(_mm256_mul_ps(_4, tmp[6]), _mm256_mul_ps(_5, tmp[8])), tmp[10])); - _mm256_storeu_ps(dst + 7 * stride, _mm256_sub_ps(_mm256_add_ps(tmp[9], tmp[10]), _mm256_mul_ps(_4, _mm256_add_ps(tmp[7], tmp[8])))); - _mm256_storeu_ps(dst + 8 * stride, _mm256_add_ps(_mm256_mul_ps(_4, _mm256_sub_ps(tmp[7], tmp[8])), _mm256_sub_ps(tmp[10], tmp[9]))); - _mm256_storeu_ps(dst + 9 * stride, _mm256_add_ps(_mm256_mul_ps(_2, _mm256_sub_ps(tmp[9], tmp[7])), _mm256_sub_ps(tmp[10], tmp[8]))); - _mm256_storeu_ps(dst + 10 * stride, _mm256_add_ps(_mm256_mul_ps(_2, _mm256_sub_ps(tmp[7], tmp[9])), _mm256_sub_ps(tmp[10], tmp[8]))); - _mm256_storeu_ps(dst + 11 * stride, _mm256_add_ps(_mm256_sub_ps(_mm256_mul_ps(_4, tmp[7]), _mm256_mul_ps(_5, tmp[9])), tmp[11])); - _mm256_storeu_ps(dst + 12 * stride, _mm256_add_ps(_mm256_sub_ps(_mm256_mul_ps(_4, tmp[12]), _mm256_mul_ps(_5, tmp[14])), tmp[16])); - _mm256_storeu_ps(dst + 13 * stride, _mm256_sub_ps(_mm256_add_ps(tmp[15], tmp[16]), _mm256_mul_ps(_4, _mm256_add_ps(tmp[13], tmp[14])))); - _mm256_storeu_ps(dst + 14 * stride, _mm256_add_ps(_mm256_mul_ps(_4, _mm256_sub_ps(tmp[13], tmp[14])), _mm256_sub_ps(tmp[16], tmp[15]))); - _mm256_storeu_ps(dst + 15 * stride, _mm256_add_ps(_mm256_mul_ps(_2, _mm256_sub_ps(tmp[15], tmp[13])), _mm256_sub_ps(tmp[16], tmp[14]))); - _mm256_storeu_ps(dst + 16 * stride, _mm256_add_ps(_mm256_mul_ps(_2, _mm256_sub_ps(tmp[13], tmp[15])), _mm256_sub_ps(tmp[16], tmp[14]))); - _mm256_storeu_ps(dst + 17 * stride, _mm256_add_ps(_mm256_sub_ps(_mm256_mul_ps(_4, tmp[13]), _mm256_mul_ps(_5, tmp[15])), tmp[17])); - _mm256_storeu_ps(dst + 18 * stride, _mm256_add_ps(_mm256_sub_ps(_mm256_mul_ps(_4, tmp[18]), _mm256_mul_ps(_5, tmp[20])), tmp[22])); - _mm256_storeu_ps(dst + 19 * stride, _mm256_sub_ps(_mm256_add_ps(tmp[21], tmp[22]), _mm256_mul_ps(_4, _mm256_add_ps(tmp[19], tmp[20])))); - _mm256_storeu_ps(dst + 20 * stride, _mm256_add_ps(_mm256_mul_ps(_4, _mm256_sub_ps(tmp[19], tmp[20])), _mm256_sub_ps(tmp[22], tmp[21]))); - _mm256_storeu_ps(dst + 21 * stride, _mm256_add_ps(_mm256_mul_ps(_2, _mm256_sub_ps(tmp[21], tmp[19])), _mm256_sub_ps(tmp[22], tmp[20]))); - _mm256_storeu_ps(dst + 22 * stride, _mm256_add_ps(_mm256_mul_ps(_2, _mm256_sub_ps(tmp[19], tmp[21])), _mm256_sub_ps(tmp[22], tmp[20]))); - _mm256_storeu_ps(dst + 23 * stride, _mm256_add_ps(_mm256_sub_ps(_mm256_mul_ps(_4, tmp[19]), _mm256_mul_ps(_5, tmp[21])), tmp[23])); - _mm256_storeu_ps(dst + 24 * stride, _mm256_add_ps(_mm256_sub_ps(_mm256_mul_ps(_4, tmp[24]), _mm256_mul_ps(_5, tmp[26])), tmp[28])); - _mm256_storeu_ps(dst + 25 * stride, _mm256_sub_ps(_mm256_add_ps(tmp[27], tmp[28]), _mm256_mul_ps(_4, _mm256_add_ps(tmp[25], tmp[26])))); - _mm256_storeu_ps(dst + 26 * stride, _mm256_add_ps(_mm256_mul_ps(_4, _mm256_sub_ps(tmp[25], tmp[26])), _mm256_sub_ps(tmp[28], tmp[27]))); - _mm256_storeu_ps(dst + 27 * stride, _mm256_add_ps(_mm256_mul_ps(_2, _mm256_sub_ps(tmp[27], tmp[25])), _mm256_sub_ps(tmp[28], tmp[26]))); - _mm256_storeu_ps(dst + 28 * stride, _mm256_add_ps(_mm256_mul_ps(_2, 
_mm256_sub_ps(tmp[25], tmp[27])), _mm256_sub_ps(tmp[28], tmp[26]))); - _mm256_storeu_ps(dst + 29 * stride, _mm256_add_ps(_mm256_sub_ps(_mm256_mul_ps(_4, tmp[25]), _mm256_mul_ps(_5, tmp[27])), tmp[29])); - _mm256_storeu_ps(dst + 30 * stride, _mm256_add_ps(_mm256_sub_ps(_mm256_mul_ps(_4, tmp[30]), _mm256_mul_ps(_5, tmp[32])), tmp[34])); - _mm256_storeu_ps(dst + 31 * stride, _mm256_sub_ps(_mm256_add_ps(tmp[33], tmp[34]), _mm256_mul_ps(_4, _mm256_add_ps(tmp[31], tmp[32])))); - _mm256_storeu_ps(dst + 32 * stride, _mm256_add_ps(_mm256_mul_ps(_4, _mm256_sub_ps(tmp[31], tmp[32])), _mm256_sub_ps(tmp[34], tmp[33]))); - _mm256_storeu_ps(dst + 33 * stride, _mm256_add_ps(_mm256_mul_ps(_2, _mm256_sub_ps(tmp[33], tmp[31])), _mm256_sub_ps(tmp[34], tmp[32]))); - _mm256_storeu_ps(dst + 34 * stride, _mm256_add_ps(_mm256_mul_ps(_2, _mm256_sub_ps(tmp[31], tmp[33])), _mm256_sub_ps(tmp[34], tmp[32]))); - _mm256_storeu_ps(dst + 35 * stride, _mm256_add_ps(_mm256_sub_ps(_mm256_mul_ps(_4, tmp[31]), _mm256_mul_ps(_5, tmp[33])), tmp[35])); - } - - SIMD_INLINE void WinogradKernel3x3Block4x4SetInput8t(const float * src, size_t srcS, size_t srcC, __m256 dst[36]) - { - dst[0] = _mm256_loadu_ps(src + 0 * srcS + 0 * srcC); - dst[1] = _mm256_loadu_ps(src + 0 * srcS + 1 * srcC); - dst[2] = _mm256_loadu_ps(src + 0 * srcS + 2 * srcC); - dst[3] = _mm256_loadu_ps(src + 0 * srcS + 3 * srcC); - dst[4] = _mm256_loadu_ps(src + 0 * srcS + 4 * srcC); - dst[5] = _mm256_loadu_ps(src + 0 * srcS + 5 * srcC); - dst[6] = _mm256_loadu_ps(src + 1 * srcS + 0 * srcC); - dst[7] = _mm256_loadu_ps(src + 1 * srcS + 1 * srcC); - dst[8] = _mm256_loadu_ps(src + 1 * srcS + 2 * srcC); - dst[9] = _mm256_loadu_ps(src + 1 * srcS + 3 * srcC); - dst[10] = _mm256_loadu_ps(src + 1 * srcS + 4 * srcC); - dst[11] = _mm256_loadu_ps(src + 1 * srcS + 5 * srcC); - dst[12] = _mm256_loadu_ps(src + 2 * srcS + 0 * srcC); - dst[13] = _mm256_loadu_ps(src + 2 * srcS + 1 * srcC); - dst[14] = _mm256_loadu_ps(src + 2 * srcS + 2 * srcC); - dst[15] = _mm256_loadu_ps(src + 2 * srcS + 3 * srcC); - dst[16] = _mm256_loadu_ps(src + 2 * srcS + 4 * srcC); - dst[17] = _mm256_loadu_ps(src + 2 * srcS + 5 * srcC); - dst[18] = _mm256_loadu_ps(src + 3 * srcS + 0 * srcC); - dst[19] = _mm256_loadu_ps(src + 3 * srcS + 1 * srcC); - dst[20] = _mm256_loadu_ps(src + 3 * srcS + 2 * srcC); - dst[21] = _mm256_loadu_ps(src + 3 * srcS + 3 * srcC); - dst[22] = _mm256_loadu_ps(src + 3 * srcS + 4 * srcC); - dst[23] = _mm256_loadu_ps(src + 3 * srcS + 5 * srcC); - dst[24] = _mm256_loadu_ps(src + 4 * srcS + 0 * srcC); - dst[25] = _mm256_loadu_ps(src + 4 * srcS + 1 * srcC); - dst[26] = _mm256_loadu_ps(src + 4 * srcS + 2 * srcC); - dst[27] = _mm256_loadu_ps(src + 4 * srcS + 3 * srcC); - dst[28] = _mm256_loadu_ps(src + 4 * srcS + 4 * srcC); - dst[29] = _mm256_loadu_ps(src + 4 * srcS + 5 * srcC); - dst[30] = _mm256_loadu_ps(src + 5 * srcS + 0 * srcC); - dst[31] = _mm256_loadu_ps(src + 5 * srcS + 1 * srcC); - dst[32] = _mm256_loadu_ps(src + 5 * srcS + 2 * srcC); - dst[33] = _mm256_loadu_ps(src + 5 * srcS + 3 * srcC); - dst[34] = _mm256_loadu_ps(src + 5 * srcS + 4 * srcC); - dst[35] = _mm256_loadu_ps(src + 5 * srcS + 5 * srcC); - } - - SIMD_INLINE void WinogradKernel3x3Block4x4SetInput8t(const float * src, size_t srcW, size_t srcC, float * dst, size_t dstStride) - { - size_t srcS = srcW * srcC; - size_t srcCF = AlignLo(srcC, F); - for (size_t c = 0; c < srcCF; c += F) - { - __m256 tmp[36]; - WinogradKernel3x3Block4x4SetInput8t(src + c, srcS, srcC, tmp); - WinogradKernel3x3Block4x4SetInput8Store(tmp, dst + c, 
dstStride); - } - if (srcCF < srcC) - { - __m256 tmp[36]; - WinogradKernel3x3Block4x4SetInput8t(src + srcC - F, srcS, srcC, tmp); - WinogradKernel3x3Block4x4SetInput8Store(tmp, dst + srcC - F, dstStride); - } - } - - SIMD_INLINE void WinogradKernel3x3Block4x4SetInput8t(const float * src, size_t srcS, size_t srcC, size_t rowB, size_t rowE, size_t colB, size_t colE, __m256 dst[36]) - { - for (size_t row = 0; row < rowB; ++row) - { - dst[0] = _mm256_setzero_ps(); - dst[1] = _mm256_setzero_ps(); - dst[2] = _mm256_setzero_ps(); - dst[3] = _mm256_setzero_ps(); - dst[4] = _mm256_setzero_ps(); - dst[5] = _mm256_setzero_ps(); - dst += 6; - } - for (size_t row = rowB; row < rowE; ++row) - { - for (size_t col = 0; col < colB; ++col) - dst[col] = _mm256_setzero_ps(); - for (size_t col = colB; col < colE; ++col) - dst[col] = _mm256_loadu_ps(src + row * srcS + col * srcC); - for (size_t col = colE; col < 6; ++col) - dst[col] = _mm256_setzero_ps(); - dst += 6; - } - for (size_t row = rowE; row < 6; ++row) - { - dst[0] = _mm256_setzero_ps(); - dst[1] = _mm256_setzero_ps(); - dst[2] = _mm256_setzero_ps(); - dst[3] = _mm256_setzero_ps(); - dst[4] = _mm256_setzero_ps(); - dst[5] = _mm256_setzero_ps(); - dst += 6; - } - } - - SIMD_INLINE void WinogradKernel3x3Block4x4SetInput8t(const float * src, size_t srcW, size_t srcC, size_t rowB, size_t rowE, size_t colB, size_t colE, float * dst, size_t dstStride) - { - size_t srcS = srcW * srcC; - size_t srcCF = AlignLo(srcC, F); - for (size_t c = 0; c < srcCF; c += F) - { - __m256 tmp[36]; - WinogradKernel3x3Block4x4SetInput8t(src + c, srcS, srcC, rowB, rowE, colB, colE, tmp); - WinogradKernel3x3Block4x4SetInput8Store(tmp, dst + c, dstStride); - } - if (srcCF < srcC) - { - __m256 tmp[36]; - WinogradKernel3x3Block4x4SetInput8t(src + srcC - F, srcS, srcC, rowB, rowE, colB, colE, tmp); - WinogradKernel3x3Block4x4SetInput8Store(tmp, dst + srcC - F, dstStride); - } - } - - void WinogradKernel3x3Block4x4SetInput(const float* src, size_t srcChannels, size_t srcHeight, size_t srcWidth, - size_t padY, size_t padX, size_t padH, size_t padW, float* dst, size_t dstStride, SimdBool trans) - { - if (trans ? (srcChannels < 8) : (srcHeight < 6 || srcWidth < 14)) - { - Sse::WinogradKernel3x3Block4x4SetInput(src, srcChannels, srcHeight, srcWidth, padY, padX, padH, padW, dst, dstStride, trans); - return; - } - if (trans) - { - assert(padY + padH <= 2 && padX + padW <= 2); - size_t dstH = srcHeight - 2 + padY + padH; - size_t dstW = srcWidth - 2 + padX + padW; - size_t dstH4 = dstH / 4 * 4; - size_t dstW4 = dstW / 4 * 4; - size_t noseW = Simd::Min(6, srcWidth + padX); - size_t noseH = Simd::Min(6, srcHeight + padY); - size_t startY = padY ? 4 : 0; - size_t startX = padX ? 4 : 0; - if (padH && dstH == dstH4) - dstH4 -= 4; - if (padY) - src -= srcWidth * srcChannels; - if (padW && dstW == dstW4) - dstW4 -= 4; - if (padX) - src -= srcChannels; - size_t tailW = dstW - dstW4 + (padW ? 1 : 2); - size_t tailH = dstH - dstH4 + (padH ? 
1 : 2); - size_t row = 0, col = 0; - if (padY) - { - if (padX) - WinogradKernel3x3Block4x4SetInput8t(src, srcWidth, srcChannels, 1, noseH, 1, noseW, dst, dstStride), dst += srcChannels; - for (col = startX; col < dstW4; col += 4) - WinogradKernel3x3Block4x4SetInput8t(src + col * srcChannels, srcWidth, srcChannels, 1, noseH, 0, 6, dst, dstStride), dst += srcChannels; - if (col < dstW) - WinogradKernel3x3Block4x4SetInput8t(src + col * srcChannels, srcWidth, srcChannels, 1, noseH, 0, tailW, dst, dstStride), dst += srcChannels; - } - for (row = startY; row < dstH4; row += 4) - { - if (padX) - WinogradKernel3x3Block4x4SetInput8t(src + row * srcWidth * srcChannels, srcWidth, srcChannels, 0, 6, 1, noseW, dst, dstStride), dst += srcChannels; - for (col = startX; col < dstW4; col += 4) - WinogradKernel3x3Block4x4SetInput8t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, dst, dstStride), dst += srcChannels; - if (col < dstW) - WinogradKernel3x3Block4x4SetInput8t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, 0, 6, 0, tailW, dst, dstStride), dst += srcChannels; - } - if (row < dstH) - { - if (padX) - WinogradKernel3x3Block4x4SetInput8t(src + row * srcWidth* srcChannels, srcWidth, srcChannels, 0, tailH, 1, noseW, dst, dstStride), dst += srcChannels; - for (col = startX; col < dstW4; col += 4) - WinogradKernel3x3Block4x4SetInput8t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, 0, tailH, 0, 6, dst, dstStride), dst += srcChannels; - if (col < dstW) - WinogradKernel3x3Block4x4SetInput8t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, 0, tailH, 0, tailW, dst, dstStride), dst += srcChannels; - } - } - else - { - Base::WinogradKernel3x3Block4x4SetInput(src, srcChannels, srcHeight, srcWidth, padY, padX, padH, padW, dst, dstStride, trans); - } - } - - //----------------------------------------------------------------------- - - SIMD_INLINE void WinogradKernel3x3Block4x4SetOutputLoad36(const float * src, size_t stride, __m256 dst[16]) - { - __m256 s[36]; - s[0] = _mm256_loadu_ps(src + 0 * stride); - s[1] = _mm256_loadu_ps(src + 1 * stride); - s[2] = _mm256_loadu_ps(src + 2 * stride); - s[3] = _mm256_loadu_ps(src + 3 * stride); - s[4] = _mm256_loadu_ps(src + 4 * stride); - s[5] = _mm256_loadu_ps(src + 5 * stride); - s[6] = _mm256_loadu_ps(src + 6 * stride); - s[7] = _mm256_loadu_ps(src + 7 * stride); - s[8] = _mm256_loadu_ps(src + 8 * stride); - s[9] = _mm256_loadu_ps(src + 9 * stride); - s[10] = _mm256_loadu_ps(src + 10 * stride); - s[11] = _mm256_loadu_ps(src + 11 * stride); - s[12] = _mm256_loadu_ps(src + 12 * stride); - s[13] = _mm256_loadu_ps(src + 13 * stride); - s[14] = _mm256_loadu_ps(src + 14 * stride); - s[15] = _mm256_loadu_ps(src + 15 * stride); - s[16] = _mm256_loadu_ps(src + 16 * stride); - s[17] = _mm256_loadu_ps(src + 17 * stride); - s[18] = _mm256_loadu_ps(src + 18 * stride); - s[19] = _mm256_loadu_ps(src + 19 * stride); - s[20] = _mm256_loadu_ps(src + 20 * stride); - s[21] = _mm256_loadu_ps(src + 21 * stride); - s[22] = _mm256_loadu_ps(src + 22 * stride); - s[23] = _mm256_loadu_ps(src + 23 * stride); - s[24] = _mm256_loadu_ps(src + 24 * stride); - s[25] = _mm256_loadu_ps(src + 25 * stride); - s[26] = _mm256_loadu_ps(src + 26 * stride); - s[27] = _mm256_loadu_ps(src + 27 * stride); - s[28] = _mm256_loadu_ps(src + 28 * stride); - s[29] = _mm256_loadu_ps(src + 29 * stride); - s[30] = _mm256_loadu_ps(src + 30 * stride); - s[31] = _mm256_loadu_ps(src + 31 * stride); - s[32] = _mm256_loadu_ps(src + 32 * stride); - s[33] = 
_mm256_loadu_ps(src + 33 * stride); - s[34] = _mm256_loadu_ps(src + 34 * stride); - s[35] = _mm256_loadu_ps(src + 35 * stride); - - __m256 _2 = _mm256_set1_ps(2.0f); - __m256 _4 = _mm256_set1_ps(4.0f); - __m256 _8 = _mm256_set1_ps(8.0f); - __m256 t[24]; - t[0] = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(s[0], s[6]), _mm256_add_ps(s[12], s[18])), s[24]); - t[1] = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(s[1], s[7]), _mm256_add_ps(s[13], s[19])), s[25]); - t[2] = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(s[2], s[8]), _mm256_add_ps(s[14], s[20])), s[26]); - t[3] = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(s[3], s[9]), _mm256_add_ps(s[15], s[21])), s[27]); - t[4] = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(s[4], s[10]), _mm256_add_ps(s[16], s[22])), s[28]); - t[5] = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(s[5], s[11]), _mm256_add_ps(s[17], s[23])), s[29]); - t[6] = _mm256_add_ps(_mm256_sub_ps(s[6], s[12]), _mm256_mul_ps(_2, _mm256_sub_ps(s[18], s[24]))); - t[7] = _mm256_add_ps(_mm256_sub_ps(s[7], s[13]), _mm256_mul_ps(_2, _mm256_sub_ps(s[19], s[25]))); - t[8] = _mm256_add_ps(_mm256_sub_ps(s[8], s[14]), _mm256_mul_ps(_2, _mm256_sub_ps(s[20], s[26]))); - t[9] = _mm256_add_ps(_mm256_sub_ps(s[9], s[15]), _mm256_mul_ps(_2, _mm256_sub_ps(s[21], s[27]))); - t[10] = _mm256_add_ps(_mm256_sub_ps(s[10], s[16]), _mm256_mul_ps(_2, _mm256_sub_ps(s[22], s[28]))); - t[11] = _mm256_add_ps(_mm256_sub_ps(s[11], s[17]), _mm256_mul_ps(_2, _mm256_sub_ps(s[23], s[29]))); - t[12] = _mm256_add_ps(_mm256_add_ps(s[6], s[12]), _mm256_mul_ps(_4, _mm256_add_ps(s[18], s[24]))); - t[13] = _mm256_add_ps(_mm256_add_ps(s[7], s[13]), _mm256_mul_ps(_4, _mm256_add_ps(s[19], s[25]))); - t[14] = _mm256_add_ps(_mm256_add_ps(s[8], s[14]), _mm256_mul_ps(_4, _mm256_add_ps(s[20], s[26]))); - t[15] = _mm256_add_ps(_mm256_add_ps(s[9], s[15]), _mm256_mul_ps(_4, _mm256_add_ps(s[21], s[27]))); - t[16] = _mm256_add_ps(_mm256_add_ps(s[10], s[16]), _mm256_mul_ps(_4, _mm256_add_ps(s[22], s[28]))); - t[17] = _mm256_add_ps(_mm256_add_ps(s[11], s[17]), _mm256_mul_ps(_4, _mm256_add_ps(s[23], s[29]))); - t[18] = _mm256_add_ps(_mm256_add_ps(_mm256_sub_ps(s[6], s[12]), _mm256_mul_ps(_8, _mm256_sub_ps(s[18], s[24]))), s[30]); - t[19] = _mm256_add_ps(_mm256_add_ps(_mm256_sub_ps(s[7], s[13]), _mm256_mul_ps(_8, _mm256_sub_ps(s[19], s[25]))), s[31]); - t[20] = _mm256_add_ps(_mm256_add_ps(_mm256_sub_ps(s[8], s[14]), _mm256_mul_ps(_8, _mm256_sub_ps(s[20], s[26]))), s[32]); - t[21] = _mm256_add_ps(_mm256_add_ps(_mm256_sub_ps(s[9], s[15]), _mm256_mul_ps(_8, _mm256_sub_ps(s[21], s[27]))), s[33]); - t[22] = _mm256_add_ps(_mm256_add_ps(_mm256_sub_ps(s[10], s[16]), _mm256_mul_ps(_8, _mm256_sub_ps(s[22], s[28]))), s[34]); - t[23] = _mm256_add_ps(_mm256_add_ps(_mm256_sub_ps(s[11], s[17]), _mm256_mul_ps(_8, _mm256_sub_ps(s[23], s[29]))), s[35]); - - dst[0] = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(t[0], t[1]), _mm256_add_ps(t[2], t[3])), t[4]); - dst[1] = _mm256_add_ps(_mm256_sub_ps(t[1], t[2]), _mm256_mul_ps(_2, _mm256_sub_ps(t[3], t[4]))); - dst[2] = _mm256_add_ps(_mm256_add_ps(t[1], t[2]), _mm256_mul_ps(_4, _mm256_add_ps(t[3], t[4]))); - dst[3] = _mm256_add_ps(_mm256_add_ps(_mm256_sub_ps(t[1], t[2]), _mm256_mul_ps(_8, _mm256_sub_ps(t[3], t[4]))), t[5]); - dst[4] = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(t[6], t[7]), _mm256_add_ps(t[8], t[9])), t[10]); - dst[5] = _mm256_add_ps(_mm256_sub_ps(t[7], t[8]), _mm256_mul_ps(_2, _mm256_sub_ps(t[9], t[10]))); - dst[6] = _mm256_add_ps(_mm256_add_ps(t[7], t[8]), _mm256_mul_ps(_4, _mm256_add_ps(t[9], t[10]))); - dst[7] = 
_mm256_add_ps(_mm256_add_ps(_mm256_sub_ps(t[7], t[8]), _mm256_mul_ps(_8, _mm256_sub_ps(t[9], t[10]))), t[11]); - dst[8] = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(t[12], t[13]), _mm256_add_ps(t[14], t[15])), t[16]); - dst[9] = _mm256_add_ps(_mm256_sub_ps(t[13], t[14]), _mm256_mul_ps(_2, _mm256_sub_ps(t[15], t[16]))); - dst[10] = _mm256_add_ps(_mm256_add_ps(t[13], t[14]), _mm256_mul_ps(_4, _mm256_add_ps(t[15], t[16]))); - dst[11] = _mm256_add_ps(_mm256_add_ps(_mm256_sub_ps(t[13], t[14]), _mm256_mul_ps(_8, _mm256_sub_ps(t[15], t[16]))), t[17]); - dst[12] = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(t[18], t[19]), _mm256_add_ps(t[20], t[21])), t[22]); - dst[13] = _mm256_add_ps(_mm256_sub_ps(t[19], t[20]), _mm256_mul_ps(_2, _mm256_sub_ps(t[21], t[22]))); - dst[14] = _mm256_add_ps(_mm256_add_ps(t[19], t[20]), _mm256_mul_ps(_4, _mm256_add_ps(t[21], t[22]))); - dst[15] = _mm256_add_ps(_mm256_add_ps(_mm256_sub_ps(t[19], t[20]), _mm256_mul_ps(_8, _mm256_sub_ps(t[21], t[22]))), t[23]); - } - - SIMD_INLINE void WinogradKernel3x3Block4x4SetOutputStore16(const __m256 src[16], float * dst, size_t dstS, size_t dstC) - { - _mm256_storeu_ps(dst + 0 * dstS + 0 * dstC, src[0]); - _mm256_storeu_ps(dst + 0 * dstS + 1 * dstC, src[1]); - _mm256_storeu_ps(dst + 0 * dstS + 2 * dstC, src[2]); - _mm256_storeu_ps(dst + 0 * dstS + 3 * dstC, src[3]); - _mm256_storeu_ps(dst + 1 * dstS + 0 * dstC, src[4]); - _mm256_storeu_ps(dst + 1 * dstS + 1 * dstC, src[5]); - _mm256_storeu_ps(dst + 1 * dstS + 2 * dstC, src[6]); - _mm256_storeu_ps(dst + 1 * dstS + 3 * dstC, src[7]); - _mm256_storeu_ps(dst + 2 * dstS + 0 * dstC, src[8]); - _mm256_storeu_ps(dst + 2 * dstS + 1 * dstC, src[9]); - _mm256_storeu_ps(dst + 2 * dstS + 2 * dstC, src[10]); - _mm256_storeu_ps(dst + 2 * dstS + 3 * dstC, src[11]); - _mm256_storeu_ps(dst + 3 * dstS + 0 * dstC, src[12]); - _mm256_storeu_ps(dst + 3 * dstS + 1 * dstC, src[13]); - _mm256_storeu_ps(dst + 3 * dstS + 2 * dstC, src[14]); - _mm256_storeu_ps(dst + 3 * dstS + 3 * dstC, src[15]); - } - - SIMD_INLINE void WinogradKernel3x3Block4x4SetOutput8t(const float * src, size_t srcStride, float * dst, size_t dstW, size_t dstC) - { - size_t dstS = dstW * dstC, dstCF = AlignLo(dstC, F); - for (size_t d = 0; d < dstCF; d += F) - { - __m256 tmp[16]; - WinogradKernel3x3Block4x4SetOutputLoad36(src + d, srcStride, tmp); - WinogradKernel3x3Block4x4SetOutputStore16(tmp, dst + d, dstS, dstC); - } - if (dstCF < dstC) - { - __m256 tmp[16]; - WinogradKernel3x3Block4x4SetOutputLoad36(src + dstC - F, srcStride, tmp); - WinogradKernel3x3Block4x4SetOutputStore16(tmp, dst + dstC - F, dstS, dstC); - } - } - - SIMD_INLINE void WinogradKernel3x3Block4x4SetOutputStore16(const __m256 src[16], float * dst, size_t dstS, size_t dstC, size_t rowE, size_t colE) - { - for (size_t row = 0; row < rowE; ++row) - for (size_t col = 0; col < colE; ++col) - _mm256_storeu_ps(dst + row * dstS + col * dstC, src[row * 4 + col]); - } - - SIMD_INLINE void WinogradKernel3x3Block4x4SetOutput8t(const float * src, size_t srcStride, float * dst, size_t dstW, size_t dstC, size_t rowE, size_t colE) - { - size_t dstS = dstW * dstC, dstCF = AlignLo(dstC, F); - for (size_t d = 0; d < dstCF; d += F) - { - __m256 tmp[16]; - WinogradKernel3x3Block4x4SetOutputLoad36(src + d, srcStride, tmp); - WinogradKernel3x3Block4x4SetOutputStore16(tmp, dst + d, dstS, dstC, rowE, colE); - } - if (dstCF < dstC) - { - __m256 tmp[16]; - WinogradKernel3x3Block4x4SetOutputLoad36(src + dstC - F, srcStride, tmp); - WinogradKernel3x3Block4x4SetOutputStore16(tmp, dst + dstC - F, dstS, 
dstC, rowE, colE); - } - } - - void WinogradKernel3x3Block4x4SetOutput(const float * src, size_t srcStride, float * dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans) - { - if (trans ? (dstChannels < 8) : (dstHeight < 4 || dstWidth < 16)) - { - Sse::WinogradKernel3x3Block4x4SetOutput(src, srcStride, dst, dstChannels, dstHeight, dstWidth, trans); - return; - } - size_t tileH = (dstHeight + 3) / 4; - size_t tileW = (dstWidth + 3) / 4; - size_t dstH4 = AlignLo(dstHeight, 4); - size_t dstW4 = AlignLo(dstWidth, 4); - if (trans) - { - size_t row, col; - for (row = 0; row < dstH4; row += 4) - { - for (col = 0; col < dstW4; col += 4) - WinogradKernel3x3Block4x4SetOutput8t(src, srcStride, dst + (row * dstWidth + col)*dstChannels, dstWidth, dstChannels), src += dstChannels; - if (col < dstWidth) - WinogradKernel3x3Block4x4SetOutput8t(src, srcStride, dst + (row * dstWidth + col)*dstChannels, dstWidth, dstChannels, 4, dstWidth - col), src += dstChannels; - } - if (row < dstHeight) - { - for (col = 0; col < dstW4; col += 4) - WinogradKernel3x3Block4x4SetOutput8t(src, srcStride, dst + (row * dstWidth + col)*dstChannels, dstWidth, dstChannels, dstHeight - row, 4), src += dstChannels; - if (col < dstWidth) - WinogradKernel3x3Block4x4SetOutput8t(src, srcStride, dst + (row * dstWidth + col)*dstChannels, dstWidth, dstChannels, dstHeight - row, dstWidth - col), src += dstChannels; - } - } - else - { - Base::WinogradKernel3x3Block4x4SetOutput(src, srcStride, dst, dstChannels, dstHeight, dstWidth, trans); - } - } - } -#endif// SIMD_AVX_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx2.h b/src/3rd/Simd/Simd/SimdAvx2.h deleted file mode 100644 index 63d915e9..00000000 --- a/src/3rd/Simd/Simd/SimdAvx2.h +++ /dev/null @@ -1,526 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar, -* 2019-2019 Facundo Galan. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#ifndef __SimdAvx2_h__ -#define __SimdAvx2_h__ - -#include "Simd/SimdDefs.h" - -namespace Simd -{ -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - void AbsDifference(const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride, uint8_t *c, size_t cStride, - size_t width, size_t height); - - void AbsDifferenceSum(const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride, - size_t width, size_t height, uint64_t * sum); - - void AbsDifferenceSumMasked(const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride, - const uint8_t *mask, size_t maskStride, uint8_t index, size_t width, size_t height, uint64_t * sum); - - void AbsDifferenceSums3x3(const uint8_t *current, size_t currentStride, const uint8_t * background, size_t backgroundStride, - size_t width, size_t height, uint64_t * sums); - - void AbsDifferenceSums3x3Masked(const uint8_t *current, size_t currentStride, const uint8_t *background, size_t backgroundStride, - const uint8_t *mask, size_t maskStride, uint8_t index, size_t width, size_t height, uint64_t * sums); - - void AbsGradientSaturatedSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride); - - void AddFeatureDifference(const uint8_t * value, size_t valueStride, size_t width, size_t height, - const uint8_t * lo, size_t loStride, const uint8_t * hi, size_t hiStride, - uint16_t weight, uint8_t * difference, size_t differenceStride); - - void AlphaBlending(const uint8_t *src, size_t srcStride, size_t width, size_t height, size_t channelCount, - const uint8_t *alpha, size_t alphaStride, uint8_t *dst, size_t dstStride); - - void AlphaFilling(uint8_t * dst, size_t dstStride, size_t width, size_t height, const uint8_t * channel, size_t channelCount, const uint8_t * alpha, size_t alphaStride); - - void BackgroundGrowRangeSlow(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * lo, size_t loStride, uint8_t * hi, size_t hiStride); - - void BackgroundGrowRangeFast(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * lo, size_t loStride, uint8_t * hi, size_t hiStride); - - void BackgroundIncrementCount(const uint8_t * value, size_t valueStride, size_t width, size_t height, - const uint8_t * loValue, size_t loValueStride, const uint8_t * hiValue, size_t hiValueStride, - uint8_t * loCount, size_t loCountStride, uint8_t * hiCount, size_t hiCountStride); - - void BackgroundAdjustRange(uint8_t * loCount, size_t loCountStride, size_t width, size_t height, - uint8_t * loValue, size_t loValueStride, uint8_t * hiCount, size_t hiCountStride, - uint8_t * hiValue, size_t hiValueStride, uint8_t threshold); - - void BackgroundAdjustRangeMasked(uint8_t * loCount, size_t loCountStride, size_t width, size_t height, - uint8_t * loValue, size_t loValueStride, uint8_t * hiCount, size_t hiCountStride, - uint8_t * hiValue, size_t hiValueStride, uint8_t threshold, const uint8_t * mask, size_t maskStride); - - void BackgroundShiftRange(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * lo, size_t loStride, uint8_t * hi, size_t hiStride); - - void BackgroundShiftRangeMasked(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * lo, size_t loStride, uint8_t * hi, size_t hiStride, const uint8_t * mask, size_t maskStride); - - void BackgroundInitMask(const uint8_t * src, size_t srcStride, size_t width, size_t height, - uint8_t index, uint8_t value, uint8_t * dst, size_t dstStride); - - void BayerToBgr(const uint8_t * 
bayer, size_t width, size_t height, size_t bayerStride, SimdPixelFormatType bayerFormat, uint8_t * bgr, size_t bgrStride); - - void BayerToBgra(const uint8_t * bayer, size_t width, size_t height, size_t bayerStride, SimdPixelFormatType bayerFormat, uint8_t * bgra, size_t bgraStride, uint8_t alpha); - - void BgraToBgr(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* bgr, size_t bgrStride); - - void BgraToGray(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * gray, size_t grayStride); - - void BgraToRgb(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgb, size_t rgbStride); - - void BgraToYuv420p(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * y, size_t yStride, uint8_t * u, size_t uStride, uint8_t * v, size_t vStride); - - void BgraToYuv422p(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * y, size_t yStride, uint8_t * u, size_t uStride, uint8_t * v, size_t vStride); - - void BgraToYuv444p(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * y, size_t yStride, uint8_t * u, size_t uStride, uint8_t * v, size_t vStride); - - void BgraToYuva420p(const uint8_t * bgra, size_t bgraStride, size_t width, size_t height, - uint8_t * y, size_t yStride, uint8_t * u, size_t uStride, uint8_t * v, size_t vStride, uint8_t * a, size_t aStride); - - void Bgr48pToBgra32(const uint8_t * blue, size_t blueStride, size_t width, size_t height, - const uint8_t * green, size_t greenStride, const uint8_t * red, size_t redStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha); - - void BgrToBgra(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha); - - void BgrToGray(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * gray, size_t grayStride); - - void BgrToRgb(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * rgb, size_t rgbStride); - - void BgrToYuv420p(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * y, size_t yStride, uint8_t * u, size_t uStride, uint8_t * v, size_t vStride); - - void BgrToYuv422p(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * y, size_t yStride, uint8_t * u, size_t uStride, uint8_t * v, size_t vStride); - - void BgrToYuv444p(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * y, size_t yStride, uint8_t * u, size_t uStride, uint8_t * v, size_t vStride); - - void Binarization(const uint8_t * src, size_t srcStride, size_t width, size_t height, - uint8_t value, uint8_t positive, uint8_t negative, uint8_t * dst, size_t dstStride, SimdCompareType compareType); - - void AveragingBinarization(const uint8_t * src, size_t srcStride, size_t width, size_t height, - uint8_t value, size_t neighborhood, uint8_t threshold, uint8_t positive, uint8_t negative, - uint8_t * dst, size_t dstStride, SimdCompareType compareType); - - void ConditionalCount8u(const uint8_t * src, size_t stride, size_t width, size_t height, - uint8_t value, SimdCompareType compareType, uint32_t * count); - - void ConditionalCount16i(const uint8_t * src, size_t stride, size_t width, size_t height, - int16_t value, SimdCompareType compareType, uint32_t * count); - - void ConditionalSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * mask, size_t maskStride, uint8_t value, SimdCompareType compareType, uint64_t * sum); - - void 
ConditionalSquareSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * mask, size_t maskStride, uint8_t value, SimdCompareType compareType, uint64_t * sum); - - void ConditionalSquareGradientSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * mask, size_t maskStride, uint8_t value, SimdCompareType compareType, uint64_t * sum); - - void ConditionalFill(const uint8_t * src, size_t srcStride, size_t width, size_t height, - uint8_t threshold, SimdCompareType compareType, uint8_t value, uint8_t * dst, size_t dstStride); - - void DeinterleaveUv(const uint8_t * uv, size_t uvStride, size_t width, size_t height, uint8_t * u, size_t uStride, uint8_t * v, size_t vStride); - - void DeinterleaveBgr(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride); - - void DeinterleaveBgra(const uint8_t * bgra, size_t bgraStride, size_t width, size_t height, uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride, uint8_t * a, size_t aStride); - - void DetectionHaarDetect32fp(const void * hid, const uint8_t * mask, size_t maskStride, - ptrdiff_t left, ptrdiff_t top, ptrdiff_t right, ptrdiff_t bottom, uint8_t * dst, size_t dstStride); - - void DetectionHaarDetect32fi(const void * hid, const uint8_t * mask, size_t maskStride, - ptrdiff_t left, ptrdiff_t top, ptrdiff_t right, ptrdiff_t bottom, uint8_t * dst, size_t dstStride); - - void DetectionLbpDetect32fp(const void * hid, const uint8_t * mask, size_t maskStride, - ptrdiff_t left, ptrdiff_t top, ptrdiff_t right, ptrdiff_t bottom, uint8_t * dst, size_t dstStride); - - void DetectionLbpDetect32fi(const void * hid, const uint8_t * mask, size_t maskStride, - ptrdiff_t left, ptrdiff_t top, ptrdiff_t right, ptrdiff_t bottom, uint8_t * dst, size_t dstStride); - - void DetectionLbpDetect16ip(const void * hid, const uint8_t * mask, size_t maskStride, - ptrdiff_t left, ptrdiff_t top, ptrdiff_t right, ptrdiff_t bottom, uint8_t * dst, size_t dstStride); - - void DetectionLbpDetect16ii(const void * hid, const uint8_t * mask, size_t maskStride, - ptrdiff_t left, ptrdiff_t top, ptrdiff_t right, ptrdiff_t bottom, uint8_t * dst, size_t dstStride); - - void EdgeBackgroundGrowRangeSlow(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * background, size_t backgroundStride); - - void EdgeBackgroundGrowRangeFast(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * background, size_t backgroundStride); - - void EdgeBackgroundIncrementCount(const uint8_t * value, size_t valueStride, size_t width, size_t height, - const uint8_t * backgroundValue, size_t backgroundValueStride, uint8_t * backgroundCount, size_t backgroundCountStride); - - void EdgeBackgroundAdjustRange(uint8_t * backgroundCount, size_t backgroundCountStride, size_t width, size_t height, - uint8_t * backgroundValue, size_t backgroundValueStride, uint8_t threshold); - - void EdgeBackgroundAdjustRangeMasked(uint8_t * backgroundCount, size_t backgroundCountStride, size_t width, size_t height, - uint8_t * backgroundValue, size_t backgroundValueStride, uint8_t threshold, const uint8_t * mask, size_t maskStride); - - void EdgeBackgroundShiftRangeMasked(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * background, size_t backgroundStride, const uint8_t * mask, size_t maskStride); - - void FillBgr(uint8_t * dst, size_t stride, size_t 
width, size_t height, uint8_t blue, uint8_t green, uint8_t red); - - void FillBgra(uint8_t * dst, size_t stride, size_t width, size_t height, uint8_t blue, uint8_t green, uint8_t red, uint8_t alpha); - - void FillPixel(uint8_t * dst, size_t stride, size_t width, size_t height, const uint8_t * pixel, size_t pixelSize); - - void Float32ToFloat16(const float * src, size_t size, uint16_t * dst); - - void Float16ToFloat32(const uint16_t * src, size_t size, float * dst); - - void SquaredDifferenceSum16f(const uint16_t * a, const uint16_t * b, size_t size, float * sum); - - void CosineDistance16f(const uint16_t * a, const uint16_t * b, size_t size, float * distance); - - void CosineDistancesMxNa16f(size_t M, size_t N, size_t K, const uint16_t * const * A, const uint16_t * const * B, float * distances); - - void Float32ToUint8(const float * src, size_t size, const float * lower, const float * upper, uint8_t * dst); - - void Uint8ToFloat32(const uint8_t * src, size_t size, const float * lower, const float * upper, float * dst); - - void CosineDistance32f(const float * a, const float * b, size_t size, float * distance); - - void GaussianBlur3x3(const uint8_t * src, size_t srcStride, size_t width, size_t height, - size_t channelCount, uint8_t * dst, size_t dstStride); - - void Gemm32fNN(size_t M, size_t N, size_t K, const float * alpha, const float * A, size_t lda, const float * B, size_t ldb, const float * beta, float * C, size_t ldc); - - void Gemm32fNT(size_t M, size_t N, size_t K, const float * alpha, const float * A, size_t lda, const float * B, size_t ldb, const float * beta, float * C, size_t ldc); - - void GrayToBgr(const uint8_t *gray, size_t width, size_t height, size_t grayStride, uint8_t *bgr, size_t bgrStride); - - void GrayToBgra(const uint8_t *gray, size_t width, size_t height, size_t grayStride, uint8_t *bgra, size_t bgraStride, uint8_t alpha); - - void HistogramMasked(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * mask, size_t maskStride, uint8_t index, uint32_t * histogram); - - void HistogramConditional(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * mask, size_t maskStride, uint8_t value, SimdCompareType compareType, uint32_t * histogram); - - void AbsSecondDerivativeHistogram(const uint8_t *src, size_t width, size_t height, size_t stride, - size_t step, size_t indent, uint32_t * histogram); - - void HogDirectionHistograms(const uint8_t * src, size_t stride, size_t width, size_t height, - size_t cellX, size_t cellY, size_t quantization, float * histograms); - - void HogExtractFeatures(const uint8_t * src, size_t stride, size_t width, size_t height, float * features); - - void HogDeinterleave(const float * src, size_t srcStride, size_t width, size_t height, size_t count, float ** dst, size_t dstStride); - - void HogFilterSeparable(const float * src, size_t srcStride, size_t width, size_t height, const float * rowFilter, size_t rowSize, const float * colFilter, size_t colSize, float * dst, size_t dstStride, int add); - - void HogLiteExtractFeatures(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t cell, float * features, size_t featuresStride); - - void HogLiteFilterFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride); - - void HogLiteResizeFeatures(const float * src, size_t srcStride, size_t srcWidth, 
size_t srcHeight, size_t featureSize, float * dst, size_t dstStride, size_t dstWidth, size_t dstHeight); - - void HogLiteCompressFeatures(const float * src, size_t srcStride, size_t width, size_t height, const float * pca, float * dst, size_t dstStride); - - void HogLiteFilterSeparable(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * hFilter, size_t hSize, const float * vFilter, size_t vSize, float * dst, size_t dstStride, int add); - - void HogLiteFindMax7x7(const float * a, size_t aStride, const float * b, size_t bStride, size_t height, float * value, size_t * col, size_t * row); - - void HogLiteCreateMask(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, const float * threshold, size_t scale, size_t size, uint32_t * dst, size_t dstStride); - - void Int16ToGray(const uint8_t * src, size_t width, size_t height, size_t srcStride, uint8_t * dst, size_t dstStride); - - void Integral(const uint8_t * src, size_t srcStride, size_t width, size_t height, - uint8_t * sum, size_t sumStride, uint8_t * sqsum, size_t sqsumStride, uint8_t * tilted, size_t tiltedStride, - SimdPixelFormatType sumFormat, SimdPixelFormatType sqsumFormat); - - void InterferenceIncrement(uint8_t * statistic, size_t stride, size_t width, size_t height, uint8_t increment, int16_t saturation); - - void InterferenceIncrementMasked(uint8_t * statistic, size_t statisticStride, size_t width, size_t height, - uint8_t increment, int16_t saturation, const uint8_t * mask, size_t maskStride, uint8_t index); - - void InterferenceDecrement(uint8_t * statistic, size_t stride, size_t width, size_t height, uint8_t decrement, int16_t saturation); - - void InterferenceDecrementMasked(uint8_t * statistic, size_t statisticStride, size_t width, size_t height, - uint8_t decrement, int16_t saturation, const uint8_t * mask, size_t maskStride, uint8_t index); - - void InterleaveUv(const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, size_t width, size_t height, uint8_t * uv, size_t uvStride); - - void InterleaveBgr(const uint8_t * b, size_t bStride, const uint8_t * g, size_t gStride, const uint8_t * r, size_t rStride, size_t width, size_t height, uint8_t * bgr, size_t bgrStride); - - void InterleaveBgra(const uint8_t * b, size_t bStride, const uint8_t * g, size_t gStride, const uint8_t * r, size_t rStride, const uint8_t * a, size_t aStride, size_t width, size_t height, uint8_t * bgra, size_t bgraStride); - - void Laplace(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride); - - void LaplaceAbs(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride); - - void LaplaceAbsSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum); - - void LbpEstimate(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride); - - void MeanFilter3x3(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t channelCount, uint8_t * dst, size_t dstStride); - - void MedianFilterRhomb3x3(const uint8_t * src, size_t srcStride, size_t width, size_t height, - size_t channelCount, uint8_t * dst, size_t dstStride); - - void MedianFilterRhomb5x5(const uint8_t * src, size_t srcStride, size_t width, size_t height, - size_t channelCount, uint8_t * dst, size_t dstStride); - - void MedianFilterSquare3x3(const uint8_t * src, size_t srcStride, size_t width, size_t height, - size_t channelCount, uint8_t * dst, size_t 
dstStride); - - void MedianFilterSquare5x5(const uint8_t * src, size_t srcStride, size_t width, size_t height, - size_t channelCount, uint8_t * dst, size_t dstStride); - - void NeuralConvert(const uint8_t * src, size_t srcStride, size_t width, size_t height, float * dst, size_t dstStride, int inversion); - - void NeuralProductSum(const float * a, const float * b, size_t size, float * sum); - - void NeuralAddVectorMultipliedByValue(const float * src, size_t size, const float * value, float * dst); - - void NeuralRoughSigmoid2(const float * src, size_t size, const float * slope, float * dst); - - void NeuralPow(const float * src, size_t size, const float * exponent, float * dst); - - void NeuralAddConvolution2x2Forward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride); - - void NeuralAddConvolution3x3Forward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride); - - void NeuralAddConvolution4x4Forward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride); - - void NeuralAddConvolution5x5Forward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride); - - void NeuralAddConvolution2x2Backward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride); - - void NeuralAddConvolution3x3Backward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride); - - void NeuralAddConvolution4x4Backward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride); - - void NeuralAddConvolution5x5Backward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride); - - void NeuralAddConvolution2x2Sum(const float * src, size_t srcStride, const float * dst, size_t dstStride, size_t width, size_t height, float * sums); - - void NeuralAddConvolution3x3Sum(const float * src, size_t srcStride, const float * dst, size_t dstStride, size_t width, size_t height, float * sums); - - void NeuralAddConvolution4x4Sum(const float * src, size_t srcStride, const float * dst, size_t dstStride, size_t width, size_t height, float * sums); - - void NeuralAddConvolution5x5Sum(const float * src, size_t srcStride, const float * dst, size_t dstStride, size_t width, size_t height, float * sums); - - void NeuralPooling1x1Max3x3(const float * src, size_t srcStride, size_t width, size_t height, float * dst, size_t dstStride); - - void NeuralPooling2x2Max3x3(const float * src, size_t srcStride, size_t width, size_t height, float * dst, size_t dstStride); - - void NeuralConvolutionForward(const float * src, size_t srcWidth, size_t srcHeight, size_t srcDepth, const float * weight, - size_t kernelX, size_t kernelY, size_t padX, size_t padY, size_t strideX, size_t strideY, size_t dilationX, size_t dilationY, - void * buffer, size_t * size, float * dst, size_t dstWidth, size_t dstHeight, size_t dstDepth, int add); - - void OperationBinary8u(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, - size_t width, size_t height, size_t channelCount, uint8_t * dst, size_t dstStride, SimdOperationBinary8uType type); - - void OperationBinary16i(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, - size_t width, size_t 
height, uint8_t * dst, size_t dstStride, SimdOperationBinary16iType type); - - void VectorProduct(const uint8_t * vertical, const uint8_t * horizontal, uint8_t * dst, size_t stride, size_t width, size_t height); - - void ReduceColor2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount); - - void ReduceGray2x2(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride); - - void ReduceGray3x3(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride, int compensation); - - void ReduceGray4x4(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride); - - void ReduceGray5x5(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride, int compensation); - - void Reorder16bit(const uint8_t * src, size_t size, uint8_t * dst); - - void Reorder32bit(const uint8_t * src, size_t size, uint8_t * dst); - - void Reorder64bit(const uint8_t * src, size_t size, uint8_t * dst); - - void ResizeBilinear(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount); - - void RgbToBgra(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha); - - void RgbToGray(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* gray, size_t grayStride); - - void SegmentationChangeIndex(uint8_t * mask, size_t stride, size_t width, size_t height, uint8_t oldIndex, uint8_t newIndex); - - void SegmentationFillSingleHoles(uint8_t * mask, size_t stride, size_t width, size_t height, uint8_t index); - - void SegmentationPropagate2x2(const uint8_t * parent, size_t parentStride, size_t width, size_t height, - uint8_t * child, size_t childStride, const uint8_t * difference, size_t differenceStride, - uint8_t currentIndex, uint8_t invalidIndex, uint8_t emptyIndex, uint8_t differenceThreshold); - - void SegmentationShrinkRegion(const uint8_t * mask, size_t stride, size_t width, size_t height, uint8_t index, - ptrdiff_t * left, ptrdiff_t * top, ptrdiff_t * right, ptrdiff_t * bottom); - - void ShiftBilinear(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t channelCount, - const uint8_t * bkg, size_t bkgStride, const double * shiftX, const double * shiftY, - size_t cropLeft, size_t cropTop, size_t cropRight, size_t cropBottom, uint8_t * dst, size_t dstStride); - - void SobelDx(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride); - - void SobelDxAbs(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride); - - void SobelDxAbsSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum); - - void SobelDy(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride); - - void SobelDyAbs(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride); - - void SobelDyAbsSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum); - - void ContourMetrics(const uint8_t * src, size_t srcStride, 
size_t width, size_t height, uint8_t * dst, size_t dstStride); - - void ContourMetricsMasked(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * mask, size_t maskStride, uint8_t indexMin, uint8_t * dst, size_t dstStride); - - void ContourAnchors(const uint8_t * src, size_t srcStride, size_t width, size_t height, - size_t step, int16_t threshold, uint8_t * dst, size_t dstStride); - - void SquaredDifferenceSum(const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride, - size_t width, size_t height, uint64_t * sum); - - void SquaredDifferenceSumMasked(const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride, - const uint8_t *mask, size_t maskStride, uint8_t index, size_t width, size_t height, uint64_t * sum); - - void GetStatistic(const uint8_t * src, size_t stride, size_t width, size_t height, - uint8_t * min, uint8_t * max, uint8_t * average); - - void GetMoments(const uint8_t * mask, size_t stride, size_t width, size_t height, uint8_t index, - uint64_t * area, uint64_t * x, uint64_t * y, uint64_t * xx, uint64_t * xy, uint64_t * yy); - - void GetObjectMoments(const uint8_t* src, size_t srcStride, size_t width, size_t height, const uint8_t* mask, size_t maskStride, uint8_t index, - uint64_t* n, uint64_t* s, uint64_t* sx, uint64_t* sy, uint64_t* sxx, uint64_t* sxy, uint64_t* syy); - - void GetRowSums(const uint8_t * src, size_t stride, size_t width, size_t height, uint32_t * sums); - - void GetColSums(const uint8_t * src, size_t stride, size_t width, size_t height, uint32_t * sums); - - void GetAbsDyRowSums(const uint8_t * src, size_t stride, size_t width, size_t height, uint32_t * sums); - - void GetAbsDxColSums(const uint8_t * src, size_t stride, size_t width, size_t height, uint32_t * sums); - - void ValueSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum); - - void SquareSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum); - - void ValueSquareSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * valueSum, uint64_t * squareSum); - - void CorrelationSum(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, size_t width, size_t height, uint64_t * sum); - - void StretchGray2x2(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride); - - void SynetConvert32fTo8u(const float* src, size_t batch, size_t channels, size_t height, size_t width, SimdTensorFormatType format, const float* scale, const float* shift, uint8_t* dst, SimdSynetCompatibilityType compatibility); - - void SynetEltwiseLayerForward(float const * const * src, const float * weight, size_t count, size_t size, SimdSynetEltwiseOperationType type, float * dst); - - void SynetElu32f(const float * src, size_t size, const float * alpha, float * dst); - - void SynetInnerProductLayerForward(const float * src, const float * weight, const float * bias, size_t count, size_t size, float * dst); - - void SynetLrnLayerCrossChannels(const float * src, size_t half, size_t channels, size_t spatial, const float * k, float * dst, SimdTensorFormatType format); - - void SynetPoolingForwardMax32f(const float * src, size_t srcC, size_t srcH, size_t srcW, size_t kernelY, size_t kernelX, - size_t strideY, size_t strideX, size_t padY, size_t padX, float * dst, size_t dstH, size_t dstW, SimdTensorFormatType format); - - void SynetPoolingForwardMax8u(const uint8_t* src, size_t srcC, size_t srcH, size_t srcW, size_t 
kernelY, size_t kernelX, - size_t strideY, size_t strideX, size_t padY, size_t padX, uint8_t* dst, size_t dstH, size_t dstW, SimdTensorFormatType format); - - void SynetScaleLayerForward(const float* src, const float* scale, const float* bias, size_t channels, size_t height, size_t width, float* dst, SimdTensorFormatType format, SimdSynetCompatibilityType compatibility); - - void SynetSetInput(const uint8_t * src, size_t width, size_t height, size_t stride, SimdPixelFormatType srcFormat, - const float * lower, const float * upper, float * dst, size_t channels, SimdTensorFormatType dstFormat); - - void SynetSigmoid32f(const float* src, size_t size, const float* slope, float* dst); - - void SynetSoftmaxLayerForward(const float * src, size_t outer, size_t size, size_t inner, float * dst); - - void SynetSoftplus32f(const float* src, size_t size, const float* beta, const float* threshold, float* dst); - - void SynetTanh32f(const float* src, size_t size, const float* slope, float* dst); - - void SynetUnaryOperation32fLayerForward(const float* src, size_t size, SimdSynetUnaryOperation32fType type, float* dst); - - void TextureBoostedSaturatedGradient(const uint8_t * src, size_t srcStride, size_t width, size_t height, - uint8_t saturation, uint8_t boost, uint8_t * dx, size_t dxStride, uint8_t * dy, size_t dyStride); - - void TextureBoostedUv(const uint8_t * src, size_t srcStride, size_t width, size_t height, - uint8_t boost, uint8_t * dst, size_t dstStride); - - void TextureGetDifferenceSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * lo, size_t loStride, const uint8_t * hi, size_t hiStride, int64_t * sum); - - void TexturePerformCompensation(const uint8_t * src, size_t srcStride, size_t width, size_t height, - int shift, uint8_t * dst, size_t dstStride); - - void Yuva420pToBgra(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - const uint8_t * a, size_t aStride, size_t width, size_t height, uint8_t * bgra, size_t bgraStride); - - void Yuv420pToBgr(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * bgr, size_t bgrStride); - - void Yuv422pToBgr(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * bgr, size_t bgrStride); - - void Yuv444pToBgr(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * bgr, size_t bgrStride); - - void Yuv420pToBgra(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * bgra, size_t bgraStride, uint8_t alpha); - - void Yuv422pToBgra(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * bgra, size_t bgraStride, uint8_t alpha); - - void Yuv444pToBgra(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * bgra, size_t bgraStride, uint8_t alpha); - - void Yuv420pToHue(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * hue, size_t hueStride); - - void Yuv444pToHue(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t 
* v, size_t vStride, - size_t width, size_t height, uint8_t * hue, size_t hueStride); - - void Yuv420pToRgb(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride, - size_t width, size_t height, uint8_t* rgb, size_t rgbStride); - - void Yuv422pToRgb(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride, - size_t width, size_t height, uint8_t* rgb, size_t rgbStride); - - void Yuv444pToRgb(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride, - size_t width, size_t height, uint8_t* rgb, size_t rgbStride); - } -#endif// SIMD_AVX2_ENABLE -} -#endif//__SimdAvx2_h__ diff --git a/src/3rd/Simd/Simd/SimdAvx2AbsDifference.cpp b/src/3rd/Simd/Simd/SimdAvx2AbsDifference.cpp deleted file mode 100644 index 6e5a660b..00000000 --- a/src/3rd/Simd/Simd/SimdAvx2AbsDifference.cpp +++ /dev/null @@ -1,75 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2019 Yermalayeu Ihar, -* 2019-2019 Facundo Galan. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
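The SimdAvx2.h header deleted above declares the whole AVX2 back end behind SIMD_AVX2_ENABLE; the library's public C API dispatches to these functions at runtime. A minimal sketch of calling one of the declared functions directly, assuming the header is still present and AVX2 is enabled (normal clients would use the public C entry point SimdAbsDifferenceSum instead):

#include <cstdint>
#include <vector>
#include "Simd/SimdAvx2.h" // the header removed by this diff

// Sum of absolute differences of two tightly packed 8-bit images via the AVX2
// back end declared above. Note the AVX2 kernel asserts width >= 32 (one
// vector), and aStride/bStride here are simply the row width.
uint64_t SumAbsDiff(const std::vector<uint8_t>& a, const std::vector<uint8_t>& b,
                    size_t width, size_t height)
{
    uint64_t sum = 0;
#ifdef SIMD_AVX2_ENABLE
    Simd::Avx2::AbsDifferenceSum(a.data(), width, b.data(), width, width, height, &sum);
#else
    for (size_t i = 0; i < width * height; ++i) // scalar fallback
        sum += a[i] > b[i] ? a[i] - b[i] : b[i] - a[i];
#endif
    return sum;
}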
-*/ -#include "Simd/SimdLoad.h" -#include "Simd/SimdMemory.h" -#include "Simd/SimdExtract.h" -#include "Simd/SimdSet.h" -#include "Simd/SimdStore.h" - -namespace Simd -{ -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - template void AbsDifference( - const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride, uint8_t *c, size_t cStride, - size_t width, size_t height) - { - assert(width >= A); - if (align) - assert(Aligned(a) && Aligned(aStride) && Aligned(b) && Aligned(bStride) && Aligned(c) && Aligned(cStride)); - - size_t bodyWidth = AlignLo(width, A); - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < bodyWidth; col += A) - { - const __m256i a_ = Load((__m256i*)(a + col)); - const __m256i b_ = Load((__m256i*)(b + col)); - Store((__m256i*)(c + col), _mm256_sub_epi8(_mm256_max_epu8(a_, b_), _mm256_min_epu8(a_, b_))); - } - if (width - bodyWidth) - { - const __m256i a_ = Load((__m256i*)(a + width - A)); - const __m256i b_ = Load((__m256i*)(b + width - A)); - Store((__m256i*)(c + width - A), _mm256_sub_epi8(_mm256_max_epu8(a_, b_), _mm256_min_epu8(a_, b_))); - } - a += aStride; - b += bStride; - c += bStride; - } - } - - void AbsDifference(const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride, uint8_t *c, size_t cStride, - size_t width, size_t height) - { - if (Aligned(a) && Aligned(aStride) && Aligned(b) && Aligned(bStride)) - AbsDifference(a, aStride, b, bStride, c, cStride, width, height); - else - AbsDifference(a, aStride, b, bStride, c, cStride, width, height); - } - } -#endif// SIMD_AVX2_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx2AbsDifferenceSum.cpp b/src/3rd/Simd/Simd/SimdAvx2AbsDifferenceSum.cpp deleted file mode 100644 index e7f24ccb..00000000 --- a/src/3rd/Simd/Simd/SimdAvx2AbsDifferenceSum.cpp +++ /dev/null @@ -1,251 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdLoad.h" -#include "Simd/SimdMemory.h" -#include "Simd/SimdExtract.h" -#include "Simd/SimdSet.h" - -namespace Simd -{ -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - template void AbsDifferenceSum( - const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride, - size_t width, size_t height, uint64_t * sum) - { - assert(width >= A); - if (align) - assert(Aligned(a) && Aligned(aStride) && Aligned(b) && Aligned(bStride)); - - size_t bodyWidth = AlignLo(width, A); - __m256i tailMask = SetMask(0, A - width + bodyWidth, 0xFF); - __m256i fullSum = _mm256_setzero_si256(); - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < bodyWidth; col += A) - { - const __m256i a_ = Load((__m256i*)(a + col)); - const __m256i b_ = Load((__m256i*)(b + col)); - fullSum = _mm256_add_epi64(_mm256_sad_epu8(a_, b_), fullSum); - } - if (width - bodyWidth) - { - const __m256i a_ = _mm256_and_si256(tailMask, Load((__m256i*)(a + width - A))); - const __m256i b_ = _mm256_and_si256(tailMask, Load((__m256i*)(b + width - A))); - fullSum = _mm256_add_epi64(_mm256_sad_epu8(a_, b_), fullSum); - } - a += aStride; - b += bStride; - } - *sum = ExtractSum(fullSum); - } - - template void AbsDifferenceSumMasked( - const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride, - const uint8_t *mask, size_t maskStride, uint8_t index, size_t width, size_t height, uint64_t * sum) - { - assert(width >= A); - if (align) - { - assert(Aligned(a) && Aligned(aStride) && Aligned(b) && Aligned(bStride)); - assert(Aligned(mask) && Aligned(maskStride)); - } - - size_t bodyWidth = AlignLo(width, A); - __m256i tailMask = SetMask(0, A - width + bodyWidth, 0xFF); - __m256i fullSum = _mm256_setzero_si256(); - __m256i index_ = _mm256_set1_epi8(index); - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < bodyWidth; col += A) - { - const __m256i mask_ = LoadMaskI8((__m256i*)(mask + col), index_); - const __m256i a_ = _mm256_and_si256(mask_, Load((__m256i*)(a + col))); - const __m256i b_ = _mm256_and_si256(mask_, Load((__m256i*)(b + col))); - fullSum = _mm256_add_epi64(_mm256_sad_epu8(a_, b_), fullSum); - } - if (width - bodyWidth) - { - const __m256i mask_ = _mm256_and_si256(tailMask, LoadMaskI8((__m256i*)(mask + width - A), index_)); - const __m256i a_ = _mm256_and_si256(mask_, Load((__m256i*)(a + width - A))); - const __m256i b_ = _mm256_and_si256(mask_, Load((__m256i*)(b + width - A))); - fullSum = _mm256_add_epi64(_mm256_sad_epu8(a_, b_), fullSum); - } - a += aStride; - b += bStride; - mask += maskStride; - } - *sum = ExtractSum(fullSum); - } - - void AbsDifferenceSum(const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride, - size_t width, size_t height, uint64_t * sum) - { - if (Aligned(a) && Aligned(aStride) && Aligned(b) && Aligned(bStride)) - AbsDifferenceSum(a, aStride, b, bStride, width, height, sum); - else - AbsDifferenceSum(a, aStride, b, bStride, width, height, sum); - } - - void AbsDifferenceSumMasked(const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride, - const uint8_t *mask, size_t maskStride, uint8_t index, size_t width, size_t height, uint64_t * sum) - { - if (Aligned(a) && Aligned(aStride) && Aligned(b) && Aligned(bStride) && Aligned(mask) && Aligned(maskStride)) - AbsDifferenceSumMasked(a, aStride, b, bStride, mask, maskStride, index, width, height, sum); - else - AbsDifferenceSumMasked(a, aStride, b, bStride, mask, maskStride, index, width, height, sum); - } - - template void AbsDifferenceSums3(__m256i current, const uint8_t * 
background, __m256i sums[3]) - { - sums[0] = _mm256_add_epi64(sums[0], _mm256_sad_epu8(current, Load((__m256i*)(background - 1)))); - sums[1] = _mm256_add_epi64(sums[1], _mm256_sad_epu8(current, Load((__m256i*)(background)))); - sums[2] = _mm256_add_epi64(sums[2], _mm256_sad_epu8(current, Load((__m256i*)(background + 1)))); - } - - template void AbsDifferenceSums3x3(__m256i current, const uint8_t * background, size_t stride, __m256i sums[9]) - { - AbsDifferenceSums3(current, background - stride, sums + 0); - AbsDifferenceSums3(current, background, sums + 3); - AbsDifferenceSums3(current, background + stride, sums + 6); - } - - template void AbsDifferenceSums3Masked(__m256i current, const uint8_t * background, __m256i mask, __m256i sums[3]) - { - sums[0] = _mm256_add_epi64(sums[0], _mm256_sad_epu8(current, _mm256_and_si256(mask, Load((__m256i*)(background - 1))))); - sums[1] = _mm256_add_epi64(sums[1], _mm256_sad_epu8(current, _mm256_and_si256(mask, Load((__m256i*)(background))))); - sums[2] = _mm256_add_epi64(sums[2], _mm256_sad_epu8(current, _mm256_and_si256(mask, Load((__m256i*)(background + 1))))); - } - - template void AbsDifferenceSums3x3Masked(__m256i current, const uint8_t * background, size_t stride, __m256i mask, __m256i sums[9]) - { - AbsDifferenceSums3Masked(current, background - stride, mask, sums + 0); - AbsDifferenceSums3Masked(current, background, mask, sums + 3); - AbsDifferenceSums3Masked(current, background + stride, mask, sums + 6); - } - - template void AbsDifferenceSums3x3(const uint8_t * current, size_t currentStride, - const uint8_t * background, size_t backgroundStride, size_t width, size_t height, uint64_t * sums) - { - assert(height > 2 && width >= A + 2); - if (align) - assert(Aligned(background) && Aligned(backgroundStride)); - - width -= 2; - height -= 2; - current += 1 + currentStride; - background += 1 + backgroundStride; - - size_t bodyWidth = AlignLo(width, A); - __m256i tailMask = SetMask(0, A - width + bodyWidth, 0xFF); - - __m256i fullSums[9]; - for (size_t i = 0; i < 9; ++i) - fullSums[i] = _mm256_setzero_si256(); - - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < bodyWidth; col += A) - { - const __m256i _current = Load((__m256i*)(current + col)); - AbsDifferenceSums3x3(_current, background + col, backgroundStride, fullSums); - } - if (width - bodyWidth) - { - const __m256i _current = _mm256_and_si256(tailMask, Load((__m256i*)(current + width - A))); - AbsDifferenceSums3x3Masked(_current, background + width - A, backgroundStride, tailMask, fullSums); - } - current += currentStride; - background += backgroundStride; - } - - for (size_t i = 0; i < 9; ++i) - sums[i] = ExtractSum(fullSums[i]); - } - - void AbsDifferenceSums3x3(const uint8_t * current, size_t currentStride, const uint8_t * background, size_t backgroundStride, - size_t width, size_t height, uint64_t * sums) - { - if (Aligned(background) && Aligned(backgroundStride)) - AbsDifferenceSums3x3(current, currentStride, background, backgroundStride, width, height, sums); - else - AbsDifferenceSums3x3(current, currentStride, background, backgroundStride, width, height, sums); - } - - template void AbsDifferenceSums3x3Masked(const uint8_t *current, size_t currentStride, const uint8_t *background, size_t backgroundStride, - const uint8_t *mask, size_t maskStride, uint8_t index, size_t width, size_t height, uint64_t * sums) - { - assert(height > 2 && width >= A + 2); - if (align) - assert(Aligned(background) && Aligned(backgroundStride)); - - width -= 2; - height -= 2; - 
current += 1 + currentStride; - background += 1 + backgroundStride; - mask += 1 + maskStride; - - size_t bodyWidth = AlignLo(width, A); - __m256i tailMask = SetMask(0, A - width + bodyWidth, 0xFF); - __m256i _index = _mm256_set1_epi8(index); - - __m256i fullSums[9]; - for (size_t i = 0; i < 9; ++i) - fullSums[i] = _mm256_setzero_si256(); - - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < bodyWidth; col += A) - { - const __m256i _mask = LoadMaskI8((__m256i*)(mask + col), _index); - const __m256i _current = _mm256_and_si256(Load((__m256i*)(current + col)), _mask); - AbsDifferenceSums3x3Masked(_current, background + col, backgroundStride, _mask, fullSums); - } - if (width - bodyWidth) - { - const __m256i _mask = _mm256_and_si256(LoadMaskI8((__m256i*)(mask + width - A), _index), tailMask); - const __m256i _current = _mm256_and_si256(_mask, Load((__m256i*)(current + width - A))); - AbsDifferenceSums3x3Masked(_current, background + width - A, backgroundStride, _mask, fullSums); - } - current += currentStride; - background += backgroundStride; - mask += maskStride; - } - - for (size_t i = 0; i < 9; ++i) - sums[i] = ExtractSum(fullSums[i]); - } - - void AbsDifferenceSums3x3Masked(const uint8_t *current, size_t currentStride, const uint8_t *background, size_t backgroundStride, - const uint8_t *mask, size_t maskStride, uint8_t index, size_t width, size_t height, uint64_t * sums) - { - if (Aligned(background) && Aligned(backgroundStride)) - AbsDifferenceSums3x3Masked(current, currentStride, background, backgroundStride, mask, maskStride, index, width, height, sums); - else - AbsDifferenceSums3x3Masked(current, currentStride, background, backgroundStride, mask, maskStride, index, width, height, sums); - } - } -#endif// SIMD_AVX2_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx2AbsGradientSaturatedSum.cpp b/src/3rd/Simd/Simd/SimdAvx2AbsGradientSaturatedSum.cpp deleted file mode 100644 index 9d5fc38e..00000000 --- a/src/3rd/Simd/Simd/SimdAvx2AbsGradientSaturatedSum.cpp +++ /dev/null @@ -1,75 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
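As I read the two routines above, AbsDifferenceSums3x3 skips the one-pixel border and, for every interior pixel, accumulates SADs between the current frame and the background shifted by each of the nine offsets of a 3x3 neighborhood; the masked variant additionally zeroes pixels whose mask byte does not equal index. A scalar reference for the unmasked semantics (a sketch with hypothetical names, not the library's code):

#include <cstdint>
#include <cstdlib>

// Scalar reference: sums[(dy + 1) * 3 + (dx + 1)] holds the SAD between the
// current frame and the background shifted by (dx, dy), dx, dy in {-1, 0, 1},
// accumulated over the interior (the one-pixel border is excluded).
static void AbsDifferenceSums3x3Ref(const uint8_t* current, size_t currentStride,
                                    const uint8_t* background, size_t backgroundStride,
                                    size_t width, size_t height, uint64_t sums[9])
{
    for (int i = 0; i < 9; ++i)
        sums[i] = 0;
    for (size_t row = 1; row + 1 < height; ++row)
        for (size_t col = 1; col + 1 < width; ++col)
        {
            int c = current[row * currentStride + col];
            for (int dy = -1; dy <= 1; ++dy)
                for (int dx = -1; dx <= 1; ++dx)
                {
                    int b = background[(row + dy) * backgroundStride + (col + dx)];
                    sums[(dy + 1) * 3 + (dx + 1)] += (uint64_t)std::abs(c - b);
                }
        }
}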
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdMath.h" -#include "Simd/SimdStore.h" - -namespace Simd -{ -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - template SIMD_INLINE __m256i AbsGradientSaturatedSum(const uint8_t * src, size_t stride) - { - const __m256i s10 = Load((__m256i*)(src - 1)); - const __m256i s12 = Load((__m256i*)(src + 1)); - const __m256i s01 = Load((__m256i*)(src - stride)); - const __m256i s21 = Load((__m256i*)(src + stride)); - const __m256i dx = AbsDifferenceU8(s10, s12); - const __m256i dy = AbsDifferenceU8(s01, s21); - return _mm256_adds_epu8(dx, dy); - } - - template void AbsGradientSaturatedSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride) - { - size_t alignedWidth = AlignLo(width, A); - memset(dst, 0, width); - src += srcStride; - dst += dstStride; - for (size_t row = 2; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - Store((__m256i*)(dst + col), AbsGradientSaturatedSum(src + col, srcStride)); - if (width != alignedWidth) - Store((__m256i*)(dst + width - A), AbsGradientSaturatedSum(src + width - A, srcStride)); - - dst[0] = 0; - dst[width - 1] = 0; - - src += srcStride; - dst += dstStride; - } - memset(dst, 0, width); - } - - void AbsGradientSaturatedSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride)) - AbsGradientSaturatedSum(src, srcStride, width, height, dst, dstStride); - else - AbsGradientSaturatedSum(src, srcStride, width, height, dst, dstStride); - } - } -#endif// SIMD_AVX2_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx2AddFeatureDifference.cpp b/src/3rd/Simd/Simd/SimdAvx2AddFeatureDifference.cpp deleted file mode 100644 index 4ab73779..00000000 --- a/src/3rd/Simd/Simd/SimdAvx2AddFeatureDifference.cpp +++ /dev/null @@ -1,105 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdSet.h" - -namespace Simd -{ -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - SIMD_INLINE __m256i FeatureDifference(__m256i value, __m256i lo, __m256i hi) - { - return _mm256_max_epu8(_mm256_subs_epu8(value, hi), _mm256_subs_epu8(lo, value)); - } - - SIMD_INLINE __m256i ShiftedWeightedSquare16(__m256i difference, __m256i weight) - { - return _mm256_mulhi_epu16(_mm256_mullo_epi16(difference, difference), weight); - } - - SIMD_INLINE __m256i ShiftedWeightedSquare8(__m256i difference, __m256i weight) - { - const __m256i lo = ShiftedWeightedSquare16(_mm256_unpacklo_epi8(difference, K_ZERO), weight); - const __m256i hi = ShiftedWeightedSquare16(_mm256_unpackhi_epi8(difference, K_ZERO), weight); - return _mm256_packus_epi16(lo, hi); - } - - template SIMD_INLINE void AddFeatureDifference(const uint8_t * value, const uint8_t * lo, const uint8_t * hi, - uint8_t * difference, size_t offset, __m256i weight, __m256i mask) - { - const __m256i _value = Load((__m256i*)(value + offset)); - const __m256i _lo = Load((__m256i*)(lo + offset)); - const __m256i _hi = Load((__m256i*)(hi + offset)); - __m256i _difference = Load((__m256i*)(difference + offset)); - - const __m256i featureDifference = FeatureDifference(_value, _lo, _hi); - const __m256i inc = _mm256_and_si256(mask, ShiftedWeightedSquare8(featureDifference, weight)); - Store((__m256i*)(difference + offset), _mm256_adds_epu8(_difference, inc)); - } - - template void AddFeatureDifference(const uint8_t * value, size_t valueStride, size_t width, size_t height, - const uint8_t * lo, size_t loStride, const uint8_t * hi, size_t hiStride, - uint16_t weight, uint8_t * difference, size_t differenceStride) - { - assert(width >= A); - if (align) - { - assert(Aligned(value) && Aligned(valueStride)); - assert(Aligned(lo) && Aligned(loStride)); - assert(Aligned(hi) && Aligned(hiStride)); - assert(Aligned(difference) && Aligned(differenceStride)); - } - - size_t alignedWidth = AlignLo(width, A); - __m256i tailMask = SetMask(0, A - width + alignedWidth, 0xFF); - __m256i _weight = _mm256_set1_epi16((short)weight); - - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - AddFeatureDifference(value, lo, hi, difference, col, _weight, K_INV_ZERO); - if (alignedWidth != width) - AddFeatureDifference(value, lo, hi, difference, width - A, _weight, tailMask); - value += valueStride; - lo += loStride; - hi += hiStride; - difference += differenceStride; - } - } - - void AddFeatureDifference(const uint8_t * value, size_t valueStride, size_t width, size_t height, - const uint8_t * lo, size_t loStride, const uint8_t * hi, size_t hiStride, - uint16_t weight, uint8_t * difference, size_t differenceStride) - { - if (Aligned(value) && Aligned(valueStride) && Aligned(lo) && Aligned(loStride) && - Aligned(hi) && Aligned(hiStride) && Aligned(difference) && Aligned(differenceStride)) - AddFeatureDifference(value, valueStride, width, height, lo, loStride, hi, hiStride, weight, difference, differenceStride); - else - AddFeatureDifference(value, valueStride, width, height, lo, loStride, hi, hiStride, weight, difference, differenceStride); - } - } -#endif// SIMD_AVX2_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx2AlphaBlending.cpp b/src/3rd/Simd/Simd/SimdAvx2AlphaBlending.cpp deleted file mode 100644 index 6f37ff86..00000000 --- a/src/3rd/Simd/Simd/SimdAvx2AlphaBlending.cpp +++ /dev/null @@ -1,277 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). 
-* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdStore.h" -#include "Simd/SimdMemory.h" -#include "Simd/SimdConversion.h" -#include "Simd/SimdSet.h" - -namespace Simd -{ -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - SIMD_INLINE __m256i AlphaBlendingI16(__m256i src, __m256i dst, __m256i alpha) - { - return DivideI16By255(_mm256_add_epi16(_mm256_mullo_epi16(src, alpha), _mm256_mullo_epi16(dst, _mm256_sub_epi16(K16_00FF, alpha)))); - } - - template SIMD_INLINE void AlphaBlending(const __m256i * src, __m256i * dst, __m256i alpha) - { - __m256i _src = Load(src); - __m256i _dst = Load(dst); - __m256i lo = AlphaBlendingI16(_mm256_unpacklo_epi8(_src, K_ZERO), _mm256_unpacklo_epi8(_dst, K_ZERO), _mm256_unpacklo_epi8(alpha, K_ZERO)); - __m256i hi = AlphaBlendingI16(_mm256_unpackhi_epi8(_src, K_ZERO), _mm256_unpackhi_epi8(_dst, K_ZERO), _mm256_unpackhi_epi8(alpha, K_ZERO)); - Store(dst, _mm256_packus_epi16(lo, hi)); - } - - template struct AlphaBlender - { - void operator()(const __m256i * src, __m256i * dst, __m256i alpha); - }; - - template struct AlphaBlender - { - SIMD_INLINE void operator()(const __m256i * src, __m256i * dst, __m256i alpha) - { - AlphaBlending(src, dst, alpha); - } - }; - - template struct AlphaBlender - { - SIMD_INLINE void operator()(const __m256i * src, __m256i * dst, __m256i alpha) - { - alpha = _mm256_permute4x64_epi64(alpha, 0xD8); - AlphaBlending(src + 0, dst + 0, _mm256_unpacklo_epi8(alpha, alpha)); - AlphaBlending(src + 1, dst + 1, _mm256_unpackhi_epi8(alpha, alpha)); - } - }; - - template struct AlphaBlender - { - SIMD_INLINE void operator()(const __m256i * src, __m256i * dst, __m256i alpha) - { - AlphaBlending(src + 0, dst + 0, GrayToBgr<0>(alpha)); - AlphaBlending(src + 1, dst + 1, GrayToBgr<1>(alpha)); - AlphaBlending(src + 2, dst + 2, GrayToBgr<2>(alpha)); - } - }; - - template struct AlphaBlender - { - SIMD_INLINE void operator()(const __m256i * src, __m256i * dst, __m256i alpha) - { - alpha = _mm256_permute4x64_epi64(alpha, 0xD8); - __m256i lo = _mm256_permute4x64_epi64(_mm256_unpacklo_epi8(alpha, alpha), 0xD8); - AlphaBlending(src + 0, dst + 0, _mm256_unpacklo_epi8(lo, lo)); - AlphaBlending(src + 1, dst + 1, _mm256_unpackhi_epi8(lo, lo)); - __m256i hi = _mm256_permute4x64_epi64(_mm256_unpackhi_epi8(alpha, alpha), 0xD8); - AlphaBlending(src + 2, dst + 2, _mm256_unpacklo_epi8(hi, hi)); - AlphaBlending(src + 3, dst + 3, _mm256_unpackhi_epi8(hi, hi)); - } - }; - - template void 
AlphaBlending(const uint8_t *src, size_t srcStride, size_t width, size_t height, - const uint8_t *alpha, size_t alphaStride, uint8_t *dst, size_t dstStride) - { - size_t alignedWidth = AlignLo(width, A); - __m256i tailMask = SetMask(0, A - width + alignedWidth, 0xFF); - size_t step = channelCount * A; - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0, offset = 0; col < alignedWidth; col += A, offset += step) - { - __m256i _alpha = Load((__m256i*)(alpha + col)); - AlphaBlender()((__m256i*)(src + offset), (__m256i*)(dst + offset), _alpha); - } - if (alignedWidth != width) - { - __m256i _alpha = _mm256_and_si256(Load((__m256i*)(alpha + width - A)), tailMask); - AlphaBlender()((__m256i*)(src + (width - A)*channelCount), (__m256i*)(dst + (width - A)*channelCount), _alpha); - } - src += srcStride; - alpha += alphaStride; - dst += dstStride; - } - } - - template void AlphaBlending(const uint8_t *src, size_t srcStride, size_t width, size_t height, size_t channelCount, - const uint8_t *alpha, size_t alphaStride, uint8_t *dst, size_t dstStride) - { - assert(width >= A); - if (align) - { - assert(Aligned(src) && Aligned(srcStride)); - assert(Aligned(alpha) && Aligned(alphaStride)); - assert(Aligned(dst) && Aligned(dstStride)); - } - - switch (channelCount) - { - case 1: AlphaBlending(src, srcStride, width, height, alpha, alphaStride, dst, dstStride); break; - case 2: AlphaBlending(src, srcStride, width, height, alpha, alphaStride, dst, dstStride); break; - case 3: AlphaBlending(src, srcStride, width, height, alpha, alphaStride, dst, dstStride); break; - case 4: AlphaBlending(src, srcStride, width, height, alpha, alphaStride, dst, dstStride); break; - default: - assert(0); - } - } - - void AlphaBlending(const uint8_t *src, size_t srcStride, size_t width, size_t height, size_t channelCount, - const uint8_t *alpha, size_t alphaStride, uint8_t *dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride) && Aligned(alpha) && Aligned(alphaStride) && Aligned(dst) && Aligned(dstStride)) - AlphaBlending(src, srcStride, width, height, channelCount, alpha, alphaStride, dst, dstStride); - else - AlphaBlending(src, srcStride, width, height, channelCount, alpha, alphaStride, dst, dstStride); - } - - template SIMD_INLINE void AlphaFilling(__m256i * dst, __m256i channelLo, __m256i channelHi, __m256i alpha) - { - __m256i _dst = Load(dst); - __m256i lo = AlphaBlendingI16(channelLo, _mm256_unpacklo_epi8(_dst, K_ZERO), _mm256_unpacklo_epi8(alpha, K_ZERO)); - __m256i hi = AlphaBlendingI16(channelHi, _mm256_unpackhi_epi8(_dst, K_ZERO), _mm256_unpackhi_epi8(alpha, K_ZERO)); - Store(dst, _mm256_packus_epi16(lo, hi)); - } - - template struct AlphaFiller - { - void operator() (__m256i * dst, const __m256i * channel, __m256i alpha); - }; - - template struct AlphaFiller - { - SIMD_INLINE void operator()(__m256i * dst, const __m256i * channel, __m256i alpha) - { - AlphaFilling(dst, channel[0], channel[0], alpha); - } - }; - - template struct AlphaFiller - { - SIMD_INLINE void operator()(__m256i * dst, const __m256i * channel, __m256i alpha) - { - alpha = _mm256_permute4x64_epi64(alpha, 0xD8); - AlphaFilling(dst + 0, channel[0], channel[0], UnpackU8<0>(alpha, alpha)); - AlphaFilling(dst + 1, channel[0], channel[0], UnpackU8<1>(alpha, alpha)); - } - }; - - template struct AlphaFiller - { - SIMD_INLINE void operator()(__m256i * dst, const __m256i * channel, __m256i alpha) - { - AlphaFilling(dst + 0, channel[0], channel[1], GrayToBgr<0>(alpha)); - AlphaFilling(dst + 1, channel[1], channel[2], 
GrayToBgr<1>(alpha)); - AlphaFilling(dst + 2, channel[2], channel[0], GrayToBgr<2>(alpha)); - } - }; - - template struct AlphaFiller - { - SIMD_INLINE void operator()(__m256i * dst, const __m256i * channel, __m256i alpha) - { - alpha = _mm256_permute4x64_epi64(alpha, 0xD8); - __m256i lo = _mm256_permute4x64_epi64(_mm256_unpacklo_epi8(alpha, alpha), 0xD8); - AlphaFilling(dst + 0, channel[0], channel[0], UnpackU8<0>(lo, lo)); - AlphaFilling(dst + 1, channel[0], channel[0], UnpackU8<1>(lo, lo)); - __m256i hi = _mm256_permute4x64_epi64(_mm256_unpackhi_epi8(alpha, alpha), 0xD8); - AlphaFilling(dst + 2, channel[0], channel[0], UnpackU8<0>(hi, hi)); - AlphaFilling(dst + 3, channel[0], channel[0], UnpackU8<1>(hi, hi)); - } - }; - - template void AlphaFilling(uint8_t * dst, size_t dstStride, size_t width, size_t height, const __m256i * channel, const uint8_t * alpha, size_t alphaStride) - { - size_t alignedWidth = AlignLo(width, A); - __m256i tailMask = SetMask(0, A - width + alignedWidth, 0xFF); - size_t step = channelCount * A; - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0, offset = 0; col < alignedWidth; col += A, offset += step) - { - __m256i _alpha = Load((__m256i*)(alpha + col)); - AlphaFiller()((__m256i*)(dst + offset), channel, _alpha); - } - if (alignedWidth != width) - { - __m256i _alpha = _mm256_and_si256(Load((__m256i*)(alpha + width - A)), tailMask); - AlphaFiller()((__m256i*)(dst + (width - A)*channelCount), channel, _alpha); - } - alpha += alphaStride; - dst += dstStride; - } - } - - template void AlphaFilling(uint8_t * dst, size_t dstStride, size_t width, size_t height, const uint8_t * channel, size_t channelCount, const uint8_t * alpha, size_t alphaStride) - { - assert(width >= A); - if (align) - { - assert(Aligned(dst) && Aligned(dstStride)); - assert(Aligned(alpha) && Aligned(alphaStride)); - } - - __m256i _channel[3]; - switch (channelCount) - { - case 1: - _channel[0] = UnpackU8<0>(_mm256_set1_epi8(*(uint8_t*)channel)); - AlphaFilling(dst, dstStride, width, height, _channel, alpha, alphaStride); - break; - case 2: - _channel[0] = UnpackU8<0>(_mm256_set1_epi16(*(uint16_t*)channel)); - AlphaFilling(dst, dstStride, width, height, _channel, alpha, alphaStride); - break; - case 3: - _channel[0] = _mm256_setr_epi16( - channel[0], channel[1], channel[2], channel[0], channel[1], channel[2], channel[0], channel[1], - channel[1], channel[2], channel[0], channel[1], channel[2], channel[0], channel[1], channel[2]); - _channel[1] = _mm256_setr_epi16( - channel[2], channel[0], channel[1], channel[2], channel[0], channel[1], channel[2], channel[0], - channel[0], channel[1], channel[2], channel[0], channel[1], channel[2], channel[0], channel[1]); - _channel[2] = _mm256_setr_epi16( - channel[1], channel[2], channel[0], channel[1], channel[2], channel[0], channel[1], channel[2], - channel[2], channel[0], channel[1], channel[2], channel[0], channel[1], channel[2], channel[0]); - AlphaFilling(dst, dstStride, width, height, _channel, alpha, alphaStride); - break; - case 4: - _channel[0] = UnpackU8<0>(_mm256_set1_epi32(*(uint32_t*)channel)); - AlphaFilling(dst, dstStride, width, height, _channel, alpha, alphaStride); - break; - default: - assert(0); - } - } - - void AlphaFilling(uint8_t * dst, size_t dstStride, size_t width, size_t height, const uint8_t * channel, size_t channelCount, const uint8_t * alpha, size_t alphaStride) - { - if (Aligned(dst) && Aligned(dstStride) && Aligned(alpha) && Aligned(alphaStride)) - AlphaFilling(dst, dstStride, width, height, channel, 
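
Why the channelCount == 3 branch above needs three pattern registers: 3-byte pixels do not divide a 32-byte vector evenly, so the B,G,R sequence enters successive vectors, and the lane-wise 8-to-16-bit unpacks inside AlphaFilling, at three different phases; _channel[0..2] hold the same fill color pre-rotated to those phases, and AlphaFiller<3> consumes them in rotation. The scalar effect, for any channelCount, is simply:

    // Scalar equivalent of AlphaFilling, reusing the BlendScalar sketch above:
    for (size_t col = 0; col < width; ++col)
        for (size_t c = 0; c < channelCount; ++c)
        {
            size_t o = col * channelCount + c;
            dst[o] = BlendScalar(channel[c], dst[o], alpha[col]);
        }
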
channelCount, alpha, alphaStride); - else - AlphaFilling(dst, dstStride, width, height, channel, channelCount, alpha, alphaStride); - } - } -#endif// SIMD_AVX2_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx2Background.cpp b/src/3rd/Simd/Simd/SimdAvx2Background.cpp deleted file mode 100644 index b8f4e0b3..00000000 --- a/src/3rd/Simd/Simd/SimdAvx2Background.cpp +++ /dev/null @@ -1,433 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2018 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdSet.h" -#include "Simd/SimdCompare.h" - -namespace Simd -{ -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - template SIMD_INLINE void BackgroundGrowRangeSlow(const uint8_t * value, uint8_t * lo, uint8_t * hi, __m256i tailMask) - { - const __m256i _value = Load((__m256i*)value); - const __m256i _lo = Load((__m256i*)lo); - const __m256i _hi = Load((__m256i*)hi); - - const __m256i inc = _mm256_and_si256(tailMask, Greater8u(_value, _hi)); - const __m256i dec = _mm256_and_si256(tailMask, Lesser8u(_value, _lo)); - - Store((__m256i*)lo, _mm256_subs_epu8(_lo, dec)); - Store((__m256i*)hi, _mm256_adds_epu8(_hi, inc)); - } - - template void BackgroundGrowRangeSlow(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * lo, size_t loStride, uint8_t * hi, size_t hiStride) - { - assert(width >= A); - if (align) - { - assert(Aligned(value) && Aligned(valueStride)); - assert(Aligned(lo) && Aligned(loStride)); - assert(Aligned(hi) && Aligned(hiStride)); - } - - size_t alignedWidth = AlignLo(width, A); - __m256i tailMask = SetMask(0, A - width + alignedWidth, 1); - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - BackgroundGrowRangeSlow(value + col, lo + col, hi + col, K8_01); - if (alignedWidth != width) - BackgroundGrowRangeSlow(value + width - A, lo + width - A, hi + width - A, tailMask); - value += valueStride; - lo += loStride; - hi += hiStride; - } - } - - void BackgroundGrowRangeSlow(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * lo, size_t loStride, uint8_t * hi, size_t hiStride) - { - if (Aligned(value) && Aligned(valueStride) && Aligned(lo) && Aligned(loStride) && Aligned(hi) && Aligned(hiStride)) - BackgroundGrowRangeSlow(value, valueStride, width, height, lo, loStride, hi, hiStride); - else - BackgroundGrowRangeSlow(value, valueStride, width, height, lo, 
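
BackgroundGrowRangeSlow above widens a per-pixel background range [lo, hi] by at most one grey level per call: each comparison yields an all-ones byte mask, ANDing it with K8_01 (or the 0/1 tail mask) turns that into a step of one, and the step is applied with saturating byte arithmetic. Scalar view of one pixel (sat_add/sat_sub stand for saturating byte operations and are names used only in this sketch):

    if (value > hi) hi = sat_add(hi, 1);   // grow upward,   saturating at 255
    if (value < lo) lo = sat_sub(lo, 1);   // grow downward, saturating at 0

The Fast variant that follows replaces the stepped update with a jump: lo = min(lo, value), hi = max(hi, value).
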
loStride, hi, hiStride); - } - - template SIMD_INLINE void BackgroundGrowRangeFast(const uint8_t * value, uint8_t * lo, uint8_t * hi) - { - const __m256i _value = Load((__m256i*)value); - const __m256i _lo = Load((__m256i*)lo); - const __m256i _hi = Load((__m256i*)hi); - - Store((__m256i*)lo, _mm256_min_epu8(_lo, _value)); - Store((__m256i*)hi, _mm256_max_epu8(_hi, _value)); - } - - template void BackgroundGrowRangeFast(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * lo, size_t loStride, uint8_t * hi, size_t hiStride) - { - assert(width >= A); - if (align) - { - assert(Aligned(value) && Aligned(valueStride)); - assert(Aligned(lo) && Aligned(loStride)); - assert(Aligned(hi) && Aligned(hiStride)); - } - - size_t alignedWidth = AlignLo(width, A); - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - BackgroundGrowRangeFast(value + col, lo + col, hi + col); - if (alignedWidth != width) - BackgroundGrowRangeFast(value + width - A, lo + width - A, hi + width - A); - value += valueStride; - lo += loStride; - hi += hiStride; - } - } - - void BackgroundGrowRangeFast(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * lo, size_t loStride, uint8_t * hi, size_t hiStride) - { - if (Aligned(value) && Aligned(valueStride) && Aligned(lo) && Aligned(loStride) && Aligned(hi) && Aligned(hiStride)) - BackgroundGrowRangeFast(value, valueStride, width, height, lo, loStride, hi, hiStride); - else - BackgroundGrowRangeFast(value, valueStride, width, height, lo, loStride, hi, hiStride); - } - - template SIMD_INLINE void BackgroundIncrementCount(const uint8_t * value, - const uint8_t * loValue, const uint8_t * hiValue, uint8_t * loCount, uint8_t * hiCount, size_t offset, __m256i tailMask) - { - const __m256i _value = Load((__m256i*)(value + offset)); - const __m256i _loValue = Load((__m256i*)(loValue + offset)); - const __m256i _loCount = Load((__m256i*)(loCount + offset)); - const __m256i _hiValue = Load((__m256i*)(hiValue + offset)); - const __m256i _hiCount = Load((__m256i*)(hiCount + offset)); - - const __m256i incLo = _mm256_and_si256(tailMask, Lesser8u(_value, _loValue)); - const __m256i incHi = _mm256_and_si256(tailMask, Greater8u(_value, _hiValue)); - - Store((__m256i*)(loCount + offset), _mm256_adds_epu8(_loCount, incLo)); - Store((__m256i*)(hiCount + offset), _mm256_adds_epu8(_hiCount, incHi)); - } - - template void BackgroundIncrementCount(const uint8_t * value, size_t valueStride, size_t width, size_t height, - const uint8_t * loValue, size_t loValueStride, const uint8_t * hiValue, size_t hiValueStride, - uint8_t * loCount, size_t loCountStride, uint8_t * hiCount, size_t hiCountStride) - { - assert(width >= A); - if (align) - { - assert(Aligned(value) && Aligned(valueStride)); - assert(Aligned(loValue) && Aligned(loValueStride) && Aligned(hiValue) && Aligned(hiValueStride)); - assert(Aligned(loCount) && Aligned(loCountStride) && Aligned(hiCount) && Aligned(hiCountStride)); - } - - size_t alignedWidth = AlignLo(width, A); - __m256i tailMask = SetMask(0, A - width + alignedWidth, 1); - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - BackgroundIncrementCount(value, loValue, hiValue, loCount, hiCount, col, K8_01); - if (alignedWidth != width) - BackgroundIncrementCount(value, loValue, hiValue, loCount, hiCount, width - A, tailMask); - value += valueStride; - loValue += loValueStride; - hiValue += hiValueStride; - loCount += loCountStride; - 
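
Note the asymmetry in tail handling between the kernels above: min/max are idempotent, so BackgroundGrowRangeFast reprocesses the overlapped final vector without any mask, whereas the +-1 steps of the Slow variant and the counters of BackgroundIncrementCount are not, so those kernels mask the increment down to zero on already-processed columns. BackgroundIncrementCount itself tallies how often a pixel escapes the learned range:

    // Scalar view of one BackgroundIncrementCount step (saturating at 255):
    if (value < loValue) loCount = sat_add(loCount, 1);
    if (value > hiValue) hiCount = sat_add(hiCount, 1);
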
hiCount += hiCountStride; - } - } - - void BackgroundIncrementCount(const uint8_t * value, size_t valueStride, size_t width, size_t height, - const uint8_t * loValue, size_t loValueStride, const uint8_t * hiValue, size_t hiValueStride, - uint8_t * loCount, size_t loCountStride, uint8_t * hiCount, size_t hiCountStride) - { - if (Aligned(value) && Aligned(valueStride) && - Aligned(loValue) && Aligned(loValueStride) && Aligned(hiValue) && Aligned(hiValueStride) && - Aligned(loCount) && Aligned(loCountStride) && Aligned(hiCount) && Aligned(hiCountStride)) - BackgroundIncrementCount(value, valueStride, width, height, - loValue, loValueStride, hiValue, hiValueStride, loCount, loCountStride, hiCount, hiCountStride); - else - BackgroundIncrementCount(value, valueStride, width, height, - loValue, loValueStride, hiValue, hiValueStride, loCount, loCountStride, hiCount, hiCountStride); - } - - SIMD_INLINE __m256i AdjustLo(const __m256i &count, const __m256i & value, const __m256i & mask, const __m256i & threshold) - { - const __m256i dec = _mm256_and_si256(mask, Greater8u(count, threshold)); - const __m256i inc = _mm256_and_si256(mask, Lesser8u(count, threshold)); - return _mm256_subs_epu8(_mm256_adds_epu8(value, inc), dec); - } - - SIMD_INLINE __m256i AdjustHi(const __m256i &count, const __m256i & value, const __m256i & mask, const __m256i & threshold) - { - const __m256i inc = _mm256_and_si256(mask, Greater8u(count, threshold)); - const __m256i dec = _mm256_and_si256(mask, Lesser8u(count, threshold)); - return _mm256_subs_epu8(_mm256_adds_epu8(value, inc), dec); - } - - template SIMD_INLINE void BackgroundAdjustRange(uint8_t * loCount, uint8_t * loValue, - uint8_t * hiCount, uint8_t * hiValue, size_t offset, const __m256i & threshold, const __m256i & mask) - { - const __m256i _loCount = Load((__m256i*)(loCount + offset)); - const __m256i _loValue = Load((__m256i*)(loValue + offset)); - const __m256i _hiCount = Load((__m256i*)(hiCount + offset)); - const __m256i _hiValue = Load((__m256i*)(hiValue + offset)); - - Store((__m256i*)(loValue + offset), AdjustLo(_loCount, _loValue, mask, threshold)); - Store((__m256i*)(hiValue + offset), AdjustHi(_hiCount, _hiValue, mask, threshold)); - Store((__m256i*)(loCount + offset), K_ZERO); - Store((__m256i*)(hiCount + offset), K_ZERO); - } - - template void BackgroundAdjustRange(uint8_t * loCount, size_t loCountStride, size_t width, size_t height, - uint8_t * loValue, size_t loValueStride, uint8_t * hiCount, size_t hiCountStride, - uint8_t * hiValue, size_t hiValueStride, uint8_t threshold) - { - assert(width >= A); - if (align) - { - assert(Aligned(loValue) && Aligned(loValueStride) && Aligned(hiValue) && Aligned(hiValueStride)); - assert(Aligned(loCount) && Aligned(loCountStride) && Aligned(hiCount) && Aligned(hiCountStride)); - } - - const __m256i _threshold = _mm256_set1_epi8((char)threshold); - size_t alignedWidth = AlignLo(width, A); - __m256i tailMask = SetMask(0, A - width + alignedWidth, 1); - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - BackgroundAdjustRange(loCount, loValue, hiCount, hiValue, col, _threshold, K8_01); - if (alignedWidth != width) - BackgroundAdjustRange(loCount, loValue, hiCount, hiValue, width - A, _threshold, tailMask); - loValue += loValueStride; - hiValue += hiValueStride; - loCount += loCountStride; - hiCount += hiCountStride; - } - } - - void BackgroundAdjustRange(uint8_t * loCount, size_t loCountStride, size_t width, size_t height, - uint8_t * loValue, size_t loValueStride, 
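
AdjustLo/AdjustHi above turn those counters into a slow feedback loop: a bound violated more often than the threshold moves outward by one (the range widens), one violated less often moves inward (the range tightens), and the counters are then reset via the K_ZERO stores. Scalar view of one adjust step (sat_add/sat_sub as in the sketches above):

    if (loCount > threshold) lo = sat_sub(lo, 1);   // widen downward
    if (loCount < threshold) lo = sat_add(lo, 1);   // tighten
    if (hiCount > threshold) hi = sat_add(hi, 1);   // widen upward
    if (hiCount < threshold) hi = sat_sub(hi, 1);   // tighten
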
uint8_t * hiCount, size_t hiCountStride, - uint8_t * hiValue, size_t hiValueStride, uint8_t threshold) - { - if (Aligned(loValue) && Aligned(loValueStride) && Aligned(hiValue) && Aligned(hiValueStride) && - Aligned(loCount) && Aligned(loCountStride) && Aligned(hiCount) && Aligned(hiCountStride)) - BackgroundAdjustRange(loCount, loCountStride, width, height, loValue, loValueStride, - hiCount, hiCountStride, hiValue, hiValueStride, threshold); - else - BackgroundAdjustRange(loCount, loCountStride, width, height, loValue, loValueStride, - hiCount, hiCountStride, hiValue, hiValueStride, threshold); - } - - template SIMD_INLINE void BackgroundAdjustRangeMasked(uint8_t * loCount, uint8_t * loValue, uint8_t * hiCount, uint8_t * hiValue, - const uint8_t * mask, size_t offset, const __m256i & threshold, const __m256i & tailMask) - { - const __m256i _mask = Load((const __m256i*)(mask + offset)); - BackgroundAdjustRange(loCount, loValue, hiCount, hiValue, offset, threshold, _mm256_and_si256(_mask, tailMask)); - } - - template void BackgroundAdjustRangeMasked(uint8_t * loCount, size_t loCountStride, size_t width, size_t height, - uint8_t * loValue, size_t loValueStride, uint8_t * hiCount, size_t hiCountStride, - uint8_t * hiValue, size_t hiValueStride, uint8_t threshold, const uint8_t * mask, size_t maskStride) - { - assert(width >= A); - if (align) - { - assert(Aligned(loValue) && Aligned(loValueStride) && Aligned(hiValue) && Aligned(hiValueStride)); - assert(Aligned(loCount) && Aligned(loCountStride) && Aligned(hiCount) && Aligned(hiCountStride)); - assert(Aligned(mask) && Aligned(maskStride)); - } - - const __m256i _threshold = _mm256_set1_epi8((char)threshold); - size_t alignedWidth = AlignLo(width, A); - __m256i tailMask = SetMask(0, A - width + alignedWidth, 1); - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - BackgroundAdjustRangeMasked(loCount, loValue, hiCount, hiValue, mask, col, _threshold, K8_01); - if (alignedWidth != width) - BackgroundAdjustRangeMasked(loCount, loValue, hiCount, hiValue, mask, width - A, _threshold, tailMask); - loValue += loValueStride; - hiValue += hiValueStride; - loCount += loCountStride; - hiCount += hiCountStride; - mask += maskStride; - } - } - - void BackgroundAdjustRangeMasked(uint8_t * loCount, size_t loCountStride, size_t width, size_t height, - uint8_t * loValue, size_t loValueStride, uint8_t * hiCount, size_t hiCountStride, - uint8_t * hiValue, size_t hiValueStride, uint8_t threshold, const uint8_t * mask, size_t maskStride) - { - if (Aligned(loValue) && Aligned(loValueStride) && Aligned(hiValue) && Aligned(hiValueStride) && - Aligned(loCount) && Aligned(loCountStride) && Aligned(hiCount) && Aligned(hiCountStride) && - Aligned(mask) && Aligned(maskStride)) - BackgroundAdjustRangeMasked(loCount, loCountStride, width, height, loValue, loValueStride, - hiCount, hiCountStride, hiValue, hiValueStride, threshold, mask, maskStride); - else - BackgroundAdjustRangeMasked(loCount, loCountStride, width, height, loValue, loValueStride, - hiCount, hiCountStride, hiValue, hiValueStride, threshold, mask, maskStride); - } - - template SIMD_INLINE void BackgroundShiftRange(const uint8_t * value, uint8_t * lo, uint8_t * hi, size_t offset, __m256i mask) - { - const __m256i _value = Load((__m256i*)(value + offset)); - const __m256i _lo = Load((__m256i*)(lo + offset)); - const __m256i _hi = Load((__m256i*)(hi + offset)); - - const __m256i add = _mm256_and_si256(mask, _mm256_subs_epu8(_value, _hi)); - const __m256i sub = 
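
BackgroundShiftRange translates the whole [lo, hi] interval so that it reaches the observed value while keeping its width, up to 8-bit saturation: _mm256_subs_epu8 produces value - hi when the value overshoots (zero otherwise) and lo - value when it undershoots, and the same correction is applied to both bounds. Scalar view:

    uint8_t add = value > hi ? (uint8_t)(value - hi) : 0;
    uint8_t sub = value < lo ? (uint8_t)(lo - value) : 0;
    lo = sat_sub(sat_add(lo, add), sub);   // sat_* as in the sketches above
    hi = sat_sub(sat_add(hi, add), sub);
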
_mm256_and_si256(mask, _mm256_subs_epu8(_lo, _value)); - - Store((__m256i*)(lo + offset), _mm256_subs_epu8(_mm256_adds_epu8(_lo, add), sub)); - Store((__m256i*)(hi + offset), _mm256_subs_epu8(_mm256_adds_epu8(_hi, add), sub)); - } - - template void BackgroundShiftRange(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * lo, size_t loStride, uint8_t * hi, size_t hiStride) - { - assert(width >= A); - if (align) - { - assert(Aligned(value) && Aligned(valueStride)); - assert(Aligned(lo) && Aligned(loStride)); - assert(Aligned(hi) && Aligned(hiStride)); - } - - size_t alignedWidth = AlignLo(width, A); - __m256i tailMask = SetMask(0, A - width + alignedWidth, 0xFF); - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - BackgroundShiftRange(value, lo, hi, col, K_INV_ZERO); - if (alignedWidth != width) - BackgroundShiftRange(value, lo, hi, width - A, tailMask); - value += valueStride; - lo += loStride; - hi += hiStride; - } - } - - void BackgroundShiftRange(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * lo, size_t loStride, uint8_t * hi, size_t hiStride) - { - if (Aligned(value) && Aligned(valueStride) && Aligned(lo) && Aligned(loStride) && Aligned(hi) && Aligned(hiStride)) - BackgroundShiftRange(value, valueStride, width, height, lo, loStride, hi, hiStride); - else - BackgroundShiftRange(value, valueStride, width, height, lo, loStride, hi, hiStride); - } - - template SIMD_INLINE void BackgroundShiftRangeMasked(const uint8_t * value, uint8_t * lo, uint8_t * hi, const uint8_t * mask, - size_t offset, __m256i tailMask) - { - const __m256i _mask = Load((const __m256i*)(mask + offset)); - BackgroundShiftRange(value, lo, hi, offset, _mm256_and_si256(_mask, tailMask)); - } - - template void BackgroundShiftRangeMasked(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * lo, size_t loStride, uint8_t * hi, size_t hiStride, const uint8_t * mask, size_t maskStride) - { - assert(width >= A); - if (align) - { - assert(Aligned(value) && Aligned(valueStride)); - assert(Aligned(lo) && Aligned(loStride)); - assert(Aligned(hi) && Aligned(hiStride)); - assert(Aligned(mask) && Aligned(maskStride)); - } - - size_t alignedWidth = AlignLo(width, A); - __m256i tailMask = SetMask(0, A - width + alignedWidth, 0xFF); - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - BackgroundShiftRangeMasked(value, lo, hi, mask, col, K_INV_ZERO); - if (alignedWidth != width) - BackgroundShiftRangeMasked(value, lo, hi, mask, width - A, tailMask); - value += valueStride; - lo += loStride; - hi += hiStride; - mask += maskStride; - } - } - - void BackgroundShiftRangeMasked(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * lo, size_t loStride, uint8_t * hi, size_t hiStride, const uint8_t * mask, size_t maskStride) - { - if (Aligned(value) && Aligned(valueStride) && Aligned(lo) && Aligned(loStride) && - Aligned(hi) && Aligned(hiStride) && Aligned(mask) && Aligned(maskStride)) - BackgroundShiftRangeMasked(value, valueStride, width, height, lo, loStride, hi, hiStride, mask, maskStride); - else - BackgroundShiftRangeMasked(value, valueStride, width, height, lo, loStride, hi, hiStride, mask, maskStride); - } - - template SIMD_INLINE void BackgroundInitMask(const uint8_t * src, uint8_t * dst, const __m256i & index, const __m256i & value) - { - __m256i _mask = _mm256_cmpeq_epi8(Load((__m256i*)src), index); - __m256i 
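
The three bitwise steps of BackgroundInitMask that follow are the classic byte-select idiom: cmpeq yields an all-ones byte where src matches index, andnot keeps the old dst bytes everywhere else, and or merges the two halves (_mm256_blendv_epi8 would express the same selection). Scalar equivalent:

    // Per byte: overwrite dst only where the index matches.
    if (src[i] == index) dst[i] = value;
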
_old = _mm256_andnot_si256(_mask, Load((__m256i*)dst)); - __m256i _new = _mm256_and_si256(_mask, value); - Store((__m256i*)dst, _mm256_or_si256(_old, _new)); - } - - template void BackgroundInitMask(const uint8_t * src, size_t srcStride, size_t width, size_t height, - uint8_t index, uint8_t value, uint8_t * dst, size_t dstStride) - { - assert(width >= A); - if (align) - { - assert(Aligned(src) && Aligned(srcStride)); - assert(Aligned(dst) && Aligned(dstStride)); - } - - size_t alignedWidth = AlignLo(width, A); - __m256i _index = _mm256_set1_epi8(index); - __m256i _value = _mm256_set1_epi8(value); - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - BackgroundInitMask(src + col, dst + col, _index, _value); - if (alignedWidth != width) - BackgroundInitMask(src + width - A, dst + width - A, _index, _value); - src += srcStride; - dst += dstStride; - } - } - - void BackgroundInitMask(const uint8_t * src, size_t srcStride, size_t width, size_t height, - uint8_t index, uint8_t value, uint8_t * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride)) - BackgroundInitMask(src, srcStride, width, height, index, value, dst, dstStride); - else - BackgroundInitMask(src, srcStride, width, height, index, value, dst, dstStride); - } - } -#endif// SIMD_AVX2_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx2BayerToBgr.cpp b/src/3rd/Simd/Simd/SimdAvx2BayerToBgr.cpp deleted file mode 100644 index 5bc2b526..00000000 --- a/src/3rd/Simd/Simd/SimdAvx2BayerToBgr.cpp +++ /dev/null @@ -1,107 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2018 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdStore.h" -#include "Simd/SimdMemory.h" -#include "Simd/SimdBayer.h" -#include "Simd/SimdConversion.h" - -namespace Simd -{ -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - template SIMD_INLINE void SaveBgr(const __m256i src[3], uint8_t * dst) - { - Store((__m256i*)dst + 0, InterleaveBgr<0>(src[0], src[1], src[2])); - Store((__m256i*)dst + 1, InterleaveBgr<1>(src[0], src[1], src[2])); - Store((__m256i*)dst + 2, InterleaveBgr<2>(src[0], src[1], src[2])); - } - - template void BayerToBgr(const __m256i src[12], uint8_t * bgr, size_t stride) - { - __m256i _bgr[6]; - BayerToBgr(src, _bgr); - SaveBgr(_bgr + 0, bgr); - SaveBgr(_bgr + 3, bgr + stride); - } - - template void BayerToBgr(const uint8_t * bayer, size_t width, size_t height, size_t bayerStride, uint8_t * bgr, size_t bgrStride) - { - const uint8_t * src[3]; - __m256i _src[12]; - size_t body = AlignHi(width - 2, A) - A; - for (size_t row = 0; row < height; row += 2) - { - src[0] = (row == 0 ? bayer : bayer - 2 * bayerStride); - src[1] = bayer; - src[2] = (row == height - 2 ? bayer : bayer + 2 * bayerStride); - - LoadBayerNose(src, 0, bayerStride, _src); - BayerToBgr(_src, bgr, bgrStride); - for (size_t col = A; col < body; col += A) - { - LoadBayerBody(src, col, bayerStride, _src); - BayerToBgr(_src, bgr + 3 * col, bgrStride); - } - LoadBayerTail(src, width - A, bayerStride, _src); - BayerToBgr(_src, bgr + 3 * (width - A), bgrStride); - - bayer += 2 * bayerStride; - bgr += 2 * bgrStride; - } - } - - template void BayerToBgr(const uint8_t * bayer, size_t width, size_t height, size_t bayerStride, SimdPixelFormatType bayerFormat, uint8_t * bgr, size_t bgrStride) - { - switch (bayerFormat) - { - case SimdPixelFormatBayerGrbg: - BayerToBgr(bayer, width, height, bayerStride, bgr, bgrStride); - break; - case SimdPixelFormatBayerGbrg: - BayerToBgr(bayer, width, height, bayerStride, bgr, bgrStride); - break; - case SimdPixelFormatBayerRggb: - BayerToBgr(bayer, width, height, bayerStride, bgr, bgrStride); - break; - case SimdPixelFormatBayerBggr: - BayerToBgr(bayer, width, height, bayerStride, bgr, bgrStride); - break; - default: - assert(0); - } - } - - void BayerToBgr(const uint8_t * bayer, size_t width, size_t height, size_t bayerStride, SimdPixelFormatType bayerFormat, uint8_t * bgr, size_t bgrStride) - { - assert((width % 2 == 0) && (height % 2 == 0)); - - if (Aligned(bayer) && Aligned(bgr) && Aligned(bayerStride) && Aligned(bgrStride)) - BayerToBgr(bayer, width, height, bayerStride, bayerFormat, bgr, bgrStride); - else - BayerToBgr(bayer, width, height, bayerStride, bayerFormat, bgr, bgrStride); - } - } -#endif// SIMD_AVX2_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx2BayerToBgra.cpp b/src/3rd/Simd/Simd/SimdAvx2BayerToBgra.cpp deleted file mode 100644 index a7cda5e2..00000000 --- a/src/3rd/Simd/Simd/SimdAvx2BayerToBgra.cpp +++ /dev/null @@ -1,112 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2018 Yermalayeu Ihar. 
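
Both Bayer converters in this hunk share one border strategy: vertically, the row above/below the current pair is clamped to the pair itself on the first and last rows (the src[0]/src[2] selection above), and horizontally the LoadBayerNose/LoadBayerBody/LoadBayerTail helpers load left-edge, interior and right-edge windows. Those helpers and the per-pixel arithmetic, BayerToBgr(const __m256i src[12], __m256i bgr[6]), live in SimdBayer.h, which is not part of this hunk; purely as an illustration of what such a filter computes, a minimal bilinear demosaic would rebuild a missing green sample from its four axial neighbours:

    // Illustrative only -- not the library's actual filter:
    // G(x, y) = (G(x-1, y) + G(x+1, y) + G(x, y-1) + G(x, y+1) + 2) >> 2
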
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdStore.h" -#include "Simd/SimdMemory.h" -#include "Simd/SimdBayer.h" - -namespace Simd -{ -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - template SIMD_INLINE void SaveBgra(const __m256i bgr[3], const __m256i & alpha, uint8_t * bgra) - { - __m256i bgLo = PermutedUnpackLoU8(bgr[0], bgr[1]); - __m256i bgHi = PermutedUnpackHiU8(bgr[0], bgr[1]); - __m256i raLo = PermutedUnpackLoU8(bgr[2], alpha); - __m256i raHi = PermutedUnpackHiU8(bgr[2], alpha); - Store((__m256i*)bgra + 0, UnpackU16<0>(bgLo, raLo)); - Store((__m256i*)bgra + 1, UnpackU16<0>(bgHi, raHi)); - Store((__m256i*)bgra + 2, UnpackU16<1>(bgLo, raLo)); - Store((__m256i*)bgra + 3, UnpackU16<1>(bgHi, raHi)); - } - - template void BayerToBgra(const __m256i src[12], const __m256i & alpha, uint8_t * bgra, size_t stride) - { - __m256i bgr[6]; - BayerToBgr(src, bgr); - SaveBgra(bgr + 0, alpha, bgra); - SaveBgra(bgr + 3, alpha, bgra + stride); - } - - template void BayerToBgra(const uint8_t * bayer, size_t width, size_t height, size_t bayerStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha) - { - const uint8_t * src[3]; - __m256i _src[12]; - __m256i _alpha = _mm256_set1_epi8((char)alpha); - size_t body = AlignHi(width - 2, A) - A; - for (size_t row = 0; row < height; row += 2) - { - src[0] = (row == 0 ? bayer : bayer - 2 * bayerStride); - src[1] = bayer; - src[2] = (row == height - 2 ? 
bayer : bayer + 2 * bayerStride); - - LoadBayerNose(src, 0, bayerStride, _src); - BayerToBgra(_src, _alpha, bgra, bgraStride); - for (size_t col = A; col < body; col += A) - { - LoadBayerBody(src, col, bayerStride, _src); - BayerToBgra(_src, _alpha, bgra + 4 * col, bgraStride); - } - LoadBayerTail(src, width - A, bayerStride, _src); - BayerToBgra(_src, _alpha, bgra + 4 * (width - A), bgraStride); - - bayer += 2 * bayerStride; - bgra += 2 * bgraStride; - } - } - - template void BayerToBgra(const uint8_t * bayer, size_t width, size_t height, size_t bayerStride, SimdPixelFormatType bayerFormat, uint8_t * bgra, size_t bgraStride, uint8_t alpha) - { - switch (bayerFormat) - { - case SimdPixelFormatBayerGrbg: - BayerToBgra(bayer, width, height, bayerStride, bgra, bgraStride, alpha); - break; - case SimdPixelFormatBayerGbrg: - BayerToBgra(bayer, width, height, bayerStride, bgra, bgraStride, alpha); - break; - case SimdPixelFormatBayerRggb: - BayerToBgra(bayer, width, height, bayerStride, bgra, bgraStride, alpha); - break; - case SimdPixelFormatBayerBggr: - BayerToBgra(bayer, width, height, bayerStride, bgra, bgraStride, alpha); - break; - default: - assert(0); - } - } - - void BayerToBgra(const uint8_t * bayer, size_t width, size_t height, size_t bayerStride, SimdPixelFormatType bayerFormat, uint8_t * bgra, size_t bgraStride, uint8_t alpha) - { - assert((width % 2 == 0) && (height % 2 == 0)); - - if (Aligned(bayer) && Aligned(bgra) && Aligned(bayerStride) && Aligned(bgraStride)) - BayerToBgra(bayer, width, height, bayerStride, bayerFormat, bgra, bgraStride, alpha); - else - BayerToBgra(bayer, width, height, bayerStride, bayerFormat, bgra, bgraStride, alpha); - } - } -#endif// SIMD_AVX2_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx2BgrToBgra.cpp b/src/3rd/Simd/Simd/SimdAvx2BgrToBgra.cpp deleted file mode 100644 index 980eb016..00000000 --- a/src/3rd/Simd/Simd/SimdAvx2BgrToBgra.cpp +++ /dev/null @@ -1,163 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdStore.h" -#include "Simd/SimdMemory.h" -#include "Simd/SimdConversion.h" - -namespace Simd -{ -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - template SIMD_INLINE void BgrToBgra(const uint8_t * bgr, uint8_t * bgra, __m256i alpha) - { - Store((__m256i*)bgra + 0, BgrToBgra(Load((__m256i*)(bgr + 0)), alpha)); - Store((__m256i*)bgra + 1, BgrToBgra(Load((__m256i*)(bgr + 24)), alpha)); - Store((__m256i*)bgra + 2, BgrToBgra(Load((__m256i*)(bgr + 48)), alpha)); - Store((__m256i*)bgra + 3, BgrToBgra(Load((__m256i*)(bgr + 64)), alpha)); - } - - template void BgrToBgra(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha) - { - assert(width >= A); - if (align) - assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(bgr) && Aligned(bgrStride)); - - size_t alignedWidth = AlignLo(width, A); - - __m256i _alpha = _mm256_slli_si256(_mm256_set1_epi32(alpha), 3); - - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - BgrToBgra(bgr + 3 * col, bgra + 4 * col, _alpha); - if (width != alignedWidth) - BgrToBgra(bgr + 3 * (width - A), bgra + 4 * (width - A), _alpha); - bgr += bgrStride; - bgra += bgraStride; - } - } - - void BgrToBgra(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha) - { - if (Aligned(bgra) && Aligned(bgraStride) && Aligned(bgr) && Aligned(bgrStride)) - BgrToBgra(bgr, width, height, bgrStride, bgra, bgraStride, alpha); - else - BgrToBgra(bgr, width, height, bgrStride, bgra, bgraStride, alpha); - } - - //--------------------------------------------------------------------- - - template SIMD_INLINE void Bgr48pToBgra32(uint8_t * bgra, - const uint8_t * blue, const uint8_t * green, const uint8_t * red, size_t offset, __m256i alpha) - { - __m256i _blue = _mm256_and_si256(LoadPermuted((__m256i*)(blue + offset)), K16_00FF); - __m256i _green = _mm256_and_si256(LoadPermuted((__m256i*)(green + offset)), K16_00FF); - __m256i _red = _mm256_and_si256(LoadPermuted((__m256i*)(red + offset)), K16_00FF); - - __m256i bg = _mm256_or_si256(_blue, _mm256_slli_si256(_green, 1)); - __m256i ra = _mm256_or_si256(_red, alpha); - - Store((__m256i*)bgra + 0, _mm256_unpacklo_epi16(bg, ra)); - Store((__m256i*)bgra + 1, _mm256_unpackhi_epi16(bg, ra)); - } - - template void Bgr48pToBgra32(const uint8_t * blue, size_t blueStride, size_t width, size_t height, - const uint8_t * green, size_t greenStride, const uint8_t * red, size_t redStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha) - { - assert(width >= HA); - if (align) - { - assert(Aligned(blue) && Aligned(blueStride)); - assert(Aligned(green) && Aligned(greenStride)); - assert(Aligned(red) && Aligned(redStride)); - assert(Aligned(bgra) && Aligned(bgraStride)); - } - - __m256i _alpha = _mm256_slli_si256(_mm256_set1_epi16(alpha), 1); - size_t alignedWidth = AlignLo(width, HA); - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0, srcOffset = 0, dstOffset = 0; col < alignedWidth; col += HA, srcOffset += A, dstOffset += DA) - Bgr48pToBgra32(bgra + dstOffset, blue, green, red, srcOffset, _alpha); - if (width != alignedWidth) - Bgr48pToBgra32(bgra + (width - HA) * 4, blue, green, red, (width - HA) * 2, _alpha); - blue += blueStride; - green += greenStride; - red += redStride; - bgra += bgraStride; - } - } - - void Bgr48pToBgra32(const uint8_t * blue, size_t blueStride, size_t width, size_t height, - const uint8_t * green, size_t greenStride, const uint8_t 
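
Bgr48pToBgra32 above packs three 16-bit planar channels into 8-bit BGRA pixels: each sample is masked to its low byte (K16_00FF), green and the pre-shifted alpha are raised to the odd byte positions, and two 16-bit unpacks interleave the B,G and R,A pairs into whole pixels (the LoadPermuted reads compensate for the lane-wise behaviour of the unpacks). A scalar equivalent, assuming little-endian 16-bit samples as the masking implies:

    for (size_t i = 0; i < width; ++i)
    {
        bgra[4*i + 0] = (uint8_t)((const uint16_t*)blue )[i];
        bgra[4*i + 1] = (uint8_t)((const uint16_t*)green)[i];
        bgra[4*i + 2] = (uint8_t)((const uint16_t*)red  )[i];
        bgra[4*i + 3] = alpha;
    }
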
* red, size_t redStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha) - { - if (Aligned(blue) && Aligned(blueStride) && Aligned(green) && Aligned(greenStride) && - Aligned(red) && Aligned(redStride) && Aligned(bgra) && Aligned(bgraStride)) - Bgr48pToBgra32(blue, blueStride, width, height, green, greenStride, red, redStride, bgra, bgraStride, alpha); - else - Bgr48pToBgra32(blue, blueStride, width, height, green, greenStride, red, redStride, bgra, bgraStride, alpha); - } - - //--------------------------------------------------------------------- - - template SIMD_INLINE void RgbToBgra(const uint8_t* rgb, uint8_t* bgra, __m256i alpha) - { - Store((__m256i*)bgra + 0, RgbToBgra(Load((__m256i*)(rgb + 0)), alpha)); - Store((__m256i*)bgra + 1, RgbToBgra(Load((__m256i*)(rgb + 24)), alpha)); - Store((__m256i*)bgra + 2, RgbToBgra(Load((__m256i*)(rgb + 48)), alpha)); - Store((__m256i*)bgra + 3, RgbToBgra(Load((__m256i*)(rgb + 64)), alpha)); - } - - template void RgbToBgra(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha) - { - assert(width >= A); - if (align) - assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(rgb) && Aligned(rgbStride)); - - size_t alignedWidth = AlignLo(width, A); - - __m256i _alpha = _mm256_slli_si256(_mm256_set1_epi32(alpha), 3); - - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - RgbToBgra(rgb + 3 * col, bgra + 4 * col, _alpha); - if (width != alignedWidth) - RgbToBgra(rgb + 3 * (width - A), bgra + 4 * (width - A), _alpha); - rgb += rgbStride; - bgra += bgraStride; - } - } - - void RgbToBgra(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha) - { - if (Aligned(bgra) && Aligned(bgraStride) && Aligned(rgb) && Aligned(rgbStride)) - RgbToBgra(rgb, width, height, rgbStride, bgra, bgraStride, alpha); - else - RgbToBgra(rgb, width, height, rgbStride, bgra, bgraStride, alpha); - } - } -#endif//SIMD_AVX2_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx2BgrToGray.cpp b/src/3rd/Simd/Simd/SimdAvx2BgrToGray.cpp deleted file mode 100644 index a71782e0..00000000 --- a/src/3rd/Simd/Simd/SimdAvx2BgrToGray.cpp +++ /dev/null @@ -1,146 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdConversion.h" - -namespace Simd -{ -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - const __m256i K16_BLUE_RED = SIMD_MM256_SET2_EPI16(Base::BLUE_TO_GRAY_WEIGHT, Base::RED_TO_GRAY_WEIGHT); - const __m256i K16_GREEN_ROUND = SIMD_MM256_SET2_EPI16(Base::GREEN_TO_GRAY_WEIGHT, Base::BGR_TO_GRAY_ROUND_TERM); - - SIMD_INLINE __m256i BgraToGray32(__m256i bgra) - { - const __m256i g0a0 = _mm256_and_si256(_mm256_srli_si256(bgra, 1), K16_00FF); - const __m256i b0r0 = _mm256_and_si256(bgra, K16_00FF); - const __m256i weightedSum = _mm256_add_epi32(_mm256_madd_epi16(g0a0, K16_GREEN_ROUND), _mm256_madd_epi16(b0r0, K16_BLUE_RED)); - return _mm256_srli_epi32(weightedSum, Base::BGR_TO_GRAY_AVERAGING_SHIFT); - } - - SIMD_INLINE __m256i BgraToGray(__m256i bgra[4]) - { - const __m256i lo = PackI32ToI16(BgraToGray32(bgra[0]), BgraToGray32(bgra[1])); - const __m256i hi = PackI32ToI16(BgraToGray32(bgra[2]), BgraToGray32(bgra[3])); - return PackI16ToU8(lo, hi); - } - - template SIMD_INLINE __m256i BgrToGray(const uint8_t * bgr) - { - __m256i bgra[4]; - bgra[0] = BgrToBgra(Load((__m256i*)(bgr + 0)), K32_01000000); - bgra[1] = BgrToBgra(Load((__m256i*)(bgr + 24)), K32_01000000); - bgra[2] = BgrToBgra(Load((__m256i*)(bgr + 48)), K32_01000000); - bgra[3] = BgrToBgra(Load((__m256i*)(bgr + 64)), K32_01000000); - return BgraToGray(bgra); - } - - template void BgrToGray(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * gray, size_t grayStride) - { - assert(width >= A); - if (align) - assert(Aligned(gray) && Aligned(grayStride) && Aligned(bgr) && Aligned(bgrStride)); - - size_t alignedWidth = AlignLo(width, A); - - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - Store((__m256i*)(gray + col), BgrToGray(bgr + 3 * col)); - if (width != alignedWidth) - Store((__m256i*)(gray + width - A), BgrToGray(bgr + 3 * (width - A))); - bgr += bgrStride; - gray += grayStride; - } - } - - void BgrToGray(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * gray, size_t grayStride) - { - if (Aligned(gray) && Aligned(grayStride) && Aligned(bgr) && Aligned(bgrStride)) - BgrToGray(bgr, width, height, bgrStride, gray, grayStride); - else - BgrToGray(bgr, width, height, bgrStride, gray, grayStride); - } - - - //--------------------------------------------------------------------- - - const __m256i K16_RED_BLUE = SIMD_MM256_SET2_EPI16(Base::RED_TO_GRAY_WEIGHT, Base::BLUE_TO_GRAY_WEIGHT); - - SIMD_INLINE __m256i RgbaToGray32(__m256i rgba) - { - const __m256i g0a0 = _mm256_and_si256(_mm256_srli_si256(rgba, 1), K16_00FF); - const __m256i r0b0 = _mm256_and_si256(rgba, K16_00FF); - const __m256i weightedSum = _mm256_add_epi32(_mm256_madd_epi16(g0a0, K16_GREEN_ROUND), _mm256_madd_epi16(r0b0, K16_RED_BLUE)); - return _mm256_srli_epi32(weightedSum, Base::BGR_TO_GRAY_AVERAGING_SHIFT); - } - - SIMD_INLINE __m256i RgbaToGray(__m256i rgba[4]) - { - const __m256i lo = PackI32ToI16(RgbaToGray32(rgba[0]), RgbaToGray32(rgba[1])); - const __m256i hi = PackI32ToI16(RgbaToGray32(rgba[2]), RgbaToGray32(rgba[3])); - return PackI16ToU8(lo, hi); - } - - template SIMD_INLINE __m256i RgbToGray(const uint8_t* rgb) - { - __m256i rgba[4]; - rgba[0] = BgrToBgra(Load((__m256i*)(rgb + 0)), K32_01000000); - rgba[1] = BgrToBgra(Load((__m256i*)(rgb + 24)), K32_01000000); - rgba[2] = BgrToBgra(Load((__m256i*)(rgb + 48)), K32_01000000); - rgba[3] = BgrToBgra(Load((__m256i*)(rgb + 64)), 
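
BgraToGray32 above reduces each BGRA dword with two _mm256_madd_epi16: the even-byte mask isolates the (B, R) pair and the shift-plus-mask isolates (G, A). Since the synthesized alpha byte is 1 (K32_01000000), the second madd contributes G*wG + 1*ROUND, folding the rounding term in at no extra cost. The weight constants are Base:: definitions outside this hunk; assuming conventional BT.601-style fixed-point luma weights, the scalar form is:

    // Scalar sketch; WB/WG/WR/ROUND/SHIFT stand for the Base:: constants:
    uint8_t gray = (uint8_t)((WB*b + WG*g + WR*r + ROUND) >> SHIFT);
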
K32_01000000); - return RgbaToGray(rgba); - } - - template void RgbToGray(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* gray, size_t grayStride) - { - assert(width >= A); - if (align) - assert(Aligned(gray) && Aligned(grayStride) && Aligned(rgb) && Aligned(rgbStride)); - - size_t alignedWidth = AlignLo(width, A); - - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - Store((__m256i*)(gray + col), RgbToGray(rgb + 3 * col)); - if (width != alignedWidth) - Store((__m256i*)(gray + width - A), RgbToGray(rgb + 3 * (width - A))); - rgb += rgbStride; - gray += grayStride; - } - } - - void RgbToGray(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* gray, size_t grayStride) - { - if (Aligned(gray) && Aligned(grayStride) && Aligned(rgb) && Aligned(rgbStride)) - RgbToGray(rgb, width, height, rgbStride, gray, grayStride); - else - RgbToGray(rgb, width, height, rgbStride, gray, grayStride); - } - } -#endif//SIMD_AVX2_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx2BgrToRgb.cpp b/src/3rd/Simd/Simd/SimdAvx2BgrToRgb.cpp deleted file mode 100644 index f14b496a..00000000 --- a/src/3rd/Simd/Simd/SimdAvx2BgrToRgb.cpp +++ /dev/null @@ -1,97 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2019 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" - -namespace Simd -{ -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - const __m256i K8_SHFL_0S0 = SIMD_MM256_SETR_EPI8(0x2, 0x1, 0x0, 0x5, 0x4, 0x3, 0x8, 0x7, 0x6, 0xB, 0xA, 0x9, 0xE, 0xD, 0xC, -1, - 0x0, -1, 0x4, 0x3, 0x2, 0x7, 0x6, 0x5, 0xA, 0x9, 0x8, 0xD, 0xC, 0xB, -1, 0xF); - const __m256i K8_SHFL_0P0 = SIMD_MM256_SETR_EPI8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x9, - -1, 0x7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); - const __m256i K8_SHFL_0P1 = SIMD_MM256_SETR_EPI8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x8, -1); - const __m256i K8_SHFL_1S1 = SIMD_MM256_SETR_EPI8(-1, 0x3, 0x2, 0x1, 0x6, 0x5, 0x4, 0x9, 0x8, 0x7, 0xC, 0xB, 0xA, 0xF, 0xE, 0xD, - 0x2, 0x1, 0x0, 0x5, 0x4, 0x3, 0x8, 0x7, 0x6, 0xB, 0xA, 0x9, 0xE, 0xD, 0xC, -1); - const __m256i K8_SHFL_1P0 = SIMD_MM256_SETR_EPI8(0x6, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); - const __m256i K8_SHFL_1P2 = SIMD_MM256_SETR_EPI8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x9); - const __m256i K8_SHFL_2S2 = SIMD_MM256_SETR_EPI8(0x0, -1, 0x4, 0x3, 0x2, 0x7, 0x6, 0x5, 0xA, 0x9, 0x8, 0xD, 0xC, 0xB, -1, 0xF, - -1, 0x3, 0x2, 0x1, 0x6, 0x5, 0x4, 0x9, 0x8, 0x7, 0xC, 0xB, 0xA, 0xF, 0xE, 0xD); - const __m256i K8_SHFL_2P1 = SIMD_MM256_SETR_EPI8(-1, 0x7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); - const __m256i K8_SHFL_2P2 = SIMD_MM256_SETR_EPI8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x8, -1, - 0x6, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); - - template SIMD_INLINE void BgrToRgb(const uint8_t * src, uint8_t * dst) - { - __m256i s0 = Load((__m256i*)src + 0); - __m256i s1 = Load((__m256i*)src + 1); - __m256i s2 = Load((__m256i*)src + 2); - __m256i p0 = _mm256_permute4x64_epi64(s0, 0x1B); - __m256i p1 = _mm256_permute4x64_epi64(s1, 0x1B); - __m256i p2 = _mm256_permute4x64_epi64(s2, 0x1B); - Store((__m256i*)dst + 0, _mm256_or_si256(_mm256_or_si256(_mm256_shuffle_epi8(s0, K8_SHFL_0S0), - _mm256_shuffle_epi8(p0, K8_SHFL_0P0)), _mm256_shuffle_epi8(p1, K8_SHFL_0P1))); - Store((__m256i*)dst + 1, _mm256_or_si256(_mm256_or_si256(_mm256_shuffle_epi8(s1, K8_SHFL_1S1), - _mm256_shuffle_epi8(p0, K8_SHFL_1P0)), _mm256_shuffle_epi8(p2, K8_SHFL_1P2))); - Store((__m256i*)dst + 2, _mm256_or_si256(_mm256_or_si256(_mm256_shuffle_epi8(s2, K8_SHFL_2S2), - _mm256_shuffle_epi8(p1, K8_SHFL_2P1)), _mm256_shuffle_epi8(p2, K8_SHFL_2P2))); - } - - template void BgrToRgb(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * rgb, size_t rgbStride) - { - assert(width >= A); - if (align) - assert(Aligned(bgr) && Aligned(bgrStride) && Aligned(rgb) && Aligned(rgbStride)); - - const size_t A3 = A * 3; - size_t size = width * 3; - size_t aligned = AlignLo(width, A) * 3; - - for (size_t row = 0; row < height; ++row) - { - for (size_t i = 0; i < aligned; i += A3) - BgrToRgb(bgr + i, rgb + i); - if (aligned < size) - BgrToRgb(bgr + size - A3, rgb + size - A3); - bgr += bgrStride; - rgb += rgbStride; - } - } - - void BgrToRgb(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * rgb, size_t rgbStride) - { - if (Aligned(bgr) && Aligned(bgrStride) && Aligned(rgb) && Aligned(rgbStride)) - 
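
The shuffle tables above implement an in-register B<->R swap on packed 24-bit pixels. _mm256_shuffle_epi8 only permutes within each 128-bit lane, and 16 is not a multiple of 3, so triples straddling a lane boundary need bytes from the neighbouring lane: _mm256_permute4x64_epi64 with 0x1B reverses the four 64-bit words to make those bytes reachable, the K8_SHFL_*P* tables (where -1 yields a zero byte) pick out exactly the strays, and the three partial results are ORed together. The net scalar effect is just:

    for (size_t i = 0; i < width; ++i)
    {
        rgb[3*i + 0] = bgr[3*i + 2];   // R
        rgb[3*i + 1] = bgr[3*i + 1];   // G
        rgb[3*i + 2] = bgr[3*i + 0];   // B
    }
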
BgrToRgb(bgr, bgrStride, width, height, rgb, rgbStride); - else - BgrToRgb(bgr, bgrStride, width, height, rgb, rgbStride); - } - } -#endif//SIMD_AVX2_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx2BgrToYuv.cpp b/src/3rd/Simd/Simd/SimdAvx2BgrToYuv.cpp deleted file mode 100644 index 7b04851e..00000000 --- a/src/3rd/Simd/Simd/SimdAvx2BgrToYuv.cpp +++ /dev/null @@ -1,242 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdConversion.h" - -namespace Simd -{ -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - template SIMD_INLINE void LoadBgr(const __m256i * p, __m256i & blue, __m256i & green, __m256i & red) - { - __m256i bgr[3]; - bgr[0] = Load(p + 0); - bgr[1] = Load(p + 1); - bgr[2] = Load(p + 2); - blue = BgrToBlue(bgr); - green = BgrToGreen(bgr); - red = BgrToRed(bgr); - } - -#if defined(_MSC_VER) // Workaround for Visual Studio 2012 compiler bug in release mode: - SIMD_INLINE __m256i Average16(const __m256i & s0, const __m256i & s1) - { - return _mm256_srli_epi16(_mm256_add_epi16(_mm256_add_epi16( - _mm256_hadd_epi16(_mm256_unpacklo_epi8(s0, K_ZERO), _mm256_unpackhi_epi8(s0, K_ZERO)), - _mm256_hadd_epi16(_mm256_unpacklo_epi8(s1, K_ZERO), _mm256_unpackhi_epi8(s1, K_ZERO))), K16_0002), 2); - } -#else - SIMD_INLINE __m256i Average16(const __m256i & s0, const __m256i & s1) - { - return _mm256_srli_epi16(_mm256_add_epi16(_mm256_add_epi16(_mm256_maddubs_epi16(s0, K8_01), _mm256_maddubs_epi16(s1, K8_01)), K16_0002), 2); - } -#endif - - template SIMD_INLINE void BgrToYuv420p(const uint8_t * bgr0, size_t bgrStride, uint8_t * y0, size_t yStride, uint8_t * u, uint8_t * v) - { - const uint8_t * bgr1 = bgr0 + bgrStride; - uint8_t * y1 = y0 + yStride; - - __m256i blue[2][2], green[2][2], red[2][2]; - - LoadBgr((__m256i*)bgr0 + 0, blue[0][0], green[0][0], red[0][0]); - Store((__m256i*)y0 + 0, BgrToY8(blue[0][0], green[0][0], red[0][0])); - - LoadBgr((__m256i*)bgr0 + 3, blue[0][1], green[0][1], red[0][1]); - Store((__m256i*)y0 + 1, BgrToY8(blue[0][1], green[0][1], red[0][1])); - - LoadBgr((__m256i*)bgr1 + 0, blue[1][0], green[1][0], red[1][0]); - Store((__m256i*)y1 + 0, BgrToY8(blue[1][0], green[1][0], red[1][0])); - - LoadBgr((__m256i*)bgr1 + 3, blue[1][1], green[1][1], red[1][1]); - Store((__m256i*)y1 + 1, BgrToY8(blue[1][1], green[1][1], red[1][1])); - - blue[0][0] = Average16(blue[0][0], blue[1][0]); - 
blue[0][1] = Average16(blue[0][1], blue[1][1]); - green[0][0] = Average16(green[0][0], green[1][0]); - green[0][1] = Average16(green[0][1], green[1][1]); - red[0][0] = Average16(red[0][0], red[1][0]); - red[0][1] = Average16(red[0][1], red[1][1]); - - Store((__m256i*)u, PackI16ToU8(BgrToU16(blue[0][0], green[0][0], red[0][0]), BgrToU16(blue[0][1], green[0][1], red[0][1]))); - Store((__m256i*)v, PackI16ToU8(BgrToV16(blue[0][0], green[0][0], red[0][0]), BgrToV16(blue[0][1], green[0][1], red[0][1]))); - } - - template void BgrToYuv420p(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * y, size_t yStride, - uint8_t * u, size_t uStride, uint8_t * v, size_t vStride) - { - assert((width % 2 == 0) && (height % 2 == 0) && (width >= DA) && (height >= 2)); - if (align) - { - assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride)); - assert(Aligned(v) && Aligned(vStride) && Aligned(bgr) && Aligned(bgrStride)); - } - - size_t alignedWidth = AlignLo(width, DA); - const size_t A6 = A * 6; - for (size_t row = 0; row < height; row += 2) - { - for (size_t colUV = 0, colY = 0, colBgr = 0; colY < alignedWidth; colY += DA, colUV += A, colBgr += A6) - BgrToYuv420p(bgr + colBgr, bgrStride, y + colY, yStride, u + colUV, v + colUV); - if (width != alignedWidth) - { - size_t offset = width - DA; - BgrToYuv420p(bgr + offset * 3, bgrStride, y + offset, yStride, u + offset / 2, v + offset / 2); - } - y += 2 * yStride; - u += uStride; - v += vStride; - bgr += 2 * bgrStride; - } - } - - void BgrToYuv420p(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * y, size_t yStride, - uint8_t * u, size_t uStride, uint8_t * v, size_t vStride) - { - if (Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride) - && Aligned(v) && Aligned(vStride) && Aligned(bgr) && Aligned(bgrStride)) - BgrToYuv420p(bgr, width, height, bgrStride, y, yStride, u, uStride, v, vStride); - else - BgrToYuv420p(bgr, width, height, bgrStride, y, yStride, u, uStride, v, vStride); - } - - SIMD_INLINE void Average16(__m256i & a) - { -#ifdef SIMD_MADDUBS_ERROR - a = _mm256_srli_epi16(_mm256_add_epi16(_mm256_hadd_epi16(_mm256_unpacklo_epi8(a, K_ZERO), _mm256_unpackhi_epi8(a, K_ZERO)), K16_0001), 1); -#else - a = _mm256_srli_epi16(_mm256_add_epi16(_mm256_maddubs_epi16(a, K8_01), K16_0001), 1); -#endif - } - - template SIMD_INLINE void BgrToYuv422p(const uint8_t * bgr, uint8_t * y, uint8_t * u, uint8_t * v) - { - __m256i blue[2], green[2], red[2]; - - LoadBgr((__m256i*)bgr + 0, blue[0], green[0], red[0]); - Store((__m256i*)y + 0, BgrToY8(blue[0], green[0], red[0])); - - LoadBgr((__m256i*)bgr + 3, blue[1], green[1], red[1]); - Store((__m256i*)y + 1, BgrToY8(blue[1], green[1], red[1])); - - Average16(blue[0]); - Average16(blue[1]); - Average16(green[0]); - Average16(green[1]); - Average16(red[0]); - Average16(red[1]); - - Store((__m256i*)u, PackI16ToU8(BgrToU16(blue[0], green[0], red[0]), BgrToU16(blue[1], green[1], red[1]))); - Store((__m256i*)v, PackI16ToU8(BgrToV16(blue[0], green[0], red[0]), BgrToV16(blue[1], green[1], red[1]))); - } - - template void BgrToYuv422p(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * y, size_t yStride, - uint8_t * u, size_t uStride, uint8_t * v, size_t vStride) - { - assert((width % 2 == 0) && (width >= DA)); - if (align) - { - assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride)); - assert(Aligned(v) && Aligned(vStride) && Aligned(bgr) && Aligned(bgrStride)); - } - - size_t alignedWidth = AlignLo(width, DA); 
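
Average16 above computes the rounded means used for chroma subsampling: _mm256_maddubs_epi16 against a vector of ones sums adjacent unsigned bytes into 16-bit lanes (the #ifdef'd unpack/hadd path works around the MSVC maddubs issue noted in the comments), then a bias and shift finish the average. In 4:2:0 each U/V sample covers a 2x2 block across a row pair; in 4:2:2 only a horizontal pair:

    // Scalar equivalents of the two chroma averages:
    uint8_t uv420 = (uint8_t)((p00 + p01 + p10 + p11 + 2) >> 2);   // 2x2, rounded
    uint8_t uv422 = (uint8_t)((p0 + p1 + 1) >> 1);                 // 1x2, rounded
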
- const size_t A6 = A * 6; - for (size_t row = 0; row < height; ++row) - { - for (size_t colUV = 0, colY = 0, colBgr = 0; colY < alignedWidth; colY += DA, colUV += A, colBgr += A6) - BgrToYuv422p(bgr + colBgr, y + colY, u + colUV, v + colUV); - if (width != alignedWidth) - { - size_t offset = width - DA; - BgrToYuv422p(bgr + offset * 3, y + offset, u + offset / 2, v + offset / 2); - } - y += yStride; - u += uStride; - v += vStride; - bgr += bgrStride; - } - } - - void BgrToYuv422p(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * y, size_t yStride, - uint8_t * u, size_t uStride, uint8_t * v, size_t vStride) - { - if (Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride) - && Aligned(v) && Aligned(vStride) && Aligned(bgr) && Aligned(bgrStride)) - BgrToYuv422p(bgr, width, height, bgrStride, y, yStride, u, uStride, v, vStride); - else - BgrToYuv422p(bgr, width, height, bgrStride, y, yStride, u, uStride, v, vStride); - } - - template SIMD_INLINE void BgrToYuv444p(const uint8_t * bgr, uint8_t * y, uint8_t * u, uint8_t * v) - { - __m256i blue, green, red; - LoadBgr((__m256i*)bgr, blue, green, red); - Store((__m256i*)y, BgrToY8(blue, green, red)); - Store((__m256i*)u, BgrToU8(blue, green, red)); - Store((__m256i*)v, BgrToV8(blue, green, red)); - } - - template void BgrToYuv444p(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * y, size_t yStride, - uint8_t * u, size_t uStride, uint8_t * v, size_t vStride) - { - assert(width >= A); - if (align) - { - assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride)); - assert(Aligned(v) && Aligned(vStride) && Aligned(bgr) && Aligned(bgrStride)); - } - - size_t alignedWidth = AlignLo(width, A); - const size_t A3 = A * 3; - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0, colBgr = 0; col < alignedWidth; col += A, colBgr += A3) - BgrToYuv444p(bgr + colBgr, y + col, u + col, v + col); - if (width != alignedWidth) - { - size_t col = width - A; - BgrToYuv444p(bgr + col * 3, y + col, u + col, v + col); - } - y += yStride; - u += uStride; - v += vStride; - bgr += bgrStride; - } - } - - void BgrToYuv444p(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * y, size_t yStride, - uint8_t * u, size_t uStride, uint8_t * v, size_t vStride) - { - if (Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride) - && Aligned(v) && Aligned(vStride) && Aligned(bgr) && Aligned(bgrStride)) - BgrToYuv444p(bgr, width, height, bgrStride, y, yStride, u, uStride, v, vStride); - else - BgrToYuv444p(bgr, width, height, bgrStride, y, yStride, u, uStride, v, vStride); - } - } -#endif// SIMD_AVX2_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx2BgraToBgr.cpp b/src/3rd/Simd/Simd/SimdAvx2BgraToBgr.cpp deleted file mode 100644 index 1fa7e2f3..00000000 --- a/src/3rd/Simd/Simd/SimdAvx2BgraToBgr.cpp +++ /dev/null @@ -1,115 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdStore.h" -#include "Simd/SimdMemory.h" - -namespace Simd -{ -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - const __m256i K8_SUFFLE_BGRA_TO_BGR = SIMD_MM256_SETR_EPI8( - 0x0, 0x1, 0x2, 0x4, 0x5, 0x6, 0x8, 0x9, 0xA, 0xC, 0xD, 0xE, -1, -1, -1, -1, - 0x0, 0x1, 0x2, 0x4, 0x5, 0x6, 0x8, 0x9, 0xA, 0xC, 0xD, 0xE, -1, -1, -1, -1); - - const __m256i K32_PERMUTE_BGRA_TO_BGR = SIMD_MM256_SETR_EPI32(0x0, 0x1, 0x2, 0x4, 0x5, 0x6, -1, -1); - - template SIMD_INLINE __m256i BgraToBgr(const uint8_t* bgra) - { - __m256i _bgra = Load((__m256i*)bgra); - return _mm256_permutevar8x32_epi32(_mm256_shuffle_epi8(_bgra, K8_SUFFLE_BGRA_TO_BGR), K32_PERMUTE_BGRA_TO_BGR); - } - - template void BgraToBgr(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * bgr, size_t bgrStride) - { - assert(width >= F); - if (align) - assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(bgr) && Aligned(bgrStride)); - - size_t widthF = AlignLo(width, F); - if (width == widthF) - widthF -= F; - - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < widthF; col += F) - Store((__m256i*)(bgr + 3 * col), BgraToBgr(bgra + 4 * col)); - if (width != widthF) - Store24(bgr + 3 * (width - F), BgraToBgr(bgra + 4 * (width - F))); - bgra += bgraStride; - bgr += bgrStride; - } - } - - void BgraToBgr(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * bgr, size_t bgrStride) - { - if (Aligned(bgra) && Aligned(bgraStride) && Aligned(bgr) && Aligned(bgrStride)) - BgraToBgr(bgra, width, height, bgraStride, bgr, bgrStride); - else - BgraToBgr(bgra, width, height, bgraStride, bgr, bgrStride); - } - - //--------------------------------------------------------------------- - - const __m256i K8_SUFFLE_BGRA_TO_RGB = SIMD_MM256_SETR_EPI8( - 0x2, 0x1, 0x0, 0x6, 0x5, 0x4, 0xA, 0x9, 0x8, 0xE, 0xD, 0xC, -1, -1, -1, -1, - 0x2, 0x1, 0x0, 0x6, 0x5, 0x4, 0xA, 0x9, 0x8, 0xE, 0xD, 0xC, -1, -1, -1, -1); - - template SIMD_INLINE __m256i BgraToRgb(const uint8_t* bgra) - { - __m256i _bgra = Load((__m256i*)bgra); - return _mm256_permutevar8x32_epi32(_mm256_shuffle_epi8(_bgra, K8_SUFFLE_BGRA_TO_RGB), K32_PERMUTE_BGRA_TO_BGR); - } - - template void BgraToRgb(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgb, size_t rgbStride) - { - assert(width >= F); - if (align) - assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(rgb) && Aligned(rgbStride)); - - size_t widthF = AlignLo(width, F); - if 
(width == widthF) - widthF -= F; - - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < widthF; col += F) - Store((__m256i*)(rgb + 3 * col), BgraToRgb(bgra + 4 * col)); - if (width != widthF) - Store24(rgb + 3 * (width - F), BgraToRgb(bgra + 4 * (width - F))); - bgra += bgraStride; - rgb += rgbStride; - } - } - - void BgraToRgb(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgb, size_t rgbStride) - { - if (Aligned(bgra) && Aligned(bgraStride) && Aligned(rgb) && Aligned(rgbStride)) - BgraToRgb(bgra, width, height, bgraStride, rgb, rgbStride); - else - BgraToRgb(bgra, width, height, bgraStride, rgb, rgbStride); - } - } -#endif// SIMD_AVX2_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx2BgraToGray.cpp b/src/3rd/Simd/Simd/SimdAvx2BgraToGray.cpp deleted file mode 100644 index 832c29f5..00000000 --- a/src/3rd/Simd/Simd/SimdAvx2BgraToGray.cpp +++ /dev/null @@ -1,94 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdStore.h" -#include "Simd/SimdMemory.h" -#include "Simd/SimdConversion.h" - -namespace Simd -{ -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - const __m256i K16_BLUE_RED = SIMD_MM256_SET2_EPI16(Base::BLUE_TO_GRAY_WEIGHT, Base::RED_TO_GRAY_WEIGHT); - const __m256i K16_GREEN_0000 = SIMD_MM256_SET2_EPI16(Base::GREEN_TO_GRAY_WEIGHT, 0x0000); - const __m256i K32_ROUND_TERM = SIMD_MM256_SET1_EPI32(Base::BGR_TO_GRAY_ROUND_TERM); - - SIMD_INLINE __m256i BgraToGray32(__m256i bgra) - { - const __m256i g0a0 = _mm256_and_si256(_mm256_srli_si256(bgra, 1), K16_00FF); - const __m256i b0r0 = _mm256_and_si256(bgra, K16_00FF); - const __m256i weightedSum = _mm256_add_epi32(_mm256_madd_epi16(g0a0, K16_GREEN_0000), _mm256_madd_epi16(b0r0, K16_BLUE_RED)); - return _mm256_srli_epi32(_mm256_add_epi32(weightedSum, K32_ROUND_TERM), Base::BGR_TO_GRAY_AVERAGING_SHIFT); - } - - SIMD_INLINE __m256i BgraToGray(__m256i bgra[4]) - { - const __m256i lo = PackI32ToI16(BgraToGray32(bgra[0]), BgraToGray32(bgra[1])); - const __m256i hi = PackI32ToI16(BgraToGray32(bgra[2]), BgraToGray32(bgra[3])); - return PackI16ToU8(lo, hi); - } - - template <bool align> SIMD_INLINE void Load(const uint8_t* p, __m256i a[4]) - { - a[0] = Load<align>((__m256i*)p + 0); - a[1] = Load<align>((__m256i*)p + 1); - a[2] = Load<align>((__m256i*)p + 2); - a[3] = Load<align>((__m256i*)p + 3); - } - - template <bool align> void BgraToGray(const uint8_t *bgra, size_t width, size_t height, size_t bgraStride, uint8_t *gray, size_t grayStride) - { - assert(width >= A); - if (align) - assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(gray) && Aligned(grayStride)); - - size_t alignedWidth = AlignLo(width, A); - __m256i a[4]; - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - { - Load<align>(bgra + 4 * col, a); - Store<align>((__m256i*)(gray + col), BgraToGray(a)); - } - if (alignedWidth != width) - { - Load<false>(bgra + 4 * (width - A), a); - Store<false>((__m256i*)(gray + width - A), BgraToGray(a)); - } - bgra += bgraStride; - gray += grayStride; - } - } - - void BgraToGray(const uint8_t *bgra, size_t width, size_t height, size_t bgraStride, uint8_t *gray, size_t grayStride) - { - if (Aligned(bgra) && Aligned(gray) && Aligned(bgraStride) && Aligned(grayStride)) - BgraToGray<true>(bgra, width, height, bgraStride, gray, grayStride); - else - BgraToGray<false>(bgra, width, height, bgraStride, gray, grayStride); - } - } -#endif// SIMD_AVX2_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx2BgraToYuv.cpp b/src/3rd/Simd/Simd/SimdAvx2BgraToYuv.cpp deleted file mode 100644 index d8c4cfb1..00000000 --- a/src/3rd/Simd/Simd/SimdAvx2BgraToYuv.cpp +++ /dev/null @@ -1,341 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdConversion.h" - -namespace Simd -{ -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - template <bool align> SIMD_INLINE void LoadPreparedBgra16(const __m256i * bgra, __m256i & b16_r16, __m256i & g16_1) - { - __m256i _bgra = Load<align>(bgra); - b16_r16 = _mm256_and_si256(_bgra, K16_00FF); - g16_1 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_si256(_bgra, 1), K32_000000FF), K32_00010000); - } - - template <bool align> SIMD_INLINE __m256i LoadAndConvertY16(const __m256i * bgra, __m256i & b16_r16, __m256i & g16_1) - { - __m256i _b16_r16[2], _g16_1[2]; - LoadPreparedBgra16<align>(bgra + 0, _b16_r16[0], _g16_1[0]); - LoadPreparedBgra16<align>(bgra + 1, _b16_r16[1], _g16_1[1]); - b16_r16 = _mm256_permute4x64_epi64(_mm256_hadd_epi32(_b16_r16[0], _b16_r16[1]), 0xD8); - g16_1 = _mm256_permute4x64_epi64(_mm256_hadd_epi32(_g16_1[0], _g16_1[1]), 0xD8); - return SaturateI16ToU8(_mm256_add_epi16(K16_Y_ADJUST, PackI32ToI16(BgrToY32(_b16_r16[0], _g16_1[0]), BgrToY32(_b16_r16[1], _g16_1[1])))); - } - - template <bool align> SIMD_INLINE __m256i LoadAndConvertY8(const __m256i * bgra, __m256i b16_r16[2], __m256i g16_1[2]) - { - return PackI16ToU8(LoadAndConvertY16<align>(bgra + 0, b16_r16[0], g16_1[0]), LoadAndConvertY16<align>(bgra + 2, b16_r16[1], g16_1[1])); - } - - SIMD_INLINE void Average16(__m256i & a, const __m256i & b) - { - a = _mm256_srli_epi16(_mm256_add_epi16(_mm256_add_epi16(a, b), K16_0002), 2); - } - - SIMD_INLINE __m256i ConvertU16(__m256i b16_r16[2], __m256i g16_1[2]) - { - return SaturateI16ToU8(_mm256_add_epi16(K16_UV_ADJUST, PackI32ToI16(BgrToU32(b16_r16[0], g16_1[0]), BgrToU32(b16_r16[1], g16_1[1])))); - } - - SIMD_INLINE __m256i ConvertV16(__m256i b16_r16[2], __m256i g16_1[2]) - { - return SaturateI16ToU8(_mm256_add_epi16(K16_UV_ADJUST, PackI32ToI16(BgrToV32(b16_r16[0], g16_1[0]), BgrToV32(b16_r16[1], g16_1[1])))); - } - - template <bool align> SIMD_INLINE void BgraToYuv420p(const uint8_t * bgra0, size_t bgraStride, uint8_t * y0, size_t yStride, uint8_t * u, uint8_t * v) - { - const uint8_t * bgra1 = bgra0 + bgraStride; - uint8_t * y1 = y0 + yStride; - - __m256i _b16_r16[2][2][2], _g16_1[2][2][2]; - Store<align>((__m256i*)y0 + 0, LoadAndConvertY8<align>((__m256i*)bgra0 + 0, _b16_r16[0][0], _g16_1[0][0])); - Store<align>((__m256i*)y0 + 1, LoadAndConvertY8<align>((__m256i*)bgra0 + 4, _b16_r16[0][1], _g16_1[0][1])); - Store<align>((__m256i*)y1 + 0, LoadAndConvertY8<align>((__m256i*)bgra1 + 0, _b16_r16[1][0], _g16_1[1][0])); - Store<align>((__m256i*)y1 + 1, LoadAndConvertY8<align>((__m256i*)bgra1 + 4, _b16_r16[1][1], _g16_1[1][1])); - - Average16(_b16_r16[0][0][0], _b16_r16[1][0][0]); - Average16(_b16_r16[0][0][1], _b16_r16[1][0][1]); - Average16(_b16_r16[0][1][0], _b16_r16[1][1][0]); - Average16(_b16_r16[0][1][1], _b16_r16[1][1][1]); - - Average16(_g16_1[0][0][0], _g16_1[1][0][0]); - Average16(_g16_1[0][0][1], _g16_1[1][0][1]); - Average16(_g16_1[0][1][0], _g16_1[1][1][0]); - Average16(_g16_1[0][1][1], _g16_1[1][1][1]); - - Store<align>((__m256i*)u, PackI16ToU8(ConvertU16(_b16_r16[0][0], _g16_1[0][0]), ConvertU16(_b16_r16[0][1], _g16_1[0][1]))); - Store<align>((__m256i*)v, PackI16ToU8(ConvertV16(_b16_r16[0][0], _g16_1[0][0]), ConvertV16(_b16_r16[0][1], _g16_1[0][1]))); - } - - template <bool align> void BgraToYuv420p(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * y, size_t
yStride, - uint8_t * u, size_t uStride, uint8_t * v, size_t vStride) - { - assert((width % 2 == 0) && (height % 2 == 0) && (width >= DA) && (height >= 2)); - if (align) - { - assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride)); - assert(Aligned(v) && Aligned(vStride) && Aligned(bgra) && Aligned(bgraStride)); - } - - size_t alignedWidth = AlignLo(width, DA); - const size_t A8 = A * 8; - for (size_t row = 0; row < height; row += 2) - { - for (size_t colUV = 0, colY = 0, colBgra = 0; colY < alignedWidth; colY += DA, colUV += A, colBgra += A8) - BgraToYuv420p(bgra + colBgra, bgraStride, y + colY, yStride, u + colUV, v + colUV); - if (width != alignedWidth) - { - size_t offset = width - DA; - BgraToYuv420p(bgra + offset * 4, bgraStride, y + offset, yStride, u + offset / 2, v + offset / 2); - } - y += 2 * yStride; - u += uStride; - v += vStride; - bgra += 2 * bgraStride; - } - } - - void BgraToYuv420p(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * y, size_t yStride, - uint8_t * u, size_t uStride, uint8_t * v, size_t vStride) - { - if (Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride) - && Aligned(v) && Aligned(vStride) && Aligned(bgra) && Aligned(bgraStride)) - BgraToYuv420p(bgra, width, height, bgraStride, y, yStride, u, uStride, v, vStride); - else - BgraToYuv420p(bgra, width, height, bgraStride, y, yStride, u, uStride, v, vStride); - } - - SIMD_INLINE void Average16(__m256i a[2][2]) - { - a[0][0] = _mm256_srli_epi16(_mm256_add_epi16(a[0][0], K16_0001), 1); - a[0][1] = _mm256_srli_epi16(_mm256_add_epi16(a[0][1], K16_0001), 1); - a[1][0] = _mm256_srli_epi16(_mm256_add_epi16(a[1][0], K16_0001), 1); - a[1][1] = _mm256_srli_epi16(_mm256_add_epi16(a[1][1], K16_0001), 1); - } - - template SIMD_INLINE void BgraToYuv422p(const uint8_t * bgra, uint8_t * y, uint8_t * u, uint8_t * v) - { - __m256i _b16_r16[2][2], _g16_1[2][2]; - Store((__m256i*)y + 0, LoadAndConvertY8((__m256i*)bgra + 0, _b16_r16[0], _g16_1[0])); - Store((__m256i*)y + 1, LoadAndConvertY8((__m256i*)bgra + 4, _b16_r16[1], _g16_1[1])); - - Average16(_b16_r16); - Average16(_g16_1); - - Store((__m256i*)u, PackI16ToU8(ConvertU16(_b16_r16[0], _g16_1[0]), ConvertU16(_b16_r16[1], _g16_1[1]))); - Store((__m256i*)v, PackI16ToU8(ConvertV16(_b16_r16[0], _g16_1[0]), ConvertV16(_b16_r16[1], _g16_1[1]))); - } - - template void BgraToYuv422p(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * y, size_t yStride, - uint8_t * u, size_t uStride, uint8_t * v, size_t vStride) - { - assert((width % 2 == 0) && (width >= DA)); - if (align) - { - assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride)); - assert(Aligned(v) && Aligned(vStride) && Aligned(bgra) && Aligned(bgraStride)); - } - - size_t alignedWidth = AlignLo(width, DA); - const size_t A8 = A * 8; - for (size_t row = 0; row < height; ++row) - { - for (size_t colUV = 0, colY = 0, colBgra = 0; colY < alignedWidth; colY += DA, colUV += A, colBgra += A8) - BgraToYuv422p(bgra + colBgra, y + colY, u + colUV, v + colUV); - if (width != alignedWidth) - { - size_t offset = width - DA; - BgraToYuv422p(bgra + offset * 4, y + offset, u + offset / 2, v + offset / 2); - } - y += yStride; - u += uStride; - v += vStride; - bgra += bgraStride; - } - } - - void BgraToYuv422p(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * y, size_t yStride, - uint8_t * u, size_t uStride, uint8_t * v, size_t vStride) - { - if (Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride) - && 
Aligned(v) && Aligned(vStride) && Aligned(bgra) && Aligned(bgraStride)) - BgraToYuv422p(bgra, width, height, bgraStride, y, yStride, u, uStride, v, vStride); - else - BgraToYuv422p(bgra, width, height, bgraStride, y, yStride, u, uStride, v, vStride); - } - - SIMD_INLINE __m256i ConvertY16(__m256i b16_r16[2], __m256i g16_1[2]) - { - return SaturateI16ToU8(_mm256_add_epi16(K16_Y_ADJUST, PackI32ToI16(BgrToY32(b16_r16[0], g16_1[0]), BgrToY32(b16_r16[1], g16_1[1])))); - } - - template SIMD_INLINE void BgraToYuv444p(const uint8_t * bgra, uint8_t * y, uint8_t * u, uint8_t * v) - { - __m256i _b16_r16[2][2], _g16_1[2][2]; - LoadPreparedBgra16((__m256i*)bgra + 0, _b16_r16[0][0], _g16_1[0][0]); - LoadPreparedBgra16((__m256i*)bgra + 1, _b16_r16[0][1], _g16_1[0][1]); - LoadPreparedBgra16((__m256i*)bgra + 2, _b16_r16[1][0], _g16_1[1][0]); - LoadPreparedBgra16((__m256i*)bgra + 3, _b16_r16[1][1], _g16_1[1][1]); - - Store((__m256i*)y, PackI16ToU8(ConvertY16(_b16_r16[0], _g16_1[0]), ConvertY16(_b16_r16[1], _g16_1[1]))); - Store((__m256i*)u, PackI16ToU8(ConvertU16(_b16_r16[0], _g16_1[0]), ConvertU16(_b16_r16[1], _g16_1[1]))); - Store((__m256i*)v, PackI16ToU8(ConvertV16(_b16_r16[0], _g16_1[0]), ConvertV16(_b16_r16[1], _g16_1[1]))); - } - - template void BgraToYuv444p(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * y, size_t yStride, - uint8_t * u, size_t uStride, uint8_t * v, size_t vStride) - { - assert(width >= A); - if (align) - { - assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride)); - assert(Aligned(v) && Aligned(vStride) && Aligned(bgra) && Aligned(bgraStride)); - } - - size_t alignedWidth = AlignLo(width, A); - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0, colBgra = 0; col < alignedWidth; col += A, colBgra += QA) - BgraToYuv444p(bgra + colBgra, y + col, u + col, v + col); - if (width != alignedWidth) - { - size_t offset = width - A; - BgraToYuv444p(bgra + offset * 4, y + offset, u + offset, v + offset); - } - y += yStride; - u += uStride; - v += vStride; - bgra += bgraStride; - } - } - - void BgraToYuv444p(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * y, size_t yStride, - uint8_t * u, size_t uStride, uint8_t * v, size_t vStride) - { - if (Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride) - && Aligned(v) && Aligned(vStride) && Aligned(bgra) && Aligned(bgraStride)) - BgraToYuv444p(bgra, width, height, bgraStride, y, yStride, u, uStride, v, vStride); - else - BgraToYuv444p(bgra, width, height, bgraStride, y, yStride, u, uStride, v, vStride); - } - - template SIMD_INLINE void LoadPreparedBgra16(const __m256i * bgra, __m256i & b16_r16, __m256i & g16_1, __m256i & a32) - { - __m256i _bgra = Load(bgra); - b16_r16 = _mm256_and_si256(_bgra, K16_00FF); - g16_1 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_si256(_bgra, 1), K32_000000FF), K32_00010000); - a32 = _mm256_and_si256(_mm256_srli_si256(_bgra, 3), K32_000000FF); - } - - template SIMD_INLINE void LoadAndConvertYA16(const __m256i * bgra, __m256i & b16_r16, __m256i & g16_1, __m256i & y16, __m256i & a16) - { - __m256i _b16_r16[2], _g16_1[2], a32[2]; - LoadPreparedBgra16(bgra + 0, _b16_r16[0], _g16_1[0], a32[0]); - LoadPreparedBgra16(bgra + 1, _b16_r16[1], _g16_1[1], a32[1]); - b16_r16 = _mm256_permute4x64_epi64(_mm256_hadd_epi32(_b16_r16[0], _b16_r16[1]), 0xD8); - g16_1 = _mm256_permute4x64_epi64(_mm256_hadd_epi32(_g16_1[0], _g16_1[1]), 0xD8); - y16 = SaturateI16ToU8(_mm256_add_epi16(K16_Y_ADJUST, PackI32ToI16(BgrToY32(_b16_r16[0], 
_g16_1[0]), BgrToY32(_b16_r16[1], _g16_1[1])))); - a16 = PackI32ToI16(a32[0], a32[1]); - } - - template SIMD_INLINE void LoadAndStoreYA(const __m256i * bgra, __m256i b16_r16[2], __m256i g16_1[2], __m256i * y, __m256i * a) - { - __m256i y16[2], a16[2]; - LoadAndConvertYA16(bgra + 0, b16_r16[0], g16_1[0], y16[0], a16[0]); - LoadAndConvertYA16(bgra + 2, b16_r16[1], g16_1[1], y16[1], a16[1]); - Store(y, PackI16ToU8(y16[0], y16[1])); - Store(a, PackI16ToU8(a16[0], a16[1])); - } - - template SIMD_INLINE void BgraToYuva420p(const uint8_t * bgra0, size_t bgraStride, uint8_t * y0, size_t yStride, uint8_t * u, uint8_t * v, uint8_t * a0, size_t aStride) - { - const uint8_t * bgra1 = bgra0 + bgraStride; - uint8_t * y1 = y0 + yStride; - uint8_t * a1 = a0 + aStride; - - __m256i _b16_r16[2][2][2], _g16_1[2][2][2]; - LoadAndStoreYA((__m256i*)bgra0 + 0, _b16_r16[0][0], _g16_1[0][0], (__m256i*)y0 + 0, (__m256i*)a0 + 0); - LoadAndStoreYA((__m256i*)bgra0 + 4, _b16_r16[0][1], _g16_1[0][1], (__m256i*)y0 + 1, (__m256i*)a0 + 1); - LoadAndStoreYA((__m256i*)bgra1 + 0, _b16_r16[1][0], _g16_1[1][0], (__m256i*)y1 + 0, (__m256i*)a1 + 0); - LoadAndStoreYA((__m256i*)bgra1 + 4, _b16_r16[1][1], _g16_1[1][1], (__m256i*)y1 + 1, (__m256i*)a1 + 1); - - Average16(_b16_r16[0][0][0], _b16_r16[1][0][0]); - Average16(_b16_r16[0][0][1], _b16_r16[1][0][1]); - Average16(_b16_r16[0][1][0], _b16_r16[1][1][0]); - Average16(_b16_r16[0][1][1], _b16_r16[1][1][1]); - - Average16(_g16_1[0][0][0], _g16_1[1][0][0]); - Average16(_g16_1[0][0][1], _g16_1[1][0][1]); - Average16(_g16_1[0][1][0], _g16_1[1][1][0]); - Average16(_g16_1[0][1][1], _g16_1[1][1][1]); - - Store((__m256i*)u, PackI16ToU8(ConvertU16(_b16_r16[0][0], _g16_1[0][0]), ConvertU16(_b16_r16[0][1], _g16_1[0][1]))); - Store((__m256i*)v, PackI16ToU8(ConvertV16(_b16_r16[0][0], _g16_1[0][0]), ConvertV16(_b16_r16[0][1], _g16_1[0][1]))); - } - - template void BgraToYuva420p(const uint8_t * bgra, size_t bgraStride, size_t width, size_t height, uint8_t * y, size_t yStride, - uint8_t * u, size_t uStride, uint8_t * v, size_t vStride, uint8_t * a, size_t aStride) - { - assert((width % 2 == 0) && (height % 2 == 0) && (width >= DA) && (height >= 2)); - if (align) - { - assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride)); - assert(Aligned(v) && Aligned(vStride) && Aligned(a) && Aligned(aStride) && Aligned(bgra) && Aligned(bgraStride)); - } - - size_t alignedWidth = AlignLo(width, DA); - const size_t A8 = A * 8; - for (size_t row = 0; row < height; row += 2) - { - for (size_t colUV = 0, colYA = 0, colBgra = 0; colYA < alignedWidth; colYA += DA, colUV += A, colBgra += A8) - BgraToYuva420p(bgra + colBgra, bgraStride, y + colYA, yStride, u + colUV, v + colUV, a + colYA, aStride); - if (width != alignedWidth) - { - size_t offset = width - DA; - BgraToYuva420p(bgra + offset * 4, bgraStride, y + offset, yStride, u + offset / 2, v + offset / 2, a + offset, aStride); - } - y += 2 * yStride; - u += uStride; - v += vStride; - a += 2 * aStride; - bgra += 2 * bgraStride; - } - } - - void BgraToYuva420p(const uint8_t * bgra, size_t bgraStride, size_t width, size_t height, uint8_t * y, size_t yStride, - uint8_t * u, size_t uStride, uint8_t * v, size_t vStride, uint8_t * a, size_t aStride) - { - if (Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride) && Aligned(v) && Aligned(vStride) - && Aligned(a) && Aligned(aStride) && Aligned(bgra) && Aligned(bgraStride)) - BgraToYuva420p(bgra, bgraStride, width, height, y, yStride, u, uStride, v, vStride, a, aStride); - else - 
BgraToYuva420p(bgra, bgraStride, width, height, y, yStride, u, uStride, v, vStride, a, aStride); - } - } -#endif// SIMD_AVX2_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx2Binarization.cpp b/src/3rd/Simd/Simd/SimdAvx2Binarization.cpp deleted file mode 100644 index 1558190d..00000000 --- a/src/3rd/Simd/Simd/SimdAvx2Binarization.cpp +++ /dev/null @@ -1,261 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdSet.h" -#include "Simd/SimdCompare.h" - -namespace Simd -{ -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - template - void Binarization(const uint8_t * src, size_t srcStride, size_t width, size_t height, - uint8_t value, uint8_t positive, uint8_t negative, uint8_t * dst, size_t dstStride) - { - assert(width >= A); - if (align) - assert(Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride)); - - size_t alignedWidth = Simd::AlignLo(width, A); - - __m256i _value = _mm256_set1_epi8(value); - __m256i _positive = _mm256_set1_epi8(positive); - __m256i _negative = _mm256_set1_epi8(negative); - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - { - const __m256i mask = Compare8u(Load((__m256i*)(src + col)), _value); - Store((__m256i*)(dst + col), _mm256_blendv_epi8(_negative, _positive, mask)); - } - if (alignedWidth != width) - { - const __m256i mask = Compare8u(Load((__m256i*)(src + width - A)), _value); - Store((__m256i*)(dst + width - A), _mm256_blendv_epi8(_negative, _positive, mask)); - } - src += srcStride; - dst += dstStride; - } - } - - template - void Binarization(const uint8_t * src, size_t srcStride, size_t width, size_t height, - uint8_t value, uint8_t positive, uint8_t negative, uint8_t * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride)) - Binarization(src, srcStride, width, height, value, positive, negative, dst, dstStride); - else - Binarization(src, srcStride, width, height, value, positive, negative, dst, dstStride); - } - - void Binarization(const uint8_t * src, size_t srcStride, size_t width, size_t height, - uint8_t value, uint8_t positive, uint8_t negative, uint8_t * dst, size_t dstStride, SimdCompareType compareType) - { - switch (compareType) - { - case SimdCompareEqual: - return Binarization(src, srcStride, width, height, value, positive, negative, dst, 
dstStride); - case SimdCompareNotEqual: - return Binarization(src, srcStride, width, height, value, positive, negative, dst, dstStride); - case SimdCompareGreater: - return Binarization(src, srcStride, width, height, value, positive, negative, dst, dstStride); - case SimdCompareGreaterOrEqual: - return Binarization(src, srcStride, width, height, value, positive, negative, dst, dstStride); - case SimdCompareLesser: - return Binarization(src, srcStride, width, height, value, positive, negative, dst, dstStride); - case SimdCompareLesserOrEqual: - return Binarization(src, srcStride, width, height, value, positive, negative, dst, dstStride); - default: - assert(0); - } - } - - namespace - { - struct Buffer - { - Buffer(size_t width, size_t edge) - { - size_t size = sizeof(uint16_t)*(width + 2 * edge) + sizeof(uint32_t)*(2 * width + 2 * edge); - _p = Allocate(size); - memset(_p, 0, size); - sa = (uint16_t*)_p + edge; - s0a0 = (uint32_t*)(sa + width + edge) + edge; - sum = (uint32_t*)(s0a0 + width + edge); - } - - ~Buffer() - { - Free(_p); - } - - uint16_t * sa; - uint32_t * s0a0; - uint32_t * sum; - private: - void *_p; - }; - } - - template - SIMD_INLINE void AddRows(const uint8_t * src, uint16_t * sa, const __m256i & value, const __m256i & mask) - { - const __m256i inc = _mm256_permute4x64_epi64(_mm256_and_si256(Compare8u(Load((__m256i*)src), value), mask), 0xD8); - Store((__m256i*)sa + 0, _mm256_add_epi8(Load((__m256i*)sa + 0), _mm256_unpacklo_epi8(inc, _mm256_permute4x64_epi64(mask, 0xD8)))); - Store((__m256i*)sa + 1, _mm256_add_epi8(Load((__m256i*)sa + 1), _mm256_unpackhi_epi8(inc, _mm256_permute4x64_epi64(mask, 0xD8)))); - } - - template - SIMD_INLINE void SubRows(const uint8_t * src, uint16_t * sa, const __m256i & value, const __m256i & mask) - { - const __m256i dec = _mm256_permute4x64_epi64(_mm256_and_si256(Compare8u(Load((__m256i*)src), value), mask), 0xD8); - Store((__m256i*)sa + 0, _mm256_sub_epi8(Load((__m256i*)sa + 0), _mm256_unpacklo_epi8(dec, _mm256_permute4x64_epi64(mask, 0xD8)))); - Store((__m256i*)sa + 1, _mm256_sub_epi8(Load((__m256i*)sa + 1), _mm256_unpackhi_epi8(dec, _mm256_permute4x64_epi64(mask, 0xD8)))); - } - - template - SIMD_INLINE __m256i CompareSum(const uint32_t * sum, const __m256i & ff_threshold) - { - const __m256i mask0 = _mm256_cmpgt_epi32(_mm256_madd_epi16(Load((__m256i*)sum + 0), ff_threshold), K_ZERO); - const __m256i mask1 = _mm256_cmpgt_epi32(_mm256_madd_epi16(Load((__m256i*)sum + 1), ff_threshold), K_ZERO); - const __m256i mask2 = _mm256_cmpgt_epi32(_mm256_madd_epi16(Load((__m256i*)sum + 2), ff_threshold), K_ZERO); - const __m256i mask3 = _mm256_cmpgt_epi32(_mm256_madd_epi16(Load((__m256i*)sum + 3), ff_threshold), K_ZERO); - return PackI16ToI8(PackI32ToI16(mask0, mask1), PackI32ToI16(mask2, mask3)); - } - - template - void AveragingBinarization(const uint8_t * src, size_t srcStride, size_t width, size_t height, - uint8_t value, size_t neighborhood, uint8_t threshold, uint8_t positive, uint8_t negative, uint8_t * dst, size_t dstStride) - { - assert(width > neighborhood && height > neighborhood && neighborhood < 0x7F); - - const size_t alignedWidth = AlignLo(width, A); - - const __m256i tailMask = SetMask(0, A - width + alignedWidth, 1); - const __m256i ff_threshold = SetInt16(0xFF, -threshold); - const __m256i _value = _mm256_set1_epi8(value); - const __m256i _positive = _mm256_set1_epi8(positive); - const __m256i _negative = _mm256_set1_epi8(negative); - - Buffer buffer(AlignHi(width, A), AlignHi(neighborhood + 1, A)); - - for (size_t row = 0; row < 
neighborhood; ++row) - { - const uint8_t * s = src + row*srcStride; - for (size_t col = 0; col < alignedWidth; col += A) - AddRows(s + col, buffer.sa + col, _value, K8_01); - if (alignedWidth != width) - AddRows(s + width - A, buffer.sa + width - A, _value, tailMask); - } - - for (size_t row = 0; row < height; ++row) - { - if (row < height - neighborhood) - { - const uint8_t * s = src + (row + neighborhood)*srcStride; - for (size_t col = 0; col < alignedWidth; col += A) - AddRows(s + col, buffer.sa + col, _value, K8_01); - if (alignedWidth != width) - AddRows(s + width - A, buffer.sa + width - A, _value, tailMask); - } - if (row > neighborhood) - { - const uint8_t * s = src + (row - neighborhood - 1)*srcStride; - for (size_t col = 0; col < alignedWidth; col += A) - SubRows(s + col, buffer.sa + col, _value, K8_01); - if (alignedWidth != width) - SubRows(s + width - A, buffer.sa + width - A, _value, tailMask); - } - - for (size_t col = 0; col < width; col += HA) - { - const __m256i sa = LoadPermuted((__m256i*)(buffer.sa + col)); - Store((__m256i*)(buffer.s0a0 + col) + 0, _mm256_unpacklo_epi8(sa, K_ZERO)); - Store((__m256i*)(buffer.s0a0 + col) + 1, _mm256_unpackhi_epi8(sa, K_ZERO)); - } - - uint32_t sum = 0; - for (size_t col = 0; col < neighborhood; ++col) - { - sum += buffer.s0a0[col]; - } - for (size_t col = 0; col < width; ++col) - { - sum += buffer.s0a0[col + neighborhood]; - sum -= buffer.s0a0[col - neighborhood - 1]; - buffer.sum[col] = sum; - } - - for (size_t col = 0; col < alignedWidth; col += A) - { - const __m256i mask = CompareSum(buffer.sum + col, ff_threshold); - Store((__m256i*)(dst + col), _mm256_blendv_epi8(_negative, _positive, mask)); - } - if (alignedWidth != width) - { - const __m256i mask = CompareSum(buffer.sum + width - A, ff_threshold); - Store((__m256i*)(dst + width - A), _mm256_blendv_epi8(_negative, _positive, mask)); - } - - dst += dstStride; - } - } - - template - void AveragingBinarization(const uint8_t * src, size_t srcStride, size_t width, size_t height, - uint8_t value, size_t neighborhood, uint8_t threshold, uint8_t positive, uint8_t negative, uint8_t * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride)) - AveragingBinarization(src, srcStride, width, height, value, neighborhood, threshold, positive, negative, dst, dstStride); - else - AveragingBinarization(src, srcStride, width, height, value, neighborhood, threshold, positive, negative, dst, dstStride); - } - - void AveragingBinarization(const uint8_t * src, size_t srcStride, size_t width, size_t height, - uint8_t value, size_t neighborhood, uint8_t threshold, uint8_t positive, uint8_t negative, - uint8_t * dst, size_t dstStride, SimdCompareType compareType) - { - switch (compareType) - { - case SimdCompareEqual: - return AveragingBinarization(src, srcStride, width, height, value, neighborhood, threshold, positive, negative, dst, dstStride); - case SimdCompareNotEqual: - return AveragingBinarization(src, srcStride, width, height, value, neighborhood, threshold, positive, negative, dst, dstStride); - case SimdCompareGreater: - return AveragingBinarization(src, srcStride, width, height, value, neighborhood, threshold, positive, negative, dst, dstStride); - case SimdCompareGreaterOrEqual: - return AveragingBinarization(src, srcStride, width, height, value, neighborhood, threshold, positive, negative, dst, dstStride); - case SimdCompareLesser: - return AveragingBinarization(src, srcStride, width, height, value, neighborhood, threshold, positive, negative, 
dst, dstStride); - case SimdCompareLesserOrEqual: - return AveragingBinarization(src, srcStride, width, height, value, neighborhood, threshold, positive, negative, dst, dstStride); - default: - assert(0); - } - } - } -#endif// SIMD_AVX2_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx2Conditional.cpp b/src/3rd/Simd/Simd/SimdAvx2Conditional.cpp deleted file mode 100644 index 27a31390..00000000 --- a/src/3rd/Simd/Simd/SimdAvx2Conditional.cpp +++ /dev/null @@ -1,447 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdExtract.h" -#include "Simd/SimdSet.h" -#include "Simd/SimdCompare.h" - -namespace Simd -{ -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - template - void ConditionalCount8u(const uint8_t * src, size_t stride, size_t width, size_t height, uint8_t value, uint32_t * count) - { - assert(width >= A); - if (align) - assert(Aligned(src) && Aligned(stride)); - - size_t alignedWidth = Simd::AlignLo(width, A); - __m256i tailMask = SetMask(0, A - width + alignedWidth, 0xFF); - - __m256i _value = _mm256_set1_epi8(value); - __m256i _count = _mm256_setzero_si256(); - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - { - const __m256i mask = Compare8u(Load((__m256i*)(src + col)), _value); - _count = _mm256_add_epi64(_count, _mm256_sad_epu8(_mm256_and_si256(mask, K8_01), K_ZERO)); - } - if (alignedWidth != width) - { - const __m256i mask = _mm256_and_si256(Compare8u(Load((__m256i*)(src + width - A)), _value), tailMask); - _count = _mm256_add_epi64(_count, _mm256_sad_epu8(_mm256_and_si256(mask, K8_01), K_ZERO)); - } - src += stride; - } - *count = ExtractSum(_count); - } - - template - void ConditionalCount8u(const uint8_t * src, size_t stride, size_t width, size_t height, uint8_t value, uint32_t * count) - { - if (Aligned(src) && Aligned(stride)) - ConditionalCount8u(src, stride, width, height, value, count); - else - ConditionalCount8u(src, stride, width, height, value, count); - } - - void ConditionalCount8u(const uint8_t * src, size_t stride, size_t width, size_t height, - uint8_t value, SimdCompareType compareType, uint32_t * count) - { - switch (compareType) - { - case SimdCompareEqual: - return ConditionalCount8u(src, stride, width, height, value, count); - case SimdCompareNotEqual: - return ConditionalCount8u(src, stride, width, height, value, 
count); - case SimdCompareGreater: - return ConditionalCount8u(src, stride, width, height, value, count); - case SimdCompareGreaterOrEqual: - return ConditionalCount8u(src, stride, width, height, value, count); - case SimdCompareLesser: - return ConditionalCount8u(src, stride, width, height, value, count); - case SimdCompareLesserOrEqual: - return ConditionalCount8u(src, stride, width, height, value, count); - default: - assert(0); - } - } - - template - void ConditionalCount16i(const uint8_t * src, size_t stride, size_t width, size_t height, int16_t value, uint32_t * count) - { - assert(width >= HA); - if (align) - assert(Aligned(src) && Aligned(stride)); - - size_t alignedWidth = Simd::AlignLo(width, HA); - __m256i tailMask = SetMask(0, HA - width + alignedWidth, 0xFFFF); - - __m256i _value = _mm256_set1_epi16(value); - __m256i _count = _mm256_setzero_si256(); - for (size_t row = 0; row < height; ++row) - { - const int16_t * s = (const int16_t *)src; - for (size_t col = 0; col < alignedWidth; col += HA) - { - const __m256i mask = Compare16i(Load((__m256i*)(s + col)), _value); - _count = _mm256_add_epi64(_count, _mm256_sad_epu8(_mm256_and_si256(mask, K16_0001), K_ZERO)); - } - if (alignedWidth != width) - { - const __m256i mask = _mm256_and_si256(Compare16i(Load((__m256i*)(s + width - HA)), _value), tailMask); - _count = _mm256_add_epi64(_count, _mm256_sad_epu8(_mm256_and_si256(mask, K16_0001), K_ZERO)); - } - src += stride; - } - *count = ExtractSum(_count); - } - - template - void ConditionalCount16i(const uint8_t * src, size_t stride, size_t width, size_t height, int16_t value, uint32_t * count) - { - if (Aligned(src) && Aligned(stride)) - ConditionalCount16i(src, stride, width, height, value, count); - else - ConditionalCount16i(src, stride, width, height, value, count); - } - - void ConditionalCount16i(const uint8_t * src, size_t stride, size_t width, size_t height, - int16_t value, SimdCompareType compareType, uint32_t * count) - { - switch (compareType) - { - case SimdCompareEqual: - return ConditionalCount16i(src, stride, width, height, value, count); - case SimdCompareNotEqual: - return ConditionalCount16i(src, stride, width, height, value, count); - case SimdCompareGreater: - return ConditionalCount16i(src, stride, width, height, value, count); - case SimdCompareGreaterOrEqual: - return ConditionalCount16i(src, stride, width, height, value, count); - case SimdCompareLesser: - return ConditionalCount16i(src, stride, width, height, value, count); - case SimdCompareLesserOrEqual: - return ConditionalCount16i(src, stride, width, height, value, count); - default: - assert(0); - } - } - - template - void ConditionalSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * mask, size_t maskStride, uint8_t value, uint64_t * sum) - { - assert(width >= A); - if (align) - assert(Aligned(src) && Aligned(srcStride) && Aligned(mask) && Aligned(maskStride)); - - size_t alignedWidth = Simd::AlignLo(width, A); - __m256i tailMask = SetMask(0, A - width + alignedWidth, 0xFF); - - __m256i _value = _mm256_set1_epi8(value); - __m256i _sum = _mm256_setzero_si256(); - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - { - const __m256i _src = Load((__m256i*)(src + col)); - const __m256i _mask = Compare8u(Load((__m256i*)(mask + col)), _value); - _sum = _mm256_add_epi64(_sum, _mm256_sad_epu8(_mm256_and_si256(_mask, _src), K_ZERO)); - } - if (alignedWidth != width) - { - const __m256i _src = Load((__m256i*)(src + width - 
A)); - const __m256i _mask = _mm256_and_si256(Compare8u(Load((__m256i*)(mask + width - A)), _value), tailMask); - _sum = _mm256_add_epi64(_sum, _mm256_sad_epu8(_mm256_and_si256(_mask, _src), K_ZERO)); - } - src += srcStride; - mask += maskStride; - } - *sum = ExtractSum(_sum); - } - - template - void ConditionalSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * mask, size_t maskStride, uint8_t value, uint64_t * sum) - { - if (Aligned(src) && Aligned(srcStride) && Aligned(mask) && Aligned(maskStride)) - ConditionalSum(src, srcStride, width, height, mask, maskStride, value, sum); - else - ConditionalSum(src, srcStride, width, height, mask, maskStride, value, sum); - } - - void ConditionalSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * mask, size_t maskStride, uint8_t value, SimdCompareType compareType, uint64_t * sum) - { - switch (compareType) - { - case SimdCompareEqual: - return ConditionalSum(src, srcStride, width, height, mask, maskStride, value, sum); - case SimdCompareNotEqual: - return ConditionalSum(src, srcStride, width, height, mask, maskStride, value, sum); - case SimdCompareGreater: - return ConditionalSum(src, srcStride, width, height, mask, maskStride, value, sum); - case SimdCompareGreaterOrEqual: - return ConditionalSum(src, srcStride, width, height, mask, maskStride, value, sum); - case SimdCompareLesser: - return ConditionalSum(src, srcStride, width, height, mask, maskStride, value, sum); - case SimdCompareLesserOrEqual: - return ConditionalSum(src, srcStride, width, height, mask, maskStride, value, sum); - default: - assert(0); - } - } - - SIMD_INLINE __m256i Square(__m256i value) - { - const __m256i lo = _mm256_unpacklo_epi8(value, _mm256_setzero_si256()); - const __m256i hi = _mm256_unpackhi_epi8(value, _mm256_setzero_si256()); - return _mm256_add_epi32(_mm256_madd_epi16(lo, lo), _mm256_madd_epi16(hi, hi)); - } - - template - void ConditionalSquareSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * mask, size_t maskStride, uint8_t value, uint64_t * sum) - { - assert(width >= A); - if (align) - assert(Aligned(src) && Aligned(srcStride) && Aligned(mask) && Aligned(maskStride)); - - size_t alignedWidth = Simd::AlignLo(width, A); - __m256i tailMask = SetMask(0, A - width + alignedWidth, 0xFF); - - __m256i _value = _mm256_set1_epi8(value); - __m256i _sum = _mm256_setzero_si256(); - for (size_t row = 0; row < height; ++row) - { - __m256i rowSum = _mm256_setzero_si256(); - for (size_t col = 0; col < alignedWidth; col += A) - { - const __m256i _src = Load((__m256i*)(src + col)); - const __m256i _mask = Compare8u(Load((__m256i*)(mask + col)), _value); - rowSum = _mm256_add_epi32(rowSum, Square(_mm256_and_si256(_mask, _src))); - } - if (alignedWidth != width) - { - const __m256i _src = Load((__m256i*)(src + width - A)); - const __m256i _mask = _mm256_and_si256(Compare8u(Load((__m256i*)(mask + width - A)), _value), tailMask); - rowSum = _mm256_add_epi32(rowSum, Square(_mm256_and_si256(_mask, _src))); - } - _sum = _mm256_add_epi64(_sum, HorizontalSum32(rowSum)); - src += srcStride; - mask += maskStride; - } - *sum = ExtractSum(_sum); - } - - template - void ConditionalSquareSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * mask, size_t maskStride, uint8_t value, uint64_t * sum) - { - if (Aligned(src) && Aligned(srcStride) && Aligned(mask) && Aligned(maskStride)) - ConditionalSquareSum(src, srcStride, width, height, mask, 
maskStride, value, sum); - else - ConditionalSquareSum(src, srcStride, width, height, mask, maskStride, value, sum); - } - - void ConditionalSquareSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * mask, size_t maskStride, uint8_t value, SimdCompareType compareType, uint64_t * sum) - { - switch (compareType) - { - case SimdCompareEqual: - return ConditionalSquareSum(src, srcStride, width, height, mask, maskStride, value, sum); - case SimdCompareNotEqual: - return ConditionalSquareSum(src, srcStride, width, height, mask, maskStride, value, sum); - case SimdCompareGreater: - return ConditionalSquareSum(src, srcStride, width, height, mask, maskStride, value, sum); - case SimdCompareGreaterOrEqual: - return ConditionalSquareSum(src, srcStride, width, height, mask, maskStride, value, sum); - case SimdCompareLesser: - return ConditionalSquareSum(src, srcStride, width, height, mask, maskStride, value, sum); - case SimdCompareLesserOrEqual: - return ConditionalSquareSum(src, srcStride, width, height, mask, maskStride, value, sum); - default: - assert(0); - } - } - - template - SIMD_INLINE __m256i SquaredDifference(const uint8_t * src, ptrdiff_t step, __m256i mask) - { - const __m256i a = _mm256_and_si256(Load((__m256i*)(src - step)), mask); - const __m256i b = _mm256_and_si256(Load((__m256i*)(src + step)), mask); - const __m256i lo = _mm256_sub_epi16(_mm256_unpacklo_epi8(a, _mm256_setzero_si256()), _mm256_unpacklo_epi8(b, _mm256_setzero_si256())); - const __m256i hi = _mm256_sub_epi16(_mm256_unpackhi_epi8(a, _mm256_setzero_si256()), _mm256_unpackhi_epi8(b, _mm256_setzero_si256())); - return _mm256_add_epi32(_mm256_madd_epi16(lo, lo), _mm256_madd_epi16(hi, hi)); - } - - template - void ConditionalSquareGradientSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * mask, size_t maskStride, uint8_t value, uint64_t * sum) - { - assert(width >= A + 2 && height >= 3); - if (align) - assert(Aligned(src) && Aligned(srcStride) && Aligned(mask) && Aligned(maskStride)); - - src += srcStride; - mask += maskStride; - height -= 2; - - size_t alignedWidth = Simd::AlignLo(width - 1, A); - __m256i noseMask = SetMask(0xFF, A - 1, 0); - __m256i tailMask = SetMask(0, A - width + 1 + alignedWidth, 0xFF); - - __m256i _value = _mm256_set1_epi8(value); - __m256i _sum = _mm256_setzero_si256(); - for (size_t row = 0; row < height; ++row) - { - __m256i rowSum = _mm256_setzero_si256(); - { - const __m256i _mask = _mm256_and_si256(Compare8u(Load((__m256i*)(mask + 1)), _value), noseMask); - rowSum = _mm256_add_epi32(rowSum, SquaredDifference(src + 1, 1, _mask)); - rowSum = _mm256_add_epi32(rowSum, SquaredDifference(src + 1, srcStride, _mask)); - } - for (size_t col = A; col < alignedWidth; col += A) - { - const __m256i _mask = Compare8u(Load((__m256i*)(mask + col)), _value); - rowSum = _mm256_add_epi32(rowSum, SquaredDifference(src + col, 1, _mask)); - rowSum = _mm256_add_epi32(rowSum, SquaredDifference(src + col, srcStride, _mask)); - } - if (alignedWidth != width - 1) - { - size_t offset = width - A - 1; - const __m256i _mask = _mm256_and_si256(Compare8u(Load((__m256i*)(mask + offset)), _value), tailMask); - rowSum = _mm256_add_epi32(rowSum, SquaredDifference(src + offset, 1, _mask)); - rowSum = _mm256_add_epi32(rowSum, SquaredDifference(src + offset, srcStride, _mask)); - } - _sum = _mm256_add_epi64(_sum, HorizontalSum32(rowSum)); - src += srcStride; - mask += maskStride; - } - *sum = ExtractSum(_sum); - } - - template - void 
ConditionalSquareGradientSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * mask, size_t maskStride, uint8_t value, uint64_t * sum) - { - if (Aligned(src) && Aligned(srcStride) && Aligned(mask) && Aligned(maskStride)) - ConditionalSquareGradientSum(src, srcStride, width, height, mask, maskStride, value, sum); - else - ConditionalSquareGradientSum(src, srcStride, width, height, mask, maskStride, value, sum); - } - - void ConditionalSquareGradientSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * mask, size_t maskStride, uint8_t value, SimdCompareType compareType, uint64_t * sum) - { - switch (compareType) - { - case SimdCompareEqual: - return ConditionalSquareGradientSum(src, srcStride, width, height, mask, maskStride, value, sum); - case SimdCompareNotEqual: - return ConditionalSquareGradientSum(src, srcStride, width, height, mask, maskStride, value, sum); - case SimdCompareGreater: - return ConditionalSquareGradientSum(src, srcStride, width, height, mask, maskStride, value, sum); - case SimdCompareGreaterOrEqual: - return ConditionalSquareGradientSum(src, srcStride, width, height, mask, maskStride, value, sum); - case SimdCompareLesser: - return ConditionalSquareGradientSum(src, srcStride, width, height, mask, maskStride, value, sum); - case SimdCompareLesserOrEqual: - return ConditionalSquareGradientSum(src, srcStride, width, height, mask, maskStride, value, sum); - default: - assert(0); - } - } - - template - SIMD_INLINE void ConditionalFill(const uint8_t * src, size_t offset, const __m256i & threshold, const __m256i & value, uint8_t * dst) - { - const __m256i _src = Load((__m256i*)(src + offset)); - const __m256i _dst = Load((__m256i*)(dst + offset)); - Store((__m256i*)(dst + offset), _mm256_blendv_epi8(_dst, value, Compare8u(_src, threshold))); - } - - template - void ConditionalFill(const uint8_t * src, size_t srcStride, size_t width, size_t height, - uint8_t threshold, uint8_t value, uint8_t * dst, size_t dstStride) - { - assert(width >= A); - if (align) - assert(Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride)); - - size_t alignedWidth = Simd::AlignLo(width, A); - - __m256i _value = _mm256_set1_epi8(value); - __m256i _threshold = _mm256_set1_epi8(threshold); - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - ConditionalFill(src, col, _threshold, _value, dst); - if (alignedWidth != width) - ConditionalFill(src, width - A, _threshold, _value, dst); - src += srcStride; - dst += dstStride; - } - } - - template - void ConditionalFill(const uint8_t * src, size_t srcStride, size_t width, size_t height, - uint8_t threshold, uint8_t value, uint8_t * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride)) - ConditionalFill(src, srcStride, width, height, threshold, value, dst, dstStride); - else - ConditionalFill(src, srcStride, width, height, threshold, value, dst, dstStride); - } - - void ConditionalFill(const uint8_t * src, size_t srcStride, size_t width, size_t height, - uint8_t threshold, SimdCompareType compareType, uint8_t value, uint8_t * dst, size_t dstStride) - { - switch (compareType) - { - case SimdCompareEqual: - return ConditionalFill(src, srcStride, width, height, threshold, value, dst, dstStride); - case SimdCompareNotEqual: - return ConditionalFill(src, srcStride, width, height, threshold, value, dst, dstStride); - case SimdCompareGreater: - return ConditionalFill(src, 
srcStride, width, height, threshold, value, dst, dstStride); - case SimdCompareGreaterOrEqual: - return ConditionalFill(src, srcStride, width, height, threshold, value, dst, dstStride); - case SimdCompareLesser: - return ConditionalFill(src, srcStride, width, height, threshold, value, dst, dstStride); - case SimdCompareLesserOrEqual: - return ConditionalFill(src, srcStride, width, height, threshold, value, dst, dstStride); - default: - assert(0); - } - } - } -#endif// SIMD_AVX2_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx2Deinterleave.cpp b/src/3rd/Simd/Simd/SimdAvx2Deinterleave.cpp deleted file mode 100644 index aa2deb3c..00000000 --- a/src/3rd/Simd/Simd/SimdAvx2Deinterleave.cpp +++ /dev/null @@ -1,189 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdConversion.h" - -namespace Simd -{ -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - SIMD_INLINE __m256i DeinterleavedU(__m256i uv0, __m256i uv1) - { - return PackI16ToU8(_mm256_and_si256(uv0, K16_00FF), _mm256_and_si256(uv1, K16_00FF)); - } - - SIMD_INLINE __m256i DeinterleavedV(__m256i uv0, __m256i uv1) - { - return DeinterleavedU(_mm256_srli_si256(uv0, 1), _mm256_srli_si256(uv1, 1)); - } - - template <bool align> void DeinterleaveUv(const uint8_t * uv, size_t uvStride, size_t width, size_t height, - uint8_t * u, size_t uStride, uint8_t * v, size_t vStride) - { - assert(width >= A); - if (align) - { - assert(Aligned(uv) && Aligned(uvStride) && Aligned(u) && Aligned(uStride) && Aligned(v) && Aligned(vStride)); - } - - size_t bodyWidth = AlignLo(width, A); - size_t tail = width - bodyWidth; - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0, offset = 0; col < bodyWidth; col += A, offset += DA) - { - __m256i uv0 = Load<align>((__m256i*)(uv + offset)); - __m256i uv1 = Load<align>((__m256i*)(uv + offset + A)); - Store<align>((__m256i*)(u + col), DeinterleavedU(uv0, uv1)); - Store<align>((__m256i*)(v + col), DeinterleavedV(uv0, uv1)); - } - if (tail) - { - size_t col = width - A; - size_t offset = 2 * col; - __m256i uv0 = Load<false>((__m256i*)(uv + offset)); - __m256i uv1 = Load<false>((__m256i*)(uv + offset + A)); - Store<false>((__m256i*)(u + col), DeinterleavedU(uv0, uv1)); - Store<false>((__m256i*)(v + col), DeinterleavedV(uv0, uv1)); - } - uv += uvStride; - u += uStride; - v += vStride; - } - } - - void DeinterleaveUv(const uint8_t * uv, size_t uvStride, size_t width, size_t height, - uint8_t * u, size_t uStride, uint8_t * v, size_t vStride) - { - if (Aligned(uv) && Aligned(uvStride) && Aligned(u) && Aligned(uStride) && Aligned(v) && Aligned(vStride)) - DeinterleaveUv<true>(uv, uvStride, width, height, u, uStride, v, vStride); - else - DeinterleaveUv<false>(uv, uvStride, width, height, u, uStride, v, vStride); - } - - template <bool align> SIMD_INLINE void DeinterleaveBgr(const uint8_t * bgr, uint8_t * b, uint8_t * g, uint8_t * r, size_t offset) - { - __m256i _bgr[3] = { Load<align>((__m256i*)bgr + 0), Load<align>((__m256i*)bgr + 1), Load<align>((__m256i*)bgr + 2) }; - Store<align>((__m256i*)(b + offset), BgrToBlue(_bgr)); - Store<align>((__m256i*)(g + offset), BgrToGreen(_bgr)); - Store<align>((__m256i*)(r + offset), BgrToRed(_bgr)); - } - - template <bool align> void DeinterleaveBgr(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, - uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride) - { - assert(width >= A); - if (align) - assert(Aligned(bgr) && Aligned(bgrStride) && Aligned(b) && Aligned(bStride) && Aligned(g) && Aligned(gStride) && Aligned(r) && Aligned(rStride)); - - size_t alignedWidth = AlignLo(width, A); - - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - DeinterleaveBgr<align>(bgr + col * 3, b, g, r, col); - if (width != alignedWidth) - DeinterleaveBgr<false>(bgr + 3 * (width - A), b, g, r, width - A); - bgr += bgrStride; - b += bStride; - g += gStride; - r += rStride; - } - } - - void DeinterleaveBgr(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, - uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride) - { - if (Aligned(bgr) && Aligned(bgrStride) && Aligned(b) && Aligned(bStride) && Aligned(g) && Aligned(gStride) && Aligned(r) && Aligned(rStride)) - DeinterleaveBgr<true>(bgr, bgrStride, width, height, b, bStride, g, gStride, r, rStride); - else - DeinterleaveBgr<false>(bgr, bgrStride, width,
height, b, bStride, g, gStride, r, rStride); - } - - const __m256i K8_SHUFFLE_BGRA = SIMD_MM256_SETR_EPI8( - 0x0, 0x4, 0x8, 0xC, 0x1, 0x5, 0x9, 0xD, 0x2, 0x6, 0xA, 0xE, 0x3, 0x7, 0xB, 0xF, - 0x0, 0x4, 0x8, 0xC, 0x1, 0x5, 0x9, 0xD, 0x2, 0x6, 0xA, 0xE, 0x3, 0x7, 0xB, 0xF); - - const __m256i K32_PERMUTE_BGRA = SIMD_MM256_SETR_EPI32(0x0, 0x4, 0x1, 0x5, 0x2, 0x6, 0x3, 0x7); - - template SIMD_INLINE void DeinterleaveBgra(const uint8_t * bgra, uint8_t * b, uint8_t * g, uint8_t * r, uint8_t *a, size_t offset) - { - __m256i _bgra[4]; - _bgra[0] = _mm256_shuffle_epi8(Load((__m256i*)bgra + 0), K8_SHUFFLE_BGRA); - _bgra[1] = _mm256_shuffle_epi8(Load((__m256i*)bgra + 1), K8_SHUFFLE_BGRA); - _bgra[2] = _mm256_shuffle_epi8(Load((__m256i*)bgra + 2), K8_SHUFFLE_BGRA); - _bgra[3] = _mm256_shuffle_epi8(Load((__m256i*)bgra + 3), K8_SHUFFLE_BGRA); - - __m256i bbgg0 = _mm256_unpacklo_epi32(_bgra[0], _bgra[1]); - __m256i bbgg1 = _mm256_unpacklo_epi32(_bgra[2], _bgra[3]); - - Store((__m256i*)(b + offset), _mm256_permutevar8x32_epi32(_mm256_unpacklo_epi64(bbgg0, bbgg1), K32_PERMUTE_BGRA)); - Store((__m256i*)(g + offset), _mm256_permutevar8x32_epi32(_mm256_unpackhi_epi64(bbgg0, bbgg1), K32_PERMUTE_BGRA)); - - __m256i rraa0 = _mm256_unpackhi_epi32(_bgra[0], _bgra[1]); - __m256i rraa1 = _mm256_unpackhi_epi32(_bgra[2], _bgra[3]); - - Store((__m256i*)(r + offset), _mm256_permutevar8x32_epi32(_mm256_unpacklo_epi64(rraa0, rraa1), K32_PERMUTE_BGRA)); - Store((__m256i*)(a + offset), _mm256_permutevar8x32_epi32(_mm256_unpackhi_epi64(rraa0, rraa1), K32_PERMUTE_BGRA)); - } - - template void DeinterleaveBgra(const uint8_t * bgra, size_t bgraStride, size_t width, size_t height, - uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride, uint8_t * a, size_t aStride) - { - assert(width >= A); - if (align) - { - assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(b) && Aligned(bStride)); - assert(Aligned(g) && Aligned(gStride) && Aligned(r) && Aligned(rStride) && Aligned(a) && Aligned(aStride)); - } - - size_t alignedWidth = AlignLo(width, A); - - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - DeinterleaveBgra(bgra + col * 4, b, g, r, a, col); - if (width != alignedWidth) - DeinterleaveBgra(bgra + 4 * (width - A), b, g, r, a, width - A); - bgra += bgraStride; - b += bStride; - g += gStride; - r += rStride; - a += aStride; - } - } - - void DeinterleaveBgra(const uint8_t * bgra, size_t bgraStride, size_t width, size_t height, - uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride, uint8_t * a, size_t aStride) - { - if (Aligned(bgra) && Aligned(bgraStride) && Aligned(b) && Aligned(bStride) && Aligned(g) && Aligned(gStride) && Aligned(r) && Aligned(rStride) && Aligned(a) && Aligned(aStride)) - DeinterleaveBgra(bgra, bgraStride, width, height, b, bStride, g, gStride, r, rStride, a, aStride); - else - DeinterleaveBgra(bgra, bgraStride, width, height, b, bStride, g, gStride, r, rStride, a, aStride); - } - } -#endif// SIMD_AVX2_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx2Detection.cpp b/src/3rd/Simd/Simd/SimdAvx2Detection.cpp deleted file mode 100644 index c4c2e2e5..00000000 --- a/src/3rd/Simd/Simd/SimdAvx2Detection.cpp +++ /dev/null @@ -1,783 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdDetection.h" -#include "Simd/SimdBase.h" - -namespace Simd -{ -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - using namespace Simd::Detection; - - const __m256i K32_PERMUTE = SIMD_MM256_SETR_EPI32(0, 2, 4, 6, 1, 3, 5, 7); - - SIMD_INLINE void UnpackMask16i(const uint8_t * src, uint16_t * dst, const __m256i & mask) - { - __m256i s = _mm256_and_si256(mask, LoadPermuted((__m256i*)src)); - _mm256_storeu_si256((__m256i*)dst + 0, _mm256_unpacklo_epi8(s, _mm256_setzero_si256())); - _mm256_storeu_si256((__m256i*)dst + 1, _mm256_unpackhi_epi8(s, _mm256_setzero_si256())); - } - - SIMD_INLINE void UnpackMask16i(const uint8_t * src, size_t size, uint16_t * dst, const __m256i & mask) - { - size_t alignedSize = Simd::AlignLo(size, A); - for (size_t i = 0; i < alignedSize; i += A) - UnpackMask16i(src + i, dst + i, mask); - if (size != alignedSize) - UnpackMask16i(src + size - A, dst + size - A, mask); - } - - SIMD_INLINE void UnpackMask32i(const uint8_t * src, uint32_t * dst, const __m256i & mask) - { - __m256i s = _mm256_permutevar8x32_epi32(_mm256_and_si256(mask, _mm256_loadu_si256((__m256i*)src)), K32_PERMUTE); - __m256i lo = _mm256_unpacklo_epi8(s, _mm256_setzero_si256()); - _mm256_storeu_si256((__m256i*)dst + 0, _mm256_unpacklo_epi16(lo, _mm256_setzero_si256())); - _mm256_storeu_si256((__m256i*)dst + 1, _mm256_unpackhi_epi16(lo, _mm256_setzero_si256())); - __m256i hi = _mm256_unpackhi_epi8(s, _mm256_setzero_si256()); - _mm256_storeu_si256((__m256i*)dst + 2, _mm256_unpacklo_epi16(hi, _mm256_setzero_si256())); - _mm256_storeu_si256((__m256i*)dst + 3, _mm256_unpackhi_epi16(hi, _mm256_setzero_si256())); - } - - SIMD_INLINE void UnpackMask32i(const uint8_t * src, size_t size, uint32_t * dst, const __m256i & mask) - { - size_t alignedSize = Simd::AlignLo(size, A); - for (size_t i = 0; i < alignedSize; i += A) - UnpackMask32i(src + i, dst + i, mask); - if (size != alignedSize) - UnpackMask32i(src + size - A, dst + size - A, mask); - } - - SIMD_INLINE void PackResult16i(const uint16_t * src, uint8_t * dst) - { - __m256i lo = _mm256_loadu_si256((__m256i*)src + 0); - __m256i hi = _mm256_loadu_si256((__m256i*)src + 1); - _mm256_storeu_si256((__m256i*)dst, PackI16ToU8(lo, hi)); - } - - SIMD_INLINE void PackResult16i(const uint16_t * src, size_t size, uint8_t * dst) - { - size_t alignedSize = Simd::AlignLo(size, A); - for (size_t i = 0; i < alignedSize; i += A) - 
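// The UnpackMask*/PackResult* helpers here handle a ragged tail by
// re-processing the last full vector at an overlapped offset (src + size - A)
// instead of falling back to a scalar loop; the overlap is safe because the
// operation is idempotent. A standalone sketch of that overlapped-tail idiom
// (illustrative names, assuming AVX2):
#include <immintrin.h>
#include <cstddef>
#include <cstdint>

void ZeroWhereMasked(const uint8_t* mask, uint8_t* data, size_t size)
{
    const size_t A = 32;                       // one __m256i of bytes
    size_t aligned = size & ~(A - 1);          // AlignLo(size, A)
    for (size_t i = 0; i < aligned; i += A)
    {
        __m256i m = _mm256_loadu_si256((const __m256i*)(mask + i));
        __m256i d = _mm256_loadu_si256((const __m256i*)(data + i));
        _mm256_storeu_si256((__m256i*)(data + i), _mm256_andnot_si256(m, d));
    }
    if (aligned != size && size >= A)          // overlapped tail: redo last vector
    {
        size_t i = size - A;
        __m256i m = _mm256_loadu_si256((const __m256i*)(mask + i));
        __m256i d = _mm256_loadu_si256((const __m256i*)(data + i));
        _mm256_storeu_si256((__m256i*)(data + i), _mm256_andnot_si256(m, d));
    }
}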
PackResult16i(src + i, dst + i); - if (size != alignedSize) - PackResult16i(src + size - A, dst + size - A); - } - - SIMD_INLINE void PackResult32i(const uint32_t * src, uint8_t * dst) - { - const __m256i lo = Simd::Avx2::PackI32ToI16(_mm256_loadu_si256((__m256i*)src + 0), _mm256_loadu_si256((__m256i*)src + 1)); - const __m256i hi = Simd::Avx2::PackI32ToI16(_mm256_loadu_si256((__m256i*)src + 2), _mm256_loadu_si256((__m256i*)src + 3)); - _mm256_storeu_si256((__m256i*)dst, Simd::Avx2::PackI16ToU8(lo, hi)); - } - - SIMD_INLINE void PackResult32i(const uint32_t * src, size_t size, uint8_t * dst) - { - size_t alignedSize = Simd::AlignLo(size, A); - for (size_t i = 0; i < alignedSize; i += A) - PackResult32i(src + i, dst + i); - if (size != alignedSize) - PackResult32i(src + size - A, dst + size - A); - } - - SIMD_INLINE int ResultCount(__m256i result) - { - uint32_t SIMD_ALIGNED(32) buffer[8]; - _mm256_store_si256((__m256i*)buffer, _mm256_sad_epu8(result, _mm256_setzero_si256())); - return buffer[0] + buffer[2] + buffer[4] + buffer[6]; - } - - SIMD_INLINE __m256 ValidSqrt(__m256 value) - { - __m256 mask = _mm256_cmp_ps(value, _mm256_set1_ps(0.0f), _CMP_GT_OQ); - return _mm256_sqrt_ps(_mm256_or_ps(_mm256_and_ps(mask, value), _mm256_andnot_ps(mask, _mm256_set1_ps(1.0f)))); - } - - SIMD_INLINE __m256i Sum32ip(uint32_t * const ptr[4], size_t offset) - { - __m256i s0 = _mm256_loadu_si256((__m256i*)(ptr[0] + offset)); - __m256i s1 = _mm256_loadu_si256((__m256i*)(ptr[1] + offset)); - __m256i s2 = _mm256_loadu_si256((__m256i*)(ptr[2] + offset)); - __m256i s3 = _mm256_loadu_si256((__m256i*)(ptr[3] + offset)); - return _mm256_sub_epi32(_mm256_sub_epi32(s0, s1), _mm256_sub_epi32(s2, s3)); - } - - SIMD_INLINE __m256i Sum32ii(uint32_t * const ptr[4], size_t offset) - { - __m256i lo = Sum32ip(ptr, offset + 0); - __m256i hi = Sum32ip(ptr, offset + 8); - return _mm256_permute2x128_si256( - _mm256_permutevar8x32_epi32(lo, K32_PERMUTE), - _mm256_permutevar8x32_epi32(hi, K32_PERMUTE), 0x20); - } - - SIMD_INLINE __m256 Norm32fp(const HidHaarCascade & hid, size_t offset) - { - __m256 area = _mm256_broadcast_ss(&hid.windowArea); - __m256 sum = _mm256_cvtepi32_ps(Sum32ip(hid.p, offset)); - __m256 sqsum = _mm256_cvtepi32_ps(Sum32ip(hid.pq, offset)); - return ValidSqrt(_mm256_sub_ps(_mm256_mul_ps(sqsum, area), _mm256_mul_ps(sum, sum))); - } - - SIMD_INLINE __m256 Norm32fi(const HidHaarCascade & hid, size_t offset) - { - __m256 area = _mm256_broadcast_ss(&hid.windowArea); - __m256 sum = _mm256_cvtepi32_ps(Sum32ii(hid.p, offset)); - __m256 sqsum = _mm256_cvtepi32_ps(Sum32ii(hid.pq, offset)); - return ValidSqrt(_mm256_sub_ps(_mm256_mul_ps(sqsum, area), _mm256_mul_ps(sum, sum))); - } - - SIMD_INLINE __m256 WeightedSum32f(const WeightedRect & rect, size_t offset) - { - __m256i s0 = _mm256_loadu_si256((__m256i*)(rect.p0 + offset)); - __m256i s1 = _mm256_loadu_si256((__m256i*)(rect.p1 + offset)); - __m256i s2 = _mm256_loadu_si256((__m256i*)(rect.p2 + offset)); - __m256i s3 = _mm256_loadu_si256((__m256i*)(rect.p3 + offset)); - __m256i sum = _mm256_sub_epi32(_mm256_sub_epi32(s0, s1), _mm256_sub_epi32(s2, s3)); - return _mm256_mul_ps(_mm256_cvtepi32_ps(sum), _mm256_broadcast_ss(&rect.weight)); - } - - SIMD_INLINE void StageSum32f(const float * leaves, float threshold, const __m256 & sum, const __m256 & norm, __m256 & stageSum) - { - __m256 mask = _mm256_cmp_ps(_mm256_mul_ps(_mm256_set1_ps(threshold), norm), sum, _CMP_GT_OQ); - stageSum = _mm256_add_ps(stageSum, _mm256_blendv_ps(_mm256_broadcast_ss(leaves + 1), 
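// Norm32fp/Norm32fi above compute the per-window contrast normalizer
// sqrt(area * sum(x^2) - sum(x)^2) from the integral images, with ValidSqrt
// guarding against a negative operand from rounding. A scalar sketch of the
// same computation (illustrative, not the library's API):
#include <cmath>
#include <cstdint>

float WindowNorm(uint32_t sum, uint32_t sqsum, float area)
{
    float variance = area * (float)sqsum - (float)sum * (float)sum;
    return variance > 0.0f ? std::sqrt(variance) : 1.0f;  // ValidSqrt fallback
}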
_mm256_broadcast_ss(leaves + 0), mask)); - } - - void Detect32f(const HidHaarCascade & hid, size_t offset, const __m256 & norm, __m256i & result) - { - typedef HidHaarCascade Hid; - const float * leaves = hid.leaves.data(); - const Hid::Node * node = hid.nodes.data(); - const Hid::Stage * stages = hid.stages.data(); - for (int i = 0, n = (int)hid.stages.size(); i < n; ++i) - { - const Hid::Stage & stage = stages[i]; - if (stage.canSkip) - continue; - const Hid::Node * end = node + stage.ntrees; - __m256 stageSum = _mm256_setzero_ps(); - if (stage.hasThree) - { - for (; node < end; ++node, leaves += 2) - { - const Hid::Feature & feature = hid.features[node->featureIdx]; - __m256 sum = _mm256_add_ps(WeightedSum32f(feature.rect[0], offset), WeightedSum32f(feature.rect[1], offset)); - if (feature.rect[2].p0) - sum = _mm256_add_ps(sum, WeightedSum32f(feature.rect[2], offset)); - StageSum32f(leaves, node->threshold, sum, norm, stageSum); - } - } - else - { - for (; node < end; ++node, leaves += 2) - { - const Hid::Feature & feature = hid.features[node->featureIdx]; - __m256 sum = _mm256_add_ps(WeightedSum32f(feature.rect[0], offset), WeightedSum32f(feature.rect[1], offset)); - StageSum32f(leaves, node->threshold, sum, norm, stageSum); - } - } - result = _mm256_andnot_si256(_mm256_castps_si256(_mm256_cmp_ps(_mm256_broadcast_ss(&stage.threshold), stageSum, _CMP_GT_OQ)), result); - int resultCount = ResultCount(result); - if (resultCount == 0) - return; - else if (resultCount == 1) - { - uint32_t SIMD_ALIGNED(32) _result[8]; - float SIMD_ALIGNED(32) _norm[8]; - _mm256_store_si256((__m256i*)_result, result); - _mm256_store_ps(_norm, norm); - for (int j = 0; j < 8; ++j) - { - if (_result[j]) - { - _result[j] = Base::Detect32f(hid, offset + j, i + 1, _norm[j]) > 0 ? 1 : 0; - break; - } - } - result = _mm256_load_si256((__m256i*)_result); - return; - } - } - } - - void DetectionHaarDetect32fp(const HidHaarCascade & hid, const Image & mask, const Rect & rect, Image & dst) - { - size_t width = rect.Width(); - size_t alignedWidth = Simd::AlignLo(width, 8); - size_t evenWidth = Simd::AlignLo(width, 2); - - Buffer buffer(width); - for (ptrdiff_t row = rect.top; row < rect.bottom; row += 1) - { - size_t col = 0; - size_t p_offset = row * hid.sum.stride / sizeof(uint32_t) + rect.left; - size_t pq_offset = row * hid.sqsum.stride / sizeof(uint32_t) + rect.left; - - UnpackMask32i(mask.data + row*mask.stride + rect.left, width, buffer.m, K8_01); - memset(buffer.d, 0, width * sizeof(uint32_t)); - for (; col < alignedWidth; col += 8) - { - __m256i result = _mm256_loadu_si256((__m256i*)(buffer.m + col)); - if (_mm256_testz_si256(result, K32_00000001)) - continue; - __m256 norm = Norm32fp(hid, pq_offset + col); - Detect32f(hid, p_offset + col, norm, result); - _mm256_storeu_si256((__m256i*)(buffer.d + col), result); - } - if (evenWidth > alignedWidth + 2) - { - col = evenWidth - 8; - __m256i result = _mm256_loadu_si256((__m256i*)(buffer.m + col)); - if (!_mm256_testz_si256(result, K32_00000001)) - { - __m256 norm = Norm32fp(hid, pq_offset + col); - Detect32f(hid, p_offset + col, norm, result); - _mm256_storeu_si256((__m256i*)(buffer.d + col), result); - } - col += 8; - } - for (; col < width; col += 1) - { - if (buffer.m[col] == 0) - continue; - float norm = Base::Norm32f(hid, pq_offset + col); - buffer.d[col] = Base::Detect32f(hid, p_offset + col, 0, norm) > 0 ? 
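// Note on the early exits in Detect32f above: stages are rejected for all
// eight candidate windows at once, but as soon as ResultCount(result)
// reports a single surviving window the code switches to the scalar
// Base::Detect32f for the remaining stages -- one lane of useful work no
// longer justifies eight-lane arithmetic.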
1 : 0; - } - PackResult32i(buffer.d, width, dst.data + row*dst.stride + rect.left); - } - } - - void DetectionHaarDetect32fp(const void * _hid, const uint8_t * mask, size_t maskStride, - ptrdiff_t left, ptrdiff_t top, ptrdiff_t right, ptrdiff_t bottom, uint8_t * dst, size_t dstStride) - { - const HidHaarCascade & hid = *(HidHaarCascade*)_hid; - return DetectionHaarDetect32fp(hid, - Image(hid.sum.width - 1, hid.sum.height - 1, maskStride, Image::Gray8, (uint8_t*)mask), - Rect(left, top, right, bottom), - Image(hid.sum.width - 1, hid.sum.height - 1, dstStride, Image::Gray8, dst).Ref()); - } - - void DetectionHaarDetect32fi(const HidHaarCascade & hid, const Image & mask, const Rect & rect, Image & dst) - { - const size_t step = 2; - size_t width = rect.Width(); - size_t alignedWidth = Simd::AlignLo(width, HA); - size_t evenWidth = Simd::AlignLo(width, 2); - - Buffer buffer(evenWidth); - for (ptrdiff_t row = rect.top; row < rect.bottom; row += step) - { - size_t col = 0; - size_t p_offset = row * hid.isum.stride / sizeof(uint32_t) + rect.left / 2; - size_t pq_offset = row * hid.sqsum.stride / sizeof(uint32_t) + rect.left; - - UnpackMask16i(mask.data + row*mask.stride + rect.left, evenWidth, buffer.m, K16_0001); - memset(buffer.d, 0, evenWidth * sizeof(uint16_t)); - for (; col < alignedWidth; col += HA) - { - __m256i result = _mm256_loadu_si256((__m256i*)(buffer.m + col)); - if (_mm256_testz_si256(result, K32_00000001)) - continue; - __m256 norm = Norm32fi(hid, pq_offset + col); - Detect32f(hid, p_offset + col / 2, norm, result); - _mm256_storeu_si256((__m256i*)(buffer.d + col), result); - } - if (evenWidth > alignedWidth) - { - col = evenWidth - HA; - __m256i result = _mm256_loadu_si256((__m256i*)(buffer.m + col)); - if (!_mm256_testz_si256(result, K32_00000001)) - { - __m256 norm = Norm32fi(hid, pq_offset + col); - Detect32f(hid, p_offset + col / 2, norm, result); - _mm256_storeu_si256((__m256i*)(buffer.d + col), result); - } - col += HA; - } - for (; col < width; col += step) - { - if (mask.At(col + rect.left, row) == 0) - continue; - float norm = Base::Norm32f(hid, pq_offset + col); - if (Base::Detect32f(hid, p_offset + col / 2, 0, norm) > 0) - dst.At(col + rect.left, row) = 1; - } - PackResult16i(buffer.d, evenWidth, dst.data + row*dst.stride + rect.left); - } - } - - void DetectionHaarDetect32fi(const void * _hid, const uint8_t * mask, size_t maskStride, - ptrdiff_t left, ptrdiff_t top, ptrdiff_t right, ptrdiff_t bottom, uint8_t * dst, size_t dstStride) - { - const HidHaarCascade & hid = *(HidHaarCascade*)_hid; - return DetectionHaarDetect32fi(hid, - Image(hid.sum.width - 1, hid.sum.height - 1, maskStride, Image::Gray8, (uint8_t*)mask), - Rect(left, top, right, bottom), - Image(hid.sum.width - 1, hid.sum.height - 1, dstStride, Image::Gray8, dst).Ref()); - } - - const __m256i K8_SHUFFLE_BITS = SIMD_MM256_SETR_EPI8( - 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); - - SIMD_INLINE __m256i IntegralSum32i(const __m256i & s0, const __m256i & s1, const __m256i & s2, const __m256i & s3) - { - return _mm256_sub_epi32(_mm256_sub_epi32(s0, s1), _mm256_sub_epi32(s2, s3)); - } - - SIMD_INLINE __m256i GreaterOrEqual32i(__m256i a, __m256i b) - { - return _mm256_cmpeq_epi32(_mm256_max_epu32(a, b), a); - } - - template SIMD_INLINE void Load(__m256i a[16], const HidLbpFeature & feature, ptrdiff_t offset) - { - a[i] = _mm256_loadu_si256((__m256i*)(feature.p[i] + 
offset)); - } - - SIMD_INLINE void Calculate(const HidLbpFeature & feature, ptrdiff_t offset, __m256i & index, __m256i & shuffle, __m256i & mask) - { - __m256i a[16]; - Load<5>(a, feature, offset); - Load<6>(a, feature, offset); - Load<9>(a, feature, offset); - Load<10>(a, feature, offset); - __m256i central = IntegralSum32i(a[5], a[6], a[9], a[10]); - - Load<0>(a, feature, offset); - Load<1>(a, feature, offset); - Load<4>(a, feature, offset); - index = GreaterOrEqual32i(IntegralSum32i(a[0], a[1], a[4], a[5]), central); - - shuffle = K32_FFFFFF00; - Load<2>(a, feature, offset); - shuffle = _mm256_or_si256(shuffle, _mm256_and_si256(GreaterOrEqual32i(IntegralSum32i(a[1], a[2], a[5], a[6]), central), K32_00000008)); - Load<3>(a, feature, offset); - Load<7>(a, feature, offset); - shuffle = _mm256_or_si256(shuffle, _mm256_and_si256(GreaterOrEqual32i(IntegralSum32i(a[2], a[3], a[6], a[7]), central), K32_00000004)); - Load<11>(a, feature, offset); - shuffle = _mm256_or_si256(shuffle, _mm256_and_si256(GreaterOrEqual32i(IntegralSum32i(a[6], a[7], a[10], a[11]), central), K32_00000002)); - Load<14>(a, feature, offset); - Load<15>(a, feature, offset); - shuffle = _mm256_or_si256(shuffle, _mm256_and_si256(GreaterOrEqual32i(IntegralSum32i(a[10], a[11], a[14], a[15]), central), K32_00000001)); - - mask = K32_FFFFFF00; - Load<13>(a, feature, offset); - mask = _mm256_or_si256(mask, _mm256_and_si256(GreaterOrEqual32i(IntegralSum32i(a[9], a[10], a[13], a[14]), central), K32_00000004)); - Load<12>(a, feature, offset); - Load<8>(a, feature, offset); - mask = _mm256_or_si256(mask, _mm256_and_si256(GreaterOrEqual32i(IntegralSum32i(a[8], a[9], a[12], a[13]), central), K32_00000002)); - mask = _mm256_or_si256(mask, _mm256_and_si256(GreaterOrEqual32i(IntegralSum32i(a[4], a[5], a[8], a[9]), central), K32_00000001)); - mask = _mm256_shuffle_epi8(K8_SHUFFLE_BITS, mask); - } - - SIMD_INLINE __m256i LeafMask(const HidLbpFeature & feature, ptrdiff_t offset, const int * subset) - { - __m256i index, shuffle, mask; - Calculate(feature, offset, index, shuffle, mask); - - __m256i _subset = _mm256_loadu_si256((__m256i*)subset); - __m256i subset0 = _mm256_permute4x64_epi64(_subset, 0x44); - __m256i subset1 = _mm256_permute4x64_epi64(_subset, 0xEE); - - __m256i value0 = _mm256_and_si256(_mm256_shuffle_epi8(subset0, shuffle), mask); - __m256i value1 = _mm256_and_si256(_mm256_shuffle_epi8(subset1, shuffle), mask); - __m256i value = _mm256_blendv_epi8(value0, value1, index); - - return _mm256_andnot_si256(_mm256_cmpeq_epi32(value, _mm256_setzero_si256()), K_INV_ZERO); - } - - void Detect(const HidLbpCascade & hid, size_t offset, int startStage, __m256i & result) - { - typedef HidLbpCascade Hid; - - size_t subsetSize = (hid.ncategories + 31) / 32; - const int * subsets = hid.subsets.data(); - const Hid::Leave * leaves = hid.leaves.data(); - const Hid::Node * nodes = hid.nodes.data(); - const Hid::Stage * stages = hid.stages.data(); - int nodeOffset = stages[startStage].first; - int leafOffset = 2 * nodeOffset; - for (int i_stage = startStage, n_stages = (int)hid.stages.size(); i_stage < n_stages; i_stage++) - { - const Hid::Stage & stage = stages[i_stage]; - __m256 sum = _mm256_setzero_ps(); - for (int i_tree = 0, n_trees = stage.ntrees; i_tree < n_trees; i_tree++) - { - const Hid::Feature & feature = hid.features[nodes[nodeOffset].featureIdx]; - const int * subset = subsets + nodeOffset*subsetSize; - __m256i mask = LeafMask(feature, offset, subset); - sum = _mm256_add_ps(sum, _mm256_blendv_ps(_mm256_broadcast_ss(leaves + 
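// Calculate/LeafMask above build a local binary pattern: the integral sums
// of the eight blocks around the central one are each compared against the
// central sum and packed into an 8-bit code, which indexes the node's
// 256-bit subset bitmap. A scalar sketch of the same logic (illustrative
// names; the exact bit order here is an assumption, and blockSum stands in
// for IntegralSum32i over a 3x3 grid stored row-major):
#include <cstdint>

int LbpLeaf(const uint32_t blockSum[9], const int subset[8])
{
    uint32_t central = blockSum[4];
    static const int ring[8] = { 0, 1, 2, 5, 8, 7, 6, 3 };  // assumed clockwise ring
    int code = 0;
    for (int bit = 0; bit < 8; ++bit)
        if (blockSum[ring[bit]] >= central)
            code |= 1 << (7 - bit);
    // subset is a 256-bit bitmap stored as 8 x int32: test bit 'code'.
    return (subset[code >> 5] & (1 << (code & 31))) ? 0 : 1;  // 0 -> left leaf
}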
leafOffset + 1), _mm256_broadcast_ss(leaves + leafOffset + 0), _mm256_castsi256_ps(mask))); - nodeOffset++; - leafOffset += 2; - } - result = _mm256_andnot_si256(_mm256_castps_si256(_mm256_cmp_ps(_mm256_broadcast_ss(&stage.threshold), sum, _CMP_GT_OQ)), result); - int resultCount = ResultCount(result); - if (resultCount == 0) - return; - else if (resultCount == 1) - { - uint32_t SIMD_ALIGNED(32) _result[8]; - _mm256_store_si256((__m256i*)_result, result); - for (int i = 0; i < 8; ++i) - { - if (_result[i]) - { - _result[i] = Base::Detect(hid, offset + i, i_stage + 1) > 0 ? 1 : 0; - break; - } - } - result = _mm256_load_si256((__m256i*)_result); - return; - } - } - } - - void DetectionLbpDetect32fp(const HidLbpCascade & hid, const Image & mask, const Rect & rect, Image & dst) - { - size_t width = rect.Width(); - size_t alignedWidth = Simd::AlignLo(width, 8); - size_t evenWidth = Simd::AlignLo(width, 2); - - Buffer buffer(width); - for (ptrdiff_t row = rect.top; row < rect.bottom; row += 1) - { - size_t col = 0; - size_t offset = row * hid.sum.stride / sizeof(uint32_t) + rect.left; - - UnpackMask32i(mask.data + row*mask.stride + rect.left, width, buffer.m, K8_01); - memset(buffer.d, 0, width * sizeof(uint32_t)); - for (; col < alignedWidth; col += 8) - { - __m256i result = _mm256_loadu_si256((__m256i*)(buffer.m + col)); - if (_mm256_testz_si256(result, K32_00000001)) - continue; - Detect(hid, offset + col, 0, result); - _mm256_storeu_si256((__m256i*)(buffer.d + col), result); - } - if (evenWidth > alignedWidth + 2) - { - col = evenWidth - 8; - __m256i result = _mm256_loadu_si256((__m256i*)(buffer.m + col)); - if (!_mm256_testz_si256(result, K32_00000001)) - { - Detect(hid, offset + col, 0, result); - _mm256_storeu_si256((__m256i*)(buffer.d + col), result); - } - col += 8; - } - for (; col < width; col += 1) - { - if (buffer.m[col] == 0) - continue; - buffer.d[col] = Base::Detect(hid, offset + col, 0) > 0 ? 
1 : 0; - } - PackResult32i(buffer.d, width, dst.data + row*dst.stride + rect.left); - } - } - - void DetectionLbpDetect32fp(const void * _hid, const uint8_t * mask, size_t maskStride, - ptrdiff_t left, ptrdiff_t top, ptrdiff_t right, ptrdiff_t bottom, uint8_t * dst, size_t dstStride) - { - const HidLbpCascade & hid = *(HidLbpCascade*)_hid; - return DetectionLbpDetect32fp(hid, - Image(hid.sum.width - 1, hid.sum.height - 1, maskStride, Image::Gray8, (uint8_t*)mask), - Rect(left, top, right, bottom), - Image(hid.sum.width - 1, hid.sum.height - 1, dstStride, Image::Gray8, dst).Ref()); - } - - void DetectionLbpDetect32fi(const HidLbpCascade & hid, const Image & mask, const Rect & rect, Image & dst) - { - const size_t step = 2; - size_t width = rect.Width(); - size_t alignedWidth = Simd::AlignLo(width, HA); - size_t evenWidth = Simd::AlignLo(width, 2); - - Buffer buffer(evenWidth); - for (ptrdiff_t row = rect.top; row < rect.bottom; row += step) - { - size_t col = 0; - size_t offset = row * hid.isum.stride / sizeof(uint32_t) + rect.left / 2; - - UnpackMask16i(mask.data + row*mask.stride + rect.left, evenWidth, buffer.m, K16_0001); - memset(buffer.d, 0, evenWidth * sizeof(uint16_t)); - for (; col < alignedWidth; col += HA) - { - __m256i result = _mm256_loadu_si256((__m256i*)(buffer.m + col)); - if (_mm256_testz_si256(result, K32_00000001)) - continue; - Detect(hid, offset + col / 2, 0, result); - _mm256_storeu_si256((__m256i*)(buffer.d + col), result); - } - if (evenWidth > alignedWidth) - { - col = evenWidth - HA; - __m256i result = _mm256_loadu_si256((__m256i*)(buffer.m + col)); - if (!_mm256_testz_si256(result, K32_00000001)) - { - Detect(hid, offset + col / 2, 0, result); - _mm256_storeu_si256((__m256i*)(buffer.d + col), result); - } - col += HA; - } - for (; col < width; col += step) - { - if (mask.At(col + rect.left, row) == 0) - continue; - if (Base::Detect(hid, offset + col / 2, 0) > 0) - dst.At(col + rect.left, row) = 1; - } - PackResult16i(buffer.d, evenWidth, dst.data + row*dst.stride + rect.left); - } - } - - void DetectionLbpDetect32fi(const void * _hid, const uint8_t * mask, size_t maskStride, - ptrdiff_t left, ptrdiff_t top, ptrdiff_t right, ptrdiff_t bottom, uint8_t * dst, size_t dstStride) - { - const HidLbpCascade & hid = *(HidLbpCascade*)_hid; - return DetectionLbpDetect32fi(hid, - Image(hid.sum.width - 1, hid.sum.height - 1, maskStride, Image::Gray8, (uint8_t*)mask), - Rect(left, top, right, bottom), - Image(hid.sum.width - 1, hid.sum.height - 1, dstStride, Image::Gray8, dst).Ref()); - } - - SIMD_INLINE __m256i IntegralSum16i(const __m256i & s0, const __m256i & s1, const __m256i & s2, const __m256i & s3) - { - return _mm256_sub_epi16(_mm256_sub_epi16(s0, s1), _mm256_sub_epi16(s2, s3)); - } - - SIMD_INLINE __m256i GreaterOrEqual16i(__m256i a, __m256i b) - { - return _mm256_cmpeq_epi16(_mm256_max_epu16(a, b), a); - } - - template SIMD_INLINE void Load(__m256i a[16], const HidLbpFeature & feature, ptrdiff_t offset) - { - a[i] = _mm256_loadu_si256((__m256i*)(feature.p[i] + offset)); - } - - SIMD_INLINE void Calculate(const HidLbpFeature & feature, ptrdiff_t offset, __m256i & index, __m256i & shuffle, __m256i & mask) - { - __m256i a[16]; - Load<5>(a, feature, offset); - Load<6>(a, feature, offset); - Load<9>(a, feature, offset); - Load<10>(a, feature, offset); - __m256i central = IntegralSum16i(a[5], a[6], a[9], a[10]); - - Load<0>(a, feature, offset); - Load<1>(a, feature, offset); - Load<4>(a, feature, offset); - index = GreaterOrEqual16i(IntegralSum16i(a[0], a[1], a[4], a[5]), 
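// GreaterOrEqual16i above works around AVX2's lack of unsigned 16-bit
// compares: max_epu16(a, b) == a holds exactly when a >= b as unsigned, so
// cmpeq on the max yields the >= mask. The same identity one lane-width
// down, as a sketch:
#include <immintrin.h>

inline __m256i GreaterOrEqual8u(__m256i a, __m256i b)
{
    return _mm256_cmpeq_epi8(_mm256_max_epu8(a, b), a);  // a >= b, unsigned bytes
}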
central); - - shuffle = K16_FF00; - Load<2>(a, feature, offset); - shuffle = _mm256_or_si256(shuffle, _mm256_and_si256(GreaterOrEqual16i(IntegralSum16i(a[1], a[2], a[5], a[6]), central), K16_0008)); - Load<3>(a, feature, offset); - Load<7>(a, feature, offset); - shuffle = _mm256_or_si256(shuffle, _mm256_and_si256(GreaterOrEqual16i(IntegralSum16i(a[2], a[3], a[6], a[7]), central), K16_0004)); - Load<11>(a, feature, offset); - shuffle = _mm256_or_si256(shuffle, _mm256_and_si256(GreaterOrEqual16i(IntegralSum16i(a[6], a[7], a[10], a[11]), central), K16_0002)); - Load<14>(a, feature, offset); - Load<15>(a, feature, offset); - shuffle = _mm256_or_si256(shuffle, _mm256_and_si256(GreaterOrEqual16i(IntegralSum16i(a[10], a[11], a[14], a[15]), central), K16_0001)); - - mask = K16_FF00; - Load<13>(a, feature, offset); - mask = _mm256_or_si256(mask, _mm256_and_si256(GreaterOrEqual16i(IntegralSum16i(a[9], a[10], a[13], a[14]), central), K16_0004)); - Load<12>(a, feature, offset); - Load<8>(a, feature, offset); - mask = _mm256_or_si256(mask, _mm256_and_si256(GreaterOrEqual16i(IntegralSum16i(a[8], a[9], a[12], a[13]), central), K16_0002)); - mask = _mm256_or_si256(mask, _mm256_and_si256(GreaterOrEqual16i(IntegralSum16i(a[4], a[5], a[8], a[9]), central), K16_0001)); - mask = _mm256_shuffle_epi8(K8_SHUFFLE_BITS, mask); - } - - SIMD_INLINE __m256i LeafMask(const HidLbpFeature & feature, ptrdiff_t offset, const int * subset) - { - __m256i index, shuffle, mask; - Calculate(feature, offset, index, shuffle, mask); - - __m256i _subset = _mm256_loadu_si256((__m256i*)subset); - __m256i subset0 = _mm256_permute4x64_epi64(_subset, 0x44); - __m256i subset1 = _mm256_permute4x64_epi64(_subset, 0xEE); - - __m256i value0 = _mm256_and_si256(_mm256_shuffle_epi8(subset0, shuffle), mask); - __m256i value1 = _mm256_and_si256(_mm256_shuffle_epi8(subset1, shuffle), mask); - __m256i value = _mm256_blendv_epi8(value0, value1, index); - - return _mm256_andnot_si256(_mm256_cmpeq_epi16(value, _mm256_setzero_si256()), Simd::Avx2::K_INV_ZERO); - } - - void Detect(const HidLbpCascade & hid, size_t offset, __m256i & result) - { - typedef HidLbpCascade Hid; - - size_t subsetSize = (hid.ncategories + 31) / 32; - const int * subsets = hid.subsets.data(); - const Hid::Leave * leaves = hid.leaves.data(); - const Hid::Node * nodes = hid.nodes.data(); - const Hid::Stage * stages = hid.stages.data(); - int nodeOffset = 0, leafOffset = 0; - for (int i_stage = 0, n_stages = (int)hid.stages.size(); i_stage < n_stages; i_stage++) - { - const Hid::Stage & stage = stages[i_stage]; - __m256i sum = _mm256_setzero_si256(); - for (int i_tree = 0, n_trees = stage.ntrees; i_tree < n_trees; i_tree++) - { - const Hid::Feature & feature = hid.features[nodes[nodeOffset].featureIdx]; - const int * subset = subsets + nodeOffset*subsetSize; - __m256i mask = LeafMask(feature, offset, subset); - sum = _mm256_add_epi16(sum, _mm256_blendv_epi8(_mm256_set1_epi16(leaves[leafOffset + 1]), _mm256_set1_epi16(leaves[leafOffset + 0]), mask)); - nodeOffset++; - leafOffset += 2; - } - result = _mm256_andnot_si256(_mm256_cmpgt_epi16(_mm256_set1_epi16(stage.threshold), sum), result); - int resultCount = ResultCount(result); - if (resultCount == 0) - return; - else if (resultCount == 1) - { - uint16_t SIMD_ALIGNED(32) _result[HA]; - _mm256_store_si256((__m256i*)_result, result); - for (int i = 0; i < HA; ++i) - { - if (_result[i]) - { - _result[i] = Base::Detect(hid, offset + i, i_stage + 1) > 0 ? 
1 : 0; - break; - } - } - result = _mm256_load_si256((__m256i*)_result); - return; - } - } - } - - void DetectionLbpDetect16ip(const HidLbpCascade & hid, const Image & mask, const Rect & rect, Image & dst) - { - size_t width = rect.Width(); - size_t alignedWidth = Simd::AlignLo(width, HA); - size_t evenWidth = Simd::AlignLo(width, 2); - Buffer buffer(width); - for (ptrdiff_t row = rect.top; row < rect.bottom; row += 1) - { - size_t col = 0; - size_t offset = row * hid.isum.stride / sizeof(uint16_t) + rect.left; - UnpackMask16i(mask.data + row*mask.stride + rect.left, width, buffer.m, K8_01); - memset(buffer.d, 0, width * sizeof(uint16_t)); - for (; col < alignedWidth; col += HA) - { - __m256i result = _mm256_loadu_si256((__m256i*)(buffer.m + col)); - if (_mm256_testz_si256(result, K16_0001)) - continue; - Detect(hid, offset + col, result); - _mm256_storeu_si256((__m256i*)(buffer.d + col), result); - } - if (evenWidth > alignedWidth + 2) - { - col = evenWidth - HA; - __m256i result = _mm256_loadu_si256((__m256i*)(buffer.m + col)); - if (!_mm256_testz_si256(result, K16_0001)) - { - Detect(hid, offset + col, result); - _mm256_storeu_si256((__m256i*)(buffer.d + col), result); - } - col += HA; - } - for (; col < width; ++col) - { - if (buffer.m[col] == 0) - continue; - buffer.d[col] = Base::Detect(hid, offset + col, 0) > 0 ? 1 : 0; - } - PackResult16i(buffer.d, width, dst.data + row*dst.stride + rect.left); - } - } - - void DetectionLbpDetect16ip(const void * _hid, const uint8_t * mask, size_t maskStride, - ptrdiff_t left, ptrdiff_t top, ptrdiff_t right, ptrdiff_t bottom, uint8_t * dst, size_t dstStride) - { - const HidLbpCascade & hid = *(HidLbpCascade*)_hid; - return DetectionLbpDetect16ip(hid, - Image(hid.sum.width - 1, hid.sum.height - 1, maskStride, Image::Gray8, (uint8_t*)mask), - Rect(left, top, right, bottom), - Image(hid.sum.width - 1, hid.sum.height - 1, dstStride, Image::Gray8, dst).Ref()); - } - - void DetectionLbpDetect16ii(const HidLbpCascade & hid, const Image & mask, const Rect & rect, Image & dst) - { - const size_t step = 2; - size_t width = rect.Width(); - size_t alignedWidth = Simd::AlignLo(width, A); - size_t evenWidth = Simd::AlignLo(width, 2); - - for (ptrdiff_t row = rect.top; row < rect.bottom; row += step) - { - size_t col = 0; - size_t offset = row * hid.isum.stride / sizeof(uint16_t) + rect.left / 2; - const uint8_t * m = mask.data + row*mask.stride + rect.left; - uint8_t * d = dst.data + row*dst.stride + rect.left; - for (; col < alignedWidth; col += A) - { - __m256i result = _mm256_and_si256(_mm256_loadu_si256((__m256i*)(m + col)), K16_0001); - if (_mm256_testz_si256(result, K16_0001)) - continue; - Detect(hid, offset + col / 2, result); - _mm256_storeu_si256((__m256i*)(d + col), result); - } - if (evenWidth > alignedWidth + 2) - { - col = evenWidth - A; - __m256i result = _mm256_and_si256(_mm256_loadu_si256((__m256i*)(m + col)), K16_0001); - if (!_mm256_testz_si256(result, K16_0001)) - { - Detect(hid, offset + col / 2, result); - _mm256_storeu_si256((__m256i*)(d + col), result); - } - col += A; - } - for (; col < width; col += step) - { - if (mask.At(col + rect.left, row) == 0) - continue; - if (Base::Detect(hid, offset + col / 2, 0) > 0) - dst.At(col + rect.left, row) = 1; - } - } - } - - void DetectionLbpDetect16ii(const void * _hid, const uint8_t * mask, size_t maskStride, - ptrdiff_t left, ptrdiff_t top, ptrdiff_t right, ptrdiff_t bottom, uint8_t * dst, size_t dstStride) - { - const HidLbpCascade & hid = *(HidLbpCascade*)_hid; - return 
DetectionLbpDetect16ii(hid, - Image(hid.sum.width - 1, hid.sum.height - 1, maskStride, Image::Gray8, (uint8_t*)mask), - Rect(left, top, right, bottom), - Image(hid.sum.width - 1, hid.sum.height - 1, dstStride, Image::Gray8, dst).Ref()); - } - } -#endif// SIMD_AVX2_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx2EdgeBackground.cpp b/src/3rd/Simd/Simd/SimdAvx2EdgeBackground.cpp deleted file mode 100644 index 85205c04..00000000 --- a/src/3rd/Simd/Simd/SimdAvx2EdgeBackground.cpp +++ /dev/null @@ -1,303 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdCompare.h" -#include "Simd/SimdSet.h" -#include "Simd/SimdBase.h" - -namespace Simd -{ -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - template SIMD_INLINE void EdgeBackgroundGrowRangeSlow(const uint8_t * value, uint8_t * background, __m256i tailMask) - { - const __m256i _value = Load((__m256i*)value); - const __m256i _background = Load((__m256i*)background); - const __m256i inc = _mm256_and_si256(tailMask, Greater8u(_value, _background)); - Store((__m256i*)background, _mm256_adds_epu8(_background, inc)); - } - - template void EdgeBackgroundGrowRangeSlow(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * background, size_t backgroundStride) - { - assert(width >= A); - if (align) - { - assert(Aligned(value) && Aligned(valueStride)); - assert(Aligned(background) && Aligned(backgroundStride)); - } - - size_t alignedWidth = AlignLo(width, A); - __m256i tailMask = SetMask(0, A - width + alignedWidth, 1); - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - EdgeBackgroundGrowRangeSlow(value + col, background + col, K8_01); - if (alignedWidth != width) - EdgeBackgroundGrowRangeSlow(value + width - A, background + width - A, tailMask); - value += valueStride; - background += backgroundStride; - } - } - - void EdgeBackgroundGrowRangeSlow(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * background, size_t backgroundStride) - { - if (Aligned(value) && Aligned(valueStride) && Aligned(background) && Aligned(backgroundStride)) - EdgeBackgroundGrowRangeSlow(value, valueStride, width, height, background, backgroundStride); - else - EdgeBackgroundGrowRangeSlow(value, valueStride, width, height, background, backgroundStride); - } - - template 
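// EdgeBackgroundGrowRangeSlow above raises the background model by at most
// one grey level per frame where the current value exceeds it: a compare
// mask ANDed with a vector of ones, added with unsigned saturation. A
// standalone sketch (Greater8u expressed via the max_epu8 identity; names
// illustrative):
#include <immintrin.h>

inline __m256i GrowByOneWhereGreater(__m256i value, __m256i background)
{
    __m256i greater = _mm256_andnot_si256(                 // value > background
        _mm256_cmpeq_epi8(_mm256_max_epu8(background, value), background),
        _mm256_set1_epi8(-1));
    __m256i inc = _mm256_and_si256(greater, _mm256_set1_epi8(1));
    return _mm256_adds_epu8(background, inc);              // saturating +1
}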
SIMD_INLINE void EdgeBackgroundGrowRangeFast(const uint8_t * value, uint8_t * background) - { - const __m256i _value = Load((__m256i*)value); - const __m256i _background = Load((__m256i*)background); - Store((__m256i*)background, _mm256_max_epu8(_background, _value)); - } - - template void EdgeBackgroundGrowRangeFast(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * background, size_t backgroundStride) - { - assert(width >= A); - if (align) - { - assert(Aligned(value) && Aligned(valueStride)); - assert(Aligned(background) && Aligned(backgroundStride)); - } - - size_t alignedWidth = AlignLo(width, A); - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - EdgeBackgroundGrowRangeFast(value + col, background + col); - if (alignedWidth != width) - EdgeBackgroundGrowRangeFast(value + width - A, background + width - A); - value += valueStride; - background += backgroundStride; - } - } - - void EdgeBackgroundGrowRangeFast(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * background, size_t backgroundStride) - { - if (Aligned(value) && Aligned(valueStride) && Aligned(background) && Aligned(backgroundStride)) - EdgeBackgroundGrowRangeFast(value, valueStride, width, height, background, backgroundStride); - else - EdgeBackgroundGrowRangeFast(value, valueStride, width, height, background, backgroundStride); - } - - template SIMD_INLINE void EdgeBackgroundIncrementCount(const uint8_t * value, - const uint8_t * backgroundValue, uint8_t * backgroundCount, size_t offset, __m256i tailMask) - { - const __m256i _value = Load((__m256i*)(value + offset)); - const __m256i _backgroundValue = Load((__m256i*)(backgroundValue + offset)); - const __m256i _backgroundCount = Load((__m256i*)(backgroundCount + offset)); - - const __m256i inc = _mm256_and_si256(tailMask, Greater8u(_value, _backgroundValue)); - - Store((__m256i*)(backgroundCount + offset), _mm256_adds_epu8(_backgroundCount, inc)); - } - - template void EdgeBackgroundIncrementCount(const uint8_t * value, size_t valueStride, size_t width, size_t height, - const uint8_t * backgroundValue, size_t backgroundValueStride, uint8_t * backgroundCount, size_t backgroundCountStride) - { - assert(width >= A); - if (align) - { - assert(Aligned(value) && Aligned(valueStride)); - assert(Aligned(backgroundValue) && Aligned(backgroundValueStride)); - assert(Aligned(backgroundCount) && Aligned(backgroundCountStride)); - } - - size_t alignedWidth = AlignLo(width, A); - __m256i tailMask = SetMask(0, A - width + alignedWidth, 1); - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - EdgeBackgroundIncrementCount(value, backgroundValue, backgroundCount, col, K8_01); - if (alignedWidth != width) - EdgeBackgroundIncrementCount(value, backgroundValue, backgroundCount, width - A, tailMask); - value += valueStride; - backgroundValue += backgroundValueStride; - backgroundCount += backgroundCountStride; - } - } - - void EdgeBackgroundIncrementCount(const uint8_t * value, size_t valueStride, size_t width, size_t height, - const uint8_t * backgroundValue, size_t backgroundValueStride, uint8_t * backgroundCount, size_t backgroundCountStride) - { - if (Aligned(value) && Aligned(valueStride) && - Aligned(backgroundValue) && Aligned(backgroundValueStride) && Aligned(backgroundCount) && Aligned(backgroundCountStride)) - EdgeBackgroundIncrementCount(value, valueStride, width, height, - backgroundValue, backgroundValueStride, 
backgroundCount, backgroundCountStride); - else - EdgeBackgroundIncrementCount(value, valueStride, width, height, - backgroundValue, backgroundValueStride, backgroundCount, backgroundCountStride); - } - - SIMD_INLINE __m256i AdjustEdge(const __m256i &count, const __m256i & value, const __m256i & mask, const __m256i & threshold) - { - const __m256i inc = _mm256_and_si256(mask, Greater8u(count, threshold)); - const __m256i dec = _mm256_and_si256(mask, Lesser8u(count, threshold)); - return _mm256_subs_epu8(_mm256_adds_epu8(value, inc), dec); - } - - template SIMD_INLINE void EdgeBackgroundAdjustRange(uint8_t * backgroundCount, uint8_t * backgroundValue, - size_t offset, const __m256i & threshold, const __m256i & mask) - { - const __m256i _backgroundCount = Load((__m256i*)(backgroundCount + offset)); - const __m256i _backgroundValue = Load((__m256i*)(backgroundValue + offset)); - - Store((__m256i*)(backgroundValue + offset), AdjustEdge(_backgroundCount, _backgroundValue, mask, threshold)); - Store((__m256i*)(backgroundCount + offset), K_ZERO); - } - - template void EdgeBackgroundAdjustRange(uint8_t * backgroundCount, size_t backgroundCountStride, size_t width, size_t height, - uint8_t * backgroundValue, size_t backgroundValueStride, uint8_t threshold) - { - assert(width >= A); - if (align) - { - assert(Aligned(backgroundValue) && Aligned(backgroundValueStride)); - assert(Aligned(backgroundCount) && Aligned(backgroundCountStride)); - } - - const __m256i _threshold = _mm256_set1_epi8((char)threshold); - size_t alignedWidth = AlignLo(width, A); - __m256i tailMask = SetMask(0, A - width + alignedWidth, 1); - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - EdgeBackgroundAdjustRange(backgroundCount, backgroundValue, col, _threshold, K8_01); - if (alignedWidth != width) - EdgeBackgroundAdjustRange(backgroundCount, backgroundValue, width - A, _threshold, tailMask); - backgroundValue += backgroundValueStride; - backgroundCount += backgroundCountStride; - } - } - - void EdgeBackgroundAdjustRange(uint8_t * backgroundCount, size_t backgroundCountStride, size_t width, size_t height, - uint8_t * backgroundValue, size_t backgroundValueStride, uint8_t threshold) - { - if (Aligned(backgroundValue) && Aligned(backgroundValueStride) && - Aligned(backgroundCount) && Aligned(backgroundCountStride)) - EdgeBackgroundAdjustRange(backgroundCount, backgroundCountStride, width, height, - backgroundValue, backgroundValueStride, threshold); - else - EdgeBackgroundAdjustRange(backgroundCount, backgroundCountStride, width, height, - backgroundValue, backgroundValueStride, threshold); - } - - template SIMD_INLINE void EdgeBackgroundAdjustRangeMasked(uint8_t * backgroundCount, uint8_t * backgroundValue, - const uint8_t * mask, size_t offset, const __m256i & threshold, const __m256i & tailMask) - { - const __m256i _mask = Load((const __m256i*)(mask + offset)); - EdgeBackgroundAdjustRange(backgroundCount, backgroundValue, offset, threshold, _mm256_and_si256(_mask, tailMask)); - } - - template void EdgeBackgroundAdjustRangeMasked(uint8_t * backgroundCount, size_t backgroundCountStride, size_t width, size_t height, - uint8_t * backgroundValue, size_t backgroundValueStride, uint8_t threshold, const uint8_t * mask, size_t maskStride) - { - assert(width >= A); - if (align) - { - assert(Aligned(backgroundValue) && Aligned(backgroundValueStride)); - assert(Aligned(backgroundCount) && Aligned(backgroundCountStride)); - assert(Aligned(mask) && Aligned(maskStride)); - } - - const 
__m256i _threshold = _mm256_set1_epi8((char)threshold); - size_t alignedWidth = AlignLo(width, A); - __m256i tailMask = SetMask(0, A - width + alignedWidth, 1); - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - EdgeBackgroundAdjustRangeMasked(backgroundCount, backgroundValue, mask, col, _threshold, K8_01); - if (alignedWidth != width) - EdgeBackgroundAdjustRangeMasked(backgroundCount, backgroundValue, mask, width - A, _threshold, tailMask); - backgroundValue += backgroundValueStride; - backgroundCount += backgroundCountStride; - mask += maskStride; - } - } - - void EdgeBackgroundAdjustRangeMasked(uint8_t * backgroundCount, size_t backgroundCountStride, size_t width, size_t height, - uint8_t * backgroundValue, size_t backgroundValueStride, uint8_t threshold, const uint8_t * mask, size_t maskStride) - { - if (Aligned(backgroundValue) && Aligned(backgroundValueStride) && - Aligned(backgroundCount) && Aligned(backgroundCountStride) && - Aligned(mask) && Aligned(maskStride)) - EdgeBackgroundAdjustRangeMasked(backgroundCount, backgroundCountStride, width, height, - backgroundValue, backgroundValueStride, threshold, mask, maskStride); - else - EdgeBackgroundAdjustRangeMasked(backgroundCount, backgroundCountStride, width, height, - backgroundValue, backgroundValueStride, threshold, mask, maskStride); - } - - template SIMD_INLINE void EdgeBackgroundShiftRangeMasked(const uint8_t * value, uint8_t * background, const uint8_t * mask, size_t offset) - { - const __m256i _value = Load((__m256i*)(value + offset)); - const __m256i _background = Load((__m256i*)(background + offset)); - const __m256i _mask = Load((const __m256i*)(mask + offset)); - Store((__m256i*)(background + offset), _mm256_or_si256(_mm256_and_si256(_mask, _value), _mm256_andnot_si256(_mask, _background))); - } - - template void EdgeBackgroundShiftRangeMasked(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * background, size_t backgroundStride, const uint8_t * mask, size_t maskStride) - { - assert(width >= A); - if (align) - { - assert(Aligned(value) && Aligned(valueStride)); - assert(Aligned(background) && Aligned(backgroundStride)); - assert(Aligned(mask) && Aligned(maskStride)); - } - - size_t alignedWidth = AlignLo(width, A); - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - EdgeBackgroundShiftRangeMasked(value, background, mask, col); - if (alignedWidth != width) - EdgeBackgroundShiftRangeMasked(value, background, mask, width - A); - value += valueStride; - background += backgroundStride; - mask += maskStride; - } - } - - void EdgeBackgroundShiftRangeMasked(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * background, size_t backgroundStride, const uint8_t * mask, size_t maskStride) - { - if (Aligned(value) && Aligned(valueStride) && Aligned(background) && Aligned(backgroundStride) && Aligned(mask) && Aligned(maskStride)) - EdgeBackgroundShiftRangeMasked(value, valueStride, width, height, background, backgroundStride, mask, maskStride); - else - EdgeBackgroundShiftRangeMasked(value, valueStride, width, height, background, backgroundStride, mask, maskStride); - } - } -#endif// SIMD_AVX2_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx2Fill.cpp b/src/3rd/Simd/Simd/SimdAvx2Fill.cpp deleted file mode 100644 index 7557d491..00000000 --- a/src/3rd/Simd/Simd/SimdAvx2Fill.cpp +++ /dev/null @@ -1,165 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). 
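// EdgeBackgroundShiftRangeMasked above is a branchless byte select:
// (mask & value) | (~mask & background) copies 'value' where the mask is
// set and keeps 'background' elsewhere. Sketch of the idiom (with a
// per-byte 0x00/0xFF mask, _mm256_blendv_epi8 would be an equivalent
// choice):
#include <immintrin.h>

inline __m256i Select(__m256i mask, __m256i a, __m256i b)
{
    return _mm256_or_si256(_mm256_and_si256(mask, a),
                           _mm256_andnot_si256(mask, b));
}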
-* -* Copyright (c) 2011-2018 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" - -namespace Simd -{ -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - template void FillBgr(uint8_t * dst, size_t stride, size_t width, size_t height, uint8_t blue, uint8_t green, uint8_t red) - { - assert(width >= A); - if (align) - assert(Aligned(dst) && Aligned(stride)); - - size_t size = width * 3; - size_t step = A * 3; - size_t alignedSize = AlignLo(width, A) * 3; - - uint32_t bgrb = uint32_t(blue) | (uint32_t(green) << 8) | (uint32_t(red) << 16) | (uint32_t(blue) << 24); - uint32_t grbg = uint32_t(green) | (uint32_t(red) << 8) | (uint32_t(blue) << 16) | (uint32_t(green) << 24); - uint32_t rbgr = uint32_t(red) | (uint32_t(blue) << 8) | (uint32_t(green) << 16) | (uint32_t(red) << 24); - - __m256i bgrs[3]; - bgrs[0] = _mm256_setr_epi32(bgrb, grbg, rbgr, bgrb, grbg, rbgr, bgrb, grbg); - bgrs[1] = _mm256_setr_epi32(rbgr, bgrb, grbg, rbgr, bgrb, grbg, rbgr, bgrb); - bgrs[2] = _mm256_setr_epi32(grbg, rbgr, bgrb, grbg, rbgr, bgrb, grbg, rbgr); - for (size_t row = 0; row < height; ++row) - { - size_t offset = 0; - for (; offset < alignedSize; offset += step) - { - Store((__m256i*)(dst + offset) + 0, bgrs[0]); - Store((__m256i*)(dst + offset) + 1, bgrs[1]); - Store((__m256i*)(dst + offset) + 2, bgrs[2]); - } - if (offset < size) - { - offset = size - step; - Store((__m256i*)(dst + offset) + 0, bgrs[0]); - Store((__m256i*)(dst + offset) + 1, bgrs[1]); - Store((__m256i*)(dst + offset) + 2, bgrs[2]); - } - dst += stride; - } - } - - void FillBgr(uint8_t * dst, size_t stride, size_t width, size_t height, uint8_t blue, uint8_t green, uint8_t red) - { - if (Aligned(dst) && Aligned(stride)) - FillBgr(dst, stride, width, height, blue, green, red); - else - FillBgr(dst, stride, width, height, blue, green, red); - } - - template void FillBgra(uint8_t * dst, size_t stride, size_t width, size_t height, uint8_t blue, uint8_t green, uint8_t red, uint8_t alpha) - { - assert(width >= F); - if (align) - assert(Aligned(dst) && Aligned(stride)); - - uint32_t bgra32 = uint32_t(blue) | (uint32_t(green) << 8) | (uint32_t(red) << 16) | (uint32_t(alpha) << 24); - size_t alignedWidth = AlignLo(width, 8); - __m256i bgra256 = _mm256_set1_epi32(bgra32); - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += 8) - Store((__m256i*)((uint32_t*)dst + col), bgra256); - if (width != alignedWidth) - 
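// FillBgr above exploits that a 3-byte BGR pattern repeats every
// lcm(3, 32) = 96 bytes, so three precomputed 32-byte vectors tile any row.
// A sketch of building those three vectors (illustrative, assuming AVX2):
#include <immintrin.h>
#include <cstdint>

void MakeBgrPattern(uint8_t b, uint8_t g, uint8_t r, __m256i bgrs[3])
{
    uint8_t period[96];                       // one full 96-byte period
    for (int i = 0; i < 96; i += 3)
    {
        period[i + 0] = b; period[i + 1] = g; period[i + 2] = r;
    }
    for (int i = 0; i < 3; ++i)               // three rotated 32-byte views
        bgrs[i] = _mm256_loadu_si256((const __m256i*)(period + 32 * i));
}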
Store((__m256i*)((uint32_t*)dst + width - 8), bgra256); - dst += stride; - } - } - - void FillBgra(uint8_t * dst, size_t stride, size_t width, size_t height, uint8_t blue, uint8_t green, uint8_t red, uint8_t alpha) - { - if (Aligned(dst) && Aligned(stride)) - FillBgra(dst, stride, width, height, blue, green, red, alpha); - else - FillBgra(dst, stride, width, height, blue, green, red, alpha); - } - - template void FillPixel(uint8_t * dst, size_t stride, size_t width, size_t height, const __m256i & pixel) - { - assert(width >= A); - if (align) - assert(Aligned(dst) && Aligned(stride)); - - size_t fullAlignedWidth = AlignLo(width, QA); - size_t alignedWidth = AlignLo(width, A); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < fullAlignedWidth; col += QA) - { - Store((__m256i*)(dst + col) + 0, pixel); - Store((__m256i*)(dst + col) + 1, pixel); - Store((__m256i*)(dst + col) + 2, pixel); - Store((__m256i*)(dst + col) + 3, pixel); - } - for (; col < alignedWidth; col += A) - Store((__m256i*)(dst + col), pixel); - if (col < width) - Store((__m256i*)(dst + width - A), pixel); - dst += stride; - } - } - - template void FillPixel(uint8_t * dst, size_t stride, size_t width, size_t height, const uint8_t * pixel, size_t pixelSize) - { - if (pixelSize == 3) - FillBgr(dst, stride, width, height, pixel[0], pixel[1], pixel[2]); - else - { - __m256i _pixel; - switch (pixelSize) - { - case 1: - _pixel = _mm256_set1_epi8(*pixel); - break; - case 2: - _pixel = _mm256_set1_epi16(*(uint16_t*)pixel); - break; - case 4: - _pixel = _mm256_set1_epi32(*(uint32_t*)pixel); - break; - default: - assert(0); - } - FillPixel(dst, stride, width*pixelSize, height, _pixel); - } - } - - void FillPixel(uint8_t * dst, size_t stride, size_t width, size_t height, const uint8_t * pixel, size_t pixelSize) - { - if (Aligned(dst) && Aligned(stride)) - FillPixel(dst, stride, width, height, pixel, pixelSize); - else - FillPixel(dst, stride, width, height, pixel, pixelSize); - } - } -#endif// SIMD_AVX2_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx2Float16.cpp b/src/3rd/Simd/Simd/SimdAvx2Float16.cpp deleted file mode 100644 index 51c42d74..00000000 --- a/src/3rd/Simd/Simd/SimdAvx2Float16.cpp +++ /dev/null @@ -1,462 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdExtract.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdArray.h" -#include "Simd/SimdCpu.h" - -namespace Simd -{ -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - template SIMD_INLINE void Float32ToFloat16(const float * src, uint16_t * dst) - { - Sse2::Store((__m128i*)dst, _mm256_cvtps_ph(Avx::Load(src), 0)); - } - - template void Float32ToFloat16(const float * src, size_t size, uint16_t * dst) - { - assert(size >= F); - if (align) - assert(Aligned(src) && Aligned(dst)); - - size_t fullAlignedSize = Simd::AlignLo(size, QF); - size_t partialAlignedSize = Simd::AlignLo(size, F); - - size_t i = 0; - for (; i < fullAlignedSize; i += QF) - { - Float32ToFloat16(src + i + F * 0, dst + i + F * 0); - Float32ToFloat16(src + i + F * 1, dst + i + F * 1); - Float32ToFloat16(src + i + F * 2, dst + i + F * 2); - Float32ToFloat16(src + i + F * 3, dst + i + F * 3); - } - for (; i < partialAlignedSize; i += F) - Float32ToFloat16(src + i, dst + i); - if (partialAlignedSize != size) - Float32ToFloat16(src + size - F, dst + size - F); - } - - void Float32ToFloat16(const float * src, size_t size, uint16_t * dst) - { - if (Aligned(src) && Aligned(dst)) - Float32ToFloat16(src, size, dst); - else - Float32ToFloat16(src, size, dst); - } - - template SIMD_INLINE void Float16ToFloat32(const uint16_t * src, float * dst) - { - Avx::Store(dst, _mm256_cvtph_ps(Sse2::Load((__m128i*)src))); - } - - template void Float16ToFloat32(const uint16_t * src, size_t size, float * dst) - { - assert(size >= F); - if (align) - assert(Aligned(src) && Aligned(dst)); - - size_t fullAlignedSize = Simd::AlignLo(size, QF); - size_t partialAlignedSize = Simd::AlignLo(size, F); - - size_t i = 0; - for (; i < fullAlignedSize; i += QF) - { - Float16ToFloat32(src + i + F * 0, dst + i + F * 0); - Float16ToFloat32(src + i + F * 1, dst + i + F * 1); - Float16ToFloat32(src + i + F * 2, dst + i + F * 2); - Float16ToFloat32(src + i + F * 3, dst + i + F * 3); - } - for (; i < partialAlignedSize; i += F) - Float16ToFloat32(src + i, dst + i); - if (partialAlignedSize != size) - Float16ToFloat32(src + size - F, dst + size - F); - } - - void Float16ToFloat32(const uint16_t * src, size_t size, float * dst) - { - if (Aligned(src) && Aligned(dst)) - Float16ToFloat32(src, size, dst); - else - Float16ToFloat32(src, size, dst); - } - - template SIMD_INLINE void SquaredDifferenceSum16f(const uint16_t * a, const uint16_t * b, size_t offset, __m256 & sum) - { - __m256 _a = _mm256_cvtph_ps(Sse2::Load((__m128i*)(a + offset))); - __m256 _b = _mm256_cvtph_ps(Sse2::Load((__m128i*)(b + offset))); - __m256 _d = _mm256_sub_ps(_a, _b); - sum = _mm256_fmadd_ps(_d, _d, sum); - } - - template SIMD_INLINE void SquaredDifferenceSum16f(const uint16_t * a, const uint16_t * b, size_t size, float * sum) - { - assert(size >= F); - if (align) - assert(Aligned(a) && Aligned(b)); - - size_t partialAlignedSize = AlignLo(size, F); - size_t fullAlignedSize = AlignLo(size, QF); - size_t i = 0; - __m256 sums[4] = { _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps() }; - if (fullAlignedSize) - { - for (; i < fullAlignedSize; i += QF) - { - SquaredDifferenceSum16f(a, b, i + F * 0, sums[0]); - SquaredDifferenceSum16f(a, b, i + F * 1, sums[1]); - SquaredDifferenceSum16f(a, b, i + F * 2, sums[2]); - SquaredDifferenceSum16f(a, b, i + F * 3, sums[3]); - } - sums[0] = _mm256_add_ps(_mm256_add_ps(sums[0], sums[1]), _mm256_add_ps(sums[2], sums[3])); - } - for (; i < partialAlignedSize; i += F) - 
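// Float32ToFloat16/Float16ToFloat32 above lean on the F16C instructions:
// eight floats become eight half-precision values in one _mm256_cvtps_ph
// call and come back with _mm256_cvtph_ps. A minimal round-trip sketch
// (_MM_FROUND_TO_NEAREST_INT is the same 0 immediate used above):
#include <immintrin.h>
#include <cstdint>

void RoundTrip8(const float src[8], uint16_t half[8], float dst[8])
{
    __m128i h = _mm256_cvtps_ph(_mm256_loadu_ps(src), _MM_FROUND_TO_NEAREST_INT);
    _mm_storeu_si128((__m128i*)half, h);       // 8 x 16-bit halves
    _mm256_storeu_ps(dst, _mm256_cvtph_ps(h)); // back to 8 x float
}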
SquaredDifferenceSum16f(a, b, i, sums[0]); - if (partialAlignedSize != size) - { - __m256 mask = RightNotZero32f(size - partialAlignedSize); - __m256 _a = _mm256_cvtph_ps(Sse2::Load((__m128i*)(a + size - F))); - __m256 _b = _mm256_cvtph_ps(Sse2::Load((__m128i*)(b + size - F))); - __m256 _d = _mm256_and_ps(_mm256_sub_ps(_a, _b), mask); - sums[0] = _mm256_fmadd_ps(_d, _d, sums[0]); - } - *sum = Avx::ExtractSum(sums[0]); - } - - void SquaredDifferenceSum16f(const uint16_t * a, const uint16_t * b, size_t size, float * sum) - { - if (Aligned(a) && Aligned(b)) - SquaredDifferenceSum16f(a, b, size, sum); - else - SquaredDifferenceSum16f(a, b, size, sum); - } - - template void CosineDistance16f(const uint16_t * a, const uint16_t * b, size_t size, float * distance) - { - if (align) - assert(Aligned(a) && Aligned(b)); - - size_t partialAlignedSize = AlignLo(size, F); - size_t fullAlignedSize = AlignLo(size, DF); - size_t i = 0; - __m256 _aa[2] = { _mm256_setzero_ps(), _mm256_setzero_ps() }; - __m256 _ab[2] = { _mm256_setzero_ps(), _mm256_setzero_ps() }; - __m256 _bb[2] = { _mm256_setzero_ps(), _mm256_setzero_ps() }; - if (fullAlignedSize) - { - for (; i < fullAlignedSize; i += DF) - { - __m256 a0 = _mm256_cvtph_ps(Sse2::Load((__m128i*)(a + i) + 0)); - __m256 b0 = _mm256_cvtph_ps(Sse2::Load((__m128i*)(b + i) + 0)); - _aa[0] = _mm256_fmadd_ps(a0, a0, _aa[0]); - _ab[0] = _mm256_fmadd_ps(a0, b0, _ab[0]); - _bb[0] = _mm256_fmadd_ps(b0, b0, _bb[0]); - __m256 a1 = _mm256_cvtph_ps(Sse2::Load((__m128i*)(a + i) + 1)); - __m256 b1 = _mm256_cvtph_ps(Sse2::Load((__m128i*)(b + i) + 1)); - _aa[1] = _mm256_fmadd_ps(a1, a1, _aa[1]); - _ab[1] = _mm256_fmadd_ps(a1, b1, _ab[1]); - _bb[1] = _mm256_fmadd_ps(b1, b1, _bb[1]); - } - _aa[0] = _mm256_add_ps(_aa[0], _aa[1]); - _ab[0] = _mm256_add_ps(_ab[0], _ab[1]); - _bb[0] = _mm256_add_ps(_bb[0], _bb[1]); - } - for (; i < partialAlignedSize; i += F) - { - __m256 a0 = _mm256_cvtph_ps(Sse2::Load((__m128i*)(a + i) + 0)); - __m256 b0 = _mm256_cvtph_ps(Sse2::Load((__m128i*)(b + i) + 0)); - _aa[0] = _mm256_fmadd_ps(a0, a0, _aa[0]); - _ab[0] = _mm256_fmadd_ps(a0, b0, _ab[0]); - _bb[0] = _mm256_fmadd_ps(b0, b0, _bb[0]); - } - if (partialAlignedSize != size) - { - __m256 mask = RightNotZero32f(size - partialAlignedSize); - __m256 a0 = _mm256_and_ps(mask, _mm256_cvtph_ps(Sse2::Load((__m128i*)(a + size - F)))); - __m256 b0 = _mm256_and_ps(mask, _mm256_cvtph_ps(Sse2::Load((__m128i*)(b + size - F)))); - _aa[0] = _mm256_fmadd_ps(a0, a0, _aa[0]); - _ab[0] = _mm256_fmadd_ps(a0, b0, _ab[0]); - _bb[0] = _mm256_fmadd_ps(b0, b0, _bb[0]); - } - float aa = Avx::ExtractSum(_aa[0]), ab = Avx::ExtractSum(_ab[0]), bb = Avx::ExtractSum(_bb[0]); - *distance = 1.0f - ab / ::sqrt(aa*bb); - } - - void CosineDistance16f(const uint16_t * a, const uint16_t * b, size_t size, float * distance) - { - if (Aligned(a) && Aligned(b)) - CosineDistance16f(a, b, size, distance); - else - CosineDistance16f(a, b, size, distance); - } - - static void Squares(size_t M, size_t K, const uint16_t * const * A, float * squares) - { - size_t M4 = AlignLo(M, 4); - size_t KF = AlignLo(K, F); - __m256 mask = RightNotZero32f(K - KF); - size_t i = 0; - for (; i < M4; i += 4) - { - __m256 sums[4] = { _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps() }; - for (size_t k = 0; k < KF; k += F) - { - __m256 a0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(A[i + 0] + k))); - __m256 a1 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(A[i + 1] + k))); - __m256 a2 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(A[i + 
2] + k))); - __m256 a3 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(A[i + 3] + k))); - sums[0] = _mm256_fmadd_ps(a0, a0, sums[0]); - sums[1] = _mm256_fmadd_ps(a1, a1, sums[1]); - sums[2] = _mm256_fmadd_ps(a2, a2, sums[2]); - sums[3] = _mm256_fmadd_ps(a3, a3, sums[3]); - } - if (KF < K) - { - size_t k = K - F; - __m256 a0 = _mm256_and_ps(mask, _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(A[i + 0] + k)))); - __m256 a1 = _mm256_and_ps(mask, _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(A[i + 1] + k)))); - __m256 a2 = _mm256_and_ps(mask, _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(A[i + 2] + k)))); - __m256 a3 = _mm256_and_ps(mask, _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(A[i + 3] + k)))); - sums[0] = _mm256_fmadd_ps(a0, a0, sums[0]); - sums[1] = _mm256_fmadd_ps(a1, a1, sums[1]); - sums[2] = _mm256_fmadd_ps(a2, a2, sums[2]); - sums[3] = _mm256_fmadd_ps(a3, a3, sums[3]); - } - _mm_storeu_ps(squares + i, Extract4Sums(sums)); - } - for (; i < M; i += 1) - { - __m256 sum = _mm256_setzero_ps(); - for (size_t k = 0; k < KF; k += F) - { - __m256 a = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(A[i] + k))); - sum = _mm256_fmadd_ps(a, a, sum); - } - if (KF < K) - { - size_t k = K - F; - __m256 a = _mm256_and_ps(mask, _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(A[i] + k)))); - sum = _mm256_fmadd_ps(a, a, sum); - } - squares[i] = Avx::ExtractSum(sum); - } - } - - SIMD_INLINE __m256 Tail(size_t tail) - { - const int32_t mask[DF] = { 0, 0, 0, 0, 0, 0, 0, 0 , -1, -1, -1, -1, -1, -1, -1, -1 }; - return _mm256_loadu_ps((float*)(mask + tail)); - } - - static void MicroCosineDistances3x4(size_t K, const uint16_t * const * A, const uint16_t * const * B, const float * aa, const float * bb, float * distances, size_t stride) - { - size_t K8 = K & (~7); - __m256 c00 = _mm256_setzero_ps(); - __m256 c01 = _mm256_setzero_ps(); - __m256 c02 = _mm256_setzero_ps(); - __m256 c03 = _mm256_setzero_ps(); - __m256 c10 = _mm256_setzero_ps(); - __m256 c11 = _mm256_setzero_ps(); - __m256 c12 = _mm256_setzero_ps(); - __m256 c13 = _mm256_setzero_ps(); - __m256 c20 = _mm256_setzero_ps(); - __m256 c21 = _mm256_setzero_ps(); - __m256 c22 = _mm256_setzero_ps(); - __m256 c23 = _mm256_setzero_ps(); - __m256 a0, a1, a2, b0; - for (size_t k = 0; k < K8; k += 8) - { - a0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(A[0] + k))); - a1 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(A[1] + k))); - a2 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(A[2] + k))); - b0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(B[0] + k))); - c00 = _mm256_fmadd_ps(a0, b0, c00); - c10 = _mm256_fmadd_ps(a1, b0, c10); - c20 = _mm256_fmadd_ps(a2, b0, c20); - b0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(B[1] + k))); - c01 = _mm256_fmadd_ps(a0, b0, c01); - c11 = _mm256_fmadd_ps(a1, b0, c11); - c21 = _mm256_fmadd_ps(a2, b0, c21); - b0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(B[2] + k))); - c02 = _mm256_fmadd_ps(a0, b0, c02); - c12 = _mm256_fmadd_ps(a1, b0, c12); - c22 = _mm256_fmadd_ps(a2, b0, c22); - b0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(B[3] + k))); - c03 = _mm256_fmadd_ps(a0, b0, c03); - c13 = _mm256_fmadd_ps(a1, b0, c13); - c23 = _mm256_fmadd_ps(a2, b0, c23); - } - if (K8 < K) - { - size_t k = K - 8; - __m256 tail = Tail(K - K8); - a0 = _mm256_and_ps(tail, _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(A[0] + k)))); - a1 = _mm256_and_ps(tail, _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(A[1] + k)))); - a2 = _mm256_and_ps(tail, _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(A[2] + k)))); - b0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(B[0] + k))); - c00 = 
_mm256_fmadd_ps(a0, b0, c00); - c10 = _mm256_fmadd_ps(a1, b0, c10); - c20 = _mm256_fmadd_ps(a2, b0, c20); - b0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(B[1] + k))); - c01 = _mm256_fmadd_ps(a0, b0, c01); - c11 = _mm256_fmadd_ps(a1, b0, c11); - c21 = _mm256_fmadd_ps(a2, b0, c21); - b0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(B[2] + k))); - c02 = _mm256_fmadd_ps(a0, b0, c02); - c12 = _mm256_fmadd_ps(a1, b0, c12); - c22 = _mm256_fmadd_ps(a2, b0, c22); - b0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(B[3] + k))); - c03 = _mm256_fmadd_ps(a0, b0, c03); - c13 = _mm256_fmadd_ps(a1, b0, c13); - c23 = _mm256_fmadd_ps(a2, b0, c23); - } - __m128 _bb = _mm_loadu_ps(bb); - __m128 _1 = _mm_set1_ps(1.0f); - _mm_storeu_ps(distances + 0 * stride, _mm_fnmadd_ps(_mm_rsqrt_ps(_mm_mul_ps(_bb, _mm_set1_ps(aa[0]))), Extract4Sums(c00, c01, c02, c03), _1)); - _mm_storeu_ps(distances + 1 * stride, _mm_fnmadd_ps(_mm_rsqrt_ps(_mm_mul_ps(_bb, _mm_set1_ps(aa[1]))), Extract4Sums(c10, c11, c12, c13), _1)); - _mm_storeu_ps(distances + 2 * stride, _mm_fnmadd_ps(_mm_rsqrt_ps(_mm_mul_ps(_bb, _mm_set1_ps(aa[2]))), Extract4Sums(c20, c21, c22, c23), _1)); - } - - static void MicroCosineDistances3x1(size_t K, const uint16_t * const * A, const uint16_t * const * B, const float * aa, const float * bb, float * distances, size_t stride) - { - size_t K8 = K & (~7); - __m256 c00 = _mm256_setzero_ps(); - __m256 c10 = _mm256_setzero_ps(); - __m256 c20 = _mm256_setzero_ps(); - __m256 a0, b0; - for (size_t k = 0; k < K8; k += 8) - { - b0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(B[0] + k))); - a0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(A[0] + k))); - c00 = _mm256_fmadd_ps(a0, b0, c00); - a0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(A[1] + k))); - c10 = _mm256_fmadd_ps(a0, b0, c10); - a0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(A[2] + k))); - c20 = _mm256_fmadd_ps(a0, b0, c20); - } - if (K8 < K) - { - size_t k = K - 8; - __m256 tail = Tail(K - K8); - b0 = _mm256_and_ps(tail, _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(B[0] + k)))); - a0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(A[0] + k))); - c00 = _mm256_fmadd_ps(a0, b0, c00); - a0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(A[1] + k))); - c10 = _mm256_fmadd_ps(a0, b0, c10); - a0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(A[2] + k))); - c20 = _mm256_fmadd_ps(a0, b0, c20); - } - distances[0 * stride] = 1.0f - Avx::ExtractSum(c00) / sqrt(bb[0] * aa[0]); - distances[1 * stride] = 1.0f - Avx::ExtractSum(c10) / sqrt(bb[0] * aa[1]); - distances[2 * stride] = 1.0f - Avx::ExtractSum(c20) / sqrt(bb[0] * aa[2]); - } - - static void MicroCosineDistances1x4(size_t K, const uint16_t * const * A, const uint16_t * const * B, const float * aa, const float * bb, float * distances, size_t stride) - { - size_t K8 = K & (~7); - __m256 c00 = _mm256_setzero_ps(); - __m256 c01 = _mm256_setzero_ps(); - __m256 c02 = _mm256_setzero_ps(); - __m256 c03 = _mm256_setzero_ps(); - __m256 a0, b0; - for (size_t k = 0; k < K8; k += 8) - { - a0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(A[0] + k))); - b0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(B[0] + k))); - c00 = _mm256_fmadd_ps(a0, b0, c00); - b0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(B[1] + k))); - c01 = _mm256_fmadd_ps(a0, b0, c01); - b0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(B[2] + k))); - c02 = _mm256_fmadd_ps(a0, b0, c02); - b0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(B[3] + k))); - c03 = _mm256_fmadd_ps(a0, b0, c03); - } - if (K8 < K) - { - size_t k = K - 8; - __m256 tail = Tail(K - K8); - a0 = _mm256_and_ps(tail, 
_mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(A[0] + k)))); - b0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(B[0] + k))); - c00 = _mm256_fmadd_ps(a0, b0, c00); - b0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(B[1] + k))); - c01 = _mm256_fmadd_ps(a0, b0, c01); - b0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(B[2] + k))); - c02 = _mm256_fmadd_ps(a0, b0, c02); - b0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(B[3] + k))); - c03 = _mm256_fmadd_ps(a0, b0, c03); - } - __m128 _bb = _mm_loadu_ps(bb); - __m128 _1 = _mm_set1_ps(1.0f); - _mm_storeu_ps(distances + 0 * stride, _mm_fnmadd_ps(_mm_rsqrt_ps(_mm_mul_ps(_bb, _mm_set1_ps(aa[0]))), Extract4Sums(c00, c01, c02, c03), _1)); - } - - static void MacroCosineDistances(size_t M, size_t N, size_t K, const uint16_t * const * A, const uint16_t * const * B, const float * aa, const float * bb, float * distances, size_t stride) - { - size_t M3 = AlignLoAny(M, 3); - size_t N4 = AlignLo(N, 4); - size_t i = 0; - for (; i < M3; i += 3) - { - size_t j = 0; - for (; j < N4; j += 4) - MicroCosineDistances3x4(K, A + i, B + j, aa + i, bb + j, distances + j, stride); - for (; j < N; j += 1) - MicroCosineDistances3x1(K, A + i, B + j, aa + i, bb + j, distances + j, stride); - distances += 3 * stride; - } - for (; i < M; i++) - { - size_t j = 0; - for (; j < N4; j += 4) - MicroCosineDistances1x4(K, A + i, B + j, aa + i, bb + j, distances + j, stride); - for (; j < N; j += 1) - CosineDistance16f(A[i], B[j], K, distances + j); - distances += 1 * stride; - } - } - - void CosineDistancesMxNa16f(size_t M, size_t N, size_t K, const uint16_t * const * A, const uint16_t * const * B, float * distances) - { - const size_t L2 = Base::AlgCacheL2(); - size_t mN = AlignLoAny(L2 / 2 / K, 4); - size_t mM = AlignLoAny(L2 / 2 / K, 3); - Array32f aa(M), bb(N); - for (size_t i = 0; i < M; i += mM) - { - size_t dM = Simd::Min(M, i + mM) - i; - Squares(dM, K, A + i, aa.data + i); - for (size_t j = 0; j < N; j += mN) - { - size_t dN = Simd::Min(N, j + mN) - j; - if(i == 0) - Squares(dN, K, B + j, bb.data + j); - MacroCosineDistances(dM, dN, K, A + i, B + j, aa.data + i, bb.data + j, distances + i * N + j, N); - } - } - } - } -#endif// SIMD_AVX2_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx2Float32.cpp b/src/3rd/Simd/Simd/SimdAvx2Float32.cpp deleted file mode 100644 index e83660f5..00000000 --- a/src/3rd/Simd/Simd/SimdAvx2Float32.cpp +++ /dev/null @@ -1,167 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdExtract.h" - -namespace Simd -{ -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - template SIMD_INLINE __m256i Float32ToUint8(const float * src, const __m256 & lower, const __m256 & upper, const __m256 & boost) - { - return _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_sub_ps(_mm256_min_ps(_mm256_max_ps(Avx::Load(src), lower), upper), lower), boost)); - } - - template SIMD_INLINE void Float32ToUint8(const float * src, const __m256 & lower, const __m256 & upper, const __m256 & boost, uint8_t * dst) - { - __m256i d0 = Float32ToUint8(src + F * 0, lower, upper, boost); - __m256i d1 = Float32ToUint8(src + F * 1, lower, upper, boost); - __m256i d2 = Float32ToUint8(src + F * 2, lower, upper, boost); - __m256i d3 = Float32ToUint8(src + F * 3, lower, upper, boost); - Store((__m256i*)dst, PackI16ToU8(PackU32ToI16(d0, d1), PackU32ToI16(d2, d3))); - } - - template void Float32ToUint8(const float * src, size_t size, const float * lower, const float * upper, uint8_t * dst) - { - assert(size >= A); - if (align) - assert(Aligned(src) && Aligned(dst)); - - __m256 _lower = _mm256_set1_ps(lower[0]); - __m256 _upper = _mm256_set1_ps(upper[0]); - __m256 boost = _mm256_set1_ps(255.0f / (upper[0] - lower[0])); - - size_t alignedSize = AlignLo(size, A); - for (size_t i = 0; i < alignedSize; i += A) - Float32ToUint8(src + i, _lower, _upper, boost, dst + i); - if (alignedSize != size) - Float32ToUint8(src + size - A, _lower, _upper, boost, dst + size - A); - } - - void Float32ToUint8(const float * src, size_t size, const float * lower, const float * upper, uint8_t * dst) - { - if (Aligned(src) && Aligned(dst)) - Float32ToUint8(src, size, lower, upper, dst); - else - Float32ToUint8(src, size, lower, upper, dst); - } - - SIMD_INLINE __m256 Uint8ToFloat32(const __m128i & value, const __m256 & lower, const __m256 & boost) - { - return _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(value)), boost), lower); - } - - template SIMD_INLINE void Uint8ToFloat32(const uint8_t * src, const __m256 & lower, const __m256 & boost, float * dst) - { - __m128i _src = Sse2::Load((__m128i*)src); - Avx::Store(dst + 0, Uint8ToFloat32(_src, lower, boost)); - Avx::Store(dst + F, Uint8ToFloat32(_mm_srli_si128(_src, 8), lower, boost)); - } - - template void Uint8ToFloat32(const uint8_t * src, size_t size, const float * lower, const float * upper, float * dst) - { - assert(size >= HA); - if (align) - assert(Aligned(src) && Aligned(dst)); - - __m256 _lower = _mm256_set1_ps(lower[0]); - __m256 boost = _mm256_set1_ps((upper[0] - lower[0]) / 255.0f); - - size_t alignedSize = AlignLo(size, HA); - for (size_t i = 0; i < alignedSize; i += HA) - Uint8ToFloat32(src + i, _lower, boost, dst + i); - if (alignedSize != size) - Uint8ToFloat32(src + size - HA, _lower, boost, dst + size - HA); - } - - void Uint8ToFloat32(const uint8_t * src, size_t size, const float * lower, const float * upper, float * dst) - { - if (Aligned(src) && Aligned(dst)) - Uint8ToFloat32(src, size, lower, upper, dst); - else - Uint8ToFloat32(src, size, lower, upper, dst); - } - - template void CosineDistance32f(const float * a, const float * b, size_t size, float * distance) - { - if (align) - assert(Aligned(a) && 
Aligned(b)); - - size_t partialAlignedSize = AlignLo(size, F); - size_t fullAlignedSize = AlignLo(size, DF); - size_t i = 0; - __m256 _aa[2] = { _mm256_setzero_ps(), _mm256_setzero_ps() }; - __m256 _ab[2] = { _mm256_setzero_ps(), _mm256_setzero_ps() }; - __m256 _bb[2] = { _mm256_setzero_ps(), _mm256_setzero_ps() }; - if (fullAlignedSize) - { - for (; i < fullAlignedSize; i += DF) - { - __m256 a0 = Load(a + i + 0 * F); - __m256 b0 = Load(b + i + 0 * F); - _aa[0] = _mm256_fmadd_ps(a0, a0, _aa[0]); - _ab[0] = _mm256_fmadd_ps(a0, b0, _ab[0]); - _bb[0] = _mm256_fmadd_ps(b0, b0, _bb[0]); - __m256 a1 = Load(a + i + 1 * F); - __m256 b1 = Load(b + i + 1 * F); - _aa[1] = _mm256_fmadd_ps(a1, a1, _aa[1]); - _ab[1] = _mm256_fmadd_ps(a1, b1, _ab[1]); - _bb[1] = _mm256_fmadd_ps(b1, b1, _bb[1]); - } - _aa[0] = _mm256_add_ps(_aa[0], _aa[1]); - _ab[0] = _mm256_add_ps(_ab[0], _ab[1]); - _bb[0] = _mm256_add_ps(_bb[0], _bb[1]); - } - for (; i < partialAlignedSize; i += F) - { - __m256 a0 = Load(a + i); - __m256 b0 = Load(b + i); - _aa[0] = _mm256_fmadd_ps(a0, a0, _aa[0]); - _ab[0] = _mm256_fmadd_ps(a0, b0, _ab[0]); - _bb[0] = _mm256_fmadd_ps(b0, b0, _bb[0]); - } - float aa = Avx::ExtractSum(_aa[0]), ab = Avx::ExtractSum(_ab[0]), bb = Avx::ExtractSum(_bb[0]); - for (; i < size; ++i) - { - float _a = a[i]; - float _b = b[i]; - aa += _a * _a; - ab += _a * _b; - bb += _b * _b; - } - *distance = 1.0f - ab / ::sqrt(aa*bb); - } - - void CosineDistance32f(const float * a, const float * b, size_t size, float * distance) - { - if (Aligned(a) && Aligned(b)) - CosineDistance32f(a, b, size, distance); - else - CosineDistance32f(a, b, size, distance); - } - } -#endif// SIMD_AVX2_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx2GaussianBlur3x3.cpp b/src/3rd/Simd/Simd/SimdAvx2GaussianBlur3x3.cpp deleted file mode 100644 index 4fb24faf..00000000 --- a/src/3rd/Simd/Simd/SimdAvx2GaussianBlur3x3.cpp +++ /dev/null @@ -1,163 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
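Both the float16 path (SimdAvx2Float16.cpp) and the float32 path (SimdAvx2Float32.cpp) deleted above compute the same quantity: three FMA-accumulated reductions a·a, a·b and b·b (the float16 kernels mask the tail, the float32 kernel finishes it scalar), combined as 1 - a·b / sqrt((a·a)(b·b)). A scalar reference for that formula, useful as a sketch for checking the vector kernels (the function name is ours, not the library's):

#include <cmath>
#include <cstddef>

// Scalar reference for what the vectorized CosineDistance kernels above compute:
// distance = 1 - dot(a,b) / sqrt(dot(a,a) * dot(b,b)).
float CosineDistance32fRef(const float * a, const float * b, size_t size)
{
    float aa = 0.0f, ab = 0.0f, bb = 0.0f;
    for (size_t i = 0; i < size; ++i)
    {
        aa += a[i] * a[i]; // squared norm of a
        ab += a[i] * b[i]; // dot product
        bb += b[i] * b[i]; // squared norm of b
    }
    return 1.0f - ab / std::sqrt(aa * bb);
}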
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" - -namespace Simd -{ -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - namespace - { - struct Buffer - { - Buffer(size_t width) - { - _p = Allocate(sizeof(uint16_t) * 3 * width); - src0 = (uint16_t*)_p; - src1 = src0 + width; - src2 = src1 + width; - } - - ~Buffer() - { - Free(_p); - } - - uint16_t * src0; - uint16_t * src1; - uint16_t * src2; - private: - void * _p; - }; - } - - SIMD_INLINE __m256i DivideBy16(__m256i value) - { - return _mm256_srli_epi16(_mm256_add_epi16(value, K16_0008), 4); - } - - const __m256i K8_01_02 = SIMD_MM256_SET2_EPI8(0x01, 0x02); - - template SIMD_INLINE __m256i BinomialSumUnpackedU8(__m256i a[3]) - { - return _mm256_add_epi16(_mm256_maddubs_epi16(UnpackU8(a[0], a[1]), K8_01_02), UnpackU8(a[2])); - } - - template SIMD_INLINE void BlurCol(__m256i a[3], uint16_t * b) - { - Store((__m256i*)b + 0, BinomialSumUnpackedU8<0>(a)); - Store((__m256i*)b + 1, BinomialSumUnpackedU8<1>(a)); - } - - template SIMD_INLINE __m256i BlurRow16(const Buffer & buffer, size_t offset) - { - return DivideBy16(BinomialSum16( - Load((__m256i*)(buffer.src0 + offset)), - Load((__m256i*)(buffer.src1 + offset)), - Load((__m256i*)(buffer.src2 + offset)))); - } - - template SIMD_INLINE __m256i BlurRow(const Buffer & buffer, size_t offset) - { - return _mm256_packus_epi16(BlurRow16(buffer, offset), BlurRow16(buffer, offset + HA)); - } - - template void GaussianBlur3x3( - const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride) - { - assert(step*(width - 1) >= A); - if (align) - assert(Aligned(src) && Aligned(srcStride) && Aligned(step*width) && Aligned(dst) && Aligned(dstStride)); - - __m256i a[3]; - - size_t size = step*width; - size_t bodySize = Simd::AlignHi(size, A) - A; - - Buffer buffer(Simd::AlignHi(size, A)); - - LoadNose3(src + 0, a); - BlurCol(a, buffer.src0 + 0); - for (size_t col = A; col < bodySize; col += A) - { - LoadBody3(src + col, a); - BlurCol(a, buffer.src0 + col); - } - LoadTail3(src + size - A, a); - BlurCol(a, buffer.src0 + bodySize); - - memcpy(buffer.src1, buffer.src0, sizeof(uint16_t)*(bodySize + A)); - - for (size_t row = 0; row < height; ++row, dst += dstStride) - { - const uint8_t *src2 = src + srcStride*(row + 1); - if (row >= height - 2) - src2 = src + srcStride*(height - 1); - - LoadNose3(src2 + 0, a); - BlurCol(a, buffer.src2 + 0); - for (size_t col = A; col < bodySize; col += A) - { - LoadBody3(src2 + col, a); - BlurCol(a, buffer.src2 + col); - } - LoadTail3(src2 + size - A, a); - BlurCol(a, buffer.src2 + bodySize); - - for (size_t col = 0; col < bodySize; col += A) - Store((__m256i*)(dst + col), BlurRow(buffer, col)); - Store((__m256i*)(dst + size - A), BlurRow(buffer, bodySize)); - - Swap(buffer.src0, buffer.src2); - Swap(buffer.src0, buffer.src1); - } - } - - template void GaussianBlur3x3(const uint8_t * src, size_t srcStride, size_t width, size_t height, - size_t channelCount, uint8_t * dst, size_t dstStride) - { - assert(channelCount > 0 && channelCount <= 4); - - switch (channelCount) - { - case 1: GaussianBlur3x3(src, srcStride, width, height, dst, dstStride); break; - case 2: GaussianBlur3x3(src, srcStride, width, height, dst, dstStride); break; - case 3: GaussianBlur3x3(src, srcStride, width, height, dst, dstStride); break; - case 4: GaussianBlur3x3(src, srcStride, width, height, dst, dstStride); break; - } - } - - void GaussianBlur3x3(const uint8_t * src, size_t srcStride, size_t width, size_t height, - size_t channelCount, uint8_t * dst, size_t 
dstStride) - { - if (Aligned(src) && Aligned(srcStride) && Aligned(channelCount*width) && Aligned(dst) && Aligned(dstStride)) - GaussianBlur3x3(src, srcStride, width, height, channelCount, dst, dstStride); - else - GaussianBlur3x3(src, srcStride, width, height, channelCount, dst, dstStride); - } - } -#endif// SIMD_AVX2_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx2Gemm32f.cpp b/src/3rd/Simd/Simd/SimdAvx2Gemm32f.cpp deleted file mode 100644 index 23251c9e..00000000 --- a/src/3rd/Simd/Simd/SimdAvx2Gemm32f.cpp +++ /dev/null @@ -1,994 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdStore.h" -#include "Simd/SimdExtract.h" -#include "Simd/SimdGemm.h" -#include "Simd/SimdCpu.h" - -namespace Simd -{ -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - SIMD_INLINE void AddProduct(float * ptr, __m256 value, __m256 alpha) - { - _mm256_storeu_ps(ptr, _mm256_fmadd_ps(value, alpha, _mm256_loadu_ps(ptr))); - } - - SIMD_INLINE void AddProduct(float * ptr, __m256 value, __m256 alpha, size_t tail) - { - if (tail == F) - AddProduct(ptr, value, alpha); - else - { - float tmp[F]; - _mm256_storeu_ps(tmp, _mm256_add_ps(_mm256_mul_ps(value, alpha), _mm256_loadu_ps(ptr))); - for (size_t i = 0; i < tail; ++i) - ptr[i] = tmp[i]; - } - } - - SIMD_NOINLINE void GemmKernel4x24nn(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, size_t tail) - { - __m256 c00 = _mm256_setzero_ps(); - __m256 c10 = _mm256_setzero_ps(); - __m256 c20 = _mm256_setzero_ps(); - __m256 c30 = _mm256_setzero_ps(); - __m256 c01 = _mm256_setzero_ps(); - __m256 c11 = _mm256_setzero_ps(); - __m256 c21 = _mm256_setzero_ps(); - __m256 c31 = _mm256_setzero_ps(); - __m256 c02 = _mm256_setzero_ps(); - __m256 c12 = _mm256_setzero_ps(); - __m256 c22 = _mm256_setzero_ps(); - __m256 c32 = _mm256_setzero_ps(); - const size_t oa0 = lda * 0; - const size_t oa1 = lda * 1; - const size_t oa2 = lda * 2; - const size_t oa3 = lda * 3; - const size_t sa = lda == 1 ? 
4 : 1; - const size_t ob0 = ldb * 0; - const size_t ob1 = ldb * 1; - const size_t ob2 = ldb * 2; - __m256 b0, b1, b2, a0; - for (size_t k = 0; k < K; ++k) - { - //_mm_prefetch((char*)B + ob0 + 384, _MM_HINT_T0); - //_mm_prefetch((char*)B + ob1 + 384, _MM_HINT_T0); - //_mm_prefetch((char*)B + ob2 + 384, _MM_HINT_T0); - b0 = _mm256_loadu_ps(B + ob0); - b1 = _mm256_loadu_ps(B + ob1); - b2 = _mm256_loadu_ps(B + ob2); - a0 = _mm256_set1_ps(A[oa0]); - c00 = _mm256_fmadd_ps(a0, b0, c00); - c01 = _mm256_fmadd_ps(a0, b1, c01); - c02 = _mm256_fmadd_ps(a0, b2, c02); - a0 = _mm256_set1_ps(A[oa1]); - c10 = _mm256_fmadd_ps(a0, b0, c10); - c11 = _mm256_fmadd_ps(a0, b1, c11); - c12 = _mm256_fmadd_ps(a0, b2, c12); - a0 = _mm256_set1_ps(A[oa2]); - c20 = _mm256_fmadd_ps(a0, b0, c20); - c21 = _mm256_fmadd_ps(a0, b1, c21); - c22 = _mm256_fmadd_ps(a0, b2, c22); - a0 = _mm256_set1_ps(A[oa3]); - c30 = _mm256_fmadd_ps(a0, b0, c30); - c31 = _mm256_fmadd_ps(a0, b1, c31); - c32 = _mm256_fmadd_ps(a0, b2, c32); - B += sb; - A += sa; - } - __m256 _alpha = _mm256_set1_ps(alpha); - AddProduct(C + 0 * F, _alpha, c00); - AddProduct(C + 1 * F, _alpha, c01); - AddProduct(C + 2 * F, _alpha, c02, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c10); - AddProduct(C + 1 * F, _alpha, c11); - AddProduct(C + 2 * F, _alpha, c12, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c20); - AddProduct(C + 1 * F, _alpha, c21); - AddProduct(C + 2 * F, _alpha, c22, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c30); - AddProduct(C + 1 * F, _alpha, c31); - AddProduct(C + 2 * F, _alpha, c32, tail); - } - - SIMD_NOINLINE void GemmKernel4x16nn(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, size_t tail) - { - __m256 c00 = _mm256_setzero_ps(); - __m256 c10 = _mm256_setzero_ps(); - __m256 c20 = _mm256_setzero_ps(); - __m256 c30 = _mm256_setzero_ps(); - __m256 c01 = _mm256_setzero_ps(); - __m256 c11 = _mm256_setzero_ps(); - __m256 c21 = _mm256_setzero_ps(); - __m256 c31 = _mm256_setzero_ps(); - const size_t oa0 = lda * 0; - const size_t oa1 = lda * 1; - const size_t oa2 = lda * 2; - const size_t oa3 = lda * 3; - const size_t sa = lda == 1 ? 
4 : 1; - const size_t ob0 = ldb * 0; - const size_t ob1 = ldb * 1; - __m256 b0, b1, a0; - for (size_t k = 0; k < K; k++) - { - b0 = _mm256_loadu_ps(B + ob0); - b1 = _mm256_loadu_ps(B + ob1); - a0 = _mm256_set1_ps(A[oa0]); - c00 = _mm256_fmadd_ps(a0, b0, c00); - c01 = _mm256_fmadd_ps(a0, b1, c01); - a0 = _mm256_set1_ps(A[oa1]); - c10 = _mm256_fmadd_ps(a0, b0, c10); - c11 = _mm256_fmadd_ps(a0, b1, c11); - a0 = _mm256_set1_ps(A[oa2]); - c20 = _mm256_fmadd_ps(a0, b0, c20); - c21 = _mm256_fmadd_ps(a0, b1, c21); - a0 = _mm256_set1_ps(A[oa3]); - c30 = _mm256_fmadd_ps(a0, b0, c30); - c31 = _mm256_fmadd_ps(a0, b1, c31); - B += sb; - A += sa; - } - __m256 _alpha = _mm256_set1_ps(alpha); - AddProduct(C + 0 * F, _alpha, c00); - AddProduct(C + 1 * F, _alpha, c01, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c10); - AddProduct(C + 1 * F, _alpha, c11, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c20); - AddProduct(C + 1 * F, _alpha, c21, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c30); - AddProduct(C + 1 * F, _alpha, c31, tail); - } - - SIMD_NOINLINE void GemmKernel4x8nn(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, size_t tail) - { - __m256 c0 = _mm256_setzero_ps(); - __m256 c1 = _mm256_setzero_ps(); - __m256 c2 = _mm256_setzero_ps(); - __m256 c3 = _mm256_setzero_ps(); - const size_t oa0 = lda * 0; - const size_t oa1 = lda * 1; - const size_t oa2 = lda * 2; - const size_t oa3 = lda * 3; - const size_t sa = lda == 1 ? 4 : 1; - const size_t ob0 = ldb * 0; - __m256 b0; - for (size_t k = 0; k < K; k++) - { - b0 = _mm256_loadu_ps(B + ob0); - c0 = _mm256_fmadd_ps(b0, _mm256_set1_ps(A[oa0]), c0); - c1 = _mm256_fmadd_ps(b0, _mm256_set1_ps(A[oa1]), c1); - c2 = _mm256_fmadd_ps(b0, _mm256_set1_ps(A[oa2]), c2); - c3 = _mm256_fmadd_ps(b0, _mm256_set1_ps(A[oa3]), c3); - B += sb; - A += sa; - } - __m256 _alpha = _mm256_set1_ps(alpha); - AddProduct(C + 0 * ldc, _alpha, c0, tail); - AddProduct(C + 1 * ldc, _alpha, c1, tail); - AddProduct(C + 2 * ldc, _alpha, c2, tail); - AddProduct(C + 3 * ldc, _alpha, c3, tail); - } - - SIMD_NOINLINE void GemmKernel6x16nn(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, size_t tail) - { - __m256 c00 = _mm256_setzero_ps(); - __m256 c10 = _mm256_setzero_ps(); - __m256 c20 = _mm256_setzero_ps(); - __m256 c30 = _mm256_setzero_ps(); - __m256 c40 = _mm256_setzero_ps(); - __m256 c50 = _mm256_setzero_ps(); - __m256 c01 = _mm256_setzero_ps(); - __m256 c11 = _mm256_setzero_ps(); - __m256 c21 = _mm256_setzero_ps(); - __m256 c31 = _mm256_setzero_ps(); - __m256 c41 = _mm256_setzero_ps(); - __m256 c51 = _mm256_setzero_ps(); - const size_t oa0 = lda * 0; - const size_t oa1 = lda * 1; - const size_t oa2 = lda * 2; - const size_t oa3 = lda * 3; - const size_t oa4 = lda * 4; - const size_t oa5 = lda * 5; - const size_t sa = lda == 1 ? 
6 : 1; - const size_t ob0 = ldb * 0; - const size_t ob1 = ldb * 1; - __m256 b0, b1, a0, a1; - for (size_t k = 0; k < K; k++) - { - //_mm_prefetch((char*)B + ob0 + 384, _MM_HINT_T0); - //_mm_prefetch((char*)B + ob1 + 384, _MM_HINT_T0); - b0 = _mm256_loadu_ps(B + ob0); - b1 = _mm256_loadu_ps(B + ob1); - a0 = _mm256_set1_ps(A[oa0]); - a1 = _mm256_set1_ps(A[oa1]); - c00 = _mm256_fmadd_ps(a0, b0, c00); - c01 = _mm256_fmadd_ps(a0, b1, c01); - c10 = _mm256_fmadd_ps(a1, b0, c10); - c11 = _mm256_fmadd_ps(a1, b1, c11); - a0 = _mm256_set1_ps(A[oa2]); - a1 = _mm256_set1_ps(A[oa3]); - c20 = _mm256_fmadd_ps(a0, b0, c20); - c21 = _mm256_fmadd_ps(a0, b1, c21); - c30 = _mm256_fmadd_ps(a1, b0, c30); - c31 = _mm256_fmadd_ps(a1, b1, c31); - a0 = _mm256_set1_ps(A[oa4]); - a1 = _mm256_set1_ps(A[oa5]); - c40 = _mm256_fmadd_ps(a0, b0, c40); - c41 = _mm256_fmadd_ps(a0, b1, c41); - c50 = _mm256_fmadd_ps(a1, b0, c50); - c51 = _mm256_fmadd_ps(a1, b1, c51); - B += sb; - A += sa; - } - __m256 _alpha = _mm256_set1_ps(alpha); - AddProduct(C + 0 * F, _alpha, c00); - AddProduct(C + 1 * F, _alpha, c01, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c10); - AddProduct(C + 1 * F, _alpha, c11, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c20); - AddProduct(C + 1 * F, _alpha, c21, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c30); - AddProduct(C + 1 * F, _alpha, c31, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c40); - AddProduct(C + 1 * F, _alpha, c41, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c50); - AddProduct(C + 1 * F, _alpha, c51, tail); - } - - SIMD_NOINLINE void GemmKernel6x8nn(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, size_t tail) - { - __m256 c0 = _mm256_setzero_ps(); - __m256 c1 = _mm256_setzero_ps(); - __m256 c2 = _mm256_setzero_ps(); - __m256 c3 = _mm256_setzero_ps(); - __m256 c4 = _mm256_setzero_ps(); - __m256 c5 = _mm256_setzero_ps(); - const size_t oa0 = lda * 0; - const size_t oa1 = lda * 1; - const size_t oa2 = lda * 2; - const size_t oa3 = lda * 3; - const size_t oa4 = lda * 4; - const size_t oa5 = lda * 5; - const size_t sa = lda == 1 ? 6 : 1; - const size_t ob0 = ldb * 0; - __m256 b0; - for (size_t k = 0; k < K; k++) - { - b0 = _mm256_loadu_ps(B + ob0); - c0 = _mm256_fmadd_ps(b0, _mm256_set1_ps(A[oa0]), c0); - c1 = _mm256_fmadd_ps(b0, _mm256_set1_ps(A[oa1]), c1); - c2 = _mm256_fmadd_ps(b0, _mm256_set1_ps(A[oa2]), c2); - c3 = _mm256_fmadd_ps(b0, _mm256_set1_ps(A[oa3]), c3); - c4 = _mm256_fmadd_ps(b0, _mm256_set1_ps(A[oa4]), c4); - c5 = _mm256_fmadd_ps(b0, _mm256_set1_ps(A[oa5]), c5); - B += sb; - A += sa; - } - __m256 _alpha = _mm256_set1_ps(alpha); - AddProduct(C + 0 * ldc, _alpha, c0, tail); - AddProduct(C + 1 * ldc, _alpha, c1, tail); - AddProduct(C + 2 * ldc, _alpha, c2, tail); - AddProduct(C + 3 * ldc, _alpha, c3, tail); - AddProduct(C + 4 * ldc, _alpha, c4, tail); - AddProduct(C + 5 * ldc, _alpha, c5, tail); - } - - SIMD_NOINLINE void GemmKernelMx24nn(size_t M, size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, size_t tail) - { - __m256 c[4][3]; - size_t oa[4]; - const size_t sa = lda == 1 ? 
M : 1; - const size_t ob0 = ldb * 0; - const size_t ob1 = ldb * 1; - const size_t ob2 = ldb * 2; - for (size_t i = 0; i < M; ++i) - { - c[i][0] = _mm256_setzero_ps(); - c[i][1] = _mm256_setzero_ps(); - c[i][2] = _mm256_setzero_ps(); - oa[i] = lda * i; - } - __m256 b0, b1, b2, a0; - for (size_t k = 0; k < K; k++) - { - b0 = _mm256_loadu_ps(B + ob0); - b1 = _mm256_loadu_ps(B + ob1); - b2 = _mm256_loadu_ps(B + ob2); - for (size_t i = 0; i < M; ++i) - { - a0 = _mm256_set1_ps(A[oa[i]]); - c[i][0] = _mm256_add_ps(_mm256_mul_ps(b0, a0), c[i][0]); - c[i][1] = _mm256_add_ps(_mm256_mul_ps(b1, a0), c[i][1]); - c[i][2] = _mm256_add_ps(_mm256_mul_ps(b2, a0), c[i][2]); - } - B += sb; - A += sa; - } - __m256 _alpha = _mm256_set1_ps(alpha); - for (size_t i = 0; i < M; ++i) - { - AddProduct(C + 0 * F, _alpha, c[i][0]); - AddProduct(C + 1 * F, _alpha, c[i][1]); - AddProduct(C + 2 * F, _alpha, c[i][2], tail); - C += ldc; - } - } - - SIMD_NOINLINE void GemmKernelMx16nn(size_t M, size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, size_t tail) - { - __m256 c[6][2]; - size_t oa[6]; - const size_t sa = lda == 1 ? M : 1; - const size_t ob0 = ldb * 0; - const size_t ob1 = ldb * 1; - for (size_t i = 0; i < M; ++i) - { - c[i][0] = _mm256_setzero_ps(); - c[i][1] = _mm256_setzero_ps(); - oa[i] = lda * i; - } - __m256 b0, b1, a0; - for (size_t k = 0; k < K; k++) - { - b0 = _mm256_loadu_ps(B + ob0); - b1 = _mm256_loadu_ps(B + ob1); - for (size_t i = 0; i < M; ++i) - { - a0 = _mm256_set1_ps(A[oa[i]]); - c[i][0] = _mm256_fmadd_ps(b0, a0, c[i][0]); - c[i][1] = _mm256_fmadd_ps(b1, a0, c[i][1]); - } - B += sb; - A += sa; - } - __m256 _alpha = _mm256_set1_ps(alpha); - for (size_t i = 0; i < M; ++i) - { - AddProduct(C + 0 * F, _alpha, c[i][0]); - AddProduct(C + 1 * F, _alpha, c[i][1], tail); - C += ldc; - } - } - - SIMD_NOINLINE void GemmKernelMx8nn(size_t M, size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, size_t tail) - { -#ifdef SIMD_X64_ENABLE - __m256 c[6]; - size_t oa[6]; -#else - __m256 c[4]; - size_t oa[4]; -#endif - const size_t sa = lda == 1 ? M : 1; - const size_t ob0 = ldb * 0; - for (size_t i = 0; i < M; ++i) - { - c[i] = _mm256_setzero_ps(); - oa[i] = lda * i; - } - __m256 b0, a0; - for (size_t k = 0; k < K; k++) - { - b0 = _mm256_loadu_ps(B + ob0); - for (size_t i = 0; i < M; ++i) - { - a0 = _mm256_set1_ps(A[oa[i]]); - c[i] = _mm256_fmadd_ps(b0, a0, c[i]); - } - B += sb; - A += sa; - } - __m256 _alpha = _mm256_set1_ps(alpha); - for (size_t i = 0; i < M; ++i) - AddProduct(C + i * ldc, _alpha, c[i], tail); - } - - template SIMD_NOINLINE void GemmKernelMx24nnT(size_t, size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, size_t tail) - { - __m256 c00, c01, c02, c03, c10, c11, c12, c13, c20, c21, c22, c23, b0, b1, b2, a0; - if (M > 0) c00 = _mm256_setzero_ps(), c10 = _mm256_setzero_ps(), c20 = _mm256_setzero_ps(); - if (M > 1) c01 = _mm256_setzero_ps(), c11 = _mm256_setzero_ps(), c21 = _mm256_setzero_ps(); - if (M > 2) c02 = _mm256_setzero_ps(), c12 = _mm256_setzero_ps(), c22 = _mm256_setzero_ps(); - if (M > 3) c03 = _mm256_setzero_ps(), c13 = _mm256_setzero_ps(), c23 = _mm256_setzero_ps(); - size_t oa0, oa1, oa2, oa3; - if (M > 0) oa0 = lda * 0; - if (M > 1) oa1 = lda * 1; - if (M > 2) oa2 = lda * 2; - if (M > 3) oa3 = lda * 3; - const size_t sa = lda == 1 ? 
M : 1; - const size_t ob0 = ldb * 0; - const size_t ob1 = ldb * 1; - const size_t ob2 = ldb * 2; - for (size_t k = 0; k < K; k++) - { - b0 = _mm256_loadu_ps(B + ob0); - b1 = _mm256_loadu_ps(B + ob1); - b2 = _mm256_loadu_ps(B + ob2); - if (M > 0) a0 = _mm256_set1_ps(A[oa0]), c00 = _mm256_fmadd_ps(b0, a0, c00), c10 = _mm256_fmadd_ps(b1, a0, c10), c20 = _mm256_fmadd_ps(b2, a0, c20); - if (M > 1) a0 = _mm256_set1_ps(A[oa1]), c01 = _mm256_fmadd_ps(b0, a0, c01), c11 = _mm256_fmadd_ps(b1, a0, c11), c21 = _mm256_fmadd_ps(b2, a0, c21); - if (M > 2) a0 = _mm256_set1_ps(A[oa2]), c02 = _mm256_fmadd_ps(b0, a0, c02), c12 = _mm256_fmadd_ps(b1, a0, c12), c22 = _mm256_fmadd_ps(b2, a0, c22); - if (M > 3) a0 = _mm256_set1_ps(A[oa3]), c03 = _mm256_fmadd_ps(b0, a0, c03), c13 = _mm256_fmadd_ps(b1, a0, c13), c23 = _mm256_fmadd_ps(b2, a0, c23); - B += sb; - A += sa; - } - __m256 _alpha = _mm256_set1_ps(alpha); - if (M > 0) AddProduct(C + 0 * F, _alpha, c00), AddProduct(C + 1 * F, _alpha, c10), AddProduct(C + 2 * F, _alpha, c20, tail), C += ldc; - if (M > 1) AddProduct(C + 0 * F, _alpha, c01), AddProduct(C + 1 * F, _alpha, c11), AddProduct(C + 2 * F, _alpha, c21, tail), C += ldc; - if (M > 2) AddProduct(C + 0 * F, _alpha, c02), AddProduct(C + 1 * F, _alpha, c12), AddProduct(C + 2 * F, _alpha, c22, tail), C += ldc; - if (M > 3) AddProduct(C + 0 * F, _alpha, c03), AddProduct(C + 1 * F, _alpha, c13), AddProduct(C + 2 * F, _alpha, c23, tail), C += ldc; - } - - template SIMD_NOINLINE void GemmKernelMx16nnT(size_t, size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, size_t tail) - { - __m256 c00, c01, c02, c03, c04, c05, c10, c11, c12, c13, c14, c15, b0, b1, a0; - if (M > 0) c00 = _mm256_setzero_ps(), c10 = _mm256_setzero_ps(); - if (M > 1) c01 = _mm256_setzero_ps(), c11 = _mm256_setzero_ps(); - if (M > 2) c02 = _mm256_setzero_ps(), c12 = _mm256_setzero_ps(); - if (M > 3) c03 = _mm256_setzero_ps(), c13 = _mm256_setzero_ps(); - if (M > 4) c04 = _mm256_setzero_ps(), c14 = _mm256_setzero_ps(); - if (M > 5) c05 = _mm256_setzero_ps(), c15 = _mm256_setzero_ps(); - size_t oa0, oa1, oa2, oa3, oa4, oa5; - if (M > 0) oa0 = lda * 0; - if (M > 1) oa1 = lda * 1; - if (M > 2) oa2 = lda * 2; - if (M > 3) oa3 = lda * 3; - if (M > 4) oa4 = lda * 4; - if (M > 5) oa5 = lda * 5; - const size_t sa = lda == 1 ? 
M : 1; - const size_t ob0 = ldb * 0; - const size_t ob1 = ldb * 1; - for (size_t k = 0; k < K; k++) - { - b0 = _mm256_loadu_ps(B + ob0); - b1 = _mm256_loadu_ps(B + ob1); - if (M > 0) a0 = _mm256_set1_ps(A[oa0]), c00 = _mm256_fmadd_ps(b0, a0, c00), c10 = _mm256_fmadd_ps(b1, a0, c10); - if (M > 1) a0 = _mm256_set1_ps(A[oa1]), c01 = _mm256_fmadd_ps(b0, a0, c01), c11 = _mm256_fmadd_ps(b1, a0, c11); - if (M > 2) a0 = _mm256_set1_ps(A[oa2]), c02 = _mm256_fmadd_ps(b0, a0, c02), c12 = _mm256_fmadd_ps(b1, a0, c12); - if (M > 3) a0 = _mm256_set1_ps(A[oa3]), c03 = _mm256_fmadd_ps(b0, a0, c03), c13 = _mm256_fmadd_ps(b1, a0, c13); - if (M > 4) a0 = _mm256_set1_ps(A[oa4]), c04 = _mm256_fmadd_ps(b0, a0, c04), c14 = _mm256_fmadd_ps(b1, a0, c14); - if (M > 5) a0 = _mm256_set1_ps(A[oa5]), c05 = _mm256_fmadd_ps(b0, a0, c05), c15 = _mm256_fmadd_ps(b1, a0, c15); - B += sb; - A += sa; - } - __m256 _alpha = _mm256_set1_ps(alpha); - if (M > 0) AddProduct(C + 0 * F, _alpha, c00), AddProduct(C + 1 * F, _alpha, c10, tail), C += ldc; - if (M > 1) AddProduct(C + 0 * F, _alpha, c01), AddProduct(C + 1 * F, _alpha, c11, tail), C += ldc; - if (M > 2) AddProduct(C + 0 * F, _alpha, c02), AddProduct(C + 1 * F, _alpha, c12, tail), C += ldc; - if (M > 3) AddProduct(C + 0 * F, _alpha, c03), AddProduct(C + 1 * F, _alpha, c13, tail), C += ldc; - if (M > 4) AddProduct(C + 0 * F, _alpha, c04), AddProduct(C + 1 * F, _alpha, c14, tail), C += ldc; - if (M > 5) AddProduct(C + 0 * F, _alpha, c05), AddProduct(C + 1 * F, _alpha, c15, tail), C += ldc; - } - - template SIMD_NOINLINE void GemmKernelMx8nnT(size_t, size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, size_t tail) - { - __m256 c00, c01, c02, c03, c04, c05, b0; - if (M > 0) c00 = _mm256_setzero_ps(); - if (M > 1) c01 = _mm256_setzero_ps(); - if (M > 2) c02 = _mm256_setzero_ps(); - if (M > 3) c03 = _mm256_setzero_ps(); - if (M > 4) c04 = _mm256_setzero_ps(); - if (M > 5) c05 = _mm256_setzero_ps(); - size_t oa0, oa1, oa2, oa3, oa4, oa5; - if (M > 0) oa0 = lda * 0; - if (M > 1) oa1 = lda * 1; - if (M > 2) oa2 = lda * 2; - if (M > 3) oa3 = lda * 3; - if (M > 4) oa4 = lda * 4; - if (M > 5) oa5 = lda * 5; - const size_t sa = lda == 1 ? 
M : 1; - const size_t ob0 = ldb * 0; - for (size_t k = 0; k < K; k++) - { - b0 = _mm256_loadu_ps(B + ob0); - if (M > 0) c00 = _mm256_fmadd_ps(b0, _mm256_set1_ps(A[oa0]), c00); - if (M > 1) c01 = _mm256_fmadd_ps(b0, _mm256_set1_ps(A[oa1]), c01); - if (M > 2) c02 = _mm256_fmadd_ps(b0, _mm256_set1_ps(A[oa2]), c02); - if (M > 3) c03 = _mm256_fmadd_ps(b0, _mm256_set1_ps(A[oa3]), c03); - if (M > 4) c04 = _mm256_fmadd_ps(b0, _mm256_set1_ps(A[oa4]), c04); - if (M > 5) c05 = _mm256_fmadd_ps(b0, _mm256_set1_ps(A[oa5]), c05); - B += sb; - A += sa; - } - __m256 _alpha = _mm256_set1_ps(alpha); - if (M > 0) AddProduct(C + 0 * ldc, _alpha, c00, tail); - if (M > 1) AddProduct(C + 1 * ldc, _alpha, c01, tail); - if (M > 2) AddProduct(C + 2 * ldc, _alpha, c02, tail); - if (M > 3) AddProduct(C + 3 * ldc, _alpha, c03, tail); - if (M > 4) AddProduct(C + 4 * ldc, _alpha, c04, tail); - if (M > 5) AddProduct(C + 5 * ldc, _alpha, c05, tail); - } - - SIMD_INLINE Simd::GemmNN::Tail GetGemmTail(size_t M, size_t N) - { - if (N <= 8) - { - switch (M) - { - case 0: return GemmKernelMx8nnT<0>; - case 1: return GemmKernelMx8nnT<1>; - case 2: return GemmKernelMx8nnT<2>; - case 3: return GemmKernelMx8nnT<3>; - case 4: return GemmKernelMx8nnT<4>; - case 5: return GemmKernelMx8nnT<5>; - } - } - else if (N <= 16) - { - switch (M) - { - case 0: return GemmKernelMx16nnT<0>; - case 1: return GemmKernelMx16nnT<1>; - case 2: return GemmKernelMx16nnT<2>; - case 3: return GemmKernelMx16nnT<3>; - case 4: return GemmKernelMx16nnT<4>; - case 5: return GemmKernelMx16nnT<5>; - } - } - else if (N <= 24) - { - switch (M) - { - case 0: return GemmKernelMx24nnT<0>; - case 1: return GemmKernelMx24nnT<1>; - case 2: return GemmKernelMx24nnT<2>; - case 3: return GemmKernelMx24nnT<3>; - } - } - assert(0); - return NULL; - } - - void Gemm32fNN(size_t M, size_t N, size_t K, const float * alpha, const float * A, size_t lda, const float * B, size_t ldb, const float * beta, float * C, size_t ldc) - { - SIMD_PERF_BEGF(Simd::ToStr(M) + "-" + Simd::ToStr(N) + "-" + Simd::ToStr(K), M*N*K*2); - - typedef Simd::GemmNN GemmNN; - GemmNN::Main kernelMM, kernelMT; - GemmNN::Tail kernelTM, kernelTT; - size_t microM, microN, L1, L2; -#ifdef SIMD_X64_ENABLE - if (N <= K && M != 4) - { - microM = 6; - microN = 16; - size_t tail = N - AlignLoAny(N, microN); - kernelMM = GemmKernel6x16nn; - kernelMT = tail > F ? GemmKernel6x16nn : GemmKernel6x8nn; - kernelTM = GemmKernelMx16nn; - kernelTT = tail > F ? GemmKernelMx16nn : GemmKernelMx8nn; - } - else - { - microM = 4; - microN = 24; - size_t tail = N - AlignLoAny(N, microN); - kernelMM = GemmKernel4x24nn; - kernelMT = tail > DF ? GemmKernel4x24nn : (tail > F ? GemmKernel4x16nn : GemmKernel4x8nn); - kernelTM = GemmKernelMx24nn; - kernelTT = tail > DF ? GemmKernelMx24nn : (tail > F ? GemmKernelMx16nn : GemmKernelMx8nn); - } -#else - microM = 4; - microN = 8; - kernelMM = GemmKernel4x8nn; - kernelMT = GemmKernel4x8nn; - kernelTM = GemmKernelMx8nn; - kernelTT = GemmKernelMx8nn; -#endif - GemmNN::PackA packA = NULL;// K*M > 1024 * 1024 ? Avx::GemmPackA : NULL; - L1 = N > 4096 ? Base::AlgCacheL2() : Base::AlgCacheL1(); - L2 = N > 4096 ? 
Base::AlgCacheL3() : Base::AlgCacheL2(); - GemmNN gemmNN(M, N, K, microM, microN, L1, L2, Base::AlgCacheL3(), F, - kernelMM, kernelMT, kernelTM, kernelTT, packA, Avx::GemmPackB, Avx::GemmScaleC, NULL); - gemmNN.Run(alpha, A, lda, B, ldb, beta, C, ldc); - } - - //--------------------------------------------------------------------- - - typedef Simd::GemmNNcb Gemm32fNNcb; - - SIMD_INLINE Gemm32fNNcb CreateGemm32fNNcb(size_t M, size_t N, size_t K, GemmKernelType type, bool compatibility) - { - Gemm32fNNcb::Main kernelMM, kernelMT; - Gemm32fNNcb::Tail kernelTM, kernelTT; - size_t microM, microN; -#ifdef SIMD_X64_ENABLE - if (type == GemmKernelF3 || (type == GemmKernelAny && (M == 4 || M == 8 || M == 16 || N == 24 || N == 48 || N == 96) && N > 16)) - { - microM = 4; - microN = 24; - size_t tail = N - AlignLoAny(N, microN); - kernelMM = Avx2::GemmKernel4x24nn; - kernelMT = tail > DF ? Avx2::GemmKernel4x24nn : (tail > F ? Avx2::GemmKernel4x16nn : Avx2::GemmKernel4x8nn); - kernelTM = Avx2::GetGemmTail(M%microM, microN); - kernelTT = Avx2::GetGemmTail(M%microM, tail); - type = GemmKernelF3; - } - if (type == GemmKernelF2 || (type == GemmKernelF3 && N <= 16) || (type == GemmKernelAny && N > 8)) - { - microM = 6; - microN = 16; - size_t tail = N - AlignLoAny(N, microN); - kernelMM = Avx2::GemmKernel6x16nn; - kernelMT = tail > F ? Avx2::GemmKernel6x16nn : Avx2::GemmKernel6x8nn; - kernelTM = Avx2::GetGemmTail(M%microM, microN); - kernelTT = Avx2::GetGemmTail(M%microM, tail); - type = GemmKernelF2; - } - if (type == GemmKernelF1 || (type == GemmKernelF2 && N <= 8) || type == GemmKernelAny) - { - microM = 6; - microN = 8; - kernelMM = Avx2::GemmKernel6x8nn; - kernelMT = Avx2::GemmKernel6x8nn; - kernelTM = Avx2::GetGemmTail(M%microM, microN); - kernelTT = Avx2::GetGemmTail(M%microM, microN); - type = GemmKernelF1; - } -#else - microM = 4; - microN = 8; - kernelMM = Avx2::GemmKernel4x8nn; - kernelMT = Avx2::GemmKernel4x8nn; - kernelTM = Avx2::GetGemmTail(M%microM, microN); - kernelTT = Avx2::GetGemmTail(M%microM, microN); -#endif - Gemm32fNNcb::PackA packA = (K >= 256 && M > 256) ? 
Avx::GemmPackA : NULL; - return Gemm32fNNcb(M, N, K, microM, microN, Base::AlgCacheL1(), Base::AlgCacheL2(), Base::AlgCacheL3(), - F, kernelMM, kernelMT, kernelTM, kernelTT, packA, Avx::GemmPackB, Avx::GemmScaleC, NULL, compatibility); - } - - size_t Gemm32fNNcbBufferSize(size_t M, size_t N, size_t K, GemmKernelType type, bool compatibility) - { - Gemm32fNNcb gemm = CreateGemm32fNNcb(M, N, K, type, compatibility); - return gemm.BufferSize(); - } - - void Gemm32fNNcbReorderB(size_t M, size_t N, size_t K, const float * B, float * pB, GemmKernelType type, bool compatibility) - { - Gemm32fNNcb gemm = CreateGemm32fNNcb(M, N, K, type, compatibility); - gemm.ReorderB(B, N, pB); - } - - void Gemm32fNNcbRun(size_t M, size_t N, size_t K, const float * A, const float * pB, float * C, GemmKernelType type, bool compatibility) - { - Gemm32fNNcb gemm = CreateGemm32fNNcb(M, N, K, type, compatibility); - gemm.Run(A, K, pB, C, N); - } - - //--------------------------------------------------------------------- - - SIMD_INLINE __m256 Tail(size_t tail) - { - const int32_t mask[DF] = { 0, 0, 0, 0, 0, 0, 0, 0 , -1, -1, -1, -1, -1, -1, -1, -1 }; - return _mm256_loadu_ps((float*)(mask + tail)); - } - - SIMD_INLINE void Add4ExtractedSums(const __m256 & sum0, const __m256 & sum1, const __m256 & sum2, const __m256 & sum3, const __m128 & alpha, float * dst) - { - __m256 sum256 = _mm256_hadd_ps(_mm256_hadd_ps(sum0, sum1), _mm256_hadd_ps(sum2, sum3)); - __m128 sum128 = _mm_add_ps(_mm256_extractf128_ps(sum256, 0), _mm256_extractf128_ps(sum256, 1)); - _mm_storeu_ps(dst, _mm_fmadd_ps(alpha, sum128, _mm_loadu_ps(dst))); - } - - static SIMD_NOINLINE void Kernel1x1x8nt(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc) - { - size_t K8 = K & (~7); - const float * A0 = A + 0 * lda; - const float * B0 = B + 0 * ldb; - __m256 c00 = _mm256_setzero_ps(); - __m256 a0, b0; - for (size_t k = 0; k < K8; k += 8) - { - a0 = _mm256_loadu_ps(A0 + k); - b0 = _mm256_loadu_ps(B0 + k); - c00 = _mm256_fmadd_ps(a0, b0, c00); - } - if (K8 < K) - { - size_t k = K - 8; - __m256 tail = Tail(K - K8); - a0 = _mm256_and_ps(tail, _mm256_loadu_ps(A0 + k)); - b0 = _mm256_loadu_ps(B0 + k); - c00 = _mm256_fmadd_ps(a0, b0, c00); - } - C[0] += alpha * Avx::ExtractSum(c00); - } - - static SIMD_NOINLINE void Kernel1x4x8nt(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc) - { - size_t K8 = K & (~7); - const float * A0 = A + 0 * lda; - const float * B0 = B + 0 * ldb; - const float * B1 = B + 1 * ldb; - const float * B2 = B + 2 * ldb; - const float * B3 = B + 3 * ldb; - __m256 c00 = _mm256_setzero_ps(); - __m256 c01 = _mm256_setzero_ps(); - __m256 c02 = _mm256_setzero_ps(); - __m256 c03 = _mm256_setzero_ps(); - __m256 a0, b0; - for (size_t k = 0; k < K8; k += 8) - { - a0 = _mm256_loadu_ps(A0 + k); - b0 = _mm256_loadu_ps(B0 + k); - c00 = _mm256_fmadd_ps(a0, b0, c00); - b0 = _mm256_loadu_ps(B1 + k); - c01 = _mm256_fmadd_ps(a0, b0, c01); - b0 = _mm256_loadu_ps(B2 + k); - c02 = _mm256_fmadd_ps(a0, b0, c02); - b0 = _mm256_loadu_ps(B3 + k); - c03 = _mm256_fmadd_ps(a0, b0, c03); - } - if (K8 < K) - { - size_t k = K - 8; - __m256 tail = Tail(K - K8); - a0 = _mm256_and_ps(tail, _mm256_loadu_ps(A0 + k)); - b0 = _mm256_loadu_ps(B0 + k); - c00 = _mm256_fmadd_ps(a0, b0, c00); - b0 = _mm256_loadu_ps(B1 + k); - c01 = _mm256_fmadd_ps(a0, b0, c01); - b0 = _mm256_loadu_ps(B2 + k); - c02 = _mm256_fmadd_ps(a0, b0, c02); - b0 = _mm256_loadu_ps(B3 + k); - c03 = _mm256_fmadd_ps(a0, 
b0, c03); - } - __m128 _alpha = _mm_set1_ps(alpha); - Add4ExtractedSums(c00, c01, c02, c03, _alpha, C + 0 * ldc); - } - - static SIMD_NOINLINE void Kernel2x1x8nt(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc) - { - size_t K8 = K & (~7); - const float * A0 = A + 0 * lda; - const float * A1 = A + 1 * lda; - const float * B0 = B + 0 * ldb; - __m256 c00 = _mm256_setzero_ps(); - __m256 c10 = _mm256_setzero_ps(); - __m256 a0, a1, b0; - for (size_t k = 0; k < K8; k += 8) - { - a0 = _mm256_loadu_ps(A0 + k); - a1 = _mm256_loadu_ps(A1 + k); - b0 = _mm256_loadu_ps(B0 + k); - c00 = _mm256_fmadd_ps(a0, b0, c00); - c10 = _mm256_fmadd_ps(a1, b0, c10); - } - if (K8 < K) - { - size_t k = K - 8; - __m256 tail = Tail(K - K8); - a0 = _mm256_and_ps(tail, _mm256_loadu_ps(A0 + k)); - a1 = _mm256_and_ps(tail, _mm256_loadu_ps(A1 + k)); - b0 = _mm256_loadu_ps(B0 + k); - c00 = _mm256_fmadd_ps(a0, b0, c00); - c10 = _mm256_fmadd_ps(a1, b0, c10); - } - C[0 * ldc] += alpha * Avx::ExtractSum(c00); - C[1 * ldc] += alpha * Avx::ExtractSum(c10); - } - - static SIMD_NOINLINE void Kernel2x4x8nt(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc) - { - size_t K8 = K & (~7); - const float * A0 = A + 0 * lda; - const float * A1 = A + 1 * lda; - const float * B0 = B + 0 * ldb; - const float * B1 = B + 1 * ldb; - const float * B2 = B + 2 * ldb; - const float * B3 = B + 3 * ldb; - __m256 c00 = _mm256_setzero_ps(); - __m256 c01 = _mm256_setzero_ps(); - __m256 c02 = _mm256_setzero_ps(); - __m256 c03 = _mm256_setzero_ps(); - __m256 c10 = _mm256_setzero_ps(); - __m256 c11 = _mm256_setzero_ps(); - __m256 c12 = _mm256_setzero_ps(); - __m256 c13 = _mm256_setzero_ps(); - __m256 a0, a1, b0; - for (size_t k = 0; k < K8; k += 8) - { - a0 = _mm256_loadu_ps(A0 + k); - a1 = _mm256_loadu_ps(A1 + k); - b0 = _mm256_loadu_ps(B0 + k); - c00 = _mm256_fmadd_ps(a0, b0, c00); - c10 = _mm256_fmadd_ps(a1, b0, c10); - b0 = _mm256_loadu_ps(B1 + k); - c01 = _mm256_fmadd_ps(a0, b0, c01); - c11 = _mm256_fmadd_ps(a1, b0, c11); - b0 = _mm256_loadu_ps(B2 + k); - c02 = _mm256_fmadd_ps(a0, b0, c02); - c12 = _mm256_fmadd_ps(a1, b0, c12); - b0 = _mm256_loadu_ps(B3 + k); - c03 = _mm256_fmadd_ps(a0, b0, c03); - c13 = _mm256_fmadd_ps(a1, b0, c13); - } - if (K8 < K) - { - size_t k = K - 8; - __m256 tail = Tail(K - K8); - a0 = _mm256_and_ps(tail, _mm256_loadu_ps(A0 + k)); - a1 = _mm256_and_ps(tail, _mm256_loadu_ps(A1 + k)); - b0 = _mm256_loadu_ps(B0 + k); - c00 = _mm256_fmadd_ps(a0, b0, c00); - c10 = _mm256_fmadd_ps(a1, b0, c10); - b0 = _mm256_loadu_ps(B1 + k); - c01 = _mm256_fmadd_ps(a0, b0, c01); - c11 = _mm256_fmadd_ps(a1, b0, c11); - b0 = _mm256_loadu_ps(B2 + k); - c02 = _mm256_fmadd_ps(a0, b0, c02); - c12 = _mm256_fmadd_ps(a1, b0, c12); - b0 = _mm256_loadu_ps(B3 + k); - c03 = _mm256_fmadd_ps(a0, b0, c03); - c13 = _mm256_fmadd_ps(a1, b0, c13); - } - __m128 _alpha = _mm_set1_ps(alpha); - Add4ExtractedSums(c00, c01, c02, c03, _alpha, C + 0 * ldc); - Add4ExtractedSums(c10, c11, c12, c13, _alpha, C + 1 * ldc); - } - - static SIMD_NOINLINE void Kernel3x1x8nt(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc) - { - size_t K8 = K & (~7); - const float * A0 = A + 0 * lda; - const float * A1 = A + 1 * lda; - const float * A2 = A + 2 * lda; - const float * B0 = B + 0 * ldb; - __m256 c00 = _mm256_setzero_ps(); - __m256 c10 = _mm256_setzero_ps(); - __m256 c20 = _mm256_setzero_ps(); - __m256 a0, a1, a2, b0; - for (size_t k 
= 0; k < K8; k += 8) - { - a0 = _mm256_loadu_ps(A0 + k); - a1 = _mm256_loadu_ps(A1 + k); - a2 = _mm256_loadu_ps(A2 + k); - b0 = _mm256_loadu_ps(B0 + k); - c00 = _mm256_fmadd_ps(a0, b0, c00); - c10 = _mm256_fmadd_ps(a1, b0, c10); - c20 = _mm256_fmadd_ps(a2, b0, c20); - } - if (K8 < K) - { - size_t k = K - 8; - __m256 tail = Tail(K - K8); - a0 = _mm256_and_ps(tail, _mm256_loadu_ps(A0 + k)); - a1 = _mm256_and_ps(tail, _mm256_loadu_ps(A1 + k)); - a2 = _mm256_and_ps(tail, _mm256_loadu_ps(A2 + k)); - b0 = _mm256_loadu_ps(B0 + k); - c00 = _mm256_fmadd_ps(a0, b0, c00); - c10 = _mm256_fmadd_ps(a1, b0, c10); - c20 = _mm256_fmadd_ps(a2, b0, c20); - } - C[0 * ldc] += alpha * Avx::ExtractSum(c00); - C[1 * ldc] += alpha * Avx::ExtractSum(c10); - C[2 * ldc] += alpha * Avx::ExtractSum(c20); - } - - static SIMD_NOINLINE void Kernel3x4x8nt(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc) - { - size_t K8 = K & (~7); - const float * A0 = A + 0 * lda; - const float * A1 = A + 1 * lda; - const float * A2 = A + 2 * lda; - const float * B0 = B + 0 * ldb; - const float * B1 = B + 1 * ldb; - const float * B2 = B + 2 * ldb; - const float * B3 = B + 3 * ldb; - __m256 c00 = _mm256_setzero_ps(); - __m256 c01 = _mm256_setzero_ps(); - __m256 c02 = _mm256_setzero_ps(); - __m256 c03 = _mm256_setzero_ps(); - __m256 c10 = _mm256_setzero_ps(); - __m256 c11 = _mm256_setzero_ps(); - __m256 c12 = _mm256_setzero_ps(); - __m256 c13 = _mm256_setzero_ps(); - __m256 c20 = _mm256_setzero_ps(); - __m256 c21 = _mm256_setzero_ps(); - __m256 c22 = _mm256_setzero_ps(); - __m256 c23 = _mm256_setzero_ps(); - __m256 a0, a1, a2, b0; - for (size_t k = 0; k < K8; k += 8) - { - a0 = _mm256_loadu_ps(A0 + k); - a1 = _mm256_loadu_ps(A1 + k); - a2 = _mm256_loadu_ps(A2 + k); - b0 = _mm256_loadu_ps(B0 + k); - c00 = _mm256_fmadd_ps(a0, b0, c00); - c10 = _mm256_fmadd_ps(a1, b0, c10); - c20 = _mm256_fmadd_ps(a2, b0, c20); - b0 = _mm256_loadu_ps(B1 + k); - c01 = _mm256_fmadd_ps(a0, b0, c01); - c11 = _mm256_fmadd_ps(a1, b0, c11); - c21 = _mm256_fmadd_ps(a2, b0, c21); - b0 = _mm256_loadu_ps(B2 + k); - c02 = _mm256_fmadd_ps(a0, b0, c02); - c12 = _mm256_fmadd_ps(a1, b0, c12); - c22 = _mm256_fmadd_ps(a2, b0, c22); - b0 = _mm256_loadu_ps(B3 + k); - c03 = _mm256_fmadd_ps(a0, b0, c03); - c13 = _mm256_fmadd_ps(a1, b0, c13); - c23 = _mm256_fmadd_ps(a2, b0, c23); - } - if (K8 < K) - { - size_t k = K - 8; - __m256 tail = Tail(K - K8); - a0 = _mm256_and_ps(tail, _mm256_loadu_ps(A0 + k)); - a1 = _mm256_and_ps(tail, _mm256_loadu_ps(A1 + k)); - a2 = _mm256_and_ps(tail, _mm256_loadu_ps(A2 + k)); - b0 = _mm256_loadu_ps(B0 + k); - c00 = _mm256_fmadd_ps(a0, b0, c00); - c10 = _mm256_fmadd_ps(a1, b0, c10); - c20 = _mm256_fmadd_ps(a2, b0, c20); - b0 = _mm256_loadu_ps(B1 + k); - c01 = _mm256_fmadd_ps(a0, b0, c01); - c11 = _mm256_fmadd_ps(a1, b0, c11); - c21 = _mm256_fmadd_ps(a2, b0, c21); - b0 = _mm256_loadu_ps(B2 + k); - c02 = _mm256_fmadd_ps(a0, b0, c02); - c12 = _mm256_fmadd_ps(a1, b0, c12); - c22 = _mm256_fmadd_ps(a2, b0, c22); - b0 = _mm256_loadu_ps(B3 + k); - c03 = _mm256_fmadd_ps(a0, b0, c03); - c13 = _mm256_fmadd_ps(a1, b0, c13); - c23 = _mm256_fmadd_ps(a2, b0, c23); - } - __m128 _alpha = _mm_set1_ps(alpha); - Add4ExtractedSums(c00, c01, c02, c03, _alpha, C + 0 * ldc); - Add4ExtractedSums(c10, c11, c12, c13, _alpha, C + 1 * ldc); - Add4ExtractedSums(c20, c21, c22, c23, _alpha, C + 2 * ldc); - } - - void Gemm32fNT(size_t M, size_t N, size_t K, const float * alpha, const float * A, size_t lda, const float * B, size_t 
ldb, const float * beta, float * C, size_t ldc) - { - //SIMD_PERF_BEGF(Simd::ToStr(M) + "-" + Simd::ToStr(N) + "-" + Simd::ToStr(K), M*N*K * 2); - - typedef Simd::GemmNT GemmNT; -#ifdef SIMD_X64_ENABLE - GemmNT gemmNT(M, N, K, Base::AlgCacheL1(), Base::AlgCacheL2(), Base::AlgCacheL3(), F, Avx::GemmScaleC, - Kernel1x1x8nt, Kernel1x4x8nt, Kernel2x1x8nt, Kernel2x4x8nt, Kernel3x1x8nt, Kernel3x4x8nt, NULL, NULL); -#else - GemmNT gemmNT(M, N, K, Base::AlgCacheL1(), Base::AlgCacheL2(), Base::AlgCacheL3(), F, Sse::GemmScaleC, - Kernel1x1x8nt, Kernel1x4x8nt, NULL, NULL, NULL, NULL, NULL, NULL); -#endif - gemmNT.Run(alpha, A, lda, B, ldb, beta, C, ldc); - } - } -#endif// SIMD_AVX2_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx2GrayToBgr.cpp b/src/3rd/Simd/Simd/SimdAvx2GrayToBgr.cpp deleted file mode 100644 index ce204601..00000000 --- a/src/3rd/Simd/Simd/SimdAvx2GrayToBgr.cpp +++ /dev/null @@ -1,73 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdStore.h" -#include "Simd/SimdMemory.h" -#include "Simd/SimdConversion.h" - -namespace Simd -{ -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - template SIMD_INLINE void GrayToBgr(uint8_t * bgr, __m256i gray) - { - Store((__m256i*)bgr + 0, GrayToBgr<0>(gray)); - Store((__m256i*)bgr + 1, GrayToBgr<1>(gray)); - Store((__m256i*)bgr + 2, GrayToBgr<2>(gray)); - } - - template void GrayToBgr(const uint8_t * gray, size_t width, size_t height, size_t grayStride, uint8_t *bgr, size_t bgrStride) - { - assert(width >= A); - if (align) - assert(Aligned(bgr) && Aligned(bgrStride) && Aligned(gray) && Aligned(grayStride)); - - size_t alignedWidth = AlignLo(width, A); - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - { - __m256i _gray = Load((__m256i*)(gray + col)); - GrayToBgr(bgr + 3 * col, _gray); - } - if (alignedWidth != width) - { - __m256i _gray = Load((__m256i*)(gray + width - A)); - GrayToBgr(bgr + 3 * (width - A), _gray); - } - gray += grayStride; - bgr += bgrStride; - } - } - - void GrayToBgr(const uint8_t *gray, size_t width, size_t height, size_t grayStride, uint8_t *bgr, size_t bgrStride) - { - if (Aligned(bgr) && Aligned(gray) && Aligned(bgrStride) && Aligned(grayStride)) - GrayToBgr(gray, width, height, grayStride, bgr, bgrStride); - else - GrayToBgr(gray, width, height, grayStride, bgr, bgrStride); - } - } -#endif// SIMD_AVX2_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx2GrayToBgra.cpp b/src/3rd/Simd/Simd/SimdAvx2GrayToBgra.cpp deleted file mode 100644 index a643f49d..00000000 --- a/src/3rd/Simd/Simd/SimdAvx2GrayToBgra.cpp +++ /dev/null @@ -1,80 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdStore.h" -#include "Simd/SimdMemory.h" - -namespace Simd -{ -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - template SIMD_INLINE void GrayToBgra(uint8_t * bgra, __m256i gray, __m256i alpha) - { - __m256i bgLo = _mm256_unpacklo_epi8(gray, gray); - __m256i bgHi = _mm256_unpackhi_epi8(gray, gray); - __m256i raLo = _mm256_unpacklo_epi8(gray, alpha); - __m256i raHi = _mm256_unpackhi_epi8(gray, alpha); - - Store((__m256i*)bgra + 0, _mm256_unpacklo_epi16(bgLo, raLo)); - Store((__m256i*)bgra + 1, _mm256_unpackhi_epi16(bgLo, raLo)); - Store((__m256i*)bgra + 2, _mm256_unpacklo_epi16(bgHi, raHi)); - Store((__m256i*)bgra + 3, _mm256_unpackhi_epi16(bgHi, raHi)); - } - - template void GrayToBgra(const uint8_t *gray, size_t width, size_t height, size_t grayStride, uint8_t *bgra, size_t bgraStride, uint8_t alpha) - { - assert(width >= A); - if (align) - assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(gray) && Aligned(grayStride)); - - __m256i _alpha = _mm256_set1_epi8(alpha); - __m256i permuteOffsets = _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7); - size_t alignedWidth = AlignLo(width, A); - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - { - __m256i _gray = _mm256_permutevar8x32_epi32(Load((__m256i*)(gray + col)), permuteOffsets); - GrayToBgra(bgra + 4 * col, _gray, _alpha); - } - if (alignedWidth != width) - { - __m256i _gray = _mm256_permutevar8x32_epi32(Load((__m256i*)(gray + width - A)), permuteOffsets); - GrayToBgra(bgra + 4 * (width - A), _gray, _alpha); - } - gray += grayStride; - bgra += bgraStride; - } - } - - void GrayToBgra(const uint8_t *gray, size_t width, size_t height, size_t grayStride, uint8_t *bgra, size_t bgraStride, uint8_t alpha) - { - if (Aligned(bgra) && Aligned(gray) && Aligned(bgraStride) && Aligned(grayStride)) - GrayToBgra(gray, width, height, grayStride, bgra, bgraStride, alpha); - else - GrayToBgra(gray, width, height, grayStride, bgra, bgraStride, alpha); - } - } -#endif// SIMD_AVX2_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx2Histogram.cpp b/src/3rd/Simd/Simd/SimdAvx2Histogram.cpp deleted file mode 100644 index faf6770b..00000000 --- a/src/3rd/Simd/Simd/SimdAvx2Histogram.cpp +++ /dev/null @@ -1,280 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdCompare.h" - -namespace Simd -{ -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - namespace - { - template struct Buffer - { - Buffer(size_t rowSize, size_t histogramSize) - { - _p = Allocate(sizeof(T)*rowSize + 4 * sizeof(uint32_t)*histogramSize); - v = (T*)_p; - h[0] = (uint32_t *)(v + rowSize); - h[1] = h[0] + histogramSize; - h[2] = h[1] + histogramSize; - h[3] = h[2] + histogramSize; - memset(h[0], 0, 4 * sizeof(uint32_t)*histogramSize); - } - - ~Buffer() - { - Free(_p); - } - - T * v; - uint32_t * h[4]; - private: - void *_p; - }; - } - - template - SIMD_INLINE __m256i AbsSecondDerivative(const uint8_t * src, ptrdiff_t step) - { - const __m256i s0 = Load((__m256i*)(src - step)); - const __m256i s1 = Load((__m256i*)src); - const __m256i s2 = Load((__m256i*)(src + step)); - return AbsDifferenceU8(_mm256_avg_epu8(s0, s2), s1); - } - - template - SIMD_INLINE void AbsSecondDerivative(const uint8_t * src, ptrdiff_t colStep, ptrdiff_t rowStep, uint8_t * dst) - { - const __m256i sdX = AbsSecondDerivative(src, colStep); - const __m256i sdY = AbsSecondDerivative(src, rowStep); - Store((__m256i*)dst, _mm256_max_epu8(sdY, sdX)); - } - - SIMD_INLINE void SumHistograms(uint32_t * src, size_t start, uint32_t * dst) - { - uint32_t * src0 = src + start; - uint32_t * src1 = src0 + start + HISTOGRAM_SIZE; - uint32_t * src2 = src1 + start + HISTOGRAM_SIZE; - uint32_t * src3 = src2 + start + HISTOGRAM_SIZE; - for (size_t i = 0; i < HISTOGRAM_SIZE; i += 8) - Store((__m256i*)(dst + i), _mm256_add_epi32( - _mm256_add_epi32(Load((__m256i*)(src0 + i)), Load((__m256i*)(src1 + i))), - _mm256_add_epi32(Load((__m256i*)(src2 + i)), Load((__m256i*)(src3 + i))))); - } - - template void AbsSecondDerivativeHistogram(const uint8_t *src, size_t width, size_t height, size_t stride, - size_t step, size_t indent, uint32_t * histogram) - { - Buffer buffer(AlignHi(width, A), HISTOGRAM_SIZE); - buffer.v += indent; - src += indent*(stride + 1); - height -= 2 * indent; - width -= 2 * indent; - - ptrdiff_t bodyStart = (uint8_t*)AlignHi(buffer.v, A) - buffer.v; - ptrdiff_t bodyEnd = bodyStart + AlignLo(width - bodyStart, A); - size_t rowStep = step*stride; - size_t alignedWidth = Simd::AlignLo(width, 4); - for (size_t row = 0; row < height; ++row) - { - if (bodyStart) - AbsSecondDerivative(src, step, rowStep, buffer.v); - for (ptrdiff_t col = bodyStart; col < bodyEnd; col += A) - AbsSecondDerivative(src + col, step, rowStep, buffer.v + col); - if (width != (size_t)bodyEnd) - AbsSecondDerivative(src + width - A, step, rowStep, buffer.v + width - A); - - size_t col = 0; - for (; col < alignedWidth; col += 4) - { - ++buffer.h[0][buffer.v[col + 0]]; - ++buffer.h[1][buffer.v[col + 1]]; - ++buffer.h[2][buffer.v[col + 2]]; - ++buffer.h[3][buffer.v[col + 3]]; - } - for (; col < width; ++col) - ++buffer.h[0][buffer.v[col + 0]]; - src += stride; - } - - SumHistograms(buffer.h[0], 0, histogram); - } - - void AbsSecondDerivativeHistogram(const uint8_t *src, size_t width, size_t height, size_t stride, - size_t step, size_t indent, uint32_t * histogram) - { - assert(width > 2 * indent && height > 2 * indent && indent >= step && width >= A + 2 * indent); - - if (Aligned(src) && Aligned(stride)) - AbsSecondDerivativeHistogram(src, width, height, stride, step, indent, histogram); - else - AbsSecondDerivativeHistogram(src, width, height, stride, step, indent, histogram); - } - - template - SIMD_INLINE void MaskSrc(const uint8_t * src, const uint8_t * mask, const 
__m256i & index, ptrdiff_t offset, uint16_t * dst) - { - const __m256i _src = Load((__m256i*)(src + offset)); - const __m256i _mask = _mm256_and_si256(_mm256_cmpeq_epi8(Load((__m256i*)(mask + offset)), index), K8_01); - __m256i lo = _mm256_mullo_epi16(_mm256_add_epi16(K16_0008, UnpackU8<0>(_src)), UnpackU8<0>(_mask)); - __m256i hi = _mm256_mullo_epi16(_mm256_add_epi16(K16_0008, UnpackU8<1>(_src)), UnpackU8<1>(_mask)); - Store((__m256i*)(dst + offset) + 0, _mm256_permute2x128_si256(lo, hi, 0x20)); - Store((__m256i*)(dst + offset) + 1, _mm256_permute2x128_si256(lo, hi, 0x31)); - } - - template void HistogramMasked(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * mask, size_t maskStride, uint8_t index, uint32_t * histogram) - { - Buffer buffer(AlignHi(width, A), HISTOGRAM_SIZE + 8); - size_t widthAligned4 = Simd::AlignLo(width, 4); - size_t widthAlignedA = Simd::AlignLo(width, A); - size_t widthAlignedDA = Simd::AlignLo(width, DA); - __m256i _index = _mm256_set1_epi8(index); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < widthAlignedDA; col += DA) - { - MaskSrc(src, mask, _index, col, buffer.v); - MaskSrc(src, mask, _index, col + A, buffer.v); - } - for (; col < widthAlignedA; col += A) - MaskSrc(src, mask, _index, col, buffer.v); - if (width != widthAlignedA) - MaskSrc(src, mask, _index, width - A, buffer.v); - - for (col = 0; col < widthAligned4; col += 4) - { - ++buffer.h[0][buffer.v[col + 0]]; - ++buffer.h[1][buffer.v[col + 1]]; - ++buffer.h[2][buffer.v[col + 2]]; - ++buffer.h[3][buffer.v[col + 3]]; - } - for (; col < width; ++col) - ++buffer.h[0][buffer.v[col]]; - - src += srcStride; - mask += maskStride; - } - - SumHistograms(buffer.h[0], 8, histogram); - } - - void HistogramMasked(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * mask, size_t maskStride, uint8_t index, uint32_t * histogram) - { - assert(width >= A); - - if (Aligned(src) && Aligned(srcStride) && Aligned(mask) && Aligned(maskStride)) - HistogramMasked(src, srcStride, width, height, mask, maskStride, index, histogram); - else - HistogramMasked(src, srcStride, width, height, mask, maskStride, index, histogram); - } - - template - SIMD_INLINE void ConditionalSrc(const uint8_t * src, const uint8_t * mask, const __m256i & value, ptrdiff_t offset, uint16_t * dst) - { - const __m256i _src = Load((__m256i*)(src + offset)); - const __m256i _mask = _mm256_and_si256(Compare8u(Load((__m256i*)(mask + offset)), value), K8_01); - __m256i lo = _mm256_mullo_epi16(_mm256_add_epi16(K16_0008, UnpackU8<0>(_src)), UnpackU8<0>(_mask)); - __m256i hi = _mm256_mullo_epi16(_mm256_add_epi16(K16_0008, UnpackU8<1>(_src)), UnpackU8<1>(_mask)); - Store((__m256i*)(dst + offset) + 0, _mm256_permute2x128_si256(lo, hi, 0x20)); - Store((__m256i*)(dst + offset) + 1, _mm256_permute2x128_si256(lo, hi, 0x31)); - } - - template void HistogramConditional(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * mask, size_t maskStride, uint8_t value, uint32_t * histogram) - { - Buffer buffer(AlignHi(width, A), HISTOGRAM_SIZE + 8); - size_t widthAligned4 = Simd::AlignLo(width, 4); - size_t widthAlignedA = Simd::AlignLo(width, A); - size_t widthAlignedDA = Simd::AlignLo(width, DA); - __m256i _value = _mm256_set1_epi8(value); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < widthAlignedDA; col += DA) - { - ConditionalSrc(src, mask, _value, col, buffer.v); - ConditionalSrc(src, mask, _value, col + A, 
buffer.v); - } - for (; col < widthAlignedA; col += A) - ConditionalSrc(src, mask, _value, col, buffer.v); - if (width != widthAlignedA) - ConditionalSrc(src, mask, _value, width - A, buffer.v); - - for (col = 0; col < widthAligned4; col += 4) - { - ++buffer.h[0][buffer.v[col + 0]]; - ++buffer.h[1][buffer.v[col + 1]]; - ++buffer.h[2][buffer.v[col + 2]]; - ++buffer.h[3][buffer.v[col + 3]]; - } - for (; col < width; ++col) - ++buffer.h[0][buffer.v[col]]; - - src += srcStride; - mask += maskStride; - } - - SumHistograms(buffer.h[0], 8, histogram); - } - - template - void HistogramConditional(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * mask, size_t maskStride, uint8_t value, uint32_t * histogram) - { - if (Aligned(src) && Aligned(srcStride) && Aligned(mask) && Aligned(maskStride)) - return HistogramConditional(src, srcStride, width, height, mask, maskStride, value, histogram); - else - return HistogramConditional(src, srcStride, width, height, mask, maskStride, value, histogram); - } - - void HistogramConditional(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * mask, size_t maskStride, uint8_t value, SimdCompareType compareType, uint32_t * histogram) - { - switch (compareType) - { - case SimdCompareEqual: - return HistogramConditional(src, srcStride, width, height, mask, maskStride, value, histogram); - case SimdCompareNotEqual: - return HistogramConditional(src, srcStride, width, height, mask, maskStride, value, histogram); - case SimdCompareGreater: - return HistogramConditional(src, srcStride, width, height, mask, maskStride, value, histogram); - case SimdCompareGreaterOrEqual: - return HistogramConditional(src, srcStride, width, height, mask, maskStride, value, histogram); - case SimdCompareLesser: - return HistogramConditional(src, srcStride, width, height, mask, maskStride, value, histogram); - case SimdCompareLesserOrEqual: - return HistogramConditional(src, srcStride, width, height, mask, maskStride, value, histogram); - default: - assert(0); - } - } - } -#endif// SIMD_AVX2_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx2Hog.cpp b/src/3rd/Simd/Simd/SimdAvx2Hog.cpp deleted file mode 100644 index 18b748ce..00000000 --- a/src/3rd/Simd/Simd/SimdAvx2Hog.cpp +++ /dev/null @@ -1,922 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdStore.h" -#include "Simd/SimdBase.h" -#include "Simd/SimdArray.h" - -namespace Simd -{ -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - namespace - { - struct Buffer - { - const int size; - __m256 * cos, *sin; - __m256i * pos, *neg; - int * index; - float * value; - - Buffer(size_t width, size_t quantization) - : size((int)quantization / 2) - { - width = AlignHi(width, A / sizeof(float)); - _p = Allocate(width*(sizeof(int) + sizeof(float)) + (sizeof(__m256i) + sizeof(__m256)) * 2 * size); - index = (int*)_p - 1; - value = (float*)index + width; - cos = (__m256*)(value + width + 1); - sin = cos + size; - pos = (__m256i*)(sin + size); - neg = pos + size; - for (int i = 0; i < size; ++i) - { - cos[i] = _mm256_set1_ps((float)::cos(i*M_PI / size)); - sin[i] = _mm256_set1_ps((float)::sin(i*M_PI / size)); - pos[i] = _mm256_set1_epi32(i); - neg[i] = _mm256_set1_epi32(size + i); - } - } - - ~Buffer() - { - Free(_p); - } - - private: - void *_p; - }; - } - - template SIMD_INLINE void HogDirectionHistograms(const __m256 & dx, const __m256 & dy, Buffer & buffer, size_t col) - { - __m256 bestDot = _mm256_setzero_ps(); - __m256i bestIndex = _mm256_setzero_si256(); - for (int i = 0; i < buffer.size; ++i) - { - __m256 dot = _mm256_fmadd_ps(dx, buffer.cos[i], _mm256_mul_ps(dy, buffer.sin[i])); - __m256 mask = _mm256_cmp_ps(dot, bestDot, _CMP_GT_OS); - bestDot = _mm256_max_ps(dot, bestDot); - bestIndex = _mm256_blendv_epi8(bestIndex, buffer.pos[i], _mm256_castps_si256(mask)); - - dot = _mm256_sub_ps(_mm256_setzero_ps(), dot); - mask = _mm256_cmp_ps(dot, bestDot, _CMP_GT_OS); - bestDot = _mm256_max_ps(dot, bestDot); - bestIndex = _mm256_blendv_epi8(bestIndex, buffer.neg[i], _mm256_castps_si256(mask)); - } - Store((__m256i*)(buffer.index + col), bestIndex); - Avx::Store(buffer.value + col, Avx::Sqrt<0>(_mm256_fmadd_ps(dx, dx, _mm256_mul_ps(dy, dy)))); - } - - template SIMD_INLINE void HogDirectionHistograms(const __m256i & t, const __m256i & l, const __m256i & r, const __m256i & b, Buffer & buffer, size_t col) - { - HogDirectionHistograms( - _mm256_cvtepi32_ps(_mm256_sub_epi32(_mm256_unpacklo_epi16(r, K_ZERO), _mm256_unpacklo_epi16(l, K_ZERO))), - _mm256_cvtepi32_ps(_mm256_sub_epi32(_mm256_unpacklo_epi16(b, K_ZERO), _mm256_unpacklo_epi16(t, K_ZERO))), - buffer, col + 0); - HogDirectionHistograms( - _mm256_cvtepi32_ps(_mm256_sub_epi32(_mm256_unpackhi_epi16(r, K_ZERO), _mm256_unpackhi_epi16(l, K_ZERO))), - _mm256_cvtepi32_ps(_mm256_sub_epi32(_mm256_unpackhi_epi16(b, K_ZERO), _mm256_unpackhi_epi16(t, K_ZERO))), - buffer, col + 8); - } - - template SIMD_INLINE void HogDirectionHistograms(const uint8_t * src, size_t stride, Buffer & buffer, size_t col) - { - const uint8_t * s = src + col; - __m256i t = LoadPermuted((__m256i*)(s - stride)); - __m256i l = LoadPermuted((__m256i*)(s - 1)); - __m256i r = LoadPermuted((__m256i*)(s + 1)); - __m256i b = LoadPermuted((__m256i*)(s + stride)); - HogDirectionHistograms(PermutedUnpackLoU8(t), PermutedUnpackLoU8(l), PermutedUnpackLoU8(r), PermutedUnpackLoU8(b), buffer, col + 0); - HogDirectionHistograms(PermutedUnpackHiU8(t), PermutedUnpackHiU8(l), PermutedUnpackHiU8(r), PermutedUnpackHiU8(b), buffer, col + 16); - } - - namespace Custom_8x8_18 - { - struct Buffer - { - __m256i pos[5]; - __m256 cos[5], sin[5]; - __m128 kx[8], ky[8]; - - int * index; - float * value; - __m128 * hist; - size_t hs; - - Buffer(size_t width) - { - width = AlignHi(width, A / sizeof(float)); - hs = (width / 8 + 1) * 18 * sizeof(__m128); - _p = Allocate(width*(sizeof(int) + 
sizeof(float)) + hs); - index = (int*)_p - 1; - value = (float*)index + width; - hist = (__m128*)(value + width + 1); - - for (int i = 0; i < 5; ++i) - { - cos[i] = _mm256_set1_ps((float)::cos(i*M_PI / 9)); - sin[i] = _mm256_set1_ps((float)::sin(i*M_PI / 9)); - pos[i] = _mm256_set1_epi32(i); - } - for (int i = 0; i < 8; ++i) - { - float k0 = float((15 - i * 2) / 16.0f); - float k1 = 1.0f - k0; - kx[i] = _mm_setr_ps(k0, k1, k0, k1); - ky[i] = _mm_setr_ps(k0, k0, k1, k1); - } - ClearHist(); - } - - ~Buffer() - { - Free(_p); - } - - void ClearHist() - { - memset(hist, 0, hs); - } - - private: - void *_p; - }; - - const __m256i K32_1 = SIMD_MM256_SET1_EPI32(1); - const __m256i K32_9 = SIMD_MM256_SET1_EPI32(9); - const __m256i K32_18 = SIMD_MM256_SET1_EPI32(18); - - template SIMD_INLINE void HogDirectionHistograms(const __m256 & dx, const __m256 & dy, Buffer & buffer, size_t col) - { - __m256 _0 = _mm256_set1_ps(-0.0f); - __m256 adx = _mm256_andnot_ps(_0, dx); - __m256 ady = _mm256_andnot_ps(_0, dy); - __m256 bestDot = _mm256_fmadd_ps(adx, buffer.cos[0], _mm256_mul_ps(ady, buffer.sin[0])); - __m256i bestIndex = buffer.pos[0]; - for (int i = 1; i < 5; ++i) - { - __m256 dot = _mm256_fmadd_ps(adx, buffer.cos[i], _mm256_mul_ps(ady, buffer.sin[i])); - __m256 mask = _mm256_cmp_ps(dot, bestDot, _CMP_GT_OS); - bestDot = _mm256_max_ps(dot, bestDot); - bestIndex = _mm256_blendv_epi8(bestIndex, buffer.pos[i], _mm256_castps_si256(mask)); - } - __m256i maskDx = _mm256_castps_si256(_mm256_cmp_ps(dx, _mm256_setzero_ps(), _CMP_LT_OS)); - bestIndex = _mm256_blendv_epi8(bestIndex, _mm256_sub_epi32(K32_9, bestIndex), maskDx); - - __m256i maskDy = _mm256_castps_si256(_mm256_cmp_ps(dy, _mm256_setzero_ps(), _CMP_LT_OS)); - __m256i corr = _mm256_and_si256(_mm256_castps_si256(_mm256_cmp_ps(adx, _mm256_setzero_ps(), _CMP_EQ_OS)), K32_1); - bestIndex = _mm256_blendv_epi8(bestIndex, _mm256_sub_epi32(K32_18, _mm256_add_epi32(bestIndex, corr)), maskDy); - - bestIndex = _mm256_andnot_si256(_mm256_cmpeq_epi32(bestIndex, K32_18), bestIndex); - - Store((__m256i*)(buffer.index + col), bestIndex); - Avx::Store(buffer.value + col, Avx::Sqrt<0>(_mm256_fmadd_ps(adx, adx, _mm256_mul_ps(ady, ady)))); - } - - template SIMD_INLINE __m256 CovertDifference(const __m128i & a, const __m128i & b) - { - return _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(Ssse3::SubUnpackedU8(a, b))); - } - - template SIMD_INLINE void HogDirectionHistograms(const uint8_t * src, size_t stride, Buffer & buffer, size_t col) - { - const uint8_t * s = src + col; - __m128i t = Sse2::Load((__m128i*)(s - stride)); - __m128i l = Sse2::Load((__m128i*)(s - 1)); - __m128i r = Sse2::Load((__m128i*)(s + 1)); - __m128i b = Sse2::Load((__m128i*)(s + stride)); - HogDirectionHistograms(CovertDifference<0>(r, l), CovertDifference<0>(b, t), buffer, col + 0); - HogDirectionHistograms(CovertDifference<1>(r, l), CovertDifference<1>(b, t), buffer, col + 8); - } - - void AddRowToBuffer(const uint8_t * src, size_t stride, Buffer & buffer, size_t row, size_t width, size_t aligned) - { - const uint8_t * s = src + stride * row; - for (size_t col = 1; col < aligned; col += HA) - HogDirectionHistograms(s, stride, buffer, col); - HogDirectionHistograms(s, stride, buffer, width - 1 - HA); - - __m128 ky = buffer.ky[(row + 4) & 7]; - __m128 * hist = buffer.hist; - size_t cellEnd = width / 8; - - for (size_t col = 1; col < 4; ++col) - { - int index = buffer.index[col]; - __m128 value = _mm_set1_ps(buffer.value[col]); - __m128 kx = buffer.kx[(col + 4) & 7]; - hist[index] = 
_mm_fmadd_ps(_mm_mul_ps(ky, kx), value, hist[index]); - } - hist += 18; - - for (size_t cell = 1, col = 4; cell < cellEnd; ++cell) - { - for (size_t i = 0; i < 8; ++i, ++col) - { - int index = buffer.index[col]; - __m128 value = _mm_set1_ps(buffer.value[col]); - __m128 kx = buffer.kx[i]; - hist[index] = _mm_fmadd_ps(_mm_mul_ps(ky, kx), value, hist[index]); - } - hist += 18; - } - - for (size_t col = width - 4; col < width - 1; ++col) - { - int index = buffer.index[col]; - __m128 value = _mm_set1_ps(buffer.value[col]); - __m128 kx = buffer.kx[(col + 4) & 7]; - hist[index] = _mm_fmadd_ps(_mm_mul_ps(ky, kx), value, hist[index]); - } - } - - void AddToHistogram(Buffer & buffer, size_t row, size_t width, size_t height, float * histograms) - { - typedef float f18_t[18]; - - float * src = (float*)buffer.hist; - f18_t * h0 = (f18_t*)histograms + row * width - width - 1; - f18_t * h1 = h0 + width; - - if (row == 0) - { - for (size_t i = 0; i < 18; ++i) - h1[1][i] += src[i * 4 + 3]; - h1++; - src += 72; - for (size_t cell = 1; cell < width; ++cell) - { - for (size_t i = 0; i < 18; ++i) - { - h1[0][i] += src[i * 4 + 2]; - h1[1][i] += src[i * 4 + 3]; - } - h1++; - src += 72; - } - for (size_t i = 0; i < 18; ++i) - h1[0][i] += src[i * 4 + 2]; - } - else if (row == height) - { - for (size_t i = 0; i < 18; ++i) - h0[1][i] += src[i * 4 + 1]; - h0++; - src += 72; - for (size_t cell = 1; cell < width; ++cell) - { - for (size_t i = 0; i < 18; ++i) - { - h0[0][i] += src[i * 4 + 0]; - h0[1][i] += src[i * 4 + 1]; - } - h0++; - src += 72; - } - for (size_t i = 0; i < 18; ++i) - h0[0][i] += src[i * 4 + 0]; - } - else - { - for (size_t i = 0; i < 18; ++i) - { - h0[1][i] += src[i * 4 + 1]; - h1[1][i] += src[i * 4 + 3]; - } - h0++; - h1++; - src += 72; - for (size_t cell = 1; cell < width; ++cell) - { - for (size_t i = 0; i < 16; i += F) - { - const float * s = src + i * 4; - __m256 a0 = Avx::Load(s + 0x00, s + 0x10); - __m256 a1 = Avx::Load(s + 0x04, s + 0x14); - __m256 a2 = Avx::Load(s + 0x08, s + 0x18); - __m256 a3 = Avx::Load(s + 0x0C, s + 0x1C); - __m256 b0 = _mm256_unpacklo_ps(a0, a2); - __m256 b1 = _mm256_unpackhi_ps(a0, a2); - __m256 b2 = _mm256_unpacklo_ps(a1, a3); - __m256 b3 = _mm256_unpackhi_ps(a1, a3); - Avx::Store(h0[0] + i, _mm256_add_ps(Avx::Load(h0[0] + i), _mm256_unpacklo_ps(b0, b2))); - Avx::Store(h0[1] + i, _mm256_add_ps(Avx::Load(h0[1] + i), _mm256_unpackhi_ps(b0, b2))); - Avx::Store(h1[0] + i, _mm256_add_ps(Avx::Load(h1[0] + i), _mm256_unpacklo_ps(b1, b3))); - Avx::Store(h1[1] + i, _mm256_add_ps(Avx::Load(h1[1] + i), _mm256_unpackhi_ps(b1, b3))); - } - for (size_t i = 16; i < 18; ++i) - { - h0[0][i] += src[i * 4 + 0]; - h0[1][i] += src[i * 4 + 1]; - h1[0][i] += src[i * 4 + 2]; - h1[1][i] += src[i * 4 + 3]; - } - h0++; - h1++; - src += 72; - } - for (size_t i = 0; i < 18; ++i) - { - h0[0][i] += src[i * 4 + 0]; - h1[0][i] += src[i * 4 + 2]; - } - } - buffer.ClearHist(); - } - - void HogDirectionHistograms(const uint8_t * src, size_t stride, size_t width, size_t height, float * histograms) - { - const size_t quantization = 18; - - size_t sizeX = width / 8, sizeY = height / 8; - - memset(histograms, 0, quantization*sizeX*sizeY * sizeof(float)); - - Buffer buffer(width); - - size_t aligned = AlignLo(width - 2, HA) + 1; - - for (size_t row = 1; row < 4; ++row) - AddRowToBuffer(src, stride, buffer, row, width, aligned); - AddToHistogram(buffer, 0, sizeX, sizeY, histograms); - for (size_t row = 4, cell = 1; row < height - 4; ++row) - { - AddRowToBuffer(src, stride, buffer, row, width, aligned); - if 
((row & 7) == 3) - AddToHistogram(buffer, cell++, sizeX, sizeY, histograms); - } - for (size_t row = height - 4; row < height - 1; ++row) - AddRowToBuffer(src, stride, buffer, row, width, aligned); - AddToHistogram(buffer, sizeY, sizeX, sizeY, histograms); - } - } - - void HogDirectionHistograms(const uint8_t * src, size_t stride, size_t width, size_t height, - size_t cellX, size_t cellY, size_t quantization, float * histograms) - { - assert(width%cellX == 0 && height%cellY == 0 && quantization % 2 == 0); - assert(width >= A + 2); - - if (cellX == 8 && cellY == 8 && quantization == 18) - Custom_8x8_18::HogDirectionHistograms(src, stride, width, height, histograms); - else - { - memset(histograms, 0, quantization*(width / cellX)*(height / cellY) * sizeof(float)); - - Buffer buffer(width, quantization); - - size_t alignedWidth = AlignLo(width - 2, A) + 1; - - for (size_t row = 1; row < height - 1; ++row) - { - const uint8_t * s = src + stride * row; - for (size_t col = 1; col < alignedWidth; col += A) - HogDirectionHistograms(s, stride, buffer, col); - HogDirectionHistograms(s, stride, buffer, width - 1 - A); - Base::AddRowToHistograms(buffer.index, buffer.value, row, width, height, cellX, cellY, quantization, histograms); - } - } - } - - class HogFeatureExtractor - { - static const size_t C = 8; - static const size_t Q = 9; - static const size_t Q2 = 18; - - typedef Array Array32i; - typedef Array Array32f; - - size_t _sx, _sy, _hs; - - __m256i _pos[5]; - __m256 _cos[5], _sin[5]; - __m128 _kx[8], _ky[8]; - __m256i _Q, _Q2; - - Array32i _index; - Array32f _value; - Array32f _buffer; - Array32f _histogram; - Array32f _norm; - - void Init(size_t w, size_t h) - { - _sx = w / C; - _hs = _sx + 2; - _sy = h / C; - for (int i = 0; i < 5; ++i) - { - _cos[i] = _mm256_set1_ps((float)::cos(i*M_PI / Q)); - _sin[i] = _mm256_set1_ps((float)::sin(i*M_PI / Q)); - _pos[i] = _mm256_set1_epi32(i); - } - for (int i = 0; i < C; ++i) - { - float k0 = float((15 - i * 2) / 16.0f); - float k1 = 1.0f - k0; - _kx[i] = _mm_setr_ps(k0, k1, k0, k1); - _ky[i] = _mm_setr_ps(k0, k0, k1, k1); - } - _Q = _mm256_set1_epi32(Q); - _Q2 = _mm256_set1_epi32(Q2); - - _index.Resize(w); - _value.Resize(w); - _buffer.Resize((_sx + 1) * 4 * Q2); - _histogram.Resize((_sx + 2)*(_sy + 2)*Q2); - _norm.Resize((_sx + 2)*(_sy + 2)); - } - - template SIMD_INLINE void GetHistogram(const __m256 & dx, const __m256 & dy, size_t col) - { - __m256 _0 = _mm256_set1_ps(-0.0f); - __m256 adx = _mm256_andnot_ps(_0, dx); - __m256 ady = _mm256_andnot_ps(_0, dy); - __m256 bestDot = _mm256_fmadd_ps(adx, _cos[0], _mm256_mul_ps(ady, _sin[0])); - __m256i bestIndex = _pos[0]; - for (int i = 1; i < 5; ++i) - { - __m256 dot = _mm256_fmadd_ps(adx, _cos[i], _mm256_mul_ps(ady, _sin[i])); - __m256 mask = _mm256_cmp_ps(dot, bestDot, _CMP_GT_OS); - bestDot = _mm256_max_ps(dot, bestDot); - bestIndex = _mm256_blendv_epi8(bestIndex, _pos[i], _mm256_castps_si256(mask)); - } - __m256i maskDx = _mm256_castps_si256(_mm256_cmp_ps(dx, _mm256_setzero_ps(), _CMP_LT_OS)); - bestIndex = _mm256_blendv_epi8(bestIndex, _mm256_sub_epi32(_Q, bestIndex), maskDx); - - __m256i maskDy = _mm256_castps_si256(_mm256_cmp_ps(dy, _mm256_setzero_ps(), _CMP_LT_OS)); - __m256i corr = _mm256_and_si256(_mm256_castps_si256(_mm256_cmp_ps(adx, _mm256_setzero_ps(), _CMP_EQ_OS)), K32_00000001); - bestIndex = _mm256_blendv_epi8(bestIndex, _mm256_sub_epi32(_Q2, _mm256_add_epi32(bestIndex, corr)), maskDy); - - bestIndex = _mm256_andnot_si256(_mm256_cmpeq_epi32(bestIndex, _Q2), bestIndex); - - 
Store((__m256i*)(_index.data + col), bestIndex); - Avx::Store(_value.data + col, Avx::Sqrt<0>(_mm256_fmadd_ps(adx, adx, _mm256_mul_ps(ady, ady)))); - } - - template SIMD_INLINE __m256 ConvertDifference(const __m128i & a, const __m128i & b) - { - return _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(Ssse3::SubUnpackedU8(a, b))); - } - - template SIMD_INLINE void GetHistogram(const uint8_t * src, size_t stride, size_t col) - { - const uint8_t * s = src + col; - __m128i t = Sse2::Load((__m128i*)(s - stride)); - __m128i l = Sse2::Load((__m128i*)(s - 1)); - __m128i r = Sse2::Load((__m128i*)(s + 1)); - __m128i b = Sse2::Load((__m128i*)(s + stride)); - GetHistogram(ConvertDifference<0>(r, l), ConvertDifference<0>(b, t), col + 0); - GetHistogram(ConvertDifference<1>(r, l), ConvertDifference<1>(b, t), col + 8); - } - - void AddRowToBuffer(const uint8_t * src, size_t stride, size_t row, size_t width, size_t aligned) - { - const uint8_t * s = src + stride * row; - GetHistogram(s, stride, 1); - for (size_t col = HA; col < aligned; col += HA) - GetHistogram(s, stride, col); - GetHistogram(s, stride, width - 1 - HA); - - __m128 ky = _ky[(row + 4) & 7]; - __m128 * buffer = (__m128*)_buffer.data; - for (size_t col = 1, n = C, i = 5; col < width - 1; i = 0, n = Simd::Min(C, width - col - 1)) - { - for (; i < n; ++i, ++col) - { - int index = _index[col]; - __m128 value = _mm_set1_ps(_value[col]); - buffer[index] = _mm_fmadd_ps(_mm_mul_ps(ky, _kx[i]), value, buffer[index]); - } - buffer += Q2; - } - } - - void AddToHistogram(size_t row, size_t width, size_t height) - { - typedef float f18_t[18]; - - float * src = _buffer.data; - f18_t * h0 = (f18_t*)_histogram.data + row * _hs; - f18_t * h1 = h0 + _hs; - - for (size_t cell = 0; cell <= width; ++cell) - { - for (size_t i = 0; i < 16; i += F) - { - const float * s = src + i * 4; - __m256 a0 = Avx::Load(s + 0x00, s + 0x10); - __m256 a1 = Avx::Load(s + 0x04, s + 0x14); - __m256 a2 = Avx::Load(s + 0x08, s + 0x18); - __m256 a3 = Avx::Load(s + 0x0C, s + 0x1C); - __m256 b0 = _mm256_unpacklo_ps(a0, a2); - __m256 b1 = _mm256_unpackhi_ps(a0, a2); - __m256 b2 = _mm256_unpacklo_ps(a1, a3); - __m256 b3 = _mm256_unpackhi_ps(a1, a3); - Avx::Store(h0[0] + i, _mm256_add_ps(Avx::Load(h0[0] + i), _mm256_unpacklo_ps(b0, b2))); - Avx::Store(h0[1] + i, _mm256_add_ps(Avx::Load(h0[1] + i), _mm256_unpackhi_ps(b0, b2))); - Avx::Store(h1[0] + i, _mm256_add_ps(Avx::Load(h1[0] + i), _mm256_unpacklo_ps(b1, b3))); - Avx::Store(h1[1] + i, _mm256_add_ps(Avx::Load(h1[1] + i), _mm256_unpackhi_ps(b1, b3))); - } - __m128 * ps = (__m128*)src; - __m128 s0 = _mm_add_ps(_mm_unpacklo_ps(ps[16], ps[17]), Sse::Load(h0[0] + 16, h0[1] + 16)); - __m128 s1 = _mm_add_ps(_mm_unpackhi_ps(ps[16], ps[17]), Sse::Load(h1[0] + 16, h1[1] + 16)); - Sse::StoreHalf<0>(h0[0] + 16, s0); - Sse::StoreHalf<1>(h0[1] + 16, s0); - Sse::StoreHalf<0>(h1[0] + 16, s1); - Sse::StoreHalf<1>(h1[1] + 16, s1); - h0++; - h1++; - src += 72; - } - _buffer.Clear(); - } - - void EstimateHistogram(const uint8_t * src, size_t stride, size_t width, size_t height) - { - _histogram.Clear(); - - size_t aligned = AlignHi(width - 1, HA) - HA; - - _buffer.Clear(); - for (size_t row = 1; row < 4; ++row) - AddRowToBuffer(src, stride, row, width, aligned); - AddToHistogram(0, _sx, _sy); - for (size_t row = 4, cell = 1; row < height - 4; ++row) - { - AddRowToBuffer(src, stride, row, width, aligned); - if ((row & 7) == 3) - AddToHistogram(cell++, _sx, _sy); - } - for (size_t row = height - 4; row < height - 1; ++row) - AddRowToBuffer(src, stride, row, width, 
aligned); - AddToHistogram(_sy, _sx, _sy); - } - - SIMD_INLINE float GetNorm(const float * src) - { - __m256 norm = _mm256_add_ps(_mm256_loadu_ps(src), _mm256_loadu_ps(src + Q)); - norm = _mm256_mul_ps(norm, norm); - norm = _mm256_hadd_ps(norm, norm); - norm = _mm256_hadd_ps(norm, norm); - float buf[8]; - _mm256_storeu_ps(buf, norm); - return buf[0] + buf[4] + Simd::Square(src[Q - 1] + src[Q2 - 1]); - } - - void EstimateNorm() - { - _norm.Clear(); - for (size_t y = 0, i = 0; y < _sy; y++) - { - const float * h = _histogram.data + ((y + 1)*_hs + 1)*Q2; - float * n = _norm.data + (y + 1)*_hs + 1; - for (size_t x = 0; x < _sx; x++, i++) - n[x] = GetNorm(h + x * Q2); - } - } - - void ExtractFeatures(float * features) - { - __m128 _02 = _mm_set1_ps(0.2f); - __m128 _05 = _mm_set1_ps(0.5f); - __m128 _02357 = _mm_set1_ps(0.2357f); - __m128 eps = _mm_set1_ps(0.0001f); - for (size_t y = 0; y < _sy; y++) - { - float * ph = _histogram.data + ((y + 1)*_hs + 1)*Q2; - for (size_t x = 0; x < _sx; x++) - { - float * dst = features + (y*_sx + x) * 31; - - float * p0 = _norm.data + y * _hs + x; - float * p1 = p0 + _hs; - float * p2 = p1 + _hs; - - __m128 n = _mm_setr_ps( - p1[1] + p1[2] + p2[1] + p2[2], - p0[1] + p0[2] + p1[1] + p1[2], - p1[0] + p1[1] + p2[0] + p2[1], - p0[0] + p0[1] + p1[0] + p1[1]); - - n = _mm_rsqrt_ps(_mm_add_ps(n, eps)); - - __m128 t = _mm_setzero_ps(); - - float * src = ph + x * Q2; - for (int o = 0; o < 16; o += 4) - { - __m128 s = _mm_loadu_ps(src); - __m128 h0 = _mm_min_ps(_mm_mul_ps(Sse2::Broadcast<0>(s), n), _02); - __m128 h1 = _mm_min_ps(_mm_mul_ps(Sse2::Broadcast<1>(s), n), _02); - __m128 h2 = _mm_min_ps(_mm_mul_ps(Sse2::Broadcast<2>(s), n), _02); - __m128 h3 = _mm_min_ps(_mm_mul_ps(Sse2::Broadcast<3>(s), n), _02); - t = _mm_add_ps(t, _mm_add_ps(_mm_add_ps(h0, h1), _mm_add_ps(h2, h3))); - _mm_storeu_ps(dst, _mm_mul_ps(_05, _mm_hadd_ps(_mm_hadd_ps(h0, h1), _mm_hadd_ps(h2, h3)))); - dst += 4; - src += 4; - } - { - __m128 h0 = _mm_min_ps(_mm_mul_ps(_mm_set1_ps(*src++), n), _02); - __m128 h1 = _mm_min_ps(_mm_mul_ps(_mm_set1_ps(*src++), n), _02); - t = _mm_add_ps(t, _mm_add_ps(h0, h1)); - __m128 h = _mm_hadd_ps(h0, h1); - _mm_storeu_ps(dst, _mm_mul_ps(_05, _mm_hadd_ps(h, h))); - dst += 2; - } - - src = ph + x * Q2; - for (int o = 0; o < 8; o += 4) - { - __m128 s = _mm_add_ps(_mm_loadu_ps(src), _mm_loadu_ps(src + Q)); - __m128 h0 = _mm_min_ps(_mm_mul_ps(Sse2::Broadcast<0>(s), n), _02); - __m128 h1 = _mm_min_ps(_mm_mul_ps(Sse2::Broadcast<1>(s), n), _02); - __m128 h2 = _mm_min_ps(_mm_mul_ps(Sse2::Broadcast<2>(s), n), _02); - __m128 h3 = _mm_min_ps(_mm_mul_ps(Sse2::Broadcast<3>(s), n), _02); - _mm_storeu_ps(dst, _mm_mul_ps(_05, _mm_hadd_ps(_mm_hadd_ps(h0, h1), _mm_hadd_ps(h2, h3)))); - dst += 4; - src += 4; - } - { - __m128 s = _mm_set1_ps(src[0] + src[Q]); - __m128 h = _mm_min_ps(_mm_mul_ps(s, n), _02); - h = _mm_dp_ps(_05, h, 0xF1); - _mm_store_ss(dst++, h); - } - _mm_storeu_ps(dst, _mm_mul_ps(t, _02357)); - } - } - } - - public: - - void Run(const uint8_t * src, size_t stride, size_t width, size_t height, float * features) - { - Init(width, height); - - EstimateHistogram(src, stride, width, height); - - EstimateNorm(); - - ExtractFeatures(features); - } - }; - - void HogExtractFeatures(const uint8_t * src, size_t stride, size_t width, size_t height, float * features) - { - assert(width % 8 == 0 && height % 8 == 0 && width >= 16 && height >= 16); - assert(width >= HA + 2); - - HogFeatureExtractor extractor; - extractor.Run(src, stride, width, height, features); - } - - SIMD_INLINE void 
HogDeinterleave(const float * src, size_t count, float ** dst, size_t offset, size_t i) - { - src += i; - __m256 a0 = Avx::Load(src + 0 * count, src + 4 * count); - __m256 a1 = Avx::Load(src + 1 * count, src + 5 * count); - __m256 a2 = Avx::Load(src + 2 * count, src + 6 * count); - __m256 a3 = Avx::Load(src + 3 * count, src + 7 * count); - __m256 b0 = _mm256_unpacklo_ps(a0, a2); - __m256 b1 = _mm256_unpackhi_ps(a0, a2); - __m256 b2 = _mm256_unpacklo_ps(a1, a3); - __m256 b3 = _mm256_unpackhi_ps(a1, a3); - Avx::Store(dst[i + 0] + offset, _mm256_unpacklo_ps(b0, b2)); - Avx::Store(dst[i + 1] + offset, _mm256_unpackhi_ps(b0, b2)); - Avx::Store(dst[i + 2] + offset, _mm256_unpacklo_ps(b1, b3)); - Avx::Store(dst[i + 3] + offset, _mm256_unpackhi_ps(b1, b3)); - } - - void HogDeinterleave(const float * src, size_t srcStride, size_t width, size_t height, size_t count, float ** dst, size_t dstStride) - { - assert(width >= F && count >= Sse::F); - - size_t alignedCount = AlignLo(count, Sse::F); - size_t alignedWidth = AlignLo(width, F); - - for (size_t row = 0; row < height; ++row) - { - size_t rowOffset = row * dstStride; - for (size_t col = 0; col < alignedWidth; col += F) - { - const float * s = src + count * col; - size_t offset = rowOffset + col; - for (size_t i = 0; i < alignedCount; i += Sse::F) - HogDeinterleave(s, count, dst, offset, i); - if (alignedCount != count) - HogDeinterleave(s, count, dst, offset, count - Sse::F); - } - if (alignedWidth != width) - { - size_t col = width - F; - const float * s = src + count * col; - size_t offset = rowOffset + col; - for (size_t i = 0; i < alignedCount; i += Sse::F) - HogDeinterleave(s, count, dst, offset, i); - if (alignedCount != count) - HogDeinterleave(s, count, dst, offset, count - Sse::F); - } - src += srcStride; - } - } - - namespace HogSeparableFilter_Detail - { - template SIMD_INLINE void Set(float * dst, const __m256 & value, const __m256 & mask) - { - Avx::Store(dst, value); - } - - template <> SIMD_INLINE void Set<1, false>(float * dst, const __m256 & value, const __m256 & mask) - { - Avx::Store(dst, _mm256_add_ps(Avx::Load(dst), value)); - } - - template <> SIMD_INLINE void Set<1, true>(float * dst, const __m256 & value, const __m256 & mask) - { - Avx::Store(dst, _mm256_add_ps(Avx::Load(dst), _mm256_and_ps(value, mask))); - } - } - - class HogSeparableFilter - { - size_t _w, _h, _s; - Array32f _buffer; - Array256f _filter; - - void Init(size_t w, size_t h, size_t rs, size_t cs) - { - _w = w - rs + 1; - _s = AlignHi(_w, F); - _h = h - cs + 1; - _buffer.Resize(_s*h); - } - - template SIMD_INLINE void FilterRows(const float * src, const __m256 * filter, size_t size, float * dst) - { - __m256 sum = _mm256_setzero_ps(); - for (size_t i = 0; i < size; ++i) - sum = _mm256_fmadd_ps(Avx::Load(src + i), filter[i], sum); - Avx::Store(dst, sum); - } - - void FilterRows(const float * src, size_t srcStride, size_t width, size_t height, const float * filter, size_t size, float * dst, size_t dstStride) - { - _filter.Resize(size); - for (size_t i = 0; i < size; ++i) - _filter[i] = _mm256_set1_ps(filter[i]); - - size_t alignedWidth = AlignLo(width, F); - - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += F) - FilterRows(src + col, _filter.data, size, dst + col); - if (alignedWidth != width) - FilterRows(src + width - F, _filter.data, size, dst + width - F); - src += srcStride; - dst += dstStride; - } - } - - template SIMD_INLINE void FilterRows_10(const float * src, const __m256 * filter, float * dst) - { - 
__m256 src0 = Avx::Load(src + 0); - __m256 src4 = Avx::Load(src + 4); - __m256 src8 = Avx::Load(src + 8); - __m256 sum0 = _mm256_mul_ps(src0, filter[0]); - __m256 sum1 = _mm256_mul_ps(Alignr<1>(src0, src4), filter[1]); - sum0 = _mm256_fmadd_ps(Alignr<2>(src0, src4), filter[2], sum0); - sum1 = _mm256_fmadd_ps(Alignr<3>(src0, src4), filter[3], sum1); - sum0 = _mm256_fmadd_ps(src4, filter[4], sum0); - sum1 = _mm256_fmadd_ps(Alignr<1>(src4, src8), filter[5], sum1); - sum0 = _mm256_fmadd_ps(Alignr<2>(src4, src8), filter[6], sum0); - sum1 = _mm256_fmadd_ps(Alignr<3>(src4, src8), filter[7], sum1); - sum0 = _mm256_fmadd_ps(src8, filter[8], sum0); - sum1 = _mm256_fmadd_ps(Avx::Load(src + 9), filter[9], sum1); - Avx::Store(dst, _mm256_add_ps(sum0, sum1)); - } - - void FilterRows_10(const float * src, size_t srcStride, size_t width, size_t height, const float * filter, float * dst, size_t dstStride) - { - __m256 _filter[10]; - for (size_t i = 0; i < 10; ++i) - _filter[i] = _mm256_set1_ps(filter[i]); - - size_t alignedWidth = AlignLo(width, F); - - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += F) - FilterRows_10(src + col, _filter, dst + col); - if (alignedWidth != width) - FilterRows_10(src + width - F, _filter, dst + width - F); - src += srcStride; - dst += dstStride; - } - } - - template SIMD_INLINE void FilterCols(const float * src, size_t stride, const __m256 * filter, size_t size, float * dst, const __m256 & mask) - { - __m256 sum = _mm256_setzero_ps(); - for (size_t i = 0; i < size; ++i, src += stride) - sum = _mm256_fmadd_ps(Avx::Load(src), filter[i], sum); - HogSeparableFilter_Detail::Set(dst, sum, mask); - } - - template SIMD_INLINE void FilterCols4x(const float * src, size_t stride, const __m256 * filter, size_t size, float * dst, const __m256 & mask) - { - __m256 sums[4] = { _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps() }; - for (size_t i = 0; i < size; ++i, src += stride) - { - __m256 f = filter[i]; - sums[0] = _mm256_fmadd_ps(Avx::Load(src + 0 * F), f, sums[0]); - sums[1] = _mm256_fmadd_ps(Avx::Load(src + 1 * F), f, sums[1]); - sums[2] = _mm256_fmadd_ps(Avx::Load(src + 2 * F), f, sums[2]); - sums[3] = _mm256_fmadd_ps(Avx::Load(src + 3 * F), f, sums[3]); - } - HogSeparableFilter_Detail::Set(dst + 0 * F, sums[0], mask); - HogSeparableFilter_Detail::Set(dst + 1 * F, sums[1], mask); - HogSeparableFilter_Detail::Set(dst + 2 * F, sums[2], mask); - HogSeparableFilter_Detail::Set(dst + 3 * F, sums[3], mask); - } - - template void FilterCols(const float * src, size_t srcStride, size_t width, size_t height, const float * filter, size_t size, float * dst, size_t dstStride) - { - _filter.Resize(size); - for (size_t i = 0; i < size; ++i) - _filter[i] = _mm256_set1_ps(filter[i]); - - size_t fullAlignedWidth = AlignLo(width, QF); - size_t partialAlignedWidth = AlignLo(width, F); - __m256 tailMask = RightNotZero32f(width - partialAlignedWidth); - - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < fullAlignedWidth; col += QF) - FilterCols4x(src + col, srcStride, _filter.data, size, dst + col, tailMask); - for (; col < partialAlignedWidth; col += F) - FilterCols(src + col, srcStride, _filter.data, size, dst + col, tailMask); - if (partialAlignedWidth != width) - FilterCols(src + width - F, srcStride, _filter.data, size, dst + width - F, tailMask); - src += srcStride; - dst += dstStride; - } - } - - public: - - void Run(const float * src, size_t srcStride, size_t width, size_t height, - 
const float * rowFilter, size_t rowSize, const float * colFilter, size_t colSize, float * dst, size_t dstStride, int add) - { - Init(width, height, rowSize, colSize); - - if (colSize == 10) - FilterRows_10(src, srcStride, _w, height, rowFilter, _buffer.data, _s); - else - FilterRows(src, srcStride, _w, height, rowFilter, rowSize, _buffer.data, _s); - - if (add) - FilterCols<1>(_buffer.data, _s, _w, _h, colFilter, colSize, dst, dstStride); - else - FilterCols<0>(_buffer.data, _s, _w, _h, colFilter, colSize, dst, dstStride); - } - }; - - void HogFilterSeparable(const float * src, size_t srcStride, size_t width, size_t height, - const float * rowFilter, size_t rowSize, const float * colFilter, size_t colSize, float * dst, size_t dstStride, int add) - { - assert(width >= F + rowSize - 1 && height >= colSize - 1); - - HogSeparableFilter filter; - filter.Run(src, srcStride, width, height, rowFilter, rowSize, colFilter, colSize, dst, dstStride, add); - } - } -#endif -} diff --git a/src/3rd/Simd/Simd/SimdAvx2HogLite.cpp b/src/3rd/Simd/Simd/SimdAvx2HogLite.cpp deleted file mode 100644 index 44e7630a..00000000 --- a/src/3rd/Simd/Simd/SimdAvx2HogLite.cpp +++ /dev/null @@ -1,1208 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdStore.h" -#include "Simd/SimdBase.h" -#include "Simd/SimdCompare.h" -#include "Simd/SimdArray.h" -#include "Simd/SimdExtract.h" -#include "Simd/SimdUpdate.h" - -namespace Simd -{ -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - const __m256i K8_KX4 = SIMD_MM256_SETR_EPI8( - 1, 3, 5, 7, 7, 5, 3, 1, 1, 3, 5, 7, 7, 5, 3, 1, - 1, 3, 5, 7, 7, 5, 3, 1, 1, 3, 5, 7, 7, 5, 3, 1); - const __m256i K8_KX8 = SIMD_MM256_SETR_EPI8( - 1, 3, 5, 7, 9, 11, 13, 15, 15, 13, 11, 9, 7, 5, 3, 1, - 1, 3, 5, 7, 9, 11, 13, 15, 15, 13, 11, 9, 7, 5, 3, 1); - - const __m256i K32_PERMUTE_BN_0 = SIMD_MM256_SETR_EPI32(1, 0, 3, 2, 2, 1, 4, 3); - const __m256i K32_PERMUTE_BN_1 = SIMD_MM256_SETR_EPI32(5, 4, 3, 2, 6, 5, 4, 3); - - template class HogLiteFeatureExtractor - { - static const size_t FQ = 8; - static const size_t HQ = FQ / 2; - static const size_t DQ = FQ * 2; - static const size_t QQ = FQ * 4; - - typedef Array Bytes; - typedef Array Ints; - typedef Array Floats; - - size_t _hx, _fx, _w, _aw; - Bytes _value, _index; - Ints _hi[2]; - Floats _hf[2], _nf[4], _nb; - int _k0[cell], _k1[cell]; - __m256 _k, _02, _05, _02357, _eps; - - SIMD_INLINE void Init(size_t width) - { - _w = (width / cell - 1)*cell; - _aw = AlignLo(_w, A); - _hx = width / cell; - _fx = _hx - 2; - _value.Resize(_aw + 3 * A, true); - _index.Resize(_aw + 3 * A, true); - for (size_t i = 0; i < cell; ++i) - { - _k0[i] = int(cell - i - 1) * 2 + 1; - _k1[i] = int(i) * 2 + 1; - } - for (size_t i = 0; i < 2; ++i) - { - _hi[i].Resize((_hx + 8)*FQ, true); - _hf[i].Resize(_hx*FQ); - } - for (size_t i = 0; i < 4; ++i) - _nf[i].Resize(_hx + F); - _nb.Resize((_hx + 6) * 4); - _k = _mm256_set1_ps(1.0f / Simd::Square(cell * 2)); - _02 = _mm256_set1_ps(0.2f); - _05 = _mm256_set1_ps(0.5f); - _02357 = _mm256_set1_ps(0.2357f); - _eps = _mm256_set1_ps(0.0001f); - } - - template static SIMD_INLINE void SetIndexAndValue(const uint8_t * src, size_t stride, uint8_t * value, uint8_t * index) - { - __m256i y0 = Load((__m256i*)(src - stride)); - __m256i y1 = Load((__m256i*)(src + stride)); - __m256i x0 = Load((__m256i*)(src - 1)); - __m256i x1 = Load((__m256i*)(src + 1)); - - __m256i ady = AbsDifferenceU8(y0, y1); - __m256i adx = AbsDifferenceU8(x0, x1); - - __m256i max = _mm256_max_epu8(ady, adx); - __m256i min = _mm256_min_epu8(ady, adx); - __m256i val = _mm256_adds_epu8(max, _mm256_avg_epu8(min, K_ZERO)); - Store((__m256i*)value, val); - - __m256i idx = _mm256_blendv_epi8(K8_01, K_ZERO, Compare8u(adx, ady)); - idx = _mm256_blendv_epi8(_mm256_sub_epi8(K8_03, idx), idx, Compare8u(x1, x0)); - idx = _mm256_blendv_epi8(_mm256_sub_epi8(K8_07, idx), idx, Compare8u(y1, y0)); - Store((__m256i*)index, idx); - } - - SIMD_INLINE void SetIndexAndValue(const uint8_t * src, size_t stride) - { - uint8_t * value = _value.data + A; - uint8_t * index = _index.data + A; - for (size_t col = 0; col < _aw; col += A) - SetIndexAndValue(src + col, stride, value + col, index + col); - if (_aw < _w) - { - size_t col = _w - A; - SetIndexAndValue(src + col, stride, value + col, index + col); - } - } - - static SIMD_INLINE void UpdateIntegerHistogram4x4(uint8_t * value, uint8_t * index, const __m256i & ky0, const __m256i & ky1, int * h0, int * h1) - { - __m256i val = Load((__m256i*)value); - __m256i idx = Load((__m256i*)index); - __m256i cur0 = K_ZERO; - __m256i cur1 = K8_01; - __m256i dirs[4]; - for (size_t i = 0; i < 4; ++i) - { - __m256i dir0 = _mm256_maddubs_epi16(_mm256_and_si256(_mm256_cmpeq_epi8(idx, cur0), val), K8_KX4); - __m256i dir1 = 
_mm256_maddubs_epi16(_mm256_and_si256(_mm256_cmpeq_epi8(idx, cur1), val), K8_KX4); - dirs[i] = _mm256_hadd_epi16(dir0, dir1); - cur0 = _mm256_add_epi8(cur0, K8_02); - cur1 = _mm256_add_epi8(cur1, K8_02); - } - __m256i hx0 = Shuffle32i<0x88>(dirs[0], dirs[1]); - __m256i hx1 = Shuffle32i<0x88>(dirs[2], dirs[3]); - __m256i hx2 = Shuffle32i<0xDD>(dirs[0], dirs[1]); - __m256i hx3 = Shuffle32i<0xDD>(dirs[2], dirs[3]); - __m256i hx0p = _mm256_permute2x128_si256(hx0, hx1, 0x20); - __m256i hx1p = _mm256_permute2x128_si256(hx0, hx1, 0x31); - __m256i hx2p = _mm256_permute2x128_si256(hx2, hx3, 0x20); - __m256i hx3p = _mm256_permute2x128_si256(hx2, hx3, 0x31); - Store((__m256i*)h0 + 0, _mm256_add_epi32(Load((__m256i*)h0 + 0), _mm256_madd_epi16(hx0p, ky0))); - Store((__m256i*)h0 + 2, _mm256_add_epi32(Load((__m256i*)h0 + 2), _mm256_madd_epi16(hx2p, ky0))); - Store((__m256i*)h0 + 4, _mm256_add_epi32(Load((__m256i*)h0 + 4), _mm256_madd_epi16(hx1p, ky0))); - Store((__m256i*)h0 + 6, _mm256_add_epi32(Load((__m256i*)h0 + 6), _mm256_madd_epi16(hx3p, ky0))); - Store((__m256i*)h1 + 0, _mm256_add_epi32(Load((__m256i*)h1 + 0), _mm256_madd_epi16(hx0p, ky1))); - Store((__m256i*)h1 + 2, _mm256_add_epi32(Load((__m256i*)h1 + 2), _mm256_madd_epi16(hx2p, ky1))); - Store((__m256i*)h1 + 4, _mm256_add_epi32(Load((__m256i*)h1 + 4), _mm256_madd_epi16(hx1p, ky1))); - Store((__m256i*)h1 + 6, _mm256_add_epi32(Load((__m256i*)h1 + 6), _mm256_madd_epi16(hx3p, ky1))); - } - - SIMD_INLINE void UpdateIntegerHistogram4x4(size_t rowI, size_t rowF) - { - int * h0 = _hi[(rowI + 0) & 1].data; - int * h1 = _hi[(rowI + 1) & 1].data; - uint8_t * value = _value.data + A - cell; - uint8_t * index = _index.data + A - cell; - __m256i ky0 = _mm256_set1_epi16((short)_k0[rowF]); - __m256i ky1 = _mm256_set1_epi16((short)_k1[rowF]); - for (size_t col = 0; col <= _w;) - { - UpdateIntegerHistogram4x4(value + col, index + col, ky0, ky1, h0, h1); - col += cell; - h0 += FQ; - h1 += FQ; - UpdateIntegerHistogram4x4(value + col, index + col, ky0, ky1, h0, h1); - col += 7 * cell; - h0 += 7 * FQ; - h1 += 7 * FQ; - } - } - - static SIMD_INLINE void UpdateIntegerHistogram8x8(uint8_t * value, uint8_t * index, const __m256i & ky0, const __m256i & ky1, int * h0, int * h1) - { - __m256i val = Load((__m256i*)value); - __m256i idx = Load((__m256i*)index); - __m256i cur0 = K_ZERO; - __m256i cur1 = K8_01; - __m256i dirs[4]; - for (size_t i = 0; i < 4; ++i) - { - __m256i dir0 = _mm256_maddubs_epi16(_mm256_and_si256(_mm256_cmpeq_epi8(idx, cur0), val), K8_KX8); - __m256i dir1 = _mm256_maddubs_epi16(_mm256_and_si256(_mm256_cmpeq_epi8(idx, cur1), val), K8_KX8); - dirs[i] = _mm256_hadd_epi16(dir0, dir1); - cur0 = _mm256_add_epi8(cur0, K8_02); - cur1 = _mm256_add_epi8(cur1, K8_02); - } - dirs[0] = _mm256_hadd_epi16(dirs[0], dirs[1]); - dirs[1] = _mm256_hadd_epi16(dirs[2], dirs[3]); - __m256i hx0 = _mm256_permute2x128_si256(dirs[0], dirs[1], 0x20); - __m256i hx1 = _mm256_permute2x128_si256(dirs[0], dirs[1], 0x31); - Store((__m256i*)h0 + 0, _mm256_add_epi32(Load((__m256i*)h0 + 0), _mm256_madd_epi16(hx0, ky0))); - Store((__m256i*)h0 + 2, _mm256_add_epi32(Load((__m256i*)h0 + 2), _mm256_madd_epi16(hx1, ky0))); - Store((__m256i*)h1 + 0, _mm256_add_epi32(Load((__m256i*)h1 + 0), _mm256_madd_epi16(hx0, ky1))); - Store((__m256i*)h1 + 2, _mm256_add_epi32(Load((__m256i*)h1 + 2), _mm256_madd_epi16(hx1, ky1))); - } - - SIMD_INLINE void UpdateIntegerHistogram8x8(size_t rowI, size_t rowF) - { - int * h0 = _hi[(rowI + 0) & 1].data; - int * h1 = _hi[(rowI + 1) & 1].data; - uint8_t * value = 
_value.data + A - cell; - uint8_t * index = _index.data + A - cell; - __m256i ky0 = _mm256_set1_epi16((short)_k0[rowF]); - __m256i ky1 = _mm256_set1_epi16((short)_k1[rowF]); - for (size_t col = 0; col <= _w;) - { - UpdateIntegerHistogram8x8(value + col, index + col, ky0, ky1, h0, h1); - col += cell; - h0 += FQ; - h1 += FQ; - UpdateIntegerHistogram8x8(value + col, index + col, ky0, ky1, h0, h1); - col += 3 * cell; - h0 += 3 * FQ; - h1 += 3 * FQ; - } - } - - SIMD_INLINE void UpdateFloatHistogram(size_t rowI) - { - Ints & hi = _hi[rowI & 1]; - Floats & hf = _hf[rowI & 1]; - Floats & nf = _nf[rowI & 3]; - - size_t alignedSize = AlignLo(hf.size, DF), i = 0; - for (; i < alignedSize; i += DF) - { - Avx::Store(hf.data + i + 0, _mm256_mul_ps(_k, _mm256_cvtepi32_ps(Load((__m256i*)(hi.data + i + 0))))); - Avx::Store(hf.data + i + F, _mm256_mul_ps(_k, _mm256_cvtepi32_ps(Load((__m256i*)(hi.data + i + F))))); - } - for (; i < hf.size; i += F) - Avx::Store(hf.data + i, _mm256_mul_ps(_k, _mm256_cvtepi32_ps(Load((__m256i*)(hi.data + i))))); - hi.Clear(); - - const float * h = hf.data; - size_t ahx = AlignLo(_hx, 4), x = 0; - for (; x < ahx; x += 4, h += QQ) - { - __m256 h01 = Load(h + 0 * FQ); - __m256 h23 = Load(h + 1 * FQ); - __m256 h45 = Load(h + 2 * FQ); - __m256 h67 = Load(h + 3 * FQ); - __m256 s01 = _mm256_add_ps(_mm256_permute2f128_ps(h01, h23, 0x20), _mm256_permute2f128_ps(h01, h23, 0x31)); - __m256 n01 = Permute4x64<0x88>(_mm256_dp_ps(s01, s01, 0xF1)); - __m256 s23 = _mm256_add_ps(_mm256_permute2f128_ps(h45, h67, 0x20), _mm256_permute2f128_ps(h45, h67, 0x31)); - __m256 n23 = Permute4x64<0x88>(_mm256_dp_ps(s23, s23, 0xF1)); - _mm_storeu_ps(nf.data + x, _mm_shuffle_ps(_mm256_castps256_ps128(n01), _mm256_castps256_ps128(n23), 0x88)); - } - for (; x < _hx; ++x, h += FQ) - { - __m128 h0 = Sse::Load(h + 00); - __m128 h1 = Sse::Load(h + HQ); - __m128 sum = _mm_add_ps(h0, h1); - _mm_store_ss(nf.data + x, _mm_dp_ps(sum, sum, 0xF1)); - } - } - - SIMD_INLINE void BlockNorm(size_t rowI) - { - const float * src0 = _nf[(rowI - 2) & 3].data; - const float * src1 = _nf[(rowI - 1) & 3].data; - const float * src2 = _nf[(rowI - 0) & 3].data; - float * dst = _nb.data; - for (size_t x = 0; x < _fx; x += 6, dst += 3 * F) - { - __m256 s0 = Avx::Load(src0 + x); - __m256 s1 = Avx::Load(src1 + x); - __m256 s2 = Avx::Load(src2 + x); - __m256 v0 = _mm256_add_ps(s0, s1); - __m256 v1 = _mm256_add_ps(s1, s2); - __m256 h0 = _mm256_add_ps(v0, Alignr<1>(v0, Permute4x64<0xEE>(v0))); - __m256 h1 = _mm256_add_ps(v1, Alignr<1>(v1, Permute4x64<0xEE>(v1))); - __m256 h0p = _mm256_permutevar8x32_ps(h0, K32_PERMUTE_BN_0); - __m256 h1p = _mm256_permutevar8x32_ps(h1, K32_PERMUTE_BN_0); - Avx::Store(dst + 0 * F, _mm256_unpacklo_ps(h1p, h0p)); - Avx::Store(dst + 1 * F, _mm256_unpackhi_ps(h1p, h0p)); - Avx::Store(dst + 2 * F, _mm256_unpacklo_ps(_mm256_permutevar8x32_ps(h1, K32_PERMUTE_BN_1), _mm256_permutevar8x32_ps(h0, K32_PERMUTE_BN_1))); - } - } - - SIMD_INLINE __m256 Features07(const __m256 & n, const __m256 & s, __m256 & t) - { - __m256 h0 = _mm256_min_ps(_mm256_mul_ps(Broadcast<0>(s), n), _02); - __m256 h1 = _mm256_min_ps(_mm256_mul_ps(Broadcast<1>(s), n), _02); - __m256 h2 = _mm256_min_ps(_mm256_mul_ps(Broadcast<2>(s), n), _02); - __m256 h3 = _mm256_min_ps(_mm256_mul_ps(Broadcast<3>(s), n), _02); - t = _mm256_add_ps(t, _mm256_add_ps(_mm256_add_ps(h0, h1), _mm256_add_ps(h2, h3))); - return _mm256_mul_ps(_05, _mm256_hadd_ps(_mm256_hadd_ps(h0, h1), _mm256_hadd_ps(h2, h3))); - } - - SIMD_INLINE __m256 Features8B(const __m256 & n, const 
__m256 & s) - { - __m256 h0 = _mm256_min_ps(_mm256_mul_ps(Broadcast<0>(s), n), _02); - __m256 h1 = _mm256_min_ps(_mm256_mul_ps(Broadcast<1>(s), n), _02); - __m256 h2 = _mm256_min_ps(_mm256_mul_ps(Broadcast<2>(s), n), _02); - __m256 h3 = _mm256_min_ps(_mm256_mul_ps(Broadcast<3>(s), n), _02); - return _mm256_mul_ps(_05, _mm256_hadd_ps(_mm256_hadd_ps(h0, h1), _mm256_hadd_ps(h2, h3))); - } - - SIMD_INLINE void SetFeatures(size_t rowI, float * dst) - { - const float * hf = _hf[(rowI - 1) & 1].data + FQ; - const float * nb = _nb.data; - size_t x = 0, afx = AlignLo(_fx, 2); - for (; x < afx; x += 2, nb += 8, dst += QQ) - { - __m256 n = _mm256_rsqrt_ps(_mm256_add_ps(_mm256_load_ps(nb), _eps)); - __m256 t = _mm256_setzero_ps(); - __m256 f[4]; - const float * src = hf + x * FQ; - __m256 s0 = Avx::Load(src + 0 * HQ, src + 2 * HQ); - __m256 s1 = Avx::Load(src + 1 * HQ, src + 3 * HQ); - f[0] = Features07(n, s0, t); - f[1] = Features07(n, s1, t); - f[2] = Features8B(n, _mm256_add_ps(s0, s1)); - f[3] = _mm256_mul_ps(t, _02357); - Avx::Store(dst + 0 * F, _mm256_permute2f128_ps(f[0], f[1], 0x20)); - Avx::Store(dst + 1 * F, _mm256_permute2f128_ps(f[2], f[3], 0x20)); - Avx::Store(dst + 2 * F, _mm256_permute2f128_ps(f[0], f[1], 0x31)); - Avx::Store(dst + 3 * F, _mm256_permute2f128_ps(f[2], f[3], 0x31)); - } - for (; x < _fx; ++x, nb += 4) - { - __m128 n = _mm_rsqrt_ps(_mm_add_ps(_mm_load_ps(nb), _mm256_castps256_ps128(_eps))); - __m128 t = _mm_setzero_ps(); - const float * src = hf + x * FQ; - for (int o = 0; o < FQ; o += 4) - { - __m128 s = _mm_loadu_ps(src); - __m128 h0 = _mm_min_ps(_mm_mul_ps(Sse2::Broadcast<0>(s), n), _mm256_castps256_ps128(_02)); - __m128 h1 = _mm_min_ps(_mm_mul_ps(Sse2::Broadcast<1>(s), n), _mm256_castps256_ps128(_02)); - __m128 h2 = _mm_min_ps(_mm_mul_ps(Sse2::Broadcast<2>(s), n), _mm256_castps256_ps128(_02)); - __m128 h3 = _mm_min_ps(_mm_mul_ps(Sse2::Broadcast<3>(s), n), _mm256_castps256_ps128(_02)); - t = _mm_add_ps(t, _mm_add_ps(_mm_add_ps(h0, h1), _mm_add_ps(h2, h3))); - _mm_storeu_ps(dst, _mm_mul_ps(_mm256_castps256_ps128(_05), _mm_hadd_ps(_mm_hadd_ps(h0, h1), _mm_hadd_ps(h2, h3)))); - dst += Sse2::F; - src += Sse2::F; - } - src = hf + x * FQ; - __m128 s = _mm_add_ps(_mm_loadu_ps(src), _mm_loadu_ps(src + HQ)); - __m128 h0 = _mm_min_ps(_mm_mul_ps(Sse2::Broadcast<0>(s), n), _mm256_castps256_ps128(_02)); - __m128 h1 = _mm_min_ps(_mm_mul_ps(Sse2::Broadcast<1>(s), n), _mm256_castps256_ps128(_02)); - __m128 h2 = _mm_min_ps(_mm_mul_ps(Sse2::Broadcast<2>(s), n), _mm256_castps256_ps128(_02)); - __m128 h3 = _mm_min_ps(_mm_mul_ps(Sse2::Broadcast<3>(s), n), _mm256_castps256_ps128(_02)); - _mm_storeu_ps(dst, _mm_mul_ps(_mm256_castps256_ps128(_05), _mm_hadd_ps(_mm_hadd_ps(h0, h1), _mm_hadd_ps(h2, h3)))); - dst += 4; - _mm_storeu_ps(dst, _mm_mul_ps(t, _mm256_castps256_ps128(_02357))); - dst += 4; - } - } - - public: - - void Run(const uint8_t * src, size_t srcStride, size_t width, size_t height, float * features, size_t featuresStride) - { - assert(cell == 8 || cell == 4); - assert(width >= cell * 3 && height >= cell * 3); - - Init(width); - - src += (srcStride + 1)*cell / 2; - height = (height / cell - 1)*cell; - - for (size_t row = 0; row < height; ++row) - { - SetIndexAndValue(src, srcStride); - size_t rowI = row / cell; - size_t rowF = row & (cell - 1); - if (cell == 4) - UpdateIntegerHistogram4x4(rowI, rowF); - else - UpdateIntegerHistogram8x8(rowI, rowF); - if (rowF == cell - 1) - { - UpdateFloatHistogram(rowI); - if (rowI >= 2) - { - BlockNorm(rowI); - SetFeatures(rowI, features); - 
features += featuresStride; - } - } - src += srcStride; - } - size_t rowI = height / cell; - UpdateFloatHistogram(rowI); - BlockNorm(rowI); - SetFeatures(rowI, features); - } - }; - - void HogLiteExtractFeatures(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t cell, float * features, size_t featuresStride) - { - if (cell == 4) - { - HogLiteFeatureExtractor<4> extractor; - extractor.Run(src, srcStride, width, height, features, featuresStride); - } - else - { - HogLiteFeatureExtractor<8> extractor; - extractor.Run(src, srcStride, width, height, features, featuresStride); - } - } - - namespace HogLiteFeatureFilterDetail - { - template struct Feature - { - template static SIMD_INLINE void Sum4x4(const float * src, const float * filter, __m256 * sums); - }; - - template <> struct Feature<8> - { - template static SIMD_INLINE void Sum4x4(const float * src, const float * filter, __m256 * sums) - { - __m256 filter0 = Load(filter + 0 * F); - __m256 src0 = Load(src + 0 * F); - __m256 src1 = Load(src + 1 * F); - __m256 src2 = Load(src + 2 * F); - __m256 src3 = Load(src + 3 * F); - sums[0] = _mm256_fmadd_ps(src0, filter0, sums[0]); - sums[1] = _mm256_fmadd_ps(src1, filter0, sums[1]); - sums[2] = _mm256_fmadd_ps(src2, filter0, sums[2]); - sums[3] = _mm256_fmadd_ps(src3, filter0, sums[3]); - __m256 filter1 = Load(filter + 1 * F); - __m256 src4 = Load(src + 4 * F); - sums[0] = _mm256_fmadd_ps(src1, filter1, sums[0]); - sums[1] = _mm256_fmadd_ps(src2, filter1, sums[1]); - sums[2] = _mm256_fmadd_ps(src3, filter1, sums[2]); - sums[3] = _mm256_fmadd_ps(src4, filter1, sums[3]); - __m256 filter2 = Load(filter + 2 * F); - __m256 src5 = Load(src + 5 * F); - sums[0] = _mm256_fmadd_ps(src2, filter2, sums[0]); - sums[1] = _mm256_fmadd_ps(src3, filter2, sums[1]); - sums[2] = _mm256_fmadd_ps(src4, filter2, sums[2]); - sums[3] = _mm256_fmadd_ps(src5, filter2, sums[3]); - __m256 filter3 = Load(filter + 3 * F); - __m256 src6 = Load(src + 6 * F); - sums[0] = _mm256_fmadd_ps(src3, filter3, sums[0]); - sums[1] = _mm256_fmadd_ps(src4, filter3, sums[1]); - sums[2] = _mm256_fmadd_ps(src5, filter3, sums[2]); - sums[3] = _mm256_fmadd_ps(src6, filter3, sums[3]); - } - }; - - template <> struct Feature<16> - { - template static SIMD_INLINE void Sum4x4(const float * src, const float * filter, __m256 * sums) - { - __m256 filter0 = Load(filter + 0 * F); - __m256 src0 = Load(src + 0 * F); - __m256 src2 = Load(src + 2 * F); - __m256 src4 = Load(src + 4 * F); - __m256 src6 = Load(src + 6 * F); - sums[0] = _mm256_fmadd_ps(src0, filter0, sums[0]); - sums[1] = _mm256_fmadd_ps(src2, filter0, sums[1]); - sums[2] = _mm256_fmadd_ps(src4, filter0, sums[2]); - sums[3] = _mm256_fmadd_ps(src6, filter0, sums[3]); - __m256 filter2 = Load(filter + 2 * F); - __m256 src8 = Load(src + 8 * F); - sums[0] = _mm256_fmadd_ps(src2, filter2, sums[0]); - sums[1] = _mm256_fmadd_ps(src4, filter2, sums[1]); - sums[2] = _mm256_fmadd_ps(src6, filter2, sums[2]); - sums[3] = _mm256_fmadd_ps(src8, filter2, sums[3]); - __m256 filter1 = Load(filter + 1 * F); - __m256 src1 = Load(src + 1 * F); - __m256 src3 = Load(src + 3 * F); - __m256 src5 = Load(src + 5 * F); - __m256 src7 = Load(src + 7 * F); - sums[0] = _mm256_fmadd_ps(src1, filter1, sums[0]); - sums[1] = _mm256_fmadd_ps(src3, filter1, sums[1]); - sums[2] = _mm256_fmadd_ps(src5, filter1, sums[2]); - sums[3] = _mm256_fmadd_ps(src7, filter1, sums[3]); - __m256 filter3 = Load(filter + 3 * F); - __m256 src9 = Load(src + 9 * F); - sums[0] = _mm256_fmadd_ps(src3, filter3, sums[0]); - sums[1] = 
_mm256_fmadd_ps(src5, filter3, sums[1]); - sums[2] = _mm256_fmadd_ps(src7, filter3, sums[2]); - sums[3] = _mm256_fmadd_ps(src9, filter3, sums[3]); - } - }; - } - - class HogLiteFeatureFilter - { - template SIMD_INLINE void ProductSum1x1(const float * src, const float * filter, __m256 & sum) - { - __m256 _src = Avx::Load(src); - __m256 _filter = Avx::Load(filter); - sum = _mm256_add_ps(sum, _mm256_mul_ps(_src, _filter)); - } - - template SIMD_INLINE void ProductSum1x4(const float * src, const float * filter, __m256 * sums) - { - __m256 _filter = Avx::Load(filter); - sums[0] = _mm256_fmadd_ps(Avx::Load(src + 0 * step), _filter, sums[0]); - sums[1] = _mm256_fmadd_ps(Avx::Load(src + 1 * step), _filter, sums[1]); - sums[2] = _mm256_fmadd_ps(Avx::Load(src + 2 * step), _filter, sums[2]); - sums[3] = _mm256_fmadd_ps(Avx::Load(src + 3 * step), _filter, sums[3]); - } - - template void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, const float * filter, size_t filterWidth, size_t filterHeight, float * dst, size_t dstStride) - { - size_t filterStride = featureSize * filterWidth; - size_t alignedDstWidth = AlignLo(dstWidth, 4); - size_t alignedFilterStride = AlignLo(filterStride, QF); - for (size_t dstRow = 0; dstRow < dstHeight; ++dstRow) - { - size_t dstCol = 0; - for (; dstCol < alignedDstWidth; dstCol += 4) - { - __m256 sums[4] = { _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps() }; - const float * pSrc = src + dstRow * srcStride + dstCol * featureSize; - const float * pFilter = filter; - for (size_t filterRow = 0; filterRow < filterHeight; ++filterRow) - { - size_t filterCol = 0; - for (; filterCol < alignedFilterStride; filterCol += QF) - HogLiteFeatureFilterDetail::Feature:: template Sum4x4(pSrc + filterCol, pFilter + filterCol, sums); - for (; filterCol < filterStride; filterCol += F) - ProductSum1x4(pSrc + filterCol, pFilter + filterCol, sums); - pSrc += srcStride; - pFilter += filterStride; - } - _mm_storeu_ps(dst + dstCol, Avx::Extract4Sums(sums)); - } - for (; dstCol < dstWidth; ++dstCol) - { - __m256 sum = _mm256_setzero_ps(); - const float * pSrc = src + dstRow * srcStride + dstCol * featureSize; - const float * pFilter = filter; - for (size_t filterRow = 0; filterRow < filterHeight; ++filterRow) - { - for (size_t filterCol = 0; filterCol < filterStride; filterCol += F) - ProductSum1x1(pSrc + filterCol, pFilter + filterCol, sum); - pSrc += srcStride; - pFilter += filterStride; - } - dst[dstCol] = Avx::ExtractSum(sum); - } - dst += dstStride; - } - } - - template void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) - { - size_t filterStride = featureSize * filterWidth; - size_t alignedDstWidth = AlignLo(dstWidth, 4); - size_t alignedFilterStride = AlignLo(filterStride, QF); - __m128 _min = _mm_set1_ps(-FLT_MAX); - for (size_t dstRow = 0; dstRow < dstHeight; ++dstRow) - { - size_t dstCol = 0; - for (; dstCol < alignedDstWidth; dstCol += 4) - { - __m128 _mask = _mm_castsi128_ps(_mm_loadu_si128((__m128i*)(mask + dstCol))); - if (Sse41::TestZ(_mask)) - _mm_storeu_ps(dst + dstCol, _min); - else - { - __m256 sums[4] = { _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps() }; - const float * pSrc = src + dstRow * srcStride + dstCol * featureSize; - const float * pFilter = filter; - for (size_t filterRow = 0; filterRow < filterHeight; 
++filterRow) - { - size_t filterCol = 0; - for (; filterCol < alignedFilterStride; filterCol += QF) - HogLiteFeatureFilterDetail::Feature:: template Sum4x4(pSrc + filterCol, pFilter + filterCol, sums); - for (; filterCol < filterStride; filterCol += F) - ProductSum1x4(pSrc + filterCol, pFilter + filterCol, sums); - pSrc += srcStride; - pFilter += filterStride; - } - _mm_storeu_ps(dst + dstCol, _mm_blendv_ps(_min, Avx::Extract4Sums(sums), _mask)); - } - } - for (; dstCol < dstWidth; ++dstCol) - { - if (mask[dstCol]) - { - __m256 sum = _mm256_setzero_ps(); - const float * pSrc = src + dstRow * srcStride + dstCol * featureSize; - const float * pFilter = filter; - for (size_t filterRow = 0; filterRow < filterHeight; ++filterRow) - { - for (size_t filterCol = 0; filterCol < filterStride; filterCol += F) - ProductSum1x1(pSrc + filterCol, pFilter + filterCol, sum); - pSrc += srcStride; - pFilter += filterStride; - } - dst[dstCol] = Avx::ExtractSum(sum); - } - else - dst[dstCol] = -FLT_MAX; - } - dst += dstStride; - mask += maskStride; - } - } - - template void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, float * dst, size_t dstStride) - { - if (featureSize == 16) - Filter(src, srcStride, dstWidth, dstHeight, filter, filterWidth, filterHeight, dst, dstStride); - else - Filter(src, srcStride, dstWidth, dstHeight, filter, filterWidth, filterHeight, dst, dstStride); - } - - template void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) - { - if (featureSize == 16) - Filter(src, srcStride, dstWidth, dstHeight, filter, filterWidth, filterHeight, mask, maskStride, dst, dstStride); - else - Filter(src, srcStride, dstWidth, dstHeight, filter, filterWidth, filterHeight, mask, maskStride, dst, dstStride); - } - - public: - - void Run(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) - { - assert(featureSize == 8 || featureSize == 16); - assert(srcWidth >= filterWidth && srcHeight >= filterHeight); - - size_t dstWidth = srcWidth - filterWidth + 1; - size_t dstHeight = srcHeight - filterHeight + 1; - - if (mask) - { - if (Aligned(src) && Aligned(srcStride) && Aligned(filter)) - Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterWidth, filterHeight, mask, maskStride, dst, dstStride); - else - Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterWidth, filterHeight, mask, maskStride, dst, dstStride); - } - else - { - if (Aligned(src) && Aligned(srcStride) && Aligned(filter)) - Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterWidth, filterHeight, dst, dstStride); - else - Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterWidth, filterHeight, dst, dstStride); - } - } - }; - - void HogLiteFilterFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) - { - HogLiteFeatureFilter featureFilter; - featureFilter.Run(src, srcStride, srcWidth, srcHeight, featureSize, filter, filterWidth, 
filterHeight, mask, maskStride, dst, dstStride); - } - - namespace HogLiteFeatureResizerDetail - { - template struct Feature - { - template static SIMD_INLINE void Interpolate(const float * src0, const float * src1, const __m256 k[2][2], float * dst); - }; - - template <> struct Feature<8> - { - template static SIMD_INLINE void Interpolate(const float * src0, const float * src1, const __m256 k[2][2], float * dst) - { - Avx::Store(dst + 0 * F, _mm256_add_ps( - _mm256_fmadd_ps(Load(src0 + 0 * F), k[0][0], _mm256_mul_ps(Load(src0 + 1 * F), k[0][1])), - _mm256_fmadd_ps(Load(src1 + 0 * F), k[1][0], _mm256_mul_ps(Load(src1 + 1 * F), k[1][1])))); - } - }; - - template <> struct Feature<16> - { - template static SIMD_INLINE void Interpolate(const float * src0, const float * src1, const __m256 k[2][2], float * dst) - { - Avx::Store(dst + 0 * F, _mm256_add_ps( - _mm256_fmadd_ps(Load(src0 + 0 * F), k[0][0], _mm256_mul_ps(Load(src0 + 2 * F), k[0][1])), - _mm256_fmadd_ps(Load(src1 + 0 * F), k[1][0], _mm256_mul_ps(Load(src1 + 2 * F), k[1][1])))); - Avx::Store(dst + 1 * F, _mm256_add_ps( - _mm256_fmadd_ps(Load(src0 + 1 * F), k[0][0], _mm256_mul_ps(Load(src0 + 3 * F), k[0][1])), - _mm256_fmadd_ps(Load(src1 + 1 * F), k[1][0], _mm256_mul_ps(Load(src1 + 3 * F), k[1][1])))); - } - }; - } - - - class HogLiteFeatureResizer - { - typedef Array Ints; - typedef Array Floats; - - Ints _iy, _ix; - Floats _ky, _kx; - - void InitIndexWeight(size_t srcSize, size_t dstSize, size_t dstStep, Ints & indexes, Floats & weights) - { - indexes.Resize(dstSize); - weights.Resize(dstSize); - - float scale = float(srcSize) / float(dstSize); - for (size_t i = 0; i < dstSize; ++i) - { - float weight = (float)((i + 0.5f)*scale - 0.5f); - int index = (int)::floor(weight); - weight -= index; - if (index < 0) - { - index = 0; - weight = 0.0f; - } - if (index > (int)srcSize - 2) - { - index = (int)srcSize - 2; - weight = 1.0f; - } - indexes[i] = int(index*dstStep); - weights[i] = weight; - } - } - - template void Resize(const float * src, size_t srcStride, float * dst, size_t dstStride, size_t dstWidth, size_t dstHeight) - { - __m256 _1 = _mm256_set1_ps(1.0f); - for (size_t rowDst = 0; rowDst < dstHeight; ++rowDst) - { - __m256 ky1 = _mm256_set1_ps(_ky[rowDst]); - __m256 ky0 = _mm256_sub_ps(_1, ky1); - const float * pSrc = src + _iy[rowDst]; - float * pDst = dst + rowDst * dstStride; - for (size_t colDst = 0; colDst < dstWidth; ++colDst, pDst += featureSize) - { - __m256 kx1 = _mm256_set1_ps(_kx[colDst]); - __m256 kx0 = _mm256_sub_ps(_1, kx1); - __m256 k[2][2]; - k[0][0] = _mm256_mul_ps(ky0, kx0); - k[0][1] = _mm256_mul_ps(ky0, kx1); - k[1][0] = _mm256_mul_ps(ky1, kx0); - k[1][1] = _mm256_mul_ps(ky1, kx1); - const float * pSrc0 = pSrc + _ix[colDst]; - const float * pSrc1 = pSrc0 + srcStride; - HogLiteFeatureResizerDetail::Feature:: template Interpolate(pSrc0, pSrc1, k, pDst); - } - } - } - - template void Resize(const float * src, size_t srcStride, size_t featureSize, float * dst, size_t dstStride, size_t dstWidth, size_t dstHeight) - { - if (featureSize == 8) - Resize(src, srcStride, dst, dstStride, dstWidth, dstHeight); - else - Resize(src, srcStride, dst, dstStride, dstWidth, dstHeight); - } - - public: - void Run(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, float * dst, size_t dstStride, size_t dstWidth, size_t dstHeight) - { - assert(featureSize == 8 || featureSize == 16); - - if (srcWidth == dstWidth && srcHeight == dstHeight) - { - size_t size = sizeof(float)*srcWidth*featureSize; - 
for (size_t row = 0; row < dstHeight; ++row) - memcpy(dst + row * dstStride, src + row * srcStride, size); - return; - } - - InitIndexWeight(srcWidth, dstWidth, featureSize, _ix, _kx); - InitIndexWeight(srcHeight, dstHeight, srcStride, _iy, _ky); - - if (Aligned(src) && Aligned(dst)) - Resize(src, srcStride, featureSize, dst, dstStride, dstWidth, dstHeight); - else - Resize(src, srcStride, featureSize, dst, dstStride, dstWidth, dstHeight); - } - }; - - void HogLiteResizeFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, float * dst, size_t dstStride, size_t dstWidth, size_t dstHeight) - { - HogLiteFeatureResizer featureResizer; - featureResizer.Run(src, srcStride, srcWidth, srcHeight, featureSize, dst, dstStride, dstWidth, dstHeight); - } - - template SIMD_INLINE void StoreHorizontalSums(float * ptr, __m256 * sums) - { - __m256 hsum = _mm256_hadd_ps(_mm256_hadd_ps(sums[0], sums[1]), _mm256_hadd_ps(sums[2], sums[3])); - Sse::Store(ptr, _mm_add_ps(_mm256_castps256_ps128(hsum), _mm256_extractf128_ps(hsum, 1))); - } - - template void HogLiteCompressFeatures(const float * src, size_t srcStride, size_t width, size_t height, const float * pca, float * dst, size_t dstStride) - { - if (align) - assert(Aligned(src) && Aligned(pca) && Aligned(dst)); - - size_t alignedWidth = AlignLo(width, 2); - for (size_t row = 0; row < height; ++row) - { - const float * s = src; - float * d = dst; - size_t col = 0; - for (; col < alignedWidth; col += 2) - { - const float * p = pca; - for (size_t i = 0; i < 8; i += 4, p += 64) - { - __m256 sums[8] = { - _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps(), - _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps() }; - for (size_t j = 0; j < 16; j += F) - { - __m256 s0 = Load(s + j + 00); - __m256 s1 = Load(s + j + 16); - __m256 p0 = Load(p + j + 00); - sums[0] = _mm256_fmadd_ps(s0, p0, sums[0]); - sums[4] = _mm256_fmadd_ps(s1, p0, sums[4]); - __m256 p1 = Load(p + j + 16); - sums[1] = _mm256_fmadd_ps(s0, p1, sums[1]); - sums[5] = _mm256_fmadd_ps(s1, p1, sums[5]); - __m256 p2 = Load(p + j + 32); - sums[2] = _mm256_fmadd_ps(s0, p2, sums[2]); - sums[6] = _mm256_fmadd_ps(s1, p2, sums[6]); - __m256 p3 = Load(p + j + 48); - sums[3] = _mm256_fmadd_ps(s0, p3, sums[3]); - sums[7] = _mm256_fmadd_ps(s1, p3, sums[7]); - } - StoreHorizontalSums(d + i + 0, sums + 0); - StoreHorizontalSums(d + i + 8, sums + 4); - } - s += 32; - d += 16; - } - for (; col < width; ++col) - { - const float * p = pca; - for (size_t i = 0; i < 8; i += 4, p += 64) - { - __m256 sums[4] = { _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps() }; - for (size_t j = 0; j < 16; j += F) - { - __m256 _s = Load(s + j); - sums[0] = _mm256_fmadd_ps(_s, Load(p + j + 00), sums[0]); - sums[1] = _mm256_fmadd_ps(_s, Load(p + j + 16), sums[1]); - sums[2] = _mm256_fmadd_ps(_s, Load(p + j + 32), sums[2]); - sums[3] = _mm256_fmadd_ps(_s, Load(p + j + 48), sums[3]); - } - StoreHorizontalSums(d + i, sums); - } - s += 16; - d += 8; - } - src += srcStride; - dst += dstStride; - } - - } - - void HogLiteCompressFeatures(const float * src, size_t srcStride, size_t width, size_t height, const float * pca, float * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(pca) && Aligned(dst)) - HogLiteCompressFeatures(src, srcStride, width, height, pca, dst, dstStride); - else - HogLiteCompressFeatures(src, srcStride, width, height, pca, dst, dstStride); - } - - class HogLiteSeparableFilter - { - 
size_t _dstWidth, _dstHeight, _dstStride; - Array32f _buffer; - Array256f _filter; - - void Init(size_t srcWidth, size_t srcHeight, size_t hSize, size_t vSize) - { - _dstWidth = srcWidth - hSize + 1; - _dstStride = AlignHi(_dstWidth, F); - _dstHeight = srcHeight - vSize + 1; - _buffer.Resize(_dstStride*srcHeight); - } - - template static SIMD_INLINE void FilterHx1(const float * src, const float * filter, __m256 & sum) - { - __m256 _src = Avx::Load(src); - __m256 _filter = Avx::Load(filter); - sum = _mm256_fmadd_ps(_src, _filter, sum); - } - - template static SIMD_INLINE void FilterHx4(const float * src, const float * filter, __m256 * sums) - { - __m256 _filter = Avx::Load(filter); - sums[0] = _mm256_fmadd_ps(Avx::Load(src + 0 * step), _filter, sums[0]); - sums[1] = _mm256_fmadd_ps(Avx::Load(src + 1 * step), _filter, sums[1]); - sums[2] = _mm256_fmadd_ps(Avx::Load(src + 2 * step), _filter, sums[2]); - sums[3] = _mm256_fmadd_ps(Avx::Load(src + 3 * step), _filter, sums[3]); - } - - template void FilterH(const float * src, size_t srcStride, size_t width, size_t height, const float * filter, size_t size, float * dst, size_t dstStride) - { - size_t alignedWidth = AlignLo(width, 4); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < alignedWidth; col += 4) - { - __m256 sums[4] = { _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps() }; - const float * s = src + col * step; - for (size_t i = 0; i < size; i += F) - FilterHx4(s + i, filter + i, sums); - Sse::Store(dst + col, Avx::Extract4Sums(sums)); - } - for (; col < width; ++col) - { - __m256 sum = _mm256_setzero_ps(); - const float * s = src + col * step; - for (size_t i = 0; i < size; i += F) - FilterHx1(s + i, filter + i, sum); - dst[col] = Avx::ExtractSum(sum); - } - src += srcStride; - dst += dstStride; - } - } - - template void FilterH(const float * src, size_t srcStride, size_t width, size_t height, size_t step, const float * filter, size_t size, float * dst, size_t dstStride) - { - if (step == 16) - FilterH(src, srcStride, width, height, filter, size, dst, dstStride); - else - FilterH(src, srcStride, width, height, filter, size, dst, dstStride); - } - - template static SIMD_INLINE void FilterV(const float * src, size_t stride, const __m256 * filter, size_t size, float * dst, const __m256 & mask) - { - __m256 sum = _mm256_setzero_ps(); - for (size_t i = 0; i < size; ++i, src += stride) - sum = _mm256_fmadd_ps(Avx::Load(src), filter[i], sum); - Avx::Update(dst, Masked(sum, mask)); - } - - template void FilterV(const float * src, size_t srcStride, size_t width, size_t height, const float * filter, size_t size, float * dst, size_t dstStride) - { - _filter.Resize(size); - for (size_t i = 0; i < size; ++i) - _filter[i] = _mm256_set1_ps(filter[i]); - - size_t alignedWidth = AlignLo(width, F); - __m256 tailMask = RightNotZero32f(width - alignedWidth); - - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += F) - FilterV(src + col, srcStride, _filter.data, size, dst + col, tailMask); - if (alignedWidth != width) - FilterV(src + width - F, srcStride, _filter.data, size, dst + width - F, tailMask); - src += srcStride; - dst += dstStride; - } - } - - template void FilterV(const float * src, size_t srcStride, size_t width, size_t height, const float * filter, size_t size, float * dst, size_t dstStride) - { - if (Aligned(dst) && Aligned(dstStride)) - FilterV(src, srcStride, width, height, filter, size, dst, dstStride); - else - FilterV(src, srcStride, 
width, height, filter, size, dst, dstStride); - } - - public: - - void Run(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * hFilter, size_t hSize, const float * vFilter, size_t vSize, float * dst, size_t dstStride, int add) - { - assert(featureSize == 8 || featureSize == 16); - assert(srcWidth >= hSize && srcHeight >= vSize); - - Init(srcWidth, srcHeight, hSize, vSize); - - if (Aligned(src) && Aligned(srcStride) && Aligned(hFilter)) - FilterH(src, srcStride, _dstWidth, srcHeight, featureSize, hFilter, hSize*featureSize, _buffer.data, _dstStride); - else - FilterH(src, srcStride, _dstWidth, srcHeight, featureSize, hFilter, hSize*featureSize, _buffer.data, _dstStride); - - if (add) - FilterV(_buffer.data, _dstStride, _dstWidth, _dstHeight, vFilter, vSize, dst, dstStride); - else - FilterV(_buffer.data, _dstStride, _dstWidth, _dstHeight, vFilter, vSize, dst, dstStride); - } - }; - - void HogLiteFilterSeparable(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * hFilter, size_t hSize, const float * vFilter, size_t vSize, float * dst, size_t dstStride, int add) - { - HogLiteSeparableFilter filter; - filter.Run(src, srcStride, srcWidth, srcHeight, featureSize, hFilter, hSize, vFilter, vSize, dst, dstStride, add); - } - - __m256i K32_TAIL_MASK = SIMD_MM256_SETR_EPI32(-1, -1, -1, -1, -1, -1, -1, 0); - - void HogLiteFindMax7x7(const float * a, size_t aStride, const float * b, size_t bStride, size_t height, float * pValue, size_t * pCol, size_t * pRow) - { - __m256 sums[7]; - __m256 min = _mm256_set1_ps(-FLT_MAX); - __m256 max = min; - for (size_t row = 0; row < height; ++row) - { - sums[row] = _mm256_add_ps(Load(a), Load(b)); - max = _mm256_max_ps(max, sums[row]); - a += aStride; - b += bStride; - } - max = _mm256_blendv_ps(min, max, _mm256_castsi256_ps(K32_TAIL_MASK)); - max = _mm256_max_ps(Alignr<1>(max, max), max); - max = _mm256_max_ps(Alignr<2>(max, max), max); - max = _mm256_max_ps(max, _mm256_permute2f128_ps(max, max, 0x01)); - _mm_store_ss(pValue, _mm256_castps256_ps128(max)); - for (size_t row = 0; row < height; ++row) - { - int mask = _mm256_movemask_epi8(_mm256_castps_si256(_mm256_cmp_ps(max, sums[row], _CMP_EQ_UQ))) & 0x0FFFFFFF; - if (mask) - { - *pRow = row; - *pCol = _tzcnt_u32(mask) >> 2; - break; - } - } - } - - const __m256i K8_SUM_SUFFLE = SIMD_MM256_SETR_EPI8( - 0x0, -1, -1, -1, -1, -1, -1, -1, - 0x0, 0x4, -1, -1, -1, -1, -1, -1, - 0x0, 0x4, 0x8, -1, -1, -1, -1, -1, - 0x0, 0x4, 0x8, 0xC, -1, -1, -1, -1); - - const __m256i K32_64_TO_32_1 = SIMD_MM256_SETR_EPI32(0, 2, 4, 6, 1, 3, 5, 7); - - const __m256i K32_64_TO_32_2 = SIMD_MM256_SETR_EPI32(0, 0, 2, 2, 4, 4, 6, 6); - - class HogLiteMaskCreater - { - typedef Simd::Array Ints; - Ints _sums[8]; - size_t _dstWidth, _alignedDstWidth, _dstHeight; - - void Init(size_t srcWidth, size_t srcHeight, size_t scale, size_t size) - { - _dstWidth = srcWidth * scale + size - scale; - _alignedDstWidth = AlignLo(_dstWidth, F); - _dstHeight = srcHeight * scale + size - scale; - size_t sumSize = AlignHi(_dstWidth, F) + F; - for (size_t i = 0; i < 8; ++i) - _sums[i].Resize(sumSize, true); - } - - template SIMD_INLINE void SetDstRow(const uint32_t * sum0, const uint32_t * sum1, uint32_t * dst) - { - size_t dstCol = 0; - for (; dstCol < _alignedDstWidth; dstCol += F) - { - __m256i s00 = Load((__m256i*)(sum0 + dstCol - step)); - __m256i s10 = Load((__m256i*)(sum1 + dstCol - step)); - __m256i s01 = Load((__m256i*)(sum0 + dstCol - 0)); - __m256i 
s11 = Load((__m256i*)(sum1 + dstCol - 0)); - __m256i sum = _mm256_sub_epi32(_mm256_sub_epi32(s11, s10), _mm256_sub_epi32(s01, s00)); - Store((__m256i*)(dst + dstCol), _mm256_cmpgt_epi32(sum, K_ZERO)); - } - for (; dstCol < _dstWidth; ++dstCol) - { - uint32_t sum = sum1[dstCol - 0] - sum1[dstCol - step] - sum0[dstCol - 0] + sum0[dstCol - step]; - dst[dstCol] = sum ? -1 : 0; - } - } - - void CreateMask7x7x1(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, const float * threshold, uint32_t * dst, size_t dstStride) - { - size_t alignedSrcWidth = AlignLo(srcWidth, F); - __m256 _threshold = _mm256_set1_ps(*threshold); - for (size_t row = 0; row < srcHeight; ++row) - { - uint32_t * sum0 = _sums[(row + 0) & 7].data + F; - uint32_t * sum6 = _sums[(row + 6) & 7].data + F; - uint32_t * sum7 = _sums[(row + 7) & 7].data + F; - - __m256i _rowSums = K_ZERO; - size_t col = 0; - for (; col < alignedSrcWidth; col += F) - { - __m256i mask = _mm256_castps_si256(_mm256_cmp_ps(Load(src + col), _threshold, _CMP_GT_OQ)); - - __m256i lo = _mm256_shuffle_epi8(_mm256_permute4x64_epi64(mask, 0x44), K8_SUM_SUFFLE); - _rowSums = _mm256_add_epi32(_rowSums, _mm256_sad_epu8(lo, K_ZERO)); - _mm_storeu_si128((__m128i*)(sum7 + col + 00), _mm_add_epi32(_mm256_castsi256_si128(_mm256_permutevar8x32_epi32(_rowSums, - K32_64_TO_32_1)), _mm_loadu_si128((__m128i*)(sum6 + col + 00)))); - _rowSums = _mm256_permute4x64_epi64(_rowSums, 0xFF); - - __m256i hi = _mm256_shuffle_epi8(_mm256_permute4x64_epi64(mask, 0xEE), K8_SUM_SUFFLE); - _rowSums = _mm256_add_epi32(_rowSums, _mm256_sad_epu8(hi, K_ZERO)); - _mm_storeu_si128((__m128i*)(sum7 + col + HF), _mm_add_epi32(_mm256_castsi256_si128(_mm256_permutevar8x32_epi32(_rowSums, - K32_64_TO_32_1)), _mm_loadu_si128((__m128i*)(sum6 + col + HF)))); - _rowSums = _mm256_permute4x64_epi64(_rowSums, 0xFF); - } - uint32_t rowSum = sum7[col - 1] - sum6[col - 1]; - for (; col < srcWidth; ++col) - { - if (src[col] > *threshold) - rowSum += 0xFF; - sum7[col] = rowSum + sum6[col]; - } - for (; col < _dstWidth; ++col) - sum7[col] = sum7[col - 1]; - - SetDstRow<7>(sum0, sum7, dst); - - src += srcStride; - dst += dstStride; - } - - for (size_t row = srcHeight; row < _dstHeight; ++row) - { - uint32_t * sum0 = _sums[(row + 0) & 7].data + F; - uint32_t * sum7 = _sums[(srcHeight - 1 + 7) & 7].data + F; - SetDstRow<7>(sum0, sum7, dst); - dst += dstStride; - } - } - - void CreateMask7x7x2(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, const float * threshold, uint32_t * dst, size_t dstStride) - { - size_t alignedSrcWidth = AlignLo(srcWidth, F); - __m256 _threshold = _mm256_set1_ps(*threshold); - for (size_t srcRow = 0; srcRow < srcHeight; ++srcRow) - { - uint32_t * sum0 = _sums[(srcRow + 0) & 7].data + F; - uint32_t * sum1 = _sums[(srcRow + 1) & 7].data + F; - uint32_t * sum3 = _sums[(srcRow + 3) & 7].data + F; - uint32_t * sum4 = _sums[(srcRow + 4) & 7].data + F; - - __m256i _rowSums = K_ZERO; - size_t srcCol = 0, dstCol = 0; - for (; srcCol < alignedSrcWidth; srcCol += F, dstCol += DF) - { - __m256i mask = _mm256_castps_si256(_mm256_cmp_ps(Load(src + srcCol), _threshold, _CMP_GT_OQ)); - - __m256i lo = _mm256_shuffle_epi8(_mm256_permute4x64_epi64(mask, 0x44), K8_SUM_SUFFLE); - _rowSums = _mm256_add_epi32(_rowSums, _mm256_sad_epu8(lo, K_ZERO)); - Store((__m256i*)(sum4 + dstCol + 0), _mm256_add_epi32(_mm256_permutevar8x32_epi32(_rowSums, - K32_64_TO_32_2), Load((__m256i*)(sum3 + dstCol + 0)))); - _rowSums = _mm256_permute4x64_epi64(_rowSums, 0xFF); - - __m256i hi = 
_mm256_shuffle_epi8(_mm256_permute4x64_epi64(mask, 0xEE), K8_SUM_SUFFLE); - _rowSums = _mm256_add_epi32(_rowSums, _mm256_sad_epu8(hi, K_ZERO)); - Store((__m256i*)(sum4 + dstCol + F), _mm256_add_epi32(_mm256_permutevar8x32_epi32(_rowSums, - K32_64_TO_32_2), Load((__m256i*)(sum3 + dstCol + F)))); - _rowSums = _mm256_permute4x64_epi64(_rowSums, 0xFF); - } - uint32_t rowSum = sum4[dstCol - 1] - sum3[dstCol - 1]; - for (; srcCol < srcWidth; srcCol += 1, dstCol += 2) - { - if (src[srcCol] > *threshold) - rowSum += 0xFF; - sum4[dstCol + 0] = rowSum + sum3[dstCol + 0]; - sum4[dstCol + 1] = rowSum + sum3[dstCol + 1]; - } - for (; dstCol < _dstWidth; ++dstCol) - sum4[dstCol] = sum4[dstCol - 1]; - - SetDstRow<7>(sum0, sum4, dst); - dst += dstStride; - SetDstRow<7>(sum1, sum4, dst); - dst += dstStride; - src += srcStride; - } - - uint32_t * sum0 = _sums[(srcHeight + 0) & 7].data + F; - uint32_t * sum1 = _sums[(srcHeight + 1) & 7].data + F; - uint32_t * sum2 = _sums[(srcHeight + 2) & 7].data + F; - uint32_t * sum3 = _sums[(srcHeight + 3) & 7].data + F; - SetDstRow<7>(sum0, sum3, dst + 0 * dstStride); - SetDstRow<7>(sum1, sum3, dst + 1 * dstStride); - SetDstRow<7>(sum1, sum3, dst + 2 * dstStride); - SetDstRow<7>(sum2, sum3, dst + 3 * dstStride); - SetDstRow<7>(sum2, sum3, dst + 4 * dstStride); - } - - public: - - void Run(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, const float * threshold, size_t scale, size_t size, uint32_t * dst, size_t dstStride) - { - if (size == 7 && (scale == 1 || scale == 2)) - { - Init(srcWidth, srcHeight, scale, size); - if (scale == 1) - CreateMask7x7x1(src, srcStride, srcWidth, srcHeight, threshold, dst, dstStride); - else - CreateMask7x7x2(src, srcStride, srcWidth, srcHeight, threshold, dst, dstStride); - } - else - Base::HogLiteCreateMask(src, srcStride, srcWidth, srcHeight, threshold, scale, size, dst, dstStride); - } - }; - - void HogLiteCreateMask(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, const float * threshold, size_t scale, size_t size, uint32_t * dst, size_t dstStride) - { - HogLiteMaskCreater maskCreater; - maskCreater.Run(src, srcStride, srcWidth, srcHeight, threshold, scale, size, dst, dstStride); - } - } -#endif// SIMD_AVX2_ENABLE -} - - diff --git a/src/3rd/Simd/Simd/SimdAvx2Int16ToGray.cpp b/src/3rd/Simd/Simd/SimdAvx2Int16ToGray.cpp deleted file mode 100644 index 15385736..00000000 --- a/src/3rd/Simd/Simd/SimdAvx2Int16ToGray.cpp +++ /dev/null @@ -1,66 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#include "Simd/SimdStore.h"
-#include "Simd/SimdMemory.h"
-
-namespace Simd
-{
-#ifdef SIMD_AVX2_ENABLE
-    namespace Avx2
-    {
-        template <bool align> SIMD_INLINE void Int16ToGray(const int16_t * src, uint8_t * dst)
-        {
-            __m256i lo = Load<align>((__m256i*)src + 0);
-            __m256i hi = Load<align>((__m256i*)src + 1);
-            Store<align>((__m256i*)dst, PackI16ToU8(lo, hi));
-        }
-
-        template <bool align> void Int16ToGray(const int16_t * src, size_t width, size_t height, size_t srcStride, uint8_t * dst, size_t dstStride)
-        {
-            assert(width >= A);
-            if (align)
-                assert(Aligned(src) && Aligned(srcStride, HA) && Aligned(dst) && Aligned(dstStride));
-
-            size_t alignedWidth = AlignLo(width, A);
-            for (size_t row = 0; row < height; ++row)
-            {
-                for (size_t col = 0; col < alignedWidth; col += A)
-                    Int16ToGray<align>(src + col, dst + col);
-                if (alignedWidth != width)
-                    Int16ToGray<false>(src + width - A, dst + width - A);
-                src += srcStride;
-                dst += dstStride;
-            }
-        }
-
-        void Int16ToGray(const uint8_t * src, size_t width, size_t height, size_t srcStride, uint8_t * dst, size_t dstStride)
-        {
-            if (Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride))
-                Int16ToGray<true>((const int16_t *)src, width, height, srcStride / sizeof(int16_t), dst, dstStride);
-            else
-                Int16ToGray<false>((const int16_t *)src, width, height, srcStride / sizeof(int16_t), dst, dstStride);
-        }
-    }
-#endif// SIMD_AVX2_ENABLE
-}
diff --git a/src/3rd/Simd/Simd/SimdAvx2Integral.cpp b/src/3rd/Simd/Simd/SimdAvx2Integral.cpp
deleted file mode 100644
index c0dcd9b0..00000000
--- a/src/3rd/Simd/Simd/SimdAvx2Integral.cpp
+++ /dev/null
@@ -1,127 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/ -#include "Simd/SimdInit.h" -#include "Simd/SimdIntegral.h" - -namespace Simd -{ -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - const __m256i K8_SUM_MASK = SIMD_MM256_SETR_EPI8( - 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00); - - const __m256i K32_PACK_64_TO_32 = SIMD_MM256_SETR_EPI32(0, 2, 4, 6, 1, 3, 5, 7); - - void IntegralSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint32_t * sum, size_t sumStride) - { - memset(sum, 0, (width + 1) * sizeof(uint32_t)); - sum += sumStride + 1; - size_t alignedWidth = AlignLo(width, 4); - - for (size_t row = 0; row < height; row++) - { - sum[-1] = 0; - size_t col = 0; - __m256i _rowSums = K_ZERO; - for (; col < alignedWidth; col += 4) - { - __m256i _src = _mm256_and_si256(_mm256_set1_epi32(*(uint32_t*)(src + col)), K8_SUM_MASK); - _rowSums = _mm256_add_epi32(_rowSums, _mm256_sad_epu8(_src, K_ZERO)); - _mm_storeu_si128((__m128i*)(sum + col), _mm_add_epi32(_mm256_castsi256_si128(_mm256_permutevar8x32_epi32(_rowSums, K32_PACK_64_TO_32)), _mm_loadu_si128((__m128i*)(sum + col - sumStride)))); - _rowSums = _mm256_permute4x64_epi64(_rowSums, 0xFF); - } - uint32_t rowSum = sum[col - 1] - sum[col - sumStride - 1]; - for (; col < width; col++) - { - rowSum += src[col]; - sum[col] = rowSum + sum[col - sumStride]; - } - src += srcStride; - sum += sumStride; - } - } - - void Integral(const uint8_t * src, size_t srcStride, size_t width, size_t height, - uint8_t * sum, size_t sumStride, uint8_t * sqsum, size_t sqsumStride, uint8_t * tilted, size_t tiltedStride, - SimdPixelFormatType sumFormat, SimdPixelFormatType sqsumFormat) - { - assert(sumFormat == SimdPixelFormatInt32 && sumStride % sizeof(uint32_t) == 0); - if (tilted) - assert(tiltedStride % sizeof(uint32_t) == 0); - - if (sqsum) - { - if (tilted) - { - switch (sqsumFormat) - { - case SimdPixelFormatInt32: - IntegralSumSqsumTilted(src, srcStride, width, height, - (uint32_t*)sum, sumStride / sizeof(uint32_t), (uint32_t*)sqsum, sqsumStride / sizeof(uint32_t), (uint32_t*)tilted, tiltedStride / sizeof(uint32_t)); - break; - case SimdPixelFormatDouble: - IntegralSumSqsumTilted(src, srcStride, width, height, - (uint32_t*)sum, sumStride / sizeof(uint32_t), (double*)sqsum, sqsumStride / sizeof(double), (uint32_t*)tilted, tiltedStride / sizeof(uint32_t)); - break; - default: - assert(0); - } - } - else - { - switch (sqsumFormat) - { - case SimdPixelFormatInt32: - IntegralSumSqsum(src, srcStride, width, height, - (uint32_t*)sum, sumStride / sizeof(uint32_t), (uint32_t*)sqsum, sqsumStride / sizeof(uint32_t)); - break; - case SimdPixelFormatDouble: - IntegralSumSqsum(src, srcStride, width, height, - (uint32_t*)sum, sumStride / sizeof(uint32_t), (double*)sqsum, sqsumStride / sizeof(double)); - break; - default: - assert(0); - } - } - } - else - { - if (tilted) - { - IntegralSumTilted(src, srcStride, width, height, - (uint32_t*)sum, sumStride / sizeof(uint32_t), (uint32_t*)tilted, tiltedStride / sizeof(uint32_t)); - } - else - { - Avx2::IntegralSum(src, srcStride, width, height, (uint32_t*)sum, sumStride / sizeof(uint32_t)); - } - } - } - } -#endif//SIMD_AVX2_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx2Interference.cpp b/src/3rd/Simd/Simd/SimdAvx2Interference.cpp deleted file mode 100644 index 3152307b..00000000 --- a/src/3rd/Simd/Simd/SimdAvx2Interference.cpp +++ /dev/null @@ -1,146 +0,0 @@ -/* -* Simd Library 
(http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdSet.h" - -namespace Simd -{ -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - template __m256i InterferenceChange(__m256i statistic, __m256i value, __m256i saturation); - - template<> SIMD_INLINE __m256i InterferenceChange(__m256i statistic, __m256i value, __m256i saturation) - { - return _mm256_min_epi16(_mm256_add_epi16(statistic, value), saturation); - } - - template<> SIMD_INLINE __m256i InterferenceChange(__m256i statistic, __m256i value, __m256i saturation) - { - return _mm256_max_epi16(_mm256_sub_epi16(statistic, value), saturation); - } - - template SIMD_INLINE void InterferenceChange(int16_t * statistic, __m256i value, __m256i saturation) - { - Store((__m256i*)statistic, InterferenceChange(Load((__m256i*)statistic), value, saturation)); - } - - template void InterferenceChange(int16_t * statistic, size_t stride, size_t width, size_t height, uint8_t value, int16_t saturation) - { - assert(width >= HA); - if (align) - assert(Aligned(statistic) && Aligned(stride, HA)); - - size_t alignedWidth = Simd::AlignLo(width, HA); - __m256i tailMask = SetMask(0, HA - width + alignedWidth, 0xFFFF); - - __m256i _value = _mm256_set1_epi16(value); - __m256i _saturation = _mm256_set1_epi16(saturation); - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += HA) - InterferenceChange(statistic + col, _value, _saturation); - if (alignedWidth != width) - InterferenceChange(statistic + width - HA, _mm256_and_si256(_value, tailMask), _saturation); - statistic += stride; - } - } - - void InterferenceIncrement(uint8_t * statistic, size_t stride, size_t width, size_t height, uint8_t increment, int16_t saturation) - { - assert(Aligned(stride, 2)); - - if (Aligned(statistic) && Aligned(stride)) - InterferenceChange((int16_t*)statistic, stride / 2, width, height, increment, saturation); - else - InterferenceChange((int16_t*)statistic, stride / 2, width, height, increment, saturation); - } - - void InterferenceDecrement(uint8_t * statistic, size_t stride, size_t width, size_t height, uint8_t decrement, int16_t saturation) - { - assert(Aligned(stride, 2)); - - if (Aligned(statistic) && Aligned(stride)) - InterferenceChange((int16_t*)statistic, stride / 2, width, height, decrement, saturation); - else - InterferenceChange((int16_t*)statistic, stride / 2, 
width, height, decrement, saturation); - } - - template void InterferenceChangeMasked(int16_t * statistic, size_t statisticStride, size_t width, size_t height, - uint8_t value, int16_t saturation, const uint8_t * mask, size_t maskStride, uint8_t index) - { - assert(width >= A); - if (align) - assert(Aligned(statistic) && Aligned(statisticStride, HA) && Aligned(mask) && Aligned(maskStride)); - - size_t alignedWidth = Simd::AlignLo(width, A); - __m256i tailMask = SetMask(0, A - width + alignedWidth, 0xFF); - - __m256i _value = _mm256_set1_epi16(value); - __m256i _saturation = _mm256_set1_epi16(saturation); - __m256i _index = _mm256_set1_epi8(index); - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - { - __m256i _mask = _mm256_cmpeq_epi8(LoadPermuted((__m256i*)(mask + col)), _index); - InterferenceChange(statistic + col, _mm256_and_si256(_value, _mm256_unpacklo_epi8(_mask, _mask)), _saturation); - InterferenceChange(statistic + col + HA, _mm256_and_si256(_value, _mm256_unpackhi_epi8(_mask, _mask)), _saturation); - } - if (alignedWidth != width) - { - __m256i _mask = _mm256_permute4x64_epi64(_mm256_and_si256(_mm256_cmpeq_epi8(Load((__m256i*)(mask + width - A)), _index), tailMask), 0xD8); - InterferenceChange(statistic + width - A, _mm256_and_si256(_value, _mm256_unpacklo_epi8(_mask, _mask)), _saturation); - InterferenceChange(statistic + width - HA, _mm256_and_si256(_value, _mm256_unpackhi_epi8(_mask, _mask)), _saturation); - } - statistic += statisticStride; - mask += maskStride; - } - } - - void InterferenceIncrementMasked(uint8_t * statistic, size_t statisticStride, size_t width, size_t height, - uint8_t increment, int16_t saturation, const uint8_t * mask, size_t maskStride, uint8_t index) - { - assert(Aligned(statisticStride, 2)); - - if (Aligned(statistic) && Aligned(statisticStride) && Aligned(mask) && Aligned(maskStride)) - InterferenceChangeMasked((int16_t*)statistic, statisticStride / 2, width, height, increment, saturation, mask, maskStride, index); - else - InterferenceChangeMasked((int16_t*)statistic, statisticStride / 2, width, height, increment, saturation, mask, maskStride, index); - } - - void InterferenceDecrementMasked(uint8_t * statistic, size_t statisticStride, size_t width, size_t height, - uint8_t decrement, int16_t saturation, const uint8_t * mask, size_t maskStride, uint8_t index) - { - assert(Aligned(statisticStride, 2)); - - if (Aligned(statistic) && Aligned(statisticStride) && Aligned(mask) && Aligned(maskStride)) - InterferenceChangeMasked((int16_t*)statistic, statisticStride / 2, width, height, decrement, saturation, mask, maskStride, index); - else - InterferenceChangeMasked((int16_t*)statistic, statisticStride / 2, width, height, decrement, saturation, mask, maskStride, index); - } - } -#endif// SIMD_AVX2_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx2Interleave.cpp b/src/3rd/Simd/Simd/SimdAvx2Interleave.cpp deleted file mode 100644 index 25ada22e..00000000 --- a/src/3rd/Simd/Simd/SimdAvx2Interleave.cpp +++ /dev/null @@ -1,171 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdConversion.h" - -namespace Simd -{ -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - template SIMD_INLINE void InterleaveUv(const uint8_t * u, const uint8_t * v, uint8_t * uv) - { - __m256i _u = LoadPermuted((__m256i*)u); - __m256i _v = LoadPermuted((__m256i*)v); - Store((__m256i*)uv + 0, UnpackU8<0>(_u, _v)); - Store((__m256i*)uv + 1, UnpackU8<1>(_u, _v)); - } - - template void InterleaveUv(const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * uv, size_t uvStride) - { - assert(width >= A); - if (align) - { - assert(Aligned(uv) && Aligned(uvStride) && Aligned(u) && Aligned(uStride) && Aligned(v) && Aligned(vStride)); - } - - size_t bodyWidth = AlignLo(width, A); - size_t tail = width - bodyWidth; - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0, offset = 0; col < bodyWidth; col += A, offset += DA) - InterleaveUv(u + col, v + col, uv + offset); - if (tail) - { - size_t col = width - A; - size_t offset = 2 * col; - InterleaveUv(u + col, v + col, uv + offset); - } - u += uStride; - v += vStride; - uv += uvStride; - } - } - - void InterleaveUv(const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, size_t width, size_t height, uint8_t * uv, size_t uvStride) - { - if (Aligned(uv) && Aligned(uvStride) && Aligned(u) && Aligned(uStride) && Aligned(v) && Aligned(vStride)) - InterleaveUv(u, uStride, v, vStride, width, height, uv, uvStride); - else - InterleaveUv(u, uStride, v, vStride, width, height, uv, uvStride); - } - - template SIMD_INLINE void InterleaveBgr(const uint8_t * b, const uint8_t * g, const uint8_t * r, size_t offset, uint8_t * bgr) - { - __m256i _b = Load((__m256i*)(b + offset)); - __m256i _g = Load((__m256i*)(g + offset)); - __m256i _r = Load((__m256i*)(r + offset)); - Store((__m256i*)bgr + 0, InterleaveBgr<0>(_b, _g, _r)); - Store((__m256i*)bgr + 1, InterleaveBgr<1>(_b, _g, _r)); - Store((__m256i*)bgr + 2, InterleaveBgr<2>(_b, _g, _r)); - } - - template void InterleaveBgr(const uint8_t * b, size_t bStride, const uint8_t * g, size_t gStride, const uint8_t * r, size_t rStride, size_t width, size_t height, uint8_t * bgr, size_t bgrStride) - { - assert(width >= A); - if (align) - { - assert(Aligned(b) && Aligned(bStride) && Aligned(g) && Aligned(gStride)); - assert(Aligned(r) && Aligned(rStride) && Aligned(bgr) && Aligned(bgrStride)); - } - - size_t alignedWidth 
= AlignLo(width, A); - size_t tail = width - alignedWidth; - size_t A3 = A * 3; - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0, offset = 0; col < alignedWidth; col += A, offset += A3) - InterleaveBgr(b, g, r, col, bgr + offset); - if (tail) - InterleaveBgr(b, g, r, width - A, bgr + 3 * (width - A)); - b += bStride; - g += gStride; - r += rStride; - bgr += bgrStride; - } - } - - void InterleaveBgr(const uint8_t * b, size_t bStride, const uint8_t * g, size_t gStride, const uint8_t * r, size_t rStride, size_t width, size_t height, uint8_t * bgr, size_t bgrStride) - { - if (Aligned(b) && Aligned(bStride) && Aligned(g) && Aligned(gStride) - && Aligned(r) && Aligned(rStride) && Aligned(bgr) && Aligned(bgrStride)) - InterleaveBgr(b, bStride, g, gStride, r, rStride, width, height, bgr, bgrStride); - else - InterleaveBgr(b, bStride, g, gStride, r, rStride, width, height, bgr, bgrStride); - } - - template SIMD_INLINE void InterleaveBgra(const uint8_t * b, const uint8_t * g, const uint8_t * r, const uint8_t * a, size_t offset, uint8_t * bgra) - { - __m256i _b = Load((__m256i*)(b + offset)); - __m256i _g = Load((__m256i*)(g + offset)); - __m256i _r = Load((__m256i*)(r + offset)); - __m256i _a = Load((__m256i*)(a + offset)); - __m256i bg0 = PermutedUnpackLoU8(_b, _g); - __m256i bg1 = PermutedUnpackHiU8(_b, _g); - __m256i ra0 = PermutedUnpackLoU8(_r, _a); - __m256i ra1 = PermutedUnpackHiU8(_r, _a); - Store((__m256i*)bgra + 0, UnpackU16<0>(bg0, ra0)); - Store((__m256i*)bgra + 1, UnpackU16<0>(bg1, ra1)); - Store((__m256i*)bgra + 2, UnpackU16<1>(bg0, ra0)); - Store((__m256i*)bgra + 3, UnpackU16<1>(bg1, ra1)); - } - - template void InterleaveBgra(const uint8_t * b, size_t bStride, const uint8_t * g, size_t gStride, const uint8_t * r, size_t rStride, const uint8_t * a, size_t aStride, size_t width, size_t height, uint8_t * bgra, size_t bgraStride) - { - assert(width >= A); - if (align) - { - assert(Aligned(b) && Aligned(bStride) && Aligned(g) && Aligned(gStride)); - assert(Aligned(r) && Aligned(rStride) && Aligned(a) && Aligned(aStride) && Aligned(bgra) && Aligned(bgraStride)); - } - - size_t alignedWidth = AlignLo(width, A); - size_t tail = width - alignedWidth; - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0, offset = 0; col < alignedWidth; col += A, offset += QA) - InterleaveBgra(b, g, r, a, col, bgra + offset); - if (tail) - InterleaveBgra(b, g, r, a, width - A, bgra + 4 * (width - A)); - b += bStride; - g += gStride; - r += rStride; - a += aStride; - bgra += bgraStride; - } - } - - void InterleaveBgra(const uint8_t * b, size_t bStride, const uint8_t * g, size_t gStride, const uint8_t * r, size_t rStride, const uint8_t * a, size_t aStride, size_t width, size_t height, uint8_t * bgra, size_t bgraStride) - { - if (Aligned(b) && Aligned(bStride) && Aligned(g) && Aligned(gStride) - && Aligned(r) && Aligned(rStride) && Aligned(bgra) && Aligned(bgraStride)) - InterleaveBgra(b, bStride, g, gStride, r, rStride, a, aStride, width, height, bgra, bgraStride); - else - InterleaveBgra(b, bStride, g, gStride, r, rStride, a, aStride, width, height, bgra, bgraStride); - } - } -#endif// SIMD_AVX2_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx2Laplace.cpp b/src/3rd/Simd/Simd/SimdAvx2Laplace.cpp deleted file mode 100644 index d3a17d9e..00000000 --- a/src/3rd/Simd/Simd/SimdAvx2Laplace.cpp +++ /dev/null @@ -1,188 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdExtract.h" -#include "Simd/SimdSet.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdCompare.h" - -namespace Simd -{ -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - template SIMD_INLINE __m256i Laplace(__m256i a[3][3]) - { - return _mm256_sub_epi16(_mm256_mullo_epi16(K16_0008, UnpackU8(a[1][1])), - _mm256_add_epi16(_mm256_add_epi16(_mm256_maddubs_epi16(UnpackU8(a[0][0], a[0][1]), K8_01), - _mm256_maddubs_epi16(UnpackU8(a[0][2], a[1][0]), K8_01)), - _mm256_add_epi16(_mm256_maddubs_epi16(UnpackU8(a[1][2], a[2][0]), K8_01), - _mm256_maddubs_epi16(UnpackU8(a[2][1], a[2][2]), K8_01)))); - } - - template SIMD_INLINE void Laplace(__m256i a[3][3], int16_t * dst) - { - __m256i lo = ConditionalAbs(Laplace<0>(a)); - __m256i hi = ConditionalAbs(Laplace<1>(a)); - Store((__m256i*)dst + 0, _mm256_permute2x128_si256(lo, hi, 0x20)); - Store((__m256i*)dst + 1, _mm256_permute2x128_si256(lo, hi, 0x31)); - } - - template void Laplace(const uint8_t * src, size_t srcStride, size_t width, size_t height, int16_t * dst, size_t dstStride) - { - assert(width > A); - if (align) - assert(Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride, HA)); - - size_t bodyWidth = Simd::AlignHi(width, A) - A; - const uint8_t *src0, *src1, *src2; - __m256i a[3][3]; - - for (size_t row = 0; row < height; ++row) - { - src0 = src + srcStride*(row - 1); - src1 = src0 + srcStride; - src2 = src1 + srcStride; - if (row == 0) - src0 = src1; - if (row == height - 1) - src2 = src1; - - LoadNose3(src0 + 0, a[0]); - LoadNose3(src1 + 0, a[1]); - LoadNose3(src2 + 0, a[2]); - Laplace(a, dst + 0); - for (size_t col = A; col < bodyWidth; col += A) - { - LoadBody3(src0 + col, a[0]); - LoadBody3(src1 + col, a[1]); - LoadBody3(src2 + col, a[2]); - Laplace(a, dst + col); - } - LoadTail3(src0 + width - A, a[0]); - LoadTail3(src1 + width - A, a[1]); - LoadTail3(src2 + width - A, a[2]); - Laplace(a, dst + width - A); - - dst += dstStride; - } - } - - void Laplace(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride) - { - assert(dstStride % sizeof(int16_t) == 0); - - if (Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride)) - Laplace(src, srcStride, width, height, (int16_t *)dst, dstStride / sizeof(int16_t)); - else - Laplace(src, srcStride, width, height, (int16_t *)dst, dstStride / sizeof(int16_t)); - } - - void LaplaceAbs(const uint8_t * src, size_t 
srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride) - { - assert(dstStride % sizeof(int16_t) == 0); - - if (Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride)) - Laplace(src, srcStride, width, height, (int16_t *)dst, dstStride / sizeof(int16_t)); - else - Laplace(src, srcStride, width, height, (int16_t *)dst, dstStride / sizeof(int16_t)); - } - - SIMD_INLINE void LaplaceAbsSum(__m256i a[3][3], __m256i & sum) - { - sum = _mm256_add_epi32(sum, _mm256_madd_epi16(ConditionalAbs(Laplace<0>(a)), K16_0001)); - sum = _mm256_add_epi32(sum, _mm256_madd_epi16(ConditionalAbs(Laplace<1>(a)), K16_0001)); - } - - SIMD_INLINE void SetMask3(__m256i a[3], __m256i mask) - { - a[0] = _mm256_and_si256(a[0], mask); - a[1] = _mm256_and_si256(a[1], mask); - a[2] = _mm256_and_si256(a[2], mask); - } - - SIMD_INLINE void SetMask3x3(__m256i a[3][3], __m256i mask) - { - SetMask3(a[0], mask); - SetMask3(a[1], mask); - SetMask3(a[2], mask); - } - - template void LaplaceAbsSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum) - { - assert(width > A); - if (align) - assert(Aligned(src) && Aligned(stride)); - - size_t bodyWidth = Simd::AlignHi(width, A) - A; - const uint8_t *src0, *src1, *src2; - - __m256i a[3][3]; - __m256i tailMask = SetMask(0, A - width + bodyWidth, 0xFF); - __m256i fullSum = _mm256_setzero_si256(); - - for (size_t row = 0; row < height; ++row) - { - src0 = src + stride*(row - 1); - src1 = src0 + stride; - src2 = src1 + stride; - if (row == 0) - src0 = src1; - if (row == height - 1) - src2 = src1; - - __m256i rowSum = _mm256_setzero_si256(); - - LoadNose3(src0 + 0, a[0]); - LoadNose3(src1 + 0, a[1]); - LoadNose3(src2 + 0, a[2]); - LaplaceAbsSum(a, rowSum); - for (size_t col = A; col < bodyWidth; col += A) - { - LoadBody3(src0 + col, a[0]); - LoadBody3(src1 + col, a[1]); - LoadBody3(src2 + col, a[2]); - LaplaceAbsSum(a, rowSum); - } - LoadTail3(src0 + width - A, a[0]); - LoadTail3(src1 + width - A, a[1]); - LoadTail3(src2 + width - A, a[2]); - SetMask3x3(a, tailMask); - LaplaceAbsSum(a, rowSum); - - fullSum = _mm256_add_epi64(fullSum, HorizontalSum32(rowSum)); - } - *sum = ExtractSum(fullSum); - } - - void LaplaceAbsSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum) - { - if (Aligned(src) && Aligned(stride)) - LaplaceAbsSum(src, stride, width, height, sum); - else - LaplaceAbsSum(src, stride, width, height, sum); - } - } -#endif// SIMD_AVX2_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx2Lbp.cpp b/src/3rd/Simd/Simd/SimdAvx2Lbp.cpp deleted file mode 100644 index 39ea1b54..00000000 --- a/src/3rd/Simd/Simd/SimdAvx2Lbp.cpp +++ /dev/null @@ -1,84 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdCompare.h" - -namespace Simd -{ -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - template void LbpEstimate(const uint8_t * src, ptrdiff_t stride, uint8_t * dst) - { - __m256i threshold = Load((__m256i*)src); - __m256i lbp = _mm256_setzero_si256(); - lbp = _mm256_or_si256(lbp, _mm256_and_si256(GreaterOrEqual8u(Load((__m256i*)(src - 1 - stride)), threshold), K8_01)); - lbp = _mm256_or_si256(lbp, _mm256_and_si256(GreaterOrEqual8u(Load((__m256i*)(src - stride)), threshold), K8_02)); - lbp = _mm256_or_si256(lbp, _mm256_and_si256(GreaterOrEqual8u(Load((__m256i*)(src + 1 - stride)), threshold), K8_04)); - lbp = _mm256_or_si256(lbp, _mm256_and_si256(GreaterOrEqual8u(Load((__m256i*)(src + 1)), threshold), K8_08)); - lbp = _mm256_or_si256(lbp, _mm256_and_si256(GreaterOrEqual8u(Load((__m256i*)(src + 1 + stride)), threshold), K8_10)); - lbp = _mm256_or_si256(lbp, _mm256_and_si256(GreaterOrEqual8u(Load((__m256i*)(src + stride)), threshold), K8_20)); - lbp = _mm256_or_si256(lbp, _mm256_and_si256(GreaterOrEqual8u(Load((__m256i*)(src - 1 + stride)), threshold), K8_40)); - lbp = _mm256_or_si256(lbp, _mm256_and_si256(GreaterOrEqual8u(Load((__m256i*)(src - 1)), threshold), K8_80)); - Store((__m256i*)dst, lbp); - } - - template void LbpEstimate( - const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride) - { - assert(width >= A + 2); - if (align) - assert(Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride)); - - size_t alignedWidth = AlignLo(width - 2, A) + 1; - - memset(dst, 0, width); - src += srcStride; - dst += dstStride; - for (size_t row = 2; row < height; ++row) - { - dst[0] = 0; - for (size_t col = 1; col < alignedWidth; col += A) - LbpEstimate(src + col, srcStride, dst + col); - if (alignedWidth != width - 1) - LbpEstimate(src + width - 1 - A, srcStride, dst + width - 1 - A); - dst[width - 1] = 0; - - src += srcStride; - dst += dstStride; - } - memset(dst, 0, width); - } - - void LbpEstimate(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride)) - LbpEstimate(src, srcStride, width, height, dst, dstStride); - else - LbpEstimate(src, srcStride, width, height, dst, dstStride); - } - } -#endif// SIMD_AVX2_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx2MeanFilter3x3.cpp b/src/3rd/Simd/Simd/SimdAvx2MeanFilter3x3.cpp deleted file mode 100644 index 07e785c3..00000000 --- a/src/3rd/Simd/Simd/SimdAvx2MeanFilter3x3.cpp +++ /dev/null @@ -1,155 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" - -namespace Simd -{ -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - namespace - { - struct Buffer - { - Buffer(size_t width) - { - _p = Allocate(sizeof(uint16_t) * 3 * width); - src0 = (uint16_t*)_p; - src1 = src0 + width; - src2 = src1 + width; - } - - ~Buffer() - { - Free(_p); - } - - uint16_t * src0; - uint16_t * src1; - uint16_t * src2; - private: - void * _p; - }; - } - - template SIMD_INLINE __m256i SumCol(__m256i a[3]) - { - return _mm256_add_epi16(_mm256_maddubs_epi16(UnpackU8(a[0], a[1]), K8_01), UnpackU8(a[2])); - } - - template SIMD_INLINE void SumCol(__m256i a[3], uint16_t * b) - { - Store((__m256i*)b + 0, SumCol<0>(a)); - Store((__m256i*)b + 1, SumCol<1>(a)); - } - - template SIMD_INLINE __m256i AverageRow16(const Buffer & buffer, size_t offset) - { - return _mm256_mulhi_epu16(K16_DIVISION_BY_9_FACTOR, _mm256_add_epi16( - _mm256_add_epi16(K16_0005, Load((__m256i*)(buffer.src0 + offset))), - _mm256_add_epi16(Load((__m256i*)(buffer.src1 + offset)), Load((__m256i*)(buffer.src2 + offset))))); - } - - template SIMD_INLINE __m256i AverageRow(const Buffer & buffer, size_t offset) - { - return _mm256_packus_epi16(AverageRow16(buffer, offset), AverageRow16(buffer, offset + HA)); - } - - template void MeanFilter3x3( - const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride) - { - assert(step*(width - 1) >= A); - if (align) - assert(Aligned(src) && Aligned(srcStride) && Aligned(step*width) && Aligned(dst) && Aligned(dstStride)); - - __m256i a[3]; - - size_t size = step*width; - size_t bodySize = Simd::AlignHi(size, A) - A; - - Buffer buffer(Simd::AlignHi(size, A)); - - LoadNose3(src + 0, a); - SumCol(a, buffer.src0 + 0); - for (size_t col = A; col < bodySize; col += A) - { - LoadBody3(src + col, a); - SumCol(a, buffer.src0 + col); - } - LoadTail3(src + size - A, a); - SumCol(a, buffer.src0 + bodySize); - - memcpy(buffer.src1, buffer.src0, sizeof(uint16_t)*(bodySize + A)); - - for (size_t row = 0; row < height; ++row, dst += dstStride) - { - const uint8_t *src2 = src + srcStride*(row + 1); - if (row >= height - 2) - src2 = src + srcStride*(height - 1); - - LoadNose3(src2 + 0, a); - SumCol(a, buffer.src2 + 0); - for (size_t col = A; col < bodySize; col += A) - { - LoadBody3(src2 + col, a); - SumCol(a, buffer.src2 + col); - } - LoadTail3(src2 + size - A, a); - SumCol(a, buffer.src2 + bodySize); - - for (size_t col = 0; col < 
bodySize; col += A) - Store((__m256i*)(dst + col), AverageRow(buffer, col)); - Store((__m256i*)(dst + size - A), AverageRow(buffer, bodySize)); - - Swap(buffer.src0, buffer.src2); - Swap(buffer.src0, buffer.src1); - } - } - - template void MeanFilter3x3(const uint8_t * src, size_t srcStride, size_t width, size_t height, - size_t channelCount, uint8_t * dst, size_t dstStride) - { - assert(channelCount > 0 && channelCount <= 4); - - switch (channelCount) - { - case 1: MeanFilter3x3(src, srcStride, width, height, dst, dstStride); break; - case 2: MeanFilter3x3(src, srcStride, width, height, dst, dstStride); break; - case 3: MeanFilter3x3(src, srcStride, width, height, dst, dstStride); break; - case 4: MeanFilter3x3(src, srcStride, width, height, dst, dstStride); break; - } - } - - void MeanFilter3x3(const uint8_t * src, size_t srcStride, size_t width, size_t height, - size_t channelCount, uint8_t * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride) && Aligned(channelCount*width) && Aligned(dst) && Aligned(dstStride)) - MeanFilter3x3(src, srcStride, width, height, channelCount, dst, dstStride); - else - MeanFilter3x3(src, srcStride, width, height, channelCount, dst, dstStride); - } - } -#endif// SIMD_AVX2_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx2MedianFilter.cpp b/src/3rd/Simd/Simd/SimdAvx2MedianFilter.cpp deleted file mode 100644 index 83e359d6..00000000 --- a/src/3rd/Simd/Simd/SimdAvx2MedianFilter.cpp +++ /dev/null @@ -1,511 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" - -namespace Simd -{ -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - template SIMD_INLINE void LoadNoseRhomb3x3(const uint8_t* y[3], size_t offset, __m256i a[5]) - { - a[0] = Load((__m256i*)(y[0] + offset)); - LoadNose3(y[1] + offset, a + 1); - a[4] = Load((__m256i*)(y[2] + offset)); - } - - template SIMD_INLINE void LoadBodyRhomb3x3(const uint8_t* y[3], size_t offset, __m256i a[5]) - { - a[0] = Load((__m256i*)(y[0] + offset)); - LoadBody3(y[1] + offset, a + 1); - a[4] = Load((__m256i*)(y[2] + offset)); - } - - template SIMD_INLINE void LoadTailRhomb3x3(const uint8_t* y[3], size_t offset, __m256i a[5]) - { - a[0] = Load((__m256i*)(y[0] + offset)); - LoadTail3(y[1] + offset, a + 1); - a[4] = Load((__m256i*)(y[2] + offset)); - } - - SIMD_INLINE void PartialSort5(__m256i a[5]) - { - SortU8(a[2], a[3]); - SortU8(a[1], a[2]); - SortU8(a[2], a[3]); - a[4] = _mm256_max_epu8(a[1], a[4]); - a[0] = _mm256_min_epu8(a[0], a[3]); - SortU8(a[2], a[0]); - a[2] = _mm256_max_epu8(a[4], a[2]); - a[2] = _mm256_min_epu8(a[2], a[0]); - } - - template void MedianFilterRhomb3x3( - const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride) - { - assert(step*(width - 1) >= A); - - const uint8_t * y[3]; - __m256i a[5]; - - size_t size = step*width; - size_t bodySize = Simd::AlignHi(size, A) - A; - - for (size_t row = 0; row < height; ++row, dst += dstStride) - { - y[0] = src + srcStride*(row - 1); - y[1] = y[0] + srcStride; - y[2] = y[1] + srcStride; - if (row < 1) - y[0] = y[1]; - if (row >= height - 1) - y[2] = y[1]; - - LoadNoseRhomb3x3(y, 0, a); - PartialSort5(a); - Store((__m256i*)(dst), a[2]); - - for (size_t col = A; col < bodySize; col += A) - { - LoadBodyRhomb3x3(y, col, a); - PartialSort5(a); - Store((__m256i*)(dst + col), a[2]); - } - - size_t col = size - A; - LoadTailRhomb3x3(y, col, a); - PartialSort5(a); - Store((__m256i*)(dst + col), a[2]); - } - } - - template void MedianFilterRhomb3x3(const uint8_t * src, size_t srcStride, size_t width, size_t height, - size_t channelCount, uint8_t * dst, size_t dstStride) - { - assert(channelCount > 0 && channelCount <= 4); - - switch (channelCount) - { - case 1: MedianFilterRhomb3x3(src, srcStride, width, height, dst, dstStride); break; - case 2: MedianFilterRhomb3x3(src, srcStride, width, height, dst, dstStride); break; - case 3: MedianFilterRhomb3x3(src, srcStride, width, height, dst, dstStride); break; - case 4: MedianFilterRhomb3x3(src, srcStride, width, height, dst, dstStride); break; - } - } - - void MedianFilterRhomb3x3(const uint8_t * src, size_t srcStride, size_t width, size_t height, - size_t channelCount, uint8_t * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride) && Aligned(width) && Aligned(dst) && Aligned(dstStride)) - MedianFilterRhomb3x3(src, srcStride, width, height, channelCount, dst, dstStride); - else - MedianFilterRhomb3x3(src, srcStride, width, height, channelCount, dst, dstStride); - } - - template SIMD_INLINE void LoadNoseSquare3x3(const uint8_t* y[3], size_t offset, __m256i a[9]) - { - LoadNose3(y[0] + offset, a + 0); - LoadNose3(y[1] + offset, a + 3); - LoadNose3(y[2] + offset, a + 6); - } - - template SIMD_INLINE void LoadBodySquare3x3(const uint8_t* y[3], size_t offset, __m256i a[9]) - { - LoadBody3(y[0] + offset, a + 0); - LoadBody3(y[1] + offset, a + 3); - LoadBody3(y[2] + offset, a + 6); - } - - template SIMD_INLINE void LoadTailSquare3x3(const uint8_t* y[3], size_t offset, __m256i a[9]) - { - LoadTail3(y[0] + 
offset, a + 0); - LoadTail3(y[1] + offset, a + 3); - LoadTail3(y[2] + offset, a + 6); - } - - SIMD_INLINE void PartialSort9(__m256i a[9]) - { - SortU8(a[1], a[2]); SortU8(a[4], a[5]); SortU8(a[7], a[8]); - SortU8(a[0], a[1]); SortU8(a[3], a[4]); SortU8(a[6], a[7]); - SortU8(a[1], a[2]); SortU8(a[4], a[5]); SortU8(a[7], a[8]); - a[3] = _mm256_max_epu8(a[0], a[3]); - a[5] = _mm256_min_epu8(a[5], a[8]); - SortU8(a[4], a[7]); - a[6] = _mm256_max_epu8(a[3], a[6]); - a[4] = _mm256_max_epu8(a[1], a[4]); - a[2] = _mm256_min_epu8(a[2], a[5]); - a[4] = _mm256_min_epu8(a[4], a[7]); - SortU8(a[4], a[2]); - a[4] = _mm256_max_epu8(a[6], a[4]); - a[4] = _mm256_min_epu8(a[4], a[2]); - } - - template void MedianFilterSquare3x3( - const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride) - { - assert(step*(width - 1) >= A); - - const uint8_t * y[3]; - __m256i a[9]; - - size_t size = step*width; - size_t bodySize = Simd::AlignHi(size, A) - A; - - for (size_t row = 0; row < height; ++row, dst += dstStride) - { - y[0] = src + srcStride*(row - 1); - y[1] = y[0] + srcStride; - y[2] = y[1] + srcStride; - if (row < 1) - y[0] = y[1]; - if (row >= height - 1) - y[2] = y[1]; - - LoadNoseSquare3x3(y, 0, a); - PartialSort9(a); - Store((__m256i*)(dst), a[4]); - - for (size_t col = A; col < bodySize; col += A) - { - LoadBodySquare3x3(y, col, a); - PartialSort9(a); - Store((__m256i*)(dst + col), a[4]); - } - - size_t col = size - A; - LoadTailSquare3x3(y, col, a); - PartialSort9(a); - Store((__m256i*)(dst + col), a[4]); - } - } - - template void MedianFilterSquare3x3(const uint8_t * src, size_t srcStride, size_t width, size_t height, - size_t channelCount, uint8_t * dst, size_t dstStride) - { - assert(channelCount > 0 && channelCount <= 4); - - switch (channelCount) - { - case 1: MedianFilterSquare3x3(src, srcStride, width, height, dst, dstStride); break; - case 2: MedianFilterSquare3x3(src, srcStride, width, height, dst, dstStride); break; - case 3: MedianFilterSquare3x3(src, srcStride, width, height, dst, dstStride); break; - case 4: MedianFilterSquare3x3(src, srcStride, width, height, dst, dstStride); break; - } - } - - void MedianFilterSquare3x3(const uint8_t * src, size_t srcStride, size_t width, size_t height, - size_t channelCount, uint8_t * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride) && Aligned(width) && Aligned(dst) && Aligned(dstStride)) - MedianFilterSquare3x3(src, srcStride, width, height, channelCount, dst, dstStride); - else - MedianFilterSquare3x3(src, srcStride, width, height, channelCount, dst, dstStride); - } - - template SIMD_INLINE void LoadNoseRhomb5x5(const uint8_t* y[5], size_t offset, __m256i a[13]) - { - a[0] = Load((__m256i*)(y[0] + offset)); - LoadNose3(y[1] + offset, a + 1); - LoadNose5(y[2] + offset, a + 4); - LoadNose3(y[3] + offset, a + 9); - a[12] = Load((__m256i*)(y[4] + offset)); - } - - template SIMD_INLINE void LoadBodyRhomb5x5(const uint8_t* y[5], size_t offset, __m256i a[13]) - { - a[0] = Load((__m256i*)(y[0] + offset)); - LoadBody3(y[1] + offset, a + 1); - LoadBody5(y[2] + offset, a + 4); - LoadBody3(y[3] + offset, a + 9); - a[12] = Load((__m256i*)(y[4] + offset)); - } - - template SIMD_INLINE void LoadTailRhomb5x5(const uint8_t* y[5], size_t offset, __m256i a[13]) - { - a[0] = Load((__m256i*)(y[0] + offset)); - LoadTail3(y[1] + offset, a + 1); - LoadTail5(y[2] + offset, a + 4); - LoadTail3(y[3] + offset, a + 9); - a[12] = Load((__m256i*)(y[4] + offset)); - } - - SIMD_INLINE void PartialSort13(__m256i a[13]) - { - 
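/* Partial sorting network: lane-wise SortU8 (min/max) exchanges order the 13 input vectors just enough to leave the per-byte median (the 7th of 13) in a[6], which the rhomb-5x5 filter then stores; a full sort is unnecessary for a median filter. */ -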
SortU8(a[0], a[1]); SortU8(a[3], a[4]); SortU8(a[2], a[4]); - SortU8(a[2], a[3]); SortU8(a[6], a[7]); SortU8(a[5], a[7]); - SortU8(a[5], a[6]); SortU8(a[9], a[10]); SortU8(a[8], a[10]); - SortU8(a[8], a[9]); SortU8(a[11], a[12]); SortU8(a[5], a[8]); - SortU8(a[2], a[8]); SortU8(a[2], a[5]); SortU8(a[6], a[9]); - SortU8(a[3], a[9]); SortU8(a[3], a[6]); SortU8(a[7], a[10]); - SortU8(a[4], a[10]); SortU8(a[4], a[7]); SortU8(a[3], a[12]); - SortU8(a[0], a[9]); - a[1] = _mm256_min_epu8(a[1], a[10]); - a[1] = _mm256_min_epu8(a[1], a[7]); - a[1] = _mm256_min_epu8(a[1], a[9]); - a[11] = _mm256_max_epu8(a[5], a[11]); - a[11] = _mm256_max_epu8(a[3], a[11]); - a[11] = _mm256_max_epu8(a[2], a[11]); - SortU8(a[0], a[6]); SortU8(a[1], a[8]); SortU8(a[6], a[8]); - a[4] = _mm256_min_epu8(a[4], a[8]); - SortU8(a[0], a[1]); SortU8(a[4], a[6]); SortU8(a[0], a[4]); - a[11] = _mm256_max_epu8(a[0], a[11]); - SortU8(a[6], a[11]); - a[1] = _mm256_min_epu8(a[1], a[11]); - SortU8(a[1], a[4]); SortU8(a[6], a[12]); - a[6] = _mm256_max_epu8(a[1], a[6]); - a[4] = _mm256_min_epu8(a[4], a[12]); - a[6] = _mm256_max_epu8(a[4], a[6]); - } - - template void MedianFilterRhomb5x5( - const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride) - { - assert(step*(width - 2) >= A); - - const uint8_t * y[5]; - __m256i a[13]; - - size_t size = step*width; - size_t bodySize = Simd::AlignHi(size, A) - A; - - for (size_t row = 0; row < height; ++row, dst += dstStride) - { - y[0] = src + srcStride*(row - 2); - y[1] = y[0] + srcStride; - y[2] = y[1] + srcStride; - y[3] = y[2] + srcStride; - y[4] = y[3] + srcStride; - if (row < 2) - { - if (row < 1) - y[1] = y[2]; - y[0] = y[1]; - } - if (row >= height - 2) - { - if (row >= height - 1) - y[3] = y[2]; - y[4] = y[3]; - } - - LoadNoseRhomb5x5(y, 0, a); - PartialSort13(a); - Store((__m256i*)(dst), a[6]); - - for (size_t col = A; col < bodySize; col += A) - { - LoadBodyRhomb5x5(y, col, a); - PartialSort13(a); - Store((__m256i*)(dst + col), a[6]); - } - - size_t col = size - A; - LoadTailRhomb5x5(y, col, a); - PartialSort13(a); - Store((__m256i*)(dst + col), a[6]); - } - } - - template void MedianFilterRhomb5x5(const uint8_t * src, size_t srcStride, size_t width, size_t height, - size_t channelCount, uint8_t * dst, size_t dstStride) - { - assert(channelCount > 0 && channelCount <= 4); - - switch (channelCount) - { - case 1: MedianFilterRhomb5x5(src, srcStride, width, height, dst, dstStride); break; - case 2: MedianFilterRhomb5x5(src, srcStride, width, height, dst, dstStride); break; - case 3: MedianFilterRhomb5x5(src, srcStride, width, height, dst, dstStride); break; - case 4: MedianFilterRhomb5x5(src, srcStride, width, height, dst, dstStride); break; - } - } - - void MedianFilterRhomb5x5(const uint8_t * src, size_t srcStride, size_t width, size_t height, - size_t channelCount, uint8_t * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride) && Aligned(width) && Aligned(dst) && Aligned(dstStride)) - MedianFilterRhomb5x5(src, srcStride, width, height, channelCount, dst, dstStride); - else - MedianFilterRhomb5x5(src, srcStride, width, height, channelCount, dst, dstStride); - } - - template SIMD_INLINE void LoadNoseSquare5x5(const uint8_t* y[5], size_t offset, __m256i a[25]) - { - LoadNose5(y[0] + offset, a + 0); - LoadNose5(y[1] + offset, a + 5); - LoadNose5(y[2] + offset, a + 10); - LoadNose5(y[3] + offset, a + 15); - LoadNose5(y[4] + offset, a + 20); - } - - template SIMD_INLINE void LoadBodySquare5x5(const uint8_t* y[5], size_t offset, 
__m256i a[25]) - { - LoadBody5(y[0] + offset, a + 0); - LoadBody5(y[1] + offset, a + 5); - LoadBody5(y[2] + offset, a + 10); - LoadBody5(y[3] + offset, a + 15); - LoadBody5(y[4] + offset, a + 20); - } - - template SIMD_INLINE void LoadTailSquare5x5(const uint8_t* y[5], size_t offset, __m256i a[25]) - { - LoadTail5(y[0] + offset, a + 0); - LoadTail5(y[1] + offset, a + 5); - LoadTail5(y[2] + offset, a + 10); - LoadTail5(y[3] + offset, a + 15); - LoadTail5(y[4] + offset, a + 20); - } - - SIMD_INLINE void PartialSort25(__m256i a[25]) - { - SortU8(a[0], a[1]); SortU8(a[3], a[4]); SortU8(a[2], a[4]); - SortU8(a[2], a[3]); SortU8(a[6], a[7]); SortU8(a[5], a[7]); - SortU8(a[5], a[6]); SortU8(a[9], a[10]); SortU8(a[8], a[10]); - SortU8(a[8], a[9]); SortU8(a[12], a[13]); SortU8(a[11], a[13]); - SortU8(a[11], a[12]); SortU8(a[15], a[16]); SortU8(a[14], a[16]); - SortU8(a[14], a[15]); SortU8(a[18], a[19]); SortU8(a[17], a[19]); - SortU8(a[17], a[18]); SortU8(a[21], a[22]); SortU8(a[20], a[22]); - SortU8(a[20], a[21]); SortU8(a[23], a[24]); SortU8(a[2], a[5]); - SortU8(a[3], a[6]); SortU8(a[0], a[6]); SortU8(a[0], a[3]); - SortU8(a[4], a[7]); SortU8(a[1], a[7]); SortU8(a[1], a[4]); - SortU8(a[11], a[14]); SortU8(a[8], a[14]); SortU8(a[8], a[11]); - SortU8(a[12], a[15]); SortU8(a[9], a[15]); SortU8(a[9], a[12]); - SortU8(a[13], a[16]); SortU8(a[10], a[16]); SortU8(a[10], a[13]); - SortU8(a[20], a[23]); SortU8(a[17], a[23]); SortU8(a[17], a[20]); - SortU8(a[21], a[24]); SortU8(a[18], a[24]); SortU8(a[18], a[21]); - SortU8(a[19], a[22]); SortU8(a[9], a[18]); SortU8(a[0], a[18]); - a[17] = _mm256_max_epu8(a[8], a[17]); - a[9] = _mm256_max_epu8(a[0], a[9]); - SortU8(a[10], a[19]); SortU8(a[1], a[19]); SortU8(a[1], a[10]); - SortU8(a[11], a[20]); SortU8(a[2], a[20]); SortU8(a[12], a[21]); - a[11] = _mm256_max_epu8(a[2], a[11]); - SortU8(a[3], a[21]); SortU8(a[3], a[12]); SortU8(a[13], a[22]); - a[4] = _mm256_min_epu8(a[4], a[22]); - SortU8(a[4], a[13]); SortU8(a[14], a[23]); - SortU8(a[5], a[23]); SortU8(a[5], a[14]); SortU8(a[15], a[24]); - a[6] = _mm256_min_epu8(a[6], a[24]); - SortU8(a[6], a[15]); - a[7] = _mm256_min_epu8(a[7], a[16]); - a[7] = _mm256_min_epu8(a[7], a[19]); - a[13] = _mm256_min_epu8(a[13], a[21]); - a[15] = _mm256_min_epu8(a[15], a[23]); - a[7] = _mm256_min_epu8(a[7], a[13]); - a[7] = _mm256_min_epu8(a[7], a[15]); - a[9] = _mm256_max_epu8(a[1], a[9]); - a[11] = _mm256_max_epu8(a[3], a[11]); - a[17] = _mm256_max_epu8(a[5], a[17]); - a[17] = _mm256_max_epu8(a[11], a[17]); - a[17] = _mm256_max_epu8(a[9], a[17]); - SortU8(a[4], a[10]); - SortU8(a[6], a[12]); SortU8(a[7], a[14]); SortU8(a[4], a[6]); - a[7] = _mm256_max_epu8(a[4], a[7]); - SortU8(a[12], a[14]); - a[10] = _mm256_min_epu8(a[10], a[14]); - SortU8(a[6], a[7]); SortU8(a[10], a[12]); SortU8(a[6], a[10]); - a[17] = _mm256_max_epu8(a[6], a[17]); - SortU8(a[12], a[17]); - a[7] = _mm256_min_epu8(a[7], a[17]); - SortU8(a[7], a[10]); SortU8(a[12], a[18]); - a[12] = _mm256_max_epu8(a[7], a[12]); - a[10] = _mm256_min_epu8(a[10], a[18]); - SortU8(a[12], a[20]); - a[10] = _mm256_min_epu8(a[10], a[20]); - a[12] = _mm256_max_epu8(a[10], a[12]); - } - - template void MedianFilterSquare5x5( - const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride) - { - assert(step*(width - 2) >= A); - - const uint8_t * y[5]; - __m256i a[25]; - - size_t size = step*width; - size_t bodySize = Simd::AlignHi(size, A) - A; - - for (size_t row = 0; row < height; ++row, dst += dstStride) - { - y[0] = src + srcStride*(row - 
2); - y[1] = y[0] + srcStride; - y[2] = y[1] + srcStride; - y[3] = y[2] + srcStride; - y[4] = y[3] + srcStride; - if (row < 2) - { - if (row < 1) - y[1] = y[2]; - y[0] = y[1]; - } - if (row >= height - 2) - { - if (row >= height - 1) - y[3] = y[2]; - y[4] = y[3]; - } - - LoadNoseSquare5x5(y, 0, a); - PartialSort25(a); - Store((__m256i*)(dst), a[12]); - - for (size_t col = A; col < bodySize; col += A) - { - LoadBodySquare5x5(y, col, a); - PartialSort25(a); - Store((__m256i*)(dst + col), a[12]); - } - - size_t col = size - A; - LoadTailSquare5x5(y, col, a); - PartialSort25(a); - Store((__m256i*)(dst + col), a[12]); - } - } - - template void MedianFilterSquare5x5(const uint8_t * src, size_t srcStride, size_t width, size_t height, - size_t channelCount, uint8_t * dst, size_t dstStride) - { - assert(channelCount > 0 && channelCount <= 4); - - switch (channelCount) - { - case 1: MedianFilterSquare5x5(src, srcStride, width, height, dst, dstStride); break; - case 2: MedianFilterSquare5x5(src, srcStride, width, height, dst, dstStride); break; - case 3: MedianFilterSquare5x5(src, srcStride, width, height, dst, dstStride); break; - case 4: MedianFilterSquare5x5(src, srcStride, width, height, dst, dstStride); break; - } - } - - void MedianFilterSquare5x5(const uint8_t * src, size_t srcStride, size_t width, size_t height, - size_t channelCount, uint8_t * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride) && Aligned(width) && Aligned(dst) && Aligned(dstStride)) - MedianFilterSquare5x5(src, srcStride, width, height, channelCount, dst, dstStride); - else - MedianFilterSquare5x5(src, srcStride, width, height, channelCount, dst, dstStride); - } - } -#endif// SIMD_AVX2_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx2Neural.cpp b/src/3rd/Simd/Simd/SimdAvx2Neural.cpp deleted file mode 100644 index 684d1d38..00000000 --- a/src/3rd/Simd/Simd/SimdAvx2Neural.cpp +++ /dev/null @@ -1,1885 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdExtract.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdStream.h" -#include "Simd/SimdBase.h" -#include "Simd/SimdNeural.h" -#include "Simd/SimdPow.h" -#include "Simd/SimdExp.h" - -namespace Simd -{ -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - template __m128i Invert(__m128i value); - - template <> __m128i Invert(__m128i value) - { - return _mm_sub_epi8(Sse2::K_INV_ZERO, value); - } - - template <> __m128i Invert(__m128i value) - { - return value; - } - - template void Convert(const uint8_t * src, const __m256 & _1_255, float * dst) - { - __m128i _src = Invert(_mm_loadl_epi64((__m128i*)src)); - Avx::Stream(dst, _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_src)), _1_255)); - } - - template void NeuralConvert(const uint8_t * src, size_t srcStride, size_t width, size_t height, float * dst, size_t dstStride) - { - assert(width >= F); - if (align) - assert(Aligned(dst) && Aligned(dstStride)); - - size_t alignedWidth = AlignLo(width, F); - __m256 _1_255 = _mm256_set1_ps(1.0f / 255.0f); - - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += F) - Convert(src + col, _1_255, dst + col); - if (width != alignedWidth) - Convert(src + width - F, _1_255, dst + width - F); - src += srcStride; - dst += dstStride; - } - if (stream) - _mm_mfence(); - } - - template void NeuralConvert(const uint8_t * src, size_t srcStride, size_t width, size_t height, float * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride)) - { - if (width*height * sizeof(float) >= STREAM_SIZE_MIN) - NeuralConvert(src, srcStride, width, height, dst, dstStride); - else - NeuralConvert(src, srcStride, width, height, dst, dstStride); - } - else - NeuralConvert(src, srcStride, width, height, dst, dstStride); - } - - void NeuralConvert(const uint8_t * src, size_t srcStride, size_t width, size_t height, float * dst, size_t dstStride, int inversion) - { - if (inversion) - NeuralConvert(src, srcStride, width, height, dst, dstStride); - else - NeuralConvert(src, srcStride, width, height, dst, dstStride); - } - - template SIMD_INLINE void NeuralProductSum(const float * a, const float * b, size_t offset, __m256 & sum) - { - __m256 _a = Load(a + offset); - __m256 _b = Load(b + offset); - sum = _mm256_fmadd_ps(_a, _b, sum); - } - - template SIMD_INLINE void NeuralProductSum(const float * a, const float * b, size_t size, float * sum) - { - if (align) - assert(Aligned(a) && Aligned(b)); - - *sum = 0; - size_t partialAlignedSize = AlignLo(size, F); - size_t fullAlignedSize = AlignLo(size, QF); - size_t i = 0; - if (partialAlignedSize) - { - __m256 sums[4] = { _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps() }; - if (fullAlignedSize) - { - for (; i < fullAlignedSize; i += QF) - { - NeuralProductSum(a, b, i + F * 0, sums[0]); - NeuralProductSum(a, b, i + F * 1, sums[1]); - NeuralProductSum(a, b, i + F * 2, sums[2]); - NeuralProductSum(a, b, i + F * 3, sums[3]); - } - sums[0] = _mm256_add_ps(_mm256_add_ps(sums[0], sums[1]), _mm256_add_ps(sums[2], sums[3])); - } - for (; i < partialAlignedSize; i += F) - NeuralProductSum(a, b, i, sums[0]); - *sum += Avx::ExtractSum(sums[0]); - } - for (; i < size; ++i) - *sum += a[i] * b[i]; - } - - void NeuralProductSum(const float * a, const float * b, size_t size, float * sum) - { - if (Aligned(a) && Aligned(b)) - NeuralProductSum(a, b, size, sum); - else - NeuralProductSum(a, b, size, sum); - } - - template SIMD_INLINE 
void AddMultiplied(const float * src, const __m256 & value, float * dst) - { - Avx::Store(dst, _mm256_fmadd_ps(value, Load(src), Load(dst))); - } - - template SIMD_INLINE void AddMultiplied(const float * src, size_t aligned, size_t partial, size_t full, float value, float * dst) - { - size_t i = 0; - if (partial) - { - __m256 _value = _mm256_set1_ps(value); - for (; i < aligned; i += QF) - { - AddMultiplied(src + i + F * 0, _value, dst + i + 0); - AddMultiplied(src + i + F * 1, _value, dst + i + 8); - AddMultiplied(src + i + F * 2, _value, dst + i + 16); - AddMultiplied(src + i + F * 3, _value, dst + i + 24); - } - for (; i < partial; i += F) - AddMultiplied(src + i, _value, dst + i); - } - for (; i < full; ++i) - dst[i] += src[i] * value; - } - - void NeuralAddVectorMultipliedByValue(const float * src, size_t size, const float * value, float * dst) - { - size_t aligned = AlignLo(size, QF); - size_t partial = AlignLo(size, F); - if (Aligned(src) && Aligned(dst)) - AddMultiplied(src, aligned, partial, size, *value, dst); - else - AddMultiplied(src, aligned, partial, size, *value, dst); - } - - template SIMD_INLINE void NeuralRoughSigmoid2(const float * src, const __m256 & k, const __m256 & o, const __m256 & m, float * dst) - { - __m256 _src = Load(src); - __m256 e1 = _mm256_max_ps(m, _mm256_fmadd_ps(_src, k, o)); - __m256 e2 = _mm256_mul_ps(e1, e1); - __m256 e4 = _mm256_mul_ps(e2, e2); - __m256 e8 = _mm256_mul_ps(e4, e4); - __m256 e16 = _mm256_mul_ps(e8, e8); - __m256 e32 = _mm256_mul_ps(e16, e16); - __m256 e64 = _mm256_mul_ps(e32, e32); - __m256 sigmoid = _mm256_rcp_ps(_mm256_fmadd_ps(e64, e64, o)); - Avx::Store(dst, sigmoid); - } - - template SIMD_INLINE void NeuralRoughSigmoid2(const float * src, size_t size, const float * slope, float * dst) - { - size_t partialAlignedSize = Simd::AlignLo(size, F); - size_t fullAlignedSize = Simd::AlignLo(size, QF); - __m256 _k = _mm256_set1_ps(-(*slope)*0.0078125f); - __m256 _1 = _mm256_set1_ps(1.0f); - __m256 _05 = _mm256_set1_ps(0.5f); - size_t i = 0; - for (; i < fullAlignedSize; i += QF) - { - NeuralRoughSigmoid2(src + i + 0 * F, _k, _1, _05, dst + i + 0 * F); - NeuralRoughSigmoid2(src + i + 1 * F, _k, _1, _05, dst + i + 1 * F); - NeuralRoughSigmoid2(src + i + 2 * F, _k, _1, _05, dst + i + 2 * F); - NeuralRoughSigmoid2(src + i + 3 * F, _k, _1, _05, dst + i + 3 * F); - } - for (; i < partialAlignedSize; i += F) - NeuralRoughSigmoid2(src + i, _k, _1, _05, dst + i); - for (; i < size; ++i) - dst[i] = Base::RoughSigmoid2(src[i] * slope[0]); - } - - void NeuralRoughSigmoid2(const float * src, size_t size, const float * slope, float * dst) - { - if (Aligned(src) && Aligned(dst)) - NeuralRoughSigmoid2(src, size, slope, dst); - else - NeuralRoughSigmoid2(src, size, slope, dst); - } - - template void NeuralPow(const float * src, size_t size, const float * exponent, float * dst) - { - if (align) - assert(Aligned(src) && Aligned(dst)); - - float e = exponent[0]; - size_t alignedSize = AlignLo(size, F); - __m256 _e = _mm256_set1_ps(e); - Pow pow; - size_t i = 0; - for (; i < alignedSize; i += F) - Avx::Store(dst + i, pow(Avx::Load(src + i), _e)); - for (; i < size; ++i) - dst[i] = Base::Pow(src[i], e); - } - - void NeuralPow(const float * src, size_t size, const float * exponent, float * dst) - { - if (Aligned(src) && Aligned(dst)) - NeuralPow(src, size, exponent, dst); - else - NeuralPow(src, size, exponent, dst); - } - - template void NeuralAddConvolutionForward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * 
dst, size_t dstStride) - { - size_t alignedWidth = AlignLo(width, F); - __m256 tailMask = RightNotZero32f(width - alignedWidth); - __m256 _weights[coreX*coreY]; - LoadWeightsForward(weights, _weights); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < alignedWidth; col += F) - { - __m256 _dst = Avx::Load(dst + col); - _dst = _mm256_add_ps(_dst, Convolution::template Forward(src + col, srcStride, _weights)); - Avx::Store(dst + col, _dst); - } - if (width - alignedWidth) - { - size_t col = width - F; - __m256 _dst = Avx::Load(dst + col); - _dst = _mm256_add_ps(_dst, _mm256_and_ps(tailMask, Convolution::template Forward(src + col, srcStride, _weights))); - Avx::Store(dst + col, _dst); - } - src += srcStride; - dst += dstStride; - } - } - - void NeuralAddConvolution2x2Forward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride, F) && Aligned(dst) && Aligned(dstStride, F)) - NeuralAddConvolutionForward(src, srcStride, width, height, weights, dst, dstStride); - else - NeuralAddConvolutionForward(src, srcStride, width, height, weights, dst, dstStride); - } - - void NeuralAddConvolution3x3Forward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride, F) && Aligned(dst) && Aligned(dstStride, F)) - NeuralAddConvolutionForward(src, srcStride, width, height, weights, dst, dstStride); - else - NeuralAddConvolutionForward(src, srcStride, width, height, weights, dst, dstStride); - } - - void NeuralAddConvolution4x4Forward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride, F) && Aligned(dst) && Aligned(dstStride, F)) - NeuralAddConvolutionForward(src, srcStride, width, height, weights, dst, dstStride); - else - NeuralAddConvolutionForward(src, srcStride, width, height, weights, dst, dstStride); - } - - void NeuralAddConvolution5x5Forward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride, F) && Aligned(dst) && Aligned(dstStride, F)) - NeuralAddConvolutionForward(src, srcStride, width, height, weights, dst, dstStride); - else - NeuralAddConvolutionForward(src, srcStride, width, height, weights, dst, dstStride); - } - - template struct If - { - template static SIMD_INLINE void AddMultiplied(const float * src, size_t aligned, size_t partial, size_t full, float value, float * dst) - { - Avx2::AddMultiplied(src, aligned, partial, full, value, dst); - } - }; - - template<> struct If - { - template static SIMD_INLINE void AddMultiplied(const float * src, size_t aligned, size_t partial, size_t full, float value, float * dst) - { - } - }; - - template void NeuralAddConvolutionBackwardSmall(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride) - { - size_t aligned = AlignLo(width, QF); - size_t partial = AlignLo(width, F); - for (size_t row = 0; row < height; ++row) - { - for (size_t dy = 0; dy < coreY; ++dy) - { - const float * w = weights + dy * coreX; - float * d = dst + dy*dstStride; - If < 0 < coreX > ::template AddMultiplied(src, aligned, partial, width, w[0], d + 0); - If < 1 < coreX > ::template AddMultiplied(src, aligned, partial, width, w[1], d + 1); - If < 2 < coreX > 
::template AddMultiplied(src, aligned, partial, width, w[2], d + 2); - If < 3 < coreX > ::template AddMultiplied(src, aligned, partial, width, w[3], d + 3); - If < 4 < coreX > ::template AddMultiplied(src, aligned, partial, width, w[4], d + 4); - } - src += srcStride; - dst += dstStride; - } - } - - template void NeuralAddConvolutionBackwardLarge(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride) - { - ConvolutionBackwardBuffer buffer(width, F); - height += coreY - 1; - width += coreX - 1; - size_t alignedWidth = AlignLo(width, F); - __m256 tailMask = RightNotZero32f(width - alignedWidth); - __m256 _weights[coreX*coreY]; - LoadWeightsBackward(weights, _weights); - - for (size_t row = 0; row < height; ++row) - { - buffer.Update(row <= height - coreY ? src : NULL); - for (size_t col = 0; col < alignedWidth; col += F) - { - __m256 _dst = Avx::Load(dst + col); - _dst = _mm256_add_ps(_dst, Convolution::template Backward(buffer, col, _weights)); - Avx::Store(dst + col, _dst); - } - if (width - alignedWidth) - { - size_t col = width - F; - __m256 _dst = Avx::Load(dst + col); - _dst = _mm256_add_ps(_dst, _mm256_and_ps(tailMask, Convolution::template Backward(buffer, col, _weights))); - Avx::Store(dst + col, _dst); - } - src += srcStride; - dst += dstStride; - } - } - - template void NeuralAddConvolutionBackward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride) - { - if (width*height < 1024) - NeuralAddConvolutionBackwardSmall(src, srcStride, width, height, weights, dst, dstStride); - else - NeuralAddConvolutionBackwardLarge(src, srcStride, width, height, weights, dst, dstStride); - } - - void NeuralAddConvolution2x2Backward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride, F) && Aligned(dst) && Aligned(dstStride, F)) - NeuralAddConvolutionBackward(src, srcStride, width, height, weights, dst, dstStride); - else - NeuralAddConvolutionBackward(src, srcStride, width, height, weights, dst, dstStride); - } - - void NeuralAddConvolution3x3Backward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride, F) && Aligned(dst) && Aligned(dstStride, F)) - NeuralAddConvolutionBackward(src, srcStride, width, height, weights, dst, dstStride); - else - NeuralAddConvolutionBackward(src, srcStride, width, height, weights, dst, dstStride); - } - - void NeuralAddConvolution4x4Backward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride, F) && Aligned(dst) && Aligned(dstStride, F)) - NeuralAddConvolutionBackward(src, srcStride, width, height, weights, dst, dstStride); - else - NeuralAddConvolutionBackward(src, srcStride, width, height, weights, dst, dstStride); - } - - void NeuralAddConvolution5x5Backward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride, F) && Aligned(dst) && Aligned(dstStride, F)) - NeuralAddConvolutionBackward(src, srcStride, width, height, weights, dst, dstStride); - else - NeuralAddConvolutionBackward(src, srcStride, width, height, weights, dst, dstStride); - } - - template SIMD_INLINE void NeuralAddConvolutionSum(const 
float * src, size_t srcStride, const float * dst, size_t dstStride, size_t width, size_t height, float * sums) - { - size_t alignedWidth = Simd::AlignLo(width, F); - __m256 tailMask = RightNotZero32f(width - alignedWidth); - __m256 _sums[coreX*coreY]; - memset(_sums, 0, sizeof(_sums)); - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += F) - { - __m256 _dst = Avx::Load(dst + col); - Convolution::template Sum(src + col, srcStride, _dst, _sums); - } - if (alignedWidth < width) - { - size_t col = width - F; - __m256 _dst = _mm256_and_ps(tailMask, Avx::Load(dst + col)); - Convolution::template Sum(src + col, srcStride, _dst, _sums); - } - src += srcStride; - dst += dstStride; - } - size_t i = 0, n = Simd::AlignLo(coreX*coreY, F); - for (; i < n; i += F) - Add8ExtractedSums(_sums + i, sums + i); - for (; i < coreX*coreY; ++i) - sums[i] += Avx::ExtractSum(_sums[i]); - } - - void NeuralAddConvolution2x2Sum(const float * src, size_t srcStride, const float * dst, size_t dstStride, size_t width, size_t height, float * sums) - { - if (Aligned(src) && Aligned(srcStride, F) && Aligned(dst) && Aligned(dstStride, F)) - NeuralAddConvolutionSum(src, srcStride, dst, dstStride, width, height, sums); - else - NeuralAddConvolutionSum(src, srcStride, dst, dstStride, width, height, sums); - } - - void NeuralAddConvolution3x3Sum(const float * src, size_t srcStride, const float * dst, size_t dstStride, size_t width, size_t height, float * sums) - { - if (Aligned(src) && Aligned(srcStride, F) && Aligned(dst) && Aligned(dstStride, F)) - NeuralAddConvolutionSum(src, srcStride, dst, dstStride, width, height, sums); - else - NeuralAddConvolutionSum(src, srcStride, dst, dstStride, width, height, sums); - } - - void NeuralAddConvolution4x4Sum(const float * src, size_t srcStride, const float * dst, size_t dstStride, size_t width, size_t height, float * sums) - { - if (Aligned(src) && Aligned(srcStride, F) && Aligned(dst) && Aligned(dstStride, F)) - NeuralAddConvolutionSum(src, srcStride, dst, dstStride, width, height, sums); - else - NeuralAddConvolutionSum(src, srcStride, dst, dstStride, width, height, sums); - } - - void NeuralAddConvolution5x5Sum(const float * src, size_t srcStride, const float * dst, size_t dstStride, size_t width, size_t height, float * sums) - { - if (Aligned(src) && Aligned(srcStride, F) && Aligned(dst) && Aligned(dstStride, F)) - NeuralAddConvolutionSum(src, srcStride, dst, dstStride, width, height, sums); - else - NeuralAddConvolutionSum(src, srcStride, dst, dstStride, width, height, sums); - } - - template SIMD_INLINE __m256 Pooling1x1Max3x1Body(const float * src) - { - return _mm256_max_ps(_mm256_max_ps(Avx::Load(src - 1), Avx::Load(src)), Avx::Load(src + 1)); - } - - template SIMD_INLINE void Pooling1x1Max3x3Body(const float * src, size_t stride, float * dst) - { - __m256 src0 = Pooling1x1Max3x1Body(src - stride); - __m256 src1 = Pooling1x1Max3x1Body(src); - __m256 src2 = Pooling1x1Max3x1Body(src + stride); - Avx::Store(dst, _mm256_max_ps(_mm256_max_ps(src0, src1), src2)); - } - - template SIMD_INLINE void Pooling1x1Max3x2Body(const float * src, size_t stride, float * dst) - { - __m256 src0 = Pooling1x1Max3x1Body(src); - __m256 src1 = Pooling1x1Max3x1Body(src + stride); - Avx::Store(dst, _mm256_max_ps(src0, src1)); - } - - __m256i K32_PERMUTE_NOSE = SIMD_MM256_SETR_EPI32(0, 0, 1, 2, 3, 4, 5, 6); - - template SIMD_INLINE __m256 Pooling1x1Max3x1Nose(const float * src) - { - __m256 src1 = Avx::Load(src); - __m256 src0 = _mm256_permutevar8x32_ps(src1, 
K32_PERMUTE_NOSE); - __m256 src2 = Avx::Load(src + 1); - return _mm256_max_ps(_mm256_max_ps(src0, src1), src2); - } - - template SIMD_INLINE void Pooling1x1Max3x3Nose(const float * src, size_t stride, float * dst) - { - __m256 src0 = Pooling1x1Max3x1Nose(src - stride); - __m256 src1 = Pooling1x1Max3x1Nose(src); - __m256 src2 = Pooling1x1Max3x1Nose(src + stride); - Avx::Store(dst, _mm256_max_ps(_mm256_max_ps(src0, src1), src2)); - } - template SIMD_INLINE void Pooling1x1Max3x2Nose(const float * src, size_t stride, float * dst) - { - __m256 src0 = Pooling1x1Max3x1Nose(src); - __m256 src1 = Pooling1x1Max3x1Nose(src + stride); - Avx::Store(dst, _mm256_max_ps(src0, src1)); - } - - __m256i K32_PERMUTE_TAIL = SIMD_MM256_SETR_EPI32(1, 2, 3, 4, 5, 6, 7, 7); - - template SIMD_INLINE __m256 Pooling1x1Max3x1Tail(const float * src) - { - __m256 src0 = Avx::Load(src - 1); - __m256 src1 = Avx::Load(src); - __m256 src2 = _mm256_permutevar8x32_ps(src1, K32_PERMUTE_TAIL); - return _mm256_max_ps(_mm256_max_ps(src0, src1), src2); - } - - template SIMD_INLINE void Pooling1x1Max3x3Tail(const float * src, size_t stride, float * dst) - { - __m256 src0 = Pooling1x1Max3x1Tail(src - stride); - __m256 src1 = Pooling1x1Max3x1Tail(src); - __m256 src2 = Pooling1x1Max3x1Tail(src + stride); - Avx::Store(dst, _mm256_max_ps(_mm256_max_ps(src0, src1), src2)); - } - template SIMD_INLINE void Pooling1x1Max3x2Tail(const float * src, size_t stride, float * dst) - { - __m256 src0 = Pooling1x1Max3x1Tail(src); - __m256 src1 = Pooling1x1Max3x1Tail(src + stride); - Avx::Store(dst, _mm256_max_ps(src0, src1)); - } - - template void NeuralPooling1x1Max3x3(const float * src, size_t srcStride, size_t width, size_t height, float * dst, size_t dstStride) - { - assert(width > F && height > 1); - - size_t alignedWidth = AlignHi(width, F) - F; - height -= 1; - - Pooling1x1Max3x2Nose(src, srcStride, dst); - for (size_t col = F; col < alignedWidth; col += F) - Pooling1x1Max3x2Body(src + col, srcStride, dst + col); - Pooling1x1Max3x2Tail(src + width - F, srcStride, dst + width - F); - - for (size_t row = 1; row < height; ++row) - { - src += srcStride; - dst += dstStride; - Pooling1x1Max3x3Nose(src, srcStride, dst); - for (size_t col = F; col < alignedWidth; col += F) - Pooling1x1Max3x3Body(src + col, srcStride, dst + col); - Pooling1x1Max3x3Tail(src + width - F, srcStride, dst + width - F); - } - - dst += dstStride; - Pooling1x1Max3x2Nose(src, srcStride, dst); - for (size_t col = F; col < alignedWidth; col += F) - Pooling1x1Max3x2Body(src + col, srcStride, dst + col); - Pooling1x1Max3x2Tail(src + width - F, srcStride, dst + width - F); - } - - void NeuralPooling1x1Max3x3(const float * src, size_t srcStride, size_t width, size_t height, float * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride, F) && Aligned(dst) && Aligned(dstStride, F)) - NeuralPooling1x1Max3x3(src, srcStride, width, height, dst, dstStride); - else - NeuralPooling1x1Max3x3(src, srcStride, width, height, dst, dstStride); - } - - SIMD_INLINE float Max2(const float * src) - { - return Simd::Max(src[0], src[1]); - } - - SIMD_INLINE float Max2x2(const float * src, size_t stride) - { - return Simd::Max(Max2(src), Max2(src + stride)); - } - - SIMD_INLINE float Max2x3(const float * src, size_t stride) - { - return Simd::Max(Max2(src), Simd::Max(Max2(src + stride), Max2(src + 2 * stride))); - } - - template SIMD_INLINE __m256 Pooling2x2Max1x3(const float * src, size_t stride) - { - return _mm256_max_ps(_mm256_max_ps(Avx::Load(src), Avx::Load(src + stride)), Avx::Load(src 
+ 2 * stride)); - } - - SIMD_INLINE __m256 PermuteFor2x2(__m256 a) - { - return _mm256_castsi256_ps(_mm256_permute4x64_epi64(_mm256_castps_si256(a), 0xD8)); - } - - template SIMD_INLINE __m256 Pooling2x2Max3x3(const float * src, size_t stride) - { - __m256 _01234567 = Pooling2x2Max1x3(src, stride); - __m256 _89abcdef = Pooling2x2Max1x3(src + F, stride); - __m256 _456789ab = _mm256_permute2f128_ps(_01234567, _89abcdef, 0x21); - __m256 _12345678 = Alignr<1>(_01234567, _456789ab); - __m256 _9abcdefg = Pooling2x2Max1x3(src + F + 1, stride); - __m256 _028a46ce = _mm256_shuffle_ps(_01234567, _89abcdef, 0x88); - __m256 _139b57df = _mm256_shuffle_ps(_01234567, _89abcdef, 0xDD); - __m256 _24ac68eg = _mm256_shuffle_ps(_12345678, _9abcdefg, 0xDD); - return PermuteFor2x2(_mm256_max_ps(_mm256_max_ps(_028a46ce, _139b57df), _24ac68eg)); - } - - template SIMD_INLINE __m256 Pooling2x2Max1x2(const float * src, size_t stride) - { - return _mm256_max_ps(Avx::Load(src), Avx::Load(src + stride)); - } - - template SIMD_INLINE __m256 Pooling2x2Max3x2(const float * src, size_t stride) - { - __m256 _01234567 = Pooling2x2Max1x2(src, stride); - __m256 _89abcdef = Pooling2x2Max1x2(src + F, stride); - __m256 _456789ab = _mm256_permute2f128_ps(_01234567, _89abcdef, 0x21); - __m256 _12345678 = Alignr<1>(_01234567, _456789ab); - __m256 _9abcdefg = Pooling2x2Max1x2(src + F + 1, stride); - __m256 _028a46ce = _mm256_shuffle_ps(_01234567, _89abcdef, 0x88); - __m256 _139b57df = _mm256_shuffle_ps(_01234567, _89abcdef, 0xDD); - __m256 _24ac68eg = _mm256_shuffle_ps(_12345678, _9abcdefg, 0xDD); - return PermuteFor2x2(_mm256_max_ps(_mm256_max_ps(_028a46ce, _139b57df), _24ac68eg)); - } - - template void NeuralPooling2x2Max3x3(const float * src, size_t srcStride, size_t width, size_t height, float * dst, size_t dstStride) - { - height -= 1; - width -= 1; - size_t heightEven = Simd::AlignLo(height, 2); - size_t widthEven = Simd::AlignLo(width, 2); - size_t alignedWidth = AlignLo(width, DF); - for (size_t row = 0; row < heightEven; row += 2) - { - for (size_t col = 0; col < alignedWidth; col += DF) - Avx::Store(dst + (col >> 1), Pooling2x2Max3x3(src + col, srcStride)); - if (widthEven - alignedWidth) - { - size_t col = widthEven - DF; - Avx::Store(dst + (col >> 1), Pooling2x2Max3x3(src + col, srcStride)); - } - if (width - widthEven) - dst[widthEven >> 1] = Max2x3(src + widthEven, srcStride); - src += 2 * srcStride; - dst += dstStride; - } - if (height - heightEven) - { - for (size_t col = 0; col < alignedWidth; col += DF) - Avx::Store(dst + (col >> 1), Pooling2x2Max3x2(src + col, srcStride)); - if (widthEven - alignedWidth) - { - size_t col = widthEven - DF; - Avx::Store(dst + (col >> 1), Pooling2x2Max3x2(src + col, srcStride)); - } - if (width - widthEven) - dst[widthEven >> 1] = Max2x2(src + widthEven, srcStride); - } - } - - void NeuralPooling2x2Max3x3(const float * src, size_t srcStride, size_t width, size_t height, float * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride, F) && Aligned(dst) && Aligned(dstStride, F)) - NeuralPooling2x2Max3x3(src, srcStride, width, height, dst, dstStride); - else - NeuralPooling2x2Max3x3(src, srcStride, width, height, dst, dstStride); - } - - namespace Ncf - { - namespace Ver0 - { - void PrepareB(const float * src, size_t srcWidth, size_t srcHeight, size_t srcDepth, size_t kernelX, size_t kernelY, - size_t padX, size_t padY, size_t strideX, size_t strideY, size_t dilationX, size_t dilationY, size_t dstWidth, size_t dstHeight, float * dst) - { - const size_t K = 
kernelX*kernelY*srcDepth, N = dstHeight*dstWidth; - if (dilationX*dilationY*strideX*strideY != 1) - { - for (size_t dstRow = 0; dstRow < dstHeight; ++dstRow) - { - size_t srcRow0 = dstRow*strideY - padY; - for (size_t dstCol = 0; dstCol < dstWidth; ++dstCol) - { - size_t srcCol0 = dstCol*strideX - padX; - for (size_t channel = 0; channel < srcDepth; ++channel) - { - for (size_t kernelRow = 0; kernelRow < kernelY; ++kernelRow) - { - size_t srcRow = srcRow0 + kernelRow*dilationY; - if (srcRow < srcHeight) - { - const float * psrc = src + (channel*srcHeight + srcRow)*srcWidth; - for (size_t kernelCol = 0; kernelCol < kernelX; ++kernelCol) - { - size_t srcCol = srcCol0 + kernelCol*dilationX; - if (srcCol < srcWidth) - *(dst++) = psrc[srcCol]; - else - *(dst++) = 0; - } - } - else - { - for (size_t kernelCol = 0; kernelCol < kernelX; ++kernelCol) - *(dst++) = 0; - } - } - } - } - } - } - else if (kernelX*kernelY != 1) - { - for (size_t dstRow = 0; dstRow < dstHeight; ++dstRow) - { - size_t srcRow0 = dstRow - padY; - for (size_t dstCol = 0; dstCol < dstWidth; ++dstCol) - { - size_t srcCol0 = dstCol - padX; - for (size_t channel = 0; channel < srcDepth; ++channel) - { - for (size_t kernelRow = 0; kernelRow < kernelY; ++kernelRow) - { - size_t srcRow = srcRow0 + kernelRow; - if (srcRow < srcHeight) - { - const float * psrc = src + (channel*srcHeight + srcRow)*srcWidth; - for (size_t kernelCol = 0; kernelCol < kernelX; ++kernelCol) - { - size_t srcCol = srcCol0 + kernelCol; - if (srcCol < srcWidth) - *(dst++) = psrc[srcCol]; - else - *(dst++) = 0; - } - } - else - { - for (size_t kernelCol = 0; kernelCol < kernelX; ++kernelCol) - *(dst++) = 0; - } - } - } - } - } - } - else - { - for (size_t i = 0; i < N; ++i) - { - for (size_t k = 0; k < K; ++k) - *(dst++) = src[k*N + i]; - } - } - } - - template static SIMD_INLINE void Kernel1x4x8(const __m256 & a, size_t K, const float * b, __m256 * sums) - { - sums[0] = _mm256_fmadd_ps(a, Avx::Load(b + 0 * K), sums[0]); - sums[1] = _mm256_fmadd_ps(a, Avx::Load(b + 1 * K), sums[1]); - sums[2] = _mm256_fmadd_ps(a, Avx::Load(b + 2 * K), sums[2]); - sums[3] = _mm256_fmadd_ps(a, Avx::Load(b + 3 * K), sums[3]); - } - - template static SIMD_INLINE void Kernel1x1x8(const __m256 & a, const float * b, __m256 & sum) - { - sum = _mm256_fmadd_ps(a, Avx::Load(b), sum); - } - - SIMD_INLINE void Add4ExtractedSums(const __m256 * src, float * dst) - { - __m256 sum256 = _mm256_hadd_ps(_mm256_hadd_ps(src[0], src[1]), _mm256_hadd_ps(src[2], src[3])); - __m128 sum128 = _mm_add_ps(_mm256_extractf128_ps(sum256, 0), _mm256_extractf128_ps(sum256, 1)); - _mm_storeu_ps(dst, _mm_add_ps(_mm_loadu_ps(dst), sum128)); - } - - template static SIMD_INLINE void Kernel3x4x8(const __m256 * a, size_t K, const float * b, __m256 * sums) - { - __m256 _b; - _b = Avx::Load(b + 0 * K); - sums[0x0] = _mm256_fmadd_ps(a[0], _b, sums[0x0]); - sums[0x4] = _mm256_fmadd_ps(a[1], _b, sums[0x4]); - sums[0x8] = _mm256_fmadd_ps(a[2], _b, sums[0x8]); - _b = Avx::Load(b + 1 * K); - sums[0x1] = _mm256_fmadd_ps(a[0], _b, sums[0x1]); - sums[0x5] = _mm256_fmadd_ps(a[1], _b, sums[0x5]); - sums[0x9] = _mm256_fmadd_ps(a[2], _b, sums[0x9]); - _b = Avx::Load(b + 2 * K); - sums[0x2] = _mm256_fmadd_ps(a[0], _b, sums[0x2]); - sums[0x6] = _mm256_fmadd_ps(a[1], _b, sums[0x6]); - sums[0xA] = _mm256_fmadd_ps(a[2], _b, sums[0xA]); - _b = Avx::Load(b + 3 * K); - sums[0x3] = _mm256_fmadd_ps(a[0], _b, sums[0x3]); - sums[0x7] = _mm256_fmadd_ps(a[1], _b, sums[0x7]); - sums[0xB] = _mm256_fmadd_ps(a[2], _b, sums[0xB]); - } - - template 
static SIMD_INLINE void Kernel3x1x8(const __m256 * a, const float * b, __m256 * sums) - { - __m256 _b = Avx::Load(b); - sums[0x0] = _mm256_fmadd_ps(a[0], _b, sums[0x0]); - sums[0x1] = _mm256_fmadd_ps(a[1], _b, sums[0x1]); - sums[0x2] = _mm256_fmadd_ps(a[2], _b, sums[0x2]); - } - - template void Execute(size_t M, size_t N, size_t K, const float * a, const float * b, float * c) - { - size_t M3 = M/3*3; - size_t N4 = Simd::AlignLo(N, 4); - size_t K8 = Simd::AlignLo(K, 8); - __m256 tailMask = RightNotZero32f(K - K8); - size_t i = 0; - for (; i < M3; i += 3) - { - const float * pa = a + i * K; - float * pc = c + i * N; - size_t j = 0; - for (; j < N4; j += 4) - { - const float * pb = b + j * K; - __m256 sums[12] = { - _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps(), - _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps(), - _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps() }; - __m256 _a[3]; - for (size_t k = 0; k < K8; k += 8) - { - _a[0] = Avx::Load(pa + k + 0 * K); - _a[1] = Avx::Load(pa + k + 1 * K); - _a[2] = Avx::Load(pa + k + 2 * K); - Kernel3x4x8(_a, K, pb + k, sums); - } - if (K8 < K) - { - size_t k = K - 8; - _a[0] = _mm256_and_ps(tailMask, Avx::Load(pa + k + 0 * K)); - _a[1] = _mm256_and_ps(tailMask, Avx::Load(pa + k + 1 * K)); - _a[2] = _mm256_and_ps(tailMask, Avx::Load(pa + k + 2 * K)); - Kernel3x4x8(_a, K, pb + k, sums); - } - Add4ExtractedSums(sums + 0, pc + j + 0 * N); - Add4ExtractedSums(sums + 4, pc + j + 1 * N); - Add4ExtractedSums(sums + 8, pc + j + 2 * N); - } - for (; j < N; ++j) - { - const float * pb = b + j * K; - __m256 sums[3] = { _mm256_setzero_ps(), _mm256_setzero_ps() , _mm256_setzero_ps() }; - __m256 _a[3]; - for (size_t k = 0; k < K8; k += 8) - { - _a[0] = Avx::Load(pa + k + 0 * K); - _a[1] = Avx::Load(pa + k + 1 * K); - _a[2] = Avx::Load(pa + k + 2 * K); - Kernel3x1x8(_a, pb + k, sums); - } - if (K8 < K) - { - size_t k = K - 8; - _a[0] = _mm256_and_ps(tailMask, Avx::Load(pa + k + 0 * K)); - _a[1] = _mm256_and_ps(tailMask, Avx::Load(pa + k + 1 * K)); - _a[2] = _mm256_and_ps(tailMask, Avx::Load(pa + k + 2 * K)); - Kernel3x1x8(_a, pb + k, sums); - } - pc[j + 0 * N] += Avx::ExtractSum(sums[0]); - pc[j + 1 * N] += Avx::ExtractSum(sums[1]); - pc[j + 2 * N] += Avx::ExtractSum(sums[2]); - } - } - for (; i < M; ++i) - { - const float * pa = a + i*K; - float * pc = c + i*N; - size_t j = 0; - for (; j < N4; j += 4) - { - const float * pb = b + j*K; - __m256 sums[4] = { _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps() }; - for (size_t k = 0; k < K8; k += 8) - { - __m256 _a = Avx::Load(pa + k); - Kernel1x4x8(_a, K, pb + k, sums); - } - if (K8 < K) - { - size_t k = K - 8; - __m256 _a = _mm256_and_ps(tailMask, Avx::Load(pa + k)); - Kernel1x4x8(_a, K, pb + k, sums); - } - Add4ExtractedSums(sums + 0, pc + j); - } - for (; j < N; ++j) - { - const float * pb = b + j*K; - __m256 sum = _mm256_setzero_ps(); - for (size_t k = 0; k < K8; k += 8) - { - __m256 _a = Avx::Load(pa + k); - Kernel1x1x8(_a, pb + k, sum); - } - if (K8 < K) - { - size_t k = K - 8; - __m256 _a = _mm256_and_ps(tailMask, Avx::Load(pa + k)); - Kernel1x1x8(_a, pb + k, sum); - } - pc[j] += Avx::ExtractSum(sum); - } - } - } - - void Execute(size_t M, size_t N, size_t K, const float * a, const float * b, float * c) - { - if (Aligned(K, F)) - Execute(M, N, K, a, b, c); - else - Execute(M, N, K, a, b, c); - } - } - - namespace Ver1 - { - void PrepareA(const float * src, size_t M, size_t K, 
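/* [editor's note] `cell` below is the number of rows of A packed into one panel (4 in
   this backend): PrepareA interleaves each panel so that, for every k, the four values
   a(i..i+3, k) are stored contiguously, which is exactly the order the Ver1 kernels
   (Kernel4x8/4x16/4x24) consume them, one _mm256_set1_ps broadcast per value. */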
size_t cell, float * dst) - { - size_t K4 = AlignLo(K, 4), K8 = AlignLo(K, 8); - for (size_t i = 0; i < M; i += cell) - { - size_t n = Simd::Min(cell, M - i), k = 0; - if (cell == 4 && n == 4) - { - for (; k < K8; k += 8) - { - const float * ps = src + k; - __m256 s0 = Avx::Load(ps + 0 * K); - __m256 s1 = Avx::Load(ps + 1 * K); - __m256 s2 = Avx::Load(ps + 2 * K); - __m256 s3 = Avx::Load(ps + 3 * K); - __m256 s00 = _mm256_unpacklo_ps(s0, s2); - __m256 s01 = _mm256_unpacklo_ps(s1, s3); - __m256 s10 = _mm256_unpackhi_ps(s0, s2); - __m256 s11 = _mm256_unpackhi_ps(s1, s3); - __m256 d0 = _mm256_unpacklo_ps(s00, s01); - __m256 d1 = _mm256_unpackhi_ps(s00, s01); - __m256 d2 = _mm256_unpacklo_ps(s10, s11); - __m256 d3 = _mm256_unpackhi_ps(s10, s11); - Avx::Store(dst + 0, _mm256_permute2f128_ps(d0, d1, 0x20)); - Avx::Store(dst + 8, _mm256_permute2f128_ps(d2, d3, 0x20)); - Avx::Store(dst + 16, _mm256_permute2f128_ps(d0, d1, 0x31)); - Avx::Store(dst + 24, _mm256_permute2f128_ps(d2, d3, 0x31)); - dst += 32; - } - for (; k < K4; k += 4) - { - const float * ps = src + k; - __m128 s0 = Sse::Load(ps + 0 * K); - __m128 s1 = Sse::Load(ps + 1 * K); - __m128 s2 = Sse::Load(ps + 2 * K); - __m128 s3 = Sse::Load(ps + 3 * K); - __m128 s00 = _mm_unpacklo_ps(s0, s2); - __m128 s01 = _mm_unpacklo_ps(s1, s3); - __m128 s10 = _mm_unpackhi_ps(s0, s2); - __m128 s11 = _mm_unpackhi_ps(s1, s3); - Sse::Store(dst + 0, _mm_unpacklo_ps(s00, s01)); - Sse::Store(dst + 4, _mm_unpackhi_ps(s00, s01)); - Sse::Store(dst + 8, _mm_unpacklo_ps(s10, s11)); - Sse::Store(dst + 12, _mm_unpackhi_ps(s10, s11)); - dst += 16; - } - } - for (; k < K; ++k) - { - for (size_t c = 0; c < n; ++c) - *(dst++) = src[c*K + k]; - } - src += cell*K; - } - } - - void PrepareB(const float * src, size_t srcWidth, size_t srcHeight, size_t srcDepth, size_t kernelX, size_t kernelY, size_t padX, size_t padY, - size_t strideX, size_t strideY, size_t dilationX, size_t dilationY, size_t dstWidth, size_t dstHeight, size_t cell, float * tmp, float * dst) - { - const size_t K = kernelX*kernelY*srcDepth, N = dstHeight*dstWidth; - if (kernelX*kernelY != 1) - { - float * dst = tmp; - size_t channelSize = srcHeight * srcWidth; - if (dilationX*dilationY*strideX*strideY != 1) - { - for (size_t channel = 0, k = 0; channel < srcDepth; ++channel, src += channelSize) - { - for (size_t kernelRow = 0; kernelRow < kernelY; ++kernelRow) - { - for (size_t kernelCol = 0; kernelCol < kernelX; ++kernelCol, ++k) - { - size_t srcRow = kernelRow*dilationY - padY; - for (size_t dstRow = 0; dstRow < dstHeight; ++dstRow) - { - if (srcRow < srcHeight) - { - size_t srcCol = kernelCol*dilationX - padX; - for (size_t dstCol = 0; dstCol < dstWidth; ++dstCol) - { - if (srcCol < srcWidth) - *(dst++) = src[srcRow*srcWidth + srcCol]; - else - *(dst++) = 0; - srcCol += strideX; - } - } - else - { - for (size_t dstCol = 0; dstCol < dstWidth; ++dstCol) - *(dst++) = 0; - } - srcRow += strideY; - } - } - } - } - } - else - { - const size_t bodySize = dstWidth - padX * 2; - for (size_t channel = 0, k = 0; channel < srcDepth; ++channel, src += channelSize) - { - for (size_t kernelRow = 0; kernelRow < kernelY; ++kernelRow) - { - for (size_t kernelCol = 0; kernelCol < kernelX; ++kernelCol, ++k) - { - size_t srcRow = kernelRow - padY; - for (size_t dstRow = 0; dstRow < dstHeight; ++dstRow, ++srcRow) - { - if (srcRow < srcHeight) - { - size_t srcCol = kernelCol - padX, dstCol = 0; - const float * psrc = src + srcRow*srcWidth; - for (; dstCol < padX; ++dstCol, ++srcCol) - { - if (srcCol < srcWidth) - *(dst++) = 
psrc[srcCol]; - else - *(dst++) = 0; - } - memcpy(dst, psrc + srcCol, bodySize * 4); - dst += bodySize; - dstCol += bodySize; - srcCol += bodySize; - for (; dstCol < dstWidth; ++dstCol, ++srcCol) - { - if (srcCol < srcWidth) - *(dst++) = psrc[srcCol]; - else - *(dst++) = 0; - } - } - else - { - memset(dst, 0, dstWidth * 4); - dst += dstWidth; - } - } - } - } - } - } - src = tmp; - } - if (cell == 24) - { - for (size_t j = 0; j < N; j += cell) - { - size_t n = Simd::Min(cell, N - j); - if (n == cell) - { - for (size_t k = 0; k < K; ++k) - { - const float * psrc = src + k * N; - Avx::Store(dst + 0x00, Avx::Load(psrc + 0x00)); - Avx::Store(dst + 0x08, Avx::Load(psrc + 0x08)); - Avx::Store(dst + 0x10, Avx::Load(psrc + 0x10)); - dst += 24; - } - } - else - { - for (size_t k = 0; k < K; ++k) - { - const float * psrc = src + k * N; - size_t c = 0; - for (; c < n; ++c) - *(dst++) = *(psrc++); - for (; c < cell; ++c) - *(dst++) = 0; - } - } - src += cell; - } - } - else if (cell == 16) - { - for (size_t j = 0; j < N; j += cell) - { - size_t n = Simd::Min(cell, N - j); - if (n == cell) - { - for (size_t k = 0; k < K; ++k) - { - const float * psrc = src + k*N; - Avx::Store(dst + 0, Avx::Load(psrc + 0)); - Avx::Store(dst + 8, Avx::Load(psrc + 8)); - dst += 16; - } - } - else - { - for (size_t k = 0; k < K; ++k) - { - const float * psrc = src + k*N; - size_t c = 0; - for (; c < n; ++c) - *(dst++) = *(psrc++); - for (; c < cell; ++c) - *(dst++) = 0; - } - } - src += cell; - } - } - else - { - for (size_t j = 0; j < N; j += cell) - { - size_t n = Simd::Min(cell, N - j); - for (size_t k = 0; k < K; ++k) - { - const float * psrc = src + k*N; - size_t c = 0; - for (; c < n; ++c) - *(dst++) = *(psrc++); - for (; c < cell; ++c) - *(dst++) = 0; - } - src += cell; - } - } - } - - SIMD_INLINE void AddSum(__m256 sum, float * dst) - { - Avx::Store(dst, _mm256_add_ps(Load(dst), sum)); - } - - SIMD_INLINE void AddSums8(const __m256 * sums, size_t size, const float * mask, float * dst, size_t stride) - { - if (mask) - { - __m256 _mask = _mm256_loadu_ps(mask); - for (size_t i = 0; i < size; ++i, dst += stride) - AddSum(_mm256_and_ps(_mask, sums[i]), dst); - } - else - { - for (size_t i = 0; i < size; ++i, dst += stride) - AddSum(sums[i], dst); - } - } - - template SIMD_INLINE void KernelMx8(size_t N, size_t K, const float * a, const float * b, float * c, const float * mask, size_t m) - { - __m256 sums[4] = { _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps() }; - for (size_t k = 0; k < K; ++k) - { - __m256 b0 = Load(b); - for (size_t s = 0; s < m; ++s) - sums[s] = _mm256_fmadd_ps(_mm256_set1_ps(a[s]), b0, sums[s]); - b += 8; - a += m; - } - AddSums8(sums, m, mask, c, N); - } - - template SIMD_INLINE void Kernel4x8(size_t N, size_t K, const float * a, const float * b, float * c, const float * mask) - { - __m256 sums[4] = { _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps() }; - for (size_t k = 0; k < K; ++k) - { - __m256 b0 = Load(b); - sums[0] = _mm256_fmadd_ps(_mm256_set1_ps(a[0]), b0, sums[0]); - sums[1] = _mm256_fmadd_ps(_mm256_set1_ps(a[1]), b0, sums[1]); - sums[2] = _mm256_fmadd_ps(_mm256_set1_ps(a[2]), b0, sums[2]); - sums[3] = _mm256_fmadd_ps(_mm256_set1_ps(a[3]), b0, sums[3]); - b += 8; - a += 4; - } - AddSums8(sums, 4, mask, c, N); - } - - template void Execute4x8(size_t M, size_t N, size_t K, const float * a, const float * b, float * c) - { - size_t M4 = Simd::AlignLo(M, 4); - size_t N8 = Simd::AlignLo(N, 8); - const int32_t mask[16] = { -1, -1, 
-1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0 }; - const float * tail = (float*)mask + 8 - N + N8; - size_t i = 0; - for (; i < M4; i += 4) - { - size_t j = 0; - for (; j < N8; j += 8) - Kernel4x8(N, K, a + i*K, b + j*K, c + i*N + j, NULL); - if (N8 < N) - Kernel4x8(N, K, a + i*K, b + j*K, c + i*N + j, tail); - } - if (M4 < M) - { - size_t j = 0; - for (; j < N8; j += 8) - KernelMx8(N, K, a + i*K, b + j*K, c + i*N + j, NULL, M - M4); - if (N8 < N) - KernelMx8(N, K, a + i*K, b + j*K, c + i*N + j, tail, M - M4); - } - } - - SIMD_INLINE void AddSums16(const __m256 * sums, size_t size, const float * mask, float * dst, size_t stride) - { - if (mask) - { - __m256 mask0 = _mm256_loadu_ps(mask + 0); - __m256 mask1 = _mm256_loadu_ps(mask + 8); - for (size_t i = 0; i < size; ++i, dst += stride) - { - AddSum(_mm256_and_ps(mask0, sums[i + 0]), dst + 0); - AddSum(_mm256_and_ps(mask1, sums[i + 4]), dst + 8); - } - } - else - { - for (size_t i = 0; i < size; ++i, dst += stride) - { - AddSum(sums[i + 0], dst + 0); - AddSum(sums[i + 4], dst + 8); - } - } - } - - template SIMD_INLINE void KernelMx16(size_t N, size_t K, const float * a, const float * b, float * c, const float * mask, size_t m) - { - __m256 sums[8] = { _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps(), - _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps() }; - for (size_t k = 0; k < K; ++k) - { - __m256 b0 = Load(b + 0); - __m256 b1 = Load(b + 8); - for (size_t s = 0; s < m; ++s) - { - __m256 a0 = _mm256_set1_ps(a[s]); - sums[s + 0] = _mm256_fmadd_ps(b0, a0, sums[s + 0]); - sums[s + 4] = _mm256_fmadd_ps(b1, a0, sums[s + 4]); - } - b += 16; - a += m; - } - AddSums16(sums, m, mask, c, N); - } - - template SIMD_INLINE void Kernel4x16(size_t N, size_t K, const float * a, const float * b, float * c, const float * mask) - { - __m256 sums[8] = { _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps(), - _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps() }; - for (size_t k = 0; k < K; ++k) - { - __m256 b0 = Load(b + 0); - __m256 b1 = Load(b + 8); - __m256 a0 = _mm256_set1_ps(a[0]); - sums[0] = _mm256_fmadd_ps(b0, a0, sums[0]); - sums[4] = _mm256_fmadd_ps(b1, a0, sums[4]); - __m256 a1 = _mm256_set1_ps(a[1]); - sums[1] = _mm256_fmadd_ps(b0, a1, sums[1]); - sums[5] = _mm256_fmadd_ps(b1, a1, sums[5]); - __m256 a2 = _mm256_set1_ps(a[2]); - sums[2] = _mm256_fmadd_ps(b0, a2, sums[2]); - sums[6] = _mm256_fmadd_ps(b1, a2, sums[6]); - __m256 a3 = _mm256_set1_ps(a[3]); - sums[3] = _mm256_fmadd_ps(b0, a3, sums[3]); - sums[7] = _mm256_fmadd_ps(b1, a3, sums[7]); - b += 16; - a += 4; - } - AddSums16(sums, 4, mask, c, N); - } - - template void Execute4x16(size_t M, size_t N, size_t K, const float * a, const float * b, float * c) - { - size_t M4 = Simd::AlignLo(M, 4); - size_t N16 = Simd::AlignLo(N, 16); - const int32_t mask[32] = { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; - const float * tail = (float*)mask + 16 - N + N16; - size_t i = 0; - for (; i < M4; i += 4) - { - size_t j = 0; - for (; j < N16; j += 16) - Kernel4x16(N, K, a + i*K, b + j*K, c + i*N + j, NULL); - if (N16 < N) - Kernel4x16(N, K, a + i*K, b + j*K, c + i*N + j, tail); - } - if (M4 < M) - { - size_t j = 0; - for (; j < N16; j += 16) - KernelMx16(N, K, a + i*K, b + j*K, c + i*N + j, NULL, M - M4); - if (N16 < N) - KernelMx16(N, K, a + i*K, b + j*K, c + i*N + j, tail, M - M4); - } - } - - SIMD_INLINE void 
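/* [editor's note] The 24-column variant that follows mirrors AddSums8/AddSums16 above:
   24 floats span three __m256 registers (3 * F with F == 8), so a 4 x 24 tile of C needs
   the twelve accumulators used by KernelMx24/Kernel4x24, with sums[i + 0], sums[i + 4]
   and sums[i + 8] holding the three octets of output row i. */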
AddSums24(const __m256 * sums, size_t size, const float * mask, float * dst, size_t stride) - { - if (mask) - { - __m256 mask0 = _mm256_loadu_ps(mask + 0 * F); - __m256 mask1 = _mm256_loadu_ps(mask + 1 * F); - __m256 mask2 = _mm256_loadu_ps(mask + 2 * F); - for (size_t i = 0; i < size; ++i, dst += stride) - { - AddSum(_mm256_and_ps(mask0, sums[i + 0]), dst + 0 * F); - AddSum(_mm256_and_ps(mask1, sums[i + 4]), dst + 1 * F); - AddSum(_mm256_and_ps(mask2, sums[i + 8]), dst + 2 * F); - } - } - else - { - for (size_t i = 0; i < size; ++i, dst += stride) - { - AddSum(sums[i + 0], dst + 0 * F); - AddSum(sums[i + 4], dst + 1 * F); - AddSum(sums[i + 8], dst + 2 * F); - } - } - } - - template SIMD_INLINE void KernelMx24(size_t N, size_t K, const float * a, const float * b, float * c, const float * mask, size_t m) - { - __m256 sums[12] = { - _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps(), - _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps(), - _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps() }; - __m256 _b[3]; - for (size_t k = 0; k < K; ++k) - { - _b[0] = Load(b + 0 * F); - _b[1] = Load(b + 1 * F); - _b[2] = Load(b + 2 * F); - for (size_t s = 0; s < m; ++s) - { - __m256 _a = _mm256_set1_ps(a[s]); - sums[s + 0] = _mm256_fmadd_ps(_b[0], _a, sums[s + 0]); - sums[s + 4] = _mm256_fmadd_ps(_b[1], _a, sums[s + 4]); - sums[s + 8] = _mm256_fmadd_ps(_b[2], _a, sums[s + 8]); - } - b += 24; - a += m; - } - AddSums24(sums, m, mask, c, N); - } - - void Kernel4x24(size_t N, size_t K, const float * a, const float * b, float * c) - { - __m256 _a, b0, b1, b2, c00, c01, c02, c10, c11, c12, c20, c21, c22, c30, c31, c32; - - c00 = _mm256_setzero_ps(); - c01 = _mm256_setzero_ps(); - c02 = _mm256_setzero_ps(); - c10 = _mm256_setzero_ps(); - c11 = _mm256_setzero_ps(); - c12 = _mm256_setzero_ps(); - c20 = _mm256_setzero_ps(); - c21 = _mm256_setzero_ps(); - c22 = _mm256_setzero_ps(); - c30 = _mm256_setzero_ps(); - c31 = _mm256_setzero_ps(); - c32 = _mm256_setzero_ps(); - - for (size_t k = 0; k < K; ++k) - { - b0 = _mm256_loadu_ps(b + 0 * F); - b1 = _mm256_loadu_ps(b + 1 * F); - b2 = _mm256_loadu_ps(b + 2 * F); - _a = _mm256_set1_ps(a[0]); - c00 = _mm256_fmadd_ps(b0, _a, c00); - c01 = _mm256_fmadd_ps(b1, _a, c01); - c02 = _mm256_fmadd_ps(b2, _a, c02); - _a = _mm256_set1_ps(a[1]); - c10 = _mm256_fmadd_ps(b0, _a, c10); - c11 = _mm256_fmadd_ps(b1, _a, c11); - c12 = _mm256_fmadd_ps(b2, _a, c12); - _a = _mm256_set1_ps(a[2]); - c20 = _mm256_fmadd_ps(b0, _a, c20); - c21 = _mm256_fmadd_ps(b1, _a, c21); - c22 = _mm256_fmadd_ps(b2, _a, c22); - _a = _mm256_set1_ps(a[3]); - c30 = _mm256_fmadd_ps(b0, _a, c30); - c31 = _mm256_fmadd_ps(b1, _a, c31); - c32 = _mm256_fmadd_ps(b2, _a, c32); - b += 24; - a += 4; - } - - AddSum(c00, c + 0 * F); - AddSum(c01, c + 1 * F); - AddSum(c02, c + 2 * F); - c += N; - AddSum(c10, c + 0 * F); - AddSum(c11, c + 1 * F); - AddSum(c12, c + 2 * F); - c += N; - AddSum(c20, c + 0 * F); - AddSum(c21, c + 1 * F); - AddSum(c22, c + 2 * F); - c += N; - AddSum(c30, c + 0 * F); - AddSum(c31, c + 1 * F); - AddSum(c32, c + 2 * F); - } - - template void Execute4x24(size_t M, size_t N, size_t K, const float * a, const float * b, float * c) - { - size_t M4 = Simd::AlignLo(M, 4); - size_t N24 = N/24*24; - const int32_t mask[48] = { - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; - const float * tail = 
(float*)mask + 24 - N + N24; - if (M > N) - { - size_t i = 0; - for (; i < M4; i += 4) - { - size_t j = 0; - for (; j < N24; j += 24) - Kernel4x24(N, K, a + i * K, b + j * K, c + i * N + j); - if (N24 < N) - KernelMx24(N, K, a + i * K, b + j * K, c + i * N + j, tail, 4); - } - if (M4 < M) - { - size_t j = 0; - for (; j < N24; j += 24) - KernelMx24(N, K, a + i * K, b + j * K, c + i * N + j, NULL, M - M4); - if (N24 < N) - KernelMx24(N, K, a + i * K, b + j * K, c + i * N + j, tail, M - M4); - } - } - else - { - size_t j = 0; - for (; j < N24; j += 24) - { - size_t i = 0; - for (; i < M4; i += 4) - Kernel4x24(N, K, a + i * K, b + j * K, c + i * N + j); - if (M4 < M) - KernelMx24(N, K, a + i * K, b + j * K, c + i * N + j, NULL, M - M4); - } - if (N24 < N) - { - size_t i = 0; - for (; i < M4; i += 4) - KernelMx24(N, K, a + i * K, b + j * K, c + i * N + j, tail, 4); - if (M4 < M) - KernelMx24(N, K, a + i * K, b + j * K, c + i * N + j, tail, M - M4); - } - } - } - - void Execute(size_t M, size_t N, size_t K, const float * a, const float * b, float * c, size_t cellA, size_t cellB) - { - if (cellA == 4) - { - if (cellB == 8) - Execute4x8(M, N, K, a, b, c); - if (cellB == 16) - Execute4x16(M, N, K, a, b, c); - if (cellB == 24) - Execute4x24(M, N, K, a, b, c); - } - } - } - - namespace Ver2 - { - void PrepareB(const float * src, size_t srcWidth, size_t srcHeight, size_t srcDepth, size_t padX, size_t padY, float * dst, size_t dstWidth, size_t dstHeight) - { - for (size_t channel = 0; channel < srcDepth; ++channel) - { - const float * s = src; - float * d = dst; - memset(d, 0, padY*dstWidth * 4); - d += padY*dstWidth; - for (size_t row = padY; row < dstHeight - padY; ++row) - { - memset(d, 0, padX * 4); - memcpy(d + padX, s, srcWidth * 4); - memset(d + padX + srcWidth, 0, padX * 4); - d += dstWidth; - s += srcWidth; - } - memset(d, 0, padY*dstWidth * 4); - src += srcWidth*srcHeight; - dst += dstWidth*dstHeight; - } - } - - template void AddConvolution8x8(const float * src, size_t srcWidth, size_t srcHeight, size_t srcDepth, - const float * weight, float * dst, size_t dstDepth) - { - for (size_t dstChannel = 0; dstChannel < dstDepth; ++dstChannel) - { - __m256 _dst[8]; - float * pdst = dst; - for (size_t row = 0; row < 8; ++row, pdst += 8) - _dst[row] = Avx::Load(pdst); - if (kernelY < 4) - { - __m256 _weight[kernelX*kernelY]; - for (size_t srcChannel = 0; srcChannel < srcDepth; ++srcChannel) - { - const float * psrc = src + srcWidth*srcHeight*srcChannel; - LoadWeightsForward(weight, _weight); - for (size_t row = 0; row < 8; ++row) - { - _dst[row] = _mm256_add_ps(_dst[row], Convolution::template Forward(psrc, srcWidth, _weight)); - psrc += srcWidth; - } - weight += kernelX*kernelY; - } - } - else - { - __m256 _weight[kernelX]; - for (size_t srcChannel = 0; srcChannel < srcDepth; ++srcChannel) - { - const float * psrc = src + srcWidth*srcHeight*srcChannel; - for (size_t dy = 0; dy < kernelY; dy++) - { - const float * ps = psrc + dy*srcWidth; - LoadWeightsForward(weight, _weight); - for (size_t row = 0; row < 8; ++row) - { - _dst[row] = _mm256_add_ps(_dst[row], Convolution::template RowConvolution(ps, _weight)); - ps += srcWidth; - } - weight += kernelX; - } - } - } - for (size_t row = 0; row < 8; ++row, dst += 8) - Avx::Store(dst, _dst[row]); - } - } - - template void AddConvolution(const float * src, size_t srcWidth, size_t srcHeight, size_t srcDepth, - const float * weight, float * dst, size_t dstWidth, size_t dstHeight, size_t dstDepth) - { - if (dstWidth == 8 && dstHeight == 8) - { - 
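/* [editor's note] Fast path for an 8x8 output plane: 64 floats fit in eight __m256
   registers, so AddConvolution8x8 keeps the whole plane of one dstChannel in registers
   across the entire srcDepth loop and touches dst memory only once per channel.
   Sketch of the flow, as read from the code above: load dst[0..63] into _dst[0..7];
   for each srcChannel accumulate the kernelY x kernelX window products via the
   Convolution<...>::Forward helpers; store _dst back. */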
AddConvolution8x8(src, srcWidth, srcHeight, srcDepth, weight, dst, dstDepth); - return; - } - size_t alignedWidth = AlignLo(dstWidth, F); - __m256 tailMask = RightNotZero32f(dstWidth - alignedWidth); - __m256 _weight[kernelX*kernelY]; - for (size_t dstChannel = 0; dstChannel < dstDepth; ++dstChannel) - { - for (size_t srcChannel = 0; srcChannel < srcDepth; ++srcChannel) - { - const float * psrc = src + srcWidth*srcHeight*srcChannel; - const float * pweight = weight + (dstChannel*srcDepth + srcChannel)*kernelX*kernelY; - float * pdst = dst + dstWidth*dstHeight*dstChannel; - LoadWeightsForward(pweight, _weight); - for (size_t row = 0; row < dstHeight; ++row) - { - size_t col = 0; - for (; col < alignedWidth; col += F) - { - __m256 _dst = Load(pdst + col); - _dst = _mm256_add_ps(_dst, Convolution::template Forward(psrc + col, srcWidth, _weight)); - Avx::Store(pdst + col, _dst); - } - if (dstWidth - alignedWidth) - { - size_t col = dstWidth - F; - __m256 _dst = Load(pdst + col); - _dst = _mm256_add_ps(_dst, _mm256_and_ps(tailMask, Convolution::template Forward(psrc + col, srcWidth, _weight))); - Avx::Store(pdst + col, _dst); - } - psrc += srcWidth; - pdst += dstWidth; - } - } - } - } - - void AddConvolution1x1x16(const float * src, size_t srcDepth, const float * weight, float * dst, size_t dstDepth) - { - size_t dstDepth4 = dstDepth/4*4; - size_t dstChannel = 0; - for (; dstChannel < dstDepth4; dstChannel += 4) - { - __m256 dst00 = _mm256_loadu_ps(dst + 0 * F); - __m256 dst01 = _mm256_loadu_ps(dst + 1 * F); - __m256 dst10 = _mm256_loadu_ps(dst + 2 * F); - __m256 dst11 = _mm256_loadu_ps(dst + 3 * F); - __m256 dst20 = _mm256_loadu_ps(dst + 4 * F); - __m256 dst21 = _mm256_loadu_ps(dst + 5 * F); - __m256 dst30 = _mm256_loadu_ps(dst + 6 * F); - __m256 dst31 = _mm256_loadu_ps(dst + 7 * F); - const float * psrc = src; - const float * pw0 = weight; - const float * pw1 = pw0 + srcDepth; - const float * pw2 = pw1 + srcDepth; - const float * pw3 = pw2 + srcDepth; - for (size_t srcChannel = 0; srcChannel < srcDepth; ++srcChannel) - { - __m256 _weight; - __m256 src0 = _mm256_loadu_ps(psrc + 0 * F); - __m256 src1 = _mm256_loadu_ps(psrc + 1 * F); - _weight = _mm256_set1_ps(pw0[srcChannel]); - dst00 = _mm256_fmadd_ps(_weight, src0, dst00); - dst01 = _mm256_fmadd_ps(_weight, src1, dst01); - _weight = _mm256_set1_ps(pw1[srcChannel]); - dst10 = _mm256_fmadd_ps(_weight, src0, dst10); - dst11 = _mm256_fmadd_ps(_weight, src1, dst11); - _weight = _mm256_set1_ps(pw2[srcChannel]); - dst20 = _mm256_fmadd_ps(_weight, src0, dst20); - dst21 = _mm256_fmadd_ps(_weight, src1, dst21); - _weight = _mm256_set1_ps(pw3[srcChannel]); - dst30 = _mm256_fmadd_ps(_weight, src0, dst30); - dst31 = _mm256_fmadd_ps(_weight, src1, dst31); - psrc += 16; - } - _mm256_storeu_ps(dst + 0 * F, dst00); - _mm256_storeu_ps(dst + 1 * F, dst01); - _mm256_storeu_ps(dst + 2 * F, dst10); - _mm256_storeu_ps(dst + 3 * F, dst11); - _mm256_storeu_ps(dst + 4 * F, dst20); - _mm256_storeu_ps(dst + 5 * F, dst21); - _mm256_storeu_ps(dst + 6 * F, dst30); - _mm256_storeu_ps(dst + 7 * F, dst31); - dst += 16*4; - weight += srcDepth * 4; - } - for (; dstChannel < dstDepth; ++dstChannel) - { - __m256 dst0 = _mm256_loadu_ps(dst + 0 * F); - __m256 dst1 = _mm256_loadu_ps(dst + 1 * F); - const float * psrc = src; - for (size_t srcChannel = 0; srcChannel < srcDepth; ++srcChannel) - { - __m256 weight0 = _mm256_set1_ps(*weight++); - dst0 = _mm256_fmadd_ps(weight0, _mm256_loadu_ps(psrc + 0 * F), dst0); - dst1 = _mm256_fmadd_ps(weight0, _mm256_loadu_ps(psrc + 1 * F), dst1); - 
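/* [editor's note] With a 1x1 kernel the convolution collapses to a per-pixel dot
   product over channels: dst[dstChannel][p] += weight[dstChannel][srcChannel] *
   src[srcChannel][p] for the 16 pixels p kept in dst0/dst1; each iteration of this
   loop adds one srcChannel term. */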
psrc += 16; - } - _mm256_storeu_ps(dst + 0 * F, dst0); - _mm256_storeu_ps(dst + 1 * F, dst1); - dst += 16; - } - } - - void Execute(const float * src, size_t srcWidth, size_t srcHeight, size_t srcDepth, - const float * weight, size_t kernelX, size_t kernelY, float * dst, size_t dstWidth, size_t dstHeight, size_t dstDepth) - { - assert(kernelX == kernelY); - if (kernelX == 1 && dstWidth*dstHeight == 16) - AddConvolution1x1x16(src, srcDepth, weight, dst, dstDepth); - else if (kernelX == 2) - AddConvolution(src, srcWidth, srcHeight, srcDepth, weight, dst, dstWidth, dstHeight, dstDepth); - else if (kernelX == 3) - AddConvolution(src, srcWidth, srcHeight, srcDepth, weight, dst, dstWidth, dstHeight, dstDepth); - else if (kernelX == 4) - AddConvolution(src, srcWidth, srcHeight, srcDepth, weight, dst, dstWidth, dstHeight, dstDepth); - else if (kernelX == 5) - AddConvolution(src, srcWidth, srcHeight, srcDepth, weight, dst, dstWidth, dstHeight, dstDepth); - else - assert(0); - } - - bool Preferable(size_t srcDepth, size_t kernelX, size_t kernelY, size_t strideX, size_t strideY, size_t dilationX, size_t dilationY, size_t dstWidth, size_t dstHeight, size_t dstDepth) - { - if (kernelX == kernelY && strideX*strideY*dilationX*dilationY == 1 && dstWidth >= F) - { - if (kernelX >= 2 && kernelX <= 5 && dstWidth*dstHeight*kernelX*kernelY >= 8 * 8 * 3 * 3) - return true; - if (kernelX == 1 && (dstWidth*dstHeight == 16))// || dstWidth * dstHeight == 64)) - return true; - } - return false; - } - } - - struct Opt - { - enum Alg - { - None, - Ver0, - Ver1, - Ver2, - } alg; - - size_t sizeA; - size_t sizeB; - size_t sizeT; - - size_t cellA; - size_t cellB; - - size_t M, N, K; - size_t strideB; - size_t paddedW; - size_t paddedH; - - Opt(size_t srcWidth, size_t srcHeight, size_t srcDepth, size_t kernelX, size_t kernelY, size_t padX, size_t padY, size_t strideX, size_t strideY, size_t dilationX, size_t dilationY, size_t dstWidth, size_t dstHeight, size_t dstDepth) - { - alg = None; - sizeA = 0; - sizeB = 0; - sizeT = 0; - cellA = 1; - cellB = 1; - - M = dstDepth; - N = dstHeight*dstWidth; - K = kernelX*kernelY*srcDepth; - - if (dstWidth*dstHeight / kernelX <= 1000) - alg = Ver0; - else - alg = Ver1; - if (Ver2::Preferable(srcDepth, kernelX, kernelY, strideX, strideY, dilationX, dilationY, dstWidth, dstHeight, dstDepth)) - alg = Ver2; - - switch (alg) - { - case Ver0: - sizeB = N*K; - break; - case Ver1: - cellA = 4; - cellB = 24; - sizeA = M*K; - strideB = (N + cellB - 1)/cellB*cellB; - sizeB = strideB*K; - if (kernelX*kernelY > 1) - sizeT = sizeB; - break; - case Ver2: - if (padX > 0 || padY > 0) - { - paddedW = Simd::AlignHi(srcWidth + 2 * padX, F); - paddedH = srcHeight + 2 * padY; - sizeB = paddedW*paddedH*srcDepth; - } - else - { - paddedW = srcWidth; - paddedH = srcHeight; - } - break; - default: - assert(0); - break; - } - } - }; - - struct Data - { - float * a; - float * b; - float * t; - - Data(size_t sizeA, size_t sizeB, size_t sizeT, void * externalData, size_t * externalSize) - : a(0) - , b(0) - , _data(0) - { - sizeA = AlignHi(sizeA, F); - sizeB = AlignHi(sizeB, F); - sizeT = AlignHi(sizeT, F); - size_t size = (sizeA + sizeB + sizeT) * sizeof(float); - if (size == 0) - return; - if (externalData != AlignHi(externalData, SIMD_ALIGN)) - size += SIMD_ALIGN; - float * data = NULL; - if (externalData == NULL || externalSize == NULL || *externalSize < size) - { - _data = Simd::Allocate(size); - if (externalSize) - *externalSize = size; - data = (float*)_data; - } - else - data = (float*)AlignHi(externalData, 
SIMD_ALIGN); - if (sizeA) - a = data; - if (sizeB) - b = data + sizeA; - if (sizeT) - t = data + sizeA + sizeB; - } - - ~Data() - { - if (_data) - Simd::Free(_data); - } - - private: - void * _data; - }; - } - - void NeuralConvolutionForward(const float * src, size_t srcWidth, size_t srcHeight, size_t srcDepth, - const float * weight, size_t kernelX, size_t kernelY, size_t padX, size_t padY, size_t strideX, size_t strideY, size_t dilationX, size_t dilationY, - void * buffer, size_t * size, float * dst, size_t dstWidth, size_t dstHeight, size_t dstDepth, int add) - { - using namespace Ncf; - - assert(dstWidth == (srcWidth + 2 * padX - (dilationX * (kernelX - 1) + 1)) / strideX + 1); - assert(dstHeight == (srcHeight + 2 * padY - (dilationY * (kernelY - 1) + 1)) / strideY + 1); - - if (!add) - memset(dst, 0, dstWidth*dstHeight*dstDepth * sizeof(float)); - - Opt opt(srcWidth, srcHeight, srcDepth, kernelX, kernelY, padX, padY, strideX, strideY, dilationX, dilationY, dstWidth, dstHeight, dstDepth); - - Data data(opt.sizeA, opt.sizeB, opt.sizeT, buffer, size); - - if (opt.sizeA) - { - switch (opt.alg) - { - case Opt::Ver1: Ver1::PrepareA(weight, opt.M, opt.K, opt.cellA, data.a); - default: - break; - } - } - else - data.a = (float*)weight; - - if (opt.sizeB) - { - switch (opt.alg) - { - case Opt::Ver0: Ver0::PrepareB(src, srcWidth, srcHeight, srcDepth, kernelX, kernelY, padX, padY, strideX, strideY, dilationX, dilationY, dstWidth, dstHeight, data.b); break; - case Opt::Ver1: Ver1::PrepareB(src, srcWidth, srcHeight, srcDepth, kernelX, kernelY, padX, padY, strideX, strideY, dilationX, dilationY, dstWidth, dstHeight, opt.cellB, data.t, data.b); break; - case Opt::Ver2: Ver2::PrepareB(src, srcWidth, srcHeight, srcDepth, padX, padY, data.b, opt.paddedW, opt.paddedH); break; - default: break; - } - } - else - data.b = (float*)src; - - switch (opt.alg) - { - case Opt::Ver0: Ver0::Execute(opt.M, opt.N, opt.K, data.a, data.b, dst); break; - case Opt::Ver1: Ver1::Execute(opt.M, opt.N, opt.K, data.a, data.b, dst, opt.cellA, opt.cellB); break; - case Opt::Ver2: Ver2::Execute(data.b, opt.paddedW, opt.paddedH, srcDepth, weight, kernelX, kernelY, dst, dstWidth, dstHeight, dstDepth); break; - default: break; - } - } - } -#endif// SIMD_AVX2_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx2Operation.cpp b/src/3rd/Simd/Simd/SimdAvx2Operation.cpp deleted file mode 100644 index 027a3fd5..00000000 --- a/src/3rd/Simd/Simd/SimdAvx2Operation.cpp +++ /dev/null @@ -1,230 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions:SimdOperationBinary8u -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" - -namespace Simd -{ -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - template <SimdOperationBinary8uType type> SIMD_INLINE __m256i OperationBinary8u(const __m256i & a, const __m256i & b); - - template <> SIMD_INLINE __m256i OperationBinary8u<SimdOperationBinary8uAverage>(const __m256i & a, const __m256i & b) - { - return _mm256_avg_epu8(a, b); - } - - template <> SIMD_INLINE __m256i OperationBinary8u<SimdOperationBinary8uAnd>(const __m256i & a, const __m256i & b) - { - return _mm256_and_si256(a, b); - } - - template <> SIMD_INLINE __m256i OperationBinary8u<SimdOperationBinary8uOr>(const __m256i & a, const __m256i & b) - { - return _mm256_or_si256(a, b); - } - - template <> SIMD_INLINE __m256i OperationBinary8u<SimdOperationBinary8uMaximum>(const __m256i & a, const __m256i & b) - { - return _mm256_max_epu8(a, b); - } - - template <> SIMD_INLINE __m256i OperationBinary8u<SimdOperationBinary8uMinimum>(const __m256i & a, const __m256i & b) - { - return _mm256_min_epu8(a, b); - } - - template <> SIMD_INLINE __m256i OperationBinary8u<SimdOperationBinary8uSaturatedSubtraction>(const __m256i & a, const __m256i & b) - { - return _mm256_subs_epu8(a, b); - } - - template <> SIMD_INLINE __m256i OperationBinary8u<SimdOperationBinary8uSaturatedAddition>(const __m256i & a, const __m256i & b) - { - return _mm256_adds_epu8(a, b); - } - - template <bool align, SimdOperationBinary8uType type> void OperationBinary8u(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, - size_t width, size_t height, size_t channelCount, uint8_t * dst, size_t dstStride) - { - assert(width*channelCount >= A); - if (align) - assert(Aligned(a) && Aligned(aStride) && Aligned(b) && Aligned(bStride) && Aligned(dst) && Aligned(dstStride)); - - size_t size = channelCount*width; - size_t alignedSize = Simd::AlignLo(size, A); - for (size_t row = 0; row < height; ++row) - { - for (size_t offset = 0; offset < alignedSize; offset += A) - { - const __m256i a_ = Load<align>((__m256i*)(a + offset)); - const __m256i b_ = Load<align>((__m256i*)(b + offset)); - Store<align>((__m256i*)(dst + offset), OperationBinary8u<type>(a_, b_)); - } - if (alignedSize != size) - { - const __m256i a_ = Load<false>((__m256i*)(a + size - A)); - const __m256i b_ = Load<false>((__m256i*)(b + size - A)); - Store<false>((__m256i*)(dst + size - A), OperationBinary8u<type>(a_, b_)); - } - a += aStride; - b += bStride; - dst += dstStride; - } - } - - template <bool align> void OperationBinary8u(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, - size_t width, size_t height, size_t channelCount, uint8_t * dst, size_t dstStride, SimdOperationBinary8uType type) - { - switch (type) - { - case SimdOperationBinary8uAverage: - return OperationBinary8u<align, SimdOperationBinary8uAverage>(a, aStride, b, bStride, width, height, channelCount, dst, dstStride); - case SimdOperationBinary8uAnd: - return OperationBinary8u<align, SimdOperationBinary8uAnd>(a, aStride, b, bStride, width, height, channelCount, dst, dstStride); - case SimdOperationBinary8uOr: - return OperationBinary8u<align, SimdOperationBinary8uOr>(a, aStride, b, bStride, width, height, channelCount, dst, dstStride); - case SimdOperationBinary8uMaximum: - return OperationBinary8u<align, SimdOperationBinary8uMaximum>(a, aStride, b, bStride, width, height, channelCount, dst, dstStride); - case SimdOperationBinary8uMinimum: - return OperationBinary8u<align, SimdOperationBinary8uMinimum>(a, aStride, b, bStride, width, height, channelCount, dst, dstStride); - case SimdOperationBinary8uSaturatedSubtraction: - return OperationBinary8u<align, SimdOperationBinary8uSaturatedSubtraction>(a, aStride, b, bStride, width, height, channelCount, dst, dstStride); - case SimdOperationBinary8uSaturatedAddition: - return OperationBinary8u<align, SimdOperationBinary8uSaturatedAddition>(a, aStride, b, 
bStride, width, height, channelCount, dst, dstStride); - default: - assert(0); - } - } - - void OperationBinary8u(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, - size_t width, size_t height, size_t channelCount, uint8_t * dst, size_t dstStride, SimdOperationBinary8uType type) - { - if (Aligned(a) && Aligned(aStride) && Aligned(b) && Aligned(bStride) && Aligned(dst) && Aligned(dstStride)) - OperationBinary8u(a, aStride, b, bStride, width, height, channelCount, dst, dstStride, type); - else - OperationBinary8u(a, aStride, b, bStride, width, height, channelCount, dst, dstStride, type); - } - - template SIMD_INLINE __m256i OperationBinary16i(const __m256i & a, const __m256i & b); - - template <> SIMD_INLINE __m256i OperationBinary16i(const __m256i & a, const __m256i & b) - { - return _mm256_add_epi16(a, b); - } - - template <> SIMD_INLINE __m256i OperationBinary16i(const __m256i & a, const __m256i & b) - { - return _mm256_sub_epi16(a, b); - } - - template void OperationBinary16i(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, - size_t width, size_t height, uint8_t * dst, size_t dstStride) - { - assert(width * sizeof(uint16_t) >= A); - if (align) - assert(Aligned(a) && Aligned(aStride) && Aligned(b) && Aligned(bStride) && Aligned(dst) && Aligned(dstStride)); - - size_t size = width * sizeof(int16_t); - size_t alignedSize = Simd::AlignLo(size, A); - for (size_t row = 0; row < height; ++row) - { - for (size_t offset = 0; offset < alignedSize; offset += A) - { - const __m256i a_ = Load((__m256i*)(a + offset)); - const __m256i b_ = Load((__m256i*)(b + offset)); - Store((__m256i*)(dst + offset), OperationBinary16i(a_, b_)); - } - if (alignedSize != size) - { - const __m256i a_ = Load((__m256i*)(a + size - A)); - const __m256i b_ = Load((__m256i*)(b + size - A)); - Store((__m256i*)(dst + size - A), OperationBinary16i(a_, b_)); - } - a += aStride; - b += bStride; - dst += dstStride; - } - } - - template void OperationBinary16i(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, - size_t width, size_t height, uint8_t * dst, size_t dstStride, SimdOperationBinary16iType type) - { - switch (type) - { - case SimdOperationBinary16iAddition: - return OperationBinary16i(a, aStride, b, bStride, width, height, dst, dstStride); - case SimdOperationBinary16iSubtraction: - return OperationBinary16i(a, aStride, b, bStride, width, height, dst, dstStride); - default: - assert(0); - } - } - - void OperationBinary16i(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, - size_t width, size_t height, uint8_t * dst, size_t dstStride, SimdOperationBinary16iType type) - { - if (Aligned(a) && Aligned(aStride) && Aligned(b) && Aligned(bStride) && Aligned(dst) && Aligned(dstStride)) - OperationBinary16i(a, aStride, b, bStride, width, height, dst, dstStride, type); - else - OperationBinary16i(a, aStride, b, bStride, width, height, dst, dstStride, type); - } - - template SIMD_INLINE void VectorProduct(const __m256i & vertical, const uint8_t * horizontal, uint8_t * dst) - { - __m256i _horizontal = Load((__m256i*)horizontal); - __m256i lo = DivideI16By255(_mm256_mullo_epi16(vertical, _mm256_unpacklo_epi8(_horizontal, K_ZERO))); - __m256i hi = DivideI16By255(_mm256_mullo_epi16(vertical, _mm256_unpackhi_epi8(_horizontal, K_ZERO))); - Store((__m256i*)dst, _mm256_packus_epi16(lo, hi)); - } - - template void VectorProduct(const uint8_t * vertical, const uint8_t * horizontal, uint8_t * dst, size_t stride, size_t width, size_t height) - { - assert(width >= 
A); - if (align) - assert(Aligned(horizontal) && Aligned(dst) && Aligned(stride)); - - size_t alignedWidth = Simd::AlignLo(width, A); - for (size_t row = 0; row < height; ++row) - { - __m256i _vertical = _mm256_set1_epi16(vertical[row]); - for (size_t col = 0; col < alignedWidth; col += A) - VectorProduct(_vertical, horizontal + col, dst + col); - if (alignedWidth != width) - VectorProduct(_vertical, horizontal + width - A, dst + width - A); - dst += stride; - } - } - - void VectorProduct(const uint8_t * vertical, const uint8_t * horizontal, uint8_t * dst, size_t stride, size_t width, size_t height) - { - if (Aligned(horizontal) && Aligned(dst) && Aligned(stride)) - VectorProduct(vertical, horizontal, dst, stride, width, height); - else - VectorProduct(vertical, horizontal, dst, stride, width, height); - } - } -#endif// SIMD_AVX2_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx2Reduce.cpp b/src/3rd/Simd/Simd/SimdAvx2Reduce.cpp deleted file mode 100644 index 70b8d06a..00000000 --- a/src/3rd/Simd/Simd/SimdAvx2Reduce.cpp +++ /dev/null @@ -1,254 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
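[editor's note] Context for the Average16/Average8 helpers defined just below: the 2x2
reduction computes the rounded mean of four source neighbours, i.e. each output byte is
(s00 + s01 + s10 + s11 + 2) >> 2, where the K16_0002 term supplies the rounding; this is
the same convention as the scalar Base::Average used for the odd right/bottom edges.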
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" - -namespace Simd -{ -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { -#ifdef SIMD_MADDUBS_ERROR - SIMD_INLINE __m256i Average8(const __m256i & s00, const __m256i & s01, const __m256i & s10, const __m256i & s11) - { - __m256i lo = Average16( - _mm256_and_si256(s00, K16_00FF), - _mm256_and_si256(_mm256_srli_si256(s00, 1), K16_00FF), - _mm256_and_si256(s10, K16_00FF), - _mm256_and_si256(_mm256_srli_si256(s10, 1), K16_00FF)); - __m256i hi = Average16( - _mm256_and_si256(s01, K16_00FF), - _mm256_and_si256(_mm256_srli_si256(s01, 1), K16_00FF), - _mm256_and_si256(s11, K16_00FF), - _mm256_and_si256(_mm256_srli_si256(s11, 1), K16_00FF)); - return PackI16ToU8(lo, hi); - } -#else - SIMD_INLINE __m256i Average16(const __m256i & s0, const __m256i & s1) - { - return _mm256_srli_epi16(_mm256_add_epi16(_mm256_add_epi16(_mm256_maddubs_epi16(s0, K8_01), _mm256_maddubs_epi16(s1, K8_01)), K16_0002), 2); - } - - SIMD_INLINE __m256i Average8(const __m256i & s00, const __m256i & s01, const __m256i & s10, const __m256i & s11) - { - return PackI16ToU8(Average16(s00, s10), Average16(s01, s11)); - } -#endif - - template __m256i Average8(const __m256i & s00, const __m256i & s01, const __m256i & s10, const __m256i & s11); - - template<> SIMD_INLINE __m256i Average8<1>(const __m256i & s00, const __m256i & s01, const __m256i & s10, const __m256i & s11) - { - return Average8(s00, s01, s10, s11); - } - - const __m256i K8_RC2 = SIMD_MM256_SETR_EPI8( - 0x0, 0x2, 0x1, 0x3, 0x4, 0x6, 0x5, 0x7, 0x8, 0xA, 0x9, 0xB, 0xC, 0xE, 0xD, 0xF, - 0x0, 0x2, 0x1, 0x3, 0x4, 0x6, 0x5, 0x7, 0x8, 0xA, 0x9, 0xB, 0xC, 0xE, 0xD, 0xF); - - template<> SIMD_INLINE __m256i Average8<2>(const __m256i & s00, const __m256i & s01, const __m256i & s10, const __m256i & s11) - { - return Average8(_mm256_shuffle_epi8(s00, K8_RC2), _mm256_shuffle_epi8(s01, K8_RC2), _mm256_shuffle_epi8(s10, K8_RC2), _mm256_shuffle_epi8(s11, K8_RC2)); - } - - const __m256i K8_RC4 = SIMD_MM256_SETR_EPI8( - 0x0, 0x4, 0x1, 0x5, 0x2, 0x6, 0x3, 0x7, 0x8, 0xC, 0x9, 0xD, 0xA, 0xE, 0xB, 0xF, - 0x0, 0x4, 0x1, 0x5, 0x2, 0x6, 0x3, 0x7, 0x8, 0xC, 0x9, 0xD, 0xA, 0xE, 0xB, 0xF); - - template<> SIMD_INLINE __m256i Average8<4>(const __m256i & s00, const __m256i & s01, const __m256i & s10, const __m256i & s11) - { - return Average8(_mm256_shuffle_epi8(s00, K8_RC4), _mm256_shuffle_epi8(s01, K8_RC4), _mm256_shuffle_epi8(s10, K8_RC4), _mm256_shuffle_epi8(s11, K8_RC4)); - } - - template SIMD_INLINE void ReduceColor2x2(const uint8_t * src0, const uint8_t * src1, uint8_t * dst) - { - __m256i s00 = Load((__m256i*)src0 + 0); - __m256i s01 = Load((__m256i*)src0 + 1); - __m256i s10 = Load((__m256i*)src1 + 0); - __m256i s11 = Load((__m256i*)src1 + 1); - Store((__m256i*)dst, Average8(s00, s01, s10, s11)); - } - - template void ReduceColor2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, uint8_t * dst, size_t dstStride) - { - size_t evenWidth = AlignLo(srcWidth, 2); - size_t evenSize = evenWidth * channelCount; - size_t alignedSize = AlignLo(evenSize, DA); - for (size_t srcRow = 0; srcRow < srcHeight; srcRow += 2) - { - const uint8_t *src0 = src; - const uint8_t *src1 = (srcRow == srcHeight - 1 ? 
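/* odd srcHeight: the last source row is paired with itself */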
src : src + srcStride); - size_t srcOffset = 0, dstOffset = 0; - for (; srcOffset < alignedSize; srcOffset += DA, dstOffset += A) - ReduceColor2x2(src0 + srcOffset, src1 + srcOffset, dst + dstOffset); - if (alignedSize != evenSize) - { - srcOffset = evenSize - DA; - dstOffset = srcOffset / 2; - ReduceColor2x2(src0 + srcOffset, src1 + srcOffset, dst + dstOffset); - } - if (evenWidth != srcWidth) - { - for (size_t c = 0; c < channelCount; ++c) - dst[evenSize / 2 + c] = Base::Average(src0[evenSize + c], src1[evenSize + c]); - } - src += 2 * srcStride; - dst += dstStride; - } - } - - const __m256i K8_BGR0 = SIMD_MM256_SETR_EPI8( - 0x0, 0x3, 0x1, 0x4, 0x2, 0x5, 0x6, 0x9, 0x7, 0xA, 0x8, 0xB, 0xC, 0xF, 0xD, -1, - -1, 0x1, 0x2, 0x5, 0x3, 0x6, 0x4, 0x7, 0x8, 0xB, 0x9, 0xC, 0xA, 0xD, 0xE, -1); - const __m256i K8_BGR1 = SIMD_MM256_SETR_EPI8( - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - 0xE, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); - const __m256i K8_BGR2 = SIMD_MM256_SETR_EPI8( - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x0, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x1); - const __m256i K8_BGR3 = SIMD_MM256_SETR_EPI8( - -1, 0x2, 0x0, 0x3, 0x4, 0x7, 0x5, 0x8, 0x6, 0x9, 0xA, 0xD, 0xB, 0xE, 0xC, 0xF, - 0x0, 0x3, 0x1, 0x4, 0x2, 0x5, 0x6, 0x9, 0x7, 0xA, 0x8, 0xB, 0xC, 0xF, 0xD, -1); - const __m256i K8_BGR4 = SIMD_MM256_SETR_EPI8( - 0xF, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); - const __m256i K8_BGR5 = SIMD_MM256_SETR_EPI8( - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x0); - const __m256i K8_BGR6 = SIMD_MM256_SETR_EPI8( - -1, 0x1, 0x2, 0x5, 0x3, 0x6, 0x4, 0x7, 0x8, 0xB, 0x9, 0xC, 0xA, 0xD, 0xE, -1, - -1, 0x2, 0x0, 0x3, 0x4, 0x7, 0x5, 0x8, 0x6, 0x9, 0xA, 0xD, 0xB, 0xE, 0xC, 0xF); - const __m256i K8_BGR7 = SIMD_MM256_SETR_EPI8( - 0xE, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - 0xF, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); - const __m256i K8_BGR8 = SIMD_MM256_SETR_EPI8( - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); - - template SIMD_INLINE void ReduceBgr2x2(const uint8_t * src0, const uint8_t * src1, uint8_t * dst) - { - __m256i s001 = Load((__m256i*)src0 + 0); - __m256i s023 = Load((__m256i*)src0 + 1); - __m256i s045 = Load((__m256i*)src0 + 2); - __m256i s101 = Load((__m256i*)src1 + 0); - __m256i s123 = Load((__m256i*)src1 + 1); - __m256i s145 = Load((__m256i*)src1 + 2); - __m256i s000 = _mm256_permute2x128_si256(s001, s001, 0x00); - __m256i s100 = _mm256_permute2x128_si256(s101, s101, 0x00); - __m256i s012 = _mm256_permute2x128_si256(s001, s023, 0x21); - __m256i s112 = _mm256_permute2x128_si256(s101, s123, 0x21); - __m256i s034 = _mm256_permute2x128_si256(s023, s045, 0x21); - __m256i s134 = _mm256_permute2x128_si256(s123, s145, 0x21); - __m256i m00 = _mm256_or_si256(_mm256_or_si256(_mm256_shuffle_epi8(s001, K8_BGR0), _mm256_shuffle_epi8(s000, K8_BGR1)), _mm256_shuffle_epi8(s012, K8_BGR2)); - __m256i m01 = _mm256_or_si256(_mm256_or_si256(_mm256_shuffle_epi8(s023, K8_BGR3), _mm256_shuffle_epi8(s012, K8_BGR4)), _mm256_shuffle_epi8(s034, K8_BGR5)); - __m256i m10 = _mm256_or_si256(_mm256_or_si256(_mm256_shuffle_epi8(s101, K8_BGR0), _mm256_shuffle_epi8(s100, K8_BGR1)), _mm256_shuffle_epi8(s112, K8_BGR2)); - __m256i m11 = 
_mm256_or_si256(_mm256_or_si256(_mm256_shuffle_epi8(s123, K8_BGR3), _mm256_shuffle_epi8(s112, K8_BGR4)), _mm256_shuffle_epi8(s134, K8_BGR5)); - Store((__m256i*)dst + 0, Average8(m00, m01, m10, m11)); - __m256i s067 = Load((__m256i*)src0 + 3); - __m256i s089 = Load((__m256i*)src0 + 4); - __m256i s167 = Load((__m256i*)src1 + 3); - __m256i s189 = Load((__m256i*)src1 + 4); - __m256i s056 = _mm256_permute2x128_si256(s045, s067, 0x21); - __m256i s156 = _mm256_permute2x128_si256(s145, s167, 0x21); - __m256i s078 = _mm256_permute2x128_si256(s067, s089, 0x21); - __m256i s178 = _mm256_permute2x128_si256(s167, s189, 0x21); - __m256i m02 = _mm256_or_si256(_mm256_or_si256(_mm256_shuffle_epi8(s045, K8_BGR6), _mm256_shuffle_epi8(s034, K8_BGR7)), _mm256_shuffle_epi8(s056, K8_BGR8)); - __m256i m03 = _mm256_or_si256(_mm256_or_si256(_mm256_shuffle_epi8(s067, K8_BGR0), _mm256_shuffle_epi8(s056, K8_BGR1)), _mm256_shuffle_epi8(s078, K8_BGR2)); - __m256i m12 = _mm256_or_si256(_mm256_or_si256(_mm256_shuffle_epi8(s145, K8_BGR6), _mm256_shuffle_epi8(s134, K8_BGR7)), _mm256_shuffle_epi8(s156, K8_BGR8)); - __m256i m13 = _mm256_or_si256(_mm256_or_si256(_mm256_shuffle_epi8(s167, K8_BGR0), _mm256_shuffle_epi8(s156, K8_BGR1)), _mm256_shuffle_epi8(s178, K8_BGR2)); - Store((__m256i*)dst + 1, Average8(m02, m03, m12, m13)); - __m256i s0ab = Load((__m256i*)src0 + 5); - __m256i s1ab = Load((__m256i*)src1 + 5); - __m256i s09a = _mm256_permute2x128_si256(s089, s0ab, 0x21); - __m256i s19a = _mm256_permute2x128_si256(s189, s1ab, 0x21); - __m256i s0bb = _mm256_permute2x128_si256(s0ab, s0ab, 0x33); - __m256i s1bb = _mm256_permute2x128_si256(s1ab, s1ab, 0x33); - __m256i m04 = _mm256_or_si256(_mm256_or_si256(_mm256_shuffle_epi8(s089, K8_BGR3), _mm256_shuffle_epi8(s078, K8_BGR4)), _mm256_shuffle_epi8(s09a, K8_BGR5)); - __m256i m05 = _mm256_or_si256(_mm256_or_si256(_mm256_shuffle_epi8(s0ab, K8_BGR6), _mm256_shuffle_epi8(s09a, K8_BGR7)), _mm256_shuffle_epi8(s0bb, K8_BGR8)); - __m256i m14 = _mm256_or_si256(_mm256_or_si256(_mm256_shuffle_epi8(s189, K8_BGR3), _mm256_shuffle_epi8(s178, K8_BGR4)), _mm256_shuffle_epi8(s19a, K8_BGR5)); - __m256i m15 = _mm256_or_si256(_mm256_or_si256(_mm256_shuffle_epi8(s1ab, K8_BGR6), _mm256_shuffle_epi8(s19a, K8_BGR7)), _mm256_shuffle_epi8(s1bb, K8_BGR8)); - Store((__m256i*)dst + 2, Average8(m04, m05, m14, m15)); - } - - template void ReduceBgr2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, uint8_t * dst, size_t dstStride) - { - size_t evenWidth = AlignLo(srcWidth, 2); - size_t alignedWidth = AlignLo(srcWidth, DA); - size_t evenSize = evenWidth * 3; - size_t alignedSize = alignedWidth * 3; - size_t srcStep = DA * 3, dstStep = A * 3; - for (size_t srcRow = 0; srcRow < srcHeight; srcRow += 2) - { - const uint8_t *src0 = src; - const uint8_t *src1 = (srcRow == srcHeight - 1 ? 
src : src + srcStride); - size_t srcOffset = 0, dstOffset = 0; - for (; srcOffset < alignedSize; srcOffset += srcStep, dstOffset += dstStep) - ReduceBgr2x2(src0 + srcOffset, src1 + srcOffset, dst + dstOffset); - if (alignedSize != evenSize) - { - srcOffset = evenSize - srcStep; - dstOffset = srcOffset / 2; - ReduceBgr2x2(src0 + srcOffset, src1 + srcOffset, dst + dstOffset); - } - if (evenWidth != srcWidth) - { - for (size_t c = 0; c < 3; ++c) - dst[evenSize / 2 + c] = Base::Average(src0[evenSize + c], src1[evenSize + c]); - } - src += 2 * srcStride; - dst += dstStride; - } - } - - template void ReduceColor2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount) - { - assert((srcWidth + 1) / 2 == dstWidth && (srcHeight + 1) / 2 == dstHeight && srcWidth >= DA); - if (align) - { - assert(Aligned(src) && Aligned(srcStride)); - assert(Aligned(dst) && Aligned(dstStride)); - } - - switch (channelCount) - { - case 1: ReduceColor2x2<1, align>(src, srcWidth, srcHeight, srcStride, dst, dstStride); break; - case 2: ReduceColor2x2<2, align>(src, srcWidth, srcHeight, srcStride, dst, dstStride); break; - case 3: ReduceBgr2x2(src, srcWidth, srcHeight, srcStride, dst, dstStride); break; - case 4: ReduceColor2x2<4, align>(src, srcWidth, srcHeight, srcStride, dst, dstStride); break; - default: assert(0); - } - } - - void ReduceColor2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount) - { - if (Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride)) - ReduceColor2x2(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, channelCount); - else - ReduceColor2x2(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, channelCount); - } - } -#endif// SIMD_AVX2_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx2ReduceGray2x2.cpp b/src/3rd/Simd/Simd/SimdAvx2ReduceGray2x2.cpp deleted file mode 100644 index 27c82028..00000000 --- a/src/3rd/Simd/Simd/SimdAvx2ReduceGray2x2.cpp +++ /dev/null @@ -1,111 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
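[editor's note] A scalar model of the 2x2 reduction implemented in the file above and in
ReduceGray2x2 below, including the edge rules (an odd last row or column is averaged with
itself). Reduce2x2Ref is an illustrative helper under those assumptions, not a library
function:

    static void Reduce2x2Ref(const uint8_t * src, size_t srcWidth, size_t srcHeight,
        size_t srcStride, uint8_t * dst, size_t dstStride, size_t channelCount)
    {
        for (size_t y = 0; y < srcHeight; y += 2)
        {
            const uint8_t * row0 = src + y * srcStride;
            const uint8_t * row1 = y + 1 < srcHeight ? row0 + srcStride : row0; // odd height: reuse last row
            for (size_t x = 0; x < srcWidth; x += 2)
            {
                size_t x1 = x + 1 < srcWidth ? x + 1 : x; // odd width: reuse last column
                for (size_t c = 0; c < channelCount; ++c)
                    dst[(y / 2) * dstStride + (x / 2) * channelCount + c] = (uint8_t)(
                        (row0[x * channelCount + c] + row0[x1 * channelCount + c] +
                         row1[x * channelCount + c] + row1[x1 * channelCount + c] + 2) >> 2);
            }
        }
    }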
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" - -namespace Simd -{ -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { -#ifdef SIMD_MADDUBS_ERROR - SIMD_INLINE __m256i Average8(const __m256i & s00, const __m256i & s01, const __m256i & s10, const __m256i & s11) - { - __m256i lo = Average16( - _mm256_and_si256(s00, K16_00FF), - _mm256_and_si256(_mm256_srli_si256(s00, 1), K16_00FF), - _mm256_and_si256(s10, K16_00FF), - _mm256_and_si256(_mm256_srli_si256(s10, 1), K16_00FF)); - __m256i hi = Average16( - _mm256_and_si256(s01, K16_00FF), - _mm256_and_si256(_mm256_srli_si256(s01, 1), K16_00FF), - _mm256_and_si256(s11, K16_00FF), - _mm256_and_si256(_mm256_srli_si256(s11, 1), K16_00FF)); - return PackI16ToU8(lo, hi); - } -#else - SIMD_INLINE __m256i Average16(const __m256i & s0, const __m256i & s1) - { - return _mm256_srli_epi16(_mm256_add_epi16(_mm256_add_epi16(_mm256_maddubs_epi16(s0, K8_01), _mm256_maddubs_epi16(s1, K8_01)), K16_0002), 2); - } - - SIMD_INLINE __m256i Average8(const __m256i & s00, const __m256i & s01, const __m256i & s10, const __m256i & s11) - { - return PackI16ToU8(Average16(s00, s10), Average16(s01, s11)); - } -#endif - - template void ReduceGray2x2( - const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride) - { - assert((srcWidth + 1) / 2 == dstWidth && (srcHeight + 1) / 2 == dstHeight && srcWidth >= DA); - if (align) - { - assert(Aligned(src) && Aligned(srcStride)); - assert(Aligned(dst) && Aligned(dstStride) && Aligned(dstWidth)); - } - - size_t alignedWidth = AlignLo(srcWidth, DA); - size_t evenWidth = AlignLo(srcWidth, 2); - for (size_t srcRow = 0; srcRow < srcHeight; srcRow += 2) - { - const uint8_t *src0 = src; - const uint8_t *src1 = (srcRow == srcHeight - 1 ? src : src + srcStride); - size_t srcOffset = 0, dstOffset = 0; - for (; srcOffset < alignedWidth; srcOffset += DA, dstOffset += A) - { - Store((__m256i*)(dst + dstOffset), Average8( - Load((__m256i*)(src0 + srcOffset)), Load((__m256i*)(src0 + srcOffset + A)), - Load((__m256i*)(src1 + srcOffset)), Load((__m256i*)(src1 + srcOffset + A)))); - } - if (alignedWidth != srcWidth) - { - dstOffset = dstWidth - A - (evenWidth != srcWidth ? 1 : 0); - srcOffset = evenWidth - DA; - Store((__m256i*)(dst + dstOffset), Average8( - Load((__m256i*)(src0 + srcOffset)), Load((__m256i*)(src0 + srcOffset + A)), - Load((__m256i*)(src1 + srcOffset)), Load((__m256i*)(src1 + srcOffset + A)))); - if (evenWidth != srcWidth) - { - dst[dstWidth - 1] = Base::Average(src0[evenWidth], src1[evenWidth]); - } - } - - src += 2 * srcStride; - dst += dstStride; - } - } - - void ReduceGray2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride) - { - if (Aligned(src) && Aligned(srcWidth) && Aligned(srcStride) && Aligned(dst) && Aligned(dstWidth) && Aligned(dstStride)) - ReduceGray2x2(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); - else - ReduceGray2x2(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); - } - } -#endif// SIMD_AVX2_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx2ReduceGray3x3.cpp b/src/3rd/Simd/Simd/SimdAvx2ReduceGray3x3.cpp deleted file mode 100644 index f85df821..00000000 --- a/src/3rd/Simd/Simd/SimdAvx2ReduceGray3x3.cpp +++ /dev/null @@ -1,148 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" - -namespace Simd -{ -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - template SIMD_INLINE __m256i DivideBy16(__m256i value); - - template <> SIMD_INLINE __m256i DivideBy16(__m256i value) - { - return _mm256_srli_epi16(_mm256_add_epi16(value, K16_0008), 4); - } - - template <> SIMD_INLINE __m256i DivideBy16(__m256i value) - { - return _mm256_srli_epi16(value, 4); - } - - const __m256i K16_0102 = SIMD_MM256_SET1_EPI16(0x0102); - - SIMD_INLINE __m256i BinomialSum8(const __m256i & s01, const __m256i & s12) - { -#ifdef SIMD_MADDUBS_ERROR - return BinomialSum16(_mm256_and_si256(s01, K16_00FF), _mm256_and_si256(s12, K16_00FF), _mm256_and_si256(_mm256_srli_si256(s12, 1), K16_00FF)); -#else - return _mm256_add_epi16(_mm256_and_si256(s01, K16_00FF), _mm256_maddubs_epi16(s12, K16_0102)); -#endif - } - - template SIMD_INLINE __m256i ReduceColNose(const uint8_t * p) - { - return BinomialSum8(LoadBeforeFirst(p), Load((__m256i*)p)); - } - - template SIMD_INLINE void ReduceColNose(const uint8_t * s[3], __m256i a[3]) - { - a[0] = ReduceColNose(s[0]); - a[1] = ReduceColNose(s[1]); - a[2] = ReduceColNose(s[2]); - } - - template SIMD_INLINE __m256i ReduceColBody(const uint8_t * p) - { - return BinomialSum8(Load((__m256i*)(p - 1)), Load((__m256i*)p)); - } - - template SIMD_INLINE void ReduceColBody(const uint8_t * s[3], size_t offset, __m256i a[3]) - { - a[0] = ReduceColBody(s[0] + offset); - a[1] = ReduceColBody(s[1] + offset); - a[2] = ReduceColBody(s[2] + offset); - } - - template SIMD_INLINE __m256i ReduceRow(const __m256i lo[3], const __m256i hi[3]) - { - return PackI16ToU8( - DivideBy16(BinomialSum16(lo[0], lo[1], lo[2])), - DivideBy16(BinomialSum16(hi[0], hi[1], hi[2]))); - } - - template void ReduceGray3x3( - const uint8_t* src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t* dst, size_t dstWidth, size_t dstHeight, size_t dstStride) - { - assert(srcWidth >= DA && (srcWidth + 1) / 2 == dstWidth && (srcHeight + 1) / 2 == dstHeight); - if (align) - assert(Aligned(src) && Aligned(srcStride)); - - size_t lastOddCol = srcWidth - AlignLo(srcWidth, 2); - size_t bodyWidth = AlignLo(srcWidth, DA); - for (size_t row = 0; row < srcHeight; row += 2, dst += dstStride, src += 2 * srcStride) - { - const uint8_t * s[3]; - s[1] = src; - s[0] = s[1] - (row ? srcStride : 0); - s[2] = s[1] + (row != srcHeight - 1 ? 
srcStride : 0); - - __m256i lo[3], hi[3]; - ReduceColNose(s, lo); - ReduceColBody(s, A, hi); - Store((__m256i*)dst, ReduceRow(lo, hi)); - - for (size_t srcCol = DA, dstCol = A; srcCol < bodyWidth; srcCol += DA, dstCol += A) - { - ReduceColBody(s, srcCol, lo); - ReduceColBody(s, srcCol + A, hi); - Store((__m256i*)(dst + dstCol), ReduceRow(lo, hi)); - } - - if (bodyWidth != srcWidth) - { - size_t srcCol = srcWidth - DA - lastOddCol; - size_t dstCol = dstWidth - A - lastOddCol; - ReduceColBody(s, srcCol, lo); - ReduceColBody(s, srcCol + A, hi); - Store((__m256i*)(dst + dstCol), ReduceRow(lo, hi)); - if (lastOddCol) - dst[dstWidth - 1] = Base::GaussianBlur3x3(s[0] + srcWidth, s[1] + srcWidth, s[2] + srcWidth, -2, -1, -1); - } - } - } - - template void ReduceGray3x3( - const uint8_t* src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t* dst, size_t dstWidth, size_t dstHeight, size_t dstStride, int compensation) - { - if (compensation) - ReduceGray3x3(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); - else - ReduceGray3x3(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); - } - - void ReduceGray3x3(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride, int compensation) - { - if (Aligned(src) && Aligned(srcStride)) - ReduceGray3x3(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, compensation); - else - ReduceGray3x3(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, compensation); - } - } -#endif// SIMD_AVX2_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx2ReduceGray4x4.cpp b/src/3rd/Simd/Simd/SimdAvx2ReduceGray4x4.cpp deleted file mode 100644 index 06d911b7..00000000 --- a/src/3rd/Simd/Simd/SimdAvx2ReduceGray4x4.cpp +++ /dev/null @@ -1,192 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
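Reviewer note: SimdAvx2ReduceGray3x3.cpp (removed above) performs the same 2:1 decimation through a separable [1 2 1] binomial tap, a 3x3 window of total weight 16 centered on each even source coordinate with replicated borders; the `compensation` flag selects rounded (+8 before the shift) versus truncating division, matching the two DivideBy16 specializations. A scalar equivalent, under the assumption that coordinate clamping reproduces the nose/tail border logic:

#include <algorithm>
#include <cstddef>
#include <cstdint>

static void ReduceGray3x3Scalar(const uint8_t* src, ptrdiff_t srcW, ptrdiff_t srcH, size_t srcStride,
                                uint8_t* dst, size_t dstStride, bool compensation)
{
    static const int k[3] = { 1, 2, 1 };
    const int round = compensation ? 8 : 0;
    for (ptrdiff_t y = 0; y < srcH; y += 2)
    {
        uint8_t* d = dst + (y / 2) * dstStride;
        for (ptrdiff_t x = 0; x < srcW; x += 2)
        {
            int sum = 0;
            for (int dy = -1; dy <= 1; ++dy)
                for (int dx = -1; dx <= 1; ++dx)
                {
                    ptrdiff_t sy = std::min(std::max(y + dy, (ptrdiff_t)0), srcH - 1); // replicate edges
                    ptrdiff_t sx = std::min(std::max(x + dx, (ptrdiff_t)0), srcW - 1);
                    sum += k[dy + 1] * k[dx + 1] * src[sy * srcStride + sx];
                }
            d[x / 2] = (uint8_t)((sum + round) >> 4);
        }
    }
}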
-*/ -#include "Simd/SimdStore.h" -#include "Simd/SimdMemory.h" - -namespace Simd -{ -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - namespace - { - struct Buffer - { - Buffer(size_t width) - { - _p = Allocate(sizeof(uint16_t) * 4 * width); - src0 = (uint16_t*)_p; - src1 = src0 + width; - src2 = src1 + width; - src3 = src2 + width; - } - - ~Buffer() - { - Free(_p); - } - - uint16_t * src0; - uint16_t * src1; - uint16_t * src2; - uint16_t * src3; - private: - void * _p; - }; - } - - SIMD_INLINE __m256i DivideBy64(__m256i value) - { - return _mm256_srli_epi16(_mm256_add_epi16(value, K16_0020), 6); - } - - SIMD_INLINE __m256i BinomialSum16(const __m256i & a, const __m256i & b, const __m256i & c, const __m256i & d) - { - return _mm256_add_epi16(_mm256_add_epi16(a, d), _mm256_mullo_epi16(_mm256_add_epi16(b, c), K16_0003)); - } - - const __m256i K8_01_03 = SIMD_MM256_SET2_EPI8(1, 3); - const __m256i K8_03_01 = SIMD_MM256_SET2_EPI8(3, 1); - - SIMD_INLINE __m256i BinomialSum16(const __m256i & ab, const __m256i & cd) - { -#ifdef SIMD_MADDUBS_ERROR - return _mm256_add_epi16(_mm256_maddubs_epi16(_mm256_or_si256(K_ZERO, ab), K8_01_03), _mm256_maddubs_epi16(_mm256_or_si256(K_ZERO, cd), K8_03_01)); -#else - return _mm256_add_epi16(_mm256_maddubs_epi16(ab, K8_01_03), _mm256_maddubs_epi16(cd, K8_03_01)); -#endif - } - - SIMD_INLINE __m256i ReduceColNose(const uint8_t * src) - { - const __m256i t2 = _mm256_loadu_si256((__m256i*)(src + 1)); - return BinomialSum16(LoadBeforeFirst(src), t2); - } - - SIMD_INLINE __m256i ReduceColBody(const uint8_t * src) - { - const __m256i t0 = _mm256_loadu_si256((__m256i*)(src - 1)); - const __m256i t2 = _mm256_loadu_si256((__m256i*)(src + 1)); - return BinomialSum16(t0, t2); - } - - template SIMD_INLINE __m256i ReduceColTail(const uint8_t * src); - - template <> SIMD_INLINE __m256i ReduceColTail(const uint8_t * src) - { - const __m256i t0 = _mm256_loadu_si256((__m256i*)(src - 1)); - const __m256i t2 = LoadAfterLast(src); - return BinomialSum16(t0, t2); - } - - template <> SIMD_INLINE __m256i ReduceColTail(const uint8_t * src) - { - const __m256i t0 = _mm256_loadu_si256((__m256i*)(src - 1)); - __m256i t1, t2; - LoadAfterLast(src - 1, t1, t2); - return BinomialSum16(t0, t2); - } - - template SIMD_INLINE __m256i ReduceRow16(const Buffer & buffer, size_t offset) - { - return _mm256_and_si256(DivideBy64(BinomialSum16( - Load((__m256i*)(buffer.src0 + offset)), Load((__m256i*)(buffer.src1 + offset)), - Load((__m256i*)(buffer.src2 + offset)), Load((__m256i*)(buffer.src3 + offset)))), K16_00FF); - } - - template SIMD_INLINE __m256i ReduceRow8(const Buffer & buffer, size_t offset) - { - __m256i lo = ReduceRow16(buffer, offset); - __m256i hi = ReduceRow16(buffer, offset + HA); - return PackI16ToU8(lo, hi); - } - - template void ReduceGray4x4(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride) - { - assert((srcWidth + 1) / 2 == dstWidth && (srcHeight + 1) / 2 == dstHeight && srcWidth > DA); - - size_t alignedDstWidth = Simd::AlignLo(dstWidth, A); - size_t srcTail = Simd::AlignHi(srcWidth - A, 2); - - Buffer buffer(Simd::AlignHi(dstWidth, A)); - - __m256i tmp = ReduceColNose(src); - Store((__m256i*)buffer.src0, tmp); - Store((__m256i*)buffer.src1, tmp); - size_t srcCol = A, dstCol = HA; - for (; srcCol < srcWidth - A; srcCol += A, dstCol += HA) - { - tmp = ReduceColBody(src + srcCol); - Store((__m256i*)(buffer.src0 + dstCol), tmp); - Store((__m256i*)(buffer.src1 + dstCol), tmp); - } - tmp = 
ReduceColTail(src + srcTail); - Store((__m256i*)(buffer.src0 + dstWidth - HA), tmp); - Store((__m256i*)(buffer.src1 + dstWidth - HA), tmp); - - for (size_t row = 0; row < srcHeight; row += 2, dst += dstStride) - { - const uint8_t *src2 = src + srcStride*(row + 1); - const uint8_t *src3 = src2 + srcStride; - if (row >= srcHeight - 2) - { - src2 = src + srcStride*(srcHeight - 1); - src3 = src2; - } - - Store((__m256i*)buffer.src2, ReduceColNose(src2)); - Store((__m256i*)buffer.src3, ReduceColNose(src3)); - size_t srcCol = A, dstCol = HA; - for (; srcCol < srcWidth - A; srcCol += A, dstCol += HA) - { - Store((__m256i*)(buffer.src2 + dstCol), ReduceColBody(src2 + srcCol)); - Store((__m256i*)(buffer.src3 + dstCol), ReduceColBody(src3 + srcCol)); - } - Store((__m256i*)(buffer.src2 + dstWidth - HA), ReduceColTail(src2 + srcTail)); - Store((__m256i*)(buffer.src3 + dstWidth - HA), ReduceColTail(src3 + srcTail)); - - Store((__m256i*)dst, ReduceRow8(buffer, 0)); - for (size_t col = A; col < alignedDstWidth; col += A) - Store((__m256i*)(dst + col), ReduceRow8(buffer, col)); - - if (alignedDstWidth != dstWidth) - Store((__m256i*)(dst + dstWidth - A), ReduceRow8(buffer, dstWidth - A)); - - Swap(buffer.src0, buffer.src2); - Swap(buffer.src1, buffer.src3); - } - } - - void ReduceGray4x4(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride) - { - if (Aligned(srcWidth, 2)) - ReduceGray4x4(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); - else - ReduceGray4x4(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); - } - } -#endif// SIMD_AVX2_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx2ReduceGray5x5.cpp b/src/3rd/Simd/Simd/SimdAvx2ReduceGray5x5.cpp deleted file mode 100644 index 76493a0f..00000000 --- a/src/3rd/Simd/Simd/SimdAvx2ReduceGray5x5.cpp +++ /dev/null @@ -1,202 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
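Reviewer note: SimdAvx2ReduceGray4x4.cpp (removed above) also halves the image, but with a separable [1 3 3 1] tap (its BinomialSum16 computes a + 3b + 3c + d), i.e. a 4x4 window of total weight 64 sampled at offsets -1..+2 around each even coordinate, always with rounded division (+32, shift by 6). A scalar sketch, assuming the tail handling amounts to clamping:

#include <algorithm>
#include <cstddef>
#include <cstdint>

static void ReduceGray4x4Scalar(const uint8_t* src, ptrdiff_t srcW, ptrdiff_t srcH, size_t srcStride,
                                uint8_t* dst, size_t dstStride)
{
    static const int k[4] = { 1, 3, 3, 1 };
    for (ptrdiff_t y = 0; y < srcH; y += 2)
    {
        uint8_t* d = dst + (y / 2) * dstStride;
        for (ptrdiff_t x = 0; x < srcW; x += 2)
        {
            int sum = 0;
            for (int dy = -1; dy <= 2; ++dy)
                for (int dx = -1; dx <= 2; ++dx)
                {
                    ptrdiff_t sy = std::min(std::max(y + dy, (ptrdiff_t)0), srcH - 1); // replicate edges
                    ptrdiff_t sx = std::min(std::max(x + dx, (ptrdiff_t)0), srcW - 1);
                    sum += k[dy + 1] * k[dx + 1] * src[sy * srcStride + sx];
                }
            d[x / 2] = (uint8_t)((sum + 32) >> 6);
        }
    }
}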
-*/ -#include "Simd/SimdStore.h" -#include "Simd/SimdMemory.h" - -namespace Simd -{ -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - namespace - { - struct Buffer - { - Buffer(size_t width) - { - _p = Allocate(sizeof(uint16_t)*(5 * width + A)); - in0 = (uint16_t*)_p; - in1 = in0 + width; - out0 = in1 + width; - out1 = out0 + width; - dst = out1 + width + HA; - } - - ~Buffer() - { - Free(_p); - } - - uint16_t * in0; - uint16_t * in1; - uint16_t * out0; - uint16_t * out1; - uint16_t * dst; - private: - void *_p; - }; - } - - template SIMD_INLINE __m256i DivideBy256(__m256i value); - - template <> SIMD_INLINE __m256i DivideBy256(__m256i value) - { - return _mm256_srli_epi16(_mm256_add_epi16(value, K16_0080), 8); - } - - template <> SIMD_INLINE __m256i DivideBy256(__m256i value) - { - return _mm256_srli_epi16(value, 8); - } - - template SIMD_INLINE __m256i LoadUnpacked(const void * src) - { - return _mm256_cvtepu8_epi16(LoadHalf((const __m128i*)src)); - } - - template SIMD_INLINE void FirstRow5x5(__m256i src, Buffer & buffer, size_t offset) - { - Store((__m256i*)(buffer.in0 + offset), src); - Store((__m256i*)(buffer.in1 + offset), _mm256_mullo_epi16(src, K16_0005)); - } - - template SIMD_INLINE void FirstRow5x5(const uint8_t * src, Buffer & buffer, size_t offset) - { - FirstRow5x5(LoadUnpacked(src + offset), buffer, offset); - offset += HA; - FirstRow5x5(LoadUnpacked(src + offset), buffer, offset); - } - - template SIMD_INLINE void MainRowY5x5(__m256i odd, __m256i even, Buffer & buffer, size_t offset) - { - __m256i cp = _mm256_mullo_epi16(odd, K16_0004); - __m256i c0 = Load((__m256i*)(buffer.in0 + offset)); - __m256i c1 = Load((__m256i*)(buffer.in1 + offset)); - Store((__m256i*)(buffer.dst + offset), _mm256_add_epi16(even, _mm256_add_epi16(c1, _mm256_add_epi16(cp, _mm256_mullo_epi16(c0, K16_0006))))); - Store((__m256i*)(buffer.out1 + offset), _mm256_add_epi16(c0, cp)); - Store((__m256i*)(buffer.out0 + offset), even); - } - - template SIMD_INLINE void MainRowY5x5(const uint8_t * odd, const uint8_t * even, Buffer & buffer, size_t offset) - { - MainRowY5x5(LoadUnpacked(odd + offset), LoadUnpacked(even + offset), buffer, offset); - offset += HA; - MainRowY5x5(LoadUnpacked(odd + offset), LoadUnpacked(even + offset), buffer, offset); - } - - template SIMD_INLINE __m256i MainRowX5x5(uint16_t * dst) - { - __m256i t0 = _mm256_loadu_si256((__m256i*)(dst - 2)); - __m256i t1 = _mm256_loadu_si256((__m256i*)(dst - 1)); - __m256i t2 = Load((__m256i*)dst); - __m256i t3 = _mm256_loadu_si256((__m256i*)(dst + 1)); - __m256i t4 = _mm256_loadu_si256((__m256i*)(dst + 2)); - t2 = _mm256_add_epi16(_mm256_add_epi16(_mm256_mullo_epi16(t2, K16_0006), _mm256_mullo_epi16(_mm256_add_epi16(t1, t3), K16_0004)), _mm256_add_epi16(t0, t4)); - return DivideBy256(t2); - } - - template SIMD_INLINE __m256i MainRowX5x5(Buffer & buffer, size_t offset) - { - const __m256i lo = MainRowX5x5(buffer.dst + offset); - const __m256i hi = MainRowX5x5(buffer.dst + offset + HA); - return _mm256_and_si256(PackI16ToU8(lo, hi), K16_00FF); - } - - template SIMD_INLINE void MainRowX5x5(Buffer & buffer, size_t offset, uint8_t * dst) - { - __m256i lo = MainRowX5x5(buffer, offset); - __m256i hi = MainRowX5x5(buffer, offset + A); - Store((__m256i*)dst, PackI16ToU8(lo, hi)); - } - - template void ReduceGray5x5( - const uint8_t* src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t* dst, size_t dstWidth, size_t dstHeight, size_t dstStride) - { - assert((srcWidth + 1) / 2 == dstWidth && (srcHeight + 1) / 2 == dstHeight && srcWidth >= DA); - if 
(align) - assert(Aligned(src) && Aligned(srcStride)); - - size_t alignedWidth = Simd::AlignLo(srcWidth, DA); - size_t bufferDstTail = Simd::AlignHi(srcWidth - DA, 2); - - Buffer buffer(Simd::AlignHi(srcWidth, A)); - - for (size_t col = 0; col < alignedWidth; col += A) - FirstRow5x5(src, buffer, col); - if (alignedWidth != srcWidth) - { - FirstRow5x5(src, buffer, srcWidth - DA); - FirstRow5x5(src, buffer, srcWidth - A); - } - src += srcStride; - - for (size_t row = 1; row <= srcHeight; row += 2, dst += dstStride, src += 2 * srcStride) - { - const uint8_t * odd = src - (row < srcHeight ? 0 : srcStride); - const uint8_t * even = odd + (row < srcHeight - 1 ? srcStride : 0); - - for (size_t col = 0; col < alignedWidth; col += A) - MainRowY5x5(odd, even, buffer, col); - if (alignedWidth != srcWidth) - { - MainRowY5x5(odd, even, buffer, srcWidth - DA); - MainRowY5x5(odd, even, buffer, srcWidth - A); - } - - Swap(buffer.in0, buffer.out0); - Swap(buffer.in1, buffer.out1); - - buffer.dst[-2] = buffer.dst[0]; - buffer.dst[-1] = buffer.dst[0]; - buffer.dst[srcWidth] = buffer.dst[srcWidth - 1]; - buffer.dst[srcWidth + 1] = buffer.dst[srcWidth - 1]; - - for (size_t srcCol = 0, dstCol = 0; srcCol < alignedWidth; srcCol += DA, dstCol += A) - MainRowX5x5(buffer, srcCol, dst + dstCol); - if (alignedWidth != srcWidth) - MainRowX5x5(buffer, bufferDstTail, dst + dstWidth - A); - } - } - - template void ReduceGray5x5(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride)) - ReduceGray5x5(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); - else - ReduceGray5x5(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); - } - - void ReduceGray5x5(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride, int compensation) - { - if (compensation) - ReduceGray5x5(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); - else - ReduceGray5x5(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); - } - } -#endif// SIMD_AVX2_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx2Reorder.cpp b/src/3rd/Simd/Simd/SimdAvx2Reorder.cpp deleted file mode 100644 index 2fbc708a..00000000 --- a/src/3rd/Simd/Simd/SimdAvx2Reorder.cpp +++ /dev/null @@ -1,120 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
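Reviewer note: SimdAvx2ReduceGray5x5.cpp (removed above) is the widest member of the family: a separable [1 4 6 4 1] binomial, a 5x5 window of total weight 256 (the K16_0004/K16_0005/K16_0006 constants and DivideBy256 give the weights away), again with `compensation` choosing rounded (+128) or truncating division. Same clamping assumption as before:

#include <algorithm>
#include <cstddef>
#include <cstdint>

static void ReduceGray5x5Scalar(const uint8_t* src, ptrdiff_t srcW, ptrdiff_t srcH, size_t srcStride,
                                uint8_t* dst, size_t dstStride, bool compensation)
{
    static const int k[5] = { 1, 4, 6, 4, 1 };
    const int round = compensation ? 128 : 0;
    for (ptrdiff_t y = 0; y < srcH; y += 2)
    {
        uint8_t* d = dst + (y / 2) * dstStride;
        for (ptrdiff_t x = 0; x < srcW; x += 2)
        {
            int sum = 0;
            for (int dy = -2; dy <= 2; ++dy)
                for (int dx = -2; dx <= 2; ++dx)
                {
                    ptrdiff_t sy = std::min(std::max(y + dy, (ptrdiff_t)0), srcH - 1); // replicate edges
                    ptrdiff_t sx = std::min(std::max(x + dx, (ptrdiff_t)0), srcW - 1);
                    sum += k[dy + 2] * k[dx + 2] * src[sy * srcStride + sx];
                }
            d[x / 2] = (uint8_t)((sum + round) >> 8);
        }
    }
}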
IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdStore.h" -#include "Simd/SimdMemory.h" - -namespace Simd -{ -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - const __m256i K8_SHUFFLE_REORDER_16 = SIMD_MM256_SETR_EPI8( - 0x1, 0x0, 0x3, 0x2, 0x5, 0x4, 0x7, 0x6, 0x9, 0x8, 0xB, 0xA, 0xD, 0xC, 0xF, 0xE, - 0x1, 0x0, 0x3, 0x2, 0x5, 0x4, 0x7, 0x6, 0x9, 0x8, 0xB, 0xA, 0xD, 0xC, 0xF, 0xE); - - template SIMD_INLINE void Reorder16bit(const uint8_t * src, uint8_t * dst) - { - __m256i _src = Load((__m256i*)src); - Store((__m256i*)dst, _mm256_shuffle_epi8(_src, K8_SHUFFLE_REORDER_16)); - } - - template void Reorder16bit(const uint8_t * src, size_t size, uint8_t * dst) - { - assert(size >= A && size % 2 == 0); - - size_t alignedSize = AlignLo(size, A); - for (size_t i = 0; i < alignedSize; i += A) - Reorder16bit(src + i, dst + i); - for (size_t i = alignedSize; i < size; i += 2) - Base::Reorder16bit(src + i, dst + i); - } - - void Reorder16bit(const uint8_t * src, size_t size, uint8_t * dst) - { - if (Aligned(src) && Aligned(dst)) - Reorder16bit(src, size, dst); - else - Reorder16bit(src, size, dst); - } - - const __m256i K8_SHUFFLE_REORDER_32 = SIMD_MM256_SETR_EPI8( - 0x3, 0x2, 0x1, 0x0, 0x7, 0x6, 0x5, 0x4, 0xB, 0xA, 0x9, 0x8, 0xF, 0xE, 0xD, 0xC, - 0x3, 0x2, 0x1, 0x0, 0x7, 0x6, 0x5, 0x4, 0xB, 0xA, 0x9, 0x8, 0xF, 0xE, 0xD, 0xC); - - template SIMD_INLINE void Reorder32bit(const uint8_t * src, uint8_t * dst) - { - __m256i _src = Load((__m256i*)src); - Store((__m256i*)dst, _mm256_shuffle_epi8(_src, K8_SHUFFLE_REORDER_32)); - } - - template void Reorder32bit(const uint8_t * src, size_t size, uint8_t * dst) - { - assert(size >= A && size % 4 == 0); - - size_t alignedSize = AlignLo(size, A); - for (size_t i = 0; i < alignedSize; i += A) - Reorder32bit(src + i, dst + i); - for (size_t i = alignedSize; i < size; i += 4) - Base::Reorder32bit(src + i, dst + i); - } - - void Reorder32bit(const uint8_t * src, size_t size, uint8_t * dst) - { - if (Aligned(src) && Aligned(dst)) - Reorder32bit(src, size, dst); - else - Reorder32bit(src, size, dst); - } - - const __m256i K8_SHUFFLE_REORDER_64 = SIMD_MM256_SETR_EPI8( - 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0, 0xF, 0xE, 0xD, 0xC, 0xB, 0xA, 0x9, 0x8, - 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0, 0xF, 0xE, 0xD, 0xC, 0xB, 0xA, 0x9, 0x8); - - template SIMD_INLINE void Reorder64bit(const uint8_t * src, uint8_t * dst) - { - __m256i _src = Load((__m256i*)src); - Store((__m256i*)dst, _mm256_shuffle_epi8(_src, K8_SHUFFLE_REORDER_64)); - } - - template void Reorder64bit(const uint8_t * src, size_t size, uint8_t * dst) - { - assert(size >= A && size % 8 == 0); - - size_t alignedSize = AlignLo(size, A); - for (size_t i = 0; i < alignedSize; i += A) - Reorder64bit(src + i, dst + i); - for (size_t i = alignedSize; i < size; i += 8) - Base::Reorder64bit(src + i, dst + i); - } - - void Reorder64bit(const uint8_t * src, size_t size, uint8_t * dst) - { - if (Aligned(src) && Aligned(dst)) - Reorder64bit(src, size, dst); - else - Reorder64bit(src, size, dst); - } - } -#endif// SIMD_AVX2_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx2ResizeBilinear.cpp b/src/3rd/Simd/Simd/SimdAvx2ResizeBilinear.cpp deleted file mode 100644 index d1c70247..00000000 --- a/src/3rd/Simd/Simd/SimdAvx2ResizeBilinear.cpp +++ /dev/null @@ -1,428 +0,0 @@ -/* -* Simd Library 
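Reviewer note: the SimdAvx2Reorder.cpp kernels removed above reverse the byte order inside every 2-, 4-, or 8-byte element (an endianness swap) with a single _mm256_shuffle_epi8 per 32 bytes, falling back to the Base scalar routines for the remainder. The 32-bit case is equivalent to:

#include <cstddef>
#include <cstdint>

// size must be a multiple of 4; each dword ABCD becomes DCBA.
static void Reorder32bitScalar(const uint8_t* src, size_t size, uint8_t* dst)
{
    for (size_t i = 0; i < size; i += 4)
    {
        dst[i + 0] = src[i + 3];
        dst[i + 1] = src[i + 2];
        dst[i + 2] = src[i + 1];
        dst[i + 3] = src[i + 0];
    }
}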
(http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdBase.h" - -namespace Simd -{ -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - namespace - { - struct Buffer - { - Buffer(size_t size, size_t width, size_t height) - { - _p = Allocate(3 * size + sizeof(int)*(2 * height + width)); - bx[0] = (uint8_t*)_p; - bx[1] = bx[0] + size; - ax = bx[1] + size; - ix = (int*)(ax + size); - iy = ix + width; - ay = iy + height; - } - - ~Buffer() - { - Free(_p); - } - - uint8_t * bx[2]; - uint8_t * ax; - int * ix; - int * ay; - int * iy; - private: - void *_p; - }; - - struct Index - { - int src, dst; - uint8_t shuffle[A]; - }; - - struct BufferG - { - BufferG(size_t width, size_t blocks, size_t height) - { - _p = Allocate(3 * width + sizeof(int) * 2 * height + blocks * sizeof(Index) + 2 * A); - bx[0] = (uint8_t*)_p; - bx[1] = bx[0] + width + A; - ax = bx[1] + width + A; - ix = (Index*)(ax + width); - iy = (int*)(ix + blocks); - ay = iy + height; - } - - ~BufferG() - { - Free(_p); - } - - uint8_t * bx[2]; - uint8_t * ax; - Index * ix; - int * ay; - int * iy; - private: - void *_p; - }; - } - - template void EstimateAlphaIndexX(size_t srcSize, size_t dstSize, int * indexes, uint8_t * alphas) - { - float scale = (float)srcSize / dstSize; - - for (size_t i = 0; i < dstSize; ++i) - { - float alpha = (float)((i + 0.5)*scale - 0.5); - ptrdiff_t index = (ptrdiff_t)::floor(alpha); - alpha -= index; - - if (index < 0) - { - index = 0; - alpha = 0; - } - - if (index > (ptrdiff_t)srcSize - 2) - { - index = srcSize - 2; - alpha = 1; - } - - indexes[i] = (int)index; - alphas[1] = (uint8_t)(alpha * Base::FRACTION_RANGE + 0.5); - alphas[0] = (uint8_t)(Base::FRACTION_RANGE - alphas[1]); - for (size_t channel = 1; channel < channelCount; channel++) - ((uint16_t*)alphas)[channel] = *(uint16_t*)alphas; - alphas += 2 * channelCount; - } - } - - size_t BlockCountMax(size_t src, size_t dst) - { - return (size_t)Simd::Max(::ceil(float(src) / (A - 1)), ::ceil(float(dst) / HA)); - } - - void EstimateAlphaIndexX(int srcSize, int dstSize, Index * indexes, uint8_t * alphas, size_t & blockCount) - { - float scale = (float)srcSize / dstSize; - int block = 0; - indexes[0].src = 0; - indexes[0].dst = 0; - for (int dstIndex = 0; dstIndex < dstSize; ++dstIndex) - { - float alpha = (float)((dstIndex + 0.5)*scale - 0.5); - int srcIndex = (int)::floor(alpha); - alpha -= srcIndex; 
- - if (srcIndex < 0) - { - srcIndex = 0; - alpha = 0; - } - - if (srcIndex > srcSize - 2) - { - srcIndex = srcSize - 2; - alpha = 1; - } - - int dst = 2 * dstIndex - indexes[block].dst; - int src = srcIndex - indexes[block].src; - if (src >= A - 1 || dst >= A) - { - block++; - indexes[block].src = Simd::Min(srcIndex, srcSize - (int)A); - indexes[block].dst = 2 * dstIndex; - dst = 0; - src = srcIndex - indexes[block].src; - } - indexes[block].shuffle[dst] = src; - indexes[block].shuffle[dst + 1] = src + 1; - - alphas[1] = (uint8_t)(alpha * Base::FRACTION_RANGE + 0.5); - alphas[0] = (uint8_t)(Base::FRACTION_RANGE - alphas[1]); - alphas += 2; - } - blockCount = block + 1; - } - - template void InterpolateX(const __m256i * alpha, __m256i * buffer); - - template <> SIMD_INLINE void InterpolateX<1>(const __m256i * alpha, __m256i * buffer) - { -#ifdef SIMD_MADDUBS_ERROR - __m256i _buffer = _mm256_or_si256(K_ZERO, _mm256_load_si256(buffer)); -#else - __m256i _buffer = _mm256_load_si256(buffer); -#endif - _mm256_store_si256(buffer, _mm256_maddubs_epi16(_buffer, _mm256_load_si256(alpha))); - } - - const __m256i K8_SHUFFLE_X2 = SIMD_MM256_SETR_EPI8(0x0, 0x2, 0x1, 0x3, 0x4, 0x6, 0x5, 0x7, 0x8, 0xA, 0x9, 0xB, 0xC, 0xE, 0xD, 0xF, - 0x0, 0x2, 0x1, 0x3, 0x4, 0x6, 0x5, 0x7, 0x8, 0xA, 0x9, 0xB, 0xC, 0xE, 0xD, 0xF); - - SIMD_INLINE void InterpolateX2(const __m256i * alpha, __m256i * buffer) - { - __m256i src = _mm256_shuffle_epi8(_mm256_load_si256(buffer), K8_SHUFFLE_X2); - _mm256_store_si256(buffer, _mm256_maddubs_epi16(src, _mm256_load_si256(alpha))); - } - - template <> SIMD_INLINE void InterpolateX<2>(const __m256i * alpha, __m256i * buffer) - { - InterpolateX2(alpha + 0, buffer + 0); - InterpolateX2(alpha + 1, buffer + 1); - } - - const __m256i K8_SHUFFLE_X3_00 = SIMD_MM256_SETR_EPI8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - 0xE, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); - const __m256i K8_SHUFFLE_X3_01 = SIMD_MM256_SETR_EPI8(0x0, 0x3, 0x1, 0x4, 0x2, 0x5, 0x6, 0x9, 0x7, 0xA, 0x8, 0xB, 0xC, 0xF, 0xD, -1, - -1, 0x1, 0x2, 0x5, 0x3, 0x6, 0x4, 0x7, 0x8, 0xB, 0x9, 0xC, 0xA, 0xD, 0xE, -1); - const __m256i K8_SHUFFLE_X3_02 = SIMD_MM256_SETR_EPI8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x0, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x1); - - const __m256i K8_SHUFFLE_X3_10 = SIMD_MM256_SETR_EPI8(0xF, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); - const __m256i K8_SHUFFLE_X3_11 = SIMD_MM256_SETR_EPI8(-1, 0x2, 0x0, 0x3, 0x4, 0x7, 0x5, 0x8, 0x6, 0x9, 0xA, 0xD, 0xB, 0xE, 0xC, 0xF, - 0x0, 0x3, 0x1, 0x4, 0x2, 0x5, 0x6, 0x9, 0x7, 0xA, 0x8, 0xB, 0xC, 0xF, 0xD, -1); - const __m256i K8_SHUFFLE_X3_12 = SIMD_MM256_SETR_EPI8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x0); - - const __m256i K8_SHUFFLE_X3_20 = SIMD_MM256_SETR_EPI8(0xE, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - 0xF, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); - const __m256i K8_SHUFFLE_X3_21 = SIMD_MM256_SETR_EPI8(-1, 0x1, 0x2, 0x5, 0x3, 0x6, 0x4, 0x7, 0x8, 0xB, 0x9, 0xC, 0xA, 0xD, 0xE, -1, - -1, 0x2, 0x0, 0x3, 0x4, 0x7, 0x5, 0x8, 0x6, 0x9, 0xA, 0xD, 0xB, 0xE, 0xC, 0xF); - const __m256i K8_SHUFFLE_X3_22 = SIMD_MM256_SETR_EPI8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); - - template <> 
SIMD_INLINE void InterpolateX<3>(const __m256i * alpha, __m256i * buffer) - { - __m256i src[3], shuffled; - src[0] = _mm256_load_si256(buffer + 0); - src[1] = _mm256_load_si256(buffer + 1); - src[2] = _mm256_load_si256(buffer + 2); - - shuffled = _mm256_shuffle_epi8(_mm256_permute2x128_si256(src[0], src[0], 0x21), K8_SHUFFLE_X3_00); - shuffled = _mm256_or_si256(shuffled, _mm256_shuffle_epi8(src[0], K8_SHUFFLE_X3_01)); - shuffled = _mm256_or_si256(shuffled, _mm256_shuffle_epi8(_mm256_permute2x128_si256(src[0], src[1], 0x21), K8_SHUFFLE_X3_02)); - _mm256_store_si256(buffer + 0, _mm256_maddubs_epi16(shuffled, _mm256_load_si256(alpha + 0))); - - shuffled = _mm256_shuffle_epi8(_mm256_permute2x128_si256(src[0], src[1], 0x21), K8_SHUFFLE_X3_10); - shuffled = _mm256_or_si256(shuffled, _mm256_shuffle_epi8(src[1], K8_SHUFFLE_X3_11)); - shuffled = _mm256_or_si256(shuffled, _mm256_shuffle_epi8(_mm256_permute2x128_si256(src[1], src[2], 0x21), K8_SHUFFLE_X3_12)); - _mm256_store_si256(buffer + 1, _mm256_maddubs_epi16(shuffled, _mm256_load_si256(alpha + 1))); - - shuffled = _mm256_shuffle_epi8(_mm256_permute2x128_si256(src[1], src[2], 0x21), K8_SHUFFLE_X3_20); - shuffled = _mm256_or_si256(shuffled, _mm256_shuffle_epi8(src[2], K8_SHUFFLE_X3_21)); - shuffled = _mm256_or_si256(shuffled, _mm256_shuffle_epi8(_mm256_permute2x128_si256(src[2], src[2], 0x21), K8_SHUFFLE_X3_22)); - _mm256_store_si256(buffer + 2, _mm256_maddubs_epi16(shuffled, _mm256_load_si256(alpha + 2))); - } - - const __m256i K8_SHUFFLE_X4 = SIMD_MM256_SETR_EPI8(0x0, 0x4, 0x1, 0x5, 0x2, 0x6, 0x3, 0x7, 0x8, 0xC, 0x9, 0xD, 0xA, 0xE, 0xB, 0xF, - 0x0, 0x4, 0x1, 0x5, 0x2, 0x6, 0x3, 0x7, 0x8, 0xC, 0x9, 0xD, 0xA, 0xE, 0xB, 0xF); - - SIMD_INLINE void InterpolateX4(const __m256i * alpha, __m256i * buffer) - { - __m256i src = _mm256_shuffle_epi8(_mm256_load_si256(buffer), K8_SHUFFLE_X4); - _mm256_store_si256(buffer, _mm256_maddubs_epi16(src, _mm256_load_si256(alpha))); - } - - template <> SIMD_INLINE void InterpolateX<4>(const __m256i * alpha, __m256i * buffer) - { - InterpolateX4(alpha + 0, buffer + 0); - InterpolateX4(alpha + 1, buffer + 1); - InterpolateX4(alpha + 2, buffer + 2); - InterpolateX4(alpha + 3, buffer + 3); - } - - const __m256i K16_FRACTION_ROUND_TERM = SIMD_MM256_SET1_EPI16(Base::BILINEAR_ROUND_TERM); - - template SIMD_INLINE __m256i InterpolateY(const __m256i * pbx0, const __m256i * pbx1, __m256i alpha[2]) - { - __m256i sum = _mm256_add_epi16(_mm256_mullo_epi16(Load(pbx0), alpha[0]), _mm256_mullo_epi16(Load(pbx1), alpha[1])); - return _mm256_srli_epi16(_mm256_add_epi16(sum, K16_FRACTION_ROUND_TERM), Base::BILINEAR_SHIFT); - } - - template SIMD_INLINE void InterpolateY(const uint8_t * bx0, const uint8_t * bx1, __m256i alpha[2], uint8_t * dst) - { - __m256i lo = InterpolateY((__m256i*)bx0 + 0, (__m256i*)bx1 + 0, alpha); - __m256i hi = InterpolateY((__m256i*)bx0 + 1, (__m256i*)bx1 + 1, alpha); - Store((__m256i*)dst, PackI16ToU8(lo, hi)); - } - - template void ResizeBilinear( - const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride) - { - assert(dstWidth >= A); - - struct One { uint8_t channels[channelCount]; }; - struct Two { uint8_t channels[channelCount * 2]; }; - - size_t size = 2 * dstWidth*channelCount; - size_t bufferSize = AlignHi(dstWidth, A)*channelCount * 2; - size_t alignedSize = AlignHi(size, DA) - DA; - const size_t step = A*channelCount; - - Buffer buffer(bufferSize, dstWidth, dstHeight); - - Base::EstimateAlphaIndex(srcHeight, dstHeight, 
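Reviewer note: the InterpolateX variants above all lean on one trick: pixels are pre-shuffled into {p0, p1} pairs and weights into {range - a, a} pairs, so _mm256_maddubs_epi16 (unsigned bytes times signed bytes, adjacent products summed into 16-bit lanes) yields p0*(range - a) + p1*a for 16 output pixels per instruction; the SIMD_MADDUBS_ERROR branches route around toolchains where that intrinsic misbehaves. A standalone demo of the identity (the range of 16 is this demo's own choice, not necessarily Base::FRACTION_RANGE):

#include <cstdint>
#include <cstdio>
#include <immintrin.h>

int main()
{
    alignas(32) uint8_t pix[32], wgt[32];
    for (int i = 0; i < 16; ++i)
    {
        pix[2 * i + 0] = 100;  // p0
        pix[2 * i + 1] = 200;  // p1
        wgt[2 * i + 0] = 12;   // range - a
        wgt[2 * i + 1] = 4;    // a
    }
    __m256i r = _mm256_maddubs_epi16(_mm256_load_si256((const __m256i*)pix),
                                     _mm256_load_si256((const __m256i*)wgt));
    alignas(32) int16_t out[16];
    _mm256_store_si256((__m256i*)out, r);
    printf("%d\n", out[0]);  // 100 * 12 + 200 * 4 = 2000, i.e. 16 x the blended value 125
    return 0;
}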
buffer.iy, buffer.ay, 1); - - EstimateAlphaIndexX(srcWidth, dstWidth, buffer.ix, buffer.ax); - - ptrdiff_t previous = -2; - - __m256i a[2]; - - for (size_t yDst = 0; yDst < dstHeight; yDst++, dst += dstStride) - { - a[0] = _mm256_set1_epi16(int16_t(Base::FRACTION_RANGE - buffer.ay[yDst])); - a[1] = _mm256_set1_epi16(int16_t(buffer.ay[yDst])); - - ptrdiff_t sy = buffer.iy[yDst]; - int k = 0; - - if (sy == previous) - k = 2; - else if (sy == previous + 1) - { - Swap(buffer.bx[0], buffer.bx[1]); - k = 1; - } - - previous = sy; - - for (; k < 2; k++) - { - Two * pb = (Two *)buffer.bx[k]; - const One * psrc = (const One *)(src + (sy + k)*srcStride); - for (size_t x = 0; x < dstWidth; x++) - pb[x] = *(Two *)(psrc + buffer.ix[x]); - - uint8_t * pbx = buffer.bx[k]; - for (size_t i = 0; i < bufferSize; i += step) - InterpolateX((__m256i*)(buffer.ax + i), (__m256i*)(pbx + i)); - } - - for (size_t ib = 0, id = 0; ib < alignedSize; ib += DA, id += A) - InterpolateY(buffer.bx[0] + ib, buffer.bx[1] + ib, a, dst + id); - size_t i = size - DA; - InterpolateY(buffer.bx[0] + i, buffer.bx[1] + i, a, dst + i / 2); - } - } - - SIMD_INLINE void LoadGrayIntrepolated(const uint8_t * src, const Index & index, const uint8_t * alpha, uint8_t * dst) - { - __m256i _src = _mm256_loadu_si256((__m256i*)(src + index.src)); - __m256i _shuffle = _mm256_loadu_si256((__m256i*)&index.shuffle); - __m256i _alpha = _mm256_loadu_si256((__m256i*)(alpha + index.dst)); - _mm256_storeu_si256((__m256i*)(dst + index.dst), _mm256_maddubs_epi16(Shuffle(_src, _shuffle), _alpha)); - } - - void ResizeBilinearGray(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride) - { - assert(dstWidth >= A); - - size_t size = 2 * dstWidth; - size_t bufferWidth = AlignHi(dstWidth, A) * 2; - size_t blockCount = BlockCountMax(srcWidth, dstWidth); - size_t alignedSize = AlignHi(size, DA) - DA; - - BufferG buffer(bufferWidth, blockCount, dstHeight); - - Base::EstimateAlphaIndex(srcHeight, dstHeight, buffer.iy, buffer.ay, 1); - - EstimateAlphaIndexX((int)srcWidth, (int)dstWidth, buffer.ix, buffer.ax, blockCount); - - ptrdiff_t previous = -2; - - __m256i a[2]; - - for (size_t yDst = 0; yDst < dstHeight; yDst++, dst += dstStride) - { - a[0] = _mm256_set1_epi16(int16_t(Base::FRACTION_RANGE - buffer.ay[yDst])); - a[1] = _mm256_set1_epi16(int16_t(buffer.ay[yDst])); - - ptrdiff_t sy = buffer.iy[yDst]; - int k = 0; - - if (sy == previous) - k = 2; - else if (sy == previous + 1) - { - Swap(buffer.bx[0], buffer.bx[1]); - k = 1; - } - - previous = sy; - - for (; k < 2; k++) - { - const uint8_t * psrc = src + (sy + k)*srcStride; - uint8_t * pdst = buffer.bx[k]; - for (size_t i = 0; i < blockCount; ++i) - LoadGrayIntrepolated(psrc, buffer.ix[i], buffer.ax, pdst); - } - - for (size_t ib = 0, id = 0; ib < alignedSize; ib += DA, id += A) - InterpolateY(buffer.bx[0] + ib, buffer.bx[1] + ib, a, dst + id); - size_t i = size - DA; - InterpolateY(buffer.bx[0] + i, buffer.bx[1] + i, a, dst + i / 2); - } - } - - void ResizeBilinear( - const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount) - { - switch (channelCount) - { - case 1: - if (srcWidth >= A && srcWidth < 4 * dstWidth) - ResizeBilinearGray(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); - else - ResizeBilinear<1>(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); - break; - case 2: - 
ResizeBilinear<2>(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); - break; - case 3: - ResizeBilinear<3>(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); - break; - case 4: - ResizeBilinear<4>(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); - break; - default: - Base::ResizeBilinear(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, channelCount); - } - } - } -#endif//SIMD_AVX2_ENABLE -} - diff --git a/src/3rd/Simd/Simd/SimdAvx2Resizer.cpp b/src/3rd/Simd/Simd/SimdAvx2Resizer.cpp deleted file mode 100644 index 9e5df02d..00000000 --- a/src/3rd/Simd/Simd/SimdAvx2Resizer.cpp +++ /dev/null @@ -1,656 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
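Reviewer note: the freestanding ResizeBilinear above and the ResizerByteBilinear class in the next file build their sampling grid the same way: for destination index i, alpha = (i + 0.5) * srcSize / dstSize - 0.5 splits into floor (the source index) and fraction (the blend weight), clamped so the index stays in [0, srcSize - 2]; the byte paths then quantize the fraction to Base::FRACTION_RANGE fixed point. The mapping and one interpolated sample, in plain floating point:

#include <cmath>
#include <cstddef>

static void EstimateAlphaIndex(size_t srcSize, size_t dstSize, ptrdiff_t* index, float* alpha)
{
    float scale = (float)srcSize / dstSize;
    for (size_t i = 0; i < dstSize; ++i)
    {
        float a = (float)((i + 0.5) * scale - 0.5);
        ptrdiff_t idx = (ptrdiff_t)std::floor(a);
        a -= idx;
        if (idx < 0) { idx = 0; a = 0.0f; }                 // clamp at the left edge
        if (idx > (ptrdiff_t)srcSize - 2)                   // and at the right
        { idx = (ptrdiff_t)srcSize - 2; a = 1.0f; }
        index[i] = idx;
        alpha[i] = a;
    }
}

// One horizontal sample; the vertical pass applies the same blend between two such rows.
static inline float SampleRow(const float* row, ptrdiff_t idx, float a)
{
    return row[idx] * (1.0f - a) + row[idx + 1] * a;
}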
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdResizer.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdSet.h" -#include "Simd/SimdUpdate.h" - -namespace Simd -{ -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - ResizerByteBilinear::ResizerByteBilinear(const ResParam & param) - : Ssse3::ResizerByteBilinear(param) - { - } - - void ResizerByteBilinear::EstimateParams() - { - if (_ax.data) - return; - if (_param.channels == 1 && _param.srcW < 4 * _param.dstW) - _blocks = BlockCountMax(A); - float scale = (float)_param.srcW / _param.dstW; - _ax.Resize(AlignHi(_param.dstW, A) * _param.channels * 2, false, _param.align); - uint8_t * alphas = _ax.data; - if (_blocks) - { - _ixg.Resize(_blocks); - int block = 0; - _ixg[0].src = 0; - _ixg[0].dst = 0; - for (int dstIndex = 0; dstIndex < (int)_param.dstW; ++dstIndex) - { - float alpha = (float)((dstIndex + 0.5)*scale - 0.5); - int srcIndex = (int)::floor(alpha); - alpha -= srcIndex; - - if (srcIndex < 0) - { - srcIndex = 0; - alpha = 0; - } - - if (srcIndex > (int)_param.srcW - 2) - { - srcIndex = (int)_param.srcW - 2; - alpha = 1; - } - - int dst = 2 * dstIndex - _ixg[block].dst; - int src = srcIndex - _ixg[block].src; - if (src >= A - 1 || dst >= A) - { - block++; - _ixg[block].src = Simd::Min(srcIndex, int(_param.srcW - A)); - _ixg[block].dst = 2 * dstIndex; - dst = 0; - src = srcIndex - _ixg[block].src; - } - _ixg[block].shuffle[dst] = src; - _ixg[block].shuffle[dst + 1] = src + 1; - - alphas[1] = (uint8_t)(alpha * Base::FRACTION_RANGE + 0.5); - alphas[0] = (uint8_t)(Base::FRACTION_RANGE - alphas[1]); - alphas += 2; - } - _blocks = block + 1; - } - else - { - _ix.Resize(AlignHi(_param.dstW, _param.align/4), true, _param.align); - for (size_t i = 0; i < _param.dstW; ++i) - { - float alpha = (float)((i + 0.5)*scale - 0.5); - ptrdiff_t index = (ptrdiff_t)::floor(alpha); - alpha -= index; - - if (index < 0) - { - index = 0; - alpha = 0; - } - - if (index >(ptrdiff_t)_param.srcW - 2) - { - index = _param.srcW - 2; - alpha = 1; - } - - _ix[i] = (int)index; - alphas[1] = (uint8_t)(alpha * Base::FRACTION_RANGE + 0.5); - alphas[0] = (uint8_t)(Base::FRACTION_RANGE - alphas[1]); - for (size_t channel = 1; channel < _param.channels; channel++) - ((uint16_t*)alphas)[channel] = *(uint16_t*)alphas; - alphas += 2 * _param.channels; - } - } - size_t size = AlignHi(_param.dstW, _param.align)*_param.channels * 2; - _bx[0].Resize(size, false, _param.align); - _bx[1].Resize(size, false, _param.align); - } - - template void ResizerByteBilinearInterpolateX(const __m256i * alpha, __m256i * buffer); - - template <> SIMD_INLINE void ResizerByteBilinearInterpolateX<1>(const __m256i * alpha, __m256i * buffer) - { -#ifdef SIMD_MADDUBS_ERROR - __m256i _buffer = _mm256_or_si256(K_ZERO, _mm256_load_si256(buffer)); -#else - __m256i _buffer = _mm256_load_si256(buffer); -#endif - _mm256_store_si256(buffer, _mm256_maddubs_epi16(_buffer, _mm256_load_si256(alpha))); - } - - const __m256i K8_SHUFFLE_X2 = SIMD_MM256_SETR_EPI8(0x0, 0x2, 0x1, 0x3, 0x4, 0x6, 0x5, 0x7, 0x8, 0xA, 0x9, 0xB, 0xC, 0xE, 0xD, 0xF, - 0x0, 0x2, 0x1, 0x3, 0x4, 0x6, 0x5, 0x7, 0x8, 0xA, 0x9, 0xB, 0xC, 0xE, 0xD, 0xF); - - SIMD_INLINE void ResizerByteBilinearInterpolateX2(const __m256i * alpha, __m256i * buffer) - { - __m256i src = _mm256_shuffle_epi8(_mm256_load_si256(buffer), K8_SHUFFLE_X2); - _mm256_store_si256(buffer, _mm256_maddubs_epi16(src, _mm256_load_si256(alpha))); - } - - template <> SIMD_INLINE void ResizerByteBilinearInterpolateX<2>(const __m256i * alpha, __m256i * buffer) - { - 
ResizerByteBilinearInterpolateX2(alpha + 0, buffer + 0); - ResizerByteBilinearInterpolateX2(alpha + 1, buffer + 1); - } - - const __m256i K8_SHUFFLE_X3_00 = SIMD_MM256_SETR_EPI8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - 0xE, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); - const __m256i K8_SHUFFLE_X3_01 = SIMD_MM256_SETR_EPI8(0x0, 0x3, 0x1, 0x4, 0x2, 0x5, 0x6, 0x9, 0x7, 0xA, 0x8, 0xB, 0xC, 0xF, 0xD, -1, - -1, 0x1, 0x2, 0x5, 0x3, 0x6, 0x4, 0x7, 0x8, 0xB, 0x9, 0xC, 0xA, 0xD, 0xE, -1); - const __m256i K8_SHUFFLE_X3_02 = SIMD_MM256_SETR_EPI8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x0, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x1); - - const __m256i K8_SHUFFLE_X3_10 = SIMD_MM256_SETR_EPI8(0xF, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); - const __m256i K8_SHUFFLE_X3_11 = SIMD_MM256_SETR_EPI8(-1, 0x2, 0x0, 0x3, 0x4, 0x7, 0x5, 0x8, 0x6, 0x9, 0xA, 0xD, 0xB, 0xE, 0xC, 0xF, - 0x0, 0x3, 0x1, 0x4, 0x2, 0x5, 0x6, 0x9, 0x7, 0xA, 0x8, 0xB, 0xC, 0xF, 0xD, -1); - const __m256i K8_SHUFFLE_X3_12 = SIMD_MM256_SETR_EPI8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x0); - - const __m256i K8_SHUFFLE_X3_20 = SIMD_MM256_SETR_EPI8(0xE, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - 0xF, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); - const __m256i K8_SHUFFLE_X3_21 = SIMD_MM256_SETR_EPI8(-1, 0x1, 0x2, 0x5, 0x3, 0x6, 0x4, 0x7, 0x8, 0xB, 0x9, 0xC, 0xA, 0xD, 0xE, -1, - -1, 0x2, 0x0, 0x3, 0x4, 0x7, 0x5, 0x8, 0x6, 0x9, 0xA, 0xD, 0xB, 0xE, 0xC, 0xF); - const __m256i K8_SHUFFLE_X3_22 = SIMD_MM256_SETR_EPI8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); - - template <> SIMD_INLINE void ResizerByteBilinearInterpolateX<3>(const __m256i * alpha, __m256i * buffer) - { - __m256i src[3], shuffled; - src[0] = _mm256_load_si256(buffer + 0); - src[1] = _mm256_load_si256(buffer + 1); - src[2] = _mm256_load_si256(buffer + 2); - - shuffled = _mm256_shuffle_epi8(_mm256_permute2x128_si256(src[0], src[0], 0x21), K8_SHUFFLE_X3_00); - shuffled = _mm256_or_si256(shuffled, _mm256_shuffle_epi8(src[0], K8_SHUFFLE_X3_01)); - shuffled = _mm256_or_si256(shuffled, _mm256_shuffle_epi8(_mm256_permute2x128_si256(src[0], src[1], 0x21), K8_SHUFFLE_X3_02)); - _mm256_store_si256(buffer + 0, _mm256_maddubs_epi16(shuffled, _mm256_load_si256(alpha + 0))); - - shuffled = _mm256_shuffle_epi8(_mm256_permute2x128_si256(src[0], src[1], 0x21), K8_SHUFFLE_X3_10); - shuffled = _mm256_or_si256(shuffled, _mm256_shuffle_epi8(src[1], K8_SHUFFLE_X3_11)); - shuffled = _mm256_or_si256(shuffled, _mm256_shuffle_epi8(_mm256_permute2x128_si256(src[1], src[2], 0x21), K8_SHUFFLE_X3_12)); - _mm256_store_si256(buffer + 1, _mm256_maddubs_epi16(shuffled, _mm256_load_si256(alpha + 1))); - - shuffled = _mm256_shuffle_epi8(_mm256_permute2x128_si256(src[1], src[2], 0x21), K8_SHUFFLE_X3_20); - shuffled = _mm256_or_si256(shuffled, _mm256_shuffle_epi8(src[2], K8_SHUFFLE_X3_21)); - shuffled = _mm256_or_si256(shuffled, _mm256_shuffle_epi8(_mm256_permute2x128_si256(src[2], src[2], 0x21), K8_SHUFFLE_X3_22)); - _mm256_store_si256(buffer + 2, _mm256_maddubs_epi16(shuffled, _mm256_load_si256(alpha + 2))); - } - - const __m256i K8_SHUFFLE_X4 = SIMD_MM256_SETR_EPI8(0x0, 0x4, 0x1, 0x5, 0x2, 0x6, 0x3, 0x7, 0x8, 0xC, 0x9, 0xD, 0xA, 0xE, 0xB, 0xF, - 
0x0, 0x4, 0x1, 0x5, 0x2, 0x6, 0x3, 0x7, 0x8, 0xC, 0x9, 0xD, 0xA, 0xE, 0xB, 0xF); - - SIMD_INLINE void ResizerByteBilinearInterpolateX4(const __m256i * alpha, __m256i * buffer) - { - __m256i src = _mm256_shuffle_epi8(_mm256_load_si256(buffer), K8_SHUFFLE_X4); - _mm256_store_si256(buffer, _mm256_maddubs_epi16(src, _mm256_load_si256(alpha))); - } - - template <> SIMD_INLINE void ResizerByteBilinearInterpolateX<4>(const __m256i * alpha, __m256i * buffer) - { - ResizerByteBilinearInterpolateX4(alpha + 0, buffer + 0); - ResizerByteBilinearInterpolateX4(alpha + 1, buffer + 1); - ResizerByteBilinearInterpolateX4(alpha + 2, buffer + 2); - ResizerByteBilinearInterpolateX4(alpha + 3, buffer + 3); - } - - const __m256i K16_FRACTION_ROUND_TERM = SIMD_MM256_SET1_EPI16(Base::BILINEAR_ROUND_TERM); - - template SIMD_INLINE __m256i ResizerByteBilinearInterpolateY(const __m256i * pbx0, const __m256i * pbx1, __m256i alpha[2]) - { - __m256i sum = _mm256_add_epi16(_mm256_mullo_epi16(Load(pbx0), alpha[0]), _mm256_mullo_epi16(Load(pbx1), alpha[1])); - return _mm256_srli_epi16(_mm256_add_epi16(sum, K16_FRACTION_ROUND_TERM), Base::BILINEAR_SHIFT); - } - - template SIMD_INLINE void ResizerByteBilinearInterpolateY(const uint8_t * bx0, const uint8_t * bx1, __m256i alpha[2], uint8_t * dst) - { - __m256i lo = ResizerByteBilinearInterpolateY((__m256i*)bx0 + 0, (__m256i*)bx1 + 0, alpha); - __m256i hi = ResizerByteBilinearInterpolateY((__m256i*)bx0 + 1, (__m256i*)bx1 + 1, alpha); - Store((__m256i*)dst, PackI16ToU8(lo, hi)); - } - - template void ResizerByteBilinear::Run(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride) - { - struct One { uint8_t val[N * 1]; }; - struct Two { uint8_t val[N * 2]; }; - - size_t size = 2 * _param.dstW*N; - size_t aligned = AlignHi(size, DA) - DA; - const size_t step = A * N; - size_t dstW = _param.dstW; - ptrdiff_t previous = -2; - __m256i a[2]; - uint8_t * bx[2] = { _bx[0].data, _bx[1].data }; - const uint8_t * ax = _ax.data; - const int32_t * ix = _ix.data; - - for (size_t yDst = 0; yDst < _param.dstH; yDst++, dst += dstStride) - { - a[0] = _mm256_set1_epi16(int16_t(Base::FRACTION_RANGE - _ay[yDst])); - a[1] = _mm256_set1_epi16(int16_t(_ay[yDst])); - - ptrdiff_t sy = _iy[yDst]; - int k = 0; - - if (sy == previous) - k = 2; - else if (sy == previous + 1) - { - Swap(bx[0], bx[1]); - k = 1; - } - - previous = sy; - - for (; k < 2; k++) - { - Two * pb = (Two *)bx[k]; - const One * psrc = (const One *)(src + (sy + k)*srcStride); - for (size_t x = 0; x < dstW; x++) - pb[x] = *(Two *)(psrc + ix[x]); - - uint8_t * pbx = bx[k]; - for (size_t i = 0; i < size; i += step) - ResizerByteBilinearInterpolateX((__m256i*)(ax + i), (__m256i*)(pbx + i)); - } - - for (size_t ib = 0, id = 0; ib < aligned; ib += DA, id += A) - ResizerByteBilinearInterpolateY(bx[0] + ib, bx[1] + ib, a, dst + id); - size_t i = size - DA; - ResizerByteBilinearInterpolateY(bx[0] + i, bx[1] + i, a, dst + i / 2); - } - } - - void ResizerByteBilinear::RunG(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride) - { - size_t bufW = AlignHi(_param.dstW, A) * 2; - size_t size = 2 * _param.dstW; - size_t aligned = AlignHi(size, DA) - DA; - size_t blocks = _blocks; - ptrdiff_t previous = -2; - __m256i a[2]; - uint8_t * bx[2] = { _bx[0].data, _bx[1].data }; - const uint8_t * ax = _ax.data; - const Idx * ixg = _ixg.data; - - for (size_t yDst = 0; yDst < _param.dstH; yDst++, dst += dstStride) - { - a[0] = _mm256_set1_epi16(int16_t(Base::FRACTION_RANGE - _ay[yDst])); - a[1] = 
_mm256_set1_epi16(int16_t(_ay[yDst])); - - ptrdiff_t sy = _iy[yDst]; - int k = 0; - - if (sy == previous) - k = 2; - else if (sy == previous + 1) - { - Swap(bx[0], bx[1]); - k = 1; - } - - previous = sy; - - for (; k < 2; k++) - { - const uint8_t * psrc = src + (sy + k)*srcStride; - uint8_t * pdst = bx[k]; - for (size_t i = 0; i < blocks; ++i) - ResizerByteBilinearLoadGrayInterpolated(psrc, ixg[i], ax, pdst); - } - - for (size_t ib = 0, id = 0; ib < aligned; ib += DA, id += A) - ResizerByteBilinearInterpolateY(bx[0] + ib, bx[1] + ib, a, dst + id); - size_t i = size - DA; - ResizerByteBilinearInterpolateY(bx[0] + i, bx[1] + i, a, dst + i / 2); - } - } - - void ResizerByteBilinear::Run(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride) - { - assert(_param.dstW >= A); - - EstimateParams(); - switch (_param.channels) - { - case 1: - if (_blocks) - RunG(src, srcStride, dst, dstStride); - else - Run<1>(src, srcStride, dst, dstStride); - break; - case 2: Run<2>(src, srcStride, dst, dstStride); break; - case 3: Run<3>(src, srcStride, dst, dstStride); break; - case 4: Run<4>(src, srcStride, dst, dstStride); break; - default: - assert(0); - } - } - - //--------------------------------------------------------------------- - - ResizerByteArea::ResizerByteArea(const ResParam & param) - : Sse41::ResizerByteArea(param) - { - } - - SIMD_INLINE __m256i SaveLoadTail(const uint8_t * ptr, size_t tail) - { - uint8_t buffer[DA]; - _mm256_storeu_si256((__m256i*)(buffer), _mm256_loadu_si256((__m256i*)(ptr + tail - A))); - return _mm256_loadu_si256((__m256i*)(buffer + A - tail)); - } - - template SIMD_INLINE void ResizerByteAreaRowUpdate(const uint8_t * src0, size_t size, int32_t a, int32_t * dst) - { - __m256i alpha = SetInt16(a, a); - size_t sizeA = AlignLo(size, A); - size_t i = 0; - for (; i < sizeA; i += A, dst += A) - { - __m256i s0 = _mm256_permutevar8x32_epi32(_mm256_loadu_si256((__m256i*)(src0 + i)), K32_TWO_UNPACK_PERMUTE); - __m256i i0 = UnpackU8<0>(s0); - __m256i i1 = UnpackU8<1>(s0); - Update(dst + 0 * F, _mm256_madd_epi16(alpha, UnpackU8<0>(i0))); - Update(dst + 1 * F, _mm256_madd_epi16(alpha, UnpackU8<1>(i0))); - Update(dst + 2 * F, _mm256_madd_epi16(alpha, UnpackU8<0>(i1))); - Update(dst + 3 * F, _mm256_madd_epi16(alpha, UnpackU8<1>(i1))); - } - if (i < size) - { - __m256i s0 = _mm256_permutevar8x32_epi32(SaveLoadTail(src0 + i, size - i), K32_TWO_UNPACK_PERMUTE); - __m256i i0 = UnpackU8<0>(s0); - __m256i i1 = UnpackU8<1>(s0); - Update(dst + 0 * F, _mm256_madd_epi16(alpha, UnpackU8<0>(i0))); - Update(dst + 1 * F, _mm256_madd_epi16(alpha, UnpackU8<1>(i0))); - Update(dst + 2 * F, _mm256_madd_epi16(alpha, UnpackU8<0>(i1))); - Update(dst + 3 * F, _mm256_madd_epi16(alpha, UnpackU8<1>(i1))); - } - } - - template SIMD_INLINE void ResizerByteAreaRowUpdate(const uint8_t * src0, size_t stride, size_t size, int32_t a0, int32_t a1, int32_t * dst) - { - __m256i alpha = SetInt16(a0, a1); - const uint8_t * src1 = src0 + stride; - size_t sizeA = AlignLo(size, A); - size_t i = 0; - for (; i < sizeA; i += A, dst += A) - { - __m256i s0 = _mm256_permutevar8x32_epi32(_mm256_loadu_si256((__m256i*)(src0 + i)), K32_TWO_UNPACK_PERMUTE); - __m256i s1 = _mm256_permutevar8x32_epi32(_mm256_loadu_si256((__m256i*)(src1 + i)), K32_TWO_UNPACK_PERMUTE); - __m256i i0 = UnpackU8<0>(s0, s1); - __m256i i1 = UnpackU8<1>(s0, s1); - Update(dst + 0 * F, _mm256_madd_epi16(alpha, UnpackU8<0>(i0))); - Update(dst + 1 * F, _mm256_madd_epi16(alpha, UnpackU8<1>(i0))); - Update(dst + 2 * F, _mm256_madd_epi16(alpha, 
UnpackU8<0>(i1))); - Update(dst + 3 * F, _mm256_madd_epi16(alpha, UnpackU8<1>(i1))); - } - if (i < size) - { - __m256i s0 = _mm256_permutevar8x32_epi32(_mm256_loadu_si256((__m256i*)(src0 + i)), K32_TWO_UNPACK_PERMUTE); - __m256i s1 = _mm256_permutevar8x32_epi32(SaveLoadTail(src1 + i, size - i), K32_TWO_UNPACK_PERMUTE); - __m256i i0 = UnpackU8<0>(s0, s1); - __m256i i1 = UnpackU8<1>(s0, s1); - Update(dst + 0 * F, _mm256_madd_epi16(alpha, UnpackU8<0>(i0))); - Update(dst + 1 * F, _mm256_madd_epi16(alpha, UnpackU8<1>(i0))); - Update(dst + 2 * F, _mm256_madd_epi16(alpha, UnpackU8<0>(i1))); - Update(dst + 3 * F, _mm256_madd_epi16(alpha, UnpackU8<1>(i1))); - } - } - - SIMD_INLINE void ResizerByteAreaRowSum(const uint8_t * src, size_t stride, size_t count, size_t size, int32_t curr, int32_t zero, int32_t next, int32_t * dst) - { - if (count) - { - size_t i = 0; - ResizerByteAreaRowUpdate(src, stride, size, curr, count == 1 ? zero - next : zero, dst), src += 2 * stride, i += 2; - for (; i < count; i += 2, src += 2 * stride) - ResizerByteAreaRowUpdate(src, stride, size, zero, i == count - 1 ? zero - next : zero, dst); - if (i == count) - ResizerByteAreaRowUpdate(src, size, zero - next, dst); - } - else - ResizerByteAreaRowUpdate(src, size, curr - next, dst); - } - - template SIMD_INLINE void ResizerByteAreaSet(const int32_t * src, int32_t value, int32_t * dst) - { - for (size_t c = 0; c < N; ++c) - dst[c] = src[c] * value; - } - - template SIMD_INLINE void ResizerByteAreaAdd(const int32_t * src, int32_t value, int32_t * dst) - { - for (size_t c = 0; c < N; ++c) - dst[c] += src[c] * value; - } - - template SIMD_INLINE void ResizerByteAreaRes(const int32_t * src, uint8_t * dst) - { - for (size_t c = 0; c < N; ++c) - dst[c] = uint8_t((src[c] + Base::AREA_ROUND) >> Base::AREA_SHIFT); - } - - template SIMD_INLINE void ResizerByteAreaResult(const int32_t * src, size_t count, int32_t curr, int32_t zero, int32_t next, uint8_t * dst) - { - int32_t sum[N]; - ResizerByteAreaSet(src, curr, sum); - for (size_t i = 0; i < count; ++i) - src += N, ResizerByteAreaAdd(src, zero, sum); - ResizerByteAreaAdd(src, -next, sum); - ResizerByteAreaRes(sum, dst); - } - - template SIMD_INLINE void ResizerByteAreaResult34(const int32_t * src, size_t count, int32_t curr, int32_t zero, int32_t next, uint8_t * dst) - { - __m128i sum = _mm_mullo_epi32(_mm_loadu_si128((__m128i*)src), _mm_set1_epi32(curr)); - for (size_t i = 0; i < count; ++i) - src += N, sum = _mm_add_epi32(sum, _mm_mullo_epi32(_mm_loadu_si128((__m128i*)src), _mm_set1_epi32(zero))); - sum = _mm_add_epi32(sum, _mm_mullo_epi32(_mm_loadu_si128((__m128i*)src), _mm_set1_epi32(-next))); - __m128i res = _mm_srai_epi32(_mm_add_epi32(sum, _mm_set1_epi32(Base::AREA_ROUND)), Base::AREA_SHIFT); - *(int32_t*)dst = _mm_cvtsi128_si32(_mm_packus_epi16(_mm_packus_epi32(res, Sse2::K_ZERO), Sse2::K_ZERO)); - } - - template<> SIMD_INLINE void ResizerByteAreaResult<4>(const int32_t * src, size_t count, int32_t curr, int32_t zero, int32_t next, uint8_t * dst) - { - ResizerByteAreaResult34<4>(src, count, curr, zero, next, dst); - } - - template<> SIMD_INLINE void ResizerByteAreaResult<3>(const int32_t * src, size_t count, int32_t curr, int32_t zero, int32_t next, uint8_t * dst) - { - ResizerByteAreaResult34<3>(src, count, curr, zero, next, dst); - } - - template void ResizerByteArea::Run(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride) - { - size_t dstW = _param.dstW, rowSize = _param.srcW*N, rowRest = dstStride - dstW * N; - const int32_t * iy = _iy.data, *ix = 
_ix.data, *ay = _ay.data, *ax = _ax.data; - int32_t ay0 = ay[0], ax0 = ax[0]; - for (size_t dy = 0; dy < _param.dstH; dy++, dst += rowRest) - { - int32_t * buf = _by.data; - size_t yn = iy[dy + 1] - iy[dy]; - ResizerByteAreaRowSum(src, srcStride, yn, rowSize, ay[dy], ay0, ay[dy + 1], buf), src += yn * srcStride; - for (size_t dx = 0; dx < dstW; dx++, dst += N) - { - size_t xn = ix[dx + 1] - ix[dx]; - ResizerByteAreaResult(buf, xn, ax[dx], ax0, ax[dx + 1], dst), buf += xn * N; - } - } - } - - void ResizerByteArea::Run(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride) - { - switch (_param.channels) - { - case 1: Run<1>(src, srcStride, dst, dstStride); return; - case 2: Run<2>(src, srcStride, dst, dstStride); return; - case 3: Run<3>(src, srcStride, dst, dstStride); return; - case 4: Run<4>(src, srcStride, dst, dstStride); return; - default: - assert(0); - } - } - - //--------------------------------------------------------------------- - - ResizerFloatBilinear::ResizerFloatBilinear(const ResParam & param) - : Base::ResizerFloatBilinear(param) - { - } - - void ResizerFloatBilinear::Run(const float * src, size_t srcStride, float * dst, size_t dstStride) - { - size_t cn = _param.channels; - size_t rs = _param.dstW * cn; - float * pbx[2] = { _bx[0].data, _bx[1].data }; - int32_t prev = -2; - size_t rsa = AlignLo(rs, Avx::F); - size_t rsh = AlignLo(rs, Sse::F); - for (size_t dy = 0; dy < _param.dstH; dy++, dst += dstStride) - { - float fy1 = _ay[dy]; - float fy0 = 1.0f - fy1; - int32_t sy = _iy[dy]; - int32_t k = 0; - - if (sy == prev) - k = 2; - else if (sy == prev + 1) - { - Swap(pbx[0], pbx[1]); - k = 1; - } - - prev = sy; - - for (; k < 2; k++) - { - float * pb = pbx[k]; - const float * ps = src + (sy + k)*srcStride; - size_t dx = 0; - if (cn == 1) - { - __m256 _1 = _mm256_set1_ps(1.0f); - for (; dx < rsa; dx += Avx::F) - { - __m256i idx = Avx2::LoadPermuted((__m256i*)(_ix.data + dx)); - __m256 s0145 = _mm256_castpd_ps(_mm256_i32gather_pd((double*)ps, _mm256_extracti128_si256(idx, 0), 4)); - __m256 s2367 = _mm256_castpd_ps(_mm256_i32gather_pd((double*)ps, _mm256_extracti128_si256(idx, 1), 4)); - __m256 fx1 = _mm256_load_ps(_ax.data + dx); - __m256 fx0 = _mm256_sub_ps(_1, fx1); - __m256 s0 = _mm256_shuffle_ps(s0145, s2367, 0x88); - __m256 s1 = _mm256_shuffle_ps(s0145, s2367, 0xDD); - _mm256_store_ps(pb + dx, _mm256_fmadd_ps(s0, fx0, _mm256_mul_ps(s1, fx1))); - } - for (; dx < rsh; dx += Sse::F) - { - __m128 s01 = Sse::Load(ps + _ix[dx + 0], ps + _ix[dx + 1]); - __m128 s23 = Sse::Load(ps + _ix[dx + 2], ps + _ix[dx + 3]); - __m128 fx1 = _mm_load_ps(_ax.data + dx); - __m128 fx0 = _mm_sub_ps(_mm256_castps256_ps128(_1), fx1); - __m128 m0 = _mm_mul_ps(fx0, _mm_shuffle_ps(s01, s23, 0x88)); - __m128 m1 = _mm_mul_ps(fx1, _mm_shuffle_ps(s01, s23, 0xDD)); - _mm_store_ps(pb + dx, _mm_add_ps(m0, m1)); - } - } - if (cn == 3 && rs > 3) - { - __m256 _1 = _mm256_set1_ps(1.0f); - size_t rs3 = rs - 3; - size_t rs6 = AlignLoAny(rs3, 6); - for (; dx < rs6; dx += 6) - { - __m256 s0 = Avx::Load(ps + _ix[dx + 0] + 0, ps + _ix[dx + 3] + 0); - __m256 s1 = Avx::Load(ps + _ix[dx + 0] + 3, ps + _ix[dx + 3] + 3); - __m256 fx1 = Avx::Load(_ax.data + dx + 0, _ax.data + dx + 3); - __m256 fx0 = _mm256_sub_ps(_1, fx1); - Avx::Store(pb + dx + 0, pb + dx + 3, _mm256_fmadd_ps(fx0, s0, _mm256_mul_ps(fx1, s1))); - } - for (; dx < rs3; dx += 3) - { - __m128 s0 = _mm_loadu_ps(ps + _ix[dx] + 0); - __m128 s1 = _mm_loadu_ps(ps + _ix[dx] + 3); - __m128 fx1 = _mm_set1_ps(_ax.data[dx]); - __m128 fx0 = 
_mm_sub_ps(_mm256_castps256_ps128(_1), fx1); - _mm_storeu_ps(pb + dx, _mm_add_ps(_mm_mul_ps(fx0, s0), _mm_mul_ps(fx1, s1))); - } - } - else - { - __m256 _1 = _mm256_set1_ps(1.0f); - __m256i _cn = _mm256_set1_epi32((int)cn); - for (; dx < rsa; dx += Avx::F) - { - __m256i i0 = _mm256_load_si256((__m256i*)(_ix.data + dx)); - __m256i i1 = _mm256_add_epi32(i0, _cn); - __m256 s0 = _mm256_i32gather_ps(ps, i0, 4); - __m256 s1 = _mm256_i32gather_ps(ps, i1, 4); - __m256 fx1 = _mm256_load_ps(_ax.data + dx); - __m256 fx0 = _mm256_sub_ps(_1, fx1); - _mm256_store_ps(pb + dx, _mm256_fmadd_ps(s0, fx0, _mm256_mul_ps(s1, fx1))); - } - } - for (; dx < rs; dx++) - { - int32_t sx = _ix[dx]; - float fx = _ax[dx]; - pb[dx] = ps[sx] * (1.0f - fx) + ps[sx + cn] * fx; - } - } - - size_t dx = 0; - __m256 _fy0 = _mm256_set1_ps(fy0); - __m256 _fy1 = _mm256_set1_ps(fy1); - for (; dx < rsa; dx += Avx::F) - { - __m256 b0 = _mm256_load_ps(pbx[0] + dx); - __m256 b1 = _mm256_load_ps(pbx[1] + dx); - _mm256_storeu_ps(dst + dx, _mm256_fmadd_ps(b0, _fy0, _mm256_mul_ps(b1, _fy1))); - } - for (; dx < rsh; dx += Sse::F) - { - __m128 m0 = _mm_mul_ps(_mm_load_ps(pbx[0] + dx), _mm256_castps256_ps128(_fy0)); - __m128 m1 = _mm_mul_ps(_mm_load_ps(pbx[1] + dx), _mm256_castps256_ps128(_fy1)); - _mm_storeu_ps(dst + dx, _mm_add_ps(m0, m1)); - } - for (; dx < rs; dx++) - dst[dx] = pbx[0][dx] * fy0 + pbx[1][dx] * fy1; - } - } - - //--------------------------------------------------------------------- - - void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method) - { - ResParam param(srcX, srcY, dstX, dstY, channels, type, method, sizeof(__m256i)); - if (param.IsByteBilinear() && dstX >= A) - return new ResizerByteBilinear(param); - else if (param.IsByteArea()) - return new ResizerByteArea(param); - else if (param.IsFloatBilinear()) - return new ResizerFloatBilinear(param); - else - return Avx::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method); - } - } -#endif //SIMD_AVX2_ENABLE -} - diff --git a/src/3rd/Simd/Simd/SimdAvx2Segmentation.cpp b/src/3rd/Simd/Simd/SimdAvx2Segmentation.cpp deleted file mode 100644 index 0c437327..00000000 --- a/src/3rd/Simd/Simd/SimdAvx2Segmentation.cpp +++ /dev/null @@ -1,282 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdCompare.h" - -namespace Simd -{ -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - template SIMD_INLINE void FillSingleHoles(uint8_t * mask, ptrdiff_t stride, __m256i index) - { - const __m256i up = _mm256_cmpeq_epi8(Load((__m256i*)(mask - stride)), index); - const __m256i left = _mm256_cmpeq_epi8(Load((__m256i*)(mask - 1)), index); - const __m256i right = _mm256_cmpeq_epi8(Load((__m256i*)(mask + 1)), index); - const __m256i down = _mm256_cmpeq_epi8(Load((__m256i*)(mask + stride)), index); - StoreMasked((__m256i*)mask, index, _mm256_and_si256(_mm256_and_si256(up, left), _mm256_and_si256(right, down))); - } - - template void SegmentationFillSingleHoles(uint8_t * mask, size_t stride, size_t width, size_t height, uint8_t index) - { - assert(width > A + 2 && height > 2); - - height -= 1; - width -= 1; - __m256i _index = _mm256_set1_epi8((char)index); - size_t alignedWidth = Simd::AlignLo(width, A); - for (size_t row = 1; row < height; ++row) - { - mask += stride; - - FillSingleHoles(mask + 1, stride, _index); - - for (size_t col = A; col < alignedWidth; col += A) - FillSingleHoles(mask + col, stride, _index); - - if (alignedWidth != width) - FillSingleHoles(mask + width - A, stride, _index); - } - } - - void SegmentationFillSingleHoles(uint8_t * mask, size_t stride, size_t width, size_t height, uint8_t index) - { - if (Aligned(mask) && Aligned(stride)) - SegmentationFillSingleHoles(mask, stride, width, height, index); - else - SegmentationFillSingleHoles(mask, stride, width, height, index); - } - - template SIMD_INLINE void ChangeIndex(uint8_t * mask, __m256i oldIndex, __m256i newIndex) - { - __m256i _mask = Load((__m256i*)mask); - Store((__m256i*)mask, _mm256_blendv_epi8(_mask, newIndex, _mm256_cmpeq_epi8(_mask, oldIndex))); - } - - template void SegmentationChangeIndex(uint8_t * mask, size_t stride, size_t width, size_t height, uint8_t oldIndex, uint8_t newIndex) - { - __m256i _oldIndex = _mm256_set1_epi8((char)oldIndex); - __m256i _newIndex = _mm256_set1_epi8((char)newIndex); - size_t alignedWidth = Simd::AlignLo(width, A); - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - ChangeIndex(mask + col, _oldIndex, _newIndex); - if (alignedWidth != width) - ChangeIndex(mask + width - A, _oldIndex, _newIndex); - mask += stride; - } - } - - void SegmentationChangeIndex(uint8_t * mask, size_t stride, size_t width, size_t height, uint8_t oldIndex, uint8_t newIndex) - { - if (Aligned(mask) && Aligned(stride)) - SegmentationChangeIndex(mask, stride, width, height, oldIndex, newIndex); - else - SegmentationChangeIndex(mask, stride, width, height, oldIndex, newIndex); - } - - SIMD_INLINE void SegmentationPropagate2x2(const __m256i & parentOne, const __m256i & parentAll, - const uint8_t * difference0, const uint8_t * difference1, uint8_t * child0, uint8_t * child1, size_t childCol, - const __m256i & index, const __m256i & invalid, const __m256i & empty, const __m256i & threshold) - { - const __m256i _difference0 = Load((__m256i*)(difference0 + childCol)); - const __m256i _difference1 = Load((__m256i*)(difference1 + childCol)); - const __m256i _child0 = Load((__m256i*)(child0 + childCol)); - const __m256i _child1 = Load((__m256i*)(child1 + childCol)); - const __m256i condition0 = _mm256_or_si256(parentAll, _mm256_and_si256(parentOne, Greater8u(_difference0, threshold))); - const __m256i condition1 = _mm256_or_si256(parentAll, _mm256_and_si256(parentOne, Greater8u(_difference1, 
threshold))); - Store((__m256i*)(child0 + childCol), _mm256_blendv_epi8(_child0, _mm256_blendv_epi8(empty, index, condition0), Lesser8u(_child0, invalid))); - Store((__m256i*)(child1 + childCol), _mm256_blendv_epi8(_child1, _mm256_blendv_epi8(empty, index, condition1), Lesser8u(_child1, invalid))); - } - - template SIMD_INLINE void SegmentationPropagate2x2(const uint8_t * parent0, const uint8_t * parent1, size_t parentCol, - const uint8_t * difference0, const uint8_t * difference1, uint8_t * child0, uint8_t * child1, size_t childCol, - const __m256i & index, const __m256i & invalid, const __m256i & empty, const __m256i & threshold) - { - const __m256i parent00 = _mm256_cmpeq_epi8(Load((__m256i*)(parent0 + parentCol)), index); - const __m256i parent01 = _mm256_cmpeq_epi8(Load((__m256i*)(parent0 + parentCol + 1)), index); - const __m256i parent10 = _mm256_cmpeq_epi8(Load((__m256i*)(parent1 + parentCol)), index); - const __m256i parent11 = _mm256_cmpeq_epi8(Load((__m256i*)(parent1 + parentCol + 1)), index); - const __m256i parentOne = _mm256_permute4x64_epi64(_mm256_or_si256(_mm256_or_si256(parent00, parent01), _mm256_or_si256(parent10, parent11)), 0xD8); - const __m256i parentAll = _mm256_permute4x64_epi64(_mm256_and_si256(_mm256_and_si256(parent00, parent01), _mm256_and_si256(parent10, parent11)), 0xD8); - - SegmentationPropagate2x2(_mm256_unpacklo_epi8(parentOne, parentOne), _mm256_unpacklo_epi8(parentAll, parentAll), - difference0, difference1, child0, child1, childCol, index, invalid, empty, threshold); - - SegmentationPropagate2x2(_mm256_unpackhi_epi8(parentOne, parentOne), _mm256_unpackhi_epi8(parentAll, parentAll), - difference0, difference1, child0, child1, childCol + A, index, invalid, empty, threshold); - } - - template void SegmentationPropagate2x2(const uint8_t * parent, size_t parentStride, size_t width, size_t height, - uint8_t * child, size_t childStride, const uint8_t * difference, size_t differenceStride, - uint8_t currentIndex, uint8_t invalidIndex, uint8_t emptyIndex, uint8_t differenceThreshold) - { - assert(width >= A + 1 && height >= 2); - height--; - width--; - - size_t alignedWidth = Simd::AlignLo(width, A); - __m256i index = _mm256_set1_epi8((char)currentIndex); - __m256i invalid = _mm256_set1_epi8((char)invalidIndex); - __m256i empty = _mm256_set1_epi8((char)emptyIndex); - __m256i threshold = _mm256_set1_epi8((char)differenceThreshold); - - for (size_t parentRow = 0, childRow = 1; parentRow < height; ++parentRow, childRow += 2) - { - const uint8_t * parent0 = parent + parentRow*parentStride; - const uint8_t * parent1 = parent0 + parentStride; - const uint8_t * difference0 = difference + childRow*differenceStride; - const uint8_t * difference1 = difference0 + differenceStride; - uint8_t * child0 = child + childRow*childStride; - uint8_t * child1 = child0 + childStride; - - for (size_t parentCol = 0, childCol = 1; parentCol < alignedWidth; parentCol += A, childCol += DA) - SegmentationPropagate2x2(parent0, parent1, parentCol, difference0, difference1, - child0, child1, childCol, index, invalid, empty, threshold); - if (alignedWidth != width) - SegmentationPropagate2x2(parent0, parent1, width - A, difference0, difference1, - child0, child1, (width - A) * 2 + 1, index, invalid, empty, threshold); - } - } - - void SegmentationPropagate2x2(const uint8_t * parent, size_t parentStride, size_t width, size_t height, - uint8_t * child, size_t childStride, const uint8_t * difference, size_t differenceStride, - uint8_t currentIndex, uint8_t invalidIndex, uint8_t emptyIndex, 
uint8_t differenceThreshold) - { - if (Aligned(parent) && Aligned(parentStride)) - SegmentationPropagate2x2(parent, parentStride, width, height, child, childStride, - difference, differenceStride, currentIndex, invalidIndex, emptyIndex, differenceThreshold); - else - SegmentationPropagate2x2(parent, parentStride, width, height, child, childStride, - difference, differenceStride, currentIndex, invalidIndex, emptyIndex, differenceThreshold); - } - - SIMD_INLINE bool RowHasIndex(const uint8_t * mask, size_t alignedSize, size_t fullSize, __m256i index) - { - for (size_t col = 0; col < alignedSize; col += A) - { - if (!_mm256_testz_si256(_mm256_cmpeq_epi8(_mm256_loadu_si256((__m256i*)(mask + col)), index), K_INV_ZERO)) - return true; - } - if (alignedSize != fullSize) - { - if (!_mm256_testz_si256(_mm256_cmpeq_epi8(_mm256_loadu_si256((__m256i*)(mask + fullSize - A)), index), K_INV_ZERO)) - return true; - } - return false; - } - - SIMD_INLINE bool ColsHasIndex(const uint8_t * mask, size_t stride, size_t size, __m256i index, uint8_t * cols) - { - __m256i _cols = _mm256_setzero_si256(); - for (size_t row = 0; row < size; ++row) - { - _cols = _mm256_or_si256(_cols, _mm256_cmpeq_epi8(_mm256_loadu_si256((__m256i*)mask), index)); - mask += stride; - } - _mm256_storeu_si256((__m256i*)cols, _cols); - return !_mm256_testz_si256(_cols, K_INV_ZERO); - } - - void SegmentationShrinkRegion(const uint8_t * mask, size_t stride, size_t width, size_t height, uint8_t index, - ptrdiff_t * left, ptrdiff_t * top, ptrdiff_t * right, ptrdiff_t * bottom) - { - assert(*right - *left >= A && *bottom > *top); - assert(*left >= 0 && *right <= (ptrdiff_t)width && *top >= 0 && *bottom <= (ptrdiff_t)height); - - size_t fullWidth = *right - *left; - ptrdiff_t alignedWidth = Simd::AlignLo(fullWidth, A); - __m256i _index = _mm256_set1_epi8(index); - bool search = true; - for (ptrdiff_t row = *top; search && row < *bottom; ++row) - { - if (RowHasIndex(mask + row*stride + *left, alignedWidth, fullWidth, _index)) - { - search = false; - *top = row; - } - } - - if (search) - { - *left = 0; - *top = 0; - *right = 0; - *bottom = 0; - return; - } - - search = true; - for (ptrdiff_t row = *bottom - 1; search && row >= *top; --row) - { - if (RowHasIndex(mask + row*stride + *left, alignedWidth, fullWidth, _index)) - { - search = false; - *bottom = row + 1; - } - } - - search = true; - for (ptrdiff_t col = *left; search && col < *left + alignedWidth; col += A) - { - uint8_t cols[A]; - if (ColsHasIndex(mask + (*top)*stride + col, stride, *bottom - *top, _index, cols)) - { - for (size_t i = 0; i < A; i++) - { - if (cols[i]) - { - *left = col + i; - break; - } - } - search = false; - break; - } - } - - search = true; - for (ptrdiff_t col = *right; search && col > *left; col -= A) - { - uint8_t cols[A]; - if (ColsHasIndex(mask + (*top)*stride + col - A, stride, *bottom - *top, _index, cols)) - { - for (ptrdiff_t i = A - 1; i >= 0; i--) - { - if (cols[i]) - { - *right = col - A + i + 1; - break; - } - } - search = false; - break; - } - } - } - } -#endif//SIMD_AVX2_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx2ShiftBilinear.cpp b/src/3rd/Simd/Simd/SimdAvx2ShiftBilinear.cpp deleted file mode 100644 index ca8fb2f7..00000000 --- a/src/3rd/Simd/Simd/SimdAvx2ShiftBilinear.cpp +++ /dev/null @@ -1,189 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdBase.h" - -namespace Simd -{ -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - const __m256i K16_LINEAR_ROUND_TERM = SIMD_MM256_SET1_EPI16(Base::LINEAR_ROUND_TERM); - const __m256i K16_BILINEAR_ROUND_TERM = SIMD_MM256_SET1_EPI16(Base::BILINEAR_ROUND_TERM); - - SIMD_INLINE __m256i Interpolate(__m256i s[2][2], __m256i k[2][2]) - { - __m256i r = _mm256_mullo_epi16(s[0][0], k[0][0]); - r = _mm256_add_epi16(r, _mm256_mullo_epi16(s[0][1], k[0][1])); - r = _mm256_add_epi16(r, _mm256_mullo_epi16(s[1][0], k[1][0])); - r = _mm256_add_epi16(r, _mm256_mullo_epi16(s[1][1], k[1][1])); - r = _mm256_add_epi16(r, K16_BILINEAR_ROUND_TERM); - return _mm256_srli_epi16(r, Base::BILINEAR_SHIFT); - } - - SIMD_INLINE __m256i Interpolate(__m256i s[2][2][2], __m256i k[2][2]) - { - return _mm256_packus_epi16(Interpolate(s[0], k), Interpolate(s[1], k)); - } - - SIMD_INLINE __m256i Interpolate(__m256i s[2], __m256i k[2]) - { - __m256i r = _mm256_mullo_epi16(s[0], k[0]); - r = _mm256_add_epi16(r, _mm256_mullo_epi16(s[1], k[1])); - r = _mm256_add_epi16(r, K16_LINEAR_ROUND_TERM); - return _mm256_srli_epi16(r, Base::LINEAR_SHIFT); - } - - SIMD_INLINE __m256i Interpolate(__m256i s[2][2], __m256i k[2]) - { - return _mm256_packus_epi16(Interpolate(s[0], k), Interpolate(s[1], k)); - } - - SIMD_INLINE void LoadBlock(const uint8_t *src, __m256i &lo, __m256i &hi) - { - const __m256i t = _mm256_loadu_si256((__m256i*)(src)); - lo = _mm256_unpacklo_epi8(t, K_ZERO); - hi = _mm256_unpackhi_epi8(t, K_ZERO); - } - - SIMD_INLINE void LoadBlock(const uint8_t *src, size_t dx, size_t dy, __m256i s[2][2][2]) - { - LoadBlock(src, s[0][0][0], s[1][0][0]); - LoadBlock(src + dx, s[0][0][1], s[1][0][1]); - LoadBlock(src + dy, s[0][1][0], s[1][1][0]); - LoadBlock(src + dy + dx, s[0][1][1], s[1][1][1]); - } - - SIMD_INLINE void LoadBlock(const uint8_t *src, size_t dr, __m256i s[2][2]) - { - LoadBlock(src, s[0][0], s[1][0]); - LoadBlock(src + dr, s[0][1], s[1][1]); - } - - void ShiftBilinear(const uint8_t *src, size_t srcStride, size_t width, size_t height, size_t channelCount, - int fDx, int fDy, uint8_t *dst, size_t dstStride) - { - size_t size = width*channelCount; - size_t alignedSize = AlignLo(size, A); - - if (fDy) - { - if (fDx) - { - __m256i k[2][2], s[2][2][2]; - k[0][0] = _mm256_set1_epi16((Base::FRACTION_RANGE - fDx)*(Base::FRACTION_RANGE - fDy)); - k[0][1] = _mm256_set1_epi16(fDx*(Base::FRACTION_RANGE - fDy)); - k[1][0] = 
_mm256_set1_epi16((Base::FRACTION_RANGE - fDx)*fDy); - k[1][1] = _mm256_set1_epi16(fDx*fDy); - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedSize; col += A) - { - LoadBlock(src + col, channelCount, srcStride, s); - _mm256_storeu_si256((__m256i*)(dst + col), Interpolate(s, k)); - } - if (size != alignedSize) - { - LoadBlock(src + size - A, channelCount, srcStride, s); - _mm256_storeu_si256((__m256i*)(dst + size - A), Interpolate(s, k)); - } - src += srcStride; - dst += dstStride; - } - } - else - { - __m256i k[2], s[2][2]; - k[0] = _mm256_set1_epi16(Base::FRACTION_RANGE - fDy); - k[1] = _mm256_set1_epi16(fDy); - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedSize; col += A) - { - LoadBlock(src + col, srcStride, s); - _mm256_storeu_si256((__m256i*)(dst + col), Interpolate(s, k)); - } - if (size != alignedSize) - { - LoadBlock(src + size - A, srcStride, s); - _mm256_storeu_si256((__m256i*)(dst + size - A), Interpolate(s, k)); - } - src += srcStride; - dst += dstStride; - } - } - } - else - { - if (fDx) - { - __m256i k[2], s[2][2]; - k[0] = _mm256_set1_epi16(Base::FRACTION_RANGE - fDx); - k[1] = _mm256_set1_epi16(fDx); - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedSize; col += A) - { - LoadBlock(src + col, channelCount, s); - _mm256_storeu_si256((__m256i*)(dst + col), Interpolate(s, k)); - } - if (size != alignedSize) - { - LoadBlock(src + size - A, channelCount, s); - _mm256_storeu_si256((__m256i*)(dst + size - A), Interpolate(s, k)); - } - src += srcStride; - dst += dstStride; - } - } - else - { - for (size_t row = 0; row < height; ++row) - { - memcpy(dst, src, size); - src += srcStride; - dst += dstStride; - } - } - } - } - - void ShiftBilinear( - const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t channelCount, - const uint8_t * bkg, size_t bkgStride, const double * shiftX, const double * shiftY, - size_t cropLeft, size_t cropTop, size_t cropRight, size_t cropBottom, uint8_t * dst, size_t dstStride) - { - int fDx, fDy; - Base::CommonShiftAction(src, srcStride, width, height, channelCount, bkg, bkgStride, shiftX, shiftY, - cropLeft, cropTop, cropRight, cropBottom, dst, dstStride, fDx, fDy); - - if (*shiftX + A < cropRight - cropLeft) - Avx2::ShiftBilinear(src, srcStride, width, height, channelCount, fDx, fDy, dst, dstStride); - else - Base::ShiftBilinear(src, srcStride, width, height, channelCount, fDx, fDy, dst, dstStride); - } - } -#endif//SIMD_AVX2_ENABLE -} - diff --git a/src/3rd/Simd/Simd/SimdAvx2Sobel.cpp b/src/3rd/Simd/Simd/SimdAvx2Sobel.cpp deleted file mode 100644 index 969c182e..00000000 --- a/src/3rd/Simd/Simd/SimdAvx2Sobel.cpp +++ /dev/null @@ -1,505 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. 
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#include "Simd/SimdMemory.h"
-#include "Simd/SimdExtract.h"
-#include "Simd/SimdSet.h"
-#include "Simd/SimdStore.h"
-#include "Simd/SimdCompare.h"
-
-namespace Simd
-{
-#ifdef SIMD_AVX2_ENABLE
-    namespace Avx2
-    {
-        template<bool abs> SIMD_INLINE void SobelDx(__m256i a[3][3], __m256i & lo, __m256i & hi)
-        {
-            lo = ConditionalAbs<abs>(BinomialSum16(SubUnpackedU8<0>(a[0][2], a[0][0]), SubUnpackedU8<0>(a[1][2], a[1][0]), SubUnpackedU8<0>(a[2][2], a[2][0])));
-            hi = ConditionalAbs<abs>(BinomialSum16(SubUnpackedU8<1>(a[0][2], a[0][0]), SubUnpackedU8<1>(a[1][2], a[1][0]), SubUnpackedU8<1>(a[2][2], a[2][0])));
-        }
-
-        template<bool align, bool abs> SIMD_INLINE void SobelDx(__m256i a[3][3], int16_t * dst)
-        {
-            __m256i lo, hi;
-            SobelDx<abs>(a, lo, hi);
-            Store<align>((__m256i*)dst + 0, _mm256_permute2x128_si256(lo, hi, 0x20));
-            Store<align>((__m256i*)dst + 1, _mm256_permute2x128_si256(lo, hi, 0x31));
-        }
-
-        template <bool align, bool abs> void SobelDx(const uint8_t * src, size_t srcStride, size_t width, size_t height, int16_t * dst, size_t dstStride)
-        {
-            assert(width > A);
-            if (align)
-                assert(Aligned(dst) && Aligned(dstStride, HA));
-
-            size_t bodyWidth = Simd::AlignHi(width, A) - A;
-            const uint8_t *src0, *src1, *src2;
-            __m256i a[3][3];
-
-            for (size_t row = 0; row < height; ++row)
-            {
-                src0 = src + srcStride*(row - 1);
-                src1 = src0 + srcStride;
-                src2 = src1 + srcStride;
-                if (row == 0)
-                    src0 = src1;
-                if (row == height - 1)
-                    src2 = src1;
-
-                LoadNoseDx(src0 + 0, a[0]);
-                LoadNoseDx(src1 + 0, a[1]);
-                LoadNoseDx(src2 + 0, a[2]);
-                SobelDx<align, abs>(a, dst + 0);
-                for (size_t col = A; col < bodyWidth; col += A)
-                {
-                    LoadBodyDx(src0 + col, a[0]);
-                    LoadBodyDx(src1 + col, a[1]);
-                    LoadBodyDx(src2 + col, a[2]);
-                    SobelDx<align, abs>(a, dst + col);
-                }
-                LoadTailDx(src0 + width - A, a[0]);
-                LoadTailDx(src1 + width - A, a[1]);
-                LoadTailDx(src2 + width - A, a[2]);
-                SobelDx<false, abs>(a, dst + width - A);
-
-                dst += dstStride;
-            }
-        }
-
-        void SobelDx(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride)
-        {
-            assert(dstStride % sizeof(int16_t) == 0);
-
-            if (Aligned(dst) && Aligned(dstStride))
-                SobelDx<true, false>(src, srcStride, width, height, (int16_t *)dst, dstStride / sizeof(int16_t));
-            else
-                SobelDx<false, false>(src, srcStride, width, height, (int16_t *)dst, dstStride / sizeof(int16_t));
-        }
-
-        void SobelDxAbs(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride)
-        {
-            assert(dstStride % sizeof(int16_t) == 0);
-
-            if (Aligned(dst) && Aligned(dstStride))
-                SobelDx<true, true>(src, srcStride, width, height, (int16_t *)dst, dstStride / sizeof(int16_t));
-            else
-                SobelDx<false, true>(src, srcStride, width, height, (int16_t *)dst, dstStride / sizeof(int16_t));
-        }
-
-        SIMD_INLINE void SobelDxAbsSum(__m256i a[3][3], __m256i & sum)
-        {
-            __m256i lo, hi;
-            SobelDx<true>(a, lo, hi);
-            sum = _mm256_add_epi32(sum, _mm256_madd_epi16(lo, K16_0001));
-            sum = _mm256_add_epi32(sum, _mm256_madd_epi16(hi, K16_0001));
-        }
-
-        SIMD_INLINE void SetMask3(__m256i a[3], __m256i mask)
-        {
-            a[0] = _mm256_and_si256(a[0], mask);
-            a[1] = _mm256_and_si256(a[1], mask);
-            a[2] = _mm256_and_si256(a[2], mask);
-        }
-
-        SIMD_INLINE void SetMask3x3(__m256i a[3][3], __m256i mask)
-        {
-            SetMask3(a[0], mask);
-            SetMask3(a[1], mask);
-            SetMask3(a[2], mask);
-        }
-
-        void SobelDxAbsSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum)
-        {
-            assert(width > A);
-
-            size_t bodyWidth = Simd::AlignHi(width, A) - A;
-            const uint8_t *src0, *src1, *src2;
-
-            __m256i a[3][3];
-            __m256i tailMask = SetMask<uint8_t>(0, A - width + bodyWidth, 0xFF);
-            __m256i fullSum = _mm256_setzero_si256();
-
-            for (size_t row = 0; row < height; ++row)
-            {
-                src0 = src + stride*(row - 1);
-                src1 = src0 + stride;
-                src2 = src1 + stride;
-                if (row == 0)
-                    src0 = src1;
-                if (row == height - 1)
-                    src2 = src1;
-
-                __m256i rowSum = _mm256_setzero_si256();
-
-                LoadNoseDx(src0 + 0, a[0]);
-                LoadNoseDx(src1 + 0, a[1]);
-                LoadNoseDx(src2 + 0, a[2]);
-                SobelDxAbsSum(a, rowSum);
-                for (size_t col = A; col < bodyWidth; col += A)
-                {
-                    LoadBodyDx(src0 + col, a[0]);
-                    LoadBodyDx(src1 + col, a[1]);
-                    LoadBodyDx(src2 + col, a[2]);
-                    SobelDxAbsSum(a, rowSum);
-                }
-                LoadTailDx(src0 + width - A, a[0]);
-                LoadTailDx(src1 + width - A, a[1]);
-                LoadTailDx(src2 + width - A, a[2]);
-                SetMask3x3(a, tailMask);
-                SobelDxAbsSum(a, rowSum);
-
-                fullSum = _mm256_add_epi64(fullSum, HorizontalSum32(rowSum));
-            }
-            *sum = ExtractSum(fullSum);
-        }
-
-        template<bool abs> SIMD_INLINE void SobelDy(__m256i a[3][3], __m256i & lo, __m256i & hi)
-        {
-            lo = ConditionalAbs<abs>(BinomialSum16(SubUnpackedU8<0>(a[2][0], a[0][0]), SubUnpackedU8<0>(a[2][1], a[0][1]), SubUnpackedU8<0>(a[2][2], a[0][2])));
-            hi = ConditionalAbs<abs>(BinomialSum16(SubUnpackedU8<1>(a[2][0], a[0][0]), SubUnpackedU8<1>(a[2][1], a[0][1]), SubUnpackedU8<1>(a[2][2], a[0][2])));
-        }
-
-        template<bool align, bool abs> SIMD_INLINE void SobelDy(__m256i a[3][3], int16_t * dst)
-        {
-            __m256i lo, hi;
-            SobelDy<abs>(a, lo, hi);
-            Store<align>((__m256i*)dst + 0, _mm256_permute2x128_si256(lo, hi, 0x20));
-            Store<align>((__m256i*)dst + 1, _mm256_permute2x128_si256(lo, hi, 0x31));
-        }
-
-        template <bool align, bool abs> void SobelDy(const uint8_t * src, size_t srcStride, size_t width, size_t height, int16_t * dst, size_t dstStride)
-        {
-            assert(width > A);
-            if (align)
-                assert(Aligned(dst) && Aligned(dstStride, HA));
-
-            size_t bodyWidth = Simd::AlignHi(width, A) - A;
-            const uint8_t *src0, *src1, *src2;
-            __m256i a[3][3];
-
-            for (size_t row = 0; row < height; ++row)
-            {
-                src0 = src + srcStride*(row - 1);
-                src1 = src0 + srcStride;
-                src2 = src1 + srcStride;
-                if (row == 0)
-                    src0 = src1;
-                if (row == height - 1)
-                    src2 = src1;
-
-                LoadNose3<align, 1>(src0 + 0, a[0]);
-                LoadNose3<align, 1>(src2 + 0, a[2]);
-                SobelDy<align, abs>(a, dst + 0);
-                for (size_t col = A; col < bodyWidth; col += A)
-                {
-                    LoadBody3<align, 1>(src0 + col, a[0]);
-                    LoadBody3<align, 1>(src2 + col, a[2]);
-                    SobelDy<align, abs>(a, dst + col);
-                }
-                LoadTail3<false, 1>(src0 + width - A, a[0]);
-                LoadTail3<false, 1>(src2 + width - A, a[2]);
-                SobelDy<false, abs>(a, dst + width - A);
-
-                dst += dstStride;
-            }
-        }
-
-        void SobelDy(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride)
-        {
-            assert(dstStride % sizeof(int16_t) == 0);
-
-            if (Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride))
-                SobelDy<true, false>(src, srcStride, width, height, (int16_t *)dst, dstStride / sizeof(int16_t));
-            else
-                SobelDy<false, false>(src, srcStride, width, height, (int16_t *)dst, dstStride / sizeof(int16_t));
-        }
-
-        void SobelDyAbs(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride)
-        {
-            assert(dstStride % sizeof(int16_t) == 0);
-
-            if (Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride))
-                
SobelDy(src, srcStride, width, height, (int16_t *)dst, dstStride / sizeof(int16_t)); - else - SobelDy(src, srcStride, width, height, (int16_t *)dst, dstStride / sizeof(int16_t)); - } - - SIMD_INLINE void SobelDyAbsSum(__m256i a[3][3], __m256i & sum) - { - __m256i lo, hi; - SobelDy(a, lo, hi); - sum = _mm256_add_epi32(sum, _mm256_madd_epi16(lo, K16_0001)); - sum = _mm256_add_epi32(sum, _mm256_madd_epi16(hi, K16_0001)); - } - - template void SobelDyAbsSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum) - { - assert(width > A); - - size_t bodyWidth = Simd::AlignHi(width, A) - A; - const uint8_t *src0, *src1, *src2; - - __m256i a[3][3]; - __m256i tailMask = SetMask(0, A - width + bodyWidth, 0xFF); - __m256i fullSum = _mm256_setzero_si256(); - - for (size_t row = 0; row < height; ++row) - { - src0 = src + stride*(row - 1); - src1 = src0 + stride; - src2 = src1 + stride; - if (row == 0) - src0 = src1; - if (row == height - 1) - src2 = src1; - - __m256i rowSum = _mm256_setzero_si256(); - - LoadNose3(src0 + 0, a[0]); - LoadNose3(src2 + 0, a[2]); - SobelDyAbsSum(a, rowSum); - for (size_t col = A; col < bodyWidth; col += A) - { - LoadBody3(src0 + col, a[0]); - LoadBody3(src2 + col, a[2]); - SobelDyAbsSum(a, rowSum); - } - LoadTail3(src0 + width - A, a[0]); - LoadTail3(src2 + width - A, a[2]); - SetMask3x3(a, tailMask); - SobelDyAbsSum(a, rowSum); - - fullSum = _mm256_add_epi64(fullSum, HorizontalSum32(rowSum)); - } - *sum = ExtractSum(fullSum); - } - - void SobelDyAbsSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum) - { - if (Aligned(src) && Aligned(stride)) - SobelDyAbsSum(src, stride, width, height, sum); - else - SobelDyAbsSum(src, stride, width, height, sum); - } - - SIMD_INLINE __m256i ContourMetrics(__m256i dx, __m256i dy) - { - return _mm256_add_epi16(_mm256_slli_epi16(_mm256_add_epi16(dx, dy), 1), _mm256_and_si256(_mm256_cmpgt_epi16(dy, dx), K16_0001)); - } - - SIMD_INLINE void ContourMetrics(__m256i a[3][3], __m256i & lo, __m256i & hi) - { - __m256i dxLo, dxHi, dyLo, dyHi; - SobelDx(a, dxLo, dxHi); - SobelDy(a, dyLo, dyHi); - lo = ContourMetrics(dxLo, dyLo); - hi = ContourMetrics(dxHi, dyHi); - } - - template SIMD_INLINE void ContourMetrics(__m256i a[3][3], int16_t * dst) - { - __m256i lo, hi; - ContourMetrics(a, lo, hi); - Store((__m256i*)dst + 0, _mm256_permute2x128_si256(lo, hi, 0x20)); - Store((__m256i*)dst + 1, _mm256_permute2x128_si256(lo, hi, 0x31)); - } - - template void ContourMetrics(const uint8_t * src, size_t srcStride, size_t width, size_t height, int16_t * dst, size_t dstStride) - { - assert(width > A); - if (align) - assert(Aligned(dst) && Aligned(dstStride, HA)); - - size_t bodyWidth = Simd::AlignHi(width, A) - A; - const uint8_t *src0, *src1, *src2; - __m256i a[3][3]; - - for (size_t row = 0; row < height; ++row) - { - src0 = src + srcStride*(row - 1); - src1 = src0 + srcStride; - src2 = src1 + srcStride; - if (row == 0) - src0 = src1; - if (row == height - 1) - src2 = src1; - - LoadNose3(src0 + 0, a[0]); - LoadNose3(src1 + 0, a[1]); - LoadNose3(src2 + 0, a[2]); - ContourMetrics(a, dst + 0); - for (size_t col = A; col < bodyWidth; col += A) - { - LoadBody3(src0 + col, a[0]); - LoadBody3(src1 + col, a[1]); - LoadBody3(src2 + col, a[2]); - ContourMetrics(a, dst + col); - } - LoadTail3(src0 + width - A, a[0]); - LoadTail3(src1 + width - A, a[1]); - LoadTail3(src2 + width - A, a[2]); - ContourMetrics(a, dst + width - A); - - dst += dstStride; - } - } - - void ContourMetrics(const uint8_t * src, size_t 
srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride) - { - assert(dstStride % sizeof(int16_t) == 0); - - if (Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride)) - ContourMetrics(src, srcStride, width, height, (int16_t *)dst, dstStride / sizeof(int16_t)); - else - ContourMetrics(src, srcStride, width, height, (int16_t *)dst, dstStride / sizeof(int16_t)); - } - - template SIMD_INLINE void ContourMetricsMasked(__m256i a[3][3], const uint8_t * mask, const __m256i & indexMin, int16_t * dst) - { - __m256i m = GreaterOrEqual8u(Load((__m256i*)mask), indexMin); - __m256i lo, hi; - ContourMetrics(a, lo, hi); - lo = _mm256_and_si256(lo, _mm256_unpacklo_epi8(m, m)); - hi = _mm256_and_si256(hi, _mm256_unpackhi_epi8(m, m)); - Store((__m256i*)dst + 0, _mm256_permute2x128_si256(lo, hi, 0x20)); - Store((__m256i*)dst + 1, _mm256_permute2x128_si256(lo, hi, 0x31)); - } - - template void ContourMetricsMasked(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * mask, size_t maskStride, uint8_t indexMin, int16_t * dst, size_t dstStride) - { - assert(width > A); - if (align) - assert(Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride, HA) && Aligned(mask) && Aligned(maskStride)); - - size_t bodyWidth = Simd::AlignHi(width, A) - A; - const uint8_t *src0, *src1, *src2; - __m256i _indexMin = _mm256_set1_epi8(indexMin); - __m256i a[3][3]; - - for (size_t row = 0; row < height; ++row) - { - src0 = src + srcStride*(row - 1); - src1 = src0 + srcStride; - src2 = src1 + srcStride; - if (row == 0) - src0 = src1; - if (row == height - 1) - src2 = src1; - - LoadNose3(src0 + 0, a[0]); - LoadNose3(src1 + 0, a[1]); - LoadNose3(src2 + 0, a[2]); - ContourMetricsMasked(a, mask + 0, _indexMin, dst + 0); - for (size_t col = A; col < bodyWidth; col += A) - { - LoadBody3(src0 + col, a[0]); - LoadBody3(src1 + col, a[1]); - LoadBody3(src2 + col, a[2]); - ContourMetricsMasked(a, mask + col, _indexMin, dst + col); - } - LoadTail3(src0 + width - A, a[0]); - LoadTail3(src1 + width - A, a[1]); - LoadTail3(src2 + width - A, a[2]); - ContourMetricsMasked(a, mask + width - A, _indexMin, dst + width - A); - - dst += dstStride; - mask += maskStride; - } - } - - void ContourMetricsMasked(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * mask, size_t maskStride, uint8_t indexMin, uint8_t * dst, size_t dstStride) - { - assert(dstStride % sizeof(int16_t) == 0); - - if (Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride) && Aligned(mask) && Aligned(maskStride)) - ContourMetricsMasked(src, srcStride, width, height, mask, maskStride, indexMin, (int16_t *)dst, dstStride / sizeof(int16_t)); - else - ContourMetricsMasked(src, srcStride, width, height, mask, maskStride, indexMin, (int16_t *)dst, dstStride / sizeof(int16_t)); - } - - template SIMD_INLINE __m256i AnchorComponent(const int16_t * src, size_t step, const __m256i & current, const __m256i & threshold, const __m256i & mask) - { - __m256i last = _mm256_srli_epi16(Load((__m256i*)(src - step)), 1); - __m256i next = _mm256_srli_epi16(Load((__m256i*)(src + step)), 1); - return _mm256_andnot_si256(_mm256_or_si256(_mm256_cmpgt_epi16(threshold, _mm256_sub_epi16(current, last)), - _mm256_cmpgt_epi16(threshold, _mm256_sub_epi16(current, next))), mask); - } - - template SIMD_INLINE __m256i Anchor(const int16_t * src, size_t stride, const __m256i & threshold) - { - __m256i _src = Load((__m256i*)src); - __m256i direction = _mm256_and_si256(_src, K16_0001); - 
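/* Bit 0 of each contour metric is the direction flag (1 when dy > dx) and the upper bits hold the magnitude dx + dy, so the anchor test first splits the metric into its two fields. */ - 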
__m256i magnitude = _mm256_srli_epi16(_src, 1); - __m256i vertical = AnchorComponent(src, 1, magnitude, threshold, _mm256_cmpeq_epi16(direction, K16_0001)); - __m256i horizontal = AnchorComponent(src, stride, magnitude, threshold, _mm256_cmpeq_epi16(direction, K_ZERO)); - return _mm256_andnot_si256(_mm256_cmpeq_epi16(magnitude, K_ZERO), _mm256_and_si256(_mm256_or_si256(vertical, horizontal), K16_00FF)); - } - - template SIMD_INLINE void Anchor(const int16_t * src, size_t stride, const __m256i & threshold, uint8_t * dst) - { - __m256i lo = Anchor(src, stride, threshold); - __m256i hi = Anchor(src + HA, stride, threshold); - Store((__m256i*)dst, PackI16ToU8(lo, hi)); - } - - template void ContourAnchors(const int16_t * src, size_t srcStride, size_t width, size_t height, - size_t step, int16_t threshold, uint8_t * dst, size_t dstStride) - { - assert(width > A); - if (align) - assert(Aligned(src) && Aligned(srcStride, HA) && Aligned(dst) && Aligned(dstStride)); - - size_t bodyWidth = Simd::AlignHi(width, A) - A; - __m256i _threshold = _mm256_set1_epi16(threshold); - memset(dst, 0, width); - memset(dst + dstStride*(height - 1), 0, width); - src += srcStride; - dst += dstStride; - for (size_t row = 1; row < height - 1; row += step) - { - dst[0] = 0; - Anchor(src + 1, srcStride, _threshold, dst + 1); - for (size_t col = A; col < bodyWidth; col += A) - Anchor(src + col, srcStride, _threshold, dst + col); - Anchor(src + width - A - 1, srcStride, _threshold, dst + width - A - 1); - dst[width - 1] = 0; - src += step*srcStride; - dst += step*dstStride; - } - } - - void ContourAnchors(const uint8_t * src, size_t srcStride, size_t width, size_t height, - size_t step, int16_t threshold, uint8_t * dst, size_t dstStride) - { - assert(srcStride % sizeof(int16_t) == 0); - - if (Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride)) - ContourAnchors((const int16_t *)src, srcStride / sizeof(int16_t), width, height, step, threshold, dst, dstStride); - else - ContourAnchors((const int16_t *)src, srcStride / sizeof(int16_t), width, height, step, threshold, dst, dstStride); - } - } -#endif// SIMD_AVX2_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx2SquaredDifferenceSum.cpp b/src/3rd/Simd/Simd/SimdAvx2SquaredDifferenceSum.cpp deleted file mode 100644 index c6abc290..00000000 --- a/src/3rd/Simd/Simd/SimdAvx2SquaredDifferenceSum.cpp +++ /dev/null @@ -1,135 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#include "Simd/SimdMemory.h"
-#include "Simd/SimdExtract.h"
-#include "Simd/SimdSet.h"
-#include "Simd/SimdStore.h"
-
-namespace Simd
-{
-#ifdef SIMD_AVX2_ENABLE
-    namespace Avx2
-    {
-        // |a - b|^2 for 32 packed bytes: widen to 16 bits, subtract, then square
-        // and pairwise-add into eight 32-bit lanes.
-        SIMD_INLINE __m256i SquaredDifference(__m256i a, __m256i b)
-        {
-            const __m256i lo = SubUnpackedU8<0>(a, b);
-            const __m256i hi = SubUnpackedU8<1>(a, b);
-            return _mm256_add_epi32(_mm256_madd_epi16(lo, lo), _mm256_madd_epi16(hi, hi));
-        }
-
-        template <bool align> void SquaredDifferenceSum(
-            const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride,
-            size_t width, size_t height, uint64_t * sum)
-        {
-            assert(width < 0x10000);
-            if (align)
-            {
-                assert(Aligned(a) && Aligned(aStride) && Aligned(b) && Aligned(bStride));
-            }
-
-            size_t bodyWidth = AlignLo(width, A);
-            __m256i tailMask = SetMask<uint8_t>(0, A - width + bodyWidth, 0xFF);
-            __m256i fullSum = _mm256_setzero_si256();
-            for (size_t row = 0; row < height; ++row)
-            {
-                __m256i rowSum = _mm256_setzero_si256();
-                for (size_t col = 0; col < bodyWidth; col += A)
-                {
-                    const __m256i a_ = Load<align>((__m256i*)(a + col));
-                    const __m256i b_ = Load<align>((__m256i*)(b + col));
-                    rowSum = _mm256_add_epi32(rowSum, SquaredDifference(a_, b_));
-                }
-                if (width - bodyWidth)
-                {
-                    const __m256i a_ = _mm256_and_si256(tailMask, Load<false>((__m256i*)(a + width - A)));
-                    const __m256i b_ = _mm256_and_si256(tailMask, Load<false>((__m256i*)(b + width - A)));
-                    rowSum = _mm256_add_epi32(rowSum, SquaredDifference(a_, b_));
-                }
-                fullSum = _mm256_add_epi64(fullSum, HorizontalSum32(rowSum));
-                a += aStride;
-                b += bStride;
-            }
-            *sum = ExtractSum(fullSum);
-        }
-
-        template <bool align> void SquaredDifferenceSumMasked(
-            const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride,
-            const uint8_t *mask, size_t maskStride, uint8_t index, size_t width, size_t height, uint64_t * sum)
-        {
-            assert(width < 0x10000);
-            if (align)
-            {
-                assert(Aligned(a) && Aligned(aStride) && Aligned(b) && Aligned(bStride));
-                assert(Aligned(mask) && Aligned(maskStride));
-            }
-
-            size_t bodyWidth = AlignLo(width, A);
-            __m256i tailMask = SetMask<uint8_t>(0, A - width + bodyWidth, 0xFF);
-            __m256i fullSum = _mm256_setzero_si256();
-            __m256i index_ = _mm256_set1_epi8(index);
-            for (size_t row = 0; row < height; ++row)
-            {
-                __m256i rowSum = _mm256_setzero_si256();
-                for (size_t col = 0; col < bodyWidth; col += A)
-                {
-                    const __m256i mask_ = LoadMaskI8<align>((__m256i*)(mask + col), index_);
-                    const __m256i a_ = _mm256_and_si256(mask_, Load<align>((__m256i*)(a + col)));
-                    const __m256i b_ = _mm256_and_si256(mask_, Load<align>((__m256i*)(b + col)));
-                    rowSum = _mm256_add_epi32(rowSum, SquaredDifference(a_, b_));
-                }
-                if (width - bodyWidth)
-                {
-                    const __m256i mask_ = _mm256_and_si256(tailMask, LoadMaskI8<false>((__m256i*)(mask + width - A), index_));
-                    const __m256i a_ = _mm256_and_si256(mask_, Load<false>((__m256i*)(a + width - A)));
-                    const __m256i b_ = _mm256_and_si256(mask_, Load<false>((__m256i*)(b + width - A)));
-                    rowSum = _mm256_add_epi32(rowSum, SquaredDifference(a_, b_));
-                }
-                fullSum = _mm256_add_epi64(fullSum, HorizontalSum32(rowSum));
-                a += aStride;
-                b += bStride;
-                mask += maskStride;
-            }
-            *sum = ExtractSum(fullSum);
-        }
-
-        void SquaredDifferenceSum(const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride,
-            size_t width, size_t height, uint64_t * sum)
-        {
-            if (Aligned(a) && Aligned(aStride) && Aligned(b) && Aligned(bStride))
-                SquaredDifferenceSum<true>(a, aStride, b, bStride, width, height, sum);
-            else
-                SquaredDifferenceSum<false>(a, aStride, b, bStride, width, height, sum);
-        }
-
-        void SquaredDifferenceSumMasked(const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride,
-            const uint8_t *mask, size_t maskStride, uint8_t index, size_t width, size_t height, uint64_t * sum)
-        {
-            if (Aligned(a) && Aligned(aStride) && Aligned(b) && Aligned(bStride) && Aligned(mask) && Aligned(maskStride))
-                SquaredDifferenceSumMasked<true>(a, aStride, b, bStride, mask, maskStride, index, width, height, sum);
-            else
-                SquaredDifferenceSumMasked<false>(a, aStride, b, bStride, mask, maskStride, index, width, height, sum);
-        }
-    }
-#endif// SIMD_AVX2_ENABLE
-}
diff --git a/src/3rd/Simd/Simd/SimdAvx2Statistic.cpp b/src/3rd/Simd/Simd/SimdAvx2Statistic.cpp
deleted file mode 100644
index d32df0b1..00000000
--- a/src/3rd/Simd/Simd/SimdAvx2Statistic.cpp
+++ /dev/null
@@ -1,456 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdExtract.h" -#include "Simd/SimdSet.h" - -namespace Simd -{ -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - template void GetStatistic(const uint8_t * src, size_t stride, size_t width, size_t height, - uint8_t * min, uint8_t * max, uint8_t * average) - { - assert(width*height && width >= A); - if (align) - assert(Aligned(src) && Aligned(stride)); - - size_t bodyWidth = AlignLo(width, A); - __m256i tailMask = SetMask(0, A - width + bodyWidth, 0xFF); - __m256i sum = _mm256_setzero_si256(); - __m256i min_ = K_INV_ZERO; - __m256i max_ = K_ZERO; - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < bodyWidth; col += A) - { - const __m256i value = Load((__m256i*)(src + col)); - min_ = _mm256_min_epu8(min_, value); - max_ = _mm256_max_epu8(max_, value); - sum = _mm256_add_epi64(_mm256_sad_epu8(value, K_ZERO), sum); - } - if (width - bodyWidth) - { - const __m256i value = Load((__m256i*)(src + width - A)); - min_ = _mm256_min_epu8(min_, value); - max_ = _mm256_max_epu8(max_, value); - sum = _mm256_add_epi64(_mm256_sad_epu8(_mm256_and_si256(tailMask, value), K_ZERO), sum); - } - src += stride; - } - - uint8_t min_buffer[A], max_buffer[A]; - _mm256_storeu_si256((__m256i*)min_buffer, min_); - _mm256_storeu_si256((__m256i*)max_buffer, max_); - *min = UCHAR_MAX; - *max = 0; - for (size_t i = 0; i < A; ++i) - { - *min = Base::MinU8(min_buffer[i], *min); - *max = Base::MaxU8(max_buffer[i], *max); - } - *average = (uint8_t)((ExtractSum(sum) + width*height / 2) / (width*height)); - } - - void GetStatistic(const uint8_t * src, size_t stride, size_t width, size_t height, - uint8_t * min, uint8_t * max, uint8_t * average) - { - if (Aligned(src) && Aligned(stride)) - GetStatistic(src, stride, width, height, min, max, average); - else - GetStatistic(src, stride, width, height, min, max, average); - } - - template void GetRowSums(const uint8_t * src, size_t stride, size_t width, size_t height, uint32_t * sums) - { - size_t alignedWidth = AlignLo(width, A); - __m256i tailMask = SetMask(0, A - width + alignedWidth, 0xFF); - - memset(sums, 0, sizeof(uint32_t)*height); - for (size_t row = 0; row < height; ++row) - { - __m256i sum = _mm256_setzero_si256(); - for (size_t col = 0; col < alignedWidth; col += A) - { - __m256i _src = Load((__m256i*)(src + col)); - sum = _mm256_add_epi32(sum, _mm256_sad_epu8(_src, K_ZERO)); - } - if (alignedWidth != width) - { - __m256i _src = _mm256_and_si256(Load((__m256i*)(src + width - A)), tailMask); - sum = _mm256_add_epi32(sum, _mm256_sad_epu8(_src, K_ZERO)); - } - sums[row] = ExtractSum(sum); - src += stride; - } - } - - void GetRowSums(const uint8_t * src, size_t stride, size_t width, size_t height, uint32_t * sums) - { - if (Aligned(src) && Aligned(stride)) - GetRowSums(src, stride, width, height, sums); - else - GetRowSums(src, stride, width, height, sums); - } - - namespace - { - struct Buffer - { - Buffer(size_t width) - { - _p = Allocate(sizeof(uint16_t)*width + sizeof(uint32_t)*width); - sums16 = (uint16_t*)_p; - sums32 = (uint32_t*)(sums16 + width); - } - - ~Buffer() - { - Free(_p); - } - - uint16_t * sums16; - uint32_t * sums32; - private: - void *_p; - }; - } - - SIMD_INLINE void Sum16(__m256i src8, uint16_t * sums16) - { - Store((__m256i*)sums16 + 0, _mm256_add_epi16(Load((__m256i*)sums16 + 0), _mm256_unpacklo_epi8(src8, K_ZERO))); - Store((__m256i*)sums16 + 1, _mm256_add_epi16(Load((__m256i*)sums16 + 1), _mm256_unpackhi_epi8(src8, K_ZERO))); - } - - SIMD_INLINE void 
Sum16To32(const uint16_t * src, uint32_t * dst) - { - __m256i lo = LoadPermuted((__m256i*)src + 0); - __m256i hi = LoadPermuted((__m256i*)src + 1); - Store((__m256i*)dst + 0, _mm256_add_epi32(Load((__m256i*)dst + 0), _mm256_unpacklo_epi16(lo, K_ZERO))); - Store((__m256i*)dst + 1, _mm256_add_epi32(Load((__m256i*)dst + 1), _mm256_unpacklo_epi16(hi, K_ZERO))); - Store((__m256i*)dst + 2, _mm256_add_epi32(Load((__m256i*)dst + 2), _mm256_unpackhi_epi16(lo, K_ZERO))); - Store((__m256i*)dst + 3, _mm256_add_epi32(Load((__m256i*)dst + 3), _mm256_unpackhi_epi16(hi, K_ZERO))); - } - - template void GetColSums(const uint8_t * src, size_t stride, size_t width, size_t height, uint32_t * sums) - { - size_t alignedLoWidth = AlignLo(width, A); - size_t alignedHiWidth = AlignHi(width, A); - size_t stepSize = SCHAR_MAX + 1; - size_t stepCount = (height + SCHAR_MAX) / stepSize; - - Buffer buffer(alignedHiWidth); - memset(buffer.sums32, 0, sizeof(uint32_t)*alignedHiWidth); - for (size_t step = 0; step < stepCount; ++step) - { - size_t rowStart = step*stepSize; - size_t rowEnd = Min(rowStart + stepSize, height); - - memset(buffer.sums16, 0, sizeof(uint16_t)*alignedHiWidth); - for (size_t row = rowStart; row < rowEnd; ++row) - { - for (size_t col = 0; col < alignedLoWidth; col += A) - { - __m256i src8 = Load((__m256i*)(src + col)); - Sum16(src8, buffer.sums16 + col); - } - if (alignedLoWidth != width) - { - __m256i src8 = Load((__m256i*)(src + width - A)); - Sum16(src8, buffer.sums16 + alignedLoWidth); - } - src += stride; - } - - for (size_t col = 0; col < alignedHiWidth; col += A) - Sum16To32(buffer.sums16 + col, buffer.sums32 + col); - } - memcpy(sums, buffer.sums32, sizeof(uint32_t)*alignedLoWidth); - if (alignedLoWidth != width) - memcpy(sums + alignedLoWidth, buffer.sums32 + alignedLoWidth + alignedHiWidth - width, sizeof(uint32_t)*(width - alignedLoWidth)); - } - - void GetColSums(const uint8_t * src, size_t stride, size_t width, size_t height, uint32_t * sums) - { - if (Aligned(src) && Aligned(stride)) - GetColSums(src, stride, width, height, sums); - else - GetColSums(src, stride, width, height, sums); - } - - template void GetAbsDyRowSums(const uint8_t * src, size_t stride, size_t width, size_t height, uint32_t * sums) - { - size_t alignedWidth = AlignLo(width, A); - __m256i tailMask = SetMask(0, A - width + alignedWidth, 0xFF); - - memset(sums, 0, sizeof(uint32_t)*height); - const uint8_t * src0 = src; - const uint8_t * src1 = src + stride; - height--; - for (size_t row = 0; row < height; ++row) - { - __m256i sum = _mm256_setzero_si256(); - for (size_t col = 0; col < alignedWidth; col += A) - { - __m256i _src0 = Load((__m256i*)(src0 + col)); - __m256i _src1 = Load((__m256i*)(src1 + col)); - sum = _mm256_add_epi32(sum, _mm256_sad_epu8(_src0, _src1)); - } - if (alignedWidth != width) - { - __m256i _src0 = _mm256_and_si256(Load((__m256i*)(src0 + width - A)), tailMask); - __m256i _src1 = _mm256_and_si256(Load((__m256i*)(src1 + width - A)), tailMask); - sum = _mm256_add_epi32(sum, _mm256_sad_epu8(_src0, _src1)); - } - sums[row] = ExtractSum(sum); - src0 += stride; - src1 += stride; - } - } - - void GetAbsDyRowSums(const uint8_t * src, size_t stride, size_t width, size_t height, uint32_t * sums) - { - if (Aligned(src) && Aligned(stride)) - GetAbsDyRowSums(src, stride, width, height, sums); - else - GetAbsDyRowSums(src, stride, width, height, sums); - } - - template void GetAbsDxColSums(const uint8_t * src, size_t stride, size_t width, size_t height, uint32_t * sums) - { - width--; - size_t alignedLoWidth = 
AlignLo(width, A); - size_t alignedHiWidth = AlignHi(width, A); - size_t stepSize = SCHAR_MAX + 1; - size_t stepCount = (height + SCHAR_MAX) / stepSize; - - Buffer buffer(alignedHiWidth); - memset(buffer.sums32, 0, sizeof(uint32_t)*alignedHiWidth); - for (size_t step = 0; step < stepCount; ++step) - { - size_t rowStart = step*stepSize; - size_t rowEnd = Min(rowStart + stepSize, height); - - memset(buffer.sums16, 0, sizeof(uint16_t)*alignedHiWidth); - for (size_t row = rowStart; row < rowEnd; ++row) - { - for (size_t col = 0; col < alignedLoWidth; col += A) - { - __m256i _src0 = Load((__m256i*)(src + col + 0)); - __m256i _src1 = Load((__m256i*)(src + col + 1)); - Sum16(AbsDifferenceU8(_src0, _src1), buffer.sums16 + col); - } - if (alignedLoWidth != width) - { - __m256i _src0 = Load((__m256i*)(src + width - A + 0)); - __m256i _src1 = Load((__m256i*)(src + width - A + 1)); - Sum16(AbsDifferenceU8(_src0, _src1), buffer.sums16 + alignedLoWidth); - } - src += stride; - } - - for (size_t col = 0; col < alignedHiWidth; col += A) - Sum16To32(buffer.sums16 + col, buffer.sums32 + col); - } - memcpy(sums, buffer.sums32, sizeof(uint32_t)*alignedLoWidth); - if (alignedLoWidth != width) - memcpy(sums + alignedLoWidth, buffer.sums32 + alignedLoWidth + alignedHiWidth - width, sizeof(uint32_t)*(width - alignedLoWidth)); - sums[width] = 0; - } - - void GetAbsDxColSums(const uint8_t * src, size_t stride, size_t width, size_t height, uint32_t * sums) - { - if (Aligned(src) && Aligned(stride)) - GetAbsDxColSums(src, stride, width, height, sums); - else - GetAbsDxColSums(src, stride, width, height, sums); - } - - template void ValueSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum) - { - assert(width >= A); - if (align) - assert(Aligned(src) && Aligned(stride)); - - size_t bodyWidth = AlignLo(width, A); - __m256i tailMask = SetMask(0, A - width + bodyWidth, 0xFF); - __m256i fullSum = _mm256_setzero_si256(); - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < bodyWidth; col += A) - { - const __m256i src_ = Load((__m256i*)(src + col)); - fullSum = _mm256_add_epi64(_mm256_sad_epu8(src_, K_ZERO), fullSum); - } - if (width - bodyWidth) - { - const __m256i src_ = _mm256_and_si256(tailMask, Load((__m256i*)(src + width - A))); - fullSum = _mm256_add_epi64(_mm256_sad_epu8(src_, K_ZERO), fullSum); - } - src += stride; - } - *sum = ExtractSum(fullSum); - } - - void ValueSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum) - { - if (Aligned(src) && Aligned(stride)) - ValueSum(src, stride, width, height, sum); - else - ValueSum(src, stride, width, height, sum); - } - - SIMD_INLINE __m256i Square(__m256i src) - { - const __m256i lo = _mm256_unpacklo_epi8(src, _mm256_setzero_si256()); - const __m256i hi = _mm256_unpackhi_epi8(src, _mm256_setzero_si256()); - return _mm256_add_epi32(_mm256_madd_epi16(lo, lo), _mm256_madd_epi16(hi, hi)); - } - - template void SquareSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum) - { - assert(width >= A); - if (align) - assert(Aligned(src) && Aligned(stride)); - - size_t bodyWidth = AlignLo(width, A); - __m256i tailMask = SetMask(0, A - width + bodyWidth, 0xFF); - __m256i fullSum = _mm256_setzero_si256(); - for (size_t row = 0; row < height; ++row) - { - __m256i rowSum = _mm256_setzero_si256(); - for (size_t col = 0; col < bodyWidth; col += A) - { - const __m256i src_ = Load((__m256i*)(src + col)); - rowSum = _mm256_add_epi32(rowSum, Square(src_)); - } - if (width - 
bodyWidth) - { - const __m256i src_ = _mm256_and_si256(tailMask, Load((__m256i*)(src + width - A))); - rowSum = _mm256_add_epi32(rowSum, Square(src_)); - } - fullSum = _mm256_add_epi64(fullSum, HorizontalSum32(rowSum)); - src += stride; - } - *sum = ExtractSum(fullSum); - } - - void SquareSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum) - { - if (Aligned(src) && Aligned(stride)) - SquareSum(src, stride, width, height, sum); - else - SquareSum(src, stride, width, height, sum); - } - - template void ValueSquareSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * valueSum, uint64_t * squareSum) - { - assert(width >= A); - if (align) - assert(Aligned(src) && Aligned(stride)); - - size_t bodyWidth = AlignLo(width, A); - __m256i tailMask = SetMask(0, A - width + bodyWidth, 0xFF); - __m256i fullValueSum = _mm256_setzero_si256(); - __m256i fullSquareSum = _mm256_setzero_si256(); - for (size_t row = 0; row < height; ++row) - { - __m256i rowSquareSum = _mm256_setzero_si256(); - for (size_t col = 0; col < bodyWidth; col += A) - { - const __m256i value = Load((__m256i*)(src + col)); - fullValueSum = _mm256_add_epi64(_mm256_sad_epu8(value, K_ZERO), fullValueSum); - rowSquareSum = _mm256_add_epi32(rowSquareSum, Square(value)); - } - if (width - bodyWidth) - { - const __m256i value = _mm256_and_si256(tailMask, Load((__m256i*)(src + width - A))); - fullValueSum = _mm256_add_epi64(_mm256_sad_epu8(value, K_ZERO), fullValueSum); - rowSquareSum = _mm256_add_epi32(rowSquareSum, Square(value)); - } - fullSquareSum = _mm256_add_epi64(fullSquareSum, HorizontalSum32(rowSquareSum)); - src += stride; - } - *valueSum = ExtractSum(fullValueSum); - *squareSum = ExtractSum(fullSquareSum); - } - - void ValueSquareSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * valueSum, uint64_t * squareSum) - { - if (Aligned(src) && Aligned(stride)) - ValueSquareSum(src, stride, width, height, valueSum, squareSum); - else - ValueSquareSum(src, stride, width, height, valueSum, squareSum); - } - - SIMD_INLINE __m256i Correlation(__m256i a, __m256i b) - { - const __m256i lo = _mm256_madd_epi16(_mm256_unpacklo_epi8(a, _mm256_setzero_si256()), _mm256_unpacklo_epi8(b, _mm256_setzero_si256())); - const __m256i hi = _mm256_madd_epi16(_mm256_unpackhi_epi8(a, _mm256_setzero_si256()), _mm256_unpackhi_epi8(b, _mm256_setzero_si256())); - return _mm256_add_epi32(lo, hi); - } - - template void CorrelationSum(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, size_t width, size_t height, uint64_t * sum) - { - assert(width >= A); - if (align) - assert(Aligned(a) && Aligned(aStride) && Aligned(b) && Aligned(bStride)); - - size_t bodyWidth = AlignLo(width, A); - __m256i tailMask = SetMask(0, A - width + bodyWidth, 0xFF); - __m256i fullSum = _mm256_setzero_si256(); - for (size_t row = 0; row < height; ++row) - { - __m256i rowSum = _mm256_setzero_si256(); - for (size_t col = 0; col < bodyWidth; col += A) - { - const __m256i a_ = Load((__m256i*)(a + col)); - const __m256i b_ = Load((__m256i*)(b + col)); - rowSum = _mm256_add_epi32(rowSum, Correlation(a_, b_)); - } - if (width - bodyWidth) - { - const __m256i a_ = _mm256_and_si256(tailMask, Load((__m256i*)(a + width - A))); - const __m256i b_ = _mm256_and_si256(tailMask, Load((__m256i*)(b + width - A))); - rowSum = _mm256_add_epi32(rowSum, Correlation(a_, b_)); - } - fullSum = _mm256_add_epi64(fullSum, HorizontalSum32(rowSum)); - a += aStride; - b += bStride; - } - *sum = ExtractSum(fullSum); - } 
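// The Correlation helper above widens both 8-bit inputs to 16 bits and uses
// _mm256_madd_epi16, so a 32-bit lane grows by at most 2 * 255 * 255 per
// vector; flushing rowSum into the 64-bit total once per row (HorizontalSum32)
// is what keeps those lanes from overflowing on tall images. A scalar
// reference of the contract, with an illustrative *Ref name:
#include <cstddef>
#include <cstdint>

static void CorrelationSumRef(const uint8_t* a, size_t aStride,
                              const uint8_t* b, size_t bStride,
                              size_t width, size_t height, uint64_t* sum)
{
    uint64_t total = 0;
    for (size_t row = 0; row < height; ++row, a += aStride, b += bStride)
        for (size_t col = 0; col < width; ++col)
            total += (uint64_t)a[col] * b[col]; // sum of elementwise products
    *sum = total;
}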
- - void CorrelationSum(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, size_t width, size_t height, uint64_t * sum) - { - if (Aligned(a) && Aligned(aStride) && Aligned(b) && Aligned(bStride)) - CorrelationSum(a, aStride, b, bStride, width, height, sum); - else - CorrelationSum(a, aStride, b, bStride, width, height, sum); - } - } -#endif// SIMD_AVX2_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx2StatisticMoments.cpp b/src/3rd/Simd/Simd/SimdAvx2StatisticMoments.cpp deleted file mode 100644 index 940c2fec..00000000 --- a/src/3rd/Simd/Simd/SimdAvx2StatisticMoments.cpp +++ /dev/null @@ -1,198 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2019 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
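// Reviewer note on the kernels in this file and the ones that follow: the
// diff text has lost explicit template arguments (where the library reads,
// for example, CorrelationSum<true>(...), the text shows CorrelationSum(...)).
// Every kernel uses the same aligned/unaligned dispatch idiom, sketched below
// with illustrative names (Aligned32 stands in for Simd's Aligned with the
// 32-byte AVX2 alignment):
#include <cstddef>
#include <cstdint>

static bool Aligned32(const void* ptr) { return ((uintptr_t)ptr & 31) == 0; }
static bool Aligned32(size_t size) { return (size & 31) == 0; }

template <bool align> void Kernel(const uint8_t* src, size_t stride)
{
    // align == true may use _mm256_load_si256; align == false falls back to
    // _mm256_loadu_si256. The bodies are otherwise identical.
    (void)src; (void)stride;
}

void Kernel(const uint8_t* src, size_t stride)
{
    if (Aligned32(src) && Aligned32(stride))
        Kernel<true>(src, stride);
    else
        Kernel<false>(src, stride);
}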
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdExtract.h" -#include "Simd/SimdSet.h" - -namespace Simd -{ -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - SIMD_INLINE void GetObjectMoments16(__m256i src, __m256i col, __m256i & sx, __m256i & sxx) - { - sx = _mm256_add_epi32(sx, _mm256_madd_epi16(col, src)); - sxx = _mm256_add_epi32(sxx, _mm256_madd_epi16(src, _mm256_mullo_epi16(col, col))); - } - - SIMD_INLINE void GetObjectMoments8(__m256i src, __m256i mask, __m256i& col, __m256i & n, __m256i & s, __m256i & sx, __m256i & sxx) - { - src = _mm256_and_si256(src, mask); - n = _mm256_add_epi64(n, _mm256_sad_epu8(_mm256_and_si256(K8_01, mask), K_ZERO)); - s = _mm256_add_epi64(s, _mm256_sad_epu8(src, K_ZERO)); - GetObjectMoments16(_mm256_unpacklo_epi8(src, K_ZERO), col, sx, sxx); - col = _mm256_add_epi16(col, K16_0008); - GetObjectMoments16(_mm256_unpackhi_epi8(src, K_ZERO), col, sx, sxx); - col = _mm256_add_epi16(col, K16_0018); - } - - template void GetObjectMoments(const uint8_t* src, size_t srcStride, size_t width, size_t height, const uint8_t * mask, size_t maskStride, uint8_t index, - __m256i & n, __m256i & s, __m256i & sx, __m256i & sy, __m256i & sxx, __m256i& sxy, __m256i& syy) - { - size_t widthA = AlignLo(width, A); - const size_t B = AlignLo(181, A); - size_t widthB = AlignLoAny(width, B); - __m256i tailMask = SetMask(0, A - width + widthA, 0xFF); - - const __m256i K16_I = _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23); - const __m256i _index = _mm256_set1_epi8(index); - const __m256i tailCol = _mm256_add_epi16(K16_I, _mm256_set1_epi16((int16_t)(width - A - widthB))); - - for (size_t row = 0; row < height; ++row) - { - for (size_t colB = 0; colB < width;) - { - size_t colE = Simd::Min(colB + B, widthA); - __m256i _col = K16_I; - __m256i _n = _mm256_setzero_si256(); - __m256i _s = _mm256_setzero_si256(); - __m256i _sx = _mm256_setzero_si256(); - __m256i _sxx = _mm256_setzero_si256(); - if (mask == NULL) - { - for (size_t col = colB; col < colE; col += A) - { - __m256i _src = Load((__m256i*)(src + col)); - GetObjectMoments8(_src, K_INV_ZERO, _col, _n, _s, _sx, _sxx); - } - if (colB == widthB && widthA < width) - { - __m256i _src = Load((__m256i*)(src + width - A)); - _col = tailCol; - GetObjectMoments8(_src, tailMask, _col, _n, _s, _sx, _sxx); - colE = width; - } - } - else if (src == NULL) - { - for (size_t col = colB; col < colE; col += A) - { - __m256i _mask = _mm256_cmpeq_epi8(Load((__m256i*)(mask + col)), _index); - GetObjectMoments8(K8_01, _mask, _col, _n, _s, _sx, _sxx); - } - if (colB == widthB && widthA < width) - { - __m256i _mask = _mm256_and_si256(_mm256_cmpeq_epi8(Load((__m256i*)(mask + width - A)), _index), tailMask); - _col = tailCol; - GetObjectMoments8(K8_01, _mask, _col, _n, _s, _sx, _sxx); - colE = width; - } - } - else - { - for (size_t col = colB; col < colE; col += A) - { - __m256i _src = Load((__m256i*)(src + col)); - __m256i _mask = _mm256_cmpeq_epi8(Load((__m256i*)(mask + col)), _index); - GetObjectMoments8(_src, _mask, _col, _n, _s, _sx, _sxx); - } - if (colB == widthB && widthA < width) - { - __m256i _mask = _mm256_and_si256(_mm256_cmpeq_epi8(Load((__m256i*)(mask + width - A)), _index), tailMask); - __m256i _src = Load((__m256i*)(src + width - A)); - _col = tailCol; - GetObjectMoments8(_src, _mask, _col, _n, _s, _sx, _sxx); - colE = width; - } - } - _sx = HorizontalSum32(_sx); - _sxx = HorizontalSum32(_sxx); - - __m256i _y = _mm256_set1_epi32((int32_t)row); - __m256i _x0 = 
_mm256_set1_epi32((int32_t)colB); - - n = _mm256_add_epi64(n, _n); - - s = _mm256_add_epi64(s, _s); - - sx = _mm256_add_epi64(sx, _sx); - __m256i _sx0 = _mm256_mul_epu32(_s, _x0); - sx = _mm256_add_epi64(sx, _sx0); - - __m256i _sy = _mm256_mul_epu32(_s, _y); - sy = _mm256_add_epi64(sy, _sy); - - sxx = _mm256_add_epi64(sxx, _sxx); - sxx = _mm256_add_epi64(sxx, _mm256_mul_epu32(_sx, _mm256_add_epi64(_x0, _x0))); - sxx = _mm256_add_epi64(sxx, _mm256_mul_epu32(_sx0, _x0)); - - sxy = _mm256_add_epi64(sxy, _mm256_mul_epu32(_sx, _y)); - sxy = _mm256_add_epi64(sxy, _mm256_mul_epu32(_sx0, _y)); - - syy = _mm256_add_epi64(syy, _mm256_mul_epu32(_sy, _y)); - - colB = colE; - } - if(src) - src += srcStride; - if(mask) - mask += maskStride; - } - } - - template void GetObjectMoments(const uint8_t* src, size_t srcStride, size_t width, size_t height, const uint8_t* mask, size_t maskStride, uint8_t index, - uint64_t* n, uint64_t* s, uint64_t* sx, uint64_t* sy, uint64_t* sxx, uint64_t* sxy, uint64_t* syy) - { - assert(width >= A && (src || mask)); - if (align) - assert((src == NULL || (Aligned(src) && Aligned(srcStride))) && (mask == NULL || (Aligned(mask) && Aligned(maskStride)))); - - __m256i _n = _mm256_setzero_si256(); - __m256i _s = _mm256_setzero_si256(); - __m256i _sx = _mm256_setzero_si256(); - __m256i _sy = _mm256_setzero_si256(); - __m256i _sxx = _mm256_setzero_si256(); - __m256i _sxy = _mm256_setzero_si256(); - __m256i _syy = _mm256_setzero_si256(); - - GetObjectMoments(src, srcStride, width, height, mask, maskStride, index, _n, _s, _sx, _sy, _sxx, _sxy, _syy); - - *n = ExtractSum(_n); - *s = ExtractSum(_s); - *sx = ExtractSum(_sx); - *sy = ExtractSum(_sy); - *sxx = ExtractSum(_sxx); - *sxy = ExtractSum(_sxy); - *syy = ExtractSum(_syy); - } - - void GetObjectMoments(const uint8_t* src, size_t srcStride, size_t width, size_t height, const uint8_t* mask, size_t maskStride, uint8_t index, - uint64_t* n, uint64_t* s, uint64_t* sx, uint64_t* sy, uint64_t* sxx, uint64_t* sxy, uint64_t* syy) - { - if ((src == NULL || (Aligned(src) && Aligned(srcStride))) && (mask == NULL || (Aligned(mask) && Aligned(maskStride)))) - GetObjectMoments(src, srcStride, width, height, mask, maskStride, index, n, s, sx, sy, sxx, sxy, syy); - else - GetObjectMoments(src, srcStride, width, height, mask, maskStride, index, n, s, sx, sy, sxx, sxy, syy); - } - - void GetMoments(const uint8_t* mask, size_t stride, size_t width, size_t height, uint8_t index, - uint64_t* area, uint64_t* x, uint64_t* y, uint64_t* xx, uint64_t* xy, uint64_t* yy) - { - uint64_t stub; - GetObjectMoments(NULL, 0, width, height, mask, stride, index, &stub, area, x, y, xx, xy, yy); - } - } -#endif// SIMD_AVX2_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx2StretchGray2x2.cpp b/src/3rd/Simd/Simd/SimdAvx2StretchGray2x2.cpp deleted file mode 100644 index 8efd3702..00000000 --- a/src/3rd/Simd/Simd/SimdAvx2StretchGray2x2.cpp +++ /dev/null @@ -1,81 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. 
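// The GetObjectMoments kernel deleted above accumulates the raw moments
// n, s, sx, sy, sxx, sxy, syy over the pixels where mask == index (src may
// be NULL, in which case every selected pixel contributes the value 1, and
// GetMoments therefore reads the object's area out of s). A scalar sketch
// with a hypothetical *Ref name:
#include <cstddef>
#include <cstdint>

static void GetObjectMomentsRef(const uint8_t* src, size_t srcStride,
                                size_t width, size_t height,
                                const uint8_t* mask, size_t maskStride, uint8_t index,
                                uint64_t& n, uint64_t& s, uint64_t& sx, uint64_t& sy,
                                uint64_t& sxx, uint64_t& sxy, uint64_t& syy)
{
    n = s = sx = sy = sxx = sxy = syy = 0;
    for (size_t y = 0; y < height; ++y)
    {
        for (size_t x = 0; x < width; ++x)
        {
            if (mask && mask[y * maskStride + x] != index)
                continue; // pixel not part of the object
            uint64_t v = src ? src[y * srcStride + x] : 1;
            n += 1;           // pixel count
            s += v;           // zeroth moment
            sx += v * x;      // first moments
            sy += v * y;
            sxx += v * x * x; // second moments
            sxy += v * x * y;
            syy += v * y * y;
        }
    }
}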
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#include "Simd/SimdMemory.h"
-#include "Simd/SimdStore.h"
-
-namespace Simd
-{
-#ifdef SIMD_AVX2_ENABLE
-    namespace Avx2
-    {
-        template <bool align> SIMD_INLINE void StoreUnpacked(__m256i value, uint8_t * dst)
-        {
-            Store<align>((__m256i*)(dst + 0), _mm256_unpacklo_epi8(value, value));
-            Store<align>((__m256i*)(dst + A), _mm256_unpackhi_epi8(value, value));
-        }
-
-        template <bool align> void StretchGray2x2(
-            const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride,
-            uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride)
-        {
-            assert(srcWidth * 2 == dstWidth && srcHeight * 2 == dstHeight && srcWidth >= A);
-            if (align)
-            {
-                assert(Aligned(src) && Aligned(srcStride));
-                assert(Aligned(dst) && Aligned(dstStride));
-            }
-
-            size_t alignedWidth = AlignLo(srcWidth, A);
-            for (size_t row = 0; row < srcHeight; ++row)
-            {
-                uint8_t * dstEven = dst;
-                uint8_t * dstOdd = dst + dstStride;
-                for (size_t srcCol = 0, dstCol = 0; srcCol < alignedWidth; srcCol += A, dstCol += DA)
-                {
-                    __m256i value = LoadPermuted<align>((__m256i*)(src + srcCol));
-                    StoreUnpacked<align>(value, dstEven + dstCol);
-                    StoreUnpacked<align>(value, dstOdd + dstCol);
-                }
-                if (alignedWidth != srcWidth)
-                {
-                    __m256i value = LoadPermuted<false>((__m256i*)(src + srcWidth - A));
-                    StoreUnpacked<false>(value, dstEven + dstWidth - 2 * A);
-                    StoreUnpacked<false>(value, dstOdd + dstWidth - 2 * A);
-                }
-                src += srcStride;
-                dst += 2 * dstStride;
-            }
-        }
-
-        void StretchGray2x2(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride,
-            uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride)
-        {
-            if (Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride))
-                StretchGray2x2<true>(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride);
-            else
-                StretchGray2x2<false>(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride);
-        }
-    }
-#endif// SIMD_AVX2_ENABLE
-}
diff --git a/src/3rd/Simd/Simd/SimdAvx2Synet.cpp b/src/3rd/Simd/Simd/SimdAvx2Synet.cpp
deleted file mode 100644
index e2e194de..00000000
--- a/src/3rd/Simd/Simd/SimdAvx2Synet.cpp
+++ /dev/null
@@ -1,955 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2020 Yermalayeu Ihar.
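// The deleted StretchGray2x2 above is a 2x nearest-neighbour upscale:
// _mm256_unpacklo/hi_epi8(value, value) duplicates every byte horizontally,
// and writing the same vectors to two consecutive destination rows duplicates
// vertically. A scalar sketch of the same mapping (name illustrative):
#include <cstddef>
#include <cstdint>

static void StretchGray2x2Ref(const uint8_t* src, size_t srcWidth, size_t srcHeight,
                              size_t srcStride, uint8_t* dst, size_t dstStride)
{
    for (size_t y = 0; y < srcHeight; ++y, src += srcStride, dst += 2 * dstStride)
    {
        for (size_t x = 0; x < srcWidth; ++x)
        {
            uint8_t v = src[x]; // each source pixel fills a 2x2 block
            dst[2 * x + 0] = v;
            dst[2 * x + 1] = v;
            dst[dstStride + 2 * x + 0] = v;
            dst[dstStride + 2 * x + 1] = v;
        }
    }
}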
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdSynet.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdExtract.h" -#include "Simd/SimdBase.h" -#include "Simd/SimdSse1.h" -#include "Simd/SimdSse2.h" -#include "Simd/SimdAvx1.h" -#include "Simd/SimdAvx2.h" -#include "Simd/SimdArray.h" -#include "Simd/SimdPow.h" -#include "Simd/SimdExp.h" - -namespace Simd -{ -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - template void SynetEltwiseLayerForwardSum(const float * src0, const __m256 & weight0, const float * src1, const __m256 & weight1, float * dst, size_t offset) - { - Avx::Store(dst + offset, _mm256_fmadd_ps(Avx::Load(src0 + offset), weight0, _mm256_mul_ps(Avx::Load(src1 + offset), weight1))); - } - - template void SynetEltwiseLayerForwardSum(const float * src, const __m256 & weight, float * dst, size_t offset) - { - Avx::Store(dst + offset, _mm256_fmadd_ps(Avx::Load(src + offset), weight, Load(dst + offset))); - } - - template void SynetEltwiseLayerForwardSum(float const * const * src, const float * weight, size_t count, size_t size, float * dst) - { - size_t aligned = AlignLo(size, QF); - size_t partial = AlignLo(size, F); - const float * src0 = src[0]; - const float * src1 = src[1]; - __m256 weight0 = _mm256_set1_ps(weight[0]); - __m256 weight1 = _mm256_set1_ps(weight[1]); - size_t j = 0; - if (partial) - { - for (; j < aligned; j += QF) - { - SynetEltwiseLayerForwardSum(src0, weight0, src1, weight1, dst, j + F * 0); - SynetEltwiseLayerForwardSum(src0, weight0, src1, weight1, dst, j + F * 1); - SynetEltwiseLayerForwardSum(src0, weight0, src1, weight1, dst, j + F * 2); - SynetEltwiseLayerForwardSum(src0, weight0, src1, weight1, dst, j + F * 3); - } - for (; j < partial; j += F) - SynetEltwiseLayerForwardSum(src0, weight0, src1, weight1, dst, j); - } - for (; j < size; ++j) - dst[j] = src0[j] * weight[0] + src1[j] * weight[1]; - for (size_t i = 2; i < count; ++i) - { - const float * srci = src[i]; - __m256 weighti = _mm256_set1_ps(weight[i]); - size_t j = 0; - if (partial) - { - for (; j < aligned; j += QF) - { - SynetEltwiseLayerForwardSum(srci, weighti, dst, j + F * 0); - SynetEltwiseLayerForwardSum(srci, weighti, dst, j + F * 1); - SynetEltwiseLayerForwardSum(srci, weighti, dst, j + F * 2); - SynetEltwiseLayerForwardSum(srci, weighti, dst, j + F * 3); - } - for (; j < partial; j += F) - SynetEltwiseLayerForwardSum(srci, weighti, dst, j); - } - for (; j < size; ++j) - dst[j] += srci[j] * weight[i]; - } - } - - void SynetEltwiseLayerForward(float const * 
const * src, const float * weight, size_t count, size_t size, SimdSynetEltwiseOperationType type, float * dst) - { - if (type != SimdSynetEltwiseOperationSum) - { - Avx::SynetEltwiseLayerForward(src, weight, count, size, type, dst); - return; - } - assert(count >= 2); - bool aligned = Aligned(dst) && Aligned(src[0]) && Aligned(src[1]); - for (size_t i = 2; i < count; ++i) - aligned = aligned && Aligned(src[i]); - if (aligned) - SynetEltwiseLayerForwardSum(src, weight, count, size, dst); - else - SynetEltwiseLayerForwardSum(src, weight, count, size, dst); - } - - SIMD_INLINE __m256 Tail(size_t tail) - { - const int32_t mask[DF] = { 0, 0, 0, 0, 0, 0, 0, 0 , -1, -1, -1, -1, -1, -1, -1, -1 }; - return _mm256_loadu_ps((float*)(mask + tail)); - } - - void SynetInnerProductLayerForward1(const float * S0, const float * W, const float * B, size_t K, float * D) - { - size_t K8 = K & (~7); - size_t K32 = K & (~31); - const float * W0 = W + 0 * K; - __m256 d00, d01, d02, d03; - __m256 s0, s1, s2, s3, w0, w1, w2, w3; - size_t k = 0; - d00 = _mm256_setzero_ps(); - if (K32) - { - d01 = _mm256_setzero_ps(); - d02 = _mm256_setzero_ps(); - d03 = _mm256_setzero_ps(); - for (; k < K32; k += 32) - { - s0 = _mm256_loadu_ps(S0 + k + 0 * F); - s1 = _mm256_loadu_ps(S0 + k + 1 * F); - w0 = _mm256_loadu_ps(W0 + k + 0 * F); - w1 = _mm256_loadu_ps(W0 + k + 1 * F); - d00 = _mm256_fmadd_ps(s0, w0, d00); - d01 = _mm256_fmadd_ps(s1, w1, d01); - s2 = _mm256_loadu_ps(S0 + k + 2 * F); - s3 = _mm256_loadu_ps(S0 + k + 3 * F); - w2 = _mm256_loadu_ps(W0 + k + 2 * F); - w3 = _mm256_loadu_ps(W0 + k + 3 * F); - d02 = _mm256_fmadd_ps(s2, w2, d02); - d03 = _mm256_fmadd_ps(s3, w3, d03); - } - d00 = _mm256_add_ps(_mm256_add_ps(d00, d01), _mm256_add_ps(d02, d03)); - } - for (; k < K8; k += 8) - { - s0 = _mm256_loadu_ps(S0 + k); - w0 = _mm256_loadu_ps(W0 + k); - d00 = _mm256_fmadd_ps(s0, w0, d00); - } - if (K8 < K) - { - size_t k = K - 8; - __m256 tail = Tail(K - K8); - s0 = _mm256_and_ps(tail, _mm256_loadu_ps(S0 + k)); - w0 = _mm256_loadu_ps(W0 + k); - d00 = _mm256_fmadd_ps(s0, w0, d00); - } - D[0] = Avx::ExtractSum(d00) + B[0]; - } - - void SynetInnerProductLayerForward4(const float * S0, const float * W, const float * B, size_t K, float * D) - { - size_t K8 = K & (~7); - size_t K16 = K & (~15); - const float * W0 = W + 0 * K; - const float * W1 = W + 1 * K; - const float * W2 = W + 2 * K; - const float * W3 = W + 3 * K; - __m256 d00, d01, d10, d11, d20, d21, d30, d31; - __m256 s0, s1, w0, w1; - size_t k = 0; - d00 = _mm256_setzero_ps(); - d10 = _mm256_setzero_ps(); - d20 = _mm256_setzero_ps(); - d30 = _mm256_setzero_ps(); - if (K16) - { - d01 = _mm256_setzero_ps(); - d11 = _mm256_setzero_ps(); - d21 = _mm256_setzero_ps(); - d31 = _mm256_setzero_ps(); - for (; k < K16; k += 16) - { - s0 = _mm256_loadu_ps(S0 + k + 0 * F); - s1 = _mm256_loadu_ps(S0 + k + 1 * F); - w0 = _mm256_loadu_ps(W0 + k + 0 * F); - w1 = _mm256_loadu_ps(W0 + k + 1 * F); - d00 = _mm256_fmadd_ps(s0, w0, d00); - d01 = _mm256_fmadd_ps(s1, w1, d01); - w0 = _mm256_loadu_ps(W1 + k + 0 * F); - w1 = _mm256_loadu_ps(W1 + k + 1 * F); - d10 = _mm256_fmadd_ps(s0, w0, d10); - d11 = _mm256_fmadd_ps(s1, w1, d11); - w0 = _mm256_loadu_ps(W2 + k + 0 * F); - w1 = _mm256_loadu_ps(W2 + k + 1 * F); - d20 = _mm256_fmadd_ps(s0, w0, d20); - d21 = _mm256_fmadd_ps(s1, w1, d21); - w0 = _mm256_loadu_ps(W3 + k + 0 * F); - w1 = _mm256_loadu_ps(W3 + k + 1 * F); - d30 = _mm256_fmadd_ps(s0, w0, d30); - d31 = _mm256_fmadd_ps(s1, w1, d31); - } - d00 = _mm256_add_ps(d00, d01); - d10 = _mm256_add_ps(d10, 
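// SynetInnerProductLayerForward1 above is a dot product with bias, unrolled
// over four FMA accumulators to hide instruction latency; the masked Tail()
// load re-reads the last eight floats and zeroes the overlap that was already
// accumulated. The arithmetic it performs, as a scalar sketch (name
// illustrative):
#include <cstddef>

static void InnerProduct1Ref(const float* S0, const float* W0, const float* B,
                             size_t K, float* D)
{
    float d = 0.0f;
    for (size_t k = 0; k < K; ++k)
        d += S0[k] * W0[k]; // dot product over K features
    D[0] = d + B[0];        // plus bias
}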
d11); - d20 = _mm256_add_ps(d20, d21); - d30 = _mm256_add_ps(d30, d31); - } - for (; k < K8; k += 8) - { - s0 = _mm256_loadu_ps(S0 + k + 0 * F); - w0 = _mm256_loadu_ps(W0 + k + 0 * F); - d00 = _mm256_fmadd_ps(s0, w0, d00); - w0 = _mm256_loadu_ps(W1 + k + 0 * F); - d10 = _mm256_fmadd_ps(s0, w0, d10); - w0 = _mm256_loadu_ps(W2 + k + 0 * F); - d20 = _mm256_fmadd_ps(s0, w0, d20); - w0 = _mm256_loadu_ps(W3 + k + 0 * F); - d30 = _mm256_fmadd_ps(s0, w0, d30); - } - if (K8 < K) - { - size_t k = K - 8; - __m256 tail = Tail(K - K8); - s0 = _mm256_and_ps(tail, _mm256_loadu_ps(S0 + k)); - w0 = _mm256_loadu_ps(W0 + k + 0 * F); - d00 = _mm256_fmadd_ps(s0, w0, d00); - w0 = _mm256_loadu_ps(W1 + k + 0 * F); - d10 = _mm256_fmadd_ps(s0, w0, d10); - w0 = _mm256_loadu_ps(W2 + k + 0 * F); - d20 = _mm256_fmadd_ps(s0, w0, d20); - w0 = _mm256_loadu_ps(W3 + k + 0 * F); - d30 = _mm256_fmadd_ps(s0, w0, d30); - } - _mm_storeu_ps(D, _mm_add_ps(Extract4Sums(d00, d10, d20, d30), _mm_loadu_ps(B))); - } - - void SynetInnerProductLayerForward(const float * src, const float * weight, const float * bias, size_t count, size_t size, float * dst) - { - float _bias[4] = { 0, 0, 0, 0 }; - size_t count4 = AlignLo(count, 4); - size_t i = 0; - for (; i < count4; i += 4) - SynetInnerProductLayerForward4(src, weight + i * size, (bias ? bias + i : _bias), size, dst + i); - for (; i < count; ++i) - SynetInnerProductLayerForward1(src, weight + i * size, (bias ? bias + i : _bias), size, dst + i); - } - - //--------------------------------------------------------------------- - - template SIMD_INLINE __m256 LoadAtEdge(const float * src) - { - static const int32_t mask[3 * F] = { 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0 }; - return _mm256_and_ps(_mm256_loadu_ps(src + shift), _mm256_loadu_ps((float*)mask + F + shift)); - } - - SIMD_INLINE __m256 NoseSquareSum(const float * src) - { - return _mm256_add_ps(_mm256_add_ps(Avx::Square(LoadAtEdge<-2>(src)), Avx::Square(LoadAtEdge<-1>(src))), - _mm256_add_ps(Avx::Square(_mm256_loadu_ps(src)), _mm256_add_ps(Avx::Square(_mm256_loadu_ps(src + 1)), Avx::Square(_mm256_loadu_ps(src + 2))))); - } - - SIMD_INLINE __m256 BodySquareSum(const float * src) - { - return _mm256_add_ps(_mm256_add_ps(Avx::Square(_mm256_loadu_ps(src - 2)), Avx::Square(_mm256_loadu_ps(src - 1))), - _mm256_add_ps(Avx::Square(_mm256_loadu_ps(src)), _mm256_add_ps(Avx::Square(_mm256_loadu_ps(src + 1)), Avx::Square(_mm256_loadu_ps(src + 2))))); - } - - SIMD_INLINE __m256 TailSquareSum(const float * src) - { - return _mm256_add_ps(_mm256_add_ps(Avx::Square(LoadAtEdge<2>(src)), Avx::Square(LoadAtEdge<1>(src))), - _mm256_add_ps(Avx::Square(_mm256_loadu_ps(src)), _mm256_add_ps(Avx::Square(_mm256_loadu_ps(src - 1)), Avx::Square(_mm256_loadu_ps(src - 2))))); - } - - template void SynetLrnLayerCrossChannelsNchw(const float * src, size_t half, size_t channels, size_t spatial, const float * k, float * dst) - { - __m256 k0 = _mm256_set1_ps(k[0]); - __m256 k1 = _mm256_set1_ps(k[1]); - __m256 k2 = _mm256_set1_ps(k[2]); - Avx2::Pow pow; - Array32f sum(spatial, true), zero(spatial, true); - size_t aligned = AlignLo(spatial, F); - for (size_t c = 0; c < half; ++c) - { - const float * pos = src + c * spatial; - size_t s = 0; - for (; s < aligned; s += F) - { - __m256 _pos = Avx::Load(pos + s); - Avx::Store(sum.data + s, _mm256_add_ps(Avx::Load(sum.data + s), _mm256_mul_ps(_pos, _pos))); - } - for (; s < spatial; ++s) - sum[s] += Simd::Square(pos[s]); - } - for (size_t c = 0; c < channels; ++c) - { - const float * pos 
= (c < channels - half) ? src + half * spatial : zero.data; - const float * neg = (c > half) ? src - (half + 1) * spatial : zero.data; - size_t s = 0; - for (; s < aligned; s += F) - { - __m256 _pos = Avx::Load(pos + s); - __m256 _neg = Avx::Load(neg + s); - __m256 _sum = Avx::Load(sum.data + s); - _sum = _mm256_add_ps(_sum, _mm256_sub_ps(_mm256_mul_ps(_pos, _pos), _mm256_mul_ps(_neg, _neg))); - __m256 _src = Avx::Load(src + s); - Avx::Store(sum.data + s, _sum); - Avx::Store(dst + s, _mm256_mul_ps(_src, pow(_mm256_add_ps(k0, _mm256_mul_ps(k1, _sum)), k2))); - } - for (; s < spatial; ++s) - { - sum[s] += Simd::Square(pos[s]); - sum[s] -= Simd::Square(neg[s]); - dst[s] = src[s] * Base::Pow(k[0] + k[1] * sum[s], k[2]); - } - src += spatial; - dst += spatial; - } - } - - SIMD_INLINE void SynetLrnLayerCrossChannelsNchw(const float * src, size_t half, size_t channels, size_t spatial, const float * k, float * dst) - { - if (Aligned(src) && Aligned(dst) && Aligned(spatial, F)) - SynetLrnLayerCrossChannelsNchw(src, half, channels, spatial, k, dst); - else - SynetLrnLayerCrossChannelsNchw(src, half, channels, spatial, k, dst); - } - - template void SynetLrnLayerCrossChannelsNhwc2h(const float * src, size_t half, size_t channels, size_t spatial, const float * k, float * dst) - { - __m256 k0 = _mm256_set1_ps(k[0]); - __m256 k1 = _mm256_set1_ps(k[1]); - __m256 k2 = _mm256_set1_ps(k[2]); - Avx2::Pow pow; - size_t aligned = AlignLo(channels - half, F); - for (size_t s = 0; s < spatial; ++s) - { - Avx::Store(dst + 0, _mm256_mul_ps(Avx::Load(src + 0), pow(_mm256_add_ps(k0, _mm256_mul_ps(k1, NoseSquareSum(src + 0))), k2))); - for (size_t c = F; c < aligned; c += F) - Avx::Store(dst + c, _mm256_mul_ps(Avx::Load(src + c), pow(_mm256_add_ps(k0, _mm256_mul_ps(k1, BodySquareSum(src + c))), k2))); - if (aligned != channels - half) - { - size_t c = channels - half - F; - Avx::Store(dst + c, _mm256_mul_ps(Avx::Load(src + c), pow(_mm256_add_ps(k0, _mm256_mul_ps(k1, BodySquareSum(src + c))), k2))); - } - size_t c = channels - F; - Avx::Store(dst + c, _mm256_mul_ps(Avx::Load(src + c), pow(_mm256_add_ps(k0, _mm256_mul_ps(k1, TailSquareSum(src + c))), k2))); - src += channels; - dst += channels; - } - } - - SIMD_INLINE void SynetLrnLayerCrossChannelsNhwc(const float * src, size_t half, size_t channels, size_t spatial, const float * k, float * dst) - { - if (half == 2 && channels >= F + half) - { - if (Aligned(src) && Aligned(dst) && Aligned(channels, F)) - SynetLrnLayerCrossChannelsNhwc2h(src, half, channels, spatial, k, dst); - else - SynetLrnLayerCrossChannelsNhwc2h(src, half, channels, spatial, k, dst); - } - else - Sse2::SynetLrnLayerCrossChannels(src, half, channels, spatial, k, dst, SimdTensorFormatNhwc); - } - - void SynetLrnLayerCrossChannels(const float * src, size_t half, size_t channels, size_t spatial, const float * k, float * dst, SimdTensorFormatType format) - { - if (format == SimdTensorFormatNchw) - SynetLrnLayerCrossChannelsNchw(src, half, channels, spatial, k, dst); - else if (format == SimdTensorFormatNhwc) - SynetLrnLayerCrossChannelsNhwc(src, half, channels, spatial, k, dst); - else - Base::SynetLrnLayerCrossChannels(src, half, channels, spatial, k, dst, format); - } - - //--------------------------------------------------------------------- - - template SIMD_INLINE void SynetScaleLayerForward(const float * src, const float * scale, const float * bias, float * dst, size_t offset) - { - __m256 _src = Avx::Load(src + offset); - __m256 _scale = Avx::Load(scale + offset); - __m256 _bias = Avx::Load(bias 
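// The cross-channel LRN above keeps a sliding sum of squares over channels
// (add pos, subtract neg), so each output channel costs O(spatial). The same
// math written directly, as an O(half * spatial) scalar sketch for NCHW
// (hypothetical name, same k[0..2] parameters):
#include <algorithm>
#include <cmath>
#include <cstddef>

static void LrnCrossChannelsNchwRef(const float* src, size_t half, size_t channels,
                                    size_t spatial, const float k[3], float* dst)
{
    for (size_t c = 0; c < channels; ++c)
    {
        size_t lo = c < half ? 0 : c - half;          // square-sum window,
        size_t hi = std::min(channels - 1, c + half); // clipped at the borders
        for (size_t s = 0; s < spatial; ++s)
        {
            float sum = 0.0f;
            for (size_t i = lo; i <= hi; ++i)
                sum += src[i * spatial + s] * src[i * spatial + s];
            dst[c * spatial + s] =
                src[c * spatial + s] * std::pow(k[0] + k[1] * sum, k[2]);
        }
    }
}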
+ offset); - Avx::Store(dst + offset, Fmadd(_src, _scale, _bias)); - } - - template SIMD_INLINE void SynetScaleLayerForward(const float* src, const float* scale, const float* bias, float* dst, size_t offset, __m256i tail) - { - __m256 _src = _mm256_maskload_ps(src + offset, tail); - __m256 _scale = _mm256_maskload_ps(scale + offset, tail); - __m256 _bias = _mm256_maskload_ps(bias + offset, tail); - _mm256_maskstore_ps(dst + offset, tail, Fmadd(_src, _scale, _bias)); - } - - template SIMD_INLINE void SynetScaleLayerForward(const float* src, const float* scale, float* dst, size_t offset) - { - Avx::Store(dst + offset, _mm256_mul_ps(Avx::Load(src + offset), Avx::Load(scale + offset))); - } - - template SIMD_INLINE void SynetScaleLayerForward(const float* src, const __m256& scale, const __m256& bias, float* dst, size_t offset) - { - __m256 _src = Avx::Load(src + offset); - Avx::Store(dst + offset, Fmadd(_src, scale, bias)); - } - - template SIMD_INLINE void SynetScaleLayerForward(const float * src, const __m256 & scale, const __m256 & bias, float * dst, size_t offset, __m256i tail) - { - __m256 _src = _mm256_maskload_ps(src + offset, tail); - _mm256_maskstore_ps(dst + offset, tail, Fmadd(_src, scale, bias)); - } - - template SIMD_INLINE void SynetScaleLayerForward(const float * src, const __m256 & scale, float * dst, size_t offset) - { - Avx::Store(dst + offset, _mm256_mul_ps(Avx::Load(src + offset), scale)); - } - - template void SynetScaleLayerForwardNchw(const float * src, const float * scale, const float * bias, size_t channels, size_t height, size_t width, float * dst) - { - if (align) - assert(Aligned(src) && Aligned(width, F) && Aligned(dst)); - - size_t widthQF = AlignLo(width, QF); - size_t widthF = AlignLo(width, F); - if (bias) - { - for (size_t c = 0; c < channels; ++c) - { - for (size_t h = 0; h < height; ++h) - { - size_t w = 0; - if (widthF) - { - __m256 _scale = _mm256_set1_ps(scale[c]); - __m256 _bias = _mm256_set1_ps(bias[c]); - for (; w < widthQF; w += QF) - { - SynetScaleLayerForward(src, _scale, _bias, dst, w + F * 0); - SynetScaleLayerForward(src, _scale, _bias, dst, w + F * 1); - SynetScaleLayerForward(src, _scale, _bias, dst, w + F * 2); - SynetScaleLayerForward(src, _scale, _bias, dst, w + F * 3); - } - for (; w < widthF; w += F) - SynetScaleLayerForward(src, _scale, _bias, dst, w); - } - for (; w < width; ++w) - dst[w] = src[w] * scale[c] + bias[c]; - src += width; - dst += width; - } - } - } - else - { - for (size_t c = 0; c < channels; ++c) - { - for (size_t h = 0; h < height; ++h) - { - size_t w = 0; - if (widthF) - { - __m256 _scale = _mm256_set1_ps(scale[c]); - for (; w < widthQF; w += QF) - { - SynetScaleLayerForward(src, _scale, dst, w + F * 0); - SynetScaleLayerForward(src, _scale, dst, w + F * 1); - SynetScaleLayerForward(src, _scale, dst, w + F * 2); - SynetScaleLayerForward(src, _scale, dst, w + F * 3); - } - for (; w < widthF; w += F) - SynetScaleLayerForward(src, _scale, dst, w); - } - for (; w < width; ++w) - dst[w] = src[w] * scale[c]; - src += width; - dst += width; - } - } - } - } - - SIMD_INLINE void SynetScaleLayerForwardNchw(const float* src, const float* scale, const float* bias, size_t channels, size_t height, size_t width, float* dst, SimdSynetCompatibilityType compatibility) - { - if (!((compatibility & SimdSynetCompatibilityNoFma) && bias)) - { - width = height * width; - height = 1; - if (Aligned(src) && Aligned(width, F) && Aligned(dst)) - SynetScaleLayerForwardNchw(src, scale, bias, channels, height, width, dst); - else - 
SynetScaleLayerForwardNchw(src, scale, bias, channels, height, width, dst); - } - else - { - if (Aligned(src) && Aligned(width, F) && Aligned(dst)) - SynetScaleLayerForwardNchw(src, scale, bias, channels, height, width, dst); - else - SynetScaleLayerForwardNchw(src, scale, bias, channels, height, width, dst); - } - } - - template void SynetScaleLayerForwardNhwc(const float * src, const float * scale, const float * bias, size_t channels, size_t height, size_t width, float * dst) - { - if (align) - assert(Aligned(src) && Aligned(scale) && Aligned(bias) && Aligned(channels, F) && Aligned(dst)); - - size_t channelsF = AlignLo(channels, F); - size_t channelsQF = AlignLo(channels, QF); - if (bias) - { - size_t widthF = AlignLo(width, F); - __m256i tail = LeftNotZero32i(channels - channelsF); - for (size_t h = 0; h < height; ++h) - { - size_t w = 0; - for (; w < widthF; ++w) - { - size_t c = 0; - for (; c < channelsQF; c += QF) - { - SynetScaleLayerForward(src, scale, bias, dst, c + F * 0); - SynetScaleLayerForward(src, scale, bias, dst, c + F * 1); - SynetScaleLayerForward(src, scale, bias, dst, c + F * 2); - SynetScaleLayerForward(src, scale, bias, dst, c + F * 3); - } - for (; c < channelsF; c += F) - SynetScaleLayerForward(src, scale, bias, dst, c); - if (c < channels) - SynetScaleLayerForward(src, scale, bias, dst, c, tail); - src += channels; - dst += channels; - } - for (; w < width; ++w) - { - size_t c = 0; - for (; c < channelsQF; c += QF) - { - SynetScaleLayerForward(src, scale, bias, dst, c + F * 0); - SynetScaleLayerForward(src, scale, bias, dst, c + F * 1); - SynetScaleLayerForward(src, scale, bias, dst, c + F * 2); - SynetScaleLayerForward(src, scale, bias, dst, c + F * 3); - } - for (; c < channelsF; c += F) - SynetScaleLayerForward(src, scale, bias, dst, c); - if (c < channels) - SynetScaleLayerForward(src, scale, bias, dst, c, tail); - src += channels; - dst += channels; - } - } - } - else - { - for (size_t h = 0; h < height; ++h) - { - for (size_t w = 0; w < width; ++w) - { - size_t c = 0; - for (; c < channelsQF; c += QF) - { - SynetScaleLayerForward(src, scale, dst, c + F * 0); - SynetScaleLayerForward(src, scale, dst, c + F * 1); - SynetScaleLayerForward(src, scale, dst, c + F * 2); - SynetScaleLayerForward(src, scale, dst, c + F * 3); - } - for (; c < channelsF; c += F) - SynetScaleLayerForward(src, scale, dst, c); - for (; c < channels; ++c) - dst[c] = src[c] * scale[c]; - src += channels; - dst += channels; - } - } - } - } - - template SIMD_INLINE void SynetScaleLayerForwardNhwc(const float* src, const float* scale, const float* bias, size_t channels, size_t height, size_t width, float* dst, SimdSynetCompatibilityType compatibility) - { - if ((compatibility & SimdSynetCompatibilityNoFma) && bias) - SynetScaleLayerForwardNhwc(src, scale, bias, channels, height, width, dst); - else if((compatibility & SimdSynetCompatibilityNoFmaTail) && bias) - SynetScaleLayerForwardNhwc(src, scale, bias, channels, height, width, dst); - else - SynetScaleLayerForwardNhwc(src, scale, bias, channels, height, width, dst); - } - - template void SynetScaleLayerForwardNhwc3(const float * src, const float * scale, const float * bias, size_t height, size_t width, float * dst) - { - if (align) - assert(Aligned(src) && Aligned(dst) && Aligned(width)); - - size_t width3 = width * 3; - size_t widthF3 = AlignLo(width, F) * 3; - if (bias) - { - float _scale[F * 3], _bias[F * 3]; - for (size_t i = 0; i < F; ++i) - for (size_t c = 0; c < 3; ++c) - _scale[i * 3 + c] = scale[c], _bias[i * 3 + c] = bias[c]; - 
__m256 _scale0 = Load(_scale + 0 * F); - __m256 _scale1 = Load(_scale + 1 * F); - __m256 _scale2 = Load(_scale + 2 * F); - __m256 _bias0 = Load(_bias + 0 * F); - __m256 _bias1 = Load(_bias + 1 * F); - __m256 _bias2 = Load(_bias + 2 * F); - for (size_t h = 0; h < height; ++h) - { - size_t w = 0; - for (; w < widthF3; w += F * 3) - { - SynetScaleLayerForward(src, _scale0, _bias0, dst, w + F * 0); - SynetScaleLayerForward(src, _scale1, _bias1, dst, w + F * 1); - SynetScaleLayerForward(src, _scale2, _bias2, dst, w + F * 2); - } - for (; w < width3; w += 3) - { - dst[w + 0] = src[w + 0] * scale[0] + bias[0]; - dst[w + 1] = src[w + 1] * scale[1] + bias[1]; - dst[w + 2] = src[w + 2] * scale[2] + bias[2]; - } - src += width3; - dst += width3; - } - } - else - { - float _scale[F * 3]; - for (size_t i = 0; i < F; ++i) - for (size_t c = 0; c < 3; ++c) - _scale[i * 3 + c] = scale[c]; - __m256 _scale0 = Load(_scale + 0 * F); - __m256 _scale1 = Load(_scale + 1 * F); - __m256 _scale2 = Load(_scale + 2 * F); - for (size_t h = 0; h < height; ++h) - { - size_t w = 0; - for (; w < widthF3; w += F * 3) - { - SynetScaleLayerForward(src, _scale0, dst, w + F * 0); - SynetScaleLayerForward(src, _scale1, dst, w + F * 1); - SynetScaleLayerForward(src, _scale2, dst, w + F * 2); - } - for (; w < width3; w += 3) - { - dst[w + 0] = src[w + 0] * scale[0]; - dst[w + 1] = src[w + 1] * scale[1]; - dst[w + 2] = src[w + 2] * scale[2]; - } - src += width3; - dst += width3; - } - } - } - - SIMD_INLINE void SynetScaleLayerForwardNhwc(const float * src, const float * scale, const float * bias, size_t channels, size_t height, size_t width, float * dst, SimdSynetCompatibilityType compatibility) - { - if (!((compatibility & SimdSynetCompatibilityNoFmaTail) && bias)) - { - width = height * width; - height = 1; - } - if (channels == 3) - { - if ((compatibility & SimdSynetCompatibilityNoFma) && bias) - { - if (Aligned(src) && Aligned(dst) && Aligned(width)) - SynetScaleLayerForwardNhwc3(src, scale, bias, height, width, dst); - else - SynetScaleLayerForwardNhwc3(src, scale, bias, height, width, dst); - } - else - { - if (Aligned(src) && Aligned(dst) && Aligned(width)) - SynetScaleLayerForwardNhwc3(src, scale, bias, height, width, dst); - else - SynetScaleLayerForwardNhwc3(src, scale, bias, height, width, dst); - } - } - else - { - if (Aligned(src) && Aligned(scale) && Aligned(bias) && Aligned(channels, F) && Aligned(dst)) - SynetScaleLayerForwardNhwc(src, scale, bias, channels, height, width, dst, compatibility); - else - SynetScaleLayerForwardNhwc(src, scale, bias, channels, height, width, dst, compatibility); - } - } - - template void SynetScaleLayerForwardNchw8c(const float * src, const float * scale, const float * bias, size_t channels, size_t spatial, float * dst) - { - if (align) - assert(Aligned(src) && Aligned(dst)); - - size_t spatialF = spatial * F; - size_t spatial4F = AlignLo(spatial, 4)*F; - if (bias) - { - for (size_t c = 0; c < channels; c += F) - { - __m256 _scale = Load(scale + c); - __m256 _bias = Load(bias + c); - size_t s = 0; - for (; s < spatial4F; s += 4 * F) - { - SynetScaleLayerForward(src, _scale, _bias, dst, s + F * 0); - SynetScaleLayerForward(src, _scale, _bias, dst, s + F * 1); - SynetScaleLayerForward(src, _scale, _bias, dst, s + F * 2); - SynetScaleLayerForward(src, _scale, _bias, dst, s + F * 3); - } - for (; s < spatialF; s += F) - SynetScaleLayerForward(src, _scale, _bias, dst, s); - src += spatialF; - dst += spatialF; - } - } - else - { - for (size_t c = 0; c < channels; c += F) - { - __m256 _scale = 
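// All SynetScaleLayerForward* variants above compute the same per-channel
// affine map; the layout-specific versions only reorder the loops and loads,
// and the SimdSynetCompatibilityNoFma branches appear to select Fmadd<false>
// (separate multiply and add) so results bit-match builds without FMA. A
// scalar sketch for NCHW (name illustrative; bias may be NULL):
#include <cstddef>

static void ScaleLayerNchwRef(const float* src, const float* scale, const float* bias,
                              size_t channels, size_t spatial, float* dst)
{
    for (size_t c = 0; c < channels; ++c)
        for (size_t s = 0; s < spatial; ++s, ++src, ++dst)
            *dst = *src * scale[c] + (bias ? bias[c] : 0.0f);
}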
Load(scale + c); - size_t s = 0; - for (; s < spatial4F; s += 4 * F) - { - SynetScaleLayerForward(src, _scale, dst, s + F * 0); - SynetScaleLayerForward(src, _scale, dst, s + F * 1); - SynetScaleLayerForward(src, _scale, dst, s + F * 2); - SynetScaleLayerForward(src, _scale, dst, s + F * 3); - } - for (; s < spatialF; s += F) - SynetScaleLayerForward(src, _scale, dst, s); - src += spatialF; - dst += spatialF; - } - } - } - - SIMD_INLINE void SynetScaleLayerForwardNchw8c(const float* src, const float* scale, const float* bias, size_t channels, size_t spatial, float* dst, SimdSynetCompatibilityType compatibility) - { - if (compatibility & SimdSynetCompatibilityNoFma) - { - if (Aligned(src) && Aligned(dst)) - SynetScaleLayerForwardNchw8c(src, scale, bias, channels, spatial, dst); - else - SynetScaleLayerForwardNchw8c(src, scale, bias, channels, spatial, dst); - } - else - { - if (Aligned(src) && Aligned(dst)) - SynetScaleLayerForwardNchw8c(src, scale, bias, channels, spatial, dst); - else - SynetScaleLayerForwardNchw8c(src, scale, bias, channels, spatial, dst); - } - } - - void SynetScaleLayerForward(const float* src, const float* scale, const float* bias, size_t channels, size_t height, size_t width, float* dst, SimdTensorFormatType format, SimdSynetCompatibilityType compatibility) - { - size_t spatial = height * width; - if (Base::NchwCompatible(channels, spatial, format)) - SynetScaleLayerForwardNchw(src, scale, bias, channels, width, height, dst, compatibility); - else if (Base::NhwcCompatible(channels, spatial, format)) - SynetScaleLayerForwardNhwc(src, scale, bias, channels, height, width, dst, compatibility); - else if (format == SimdTensorFormatNchw4c) - Sse::SynetScaleLayerForward(src, scale, bias, channels, height, width, dst, format, compatibility); - else if (format == SimdTensorFormatNchw8c) - SynetScaleLayerForwardNchw8c(src, scale, bias, channels, spatial, dst, compatibility); - else - Base::SynetScaleLayerForward(src, scale, bias, channels, height, width, dst, format, compatibility); - } - - //--------------------------------------------------------------------- - - void SynetSoftmaxLayerForward(const float * src, size_t outer, size_t count, size_t inner, float * dst) - { - Avx2::Exp exp; - if (inner == 1 && count == 2) - { - size_t aligned = Simd::AlignLo(outer, F); - size_t o = 0; - for (; o < aligned; o += F) - { - __m256 s0 = _mm256_loadu_ps(src + 0); - __m256 s1 = _mm256_loadu_ps(src + F); - __m256 ss0 = _mm256_shuffle_ps(s0, s1, 0x88); - __m256 ss1 = _mm256_shuffle_ps(s0, s1, 0xDD); - __m256 max = _mm256_max_ps(ss0, ss1); - __m256 exp0 = exp.Exponent(_mm256_sub_ps(ss0, max)); - __m256 exp1 = exp.Exponent(_mm256_sub_ps(ss1, max)); - __m256 sum = _mm256_add_ps(exp0, exp1); - __m256 d0 = _mm256_div_ps(exp0, sum); - __m256 d1 = _mm256_div_ps(exp1, sum); - _mm256_storeu_ps(dst + 0, _mm256_unpacklo_ps(d0, d1)); - _mm256_storeu_ps(dst + F, _mm256_unpackhi_ps(d0, d1)); - src += DF; - dst += DF; - } - for (; o < outer; ++o) - { - float max = Simd::Max(src[0], src[1]); - float exp0 = ::exp(src[0] - max); - float exp1 = ::exp(src[1] - max); - float sum = exp0 + exp1; - dst[0] = exp0 / sum; - dst[1] = exp1 / sum; - src += 2; - dst += 2; - } - } - else - { - size_t aligned = Simd::AlignLo(inner, F); - Array32f tmp(inner * 2); - const float * s; - float * max = tmp.data, *sum = tmp.data + inner, *d; - for (size_t o = 0; o < outer; ++o) - { - memcpy(max, src, inner * sizeof(float)); - s = src + inner; - for (size_t c = 1; c < count; ++c) - { - size_t i = 0; - for (; i < aligned; i += 
F) - _mm256_storeu_ps(max + i, _mm256_max_ps(_mm256_loadu_ps(s + i), _mm256_loadu_ps(max + i))); - for (; i < inner; ++i) - max[i] = Simd::Max(max[i], s[i]); - s += inner; - } - - s = src; - d = dst; - memset(sum, 0, inner * sizeof(float)); - for (size_t c = 0; c < count; ++c) - { - size_t i = 0; - for (; i < aligned; i += F) - { - __m256 _d = exp.Exponent(_mm256_sub_ps(_mm256_loadu_ps(s + i), _mm256_loadu_ps(max + i))); - _mm256_storeu_ps(d + i, _d); - _mm256_storeu_ps(sum + i, _mm256_add_ps(_d, _mm256_loadu_ps(sum + i))); - } - for (; i < inner; ++i) - { - d[i] = ::exp(s[i] - max[i]); - sum[i] += d[i]; - } - s += inner; - d += inner; - } - - d = dst; - for (size_t c = 0; c < count; ++c) - { - size_t i = 0; - for (; i < aligned; i += F) - _mm256_storeu_ps(d + i, _mm256_div_ps(_mm256_loadu_ps(d + i), _mm256_loadu_ps(sum + i))); - for (; i < inner; ++i) - d[i] /= sum[i]; - d += inner; - } - src += count * inner; - dst += count * inner; - } - } - } - - //--------------------------------------------------------------------- - - template __m256 SynetUnaryOperation32f(__m256 value); - - template<> SIMD_INLINE __m256 SynetUnaryOperation32f(__m256 value) - { - return _mm256_andnot_ps(_mm256_set1_ps(-0.0f), value); - } - - template<> SIMD_INLINE __m256 SynetUnaryOperation32f(__m256 value) - { - return Exponent(value); - } - - template<> SIMD_INLINE __m256 SynetUnaryOperation32f(__m256 value) - { - return Logarithm(value); - } - - template<> SIMD_INLINE __m256 SynetUnaryOperation32f(__m256 value) - { - return _mm256_sub_ps(_mm256_setzero_ps(), value); - } - - template<> SIMD_INLINE __m256 SynetUnaryOperation32f(__m256 value) - { - return _mm256_rsqrt_ps(value); - } - - template<> SIMD_INLINE __m256 SynetUnaryOperation32f(__m256 value) - { - return _mm256_sqrt_ps(value); - } - - template<> SIMD_INLINE __m256 SynetUnaryOperation32f(__m256 value) - { - return Tanh(value); - } - - template<> SIMD_INLINE __m256 SynetUnaryOperation32f(__m256 value) - { - return _mm256_setzero_ps(); - } - - template void SynetUnaryOperation32fLayerForward(const float* src, size_t size, float* dst) - { - size_t sizeF = AlignLo(size, F); - size_t sizeQF = AlignLo(size, QF); - size_t i = 0; - for (; i < sizeQF; i += QF) - { - Avx::Store(dst + i + 0 * F, SynetUnaryOperation32f(Avx::Load(src + i + 0 * F))); - Avx::Store(dst + i + 1 * F, SynetUnaryOperation32f(Avx::Load(src + i + 1 * F))); - Avx::Store(dst + i + 2 * F, SynetUnaryOperation32f(Avx::Load(src + i + 2 * F))); - Avx::Store(dst + i + 3 * F, SynetUnaryOperation32f(Avx::Load(src + i + 3 * F))); - } - for (; i < sizeF; i += F) - Avx::Store(dst + i, SynetUnaryOperation32f(Avx::Load(src + i))); - for (; i < size; ++i) - dst[i] = Base::SynetUnaryOperation32f(src[i]); - } - - template void SynetUnaryOperation32fLayerForward(const float* src, size_t size, SimdSynetUnaryOperation32fType type, float* dst) - { - switch (type) - { - case SimdSynetUnaryOperation32fAbs: SynetUnaryOperation32fLayerForward(src, size, dst); break; - case SimdSynetUnaryOperation32fExp: SynetUnaryOperation32fLayerForward(src, size, dst); break; - case SimdSynetUnaryOperation32fLog: SynetUnaryOperation32fLayerForward(src, size, dst); break; - case SimdSynetUnaryOperation32fNeg: SynetUnaryOperation32fLayerForward(src, size, dst); break; - case SimdSynetUnaryOperation32fRsqrt: SynetUnaryOperation32fLayerForward(src, size, dst); break; - case SimdSynetUnaryOperation32fSqrt: SynetUnaryOperation32fLayerForward(src, size, dst); break; - case SimdSynetUnaryOperation32fTanh: 
SynetUnaryOperation32fLayerForward(src, size, dst); break; - case SimdSynetUnaryOperation32fZero: SynetUnaryOperation32fLayerForward(src, size, dst); break; - default: - assert(0); - } - } - - void SynetUnaryOperation32fLayerForward(const float* src, size_t size, SimdSynetUnaryOperation32fType type, float* dst) - { - if (Aligned(src) && Aligned(dst)) - SynetUnaryOperation32fLayerForward(src, size, type, dst); - else - SynetUnaryOperation32fLayerForward(src, size, type, dst); - } - } -#endif// SIMD_AVX2_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx2SynetActivation.cpp b/src/3rd/Simd/Simd/SimdAvx2SynetActivation.cpp deleted file mode 100644 index 3e90c1bc..00000000 --- a/src/3rd/Simd/Simd/SimdAvx2SynetActivation.cpp +++ /dev/null @@ -1,180 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2019 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
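// SynetSoftmaxLayerForward above is the numerically stable softmax: the
// running max is subtracted before exponentiation so exp never overflows,
// then everything is normalized by the sum. Scalar reference for the
// inner == 1 case (name illustrative):
#include <algorithm>
#include <cmath>
#include <cstddef>

static void SoftmaxRef(const float* src, size_t count, float* dst)
{
    float max = src[0];
    for (size_t c = 1; c < count; ++c)
        max = std::max(max, src[c]);
    float sum = 0.0f;
    for (size_t c = 0; c < count; ++c)
    {
        dst[c] = std::exp(src[c] - max); // max subtraction avoids overflow
        sum += dst[c];
    }
    for (size_t c = 0; c < count; ++c)
        dst[c] /= sum; // normalize to a probability distribution
}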
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdArray.h" -#include "Simd/SimdPow.h" -#include "Simd/SimdExp.h" -#include "Simd/SimdBase.h" -#include "Simd/SimdSynet.h" - -namespace Simd -{ -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - template SIMD_INLINE void SynetElu32f(const float * src, const Avx2::Exp & exp, __m256 alpha, float * dst, size_t offset) - { - Avx::Store(dst + offset, exp.Elu(Avx::Load(src + offset), alpha)); - } - - template void SynetElu32f(const float * src, size_t size, const float * alpha, float * dst) - { - __m256 _alpha = _mm256_set1_ps(alpha[0]); - Avx2::Exp exp; - size_t sizeF = AlignLo(size, F); - size_t sizeQF = AlignLo(size, QF); - size_t i = 0; - for (; i < sizeQF; i += QF) - { - SynetElu32f(src, exp, _alpha, dst, i + 0 * F); - SynetElu32f(src, exp, _alpha, dst, i + 1 * F); - SynetElu32f(src, exp, _alpha, dst, i + 2 * F); - SynetElu32f(src, exp, _alpha, dst, i + 3 * F); - } - for (; i < sizeF; i += F) - SynetElu32f(src, exp, _alpha, dst, i); - for (; i < size; ++i) - dst[i] = Base::SynetElu32f(src[i], alpha[0]); - } - - void SynetElu32f(const float * src, size_t size, const float * alpha, float * dst) - { - if (Aligned(src) && Aligned(dst)) - SynetElu32f(src, size, alpha, dst); - else - SynetElu32f(src, size, alpha, dst); - } - - //--------------------------------------------------------------------- - - template SIMD_INLINE void SynetSigmoid32f(const float* src, const Avx2::Exp& exp, float* dst, size_t offset) - { - Avx::Store(dst + offset, exp.Sigmoid(Avx::Load(src + offset))); - } - - template void SynetSigmoid32f(const float* src, size_t size, const float* slope, float* dst) - { - if (align) - assert(Aligned(src) && Aligned(dst)); - - Exp exp(-slope[0]); - size_t sizeF = AlignLo(size, F); - size_t sizeQF = AlignLo(size, QF); - size_t i = 0; - for (; i < sizeQF; i += QF) - { - SynetSigmoid32f(src, exp, dst, i + 0 * F); - SynetSigmoid32f(src, exp, dst, i + 1 * F); - SynetSigmoid32f(src, exp, dst, i + 2 * F); - SynetSigmoid32f(src, exp, dst, i + 3 * F); - } - for (; i < sizeF; i += F) - SynetSigmoid32f(src, exp, dst, i); - for (; i < size; ++i) - dst[i] = Base::SynetSigmoid32f(src[i], slope[0]); - } - - void SynetSigmoid32f(const float* src, size_t size, const float* slope, float* dst) - { - if (Aligned(src) && Aligned(dst)) - SynetSigmoid32f(src, size, slope, dst); - else - SynetSigmoid32f(src, size, slope, dst); - } - - //--------------------------------------------------------------------- - - template SIMD_INLINE void SynetSoftplus32f(const float* src, __m256 beta, __m256 threshold, float* dst, size_t offset) - { - Avx::Store(dst + offset, Softplus(Avx::Load(src + offset), beta, threshold)); - } - - template void SynetSoftplus32f(const float* src, size_t size, const float* beta, const float* threshold, float* dst) - { - __m256 _beta = _mm256_set1_ps(beta[0]); - __m256 _threshold = _mm256_set1_ps(threshold[0]); - size_t sizeF = AlignLo(size, F); - size_t sizeQF = AlignLo(size, QF); - size_t i = 0; - for (; i < sizeQF; i += QF) - { - SynetSoftplus32f(src, _beta, _threshold, dst, i + 0 * F); - SynetSoftplus32f(src, _beta, _threshold, dst, i + 1 * F); - SynetSoftplus32f(src, _beta, _threshold, dst, i + 2 * F); - SynetSoftplus32f(src, _beta, _threshold, dst, i + 3 * F); - } - for (; i < sizeF; i += F) - SynetSoftplus32f(src, _beta, _threshold, dst, i); - for (; i < size; ++i) - dst[i] = Base::SynetSoftplus32f(src[i], beta[0], threshold[0]); - } - - void SynetSoftplus32f(const float* src, size_t size, const float* beta, 
const float* threshold, float* dst) - { - if (Aligned(src) && Aligned(dst)) - SynetSoftplus32f(src, size, beta, threshold, dst); - else - SynetSoftplus32f(src, size, beta, threshold, dst); - } - - //--------------------------------------------------------------------- - - template SIMD_INLINE void SynetTanh32f(const float* src, const Avx2::Exp& exp, float* dst, size_t offset) - { - Avx::Store(dst + offset, exp.Tanh(Avx::Load(src + offset))); - } - - template void SynetTanh32f(const float* src, size_t size, const float* slope, float* dst) - { - if (align) - assert(Aligned(src) && Aligned(dst)); - - Exp exp(-2.0f*slope[0]); - size_t sizeF = AlignLo(size, F); - size_t sizeQF = AlignLo(size, QF); - size_t i = 0; - for (; i < sizeQF; i += QF) - { - SynetTanh32f(src, exp, dst, i + 0 * F); - SynetTanh32f(src, exp, dst, i + 1 * F); - SynetTanh32f(src, exp, dst, i + 2 * F); - SynetTanh32f(src, exp, dst, i + 3 * F); - } - for (; i < sizeF; i += F) - SynetTanh32f(src, exp, dst, i); - for (; i < size; ++i) - dst[i] = Base::SynetTanh32f(src[i], slope[0]); - } - - void SynetTanh32f(const float* src, size_t size, const float* slope, float* dst) - { - if (Aligned(src) && Aligned(dst)) - SynetTanh32f(src, size, slope, dst); - else - SynetTanh32f(src, size, slope, dst); - } - } -#endif// SIMD_AVX2_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx2SynetConversion.cpp b/src/3rd/Simd/Simd/SimdAvx2SynetConversion.cpp deleted file mode 100644 index 51ec01b3..00000000 --- a/src/3rd/Simd/Simd/SimdAvx2SynetConversion.cpp +++ /dev/null @@ -1,541 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
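// The activation kernels above (ELU, sigmoid, tanh with slope) vectorize the
// standard per-element definitions via the library's polynomial Exp; scalar
// forms for reference, assuming the usual conventions for the alpha/slope
// parameters (each passed as a one-element array):
#include <cmath>

static float EluRef(float x, float alpha)
{
    return x >= 0.0f ? x : alpha * (std::exp(x) - 1.0f);
}

static float SigmoidRef(float x, float slope)
{
    return 1.0f / (1.0f + std::exp(-x * slope));
}

static float TanhRef(float x, float slope)
{
    return std::tanh(x * slope);
}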
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdConversion.h" -#include "Simd/SimdSse41.h" -#include "Simd/SimdLog.h" -#include "Simd/SimdSynet.h" -#include "Simd/SimdExtract.h" - -namespace Simd -{ -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - template SIMD_INLINE void SynetConvert32fTo8u(const float* src, __m256 scale, __m256 shift, uint8_t* dst) - { - __m256i i32 = _mm256_cvtps_epi32(Fmadd(Avx::Load(src), scale, shift)); - *((int64_t*)dst) = Extract64i<0>(_mm256_packus_epi16(PackI32ToI16(i32, K_ZERO), K_ZERO)); - } - - template SIMD_INLINE void SynetConvert32fTo8u(const float* src, __m256 scale, __m256 shift, uint8_t* dst, __m256i tail) - { - __m256i i32 = _mm256_cvtps_epi32(Fmadd(Avx::Load(src, tail), scale, shift)); - *((int64_t*)dst) = Extract64i<0>(_mm256_packus_epi16(PackI32ToI16(i32, K_ZERO), K_ZERO)); - } - - template void SynetConvert32fTo8uNchw(const float* src, size_t batch, size_t channels, size_t height, size_t width, const float* scale, const float* shift, uint8_t* dst) - { - if (align) - assert(Aligned(src) && Aligned(dst) && Aligned(width, A)); - - size_t widthF = AlignLo(width, F); - for (size_t b = 0; b < batch; ++b) - { - for (size_t c = 0; c < channels; ++c) - { - __m256 _scale = _mm256_set1_ps(scale[c]); - __m256 _shift = _mm256_set1_ps(shift[c]); - for (size_t h = 0; h < height; ++h) - { - size_t w = 0; - for (; w < widthF; w += F) - SynetConvert32fTo8u(src + w, _scale, _shift, dst + w); - for (; w < width; w += 1) - dst[w] = Base::SynetConvert32fTo8u(src[w], scale[c], shift[c]); - src += width; - dst += width; - } - } - } - } - - template void SynetConvert32fTo8uNchw(const float* src, size_t batch, size_t channels, size_t height, size_t width, const float* scale, const float* shift, uint8_t* dst) - { - if (Aligned(src) && Aligned(dst) && Aligned(width, A)) - SynetConvert32fTo8uNchw(src, batch, channels, height, width, scale, shift, dst); - else - SynetConvert32fTo8uNchw(src, batch, channels, height, width, scale, shift, dst); - } - - template void SynetConvert32fTo8uNhwc(const float* src, size_t batch, size_t channels, size_t height, size_t width, const float* scale, const float* shift, uint8_t* dst) - { - if (align) - assert(Aligned(src) && Aligned(dst) && Aligned(channels, A) && Aligned(scale) && Aligned(shift)); - - size_t channelsF = AlignLo(channels, F); - size_t widthF = AlignLo(width, F); - __m256i tail = LeftNotZero32i(channels - channelsF); - for (size_t b = 0; b < batch; ++b) - { - for (size_t h = 0; h < height; ++h) - { - size_t w = 0; - for (; w < widthF; ++w) - { - size_t c = 0; - for (; c < channelsF; c += F) - SynetConvert32fTo8u(src + c, Avx::Load(scale + c), Avx::Load(shift + c), dst + c); - if (c < channels) - SynetConvert32fTo8u(src + c, Avx::Load(scale + c, tail), Avx::Load(shift + c, tail), dst + c, tail); - src += channels; - dst += channels; - } - for (; w < width; ++w) - { - size_t c = 0; - for (; c < channelsF; c += F) - SynetConvert32fTo8u(src + c, Avx::Load(scale + c), Avx::Load(shift + c), dst + c); - if (c < channels) - SynetConvert32fTo8u(src + c, Avx::Load(scale + c, tail), Avx::Load(shift + c, tail), dst + c, tail); - src += channels; - dst += channels; - } - } - } - } - - template void SynetConvert32fTo8uNhwc(const float* src, size_t batch, size_t channels, size_t height, size_t width, const float* scale, const float* shift, uint8_t* dst) - { - if (Aligned(src) && Aligned(dst) && Aligned(channels, A) && Aligned(scale) && Aligned(shift)) - SynetConvert32fTo8uNhwc(src, batch, channels, height, 
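// SynetConvert32fTo8u above quantizes with a per-channel affine map and then
// relies on the saturation of the cvtps -> pack -> packus sequence for the
// clamp. Scalar sketch of one element (illustrative name; note the vector
// path rounds to nearest even, which can differ from this rounding on exact
// .5 ties):
#include <algorithm>
#include <cmath>
#include <cstdint>

static uint8_t Convert32fTo8uRef(float value, float scale, float shift)
{
    int i = (int)std::lround(value * scale + shift); // round to nearest
    return (uint8_t)std::min(255, std::max(0, i));   // saturate to [0, 255]
}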
width, scale, shift, dst); - else - SynetConvert32fTo8uNhwc(src, batch, channels, height, width, scale, shift, dst); - } - - template void SynetConvert32fTo8uNhwc3(const float* src, size_t batch, size_t height, size_t width, const float* scale, const float* shift, uint8_t* dst) - { - if (align) - assert(Aligned(src) && Aligned(dst) && Aligned(width, A)); - - size_t width3 = width * 3; - size_t width3F = AlignLo(width, F) * 3; - - float _scale[F * 3], _shift[F * 3]; - for (size_t i = 0; i < F; ++i) - for (size_t c = 0; c < 3; ++c) - _scale[i * 3 + c] = scale[c], _shift[i * 3 + c] = shift[c]; - - __m256 _scale0 = Avx::Load(_scale + 0 * F); - __m256 _scale1 = Avx::Load(_scale + 1 * F); - __m256 _scale2 = Avx::Load(_scale + 2 * F); - __m256 _shift0 = Avx::Load(_shift + 0 * F); - __m256 _shift1 = Avx::Load(_shift + 1 * F); - __m256 _shift2 = Avx::Load(_shift + 2 * F); - - for (size_t b = 0; b < batch; ++b) - { - for (size_t h = 0; h < height; ++h) - { - size_t w = 0; - for (; w < width3F; w += 3 * F) - { - SynetConvert32fTo8u(src + 0 * F, _scale0, _shift0, dst + 0 * F); - SynetConvert32fTo8u(src + 1 * F, _scale1, _shift1, dst + 1 * F); - SynetConvert32fTo8u(src + 2 * F, _scale2, _shift2, dst + 2 * F); - src += 3 * F; - dst += 3 * F; - } - for (; w < width3; w += 3) - { - dst[0] = Base::SynetConvert32fTo8u(src[0], scale[0], shift[0]); - dst[1] = Base::SynetConvert32fTo8u(src[1], scale[1], shift[1]); - dst[2] = Base::SynetConvert32fTo8u(src[2], scale[2], shift[2]); - src += 3; - dst += 3; - } - } - } - } - - template void SynetConvert32fTo8uNhwc3(const float* src, size_t batch, size_t height, size_t width, const float* scale, const float* shift, uint8_t* dst) - { - if (Aligned(src) && Aligned(dst) && Aligned(width, A)) - SynetConvert32fTo8uNhwc3(src, batch, height, width, scale, shift, dst); - else - SynetConvert32fTo8uNhwc3(src, batch, height, width, scale, shift, dst); - } - - void SynetConvert32fTo8u(const float* src, size_t batch, size_t channels, size_t height, size_t width, SimdTensorFormatType format, const float* scale, const float* shift, uint8_t* dst, SimdSynetCompatibilityType compatibility) - { - if (!(compatibility & SimdSynetCompatibilityNoFmaTail)) - { - width = height * width; - height = 1; - } - size_t spatial = height * width; - if (Base::NchwCompatible(channels, spatial, format)) - { - if(compatibility & SimdSynetCompatibilityNoFma) - SynetConvert32fTo8uNchw(src, batch, channels, height, width, scale, shift, dst); - else - SynetConvert32fTo8uNchw(src, batch, channels, height, width, scale, shift, dst); - } - else if (Base::NhwcCompatible(channels, spatial, format)) - { - if (channels == 3) - { - if (compatibility & SimdSynetCompatibilityNoFma) - SynetConvert32fTo8uNhwc3(src, batch, height, width, scale, shift, dst); - else - SynetConvert32fTo8uNhwc3(src, batch, height, width, scale, shift, dst); - } - else - { - if (compatibility & SimdSynetCompatibilityNoFma) - SynetConvert32fTo8uNhwc(src, batch, channels, height, width, scale, shift, dst); - else if (compatibility & SimdSynetCompatibilityNoFmaTail) - SynetConvert32fTo8uNhwc(src, batch, channels, height, width, scale, shift, dst); - else - SynetConvert32fTo8uNhwc(src, batch, channels, height, width, scale, shift, dst); - } - } - else - assert(0); - } - - //--------------------------------------------------------------------- - - template SIMD_INLINE void StoreScaled(float * ptr, __m256i value32, __m256 scale, __m256 shift) - { - Avx::Store(ptr, _mm256_fmadd_ps(_mm256_cvtepi32_ps(value32), scale, shift)); - } - - const __m256i 
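Each AVX2 lane in the conversion kernels above performs the same affine quantization as the scalar fallback Base::SynetConvert32fTo8u: multiply-add, round, then saturate to [0, 255], with the _mm256_packus_epi16 step supplying the saturation. A scalar sketch (Convert32fTo8uRef is an illustrative name):

#include <algorithm>
#include <cmath>
#include <cstdint>

inline uint8_t Convert32fTo8uRef(float src, float scale, float shift)
{
    int i = (int)std::lround(src * scale + shift); // affine transform + round
    return (uint8_t)std::min(std::max(i, 0), 255); // saturate like packus
}

(std::lround rounds halves away from zero, while the vector cvtps path rounds to nearest even; the two agree everywhere except exact .5 boundaries.)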
K16_BLUE_RED = SIMD_MM256_SET2_EPI16(Base::BLUE_TO_GRAY_WEIGHT, Base::RED_TO_GRAY_WEIGHT); - const __m256i K16_GREEN_0000 = SIMD_MM256_SET2_EPI16(Base::GREEN_TO_GRAY_WEIGHT, 0x0000); - const __m256i K32_ROUND_TERM = SIMD_MM256_SET1_EPI32(Base::BGR_TO_GRAY_ROUND_TERM); - - SIMD_INLINE __m256i BgraToGray32(__m256i bgra) - { - const __m256i g0a0 = _mm256_and_si256(_mm256_srli_si256(bgra, 1), K16_00FF); - const __m256i b0r0 = _mm256_and_si256(bgra, K16_00FF); - const __m256i weightedSum = _mm256_add_epi32(_mm256_madd_epi16(g0a0, K16_GREEN_0000), _mm256_madd_epi16(b0r0, K16_BLUE_RED)); - return _mm256_srli_epi32(_mm256_add_epi32(weightedSum, K32_ROUND_TERM), Base::BGR_TO_GRAY_AVERAGING_SHIFT); - } - - template SIMD_INLINE void SynetSetInput1(const uint8_t * src, __m256 scale, __m256 shift, float * dst); - - SIMD_INLINE void SynetSetInput1Gray8(__m128i gray8, __m256 scale, __m256 shift, float * dst) - { - StoreScaled(dst + 0, _mm256_cvtepu8_epi32(_mm_srli_si128(gray8, 0)), scale, shift); - StoreScaled(dst + F, _mm256_cvtepu8_epi32(_mm_srli_si128(gray8, 8)), scale, shift); - } - - SIMD_INLINE void SynetSetInput1Gray8(__m256i gray8, __m256 scale, __m256 shift, float * dst) - { - SynetSetInput1Gray8(_mm256_extractf128_si256(gray8, 0), scale, shift, dst + 0 * F); - SynetSetInput1Gray8(_mm256_extractf128_si256(gray8, 1), scale, shift, dst + 2 * F); - } - - template<> SIMD_INLINE void SynetSetInput1(const uint8_t * src, __m256 scale, __m256 shift, float * dst) - { - SynetSetInput1Gray8(Sse2::Load((__m128i*)src + 0), scale, shift, dst + 0 * F); - SynetSetInput1Gray8(Sse2::Load((__m128i*)src + 1), scale, shift, dst + 2 * F); - } - - template<> SIMD_INLINE void SynetSetInput1(const uint8_t * src, __m256 scale, __m256 shift, float * dst) - { - StoreScaled(dst + 0 * F, BgraToGray32(BgrToBgra(Load((__m256i*)(src + 0)), K32_01000000)), scale, shift); - StoreScaled(dst + 1 * F, BgraToGray32(BgrToBgra(Load((__m256i*)(src + 24)), K32_01000000)), scale, shift); - StoreScaled(dst + 2 * F, BgraToGray32(BgrToBgra(Load((__m256i*)(src + 48)), K32_01000000)), scale, shift); - StoreScaled(dst + 3 * F, BgraToGray32(BgrToBgra(Load((__m256i*)(src + 64)), K32_01000000)), scale, shift); - } - - template<> SIMD_INLINE void SynetSetInput1(const uint8_t * src, __m256 scale, __m256 shift, float * dst) - { - StoreScaled(dst + 0 * F, BgraToGray32(Load((__m256i*)src + 0)), scale, shift); - StoreScaled(dst + 1 * F, BgraToGray32(Load((__m256i*)src + 1)), scale, shift); - StoreScaled(dst + 2 * F, BgraToGray32(Load((__m256i*)src + 2)), scale, shift); - StoreScaled(dst + 3 * F, BgraToGray32(Load((__m256i*)src + 3)), scale, shift); - } - - template<> SIMD_INLINE void SynetSetInput1(const uint8_t * src, __m256 scale, __m256 shift, float * dst) - { - StoreScaled(dst + 0 * F, BgraToGray32(RgbToBgra(Load((__m256i*)(src + 0)), K32_01000000)), scale, shift); - StoreScaled(dst + 1 * F, BgraToGray32(RgbToBgra(Load((__m256i*)(src + 24)), K32_01000000)), scale, shift); - StoreScaled(dst + 2 * F, BgraToGray32(RgbToBgra(Load((__m256i*)(src + 48)), K32_01000000)), scale, shift); - StoreScaled(dst + 3 * F, BgraToGray32(RgbToBgra(Load((__m256i*)(src + 64)), K32_01000000)), scale, shift); - } - - template void SynetSetInput1(const uint8_t * src, size_t width, size_t height, size_t stride, const float * scale, const float * shift, float * dst) - { - __m256 _scale = _mm256_set1_ps(scale[0]); - __m256 _shift = _mm256_set1_ps(shift[0]); - size_t aligned = AlignLo(width, A); - for (size_t y = 0; y < height; ++y) - { - for (size_t x = 0; x < aligned; x += A) 
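BgraToGray32 above is a fixed-point luma computation: one _mm256_madd_epi16 pairs (B, R) against their weights, another pairs (G, 0), and the sums get a rounding term and a shift. In scalar form, with the weight, rounding, and shift constants taken from the library's Base namespace (passed as parameters here rather than hard-coded):

#include <cstdint>

inline uint32_t BgraToGrayRef(uint8_t b, uint8_t g, uint8_t r,
                              int wBlue, int wGreen, int wRed,
                              int roundTerm, int shift)
{
    // Weighted integer sum, then rounding shift, as in the vector code.
    return (uint32_t)(b * wBlue + g * wGreen + r * wRed + roundTerm) >> shift;
}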
- SynetSetInput1(src + step * x, _scale, _shift, dst + x); - if(aligned < width) - SynetSetInput1(src + step * (width - A), _scale, _shift, dst + width - A); - src += stride; - dst += width; - } - } - - template SIMD_INLINE void SynetSetInputNchw3(const uint8_t * src, const __m256 * scale, const __m256 * shift, float * dst, size_t channel); - - template<> SIMD_INLINE void SynetSetInputNchw3(const uint8_t * src, const __m256 * scale, const __m256 * shift, float * dst, size_t channel) - { - __m128i src0 = Sse2::Load((__m128i*)src + 0); - __m256i gray0 = _mm256_cvtepu8_epi32(_mm_srli_si128(src0, 0)); - __m256i gray1 = _mm256_cvtepu8_epi32(_mm_srli_si128(src0, 8)); - __m128i src1 = Sse2::Load((__m128i*)src + 1); - __m256i gray2 = _mm256_cvtepu8_epi32(_mm_srli_si128(src1, 0)); - __m256i gray3 = _mm256_cvtepu8_epi32(_mm_srli_si128(src1, 8)); - StoreScaled(dst + 0 * F, gray0, scale[0], shift[0]); - StoreScaled(dst + 1 * F, gray1, scale[0], shift[0]); - StoreScaled(dst + 2 * F, gray2, scale[0], shift[0]); - StoreScaled(dst + 3 * F, gray3, scale[0], shift[0]); - dst += channel; - StoreScaled(dst + 0 * F, gray0, scale[1], shift[1]); - StoreScaled(dst + 1 * F, gray1, scale[1], shift[1]); - StoreScaled(dst + 2 * F, gray2, scale[1], shift[1]); - StoreScaled(dst + 3 * F, gray3, scale[1], shift[1]); - dst += channel; - StoreScaled(dst + 0 * F, gray0, scale[2], shift[2]); - StoreScaled(dst + 1 * F, gray1, scale[2], shift[2]); - StoreScaled(dst + 2 * F, gray2, scale[2], shift[2]); - StoreScaled(dst + 3 * F, gray3, scale[2], shift[2]); - } - - template<> SIMD_INLINE void SynetSetInputNchw3(const uint8_t * src, const __m256 * scale, const __m256 * shift, float * dst, size_t channel) - { - __m256i _bgr[3]; - _bgr[0] = Load((__m256i*)src + 0); - _bgr[1] = Load((__m256i*)src + 1); - _bgr[2] = Load((__m256i*)src + 2); - SynetSetInput1Gray8(BgrToBlue(_bgr), scale[0], shift[0], dst + 0 * channel); - SynetSetInput1Gray8(BgrToGreen(_bgr), scale[1], shift[1], dst + 1 * channel); - SynetSetInput1Gray8(BgrToRed(_bgr), scale[2], shift[2], dst + 2 * channel); - } - - SIMD_INLINE void SynetSetInputNchw3Bgra32(const uint8_t * src, const __m256 * scale, const __m256 * shift, float * dst, size_t channel) - { - __m256i bgra = Load((__m256i*)src); - StoreScaled(dst + 0 * channel, _mm256_and_si256(_mm256_srli_si256(bgra, 0), K32_000000FF), scale[0], shift[0]); - StoreScaled(dst + 1 * channel, _mm256_and_si256(_mm256_srli_si256(bgra, 1), K32_000000FF), scale[1], shift[1]); - StoreScaled(dst + 2 * channel, _mm256_and_si256(_mm256_srli_si256(bgra, 2), K32_000000FF), scale[2], shift[2]); - } - - template<> SIMD_INLINE void SynetSetInputNchw3(const uint8_t * src, const __m256 * scale, const __m256 * shift, float * dst, size_t channel) - { - SynetSetInputNchw3Bgra32(src + 0 * A, scale, shift, dst + 0 * F, channel); - SynetSetInputNchw3Bgra32(src + 1 * A, scale, shift, dst + 1 * F, channel); - SynetSetInputNchw3Bgra32(src + 2 * A, scale, shift, dst + 2 * F, channel); - SynetSetInputNchw3Bgra32(src + 3 * A, scale, shift, dst + 3 * F, channel); - } - - template<> SIMD_INLINE void SynetSetInputNchw3(const uint8_t * src, const __m256 * scale, const __m256 * shift, float * dst, size_t channel) - { - __m256i _rgb[3]; - _rgb[0] = Load((__m256i*)src + 0); - _rgb[1] = Load((__m256i*)src + 1); - _rgb[2] = Load((__m256i*)src + 2); - SynetSetInput1Gray8(BgrToRed(_rgb), scale[0], shift[0], dst + 0 * channel); - SynetSetInput1Gray8(BgrToGreen(_rgb), scale[1], shift[1], dst + 1 * channel); - SynetSetInput1Gray8(BgrToBlue(_rgb), scale[2], shift[2], 
dst + 2 * channel); - } - - template void SynetSetInputNchw3(const uint8_t * src, size_t width, size_t height, size_t stride, const float * scale, const float * shift, float * dst) - { - size_t aligned = AlignLo(width, A), channel = width * height; - __m256 _scale[3], _shift[3]; - for (size_t i = 0; i < 3; ++i) - { - _scale[i] = _mm256_set1_ps(scale[i]); - _shift[i] = _mm256_set1_ps(shift[i]); - } - for (size_t y = 0; y < height; ++y) - { - for (size_t x = 0; x < aligned; x += A) - SynetSetInputNchw3(src + step * x, _scale, _shift, dst + x, channel); - if (aligned < width) - SynetSetInputNchw3(src + step * (width - A), _scale, _shift, dst + width - A, channel); - src += stride; - dst += width; - } - } - - template SIMD_INLINE void SynetSetInputNhwc3(const uint8_t * src, const __m256 * scale, const __m256 * shift, float * dst); - - template<> SIMD_INLINE void SynetSetInputNhwc3(const uint8_t * src, const __m256 * scale, const __m256 * shift, float * dst) - { - __m128i gray0 = Sse2::Load((__m128i*)src + 0); - __m128i bgr0 = _mm_shuffle_epi8(gray0, Ssse3::K8_SHUFFLE_GRAY_TO_BGR0); - StoreScaled(dst + 0x0 * F, _mm256_cvtepu8_epi32(_mm_srli_si128(bgr0, 0)), scale[0], shift[0]); - StoreScaled(dst + 0x1 * F, _mm256_cvtepu8_epi32(_mm_srli_si128(bgr0, 8)), scale[1], shift[1]); - __m128i bgr1 = _mm_shuffle_epi8(gray0, Ssse3::K8_SHUFFLE_GRAY_TO_BGR1); - StoreScaled(dst + 0x2 * F, _mm256_cvtepu8_epi32(_mm_srli_si128(bgr1, 0)), scale[2], shift[2]); - StoreScaled(dst + 0x3 * F, _mm256_cvtepu8_epi32(_mm_srli_si128(bgr1, 8)), scale[0], shift[0]); - __m128i bgr2 = _mm_shuffle_epi8(gray0, Ssse3::K8_SHUFFLE_GRAY_TO_BGR2); - StoreScaled(dst + 0x4 * F, _mm256_cvtepu8_epi32(_mm_srli_si128(bgr2, 0)), scale[1], shift[1]); - StoreScaled(dst + 0x5 * F, _mm256_cvtepu8_epi32(_mm_srli_si128(bgr2, 8)), scale[2], shift[2]); - __m128i gray1 = Sse2::Load((__m128i*)src + 1); - __m128i bgr3 = _mm_shuffle_epi8(gray1, Ssse3::K8_SHUFFLE_GRAY_TO_BGR0); - StoreScaled(dst + 0x6 * F, _mm256_cvtepu8_epi32(_mm_srli_si128(bgr3, 0)), scale[0], shift[0]); - StoreScaled(dst + 0x7 * F, _mm256_cvtepu8_epi32(_mm_srli_si128(bgr3, 8)), scale[1], shift[1]); - __m128i bgr4 = _mm_shuffle_epi8(gray1, Ssse3::K8_SHUFFLE_GRAY_TO_BGR1); - StoreScaled(dst + 0x8 * F, _mm256_cvtepu8_epi32(_mm_srli_si128(bgr4, 0)), scale[2], shift[2]); - StoreScaled(dst + 0x9 * F, _mm256_cvtepu8_epi32(_mm_srli_si128(bgr4, 8)), scale[0], shift[0]); - __m128i bgr5 = _mm_shuffle_epi8(gray1, Ssse3::K8_SHUFFLE_GRAY_TO_BGR2); - StoreScaled(dst + 0xA * F, _mm256_cvtepu8_epi32(_mm_srli_si128(bgr5, 0)), scale[1], shift[1]); - StoreScaled(dst + 0xB * F, _mm256_cvtepu8_epi32(_mm_srli_si128(bgr5, 8)), scale[2], shift[2]); - } - - template<> SIMD_INLINE void SynetSetInputNhwc3(const uint8_t * src, const __m256 * scale, const __m256 * shift, float * dst) - { - __m128i bgr0 = Sse2::Load((__m128i*)src + 0); - StoreScaled(dst + 0x0 * F, _mm256_cvtepu8_epi32(_mm_srli_si128(bgr0, 0)), scale[0], shift[0]); - StoreScaled(dst + 0x1 * F, _mm256_cvtepu8_epi32(_mm_srli_si128(bgr0, 8)), scale[1], shift[1]); - __m128i bgr1 = Sse2::Load((__m128i*)src + 1); - StoreScaled(dst + 0x2 * F, _mm256_cvtepu8_epi32(_mm_srli_si128(bgr1, 0)), scale[2], shift[2]); - StoreScaled(dst + 0x3 * F, _mm256_cvtepu8_epi32(_mm_srli_si128(bgr1, 8)), scale[0], shift[0]); - __m128i bgr2 = Sse2::Load((__m128i*)src + 2); - StoreScaled(dst + 0x4 * F, _mm256_cvtepu8_epi32(_mm_srli_si128(bgr2, 0)), scale[1], shift[1]); - StoreScaled(dst + 0x5 * F, _mm256_cvtepu8_epi32(_mm_srli_si128(bgr2, 8)), scale[2], shift[2]); - __m128i 
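The SynetSetInputNchw3 driver above walks interleaved pixels and scatters each channel into its own plane of channel = width * height floats, applying an independent scale and shift per plane. A scalar sketch of one row (names are illustrative):

#include <cstddef>
#include <cstdint>

inline void SetInputNchw3Ref(const uint8_t* bgr, size_t count, size_t channel,
                             const float scale[3], const float shift[3], float* dst)
{
    for (size_t i = 0; i < count; ++i, bgr += 3)  // interleaved input
        for (size_t c = 0; c < 3; ++c)            // planar output
            dst[c * channel + i] = bgr[c] * scale[c] + shift[c];
}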
bgr3 = Sse2::Load((__m128i*)src + 3); - StoreScaled(dst + 0x6 * F, _mm256_cvtepu8_epi32(_mm_srli_si128(bgr3, 0)), scale[0], shift[0]); - StoreScaled(dst + 0x7 * F, _mm256_cvtepu8_epi32(_mm_srli_si128(bgr3, 8)), scale[1], shift[1]); - __m128i bgr4 = Sse2::Load((__m128i*)src + 4); - StoreScaled(dst + 0x8 * F, _mm256_cvtepu8_epi32(_mm_srli_si128(bgr4, 0)), scale[2], shift[2]); - StoreScaled(dst + 0x9 * F, _mm256_cvtepu8_epi32(_mm_srli_si128(bgr4, 8)), scale[0], shift[0]); - __m128i bgr5 = Sse2::Load((__m128i*)src + 5); - StoreScaled(dst + 0xA * F, _mm256_cvtepu8_epi32(_mm_srli_si128(bgr5, 0)), scale[1], shift[1]); - StoreScaled(dst + 0xB * F, _mm256_cvtepu8_epi32(_mm_srli_si128(bgr5, 8)), scale[2], shift[2]); - } - - const __m128i K8_BGRA_TO_BGR_0 = SIMD_MM_SETR_EPI8(0x0, 0x1, 0x2, 0x4, 0x5, 0x6, 0x8, 0x9, -1, -1, -1, -1, -1, -1, -1, -1); - const __m128i K8_BGRA_TO_BGR_1 = SIMD_MM_SETR_EPI8(0x0, 0x2, 0x3, 0x4, 0x6, 0x7, 0x8, 0xA, -1, -1, -1, -1, -1, -1, -1, -1); - const __m128i K8_BGRA_TO_BGR_2 = SIMD_MM_SETR_EPI8(0x5, 0x6, 0x8, 0x9, 0xA, 0xC, 0xD, 0xE, -1, -1, -1, -1, -1, -1, -1, -1); - - template<> SIMD_INLINE void SynetSetInputNhwc3(const uint8_t * src, const __m256 * scale, const __m256 * shift, float * dst) - { - StoreScaled(dst + 0x0 * F, _mm256_cvtepu8_epi32(_mm_shuffle_epi8(Sse2::Load((__m128i*)(src + 0)), K8_BGRA_TO_BGR_0)), scale[0], shift[0]); - StoreScaled(dst + 0x1 * F, _mm256_cvtepu8_epi32(_mm_shuffle_epi8(Sse2::Load((__m128i*)(src + 10)), K8_BGRA_TO_BGR_1)), scale[1], shift[1]); - StoreScaled(dst + 0x2 * F, _mm256_cvtepu8_epi32(_mm_shuffle_epi8(Sse2::Load((__m128i*)(src + 16)), K8_BGRA_TO_BGR_2)), scale[2], shift[2]); - StoreScaled(dst + 0x3 * F, _mm256_cvtepu8_epi32(_mm_shuffle_epi8(Sse2::Load((__m128i*)(src + 32)), K8_BGRA_TO_BGR_0)), scale[0], shift[0]); - StoreScaled(dst + 0x4 * F, _mm256_cvtepu8_epi32(_mm_shuffle_epi8(Sse2::Load((__m128i*)(src + 42)), K8_BGRA_TO_BGR_1)), scale[1], shift[1]); - StoreScaled(dst + 0x5 * F, _mm256_cvtepu8_epi32(_mm_shuffle_epi8(Sse2::Load((__m128i*)(src + 48)), K8_BGRA_TO_BGR_2)), scale[2], shift[2]); - StoreScaled(dst + 0x6 * F, _mm256_cvtepu8_epi32(_mm_shuffle_epi8(Sse2::Load((__m128i*)(src + 64)), K8_BGRA_TO_BGR_0)), scale[0], shift[0]); - StoreScaled(dst + 0x7 * F, _mm256_cvtepu8_epi32(_mm_shuffle_epi8(Sse2::Load((__m128i*)(src + 74)), K8_BGRA_TO_BGR_1)), scale[1], shift[1]); - StoreScaled(dst + 0x8 * F, _mm256_cvtepu8_epi32(_mm_shuffle_epi8(Sse2::Load((__m128i*)(src + 80)), K8_BGRA_TO_BGR_2)), scale[2], shift[2]); - StoreScaled(dst + 0x9 * F, _mm256_cvtepu8_epi32(_mm_shuffle_epi8(Sse2::Load((__m128i*)(src + 96)), K8_BGRA_TO_BGR_0)), scale[0], shift[0]); - StoreScaled(dst + 0xA * F, _mm256_cvtepu8_epi32(_mm_shuffle_epi8(Sse2::Load((__m128i*)(src + 106)), K8_BGRA_TO_BGR_1)), scale[1], shift[1]); - StoreScaled(dst + 0xB * F, _mm256_cvtepu8_epi32(_mm_shuffle_epi8(Sse2::Load((__m128i*)(src + 112)), K8_BGRA_TO_BGR_2)), scale[2], shift[2]); - } - - const __m128i K8_RGB_UNPACK_0 = SIMD_MM_SETR_EPI8(0x2, -1, 0x1, -1, 0x0, -1, 0x5, -1, 0x4, - 1, 0x3, -1, 0x8, -1, 0x7, -1); - const __m128i K8_RGB_UNPACK_1 = SIMD_MM_SETR_EPI8(0x0, -1, 0x5, -1, 0x4, -1, 0x3, -1, 0x8, - 1, 0x7, -1, 0x6, -1, 0xB, -1); - const __m128i K8_RGB_UNPACK_2 = SIMD_MM_SETR_EPI8(0x8, -1, 0x7, -1, 0xC, -1, 0xB, -1, 0xA, - 1, 0xF, -1, 0xE, -1, 0xD, -1); - - template<> SIMD_INLINE void SynetSetInputNhwc3(const uint8_t * src, const __m256 * scale, const __m256 * shift, float * dst) - { - StoreScaled(dst + 0x0 * F, _mm256_cvtepi16_epi32(_mm_shuffle_epi8(Sse2::Load((__m128i*)(src + 
0)), K8_RGB_UNPACK_0)), scale[0], shift[0]); - StoreScaled(dst + 0x1 * F, _mm256_cvtepi16_epi32(_mm_shuffle_epi8(Sse2::Load((__m128i*)(src + 6)), K8_RGB_UNPACK_1)), scale[1], shift[1]); - StoreScaled(dst + 0x2 * F, _mm256_cvtepi16_epi32(_mm_shuffle_epi8(Sse2::Load((__m128i*)(src + 8)), K8_RGB_UNPACK_2)), scale[2], shift[2]); - StoreScaled(dst + 0x3 * F, _mm256_cvtepi16_epi32(_mm_shuffle_epi8(Sse2::Load((__m128i*)(src + 24)), K8_RGB_UNPACK_0)), scale[0], shift[0]); - StoreScaled(dst + 0x4 * F, _mm256_cvtepi16_epi32(_mm_shuffle_epi8(Sse2::Load((__m128i*)(src + 30)), K8_RGB_UNPACK_1)), scale[1], shift[1]); - StoreScaled(dst + 0x5 * F, _mm256_cvtepi16_epi32(_mm_shuffle_epi8(Sse2::Load((__m128i*)(src + 32)), K8_RGB_UNPACK_2)), scale[2], shift[2]); - StoreScaled(dst + 0x6 * F, _mm256_cvtepi16_epi32(_mm_shuffle_epi8(Sse2::Load((__m128i*)(src + 48)), K8_RGB_UNPACK_0)), scale[0], shift[0]); - StoreScaled(dst + 0x7 * F, _mm256_cvtepi16_epi32(_mm_shuffle_epi8(Sse2::Load((__m128i*)(src + 54)), K8_RGB_UNPACK_1)), scale[1], shift[1]); - StoreScaled(dst + 0x8 * F, _mm256_cvtepi16_epi32(_mm_shuffle_epi8(Sse2::Load((__m128i*)(src + 56)), K8_RGB_UNPACK_2)), scale[2], shift[2]); - StoreScaled(dst + 0x9 * F, _mm256_cvtepi16_epi32(_mm_shuffle_epi8(Sse2::Load((__m128i*)(src + 72)), K8_RGB_UNPACK_0)), scale[0], shift[0]); - StoreScaled(dst + 0xA * F, _mm256_cvtepi16_epi32(_mm_shuffle_epi8(Sse2::Load((__m128i*)(src + 78)), K8_RGB_UNPACK_1)), scale[1], shift[1]); - StoreScaled(dst + 0xB * F, _mm256_cvtepi16_epi32(_mm_shuffle_epi8(Sse2::Load((__m128i*)(src + 80)), K8_RGB_UNPACK_2)), scale[2], shift[2]); - } - - template void SynetSetInputNhwc3(const uint8_t * src, size_t width, size_t height, size_t stride, const float * scale, const float * shift, float * dst) - { - size_t aligned = AlignLo(width, A); - __m256 _scale[3], _shift[3]; - _scale[0] = _mm256_setr_ps(scale[0], scale[1], scale[2], scale[0], scale[1], scale[2], scale[0], scale[1]); - _scale[1] = _mm256_setr_ps(scale[2], scale[0], scale[1], scale[2], scale[0], scale[1], scale[2], scale[0]); - _scale[2] = _mm256_setr_ps(scale[1], scale[2], scale[0], scale[1], scale[2], scale[0], scale[1], scale[2]); - _shift[0] = _mm256_setr_ps(shift[0], shift[1], shift[2], shift[0], shift[1], shift[2], shift[0], shift[1]); - _shift[1] = _mm256_setr_ps(shift[2], shift[0], shift[1], shift[2], shift[0], shift[1], shift[2], shift[0]); - _shift[2] = _mm256_setr_ps(shift[1], shift[2], shift[0], shift[1], shift[2], shift[0], shift[1], shift[2]); - for (size_t y = 0; y < height; ++y) - { - for (size_t x = 0; x < aligned; x += A) - SynetSetInputNhwc3(src + step * x, _scale, _shift, dst + 3 * x); - if (aligned < width) - SynetSetInputNhwc3(src + step * (width - A), _scale, _shift, dst + 3 * (width - A)); - src += stride; - dst += 3*width; - } - } - - void SynetSetInput(const uint8_t * src, size_t width, size_t height, size_t stride, SimdPixelFormatType srcFormat, - const float * lower, const float * upper, float * dst, size_t channels, SimdTensorFormatType dstFormat) - { - assert(width >= A); - - float scale[3]; - for (size_t i = 0; i < channels; ++i) - scale[i] = (upper[i] - lower[i]) / 255.0f; - switch (channels) - { - case 1: - switch (srcFormat) - { - case SimdPixelFormatGray8: SynetSetInput1(src, width, height, stride, scale, lower, dst); return; - case SimdPixelFormatBgr24: SynetSetInput1(src, width, height, stride, scale, lower, dst); return; - case SimdPixelFormatBgra32: SynetSetInput1(src, width, height, stride, scale, lower, dst); return; - case SimdPixelFormatRgb24: 
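The three rotated _scale/_shift vectors built in the NHWC driver above exist because 8 lanes do not divide evenly into 3-channel pixels: after one 8-float store the channel phase advances by 8 mod 3 = 2, so three permutations cover every alignment. The per-sample mapping itself is set up in SynetSetInput above: a byte is mapped linearly from [0, 255] to [lower, upper], hence scale = (upper - lower) / 255 with lower as the shift. Scalar sketch:

#include <cstdint>

inline float SetInputRef(uint8_t x, float lower, float upper)
{
    return lower + x * (upper - lower) / 255.0f; // scale * x + shift
}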
SynetSetInput1(src, width, height, stride, scale, lower, dst); return; - default: assert(0); - } - break; - case 3: - switch (dstFormat) - { - case SimdTensorFormatNchw: - switch (srcFormat) - { - case SimdPixelFormatGray8: SynetSetInputNchw3(src, width, height, stride, scale, lower, dst); return; - case SimdPixelFormatBgr24: SynetSetInputNchw3(src, width, height, stride, scale, lower, dst); return; - case SimdPixelFormatBgra32: SynetSetInputNchw3(src, width, height, stride, scale, lower, dst); return; - case SimdPixelFormatRgb24: SynetSetInputNchw3(src, width, height, stride, scale, lower, dst); return; - default: assert(0); - } - break; - case SimdTensorFormatNhwc: - switch (srcFormat) - { - case SimdPixelFormatGray8: SynetSetInputNhwc3(src, width, height, stride, scale, lower, dst); return; - case SimdPixelFormatBgr24: SynetSetInputNhwc3(src, width, height, stride, scale, lower, dst); return; - case SimdPixelFormatBgra32: SynetSetInputNhwc3(src, width, height, stride, scale, lower, dst); return; - case SimdPixelFormatRgb24: SynetSetInputNhwc3(src, width, height, stride, scale, lower, dst); return; - default: assert(0); - } - break; - default: assert(0); - } - default: assert(0); - } - } - } -#endif//SIMD_AVX2_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx2SynetConvolution32f.cpp b/src/3rd/Simd/Simd/SimdAvx2SynetConvolution32f.cpp deleted file mode 100644 index 095ae8c7..00000000 --- a/src/3rd/Simd/Simd/SimdAvx2SynetConvolution32f.cpp +++ /dev/null @@ -1,1662 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdSynetConvolution32f.h" -#include "Simd/SimdSet.h" -#include "Simd/SimdLoad.h" -#include "Simd/SimdAvx1.h" -#include "Simd/SimdAvx2.h" -#include "Simd/SimdGemm.h" -#include "Simd/SimdExp.h" -#include "Simd/SimdSynet.h" - -namespace Simd -{ -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - void ConvolutionBiasAndActivation(const float * bias, size_t count, size_t size, ::SimdConvolutionActivationType activation, const float * params, ::SimdBool trans, float * dst) - { - size_t aligned = trans ? 
AlignLo(count, F) : AlignLo(size, F); - if (activation == ::SimdConvolutionActivationElu) - { - float alpha = params[0]; - if (bias) - { - __m256 _alpha = _mm256_set1_ps(alpha); - if (trans) - { - for (size_t j = 0; j < size; ++j) - { - size_t i = 0; - for (; i < aligned; i += F) - { - __m256 value = _mm256_add_ps(_mm256_loadu_ps(dst + i), _mm256_loadu_ps(bias + i)); - _mm256_storeu_ps(dst + i, Avx2::Elu(value, _alpha)); - } - for (; i < count; ++i) - dst[i] = Base::SynetElu32f(dst[i] + bias[i], alpha); - dst += count; - } - } - else - { - for (size_t i = 0; i < count; ++i) - { - __m256 _bias = _mm256_set1_ps(bias[i]); - size_t j = 0; - for (; j < aligned; j += F) - { - __m256 value = _mm256_add_ps(_mm256_loadu_ps(dst + j), _bias); - _mm256_storeu_ps(dst + j, Avx2::Elu(value, _alpha)); - } - for (; j < size; ++j) - dst[j] = Base::SynetElu32f(dst[j] + bias[i], alpha); - dst += size; - } - } - } - else - SynetElu32f(dst, size*count, &alpha, dst); - } - else - Avx::ConvolutionBiasAndActivation(bias, count, size, activation, params, trans, dst); - } - - //--------------------------------------------------------------------- - - - SynetConvolution32fGemmNN::SynetConvolution32fGemmNN(const ConvParam32f & p) - : Avx::SynetConvolution32fGemmNN(p) - { - _index.Resize(F); - for (size_t i = 0; i < F; ++i) - _index[i] = int(i * p.strideX); - _nose.Resize(p.kernelX); - _tail.Resize(p.kernelX); - _start.Resize(p.kernelX); - for (size_t kx = 0; kx < p.kernelX; ++kx) - { - _nose[kx] = 0; - _tail[kx] = int(p.dstW); - ptrdiff_t sx = kx * p.dilationX - p.padX; - for (size_t dx = 0; dx < p.dstW; ++dx) - { - if (sx < 0) - _nose[kx]++; - if (sx >= ptrdiff_t(p.srcW)) - _tail[kx]--; - sx += p.strideX; - } - _start[kx] = int(kx * p.dilationX - p.padX + _nose[kx] * p.strideX); - } - _gemm.Init(InitGemmFuncs(Avx2::Gemm32fNN, "Avx2", p.gemm, "Ext")); - if (_param.trans && _param.group == 1) - { - if (NHWC_GEMM_RUNTIME) - { - _gemmCb.Init(InitGemmCbFuncs(Avx2::Gemm32fNNcbBufferSize, Avx2::Gemm32fNNcbReorderB, Avx2::Gemm32fNNcbRun, "Avx2", GemmKernelF2, GemmKernelF3)); - _nhwcWeight.Resize(_gemmCb.At(0).BufferSize(_M*_merge, _N, _K)); - } - else - _nhwcWeight.Resize(Avx2::Gemm32fNNcbBufferSize(_M*_merge, _N, _K, GemmKernelAny, NHWC_GEMM_COMPATIBLE)); - _nhwcRun = Avx2::Gemm32fNNcbRun; - _nhwcReorderB = Avx2::Gemm32fNNcbReorderB; - } - _biasAndActivation = Avx2::ConvolutionBiasAndActivation; - } - - void SynetConvolution32fGemmNN::ImgToCol(const float * src, float * dst) - { - const ConvParam32f & p = _param; - size_t srcSize = p.srcW * p.srcH; - if (p.dilationX == 1 && p.dilationY == 1 && p.strideX == 2 && p.strideY == 2 && p.padX == 0 && p.padY == 0 && p.padW == 0 && p.padH == 0 && p.kernelX == 1 && p.kernelY == 1) - { - for (size_t c = 0; c < p.srcC; ++c) - { - for (size_t dy = 0; dy < p.dstH; ++dy) - { - const float * psrc = src + 2 * dy*p.srcW; - for (size_t dx = 0, sx = 0; dx < p.dstW; ++dx, sx += 2) - *(dst++) = psrc[sx]; - } - src += srcSize; - } - } - else if (p.dilationX*p.dilationY*p.strideX*p.strideY != 1) - { - __m256i index = _mm256_loadu_si256((__m256i*)_index.data); - for (size_t c = 0; c < p.srcC; ++c) - { - for (size_t ky = 0; ky < p.kernelY; ky++) - { - for (size_t kx = 0; kx < p.kernelX; kx++) - { - size_t noseDx = _nose[kx]; - size_t tailDx = _tail[kx]; - size_t bodyDx = AlignLo(tailDx - noseDx, F) + noseDx; - size_t sx0 = _start[kx]; - size_t sy = ky * p.dilationY - p.padY; - for (size_t dy = 0; dy < p.dstH; ++dy) - { - if (sy < p.srcH) - { - size_t dx = 0, sx = sx0 + sy * p.srcW; - for (; dx < 
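Elu is the one activation that ConvolutionBiasAndActivation above handles directly at the AVX2 level; everything else is forwarded to Avx::ConvolutionBiasAndActivation. A scalar sketch of what Avx2::Elu is assumed to compute vectorwise:

#include <cmath>

inline float EluRef(float value, float alpha)
{
    // Identity for positive inputs, saturating exponential for negative ones.
    return value >= 0.0f ? value : alpha * (std::exp(value) - 1.0f);
}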
noseDx; ++dx) - *(dst++) = 0; - for (; dx < bodyDx; dx += F, sx += p.strideX*F, dst += F) - _mm256_storeu_ps(dst, _mm256_i32gather_ps(src + sx, index, 4)); - for (; dx < tailDx; ++dx, sx += p.strideX) - *(dst++) = src[sx]; - for (; dx < p.dstW; ++dx) - *(dst++) = 0; - } - else - { - memset(dst, 0, p.dstW * sizeof(float)); - dst += p.dstW; - } - sy += p.strideY; - } - } - } - src += srcSize; - } - } - else - { - Base::SynetConvolution32fGemmNN::ImgToCol(src, dst); - } - } - - //--------------------------------------------------------------------- - - SynetConvolution32fGemmNT::SynetConvolution32fGemmNT(const ConvParam32f & p) - : Avx::SynetConvolution32fGemmNT(p) - { - _gemm.Init(InitGemmFuncs(Avx2::Gemm32fNT, "Avx2")); - _biasAndActivation = Avx::ConvolutionBiasAndActivation; - } - - //--------------------------------------------------------------------- - - SynetConvolution32fWinograd::SynetConvolution32fWinograd(const ConvParam32f & p) - : Avx::SynetConvolution32fWinograd(p) - { - if (p.kernelY == 1 && p.kernelX == 3) - { - { - SetBlock(1, 4); - _setFilter = Avx::WinogradKernel1x3Block1x4SetFilter; - _setInput = Avx::WinogradKernel1x3Block1x4SetInput; - _setOutput = Avx::WinogradKernel1x3Block1x4SetOutput; - } - } - else if (p.kernelY == 1 && p.kernelX == 5) - { - { - SetBlock(1, 4); - _setFilter = Avx::WinogradKernel1x5Block1x4SetFilter; - _setInput = Avx::WinogradKernel1x5Block1x4SetInput; - _setOutput = Avx::WinogradKernel1x5Block1x4SetOutput; - } - } - else if (p.kernelY == 2 && p.kernelX == 2) - { - if (p.trans && p.srcH >= 8 && p.srcW >= 8 && p.srcH * p.srcW * p.batch >= 256) - { - SetBlock(4, 4); - _setFilter = Avx::WinogradKernel2x2Block4x4SetFilter; - _setInput = Avx::WinogradKernel2x2Block4x4SetInput; - _setOutput = Avx::WinogradKernel2x2Block4x4SetOutput; - } - else - { - SetBlock(2, 2); - _setFilter = Avx::WinogradKernel2x2Block2x2SetFilter; - _setInput = Avx::WinogradKernel2x2Block2x2SetInput; - _setOutput = Avx::WinogradKernel2x2Block2x2SetOutput; - } - } - else if (p.kernelY == 3 && p.kernelX == 3) - { - if (p.trans && p.srcH >= 8 && p.srcW >= 8 && p.srcH * p.srcW * p.batch >= 256) - { - SetBlock(4, 4); - _setFilter = Avx::WinogradKernel3x3Block4x4SetFilter; - _setInput = Avx::WinogradKernel3x3Block4x4SetInput; - _setOutput = Avx::WinogradKernel3x3Block4x4SetOutput; - } - else if (p.trans && p.srcH >= 6 && p.srcW >= 6 && p.srcH * p.srcW * p.batch >= 144 && p.dstH % 3 == 0 && p.dstW % 3 == 0) - { - SetBlock(3, 3); - _setFilter = Avx::WinogradKernel3x3Block3x3SetFilter; - _setInput = Avx::WinogradKernel3x3Block3x3SetInput; - _setOutput = Avx::WinogradKernel3x3Block3x3SetOutput; - } - else - { - SetBlock(2, 2); - _setFilter = Avx::WinogradKernel3x3Block2x2SetFilter; - _setInput = Avx::WinogradKernel3x3Block2x2SetInput; - _setOutput = Avx::WinogradKernel3x3Block2x2SetOutput; - } - } - else - assert(0); - _gemm.Init(InitGemmFuncs(Avx2::Gemm32fNN, "Avx2", p.gemm, "Ext")); - if (_param.trans) - { - if (NHWC_GEMM_RUNTIME) - { - _gemmCb.Init(InitGemmCbFuncs(Avx2::Gemm32fNNcbBufferSize, Avx2::Gemm32fNNcbReorderB, Avx2::Gemm32fNNcbRun, "Avx2", GemmKernelF2, GemmKernelF3)); - _nhwcStrideW = _gemmCb.At(0).BufferSize(_M*_merge, _N, _K); - } - else - _nhwcStrideW = Avx2::Gemm32fNNcbBufferSize(_M*_merge, _N, _K, GemmKernelAny, NHWC_GEMM_COMPATIBLE); - _nhwcWeight.Resize(_nhwcStrideW*_count); - _nhwcRun = Avx2::Gemm32fNNcbRun; - _nhwcReorderB = Avx2::Gemm32fNNcbReorderB; - } - _biasAndActivation = Avx2::ConvolutionBiasAndActivation; - } - - 
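ImgToCol above has three tiers: a tight copy loop for the pure-subsampling 1x1-kernel/stride-2 case, a gather path (_mm256_i32gather_ps with precomputed stride indices) for general stride and dilation, and the Base fallback. The underlying transform is plain im2col: per source channel, every kernel tap becomes one row of the GEMM operand. A padding-free scalar sketch for a single channel (names are illustrative):

#include <cstddef>

inline void ImgToColRef(const float* src, size_t srcW,
                        size_t kY, size_t kX, size_t strideY, size_t strideX,
                        size_t dstH, size_t dstW, float* dst)
{
    for (size_t ky = 0; ky < kY; ++ky)
        for (size_t kx = 0; kx < kX; ++kx)
            for (size_t dy = 0; dy < dstH; ++dy)
                for (size_t dx = 0; dx < dstW; ++dx)
                    *dst++ = src[(dy * strideY + ky) * srcW + dx * strideX + kx];
}

After this reordering, the convolution reduces to one matrix product of the weights against a (kY*kX*srcC) x (dstH*dstW) column matrix.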
//--------------------------------------------------------------------- - - SynetConvolution32fDirectNchw::SynetConvolution32fDirectNchw(const ConvParam32f & p) - : Avx::SynetConvolution32fDirectNchw(p) - { - _convolutionBiasActivation = SetConvolutionBiasActivation(); - } - - template SIMD_INLINE void LoadWeight(const float * src, __m256 * dst) - { - for (size_t i = 0; i < size; ++i) - dst[i] = _mm256_set1_ps(src[i]); - } - - template struct Kernel - { - static __m256 SynetConvolution32f(const float * src, size_t step, const __m256 * weight); - }; - - template<> struct Kernel<1, 1> - { - static SIMD_INLINE __m256 SynetConvolution32f(const float * src, size_t step, const __m256 * weight) - { - return _mm256_mul_ps(_mm256_loadu_ps(src), weight[0]); - } - }; - - template<> struct Kernel<2, 1> - { - static SIMD_INLINE __m256 RowConv(const float * src, const __m256 * weight) - { - return _mm256_fmadd_ps(_mm256_loadu_ps(src), weight[0], - _mm256_mul_ps(_mm256_loadu_ps(src + 1), weight[1])); - } - - static SIMD_INLINE __m256 SynetConvolution32f(const float * src, size_t step, const __m256 * weight) - { - return _mm256_add_ps(RowConv(src, weight), RowConv(src + step, weight + 2)); - } - }; - - template<> struct Kernel<2, 2> - { - static SIMD_INLINE __m256 RowConv(const float * src, const __m256 * weight) - { - __m256 s0 = _mm256_loadu_ps(src + 0); - __m256 s1 = _mm256_loadu_ps(src + F); - return _mm256_fmadd_ps(_mm256_shuffle_ps(s0, s1, 0x88), weight[0], - _mm256_mul_ps(_mm256_shuffle_ps(s0, s1, 0xDD), weight[1])); - } - - static SIMD_INLINE __m256 SynetConvolution32f(const float * src, size_t step, const __m256 * weight) - { - return Permute4x64<0xD8>(_mm256_add_ps(RowConv(src, weight), RowConv(src + step, weight + 2))); - } - }; - - template<> struct Kernel<3, 1> - { - static SIMD_INLINE __m256 RowConv(const float * src, const __m256 * weight) - { - return _mm256_fmadd_ps(_mm256_loadu_ps(src), weight[0], - _mm256_fmadd_ps(_mm256_loadu_ps(src + 1), weight[1], - _mm256_mul_ps(_mm256_loadu_ps(src + 2), weight[2]))); - } - - static SIMD_INLINE __m256 SynetConvolution32f(const float * src, size_t step, const __m256 * weight) - { - return _mm256_add_ps(RowConv(src, weight), - _mm256_add_ps(RowConv(src + step, weight + 3), - RowConv(src + 2 * step, weight + 6))); - } - }; - - template<> struct Kernel<3, 2> - { - static SIMD_INLINE __m256 RowConv(const float * src, const __m256 * weight) - { - __m256 s00 = _mm256_loadu_ps(src); - __m256 s10 = _mm256_loadu_ps(src + F); - __m256 s02 = _mm256_loadu_ps(src + 2); - __m256 s12 = _mm256_loadu_ps(src + 2 + F); - return _mm256_fmadd_ps(_mm256_shuffle_ps(s00, s10, 0x88), weight[0], - _mm256_fmadd_ps(_mm256_shuffle_ps(s00, s10, 0xDD), weight[1], - _mm256_mul_ps(_mm256_shuffle_ps(s02, s12, 0x88), weight[2]))); - } - - static SIMD_INLINE __m256 SynetConvolution32f(const float * src, size_t step, const __m256 * weight) - { - return Permute4x64<0xD8>(_mm256_add_ps(RowConv(src, weight), - _mm256_add_ps(RowConv(src + step, weight + 3), RowConv(src + 2 * step, weight + 6)))); - } - }; - - template<> struct Kernel<3, 3> - { - static SIMD_INLINE __m256 Gather(const float * src) - { - return _mm256_shuffle_ps(Avx::Load(src + 0, src + 12), Avx::Load(src + 6, src + 18), 0xCC); - } - - static SIMD_INLINE __m256 RowConv(const float * src, const __m256 * weight) - { - return _mm256_fmadd_ps(Gather(src + 0), weight[0], - _mm256_fmadd_ps(Gather(src + 1), weight[1], - _mm256_mul_ps(Gather(src + 2), weight[2]))); - } - - static SIMD_INLINE __m256 SynetConvolution32f(const float * 
src, size_t step, const __m256 * weight) - { - return _mm256_add_ps(RowConv(src, weight), _mm256_add_ps(RowConv(src + step, weight + 3), RowConv(src + 2 * step, weight + 6))); - } - }; - - template<::SimdConvolutionActivationType type> SIMD_INLINE __m256 Activate(__m256 value, const __m256 * params); - - template<> SIMD_INLINE __m256 Activate<::SimdConvolutionActivationIdentity>(__m256 value, const __m256 * params) - { - return value; - } - - template<> SIMD_INLINE __m256 Activate<::SimdConvolutionActivationRelu>(__m256 value, const __m256 * params) - { - return _mm256_max_ps(_mm256_setzero_ps(), value); - } - - template<> SIMD_INLINE __m256 Activate<::SimdConvolutionActivationLeakyRelu>(__m256 value, const __m256 * params) - { - return _mm256_add_ps(_mm256_max_ps(_mm256_setzero_ps(), value), _mm256_mul_ps(params[0], _mm256_min_ps(_mm256_setzero_ps(), value))); - } - - template<> SIMD_INLINE __m256 Activate<::SimdConvolutionActivationRestrictRange>(__m256 value, const __m256 * params) - { - return _mm256_min_ps(_mm256_max_ps(params[0], value), params[1]); - } - - template<> SIMD_INLINE __m256 Activate<::SimdConvolutionActivationPrelu>(__m256 value, const __m256 * params) - { - return _mm256_add_ps(_mm256_max_ps(_mm256_setzero_ps(), value), _mm256_mul_ps(params[0], _mm256_min_ps(_mm256_setzero_ps(), value))); - } - - template<> SIMD_INLINE __m256 Activate<::SimdConvolutionActivationElu>(__m256 value, const __m256 * params) - { - return Avx2::Elu(value, params[0]); - } - - template<> SIMD_INLINE __m256 Activate<::SimdConvolutionActivationHswish>(__m256 value, const __m256 * params) - { - return Avx2::SynetHswish32f(value, params[0], params[1]); - } - - template - void ConvolutionBiasActivation(const float * src, size_t srcC, size_t srcH, size_t srcW, const float * weight, - const float * bias, const float * params, float * dst, size_t dstC, size_t dstH, size_t dstW) - { - __m256 _weight[kernel*kernel]; - __m256 _params[2]; - _params[0] = _mm256_set1_ps(params[0]); - if (type == ::SimdConvolutionActivationRestrictRange || type == ::SimdConvolutionActivationHswish) - _params[1] = _mm256_set1_ps(params[1]); - size_t dstWF = Simd::AlignLo(dstW, F); - __m256 tail = RightNotZero32f(dstW - dstWF); - for (size_t dc = 0; dc < dstC; ++dc) - { - if (type == ::SimdConvolutionActivationPrelu) - _params[0] = _mm256_set1_ps(params[dc]); - if (srcC == 1) - { - const float * ps = src; - float * pd = dst; - LoadWeight(weight, _weight); - __m256 _bias = bias ? _mm256_set1_ps(bias[dc]) : _mm256_setzero_ps(); - for (size_t y = 0; y < dstH; ++y) - { - for (size_t x = 0; x < dstWF; x += F) - { - __m256 conv = Kernel::SynetConvolution32f(ps + x * stride, srcW, _weight); - _mm256_storeu_ps(pd + x, Activate(_mm256_add_ps(_bias, conv), _params)); - } - if (dstWF < dstW) - { - size_t x = dstW - F; - __m256 _dst = _mm256_loadu_ps(pd + x); - __m256 conv = Kernel::SynetConvolution32f(ps + x * stride, srcW, _weight); - _mm256_storeu_ps(pd + x, _mm256_blendv_ps(_dst, Activate(_mm256_add_ps(_bias, conv), _params), tail)); - } - ps += srcW * stride; - pd += dstW; - } - weight += kernel * kernel; - } - else - { - size_t sc = 0; - for (; sc < 1; ++sc) - { - const float * ps = src; - float * pd = dst; - LoadWeight(weight, _weight); - __m256 _bias = bias ? 
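The Activate specializations above keep the inner loops branch-free; LeakyRelu and Prelu use the identity max(0, x) + a * min(0, x), which equals the conditional form for every x. Scalar equivalents for reference (illustrative names):

inline float ReluRef(float x)               { return x > 0.0f ? x : 0.0f; }
inline float LeakyReluRef(float x, float a) { return x > 0.0f ? x : a * x; }
inline float RestrictRangeRef(float x, float lo, float hi)
{
    return x < lo ? lo : (x > hi ? hi : x);
}

Prelu differs from LeakyRelu only in where `a` comes from: a per-channel params[dc] broadcast instead of a single params[0].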
_mm256_set1_ps(bias[dc]) : _mm256_setzero_ps(); - for (size_t y = 0; y < dstH; ++y) - { - for (size_t x = 0; x < dstWF; x += F) - { - __m256 conv = Kernel::SynetConvolution32f(ps + x * stride, srcW, _weight); - _mm256_storeu_ps(pd + x, _mm256_add_ps(_bias, conv)); - } - if (dstWF < dstW) - { - size_t x = dstW - F; - __m256 _dst = _mm256_loadu_ps(pd + x); - __m256 conv = Kernel::SynetConvolution32f(ps + x * stride, srcW, _weight); - _mm256_storeu_ps(pd + x, _mm256_blendv_ps(_dst, _mm256_add_ps(_bias, conv), tail)); - } - ps += srcW * stride; - pd += dstW; - } - weight += kernel * kernel; - } - for (; sc < srcC - 1; ++sc) - { - const float * ps = src + sc * srcW * srcH; - float * pd = dst; - LoadWeight(weight, _weight); - for (size_t y = 0; y < dstH; ++y) - { - for (size_t x = 0; x < dstWF; x += F) - { - __m256 _dst = _mm256_loadu_ps(pd + x); - __m256 conv = Kernel::SynetConvolution32f(ps + x * stride, srcW, _weight); - _mm256_storeu_ps(pd + x, _mm256_add_ps(_dst, conv)); - } - if (dstWF < dstW) - { - size_t x = dstW - F; - __m256 _dst = _mm256_loadu_ps(pd + x); - __m256 conv = Kernel::SynetConvolution32f(ps + x * stride, srcW, _weight); - _mm256_storeu_ps(pd + x, _mm256_add_ps(_dst, _mm256_and_ps(conv, tail))); - } - ps += srcW * stride; - pd += dstW; - } - weight += kernel * kernel; - } - for (; sc < srcC; ++sc) - { - const float * ps = src + sc * srcW * srcH; - float * pd = dst; - LoadWeight(weight, _weight); - for (size_t y = 0; y < dstH; ++y) - { - for (size_t x = 0; x < dstWF; x += F) - { - __m256 _dst = _mm256_loadu_ps(pd + x); - __m256 conv = Kernel::SynetConvolution32f(ps + x * stride, srcW, _weight); - _mm256_storeu_ps(pd + x, Activate(_mm256_add_ps(_dst, conv), _params)); - } - if (dstWF < dstW) - { - size_t x = dstW - F; - __m256 _dst = _mm256_loadu_ps(pd + x); - __m256 conv = Kernel::SynetConvolution32f(ps + x * stride, srcW, _weight); - _mm256_storeu_ps(pd + x, _mm256_blendv_ps(_dst, Activate(_mm256_add_ps(_dst, conv), _params), tail)); - } - ps += srcW * stride; - pd += dstW; - } - weight += kernel * kernel; - } - } - dst += dstH * dstW; - } - } - - template SynetConvolution32fDirectNchw::ConvolutionBiasActivationPtr SetConvolutionBiasActivation(::SimdConvolutionActivationType type) - { - switch (type) - { - case ::SimdConvolutionActivationIdentity: return ConvolutionBiasActivation; - case ::SimdConvolutionActivationRelu: return ConvolutionBiasActivation; - case ::SimdConvolutionActivationLeakyRelu: return ConvolutionBiasActivation; - case ::SimdConvolutionActivationRestrictRange: return ConvolutionBiasActivation; - case ::SimdConvolutionActivationPrelu: return ConvolutionBiasActivation; - case ::SimdConvolutionActivationElu: return ConvolutionBiasActivation; - case ::SimdConvolutionActivationHswish: return ConvolutionBiasActivation; - default: - assert(0); - return NULL; - } - } - - SynetConvolution32fDirectNchw::ConvolutionBiasActivationPtr SynetConvolution32fDirectNchw::SetConvolutionBiasActivation() - { - const ConvParam32f & p = _param; - if (p.dstW < F) - return Sse2::SynetConvolution32fDirectNchw::SetConvolutionBiasActivation(); - switch (p.strideX) - { - case 1: - if (p.kernelX == 1) - return Avx2::SetConvolutionBiasActivation<1, 1>(p.activation); - if (p.kernelX == 2) - return Avx2::SetConvolutionBiasActivation<2, 1>(p.activation); - if (p.kernelX == 3) - return Avx2::SetConvolutionBiasActivation<3, 1>(p.activation); - break; - case 2: - if (p.kernelX == 2) - return Avx2::SetConvolutionBiasActivation<2, 2>(p.activation); - if (p.kernelX == 3) - return 
Avx2::SetConvolutionBiasActivation<3, 2>(p.activation); - break; - case 3: - if (p.kernelX == 3) - return Avx2::SetConvolutionBiasActivation<3, 3>(p.activation); - break; - } - return Sse2::SynetConvolution32fDirectNchw::SetConvolutionBiasActivation(); - } - - //--------------------------------------------------------------------- - - SynetConvolution32fDirectNhwc::SynetConvolution32fDirectNhwc(const ConvParam32f & p) - : Avx::SynetConvolution32fDirectNhwc(p) - { - _convolutionBiasActivation = SetConvolutionBiasActivation(); - } - - template<::SimdConvolutionActivationType type> SIMD_INLINE __m256 Activate(__m256 value, const float * params, size_t offset); - - template<> SIMD_INLINE __m256 Activate<::SimdConvolutionActivationIdentity>(__m256 value, const float * params, size_t offset) - { - return value; - } - - template<> SIMD_INLINE __m256 Activate<::SimdConvolutionActivationRelu>(__m256 value, const float * params, size_t offset) - { - return _mm256_max_ps(_mm256_setzero_ps(), value); - } - - template<> SIMD_INLINE __m256 Activate<::SimdConvolutionActivationLeakyRelu>(__m256 value, const float * params, size_t offset) - { - return _mm256_add_ps(_mm256_max_ps(_mm256_setzero_ps(), value), _mm256_mul_ps(_mm256_set1_ps(params[0]), _mm256_min_ps(_mm256_setzero_ps(), value))); - } - - template<> SIMD_INLINE __m256 Activate<::SimdConvolutionActivationRestrictRange>(__m256 value, const float * params, size_t offset) - { - return _mm256_min_ps(_mm256_max_ps(_mm256_set1_ps(params[0]), value), _mm256_set1_ps(params[1])); - } - - template<> SIMD_INLINE __m256 Activate<::SimdConvolutionActivationPrelu>(__m256 value, const float * params, size_t offset) - { - return _mm256_add_ps(_mm256_max_ps(_mm256_setzero_ps(), value), _mm256_mul_ps(_mm256_loadu_ps(params + offset), _mm256_min_ps(_mm256_setzero_ps(), value))); - } - - template<> SIMD_INLINE __m256 Activate<::SimdConvolutionActivationElu>(__m256 value, const float * params, size_t offset) - { - return Avx2::Elu(value, _mm256_set1_ps(params[0])); - } - - template<> SIMD_INLINE __m256 Activate<::SimdConvolutionActivationHswish>(__m256 value, const float * params, size_t offset) - { - return Avx::SynetHswish32f(value, _mm256_set1_ps(params[0]), _mm256_set1_ps(params[1])); - } - - SIMD_INLINE void KernelHwcDefaultEdge(const float * src, const ConvParam32f & p, size_t kH, size_t kW, const float * weight, __m256 & sum) - { - size_t size = kW * p.srcC, tail = (p.kernelX - kW)*p.srcC*p.dstC, dstC = p.dstC, stride = p.srcW * p.srcC; - for (size_t ky = 0; ky < kH; ++ky) - { - for (size_t i = 0; i < size; ++i, weight += dstC) - sum = _mm256_fmadd_ps(_mm256_set1_ps(src[i]), _mm256_loadu_ps(weight), sum); - weight += tail; - src += stride; - } - } - - template<::SimdConvolutionActivationType type> - SIMD_INLINE void KernelHwcDefaultEdge(const float * src, const ConvParam32f & p, size_t kH, size_t kW, const float * weight, const float * bias, const float * params, float * dst) - { - size_t dstC = p.dstC; - size_t dstCF = AlignLo(dstC, F); - size_t dc = 0; - for (; dc < dstCF; dc += F) - { - __m256 conv = bias ? _mm256_loadu_ps(bias + dc) : _mm256_setzero_ps(); - KernelHwcDefaultEdge(src, p, kH, kW, weight + dc, conv); - _mm256_storeu_ps(dst + dc, Activate(conv, params, dc)); - } - if (dc < dstC) - { - dc = dstC - F; - __m256 conv = bias ? 
_mm256_loadu_ps(bias + dc) : _mm256_setzero_ps(); - KernelHwcDefaultEdge(src, p, kH, kW, weight + dc, conv); - _mm256_storeu_ps(dst + dc, Activate(conv, params, dc)); - } - } - - SIMD_INLINE void KernelHwcDefaultBody2x2(const float * src, const ConvParam32f & p, const float * weight, __m256 sums[2][2]) - { - size_t size = p.kernelX * p.srcC, dstC = p.dstC, stride = p.srcW * p.srcC, step = p.srcC * p.strideX; - const float * src0 = src + 0 * step; - const float * src1 = src + 1 * step; - __m256 w0, w1, s0; - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t offset = ky * stride; - for (size_t end = offset + size; offset < end; ++offset) - { - w0 = _mm256_loadu_ps(weight + 0 * F); - w1 = _mm256_loadu_ps(weight + 1 * F); - s0 = _mm256_set1_ps(src0[offset]); - sums[0][0] = _mm256_fmadd_ps(s0, w0, sums[0][0]); - sums[0][1] = _mm256_fmadd_ps(s0, w1, sums[0][1]); - s0 = _mm256_set1_ps(src1[offset]); - sums[1][0] = _mm256_fmadd_ps(s0, w0, sums[1][0]); - sums[1][1] = _mm256_fmadd_ps(s0, w1, sums[1][1]); - weight += dstC; - } - } - } - - SIMD_INLINE void KernelHwcDefaultBody2x1(const float * src, const ConvParam32f & p, const float * weight, __m256 sums[2][1]) - { - size_t size = p.kernelX * p.srcC, dstC = p.dstC, stride = p.srcW * p.srcC, step = p.srcC * p.strideX; - const float * src0 = src + 0 * step; - const float * src1 = src + 1 * step; - __m256 w0, s0; - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t offset = ky * stride; - for (size_t end = offset + size; offset < end; ++offset) - { - w0 = _mm256_loadu_ps(weight + 0 * F); - s0 = _mm256_set1_ps(src0[offset]); - sums[0][0] = _mm256_fmadd_ps(s0, w0, sums[0][0]); - s0 = _mm256_set1_ps(src1[offset]); - sums[1][0] = _mm256_fmadd_ps(s0, w0, sums[1][0]); - weight += dstC; - } - } - } - - template<::SimdConvolutionActivationType type> - SIMD_INLINE void KernelHwcDefaultBody2(const float * src, const ConvParam32f & p, const float * weight, const float * bias, const float * params, float * dst) - { - size_t dstC = p.dstC; - size_t dstCF1 = AlignLo(dstC, 1 * F); - size_t dstCF2 = AlignLo(dstC, 2 * F); - size_t dc = 0; - for (; dc < dstCF2; dc += 2 * F) - { - __m256 sums[2][2]; - __m256 bias0 = bias ? _mm256_loadu_ps(bias + dc + 0 * F) : _mm256_setzero_ps(); - __m256 bias1 = bias ? _mm256_loadu_ps(bias + dc + 1 * F) : _mm256_setzero_ps(); - sums[0][0] = bias0; - sums[0][1] = bias1; - sums[1][0] = bias0; - sums[1][1] = bias1; - KernelHwcDefaultBody2x2(src, p, weight + dc, sums); - _mm256_storeu_ps(dst + dc + 0 * dstC + 0 * F, Activate(sums[0][0], params, dc + 0 * F)); - _mm256_storeu_ps(dst + dc + 0 * dstC + 1 * F, Activate(sums[0][1], params, dc + 1 * F)); - _mm256_storeu_ps(dst + dc + 1 * dstC + 0 * F, Activate(sums[1][0], params, dc + 0 * F)); - _mm256_storeu_ps(dst + dc + 1 * dstC + 1 * F, Activate(sums[1][1], params, dc + 1 * F)); - } - for (; dc < dstCF1; dc += 1 * F) - { - __m256 sums[2][1]; - __m256 bias0 = bias ? _mm256_loadu_ps(bias + dc) : _mm256_setzero_ps(); - sums[0][0] = bias0; - sums[1][0] = bias0; - KernelHwcDefaultBody2x1(src, p, weight + dc, sums); - _mm256_storeu_ps(dst + dc + 0 * dstC, Activate(sums[0][0], params, dc)); - _mm256_storeu_ps(dst + dc + 1 * dstC, Activate(sums[1][0], params, dc)); - } - if (dc < dstC) - { - dc = dstC - F; - __m256 sums[2][1]; - __m256 bias0 = bias ? 
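The KernelHwcDefaultBody2x* helpers above amortize weight loads across two neighbouring output pixels: each weight vector is loaded once and consumed by two FMAs against broadcast source values. A scalar sketch of the 2x1 shape, with blockC standing in for the F = 8 lane width (the kernelY/offset bookkeeping is flattened into one loop):

#include <cstddef>

inline void HwcBody2x1Ref(const float* src0, const float* src1,
                          const float* weight, size_t size, size_t dstC,
                          size_t blockC, float* sum0, float* sum1)
{
    for (size_t i = 0; i < size; ++i, weight += dstC) // weight rows stride by dstC
        for (size_t dc = 0; dc < blockC; ++dc)
        {
            sum0[dc] += src0[i] * weight[dc];         // one weight read,
            sum1[dc] += src1[i] * weight[dc];         // two accumulations
        }
}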
_mm256_loadu_ps(bias + dc) : _mm256_setzero_ps(); - sums[0][0] = bias0; - sums[1][0] = bias0; - KernelHwcDefaultBody2x1(src, p, weight + dc, sums); - _mm256_storeu_ps(dst + dc + 0 * dstC, Activate(sums[0][0], params, dc)); - _mm256_storeu_ps(dst + dc + 1 * dstC, Activate(sums[1][0], params, dc)); - } - } - - SIMD_INLINE void KernelHwcDefaultBody6x2(const float * src, const ConvParam32f & p, const float * weight, __m256 sums[6][2]) - { - size_t size = p.kernelX * p.srcC, dstC = p.dstC, stride = p.srcW * p.srcC, step = p.srcC * p.strideX; - const float * src0 = src + 0 * step; - const float * src1 = src + 1 * step; - const float * src2 = src + 2 * step; - const float * src3 = src + 3 * step; - const float * src4 = src + 4 * step; - const float * src5 = src + 5 * step; - __m256 w0, w1, s0; - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t offset = ky * stride; - for (size_t end = offset + size; offset < end; ++offset) - { - w0 = _mm256_loadu_ps(weight + 0 * F); - w1 = _mm256_loadu_ps(weight + 1 * F); - s0 = _mm256_set1_ps(src0[offset]); - sums[0][0] = _mm256_fmadd_ps(s0, w0, sums[0][0]); - sums[0][1] = _mm256_fmadd_ps(s0, w1, sums[0][1]); - s0 = _mm256_set1_ps(src1[offset]); - sums[1][0] = _mm256_fmadd_ps(s0, w0, sums[1][0]); - sums[1][1] = _mm256_fmadd_ps(s0, w1, sums[1][1]); - s0 = _mm256_set1_ps(src2[offset]); - sums[2][0] = _mm256_fmadd_ps(s0, w0, sums[2][0]); - sums[2][1] = _mm256_fmadd_ps(s0, w1, sums[2][1]); - s0 = _mm256_set1_ps(src3[offset]); - sums[3][0] = _mm256_fmadd_ps(s0, w0, sums[3][0]); - sums[3][1] = _mm256_fmadd_ps(s0, w1, sums[3][1]); - s0 = _mm256_set1_ps(src4[offset]); - sums[4][0] = _mm256_fmadd_ps(s0, w0, sums[4][0]); - sums[4][1] = _mm256_fmadd_ps(s0, w1, sums[4][1]); - s0 = _mm256_set1_ps(src5[offset]); - sums[5][0] = _mm256_fmadd_ps(s0, w0, sums[5][0]); - sums[5][1] = _mm256_fmadd_ps(s0, w1, sums[5][1]); - weight += dstC; - } - } - } - - SIMD_INLINE void KernelHwcDefaultBody6x1(const float * src, const ConvParam32f & p, const float * weight, __m256 sums[6][1]) - { - size_t size = p.kernelX * p.srcC, dstC = p.dstC, stride = p.srcW * p.srcC, step = p.srcC * p.strideX; - const float * src0 = src + 0 * step; - const float * src1 = src + 1 * step; - const float * src2 = src + 2 * step; - const float * src3 = src + 3 * step; - const float * src4 = src + 4 * step; - const float * src5 = src + 5 * step; - __m256 w0, s0; - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t offset = ky * stride; - for (size_t end = offset + size; offset < end; ++offset) - { - w0 = _mm256_loadu_ps(weight + 0 * F); - s0 = _mm256_set1_ps(src0[offset]); - sums[0][0] = _mm256_fmadd_ps(s0, w0, sums[0][0]); - s0 = _mm256_set1_ps(src1[offset]); - sums[1][0] = _mm256_fmadd_ps(s0, w0, sums[1][0]); - s0 = _mm256_set1_ps(src2[offset]); - sums[2][0] = _mm256_fmadd_ps(s0, w0, sums[2][0]); - s0 = _mm256_set1_ps(src3[offset]); - sums[3][0] = _mm256_fmadd_ps(s0, w0, sums[3][0]); - s0 = _mm256_set1_ps(src4[offset]); - sums[4][0] = _mm256_fmadd_ps(s0, w0, sums[4][0]); - s0 = _mm256_set1_ps(src5[offset]); - sums[5][0] = _mm256_fmadd_ps(s0, w0, sums[5][0]); - weight += dstC; - } - } - } - - template<::SimdConvolutionActivationType type> - SIMD_INLINE void KernelHwcDefaultBody6(const float * src, const ConvParam32f & p, const float * weight, const float * bias, const float * params, float * dst) - { - size_t dstC = p.dstC; - size_t dstCF1 = AlignLo(dstC, 1 * F); - size_t dstCF2 = AlignLo(dstC, 2 * F); - size_t dc = 0; - for (; dc < dstCF2; dc += 2 * F) - { - __m256 sums[6][2]; - __m256 bias0 = bias ? 
_mm256_loadu_ps(bias + dc + 0 * F) : _mm256_setzero_ps(); - __m256 bias1 = bias ? _mm256_loadu_ps(bias + dc + 1 * F) : _mm256_setzero_ps(); - sums[0][0] = bias0; - sums[0][1] = bias1; - sums[1][0] = bias0; - sums[1][1] = bias1; - sums[2][0] = bias0; - sums[2][1] = bias1; - sums[3][0] = bias0; - sums[3][1] = bias1; - sums[4][0] = bias0; - sums[4][1] = bias1; - sums[5][0] = bias0; - sums[5][1] = bias1; - KernelHwcDefaultBody6x2(src, p, weight + dc, sums); - _mm256_storeu_ps(dst + dc + 0 * dstC + 0 * F, Activate(sums[0][0], params, dc + 0 * F)); - _mm256_storeu_ps(dst + dc + 0 * dstC + 1 * F, Activate(sums[0][1], params, dc + 1 * F)); - _mm256_storeu_ps(dst + dc + 1 * dstC + 0 * F, Activate(sums[1][0], params, dc + 0 * F)); - _mm256_storeu_ps(dst + dc + 1 * dstC + 1 * F, Activate(sums[1][1], params, dc + 1 * F)); - _mm256_storeu_ps(dst + dc + 2 * dstC + 0 * F, Activate(sums[2][0], params, dc + 0 * F)); - _mm256_storeu_ps(dst + dc + 2 * dstC + 1 * F, Activate(sums[2][1], params, dc + 1 * F)); - _mm256_storeu_ps(dst + dc + 3 * dstC + 0 * F, Activate(sums[3][0], params, dc + 0 * F)); - _mm256_storeu_ps(dst + dc + 3 * dstC + 1 * F, Activate(sums[3][1], params, dc + 1 * F)); - _mm256_storeu_ps(dst + dc + 4 * dstC + 0 * F, Activate(sums[4][0], params, dc + 0 * F)); - _mm256_storeu_ps(dst + dc + 4 * dstC + 1 * F, Activate(sums[4][1], params, dc + 1 * F)); - _mm256_storeu_ps(dst + dc + 5 * dstC + 0 * F, Activate(sums[5][0], params, dc + 0 * F)); - _mm256_storeu_ps(dst + dc + 5 * dstC + 1 * F, Activate(sums[5][1], params, dc + 1 * F)); - } - for (; dc < dstCF1; dc += 1 * F) - { - __m256 sums[6][1]; - __m256 bias0 = bias ? _mm256_loadu_ps(bias + dc) : _mm256_setzero_ps(); - sums[0][0] = bias0; - sums[1][0] = bias0; - sums[2][0] = bias0; - sums[3][0] = bias0; - sums[4][0] = bias0; - sums[5][0] = bias0; - KernelHwcDefaultBody6x1(src, p, weight + dc, sums); - _mm256_storeu_ps(dst + dc + 0 * dstC, Activate(sums[0][0], params, dc)); - _mm256_storeu_ps(dst + dc + 1 * dstC, Activate(sums[1][0], params, dc)); - _mm256_storeu_ps(dst + dc + 2 * dstC, Activate(sums[2][0], params, dc)); - _mm256_storeu_ps(dst + dc + 3 * dstC, Activate(sums[3][0], params, dc)); - _mm256_storeu_ps(dst + dc + 4 * dstC, Activate(sums[4][0], params, dc)); - _mm256_storeu_ps(dst + dc + 5 * dstC, Activate(sums[5][0], params, dc)); - } - if (dc < dstC) - { - dc = dstC - F; - __m256 sums[6][1]; - __m256 bias0 = bias ? 
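The 6x2 variant above is the register-pressure sweet spot for AVX2: twelve accumulators, two weight vectors, and one broadcast value fill fifteen of the sixteen ymm registers, which is presumably why the blocking stops at six pixels. A compile-time note of that budget:

static_assert(6 * 2 + 2 + 1 <= 16, "6x2 sums + 2 weights + 1 broadcast fit the ymm file");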
_mm256_loadu_ps(bias + dc) : _mm256_setzero_ps(); - sums[0][0] = bias0; - sums[1][0] = bias0; - sums[2][0] = bias0; - sums[3][0] = bias0; - sums[4][0] = bias0; - sums[5][0] = bias0; - KernelHwcDefaultBody6x1(src, p, weight + dc, sums); - _mm256_storeu_ps(dst + dc + 0 * dstC, Activate(sums[0][0], params, dc)); - _mm256_storeu_ps(dst + dc + 1 * dstC, Activate(sums[1][0], params, dc)); - _mm256_storeu_ps(dst + dc + 2 * dstC, Activate(sums[2][0], params, dc)); - _mm256_storeu_ps(dst + dc + 3 * dstC, Activate(sums[3][0], params, dc)); - _mm256_storeu_ps(dst + dc + 4 * dstC, Activate(sums[4][0], params, dc)); - _mm256_storeu_ps(dst + dc + 5 * dstC, Activate(sums[5][0], params, dc)); - } - } - - SIMD_INLINE void KernelHwcDefaultBody4x3(const float * src, const ConvParam32f & p, const float * weight, __m256 sums[4][3]) - { - size_t size = p.kernelX * p.srcC, dstC = p.dstC, stride = p.srcW * p.srcC, step = p.srcC * p.strideX; - const float * src0 = src + 0 * step; - const float * src1 = src + 1 * step; - const float * src2 = src + 2 * step; - const float * src3 = src + 3 * step; - __m256 w0, w1, w2, s0; - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t offset = ky * stride; - for (size_t end = offset + size; offset < end; ++offset) - { - w0 = _mm256_loadu_ps(weight + 0 * F); - w1 = _mm256_loadu_ps(weight + 1 * F); - w2 = _mm256_loadu_ps(weight + 2 * F); - s0 = _mm256_set1_ps(src0[offset]); - sums[0][0] = _mm256_fmadd_ps(s0, w0, sums[0][0]); - sums[0][1] = _mm256_fmadd_ps(s0, w1, sums[0][1]); - sums[0][2] = _mm256_fmadd_ps(s0, w2, sums[0][2]); - s0 = _mm256_set1_ps(src1[offset]); - sums[1][0] = _mm256_fmadd_ps(s0, w0, sums[1][0]); - sums[1][1] = _mm256_fmadd_ps(s0, w1, sums[1][1]); - sums[1][2] = _mm256_fmadd_ps(s0, w2, sums[1][2]); - s0 = _mm256_set1_ps(src2[offset]); - sums[2][0] = _mm256_fmadd_ps(s0, w0, sums[2][0]); - sums[2][1] = _mm256_fmadd_ps(s0, w1, sums[2][1]); - sums[2][2] = _mm256_fmadd_ps(s0, w2, sums[2][2]); - s0 = _mm256_set1_ps(src3[offset]); - sums[3][0] = _mm256_fmadd_ps(s0, w0, sums[3][0]); - sums[3][1] = _mm256_fmadd_ps(s0, w1, sums[3][1]); - sums[3][2] = _mm256_fmadd_ps(s0, w2, sums[3][2]); - weight += dstC; - } - } - } - - SIMD_INLINE void KernelHwcDefaultBody4x1(const float * src, const ConvParam32f & p, const float * weight, __m256 sums[4][1]) - { - size_t size = p.kernelX * p.srcC, dstC = p.dstC, stride = p.srcW * p.srcC, step = p.srcC * p.strideX; - const float * src0 = src + 0 * step; - const float * src1 = src + 1 * step; - const float * src2 = src + 2 * step; - const float * src3 = src + 3 * step; - __m256 w0, s0; - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t offset = ky * stride; - for (size_t end = offset + size; offset < end; ++offset) - { - w0 = _mm256_loadu_ps(weight + 0 * F); - s0 = _mm256_set1_ps(src0[offset]); - sums[0][0] = _mm256_fmadd_ps(s0, w0, sums[0][0]); - s0 = _mm256_set1_ps(src1[offset]); - sums[1][0] = _mm256_fmadd_ps(s0, w0, sums[1][0]); - s0 = _mm256_set1_ps(src2[offset]); - sums[2][0] = _mm256_fmadd_ps(s0, w0, sums[2][0]); - s0 = _mm256_set1_ps(src3[offset]); - sums[3][0] = _mm256_fmadd_ps(s0, w0, sums[3][0]); - weight += dstC; - } - } - } - - template<::SimdConvolutionActivationType type> - SIMD_INLINE void KernelHwcDefaultBody4(const float * src, const ConvParam32f & p, const float * weight, const float * bias, const float * params, float * dst) - { - size_t dstC = p.dstC; - size_t dstCF1 = AlignLo(dstC, 1 * F); - size_t dstCF3 = AlignLoAny(dstC, 3 * F); - size_t dc = 0; - for (; dc < dstCF3; dc += 3 * F) - { - __m256 sums[4][3]; 
- __m256 bias0 = bias ? _mm256_loadu_ps(bias + dc + 0 * F) : _mm256_setzero_ps(); - __m256 bias1 = bias ? _mm256_loadu_ps(bias + dc + 1 * F) : _mm256_setzero_ps(); - __m256 bias2 = bias ? _mm256_loadu_ps(bias + dc + 2 * F) : _mm256_setzero_ps(); - sums[0][0] = bias0; - sums[0][1] = bias1; - sums[0][2] = bias2; - sums[1][0] = bias0; - sums[1][1] = bias1; - sums[1][2] = bias2; - sums[2][0] = bias0; - sums[2][1] = bias1; - sums[2][2] = bias2; - sums[3][0] = bias0; - sums[3][1] = bias1; - sums[3][2] = bias2; - KernelHwcDefaultBody4x3(src, p, weight + dc, sums); - _mm256_storeu_ps(dst + dc + 0 * dstC + 0 * F, Activate(sums[0][0], params, dc + 0 * F)); - _mm256_storeu_ps(dst + dc + 0 * dstC + 1 * F, Activate(sums[0][1], params, dc + 1 * F)); - _mm256_storeu_ps(dst + dc + 0 * dstC + 2 * F, Activate(sums[0][2], params, dc + 2 * F)); - _mm256_storeu_ps(dst + dc + 1 * dstC + 0 * F, Activate(sums[1][0], params, dc + 0 * F)); - _mm256_storeu_ps(dst + dc + 1 * dstC + 1 * F, Activate(sums[1][1], params, dc + 1 * F)); - _mm256_storeu_ps(dst + dc + 1 * dstC + 2 * F, Activate(sums[1][2], params, dc + 2 * F)); - _mm256_storeu_ps(dst + dc + 2 * dstC + 0 * F, Activate(sums[2][0], params, dc + 0 * F)); - _mm256_storeu_ps(dst + dc + 2 * dstC + 1 * F, Activate(sums[2][1], params, dc + 1 * F)); - _mm256_storeu_ps(dst + dc + 2 * dstC + 2 * F, Activate(sums[2][2], params, dc + 2 * F)); - _mm256_storeu_ps(dst + dc + 3 * dstC + 0 * F, Activate(sums[3][0], params, dc + 0 * F)); - _mm256_storeu_ps(dst + dc + 3 * dstC + 1 * F, Activate(sums[3][1], params, dc + 1 * F)); - _mm256_storeu_ps(dst + dc + 3 * dstC + 2 * F, Activate(sums[3][2], params, dc + 2 * F)); - } - for (; dc < dstCF1; dc += 1 * F) - { - __m256 sums[4][1]; - __m256 bias0 = bias ? _mm256_loadu_ps(bias + dc) : _mm256_setzero_ps(); - sums[0][0] = bias0; - sums[1][0] = bias0; - sums[2][0] = bias0; - sums[3][0] = bias0; - KernelHwcDefaultBody4x1(src, p, weight + dc, sums); - _mm256_storeu_ps(dst + dc + 0 * dstC, Activate(sums[0][0], params, dc)); - _mm256_storeu_ps(dst + dc + 1 * dstC, Activate(sums[1][0], params, dc)); - _mm256_storeu_ps(dst + dc + 2 * dstC, Activate(sums[2][0], params, dc)); - _mm256_storeu_ps(dst + dc + 3 * dstC, Activate(sums[3][0], params, dc)); - } - if (dc < dstC) - { - dc = dstC - F; - __m256 sums[4][1]; - __m256 bias0 = bias ? _mm256_loadu_ps(bias + dc) : _mm256_setzero_ps(); - sums[0][0] = bias0; - sums[1][0] = bias0; - sums[2][0] = bias0; - sums[3][0] = bias0; - KernelHwcDefaultBody4x1(src, p, weight + dc, sums); - _mm256_storeu_ps(dst + dc + 0 * dstC, Activate(sums[0][0], params, dc)); - _mm256_storeu_ps(dst + dc + 1 * dstC, Activate(sums[1][0], params, dc)); - _mm256_storeu_ps(dst + dc + 2 * dstC, Activate(sums[2][0], params, dc)); - _mm256_storeu_ps(dst + dc + 3 * dstC, Activate(sums[3][0], params, dc)); - } - } - - template<::SimdConvolutionActivationType type> - SIMD_INLINE void KernelHwcDefaultBody6_1x1x8(const float * src, const ConvParam32f & p, const float * weight, const float * bias, const float * params, float * dst) - { - size_t size = p.srcC, step = p.srcC * p.strideX; - const float * src0 = src + 0 * step; - const float * src1 = src + 1 * step; - const float * src2 = src + 2 * step; - const float * src3 = src + 3 * step; - const float * src4 = src + 4 * step; - const float * src5 = src + 5 * step; - __m256 w0, w1, s0, s1; - __m256 sums[6]; - __m256 bias0 = bias ? 
_mm256_loadu_ps(bias) : _mm256_setzero_ps(); - sums[0] = bias0; - sums[1] = bias0; - sums[2] = bias0; - sums[3] = bias0; - sums[4] = bias0; - sums[5] = bias0; - size_t offset = 0, size2 = size & (~1); - for (; offset < size2; offset += 2) - { - w0 = _mm256_loadu_ps(weight + 0 * F); - w1 = _mm256_loadu_ps(weight + 1 * F); - s0 = _mm256_set1_ps(src0[offset + 0]); - s1 = _mm256_set1_ps(src1[offset + 0]); - sums[0] = _mm256_fmadd_ps(s0, w0, sums[0]); - sums[1] = _mm256_fmadd_ps(s1, w0, sums[1]); - s0 = _mm256_set1_ps(src0[offset + 1]); - s1 = _mm256_set1_ps(src1[offset + 1]); - sums[0] = _mm256_fmadd_ps(s0, w1, sums[0]); - sums[1] = _mm256_fmadd_ps(s1, w1, sums[1]); - s0 = _mm256_set1_ps(src2[offset + 0]); - s1 = _mm256_set1_ps(src3[offset + 0]); - sums[2] = _mm256_fmadd_ps(s0, w0, sums[2]); - sums[3] = _mm256_fmadd_ps(s1, w0, sums[3]); - s0 = _mm256_set1_ps(src2[offset + 1]); - s1 = _mm256_set1_ps(src3[offset + 1]); - sums[2] = _mm256_fmadd_ps(s0, w1, sums[2]); - sums[3] = _mm256_fmadd_ps(s1, w1, sums[3]); - s0 = _mm256_set1_ps(src4[offset + 0]); - s1 = _mm256_set1_ps(src5[offset + 0]); - sums[4] = _mm256_fmadd_ps(s0, w0, sums[4]); - sums[5] = _mm256_fmadd_ps(s1, w0, sums[5]); - s0 = _mm256_set1_ps(src4[offset + 1]); - s1 = _mm256_set1_ps(src5[offset + 1]); - sums[4] = _mm256_fmadd_ps(s0, w1, sums[4]); - sums[5] = _mm256_fmadd_ps(s1, w1, sums[5]); - weight += 2*F; - } - for (; offset < size; ++offset) - { - w0 = _mm256_loadu_ps(weight + 0 * F); - s0 = _mm256_set1_ps(src0[offset]); - s1 = _mm256_set1_ps(src1[offset]); - sums[0] = _mm256_fmadd_ps(s0, w0, sums[0]); - sums[1] = _mm256_fmadd_ps(s1, w0, sums[1]); - s0 = _mm256_set1_ps(src2[offset]); - s1 = _mm256_set1_ps(src3[offset]); - sums[2] = _mm256_fmadd_ps(s0, w0, sums[2]); - sums[3] = _mm256_fmadd_ps(s1, w0, sums[3]); - s0 = _mm256_set1_ps(src4[offset]); - s1 = _mm256_set1_ps(src5[offset]); - sums[4] = _mm256_fmadd_ps(s0, w0, sums[4]); - sums[5] = _mm256_fmadd_ps(s1, w0, sums[5]); - weight += F; - } - _mm256_storeu_ps(dst + 0 * F, Activate(sums[0], params, 0)); - _mm256_storeu_ps(dst + 1 * F, Activate(sums[1], params, 0)); - _mm256_storeu_ps(dst + 2 * F, Activate(sums[2], params, 0)); - _mm256_storeu_ps(dst + 3 * F, Activate(sums[3], params, 0)); - _mm256_storeu_ps(dst + 4 * F, Activate(sums[4], params, 0)); - _mm256_storeu_ps(dst + 5 * F, Activate(sums[5], params, 0)); - } - - template<::SimdConvolutionActivationType type> void ConvolutionDirectNhwcConvolutionBiasActivationDefault(const float * src, const ConvParam32f & p, const float * weight, const float * bias, const float * params, float * dst) - { - bool is1x1x8 = p.dstC == 8 && p.kernelX == 1 && p.kernelY == 1; - size_t noseH = p.padY, noseW = p.padX; - size_t bodyH = p.srcH - p.kernelY + 1 + noseH, bodyW = p.srcW - p.kernelX + 1 + noseW; - size_t tailH = bodyH + p.padH, tailW = bodyW + p.padW; - size_t bodyW2 = AlignLoAny(bodyW - noseW, 2 * p.strideX) + noseW; - size_t bodyW4 = AlignLoAny(bodyW - noseW, 4 * p.strideX) + noseW; - size_t bodyW6 = AlignLoAny(bodyW - noseW, 6 * p.strideX) + noseW; - size_t wS = p.srcC*p.dstC; - size_t kY = p.kernelY - noseH, kX = p.kernelX - noseW, kH = bodyH + p.kernelY - 1, kW = bodyW + p.kernelX - 1; - size_t sy = 0; - for (; sy < noseH; sy += p.strideY) - { - size_t sx = 0; - const float * w = weight + (noseH - sy) * p.kernelY * wS; - for (; sx < noseW; sx += p.strideX, dst += p.dstC) - KernelHwcDefaultEdge(src, p, kY + sy, kX + sx, w + (noseW - sx)*wS, bias, params, dst); - for (; sx < bodyW; sx += p.strideX, dst += p.dstC) - 
KernelHwcDefaultEdge(src + (sx - noseW) * p.srcC, p, kY + sy, p.kernelX, w, bias, params, dst); - for (; sx < tailW; sx += p.strideX, dst += p.dstC) - KernelHwcDefaultEdge(src + (sx - noseW) * p.srcC, p, kY + sy, kW - sx, w, bias, params, dst); - } - src += (sy - noseH)*p.srcW*p.srcC; - for (; sy < bodyH; sy += p.strideY) - { - size_t sx = 0; - for (; sx < noseW; sx += p.strideX, dst += p.dstC) - KernelHwcDefaultEdge(src, p, p.kernelY, kX + sx, weight + (noseW - sx)*wS, bias, params, dst); - if (is1x1x8) - { - for (; sx < bodyW6; sx += 6 * p.strideX, dst += 6 * p.dstC) - KernelHwcDefaultBody6_1x1x8(src + (sx - noseW) * p.srcC, p, weight, bias, params, dst); - } - else if (p.dstC%24 == 0) - { - for (; sx < bodyW4; sx += 4 * p.strideX, dst += 4 * p.dstC) - KernelHwcDefaultBody4(src + (sx - noseW) * p.srcC, p, weight, bias, params, dst); - } - else - { - for (; sx < bodyW6; sx += 6 * p.strideX, dst += 6 * p.dstC) - KernelHwcDefaultBody6(src + (sx - noseW) * p.srcC, p, weight, bias, params, dst); - } - for (; sx < bodyW2; sx += 2 * p.strideX, dst += 2 * p.dstC) - KernelHwcDefaultBody2(src + (sx - noseW) * p.srcC, p, weight, bias, params, dst); - for (; sx < bodyW; sx += p.strideX, dst += p.dstC) - KernelHwcDefaultEdge(src + (sx - noseW) * p.srcC, p, p.kernelY, p.kernelX, weight, bias, params, dst); - for (; sx < tailW; sx += p.strideX, dst += p.dstC) - KernelHwcDefaultEdge(src + (sx - noseW) * p.srcC, p, p.kernelY, kW - sx, weight, bias, params, dst); - src += p.strideY*p.srcW*p.srcC; - } - for (; sy < tailH; sy += p.strideY) - { - size_t sx = 0; - for (; sx < noseW; sx += p.strideX, dst += p.dstC) - KernelHwcDefaultEdge(src, p, kH - sy, kX + sx, weight + (noseW - sx)*wS, bias, params, dst); - for (; sx < bodyW; sx += p.strideX, dst += p.dstC) - KernelHwcDefaultEdge(src + (sx - noseW) * p.srcC, p, kH - sy, p.kernelX, weight, bias, params, dst); - for (; sx < tailW; sx += p.strideX, dst += p.dstC) - KernelHwcDefaultEdge(src + (sx - noseW) * p.srcC, p, kH - sy, kW - sx, weight, bias, params, dst); - src += p.strideY*p.srcW*p.srcC; - } - } - - template<::SimdConvolutionActivationType type> - SIMD_INLINE void ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge(const float * src, const ConvParam32f & p, size_t dy, size_t dx, const float * weight, const float * bias, const float * params, float * dst) - { - size_t srcC = p.srcC; - size_t srcCF = AlignLo(srcC, F); - size_t c = 0; - for (; c < srcCF; c += F) - { - __m256 sum = bias ? _mm256_loadu_ps(bias + c) : _mm256_setzero_ps(); - for (size_t ky = 0; ky < 3; ++ky) - { - size_t sy = dy * p.strideY + ky - p.padY; - if (sy < p.srcH) - { - for (size_t kx = 0; kx < 3; ++kx) - { - size_t sx = dx * p.strideX + kx - p.padX; - if (sx < p.srcW) - { - const float * pw = weight + (ky * 3 + kx) * srcC; - const float * ps = src + (sy*p.srcW + sx) * srcC; - sum = _mm256_fmadd_ps(_mm256_loadu_ps(ps), _mm256_loadu_ps(pw), sum); - } - } - } - } - _mm256_storeu_ps(dst + c, Activate(sum, params, c)); - src += F; - weight += F; - } - if (c < srcC) - { - c = p.srcC - F; - src -= srcCF - c; - weight -= srcCF - c; - __m256 sum = bias ? 
_mm256_loadu_ps(bias + c) : _mm256_setzero_ps(); - for (size_t ky = 0; ky < 3; ++ky) - { - size_t sy = dy * p.strideY + ky - p.padY; - if (sy < p.srcH) - { - for (size_t kx = 0; kx < 3; ++kx) - { - size_t sx = dx * p.strideX + kx - p.padX; - if (sx < p.srcW) - { - const float * pw = weight + (ky * 3 + kx) * srcC; - const float * ps = src + (sy*p.srcW + sx) * srcC; - sum = _mm256_fmadd_ps(_mm256_loadu_ps(ps), _mm256_loadu_ps(pw), sum); - } - } - } - } - _mm256_storeu_ps(dst + c, Activate(sum, params, c)); - } - } - - template<::SimdConvolutionActivationType type> - SIMD_INLINE void ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main1(const float * src, size_t srcS, size_t srcC, const float * weight, const float * bias, const float * params, float * dst) - { - size_t srcCF = AlignLo(srcC, F); - size_t c = 0; - for (; c < srcCF; c += F) - { - __m256 sum = bias ? _mm256_loadu_ps(bias + c) : _mm256_setzero_ps(); - for (size_t ky = 0; ky < 3; ++ky) - { - const float * ps = src + ky * srcS; - const float * pw = weight + ky * 3 * srcC; - sum = _mm256_fmadd_ps(_mm256_loadu_ps(ps + 0 * srcC), _mm256_loadu_ps(pw + 0 * srcC), sum); - sum = _mm256_fmadd_ps(_mm256_loadu_ps(ps + 1 * srcC), _mm256_loadu_ps(pw + 1 * srcC), sum); - sum = _mm256_fmadd_ps(_mm256_loadu_ps(ps + 2 * srcC), _mm256_loadu_ps(pw + 2 * srcC), sum); - } - _mm256_storeu_ps(dst + c, Activate(sum, params, c)); - src += F; - weight += F; - } - if (c < srcC) - { - c = srcC - F; - src -= srcCF - c; - weight -= srcCF - c; - __m256 sum = bias ? _mm256_loadu_ps(bias + c) : _mm256_setzero_ps(); - for (size_t ky = 0; ky < 3; ++ky) - { - const float * ps = src + ky * srcS; - const float * pw = weight + ky * 3 * srcC; - sum = _mm256_fmadd_ps(_mm256_loadu_ps(ps + 0 * srcC), _mm256_loadu_ps(pw + 0 * srcC), sum); - sum = _mm256_fmadd_ps(_mm256_loadu_ps(ps + 1 * srcC), _mm256_loadu_ps(pw + 1 * srcC), sum); - sum = _mm256_fmadd_ps(_mm256_loadu_ps(ps + 2 * srcC), _mm256_loadu_ps(pw + 2 * srcC), sum); - } - _mm256_storeu_ps(dst + c, Activate(sum, params, c)); - } - } - - template<::SimdConvolutionActivationType type> - SIMD_INLINE void ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main2(const float * src, size_t srcS, size_t srcX, size_t srcC, const float * weight, const float * bias, const float * params, float * dst) - { - size_t srcCF = AlignLo(srcC, F); - size_t c = 0; - __m256 sum0, sum1, w0; - for (; c < srcCF; c += F) - { - sum0 = bias ? _mm256_loadu_ps(bias + c) : _mm256_setzero_ps(); - sum1 = sum0; - const float * pw = weight + c; - for (size_t ky = 0; ky < 3; ++ky) - { - const float * ps0 = src + ky * srcS; - const float * ps1 = ps0 + srcX; - w0 = _mm256_loadu_ps(pw); - sum0 = _mm256_fmadd_ps(_mm256_loadu_ps(ps0 + 0 * srcC), w0, sum0); - sum1 = _mm256_fmadd_ps(_mm256_loadu_ps(ps1 + 0 * srcC), w0, sum1); - pw += srcC; - w0 = _mm256_loadu_ps(pw); - sum0 = _mm256_fmadd_ps(_mm256_loadu_ps(ps0 + 1 * srcC), w0, sum0); - sum1 = _mm256_fmadd_ps(_mm256_loadu_ps(ps1 + 1 * srcC), w0, sum1); - pw += srcC; - w0 = _mm256_loadu_ps(pw); - sum0 = _mm256_fmadd_ps(_mm256_loadu_ps(ps0 + 2 * srcC), w0, sum0); - sum1 = _mm256_fmadd_ps(_mm256_loadu_ps(ps1 + 2 * srcC), w0, sum1); - pw += srcC; - } - _mm256_storeu_ps(dst + c, Activate(sum0, params, c)); - _mm256_storeu_ps(dst + c + srcC, Activate(sum1, params, c)); - src += F; - } - if (c < srcC) - { - c = srcC - F; - src -= srcCF - c; - sum0 = bias ? 
_mm256_loadu_ps(bias + c) : _mm256_setzero_ps(); - sum1 = sum0; - const float * pw = weight + c; - for (size_t ky = 0; ky < 3; ++ky) - { - const float * ps0 = src + ky * srcS; - const float * ps1 = ps0 + srcX; - w0 = _mm256_loadu_ps(pw); - sum0 = _mm256_fmadd_ps(_mm256_loadu_ps(ps0 + 0 * srcC), w0, sum0); - sum1 = _mm256_fmadd_ps(_mm256_loadu_ps(ps1 + 0 * srcC), w0, sum1); - pw += srcC; - w0 = _mm256_loadu_ps(pw); - sum0 = _mm256_fmadd_ps(_mm256_loadu_ps(ps0 + 1 * srcC), w0, sum0); - sum1 = _mm256_fmadd_ps(_mm256_loadu_ps(ps1 + 1 * srcC), w0, sum1); - pw += srcC; - w0 = _mm256_loadu_ps(pw); - sum0 = _mm256_fmadd_ps(_mm256_loadu_ps(ps0 + 2 * srcC), w0, sum0); - sum1 = _mm256_fmadd_ps(_mm256_loadu_ps(ps1 + 2 * srcC), w0, sum1); - pw += srcC; - } - _mm256_storeu_ps(dst + c, Activate(sum0, params, c)); - _mm256_storeu_ps(dst + c + srcC, Activate(sum1, params, c)); - } - } - - template<::SimdConvolutionActivationType type> - SIMD_INLINE void ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4(const float * src, size_t srcS, size_t srcX, size_t srcC, const float * weight, const float * bias, const float * params, float * dst) - { - size_t srcCF = AlignLo(srcC, F); - size_t c = 0; - for (; c < srcCF; c += F) - { - __m256 sum0, sum1, sum2, sum3, w0; - sum0 = bias ? _mm256_loadu_ps(bias + c) : _mm256_setzero_ps(); - sum1 = sum0; - sum2 = sum0; - sum3 = sum0; - const float * pw = weight + c; - const float * ps0 = src + 0 * srcX; - const float * ps1 = src + 1 * srcX; - const float * ps2 = src + 2 * srcX; - const float * ps3 = src + 3 * srcX; - for (size_t ky = 0; ky < 3; ++ky) - { - size_t offset = ky * srcS; - w0 = _mm256_loadu_ps(pw); - sum0 = _mm256_fmadd_ps(_mm256_loadu_ps(ps0 + offset), w0, sum0); - sum1 = _mm256_fmadd_ps(_mm256_loadu_ps(ps1 + offset), w0, sum1); - sum2 = _mm256_fmadd_ps(_mm256_loadu_ps(ps2 + offset), w0, sum2); - sum3 = _mm256_fmadd_ps(_mm256_loadu_ps(ps3 + offset), w0, sum3); - pw += srcC, offset += srcC; - w0 = _mm256_loadu_ps(pw); - sum0 = _mm256_fmadd_ps(_mm256_loadu_ps(ps0 + offset), w0, sum0); - sum1 = _mm256_fmadd_ps(_mm256_loadu_ps(ps1 + offset), w0, sum1); - sum2 = _mm256_fmadd_ps(_mm256_loadu_ps(ps2 + offset), w0, sum2); - sum3 = _mm256_fmadd_ps(_mm256_loadu_ps(ps3 + offset), w0, sum3); - pw += srcC, offset += srcC; - w0 = _mm256_loadu_ps(pw); - sum0 = _mm256_fmadd_ps(_mm256_loadu_ps(ps0 + offset), w0, sum0); - sum1 = _mm256_fmadd_ps(_mm256_loadu_ps(ps1 + offset), w0, sum1); - sum2 = _mm256_fmadd_ps(_mm256_loadu_ps(ps2 + offset), w0, sum2); - sum3 = _mm256_fmadd_ps(_mm256_loadu_ps(ps3 + offset), w0, sum3); - pw += srcC, offset += srcC; - } - _mm256_storeu_ps(dst + 0 * srcC, Activate(sum0, params, c)); - _mm256_storeu_ps(dst + 1 * srcC, Activate(sum1, params, c)); - _mm256_storeu_ps(dst + 2 * srcC, Activate(sum2, params, c)); - _mm256_storeu_ps(dst + 3 * srcC, Activate(sum3, params, c)); - src += F; - dst += F; - } - if (c < srcC) - { - c = srcC - F; - src -= srcCF - c; - dst -= srcCF - c; - __m256 sum0, sum1, sum2, sum3, w0; - sum0 = bias ? 
_mm256_loadu_ps(bias + c) : _mm256_setzero_ps(); - sum1 = sum0; - sum2 = sum0; - sum3 = sum0; - const float * pw = weight + c; - const float * ps0 = src + 0 * srcX; - const float * ps1 = src + 1 * srcX; - const float * ps2 = src + 2 * srcX; - const float * ps3 = src + 3 * srcX; - for (size_t ky = 0; ky < 3; ++ky) - { - size_t offset = ky * srcS; - w0 = _mm256_loadu_ps(pw); - sum0 = _mm256_fmadd_ps(_mm256_loadu_ps(ps0 + offset), w0, sum0); - sum1 = _mm256_fmadd_ps(_mm256_loadu_ps(ps1 + offset), w0, sum1); - sum2 = _mm256_fmadd_ps(_mm256_loadu_ps(ps2 + offset), w0, sum2); - sum3 = _mm256_fmadd_ps(_mm256_loadu_ps(ps3 + offset), w0, sum3); - pw += srcC, offset += srcC; - w0 = _mm256_loadu_ps(pw); - sum0 = _mm256_fmadd_ps(_mm256_loadu_ps(ps0 + offset), w0, sum0); - sum1 = _mm256_fmadd_ps(_mm256_loadu_ps(ps1 + offset), w0, sum1); - sum2 = _mm256_fmadd_ps(_mm256_loadu_ps(ps2 + offset), w0, sum2); - sum3 = _mm256_fmadd_ps(_mm256_loadu_ps(ps3 + offset), w0, sum3); - pw += srcC, offset += srcC; - w0 = _mm256_loadu_ps(pw); - sum0 = _mm256_fmadd_ps(_mm256_loadu_ps(ps0 + offset), w0, sum0); - sum1 = _mm256_fmadd_ps(_mm256_loadu_ps(ps1 + offset), w0, sum1); - sum2 = _mm256_fmadd_ps(_mm256_loadu_ps(ps2 + offset), w0, sum2); - sum3 = _mm256_fmadd_ps(_mm256_loadu_ps(ps3 + offset), w0, sum3); - pw += srcC, offset += srcC; - } - _mm256_storeu_ps(dst + 0 * srcC, Activate(sum0, params, c)); - _mm256_storeu_ps(dst + 1 * srcC, Activate(sum1, params, c)); - _mm256_storeu_ps(dst + 2 * srcC, Activate(sum2, params, c)); - _mm256_storeu_ps(dst + 3 * srcC, Activate(sum3, params, c)); - } - } - - template<::SimdConvolutionActivationType type> - SIMD_INLINE void ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge8(const float * src, const ConvParam32f & p, size_t dy, size_t dx, const __m256 * weight, __m256 bias, const float * params, float * dst) - { - __m256 sum = bias; - for (size_t ky = 0; ky < 3; ++ky) - { - size_t sy = dy * p.strideY + ky - p.padY; - if (sy < p.srcH) - { - for (size_t kx = 0; kx < 3; ++kx) - { - size_t sx = dx * p.strideX + kx - p.padX; - if (sx < p.srcW) - { - const float * ps = src + (sy*p.srcW + sx) * F; - sum = _mm256_fmadd_ps(_mm256_loadu_ps(ps), weight[ky * 3 + kx], sum); - } - } - } - } - _mm256_storeu_ps(dst, Activate(sum, params, 0)); - } - - template<::SimdConvolutionActivationType type> - SIMD_INLINE void ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main8x1(const float * src, size_t srcS, const __m256 * weight, __m256 bias, const float * params, float * dst) - { - __m256 sum = bias; - sum = _mm256_fmadd_ps(_mm256_loadu_ps(src + 0 * F), weight[0], sum); - sum = _mm256_fmadd_ps(_mm256_loadu_ps(src + 1 * F), weight[1], sum); - sum = _mm256_fmadd_ps(_mm256_loadu_ps(src + 2 * F), weight[2], sum); - src += srcS; - sum = _mm256_fmadd_ps(_mm256_loadu_ps(src + 0 * F), weight[3], sum); - sum = _mm256_fmadd_ps(_mm256_loadu_ps(src + 1 * F), weight[4], sum); - sum = _mm256_fmadd_ps(_mm256_loadu_ps(src + 2 * F), weight[5], sum); - src += srcS; - sum = _mm256_fmadd_ps(_mm256_loadu_ps(src + 0 * F), weight[6], sum); - sum = _mm256_fmadd_ps(_mm256_loadu_ps(src + 1 * F), weight[7], sum); - sum = _mm256_fmadd_ps(_mm256_loadu_ps(src + 2 * F), weight[8], sum); - _mm256_storeu_ps(dst, Activate(sum, params, 0)); - } - - template<::SimdConvolutionActivationType type> - SIMD_INLINE void ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main8x2(const float * src, size_t srcS, const __m256 * weight, __m256 bias, const float * params, float * dst) - { - __m256 sum0 = bias; - __m256 
sum1 = bias; - for (size_t ky = 0; ky < 3; ++ky) - { - __m256 s0 = _mm256_loadu_ps(src + 0 * F); - __m256 s1 = _mm256_loadu_ps(src + 1 * F); - __m256 s2 = _mm256_loadu_ps(src + 2 * F); - __m256 s3 = _mm256_loadu_ps(src + 3 * F); - sum0 = _mm256_fmadd_ps(s0, weight[0], sum0); - sum1 = _mm256_fmadd_ps(s1, weight[0], sum1); - sum0 = _mm256_fmadd_ps(s1, weight[1], sum0); - sum1 = _mm256_fmadd_ps(s2, weight[1], sum1); - sum0 = _mm256_fmadd_ps(s2, weight[2], sum0); - sum1 = _mm256_fmadd_ps(s3, weight[2], sum1); - src += srcS; - weight += 3; - } - _mm256_storeu_ps(dst + 0, Activate(sum0, params, 0)); - _mm256_storeu_ps(dst + F, Activate(sum1, params, 0)); - } - - template<::SimdConvolutionActivationType type> void ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3(const float * src, const ConvParam32f & p, const float * weight, const float * bias, const float * params, float * dst) - { - size_t srcS = p.srcC*p.srcW; - size_t srcX = p.srcC*p.strideX; - size_t dstH = p.dstH - p.padH; - size_t dstW = p.dstW - p.padW; - size_t dstW2 = AlignLo(dstW - p.padX, 2) + p.padX; - size_t dstW4 = AlignLo(dstW - p.padX, 4) + p.padX; - if (p.dstC == F && p.strideX == 1) - { - __m256 _weight[9]; - for (size_t i = 0; i < 9; ++i) - _weight[i] = _mm256_loadu_ps(weight + i * F); - __m256 _bias = bias ? _mm256_loadu_ps(bias) : _mm256_setzero_ps(); - size_t dy = 0; - for (; dy < p.padY; ++dy) - for (size_t dx = 0; dx < p.dstW; ++dx) - ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge8(src, p, dy, dx, _weight, _bias, params, dst), dst += F; - for (; dy < dstH; ++dy) - { - size_t dx = 0; - for (; dx < p.padX; ++dx) - ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge8(src, p, dy, dx, _weight, _bias, params, dst), dst += F; - size_t offset = ((dy * p.strideY - p.padY)*p.srcW + dx * p.strideX - p.padX)*p.srcC; - for (; dx < dstW2; dx += 2) - ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main8x2(src + offset, srcS, _weight, _bias, params, dst), offset += 2 * F, dst += 2 * F; - for (; dx < dstW; ++dx) - ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main8x1(src + offset, srcS, _weight, _bias, params, dst), offset += F, dst += F; - for (; dx < p.dstW; ++dx) - ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge8(src, p, dy, dx, _weight, _bias, params, dst), dst += F; - } - for (; dy < p.dstH; ++dy) - for (size_t dx = 0; dx < p.dstW; ++dx) - ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge8(src, p, dy, dx, _weight, _bias, params, dst), dst += F; - } - else - { - size_t dy = 0; - for (; dy < p.padY; ++dy) - for (size_t dx = 0; dx < p.dstW; ++dx) - ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge(src, p, dy, dx, weight, bias, params, dst), dst += p.dstC; - for (; dy < dstH; ++dy) - { - size_t dx = 0; - for (; dx < p.padX; ++dx) - ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge(src, p, dy, dx, weight, bias, params, dst), dst += p.dstC; - size_t offset = ((dy * p.strideY - p.padY)*p.srcW + dx * p.strideX - p.padX)*p.srcC; - for (; dx < dstW4; dx += 4) - ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4(src + offset, srcS, srcX, p.srcC, weight, bias, params, dst), dst += 4 * p.dstC, offset += 4 * srcX; - for (; dx < dstW2; dx += 2) - ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main2(src + offset, srcS, srcX, p.srcC, weight, bias, params, dst), dst += 2 * p.dstC, offset += 2 * srcX; - for (; dx < dstW; ++dx) - ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main1(src + offset, srcS, 
p.srcC, weight, bias, params, dst), dst += p.dstC, offset += srcX; - for (; dx < p.dstW; ++dx) - ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge<type>(src, p, dy, dx, weight, bias, params, dst), dst += p.dstC; - } - for (; dy < p.dstH; ++dy) - for (size_t dx = 0; dx < p.dstW; ++dx) - ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge<type>(src, p, dy, dx, weight, bias, params, dst), dst += p.dstC; - } - } - - template <::SimdConvolutionActivationType type> SynetConvolution32fDirectNhwc::ConvolutionBiasActivationPtr GetConvolutionBiasActivation(const ConvParam32f & p) - { - if (p.group == 1) - return ConvolutionDirectNhwcConvolutionBiasActivationDefault<type>; - else if (p.IsDepthwise() && p.IsKernel(3) && p.IsDilation(1)) - return ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3<type>; - return NULL; - } - - SynetConvolution32fDirectNhwc::ConvolutionBiasActivationPtr SynetConvolution32fDirectNhwc::SetConvolutionBiasActivation() - { - const ConvParam32f & p = _param; - SynetConvolution32fDirectNhwc::ConvolutionBiasActivationPtr func = NULL; - if (p.dstC >= F && p.dstH >= p.padY + p.padH && p.dstW >= p.padX + p.padW) - { - switch (p.activation) - { - case ::SimdConvolutionActivationIdentity: func = GetConvolutionBiasActivation<::SimdConvolutionActivationIdentity>(p); break; - case ::SimdConvolutionActivationRelu: func = GetConvolutionBiasActivation<::SimdConvolutionActivationRelu>(p); break; - case ::SimdConvolutionActivationLeakyRelu: func = GetConvolutionBiasActivation<::SimdConvolutionActivationLeakyRelu>(p); break; - case ::SimdConvolutionActivationRestrictRange: func = GetConvolutionBiasActivation<::SimdConvolutionActivationRestrictRange>(p); break; - case ::SimdConvolutionActivationPrelu: func = GetConvolutionBiasActivation<::SimdConvolutionActivationPrelu>(p); break; - case ::SimdConvolutionActivationElu: func = GetConvolutionBiasActivation<::SimdConvolutionActivationElu>(p); break; - case ::SimdConvolutionActivationHswish: func = GetConvolutionBiasActivation<::SimdConvolutionActivationHswish>(p); break; - } - } - return func ?
func : Avx::SynetConvolution32fDirectNhwc::SetConvolutionBiasActivation(); - }; - - //--------------------------------------------------------------------- - - SynetConvolution32fNhwcDirect::SynetConvolution32fNhwcDirect(const ConvParam32f& p) - : Avx::SynetConvolution32fNhwcDirect(p) - { - if (p.dstC <= Sse::F) - return; -#ifdef SIMD_SYNET_CONVOLUTION_NHWC_DIRECT_OLD - //_old.enable = true; - if (_old.enable) - { - if (Set2f(p, _old.convolution)) - OldSetAlgParam(F); - } - else -#endif - { - RunFuncs funcs; - for (size_t n = 2; n <= 3; ++n) - { - funcs.push_back(RunFunc(Ext() + "-" + ToStr(n))); - SetAlgParam(F, n, funcs.back().alg); - if (!SetRt(p, funcs.back().alg)) - return; - } - _run.Init(funcs); - } - } - - bool SynetConvolution32fNhwcDirect::SetRt(const ConvParam32f& p, AlgParam& a) - { - switch (a.microD) - { - case 2 * F: return Set2r(p, a); - case 3 * F: return Set3r(p, a); - default: - return false; - } - } - - //--------------------------------------------------------------------- - - void * SynetConvolution32fInit(size_t batch, const SimdConvolutionParameters * conv, SimdGemm32fNNPtr gemm) - { - ConvParam32f param(batch, conv, gemm); - if (!param.Valid()) - return NULL; - else if (Avx::SynetConvolution32fDepthwiseDotProduct::Preferable(param)) - return new Avx::SynetConvolution32fDepthwiseDotProduct(param); - else if (SynetConvolution32fWinograd::Preferable(param)) - return new SynetConvolution32fWinograd(param); - else if (SynetConvolution32fGemmNT::Preferable(param)) - return new SynetConvolution32fGemmNT(param); - else if (SynetConvolution32fDirectNchw::Preferable(param)) - return new Avx2::SynetConvolution32fDirectNchw(param); - else if (SynetConvolution32fNhwcDirect::Preferable(param)) - return new SynetConvolution32fNhwcDirect(param); - else if (SynetConvolution32fDirectNhwc::Preferable(param)) - return new SynetConvolution32fDirectNhwc(param); - else - return new SynetConvolution32fGemmNN(param); - } - } -#endif//SIMD_AVX2_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx2SynetConvolution32fNhwcDirect2f.cpp b/src/3rd/Simd/Simd/SimdAvx2SynetConvolution32fNhwcDirect2f.cpp deleted file mode 100644 index 4ed05118..00000000 --- a/src/3rd/Simd/Simd/SimdAvx2SynetConvolution32fNhwcDirect2f.cpp +++ /dev/null @@ -1,797 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdSynetConvolution32f.h" -#include "Simd/SimdSynetConvolution32fCommon.h" -#include "Simd/SimdCpu.h" - -namespace Simd -{ -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - using AlgParam = SynetConvolution32fNhwcDirect::AlgParam; - - template SIMD_NOINLINE void ConvolutionNhwcDirect_2x6(const float* src0, const ConvParam32f& p, - size_t kernelH, size_t kernelW, size_t srcC, size_t dstC, const float* weight, const __m256* bias, const __m256* params, float* dst) - { - __m256 d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, s0, w0, w1; - size_t dS = p.srcC * p.strideX, dW = DF * (p.kernelX - kernelW) * srcC, dY = p.srcW * p.srcC, dX = p.srcC, dD = p.dstC; - const float* src1 = src0 + 1 * dS; - const float* src2 = src0 + 2 * dS; - const float* src3 = src0 + 3 * dS; - const float* src4 = src0 + 4 * dS; - const float* src5 = src0 + 5 * dS; - if (dstC > F) - { - d00 = _mm256_setzero_ps(); d01 = _mm256_setzero_ps(); - d10 = _mm256_setzero_ps(); d11 = _mm256_setzero_ps(); - d20 = _mm256_setzero_ps(); d21 = _mm256_setzero_ps(); - d30 = _mm256_setzero_ps(); d31 = _mm256_setzero_ps(); - d40 = _mm256_setzero_ps(); d41 = _mm256_setzero_ps(); - d50 = _mm256_setzero_ps(); d51 = _mm256_setzero_ps(); - for (size_t ky = 0; ky < kernelH; ++ky) - { - for (size_t kx = 0; kx < kernelW; ++kx) - { - for (size_t offset = ky * dY + kx * dX, end = offset + srcC; offset < end; ++offset) - { - w0 = _mm256_loadu_ps(weight + 0); - w1 = _mm256_loadu_ps(weight + F); - s0 = _mm256_set1_ps(src0[offset]); - d00 = _mm256_fmadd_ps(s0, w0, d00); - d01 = _mm256_fmadd_ps(s0, w1, d01); - s0 = _mm256_set1_ps(src1[offset]); - d10 = _mm256_fmadd_ps(s0, w0, d10); - d11 = _mm256_fmadd_ps(s0, w1, d11); - s0 = _mm256_set1_ps(src2[offset]); - d20 = _mm256_fmadd_ps(s0, w0, d20); - d21 = _mm256_fmadd_ps(s0, w1, d21); - s0 = _mm256_set1_ps(src3[offset]); - d30 = _mm256_fmadd_ps(s0, w0, d30); - d31 = _mm256_fmadd_ps(s0, w1, d31); - s0 = _mm256_set1_ps(src4[offset]); - d40 = _mm256_fmadd_ps(s0, w0, d40); - d41 = _mm256_fmadd_ps(s0, w1, d41); - s0 = _mm256_set1_ps(src5[offset]); - d50 = _mm256_fmadd_ps(s0, w0, d50); - d51 = _mm256_fmadd_ps(s0, w1, d51); - weight += DF; - } - } - weight += dW; - } - if (dstC == DF) - { - Term::template Save(dst + 0, d00, bias, params); - Term::template Save(dst + F, d01, bias, params); - dst += dD; - Term::template Save(dst + 0, d10, bias, params); - Term::template Save(dst + F, d11, bias, params); - dst += dD; - Term::template Save(dst + 0, d20, bias, params); - Term::template Save(dst + F, d21, bias, params); - dst += dD; - Term::template Save(dst + 0, d30, bias, params); - Term::template Save(dst + F, d31, bias, params); - dst += dD; - Term::template Save(dst + 0, d40, bias, params); - Term::template Save(dst + F, d41, bias, params); - dst += dD; - Term::template Save(dst + 0, d50, bias, params); - Term::template Save(dst + F, d51, bias, params); - } - else - { - dstC -= F; - Term::template Save(dst + 0, d00, bias, params); - Term::template Save(dst + F, d01, bias, params, dstC); - dst += dD; - Term::template Save(dst + 0, d10, bias, params); - Term::template Save(dst + F, d11, bias, params, dstC); - dst += dD; - Term::template Save(dst + 0, d20, bias, params); - Term::template Save(dst + F, d21, bias, params, dstC); - dst += dD; - Term::template Save(dst + 0, d30, bias, params); - Term::template Save(dst + F, d31, bias, params, dstC); - dst += dD; - Term::template Save(dst + 0, d40, bias, params); - Term::template Save(dst + F, d41, bias, params, dstC); - dst += dD; - 
Term::template Save(dst + 0, d50, bias, params); - Term::template Save(dst + F, d51, bias, params, dstC); - } - } - else - { - d00 = _mm256_setzero_ps(); - d10 = _mm256_setzero_ps(); - d20 = _mm256_setzero_ps(); - d30 = _mm256_setzero_ps(); - d40 = _mm256_setzero_ps(); - d50 = _mm256_setzero_ps(); - for (size_t ky = 0; ky < kernelH; ++ky) - { - for (size_t kx = 0; kx < kernelW; ++kx) - { - for (size_t offset = ky * dY + kx * dX, end = offset + srcC; offset < end; ++offset) - { - w0 = _mm256_loadu_ps(weight + 0); - s0 = _mm256_set1_ps(src0[offset]); - d00 = _mm256_fmadd_ps(s0, w0, d00); - s0 = _mm256_set1_ps(src1[offset]); - d10 = _mm256_fmadd_ps(s0, w0, d10); - s0 = _mm256_set1_ps(src2[offset]); - d20 = _mm256_fmadd_ps(s0, w0, d20); - s0 = _mm256_set1_ps(src3[offset]); - d30 = _mm256_fmadd_ps(s0, w0, d30); - s0 = _mm256_set1_ps(src4[offset]); - d40 = _mm256_fmadd_ps(s0, w0, d40); - s0 = _mm256_set1_ps(src5[offset]); - d50 = _mm256_fmadd_ps(s0, w0, d50); - weight += DF; - } - } - weight += dW; - } - if (dstC == F) - { - Term::template Save(dst + 0, d00, bias, params); - dst += dD; - Term::template Save(dst + 0, d10, bias, params); - dst += dD; - Term::template Save(dst + 0, d20, bias, params); - dst += dD; - Term::template Save(dst + 0, d30, bias, params); - dst += dD; - Term::template Save(dst + 0, d40, bias, params); - dst += dD; - Term::template Save(dst + 0, d50, bias, params); - } - else - { - Term::template Save(dst + 0, d00, bias, params, dstC); - dst += dD; - Term::template Save(dst + 0, d10, bias, params, dstC); - dst += dD; - Term::template Save(dst + 0, d20, bias, params, dstC); - dst += dD; - Term::template Save(dst + 0, d30, bias, params, dstC); - dst += dD; - Term::template Save(dst + 0, d40, bias, params, dstC); - dst += dD; - Term::template Save(dst + 0, d50, bias, params, dstC); - } - } - } - - template SIMD_NOINLINE void ConvolutionNhwcDirect_2x3(const float* src0, const ConvParam32f& p, - size_t kernelH, size_t kernelW, size_t srcC, size_t dstC, const float* weight, const __m256* bias, const __m256* params, float* dst) - { - __m256 d00, d01, d10, d11, d20, d21, s0, w0, w1; - size_t dS = p.srcC * p.strideX, dW = DF * (p.kernelX - kernelW) * srcC, dY = p.srcW * p.srcC, dX = p.srcC, dD = p.dstC; - const float* src1 = src0 + 1 * dS; - const float* src2 = src0 + 2 * dS; - if (dstC > F) - { - d00 = _mm256_setzero_ps(); d01 = _mm256_setzero_ps(); - d10 = _mm256_setzero_ps(); d11 = _mm256_setzero_ps(); - d20 = _mm256_setzero_ps(); d21 = _mm256_setzero_ps(); - for (size_t ky = 0; ky < kernelH; ++ky) - { - for (size_t kx = 0; kx < kernelW; ++kx) - { - for (size_t offset = ky * dY + kx * dX, end = offset + srcC; offset < end; ++offset) - { - w0 = _mm256_loadu_ps(weight + 0); - w1 = _mm256_loadu_ps(weight + F); - s0 = _mm256_set1_ps(src0[offset]); - d00 = _mm256_fmadd_ps(s0, w0, d00); - d01 = _mm256_fmadd_ps(s0, w1, d01); - s0 = _mm256_set1_ps(src1[offset]); - d10 = _mm256_fmadd_ps(s0, w0, d10); - d11 = _mm256_fmadd_ps(s0, w1, d11); - s0 = _mm256_set1_ps(src2[offset]); - d20 = _mm256_fmadd_ps(s0, w0, d20); - d21 = _mm256_fmadd_ps(s0, w1, d21); - weight += DF; - } - } - weight += dW; - } - if (dstC == DF) - { - Term::template Save(dst + 0, d00, bias, params); - Term::template Save(dst + F, d01, bias, params); - dst += dD; - Term::template Save(dst + 0, d10, bias, params); - Term::template Save(dst + F, d11, bias, params); - dst += dD; - Term::template Save(dst + 0, d20, bias, params); - Term::template Save(dst + F, d21, bias, params); - } - else - { - dstC -= F; - Term::template 
Save(dst + 0, d00, bias, params); - Term::template Save(dst + F, d01, bias, params, dstC); - dst += dD; - Term::template Save(dst + 0, d10, bias, params); - Term::template Save(dst + F, d11, bias, params, dstC); - dst += dD; - Term::template Save(dst + 0, d20, bias, params); - Term::template Save(dst + F, d21, bias, params, dstC); - } - } - else - { - d00 = _mm256_setzero_ps(); - d10 = _mm256_setzero_ps(); - d20 = _mm256_setzero_ps(); - for (size_t ky = 0; ky < kernelH; ++ky) - { - for (size_t kx = 0; kx < kernelW; ++kx) - { - for (size_t offset = ky * dY + kx * dX, end = offset + srcC; offset < end; ++offset) - { - w0 = _mm256_loadu_ps(weight + 0); - s0 = _mm256_set1_ps(src0[offset]); - d00 = _mm256_fmadd_ps(s0, w0, d00); - s0 = _mm256_set1_ps(src1[offset]); - d10 = _mm256_fmadd_ps(s0, w0, d10); - s0 = _mm256_set1_ps(src2[offset]); - d20 = _mm256_fmadd_ps(s0, w0, d20); - weight += DF; - } - } - weight += dW; - } - if (dstC == F) - { - Term::template Save(dst + 0, d00, bias, params); - dst += dD; - Term::template Save(dst + 0, d10, bias, params); - dst += dD; - Term::template Save(dst + 0, d20, bias, params); - } - else - { - Term::template Save(dst + 0, d00, bias, params, dstC); - dst += dD; - Term::template Save(dst + 0, d10, bias, params, dstC); - dst += dD; - Term::template Save(dst + 0, d20, bias, params, dstC); - } - } - } - - template SIMD_NOINLINE void ConvolutionNhwcDirect_2x1(const float* src0, const ConvParam32f& p, - size_t kernelH, size_t kernelW, size_t srcC, size_t dstC, const float* weight, const __m256* bias, const __m256* params, float* dst) - { - __m256 d00, d01, s0, w0, w1; - size_t dW = DF * (p.kernelX - kernelW) * srcC, dY = p.srcW * p.srcC, dX = p.srcC; - if (dstC > F) - { - d00 = _mm256_setzero_ps(); - d01 = _mm256_setzero_ps(); - for (size_t ky = 0; ky < kernelH; ++ky) - { - for (size_t kx = 0; kx < kernelW; ++kx) - { - for (size_t offset = ky * dY + kx * dX, end = offset + srcC; offset < end; ++offset) - { - w0 = _mm256_loadu_ps(weight + 0); - w1 = _mm256_loadu_ps(weight + F); - s0 = _mm256_set1_ps(src0[offset]); - d00 = _mm256_fmadd_ps(s0, w0, d00); - d01 = _mm256_fmadd_ps(s0, w1, d01); - weight += DF; - } - } - weight += dW; - } - if (dstC == DF) - { - Term::template Save(dst + 0, d00, bias, params); - Term::template Save(dst + F, d01, bias, params); - } - else - { - Term::template Save(dst + 0, d00, bias, params); - Term::template Save(dst + F, d01, bias, params, dstC - F); - } - } - else - { - d00 = _mm256_setzero_ps(); - for (size_t ky = 0; ky < kernelH; ++ky) - { - for (size_t kx = 0; kx < kernelW; ++kx) - { - for (size_t offset = ky * dY + kx * dX, end = offset + srcC; offset < end; ++offset) - { - w0 = _mm256_loadu_ps(weight + 0); - s0 = _mm256_set1_ps(src0[offset]); - d00 = _mm256_fmadd_ps(s0, w0, d00); - weight += DF; - } - } - weight += dW; - } - if (dstC == F) - Term::template Save(dst + 0, d00, bias, params); - else - Term::template Save(dst + 0, d00, bias, params, dstC); - } - } - - template SIMD_NOINLINE void ConvolutionNhwcDirect_2(const float* src, const ConvParam32f& p, - size_t dstC, size_t yBeg, size_t yEnd, size_t srcC, const float* weight, const float* bias, const float* params, float* dst) - { - size_t noseH = p.padY, noseW = p.padX; - size_t bodyH = p.srcH - p.kernelY + 1 + noseH, bodyW = p.srcW - p.kernelX + 1 + noseW; - size_t bodyW3 = AlignLoAny(bodyW - noseW, 3 * p.strideX) + noseW; - size_t bodyW6 = AlignLoAny(bodyW - noseW, 6 * p.strideX) + noseW; - size_t tailH = bodyH + p.padH, tailW = bodyW + p.padW; - size_t kY = p.kernelY - 
noseH, kX = p.kernelX - noseW, kH = bodyH + p.kernelY - 1, kW = bodyW + p.kernelX - 1; - - __m256 _params[2], _bias[2]; - _params[0] = _mm256_set1_ps(params[0]); - if (type == ::SimdConvolutionActivationRestrictRange || type == ::SimdConvolutionActivationHswish) - _params[1] = _mm256_set1_ps(params[1]); - - for (size_t dc = 0; dc < dstC; dc += DF) - { - size_t dC = Simd::Min(DF, dstC - dc); - _bias[0] = _mm256_loadu_ps(bias + dc + 0); - _bias[1] = _mm256_loadu_ps(bias + dc + F); - if (type == ::SimdConvolutionActivationPrelu) - { - _params[0] = _mm256_loadu_ps(params + dc + 0); - _params[1] = _mm256_loadu_ps(params + dc + F); - } - float* d = dst + dc + yBeg * p.dstW * p.dstC; - size_t dy = yBeg, sy = dy * p.strideY; - for (; sy < noseH && dy < yEnd; sy += p.strideY, dy++) - { - size_t sx = 0; - const float* s = src; - const float* w = weight + (noseH - sy) * p.kernelX * srcC * DF; - for (; sx < noseW; sx += p.strideX, d += p.dstC) - ConvolutionNhwcDirect_2x1(s, p, kY + sy, kX + sx, srcC, dC, w + (noseW - sx) * srcC * DF, _bias, _params, d); - for (; sx < bodyW6; sx += 6 * p.strideX, d += 6 * p.dstC) - ConvolutionNhwcDirect_2x6(s + (sx - noseW) * p.srcC, p, kY + sy, p.kernelX, srcC, dC, w, _bias, _params, d); - for (; sx < bodyW3; sx += 3 * p.strideX, d += 3 * p.dstC) - ConvolutionNhwcDirect_2x3(s + (sx - noseW) * p.srcC, p, kY + sy, p.kernelX, srcC, dC, w, _bias, _params, d); - for (; sx < bodyW; sx += p.strideX, d += p.dstC) - ConvolutionNhwcDirect_2x1(s + (sx - noseW) * p.srcC, p, kY + sy, p.kernelX, srcC, dC, w, _bias, _params, d); - for (; sx < tailW; sx += p.strideX, d += p.dstC) - ConvolutionNhwcDirect_2x1(s + (sx - noseW) * p.srcC, p, kY + sy, kW - sx, srcC, dC, w, _bias, _params, d); - } - for (; sy < bodyH && dy < yEnd; sy += p.strideY, dy++) - { - size_t sx = 0; - const float* s = src + (sy - noseH) * p.srcW * p.srcC; - const float* w = weight; - for (; sx < noseW; sx += p.strideX, d += p.dstC) - ConvolutionNhwcDirect_2x1(s, p, p.kernelY, kX + sx, srcC, dC, w + (noseW - sx) * srcC * DF, _bias, _params, d); - for (; sx < bodyW6; sx += 6 * p.strideX, d += 6 * p.dstC) - ConvolutionNhwcDirect_2x6(s + (sx - noseW) * p.srcC, p, p.kernelY, p.kernelX, srcC, dC, w, _bias, _params, d); - for (; sx < bodyW3; sx += 3 * p.strideX, d += 3 * p.dstC) - ConvolutionNhwcDirect_2x3(s + (sx - noseW) * p.srcC, p, p.kernelY, p.kernelX, srcC, dC, w, _bias, _params, d); - for (; sx < bodyW; sx += p.strideX, d += p.dstC) - ConvolutionNhwcDirect_2x1(s + (sx - noseW) * p.srcC, p, p.kernelY, p.kernelX, srcC, dC, w, _bias, _params, d); - for (; sx < tailW; sx += p.strideX, d += p.dstC) - ConvolutionNhwcDirect_2x1(s + (sx - noseW) * p.srcC, p, p.kernelY, kW - sx, srcC, dC, w, _bias, _params, d); - } - for (; sy < tailH && dy < yEnd; sy += p.strideY, dy++) - { - size_t sx = 0; - const float* s = src + (sy - noseH) * p.srcW * p.srcC; - const float* w = weight; - for (; sx < noseW; sx += p.strideX, d += p.dstC) - ConvolutionNhwcDirect_2x1(s, p, kH - sy, kX + sx, srcC, dC, w + (noseW - sx) * srcC * DF, _bias, _params, d); - for (; sx < bodyW6; sx += 6 * p.strideX, d += 6 * p.dstC) - ConvolutionNhwcDirect_2x6(s + (sx - noseW) * p.srcC, p, kH - sy, p.kernelX, srcC, dC, w, _bias, _params, d); - for (; sx < bodyW3; sx += 3 * p.strideX, d += 3 * p.dstC) - ConvolutionNhwcDirect_2x3(s + (sx - noseW) * p.srcC, p, kH - sy, p.kernelX, srcC, dC, w, _bias, _params, d); - for (; sx < bodyW; sx += p.strideX, d += p.dstC) - ConvolutionNhwcDirect_2x1(s + (sx - noseW) * p.srcC, p, kH - sy, p.kernelX, srcC, dC, w, _bias, _params, 
d); - for (; sx < tailW; sx += p.strideX, d += p.dstC) - ConvolutionNhwcDirect_2x1<term, type>(s + (sx - noseW) * p.srcC, p, kH - sy, kW - sx, srcC, dC, w, _bias, _params, d); - } - weight += p.kernelY * p.kernelX * srcC * DF; - } - } - - template<SimdConvolutionActivationType type> SIMD_NOINLINE void ConvolutionNhwcDirect_2(const float* src, const ConvParam32f& p, - const SynetConvolution32fNhwcDirect::AlgParam& a, const float* weight, const float* bias, const float* params, float* dst) - { - for (size_t dc = 0; dc < p.dstC; dc += a.macroD) - { - size_t macroD = Simd::Min(p.dstC, dc + a.macroD) - dc; - for (size_t sc = 0; sc < p.srcC; sc += a.macroC) - { - size_t macroC = Simd::Min(p.srcC, sc + a.macroC) - sc; - size_t macroK = p.kernelY * p.kernelX * macroC; - for (size_t yBeg = 0; yBeg < p.dstH;) - { - size_t yEnd = Simd::Min(yBeg + a.macroH, p.dstH); - if (a.macroC == p.srcC) - ConvolutionNhwcDirect_2<TermSingle, type>(src + sc, p, macroD, yBeg, yEnd, macroC, weight, bias + dc, params, dst + dc); - else if (sc == 0) - ConvolutionNhwcDirect_2<TermFirst, type>(src + sc, p, macroD, yBeg, yEnd, macroC, weight, bias + dc, params, dst + dc); - else if (sc + macroC == p.srcC) - ConvolutionNhwcDirect_2<TermLast, type>(src + sc, p, macroD, yBeg, yEnd, macroC, weight, bias + dc, params, dst + dc); - else - ConvolutionNhwcDirect_2<TermIterim, type>(src + sc, p, macroD, yBeg, yEnd, macroC, weight, bias + dc, params, dst + dc); - yBeg = yEnd; - } - weight += AlignHiAny(macroD, a.microD) * macroK; - } - if (type == ::SimdConvolutionActivationPrelu) - params += macroD; - } - } - - //--------------------------------------------------------------------- - - template<TermType term, SimdConvolutionActivationType type> SIMD_NOINLINE void ConvolutionNhwcDirect1x1_2x6(const float* src0, const ConvParam32f& p, - size_t srcC, size_t dstC, const float* weight, const __m256* bias, const __m256* params, float* dst) - { - __m256 d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, s0, w0, w1; - size_t dS = p.srcC, dD = p.dstC; - const float* src1 = src0 + 1 * dS; - const float* src2 = src0 + 2 * dS; - const float* src3 = src0 + 3 * dS; - const float* src4 = src0 + 4 * dS; - const float* src5 = src0 + 5 * dS; - if (dstC > F) - { - d00 = _mm256_setzero_ps(); d01 = _mm256_setzero_ps(); - d10 = _mm256_setzero_ps(); d11 = _mm256_setzero_ps(); - d20 = _mm256_setzero_ps(); d21 = _mm256_setzero_ps(); - d30 = _mm256_setzero_ps(); d31 = _mm256_setzero_ps(); - d40 = _mm256_setzero_ps(); d41 = _mm256_setzero_ps(); - d50 = _mm256_setzero_ps(); d51 = _mm256_setzero_ps(); - for (size_t offset = 0; offset < srcC; ++offset) - { - w0 = _mm256_loadu_ps(weight + 0); - w1 = _mm256_loadu_ps(weight + F); - s0 = _mm256_set1_ps(src0[offset]); - d00 = _mm256_fmadd_ps(s0, w0, d00); - d01 = _mm256_fmadd_ps(s0, w1, d01); - s0 = _mm256_set1_ps(src1[offset]); - d10 = _mm256_fmadd_ps(s0, w0, d10); - d11 = _mm256_fmadd_ps(s0, w1, d11); - s0 = _mm256_set1_ps(src2[offset]); - d20 = _mm256_fmadd_ps(s0, w0, d20); - d21 = _mm256_fmadd_ps(s0, w1, d21); - s0 = _mm256_set1_ps(src3[offset]); - d30 = _mm256_fmadd_ps(s0, w0, d30); - d31 = _mm256_fmadd_ps(s0, w1, d31); - s0 = _mm256_set1_ps(src4[offset]); - d40 = _mm256_fmadd_ps(s0, w0, d40); - d41 = _mm256_fmadd_ps(s0, w1, d41); - s0 = _mm256_set1_ps(src5[offset]); - d50 = _mm256_fmadd_ps(s0, w0, d50); - d51 = _mm256_fmadd_ps(s0, w1, d51); - weight += DF; - } - if (dstC == DF) - { - Term<term>::template Save<type, 0>(dst + 0, d00, bias, params); - Term<term>::template Save<type, 1>(dst + F, d01, bias, params); - dst += dD; - Term<term>::template Save<type, 0>(dst + 0, d10, bias, params); - Term<term>::template Save<type, 1>(dst + F, d11, bias, params); - dst += dD; - Term<term>::template Save<type, 0>(dst + 0, d20, bias, params); -
Term<term>::template Save<type, 1>(dst + F, d21, bias, params); - dst += dD; - Term<term>::template Save<type, 0>(dst + 0, d30, bias, params); - Term<term>::template Save<type, 1>(dst + F, d31, bias, params); - dst += dD; - Term<term>::template Save<type, 0>(dst + 0, d40, bias, params); - Term<term>::template Save<type, 1>(dst + F, d41, bias, params); - dst += dD; - Term<term>::template Save<type, 0>(dst + 0, d50, bias, params); - Term<term>::template Save<type, 1>(dst + F, d51, bias, params); - } - else - { - dstC -= F; - Term<term>::template Save<type, 0>(dst + 0, d00, bias, params); - Term<term>::template Save<type, 1>(dst + F, d01, bias, params, dstC); - dst += dD; - Term<term>::template Save<type, 0>(dst + 0, d10, bias, params); - Term<term>::template Save<type, 1>(dst + F, d11, bias, params, dstC); - dst += dD; - Term<term>::template Save<type, 0>(dst + 0, d20, bias, params); - Term<term>::template Save<type, 1>(dst + F, d21, bias, params, dstC); - dst += dD; - Term<term>::template Save<type, 0>(dst + 0, d30, bias, params); - Term<term>::template Save<type, 1>(dst + F, d31, bias, params, dstC); - dst += dD; - Term<term>::template Save<type, 0>(dst + 0, d40, bias, params); - Term<term>::template Save<type, 1>(dst + F, d41, bias, params, dstC); - dst += dD; - Term<term>::template Save<type, 0>(dst + 0, d50, bias, params); - Term<term>::template Save<type, 1>(dst + F, d51, bias, params, dstC); - } - } - else - { - d00 = _mm256_setzero_ps(); - d10 = _mm256_setzero_ps(); - d20 = _mm256_setzero_ps(); - d30 = _mm256_setzero_ps(); - d40 = _mm256_setzero_ps(); - d50 = _mm256_setzero_ps(); - for (size_t offset = 0; offset < srcC; ++offset) - { - w0 = _mm256_loadu_ps(weight + 0); - s0 = _mm256_set1_ps(src0[offset]); - d00 = _mm256_fmadd_ps(s0, w0, d00); - s0 = _mm256_set1_ps(src1[offset]); - d10 = _mm256_fmadd_ps(s0, w0, d10); - s0 = _mm256_set1_ps(src2[offset]); - d20 = _mm256_fmadd_ps(s0, w0, d20); - s0 = _mm256_set1_ps(src3[offset]); - d30 = _mm256_fmadd_ps(s0, w0, d30); - s0 = _mm256_set1_ps(src4[offset]); - d40 = _mm256_fmadd_ps(s0, w0, d40); - s0 = _mm256_set1_ps(src5[offset]); - d50 = _mm256_fmadd_ps(s0, w0, d50); - weight += DF; - } - if (dstC == F) - { - Term<term>::template Save<type, 0>(dst + 0, d00, bias, params); - dst += dD; - Term<term>::template Save<type, 0>(dst + 0, d10, bias, params); - dst += dD; - Term<term>::template Save<type, 0>(dst + 0, d20, bias, params); - dst += dD; - Term<term>::template Save<type, 0>(dst + 0, d30, bias, params); - dst += dD; - Term<term>::template Save<type, 0>(dst + 0, d40, bias, params); - dst += dD; - Term<term>::template Save<type, 0>(dst + 0, d50, bias, params); - } - else - { - Term<term>::template Save<type, 0>(dst + 0, d00, bias, params, dstC); - dst += dD; - Term<term>::template Save<type, 0>(dst + 0, d10, bias, params, dstC); - dst += dD; - Term<term>::template Save<type, 0>(dst + 0, d20, bias, params, dstC); - dst += dD; - Term<term>::template Save<type, 0>(dst + 0, d30, bias, params, dstC); - dst += dD; - Term<term>::template Save<type, 0>(dst + 0, d40, bias, params, dstC); - dst += dD; - Term<term>::template Save<type, 0>(dst + 0, d50, bias, params, dstC); - } - } - } - - template<TermType term, SimdConvolutionActivationType type, int M> SIMD_NOINLINE void ConvolutionNhwcDirect1x1_2xM(const float* src0, const ConvParam32f& p, - size_t srcC, size_t dstC, const float* weight, const __m256* bias, const __m256* params, float* dst) - { - __m256 d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, s0, w0, w1; - size_t dS = p.srcC, dD = p.dstC; - const float* src1 = src0 + 1 * dS; - const float* src2 = src0 + 2 * dS; - const float* src3 = src0 + 3 * dS; - const float* src4 = src0 + 4 * dS; - const float* src5 = src0 + 5 * dS; - if (dstC > F) - { - if (M > 0) d00 = _mm256_setzero_ps(), d01 = _mm256_setzero_ps(); - if (M > 1) d10 = _mm256_setzero_ps(), d11 = _mm256_setzero_ps(); - if (M > 2) d20 = _mm256_setzero_ps(), d21 = _mm256_setzero_ps(); - if (M > 3) d30 = _mm256_setzero_ps(), d31 = _mm256_setzero_ps(); - if (M > 4) d40 = _mm256_setzero_ps(),
d41 = _mm256_setzero_ps(); - if (M > 5) d50 = _mm256_setzero_ps(), d51 = _mm256_setzero_ps(); - for (size_t offset = 0; offset < srcC; ++offset) - { - w0 = _mm256_loadu_ps(weight + 0); - w1 = _mm256_loadu_ps(weight + F); - if (M > 0) s0 = _mm256_set1_ps(src0[offset]), d00 = _mm256_fmadd_ps(s0, w0, d00), d01 = _mm256_fmadd_ps(s0, w1, d01); - if (M > 1) s0 = _mm256_set1_ps(src1[offset]), d10 = _mm256_fmadd_ps(s0, w0, d10), d11 = _mm256_fmadd_ps(s0, w1, d11); - if (M > 2) s0 = _mm256_set1_ps(src2[offset]), d20 = _mm256_fmadd_ps(s0, w0, d20), d21 = _mm256_fmadd_ps(s0, w1, d21); - if (M > 3) s0 = _mm256_set1_ps(src3[offset]), d30 = _mm256_fmadd_ps(s0, w0, d30), d31 = _mm256_fmadd_ps(s0, w1, d31); - if (M > 4) s0 = _mm256_set1_ps(src4[offset]), d40 = _mm256_fmadd_ps(s0, w0, d40), d41 = _mm256_fmadd_ps(s0, w1, d41); - if (M > 5) s0 = _mm256_set1_ps(src5[offset]), d50 = _mm256_fmadd_ps(s0, w0, d50), d51 = _mm256_fmadd_ps(s0, w1, d51); - weight += DF; - } - if (dstC == DF) - { - if (M > 0) Term<term>::template Save<type, 0>(dst + 0, d00, bias, params), Term<term>::template Save<type, 1>(dst + F, d01, bias, params), dst += dD; - if (M > 1) Term<term>::template Save<type, 0>(dst + 0, d10, bias, params), Term<term>::template Save<type, 1>(dst + F, d11, bias, params), dst += dD; - if (M > 2) Term<term>::template Save<type, 0>(dst + 0, d20, bias, params), Term<term>::template Save<type, 1>(dst + F, d21, bias, params), dst += dD; - if (M > 3) Term<term>::template Save<type, 0>(dst + 0, d30, bias, params), Term<term>::template Save<type, 1>(dst + F, d31, bias, params), dst += dD; - if (M > 4) Term<term>::template Save<type, 0>(dst + 0, d40, bias, params), Term<term>::template Save<type, 1>(dst + F, d41, bias, params), dst += dD; - if (M > 5) Term<term>::template Save<type, 0>(dst + 0, d50, bias, params), Term<term>::template Save<type, 1>(dst + F, d51, bias, params), dst += dD; - } - else - { - dstC -= F; - if (M > 0) Term<term>::template Save<type, 0>(dst + 0, d00, bias, params), Term<term>::template Save<type, 1>(dst + F, d01, bias, params, dstC), dst += dD; - if (M > 1) Term<term>::template Save<type, 0>(dst + 0, d10, bias, params), Term<term>::template Save<type, 1>(dst + F, d11, bias, params, dstC), dst += dD; - if (M > 2) Term<term>::template Save<type, 0>(dst + 0, d20, bias, params), Term<term>::template Save<type, 1>(dst + F, d21, bias, params, dstC), dst += dD; - if (M > 3) Term<term>::template Save<type, 0>(dst + 0, d30, bias, params), Term<term>::template Save<type, 1>(dst + F, d31, bias, params, dstC), dst += dD; - if (M > 4) Term<term>::template Save<type, 0>(dst + 0, d40, bias, params), Term<term>::template Save<type, 1>(dst + F, d41, bias, params, dstC), dst += dD; - if (M > 5) Term<term>::template Save<type, 0>(dst + 0, d50, bias, params), Term<term>::template Save<type, 1>(dst + F, d51, bias, params, dstC), dst += dD; - } - } - else - { - if (M > 0) d00 = _mm256_setzero_ps(); - if (M > 1) d10 = _mm256_setzero_ps(); - if (M > 2) d20 = _mm256_setzero_ps(); - if (M > 3) d30 = _mm256_setzero_ps(); - if (M > 4) d40 = _mm256_setzero_ps(); - if (M > 5) d50 = _mm256_setzero_ps(); - for (size_t offset = 0; offset < srcC; ++offset) - { - w0 = _mm256_loadu_ps(weight + 0); - if (M > 0) s0 = _mm256_set1_ps(src0[offset]), d00 = _mm256_fmadd_ps(s0, w0, d00); - if (M > 1) s0 = _mm256_set1_ps(src1[offset]), d10 = _mm256_fmadd_ps(s0, w0, d10); - if (M > 2) s0 = _mm256_set1_ps(src2[offset]), d20 = _mm256_fmadd_ps(s0, w0, d20); - if (M > 3) s0 = _mm256_set1_ps(src3[offset]), d30 = _mm256_fmadd_ps(s0, w0, d30); - if (M > 4) s0 = _mm256_set1_ps(src4[offset]), d40 = _mm256_fmadd_ps(s0, w0, d40); - if (M > 5) s0 = _mm256_set1_ps(src5[offset]), d50 = _mm256_fmadd_ps(s0, w0, d50); - weight += DF; - } - if (dstC == F) - { - if (M > 0) Term<term>::template Save<type, 0>(dst + 0, d00, bias, params), dst += dD; - if (M > 1) Term<term>::template Save<type, 0>(dst + 0, d10, bias, params), dst += dD; -
if (M > 2) Term<term>::template Save<type, 0>(dst + 0, d20, bias, params), dst += dD; - if (M > 3) Term<term>::template Save<type, 0>(dst + 0, d30, bias, params), dst += dD; - if (M > 4) Term<term>::template Save<type, 0>(dst + 0, d40, bias, params), dst += dD; - if (M > 5) Term<term>::template Save<type, 0>(dst + 0, d50, bias, params), dst += dD; - } - else - { - if (M > 0) Term<term>::template Save<type, 0>(dst + 0, d00, bias, params, dstC), dst += dD; - if (M > 1) Term<term>::template Save<type, 0>(dst + 0, d10, bias, params, dstC), dst += dD; - if (M > 2) Term<term>::template Save<type, 0>(dst + 0, d20, bias, params, dstC), dst += dD; - if (M > 3) Term<term>::template Save<type, 0>(dst + 0, d30, bias, params, dstC), dst += dD; - if (M > 4) Term<term>::template Save<type, 0>(dst + 0, d40, bias, params, dstC), dst += dD; - if (M > 5) Term<term>::template Save<type, 0>(dst + 0, d50, bias, params, dstC), dst += dD; - } - } - } - - typedef void(*ConvolutionNhwcDirect1x1_2xM_Ptr)(const float* src0, const ConvParam32f& p, size_t srcC, size_t dstC, const float* weight, const __m256* bias, const __m256* params, float* dst); - - template<TermType term, SimdConvolutionActivationType type> ConvolutionNhwcDirect1x1_2xM_Ptr GetConvolutionNhwcDirect1x1_2xM(size_t M) - { - switch (M) - { - case 0: return ConvolutionNhwcDirect1x1_2xM<term, type, 0>; - case 1: return ConvolutionNhwcDirect1x1_2xM<term, type, 1>; - case 2: return ConvolutionNhwcDirect1x1_2xM<term, type, 2>; - case 3: return ConvolutionNhwcDirect1x1_2xM<term, type, 3>; - case 4: return ConvolutionNhwcDirect1x1_2xM<term, type, 4>; - case 5: return ConvolutionNhwcDirect1x1_2xM<term, type, 5>; - } - assert(0); - return NULL; - } - - template<TermType term, SimdConvolutionActivationType type> SIMD_NOINLINE void ConvolutionNhwcDirect1x1_2(const float* src, const ConvParam32f& p, - size_t dstC, size_t yBeg, size_t yEnd, size_t srcC, const float* weight, const float* bias, const float* params, float* dst) - { - size_t n1 = (yEnd - yBeg) * p.dstW; - size_t n6 = AlignLoAny(n1, 6); - size_t nTail = n1 - n6; - ConvolutionNhwcDirect1x1_2xM_Ptr tailN = GetConvolutionNhwcDirect1x1_2xM<term, type>(nTail); - - __m256 _params[2], _bias[2]; - _params[0] = _mm256_set1_ps(params[0]); - if (type == ::SimdConvolutionActivationRestrictRange || type == ::SimdConvolutionActivationHswish) - _params[1] = _mm256_set1_ps(params[1]); - - for (size_t dc = 0; dc < dstC; dc += DF) - { - size_t dC = Simd::Min(DF, dstC - dc); - _bias[0] = _mm256_loadu_ps(bias + dc + 0); - _bias[1] = _mm256_loadu_ps(bias + dc + F); - if (type == ::SimdConvolutionActivationPrelu) - { - _params[0] = _mm256_loadu_ps(params + dc + 0); - _params[1] = _mm256_loadu_ps(params + dc + F); - } - const float* ps = src + yBeg * p.srcW * p.srcC; - float* pd = dst + dc + yBeg * p.dstW * p.dstC; - size_t i = 0; - for (; i < n6; i += 6, ps += 6 * p.srcC, pd += 6 * p.dstC) - ConvolutionNhwcDirect1x1_2x6<term, type>(ps, p, srcC, dC, weight, _bias, _params, pd); - if (nTail) - tailN(ps, p, srcC, dC, weight, _bias, _params, pd), ps += nTail * p.srcC, pd += nTail * p.dstC; - weight += srcC * DF; - } - } - - template<SimdConvolutionActivationType type> SIMD_NOINLINE void ConvolutionNhwcDirect1x1_2(const float* src, const ConvParam32f& p, - const SynetConvolution32fNhwcDirect::AlgParam& a, const float* weight, const float* bias, const float* params, float* dst) - { - for (size_t dc = 0; dc < p.dstC; dc += a.macroD) - { - size_t macroD = Simd::Min(p.dstC, dc + a.macroD) - dc; - for (size_t sc = 0; sc < p.srcC; sc += a.macroC) - { - size_t macroC = Simd::Min(p.srcC, sc + a.macroC) - sc; - for (size_t yBeg = 0; yBeg < p.dstH;) - { - size_t yEnd = Simd::Min(yBeg + a.macroH, p.dstH); - if (a.macroC == p.srcC) - ConvolutionNhwcDirect1x1_2<TermSingle, type>(src + sc, p, macroD, yBeg, yEnd, macroC, weight, bias + dc, params, dst + dc); - else if (sc == 0) - ConvolutionNhwcDirect1x1_2<TermFirst, type>(src + sc, p, macroD, yBeg, yEnd, macroC, weight,
bias + dc, params, dst + dc); - else if (sc + macroC == p.srcC) - ConvolutionNhwcDirect1x1_2(src + sc, p, macroD, yBeg, yEnd, macroC, weight, bias + dc, params, dst + dc); - else - ConvolutionNhwcDirect1x1_2(src + sc, p, macroD, yBeg, yEnd, macroC, weight, bias + dc, params, dst + dc); - yBeg = yEnd; - } - weight += AlignHiAny(macroD, a.microD) * macroC; - } - if (type == ::SimdConvolutionActivationPrelu) - params += macroD; - } - } - - //--------------------------------------------------------------------- - - template SIMD_INLINE void Set(const ConvParam32f& p, SynetConvolution32fNhwcDirect::OldConvolutionPtr& convolution) - { - if (p.Is1x1()) - convolution = ConvolutionNhwcDirect1x1_2; - else - convolution = ConvolutionNhwcDirect_2; - } - - bool SynetConvolution32fNhwcDirect::Set2f(const ConvParam32f& p, OldConvolutionPtr& convolution) - { - switch (p.activation) - { - case SimdConvolutionActivationIdentity: Set(p, convolution); break; - case SimdConvolutionActivationRelu: Set(p, convolution); break; - case SimdConvolutionActivationLeakyRelu: Set(p, convolution); break; - case SimdConvolutionActivationRestrictRange: Set(p, convolution); break; - case SimdConvolutionActivationPrelu: Set(p, convolution); break; - case SimdConvolutionActivationElu: Set(p, convolution); break; - case SimdConvolutionActivationHswish: Set(p, convolution); break; - default: assert(0); - } - return true; - } - } -#endif//SIMD_AVX2_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx2SynetConvolution32fNhwcDirect2r.cpp b/src/3rd/Simd/Simd/SimdAvx2SynetConvolution32fNhwcDirect2r.cpp deleted file mode 100644 index 5d8e34aa..00000000 --- a/src/3rd/Simd/Simd/SimdAvx2SynetConvolution32fNhwcDirect2r.cpp +++ /dev/null @@ -1,690 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdSynetConvolution32f.h" -#include "Simd/SimdSynetConvolution32fCommon.h" -#include "Simd/SimdCpu.h" - -namespace Simd -{ -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - using AlgParam = SynetConvolution32fNhwcDirect::AlgParam; - - typedef void(*ConvolutionNhwcDirect_NxM_Ptr)(const float* src0, const ConvParam32f& p, const AlgParam& a, size_t dy, size_t dx, size_t srcC, size_t dstC, const float* weight0, const __m256* bias, const __m256* params, float* dst); - typedef void(*ConvolutionNhwcDirect1x1_NxM_Ptr)(const float* src0, const ConvParam32f& p, const AlgParam& a, size_t srcC, size_t dstC, const float* weight0, const __m256* bias, const __m256* params, float* dst); - - //--------------------------------------------------------------------- - - template void ConvolutionNhwcDirect_2x1(const float* src0, const ConvParam32f& p, - const AlgParam& a, size_t dy, size_t dx, size_t srcC, size_t dstC, const float* weight0, const __m256* bias, const __m256* params, float* dst) - { - __m256 d00, d01, s0, w0, w1; - size_t srcH = p.srcH, srcW = p.srcW, dilY = p.dilationY, dilX = p.dilationX; - size_t dY = p.srcW * p.srcC, dX = p.srcC, dS = p.srcC * p.strideX, dW = p.srcC * F; - size_t sy = dy * p.strideY - p.padY, sx = dx * p.strideX - p.padX; - size_t kY = p.kernelY * p.dilationY, kX = p.kernelX * p.dilationX; - const float* weight1 = weight0 + a.stepW; - if (dstC > F) - { - d00 = _mm256_setzero_ps(), d01 = _mm256_setzero_ps(); - for (size_t ky = 0; ky < kY; ky += dilY) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - if (sy + ky < srcH && sx + kx < srcW) - { - size_t offs = beg + kx * dX, end = offs + srcC, offw = 0; - for (; offs < end; ++offs, offw += F) - { - w0 = _mm256_loadu_ps(weight0 + offw); - w1 = _mm256_loadu_ps(weight1 + offw); - s0 = _mm256_set1_ps(src0[offs]), d00 = _mm256_fmadd_ps(s0, w0, d00), d01 = _mm256_fmadd_ps(s0, w1, d01); - } - } - weight0 += dW, weight1 += dW; - } - } - if (dstC == DF) - Save2(dst, d00, d01, bias, params); - else - Save2(dst, d00, d01, bias, params, dstC - F); - } - else - { - d00 = _mm256_setzero_ps(); - for (size_t ky = 0; ky < kY; ky += dilY) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - if (sy + ky < srcH && sx + kx < srcW) - { - size_t offs = beg + kx * dX, end = offs + srcC, offw = 0; - for (; offs < end; ++offs, offw += F) - { - w0 = _mm256_loadu_ps(weight0 + offw); - s0 = _mm256_set1_ps(src0[offs]), d00 = _mm256_fmadd_ps(s0, w0, d00); - } - } - weight0 += dW; - } - } - if (dstC == F) - Save1(dst, d00, bias, params); - else - Save1(dst, d00, bias, params, dstC); - } - } - - template void ConvolutionNhwcDirect_2x6(const float* src0, const ConvParam32f& p, - const AlgParam& a, size_t dy, size_t dx, size_t srcC, size_t dstC, const float* weight0, const __m256* bias, const __m256* params, float* dst) - { - __m256 d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, s0, w0, w1; - size_t srcH = p.srcH, srcW = p.srcW, dilY = p.dilationY, dilX = p.dilationX; - size_t dY = p.srcW * p.srcC, dX = p.srcC, dS = p.srcC * p.strideX, dW = p.srcC * F, dWz = p.kernelX * p.srcC * F, dD = p.dstC; - size_t sy = dy * p.strideY - p.padY, sx = dx * p.strideX - p.padX; - size_t kY = p.kernelY * p.dilationY, kX = p.kernelX * p.dilationX; - const float* weight1 = weight0 + a.stepW; - const float* src1 = src0 + 1 * dS; - const float* src2 = src0 + 2 * dS; - const float* src3 = src0 + 3 * dS; - const float* src4 = src0 + 4 * dS; - const float* src5 = src0 + 5 * 
dS; - if (dstC > F) - { - d00 = _mm256_setzero_ps(), d01 = _mm256_setzero_ps(); - d10 = _mm256_setzero_ps(), d11 = _mm256_setzero_ps(); - d20 = _mm256_setzero_ps(), d21 = _mm256_setzero_ps(); - d30 = _mm256_setzero_ps(), d31 = _mm256_setzero_ps(); - d40 = _mm256_setzero_ps(), d41 = _mm256_setzero_ps(); - d50 = _mm256_setzero_ps(), d51 = _mm256_setzero_ps(); - for (size_t ky = 0; ky < kY; ky += dilY) - { - if (sy + ky < srcH) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - assert(sx + kx < srcW && sx + kx + 6 <= srcW); - size_t offs = beg + kx * dX, end = offs + srcC, offw = 0; - for (; offs < end; ++offs, offw += F) - { - w0 = _mm256_loadu_ps(weight0 + offw); - w1 = _mm256_loadu_ps(weight1 + offw); - s0 = _mm256_set1_ps(src0[offs]), d00 = _mm256_fmadd_ps(s0, w0, d00), d01 = _mm256_fmadd_ps(s0, w1, d01); - s0 = _mm256_set1_ps(src1[offs]), d10 = _mm256_fmadd_ps(s0, w0, d10), d11 = _mm256_fmadd_ps(s0, w1, d11); - s0 = _mm256_set1_ps(src2[offs]), d20 = _mm256_fmadd_ps(s0, w0, d20), d21 = _mm256_fmadd_ps(s0, w1, d21); - s0 = _mm256_set1_ps(src3[offs]), d30 = _mm256_fmadd_ps(s0, w0, d30), d31 = _mm256_fmadd_ps(s0, w1, d31); - s0 = _mm256_set1_ps(src4[offs]), d40 = _mm256_fmadd_ps(s0, w0, d40), d41 = _mm256_fmadd_ps(s0, w1, d41); - s0 = _mm256_set1_ps(src5[offs]), d50 = _mm256_fmadd_ps(s0, w0, d50), d51 = _mm256_fmadd_ps(s0, w1, d51); - } - weight0 += dW, weight1 += dW; - } - } - else - weight0 += dWz, weight1 += dWz; - } - if (dstC == DF) - { - Save2(dst, d00, d01, bias, params), dst += dD; - Save2(dst, d10, d11, bias, params), dst += dD; - Save2(dst, d20, d21, bias, params), dst += dD; - Save2(dst, d30, d31, bias, params), dst += dD; - Save2(dst, d40, d41, bias, params), dst += dD; - Save2(dst, d50, d51, bias, params), dst += dD; - } - else - { - dstC -= F; - Save2(dst, d00, d01, bias, params, dstC), dst += dD; - Save2(dst, d10, d11, bias, params, dstC), dst += dD; - Save2(dst, d20, d21, bias, params, dstC), dst += dD; - Save2(dst, d30, d31, bias, params, dstC), dst += dD; - Save2(dst, d40, d41, bias, params, dstC), dst += dD; - Save2(dst, d50, d51, bias, params, dstC), dst += dD; - } - } - else - { - d00 = _mm256_setzero_ps(); - d10 = _mm256_setzero_ps(); - d20 = _mm256_setzero_ps(); - d30 = _mm256_setzero_ps(); - d40 = _mm256_setzero_ps(); - d50 = _mm256_setzero_ps(); - for (size_t ky = 0; ky < kY; ky += dilY) - { - if (sy + ky < srcH) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - assert(sx + kx < srcW && sx + kx + 6 <= srcW); - size_t offs = beg + kx * dX, end = offs + srcC, offw = 0; - for (; offs < end; ++offs, offw += F) - { - w0 = _mm256_loadu_ps(weight0 + offw); - s0 = _mm256_set1_ps(src0[offs]), d00 = _mm256_fmadd_ps(s0, w0, d00); - s0 = _mm256_set1_ps(src1[offs]), d10 = _mm256_fmadd_ps(s0, w0, d10); - s0 = _mm256_set1_ps(src2[offs]), d20 = _mm256_fmadd_ps(s0, w0, d20); - s0 = _mm256_set1_ps(src3[offs]), d30 = _mm256_fmadd_ps(s0, w0, d30); - s0 = _mm256_set1_ps(src4[offs]), d40 = _mm256_fmadd_ps(s0, w0, d40); - s0 = _mm256_set1_ps(src5[offs]), d50 = _mm256_fmadd_ps(s0, w0, d50); - } - weight0 += dW; - } - } - else - weight0 += dWz; - } - if (dstC == F) - { - Save1(dst, d00, bias, params), dst += dD; - Save1(dst, d10, bias, params), dst += dD; - Save1(dst, d20, bias, params), dst += dD; - Save1(dst, d30, bias, params), dst += dD; - Save1(dst, d40, bias, params), dst += dD; - Save1(dst, d50, bias, params), dst += dD; - } - else - { - Save1(dst, d00, bias, params, dstC), dst += dD; - Save1(dst, 
d10, bias, params, dstC), dst += dD; - Save1(dst, d20, bias, params, dstC), dst += dD; - Save1(dst, d30, bias, params, dstC), dst += dD; - Save1(dst, d40, bias, params, dstC), dst += dD; - Save1(dst, d50, bias, params, dstC), dst += dD; - } - } - } - - template void ConvolutionNhwcDirect_2xM(const float* src0, const ConvParam32f& p, - const AlgParam& a, size_t dy, size_t dx, size_t srcC, size_t dstC, const float* weight0, const __m256* bias, const __m256* params, float* dst) - { - __m256 d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, s0, w0, w1; - size_t srcH = p.srcH, srcW = p.srcW, dilY = p.dilationY, dilX = p.dilationX; - size_t dY = p.srcW * p.srcC, dX = p.srcC, dS = p.srcC * p.strideX, dW = p.srcC * F, dWz = p.kernelX * p.srcC * F, dD = p.dstC; - size_t sy = dy * p.strideY - p.padY, sx = dx * p.strideX - p.padX; - size_t kY = p.kernelY * p.dilationY, kX = p.kernelX * p.dilationX; - const float* weight1 = weight0 + a.stepW; - const float* src1 = src0 + 1 * dS; - const float* src2 = src0 + 2 * dS; - const float* src3 = src0 + 3 * dS; - const float* src4 = src0 + 4 * dS; - const float* src5 = src0 + 5 * dS; - if (dstC > F) - { - if (M > 0) d00 = _mm256_setzero_ps(), d01 = _mm256_setzero_ps(); - if (M > 1) d10 = _mm256_setzero_ps(), d11 = _mm256_setzero_ps(); - if (M > 2) d20 = _mm256_setzero_ps(), d21 = _mm256_setzero_ps(); - if (M > 3) d30 = _mm256_setzero_ps(), d31 = _mm256_setzero_ps(); - if (M > 4) d40 = _mm256_setzero_ps(), d41 = _mm256_setzero_ps(); - if (M > 5) d50 = _mm256_setzero_ps(), d51 = _mm256_setzero_ps(); - for (size_t ky = 0; ky < kY; ky += dilY) - { - if (sy + ky < srcH) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - assert(sx + kx < srcW && sx + kx + M <= srcW); - size_t offs = beg + kx * dX, end = offs + srcC, offw = 0; - for (; offs < end; ++offs, offw += F) - { - w0 = _mm256_loadu_ps(weight0 + offw); - w1 = _mm256_loadu_ps(weight1 + offw); - if (M > 0) s0 = _mm256_set1_ps(src0[offs]), d00 = _mm256_fmadd_ps(s0, w0, d00), d01 = _mm256_fmadd_ps(s0, w1, d01); - if (M > 1) s0 = _mm256_set1_ps(src1[offs]), d10 = _mm256_fmadd_ps(s0, w0, d10), d11 = _mm256_fmadd_ps(s0, w1, d11); - if (M > 2) s0 = _mm256_set1_ps(src2[offs]), d20 = _mm256_fmadd_ps(s0, w0, d20), d21 = _mm256_fmadd_ps(s0, w1, d21); - if (M > 3) s0 = _mm256_set1_ps(src3[offs]), d30 = _mm256_fmadd_ps(s0, w0, d30), d31 = _mm256_fmadd_ps(s0, w1, d31); - if (M > 4) s0 = _mm256_set1_ps(src4[offs]), d40 = _mm256_fmadd_ps(s0, w0, d40), d41 = _mm256_fmadd_ps(s0, w1, d41); - if (M > 5) s0 = _mm256_set1_ps(src5[offs]), d50 = _mm256_fmadd_ps(s0, w0, d50), d51 = _mm256_fmadd_ps(s0, w1, d51); - } - weight0 += dW, weight1 += dW; - } - } - else - weight0 += dWz, weight1 += dWz; - } - if (dstC == DF) - { - if (M > 0) Save2(dst, d00, d01, bias, params), dst += dD; - if (M > 1) Save2(dst, d10, d11, bias, params), dst += dD; - if (M > 2) Save2(dst, d20, d21, bias, params), dst += dD; - if (M > 3) Save2(dst, d30, d31, bias, params), dst += dD; - if (M > 4) Save2(dst, d40, d41, bias, params), dst += dD; - if (M > 5) Save2(dst, d50, d51, bias, params), dst += dD; - } - else - { - dstC -= F; - if (M > 0) Save2(dst, d00, d01, bias, params, dstC), dst += dD; - if (M > 1) Save2(dst, d10, d11, bias, params, dstC), dst += dD; - if (M > 2) Save2(dst, d20, d21, bias, params, dstC), dst += dD; - if (M > 3) Save2(dst, d30, d31, bias, params, dstC), dst += dD; - if (M > 4) Save2(dst, d40, d41, bias, params, dstC), dst += dD; - if (M > 5) Save2(dst, d50, d51, bias, params, dstC), dst += 
dD; - } - } - else - { - if (M > 0) d00 = _mm256_setzero_ps(); - if (M > 1) d10 = _mm256_setzero_ps(); - if (M > 2) d20 = _mm256_setzero_ps(); - if (M > 3) d30 = _mm256_setzero_ps(); - if (M > 4) d40 = _mm256_setzero_ps(); - if (M > 5) d50 = _mm256_setzero_ps(); - for (size_t ky = 0; ky < kY; ky += dilY) - { - if (sy + ky < srcH) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - assert(sx + kx < srcW && sx + kx + M <= srcW); - size_t offs = beg + kx * dX, end = offs + srcC, offw = 0; - for (; offs < end; ++offs, offw += F) - { - w0 = _mm256_loadu_ps(weight0 + offw); - if (M > 0) s0 = _mm256_set1_ps(src0[offs]), d00 = _mm256_fmadd_ps(s0, w0, d00); - if (M > 1) s0 = _mm256_set1_ps(src1[offs]), d10 = _mm256_fmadd_ps(s0, w0, d10); - if (M > 2) s0 = _mm256_set1_ps(src2[offs]), d20 = _mm256_fmadd_ps(s0, w0, d20); - if (M > 3) s0 = _mm256_set1_ps(src3[offs]), d30 = _mm256_fmadd_ps(s0, w0, d30); - if (M > 4) s0 = _mm256_set1_ps(src4[offs]), d40 = _mm256_fmadd_ps(s0, w0, d40); - if (M > 5) s0 = _mm256_set1_ps(src5[offs]), d50 = _mm256_fmadd_ps(s0, w0, d50); - } - weight0 += dW; - } - } - else - weight0 += dWz; - } - if (dstC == F) - { - if (M > 0) Save1(dst, d00, bias, params), dst += dD; - if (M > 1) Save1(dst, d10, bias, params), dst += dD; - if (M > 2) Save1(dst, d20, bias, params), dst += dD; - if (M > 3) Save1(dst, d30, bias, params), dst += dD; - if (M > 4) Save1(dst, d40, bias, params), dst += dD; - if (M > 5) Save1(dst, d50, bias, params), dst += dD; - } - else - { - if (M > 0) Save1(dst, d00, bias, params, dstC), dst += dD; - if (M > 1) Save1(dst, d10, bias, params, dstC), dst += dD; - if (M > 2) Save1(dst, d20, bias, params, dstC), dst += dD; - if (M > 3) Save1(dst, d30, bias, params, dstC), dst += dD; - if (M > 4) Save1(dst, d40, bias, params, dstC), dst += dD; - if (M > 5) Save1(dst, d50, bias, params, dstC), dst += dD; - } - } - } - - template ConvolutionNhwcDirect_NxM_Ptr GetConvolutionNhwcDirect_2xM(size_t M) - { - switch (M) - { - case 0: return NULL; - case 1: return ConvolutionNhwcDirect_2xM; - case 2: return ConvolutionNhwcDirect_2xM; - case 3: return ConvolutionNhwcDirect_2xM; - case 4: return ConvolutionNhwcDirect_2xM; - case 5: return ConvolutionNhwcDirect_2xM; - } - assert(0); - return NULL; - } - - template void ConvolutionNhwcDirect_2(const float* src, const ConvParam32f& p, const AlgParam& a, - size_t dstC, size_t yBeg, size_t yEnd, size_t srcC, const float* weight, const float* bias, const float* params, float* dst) - { - size_t noseH = p.NoseH(), noseW = p.NoseW(), bodyH = p.BodyH(), bodyW = p.BodyW(); - size_t n = 6, bodyWn = AlignLoAny(bodyW - noseW, n) + noseW, m = bodyW - bodyWn; - ConvolutionNhwcDirect_NxM_Ptr convolutionNhwcDirect_2x1 = ConvolutionNhwcDirect_2x1; - ConvolutionNhwcDirect_NxM_Ptr convolutionNhwcDirect_2xN = ConvolutionNhwcDirect_2x6; - ConvolutionNhwcDirect_NxM_Ptr convolutionNhwcDirect_2xM = GetConvolutionNhwcDirect_2xM(m); - size_t tailH = p.dstH, tailW = p.dstW; - size_t kY = p.kernelY - noseH, kX = p.kernelX - noseW, kH = bodyH + p.kernelY - 1, kW = bodyW + p.kernelX - 1; - - __m256 _params[2], _bias[2]; - _params[0] = _mm256_set1_ps(params[0]); - if (type == ::SimdConvolutionActivationRestrictRange || type == ::SimdConvolutionActivationHswish) - _params[1] = _mm256_set1_ps(params[1]); - - for (size_t dc = 0; dc < dstC; dc += a.microD) - { - size_t dC = Simd::Min(a.microD, dstC - dc); - if (dC > 0 * F) _bias[0] = _mm256_loadu_ps(bias + dc + 0 * F); - if (dC > 1 * F) _bias[1] = _mm256_loadu_ps(bias + dc 
+ 1 * F); - if (type == ::SimdConvolutionActivationPrelu) - { - if (dC > 0 * F) _params[0] = _mm256_loadu_ps(params + dc + 0 * F); - if (dC > 1 * F) _params[1] = _mm256_loadu_ps(params + dc + 1 * F); - } - float* d = dst + dc + yBeg * p.dstW * p.dstC; - size_t dy = yBeg; - for (; dy < noseH && dy < yEnd; dy++) - { - size_t dx = 0; - for (; dx < noseW; dx++, d += p.dstC) - convolutionNhwcDirect_2x1(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d); - for (; dx < bodyWn; dx += n, d += p.dstC * n) - convolutionNhwcDirect_2xN(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d); - for (; dx < bodyW; dx += m, d += p.dstC * m) - convolutionNhwcDirect_2xM(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d); - for (; dx < tailW; dx++, d += p.dstC) - convolutionNhwcDirect_2x1(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d); - } - for (; dy < bodyH && dy < yEnd; dy++) - { - size_t dx = 0; - for (; dx < noseW; dx++, d += p.dstC) - convolutionNhwcDirect_2x1(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d); - for (; dx < bodyWn; dx += n, d += p.dstC * n) - convolutionNhwcDirect_2xN(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d); - for (; dx < bodyW; dx += m, d += p.dstC * m) - convolutionNhwcDirect_2xM(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d); - for (; dx < tailW; dx++, d += p.dstC) - convolutionNhwcDirect_2x1(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d); - } - for (; dy < tailH && dy < yEnd; dy++) - { - size_t dx = 0; - for (; dx < noseW; dx++, d += p.dstC) - convolutionNhwcDirect_2x1(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d); - for (; dx < bodyWn; dx += n, d += p.dstC * n) - convolutionNhwcDirect_2xN(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d); - for (; dx < bodyW; dx += m, d += p.dstC * m) - convolutionNhwcDirect_2xM(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d); - for (; dx < tailW; dx++, d += p.dstC) - convolutionNhwcDirect_2x1(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d); - } - weight += p.kernelY * p.kernelX * p.srcC * a.microD; - } - } - - //--------------------------------------------------------------------- - - template void ConvolutionNhwcDirect1x1_2x6(const float* src0, const ConvParam32f& p, - const AlgParam& a, size_t srcC, size_t dstC, const float* weight0, const __m256* bias, const __m256* params, float* dst) - { - __m256 d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, s0, w0, w1; - size_t dS = p.srcC, dD = p.dstC; - const float* weight1 = weight0 + a.stepW; - const float* src1 = src0 + 1 * dS; - const float* src2 = src0 + 2 * dS; - const float* src3 = src0 + 3 * dS; - const float* src4 = src0 + 4 * dS; - const float* src5 = src0 + 5 * dS; - if (dstC > F) - { - d00 = _mm256_setzero_ps(), d01 = _mm256_setzero_ps(); - d10 = _mm256_setzero_ps(), d11 = _mm256_setzero_ps(); - d20 = _mm256_setzero_ps(), d21 = _mm256_setzero_ps(); - d30 = _mm256_setzero_ps(), d31 = _mm256_setzero_ps(); - d40 = _mm256_setzero_ps(), d41 = _mm256_setzero_ps(); - d50 = _mm256_setzero_ps(), d51 = _mm256_setzero_ps(); - for (size_t offs = 0, offw = 0; offs < srcC; ++offs, offw += F) - { - w0 = _mm256_loadu_ps(weight0 + offw); - w1 = _mm256_loadu_ps(weight1 + offw); - s0 = _mm256_set1_ps(src0[offs]), d00 = _mm256_fmadd_ps(s0, w0, d00), d01 = _mm256_fmadd_ps(s0, w1, d01); - s0 = _mm256_set1_ps(src1[offs]), d10 = _mm256_fmadd_ps(s0, w0, d10), d11 = _mm256_fmadd_ps(s0, w1, d11); - s0 = _mm256_set1_ps(src2[offs]), d20 = _mm256_fmadd_ps(s0, w0, d20), d21 = _mm256_fmadd_ps(s0, w1, d21); - s0 = 
_mm256_set1_ps(src3[offs]), d30 = _mm256_fmadd_ps(s0, w0, d30), d31 = _mm256_fmadd_ps(s0, w1, d31); - s0 = _mm256_set1_ps(src4[offs]), d40 = _mm256_fmadd_ps(s0, w0, d40), d41 = _mm256_fmadd_ps(s0, w1, d41); - s0 = _mm256_set1_ps(src5[offs]), d50 = _mm256_fmadd_ps(s0, w0, d50), d51 = _mm256_fmadd_ps(s0, w1, d51); - } - if (dstC == DF) - { - Save2(dst, d00, d01, bias, params), dst += dD; - Save2(dst, d10, d11, bias, params), dst += dD; - Save2(dst, d20, d21, bias, params), dst += dD; - Save2(dst, d30, d31, bias, params), dst += dD; - Save2(dst, d40, d41, bias, params), dst += dD; - Save2(dst, d50, d51, bias, params), dst += dD; - } - else - { - dstC -= F; - Save2(dst, d00, d01, bias, params, dstC), dst += dD; - Save2(dst, d10, d11, bias, params, dstC), dst += dD; - Save2(dst, d20, d21, bias, params, dstC), dst += dD; - Save2(dst, d30, d31, bias, params, dstC), dst += dD; - Save2(dst, d40, d41, bias, params, dstC), dst += dD; - Save2(dst, d50, d51, bias, params, dstC), dst += dD; - } - } - else - { - d00 = _mm256_setzero_ps(); - d10 = _mm256_setzero_ps(); - d20 = _mm256_setzero_ps(); - d30 = _mm256_setzero_ps(); - d40 = _mm256_setzero_ps(); - d50 = _mm256_setzero_ps(); - for (size_t offs = 0, offw = 0; offs < srcC; ++offs, offw += F) - { - w0 = _mm256_loadu_ps(weight0 + offw); - s0 = _mm256_set1_ps(src0[offs]), d00 = _mm256_fmadd_ps(s0, w0, d00); - s0 = _mm256_set1_ps(src1[offs]), d10 = _mm256_fmadd_ps(s0, w0, d10); - s0 = _mm256_set1_ps(src2[offs]), d20 = _mm256_fmadd_ps(s0, w0, d20); - s0 = _mm256_set1_ps(src3[offs]), d30 = _mm256_fmadd_ps(s0, w0, d30); - s0 = _mm256_set1_ps(src4[offs]), d40 = _mm256_fmadd_ps(s0, w0, d40); - s0 = _mm256_set1_ps(src5[offs]), d50 = _mm256_fmadd_ps(s0, w0, d50); - } - if (dstC == F) - { - Save1(dst, d00, bias, params), dst += dD; - Save1(dst, d10, bias, params), dst += dD; - Save1(dst, d20, bias, params), dst += dD; - Save1(dst, d30, bias, params), dst += dD; - Save1(dst, d40, bias, params), dst += dD; - Save1(dst, d50, bias, params), dst += dD; - } - else - { - Save1(dst, d00, bias, params, dstC), dst += dD; - Save1(dst, d10, bias, params, dstC), dst += dD; - Save1(dst, d20, bias, params, dstC), dst += dD; - Save1(dst, d30, bias, params, dstC), dst += dD; - Save1(dst, d40, bias, params, dstC), dst += dD; - Save1(dst, d50, bias, params, dstC), dst += dD; - } - } - } - - template void ConvolutionNhwcDirect1x1_2xM(const float* src0, const ConvParam32f& p, - const AlgParam& a, size_t srcC, size_t dstC, const float* weight0, const __m256* bias, const __m256* params, float* dst) - { - __m256 d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, s0, w0, w1; - size_t dS = p.srcC, dD = p.dstC; - const float* weight1 = weight0 + a.stepW; - const float* src1 = src0 + 1 * dS; - const float* src2 = src0 + 2 * dS; - const float* src3 = src0 + 3 * dS; - const float* src4 = src0 + 4 * dS; - const float* src5 = src0 + 5 * dS; - if (dstC > F) - { - if (M > 0) d00 = _mm256_setzero_ps(), d01 = _mm256_setzero_ps(); - if (M > 1) d10 = _mm256_setzero_ps(), d11 = _mm256_setzero_ps(); - if (M > 2) d20 = _mm256_setzero_ps(), d21 = _mm256_setzero_ps(); - if (M > 3) d30 = _mm256_setzero_ps(), d31 = _mm256_setzero_ps(); - if (M > 4) d40 = _mm256_setzero_ps(), d41 = _mm256_setzero_ps(); - if (M > 5) d50 = _mm256_setzero_ps(), d51 = _mm256_setzero_ps(); - for (size_t offs = 0, offw = 0; offs < srcC; ++offs, offw += F) - { - w0 = _mm256_loadu_ps(weight0 + offw); - w1 = _mm256_loadu_ps(weight1 + offw); - if (M > 0) s0 = _mm256_set1_ps(src0[offs]), d00 = _mm256_fmadd_ps(s0, w0, d00), 
d01 = _mm256_fmadd_ps(s0, w1, d01); - if (M > 1) s0 = _mm256_set1_ps(src1[offs]), d10 = _mm256_fmadd_ps(s0, w0, d10), d11 = _mm256_fmadd_ps(s0, w1, d11); - if (M > 2) s0 = _mm256_set1_ps(src2[offs]), d20 = _mm256_fmadd_ps(s0, w0, d20), d21 = _mm256_fmadd_ps(s0, w1, d21); - if (M > 3) s0 = _mm256_set1_ps(src3[offs]), d30 = _mm256_fmadd_ps(s0, w0, d30), d31 = _mm256_fmadd_ps(s0, w1, d31); - if (M > 4) s0 = _mm256_set1_ps(src4[offs]), d40 = _mm256_fmadd_ps(s0, w0, d40), d41 = _mm256_fmadd_ps(s0, w1, d41); - if (M > 5) s0 = _mm256_set1_ps(src5[offs]), d50 = _mm256_fmadd_ps(s0, w0, d50), d51 = _mm256_fmadd_ps(s0, w1, d51); - } - if (dstC == DF) - { - if (M > 0) Save2(dst, d00, d01, bias, params), dst += dD; - if (M > 1) Save2(dst, d10, d11, bias, params), dst += dD; - if (M > 2) Save2(dst, d20, d21, bias, params), dst += dD; - if (M > 3) Save2(dst, d30, d31, bias, params), dst += dD; - if (M > 4) Save2(dst, d40, d41, bias, params), dst += dD; - if (M > 5) Save2(dst, d50, d51, bias, params), dst += dD; - } - else - { - dstC -= F; - if (M > 0) Save2(dst, d00, d01, bias, params, dstC), dst += dD; - if (M > 1) Save2(dst, d10, d11, bias, params, dstC), dst += dD; - if (M > 2) Save2(dst, d20, d21, bias, params, dstC), dst += dD; - if (M > 3) Save2(dst, d30, d31, bias, params, dstC), dst += dD; - if (M > 4) Save2(dst, d40, d41, bias, params, dstC), dst += dD; - if (M > 5) Save2(dst, d50, d51, bias, params, dstC), dst += dD; - } - } - else - { - if (M > 0) d00 = _mm256_setzero_ps(); - if (M > 1) d10 = _mm256_setzero_ps(); - if (M > 2) d20 = _mm256_setzero_ps(); - if (M > 3) d30 = _mm256_setzero_ps(); - if (M > 4) d40 = _mm256_setzero_ps(); - if (M > 5) d50 = _mm256_setzero_ps(); - for (size_t offs = 0, offw = 0; offs < srcC; ++offs, offw += F) - { - w0 = _mm256_loadu_ps(weight0 + offw); - if (M > 0) s0 = _mm256_set1_ps(src0[offs]), d00 = _mm256_fmadd_ps(s0, w0, d00); - if (M > 1) s0 = _mm256_set1_ps(src1[offs]), d10 = _mm256_fmadd_ps(s0, w0, d10); - if (M > 2) s0 = _mm256_set1_ps(src2[offs]), d20 = _mm256_fmadd_ps(s0, w0, d20); - if (M > 3) s0 = _mm256_set1_ps(src3[offs]), d30 = _mm256_fmadd_ps(s0, w0, d30); - if (M > 4) s0 = _mm256_set1_ps(src4[offs]), d40 = _mm256_fmadd_ps(s0, w0, d40); - if (M > 5) s0 = _mm256_set1_ps(src5[offs]), d50 = _mm256_fmadd_ps(s0, w0, d50); - } - if (dstC == F) - { - if (M > 0) Save1(dst, d00, bias, params), dst += dD; - if (M > 1) Save1(dst, d10, bias, params), dst += dD; - if (M > 2) Save1(dst, d20, bias, params), dst += dD; - if (M > 3) Save1(dst, d30, bias, params), dst += dD; - if (M > 4) Save1(dst, d40, bias, params), dst += dD; - if (M > 5) Save1(dst, d50, bias, params), dst += dD; - } - else - { - if (M > 0) Save1(dst, d00, bias, params, dstC), dst += dD; - if (M > 1) Save1(dst, d10, bias, params, dstC), dst += dD; - if (M > 2) Save1(dst, d20, bias, params, dstC), dst += dD; - if (M > 3) Save1(dst, d30, bias, params, dstC), dst += dD; - if (M > 4) Save1(dst, d40, bias, params, dstC), dst += dD; - if (M > 5) Save1(dst, d50, bias, params, dstC), dst += dD; - } - } - } - - template ConvolutionNhwcDirect1x1_NxM_Ptr GetConvolutionNhwcDirect1x1_2xM(size_t M) - { - switch (M) - { - case 0: return NULL; - case 1: return ConvolutionNhwcDirect1x1_2xM; - case 2: return ConvolutionNhwcDirect1x1_2xM; - case 3: return ConvolutionNhwcDirect1x1_2xM; - case 4: return ConvolutionNhwcDirect1x1_2xM; - case 5: return ConvolutionNhwcDirect1x1_2xM; - } - assert(0); - return NULL; - } - - template void ConvolutionNhwcDirect1x1_2(const float* src, const ConvParam32f& p, const AlgParam& a, - 
size_t dstC, size_t yBeg, size_t yEnd, size_t srcC, const float* weight, const float* bias, const float* params, float* dst) - { - size_t n = 6, n1 = (yEnd - yBeg) * p.dstW, nn = AlignLoAny(n1, n), m = n1 - nn; - ConvolutionNhwcDirect1x1_NxM_Ptr convolutionNhwcDirect1x1_2xN = ConvolutionNhwcDirect1x1_2x6; - ConvolutionNhwcDirect1x1_NxM_Ptr convolutionNhwcDirect1x1_2xM = GetConvolutionNhwcDirect1x1_2xM(m); - - __m256 _params[2], _bias[2]; - _params[0] = _mm256_set1_ps(params[0]); - if (type == ::SimdConvolutionActivationRestrictRange || type == ::SimdConvolutionActivationHswish) - _params[1] = _mm256_set1_ps(params[1]); - - for (size_t dc = 0; dc < dstC; dc += a.microD) - { - size_t dC = Simd::Min(a.microD, dstC - dc); - if (dC > 0 * F) _bias[0] = _mm256_loadu_ps(bias + dc + 0 * F); - if (dC > 1 * F) _bias[1] = _mm256_loadu_ps(bias + dc + 1 * F); - if (type == ::SimdConvolutionActivationPrelu) - { - if (dC > 0 * F) _params[0] = _mm256_loadu_ps(params + dc + 0 * F); - if (dC > 1 * F) _params[1] = _mm256_loadu_ps(params + dc + 1 * F); - } - const float* ps = src + yBeg * p.srcW * p.srcC; - float* pd = dst + dc + yBeg * p.dstW * p.dstC; - size_t i = 0; - for (; i < nn; i += n, ps += n * p.srcC, pd += n * p.dstC) - convolutionNhwcDirect1x1_2xN(ps, p, a, srcC, dC, weight, _bias, _params, pd); - for (; i < n1; i += m, ps += m * p.srcC, pd += m * p.dstC) - convolutionNhwcDirect1x1_2xM(ps, p, a, srcC, dC, weight, _bias, _params, pd); - weight += p.srcC * a.microD; - } - } - - //--------------------------------------------------------------------- - - template static SIMD_INLINE void Set(const ConvParam32f& p, AlgParam& a) - { - a.convolutions[term] = p.Is1x1() ? ConvolutionNhwcDirect1x1_2 : ConvolutionNhwcDirect_2; - } - - template static SIMD_INLINE void Set(const ConvParam32f& p, AlgParam& a) - { - Set(p, a); - Set(p, a); - Set(p, a); - Set(p, a); - } - - bool SynetConvolution32fNhwcDirect::Set2r(const ConvParam32f& p, AlgParam& a) - { - assert(a.microD == 2 * F); - switch (p.activation) - { - case SimdConvolutionActivationIdentity: Set(p, a); break; - case SimdConvolutionActivationRelu: Set(p, a); break; - case SimdConvolutionActivationLeakyRelu: Set(p, a); break; - case SimdConvolutionActivationRestrictRange: Set(p, a); break; - case SimdConvolutionActivationPrelu: Set(p, a); break; - case SimdConvolutionActivationElu: Set(p, a); break; - case SimdConvolutionActivationHswish: Set(p, a); break; - default: assert(0); - } - return true; - } - } -#endif//SIMD_AVX2_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx2SynetConvolution32fNhwcDirect3r.cpp b/src/3rd/Simd/Simd/SimdAvx2SynetConvolution32fNhwcDirect3r.cpp deleted file mode 100644 index 0859f667..00000000 --- a/src/3rd/Simd/Simd/SimdAvx2SynetConvolution32fNhwcDirect3r.cpp +++ /dev/null @@ -1,808 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdSynetConvolution32f.h" -#include "Simd/SimdSynetConvolution32fCommon.h" -#include "Simd/SimdCpu.h" - -namespace Simd -{ -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - using AlgParam = SynetConvolution32fNhwcDirect::AlgParam; - - typedef void(*ConvolutionNhwcDirect_NxM_Ptr)(const float* src0, const ConvParam32f& p, const AlgParam& a, size_t dy, size_t dx, size_t srcC, size_t dstC, const float* weight0, const __m256* bias, const __m256* params, float* dst); - typedef void(*ConvolutionNhwcDirect1x1_NxM_Ptr)(const float* src0, const ConvParam32f& p, const AlgParam& a, size_t srcC, size_t dstC, const float* weight0, const __m256* bias, const __m256* params, float* dst); - - //--------------------------------------------------------------------- - - template void ConvolutionNhwcDirect_3x1(const float* src0, const ConvParam32f& p, - const AlgParam& a, size_t dy, size_t dx, size_t srcC, size_t dstC, const float* weight0, const __m256* bias, const __m256* params, float* dst) - { - __m256 d00, d01, d02, s0, w0, w1, w2; - size_t srcH = p.srcH, srcW = p.srcW, dilY = p.dilationY, dilX = p.dilationX; - size_t dY = p.srcW * p.srcC, dX = p.srcC, dS = p.srcC * p.strideX, dW = p.srcC * F; - size_t sy = dy * p.strideY - p.padY, sx = dx * p.strideX - p.padX; - size_t kY = p.kernelY * p.dilationY, kX = p.kernelX * p.dilationX; - const float* weight1 = weight0 + a.stepW; - const float* weight2 = weight1 + a.stepW; - if (dstC > 2 * F) - { - d00 = _mm256_setzero_ps(), d01 = _mm256_setzero_ps(), d02 = _mm256_setzero_ps(); - for (size_t ky = 0; ky < kY; ky += dilY) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - if (sy + ky < srcH && sx + kx < srcW) - { - size_t offs = beg + kx * dX, end = offs + srcC, offw = 0; - for (; offs < end; ++offs, offw += F) - { - w0 = _mm256_loadu_ps(weight0 + offw); - w1 = _mm256_loadu_ps(weight1 + offw); - w2 = _mm256_loadu_ps(weight2 + offw); - s0 = _mm256_set1_ps(src0[offs]), d00 = _mm256_fmadd_ps(s0, w0, d00), d01 = _mm256_fmadd_ps(s0, w1, d01), d02 = _mm256_fmadd_ps(s0, w2, d02); - } - } - weight0 += dW, weight1 += dW, weight2 += dW; - } - } - if (dstC == 3 * F) - Save3(dst, d00, d01, d02, bias, params); - else - Save3(dst, d00, d01, d02, bias, params, dstC - 2 * F); - } - else if (dstC > F) - { - d00 = _mm256_setzero_ps(), d01 = _mm256_setzero_ps(); - for (size_t ky = 0; ky < kY; ky += dilY) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - if (sy + ky < srcH && sx + kx < srcW) - { - size_t offs = beg + kx * dX, end = offs + srcC, offw = 0; - for (; offs < end; ++offs, offw += F) - { - w0 = _mm256_loadu_ps(weight0 + offw); - w1 = _mm256_loadu_ps(weight1 + offw); - s0 = _mm256_set1_ps(src0[offs]), d00 = _mm256_fmadd_ps(s0, w0, d00), d01 = _mm256_fmadd_ps(s0, w1, d01); - } - } - weight0 += dW, weight1 += dW; - } - } - if (dstC == 2 * F) - Save2(dst, d00, d01, bias, params); - else - Save2(dst, d00, d01, bias, params, dstC - F); - } - else - { - d00 = _mm256_setzero_ps(); - for (size_t ky = 0; 
ky < kY; ky += dilY) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - if (sy + ky < srcH && sx + kx < srcW) - { - size_t offs = beg + kx * dX, end = offs + srcC, offw = 0; - for (; offs < end; ++offs, offw += F) - { - w0 = _mm256_loadu_ps(weight0 + offw); - s0 = _mm256_set1_ps(src0[offs]), d00 = _mm256_fmadd_ps(s0, w0, d00); - } - } - weight0 += dW; - } - } - if (dstC == F) - Save1(dst, d00, bias, params); - else - Save1(dst, d00, bias, params, dstC); - } - } - - template void ConvolutionNhwcDirect_3x4(const float* src0, const ConvParam32f& p, - const AlgParam& a, size_t dy, size_t dx, size_t srcC, size_t dstC, const float* weight0, const __m256* bias, const __m256* params, float* dst) - { - __m256 d00, d01, d02, d10, d11, d12, d20, d21, d22, d30, d31, d32, s0, w0, w1, w2; - size_t srcH = p.srcH, srcW = p.srcW, dilY = p.dilationY, dilX = p.dilationX; - size_t dY = p.srcW * p.srcC, dX = p.srcC, dS = p.srcC * p.strideX, dW = p.srcC * F, dWz = p.kernelX * p.srcC * F, dD = p.dstC; - size_t sy = dy * p.strideY - p.padY, sx = dx * p.strideX - p.padX; - size_t kY = p.kernelY * p.dilationY, kX = p.kernelX * p.dilationX; - const float* weight1 = weight0 + a.stepW; - const float* weight2 = weight1 + a.stepW; - const float* src1 = src0 + 1 * dS; - const float* src2 = src0 + 2 * dS; - const float* src3 = src0 + 3 * dS; - if (dstC > 2 * F) - { - d00 = _mm256_setzero_ps(), d01 = _mm256_setzero_ps(), d02 = _mm256_setzero_ps(); - d10 = _mm256_setzero_ps(), d11 = _mm256_setzero_ps(), d12 = _mm256_setzero_ps(); - d20 = _mm256_setzero_ps(), d21 = _mm256_setzero_ps(), d22 = _mm256_setzero_ps(); - d30 = _mm256_setzero_ps(), d31 = _mm256_setzero_ps(), d32 = _mm256_setzero_ps(); - for (size_t ky = 0; ky < kY; ky += dilY) - { - if (sy + ky < srcH) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - assert(sx + kx < srcW && sx + kx + 4 <= srcW); - size_t offs = beg + kx * dX, end = offs + srcC, offw = 0; - for (; offs < end; ++offs, offw += F) - { - w0 = _mm256_loadu_ps(weight0 + offw); - w1 = _mm256_loadu_ps(weight1 + offw); - w2 = _mm256_loadu_ps(weight2 + offw); - s0 = _mm256_set1_ps(src0[offs]), d00 = _mm256_fmadd_ps(s0, w0, d00), d01 = _mm256_fmadd_ps(s0, w1, d01), d02 = _mm256_fmadd_ps(s0, w2, d02); - s0 = _mm256_set1_ps(src1[offs]), d10 = _mm256_fmadd_ps(s0, w0, d10), d11 = _mm256_fmadd_ps(s0, w1, d11), d12 = _mm256_fmadd_ps(s0, w2, d12); - s0 = _mm256_set1_ps(src2[offs]), d20 = _mm256_fmadd_ps(s0, w0, d20), d21 = _mm256_fmadd_ps(s0, w1, d21), d22 = _mm256_fmadd_ps(s0, w2, d22); - s0 = _mm256_set1_ps(src3[offs]), d30 = _mm256_fmadd_ps(s0, w0, d30), d31 = _mm256_fmadd_ps(s0, w1, d31), d32 = _mm256_fmadd_ps(s0, w2, d32); - } - weight0 += dW, weight1 += dW, weight2 += dW; - } - } - else - weight0 += dWz, weight1 += dWz, weight2 += dWz; - } - if (dstC == 3 * F) - { - Save3(dst, d00, d01, d02, bias, params), dst += dD; - Save3(dst, d10, d11, d12, bias, params), dst += dD; - Save3(dst, d20, d21, d22, bias, params), dst += dD; - Save3(dst, d30, d31, d32, bias, params), dst += dD; - } - else - { - dstC -= 2 * F; - Save3(dst, d00, d01, d02, bias, params, dstC), dst += dD; - Save3(dst, d10, d11, d12, bias, params, dstC), dst += dD; - Save3(dst, d20, d21, d22, bias, params, dstC), dst += dD; - Save3(dst, d30, d31, d32, bias, params, dstC), dst += dD; - } - } - else if (dstC > F) - { - d00 = _mm256_setzero_ps(), d01 = _mm256_setzero_ps(); - d10 = _mm256_setzero_ps(), d11 = _mm256_setzero_ps(); - d20 = _mm256_setzero_ps(), d21 = 
_mm256_setzero_ps(); - d30 = _mm256_setzero_ps(), d31 = _mm256_setzero_ps(); - for (size_t ky = 0; ky < kY; ky += dilY) - { - if (sy + ky < srcH) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - assert(sx + kx < srcW && sx + kx + 4 <= srcW); - size_t offs = beg + kx * dX, end = offs + srcC, offw = 0; - for (; offs < end; ++offs, offw += F) - { - w0 = _mm256_loadu_ps(weight0 + offw); - w1 = _mm256_loadu_ps(weight1 + offw); - s0 = _mm256_set1_ps(src0[offs]), d00 = _mm256_fmadd_ps(s0, w0, d00), d01 = _mm256_fmadd_ps(s0, w1, d01); - s0 = _mm256_set1_ps(src1[offs]), d10 = _mm256_fmadd_ps(s0, w0, d10), d11 = _mm256_fmadd_ps(s0, w1, d11); - s0 = _mm256_set1_ps(src2[offs]), d20 = _mm256_fmadd_ps(s0, w0, d20), d21 = _mm256_fmadd_ps(s0, w1, d21); - s0 = _mm256_set1_ps(src3[offs]), d30 = _mm256_fmadd_ps(s0, w0, d30), d31 = _mm256_fmadd_ps(s0, w1, d31); - } - weight0 += dW, weight1 += dW; - } - } - else - weight0 += dWz, weight1 += dWz; - } - if (dstC == 2 * F) - { - Save2(dst, d00, d01, bias, params), dst += dD; - Save2(dst, d10, d11, bias, params), dst += dD; - Save2(dst, d20, d21, bias, params), dst += dD; - Save2(dst, d30, d31, bias, params), dst += dD; - } - else - { - dstC -= 1 * F; - Save2(dst, d00, d01, bias, params, dstC), dst += dD; - Save2(dst, d10, d11, bias, params, dstC), dst += dD; - Save2(dst, d20, d21, bias, params, dstC), dst += dD; - Save2(dst, d30, d31, bias, params, dstC), dst += dD; - } - } - else - { - d00 = _mm256_setzero_ps(); - d10 = _mm256_setzero_ps(); - d20 = _mm256_setzero_ps(); - d30 = _mm256_setzero_ps(); - for (size_t ky = 0; ky < kY; ky += dilY) - { - if (sy + ky < srcH) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - assert(sx + kx < srcW && sx + kx + 4 <= srcW); - size_t offs = beg + kx * dX, end = offs + srcC, offw = 0; - for (; offs < end; ++offs, offw += F) - { - w0 = _mm256_loadu_ps(weight0 + offw); - s0 = _mm256_set1_ps(src0[offs]), d00 = _mm256_fmadd_ps(s0, w0, d00); - s0 = _mm256_set1_ps(src1[offs]), d10 = _mm256_fmadd_ps(s0, w0, d10); - s0 = _mm256_set1_ps(src2[offs]), d20 = _mm256_fmadd_ps(s0, w0, d20); - s0 = _mm256_set1_ps(src3[offs]), d30 = _mm256_fmadd_ps(s0, w0, d30); - } - weight0 += dW; - } - } - else - weight0 += dWz; - } - if (dstC == F) - { - Save1(dst, d00, bias, params), dst += dD; - Save1(dst, d10, bias, params), dst += dD; - Save1(dst, d20, bias, params), dst += dD; - Save1(dst, d30, bias, params), dst += dD; - } - else - { - Save1(dst, d00, bias, params, dstC), dst += dD; - Save1(dst, d10, bias, params, dstC), dst += dD; - Save1(dst, d20, bias, params, dstC), dst += dD; - Save1(dst, d30, bias, params, dstC), dst += dD; - } - } - } - - template void ConvolutionNhwcDirect_3xM(const float* src0, const ConvParam32f& p, - const AlgParam& a, size_t dy, size_t dx, size_t srcC, size_t dstC, const float* weight0, const __m256* bias, const __m256* params, float* dst) - { - __m256 d00, d01, d02, d10, d11, d12, d20, d21, d22, d30, d31, d32, s0, w0, w1, w2; - size_t srcH = p.srcH, srcW = p.srcW, dilY = p.dilationY, dilX = p.dilationX; - size_t dY = p.srcW * p.srcC, dX = p.srcC, dS = p.srcC * p.strideX, dW = p.srcC * F, dWz = p.kernelX * p.srcC * F, dD = p.dstC; - size_t sy = dy * p.strideY - p.padY, sx = dx * p.strideX - p.padX; - size_t kY = p.kernelY * p.dilationY, kX = p.kernelX * p.dilationX; - const float* weight1 = weight0 + a.stepW; - const float* weight2 = weight1 + a.stepW; - const float* src1 = src0 + 1 * dS; - const float* src2 = src0 + 2 * dS; - const 
float* src3 = src0 + 3 * dS; - if (dstC > 2 * F) - { - if (M > 0) d00 = _mm256_setzero_ps(), d01 = _mm256_setzero_ps(), d02 = _mm256_setzero_ps(); - if (M > 1) d10 = _mm256_setzero_ps(), d11 = _mm256_setzero_ps(), d12 = _mm256_setzero_ps(); - if (M > 2) d20 = _mm256_setzero_ps(), d21 = _mm256_setzero_ps(), d22 = _mm256_setzero_ps(); - if (M > 3) d30 = _mm256_setzero_ps(), d31 = _mm256_setzero_ps(), d32 = _mm256_setzero_ps(); - for (size_t ky = 0; ky < kY; ky += dilY) - { - if (sy + ky < srcH) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - assert(sx + kx < srcW && sx + kx + M <= srcW); - size_t offs = beg + kx * dX, end = offs + srcC, offw = 0; - for (; offs < end; ++offs, offw += F) - { - w0 = _mm256_loadu_ps(weight0 + offw); - w1 = _mm256_loadu_ps(weight1 + offw); - w2 = _mm256_loadu_ps(weight2 + offw); - if (M > 0) s0 = _mm256_set1_ps(src0[offs]), d00 = _mm256_fmadd_ps(s0, w0, d00), d01 = _mm256_fmadd_ps(s0, w1, d01), d02 = _mm256_fmadd_ps(s0, w2, d02); - if (M > 1) s0 = _mm256_set1_ps(src1[offs]), d10 = _mm256_fmadd_ps(s0, w0, d10), d11 = _mm256_fmadd_ps(s0, w1, d11), d12 = _mm256_fmadd_ps(s0, w2, d12); - if (M > 2) s0 = _mm256_set1_ps(src2[offs]), d20 = _mm256_fmadd_ps(s0, w0, d20), d21 = _mm256_fmadd_ps(s0, w1, d21), d22 = _mm256_fmadd_ps(s0, w2, d22); - if (M > 3) s0 = _mm256_set1_ps(src3[offs]), d30 = _mm256_fmadd_ps(s0, w0, d30), d31 = _mm256_fmadd_ps(s0, w1, d31), d32 = _mm256_fmadd_ps(s0, w2, d32); - } - weight0 += dW, weight1 += dW, weight2 += dW; - } - } - else - weight0 += dWz, weight1 += dWz, weight2 += dWz; - } - if (dstC == 3 * F) - { - if (M > 0) Save3(dst, d00, d01, d02, bias, params), dst += dD; - if (M > 1) Save3(dst, d10, d11, d12, bias, params), dst += dD; - if (M > 2) Save3(dst, d20, d21, d22, bias, params), dst += dD; - if (M > 3) Save3(dst, d30, d31, d32, bias, params), dst += dD; - } - else - { - dstC -= 2 * F; - if (M > 0) Save3(dst, d00, d01, d02, bias, params, dstC), dst += dD; - if (M > 1) Save3(dst, d10, d11, d12, bias, params, dstC), dst += dD; - if (M > 2) Save3(dst, d20, d21, d22, bias, params, dstC), dst += dD; - if (M > 3) Save3(dst, d30, d31, d32, bias, params, dstC), dst += dD; - } - } - else if (dstC > F) - { - if (M > 0) d00 = _mm256_setzero_ps(), d01 = _mm256_setzero_ps(); - if (M > 1) d10 = _mm256_setzero_ps(), d11 = _mm256_setzero_ps(); - if (M > 2) d20 = _mm256_setzero_ps(), d21 = _mm256_setzero_ps(); - if (M > 3) d30 = _mm256_setzero_ps(), d31 = _mm256_setzero_ps(); - for (size_t ky = 0; ky < kY; ky += dilY) - { - if (sy + ky < srcH) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - assert(sx + kx < srcW && sx + kx + M <= srcW); - size_t offs = beg + kx * dX, end = offs + srcC, offw = 0; - for (; offs < end; ++offs, offw += F) - { - w0 = _mm256_loadu_ps(weight0 + offw); - w1 = _mm256_loadu_ps(weight1 + offw); - if (M > 0) s0 = _mm256_set1_ps(src0[offs]), d00 = _mm256_fmadd_ps(s0, w0, d00), d01 = _mm256_fmadd_ps(s0, w1, d01); - if (M > 1) s0 = _mm256_set1_ps(src1[offs]), d10 = _mm256_fmadd_ps(s0, w0, d10), d11 = _mm256_fmadd_ps(s0, w1, d11); - if (M > 2) s0 = _mm256_set1_ps(src2[offs]), d20 = _mm256_fmadd_ps(s0, w0, d20), d21 = _mm256_fmadd_ps(s0, w1, d21); - if (M > 3) s0 = _mm256_set1_ps(src3[offs]), d30 = _mm256_fmadd_ps(s0, w0, d30), d31 = _mm256_fmadd_ps(s0, w1, d31); - } - weight0 += dW, weight1 += dW; - } - } - else - weight0 += dWz, weight1 += dWz; - } - if (dstC == 2 * F) - { - if (M > 0) Save2(dst, d00, d01, bias, params), dst += dD; - if (M > 1) 
Save2(dst, d10, d11, bias, params), dst += dD; - if (M > 2) Save2(dst, d20, d21, bias, params), dst += dD; - if (M > 3) Save2(dst, d30, d31, bias, params), dst += dD; - } - else - { - dstC -= 1 * F; - if (M > 0) Save2(dst, d00, d01, bias, params, dstC), dst += dD; - if (M > 1) Save2(dst, d10, d11, bias, params, dstC), dst += dD; - if (M > 2) Save2(dst, d20, d21, bias, params, dstC), dst += dD; - if (M > 3) Save2(dst, d30, d31, bias, params, dstC), dst += dD; - } - } - else - { - if (M > 0) d00 = _mm256_setzero_ps(); - if (M > 1) d10 = _mm256_setzero_ps(); - if (M > 2) d20 = _mm256_setzero_ps(); - if (M > 3) d30 = _mm256_setzero_ps(); - for (size_t ky = 0; ky < kY; ky += dilY) - { - if (sy + ky < srcH) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - assert(sx + kx < srcW && sx + kx + M <= srcW); - size_t offs = beg + kx * dX, end = offs + srcC, offw = 0; - for (; offs < end; ++offs, offw += F) - { - w0 = _mm256_loadu_ps(weight0 + offw); - if (M > 0) s0 = _mm256_set1_ps(src0[offs]), d00 = _mm256_fmadd_ps(s0, w0, d00); - if (M > 1) s0 = _mm256_set1_ps(src1[offs]), d10 = _mm256_fmadd_ps(s0, w0, d10); - if (M > 2) s0 = _mm256_set1_ps(src2[offs]), d20 = _mm256_fmadd_ps(s0, w0, d20); - if (M > 3) s0 = _mm256_set1_ps(src3[offs]), d30 = _mm256_fmadd_ps(s0, w0, d30); - } - weight0 += dW; - } - } - else - weight0 += dWz; - } - if (dstC == F) - { - if (M > 0) Save1(dst, d00, bias, params), dst += dD; - if (M > 1) Save1(dst, d10, bias, params), dst += dD; - if (M > 2) Save1(dst, d20, bias, params), dst += dD; - if (M > 3) Save1(dst, d30, bias, params), dst += dD; - } - else - { - if (M > 0) Save1(dst, d00, bias, params, dstC), dst += dD; - if (M > 1) Save1(dst, d10, bias, params, dstC), dst += dD; - if (M > 2) Save1(dst, d20, bias, params, dstC), dst += dD; - if (M > 3) Save1(dst, d30, bias, params, dstC), dst += dD; - } - } - } - - template ConvolutionNhwcDirect_NxM_Ptr GetConvolutionNhwcDirect_3xM(size_t M) - { - switch (M) - { - case 0: return NULL; - case 1: return ConvolutionNhwcDirect_3xM; - case 2: return ConvolutionNhwcDirect_3xM; - case 3: return ConvolutionNhwcDirect_3xM; - } - assert(0); - return NULL; - } - - template void ConvolutionNhwcDirect_3(const float* src, const ConvParam32f& p, const AlgParam& a, - size_t dstC, size_t yBeg, size_t yEnd, size_t srcC, const float* weight, const float* bias, const float* params, float* dst) - { - size_t noseH = p.NoseH(), noseW = p.NoseW(), bodyH = p.BodyH(), bodyW = p.BodyW(); - size_t n = 4, bodyWn = AlignLoAny(bodyW - noseW, n) + noseW, m = bodyW - bodyWn; - ConvolutionNhwcDirect_NxM_Ptr convolutionNhwcDirect_3x1 = ConvolutionNhwcDirect_3x1; - ConvolutionNhwcDirect_NxM_Ptr convolutionNhwcDirect_3xN = ConvolutionNhwcDirect_3x4; - ConvolutionNhwcDirect_NxM_Ptr convolutionNhwcDirect_3xM = GetConvolutionNhwcDirect_3xM(m); - size_t tailH = p.dstH, tailW = p.dstW; - size_t kY = p.kernelY - noseH, kX = p.kernelX - noseW, kH = bodyH + p.kernelY - 1, kW = bodyW + p.kernelX - 1; - - __m256 _params[3], _bias[3]; - _params[0] = _mm256_set1_ps(params[0]); - if (type == ::SimdConvolutionActivationRestrictRange || type == ::SimdConvolutionActivationHswish) - _params[1] = _mm256_set1_ps(params[1]); - - for (size_t dc = 0; dc < dstC; dc += a.microD) - { - size_t dC = Simd::Min(a.microD, dstC - dc); - if (dC > 0 * F) _bias[0] = _mm256_loadu_ps(bias + dc + 0 * F); - if (dC > 1 * F) _bias[1] = _mm256_loadu_ps(bias + dc + 1 * F); - if (dC > 2 * F) _bias[2] = _mm256_loadu_ps(bias + dc + 2 * F); - if (type == 
::SimdConvolutionActivationPrelu) - { - if (dC > 0 * F) _params[0] = _mm256_loadu_ps(params + dc + 0 * F); - if (dC > 1 * F) _params[1] = _mm256_loadu_ps(params + dc + 1 * F); - if (dC > 2 * F) _params[2] = _mm256_loadu_ps(params + dc + 2 * F); - } - float* d = dst + dc + yBeg * p.dstW * p.dstC; - size_t dy = yBeg; - for (; dy < noseH && dy < yEnd; dy++) - { - size_t dx = 0; - for (; dx < noseW; dx++, d += p.dstC) - convolutionNhwcDirect_3x1(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d); - for (; dx < bodyWn; dx += n, d += p.dstC * n) - convolutionNhwcDirect_3xN(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d); - for (; dx < bodyW; dx += m, d += p.dstC * m) - convolutionNhwcDirect_3xM(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d); - for (; dx < tailW; dx++, d += p.dstC) - convolutionNhwcDirect_3x1(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d); - } - for (; dy < bodyH && dy < yEnd; dy++) - { - size_t dx = 0; - for (; dx < noseW; dx++, d += p.dstC) - convolutionNhwcDirect_3x1(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d); - for (; dx < bodyWn; dx += n, d += p.dstC * n) - convolutionNhwcDirect_3xN(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d); - for (; dx < bodyW; dx += m, d += p.dstC * m) - convolutionNhwcDirect_3xM(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d); - for (; dx < tailW; dx++, d += p.dstC) - convolutionNhwcDirect_3x1(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d); - } - for (; dy < tailH && dy < yEnd; dy++) - { - size_t dx = 0; - for (; dx < noseW; dx++, d += p.dstC) - convolutionNhwcDirect_3x1(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d); - for (; dx < bodyWn; dx += n, d += p.dstC * n) - convolutionNhwcDirect_3xN(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d); - for (; dx < bodyW; dx += m, d += p.dstC * m) - convolutionNhwcDirect_3xM(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d); - for (; dx < tailW; dx++, d += p.dstC) - convolutionNhwcDirect_3x1(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d); - } - weight += p.kernelY * p.kernelX * p.srcC * a.microD; - } - } - - //--------------------------------------------------------------------- - - template void ConvolutionNhwcDirect1x1_3x4(const float* src0, const ConvParam32f& p, - const AlgParam& a, size_t srcC, size_t dstC, const float* weight0, const __m256* bias, const __m256* params, float* dst) - { - __m256 d00, d01, d02, d10, d11, d12, d20, d21, d22, d30, d31, d32, s0, w0, w1, w2; - size_t dS = p.srcC, dD = p.dstC; - const float* weight1 = weight0 + a.stepW; - const float* weight2 = weight1 + a.stepW; - const float* src1 = src0 + 1 * dS; - const float* src2 = src0 + 2 * dS; - const float* src3 = src0 + 3 * dS; - if (dstC > 2 * F) - { - d00 = _mm256_setzero_ps(), d01 = _mm256_setzero_ps(), d02 = _mm256_setzero_ps(); - d10 = _mm256_setzero_ps(), d11 = _mm256_setzero_ps(), d12 = _mm256_setzero_ps(); - d20 = _mm256_setzero_ps(), d21 = _mm256_setzero_ps(), d22 = _mm256_setzero_ps(); - d30 = _mm256_setzero_ps(), d31 = _mm256_setzero_ps(), d32 = _mm256_setzero_ps(); - for (size_t offs = 0, offw = 0; offs < srcC; ++offs, offw += F) - { - w0 = _mm256_loadu_ps(weight0 + offw); - w1 = _mm256_loadu_ps(weight1 + offw); - w2 = _mm256_loadu_ps(weight2 + offw); - s0 = _mm256_set1_ps(src0[offs]), d00 = _mm256_fmadd_ps(s0, w0, d00), d01 = _mm256_fmadd_ps(s0, w1, d01), d02 = _mm256_fmadd_ps(s0, w2, d02); - s0 = _mm256_set1_ps(src1[offs]), d10 = _mm256_fmadd_ps(s0, w0, d10), d11 = _mm256_fmadd_ps(s0, w1, d11), d12 = _mm256_fmadd_ps(s0, 
w2, d12); - s0 = _mm256_set1_ps(src2[offs]), d20 = _mm256_fmadd_ps(s0, w0, d20), d21 = _mm256_fmadd_ps(s0, w1, d21), d22 = _mm256_fmadd_ps(s0, w2, d22); - s0 = _mm256_set1_ps(src3[offs]), d30 = _mm256_fmadd_ps(s0, w0, d30), d31 = _mm256_fmadd_ps(s0, w1, d31), d32 = _mm256_fmadd_ps(s0, w2, d32); - } - if (dstC == 3 * F) - { - Save3(dst, d00, d01, d02, bias, params), dst += dD; - Save3(dst, d10, d11, d12, bias, params), dst += dD; - Save3(dst, d20, d21, d22, bias, params), dst += dD; - Save3(dst, d30, d31, d32, bias, params), dst += dD; - } - else - { - dstC -= 2 * F; - Save3(dst, d00, d01, d02, bias, params, dstC), dst += dD; - Save3(dst, d10, d11, d12, bias, params, dstC), dst += dD; - Save3(dst, d20, d21, d22, bias, params, dstC), dst += dD; - Save3(dst, d30, d31, d32, bias, params, dstC), dst += dD; - } - } - else if (dstC > F) - { - d00 = _mm256_setzero_ps(), d01 = _mm256_setzero_ps(); - d10 = _mm256_setzero_ps(), d11 = _mm256_setzero_ps(); - d20 = _mm256_setzero_ps(), d21 = _mm256_setzero_ps(); - d30 = _mm256_setzero_ps(), d31 = _mm256_setzero_ps(); - for (size_t offs = 0, offw = 0; offs < srcC; ++offs, offw += F) - { - w0 = _mm256_loadu_ps(weight0 + offw); - w1 = _mm256_loadu_ps(weight1 + offw); - s0 = _mm256_set1_ps(src0[offs]), d00 = _mm256_fmadd_ps(s0, w0, d00), d01 = _mm256_fmadd_ps(s0, w1, d01); - s0 = _mm256_set1_ps(src1[offs]), d10 = _mm256_fmadd_ps(s0, w0, d10), d11 = _mm256_fmadd_ps(s0, w1, d11); - s0 = _mm256_set1_ps(src2[offs]), d20 = _mm256_fmadd_ps(s0, w0, d20), d21 = _mm256_fmadd_ps(s0, w1, d21); - s0 = _mm256_set1_ps(src3[offs]), d30 = _mm256_fmadd_ps(s0, w0, d30), d31 = _mm256_fmadd_ps(s0, w1, d31); - } - if (dstC == 2 * F) - { - Save2(dst, d00, d01, bias, params), dst += dD; - Save2(dst, d10, d11, bias, params), dst += dD; - Save2(dst, d20, d21, bias, params), dst += dD; - Save2(dst, d30, d31, bias, params), dst += dD; - } - else - { - dstC -= 1 * F; - Save2(dst, d00, d01, bias, params, dstC), dst += dD; - Save2(dst, d10, d11, bias, params, dstC), dst += dD; - Save2(dst, d20, d21, bias, params, dstC), dst += dD; - Save2(dst, d30, d31, bias, params, dstC), dst += dD; - } - } - else - { - d00 = _mm256_setzero_ps(); - d10 = _mm256_setzero_ps(); - d20 = _mm256_setzero_ps(); - d30 = _mm256_setzero_ps(); - for (size_t offs = 0, offw = 0; offs < srcC; ++offs, offw += F) - { - w0 = _mm256_loadu_ps(weight0 + offw); - s0 = _mm256_set1_ps(src0[offs]), d00 = _mm256_fmadd_ps(s0, w0, d00); - s0 = _mm256_set1_ps(src1[offs]), d10 = _mm256_fmadd_ps(s0, w0, d10); - s0 = _mm256_set1_ps(src2[offs]), d20 = _mm256_fmadd_ps(s0, w0, d20); - s0 = _mm256_set1_ps(src3[offs]), d30 = _mm256_fmadd_ps(s0, w0, d30); - } - if (dstC == F) - { - Save1(dst, d00, bias, params), dst += dD; - Save1(dst, d10, bias, params), dst += dD; - Save1(dst, d20, bias, params), dst += dD; - Save1(dst, d30, bias, params), dst += dD; - } - else - { - Save1(dst, d00, bias, params, dstC), dst += dD; - Save1(dst, d10, bias, params, dstC), dst += dD; - Save1(dst, d20, bias, params, dstC), dst += dD; - Save1(dst, d30, bias, params, dstC), dst += dD; - } - } - } - - template void ConvolutionNhwcDirect1x1_3xM(const float* src0, const ConvParam32f& p, - const AlgParam& a, size_t srcC, size_t dstC, const float* weight0, const __m256* bias, const __m256* params, float* dst) - { - __m256 d00, d01, d02, d10, d11, d12, d20, d21, d22, d30, d31, d32, s0, w0, w1, w2; - size_t dS = p.srcC, dD = p.dstC; - const float* weight1 = weight0 + a.stepW; - const float* weight2 = weight1 + a.stepW; - const float* src1 = src0 + 1 * dS; - const 
float* src2 = src0 + 2 * dS; - const float* src3 = src0 + 3 * dS; - if (dstC > 2 * F) - { - if (M > 0) d00 = _mm256_setzero_ps(), d01 = _mm256_setzero_ps(), d02 = _mm256_setzero_ps(); - if (M > 1) d10 = _mm256_setzero_ps(), d11 = _mm256_setzero_ps(), d12 = _mm256_setzero_ps(); - if (M > 2) d20 = _mm256_setzero_ps(), d21 = _mm256_setzero_ps(), d22 = _mm256_setzero_ps(); - if (M > 3) d30 = _mm256_setzero_ps(), d31 = _mm256_setzero_ps(), d32 = _mm256_setzero_ps(); - for (size_t offs = 0, offw = 0; offs < srcC; ++offs, offw += F) - { - w0 = _mm256_loadu_ps(weight0 + offw); - w1 = _mm256_loadu_ps(weight1 + offw); - w2 = _mm256_loadu_ps(weight2 + offw); - if (M > 0) s0 = _mm256_set1_ps(src0[offs]), d00 = _mm256_fmadd_ps(s0, w0, d00), d01 = _mm256_fmadd_ps(s0, w1, d01), d02 = _mm256_fmadd_ps(s0, w2, d02); - if (M > 1) s0 = _mm256_set1_ps(src1[offs]), d10 = _mm256_fmadd_ps(s0, w0, d10), d11 = _mm256_fmadd_ps(s0, w1, d11), d12 = _mm256_fmadd_ps(s0, w2, d12); - if (M > 2) s0 = _mm256_set1_ps(src2[offs]), d20 = _mm256_fmadd_ps(s0, w0, d20), d21 = _mm256_fmadd_ps(s0, w1, d21), d22 = _mm256_fmadd_ps(s0, w2, d22); - if (M > 3) s0 = _mm256_set1_ps(src3[offs]), d30 = _mm256_fmadd_ps(s0, w0, d30), d31 = _mm256_fmadd_ps(s0, w1, d31), d32 = _mm256_fmadd_ps(s0, w2, d32); - } - if (dstC == 3 * F) - { - if (M > 0) Save3(dst, d00, d01, d02, bias, params), dst += dD; - if (M > 1) Save3(dst, d10, d11, d12, bias, params), dst += dD; - if (M > 2) Save3(dst, d20, d21, d22, bias, params), dst += dD; - if (M > 3) Save3(dst, d30, d31, d32, bias, params), dst += dD; - } - else - { - dstC -= 2 * F; - if (M > 0) Save3(dst, d00, d01, d02, bias, params, dstC), dst += dD; - if (M > 1) Save3(dst, d10, d11, d12, bias, params, dstC), dst += dD; - if (M > 2) Save3(dst, d20, d21, d22, bias, params, dstC), dst += dD; - if (M > 3) Save3(dst, d30, d31, d32, bias, params, dstC), dst += dD; - } - } - else if (dstC > F) - { - if (M > 0) d00 = _mm256_setzero_ps(), d01 = _mm256_setzero_ps(); - if (M > 1) d10 = _mm256_setzero_ps(), d11 = _mm256_setzero_ps(); - if (M > 2) d20 = _mm256_setzero_ps(), d21 = _mm256_setzero_ps(); - if (M > 3) d30 = _mm256_setzero_ps(), d31 = _mm256_setzero_ps(); - for (size_t offs = 0, offw = 0; offs < srcC; ++offs, offw += F) - { - w0 = _mm256_loadu_ps(weight0 + offw); - w1 = _mm256_loadu_ps(weight1 + offw); - if (M > 0) s0 = _mm256_set1_ps(src0[offs]), d00 = _mm256_fmadd_ps(s0, w0, d00), d01 = _mm256_fmadd_ps(s0, w1, d01); - if (M > 1) s0 = _mm256_set1_ps(src1[offs]), d10 = _mm256_fmadd_ps(s0, w0, d10), d11 = _mm256_fmadd_ps(s0, w1, d11); - if (M > 2) s0 = _mm256_set1_ps(src2[offs]), d20 = _mm256_fmadd_ps(s0, w0, d20), d21 = _mm256_fmadd_ps(s0, w1, d21); - if (M > 3) s0 = _mm256_set1_ps(src3[offs]), d30 = _mm256_fmadd_ps(s0, w0, d30), d31 = _mm256_fmadd_ps(s0, w1, d31); - } - if (dstC == DF) - { - if (M > 0) Save2(dst, d00, d01, bias, params), dst += dD; - if (M > 1) Save2(dst, d10, d11, bias, params), dst += dD; - if (M > 2) Save2(dst, d20, d21, bias, params), dst += dD; - if (M > 3) Save2(dst, d30, d31, bias, params), dst += dD; - } - else - { - dstC -= F; - if (M > 0) Save2(dst, d00, d01, bias, params, dstC), dst += dD; - if (M > 1) Save2(dst, d10, d11, bias, params, dstC), dst += dD; - if (M > 2) Save2(dst, d20, d21, bias, params, dstC), dst += dD; - if (M > 3) Save2(dst, d30, d31, bias, params, dstC), dst += dD; - } - } - else - { - if (M > 0) d00 = _mm256_setzero_ps(); - if (M > 1) d10 = _mm256_setzero_ps(); - if (M > 2) d20 = _mm256_setzero_ps(); - if (M > 3) d30 = _mm256_setzero_ps(); - for (size_t 
offs = 0, offw = 0; offs < srcC; ++offs, offw += F) - { - w0 = _mm256_loadu_ps(weight0 + offw); - if (M > 0) s0 = _mm256_set1_ps(src0[offs]), d00 = _mm256_fmadd_ps(s0, w0, d00); - if (M > 1) s0 = _mm256_set1_ps(src1[offs]), d10 = _mm256_fmadd_ps(s0, w0, d10); - if (M > 2) s0 = _mm256_set1_ps(src2[offs]), d20 = _mm256_fmadd_ps(s0, w0, d20); - if (M > 3) s0 = _mm256_set1_ps(src3[offs]), d30 = _mm256_fmadd_ps(s0, w0, d30); - } - if (dstC == F) - { - if (M > 0) Save1(dst, d00, bias, params), dst += dD; - if (M > 1) Save1(dst, d10, bias, params), dst += dD; - if (M > 2) Save1(dst, d20, bias, params), dst += dD; - if (M > 3) Save1(dst, d30, bias, params), dst += dD; - } - else - { - if (M > 0) Save1(dst, d00, bias, params, dstC), dst += dD; - if (M > 1) Save1(dst, d10, bias, params, dstC), dst += dD; - if (M > 2) Save1(dst, d20, bias, params, dstC), dst += dD; - if (M > 3) Save1(dst, d30, bias, params, dstC), dst += dD; - } - } - } - - template ConvolutionNhwcDirect1x1_NxM_Ptr GetConvolutionNhwcDirect1x1_3xM(size_t M) - { - switch (M) - { - case 0: return NULL; - case 1: return ConvolutionNhwcDirect1x1_3xM; - case 2: return ConvolutionNhwcDirect1x1_3xM; - case 3: return ConvolutionNhwcDirect1x1_3xM; - } - assert(0); - return NULL; - } - - template void ConvolutionNhwcDirect1x1_3(const float* src, const ConvParam32f& p, const AlgParam& a, - size_t dstC, size_t yBeg, size_t yEnd, size_t srcC, const float* weight, const float* bias, const float* params, float* dst) - { - size_t n = 4, n1 = (yEnd - yBeg) * p.dstW, nn = AlignLoAny(n1, n), m = n1 - nn; - ConvolutionNhwcDirect1x1_NxM_Ptr convolutionNhwcDirect1x1_3xN = ConvolutionNhwcDirect1x1_3x4; - ConvolutionNhwcDirect1x1_NxM_Ptr convolutionNhwcDirect1x1_3xM = GetConvolutionNhwcDirect1x1_3xM(m); - - __m256 _params[3], _bias[3]; - _params[0] = _mm256_set1_ps(params[0]); - if (type == ::SimdConvolutionActivationRestrictRange || type == ::SimdConvolutionActivationHswish) - _params[1] = _mm256_set1_ps(params[1]); - - for (size_t dc = 0; dc < dstC; dc += a.microD) - { - size_t dC = Simd::Min(a.microD, dstC - dc); - if (dC > 0 * F) _bias[0] = _mm256_loadu_ps(bias + dc + 0 * F); - if (dC > 1 * F) _bias[1] = _mm256_loadu_ps(bias + dc + 1 * F); - if (dC > 2 * F) _bias[2] = _mm256_loadu_ps(bias + dc + 2 * F); - if (type == ::SimdConvolutionActivationPrelu) - { - if (dC > 0 * F) _params[0] = _mm256_loadu_ps(params + dc + 0 * F); - if (dC > 1 * F) _params[1] = _mm256_loadu_ps(params + dc + 1 * F); - if (dC > 2 * F) _params[2] = _mm256_loadu_ps(params + dc + 2 * F); - } - const float* ps = src + yBeg * p.srcW * p.srcC; - float* pd = dst + dc + yBeg * p.dstW * p.dstC; - size_t i = 0; - for (; i < nn; i += n, ps += n * p.srcC, pd += n * p.dstC) - convolutionNhwcDirect1x1_3xN(ps, p, a, srcC, dC, weight, _bias, _params, pd); - for (; i < n1; i += m, ps += m * p.srcC, pd += m * p.dstC) - convolutionNhwcDirect1x1_3xM(ps, p, a, srcC, dC, weight, _bias, _params, pd); - weight += p.srcC * a.microD; - } - } - - //--------------------------------------------------------------------- - - template static SIMD_INLINE void Set(const ConvParam32f& p, AlgParam& a) - { - a.convolutions[term] = p.Is1x1() ? 
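// p.Is1x1() selects the flat 1x1 fast path (no spatial window, so output
// pixels can be traversed as one dense array) over the generic direct
// kernel; both sides share the same term/activation specialization machinery.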
ConvolutionNhwcDirect1x1_3 : ConvolutionNhwcDirect_3;
-        }
-
-        template static SIMD_INLINE void Set(const ConvParam32f& p, AlgParam& a)
-        {
-            Set(p, a);
-            Set(p, a);
-            Set(p, a);
-            Set(p, a);
-        }
-
-        bool SynetConvolution32fNhwcDirect::Set3r(const ConvParam32f& p, AlgParam& a)
-        {
-            assert(a.microD == 3 * F);
-            switch (p.activation)
-            {
-            case SimdConvolutionActivationIdentity: Set(p, a); break;
-            case SimdConvolutionActivationRelu: Set(p, a); break;
-            case SimdConvolutionActivationLeakyRelu: Set(p, a); break;
-            case SimdConvolutionActivationRestrictRange: Set(p, a); break;
-            case SimdConvolutionActivationPrelu: Set(p, a); break;
-            case SimdConvolutionActivationElu: Set(p, a); break;
-            case SimdConvolutionActivationHswish: Set(p, a); break;
-            default: assert(0);
-            }
-            return true;
-        }
-    }
-#endif//SIMD_AVX2_ENABLE
-}
diff --git a/src/3rd/Simd/Simd/SimdAvx2SynetConvolution8i.cpp b/src/3rd/Simd/Simd/SimdAvx2SynetConvolution8i.cpp
deleted file mode 100644
index c9d4fa82..00000000
--- a/src/3rd/Simd/Simd/SimdAvx2SynetConvolution8i.cpp
+++ /dev/null
@@ -1,905 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2020 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/ -#include "Simd/SimdSynetConvolution8i.h" -#include "Simd/SimdSynetConvolution8iCommon.h" -#include "Simd/SimdSynet.h" -#include "Simd/SimdMath.h" -#include "Simd/SimdBase.h" -#include "Simd/SimdAvx2.h" -#include "Simd/SimdCpu.h" - -namespace Simd -{ -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - using AlgParam = SynetConvolution8iNhwcDirect::AlgParam; - using ConvolutionPtr = SynetConvolution8iNhwcDirect::ConvolutionPtr; - using Term8iType = Base::SynetConvolution8iNhwcDirect::Term8iType; - - SIMD_INLINE __m256i Set4(const uint8_t* src) - { - return _mm256_set1_epi32(*(int32_t*)src); - } - - template void Madd4(__m256i& i32, __m256i u8, __m256i i8); - - template<> SIMD_INLINE void Madd4(__m256i& i32, __m256i u8, __m256i i8) - { - i32 = _mm256_add_epi32(i32, _mm256_madd_epi16(_mm256_maddubs_epi16(u8, i8), Avx2::K16_0001)); - } - - template<> SIMD_INLINE void Madd4(__m256i& i32, __m256i u8, __m256i i8) - { - __m256i lo = _mm256_madd_epi16(Cvt8uTo16i<0>(u8), Cvt8iTo16i<0>(i8)); - __m256i hi = _mm256_madd_epi16(Cvt8uTo16i<1>(u8), Cvt8iTo16i<1>(i8)); - i32 = _mm256_add_epi32(i32, PermutedHadd32i(lo, hi)); - } - - template void ConvolutionNhwcDirect_2x1(const uint8_t * src0, - const ConvParam8i& p, const AlgParam & a, size_t dy, size_t dx, size_t srcC, size_t dstC, const int8_t * weight0, - const __m256i * bias, const __m256i * params, const __m256 * scale, const __m256* shift, int32_t * buf, uint8_t* dst) - { - __m256i d00, d01, s0, w0, w1; - size_t dW = (DivHi(p.srcC, 4) - DivHi(srcC, 4)) * A, dY = p.srcW * p.srcC, dX = p.srcC, dS = p.srcC * p.strideX, dWz = DivHi(srcC, 4) * A; - const int8_t* weight1 = weight0 + p.kernelY * p.kernelX * DivHi(p.srcC, 4) * A; - __m256i norm = _mm256_set1_epi32(a.norm); - size_t sy = dy * p.strideY - p.padY; - size_t sx = dx * p.strideX - p.padX; - size_t kY = p.kernelY * p.dilationY; - size_t kX = p.kernelX * p.dilationX; - if (dstC > F) - { - d00 = _mm256_setzero_si256(), d01 = _mm256_setzero_si256(); - for (size_t ky = 0; ky < kY; ky += p.dilationY) - { - for (size_t kx = 0; kx < kX; kx += p.dilationX) - { - if (sy + ky < p.srcH && sx + kx < p.srcW) - { - size_t offs = (sy + ky) * dY + (sx + kx) * dX, end = offs + srcC; - for (; offs < end; offs += 4) - { - w0 = _mm256_loadu_si256((__m256i*)weight0); - w1 = _mm256_loadu_si256((__m256i*)weight1); - s0 = Set4(src0 + offs); - Madd4(d00, s0, w0); - Madd4(d01, s0, w1); - weight0 += A, weight1 += A; - } - } - else - { - if (a.zero) - { - s0 = _mm256_set1_epi32(a.zero); - for (size_t offs = 0, end = srcC; offs < end; offs += 4) - { - w0 = _mm256_loadu_si256((__m256i*)weight0); - w1 = _mm256_loadu_si256((__m256i*)weight1); - Madd4(d00, s0, w0); - Madd4(d01, s0, w1); - weight0 += A, weight1 += A; - } - } - else - weight0 += dWz, weight1 += dWz; - } - weight0 += dW, weight1 += dW; - } - } - if (dstC == DF) - Save2(dst, buf, d00, d01, norm, bias, params, scale, shift); - else - Save2(dst, buf, d00, d01, norm, bias, params, scale, shift, dstC - F); - } - else - { - d00 = _mm256_setzero_si256(); - for (size_t ky = 0; ky < kY; ky += p.dilationY) - { - for (size_t kx = 0; kx < kX; kx += p.dilationX) - { - if (sy + ky < p.srcH && sx + kx < p.srcW) - { - size_t offs = (sy + ky) * dY + (sx + kx) * dX, end = offs + srcC; - for (; offs < end; offs += 4) - { - w0 = _mm256_loadu_si256((__m256i*)weight0); - s0 = Set4(src0 + offs); - Madd4(d00, s0, w0); - weight0 += A; - } - } - else - { - if (a.zero) - { - s0 = _mm256_set1_epi32(a.zero); - for (size_t offs = 0, end = srcC; offs < end; offs += 4) - { - w0 = 
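// Padding in the quantized domain: when a tap falls outside the image the
// kernel cannot simply skip it, because the uint8 zero point (a.zero) is in
// general non-zero; it therefore feeds the broadcast zero point through the
// same multiply-accumulate. Only when a.zero == 0 can the weights be skipped
// outright (weight0 += dWz).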
_mm256_loadu_si256((__m256i*)weight0); - Madd4(d00, s0, w0); - weight0 += A; - } - } - else - weight0 += dWz; - } - weight0 += dW; - } - } - if (dstC == F) - Save1(dst, buf, d00, norm, bias, params, scale, shift); - else - Save1(dst, buf, d00, norm, bias, params, scale, shift, dstC); - } - } - - template void ConvolutionNhwcDirect_2x5(const uint8_t* src0, - const ConvParam8i& p, const AlgParam& a, size_t dy, size_t dx, size_t srcC, size_t dstC, const int8_t* weight0, - const __m256i* bias, const __m256i* params, const __m256* scale, const __m256* shift, int32_t* buf, uint8_t* dst) - { - __m256i d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, s0, w0, w1; - size_t dW = (DivHi(p.srcC, 4) - DivHi(srcC, 4)) * A, dY = p.srcW * p.srcC, dX = p.srcC, dS = p.srcC * p.strideX, dD = p.dstC * a.size, dB = p.dstC, dWz = (DivHi(srcC, 4) * A + dW) * p.kernelX; - const int8_t * weight1 = weight0 + p.kernelY * p.kernelX * DivHi(p.srcC, 4) * A; - const uint8_t* src1 = src0 + 1 * dS; - const uint8_t* src2 = src0 + 2 * dS; - const uint8_t* src3 = src0 + 3 * dS; - const uint8_t* src4 = src0 + 4 * dS; - __m256i norm = _mm256_set1_epi32(a.norm); - size_t sy = dy * p.strideY - p.padY; - size_t sx = dx * p.strideX - p.padX; - size_t kY = p.kernelY * p.dilationY; - size_t kX = p.kernelX * p.dilationX; - if (dstC > F) - { - d00 = _mm256_setzero_si256(), d01 = _mm256_setzero_si256(); - d10 = _mm256_setzero_si256(), d11 = _mm256_setzero_si256(); - d20 = _mm256_setzero_si256(), d21 = _mm256_setzero_si256(); - d30 = _mm256_setzero_si256(), d31 = _mm256_setzero_si256(); - d40 = _mm256_setzero_si256(), d41 = _mm256_setzero_si256(); - for (size_t ky = 0; ky < kY; ky += p.dilationY) - { - if (sy + ky < p.srcH) - { - for (size_t kx = 0; kx < kX; kx += p.dilationX) - { - assert(sx + kx < p.srcW && sx + kx + 5 <= p.srcW); - size_t offs = (sy + ky) * dY + (sx + kx) * dX, end = offs + srcC; - for (; offs < end; offs += 4) - { - w0 = _mm256_loadu_si256((__m256i*)weight0); - w1 = _mm256_loadu_si256((__m256i*)weight1); - s0 = Set4(src0 + offs); - Madd4(d00, s0, w0); - Madd4(d01, s0, w1); - s0 = Set4(src1 + offs); - Madd4(d10, s0, w0); - Madd4(d11, s0, w1); - s0 = Set4(src2 + offs); - Madd4(d20, s0, w0); - Madd4(d21, s0, w1); - s0 = Set4(src3 + offs); - Madd4(d30, s0, w0); - Madd4(d31, s0, w1); - s0 = Set4(src4 + offs); - Madd4(d40, s0, w0); - Madd4(d41, s0, w1); - weight0 += A, weight1 += A; - } - weight0 += dW, weight1 += dW; - } - } - else if (a.zero) - { - s0 = _mm256_set1_epi32(a.zero); - for (size_t kx = 0; kx < kX; kx += p.dilationX) - { - for (size_t offs = 0, end = srcC; offs < end; offs += 4) - { - w0 = _mm256_loadu_si256((__m256i*)weight0); - w1 = _mm256_loadu_si256((__m256i*)weight1); - Madd4(d00, s0, w0); - Madd4(d01, s0, w1); - Madd4(d10, s0, w0); - Madd4(d11, s0, w1); - Madd4(d20, s0, w0); - Madd4(d21, s0, w1); - Madd4(d30, s0, w0); - Madd4(d31, s0, w1); - Madd4(d40, s0, w0); - Madd4(d41, s0, w1); - weight0 += A, weight1 += A; - } - weight0 += dW, weight1 += dW; - } - } - else - weight0 += dWz, weight1 += dWz; - } - if (dstC == DF) - { - Save2(dst, buf, d00, d01, norm, bias, params, scale, shift); - dst += dD, buf += dB; - Save2(dst, buf, d10, d11, norm, bias, params, scale, shift); - dst += dD, buf += dB; - Save2(dst, buf, d20, d21, norm, bias, params, scale, shift); - dst += dD, buf += dB; - Save2(dst, buf, d30, d31, norm, bias, params, scale, shift); - dst += dD, buf += dB; - Save2(dst, buf, d40, d41, norm, bias, params, scale, shift); - dst += dD, buf += dB; - } - else - { - Save2(dst, buf, d00, d01, norm, bias, 
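// Save1/Save2 are specialized on Term8iType: an interim term appears to
// accumulate the raw int32 sums into buf for a later pass over the remaining
// input channels, while a final term requantizes into the uint8 dst; that is
// why every kernel threads both buf and dst through.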
params, scale, shift, dstC - F); - dst += dD, buf += dB; - Save2(dst, buf, d10, d11, norm, bias, params, scale, shift, dstC - F); - dst += dD, buf += dB; - Save2(dst, buf, d20, d21, norm, bias, params, scale, shift, dstC - F); - dst += dD, buf += dB; - Save2(dst, buf, d30, d31, norm, bias, params, scale, shift, dstC - F); - dst += dD, buf += dB; - Save2(dst, buf, d40, d41, norm, bias, params, scale, shift, dstC - F); - dst += dD, buf += dB; - } - } - else - { - d00 = _mm256_setzero_si256(); - d10 = _mm256_setzero_si256(); - d20 = _mm256_setzero_si256(); - d30 = _mm256_setzero_si256(); - d40 = _mm256_setzero_si256(); - for (size_t ky = 0; ky < kY; ky += p.dilationY) - { - if (sy + ky < p.srcH) - { - for (size_t kx = 0; kx < kX; kx += p.dilationX) - { - assert(sx + kx < p.srcW && sx + kx + 5 <= p.srcW); - size_t offs = (sy + ky) * dY + (sx + kx) * dX, end = offs + srcC; - for (; offs < end; offs += 4) - { - w0 = _mm256_loadu_si256((__m256i*)weight0); - s0 = Set4(src0 + offs); - Madd4(d00, s0, w0); - s0 = Set4(src1 + offs); - Madd4(d10, s0, w0); - s0 = Set4(src2 + offs); - Madd4(d20, s0, w0); - s0 = Set4(src3 + offs); - Madd4(d30, s0, w0); - s0 = Set4(src4 + offs); - Madd4(d40, s0, w0); - weight0 += A; - } - weight0 += dW; - } - } - else if (a.zero) - { - s0 = _mm256_set1_epi32(a.zero); - for (size_t kx = 0; kx < kX; kx += p.dilationX) - { - for (size_t offs = 0, end = srcC; offs < end; offs += 4) - { - w0 = _mm256_loadu_si256((__m256i*)weight0); - Madd4(d00, s0, w0); - Madd4(d10, s0, w0); - Madd4(d20, s0, w0); - Madd4(d30, s0, w0); - Madd4(d40, s0, w0); - weight0 += A; - } - weight0 += dW; - } - } - else - weight0 += dWz; - } - if (dstC == F) - { - Save1(dst, buf, d00, norm, bias, params, scale, shift); - dst += dD, buf += dB; - Save1(dst, buf, d10, norm, bias, params, scale, shift); - dst += dD, buf += dB; - Save1(dst, buf, d20, norm, bias, params, scale, shift); - dst += dD, buf += dB; - Save1(dst, buf, d30, norm, bias, params, scale, shift); - dst += dD, buf += dB; - Save1(dst, buf, d40, norm, bias, params, scale, shift); - dst += dD, buf += dB; - } - else - { - Save1(dst, buf, d00, norm, bias, params, scale, shift, dstC); - dst += dD, buf += dB; - Save1(dst, buf, d10, norm, bias, params, scale, shift, dstC); - dst += dD, buf += dB; - Save1(dst, buf, d20, norm, bias, params, scale, shift, dstC); - dst += dD, buf += dB; - Save1(dst, buf, d30, norm, bias, params, scale, shift, dstC); - dst += dD, buf += dB; - Save1(dst, buf, d40, norm, bias, params, scale, shift, dstC); - dst += dD, buf += dB; - } - } - } - - template void ConvolutionNhwcDirect_2xM(const uint8_t* src0, - const ConvParam8i& p, const AlgParam& a, size_t dy, size_t dx, size_t srcC, size_t dstC, const int8_t* weight0, - const __m256i* bias, const __m256i* params, const __m256* scale, const __m256* shift, int32_t* buf, uint8_t* dst) - { - __m256i d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, s0, w0, w1; - size_t dW = (DivHi(p.srcC, 4) - DivHi(srcC, 4)) * A, dY = p.srcW * p.srcC, dX = p.srcC, dS = p.srcC * p.strideX, dD = p.dstC * a.size, dB = p.dstC, dWz = (DivHi(srcC, 4) * A + dW) * p.kernelX; - const int8_t* weight1 = weight0 + p.kernelY * p.kernelX * DivHi(p.srcC, 4) * A; - const uint8_t* src1 = src0 + 1 * dS; - const uint8_t* src2 = src0 + 2 * dS; - const uint8_t* src3 = src0 + 3 * dS; - const uint8_t* src4 = src0 + 4 * dS; - __m256i norm = _mm256_set1_epi32(a.norm); - size_t sy = dy * p.strideY - p.padY; - size_t sx = dx * p.strideX - p.padX; - size_t kY = p.kernelY * p.dilationY; - size_t kX = p.kernelX * p.dilationX; 
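// M is a compile-time row count (tails of 1..4 are dispatched by
// GetConvolutionNhwcDirect_2xM below), so every `if (M > k)` guard collapses
// at instantiation and unused rows cost nothing. A minimal sketch of the
// pattern, with hypothetical names:
//
//     template<int M> void RowKernel(/* ... */)
//     {
//         __m256i acc0, acc1;                        // up to M row accumulators
//         if (M > 0) acc0 = _mm256_setzero_si256();
//         if (M > 1) acc1 = _mm256_setzero_si256();  // dead code when M == 1
//         // ... reduction and stores guarded the same way ...
//     }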
- if (dstC > F) - { - if (M > 0) d00 = _mm256_setzero_si256(), d01 = _mm256_setzero_si256(); - if (M > 1) d10 = _mm256_setzero_si256(), d11 = _mm256_setzero_si256(); - if (M > 2) d20 = _mm256_setzero_si256(), d21 = _mm256_setzero_si256(); - if (M > 3) d30 = _mm256_setzero_si256(), d31 = _mm256_setzero_si256(); - if (M > 4) d40 = _mm256_setzero_si256(), d41 = _mm256_setzero_si256(); - for (size_t ky = 0; ky < kY; ky += p.dilationY) - { - if (sy + ky < p.srcH) - { - for (size_t kx = 0; kx < kX; kx += p.dilationX) - { - assert(sx + kx < p.srcW && sx + kx + M <= p.srcW); - size_t offs = (sy + ky) * dY + (sx + kx) * dX, end = offs + srcC; - for (; offs < end; offs += 4) - { - w0 = _mm256_loadu_si256((__m256i*)weight0); - w1 = _mm256_loadu_si256((__m256i*)weight1); - if (M > 0) s0 = Set4(src0 + offs), Madd4(d00, s0, w0), Madd4(d01, s0, w1); - if (M > 1) s0 = Set4(src1 + offs), Madd4(d10, s0, w0), Madd4(d11, s0, w1); - if (M > 2) s0 = Set4(src2 + offs), Madd4(d20, s0, w0), Madd4(d21, s0, w1); - if (M > 3) s0 = Set4(src3 + offs), Madd4(d30, s0, w0), Madd4(d31, s0, w1); - if (M > 4) s0 = Set4(src4 + offs), Madd4(d40, s0, w0), Madd4(d41, s0, w1); - weight0 += A, weight1 += A; - } - weight0 += dW, weight1 += dW; - } - } - else if (a.zero) - { - s0 = _mm256_set1_epi32(a.zero); - for (size_t kx = 0; kx < kX; kx += p.dilationX) - { - for (size_t offs = 0, end = srcC; offs < end; offs += 4) - { - w0 = _mm256_loadu_si256((__m256i*)weight0); - w1 = _mm256_loadu_si256((__m256i*)weight1); - if (M > 0) Madd4(d00, s0, w0), Madd4(d01, s0, w1); - if (M > 1) Madd4(d10, s0, w0), Madd4(d11, s0, w1); - if (M > 2) Madd4(d20, s0, w0), Madd4(d21, s0, w1); - if (M > 3) Madd4(d30, s0, w0), Madd4(d31, s0, w1); - if (M > 4) Madd4(d40, s0, w0), Madd4(d41, s0, w1); - weight0 += A, weight1 += A; - } - weight0 += dW, weight1 += dW; - } - } - else - weight0 += dWz, weight1 += dWz; - } - if (dstC == DF) - { - if (M > 0) Save2(dst, buf, d00, d01, norm, bias, params, scale, shift), dst += dD, buf += dB; - if (M > 1) Save2(dst, buf, d10, d11, norm, bias, params, scale, shift), dst += dD, buf += dB; - if (M > 2) Save2(dst, buf, d20, d21, norm, bias, params, scale, shift), dst += dD, buf += dB; - if (M > 3) Save2(dst, buf, d30, d31, norm, bias, params, scale, shift), dst += dD, buf += dB; - if (M > 4) Save2(dst, buf, d40, d41, norm, bias, params, scale, shift), dst += dD, buf += dB; - } - else - { - if (M > 0) Save2(dst, buf, d00, d01, norm, bias, params, scale, shift, dstC - F), dst += dD, buf += dB; - if (M > 1) Save2(dst, buf, d10, d11, norm, bias, params, scale, shift, dstC - F), dst += dD, buf += dB; - if (M > 2) Save2(dst, buf, d20, d21, norm, bias, params, scale, shift, dstC - F), dst += dD, buf += dB; - if (M > 3) Save2(dst, buf, d30, d31, norm, bias, params, scale, shift, dstC - F), dst += dD, buf += dB; - if (M > 4) Save2(dst, buf, d40, d41, norm, bias, params, scale, shift, dstC - F), dst += dD, buf += dB; - } - } - else - { - if (M > 0) d00 = _mm256_setzero_si256(); - if (M > 1) d10 = _mm256_setzero_si256(); - if (M > 2) d20 = _mm256_setzero_si256(); - if (M > 3) d30 = _mm256_setzero_si256(); - if (M > 4) d40 = _mm256_setzero_si256(); - for (size_t ky = 0; ky < kY; ky += p.dilationY) - { - if (sy + ky < p.srcH) - { - for (size_t kx = 0; kx < kX; kx += p.dilationX) - { - assert(sx + kx < p.srcW && sx + kx + M <= p.srcW); - size_t offs = (sy + ky) * dY + (sx + kx) * dX, end = offs + srcC; - for (; offs < end; offs += 4) - { - w0 = _mm256_loadu_si256((__m256i*)weight0); - if (M > 0) s0 = Set4(src0 + offs), Madd4(d00, s0, 
w0); - if (M > 1) s0 = Set4(src1 + offs), Madd4(d10, s0, w0); - if (M > 2) s0 = Set4(src2 + offs), Madd4(d20, s0, w0); - if (M > 3) s0 = Set4(src3 + offs), Madd4(d30, s0, w0); - if (M > 4) s0 = Set4(src4 + offs), Madd4(d40, s0, w0); - weight0 += A; - } - weight0 += dW; - } - } - else if (a.zero) - { - s0 = _mm256_set1_epi32(a.zero); - for (size_t kx = 0; kx < kX; kx += p.dilationX) - { - for (size_t offs = 0, end = srcC; offs < end; offs += 4) - { - w0 = _mm256_loadu_si256((__m256i*)weight0); - if (M > 0) Madd4(d00, s0, w0); - if (M > 1) Madd4(d10, s0, w0); - if (M > 2) Madd4(d20, s0, w0); - if (M > 3) Madd4(d30, s0, w0); - if (M > 4) Madd4(d40, s0, w0); - weight0 += A; - } - weight0 += dW; - } - } - else - weight0 += dWz; - } - if (dstC == F) - { - if (M > 0) Save1(dst, buf, d00, norm, bias, params, scale, shift), dst += dD, buf += dB; - if (M > 1) Save1(dst, buf, d10, norm, bias, params, scale, shift), dst += dD, buf += dB; - if (M > 2) Save1(dst, buf, d20, norm, bias, params, scale, shift), dst += dD, buf += dB; - if (M > 3) Save1(dst, buf, d30, norm, bias, params, scale, shift), dst += dD, buf += dB; - if (M > 4) Save1(dst, buf, d40, norm, bias, params, scale, shift), dst += dD, buf += dB; - } - else - { - if (M > 0) Save1(dst, buf, d00, norm, bias, params, scale, shift, dstC), dst += dD, buf += dB; - if (M > 1) Save1(dst, buf, d10, norm, bias, params, scale, shift, dstC), dst += dD, buf += dB; - if (M > 2) Save1(dst, buf, d20, norm, bias, params, scale, shift, dstC), dst += dD, buf += dB; - if (M > 3) Save1(dst, buf, d30, norm, bias, params, scale, shift, dstC), dst += dD, buf += dB; - if (M > 4) Save1(dst, buf, d40, norm, bias, params, scale, shift, dstC), dst += dD, buf += dB; - } - } - } - - typedef void(*ConvolutionNhwcDirect_2xM_Ptr)(const uint8_t* src0, const ConvParam8i& p, const AlgParam& a, size_t dy, size_t dx, size_t srcC, size_t dstC, - const int8_t* weight0, const __m256i* bias, const __m256i* params, const __m256* scale, const __m256* shift, int32_t* buf, uint8_t* dst); - - template ConvolutionNhwcDirect_2xM_Ptr GetConvolutionNhwcDirect_2xM(size_t M) - { - switch (M) - { - case 0: return NULL; - case 1: return ConvolutionNhwcDirect_2xM; - case 2: return ConvolutionNhwcDirect_2xM; - case 3: return ConvolutionNhwcDirect_2xM; - case 4: return ConvolutionNhwcDirect_2xM; - } - assert(0); - return NULL; - } - - template void ConvolutionNhwcDirect_2(const uint8_t* src, - const ConvParam8i & p, const AlgParam & a, size_t dstC, size_t yBeg, size_t yEnd, size_t srcC, const int8_t* weight, - const int32_t* bias, const int32_t * params, const float * scale, const float* shift, int32_t* buf, uint8_t* dst) - { - size_t noseH = p.NoseH(), noseW = p.NoseW(), bodyH = p.BodyH(), bodyW = p.BodyW(); - size_t n = 5, bodyWn = AlignLoAny(bodyW - noseW, n) + noseW, m = bodyW - bodyWn; - ConvolutionNhwcDirect_2xM_Ptr convolutionNhwcDirect_2xM = GetConvolutionNhwcDirect_2xM(m); - size_t tailH = p.dstH, tailW = p.dstW; - size_t kY = p.kernelY - noseH, kX = p.kernelX - noseW, kH = bodyH + p.kernelY - 1, kW = bodyW + p.kernelX - 1; - __m256i _params[2], _bias[2]; - _params[0] = _mm256_setzero_si256(); - if (type == ::SimdConvolutionActivationRestrictRange) - _params[1] = _mm256_set1_epi32(a.high); - __m256 _scale[2], _shift[2]; - - for (size_t dc = 0; dc < dstC; dc += DF) - { - size_t dC = Simd::Min(DF, dstC - dc); - _bias[0] = _mm256_loadu_si256((__m256i*)(bias + dc + 0)); - _bias[1] = _mm256_loadu_si256((__m256i*)(bias + dc + F)); - _scale[0] = _mm256_loadu_ps(scale + dc + 0); - _scale[1] = 
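// Per-output-channel requantization state: bias is applied in the int32
// domain, then scale/shift map the activated accumulator back to the
// quantized output, roughly out = activation(acc + bias) * scale + shift
// (a sketch of the intent; the exact rounding and clamping live in the
// Save helpers).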
_mm256_loadu_ps(scale + dc + F); - _shift[0] = _mm256_loadu_ps(shift + dc + 0); - _shift[1] = _mm256_loadu_ps(shift + dc + F); - - uint8_t * d = dst + (dc + yBeg * p.dstW * p.dstC) * a.size; - int32_t * b = buf + dc + yBeg * p.dstW * p.dstC; - size_t dy = yBeg; - for (; dy < noseH && dy < yEnd; dy++) - { - size_t dx = 0; - for (; dx < noseW; dx++, b += p.dstC, d += p.dstC * a.size) - ConvolutionNhwcDirect_2x1(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, _scale, _shift, b, d); - for (; dx < bodyWn; dx += n, b += p.dstC * n, d += p.dstC * a.size * n) - ConvolutionNhwcDirect_2x5(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, _scale, _shift, b, d); - for (; dx < bodyW; dx += m, b += p.dstC * m, d += p.dstC * a.size * m) - convolutionNhwcDirect_2xM(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, _scale, _shift, b, d); - for (; dx < tailW; dx++, b += p.dstC, d += p.dstC * a.size) - ConvolutionNhwcDirect_2x1(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, _scale, _shift, b, d); - } - for (; dy < bodyH && dy < yEnd; dy++) - { - size_t dx = 0; - for (; dx < noseW; dx++, b += p.dstC, d += p.dstC * a.size) - ConvolutionNhwcDirect_2x1(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, _scale, _shift, b, d); - for (; dx < bodyWn; dx += n, b += p.dstC * n, d += p.dstC * a.size * n) - ConvolutionNhwcDirect_2x5(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, _scale, _shift, b, d); - for (; dx < bodyW; dx += m, b += p.dstC * m, d += p.dstC * a.size * m) - convolutionNhwcDirect_2xM(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, _scale, _shift, b, d); - for (; dx < tailW; dx++, b += p.dstC, d += p.dstC * a.size) - ConvolutionNhwcDirect_2x1(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, _scale, _shift, b, d); - } - for (; dy < tailH && dy < yEnd; dy++) - { - size_t dx = 0; - for (; dx < noseW; dx++, b += p.dstC, d += p.dstC * a.size) - ConvolutionNhwcDirect_2x1(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, _scale, _shift, b, d); - for (; dx < bodyWn; dx += n, b += p.dstC * n, d += p.dstC * a.size * n) - ConvolutionNhwcDirect_2x5(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, _scale, _shift, b, d); - for (; dx < bodyW; dx += m, b += p.dstC * m, d += p.dstC * a.size * m) - convolutionNhwcDirect_2xM(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, _scale, _shift, b, d); - for (; dx < tailW; dx++, b += p.dstC, d += p.dstC * a.size) - ConvolutionNhwcDirect_2x1(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, _scale, _shift, b, d); - } - weight += p.kernelY * p.kernelX * DivHi(p.srcC, 4) * DA; - } - } - - //--------------------------------------------------------------------- - - template void ConvolutionNhwcDirect1x1_2x5( - const uint8_t* src0, const ConvParam8i& p, const AlgParam& a, size_t srcC, size_t dstC, const int8_t* weight0, - const __m256i* bias, const __m256i* params, const __m256* scale, const __m256* shift, int32_t* buf, uint8_t* dst) - { - __m256i d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, s0, w0, w1; - size_t dS = p.srcC * p.strideX, dD = p.dstC * a.size, dB = p.dstC; - const int8_t* weight1 = weight0 + DivHi(p.srcC, 4) * A; - const uint8_t* src1 = src0 + 1 * dS; - const uint8_t* src2 = src0 + 2 * dS; - const uint8_t* src3 = src0 + 3 * dS; - const uint8_t* src4 = src0 + 4 * dS; - __m256i norm = _mm256_set1_epi32(a.norm); - if (dstC > F) - { - d00 = _mm256_setzero_si256(), d01 = _mm256_setzero_si256(); - d10 = _mm256_setzero_si256(), d11 = _mm256_setzero_si256(); - d20 = _mm256_setzero_si256(), d21 = _mm256_setzero_si256(); - d30 = 
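// Weight layout: each 32-byte load steps over A = 32 interleaved int8 values,
// i.e. 4 input channels for each of the 8 output channels in the block (hence
// the DivHi(p.srcC, 4) * A stride between weight0 and weight1), while Set4
// broadcasts the matching 4 consecutive uint8 inputs as a single 32-bit lane.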
_mm256_setzero_si256(), d31 = _mm256_setzero_si256(); - d40 = _mm256_setzero_si256(), d41 = _mm256_setzero_si256(); - for (size_t offs = 0; offs < srcC; offs += 4) - { - w0 = _mm256_loadu_si256((__m256i*)weight0); - w1 = _mm256_loadu_si256((__m256i*)weight1); - s0 = Set4(src0 + offs); - Madd4(d00, s0, w0); - Madd4(d01, s0, w1); - s0 = Set4(src1 + offs); - Madd4(d10, s0, w0); - Madd4(d11, s0, w1); - s0 = Set4(src2 + offs); - Madd4(d20, s0, w0); - Madd4(d21, s0, w1); - s0 = Set4(src3 + offs); - Madd4(d30, s0, w0); - Madd4(d31, s0, w1); - s0 = Set4(src4 + offs); - Madd4(d40, s0, w0); - Madd4(d41, s0, w1); - weight0 += A, weight1 += A; - } - if (dstC == DF) - { - Save2(dst, buf, d00, d01, norm, bias, params, scale, shift), dst += dD, buf += dB; - Save2(dst, buf, d10, d11, norm, bias, params, scale, shift), dst += dD, buf += dB; - Save2(dst, buf, d20, d21, norm, bias, params, scale, shift), dst += dD, buf += dB; - Save2(dst, buf, d30, d31, norm, bias, params, scale, shift), dst += dD, buf += dB; - Save2(dst, buf, d40, d41, norm, bias, params, scale, shift), dst += dD, buf += dB; - } - else - { - Save2(dst, buf, d00, d01, norm, bias, params, scale, shift, dstC - F), dst += dD, buf += dB; - Save2(dst, buf, d10, d11, norm, bias, params, scale, shift, dstC - F), dst += dD, buf += dB; - Save2(dst, buf, d20, d21, norm, bias, params, scale, shift, dstC - F), dst += dD, buf += dB; - Save2(dst, buf, d30, d31, norm, bias, params, scale, shift, dstC - F), dst += dD, buf += dB; - Save2(dst, buf, d40, d41, norm, bias, params, scale, shift, dstC - F), dst += dD, buf += dB; - } - } - else - { - d00 = _mm256_setzero_si256(); - d10 = _mm256_setzero_si256(); - d20 = _mm256_setzero_si256(); - d30 = _mm256_setzero_si256(); - d40 = _mm256_setzero_si256(); - for (size_t offs = 0; offs < srcC; offs += 4) - { - w0 = _mm256_loadu_si256((__m256i*)weight0); - s0 = Set4(src0 + offs); - Madd4(d00, s0, w0); - s0 = Set4(src1 + offs); - Madd4(d10, s0, w0); - s0 = Set4(src2 + offs); - Madd4(d20, s0, w0); - s0 = Set4(src3 + offs); - Madd4(d30, s0, w0); - s0 = Set4(src4 + offs); - Madd4(d40, s0, w0); - weight0 += A; - } - if (dstC == F) - { - Save1(dst, buf, d00, norm, bias, params, scale, shift), dst += dD, buf += dB; - Save1(dst, buf, d10, norm, bias, params, scale, shift), dst += dD, buf += dB; - Save1(dst, buf, d20, norm, bias, params, scale, shift), dst += dD, buf += dB; - Save1(dst, buf, d30, norm, bias, params, scale, shift), dst += dD, buf += dB; - Save1(dst, buf, d40, norm, bias, params, scale, shift), dst += dD, buf += dB; - } - else - { - Save1(dst, buf, d00, norm, bias, params, scale, shift, dstC), dst += dD, buf += dB; - Save1(dst, buf, d10, norm, bias, params, scale, shift, dstC), dst += dD, buf += dB; - Save1(dst, buf, d20, norm, bias, params, scale, shift, dstC), dst += dD, buf += dB; - Save1(dst, buf, d30, norm, bias, params, scale, shift, dstC), dst += dD, buf += dB; - Save1(dst, buf, d40, norm, bias, params, scale, shift, dstC), dst += dD, buf += dB; - } - } - } - - template void ConvolutionNhwcDirect1x1_2xM( - const uint8_t* src0, const ConvParam8i& p, const AlgParam& a, size_t srcC, size_t dstC, const int8_t* weight0, - const __m256i* bias, const __m256i* params, const __m256* scale, const __m256* shift, int32_t* buf, uint8_t* dst) - { - __m256i d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, s0, w0, w1; - size_t dS = p.srcC * p.strideX, dD = p.dstC * a.size, dB = p.dstC; - const int8_t* weight1 = weight0 + DivHi(p.srcC, 4) * A; - const uint8_t* src1 = src0 + 1 * dS; - const uint8_t* src2 = src0 + 2 * dS; 
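// With a 1x1 kernel the spatial loops vanish and the reduction is a plain
// matrix product over channels. One accumulation step of the saturating
// Madd4 variant looks like this (a sketch; which specialization runs is
// template-selected):
//
//     __m256i prod = _mm256_maddubs_epi16(u8x4, i8x4);   // u8*i8 pairs -> i16
//     i32 = _mm256_add_epi32(i32, _mm256_madd_epi16(prod, K16_0001));
//
// _mm256_maddubs_epi16 saturates at int16, which the precise Madd4
// specialization above avoids by widening both operands to 16 bit first.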
- const uint8_t* src3 = src0 + 3 * dS; - const uint8_t* src4 = src0 + 4 * dS; - __m256i norm = _mm256_set1_epi32(a.norm); - if (dstC > F) - { - if (M > 0) d00 = _mm256_setzero_si256(), d01 = _mm256_setzero_si256(); - if (M > 1) d10 = _mm256_setzero_si256(), d11 = _mm256_setzero_si256(); - if (M > 2) d20 = _mm256_setzero_si256(), d21 = _mm256_setzero_si256(); - if (M > 3) d30 = _mm256_setzero_si256(), d31 = _mm256_setzero_si256(); - if (M > 4) d40 = _mm256_setzero_si256(), d41 = _mm256_setzero_si256(); - for (size_t offs = 0; offs < srcC; offs += 4) - { - w0 = _mm256_loadu_si256((__m256i*)weight0); - w1 = _mm256_loadu_si256((__m256i*)weight1); - if (M > 0) s0 = Set4(src0 + offs), Madd4(d00, s0, w0), Madd4(d01, s0, w1); - if (M > 1) s0 = Set4(src1 + offs), Madd4(d10, s0, w0), Madd4(d11, s0, w1); - if (M > 2) s0 = Set4(src2 + offs), Madd4(d20, s0, w0), Madd4(d21, s0, w1); - if (M > 3) s0 = Set4(src3 + offs), Madd4(d30, s0, w0), Madd4(d31, s0, w1); - if (M > 4) s0 = Set4(src4 + offs), Madd4(d40, s0, w0), Madd4(d41, s0, w1); - weight0 += A, weight1 += A; - } - if (dstC == DF) - { - if (M > 0) Save2(dst, buf, d00, d01, norm, bias, params, scale, shift), dst += dD, buf += dB; - if (M > 1) Save2(dst, buf, d10, d11, norm, bias, params, scale, shift), dst += dD, buf += dB; - if (M > 2) Save2(dst, buf, d20, d21, norm, bias, params, scale, shift), dst += dD, buf += dB; - if (M > 3) Save2(dst, buf, d30, d31, norm, bias, params, scale, shift), dst += dD, buf += dB; - if (M > 4) Save2(dst, buf, d40, d41, norm, bias, params, scale, shift), dst += dD, buf += dB; - } - else - { - if (M > 0) Save2(dst, buf, d00, d01, norm, bias, params, scale, shift, dstC - F), dst += dD, buf += dB; - if (M > 1) Save2(dst, buf, d10, d11, norm, bias, params, scale, shift, dstC - F), dst += dD, buf += dB; - if (M > 2) Save2(dst, buf, d20, d21, norm, bias, params, scale, shift, dstC - F), dst += dD, buf += dB; - if (M > 3) Save2(dst, buf, d30, d31, norm, bias, params, scale, shift, dstC - F), dst += dD, buf += dB; - if (M > 4) Save2(dst, buf, d40, d41, norm, bias, params, scale, shift, dstC - F), dst += dD, buf += dB; - } - } - else - { - if (M > 0) d00 = _mm256_setzero_si256(); - if (M > 1) d10 = _mm256_setzero_si256(); - if (M > 2) d20 = _mm256_setzero_si256(); - if (M > 3) d30 = _mm256_setzero_si256(); - if (M > 4) d40 = _mm256_setzero_si256(); - for (size_t offs = 0; offs < srcC; offs += 4) - { - w0 = _mm256_loadu_si256((__m256i*)weight0); - if (M > 0) s0 = Set4(src0 + offs), Madd4(d00, s0, w0); - if (M > 1) s0 = Set4(src1 + offs), Madd4(d10, s0, w0); - if (M > 2) s0 = Set4(src2 + offs), Madd4(d20, s0, w0); - if (M > 3) s0 = Set4(src3 + offs), Madd4(d30, s0, w0); - if (M > 4) s0 = Set4(src4 + offs), Madd4(d40, s0, w0); - weight0 += A; - } - if (dstC == F) - { - if (M > 0) Save1(dst, buf, d00, norm, bias, params, scale, shift), dst += dD, buf += dB; - if (M > 1) Save1(dst, buf, d10, norm, bias, params, scale, shift), dst += dD, buf += dB; - if (M > 2) Save1(dst, buf, d20, norm, bias, params, scale, shift), dst += dD, buf += dB; - if (M > 3) Save1(dst, buf, d30, norm, bias, params, scale, shift), dst += dD, buf += dB; - if (M > 4) Save1(dst, buf, d40, norm, bias, params, scale, shift), dst += dD, buf += dB; - } - else - { - if (M > 0) Save1(dst, buf, d00, norm, bias, params, scale, shift, dstC), dst += dD, buf += dB; - if (M > 1) Save1(dst, buf, d10, norm, bias, params, scale, shift, dstC), dst += dD, buf += dB; - if (M > 2) Save1(dst, buf, d20, norm, bias, params, scale, shift, dstC), dst += dD, buf += dB; - if (M > 3) 
Save1(dst, buf, d30, norm, bias, params, scale, shift, dstC), dst += dD, buf += dB; - if (M > 4) Save1(dst, buf, d40, norm, bias, params, scale, shift, dstC), dst += dD, buf += dB; - } - } - } - - typedef void(*ConvolutionNhwcDirect1x1_2xM_Ptr)(const uint8_t* src0, const ConvParam8i& p, const AlgParam& a, size_t srcC, size_t dstC, - const int8_t* weight0, const __m256i* bias, const __m256i* params, const __m256* scale, const __m256* shift, int32_t* buf, uint8_t* dst); - - template ConvolutionNhwcDirect1x1_2xM_Ptr GetConvolutionNhwcDirect1x1_2xM(size_t M) - { - switch (M) - { - case 0: return NULL; - case 1: return ConvolutionNhwcDirect1x1_2xM; - case 2: return ConvolutionNhwcDirect1x1_2xM; - case 3: return ConvolutionNhwcDirect1x1_2xM; - case 4: return ConvolutionNhwcDirect1x1_2xM; - } - assert(0); - return NULL; - } - - template void ConvolutionNhwcDirect1x1_2(const uint8_t* src, - const ConvParam8i& p, const AlgParam& a, size_t dstC, size_t yBeg, size_t yEnd, size_t srcC, const int8_t* weight, - const int32_t* bias, const int32_t* params, const float* scale, const float* shift, int32_t* buf, uint8_t* dst) - { - size_t n1 = (yEnd - yBeg) * p.dstW, n5 = AlignLoAny(n1, 5), m = n1 - n5; - ConvolutionNhwcDirect1x1_2xM_Ptr convolutionNhwcDirect1x1_2xM = GetConvolutionNhwcDirect1x1_2xM(m); - __m256i _params[2], _bias[2]; - _params[0] = _mm256_setzero_si256(); - if (type == ::SimdConvolutionActivationRestrictRange) - _params[1] = _mm256_set1_epi32(a.high); - __m256 _scale[2], _shift[2]; - - for (size_t dc = 0; dc < dstC; dc += DF) - { - size_t dC = Simd::Min(DF, dstC - dc); - _bias[0] = _mm256_loadu_si256((__m256i*)(bias + dc + 0)); - _bias[1] = _mm256_loadu_si256((__m256i*)(bias + dc + F)); - _scale[0] = _mm256_loadu_ps(scale + dc + 0); - _scale[1] = _mm256_loadu_ps(scale + dc + F); - _shift[0] = _mm256_loadu_ps(shift + dc + 0); - _shift[1] = _mm256_loadu_ps(shift + dc + F); - const uint8_t* s = src + yBeg * p.srcW * p.srcC; - uint8_t* d = dst + (dc + yBeg * p.dstW * p.dstC) * a.size; - int32_t* b = buf + dc + yBeg * p.dstW * p.dstC; - size_t i = 0; - for (; i < n5; i += 5, s += p.srcC * 5, b += p.dstC * 5, d += p.dstC * a.size * 5) - ConvolutionNhwcDirect1x1_2x5(s, p, a, srcC, dC, weight, _bias, _params, _scale, _shift, b, d); - for (; i < n1; i += m, s += p.srcC * m, b += p.dstC * m, d += p.dstC * a.size * m) - convolutionNhwcDirect1x1_2xM(s, p, a, srcC, dC, weight, _bias, _params, _scale, _shift, b, d); - weight += DivHi(p.srcC, 4) * DA; - } - } - - //--------------------------------------------------------------------- - - template void Set(const ConvParam8i& p, const AlgParam & a, ConvolutionPtr * d) - { - if (p.Is1x1()) - { - switch (a.microD) - { - case 2 * F: d[term] = ConvolutionNhwcDirect1x1_2; break; - default: - assert(0); - } - } - else - { - switch (a.microD) - { - case 2 * F: d[term] = ConvolutionNhwcDirect_2; break; - default: - assert(0); - } - } - } - - template void Set(const ConvParam8i& p, const AlgParam& a, ConvolutionPtr* d) - { - if (p.compatibility & SimdSynetCompatibilityNoFma) - Set(p, a, d); - else - Set(p, a, d); - } - - template void Set(const ConvParam8i& p, const AlgParam& a, ConvolutionPtr* d) - { - if (p.compatibility & SimdSynetCompatibilityOverflow16i) - Set(p, a, d); - else - Set(p, a, d); - } - - template void Set(const ConvParam8i& p, const AlgParam& a, ConvolutionPtr* d) - { - Set(p, a, d); - Set(p, a, d); - Set(p, a, d); - Set(p, a, d); - Set(p, a, d); - Set(p, a, d); - } - - static void Set(const ConvParam8i& p, const AlgParam& a, ConvolutionPtr * d) - 
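// Dispatch funnel: the Set overloads peel off one degree of freedom at a time
// (the activation switch below, the six term variants, the Overflow16i and
// NoFma compatibility flags), so the innermost kernels are fully specialized
// at compile time and the runtime pays only a function-pointer call.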
{
-            switch (p.activation)
-            {
-            case SimdConvolutionActivationIdentity: Set(p, a, d); break;
-            case SimdConvolutionActivationRelu: Set(p, a, d); break;
-            case SimdConvolutionActivationRestrictRange: Set(p, a, d); break;
-            default: assert(0);
-            }
-        }
-
-        SynetConvolution8iNhwcDirect::SynetConvolution8iNhwcDirect(const ConvParam8i& p)
-            : Sse41::SynetConvolution8iNhwcDirect(p)
-        {
-            SetAlgParam(F, 2 * F, Base::AlgCacheL1(), Base::AlgCacheL2(), Base::AlgCacheL3());
-            Set(p, _alg, _convolutions);
-            _convertSrc = Avx2::SynetConvert32fTo8u;
-        }
-
-        //---------------------------------------------------------------------
-
-        void * SynetConvolution8iInit(size_t batch, const SimdConvolutionParameters * conv, SimdSynetCompatibilityType compatibility)
-        {
-            ConvParam8i param(batch, conv, compatibility);
-            if (!param.Valid())
-                return NULL;
-            else if (SynetConvolution8iNhwcDirect::Preferable(param))
-                return new SynetConvolution8iNhwcDirect(param);
-            else
-                return new Base::SynetConvolution8iGemmNN(param);
-        }
-    }
-#endif
-}
diff --git a/src/3rd/Simd/Simd/SimdAvx2SynetDeconvolution32f.cpp b/src/3rd/Simd/Simd/SimdAvx2SynetDeconvolution32f.cpp
deleted file mode 100644
index 97239105..00000000
--- a/src/3rd/Simd/Simd/SimdAvx2SynetDeconvolution32f.cpp
+++ /dev/null
@@ -1,304 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/ -#include "Simd/SimdSynetDeconvolution32f.h" -#include "Simd/SimdSynetConvolution32f.h" -#include "Simd/SimdSynetConvolution32fCommon.h" -#include "Simd/SimdExtract.h" -#include "Simd/SimdSynet.h" -#include "Simd/SimdAvx2.h" -#include "Simd/SimdGemm.h" -#include "Simd/SimdExp.h" -#include "Simd/SimdCpu.h" - -namespace Simd -{ -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - SynetDeconvolution32fGemmNN::SynetDeconvolution32fGemmNN(const DeconvParam32f & p) - : Avx::SynetDeconvolution32fGemmNN(p) - { - _gemm.Init(InitGemmFuncs(Avx2::Gemm32fNN, "Avx2", p.gemm, "Ext")); - if (_param.trans && _param.group == 1) - { - if (NHWC_GEMM_RUNTIME) - { - _gemmCb.Init(InitGemmCbFuncs(Avx2::Gemm32fNNcbBufferSize, Avx2::Gemm32fNNcbReorderB, Avx2::Gemm32fNNcbRun, "Avx2", GemmKernelF2, GemmKernelF3)); - _nhwcWeight.Resize(_gemmCb.At(0).BufferSize(_M*_merge, _N, _K)); - } - else - _nhwcWeight.Resize(Avx2::Gemm32fNNcbBufferSize(_M*_merge, _N, _K, GemmKernelAny, NHWC_GEMM_COMPATIBLE)); - _nhwcRun = Avx2::Gemm32fNNcbRun; - _nhwcReorderB = Avx2::Gemm32fNNcbReorderB; - } - _biasAndActivation = Avx2::ConvolutionBiasAndActivation; - } - - //--------------------------------------------------------------------- - - typedef void(*DeconvolutionNhwcDirect2x2_Ptr) (const float * src0, const DeconvParam32f & p, size_t srcC, size_t dstC, const float * weight, const __m256 * bias, const __m256 * params, float * ds); - - template void DeconvolutionNhwcDirect2x2_6(const float * src0, - const DeconvParam32f & p, size_t srcC, size_t dstC, const float * weight0, const __m256 * bias, const __m256 * params, float * dst) - { - size_t dS = p.srcC, dD = p.dstC; - const float * weight1 = weight0 + srcC * F; - const float * src1 = src0 + 1 * dS; - const float * src2 = src0 + 2 * dS; - const float * src3 = src0 + 3 * dS; - const float * src4 = src0 + 4 * dS; - const float * src5 = src0 + 5 * dS; - __m256 d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, s0, w0, w1; - d00 = _mm256_setzero_ps(); d01 = _mm256_setzero_ps(); - d10 = _mm256_setzero_ps(); d11 = _mm256_setzero_ps(); - d20 = _mm256_setzero_ps(); d21 = _mm256_setzero_ps(); - d30 = _mm256_setzero_ps(); d31 = _mm256_setzero_ps(); - d40 = _mm256_setzero_ps(); d41 = _mm256_setzero_ps(); - d50 = _mm256_setzero_ps(); d51 = _mm256_setzero_ps(); - for (size_t sc = 0; sc < srcC; ++sc) - { - w0 = _mm256_loadu_ps(weight0); - w1 = _mm256_loadu_ps(weight1); - s0 = _mm256_set1_ps(src0[sc]); - d00 = _mm256_fmadd_ps(s0, w0, d00); - d01 = _mm256_fmadd_ps(s0, w1, d01); - s0 = _mm256_set1_ps(src1[sc]); - d10 = _mm256_fmadd_ps(s0, w0, d10); - d11 = _mm256_fmadd_ps(s0, w1, d11); - s0 = _mm256_set1_ps(src2[sc]); - d20 = _mm256_fmadd_ps(s0, w0, d20); - d21 = _mm256_fmadd_ps(s0, w1, d21); - s0 = _mm256_set1_ps(src3[sc]); - d30 = _mm256_fmadd_ps(s0, w0, d30); - d31 = _mm256_fmadd_ps(s0, w1, d31); - s0 = _mm256_set1_ps(src4[sc]); - d40 = _mm256_fmadd_ps(s0, w0, d40); - d41 = _mm256_fmadd_ps(s0, w1, d41); - s0 = _mm256_set1_ps(src5[sc]); - d50 = _mm256_fmadd_ps(s0, w0, d50); - d51 = _mm256_fmadd_ps(s0, w1, d51); - weight0 += F; - weight1 += F; - } - if (dstC == F) - { - Term::template Save(dst + 0x0 * dD, d00, bias, params); - Term::template Save(dst + 0x1 * dD, d01, bias, params); - Term::template Save(dst + 0x2 * dD, d10, bias, params); - Term::template Save(dst + 0x3 * dD, d11, bias, params); - Term::template Save(dst + 0x4 * dD, d20, bias, params); - Term::template Save(dst + 0x5 * dD, d21, bias, params); - Term::template Save(dst + 0x6 * dD, d30, bias, params); - Term::template Save(dst + 
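// A 2x2 kernel with stride 2 maps each input pixel to a 2x2 output patch:
// the twelve stores 0x0..0xB interleave the two horizontal taps (the dN0/dN1
// pairs) of six input pixels along one output row, and the caller reruns the
// body with the second weight row (w1) to fill the next output row.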
0x7 * dD, d31, bias, params); - Term::template Save(dst + 0x8 * dD, d40, bias, params); - Term::template Save(dst + 0x9 * dD, d41, bias, params); - Term::template Save(dst + 0xA * dD, d50, bias, params); - Term::template Save(dst + 0xB * dD, d51, bias, params); - } - else - { - Term::template Save(dst + 0x0 * dD, d00, bias, params, dstC); - Term::template Save(dst + 0x1 * dD, d01, bias, params, dstC); - Term::template Save(dst + 0x2 * dD, d10, bias, params, dstC); - Term::template Save(dst + 0x3 * dD, d11, bias, params, dstC); - Term::template Save(dst + 0x4 * dD, d20, bias, params, dstC); - Term::template Save(dst + 0x5 * dD, d21, bias, params, dstC); - Term::template Save(dst + 0x6 * dD, d30, bias, params, dstC); - Term::template Save(dst + 0x7 * dD, d31, bias, params, dstC); - Term::template Save(dst + 0x8 * dD, d40, bias, params, dstC); - Term::template Save(dst + 0x9 * dD, d41, bias, params, dstC); - Term::template Save(dst + 0xA * dD, d50, bias, params, dstC); - Term::template Save(dst + 0xB * dD, d51, bias, params, dstC); - } - } - - template void DeconvolutionNhwcDirect2x2_M(const float * src0, - const DeconvParam32f & p, size_t srcC, size_t dstC, const float * weight0, const __m256 * bias, const __m256 * params, float * dst) - { - size_t dS = p.srcC, dD = p.dstC; - const float * weight1 = weight0 + srcC * F, *src1, *src2, *src3, *src4, *src5; - if (tail > 1) src1 = src0 + 1 * dS; - if (tail > 2) src2 = src0 + 2 * dS; - if (tail > 3) src3 = src0 + 3 * dS; - if (tail > 4) src4 = src0 + 4 * dS; - if (tail > 5) src5 = src0 + 5 * dS; - __m256 d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, s0, w0, w1; - if (tail > 0) d00 = _mm256_setzero_ps(), d01 = _mm256_setzero_ps(); - if (tail > 1) d10 = _mm256_setzero_ps(), d11 = _mm256_setzero_ps(); - if (tail > 2) d20 = _mm256_setzero_ps(), d21 = _mm256_setzero_ps(); - if (tail > 3) d30 = _mm256_setzero_ps(), d31 = _mm256_setzero_ps(); - if (tail > 4) d40 = _mm256_setzero_ps(), d41 = _mm256_setzero_ps(); - if (tail > 5) d50 = _mm256_setzero_ps(), d51 = _mm256_setzero_ps(); - for (size_t sc = 0; sc < srcC; ++sc) - { - w0 = _mm256_loadu_ps(weight0); - w1 = _mm256_loadu_ps(weight1); - if (tail > 0) s0 = _mm256_set1_ps(src0[sc]), d00 = _mm256_fmadd_ps(s0, w0, d00), d01 = _mm256_fmadd_ps(s0, w1, d01); - if (tail > 1) s0 = _mm256_set1_ps(src1[sc]), d10 = _mm256_fmadd_ps(s0, w0, d10), d11 = _mm256_fmadd_ps(s0, w1, d11); - if (tail > 2) s0 = _mm256_set1_ps(src2[sc]), d20 = _mm256_fmadd_ps(s0, w0, d20), d21 = _mm256_fmadd_ps(s0, w1, d21); - if (tail > 3) s0 = _mm256_set1_ps(src3[sc]), d30 = _mm256_fmadd_ps(s0, w0, d30), d31 = _mm256_fmadd_ps(s0, w1, d31); - if (tail > 4) s0 = _mm256_set1_ps(src4[sc]), d40 = _mm256_fmadd_ps(s0, w0, d40), d41 = _mm256_fmadd_ps(s0, w1, d41); - if (tail > 5) s0 = _mm256_set1_ps(src5[sc]), d50 = _mm256_fmadd_ps(s0, w0, d50), d51 = _mm256_fmadd_ps(s0, w1, d51); - weight0 += F; - weight1 += F; - } - if (dstC == F) - { - if (tail > 0) Term::template Save(dst + 0x0 * dD, d00, bias, params), Term::template Save(dst + 0x1 * dD, d01, bias, params); - if (tail > 1) Term::template Save(dst + 0x2 * dD, d10, bias, params), Term::template Save(dst + 0x3 * dD, d11, bias, params); - if (tail > 2) Term::template Save(dst + 0x4 * dD, d20, bias, params), Term::template Save(dst + 0x5 * dD, d21, bias, params); - if (tail > 3) Term::template Save(dst + 0x6 * dD, d30, bias, params), Term::template Save(dst + 0x7 * dD, d31, bias, params); - if (tail > 4) Term::template Save(dst + 0x8 * dD, d40, bias, params), Term::template Save(dst + 
0x9 * dD, d41, bias, params); - if (tail > 5) Term::template Save(dst + 0xA * dD, d50, bias, params), Term::template Save(dst + 0xB * dD, d51, bias, params); - } - else - { - if (tail > 0) Term::template Save(dst + 0x0 * dD, d00, bias, params, dstC), Term::template Save(dst + 0x1 * dD, d01, bias, params, dstC); - if (tail > 1) Term::template Save(dst + 0x2 * dD, d10, bias, params, dstC), Term::template Save(dst + 0x3 * dD, d11, bias, params, dstC); - if (tail > 2) Term::template Save(dst + 0x4 * dD, d20, bias, params, dstC), Term::template Save(dst + 0x5 * dD, d21, bias, params, dstC); - if (tail > 3) Term::template Save(dst + 0x6 * dD, d30, bias, params, dstC), Term::template Save(dst + 0x7 * dD, d31, bias, params, dstC); - if (tail > 4) Term::template Save(dst + 0x8 * dD, d40, bias, params, dstC), Term::template Save(dst + 0x9 * dD, d41, bias, params, dstC); - if (tail > 5) Term::template Save(dst + 0xA * dD, d50, bias, params, dstC), Term::template Save(dst + 0xB * dD, d51, bias, params, dstC); - } - } - - template SIMD_INLINE DeconvolutionNhwcDirect2x2_Ptr GetTailKernel(size_t tail) - { - switch (tail) - { - case 0: return DeconvolutionNhwcDirect2x2_M; - case 1: return DeconvolutionNhwcDirect2x2_M; - case 2: return DeconvolutionNhwcDirect2x2_M; - case 3: return DeconvolutionNhwcDirect2x2_M; - case 4: return DeconvolutionNhwcDirect2x2_M; - case 5: return DeconvolutionNhwcDirect2x2_M; - default: - assert(0); - return NULL; - } - } - - template void DeconvolutionNhwcDirect2x2(const float * src, const DeconvParam32f & p, - size_t dstC, size_t yBeg, size_t yEnd, size_t srcC, const float * weight, const float * bias, const float * params, float * dst) - { - size_t srcW6 = AlignLoAny(p.srcW, 6), tail = p.srcW - srcW6; - DeconvolutionNhwcDirect2x2_Ptr bodyKernel = DeconvolutionNhwcDirect2x2_6; - DeconvolutionNhwcDirect2x2_Ptr tailKernel = GetTailKernel(tail); - - __m256 _params[2], _bias[1]; - _params[0] = _mm256_set1_ps(params[0]); - if (type == ::SimdConvolutionActivationRestrictRange || type == ::SimdConvolutionActivationHswish) - _params[1] = _mm256_set1_ps(params[1]); - - for (size_t dc = 0; dc < dstC; dc += F) - { - size_t dC = Simd::Min(F, dstC - dc); - _bias[0] = _mm256_loadu_ps(bias + dc); - if (type == ::SimdConvolutionActivationPrelu) - _params[0] = _mm256_loadu_ps(params + dc); - const float * s = src + yBeg * p.srcW * p.srcC; - float * d = dst + yBeg * p.strideY * p.dstW * p.dstC; - const float * w0 = weight + 0 * p.kernelX * p.srcC * F; - const float * w1 = weight + 1 * p.kernelX * p.srcC * F; - for (size_t sy = yBeg; sy < yEnd; sy += 1, s += p.srcW * p.srcC) - { - for (size_t sx = 0; sx < srcW6; sx += 6) - bodyKernel(s + sx * p.srcC, p, srcC, dC, w0, _bias, _params, d), d += 6 * p.strideX * p.dstC; - if (tail) - tailKernel(s + srcW6 * p.srcC, p, srcC, dC, w0, _bias, _params, d), d += tail * p.strideX * p.dstC; - for (size_t sx = 0; sx < srcW6; sx += 6) - bodyKernel(s + sx * p.srcC, p, srcC, dC, w1, _bias, _params, d), d += 6 * p.strideX * p.dstC; - if (tail) - tailKernel(s + srcW6 * p.srcC, p, srcC, dC, w1, _bias, _params, d), d += tail * p.strideX * p.dstC; - } - weight += p.kernelY * p.kernelX*srcC*F; - dst += F; - } - } - - template void DeconvolutionNhwcDirect2x2(const float * src, const DeconvParam32f & p, - const SynetDeconvolution32fNhwcDirect2x2::AlgParam & a, const float * weight, const float * bias, const float * params, float * dst) - { - for (size_t dc = 0; dc < p.dstC; dc += a.macroD) - { - size_t macroD = Simd::Min(p.dstC, dc + a.macroD) - dc; - for (size_t sc = 0; 
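// Cache blocking: macroD, macroC and macroH are sized from the L1/L2/L3
// cache parameters (SetAlgParam below), and the srcC loop picks one of four
// kernel flavors, presumably so that bias and activation are applied exactly
// once per output even when the channel reduction spans several passes.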
sc < p.srcC; sc += a.macroC)
-                {
-                    size_t macroC = Simd::Min(p.srcC, sc + a.macroC) - sc;
-                    size_t macroK = p.kernelY * p.kernelX * macroC;
-                    for (size_t yBeg = 0; yBeg < p.srcH;)
-                    {
-                        size_t yEnd = Simd::Min(yBeg + a.macroH, p.srcH);
-                        if (a.macroC == p.srcC)
-                            DeconvolutionNhwcDirect2x2(src + sc, p, macroD, yBeg, yEnd, macroC, weight, bias + dc, params, dst + dc);
-                        else if (sc == 0)
-                            DeconvolutionNhwcDirect2x2(src + sc, p, macroD, yBeg, yEnd, macroC, weight, bias + dc, params, dst + dc);
-                        else if (sc + macroC == p.srcC)
-                            DeconvolutionNhwcDirect2x2(src + sc, p, macroD, yBeg, yEnd, macroC, weight, bias + dc, params, dst + dc);
-                        else
-                            DeconvolutionNhwcDirect2x2(src + sc, p, macroD, yBeg, yEnd, macroC, weight, bias + dc, params, dst + dc);
-                        yBeg = yEnd;
-                    }
-                    weight += AlignHiAny(macroD, a.microD) * macroK;
-                }
-                if (type == ::SimdConvolutionActivationPrelu)
-                    params += macroD;
-            }
-        }
-
-        SynetDeconvolution32fNhwcDirect2x2::SynetDeconvolution32fNhwcDirect2x2(const DeconvParam32f & p)
-            : Avx::SynetDeconvolution32fNhwcDirect2x2(p)
-        {
-            if (p.dstC > HF)
-            {
-                switch (p.activation)
-                {
-                case SimdConvolutionActivationIdentity: _deconvolution = DeconvolutionNhwcDirect2x2; break;
-                case SimdConvolutionActivationRelu: _deconvolution = DeconvolutionNhwcDirect2x2; break;
-                case SimdConvolutionActivationLeakyRelu: _deconvolution = DeconvolutionNhwcDirect2x2; break;
-                case SimdConvolutionActivationRestrictRange: _deconvolution = DeconvolutionNhwcDirect2x2; break;
-                case SimdConvolutionActivationPrelu: _deconvolution = DeconvolutionNhwcDirect2x2; break;
-                case SimdConvolutionActivationElu: _deconvolution = DeconvolutionNhwcDirect2x2; break;
-                case SimdConvolutionActivationHswish: _deconvolution = DeconvolutionNhwcDirect2x2; break;
-                default: assert(0);
-                }
-                SetAlgParam(F, Base::AlgCacheL1(), Base::AlgCacheL2(), Base::AlgCacheL3());
-            }
-        }
-
-        //---------------------------------------------------------------------
-
-        void * SynetDeconvolution32fInit(size_t batch, const SimdConvolutionParameters * conv, SimdGemm32fNNPtr gemm)
-        {
-            DeconvParam32f param(batch, conv, gemm);
-            if (!param.Valid())
-                return NULL;
-            if (SynetDeconvolution32fNhwcDirect2x2::Preferable(param))
-                return new SynetDeconvolution32fNhwcDirect2x2(param);
-            else
-                return new SynetDeconvolution32fGemmNN(param);
-        }
-    }
-#endif//SIMD_AVX2_ENABLE
-}
diff --git a/src/3rd/Simd/Simd/SimdAvx2SynetMergedConvolution32f.cpp b/src/3rd/Simd/Simd/SimdAvx2SynetMergedConvolution32f.cpp
deleted file mode 100644
index 99d729a2..00000000
--- a/src/3rd/Simd/Simd/SimdAvx2SynetMergedConvolution32f.cpp
+++ /dev/null
@@ -1,1304 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2020 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdSynetMergedConvolution32f.h" -#include "Simd/SimdSynetConvolution32fCommon.h" -#include "Simd/SimdUpdate.h" -#include "Simd/SimdCpu.h" - -namespace Simd -{ -#if defined(SIMD_AVX2_ENABLE) - namespace Avx2 - { - template SIMD_NOINLINE void InputConvolution1x1_2x6(const float * src0, size_t srcC, - const float * weight, const __m256 * bias, const __m256 * params, float * dst0, float * dst1) - { - __m256 d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, s0, w0, w1; - d00 = bias[0], d01 = bias[1]; - d10 = bias[0], d11 = bias[1]; - d20 = bias[0], d21 = bias[1]; - d30 = bias[0], d31 = bias[1]; - d40 = bias[0], d41 = bias[1]; - d50 = bias[0], d51 = bias[1]; - const float * src1 = src0 + 1 * srcC; - const float * src2 = src0 + 2 * srcC; - const float * src3 = src0 + 3 * srcC; - const float * src4 = src0 + 4 * srcC; - const float * src5 = src0 + 5 * srcC; - for (size_t sc = 0; sc < srcC; ++sc) - { - w0 = _mm256_loadu_ps(weight + 0); - w1 = _mm256_loadu_ps(weight + F); - s0 = _mm256_set1_ps(src0[sc]); - d00 = _mm256_fmadd_ps(s0, w0, d00); - d01 = _mm256_fmadd_ps(s0, w1, d01); - s0 = _mm256_set1_ps(src1[sc]); - d10 = _mm256_fmadd_ps(s0, w0, d10); - d11 = _mm256_fmadd_ps(s0, w1, d11); - s0 = _mm256_set1_ps(src2[sc]); - d20 = _mm256_fmadd_ps(s0, w0, d20); - d21 = _mm256_fmadd_ps(s0, w1, d21); - s0 = _mm256_set1_ps(src3[sc]); - d30 = _mm256_fmadd_ps(s0, w0, d30); - d31 = _mm256_fmadd_ps(s0, w1, d31); - s0 = _mm256_set1_ps(src4[sc]); - d40 = _mm256_fmadd_ps(s0, w0, d40); - d41 = _mm256_fmadd_ps(s0, w1, d41); - s0 = _mm256_set1_ps(src5[sc]); - d50 = _mm256_fmadd_ps(s0, w0, d50); - d51 = _mm256_fmadd_ps(s0, w1, d51); - weight += DF; - } - _mm256_storeu_ps(dst0 + 0 * F, Activate(d00, params, 0)); - _mm256_storeu_ps(dst0 + 1 * F, Activate(d10, params, 0)); - _mm256_storeu_ps(dst0 + 2 * F, Activate(d20, params, 0)); - _mm256_storeu_ps(dst0 + 3 * F, Activate(d30, params, 0)); - _mm256_storeu_ps(dst0 + 4 * F, Activate(d40, params, 0)); - _mm256_storeu_ps(dst0 + 5 * F, Activate(d50, params, 0)); - _mm256_storeu_ps(dst1 + 0 * F, Activate(d01, params, 1)); - _mm256_storeu_ps(dst1 + 1 * F, Activate(d11, params, 1)); - _mm256_storeu_ps(dst1 + 2 * F, Activate(d21, params, 1)); - _mm256_storeu_ps(dst1 + 3 * F, Activate(d31, params, 1)); - _mm256_storeu_ps(dst1 + 4 * F, Activate(d41, params, 1)); - _mm256_storeu_ps(dst1 + 5 * F, Activate(d51, params, 1)); - } - - template SIMD_NOINLINE void InputConvolution1x1_2xM(const float * src0, size_t srcC, - const float * weight, const __m256 * bias, const __m256 * params, float * dst0, float * dst1) - { - __m256 d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, s0, w0, w1; - if (M > 0) d00 = bias[0], d01 = bias[1]; - if (M > 1) d10 = bias[0], d11 = bias[1]; - if (M > 2) d20 = bias[0], d21 = bias[1]; - if (M > 3) d30 = bias[0], d31 = bias[1]; - if (M > 4) d40 = bias[0], d41 = bias[1]; - if (M > 5) d50 = bias[0], d51 = bias[1]; - const float * src1 = src0 + 1 * srcC; - const float * src2 = src0 + 2 * srcC; - const float * src3 = src0 + 3 * srcC; - const float * src4 = src0 + 4 * srcC; - const float * src5 = src0 + 5 * srcC; - for (size_t sc = 0; sc < srcC; ++sc) - { - w0 = _mm256_loadu_ps(weight + 0); - w1 = _mm256_loadu_ps(weight + F); - if (M > 0) s0 = 
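// Unlike the standalone kernels above, these accumulators start from the
// bias (d00 = bias[0], ...) instead of zero, and the activation is folded
// into the store via Activate(...); this input stage of the merged
// convolution writes a buffer that is presumably consumed directly by the
// following depthwise stage.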
_mm256_set1_ps(src0[sc]), d00 = _mm256_fmadd_ps(s0, w0, d00), d01 = _mm256_fmadd_ps(s0, w1, d01); - if (M > 1) s0 = _mm256_set1_ps(src1[sc]), d10 = _mm256_fmadd_ps(s0, w0, d10), d11 = _mm256_fmadd_ps(s0, w1, d11); - if (M > 2) s0 = _mm256_set1_ps(src2[sc]), d20 = _mm256_fmadd_ps(s0, w0, d20), d21 = _mm256_fmadd_ps(s0, w1, d21); - if (M > 3) s0 = _mm256_set1_ps(src3[sc]), d30 = _mm256_fmadd_ps(s0, w0, d30), d31 = _mm256_fmadd_ps(s0, w1, d31); - if (M > 4) s0 = _mm256_set1_ps(src4[sc]), d40 = _mm256_fmadd_ps(s0, w0, d40), d41 = _mm256_fmadd_ps(s0, w1, d41); - if (M > 5) s0 = _mm256_set1_ps(src5[sc]), d50 = _mm256_fmadd_ps(s0, w0, d50), d51 = _mm256_fmadd_ps(s0, w1, d51); - weight += DF; - } - if (M > 0) _mm256_storeu_ps(dst0 + 0 * F, Activate(d00, params, 0)), _mm256_storeu_ps(dst1 + 0 * F, Activate(d01, params, 1)); - if (M > 1) _mm256_storeu_ps(dst0 + 1 * F, Activate(d10, params, 0)), _mm256_storeu_ps(dst1 + 1 * F, Activate(d11, params, 1)); - if (M > 2) _mm256_storeu_ps(dst0 + 2 * F, Activate(d20, params, 0)), _mm256_storeu_ps(dst1 + 2 * F, Activate(d21, params, 1)); - if (M > 3) _mm256_storeu_ps(dst0 + 3 * F, Activate(d30, params, 0)), _mm256_storeu_ps(dst1 + 3 * F, Activate(d31, params, 1)); - if (M > 4) _mm256_storeu_ps(dst0 + 4 * F, Activate(d40, params, 0)), _mm256_storeu_ps(dst1 + 4 * F, Activate(d41, params, 1)); - if (M > 5) _mm256_storeu_ps(dst0 + 5 * F, Activate(d50, params, 0)), _mm256_storeu_ps(dst1 + 5 * F, Activate(d51, params, 1)); - } - - typedef void(*InputConvolution1x1_2xM_Ptr)(const float * src0, size_t srcC, const float * weight, const __m256 * bias, const __m256 * params, float * dst0, float * dst1); - - template InputConvolution1x1_2xM_Ptr GetInputConvolution1x1_2xM(size_t M) - { - switch (M) - { - case 0: return InputConvolution1x1_2xM; - case 1: return InputConvolution1x1_2xM; - case 2: return InputConvolution1x1_2xM; - case 3: return InputConvolution1x1_2xM; - case 4: return InputConvolution1x1_2xM; - case 5: return InputConvolution1x1_2xM; - } - assert(0); - return NULL; - } - - template SIMD_NOINLINE void InputConvolution1x1_1x6(const float * src0, size_t srcC, - const float * weight, const __m256 * bias, const __m256 * params, float * dst0) - { - __m256 d00, d10, d20, d30, d40, d50, s0, w0; - d00 = bias[0]; - d10 = bias[0]; - d20 = bias[0]; - d30 = bias[0]; - d40 = bias[0]; - d50 = bias[0]; - const float * src1 = src0 + 1 * srcC; - const float * src2 = src0 + 2 * srcC; - const float * src3 = src0 + 3 * srcC; - const float * src4 = src0 + 4 * srcC; - const float * src5 = src0 + 5 * srcC; - for (size_t sc = 0; sc < srcC; ++sc) - { - w0 = _mm256_loadu_ps(weight + 0); - s0 = _mm256_set1_ps(src0[sc]); - d00 = _mm256_fmadd_ps(s0, w0, d00); - s0 = _mm256_set1_ps(src1[sc]); - d10 = _mm256_fmadd_ps(s0, w0, d10); - s0 = _mm256_set1_ps(src2[sc]); - d20 = _mm256_fmadd_ps(s0, w0, d20); - s0 = _mm256_set1_ps(src3[sc]); - d30 = _mm256_fmadd_ps(s0, w0, d30); - s0 = _mm256_set1_ps(src4[sc]); - d40 = _mm256_fmadd_ps(s0, w0, d40); - s0 = _mm256_set1_ps(src5[sc]); - d50 = _mm256_fmadd_ps(s0, w0, d50); - weight += DF; - } - _mm256_storeu_ps(dst0 + 0 * F, Activate(d00, params, 0)); - _mm256_storeu_ps(dst0 + 1 * F, Activate(d10, params, 0)); - _mm256_storeu_ps(dst0 + 2 * F, Activate(d20, params, 0)); - _mm256_storeu_ps(dst0 + 3 * F, Activate(d30, params, 0)); - _mm256_storeu_ps(dst0 + 4 * F, Activate(d40, params, 0)); - _mm256_storeu_ps(dst0 + 5 * F, Activate(d50, params, 0)); - } - - template SIMD_NOINLINE void InputConvolution1x1_1xM(const float * src0, size_t srcC, - const float 
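GetInputConvolution1x1_2xM above (each `return` presumably carried a stripped `<type, M>` template argument list in the original source, which is why the cases look identical here) is the tail-dispatch half of that design: the row kernel takes the pixel count M as a compile-time parameter, so every `if (M > k)` folds away at instantiation, and the selector maps a runtime remainder to the right instance once per row instead of branching per pixel. A self-contained sketch of the pattern (hypothetical names):

    #include <immintrin.h>
    #include <cassert>
    #include <cstddef>

    // Tail dispatch: M is a compile-time constant, so the `i < M` loops unroll
    // and dead accumulators vanish at instantiation. (The library spells the
    // accumulators out as named variables d00..d51 to guarantee register
    // allocation; an array is used here only for brevity.)
    template<int M> static void Kernel1xM(const float* src, size_t srcC, const float* weight, float* dst)
    {
        __m256 d[5];
        for (int i = 0; i < M; ++i)
            d[i] = _mm256_setzero_ps();
        for (size_t sc = 0; sc < srcC; ++sc, weight += 8)
        {
            __m256 w = _mm256_loadu_ps(weight);
            for (int i = 0; i < M; ++i)
                d[i] = _mm256_fmadd_ps(_mm256_set1_ps(src[i * srcC + sc]), w, d[i]);
        }
        for (int i = 0; i < M; ++i)
            _mm256_storeu_ps(dst + i * 8, d[i]);
    }

    typedef void (*Kernel1xMPtr)(const float*, size_t, const float*, float*);

    static Kernel1xMPtr GetKernel1xM(size_t tail) // remainder of a row, 1..5
    {
        switch (tail)
        {
        case 1: return Kernel1xM<1>;
        case 2: return Kernel1xM<2>;
        case 3: return Kernel1xM<3>;
        case 4: return Kernel1xM<4>;
        case 5: return Kernel1xM<5>;
        }
        assert(0);
        return nullptr;
    }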
* weight, const __m256 * bias, const __m256 * params, float * dst0) - { - __m256 d00, d10, d20, d30, d40, d50, s0, w0; - if (M > 0) d00 = bias[0]; - if (M > 1) d10 = bias[0]; - if (M > 2) d20 = bias[0]; - if (M > 3) d30 = bias[0]; - if (M > 4) d40 = bias[0]; - if (M > 5) d50 = bias[0]; - const float * src1 = src0 + 1 * srcC; - const float * src2 = src0 + 2 * srcC; - const float * src3 = src0 + 3 * srcC; - const float * src4 = src0 + 4 * srcC; - const float * src5 = src0 + 5 * srcC; - for (size_t sc = 0; sc < srcC; ++sc) - { - w0 = _mm256_loadu_ps(weight + 0); - if (M > 0) s0 = _mm256_set1_ps(src0[sc]), d00 = _mm256_fmadd_ps(s0, w0, d00); - if (M > 1) s0 = _mm256_set1_ps(src1[sc]), d10 = _mm256_fmadd_ps(s0, w0, d10); - if (M > 2) s0 = _mm256_set1_ps(src2[sc]), d20 = _mm256_fmadd_ps(s0, w0, d20); - if (M > 3) s0 = _mm256_set1_ps(src3[sc]), d30 = _mm256_fmadd_ps(s0, w0, d30); - if (M > 4) s0 = _mm256_set1_ps(src4[sc]), d40 = _mm256_fmadd_ps(s0, w0, d40); - if (M > 5) s0 = _mm256_set1_ps(src5[sc]), d50 = _mm256_fmadd_ps(s0, w0, d50); - weight += DF; - } - if (M > 0) _mm256_storeu_ps(dst0 + 0 * F, Activate(d00, params, 0)); - if (M > 1) _mm256_storeu_ps(dst0 + 1 * F, Activate(d10, params, 0)); - if (M > 2) _mm256_storeu_ps(dst0 + 2 * F, Activate(d20, params, 0)); - if (M > 3) _mm256_storeu_ps(dst0 + 3 * F, Activate(d30, params, 0)); - if (M > 4) _mm256_storeu_ps(dst0 + 4 * F, Activate(d40, params, 0)); - if (M > 5) _mm256_storeu_ps(dst0 + 5 * F, Activate(d50, params, 0)); - } - - typedef void(*InputConvolution1x1_1xM_Ptr)(const float * src0, size_t srcC, const float * weight, const __m256 * bias, const __m256 * params, float * dst0); - - template InputConvolution1x1_1xM_Ptr GetInputConvolution1x1_1xM(size_t M) - { - switch (M) - { - case 0: return InputConvolution1x1_1xM; - case 1: return InputConvolution1x1_1xM; - case 2: return InputConvolution1x1_1xM; - case 3: return InputConvolution1x1_1xM; - case 4: return InputConvolution1x1_1xM; - case 5: return InputConvolution1x1_1xM; - } - assert(0); - return NULL; - } - - template SIMD_NOINLINE void InputConvolution1x1(const float * src, const SimdConvolutionParameters & p, - size_t dstC, size_t yBeg, size_t yEnd, const size_t bufH[2], const float * weight, const float * bias, const float * params, float * dst) - { - size_t srcH = p.srcH, srcW = p.srcW, srcC = p.srcC, dstW = p.dstW; - size_t dstM = (bufH[0] - 1), dstS = bufH[0] * dstW * F; - size_t dstCDF = AlignLo(dstC, DF); - __m256 _params[2], _bias[2]; - _params[0] = _mm256_set1_ps(params[0]); - if (type == ::SimdConvolutionActivationRestrictRange || type == ::SimdConvolutionActivationHswish) - _params[1] = _mm256_set1_ps(params[1]); -#ifdef SIMD_MERGECONV_MERGE_INPUT_ROWS_1X1 - size_t yInt = Simd::Max(yBeg, yEnd&(~dstM)), nBeg = yBeg * dstW, nInt = yInt * dstW, nEnd = yEnd * dstW; - size_t nInt6 = AlignLoAny(nInt - nBeg, 6) + nBeg, nEnd6 = AlignLoAny(nEnd - nInt, 6) + nInt, nIntTail = nInt - nInt6, nEndTail = nEnd - nEnd6; - InputConvolution1x1_2xM_Ptr tailInt_2 = GetInputConvolution1x1_2xM(nIntTail); - InputConvolution1x1_2xM_Ptr tailEnd_2 = GetInputConvolution1x1_2xM(nEndTail); -#else - size_t dstW6 = AlignLoAny(dstW, 6), wTail = dstW - dstW6; - InputConvolution1x1_2xM_Ptr tailW_2 = GetInputConvolution1x1_2xM(wTail); - InputConvolution1x1_1xM_Ptr tailW_1 = GetInputConvolution1x1_1xM(wTail); -#endif - - size_t dc = 0; - for (; dc < dstC; dc += DF) - { - _bias[0] = bias ? _mm256_loadu_ps(bias + dc + 0) : _mm256_setzero_ps(); - _bias[1] = bias ? 
_mm256_loadu_ps(bias + dc + F) : _mm256_setzero_ps(); - if (type == ::SimdConvolutionActivationPrelu) - { - _params[0] = _mm256_loadu_ps(params + dc + 0); - _params[1] = _mm256_loadu_ps(params + dc + F); - } - const float * pS = src + yBeg * srcW*srcC; - const float * pW = weight + dc * srcC; - float * pD = dst + (dc / F)*dstS; -#ifdef SIMD_MERGECONV_MERGE_INPUT_ROWS_1X1 - float * dst0 = pD + (yBeg&dstM)*dstW*F; - float * dst1 = pD + (yInt&dstM)*dstW*F; - size_t dn = nBeg; - if (dstC - dc > F) - { - for (; dn < nInt6; dn += 6, pS += 6 * srcC, dst0 += 6 * F) - InputConvolution1x1_2x6(pS, srcC, pW, _bias, _params, dst0, dst0 + dstS); - if (nIntTail) - tailInt_2(pS, srcC, pW, _bias, _params, dst0, dst0 + dstS), pS += nIntTail * srcC, dn += nIntTail; - for (; dn < nEnd6; dn += 6, pS += 6 * srcC, dst1 += 6 * F) - InputConvolution1x1_2x6(pS, srcC, pW, _bias, _params, dst1, dst1 + dstS); - if (nEndTail) - tailEnd_2(pS, srcC, pW, _bias, _params, dst1, dst1 + dstS), pS += nEndTail * srcC, dn += nEndTail; - } - else - { - InputConvolution1x1_1xM_Ptr tailInt_1 = GetInputConvolution1x1_1xM(nIntTail); - InputConvolution1x1_1xM_Ptr tailEnd_1 = GetInputConvolution1x1_1xM(nEndTail); - for (; dn < nInt6; dn += 6, pS += 6 * srcC, dst0 += 6 * F) - InputConvolution1x1_1x6(pS, srcC, pW, _bias, _params, dst0); - if (nIntTail) - tailInt_1(pS, srcC, pW, _bias, _params, dst0), pS += nIntTail * srcC, dn += nIntTail; - for (; dn < nEnd6; dn += 6, pS += 6 * srcC, dst1 += 6 * F) - InputConvolution1x1_1x6(pS, srcC, pW, _bias, _params, dst1); - if (nEndTail) - tailEnd_1(pS, srcC, pW, _bias, _params, dst1), pS += nEndTail * srcC, dn += nEndTail; - } -#else - for (size_t dy = yBeg; dy < yEnd; ++dy) - { - float * dst0 = pD + (dy&dstM)*dstW*F; - size_t dx = 0; - if (dstC - dc > F) - { - for (; dx < dstW6; dx += 6, pS += 6 * srcC, dst0 += 6 * F) - InputConvolution1x1_2x6(pS, srcC, pW, _bias, _params, dst0, dst0 + dstS); - if (wTail) - tailW_2(pS, srcC, pW, _bias, _params, dst0, dst0 + dstS), pS += wTail * srcC, dx += wTail; - } - else - { - for (; dx < dstW6; dx += 6, pS += 6 * srcC, dst0 += 6 * F) - InputConvolution1x1_1x6(pS, srcC, pW, _bias, _params, dst0); - if (wTail) - tailW_1(pS, srcC, pW, _bias, _params, dst0), pS += wTail * srcC, dx += wTail; - } - } -#endif - } - } - - template SIMD_NOINLINE void InputConvolution_2x1(const float * src0, const SimdConvolutionParameters & p, - size_t kH, size_t kW, const float * weight, const __m256 * bias, const __m256 * params, float * dst0, float * dst1) - { - __m256 d00, d01, s0, w0, w1; - d00 = bias[0]; - d01 = bias[1]; - size_t size = kW * p.srcC, tail = DF * (p.kernelX - kW)*p.srcC, stride = p.srcW * p.srcC; - for (size_t ky = 0; ky < kH; ++ky) - { - for (size_t i = 0; i < size; ++i) - { - w0 = _mm256_loadu_ps(weight + 0); - w1 = _mm256_loadu_ps(weight + F); - s0 = _mm256_set1_ps(src0[i]); - d00 = _mm256_fmadd_ps(s0, w0, d00); - d01 = _mm256_fmadd_ps(s0, w1, d01); - weight += DF; - } - weight += tail; - src0 += stride; - } - _mm256_storeu_ps(dst0, Activate(d00, params, 0)); - _mm256_storeu_ps(dst1, Activate(d01, params, 1)); - } - - template SIMD_NOINLINE void InputConvolution_1x1(const float * src0, const SimdConvolutionParameters & p, - size_t kH, size_t kW, const float * weight, const __m256 * bias, const __m256 * params, float * dst0) - { - __m256 d00, s0, w0; - d00 = bias[0]; - size_t size = kW * p.srcC, tail = DF * (p.kernelX - kW)*p.srcC, stride = p.srcW * p.srcC; - for (size_t ky = 0; ky < kH; ++ky) - { - for (size_t i = 0; i < size; ++i) - { - w0 = 
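A detail that recurs in every driver here: `dstM = bufH[0] - 1` and indexing like `(dy & dstM) * dstW * F` implement a ring buffer of rows. `bufH` is a power of two, so the AND is a cheap `dy % bufH`, and only `bufH` rows of each intermediate tensor ever exist in memory; the three merged stages stream rows through these rings instead of materializing whole feature maps. A sketch, assuming a power-of-two row count:

    #include <cstddef>

    // Power-of-two row ring: Row(y) aliases Row(y + bufH), so a producer
    // writing row y and a consumer reading row y - k (k < bufH) share one
    // small buffer instead of a full feature map.
    struct RowRing
    {
        float* data;     // bufH * rowSize floats
        size_t rowSize;  // elements per row
        size_t mask;     // bufH - 1, with bufH a power of two

        float* Row(size_t y) const { return data + (y & mask) * rowSize; }
    };

This is also why every kernel in the file is written over a row range [yBeg, yEnd): the scheduler only ever requests as many rows as the ring can hold.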
_mm256_loadu_ps(weight + 0); - s0 = _mm256_set1_ps(src0[i]); - d00 = _mm256_fmadd_ps(s0, w0, d00); - weight += DF; - } - weight += tail; - src0 += stride; - } - _mm256_storeu_ps(dst0, Activate(d00, params, 0)); - } - - template SIMD_NOINLINE void InputConvolution_2x6(const float * src0, const SimdConvolutionParameters & p, - size_t kH, size_t kW, const float * weight, const __m256 * bias, const __m256 * params, float * dst0, float * dst1) - { - __m256 d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, s0, w0, w1; - d00 = bias[0], d01 = bias[1]; - d10 = bias[0], d11 = bias[1]; - d20 = bias[0], d21 = bias[1]; - d30 = bias[0], d31 = bias[1]; - d40 = bias[0], d41 = bias[1]; - d50 = bias[0], d51 = bias[1]; - size_t size = kW * p.srcC, tail = DF * (p.kernelX - kW)*p.srcC, stride = p.srcW * p.srcC, step = p.srcC*p.strideX; - const float * src1 = src0 + 1 * step; - const float * src2 = src0 + 2 * step; - const float * src3 = src0 + 3 * step; - const float * src4 = src0 + 4 * step; - const float * src5 = src0 + 5 * step; - for (size_t ky = 0; ky < kH; ++ky) - { - size_t offset = ky * stride; - for (size_t end = offset + size; offset < end; ++offset) - { - w0 = _mm256_loadu_ps(weight + 0); - w1 = _mm256_loadu_ps(weight + F); - s0 = _mm256_set1_ps(src0[offset]); - d00 = _mm256_fmadd_ps(s0, w0, d00); - d01 = _mm256_fmadd_ps(s0, w1, d01); - s0 = _mm256_set1_ps(src1[offset]); - d10 = _mm256_fmadd_ps(s0, w0, d10); - d11 = _mm256_fmadd_ps(s0, w1, d11); - s0 = _mm256_set1_ps(src2[offset]); - d20 = _mm256_fmadd_ps(s0, w0, d20); - d21 = _mm256_fmadd_ps(s0, w1, d21); - s0 = _mm256_set1_ps(src3[offset]); - d30 = _mm256_fmadd_ps(s0, w0, d30); - d31 = _mm256_fmadd_ps(s0, w1, d31); - s0 = _mm256_set1_ps(src4[offset]); - d40 = _mm256_fmadd_ps(s0, w0, d40); - d41 = _mm256_fmadd_ps(s0, w1, d41); - s0 = _mm256_set1_ps(src5[offset]); - d50 = _mm256_fmadd_ps(s0, w0, d50); - d51 = _mm256_fmadd_ps(s0, w1, d51); - weight += DF; - } - weight += tail; - } - _mm256_storeu_ps(dst0 + 0 * F, Activate(d00, params, 0)); - _mm256_storeu_ps(dst1 + 0 * F, Activate(d01, params, 1)); - _mm256_storeu_ps(dst0 + 1 * F, Activate(d10, params, 0)); - _mm256_storeu_ps(dst1 + 1 * F, Activate(d11, params, 1)); - _mm256_storeu_ps(dst0 + 2 * F, Activate(d20, params, 0)); - _mm256_storeu_ps(dst1 + 2 * F, Activate(d21, params, 1)); - _mm256_storeu_ps(dst0 + 3 * F, Activate(d30, params, 0)); - _mm256_storeu_ps(dst1 + 3 * F, Activate(d31, params, 1)); - _mm256_storeu_ps(dst0 + 4 * F, Activate(d40, params, 0)); - _mm256_storeu_ps(dst1 + 4 * F, Activate(d41, params, 1)); - _mm256_storeu_ps(dst0 + 5 * F, Activate(d50, params, 0)); - _mm256_storeu_ps(dst1 + 5 * F, Activate(d51, params, 1)); - } - - template SIMD_NOINLINE void InputConvolution_1x6(const float * src0, const SimdConvolutionParameters & p, - size_t kH, size_t kW, const float * weight, const __m256 * bias, const __m256 * params, float * dst0) - { - __m256 d00, d10, d20, d30, d40, d50, s0, w0; - d00 = bias[0]; - d10 = bias[0]; - d20 = bias[0]; - d30 = bias[0]; - d40 = bias[0]; - d50 = bias[0]; - size_t size = kW * p.srcC, tail = DF * (p.kernelX - kW)*p.srcC, stride = p.srcW * p.srcC, step = p.srcC*p.strideX; - const float * src1 = src0 + 1 * step; - const float * src2 = src0 + 2 * step; - const float * src3 = src0 + 3 * step; - const float * src4 = src0 + 4 * step; - const float * src5 = src0 + 5 * step; - for (size_t ky = 0; ky < kH; ++ky) - { - size_t offset = ky * stride; - for (size_t end = offset + size; offset < end; ++offset) - { - w0 = _mm256_loadu_ps(weight + 0); - s0 = 
_mm256_set1_ps(src0[offset]); - d00 = _mm256_fmadd_ps(s0, w0, d00); - s0 = _mm256_set1_ps(src1[offset]); - d10 = _mm256_fmadd_ps(s0, w0, d10); - s0 = _mm256_set1_ps(src2[offset]); - d20 = _mm256_fmadd_ps(s0, w0, d20); - s0 = _mm256_set1_ps(src3[offset]); - d30 = _mm256_fmadd_ps(s0, w0, d30); - s0 = _mm256_set1_ps(src4[offset]); - d40 = _mm256_fmadd_ps(s0, w0, d40); - s0 = _mm256_set1_ps(src5[offset]); - d50 = _mm256_fmadd_ps(s0, w0, d50); - weight += DF; - } - weight += tail; - } - _mm256_storeu_ps(dst0 + 0 * F, Activate(d00, params, 0)); - _mm256_storeu_ps(dst0 + 1 * F, Activate(d10, params, 0)); - _mm256_storeu_ps(dst0 + 2 * F, Activate(d20, params, 0)); - _mm256_storeu_ps(dst0 + 3 * F, Activate(d30, params, 0)); - _mm256_storeu_ps(dst0 + 4 * F, Activate(d40, params, 0)); - _mm256_storeu_ps(dst0 + 5 * F, Activate(d50, params, 0)); - } - - template SIMD_NOINLINE void InputConvolution(const float * src, const SimdConvolutionParameters & p, - size_t dstC, size_t yBeg, size_t yEnd, const size_t bufH[2], const float * weight, const float * bias, const float * params, float * dst) - { - size_t srcH = p.srcH, srcW = p.srcW, srcC = p.srcC, dstW = p.dstW; - size_t kernelY = p.kernelY, kernelX = p.kernelX, strideY = p.strideY, strideX = p.strideX; - size_t dstM = (bufH[0] - 1), dstS = bufH[0] * dstW * F; - size_t dstCDF = AlignLo(dstC, DF); - if (dstC - F > dstCDF) - dstCDF += DF; - - size_t noseH = p.padY, noseW = p.padX; - size_t bodyH = p.srcH - p.kernelY + 1 + noseH, bodyW = p.srcW - p.kernelX + 1 + noseW; - size_t bodyW6 = AlignLoAny(bodyW - noseW, 6 * p.strideX) + noseW; - size_t tailH = bodyH + p.padH, tailW = bodyW + p.padW; - size_t wS = p.srcC*p.dstC; - size_t kY = p.kernelY - noseH, kX = p.kernelX - noseW, kH = bodyH + p.kernelY - 1, kW = bodyW + p.kernelX - 1; - - __m256 _params[2], _bias[2]; - _params[0] = _mm256_set1_ps(params[0]); - if (type == ::SimdConvolutionActivationRestrictRange || type == ::SimdConvolutionActivationHswish) - _params[1] = _mm256_set1_ps(params[1]); - - size_t dc = 0; - for (; dc < dstCDF; dc += DF) - { - _bias[0] = bias ? _mm256_loadu_ps(bias + dc + 0) : _mm256_setzero_ps(); - _bias[1] = bias ? 
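The InputConvolution driver above is organized around a three-way border split: nose rows and columns where the kernel window is clipped by the top/left padding, a body where the full kernelY x kernelX window fits (and the unchecked 6-wide kernel runs), and a tail clipped by the bottom/right padding; the clipped calls pass reduced kH/kW counts and a shifted weight pointer instead of testing bounds per tap. The depthwise driver later in this file computes the same split in output coordinates. A sketch of that output-coordinate computation, assuming the standard size relation dstW = (srcW + padX + padW - kernelX) / strideX + 1 and no dilation:

    #include <algorithm>
    #include <cstddef>

    // Partition output columns into [0, nose) / [nose, body) / [body, dstW):
    // left-clipped, full-window (no bounds checks needed), right-clipped.
    struct Split { size_t nose, body; };

    static Split SplitAxis(size_t srcW, size_t kernelX, size_t strideX, size_t padX)
    {
        Split s;
        s.nose = (padX + strideX - 1) / strideX;        // first dx whose window clears the left edge
        s.body = (srcW + padX - kernelX) / strideX + 1; // first dx whose window crosses the right edge
        s.body = std::max(s.body, s.nose);
        return s;
    }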
_mm256_loadu_ps(bias + dc + F) : _mm256_setzero_ps(); - if (type == ::SimdConvolutionActivationPrelu) - { - _params[0] = _mm256_loadu_ps(params + dc + 0); - _params[1] = _mm256_loadu_ps(params + dc + F); - } - size_t dy = yBeg, sy = dy * strideY; - for (; sy < noseH && dy < yEnd; sy += strideY, dy++) - { - float * dst0 = dst + (dy&dstM)*dstW*F + (dc / F)*dstS, *dst1 = dst0 + dstS; - size_t sx = 0; - const float * s = src; - const float * w = weight + (noseH - sy) * kernelX * DF * srcC; - for (; sx < noseW; sx += strideX, dst0 += F, dst1 += F) - InputConvolution_2x1(s, p, kY + sy, kX + sx, w + (noseW - sx)*srcC*DF, _bias, _params, dst0, dst1); - for (; sx < bodyW6; sx += 6 * strideX, dst0 += 6 * F, dst1 += 6 * F) - InputConvolution_2x6(s + (sx - noseW) * srcC, p, kY + sy, kernelX, w, _bias, _params, dst0, dst1); - for (; sx < bodyW; sx += strideX, dst0 += F, dst1 += F) - InputConvolution_2x1(s + (sx - noseW) * srcC, p, kY + sy, kernelX, w, _bias, _params, dst0, dst1); - for (; sx < tailW; sx += strideX, dst0 += F, dst1 += F) - InputConvolution_2x1(s + (sx - noseW) * srcC, p, kY + sy, kW - sx, w, _bias, _params, dst0, dst1); - } - for (; sy < bodyH && dy < yEnd; sy += strideY, dy++) - { - float * dst0 = dst + (dy&dstM)*dstW*F + (dc / F)*dstS, *dst1 = dst0 + dstS; - size_t sx = 0; - const float * s = src + (sy - noseH)*srcW*srcC; - const float * w = weight; - for (; sx < noseW; sx += strideX, dst0 += F, dst1 += F) - InputConvolution_2x1(s, p, kernelY, kX + sx, w + (noseW - sx)*srcC*DF, _bias, _params, dst0, dst1); - for (; sx < bodyW6; sx += 6 * strideX, dst0 += 6 * F, dst1 += 6 * F) - InputConvolution_2x6(s + (sx - noseW) * srcC, p, kernelY, kernelX, w, _bias, _params, dst0, dst1); - for (; sx < bodyW; sx += strideX, dst0 += F, dst1 += F) - InputConvolution_2x1(s + (sx - noseW) * srcC, p, kernelY, kernelX, w, _bias, _params, dst0, dst1); - for (; sx < tailW; sx += strideX, dst0 += F, dst1 += F) - InputConvolution_2x1(s + (sx - noseW) * srcC, p, kernelY, kW - sx, w, _bias, _params, dst0, dst1); - } - for (; sy < tailH && dy < yEnd; sy += strideY, dy++) - { - float * dst0 = dst + (dy&dstM)*dstW*F + (dc / F)*dstS, *dst1 = dst0 + dstS; - size_t sx = 0; - const float * s = src + (sy - noseH)*srcW*srcC; - const float * w = weight; - for (; sx < noseW; sx += strideX, dst0 += F, dst1 += F) - InputConvolution_2x1(s, p, kH - sy, kX + sx, w + (noseW - sx)*srcC*DF, _bias, _params, dst0, dst1); - for (; sx < bodyW6; sx += 6 * strideX, dst0 += 6 * F, dst1 += 6 * F) - InputConvolution_2x6(s + (sx - noseW) * srcC, p, kH - sy, kernelX, w, _bias, _params, dst0, dst1); - for (; sx < bodyW; sx += strideX, dst0 += F, dst1 += F) - InputConvolution_2x1(s + (sx - noseW) * srcC, p, kH - sy, kernelX, w, _bias, _params, dst0, dst1); - for (; sx < tailW; sx += strideX, dst0 += F, dst1 += F) - InputConvolution_2x1(s + (sx - noseW) * srcC, p, kH - sy, kW - sx, w, _bias, _params, dst0, dst1); - } - weight += kernelY * kernelX*srcC*DF; - } - if (dc < dstC) - { - _bias[0] = bias ? 
_mm256_loadu_ps(bias + dc) : _mm256_setzero_ps(); - if (type == ::SimdConvolutionActivationPrelu) - _params[0] = _mm256_loadu_ps(params + dc); - size_t dy = yBeg, sy = dy * strideY; - for (; sy < noseH && dy < yEnd; sy += strideY, dy++) - { - float * dst0 = dst + (dy&dstM)*dstW*F + (dc / F)*dstS; - size_t sx = 0; - const float * s = src; - const float * w = weight + (noseH - sy) * kernelX * DF * srcC; - for (; sx < noseW; sx += strideX, dst0 += F) - InputConvolution_1x1(s, p, kY + sy, kX + sx, w + (noseW - sx)*srcC*DF, _bias, _params, dst0); - for (; sx < bodyW6; sx += 6 * strideX, dst0 += 6 * F) - InputConvolution_1x6(s + (sx - noseW) * srcC, p, kY + sy, kernelX, w, _bias, _params, dst0); - for (; sx < bodyW; sx += strideX, dst0 += F) - InputConvolution_1x1(s + (sx - noseW) * srcC, p, kY + sy, kernelX, w, _bias, _params, dst0); - for (; sx < tailW; sx += strideX, dst0 += F) - InputConvolution_1x1(s + (sx - noseW) * srcC, p, kY + sy, kW - sx, w, _bias, _params, dst0); - } - for (; sy < bodyH && dy < yEnd; sy += strideY, dy++) - { - float * dst0 = dst + (dy&dstM)*dstW*F + (dc / F)*dstS; - size_t sx = 0; - const float * s = src + (sy - noseH)*srcW*srcC; - const float * w = weight; - for (; sx < noseW; sx += strideX, dst0 += F) - InputConvolution_1x1(s, p, kernelY, kX + sx, w + (noseW - sx)*srcC*DF, _bias, _params, dst0); - for (; sx < bodyW6; sx += 6 * strideX, dst0 += 6 * F) - InputConvolution_1x6(s + (sx - noseW) * srcC, p, kernelY, kernelX, w, _bias, _params, dst0); - for (; sx < bodyW; sx += strideX, dst0 += F) - InputConvolution_1x1(s + (sx - noseW) * srcC, p, kernelY, kernelX, w, _bias, _params, dst0); - for (; sx < tailW; sx += strideX, dst0 += F) - InputConvolution_1x1(s + (sx - noseW) * srcC, p, kernelY, kW - sx, w, _bias, _params, dst0); - } - for (; sy < tailH && dy < yEnd; sy += strideY, dy++) - { - float * dst0 = dst + (dy&dstM)*dstW*F + (dc / F)*dstS; - size_t sx = 0; - const float * s = src + (sy - noseH)*srcW*srcC; - const float * w = weight; - for (; sx < noseW; sx += strideX, dst0 += F) - InputConvolution_1x1(s, p, kH - sy, kX + sx, w + (noseW - sx)*srcC*DF, _bias, _params, dst0); - for (; sx < bodyW6; sx += 6 * strideX, dst0 += 6 * F) - InputConvolution_1x6(s + (sx - noseW) * srcC, p, kH - sy, kernelX, w, _bias, _params, dst0); - for (; sx < bodyW; sx += strideX, dst0 += F) - InputConvolution_1x1(s + (sx - noseW) * srcC, p, kH - sy, kernelX, w, _bias, _params, dst0); - for (; sx < tailW; sx += strideX, dst0 += F) - InputConvolution_1x1(s + (sx - noseW) * srcC, p, kH - sy, kW - sx, w, _bias, _params, dst0); - } - } - } - - template SIMD_NOINLINE void DepthwiseConvolution(const float* src, const SimdConvolutionParameters& p, - size_t srcC, size_t yBeg, size_t yEnd, const size_t bufH[2], const float* weight, const float* bias, const float* params, float* dst) - { - size_t strideY = p.strideY, strideX = p.strideX, padY = p.padY, padX = p.padX, padH = p.padH, padW = p.padW; - size_t srcW = p.srcW * F, dstW = p.dstW * F, weightS = p.kernelY * p.kernelX * F, strideXF = strideX * F; - size_t srcM = (bufH[0] - 1), dstM = (bufH[1] - 1), srcS = bufH[0] * srcW, dstS = bufH[1] * dstW; - size_t noseY = (p.padY + p.strideY - 1) / p.strideY; - size_t bodyY = (p.srcH + p.padY + p.strideY - p.kernelY) / p.strideY; - size_t noseX = (p.padX + p.strideX - 1) / p.strideX; - size_t bodyX = (p.srcW + p.padX + p.strideX - p.kernelX) / p.strideX; - size_t bodyX2 = AlignLo(bodyX - noseX, 2) + noseX; - size_t bodyX4 = AlignLo(bodyX - noseX, 4) + noseX; - size_t bodyX8 = AlignLo(bodyX - noseX, 8) + 
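The generic DepthwiseConvolution that begins above convolves each channel with its own filter, vectorizing across channels (8 per YMM register) and unrolling 8/4/2/1 output pixels in the body region. For reference, a scalar restatement of one output element (NHWC layout with weights indexed as (ky*kernelX + kx)*C + c for simplicity; the library additionally groups channels into blocks of F = 8, and bias may be null). Note the same unsigned-wraparound trick the vector code uses, where a negative coordinate wraps huge and fails the `< srcW` test:

    #include <cstddef>

    // Scalar reference for depthwise convolution: channel c is convolved with
    // its own kernelY x kernelX filter, with zero padding at the borders.
    static void DepthwiseRef(const float* src, size_t srcH, size_t srcW, size_t C,
                             size_t kY, size_t kX, size_t strideY, size_t strideX,
                             size_t padY, size_t padX, const float* weight,
                             const float* bias, float* dst, size_t dstH, size_t dstW)
    {
        for (size_t dy = 0; dy < dstH; ++dy)
            for (size_t dx = 0; dx < dstW; ++dx)
                for (size_t c = 0; c < C; ++c)
                {
                    float sum = bias ? bias[c] : 0.0f;
                    for (size_t ky = 0; ky < kY; ++ky)
                        for (size_t kx = 0; kx < kX; ++kx)
                        {
                            size_t sy = dy * strideY + ky - padY; // wraps if negative...
                            size_t sx = dx * strideX + kx - padX; // ...so the < test rejects it
                            if (sy < srcH && sx < srcW)
                                sum += src[(sy * srcW + sx) * C + c] * weight[(ky * kX + kx) * C + c];
                        }
                    dst[(dy * dstW + dx) * C + c] = sum;
                }
    }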
noseX; - - __m256 _params[2]; - _params[0] = _mm256_set1_ps(params[0]); - if (type == ::SimdConvolutionActivationRestrictRange || type == ::SimdConvolutionActivationHswish) - _params[1] = _mm256_set1_ps(params[1]); - for (size_t c = 0; c < srcC; c += F) - { - __m256 _bias = bias ? _mm256_loadu_ps(bias + c) : _mm256_setzero_ps(); - if (type == ::SimdConvolutionActivationPrelu) - _params[0] = _mm256_loadu_ps(params + c); - - for (size_t dy = yBeg; dy < yEnd; ++dy) - { - float* pd = dst + (dy & dstM) * dstW; - if (dy >= noseY && dy < bodyY) - { - size_t dx = 0; - for (; dx < noseX; ++dx, pd += F) - { - __m256 sum = _bias; - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t sy = dy * p.strideY + ky - padY; - for (size_t kx = 0; kx < p.kernelX; ++kx) - { - size_t sx = dx * p.strideX + kx - padX; - if (sx < p.srcW) - { - const float* pw = weight + (ky * p.kernelX + kx) * F; - const float* ps = src + ((sy & srcM) * p.srcW + sx) * F; - sum = _mm256_fmadd_ps(_mm256_loadu_ps(ps), _mm256_loadu_ps(pw), sum); - } - } - } - _mm256_storeu_ps(pd, Activate(sum, _params, 0)); - } - for (; dx < bodyX8; dx += 8, pd += 8 * F) - { - __m256 sum0 = _bias; - __m256 sum1 = _bias; - __m256 sum2 = _bias; - __m256 sum3 = _bias; - __m256 sum4 = _bias; - __m256 sum5 = _bias; - __m256 sum6 = _bias; - __m256 sum7 = _bias; - const float* pw = weight; - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t sy = dy * strideY + ky - padY; - const float* ps = src + ((sy & srcM) * p.srcW + dx * strideX - padX) * F; - for (size_t kx = 0; kx < p.kernelX; ++kx, ps += F, pw += F) - { - __m256 w0 = _mm256_loadu_ps(pw); - sum0 = _mm256_fmadd_ps(_mm256_loadu_ps(ps + 0 * strideXF), w0, sum0); - sum1 = _mm256_fmadd_ps(_mm256_loadu_ps(ps + 1 * strideXF), w0, sum1); - sum2 = _mm256_fmadd_ps(_mm256_loadu_ps(ps + 2 * strideXF), w0, sum2); - sum3 = _mm256_fmadd_ps(_mm256_loadu_ps(ps + 3 * strideXF), w0, sum3); - sum4 = _mm256_fmadd_ps(_mm256_loadu_ps(ps + 4 * strideXF), w0, sum4); - sum5 = _mm256_fmadd_ps(_mm256_loadu_ps(ps + 5 * strideXF), w0, sum5); - sum6 = _mm256_fmadd_ps(_mm256_loadu_ps(ps + 6 * strideXF), w0, sum6); - sum7 = _mm256_fmadd_ps(_mm256_loadu_ps(ps + 7 * strideXF), w0, sum7); - } - } - _mm256_storeu_ps(pd + 0 * F, Activate(sum0, _params, 0)); - _mm256_storeu_ps(pd + 1 * F, Activate(sum1, _params, 0)); - _mm256_storeu_ps(pd + 2 * F, Activate(sum2, _params, 0)); - _mm256_storeu_ps(pd + 3 * F, Activate(sum3, _params, 0)); - _mm256_storeu_ps(pd + 4 * F, Activate(sum4, _params, 0)); - _mm256_storeu_ps(pd + 5 * F, Activate(sum5, _params, 0)); - _mm256_storeu_ps(pd + 6 * F, Activate(sum6, _params, 0)); - _mm256_storeu_ps(pd + 7 * F, Activate(sum7, _params, 0)); - } - for (; dx < bodyX4; dx += 4, pd += 4 * F) - { - __m256 sum0 = _bias; - __m256 sum1 = _bias; - __m256 sum2 = _bias; - __m256 sum3 = _bias; - const float* pw = weight; - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t sy = dy * strideY + ky - padY; - const float* ps = src + ((sy & srcM) * p.srcW + dx * strideX - padX) * F; - for (size_t kx = 0; kx < p.kernelX; ++kx, ps += F, pw += F) - { - __m256 w0 = _mm256_loadu_ps(pw); - sum0 = _mm256_fmadd_ps(_mm256_loadu_ps(ps + 0 * strideXF), w0, sum0); - sum1 = _mm256_fmadd_ps(_mm256_loadu_ps(ps + 1 * strideXF), w0, sum1); - sum2 = _mm256_fmadd_ps(_mm256_loadu_ps(ps + 2 * strideXF), w0, sum2); - sum3 = _mm256_fmadd_ps(_mm256_loadu_ps(ps + 3 * strideXF), w0, sum3); - } - } - _mm256_storeu_ps(pd + 0 * F, Activate(sum0, _params, 0)); - _mm256_storeu_ps(pd + 1 * F, Activate(sum1, _params, 0)); - _mm256_storeu_ps(pd + 2 * 
F, Activate(sum2, _params, 0)); - _mm256_storeu_ps(pd + 3 * F, Activate(sum3, _params, 0)); - } - for (; dx < bodyX2; dx += 2, pd += 2 * F) - { - __m256 sum0 = _bias; - __m256 sum1 = _bias; - const float* pw = weight; - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t sy = dy * strideY + ky - padY; - const float* ps = src + ((sy & srcM) * p.srcW + dx * strideX - padX) * F; - for (size_t kx = 0; kx < p.kernelX; ++kx, ps += F, pw += F) - { - __m256 w0 = _mm256_loadu_ps(pw); - sum0 = _mm256_fmadd_ps(_mm256_loadu_ps(ps + 0 * strideXF), w0, sum0); - sum1 = _mm256_fmadd_ps(_mm256_loadu_ps(ps + 1 * strideXF), w0, sum1); - } - } - _mm256_storeu_ps(pd + 0 * F, Activate(sum0, _params, 0)); - _mm256_storeu_ps(pd + 1 * F, Activate(sum1, _params, 0)); - } - for (; dx < bodyX; ++dx, pd += F) - { - __m256 sum = _bias; - const float* pw = weight; - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t sy = dy * strideY + ky - padY; - const float* ps = src + ((sy & srcM) * p.srcW + dx * strideX - padX) * F; - for (size_t kx = 0; kx < p.kernelX; ++kx, ps += F, pw += F) - { - __m256 w0 = _mm256_loadu_ps(pw); - sum = _mm256_fmadd_ps(_mm256_loadu_ps(ps), w0, sum); - } - } - _mm256_storeu_ps(pd, Activate(sum, _params, 0)); - } - for (; dx < p.dstW; ++dx, pd += F) - { - __m256 sum = _bias; - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t sy = dy * strideY + ky - padY; - for (size_t kx = 0; kx < p.kernelX; ++kx) - { - size_t sx = dx * strideX + kx - padX; - if (sx < p.srcW) - { - const float* pw = weight + (ky * p.kernelX + kx) * F; - const float* ps = src + ((sy & srcM) * p.srcW + sx) * F; - sum = _mm256_fmadd_ps(_mm256_loadu_ps(ps), _mm256_loadu_ps(pw), sum); - } - } - } - _mm256_storeu_ps(pd, Activate(sum, _params, 0)); - } - } - else - { - for (size_t dx = 0; dx < p.dstW; ++dx, pd += F) - { - __m256 sum = _bias; - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t sy = dy * strideY + ky - padY; - if (sy < p.srcH) - { - for (size_t kx = 0; kx < p.kernelX; ++kx) - { - size_t sx = dx * strideX + kx - padX; - if (sx < p.srcW) - { - const float* pw = weight + (ky * p.kernelX + kx) * F; - const float* ps = src + ((sy & srcM) * p.srcW + sx) * F; - sum = _mm256_fmadd_ps(_mm256_loadu_ps(ps), _mm256_loadu_ps(pw), sum); - } - } - } - } - _mm256_storeu_ps(pd, Activate(sum, _params, 0)); - } - } - } - src += srcS; - dst += dstS; - weight += weightS; - } - } - - template SIMD_INLINE void ConvolutionDepthwise3x3Edge2x2( - const float * src0, const float * src1, const __m256 * weight, const __m256 & bias, const __m256 * params, float * dst) - { - __m256 sum0 = bias, sum1 = _mm256_setzero_ps(); - sum0 = _mm256_fmadd_ps(_mm256_loadu_ps(src0 + 0 * F), weight[0], sum0); - sum1 = _mm256_fmadd_ps(_mm256_loadu_ps(src0 + 1 * F), weight[1], sum1); - sum0 = _mm256_fmadd_ps(_mm256_loadu_ps(src1 + 0 * F), weight[3], sum0); - sum1 = _mm256_fmadd_ps(_mm256_loadu_ps(src1 + 1 * F), weight[4], sum1); - _mm256_storeu_ps(dst, Activate(_mm256_add_ps(sum0, sum1), params, 0)); - } - - template SIMD_INLINE void ConvolutionDepthwise3x3Edge2x3( - const float * src0, const float * src1, const __m256 * weight, const __m256 & bias, const __m256 * params, float * dst) - { - __m256 sum0 = bias, sum1 = _mm256_setzero_ps(), sum2 = _mm256_setzero_ps(); - sum0 = _mm256_fmadd_ps(_mm256_loadu_ps(src0 + 0 * F), weight[0], sum0); - sum1 = _mm256_fmadd_ps(_mm256_loadu_ps(src0 + 1 * F), weight[1], sum1); - sum2 = _mm256_fmadd_ps(_mm256_loadu_ps(src0 + 2 * F), weight[2], sum2); - sum0 = _mm256_fmadd_ps(_mm256_loadu_ps(src1 + 0 * F), weight[3], 
sum0); - sum1 = _mm256_fmadd_ps(_mm256_loadu_ps(src1 + 1 * F), weight[4], sum1); - sum2 = _mm256_fmadd_ps(_mm256_loadu_ps(src1 + 2 * F), weight[5], sum2); - _mm256_storeu_ps(dst, Activate(_mm256_add_ps(_mm256_add_ps(sum0, sum1), sum2), params, 0)); - } - - template SIMD_INLINE void ConvolutionDepthwise3x3Edge3x2( - const float * src0, const float * src1, const float * src2, const __m256 * weight, const __m256 & bias, const __m256 * params, float * dst) - { - __m256 sum0 = bias, sum1 = _mm256_setzero_ps(); - sum0 = _mm256_fmadd_ps(_mm256_loadu_ps(src0 + 0 * F), weight[0], sum0); - sum1 = _mm256_fmadd_ps(_mm256_loadu_ps(src0 + 1 * F), weight[1], sum1); - sum0 = _mm256_fmadd_ps(_mm256_loadu_ps(src1 + 0 * F), weight[3], sum0); - sum1 = _mm256_fmadd_ps(_mm256_loadu_ps(src1 + 1 * F), weight[4], sum1); - sum0 = _mm256_fmadd_ps(_mm256_loadu_ps(src2 + 0 * F), weight[6], sum0); - sum1 = _mm256_fmadd_ps(_mm256_loadu_ps(src2 + 1 * F), weight[7], sum1); - _mm256_storeu_ps(dst, Activate(_mm256_add_ps(sum0, sum1), params, 0)); - } - - template SIMD_INLINE void ConvolutionDepthwise3x3Main1x1( - const float * src0, const float * src1, const float * src2, const __m256 * weight, const __m256 & bias, const __m256 * params, float * dst) - { - __m256 sum0 = bias, sum1 = _mm256_setzero_ps(), sum2 = _mm256_setzero_ps(); - sum0 = _mm256_fmadd_ps(_mm256_loadu_ps(src0 + 0 * F), weight[0], sum0); - sum1 = _mm256_fmadd_ps(_mm256_loadu_ps(src0 + 1 * F), weight[1], sum1); - sum2 = _mm256_fmadd_ps(_mm256_loadu_ps(src0 + 2 * F), weight[2], sum2); - sum0 = _mm256_fmadd_ps(_mm256_loadu_ps(src1 + 0 * F), weight[3], sum0); - sum1 = _mm256_fmadd_ps(_mm256_loadu_ps(src1 + 1 * F), weight[4], sum1); - sum2 = _mm256_fmadd_ps(_mm256_loadu_ps(src1 + 2 * F), weight[5], sum2); - sum0 = _mm256_fmadd_ps(_mm256_loadu_ps(src2 + 0 * F), weight[6], sum0); - sum1 = _mm256_fmadd_ps(_mm256_loadu_ps(src2 + 1 * F), weight[7], sum1); - sum2 = _mm256_fmadd_ps(_mm256_loadu_ps(src2 + 2 * F), weight[8], sum2); - _mm256_storeu_ps(dst, Activate(_mm256_add_ps(_mm256_add_ps(sum0, sum1), sum2), params, 0)); - } - - template SIMD_NOINLINE void DepthwiseConvolution3x3(const float * src, const SimdConvolutionParameters & p, - size_t srcC, size_t yBeg, size_t yEnd, const size_t bufH[2], const float * weight, const float * bias, const float * params, float * dst) - { - size_t strideY = p.strideY, padY = p.padY, padX = p.padX, padH = p.padH, padW = p.padW; - size_t srcW = p.srcW * F, dstW = p.dstW * F, weightS = p.kernelY * p.kernelX * F; - size_t srcM = (bufH[0] - 1), dstM = (bufH[1] - 1), srcS = bufH[0] * srcW, dstS = bufH[1] * dstW; - size_t xStep = F * p.strideX, xStep0 = (p.strideX - p.padX)*F; - size_t xMainEnd = p.dstW - p.padW, yMainEnd = yEnd == p.dstH && p.padH ? yEnd - 1 : yEnd; - - __m256 _params[2]; - _params[0] = _mm256_set1_ps(params[0]); - if (type == ::SimdConvolutionActivationRestrictRange || type == ::SimdConvolutionActivationHswish) - _params[1] = _mm256_set1_ps(params[1]); - for (size_t c = 0; c < srcC; c += F) - { - __m256 _weight[9]; - for (size_t i = 0; i < 9; ++i) - _weight[i] = _mm256_loadu_ps(weight + i * F); - __m256 _bias = bias ? 
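ConvolutionDepthwise3x3Main1x1 above shows why the 3x3 case gets dedicated kernels: the nine taps are spread over three independent accumulators (sum0..sum2) that are only combined at the end. A single accumulator would chain all nine FMAs through the FMA latency (4-5 cycles on the AVX2 targets this code serves), leaving the two FMA ports mostly idle; three chains roughly triple the sustained rate. A minimal illustration of the two shapes (hypothetical helpers):

    #include <immintrin.h>

    // Latency-bound: nine FMAs in one dependency chain (~9 x FMA latency).
    static __m256 Dot9_OneChain(const __m256 s[9], const __m256 w[9])
    {
        __m256 sum = _mm256_setzero_ps();
        for (int i = 0; i < 9; ++i)
            sum = _mm256_fmadd_ps(s[i], w[i], sum);
        return sum;
    }

    // Throughput-bound: three independent chains of three FMAs, merged once.
    static __m256 Dot9_ThreeChains(const __m256 s[9], const __m256 w[9])
    {
        __m256 a = _mm256_setzero_ps(), b = a, c = a;
        for (int i = 0; i < 9; i += 3)
        {
            a = _mm256_fmadd_ps(s[i + 0], w[i + 0], a);
            b = _mm256_fmadd_ps(s[i + 1], w[i + 1], b);
            c = _mm256_fmadd_ps(s[i + 2], w[i + 2], c);
        }
        return _mm256_add_ps(_mm256_add_ps(a, b), c);
    }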
_mm256_loadu_ps(bias + c) : _mm256_setzero_ps(); - if (type == ::SimdConvolutionActivationPrelu) - _params[0] = _mm256_loadu_ps(params + c); - - size_t dy = yBeg; - if (yBeg == 0 && padY) - { - size_t sy = 0, dx = 0; - const float * src0 = src + ((sy + 0)&srcM)*srcW; - const float * src1 = src + ((sy + 1)&srcM)*srcW; - float * pDst = dst + (dy&dstM)*dstW; - if (padX) - ConvolutionDepthwise3x3Edge2x2(src0, src1, _weight + 4, _bias, _params, pDst), pDst += F, dx++, src0 += xStep0, src1 += xStep0; - for (; dx < xMainEnd; dx++, pDst += F, src0 += xStep, src1 += xStep) - ConvolutionDepthwise3x3Edge2x3(src0, src1, _weight + 3, _bias, _params, pDst); - if (padW) - ConvolutionDepthwise3x3Edge2x2(src0, src1, _weight + 3, _bias, _params, pDst); - dy++; - } - for (; dy < yMainEnd; ++dy) - { - size_t sy = dy * strideY - padY, dx = 0; - const float * src0 = src + ((sy + 0)&srcM)*srcW; - const float * src1 = src + ((sy + 1)&srcM)*srcW; - const float * src2 = src + ((sy + 2)&srcM)*srcW; - float * pDst = dst + (dy&dstM)*dstW; - if (padX) - ConvolutionDepthwise3x3Edge3x2(src0, src1, src2, _weight + 1, _bias, _params, pDst), pDst += F, dx++, src0 += xStep0, src1 += xStep0, src2 += xStep0; - for (; dx < xMainEnd; dx++, pDst += F, src0 += xStep, src1 += xStep, src2 += xStep) - ConvolutionDepthwise3x3Main1x1(src0, src1, src2, _weight + 0, _bias, _params, pDst); - if (padW) - ConvolutionDepthwise3x3Edge3x2(src0, src1, src2, _weight + 0, _bias, _params, pDst); - } - if (dy < yEnd) - { - size_t sy = dy * strideY - padY, dx = 0; - const float * src0 = src + ((sy + 0)&srcM)*srcW; - const float * src1 = src + ((sy + 1)&srcM)*srcW; - float * pDst = dst + (dy&dstM)*dstW; - if (padX) - ConvolutionDepthwise3x3Edge2x2(src0, src1, _weight + 1, _bias, _params, pDst), pDst += F, dx++, src0 += xStep0, src1 += xStep0; - for (; dx < xMainEnd; dx++, pDst += F, src0 += xStep, src1 += xStep) - ConvolutionDepthwise3x3Edge2x3(src0, src1, _weight + 0, _bias, _params, pDst); - if (padW) - ConvolutionDepthwise3x3Edge2x2(src0, src1, _weight + 0, _bias, _params, pDst); - } - src += srcS; - dst += dstS; - weight += weightS; - } - } - - template SIMD_NOINLINE void OutputConvolution_2x6(const float * src, size_t srcC, size_t srcS, - const float * weight, const __m256 * bias, const __m256 * params, float * dst, size_t dstC, size_t tail) - { - __m256 d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, s0, w0, w1; - if (tail > F) - { - d00 = _mm256_setzero_ps(), d01 = _mm256_setzero_ps(); - d10 = _mm256_setzero_ps(), d11 = _mm256_setzero_ps(); - d20 = _mm256_setzero_ps(), d21 = _mm256_setzero_ps(); - d30 = _mm256_setzero_ps(), d31 = _mm256_setzero_ps(); - d40 = _mm256_setzero_ps(), d41 = _mm256_setzero_ps(); - d50 = _mm256_setzero_ps(), d51 = _mm256_setzero_ps(); - for (size_t c = 0; c < srcC; c += F) - { - size_t n = Simd::Min(F, srcC - c); - for (size_t i = 0; i < n; ++i, weight += DF) - { - w0 = _mm256_loadu_ps(weight + 0); - w1 = _mm256_loadu_ps(weight + F); - s0 = _mm256_set1_ps(src[i + 0 * F]); - d00 = _mm256_fmadd_ps(s0, w0, d00); - d01 = _mm256_fmadd_ps(s0, w1, d01); - s0 = _mm256_set1_ps(src[i + 1 * F]); - d10 = _mm256_fmadd_ps(s0, w0, d10); - d11 = _mm256_fmadd_ps(s0, w1, d11); - s0 = _mm256_set1_ps(src[i + 2 * F]); - d20 = _mm256_fmadd_ps(s0, w0, d20); - d21 = _mm256_fmadd_ps(s0, w1, d21); - s0 = _mm256_set1_ps(src[i + 3 * F]); - d30 = _mm256_fmadd_ps(s0, w0, d30); - d31 = _mm256_fmadd_ps(s0, w1, d31); - s0 = _mm256_set1_ps(src[i + 4 * F]); - d40 = _mm256_fmadd_ps(s0, w0, d40); - d41 = _mm256_fmadd_ps(s0, w1, d41); - s0 = 
_mm256_set1_ps(src[i + 5 * F]); - d50 = _mm256_fmadd_ps(s0, w0, d50); - d51 = _mm256_fmadd_ps(s0, w1, d51); - } - src += srcS; - } - if (tail == DF) - { - Term::template Save(dst + 0, d00, bias, params); - Term::template Save(dst + F, d01, bias, params); - dst += dstC; - Term::template Save(dst + 0, d10, bias, params); - Term::template Save(dst + F, d11, bias, params); - dst += dstC; - Term::template Save(dst + 0, d20, bias, params); - Term::template Save(dst + F, d21, bias, params); - dst += dstC; - Term::template Save(dst + 0, d30, bias, params); - Term::template Save(dst + F, d31, bias, params); - dst += dstC; - Term::template Save(dst + 0, d40, bias, params); - Term::template Save(dst + F, d41, bias, params); - dst += dstC; - Term::template Save(dst + 0, d50, bias, params); - Term::template Save(dst + F, d51, bias, params); - } - else - { - tail -= F; - Term::template Save(dst + 0, d00, bias, params); - Term::template Save(dst + F, d01, bias, params, tail); - dst += dstC; - Term::template Save(dst + 0, d10, bias, params); - Term::template Save(dst + F, d11, bias, params, tail); - dst += dstC; - Term::template Save(dst + 0, d20, bias, params); - Term::template Save(dst + F, d21, bias, params, tail); - dst += dstC; - Term::template Save(dst + 0, d30, bias, params); - Term::template Save(dst + F, d31, bias, params, tail); - dst += dstC; - Term::template Save(dst + 0, d40, bias, params); - Term::template Save(dst + F, d41, bias, params, tail); - dst += dstC; - Term::template Save(dst + 0, d50, bias, params); - Term::template Save(dst + F, d51, bias, params, tail); - } - } - else - { - d00 = _mm256_setzero_ps(); - d10 = _mm256_setzero_ps(); - d20 = _mm256_setzero_ps(); - d30 = _mm256_setzero_ps(); - d40 = _mm256_setzero_ps(); - d50 = _mm256_setzero_ps(); - for (size_t c = 0; c < srcC; c += F) - { - size_t n = Simd::Min(F, srcC - c); - for (size_t i = 0; i < n; ++i, weight += DF) - { - w0 = _mm256_loadu_ps(weight + 0); - s0 = _mm256_set1_ps(src[i + 0 * F]); - d00 = _mm256_fmadd_ps(s0, w0, d00); - s0 = _mm256_set1_ps(src[i + 1 * F]); - d10 = _mm256_fmadd_ps(s0, w0, d10); - s0 = _mm256_set1_ps(src[i + 2 * F]); - d20 = _mm256_fmadd_ps(s0, w0, d20); - s0 = _mm256_set1_ps(src[i + 3 * F]); - d30 = _mm256_fmadd_ps(s0, w0, d30); - s0 = _mm256_set1_ps(src[i + 4 * F]); - d40 = _mm256_fmadd_ps(s0, w0, d40); - s0 = _mm256_set1_ps(src[i + 5 * F]); - d50 = _mm256_fmadd_ps(s0, w0, d50); - } - src += srcS; - } - if (tail == F) - { - Term::template Save(dst + 0, d00, bias, params); - dst += dstC; - Term::template Save(dst + 0, d10, bias, params); - dst += dstC; - Term::template Save(dst + 0, d20, bias, params); - dst += dstC; - Term::template Save(dst + 0, d30, bias, params); - dst += dstC; - Term::template Save(dst + 0, d40, bias, params); - dst += dstC; - Term::template Save(dst + 0, d50, bias, params); - } - else - { - Term::template Save(dst + 0, d00, bias, params, tail); - dst += dstC; - Term::template Save(dst + 0, d10, bias, params, tail); - dst += dstC; - Term::template Save(dst + 0, d20, bias, params, tail); - dst += dstC; - Term::template Save(dst + 0, d30, bias, params, tail); - dst += dstC; - Term::template Save(dst + 0, d40, bias, params, tail); - dst += dstC; - Term::template Save(dst + 0, d50, bias, params, tail); - } - } - } - - template SIMD_NOINLINE void OutputConvolution_2xM(const float* src, size_t srcC, size_t srcS, - const float* weight, const __m256* bias, const __m256* params, float* dst, size_t dstC, size_t tail) - { - __m256 d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, 
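In OutputConvolution_2x6 above, `tail` carries how many of the final DF = 16 output channels actually exist, and the `Save` calls (whose `<term, type>`-style template arguments were lost in this diff's rendering) store either a full 8-float vector or a partial one. A common way to express such a partial store, given here as an assumption about the mechanism rather than the library's exact helper:

    #include <immintrin.h>
    #include <cstddef>
    #include <cstdint>

    // Sketch: store the low `tail` floats (0 < tail <= 8) of a vector using an
    // AVX masked store; lanes whose mask sign bit is clear are left untouched.
    static void StoreTail(float* dst, __m256 value, size_t tail)
    {
        alignas(32) int32_t m[8];
        for (size_t i = 0; i < 8; ++i)
            m[i] = i < tail ? -1 : 0; // -1 sets the sign bit that maskstore tests
        _mm256_maskstore_ps(dst, _mm256_load_si256((const __m256i*)m), value);
    }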
d51, s0, w0, w1; - if (tail > F) - { - if (M > 0) d00 = _mm256_setzero_ps(), d01 = _mm256_setzero_ps(); - if (M > 1) d10 = _mm256_setzero_ps(), d11 = _mm256_setzero_ps(); - if (M > 2) d20 = _mm256_setzero_ps(), d21 = _mm256_setzero_ps(); - if (M > 3) d30 = _mm256_setzero_ps(), d31 = _mm256_setzero_ps(); - if (M > 4) d40 = _mm256_setzero_ps(), d41 = _mm256_setzero_ps(); - if (M > 5) d50 = _mm256_setzero_ps(), d51 = _mm256_setzero_ps(); - for (size_t c = 0; c < srcC; c += F) - { - size_t n = Simd::Min(F, srcC - c); - for (size_t i = 0; i < n; ++i, weight += DF) - { - w0 = _mm256_loadu_ps(weight + 0); - w1 = _mm256_loadu_ps(weight + F); - if (M > 0) s0 = _mm256_set1_ps(src[i + 0 * F]), d00 = _mm256_fmadd_ps(s0, w0, d00), d01 = _mm256_fmadd_ps(s0, w1, d01); - if (M > 1) s0 = _mm256_set1_ps(src[i + 1 * F]), d10 = _mm256_fmadd_ps(s0, w0, d10), d11 = _mm256_fmadd_ps(s0, w1, d11); - if (M > 2) s0 = _mm256_set1_ps(src[i + 2 * F]), d20 = _mm256_fmadd_ps(s0, w0, d20), d21 = _mm256_fmadd_ps(s0, w1, d21); - if (M > 3) s0 = _mm256_set1_ps(src[i + 3 * F]), d30 = _mm256_fmadd_ps(s0, w0, d30), d31 = _mm256_fmadd_ps(s0, w1, d31); - if (M > 4) s0 = _mm256_set1_ps(src[i + 4 * F]), d40 = _mm256_fmadd_ps(s0, w0, d40), d41 = _mm256_fmadd_ps(s0, w1, d41); - if (M > 5) s0 = _mm256_set1_ps(src[i + 5 * F]), d50 = _mm256_fmadd_ps(s0, w0, d50), d51 = _mm256_fmadd_ps(s0, w1, d51); - } - src += srcS; - } - if (tail == DF) - { - if (M > 0) Term::template Save(dst + 0, d00, bias, params), Term::template Save(dst + F, d01, bias, params), dst += dstC; - if (M > 1) Term::template Save(dst + 0, d10, bias, params), Term::template Save(dst + F, d11, bias, params), dst += dstC; - if (M > 2) Term::template Save(dst + 0, d20, bias, params), Term::template Save(dst + F, d21, bias, params), dst += dstC; - if (M > 3) Term::template Save(dst + 0, d30, bias, params), Term::template Save(dst + F, d31, bias, params), dst += dstC; - if (M > 4) Term::template Save(dst + 0, d40, bias, params), Term::template Save(dst + F, d41, bias, params), dst += dstC; - if (M > 5) Term::template Save(dst + 0, d50, bias, params), Term::template Save(dst + F, d51, bias, params), dst += dstC; - } - else - { - tail -= F; - if (M > 0) Term::template Save(dst + 0, d00, bias, params), Term::template Save(dst + F, d01, bias, params, tail), dst += dstC; - if (M > 1) Term::template Save(dst + 0, d10, bias, params), Term::template Save(dst + F, d11, bias, params, tail), dst += dstC; - if (M > 2) Term::template Save(dst + 0, d20, bias, params), Term::template Save(dst + F, d21, bias, params, tail), dst += dstC; - if (M > 3) Term::template Save(dst + 0, d30, bias, params), Term::template Save(dst + F, d31, bias, params, tail), dst += dstC; - if (M > 4) Term::template Save(dst + 0, d40, bias, params), Term::template Save(dst + F, d41, bias, params, tail), dst += dstC; - if (M > 5) Term::template Save(dst + 0, d50, bias, params), Term::template Save(dst + F, d51, bias, params, tail), dst += dstC; - } - } - else - { - if (M > 0) d00 = _mm256_setzero_ps(); - if (M > 1) d10 = _mm256_setzero_ps(); - if (M > 2) d20 = _mm256_setzero_ps(); - if (M > 3) d30 = _mm256_setzero_ps(); - if (M > 4) d40 = _mm256_setzero_ps(); - if (M > 5) d50 = _mm256_setzero_ps(); - for (size_t c = 0; c < srcC; c += F) - { - size_t n = Simd::Min(F, srcC - c); - for (size_t i = 0; i < n; ++i, weight += DF) - { - w0 = _mm256_loadu_ps(weight + 0); - if (M > 0) s0 = _mm256_set1_ps(src[i + 0 * F]), d00 = _mm256_fmadd_ps(s0, w0, d00); - if (M > 1) s0 = _mm256_set1_ps(src[i + 1 * F]), d10 = 
_mm256_fmadd_ps(s0, w0, d10); - if (M > 2) s0 = _mm256_set1_ps(src[i + 2 * F]), d20 = _mm256_fmadd_ps(s0, w0, d20); - if (M > 3) s0 = _mm256_set1_ps(src[i + 3 * F]), d30 = _mm256_fmadd_ps(s0, w0, d30); - if (M > 4) s0 = _mm256_set1_ps(src[i + 4 * F]), d40 = _mm256_fmadd_ps(s0, w0, d40); - if (M > 5) s0 = _mm256_set1_ps(src[i + 5 * F]), d50 = _mm256_fmadd_ps(s0, w0, d50); - } - src += srcS; - } - if (tail == F) - { - if (M > 0) Term::template Save(dst + 0, d00, bias, params), dst += dstC; - if (M > 1) Term::template Save(dst + 0, d10, bias, params), dst += dstC; - if (M > 2) Term::template Save(dst + 0, d20, bias, params), dst += dstC; - if (M > 3) Term::template Save(dst + 0, d30, bias, params), dst += dstC; - if (M > 4) Term::template Save(dst + 0, d40, bias, params), dst += dstC; - if (M > 5) Term::template Save(dst + 0, d50, bias, params), dst += dstC; - } - else - { - if (M > 0) Term::template Save(dst + 0, d00, bias, params, tail), dst += dstC; - if (M > 1) Term::template Save(dst + 0, d10, bias, params, tail), dst += dstC; - if (M > 2) Term::template Save(dst + 0, d20, bias, params, tail), dst += dstC; - if (M > 3) Term::template Save(dst + 0, d30, bias, params, tail), dst += dstC; - if (M > 4) Term::template Save(dst + 0, d40, bias, params, tail), dst += dstC; - if (M > 5) Term::template Save(dst + 0, d50, bias, params, tail), dst += dstC; - } - } - } - - typedef void(*OutputConvolution_2xM_Ptr)(const float* src, size_t srcC, size_t srcS, const float* weight, const __m256* bias, const __m256* params, float* dst, size_t dstC, size_t tail); - - template OutputConvolution_2xM_Ptr GetOutputConvolution_2xM(size_t M) - { - switch (M) - { - case 0: return OutputConvolution_2xM; - case 1: return OutputConvolution_2xM; - case 2: return OutputConvolution_2xM; - case 3: return OutputConvolution_2xM; - case 4: return OutputConvolution_2xM; - case 5: return OutputConvolution_2xM; - } - assert(0); - return NULL; - } - - template SIMD_NOINLINE void OutputConvolution(const float * src, const SimdConvolutionParameters & p, - size_t srcC, size_t yBeg, size_t yEnd, const size_t bufH[2], const float * weight, const float * bias, const float * params, float * dst) - { - assert(p.group == 1 && p.kernelY == 1 && p.strideY == 1); - size_t srcH = p.srcH, srcW = p.srcW, dstW = p.dstW, dstC = p.dstC; - size_t srcM = (bufH[1] - 1), srcS = bufH[1] * srcW*F; -#ifdef SIMD_MERGECONV_MERGE_OUTPUT_ROWS - size_t yInt = Simd::Max(yBeg, yEnd & (~srcM)), nBeg = yBeg * srcW, nInt = yInt * srcW, nEnd = yEnd * srcW; - size_t nInt6 = AlignLoAny(nInt - nBeg, 6) + nBeg, nEnd6 = AlignLoAny(nEnd - nInt, 6) + nInt, nIntTail = nInt - nInt6, nEndTail = nEnd - nEnd6; - OutputConvolution_2xM_Ptr tailInt = GetOutputConvolution_2xM(nIntTail); - OutputConvolution_2xM_Ptr tailEnd = GetOutputConvolution_2xM(nEndTail); -#else - size_t dstW6 = AlignLoAny(dstW, 6), wTail = dstW - dstW6; - OutputConvolution_2xM_Ptr tailW = GetOutputConvolution_2xM(wTail); -#endif - __m256 _params[2], _bias[2]; - _params[0] = _mm256_set1_ps(params[0]); - if (type == ::SimdConvolutionActivationRestrictRange || type == ::SimdConvolutionActivationHswish) - _params[1] = _mm256_set1_ps(params[1]); - - dst += yBeg * p.dstW * p.dstC; - size_t dc = 0; - for (; dc < dstC; dc += DF) - { - size_t tail = Simd::Min(DF, dstC - dc); - _bias[0] = _mm256_loadu_ps(bias + dc + 0); - _bias[1] = _mm256_loadu_ps(bias + dc + F); - if (type == ::SimdConvolutionActivationPrelu) - { - _params[0] = _mm256_loadu_ps(params + dc + 0); - _params[1] = _mm256_loadu_ps(params + dc + F); - } - 
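The SIMD_MERGECONV_MERGE_OUTPUT_ROWS branch above is a consequence of the ring buffers: within [yBeg, yInt) and [yInt, yEnd), where yInt = max(yBeg, yEnd & ~srcM) marks the ring's wrap point, the buffered rows are physically contiguous, so the driver flattens each range into one run of rowCount * srcW pixels and lets the 6-pixel kernel cross row boundaries. That leaves at most two sub-6 tails per stripe instead of one per row. A sketch, assuming (as the surrounding scheduler guarantees) that a stripe never exceeds the ring height:

    #include <cstddef>

    // Process rows [yBeg, yEnd) of a row ring as at most two contiguous pixel
    // runs split at the wrap row yInt; `run` sees flat pixel ranges.
    static void ForEachRun(size_t yBeg, size_t yInt, size_t yEnd, size_t srcW,
                           void (*run)(size_t pixBeg, size_t pixEnd))
    {
        if (yInt > yBeg)
            run(yBeg * srcW, yInt * srcW); // contiguous rows before the wrap
        if (yEnd > yInt)
            run(yInt * srcW, yEnd * srcW); // contiguous rows after the wrap
    }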
float * pDst = dst + dc; -#ifdef SIMD_MERGECONV_MERGE_OUTPUT_ROWS - const float* src0 = src + (yBeg & srcM) * srcW * F; - const float* src1 = src + (yInt & srcM) * srcW * F; - size_t dn = nBeg; - for (; dn < nInt6; dn += 6, pDst += 6 * dstC, src0 += 6 * F) - OutputConvolution_2x6(src0, srcC, srcS, weight, _bias, _params, pDst, dstC, tail); - if (nIntTail) - tailInt(src0, srcC, srcS, weight, _bias, _params, pDst, dstC, tail), dn += nIntTail, pDst += nIntTail * dstC, src0 += nIntTail * F; - for (; dn < nEnd6; dn += 6, pDst += 6 * dstC, src1 += 6 * F) - OutputConvolution_2x6(src1, srcC, srcS, weight, _bias, _params, pDst, dstC, tail); - if (nEndTail) - tailEnd(src1, srcC, srcS, weight, _bias, _params, pDst, dstC, tail), dn += nEndTail, pDst += nEndTail * dstC, src1 += nEndTail * F; -#else - for (size_t y = yBeg; y < yEnd; ++y) - { - const float* pSrc = src + (y & srcM) * srcW * F; - size_t x = 0; - for (; x < dstW6; x += 6, pDst += 6 * dstC, pSrc += 6 * F) - OutputConvolution_2x6(pSrc, srcC, srcS, weight, _bias, _params, pDst, dstC, tail); - if (wTail) - tailW(pSrc, srcC, srcS, weight, _bias, _params, pDst, dstC, tail), pDst += wTail * dstC, pSrc += wTail * F; - } -#endif - weight += srcC * DF; - } - } - - template void SetConvolutionPtr(const MergConvParam32f & p, size_t index, SynetMergedConvolution32f::ConvolutionPtr convolution[3]) - { - switch (index) - { - case 0: - if (p.conv[0].kernelY == 1 && p.conv[0].strideY == 1) - convolution[0] = InputConvolution1x1; - else - convolution[0] = InputConvolution; - break; - case 1: - if (p.conv[1].kernelY == 3) - convolution[1] = DepthwiseConvolution3x3; - else - convolution[1] = DepthwiseConvolution; - break; - case 2: - if (p.add) - { - convolution[2] = OutputConvolution; - convolution[3] = OutputConvolution; - convolution[4] = OutputConvolution; - convolution[5] = OutputConvolution; - } - else - { - convolution[2] = OutputConvolution; - convolution[3] = OutputConvolution; - convolution[4] = OutputConvolution; - convolution[5] = OutputConvolution; - } - break; - default: - assert(0); - } - } - - SynetMergedConvolution32f::SynetMergedConvolution32f(const MergConvParam32f & p) - : Avx::SynetMergedConvolution32f(p) - { - for (size_t i = 0; i < _param.count; ++i) - { - switch (p.conv[i].activation) - { - case SimdConvolutionActivationIdentity: SetConvolutionPtr(_param, i, _convolution); break; - case SimdConvolutionActivationRelu: SetConvolutionPtr(_param, i, _convolution); break; - case SimdConvolutionActivationLeakyRelu: SetConvolutionPtr(_param, i, _convolution); break; - case SimdConvolutionActivationRestrictRange: SetConvolutionPtr(_param, i, _convolution); break; - case SimdConvolutionActivationPrelu: SetConvolutionPtr(_param, i, _convolution); break; - case SimdConvolutionActivationElu: SetConvolutionPtr(_param, i, _convolution); break; - case SimdConvolutionActivationHswish: SetConvolutionPtr(_param, i, _convolution); break; - default: assert(0); - } - } - SetSize(Base::AlgCacheL1(), Base::AlgCacheL2()/2, Base::AlgCacheL3(), Avx::F); - } - - //--------------------------------------------------------------------- - - void * SynetMergedConvolution32fInit(size_t batch, const SimdConvolutionParameters * convs, size_t count, SimdBool add) - { - MergConvParam32f param(batch, convs, count, add); - if (!param.Valid()) - return NULL; - if (param.conv[2].dstC < F) - return new Sse2::SynetMergedConvolution32f(param); - else - return new Avx2::SynetMergedConvolution32f(param); - } - } - #endif//SIMD_AVX2_ENABLE -} diff --git 
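SetConvolutionPtr and the constructor above (like the deconvolution constructor earlier in this diff; the distinguishing `<...>` template arguments were stripped in this rendering, which is why the case labels look identical) implement compile-time activation dispatch: each case assigns a kernel instantiated for one activation type, so the hot loops apply the activation without a per-pixel branch, and SynetMergedConvolution32fInit then picks a backend by shape, falling back to Sse2 when the last stage is too narrow to fill an 8-float vector. A much-simplified sketch of the dispatch (hypothetical names):

    #include <cassert>

    enum Activation { Identity, Relu, LeakyRelu };

    // Folded away at instantiation: each Kernel<a> contains only its own branch.
    template<Activation a> static float Activate(float v, const float* params)
    {
        if (a == Relu) return v > 0.0f ? v : 0.0f;
        if (a == LeakyRelu) return v > 0.0f ? v : v * params[0];
        return v; // Identity
    }

    typedef void (*KernelPtr)(const float* src, const float* params, float* dst, int n);

    template<Activation a> static void Kernel(const float* src, const float* params, float* dst, int n)
    {
        for (int i = 0; i < n; ++i)
            dst[i] = Activate<a>(src[i], params);
    }

    static KernelPtr Select(Activation a) // one switch at setup, no branch per pixel
    {
        switch (a)
        {
        case Identity: return Kernel<Identity>;
        case Relu: return Kernel<Relu>;
        case LeakyRelu: return Kernel<LeakyRelu>;
        }
        assert(0);
        return nullptr;
    }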
a/src/3rd/Simd/Simd/SimdAvx2SynetPooling.cpp b/src/3rd/Simd/Simd/SimdAvx2SynetPooling.cpp deleted file mode 100644 index f7717ae1..00000000 --- a/src/3rd/Simd/Simd/SimdAvx2SynetPooling.cpp +++ /dev/null @@ -1,202 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdStore.h" -#include "Simd/SimdBase.h" -#include "Simd/SimdSse41.h" -#include "Simd/SimdAvx1.h" -#include "Simd/SimdAvx2.h" - -namespace Simd -{ -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - void SynetPoolingForwardMax32f(const float * src, size_t srcC, size_t srcH, size_t srcW, size_t kernelY, size_t kernelX, - size_t strideY, size_t strideX, size_t padY, size_t padX, float * dst, size_t dstH, size_t dstW, SimdTensorFormatType format) - { - if (format == SimdTensorFormatNchw) - { - if (strideY == 1 && strideX == 1 && kernelY == 3 && kernelX == 3 && srcH == dstH && srcW == dstW && dstW > F) - { - for (size_t c = 0; c < srcC; ++c, src += srcH * srcW, dst += dstH * dstW) - Avx2::NeuralPooling1x1Max3x3(src, srcW, srcW, srcH, dst, dstW); - return; - } - if (strideY == 2 && strideX == 2 && kernelY == 3 && kernelX == 3 && padY == 0 && padX == 0 && dstW > F) - { - for (size_t c = 0; c < srcC; ++c, src += srcH * srcW, dst += dstH * dstW) - Avx2::NeuralPooling2x2Max3x3(src, srcW, srcW, srcH, dst, dstW); - return; - } - } - Avx::SynetPoolingForwardMax32f(src, srcC, srcH, srcW, kernelY, kernelX, strideY, strideX, padY, padX, dst, dstH, dstW, format); - } - - //--------------------------------------------------------------------- - - SIMD_INLINE void PoolingMaxNhwc1(const uint8_t* src, size_t srcS, size_t srcC, size_t kH, size_t kW, const __m256i& min, uint8_t* dst) - { - __m256i max0 = min; - for (size_t h = 0; h < kH; ++h) - { - for (size_t w = 0; w < kW; ++w) - { - const __m256i* ps = (__m256i*)(src + w * srcC); - max0 = _mm256_max_epu8(max0, _mm256_loadu_si256(ps + 0)); - } - src += srcS; - } - _mm256_storeu_si256((__m256i*)dst + 0, max0); - } - - SIMD_INLINE void PoolingMaxNhwc2(const uint8_t* src, size_t srcS, size_t srcC, size_t kH, size_t kW, const __m256i& min, uint8_t* dst) - { - __m256i max0 = min; - __m256i max1 = min; - for (size_t h = 0; h < kH; ++h) - { - for (size_t w = 0; w < kW; ++w) - { - const __m256i* ps = (__m256i*)(src + w * srcC); - max0 = _mm256_max_epu8(max0, _mm256_loadu_si256(ps + 0)); - max1 = _mm256_max_epu8(max1, _mm256_loadu_si256(ps + 1)); - } - src += srcS; - } - 
_mm256_storeu_si256((__m256i*)dst + 0, max0); - _mm256_storeu_si256((__m256i*)dst + 1, max1); - } - - SIMD_INLINE void PoolingMaxNhwc4(const uint8_t* src, size_t srcS, size_t srcC, size_t kH, size_t kW, const __m256i& min, uint8_t* dst) - { - __m256i max0 = min; - __m256i max1 = min; - __m256i max2 = min; - __m256i max3 = min; - for (size_t h = 0; h < kH; ++h) - { - for (size_t w = 0; w < kW; ++w) - { - const __m256i* ps = (__m256i*)(src + w * srcC); - max0 = _mm256_max_epu8(max0, _mm256_loadu_si256(ps + 0)); - max1 = _mm256_max_epu8(max1, _mm256_loadu_si256(ps + 1)); - max2 = _mm256_max_epu8(max2, _mm256_loadu_si256(ps + 2)); - max3 = _mm256_max_epu8(max3, _mm256_loadu_si256(ps + 3)); - } - src += srcS; - } - _mm256_storeu_si256((__m256i*)dst + 0, max0); - _mm256_storeu_si256((__m256i*)dst + 1, max1); - _mm256_storeu_si256((__m256i*)dst + 2, max2); - _mm256_storeu_si256((__m256i*)dst + 3, max3); - } - - SIMD_INLINE void PoolingMaxNhwc8(const uint8_t* src, size_t srcS, size_t srcC, size_t kH, size_t kW, const __m256i& min, uint8_t* dst) - { - __m256i max0 = min; - __m256i max1 = min; - __m256i max2 = min; - __m256i max3 = min; - __m256i max4 = min; - __m256i max5 = min; - __m256i max6 = min; - __m256i max7 = min; - for (size_t h = 0; h < kH; ++h) - { - for (size_t w = 0; w < kW; ++w) - { - const __m256i* ps = (__m256i*)(src + w * srcC); - max0 = _mm256_max_epu8(max0, _mm256_loadu_si256(ps + 0)); - max1 = _mm256_max_epu8(max1, _mm256_loadu_si256(ps + 1)); - max2 = _mm256_max_epu8(max2, _mm256_loadu_si256(ps + 2)); - max3 = _mm256_max_epu8(max3, _mm256_loadu_si256(ps + 3)); - max4 = _mm256_max_epu8(max4, _mm256_loadu_si256(ps + 4)); - max5 = _mm256_max_epu8(max5, _mm256_loadu_si256(ps + 5)); - max6 = _mm256_max_epu8(max6, _mm256_loadu_si256(ps + 6)); - max7 = _mm256_max_epu8(max7, _mm256_loadu_si256(ps + 7)); - } - src += srcS; - } - _mm256_storeu_si256((__m256i*)dst + 0, max0); - _mm256_storeu_si256((__m256i*)dst + 1, max1); - _mm256_storeu_si256((__m256i*)dst + 2, max2); - _mm256_storeu_si256((__m256i*)dst + 3, max3); - _mm256_storeu_si256((__m256i*)dst + 4, max4); - _mm256_storeu_si256((__m256i*)dst + 5, max5); - _mm256_storeu_si256((__m256i*)dst + 6, max6); - _mm256_storeu_si256((__m256i*)dst + 7, max7); - } - - void SynetPoolingForwardMax8u(const uint8_t* src, size_t srcC, size_t srcH, size_t srcW, size_t kernelY, size_t kernelX, - size_t strideY, size_t strideX, size_t padY, size_t padX, uint8_t* dst, size_t dstH, size_t dstW, SimdTensorFormatType format) - { - if (format == SimdTensorFormatNhwc) - { - if (srcC >= A) - { - size_t srcS = srcW * srcC; - size_t srcCA1 = AlignLo(srcC, 1 * A); - size_t srcCA2 = AlignLo(srcC, 2 * A); - size_t srcCA4 = AlignLo(srcC, 4 * A); - size_t srcCA8 = AlignLo(srcC, 8 * A); - __m256i min = _mm256_set1_epi8(0); - for (size_t ph = 0; ph < dstH; ++ph) - { - size_t hStart = ph * strideY - padY; - size_t hEnd = Simd::Min(hStart + kernelY, srcH); - hStart = Simd::Max(0, hStart); - for (size_t pw = 0; pw < dstW; ++pw) - { - size_t wStart = pw * strideX - padX; - size_t wEnd = Simd::Min(wStart + kernelX, srcW); - wStart = Simd::Max(0, wStart); - const uint8_t* ps = src + hStart * srcS + wStart * srcC; - size_t c = 0; - for (; c < srcCA8; c += 8 * A) - PoolingMaxNhwc8(ps + c, srcS, srcC, hEnd - hStart, wEnd - wStart, min, dst + c); - for (; c < srcCA4; c += 4 * A) - PoolingMaxNhwc4(ps + c, srcS, srcC, hEnd - hStart, wEnd - wStart, min, dst + c); - for (; c < srcCA2; c += 2 * A) - PoolingMaxNhwc2(ps + c, srcS, srcC, hEnd - hStart, wEnd - wStart, min, dst + c); - 
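PoolingMaxNhwc1/2/4/8 above unroll NHWC uint8 max pooling by 1, 2, 4, or 8 vectors of A = 32 channels, and their accumulators start from `min = _mm256_set1_epi8(0)` because zero is the identity for unsigned max. For channel counts that are not a multiple of 32, the driver just below finishes with an overlapping call at `srcC - A` rather than a scalar tail: recomputing a maximum over already-written channels is harmless. A sketch of one 32-channel block (hypothetical standalone function):

    #include <immintrin.h>
    #include <cstddef>
    #include <cstdint>

    // NHWC uint8 max pooling over one 32-channel block. Zero is the identity
    // for unsigned max, so the accumulator starts at _mm256_setzero_si256().
    static void MaxPoolBlock32(const uint8_t* src, size_t rowStride, size_t chanStride,
                               size_t kH, size_t kW, uint8_t* dst)
    {
        __m256i m = _mm256_setzero_si256();
        for (size_t h = 0; h < kH; ++h, src += rowStride)
            for (size_t w = 0; w < kW; ++w)
                m = _mm256_max_epu8(m, _mm256_loadu_si256((const __m256i*)(src + w * chanStride)));
        _mm256_storeu_si256((__m256i*)dst, m);
    }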
for (; c < srcCA1; c += 1 * A) - PoolingMaxNhwc1(ps + c, srcS, srcC, hEnd - hStart, wEnd - wStart, min, dst + c); - if (c < srcC) - PoolingMaxNhwc1(ps + srcC - A, srcS, srcC, hEnd - hStart, wEnd - wStart, min, dst + srcC - A); - dst += srcC; - } - } - } - else - Sse41::SynetPoolingForwardMax8u(src, srcC, srcH, srcW, kernelY, kernelX, strideY, strideX, padY, padX, dst, dstH, dstW, format); - } - else if (format == SimdTensorFormatNchw) - { - Base::SynetPoolingForwardMax8u(src, srcC, srcH, srcW, kernelY, kernelX, strideY, strideX, padY, padX, dst, dstH, dstW, format); - } - else - assert(0); - } - } -#endif// SIMD_AVX2_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx2Texture.cpp b/src/3rd/Simd/Simd/SimdAvx2Texture.cpp deleted file mode 100644 index f3bd4994..00000000 --- a/src/3rd/Simd/Simd/SimdAvx2Texture.cpp +++ /dev/null @@ -1,266 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdSet.h" -#include "Simd/SimdExtract.h" -#include "Simd/SimdBase.h" - -namespace Simd -{ -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - SIMD_INLINE __m256i TextureBoostedSaturatedGradient16(__m256i difference, __m256i saturation, const __m256i & boost) - { - return _mm256_mullo_epi16(_mm256_max_epi16(K_ZERO, _mm256_add_epi16(saturation, _mm256_min_epi16(difference, saturation))), boost); - } - - SIMD_INLINE __m256i TextureBoostedSaturatedGradient8(__m256i a, __m256i b, __m256i saturation, const __m256i & boost) - { - __m256i lo = TextureBoostedSaturatedGradient16(SubUnpackedU8<0>(b, a), saturation, boost); - __m256i hi = TextureBoostedSaturatedGradient16(SubUnpackedU8<1>(b, a), saturation, boost); - return _mm256_packus_epi16(lo, hi); - } - - template SIMD_INLINE void TextureBoostedSaturatedGradient(const uint8_t * src, uint8_t * dx, uint8_t * dy, - size_t stride, __m256i saturation, __m256i boost) - { - const __m256i s10 = Load((__m256i*)(src - 1)); - const __m256i s12 = Load((__m256i*)(src + 1)); - const __m256i s01 = Load((__m256i*)(src - stride)); - const __m256i s21 = Load((__m256i*)(src + stride)); - Store((__m256i*)dx, TextureBoostedSaturatedGradient8(s10, s12, saturation, boost)); - Store((__m256i*)dy, TextureBoostedSaturatedGradient8(s01, s21, saturation, boost)); - } - - template void TextureBoostedSaturatedGradient(const uint8_t * src, size_t srcStride, size_t width, size_t height, - uint8_t saturation, uint8_t boost, uint8_t * dx, size_t dxStride, uint8_t * dy, size_t dyStride) - { - assert(width >= A && int(2)*saturation*boost <= 0xFF); - if (align) - { - assert(Aligned(src) && Aligned(srcStride) && Aligned(dx) && Aligned(dxStride) && Aligned(dy) && Aligned(dyStride)); - } - - size_t alignedWidth = AlignLo(width, A); - __m256i _saturation = _mm256_set1_epi16(saturation); - __m256i _boost = _mm256_set1_epi16(boost); - - memset(dx, 0, width); - memset(dy, 0, width); - src += srcStride; - dx += dxStride; - dy += dyStride; - for (size_t row = 2; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - TextureBoostedSaturatedGradient(src + col, dx + col, dy + col, srcStride, _saturation, _boost); - if (width != alignedWidth) - TextureBoostedSaturatedGradient(src + width - A, dx + width - A, dy + width - A, srcStride, _saturation, _boost); - - dx[0] = 0; - dy[0] = 0; - dx[width - 1] = 0; - dy[width - 1] = 0; - - src += srcStride; - dx += dxStride; - dy += dyStride; - } - memset(dx, 0, width); - memset(dy, 0, width); - } - - void TextureBoostedSaturatedGradient(const uint8_t * src, size_t srcStride, size_t width, size_t height, - uint8_t saturation, uint8_t boost, uint8_t * dx, size_t dxStride, uint8_t * dy, size_t dyStride) - { - if (Aligned(src) && Aligned(srcStride) && Aligned(dx) && Aligned(dxStride) && Aligned(dy) && Aligned(dyStride)) - TextureBoostedSaturatedGradient(src, srcStride, width, height, saturation, boost, dx, dxStride, dy, dyStride); - else - TextureBoostedSaturatedGradient(src, srcStride, width, height, saturation, boost, dx, dxStride, dy, dyStride); - } - - template SIMD_INLINE void TextureBoostedUv(const uint8_t * src, uint8_t * dst, __m256i min8, __m256i max8, __m256i boost16) - { - const __m256i _src = Load((__m256i*)src); - const __m256i saturated = _mm256_sub_epi8(_mm256_max_epu8(min8, _mm256_min_epu8(max8, _src)), min8); - const __m256i lo = _mm256_mullo_epi16(_mm256_unpacklo_epi8(saturated, K_ZERO), boost16); - const __m256i hi = 
_mm256_mullo_epi16(_mm256_unpackhi_epi8(saturated, K_ZERO), boost16); - Store((__m256i*)dst, _mm256_packus_epi16(lo, hi)); - } - - template void TextureBoostedUv(const uint8_t * src, size_t srcStride, size_t width, size_t height, - uint8_t boost, uint8_t * dst, size_t dstStride) - { - assert(width >= A && boost < 0x80); - if (align) - { - assert(Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride)); - } - - size_t alignedWidth = AlignLo(width, A); - int min = 128 - (128 / boost); - int max = 255 - min; - - __m256i min8 = _mm256_set1_epi8(min); - __m256i max8 = _mm256_set1_epi8(max); - __m256i boost16 = _mm256_set1_epi16(boost); - - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - TextureBoostedUv(src + col, dst + col, min8, max8, boost16); - if (width != alignedWidth) - TextureBoostedUv(src + width - A, dst + width - A, min8, max8, boost16); - - src += srcStride; - dst += dstStride; - } - } - - void TextureBoostedUv(const uint8_t * src, size_t srcStride, size_t width, size_t height, - uint8_t boost, uint8_t * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride)) - TextureBoostedUv(src, srcStride, width, height, boost, dst, dstStride); - else - TextureBoostedUv(src, srcStride, width, height, boost, dst, dstStride); - } - - template SIMD_INLINE void TextureGetDifferenceSum(const uint8_t * src, const uint8_t * lo, const uint8_t * hi, - __m256i & positive, __m256i & negative, const __m256i & mask) - { - const __m256i _src = Load((__m256i*)src); - const __m256i _lo = Load((__m256i*)lo); - const __m256i _hi = Load((__m256i*)hi); - const __m256i average = _mm256_and_si256(mask, _mm256_avg_epu8(_lo, _hi)); - const __m256i current = _mm256_and_si256(mask, _src); - positive = _mm256_add_epi64(positive, _mm256_sad_epu8(_mm256_subs_epu8(current, average), K_ZERO)); - negative = _mm256_add_epi64(negative, _mm256_sad_epu8(_mm256_subs_epu8(average, current), K_ZERO)); - } - - template void TextureGetDifferenceSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * lo, size_t loStride, const uint8_t * hi, size_t hiStride, int64_t * sum) - { - assert(width >= A && sum != NULL); - if (align) - { - assert(Aligned(src) && Aligned(srcStride) && Aligned(lo) && Aligned(loStride) && Aligned(hi) && Aligned(hiStride)); - } - - size_t alignedWidth = AlignLo(width, A); - __m256i tailMask = SetMask(0, A - width + alignedWidth, 0xFF); - __m256i positive = _mm256_setzero_si256(); - __m256i negative = _mm256_setzero_si256(); - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - TextureGetDifferenceSum(src + col, lo + col, hi + col, positive, negative, K_INV_ZERO); - if (width != alignedWidth) - TextureGetDifferenceSum(src + width - A, lo + width - A, hi + width - A, positive, negative, tailMask); - src += srcStride; - lo += loStride; - hi += hiStride; - } - *sum = ExtractSum(positive) - ExtractSum(negative); - } - - void TextureGetDifferenceSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * lo, size_t loStride, const uint8_t * hi, size_t hiStride, int64_t * sum) - { - if (Aligned(src) && Aligned(srcStride) && Aligned(lo) && Aligned(loStride) && Aligned(hi) && Aligned(hiStride)) - TextureGetDifferenceSum(src, srcStride, width, height, lo, loStride, hi, hiStride, sum); - else - TextureGetDifferenceSum(src, srcStride, width, height, lo, loStride, hi, hiStride, sum); - } - - 
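/* A scalar model of the TextureGetDifferenceSum kernel above (an illustrative sketch, not part
   of the original library; the function name below is invented for this note, and it relies only
   on types already provided by the headers included above). _mm256_avg_epu8 is the rounded
   average (lo + hi + 1) >> 1, and the two _mm256_subs_epu8/_mm256_sad_epu8 accumulators are the
   positive and negative parts of src - avg(lo, hi), so the result is the signed sum of
   src - avg over the image; the tail mask zeroes both operands in masked lanes, so those lanes
   contribute nothing. */
static void TextureGetDifferenceSumScalar(const uint8_t * src, size_t srcStride, size_t width, size_t height,
    const uint8_t * lo, size_t loStride, const uint8_t * hi, size_t hiStride, int64_t * sum)
{
    int64_t positive = 0, negative = 0;
    for (size_t row = 0; row < height; ++row)
    {
        for (size_t col = 0; col < width; ++col)
        {
            int average = (lo[col] + hi[col] + 1) >> 1; /* rounded like _mm256_avg_epu8 */
            int current = src[col];
            if (current > average)
                positive += current - average; /* _mm256_subs_epu8(current, average) lane */
            else
                negative += average - current; /* _mm256_subs_epu8(average, current) lane */
        }
        src += srcStride;
        lo += loStride;
        hi += hiStride;
    }
    *sum = positive - negative;
}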
template <bool align> void TexturePerformCompensation(const uint8_t * src, size_t srcStride, size_t width, size_t height, - int shift, uint8_t * dst, size_t dstStride) - { - assert(width >= A && shift > -0xFF && shift < 0xFF && shift != 0); - if (align) - { - assert(Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride)); - } - - size_t alignedWidth = AlignLo(width, A); - __m256i tailMask = SetMask<uint8_t>(0, (src == dst) ? A - width + alignedWidth : 0, 0xFF); - if (shift > 0) - { - __m256i _shift = _mm256_set1_epi8((char)shift); - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - { - const __m256i _src = Load<align>((__m256i*) (src + col)); - Store<align>((__m256i*) (dst + col), _mm256_adds_epu8(_src, _shift)); - } - if (width != alignedWidth) - { - const __m256i _src = Load<false>((__m256i*) (src + width - A)); - Store<false>((__m256i*) (dst + width - A), _mm256_adds_epu8(_src, _mm256_and_si256(_shift, tailMask))); - } - src += srcStride; - dst += dstStride; - } - } - if (shift < 0) - { - __m256i _shift = _mm256_set1_epi8((char)-shift); - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - { - const __m256i _src = Load<align>((__m256i*) (src + col)); - Store<align>((__m256i*) (dst + col), _mm256_subs_epu8(_src, _shift)); - } - if (width != alignedWidth) - { - const __m256i _src = Load<false>((__m256i*) (src + width - A)); - Store<false>((__m256i*) (dst + width - A), _mm256_subs_epu8(_src, _mm256_and_si256(_shift, tailMask))); - } - src += srcStride; - dst += dstStride; - } - } - } - - void TexturePerformCompensation(const uint8_t * src, size_t srcStride, size_t width, size_t height, - int shift, uint8_t * dst, size_t dstStride) - { - if (shift == 0) - { - if (src != dst) - Base::Copy(src, srcStride, width, height, 1, dst, dstStride); - return; - } - if (Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride)) - TexturePerformCompensation<true>(src, srcStride, width, height, shift, dst, dstStride); - else - TexturePerformCompensation<false>(src, srcStride, width, height, shift, dst, dstStride); - } - } -#endif// SIMD_AVX2_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx2YuvToBgr.cpp b/src/3rd/Simd/Simd/SimdAvx2YuvToBgr.cpp deleted file mode 100644 index c34d8588..00000000 --- a/src/3rd/Simd/Simd/SimdAvx2YuvToBgr.cpp +++ /dev/null @@ -1,342 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdConversion.h" - -namespace Simd -{ -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - template SIMD_INLINE void YuvToBgr(__m256i y, __m256i u, __m256i v, __m256i * bgr) - { - __m256i blue = YuvToBlue(y, u); - __m256i green = YuvToGreen(y, u, v); - __m256i red = YuvToRed(y, v); - Store(bgr + 0, InterleaveBgr<0>(blue, green, red)); - Store(bgr + 1, InterleaveBgr<1>(blue, green, red)); - Store(bgr + 2, InterleaveBgr<2>(blue, green, red)); - } - - template SIMD_INLINE void Yuv444pToBgr(const uint8_t * y, const uint8_t * u, const uint8_t * v, uint8_t * bgr) - { - YuvToBgr(Load((__m256i*)y), Load((__m256i*)u), Load((__m256i*)v), (__m256i*)bgr); - } - - template void Yuv444pToBgr(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * bgr, size_t bgrStride) - { - assert(width >= A); - if (align) - { - assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride)); - assert(Aligned(v) && Aligned(vStride) && Aligned(bgr) && Aligned(bgrStride)); - } - - size_t bodyWidth = AlignLo(width, A); - size_t tail = width - bodyWidth; - size_t A3 = A * 3; - for (size_t row = 0; row < height; ++row) - { - for (size_t colYuv = 0, colBgr = 0; colYuv < bodyWidth; colYuv += A, colBgr += A3) - { - Yuv444pToBgr(y + colYuv, u + colYuv, v + colYuv, bgr + colBgr); - } - if (tail) - { - size_t col = width - A; - Yuv444pToBgr(y + col, u + col, v + col, bgr + 3 * col); - } - y += yStride; - u += uStride; - v += vStride; - bgr += bgrStride; - } - } - - void Yuv444pToBgr(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * bgr, size_t bgrStride) - { - if (Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride) - && Aligned(v) && Aligned(vStride) && Aligned(bgr) && Aligned(bgrStride)) - Yuv444pToBgr(y, yStride, u, uStride, v, vStride, width, height, bgr, bgrStride); - else - Yuv444pToBgr(y, yStride, u, uStride, v, vStride, width, height, bgr, bgrStride); - } - - template SIMD_INLINE void Yuv422pToBgr(const uint8_t * y, const __m256i & u, const __m256i & v, uint8_t * bgr) - { - YuvToBgr(Load((__m256i*)y + 0), _mm256_unpacklo_epi8(u, u), _mm256_unpacklo_epi8(v, v), (__m256i*)bgr + 0); - YuvToBgr(Load((__m256i*)y + 1), _mm256_unpackhi_epi8(u, u), _mm256_unpackhi_epi8(v, v), (__m256i*)bgr + 3); - } - - template SIMD_INLINE void Yuv422pToBgr(const uint8_t * y, const uint8_t * u, const uint8_t * v, uint8_t * bgr) - { - Yuv422pToBgr(y, LoadPermuted((__m256i*)u), LoadPermuted((__m256i*)v), bgr); - } - - template void Yuv422pToBgr(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * bgr, size_t bgrStride) - { - assert((width % 2 == 0) && (width >= DA)); - if (align) - { - assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride)); - assert(Aligned(v) && Aligned(vStride) && Aligned(bgr) && Aligned(bgrStride)); - } - - size_t bodyWidth = AlignLo(width, DA); - size_t tail = width - bodyWidth; - size_t A6 = A * 6; - for (size_t row = 0; row < height; ++row) - { - for (size_t colUV = 0, colY = 0, colBgr = 0; colY < bodyWidth; colY += DA, colUV += A, colBgr += A6) - Yuv422pToBgr(y + colY, u + colUV, v + colUV, bgr + colBgr); - if (tail) - { - size_t offset = width - DA; - Yuv422pToBgr(y + offset, u + offset / 2, v + offset / 2, bgr + 3 * offset); - } - y += 
yStride; - u += uStride; - v += vStride; - bgr += bgrStride; - } - } - - void Yuv422pToBgr(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * bgr, size_t bgrStride) - { - if (Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride) - && Aligned(v) && Aligned(vStride) && Aligned(bgr) && Aligned(bgrStride)) - Yuv422pToBgr(y, yStride, u, uStride, v, vStride, width, height, bgr, bgrStride); - else - Yuv422pToBgr(y, yStride, u, uStride, v, vStride, width, height, bgr, bgrStride); - } - - template void Yuv420pToBgr(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * bgr, size_t bgrStride) - { - assert((width % 2 == 0) && (height % 2 == 0) && (width >= DA) && (height >= 2)); - if (align) - { - assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride)); - assert(Aligned(v) && Aligned(vStride) && Aligned(bgr) && Aligned(bgrStride)); - } - - size_t bodyWidth = AlignLo(width, DA); - size_t tail = width - bodyWidth; - size_t A6 = A * 6; - for (size_t row = 0; row < height; row += 2) - { - for (size_t colUV = 0, colY = 0, colBgr = 0; colY < bodyWidth; colY += DA, colUV += A, colBgr += A6) - { - __m256i u_ = LoadPermuted((__m256i*)(u + colUV)); - __m256i v_ = LoadPermuted((__m256i*)(v + colUV)); - Yuv422pToBgr(y + colY, u_, v_, bgr + colBgr); - Yuv422pToBgr(y + colY + yStride, u_, v_, bgr + colBgr + bgrStride); - } - if (tail) - { - size_t offset = width - DA; - __m256i u_ = LoadPermuted((__m256i*)(u + offset / 2)); - __m256i v_ = LoadPermuted((__m256i*)(v + offset / 2)); - Yuv422pToBgr(y + offset, u_, v_, bgr + 3 * offset); - Yuv422pToBgr(y + offset + yStride, u_, v_, bgr + 3 * offset + bgrStride); - } - y += 2 * yStride; - u += uStride; - v += vStride; - bgr += 2 * bgrStride; - } - } - - void Yuv420pToBgr(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * bgr, size_t bgrStride) - { - if (Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride) - && Aligned(v) && Aligned(vStride) && Aligned(bgr) && Aligned(bgrStride)) - Yuv420pToBgr(y, yStride, u, uStride, v, vStride, width, height, bgr, bgrStride); - else - Yuv420pToBgr(y, yStride, u, uStride, v, vStride, width, height, bgr, bgrStride); - } - - //--------------------------------------------------------------------- - - template SIMD_INLINE void YuvToRgb(__m256i y, __m256i u, __m256i v, __m256i* rgb) - { - __m256i blue = YuvToBlue(y, u); - __m256i green = YuvToGreen(y, u, v); - __m256i red = YuvToRed(y, v); - Store(rgb + 0, InterleaveBgr<0>(red, green, blue)); - Store(rgb + 1, InterleaveBgr<1>(red, green, blue)); - Store(rgb + 2, InterleaveBgr<2>(red, green, blue)); - } - - template SIMD_INLINE void Yuv444pToRgb(const uint8_t* y, const uint8_t* u, const uint8_t* v, uint8_t* rgb) - { - YuvToRgb(Load((__m256i*)y), Load((__m256i*)u), Load((__m256i*)v), (__m256i*)rgb); - } - - template void Yuv444pToRgb(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride, - size_t width, size_t height, uint8_t* rgb, size_t rgbStride) - { - assert(width >= A); - if (align) - { - assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride)); - assert(Aligned(v) && Aligned(vStride) && Aligned(rgb) && Aligned(rgbStride)); - } - - size_t bodyWidth = AlignLo(width, A); - size_t tail = width - bodyWidth; - size_t 
A3 = A * 3; - for (size_t row = 0; row < height; ++row) - { - for (size_t colYuv = 0, colRgb = 0; colYuv < bodyWidth; colYuv += A, colRgb += A3) - { - Yuv444pToRgb(y + colYuv, u + colYuv, v + colYuv, rgb + colRgb); - } - if (tail) - { - size_t col = width - A; - Yuv444pToRgb(y + col, u + col, v + col, rgb + 3 * col); - } - y += yStride; - u += uStride; - v += vStride; - rgb += rgbStride; - } - } - - void Yuv444pToRgb(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride, - size_t width, size_t height, uint8_t* rgb, size_t rgbStride) - { - if (Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride) - && Aligned(v) && Aligned(vStride) && Aligned(rgb) && Aligned(rgbStride)) - Yuv444pToRgb(y, yStride, u, uStride, v, vStride, width, height, rgb, rgbStride); - else - Yuv444pToRgb(y, yStride, u, uStride, v, vStride, width, height, rgb, rgbStride); - } - - template SIMD_INLINE void Yuv422pToRgb(const uint8_t* y, const __m256i& u, const __m256i& v, uint8_t* rgb) - { - YuvToRgb(Load((__m256i*)y + 0), _mm256_unpacklo_epi8(u, u), _mm256_unpacklo_epi8(v, v), (__m256i*)rgb + 0); - YuvToRgb(Load((__m256i*)y + 1), _mm256_unpackhi_epi8(u, u), _mm256_unpackhi_epi8(v, v), (__m256i*)rgb + 3); - } - - template SIMD_INLINE void Yuv422pToRgb(const uint8_t* y, const uint8_t* u, const uint8_t* v, uint8_t* rgb) - { - Yuv422pToRgb(y, LoadPermuted((__m256i*)u), LoadPermuted((__m256i*)v), rgb); - } - - template void Yuv422pToRgb(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride, - size_t width, size_t height, uint8_t* rgb, size_t rgbStride) - { - assert((width % 2 == 0) && (width >= DA)); - if (align) - { - assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride)); - assert(Aligned(v) && Aligned(vStride) && Aligned(rgb) && Aligned(rgbStride)); - } - - size_t bodyWidth = AlignLo(width, DA); - size_t tail = width - bodyWidth; - size_t A6 = A * 6; - for (size_t row = 0; row < height; ++row) - { - for (size_t colUV = 0, colY = 0, colRgb = 0; colY < bodyWidth; colY += DA, colUV += A, colRgb += A6) - Yuv422pToRgb(y + colY, u + colUV, v + colUV, rgb + colRgb); - if (tail) - { - size_t offset = width - DA; - Yuv422pToRgb(y + offset, u + offset / 2, v + offset / 2, rgb + 3 * offset); - } - y += yStride; - u += uStride; - v += vStride; - rgb += rgbStride; - } - } - - void Yuv422pToRgb(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride, - size_t width, size_t height, uint8_t* rgb, size_t rgbStride) - { - if (Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride) - && Aligned(v) && Aligned(vStride) && Aligned(rgb) && Aligned(rgbStride)) - Yuv422pToRgb(y, yStride, u, uStride, v, vStride, width, height, rgb, rgbStride); - else - Yuv422pToRgb(y, yStride, u, uStride, v, vStride, width, height, rgb, rgbStride); - } - - template void Yuv420pToRgb(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride, - size_t width, size_t height, uint8_t* rgb, size_t rgbStride) - { - assert((width % 2 == 0) && (height % 2 == 0) && (width >= DA) && (height >= 2)); - if (align) - { - assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride)); - assert(Aligned(v) && Aligned(vStride) && Aligned(rgb) && Aligned(rgbStride)); - } - - size_t bodyWidth = AlignLo(width, DA); - size_t tail = width - bodyWidth; - size_t A6 = A * 6; - for (size_t row = 0; row < height; row += 2) - { - for (size_t colUV = 0, colY = 0, colRgb = 
0; colY < bodyWidth; colY += DA, colUV += A, colRgb += A6) - { - __m256i u_ = LoadPermuted((__m256i*)(u + colUV)); - __m256i v_ = LoadPermuted((__m256i*)(v + colUV)); - Yuv422pToRgb(y + colY, u_, v_, rgb + colRgb); - Yuv422pToRgb(y + colY + yStride, u_, v_, rgb + colRgb + rgbStride); - } - if (tail) - { - size_t offset = width - DA; - __m256i u_ = LoadPermuted((__m256i*)(u + offset / 2)); - __m256i v_ = LoadPermuted((__m256i*)(v + offset / 2)); - Yuv422pToRgb(y + offset, u_, v_, rgb + 3 * offset); - Yuv422pToRgb(y + offset + yStride, u_, v_, rgb + 3 * offset + rgbStride); - } - y += 2 * yStride; - u += uStride; - v += vStride; - rgb += 2 * rgbStride; - } - } - - void Yuv420pToRgb(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride, - size_t width, size_t height, uint8_t* rgb, size_t rgbStride) - { - if (Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride) - && Aligned(v) && Aligned(vStride) && Aligned(rgb) && Aligned(rgbStride)) - Yuv420pToRgb(y, yStride, u, uStride, v, vStride, width, height, rgb, rgbStride); - else - Yuv420pToRgb(y, yStride, u, uStride, v, vStride, width, height, rgb, rgbStride); - } - } -#endif// SIMD_AVX2_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx2YuvToBgra.cpp b/src/3rd/Simd/Simd/SimdAvx2YuvToBgra.cpp deleted file mode 100644 index 59b00f86..00000000 --- a/src/3rd/Simd/Simd/SimdAvx2YuvToBgra.cpp +++ /dev/null @@ -1,276 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2019 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdConversion.h" - -namespace Simd -{ -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - template SIMD_INLINE void AdjustedYuv16ToBgra(__m256i y16, __m256i u16, __m256i v16, - const __m256i & a_0, __m256i * bgra) - { - const __m256i b16 = AdjustedYuvToBlue16(y16, u16); - const __m256i g16 = AdjustedYuvToGreen16(y16, u16, v16); - const __m256i r16 = AdjustedYuvToRed16(y16, v16); - const __m256i bg8 = _mm256_or_si256(b16, _mm256_slli_si256(g16, 1)); - const __m256i ra8 = _mm256_or_si256(r16, a_0); - __m256i bgra0 = _mm256_unpacklo_epi16(bg8, ra8); - __m256i bgra1 = _mm256_unpackhi_epi16(bg8, ra8); - Permute2x128(bgra0, bgra1); - Store(bgra + 0, bgra0); - Store(bgra + 1, bgra1); - } - - template SIMD_INLINE void Yuv16ToBgra(__m256i y16, __m256i u16, __m256i v16, - const __m256i & a_0, __m256i * bgra) - { - AdjustedYuv16ToBgra(AdjustY16(y16), AdjustUV16(u16), AdjustUV16(v16), a_0, bgra); - } - - template SIMD_INLINE void Yuva8ToBgra(__m256i y8, __m256i u8, __m256i v8, const __m256i & a8, __m256i * bgra) - { - Yuv16ToBgra(_mm256_unpacklo_epi8(y8, K_ZERO), _mm256_unpacklo_epi8(u8, K_ZERO), - _mm256_unpacklo_epi8(v8, K_ZERO), _mm256_unpacklo_epi8(K_ZERO, a8), bgra + 0); - Yuv16ToBgra(_mm256_unpackhi_epi8(y8, K_ZERO), _mm256_unpackhi_epi8(u8, K_ZERO), - _mm256_unpackhi_epi8(v8, K_ZERO), _mm256_unpackhi_epi8(K_ZERO, a8), bgra + 2); - } - - template SIMD_INLINE void Yuva422pToBgra(const uint8_t * y, const __m256i & u, const __m256i & v, - const uint8_t * a, uint8_t * bgra) - { - Yuva8ToBgra(LoadPermuted((__m256i*)y + 0), _mm256_permute4x64_epi64(_mm256_unpacklo_epi8(u, u), 0xD8), - _mm256_permute4x64_epi64(_mm256_unpacklo_epi8(v, v), 0xD8), LoadPermuted((__m256i*)a + 0), (__m256i*)bgra + 0); - Yuva8ToBgra(LoadPermuted((__m256i*)y + 1), _mm256_permute4x64_epi64(_mm256_unpackhi_epi8(u, u), 0xD8), - _mm256_permute4x64_epi64(_mm256_unpackhi_epi8(v, v), 0xD8), LoadPermuted((__m256i*)a + 1), (__m256i*)bgra + 4); - } - - template void Yuva420pToBgra(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - const uint8_t * a, size_t aStride, size_t width, size_t height, uint8_t * bgra, size_t bgraStride) - { - assert((width % 2 == 0) && (height % 2 == 0) && (width >= DA) && (height >= 2)); - if (align) - { - assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride) && Aligned(v) && Aligned(vStride)); - assert(Aligned(a) && Aligned(aStride) && Aligned(bgra) && Aligned(bgraStride)); - } - - size_t bodyWidth = AlignLo(width, DA); - size_t tail = width - bodyWidth; - for (size_t row = 0; row < height; row += 2) - { - for (size_t colUV = 0, colY = 0, colBgra = 0; colY < bodyWidth; colY += DA, colUV += A, colBgra += OA) - { - __m256i u_ = LoadPermuted((__m256i*)(u + colUV)); - __m256i v_ = LoadPermuted((__m256i*)(v + colUV)); - Yuva422pToBgra(y + colY, u_, v_, a + colY, bgra + colBgra); - Yuva422pToBgra(y + colY + yStride, u_, v_, a + colY + aStride, bgra + colBgra + bgraStride); - } - if (tail) - { - size_t offset = width - DA; - __m256i u_ = LoadPermuted((__m256i*)(u + offset / 2)); - __m256i v_ = LoadPermuted((__m256i*)(v + offset / 2)); - Yuva422pToBgra(y + offset, u_, v_, a + offset, bgra + 4 * offset); - Yuva422pToBgra(y + offset + yStride, u_, v_, a + offset + aStride, bgra + 4 * offset + bgraStride); - } - y += 2 * yStride; - u += uStride; - v += vStride; - a += 2 * aStride; - bgra += 2 * bgraStride; - } - } - - void Yuva420pToBgra(const uint8_t * y, size_t yStride, 
const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - const uint8_t * a, size_t aStride, size_t width, size_t height, uint8_t * bgra, size_t bgraStride) - { - if (Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride) && Aligned(v) && Aligned(vStride) - && Aligned(a) && Aligned(aStride) && Aligned(bgra) && Aligned(bgraStride)) - Yuva420pToBgra(y, yStride, u, uStride, v, vStride, a, aStride, width, height, bgra, bgraStride); - else - Yuva420pToBgra(y, yStride, u, uStride, v, vStride, a, aStride, width, height, bgra, bgraStride); - } - - template SIMD_INLINE void Yuv8ToBgra(__m256i y8, __m256i u8, __m256i v8, const __m256i & a_0, __m256i * bgra) - { - Yuv16ToBgra(_mm256_unpacklo_epi8(y8, K_ZERO), _mm256_unpacklo_epi8(u8, K_ZERO), - _mm256_unpacklo_epi8(v8, K_ZERO), a_0, bgra + 0); - Yuv16ToBgra(_mm256_unpackhi_epi8(y8, K_ZERO), _mm256_unpackhi_epi8(u8, K_ZERO), - _mm256_unpackhi_epi8(v8, K_ZERO), a_0, bgra + 2); - } - - template SIMD_INLINE void Yuv444pToBgra(const uint8_t * y, const uint8_t * u, - const uint8_t * v, const __m256i & a_0, uint8_t * bgra) - { - Yuv8ToBgra(LoadPermuted((__m256i*)y), LoadPermuted((__m256i*)u), LoadPermuted((__m256i*)v), a_0, (__m256i*)bgra); - } - - template void Yuv444pToBgra(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * bgra, size_t bgraStride, uint8_t alpha) - { - assert(width >= A); - if (align) - { - assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride)); - assert(Aligned(v) && Aligned(vStride) && Aligned(bgra) && Aligned(bgraStride)); - } - - __m256i a_0 = _mm256_slli_si256(_mm256_set1_epi16(alpha), 1); - size_t bodyWidth = AlignLo(width, A); - size_t tail = width - bodyWidth; - for (size_t row = 0; row < height; ++row) - { - for (size_t colYuv = 0, colBgra = 0; colYuv < bodyWidth; colYuv += A, colBgra += QA) - { - Yuv444pToBgra(y + colYuv, u + colYuv, v + colYuv, a_0, bgra + colBgra); - } - if (tail) - { - size_t col = width - A; - Yuv444pToBgra(y + col, u + col, v + col, a_0, bgra + 4 * col); - } - y += yStride; - u += uStride; - v += vStride; - bgra += bgraStride; - } - } - - void Yuv444pToBgra(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * bgra, size_t bgraStride, uint8_t alpha) - { - if (Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride) - && Aligned(v) && Aligned(vStride) && Aligned(bgra) && Aligned(bgraStride)) - Yuv444pToBgra(y, yStride, u, uStride, v, vStride, width, height, bgra, bgraStride, alpha); - else - Yuv444pToBgra(y, yStride, u, uStride, v, vStride, width, height, bgra, bgraStride, alpha); - } - - template SIMD_INLINE void Yuv422pToBgra(const uint8_t * y, const __m256i & u, const __m256i & v, - const __m256i & a_0, uint8_t * bgra) - { - Yuv8ToBgra(LoadPermuted((__m256i*)y + 0), - _mm256_permute4x64_epi64(_mm256_unpacklo_epi8(u, u), 0xD8), - _mm256_permute4x64_epi64(_mm256_unpacklo_epi8(v, v), 0xD8), a_0, (__m256i*)bgra + 0); - Yuv8ToBgra(LoadPermuted((__m256i*)y + 1), - _mm256_permute4x64_epi64(_mm256_unpackhi_epi8(u, u), 0xD8), - _mm256_permute4x64_epi64(_mm256_unpackhi_epi8(v, v), 0xD8), a_0, (__m256i*)bgra + 4); - } - - template void Yuv420pToBgra(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * bgra, size_t bgraStride, uint8_t alpha) - { - assert((width % 2 == 0) && (height % 2 == 0) 
&& (width >= DA) && (height >= 2)); - if (align) - { - assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride)); - assert(Aligned(v) && Aligned(vStride) && Aligned(bgra) && Aligned(bgraStride)); - } - - __m256i a_0 = _mm256_slli_si256(_mm256_set1_epi16(alpha), 1); - size_t bodyWidth = AlignLo(width, DA); - size_t tail = width - bodyWidth; - for (size_t row = 0; row < height; row += 2) - { - for (size_t colUV = 0, colY = 0, colBgra = 0; colY < bodyWidth; colY += DA, colUV += A, colBgra += OA) - { - __m256i u_ = LoadPermuted((__m256i*)(u + colUV)); - __m256i v_ = LoadPermuted((__m256i*)(v + colUV)); - Yuv422pToBgra(y + colY, u_, v_, a_0, bgra + colBgra); - Yuv422pToBgra(y + colY + yStride, u_, v_, a_0, bgra + colBgra + bgraStride); - } - if (tail) - { - size_t offset = width - DA; - __m256i u_ = LoadPermuted((__m256i*)(u + offset / 2)); - __m256i v_ = LoadPermuted((__m256i*)(v + offset / 2)); - Yuv422pToBgra(y + offset, u_, v_, a_0, bgra + 4 * offset); - Yuv422pToBgra(y + offset + yStride, u_, v_, a_0, bgra + 4 * offset + bgraStride); - } - y += 2 * yStride; - u += uStride; - v += vStride; - bgra += 2 * bgraStride; - } - } - - void Yuv420pToBgra(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * bgra, size_t bgraStride, uint8_t alpha) - { - if (Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride) - && Aligned(v) && Aligned(vStride) && Aligned(bgra) && Aligned(bgraStride)) - Yuv420pToBgra(y, yStride, u, uStride, v, vStride, width, height, bgra, bgraStride, alpha); - else - Yuv420pToBgra(y, yStride, u, uStride, v, vStride, width, height, bgra, bgraStride, alpha); - } - - template SIMD_INLINE void Yuv422pToBgra(const uint8_t * y, const uint8_t * u, const uint8_t * v, const __m256i & a_0, uint8_t * bgra) - { - Yuv422pToBgra(y, LoadPermuted((__m256i*)u), LoadPermuted((__m256i*)v), a_0, bgra); - } - - template void Yuv422pToBgra(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * bgra, size_t bgraStride, uint8_t alpha) - { - assert((width % 2 == 0) && (width >= DA)); - if (align) - { - assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride)); - assert(Aligned(v) && Aligned(vStride) && Aligned(bgra) && Aligned(bgraStride)); - } - - __m256i a_0 = _mm256_slli_si256(_mm256_set1_epi16(alpha), 1); - size_t bodyWidth = AlignLo(width, DA); - size_t tail = width - bodyWidth; - for (size_t row = 0; row < height; ++row) - { - for (size_t colUV = 0, colY = 0, colBgra = 0; colY < bodyWidth; colY += DA, colUV += A, colBgra += OA) - Yuv422pToBgra(y + colY, u + colUV, v + colUV, a_0, bgra + colBgra); - if (tail) - { - size_t offset = width - DA; - Yuv422pToBgra(y + offset, u + offset / 2, v + offset / 2, a_0, bgra + 4 * offset); - } - y += yStride; - u += uStride; - v += vStride; - bgra += bgraStride; - } - } - - void Yuv422pToBgra(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * bgra, size_t bgraStride, uint8_t alpha) - { - if (Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride) - && Aligned(v) && Aligned(vStride) && Aligned(bgra) && Aligned(bgraStride)) - Yuv422pToBgra(y, yStride, u, uStride, v, vStride, width, height, bgra, bgraStride, alpha); - else - Yuv422pToBgra(y, yStride, u, uStride, v, vStride, width, height, bgra, bgraStride, alpha); - } - } -#endif// 
SIMD_AVX2_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx2YuvToHue.cpp b/src/3rd/Simd/Simd/SimdAvx2YuvToHue.cpp deleted file mode 100644 index c2b8b17a..00000000 --- a/src/3rd/Simd/Simd/SimdAvx2YuvToHue.cpp +++ /dev/null @@ -1,180 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdStore.h" -#include "Simd/SimdMemory.h" -#include "Simd/SimdConversion.h" - -namespace Simd -{ -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - SIMD_INLINE __m256i MulDiv32(__m256i dividend, __m256i divisor, const __m256 & KF_255_DIV_6) - { - return _mm256_cvttps_epi32(_mm256_div_ps(_mm256_mul_ps(KF_255_DIV_6, _mm256_cvtepi32_ps(dividend)), _mm256_cvtepi32_ps(divisor))); - } - - SIMD_INLINE __m256i MulDiv16(__m256i dividend, __m256i divisor, const __m256 & KF_255_DIV_6) - { - const __m256i quotientLo = MulDiv32(_mm256_unpacklo_epi16(dividend, K_ZERO), _mm256_unpacklo_epi16(divisor, K_ZERO), KF_255_DIV_6); - const __m256i quotientHi = MulDiv32(_mm256_unpackhi_epi16(dividend, K_ZERO), _mm256_unpackhi_epi16(divisor, K_ZERO), KF_255_DIV_6); - return _mm256_packs_epi32(quotientLo, quotientHi); - } - - SIMD_INLINE __m256i AdjustedYuvToHue16(__m256i y, __m256i u, __m256i v, const __m256 & KF_255_DIV_6) - { - const __m256i red = AdjustedYuvToRed16(y, v); - const __m256i green = AdjustedYuvToGreen16(y, u, v); - const __m256i blue = AdjustedYuvToBlue16(y, u); - const __m256i max = MaxI16(red, green, blue); - const __m256i range = _mm256_subs_epi16(max, MinI16(red, green, blue)); - - const __m256i redMaxMask = _mm256_cmpeq_epi16(red, max); - const __m256i greenMaxMask = _mm256_andnot_si256(redMaxMask, _mm256_cmpeq_epi16(green, max)); - const __m256i blueMaxMask = _mm256_andnot_si256(redMaxMask, _mm256_andnot_si256(greenMaxMask, K_INV_ZERO)); - - const __m256i redMaxCase = _mm256_and_si256(redMaxMask, - _mm256_add_epi16(_mm256_sub_epi16(green, blue), _mm256_mullo_epi16(range, K16_0006))); - const __m256i greenMaxCase = _mm256_and_si256(greenMaxMask, - _mm256_add_epi16(_mm256_sub_epi16(blue, red), _mm256_mullo_epi16(range, K16_0002))); - const __m256i blueMaxCase = _mm256_and_si256(blueMaxMask, - _mm256_add_epi16(_mm256_sub_epi16(red, green), _mm256_mullo_epi16(range, K16_0004))); - - const __m256i dividend = _mm256_or_si256(_mm256_or_si256(redMaxCase, greenMaxCase), blueMaxCase); - - return _mm256_andnot_si256(_mm256_cmpeq_epi16(range, K_ZERO), _mm256_and_si256(MulDiv16(dividend, range, 
KF_255_DIV_6), K16_00FF)); - } - - SIMD_INLINE __m256i YuvToHue16(__m256i y, __m256i u, __m256i v, const __m256 & KF_255_DIV_6) - { - return AdjustedYuvToHue16(AdjustY16(y), AdjustUV16(u), AdjustUV16(v), KF_255_DIV_6); - } - - SIMD_INLINE __m256i YuvToHue8(__m256i y, __m256i u, __m256i v, const __m256 & KF_255_DIV_6) - { - return _mm256_packus_epi16( - YuvToHue16(_mm256_unpacklo_epi8(y, K_ZERO), _mm256_unpacklo_epi8(u, K_ZERO), _mm256_unpacklo_epi8(v, K_ZERO), KF_255_DIV_6), - YuvToHue16(_mm256_unpackhi_epi8(y, K_ZERO), _mm256_unpackhi_epi8(u, K_ZERO), _mm256_unpackhi_epi8(v, K_ZERO), KF_255_DIV_6)); - } - - template SIMD_INLINE void Yuv420pToHue(const uint8_t * y, __m256i u, __m256i v, uint8_t * hue, const __m256 & KF_255_DIV_6) - { - Store((__m256i*)(hue), YuvToHue8(Load((__m256i*)(y)), - _mm256_unpacklo_epi8(u, u), _mm256_unpacklo_epi8(v, v), KF_255_DIV_6)); - Store((__m256i*)(hue + A), YuvToHue8(Load((__m256i*)(y + A)), - _mm256_unpackhi_epi8(u, u), _mm256_unpackhi_epi8(v, v), KF_255_DIV_6)); - } - - template void Yuv420pToHue(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * hue, size_t hueStride) - { - assert((width % 2 == 0) && (height % 2 == 0) && (width >= DA) && (height >= 2)); - if (align) - { - assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride)); - assert(Aligned(v) && Aligned(vStride) && Aligned(hue) && Aligned(hueStride)); - } - - const __m256 KF_255_DIV_6 = _mm256_set1_ps(Base::KF_255_DIV_6); - - size_t bodyWidth = AlignLo(width, DA); - size_t tail = width - bodyWidth; - for (size_t row = 0; row < height; row += 2) - { - for (size_t colUV = 0, colY = 0, col_hue = 0; colY < bodyWidth; colY += DA, colUV += A, col_hue += DA) - { - __m256i u_ = LoadPermuted((__m256i*)(u + colUV)); - __m256i v_ = LoadPermuted((__m256i*)(v + colUV)); - Yuv420pToHue(y + colY, u_, v_, hue + col_hue, KF_255_DIV_6); - Yuv420pToHue(y + yStride + colY, u_, v_, hue + hueStride + col_hue, KF_255_DIV_6); - } - if (tail) - { - size_t offset = width - DA; - __m256i u_ = LoadPermuted((__m256i*)(u + offset / 2)); - __m256i v_ = LoadPermuted((__m256i*)(v + offset / 2)); - Yuv420pToHue(y + offset, u_, v_, hue + offset, KF_255_DIV_6); - Yuv420pToHue(y + yStride + offset, u_, v_, hue + hueStride + offset, KF_255_DIV_6); - } - y += 2 * yStride; - u += uStride; - v += vStride; - hue += 2 * hueStride; - } - } - - template void Yuv444pToHue(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * hue, size_t hueStride) - { - assert(width >= A); - if (align) - { - assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride)); - assert(Aligned(v) && Aligned(vStride) && Aligned(hue) && Aligned(hueStride)); - } - - const __m256 KF_255_DIV_6 = _mm256_set1_ps(Base::KF_255_DIV_6); - - size_t bodyWidth = AlignLo(width, A); - size_t tail = width - bodyWidth; - for (size_t row = 0; row < height; row += 1) - { - for (size_t col = 0; col < bodyWidth; col += A) - { - Store((__m256i*)(hue + col), YuvToHue8(Load((__m256i*)(y + col)), - Load((__m256i*)(u + col)), Load((__m256i*)(v + col)), KF_255_DIV_6)); - } - if (tail) - { - size_t offset = width - A; - Store((__m256i*)(hue + offset), YuvToHue8(Load((__m256i*)(y + offset)), - Load((__m256i*)(u + offset)), Load((__m256i*)(v + offset)), KF_255_DIV_6)); - } - y += yStride; - u += uStride; - v += vStride; - hue += hueStride; - } - } - - void Yuv420pToHue(const uint8_t * y, 
size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * hue, size_t hueStride) - { - if (Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride) && Aligned(v) && Aligned(vStride) && Aligned(hue) && Aligned(hueStride)) - Yuv420pToHue(y, yStride, u, uStride, v, vStride, width, height, hue, hueStride); - else - Yuv420pToHue(y, yStride, u, uStride, v, vStride, width, height, hue, hueStride); - } - - void Yuv444pToHue(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * hue, size_t hueStride) - { - if (Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride) && Aligned(v) && Aligned(vStride) && Aligned(hue) && Aligned(hueStride)) - Yuv444pToHue(y, yStride, u, uStride, v, vStride, width, height, hue, hueStride); - else - Yuv444pToHue(y, yStride, u, uStride, v, vStride, width, height, hue, hueStride); - } - } -#endif// SIMD_AVX2_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx512bw.h b/src/3rd/Simd/Simd/SimdAvx512bw.h deleted file mode 100644 index b240148f..00000000 --- a/src/3rd/Simd/Simd/SimdAvx512bw.h +++ /dev/null @@ -1,450 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#ifndef __SimdAvx512bw_h__ -#define __SimdAvx512bw_h__ - -#include "Simd/SimdDefs.h" - -namespace Simd -{ -#ifdef SIMD_AVX512BW_ENABLE - namespace Avx512bw - { - void AbsDifferenceSum(const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride, size_t width, size_t height, uint64_t * sum); - - void AbsDifferenceSumMasked(const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride, - const uint8_t *mask, size_t maskStride, uint8_t index, size_t width, size_t height, uint64_t * sum); - - void AbsDifferenceSums3x3(const uint8_t * current, size_t currentStride, const uint8_t * background, size_t backgroundStride, size_t width, size_t height, uint64_t * sums); - - void AbsDifferenceSums3x3Masked(const uint8_t *current, size_t currentStride, const uint8_t *background, size_t backgroundStride, - const uint8_t *mask, size_t maskStride, uint8_t index, size_t width, size_t height, uint64_t * sums); - - void AbsGradientSaturatedSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride); - - void AddFeatureDifference(const uint8_t * value, size_t valueStride, size_t width, size_t height, - const uint8_t * lo, size_t loStride, const uint8_t * hi, size_t hiStride, uint16_t weight, uint8_t * difference, size_t differenceStride); - - void AlphaBlending(const uint8_t *src, size_t srcStride, size_t width, size_t height, size_t channelCount, - const uint8_t *alpha, size_t alphaStride, uint8_t *dst, size_t dstStride); - - void AlphaFilling(uint8_t * dst, size_t dstStride, size_t width, size_t height, const uint8_t * channel, size_t channelCount, const uint8_t * alpha, size_t alphaStride); - - void BackgroundGrowRangeSlow(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * lo, size_t loStride, uint8_t * hi, size_t hiStride); - - void BackgroundGrowRangeFast(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * lo, size_t loStride, uint8_t * hi, size_t hiStride); - - void BackgroundIncrementCount(const uint8_t * value, size_t valueStride, size_t width, size_t height, - const uint8_t * loValue, size_t loValueStride, const uint8_t * hiValue, size_t hiValueStride, - uint8_t * loCount, size_t loCountStride, uint8_t * hiCount, size_t hiCountStride); - - void BackgroundAdjustRange(uint8_t * loCount, size_t loCountStride, size_t width, size_t height, - uint8_t * loValue, size_t loValueStride, uint8_t * hiCount, size_t hiCountStride, - uint8_t * hiValue, size_t hiValueStride, uint8_t threshold); - - void BackgroundAdjustRangeMasked(uint8_t * loCount, size_t loCountStride, size_t width, size_t height, - uint8_t * loValue, size_t loValueStride, uint8_t * hiCount, size_t hiCountStride, - uint8_t * hiValue, size_t hiValueStride, uint8_t threshold, const uint8_t * mask, size_t maskStride); - - void BackgroundShiftRange(const uint8_t * value, size_t valueStride, size_t width, size_t height, uint8_t * lo, size_t loStride, uint8_t * hi, size_t hiStride); - - void BackgroundShiftRangeMasked(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * lo, size_t loStride, uint8_t * hi, size_t hiStride, const uint8_t * mask, size_t maskStride); - - void BackgroundInitMask(const uint8_t * src, size_t srcStride, size_t width, size_t height, - uint8_t index, uint8_t value, uint8_t * dst, size_t dstStride); - - void BayerToBgr(const uint8_t * bayer, size_t width, size_t height, size_t bayerStride, SimdPixelFormatType bayerFormat, uint8_t * bgr, size_t bgrStride); - - void 
BayerToBgra(const uint8_t * bayer, size_t width, size_t height, size_t bayerStride, SimdPixelFormatType bayerFormat, uint8_t * bgra, size_t bgraStride, uint8_t alpha); - - void BgraToBayer(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * bayer, size_t bayerStride, SimdPixelFormatType bayerFormat); - - void BgraToBgr(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * bgr, size_t bgrStride); - - void BgraToGray(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * gray, size_t grayStride); - - void BgraToRgb(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgb, size_t rgbStride); - - void BgraToYuv420p(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * y, size_t yStride, uint8_t * u, size_t uStride, uint8_t * v, size_t vStride); - - void BgraToYuv422p(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * y, size_t yStride, uint8_t * u, size_t uStride, uint8_t * v, size_t vStride); - - void BgraToYuv444p(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * y, size_t yStride, uint8_t * u, size_t uStride, uint8_t * v, size_t vStride); - - void BgraToYuva420p(const uint8_t * bgra, size_t bgraStride, size_t width, size_t height, - uint8_t * y, size_t yStride, uint8_t * u, size_t uStride, uint8_t * v, size_t vStride, uint8_t * a, size_t aStride); - - void BgrToBayer(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bayer, size_t bayerStride, SimdPixelFormatType bayerFormat); - - void BgrToBgra(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha); - - void BgrToGray(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * gray, size_t grayStride); - - void BgrToRgb(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * rgb, size_t rgbStride); - - void BgrToYuv420p(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * y, size_t yStride, uint8_t * u, size_t uStride, uint8_t * v, size_t vStride); - - void BgrToYuv422p(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * y, size_t yStride, uint8_t * u, size_t uStride, uint8_t * v, size_t vStride); - - void BgrToYuv444p(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * y, size_t yStride, uint8_t * u, size_t uStride, uint8_t * v, size_t vStride); - - void Bgr48pToBgra32(const uint8_t * blue, size_t blueStride, size_t width, size_t height, - const uint8_t * green, size_t greenStride, const uint8_t * red, size_t redStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha); - - void Binarization(const uint8_t * src, size_t srcStride, size_t width, size_t height, - uint8_t value, uint8_t positive, uint8_t negative, uint8_t * dst, size_t dstStride, SimdCompareType compareType); - - void AveragingBinarization(const uint8_t * src, size_t srcStride, size_t width, size_t height, - uint8_t value, size_t neighborhood, uint8_t threshold, uint8_t positive, uint8_t negative, - uint8_t * dst, size_t dstStride, SimdCompareType compareType); - - void ConditionalCount8u(const uint8_t * src, size_t stride, size_t width, size_t height, uint8_t value, SimdCompareType compareType, uint32_t * count); - - void ConditionalCount16i(const uint8_t * src, size_t stride, size_t width, size_t height, int16_t value, SimdCompareType compareType, uint32_t * count); - - void 
ConditionalSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * mask, size_t maskStride, uint8_t value, SimdCompareType compareType, uint64_t * sum); - - void ConditionalSquareSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * mask, size_t maskStride, uint8_t value, SimdCompareType compareType, uint64_t * sum); - - void ConditionalSquareGradientSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * mask, size_t maskStride, uint8_t value, SimdCompareType compareType, uint64_t * sum); - - void ConditionalFill(const uint8_t * src, size_t srcStride, size_t width, size_t height, - uint8_t threshold, SimdCompareType compareType, uint8_t value, uint8_t * dst, size_t dstStride); - - void DeinterleaveUv(const uint8_t * uv, size_t uvStride, size_t width, size_t height, uint8_t * u, size_t uStride, uint8_t * v, size_t vStride); - - void DeinterleaveBgr(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride); - - void DeinterleaveBgra(const uint8_t * bgra, size_t bgraStride, size_t width, size_t height, uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride, uint8_t * a, size_t aStride); - - void DetectionHaarDetect32fp(const void * hid, const uint8_t * mask, size_t maskStride, - ptrdiff_t left, ptrdiff_t top, ptrdiff_t right, ptrdiff_t bottom, uint8_t * dst, size_t dstStride); - - void DetectionHaarDetect32fi(const void * hid, const uint8_t * mask, size_t maskStride, - ptrdiff_t left, ptrdiff_t top, ptrdiff_t right, ptrdiff_t bottom, uint8_t * dst, size_t dstStride); - - void DetectionLbpDetect32fp(const void * hid, const uint8_t * mask, size_t maskStride, - ptrdiff_t left, ptrdiff_t top, ptrdiff_t right, ptrdiff_t bottom, uint8_t * dst, size_t dstStride); - - void DetectionLbpDetect32fi(const void * hid, const uint8_t * mask, size_t maskStride, - ptrdiff_t left, ptrdiff_t top, ptrdiff_t right, ptrdiff_t bottom, uint8_t * dst, size_t dstStride); - - void DetectionLbpDetect16ip(const void * hid, const uint8_t * mask, size_t maskStride, - ptrdiff_t left, ptrdiff_t top, ptrdiff_t right, ptrdiff_t bottom, uint8_t * dst, size_t dstStride); - - void DetectionLbpDetect16ii(const void * hid, const uint8_t * mask, size_t maskStride, - ptrdiff_t left, ptrdiff_t top, ptrdiff_t right, ptrdiff_t bottom, uint8_t * dst, size_t dstStride); - - void EdgeBackgroundGrowRangeSlow(const uint8_t * value, size_t valueStride, size_t width, size_t height, uint8_t * background, size_t backgroundStride); - - void EdgeBackgroundGrowRangeFast(const uint8_t * value, size_t valueStride, size_t width, size_t height, uint8_t * background, size_t backgroundStride); - - void EdgeBackgroundIncrementCount(const uint8_t * value, size_t valueStride, size_t width, size_t height, - const uint8_t * backgroundValue, size_t backgroundValueStride, uint8_t * backgroundCount, size_t backgroundCountStride); - - void EdgeBackgroundAdjustRange(uint8_t * backgroundCount, size_t backgroundCountStride, size_t width, size_t height, - uint8_t * backgroundValue, size_t backgroundValueStride, uint8_t threshold); - - void EdgeBackgroundAdjustRangeMasked(uint8_t * backgroundCount, size_t backgroundCountStride, size_t width, size_t height, - uint8_t * backgroundValue, size_t backgroundValueStride, uint8_t threshold, const uint8_t * mask, size_t maskStride); - - void EdgeBackgroundShiftRangeMasked(const uint8_t * value, 
size_t valueStride, size_t width, size_t height, - uint8_t * background, size_t backgroundStride, const uint8_t * mask, size_t maskStride); - - void FillBgr(uint8_t * dst, size_t stride, size_t width, size_t height, uint8_t blue, uint8_t green, uint8_t red); - - void FillBgra(uint8_t * dst, size_t stride, size_t width, size_t height, uint8_t blue, uint8_t green, uint8_t red, uint8_t alpha); - - void FillPixel(uint8_t * dst, size_t stride, size_t width, size_t height, const uint8_t * pixel, size_t pixelSize); - - void Float32ToFloat16(const float * src, size_t size, uint16_t * dst); - - void Float16ToFloat32(const uint16_t * src, size_t size, float * dst); - - void SquaredDifferenceSum16f(const uint16_t * a, const uint16_t * b, size_t size, float * sum); - - void CosineDistance16f(const uint16_t * a, const uint16_t * b, size_t size, float * distance); - - void CosineDistancesMxNa16f(size_t M, size_t N, size_t K, const uint16_t * const * A, const uint16_t * const * B, float * distances); - - void Float32ToUint8(const float * src, size_t size, const float * lower, const float * upper, uint8_t * dst); - - void Uint8ToFloat32(const uint8_t * src, size_t size, const float * lower, const float * upper, float * dst); - - void CosineDistance32f(const float * a, const float * b, size_t size, float * distance); - - void GaussianBlur3x3(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t channelCount, uint8_t * dst, size_t dstStride); - - void GrayToBgr(const uint8_t * gray, size_t width, size_t height, size_t grayStride, uint8_t * bgr, size_t bgrStride); - - void GrayToBgra(const uint8_t *gray, size_t width, size_t height, size_t grayStride, uint8_t *bgra, size_t bgraStride, uint8_t alpha); - - void AbsSecondDerivativeHistogram(const uint8_t *src, size_t width, size_t height, size_t stride, size_t step, size_t indent, uint32_t * histogram); - - void HistogramMasked(const uint8_t * src, size_t srcStride, size_t width, size_t height, const uint8_t * mask, size_t maskStride, uint8_t index, uint32_t * histogram); - - void HistogramConditional(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * mask, size_t maskStride, uint8_t value, SimdCompareType compareType, uint32_t * histogram); - - void ChangeColors(const uint8_t * src, size_t srcStride, size_t width, size_t height, const uint8_t * colors, uint8_t * dst, size_t dstStride); - - void NormalizeHistogram(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride); - - void HogDirectionHistograms(const uint8_t * src, size_t stride, size_t width, size_t height, - size_t cellX, size_t cellY, size_t quantization, float * histograms); - - void HogExtractFeatures(const uint8_t * src, size_t stride, size_t width, size_t height, float * features); - - void HogDeinterleave(const float * src, size_t srcStride, size_t width, size_t height, size_t count, float ** dst, size_t dstStride); - - void HogFilterSeparable(const float * src, size_t srcStride, size_t width, size_t height, const float * rowFilter, size_t rowSize, const float * colFilter, size_t colSize, float * dst, size_t dstStride, int add); - - void HogLiteExtractFeatures(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t cell, float * features, size_t featuresStride); - - void HogLiteFilterFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t 
maskStride, float * dst, size_t dstStride); - - void HogLiteResizeFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, float * dst, size_t dstStride, size_t dstWidth, size_t dstHeight); - - void HogLiteCompressFeatures(const float * src, size_t srcStride, size_t width, size_t height, const float * pca, float * dst, size_t dstStride); - - void HogLiteFilterSeparable(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * hFilter, size_t hSize, const float * vFilter, size_t vSize, float * dst, size_t dstStride, int add); - - void HogLiteCreateMask(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, const float * threshold, size_t scale, size_t size, uint32_t * dst, size_t dstStride); - - void Int16ToGray(const uint8_t * src, size_t width, size_t height, size_t srcStride, uint8_t * dst, size_t dstStride); - - void Integral(const uint8_t * src, size_t srcStride, size_t width, size_t height, - uint8_t * sum, size_t sumStride, uint8_t * sqsum, size_t sqsumStride, uint8_t * tilted, size_t tiltedStride, - SimdPixelFormatType sumFormat, SimdPixelFormatType sqsumFormat); - - void InterferenceIncrement(uint8_t * statistic, size_t stride, size_t width, size_t height, uint8_t increment, int16_t saturation); - - void InterferenceIncrementMasked(uint8_t * statistic, size_t statisticStride, size_t width, size_t height, - uint8_t increment, int16_t saturation, const uint8_t * mask, size_t maskStride, uint8_t index); - - void InterferenceDecrement(uint8_t * statistic, size_t stride, size_t width, size_t height, uint8_t decrement, int16_t saturation); - - void InterferenceDecrementMasked(uint8_t * statistic, size_t statisticStride, size_t width, size_t height, - uint8_t decrement, int16_t saturation, const uint8_t * mask, size_t maskStride, uint8_t index); - - void InterleaveUv(const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, size_t width, size_t height, uint8_t * uv, size_t uvStride); - - void InterleaveBgr(const uint8_t * b, size_t bStride, const uint8_t * g, size_t gStride, const uint8_t * r, size_t rStride, size_t width, size_t height, uint8_t * bgr, size_t bgrStride); - - void InterleaveBgra(const uint8_t * b, size_t bStride, const uint8_t * g, size_t gStride, const uint8_t * r, size_t rStride, const uint8_t * a, size_t aStride, size_t width, size_t height, uint8_t * bgra, size_t bgraStride); - - void Laplace(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride); - - void LaplaceAbs(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride); - - void LaplaceAbsSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum); - - void LbpEstimate(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride); - - void MeanFilter3x3(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t channelCount, uint8_t * dst, size_t dstStride); - - void MedianFilterRhomb3x3(const uint8_t * src, size_t srcStride, size_t width, size_t height, - size_t channelCount, uint8_t * dst, size_t dstStride); - - void MedianFilterRhomb5x5(const uint8_t * src, size_t srcStride, size_t width, size_t height, - size_t channelCount, uint8_t * dst, size_t dstStride); - - void MedianFilterSquare3x3(const uint8_t * src, size_t srcStride, size_t width, size_t height, - size_t channelCount, uint8_t * dst, size_t dstStride); - - void 
MedianFilterSquare5x5(const uint8_t * src, size_t srcStride, size_t width, size_t height, - size_t channelCount, uint8_t * dst, size_t dstStride); - - void NeuralConvert(const uint8_t * src, size_t srcStride, size_t width, size_t height, float * dst, size_t dstStride, int inversion); - - void OperationBinary8u(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, - size_t width, size_t height, size_t channelCount, uint8_t * dst, size_t dstStride, SimdOperationBinary8uType type); - - void OperationBinary16i(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, - size_t width, size_t height, uint8_t * dst, size_t dstStride, SimdOperationBinary16iType type); - - void VectorProduct(const uint8_t * vertical, const uint8_t * horizontal, uint8_t * dst, size_t stride, size_t width, size_t height); - - void ReduceColor2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount); - - void ReduceGray2x2(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride); - - void ReduceGray3x3(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride, int compensation); - - void ReduceGray4x4(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride); - - void ReduceGray5x5(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride, int compensation); - - void Reorder16bit(const uint8_t * src, size_t size, uint8_t * dst); - - void Reorder32bit(const uint8_t * src, size_t size, uint8_t * dst); - - void Reorder64bit(const uint8_t * src, size_t size, uint8_t * dst); - - void ResizeBilinear(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount); - - void RgbToBgra(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha); - - void RgbToGray(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* gray, size_t grayStride); - - void SegmentationChangeIndex(uint8_t * mask, size_t stride, size_t width, size_t height, uint8_t oldIndex, uint8_t newIndex); - - void SegmentationFillSingleHoles(uint8_t * mask, size_t stride, size_t width, size_t height, uint8_t index); - - void SegmentationPropagate2x2(const uint8_t * parent, size_t parentStride, size_t width, size_t height, - uint8_t * child, size_t childStride, const uint8_t * difference, size_t differenceStride, - uint8_t currentIndex, uint8_t invalidIndex, uint8_t emptyIndex, uint8_t differenceThreshold); - - void SegmentationShrinkRegion(const uint8_t * mask, size_t stride, size_t width, size_t height, uint8_t index, - ptrdiff_t * left, ptrdiff_t * top, ptrdiff_t * right, ptrdiff_t * bottom); - - void ShiftBilinear(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t channelCount, - const uint8_t * bkg, size_t bkgStride, const double * shiftX, const double * shiftY, - size_t cropLeft, size_t cropTop, size_t cropRight, size_t cropBottom, uint8_t * dst, size_t dstStride); - - void SobelDx(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride); - - void 
SobelDxAbs(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride); - - void SobelDxAbsSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum); - - void SobelDy(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride); - - void SobelDyAbs(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride); - - void SobelDyAbsSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum); - - void ContourMetrics(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride); - - void ContourMetricsMasked(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * mask, size_t maskStride, uint8_t indexMin, uint8_t * dst, size_t dstStride); - - void ContourAnchors(const uint8_t * src, size_t srcStride, size_t width, size_t height, - size_t step, int16_t threshold, uint8_t * dst, size_t dstStride); - - void SquaredDifferenceSum(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, - size_t width, size_t height, uint64_t * sum); - - void SquaredDifferenceSumMasked(const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride, - const uint8_t *mask, size_t maskStride, uint8_t index, size_t width, size_t height, uint64_t * sum); - - void GetStatistic(const uint8_t * src, size_t stride, size_t width, size_t height, - uint8_t * min, uint8_t * max, uint8_t * average); - - void GetMoments(const uint8_t * mask, size_t stride, size_t width, size_t height, uint8_t index, - uint64_t * area, uint64_t * x, uint64_t * y, uint64_t * xx, uint64_t * xy, uint64_t * yy); - - void GetObjectMoments(const uint8_t* src, size_t srcStride, size_t width, size_t height, const uint8_t* mask, size_t maskStride, uint8_t index, - uint64_t* n, uint64_t* s, uint64_t* sx, uint64_t* sy, uint64_t* sxx, uint64_t* sxy, uint64_t* syy); - - void GetRowSums(const uint8_t * src, size_t stride, size_t width, size_t height, uint32_t * sums); - - void GetColSums(const uint8_t * src, size_t stride, size_t width, size_t height, uint32_t * sums); - - void GetAbsDyRowSums(const uint8_t * src, size_t stride, size_t width, size_t height, uint32_t * sums); - - void GetAbsDxColSums(const uint8_t * src, size_t stride, size_t width, size_t height, uint32_t * sums); - - void ValueSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum); - - void SquareSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum); - - void ValueSquareSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * valueSum, uint64_t * squareSum); - - void CorrelationSum(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, size_t width, size_t height, uint64_t * sum); - - void StretchGray2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride); - - void SynetConvert32fTo8u(const float* src, size_t batch, size_t channels, size_t height, size_t width, SimdTensorFormatType format, const float* scale, const float* shift, uint8_t* dst, SimdSynetCompatibilityType compatibility); - - void SynetSetInput(const uint8_t * src, size_t width, size_t height, size_t stride, SimdPixelFormatType srcFormat, - const float * lower, const float * upper, float * dst, size_t channels, SimdTensorFormatType dstFormat); - - void SynetPoolingForwardMax8u(const uint8_t* 
src, size_t srcC, size_t srcH, size_t srcW, size_t kernelY, size_t kernelX, - size_t strideY, size_t strideX, size_t padY, size_t padX, uint8_t* dst, size_t dstH, size_t dstW, SimdTensorFormatType format); - - void TextureBoostedSaturatedGradient(const uint8_t * src, size_t srcStride, size_t width, size_t height, - uint8_t saturation, uint8_t boost, uint8_t * dx, size_t dxStride, uint8_t * dy, size_t dyStride); - - void TextureBoostedUv(const uint8_t * src, size_t srcStride, size_t width, size_t height, - uint8_t boost, uint8_t * dst, size_t dstStride); - - void TextureGetDifferenceSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * lo, size_t loStride, const uint8_t * hi, size_t hiStride, int64_t * sum); - - void TexturePerformCompensation(const uint8_t * src, size_t srcStride, size_t width, size_t height, - int shift, uint8_t * dst, size_t dstStride); - - void Yuva420pToBgra(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - const uint8_t * a, size_t aStride, size_t width, size_t height, uint8_t * bgra, size_t bgraStride); - - void Yuv420pToBgr(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * bgr, size_t bgrStride); - - void Yuv422pToBgr(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * bgr, size_t bgrStride); - - void Yuv444pToBgr(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * bgr, size_t bgrStride); - - void Yuv420pToBgra(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * bgra, size_t bgraStride, uint8_t alpha); - - void Yuv422pToBgra(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * bgra, size_t bgraStride, uint8_t alpha); - - void Yuv444pToBgra(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * bgra, size_t bgraStride, uint8_t alpha); - - void Yuv420pToHue(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * hue, size_t hueStride); - - void Yuv444pToHue(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * hue, size_t hueStride); - - void Yuv420pToRgb(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride, - size_t width, size_t height, uint8_t* rgb, size_t rgbStride); - - void Yuv422pToRgb(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride, - size_t width, size_t height, uint8_t* rgb, size_t rgbStride); - - void Yuv444pToRgb(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride, - size_t width, size_t height, uint8_t* rgb, size_t rgbStride); - } -#endif// SIMD_AVX512BW_ENABLE -} -#endif//__SimdAvx512bw_h__ diff --git a/src/3rd/Simd/Simd/SimdAvx512bwAbsDifferenceSum.cpp b/src/3rd/Simd/Simd/SimdAvx512bwAbsDifferenceSum.cpp deleted file mode 100644 index 7a8cbec9..00000000 --- 
a/src/3rd/Simd/Simd/SimdAvx512bwAbsDifferenceSum.cpp +++ /dev/null @@ -1,418 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2019 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdLoad.h" -#include "Simd/SimdMemory.h" -#include "Simd/SimdExtract.h" -#include "Simd/SimdSet.h" - -namespace Simd -{ -#ifdef SIMD_AVX512BW_ENABLE - namespace Avx512bw - { - template SIMD_INLINE void Sum(__m512i & sum, const __m512i & value); - - template <> SIMD_INLINE void Sum<32>(__m512i & sum, const __m512i & value) - { - sum = _mm512_add_epi32(sum, value); - } - - template <> SIMD_INLINE void Sum<64>(__m512i & sum, const __m512i & value) - { - sum = _mm512_add_epi64(sum, value); - } - - template void AbsDifferenceSum4(const uint8_t * a, const uint8_t * b, __m512i * sums) - { - Sum(sums[0], _mm512_sad_epu8(Load(a + 0 * A), Load(b + 0 * A))); - Sum(sums[1], _mm512_sad_epu8(Load(a + 1 * A), Load(b + 1 * A))); - Sum(sums[0], _mm512_sad_epu8(Load(a + 2 * A), Load(b + 2 * A))); - Sum(sums[1], _mm512_sad_epu8(Load(a + 3 * A), Load(b + 3 * A))); - } - - template void AbsDifferenceSum1(const uint8_t * a, const uint8_t * b, __m512i * sums, __mmask64 tail = -1) - { - const __m512i a0 = Load(a, tail); - const __m512i b0 = Load(b, tail); - Sum(sums[0], _mm512_sad_epu8(a0, b0)); - } - - template void AbsDifferenceSum(const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride, size_t width, size_t height, uint64_t * sum) - { - if (align) - assert(Aligned(a) && Aligned(aStride) && Aligned(b) && Aligned(bStride)); - - size_t fullAlignedWidth = AlignLo(width, QA); - size_t alignedWidth = AlignLo(width, A); - __mmask64 tailMask = TailMask64(width - alignedWidth); - __m512i sums[2] = { _mm512_setzero_si512(), _mm512_setzero_si512() }; - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < fullAlignedWidth; col += QA) - AbsDifferenceSum4(a + col, b + col, sums); - for (; col < alignedWidth; col += A) - AbsDifferenceSum1(a + col, b + col, sums); - if (col < width) - AbsDifferenceSum1(a + col, b + col, sums, tailMask); - a += aStride; - b += bStride; - } - *sum = ExtractSum(_mm512_add_epi64(sums[0], sums[1])); - } - - template void AbsDifferenceSum(const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride, size_t width, size_t height, uint64_t * sum) - { - if (width*height >= 256 * 256 * 256 * 8) - AbsDifferenceSum(a, aStride, b, bStride, width, height, sum); - else - AbsDifferenceSum(a, aStride, b, 
bStride, width, height, sum); - } - - void AbsDifferenceSum(const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride, - size_t width, size_t height, uint64_t * sum) - { - if (Aligned(a) && Aligned(aStride) && Aligned(b) && Aligned(bStride)) - AbsDifferenceSum(a, aStride, b, bStride, width, height, sum); - else - AbsDifferenceSum(a, aStride, b, bStride, width, height, sum); - } - - template void AbsDifferenceSumMasked(const uint8_t * a, const uint8_t * b, const uint8_t * m, const __m512i & index, __m512i * sums) - { - __mmask64 m0 = _mm512_cmpeq_epu8_mask(Load(m), index); - __m512i a0 = Load(a, m0); - __m512i b0 = Load(b, m0); - Sum(sums[0], _mm512_sad_epu8(a0, b0)); - } - - template void AbsDifferenceSumMasked4(const uint8_t * a, const uint8_t * b, const uint8_t * m, const __m512i & index, __m512i * sums) - { - AbsDifferenceSumMasked(a + 0 * A, b + 0 * A, m + 0 * A, index, sums + 0); - AbsDifferenceSumMasked(a + 1 * A, b + 1 * A, m + 1 * A, index, sums + 1); - AbsDifferenceSumMasked(a + 2 * A, b + 2 * A, m + 2 * A, index, sums + 0); - AbsDifferenceSumMasked(a + 3 * A, b + 3 * A, m + 3 * A, index, sums + 1); - } - - template void AbsDifferenceSumMasked1(const uint8_t * a, const uint8_t * b, const uint8_t * m, __m512i & index, __m512i * sums, __mmask64 mm = -1) - { - __mmask64 m0 = _mm512_cmpeq_epu8_mask((Load(m, mm)), index) & mm; - const __m512i a0 = Load(a, m0); - const __m512i b0 = Load(b, m0); - Sum(sums[0], _mm512_sad_epu8(a0, b0)); - } - - template void AbsDifferenceSumMasked(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, - const uint8_t * mask, size_t maskStride, uint8_t index, size_t width, size_t height, uint64_t * sum) - { - if (align) - { - assert(Aligned(a) && Aligned(aStride) && Aligned(b) && Aligned(bStride)); - assert(Aligned(mask) && Aligned(maskStride)); - } - - __m512i _index = _mm512_set1_epi8(index); - size_t fullAlignedWidth = AlignLo(width, QA); - size_t alignedWidth = AlignLo(width, A); - __mmask64 tailMask = TailMask64(width - alignedWidth); - __m512i sums[2] = { _mm512_setzero_si512(), _mm512_setzero_si512() }; - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < fullAlignedWidth; col += QA) - AbsDifferenceSumMasked4(a + col, b + col, mask + col, _index, sums); - for (; col < alignedWidth; col += A) - AbsDifferenceSumMasked1(a + col, b + col, mask + col, _index, sums); - if (col < width) - AbsDifferenceSumMasked1(a + col, b + col, mask + col, _index, sums, tailMask); - a += aStride; - b += bStride; - mask += maskStride; - } - sums[0] = _mm512_add_epi64(sums[0], sums[1]); - *sum = ExtractSum(sums[0]); - } - - template void AbsDifferenceSumMasked(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, - const uint8_t * mask, size_t maskStride, uint8_t index, size_t width, size_t height, uint64_t * sum) - { - if (width*height >= 256 * 256 * 256 * 8) - AbsDifferenceSumMasked(a, aStride, b, bStride, mask, maskStride, index, width, height, sum); - else - AbsDifferenceSumMasked(a, aStride, b, bStride, mask, maskStride, index, width, height, sum); - } - - void AbsDifferenceSumMasked(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, - const uint8_t * mask, size_t maskStride, uint8_t index, size_t width, size_t height, uint64_t * sum) - { - if (Aligned(a) && Aligned(aStride) && Aligned(b) && Aligned(bStride) && Aligned(mask) && Aligned(maskStride)) - AbsDifferenceSumMasked(a, aStride, b, bStride, mask, maskStride, index, width, height, sum); - else - AbsDifferenceSumMasked(a, 
aStride, b, bStride, mask, maskStride, index, width, height, sum); - } - - template void AbsDifferenceSums3(__m512i current, const uint8_t * background, __m512i sums[3], __mmask64 m = -1) - { - Sum(sums[0], _mm512_sad_epu8(current, Load(background - 1, m))); - Sum(sums[1], _mm512_sad_epu8(current, Load(background, m))); - Sum(sums[2], _mm512_sad_epu8(current, Load(background + 1, m))); - } - - template void AbsDifferenceSums3x3(const uint8_t * current, const uint8_t * background, size_t backgroundStride, __m512i sums[9], __mmask64 m = -1) - { - const __m512i _current = Load(current, m); - AbsDifferenceSums3(_current, background - backgroundStride, sums + 0, m); - AbsDifferenceSums3(_current, background, sums + 3, m); - AbsDifferenceSums3(_current, background + backgroundStride, sums + 6, m); - } - - template void AbsDifferenceSums3x3x2(const uint8_t * current0, size_t currentStride, const uint8_t * background1, size_t backgroundStride, __m512i sums[9], __mmask64 m = -1) - { - const __m512i current00 = Load(current0, m); - const uint8_t * background0 = background1 - backgroundStride; - const __m512i background00 = Load(background0 - 1, m); - const __m512i background01 = Load(background0, m); - const __m512i background02 = Load(background0 + 1, m); - Sum(sums[0], _mm512_sad_epu8(current00, background00)); - Sum(sums[1], _mm512_sad_epu8(current00, background01)); - Sum(sums[2], _mm512_sad_epu8(current00, background02)); - const uint8_t * current1 = current0 + currentStride; - const __m512i current10 = Load(current1, m); - const __m512i background10 = Load(background1 - 1, m); - const __m512i background11 = Load(background1, m); - const __m512i background12 = Load(background1 + 1, m); - Sum(sums[0], _mm512_sad_epu8(current10, background10)); - Sum(sums[1], _mm512_sad_epu8(current10, background11)); - Sum(sums[2], _mm512_sad_epu8(current10, background12)); - Sum(sums[3], _mm512_sad_epu8(current00, background10)); - Sum(sums[4], _mm512_sad_epu8(current00, background11)); - Sum(sums[5], _mm512_sad_epu8(current00, background12)); - const uint8_t * background2 = background1 + backgroundStride; - const __m512i background20 = Load(background2 - 1, m); - const __m512i background21 = Load(background2, m); - const __m512i background22 = Load(background2 + 1, m); - Sum(sums[3], _mm512_sad_epu8(current10, background20)); - Sum(sums[4], _mm512_sad_epu8(current10, background21)); - Sum(sums[5], _mm512_sad_epu8(current10, background22)); - Sum(sums[6], _mm512_sad_epu8(current00, background20)); - Sum(sums[7], _mm512_sad_epu8(current00, background21)); - Sum(sums[8], _mm512_sad_epu8(current00, background22)); - const uint8_t * background3 = background2 + backgroundStride; - const __m512i background30 = Load(background3 - 1, m); - const __m512i background31 = Load(background3, m); - const __m512i background32 = Load(background3 + 1, m); - Sum(sums[6], _mm512_sad_epu8(current10, background30)); - Sum(sums[7], _mm512_sad_epu8(current10, background31)); - Sum(sums[8], _mm512_sad_epu8(current10, background32)); - } - - template void AbsDifferenceSums3x3(const uint8_t * current, size_t currentStride, - const uint8_t * background, size_t backgroundStride, size_t width, size_t height, uint64_t * sums) - { - if (align) - assert(Aligned(background) && Aligned(backgroundStride)); - - width -= 2; - height -= 2; - current += 1 + currentStride; - background += 1 + backgroundStride; - - size_t alignedHeight = AlignLo(height, 2); - size_t alignedWidth = AlignLo(width, A); - __mmask64 tailMask = __mmask64(-1) >> (A + 
alignedWidth - width); - __m512i _sums[9]; - for (size_t i = 0; i < 9; ++i) - _sums[i] = _mm512_setzero_si512(); - - size_t row = 0; -#if SIMD_ZMM_COUNT == 32 - for (; row < alignedHeight; row += 2) - { - size_t col = 0; - for (; col < alignedWidth; col += A) - AbsDifferenceSums3x3x2(current + col, currentStride, background + col, backgroundStride, _sums); - if (col < width) - AbsDifferenceSums3x3x2(current + col, currentStride, background + col, backgroundStride, _sums, tailMask); - current += 2 * currentStride; - background += 2 * backgroundStride; - } -#endif - for (; row < height; ++row) - { - size_t col = 0; - for (; col < alignedWidth; col += A) - AbsDifferenceSums3x3(current + col, background + col, backgroundStride, _sums); - if (col < width) - AbsDifferenceSums3x3(current + col, background + col, backgroundStride, _sums, tailMask); - current += currentStride; - background += backgroundStride; - } - - for (size_t i = 0; i < 9; ++i) - sums[i] = ExtractSum(_sums[i]); - } - - template void AbsDifferenceSums3x3(const uint8_t * current, size_t currentStride, const uint8_t * background, size_t backgroundStride, - size_t width, size_t height, uint64_t * sums) - { - if (width*height >= 256 * 256 * 256 * 8) - AbsDifferenceSums3x3(current, currentStride, background, backgroundStride, width, height, sums); - else - AbsDifferenceSums3x3(current, currentStride, background, backgroundStride, width, height, sums); - } - - void AbsDifferenceSums3x3(const uint8_t * current, size_t currentStride, const uint8_t * background, size_t backgroundStride, - size_t width, size_t height, uint64_t * sums) - { - if (Aligned(background) && Aligned(backgroundStride)) - AbsDifferenceSums3x3(current, currentStride, background, backgroundStride, width, height, sums); - else - AbsDifferenceSums3x3(current, currentStride, background, backgroundStride, width, height, sums); - } - - template void AbsDifferenceSums3x3Masked(const uint8_t * current, const uint8_t * background, size_t backgroundStride, const uint8_t * m, const __m512i & index, __m512i sums[9], __mmask64 mm = -1) - { - __mmask64 m0 = _mm512_cmpeq_epu8_mask((Load(m, mm)), index) & mm; - const __m512i _current = Load(current, m0); - AbsDifferenceSums3(_current, background - backgroundStride, sums + 0, m0); - AbsDifferenceSums3(_current, background, sums + 3, m0); - AbsDifferenceSums3(_current, background + backgroundStride, sums + 6, m0); - } - - template void AbsDifferenceSums3x3x2(const uint8_t * current0, size_t currentStride, const uint8_t * background1, size_t backgroundStride, - const uint8_t * mask0, size_t maskStride, const __m512i & index, __m512i sums[9], __mmask64 mm = -1) - { - __mmask64 m0 = mm & _mm512_cmpeq_epu8_mask((Load(mask0, mm)), index); - __m512i mask00 = _mm512_maskz_set1_epi8(m0, -1); - const __m512i current00 = Load(current0, m0); - const uint8_t * background0 = background1 - backgroundStride; - const __m512i background00 = Load(background0 - 1, m0); - const __m512i background01 = Load(background0, m0); - const __m512i background02 = Load(background0 + 1, m0); - Sum(sums[0], _mm512_sad_epu8(current00, _mm512_and_si512(mask00, background00))); - Sum(sums[1], _mm512_sad_epu8(current00, _mm512_and_si512(mask00, background01))); - Sum(sums[2], _mm512_sad_epu8(current00, _mm512_and_si512(mask00, background02))); - const uint8_t * mask1 = mask0 + maskStride; - __mmask64 m1 = mm & _mm512_cmpeq_epu8_mask((Load(mask1, mm)), index); - __m512i mask10 = _mm512_maskz_set1_epi8(m1, -1); - const uint8_t * current1 = current0 + currentStride; - 
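/* Note on the masked variant: Load(ptr, m) is a zero-masked load, so pixels of
   current0/current1 whose mask byte differs from `index` read as zero, while
   _mm512_maskz_set1_epi8(m, -1) expands the same __mmask64 into a byte mask
   (0xFF inside the region, 0x00 outside). ANDing the background rows with that
   byte mask zeroes the identical positions on the other operand, so
   _mm512_sad_epu8 sees |0 - 0| = 0 there and excluded pixels contribute
   nothing to any of the nine shifted sums. */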
const __m512i current10 = Load(current1, m1); - const __m512i background10 = Load(background1 - 1); - const __m512i background11 = Load(background1); - const __m512i background12 = Load(background1 + 1); - Sum(sums[0], _mm512_sad_epu8(current10, _mm512_and_si512(mask10, background10))); - Sum(sums[1], _mm512_sad_epu8(current10, _mm512_and_si512(mask10, background11))); - Sum(sums[2], _mm512_sad_epu8(current10, _mm512_and_si512(mask10, background12))); - Sum(sums[3], _mm512_sad_epu8(current00, _mm512_and_si512(mask00, background10))); - Sum(sums[4], _mm512_sad_epu8(current00, _mm512_and_si512(mask00, background11))); - Sum(sums[5], _mm512_sad_epu8(current00, _mm512_and_si512(mask00, background12))); - const uint8_t * background2 = background1 + backgroundStride; - const __m512i background20 = Load(background2 - 1); - const __m512i background21 = Load(background2); - const __m512i background22 = Load(background2 + 1); - Sum(sums[3], _mm512_sad_epu8(current10, _mm512_and_si512(mask10, background20))); - Sum(sums[4], _mm512_sad_epu8(current10, _mm512_and_si512(mask10, background21))); - Sum(sums[5], _mm512_sad_epu8(current10, _mm512_and_si512(mask10, background22))); - Sum(sums[6], _mm512_sad_epu8(current00, _mm512_and_si512(mask00, background20))); - Sum(sums[7], _mm512_sad_epu8(current00, _mm512_and_si512(mask00, background21))); - Sum(sums[8], _mm512_sad_epu8(current00, _mm512_and_si512(mask00, background22))); - const uint8_t * background3 = background2 + backgroundStride; - const __m512i background30 = Load(background3 - 1, m1); - const __m512i background31 = Load(background3, m1); - const __m512i background32 = Load(background3 + 1, m1); - Sum(sums[6], _mm512_sad_epu8(current10, background30)); - Sum(sums[7], _mm512_sad_epu8(current10, background31)); - Sum(sums[8], _mm512_sad_epu8(current10, background32)); - } - - template void AbsDifferenceSums3x3Masked(const uint8_t * current, size_t currentStride, const uint8_t * background, size_t backgroundStride, - const uint8_t * mask, size_t maskStride, uint8_t index, size_t width, size_t height, uint64_t * sums) - { - if (align) - assert(Aligned(background) && Aligned(backgroundStride)); - - width -= 2; - height -= 2; - current += 1 + currentStride; - background += 1 + backgroundStride; - mask += 1 + maskStride; - - __m512i _index = _mm512_set1_epi8(index); - size_t alignedHeight = AlignLo(height, 2); - size_t alignedWidth = AlignLo(width, A); - __mmask64 tailMask = __mmask64(-1) >> (A + alignedWidth - width); - __m512i _sums[9]; - for (size_t i = 0; i < 9; ++i) - _sums[i] = _mm512_setzero_si512(); - - size_t row = 0; -#if SIMD_ZMM_COUNT == 32 - for (; row < alignedHeight; row += 2) - { - size_t col = 0; - for (; col < alignedWidth; col += A) - AbsDifferenceSums3x3x2(current + col, currentStride, background + col, backgroundStride, mask + col, maskStride, _index, _sums); - if (col < width) - AbsDifferenceSums3x3x2(current + col, currentStride, background + col, backgroundStride, mask + col, maskStride, _index, _sums, tailMask); - current += 2 * currentStride; - background += 2 * backgroundStride; - mask += 2 * maskStride; - } -#endif - for (; row < height; ++row) - { - size_t col = 0; - for (; col < alignedWidth; col += A) - AbsDifferenceSums3x3Masked(current + col, background + col, backgroundStride, mask + col, _index, _sums); - if (col < width) - AbsDifferenceSums3x3Masked(current + col, background + col, backgroundStride, mask + col, _index, _sums, tailMask); - current += currentStride; - background += backgroundStride; - mask += 
maskStride; - } - - for (size_t i = 0; i < 9; ++i) - sums[i] = ExtractSum(_sums[i]); - } - - template void AbsDifferenceSums3x3Masked(const uint8_t * current, size_t currentStride, const uint8_t * background, size_t backgroundStride, - const uint8_t * mask, size_t maskStride, uint8_t index, size_t width, size_t height, uint64_t * sums) - { - if (width*height >= 256 * 256 * 256 * 8) - AbsDifferenceSums3x3Masked(current, currentStride, background, backgroundStride, mask, maskStride, index, width, height, sums); - else - AbsDifferenceSums3x3Masked(current, currentStride, background, backgroundStride, mask, maskStride, index, width, height, sums); - } - - void AbsDifferenceSums3x3Masked(const uint8_t * current, size_t currentStride, const uint8_t * background, size_t backgroundStride, - const uint8_t * mask, size_t maskStride, uint8_t index, size_t width, size_t height, uint64_t * sums) - { - if (Aligned(background) && Aligned(backgroundStride)) - AbsDifferenceSums3x3Masked(current, currentStride, background, backgroundStride, mask, maskStride, index, width, height, sums); - else - AbsDifferenceSums3x3Masked(current, currentStride, background, backgroundStride, mask, maskStride, index, width, height, sums); - } - } -#endif// SIMD_AVX512BW_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx512bwAbsGradientSaturatedSum.cpp b/src/3rd/Simd/Simd/SimdAvx512bwAbsGradientSaturatedSum.cpp deleted file mode 100644 index b4c6fe2a..00000000 --- a/src/3rd/Simd/Simd/SimdAvx512bwAbsGradientSaturatedSum.cpp +++ /dev/null @@ -1,77 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdMath.h" -#include "Simd/SimdStore.h" - -namespace Simd -{ -#ifdef SIMD_AVX512BW_ENABLE - namespace Avx512bw - { - template SIMD_INLINE void AbsGradientSaturatedSum(const uint8_t * src, size_t stride, uint8_t * dst, __mmask64 m = -1) - { - const __m512i s10 = Load(src - 1, m); - const __m512i s12 = Load(src + 1, m); - const __m512i s01 = Load(src - stride, m); - const __m512i s21 = Load(src + stride, m); - const __m512i dx = AbsDifferenceU8(s10, s12); - const __m512i dy = AbsDifferenceU8(s01, s21); - Store(dst, _mm512_adds_epu8(dx, dy), m); - } - - template void AbsGradientSaturatedSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride) - { - size_t alignedWidth = AlignLo(width, A); - __mmask64 tailMask = __mmask64(-1) >> (A + alignedWidth - width); - memset(dst, 0, width); - src += srcStride; - dst += dstStride; - for (size_t row = 2; row < height; ++row) - { - size_t col = 0; - for (; col < alignedWidth; col += A) - AbsGradientSaturatedSum(src + col, srcStride, dst + col); - if (col < width) - AbsGradientSaturatedSum(src + col, srcStride, dst + col, tailMask); - - dst[0] = 0; - dst[width - 1] = 0; - - src += srcStride; - dst += dstStride; - } - memset(dst, 0, width); - } - - void AbsGradientSaturatedSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride)) - AbsGradientSaturatedSum(src, srcStride, width, height, dst, dstStride); - else - AbsGradientSaturatedSum(src, srcStride, width, height, dst, dstStride); - } - } -#endif// SIMD_AVX512BW_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx512bwAddFeatureDifference.cpp b/src/3rd/Simd/Simd/SimdAvx512bwAddFeatureDifference.cpp deleted file mode 100644 index cf7159dc..00000000 --- a/src/3rd/Simd/Simd/SimdAvx512bwAddFeatureDifference.cpp +++ /dev/null @@ -1,104 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdSet.h" - -namespace Simd -{ -#ifdef SIMD_AVX512BW_ENABLE - namespace Avx512bw - { - SIMD_INLINE __m512i FeatureDifference(__m512i value, __m512i lo, __m512i hi) - { - return _mm512_max_epu8(_mm512_subs_epu8(value, hi), _mm512_subs_epu8(lo, value)); - } - - SIMD_INLINE __m512i ShiftedWeightedSquare16(__m512i difference, __m512i weight) - { - return _mm512_mulhi_epu16(_mm512_mullo_epi16(difference, difference), weight); - } - - SIMD_INLINE __m512i ShiftedWeightedSquare8(__m512i difference, __m512i weight) - { - const __m512i lo = ShiftedWeightedSquare16(UnpackU8<0>(difference), weight); - const __m512i hi = ShiftedWeightedSquare16(UnpackU8<1>(difference), weight); - return _mm512_packus_epi16(lo, hi); - } - - template SIMD_INLINE void AddFeatureDifference(const uint8_t * value, const uint8_t * lo, const uint8_t * hi, - uint8_t * difference, size_t offset, __m512i weight, __mmask64 m = -1) - { - const __m512i _value = Load(value + offset, m); - const __m512i _lo = Load(lo + offset, m); - const __m512i _hi = Load(hi + offset, m); - __m512i _difference = Load(difference + offset, m); - - const __m512i featureDifference = FeatureDifference(_value, _lo, _hi); - const __m512i inc = ShiftedWeightedSquare8(featureDifference, weight); - Store(difference + offset, _mm512_adds_epu8(_difference, inc), m); - } - - template void AddFeatureDifference(const uint8_t * value, size_t valueStride, size_t width, size_t height, - const uint8_t * lo, size_t loStride, const uint8_t * hi, size_t hiStride, - uint16_t weight, uint8_t * difference, size_t differenceStride) - { - if (align) - { - assert(Aligned(value) && Aligned(valueStride)); - assert(Aligned(lo) && Aligned(loStride)); - assert(Aligned(hi) && Aligned(hiStride)); - assert(Aligned(difference) && Aligned(differenceStride)); - } - - size_t alignedWidth = AlignLo(width, A); - __mmask64 tailMask = __mmask64(-1) >> (A + alignedWidth - width); - __m512i _weight = _mm512_set1_epi16((short)weight); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < alignedWidth; col += A) - AddFeatureDifference(value, lo, hi, difference, col, _weight); - if (col < width) - AddFeatureDifference(value, lo, hi, difference, col, _weight, tailMask); - value += valueStride; - lo += loStride; - hi += hiStride; - difference += differenceStride; - } - } - - void AddFeatureDifference(const uint8_t * value, size_t valueStride, size_t width, size_t height, - const uint8_t * lo, size_t loStride, const uint8_t * hi, size_t hiStride, - uint16_t weight, uint8_t * difference, size_t differenceStride) - { - if (Aligned(value) && Aligned(valueStride) && Aligned(lo) && Aligned(loStride) && - Aligned(hi) && Aligned(hiStride) && Aligned(difference) && Aligned(differenceStride)) - AddFeatureDifference(value, valueStride, width, height, lo, loStride, hi, hiStride, weight, difference, differenceStride); - else - AddFeatureDifference(value, valueStride, width, height, lo, loStride, hi, hiStride, weight, difference, differenceStride); - } - } -#endif// SIMD_AVX512BW_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx512bwAlphaBlending.cpp b/src/3rd/Simd/Simd/SimdAvx512bwAlphaBlending.cpp deleted file mode 100644 index d1e01095..00000000 --- a/src/3rd/Simd/Simd/SimdAvx512bwAlphaBlending.cpp +++ /dev/null @@ -1,278 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2018 Yermalayeu Ihar. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdStore.h" -#include "Simd/SimdMemory.h" -#include "Simd/SimdConversion.h" -#include "Simd/SimdSet.h" - -namespace Simd -{ -#ifdef SIMD_AVX512BW_ENABLE - namespace Avx512bw - { - SIMD_INLINE __m512i AlphaBlendingI16(const __m512i & src, const __m512i & dst, const __m512i & alpha) - { - return DivideI16By255(_mm512_add_epi16(_mm512_mullo_epi16(src, alpha), _mm512_mullo_epi16(dst, _mm512_sub_epi16(K16_00FF, alpha)))); - } - - template SIMD_INLINE void AlphaBlending(const uint8_t * src, uint8_t * dst, const __m512i & alpha, __mmask64 m) - { - __m512i _src = Load(src, m); - __m512i _dst = Load(dst, m); - __m512i lo = AlphaBlendingI16(UnpackU8<0>(_src), UnpackU8<0>(_dst), UnpackU8<0>(alpha)); - __m512i hi = AlphaBlendingI16(UnpackU8<1>(_src), UnpackU8<1>(_dst), UnpackU8<1>(alpha)); - Store(dst, _mm512_packus_epi16(lo, hi), m); - } - - template struct AlphaBlender - { - void operator()(const uint8_t * src, uint8_t * dst, const uint8_t * alpha, __mmask64 m[channelCount + 1]); - }; - - template struct AlphaBlender - { - SIMD_INLINE void operator()(const uint8_t * src, uint8_t * dst, const uint8_t * alpha, __mmask64 m[2]) - { - __m512i _alpha = Load(alpha, m[0]); - AlphaBlending(src, dst, _alpha, m[1]); - } - }; - - template struct AlphaBlender - { - SIMD_INLINE void operator()(const uint8_t * src, uint8_t * dst, const uint8_t * alpha, __mmask64 m[3]) - { - __m512i _alpha = Load(alpha, m[0]); - _alpha = _mm512_permutexvar_epi64(K64_PERMUTE_FOR_UNPACK, _alpha); - AlphaBlending(src + 0, dst + 0, UnpackU8<0>(_alpha, _alpha), m[1]); - AlphaBlending(src + A, dst + A, UnpackU8<1>(_alpha, _alpha), m[2]); - } - }; - - template struct AlphaBlender - { - SIMD_INLINE void operator()(const uint8_t * src, uint8_t * dst, const uint8_t * alpha, __mmask64 m[4]) - { - __m512i _alpha = Load(alpha, m[0]); - AlphaBlending(src + 0 * A, dst + 0 * A, GrayToBgr<0>(_alpha), m[1]); - AlphaBlending(src + 1 * A, dst + 1 * A, GrayToBgr<1>(_alpha), m[2]); - AlphaBlending(src + 2 * A, dst + 2 * A, GrayToBgr<2>(_alpha), m[3]); - } - }; - - template struct AlphaBlender - { - SIMD_INLINE void operator()(const uint8_t * src, uint8_t * dst, const uint8_t * alpha, __mmask64 m[5]) - { - __m512i _alpha = Load(alpha, m[0]); - _alpha = _mm512_permutexvar_epi32(K32_PERMUTE_FOR_TWO_UNPACK, _alpha); - __m512i lo = UnpackU8<0>(_alpha, _alpha); - AlphaBlending(src + 0 * A, dst + 0 * A, UnpackU8<0>(lo, lo), m[1]); - AlphaBlending(src + 1 * A, dst + 1 * A, 
UnpackU8<1>(lo, lo), m[2]); - __m512i hi = UnpackU8<1>(_alpha, _alpha); - AlphaBlending(src + 2 * A, dst + 2 * A, UnpackU8<0>(hi, hi), m[3]); - AlphaBlending(src + 3 * A, dst + 3 * A, UnpackU8<1>(hi, hi), m[4]); - } - }; - - template void AlphaBlending(const uint8_t *src, size_t srcStride, size_t width, size_t height, - const uint8_t *alpha, size_t alphaStride, uint8_t *dst, size_t dstStride) - { - size_t alignedWidth = AlignLo(width, A); - __mmask64 tailMasks[channelCount + 1]; - tailMasks[0] = TailMask64(width - alignedWidth); - for (size_t channel = 0; channel < channelCount; ++channel) - tailMasks[channel + 1] = TailMask64((width - alignedWidth)*channelCount - A * channel); - size_t step = channelCount * A; - for (size_t row = 0; row < height; ++row) - { - size_t col = 0, offset = 0; - for (; col < alignedWidth; col += A, offset += step) - AlphaBlender()(src + offset, dst + offset, alpha + col, tailMasks); - if (col < width) - AlphaBlender()(src + offset, dst + offset, alpha + col, tailMasks); - src += srcStride; - alpha += alphaStride; - dst += dstStride; - } - } - - template void AlphaBlending(const uint8_t *src, size_t srcStride, size_t width, size_t height, size_t channelCount, - const uint8_t *alpha, size_t alphaStride, uint8_t *dst, size_t dstStride) - { - if (align) - { - assert(Aligned(src) && Aligned(srcStride)); - assert(Aligned(alpha) && Aligned(alphaStride)); - assert(Aligned(dst) && Aligned(dstStride)); - } - - switch (channelCount) - { - case 1: AlphaBlending(src, srcStride, width, height, alpha, alphaStride, dst, dstStride); break; - case 2: AlphaBlending(src, srcStride, width, height, alpha, alphaStride, dst, dstStride); break; - case 3: AlphaBlending(src, srcStride, width, height, alpha, alphaStride, dst, dstStride); break; - case 4: AlphaBlending(src, srcStride, width, height, alpha, alphaStride, dst, dstStride); break; - default: - assert(0); - } - } - - void AlphaBlending(const uint8_t *src, size_t srcStride, size_t width, size_t height, size_t channelCount, - const uint8_t *alpha, size_t alphaStride, uint8_t *dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride) && Aligned(alpha) && Aligned(alphaStride) && Aligned(dst) && Aligned(dstStride)) - AlphaBlending(src, srcStride, width, height, channelCount, alpha, alphaStride, dst, dstStride); - else - AlphaBlending(src, srcStride, width, height, channelCount, alpha, alphaStride, dst, dstStride); - } - - template SIMD_INLINE void AlphaFilling(uint8_t * dst, __m512i channelLo, __m512i channelHi, __m512i alpha, __mmask64 m) - { - __m512i _dst = Load(dst, m); - __m512i lo = AlphaBlendingI16(channelLo, UnpackU8<0>(_dst), UnpackU8<0>(alpha)); - __m512i hi = AlphaBlendingI16(channelHi, UnpackU8<1>(_dst), UnpackU8<1>(alpha)); - Store(dst, _mm512_packus_epi16(lo, hi), m); - } - - template struct AlphaFiller - { - void operator()(uint8_t * dst, const __m512i * channel, const uint8_t * alpha, __mmask64 m[channelCount + 1]); - }; - - template struct AlphaFiller - { - SIMD_INLINE void operator()(uint8_t * dst, const __m512i * channel, const uint8_t * alpha, __mmask64 m[2]) - { - __m512i _alpha = Load(alpha, m[0]); - AlphaFilling(dst, channel[0], channel[0], _alpha, m[1]); - } - }; - - template struct AlphaFiller - { - SIMD_INLINE void operator()(uint8_t * dst, const __m512i * channel, const uint8_t * alpha, __mmask64 m[3]) - { - __m512i _alpha = Load(alpha, m[0]); - _alpha = _mm512_permutexvar_epi64(K64_PERMUTE_FOR_UNPACK, _alpha); - AlphaFilling(dst + 0 * A, channel[0], channel[0], UnpackU8<0>(_alpha, _alpha), m[1]); 
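/* For two-channel images one 64-byte alpha vector feeds two 64-byte
   destination vectors: the _mm512_permutexvar_epi64 above pre-shuffles the
   128-bit lanes so that the lane-local UnpackU8<0>/UnpackU8<1> interleaving
   duplicates every alpha byte once, keeping each pixel's two channels paired
   with its own alpha in contiguous memory order. */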
- AlphaFilling(dst + 1 * A, channel[0], channel[0], UnpackU8<1>(_alpha, _alpha), m[2]); - } - }; - - template struct AlphaFiller - { - SIMD_INLINE void operator()(uint8_t * dst, const __m512i * channel, const uint8_t * alpha, __mmask64 m[4]) - { - __m512i _alpha = Load(alpha, m[0]); - AlphaFilling(dst + 0 * A, channel[0], channel[1], GrayToBgr<0>(_alpha), m[1]); - AlphaFilling(dst + 1 * A, channel[2], channel[0], GrayToBgr<1>(_alpha), m[2]); - AlphaFilling(dst + 2 * A, channel[1], channel[2], GrayToBgr<2>(_alpha), m[3]); - } - }; - - template struct AlphaFiller - { - SIMD_INLINE void operator()(uint8_t * dst, const __m512i * channel, const uint8_t * alpha, __mmask64 m[5]) - { - __m512i _alpha = Load(alpha, m[0]); - _alpha = _mm512_permutexvar_epi32(K32_PERMUTE_FOR_TWO_UNPACK, _alpha); - __m512i lo = UnpackU8<0>(_alpha, _alpha); - AlphaFilling(dst + 0 * A, channel[0], channel[0], UnpackU8<0>(lo, lo), m[1]); - AlphaFilling(dst + 1 * A, channel[0], channel[0], UnpackU8<1>(lo, lo), m[2]); - __m512i hi = UnpackU8<1>(_alpha, _alpha); - AlphaFilling(dst + 2 * A, channel[0], channel[0], UnpackU8<0>(hi, hi), m[3]); - AlphaFilling(dst + 3 * A, channel[0], channel[0], UnpackU8<1>(hi, hi), m[4]); - } - }; - - template void AlphaFilling(uint8_t * dst, size_t dstStride, size_t width, size_t height, const __m512i * channel, const uint8_t * alpha, size_t alphaStride) - { - size_t alignedWidth = AlignLo(width, A); - __mmask64 tailMasks[channelCount + 1]; - tailMasks[0] = TailMask64(width - alignedWidth); - for (size_t c = 0; c < channelCount; ++c) - tailMasks[c + 1] = TailMask64((width - alignedWidth)*channelCount - A * c); - size_t step = channelCount * A; - for (size_t row = 0; row < height; ++row) - { - size_t col = 0, offset = 0; - for (; col < alignedWidth; col += A, offset += step) - AlphaFiller()(dst + offset, channel, alpha + col, tailMasks); - if (col < width) - AlphaFiller()(dst + offset, channel, alpha + col, tailMasks); - alpha += alphaStride; - dst += dstStride; - } - } - - template void AlphaFilling(uint8_t * dst, size_t dstStride, size_t width, size_t height, const uint8_t * channel, size_t channelCount, const uint8_t * alpha, size_t alphaStride) - { - if (align) - { - assert(Aligned(dst) && Aligned(dstStride)); - assert(Aligned(alpha) && Aligned(alphaStride)); - } - - __m512i _channel[3]; - switch (channelCount) - { - case 1: - _channel[0] = UnpackU8<0>(_mm512_set1_epi8(*(uint8_t*)channel)); - AlphaFilling(dst, dstStride, width, height, _channel, alpha, alphaStride); - break; - case 2: - _channel[0] = UnpackU8<0>(_mm512_set1_epi16(*(uint16_t*)channel)); - AlphaFilling(dst, dstStride, width, height, _channel, alpha, alphaStride); - break; - case 3: - { - uint64_t _0120 = uint64_t(channel[0]) | (uint64_t(channel[1]) << 16) | (uint64_t(channel[2]) << 32) | (uint64_t(channel[0]) << 48); - uint64_t _1201 = uint64_t(channel[1]) | (uint64_t(channel[2]) << 16) | (uint64_t(channel[0]) << 32) | (uint64_t(channel[1]) << 48); - uint64_t _2012 = uint64_t(channel[2]) | (uint64_t(channel[0]) << 16) | (uint64_t(channel[1]) << 32) | (uint64_t(channel[2]) << 48); - _channel[0] = _mm512_setr_epi64(_0120, _1201, _1201, _2012, _2012, _0120, _0120, _1201); - _channel[1] = _mm512_setr_epi64(_2012, _0120, _0120, _1201, _1201, _2012, _2012, _0120); - _channel[2] = _mm512_setr_epi64(_1201, _2012, _2012, _0120, _0120, _1201, _1201, _2012); - AlphaFilling(dst, dstStride, width, height, _channel, alpha, alphaStride); - break; - } - case 4: - _channel[0] = UnpackU8<0>(_mm512_set1_epi32(*(uint32_t*)channel)); - 
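/* The channel constants are stored pre-widened to 16-bit lanes (UnpackU8
   interleaves the bytes with zero) because AlphaBlendingI16 performs the whole
   blend in 16-bit arithmetic: src*alpha + dst*(255 - alpha), followed by
   DivideI16By255, before the result is re-packed to bytes. */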
AlphaFilling(dst, dstStride, width, height, _channel, alpha, alphaStride); - break; - default: - assert(0); - } - } - - void AlphaFilling(uint8_t * dst, size_t dstStride, size_t width, size_t height, const uint8_t * channel, size_t channelCount, const uint8_t * alpha, size_t alphaStride) - { - if (Aligned(dst) && Aligned(dstStride) && Aligned(alpha) && Aligned(alphaStride)) - AlphaFilling(dst, dstStride, width, height, channel, channelCount, alpha, alphaStride); - else - AlphaFilling(dst, dstStride, width, height, channel, channelCount, alpha, alphaStride); - } - } -#endif// SIMD_AVX512BW_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx512bwBackground.cpp b/src/3rd/Simd/Simd/SimdAvx512bwBackground.cpp deleted file mode 100644 index 08d9884a..00000000 --- a/src/3rd/Simd/Simd/SimdAvx512bwBackground.cpp +++ /dev/null @@ -1,456 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2018 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdSet.h" -#include "Simd/SimdCompare.h" - -namespace Simd -{ -#ifdef SIMD_AVX512BW_ENABLE - namespace Avx512bw - { - template SIMD_INLINE void BackgroundGrowRangeSlow(const uint8_t * value, uint8_t * lo, uint8_t * hi, __mmask64 m = -1) - { - const __m512i _value = Load(value, m); - const __m512i _lo = Load(lo, m); - const __m512i _hi = Load(hi, m); - - const __mmask64 inc = _mm512_cmpgt_epu8_mask(_value, _hi); - const __mmask64 dec = _mm512_cmplt_epu8_mask(_value, _lo); - - Store(lo, _mm512_mask_subs_epu8(_lo, dec, _lo, K8_01), m); - Store(hi, _mm512_mask_adds_epu8(_hi, inc, _hi, K8_01), m); - } - - template void BackgroundGrowRangeSlow(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * lo, size_t loStride, uint8_t * hi, size_t hiStride) - { - if (align) - { - assert(Aligned(value) && Aligned(valueStride)); - assert(Aligned(lo) && Aligned(loStride)); - assert(Aligned(hi) && Aligned(hiStride)); - } - - size_t alignedWidth = AlignLo(width, A); - __mmask64 tailMask = TailMask64(width - alignedWidth); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < alignedWidth; col += A) - BackgroundGrowRangeSlow(value + col, lo + col, hi + col); - if (col < width) - BackgroundGrowRangeSlow(value + col, lo + col, hi + col, tailMask); - value += valueStride; - lo += loStride; - hi += hiStride; - } - } - - void BackgroundGrowRangeSlow(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * lo, size_t loStride, uint8_t * hi, size_t hiStride) - { - if (Aligned(value) && Aligned(valueStride) && Aligned(lo) && Aligned(loStride) && Aligned(hi) && Aligned(hiStride)) - BackgroundGrowRangeSlow(value, valueStride, width, height, lo, loStride, hi, hiStride); - else - BackgroundGrowRangeSlow(value, valueStride, width, height, lo, loStride, hi, hiStride); - } - - template SIMD_INLINE void BackgroundGrowRangeFast(const uint8_t * value, uint8_t * lo, uint8_t * hi, __mmask64 m = -1) - { - const __m512i _value = Load(value, m); - const __m512i _lo = Load(lo, m); - const __m512i _hi = Load(hi, m); - - Store(lo, _mm512_min_epu8(_lo, _value), m); - Store(hi, _mm512_max_epu8(_hi, _value), m); - } - - template void BackgroundGrowRangeFast(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * lo, size_t loStride, uint8_t * hi, size_t hiStride) - { - if (align) - { - assert(Aligned(value) && Aligned(valueStride)); - assert(Aligned(lo) && Aligned(loStride)); - assert(Aligned(hi) && Aligned(hiStride)); - } - - size_t alignedWidth = AlignLo(width, A); - __mmask64 tailMask = TailMask64(width - alignedWidth); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < alignedWidth; col += A) - BackgroundGrowRangeFast(value + col, lo + col, hi + col); - if (col < width) - BackgroundGrowRangeFast(value + col, lo + col, hi + col, tailMask); - value += valueStride; - lo += loStride; - hi += hiStride; - } - } - - void BackgroundGrowRangeFast(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * lo, size_t loStride, uint8_t * hi, size_t hiStride) - { - if (Aligned(value) && Aligned(valueStride) && Aligned(lo) && Aligned(loStride) && Aligned(hi) && Aligned(hiStride)) - BackgroundGrowRangeFast(value, valueStride, width, height, lo, loStride, hi, hiStride); - else - BackgroundGrowRangeFast(value, valueStride, width, height, lo, loStride, hi, hiStride); - } - - template SIMD_INLINE void 
BackgroundIncrementCount(const uint8_t * value, - const uint8_t * loValue, const uint8_t * hiValue, uint8_t * loCount, uint8_t * hiCount, size_t offset, __mmask64 m = -1) - { - const __m512i _value = Load(value + offset, m); - const __m512i _loValue = Load(loValue + offset, m); - const __m512i _loCount = Load(loCount + offset, m); - const __m512i _hiValue = Load(hiValue + offset, m); - const __m512i _hiCount = Load(hiCount + offset, m); - - const __mmask64 incLo = _mm512_cmplt_epu8_mask(_value, _loValue); - const __mmask64 incHi = _mm512_cmpgt_epu8_mask(_value, _hiValue); - - Store(loCount + offset, _mm512_mask_adds_epu8(_loCount, incLo, _loCount, K8_01), m); - Store(hiCount + offset, _mm512_mask_adds_epu8(_hiCount, incHi, _hiCount, K8_01), m); - } - - template void BackgroundIncrementCount(const uint8_t * value, size_t valueStride, size_t width, size_t height, - const uint8_t * loValue, size_t loValueStride, const uint8_t * hiValue, size_t hiValueStride, - uint8_t * loCount, size_t loCountStride, uint8_t * hiCount, size_t hiCountStride) - { - if (align) - { - assert(Aligned(value) && Aligned(valueStride)); - assert(Aligned(loValue) && Aligned(loValueStride) && Aligned(hiValue) && Aligned(hiValueStride)); - assert(Aligned(loCount) && Aligned(loCountStride) && Aligned(hiCount) && Aligned(hiCountStride)); - } - - size_t alignedWidth = AlignLo(width, A); - __mmask64 tailMask = TailMask64(width - alignedWidth); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < alignedWidth; col += A) - BackgroundIncrementCount(value, loValue, hiValue, loCount, hiCount, col); - if (col < width) - BackgroundIncrementCount(value, loValue, hiValue, loCount, hiCount, col, tailMask); - value += valueStride; - loValue += loValueStride; - hiValue += hiValueStride; - loCount += loCountStride; - hiCount += hiCountStride; - } - } - - void BackgroundIncrementCount(const uint8_t * value, size_t valueStride, size_t width, size_t height, - const uint8_t * loValue, size_t loValueStride, const uint8_t * hiValue, size_t hiValueStride, - uint8_t * loCount, size_t loCountStride, uint8_t * hiCount, size_t hiCountStride) - { - if (Aligned(value) && Aligned(valueStride) && - Aligned(loValue) && Aligned(loValueStride) && Aligned(hiValue) && Aligned(hiValueStride) && - Aligned(loCount) && Aligned(loCountStride) && Aligned(hiCount) && Aligned(hiCountStride)) - BackgroundIncrementCount(value, valueStride, width, height, - loValue, loValueStride, hiValue, hiValueStride, loCount, loCountStride, hiCount, hiCountStride); - else - BackgroundIncrementCount(value, valueStride, width, height, - loValue, loValueStride, hiValue, hiValueStride, loCount, loCountStride, hiCount, hiCountStride); - } - - SIMD_INLINE __m512i AdjustLo(const __m512i & count, const __m512i & value, const __m512i & threshold) - { - const __mmask64 dec = _mm512_cmpgt_epu8_mask(count, threshold); - const __mmask64 inc = _mm512_cmplt_epu8_mask(count, threshold); - __m512i added = _mm512_mask_adds_epu8(value, inc, value, K8_01); - return _mm512_mask_subs_epu8(added, dec, added, K8_01); - } - - SIMD_INLINE __m512i AdjustHi(const __m512i & count, const __m512i & value, const __m512i & threshold) - { - const __mmask64 inc = _mm512_cmpgt_epu8_mask(count, threshold); - const __mmask64 dec = _mm512_cmplt_epu8_mask(count, threshold); - __m512i added = _mm512_mask_adds_epu8(value, inc, value, K8_01); - return _mm512_mask_subs_epu8(added, dec, added, K8_01); - } - - template SIMD_INLINE void BackgroundAdjustRange(uint8_t * loCount, uint8_t * loValue, - 
uint8_t * hiCount, uint8_t * hiValue, const __m512i & threshold, __mmask64 m = -1) - { - const __m512i _loCount = Load(loCount, m); - const __m512i _loValue = Load(loValue, m); - const __m512i _hiCount = Load(hiCount, m); - const __m512i _hiValue = Load(hiValue, m); - - Store(loValue, AdjustLo(_loCount, _loValue, threshold), m); - Store(hiValue, AdjustHi(_hiCount, _hiValue, threshold), m); - Store(loCount, K_ZERO, m); - Store(hiCount, K_ZERO, m); - } - - template void BackgroundAdjustRange(uint8_t * loCount, size_t loCountStride, size_t width, size_t height, - uint8_t * loValue, size_t loValueStride, uint8_t * hiCount, size_t hiCountStride, - uint8_t * hiValue, size_t hiValueStride, uint8_t threshold) - { - if (align) - { - assert(Aligned(loValue) && Aligned(loValueStride) && Aligned(hiValue) && Aligned(hiValueStride)); - assert(Aligned(loCount) && Aligned(loCountStride) && Aligned(hiCount) && Aligned(hiCountStride)); - } - - const __m512i _threshold = _mm512_set1_epi8((char)threshold); - size_t alignedWidth = AlignLo(width, A); - __mmask64 tailMask = TailMask64(width - alignedWidth); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < alignedWidth; col += A) - BackgroundAdjustRange(loCount + col, loValue + col, hiCount + col, hiValue + col, _threshold); - if (col < width) - BackgroundAdjustRange(loCount + col, loValue + col, hiCount + col, hiValue + col, _threshold, tailMask); - loValue += loValueStride; - hiValue += hiValueStride; - loCount += loCountStride; - hiCount += hiCountStride; - } - } - - void BackgroundAdjustRange(uint8_t * loCount, size_t loCountStride, size_t width, size_t height, - uint8_t * loValue, size_t loValueStride, uint8_t * hiCount, size_t hiCountStride, - uint8_t * hiValue, size_t hiValueStride, uint8_t threshold) - { - if (Aligned(loValue) && Aligned(loValueStride) && Aligned(hiValue) && Aligned(hiValueStride) && - Aligned(loCount) && Aligned(loCountStride) && Aligned(hiCount) && Aligned(hiCountStride)) - BackgroundAdjustRange(loCount, loCountStride, width, height, loValue, loValueStride, - hiCount, hiCountStride, hiValue, hiValueStride, threshold); - else - BackgroundAdjustRange(loCount, loCountStride, width, height, loValue, loValueStride, - hiCount, hiCountStride, hiValue, hiValueStride, threshold); - } - - template SIMD_INLINE void BackgroundAdjustRangeMasked(uint8_t * loCount, uint8_t * loValue, uint8_t * hiCount, uint8_t * hiValue, - const uint8_t * pmask, const __m512i & threshold, __mmask64 m = -1) - { - const __m512i _mask = Load(pmask, m); - const __mmask64 mm = _mm512_cmpneq_epu8_mask(_mask, K_ZERO) & m; - - const __m512i _loCount = Load(loCount, m); - const __m512i _loValue = Load(loValue, m); - const __m512i _hiCount = Load(hiCount, m); - const __m512i _hiValue = Load(hiValue, m); - - Store(loValue, AdjustLo(_loCount, _loValue, threshold), mm); - Store(hiValue, AdjustHi(_hiCount, _hiValue, threshold), mm); - Store(loCount, K_ZERO, m); - Store(hiCount, K_ZERO, m); - } - - template void BackgroundAdjustRangeMasked(uint8_t * loCount, size_t loCountStride, size_t width, size_t height, - uint8_t * loValue, size_t loValueStride, uint8_t * hiCount, size_t hiCountStride, - uint8_t * hiValue, size_t hiValueStride, uint8_t threshold, const uint8_t * mask, size_t maskStride) - { - if (align) - { - assert(Aligned(loValue) && Aligned(loValueStride) && Aligned(hiValue) && Aligned(hiValueStride)); - assert(Aligned(loCount) && Aligned(loCountStride) && Aligned(hiCount) && Aligned(hiCountStride)); - assert(Aligned(mask) && 
Aligned(maskStride)); - } - - const __m512i _threshold = _mm512_set1_epi8((char)threshold); - size_t alignedWidth = AlignLo(width, A); - __mmask64 tailMask = TailMask64(width - alignedWidth); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < alignedWidth; col += A) - BackgroundAdjustRangeMasked(loCount + col, loValue + col, hiCount + col, hiValue + col, mask + col, _threshold); - if (col < width) - BackgroundAdjustRangeMasked(loCount + col, loValue + col, hiCount + col, hiValue + col, mask + col, _threshold, tailMask); - loValue += loValueStride; - hiValue += hiValueStride; - loCount += loCountStride; - hiCount += hiCountStride; - mask += maskStride; - } - } - - void BackgroundAdjustRangeMasked(uint8_t * loCount, size_t loCountStride, size_t width, size_t height, - uint8_t * loValue, size_t loValueStride, uint8_t * hiCount, size_t hiCountStride, - uint8_t * hiValue, size_t hiValueStride, uint8_t threshold, const uint8_t * mask, size_t maskStride) - { - if (Aligned(loValue) && Aligned(loValueStride) && Aligned(hiValue) && Aligned(hiValueStride) && - Aligned(loCount) && Aligned(loCountStride) && Aligned(hiCount) && Aligned(hiCountStride) && - Aligned(mask) && Aligned(maskStride)) - BackgroundAdjustRangeMasked(loCount, loCountStride, width, height, loValue, loValueStride, - hiCount, hiCountStride, hiValue, hiValueStride, threshold, mask, maskStride); - else - BackgroundAdjustRangeMasked(loCount, loCountStride, width, height, loValue, loValueStride, - hiCount, hiCountStride, hiValue, hiValueStride, threshold, mask, maskStride); - } - - template SIMD_INLINE void BackgroundShiftRange(const uint8_t * value, uint8_t * lo, uint8_t * hi, __mmask64 m = -1) - { - const __m512i _value = Load(value, m); - const __m512i _lo = Load(lo, m); - const __m512i _hi = Load(hi, m); - - const __m512i add = _mm512_subs_epu8(_value, _hi); - const __m512i sub = _mm512_subs_epu8(_lo, _value); - - Store(lo, _mm512_subs_epu8(_mm512_adds_epu8(_lo, add), sub), m); - Store(hi, _mm512_subs_epu8(_mm512_adds_epu8(_hi, add), sub), m); - } - - template void BackgroundShiftRange(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * lo, size_t loStride, uint8_t * hi, size_t hiStride) - { - if (align) - { - assert(Aligned(value) && Aligned(valueStride)); - assert(Aligned(lo) && Aligned(loStride)); - assert(Aligned(hi) && Aligned(hiStride)); - } - - size_t alignedWidth = AlignLo(width, A); - __mmask64 tailMask = TailMask64(width - alignedWidth); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < alignedWidth; col += A) - BackgroundShiftRange(value + col, lo + col, hi + col); - if (col < width) - BackgroundShiftRange(value + col, lo + col, hi + col, tailMask); - value += valueStride; - lo += loStride; - hi += hiStride; - } - } - - void BackgroundShiftRange(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * lo, size_t loStride, uint8_t * hi, size_t hiStride) - { - if (Aligned(value) && Aligned(valueStride) && Aligned(lo) && Aligned(loStride) && Aligned(hi) && Aligned(hiStride)) - BackgroundShiftRange(value, valueStride, width, height, lo, loStride, hi, hiStride); - else - BackgroundShiftRange(value, valueStride, width, height, lo, loStride, hi, hiStride); - } - - template SIMD_INLINE void BackgroundShiftRangeMasked(const uint8_t * value, uint8_t * lo, uint8_t * hi, - const uint8_t * pmask, __mmask64 m = -1) - { - const __m512i _mask = Load(pmask, m); - const __mmask64 mm = _mm512_cmpneq_epu8_mask(_mask, K_ZERO) 
& m; - - const __m512i _value = Load(value, m); - const __m512i _lo = Load(lo, m); - const __m512i _hi = Load(hi, m); - - const __m512i add = _mm512_subs_epu8(_value, _hi); - const __m512i sub = _mm512_subs_epu8(_lo, _value); - - Store(lo, _mm512_subs_epu8(_mm512_adds_epu8(_lo, add), sub), mm); - Store(hi, _mm512_subs_epu8(_mm512_adds_epu8(_hi, add), sub), mm); - } - - template void BackgroundShiftRangeMasked(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * lo, size_t loStride, uint8_t * hi, size_t hiStride, const uint8_t * mask, size_t maskStride) - { - if (align) - { - assert(Aligned(value) && Aligned(valueStride)); - assert(Aligned(lo) && Aligned(loStride)); - assert(Aligned(hi) && Aligned(hiStride)); - assert(Aligned(mask) && Aligned(maskStride)); - } - - size_t alignedWidth = AlignLo(width, A); - __mmask64 tailMask = TailMask64(width - alignedWidth); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < alignedWidth; col += A) - BackgroundShiftRangeMasked(value + col, lo + col, hi + col, mask + col); - if (col < width) - BackgroundShiftRangeMasked(value + col, lo + col, hi + col, mask + col, tailMask); - value += valueStride; - lo += loStride; - hi += hiStride; - mask += maskStride; - } - } - - void BackgroundShiftRangeMasked(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * lo, size_t loStride, uint8_t * hi, size_t hiStride, const uint8_t * mask, size_t maskStride) - { - if (Aligned(value) && Aligned(valueStride) && Aligned(lo) && Aligned(loStride) && - Aligned(hi) && Aligned(hiStride) && Aligned(mask) && Aligned(maskStride)) - BackgroundShiftRangeMasked(value, valueStride, width, height, lo, loStride, hi, hiStride, mask, maskStride); - else - BackgroundShiftRangeMasked(value, valueStride, width, height, lo, loStride, hi, hiStride, mask, maskStride); - } - - template SIMD_INLINE void BackgroundInitMask(const uint8_t * src, uint8_t * dst, const __m512i & index, const __m512i & value, __mmask64 m = -1) - { - __m512i _src = Load(src, m); - __mmask64 mm = _mm512_cmpeq_epu8_mask(_src, index) & m; - Store(dst, value, mm); - } - - template void BackgroundInitMask(const uint8_t * src, size_t srcStride, size_t width, size_t height, - uint8_t index, uint8_t value, uint8_t * dst, size_t dstStride) - { - if (align) - { - assert(Aligned(src) && Aligned(srcStride)); - assert(Aligned(dst) && Aligned(dstStride)); - } - - size_t alignedWidth = AlignLo(width, A); - __mmask64 tailMask = TailMask64(width - alignedWidth); - __m512i _index = _mm512_set1_epi8(index); - __m512i _value = _mm512_set1_epi8(value); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < alignedWidth; col += A) - BackgroundInitMask(src + col, dst + col, _index, _value); - if (col < width) - BackgroundInitMask(src + col, dst + col, _index, _value, tailMask); - src += srcStride; - dst += dstStride; - } - } - - void BackgroundInitMask(const uint8_t * src, size_t srcStride, size_t width, size_t height, - uint8_t index, uint8_t value, uint8_t * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride)) - BackgroundInitMask(src, srcStride, width, height, index, value, dst, dstStride); - else - BackgroundInitMask(src, srcStride, width, height, index, value, dst, dstStride); - } - } -#endif// SIMD_AVX512BW_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx512bwBayerToBgr.cpp b/src/3rd/Simd/Simd/SimdAvx512bwBayerToBgr.cpp deleted file mode 100644 index 07466b2b..00000000 
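SimdAvx512bwBackground.cpp, removed above, maintains a per-pixel background model: every pixel carries a [lo, hi] brightness band that adapts to new observations, either gradually (GrowRangeSlow, a saturating step of at most 1 per frame) or instantly (GrowRangeFast, min/max), while the count/threshold kernels gate periodic re-adjustment. A scalar reference for the two grow kernels, computing per pixel what the masked 512-bit code does 64 pixels at a time:

#include <algorithm>
#include <cstdint>

inline void GrowRangeSlowPixel(uint8_t value, uint8_t & lo, uint8_t & hi)
{
    if (value < lo && lo > 0)   --lo; // _mm512_mask_subs_epu8(_lo, dec, _lo, K8_01)
    if (value > hi && hi < 255) ++hi; // _mm512_mask_adds_epu8(_hi, inc, _hi, K8_01)
}

inline void GrowRangeFastPixel(uint8_t value, uint8_t & lo, uint8_t & hi)
{
    lo = std::min(lo, value); // _mm512_min_epu8
    hi = std::max(hi, value); // _mm512_max_epu8
}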
--- a/src/3rd/Simd/Simd/SimdAvx512bwBayerToBgr.cpp +++ /dev/null @@ -1,107 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2018 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdStore.h" -#include "Simd/SimdMemory.h" -#include "Simd/SimdBayer.h" -#include "Simd/SimdConversion.h" - -namespace Simd -{ -#ifdef SIMD_AVX512BW_ENABLE - namespace Avx512bw - { - template SIMD_INLINE void SaveBgr(const __m512i src[3], uint8_t * dst) - { - Store((__m512i*)dst + 0, InterleaveBgr<0>(src[0], src[1], src[2])); - Store((__m512i*)dst + 1, InterleaveBgr<1>(src[0], src[1], src[2])); - Store((__m512i*)dst + 2, InterleaveBgr<2>(src[0], src[1], src[2])); - } - - template void BayerToBgr(const __m512i src[12], uint8_t * bgr, size_t stride) - { - __m512i _bgr[6]; - BayerToBgr(src, _bgr); - SaveBgr(_bgr + 0, bgr); - SaveBgr(_bgr + 3, bgr + stride); - } - - template void BayerToBgr(const uint8_t * bayer, size_t width, size_t height, size_t bayerStride, uint8_t * bgr, size_t bgrStride) - { - const uint8_t * src[3]; - __m512i _src[12]; - size_t body = AlignHi(width - 2, A) - A; - for (size_t row = 0; row < height; row += 2) - { - src[0] = (row == 0 ? bayer : bayer - 2 * bayerStride); - src[1] = bayer; - src[2] = (row == height - 2 ? 
bayer : bayer + 2 * bayerStride); - - LoadBayerNose(src, 0, bayerStride, _src); - BayerToBgr(_src, bgr, bgrStride); - for (size_t col = A; col < body; col += A) - { - LoadBayerBody(src, col, bayerStride, _src); - BayerToBgr(_src, bgr + 3 * col, bgrStride); - } - LoadBayerTail(src, width - A, bayerStride, _src); - BayerToBgr(_src, bgr + 3 * (width - A), bgrStride); - - bayer += 2 * bayerStride; - bgr += 2 * bgrStride; - } - } - - template void BayerToBgr(const uint8_t * bayer, size_t width, size_t height, size_t bayerStride, SimdPixelFormatType bayerFormat, uint8_t * bgr, size_t bgrStride) - { - switch (bayerFormat) - { - case SimdPixelFormatBayerGrbg: - BayerToBgr(bayer, width, height, bayerStride, bgr, bgrStride); - break; - case SimdPixelFormatBayerGbrg: - BayerToBgr(bayer, width, height, bayerStride, bgr, bgrStride); - break; - case SimdPixelFormatBayerRggb: - BayerToBgr(bayer, width, height, bayerStride, bgr, bgrStride); - break; - case SimdPixelFormatBayerBggr: - BayerToBgr(bayer, width, height, bayerStride, bgr, bgrStride); - break; - default: - assert(0); - } - } - - void BayerToBgr(const uint8_t * bayer, size_t width, size_t height, size_t bayerStride, SimdPixelFormatType bayerFormat, uint8_t * bgr, size_t bgrStride) - { - assert((width % 2 == 0) && (height % 2 == 0)); - - if (Aligned(bayer) && Aligned(bgr) && Aligned(bayerStride) && Aligned(bgrStride)) - BayerToBgr(bayer, width, height, bayerStride, bayerFormat, bgr, bgrStride); - else - BayerToBgr(bayer, width, height, bayerStride, bayerFormat, bgr, bgrStride); - } - } -#endif// SIMD_AVX512BW_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx512bwBayerToBgra.cpp b/src/3rd/Simd/Simd/SimdAvx512bwBayerToBgra.cpp deleted file mode 100644 index 7bc35aa5..00000000 --- a/src/3rd/Simd/Simd/SimdAvx512bwBayerToBgra.cpp +++ /dev/null @@ -1,115 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2018 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
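BayerToBgr, removed above, demosaics a single-channel Bayer mosaic into interleaved BGR two output rows per pass, which is why both image dimensions must be even and why the neighbor row pointers are clamped at the first and last row pair. The deleted kernel interpolates through lane-crossing shuffles; the scalar step below is a textbook bilinear estimate for one missing green sample, shown only to fix ideas and not necessarily the exact filter the SIMD code applies.

#include <cstddef>
#include <cstdint>

// average of the 4-neighborhood; caller guarantees 0 < x < width-1, 0 < y < height-1
inline uint8_t GreenAt(const uint8_t * bayer, size_t stride, size_t x, size_t y)
{
    unsigned sum = bayer[(y - 1) * stride + x] + bayer[(y + 1) * stride + x]
                 + bayer[y * stride + x - 1]   + bayer[y * stride + x + 1];
    return (uint8_t)((sum + 2) / 4); // rounded mean
}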
-*/ -#include "Simd/SimdStore.h" -#include "Simd/SimdMemory.h" -#include "Simd/SimdBayer.h" - -namespace Simd -{ -#ifdef SIMD_AVX512BW_ENABLE - namespace Avx512bw - { - template SIMD_INLINE void SaveBgra(__m512i bgr[3], const __m512i & alpha, uint8_t * bgra) - { - bgr[0] = _mm512_permutexvar_epi32(K32_PERMUTE_FOR_TWO_UNPACK, bgr[0]); - bgr[1] = _mm512_permutexvar_epi32(K32_PERMUTE_FOR_TWO_UNPACK, bgr[1]); - bgr[2] = _mm512_permutexvar_epi32(K32_PERMUTE_FOR_TWO_UNPACK, bgr[2]); - __m512i bgLo = UnpackU8<0>(bgr[0], bgr[1]); - __m512i bgHi = UnpackU8<1>(bgr[0], bgr[1]); - __m512i raLo = UnpackU8<0>(bgr[2], alpha); - __m512i raHi = UnpackU8<1>(bgr[2], alpha); - Store(bgra + 0*A, UnpackU16<0>(bgLo, raLo)); - Store(bgra + 1*A, UnpackU16<1>(bgLo, raLo)); - Store(bgra + 2*A, UnpackU16<0>(bgHi, raHi)); - Store(bgra + 3*A, UnpackU16<1>(bgHi, raHi)); - } - - template void BayerToBgra(const __m512i src[12], const __m512i & alpha, uint8_t * bgra, size_t stride) - { - __m512i bgr[6]; - BayerToBgr(src, bgr); - SaveBgra(bgr + 0, alpha, bgra); - SaveBgra(bgr + 3, alpha, bgra + stride); - } - - template void BayerToBgra(const uint8_t * bayer, size_t width, size_t height, size_t bayerStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha) - { - const uint8_t * src[3]; - __m512i _src[12]; - __m512i _alpha = _mm512_set1_epi8((char)alpha); - size_t body = AlignHi(width - 2, A) - A; - for (size_t row = 0; row < height; row += 2) - { - src[0] = (row == 0 ? bayer : bayer - 2 * bayerStride); - src[1] = bayer; - src[2] = (row == height - 2 ? bayer : bayer + 2 * bayerStride); - - LoadBayerNose(src, 0, bayerStride, _src); - BayerToBgra(_src, _alpha, bgra, bgraStride); - for (size_t col = A; col < body; col += A) - { - LoadBayerBody(src, col, bayerStride, _src); - BayerToBgra(_src, _alpha, bgra + 4 * col, bgraStride); - } - LoadBayerTail(src, width - A, bayerStride, _src); - BayerToBgra(_src, _alpha, bgra + 4 * (width - A), bgraStride); - - bayer += 2 * bayerStride; - bgra += 2 * bgraStride; - } - } - - template void BayerToBgra(const uint8_t * bayer, size_t width, size_t height, size_t bayerStride, SimdPixelFormatType bayerFormat, uint8_t * bgra, size_t bgraStride, uint8_t alpha) - { - switch (bayerFormat) - { - case SimdPixelFormatBayerGrbg: - BayerToBgra(bayer, width, height, bayerStride, bgra, bgraStride, alpha); - break; - case SimdPixelFormatBayerGbrg: - BayerToBgra(bayer, width, height, bayerStride, bgra, bgraStride, alpha); - break; - case SimdPixelFormatBayerRggb: - BayerToBgra(bayer, width, height, bayerStride, bgra, bgraStride, alpha); - break; - case SimdPixelFormatBayerBggr: - BayerToBgra(bayer, width, height, bayerStride, bgra, bgraStride, alpha); - break; - default: - assert(0); - } - } - - void BayerToBgra(const uint8_t * bayer, size_t width, size_t height, size_t bayerStride, SimdPixelFormatType bayerFormat, uint8_t * bgra, size_t bgraStride, uint8_t alpha) - { - assert((width % 2 == 0) && (height % 2 == 0)); - - if (Aligned(bayer) && Aligned(bgra) && Aligned(bayerStride) && Aligned(bgraStride)) - BayerToBgra(bayer, width, height, bayerStride, bayerFormat, bgra, bgraStride, alpha); - else - BayerToBgra(bayer, width, height, bayerStride, bayerFormat, bgra, bgraStride, alpha); - } - } -#endif// SIMD_AVX512BW_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx512bwBgrToBayer.cpp b/src/3rd/Simd/Simd/SimdAvx512bwBgrToBayer.cpp deleted file mode 100644 index 4667778c..00000000 --- a/src/3rd/Simd/Simd/SimdAvx512bwBgrToBayer.cpp +++ /dev/null @@ -1,130 +0,0 @@ -/* -* Simd Library 
(http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" - -namespace Simd -{ -#ifdef SIMD_AVX512BW_ENABLE - namespace Avx512bw - { - const __m128i K8_SHUFFLE_GR = SIMD_MM_SETR_EPI8(0x1, 0x5, 0x7, 0xB, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); - const __m128i K8_SHUFFLE_BG = SIMD_MM_SETR_EPI8(0x0, 0x4, 0x6, 0xA, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); - const __m128i K8_SHUFFLE_GB = SIMD_MM_SETR_EPI8(0x1, 0x3, 0x7, 0x9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); - const __m128i K8_SHUFFLE_RG = SIMD_MM_SETR_EPI8(0x2, 0x4, 0x8, 0xA, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); - - const __m512i K32_PERMUTE_BGRA_TO_BAYER_0 = SIMD_MM512_SETR_EPI32(0x00, 0x04, 0x08, 0x0C, 0x10, 0x14, 0x18, 0x1C, -1, -1, -1, -1, -1, -1, -1, -1); - const __m512i K32_PERMUTE_BGRA_TO_BAYER_1 = SIMD_MM512_SETR_EPI32(-1, -1, -1, -1, -1, -1, -1, -1, 0x00, 0x04, 0x08, 0x0C, 0x10, 0x14, 0x18, 0x1C); - - template SIMD_INLINE void BgrToBayer(const uint8_t * bgr, uint8_t * bayer, const __m512i shuffle[4][2], __mmask64 ms[5]) - { - const __m512i bgr0 = Load(bgr + 0 * A, ms[0]); - const __m512i bgr1 = Load(bgr + 1 * A, ms[1]); - const __m512i bgr2 = Load(bgr + 2 * A, ms[2]); - - const __m512i bgra0 = _mm512_permutexvar_epi32(K32_PERMUTE_BGR_TO_BGRA_0, bgr0); - const __m512i bgra1 = _mm512_permutex2var_epi32(bgr0, K32_PERMUTE_BGR_TO_BGRA_1, bgr1); - const __m512i bgra2 = _mm512_permutex2var_epi32(bgr1, K32_PERMUTE_BGR_TO_BGRA_2, bgr2); - const __m512i bgra3 = _mm512_permutexvar_epi32(K32_PERMUTE_BGR_TO_BGRA_3, bgr2); - - const __m512i bayer0 = _mm512_shuffle_epi8(bgra0, shuffle[format][row]); - const __m512i bayer1 = _mm512_shuffle_epi8(bgra1, shuffle[format][row]); - const __m512i bayer2 = _mm512_shuffle_epi8(bgra2, shuffle[format][row]); - const __m512i bayer3 = _mm512_shuffle_epi8(bgra3, shuffle[format][row]); - - __m512i bayer01xx = _mm512_permutex2var_epi32(bayer0, K32_PERMUTE_BGRA_TO_BAYER_0, bayer1); - __m512i bayerxx23 = _mm512_permutex2var_epi32(bayer2, K32_PERMUTE_BGRA_TO_BAYER_1, bayer3); - Store(bayer, _mm512_or_si512(bayer01xx, bayerxx23), ms[3]); - } - - template void BgrToBayer(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bayer, size_t bayerStride) - { - if (align) - assert(Aligned(bgr) && Aligned(bgrStride) && Aligned(bayer) && Aligned(bayerStride)); - - const __m512i shuffle[4][2] = - { - { 
_mm512_broadcast_i32x4(K8_SHUFFLE_GR), _mm512_broadcast_i32x4(K8_SHUFFLE_BG) }, - { _mm512_broadcast_i32x4(K8_SHUFFLE_GB), _mm512_broadcast_i32x4(K8_SHUFFLE_RG) }, - { _mm512_broadcast_i32x4(K8_SHUFFLE_RG), _mm512_broadcast_i32x4(K8_SHUFFLE_GB) }, - { _mm512_broadcast_i32x4(K8_SHUFFLE_BG), _mm512_broadcast_i32x4(K8_SHUFFLE_GR) } - }; - - size_t alignedWidth = AlignLo(width, A); - __mmask64 tailMasks[4]; - for (size_t c = 0; c < 3; ++c) - tailMasks[c] = TailMask64((width - alignedWidth) * 3 - A*c); - tailMasks[3] = TailMask64(width - alignedWidth); - - for (size_t row = 0, col = 0; row < height; row += 2) - { - for (col = 0; col < alignedWidth; col += A) - BgrToBayer(bgr + 3 * col, bayer + col, shuffle, tailMasks); - if (col < width) - BgrToBayer(bgr + 3 * col, bayer + col, shuffle, tailMasks); - bgr += bgrStride; - bayer += bayerStride; - - for (col = 0; col < alignedWidth; col += A) - BgrToBayer(bgr + 3 * col, bayer + col, shuffle, tailMasks); - if (col < width) - BgrToBayer(bgr + 3 * col, bayer + col, shuffle, tailMasks); - bgr += bgrStride; - bayer += bayerStride; - } - } - - template void BgrToBayer(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bayer, size_t bayerStride, SimdPixelFormatType bayerFormat) - { - assert((width % 2 == 0) && (height % 2 == 0)); - - switch (bayerFormat) - { - case SimdPixelFormatBayerGrbg: - BgrToBayer<0, align>(bgr, width, height, bgrStride, bayer, bayerStride); - break; - case SimdPixelFormatBayerGbrg: - BgrToBayer<1, align>(bgr, width, height, bgrStride, bayer, bayerStride); - break; - case SimdPixelFormatBayerRggb: - BgrToBayer<2, align>(bgr, width, height, bgrStride, bayer, bayerStride); - break; - case SimdPixelFormatBayerBggr: - BgrToBayer<3, align>(bgr, width, height, bgrStride, bayer, bayerStride); - break; - default: - assert(0); - } - } - - void BgrToBayer(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bayer, size_t bayerStride, SimdPixelFormatType bayerFormat) - { - if (Aligned(bgr) && Aligned(bgrStride) && Aligned(bayer) && Aligned(bayerStride)) - BgrToBayer(bgr, width, height, bgrStride, bayer, bayerStride, bayerFormat); - else - BgrToBayer(bgr, width, height, bgrStride, bayer, bayerStride, bayerFormat); - } - } -#endif// SIMD_AVX512BW_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx512bwBgrToBgra.cpp b/src/3rd/Simd/Simd/SimdAvx512bwBgrToBgra.cpp deleted file mode 100644 index 44a62a56..00000000 --- a/src/3rd/Simd/Simd/SimdAvx512bwBgrToBgra.cpp +++ /dev/null @@ -1,204 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
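BgrToBayer, removed above, is pure channel selection: each of the four shuffle tables (K8_SHUFFLE_GR, _BG, _GB, _RG) keeps exactly one channel byte per pixel, and the shuffle[format][row] indexing swaps tables between even and odd rows so all four Bayer layouts share one kernel. An illustrative scalar equivalent for the GRBG layout (the other layouts permute the same channel map):

#include <cstddef>
#include <cstdint>

inline uint8_t BgrPixelToBayerGrbg(const uint8_t * bgr, size_t row, size_t col)
{
    bool evenRow = row % 2 == 0, evenCol = col % 2 == 0;
    if (evenRow)
        return evenCol ? bgr[1] : bgr[2]; // G R G R ...
    else
        return evenCol ? bgr[0] : bgr[1]; // B G B G ...
}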
IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdStore.h" -#include "Simd/SimdMemory.h" - -namespace Simd -{ -#ifdef SIMD_AVX512BW_ENABLE - namespace Avx512bw - { - const __m512i K8_SHUFFLE_BGR_TO_BGRA = SIMD_MM512_SETR_EPI8( - 0x0, 0x1, 0x2, -1, 0x3, 0x4, 0x5, -1, 0x6, 0x7, 0x8, -1, 0x9, 0xA, 0xB, -1, - 0x0, 0x1, 0x2, -1, 0x3, 0x4, 0x5, -1, 0x6, 0x7, 0x8, -1, 0x9, 0xA, 0xB, -1, - 0x0, 0x1, 0x2, -1, 0x3, 0x4, 0x5, -1, 0x6, 0x7, 0x8, -1, 0x9, 0xA, 0xB, -1, - 0x0, 0x1, 0x2, -1, 0x3, 0x4, 0x5, -1, 0x6, 0x7, 0x8, -1, 0x9, 0xA, 0xB, -1); - - template SIMD_INLINE void BgrToBgra(const uint8_t * bgr, uint8_t * bgra, const __m512i & alpha, const __mmask64 * ms) - { - __m512i bgr0 = Load(bgr + 0 * A, ms[0]); - __m512i bgr1 = Load(bgr + 1 * A, ms[1]); - __m512i bgr2 = Load(bgr + 2 * A, ms[2]); - - const __m512i bgra0 = _mm512_permutexvar_epi32(K32_PERMUTE_BGR_TO_BGRA_0, bgr0); - const __m512i bgra1 = _mm512_permutex2var_epi32(bgr0, K32_PERMUTE_BGR_TO_BGRA_1, bgr1); - const __m512i bgra2 = _mm512_permutex2var_epi32(bgr1, K32_PERMUTE_BGR_TO_BGRA_2, bgr2); - const __m512i bgra3 = _mm512_permutexvar_epi32(K32_PERMUTE_BGR_TO_BGRA_3, bgr2); - - Store(bgra + 0 * A, _mm512_or_si512(alpha, _mm512_shuffle_epi8(bgra0, K8_SHUFFLE_BGR_TO_BGRA)), ms[3]); - Store(bgra + 1 * A, _mm512_or_si512(alpha, _mm512_shuffle_epi8(bgra1, K8_SHUFFLE_BGR_TO_BGRA)), ms[4]); - Store(bgra + 2 * A, _mm512_or_si512(alpha, _mm512_shuffle_epi8(bgra2, K8_SHUFFLE_BGR_TO_BGRA)), ms[5]); - Store(bgra + 3 * A, _mm512_or_si512(alpha, _mm512_shuffle_epi8(bgra3, K8_SHUFFLE_BGR_TO_BGRA)), ms[6]); - } - - template void BgrToBgra(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha) - { - if (align) - assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(bgr) && Aligned(bgrStride)); - - size_t alignedWidth = AlignLo(width, A); - __mmask64 tailMasks[7]; - for (size_t c = 0; c < 3; ++c) - tailMasks[c] = TailMask64((width - alignedWidth) * 3 - A*c); - for (size_t c = 0; c < 4; ++c) - tailMasks[3 + c] = TailMask64((width - alignedWidth) * 4 - A*c); - __m512i _alpha = _mm512_set1_epi32(alpha * 0x1000000); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < alignedWidth; col += A) - BgrToBgra(bgr + 3 * col, bgra + 4 * col, _alpha, tailMasks); - if (col < width) - BgrToBgra(bgr + 3 * col, bgra + 4 * col, _alpha, tailMasks); - bgr += bgrStride; - bgra += bgraStride; - } - } - - void BgrToBgra(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha) - { - if (Aligned(bgra) && Aligned(bgraStride) && Aligned(bgr) && Aligned(bgrStride)) - BgrToBgra(bgr, width, height, bgrStride, bgra, bgraStride, alpha); - else - BgrToBgra(bgr, width, height, bgrStride, bgra, bgraStride, alpha); - } - - //--------------------------------------------------------------------- - - template SIMD_INLINE void Bgr48pToBgra32( - const uint8_t * blue, const uint8_t * green, const uint8_t * red, uint8_t * bgra, __m512i alpha, const __mmask64 * ms) - { - __m512i _blue = Load(blue, ms[0]); - __m512i _green = Load(green, ms[0]); - __m512i _red = Load(red, ms[0]); - - __m512i bg = _mm512_permutexvar_epi64(K64_PERMUTE_FOR_UNPACK, _mm512_or_si512(_blue, _mm512_slli_epi16(_green, 8))); - __m512i ra 
= _mm512_permutexvar_epi64(K64_PERMUTE_FOR_UNPACK, _mm512_or_si512(_red, alpha)); - - Store(bgra + 0, _mm512_unpacklo_epi16(bg, ra), ms[1]); - Store(bgra + A, _mm512_unpackhi_epi16(bg, ra), ms[2]); - } - - template void Bgr48pToBgra32(const uint8_t * blue, size_t blueStride, size_t width, size_t height, - const uint8_t * green, size_t greenStride, const uint8_t * red, size_t redStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha) - { - if (align) - { - assert(Aligned(blue) && Aligned(blueStride)); - assert(Aligned(green) && Aligned(greenStride)); - assert(Aligned(red) && Aligned(redStride)); - assert(Aligned(bgra) && Aligned(bgraStride)); - } - - width *= 2; - size_t alignedWidth = AlignLo(width, A); - __mmask64 bodyMask = 0x5555555555555555; - __mmask64 tailMasks[3]; - tailMasks[0] = TailMask64(width - alignedWidth)&bodyMask; - for (size_t c = 0; c < 2; ++c) - tailMasks[1 + c] = TailMask64((width - alignedWidth) * 2 - A*c); - __m512i _alpha = _mm512_set1_epi16(alpha * 0x100); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < alignedWidth; col += A) - Bgr48pToBgra32(blue + col, green + col, red + col, bgra + col * 2, _alpha, &bodyMask); - if (col < width) - Bgr48pToBgra32(blue + col, green + col, red + col, bgra + col * 2, _alpha, tailMasks); - blue += blueStride; - green += greenStride; - red += redStride; - bgra += bgraStride; - } - } - - void Bgr48pToBgra32(const uint8_t * blue, size_t blueStride, size_t width, size_t height, - const uint8_t * green, size_t greenStride, const uint8_t * red, size_t redStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha) - { - if (Aligned(blue) && Aligned(blueStride) && Aligned(green) && Aligned(greenStride) && - Aligned(red) && Aligned(redStride) && Aligned(bgra) && Aligned(bgraStride)) - Bgr48pToBgra32(blue, blueStride, width, height, green, greenStride, red, redStride, bgra, bgraStride, alpha); - else - Bgr48pToBgra32(blue, blueStride, width, height, green, greenStride, red, redStride, bgra, bgraStride, alpha); - } - - //--------------------------------------------------------------------- - - const __m512i K8_SHUFFLE_RGB_TO_BGRA = SIMD_MM512_SETR_EPI8( - 0x2, 0x1, 0x0, -1, 0x5, 0x4, 0x3, -1, 0x8, 0x7, 0x6, -1, 0xB, 0xA, 0x9, -1, - 0x2, 0x1, 0x0, -1, 0x5, 0x4, 0x3, -1, 0x8, 0x7, 0x6, -1, 0xB, 0xA, 0x9, -1, - 0x2, 0x1, 0x0, -1, 0x5, 0x4, 0x3, -1, 0x8, 0x7, 0x6, -1, 0xB, 0xA, 0x9, -1, - 0x2, 0x1, 0x0, -1, 0x5, 0x4, 0x3, -1, 0x8, 0x7, 0x6, -1, 0xB, 0xA, 0x9, -1); - - template SIMD_INLINE void RgbToBgra(const uint8_t* rgb, uint8_t* bgra, const __m512i& alpha, const __mmask64* ms) - { - __m512i rgb0 = Load(rgb + 0 * A, ms[0]); - __m512i rgb1 = Load(rgb + 1 * A, ms[1]); - __m512i rgb2 = Load(rgb + 2 * A, ms[2]); - - const __m512i bgra0 = _mm512_permutexvar_epi32(K32_PERMUTE_BGR_TO_BGRA_0, rgb0); - const __m512i bgra1 = _mm512_permutex2var_epi32(rgb0, K32_PERMUTE_BGR_TO_BGRA_1, rgb1); - const __m512i bgra2 = _mm512_permutex2var_epi32(rgb1, K32_PERMUTE_BGR_TO_BGRA_2, rgb2); - const __m512i bgra3 = _mm512_permutexvar_epi32(K32_PERMUTE_BGR_TO_BGRA_3, rgb2); - - Store(bgra + 0 * A, _mm512_or_si512(alpha, _mm512_shuffle_epi8(bgra0, K8_SHUFFLE_RGB_TO_BGRA)), ms[3]); - Store(bgra + 1 * A, _mm512_or_si512(alpha, _mm512_shuffle_epi8(bgra1, K8_SHUFFLE_RGB_TO_BGRA)), ms[4]); - Store(bgra + 2 * A, _mm512_or_si512(alpha, _mm512_shuffle_epi8(bgra2, K8_SHUFFLE_RGB_TO_BGRA)), ms[5]); - Store(bgra + 3 * A, _mm512_or_si512(alpha, _mm512_shuffle_epi8(bgra3, K8_SHUFFLE_RGB_TO_BGRA)), ms[6]); - } - - template void RgbToBgra(const uint8_t* 
rgb, size_t width, size_t height, size_t rgbStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha) - { - if (align) - assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(rgb) && Aligned(rgbStride)); - - size_t alignedWidth = AlignLo(width, A); - __mmask64 tailMasks[7]; - for (size_t c = 0; c < 3; ++c) - tailMasks[c] = TailMask64((width - alignedWidth) * 3 - A * c); - for (size_t c = 0; c < 4; ++c) - tailMasks[3 + c] = TailMask64((width - alignedWidth) * 4 - A * c); - __m512i _alpha = _mm512_set1_epi32(alpha * 0x1000000); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < alignedWidth; col += A) - RgbToBgra(rgb + 3 * col, bgra + 4 * col, _alpha, tailMasks); - if (col < width) - RgbToBgra(rgb + 3 * col, bgra + 4 * col, _alpha, tailMasks); - rgb += rgbStride; - bgra += bgraStride; - } - } - - void RgbToBgra(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha) - { - if (Aligned(bgra) && Aligned(bgraStride) && Aligned(rgb) && Aligned(rgbStride)) - RgbToBgra(rgb, width, height, rgbStride, bgra, bgraStride, alpha); - else - RgbToBgra(rgb, width, height, rgbStride, bgra, bgraStride, alpha); - } - } -#endif// SIMD_AVX512BW_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx512bwBgrToGray.cpp b/src/3rd/Simd/Simd/SimdAvx512bwBgrToGray.cpp deleted file mode 100644 index 3986df47..00000000 --- a/src/3rd/Simd/Simd/SimdAvx512bwBgrToGray.cpp +++ /dev/null @@ -1,162 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
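The three converters removed above (BgrToBgra, Bgr48pToBgra32, RgbToBgra) all widen three channels to four and splice in a constant alpha: _mm512_set1_epi32(alpha * 0x1000000) places alpha in the top byte of every 32-bit pixel, so a single OR after the byte shuffle completes each BGRA value. Per pixel the whole transform is simply:

#include <cstdint>

inline void BgrToBgraPixel(const uint8_t * bgr, uint8_t * bgra, uint8_t alpha)
{
    bgra[0] = bgr[0]; // B
    bgra[1] = bgr[1]; // G
    bgra[2] = bgr[2]; // R
    bgra[3] = alpha;  // supplied by the OR with alpha * 0x1000000 in the SIMD path
}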
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdConversion.h" - -namespace Simd -{ -#ifdef SIMD_AVX512BW_ENABLE - namespace Avx512bw - { - const __m512i K16_BLUE_RED = SIMD_MM512_SET2_EPI16(Base::BLUE_TO_GRAY_WEIGHT, Base::RED_TO_GRAY_WEIGHT); - const __m512i K16_GREEN_0000 = SIMD_MM512_SET2_EPI16(Base::GREEN_TO_GRAY_WEIGHT, 0x0000); - const __m512i K32_ROUND_TERM = SIMD_MM512_SET1_EPI32(Base::BGR_TO_GRAY_ROUND_TERM); - - SIMD_INLINE __m512i PermutedBgrToGray32(__m512i permutedBgr) - { - const __m512i b0r0 = _mm512_shuffle_epi8(permutedBgr, K8_SUFFLE_BGR_TO_B0R0); - const __m512i g000 = _mm512_shuffle_epi8(permutedBgr, K8_SUFFLE_BGR_TO_G000); - const __m512i weightedSum = _mm512_add_epi32(_mm512_madd_epi16(g000, K16_GREEN_0000), _mm512_madd_epi16(b0r0, K16_BLUE_RED)); - return _mm512_srli_epi32(_mm512_add_epi32(weightedSum, K32_ROUND_TERM), Base::BGR_TO_GRAY_AVERAGING_SHIFT); - } - - template SIMD_INLINE void BgrToGray(const uint8_t * bgr, uint8_t * gray, const __mmask64 ms[4]) - { - const __m512i bgr0 = Load(bgr + 0 * A, ms[0]); - const __m512i bgr1 = Load(bgr + 1 * A, ms[1]); - const __m512i bgr2 = Load(bgr + 2 * A, ms[2]); - - const __m512i permutedBgr0 = _mm512_permutexvar_epi32(K32_PERMUTE_BGR_TO_BGRA_0, bgr0); - const __m512i permutedBgr1 = _mm512_permutex2var_epi32(bgr0, K32_PERMUTE_BGR_TO_BGRA_1, bgr1); - const __m512i permutedBgr2 = _mm512_permutex2var_epi32(bgr1, K32_PERMUTE_BGR_TO_BGRA_2, bgr2); - const __m512i permutedBgr3 = _mm512_permutexvar_epi32(K32_PERMUTE_BGR_TO_BGRA_3, bgr2); - - __m512i gray0 = PermutedBgrToGray32(permutedBgr0); - __m512i gray1 = PermutedBgrToGray32(permutedBgr1); - __m512i gray2 = PermutedBgrToGray32(permutedBgr2); - __m512i gray3 = PermutedBgrToGray32(permutedBgr3); - - __m512i gray01 = _mm512_packs_epi32(gray0, gray1); - __m512i gray23 = _mm512_packs_epi32(gray2, gray3); - __m512i gray0123 = _mm512_packus_epi16(gray01, gray23); - Store(gray, _mm512_permutexvar_epi32(K32_PERMUTE_FOR_TWO_UNPACK, gray0123), ms[3]); - } - - template void BgrToGray(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * gray, size_t grayStride) - { - if (align) - assert(Aligned(gray) && Aligned(grayStride) && Aligned(bgr) && Aligned(bgrStride)); - - size_t alignedWidth = AlignLo(width, A); - __mmask64 tailMasks[4]; - for (size_t c = 0; c < 3; ++c) - tailMasks[c] = TailMask64((width - alignedWidth) * 3 - A*c); - tailMasks[3] = TailMask64(width - alignedWidth); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < alignedWidth; col += A) - BgrToGray(bgr + col * 3, gray + col, tailMasks); - if (col < width) - BgrToGray(bgr + col * 3, gray + col, tailMasks); - bgr += bgrStride; - gray += grayStride; - } - } - - void BgrToGray(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * gray, size_t grayStride) - { - if (Aligned(gray) && Aligned(grayStride) && Aligned(bgr) && Aligned(bgrStride)) - BgrToGray(bgr, width, height, bgrStride, gray, grayStride); - else - BgrToGray(bgr, width, height, bgrStride, gray, grayStride); - } - - //--------------------------------------------------------------------- - - const __m512i K16_RED_BLUE = SIMD_MM512_SET2_EPI16(Base::RED_TO_GRAY_WEIGHT, Base::BLUE_TO_GRAY_WEIGHT); - - SIMD_INLINE __m512i PermutedRgbToGray32(__m512i permutedRgb) - { - const __m512i r0b0 = _mm512_shuffle_epi8(permutedRgb, K8_SUFFLE_BGR_TO_B0R0); - const __m512i g000 = _mm512_shuffle_epi8(permutedRgb, K8_SUFFLE_BGR_TO_G000); - const __m512i weightedSum = 
_mm512_add_epi32(_mm512_madd_epi16(g000, K16_GREEN_0000), _mm512_madd_epi16(r0b0, K16_RED_BLUE)); - return _mm512_srli_epi32(_mm512_add_epi32(weightedSum, K32_ROUND_TERM), Base::BGR_TO_GRAY_AVERAGING_SHIFT); - } - - template SIMD_INLINE void RgbToGray(const uint8_t* rgb, uint8_t* gray, const __mmask64 ms[4]) - { - const __m512i rgb0 = Load(rgb + 0 * A, ms[0]); - const __m512i rgb1 = Load(rgb + 1 * A, ms[1]); - const __m512i rgb2 = Load(rgb + 2 * A, ms[2]); - - const __m512i permutedRgb0 = _mm512_permutexvar_epi32(K32_PERMUTE_BGR_TO_BGRA_0, rgb0); - const __m512i permutedRgb1 = _mm512_permutex2var_epi32(rgb0, K32_PERMUTE_BGR_TO_BGRA_1, rgb1); - const __m512i permutedRgb2 = _mm512_permutex2var_epi32(rgb1, K32_PERMUTE_BGR_TO_BGRA_2, rgb2); - const __m512i permutedRgb3 = _mm512_permutexvar_epi32(K32_PERMUTE_BGR_TO_BGRA_3, rgb2); - - __m512i gray0 = PermutedRgbToGray32(permutedRgb0); - __m512i gray1 = PermutedRgbToGray32(permutedRgb1); - __m512i gray2 = PermutedRgbToGray32(permutedRgb2); - __m512i gray3 = PermutedRgbToGray32(permutedRgb3); - - __m512i gray01 = _mm512_packs_epi32(gray0, gray1); - __m512i gray23 = _mm512_packs_epi32(gray2, gray3); - __m512i gray0123 = _mm512_packus_epi16(gray01, gray23); - Store(gray, _mm512_permutexvar_epi32(K32_PERMUTE_FOR_TWO_UNPACK, gray0123), ms[3]); - } - - template void RgbToGray(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* gray, size_t grayStride) - { - if (align) - assert(Aligned(gray) && Aligned(grayStride) && Aligned(rgb) && Aligned(rgbStride)); - - size_t alignedWidth = AlignLo(width, A); - __mmask64 tailMasks[4]; - for (size_t c = 0; c < 3; ++c) - tailMasks[c] = TailMask64((width - alignedWidth) * 3 - A * c); - tailMasks[3] = TailMask64(width - alignedWidth); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < alignedWidth; col += A) - RgbToGray(rgb + col * 3, gray + col, tailMasks); - if (col < width) - RgbToGray(rgb + col * 3, gray + col, tailMasks); - rgb += rgbStride; - gray += grayStride; - } - } - - void RgbToGray(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* gray, size_t grayStride) - { - if (Aligned(gray) && Aligned(grayStride) && Aligned(rgb) && Aligned(rgbStride)) - RgbToGray(rgb, width, height, rgbStride, gray, grayStride); - else - RgbToGray(rgb, width, height, rgbStride, gray, grayStride); - } - } -#endif//SIMD_AVX512BW_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx512bwBgrToRgb.cpp b/src/3rd/Simd/Simd/SimdAvx512bwBgrToRgb.cpp deleted file mode 100644 index 8e587dbf..00000000 --- a/src/3rd/Simd/Simd/SimdAvx512bwBgrToRgb.cpp +++ /dev/null @@ -1,121 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2019 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
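BgrToGray and RgbToGray, removed above, compute the same fixed-point luma; only the weight pairing (K16_BLUE_RED versus K16_RED_BLUE) swaps to match the channel order. A scalar rendering with BT.601 weights in Q14 fixed point; the Q14 scale is an assumption here, since the library's actual constants live in its Base:: tables:

#include <cstdint>

inline uint8_t BgrToGrayPixel(uint8_t b, uint8_t g, uint8_t r)
{
    const int SHIFT = 14, ROUND = 1 << (SHIFT - 1); // cf. Base::BGR_TO_GRAY_ROUND_TERM
    const int WB = 1868, WG = 9617, WR = 4899;      // 0.114, 0.587, 0.299 in Q14 (assumed scale)
    return (uint8_t)((WB * b + WG * g + WR * r + ROUND) >> SHIFT);
}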
IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" - -namespace Simd -{ -#ifdef SIMD_AVX512BW_ENABLE - namespace Avx512bw - { - const __m512i K64_PRMT_P0 = SIMD_MM512_SETR_EPI64(0x7, 0x2, 0x1, 0x4, 0x3, 0x6, 0x5, 0x8); - const __m512i K64_PRMT_P2 = SIMD_MM512_SETR_EPI64(0x7, 0xA, 0x9, 0xC, 0xB, 0xE, 0xD, 0x8); - - const __m512i K8_SHFL_0S0 = SIMD_MM512_SETR_EPI8( - 0x2, 0x1, 0x0, 0x5, 0x4, 0x3, 0x8, 0x7, 0x6, 0xB, 0xA, 0x9, 0xE, 0xD, 0xC, -1, - 0x0, -1, 0x4, 0x3, 0x2, 0x7, 0x6, 0x5, 0xA, 0x9, 0x8, 0xD, 0xC, 0xB, -1, 0xF, - -1, 0x3, 0x2, 0x1, 0x6, 0x5, 0x4, 0x9, 0x8, 0x7, 0xC, 0xB, 0xA, 0xF, 0xE, 0xD, - 0x2, 0x1, 0x0, 0x5, 0x4, 0x3, 0x8, 0x7, 0x6, 0xB, 0xA, 0x9, 0xE, 0xD, 0xC, -1); - const __m512i K8_SHFL_0P0 = SIMD_MM512_SETR_EPI8( - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x9, - -1, 0x7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x8, -1, - 0x6, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x9); - - const __m512i K8_SHFL_1S1 = SIMD_MM512_SETR_EPI8( - 0x0, -1, 0x4, 0x3, 0x2, 0x7, 0x6, 0x5, 0xA, 0x9, 0x8, 0xD, 0xC, 0xB, -1, 0xF, - -1, 0x3, 0x2, 0x1, 0x6, 0x5, 0x4, 0x9, 0x8, 0x7, 0xC, 0xB, 0xA, 0xF, 0xE, 0xD, - 0x2, 0x1, 0x0, 0x5, 0x4, 0x3, 0x8, 0x7, 0x6, 0xB, 0xA, 0x9, 0xE, 0xD, 0xC, -1, - 0x0, -1, 0x4, 0x3, 0x2, 0x7, 0x6, 0x5, 0xA, 0x9, 0x8, 0xD, 0xC, 0xB, -1, 0xF); - const __m512i K8_SHFL_1P1 = SIMD_MM512_SETR_EPI8( - -1, 0x7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x8, -1, - 0x6, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x9, - -1, 0x7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); - const __m512i K8_SHFL_1P2 = SIMD_MM512_SETR_EPI8( - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x8, -1); - - const __m512i K8_SHFL_2S2 = SIMD_MM512_SETR_EPI8( - -1, 0x3, 0x2, 0x1, 0x6, 0x5, 0x4, 0x9, 0x8, 0x7, 0xC, 0xB, 0xA, 0xF, 0xE, 0xD, - 0x2, 0x1, 0x0, 0x5, 0x4, 0x3, 0x8, 0x7, 0x6, 0xB, 0xA, 0x9, 0xE, 0xD, 0xC, -1, - 0x0, -1, 0x4, 0x3, 0x2, 0x7, 0x6, 0x5, 0xA, 0x9, 0x8, 0xD, 0xC, 0xB, -1, 0xF, - -1, 0x3, 0x2, 0x1, 0x6, 0x5, 0x4, 0x9, 0x8, 0x7, 0xC, 0xB, 0xA, 0xF, 0xE, 0xD); - const __m512i K8_SHFL_2P2 = SIMD_MM512_SETR_EPI8( - 0x6, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x9, - -1, 0x7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x8, -1, - 0x6, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); - - template SIMD_INLINE void BgrToRgb(const uint8_t * src, uint8_t * dst, const __mmask64 * tails) - { - __m512i s0 = Load(src + 0 * A, tails[0]); - __m512i s1 = Load(src + 1 * A, tails[1]); - __m512i s2 = Load(src + 2 * A, tails[2]); - __m512i p0 = _mm512_permutex2var_epi64(s0, K64_PRMT_P0, s1); - __m512i p1 = _mm512_permutex2var_epi64(s0, K64_PRMT_P2, s1); - __m512i p2 = _mm512_permutex2var_epi64(s1, K64_PRMT_P2, s2); - Store(dst + 0 * A, _mm512_or_si512(_mm512_shuffle_epi8(s0, K8_SHFL_0S0), _mm512_shuffle_epi8(p0, K8_SHFL_0P0)), 
tails[0]); - Store(dst + 1 * A, _mm512_or_si512(_mm512_or_si512(_mm512_shuffle_epi8(s1, K8_SHFL_1S1), - _mm512_shuffle_epi8(p1, K8_SHFL_1P1)), _mm512_shuffle_epi8(p2, K8_SHFL_1P2)), tails[1]); - Store(dst + 2 * A, _mm512_or_si512(_mm512_shuffle_epi8(s2, K8_SHFL_2S2), _mm512_shuffle_epi8(p2, K8_SHFL_2P2)), tails[2]); - } - - template void BgrToRgb(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * rgb, size_t rgbStride) - { - assert(width >= A); - if (align) - assert(Aligned(bgr) && Aligned(bgrStride) && Aligned(rgb) && Aligned(rgbStride)); - - const size_t A3 = A * 3; - size_t size = width * 3; - size_t aligned = AlignLo(width, A) * 3; - __mmask64 tails[3]; - for (size_t i = 0; i < 3; ++i) - tails[i] = TailMask64(size - aligned - A * i); - - for (size_t row = 0; row < height; ++row) - { - size_t i = 0; - for (; i < aligned; i += A3) - BgrToRgb(bgr + i, rgb + i, tails); - if (i < size) - BgrToRgb(bgr + i, rgb + i, tails); - bgr += bgrStride; - rgb += rgbStride; - } - } - - void BgrToRgb(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * rgb, size_t rgbStride) - { - if (Aligned(bgr) && Aligned(bgrStride) && Aligned(rgb) && Aligned(rgbStride)) - BgrToRgb(bgr, bgrStride, width, height, rgb, rgbStride); - else - BgrToRgb(bgr, bgrStride, width, height, rgb, rgbStride); - } - } -#endif//SIMD_AVX512BW_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx512bwBgrToYuv.cpp b/src/3rd/Simd/Simd/SimdAvx512bwBgrToYuv.cpp deleted file mode 100644 index 4638ee37..00000000 --- a/src/3rd/Simd/Simd/SimdAvx512bwBgrToYuv.cpp +++ /dev/null @@ -1,263 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
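The dense shuffle and permute tables in BgrToRgb above exist only because a 3-byte triplet can straddle the 16-byte lanes that _mm512_shuffle_epi8 never crosses; the K64_PRMT_* permutes ferry the straddling bytes between lanes first. The operation itself is a trivial per-pixel byte swap:

#include <cstddef>
#include <cstdint>

inline void BgrToRgbRow(const uint8_t * bgr, uint8_t * rgb, size_t width)
{
    for (size_t i = 0; i < width; ++i, bgr += 3, rgb += 3)
    {
        rgb[0] = bgr[2]; // R
        rgb[1] = bgr[1]; // G
        rgb[2] = bgr[0]; // B
    }
}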
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdConversion.h" - -namespace Simd -{ -#ifdef SIMD_AVX512BW_ENABLE - namespace Avx512bw - { - template SIMD_INLINE void LoadPreparedBgr16(const uint8_t * bgr, __m512i & b16_r16, __m512i & g16_1, const __mmask64 * ms) - { - __m512i _bgr = Load(bgr, ms[0]); - __m512i bgr1 = _mm512_permutex2var_epi32(_bgr, K32_PERMUTE_BGR_TO_BGRA, K8_01); - b16_r16 = _mm512_shuffle_epi8(bgr1, K8_SUFFLE_BGR_TO_B0R0); - g16_1 = _mm512_shuffle_epi8(bgr1, K8_SUFFLE_BGR_TO_G010); - } - - template SIMD_INLINE __m512i LoadAndConvertBgrToY16(const uint8_t * bgr, __m512i & b16_r16, __m512i & g16_1, const __mmask64 * ms) - { - __m512i _b16_r16[2], _g16_1[2]; - LoadPreparedBgr16(bgr + 00, _b16_r16[0], _g16_1[0], ms + 0); - LoadPreparedBgr16(bgr + 48, _b16_r16[1], _g16_1[1], ms + 1); - b16_r16 = Hadd32(_b16_r16[0], _b16_r16[1]); - g16_1 = Hadd32(_g16_1[0], _g16_1[1]); - return Saturate16iTo8u(_mm512_add_epi16(K16_Y_ADJUST, _mm512_packs_epi32(BgrToY32(_b16_r16[0], _g16_1[0]), BgrToY32(_b16_r16[1], _g16_1[1])))); - } - - template SIMD_INLINE __m512i LoadAndConvertBgrToY8(const uint8_t * bgr, __m512i b16_r16[2], __m512i g16_1[2], const __mmask64 * ms) - { - __m512i lo = LoadAndConvertBgrToY16(bgr + 00, b16_r16[0], g16_1[0], ms + 0); - __m512i hi = LoadAndConvertBgrToY16(bgr + 96, b16_r16[1], g16_1[1], ms + 2); - return Permuted2Pack16iTo8u(lo, hi); - } - - SIMD_INLINE void Average16(__m512i & a, const __m512i & b) - { - a = _mm512_srli_epi16(_mm512_add_epi16(_mm512_add_epi16(a, b), K16_0002), 2); - } - - SIMD_INLINE __m512i ConvertU16(__m512i b16_r16[2], __m512i g16_1[2]) - { - return Saturate16iTo8u(_mm512_add_epi16(K16_UV_ADJUST, _mm512_packs_epi32(BgrToU32(b16_r16[0], g16_1[0]), BgrToU32(b16_r16[1], g16_1[1])))); - } - - SIMD_INLINE __m512i ConvertV16(__m512i b16_r16[2], __m512i g16_1[2]) - { - return Saturate16iTo8u(_mm512_add_epi16(K16_UV_ADJUST, _mm512_packs_epi32(BgrToV32(b16_r16[0], g16_1[0]), BgrToV32(b16_r16[1], g16_1[1])))); - } - - template SIMD_INLINE void BgrToYuv420p(const uint8_t * bgr0, size_t bgrStride, uint8_t * y0, size_t yStride, uint8_t * u, uint8_t * v, const __mmask64 * ms) - { - const uint8_t * bgr1 = bgr0 + bgrStride; - uint8_t * y1 = y0 + yStride; - - __m512i _b16_r16[2][2][2], _g16_1[2][2][2]; - Store(y0 + 0, LoadAndConvertBgrToY8(bgr0 + 0 * A, _b16_r16[0][0], _g16_1[0][0], ms + 0), ms[8]); - Store(y0 + A, LoadAndConvertBgrToY8(bgr0 + 3 * A, _b16_r16[0][1], _g16_1[0][1], ms + 4), ms[9]); - Store(y1 + 0, LoadAndConvertBgrToY8(bgr1 + 0 * A, _b16_r16[1][0], _g16_1[1][0], ms + 0), ms[8]); - Store(y1 + A, LoadAndConvertBgrToY8(bgr1 + 3 * A, _b16_r16[1][1], _g16_1[1][1], ms + 4), ms[9]); - - Average16(_b16_r16[0][0][0], _b16_r16[1][0][0]); - Average16(_b16_r16[0][0][1], _b16_r16[1][0][1]); - Average16(_b16_r16[0][1][0], _b16_r16[1][1][0]); - Average16(_b16_r16[0][1][1], _b16_r16[1][1][1]); - - Average16(_g16_1[0][0][0], _g16_1[1][0][0]); - Average16(_g16_1[0][0][1], _g16_1[1][0][1]); - Average16(_g16_1[0][1][0], _g16_1[1][1][0]); - Average16(_g16_1[0][1][1], _g16_1[1][1][1]); - - Store(u, Permuted2Pack16iTo8u(ConvertU16(_b16_r16[0][0], _g16_1[0][0]), ConvertU16(_b16_r16[0][1], _g16_1[0][1])), ms[10]); - Store(v, Permuted2Pack16iTo8u(ConvertV16(_b16_r16[0][0], _g16_1[0][0]), ConvertV16(_b16_r16[0][1], _g16_1[0][1])), ms[10]); - } - - template void BgrToYuv420p(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * y, size_t yStride, - uint8_t * u, size_t uStride, uint8_t * v, size_t vStride) - { - 
assert((width % 2 == 0) && (height % 2 == 0)); - if (align) - { - assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride)); - assert(Aligned(v) && Aligned(vStride) && Aligned(bgr) && Aligned(bgrStride)); - } - - width /= 2; - size_t alignedWidth = AlignLo(width - 1, A); - size_t tail = width - alignedWidth; - __mmask64 tailMasks[11]; - for (size_t i = 0; i < 8; ++i) - tailMasks[i] = TailMask64(tail * 6 - 48 * i) & 0x0000FFFFFFFFFFFF; - for (size_t i = 0; i < 2; ++i) - tailMasks[8 + i] = TailMask64(tail * 2 - A*i); - tailMasks[10] = TailMask64(tail); - for (size_t row = 0; row < height; row += 2) - { - size_t col = 0; - for (; col < alignedWidth; col += A) - BgrToYuv420p(bgr + col * 6, bgrStride, y + col * 2, yStride, u + col, v + col, tailMasks); - if (col < width) - BgrToYuv420p(bgr + col * 6, bgrStride, y + col * 2, yStride, u + col, v + col, tailMasks); - y += 2 * yStride; - u += uStride; - v += vStride; - bgr += 2 * bgrStride; - } - } - - void BgrToYuv420p(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * y, size_t yStride, - uint8_t * u, size_t uStride, uint8_t * v, size_t vStride) - { - if (Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride) - && Aligned(v) && Aligned(vStride) && Aligned(bgr) && Aligned(bgrStride)) - BgrToYuv420p(bgr, width, height, bgrStride, y, yStride, u, uStride, v, vStride); - else - BgrToYuv420p(bgr, width, height, bgrStride, y, yStride, u, uStride, v, vStride); - } - - SIMD_INLINE void Average16(__m512i a[2][2]) - { - a[0][0] = _mm512_srli_epi16(_mm512_add_epi16(a[0][0], K16_0001), 1); - a[0][1] = _mm512_srli_epi16(_mm512_add_epi16(a[0][1], K16_0001), 1); - a[1][0] = _mm512_srli_epi16(_mm512_add_epi16(a[1][0], K16_0001), 1); - a[1][1] = _mm512_srli_epi16(_mm512_add_epi16(a[1][1], K16_0001), 1); - } - - template SIMD_INLINE void BgrToYuv422p(const uint8_t * bgr, uint8_t * y, uint8_t * u, uint8_t * v, const __mmask64 * ms) - { - __m512i _b16_r16[2][2], _g16_1[2][2]; - Store(y + 0, LoadAndConvertBgrToY8(bgr + 0 * A, _b16_r16[0], _g16_1[0], ms + 0), ms[8]); - Store(y + A, LoadAndConvertBgrToY8(bgr + 3 * A, _b16_r16[1], _g16_1[1], ms + 4), ms[9]); - - Average16(_b16_r16); - Average16(_g16_1); - - Store(u, Permuted2Pack16iTo8u(ConvertU16(_b16_r16[0], _g16_1[0]), ConvertU16(_b16_r16[1], _g16_1[1])), ms[10]); - Store(v, Permuted2Pack16iTo8u(ConvertV16(_b16_r16[0], _g16_1[0]), ConvertV16(_b16_r16[1], _g16_1[1])), ms[10]); - } - - template void BgrToYuv422p(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * y, size_t yStride, - uint8_t * u, size_t uStride, uint8_t * v, size_t vStride) - { - assert(width % 2 == 0); - if (align) - { - assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride)); - assert(Aligned(v) && Aligned(vStride) && Aligned(bgr) && Aligned(bgrStride)); - } - - width /= 2; - size_t alignedWidth = AlignLo(width - 1, A); - size_t tail = width - alignedWidth; - __mmask64 tailMasks[11]; - for (size_t i = 0; i < 8; ++i) - tailMasks[i] = TailMask64(tail * 6 - 48 * i) & 0x0000FFFFFFFFFFFF; - for (size_t i = 0; i < 2; ++i) - tailMasks[8 + i] = TailMask64(tail * 2 - A*i); - tailMasks[10] = TailMask64(tail); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < alignedWidth; col += A) - BgrToYuv422p(bgr + col * 6, y + col * 2, u + col, v + col, tailMasks); - if (col < width) - BgrToYuv422p(bgr + col * 6, y + col * 2, u + col, v + col, tailMasks); - y += yStride; - u += uStride; - v += vStride; - bgr += bgrStride; - } - } - - void 
BgrToYuv422p(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * y, size_t yStride, - uint8_t * u, size_t uStride, uint8_t * v, size_t vStride) - { - if (Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride) - && Aligned(v) && Aligned(vStride) && Aligned(bgr) && Aligned(bgrStride)) - BgrToYuv422p(bgr, width, height, bgrStride, y, yStride, u, uStride, v, vStride); - else - BgrToYuv422p(bgr, width, height, bgrStride, y, yStride, u, uStride, v, vStride); - } - - SIMD_INLINE __m512i ConvertY16(__m512i b16_r16[2], __m512i g16_1[2]) - { - return Saturate16iTo8u(_mm512_add_epi16(K16_Y_ADJUST, _mm512_packs_epi32(BgrToY32(b16_r16[0], g16_1[0]), BgrToY32(b16_r16[1], g16_1[1])))); - } - - template SIMD_INLINE void BgrToYuv444p(const uint8_t * bgr, uint8_t * y, uint8_t * u, uint8_t * v, const __mmask64 * ms) - { - __m512i _b16_r16[2][2], _g16_1[2][2]; - LoadPreparedBgr16(bgr + 0x00, _b16_r16[0][0], _g16_1[0][0], ms + 0); - LoadPreparedBgr16(bgr + 0x30, _b16_r16[0][1], _g16_1[0][1], ms + 1); - LoadPreparedBgr16(bgr + 0x60, _b16_r16[1][0], _g16_1[1][0], ms + 2); - LoadPreparedBgr16(bgr + 0x90, _b16_r16[1][1], _g16_1[1][1], ms + 3); - - Store(y, Permuted2Pack16iTo8u(ConvertY16(_b16_r16[0], _g16_1[0]), ConvertY16(_b16_r16[1], _g16_1[1])), ms[4]); - Store(u, Permuted2Pack16iTo8u(ConvertU16(_b16_r16[0], _g16_1[0]), ConvertU16(_b16_r16[1], _g16_1[1])), ms[4]); - Store(v, Permuted2Pack16iTo8u(ConvertV16(_b16_r16[0], _g16_1[0]), ConvertV16(_b16_r16[1], _g16_1[1])), ms[4]); - } - - template void BgrToYuv444p(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * y, size_t yStride, - uint8_t * u, size_t uStride, uint8_t * v, size_t vStride) - { - if (align) - { - assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride)); - assert(Aligned(v) && Aligned(vStride) && Aligned(bgr) && Aligned(bgrStride)); - } - - size_t alignedWidth = AlignLo(width - 1, A); - size_t tail = width - alignedWidth; - __mmask64 tailMasks[5]; - for (size_t i = 0; i < 4; ++i) - tailMasks[i] = TailMask64(tail * 3 - 48 * i) & 0x0000FFFFFFFFFFFF; - tailMasks[4] = TailMask64(tail); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < alignedWidth; col += A) - BgrToYuv444p(bgr + col * 3, y + col, u + col, v + col, tailMasks); - if (col < width) - BgrToYuv444p(bgr + col * 3, y + col, u + col, v + col, tailMasks); - y += yStride; - u += uStride; - v += vStride; - bgr += bgrStride; - } - } - - void BgrToYuv444p(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * y, size_t yStride, - uint8_t * u, size_t uStride, uint8_t * v, size_t vStride) - { - if (Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride) - && Aligned(v) && Aligned(vStride) && Aligned(bgr) && Aligned(bgrStride)) - BgrToYuv444p(bgr, width, height, bgrStride, y, yStride, u, uStride, v, vStride); - else - BgrToYuv444p(bgr, width, height, bgrStride, y, yStride, u, uStride, v, vStride); - } - } -#endif// SIMD_AVX512BW_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx512bwBgraToBayer.cpp b/src/3rd/Simd/Simd/SimdAvx512bwBgraToBayer.cpp deleted file mode 100644 index 9fa78d4e..00000000 --- a/src/3rd/Simd/Simd/SimdAvx512bwBgraToBayer.cpp +++ /dev/null @@ -1,120 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. 
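For reference, the BgrToYuv420p/422p/444p kernels in the file above load 48 bytes of packed BGR per 64-byte ZMM register, which is why the tail masks are built from tail * 6 - 48 * i and capped at 0x0000FFFFFFFFFFFF, and why alignedWidth is computed from width - 1 so the last block always takes the masked path and the 48-of-64-byte loads never read past the row. Each kernel emits one Y per pixel and averages the B, G, R sums over a 2x2 block (4:2:0) or a horizontal pair (4:2:2) before forming U and V. A minimal scalar sketch of the 4:2:0 case, assuming the usual BT.601 studio-swing coefficients with Y_ADJUST = 16 and UV_ADJUST = 128; the actual fixed-point constants live in SimdConversion.h, outside this diff:

#include <cstdint>

// Hypothetical scalar reference; the float coefficients are assumed BT.601
// studio-swing values standing in for the library's fixed-point
// BgrToY32/BgrToU32/BgrToV32 helpers (defined outside this diff).
static uint8_t Clamp8(float v) { int i = (int)v; return (uint8_t)(i < 0 ? 0 : (i > 255 ? 255 : i)); }
static uint8_t ScalarY(int b, int g, int r) { return Clamp8(0.098f * b + 0.504f * g + 0.257f * r + 16.5f); }
static uint8_t ScalarU(int b, int g, int r) { return Clamp8(0.439f * b - 0.291f * g - 0.148f * r + 128.5f); }
static uint8_t ScalarV(int b, int g, int r) { return Clamp8(-0.071f * b - 0.368f * g + 0.439f * r + 128.5f); }

// One 2x2 block: four Y samples, one U and one V from the averaged colors.
// Average16() adds 2 and shifts right by 2, i.e. round-half-up over four
// samples, which the (x + 2) / 4 terms mirror.
static void BgrToYuv420pBlock(const uint8_t* bgr0, const uint8_t* bgr1,
    uint8_t* y0, uint8_t* y1, uint8_t* u, uint8_t* v)
{
    const uint8_t* rows[2] = { bgr0, bgr1 };
    uint8_t* ys[2] = { y0, y1 };
    int b = 0, g = 0, r = 0;
    for (int i = 0; i < 2; ++i)
        for (int j = 0; j < 2; ++j)
        {
            const uint8_t* p = rows[i] + 3 * j;
            b += p[0]; g += p[1]; r += p[2];
            ys[i][j] = ScalarY(p[0], p[1], p[2]);
        }
    *u = ScalarU((b + 2) / 4, (g + 2) / 4, (r + 2) / 4);
    *v = ScalarV((b + 2) / 4, (g + 2) / 4, (r + 2) / 4);
}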
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" - -namespace Simd -{ -#ifdef SIMD_AVX512BW_ENABLE - namespace Avx512bw - { - const __m128i K8_SHUFFLE_GR = SIMD_MM_SETR_EPI8(0x1, 0x6, 0x9, 0xE, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); - const __m128i K8_SHUFFLE_BG = SIMD_MM_SETR_EPI8(0x0, 0x5, 0x8, 0xD, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); - const __m128i K8_SHUFFLE_GB = SIMD_MM_SETR_EPI8(0x1, 0x4, 0x9, 0xC, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); - const __m128i K8_SHUFFLE_RG = SIMD_MM_SETR_EPI8(0x2, 0x5, 0xA, 0xD, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); - - const __m512i K32_PERMUTE_BGRA_TO_BAYER_0 = SIMD_MM512_SETR_EPI32(0x00, 0x04, 0x08, 0x0C, 0x10, 0x14, 0x18, 0x1C, -1, -1, -1, -1, -1, -1, -1, -1); - const __m512i K32_PERMUTE_BGRA_TO_BAYER_1 = SIMD_MM512_SETR_EPI32(-1, -1, -1, -1, -1, -1, -1, -1, 0x00, 0x04, 0x08, 0x0C, 0x10, 0x14, 0x18, 0x1C); - - template SIMD_INLINE void BgraToBayer(const uint8_t * bgra, uint8_t * bayer, const __m512i shuffle[4][2], __mmask64 ms[5]) - { - const __m512i bayer0 = _mm512_shuffle_epi8((Load(bgra + 0 * A, ms[0])), shuffle[format][row]); - const __m512i bayer1 = _mm512_shuffle_epi8((Load(bgra + 1 * A, ms[1])), shuffle[format][row]); - const __m512i bayer2 = _mm512_shuffle_epi8((Load(bgra + 2 * A, ms[2])), shuffle[format][row]); - const __m512i bayer3 = _mm512_shuffle_epi8((Load(bgra + 3 * A, ms[3])), shuffle[format][row]); - __m512i bayer01xx = _mm512_permutex2var_epi32(bayer0, K32_PERMUTE_BGRA_TO_BAYER_0, bayer1); - __m512i bayerxx23 = _mm512_permutex2var_epi32(bayer2, K32_PERMUTE_BGRA_TO_BAYER_1, bayer3); - Store(bayer, _mm512_or_si512(bayer01xx, bayerxx23), ms[4]); - } - - template void BgraToBayer(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * bayer, size_t bayerStride) - { - if (align) - assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(bayer) && Aligned(bayerStride)); - - const __m512i shuffle[4][2] = - { - { _mm512_broadcast_i32x4(K8_SHUFFLE_GR), _mm512_broadcast_i32x4(K8_SHUFFLE_BG)}, - { _mm512_broadcast_i32x4(K8_SHUFFLE_GB), _mm512_broadcast_i32x4(K8_SHUFFLE_RG)}, - { _mm512_broadcast_i32x4(K8_SHUFFLE_RG), _mm512_broadcast_i32x4(K8_SHUFFLE_GB)}, - { _mm512_broadcast_i32x4(K8_SHUFFLE_BG), _mm512_broadcast_i32x4(K8_SHUFFLE_GR)} - }; - - size_t alignedWidth = AlignLo(width, A); - __mmask64 tailMasks[5]; - for (size_t c = 0; c < 4; ++c) - tailMasks[c] = TailMask64((width - alignedWidth) * 4 - A*c); - 
tailMasks[4] = TailMask64(width - alignedWidth); - - for (size_t row = 0, col = 0; row < height; row += 2) - { - for (col = 0; col < alignedWidth; col += A) - BgraToBayer(bgra + 4 * col, bayer + col, shuffle, tailMasks); - if (col < width) - BgraToBayer(bgra + 4 * col, bayer + col, shuffle, tailMasks); - bgra += bgraStride; - bayer += bayerStride; - - for (col = 0; col < alignedWidth; col += A) - BgraToBayer(bgra + 4 * col, bayer + col, shuffle, tailMasks); - if (col < width) - BgraToBayer(bgra + 4 * col, bayer + col, shuffle, tailMasks); - bgra += bgraStride; - bayer += bayerStride; - } - } - - template void BgraToBayer(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * bayer, size_t bayerStride, SimdPixelFormatType bayerFormat) - { - assert((width % 2 == 0) && (height % 2 == 0)); - - switch (bayerFormat) - { - case SimdPixelFormatBayerGrbg: - BgraToBayer<0, align>(bgra, width, height, bgraStride, bayer, bayerStride); - break; - case SimdPixelFormatBayerGbrg: - BgraToBayer<1, align>(bgra, width, height, bgraStride, bayer, bayerStride); - break; - case SimdPixelFormatBayerRggb: - BgraToBayer<2, align>(bgra, width, height, bgraStride, bayer, bayerStride); - break; - case SimdPixelFormatBayerBggr: - BgraToBayer<3, align>(bgra, width, height, bgraStride, bayer, bayerStride); - break; - default: - assert(0); - } - } - - void BgraToBayer(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * bayer, size_t bayerStride, SimdPixelFormatType bayerFormat) - { - if (Aligned(bgra) && Aligned(bgraStride) && Aligned(bayer) && Aligned(bayerStride)) - BgraToBayer(bgra, width, height, bgraStride, bayer, bayerStride, bayerFormat); - else - BgraToBayer(bgra, width, height, bgraStride, bayer, bayerStride, bayerFormat); - } - } -#endif// SIMD_AVX512BW_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx512bwBgraToBgr.cpp b/src/3rd/Simd/Simd/SimdAvx512bwBgraToBgr.cpp deleted file mode 100644 index cf82e931..00000000 --- a/src/3rd/Simd/Simd/SimdAvx512bwBgraToBgr.cpp +++ /dev/null @@ -1,151 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
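In the BgraToBayer file that just closed, each K8_SHUFFLE_* constant picks one channel per pixel out of a 16-byte block of four BGRA pixels (for GRBG: bytes 1, 6, 9, 14 give G R G R, bytes 0, 5, 8, 13 give B G B G), the two-row loop applies the (even-row, odd-row) shuffle pair for the requested layout, and the two K32_PERMUTE_BGRA_TO_BAYER constants compact the four partial results into one register. A scalar sketch of the same selection for a single layout, with a hypothetical helper name:

#include <cstdint>
#include <cstddef>

// Scalar equivalent of the GRBG shuffle pair: even rows take G,R,G,R...,
// odd rows take B,G,B,G... (BGRA byte order: B=0, G=1, R=2, A=3).
// Shown for one Bayer layout only; the other three permute the offsets.
static void BgraToBayerGrbg(const uint8_t* bgra, size_t width, size_t height,
    size_t bgraStride, uint8_t* bayer, size_t bayerStride)
{
    static const int offsets[2][2] = { { 1, 2 }, { 0, 1 } }; // {G,R}, {B,G}
    for (size_t row = 0; row < height; ++row)
    {
        for (size_t col = 0; col < width; ++col)
            bayer[col] = bgra[4 * col + offsets[row & 1][col & 1]];
        bgra += bgraStride;
        bayer += bayerStride;
    }
}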
-*/ -#include "Simd/SimdStore.h" -#include "Simd/SimdMemory.h" - -namespace Simd -{ -#ifdef SIMD_AVX512BW_ENABLE - namespace Avx512bw - { - const __m512i K8_SUFFLE_BGRA_TO_BGR = SIMD_MM512_SETR_EPI8( - 0x0, 0x1, 0x2, 0x4, 0x5, 0x6, 0x8, 0x9, 0xA, 0xC, 0xD, 0xE, -1, -1, -1, -1, - 0x0, 0x1, 0x2, 0x4, 0x5, 0x6, 0x8, 0x9, 0xA, 0xC, 0xD, 0xE, -1, -1, -1, -1, - 0x0, 0x1, 0x2, 0x4, 0x5, 0x6, 0x8, 0x9, 0xA, 0xC, 0xD, 0xE, -1, -1, -1, -1, - 0x0, 0x1, 0x2, 0x4, 0x5, 0x6, 0x8, 0x9, 0xA, 0xC, 0xD, 0xE, -1, -1, -1, -1); - - const __m512i K32_PERMUTE_BGRA_TO_BGR = SIMD_MM512_SETR_EPI32(0x0, 0x1, 0x2, 0x4, 0x5, 0x6, 0x8, 0x9, 0xA, 0xC, 0xD, 0xE, -1, -1, -1, -1); - - const __m512i K32_PERMUTE_BGRA_TO_BGR_0 = SIMD_MM512_SETR_EPI32(0x00, 0x01, 0x02, 0x04, 0x05, 0x06, 0x08, 0x09, 0x0A, 0x0C, 0x0D, 0x0E, 0x10, 0x11, 0x12, 0x14); - const __m512i K32_PERMUTE_BGRA_TO_BGR_1 = SIMD_MM512_SETR_EPI32(0x05, 0x06, 0x08, 0x09, 0x0A, 0x0C, 0x0D, 0x0E, 0x10, 0x11, 0x12, 0x14, 0x15, 0x16, 0x18, 0x19); - const __m512i K32_PERMUTE_BGRA_TO_BGR_2 = SIMD_MM512_SETR_EPI32(0x0A, 0x0C, 0x0D, 0x0E, 0x10, 0x11, 0x12, 0x14, 0x15, 0x16, 0x18, 0x19, 0x1A, 0x1C, 0x1D, 0x1E); - - template SIMD_INLINE void BgraToBgr(const uint8_t * bgra, uint8_t * bgr, __mmask64 bgraMask = -1, __mmask64 bgrMask = 0x0000ffffffffffff) - { - __m512i _bgra = Load(bgra, bgraMask); - __m512i _bgr = _mm512_permutexvar_epi32(K32_PERMUTE_BGRA_TO_BGR, _mm512_shuffle_epi8(_bgra, K8_SUFFLE_BGRA_TO_BGR)); - Store(bgr, _bgr, bgrMask); - } - - template SIMD_INLINE void BgraToBgr(const uint8_t * bgra, uint8_t * bgr) - { - __m512i bgr0 = _mm512_shuffle_epi8(Load(bgra + 0 * A), K8_SUFFLE_BGRA_TO_BGR); - __m512i bgr1 = _mm512_shuffle_epi8(Load(bgra + 1 * A), K8_SUFFLE_BGRA_TO_BGR); - __m512i bgr2 = _mm512_shuffle_epi8(Load(bgra + 2 * A), K8_SUFFLE_BGRA_TO_BGR); - __m512i bgr3 = _mm512_shuffle_epi8(Load(bgra + 3 * A), K8_SUFFLE_BGRA_TO_BGR); - Store(bgr + 0 * A, _mm512_permutex2var_epi32(bgr0, K32_PERMUTE_BGRA_TO_BGR_0, bgr1)); - Store(bgr + 1 * A, _mm512_permutex2var_epi32(bgr1, K32_PERMUTE_BGRA_TO_BGR_1, bgr2)); - Store(bgr + 2 * A, _mm512_permutex2var_epi32(bgr2, K32_PERMUTE_BGRA_TO_BGR_2, bgr3)); - } - - template void BgraToBgr(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * bgr, size_t bgrStride) - { - if (align) - assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(bgr) && Aligned(bgrStride)); - - size_t fullAlignedWidth = AlignLo(width, A); - size_t alignedWidth = AlignLo(width, F); - __mmask64 bgraTailMask = TailMask64((width - alignedWidth) * 4); - __mmask64 bgrTailMask = TailMask64((width - alignedWidth) * 3); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < fullAlignedWidth; col += A) - BgraToBgr(bgra + 4 * col, bgr + 3 * col); - for (; col < alignedWidth; col += F) - BgraToBgr(bgra + 4 * col, bgr + 3 * col); - if (col < width) - BgraToBgr(bgra + 4 * col, bgr + 3 * col, bgraTailMask, bgrTailMask); - bgra += bgraStride; - bgr += bgrStride; - } - } - - void BgraToBgr(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * bgr, size_t bgrStride) - { - if (Aligned(bgra) && Aligned(bgraStride) && Aligned(bgr) && Aligned(bgrStride)) - BgraToBgr(bgra, width, height, bgraStride, bgr, bgrStride); - else - BgraToBgr(bgra, width, height, bgraStride, bgr, bgrStride); - } - - //--------------------------------------------------------------------- - - const __m512i K8_SUFFLE_BGRA_TO_RGB = SIMD_MM512_SETR_EPI8( - 0x2, 0x1, 0x0, 0x6, 0x5, 0x4, 0xA, 0x9, 0x8, 0xE, 0xD, 0xC, -1, -1, -1, -1, 
- 0x2, 0x1, 0x0, 0x6, 0x5, 0x4, 0xA, 0x9, 0x8, 0xE, 0xD, 0xC, -1, -1, -1, -1, - 0x2, 0x1, 0x0, 0x6, 0x5, 0x4, 0xA, 0x9, 0x8, 0xE, 0xD, 0xC, -1, -1, -1, -1, - 0x2, 0x1, 0x0, 0x6, 0x5, 0x4, 0xA, 0x9, 0x8, 0xE, 0xD, 0xC, -1, -1, -1, -1); - - template SIMD_INLINE void BgraToRgb(const uint8_t* bgra, uint8_t* rgb, __mmask64 bgraMask = -1, __mmask64 rgbMask = 0x0000ffffffffffff) - { - __m512i _bgra = Load(bgra, bgraMask); - __m512i _rgb = _mm512_permutexvar_epi32(K32_PERMUTE_BGRA_TO_BGR, _mm512_shuffle_epi8(_bgra, K8_SUFFLE_BGRA_TO_RGB)); - Store(rgb, _rgb, rgbMask); - } - - template SIMD_INLINE void BgraToRgb(const uint8_t* bgra, uint8_t* rgb) - { - __m512i rgb0 = _mm512_shuffle_epi8(Load(bgra + 0 * A), K8_SUFFLE_BGRA_TO_RGB); - __m512i rgb1 = _mm512_shuffle_epi8(Load(bgra + 1 * A), K8_SUFFLE_BGRA_TO_RGB); - __m512i rgb2 = _mm512_shuffle_epi8(Load(bgra + 2 * A), K8_SUFFLE_BGRA_TO_RGB); - __m512i rgb3 = _mm512_shuffle_epi8(Load(bgra + 3 * A), K8_SUFFLE_BGRA_TO_RGB); - Store(rgb + 0 * A, _mm512_permutex2var_epi32(rgb0, K32_PERMUTE_BGRA_TO_BGR_0, rgb1)); - Store(rgb + 1 * A, _mm512_permutex2var_epi32(rgb1, K32_PERMUTE_BGRA_TO_BGR_1, rgb2)); - Store(rgb + 2 * A, _mm512_permutex2var_epi32(rgb2, K32_PERMUTE_BGRA_TO_BGR_2, rgb3)); - } - - template void BgraToRgb(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgb, size_t rgbStride) - { - if (align) - assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(rgb) && Aligned(rgbStride)); - - size_t fullAlignedWidth = AlignLo(width, A); - size_t alignedWidth = AlignLo(width, F); - __mmask64 bgraTailMask = TailMask64((width - alignedWidth) * 4); - __mmask64 rgbTailMask = TailMask64((width - alignedWidth) * 3); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < fullAlignedWidth; col += A) - BgraToRgb(bgra + 4 * col, rgb + 3 * col); - for (; col < alignedWidth; col += F) - BgraToRgb(bgra + 4 * col, rgb + 3 * col); - if (col < width) - BgraToRgb(bgra + 4 * col, rgb + 3 * col, bgraTailMask, rgbTailMask); - bgra += bgraStride; - rgb += rgbStride; - } - } - - void BgraToRgb(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* bgr, size_t bgrStride) - { - if (Aligned(bgra) && Aligned(bgraStride) && Aligned(bgr) && Aligned(bgrStride)) - BgraToRgb(bgra, width, height, bgraStride, bgr, bgrStride); - else - BgraToRgb(bgra, width, height, bgraStride, bgr, bgrStride); - } - } -#endif// SIMD_AVX512BW_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx512bwBgraToGray.cpp b/src/3rd/Simd/Simd/SimdAvx512bwBgraToGray.cpp deleted file mode 100644 index b643b571..00000000 --- a/src/3rd/Simd/Simd/SimdAvx512bwBgraToGray.cpp +++ /dev/null @@ -1,88 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. 
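Both kernels in the BgraToBgr file above do the same job with different byte orders: K8_SUFFLE_BGRA_TO_BGR keeps bytes {0, 1, 2} of every 4-byte pixel, K8_SUFFLE_BGRA_TO_RGB keeps {2, 1, 0}, and the K32_PERMUTE_BGRA_TO_BGR_* tables close the 12-of-16-byte gaps across register lanes. The scalar equivalent is a strided 3-of-4-byte copy; a hypothetical sketch:

#include <cstdint>
#include <cstddef>

// Drop the alpha byte of each BGRA pixel; 'swap' selects BGR vs RGB output,
// mirroring the two shuffle tables above.
static void BgraTo3Channel(const uint8_t* bgra, size_t width, size_t height,
    size_t bgraStride, uint8_t* dst, size_t dstStride, bool swap)
{
    for (size_t row = 0; row < height; ++row)
    {
        for (size_t col = 0; col < width; ++col)
        {
            const uint8_t* p = bgra + 4 * col;
            uint8_t* q = dst + 3 * col;
            q[0] = p[swap ? 2 : 0];
            q[1] = p[1];
            q[2] = p[swap ? 0 : 2];
        }
        bgra += bgraStride;
        dst += dstStride;
    }
}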
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdStore.h" -#include "Simd/SimdMemory.h" -#include "Simd/SimdConversion.h" - -namespace Simd -{ -#ifdef SIMD_AVX512BW_ENABLE - namespace Avx512bw - { - const __m512i K16_BLUE_RED = SIMD_MM512_SET2_EPI16(Base::BLUE_TO_GRAY_WEIGHT, Base::RED_TO_GRAY_WEIGHT); - const __m512i K16_GREEN_0000 = SIMD_MM512_SET2_EPI16(Base::GREEN_TO_GRAY_WEIGHT, 0x0000); - const __m512i K32_ROUND_TERM = SIMD_MM512_SET1_EPI32(Base::BGR_TO_GRAY_ROUND_TERM); - - SIMD_INLINE __m512i BgraToGray32(__m512i bgra) - { - const __m512i g0a0 = _mm512_shuffle_epi8(bgra, K8_SUFFLE_BGRA_TO_G0A0); - const __m512i b0r0 = _mm512_and_si512(bgra, K16_00FF); - const __m512i weightedSum = _mm512_add_epi32(_mm512_madd_epi16(g0a0, K16_GREEN_0000), _mm512_madd_epi16(b0r0, K16_BLUE_RED)); - return _mm512_srli_epi32(_mm512_add_epi32(weightedSum, K32_ROUND_TERM), Base::BGR_TO_GRAY_AVERAGING_SHIFT); - } - - template SIMD_INLINE void BgraToGray(const uint8_t * bgra, uint8_t * gray, __mmask64 ms[5]) - { - __m512i gray0 = BgraToGray32(Load(bgra + 0 * A, ms[0])); - __m512i gray1 = BgraToGray32(Load(bgra + 1 * A, ms[1])); - __m512i gray2 = BgraToGray32(Load(bgra + 2 * A, ms[2])); - __m512i gray3 = BgraToGray32(Load(bgra + 3 * A, ms[3])); - __m512i gray01 = _mm512_packs_epi32(gray0, gray1); - __m512i gray23 = _mm512_packs_epi32(gray2, gray3); - __m512i gray0123 = _mm512_packus_epi16(gray01, gray23); - Store(gray, _mm512_permutexvar_epi32(K32_PERMUTE_FOR_TWO_UNPACK, gray0123), ms[4]); - } - - template void BgraToGray(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * gray, size_t grayStride) - { - if (align) - assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(gray) && Aligned(grayStride)); - - size_t alignedWidth = AlignLo(width, A); - __mmask64 tailMasks[5]; - for (size_t c = 0; c < 4; ++c) - tailMasks[c] = TailMask64((width - alignedWidth) * 4 - A * c); - tailMasks[4] = TailMask64(width - alignedWidth); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < alignedWidth; col += A) - BgraToGray(bgra + col * 4, gray + col, tailMasks); - if (col < width) - BgraToGray(bgra + col * 4, gray + col, tailMasks); - bgra += bgraStride; - gray += grayStride; - } - } - - void BgraToGray(const uint8_t *bgra, size_t width, size_t height, size_t bgraStride, uint8_t *gray, size_t grayStride) - { - if (Aligned(bgra) && Aligned(gray) && Aligned(bgraStride) && Aligned(grayStride)) - BgraToGray(bgra, width, height, bgraStride, gray, grayStride); - else - BgraToGray(bgra, width, height, bgraStride, gray, grayStride); - } - } -#endif// SIMD_AVX512BW_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx512bwBgraToYuv.cpp b/src/3rd/Simd/Simd/SimdAvx512bwBgraToYuv.cpp deleted file mode 100644 index 48c3f5de..00000000 --- a/src/3rd/Simd/Simd/SimdAvx512bwBgraToYuv.cpp +++ /dev/null @@ -1,364 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2019 Yermalayeu Ihar. 
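BgraToGray32 above forms the gray sum with two _mm512_madd_epi16 calls: the K16_00FF mask keeps the B and R bytes as 16-bit pairs for one madd, the K8_SUFFLE_BGRA_TO_G0A0 shuffle pairs G with alpha for the other (the zero weight in K16_GREEN_0000 discards alpha), then the round term is added and the result shifted. Assuming the Base:: weights are the usual BT.601 luma coefficients in Q14 fixed point, as in int(0.114 * (1 << 14) + 0.5); their definitions are outside this diff, the per-pixel arithmetic is:

#include <cstdint>

// Assumed Q14 weights; the real constants live in Base:: and are not part
// of this diff.
const int SHIFT = 14, ROUND = 1 << (SHIFT - 1);
const int WB = 1868, WG = 9617, WR = 4899; // 0.114, 0.587, 0.299 in Q14

inline uint8_t BgraToGrayPixel(uint8_t b, uint8_t g, uint8_t r)
{
    // madd(b0r0, K16_BLUE_RED) + madd(g0a0, K16_GREEN_0000) per 32-bit lane,
    // then the rounding shift from BgraToGray32. Weights sum to 1 << SHIFT,
    // so the result never exceeds 255.
    return (uint8_t)((WB * b + WG * g + WR * r + ROUND) >> SHIFT);
}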
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdConversion.h" - -namespace Simd -{ -#ifdef SIMD_AVX512BW_ENABLE - namespace Avx512bw - { - template SIMD_INLINE void LoadPreparedBgra16(const uint8_t * bgra, __m512i & b16_r16, __m512i & g16_1, const __mmask64 * ms) - { - __m512i _bgra = Load(bgra, ms[0]); - b16_r16 = _mm512_and_si512(_bgra, K16_00FF); - g16_1 = _mm512_or_si512(_mm512_shuffle_epi8(_bgra, K8_SUFFLE_BGRA_TO_G000), K32_00010000); - } - - template SIMD_INLINE __m512i LoadAndConvertBgraToY16(const uint8_t * bgra, __m512i & b16_r16, __m512i & g16_1, const __mmask64 * ms) - { - __m512i _b16_r16[2], _g16_1[2]; - LoadPreparedBgra16(bgra + 0, _b16_r16[0], _g16_1[0], ms + 0); - LoadPreparedBgra16(bgra + A, _b16_r16[1], _g16_1[1], ms + 1); - b16_r16 = Hadd32(_b16_r16[0], _b16_r16[1]); - g16_1 = Hadd32(_g16_1[0], _g16_1[1]); - return Saturate16iTo8u(_mm512_add_epi16(K16_Y_ADJUST, _mm512_packs_epi32(BgrToY32(_b16_r16[0], _g16_1[0]), BgrToY32(_b16_r16[1], _g16_1[1])))); - } - - template SIMD_INLINE __m512i LoadAndConvertBgraToY8(const uint8_t * bgra, __m512i b16_r16[2], __m512i g16_1[2], const __mmask64 * ms) - { - __m512i lo = LoadAndConvertBgraToY16(bgra + 0 * A, b16_r16[0], g16_1[0], ms + 0); - __m512i hi = LoadAndConvertBgraToY16(bgra + 2 * A, b16_r16[1], g16_1[1], ms + 2); - return Permuted2Pack16iTo8u(lo, hi); - } - - SIMD_INLINE void Average16(__m512i & a, const __m512i & b) - { - a = _mm512_srli_epi16(_mm512_add_epi16(_mm512_add_epi16(a, b), K16_0002), 2); - } - - SIMD_INLINE __m512i ConvertU16(__m512i b16_r16[2], __m512i g16_1[2]) - { - return Saturate16iTo8u(_mm512_add_epi16(K16_UV_ADJUST, _mm512_packs_epi32(BgrToU32(b16_r16[0], g16_1[0]), BgrToU32(b16_r16[1], g16_1[1])))); - } - - SIMD_INLINE __m512i ConvertV16(__m512i b16_r16[2], __m512i g16_1[2]) - { - return Saturate16iTo8u(_mm512_add_epi16(K16_UV_ADJUST, _mm512_packs_epi32(BgrToV32(b16_r16[0], g16_1[0]), BgrToV32(b16_r16[1], g16_1[1])))); - } - - template SIMD_INLINE void BgraToYuv420p(const uint8_t * bgra0, size_t bgraStride, uint8_t * y0, size_t yStride, uint8_t * u, uint8_t * v, const __mmask64 * ms) - { - const uint8_t * bgra1 = bgra0 + bgraStride; - uint8_t * y1 = y0 + yStride; - - __m512i _b16_r16[2][2][2], _g16_1[2][2][2]; - Store(y0 + 0, LoadAndConvertBgraToY8(bgra0 + 0 * A, _b16_r16[0][0], _g16_1[0][0], ms + 0), ms[8]); - Store(y0 + A, LoadAndConvertBgraToY8(bgra0 + 4 * A, _b16_r16[0][1], _g16_1[0][1], ms + 4), ms[9]); - 
Store(y1 + 0, LoadAndConvertBgraToY8(bgra1 + 0 * A, _b16_r16[1][0], _g16_1[1][0], ms + 0), ms[8]); - Store(y1 + A, LoadAndConvertBgraToY8(bgra1 + 4 * A, _b16_r16[1][1], _g16_1[1][1], ms + 4), ms[9]); - - Average16(_b16_r16[0][0][0], _b16_r16[1][0][0]); - Average16(_b16_r16[0][0][1], _b16_r16[1][0][1]); - Average16(_b16_r16[0][1][0], _b16_r16[1][1][0]); - Average16(_b16_r16[0][1][1], _b16_r16[1][1][1]); - - Average16(_g16_1[0][0][0], _g16_1[1][0][0]); - Average16(_g16_1[0][0][1], _g16_1[1][0][1]); - Average16(_g16_1[0][1][0], _g16_1[1][1][0]); - Average16(_g16_1[0][1][1], _g16_1[1][1][1]); - - Store(u, Permuted2Pack16iTo8u(ConvertU16(_b16_r16[0][0], _g16_1[0][0]), ConvertU16(_b16_r16[0][1], _g16_1[0][1])), ms[10]); - Store(v, Permuted2Pack16iTo8u(ConvertV16(_b16_r16[0][0], _g16_1[0][0]), ConvertV16(_b16_r16[0][1], _g16_1[0][1])), ms[10]); - } - - template void BgraToYuv420p(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * y, size_t yStride, - uint8_t * u, size_t uStride, uint8_t * v, size_t vStride) - { - assert((width % 2 == 0) && (height % 2 == 0)); - if (align) - { - assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride)); - assert(Aligned(v) && Aligned(vStride) && Aligned(bgra) && Aligned(bgraStride)); - } - - width /= 2; - size_t alignedWidth = AlignLo(width, A); - size_t tail = width - alignedWidth; - __mmask64 tailMasks[11]; - for (size_t i = 0; i < 8; ++i) - tailMasks[i] = TailMask64(tail * 8 - A*i); - for (size_t i = 0; i < 2; ++i) - tailMasks[8 + i] = TailMask64(tail * 2 - A*i); - tailMasks[10] = TailMask64(tail); - for (size_t row = 0; row < height; row += 2) - { - size_t col = 0; - for (; col < alignedWidth; col += A) - BgraToYuv420p(bgra + col * 8, bgraStride, y + col * 2, yStride, u + col, v + col, tailMasks); - if (col < width) - BgraToYuv420p(bgra + col * 8, bgraStride, y + col * 2, yStride, u + col, v + col, tailMasks); - y += 2 * yStride; - u += uStride; - v += vStride; - bgra += 2 * bgraStride; - } - } - - void BgraToYuv420p(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * y, size_t yStride, - uint8_t * u, size_t uStride, uint8_t * v, size_t vStride) - { - if (Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride) - && Aligned(v) && Aligned(vStride) && Aligned(bgra) && Aligned(bgraStride)) - BgraToYuv420p(bgra, width, height, bgraStride, y, yStride, u, uStride, v, vStride); - else - BgraToYuv420p(bgra, width, height, bgraStride, y, yStride, u, uStride, v, vStride); - } - - SIMD_INLINE void Average16(__m512i a[2][2]) - { - a[0][0] = _mm512_srli_epi16(_mm512_add_epi16(a[0][0], K16_0001), 1); - a[0][1] = _mm512_srli_epi16(_mm512_add_epi16(a[0][1], K16_0001), 1); - a[1][0] = _mm512_srli_epi16(_mm512_add_epi16(a[1][0], K16_0001), 1); - a[1][1] = _mm512_srli_epi16(_mm512_add_epi16(a[1][1], K16_0001), 1); - } - - template SIMD_INLINE void BgraToYuv422p(const uint8_t * bgra, uint8_t * y, uint8_t * u, uint8_t * v, const __mmask64 * ms) - { - __m512i _b16_r16[2][2], _g16_1[2][2]; - Store(y + 0, LoadAndConvertBgraToY8(bgra + 0 * A, _b16_r16[0], _g16_1[0], ms + 0), ms[8]); - Store(y + A, LoadAndConvertBgraToY8(bgra + 4 * A, _b16_r16[1], _g16_1[1], ms + 4), ms[9]); - - Average16(_b16_r16); - Average16(_g16_1); - - Store(u, Permuted2Pack16iTo8u(ConvertU16(_b16_r16[0], _g16_1[0]), ConvertU16(_b16_r16[1], _g16_1[1])), ms[10]); - Store(v, Permuted2Pack16iTo8u(ConvertV16(_b16_r16[0], _g16_1[0]), ConvertV16(_b16_r16[1], _g16_1[1])), ms[10]); - } - - template void BgraToYuv422p(const uint8_t * bgra, 
size_t width, size_t height, size_t bgraStride, uint8_t * y, size_t yStride, - uint8_t * u, size_t uStride, uint8_t * v, size_t vStride) - { - assert(width % 2 == 0); - if (align) - { - assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride)); - assert(Aligned(v) && Aligned(vStride) && Aligned(bgra) && Aligned(bgraStride)); - } - - width /= 2; - size_t alignedWidth = AlignLo(width, A); - size_t tail = width - alignedWidth; - __mmask64 tailMasks[11]; - for (size_t i = 0; i < 8; ++i) - tailMasks[i] = TailMask64(tail * 8 - A*i); - for (size_t i = 0; i < 2; ++i) - tailMasks[8 + i] = TailMask64(tail * 2 - A*i); - tailMasks[10] = TailMask64(tail); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < alignedWidth; col += A) - BgraToYuv422p(bgra + col * 8, y + col * 2, u + col, v + col, tailMasks); - if (col < width) - BgraToYuv422p(bgra + col * 8, y + col * 2, u + col, v + col, tailMasks); - y += yStride; - u += uStride; - v += vStride; - bgra += bgraStride; - } - } - - void BgraToYuv422p(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * y, size_t yStride, - uint8_t * u, size_t uStride, uint8_t * v, size_t vStride) - { - if (Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride) - && Aligned(v) && Aligned(vStride) && Aligned(bgra) && Aligned(bgraStride)) - BgraToYuv422p(bgra, width, height, bgraStride, y, yStride, u, uStride, v, vStride); - else - BgraToYuv422p(bgra, width, height, bgraStride, y, yStride, u, uStride, v, vStride); - } - - SIMD_INLINE __m512i ConvertY16(__m512i b16_r16[2], __m512i g16_1[2]) - { - return Saturate16iTo8u(_mm512_add_epi16(K16_Y_ADJUST, _mm512_packs_epi32(BgrToY32(b16_r16[0], g16_1[0]), BgrToY32(b16_r16[1], g16_1[1])))); - } - - template SIMD_INLINE void BgraToYuv444p(const uint8_t * bgra, uint8_t * y, uint8_t * u, uint8_t * v, const __mmask64 * ms) - { - __m512i _b16_r16[2][2], _g16_1[2][2]; - LoadPreparedBgra16(bgra + 0 * A, _b16_r16[0][0], _g16_1[0][0], ms + 0); - LoadPreparedBgra16(bgra + 1 * A, _b16_r16[0][1], _g16_1[0][1], ms + 1); - LoadPreparedBgra16(bgra + 2 * A, _b16_r16[1][0], _g16_1[1][0], ms + 2); - LoadPreparedBgra16(bgra + 3 * A, _b16_r16[1][1], _g16_1[1][1], ms + 3); - - Store(y, Permuted2Pack16iTo8u(ConvertY16(_b16_r16[0], _g16_1[0]), ConvertY16(_b16_r16[1], _g16_1[1])), ms[4]); - Store(u, Permuted2Pack16iTo8u(ConvertU16(_b16_r16[0], _g16_1[0]), ConvertU16(_b16_r16[1], _g16_1[1])), ms[4]); - Store(v, Permuted2Pack16iTo8u(ConvertV16(_b16_r16[0], _g16_1[0]), ConvertV16(_b16_r16[1], _g16_1[1])), ms[4]); - } - - template void BgraToYuv444p(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * y, size_t yStride, - uint8_t * u, size_t uStride, uint8_t * v, size_t vStride) - { - if (align) - { - assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride)); - assert(Aligned(v) && Aligned(vStride) && Aligned(bgra) && Aligned(bgraStride)); - } - - size_t alignedWidth = AlignLo(width, A); - size_t tail = width - alignedWidth; - __mmask64 tailMasks[5]; - for (size_t i = 0; i < 4; ++i) - tailMasks[i] = TailMask64(tail * 4 - A*i); - tailMasks[4] = TailMask64(tail); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < alignedWidth; col += A) - BgraToYuv444p(bgra + col * 4, y + col, u + col, v + col, tailMasks); - if (col < width) - BgraToYuv444p(bgra + col * 4, y + col, u + col, v + col, tailMasks); - y += yStride; - u += uStride; - v += vStride; - bgra += bgraStride; - } - } - - void BgraToYuv444p(const uint8_t * bgra, 
size_t width, size_t height, size_t bgraStride, uint8_t * y, size_t yStride, - uint8_t * u, size_t uStride, uint8_t * v, size_t vStride) - { - if (Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride) - && Aligned(v) && Aligned(vStride) && Aligned(bgra) && Aligned(bgraStride)) - BgraToYuv444p(bgra, width, height, bgraStride, y, yStride, u, uStride, v, vStride); - else - BgraToYuv444p(bgra, width, height, bgraStride, y, yStride, u, uStride, v, vStride); - } - - - template SIMD_INLINE void LoadPreparedBgra16(const uint8_t * bgra, __m512i & b16_r16, __m512i & g16_1, __m512i & a32, const __mmask64 * tails) - { - __m512i _bgra = Load(bgra, tails[0]); - b16_r16 = _mm512_and_si512(_bgra, K16_00FF); - g16_1 = _mm512_or_si512(_mm512_shuffle_epi8(_bgra, K8_SUFFLE_BGRA_TO_G000), K32_00010000); - a32 = _mm512_shuffle_epi8(_bgra, K8_SUFFLE_BGRA_TO_A000); - } - - template SIMD_INLINE void LoadAndConvertYA16(const uint8_t * bgra, __m512i & b16_r16, __m512i & g16_1, __m512i & y16, __m512i & a16, const __mmask64 * tails) - { - __m512i _b16_r16[2], _g16_1[2], a32[2]; - LoadPreparedBgra16(bgra + 0, _b16_r16[0], _g16_1[0], a32[0], tails + 0); - LoadPreparedBgra16(bgra + A, _b16_r16[1], _g16_1[1], a32[1], tails + 1); - b16_r16 = Hadd32(_b16_r16[0], _b16_r16[1]); - g16_1 = Hadd32(_g16_1[0], _g16_1[1]); - y16 = Saturate16iTo8u(_mm512_add_epi16(K16_Y_ADJUST, _mm512_packs_epi32(BgrToY32(_b16_r16[0], _g16_1[0]), BgrToY32(_b16_r16[1], _g16_1[1])))); - a16 = _mm512_packs_epi32(a32[0], a32[1]); - } - - template SIMD_INLINE void LoadAndStoreYA(const uint8_t * bgra, __m512i b16_r16[2], __m512i g16_1[2], uint8_t * y, uint8_t * a, const __mmask64 * tails) - { - __m512i y16[2], a16[2]; - LoadAndConvertYA16(bgra + 0 * A, b16_r16[0], g16_1[0], y16[0], a16[0], tails + 0); - LoadAndConvertYA16(bgra + 2 * A, b16_r16[1], g16_1[1], y16[1], a16[1], tails + 2); - Store(y, Permuted2Pack16iTo8u(y16[0], y16[1]), tails[4]); - Store(a, Permuted2Pack16iTo8u(a16[0], a16[1]), tails[4]); - } - - template SIMD_INLINE void BgraToYuva420p(const uint8_t * bgra0, size_t bgraStride, uint8_t * y0, size_t yStride, uint8_t * u, uint8_t * v, uint8_t * a0, size_t aStride, const __mmask64 * tails) - { - const uint8_t * bgra1 = bgra0 + bgraStride; - uint8_t * y1 = y0 + yStride; - uint8_t * a1 = a0 + aStride; - - __m512i _b16_r16[2][2][2], _g16_1[2][2][2]; - LoadAndStoreYA(bgra0 + 0 * A, _b16_r16[0][0], _g16_1[0][0], y0 + 0, a0 + 0, tails + 0); - LoadAndStoreYA(bgra0 + 4 * A, _b16_r16[0][1], _g16_1[0][1], y0 + A, a0 + A, tails + 5); - LoadAndStoreYA(bgra1 + 0 * A, _b16_r16[1][0], _g16_1[1][0], y1 + 0, a1 + 0, tails + 0); - LoadAndStoreYA(bgra1 + 4 * A, _b16_r16[1][1], _g16_1[1][1], y1 + A, a1 + A, tails + 5); - - Average16(_b16_r16[0][0][0], _b16_r16[1][0][0]); - Average16(_b16_r16[0][0][1], _b16_r16[1][0][1]); - Average16(_b16_r16[0][1][0], _b16_r16[1][1][0]); - Average16(_b16_r16[0][1][1], _b16_r16[1][1][1]); - - Average16(_g16_1[0][0][0], _g16_1[1][0][0]); - Average16(_g16_1[0][0][1], _g16_1[1][0][1]); - Average16(_g16_1[0][1][0], _g16_1[1][1][0]); - Average16(_g16_1[0][1][1], _g16_1[1][1][1]); - - Store(u, Permuted2Pack16iTo8u(ConvertU16(_b16_r16[0][0], _g16_1[0][0]), ConvertU16(_b16_r16[0][1], _g16_1[0][1])), tails[10]); - Store(v, Permuted2Pack16iTo8u(ConvertV16(_b16_r16[0][0], _g16_1[0][0]), ConvertV16(_b16_r16[0][1], _g16_1[0][1])), tails[10]); - } - - template void BgraToYuva420p(const uint8_t * bgra, size_t bgraStride, size_t width, size_t height, uint8_t * y, size_t yStride, - uint8_t * u, size_t uStride, uint8_t * v, size_t 
vStride, uint8_t * a, size_t aStride) - { - assert((width % 2 == 0) && (height % 2 == 0) && (width >= DA) && (height >= 2)); - if (align) - { - assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride)); - assert(Aligned(v) && Aligned(vStride) && Aligned(a) && Aligned(aStride) && Aligned(bgra) && Aligned(bgraStride)); - } - - width /= 2; - size_t alignedWidth = AlignLo(width, A); - size_t tail = width - alignedWidth; - __mmask64 tails[11]; - for (size_t i = 0; i < 4; ++i) - { - tails[i + 0] = TailMask64(tail * 8 - A * (i + 0)); - tails[i + 5] = TailMask64(tail * 8 - A * (i + 4)); - } - tails[4] = TailMask64(tail * 2 - A * 0); - tails[9] = TailMask64(tail * 2 - A * 1); - tails[10] = TailMask64(tail); - for (size_t row = 0; row < height; row += 2) - { - size_t col = 0; - for (; col < alignedWidth; col += A) - BgraToYuva420p(bgra + col*8, bgraStride, y + col*2, yStride, u + col, v + col, a + col*2, aStride, tails); - if (col < width) - BgraToYuva420p(bgra + col*8, bgraStride, y + col*2, yStride, u + col, v + col, a + col*2, aStride, tails); - y += 2 * yStride; - u += uStride; - v += vStride; - a += 2 * aStride; - bgra += 2 * bgraStride; - } - } - - void BgraToYuva420p(const uint8_t * bgra, size_t bgraStride, size_t width, size_t height, uint8_t * y, size_t yStride, - uint8_t * u, size_t uStride, uint8_t * v, size_t vStride, uint8_t * a, size_t aStride) - { - if (Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride) && Aligned(v) && Aligned(vStride) - && Aligned(a) && Aligned(aStride) && Aligned(bgra) && Aligned(bgraStride)) - BgraToYuva420p(bgra, bgraStride, width, height, y, yStride, u, uStride, v, vStride, a, aStride); - else - BgraToYuva420p(bgra, bgraStride, width, height, y, yStride, u, uStride, v, vStride, a, aStride); - } - } -#endif// SIMD_AVX512BW_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx512bwBinarization.cpp b/src/3rd/Simd/Simd/SimdAvx512bwBinarization.cpp deleted file mode 100644 index f2dd7335..00000000 --- a/src/3rd/Simd/Simd/SimdAvx512bwBinarization.cpp +++ /dev/null @@ -1,290 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
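A note on the data layout used throughout the BgraToYuv file above: LoadPreparedBgra16 masks each BGRA dword down to 16-bit {B, R} pairs and shuffles out {G, 1} pairs (the OR with K32_00010000 plants the constant 1), so a single _mm512_madd_epi16 per register evaluates B*WB + R*WR and G*WG + bias*1; the BgrToY32/U32/V32 helpers in SimdConversion.h, outside this diff, add the two products and shift. For 4:2:0 the per-pixel sums are averaged with Average16 (add 2, shift 2) across the 2x2 block before the U/V madds; 4:2:2 averages horizontal pairs only (add 1, shift 1), and BgraToYuva420p reuses the same path while shuffling the alpha bytes out to a fourth plane. Per pixel, the pairing is:

#include <cstdint>

// Sketch of the {B,R} x {WB,WR} plus {G,1} x {WG,ROUND} madd pairing; the
// weight names are placeholders for the library's fixed-point constants.
inline int WeightedBgr(int b, int g, int r, int wb, int wg, int wr,
                       int round, int shift)
{
    int br = b * wb + r * wr;    // madd over the b16_r16 register
    int g1 = g * wg + 1 * round; // madd over the g16_1 register (the "1" lane)
    return (br + g1) >> shift;
}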
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdSet.h" -#include "Simd/SimdCompare.h" - -namespace Simd -{ -#ifdef SIMD_AVX512BW_ENABLE - namespace Avx512bw - { - template SIMD_INLINE void Binarization(const uint8_t * src, - const __m512i & value, const __m512i & positive, const __m512i & negative, uint8_t * dst, __mmask64 m = -1) - { - __mmask64 mm = Compare8u(Load(src, m), value); - Store(dst, _mm512_mask_blend_epi8(mm, negative, positive), m); - } - - template SIMD_INLINE void Binarization4(const uint8_t * src, - const __m512i & value, const __m512i & positive, const __m512i & negative, uint8_t * dst) - { - Store(dst + 0 * A, _mm512_mask_blend_epi8(Compare8u(Load(src + 0 * A), value), negative, positive)); - Store(dst + 1 * A, _mm512_mask_blend_epi8(Compare8u(Load(src + 1 * A), value), negative, positive)); - Store(dst + 2 * A, _mm512_mask_blend_epi8(Compare8u(Load(src + 2 * A), value), negative, positive)); - Store(dst + 3 * A, _mm512_mask_blend_epi8(Compare8u(Load(src + 3 * A), value), negative, positive)); - } - - template - void Binarization(const uint8_t * src, size_t srcStride, size_t width, size_t height, - uint8_t value, uint8_t positive, uint8_t negative, uint8_t * dst, size_t dstStride) - { - if (align) - assert(Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride)); - - size_t alignedWidth = Simd::AlignLo(width, A); - size_t fullAlignedWidth = Simd::AlignLo(width, QA); - __mmask64 tailMask = TailMask64(width - alignedWidth); - __m512i _value = _mm512_set1_epi8(value); - __m512i _positive = _mm512_set1_epi8(positive); - __m512i _negative = _mm512_set1_epi8(negative); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < fullAlignedWidth; col += QA) - Binarization4(src + col, _value, _positive, _negative, dst + col); - for (; col < alignedWidth; col += A) - Binarization(src + col, _value, _positive, _negative, dst + col); - if (col < width) - Binarization(src + col, _value, _positive, _negative, dst + col, tailMask); - src += srcStride; - dst += dstStride; - } - } - - template - void Binarization(const uint8_t * src, size_t srcStride, size_t width, size_t height, - uint8_t value, uint8_t positive, uint8_t negative, uint8_t * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride)) - Binarization(src, srcStride, width, height, value, positive, negative, dst, dstStride); - else - Binarization(src, srcStride, width, height, value, positive, negative, dst, dstStride); - } - - void Binarization(const uint8_t * src, size_t srcStride, size_t width, size_t height, - uint8_t value, uint8_t positive, uint8_t negative, uint8_t * dst, size_t dstStride, SimdCompareType compareType) - { - switch (compareType) - { - case SimdCompareEqual: - return Binarization(src, srcStride, width, height, value, positive, negative, dst, dstStride); - case SimdCompareNotEqual: - return Binarization(src, srcStride, width, height, value, positive, negative, dst, dstStride); - case SimdCompareGreater: - return Binarization(src, srcStride, width, height, value, positive, negative, dst, dstStride); - case SimdCompareGreaterOrEqual: - return Binarization(src, srcStride, width, height, value, positive, negative, dst, dstStride); - case SimdCompareLesser: - return Binarization(src, srcStride, width, height, value, positive, negative, dst, dstStride); - case SimdCompareLesserOrEqual: - return Binarization(src, srcStride, width, height, value, positive, negative, dst, dstStride); - 
default: - assert(0); - } - } - - namespace - { - struct Buffer - { - Buffer(size_t width, size_t edge) - { - size_t size = sizeof(uint8_t)*(width + 2 * edge) + sizeof(uint32_t)*(2 * width + 2 * edge); - _p = Allocate(size); - memset(_p, 0, size); - s = (uint8_t*)_p + edge; - s0a0 = (uint32_t*)(s + width + edge) + edge; - sum = (uint32_t*)(s0a0 + width + edge); - } - - ~Buffer() - { - Free(_p); - } - - uint8_t * s; - uint32_t * s0a0; - uint32_t * sum; - private: - void *_p; - }; - } - - template SIMD_INLINE void AddRows(const uint8_t * src, uint8_t * sum, const __m512i & value, __mmask64 tail = -1) - { - __mmask64 inc = Compare8u(Load(src, tail), value); - __m512i _sum = Load(sum, tail); - _sum = _mm512_mask_add_epi8(_sum, inc, _sum, K8_01); - Store(sum, _sum, tail); - } - - template SIMD_INLINE void SubRows(const uint8_t * src, uint8_t * sum, const __m512i & value, __mmask64 tail = -1) - { - __mmask64 dec = Compare8u(Load(src, tail), value); - __m512i _sum = Load(sum, tail); - _sum = _mm512_mask_sub_epi8(_sum, dec, _sum, K8_01); - Store(sum, _sum, tail); - } - - template SIMD_INLINE void Unpack(const uint8_t * sum, const __m512i & area, uint32_t * s0a0, const __mmask16 * tailMasks) - { - const __m512i _sum = _mm512_permutexvar_epi32(K32_PERMUTE_FOR_TWO_UNPACK, (Load(sum))); - const __m512i saLo = _mm512_unpacklo_epi8(_sum, area); - const __m512i saHi = _mm512_unpackhi_epi8(_sum, area); - Store(s0a0 + 0 * F, _mm512_unpacklo_epi8(saLo, K_ZERO), tailMasks[0]); - Store(s0a0 + 1 * F, _mm512_unpackhi_epi8(saLo, K_ZERO), tailMasks[1]); - Store(s0a0 + 2 * F, _mm512_unpacklo_epi8(saHi, K_ZERO), tailMasks[2]); - Store(s0a0 + 3 * F, _mm512_unpackhi_epi8(saHi, K_ZERO), tailMasks[3]); - } - - template SIMD_INLINE void Binarization(const uint32_t * sum, const __m512i & ff_threshold, const __m512i & positive, const __m512i & negative, uint8_t * dst, __mmask64 tail = -1) - { - union Mask - { - __mmask16 m16[4]; - __mmask64 m64[1]; - } mm; - mm.m16[0] = _mm512_cmpgt_epi32_mask(_mm512_madd_epi16((Load(sum + 0 * F)), ff_threshold), K_ZERO); - mm.m16[1] = _mm512_cmpgt_epi32_mask(_mm512_madd_epi16((Load(sum + 1 * F)), ff_threshold), K_ZERO); - mm.m16[2] = _mm512_cmpgt_epi32_mask(_mm512_madd_epi16((Load(sum + 2 * F)), ff_threshold), K_ZERO); - mm.m16[3] = _mm512_cmpgt_epi32_mask(_mm512_madd_epi16((Load(sum + 3 * F)), ff_threshold), K_ZERO); - Store(dst, _mm512_mask_blend_epi8(mm.m64[0], negative, positive), tail); - } - - template - void AveragingBinarization(const uint8_t * src, size_t srcStride, size_t width, size_t height, - uint8_t value, size_t neighborhood, uint8_t threshold, uint8_t positive, uint8_t negative, uint8_t * dst, size_t dstStride) - { - assert(width > neighborhood && height > neighborhood && neighborhood < 0x7F); - - size_t alignedWidth = Simd::AlignLo(width, A); - __mmask64 tailMask = TailMask64(width - alignedWidth); - __mmask16 tailMasks[4]; - for (size_t c = 0; c < 4; ++c) - tailMasks[c] = TailMask16(width - alignedWidth - F*c); - - const __m512i ff_threshold = SetInt16(0xFF, -threshold); - const __m512i _value = _mm512_set1_epi8(value); - const __m512i _positive = _mm512_set1_epi8(positive); - const __m512i _negative = _mm512_set1_epi8(negative); - - Buffer buffer(AlignHi(width, A), AlignHi(neighborhood + 1, A)); - uint8_t area = 0; - size_t col = 0; - - for (size_t row = 0; row < neighborhood; ++row) - { - area++; - const uint8_t * s = src + row*srcStride; - for (col = 0; col < alignedWidth; col += A) - AddRows(s + col, buffer.s + col, _value); - if (col < width) - AddRows(s + col, 
buffer.s + col, _value, tailMask); - } - - for (size_t row = 0; row < height; ++row) - { - if (row < height - neighborhood) - { - area++; - const uint8_t * s = src + (row + neighborhood)*srcStride; - for (col = 0; col < alignedWidth; col += A) - AddRows(s + col, buffer.s + col, _value); - if (col < width) - AddRows(s + col, buffer.s + col, _value, tailMask); - } - if (row > neighborhood) - { - area--; - const uint8_t * s = src + (row - neighborhood - 1)*srcStride; - for (col = 0; col < alignedWidth; col += A) - SubRows(s + col, buffer.s + col, _value); - if (col < width) - SubRows(s + col, buffer.s + col, _value, tailMask); - } - - __m512i _area = _mm512_set1_epi8(area); - for (col = 0; col < alignedWidth; col += A) - Unpack(buffer.s + col, _area, buffer.s0a0 + col, tailMasks); - if (col < width) - Unpack(buffer.s + col, _area, buffer.s0a0 + col, tailMasks); - - uint32_t sum = 0; - for (col = 0; col < neighborhood; ++col) - { - sum += buffer.s0a0[col]; - } - for (col = 0; col < width; ++col) - { - sum += buffer.s0a0[col + neighborhood]; - sum -= buffer.s0a0[col - neighborhood - 1]; - buffer.sum[col] = sum; - } - - for (col = 0; col < alignedWidth; col += A) - Binarization(buffer.sum + col, ff_threshold, _positive, _negative, dst + col); - if (col < width) - Binarization(buffer.sum + col, ff_threshold, _positive, _negative, dst + col, tailMask); - - dst += dstStride; - } - } - - template - void AveragingBinarization(const uint8_t * src, size_t srcStride, size_t width, size_t height, - uint8_t value, size_t neighborhood, uint8_t threshold, uint8_t positive, uint8_t negative, uint8_t * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride)) - AveragingBinarization(src, srcStride, width, height, value, neighborhood, threshold, positive, negative, dst, dstStride); - else - AveragingBinarization(src, srcStride, width, height, value, neighborhood, threshold, positive, negative, dst, dstStride); - } - - void AveragingBinarization(const uint8_t * src, size_t srcStride, size_t width, size_t height, - uint8_t value, size_t neighborhood, uint8_t threshold, uint8_t positive, uint8_t negative, - uint8_t * dst, size_t dstStride, SimdCompareType compareType) - { - switch (compareType) - { - case SimdCompareEqual: - return AveragingBinarization(src, srcStride, width, height, value, neighborhood, threshold, positive, negative, dst, dstStride); - case SimdCompareNotEqual: - return AveragingBinarization(src, srcStride, width, height, value, neighborhood, threshold, positive, negative, dst, dstStride); - case SimdCompareGreater: - return AveragingBinarization(src, srcStride, width, height, value, neighborhood, threshold, positive, negative, dst, dstStride); - case SimdCompareGreaterOrEqual: - return AveragingBinarization(src, srcStride, width, height, value, neighborhood, threshold, positive, negative, dst, dstStride); - case SimdCompareLesser: - return AveragingBinarization(src, srcStride, width, height, value, neighborhood, threshold, positive, negative, dst, dstStride); - case SimdCompareLesserOrEqual: - return AveragingBinarization(src, srcStride, width, height, value, neighborhood, threshold, positive, negative, dst, dstStride); - default: - assert(0); - } - } - } -#endif// SIMD_AVX512BW_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx512bwConditional.cpp b/src/3rd/Simd/Simd/SimdAvx512bwConditional.cpp deleted file mode 100644 index 1ba9fe6f..00000000 --- a/src/3rd/Simd/Simd/SimdAvx512bwConditional.cpp +++ /dev/null @@ -1,539 +0,0 @@ -/* -* Simd Library 
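The averaging variant that just closed is a separable box filter: AddRows/SubRows keep an 8-bit per-column count of pixels passing the compare over a vertical window of rows, Unpack widens each {count, area} byte pair into one 32-bit word, and the scalar loop slides the horizontal window by adding and subtracting those packed words, so both running sums advance in a single uint32_t addition. The final test needs no division: ff_threshold = SetInt16(0xFF, -threshold) makes _mm512_madd_epi16 compute 255*sum - threshold*area per pixel, and the sign alone decides whether the matching fraction exceeds threshold/255. (The plain Binarization earlier in the file is simpler still: one Compare8u mask feeding _mm512_mask_blend_epi8.) In scalar form:

#include <cstdint>

// The madd-based test from Binarization(): positive iff the fraction of
// matching pixels in the window strictly exceeds threshold/255, without
// dividing by the window area.
inline uint8_t AverageTest(uint32_t sum, uint32_t area,
    uint8_t threshold, uint8_t positive, uint8_t negative)
{
    return (int64_t)255 * sum - (int64_t)threshold * area > 0 ? positive : negative;
}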
(http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdExtract.h" -#include "Simd/SimdSet.h" -#include "Simd/SimdCompare.h" - -namespace Simd -{ -#ifdef SIMD_AVX512BW_ENABLE - namespace Avx512bw - { -#ifdef SIMD_X64_ENABLE - template SIMD_INLINE void ConditionalCount8u(const uint8_t * src, __m512i value, uint64_t * counts, __mmask64 tail = -1) - { - const __m512i _src = Load(src, tail); - __mmask64 bits = Compare8u(_src, value); - counts[0] += _mm_popcnt_u64(bits&tail); - } - - template SIMD_INLINE void ConditionalCount8u4(const uint8_t * src, __m512i value, uint64_t * counts) - { - counts[0] += _mm_popcnt_u64(Compare8u(Load(src + 0 * A), value)); - counts[1] += _mm_popcnt_u64(Compare8u(Load(src + 1 * A), value)); - counts[2] += _mm_popcnt_u64(Compare8u(Load(src + 2 * A), value)); - counts[3] += _mm_popcnt_u64(Compare8u(Load(src + 3 * A), value)); - } - - template void ConditionalCount8u(const uint8_t * src, size_t stride, size_t width, size_t height, uint8_t value, uint32_t * count) - { - if (align) - assert(Aligned(src) && Aligned(stride)); - - size_t alignedWidth = Simd::AlignLo(width, A); - size_t fullAlignedWidth = Simd::AlignLo(width, QA); - __mmask64 tailMask = TailMask64(width - alignedWidth); - - __m512i _value = _mm512_set1_epi8(value); - uint64_t counts[4] = { 0, 0, 0, 0 }; - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < fullAlignedWidth; col += QA) - ConditionalCount8u4(src + col, _value, counts); - for (; col < alignedWidth; col += A) - ConditionalCount8u(src + col, _value, counts); - if (col < width) - ConditionalCount8u(src + col, _value, counts, tailMask); - src += stride; - } - *count = (uint32_t)(counts[0] + counts[1] + counts[2] + counts[3]); - } -#else - template SIMD_INLINE void ConditionalCount8u(const uint8_t * src, __m512i value, uint32_t * counts, __mmask64 tail = -1) - { - const __m512i _src = Load(src, tail); - union Mask - { - __mmask32 m32[2]; - __mmask64 m64[1]; - } bits; - bits.m64[0] = Compare8u(_src, value)&tail; - counts[0] += _mm_popcnt_u32(bits.m32[0]) + _mm_popcnt_u32(bits.m32[1]); - } - - template void ConditionalCount8u(const uint8_t * src, size_t stride, size_t width, size_t height, uint8_t value, uint32_t * count) - { - if (align) - assert(Aligned(src) && Aligned(stride)); - - size_t alignedWidth = Simd::AlignLo(width, A); - __mmask64 tailMask = 
TailMask64(width - alignedWidth); - - __m512i _value = _mm512_set1_epi8(value); - uint32_t counts[1] = { 0 }; - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < alignedWidth; col += A) - ConditionalCount8u(src + col, _value, counts); - if (col < width) - ConditionalCount8u(src + col, _value, counts, tailMask); - src += stride; - } - *count = counts[0]; - } -#endif//SIMD_X64_ENABLE - - template void ConditionalCount8u(const uint8_t * src, size_t stride, size_t width, size_t height, uint8_t value, uint32_t * count) - { - if (Aligned(src) && Aligned(stride)) - ConditionalCount8u(src, stride, width, height, value, count); - else - ConditionalCount8u(src, stride, width, height, value, count); - } - - void ConditionalCount8u(const uint8_t * src, size_t stride, size_t width, size_t height, uint8_t value, SimdCompareType compareType, uint32_t * count) - { - switch (compareType) - { - case SimdCompareEqual: - return ConditionalCount8u(src, stride, width, height, value, count); - case SimdCompareNotEqual: - return ConditionalCount8u(src, stride, width, height, value, count); - case SimdCompareGreater: - return ConditionalCount8u(src, stride, width, height, value, count); - case SimdCompareGreaterOrEqual: - return ConditionalCount8u(src, stride, width, height, value, count); - case SimdCompareLesser: - return ConditionalCount8u(src, stride, width, height, value, count); - case SimdCompareLesserOrEqual: - return ConditionalCount8u(src, stride, width, height, value, count); - default: - assert(0); - } - } - - template SIMD_INLINE void ConditionalCount16i(const uint8_t * src, __m512i value, uint32_t * counts, __mmask32 tail = -1) - { - const __m512i _src = Load((int16_t*)src, tail); - __mmask32 bits = Compare16i(_src, value); - counts[0] += _mm_popcnt_u32(bits&tail); - } - - template SIMD_INLINE void ConditionalCount16i4(const uint8_t * src, __m512i value, uint32_t * counts) - { - counts[0] += _mm_popcnt_u32(Compare16i(Load(src + 0 * A), value)); - counts[1] += _mm_popcnt_u32(Compare16i(Load(src + 1 * A), value)); - counts[2] += _mm_popcnt_u32(Compare16i(Load(src + 2 * A), value)); - counts[3] += _mm_popcnt_u32(Compare16i(Load(src + 3 * A), value)); - } - - template void ConditionalCount16i(const uint8_t * src, size_t stride, size_t width, size_t height, int16_t value, uint32_t * count) - { - if (align) - assert(Aligned(src) && Aligned(stride)); - - width *= 2; - size_t alignedWidth = Simd::AlignLo(width, A); - size_t fullAlignedWidth = Simd::AlignLo(width, QA); - __mmask32 tailMask = TailMask32((width - alignedWidth) / 2); - - __m512i _value = _mm512_set1_epi16(value); - uint32_t counts[4] = { 0, 0, 0, 0 }; - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < fullAlignedWidth; col += QA) - ConditionalCount16i4(src + col, _value, counts); - for (; col < alignedWidth; col += A) - ConditionalCount16i(src + col, _value, counts); - if (col < width) - ConditionalCount16i(src + col, _value, counts, tailMask); - src += stride; - } - *count = counts[0] + counts[1] + counts[2] + counts[3]; - } - - template void ConditionalCount16i(const uint8_t * src, size_t stride, size_t width, size_t height, int16_t value, uint32_t * count) - { - if (Aligned(src) && Aligned(stride)) - ConditionalCount16i(src, stride, width, height, value, count); - else - ConditionalCount16i(src, stride, width, height, value, count); - } - - void ConditionalCount16i(const uint8_t * src, size_t stride, size_t width, size_t height, int16_t value, SimdCompareType compareType, uint32_t * 
count) - { - switch (compareType) - { - case SimdCompareEqual: - return ConditionalCount16i(src, stride, width, height, value, count); - case SimdCompareNotEqual: - return ConditionalCount16i(src, stride, width, height, value, count); - case SimdCompareGreater: - return ConditionalCount16i(src, stride, width, height, value, count); - case SimdCompareGreaterOrEqual: - return ConditionalCount16i(src, stride, width, height, value, count); - case SimdCompareLesser: - return ConditionalCount16i(src, stride, width, height, value, count); - case SimdCompareLesserOrEqual: - return ConditionalCount16i(src, stride, width, height, value, count); - default: - assert(0); - } - } - - template void ConditionalSum4(const uint8_t * src, const uint8_t * mask, const __m512i & value, __m512i * sums) - { - sums[0] = _mm512_add_epi64(sums[0], _mm512_sad_epu8(Load(src + A * 0, Compare8u(Load(mask + A * 0), value)), K_ZERO)); - sums[1] = _mm512_add_epi64(sums[1], _mm512_sad_epu8(Load(src + A * 1, Compare8u(Load(mask + A * 1), value)), K_ZERO)); - sums[2] = _mm512_add_epi64(sums[2], _mm512_sad_epu8(Load(src + A * 2, Compare8u(Load(mask + A * 2), value)), K_ZERO)); - sums[3] = _mm512_add_epi64(sums[3], _mm512_sad_epu8(Load(src + A * 3, Compare8u(Load(mask + A * 3), value)), K_ZERO)); - } - - template void ConditionalSum(const uint8_t * src, const uint8_t * mask, const __m512i & value, __m512i * sums, __mmask64 tail = -1) - { - const __m512i _mask = Load(mask, tail); - __mmask64 mmask = Compare8u(_mask, value)&tail; - const __m512i _src = Load(src, mmask); - sums[0] = _mm512_add_epi64(sums[0], _mm512_sad_epu8(_src, K_ZERO)); - } - - template void ConditionalSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, const uint8_t * mask, size_t maskStride, uint8_t value, uint64_t * sum) - { - if (align) - assert(Aligned(src) && Aligned(srcStride) && Aligned(mask) && Aligned(maskStride)); - - size_t alignedWidth = Simd::AlignLo(width, A); - size_t fullAlignedWidth = Simd::AlignLo(width, QA); - __mmask64 tailMask = TailMask64(width - alignedWidth); - - __m512i _value = _mm512_set1_epi8(value); - __m512i sums[4] = { _mm512_setzero_si512(), _mm512_setzero_si512(), _mm512_setzero_si512(), _mm512_setzero_si512() }; - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < fullAlignedWidth; col += QA) - ConditionalSum4(src + col, mask + col, _value, sums); - for (; col < alignedWidth; col += A) - ConditionalSum(src + col, mask + col, _value, sums); - if (col < width) - ConditionalSum(src + col, mask + col, _value, sums, tailMask); - src += srcStride; - mask += maskStride; - } - sums[0] = _mm512_add_epi64(_mm512_add_epi64(sums[0], sums[1]), _mm512_add_epi64(sums[2], sums[3])); - *sum = ExtractSum(sums[0]); - } - - template void ConditionalSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, const uint8_t * mask, size_t maskStride, uint8_t value, uint64_t * sum) - { - if (Aligned(src) && Aligned(srcStride) && Aligned(mask) && Aligned(maskStride)) - ConditionalSum(src, srcStride, width, height, mask, maskStride, value, sum); - else - ConditionalSum(src, srcStride, width, height, mask, maskStride, value, sum); - } - - void ConditionalSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, const uint8_t * mask, size_t maskStride, uint8_t value, SimdCompareType compareType, uint64_t * sum) - { - switch (compareType) - { - case SimdCompareEqual: - return ConditionalSum(src, srcStride, width, height, mask, maskStride, value, sum); - case SimdCompareNotEqual: - 
return ConditionalSum(src, srcStride, width, height, mask, maskStride, value, sum); - case SimdCompareGreater: - return ConditionalSum(src, srcStride, width, height, mask, maskStride, value, sum); - case SimdCompareGreaterOrEqual: - return ConditionalSum(src, srcStride, width, height, mask, maskStride, value, sum); - case SimdCompareLesser: - return ConditionalSum(src, srcStride, width, height, mask, maskStride, value, sum); - case SimdCompareLesserOrEqual: - return ConditionalSum(src, srcStride, width, height, mask, maskStride, value, sum); - default: - assert(0); - } - } - - SIMD_INLINE __m512i Square(__m512i value) - { - const __m512i lo = _mm512_unpacklo_epi8(value, K_ZERO); - const __m512i hi = _mm512_unpackhi_epi8(value, K_ZERO); - return _mm512_add_epi32(_mm512_madd_epi16(lo, lo), _mm512_madd_epi16(hi, hi)); - } - - template void ConditionalSquareSum(const uint8_t * src, const uint8_t * mask, const __m512i & value, __m512i * sums, __mmask64 tail = -1) - { - const __m512i _mask = Load(mask, tail); - __mmask64 mmask = Compare8u(_mask, value)&tail; - const __m512i _src = Load(src, mmask); - sums[0] = _mm512_add_epi32(sums[0], Square(_src)); - } - - template void ConditionalSquareSum4(const uint8_t * src, const uint8_t * mask, const __m512i & value, __m512i * sums) - { - sums[0] = _mm512_add_epi32(sums[0], Square(Load(src + A * 0, Compare8u(Load(mask + A * 0), value)))); - sums[1] = _mm512_add_epi32(sums[1], Square(Load(src + A * 1, Compare8u(Load(mask + A * 1), value)))); - sums[2] = _mm512_add_epi32(sums[2], Square(Load(src + A * 2, Compare8u(Load(mask + A * 2), value)))); - sums[3] = _mm512_add_epi32(sums[3], Square(Load(src + A * 3, Compare8u(Load(mask + A * 3), value)))); - } - - template void ConditionalSquareSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, const uint8_t * mask, size_t maskStride, uint8_t value, uint64_t * sum) - { - assert(width < 256 * 256 * F); - if (align) - assert(Aligned(src) && Aligned(srcStride) && Aligned(mask) && Aligned(maskStride)); - - size_t alignedWidth = Simd::AlignLo(width, A); - size_t fullAlignedWidth = Simd::AlignLo(width, QA); - __mmask64 tailMask = TailMask64(width - alignedWidth); - - __m512i _value = _mm512_set1_epi8(value); - size_t blockSize = (256 * 256 * F) / width; - size_t blockCount = height / blockSize + 1; - __m512i _sum = _mm512_setzero_si512(); - for (size_t block = 0; block < blockCount; ++block) - { - __m512i sums[4] = { _mm512_setzero_si512(), _mm512_setzero_si512(), _mm512_setzero_si512(), _mm512_setzero_si512() }; - for (size_t row = block*blockSize, endRow = Simd::Min(row + blockSize, height); row < endRow; ++row) - { - size_t col = 0; - for (; col < fullAlignedWidth; col += QA) - ConditionalSquareSum4(src + col, mask + col, _value, sums); - for (; col < alignedWidth; col += A) - ConditionalSquareSum(src + col, mask + col, _value, sums); - if (col < width) - ConditionalSquareSum(src + col, mask + col, _value, sums, tailMask); - src += srcStride; - mask += maskStride; - } - sums[0] = _mm512_add_epi32(_mm512_add_epi32(sums[0], sums[1]), _mm512_add_epi32(sums[2], sums[3])); - _sum = _mm512_add_epi64(_sum, HorizontalSum32(sums[0])); - } - *sum = ExtractSum(_sum); - } - - template void ConditionalSquareSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, const uint8_t * mask, size_t maskStride, uint8_t value, uint64_t * sum) - { - if (Aligned(src) && Aligned(srcStride) && Aligned(mask) && Aligned(maskStride)) - ConditionalSquareSum(src, srcStride, width, height, mask, maskStride, value, 
sum); - else - ConditionalSquareSum(src, srcStride, width, height, mask, maskStride, value, sum); - } - - void ConditionalSquareSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, const uint8_t * mask, size_t maskStride, uint8_t value, SimdCompareType compareType, uint64_t * sum) - { - switch (compareType) - { - case SimdCompareEqual: - return ConditionalSquareSum(src, srcStride, width, height, mask, maskStride, value, sum); - case SimdCompareNotEqual: - return ConditionalSquareSum(src, srcStride, width, height, mask, maskStride, value, sum); - case SimdCompareGreater: - return ConditionalSquareSum(src, srcStride, width, height, mask, maskStride, value, sum); - case SimdCompareGreaterOrEqual: - return ConditionalSquareSum(src, srcStride, width, height, mask, maskStride, value, sum); - case SimdCompareLesser: - return ConditionalSquareSum(src, srcStride, width, height, mask, maskStride, value, sum); - case SimdCompareLesserOrEqual: - return ConditionalSquareSum(src, srcStride, width, height, mask, maskStride, value, sum); - default: - assert(0); - } - } - - template SIMD_INLINE __m512i SquaredDifference(const uint8_t * src, ptrdiff_t step, __mmask64 mask) - { - const __m512i a = Load(src - step, mask); - const __m512i b = Load(src + step, mask); - const __m512i lo = SubUnpackedU8<0>(a, b); - const __m512i hi = SubUnpackedU8<1>(a, b); - return _mm512_add_epi32(_mm512_madd_epi16(lo, lo), _mm512_madd_epi16(hi, hi)); - } - - template SIMD_INLINE __m512i SquareGradientSum(const uint8_t * src, ptrdiff_t stride, __mmask64 mask) - { - return _mm512_add_epi32(SquaredDifference(src, stride, mask), SquaredDifference(src, 1, mask)); - } - - template void ConditionalSquareGradientSum(const uint8_t * src, ptrdiff_t stride, const uint8_t * pmask, const __m512i & value, __m512i * sums, __mmask64 tail = -1) - { - __mmask64 mask = Compare8u(Load(pmask, tail), value)&tail; - sums[0] = _mm512_add_epi32(sums[0], SquareGradientSum(src, stride, mask)); - } - - template void ConditionalSquareGradientSum4(const uint8_t * src, ptrdiff_t stride, const uint8_t * mask, const __m512i & value, __m512i * sums) - { - sums[0] = _mm512_add_epi32(sums[0], SquareGradientSum(src + A * 0, stride, Compare8u(Load(mask + A * 0), value))); - sums[1] = _mm512_add_epi32(sums[1], SquareGradientSum(src + A * 1, stride, Compare8u(Load(mask + A * 1), value))); - sums[2] = _mm512_add_epi32(sums[2], SquareGradientSum(src + A * 2, stride, Compare8u(Load(mask + A * 2), value))); - sums[3] = _mm512_add_epi32(sums[3], SquareGradientSum(src + A * 3, stride, Compare8u(Load(mask + A * 3), value))); - } - - template void ConditionalSquareGradientSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, const uint8_t * mask, size_t maskStride, uint8_t value, uint64_t * sum) - { - assert(width >= 3 && height >= 3 && width < 256 * 256 * HF); - if (align) - assert(Aligned(src) && Aligned(srcStride) && Aligned(mask) && Aligned(maskStride)); - - src += srcStride; - mask += maskStride; - height -= 2; - - size_t alignedWidth = Simd::AlignLo(width - 1, A); - size_t fullAlignedWidth = alignedWidth ? 
Simd::AlignLo(alignedWidth - A, QA) + A : 0; - __mmask64 noseMask = NoseMask64(A - 1); - __mmask64 tailMask = TailMask64(width - 1 - alignedWidth); - if (width <= A) - noseMask = noseMask&tailMask; - - __m512i _value = _mm512_set1_epi8(value); - size_t blockSize = (256 * 256 * F) / width; - size_t blockCount = height / blockSize + 1; - __m512i _sum = _mm512_setzero_si512(); - for (size_t block = 0; block < blockCount; ++block) - { - __m512i sums[4] = { _mm512_setzero_si512(), _mm512_setzero_si512(), _mm512_setzero_si512(), _mm512_setzero_si512() }; - for (size_t row = block*blockSize, endRow = Simd::Min(row + blockSize, height); row < endRow; ++row) - { - ConditionalSquareGradientSum(src, srcStride, mask, _value, sums, noseMask); - size_t col = A; - for (; col < fullAlignedWidth; col += QA) - ConditionalSquareGradientSum4(src + col, srcStride, mask + col, _value, sums); - for (; col < alignedWidth; col += A) - ConditionalSquareGradientSum(src + col, srcStride, mask + col, _value, sums); - if (col < width) - ConditionalSquareGradientSum(src + col, srcStride, mask + col, _value, sums, tailMask); - src += srcStride; - mask += maskStride; - } - sums[0] = _mm512_add_epi32(_mm512_add_epi32(sums[0], sums[1]), _mm512_add_epi32(sums[2], sums[3])); - _sum = _mm512_add_epi64(_sum, _mm512_add_epi64(_mm512_unpacklo_epi32(sums[0], K_ZERO), _mm512_unpackhi_epi32(sums[0], K_ZERO))); - } - *sum = ExtractSum(_sum); - } - - template void ConditionalSquareGradientSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, const uint8_t * mask, size_t maskStride, uint8_t value, uint64_t * sum) - { - if (Aligned(src) && Aligned(srcStride) && Aligned(mask) && Aligned(maskStride)) - ConditionalSquareGradientSum(src, srcStride, width, height, mask, maskStride, value, sum); - else - ConditionalSquareGradientSum(src, srcStride, width, height, mask, maskStride, value, sum); - } - - void ConditionalSquareGradientSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, const uint8_t * mask, size_t maskStride, uint8_t value, SimdCompareType compareType, uint64_t * sum) - { - switch (compareType) - { - case SimdCompareEqual: - return ConditionalSquareGradientSum(src, srcStride, width, height, mask, maskStride, value, sum); - case SimdCompareNotEqual: - return ConditionalSquareGradientSum(src, srcStride, width, height, mask, maskStride, value, sum); - case SimdCompareGreater: - return ConditionalSquareGradientSum(src, srcStride, width, height, mask, maskStride, value, sum); - case SimdCompareGreaterOrEqual: - return ConditionalSquareGradientSum(src, srcStride, width, height, mask, maskStride, value, sum); - case SimdCompareLesser: - return ConditionalSquareGradientSum(src, srcStride, width, height, mask, maskStride, value, sum); - case SimdCompareLesserOrEqual: - return ConditionalSquareGradientSum(src, srcStride, width, height, mask, maskStride, value, sum); - default: - assert(0); - } - } - - template SIMD_INLINE void ConditionalFill(const uint8_t * src, const __m512i & threshold, const __m512i & value, uint8_t * dst, __mmask64 tail = -1) - { - Store(dst, value, Compare8u(Load(src, tail), threshold)&tail); - } - - template SIMD_INLINE void ConditionalFill4(const uint8_t * src, const __m512i & threshold, const __m512i & value, uint8_t * dst) - { - Store(dst + 0 * A, value, Compare8u(Load(src + 0 * A), threshold)); - Store(dst + 1 * A, value, Compare8u(Load(src + 1 * A), threshold)); - Store(dst + 2 * A, value, Compare8u(Load(src + 2 * A), threshold)); - Store(dst + 3 * A, value, 
Compare8u(Load(src + 3 * A), threshold)); - } - - template - void ConditionalFill(const uint8_t * src, size_t srcStride, size_t width, size_t height, - uint8_t threshold, uint8_t value, uint8_t * dst, size_t dstStride) - { - if (align) - assert(Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride)); - - size_t alignedWidth = Simd::AlignLo(width, A); - size_t fullAlignedWidth = Simd::AlignLo(width, QA); - __mmask64 tailMask = TailMask64(width - alignedWidth); - - __m512i _value = _mm512_set1_epi8(value); - __m512i _threshold = _mm512_set1_epi8(threshold); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < fullAlignedWidth; col += QA) - ConditionalFill4(src + col, _threshold, _value, dst + col); - for (; col < alignedWidth; col += A) - ConditionalFill(src + col, _threshold, _value, dst + col); - if (col < width) - ConditionalFill(src + col, _threshold, _value, dst + col, tailMask); - src += srcStride; - dst += dstStride; - } - } - - template void ConditionalFill(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t threshold, uint8_t value, uint8_t * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride)) - ConditionalFill(src, srcStride, width, height, threshold, value, dst, dstStride); - else - ConditionalFill(src, srcStride, width, height, threshold, value, dst, dstStride); - } - - void ConditionalFill(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t threshold, SimdCompareType compareType, uint8_t value, uint8_t * dst, size_t dstStride) - { - switch (compareType) - { - case SimdCompareEqual: - return ConditionalFill(src, srcStride, width, height, threshold, value, dst, dstStride); - case SimdCompareNotEqual: - return ConditionalFill(src, srcStride, width, height, threshold, value, dst, dstStride); - case SimdCompareGreater: - return ConditionalFill(src, srcStride, width, height, threshold, value, dst, dstStride); - case SimdCompareGreaterOrEqual: - return ConditionalFill(src, srcStride, width, height, threshold, value, dst, dstStride); - case SimdCompareLesser: - return ConditionalFill(src, srcStride, width, height, threshold, value, dst, dstStride); - case SimdCompareLesserOrEqual: - return ConditionalFill(src, srcStride, width, height, threshold, value, dst, dstStride); - default: - assert(0); - } - } - } -#endif// SIMD_AVX512BW_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx512bwDeinterleave.cpp b/src/3rd/Simd/Simd/SimdAvx512bwDeinterleave.cpp deleted file mode 100644 index 90ca4467..00000000 --- a/src/3rd/Simd/Simd/SimdAvx512bwDeinterleave.cpp +++ /dev/null @@ -1,229 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. 
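[Note on the deleted SimdAvx512bwConditional.cpp above: every kernel there follows one pattern — a Compare* template turns a 64-byte comparison into a __mmask64, which is then popcounted (count), used for a masked load (sum), or used for a masked store (fill), with TailMask64 clipping the last partial block of columns. Below is a minimal self-contained sketch of the counting variant, not the library code: the helper name is hypothetical, the predicate is fixed to "equal" where the library is templated over SimdCompareType, and it assumes a compiler targeting AVX-512BW plus BMI2 for _bzhi_u64.]

    #include <immintrin.h>
    #include <stddef.h>
    #include <stdint.h>

    // Hypothetical helper, not part of the Simd library:
    // count the bytes of src[0..n) equal to `value`.
    static size_t CountEqual(const uint8_t* src, size_t n, uint8_t value)
    {
        const __m512i v = _mm512_set1_epi8((char)value);
        size_t count = 0, i = 0;
        for (; i + 64 <= n; i += 64) // full 64-byte blocks
            count += (size_t)_mm_popcnt_u64(
                _mm512_cmpeq_epi8_mask(_mm512_loadu_si512(src + i), v));
        if (i < n) // partial tail under a k-mask (the TailMask64 role)
        {
            __mmask64 tail = _bzhi_u64(~0ull, (unsigned)(n - i));
            __m512i s = _mm512_maskz_loadu_epi8(tail, src + i);
            // AND with tail: masked-off lanes load as zero and would
            // otherwise match spuriously when value == 0.
            count += (size_t)_mm_popcnt_u64(_mm512_cmpeq_epi8_mask(s, v) & tail);
        }
        return count;
    }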
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdConversion.h" - -namespace Simd -{ -#ifdef SIMD_AVX512BW_ENABLE - namespace Avx512bw - { - const __m512i K8_SHUFFLE_DEINTERLEAVE_UV = SIMD_MM512_SETR_EPI8( - 0x0, 0x2, 0x4, 0x6, 0x8, 0xA, 0xC, 0xE, 0x1, 0x3, 0x5, 0x7, 0x9, 0xB, 0xD, 0xF, - 0x0, 0x2, 0x4, 0x6, 0x8, 0xA, 0xC, 0xE, 0x1, 0x3, 0x5, 0x7, 0x9, 0xB, 0xD, 0xF, - 0x0, 0x2, 0x4, 0x6, 0x8, 0xA, 0xC, 0xE, 0x1, 0x3, 0x5, 0x7, 0x9, 0xB, 0xD, 0xF, - 0x0, 0x2, 0x4, 0x6, 0x8, 0xA, 0xC, 0xE, 0x1, 0x3, 0x5, 0x7, 0x9, 0xB, 0xD, 0xF); - - const __m512i K64_PERMUTE_UV_U = SIMD_MM512_SETR_EPI64(0x0, 0x2, 0x4, 0x6, 0x8, 0xA, 0xC, 0xE); - const __m512i K64_PERMUTE_UV_V = SIMD_MM512_SETR_EPI64(0x1, 0x3, 0x5, 0x7, 0x9, 0xB, 0xD, 0xF); - - template SIMD_INLINE void DeinterleaveUv(const uint8_t * uv, uint8_t * u, uint8_t * v, const __mmask64 * tailMasks) - { - const __m512i uv0 = Load(uv + 0, tailMasks[0]); - const __m512i uv1 = Load(uv + A, tailMasks[1]); - const __m512i shuffledUV0 = _mm512_shuffle_epi8(uv0, K8_SHUFFLE_DEINTERLEAVE_UV); - const __m512i shuffledUV1 = _mm512_shuffle_epi8(uv1, K8_SHUFFLE_DEINTERLEAVE_UV); - Store(u, _mm512_permutex2var_epi64(shuffledUV0, K64_PERMUTE_UV_U, shuffledUV1), tailMasks[2]); - Store(v, _mm512_permutex2var_epi64(shuffledUV0, K64_PERMUTE_UV_V, shuffledUV1), tailMasks[2]); - } - - template SIMD_INLINE void DeinterleaveUv2(const uint8_t * uv, uint8_t * u, uint8_t * v) - { - const __m512i uv0 = _mm512_shuffle_epi8(Load(uv + 0 * A), K8_SHUFFLE_DEINTERLEAVE_UV); - const __m512i uv1 = _mm512_shuffle_epi8(Load(uv + 1 * A), K8_SHUFFLE_DEINTERLEAVE_UV); - Store(u + 0, _mm512_permutex2var_epi64(uv0, K64_PERMUTE_UV_U, uv1)); - Store(v + 0, _mm512_permutex2var_epi64(uv0, K64_PERMUTE_UV_V, uv1)); - const __m512i uv2 = _mm512_shuffle_epi8(Load(uv + 2 * A), K8_SHUFFLE_DEINTERLEAVE_UV); - const __m512i uv3 = _mm512_shuffle_epi8(Load(uv + 3 * A), K8_SHUFFLE_DEINTERLEAVE_UV); - Store(u + A, _mm512_permutex2var_epi64(uv2, K64_PERMUTE_UV_U, uv3)); - Store(v + A, _mm512_permutex2var_epi64(uv2, K64_PERMUTE_UV_V, uv3)); - } - - template void DeinterleaveUv(const uint8_t * uv, size_t uvStride, size_t width, size_t height, - uint8_t * u, size_t uStride, uint8_t * v, size_t vStride) - { - if (align) - assert(Aligned(uv) && Aligned(uvStride) && Aligned(u) && Aligned(uStride) && Aligned(v) && Aligned(vStride)); - - size_t alignedWidth = AlignLo(width, A); - size_t fullAlignedWidth = AlignLo(width, DA); - __mmask64 tailMasks[3]; - for (size_t c = 0; c < 2; ++c) - tailMasks[c] = TailMask64((width - alignedWidth) * 2 - A*c); - tailMasks[2] = TailMask64(width - alignedWidth); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < fullAlignedWidth; col += DA) - DeinterleaveUv2(uv + col * 2, u + col, v + col); - for (; col < alignedWidth; col += A) - DeinterleaveUv(uv + col * 2, u + col, v + col, tailMasks); - if (col < width) - DeinterleaveUv(uv + col * 2, u + col, v + col, tailMasks); - uv += uvStride; - u += uStride; - v += vStride; - } - } - - void 
DeinterleaveUv(const uint8_t * uv, size_t uvStride, size_t width, size_t height, - uint8_t * u, size_t uStride, uint8_t * v, size_t vStride) - { - if (Aligned(uv) && Aligned(uvStride) && Aligned(u) && Aligned(uStride) && Aligned(v) && Aligned(vStride)) - DeinterleaveUv(uv, uvStride, width, height, u, uStride, v, vStride); - else - DeinterleaveUv(uv, uvStride, width, height, u, uStride, v, vStride); - } - - const __m512i K8_SHUFFLE_DEINTERLEAVE_BGR = SIMD_MM512_SETR_EPI8( - 0x0, 0x3, 0x6, 0x9, 0x1, 0x4, 0x7, 0xA, 0x2, 0x5, 0x8, 0xB, -1, -1, -1, -1, - 0x0, 0x3, 0x6, 0x9, 0x1, 0x4, 0x7, 0xA, 0x2, 0x5, 0x8, 0xB, -1, -1, -1, -1, - 0x0, 0x3, 0x6, 0x9, 0x1, 0x4, 0x7, 0xA, 0x2, 0x5, 0x8, 0xB, -1, -1, -1, -1, - 0x0, 0x3, 0x6, 0x9, 0x1, 0x4, 0x7, 0xA, 0x2, 0x5, 0x8, 0xB, -1, -1, -1, -1); - - const __m512i K32_PERMUTE_BGR_B0 = SIMD_MM512_SETR_EPI32(0x00, 0x04, 0x08, 0x0C, 0x10, 0x14, 0x18, 0x1C, -1, -1, -1, -1, -1, -1, -1, -1); - const __m512i K32_PERMUTE_BGR_B1 = SIMD_MM512_SETR_EPI32(-1, -1, -1, -1, -1, -1, -1, -1, 0x00, 0x04, 0x08, 0x0C, 0x10, 0x14, 0x18, 0x1C); - const __m512i K32_PERMUTE_BGR_G0 = SIMD_MM512_SETR_EPI32(0x01, 0x05, 0x09, 0x0D, 0x11, 0x15, 0x19, 0x1D, -1, -1, -1, -1, -1, -1, -1, -1); - const __m512i K32_PERMUTE_BGR_G1 = SIMD_MM512_SETR_EPI32(-1, -1, -1, -1, -1, -1, -1, -1, 0x01, 0x05, 0x09, 0x0D, 0x11, 0x15, 0x19, 0x1D); - const __m512i K32_PERMUTE_BGR_R0 = SIMD_MM512_SETR_EPI32(0x02, 0x06, 0x0A, 0x0E, 0x12, 0x16, 0x1A, 0x1E, -1, -1, -1, -1, -1, -1, -1, -1); - const __m512i K32_PERMUTE_BGR_R1 = SIMD_MM512_SETR_EPI32(-1, -1, -1, -1, -1, -1, -1, -1, 0x02, 0x06, 0x0A, 0x0E, 0x12, 0x16, 0x1A, 0x1E); - - template SIMD_INLINE void DeinterleaveBgr(const uint8_t * bgr, uint8_t * b, uint8_t * g, uint8_t * r, const __mmask64 * tailMasks) - { - const __m512i bgr0 = Load(bgr + 0 * A, tailMasks[0]); - const __m512i bgr1 = Load(bgr + 1 * A, tailMasks[1]); - const __m512i bgr2 = Load(bgr + 2 * A, tailMasks[2]); - - const __m512i sp0 = _mm512_shuffle_epi8(_mm512_permutexvar_epi32(K32_PERMUTE_BGR_TO_BGRA_0, bgr0), K8_SHUFFLE_DEINTERLEAVE_BGR); - const __m512i sp1 = _mm512_shuffle_epi8(_mm512_permutex2var_epi32(bgr0, K32_PERMUTE_BGR_TO_BGRA_1, bgr1), K8_SHUFFLE_DEINTERLEAVE_BGR); - const __m512i sp2 = _mm512_shuffle_epi8(_mm512_permutex2var_epi32(bgr1, K32_PERMUTE_BGR_TO_BGRA_2, bgr2), K8_SHUFFLE_DEINTERLEAVE_BGR); - const __m512i sp3 = _mm512_shuffle_epi8(_mm512_permutexvar_epi32(K32_PERMUTE_BGR_TO_BGRA_3, bgr2), K8_SHUFFLE_DEINTERLEAVE_BGR); - - Store(b, _mm512_or_si512(_mm512_permutex2var_epi32(sp0, K32_PERMUTE_BGR_B0, sp1), _mm512_permutex2var_epi32(sp2, K32_PERMUTE_BGR_B1, sp3)), tailMasks[3]); - Store(g, _mm512_or_si512(_mm512_permutex2var_epi32(sp0, K32_PERMUTE_BGR_G0, sp1), _mm512_permutex2var_epi32(sp2, K32_PERMUTE_BGR_G1, sp3)), tailMasks[3]); - Store(r, _mm512_or_si512(_mm512_permutex2var_epi32(sp0, K32_PERMUTE_BGR_R0, sp1), _mm512_permutex2var_epi32(sp2, K32_PERMUTE_BGR_R1, sp3)), tailMasks[3]); - } - - template void DeinterleaveBgr(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, - uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride) - { - if (align) - assert(Aligned(bgr) && Aligned(bgrStride) && Aligned(b) && Aligned(bStride) && Aligned(g) && Aligned(gStride) && Aligned(r) && Aligned(rStride)); - - size_t alignedWidth = AlignLo(width, A); - __mmask64 tailMasks[4]; - for (size_t c = 0; c < 3; ++c) - tailMasks[c] = TailMask64((width - alignedWidth) * 3 - A*c); - tailMasks[3] = TailMask64(width - alignedWidth); - for (size_t row = 0; 
row < height; ++row) - { - size_t col = 0; - for (; col < alignedWidth; col += A) - DeinterleaveBgr(bgr + col * 3, b + col, g + col, r + col, tailMasks); - if (col < width) - DeinterleaveBgr(bgr + col * 3, b + col, g + col, r + col, tailMasks); - bgr += bgrStride; - b += bStride; - g += gStride; - r += rStride; - } - } - - void DeinterleaveBgr(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, - uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride) - { - if (Aligned(bgr) && Aligned(bgrStride) && Aligned(b) && Aligned(bStride) && Aligned(g) && Aligned(gStride) && Aligned(r) && Aligned(rStride)) - DeinterleaveBgr(bgr, bgrStride, width, height, b, bStride, g, gStride, r, rStride); - else - DeinterleaveBgr(bgr, bgrStride, width, height, b, bStride, g, gStride, r, rStride); - } - - const __m512i K8_SHUFFLE_BGRA = SIMD_MM512_SETR_EPI8( - 0x0, 0x4, 0x8, 0xC, 0x1, 0x5, 0x9, 0xD, 0x2, 0x6, 0xA, 0xE, 0x3, 0x7, 0xB, 0xF, - 0x0, 0x4, 0x8, 0xC, 0x1, 0x5, 0x9, 0xD, 0x2, 0x6, 0xA, 0xE, 0x3, 0x7, 0xB, 0xF, - 0x0, 0x4, 0x8, 0xC, 0x1, 0x5, 0x9, 0xD, 0x2, 0x6, 0xA, 0xE, 0x3, 0x7, 0xB, 0xF, - 0x0, 0x4, 0x8, 0xC, 0x1, 0x5, 0x9, 0xD, 0x2, 0x6, 0xA, 0xE, 0x3, 0x7, 0xB, 0xF); - - const __m512i K32_PERMUTE_BGRA_BG = SIMD_MM512_SETR_EPI32(0x00, 0x04, 0x08, 0x0C, 0x10, 0x14, 0x18, 0x1C, 0x01, 0x05, 0x09, 0x0D, 0x11, 0x15, 0x19, 0x1D); - const __m512i K32_PERMUTE_BGRA_RA = SIMD_MM512_SETR_EPI32(0x02, 0x06, 0x0A, 0x0E, 0x12, 0x16, 0x1A, 0x1E, 0x03, 0x07, 0x0B, 0x0F, 0x13, 0x17, 0x1B, 0x1F); - - template SIMD_INLINE void DeinterleaveBgra(const uint8_t * bgra, uint8_t * b, uint8_t * g, uint8_t * r, uint8_t * a, const __mmask64 * tailMasks) - { - const __m512i bgra0 = _mm512_shuffle_epi8((Load(bgra + 0 * A, tailMasks[0])), K8_SHUFFLE_BGRA); - const __m512i bgra1 = _mm512_shuffle_epi8((Load(bgra + 1 * A, tailMasks[1])), K8_SHUFFLE_BGRA); - const __m512i bgra2 = _mm512_shuffle_epi8((Load(bgra + 2 * A, tailMasks[2])), K8_SHUFFLE_BGRA); - const __m512i bgra3 = _mm512_shuffle_epi8((Load(bgra + 3 * A, tailMasks[3])), K8_SHUFFLE_BGRA); - - const __m512i bg0 = _mm512_permutex2var_epi32(bgra0, K32_PERMUTE_BGRA_BG, bgra1); - const __m512i ra0 = _mm512_permutex2var_epi32(bgra0, K32_PERMUTE_BGRA_RA, bgra1); - const __m512i bg1 = _mm512_permutex2var_epi32(bgra2, K32_PERMUTE_BGRA_BG, bgra3); - const __m512i ra1 = _mm512_permutex2var_epi32(bgra2, K32_PERMUTE_BGRA_RA, bgra3); - - Store(b, _mm512_shuffle_i64x2(bg0, bg1, 0x44), tailMasks[4]); - Store(g, _mm512_shuffle_i64x2(bg0, bg1, 0xEE), tailMasks[4]); - Store(r, _mm512_shuffle_i64x2(ra0, ra1, 0x44), tailMasks[4]); - Store(a, _mm512_shuffle_i64x2(ra0, ra1, 0xEE), tailMasks[4]); - } - - template void DeinterleaveBgra(const uint8_t * bgra, size_t bgraStride, size_t width, size_t height, - uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride, uint8_t * a, size_t aStride) - { - if (align) - { - assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(b) && Aligned(bStride)); - assert(Aligned(g) && Aligned(gStride) && Aligned(r) && Aligned(rStride) && Aligned(a) && Aligned(aStride)); - } - - size_t alignedWidth = AlignLo(width, A); - __mmask64 tailMasks[5]; - for (size_t c = 0; c < 4; ++c) - tailMasks[c] = TailMask64((width - alignedWidth) * 4 - A*c); - tailMasks[4] = TailMask64(width - alignedWidth); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < alignedWidth; col += A) - DeinterleaveBgra(bgra + col * 4, b + col, g + col, r + col, a + col, tailMasks); - if (col < width) - 
DeinterleaveBgra(bgra + col * 4, b + col, g + col, r + col, a + col, tailMasks); - bgra += bgraStride; - b += bStride; - g += gStride; - r += rStride; - a += aStride; - } - } - - void DeinterleaveBgra(const uint8_t * bgra, size_t bgraStride, size_t width, size_t height, - uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride, uint8_t * a, size_t aStride) - { - if (Aligned(bgra) && Aligned(bgraStride) && Aligned(b) && Aligned(bStride) && Aligned(g) && Aligned(gStride) && Aligned(r) && Aligned(rStride) && Aligned(a) && Aligned(aStride)) - DeinterleaveBgra(bgra, bgraStride, width, height, b, bStride, g, gStride, r, rStride, a, aStride); - else - DeinterleaveBgra(bgra, bgraStride, width, height, b, bStride, g, gStride, r, rStride, a, aStride); - } - } -#endif// SIMD_AVX512BW_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx512bwDetection.cpp b/src/3rd/Simd/Simd/SimdAvx512bwDetection.cpp deleted file mode 100644 index 46197f94..00000000 --- a/src/3rd/Simd/Simd/SimdAvx512bwDetection.cpp +++ /dev/null @@ -1,745 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
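[Note on the deleted SimdAvx512bwDeinterleave.cpp above: DeinterleaveUv splits interleaved UV pixels in two steps — _mm512_shuffle_epi8 with K8_SHUFFLE_DEINTERLEAVE_UV groups the U bytes and V bytes inside each 128-bit lane, then _mm512_permutex2var_epi64 with K64_PERMUTE_UV_U/V gathers the matching qword halves of two such registers into one all-U and one all-V register. A minimal sketch of that two-step idea, not the library code: hypothetical name, unaligned loads, no tail handling, and it assumes exactly 128 input bytes.]

    #include <immintrin.h>
    #include <stdint.h>

    // Hypothetical helper, not part of the Simd library: split 64
    // interleaved UV pairs (128 bytes) into 64 U bytes and 64 V bytes.
    static void DeinterleaveUv64(const uint8_t* uv, uint8_t* u, uint8_t* v)
    {
        // Step 1: inside every 16-byte lane put the 8 U bytes first, then the 8 V bytes.
        const __m512i SHUF = _mm512_broadcast_i32x4(
            _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15));
        // Step 2: even qwords hold U halves, odd qwords hold V halves;
        // indices >= 8 select from the second source register.
        const __m512i PU = _mm512_setr_epi64(0, 2, 4, 6, 8, 10, 12, 14);
        const __m512i PV = _mm512_setr_epi64(1, 3, 5, 7, 9, 11, 13, 15);
        __m512i a = _mm512_shuffle_epi8(_mm512_loadu_si512(uv), SHUF);
        __m512i b = _mm512_shuffle_epi8(_mm512_loadu_si512(uv + 64), SHUF);
        _mm512_storeu_si512(u, _mm512_permutex2var_epi64(a, PU, b));
        _mm512_storeu_si512(v, _mm512_permutex2var_epi64(a, PV, b));
    }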
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdExtract.h" -#include "Simd/SimdDetection.h" -#include "Simd/SimdBase.h" - -namespace Simd -{ -#ifdef SIMD_AVX512BW_ENABLE - namespace Avx512bw - { - using namespace Simd::Detection; - - template SIMD_INLINE void UnpackMask16i(const uint8_t * src, uint16_t * dst, const __m512i & mask, __mmask64 tail = -1) - { - __m512i src0 = _mm512_permutexvar_epi64(K64_PERMUTE_FOR_UNPACK, _mm512_and_si512(mask, (Load(src, tail)))); - Store(dst + 0 * HA, UnpackU8<0>(src0), __mmask32(tail >> 00)); - Store(dst + 1 * HA, UnpackU8<1>(src0), __mmask32(tail >> 32)); - } - - SIMD_INLINE void UnpackMask16i(const uint8_t * src, size_t size, uint16_t * dst, const __m512i & mask) - { - size_t alignedSize = Simd::AlignLo(size, A); - __mmask64 tailMask = TailMask64(size - alignedSize); - size_t i = 0; - for (; i < alignedSize; i += A) - UnpackMask16i(src + i, dst + i, mask); - if (i < size) - UnpackMask16i(src + i, dst + i, mask, tailMask); - } - - template SIMD_INLINE void UnpackMask32i(const uint8_t * src, uint32_t * dst, const __m512i & mask, __mmask64 tail = -1) - { - __m512i _src = _mm512_permutexvar_epi32(K32_PERMUTE_FOR_TWO_UNPACK, _mm512_and_si512(mask, (Load(src, tail)))); - __m512i src0 = UnpackU8<0>(_src); - Store(dst + 0 * F, UnpackU8<0>(src0), __mmask16(tail >> 00)); - Store(dst + 1 * F, UnpackU8<1>(src0), __mmask16(tail >> 16)); - __m512i src1 = UnpackU8<1>(_src); - Store(dst + 2 * F, UnpackU8<0>(src1), __mmask16(tail >> 32)); - Store(dst + 3 * F, UnpackU8<1>(src1), __mmask16(tail >> 48)); - } - - SIMD_INLINE void UnpackMask32i(const uint8_t * src, size_t size, uint32_t * dst, const __m512i & mask) - { - size_t alignedSize = Simd::AlignLo(size, A); - __mmask64 tailMask = TailMask64(size - alignedSize); - size_t i = 0; - for (; i < alignedSize; i += A) - UnpackMask32i(src + i, dst + i, mask); - if (i < size) - UnpackMask32i(src + i, dst + i, mask, tailMask); - } - - template SIMD_INLINE void PackResult16i(const uint16_t * src, uint8_t * dst, __mmask64 tail = -1) - { - __m512i src0 = Load(src + 00, __mmask32(tail >> 00)); - __m512i src1 = Load(src + HA, __mmask32(tail >> 32)); - Store(dst, _mm512_permutexvar_epi64(K64_PERMUTE_FOR_PACK, _mm512_packus_epi16(src0, src1)), tail); - } - - SIMD_INLINE void PackResult16i(const uint16_t * src, size_t size, uint8_t * dst) - { - size_t alignedSize = Simd::AlignLo(size, A); - __mmask64 tailMask = TailMask64(size - alignedSize); - size_t i = 0; - for (; i < alignedSize; i += A) - PackResult16i(src + i, dst + i); - if (i < size) - PackResult16i(src + i, dst + i, tailMask); - } - - template SIMD_INLINE void PackResult32i(const uint32_t * src, uint8_t * dst, __mmask64 tail = -1) - { - __m512i src0 = Load(src + 0 * F, __mmask16(tail >> 00)); - __m512i src1 = Load(src + 1 * F, __mmask16(tail >> 16)); - __m512i src2 = Load(src + 2 * F, __mmask16(tail >> 32)); - __m512i src3 = Load(src + 3 * F, __mmask16(tail >> 48)); - Store(dst, _mm512_permutexvar_epi32(K32_PERMUTE_FOR_TWO_UNPACK, _mm512_packus_epi16(_mm512_packs_epi32(src0, src1), _mm512_packs_epi32(src2, src3))), tail); - } - - SIMD_INLINE void PackResult32i(const uint32_t * src, size_t size, uint8_t * dst) - { - size_t alignedSize = Simd::AlignLo(size, A); - __mmask64 tailMask = TailMask64(size - alignedSize); - size_t i = 0; - for (; i < alignedSize; i += A) - PackResult32i(src + i, dst + i); - if (i < size) - PackResult32i(src + i, dst + i, tailMask); - } - - SIMD_INLINE int ResultCount(__m512i result) - { - return 
_mm_popcnt_u32(_mm512_test_epi16_mask(result, result)); - } - - SIMD_INLINE __m512 ValidSqrt(__m512 value) - { - __mmask16 mask = _mm512_cmp_ps_mask(value, _mm512_set1_ps(0.0f), _CMP_GT_OQ); - __m512 valid = _mm512_mask_blend_ps(mask, _mm512_set1_ps(1.0f), value); -#if 0 - __m512 rsqrt = _mm512_rsqrt14_ps(valid); - return _mm512_mul_ps(rsqrt, value); -#else - return _mm512_sqrt_ps(valid); -#endif - } - - template SIMD_INLINE __m512i Sum32ip(uint32_t * const ptr[4], size_t offset, __mmask16 tail = -1) - { - __m512i s0 = Load(ptr[0] + offset, tail); - __m512i s1 = Load(ptr[1] + offset, tail); - __m512i s2 = Load(ptr[2] + offset, tail); - __m512i s3 = Load(ptr[3] + offset, tail); - return _mm512_sub_epi32(_mm512_sub_epi32(s0, s1), _mm512_sub_epi32(s2, s3)); - } - - const __m512i K32_PERMUTE_EVEN = SIMD_MM512_SETR_EPI32(0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E, 0x10, 0x12, 0x14, 0x16, 0x18, 0x1A, 0x1C, 0x1E); - - template SIMD_INLINE __m512i Sum32ii(uint32_t * const ptr[4], size_t offset, const __mmask16 * tails) - { - __m512i lo = Sum32ip(ptr, offset + 0, tails[0]); - __m512i hi = Sum32ip(ptr, offset + F, tails[1]); - return _mm512_permutex2var_epi32(lo, K32_PERMUTE_EVEN, hi); - } - - template SIMD_INLINE __m512 Norm32fp(const HidHaarCascade & hid, size_t offset, __mmask16 tail = -1) - { - __m512 area = _mm512_set1_ps(hid.windowArea); - __m512 sum = _mm512_cvtepi32_ps(Sum32ip(hid.p, offset, tail)); - __m512 sqsum = _mm512_cvtepi32_ps(Sum32ip(hid.pq, offset, tail)); - return ValidSqrt(_mm512_sub_ps(_mm512_mul_ps(sqsum, area), _mm512_mul_ps(sum, sum))); - } - - template SIMD_INLINE __m512 Norm32fi(const HidHaarCascade & hid, size_t offset, const __mmask16 * tails) - { - __m512 area = _mm512_set1_ps(hid.windowArea); - __m512 sum = _mm512_cvtepi32_ps(Sum32ii(hid.p, offset, tails)); - __m512 sqsum = _mm512_cvtepi32_ps(Sum32ii(hid.pq, offset, tails)); - return ValidSqrt(_mm512_sub_ps(_mm512_mul_ps(sqsum, area), _mm512_mul_ps(sum, sum))); - } - - template SIMD_INLINE __m512 WeightedSum32f(const WeightedRect & rect, size_t offset, __mmask16 tail = -1) - { - __m512i s0 = Load(rect.p0 + offset, tail); - __m512i s1 = Load(rect.p1 + offset, tail); - __m512i s2 = Load(rect.p2 + offset, tail); - __m512i s3 = Load(rect.p3 + offset, tail); - __m512i sum = _mm512_sub_epi32(_mm512_sub_epi32(s0, s1), _mm512_sub_epi32(s2, s3)); - return _mm512_mul_ps(_mm512_cvtepi32_ps(sum), _mm512_set1_ps(rect.weight)); - } - - SIMD_INLINE void StageSum32f(const float * leaves, float threshold, const __m512 & sum, const __m512 & norm, __m512 & stageSum) - { - __mmask16 mask = _mm512_cmp_ps_mask(sum, _mm512_mul_ps(_mm512_set1_ps(threshold), norm), _CMP_GE_OQ); - stageSum = _mm512_add_ps(stageSum, _mm512_mask_blend_ps(mask, _mm512_set1_ps(leaves[0]), _mm512_set1_ps(leaves[1]))); - } - - template __mmask16 Detect32f(const HidHaarCascade & hid, size_t offset, const __m512 & norm, __mmask16 result) - { - typedef HidHaarCascade Hid; - const float * leaves = hid.leaves.data(); - const Hid::Node * node = hid.nodes.data(); - const Hid::Stage * stages = hid.stages.data(); - for (int i = 0, n = (int)hid.stages.size(); i < n; ++i) - { - const Hid::Stage & stage = stages[i]; - if (stage.canSkip) - continue; - const Hid::Node * end = node + stage.ntrees; - __m512 stageSum = _mm512_setzero_ps(); - if (stage.hasThree) - { - for (; node < end; ++node, leaves += 2) - { - const Hid::Feature & feature = hid.features[node->featureIdx]; - __m512 sum = _mm512_add_ps( - WeightedSum32f(feature.rect[0], offset, result), - 
WeightedSum32f(feature.rect[1], offset, result)); - if (feature.rect[2].p0) - sum = _mm512_add_ps(sum, WeightedSum32f(feature.rect[2], offset, result)); - StageSum32f(leaves, node->threshold, sum, norm, stageSum); - } - } - else - { - for (; node < end; ++node, leaves += 2) - { - const Hid::Feature & feature = hid.features[node->featureIdx]; - __m512 sum = _mm512_add_ps(WeightedSum32f(feature.rect[0], offset, result), - WeightedSum32f(feature.rect[1], offset, result)); - StageSum32f(leaves, node->threshold, sum, norm, stageSum); - } - } - result = result & _mm512_cmp_ps_mask(stageSum, _mm512_set1_ps(stage.threshold), _CMP_GE_OQ); - if (!result) - return result; - int resultCount = _mm_popcnt_u32(result); - if (resultCount == 1) - { - int j = _tzcnt_u32(result); - return Base::Detect32f(hid, offset + j, i + 1, Avx512f::Extract(norm, j)) > 0 ? result : __mmask16(0); - } - } - return result; - } - - void DetectionHaarDetect32fp(const HidHaarCascade & hid, const Image & mask, const Rect & rect, Image & dst) - { - size_t width = rect.Width(); - size_t alignedWidth = Simd::AlignLo(width, F); - __mmask16 tailMask = TailMask16(width - alignedWidth); - Buffer buffer(width); - for (ptrdiff_t row = rect.top; row < rect.bottom; row += 1) - { - size_t col = 0; - size_t p_offset = row * hid.sum.stride / sizeof(uint32_t) + rect.left; - size_t pq_offset = row * hid.sqsum.stride / sizeof(uint32_t) + rect.left; - - UnpackMask32i(mask.data + row*mask.stride + rect.left, width, buffer.m, K8_01); - memset(buffer.d, 0, width * sizeof(uint32_t)); - for (; col < alignedWidth; col += F) - { - __mmask16 result = _mm512_cmpneq_epi32_mask(Load(buffer.m + col), K_ZERO); - if (result) - { - __m512 norm = Norm32fp(hid, pq_offset + col); - result = Detect32f(hid, p_offset + col, norm, result); - Store(buffer.d + col, _mm512_maskz_set1_epi32(result, 1)); - } - } - if (col < width) - { - __mmask16 result = _mm512_cmpneq_epi32_mask((Load(buffer.m + col, tailMask)), K_ZERO); - if (result) - { - __m512 norm = Norm32fp(hid, pq_offset + col, tailMask); - result = Detect32f(hid, p_offset + col, norm, result); - Store(buffer.d + col, _mm512_maskz_set1_epi32(result, 1), tailMask); - } - } - PackResult32i(buffer.d, width, dst.data + row*dst.stride + rect.left); - } - } - - void DetectionHaarDetect32fp(const void * _hid, const uint8_t * mask, size_t maskStride, - ptrdiff_t left, ptrdiff_t top, ptrdiff_t right, ptrdiff_t bottom, uint8_t * dst, size_t dstStride) - { - const HidHaarCascade & hid = *(HidHaarCascade*)_hid; - return DetectionHaarDetect32fp(hid, - Image(hid.sum.width - 1, hid.sum.height - 1, maskStride, Image::Gray8, (uint8_t*)mask), - Rect(left, top, right, bottom), - Image(hid.sum.width - 1, hid.sum.height - 1, dstStride, Image::Gray8, dst).Ref()); - } - - void DetectionHaarDetect32fi(const HidHaarCascade & hid, const Image & mask, const Rect & rect, Image & dst) - { - const size_t step = 2; - size_t width = rect.Width(); - size_t alignedWidth = Simd::AlignLo(width, HA); - size_t evenWidth = Simd::AlignLo(width, 2); - __mmask16 tailMasks[3]; - for (size_t c = 0; c < 2; ++c) - tailMasks[c] = TailMask16(width - alignedWidth - F*c); - tailMasks[2] = TailMask16((width - alignedWidth) / 2); - Buffer buffer(evenWidth); - for (ptrdiff_t row = rect.top; row < rect.bottom; row += step) - { - size_t col = 0; - size_t p_offset = row * hid.isum.stride / sizeof(uint32_t) + rect.left / 2; - size_t pq_offset = row * hid.sqsum.stride / sizeof(uint32_t) + rect.left; - - UnpackMask16i(mask.data + row*mask.stride + rect.left, evenWidth, 
buffer.m, K16_0001); - memset(buffer.d, 0, evenWidth * sizeof(uint16_t)); - for (; col < alignedWidth; col += HA) - { - __mmask16 result = _mm512_cmpneq_epi32_mask(_mm512_and_si512(Load(buffer.m + col), K32_0000FFFF), K_ZERO); - if (result) - { - __m512 norm = Norm32fi(hid, pq_offset + col, tailMasks); - result = Detect32f(hid, p_offset + col / 2, norm, result); - Store(buffer.d + col, _mm512_maskz_set1_epi32(result, 1)); - } - } - if (col < evenWidth) - { - __mmask16 result = _mm512_cmpneq_epi32_mask(_mm512_and_si512((Load((uint32_t*)buffer.m + col / 2, tailMasks[2])), K32_0000FFFF), K_ZERO); - if (result) - { - __m512 norm = Norm32fi(hid, pq_offset + col, tailMasks); - result = Detect32f(hid, p_offset + col / 2, norm, result); - Store((uint32_t*)buffer.d + col / 2, _mm512_maskz_set1_epi32(result, 1), tailMasks[2]); - } - col += HA; - } - for (; col < width; col += step) - { - if (mask.At(col + rect.left, row) == 0) - continue; - float norm = Base::Norm32f(hid, pq_offset + col); - if (Base::Detect32f(hid, p_offset + col / 2, 0, norm) > 0) - dst.At(col + rect.left, row) = 1; - } - PackResult16i(buffer.d, evenWidth, dst.data + row*dst.stride + rect.left); - } - } - - void DetectionHaarDetect32fi(const void * _hid, const uint8_t * mask, size_t maskStride, - ptrdiff_t left, ptrdiff_t top, ptrdiff_t right, ptrdiff_t bottom, uint8_t * dst, size_t dstStride) - { - const HidHaarCascade & hid = *(HidHaarCascade*)_hid; - return DetectionHaarDetect32fi(hid, - Image(hid.sum.width - 1, hid.sum.height - 1, maskStride, Image::Gray8, (uint8_t*)mask), - Rect(left, top, right, bottom), - Image(hid.sum.width - 1, hid.sum.height - 1, dstStride, Image::Gray8, dst).Ref()); - } - - const __m512i K8_SHUFFLE_BITS = SIMD_MM512_SETR_EPI8( - 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); - - SIMD_INLINE __m512i IntegralSum32i(const __m512i & s0, const __m512i & s1, const __m512i & s2, const __m512i & s3) - { - return _mm512_sub_epi32(_mm512_sub_epi32(s0, s1), _mm512_sub_epi32(s2, s3)); - } - - template SIMD_INLINE void Load(__m512i a[16], const HidLbpFeature & feature, ptrdiff_t offset, __mmask16 tail = -1) - { - a[i] = Load(feature.p[i] + offset, tail); - } - - template SIMD_INLINE void Calculate(const HidLbpFeature & feature, ptrdiff_t offset, __mmask16 & index, __m512i & shuffle, __m512i & mask, __mmask16 tail = -1) - { - __m512i a[16]; - Load<5, masked>(a, feature, offset, tail); - Load<6, masked>(a, feature, offset, tail); - Load<9, masked>(a, feature, offset, tail); - Load<10, masked>(a, feature, offset, tail); - __m512i central = IntegralSum32i(a[5], a[6], a[9], a[10]); - - Load<0, masked>(a, feature, offset, tail); - Load<1, masked>(a, feature, offset, tail); - Load<4, masked>(a, feature, offset, tail); - index = _mm512_cmpge_epu32_mask(IntegralSum32i(a[0], a[1], a[4], a[5]), central); - - shuffle = K32_FFFFFF00; - Load<2, masked>(a, feature, offset, tail); - shuffle = _mm512_or_si512(shuffle, _mm512_maskz_set1_epi32(_mm512_cmpge_epu32_mask(IntegralSum32i(a[1], a[2], a[5], a[6]), central), 8)); - Load<3, masked>(a, feature, offset, tail); - Load<7, masked>(a, feature, offset, tail); - shuffle = _mm512_or_si512(shuffle, 
_mm512_maskz_set1_epi32(_mm512_cmpge_epu32_mask(IntegralSum32i(a[2], a[3], a[6], a[7]), central), 4)); - Load<11, masked>(a, feature, offset, tail); - shuffle = _mm512_or_si512(shuffle, _mm512_maskz_set1_epi32(_mm512_cmpge_epu32_mask(IntegralSum32i(a[6], a[7], a[10], a[11]), central), 2)); - Load<14, masked>(a, feature, offset, tail); - Load<15, masked>(a, feature, offset, tail); - shuffle = _mm512_or_si512(shuffle, _mm512_maskz_set1_epi32(_mm512_cmpge_epu32_mask(IntegralSum32i(a[10], a[11], a[14], a[15]), central), 1)); - - mask = K32_FFFFFF00; - Load<13, masked>(a, feature, offset, tail); - mask = _mm512_or_si512(mask, _mm512_maskz_set1_epi32(_mm512_cmpge_epu32_mask(IntegralSum32i(a[9], a[10], a[13], a[14]), central), 4)); - Load<12, masked>(a, feature, offset, tail); - Load<8, masked>(a, feature, offset, tail); - mask = _mm512_or_si512(mask, _mm512_maskz_set1_epi32(_mm512_cmpge_epu32_mask(IntegralSum32i(a[8], a[9], a[12], a[13]), central), 2)); - mask = _mm512_or_si512(mask, _mm512_maskz_set1_epi32(_mm512_cmpge_epu32_mask(IntegralSum32i(a[4], a[5], a[8], a[9]), central), 1)); - mask = _mm512_shuffle_epi8(K8_SHUFFLE_BITS, mask); - } - - template SIMD_INLINE __mmask16 LeafMask(const HidLbpFeature & feature, ptrdiff_t offset, const int * subset, __mmask16 tail = -1) - { - __mmask16 index; - __m512i shuffle, mask; - Calculate(feature, offset, index, shuffle, mask, tail); - - __m256i _subset = _mm256_loadu_si256((__m256i*)subset); - __m512i subset0 = _mm512_broadcast_i32x4(_mm256_extracti128_si256(_subset, 0)); - __m512i subset1 = _mm512_broadcast_i32x4(_mm256_extracti128_si256(_subset, 1)); - - __m512i value0 = _mm512_and_si512(_mm512_shuffle_epi8(subset0, shuffle), mask); - __m512i value1 = _mm512_and_si512(_mm512_shuffle_epi8(subset1, shuffle), mask); - __m512i value = _mm512_mask_blend_epi32(index, value0, value1); - - return _mm512_cmpneq_epi32_mask(value, K_ZERO); - } - - template __mmask16 Detect(const HidLbpCascade & hid, size_t offset, int startStage, __mmask16 result) - { - typedef HidLbpCascade Hid; - - size_t subsetSize = (hid.ncategories + 31) / 32; - const int * subsets = hid.subsets.data(); - const Hid::Leave * leaves = hid.leaves.data(); - const Hid::Node * nodes = hid.nodes.data(); - const Hid::Stage * stages = hid.stages.data(); - int nodeOffset = stages[startStage].first; - int leafOffset = 2 * nodeOffset; - for (int i_stage = startStage, n_stages = (int)hid.stages.size(); i_stage < n_stages; i_stage++) - { - const Hid::Stage & stage = stages[i_stage]; - __m512 sum = _mm512_setzero_ps(); - for (int i_tree = 0, n_trees = stage.ntrees; i_tree < n_trees; i_tree++) - { - const Hid::Feature & feature = hid.features[nodes[nodeOffset].featureIdx]; - const int * subset = subsets + nodeOffset*subsetSize; - __mmask16 mask = LeafMask(feature, offset, subset, result); - sum = _mm512_add_ps(sum, _mm512_mask_blend_ps(mask, _mm512_set1_ps(leaves[leafOffset + 1]), _mm512_set1_ps(leaves[leafOffset + 0]))); - nodeOffset++; - leafOffset += 2; - } - result = result & _mm512_cmp_ps_mask(sum, _mm512_set1_ps(stage.threshold), _CMP_GE_OQ); - if (!result) - return result; - int resultCount = _mm_popcnt_u32(result); - if (resultCount == 1) - { - int j = _tzcnt_u32(result); - return Base::Detect(hid, offset + j, i_stage + 1) > 0 ? 
result : __mmask16(0); - } - } - return result; - } - - void DetectionLbpDetect32fp(const HidLbpCascade & hid, const Image & mask, const Rect & rect, Image & dst) - { - size_t width = rect.Width(); - size_t alignedWidth = Simd::AlignLo(width, F); - __mmask16 tailMask = TailMask16(width - alignedWidth); - Buffer buffer(width); - for (ptrdiff_t row = rect.top; row < rect.bottom; row += 1) - { - size_t col = 0; - size_t offset = row * hid.sum.stride / sizeof(uint32_t) + rect.left; - - UnpackMask32i(mask.data + row*mask.stride + rect.left, width, buffer.m, K8_01); - memset(buffer.d, 0, width * sizeof(uint32_t)); - for (; col < alignedWidth; col += F) - { - __mmask16 result = _mm512_cmpneq_epi32_mask(Load(buffer.m + col), K_ZERO); - if (result) - { - result = Detect(hid, offset + col, 0, result); - Store(buffer.d + col, _mm512_maskz_set1_epi32(result, 1)); - } - } - if (col < width) - { - __mmask16 result = _mm512_cmpneq_epi32_mask((Load(buffer.m + col, tailMask)), K_ZERO); - if (result) - { - result = Detect(hid, offset + col, 0, result); - Store(buffer.d + col, _mm512_maskz_set1_epi32(result, 1), tailMask); - } - } - PackResult32i(buffer.d, width, dst.data + row*dst.stride + rect.left); - } - } - - void DetectionLbpDetect32fp(const void * _hid, const uint8_t * mask, size_t maskStride, - ptrdiff_t left, ptrdiff_t top, ptrdiff_t right, ptrdiff_t bottom, uint8_t * dst, size_t dstStride) - { - const HidLbpCascade & hid = *(HidLbpCascade*)_hid; - return DetectionLbpDetect32fp(hid, - Image(hid.sum.width - 1, hid.sum.height - 1, maskStride, Image::Gray8, (uint8_t*)mask), - Rect(left, top, right, bottom), - Image(hid.sum.width - 1, hid.sum.height - 1, dstStride, Image::Gray8, dst).Ref()); - } - - void DetectionLbpDetect32fi(const HidLbpCascade & hid, const Image & mask, const Rect & rect, Image & dst) - { - const size_t step = 2; - size_t width = rect.Width(); - size_t alignedWidth = Simd::AlignLo(width, HA); - __mmask16 tailMask = TailMask16((width - alignedWidth) / 2); - size_t evenWidth = Simd::AlignLo(width, 2); - Buffer buffer(evenWidth); - for (ptrdiff_t row = rect.top; row < rect.bottom; row += step) - { - size_t col = 0; - size_t offset = row * hid.isum.stride / sizeof(uint32_t) + rect.left / 2; - - UnpackMask16i(mask.data + row*mask.stride + rect.left, evenWidth, buffer.m, K16_0001); - memset(buffer.d, 0, evenWidth * sizeof(uint16_t)); - for (; col < alignedWidth; col += HA) - { - __mmask16 result = _mm512_cmpneq_epi32_mask(_mm512_and_si512(Load(buffer.m + col), K32_0000FFFF), K_ZERO); - if (result) - { - result = Detect(hid, offset + col / 2, 0, result); - Store(buffer.d + col, _mm512_maskz_set1_epi32(result, 1)); - } - } - if (col < evenWidth) - { - __mmask16 result = _mm512_cmpneq_epi32_mask(_mm512_and_si512((Load((uint32_t*)buffer.m + col / 2, tailMask)), K32_0000FFFF), K_ZERO); - if (result) - { - result = Detect(hid, offset + col / 2, 0, result); - Store((uint32_t*)buffer.d + col / 2, _mm512_maskz_set1_epi32(result, 1), tailMask); - } - col += HA; - } - for (; col < width; col += step) - { - if (mask.At(col + rect.left, row) == 0) - continue; - if (Base::Detect(hid, offset + col / 2, 0) > 0) - dst.At(col + rect.left, row) = 1; - } - PackResult16i(buffer.d, evenWidth, dst.data + row*dst.stride + rect.left); - } - } - - void DetectionLbpDetect32fi(const void * _hid, const uint8_t * mask, size_t maskStride, - ptrdiff_t left, ptrdiff_t top, ptrdiff_t right, ptrdiff_t bottom, uint8_t * dst, size_t dstStride) - { - const HidLbpCascade & hid = *(HidLbpCascade*)_hid; - return 
DetectionLbpDetect32fi(hid, - Image(hid.sum.width - 1, hid.sum.height - 1, maskStride, Image::Gray8, (uint8_t*)mask), - Rect(left, top, right, bottom), - Image(hid.sum.width - 1, hid.sum.height - 1, dstStride, Image::Gray8, dst).Ref()); - } - - SIMD_INLINE __m512i IntegralSum16i(const __m512i & s0, const __m512i & s1, const __m512i & s2, const __m512i & s3) - { - return _mm512_sub_epi16(_mm512_sub_epi16(s0, s1), _mm512_sub_epi16(s2, s3)); - } - - template SIMD_INLINE void Load(__m512i a[16], const HidLbpFeature & feature, ptrdiff_t offset, __mmask32 tail = -1) - { - a[i] = Load(feature.p[i] + offset, tail); - } - - template SIMD_INLINE void Calculate(const HidLbpFeature & feature, ptrdiff_t offset, __mmask32 & index, __m512i & shuffle, __m512i & mask, __mmask32 tail = -1) - { - __m512i a[16]; - Load<5, masked>(a, feature, offset, tail); - Load<6, masked>(a, feature, offset, tail); - Load<9, masked>(a, feature, offset, tail); - Load<10, masked>(a, feature, offset, tail); - __m512i central = IntegralSum16i(a[5], a[6], a[9], a[10]); - - Load<0, masked>(a, feature, offset, tail); - Load<1, masked>(a, feature, offset, tail); - Load<4, masked>(a, feature, offset, tail); - index = _mm512_cmpge_epu16_mask(IntegralSum16i(a[0], a[1], a[4], a[5]), central); - - shuffle = K16_FF00; - Load<2, masked>(a, feature, offset, tail); - shuffle = _mm512_or_si512(shuffle, _mm512_maskz_set1_epi16(_mm512_cmpge_epu16_mask(IntegralSum16i(a[1], a[2], a[5], a[6]), central), 8)); - Load<3, masked>(a, feature, offset, tail); - Load<7, masked>(a, feature, offset, tail); - shuffle = _mm512_or_si512(shuffle, _mm512_maskz_set1_epi16(_mm512_cmpge_epu16_mask(IntegralSum16i(a[2], a[3], a[6], a[7]), central), 4)); - Load<11, masked>(a, feature, offset, tail); - shuffle = _mm512_or_si512(shuffle, _mm512_maskz_set1_epi16(_mm512_cmpge_epu16_mask(IntegralSum16i(a[6], a[7], a[10], a[11]), central), 2)); - Load<14, masked>(a, feature, offset, tail); - Load<15, masked>(a, feature, offset, tail); - shuffle = _mm512_or_si512(shuffle, _mm512_maskz_set1_epi16(_mm512_cmpge_epu16_mask(IntegralSum16i(a[10], a[11], a[14], a[15]), central), 1)); - - mask = K16_FF00; - Load<13, masked>(a, feature, offset, tail); - mask = _mm512_or_si512(mask, _mm512_maskz_set1_epi16(_mm512_cmpge_epu16_mask(IntegralSum16i(a[9], a[10], a[13], a[14]), central), 4)); - Load<12, masked>(a, feature, offset, tail); - Load<8, masked>(a, feature, offset, tail); - mask = _mm512_or_si512(mask, _mm512_maskz_set1_epi16(_mm512_cmpge_epu16_mask(IntegralSum16i(a[8], a[9], a[12], a[13]), central), 2)); - mask = _mm512_or_si512(mask, _mm512_maskz_set1_epi16(_mm512_cmpge_epu16_mask(IntegralSum16i(a[4], a[5], a[8], a[9]), central), 1)); - mask = _mm512_shuffle_epi8(K8_SHUFFLE_BITS, mask); - } - - template SIMD_INLINE __mmask32 LeafMask(const HidLbpFeature & feature, ptrdiff_t offset, const int * subset, __mmask32 tail = -1) - { - __mmask32 index; - __m512i shuffle, mask; - Calculate(feature, offset, index, shuffle, mask, tail); - - __m256i _subset = _mm256_loadu_si256((__m256i*)subset); - __m512i subset0 = _mm512_broadcast_i32x4(_mm256_extracti128_si256(_subset, 0)); - __m512i subset1 = _mm512_broadcast_i32x4(_mm256_extracti128_si256(_subset, 1)); - - __m512i value0 = _mm512_and_si512(_mm512_shuffle_epi8(subset0, shuffle), mask); - __m512i value1 = _mm512_and_si512(_mm512_shuffle_epi8(subset1, shuffle), mask); - __m512i value = _mm512_mask_blend_epi16(index, value0, value1); - - return _mm512_cmpneq_epi16_mask(value, K_ZERO); - } - - template __mmask32 Detect(const HidLbpCascade 
& hid, size_t offset, int startStage, __mmask32 result) - { - typedef HidLbpCascade Hid; - - size_t subsetSize = (hid.ncategories + 31) / 32; - const int * subsets = hid.subsets.data(); - const Hid::Leave * leaves = hid.leaves.data(); - const Hid::Node * nodes = hid.nodes.data(); - const Hid::Stage * stages = hid.stages.data(); - int nodeOffset = 0, leafOffset = 0; - for (int i_stage = 0, n_stages = (int)hid.stages.size(); i_stage < n_stages; i_stage++) - { - const Hid::Stage & stage = stages[i_stage]; - __m512i sum = _mm512_setzero_si512(); - for (int i_tree = 0, n_trees = stage.ntrees; i_tree < n_trees; i_tree++) - { - const Hid::Feature & feature = hid.features[nodes[nodeOffset].featureIdx]; - const int * subset = subsets + nodeOffset*subsetSize; - __mmask32 mask = LeafMask(feature, offset, subset, result); - sum = _mm512_add_epi16(sum, _mm512_mask_blend_epi16(mask, _mm512_set1_epi16(leaves[leafOffset + 1]), _mm512_set1_epi16(leaves[leafOffset + 0]))); - nodeOffset++; - leafOffset += 2; - } - result = result & _mm512_cmpge_epi16_mask(sum, _mm512_set1_epi16(stage.threshold)); - if (!result) - return result; - int resultCount = _mm_popcnt_u32(result); - if (resultCount == 1) - { - int j = _tzcnt_u32(result); - return Base::Detect(hid, offset + j, i_stage + 1) > 0 ? result : __mmask32(0); - } - } - return result; - } - - void DetectionLbpDetect16ip(const HidLbpCascade & hid, const Image & mask, const Rect & rect, Image & dst) - { - size_t width = rect.Width(); - size_t alignedWidth = Simd::AlignLo(width, HA); - __mmask32 tailMask = TailMask32(width - alignedWidth); - size_t evenWidth = Simd::AlignLo(width, 2); - Buffer buffer(width); - for (ptrdiff_t row = rect.top; row < rect.bottom; row += 1) - { - size_t col = 0; - size_t offset = row * hid.isum.stride / sizeof(uint16_t) + rect.left; - UnpackMask16i(mask.data + row*mask.stride + rect.left, width, buffer.m, K8_01); - memset(buffer.d, 0, width * sizeof(uint16_t)); - for (; col < alignedWidth; col += HA) - { - __mmask32 result = _mm512_cmpneq_epi16_mask(Load(buffer.m + col), K_ZERO); - if (result) - { - result = Detect(hid, offset + col, 0, result); - Store(buffer.d + col, _mm512_maskz_set1_epi16(result, 1)); - } - } - if (col < width) - { - __mmask32 result = _mm512_cmpneq_epi16_mask((Load(buffer.m + col, tailMask)), K_ZERO); - if (result) - { - result = Detect(hid, offset + col, 0, result); - Store(buffer.d + col, _mm512_maskz_set1_epi16(result, 1), tailMask); - } - } - PackResult16i(buffer.d, width, dst.data + row*dst.stride + rect.left); - } - } - - void DetectionLbpDetect16ip(const void * _hid, const uint8_t * mask, size_t maskStride, - ptrdiff_t left, ptrdiff_t top, ptrdiff_t right, ptrdiff_t bottom, uint8_t * dst, size_t dstStride) - { - const HidLbpCascade & hid = *(HidLbpCascade*)_hid; - return DetectionLbpDetect16ip(hid, - Image(hid.sum.width - 1, hid.sum.height - 1, maskStride, Image::Gray8, (uint8_t*)mask), - Rect(left, top, right, bottom), - Image(hid.sum.width - 1, hid.sum.height - 1, dstStride, Image::Gray8, dst).Ref()); - } - - void DetectionLbpDetect16ii(const HidLbpCascade & hid, const Image & mask, const Rect & rect, Image & dst) - { - const size_t step = 2; - size_t width = rect.Width(); - size_t alignedWidth = Simd::AlignLo(width, A); - __mmask32 tailMask = TailMask32((width - alignedWidth) / 2); - size_t evenWidth = Simd::AlignLo(width, 2); - - for (ptrdiff_t row = rect.top; row < rect.bottom; row += step) - { - size_t col = 0; - size_t offset = row * hid.isum.stride / sizeof(uint16_t) + rect.left / 2; - const uint8_t 
* m = mask.data + row*mask.stride + rect.left; - uint8_t * d = dst.data + row*dst.stride + rect.left; - for (; col < alignedWidth; col += A) - { - __mmask32 result = _mm512_cmpneq_epi16_mask(_mm512_and_si512(Load(m + col), K16_00FF), K_ZERO); - if (result) - { - result = Detect(hid, offset + col / 2, 0, result); - Store(d + col, _mm512_maskz_set1_epi16(result, 1)); - } - } - if (col < evenWidth) - { - __mmask32 result = _mm512_cmpneq_epi16_mask(_mm512_and_si512((Load((uint16_t*)m + col / 2, tailMask)), K16_00FF), K_ZERO); - if (result) - { - result = Detect(hid, offset + col / 2, 0, result); - Store((uint16_t*)d + col / 2, _mm512_maskz_set1_epi16(result, 1), tailMask); - } - col += A; - } - for (; col < width; col += step) - { - if (mask.At(col + rect.left, row) == 0) - continue; - if (Base::Detect(hid, offset + col / 2, 0) > 0) - dst.At(col + rect.left, row) = 1; - } - } - } - - void DetectionLbpDetect16ii(const void * _hid, const uint8_t * mask, size_t maskStride, - ptrdiff_t left, ptrdiff_t top, ptrdiff_t right, ptrdiff_t bottom, uint8_t * dst, size_t dstStride) - { - const HidLbpCascade & hid = *(HidLbpCascade*)_hid; - return DetectionLbpDetect16ii(hid, - Image(hid.sum.width - 1, hid.sum.height - 1, maskStride, Image::Gray8, (uint8_t*)mask), - Rect(left, top, right, bottom), - Image(hid.sum.width - 1, hid.sum.height - 1, dstStride, Image::Gray8, dst).Ref()); - } - } -#endif// SIMD_AVX512BW_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx512bwEdgeBackground.cpp b/src/3rd/Simd/Simd/SimdAvx512bwEdgeBackground.cpp deleted file mode 100644 index 00b4d8bd..00000000 --- a/src/3rd/Simd/Simd/SimdAvx512bwEdgeBackground.cpp +++ /dev/null @@ -1,308 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
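[Note on the deleted SimdAvx512bwEdgeBackground.cpp below: its grow-range-slow kernel compares each pixel against the background model and applies a saturating +1 only in the lanes where the pixel is brighter, all driven by a single __mmask64. A standalone equivalent of one 64-pixel step, with explicit unaligned intrinsics in place of the library's Load/Store wrappers — hypothetical helper, no alignment or tail handling.]

    #include <immintrin.h>
    #include <stdint.h>

    // Hypothetical helper, not part of the Simd library: one 64-pixel
    // step of the EdgeBackgroundGrowRangeSlow update.
    static void GrowRangeSlow64(const uint8_t* value, uint8_t* background)
    {
        __m512i v = _mm512_loadu_si512(value);
        __m512i b = _mm512_loadu_si512(background);
        __mmask64 inc = _mm512_cmpgt_epu8_mask(v, b); // value > background?
        // Saturating +1 only in the flagged lanes; other lanes keep b.
        __m512i grown = _mm512_mask_adds_epu8(b, inc, b, _mm512_set1_epi8(1));
        _mm512_storeu_si512(background, grown);
    }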
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdCompare.h" -#include "Simd/SimdSet.h" -#include "Simd/SimdBase.h" - -namespace Simd -{ -#ifdef SIMD_AVX512BW_ENABLE - namespace Avx512bw - { - template SIMD_INLINE void EdgeBackgroundGrowRangeSlow(const uint8_t * value, uint8_t * background, __mmask64 m = -1) - { - const __m512i _value = Load(value, m); - const __m512i _background = Load(background, m); - const __mmask64 inc = _mm512_cmpgt_epu8_mask(_value, _background); - Store(background, _mm512_mask_adds_epu8(_background, inc, _background, K8_01), m); - } - - template void EdgeBackgroundGrowRangeSlow(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * background, size_t backgroundStride) - { - if (align) - { - assert(Aligned(value) && Aligned(valueStride)); - assert(Aligned(background) && Aligned(backgroundStride)); - } - - size_t alignedWidth = AlignLo(width, A); - __mmask64 tailMask = TailMask64(width - alignedWidth); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < alignedWidth; col += A) - EdgeBackgroundGrowRangeSlow(value + col, background + col); - if (col < width) - EdgeBackgroundGrowRangeSlow(value + col, background + col, tailMask); - value += valueStride; - background += backgroundStride; - } - } - - void EdgeBackgroundGrowRangeSlow(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * background, size_t backgroundStride) - { - if (Aligned(value) && Aligned(valueStride) && Aligned(background) && Aligned(backgroundStride)) - EdgeBackgroundGrowRangeSlow(value, valueStride, width, height, background, backgroundStride); - else - EdgeBackgroundGrowRangeSlow(value, valueStride, width, height, background, backgroundStride); - } - - template SIMD_INLINE void EdgeBackgroundGrowRangeFast(const uint8_t * value, uint8_t * background, __mmask64 m = -1) - { - const __m512i _value = Load(value, m); - const __m512i _background = Load(background, m); - Store(background, _mm512_max_epu8(_background, _value), m); - } - - template void EdgeBackgroundGrowRangeFast(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * background, size_t backgroundStride) - { - if (align) - { - assert(Aligned(value) && Aligned(valueStride)); - assert(Aligned(background) && Aligned(backgroundStride)); - } - - size_t alignedWidth = AlignLo(width, A); - __mmask64 tailMask = TailMask64(width - alignedWidth); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < alignedWidth; col += A) - EdgeBackgroundGrowRangeFast(value + col, background + col); - if (col < width) - EdgeBackgroundGrowRangeFast(value + col, background + col, tailMask); - value += valueStride; - background += backgroundStride; - } - } - - void EdgeBackgroundGrowRangeFast(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * background, size_t backgroundStride) - { - if (Aligned(value) && Aligned(valueStride) && Aligned(background) && Aligned(backgroundStride)) - EdgeBackgroundGrowRangeFast(value, valueStride, width, height, background, backgroundStride); - else - EdgeBackgroundGrowRangeFast(value, valueStride, width, height, background, backgroundStride); - } - - template SIMD_INLINE void EdgeBackgroundIncrementCount(const uint8_t * value, - const uint8_t * backgroundValue, uint8_t * backgroundCount, size_t offset, __mmask64 m = -1) - { - const __m512i _value = Load(value + offset, m); - const __m512i _backgroundValue = Load(backgroundValue + 
offset, m); - const __m512i _backgroundCount = Load(backgroundCount + offset, m); - const __mmask64 inc = _mm512_cmpgt_epu8_mask(_value, _backgroundValue); - Store(backgroundCount + offset, _mm512_mask_adds_epu8(_backgroundCount, inc, _backgroundCount, K8_01), m); - } - - template void EdgeBackgroundIncrementCount(const uint8_t * value, size_t valueStride, size_t width, size_t height, - const uint8_t * backgroundValue, size_t backgroundValueStride, uint8_t * backgroundCount, size_t backgroundCountStride) - { - if (align) - { - assert(Aligned(value) && Aligned(valueStride)); - assert(Aligned(backgroundValue) && Aligned(backgroundValueStride)); - assert(Aligned(backgroundCount) && Aligned(backgroundCountStride)); - } - - size_t alignedWidth = AlignLo(width, A); - __mmask64 tailMask = TailMask64(width - alignedWidth); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < alignedWidth; col += A) - EdgeBackgroundIncrementCount(value, backgroundValue, backgroundCount, col); - if (col < width) - EdgeBackgroundIncrementCount(value, backgroundValue, backgroundCount, col, tailMask); - value += valueStride; - backgroundValue += backgroundValueStride; - backgroundCount += backgroundCountStride; - } - } - - void EdgeBackgroundIncrementCount(const uint8_t * value, size_t valueStride, size_t width, size_t height, - const uint8_t * backgroundValue, size_t backgroundValueStride, uint8_t * backgroundCount, size_t backgroundCountStride) - { - if (Aligned(value) && Aligned(valueStride) && - Aligned(backgroundValue) && Aligned(backgroundValueStride) && Aligned(backgroundCount) && Aligned(backgroundCountStride)) - EdgeBackgroundIncrementCount(value, valueStride, width, height, - backgroundValue, backgroundValueStride, backgroundCount, backgroundCountStride); - else - EdgeBackgroundIncrementCount(value, valueStride, width, height, - backgroundValue, backgroundValueStride, backgroundCount, backgroundCountStride); - } - - SIMD_INLINE __m512i AdjustEdge(const __m512i & count, const __m512i & value, const __m512i & threshold) - { - const __mmask64 inc = _mm512_cmpgt_epu8_mask(count, threshold); - const __mmask64 dec = _mm512_cmplt_epu8_mask(count, threshold); - __m512i added = _mm512_mask_adds_epu8(value, inc, value, K8_01); - return _mm512_mask_subs_epu8(added, dec, added, K8_01); - } - - template SIMD_INLINE void EdgeBackgroundAdjustRange(uint8_t * backgroundCount, uint8_t * backgroundValue, const __m512i & threshold, __mmask64 m = -1) - { - const __m512i _backgroundCount = Load(backgroundCount, m); - const __m512i _backgroundValue = Load(backgroundValue, m); - Store(backgroundValue, AdjustEdge(_backgroundCount, _backgroundValue, threshold), m); - Store(backgroundCount, K_ZERO, m); - } - - template void EdgeBackgroundAdjustRange(uint8_t * backgroundCount, size_t backgroundCountStride, size_t width, size_t height, - uint8_t * backgroundValue, size_t backgroundValueStride, uint8_t threshold) - { - if (align) - { - assert(Aligned(backgroundValue) && Aligned(backgroundValueStride)); - assert(Aligned(backgroundCount) && Aligned(backgroundCountStride)); - } - - const __m512i _threshold = _mm512_set1_epi8((char)threshold); - size_t alignedWidth = AlignLo(width, A); - __mmask64 tailMask = TailMask64(width - alignedWidth); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < alignedWidth; col += A) - EdgeBackgroundAdjustRange(backgroundCount + col, backgroundValue + col, _threshold); - if (col < width) - EdgeBackgroundAdjustRange(backgroundCount + col, backgroundValue 
+ col, _threshold, tailMask); - backgroundValue += backgroundValueStride; - backgroundCount += backgroundCountStride; - } - } - - void EdgeBackgroundAdjustRange(uint8_t * backgroundCount, size_t backgroundCountStride, size_t width, size_t height, - uint8_t * backgroundValue, size_t backgroundValueStride, uint8_t threshold) - { - if (Aligned(backgroundValue) && Aligned(backgroundValueStride) && - Aligned(backgroundCount) && Aligned(backgroundCountStride)) - EdgeBackgroundAdjustRange(backgroundCount, backgroundCountStride, width, height, - backgroundValue, backgroundValueStride, threshold); - else - EdgeBackgroundAdjustRange(backgroundCount, backgroundCountStride, width, height, - backgroundValue, backgroundValueStride, threshold); - } - - template SIMD_INLINE void EdgeBackgroundAdjustRangeMasked(uint8_t * backgroundCount, uint8_t * backgroundValue, - const uint8_t * pmask, const __m512i & threshold, __mmask64 m = -1) - { - const __m512i _mask = Load(pmask, m); - const __mmask64 mm = _mm512_cmpneq_epu8_mask(_mask, K_ZERO) & m; - - const __m512i _backgroundCount = Load(backgroundCount, m); - const __m512i _backgroundValue = Load(backgroundValue, m); - - Store(backgroundValue, AdjustEdge(_backgroundCount, _backgroundValue, threshold), mm); - Store(backgroundCount, K_ZERO, m); - } - - template void EdgeBackgroundAdjustRangeMasked(uint8_t * backgroundCount, size_t backgroundCountStride, size_t width, size_t height, - uint8_t * backgroundValue, size_t backgroundValueStride, uint8_t threshold, const uint8_t * mask, size_t maskStride) - { - if (align) - { - assert(Aligned(backgroundValue) && Aligned(backgroundValueStride)); - assert(Aligned(backgroundCount) && Aligned(backgroundCountStride)); - assert(Aligned(mask) && Aligned(maskStride)); - } - - const __m512i _threshold = _mm512_set1_epi8((char)threshold); - size_t alignedWidth = AlignLo(width, A); - __mmask64 tailMask = TailMask64(width - alignedWidth); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < alignedWidth; col += A) - EdgeBackgroundAdjustRangeMasked(backgroundCount + col, backgroundValue + col, mask + col, _threshold); - if (col < width) - EdgeBackgroundAdjustRangeMasked(backgroundCount + col, backgroundValue + col, mask + col, _threshold, tailMask); - backgroundValue += backgroundValueStride; - backgroundCount += backgroundCountStride; - mask += maskStride; - } - } - - void EdgeBackgroundAdjustRangeMasked(uint8_t * backgroundCount, size_t backgroundCountStride, size_t width, size_t height, - uint8_t * backgroundValue, size_t backgroundValueStride, uint8_t threshold, const uint8_t * mask, size_t maskStride) - { - if (Aligned(backgroundValue) && Aligned(backgroundValueStride) && - Aligned(backgroundCount) && Aligned(backgroundCountStride) && - Aligned(mask) && Aligned(maskStride)) - EdgeBackgroundAdjustRangeMasked(backgroundCount, backgroundCountStride, width, height, - backgroundValue, backgroundValueStride, threshold, mask, maskStride); - else - EdgeBackgroundAdjustRangeMasked(backgroundCount, backgroundCountStride, width, height, - backgroundValue, backgroundValueStride, threshold, mask, maskStride); - } - - template SIMD_INLINE void EdgeBackgroundShiftRangeMasked(const uint8_t * value, uint8_t * background, const uint8_t * mask, __mmask64 tail = -1) - { - const __m512i _mask = Load(mask, tail); - const __mmask64 mmask = _mm512_cmpneq_epu8_mask(_mask, K_ZERO) & tail; - const __m512i _value = Load(value, tail); - Store(background, _value, mmask); - } - - template void EdgeBackgroundShiftRangeMasked(const 
uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * background, size_t backgroundStride, const uint8_t * mask, size_t maskStride) - { - if (align) - { - assert(Aligned(value) && Aligned(valueStride)); - assert(Aligned(background) && Aligned(backgroundStride)); - assert(Aligned(mask) && Aligned(maskStride)); - } - - size_t alignedWidth = AlignLo(width, A); - __mmask64 tailMask = TailMask64(width - alignedWidth); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < alignedWidth; col += A) - EdgeBackgroundShiftRangeMasked(value + col, background + col, mask + col); - if (col < width) - EdgeBackgroundShiftRangeMasked(value + col, background + col, mask + col, tailMask); - value += valueStride; - background += backgroundStride; - mask += maskStride; - } - } - - void EdgeBackgroundShiftRangeMasked(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * background, size_t backgroundStride, const uint8_t * mask, size_t maskStride) - { - if (Aligned(value) && Aligned(valueStride) && Aligned(background) && Aligned(backgroundStride) && Aligned(mask) && Aligned(maskStride)) - EdgeBackgroundShiftRangeMasked(value, valueStride, width, height, background, backgroundStride, mask, maskStride); - else - EdgeBackgroundShiftRangeMasked(value, valueStride, width, height, background, backgroundStride, mask, maskStride); - } - } -#endif// SIMD_AVX512BW_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx512bwFill.cpp b/src/3rd/Simd/Simd/SimdAvx512bwFill.cpp deleted file mode 100644 index 05c0f72a..00000000 --- a/src/3rd/Simd/Simd/SimdAvx512bwFill.cpp +++ /dev/null @@ -1,183 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" - -namespace Simd -{ -#ifdef SIMD_AVX512BW_ENABLE - namespace Avx512bw - { - template SIMD_INLINE void FillBgr(uint8_t * dst, const __m512i bgrs[3], const __mmask64 * tails) - { - Store(dst + 0 * A, bgrs[0], tails[0]); - Store(dst + 1 * A, bgrs[1], tails[1]); - Store(dst + 2 * A, bgrs[2], tails[2]); - } - - template SIMD_INLINE void FillBgr2(uint8_t * dst, const __m512i bgrs[3]) - { - Store(dst + 0 * A, bgrs[0]); - Store(dst + 1 * A, bgrs[1]); - Store(dst + 2 * A, bgrs[2]); - Store(dst + 3 * A, bgrs[0]); - Store(dst + 4 * A, bgrs[1]); - Store(dst + 5 * A, bgrs[2]); - } - - template void FillBgr(uint8_t * dst, size_t stride, size_t width, size_t height, uint8_t blue, uint8_t green, uint8_t red) - { - size_t size = width * 3; - size_t step = A * 3; - size_t alignedSize = AlignLo(width, A) * 3; - __mmask64 tailMasks[3]; - for (size_t c = 0; c < 3; ++c) - tailMasks[c] = TailMask64(size - alignedSize - A*c); - size_t step2 = 2 * step; - size_t alignedSize2 = AlignLo(width, 2 * A) * 3; - - uint32_t bgrb = uint32_t(blue) | (uint32_t(green) << 8) | (uint32_t(red) << 16) | (uint32_t(blue) << 24); - uint32_t grbg = uint32_t(green) | (uint32_t(red) << 8) | (uint32_t(blue) << 16) | (uint32_t(green) << 24); - uint32_t rbgr = uint32_t(red) | (uint32_t(blue) << 8) | (uint32_t(green) << 16) | (uint32_t(red) << 24); - - __m512i bgrs[3]; - bgrs[0] = _mm512_setr_epi32(bgrb, grbg, rbgr, bgrb, grbg, rbgr, bgrb, grbg, rbgr, bgrb, grbg, rbgr, bgrb, grbg, rbgr, bgrb); - bgrs[1] = _mm512_setr_epi32(grbg, rbgr, bgrb, grbg, rbgr, bgrb, grbg, rbgr, bgrb, grbg, rbgr, bgrb, grbg, rbgr, bgrb, grbg); - bgrs[2] = _mm512_setr_epi32(rbgr, bgrb, grbg, rbgr, bgrb, grbg, rbgr, bgrb, grbg, rbgr, bgrb, grbg, rbgr, bgrb, grbg, rbgr); - - for (size_t row = 0; row < height; ++row) - { - size_t offset = 0; - for (; offset < alignedSize2; offset += step2) - FillBgr2(dst + offset, bgrs); - for (; offset < alignedSize; offset += step) - FillBgr(dst + offset, bgrs, tailMasks); - if (offset < size) - FillBgr(dst + offset, bgrs, tailMasks); - dst += stride; - } - } - - void FillBgr(uint8_t * dst, size_t stride, size_t width, size_t height, uint8_t blue, uint8_t green, uint8_t red) - { - if (Aligned(dst) && Aligned(stride)) - FillBgr(dst, stride, width, height, blue, green, red); - else - FillBgr(dst, stride, width, height, blue, green, red); - } - - template void FillBgra(uint8_t * dst, size_t stride, size_t width, size_t height, uint8_t blue, uint8_t green, uint8_t red, uint8_t alpha) - { - size_t size = width * 4; - size_t alignedSize = AlignLo(size, A); - size_t fullAlignedSize = AlignLo(size, QA); - __mmask64 tailMask = TailMask64(size - alignedSize); - - uint32_t bgra32 = uint32_t(blue) | (uint32_t(green) << 8) | (uint32_t(red) << 16) | (uint32_t(alpha) << 24); - __m512i bgra512 = _mm512_set1_epi32(bgra32); - - for (size_t row = 0; row < height; ++row) - { - size_t offset = 0; - for (; offset < fullAlignedSize; offset += QA) - { - Store(dst + offset + 0 * A, bgra512); - Store(dst + offset + 1 * A, bgra512); - Store(dst + offset + 2 * A, bgra512); - Store(dst + offset + 3 * A, bgra512); - } - for (; offset < alignedSize; offset += A) - Store(dst + offset, bgra512, tailMask); - if (offset < size) - Store(dst + offset, bgra512, tailMask); - dst += stride; - } - } - - void FillBgra(uint8_t * dst, size_t stride, size_t width, size_t height, uint8_t blue, uint8_t green, uint8_t red, uint8_t alpha) - { - if (Aligned(dst) && Aligned(stride)) - FillBgra(dst, stride, width, height, 
blue, green, red, alpha); - else - FillBgra(dst, stride, width, height, blue, green, red, alpha); - } - - template void FillPixel(uint8_t * dst, size_t stride, size_t width, size_t height, const __m512i & pixel) - { - size_t fullAlignedWidth = AlignLo(width, QA); - size_t alignedWidth = AlignLo(width, A); - __mmask64 tailMask = TailMask64(width - alignedWidth); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < fullAlignedWidth; col += QA) - { - Store(dst + col + 0 * A, pixel); - Store(dst + col + 1 * A, pixel); - Store(dst + col + 2 * A, pixel); - Store(dst + col + 3 * A, pixel); - } - for (; col < alignedWidth; col += A) - Store(dst + col, pixel); - if (col < width) - Store(dst + col, pixel, tailMask); - dst += stride; - } - } - - template void FillPixel(uint8_t * dst, size_t stride, size_t width, size_t height, const uint8_t * pixel, size_t pixelSize) - { - if (pixelSize == 3) - FillBgr(dst, stride, width, height, pixel[0], pixel[1], pixel[2]); - else - { - __m512i _pixel; - switch (pixelSize) - { - case 1: - _pixel = _mm512_set1_epi8(*pixel); - break; - case 2: - _pixel = _mm512_set1_epi16(*(uint16_t*)pixel); - break; - case 4: - _pixel = _mm512_set1_epi32(*(uint32_t*)pixel); - break; - default: - assert(0); - } - FillPixel(dst, stride, width*pixelSize, height, _pixel); - } - } - - void FillPixel(uint8_t * dst, size_t stride, size_t width, size_t height, const uint8_t * pixel, size_t pixelSize) - { - if (Aligned(dst) && Aligned(stride)) - FillPixel(dst, stride, width, height, pixel, pixelSize); - else - FillPixel(dst, stride, width, height, pixel, pixelSize); - } - } -#endif// SIMD_AVX512BW_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx512bwFloat16.cpp b/src/3rd/Simd/Simd/SimdAvx512bwFloat16.cpp deleted file mode 100644 index f6559e2a..00000000 --- a/src/3rd/Simd/Simd/SimdAvx512bwFloat16.cpp +++ /dev/null @@ -1,690 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2019 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdExtract.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdCpu.h" - -namespace Simd -{ -#ifdef SIMD_AVX512BW_ENABLE - namespace Avx512bw - { - template SIMD_INLINE void Float32ToFloat16(const float * src, uint16_t * dst, const __mmask16 * srcTails, __mmask32 dstTail) - { - __m256i lo = _mm512_cvtps_ph((Avx512f::Load(src + 0, srcTails[0])), 0); - __m256i hi = _mm512_cvtps_ph((Avx512f::Load(src + F, srcTails[1])), 0); - Store(dst, _mm512_inserti64x4(_mm512_castsi256_si512(lo), hi, 1), dstTail); - } - - template SIMD_INLINE void Float32ToFloat16x2(const float * src, uint16_t * dst) - { - Store(dst + 0 * HA, _mm512_inserti64x4(_mm512_castsi256_si512(_mm512_cvtps_ph(Avx512f::Load(src + 0 * F), 0)), _mm512_cvtps_ph(Avx512f::Load(src + 1 * F), 0), 1)); - Store(dst + 1 * HA, _mm512_inserti64x4(_mm512_castsi256_si512(_mm512_cvtps_ph(Avx512f::Load(src + 2 * F), 0)), _mm512_cvtps_ph(Avx512f::Load(src + 3 * F), 0), 1)); - } - - template void Float32ToFloat16(const float * src, size_t size, uint16_t * dst) - { - if (align) - assert(Aligned(src) && Aligned(dst)); - - size_t fullAlignedSize = Simd::AlignLo(size, QF); - size_t alignedSize = Simd::AlignLo(size, DF); - __mmask16 srcTailMasks[2]; - for (size_t c = 0; c < 2; ++c) - srcTailMasks[c] = TailMask16(size - alignedSize - F*c); - __mmask32 dstTailMask = TailMask32(size - alignedSize); - - size_t i = 0; - for (; i < fullAlignedSize; i += QF) - Float32ToFloat16x2(src + i, dst + i); - for (; i < alignedSize; i += DF) - Float32ToFloat16(src + i, dst + i, srcTailMasks, dstTailMask); - if (i < size) - Float32ToFloat16(src + i, dst + i, srcTailMasks, dstTailMask); - } - - void Float32ToFloat16(const float * src, size_t size, uint16_t * dst) - { - if (Aligned(src) && Aligned(dst)) - Float32ToFloat16(src, size, dst); - else - Float32ToFloat16(src, size, dst); - } - - template SIMD_INLINE void Float16ToFloat32(const uint16_t * src, float * dst, __mmask32 srcTail, const __mmask16 * dstTails) - { - __m512i _src = Load(src, srcTail); - Avx512f::Store(dst + 0, _mm512_cvtph_ps(_mm512_extracti64x4_epi64(_src, 0)), dstTails[0]); - Avx512f::Store(dst + F, _mm512_cvtph_ps(_mm512_extracti64x4_epi64(_src, 1)), dstTails[1]); - } - - template SIMD_INLINE void Float16ToFloat32x2(const uint16_t * src, float * dst) - { -#if defined(_MSC_VER) - const __m512i src0 = Load(src + 00); - Avx512f::Store(dst + 0 * F, _mm512_cvtph_ps(_mm512_extracti64x4_epi64(src0, 0))); - Avx512f::Store(dst + 1 * F, _mm512_cvtph_ps(_mm512_extracti64x4_epi64(src0, 1))); - const __m512i src1 = Load(src + HA); - Avx512f::Store(dst + 2 * F, _mm512_cvtph_ps(_mm512_extracti64x4_epi64(src1, 0))); - Avx512f::Store(dst + 3 * F, _mm512_cvtph_ps(_mm512_extracti64x4_epi64(src1, 1))); -#else - Avx512f::Store(dst + 0 * F, _mm512_cvtph_ps(Avx2::Load((__m256i*)src + 0))); - Avx512f::Store(dst + 1 * F, _mm512_cvtph_ps(Avx2::Load((__m256i*)src + 1))); - Avx512f::Store(dst + 2 * F, _mm512_cvtph_ps(Avx2::Load((__m256i*)src + 2))); - Avx512f::Store(dst + 3 * F, _mm512_cvtph_ps(Avx2::Load((__m256i*)src + 3))); -#endif - } - - template void Float16ToFloat32(const uint16_t * src, size_t size, float * dst) - { - if (align) - assert(Aligned(src) && Aligned(dst)); - - size_t fullAlignedSize = Simd::AlignLo(size, QF); - size_t alignedSize = Simd::AlignLo(size, DF); - __mmask32 srcTailMask = TailMask32(size - alignedSize); - __mmask16 dstTailMasks[2]; - for (size_t c = 0; c < 2; ++c) - dstTailMasks[c] = TailMask16(size - alignedSize - F*c); - - size_t i = 0; - for (; i < 
fullAlignedSize; i += QF) - Float16ToFloat32x2(src + i, dst + i); - for (; i < alignedSize; i += DF) - Float16ToFloat32(src + i, dst + i, srcTailMask, dstTailMasks); - if (i < size) - Float16ToFloat32(src + i, dst + i, srcTailMask, dstTailMasks); - } - - void Float16ToFloat32(const uint16_t * src, size_t size, float * dst) - { - if (Aligned(src) && Aligned(dst)) - Float16ToFloat32(src, size, dst); - else - Float16ToFloat32(src, size, dst); - } - - template SIMD_INLINE void SquaredDifferenceSum16f(const __m512i & a, const __m512i & b, __m512 * sums) - { - __m512 _a = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(a, part)); - __m512 _b = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(b, part)); - __m512 _d = _mm512_sub_ps(_a, _b); - sums[part] = _mm512_fmadd_ps(_d, _d, sums[part]); - } - - template SIMD_INLINE void SquaredDifferenceSum16f2(const uint16_t * a, const uint16_t * b, __m512 * sums, __mmask32 tail = -1) - { - __m512i a0 = Load(a, tail); - __m512i b0 = Load(b, tail); - SquaredDifferenceSum16f<0>(a0, b0, sums); - SquaredDifferenceSum16f<1>(a0, b0, sums); - } - - template SIMD_INLINE void SquaredDifferenceSum16f4(const uint16_t * a, const uint16_t * b, __m512 * sums) - { -#if defined(_MSC_VER) - __m512i a0 = Load(a + 00); - __m512i b0 = Load(b + 00); - SquaredDifferenceSum16f<0>(a0, b0, sums); - SquaredDifferenceSum16f<1>(a0, b0, sums); - __m512i a1 = Load(a + HA); - __m512i b1 = Load(b + HA); - SquaredDifferenceSum16f<0>(a1, b1, sums); - SquaredDifferenceSum16f<1>(a1, b1, sums); -#else - __m512 a0 = _mm512_cvtph_ps(Avx2::Load((__m256i*)a + 0)); - __m512 b0 = _mm512_cvtph_ps(Avx2::Load((__m256i*)b + 0)); - __m512 d0 = _mm512_sub_ps(a0, b0); - sums[0] = _mm512_fmadd_ps(d0, d0, sums[0]); - - __m512 a1 = _mm512_cvtph_ps(Avx2::Load((__m256i*)a + 1)); - __m512 b1 = _mm512_cvtph_ps(Avx2::Load((__m256i*)b + 1)); - __m512 d1 = _mm512_sub_ps(a1, b1); - sums[1] = _mm512_fmadd_ps(d1, d1, sums[1]); - - __m512 a2 = _mm512_cvtph_ps(Avx2::Load((__m256i*)a + 2)); - __m512 b2 = _mm512_cvtph_ps(Avx2::Load((__m256i*)b + 2)); - __m512 d2 = _mm512_sub_ps(a2, b2); - sums[0] = _mm512_fmadd_ps(d2, d2, sums[0]); - - __m512 a3 = _mm512_cvtph_ps(Avx2::Load((__m256i*)a + 3)); - __m512 b3 = _mm512_cvtph_ps(Avx2::Load((__m256i*)b + 3)); - __m512 d3 = _mm512_sub_ps(a3, b3); - sums[1] = _mm512_fmadd_ps(d3, d3, sums[1]); -#endif - } - - template SIMD_INLINE void SquaredDifferenceSum16f(const uint16_t * a, const uint16_t * b, size_t size, float * sum) - { - if (align) - assert(Aligned(a) && Aligned(b)); - - size_t alignedSize = AlignLo(size, DF); - __mmask32 tailMask = TailMask32(size - alignedSize); - size_t fullAlignedSize = AlignLo(size, QF); - size_t i = 0; - __m512 sums[2] = { _mm512_setzero_ps(), _mm512_setzero_ps() }; - for (; i < fullAlignedSize; i += QF) - SquaredDifferenceSum16f4(a + i, b + i, sums); - for (; i < alignedSize; i += DF) - SquaredDifferenceSum16f2(a + i, b + i, sums); - if (i < size) - SquaredDifferenceSum16f2(a + i, b + i, sums, tailMask); - sums[0] = _mm512_add_ps(sums[0], sums[1]); - *sum = Avx512f::ExtractSum(sums[0]); - } - - void SquaredDifferenceSum16f(const uint16_t * a, const uint16_t * b, size_t size, float * sum) - { - if (Aligned(a) && Aligned(b)) - SquaredDifferenceSum16f(a, b, size, sum); - else - SquaredDifferenceSum16f(a, b, size, sum); - } - - template SIMD_INLINE void CosineDistance16f(const __m512i & a, const __m512i & b, __m512 * aa, __m512 * ab, __m512 * bb) - { - __m512 a0 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(a, part)); - __m512 b0 = 
_mm512_cvtph_ps(_mm512_extracti64x4_epi64(b, part)); - aa[part] = _mm512_fmadd_ps(a0, a0, aa[part]); - ab[part] = _mm512_fmadd_ps(a0, b0, ab[part]); - bb[part] = _mm512_fmadd_ps(b0, b0, bb[part]); - } - - template SIMD_INLINE void CosineDistance16f2(const uint16_t * a, const uint16_t * b, __m512 * aa, __m512 * ab, __m512 * bb, __mmask32 tail = -1) - { - __m512i a0 = Load(a, tail); - __m512i b0 = Load(b, tail); - CosineDistance16f<0>(a0, b0, aa, ab, bb); - CosineDistance16f<1>(a0, b0, aa, ab, bb); - } - - template SIMD_INLINE void CosineDistance16f4(const uint16_t * a, const uint16_t * b, __m512 * aa, __m512 * ab, __m512 * bb) - { - __m512i a0 = Load(a + 00); - __m512i b0 = Load(b + 00); - CosineDistance16f<0>(a0, b0, aa, ab, bb); - CosineDistance16f<1>(a0, b0, aa, ab, bb); - __m512i a1 = Load(a + HA); - __m512i b1 = Load(b + HA); - CosineDistance16f<0>(a1, b1, aa, ab, bb); - CosineDistance16f<1>(a1, b1, aa, ab, bb); - } - - template void CosineDistance16f(const uint16_t * a, const uint16_t * b, size_t size, float * distance) - { - if (align) - assert(Aligned(a) && Aligned(b)); - - size_t alignedSize = AlignLo(size, DF); - __mmask32 tailMask = TailMask32(size - alignedSize); - size_t fullAlignedSize = AlignLo(size, QF); - size_t i = 0; - __m512 _aa[2] = { _mm512_setzero_ps(), _mm512_setzero_ps() }; - __m512 _ab[2] = { _mm512_setzero_ps(), _mm512_setzero_ps() }; - __m512 _bb[2] = { _mm512_setzero_ps(), _mm512_setzero_ps() }; - for (; i < fullAlignedSize; i += QF) - CosineDistance16f4(a + i, b + i, _aa, _ab, _bb); - for (; i < alignedSize; i += DF) - CosineDistance16f2(a + i, b + i, _aa, _ab, _bb); - if (i < size) - CosineDistance16f2(a + i, b + i, _aa, _ab, _bb, tailMask); - float aa = Avx512f::ExtractSum(_mm512_add_ps(_aa[0], _aa[1])); - float ab = Avx512f::ExtractSum(_mm512_add_ps(_ab[0], _ab[1])); - float bb = Avx512f::ExtractSum(_mm512_add_ps(_bb[0], _bb[1])); - *distance = 1.0f - ab / ::sqrt(aa*bb); - } - - void CosineDistance16f(const uint16_t * a, const uint16_t * b, size_t size, float * distance) - { - if (Aligned(a) && Aligned(b)) - CosineDistance16f(a, b, size, distance); - else - CosineDistance16f(a, b, size, distance); - } - - SIMD_INLINE __m512 Tail(size_t tail) - { - const int32_t mask[DF] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }; - return _mm512_loadu_ps((float*)(mask + tail)); - } - - static void Squares(size_t M, size_t K, const uint16_t * const * A, float * squares) - { - size_t M4 = AlignLo(M, 4); - size_t KF = AlignLo(K, F); - __m512 mask = Tail(K - KF); - size_t i = 0; - for (; i < M4; i += 4) - { - __m512 sums[4] = { _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps() }; - for (size_t k = 0; k < KF; k += F) - { - __m512 a0 = _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(A[i + 0] + k))); - __m512 a1 = _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(A[i + 1] + k))); - __m512 a2 = _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(A[i + 2] + k))); - __m512 a3 = _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(A[i + 3] + k))); - sums[0] = _mm512_fmadd_ps(a0, a0, sums[0]); - sums[1] = _mm512_fmadd_ps(a1, a1, sums[1]); - sums[2] = _mm512_fmadd_ps(a2, a2, sums[2]); - sums[3] = _mm512_fmadd_ps(a3, a3, sums[3]); - } - if (KF < K) - { - size_t k = K - F; - __m512 a0 = _mm512_and_ps(mask, _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(A[i + 0] + k)))); - __m512 a1 = _mm512_and_ps(mask, _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(A[i + 1] + k)))); - __m512 a2 = _mm512_and_ps(mask, 
_mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(A[i + 2] + k)))); - __m512 a3 = _mm512_and_ps(mask, _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(A[i + 3] + k)))); - sums[0] = _mm512_fmadd_ps(a0, a0, sums[0]); - sums[1] = _mm512_fmadd_ps(a1, a1, sums[1]); - sums[2] = _mm512_fmadd_ps(a2, a2, sums[2]); - sums[3] = _mm512_fmadd_ps(a3, a3, sums[3]); - } - _mm_storeu_ps(squares + i, Extract4Sums(sums)); - } - for (; i < M; i += 1) - { - __m512 sum = _mm512_setzero_ps(); - for (size_t k = 0; k < KF; k += F) - { - __m512 a = _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(A[i] + k))); - sum = _mm512_fmadd_ps(a, a, sum); - } - if (KF < K) - { - size_t k = K - F; - __m512 a = _mm512_and_ps(mask, _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(A[i] + k)))); - sum = _mm512_fmadd_ps(a, a, sum); - } - squares[i] = Avx512f::ExtractSum(sum); - } - } - - static void MicroCosineDistances6x4(size_t K, const uint16_t * const * A, const uint16_t * const * B, const float * aa, const float * bb, float * distances, size_t stride) - { - size_t K16 = K & (~15); - __m512 c00 = _mm512_setzero_ps(); - __m512 c01 = _mm512_setzero_ps(); - __m512 c02 = _mm512_setzero_ps(); - __m512 c03 = _mm512_setzero_ps(); - __m512 c10 = _mm512_setzero_ps(); - __m512 c11 = _mm512_setzero_ps(); - __m512 c12 = _mm512_setzero_ps(); - __m512 c13 = _mm512_setzero_ps(); - __m512 c20 = _mm512_setzero_ps(); - __m512 c21 = _mm512_setzero_ps(); - __m512 c22 = _mm512_setzero_ps(); - __m512 c23 = _mm512_setzero_ps(); - __m512 c30 = _mm512_setzero_ps(); - __m512 c31 = _mm512_setzero_ps(); - __m512 c32 = _mm512_setzero_ps(); - __m512 c33 = _mm512_setzero_ps(); - __m512 c40 = _mm512_setzero_ps(); - __m512 c41 = _mm512_setzero_ps(); - __m512 c42 = _mm512_setzero_ps(); - __m512 c43 = _mm512_setzero_ps(); - __m512 c50 = _mm512_setzero_ps(); - __m512 c51 = _mm512_setzero_ps(); - __m512 c52 = _mm512_setzero_ps(); - __m512 c53 = _mm512_setzero_ps(); - __m512 a0, b0, b1, b2, b3; - for (size_t k = 0; k < K16; k += 16) - { - b0 = _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(B[0] + k))); - b1 = _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(B[1] + k))); - b2 = _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(B[2] + k))); - b3 = _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(B[3] + k))); - a0 = _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(A[0] + k))); - c00 = _mm512_fmadd_ps(a0, b0, c00); - c01 = _mm512_fmadd_ps(a0, b1, c01); - c02 = _mm512_fmadd_ps(a0, b2, c02); - c03 = _mm512_fmadd_ps(a0, b3, c03); - a0 = _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(A[1] + k))); - c10 = _mm512_fmadd_ps(a0, b0, c10); - c11 = _mm512_fmadd_ps(a0, b1, c11); - c12 = _mm512_fmadd_ps(a0, b2, c12); - c13 = _mm512_fmadd_ps(a0, b3, c13); - a0 = _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(A[2] + k))); - c20 = _mm512_fmadd_ps(a0, b0, c20); - c21 = _mm512_fmadd_ps(a0, b1, c21); - c22 = _mm512_fmadd_ps(a0, b2, c22); - c23 = _mm512_fmadd_ps(a0, b3, c23); - a0 = _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(A[3] + k))); - c30 = _mm512_fmadd_ps(a0, b0, c30); - c31 = _mm512_fmadd_ps(a0, b1, c31); - c32 = _mm512_fmadd_ps(a0, b2, c32); - c33 = _mm512_fmadd_ps(a0, b3, c33); - a0 = _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(A[4] + k))); - c40 = _mm512_fmadd_ps(a0, b0, c40); - c41 = _mm512_fmadd_ps(a0, b1, c41); - c42 = _mm512_fmadd_ps(a0, b2, c42); - c43 = _mm512_fmadd_ps(a0, b3, c43); - a0 = _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(A[5] + k))); - c50 = _mm512_fmadd_ps(a0, b0, c50); - c51 = _mm512_fmadd_ps(a0, b1, c51); - c52 = _mm512_fmadd_ps(a0, b2, c52); - c53 = _mm512_fmadd_ps(a0, b3, c53); 
- } - if (K16 < K) - { - size_t k = K - 16; - __m512 tail = Tail(K - K16); - b0 = _mm512_and_ps(tail, _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(B[0] + k)))); - b1 = _mm512_and_ps(tail, _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(B[1] + k)))); - b2 = _mm512_and_ps(tail, _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(B[2] + k)))); - b3 = _mm512_and_ps(tail, _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(B[3] + k)))); - a0 = _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(A[0] + k))); - c00 = _mm512_fmadd_ps(a0, b0, c00); - c01 = _mm512_fmadd_ps(a0, b1, c01); - c02 = _mm512_fmadd_ps(a0, b2, c02); - c03 = _mm512_fmadd_ps(a0, b3, c03); - a0 = _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(A[1] + k))); - c10 = _mm512_fmadd_ps(a0, b0, c10); - c11 = _mm512_fmadd_ps(a0, b1, c11); - c12 = _mm512_fmadd_ps(a0, b2, c12); - c13 = _mm512_fmadd_ps(a0, b3, c13); - a0 = _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(A[2] + k))); - c20 = _mm512_fmadd_ps(a0, b0, c20); - c21 = _mm512_fmadd_ps(a0, b1, c21); - c22 = _mm512_fmadd_ps(a0, b2, c22); - c23 = _mm512_fmadd_ps(a0, b3, c23); - a0 = _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(A[3] + k))); - c30 = _mm512_fmadd_ps(a0, b0, c30); - c31 = _mm512_fmadd_ps(a0, b1, c31); - c32 = _mm512_fmadd_ps(a0, b2, c32); - c33 = _mm512_fmadd_ps(a0, b3, c33); - a0 = _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(A[4] + k))); - c40 = _mm512_fmadd_ps(a0, b0, c40); - c41 = _mm512_fmadd_ps(a0, b1, c41); - c42 = _mm512_fmadd_ps(a0, b2, c42); - c43 = _mm512_fmadd_ps(a0, b3, c43); - a0 = _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(A[5] + k))); - c50 = _mm512_fmadd_ps(a0, b0, c50); - c51 = _mm512_fmadd_ps(a0, b1, c51); - c52 = _mm512_fmadd_ps(a0, b2, c52); - c53 = _mm512_fmadd_ps(a0, b3, c53); - } - __m128 _bb = _mm_loadu_ps(bb); - __m128 _1 = _mm_set1_ps(1.0f); - _mm_storeu_ps(distances + 0 * stride, _mm_fnmadd_ps(_mm_rsqrt_ps(_mm_mul_ps(_bb, _mm_set1_ps(aa[0]))), Extract4Sums(c00, c01, c02, c03), _1)); - _mm_storeu_ps(distances + 1 * stride, _mm_fnmadd_ps(_mm_rsqrt_ps(_mm_mul_ps(_bb, _mm_set1_ps(aa[1]))), Extract4Sums(c10, c11, c12, c13), _1)); - _mm_storeu_ps(distances + 2 * stride, _mm_fnmadd_ps(_mm_rsqrt_ps(_mm_mul_ps(_bb, _mm_set1_ps(aa[2]))), Extract4Sums(c20, c21, c22, c23), _1)); - _mm_storeu_ps(distances + 3 * stride, _mm_fnmadd_ps(_mm_rsqrt_ps(_mm_mul_ps(_bb, _mm_set1_ps(aa[3]))), Extract4Sums(c30, c31, c32, c33), _1)); - _mm_storeu_ps(distances + 4 * stride, _mm_fnmadd_ps(_mm_rsqrt_ps(_mm_mul_ps(_bb, _mm_set1_ps(aa[4]))), Extract4Sums(c40, c41, c42, c43), _1)); - _mm_storeu_ps(distances + 5 * stride, _mm_fnmadd_ps(_mm_rsqrt_ps(_mm_mul_ps(_bb, _mm_set1_ps(aa[5]))), Extract4Sums(c50, c51, c52, c53), _1)); - } - - static void MicroCosineDistances6x1(size_t K, const uint16_t * const * A, const uint16_t * const * B, const float * aa, const float * bb, float * distances, size_t stride) - { - size_t K16 = K & (~15); - __m512 c00 = _mm512_setzero_ps(); - __m512 c10 = _mm512_setzero_ps(); - __m512 c20 = _mm512_setzero_ps(); - __m512 c30 = _mm512_setzero_ps(); - __m512 c40 = _mm512_setzero_ps(); - __m512 c50 = _mm512_setzero_ps(); - __m512 a0, b0; - for (size_t k = 0; k < K16; k += 16) - { - b0 = _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(B[0] + k))); - a0 = _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(A[0] + k))); - c00 = _mm512_fmadd_ps(a0, b0, c00); - a0 = _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(A[1] + k))); - c10 = _mm512_fmadd_ps(a0, b0, c10); - a0 = _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(A[2] + k))); - c20 = _mm512_fmadd_ps(a0, b0, c20); - a0 = 
_mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(A[3] + k))); - c30 = _mm512_fmadd_ps(a0, b0, c30); - a0 = _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(A[4] + k))); - c40 = _mm512_fmadd_ps(a0, b0, c40); - a0 = _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(A[5] + k))); - c50 = _mm512_fmadd_ps(a0, b0, c50); - } - if (K16 < K) - { - size_t k = K - 16; - __m512 tail = Tail(K - K16); - b0 = _mm512_and_ps(tail, _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(B[0] + k)))); - a0 = _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(A[0] + k))); - c00 = _mm512_fmadd_ps(a0, b0, c00); - a0 = _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(A[1] + k))); - c10 = _mm512_fmadd_ps(a0, b0, c10); - a0 = _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(A[2] + k))); - c20 = _mm512_fmadd_ps(a0, b0, c20); - a0 = _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(A[3] + k))); - c30 = _mm512_fmadd_ps(a0, b0, c30); - a0 = _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(A[4] + k))); - c40 = _mm512_fmadd_ps(a0, b0, c40); - a0 = _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(A[5] + k))); - c50 = _mm512_fmadd_ps(a0, b0, c50); - } - distances[0 * stride] = 1.0f - Avx512f::ExtractSum(c00) / sqrt(bb[0] * aa[0]); - distances[1 * stride] = 1.0f - Avx512f::ExtractSum(c10) / sqrt(bb[0] * aa[1]); - distances[2 * stride] = 1.0f - Avx512f::ExtractSum(c20) / sqrt(bb[0] * aa[2]); - distances[3 * stride] = 1.0f - Avx512f::ExtractSum(c30) / sqrt(bb[0] * aa[3]); - distances[4 * stride] = 1.0f - Avx512f::ExtractSum(c40) / sqrt(bb[0] * aa[4]); - distances[5 * stride] = 1.0f - Avx512f::ExtractSum(c50) / sqrt(bb[0] * aa[5]); - } - - static void MicroCosineDistances3x4(size_t K, const uint16_t * const * A, const uint16_t * const * B, const float * aa, const float * bb, float * distances, size_t stride) - { - size_t K16 = K & (~15); - __m512 c00 = _mm512_setzero_ps(); - __m512 c01 = _mm512_setzero_ps(); - __m512 c02 = _mm512_setzero_ps(); - __m512 c03 = _mm512_setzero_ps(); - __m512 c10 = _mm512_setzero_ps(); - __m512 c11 = _mm512_setzero_ps(); - __m512 c12 = _mm512_setzero_ps(); - __m512 c13 = _mm512_setzero_ps(); - __m512 c20 = _mm512_setzero_ps(); - __m512 c21 = _mm512_setzero_ps(); - __m512 c22 = _mm512_setzero_ps(); - __m512 c23 = _mm512_setzero_ps(); - __m512 a0, a1, a2, b0; - for (size_t k = 0; k < K16; k += 16) - { - a0 = _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(A[0] + k))); - a1 = _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(A[1] + k))); - a2 = _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(A[2] + k))); - b0 = _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(B[0] + k))); - c00 = _mm512_fmadd_ps(a0, b0, c00); - c10 = _mm512_fmadd_ps(a1, b0, c10); - c20 = _mm512_fmadd_ps(a2, b0, c20); - b0 = _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(B[1] + k))); - c01 = _mm512_fmadd_ps(a0, b0, c01); - c11 = _mm512_fmadd_ps(a1, b0, c11); - c21 = _mm512_fmadd_ps(a2, b0, c21); - b0 = _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(B[2] + k))); - c02 = _mm512_fmadd_ps(a0, b0, c02); - c12 = _mm512_fmadd_ps(a1, b0, c12); - c22 = _mm512_fmadd_ps(a2, b0, c22); - b0 = _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(B[3] + k))); - c03 = _mm512_fmadd_ps(a0, b0, c03); - c13 = _mm512_fmadd_ps(a1, b0, c13); - c23 = _mm512_fmadd_ps(a2, b0, c23); - } - if (K16 < K) - { - size_t k = K - 16; - __m512 tail = Tail(K - K16); - a0 = _mm512_and_ps(tail, _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(A[0] + k)))); - a1 = _mm512_and_ps(tail, _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(A[1] + k)))); - a2 = _mm512_and_ps(tail, _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(A[2] + k)))); - b0 = 
_mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(B[0] + k))); - c00 = _mm512_fmadd_ps(a0, b0, c00); - c10 = _mm512_fmadd_ps(a1, b0, c10); - c20 = _mm512_fmadd_ps(a2, b0, c20); - b0 = _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(B[1] + k))); - c01 = _mm512_fmadd_ps(a0, b0, c01); - c11 = _mm512_fmadd_ps(a1, b0, c11); - c21 = _mm512_fmadd_ps(a2, b0, c21); - b0 = _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(B[2] + k))); - c02 = _mm512_fmadd_ps(a0, b0, c02); - c12 = _mm512_fmadd_ps(a1, b0, c12); - c22 = _mm512_fmadd_ps(a2, b0, c22); - b0 = _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(B[3] + k))); - c03 = _mm512_fmadd_ps(a0, b0, c03); - c13 = _mm512_fmadd_ps(a1, b0, c13); - c23 = _mm512_fmadd_ps(a2, b0, c23); - } - __m128 _bb = _mm_loadu_ps(bb); - __m128 _1 = _mm_set1_ps(1.0f); - _mm_storeu_ps(distances + 0 * stride, _mm_fnmadd_ps(_mm_rsqrt_ps(_mm_mul_ps(_bb, _mm_set1_ps(aa[0]))), Extract4Sums(c00, c01, c02, c03), _1)); - _mm_storeu_ps(distances + 1 * stride, _mm_fnmadd_ps(_mm_rsqrt_ps(_mm_mul_ps(_bb, _mm_set1_ps(aa[1]))), Extract4Sums(c10, c11, c12, c13), _1)); - _mm_storeu_ps(distances + 2 * stride, _mm_fnmadd_ps(_mm_rsqrt_ps(_mm_mul_ps(_bb, _mm_set1_ps(aa[2]))), Extract4Sums(c20, c21, c22, c23), _1)); - } - - static void MicroCosineDistances3x1(size_t K, const uint16_t * const * A, const uint16_t * const * B, const float * aa, const float * bb, float * distances, size_t stride) - { - size_t K16 = K & (~15); - __m512 c00 = _mm512_setzero_ps(); - __m512 c10 = _mm512_setzero_ps(); - __m512 c20 = _mm512_setzero_ps(); - __m512 a0, b0; - for (size_t k = 0; k < K16; k += 16) - { - b0 = _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(B[0] + k))); - a0 = _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(A[0] + k))); - c00 = _mm512_fmadd_ps(a0, b0, c00); - a0 = _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(A[1] + k))); - c10 = _mm512_fmadd_ps(a0, b0, c10); - a0 = _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(A[2] + k))); - c20 = _mm512_fmadd_ps(a0, b0, c20); - } - if (K16 < K) - { - size_t k = K - 16; - __m512 tail = Tail(K - K16); - b0 = _mm512_and_ps(tail, _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(B[0] + k)))); - a0 = _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(A[0] + k))); - c00 = _mm512_fmadd_ps(a0, b0, c00); - a0 = _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(A[1] + k))); - c10 = _mm512_fmadd_ps(a0, b0, c10); - a0 = _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(A[2] + k))); - c20 = _mm512_fmadd_ps(a0, b0, c20); - } - distances[0 * stride] = 1.0f - Avx512f::ExtractSum(c00) / sqrt(bb[0] * aa[0]); - distances[1 * stride] = 1.0f - Avx512f::ExtractSum(c10) / sqrt(bb[0] * aa[1]); - distances[2 * stride] = 1.0f - Avx512f::ExtractSum(c20) / sqrt(bb[0] * aa[2]); - } - - static void MicroCosineDistances1x4(size_t K, const uint16_t * const * A, const uint16_t * const * B, const float * aa, const float * bb, float * distances, size_t stride) - { - size_t K16 = K & (~15); - __m512 c00 = _mm512_setzero_ps(); - __m512 c01 = _mm512_setzero_ps(); - __m512 c02 = _mm512_setzero_ps(); - __m512 c03 = _mm512_setzero_ps(); - __m512 a0, b0; - for (size_t k = 0; k < K16; k += 16) - { - a0 = _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(A[0] + k))); - b0 = _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(B[0] + k))); - c00 = _mm512_fmadd_ps(a0, b0, c00); - b0 = _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(B[1] + k))); - c01 = _mm512_fmadd_ps(a0, b0, c01); - b0 = _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(B[2] + k))); - c02 = _mm512_fmadd_ps(a0, b0, c02); - b0 = _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(B[3] + k))); - 
c03 = _mm512_fmadd_ps(a0, b0, c03); - } - if (K16 < K) - { - size_t k = K - 16; - __m512 tail = Tail(K - K16); - a0 = _mm512_and_ps(tail, _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(A[0] + k)))); - b0 = _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(B[0] + k))); - c00 = _mm512_fmadd_ps(a0, b0, c00); - b0 = _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(B[1] + k))); - c01 = _mm512_fmadd_ps(a0, b0, c01); - b0 = _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(B[2] + k))); - c02 = _mm512_fmadd_ps(a0, b0, c02); - b0 = _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(B[3] + k))); - c03 = _mm512_fmadd_ps(a0, b0, c03); - } - __m128 _bb = _mm_loadu_ps(bb); - __m128 _1 = _mm_set1_ps(1.0f); - _mm_storeu_ps(distances + 0 * stride, _mm_fnmadd_ps(_mm_rsqrt_ps(_mm_mul_ps(_bb, _mm_set1_ps(aa[0]))), Extract4Sums(c00, c01, c02, c03), _1)); - } - - static void MacroCosineDistances(size_t M, size_t N, size_t K, const uint16_t * const * A, const uint16_t * const * B, const float * aa, const float * bb, float * distances, size_t stride) - { - size_t M3 = AlignLoAny(M, 3); - size_t M6 = AlignLoAny(M, 6); - size_t N4 = AlignLo(N, 4); - size_t i = 0; - for (; i < M6; i += 6) - { - size_t j = 0; - for (; j < N4; j += 4) - MicroCosineDistances6x4(K, A + i, B + j, aa + i, bb + j, distances + j, stride); - for (; j < N; j += 1) - MicroCosineDistances6x1(K, A + i, B + j, aa + i, bb + j, distances + j, stride); - distances += 6 * stride; - } - for (; i < M3; i += 3) - { - size_t j = 0; - for (; j < N4; j += 4) - MicroCosineDistances3x4(K, A + i, B + j, aa + i, bb + j, distances + j, stride); - for (; j < N; j += 1) - MicroCosineDistances3x1(K, A + i, B + j, aa + i, bb + j, distances + j, stride); - distances += 3 * stride; - } - for (; i < M; i++) - { - size_t j = 0; - for (; j < N4; j += 4) - MicroCosineDistances1x4(K, A + i, B + j, aa + i, bb + j, distances + j, stride); - for (; j < N; j += 1) - CosineDistance16f(A[i], B[j], K, distances + j); - distances += 1 * stride; - } - } - - void CosineDistancesMxNa16f(size_t M, size_t N, size_t K, const uint16_t * const * A, const uint16_t * const * B, float * distances) - { - const size_t L2 = Base::AlgCacheL2(); - size_t mN = AlignLoAny(L2 / 2 / K, 4); - size_t mM = AlignLoAny(L2 / 2 / K, 6); - Array32f aa(M), bb(N); - for (size_t i = 0; i < M; i += mM) - { - size_t dM = Simd::Min(M, i + mM) - i; - Squares(dM, K, A + i, aa.data + i); - for (size_t j = 0; j < N; j += mN) - { - size_t dN = Simd::Min(N, j + mN) - j; - if (i == 0) - Squares(dN, K, B + j, bb.data + j); - MacroCosineDistances(dM, dN, K, A + i, B + j, aa.data + i, bb.data + j, distances + i * N + j, N); - } - } - } - } -#endif// SIMD_AVX512BW_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx512bwFloat32.cpp b/src/3rd/Simd/Simd/SimdAvx512bwFloat32.cpp deleted file mode 100644 index 165a1ed4..00000000 --- a/src/3rd/Simd/Simd/SimdAvx512bwFloat32.cpp +++ /dev/null @@ -1,179 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2018 Yermalayeu Ihar. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdExtract.h" - -namespace Simd -{ -#ifdef SIMD_AVX512BW_ENABLE - namespace Avx512bw - { - template SIMD_INLINE __m512i Float32ToUint8(const float * src, const __m512 & lower, const __m512 & upper, const __m512 & boost, __mmask16 tail = -1) - { - return _mm512_cvtps_epi32(_mm512_mul_ps(_mm512_sub_ps(_mm512_min_ps(_mm512_max_ps((Avx512f::Load(src, tail)), lower), upper), lower), boost)); - } - - template SIMD_INLINE void Float32ToUint8(const float * src, const __m512 & lower, const __m512 & upper, const __m512 & boost, uint8_t * dst, const __mmask16 * srcTails, __mmask64 dstTail) - { - __m512i d0 = Float32ToUint8(src + F * 0, lower, upper, boost, srcTails[0]); - __m512i d1 = Float32ToUint8(src + F * 1, lower, upper, boost, srcTails[1]); - __m512i d2 = Float32ToUint8(src + F * 2, lower, upper, boost, srcTails[2]); - __m512i d3 = Float32ToUint8(src + F * 3, lower, upper, boost, srcTails[3]); - Store(dst, _mm512_permutexvar_epi32(K32_PERMUTE_FOR_TWO_UNPACK, _mm512_packus_epi16(_mm512_packs_epi32(d0, d1), _mm512_packs_epi32(d2, d3))), dstTail); - } - - template void Float32ToUint8(const float * src, size_t size, const float * lower, const float * upper, uint8_t * dst) - { - if (align) - assert(Aligned(src) && Aligned(dst)); - - __m512 _lower = _mm512_set1_ps(lower[0]); - __m512 _upper = _mm512_set1_ps(upper[0]); - __m512 boost = _mm512_set1_ps(255.0f / (upper[0] - lower[0])); - - size_t alignedSize = AlignLo(size, A); - __mmask16 srcTailMasks[4]; - for (size_t c = 0; c < 4; ++c) - srcTailMasks[c] = TailMask16(size - alignedSize - F*c); - __mmask64 dstTailMask = TailMask64(size - alignedSize); - - size_t i = 0; - for (; i < alignedSize; i += A) - Float32ToUint8(src + i, _lower, _upper, boost, dst + i, srcTailMasks, dstTailMask); - if (i < size) - Float32ToUint8(src + i, _lower, _upper, boost, dst + i, srcTailMasks, dstTailMask); - } - - void Float32ToUint8(const float * src, size_t size, const float * lower, const float * upper, uint8_t * dst) - { - if (Aligned(src) && Aligned(dst)) - Float32ToUint8(src, size, lower, upper, dst); - else - Float32ToUint8(src, size, lower, upper, dst); - } - - template SIMD_INLINE void Uint8ToFloat32(const __m128i & value, const __m512 & lower, const __m512 & boost, float * dst, __mmask16 tail) - { - Avx512f::Store(dst, _mm512_add_ps(_mm512_mul_ps(_mm512_cvtepi32_ps(_mm512_cvtepu8_epi32(value)), boost), lower), tail); - } - - 
template SIMD_INLINE void Uint8ToFloat32(const uint8_t * src, const __m512 & lower, const __m512 & boost, float * dst, __mmask64 srcTail, const __mmask16 * dstTails) - { - __m512i _src = Load(src, srcTail); - Uint8ToFloat32(_mm512_extracti32x4_epi32(_src, 0), lower, boost, dst + 0 * F, dstTails[0]); - Uint8ToFloat32(_mm512_extracti32x4_epi32(_src, 1), lower, boost, dst + 1 * F, dstTails[1]); - Uint8ToFloat32(_mm512_extracti32x4_epi32(_src, 2), lower, boost, dst + 2 * F, dstTails[2]); - Uint8ToFloat32(_mm512_extracti32x4_epi32(_src, 3), lower, boost, dst + 3 * F, dstTails[3]); - } - - template void Uint8ToFloat32(const uint8_t * src, size_t size, const float * lower, const float * upper, float * dst) - { - if (align) - assert(Aligned(src) && Aligned(dst)); - - __m512 _lower = _mm512_set1_ps(lower[0]); - __m512 boost = _mm512_set1_ps((upper[0] - lower[0]) / 255.0f); - - size_t alignedSize = AlignLo(size, A); - __mmask64 srcTailMask = TailMask64(size - alignedSize); - __mmask16 dstTailMasks[4]; - for (size_t c = 0; c < 4; ++c) - dstTailMasks[c] = TailMask16(size - alignedSize - F*c); - - size_t i = 0; - for (; i < alignedSize; i += A) - Uint8ToFloat32(src + i, _lower, boost, dst + i, srcTailMask, dstTailMasks); - if (i < size) - Uint8ToFloat32(src + i, _lower, boost, dst + i, srcTailMask, dstTailMasks); - } - - void Uint8ToFloat32(const uint8_t * src, size_t size, const float * lower, const float * upper, float * dst) - { - if (Aligned(src) && Aligned(dst)) - Uint8ToFloat32(src, size, lower, upper, dst); - else - Uint8ToFloat32(src, size, lower, upper, dst); - } - - template void CosineDistance32f(const float * a, const float * b, size_t size, float * distance) - { - if (align) - assert(Aligned(a) && Aligned(b)); - - size_t partialAlignedSize = AlignLo(size, F); - size_t fullAlignedSize = AlignLo(size, DF); - size_t i = 0; - __m512 _aa[2] = { _mm512_setzero_ps(), _mm512_setzero_ps() }; - __m512 _ab[2] = { _mm512_setzero_ps(), _mm512_setzero_ps() }; - __m512 _bb[2] = { _mm512_setzero_ps(), _mm512_setzero_ps() }; - if (fullAlignedSize) - { - for (; i < fullAlignedSize; i += DF) - { - __m512 a0 = Avx512f::Load(a + i + 0 * F); - __m512 b0 = Avx512f::Load(b + i + 0 * F); - _aa[0] = _mm512_fmadd_ps(a0, a0, _aa[0]); - _ab[0] = _mm512_fmadd_ps(a0, b0, _ab[0]); - _bb[0] = _mm512_fmadd_ps(b0, b0, _bb[0]); - __m512 a1 = Avx512f::Load(a + i + 1 * F); - __m512 b1 = Avx512f::Load(b + i + 1 * F); - _aa[1] = _mm512_fmadd_ps(a1, a1, _aa[1]); - _ab[1] = _mm512_fmadd_ps(a1, b1, _ab[1]); - _bb[1] = _mm512_fmadd_ps(b1, b1, _bb[1]); - } - _aa[0] = _mm512_add_ps(_aa[0], _aa[1]); - _ab[0] = _mm512_add_ps(_ab[0], _ab[1]); - _bb[0] = _mm512_add_ps(_bb[0], _bb[1]); - } - for (; i < partialAlignedSize; i += F) - { - __m512 a0 = Avx512f::Load(a + i); - __m512 b0 = Avx512f::Load(b + i); - _aa[0] = _mm512_fmadd_ps(a0, a0, _aa[0]); - _ab[0] = _mm512_fmadd_ps(a0, b0, _ab[0]); - _bb[0] = _mm512_fmadd_ps(b0, b0, _bb[0]); - } - float aa = Avx512f::ExtractSum(_aa[0]), ab = Avx512f::ExtractSum(_ab[0]), bb = Avx512f::ExtractSum(_bb[0]); - for (; i < size; ++i) - { - float _a = a[i]; - float _b = b[i]; - aa += _a * _a; - ab += _a * _b; - bb += _b * _b; - } - *distance = 1.0f - ab / ::sqrt(aa*bb); - } - - void CosineDistance32f(const float * a, const float * b, size_t size, float * distance) - { - if (Aligned(a) && Aligned(b)) - CosineDistance32f(a, b, size, distance); - else - CosineDistance32f(a, b, size, distance); - } - } -#endif// SIMD_AVX512BW_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx512bwGaussianBlur3x3.cpp 
b/src/3rd/Simd/Simd/SimdAvx512bwGaussianBlur3x3.cpp deleted file mode 100644 index ddec9b1e..00000000 --- a/src/3rd/Simd/Simd/SimdAvx512bwGaussianBlur3x3.cpp +++ /dev/null @@ -1,154 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" - -namespace Simd -{ -#ifdef SIMD_AVX512BW_ENABLE - namespace Avx512bw - { - namespace - { - struct Buffer - { - Buffer(size_t width) - { - _p = Allocate(sizeof(uint16_t) * 3 * width); - src0 = (uint16_t*)_p; - src1 = src0 + width; - src2 = src1 + width; - } - - ~Buffer() - { - Free(_p); - } - - uint16_t * src0; - uint16_t * src1; - uint16_t * src2; - private: - void * _p; - }; - } - - SIMD_INLINE __m512i DivideBy16(__m512i value) - { - return _mm512_srli_epi16(_mm512_add_epi16(value, K16_0008), 4); - } - - const __m512i K8_01_02 = SIMD_MM512_SET2_EPI8(0x01, 0x02); - - template SIMD_INLINE __m512i BinomialSumUnpackedU8(__m512i a[3]) - { - return _mm512_add_epi16(_mm512_maddubs_epi16(UnpackU8(a[0], a[1]), K8_01_02), UnpackU8(a[2])); - } - - template SIMD_INLINE void BlurCol(__m512i a[3], uint16_t * b) - { - Store(b + 00, BinomialSumUnpackedU8<0>(a)); - Store(b + HA, BinomialSumUnpackedU8<1>(a)); - } - - template void BlurCol(const uint8_t * src, size_t aligned, size_t full, uint16_t * dst) - { - __m512i a[3]; - LoadNose3(src, a); - BlurCol(a, dst); - for (size_t col = A; col < aligned; col += A) - { - LoadBody3(src + col, a); - BlurCol(a, dst + col); - } - LoadTail3(src + full - A, a); - BlurCol(a, dst + aligned); - } - - template SIMD_INLINE __m512i BlurRow16(const Buffer & buffer, size_t offset) - { - return DivideBy16(BinomialSum16( - Load(buffer.src0 + offset), - Load(buffer.src1 + offset), - Load(buffer.src2 + offset))); - } - - template SIMD_INLINE __m512i BlurRow(const Buffer & buffer, size_t offset) - { - return _mm512_packus_epi16(BlurRow16(buffer, offset), BlurRow16(buffer, offset + HA)); - } - - template void GaussianBlur3x3(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride) - { - assert(step*(width - 1) >= Avx2::A); - if (align) - assert(Aligned(src) && Aligned(srcStride) && Aligned(step*width) && Aligned(dst) && Aligned(dstStride)); - - size_t size = step*width; - size_t bodySize = Simd::AlignHi(size, A) - A; - - Buffer buffer(Simd::AlignHi(size, A)); - BlurCol(src, bodySize, size, buffer.src0); - memcpy(buffer.src1, 
buffer.src0, sizeof(uint16_t)*(bodySize + A)); - - for (size_t row = 0; row < height; ++row, dst += dstStride) - { - const uint8_t * src2 = src + srcStride*(row + 1); - if (row >= height - 2) - src2 = src + srcStride*(height - 1); - - BlurCol(src2, bodySize, size, buffer.src2); - - for (size_t col = 0; col < bodySize; col += A) - Store(dst + col, BlurRow(buffer, col)); - Store(dst + size - A, BlurRow(buffer, bodySize)); - - Swap(buffer.src0, buffer.src2); - Swap(buffer.src0, buffer.src1); - } - } - - template void GaussianBlur3x3(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t channelCount, uint8_t * dst, size_t dstStride) - { - assert(channelCount > 0 && channelCount <= 4); - - switch (channelCount) - { - case 1: GaussianBlur3x3(src, srcStride, width, height, dst, dstStride); break; - case 2: GaussianBlur3x3(src, srcStride, width, height, dst, dstStride); break; - case 3: GaussianBlur3x3(src, srcStride, width, height, dst, dstStride); break; - case 4: GaussianBlur3x3(src, srcStride, width, height, dst, dstStride); break; - } - } - - void GaussianBlur3x3(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t channelCount, uint8_t * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride) && Aligned(channelCount*width) && Aligned(dst) && Aligned(dstStride)) - GaussianBlur3x3(src, srcStride, width, height, channelCount, dst, dstStride); - else - GaussianBlur3x3(src, srcStride, width, height, channelCount, dst, dstStride); - } - } -#endif// SIMD_AVX512BW_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx512bwGrayToBgr.cpp b/src/3rd/Simd/Simd/SimdAvx512bwGrayToBgr.cpp deleted file mode 100644 index 2f3d447c..00000000 --- a/src/3rd/Simd/Simd/SimdAvx512bwGrayToBgr.cpp +++ /dev/null @@ -1,88 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdStore.h" -#include "Simd/SimdMemory.h" -#include "Simd/SimdConversion.h" - -namespace Simd -{ -#ifdef SIMD_AVX512BW_ENABLE - namespace Avx512bw - { - template SIMD_INLINE void GrayToBgr(const uint8_t * gray, uint8_t * bgr, const __mmask64 tails[4]) - { - const __m512i gray0 = Load(gray + 0 * A, tails[0]); - Store(bgr + 0 * A, GrayToBgr<0>(gray0), tails[1]); - Store(bgr + 1 * A, GrayToBgr<1>(gray0), tails[2]); - Store(bgr + 2 * A, GrayToBgr<2>(gray0), tails[3]); - } - - template SIMD_INLINE void GrayToBgr2(const uint8_t * gray, uint8_t * bgr) - { - const __m512i gray0 = Load(gray + 0 * A); - Store(bgr + 0 * A, GrayToBgr<0>(gray0)); - Store(bgr + 1 * A, GrayToBgr<1>(gray0)); - Store(bgr + 2 * A, GrayToBgr<2>(gray0)); - const __m512i gray1 = Load(gray + 1 * A); - Store(bgr + 3 * A, GrayToBgr<0>(gray1)); - Store(bgr + 4 * A, GrayToBgr<1>(gray1)); - Store(bgr + 5 * A, GrayToBgr<2>(gray1)); - } - - template void GrayToBgr(const uint8_t * gray, size_t width, size_t height, size_t grayStride, uint8_t *bgr, size_t bgrStride) - { - if (align) - assert(Aligned(bgr) && Aligned(bgrStride) && Aligned(gray) && Aligned(grayStride)); - - size_t alignedWidth = AlignLo(width, A); - __mmask64 tailMasks[4]; - tailMasks[0] = TailMask64(width - alignedWidth); - for (size_t c = 0; c < 3; ++c) - tailMasks[1 + c] = TailMask64((width - alignedWidth) * 3 - A*c); - size_t fullAlignedWidth = AlignLo(width, DA); - - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < fullAlignedWidth; col += DA) - GrayToBgr2(gray + col, bgr + col * 3); - for (; col < alignedWidth; col += A) - GrayToBgr(gray + col, bgr + col * 3, tailMasks); - if (col < width) - GrayToBgr(gray + col, bgr + col * 3, tailMasks); - gray += grayStride; - bgr += bgrStride; - } - } - - void GrayToBgr(const uint8_t *gray, size_t width, size_t height, size_t grayStride, uint8_t *bgr, size_t bgrStride) - { - if (Aligned(bgr) && Aligned(gray) && Aligned(bgrStride) && Aligned(grayStride)) - GrayToBgr(gray, width, height, grayStride, bgr, bgrStride); - else - GrayToBgr(gray, width, height, grayStride, bgr, bgrStride); - } - } -#endif// SIMD_AVX2_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx512bwGrayToBgra.cpp b/src/3rd/Simd/Simd/SimdAvx512bwGrayToBgra.cpp deleted file mode 100644 index 841e9ee8..00000000 --- a/src/3rd/Simd/Simd/SimdAvx512bwGrayToBgra.cpp +++ /dev/null @@ -1,103 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2018 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdStore.h" -#include "Simd/SimdMemory.h" - -namespace Simd -{ -#ifdef SIMD_AVX512BW_ENABLE - namespace Avx512bw - { - template SIMD_INLINE void GrayToBgra(const uint8_t * gray, const __m512i & alpha, uint8_t * bgra, const __mmask64 tails[5]) - { - __m512i gray0 = _mm512_permutexvar_epi32(K32_PERMUTE_FOR_TWO_UNPACK, (Load(gray + 0, tails[0]))); - __m512i bg0 = _mm512_unpacklo_epi8(gray0, gray0); - __m512i bg1 = _mm512_unpackhi_epi8(gray0, gray0); - __m512i ra0 = _mm512_unpacklo_epi8(gray0, alpha); - __m512i ra1 = _mm512_unpackhi_epi8(gray0, alpha); - Store(bgra + 0 * A, _mm512_unpacklo_epi16(bg0, ra0), tails[1]); - Store(bgra + 1 * A, _mm512_unpackhi_epi16(bg0, ra0), tails[2]); - Store(bgra + 2 * A, _mm512_unpacklo_epi16(bg1, ra1), tails[3]); - Store(bgra + 3 * A, _mm512_unpackhi_epi16(bg1, ra1), tails[4]); - } - - template SIMD_INLINE void GrayToBgra2(const uint8_t * gray, const __m512i & alpha, uint8_t * bgra) - { - __m512i gray0 = _mm512_permutexvar_epi32(K32_PERMUTE_FOR_TWO_UNPACK, Load(gray + 0)); - __m512i bg0 = _mm512_unpacklo_epi8(gray0, gray0); - __m512i bg1 = _mm512_unpackhi_epi8(gray0, gray0); - __m512i ra0 = _mm512_unpacklo_epi8(gray0, alpha); - __m512i ra1 = _mm512_unpackhi_epi8(gray0, alpha); - Store(bgra + 0 * A, _mm512_unpacklo_epi16(bg0, ra0)); - Store(bgra + 1 * A, _mm512_unpackhi_epi16(bg0, ra0)); - Store(bgra + 2 * A, _mm512_unpacklo_epi16(bg1, ra1)); - Store(bgra + 3 * A, _mm512_unpackhi_epi16(bg1, ra1)); - - __m512i gray1 = _mm512_permutexvar_epi32(K32_PERMUTE_FOR_TWO_UNPACK, Load(gray + A)); - __m512i bg2 = _mm512_unpacklo_epi8(gray1, gray1); - __m512i bg3 = _mm512_unpackhi_epi8(gray1, gray1); - __m512i ra2 = _mm512_unpacklo_epi8(gray1, alpha); - __m512i ra3 = _mm512_unpackhi_epi8(gray1, alpha); - Store(bgra + 4 * A, _mm512_unpacklo_epi16(bg2, ra2)); - Store(bgra + 5 * A, _mm512_unpackhi_epi16(bg2, ra2)); - Store(bgra + 6 * A, _mm512_unpacklo_epi16(bg3, ra3)); - Store(bgra + 7 * A, _mm512_unpackhi_epi16(bg3, ra3)); - } - - template void GrayToBgra(const uint8_t *gray, size_t width, size_t height, size_t grayStride, uint8_t *bgra, size_t bgraStride, uint8_t alpha) - { - if (align) - assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(gray) && Aligned(grayStride)); - - __m512i _alpha = _mm512_set1_epi8(alpha); - size_t alignedWidth = AlignLo(width, A); - __mmask64 tailMasks[5]; - tailMasks[0] = TailMask64(width - alignedWidth); - for (size_t c = 0; c < 4; ++c) - tailMasks[1 + c] = TailMask64((width - alignedWidth) * 4 - A*c); - size_t fullAlignedWidth = AlignLo(width, DA); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < fullAlignedWidth; col += DA) - GrayToBgra2(gray + col, _alpha, bgra + col * 4); - for (; col < alignedWidth; col += A) - GrayToBgra(gray + col, _alpha, bgra + col * 4, tailMasks); - if (col < width) - GrayToBgra(gray + col, _alpha, bgra + col * 4, tailMasks); - gray += grayStride; - bgra += bgraStride; - } - } - - void GrayToBgra(const uint8_t *gray, size_t width, size_t height, size_t grayStride, uint8_t *bgra, size_t bgraStride, uint8_t alpha) - { - if (Aligned(bgra) && Aligned(gray) && Aligned(bgraStride) && Aligned(grayStride)) - GrayToBgra(gray, width, height, grayStride, bgra, bgraStride, alpha); - else - GrayToBgra(gray, 
width, height, grayStride, bgra, bgraStride, alpha); - } - } -#endif// SIMD_AVX512BW_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx512bwHistogram.cpp b/src/3rd/Simd/Simd/SimdAvx512bwHistogram.cpp deleted file mode 100644 index 727342f2..00000000 --- a/src/3rd/Simd/Simd/SimdAvx512bwHistogram.cpp +++ /dev/null @@ -1,369 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2019 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdCompare.h" -#include "Simd/SimdBase.h" - -namespace Simd -{ -#ifdef SIMD_AVX512BW_ENABLE - namespace Avx512bw - { - namespace - { - template struct Buffer - { - Buffer(size_t rowSize, size_t histogramSize) - { - _p = Allocate(sizeof(T)*rowSize + 4 * sizeof(uint32_t)*histogramSize); - v = (T*)_p; - h[0] = (uint32_t *)(v + rowSize); - h[1] = h[0] + histogramSize; - h[2] = h[1] + histogramSize; - h[3] = h[2] + histogramSize; - memset(h[0], 0, 4 * sizeof(uint32_t)*histogramSize); - } - - ~Buffer() - { - Free(_p); - } - - T * v; - uint32_t * h[4]; - private: - void *_p; - }; - } - - template SIMD_INLINE __m512i AbsSecondDerivative(const uint8_t * src, ptrdiff_t step) - { - const __m512i s0 = Load(src - step); - const __m512i s1 = Load(src); - const __m512i s2 = Load(src + step); - return AbsDifferenceU8(_mm512_avg_epu8(s0, s2), s1); - } - - template SIMD_INLINE void AbsSecondDerivative(const uint8_t * src, ptrdiff_t colStep, ptrdiff_t rowStep, uint8_t * dst) - { - const __m512i sdX = AbsSecondDerivative(src, colStep); - const __m512i sdY = AbsSecondDerivative(src, rowStep); - Store(dst, _mm512_max_epu8(sdY, sdX)); - } - - SIMD_INLINE void SumHistograms(uint32_t * src, size_t start, uint32_t * dst) - { - uint32_t * src0 = src + start; - uint32_t * src1 = src0 + start + HISTOGRAM_SIZE; - uint32_t * src2 = src1 + start + HISTOGRAM_SIZE; - uint32_t * src3 = src2 + start + HISTOGRAM_SIZE; - for (size_t i = 0; i < HISTOGRAM_SIZE; i += F) - Store(dst + i, _mm512_add_epi32(_mm512_add_epi32(Load(src0 + i), Load(src1 + i)), _mm512_add_epi32(Load(src2 + i), Load(src3 + i)))); - } - -#ifdef __GNUC__ - //#define SIMD_USE_GATHER_AND_SCATTER_FOR_HISTOGRAM // low performance -#endif - -#if defined(SIMD_USE_GATHER_AND_SCATTER_FOR_HISTOGRAM) - const __m512i K32_TO_HISTOGRAMS = SIMD_MM512_SETR_EPI32(0x000, 0x100, 0x200, 0x300, 0x000, 0x100, 0x200, 0x300, 0x000, 0x100, 0x200, 0x300, 0x000, 0x100, 0x200, 0x300); - - SIMD_INLINE void AddToHistogram(__m128i 
index, uint32_t * histogram) - { - __m128i hist = _mm_i32gather_epi32((int*)histogram, index, 4); - hist = _mm_add_epi32(hist, Sse2::K32_00000001); - _mm_i32scatter_epi32((int*)histogram, index, hist, 4); - } -#endif - - template void AbsSecondDerivativeHistogram(const uint8_t *src, size_t width, size_t height, size_t stride, - size_t step, size_t indent, uint32_t * histogram) - { - Buffer buffer(AlignHi(width, A), HISTOGRAM_SIZE); - buffer.v += indent; - src += indent*(stride + 1); - height -= 2 * indent; - width -= 2 * indent; - - ptrdiff_t bodyStart = (uint8_t*)AlignHi(buffer.v, A) - buffer.v; - ptrdiff_t bodyEnd = bodyStart + AlignLo(width - bodyStart, A); - size_t rowStep = step*stride; - size_t alignedWidth = Simd::AlignLo(width, 4); - size_t fullAlignedWidth = Simd::AlignLo(width, Sse2::A); - for (size_t row = 0; row < height; ++row) - { - if (bodyStart) - AbsSecondDerivative(src, step, rowStep, buffer.v); - for (ptrdiff_t col = bodyStart; col < bodyEnd; col += A) - AbsSecondDerivative(src + col, step, rowStep, buffer.v + col); - if (width != (size_t)bodyEnd) - AbsSecondDerivative(src + width - A, step, rowStep, buffer.v + width - A); - - size_t col = 0; -#if defined(SIMD_USE_GATHER_AND_SCATTER_FOR_HISTOGRAM) - for (; col < fullAlignedWidth; col += Sse2::A) - { - __m512i index = _mm512_add_epi32(_mm512_cvtepu8_epi32(Sse2::Load((__m128i*)(buffer.v + col))), K32_TO_HISTOGRAMS); - AddToHistogram(_mm512_extracti32x4_epi32(index, 0), buffer.h[0]); - AddToHistogram(_mm512_extracti32x4_epi32(index, 1), buffer.h[0]); - AddToHistogram(_mm512_extracti32x4_epi32(index, 2), buffer.h[0]); - AddToHistogram(_mm512_extracti32x4_epi32(index, 3), buffer.h[0]); - } -#endif - for (; col < alignedWidth; col += 4) - { - ++buffer.h[0][buffer.v[col + 0]]; - ++buffer.h[1][buffer.v[col + 1]]; - ++buffer.h[2][buffer.v[col + 2]]; - ++buffer.h[3][buffer.v[col + 3]]; - } - for (; col < width; ++col) - ++buffer.h[0][buffer.v[col + 0]]; - src += stride; - } - - SumHistograms(buffer.h[0], 0, histogram); - } - - void AbsSecondDerivativeHistogram(const uint8_t *src, size_t width, size_t height, size_t stride, - size_t step, size_t indent, uint32_t * histogram) - { - assert(width > 2 * indent && height > 2 * indent && indent >= step && width >= A + 2 * indent); - - if (Aligned(src) && Aligned(stride)) - AbsSecondDerivativeHistogram(src, width, height, stride, step, indent, histogram); - else - AbsSecondDerivativeHistogram(src, width, height, stride, step, indent, histogram); - } - - template SIMD_INLINE void MaskSrc(const uint8_t * src, const uint8_t * mask, const __m512i & index, ptrdiff_t offset, uint16_t * dst, __mmask64 tail = -1) - { - __m512i _src = _mm512_permutexvar_epi64(K64_PERMUTE_FOR_UNPACK, (Load(src + offset, tail))); - __mmask64 mmask = _mm512_cmpeq_epi8_mask((Load(mask + offset, tail)), index); - Store(dst + offset + 00, _mm512_maskz_add_epi16(__mmask32(mmask >> 00), UnpackU8<0>(_src), K16_0010)); - Store(dst + offset + HA, _mm512_maskz_add_epi16(__mmask32(mmask >> 32), UnpackU8<1>(_src), K16_0010)); - } - - template void HistogramMasked(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * mask, size_t maskStride, uint8_t index, uint32_t * histogram) - { - Buffer buffer(AlignHi(width, A), HISTOGRAM_SIZE + F); - size_t widthAligned4 = Simd::AlignLo(width, 4); - size_t widthAlignedA = Simd::AlignLo(width, A); - size_t widthAlignedDA = Simd::AlignLo(width, DA); - __m512i _index = _mm512_set1_epi8(index); - __mmask64 tailMask = TailMask64(width - widthAlignedA); - for 
(size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < widthAlignedDA; col += DA) - { - MaskSrc(src, mask, _index, col + 0, buffer.v); - MaskSrc(src, mask, _index, col + A, buffer.v); - } - for (; col < widthAlignedA; col += A) - MaskSrc(src, mask, _index, col, buffer.v); - if (col < width) - MaskSrc(src, mask, _index, col, buffer.v, tailMask); - - for (col = 0; col < widthAligned4; col += 4) - { - ++buffer.h[0][buffer.v[col + 0]]; - ++buffer.h[1][buffer.v[col + 1]]; - ++buffer.h[2][buffer.v[col + 2]]; - ++buffer.h[3][buffer.v[col + 3]]; - } - for (; col < width; ++col) - ++buffer.h[0][buffer.v[col]]; - - src += srcStride; - mask += maskStride; - } - - SumHistograms(buffer.h[0], F, histogram); - } - - void HistogramMasked(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * mask, size_t maskStride, uint8_t index, uint32_t * histogram) - { - if (Aligned(src) && Aligned(srcStride) && Aligned(mask) && Aligned(maskStride)) - HistogramMasked(src, srcStride, width, height, mask, maskStride, index, histogram); - else - HistogramMasked(src, srcStride, width, height, mask, maskStride, index, histogram); - } - - template - SIMD_INLINE void ConditionalSrc(const uint8_t * src, const uint8_t * mask, const __m512i & value, ptrdiff_t offset, uint16_t * dst, __mmask64 tail = -1) - { - __m512i _src = _mm512_permutexvar_epi64(K64_PERMUTE_FOR_UNPACK, (Load(src + offset, tail))); - __mmask64 mmask = Compare8u(Load(mask + offset, tail), value) & tail; - Store(dst + offset + 00, _mm512_maskz_add_epi16(__mmask32(mmask >> 00), UnpackU8<0>(_src), K16_0010)); - Store(dst + offset + HA, _mm512_maskz_add_epi16(__mmask32(mmask >> 32), UnpackU8<1>(_src), K16_0010)); - } - - template void HistogramConditional(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * mask, size_t maskStride, uint8_t value, uint32_t * histogram) - { - Buffer buffer(AlignHi(width, A), HISTOGRAM_SIZE + F); - size_t widthAligned4 = Simd::AlignLo(width, 4); - size_t widthAlignedA = Simd::AlignLo(width, A); - size_t widthAlignedDA = Simd::AlignLo(width, DA); - __m512i _value = _mm512_set1_epi8(value); - __mmask64 tailMask = TailMask64(width - widthAlignedA); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < widthAlignedDA; col += DA) - { - ConditionalSrc(src, mask, _value, col, buffer.v); - ConditionalSrc(src, mask, _value, col + A, buffer.v); - } - for (; col < widthAlignedA; col += A) - ConditionalSrc(src, mask, _value, col, buffer.v); - if (col < width) - ConditionalSrc(src, mask, _value, col, buffer.v, tailMask); - - for (col = 0; col < widthAligned4; col += 4) - { - ++buffer.h[0][buffer.v[col + 0]]; - ++buffer.h[1][buffer.v[col + 1]]; - ++buffer.h[2][buffer.v[col + 2]]; - ++buffer.h[3][buffer.v[col + 3]]; - } - for (; col < width; ++col) - ++buffer.h[0][buffer.v[col]]; - - src += srcStride; - mask += maskStride; - } - - SumHistograms(buffer.h[0], F, histogram); - } - - template - void HistogramConditional(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * mask, size_t maskStride, uint8_t value, uint32_t * histogram) - { - if (Aligned(src) && Aligned(srcStride) && Aligned(mask) && Aligned(maskStride)) - return HistogramConditional(src, srcStride, width, height, mask, maskStride, value, histogram); - else - return HistogramConditional(src, srcStride, width, height, mask, maskStride, value, histogram); - } - - void HistogramConditional(const uint8_t * src, size_t srcStride, size_t width, size_t 
height, - const uint8_t * mask, size_t maskStride, uint8_t value, SimdCompareType compareType, uint32_t * histogram) - { - switch (compareType) - { - case SimdCompareEqual: - return HistogramConditional(src, srcStride, width, height, mask, maskStride, value, histogram); - case SimdCompareNotEqual: - return HistogramConditional(src, srcStride, width, height, mask, maskStride, value, histogram); - case SimdCompareGreater: - return HistogramConditional(src, srcStride, width, height, mask, maskStride, value, histogram); - case SimdCompareGreaterOrEqual: - return HistogramConditional(src, srcStride, width, height, mask, maskStride, value, histogram); - case SimdCompareLesser: - return HistogramConditional(src, srcStride, width, height, mask, maskStride, value, histogram); - case SimdCompareLesserOrEqual: - return HistogramConditional(src, srcStride, width, height, mask, maskStride, value, histogram); - default: - assert(0); - } - } - - SIMD_INLINE __m512i ChangeColors(const __m512i & src, const __m512i colors[4]) - { - __mmask32 blend = _mm512_cmpge_epi16_mask(src, K16_0080); - __m512i permute = _mm512_srli_epi16(src, 1); - __m512i shift = _mm512_slli_epi16(_mm512_and_si512(src, K16_0001), 3); - __m512i permute0 = _mm512_permutex2var_epi16(colors[0], permute, colors[1]); - __m512i permute1 = _mm512_permutex2var_epi16(colors[2], permute, colors[3]); - __m512i blended = _mm512_mask_blend_epi16(blend, permute0, permute1); - return _mm512_and_si512(_mm512_srlv_epi16(blended, shift), K16_00FF); - } - - template SIMD_INLINE void ChangeColors(const uint8_t * src, const __m512i colors[4], uint8_t * dst) - { - __m512i _src = _mm512_cvtepu8_epi16(Avx2::Load((__m256i*)src)); - __m512i _dst = ChangeColors(_src, colors); - Avx2::Store((__m256i*)dst, _mm512_cvtepi16_epi8(_dst)); - } - - SIMD_INLINE void ChangeColors(const uint8_t * src, const __m512i colors[4], uint8_t * dst, __mmask64 tail) - { - __m512i _src = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(_mm512_maskz_loadu_epi8(tail, src))); - __m512i _dst = ChangeColors(_src, colors); - _mm512_mask_storeu_epi8(dst, tail, _mm512_castsi256_si512(_mm512_cvtepi16_epi8(_dst))); - } - - template< bool align> void ChangeColors(const uint8_t * src, size_t srcStride, size_t width, size_t height, const uint8_t * colors, uint8_t * dst, size_t dstStride) - { - assert(width >= Avx512bw::HA); - if (align) - assert(Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride)); - - __m512i _colors[4]; - _colors[0] = Load(colors + 0 * A); - _colors[1] = Load(colors + 1 * A); - _colors[2] = Load(colors + 2 * A); - _colors[3] = Load(colors + 3 * A); - - size_t widthHA = Simd::AlignLo(width, HA); - __mmask64 tail = TailMask64(width - widthHA); - - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < widthHA; col += HA) - ChangeColors(src + col, _colors, dst + col); - if(col < width) - ChangeColors(src + col, _colors, dst + col, tail); - src += srcStride; - dst += dstStride; - } - } - - void ChangeColors(const uint8_t * src, size_t srcStride, size_t width, size_t height, const uint8_t * colors, uint8_t * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride)) - ChangeColors(src, srcStride, width, height, colors, dst, dstStride); - else - ChangeColors(src, srcStride, width, height, colors, dst, dstStride); - } - - void NormalizeHistogram(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride) - { - uint32_t histogram[HISTOGRAM_SIZE]; - 
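- // Editorial comment, inferred from the calls below (not in the original file):
- // NormalizeHistogram is a three-step histogram equalization. Base::Histogram
- // first gathers the global 256-bin histogram, Base::NormalizedColors then
- // builds a 256-entry remapping LUT from it, and the vectorized ChangeColors
- // above finally applies that LUT to every pixel via the
- // _mm512_permutex2var_epi16 table lookups.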
Base::Histogram(src, width, height, srcStride, histogram); - - uint8_t colors[HISTOGRAM_SIZE]; - Base::NormalizedColors(histogram, colors); - - ChangeColors(src, srcStride, width, height, colors, dst, dstStride); - } - } -#endif// SIMD_AVX512BW_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx512bwHog.cpp b/src/3rd/Simd/Simd/SimdAvx512bwHog.cpp deleted file mode 100644 index 60b61eef..00000000 --- a/src/3rd/Simd/Simd/SimdAvx512bwHog.cpp +++ /dev/null @@ -1,897 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2018 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdStore.h" -#include "Simd/SimdBase.h" -#include "Simd/SimdArray.h" - -namespace Simd -{ -#ifdef SIMD_AVX512BW_ENABLE - namespace Avx512bw - { - namespace - { - struct Buffer - { - const int size; - __m512 * cos, *sin; - __m512i * pos, *neg; - int * index; - float * value; - - Buffer(size_t width, size_t quantization) - : size((int)quantization / 2) - { - width = AlignHi(width, A / sizeof(float)); - _p = Allocate(width*(sizeof(int) + sizeof(float)) + (sizeof(__m512i) + sizeof(__m512)) * 2 * size); - index = (int*)_p - 1; - value = (float*)index + width; - cos = (__m512*)(value + width + 1); - sin = cos + size; - pos = (__m512i*)(sin + size); - neg = pos + size; - for (int i = 0; i < size; ++i) - { - cos[i] = _mm512_set1_ps((float)::cos(i*M_PI / size)); - sin[i] = _mm512_set1_ps((float)::sin(i*M_PI / size)); - pos[i] = _mm512_set1_epi32(i); - neg[i] = _mm512_set1_epi32(size + i); - } - } - - ~Buffer() - { - Free(_p); - } - - private: - void *_p; - }; - } - - template SIMD_INLINE void HogDirectionHistograms(const __m512 & dx, const __m512 & dy, Buffer & buffer, size_t col) - { - __m512 bestDot = _mm512_setzero_ps(); - __m512i bestIndex = _mm512_setzero_si512(); - for (int i = 0; i < buffer.size; ++i) - { - __m512 dot = _mm512_fmadd_ps(dx, buffer.cos[i], _mm512_mul_ps(dy, buffer.sin[i])); - bestIndex = _mm512_mask_blend_epi32(_mm512_cmp_ps_mask(dot, bestDot, _CMP_GT_OS), bestIndex, buffer.pos[i]); - bestDot = _mm512_max_ps(dot, bestDot); - - dot = _mm512_sub_ps(_mm512_setzero_ps(), dot); - bestIndex = _mm512_mask_blend_epi32(_mm512_cmp_ps_mask(dot, bestDot, _CMP_GT_OS), bestIndex, buffer.neg[i]); - bestDot = _mm512_max_ps(dot, bestDot); - } - Store(buffer.index + col, bestIndex); - Avx512f::Store(buffer.value + col, _mm512_sqrt_ps(_mm512_fmadd_ps(dx, dx, _mm512_mul_ps(dy, dy)))); - } - - template SIMD_INLINE __m512 CovertDifference(const __m256i & a, 
const __m256i & b) - { - return _mm512_cvtepi32_ps(_mm512_cvtepi16_epi32(Avx2::SubUnpackedU8(a, b))); - } - - template SIMD_INLINE void HogDirectionHistograms(const uint8_t * src, size_t stride, Buffer & buffer, size_t col) - { - const uint8_t * s = src + col; - __m256i t = Avx2::LoadPermuted((__m256i*)(s - stride)); - __m256i l = Avx2::LoadPermuted((__m256i*)(s - 1)); - __m256i r = Avx2::LoadPermuted((__m256i*)(s + 1)); - __m256i b = Avx2::LoadPermuted((__m256i*)(s + stride)); - HogDirectionHistograms(CovertDifference<0>(r, l), CovertDifference<0>(b, t), buffer, col + 0); - HogDirectionHistograms(CovertDifference<1>(r, l), CovertDifference<1>(b, t), buffer, col + F); - } - - namespace Custom_8x8_18 - { - struct Buffer - { - __m512i pos[5]; - __m512 cos[5], sin[5]; - __m128 kx[8], ky[8]; - - int * index; - float * value; - __m128 * hist; - size_t hs; - - Buffer(size_t width) - { - width = AlignHi(width, A / sizeof(float)); - hs = (width / 8 + 1) * 18 * sizeof(__m128); - _p = Allocate(width*(sizeof(int) + sizeof(float)) + hs); - index = (int*)_p - 1; - value = (float*)index + width; - hist = (__m128*)(value + width + 1); - - for (int i = 0; i < 5; ++i) - { - cos[i] = _mm512_set1_ps((float)::cos(i*M_PI / 9)); - sin[i] = _mm512_set1_ps((float)::sin(i*M_PI / 9)); - pos[i] = _mm512_set1_epi32(i); - } - for (int i = 0; i < 8; ++i) - { - float k0 = float((15 - i * 2) / 16.0f); - float k1 = 1.0f - k0; - kx[i] = _mm_setr_ps(k0, k1, k0, k1); - ky[i] = _mm_setr_ps(k0, k0, k1, k1); - } - ClearHist(); - } - - ~Buffer() - { - Free(_p); - } - - void ClearHist() - { - memset(hist, 0, hs); - } - - private: - void *_p; - }; - - const __m512i K32_9 = SIMD_MM512_SET1_EPI32(9); - const __m512i K32_18 = SIMD_MM512_SET1_EPI32(18); - - template SIMD_INLINE void HogDirectionHistograms(const __m512 & dx, const __m512 & dy, Buffer & buffer, size_t col) - { - __m512 _0 = _mm512_set1_ps(-0.0f); - __m512 adx = _mm512_andnot_ps(_0, dx); - __m512 ady = _mm512_andnot_ps(_0, dy); - __m512 bestDot = _mm512_fmadd_ps(adx, buffer.cos[0], _mm512_mul_ps(ady, buffer.sin[0])); - __m512i bestIndex = buffer.pos[0]; - for (int i = 1; i < 5; ++i) - { - __m512 dot = _mm512_fmadd_ps(adx, buffer.cos[i], _mm512_mul_ps(ady, buffer.sin[i])); - bestIndex = _mm512_mask_blend_epi32(_mm512_cmp_ps_mask(dot, bestDot, _CMP_GT_OS), bestIndex, buffer.pos[i]); - bestDot = _mm512_max_ps(dot, bestDot); - } - bestIndex = _mm512_mask_sub_epi32(bestIndex, _mm512_cmp_ps_mask(dx, _0, _CMP_LT_OS), K32_9, bestIndex); - - __m512i corr = _mm512_maskz_set1_epi32(_mm512_cmp_ps_mask(adx, _mm512_setzero_ps(), _CMP_EQ_OS), 1); - bestIndex = _mm512_mask_sub_epi32(bestIndex, _mm512_cmp_ps_mask(dy, _0, _CMP_LT_OS), K32_18, _mm512_add_epi32(bestIndex, corr)); - - bestIndex = _mm512_mask_set1_epi32(bestIndex, _mm512_cmpeq_epi32_mask(bestIndex, K32_18), 0); - - Store(buffer.index + col, bestIndex); - Avx512f::Store(buffer.value + col, _mm512_sqrt_ps(_mm512_fmadd_ps(adx, adx, _mm512_mul_ps(ady, ady)))); - } - - template SIMD_INLINE void HogDirectionHistograms(const uint8_t * src, size_t stride, Buffer & buffer, size_t col) - { - const uint8_t * s = src + col; - __m256i t = Avx2::LoadPermuted((__m256i*)(s - stride)); - __m256i l = Avx2::LoadPermuted((__m256i*)(s - 1)); - __m256i r = Avx2::LoadPermuted((__m256i*)(s + 1)); - __m256i b = Avx2::LoadPermuted((__m256i*)(s + stride)); - HogDirectionHistograms(CovertDifference<0>(r, l), CovertDifference<0>(b, t), buffer, col + 0); - HogDirectionHistograms(CovertDifference<1>(r, l), CovertDifference<1>(b, t), buffer, col + F); - } - 
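- // Editorial sketch, not part of the original file: a scalar equivalent of the
- // masked 18-bin quantization above, to make the K32_9/K32_18 sign fixups
- // readable. The helper name is hypothetical; the constants mirror the code.
- static SIMD_INLINE void QuantizeDirectionScalar(float dx, float dy, int & index, float & value)
- {
-     float adx = ::fabs(dx), ady = ::fabs(dy);
-     float bestDot = adx; // i = 0: cos(0) = 1, sin(0) = 0.
-     int bestIndex = 0;
-     for (int i = 1; i < 5; ++i)
-     {
-         // Pick the best of 5 sampled half-circle directions by dot product.
-         float dot = adx * (float)::cos(i * M_PI / 9) + ady * (float)::sin(i * M_PI / 9);
-         if (dot > bestDot)
-         {
-             bestDot = dot;
-             bestIndex = i;
-         }
-     }
-     if (dx < 0.0f) // mirror across the vertical axis (the K32_9 branch above)
-         bestIndex = 9 - bestIndex;
-     int corr = (adx == 0.0f) ? 1 : 0;
-     if (dy < 0.0f) // mirror into the lower half-circle (the K32_18 branch above)
-         bestIndex = 18 - (bestIndex + corr);
-     if (bestIndex == 18) // wrap the seam back to bin 0
-         bestIndex = 0;
-     index = bestIndex;
-     value = (float)::sqrt(adx * adx + ady * ady); // gradient magnitude
- }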
- void AddRowToBuffer(const uint8_t * src, size_t stride, Buffer & buffer, size_t row, size_t width, size_t aligned) - { - const uint8_t * s = src + stride * row; - for (size_t col = 1; col < aligned; col += HA) - HogDirectionHistograms(s, stride, buffer, col); - HogDirectionHistograms(s, stride, buffer, width - 1 - HA); - - __m128 ky = buffer.ky[(row + 4) & 7]; - __m128 * hist = buffer.hist; - size_t cellEnd = width / 8; - - for (size_t col = 1; col < 4; ++col) - { - int index = buffer.index[col]; - __m128 value = _mm_set1_ps(buffer.value[col]); - __m128 kx = buffer.kx[(col + 4) & 7]; - hist[index] = _mm_fmadd_ps(_mm_mul_ps(ky, kx), value, hist[index]); - } - hist += 18; - - for (size_t cell = 1, col = 4; cell < cellEnd; ++cell) - { - for (size_t i = 0; i < 8; ++i, ++col) - { - int index = buffer.index[col]; - __m128 value = _mm_set1_ps(buffer.value[col]); - __m128 kx = buffer.kx[i]; - hist[index] = _mm_fmadd_ps(_mm_mul_ps(ky, kx), value, hist[index]); - } - hist += 18; - } - - for (size_t col = width - 4; col < width - 1; ++col) - { - int index = buffer.index[col]; - __m128 value = _mm_set1_ps(buffer.value[col]); - __m128 kx = buffer.kx[(col + 4) & 7]; - hist[index] = _mm_fmadd_ps(_mm_mul_ps(ky, kx), value, hist[index]); - } - } - - void AddToHistogram(Buffer & buffer, size_t row, size_t width, size_t height, float * histograms) - { - typedef float f18_t[18]; - - float * src = (float*)buffer.hist; - f18_t * h0 = (f18_t*)histograms + row * width - width - 1; - f18_t * h1 = h0 + width; - - if (row == 0) - { - for (size_t i = 0; i < 18; ++i) - h1[1][i] += src[i * 4 + 3]; - h1++; - src += 72; - for (size_t cell = 1; cell < width; ++cell) - { - for (size_t i = 0; i < 18; ++i) - { - h1[0][i] += src[i * 4 + 2]; - h1[1][i] += src[i * 4 + 3]; - } - h1++; - src += 72; - } - for (size_t i = 0; i < 18; ++i) - h1[0][i] += src[i * 4 + 2]; - } - else if (row == height) - { - for (size_t i = 0; i < 18; ++i) - h0[1][i] += src[i * 4 + 1]; - h0++; - src += 72; - for (size_t cell = 1; cell < width; ++cell) - { - for (size_t i = 0; i < 18; ++i) - { - h0[0][i] += src[i * 4 + 0]; - h0[1][i] += src[i * 4 + 1]; - } - h0++; - src += 72; - } - for (size_t i = 0; i < 18; ++i) - h0[0][i] += src[i * 4 + 0]; - } - else - { - for (size_t i = 0; i < 18; ++i) - { - h0[1][i] += src[i * 4 + 1]; - h1[1][i] += src[i * 4 + 3]; - } - h0++; - h1++; - src += 72; - for (size_t cell = 1; cell < width; ++cell) - { - __m512 a0 = Load(src + 0x00, src + 0x10, src + 0x20, src + 0x30); - __m512 a1 = Load(src + 0x04, src + 0x14, src + 0x24, src + 0x34); - __m512 a2 = Load(src + 0x08, src + 0x18, src + 0x28, src + 0x38); - __m512 a3 = Load(src + 0x0C, src + 0x1C, src + 0x2C, src + 0x3C); - __m512 b0 = _mm512_unpacklo_ps(a0, a2); - __m512 b1 = _mm512_unpackhi_ps(a0, a2); - __m512 b2 = _mm512_unpacklo_ps(a1, a3); - __m512 b3 = _mm512_unpackhi_ps(a1, a3); - Avx512f::Store(h0[0], _mm512_add_ps(Avx512f::Load(h0[0]), _mm512_unpacklo_ps(b0, b2))); - Avx512f::Store(h0[1], _mm512_add_ps(Avx512f::Load(h0[1]), _mm512_unpackhi_ps(b0, b2))); - Avx512f::Store(h1[0], _mm512_add_ps(Avx512f::Load(h1[0]), _mm512_unpacklo_ps(b1, b3))); - Avx512f::Store(h1[1], _mm512_add_ps(Avx512f::Load(h1[1]), _mm512_unpackhi_ps(b1, b3))); - for (size_t i = 16; i < 18; ++i) - { - h0[0][i] += src[i * 4 + 0]; - h0[1][i] += src[i * 4 + 1]; - h1[0][i] += src[i * 4 + 2]; - h1[1][i] += src[i * 4 + 3]; - } - h0++; - h1++; - src += 72; - } - for (size_t i = 0; i < 18; ++i) - { - h0[0][i] += src[i * 4 + 0]; - h1[0][i] += src[i * 4 + 2]; - } - } - buffer.ClearHist(); - } - - void 
HogDirectionHistograms(const uint8_t * src, size_t stride, size_t width, size_t height, float * histograms) - { - const size_t quantization = 18; - - size_t sizeX = width / 8, sizeY = height / 8; - - memset(histograms, 0, quantization*sizeX*sizeY * sizeof(float)); - - Buffer buffer(width); - - size_t aligned = AlignLo(width - 2, HA) + 1; - - for (size_t row = 1; row < 4; ++row) - AddRowToBuffer(src, stride, buffer, row, width, aligned); - AddToHistogram(buffer, 0, sizeX, sizeY, histograms); - for (size_t row = 4, cell = 1; row < height - 4; ++row) - { - AddRowToBuffer(src, stride, buffer, row, width, aligned); - if ((row & 7) == 3) - AddToHistogram(buffer, cell++, sizeX, sizeY, histograms); - } - for (size_t row = height - 4; row < height - 1; ++row) - AddRowToBuffer(src, stride, buffer, row, width, aligned); - AddToHistogram(buffer, sizeY, sizeX, sizeY, histograms); - } - } - - void HogDirectionHistograms(const uint8_t * src, size_t stride, size_t width, size_t height, - size_t cellX, size_t cellY, size_t quantization, float * histograms) - { - assert(width%cellX == 0 && height%cellY == 0 && quantization % 2 == 0); - assert(width >= HA + 2); - - if (cellX == 8 && cellY == 8 && quantization == 18) - Custom_8x8_18::HogDirectionHistograms(src, stride, width, height, histograms); - else - { - memset(histograms, 0, quantization*(width / cellX)*(height / cellY) * sizeof(float)); - - Buffer buffer(width, quantization); - - size_t alignedWidth = AlignLo(width - 2, HA) + 1; - - for (size_t row = 1; row < height - 1; ++row) - { - const uint8_t * s = src + stride * row; - for (size_t col = 1; col < alignedWidth; col += HA) - HogDirectionHistograms(s, stride, buffer, col); - HogDirectionHistograms(s, stride, buffer, width - 1 - HA); - Base::AddRowToHistograms(buffer.index, buffer.value, row, width, height, cellX, cellY, quantization, histograms); - } - } - } - - class HogFeatureExtractor - { - static const size_t C = 8; - static const size_t Q = 9; - static const size_t Q2 = 18; - - typedef Array Array32i; - typedef Array Array32f; - - size_t _sx, _sy, _hs; - - __m512i _pos[5]; - __m512 _cos[5], _sin[5]; - __m128 _kx[8], _ky[8]; - __m512i _Q, _Q2; - - Array32i _index; - Array32f _value; - Array32f _buffer; - Array32f _histogram; - Array32f _norm; - - void Init(size_t w, size_t h) - { - _sx = w / C; - _hs = _sx + 2; - _sy = h / C; - for (int i = 0; i < 5; ++i) - { - _cos[i] = _mm512_set1_ps((float)::cos(i*M_PI / Q)); - _sin[i] = _mm512_set1_ps((float)::sin(i*M_PI / Q)); - _pos[i] = _mm512_set1_epi32(i); - } - for (int i = 0; i < C; ++i) - { - float k0 = float((15 - i * 2) / 16.0f); - float k1 = 1.0f - k0; - _kx[i] = _mm_setr_ps(k0, k1, k0, k1); - _ky[i] = _mm_setr_ps(k0, k0, k1, k1); - } - _Q = _mm512_set1_epi32(Q); - _Q2 = _mm512_set1_epi32(Q2); - - _index.Resize(w); - _value.Resize(w); - _buffer.Resize((_sx + 1) * 4 * Q2); - _histogram.Resize((_sx + 2)*(_sy + 2)*Q2); - _norm.Resize((_sx + 2)*(_sy + 2)); - } - - template SIMD_INLINE void GetHistogram(const __m512 & dx, const __m512 & dy, size_t col) - { - __m512 _0 = _mm512_set1_ps(-0.0f); - __m512 adx = _mm512_andnot_ps(_0, dx); - __m512 ady = _mm512_andnot_ps(_0, dy); - __m512 bestDot = _mm512_fmadd_ps(adx, _cos[0], _mm512_mul_ps(ady, _sin[0])); - __m512i bestIndex = _pos[0]; - for (int i = 1; i < 5; ++i) - { - __m512 dot = _mm512_fmadd_ps(adx, _cos[i], _mm512_mul_ps(ady, _sin[i])); - bestIndex = _mm512_mask_blend_epi32(_mm512_cmp_ps_mask(dot, bestDot, _CMP_GT_OS), bestIndex, _pos[i]); - bestDot = _mm512_max_ps(dot, bestDot); - } - bestIndex = 
_mm512_mask_sub_epi32(bestIndex, _mm512_cmp_ps_mask(dx, _0, _CMP_LT_OS), _Q, bestIndex); - - __m512i corr = _mm512_maskz_set1_epi32(_mm512_cmp_ps_mask(adx, _mm512_setzero_ps(), _CMP_EQ_OS), 1); - bestIndex = _mm512_mask_sub_epi32(bestIndex, _mm512_cmp_ps_mask(dy, _0, _CMP_LT_OS), _Q2, _mm512_add_epi32(bestIndex, corr)); - - bestIndex = _mm512_mask_set1_epi32(bestIndex, _mm512_cmpeq_epi32_mask(bestIndex, _Q2), 0); - - Store(_index.data + col, bestIndex); - Avx512f::Store(_value.data + col, _mm512_sqrt_ps(_mm512_fmadd_ps(adx, adx, _mm512_mul_ps(ady, ady)))); - } - - template SIMD_INLINE void GetHistogram(const uint8_t * src, size_t stride, size_t col) - { - const uint8_t * s = src + col; - __m256i t = Avx2::LoadPermuted((__m256i*)(s - stride)); - __m256i l = Avx2::LoadPermuted((__m256i*)(s - 1)); - __m256i r = Avx2::LoadPermuted((__m256i*)(s + 1)); - __m256i b = Avx2::LoadPermuted((__m256i*)(s + stride)); - GetHistogram(CovertDifference<0>(r, l), CovertDifference<0>(b, t), col + 0); - GetHistogram(CovertDifference<1>(r, l), CovertDifference<1>(b, t), col + F); - } - - void AddRowToBuffer(const uint8_t * src, size_t stride, size_t row, size_t width, size_t aligned) - { - const uint8_t * s = src + stride * row; - GetHistogram(s, stride, 1); - for (size_t col = HA; col < aligned; col += HA) - GetHistogram(s, stride, col); - GetHistogram(s, stride, width - 1 - HA); - - __m128 ky = _ky[(row + 4) & 7]; - __m128 * buffer = (__m128*)_buffer.data; - for (size_t col = 1, n = C, i = 5; col < width - 1; i = 0, n = Simd::Min(C, width - col - 1)) - { - for (; i < n; ++i, ++col) - { - int index = _index[col]; - __m128 value = _mm_set1_ps(_value[col]); - buffer[index] = _mm_fmadd_ps(_mm_mul_ps(ky, _kx[i]), value, buffer[index]); - } - buffer += Q2; - } - } - - void AddToHistogram(size_t row, size_t width, size_t height) - { - typedef float f18_t[18]; - - float * src = _buffer.data; - f18_t * h0 = (f18_t*)_histogram.data + row * _hs; - f18_t * h1 = h0 + _hs; - - for (size_t cell = 0; cell <= width; ++cell) - { - __m512 a0 = Load(src + 0x00, src + 0x10, src + 0x20, src + 0x30); - __m512 a1 = Load(src + 0x04, src + 0x14, src + 0x24, src + 0x34); - __m512 a2 = Load(src + 0x08, src + 0x18, src + 0x28, src + 0x38); - __m512 a3 = Load(src + 0x0C, src + 0x1C, src + 0x2C, src + 0x3C); - __m512 b0 = _mm512_unpacklo_ps(a0, a2); - __m512 b1 = _mm512_unpackhi_ps(a0, a2); - __m512 b2 = _mm512_unpacklo_ps(a1, a3); - __m512 b3 = _mm512_unpackhi_ps(a1, a3); - Avx512f::Store(h0[0], _mm512_add_ps(Avx512f::Load(h0[0]), _mm512_unpacklo_ps(b0, b2))); - Avx512f::Store(h0[1], _mm512_add_ps(Avx512f::Load(h0[1]), _mm512_unpackhi_ps(b0, b2))); - Avx512f::Store(h1[0], _mm512_add_ps(Avx512f::Load(h1[0]), _mm512_unpacklo_ps(b1, b3))); - Avx512f::Store(h1[1], _mm512_add_ps(Avx512f::Load(h1[1]), _mm512_unpackhi_ps(b1, b3))); -#if defined(_MSC_VER) - for (size_t i = 16; i < 18; ++i) - { - h0[0][i] += src[i * 4 + 0]; - h0[1][i] += src[i * 4 + 1]; - h1[0][i] += src[i * 4 + 2]; - h1[1][i] += src[i * 4 + 3]; - } -#else - __m128 * ps = (__m128*)src; - __m128 s0 = _mm_add_ps(_mm_unpacklo_ps(ps[16], ps[17]), Sse::Load(h0[0] + 16, h0[1] + 16)); - __m128 s1 = _mm_add_ps(_mm_unpackhi_ps(ps[16], ps[17]), Sse::Load(h1[0] + 16, h1[1] + 16)); - Sse::StoreHalf<0>(h0[0] + 16, s0); - Sse::StoreHalf<1>(h0[1] + 16, s0); - Sse::StoreHalf<0>(h1[0] + 16, s1); - Sse::StoreHalf<1>(h1[1] + 16, s1); -#endif - h0++; - h1++; - src += 72; - } - _buffer.Clear(); - } - - void EstimateHistogram(const uint8_t * src, size_t stride, size_t width, size_t height) - { - 
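- // Editorial comment describing the loop below (not in the original source):
- // rows are streamed once; each image row is spread into the two nearest
- // cell rows via the _ky interpolation weights, and every time
- // (row & 7) == 3 the 8-row accumulator _buffer is flushed into _histogram
- // by AddToHistogram, completing one cell row. The leading rows (1..3) and
- // the trailing rows prime and drain this pipeline so that the border cell
- // rows 0 and _sy are flushed as well.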
_histogram.Clear(); - - size_t aligned = AlignHi(width - 1, HA) - HA; - - _buffer.Clear(); - for (size_t row = 1; row < 4; ++row) - AddRowToBuffer(src, stride, row, width, aligned); - AddToHistogram(0, _sx, _sy); - for (size_t row = 4, cell = 1; row < height - 4; ++row) - { - AddRowToBuffer(src, stride, row, width, aligned); - if ((row & 7) == 3) - AddToHistogram(cell++, _sx, _sy); - } - for (size_t row = height - 4; row < height - 1; ++row) - AddRowToBuffer(src, stride, row, width, aligned); - AddToHistogram(_sy, _sx, _sy); - } - - SIMD_INLINE float GetNorm(const float * src) - { - __m256 norm = _mm256_add_ps(_mm256_loadu_ps(src), _mm256_loadu_ps(src + Q)); - norm = _mm256_mul_ps(norm, norm); - norm = _mm256_hadd_ps(norm, norm); - norm = _mm256_hadd_ps(norm, norm); - float buf[8]; - _mm256_storeu_ps(buf, norm); - return buf[0] + buf[4] + Simd::Square(src[Q - 1] + src[Q2 - 1]); - } - - void EstimateNorm() - { - _norm.Clear(); - for (size_t y = 0, i = 0; y < _sy; y++) - { - const float * h = _histogram.data + ((y + 1)*_hs + 1)*Q2; - float * n = _norm.data + (y + 1)*_hs + 1; - for (size_t x = 0; x < _sx; x++, i++) - n[x] = GetNorm(h + x * Q2); - } - } - - void ExtractFeatures(float * features) - { - __m128 _02 = _mm_set1_ps(0.2f); - __m128 _05 = _mm_set1_ps(0.5f); - __m128 _02357 = _mm_set1_ps(0.2357f); - __m128 eps = _mm_set1_ps(0.0001f); - for (size_t y = 0; y < _sy; y++) - { - float * ph = _histogram.data + ((y + 1)*_hs + 1)*Q2; - for (size_t x = 0; x < _sx; x++) - { - float * dst = features + (y*_sx + x) * 31; - - float * p0 = _norm.data + y * _hs + x; - float * p1 = p0 + _hs; - float * p2 = p1 + _hs; - - __m128 n = _mm_setr_ps( - p1[1] + p1[2] + p2[1] + p2[2], - p0[1] + p0[2] + p1[1] + p1[2], - p1[0] + p1[1] + p2[0] + p2[1], - p0[0] + p0[1] + p1[0] + p1[1]); - - n = _mm_rsqrt_ps(_mm_add_ps(n, eps)); - - __m128 t = _mm_setzero_ps(); - - float * src = ph + x * Q2; - for (int o = 0; o < 16; o += 4) - { - __m128 s = _mm_loadu_ps(src); - __m128 h0 = _mm_min_ps(_mm_mul_ps(Sse2::Broadcast<0>(s), n), _02); - __m128 h1 = _mm_min_ps(_mm_mul_ps(Sse2::Broadcast<1>(s), n), _02); - __m128 h2 = _mm_min_ps(_mm_mul_ps(Sse2::Broadcast<2>(s), n), _02); - __m128 h3 = _mm_min_ps(_mm_mul_ps(Sse2::Broadcast<3>(s), n), _02); - t = _mm_add_ps(t, _mm_add_ps(_mm_add_ps(h0, h1), _mm_add_ps(h2, h3))); - _mm_storeu_ps(dst, _mm_mul_ps(_05, _mm_hadd_ps(_mm_hadd_ps(h0, h1), _mm_hadd_ps(h2, h3)))); - dst += 4; - src += 4; - } - { - __m128 h0 = _mm_min_ps(_mm_mul_ps(_mm_set1_ps(*src++), n), _02); - __m128 h1 = _mm_min_ps(_mm_mul_ps(_mm_set1_ps(*src++), n), _02); - t = _mm_add_ps(t, _mm_add_ps(h0, h1)); - __m128 h = _mm_hadd_ps(h0, h1); - _mm_storeu_ps(dst, _mm_mul_ps(_05, _mm_hadd_ps(h, h))); - dst += 2; - } - - src = ph + x * Q2; - for (int o = 0; o < 8; o += 4) - { - __m128 s = _mm_add_ps(_mm_loadu_ps(src), _mm_loadu_ps(src + Q)); - __m128 h0 = _mm_min_ps(_mm_mul_ps(Sse2::Broadcast<0>(s), n), _02); - __m128 h1 = _mm_min_ps(_mm_mul_ps(Sse2::Broadcast<1>(s), n), _02); - __m128 h2 = _mm_min_ps(_mm_mul_ps(Sse2::Broadcast<2>(s), n), _02); - __m128 h3 = _mm_min_ps(_mm_mul_ps(Sse2::Broadcast<3>(s), n), _02); - _mm_storeu_ps(dst, _mm_mul_ps(_05, _mm_hadd_ps(_mm_hadd_ps(h0, h1), _mm_hadd_ps(h2, h3)))); - dst += 4; - src += 4; - } - { - __m128 s = _mm_set1_ps(src[0] + src[Q]); - __m128 h = _mm_min_ps(_mm_mul_ps(s, n), _02); - h = _mm_dp_ps(_05, h, 0xF1); - _mm_store_ss(dst++, h); - } - _mm_storeu_ps(dst, _mm_mul_ps(t, _02357)); - } - } - } - - public: - - void Run(const uint8_t * src, size_t stride, size_t width, size_t height, 
float * features) - { - Init(width, height); - - EstimateHistogram(src, stride, width, height); - - EstimateNorm(); - - ExtractFeatures(features); - } - }; - - void HogExtractFeatures(const uint8_t * src, size_t stride, size_t width, size_t height, float * features) - { - assert(width % 8 == 0 && height % 8 == 0 && width >= 16 && height >= 16); - assert(width >= HA + 2); - - HogFeatureExtractor extractor; - extractor.Run(src, stride, width, height, features); - } - - SIMD_INLINE void HogDeinterleave(const float * src, size_t count, float ** dst, size_t offset, size_t i) - { - src += i; - __m512 a0 = Load(src + 0x0 * count, src + 0x4 * count, src + 0x8 * count, src + 0xC * count); - __m512 a1 = Load(src + 0x1 * count, src + 0x5 * count, src + 0x9 * count, src + 0xD * count); - __m512 a2 = Load(src + 0x2 * count, src + 0x6 * count, src + 0xA * count, src + 0xE * count); - __m512 a3 = Load(src + 0x3 * count, src + 0x7 * count, src + 0xB * count, src + 0xF * count); - __m512 b0 = _mm512_unpacklo_ps(a0, a2); - __m512 b1 = _mm512_unpackhi_ps(a0, a2); - __m512 b2 = _mm512_unpacklo_ps(a1, a3); - __m512 b3 = _mm512_unpackhi_ps(a1, a3); - Avx512f::Store(dst[i + 0] + offset, _mm512_unpacklo_ps(b0, b2)); - Avx512f::Store(dst[i + 1] + offset, _mm512_unpackhi_ps(b0, b2)); - Avx512f::Store(dst[i + 2] + offset, _mm512_unpacklo_ps(b1, b3)); - Avx512f::Store(dst[i + 3] + offset, _mm512_unpackhi_ps(b1, b3)); - } - - void HogDeinterleave(const float * src, size_t srcStride, size_t width, size_t height, size_t count, float ** dst, size_t dstStride) - { - assert(width >= F && count >= Sse::F); - - size_t alignedCount = AlignLo(count, Sse::F); - size_t alignedWidth = AlignLo(width, F); - - for (size_t row = 0; row < height; ++row) - { - size_t rowOffset = row * dstStride; - for (size_t col = 0; col < alignedWidth; col += F) - { - const float * s = src + count * col; - size_t offset = rowOffset + col; - for (size_t i = 0; i < alignedCount; i += Sse::F) - HogDeinterleave(s, count, dst, offset, i); - if (alignedCount != count) - HogDeinterleave(s, count, dst, offset, count - Sse::F); - } - if (alignedWidth != width) - { - size_t col = width - F; - const float * s = src + count * col; - size_t offset = rowOffset + col; - for (size_t i = 0; i < alignedCount; i += Sse::F) - HogDeinterleave(s, count, dst, offset, i); - if (alignedCount != count) - HogDeinterleave(s, count, dst, offset, count - Sse::F); - } - src += srcStride; - } - } - - namespace HogSeparableFilter_Detail - { - template SIMD_INLINE void Set(float * dst, const __m512 & value, __mmask16 tail = -1) - { - Avx512f::Store(dst, value, tail); - } - - template <> SIMD_INLINE void Set<1, false>(float * dst, const __m512 & value, __mmask16 tail) - { - Avx512f::Store(dst, _mm512_add_ps(Avx512f::Load(dst), value)); - } - - template <> SIMD_INLINE void Set<1, true>(float * dst, const __m512 & value, __mmask16 tail) - { - Avx512f::Store(dst, _mm512_add_ps((Avx512f::Load(dst, tail)), value), tail); - } - } - - class HogSeparableFilter - { - size_t _w, _h, _s; - Array32f _buffer; - Array512f _filter; - - SIMD_INLINE void Init(size_t w, size_t h, size_t rs, size_t cs) - { - _w = w - rs + 1; - _s = AlignHi(_w, F); - _h = h - cs + 1; - _buffer.Resize(_s*h); - } - - template SIMD_INLINE void FilterRows(const float * src, const __m512 * filter, size_t size, float * dst) - { - __m512 sum = _mm512_setzero_ps(); - for (size_t i = 0; i < size; ++i) - sum = _mm512_fmadd_ps(Avx512f::Load(src + i), filter[i], sum); - Avx512f::Store(dst, sum); - } - - void FilterRows(const float * 
src, size_t srcStride, size_t width, size_t height, const float * filter, size_t size, float * dst, size_t dstStride) - { - _filter.Resize(size); - for (size_t i = 0; i < size; ++i) - _filter[i] = _mm512_set1_ps(filter[i]); - - size_t alignedWidth = AlignLo(width, F); - - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += F) - FilterRows(src + col, _filter.data, size, dst + col); - if (alignedWidth != width) - FilterRows(src + width - F, _filter.data, size, dst + width - F); - src += srcStride; - dst += dstStride; - } - } - - template SIMD_INLINE void FilterRows_10(const float * src, const __m512 * filter, float * dst) - { - __m512 src0 = Avx512f::Load(src + 0); - __m512 srcf = Avx512f::Load(src + F); - __m512 sum0 = _mm512_mul_ps(Alignr<0>(src0, srcf), filter[0]); - __m512 sum1 = _mm512_mul_ps(Alignr<1>(src0, srcf), filter[1]); - sum0 = _mm512_fmadd_ps(Alignr<2>(src0, srcf), filter[2], sum0); - sum1 = _mm512_fmadd_ps(Alignr<3>(src0, srcf), filter[3], sum1); - sum0 = _mm512_fmadd_ps(Alignr<4>(src0, srcf), filter[4], sum0); - sum1 = _mm512_fmadd_ps(Alignr<5>(src0, srcf), filter[5], sum1); - sum0 = _mm512_fmadd_ps(Alignr<6>(src0, srcf), filter[6], sum0); - sum1 = _mm512_fmadd_ps(Alignr<7>(src0, srcf), filter[7], sum1); - sum0 = _mm512_fmadd_ps(Alignr<8>(src0, srcf), filter[8], sum0); - sum1 = _mm512_fmadd_ps(Alignr<9>(src0, srcf), filter[9], sum1); - Avx512f::Store(dst, _mm512_add_ps(sum0, sum1)); - } - - void FilterRows_10(const float * src, size_t srcStride, size_t width, size_t height, const float * filter, float * dst, size_t dstStride) - { - __m512 _filter[10]; - for (size_t i = 0; i < 10; ++i) - _filter[i] = _mm512_set1_ps(filter[i]); - - size_t alignedWidth = AlignLo(width, F); - - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += F) - FilterRows_10(src + col, _filter, dst + col); - if (alignedWidth != width) - FilterRows_10(src + width - F, _filter, dst + width - F); - src += srcStride; - dst += dstStride; - } - } - - template SIMD_INLINE void FilterCols(const float * src, size_t stride, const __m512 * filter, size_t size, float * dst, __mmask16 tail = -1) - { - __m512 sum = _mm512_setzero_ps(); - for (size_t i = 0; i < size; ++i, src += stride) - sum = _mm512_fmadd_ps((Avx512f::Load(src, tail)), filter[i], sum); - HogSeparableFilter_Detail::Set(dst, sum, tail); - } - - template void SIMD_INLINE FilterCols4x(const float * src, size_t stride, const __m512 * filter, size_t size, float * dst) - { - __m512 sums[4] = { _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps() }; - for (size_t i = 0; i < size; ++i, src += stride) - { - __m512 f = filter[i]; - sums[0] = _mm512_fmadd_ps(Avx512f::Load(src + 0 * F), f, sums[0]); - sums[1] = _mm512_fmadd_ps(Avx512f::Load(src + 1 * F), f, sums[1]); - sums[2] = _mm512_fmadd_ps(Avx512f::Load(src + 2 * F), f, sums[2]); - sums[3] = _mm512_fmadd_ps(Avx512f::Load(src + 3 * F), f, sums[3]); - } - HogSeparableFilter_Detail::Set(dst + 0 * F, sums[0]); - HogSeparableFilter_Detail::Set(dst + 1 * F, sums[1]); - HogSeparableFilter_Detail::Set(dst + 2 * F, sums[2]); - HogSeparableFilter_Detail::Set(dst + 3 * F, sums[3]); - } - - template void FilterCols(const float * src, size_t srcStride, size_t width, size_t height, const float * filter, size_t size, float * dst, size_t dstStride) - { - _filter.Resize(size); - for (size_t i = 0; i < size; ++i) - _filter[i] = _mm512_set1_ps(filter[i]); - - size_t fullAlignedWidth = AlignLo(width, QF); - size_t 
alignedWidth = AlignLo(width, F); - __mmask16 tailMask = TailMask16(width - alignedWidth); - - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < fullAlignedWidth; col += QF) - FilterCols4x(src + col, srcStride, _filter.data, size, dst + col); - for (; col < alignedWidth; col += F) - FilterCols(src + col, srcStride, _filter.data, size, dst + col); - if (col < width) - FilterCols(src + col, srcStride, _filter.data, size, dst + col, tailMask); - src += srcStride; - dst += dstStride; - } - } - - public: - - void Run(const float * src, size_t srcStride, size_t width, size_t height, - const float * rowFilter, size_t rowSize, const float * colFilter, size_t colSize, float * dst, size_t dstStride, int add) - { - Init(width, height, rowSize, colSize); - - if (colSize == 10) - FilterRows_10(src, srcStride, _w, height, rowFilter, _buffer.data, _s); - else - FilterRows(src, srcStride, _w, height, rowFilter, rowSize, _buffer.data, _s); - - if (add) - FilterCols<1>(_buffer.data, _s, _w, _h, colFilter, colSize, dst, dstStride); - else - FilterCols<0>(_buffer.data, _s, _w, _h, colFilter, colSize, dst, dstStride); - } - }; - - void HogFilterSeparable(const float * src, size_t srcStride, size_t width, size_t height, - const float * rowFilter, size_t rowSize, const float * colFilter, size_t colSize, float * dst, size_t dstStride, int add) - { - assert(width >= F + rowSize - 1 && height >= colSize - 1); - - HogSeparableFilter filter; - filter.Run(src, srcStride, width, height, rowFilter, rowSize, colFilter, colSize, dst, dstStride, add); - } - } -#endif -} diff --git a/src/3rd/Simd/Simd/SimdAvx512bwHogLite.cpp b/src/3rd/Simd/Simd/SimdAvx512bwHogLite.cpp deleted file mode 100644 index 49dafd64..00000000 --- a/src/3rd/Simd/Simd/SimdAvx512bwHogLite.cpp +++ /dev/null @@ -1,1372 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2018 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdStore.h" -#include "Simd/SimdBase.h" -#include "Simd/SimdCompare.h" -#include "Simd/SimdArray.h" -#include "Simd/SimdExtract.h" -#include "Simd/SimdUpdate.h" - -namespace Simd -{ -#ifdef SIMD_AVX512BW_ENABLE - namespace Avx512bw - { - const __m512i K8_KX4 = SIMD_MM512_SETR_EPI8( - 1, 3, 5, 7, 7, 5, 3, 1, 1, 3, 5, 7, 7, 5, 3, 1, - 1, 3, 5, 7, 7, 5, 3, 1, 1, 3, 5, 7, 7, 5, 3, 1, - 1, 3, 5, 7, 7, 5, 3, 1, 1, 3, 5, 7, 7, 5, 3, 1, - 1, 3, 5, 7, 7, 5, 3, 1, 1, 3, 5, 7, 7, 5, 3, 1); - const __m512i K8_KX8 = SIMD_MM512_SETR_EPI8( - 1, 3, 5, 7, 9, 11, 13, 15, 15, 13, 11, 9, 7, 5, 3, 1, - 1, 3, 5, 7, 9, 11, 13, 15, 15, 13, 11, 9, 7, 5, 3, 1, - 1, 3, 5, 7, 9, 11, 13, 15, 15, 13, 11, 9, 7, 5, 3, 1, - 1, 3, 5, 7, 9, 11, 13, 15, 15, 13, 11, 9, 7, 5, 3, 1); - - const __m512i K32_PERMUTE_4 = SIMD_MM512_SETR_EPI32(0, 1, 2, 3, 1, 2, 3, 4, 4, 5, 6, 7, 5, 6, 7, 8); - const __m512i K32_PERMUTE_8 = SIMD_MM512_SETR_EPI32(0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9); - - SIMD_INLINE __m512i Merge(__m512i a, __m512i b) - { - __m512i ab0 = Shuffle32i<0x88>(a, b); - __m512i ab1 = Shuffle32i<0xDD>(a, b); - return _mm512_add_epi16(ab0, ab1); - } - - const __m512i K64_PERMUTE_0 = SIMD_MM512_SETR_EPI64(0x0, 0x1, 0x8, 0x9, 0x2, 0x3, 0xA, 0xB); - const __m512i K64_PERMUTE_1 = SIMD_MM512_SETR_EPI64(0x4, 0x5, 0xC, 0xD, 0x6, 0x7, 0xE, 0xF); - - const __m512i K32_PERMUTE_BN_0 = SIMD_MM512_SETR_EPI32(0x11, 0x01, 0x10, 0x00, 0x12, 0x02, 0x11, 0x01, 0x13, 0x03, 0x12, 0x02, 0x14, 0x04, 0x13, 0x03); - const __m512i K32_PERMUTE_BN_1 = SIMD_MM512_SETR_EPI32(0x15, 0x05, 0x14, 0x04, 0x16, 0x06, 0x15, 0x05, 0x17, 0x07, 0x16, 0x06, 0x18, 0x08, 0x17, 0x07); - const __m512i K32_PERMUTE_BN_2 = SIMD_MM512_SETR_EPI32(0x19, 0x09, 0x18, 0x08, 0x1A, 0x0A, 0x19, 0x09, 0x1B, 0x0B, 0x1A, 0x0A, 0x1C, 0x0C, 0x1B, 0x0B); - - template class HogLiteFeatureExtractor - { - static const size_t FQ = 8; - static const size_t HQ = FQ / 2; - static const size_t DQ = FQ * 2; - static const size_t QQ = FQ * 4; - - typedef Array Bytes; - typedef Array Ints; - typedef Array Floats; - - size_t _hx, _fx, _w, _aw; - __mmask64 _wt; - __mmask16 _ft[2]; - Bytes _value, _index; - Ints _hi[2]; - Floats _hf[2], _nf[4], _nb; - int _k0[cell], _k1[cell]; - __m256 _02, _05, _02357, _eps; - __m512 _k; - - SIMD_INLINE void Init(size_t width) - { - _w = (width / cell - 1)*cell; - _aw = AlignLo(_w, A); - _wt = TailMask64(_w - _aw); - _hx = width / cell; - _fx = _hx - 2; - _value.Resize(_aw + 3 * A, true); - _index.Resize(_aw + 3 * A, true); - for (size_t i = 0; i < cell; ++i) - { - _k0[i] = int(cell - i - 1) * 2 + 1; - _k1[i] = int(i) * 2 + 1; - } - for (size_t i = 0; i < 2; ++i) - { - _hi[i].Resize((_hx + 8)*FQ, true); - _hf[i].Resize(AlignHi(_hx*FQ, DF)); - } - for (size_t i = 0; i < 4; ++i) - _nf[i].Resize(_hx + Avx2::F); - _nb.Resize((_hx + 12) * 4); - _k = _mm512_set1_ps(1.0f / Simd::Square(cell * 2)); - _02 = _mm256_set1_ps(0.2f); - _05 = _mm256_set1_ps(0.5f); - _02357 = _mm256_set1_ps(0.2357f); - _eps = _mm256_set1_ps(0.0001f); - } - - template static SIMD_INLINE void SetIndexAndValue(const uint8_t * src, size_t stride, uint8_t * value, uint8_t * index, __mmask64 tail = -1) - { - __m512i y0 = Load(src - stride, tail); - __m512i y1 = Load(src + stride, tail); - __m512i x0 = Load(src - 1, tail); - __m512i x1 = Load(src + 1, tail); - - __m512i ady = AbsDifferenceU8(y0, y1); - __m512i adx = AbsDifferenceU8(x0, x1); - - __m512i max = _mm512_max_epu8(ady, adx); - __m512i min = _mm512_min_epu8(ady, adx); - __m512i val = _mm512_adds_epu8(max, 
_mm512_avg_epu8(min, K_ZERO)); - Store(value, val, tail); - - __m512i idx = _mm512_mask_blend_epi8(_mm512_cmpgt_epu8_mask(adx, ady), K8_01, K_ZERO); - idx = _mm512_mask_sub_epi8(idx, _mm512_cmple_epu8_mask(x1, x0), K8_03, idx); - idx = _mm512_mask_sub_epi8(idx, _mm512_cmple_epu8_mask(y1, y0), K8_07, idx); - Store(index, idx, tail); - } - - SIMD_INLINE void SetIndexAndValue(const uint8_t * src, size_t stride) - { - uint8_t * value = _value.data + A; - uint8_t * index = _index.data + A; - size_t col = 0; - for (; col < _aw; col += A) - SetIndexAndValue(src + col, stride, value + col, index + col); - if (col < _w) - SetIndexAndValue(src + col, stride, value + col, index + col, _wt); - } - - static SIMD_INLINE void UpdateIntegerHistogram4x4(uint8_t * value, uint8_t * index, const __m512i & ky0, const __m512i & ky1, int * h0, int * h1) - { - __m512i val = _mm512_permutexvar_epi32(K32_PERMUTE_4, Load(value)); - __m512i idx = _mm512_permutexvar_epi32(K32_PERMUTE_4, Load(index)); - __m512i cur0 = K_ZERO; - __m512i cur1 = K8_01; - __m512i dirs[4]; - for (size_t i = 0; i < 4; ++i) - { - __m512i dir0 = _mm512_maddubs_epi16(_mm512_maskz_mov_epi8(_mm512_cmpeq_epi8_mask(idx, cur0), val), K8_KX4); - __m512i dir1 = _mm512_maddubs_epi16(_mm512_maskz_mov_epi8(_mm512_cmpeq_epi8_mask(idx, cur1), val), K8_KX4); - dirs[i] = Merge(dir0, dir1); - cur0 = _mm512_add_epi8(cur0, K8_02); - cur1 = _mm512_add_epi8(cur1, K8_02); - } - __m512i hx0 = Shuffle32i<0x88>(dirs[0], dirs[1]); - __m512i hx1 = Shuffle32i<0x88>(dirs[2], dirs[3]); - __m512i hx2 = Shuffle32i<0xDD>(dirs[0], dirs[1]); - __m512i hx3 = Shuffle32i<0xDD>(dirs[2], dirs[3]); - __m512i hx0p = _mm512_permutex2var_epi64(hx0, K64_PERMUTE_0, hx1); - __m512i hx1p = _mm512_permutex2var_epi64(hx0, K64_PERMUTE_1, hx1); - __m512i hx2p = _mm512_permutex2var_epi64(hx2, K64_PERMUTE_0, hx3); - __m512i hx3p = _mm512_permutex2var_epi64(hx2, K64_PERMUTE_1, hx3); - Store(h0 + 0 * F, _mm512_add_epi32(Load(h0 + 0 * F), _mm512_madd_epi16(hx0p, ky0))); - Store(h0 + 1 * F, _mm512_add_epi32(Load(h0 + 1 * F), _mm512_madd_epi16(hx2p, ky0))); - Store(h0 + 2 * F, _mm512_add_epi32(Load(h0 + 2 * F), _mm512_madd_epi16(hx1p, ky0))); - Store(h0 + 3 * F, _mm512_add_epi32(Load(h0 + 3 * F), _mm512_madd_epi16(hx3p, ky0))); - Store(h1 + 0 * F, _mm512_add_epi32(Load(h1 + 0 * F), _mm512_madd_epi16(hx0p, ky1))); - Store(h1 + 1 * F, _mm512_add_epi32(Load(h1 + 1 * F), _mm512_madd_epi16(hx2p, ky1))); - Store(h1 + 2 * F, _mm512_add_epi32(Load(h1 + 2 * F), _mm512_madd_epi16(hx1p, ky1))); - Store(h1 + 3 * F, _mm512_add_epi32(Load(h1 + 3 * F), _mm512_madd_epi16(hx3p, ky1))); - } - - SIMD_INLINE void UpdateIntegerHistogram4x4(size_t rowI, size_t rowF) - { - int * h0 = _hi[(rowI + 0) & 1].data; - int * h1 = _hi[(rowI + 1) & 1].data; - uint8_t * value = _value.data + A - cell; - uint8_t * index = _index.data + A - cell; - __m512i ky0 = _mm512_set1_epi16((short)_k0[rowF]); - __m512i ky1 = _mm512_set1_epi16((short)_k1[rowF]); - for (size_t col = 0; col <= _w;) - { - UpdateIntegerHistogram4x4(value + col, index + col, ky0, ky1, h0, h1); - col += 8 * cell; - h0 += 8 * FQ; - h1 += 8 * FQ; - } - } - - static SIMD_INLINE void UpdateIntegerHistogram8x8(uint8_t * value, uint8_t * index, const __m512i & ky0, const __m512i & ky1, int * h0, int * h1) - { - __m512i val = _mm512_permutexvar_epi32(K32_PERMUTE_8, Load(value)); - __m512i idx = _mm512_permutexvar_epi32(K32_PERMUTE_8, Load(index)); - __m512i cur0 = K_ZERO; - __m512i cur1 = K8_01; - __m512i dirs[4]; - for (size_t i = 0; i < 4; ++i) - { - __m512i dir0 = 
_mm512_maddubs_epi16(_mm512_maskz_mov_epi8(_mm512_cmpeq_epi8_mask(idx, cur0), val), K8_KX8); - __m512i dir1 = _mm512_maddubs_epi16(_mm512_maskz_mov_epi8(_mm512_cmpeq_epi8_mask(idx, cur1), val), K8_KX8); - dirs[i] = Merge(dir0, dir1); - cur0 = _mm512_add_epi8(cur0, K8_02); - cur1 = _mm512_add_epi8(cur1, K8_02); - } - dirs[0] = Merge(dirs[0], dirs[1]); - dirs[1] = Merge(dirs[2], dirs[3]); - __m512i hx0 = _mm512_permutex2var_epi64(dirs[0], K64_PERMUTE_0, dirs[1]); - __m512i hx1 = _mm512_permutex2var_epi64(dirs[0], K64_PERMUTE_1, dirs[1]); - Store(h0 + 0, _mm512_add_epi32(Load(h0 + 0), _mm512_madd_epi16(hx0, ky0))); - Store(h0 + F, _mm512_add_epi32(Load(h0 + F), _mm512_madd_epi16(hx1, ky0))); - Store(h1 + 0, _mm512_add_epi32(Load(h1 + 0), _mm512_madd_epi16(hx0, ky1))); - Store(h1 + F, _mm512_add_epi32(Load(h1 + F), _mm512_madd_epi16(hx1, ky1))); - } - - SIMD_INLINE void UpdateIntegerHistogram8x8(size_t rowI, size_t rowF) - { - int * h0 = _hi[(rowI + 0) & 1].data; - int * h1 = _hi[(rowI + 1) & 1].data; - uint8_t * value = _value.data + A - cell; - uint8_t * index = _index.data + A - cell; - __m512i ky0 = _mm512_set1_epi16((short)_k0[rowF]); - __m512i ky1 = _mm512_set1_epi16((short)_k1[rowF]); - for (size_t col = 0; col <= _w;) - { - UpdateIntegerHistogram8x8(value + col, index + col, ky0, ky1, h0, h1); - col += 4 * cell; - h0 += 4 * FQ; - h1 += 4 * FQ; - } - } - - SIMD_INLINE void UpdateFloatHistogram(size_t rowI) - { - Ints & hi = _hi[rowI & 1]; - Floats & hf = _hf[rowI & 1]; - Floats & nf = _nf[rowI & 3]; - for (size_t i = 0; i < hf.size; i += DF) - { - Avx512f::Store(hf.data + i + 0, _mm512_mul_ps(_k, _mm512_cvtepi32_ps(Load(hi.data + i + 0)))); - Avx512f::Store(hf.data + i + F, _mm512_mul_ps(_k, _mm512_cvtepi32_ps(Load(hi.data + i + F)))); - } - hi.Clear(); - - const float * h = hf.data; - size_t ahx = AlignLo(_hx, 4), x = 0; - for (; x < ahx; x += 4, h += QQ) - { - __m256 h01 = Avx2::Load(h + 0 * FQ); - __m256 h23 = Avx2::Load(h + 1 * FQ); - __m256 h45 = Avx2::Load(h + 2 * FQ); - __m256 h67 = Avx2::Load(h + 3 * FQ); - __m256 s01 = _mm256_add_ps(_mm256_permute2f128_ps(h01, h23, 0x20), _mm256_permute2f128_ps(h01, h23, 0x31)); - __m256 n01 = Avx2::Permute4x64<0x88>(_mm256_dp_ps(s01, s01, 0xF1)); - __m256 s23 = _mm256_add_ps(_mm256_permute2f128_ps(h45, h67, 0x20), _mm256_permute2f128_ps(h45, h67, 0x31)); - __m256 n23 = Avx2::Permute4x64<0x88>(_mm256_dp_ps(s23, s23, 0xF1)); - _mm_storeu_ps(nf.data + x, _mm_shuffle_ps(_mm256_castps256_ps128(n01), _mm256_castps256_ps128(n23), 0x88)); - } - for (; x < _hx; ++x, h += FQ) - { - __m128 h0 = Sse::Load(h + 00); - __m128 h1 = Sse::Load(h + HQ); - __m128 sum = _mm_add_ps(h0, h1); - _mm_store_ss(nf.data + x, _mm_dp_ps(sum, sum, 0xF1)); - } - } - - SIMD_INLINE void BlockNorm(size_t rowI) - { - const float * src0 = _nf[(rowI - 2) & 3].data; - const float * src1 = _nf[(rowI - 1) & 3].data; - const float * src2 = _nf[(rowI - 0) & 3].data; - float * dst = _nb.data; - for (size_t x = 0; x < _fx; x += 12, dst += 3 * F) - { - __m512 s0 = Avx512f::Load(src0 + x); - __m512 s1 = Avx512f::Load(src1 + x); - __m512 s2 = Avx512f::Load(src2 + x); - __m512 v0 = _mm512_add_ps(s0, s1); - __m512 v1 = _mm512_add_ps(s1, s2); - __m512 h0 = _mm512_add_ps(v0, Alignr<1>(v0, v0)); - __m512 h1 = _mm512_add_ps(v1, Alignr<1>(v1, v1)); - Avx512f::Store(dst + 0 * F, _mm512_permutex2var_ps(h0, K32_PERMUTE_BN_0, h1)); - Avx512f::Store(dst + 1 * F, _mm512_permutex2var_ps(h0, K32_PERMUTE_BN_1, h1)); - Avx512f::Store(dst + 2 * F, _mm512_permutex2var_ps(h0, K32_PERMUTE_BN_2, h1)); - } - } - - 
SIMD_INLINE __m256 Features07(const __m256 & n, const __m256 & s, __m256 & t) - { - __m256 h0 = _mm256_min_ps(_mm256_mul_ps(Avx2::Broadcast<0>(s), n), _02); - __m256 h1 = _mm256_min_ps(_mm256_mul_ps(Avx2::Broadcast<1>(s), n), _02); - __m256 h2 = _mm256_min_ps(_mm256_mul_ps(Avx2::Broadcast<2>(s), n), _02); - __m256 h3 = _mm256_min_ps(_mm256_mul_ps(Avx2::Broadcast<3>(s), n), _02); - t = _mm256_add_ps(t, _mm256_add_ps(_mm256_add_ps(h0, h1), _mm256_add_ps(h2, h3))); - return _mm256_mul_ps(_05, _mm256_hadd_ps(_mm256_hadd_ps(h0, h1), _mm256_hadd_ps(h2, h3))); - } - - SIMD_INLINE __m256 Features8B(const __m256 & n, const __m256 & s) - { - __m256 h0 = _mm256_min_ps(_mm256_mul_ps(Avx2::Broadcast<0>(s), n), _02); - __m256 h1 = _mm256_min_ps(_mm256_mul_ps(Avx2::Broadcast<1>(s), n), _02); - __m256 h2 = _mm256_min_ps(_mm256_mul_ps(Avx2::Broadcast<2>(s), n), _02); - __m256 h3 = _mm256_min_ps(_mm256_mul_ps(Avx2::Broadcast<3>(s), n), _02); - return _mm256_mul_ps(_05, _mm256_hadd_ps(_mm256_hadd_ps(h0, h1), _mm256_hadd_ps(h2, h3))); - } - - SIMD_INLINE void SetFeatures(size_t rowI, float * dst) - { - const float * hf = _hf[(rowI - 1) & 1].data + FQ; - const float * nb = _nb.data; - size_t x = 0, afx = AlignLo(_fx, 2); - for (; x < afx; x += 2, nb += 8, dst += QQ) - { - __m256 n = _mm256_rsqrt_ps(_mm256_add_ps(_mm256_load_ps(nb), _eps)); - __m256 t = _mm256_setzero_ps(); - __m256 f[4]; - const float * src = hf + x * FQ; - __m256 s0 = Avx::Load(src + 0 * HQ, src + 2 * HQ); - __m256 s1 = Avx::Load(src + 1 * HQ, src + 3 * HQ); - f[0] = Features07(n, s0, t); - f[1] = Features07(n, s1, t); - f[2] = Features8B(n, _mm256_add_ps(s0, s1)); - f[3] = _mm256_mul_ps(t, _02357); - Avx::Store(dst + 0 * Avx2::F, _mm256_permute2f128_ps(f[0], f[1], 0x20)); - Avx::Store(dst + 1 * Avx2::F, _mm256_permute2f128_ps(f[2], f[3], 0x20)); - Avx::Store(dst + 2 * Avx2::F, _mm256_permute2f128_ps(f[0], f[1], 0x31)); - Avx::Store(dst + 3 * Avx2::F, _mm256_permute2f128_ps(f[2], f[3], 0x31)); - } - for (; x < _fx; ++x, nb += 4) - { - __m128 n = _mm_rsqrt_ps(_mm_add_ps(_mm_load_ps(nb), _mm256_castps256_ps128(_eps))); - __m128 t = _mm_setzero_ps(); - const float * src = hf + x * FQ; - for (int o = 0; o < FQ; o += 4) - { - __m128 s = _mm_loadu_ps(src); - __m128 h0 = _mm_min_ps(_mm_mul_ps(Sse2::Broadcast<0>(s), n), _mm256_castps256_ps128(_02)); - __m128 h1 = _mm_min_ps(_mm_mul_ps(Sse2::Broadcast<1>(s), n), _mm256_castps256_ps128(_02)); - __m128 h2 = _mm_min_ps(_mm_mul_ps(Sse2::Broadcast<2>(s), n), _mm256_castps256_ps128(_02)); - __m128 h3 = _mm_min_ps(_mm_mul_ps(Sse2::Broadcast<3>(s), n), _mm256_castps256_ps128(_02)); - t = _mm_add_ps(t, _mm_add_ps(_mm_add_ps(h0, h1), _mm_add_ps(h2, h3))); - _mm_storeu_ps(dst, _mm_mul_ps(_mm256_castps256_ps128(_05), _mm_hadd_ps(_mm_hadd_ps(h0, h1), _mm_hadd_ps(h2, h3)))); - dst += Sse2::F; - src += Sse2::F; - } - src = hf + x * FQ; - __m128 s = _mm_add_ps(_mm_loadu_ps(src), _mm_loadu_ps(src + HQ)); - __m128 h0 = _mm_min_ps(_mm_mul_ps(Sse2::Broadcast<0>(s), n), _mm256_castps256_ps128(_02)); - __m128 h1 = _mm_min_ps(_mm_mul_ps(Sse2::Broadcast<1>(s), n), _mm256_castps256_ps128(_02)); - __m128 h2 = _mm_min_ps(_mm_mul_ps(Sse2::Broadcast<2>(s), n), _mm256_castps256_ps128(_02)); - __m128 h3 = _mm_min_ps(_mm_mul_ps(Sse2::Broadcast<3>(s), n), _mm256_castps256_ps128(_02)); - _mm_storeu_ps(dst, _mm_mul_ps(_mm256_castps256_ps128(_05), _mm_hadd_ps(_mm_hadd_ps(h0, h1), _mm_hadd_ps(h2, h3)))); - dst += 4; - _mm_storeu_ps(dst, _mm_mul_ps(t, _mm256_castps256_ps128(_02357))); - dst += 4; - } - } - - public: - - void Run(const 
uint8_t * src, size_t srcStride, size_t width, size_t height, float * features, size_t featuresStride) - { - assert(cell == 8 || cell == 4); - assert(width >= cell * 3 && height >= cell * 3); - - Init(width); - - src += (srcStride + 1)*cell / 2; - height = (height / cell - 1)*cell; - - for (size_t row = 0; row < height; ++row) - { - SetIndexAndValue(src, srcStride); - size_t rowI = row / cell; - size_t rowF = row & (cell - 1); - if (cell == 4) - UpdateIntegerHistogram4x4(rowI, rowF); - else - UpdateIntegerHistogram8x8(rowI, rowF); - if (rowF == cell - 1) - { - UpdateFloatHistogram(rowI); - if (rowI >= 2) - { - BlockNorm(rowI); - SetFeatures(rowI, features); - features += featuresStride; - } - } - src += srcStride; - } - size_t rowI = height / cell; - UpdateFloatHistogram(rowI); - BlockNorm(rowI); - SetFeatures(rowI, features); - } - }; - - void HogLiteExtractFeatures(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t cell, float * features, size_t featuresStride) - { - if (cell == 4) - { - HogLiteFeatureExtractor<4> extractor; - extractor.Run(src, srcStride, width, height, features, featuresStride); - } - else - { - HogLiteFeatureExtractor<8> extractor; - extractor.Run(src, srcStride, width, height, features, featuresStride); - } - } - - class HogLiteFeatureFilter - { - template SIMD_INLINE void ProductSum1x1(const float * src, const float * filter, __m512 & sum, __mmask16 tail = -1) - { - __m512 _src = Avx512f::Load(src, tail); - __m512 _filter = Avx512f::Load(filter, tail); - sum = _mm512_fmadd_ps(_src, _filter, sum); - } - - template SIMD_INLINE void ProductSum1x1(const float * src, const float * filter, __m256 & sum) - { - __m256 _src = Avx::Load(src); - __m256 _filter = Avx::Load(filter); - sum = _mm256_add_ps(sum, _mm256_mul_ps(_src, _filter)); - } - - template SIMD_INLINE void ProductSum1x4(const float * src, const float * filter, __m256 * sums) - { - __m256 _filter = Avx::Load(filter); - sums[0] = _mm256_fmadd_ps(Avx::Load(src + 0 * step), _filter, sums[0]); - sums[1] = _mm256_fmadd_ps(Avx::Load(src + 1 * step), _filter, sums[1]); - sums[2] = _mm256_fmadd_ps(Avx::Load(src + 2 * step), _filter, sums[2]); - sums[3] = _mm256_fmadd_ps(Avx::Load(src + 3 * step), _filter, sums[3]); - } - - template SIMD_INLINE void ProductSum1x4x8(const float * src, const float * filter, __m512 * sums) - { - __m512 _filter = _mm512_broadcast_f32x8(Avx::Load(filter)); - sums[0] = _mm512_fmadd_ps(Avx512f::Load(src + 0 * F), _filter, sums[0]); - sums[1] = _mm512_fmadd_ps(Avx512f::Load(src + 1 * F), _filter, sums[1]); - } - - template static SIMD_INLINE void ProductSum4x4x8(const float * src, const float * filter, __m512 * sums) - { - __m512 filter0 = _mm512_broadcast_f32x8(Avx::Load(filter + 0 * HF)); - __m512 src0 = Avx512f::Load(src + 0 * HF); - __m512 src2 = Avx512f::Load(src + 2 * HF); - sums[0] = _mm512_fmadd_ps(src0, filter0, sums[0]); - sums[1] = _mm512_fmadd_ps(src2, filter0, sums[1]); - __m512 filter2 = _mm512_broadcast_f32x8(Avx::Load(filter + 2 * HF)); - __m512 src4 = Avx512f::Load(src + 4 * HF); - sums[0] = _mm512_fmadd_ps(src2, filter2, sums[0]); - sums[1] = _mm512_fmadd_ps(src4, filter2, sums[1]); - __m512 filter1 = _mm512_broadcast_f32x8(Avx::Load(filter + 1 * HF)); - __m512 src1 = Alignr<8>(src0, src2); - __m512 src3 = Alignr<8>(src2, src4); - sums[0] = _mm512_fmadd_ps(src1, filter1, sums[0]); - sums[1] = _mm512_fmadd_ps(src3, filter1, sums[1]); - __m512 filter3 = _mm512_broadcast_f32x8(Avx::Load(filter + 3 * HF)); - __m512 src5 = Avx512f::Load(src + 5 * HF); - 
sums[0] = _mm512_fmadd_ps(src3, filter3, sums[0]); - sums[1] = _mm512_fmadd_ps(src5, filter3, sums[1]); - } - - template void Filter8(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, const float * filter, size_t filterWidth, size_t filterHeight, float * dst, size_t dstStride) - { - size_t filterStride = 8 * filterWidth; - size_t alignedDstWidth = AlignLo(dstWidth, 4); - size_t alignedFilterStride = AlignLo(filterStride, DF); - for (size_t dstRow = 0; dstRow < dstHeight; ++dstRow) - { - size_t dstCol = 0; - for (; dstCol < alignedDstWidth; dstCol += 4) - { - __m512 sums[2] = { _mm512_setzero_ps(), _mm512_setzero_ps() }; - const float * pSrc = src + dstRow * srcStride + dstCol * 8; - const float * pFilter = filter; - for (size_t filterRow = 0; filterRow < filterHeight; ++filterRow) - { - size_t filterCol = 0; - for (; filterCol < alignedFilterStride; filterCol += DF) - ProductSum4x4x8(pSrc + filterCol, pFilter + filterCol, sums); - for (; filterCol < filterStride; filterCol += HF) - ProductSum1x4x8(pSrc + filterCol, pFilter + filterCol, sums); - pSrc += srcStride; - pFilter += filterStride; - } - __m256 sum0 = _mm256_hadd_ps(_mm512_castps512_ps256(sums[0]), _mm512_castps512_ps256(Alignr<8>(sums[0], sums[0]))); - __m256 sum1 = _mm256_hadd_ps(_mm512_castps512_ps256(sums[1]), _mm512_castps512_ps256(Alignr<8>(sums[1], sums[1]))); - __m256 sum = _mm256_hadd_ps(sum0, sum1); - _mm_storeu_ps(dst + dstCol, _mm_add_ps(_mm256_castps256_ps128(sum), _mm256_extractf128_ps(sum, 1))); - } - for (; dstCol < dstWidth; ++dstCol) - { - __m256 sum = _mm256_setzero_ps(); - const float * pSrc = src + dstRow * srcStride + dstCol * 8; - const float * pFilter = filter; - for (size_t filterRow = 0; filterRow < filterHeight; ++filterRow) - { - for (size_t filterCol = 0; filterCol < filterStride; filterCol += Avx::F) - ProductSum1x1(pSrc + filterCol, pFilter + filterCol, sum); - pSrc += srcStride; - pFilter += filterStride; - } - dst[dstCol] = Avx::ExtractSum(sum); - } - dst += dstStride; - } - } - - template void Filter8(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) - { - size_t filterStride = 8 * filterWidth; - size_t alignedDstWidth = AlignLo(dstWidth, 8); - size_t alignedFilterStride = AlignLo(filterStride, DF); - __m128 _min = _mm_set1_ps(-FLT_MAX); - for (size_t dstRow = 0; dstRow < dstHeight; ++dstRow) - { - size_t dstCol = 0; - for (; dstCol < alignedDstWidth; dstCol += 4) - { - __m128 _mask = _mm_castsi128_ps(_mm_loadu_si128((__m128i*)(mask + dstCol))); - if (Sse41::TestZ(_mask)) - _mm_storeu_ps(dst + dstCol, _min); - else - { - __m512 sums[2] = { _mm512_setzero_ps(), _mm512_setzero_ps() }; - const float * pSrc = src + dstRow * srcStride + dstCol * 8; - const float * pFilter = filter; - for (size_t filterRow = 0; filterRow < filterHeight; ++filterRow) - { - size_t filterCol = 0; - for (; filterCol < alignedFilterStride; filterCol += DF) - ProductSum4x4x8(pSrc + filterCol, pFilter + filterCol, sums); - for (; filterCol < filterStride; filterCol += HF) - ProductSum1x4x8(pSrc + filterCol, pFilter + filterCol, sums); - pSrc += srcStride; - pFilter += filterStride; - } - __m256 sum0 = _mm256_hadd_ps(_mm512_castps512_ps256(sums[0]), _mm512_castps512_ps256(Alignr<8>(sums[0], sums[0]))); - __m256 sum1 = _mm256_hadd_ps(_mm512_castps512_ps256(sums[1]), _mm512_castps512_ps256(Alignr<8>(sums[1], sums[1]))); - __m256 sum = 
_mm256_hadd_ps(sum0, sum1); - _mm_storeu_ps(dst + dstCol, _mm_blendv_ps(_min, _mm_add_ps(_mm256_castps256_ps128(sum), _mm256_extractf128_ps(sum, 1)), _mask)); - } - } - for (; dstCol < dstWidth; ++dstCol) - { - if (mask[dstCol]) - { - __m256 sum = _mm256_setzero_ps(); - const float * pSrc = src + dstRow * srcStride + dstCol * 8; - const float * pFilter = filter; - for (size_t filterRow = 0; filterRow < filterHeight; ++filterRow) - { - for (size_t filterCol = 0; filterCol < filterStride; filterCol += Avx::F) - ProductSum1x1(pSrc + filterCol, pFilter + filterCol, sum); - pSrc += srcStride; - pFilter += filterStride; - } - dst[dstCol] = Avx::ExtractSum(sum); - } - else - dst[dstCol] = -FLT_MAX; - } - dst += dstStride; - mask += maskStride; - } - } - - template static SIMD_INLINE void ProductSum1x4x16(const float * src, const float * filter, __m512 * sums) - { - __m512 _filter = Avx512f::Load(filter); - sums[0] = _mm512_fmadd_ps(Avx512f::Load(src + 0 * F), _filter, sums[0]); - sums[1] = _mm512_fmadd_ps(Avx512f::Load(src + 1 * F), _filter, sums[1]); - sums[2] = _mm512_fmadd_ps(Avx512f::Load(src + 2 * F), _filter, sums[2]); - sums[3] = _mm512_fmadd_ps(Avx512f::Load(src + 3 * F), _filter, sums[3]); - } - - template static SIMD_INLINE void ProductSum4x4x16(const float * src, const float * filter, __m512 * sums) - { - __m512 filter0 = Avx512f::Load(filter + 0 * F); - __m512 src0 = Avx512f::Load(src + 0 * F); - __m512 src1 = Avx512f::Load(src + 1 * F); - __m512 src2 = Avx512f::Load(src + 2 * F); - __m512 src3 = Avx512f::Load(src + 3 * F); - sums[0] = _mm512_fmadd_ps(src0, filter0, sums[0]); - sums[1] = _mm512_fmadd_ps(src1, filter0, sums[1]); - sums[2] = _mm512_fmadd_ps(src2, filter0, sums[2]); - sums[3] = _mm512_fmadd_ps(src3, filter0, sums[3]); - __m512 filter1 = Avx512f::Load(filter + 1 * F); - __m512 src4 = Avx512f::Load(src + 4 * F); - sums[0] = _mm512_fmadd_ps(src1, filter1, sums[0]); - sums[1] = _mm512_fmadd_ps(src2, filter1, sums[1]); - sums[2] = _mm512_fmadd_ps(src3, filter1, sums[2]); - sums[3] = _mm512_fmadd_ps(src4, filter1, sums[3]); - } - - template void Filter16(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, const float * filter, size_t filterWidth, size_t filterHeight, float * dst, size_t dstStride) - { - size_t filterStride = 16 * filterWidth; - size_t alignedDstWidth = AlignLo(dstWidth, 4); - size_t alignedFilterStride = AlignLo(filterStride, DF); - for (size_t dstRow = 0; dstRow < dstHeight; ++dstRow) - { - size_t dstCol = 0; - for (; dstCol < alignedDstWidth; dstCol += 4) - { - __m512 sums[4] = { _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps() }; - const float * pSrc = src + dstRow * srcStride + dstCol * 16; - const float * pFilter = filter; - for (size_t filterRow = 0; filterRow < filterHeight; ++filterRow) - { - size_t filterCol = 0; - for (; filterCol < alignedFilterStride; filterCol += DF) - ProductSum4x4x16(pSrc + filterCol, pFilter + filterCol, sums); - for (; filterCol < filterStride; filterCol += F) - ProductSum1x4x16(pSrc + filterCol, pFilter + filterCol, sums); - pSrc += srcStride; - pFilter += filterStride; - } - __m256 sum0 = _mm512_castps512_ps256(_mm512_add_ps(sums[0], Alignr<8>(sums[0], sums[0]))); - __m256 sum1 = _mm512_castps512_ps256(_mm512_add_ps(sums[1], Alignr<8>(sums[1], sums[1]))); - __m256 sum2 = _mm512_castps512_ps256(_mm512_add_ps(sums[2], Alignr<8>(sums[2], sums[2]))); - __m256 sum3 = _mm512_castps512_ps256(_mm512_add_ps(sums[3], Alignr<8>(sums[3], sums[3]))); - __m256 sum = 
_mm256_hadd_ps(_mm256_hadd_ps(sum0, sum1), _mm256_hadd_ps(sum2, sum3)); - _mm_storeu_ps(dst + dstCol, _mm_add_ps(_mm256_castps256_ps128(sum), _mm256_extractf128_ps(sum, 1))); - } - for (; dstCol < dstWidth; ++dstCol) - { - __m512 sum = _mm512_setzero_ps(); - const float * pSrc = src + dstRow * srcStride + dstCol * 16; - const float * pFilter = filter; - for (size_t filterRow = 0; filterRow < filterHeight; ++filterRow) - { - for (size_t filterCol = 0; filterCol < filterStride; filterCol += F) - ProductSum1x1(pSrc + filterCol, pFilter + filterCol, sum); - pSrc += srcStride; - pFilter += filterStride; - } - dst[dstCol] = Avx512f::ExtractSum(sum); - } - dst += dstStride; - } - } - - template void Filter16(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) - { - size_t filterStride = 16 * filterWidth; - size_t alignedDstWidth = AlignLo(dstWidth, 4); - size_t alignedFilterStride = AlignLo(filterStride, DF); - __m128 _min = _mm_set1_ps(-FLT_MAX); - for (size_t dstRow = 0; dstRow < dstHeight; ++dstRow) - { - size_t dstCol = 0; - for (; dstCol < alignedDstWidth; dstCol += 4) - { - __m128 _mask = _mm_castsi128_ps(_mm_loadu_si128((__m128i*)(mask + dstCol))); - if (Sse41::TestZ(_mask)) - _mm_storeu_ps(dst + dstCol, _min); - else - { - __m512 sums[4] = { _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps() }; - const float * pSrc = src + dstRow * srcStride + dstCol * 16; - const float * pFilter = filter; - for (size_t filterRow = 0; filterRow < filterHeight; ++filterRow) - { - size_t filterCol = 0; - for (; filterCol < alignedFilterStride; filterCol += DF) - ProductSum4x4x16(pSrc + filterCol, pFilter + filterCol, sums); - for (; filterCol < filterStride; filterCol += F) - ProductSum1x4x16(pSrc + filterCol, pFilter + filterCol, sums); - pSrc += srcStride; - pFilter += filterStride; - } - __m256 sum0 = _mm512_castps512_ps256(_mm512_add_ps(sums[0], Alignr<8>(sums[0], sums[0]))); - __m256 sum1 = _mm512_castps512_ps256(_mm512_add_ps(sums[1], Alignr<8>(sums[1], sums[1]))); - __m256 sum2 = _mm512_castps512_ps256(_mm512_add_ps(sums[2], Alignr<8>(sums[2], sums[2]))); - __m256 sum3 = _mm512_castps512_ps256(_mm512_add_ps(sums[3], Alignr<8>(sums[3], sums[3]))); - __m256 sum = _mm256_hadd_ps(_mm256_hadd_ps(sum0, sum1), _mm256_hadd_ps(sum2, sum3)); - _mm_storeu_ps(dst + dstCol, _mm_blendv_ps(_min, _mm_add_ps(_mm256_castps256_ps128(sum), _mm256_extractf128_ps(sum, 1)), _mask)); - } - } - for (; dstCol < dstWidth; ++dstCol) - { - if (mask[dstCol]) - { - __m512 sum = _mm512_setzero_ps(); - const float * pSrc = src + dstRow * srcStride + dstCol * 16; - const float * pFilter = filter; - for (size_t filterRow = 0; filterRow < filterHeight; ++filterRow) - { - for (size_t filterCol = 0; filterCol < filterStride; filterCol += F) - ProductSum1x1(pSrc + filterCol, pFilter + filterCol, sum); - pSrc += srcStride; - pFilter += filterStride; - } - dst[dstCol] = Avx512f::ExtractSum(sum); - } - else - dst[dstCol] = -FLT_MAX; - } - dst += dstStride; - mask += maskStride; - } - } - - template void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, float * dst, size_t dstStride) - { - if (featureSize == 16) - Filter16(src, srcStride, dstWidth, dstHeight, filter, filterWidth, filterHeight, dst, dstStride); - else - Filter8(src, srcStride, 
dstWidth, dstHeight, filter, filterWidth, filterHeight, dst, dstStride); - } - - template void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) - { - if (featureSize == 16) - Filter16(src, srcStride, dstWidth, dstHeight, filter, filterWidth, filterHeight, mask, maskStride, dst, dstStride); - else - Filter8(src, srcStride, dstWidth, dstHeight, filter, filterWidth, filterHeight, mask, maskStride, dst, dstStride); - } - - public: - - void Run(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) - { - assert(featureSize == 8 || featureSize == 16); - assert(srcWidth >= filterWidth && srcHeight >= filterHeight); - - size_t dstWidth = srcWidth - filterWidth + 1; - size_t dstHeight = srcHeight - filterHeight + 1; - - if (mask) - { - if (Aligned(src) && Aligned(srcStride) && Aligned(filter)) - Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterWidth, filterHeight, mask, maskStride, dst, dstStride); - else - Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterWidth, filterHeight, mask, maskStride, dst, dstStride); - } - else - { - if (Aligned(src) && Aligned(srcStride) && Aligned(filter)) - Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterWidth, filterHeight, dst, dstStride); - else - Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterWidth, filterHeight, dst, dstStride); - } - } - }; - - void HogLiteFilterFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) - { - HogLiteFeatureFilter featureFilter; - featureFilter.Run(src, srcStride, srcWidth, srcHeight, featureSize, filter, filterWidth, filterHeight, mask, maskStride, dst, dstStride); - } - - class HogLiteFeatureResizer - { - typedef Array Ints; - typedef Array Floats; - - Ints _iy, _ix; - Floats _ky, _kx; - - void InitIndexWeight(size_t srcSize, size_t dstSize, size_t dstStep, Ints & indexes, Floats & weights) - { - indexes.Resize(dstSize); - weights.Resize(dstSize); - - float scale = float(srcSize) / float(dstSize); - for (size_t i = 0; i < dstSize; ++i) - { - float weight = (float)((i + 0.5f)*scale - 0.5f); - int index = (int)::floor(weight); - weight -= index; - if (index < 0) - { - index = 0; - weight = 0.0f; - } - if (index > (int)srcSize - 2) - { - index = (int)srcSize - 2; - weight = 1.0f; - } - indexes[i] = int(index*dstStep); - weights[i] = weight; - } - } - - template void Resize8(const float * src, size_t srcStride, float * dst, size_t dstStride, size_t dstWidth, size_t dstHeight) - { - __m512 _1 = _mm512_set1_ps(1.0f); - size_t alignedDstWidth = AlignLo(dstWidth, 2); - for (size_t rowDst = 0; rowDst < dstHeight; ++rowDst) - { - __m512 ky1 = _mm512_set1_ps(_ky[rowDst]); - __m512 ky0 = _mm512_sub_ps(_1, ky1); - const float * pSrc = src + _iy[rowDst]; - float * pDst = dst + rowDst * dstStride; - size_t colDst = 0; - for (; colDst < alignedDstWidth; colDst += 2, pDst += F) - { - __m512 kx1 = _mm512_insertf32x8(_mm512_set1_ps(_kx[colDst + 0]), _mm256_set1_ps(_kx[colDst + 1]), 1); - __m512 kx0 = _mm512_sub_ps(_1, kx1); - 
__m512 k00 = _mm512_mul_ps(ky0, kx0); - __m512 k01 = _mm512_mul_ps(ky0, kx1); - __m512 k10 = _mm512_mul_ps(ky1, kx0); - __m512 k11 = _mm512_mul_ps(ky1, kx1); - const float * pSrc00 = pSrc + _ix[colDst + 0]; - const float * pSrc01 = pSrc + _ix[colDst + 1]; - const float * pSrc10 = pSrc00 + srcStride; - const float * pSrc11 = pSrc01 + srcStride; - Avx512f::Store(pDst, _mm512_add_ps( - _mm512_fmadd_ps(Load(pSrc00, pSrc01), k00, _mm512_mul_ps(Load(pSrc00 + Avx2::F, pSrc01 + Avx2::F), k01)), - _mm512_fmadd_ps(Load(pSrc10, pSrc11), k10, _mm512_mul_ps(Load(pSrc10 + Avx2::F, pSrc11 + Avx2::F), k11)))); - } - for (; colDst < dstWidth; ++colDst, pDst += Avx2::F) - { - __m256 kx1 = _mm256_set1_ps(_kx[colDst]); - __m256 kx0 = _mm256_sub_ps(_mm512_castps512_ps256(_1), kx1); - __m256 k00 = _mm256_mul_ps(_mm512_castps512_ps256(ky0), kx0); - __m256 k01 = _mm256_mul_ps(_mm512_castps512_ps256(ky0), kx1); - __m256 k10 = _mm256_mul_ps(_mm512_castps512_ps256(ky1), kx0); - __m256 k11 = _mm256_mul_ps(_mm512_castps512_ps256(ky1), kx1); - const float * pSrc0 = pSrc + _ix[colDst]; - const float * pSrc1 = pSrc0 + srcStride; - Avx::Store(pDst, _mm256_add_ps( - _mm256_fmadd_ps(Avx::Load(pSrc0), k00, _mm256_mul_ps(Avx::Load(pSrc0 + Avx2::F), k01)), - _mm256_fmadd_ps(Avx::Load(pSrc1), k10, _mm256_mul_ps(Avx::Load(pSrc1 + Avx2::F), k11)))); - } - } - } - - template void Resize16(const float * src, size_t srcStride, float * dst, size_t dstStride, size_t dstWidth, size_t dstHeight) - { - __m512 _1 = _mm512_set1_ps(1.0f); - for (size_t rowDst = 0; rowDst < dstHeight; ++rowDst) - { - __m512 ky1 = _mm512_set1_ps(_ky[rowDst]); - __m512 ky0 = _mm512_sub_ps(_1, ky1); - const float * pSrc = src + _iy[rowDst]; - float * pDst = dst + rowDst * dstStride; - for (size_t colDst = 0; colDst < dstWidth; ++colDst, pDst += F) - { - __m512 kx1 = _mm512_set1_ps(_kx[colDst]); - __m512 kx0 = _mm512_sub_ps(_1, kx1); - __m512 k00 = _mm512_mul_ps(ky0, kx0); - __m512 k01 = _mm512_mul_ps(ky0, kx1); - __m512 k10 = _mm512_mul_ps(ky1, kx0); - __m512 k11 = _mm512_mul_ps(ky1, kx1); - const float * pSrc0 = pSrc + _ix[colDst]; - const float * pSrc1 = pSrc0 + srcStride; - Avx512f::Store(pDst, _mm512_add_ps( - _mm512_fmadd_ps(Avx512f::Load(pSrc0), k00, _mm512_mul_ps(Avx512f::Load(pSrc0 + F), k01)), - _mm512_fmadd_ps(Avx512f::Load(pSrc1), k10, _mm512_mul_ps(Avx512f::Load(pSrc1 + F), k11)))); - } - } - } - - template void Resize(const float * src, size_t srcStride, size_t featureSize, float * dst, size_t dstStride, size_t dstWidth, size_t dstHeight) - { - if (featureSize == 8) - Resize8(src, srcStride, dst, dstStride, dstWidth, dstHeight); - else - Resize16(src, srcStride, dst, dstStride, dstWidth, dstHeight); - } - - public: - void Run(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, float * dst, size_t dstStride, size_t dstWidth, size_t dstHeight) - { - assert(featureSize == 8 || featureSize == 16); - - if (srcWidth == dstWidth && srcHeight == dstHeight) - { - size_t size = sizeof(float)*srcWidth*featureSize; - for (size_t row = 0; row < dstHeight; ++row) - memcpy(dst + row * dstStride, src + row * srcStride, size); - return; - } - - InitIndexWeight(srcWidth, dstWidth, featureSize, _ix, _kx); - InitIndexWeight(srcHeight, dstHeight, srcStride, _iy, _ky); - - if (Aligned(src) && Aligned(dst)) - Resize(src, srcStride, featureSize, dst, dstStride, dstWidth, dstHeight); - else - Resize(src, srcStride, featureSize, dst, dstStride, dstWidth, dstHeight); - } - }; - - void HogLiteResizeFeatures(const float * src, size_t 
srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, float * dst, size_t dstStride, size_t dstWidth, size_t dstHeight) - { - HogLiteFeatureResizer featureResizer; - featureResizer.Run(src, srcStride, srcWidth, srcHeight, featureSize, dst, dstStride, dstWidth, dstHeight); - } - - template void HogLiteCompressFeatures(const float * src, size_t srcStride, size_t width, size_t height, const float * pca, float * dst, size_t dstStride) - { - if (align) - assert(Aligned(src) && Aligned(pca) && Aligned(dst)); - - SIMD_ALIGNED(64) float pca2[128]; - for (size_t i = 0; i < 8; ++i) - { - for (size_t j = 0; j < 8; ++j) - pca2[j * 16 + i + 0] = pca[i * 16 + j + 0]; - for (size_t j = 0; j < 8; ++j) - pca2[j * 16 + i + 8] = pca[i * 16 + j + 8]; - } - __m512 _pca[8]; - for (size_t i = 0; i < 8; ++i) - _pca[i] = Avx512f::Load(pca2 + i * F); - for (size_t row = 0; row < height; ++row) - { - const float * s = src; - float * d = dst; - for (size_t col = 0; col < width; ++col) - { - __m512 sums[2] = { _mm512_setzero_ps(), _mm512_setzero_ps() }; - __m512 _src = Avx512f::Load(s); - __m512 src0 = Shuffle2x<0x44>(_src); - __m512 src1 = Shuffle2x<0xEE>(_src); - sums[0] = _mm512_fmadd_ps(Broadcast<0>(src0), _pca[0], sums[0]); - sums[1] = _mm512_fmadd_ps(Broadcast<0>(src1), _pca[4], sums[1]); - sums[0] = _mm512_fmadd_ps(Broadcast<1>(src0), _pca[1], sums[0]); - sums[1] = _mm512_fmadd_ps(Broadcast<1>(src1), _pca[5], sums[1]); - sums[0] = _mm512_fmadd_ps(Broadcast<2>(src0), _pca[2], sums[0]); - sums[1] = _mm512_fmadd_ps(Broadcast<2>(src1), _pca[6], sums[1]); - sums[0] = _mm512_fmadd_ps(Broadcast<3>(src0), _pca[3], sums[0]); - sums[1] = _mm512_fmadd_ps(Broadcast<3>(src1), _pca[7], sums[1]); - sums[0] = _mm512_add_ps(sums[0], sums[1]); - sums[0] = _mm512_add_ps(sums[0], Avx512f::Alignr<8>(sums[0], _mm512_setzero_ps())); - Avx::Store(d, _mm512_castps512_ps256(sums[0])); - s += 16; - d += 8; - } - src += srcStride; - dst += dstStride; - } - - } - - void HogLiteCompressFeatures(const float * src, size_t srcStride, size_t width, size_t height, const float * pca, float * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(pca) && Aligned(dst)) - HogLiteCompressFeatures(src, srcStride, width, height, pca, dst, dstStride); - else - HogLiteCompressFeatures(src, srcStride, width, height, pca, dst, dstStride); - } - - class HogLiteSeparableFilter - { - size_t _dstWidth, _dstHeight, _dstStride; - Array32f _buffer; - Array512f _filter; - - void Init(size_t srcWidth, size_t srcHeight, size_t hSize, size_t vSize) - { - _dstWidth = srcWidth - hSize + 1; - _dstStride = AlignHi(_dstWidth, Avx512f::F); - _dstHeight = srcHeight - vSize + 1; - _buffer.Resize(_dstStride*srcHeight); - } - - template static SIMD_INLINE void FilterHx1x8(const float * src, const float * filter, __m256 & sum) - { - __m256 _src = Avx::Load(src); - __m256 _filter = Avx::Load(filter); - sum = _mm256_fmadd_ps(_src, _filter, sum); - } - - template static SIMD_INLINE void FilterHx4x8(const float * src, const float * filter, __m256 * sums) - { - __m256 _filter = Avx::Load(filter); - sums[0] = _mm256_fmadd_ps(Avx::Load(src + 0 * Avx::F), _filter, sums[0]); - sums[1] = _mm256_fmadd_ps(Avx::Load(src + 1 * Avx::F), _filter, sums[1]); - sums[2] = _mm256_fmadd_ps(Avx::Load(src + 2 * Avx::F), _filter, sums[2]); - sums[3] = _mm256_fmadd_ps(Avx::Load(src + 3 * Avx::F), _filter, sums[3]); - } - - template void FilterHx8o(const float * src, size_t srcStride, size_t width, size_t height, const float * filter, size_t size, float * dst, size_t dstStride) - { - const 
size_t step = 8; - size_t alignedWidth = AlignLo(width, 4); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < alignedWidth; col += 4) - { - __m256 sums[4] = { _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps() }; - const float * s = src + col * step; - for (size_t i = 0; i < size; i += Avx::F) - FilterHx4x8(s + i, filter + i, sums); - Sse::Store(dst + col, Avx::Extract4Sums(sums)); - } - for (; col < width; ++col) - { - __m256 sum = _mm256_setzero_ps(); - const float * s = src + col * step; - for (size_t i = 0; i < size; i += Avx::F) - FilterHx1x8(s + i, filter + i, sum); - dst[col] = Avx::ExtractSum(sum); - } - src += srcStride; - dst += dstStride; - } - } - - template static SIMD_INLINE void FilterHx4x8(const float * src, const float * filter, __m512 * sums) - { - __m512 _filter = Avx512f::Load(filter); - sums[0] = _mm512_fmadd_ps(Avx512f::Load(src + 0 * HF), _filter, sums[0]); - sums[1] = _mm512_fmadd_ps(Avx512f::Load(src + 1 * HF), _filter, sums[1]); - sums[2] = _mm512_fmadd_ps(Avx512f::Load(src + 2 * HF), _filter, sums[2]); - sums[3] = _mm512_fmadd_ps(Avx512f::Load(src + 3 * HF), _filter, sums[3]); - } - - template void FilterHx8e(const float * src, size_t srcStride, size_t width, size_t height, const float * filter, size_t size, float * dst, size_t dstStride) - { - const size_t step = 8; - size_t alignedWidth = AlignLo(width, 4); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < alignedWidth; col += 4) - { - __m512 sums[4] = { _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps() }; - const float * s = src + col * step; - for (size_t i = 0; i < size; i += F) - FilterHx4x8(s + i, filter + i, sums); - Sse::Store(dst + col, Avx512f::Extract4Sums(sums)); - } - for (; col < width; ++col) - { - __m256 sum = _mm256_setzero_ps(); - const float * s = src + col * step; - for (size_t i = 0; i < size; i += Avx::F) - FilterHx1x8(s + i, filter + i, sum); - dst[col] = Avx::ExtractSum(sum); - } - src += srcStride; - dst += dstStride; - } - } - - template static SIMD_INLINE void FilterHx1x16(const float * src, const float * filter, __m512 & sum) - { - __m512 _src = Avx512f::Load(src); - __m512 _filter = Avx512f::Load(filter); - sum = _mm512_fmadd_ps(_src, _filter, sum); - } - - template static SIMD_INLINE void FilterHx4x16(const float * src, const float * filter, __m512 * sums) - { - __m512 _filter = Avx512f::Load(filter); - sums[0] = _mm512_fmadd_ps(Avx512f::Load(src + 0 * F), _filter, sums[0]); - sums[1] = _mm512_fmadd_ps(Avx512f::Load(src + 1 * F), _filter, sums[1]); - sums[2] = _mm512_fmadd_ps(Avx512f::Load(src + 2 * F), _filter, sums[2]); - sums[3] = _mm512_fmadd_ps(Avx512f::Load(src + 3 * F), _filter, sums[3]); - } - - template static SIMD_INLINE void FilterHx4x16x2(const float * src, const float * filter, __m512 * sums) - { - __m512 filter0 = Avx512f::Load(filter + 0 * F); - __m512 src0 = Avx512f::Load(src + 0 * F); - __m512 src1 = Avx512f::Load(src + 1 * F); - __m512 src2 = Avx512f::Load(src + 2 * F); - __m512 src3 = Avx512f::Load(src + 3 * F); - sums[0] = _mm512_fmadd_ps(src0, filter0, sums[0]); - sums[1] = _mm512_fmadd_ps(src1, filter0, sums[1]); - sums[2] = _mm512_fmadd_ps(src2, filter0, sums[2]); - sums[3] = _mm512_fmadd_ps(src3, filter0, sums[3]); - __m512 filter1 = Avx512f::Load(filter + 1 * F); - __m512 src4 = Avx512f::Load(src + 4 * F); - sums[0] = _mm512_fmadd_ps(src1, filter1, sums[0]); - sums[1] = _mm512_fmadd_ps(src2, filter1, sums[1]); - sums[2] = 
_mm512_fmadd_ps(src3, filter1, sums[2]); - sums[3] = _mm512_fmadd_ps(src4, filter1, sums[3]); - } - - template void FilterHx16(const float * src, size_t srcStride, size_t width, size_t height, const float * filter, size_t size, float * dst, size_t dstStride) - { - const size_t step = 16; - size_t alignedWidth = AlignLo(width, 4); - size_t alignedSize = AlignLo(size, 2); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < alignedWidth; col += 4) - { - __m512 sums[4] = { _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps() }; - const float * s = src + col * step; - size_t i = 0; - for (; i < alignedSize; i += DF) - FilterHx4x16x2(s + i, filter + i, sums); - for (; i < size; i += F) - FilterHx4x16(s + i, filter + i, sums); - _mm_storeu_ps(dst + col, Avx512f::Extract4Sums(sums)); - } - for (; col < width; ++col) - { - __m512 sum = _mm512_setzero_ps(); - const float * s = src + col * step; - for (size_t i = 0; i < size; i += F) - FilterHx1x16(s + i, filter + i, sum); - dst[col] = Avx512f::ExtractSum(sum); - } - src += srcStride; - dst += dstStride; - } - } - - template void FilterH(const float * src, size_t srcStride, size_t width, size_t height, size_t step, const float * filter, size_t size, float * dst, size_t dstStride) - { - if (step == 16) - FilterHx16(src, srcStride, width, height, filter, size, dst, dstStride); - else - { - if (size & 1) - FilterHx8o(src, srcStride, width, height, filter, size, dst, dstStride); - else - FilterHx8e(src, srcStride, width, height, filter, size, dst, dstStride); - } - } - - template static SIMD_INLINE void FilterV(const float * src, size_t stride, const __m512 * filter, size_t size, float * dst, __mmask16 tail = -1) - { - __m512 sum = _mm512_setzero_ps(); - for (size_t i = 0; i < size; ++i, src += stride) - sum = _mm512_fmadd_ps(Avx512f::Load(src), filter[i], sum); - Avx512f::Update(dst, sum, tail); - } - - template void FilterV(const float * src, size_t srcStride, size_t width, size_t height, const float * filter, size_t size, float * dst, size_t dstStride) - { - _filter.Resize(size); - for (size_t i = 0; i < size; ++i) - _filter[i] = _mm512_set1_ps(filter[i]); - - size_t alignedWidth = AlignLo(width, F); - __mmask16 tailMask = TailMask16(width - alignedWidth); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < alignedWidth; col += F) - FilterV(src + col, srcStride, _filter.data, size, dst + col); - if (col < width) - FilterV(src + col, srcStride, _filter.data, size, dst + col, tailMask); - src += srcStride; - dst += dstStride; - } - } - - template void FilterV(const float * src, size_t srcStride, size_t width, size_t height, const float * filter, size_t size, float * dst, size_t dstStride) - { - if (Aligned(dst) && Aligned(dstStride)) - FilterV(src, srcStride, width, height, filter, size, dst, dstStride); - else - FilterV(src, srcStride, width, height, filter, size, dst, dstStride); - } - - public: - - void Run(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * hFilter, size_t hSize, const float * vFilter, size_t vSize, float * dst, size_t dstStride, int add) - { - assert(featureSize == 8 || featureSize == 16); - assert(srcWidth >= hSize && srcHeight >= vSize); - - Init(srcWidth, srcHeight, hSize, vSize); - - if (Aligned(src) && Aligned(srcStride) && Aligned(hFilter)) - FilterH(src, srcStride, _dstWidth, srcHeight, featureSize, hFilter, hSize*featureSize, _buffer.data, _dstStride); - else - FilterH(src, srcStride, 
_dstWidth, srcHeight, featureSize, hFilter, hSize*featureSize, _buffer.data, _dstStride); - - if (add) - FilterV(_buffer.data, _dstStride, _dstWidth, _dstHeight, vFilter, vSize, dst, dstStride); - else - FilterV(_buffer.data, _dstStride, _dstWidth, _dstHeight, vFilter, vSize, dst, dstStride); - } - }; - - void HogLiteFilterSeparable(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * hFilter, size_t hSize, const float * vFilter, size_t vSize, float * dst, size_t dstStride, int add) - { - HogLiteSeparableFilter filter; - filter.Run(src, srcStride, srcWidth, srcHeight, featureSize, hFilter, hSize, vFilter, vSize, dst, dstStride, add); - } - - const __m512i K64_15 = SIMD_MM512_SET1_EPI64(15); - const __m512i K32_64_TO_32_2 = SIMD_MM512_SETR_EPI32(0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14); - - class HogLiteMaskCreater - { - typedef Simd::Array Ints; - Ints _sums[8]; - size_t _dstWidth, _alignedDstWidth, _dstHeight; - __mmask16 _dstWidthTail; - - void Init(size_t srcWidth, size_t srcHeight, size_t scale, size_t size) - { - _dstWidth = srcWidth * scale + size - scale; - _alignedDstWidth = AlignLo(_dstWidth, F); - _dstWidthTail = TailMask16(_dstWidth - _alignedDstWidth); - _dstHeight = srcHeight * scale + size - scale; - size_t sumSize = AlignHi(_dstWidth, F) + 2 * F; - for (size_t i = 0; i < 8; ++i) - _sums[i].Resize(sumSize, true); - } - - template SIMD_INLINE void SetDstRow(const uint32_t * sum0, const uint32_t * sum1, uint32_t * dst, __mmask16 tail = -1) - { - __m512i s00 = Load(sum0 - step); - __m512i s10 = Load(sum1 - step); - __m512i s01 = Load(sum0); - __m512i s11 = Load(sum1); - __m512i sum = _mm512_sub_epi32(_mm512_sub_epi32(s11, s10), _mm512_sub_epi32(s01, s00)); - __m512i value = _mm512_movm_epi32(_mm512_cmpgt_epi32_mask(sum, K_ZERO)); - Store(dst, value, tail); - } - - template SIMD_INLINE void SetDstRow(const uint32_t * sum0, const uint32_t * sum1, uint32_t * dst) - { - size_t dstCol = 0; - for (; dstCol < _alignedDstWidth; dstCol += F) - SetDstRow(sum0 + dstCol, sum1 + dstCol, dst + dstCol); - if (dstCol < _dstWidth) - SetDstRow(sum0 + dstCol, sum1 + dstCol, dst + dstCol, _dstWidthTail); - } - - void CreateMask7x7x1(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, const float * threshold, uint32_t * dst, size_t dstStride) - { - size_t alignedSrcWidth = AlignLo(srcWidth, F); - __m512 _threshold = _mm512_set1_ps(*threshold); - for (size_t row = 0; row < srcHeight; ++row) - { - uint32_t * sum0 = _sums[(row + 0) & 7].data + F; - uint32_t * sum6 = _sums[(row + 6) & 7].data + F; - uint32_t * sum7 = _sums[(row + 7) & 7].data + F; - - __m512i _rowSums = K_ZERO; - size_t col = 0; - for (; col < alignedSrcWidth; col += F) - { - __mmask16 mmask = _mm512_cmp_ps_mask(Avx512f::Load(src + col), _threshold, _CMP_GT_OQ); - - __mmask64 lo = ((mmask & 0xFF) * 0x0101010101010101ull) & 0xFF7F3F1F0F070301ull; - _rowSums = _mm512_add_epi32(_rowSums, _mm512_sad_epu8(_mm512_movm_epi8(lo), K_ZERO)); - _mm256_storeu_si256((__m256i*)(sum7 + col + 00), _mm256_add_epi32(_mm512_cvtepi64_epi32(_rowSums), _mm256_loadu_si256((__m256i*)(sum6 + col + 00)))); - _rowSums = _mm512_permutexvar_epi64(K64_15, _rowSums); - - __mmask64 hi = ((mmask >> 8) * 0x0101010101010101ull) & 0xFF7F3F1F0F070301ull; - _rowSums = _mm512_add_epi32(_rowSums, _mm512_sad_epu8(_mm512_movm_epi8(hi), K_ZERO)); - _mm256_storeu_si256((__m256i*)(sum7 + col + HF), _mm256_add_epi32(_mm512_cvtepi64_epi32(_rowSums), _mm256_loadu_si256((__m256i*)(sum6 + col + 
HF)))); - _rowSums = _mm512_permutexvar_epi64(K64_15, _rowSums); - } - uint32_t rowSum = sum7[col - 1] - sum6[col - 1]; - for (; col < srcWidth; ++col) - { - if (src[col] > *threshold) - rowSum += 0xFF; - sum7[col] = rowSum + sum6[col]; - } - for (; col < _dstWidth; ++col) - sum7[col] = sum7[col - 1]; - - SetDstRow<7>(sum0, sum7, dst); - - src += srcStride; - dst += dstStride; - } - - for (size_t row = srcHeight; row < _dstHeight; ++row) - { - uint32_t * sum0 = _sums[(row + 0) & 7].data + F; - uint32_t * sum7 = _sums[(srcHeight - 1 + 7) & 7].data + F; - SetDstRow<7>(sum0, sum7, dst); - dst += dstStride; - } - } - - void CreateMask7x7x2(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, const float * threshold, uint32_t * dst, size_t dstStride) - { - size_t alignedSrcWidth = AlignLo(srcWidth, F); - __m512 _threshold = _mm512_set1_ps(*threshold); - for (size_t srcRow = 0; srcRow < srcHeight; ++srcRow) - { - size_t dstRow = srcRow * 2; - uint32_t * sum0 = _sums[(srcRow + 0) & 7].data + F; - uint32_t * sum1 = _sums[(srcRow + 1) & 7].data + F; - uint32_t * sum3 = _sums[(srcRow + 3) & 7].data + F; - uint32_t * sum4 = _sums[(srcRow + 4) & 7].data + F; - - __m512i _rowSums = K_ZERO; - size_t srcCol = 0, dstCol = 0; - for (; srcCol < alignedSrcWidth; srcCol += F, dstCol += DF) - { - __mmask16 mmask = _mm512_cmp_ps_mask(Avx512f::Load(src + srcCol), _threshold, _CMP_GT_OQ); - - __mmask64 lo = ((mmask & 0xFF) * 0x0101010101010101ull) & 0xFF7F3F1F0F070301ull; - _rowSums = _mm512_add_epi32(_rowSums, _mm512_sad_epu8(_mm512_movm_epi8(lo), K_ZERO)); - Store(sum4 + dstCol + 0, _mm512_add_epi32(_mm512_permutexvar_epi32(K32_64_TO_32_2, _rowSums), Load(sum3 + dstCol + 0))); - _rowSums = _mm512_permutexvar_epi64(K64_15, _rowSums); - - __mmask64 hi = ((mmask >> 8) * 0x0101010101010101ull) & 0xFF7F3F1F0F070301ull; - _rowSums = _mm512_add_epi32(_rowSums, _mm512_sad_epu8(_mm512_movm_epi8(hi), K_ZERO)); - Store(sum4 + dstCol + F, _mm512_add_epi32(_mm512_permutexvar_epi32(K32_64_TO_32_2, _rowSums), Load(sum3 + dstCol + F))); - _rowSums = _mm512_permutexvar_epi64(K64_15, _rowSums); - } - uint32_t rowSum = sum4[dstCol - 1] - sum3[dstCol - 1]; - for (; srcCol < srcWidth; srcCol += 1, dstCol += 2) - { - if (src[srcCol] > *threshold) - rowSum += 0xFF; - sum4[dstCol + 0] = rowSum + sum3[dstCol + 0]; - sum4[dstCol + 1] = rowSum + sum3[dstCol + 1]; - } - for (; dstCol < _dstWidth; ++dstCol) - sum4[dstCol] = sum4[dstCol - 1]; - - SetDstRow<7>(sum0, sum4, dst); - dst += dstStride; - SetDstRow<7>(sum1, sum4, dst); - dst += dstStride; - src += srcStride; - } - - uint32_t * sum0 = _sums[(srcHeight + 0) & 7].data + F; - uint32_t * sum1 = _sums[(srcHeight + 1) & 7].data + F; - uint32_t * sum2 = _sums[(srcHeight + 2) & 7].data + F; - uint32_t * sum3 = _sums[(srcHeight + 3) & 7].data + F; - SetDstRow<7>(sum0, sum3, dst + 0 * dstStride); - SetDstRow<7>(sum1, sum3, dst + 1 * dstStride); - SetDstRow<7>(sum1, sum3, dst + 2 * dstStride); - SetDstRow<7>(sum2, sum3, dst + 3 * dstStride); - SetDstRow<7>(sum2, sum3, dst + 4 * dstStride); - } - - public: - - void Run(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, const float * threshold, size_t scale, size_t size, uint32_t * dst, size_t dstStride) - { - if (size == 7 && (scale == 1 || scale == 2)) - { - Init(srcWidth, srcHeight, scale, size); - if (scale == 1) - CreateMask7x7x1(src, srcStride, srcWidth, srcHeight, threshold, dst, dstStride); - else - CreateMask7x7x2(src, srcStride, srcWidth, srcHeight, threshold, dst, dstStride); - } - else - 
Base::HogLiteCreateMask(src, srcStride, srcWidth, srcHeight, threshold, scale, size, dst, dstStride); - } - }; - - void HogLiteCreateMask(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, const float * threshold, size_t scale, size_t size, uint32_t * dst, size_t dstStride) - { - HogLiteMaskCreater maskCreater; - maskCreater.Run(src, srcStride, srcWidth, srcHeight, threshold, scale, size, dst, dstStride); - } - } -#endif// SIMD_AVX512BW_ENABLE -} - - diff --git a/src/3rd/Simd/Simd/SimdAvx512bwInt16ToGray.cpp b/src/3rd/Simd/Simd/SimdAvx512bwInt16ToGray.cpp deleted file mode 100644 index d4f0d7b7..00000000 --- a/src/3rd/Simd/Simd/SimdAvx512bwInt16ToGray.cpp +++ /dev/null @@ -1,76 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdStore.h" -#include "Simd/SimdMemory.h" - -namespace Simd -{ -#ifdef SIMD_AVX512BW_ENABLE - namespace Avx512bw - { - template SIMD_INLINE void Int16ToGray(const int16_t * src, uint8_t * dst, __mmask64 tail = -1) - { - __m512i src0 = Load(src + 00, __mmask32(tail >> 00)); - __m512i src1 = Load(src + HA, __mmask32(tail >> 32)); - Store(dst, _mm512_permutexvar_epi64(K64_PERMUTE_FOR_PACK, _mm512_packus_epi16(src0, src1)), tail); - } - - template SIMD_INLINE void Int16ToGray2(const int16_t * src, uint8_t * dst) - { - Store(dst + 0 * A, _mm512_permutexvar_epi64(K64_PERMUTE_FOR_PACK, _mm512_packus_epi16(Load(src + 0 * HA), Load(src + 1 * HA)))); - Store(dst + 1 * A, _mm512_permutexvar_epi64(K64_PERMUTE_FOR_PACK, _mm512_packus_epi16(Load(src + 2 * HA), Load(src + 3 * HA)))); - } - - template void Int16ToGray(const int16_t * src, size_t width, size_t height, size_t srcStride, uint8_t * dst, size_t dstStride) - { - if (align) - assert(Aligned(src) && Aligned(srcStride, HA) && Aligned(dst) && Aligned(dstStride)); - - size_t alignedWidth = AlignLo(width, A); - size_t fullAlignedWidth = AlignLo(width, DA); - __mmask64 tailMask = TailMask64(width - alignedWidth); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < fullAlignedWidth; col += DA) - Int16ToGray2(src + col, dst + col); - for (; col < alignedWidth; col += A) - Int16ToGray(src + col, dst + col); - if (col < width) - Int16ToGray(src + col, dst + col, tailMask); - src += srcStride; - dst += dstStride; - } - } - - void Int16ToGray(const uint8_t * src, size_t width, size_t height, size_t srcStride, uint8_t * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride)) - Int16ToGray((const int16_t *)src, width, height, srcStride / sizeof(int16_t), dst, dstStride); - else - Int16ToGray((const int16_t *)src, width, height, srcStride / sizeof(int16_t), dst, dstStride); - } - } -#endif// SIMD_AVX512BW_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx512bwIntegral.cpp b/src/3rd/Simd/Simd/SimdAvx512bwIntegral.cpp deleted file mode 100644 index 206a3d60..00000000 --- a/src/3rd/Simd/Simd/SimdAvx512bwIntegral.cpp +++ /dev/null @@ -1,131 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdInit.h" -#include "Simd/SimdIntegral.h" - -namespace Simd -{ -#ifdef SIMD_AVX512BW_ENABLE - namespace Avx512bw - { - const __m512i K8_SUM_MASK = SIMD_MM512_SETR_EPI8( - 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, - 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, - 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, - 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, - 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); - - const __m512i K64_15 = SIMD_MM512_SET1_EPI64(15); - - void IntegralSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint32_t * sum, size_t sumStride) - { - memset(sum, 0, (width + 1) * sizeof(uint32_t)); - sum += sumStride + 1; - size_t alignedWidth = AlignLo(width, 8); - - for (size_t row = 0; row < height; row++) - { - sum[-1] = 0; - size_t col = 0; - __m512i _rowSums = K_ZERO; - for (; col < alignedWidth; col += 8) - { - __m512i _src = _mm512_and_si512(_mm512_set1_epi64(*(uint64_t*)(src + col)), K8_SUM_MASK); - _rowSums = _mm512_add_epi32(_rowSums, _mm512_sad_epu8(_src, K_ZERO)); - _mm256_storeu_si256((__m256i*)(sum + col), _mm256_add_epi32(_mm512_cvtepi64_epi32(_rowSums), _mm256_loadu_si256((__m256i*)(sum + col - sumStride)))); - _rowSums = _mm512_permutexvar_epi64(K64_15, _rowSums); - } - uint32_t rowSum = sum[col - 1] - sum[col - sumStride - 1]; - for (; col < width; col++) - { - rowSum += src[col]; - sum[col] = rowSum + sum[col - sumStride]; - } - src += srcStride; - sum += sumStride; - } - } - - void Integral(const uint8_t * src, size_t srcStride, size_t width, size_t height, - uint8_t * sum, size_t sumStride, uint8_t * sqsum, size_t sqsumStride, uint8_t * tilted, size_t tiltedStride, - SimdPixelFormatType sumFormat, SimdPixelFormatType sqsumFormat) - { - assert(sumFormat == SimdPixelFormatInt32 && sumStride % sizeof(uint32_t) == 0); - if (tilted) - assert(tiltedStride % sizeof(uint32_t) == 0); - - if (sqsum) - { - if (tilted) - { - switch (sqsumFormat) - { - case SimdPixelFormatInt32: - IntegralSumSqsumTilted(src, srcStride, width, height, - (uint32_t*)sum, sumStride / sizeof(uint32_t), (uint32_t*)sqsum, sqsumStride / sizeof(uint32_t), (uint32_t*)tilted, tiltedStride / sizeof(uint32_t)); - break; - case SimdPixelFormatDouble: - IntegralSumSqsumTilted(src, srcStride, width, height, - (uint32_t*)sum, sumStride / sizeof(uint32_t), (double*)sqsum, sqsumStride / sizeof(double), (uint32_t*)tilted, tiltedStride / sizeof(uint32_t)); - break; - default: - assert(0); - } - } - else - { - switch (sqsumFormat) - { - case SimdPixelFormatInt32: - IntegralSumSqsum(src, srcStride, width, height, - (uint32_t*)sum, sumStride / sizeof(uint32_t), (uint32_t*)sqsum, sqsumStride / sizeof(uint32_t)); - break; - case SimdPixelFormatDouble: - IntegralSumSqsum(src, srcStride, width, height, - (uint32_t*)sum, sumStride / sizeof(uint32_t), (double*)sqsum, sqsumStride / sizeof(double)); - break; - default: - assert(0); - } - } - } - else - { - if (tilted) - { - IntegralSumTilted(src, srcStride, width, height, - (uint32_t*)sum, sumStride / sizeof(uint32_t), (uint32_t*)tilted, tiltedStride / sizeof(uint32_t)); - } - else - { - Avx512bw::IntegralSum(src, srcStride, width, height, (uint32_t*)sum, sumStride / sizeof(uint32_t)); - } - } - } - } -#endif//SIMD_AVX512BW_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx512bwInterference.cpp b/src/3rd/Simd/Simd/SimdAvx512bwInterference.cpp deleted file mode 100644 
index a51a0e7a..00000000 --- a/src/3rd/Simd/Simd/SimdAvx512bwInterference.cpp +++ /dev/null @@ -1,191 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdSet.h" - -namespace Simd -{ -#ifdef SIMD_AVX512BW_ENABLE - namespace Avx512bw - { - template __m512i InterferenceChange(__m512i statistic, __m512i value, __m512i saturation); - - template<> SIMD_INLINE __m512i InterferenceChange(__m512i statistic, __m512i value, __m512i saturation) - { - return _mm512_min_epi16(_mm512_add_epi16(statistic, value), saturation); - } - - template<> SIMD_INLINE __m512i InterferenceChange(__m512i statistic, __m512i value, __m512i saturation) - { - return _mm512_max_epi16(_mm512_sub_epi16(statistic, value), saturation); - } - - template SIMD_INLINE void InterferenceChange(int16_t * statistic, __m512i value, __m512i saturation, __mmask32 tail = -1) - { - Store(statistic, InterferenceChange(Load(statistic, tail), value, saturation), tail); - } - - template SIMD_INLINE void InterferenceChange4(int16_t * statistic, __m512i value, __m512i saturation) - { - Store(statistic + 0 * HA, InterferenceChange(Load(statistic + 0 * HA), value, saturation)); - Store(statistic + 1 * HA, InterferenceChange(Load(statistic + 1 * HA), value, saturation)); - Store(statistic + 2 * HA, InterferenceChange(Load(statistic + 2 * HA), value, saturation)); - Store(statistic + 3 * HA, InterferenceChange(Load(statistic + 3 * HA), value, saturation)); - } - - template void InterferenceChange(int16_t * statistic, size_t stride, size_t width, size_t height, uint8_t value, int16_t saturation) - { - if (align) - assert(Aligned(statistic) && Aligned(stride, HA)); - - size_t alignedWidth = AlignLo(width, HA); - size_t fullAlignedWidth = AlignLo(width, DA); - __mmask32 tailMask = TailMask32(width - alignedWidth); - - __m512i _value = _mm512_set1_epi16(value); - __m512i _saturation = _mm512_set1_epi16(saturation); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < fullAlignedWidth; col += DA) - InterferenceChange4(statistic + col, _value, _saturation); - for (; col < alignedWidth; col += HA) - InterferenceChange(statistic + col, _value, _saturation); - if (col < width) - InterferenceChange(statistic + col, _value, _saturation, tailMask); - statistic += stride; - } - } - - void InterferenceIncrement(uint8_t * statistic, size_t 
stride, size_t width, size_t height, uint8_t increment, int16_t saturation) - { - assert(Aligned(stride, 2)); - - if (Aligned(statistic) && Aligned(stride)) - InterferenceChange((int16_t*)statistic, stride / 2, width, height, increment, saturation); - else - InterferenceChange((int16_t*)statistic, stride / 2, width, height, increment, saturation); - } - - void InterferenceDecrement(uint8_t * statistic, size_t stride, size_t width, size_t height, uint8_t decrement, int16_t saturation) - { - assert(Aligned(stride, 2)); - - if (Aligned(statistic) && Aligned(stride)) - InterferenceChange((int16_t*)statistic, stride / 2, width, height, decrement, saturation); - else - InterferenceChange((int16_t*)statistic, stride / 2, width, height, decrement, saturation); - } - - template __m512i InterferenceChangeMasked(__m512i statistic, __m512i value, __m512i saturation, __mmask32 mask); - - template<> SIMD_INLINE __m512i InterferenceChangeMasked(__m512i statistic, __m512i value, __m512i saturation, __mmask32 mask) - { - return _mm512_min_epi16(_mm512_mask_add_epi16(statistic, mask, statistic, value), saturation); - } - - template<> SIMD_INLINE __m512i InterferenceChangeMasked(__m512i statistic, __m512i value, __m512i saturation, __mmask32 mask) - { - return _mm512_max_epi16(_mm512_mask_sub_epi16(statistic, mask, statistic, value), saturation); - } - - template SIMD_INLINE void InterferenceChangeMasked(int16_t * statistic, __m512i value, __m512i saturation, __mmask32 mask, __mmask32 tail = -1) - { - Store(statistic, InterferenceChangeMasked(Load(statistic, tail), value, saturation, mask), tail); - } - - template SIMD_INLINE void InterferenceChangeMasked(const uint8_t * mask, __m512i index, int16_t * statistic, __m512i value, __m512i saturation, __mmask64 tail = -1) - { - __mmask64 mask0 = _mm512_cmpeq_epi8_mask((Load(mask, tail)), index) & tail; - InterferenceChangeMasked(statistic + 00, value, saturation, __mmask32(mask0 >> 00), __mmask32(tail >> 00)); - InterferenceChangeMasked(statistic + HA, value, saturation, __mmask32(mask0 >> 32), __mmask32(tail >> 32)); - } - - template SIMD_INLINE void InterferenceChangeMasked(int16_t * statistic, __m512i value, __m512i saturation, __mmask32 mask) - { - Store(statistic, InterferenceChangeMasked(Load(statistic), value, saturation, mask)); - } - - template SIMD_INLINE void InterferenceChangeMasked2(const uint8_t * mask, __m512i index, int16_t * statistic, __m512i value, __m512i saturation) - { - __mmask64 mask0 = _mm512_cmpeq_epi8_mask(Load(mask + 0), index); - InterferenceChangeMasked(statistic + 0 * HA, value, saturation, __mmask32(mask0 >> 00)); - InterferenceChangeMasked(statistic + 1 * HA, value, saturation, __mmask32(mask0 >> 32)); - __mmask64 mask1 = _mm512_cmpeq_epi8_mask(Load(mask + A), index); - InterferenceChangeMasked(statistic + 2 * HA, value, saturation, __mmask32(mask1 >> 00)); - InterferenceChangeMasked(statistic + 3 * HA, value, saturation, __mmask32(mask1 >> 32)); - } - - template void InterferenceChangeMasked(int16_t * statistic, size_t statisticStride, size_t width, size_t height, - uint8_t value, int16_t saturation, const uint8_t * mask, size_t maskStride, uint8_t index) - { - if (align) - assert(Aligned(statistic) && Aligned(statisticStride, HA) && Aligned(mask) && Aligned(maskStride)); - - size_t alignedWidth = Simd::AlignLo(width, A); - size_t fullAlignedWidth = AlignLo(width, DA); - __mmask64 tailMask = TailMask64(width - alignedWidth); - - __m512i _value = _mm512_set1_epi16(value); - __m512i _saturation = _mm512_set1_epi16(saturation); - 
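// Annotation on InterferenceIncrement / InterferenceDecrement above: each int16_t
// statistic is nudged by a fixed value and clamped at a saturation bound (an upper
// bound when incrementing, a lower bound when decrementing); the masked variants
// being defined here additionally restrict the update to pixels whose mask byte
// equals a given index. A minimal scalar sketch of the per-element update, with
// illustrative names (not the library's API):
#include <algorithm>
#include <cstdint>

static inline int16_t InterferenceIncScalar(int16_t statistic, int16_t value, int16_t saturation)
{
    int updated = int(statistic) + value;               // widen to avoid wraparound
    return (int16_t)std::min(updated, int(saturation)); // clamp from above
}

static inline int16_t InterferenceDecScalar(int16_t statistic, int16_t value, int16_t saturation)
{
    int updated = int(statistic) - value;
    return (int16_t)std::max(updated, int(saturation)); // clamp from below
}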
__m512i _index = _mm512_set1_epi8(index); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < fullAlignedWidth; col += DA) - InterferenceChangeMasked2(mask + col, _index, statistic + col, _value, _saturation); - for (; col < alignedWidth; col += A) - InterferenceChangeMasked(mask + col, _index, statistic + col, _value, _saturation); - if (col < width) - InterferenceChangeMasked(mask + col, _index, statistic + col, _value, _saturation, tailMask); - statistic += statisticStride; - mask += maskStride; - } - } - - void InterferenceIncrementMasked(uint8_t * statistic, size_t statisticStride, size_t width, size_t height, - uint8_t increment, int16_t saturation, const uint8_t * mask, size_t maskStride, uint8_t index) - { - assert(Aligned(statisticStride, 2)); - - if (Aligned(statistic) && Aligned(statisticStride) && Aligned(mask) && Aligned(maskStride)) - InterferenceChangeMasked((int16_t*)statistic, statisticStride / 2, width, height, increment, saturation, mask, maskStride, index); - else - InterferenceChangeMasked((int16_t*)statistic, statisticStride / 2, width, height, increment, saturation, mask, maskStride, index); - } - - void InterferenceDecrementMasked(uint8_t * statistic, size_t statisticStride, size_t width, size_t height, - uint8_t decrement, int16_t saturation, const uint8_t * mask, size_t maskStride, uint8_t index) - { - assert(Aligned(statisticStride, 2)); - - if (Aligned(statistic) && Aligned(statisticStride) && Aligned(mask) && Aligned(maskStride)) - InterferenceChangeMasked((int16_t*)statistic, statisticStride / 2, width, height, decrement, saturation, mask, maskStride, index); - else - InterferenceChangeMasked((int16_t*)statistic, statisticStride / 2, width, height, decrement, saturation, mask, maskStride, index); - } - } -#endif// SIMD_AVX512BW_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx512bwInterleave.cpp b/src/3rd/Simd/Simd/SimdAvx512bwInterleave.cpp deleted file mode 100644 index 2dbdcfd8..00000000 --- a/src/3rd/Simd/Simd/SimdAvx512bwInterleave.cpp +++ /dev/null @@ -1,188 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdConversion.h" - -namespace Simd -{ -#ifdef SIMD_AVX512BW_ENABLE - namespace Avx512bw - { - template SIMD_INLINE void InterleaveUv(const uint8_t * u, const uint8_t * v, uint8_t * uv, const __mmask64 * tails) - { - __m512i _u = _mm512_permutexvar_epi64(K64_PERMUTE_FOR_UNPACK, (Load(u, tails[2]))); - __m512i _v = _mm512_permutexvar_epi64(K64_PERMUTE_FOR_UNPACK, (Load(v, tails[2]))); - Store(uv + 0, UnpackU8<0>(_u, _v), tails[0]); - Store(uv + A, UnpackU8<1>(_u, _v), tails[1]); - } - - template SIMD_INLINE void InterleaveUv2(const uint8_t * u, const uint8_t * v, uint8_t * uv) - { - __m512i u0 = _mm512_permutexvar_epi64(K64_PERMUTE_FOR_UNPACK, Load(u + 0)); - __m512i v0 = _mm512_permutexvar_epi64(K64_PERMUTE_FOR_UNPACK, Load(v + 0)); - Store(uv + 0 * A, UnpackU8<0>(u0, v0)); - Store(uv + 1 * A, UnpackU8<1>(u0, v0)); - __m512i u1 = _mm512_permutexvar_epi64(K64_PERMUTE_FOR_UNPACK, Load(u + A)); - __m512i v1 = _mm512_permutexvar_epi64(K64_PERMUTE_FOR_UNPACK, Load(v + A)); - Store(uv + 2 * A, UnpackU8<0>(u1, v1)); - Store(uv + 3 * A, UnpackU8<1>(u1, v1)); - } - - template void InterleaveUv(const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * uv, size_t uvStride) - { - if (align) - assert(Aligned(uv) && Aligned(uvStride) && Aligned(u) && Aligned(uStride) && Aligned(v) && Aligned(vStride)); - - size_t alignedWidth = AlignLo(width, A); - size_t fullAlignedWidth = AlignLo(width, DA); - __mmask64 tailMasks[3]; - for (size_t c = 0; c < 2; ++c) - tailMasks[c] = TailMask64((width - alignedWidth) * 2 - A*c); - tailMasks[2] = TailMask64(width - alignedWidth); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < fullAlignedWidth; col += DA) - InterleaveUv2(u + col, v + col, uv + col * 2); - for (; col < alignedWidth; col += A) - InterleaveUv(u + col, v + col, uv + col * 2, tailMasks); - if (col < width) - InterleaveUv(u + col, v + col, uv + col * 2, tailMasks); - uv += uvStride; - u += uStride; - v += vStride; - } - } - - void InterleaveUv(const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, size_t width, size_t height, uint8_t * uv, size_t uvStride) - { - if (Aligned(uv) && Aligned(uvStride) && Aligned(u) && Aligned(uStride) && Aligned(v) && Aligned(vStride)) - InterleaveUv(u, uStride, v, vStride, width, height, uv, uvStride); - else - InterleaveUv(u, uStride, v, vStride, width, height, uv, uvStride); - } - - template SIMD_INLINE void InterleaveBgr(const uint8_t * b, const uint8_t * g, const uint8_t * r, uint8_t * bgr, const __mmask64 * tails) - { - __m512i _b = Load(b, tails[3]); - __m512i _g = Load(g, tails[3]); - __m512i _r = Load(r, tails[3]); - Store(bgr + 0 * A, InterleaveBgr<0>(_b, _g, _r), tails[0]); - Store(bgr + 1 * A, InterleaveBgr<1>(_b, _g, _r), tails[1]); - Store(bgr + 2 * A, InterleaveBgr<2>(_b, _g, _r), tails[2]); - } - - template void InterleaveBgr(const uint8_t * b, size_t bStride, const uint8_t * g, size_t gStride, const uint8_t * r, size_t rStride, size_t width, size_t height, uint8_t * bgr, size_t bgrStride) - { - if (align) - { - assert(Aligned(b) && Aligned(bStride) && Aligned(g) && Aligned(gStride)); - assert(Aligned(r) && Aligned(rStride) && Aligned(bgr) && Aligned(bgrStride)); - } - - size_t alignedWidth = AlignLo(width, A); - __mmask64 tailMasks[4]; - for (size_t c = 0; c < 3; ++c) - tailMasks[c] = TailMask64((width - alignedWidth) * 3 - A*c); - tailMasks[3] = TailMask64(width - alignedWidth); - for 
(size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < alignedWidth; col += A) - InterleaveBgr(b + col, g + col, r + col, bgr + col * 3, tailMasks); - if (col < width) - InterleaveBgr(b + col, g + col, r + col, bgr + col * 3, tailMasks); - b += bStride; - g += gStride; - r += rStride; - bgr += bgrStride; - } - } - - void InterleaveBgr(const uint8_t * b, size_t bStride, const uint8_t * g, size_t gStride, const uint8_t * r, size_t rStride, size_t width, size_t height, uint8_t * bgr, size_t bgrStride) - { - if (Aligned(b) && Aligned(bStride) && Aligned(g) && Aligned(gStride) - && Aligned(r) && Aligned(rStride) && Aligned(bgr) && Aligned(bgrStride)) - InterleaveBgr(b, bStride, g, gStride, r, rStride, width, height, bgr, bgrStride); - else - InterleaveBgr(b, bStride, g, gStride, r, rStride, width, height, bgr, bgrStride); - } - - template SIMD_INLINE void InterleaveBgra(const uint8_t * b, const uint8_t * g, const uint8_t * r, const uint8_t * a, uint8_t * bgra, const __mmask64 * tails) - { - __m512i _b = _mm512_permutexvar_epi32(K32_PERMUTE_FOR_TWO_UNPACK, (Load(b, tails[4]))); - __m512i _g = _mm512_permutexvar_epi32(K32_PERMUTE_FOR_TWO_UNPACK, (Load(g, tails[4]))); - __m512i _r = _mm512_permutexvar_epi32(K32_PERMUTE_FOR_TWO_UNPACK, (Load(r, tails[4]))); - __m512i _a = _mm512_permutexvar_epi32(K32_PERMUTE_FOR_TWO_UNPACK, (Load(a, tails[4]))); - __m512i bg0 = UnpackU8<0>(_b, _g); - __m512i bg1 = UnpackU8<1>(_b, _g); - __m512i ra0 = UnpackU8<0>(_r, _a); - __m512i ra1 = UnpackU8<1>(_r, _a); - Store(bgra + 0 * A, UnpackU16<0>(bg0, ra0), tails[0]); - Store(bgra + 1 * A, UnpackU16<1>(bg0, ra0), tails[1]); - Store(bgra + 2 * A, UnpackU16<0>(bg1, ra1), tails[2]); - Store(bgra + 3 * A, UnpackU16<1>(bg1, ra1), tails[3]); - } - - template void InterleaveBgra(const uint8_t * b, size_t bStride, const uint8_t * g, size_t gStride, const uint8_t * r, size_t rStride, const uint8_t * a, size_t aStride, size_t width, size_t height, uint8_t * bgra, size_t bgraStride) - { - if (align) - { - assert(Aligned(b) && Aligned(bStride) && Aligned(g) && Aligned(gStride)); - assert(Aligned(r) && Aligned(rStride) && Aligned(a) && Aligned(aStride) && Aligned(bgra) && Aligned(bgraStride)); - } - - size_t alignedWidth = AlignLo(width, A); - __mmask64 tailMasks[5]; - for (size_t c = 0; c < 4; ++c) - tailMasks[c] = TailMask64((width - alignedWidth) * 4 - A*c); - tailMasks[4] = TailMask64(width - alignedWidth); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < alignedWidth; col += A) - InterleaveBgra(b + col, g + col, r + col, a + col, bgra + col * 4, tailMasks); - if (col < width) - InterleaveBgra(b + col, g + col, r + col, a + col, bgra + col * 4, tailMasks); - b += bStride; - g += gStride; - r += rStride; - a += aStride; - bgra += bgraStride; - } - } - - void InterleaveBgra(const uint8_t * b, size_t bStride, const uint8_t * g, size_t gStride, const uint8_t * r, size_t rStride, const uint8_t * a, size_t aStride, size_t width, size_t height, uint8_t * bgra, size_t bgraStride) - { - if (Aligned(b) && Aligned(bStride) && Aligned(g) && Aligned(gStride) - && Aligned(r) && Aligned(rStride) && Aligned(bgra) && Aligned(bgraStride)) - InterleaveBgra(b, bStride, g, gStride, r, rStride, a, aStride, width, height, bgra, bgraStride); - else - InterleaveBgra(b, bStride, g, gStride, r, rStride, a, aStride, width, height, bgra, bgraStride); - } - } -#endif// SIMD_AVX512BW_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx512bwLaplace.cpp b/src/3rd/Simd/Simd/SimdAvx512bwLaplace.cpp deleted file mode 
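// Annotation on the InterleaveUv / InterleaveBgr / InterleaveBgra routines above:
// they convert separate planar channels into one interleaved image; the AVX-512
// code does this with byte unpacks plus a lane-fixing permute, and handles the
// ragged right edge with precomputed __mmask64 tail masks (one per output vector,
// plus one for the planar loads). Scalar sketch of the BGR case, illustrative only:
#include <cstdint>
#include <cstddef>

static void InterleaveBgrScalar(const uint8_t* b, size_t bStride,
                                const uint8_t* g, size_t gStride,
                                const uint8_t* r, size_t rStride,
                                size_t width, size_t height,
                                uint8_t* bgr, size_t bgrStride)
{
    for (size_t row = 0; row < height; ++row)
    {
        for (size_t col = 0; col < width; ++col)
        {
            bgr[col * 3 + 0] = b[col]; // blue plane -> byte 0 of each triple
            bgr[col * 3 + 1] = g[col]; // green plane -> byte 1
            bgr[col * 3 + 2] = r[col]; // red plane -> byte 2
        }
        b += bStride; g += gStride; r += rStride; bgr += bgrStride;
    }
}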
100644 index 7f06286e..00000000 --- a/src/3rd/Simd/Simd/SimdAvx512bwLaplace.cpp +++ /dev/null @@ -1,196 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdExtract.h" -#include "Simd/SimdSet.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdCompare.h" - -namespace Simd -{ -#ifdef SIMD_AVX512BW_ENABLE - namespace Avx512bw - { - const __m512i K64_PERMUTE_0 = SIMD_MM512_SETR_EPI64(0x0, 0x1, 0x8, 0x9, 0x2, 0x3, 0xA, 0xB); - const __m512i K64_PERMUTE_1 = SIMD_MM512_SETR_EPI64(0x4, 0x5, 0xC, 0xD, 0x6, 0x7, 0xE, 0xF); - - template SIMD_INLINE __m512i Laplace(__m512i a[3][3]) - { - return _mm512_sub_epi16(_mm512_mullo_epi16(K16_0008, UnpackU8(a[1][1])), - _mm512_add_epi16(_mm512_add_epi16(_mm512_maddubs_epi16(UnpackU8(a[0][0], a[0][1]), K8_01), - _mm512_maddubs_epi16(UnpackU8(a[0][2], a[1][0]), K8_01)), - _mm512_add_epi16(_mm512_maddubs_epi16(UnpackU8(a[1][2], a[2][0]), K8_01), - _mm512_maddubs_epi16(UnpackU8(a[2][1], a[2][2]), K8_01)))); - } - - template SIMD_INLINE void Laplace(__m512i a[3][3], int16_t * dst) - { - __m512i lo = ConditionalAbs(Laplace<0>(a)); - __m512i hi = ConditionalAbs(Laplace<1>(a)); - Store(dst + 00, _mm512_permutex2var_epi64(lo, K64_PERMUTE_0, hi)); - Store(dst + HA, _mm512_permutex2var_epi64(lo, K64_PERMUTE_1, hi)); - } - - template void Laplace(const uint8_t * src, size_t srcStride, size_t width, size_t height, int16_t * dst, size_t dstStride) - { - assert(width > A); - if (align) - assert(Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride, HA)); - - size_t bodyWidth = Simd::AlignHi(width, A) - A; - const uint8_t *src0, *src1, *src2; - __m512i a[3][3]; - - for (size_t row = 0; row < height; ++row) - { - src0 = src + srcStride*(row - 1); - src1 = src0 + srcStride; - src2 = src1 + srcStride; - if (row == 0) - src0 = src1; - if (row == height - 1) - src2 = src1; - - LoadNose3(src0 + 0, a[0]); - LoadNose3(src1 + 0, a[1]); - LoadNose3(src2 + 0, a[2]); - Laplace(a, dst + 0); - for (size_t col = A; col < bodyWidth; col += A) - { - LoadBody3(src0 + col, a[0]); - LoadBody3(src1 + col, a[1]); - LoadBody3(src2 + col, a[2]); - Laplace(a, dst + col); - } - LoadTail3(src0 + width - A, a[0]); - LoadTail3(src1 + width - A, a[1]); - LoadTail3(src2 + width - A, a[2]); - Laplace(a, dst + width - A); - - dst += dstStride; - } - } - - void Laplace(const uint8_t * src, size_t srcStride, size_t width, size_t 
height, uint8_t * dst, size_t dstStride) - { - assert(dstStride % sizeof(int16_t) == 0); - - if (Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride)) - Laplace(src, srcStride, width, height, (int16_t *)dst, dstStride / sizeof(int16_t)); - else - Laplace(src, srcStride, width, height, (int16_t *)dst, dstStride / sizeof(int16_t)); - } - - void LaplaceAbs(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride) - { - assert(dstStride % sizeof(int16_t) == 0); - - if (Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride)) - Laplace(src, srcStride, width, height, (int16_t *)dst, dstStride / sizeof(int16_t)); - else - Laplace(src, srcStride, width, height, (int16_t *)dst, dstStride / sizeof(int16_t)); - } - - SIMD_INLINE void LaplaceAbsSum(__m512i a[3][3], __m512i * sums) - { - sums[0] = _mm512_add_epi32(sums[0], _mm512_madd_epi16(ConditionalAbs(Laplace<0>(a)), K16_0001)); - sums[1] = _mm512_add_epi32(sums[1], _mm512_madd_epi16(ConditionalAbs(Laplace<1>(a)), K16_0001)); - } - - SIMD_INLINE void SetMask3(__m512i a[3], __m512i mask) - { - a[0] = _mm512_and_si512(a[0], mask); - a[1] = _mm512_and_si512(a[1], mask); - a[2] = _mm512_and_si512(a[2], mask); - } - - SIMD_INLINE void SetMask3x3(__m512i a[3][3], __m512i mask) - { - SetMask3(a[0], mask); - SetMask3(a[1], mask); - SetMask3(a[2], mask); - } - - template void LaplaceAbsSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum) - { - assert(width > A && width < 256 * 256 * F); - if (align) - assert(Aligned(src) && Aligned(stride)); - - size_t bodyWidth = Simd::AlignHi(width, A) - A; - const uint8_t *src0, *src1, *src2; - - __m512i a[3][3]; - __m512i tailMask = _mm512_mask_set1_epi8(K_INV_ZERO, TailMask64(A - width + bodyWidth), 0); - - size_t blockSize = (256 * 256 * F) / width; - size_t blockCount = height / blockSize + 1; - __m512i _sum = _mm512_setzero_si512(); - for (size_t block = 0; block < blockCount; ++block) - { - __m512i sums[2] = { _mm512_setzero_si512(), _mm512_setzero_si512() }; - for (size_t row = block*blockSize, endRow = Simd::Min(row + blockSize, height); row < endRow; ++row) - { - src0 = src + stride*(row - 1); - src1 = src0 + stride; - src2 = src1 + stride; - if (row == 0) - src0 = src1; - if (row == height - 1) - src2 = src1; - - LoadNose3(src0 + 0, a[0]); - LoadNose3(src1 + 0, a[1]); - LoadNose3(src2 + 0, a[2]); - LaplaceAbsSum(a, sums); - for (size_t col = A; col < bodyWidth; col += A) - { - LoadBody3(src0 + col, a[0]); - LoadBody3(src1 + col, a[1]); - LoadBody3(src2 + col, a[2]); - LaplaceAbsSum(a, sums); - } - LoadTail3(src0 + width - A, a[0]); - LoadTail3(src1 + width - A, a[1]); - LoadTail3(src2 + width - A, a[2]); - SetMask3x3(a, tailMask); - LaplaceAbsSum(a, sums); - } - sums[0] = _mm512_add_epi32(sums[0], sums[1]); - _sum = _mm512_add_epi64(_sum, _mm512_add_epi64(_mm512_unpacklo_epi32(sums[0], K_ZERO), _mm512_unpackhi_epi32(sums[0], K_ZERO))); - } - - *sum = ExtractSum(_sum); - } - - void LaplaceAbsSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum) - { - if (Aligned(src) && Aligned(stride)) - LaplaceAbsSum(src, stride, width, height, sum); - else - LaplaceAbsSum(src, stride, width, height, sum); - } - } -#endif// SIMD_AVX512BW_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx512bwLbp.cpp b/src/3rd/Simd/Simd/SimdAvx512bwLbp.cpp deleted file mode 100644 index b6b6e753..00000000 --- a/src/3rd/Simd/Simd/SimdAvx512bwLbp.cpp +++ /dev/null @@ -1,85 +0,0 @@ -/* -* Simd Library 
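// Annotation on the Laplace / LaplaceAbs kernels above: for each pixel the filter
// computes 8 * center minus the sum of the eight neighbours (the vector code folds
// the neighbour sums with _mm512_maddubs_epi16), optionally taking the absolute
// value, and replicates edge pixels at the borders as the Nose/Tail loads do.
// Scalar sketch, illustrative only:
#include <cstdint>
#include <cstddef>

static void LaplaceScalar(const uint8_t* src, size_t stride,
                          size_t width, size_t height,
                          int16_t* dst, size_t dstStride, bool absolute)
{
    for (size_t row = 0; row < height; ++row)
    {
        const uint8_t* s0 = src + stride * (row == 0 ? 0 : row - 1);     // clamped above
        const uint8_t* s1 = src + stride * row;
        const uint8_t* s2 = src + stride * (row == height - 1 ? row : row + 1); // clamped below
        for (size_t col = 0; col < width; ++col)
        {
            size_t l = (col == 0) ? 0 : col - 1;               // clamped left
            size_t r = (col == width - 1) ? col : col + 1;     // clamped right
            int v = 8 * s1[col]
                  - (s0[l] + s0[col] + s0[r] + s1[l] + s1[r] + s2[l] + s2[col] + s2[r]);
            if (absolute && v < 0)
                v = -v;
            dst[col] = (int16_t)v;
        }
        dst += dstStride;
    }
}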
(http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdCompare.h" - -namespace Simd -{ -#ifdef SIMD_AVX512BW_ENABLE - namespace Avx512bw - { - template void LbpEstimate(const uint8_t * src, ptrdiff_t stride, uint8_t * dst, __mmask64 tail = -1) - { - __m512i threshold = Load(src, tail); - __m512i lbp = _mm512_setzero_si512(); - lbp = _mm512_or_si512(lbp, _mm512_maskz_set1_epi8(_mm512_cmpge_epu8_mask((Load(src - 1 - stride, tail)), threshold), (char)0x01)); - lbp = _mm512_or_si512(lbp, _mm512_maskz_set1_epi8(_mm512_cmpge_epu8_mask((Load(src - stride, tail)), threshold), (char)0x02)); - lbp = _mm512_or_si512(lbp, _mm512_maskz_set1_epi8(_mm512_cmpge_epu8_mask((Load(src + 1 - stride, tail)), threshold), (char)0x04)); - lbp = _mm512_or_si512(lbp, _mm512_maskz_set1_epi8(_mm512_cmpge_epu8_mask((Load(src + 1, tail)), threshold), (char)0x08)); - lbp = _mm512_or_si512(lbp, _mm512_maskz_set1_epi8(_mm512_cmpge_epu8_mask((Load(src + 1 + stride, tail)), threshold), (char)0x10)); - lbp = _mm512_or_si512(lbp, _mm512_maskz_set1_epi8(_mm512_cmpge_epu8_mask((Load(src + stride, tail)), threshold), (char)0x20)); - lbp = _mm512_or_si512(lbp, _mm512_maskz_set1_epi8(_mm512_cmpge_epu8_mask((Load(src - 1 + stride, tail)), threshold), (char)0x40)); - lbp = _mm512_or_si512(lbp, _mm512_maskz_set1_epi8(_mm512_cmpge_epu8_mask((Load(src - 1, tail)), threshold), (char)0x80)); - Store(dst, lbp, tail); - } - - template void LbpEstimate( - const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride) - { - assert(width >= 2); - if (align) - assert(Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride)); - - size_t alignedWidth = AlignLo(width - 2, A) + 1; - __mmask64 tailMask = Aligned(width - alignedWidth); - - memset(dst, 0, width); - src += srcStride; - dst += dstStride; - for (size_t row = 2; row < height; ++row) - { - dst[0] = 0; - size_t col = 1; - for (; col < alignedWidth; col += A) - LbpEstimate(src + col, srcStride, dst + col); - if (col < width) - LbpEstimate(src + col, srcStride, dst + col, tailMask); - dst[width - 1] = 0; - src += srcStride; - dst += dstStride; - } - memset(dst, 0, width); - } - - void LbpEstimate(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride)) 
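// Annotation on LbpEstimate above: the local binary pattern of a pixel is an 8-bit
// code with one bit per neighbour, set when that neighbour is >= the centre; the
// AVX-512 version builds each bit from a _mm512_cmpge_epu8_mask compare feeding
// _mm512_maskz_set1_epi8, and zeroes the one-pixel border as the deleted code does.
// Scalar sketch with the same bit order (assumes width and height >= 3); names are
// illustrative, not the library's API:
#include <cstdint>
#include <cstddef>
#include <cstring>

static void LbpEstimateScalar(const uint8_t* src, ptrdiff_t stride,
                              size_t width, size_t height,
                              uint8_t* dst, ptrdiff_t dstStride)
{
    memset(dst, 0, width);                                        // top border row
    memset(dst + dstStride * (ptrdiff_t)(height - 1), 0, width);  // bottom border row
    for (size_t row = 1; row + 1 < height; ++row)
    {
        const uint8_t* s = src + stride * (ptrdiff_t)row;
        uint8_t* d = dst + dstStride * (ptrdiff_t)row;
        d[0] = d[width - 1] = 0;                                  // side borders
        for (size_t col = 1; col + 1 < width; ++col)
        {
            const uint8_t* p = s + col;
            uint8_t c = *p, lbp = 0;
            lbp |= (p[-1 - stride] >= c) ? 0x01 : 0;  // same neighbour order
            lbp |= (p[-stride]     >= c) ? 0x02 : 0;  // as the deleted kernel
            lbp |= (p[1 - stride]  >= c) ? 0x04 : 0;
            lbp |= (p[1]           >= c) ? 0x08 : 0;
            lbp |= (p[1 + stride]  >= c) ? 0x10 : 0;
            lbp |= (p[stride]      >= c) ? 0x20 : 0;
            lbp |= (p[-1 + stride] >= c) ? 0x40 : 0;
            lbp |= (p[-1]          >= c) ? 0x80 : 0;
            d[col] = lbp;
        }
    }
}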
- LbpEstimate(src, srcStride, width, height, dst, dstStride); - else - LbpEstimate(src, srcStride, width, height, dst, dstStride); - } - } -#endif// SIMD_AVX512BW_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx512bwMeanFilter3x3.cpp b/src/3rd/Simd/Simd/SimdAvx512bwMeanFilter3x3.cpp deleted file mode 100644 index a0ea935c..00000000 --- a/src/3rd/Simd/Simd/SimdAvx512bwMeanFilter3x3.cpp +++ /dev/null @@ -1,150 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" - -namespace Simd -{ -#ifdef SIMD_AVX512BW_ENABLE - namespace Avx512bw - { - namespace - { - struct Buffer - { - Buffer(size_t width) - { - _p = Allocate(sizeof(uint16_t) * 3 * width); - src0 = (uint16_t*)_p; - src1 = src0 + width; - src2 = src1 + width; - } - - ~Buffer() - { - Free(_p); - } - - uint16_t * src0; - uint16_t * src1; - uint16_t * src2; - private: - void * _p; - }; - } - - template SIMD_INLINE __m512i SumCol(__m512i a[3]) - { - return _mm512_add_epi16(_mm512_maddubs_epi16(UnpackU8(a[0], a[1]), K8_01), UnpackU8(a[2])); - } - - template SIMD_INLINE void SumCol(__m512i a[3], uint16_t * b) - { - Store(b + 00, SumCol<0>(a)); - Store(b + HA, SumCol<1>(a)); - } - - template void SumCol(const uint8_t * src, size_t aligned, size_t full, uint16_t * dst) - { - __m512i a[3]; - LoadNose3(src, a); - SumCol(a, dst); - for (size_t col = A; col < aligned; col += A) - { - LoadBody3(src + col, a); - SumCol(a, dst + col); - } - LoadTail3(src + full - A, a); - SumCol(a, dst + aligned); - } - - template SIMD_INLINE __m512i AverageRow16(const Buffer & buffer, size_t offset) - { - return _mm512_mulhi_epu16(K16_DIVISION_BY_9_FACTOR, _mm512_add_epi16( - _mm512_add_epi16(K16_0005, Load(buffer.src0 + offset)), - _mm512_add_epi16(Load(buffer.src1 + offset), Load(buffer.src2 + offset)))); - } - - template SIMD_INLINE __m512i AverageRow(const Buffer & buffer, size_t offset) - { - return _mm512_packus_epi16(AverageRow16(buffer, offset), AverageRow16(buffer, offset + HA)); - } - - template void MeanFilter3x3( - const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride) - { - assert(step*(width - 1) >= A); - if (align) - assert(Aligned(src) && Aligned(srcStride) && Aligned(step*width) && Aligned(dst) && Aligned(dstStride)); - - size_t size = step*width; - size_t bodySize = Simd::AlignHi(size, A) - A; - - Buffer buffer(Simd::AlignHi(size, 
A)); - - SumCol(src, bodySize, size, buffer.src0); - memcpy(buffer.src1, buffer.src0, sizeof(uint16_t)*(bodySize + A)); - - for (size_t row = 0; row < height; ++row, dst += dstStride) - { - const uint8_t *src2 = src + srcStride*(row + 1); - if (row >= height - 2) - src2 = src + srcStride*(height - 1); - - SumCol(src2, bodySize, size, buffer.src2); - - for (size_t col = 0; col < bodySize; col += A) - Store(dst + col, AverageRow(buffer, col)); - Store(dst + size - A, AverageRow(buffer, bodySize)); - - Swap(buffer.src0, buffer.src2); - Swap(buffer.src0, buffer.src1); - } - } - - template void MeanFilter3x3(const uint8_t * src, size_t srcStride, size_t width, size_t height, - size_t channelCount, uint8_t * dst, size_t dstStride) - { - assert(channelCount > 0 && channelCount <= 4); - - switch (channelCount) - { - case 1: MeanFilter3x3(src, srcStride, width, height, dst, dstStride); break; - case 2: MeanFilter3x3(src, srcStride, width, height, dst, dstStride); break; - case 3: MeanFilter3x3(src, srcStride, width, height, dst, dstStride); break; - case 4: MeanFilter3x3(src, srcStride, width, height, dst, dstStride); break; - } - } - - void MeanFilter3x3(const uint8_t * src, size_t srcStride, size_t width, size_t height, - size_t channelCount, uint8_t * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride) && Aligned(channelCount*width) && Aligned(dst) && Aligned(dstStride)) - MeanFilter3x3(src, srcStride, width, height, channelCount, dst, dstStride); - else - MeanFilter3x3(src, srcStride, width, height, channelCount, dst, dstStride); - } - } -#endif// SIMD_AVX512BW_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx512bwMedianFilter.cpp b/src/3rd/Simd/Simd/SimdAvx512bwMedianFilter.cpp deleted file mode 100644 index 7f7a60a3..00000000 --- a/src/3rd/Simd/Simd/SimdAvx512bwMedianFilter.cpp +++ /dev/null @@ -1,512 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
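// Annotation on MeanFilter3x3 above: the 3x3 box filter is separable, so the
// deleted code first accumulates per-column sums of three horizontal neighbours
// (SumCol) into rotating uint16_t row buffers, then adds three such rows and
// divides by 9, replacing the division with a rounding multiply-high against
// K16_DIVISION_BY_9_FACTOR. The sketch below is the plain definition with an
// ordinary rounded divide, interior pixels only; illustrative, not the library's
// fixed-point path:
#include <cstdint>
#include <cstddef>

static void MeanFilter3x3Scalar(const uint8_t* src, ptrdiff_t stride,
                                size_t width, size_t height,
                                uint8_t* dst, size_t dstStride)
{
    for (size_t row = 1; row + 1 < height; ++row)
    {
        const uint8_t* s = src + stride * (ptrdiff_t)row;
        uint8_t* d = dst + dstStride * row;
        for (size_t col = 1; col + 1 < width; ++col)
        {
            unsigned sum = 0;
            for (int dy = -1; dy <= 1; ++dy)          // 3x3 neighbourhood sum
                for (int dx = -1; dx <= 1; ++dx)
                    sum += s[dy * stride + (ptrdiff_t)col + dx];
            d[col] = (uint8_t)((sum + 4) / 9);        // rounded division by 9
        }
    }
}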
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" - -namespace Simd -{ -#ifdef SIMD_AVX512BW_ENABLE - namespace Avx512bw - { - template SIMD_INLINE void LoadNoseRhomb3x3(const uint8_t* y[3], size_t offset, __m512i a[5]) - { - a[0] = Load(y[0] + offset); - LoadNose3(y[1] + offset, a + 1); - a[4] = Load(y[2] + offset); - } - - template SIMD_INLINE void LoadBodyRhomb3x3(const uint8_t* y[3], size_t offset, __m512i a[5]) - { - a[0] = Load(y[0] + offset); - LoadBody3(y[1] + offset, a + 1); - a[4] = Load(y[2] + offset); - } - - template SIMD_INLINE void LoadTailRhomb3x3(const uint8_t* y[3], size_t offset, __m512i a[5]) - { - a[0] = Load(y[0] + offset); - LoadTail3(y[1] + offset, a + 1); - a[4] = Load(y[2] + offset); - } - - SIMD_INLINE void PartialSort5(__m512i a[5]) - { - SortU8(a[2], a[3]); - SortU8(a[1], a[2]); - SortU8(a[2], a[3]); - a[4] = _mm512_max_epu8(a[1], a[4]); - a[0] = _mm512_min_epu8(a[0], a[3]); - SortU8(a[2], a[0]); - a[2] = _mm512_max_epu8(a[4], a[2]); - a[2] = _mm512_min_epu8(a[2], a[0]); - } - - template void MedianFilterRhomb3x3( - const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride) - { - assert(step*(width - 1) >= A); - - const uint8_t * y[3]; - __m512i a[5]; - - size_t size = step*width; - size_t bodySize = Simd::AlignHi(size, A) - A; - - for (size_t row = 0; row < height; ++row, dst += dstStride) - { - y[0] = src + srcStride*(row - 1); - y[1] = y[0] + srcStride; - y[2] = y[1] + srcStride; - if (row < 1) - y[0] = y[1]; - if (row >= height - 1) - y[2] = y[1]; - - LoadNoseRhomb3x3(y, 0, a); - PartialSort5(a); - Store(dst, a[2]); - - for (size_t col = A; col < bodySize; col += A) - { - LoadBodyRhomb3x3(y, col, a); - PartialSort5(a); - Store(dst + col, a[2]); - } - - size_t col = size - A; - LoadTailRhomb3x3(y, col, a); - PartialSort5(a); - Store(dst + col, a[2]); - } - } - - template void MedianFilterRhomb3x3(const uint8_t * src, size_t srcStride, size_t width, size_t height, - size_t channelCount, uint8_t * dst, size_t dstStride) - { - assert(channelCount > 0 && channelCount <= 4); - - switch (channelCount) - { - case 1: MedianFilterRhomb3x3(src, srcStride, width, height, dst, dstStride); break; - case 2: MedianFilterRhomb3x3(src, srcStride, width, height, dst, dstStride); break; - case 3: MedianFilterRhomb3x3(src, srcStride, width, height, dst, dstStride); break; - case 4: MedianFilterRhomb3x3(src, srcStride, width, height, dst, dstStride); break; - } - } - - void MedianFilterRhomb3x3(const uint8_t * src, size_t srcStride, size_t width, size_t height, - size_t channelCount, uint8_t * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride) && Aligned(width) && Aligned(dst) && Aligned(dstStride)) - MedianFilterRhomb3x3(src, srcStride, width, height, channelCount, dst, dstStride); - else - MedianFilterRhomb3x3(src, srcStride, width, height, channelCount, dst, dstStride); - } - - template SIMD_INLINE void LoadNoseRhomb5x5(const uint8_t* y[5], size_t offset, __m512i a[13]) - { - a[0] = Load(y[0] + offset); - LoadNose3(y[1] + offset, a + 1); - LoadNose5(y[2] + offset, a + 4); - LoadNose3(y[3] + offset, a + 9); - a[12] = Load(y[4] + offset); - } - - template SIMD_INLINE void LoadBodyRhomb5x5(const uint8_t* y[5], size_t offset, __m512i a[13]) - { - a[0] = Load(y[0] + offset); - LoadBody3(y[1] + offset, a + 1); - LoadBody5(y[2] + offset, a + 4); - LoadBody3(y[3] + offset, a + 9); - a[12] = Load(y[4] + offset); - } - - template SIMD_INLINE void LoadTailRhomb5x5(const uint8_t* y[5], size_t offset, __m512i a[13]) - 
{ - a[0] = Load(y[0] + offset); - LoadTail3(y[1] + offset, a + 1); - LoadTail5(y[2] + offset, a + 4); - LoadTail3(y[3] + offset, a + 9); - a[12] = Load(y[4] + offset); - } - - SIMD_INLINE void PartialSort13(__m512i a[13]) - { - SortU8(a[0], a[1]); SortU8(a[3], a[4]); SortU8(a[2], a[4]); - SortU8(a[2], a[3]); SortU8(a[6], a[7]); SortU8(a[5], a[7]); - SortU8(a[5], a[6]); SortU8(a[9], a[10]); SortU8(a[8], a[10]); - SortU8(a[8], a[9]); SortU8(a[11], a[12]); SortU8(a[5], a[8]); - SortU8(a[2], a[8]); SortU8(a[2], a[5]); SortU8(a[6], a[9]); - SortU8(a[3], a[9]); SortU8(a[3], a[6]); SortU8(a[7], a[10]); - SortU8(a[4], a[10]); SortU8(a[4], a[7]); SortU8(a[3], a[12]); - SortU8(a[0], a[9]); - a[1] = _mm512_min_epu8(a[1], a[10]); - a[1] = _mm512_min_epu8(a[1], a[7]); - a[1] = _mm512_min_epu8(a[1], a[9]); - a[11] = _mm512_max_epu8(a[5], a[11]); - a[11] = _mm512_max_epu8(a[3], a[11]); - a[11] = _mm512_max_epu8(a[2], a[11]); - SortU8(a[0], a[6]); SortU8(a[1], a[8]); SortU8(a[6], a[8]); - a[4] = _mm512_min_epu8(a[4], a[8]); - SortU8(a[0], a[1]); SortU8(a[4], a[6]); SortU8(a[0], a[4]); - a[11] = _mm512_max_epu8(a[0], a[11]); - SortU8(a[6], a[11]); - a[1] = _mm512_min_epu8(a[1], a[11]); - SortU8(a[1], a[4]); SortU8(a[6], a[12]); - a[6] = _mm512_max_epu8(a[1], a[6]); - a[4] = _mm512_min_epu8(a[4], a[12]); - a[6] = _mm512_max_epu8(a[4], a[6]); - } - - template void MedianFilterRhomb5x5( - const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride) - { - assert(step*(width - 2) >= A); - - const uint8_t * y[5]; - __m512i a[13]; - - size_t size = step*width; - size_t bodySize = Simd::AlignHi(size, A) - A; - - for (size_t row = 0; row < height; ++row, dst += dstStride) - { - y[0] = src + srcStride*(row - 2); - y[1] = y[0] + srcStride; - y[2] = y[1] + srcStride; - y[3] = y[2] + srcStride; - y[4] = y[3] + srcStride; - if (row < 2) - { - if (row < 1) - y[1] = y[2]; - y[0] = y[1]; - } - if (row >= height - 2) - { - if (row >= height - 1) - y[3] = y[2]; - y[4] = y[3]; - } - - LoadNoseRhomb5x5(y, 0, a); - PartialSort13(a); - Store(dst, a[6]); - - for (size_t col = A; col < bodySize; col += A) - { - LoadBodyRhomb5x5(y, col, a); - PartialSort13(a); - Store(dst + col, a[6]); - } - - size_t col = size - A; - LoadTailRhomb5x5(y, col, a); - PartialSort13(a); - Store(dst + col, a[6]); - } - } - - template void MedianFilterRhomb5x5(const uint8_t * src, size_t srcStride, size_t width, size_t height, - size_t channelCount, uint8_t * dst, size_t dstStride) - { - assert(channelCount > 0 && channelCount <= 4); - - switch (channelCount) - { - case 1: MedianFilterRhomb5x5(src, srcStride, width, height, dst, dstStride); break; - case 2: MedianFilterRhomb5x5(src, srcStride, width, height, dst, dstStride); break; - case 3: MedianFilterRhomb5x5(src, srcStride, width, height, dst, dstStride); break; - case 4: MedianFilterRhomb5x5(src, srcStride, width, height, dst, dstStride); break; - } - } - - void MedianFilterRhomb5x5(const uint8_t * src, size_t srcStride, size_t width, size_t height, - size_t channelCount, uint8_t * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride) && Aligned(width) && Aligned(dst) && Aligned(dstStride)) - MedianFilterRhomb5x5(src, srcStride, width, height, channelCount, dst, dstStride); - else - MedianFilterRhomb5x5(src, srcStride, width, height, channelCount, dst, dstStride); - } - - template SIMD_INLINE void LoadNoseSquare3x3(const uint8_t* y[3], size_t offset, __m512i a[9]) - { - LoadNose3(y[0] + offset, a + 0); - LoadNose3(y[1] + offset, a + 3); - 
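// Annotation on the median filters in this file: PartialSort5/13 above and
// PartialSort9/25 below are branch-free sorting networks built from min/max pairs;
// they order the neighbourhood only far enough to pin the median into one known
// register. Scalar mirror of PartialSort5 above (median of the 5-point rhombus),
// following the deleted network step for step:
#include <algorithm>
#include <cstdint>

static inline void SortU8(uint8_t& a, uint8_t& b) // put min in a, max in b
{
    uint8_t t = std::min(a, b);
    b = std::max(a, b);
    a = t;
}

static uint8_t Median5(uint8_t a0, uint8_t a1, uint8_t a2, uint8_t a3, uint8_t a4)
{
    SortU8(a2, a3);
    SortU8(a1, a2);
    SortU8(a2, a3);
    a4 = std::max(a1, a4);
    a0 = std::min(a0, a3);
    SortU8(a2, a0);
    a2 = std::max(a4, a2);
    a2 = std::min(a2, a0); // the median ends up in a2
    return a2;
}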
LoadNose3(y[2] + offset, a + 6); - } - - template SIMD_INLINE void LoadBodySquare3x3(const uint8_t* y[3], size_t offset, __m512i a[9]) - { - LoadBody3(y[0] + offset, a + 0); - LoadBody3(y[1] + offset, a + 3); - LoadBody3(y[2] + offset, a + 6); - } - - template SIMD_INLINE void LoadTailSquare3x3(const uint8_t* y[3], size_t offset, __m512i a[9]) - { - LoadTail3(y[0] + offset, a + 0); - LoadTail3(y[1] + offset, a + 3); - LoadTail3(y[2] + offset, a + 6); - } - - SIMD_INLINE void PartialSort9(__m512i a[9]) - { - SortU8(a[1], a[2]); SortU8(a[4], a[5]); SortU8(a[7], a[8]); - SortU8(a[0], a[1]); SortU8(a[3], a[4]); SortU8(a[6], a[7]); - SortU8(a[1], a[2]); SortU8(a[4], a[5]); SortU8(a[7], a[8]); - a[3] = _mm512_max_epu8(a[0], a[3]); - a[5] = _mm512_min_epu8(a[5], a[8]); - SortU8(a[4], a[7]); - a[6] = _mm512_max_epu8(a[3], a[6]); - a[4] = _mm512_max_epu8(a[1], a[4]); - a[2] = _mm512_min_epu8(a[2], a[5]); - a[4] = _mm512_min_epu8(a[4], a[7]); - SortU8(a[4], a[2]); - a[4] = _mm512_max_epu8(a[6], a[4]); - a[4] = _mm512_min_epu8(a[4], a[2]); - } - - template void MedianFilterSquare3x3( - const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride) - { - assert(step*(width - 1) >= A); - - const uint8_t * y[3]; - __m512i a[9]; - - size_t size = step*width; - size_t bodySize = Simd::AlignHi(size, A) - A; - - for (size_t row = 0; row < height; ++row, dst += dstStride) - { - y[0] = src + srcStride*(row - 1); - y[1] = y[0] + srcStride; - y[2] = y[1] + srcStride; - if (row < 1) - y[0] = y[1]; - if (row >= height - 1) - y[2] = y[1]; - - LoadNoseSquare3x3(y, 0, a); - PartialSort9(a); - Store(dst, a[4]); - - for (size_t col = A; col < bodySize; col += A) - { - LoadBodySquare3x3(y, col, a); - PartialSort9(a); - Store(dst + col, a[4]); - } - - size_t col = size - A; - LoadTailSquare3x3(y, col, a); - PartialSort9(a); - Store(dst + col, a[4]); - } - } - - template void MedianFilterSquare3x3(const uint8_t * src, size_t srcStride, size_t width, size_t height, - size_t channelCount, uint8_t * dst, size_t dstStride) - { - assert(channelCount > 0 && channelCount <= 4); - - switch (channelCount) - { - case 1: MedianFilterSquare3x3(src, srcStride, width, height, dst, dstStride); break; - case 2: MedianFilterSquare3x3(src, srcStride, width, height, dst, dstStride); break; - case 3: MedianFilterSquare3x3(src, srcStride, width, height, dst, dstStride); break; - case 4: MedianFilterSquare3x3(src, srcStride, width, height, dst, dstStride); break; - } - } - - void MedianFilterSquare3x3(const uint8_t * src, size_t srcStride, size_t width, size_t height, - size_t channelCount, uint8_t * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride) && Aligned(width) && Aligned(dst) && Aligned(dstStride)) - MedianFilterSquare3x3(src, srcStride, width, height, channelCount, dst, dstStride); - else - MedianFilterSquare3x3(src, srcStride, width, height, channelCount, dst, dstStride); - } - - - template SIMD_INLINE void LoadNoseSquare5x5(const uint8_t* y[5], size_t offset, __m512i a[25]) - { - LoadNose5(y[0] + offset, a + 0); - LoadNose5(y[1] + offset, a + 5); - LoadNose5(y[2] + offset, a + 10); - LoadNose5(y[3] + offset, a + 15); - LoadNose5(y[4] + offset, a + 20); - } - - template SIMD_INLINE void LoadBodySquare5x5(const uint8_t* y[5], size_t offset, __m512i a[25]) - { - LoadBody5(y[0] + offset, a + 0); - LoadBody5(y[1] + offset, a + 5); - LoadBody5(y[2] + offset, a + 10); - LoadBody5(y[3] + offset, a + 15); - LoadBody5(y[4] + offset, a + 20); - } - - template SIMD_INLINE void 
LoadTailSquare5x5(const uint8_t* y[5], size_t offset, __m512i a[25]) - { - LoadTail5(y[0] + offset, a + 0); - LoadTail5(y[1] + offset, a + 5); - LoadTail5(y[2] + offset, a + 10); - LoadTail5(y[3] + offset, a + 15); - LoadTail5(y[4] + offset, a + 20); - } - - SIMD_INLINE void PartialSort25(__m512i a[25]) - { - SortU8(a[0], a[1]); SortU8(a[3], a[4]); SortU8(a[2], a[4]); - SortU8(a[2], a[3]); SortU8(a[6], a[7]); SortU8(a[5], a[7]); - SortU8(a[5], a[6]); SortU8(a[9], a[10]); SortU8(a[8], a[10]); - SortU8(a[8], a[9]); SortU8(a[12], a[13]); SortU8(a[11], a[13]); - SortU8(a[11], a[12]); SortU8(a[15], a[16]); SortU8(a[14], a[16]); - SortU8(a[14], a[15]); SortU8(a[18], a[19]); SortU8(a[17], a[19]); - SortU8(a[17], a[18]); SortU8(a[21], a[22]); SortU8(a[20], a[22]); - SortU8(a[20], a[21]); SortU8(a[23], a[24]); SortU8(a[2], a[5]); - SortU8(a[3], a[6]); SortU8(a[0], a[6]); SortU8(a[0], a[3]); - SortU8(a[4], a[7]); SortU8(a[1], a[7]); SortU8(a[1], a[4]); - SortU8(a[11], a[14]); SortU8(a[8], a[14]); SortU8(a[8], a[11]); - SortU8(a[12], a[15]); SortU8(a[9], a[15]); SortU8(a[9], a[12]); - SortU8(a[13], a[16]); SortU8(a[10], a[16]); SortU8(a[10], a[13]); - SortU8(a[20], a[23]); SortU8(a[17], a[23]); SortU8(a[17], a[20]); - SortU8(a[21], a[24]); SortU8(a[18], a[24]); SortU8(a[18], a[21]); - SortU8(a[19], a[22]); SortU8(a[9], a[18]); SortU8(a[0], a[18]); - a[17] = _mm512_max_epu8(a[8], a[17]); - a[9] = _mm512_max_epu8(a[0], a[9]); - SortU8(a[10], a[19]); SortU8(a[1], a[19]); SortU8(a[1], a[10]); - SortU8(a[11], a[20]); SortU8(a[2], a[20]); SortU8(a[12], a[21]); - a[11] = _mm512_max_epu8(a[2], a[11]); - SortU8(a[3], a[21]); SortU8(a[3], a[12]); SortU8(a[13], a[22]); - a[4] = _mm512_min_epu8(a[4], a[22]); - SortU8(a[4], a[13]); SortU8(a[14], a[23]); - SortU8(a[5], a[23]); SortU8(a[5], a[14]); SortU8(a[15], a[24]); - a[6] = _mm512_min_epu8(a[6], a[24]); - SortU8(a[6], a[15]); - a[7] = _mm512_min_epu8(a[7], a[16]); - a[7] = _mm512_min_epu8(a[7], a[19]); - a[13] = _mm512_min_epu8(a[13], a[21]); - a[15] = _mm512_min_epu8(a[15], a[23]); - a[7] = _mm512_min_epu8(a[7], a[13]); - a[7] = _mm512_min_epu8(a[7], a[15]); - a[9] = _mm512_max_epu8(a[1], a[9]); - a[11] = _mm512_max_epu8(a[3], a[11]); - a[17] = _mm512_max_epu8(a[5], a[17]); - a[17] = _mm512_max_epu8(a[11], a[17]); - a[17] = _mm512_max_epu8(a[9], a[17]); - SortU8(a[4], a[10]); - SortU8(a[6], a[12]); SortU8(a[7], a[14]); SortU8(a[4], a[6]); - a[7] = _mm512_max_epu8(a[4], a[7]); - SortU8(a[12], a[14]); - a[10] = _mm512_min_epu8(a[10], a[14]); - SortU8(a[6], a[7]); SortU8(a[10], a[12]); SortU8(a[6], a[10]); - a[17] = _mm512_max_epu8(a[6], a[17]); - SortU8(a[12], a[17]); - a[7] = _mm512_min_epu8(a[7], a[17]); - SortU8(a[7], a[10]); SortU8(a[12], a[18]); - a[12] = _mm512_max_epu8(a[7], a[12]); - a[10] = _mm512_min_epu8(a[10], a[18]); - SortU8(a[12], a[20]); - a[10] = _mm512_min_epu8(a[10], a[20]); - a[12] = _mm512_max_epu8(a[10], a[12]); - } - - template void MedianFilterSquare5x5( - const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride) - { - assert(step*(width - 2) >= A); - - const uint8_t * y[5]; - __m512i a[25]; - - size_t size = step*width; - size_t bodySize = Simd::AlignHi(size, A) - A; - - for (size_t row = 0; row < height; ++row, dst += dstStride) - { - y[0] = src + srcStride*(row - 2); - y[1] = y[0] + srcStride; - y[2] = y[1] + srcStride; - y[3] = y[2] + srcStride; - y[4] = y[3] + srcStride; - if (row < 2) - { - if (row < 1) - y[1] = y[2]; - y[0] = y[1]; - } - if (row >= height - 2) - { - if (row >= height - 1) 
- y[3] = y[2]; - y[4] = y[3]; - } - - LoadNoseSquare5x5(y, 0, a); - PartialSort25(a); - Store(dst, a[12]); - - for (size_t col = A; col < bodySize; col += A) - { - LoadBodySquare5x5(y, col, a); - PartialSort25(a); - Store(dst + col, a[12]); - } - - size_t col = size - A; - LoadTailSquare5x5(y, col, a); - PartialSort25(a); - Store(dst + col, a[12]); - } - } - - template void MedianFilterSquare5x5(const uint8_t * src, size_t srcStride, size_t width, size_t height, - size_t channelCount, uint8_t * dst, size_t dstStride) - { - assert(channelCount > 0 && channelCount <= 4); - - switch (channelCount) - { - case 1: MedianFilterSquare5x5(src, srcStride, width, height, dst, dstStride); break; - case 2: MedianFilterSquare5x5(src, srcStride, width, height, dst, dstStride); break; - case 3: MedianFilterSquare5x5(src, srcStride, width, height, dst, dstStride); break; - case 4: MedianFilterSquare5x5(src, srcStride, width, height, dst, dstStride); break; - } - } - - void MedianFilterSquare5x5(const uint8_t * src, size_t srcStride, size_t width, size_t height, - size_t channelCount, uint8_t * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride) && Aligned(width) && Aligned(dst) && Aligned(dstStride)) - MedianFilterSquare5x5(src, srcStride, width, height, channelCount, dst, dstStride); - else - MedianFilterSquare5x5(src, srcStride, width, height, channelCount, dst, dstStride); - } - } -#endif// SIMD_AVX512BW_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx512bwNeural.cpp b/src/3rd/Simd/Simd/SimdAvx512bwNeural.cpp deleted file mode 100644 index 1098d70c..00000000 --- a/src/3rd/Simd/Simd/SimdAvx512bwNeural.cpp +++ /dev/null @@ -1,95 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdStream.h" - -namespace Simd -{ -#ifdef SIMD_AVX512BW_ENABLE - namespace Avx512bw - { - template <bool inversion> __m128i Invert(__m128i value); - - template <> __m128i Invert<true>(__m128i value) - { - return _mm_sub_epi8(Sse2::K_INV_ZERO, value); - } - - template <> __m128i Invert<false>(__m128i value) - { - return value; - } - - template <bool inversion, bool align, bool stream> void Convert(const uint8_t * src, const __m512 & _1_255, float * dst) - { - __m128i _src = Invert<inversion>(Sse2::Load<align>((__m128i*)src)); - Avx512f::Stream<align, stream>(dst, _mm512_mul_ps(_mm512_cvtepi32_ps(_mm512_cvtepu8_epi32(_src)), _1_255)); - } - - template <bool inversion, bool align, bool stream> void NeuralConvert(const uint8_t * src, size_t srcStride, size_t width, size_t height, float * dst, size_t dstStride) - { - assert(width >= F); - if (align) - assert(Aligned(src, Sse2::A) && Aligned(srcStride, Sse2::A) && Aligned(dst) && Aligned(dstStride)); - - size_t alignedWidth = AlignLo(width, F); - __m512 _1_255 = _mm512_set1_ps(1.0f / 255.0f); - - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += F) - Convert<inversion, align, stream>(src + col, _1_255, dst + col); - if (width != alignedWidth) - Convert<inversion, false, stream>(src + width - F, _1_255, dst + width - F); - src += srcStride; - dst += dstStride; - } - if (stream) - _mm_mfence(); - } - - template <bool inversion> void NeuralConvert(const uint8_t * src, size_t srcStride, size_t width, size_t height, float * dst, size_t dstStride) - { - if (Aligned(src, Sse2::A) && Aligned(srcStride, Sse2::A) && Aligned(dst) && Aligned(dstStride)) - { - if (width*height * sizeof(float) >= STREAM_SIZE_MIN) - NeuralConvert<inversion, true, true>(src, srcStride, width, height, dst, dstStride); - else - NeuralConvert<inversion, true, false>(src, srcStride, width, height, dst, dstStride); - } - else - NeuralConvert<inversion, false, false>(src, srcStride, width, height, dst, dstStride); - } - - void NeuralConvert(const uint8_t * src, size_t srcStride, size_t width, size_t height, float * dst, size_t dstStride, int inversion) - { - if (inversion) - NeuralConvert<true>(src, srcStride, width, height, dst, dstStride); - else - NeuralConvert<false>(src, srcStride, width, height, dst, dstStride); - } - } -#endif// SIMD_AVX512BW_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx512bwOperation.cpp b/src/3rd/Simd/Simd/SimdAvx512bwOperation.cpp deleted file mode 100644 index 9523db90..00000000 --- a/src/3rd/Simd/Simd/SimdAvx512bwOperation.cpp +++ /dev/null @@ -1,274 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
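// Annotation on NeuralConvert above: it turns 8-bit pixels into floats in [0, 1]
// (optionally inverted as 255 - p first), and for large destinations uses
// non-temporal streaming stores that bypass the cache, which is why the streaming
// path ends with _mm_mfence(). Scalar sketch, illustrative only:
#include <cstdint>
#include <cstddef>

static void NeuralConvertScalar(const uint8_t* src, size_t srcStride,
                                size_t width, size_t height,
                                float* dst, size_t dstStride, bool invert)
{
    const float k = 1.0f / 255.0f;     // same scale as the vector constant _1_255
    for (size_t row = 0; row < height; ++row)
    {
        for (size_t col = 0; col < width; ++col)
        {
            uint8_t p = src[col];
            dst[col] = (invert ? 255 - p : p) * k;
        }
        src += srcStride;
        dst += dstStride;
    }
}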
IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" - -namespace Simd -{ -#ifdef SIMD_AVX512BW_ENABLE - namespace Avx512bw - { - template SIMD_INLINE __m512i OperationBinary8u(const __m512i & a, const __m512i & b); - - template <> SIMD_INLINE __m512i OperationBinary8u(const __m512i & a, const __m512i & b) - { - return _mm512_avg_epu8(a, b); - } - - template <> SIMD_INLINE __m512i OperationBinary8u(const __m512i & a, const __m512i & b) - { - return _mm512_and_si512(a, b); - } - - template <> SIMD_INLINE __m512i OperationBinary8u(const __m512i & a, const __m512i & b) - { - return _mm512_or_si512(a, b); - } - - template <> SIMD_INLINE __m512i OperationBinary8u(const __m512i & a, const __m512i & b) - { - return _mm512_max_epu8(a, b); - } - - template <> SIMD_INLINE __m512i OperationBinary8u(const __m512i & a, const __m512i & b) - { - return _mm512_min_epu8(a, b); - } - - template <> SIMD_INLINE __m512i OperationBinary8u(const __m512i & a, const __m512i & b) - { - return _mm512_subs_epu8(a, b); - } - - template <> SIMD_INLINE __m512i OperationBinary8u(const __m512i & a, const __m512i & b) - { - return _mm512_adds_epu8(a, b); - } - - template void OperationBinary8u(const uint8_t * a, const uint8_t * b, uint8_t * dst, size_t offset, __mmask64 m = -1) - { - const __m512i _a = Load(a + offset, m); - const __m512i _b = Load(b + offset, m); - Store(dst + offset, OperationBinary8u(_a, _b), m); - } - - template void OperationBinary8u(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, - size_t width, size_t height, size_t channelCount, uint8_t * dst, size_t dstStride) - { - if (align) - assert(Aligned(a) && Aligned(aStride) && Aligned(b) && Aligned(bStride) && Aligned(dst) && Aligned(dstStride)); - - size_t size = channelCount*width; - size_t fullAlignedSize = Simd::AlignLo(size, QA); - size_t partialAlignedSize = Simd::AlignLo(size, A); - __mmask64 tailMask = __mmask64(-1) >> (A + partialAlignedSize - size); - for (size_t row = 0; row < height; ++row) - { - size_t offset = 0; - for (; offset < fullAlignedSize; offset += QA) - { - OperationBinary8u(a, b, dst, offset); - OperationBinary8u(a, b, dst, offset + A); - OperationBinary8u(a, b, dst, offset + 2 * A); - OperationBinary8u(a, b, dst, offset + 3 * A); - } - for (; offset < partialAlignedSize; offset += A) - OperationBinary8u(a, b, dst, offset); - for (; offset < size; offset += A) - OperationBinary8u(a, b, dst, offset, tailMask); - a += aStride; - b += bStride; - dst += dstStride; - } - } - - template void OperationBinary8u(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, - size_t width, size_t height, size_t channelCount, uint8_t * dst, size_t dstStride, SimdOperationBinary8uType type) - { - switch (type) - { - case SimdOperationBinary8uAverage: - return OperationBinary8u(a, aStride, b, bStride, width, height, channelCount, dst, dstStride); - case SimdOperationBinary8uAnd: - return OperationBinary8u(a, aStride, b, bStride, width, height, channelCount, dst, dstStride); - case SimdOperationBinary8uOr: - return OperationBinary8u(a, aStride, b, bStride, width, height, channelCount, dst, dstStride); - case SimdOperationBinary8uMaximum: - return OperationBinary8u(a, aStride, b, bStride, width, height, channelCount, dst, dstStride); - case 
SimdOperationBinary8uMinimum: - return OperationBinary8u(a, aStride, b, bStride, width, height, channelCount, dst, dstStride); - case SimdOperationBinary8uSaturatedSubtraction: - return OperationBinary8u(a, aStride, b, bStride, width, height, channelCount, dst, dstStride); - case SimdOperationBinary8uSaturatedAddition: - return OperationBinary8u(a, aStride, b, bStride, width, height, channelCount, dst, dstStride); - default: - assert(0); - } - } - - void OperationBinary8u(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, - size_t width, size_t height, size_t channelCount, uint8_t * dst, size_t dstStride, SimdOperationBinary8uType type) - { - if (Aligned(a) && Aligned(aStride) && Aligned(b) && Aligned(bStride) && Aligned(dst) && Aligned(dstStride)) - OperationBinary8u(a, aStride, b, bStride, width, height, channelCount, dst, dstStride, type); - else - OperationBinary8u(a, aStride, b, bStride, width, height, channelCount, dst, dstStride, type); - } - - template SIMD_INLINE __m512i OperationBinary16i(const __m512i & a, const __m512i & b); - - template <> SIMD_INLINE __m512i OperationBinary16i(const __m512i & a, const __m512i & b) - { - return _mm512_add_epi16(a, b); - } - - template <> SIMD_INLINE __m512i OperationBinary16i(const __m512i & a, const __m512i & b) - { - return _mm512_sub_epi16(a, b); - } - - template void OperationBinary16i(const uint8_t * a, const uint8_t * b, uint8_t * dst, size_t offset, __mmask32 m = -1) - { - const __m512i _a = Load((int16_t*)(a + offset), m); - const __m512i _b = Load((int16_t*)(b + offset), m); - Store((int16_t*)(dst + offset), OperationBinary16i(_a, _b), m); - } - - template void OperationBinary16i(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, - size_t width, size_t height, uint8_t * dst, size_t dstStride) - { - if (align) - assert(Aligned(a) && Aligned(aStride) && Aligned(b) && Aligned(bStride) && Aligned(dst) && Aligned(dstStride)); - - size_t size = width * sizeof(int16_t); - size_t fullAlignedSize = Simd::AlignLo(size, QA); - size_t partialAlignedSize = Simd::AlignLo(size, A); - __mmask32 tailMask = __mmask32(-1) >> ((A + partialAlignedSize - size) / sizeof(int16_t)); - for (size_t row = 0; row < height; ++row) - { - size_t offset = 0; - for (; offset < fullAlignedSize; offset += QA) - { - OperationBinary16i(a, b, dst, offset); - OperationBinary16i(a, b, dst, offset + A); - OperationBinary16i(a, b, dst, offset + 2 * A); - OperationBinary16i(a, b, dst, offset + 3 * A); - } - for (; offset < partialAlignedSize; offset += A) - OperationBinary16i(a, b, dst, offset); - for (; offset < size; offset += A) - OperationBinary16i(a, b, dst, offset, tailMask); - a += aStride; - b += bStride; - dst += dstStride; - } - } - - template void OperationBinary16i(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, - size_t width, size_t height, uint8_t * dst, size_t dstStride, SimdOperationBinary16iType type) - { - switch (type) - { - case SimdOperationBinary16iAddition: - return OperationBinary16i(a, aStride, b, bStride, width, height, dst, dstStride); - case SimdOperationBinary16iSubtraction: - return OperationBinary16i(a, aStride, b, bStride, width, height, dst, dstStride); - default: - assert(0); - } - } - - void OperationBinary16i(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, - size_t width, size_t height, uint8_t * dst, size_t dstStride, SimdOperationBinary16iType type) - { - if (Aligned(a) && Aligned(aStride) && Aligned(b) && Aligned(bStride) && Aligned(dst) && 
Aligned(dstStride)) - OperationBinary16i(a, aStride, b, bStride, width, height, dst, dstStride, type); - else - OperationBinary16i(a, aStride, b, bStride, width, height, dst, dstStride, type); - } - - SIMD_INLINE __m512i VectorProduct(const __m512i & vertical, const __m512i & horizontalLo, const __m512i & horizontalHi) - { - __m512i lo = DivideI16By255(_mm512_mullo_epi16(vertical, horizontalLo)); - __m512i hi = DivideI16By255(_mm512_mullo_epi16(vertical, horizontalHi)); - return _mm512_packus_epi16(lo, hi); - } - - template SIMD_INLINE void VectorProduct2(const __m512i & vertical0, const __m512i & vertical1, const uint8_t * horizontal, uint8_t * dst, size_t stride, __mmask64 m = -1) - { - __m512i _horizontal = Load(horizontal, m); - __m512i horizontalLo = UnpackU8<0>(_horizontal); - __m512i horizontalHi = UnpackU8<1>(_horizontal); - Store(dst + 0 * stride, VectorProduct(vertical0, horizontalLo, horizontalHi), m); - Store(dst + 1 * stride, VectorProduct(vertical1, horizontalLo, horizontalHi), m); - } - - template SIMD_INLINE void VectorProduct1(const __m512i & vertical, const uint8_t * horizontal, uint8_t * dst, __mmask64 m = -1) - { - __m512i _horizontal = Load(horizontal, m); - Store(dst, VectorProduct(vertical, UnpackU8<0>(_horizontal), UnpackU8<1>(_horizontal)), m); - } - - template void VectorProduct(const uint8_t * vertical, const uint8_t * horizontal, uint8_t * dst, size_t stride, size_t width, size_t height) - { - if (align) - assert(Aligned(horizontal) && Aligned(dst) && Aligned(stride)); - - size_t alignedHeight = Simd::AlignLo(height, 2); - size_t alignedWidth = Simd::AlignLo(width, A); - __mmask64 tailMask = __mmask64(-1) >> (A + alignedWidth - width); - size_t row = 0; - for (; row < alignedHeight; row += 2) - { - __m512i vertical0 = _mm512_set1_epi16(vertical[row + 0]); - __m512i vertical1 = _mm512_set1_epi16(vertical[row + 1]); - size_t col = 0; - for (; col < alignedWidth; col += A) - VectorProduct2(vertical0, vertical1, horizontal + col, dst + col, stride); - if (col < width) - VectorProduct2(vertical0, vertical1, horizontal + col, dst + col, stride, tailMask); - dst += 2 * stride; - } - for (; row < height; ++row) - { - __m512i _vertical = _mm512_set1_epi16(vertical[row]); - size_t col = 0; - for (; col < alignedWidth; col += A) - VectorProduct1(_vertical, horizontal + col, dst + col); - if (col < width) - VectorProduct1(_vertical, horizontal + col, dst + col, tailMask); - dst += stride; - } - } - - void VectorProduct(const uint8_t * vertical, const uint8_t * horizontal, uint8_t * dst, size_t stride, size_t width, size_t height) - { - if (Aligned(horizontal) && Aligned(dst) && Aligned(stride)) - VectorProduct(vertical, horizontal, dst, stride, width, height); - else - VectorProduct(vertical, horizontal, dst, stride, width, height); - } - } -#endif// SIMD_AVX512BW_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx512bwReduce.cpp b/src/3rd/Simd/Simd/SimdAvx512bwReduce.cpp deleted file mode 100644 index 50017826..00000000 --- a/src/3rd/Simd/Simd/SimdAvx512bwReduce.cpp +++ /dev/null @@ -1,248 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2018 Yermalayeu Ihar. 
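A pattern worth calling out in the operation kernels above: row tails are never handled with a scalar loop. A __mmask64 whose low (size - offset) bits are set drives masked loads and stores, so the same template body covers full 64-byte blocks and ragged row ends. A minimal standalone sketch of that idiom, assuming a hypothetical AverageU8 helper (not the library's OperationBinary8u API) and AVX-512BW support:

    #include <immintrin.h>
    #include <cstddef>
    #include <cstdint>

    // Average two byte arrays; compile with -mavx512bw (or /arch:AVX512).
    void AverageU8(const uint8_t* a, const uint8_t* b, uint8_t* dst, size_t size)
    {
        size_t i = 0;
        for (; i + 64 <= size; i += 64) // full 64-byte blocks
        {
            __m512i va = _mm512_loadu_si512(a + i);
            __m512i vb = _mm512_loadu_si512(b + i);
            _mm512_storeu_si512(dst + i, _mm512_avg_epu8(va, vb));
        }
        if (i < size) // ragged tail: mask off the lanes past the end
        {
            __mmask64 tail = (__mmask64)-1 >> (64 - (size - i));
            __m512i va = _mm512_maskz_loadu_epi8(tail, a + i);
            __m512i vb = _mm512_maskz_loadu_epi8(tail, b + i);
            _mm512_mask_storeu_epi8(dst + i, tail, _mm512_avg_epu8(va, vb));
        }
    }

This mirrors the tailMask computation in OperationBinary8u above, where A is the 64-byte vector width and the mask covers the trailing bytes past the last aligned block.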
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" - -namespace Simd -{ -#ifdef SIMD_AVX512BW_ENABLE - namespace Avx512bw - { - SIMD_INLINE __m512i Reduce16(const __m512i & s0, const __m512i & s1) - { - return _mm512_srli_epi16(_mm512_add_epi16(_mm512_add_epi16(_mm512_maddubs_epi16(s0, K8_01), _mm512_maddubs_epi16(s1, K8_01)), K16_0002), 2); - } - - SIMD_INLINE __m512i Reduce8(const __m512i & s00, const __m512i & s01, const __m512i & s10, const __m512i & s11) - { - return _mm512_permutexvar_epi64(K64_PERMUTE_FOR_PACK, _mm512_packus_epi16(Reduce16(s00, s10), Reduce16(s01, s11))); - } - - template __m512i Reduce8(const __m512i & s00, const __m512i & s01, const __m512i & s10, const __m512i & s11); - - template<> SIMD_INLINE __m512i Reduce8<1>(const __m512i & s00, const __m512i & s01, const __m512i & s10, const __m512i & s11) - { - return Reduce8(s00, s01, s10, s11); - } - - const __m512i K8_RC2 = SIMD_MM512_SETR_EPI8( - 0x0, 0x2, 0x1, 0x3, 0x4, 0x6, 0x5, 0x7, 0x8, 0xA, 0x9, 0xB, 0xC, 0xE, 0xD, 0xF, - 0x0, 0x2, 0x1, 0x3, 0x4, 0x6, 0x5, 0x7, 0x8, 0xA, 0x9, 0xB, 0xC, 0xE, 0xD, 0xF, - 0x0, 0x2, 0x1, 0x3, 0x4, 0x6, 0x5, 0x7, 0x8, 0xA, 0x9, 0xB, 0xC, 0xE, 0xD, 0xF, - 0x0, 0x2, 0x1, 0x3, 0x4, 0x6, 0x5, 0x7, 0x8, 0xA, 0x9, 0xB, 0xC, 0xE, 0xD, 0xF); - - template<> SIMD_INLINE __m512i Reduce8<2>(const __m512i & s00, const __m512i & s01, const __m512i & s10, const __m512i & s11) - { - return Reduce8(_mm512_shuffle_epi8(s00, K8_RC2), _mm512_shuffle_epi8(s01, K8_RC2), _mm512_shuffle_epi8(s10, K8_RC2), _mm512_shuffle_epi8(s11, K8_RC2)); - } - - const __m512i K8_RC4 = SIMD_MM512_SETR_EPI8( - 0x0, 0x4, 0x1, 0x5, 0x2, 0x6, 0x3, 0x7, 0x8, 0xC, 0x9, 0xD, 0xA, 0xE, 0xB, 0xF, - 0x0, 0x4, 0x1, 0x5, 0x2, 0x6, 0x3, 0x7, 0x8, 0xC, 0x9, 0xD, 0xA, 0xE, 0xB, 0xF, - 0x0, 0x4, 0x1, 0x5, 0x2, 0x6, 0x3, 0x7, 0x8, 0xC, 0x9, 0xD, 0xA, 0xE, 0xB, 0xF, - 0x0, 0x4, 0x1, 0x5, 0x2, 0x6, 0x3, 0x7, 0x8, 0xC, 0x9, 0xD, 0xA, 0xE, 0xB, 0xF); - - template<> SIMD_INLINE __m512i Reduce8<4>(const __m512i & s00, const __m512i & s01, const __m512i & s10, const __m512i & s11) - { - return Reduce8(_mm512_shuffle_epi8(s00, K8_RC4), _mm512_shuffle_epi8(s01, K8_RC4), _mm512_shuffle_epi8(s10, K8_RC4), _mm512_shuffle_epi8(s11, K8_RC4)); - } - - template SIMD_INLINE void ReduceColor2x2(const uint8_t * src0, const uint8_t * src1, uint8_t * dst, __mmask64 * tails) - { - __m512i s00 = Load(src0 + 0, tails[0]); - __m512i s01 = Load(src0 + A, tails[1]); - __m512i s10 = Load(src1 + 0, 
tails[0]); - __m512i s11 = Load(src1 + A, tails[1]); - Store(dst, Reduce8(s00, s01, s10, s11), tails[2]); - } - - template void ReduceColor2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, uint8_t * dst, size_t dstStride) - { - size_t evenWidth = AlignLo(srcWidth, 2); - size_t evenSize = evenWidth * channelCount; - size_t alignedSize = AlignLo(evenSize, DA); - __mmask64 tailMasks[3]; - for (size_t c = 0; c < 2; ++c) - tailMasks[c] = TailMask64(evenSize - alignedSize - A * c); - tailMasks[2] = TailMask64((evenSize - alignedSize) / 2); - for (size_t srcRow = 0; srcRow < srcHeight; srcRow += 2) - { - const uint8_t *src0 = src; - const uint8_t *src1 = (srcRow == srcHeight - 1 ? src : src + srcStride); - size_t srcOffset = 0, dstOffset = 0; - for (; srcOffset < alignedSize; srcOffset += DA, dstOffset += A) - ReduceColor2x2(src0 + srcOffset, src1 + srcOffset, dst + dstOffset, tailMasks); - if (srcOffset < evenSize) - ReduceColor2x2(src0 + srcOffset, src1 + srcOffset, dst + dstOffset, tailMasks); - if (evenWidth != srcWidth) - { - for (size_t c = 0; c < channelCount; ++c) - dst[evenSize / 2 + c] = Base::Average(src0[evenSize + c], src1[evenSize + c]); - } - src += 2 * srcStride; - dst += dstStride; - } - } - - const __m512i K8_BGR_SM0 = SIMD_MM512_SETR_EPI8( - 0x0, 0x3, 0x1, 0x4, 0x2, 0x5, 0x6, 0x9, 0x7, 0xA, 0x8, 0xB, 0xC, 0xF, 0xD, -1, - -1, 0x1, 0x2, 0x5, 0x3, 0x6, 0x4, 0x7, 0x8, 0xB, 0x9, 0xC, 0xA, 0xD, 0xE, -1, - -1, 0x2, 0x0, 0x3, 0x4, 0x7, 0x5, 0x8, 0x6, 0x9, 0xA, 0xD, 0xB, 0xE, 0xC, 0xF, - 0x0, 0x3, 0x1, 0x4, 0x2, 0x5, 0x6, 0x9, 0x7, 0xA, 0x8, 0xB, 0xC, 0xF, 0xD, -1); - const __m512i K8_BGR_SM1 = SIMD_MM512_SETR_EPI8( - -1, 0x1, 0x2, 0x5, 0x3, 0x6, 0x4, 0x7, 0x8, 0xB, 0x9, 0xC, 0xA, 0xD, 0xE, -1, - -1, 0x2, 0x0, 0x3, 0x4, 0x7, 0x5, 0x8, 0x6, 0x9, 0xA, 0xD, 0xB, 0xE, 0xC, 0xF, - 0x0, 0x3, 0x1, 0x4, 0x2, 0x5, 0x6, 0x9, 0x7, 0xA, 0x8, 0xB, 0xC, 0xF, 0xD, -1, - -1, 0x1, 0x2, 0x5, 0x3, 0x6, 0x4, 0x7, 0x8, 0xB, 0x9, 0xC, 0xA, 0xD, 0xE, -1); - const __m512i K8_BGR_SM2 = SIMD_MM512_SETR_EPI8( - -1, 0x2, 0x0, 0x3, 0x4, 0x7, 0x5, 0x8, 0x6, 0x9, 0xA, 0xD, 0xB, 0xE, 0xC, 0xF, - 0x0, 0x3, 0x1, 0x4, 0x2, 0x5, 0x6, 0x9, 0x7, 0xA, 0x8, 0xB, 0xC, 0xF, 0xD, -1, - -1, 0x1, 0x2, 0x5, 0x3, 0x6, 0x4, 0x7, 0x8, 0xB, 0x9, 0xC, 0xA, 0xD, 0xE, -1, - -1, 0x2, 0x0, 0x3, 0x4, 0x7, 0x5, 0x8, 0x6, 0x9, 0xA, 0xD, 0xB, 0xE, 0xC, 0xF); - - const __m512i K64_BGR_PE0 = SIMD_MM512_SETR_EPI64(0x2, 0x0, 0x4, 0x1, 0x6, 0x3, 0x8, 0x5); - const __m512i K64_BGR_PE1 = SIMD_MM512_SETR_EPI64(0x7, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF); - const __m512i K64_BGR_PE2 = SIMD_MM512_SETR_EPI64(0xA, 0x7, 0xC, 0x9, 0xE, 0xB, 0xF, 0xD); - - const __m512i K8_BGR_SE0 = SIMD_MM512_SETR_EPI8( - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x0, - 0xE, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x1, - 0xF, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x0); - const __m512i K8_BGR_SE1 = SIMD_MM512_SETR_EPI8( - 0xE, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x1, - 0xF, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x0, - 0xE, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x1); - const __m512i K8_BGR_SE2 = SIMD_MM512_SETR_EPI8( - 0xF, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x0, - 0xE, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x1, - 
0xF, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); - - template SIMD_INLINE void ReduceBgr2x2(const uint8_t * src0, const uint8_t * src1, uint8_t * dst, __mmask64 * tails) - { - __m512i s00 = Load(src0 + 0 * A, tails[0]); - __m512i s01 = Load(src0 + 1 * A, tails[1]); - __m512i s02 = Load(src0 + 2 * A, tails[2]); - __m512i s10 = Load(src1 + 0 * A, tails[0]); - __m512i s11 = Load(src1 + 1 * A, tails[1]); - __m512i s12 = Load(src1 + 2 * A, tails[2]); - __m512i e00 = _mm512_permutex2var_epi64(s00, K64_BGR_PE0, s01); - __m512i e01 = _mm512_permutex2var_epi64(_mm512_permutex2var_epi64(s00, K64_BGR_PE1, s01), K64_BGR_PE0, s02); - __m512i e10 = _mm512_permutex2var_epi64(s10, K64_BGR_PE0, s11); - __m512i e11 = _mm512_permutex2var_epi64(_mm512_permutex2var_epi64(s10, K64_BGR_PE1, s11), K64_BGR_PE0, s12); - __m512i m00 = _mm512_or_si512(_mm512_shuffle_epi8(s00, K8_BGR_SM0), _mm512_shuffle_epi8(e00, K8_BGR_SE0)); - __m512i m01 = _mm512_or_si512(_mm512_shuffle_epi8(s01, K8_BGR_SM1), _mm512_shuffle_epi8(e01, K8_BGR_SE1)); - __m512i m10 = _mm512_or_si512(_mm512_shuffle_epi8(s10, K8_BGR_SM0), _mm512_shuffle_epi8(e10, K8_BGR_SE0)); - __m512i m11 = _mm512_or_si512(_mm512_shuffle_epi8(s11, K8_BGR_SM1), _mm512_shuffle_epi8(e11, K8_BGR_SE1)); - Store(dst + 0 * A, Reduce8(m00, m01, m10, m11), tails[6]); - __m512i s03 = Load(src0 + 3 * A, tails[3]); - __m512i s04 = Load(src0 + 4 * A, tails[4]); - __m512i s13 = Load(src1 + 3 * A, tails[3]); - __m512i s14 = Load(src1 + 4 * A, tails[4]); - __m512i e02 = _mm512_permutex2var_epi64(s01, K64_BGR_PE2, s02); - __m512i e03 = _mm512_permutex2var_epi64(s03, K64_BGR_PE0, s04); - __m512i e12 = _mm512_permutex2var_epi64(s11, K64_BGR_PE2, s12); - __m512i e13 = _mm512_permutex2var_epi64(s13, K64_BGR_PE0, s14); - __m512i m02 = _mm512_or_si512(_mm512_shuffle_epi8(s02, K8_BGR_SM2), _mm512_shuffle_epi8(e02, K8_BGR_SE2)); - __m512i m03 = _mm512_or_si512(_mm512_shuffle_epi8(s03, K8_BGR_SM0), _mm512_shuffle_epi8(e03, K8_BGR_SE0)); - __m512i m12 = _mm512_or_si512(_mm512_shuffle_epi8(s12, K8_BGR_SM2), _mm512_shuffle_epi8(e12, K8_BGR_SE2)); - __m512i m13 = _mm512_or_si512(_mm512_shuffle_epi8(s13, K8_BGR_SM0), _mm512_shuffle_epi8(e13, K8_BGR_SE0)); - Store(dst + 1 * A, Reduce8(m02, m03, m12, m13), tails[7]); - __m512i s05 = Load(src0 + 5 * A, tails[5]); - __m512i s15 = Load(src1 + 5 * A, tails[5]); - __m512i e04 = _mm512_permutex2var_epi64(_mm512_permutex2var_epi64(s03, K64_BGR_PE1, s04), K64_BGR_PE0, s05); - __m512i e05 = _mm512_permutex2var_epi64(s04, K64_BGR_PE2, s05); - __m512i e14 = _mm512_permutex2var_epi64(_mm512_permutex2var_epi64(s13, K64_BGR_PE1, s14), K64_BGR_PE0, s15); - __m512i e15 = _mm512_permutex2var_epi64(s14, K64_BGR_PE2, s15); - __m512i m04 = _mm512_or_si512(_mm512_shuffle_epi8(s04, K8_BGR_SM1), _mm512_shuffle_epi8(e04, K8_BGR_SE1)); - __m512i m05 = _mm512_or_si512(_mm512_shuffle_epi8(s05, K8_BGR_SM2), _mm512_shuffle_epi8(e05, K8_BGR_SE2)); - __m512i m14 = _mm512_or_si512(_mm512_shuffle_epi8(s14, K8_BGR_SM1), _mm512_shuffle_epi8(e14, K8_BGR_SE1)); - __m512i m15 = _mm512_or_si512(_mm512_shuffle_epi8(s15, K8_BGR_SM2), _mm512_shuffle_epi8(e15, K8_BGR_SE2)); - Store(dst + 2 * A, Reduce8(m04, m05, m14, m15), tails[8]); - } - - template void ReduceBgr2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, uint8_t * dst, size_t dstStride) - { - size_t evenWidth = AlignLo(srcWidth, 2); - size_t alignedWidth = AlignLo(srcWidth, DA); - size_t evenSize = evenWidth * 3; - size_t alignedSize = alignedWidth * 3; - size_t srcStep = DA * 3, dstStep = 
A * 3; - __mmask64 tailMasks[9]; - for (size_t c = 0; c < 6; ++c) - tailMasks[c] = TailMask64(evenSize - alignedSize - A * c); - for (size_t c = 0; c < 3; ++c) - tailMasks[6 + c] = TailMask64((evenSize - alignedSize)/2 - A * c); - for (size_t srcRow = 0; srcRow < srcHeight; srcRow += 2) - { - const uint8_t *src0 = src; - const uint8_t *src1 = (srcRow == srcHeight - 1 ? src : src + srcStride); - size_t srcOffset = 0, dstOffset = 0; - for (; srcOffset < alignedSize; srcOffset += srcStep, dstOffset += dstStep) - ReduceBgr2x2(src0 + srcOffset, src1 + srcOffset, dst + dstOffset, tailMasks); - if (srcOffset < evenSize) - ReduceBgr2x2(src0 + srcOffset, src1 + srcOffset, dst + dstOffset, tailMasks); - if (evenWidth != srcWidth) - { - for (size_t c = 0; c < 3; ++c) - dst[evenSize / 2 + c] = Base::Average(src0[evenSize + c], src1[evenSize + c]); - } - src += 2 * srcStride; - dst += dstStride; - } - } - - template void ReduceColor2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount) - { - assert((srcWidth + 1) / 2 == dstWidth && (srcHeight + 1) / 2 == dstHeight && srcWidth >= DA); - if (align) - { - assert(Aligned(src) && Aligned(srcStride)); - assert(Aligned(dst) && Aligned(dstStride)); - } - - switch (channelCount) - { - case 1: ReduceColor2x2<1, align>(src, srcWidth, srcHeight, srcStride, dst, dstStride); break; - case 2: ReduceColor2x2<2, align>(src, srcWidth, srcHeight, srcStride, dst, dstStride); break; - case 3: ReduceBgr2x2(src, srcWidth, srcHeight, srcStride, dst, dstStride); break; - case 4: ReduceColor2x2<4, align>(src, srcWidth, srcHeight, srcStride, dst, dstStride); break; - default: assert(0); - } - } - - void ReduceColor2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount) - { - if (Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride)) - ReduceColor2x2(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, channelCount); - else - ReduceColor2x2(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, channelCount); - } - } -#endif// SIMD_AVX512BW_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx512bwReduceGray2x2.cpp b/src/3rd/Simd/Simd/SimdAvx512bwReduceGray2x2.cpp deleted file mode 100644 index 3e81ff34..00000000 --- a/src/3rd/Simd/Simd/SimdAvx512bwReduceGray2x2.cpp +++ /dev/null @@ -1,97 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2018 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" - -namespace Simd -{ -#ifdef SIMD_AVX512BW_ENABLE - namespace Avx512bw - { - SIMD_INLINE __m512i Reduce16(const __m512i & s0, const __m512i & s1) - { - return _mm512_srli_epi16(_mm512_add_epi16(_mm512_add_epi16(_mm512_maddubs_epi16(s0, K8_01), _mm512_maddubs_epi16(s1, K8_01)), K16_0002), 2); - } - - SIMD_INLINE __m512i Reduce8(const __m512i & s00, const __m512i & s01, const __m512i & s10, const __m512i & s11) - { - return _mm512_permutexvar_epi64(K64_PERMUTE_FOR_PACK, _mm512_packus_epi16(Reduce16(s00, s10), Reduce16(s01, s11))); - } - - template SIMD_INLINE void ReduceGray2x2(const uint8_t * src0, const uint8_t * src1, uint8_t * dst, const __mmask64 * tails) - { - const __m512i s00 = Load(src0 + 0, tails[0]); - const __m512i s01 = Load(src0 + A, tails[1]); - const __m512i s10 = Load(src1 + 0, tails[0]); - const __m512i s11 = Load(src1 + A, tails[1]); - Store(dst, Reduce8(s00, s01, s10, s11), tails[2]); - } - - template void ReduceGray2x2( - const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride) - { - assert((srcWidth + 1) / 2 == dstWidth && (srcHeight + 1) / 2 == dstHeight); - if (align) - { - assert(Aligned(src) && Aligned(srcStride)); - assert(Aligned(dst) && Aligned(dstStride) && Aligned(dstWidth)); - } - - size_t alignedWidth = AlignLo(srcWidth, DA); - __mmask64 tailMasks[3]; - for (size_t c = 0; c < 2; ++c) - tailMasks[c] = TailMask64(srcWidth - alignedWidth - A*c); - tailMasks[2] = TailMask64((srcWidth - alignedWidth) / 2); - size_t evenWidth = AlignLo(srcWidth, 2); - - for (size_t srcRow = 0; srcRow < srcHeight; srcRow += 2) - { - const uint8_t *src0 = src; - const uint8_t *src1 = (srcRow == srcHeight - 1 ? src : src + srcStride); - size_t srcOffset = 0, dstOffset = 0; - for (; srcOffset < alignedWidth; srcOffset += DA, dstOffset += A) - ReduceGray2x2(src0 + srcOffset, src1 + srcOffset, dst + dstOffset, tailMasks); - if (srcOffset < srcWidth) - { - ReduceGray2x2(src0 + srcOffset, src1 + srcOffset, dst + dstOffset, tailMasks); - if (evenWidth != srcWidth) - dst[dstWidth - 1] = Base::Average(src0[evenWidth], src1[evenWidth]); - } - src += 2 * srcStride; - dst += dstStride; - } - } - - void ReduceGray2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride) - { - if (Aligned(src) && Aligned(srcWidth) && Aligned(srcStride) && Aligned(dst) && Aligned(dstWidth) && Aligned(dstStride)) - ReduceGray2x2(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); - else - ReduceGray2x2(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); - } - } -#endif// SIMD_AVX512BW_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx512bwReduceGray3x3.cpp b/src/3rd/Simd/Simd/SimdAvx512bwReduceGray3x3.cpp deleted file mode 100644 index 09d38bc4..00000000 --- a/src/3rd/Simd/Simd/SimdAvx512bwReduceGray3x3.cpp +++ /dev/null @@ -1,144 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. 
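The Reduce16/Reduce8 pair used by both 2x2 reductions above packs the whole box filter into three integer ops: _mm512_maddubs_epi16 with K8_01 sums adjacent byte pairs into 16-bit lanes, K16_0002 adds the rounding constant, and the shift by 2 divides by four. A scalar model of the per-pixel arithmetic (the name Reduce2x2 is illustrative, not library API):

    #include <cstdint>

    // One output pixel from a 2x2 block, round-to-nearest:
    // ((s00 + s01 + s10 + s11) + 2) >> 2, as in Reduce16 above.
    static inline uint8_t Reduce2x2(uint8_t s00, uint8_t s01, uint8_t s10, uint8_t s11)
    {
        return (uint8_t)((s00 + s01 + s10 + s11 + 2) >> 2);
    }

When srcWidth is odd, the last destination column has only one source column under it, which is why the trailing-pixel path falls back to Base::Average of the two vertically adjacent bytes.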
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" - -namespace Simd -{ -#ifdef SIMD_AVX512BW_ENABLE - namespace Avx512bw - { - template SIMD_INLINE __m512i DivideBy16(__m512i value); - - template <> SIMD_INLINE __m512i DivideBy16(__m512i value) - { - return _mm512_srli_epi16(_mm512_add_epi16(value, K16_0008), 4); - } - - template <> SIMD_INLINE __m512i DivideBy16(__m512i value) - { - return _mm512_srli_epi16(value, 4); - } - - const __m512i K16_0102 = SIMD_MM512_SET1_EPI16(0x0102); - - SIMD_INLINE __m512i BinomialSum16(const __m512i & s01, const __m512i & s12) - { - return _mm512_add_epi16(_mm512_and_si512(s01, K16_00FF), _mm512_maddubs_epi16(s12, K16_0102)); - } - - template SIMD_INLINE __m512i ReduceColNose(const uint8_t * p) - { - return BinomialSum16(LoadBeforeFirst<1>(p), Load(p)); - } - - template SIMD_INLINE void ReduceColNose(const uint8_t * s[3], __m512i a[3]) - { - a[0] = ReduceColNose(s[0]); - a[1] = ReduceColNose(s[1]); - a[2] = ReduceColNose(s[2]); - } - - template SIMD_INLINE __m512i ReduceColBody(const uint8_t * p) - { - return BinomialSum16(Load(p - 1), Load(p)); - } - - template SIMD_INLINE void ReduceColBody(const uint8_t * s[3], size_t offset, __m512i a[3]) - { - a[0] = ReduceColBody(s[0] + offset); - a[1] = ReduceColBody(s[1] + offset); - a[2] = ReduceColBody(s[2] + offset); - } - - template SIMD_INLINE __m512i ReduceRow(const __m512i lo[3], const __m512i hi[3]) - { - return _mm512_permutexvar_epi64(K64_PERMUTE_FOR_PACK, _mm512_packus_epi16( - DivideBy16(BinomialSum16(lo[0], lo[1], lo[2])), - DivideBy16(BinomialSum16(hi[0], hi[1], hi[2])))); - } - - template void ReduceGray3x3( - const uint8_t* src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t* dst, size_t dstWidth, size_t dstHeight, size_t dstStride) - { - assert(srcWidth >= DA && (srcWidth + 1) / 2 == dstWidth && (srcHeight + 1) / 2 == dstHeight); - if (align) - assert(Aligned(src) && Aligned(srcStride)); - - size_t lastOddCol = srcWidth - AlignLo(srcWidth, 2); - size_t bodyWidth = AlignLo(srcWidth, DA); - for (size_t row = 0; row < srcHeight; row += 2, dst += dstStride, src += 2 * srcStride) - { - const uint8_t * s[3]; - s[1] = src; - s[0] = s[1] - (row ? srcStride : 0); - s[2] = s[1] + (row != srcHeight - 1 ? 
srcStride : 0); - - __m512i lo[3], hi[3]; - ReduceColNose(s, lo); - ReduceColBody(s, A, hi); - Store(dst, ReduceRow(lo, hi)); - - for (size_t srcCol = DA, dstCol = A; srcCol < bodyWidth; srcCol += DA, dstCol += A) - { - ReduceColBody(s, srcCol, lo); - ReduceColBody(s, srcCol + A, hi); - Store(dst + dstCol, ReduceRow(lo, hi)); - } - - if (bodyWidth != srcWidth) - { - size_t srcCol = srcWidth - DA - lastOddCol; - size_t dstCol = dstWidth - A - lastOddCol; - ReduceColBody(s, srcCol, lo); - ReduceColBody(s, srcCol + A, hi); - Store(dst + dstCol, ReduceRow(lo, hi)); - if (lastOddCol) - dst[dstWidth - 1] = Base::GaussianBlur3x3(s[0] + srcWidth, s[1] + srcWidth, s[2] + srcWidth, -2, -1, -1); - } - } - } - - template void ReduceGray3x3( - const uint8_t* src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t* dst, size_t dstWidth, size_t dstHeight, size_t dstStride, int compensation) - { - if (compensation) - ReduceGray3x3(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); - else - ReduceGray3x3(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); - } - - void ReduceGray3x3(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride, int compensation) - { - if (Aligned(src) && Aligned(srcStride)) - ReduceGray3x3(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, compensation); - else - ReduceGray3x3(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, compensation); - } - } -#endif// SIMD_AVX512BW_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx512bwReduceGray4x4.cpp b/src/3rd/Simd/Simd/SimdAvx512bwReduceGray4x4.cpp deleted file mode 100644 index 9475c008..00000000 --- a/src/3rd/Simd/Simd/SimdAvx512bwReduceGray4x4.cpp +++ /dev/null @@ -1,145 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2018 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
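ReduceGray3x3 above is a separable binomial filter: ReduceColNose/ReduceColBody apply [1 2 1] horizontally via _mm512_maddubs_epi16, ReduceRow applies it again vertically, and DivideBy16 removes the total weight of 16, with the compensation variant adding 8 first so the division rounds to nearest. A scalar sketch of the kernel (signature illustrative; the library's Base::GaussianBlur3x3 plays this role for the odd last column):

    #include <cstdint>
    #include <cstddef>

    static inline uint8_t Blur3x3(const uint8_t* s0, const uint8_t* s1, const uint8_t* s2,
        ptrdiff_t xl, ptrdiff_t xc, ptrdiff_t xr, bool compensation)
    {
        int left   = s0[xl] + 2 * s1[xl] + s2[xl]; // vertical [1 2 1]
        int center = s0[xc] + 2 * s1[xc] + s2[xc];
        int right  = s0[xr] + 2 * s1[xr] + s2[xr];
        int sum = left + 2 * center + right;       // horizontal [1 2 1]
        return (uint8_t)((sum + (compensation ? 8 : 0)) >> 4);
    }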
-*/ -#include "Simd/SimdStore.h" -#include "Simd/SimdMemory.h" - -namespace Simd -{ -#ifdef SIMD_AVX512BW_ENABLE - namespace Avx512bw - { - SIMD_INLINE __m512i DivideBy64(__m512i value) - { - return _mm512_srli_epi16(_mm512_add_epi16(value, K16_0020), 6); - } - - SIMD_INLINE __m512i BinomialSum16(const __m512i & a, const __m512i & b, const __m512i & c, const __m512i & d) - { - return _mm512_add_epi16(_mm512_add_epi16(a, d), _mm512_mullo_epi16(_mm512_add_epi16(b, c), K16_0003)); - } - - const __m512i K8_01_03 = SIMD_MM512_SET2_EPI8(1, 3); - const __m512i K8_03_01 = SIMD_MM512_SET2_EPI8(3, 1); - - SIMD_INLINE __m512i BinomialSum8(const __m512i & ab, const __m512i & cd) - { - return _mm512_add_epi16(_mm512_maddubs_epi16(ab, K8_01_03), _mm512_maddubs_epi16(cd, K8_03_01)); - } - - SIMD_INLINE __m512i ReduceColNose(const uint8_t * src) - { - return BinomialSum8(LoadBeforeFirst<1>(src), Load(src + 1)); - } - - SIMD_INLINE void ReduceColNose(const uint8_t * s[4], __m512i a[4]) - { - a[0] = ReduceColNose(s[0]); - a[1] = ReduceColNose(s[1]); - a[2] = ReduceColNose(s[2]); - a[3] = ReduceColNose(s[3]); - } - - SIMD_INLINE __m512i ReduceColBody(const uint8_t * src) - { - return BinomialSum8(Load(src - 1), Load(src + 1)); - } - - SIMD_INLINE void ReduceColBody(const uint8_t * s[4], size_t offset, __m512i a[4]) - { - a[0] = ReduceColBody(s[0] + offset); - a[1] = ReduceColBody(s[1] + offset); - a[2] = ReduceColBody(s[2] + offset); - a[3] = ReduceColBody(s[3] + offset); - } - - template SIMD_INLINE __m512i ReduceColTail(const uint8_t * src); - - template <> SIMD_INLINE __m512i ReduceColTail(const uint8_t * src) - { - return BinomialSum8(Load(src - 1), LoadAfterLast<1>(src)); - } - - template <> SIMD_INLINE __m512i ReduceColTail(const uint8_t * src) - { - return BinomialSum8(Load(src - 1), LoadAfterLast2<1>(src - 1)); - } - - template SIMD_INLINE void ReduceColTail(const uint8_t * s[4], size_t offset, __m512i a[4]) - { - a[0] = ReduceColTail(s[0] + offset); - a[1] = ReduceColTail(s[1] + offset); - a[2] = ReduceColTail(s[2] + offset); - a[3] = ReduceColTail(s[3] + offset); - } - - SIMD_INLINE __m512i ReduceRow(const __m512i lo[4], const __m512i hi[4]) - { - return _mm512_permutexvar_epi64(K64_PERMUTE_FOR_PACK, _mm512_packus_epi16( - DivideBy64(BinomialSum16(lo[0], lo[1], lo[2], lo[3])), - DivideBy64(BinomialSum16(hi[0], hi[1], hi[2], hi[3])))); - } - - template void ReduceGray4x4(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride) - { - assert((srcWidth + 1) / 2 == dstWidth && (srcHeight + 1) / 2 == dstHeight && srcWidth > DA); - - size_t bodyWidth = AlignLo(srcWidth, DA); - size_t srcTail = Simd::AlignHi(srcWidth - DA, 2); - - for (size_t row = 0; row < srcHeight; row += 2, dst += dstStride) - { - const uint8_t * s[4]; - s[1] = src + srcStride*row; - s[0] = s[1] - (row ? srcStride : 0); - s[2] = s[1] + (row < srcHeight - 1 ? srcStride : 0); - s[3] = s[2] + (row < srcHeight - 2 ? 
srcStride : 0); - - __m512i lo[4], hi[4]; - ReduceColNose(s, lo); - ReduceColBody(s, A, hi); - Store(dst, ReduceRow(lo, hi)); - for (size_t srcCol = DA, dstCol = A; srcCol < bodyWidth; srcCol += DA, dstCol += A) - { - ReduceColBody(s, srcCol + 0, lo); - ReduceColBody(s, srcCol + A, hi); - Store(dst + dstCol, ReduceRow(lo, hi)); - } - ReduceColBody(s, srcTail + 0, lo); - ReduceColTail(s, srcTail + A, hi); - Store(dst + dstWidth - A, ReduceRow(lo, hi)); - } - } - - void ReduceGray4x4(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride) - { - if (Aligned(srcWidth, 2)) - ReduceGray4x4(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); - else - ReduceGray4x4(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); - } - } -#endif// SIMD_AVX512BW_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx512bwReduceGray5x5.cpp b/src/3rd/Simd/Simd/SimdAvx512bwReduceGray5x5.cpp deleted file mode 100644 index bd95b8fe..00000000 --- a/src/3rd/Simd/Simd/SimdAvx512bwReduceGray5x5.cpp +++ /dev/null @@ -1,202 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
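ReduceGray4x4 widens the kernel to [1 3 3 1] in each direction: BinomialSum8 forms the horizontal taps with the K8_01_03/K8_03_01 byte weights, BinomialSum16 forms (a + d) + 3*(b + c) vertically, and DivideBy64 removes the total weight of 64 with a +32 rounding term. A scalar model follows; tap placement relative to the output pixel is simplified here (in the code the window actually spans srcCol - 1 .. srcCol + 2):

    #include <cstdint>
    #include <cstddef>

    static inline int Binomial(int a, int b, int c, int d)
    {
        return (a + d) + 3 * (b + c); // same form as BinomialSum16 above
    }

    static inline uint8_t Reduce4x4(const uint8_t* r0, const uint8_t* r1,
                                    const uint8_t* r2, const uint8_t* r3, size_t x)
    {
        int sum = Binomial(
            Binomial(r0[x], r0[x + 1], r0[x + 2], r0[x + 3]),
            Binomial(r1[x], r1[x + 1], r1[x + 2], r1[x + 3]),
            Binomial(r2[x], r2[x + 1], r2[x + 2], r2[x + 3]),
            Binomial(r3[x], r3[x + 1], r3[x + 2], r3[x + 3]));
        return (uint8_t)((sum + 32) >> 6); // weight 8 x 8 = 64
    }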
-*/ -#include "Simd/SimdStore.h" -#include "Simd/SimdMemory.h" - -namespace Simd -{ -#ifdef SIMD_AVX512BW_ENABLE - namespace Avx512bw - { - namespace - { - struct Buffer - { - Buffer(size_t width) - { - _p = Allocate(sizeof(uint16_t)*(5 * width + A)); - in0 = (uint16_t*)_p; - in1 = in0 + width; - out0 = in1 + width; - out1 = out0 + width; - dst = out1 + width + HA; - } - - ~Buffer() - { - Free(_p); - } - - uint16_t * in0; - uint16_t * in1; - uint16_t * out0; - uint16_t * out1; - uint16_t * dst; - private: - void *_p; - }; - } - - template SIMD_INLINE __m512i DivideBy256(__m512i value); - - template <> SIMD_INLINE __m512i DivideBy256(__m512i value) - { - return _mm512_srli_epi16(_mm512_add_epi16(value, K16_0080), 8); - } - - template <> SIMD_INLINE __m512i DivideBy256(__m512i value) - { - return _mm512_srli_epi16(value, 8); - } - - template SIMD_INLINE __m512i LoadUnpacked(const void * src) - { - return _mm512_cvtepu8_epi16(Avx2::Load((const __m256i*)src)); - } - - template SIMD_INLINE void FirstRow5x5(__m512i src, Buffer & buffer, size_t offset) - { - Store(buffer.in0 + offset, src); - Store(buffer.in1 + offset, _mm512_mullo_epi16(src, K16_0005)); - } - - template SIMD_INLINE void FirstRow5x5(const uint8_t * src, Buffer & buffer, size_t offset) - { - FirstRow5x5(LoadUnpacked(src + offset), buffer, offset); - offset += HA; - FirstRow5x5(LoadUnpacked(src + offset), buffer, offset); - } - - template SIMD_INLINE void MainRowY5x5(const __m512i & odd, const __m512i & even, Buffer & buffer, size_t offset) - { - __m512i cp = _mm512_mullo_epi16(odd, K16_0004); - __m512i c0 = Load(buffer.in0 + offset); - __m512i c1 = Load(buffer.in1 + offset); - Store(buffer.dst + offset, _mm512_add_epi16(even, _mm512_add_epi16(c1, _mm512_add_epi16(cp, _mm512_mullo_epi16(c0, K16_0006))))); - Store(buffer.out1 + offset, _mm512_add_epi16(c0, cp)); - Store(buffer.out0 + offset, even); - } - - template SIMD_INLINE void MainRowY5x5(const uint8_t * odd, const uint8_t * even, Buffer & buffer, size_t offset) - { - MainRowY5x5(LoadUnpacked(odd + offset), LoadUnpacked(even + offset), buffer, offset); - offset += HA; - MainRowY5x5(LoadUnpacked(odd + offset), LoadUnpacked(even + offset), buffer, offset); - } - - template SIMD_INLINE __m512i MainRowX5x5(uint16_t * dst) - { - __m512i t0 = Load(dst - 2); - __m512i t1 = Load(dst - 1); - __m512i t2 = Load(dst + 0); - __m512i t3 = Load(dst + 1); - __m512i t4 = Load(dst + 2); - t2 = _mm512_add_epi16(_mm512_add_epi16(_mm512_mullo_epi16(t2, K16_0006), _mm512_mullo_epi16(_mm512_add_epi16(t1, t3), K16_0004)), _mm512_add_epi16(t0, t4)); - return DivideBy256(t2); - } - - template SIMD_INLINE __m512i MainRowX5x5(Buffer & buffer, size_t offset) - { - const __m512i lo = MainRowX5x5(buffer.dst + offset + 00); - const __m512i hi = MainRowX5x5(buffer.dst + offset + HA); - return _mm512_and_si512(_mm512_permutexvar_epi64(K64_PERMUTE_FOR_PACK, _mm512_packus_epi16(lo, hi)), K16_00FF); - } - - template SIMD_INLINE void MainRowX5x5(Buffer & buffer, size_t offset, uint8_t * dst) - { - __m512i lo = MainRowX5x5(buffer, offset + 0); - __m512i hi = MainRowX5x5(buffer, offset + A); - Store(dst, _mm512_permutexvar_epi64(K64_PERMUTE_FOR_PACK, _mm512_packus_epi16(lo, hi))); - } - - template void ReduceGray5x5( - const uint8_t* src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t* dst, size_t dstWidth, size_t dstHeight, size_t dstStride) - { - assert((srcWidth + 1) / 2 == dstWidth && (srcHeight + 1) / 2 == dstHeight && srcWidth >= DA); - if (align) - assert(Aligned(src) && Aligned(srcStride)); - 
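    /* Pipeline note: the Buffer above holds four 16-bit row accumulators.
       in0/in1 carry the running vertical sums for the current row pair,
       out0/out1 receive the sums needed two rows later, and the pairs are
       swapped after every output row, so each source row is loaded once.
       buffer.dst collects the vertical [1 4 6 4 1] result; its two-pixel
       borders are replicated before the horizontal [1 4 6 4 1] pass in
       MainRowX5x5 divides the combined weight of 256 back out. */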
- size_t alignedWidth = Simd::AlignLo(srcWidth, DA); - size_t bufferDstTail = Simd::AlignHi(srcWidth - DA, 2); - - Buffer buffer(Simd::AlignHi(srcWidth, A)); - - for (size_t col = 0; col < alignedWidth; col += A) - FirstRow5x5(src, buffer, col); - if (alignedWidth != srcWidth) - { - FirstRow5x5(src, buffer, srcWidth - DA); - FirstRow5x5(src, buffer, srcWidth - A); - } - src += srcStride; - - for (size_t row = 1; row <= srcHeight; row += 2, dst += dstStride, src += 2 * srcStride) - { - const uint8_t * odd = src - (row < srcHeight ? 0 : srcStride); - const uint8_t * even = odd + (row < srcHeight - 1 ? srcStride : 0); - - for (size_t col = 0; col < alignedWidth; col += A) - MainRowY5x5(odd, even, buffer, col); - if (alignedWidth != srcWidth) - { - MainRowY5x5(odd, even, buffer, srcWidth - DA); - MainRowY5x5(odd, even, buffer, srcWidth - A); - } - - Swap(buffer.in0, buffer.out0); - Swap(buffer.in1, buffer.out1); - - buffer.dst[-2] = buffer.dst[0]; - buffer.dst[-1] = buffer.dst[0]; - buffer.dst[srcWidth] = buffer.dst[srcWidth - 1]; - buffer.dst[srcWidth + 1] = buffer.dst[srcWidth - 1]; - - for (size_t srcCol = 0, dstCol = 0; srcCol < alignedWidth; srcCol += DA, dstCol += A) - MainRowX5x5(buffer, srcCol, dst + dstCol); - if (alignedWidth != srcWidth) - MainRowX5x5(buffer, bufferDstTail, dst + dstWidth - A); - } - } - - template void ReduceGray5x5(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride)) - ReduceGray5x5(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); - else - ReduceGray5x5(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); - } - - void ReduceGray5x5(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride, int compensation) - { - if (compensation) - ReduceGray5x5(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); - else - ReduceGray5x5(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); - } - } -#endif// SIMD_AVX512BW_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx512bwReorder.cpp b/src/3rd/Simd/Simd/SimdAvx512bwReorder.cpp deleted file mode 100644 index 3003d9d9..00000000 --- a/src/3rd/Simd/Simd/SimdAvx512bwReorder.cpp +++ /dev/null @@ -1,162 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdStore.h" -#include "Simd/SimdMemory.h" - -namespace Simd -{ -#ifdef SIMD_AVX512BW_ENABLE - namespace Avx512bw - { - const __m512i K8_SHUFFLE_REORDER_16 = SIMD_MM512_SETR_EPI8( - 0x1, 0x0, 0x3, 0x2, 0x5, 0x4, 0x7, 0x6, 0x9, 0x8, 0xB, 0xA, 0xD, 0xC, 0xF, 0xE, - 0x1, 0x0, 0x3, 0x2, 0x5, 0x4, 0x7, 0x6, 0x9, 0x8, 0xB, 0xA, 0xD, 0xC, 0xF, 0xE, - 0x1, 0x0, 0x3, 0x2, 0x5, 0x4, 0x7, 0x6, 0x9, 0x8, 0xB, 0xA, 0xD, 0xC, 0xF, 0xE, - 0x1, 0x0, 0x3, 0x2, 0x5, 0x4, 0x7, 0x6, 0x9, 0x8, 0xB, 0xA, 0xD, 0xC, 0xF, 0xE); - - template SIMD_INLINE void Reorder16bit(const uint8_t * src, uint8_t * dst, __mmask64 tail = -1) - { - Store(dst, _mm512_shuffle_epi8((Load(src, tail)), K8_SHUFFLE_REORDER_16), tail); - } - - template SIMD_INLINE void Reorder16bit4(const uint8_t * src, uint8_t * dst) - { - Store(dst + 0 * A, _mm512_shuffle_epi8(Load(src + 0 * A), K8_SHUFFLE_REORDER_16)); - Store(dst + 1 * A, _mm512_shuffle_epi8(Load(src + 1 * A), K8_SHUFFLE_REORDER_16)); - Store(dst + 2 * A, _mm512_shuffle_epi8(Load(src + 2 * A), K8_SHUFFLE_REORDER_16)); - Store(dst + 3 * A, _mm512_shuffle_epi8(Load(src + 3 * A), K8_SHUFFLE_REORDER_16)); - } - - template void Reorder16bit(const uint8_t * src, size_t size, uint8_t * dst) - { - assert(size % 2 == 0); - - size_t alignedSize = AlignLo(size, A); - size_t fullAlignedSize = AlignLo(size, QA); - __mmask64 tailMask = TailMask64(size - alignedSize); - size_t i = 0; - for (; i < fullAlignedSize; i += QA) - Reorder16bit4(src + i, dst + i); - for (; i < alignedSize; i += A) - Reorder16bit(src + i, dst + i); - if (i < size) - Reorder16bit(src + i, dst + i, tailMask); - } - - void Reorder16bit(const uint8_t * src, size_t size, uint8_t * dst) - { - if (Aligned(src) && Aligned(dst)) - Reorder16bit(src, size, dst); - else - Reorder16bit(src, size, dst); - } - - const __m512i K8_SHUFFLE_REORDER_32 = SIMD_MM512_SETR_EPI8( - 0x3, 0x2, 0x1, 0x0, 0x7, 0x6, 0x5, 0x4, 0xB, 0xA, 0x9, 0x8, 0xF, 0xE, 0xD, 0xC, - 0x3, 0x2, 0x1, 0x0, 0x7, 0x6, 0x5, 0x4, 0xB, 0xA, 0x9, 0x8, 0xF, 0xE, 0xD, 0xC, - 0x3, 0x2, 0x1, 0x0, 0x7, 0x6, 0x5, 0x4, 0xB, 0xA, 0x9, 0x8, 0xF, 0xE, 0xD, 0xC, - 0x3, 0x2, 0x1, 0x0, 0x7, 0x6, 0x5, 0x4, 0xB, 0xA, 0x9, 0x8, 0xF, 0xE, 0xD, 0xC); - - template SIMD_INLINE void Reorder32bit(const uint8_t * src, uint8_t * dst, __mmask64 tail = -1) - { - Store(dst, _mm512_shuffle_epi8((Load(src, tail)), K8_SHUFFLE_REORDER_32), tail); - } - - template SIMD_INLINE void Reorder32bit4(const uint8_t * src, uint8_t * dst) - { - Store(dst + 0 * A, _mm512_shuffle_epi8(Load(src + 0 * A), K8_SHUFFLE_REORDER_32)); - Store(dst + 1 * A, _mm512_shuffle_epi8(Load(src + 1 * A), K8_SHUFFLE_REORDER_32)); - Store(dst + 2 * A, _mm512_shuffle_epi8(Load(src + 2 * A), K8_SHUFFLE_REORDER_32)); - Store(dst + 3 * A, _mm512_shuffle_epi8(Load(src + 3 * A), K8_SHUFFLE_REORDER_32)); - } - - template void Reorder32bit(const uint8_t * src, size_t size, uint8_t * dst) - { - assert(size % 4 == 0); - - size_t alignedSize = AlignLo(size, A); - size_t fullAlignedSize = AlignLo(size, QA); - __mmask64 tailMask = TailMask64(size - alignedSize); - size_t i = 0; - for (; i < fullAlignedSize; i += QA) - Reorder32bit4(src + i, dst + i); - for (; i < alignedSize; i += A) - Reorder32bit(src + i, dst + i); - if (i < size) - Reorder32bit(src + i, dst + i, 
tailMask); - } - - void Reorder32bit(const uint8_t * src, size_t size, uint8_t * dst) - { - if (Aligned(src) && Aligned(dst)) - Reorder32bit(src, size, dst); - else - Reorder32bit(src, size, dst); - } - - const __m512i K8_SHUFFLE_REORDER_64 = SIMD_MM512_SETR_EPI8( - 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0, 0xF, 0xE, 0xD, 0xC, 0xB, 0xA, 0x9, 0x8, - 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0, 0xF, 0xE, 0xD, 0xC, 0xB, 0xA, 0x9, 0x8, - 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0, 0xF, 0xE, 0xD, 0xC, 0xB, 0xA, 0x9, 0x8, - 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0, 0xF, 0xE, 0xD, 0xC, 0xB, 0xA, 0x9, 0x8); - - template SIMD_INLINE void Reorder64bit(const uint8_t * src, uint8_t * dst, __mmask64 tail = -1) - { - Store(dst, _mm512_shuffle_epi8((Load(src, tail)), K8_SHUFFLE_REORDER_64), tail); - } - - template SIMD_INLINE void Reorder64bit4(const uint8_t * src, uint8_t * dst) - { - Store(dst + 0 * A, _mm512_shuffle_epi8(Load(src + 0 * A), K8_SHUFFLE_REORDER_64)); - Store(dst + 1 * A, _mm512_shuffle_epi8(Load(src + 1 * A), K8_SHUFFLE_REORDER_64)); - Store(dst + 2 * A, _mm512_shuffle_epi8(Load(src + 2 * A), K8_SHUFFLE_REORDER_64)); - Store(dst + 3 * A, _mm512_shuffle_epi8(Load(src + 3 * A), K8_SHUFFLE_REORDER_64)); - } - - template void Reorder64bit(const uint8_t * src, size_t size, uint8_t * dst) - { - assert(size % 8 == 0); - - size_t alignedSize = AlignLo(size, A); - size_t fullAlignedSize = AlignLo(size, QA); - __mmask64 tailMask = TailMask64(size - alignedSize); - size_t i = 0; - for (; i < fullAlignedSize; i += QA) - Reorder64bit4(src + i, dst + i); - for (; i < alignedSize; i += A) - Reorder64bit(src + i, dst + i); - if (i < size) - Reorder64bit(src + i, dst + i, tailMask); - } - - void Reorder64bit(const uint8_t * src, size_t size, uint8_t * dst) - { - if (Aligned(src) && Aligned(dst)) - Reorder64bit(src, size, dst); - else - Reorder64bit(src, size, dst); - } - } -#endif// SIMD_AVX512BW_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx512bwResizeBilinear.cpp b/src/3rd/Simd/Simd/SimdAvx512bwResizeBilinear.cpp deleted file mode 100644 index 8582a004..00000000 --- a/src/3rd/Simd/Simd/SimdAvx512bwResizeBilinear.cpp +++ /dev/null @@ -1,507 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2018 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
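The three Reorder kernels above differ only in their shuffle tables: each K8_SHUFFLE_REORDER_N constant reverses the byte order inside every N-bit word of a 64-byte vector. A scalar model of the 32-bit case (illustrative name, not library API):

    #include <cstdint>
    #include <cstddef>

    // Reverse byte order inside every 4-byte word; size must be a
    // multiple of 4, as the assert in Reorder32bit above requires.
    static inline void Reorder32bitScalar(const uint8_t* src, size_t size, uint8_t* dst)
    {
        for (size_t i = 0; i < size; i += 4)
        {
            dst[i + 0] = src[i + 3];
            dst[i + 1] = src[i + 2];
            dst[i + 2] = src[i + 1];
            dst[i + 3] = src[i + 0];
        }
    }

The 16- and 64-bit variants swap within 2- and 8-byte words respectively; everything else (the QA unrolling and the tail mask) is identical.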
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdBase.h" -#include "Simd/SimdAvx2.h" - -namespace Simd -{ -#ifdef SIMD_AVX512BW_ENABLE - namespace Avx512bw - { - namespace - { - struct Buffer - { - Buffer(size_t size, size_t width, size_t height) - { - _p = Allocate(3 * size + sizeof(int)*(2 * height + width)); - bx[0] = (uint8_t*)_p; - bx[1] = bx[0] + size; - ax = bx[1] + size; - ix = (int*)(ax + size); - iy = ix + width; - ay = iy + height; - } - - ~Buffer() - { - Free(_p); - } - - uint8_t * bx[2]; - uint8_t * ax; - int * ix; - int * ay; - int * iy; - private: - void *_p; - }; - - struct Index - { - int src, dst; - uint8_t shuffle[Avx2::A]; - }; - - struct BufferG - { - BufferG(size_t width, size_t blocks, size_t height) - { - _p = Allocate(3 * width + sizeof(int) * 2 * height + blocks * sizeof(Index) + 2 * A); - bx[0] = (uint8_t*)_p; - bx[1] = bx[0] + width + A; - ax = bx[1] + width + A; - ix = (Index*)(ax + width); - iy = (int*)(ix + blocks); - ay = iy + height; - } - - ~BufferG() - { - Free(_p); - } - - uint8_t * bx[2]; - uint8_t * ax; - Index * ix; - int * ay; - int * iy; - private: - void *_p; - }; - } - - template void EstimateAlphaIndexX(size_t srcSize, size_t dstSize, int * indexes, uint8_t * alphas) - { - float scale = (float)srcSize / dstSize; - - for (size_t i = 0; i < dstSize; ++i) - { - float alpha = (float)((i + 0.5)*scale - 0.5); - ptrdiff_t index = (ptrdiff_t)::floor(alpha); - alpha -= index; - - if (index < 0) - { - index = 0; - alpha = 0; - } - - if (index > (ptrdiff_t)srcSize - 2) - { - index = srcSize - 2; - alpha = 1; - } - - indexes[i] = (int)index; - alphas[1] = (uint8_t)(alpha * Base::FRACTION_RANGE + 0.5); - alphas[0] = (uint8_t)(Base::FRACTION_RANGE - alphas[1]); - for (size_t channel = 1; channel < channelCount; channel++) - ((uint16_t*)alphas)[channel] = *(uint16_t*)alphas; - alphas += 2 * channelCount; - } - } - - size_t BlockCountMax(size_t src, size_t dst) - { - return (size_t)Simd::Max(::ceil(float(src) / (Avx2::A - 1)), ::ceil(float(dst) / Avx2::HA)); - } - - void EstimateAlphaIndexX(int srcSize, int dstSize, Index * indexes, uint8_t * alphas, size_t & blockCount) - { - float scale = (float)srcSize / dstSize; - int block = 0; - indexes[0].src = 0; - indexes[0].dst = 0; - for (int dstIndex = 0; dstIndex < dstSize; ++dstIndex) - { - float alpha = (float)((dstIndex + 0.5)*scale - 0.5); - int srcIndex = (int)::floor(alpha); - alpha -= srcIndex; - - if (srcIndex < 0) - { - srcIndex = 0; - alpha = 0; - } - - if (srcIndex > srcSize - 2) - { - srcIndex = srcSize - 2; - alpha = 1; - } - - int dst = 2 * dstIndex - indexes[block].dst; - int src = srcIndex - indexes[block].src; - if (src >= Avx2::A - 1 || dst >= Avx2::A) - { - block++; - indexes[block].src = Simd::Min(srcIndex, srcSize - (int)Avx2::A); - indexes[block].dst = 2 * dstIndex; - dst = 0; - src = srcIndex - indexes[block].src; - } - indexes[block].shuffle[dst] = src; - indexes[block].shuffle[dst + 1] = src + 1; - - alphas[1] = (uint8_t)(alpha * Base::FRACTION_RANGE + 0.5); - alphas[0] = (uint8_t)(Base::FRACTION_RANGE - alphas[1]); - alphas += 2; - } - blockCount = block + 1; - } - - template void InterpolateX(const uint8_t * alpha, uint8_t * buffer); - - template <> SIMD_INLINE void InterpolateX<1>(const uint8_t * alpha, uint8_t * buffer) - { - __m512i _buffer = Load(buffer); - Store(buffer, _mm512_maddubs_epi16(_buffer, Load(alpha))); - } - - const __m512i K8_SHUFFLE_X2 = SIMD_MM512_SETR_EPI8( - 0x0, 0x2, 0x1, 0x3, 0x4, 0x6, 0x5, 0x7, 0x8, 0xA, 0x9, 0xB, 0xC, 0xE, 0xD, 
0xF, - 0x0, 0x2, 0x1, 0x3, 0x4, 0x6, 0x5, 0x7, 0x8, 0xA, 0x9, 0xB, 0xC, 0xE, 0xD, 0xF, - 0x0, 0x2, 0x1, 0x3, 0x4, 0x6, 0x5, 0x7, 0x8, 0xA, 0x9, 0xB, 0xC, 0xE, 0xD, 0xF, - 0x0, 0x2, 0x1, 0x3, 0x4, 0x6, 0x5, 0x7, 0x8, 0xA, 0x9, 0xB, 0xC, 0xE, 0xD, 0xF); - - SIMD_INLINE void InterpolateX2(const uint8_t * alpha, uint8_t * buffer) - { - __m512i _buffer = _mm512_shuffle_epi8(Load(buffer), K8_SHUFFLE_X2); - Store(buffer, _mm512_maddubs_epi16(_buffer, Load(alpha))); - } - - template <> SIMD_INLINE void InterpolateX<2>(const uint8_t * alpha, uint8_t * buffer) - { - InterpolateX2(alpha + 0, buffer + 0); - InterpolateX2(alpha + A, buffer + A); - } - - const __m512i K8_SHUFFLE_X3_00 = SIMD_MM512_SETR_EPI8( - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - 0xE, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - 0xF, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); - const __m512i K8_SHUFFLE_X3_01 = SIMD_MM512_SETR_EPI8( - 0x0, 0x3, 0x1, 0x4, 0x2, 0x5, 0x6, 0x9, 0x7, 0xA, 0x8, 0xB, 0xC, 0xF, 0xD, -1, - -1, 0x1, 0x2, 0x5, 0x3, 0x6, 0x4, 0x7, 0x8, 0xB, 0x9, 0xC, 0xA, 0xD, 0xE, -1, - -1, 0x2, 0x0, 0x3, 0x4, 0x7, 0x5, 0x8, 0x6, 0x9, 0xA, 0xD, 0xB, 0xE, 0xC, 0xF, - 0x0, 0x3, 0x1, 0x4, 0x2, 0x5, 0x6, 0x9, 0x7, 0xA, 0x8, 0xB, 0xC, 0xF, 0xD, -1); - const __m512i K8_SHUFFLE_X3_02 = SIMD_MM512_SETR_EPI8( - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x0, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x0); - - const __m512i K8_SHUFFLE_X3_10 = SIMD_MM512_SETR_EPI8( - 0xE, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - 0xF, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - 0xE, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); - const __m512i K8_SHUFFLE_X3_11 = SIMD_MM512_SETR_EPI8( - -1, 0x1, 0x2, 0x5, 0x3, 0x6, 0x4, 0x7, 0x8, 0xB, 0x9, 0xC, 0xA, 0xD, 0xE, -1, - -1, 0x2, 0x0, 0x3, 0x4, 0x7, 0x5, 0x8, 0x6, 0x9, 0xA, 0xD, 0xB, 0xE, 0xC, 0xF, - 0x0, 0x3, 0x1, 0x4, 0x2, 0x5, 0x6, 0x9, 0x7, 0xA, 0x8, 0xB, 0xC, 0xF, 0xD, -1, - -1, 0x1, 0x2, 0x5, 0x3, 0x6, 0x4, 0x7, 0x8, 0xB, 0x9, 0xC, 0xA, 0xD, 0xE, -1); - const __m512i K8_SHUFFLE_X3_12 = SIMD_MM512_SETR_EPI8( - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x0, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x1); - - const __m512i K8_SHUFFLE_X3_20 = SIMD_MM512_SETR_EPI8( - 0xF, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - 0xE, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - 0xF, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); - const __m512i K8_SHUFFLE_X3_21 = SIMD_MM512_SETR_EPI8( - -1, 0x2, 0x0, 0x3, 0x4, 0x7, 0x5, 0x8, 0x6, 0x9, 0xA, 0xD, 0xB, 0xE, 0xC, 0xF, - 0x0, 0x3, 0x1, 0x4, 0x2, 0x5, 0x6, 0x9, 0x7, 0xA, 0x8, 0xB, 0xC, 0xF, 0xD, -1, - -1, 0x1, 0x2, 0x5, 0x3, 0x6, 0x4, 0x7, 0x8, 0xB, 0x9, 0xC, 0xA, 0xD, 0xE, -1, - -1, 0x2, 0x0, 0x3, 0x4, 0x7, 0x5, 0x8, 0x6, 0x9, 0xA, 0xD, 0xB, 0xE, 0xC, 0xF); - const __m512i K8_SHUFFLE_X3_22 = SIMD_MM512_SETR_EPI8( - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, 
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x0, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); - - template <> SIMD_INLINE void InterpolateX<3>(const uint8_t * alpha, uint8_t * buffer) - { - __m512i src[3], shuffled; - src[0] = Load(buffer + 0 * A); - src[1] = Load(buffer + 1 * A); - src[2] = Load(buffer + 2 * A); - - shuffled = _mm512_shuffle_epi8(_mm512_alignr_epi32(src[0], src[0], 12), K8_SHUFFLE_X3_00); - shuffled = _mm512_or_si512(shuffled, _mm512_shuffle_epi8(src[0], K8_SHUFFLE_X3_01)); - shuffled = _mm512_or_si512(shuffled, _mm512_shuffle_epi8(_mm512_alignr_epi32(src[1], src[0], 4), K8_SHUFFLE_X3_02)); - Store(buffer + 0 * A, _mm512_maddubs_epi16(shuffled, Load(alpha + 0 * A))); - - shuffled = _mm512_shuffle_epi8(_mm512_alignr_epi32(src[1], src[0], 12), K8_SHUFFLE_X3_10); - shuffled = _mm512_or_si512(shuffled, _mm512_shuffle_epi8(src[1], K8_SHUFFLE_X3_11)); - shuffled = _mm512_or_si512(shuffled, _mm512_shuffle_epi8(_mm512_alignr_epi32(src[2], src[1], 4), K8_SHUFFLE_X3_12)); - Store(buffer + 1 * A, _mm512_maddubs_epi16(shuffled, Load(alpha + 1 * A))); - - shuffled = _mm512_shuffle_epi8(_mm512_alignr_epi32(src[2], src[1], 12), K8_SHUFFLE_X3_20); - shuffled = _mm512_or_si512(shuffled, _mm512_shuffle_epi8(src[2], K8_SHUFFLE_X3_21)); - shuffled = _mm512_or_si512(shuffled, _mm512_shuffle_epi8(_mm512_alignr_epi32(src[2], src[2], 4), K8_SHUFFLE_X3_22)); - Store(buffer + 2 * A, _mm512_maddubs_epi16(shuffled, Load(alpha + 2 * A))); - } - - const __m512i K8_SHUFFLE_X4 = SIMD_MM512_SETR_EPI8( - 0x0, 0x4, 0x1, 0x5, 0x2, 0x6, 0x3, 0x7, 0x8, 0xC, 0x9, 0xD, 0xA, 0xE, 0xB, 0xF, - 0x0, 0x4, 0x1, 0x5, 0x2, 0x6, 0x3, 0x7, 0x8, 0xC, 0x9, 0xD, 0xA, 0xE, 0xB, 0xF, - 0x0, 0x4, 0x1, 0x5, 0x2, 0x6, 0x3, 0x7, 0x8, 0xC, 0x9, 0xD, 0xA, 0xE, 0xB, 0xF, - 0x0, 0x4, 0x1, 0x5, 0x2, 0x6, 0x3, 0x7, 0x8, 0xC, 0x9, 0xD, 0xA, 0xE, 0xB, 0xF); - - SIMD_INLINE void InterpolateX4(const uint8_t * alpha, uint8_t * buffer) - { - __m512i _buffer = _mm512_shuffle_epi8(Load(buffer), K8_SHUFFLE_X4); - Store(buffer, _mm512_maddubs_epi16(_buffer, Load(alpha))); - } - - template <> SIMD_INLINE void InterpolateX<4>(const uint8_t * alpha, uint8_t * buffer) - { - InterpolateX4(alpha + 0 * A, buffer + 0 * A); - InterpolateX4(alpha + 1 * A, buffer + 1 * A); - InterpolateX4(alpha + 2 * A, buffer + 2 * A); - InterpolateX4(alpha + 3 * A, buffer + 3 * A); - } - - const __m512i K16_FRACTION_ROUND_TERM = SIMD_MM512_SET1_EPI16(Base::BILINEAR_ROUND_TERM); - - template SIMD_INLINE __m512i InterpolateY(const uint8_t * pbx0, const uint8_t * pbx1, __m512i alpha[2]) - { - __m512i sum = _mm512_add_epi16(_mm512_mullo_epi16(Load(pbx0), alpha[0]), _mm512_mullo_epi16(Load(pbx1), alpha[1])); - return _mm512_srli_epi16(_mm512_add_epi16(sum, K16_FRACTION_ROUND_TERM), Base::BILINEAR_SHIFT); - } - - template SIMD_INLINE void InterpolateY(const uint8_t * bx0, const uint8_t * bx1, __m512i alpha[2], uint8_t * dst) - { - __m512i lo = InterpolateY(bx0 + 0, bx1 + 0, alpha); - __m512i hi = InterpolateY(bx0 + A, bx1 + A, alpha); - Store(dst, _mm512_permutexvar_epi64(K64_PERMUTE_FOR_PACK, _mm512_packus_epi16(lo, hi))); - } - - template SIMD_INLINE void Gather(const uint8_t * src, const int * idx, size_t size, uint8_t * dst) - { - struct Src { uint8_t channels[channelCount * 1]; }; - struct Dst { uint8_t channels[channelCount * 2]; }; - const Src * s = (const Src *)src; - Dst * d = (Dst*)dst; - for (size_t i = 0; i < size; i++) - d[i] = *(Dst *)(s + idx[i]); - } - - template <> 
SIMD_INLINE void Gather<2>(const uint8_t * src, const int * idx, size_t size, uint8_t * dst) - { - for (size_t i = 0; i < size; i += 16) - { -#if defined(__GNUC__) && __GNUC__ < 6 - _mm512_storeu_si512(dst + 4 * i, _mm512_i32gather_epi32(_mm512_loadu_si512(idx + i), (const int *)src, 2)); -#else - _mm512_storeu_si512(dst + 4 * i, _mm512_i32gather_epi32(_mm512_loadu_si512(idx + i), src, 2)); -#endif - } - } - - template <> SIMD_INLINE void Gather<4>(const uint8_t * src, const int * idx, size_t size, uint8_t * dst) - { - for (size_t i = 0; i < size; i += 8) - { -#if defined(__GNUC__) && __GNUC__ < 6 - _mm512_storeu_si512(dst + 8 * i, _mm512_i32gather_epi64(_mm256_loadu_si256((__m256i*)(idx + i)), (const long long int*)src, 4)); -#else - _mm512_storeu_si512(dst + 8 * i, _mm512_i32gather_epi64(_mm256_loadu_si256((__m256i*)(idx + i)), src, 4)); -#endif - } - } - - template <size_t channelCount> void ResizeBilinear( - const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride) - { - assert(dstWidth >= A); - - size_t size = 2 * dstWidth*channelCount; - size_t bufferSize = AlignHi(dstWidth, A)*channelCount * 2; - size_t alignedSize = AlignHi(size, DA) - DA; - const size_t step = A*channelCount; - - Buffer buffer(bufferSize, dstWidth, dstHeight); - - Base::EstimateAlphaIndex(srcHeight, dstHeight, buffer.iy, buffer.ay, 1); - - EstimateAlphaIndexX(srcWidth, dstWidth, buffer.ix, buffer.ax); - - ptrdiff_t previous = -2; - - __m512i a[2]; - - for (size_t yDst = 0; yDst < dstHeight; yDst++, dst += dstStride) - { - a[0] = _mm512_set1_epi16(int16_t(Base::FRACTION_RANGE - buffer.ay[yDst])); - a[1] = _mm512_set1_epi16(int16_t(buffer.ay[yDst])); - - ptrdiff_t sy = buffer.iy[yDst]; - int k = 0; - - if (sy == previous) - k = 2; - else if (sy == previous + 1) - { - Swap(buffer.bx[0], buffer.bx[1]); - k = 1; - } - - previous = sy; - - for (; k < 2; k++) - { - Gather<channelCount>(src + (sy + k)*srcStride, buffer.ix, dstWidth, buffer.bx[k]); - - uint8_t * pbx = buffer.bx[k]; - for (size_t i = 0; i < bufferSize; i += step) - InterpolateX<channelCount>(buffer.ax + i, pbx + i); - } - - for (size_t ib = 0, id = 0; ib < alignedSize; ib += DA, id += A) - InterpolateY(buffer.bx[0] + ib, buffer.bx[1] + ib, a, dst + id); - size_t i = size - DA; - InterpolateY(buffer.bx[0] + i, buffer.bx[1] + i, a, dst + i / 2); - } - } - - const __m256i K8_SHUFFLE_0 = SIMD_MM256_SETR_EPI8( - 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, - 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0); - - const __m256i K8_SHUFFLE_1 = SIMD_MM256_SETR_EPI8( - 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, - 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70); - - SIMD_INLINE const __m256i Shuffle(const __m256i & value, const __m256i & shuffle) - { - return _mm256_or_si256(_mm256_shuffle_epi8(value, _mm256_add_epi8(shuffle, K8_SHUFFLE_0)), - _mm256_shuffle_epi8(_mm256_permute4x64_epi64(value, 0x4E), _mm256_add_epi8(shuffle, K8_SHUFFLE_1))); - } - - SIMD_INLINE void LoadGray(const uint8_t * src, const Index & index, uint8_t * dst) - { - __m256i _src = _mm256_loadu_si256((__m256i*)(src + index.src)); - __m256i _shuffle = _mm256_loadu_si256((__m256i*)&index.shuffle); - _mm256_storeu_si256((__m256i*)(dst + index.dst), Shuffle(_src, _shuffle)); - } - - SIMD_INLINE void LoadGrayInterpolated(const uint8_t * src, const Index & index, const
uint8_t * alpha, uint8_t * dst) - { - __m256i _src = _mm256_loadu_si256((__m256i*)(src + index.src)); - __m256i _shuffle = _mm256_loadu_si256((__m256i*)&index.shuffle); - __m256i _alpha = _mm256_loadu_si256((__m256i*)(alpha + index.dst)); - _mm256_storeu_si256((__m256i*)(dst + index.dst), _mm256_maddubs_epi16(Shuffle(_src, _shuffle), _alpha)); - } - - void ResizeBilinearGray(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride) - { - assert(dstWidth >= A); - - size_t size = 2 * dstWidth; - size_t bufferWidth = AlignHi(dstWidth, A) * 2; - size_t blockCount = BlockCountMax(srcWidth, dstWidth); - size_t alignedSize = AlignHi(size, DA) - DA; - - BufferG buffer(bufferWidth, blockCount, dstHeight); - - Base::EstimateAlphaIndex(srcHeight, dstHeight, buffer.iy, buffer.ay, 1); - - EstimateAlphaIndexX((int)srcWidth, (int)dstWidth, buffer.ix, buffer.ax, blockCount); - - ptrdiff_t previous = -2; - - __m512i a[2]; - - for (size_t yDst = 0; yDst < dstHeight; yDst++, dst += dstStride) - { - a[0] = _mm512_set1_epi16(int16_t(Base::FRACTION_RANGE - buffer.ay[yDst])); - a[1] = _mm512_set1_epi16(int16_t(buffer.ay[yDst])); - - ptrdiff_t sy = buffer.iy[yDst]; - int k = 0; - - if (sy == previous) - k = 2; - else if (sy == previous + 1) - { - Swap(buffer.bx[0], buffer.bx[1]); - k = 1; - } - - previous = sy; - - for (; k < 2; k++) - { - const uint8_t * psrc = src + (sy + k)*srcStride; - uint8_t * pdst = buffer.bx[k]; - for (size_t i = 0; i < blockCount; ++i) - LoadGrayInterpolated(psrc, buffer.ix[i], buffer.ax, pdst); - } - - for (size_t ib = 0, id = 0; ib < alignedSize; ib += DA, id += A) - InterpolateY(buffer.bx[0] + ib, buffer.bx[1] + ib, a, dst + id); - size_t i = size - DA; - InterpolateY(buffer.bx[0] + i, buffer.bx[1] + i, a, dst + i / 2); - } - } - - void ResizeBilinear( - const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount) - { - switch (channelCount) - { - case 1: - if (srcWidth >= A && srcWidth < 4 * dstWidth) - ResizeBilinearGray(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); - else - ResizeBilinear<1>(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); - break; - case 2: - ResizeBilinear<2>(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); - break; - case 3: - ResizeBilinear<3>(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); - break; - case 4: - ResizeBilinear<4>(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); - break; - default: - Avx2::ResizeBilinear(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, channelCount); - } - } - } -#endif//SIMD_AVX512BW_ENABLE -} - diff --git a/src/3rd/Simd/Simd/SimdAvx512bwResizer.cpp b/src/3rd/Simd/Simd/SimdAvx512bwResizer.cpp deleted file mode 100644 index 68ecf3c3..00000000 --- a/src/3rd/Simd/Simd/SimdAvx512bwResizer.cpp +++ /dev/null @@ -1,515 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar.
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdResizer.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdSet.h" -#include "Simd/SimdUpdate.h" - -namespace Simd -{ -#ifdef SIMD_AVX512BW_ENABLE - namespace Avx512bw - { - ResizerByteBilinear::ResizerByteBilinear(const ResParam & param) - : Avx2::ResizerByteBilinear(param) - { - } - - template void ResizerByteBilinearInterpolateX(const uint8_t * alpha, uint8_t * buffer); - - template <> SIMD_INLINE void ResizerByteBilinearInterpolateX<1>(const uint8_t * alpha, uint8_t * buffer) - { - __m512i _buffer = Load(buffer); - Store(buffer, _mm512_maddubs_epi16(_buffer, Load(alpha))); - } - - const __m512i K8_SHUFFLE_X2 = SIMD_MM512_SETR_EPI8( - 0x0, 0x2, 0x1, 0x3, 0x4, 0x6, 0x5, 0x7, 0x8, 0xA, 0x9, 0xB, 0xC, 0xE, 0xD, 0xF, - 0x0, 0x2, 0x1, 0x3, 0x4, 0x6, 0x5, 0x7, 0x8, 0xA, 0x9, 0xB, 0xC, 0xE, 0xD, 0xF, - 0x0, 0x2, 0x1, 0x3, 0x4, 0x6, 0x5, 0x7, 0x8, 0xA, 0x9, 0xB, 0xC, 0xE, 0xD, 0xF, - 0x0, 0x2, 0x1, 0x3, 0x4, 0x6, 0x5, 0x7, 0x8, 0xA, 0x9, 0xB, 0xC, 0xE, 0xD, 0xF); - - SIMD_INLINE void ResizerByteBilinearInterpolateX2(const uint8_t * alpha, uint8_t * buffer) - { - __m512i _buffer = _mm512_shuffle_epi8(Load(buffer), K8_SHUFFLE_X2); - Store(buffer, _mm512_maddubs_epi16(_buffer, Load(alpha))); - } - - template <> SIMD_INLINE void ResizerByteBilinearInterpolateX<2>(const uint8_t * alpha, uint8_t * buffer) - { - ResizerByteBilinearInterpolateX2(alpha + 0, buffer + 0); - ResizerByteBilinearInterpolateX2(alpha + A, buffer + A); - } - - const __m512i K8_SHUFFLE_X3_00 = SIMD_MM512_SETR_EPI8( - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - 0xE, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - 0xF, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); - const __m512i K8_SHUFFLE_X3_01 = SIMD_MM512_SETR_EPI8( - 0x0, 0x3, 0x1, 0x4, 0x2, 0x5, 0x6, 0x9, 0x7, 0xA, 0x8, 0xB, 0xC, 0xF, 0xD, -1, - -1, 0x1, 0x2, 0x5, 0x3, 0x6, 0x4, 0x7, 0x8, 0xB, 0x9, 0xC, 0xA, 0xD, 0xE, -1, - -1, 0x2, 0x0, 0x3, 0x4, 0x7, 0x5, 0x8, 0x6, 0x9, 0xA, 0xD, 0xB, 0xE, 0xC, 0xF, - 0x0, 0x3, 0x1, 0x4, 0x2, 0x5, 0x6, 0x9, 0x7, 0xA, 0x8, 0xB, 0xC, 0xF, 0xD, -1); - const __m512i K8_SHUFFLE_X3_02 = SIMD_MM512_SETR_EPI8( - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x0, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, 
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x0); - - const __m512i K8_SHUFFLE_X3_10 = SIMD_MM512_SETR_EPI8( - 0xE, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - 0xF, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - 0xE, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); - const __m512i K8_SHUFFLE_X3_11 = SIMD_MM512_SETR_EPI8( - -1, 0x1, 0x2, 0x5, 0x3, 0x6, 0x4, 0x7, 0x8, 0xB, 0x9, 0xC, 0xA, 0xD, 0xE, -1, - -1, 0x2, 0x0, 0x3, 0x4, 0x7, 0x5, 0x8, 0x6, 0x9, 0xA, 0xD, 0xB, 0xE, 0xC, 0xF, - 0x0, 0x3, 0x1, 0x4, 0x2, 0x5, 0x6, 0x9, 0x7, 0xA, 0x8, 0xB, 0xC, 0xF, 0xD, -1, - -1, 0x1, 0x2, 0x5, 0x3, 0x6, 0x4, 0x7, 0x8, 0xB, 0x9, 0xC, 0xA, 0xD, 0xE, -1); - const __m512i K8_SHUFFLE_X3_12 = SIMD_MM512_SETR_EPI8( - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x0, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x1); - - const __m512i K8_SHUFFLE_X3_20 = SIMD_MM512_SETR_EPI8( - 0xF, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - 0xE, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - 0xF, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); - const __m512i K8_SHUFFLE_X3_21 = SIMD_MM512_SETR_EPI8( - -1, 0x2, 0x0, 0x3, 0x4, 0x7, 0x5, 0x8, 0x6, 0x9, 0xA, 0xD, 0xB, 0xE, 0xC, 0xF, - 0x0, 0x3, 0x1, 0x4, 0x2, 0x5, 0x6, 0x9, 0x7, 0xA, 0x8, 0xB, 0xC, 0xF, 0xD, -1, - -1, 0x1, 0x2, 0x5, 0x3, 0x6, 0x4, 0x7, 0x8, 0xB, 0x9, 0xC, 0xA, 0xD, 0xE, -1, - -1, 0x2, 0x0, 0x3, 0x4, 0x7, 0x5, 0x8, 0x6, 0x9, 0xA, 0xD, 0xB, 0xE, 0xC, 0xF); - const __m512i K8_SHUFFLE_X3_22 = SIMD_MM512_SETR_EPI8( - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x0, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); - - template <> SIMD_INLINE void ResizerByteBilinearInterpolateX<3>(const uint8_t * alpha, uint8_t * buffer) - { - __m512i src[3], shuffled; - src[0] = Load(buffer + 0 * A); - src[1] = Load(buffer + 1 * A); - src[2] = Load(buffer + 2 * A); - - shuffled = _mm512_shuffle_epi8(_mm512_alignr_epi32(src[0], src[0], 12), K8_SHUFFLE_X3_00); - shuffled = _mm512_or_si512(shuffled, _mm512_shuffle_epi8(src[0], K8_SHUFFLE_X3_01)); - shuffled = _mm512_or_si512(shuffled, _mm512_shuffle_epi8(_mm512_alignr_epi32(src[1], src[0], 4), K8_SHUFFLE_X3_02)); - Store(buffer + 0 * A, _mm512_maddubs_epi16(shuffled, Load(alpha + 0 * A))); - - shuffled = _mm512_shuffle_epi8(_mm512_alignr_epi32(src[1], src[0], 12), K8_SHUFFLE_X3_10); - shuffled = _mm512_or_si512(shuffled, _mm512_shuffle_epi8(src[1], K8_SHUFFLE_X3_11)); - shuffled = _mm512_or_si512(shuffled, _mm512_shuffle_epi8(_mm512_alignr_epi32(src[2], src[1], 4), K8_SHUFFLE_X3_12)); - Store(buffer + 1 * A, _mm512_maddubs_epi16(shuffled, Load(alpha + 1 * A))); - - shuffled = _mm512_shuffle_epi8(_mm512_alignr_epi32(src[2], src[1], 12), K8_SHUFFLE_X3_20); - shuffled = _mm512_or_si512(shuffled, _mm512_shuffle_epi8(src[2], K8_SHUFFLE_X3_21)); - shuffled = _mm512_or_si512(shuffled, _mm512_shuffle_epi8(_mm512_alignr_epi32(src[2], src[2], 4), K8_SHUFFLE_X3_22)); - Store(buffer + 2 * A, _mm512_maddubs_epi16(shuffled, Load(alpha + 2 * A))); - } - - const __m512i K8_SHUFFLE_X4 = SIMD_MM512_SETR_EPI8( - 
0x0, 0x4, 0x1, 0x5, 0x2, 0x6, 0x3, 0x7, 0x8, 0xC, 0x9, 0xD, 0xA, 0xE, 0xB, 0xF, - 0x0, 0x4, 0x1, 0x5, 0x2, 0x6, 0x3, 0x7, 0x8, 0xC, 0x9, 0xD, 0xA, 0xE, 0xB, 0xF, - 0x0, 0x4, 0x1, 0x5, 0x2, 0x6, 0x3, 0x7, 0x8, 0xC, 0x9, 0xD, 0xA, 0xE, 0xB, 0xF, - 0x0, 0x4, 0x1, 0x5, 0x2, 0x6, 0x3, 0x7, 0x8, 0xC, 0x9, 0xD, 0xA, 0xE, 0xB, 0xF); - - SIMD_INLINE void ResizerByteBilinearInterpolateX4(const uint8_t * alpha, uint8_t * buffer) - { - __m512i _buffer = _mm512_shuffle_epi8(Load(buffer), K8_SHUFFLE_X4); - Store(buffer, _mm512_maddubs_epi16(_buffer, Load(alpha))); - } - - template <> SIMD_INLINE void ResizerByteBilinearInterpolateX<4>(const uint8_t * alpha, uint8_t * buffer) - { - ResizerByteBilinearInterpolateX4(alpha + 0 * A, buffer + 0 * A); - ResizerByteBilinearInterpolateX4(alpha + 1 * A, buffer + 1 * A); - ResizerByteBilinearInterpolateX4(alpha + 2 * A, buffer + 2 * A); - ResizerByteBilinearInterpolateX4(alpha + 3 * A, buffer + 3 * A); - } - - const __m512i K16_FRACTION_ROUND_TERM = SIMD_MM512_SET1_EPI16(Base::BILINEAR_ROUND_TERM); - - template SIMD_INLINE __m512i ResizerByteBilinearInterpolateY(const uint8_t * pbx0, const uint8_t * pbx1, __m512i alpha[2]) - { - __m512i sum = _mm512_add_epi16(_mm512_mullo_epi16(Load(pbx0), alpha[0]), _mm512_mullo_epi16(Load(pbx1), alpha[1])); - return _mm512_srli_epi16(_mm512_add_epi16(sum, K16_FRACTION_ROUND_TERM), Base::BILINEAR_SHIFT); - } - - template SIMD_INLINE void ResizerByteBilinearInterpolateY(const uint8_t * bx0, const uint8_t * bx1, __m512i alpha[2], uint8_t * dst) - { - __m512i lo = ResizerByteBilinearInterpolateY(bx0 + 0, bx1 + 0, alpha); - __m512i hi = ResizerByteBilinearInterpolateY(bx0 + A, bx1 + A, alpha); - Store(dst, _mm512_permutexvar_epi64(K64_PERMUTE_FOR_PACK, _mm512_packus_epi16(lo, hi))); - } - - template SIMD_INLINE void ResizerByteBilinearGather(const uint8_t * src, const int * idx, size_t size, uint8_t * dst) - { - struct Src { uint8_t channels[N * 1]; }; - struct Dst { uint8_t channels[N * 2]; }; - const Src * s = (const Src *)src; - Dst * d = (Dst*)dst; - for (size_t i = 0; i < size; i++) - d[i] = *(Dst *)(s + idx[i]); - } - - template <> SIMD_INLINE void ResizerByteBilinearGather<2>(const uint8_t * src, const int * idx, size_t size, uint8_t * dst) - { - for (size_t i = 0; i < size; i += 16) - { -#if defined(__GNUC__) && __GNUC__ < 6 - _mm512_storeu_si512(dst + 4 * i, _mm512_i32gather_epi32(_mm512_loadu_si512(idx + i), (const int *)src, 2)); -#else - _mm512_storeu_si512(dst + 4 * i, _mm512_i32gather_epi32(_mm512_loadu_si512(idx + i), src, 2)); -#endif - } - } - - template <> SIMD_INLINE void ResizerByteBilinearGather<4>(const uint8_t * src, const int * idx, size_t size, uint8_t * dst) - { - for (size_t i = 0; i < size; i += 8) - { -#if defined(__GNUC__) && __GNUC__ < 6 - _mm512_storeu_si512(dst + 8 * i, _mm512_i32gather_epi64(_mm256_loadu_si256((__m256i*)(idx + i)), (const long long int*)src, 4)); -#else - _mm512_storeu_si512(dst + 8 * i, _mm512_i32gather_epi64(_mm256_loadu_si256((__m256i*)(idx + i)), src, 4)); -#endif - } - } - - template void ResizerByteBilinear::Run(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride) - { - struct One { uint8_t val[N * 1]; }; - struct Two { uint8_t val[N * 2]; }; - - size_t size = 2 * _param.dstW*N; - size_t aligned = AlignHi(size, DA) - DA; - const size_t step = A * N; - ptrdiff_t previous = -2; - __m512i a[2]; - uint8_t * bx[2] = { _bx[0].data, _bx[1].data }; - const uint8_t * ax = _ax.data; - const int32_t * ix = _ix.data; - size_t dstW = _param.dstW; - - for (size_t yDst = 0; 
yDst < _param.dstH; yDst++, dst += dstStride) - { - a[0] = _mm512_set1_epi16(int16_t(Base::FRACTION_RANGE - _ay[yDst])); - a[1] = _mm512_set1_epi16(int16_t(_ay[yDst])); - - ptrdiff_t sy = _iy[yDst]; - int k = 0; - - if (sy == previous) - k = 2; - else if (sy == previous + 1) - { - Swap(bx[0], bx[1]); - k = 1; - } - - previous = sy; - - for (; k < 2; k++) - { - ResizerByteBilinearGather(src + (sy + k)*srcStride, ix, dstW, bx[k]); - - uint8_t * pbx = bx[k]; - for (size_t i = 0; i < size; i += step) - ResizerByteBilinearInterpolateX(ax + i, pbx + i); - } - - for (size_t ib = 0, id = 0; ib < aligned; ib += DA, id += A) - ResizerByteBilinearInterpolateY(bx[0] + ib, bx[1] + ib, a, dst + id); - size_t i = size - DA; - ResizerByteBilinearInterpolateY(bx[0] + i, bx[1] + i, a, dst + i / 2); - } - } - - void ResizerByteBilinear::RunG(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride) - { - size_t bufW = AlignHi(_param.dstW, A) * 2; - size_t size = 2 * _param.dstW; - size_t aligned = AlignHi(size, DA) - DA; - size_t blocks = _blocks; - ptrdiff_t previous = -2; - __m512i a[2]; - uint8_t * bx[2] = { _bx[0].data, _bx[1].data }; - const uint8_t * ax = _ax.data; - const Idx * ixg = _ixg.data; - - for (size_t yDst = 0; yDst < _param.dstH; yDst++, dst += dstStride) - { - a[0] = _mm512_set1_epi16(int16_t(Base::FRACTION_RANGE - _ay[yDst])); - a[1] = _mm512_set1_epi16(int16_t(_ay[yDst])); - - ptrdiff_t sy = _iy[yDst]; - int k = 0; - - if (sy == previous) - k = 2; - else if (sy == previous + 1) - { - Swap(bx[0], bx[1]); - k = 1; - } - - previous = sy; - - for (; k < 2; k++) - { - const uint8_t * psrc = src + (sy + k)*srcStride; - uint8_t * pdst = bx[k]; - for (size_t i = 0; i < blocks; ++i) - Avx2::ResizerByteBilinearLoadGrayInterpolated(psrc, ixg[i], ax, pdst); - } - - for (size_t ib = 0, id = 0; ib < aligned; ib += DA, id += A) - ResizerByteBilinearInterpolateY(bx[0] + ib, bx[1] + ib, a, dst + id); - size_t i = size - DA; - ResizerByteBilinearInterpolateY(bx[0] + i, bx[1] + i, a, dst + i / 2); - } - } - - void ResizerByteBilinear::Run(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride) - { - assert(_param.dstW >= A); - - EstimateParams(); - switch (_param.channels) - { - case 1: - if (_blocks) - RunG(src, srcStride, dst, dstStride); - else - Run<1>(src, srcStride, dst, dstStride); - break; - case 2: Run<2>(src, srcStride, dst, dstStride); break; - case 3: Run<3>(src, srcStride, dst, dstStride); break; - case 4: Run<4>(src, srcStride, dst, dstStride); break; - default: - assert(0); - } - } - - - //--------------------------------------------------------------------- - - ResizerByteArea::ResizerByteArea(const ResParam & param) - : Avx2::ResizerByteArea(param) - { - } - - template SIMD_INLINE void ResizerByteAreaRowUpdate(const uint8_t * src0, __m512i alpha, int32_t * dst, __mmask64 tail = -1) - { - __m512i s0 = _mm512_permutexvar_epi32(K32_PERMUTE_FOR_TWO_UNPACK, (Load(src0, tail))); - __m512i i0 = UnpackU8<0>(s0); - __m512i i1 = UnpackU8<1>(s0); - Update(dst + 0 * F, _mm512_madd_epi16(alpha, UnpackU8<0>(i0))); - Update(dst + 1 * F, _mm512_madd_epi16(alpha, UnpackU8<1>(i0))); - Update(dst + 2 * F, _mm512_madd_epi16(alpha, UnpackU8<0>(i1))); - Update(dst + 3 * F, _mm512_madd_epi16(alpha, UnpackU8<1>(i1))); - } - - template SIMD_INLINE void ResizerByteAreaRowUpdate(const uint8_t * src0, size_t size, size_t aligned, int32_t a, int32_t * dst, __mmask64 tail) - { - __m512i alpha = SetInt16(a, a); - size_t i = 0; - for (; i < aligned; i += A, dst += A, src0 += A) - 
ResizerByteAreaRowUpdate(src0, alpha, dst); - if(i < size) - ResizerByteAreaRowUpdate(src0, alpha, dst, tail); - } - - template SIMD_INLINE void ResizerByteAreaRowUpdate(const uint8_t * src0, const uint8_t * src1, __m512i alpha, int32_t * dst, __mmask64 tail = -1) - { - __m512i s0 = _mm512_permutexvar_epi32(K32_PERMUTE_FOR_TWO_UNPACK, (Load(src0, tail))); - __m512i s1 = _mm512_permutexvar_epi32(K32_PERMUTE_FOR_TWO_UNPACK, (Load(src1, tail))); - __m512i i0 = UnpackU8<0>(s0, s1); - __m512i i1 = UnpackU8<1>(s0, s1); - Update(dst + 0 * F, _mm512_madd_epi16(alpha, UnpackU8<0>(i0))); - Update(dst + 1 * F, _mm512_madd_epi16(alpha, UnpackU8<1>(i0))); - Update(dst + 2 * F, _mm512_madd_epi16(alpha, UnpackU8<0>(i1))); - Update(dst + 3 * F, _mm512_madd_epi16(alpha, UnpackU8<1>(i1))); - } - - template SIMD_INLINE void ResizerByteAreaRowUpdate(const uint8_t * src0, size_t stride, size_t size, size_t aligned, int32_t a0, int32_t a1, int32_t * dst, __mmask64 tail = -1) - { - __m512i alpha = SetInt16(a0, a1); - const uint8_t * src1 = src0 + stride; - size_t i = 0; - for (; i < aligned; i += A, dst += A) - ResizerByteAreaRowUpdate(src0 + i, src1 + i, alpha, dst); - if (i < size) - ResizerByteAreaRowUpdate(src0 + i, src1 + i, alpha, dst, tail); - } - - template SIMD_INLINE void ResizerByteAreaRowUpdate(const uint8_t * src0, const uint8_t * src1, - const uint8_t * src2, const uint8_t * src3, __m512i a01, __m512i a23, int32_t * dst, __mmask64 tail = -1) - { - __m512i s0 = _mm512_permutexvar_epi32(K32_PERMUTE_FOR_TWO_UNPACK, (Load(src0, tail))); - __m512i s1 = _mm512_permutexvar_epi32(K32_PERMUTE_FOR_TWO_UNPACK, (Load(src1, tail))); - __m512i t010 = _mm512_maddubs_epi16(UnpackU8<0>(s0, s1), a01); - __m512i t011 = _mm512_maddubs_epi16(UnpackU8<1>(s0, s1), a01); - __m512i s2 = _mm512_permutexvar_epi32(K32_PERMUTE_FOR_TWO_UNPACK, (Load(src2, tail))); - __m512i s3 = _mm512_permutexvar_epi32(K32_PERMUTE_FOR_TWO_UNPACK, (Load(src3, tail))); - __m512i t230 = _mm512_maddubs_epi16(UnpackU8<0>(s2, s3), a23); - __m512i t231 = _mm512_maddubs_epi16(UnpackU8<1>(s2, s3), a23); - Update(dst + 0 * F, _mm512_madd_epi16(K16_0001, UnpackU16<0>(t010, t230))); - Update(dst + 1 * F, _mm512_madd_epi16(K16_0001, UnpackU16<1>(t010, t230))); - Update(dst + 2 * F, _mm512_madd_epi16(K16_0001, UnpackU16<0>(t011, t231))); - Update(dst + 3 * F, _mm512_madd_epi16(K16_0001, UnpackU16<1>(t011, t231))); - } - - template SIMD_INLINE void ResizerByteAreaRowUpdate(const uint8_t * src0, size_t stride, size_t size, size_t aligned, int32_t a0, int32_t a12, int32_t a3, int32_t * dst, __mmask64 tail = -1) - { - __m512i a01 = SetInt8(a0, a12); - __m512i a23 = SetInt8(a12, a3); - const uint8_t * src1 = src0 + stride; - const uint8_t * src2 = src1 + stride; - const uint8_t * src3 = src2 + stride; - size_t i = 0; - for (; i < aligned; i += A, dst += A) - ResizerByteAreaRowUpdate(src0 + i, src1 + i, src2 + i, src3 + i, a01, a23, dst); - if (i < size) - ResizerByteAreaRowUpdate(src0 + i, src1 + i, src2 + i, src3 + i, a01, a23, dst, tail); - } - - SIMD_INLINE void ResizerByteAreaRowSum(const uint8_t * src, size_t stride, size_t count, size_t size, size_t aligned, int32_t curr, int32_t zero, int32_t next, int32_t * dst, __mmask64 tail) - { - if (count) - { - size_t i = 0; - ResizerByteAreaRowUpdate(src, stride, size, aligned, curr, count == 1 ? zero - next : zero, dst, tail), src += 2 * stride, i += 2; - for (; i < count; i += 2, src += 2 * stride) - ResizerByteAreaRowUpdate(src, stride, size, aligned, zero, i == count - 1 ? 
zero - next : zero, dst, tail); - if (i == count) - ResizerByteAreaRowUpdate(src, size, aligned, zero - next, dst, tail); - } - else - ResizerByteAreaRowUpdate(src, size, aligned, curr - next, dst, tail); - } - - template SIMD_INLINE void ResizerByteAreaSet(const int32_t * src, int32_t value, int32_t * dst) - { - for (size_t c = 0; c < N; ++c) - dst[c] = src[c] * value; - } - - template SIMD_INLINE void ResizerByteAreaAdd(const int32_t * src, int32_t value, int32_t * dst) - { - for (size_t c = 0; c < N; ++c) - dst[c] += src[c] * value; - } - - template SIMD_INLINE void ResizerByteAreaRes(const int32_t * src, uint8_t * dst) - { - for (size_t c = 0; c < N; ++c) - dst[c] = uint8_t((src[c] + Base::AREA_ROUND) >> Base::AREA_SHIFT); - } - - template SIMD_INLINE void ResizerByteAreaResult(const int32_t * src, size_t count, int32_t curr, int32_t zero, int32_t next, uint8_t * dst) - { - int32_t sum[N]; - ResizerByteAreaSet(src, curr, sum); - for (size_t i = 0; i < count; ++i) - src += N, ResizerByteAreaAdd(src, zero, sum); - ResizerByteAreaAdd(src, -next, sum); - ResizerByteAreaRes(sum, dst); - } - - template SIMD_INLINE void ResizerByteAreaResult34(const int32_t * src, size_t count, int32_t curr, int32_t zero, int32_t next, uint8_t * dst) - { - __m128i sum = _mm_mullo_epi32(_mm_loadu_si128((__m128i*)src), _mm_set1_epi32(curr)); - for (size_t i = 0; i < count; ++i) - src += N, sum = _mm_add_epi32(sum, _mm_mullo_epi32(_mm_loadu_si128((__m128i*)src), _mm_set1_epi32(zero))); - sum = _mm_add_epi32(sum, _mm_mullo_epi32(_mm_loadu_si128((__m128i*)src), _mm_set1_epi32(-next))); - __m128i res = _mm_srai_epi32(_mm_add_epi32(sum, _mm_set1_epi32(Base::AREA_ROUND)), Base::AREA_SHIFT); - *(int32_t*)dst = _mm_cvtsi128_si32(_mm_packus_epi16(_mm_packus_epi32(res, Sse2::K_ZERO), Sse2::K_ZERO)); - } - - template<> SIMD_INLINE void ResizerByteAreaResult<4>(const int32_t * src, size_t count, int32_t curr, int32_t zero, int32_t next, uint8_t * dst) - { - ResizerByteAreaResult34<4>(src, count, curr, zero, next, dst); - } - - template<> SIMD_INLINE void ResizerByteAreaResult<3>(const int32_t * src, size_t count, int32_t curr, int32_t zero, int32_t next, uint8_t * dst) - { - ResizerByteAreaResult34<3>(src, count, curr, zero, next, dst); - } - - template void ResizerByteArea::Run(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride) - { - size_t dstW = _param.dstW, rowSize = _param.srcW*N, rowRest = dstStride - dstW * N; - const int32_t * iy = _iy.data, *ix = _ix.data, *ay = _ay.data, *ax = _ax.data; - int32_t ay0 = ay[0], ax0 = ax[0]; - size_t rowSizeA = AlignLo(rowSize, A); - __mmask64 tail = TailMask64(rowSize - rowSizeA); - for (size_t dy = 0; dy < _param.dstH; dy++, dst += rowRest) - { - int32_t * buf = _by.data; - size_t yn = iy[dy + 1] - iy[dy]; - ResizerByteAreaRowSum(src, srcStride, yn, rowSize, rowSizeA, ay[dy], ay0, ay[dy + 1], buf, tail), src += yn * srcStride; - for (size_t dx = 0; dx < dstW; dx++, dst += N) - { - size_t xn = ix[dx + 1] - ix[dx]; - ResizerByteAreaResult(buf, xn, ax[dx], ax0, ax[dx + 1], dst), buf += xn * N; - } - } - } - - void ResizerByteArea::Run(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride) - { - switch (_param.channels) - { - case 1: Run<1>(src, srcStride, dst, dstStride); return; - case 2: Run<2>(src, srcStride, dst, dstStride); return; - case 3: Run<3>(src, srcStride, dst, dstStride); return; - case 4: Run<4>(src, srcStride, dst, dstStride); return; - default: - assert(0); - } - } - - 
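The bilinear resizers above, both the deprecated ResizeBilinear functions and the ResizerByteBilinear class, vectorize the same two-pass fixed-point scheme: a horizontal pass leaves pre-interpolated 16-bit rows in the bx buffers (via _mm512_maddubs_epi16), and a vertical pass blends two such rows and removes both fraction scales with one rounded shift (via _mm512_mullo_epi16 and _mm512_srli_epi16). A minimal scalar sketch of that arithmetic follows; the concrete constant values are assumptions (LINEAR_SHIFT = 4, hence FRACTION_RANGE = 16 and BILINEAR_SHIFT = 8) chosen only so the sketch mirrors the Base:: names referenced in the code, not values taken from this diff.

#include <cstdint>

namespace BilinearSketch
{
    const int LINEAR_SHIFT = 4;                                // assumed value
    const int FRACTION_RANGE = 1 << LINEAR_SHIFT;              // fraction scale of one pass
    const int BILINEAR_SHIFT = 2 * LINEAR_SHIFT;               // two passes multiply their scales
    const int BILINEAR_ROUND_TERM = 1 << (BILINEAR_SHIFT - 1); // round-to-nearest term

    // Horizontal pass: blend two neighboring source bytes with fraction ax in
    // [0, FRACTION_RANGE]; the result keeps one factor of FRACTION_RANGE, like
    // the 16-bit lanes that _mm512_maddubs_epi16 writes into the bx buffers.
    inline int16_t InterpolateX(uint8_t s0, uint8_t s1, int ax)
    {
        return (int16_t)(s0 * (FRACTION_RANGE - ax) + s1 * ax);
    }

    // Vertical pass: blend two pre-interpolated rows with fraction ay and drop
    // both fraction scales with a single rounded shift, mirroring the
    // mullo/add/srli sequence in the InterpolateY kernels above.
    inline uint8_t InterpolateY(int16_t bx0, int16_t bx1, int ay)
    {
        int sum = bx0 * (FRACTION_RANGE - ay) + bx1 * ay;
        return (uint8_t)((sum + BILINEAR_ROUND_TERM) >> BILINEAR_SHIFT);
    }
}

Keeping the horizontal results at full 16-bit precision is also what makes the row cache work: when consecutive destination rows fall between the same pair of source rows (sy == previous, or previous + 1 with the bx[0]/bx[1] swap), an already interpolated row is reused instead of being gathered and filtered again.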
//--------------------------------------------------------------------- - - void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method) - { - ResParam param(srcX, srcY, dstX, dstY, channels, type, method, sizeof(__m512i)); - if (param.IsByteBilinear() && dstX >= A) - return new ResizerByteBilinear(param); - else if (param.IsByteArea()) - return new ResizerByteArea(param); - else - return Avx512f::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method); - } - } -#endif //SIMD_AVX512BW_ENABLE -} - diff --git a/src/3rd/Simd/Simd/SimdAvx512bwSegmentation.cpp b/src/3rd/Simd/Simd/SimdAvx512bwSegmentation.cpp deleted file mode 100644 index 05ec8e83..00000000 --- a/src/3rd/Simd/Simd/SimdAvx512bwSegmentation.cpp +++ /dev/null @@ -1,289 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdCompare.h" - -namespace Simd -{ -#ifdef SIMD_AVX512BW_ENABLE - namespace Avx512bw - { - template SIMD_INLINE void ChangeIndex(uint8_t * mask, __m512i oldIndex, __m512i newIndex, __mmask64 tail = -1) - { - Store(mask, newIndex, _mm512_cmpeq_epi8_mask((Load(mask, tail)), oldIndex)&tail); - } - - template SIMD_INLINE void ChangeIndex4(uint8_t * mask, __m512i oldIndex, __m512i newIndex) - { - Store(mask + 0 * A, newIndex, _mm512_cmpeq_epi8_mask(Load(mask + 0 * A), oldIndex)); - Store(mask + 1 * A, newIndex, _mm512_cmpeq_epi8_mask(Load(mask + 1 * A), oldIndex)); - Store(mask + 2 * A, newIndex, _mm512_cmpeq_epi8_mask(Load(mask + 2 * A), oldIndex)); - Store(mask + 3 * A, newIndex, _mm512_cmpeq_epi8_mask(Load(mask + 3 * A), oldIndex)); - } - - template void SegmentationChangeIndex(uint8_t * mask, size_t stride, size_t width, size_t height, uint8_t oldIndex, uint8_t newIndex) - { - if (align) - assert(Aligned(mask) && Aligned(stride)); - - size_t alignedWidth = Simd::AlignLo(width, A); - size_t fullAlignedWidth = Simd::AlignLo(width, QA); - __mmask64 tailMask = TailMask64(width - alignedWidth); - - __m512i _oldIndex = _mm512_set1_epi8((char)oldIndex); - __m512i _newIndex = _mm512_set1_epi8((char)newIndex); - - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < fullAlignedWidth; col += QA) - ChangeIndex4(mask + col, _oldIndex, _newIndex); - for (; col < alignedWidth; col += A) - ChangeIndex(mask + col, _oldIndex, _newIndex); - if (col < width) - ChangeIndex(mask + col, _oldIndex, _newIndex, tailMask); - mask += stride; - } - } - - void SegmentationChangeIndex(uint8_t * mask, size_t stride, size_t width, size_t height, uint8_t oldIndex, uint8_t newIndex) - { - if (Aligned(mask) && Aligned(stride)) - SegmentationChangeIndex(mask, stride, width, height, oldIndex, newIndex); - else - SegmentationChangeIndex(mask, stride, width, height, oldIndex, newIndex); - } - - template SIMD_INLINE void FillSingleHoles(uint8_t * mask, ptrdiff_t stride, __m512i index, __mmask64 edge = -1) - { - __mmask64 up = _mm512_cmpeq_epi8_mask((Load(mask - stride, edge)), index); - __mmask64 left = _mm512_cmpeq_epi8_mask((Load(mask - 1, edge)), index); - __mmask64 right = _mm512_cmpeq_epi8_mask((Load(mask + 1, edge)), index); - __mmask64 down = _mm512_cmpeq_epi8_mask((Load(mask + stride, edge)), index); - Store(mask, index, up & left & right & down & edge); - } - - template void SegmentationFillSingleHoles(uint8_t * mask, size_t stride, size_t width, size_t height, uint8_t index) - { - assert(width > 2 && height > 2); - - __m512i _index = _mm512_set1_epi8((char)index); - size_t alignedWidth = Simd::AlignLo(width - 1, A); - __mmask64 noseMask = NoseMask64(A - 1); - __mmask64 tailMask = TailMask64(width - 1 - alignedWidth); - if (alignedWidth < A) - noseMask = noseMask&tailMask; - - for (size_t row = 2; row < height; ++row) - { - mask += stride; - size_t col = A; - FillSingleHoles(mask, stride, _index, noseMask); - for (; col < alignedWidth; col += A) - FillSingleHoles(mask + col, stride, _index); - if (col < width) - FillSingleHoles(mask + col, stride, _index, tailMask); - } - } - - void SegmentationFillSingleHoles(uint8_t * mask, size_t stride, size_t width, size_t height, uint8_t index) - { - if (Aligned(mask) && Aligned(stride)) - SegmentationFillSingleHoles(mask, stride, width, height, index); - else - SegmentationFillSingleHoles(mask, stride, width, height, index); - } - - template SIMD_INLINE void 
SegmentationPropagate2x2(__mmask32 parentOne, __mmask32 parentAll, - const uint8_t * difference0, const uint8_t * difference1, uint8_t * child0, uint8_t * child1, size_t childCol, - const __m512i & index, const __m512i & invalid, const __m512i & empty, const __m512i & threshold, __mmask32 tail) - { - __m512i _difference0 = _mm512_mask_set1_epi16(Load((uint16_t*)(difference0 + childCol), tail&parentOne), parentAll, -1); - __m512i _difference1 = _mm512_mask_set1_epi16(Load((uint16_t*)(difference1 + childCol), tail&parentOne), parentAll, -1); - __m512i _child0 = Load((uint16_t*)(child0 + childCol), tail); - __m512i _child1 = Load((uint16_t*)(child1 + childCol), tail); - __mmask64 condition0 = _mm512_cmpgt_epu8_mask(_difference0, threshold); - __mmask64 condition1 = _mm512_cmpgt_epu8_mask(_difference1, threshold); - Store((uint16_t*)(child0 + childCol), _mm512_mask_blend_epi8(_mm512_cmplt_epu8_mask(_child0, invalid), _child0, _mm512_mask_blend_epi8(condition0, empty, index)), tail); - Store((uint16_t*)(child1 + childCol), _mm512_mask_blend_epi8(_mm512_cmplt_epu8_mask(_child1, invalid), _child1, _mm512_mask_blend_epi8(condition1, empty, index)), tail); - } - - template SIMD_INLINE void SegmentationPropagate2x2(const uint8_t * parent0, const uint8_t * parent1, size_t parentCol, - const uint8_t * difference0, const uint8_t * difference1, uint8_t * child0, uint8_t * child1, size_t childCol, - const __m512i & index, const __m512i & invalid, const __m512i & empty, const __m512i & threshold, __mmask64 tail = -1) - { - __mmask64 parent00 = _mm512_cmpeq_epi8_mask((Load(parent0 + parentCol, tail)), index); - __mmask64 parent01 = _mm512_cmpeq_epi8_mask((Load(parent0 + parentCol + 1, tail)), index); - __mmask64 parent10 = _mm512_cmpeq_epi8_mask((Load(parent1 + parentCol, tail)), index); - __mmask64 parent11 = _mm512_cmpeq_epi8_mask((Load(parent1 + parentCol + 1, tail)), index); - __mmask64 one = parent00 | parent01 | parent10 | parent11; - __mmask64 all = parent00 & parent01 & parent10 & parent11; - SegmentationPropagate2x2(__mmask32(one >> 00), __mmask32(all >> 00), difference0, difference1, child0, child1, childCol + 0, index, invalid, empty, threshold, __mmask32(tail >> 00)); - SegmentationPropagate2x2(__mmask32(one >> 32), __mmask32(all >> 32), difference0, difference1, child0, child1, childCol + A, index, invalid, empty, threshold, __mmask32(tail >> 32)); - } - - template void SegmentationPropagate2x2(const uint8_t * parent, size_t parentStride, size_t width, size_t height, - uint8_t * child, size_t childStride, const uint8_t * difference, size_t differenceStride, - uint8_t currentIndex, uint8_t invalidIndex, uint8_t emptyIndex, uint8_t differenceThreshold) - { - assert(width >= 2 && height >= 2); - height--; - width--; - - size_t alignedWidth = Simd::AlignLo(width, A); - __mmask64 tailMask = TailMask64(width - alignedWidth); - __m512i index = _mm512_set1_epi8((char)currentIndex); - __m512i invalid = _mm512_set1_epi8((char)invalidIndex); - __m512i empty = _mm512_set1_epi8((char)emptyIndex); - __m512i threshold = _mm512_set1_epi8((char)differenceThreshold); - - for (size_t parentRow = 0, childRow = 1; parentRow < height; ++parentRow, childRow += 2) - { - const uint8_t * parent0 = parent + parentRow*parentStride; - const uint8_t * parent1 = parent0 + parentStride; - const uint8_t * difference0 = difference + childRow*differenceStride; - const uint8_t * difference1 = difference0 + differenceStride; - uint8_t * child0 = child + childRow*childStride; - uint8_t * child1 = child0 + childStride; - - size_t 
parentCol = 0, childCol = 1; - for (; parentCol < alignedWidth; parentCol += A, childCol += DA) - SegmentationPropagate2x2(parent0, parent1, parentCol, difference0, difference1, - child0, child1, childCol, index, invalid, empty, threshold); - if (parentCol < width) - SegmentationPropagate2x2(parent0, parent1, parentCol, difference0, difference1, - child0, child1, childCol, index, invalid, empty, threshold, tailMask); - } - } - - void SegmentationPropagate2x2(const uint8_t * parent, size_t parentStride, size_t width, size_t height, - uint8_t * child, size_t childStride, const uint8_t * difference, size_t differenceStride, - uint8_t currentIndex, uint8_t invalidIndex, uint8_t emptyIndex, uint8_t differenceThreshold) - { - if (Aligned(parent) && Aligned(parentStride)) - SegmentationPropagate2x2(parent, parentStride, width, height, child, childStride, - difference, differenceStride, currentIndex, invalidIndex, emptyIndex, differenceThreshold); - else - SegmentationPropagate2x2(parent, parentStride, width, height, child, childStride, - difference, differenceStride, currentIndex, invalidIndex, emptyIndex, differenceThreshold); - } - - SIMD_INLINE bool RowHasIndex(const uint8_t * mask, size_t alignedSize, size_t fullSize, __m512i index, __mmask64 tail) - { - size_t col = 0; - for (; col < alignedSize; col += A) - { - if (_mm512_cmpeq_epi8_mask(_mm512_loadu_si512(mask + col), index)) - return true; - } - if (col < fullSize) - { - if (_mm512_cmpeq_epi8_mask(_mm512_maskz_loadu_epi8(tail, mask + col), index)) - return true; - } - return false; - } - - template SIMD_INLINE void ColsHasIndex(const uint8_t * mask, size_t stride, size_t size, __m512i index, __mmask64 & cols, __mmask64 tail = -1) - { - for (size_t row = 0; row < size; ++row) - { - cols = cols | _mm512_cmpeq_epi8_mask((Load(mask, tail)), index); - mask += stride; - } - } - - void SegmentationShrinkRegion(const uint8_t * mask, size_t stride, size_t width, size_t height, uint8_t index, - ptrdiff_t * left, ptrdiff_t * top, ptrdiff_t * right, ptrdiff_t * bottom) - { - assert(*left >= 0 && *right <= (ptrdiff_t)width && *top >= 0 && *bottom <= (ptrdiff_t)height); - - size_t fullWidth = *right - *left; - ptrdiff_t alignedWidth = Simd::AlignLo(fullWidth, A); - ptrdiff_t alignedRight = *left + alignedWidth; - __mmask64 tailMask = TailMask64(fullWidth - alignedWidth); - ptrdiff_t alignedLeft = *right - alignedWidth; - __mmask64 noseMask = NoseMask64(fullWidth - alignedWidth); - - __m512i _index = _mm512_set1_epi8(index); - bool search = true; - for (ptrdiff_t row = *top; search && row < *bottom; ++row) - { - if (RowHasIndex(mask + row*stride + *left, alignedWidth, fullWidth, _index, tailMask)) - { - search = false; - *top = row; - } - } - - if (search) - { - *left = 0; - *top = 0; - *right = 0; - *bottom = 0; - return; - } - - for (ptrdiff_t row = *bottom - 1; row >= *top; --row) - { - if (RowHasIndex(mask + row*stride + *left, alignedWidth, fullWidth, _index, tailMask)) - { - *bottom = row + 1; - break; - } - } - - for (ptrdiff_t col = *left; col < *right; col += A) - { - __mmask64 cols = 0; - if (col < alignedRight) - ColsHasIndex(mask + (*top)*stride + col, stride, *bottom - *top, _index, cols); - else - ColsHasIndex(mask + (*top)*stride + col, stride, *bottom - *top, _index, cols, tailMask); - if (cols) - { - *left = col + FirstNotZero64(cols); - break; - } - } - - for (ptrdiff_t col = *right - A; col >= *left; col -= A) - { - __mmask64 cols = 0; - if (col >= alignedLeft) - ColsHasIndex(mask + (*top)*stride + col, stride, *bottom - *top, _index, 
cols); - else - ColsHasIndex(mask + (*top)*stride + col, stride, *bottom - *top, _index, cols, noseMask); - if (cols) - { - *right = col + LastNotZero64(cols); - break; - } - } - } - } -#endif//SIMD_AVX512BW_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx512bwShiftBilinear.cpp b/src/3rd/Simd/Simd/SimdAvx512bwShiftBilinear.cpp deleted file mode 100644 index e7552684..00000000 --- a/src/3rd/Simd/Simd/SimdAvx512bwShiftBilinear.cpp +++ /dev/null @@ -1,193 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdBase.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdSet.h" - -namespace Simd -{ -#ifdef SIMD_AVX512BW_ENABLE - namespace Avx512bw - { - const __m512i K16_LINEAR_ROUND_TERM = SIMD_MM512_SET1_EPI16(Base::LINEAR_ROUND_TERM); - const __m512i K16_BILINEAR_ROUND_TERM = SIMD_MM512_SET1_EPI16(Base::BILINEAR_ROUND_TERM); - - const int BILINEAR_SHIFT_EVEN = Base::BILINEAR_SHIFT - 1; - const int BILINEAR_ROUND_TERM_EVEN = 1 << (BILINEAR_SHIFT_EVEN - 1); - const __m512i K16_BILINEAR_ROUND_TERM_EVEN = SIMD_MM512_SET1_EPI16(BILINEAR_ROUND_TERM_EVEN); - - SIMD_INLINE __m512i Interpolate(__m512i s[2][2], __m512i k[2][2]) - { - __m512i sum0 = _mm512_add_epi16(_mm512_mullo_epi16(s[0][0], k[0][0]), _mm512_mullo_epi16(s[0][1], k[0][1])); - __m512i sum1 = _mm512_add_epi16(_mm512_mullo_epi16(s[1][0], k[1][0]), _mm512_mullo_epi16(s[1][1], k[1][1])); - return _mm512_srli_epi16(_mm512_add_epi16(_mm512_add_epi16(sum0, sum1), K16_BILINEAR_ROUND_TERM), Base::BILINEAR_SHIFT); - } - - template SIMD_INLINE void LoadBlock(const uint8_t * src, __m512i & lo, __m512i & hi, __mmask64 tail = -1) - { - const __m512i _src = Load(src, tail); - lo = UnpackU8<0>(_src); - hi = UnpackU8<1>(_src); - } - - template SIMD_INLINE void Interpolate(const uint8_t * src, size_t dx, size_t dy, __m512i k[2][2], uint8_t * dst, __mmask64 tail = -1) - { - __m512i s[2][2][2]; - LoadBlock(src, s[0][0][0], s[1][0][0], tail); - LoadBlock(src + dx, s[0][0][1], s[1][0][1], tail); - LoadBlock(src + dy, s[0][1][0], s[1][1][0], tail); - LoadBlock(src + dy + dx, s[0][1][1], s[1][1][1], tail); - Store(dst, _mm512_packus_epi16(Interpolate(s[0], k), Interpolate(s[1], k)), tail); - } - - template SIMD_INLINE void Interpolate(const uint8_t * src, size_t dx, size_t dy, __m512i k[2], uint8_t * dst, __mmask64 tail = -1) - { - const __m512i s00 = Load(src, tail); - const __m512i s01 = Load(src + 
dx, tail); - const __m512i s10 = Load(src + dy, tail); - const __m512i s11 = Load(src + dy + dx, tail); - __m512i lo = _mm512_srli_epi16(_mm512_add_epi16(_mm512_add_epi16(_mm512_maddubs_epi16(UnpackU8<0>(s00, s01), k[0]), - _mm512_maddubs_epi16(UnpackU8<0>(s10, s11), k[1])), K16_BILINEAR_ROUND_TERM_EVEN), BILINEAR_SHIFT_EVEN); - __m512i hi = _mm512_srli_epi16(_mm512_add_epi16(_mm512_add_epi16(_mm512_maddubs_epi16(UnpackU8<1>(s00, s01), k[0]), - _mm512_maddubs_epi16(UnpackU8<1>(s10, s11), k[1])), K16_BILINEAR_ROUND_TERM_EVEN), BILINEAR_SHIFT_EVEN); - Store(dst, _mm512_packus_epi16(lo, hi), tail); - } - - template SIMD_INLINE void Interpolate(const uint8_t * src, size_t dr, const __m512i & k, uint8_t * dst, __mmask64 tail = -1) - { - const __m512i s0 = Load(src, tail); - const __m512i s1 = Load(src + dr, tail); - __m512i lo = _mm512_srli_epi16(_mm512_add_epi16(_mm512_maddubs_epi16(UnpackU8<0>(s0, s1), k), K16_LINEAR_ROUND_TERM), Base::LINEAR_SHIFT); - __m512i hi = _mm512_srli_epi16(_mm512_add_epi16(_mm512_maddubs_epi16(UnpackU8<1>(s0, s1), k), K16_LINEAR_ROUND_TERM), Base::LINEAR_SHIFT); - Store(dst, _mm512_packus_epi16(lo, hi), tail); - } - - void ShiftBilinear(const uint8_t *src, size_t srcStride, size_t width, size_t height, size_t channelCount, - int fDx, int fDy, uint8_t *dst, size_t dstStride) - { - size_t size = width*channelCount; - size_t alignedSize = AlignLo(size, A); - __mmask64 tailMask = TailMask64(size - alignedSize); - - if (fDy) - { - if (fDx) - { - if (fDx & fDy & 1) - { - __m512i k[2][2]; - k[0][0] = _mm512_set1_epi16((Base::FRACTION_RANGE - fDx)*(Base::FRACTION_RANGE - fDy)); - k[0][1] = _mm512_set1_epi16(fDx*(Base::FRACTION_RANGE - fDy)); - k[1][0] = _mm512_set1_epi16((Base::FRACTION_RANGE - fDx)*fDy); - k[1][1] = _mm512_set1_epi16(fDx*fDy); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < alignedSize; col += A) - Interpolate(src + col, channelCount, srcStride, k, dst + col); - if (col < size) - Interpolate(src + col, channelCount, srcStride, k, dst + col, tailMask); - src += srcStride; - dst += dstStride; - } - } - else - { - __m512i k[2]; - k[0] = SetInt8((Base::FRACTION_RANGE - fDx)*(Base::FRACTION_RANGE - fDy) / 2, fDx*(Base::FRACTION_RANGE - fDy) / 2); - k[1] = SetInt8((Base::FRACTION_RANGE - fDx)*fDy / 2, fDx*fDy / 2); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < alignedSize; col += A) - Interpolate(src + col, channelCount, srcStride, k, dst + col); - if (col < size) - Interpolate(src + col, channelCount, srcStride, k, dst + col, tailMask); - src += srcStride; - dst += dstStride; - } - } - } - else - { - __m512i k = SetInt8(Base::FRACTION_RANGE - fDy, fDy); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < alignedSize; col += A) - Interpolate(src + col, srcStride, k, dst + col); - if (col < size) - Interpolate(src + col, srcStride, k, dst + col, tailMask); - src += srcStride; - dst += dstStride; - } - } - } - else - { - if (fDx) - { - __m512i k = SetInt8(Base::FRACTION_RANGE - fDx, fDx); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < alignedSize; col += A) - Interpolate(src + col, channelCount, k, dst + col); - if (col < size) - Interpolate(src + col, channelCount, k, dst + col, tailMask); - src += srcStride; - dst += dstStride; - } - } - else - { - for (size_t row = 0; row < height; ++row) - { - memcpy(dst, src, size); - src += srcStride; - dst += dstStride; - } - } - } - } - - void ShiftBilinear( - const uint8_t * src, size_t 
srcStride, size_t width, size_t height, size_t channelCount, - const uint8_t * bkg, size_t bkgStride, const double * shiftX, const double * shiftY, - size_t cropLeft, size_t cropTop, size_t cropRight, size_t cropBottom, uint8_t * dst, size_t dstStride) - { - int fDx, fDy; - Base::CommonShiftAction(src, srcStride, width, height, channelCount, bkg, bkgStride, shiftX, shiftY, - cropLeft, cropTop, cropRight, cropBottom, dst, dstStride, fDx, fDy); - - if (*shiftX + A < cropRight - cropLeft) - Avx512bw::ShiftBilinear(src, srcStride, width, height, channelCount, fDx, fDy, dst, dstStride); - else - Base::ShiftBilinear(src, srcStride, width, height, channelCount, fDx, fDy, dst, dstStride); - } - } -#endif//SIMD_AVX512bw_ENABLE -} - diff --git a/src/3rd/Simd/Simd/SimdAvx512bwSobel.cpp b/src/3rd/Simd/Simd/SimdAvx512bwSobel.cpp deleted file mode 100644 index 53be7bce..00000000 --- a/src/3rd/Simd/Simd/SimdAvx512bwSobel.cpp +++ /dev/null @@ -1,526 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdExtract.h" -#include "Simd/SimdSet.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdCompare.h" - -namespace Simd -{ -#ifdef SIMD_AVX512BW_ENABLE - namespace Avx512bw - { - const __m512i K64_PERMUTE_0 = SIMD_MM512_SETR_EPI64(0x0, 0x1, 0x8, 0x9, 0x2, 0x3, 0xA, 0xB); - const __m512i K64_PERMUTE_1 = SIMD_MM512_SETR_EPI64(0x4, 0x5, 0xC, 0xD, 0x6, 0x7, 0xE, 0xF); - - template SIMD_INLINE void SobelDx(__m512i a[3][3], __m512i & lo, __m512i & hi) - { - lo = ConditionalAbs(BinomialSum16(SubUnpackedU8<0>(a[0][2], a[0][0]), SubUnpackedU8<0>(a[1][2], a[1][0]), SubUnpackedU8<0>(a[2][2], a[2][0]))); - hi = ConditionalAbs(BinomialSum16(SubUnpackedU8<1>(a[0][2], a[0][0]), SubUnpackedU8<1>(a[1][2], a[1][0]), SubUnpackedU8<1>(a[2][2], a[2][0]))); - } - - template SIMD_INLINE void SobelDx(__m512i a[3][3], int16_t * dst) - { - __m512i lo, hi; - SobelDx(a, lo, hi); - Store(dst + 00, _mm512_permutex2var_epi64(lo, K64_PERMUTE_0, hi)); - Store(dst + HA, _mm512_permutex2var_epi64(lo, K64_PERMUTE_1, hi)); - } - - template void SobelDx(const uint8_t * src, size_t srcStride, size_t width, size_t height, int16_t * dst, size_t dstStride) - { - assert(width > A); - if (align) - assert(Aligned(dst) && Aligned(dstStride, HA)); - - size_t bodyWidth = Simd::AlignHi(width, A) - A; - const uint8_t *src0, *src1, *src2; - __m512i a[3][3]; - - for (size_t row = 0; row < height; ++row) - { - src0 = src + srcStride*(row - 1); - src1 = src0 + srcStride; - src2 = src1 + srcStride; - if (row == 0) - src0 = src1; - if (row == height - 1) - src2 = src1; - - LoadNoseDx(src0 + 0, a[0]); - LoadNoseDx(src1 + 0, a[1]); - LoadNoseDx(src2 + 0, a[2]); - SobelDx(a, dst + 0); - for (size_t col = A; col < bodyWidth; col += A) - { - LoadBodyDx(src0 + col, a[0]); - LoadBodyDx(src1 + col, a[1]); - LoadBodyDx(src2 + col, a[2]); - SobelDx(a, dst + col); - } - LoadTailDx(src0 + width - A, a[0]); - LoadTailDx(src1 + width - A, a[1]); - LoadTailDx(src2 + width - A, a[2]); - SobelDx(a, dst + width - A); - - dst += dstStride; - } - } - - void SobelDx(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride) - { - assert(dstStride % sizeof(int16_t) == 0); - - if (Aligned(dst) && Aligned(dstStride)) - SobelDx(src, srcStride, width, height, (int16_t *)dst, dstStride / sizeof(int16_t)); - else - SobelDx(src, srcStride, width, height, (int16_t *)dst, dstStride / sizeof(int16_t)); - } - - void SobelDxAbs(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride) - { - assert(dstStride % sizeof(int16_t) == 0); - - if (Aligned(dst) && Aligned(dstStride)) - SobelDx(src, srcStride, width, height, (int16_t *)dst, dstStride / sizeof(int16_t)); - else - SobelDx(src, srcStride, width, height, (int16_t *)dst, dstStride / sizeof(int16_t)); - } - - SIMD_INLINE void SobelDxAbsSum(__m512i a[3][3], __m512i * sums) - { - __m512i lo, hi; - SobelDx(a, lo, hi); - sums[0] = _mm512_add_epi32(sums[0], _mm512_madd_epi16(lo, K16_0001)); - sums[1] = _mm512_add_epi32(sums[1], _mm512_madd_epi16(hi, K16_0001)); - } - - SIMD_INLINE void SetMask3(__m512i a[3], __m512i mask) - { - a[0] = _mm512_and_si512(a[0], mask); - a[1] = _mm512_and_si512(a[1], mask); - a[2] = _mm512_and_si512(a[2], mask); - } - - SIMD_INLINE void SetMask3x3(__m512i a[3][3], __m512i mask) - { - SetMask3(a[0], mask); - SetMask3(a[1], mask); - SetMask3(a[2], mask); - } - - template void SobelDxAbsSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum) - { 
- assert(width > A && width < 256 * 256 * F); - if (align) - assert(Aligned(src) && Aligned(stride)); - - size_t bodyWidth = Simd::AlignHi(width, A) - A; - const uint8_t *src0, *src1, *src2; - - __m512i a[3][3]; - __m512i tailMask = _mm512_mask_set1_epi8(K_INV_ZERO, TailMask64(A - width + bodyWidth), 0); - - size_t blockSize = (256 * 256 * F) / width; - size_t blockCount = height / blockSize + 1; - __m512i _sum = _mm512_setzero_si512(); - for (size_t block = 0; block < blockCount; ++block) - { - __m512i sums[2] = { _mm512_setzero_si512(), _mm512_setzero_si512() }; - for (size_t row = block*blockSize, endRow = Simd::Min(row + blockSize, height); row < endRow; ++row) - { - src0 = src + stride*(row - 1); - src1 = src0 + stride; - src2 = src1 + stride; - if (row == 0) - src0 = src1; - if (row == height - 1) - src2 = src1; - - LoadNoseDx(src0 + 0, a[0]); - LoadNoseDx(src1 + 0, a[1]); - LoadNoseDx(src2 + 0, a[2]); - SobelDxAbsSum(a, sums); - for (size_t col = A; col < bodyWidth; col += A) - { - LoadBodyDx(src0 + col, a[0]); - LoadBodyDx(src1 + col, a[1]); - LoadBodyDx(src2 + col, a[2]); - SobelDxAbsSum(a, sums); - } - LoadTailDx(src0 + width - A, a[0]); - LoadTailDx(src1 + width - A, a[1]); - LoadTailDx(src2 + width - A, a[2]); - SetMask3x3(a, tailMask); - SobelDxAbsSum(a, sums); - } - sums[0] = _mm512_add_epi32(sums[0], sums[1]); - _sum = _mm512_add_epi64(_sum, _mm512_add_epi64(_mm512_unpacklo_epi32(sums[0], K_ZERO), _mm512_unpackhi_epi32(sums[0], K_ZERO))); - } - - *sum = ExtractSum(_sum); - } - - void SobelDxAbsSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum) - { - if (Aligned(src) && Aligned(stride)) - SobelDxAbsSum(src, stride, width, height, sum); - else - SobelDxAbsSum(src, stride, width, height, sum); - } - - template SIMD_INLINE void SobelDy(__m512i a[3][3], __m512i & lo, __m512i & hi) - { - lo = ConditionalAbs(BinomialSum16(SubUnpackedU8<0>(a[2][0], a[0][0]), SubUnpackedU8<0>(a[2][1], a[0][1]), SubUnpackedU8<0>(a[2][2], a[0][2]))); - hi = ConditionalAbs(BinomialSum16(SubUnpackedU8<1>(a[2][0], a[0][0]), SubUnpackedU8<1>(a[2][1], a[0][1]), SubUnpackedU8<1>(a[2][2], a[0][2]))); - } - - template SIMD_INLINE void SobelDy(__m512i a[3][3], int16_t * dst) - { - __m512i lo, hi; - SobelDy(a, lo, hi); - Store(dst + 00, _mm512_permutex2var_epi64(lo, K64_PERMUTE_0, hi)); - Store(dst + HA, _mm512_permutex2var_epi64(lo, K64_PERMUTE_1, hi)); - } - - template void SobelDy(const uint8_t * src, size_t srcStride, size_t width, size_t height, int16_t * dst, size_t dstStride) - { - assert(width > A); - if (align) - assert(Aligned(dst) && Aligned(dstStride, HA)); - - size_t bodyWidth = Simd::AlignHi(width, A) - A; - const uint8_t *src0, *src1, *src2; - __m512i a[3][3]; - - for (size_t row = 0; row < height; ++row) - { - src0 = src + srcStride*(row - 1); - src1 = src0 + srcStride; - src2 = src1 + srcStride; - if (row == 0) - src0 = src1; - if (row == height - 1) - src2 = src1; - - LoadNose3(src0 + 0, a[0]); - LoadNose3(src2 + 0, a[2]); - SobelDy(a, dst + 0); - for (size_t col = A; col < bodyWidth; col += A) - { - LoadBody3(src0 + col, a[0]); - LoadBody3(src2 + col, a[2]); - SobelDy(a, dst + col); - } - LoadTail3(src0 + width - A, a[0]); - LoadTail3(src2 + width - A, a[2]); - SobelDy(a, dst + width - A); - - dst += dstStride; - } - } - - void SobelDy(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride) - { - assert(dstStride % sizeof(int16_t) == 0); - - if (Aligned(src) && Aligned(srcStride) && Aligned(dst) && 
Aligned(dstStride)) - SobelDy(src, srcStride, width, height, (int16_t *)dst, dstStride / sizeof(int16_t)); - else - SobelDy(src, srcStride, width, height, (int16_t *)dst, dstStride / sizeof(int16_t)); - } - - void SobelDyAbs(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride) - { - assert(dstStride % sizeof(int16_t) == 0); - - if (Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride)) - SobelDy(src, srcStride, width, height, (int16_t *)dst, dstStride / sizeof(int16_t)); - else - SobelDy(src, srcStride, width, height, (int16_t *)dst, dstStride / sizeof(int16_t)); - } - - SIMD_INLINE void SobelDyAbsSum(__m512i a[3][3], __m512i * sums) - { - __m512i lo, hi; - SobelDy(a, lo, hi); - sums[0] = _mm512_add_epi32(sums[0], _mm512_madd_epi16(lo, K16_0001)); - sums[1] = _mm512_add_epi32(sums[1], _mm512_madd_epi16(hi, K16_0001)); - } - - template void SobelDyAbsSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum) - { - assert(width > A); - size_t bodyWidth = Simd::AlignHi(width, A) - A; - const uint8_t *src0, *src1, *src2; - - __m512i a[3][3]; - __m512i tailMask = _mm512_mask_set1_epi8(K_INV_ZERO, TailMask64(A - width + bodyWidth), 0); - - size_t blockSize = (256 * 256 * F) / width; - size_t blockCount = height / blockSize + 1; - __m512i _sum = _mm512_setzero_si512(); - for (size_t block = 0; block < blockCount; ++block) - { - __m512i sums[2] = { _mm512_setzero_si512(), _mm512_setzero_si512() }; - for (size_t row = block*blockSize, endRow = Simd::Min(row + blockSize, height); row < endRow; ++row) - { - src0 = src + stride*(row - 1); - src1 = src0 + stride; - src2 = src1 + stride; - if (row == 0) - src0 = src1; - if (row == height - 1) - src2 = src1; - - LoadNose3(src0 + 0, a[0]); - LoadNose3(src2 + 0, a[2]); - SobelDyAbsSum(a, sums); - for (size_t col = A; col < bodyWidth; col += A) - { - LoadBody3(src0 + col, a[0]); - LoadBody3(src2 + col, a[2]); - SobelDyAbsSum(a, sums); - } - LoadTail3(src0 + width - A, a[0]); - LoadTail3(src2 + width - A, a[2]); - SetMask3x3(a, tailMask); - SobelDyAbsSum(a, sums); - } - sums[0] = _mm512_add_epi32(sums[0], sums[1]); - _sum = _mm512_add_epi64(_sum, _mm512_add_epi64(_mm512_unpacklo_epi32(sums[0], K_ZERO), _mm512_unpackhi_epi32(sums[0], K_ZERO))); - } - - *sum = ExtractSum(_sum); - } - - void SobelDyAbsSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum) - { - if (Aligned(src) && Aligned(stride)) - SobelDyAbsSum(src, stride, width, height, sum); - else - SobelDyAbsSum(src, stride, width, height, sum); - } - - SIMD_INLINE __m512i ContourMetrics(__m512i dx, __m512i dy) - { - return _mm512_add_epi16(_mm512_slli_epi16(_mm512_add_epi16(dx, dy), 1), _mm512_maskz_set1_epi16(_mm512_cmpgt_epi16_mask(dy, dx), 1)); - } - - SIMD_INLINE void ContourMetrics(__m512i a[3][3], __m512i & lo, __m512i & hi) - { - __m512i dxLo, dxHi, dyLo, dyHi; - SobelDx(a, dxLo, dxHi); - SobelDy(a, dyLo, dyHi); - lo = ContourMetrics(dxLo, dyLo); - hi = ContourMetrics(dxHi, dyHi); - } - - template SIMD_INLINE void ContourMetrics(__m512i a[3][3], int16_t * dst) - { - __m512i lo, hi; - ContourMetrics(a, lo, hi); - Store(dst + 00, _mm512_permutex2var_epi64(lo, K64_PERMUTE_0, hi)); - Store(dst + HA, _mm512_permutex2var_epi64(lo, K64_PERMUTE_1, hi)); - } - - template void ContourMetrics(const uint8_t * src, size_t srcStride, size_t width, size_t height, int16_t * dst, size_t dstStride) - { - assert(width > A); - if (align) - assert(Aligned(dst) && Aligned(dstStride, HA)); - - 
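/* [Editorial sketch, not part of the deleted file; the helper name is hypothetical.]
   ContourMetrics above packs two values into each int16 result: bits 15..1 carry the L1
   gradient magnitude |dx| + |dy| (shifted left by one), and bit 0 flags the dominant
   direction, set when |dy| > |dx|. The dx/dy inputs are already absolute values at that
   point. Scalar equivalent:

   static inline int16_t ScalarContourMetrics(int16_t absDx, int16_t absDy)
   {
       return (int16_t)(((absDx + absDy) << 1) + (absDy > absDx ? 1 : 0));
   }

   The Anchor code later in this file undoes the packing with a right shift by one for the
   magnitude and an AND with K16_0001 for the direction bit. */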
size_t bodyWidth = Simd::AlignHi(width, A) - A; - const uint8_t *src0, *src1, *src2; - __m512i a[3][3]; - - for (size_t row = 0; row < height; ++row) - { - src0 = src + srcStride*(row - 1); - src1 = src0 + srcStride; - src2 = src1 + srcStride; - if (row == 0) - src0 = src1; - if (row == height - 1) - src2 = src1; - - LoadNose3(src0 + 0, a[0]); - LoadNose3(src1 + 0, a[1]); - LoadNose3(src2 + 0, a[2]); - ContourMetrics(a, dst + 0); - for (size_t col = A; col < bodyWidth; col += A) - { - LoadBody3(src0 + col, a[0]); - LoadBody3(src1 + col, a[1]); - LoadBody3(src2 + col, a[2]); - ContourMetrics(a, dst + col); - } - LoadTail3(src0 + width - A, a[0]); - LoadTail3(src1 + width - A, a[1]); - LoadTail3(src2 + width - A, a[2]); - ContourMetrics(a, dst + width - A); - - dst += dstStride; - } - } - - void ContourMetrics(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride) - { - assert(dstStride % sizeof(int16_t) == 0); - - if (Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride)) - ContourMetrics(src, srcStride, width, height, (int16_t *)dst, dstStride / sizeof(int16_t)); - else - ContourMetrics(src, srcStride, width, height, (int16_t *)dst, dstStride / sizeof(int16_t)); - } - - template SIMD_INLINE void ContourMetricsMasked(__m512i a[3][3], const uint8_t * mask, const __m512i & indexMin, int16_t * dst) - { - __m512i m = _mm512_maskz_set1_epi8(_mm512_cmpge_epu8_mask(Load((__m256i*)mask), indexMin), -1); - __m512i lo, hi; - ContourMetrics(a, lo, hi); - lo = _mm512_and_si512(lo, _mm512_unpacklo_epi8(m, m)); - hi = _mm512_and_si512(hi, _mm512_unpackhi_epi8(m, m)); - Store(dst + 00, _mm512_permutex2var_epi64(lo, K64_PERMUTE_0, hi)); - Store(dst + HA, _mm512_permutex2var_epi64(lo, K64_PERMUTE_1, hi)); - } - - template void ContourMetricsMasked(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * mask, size_t maskStride, uint8_t indexMin, int16_t * dst, size_t dstStride) - { - assert(width > A); - if (align) - assert(Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride, HA) && Aligned(mask) && Aligned(maskStride)); - - size_t bodyWidth = Simd::AlignHi(width, A) - A; - const uint8_t *src0, *src1, *src2; - __m512i _indexMin = _mm512_set1_epi8(indexMin); - __m512i a[3][3]; - - for (size_t row = 0; row < height; ++row) - { - src0 = src + srcStride*(row - 1); - src1 = src0 + srcStride; - src2 = src1 + srcStride; - if (row == 0) - src0 = src1; - if (row == height - 1) - src2 = src1; - - LoadNose3(src0 + 0, a[0]); - LoadNose3(src1 + 0, a[1]); - LoadNose3(src2 + 0, a[2]); - ContourMetricsMasked(a, mask + 0, _indexMin, dst + 0); - for (size_t col = A; col < bodyWidth; col += A) - { - LoadBody3(src0 + col, a[0]); - LoadBody3(src1 + col, a[1]); - LoadBody3(src2 + col, a[2]); - ContourMetricsMasked(a, mask + col, _indexMin, dst + col); - } - LoadTail3(src0 + width - A, a[0]); - LoadTail3(src1 + width - A, a[1]); - LoadTail3(src2 + width - A, a[2]); - ContourMetricsMasked(a, mask + width - A, _indexMin, dst + width - A); - - dst += dstStride; - mask += maskStride; - } - } - - void ContourMetricsMasked(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * mask, size_t maskStride, uint8_t indexMin, uint8_t * dst, size_t dstStride) - { - assert(dstStride % sizeof(int16_t) == 0); - - if (Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride) && Aligned(mask) && Aligned(maskStride)) - ContourMetricsMasked(src, srcStride, width, height, mask, maskStride, 
indexMin, (int16_t *)dst, dstStride / sizeof(int16_t)); - else - ContourMetricsMasked(src, srcStride, width, height, mask, maskStride, indexMin, (int16_t *)dst, dstStride / sizeof(int16_t)); - } - - template SIMD_INLINE __mmask32 AnchorComponent(const int16_t * src, size_t step, const __m512i & current, const __m512i & threshold) - { - __m512i last = _mm512_srli_epi16(Load(src - step), 1); - __m512i next = _mm512_srli_epi16(Load(src + step), 1); - return _mm512_cmpge_epi16_mask(_mm512_sub_epi16(current, last), threshold) & _mm512_cmpge_epi16_mask(_mm512_sub_epi16(current, next), threshold); - } - - template SIMD_INLINE __mmask32 Anchor(const int16_t * src, size_t stride, const __m512i & threshold) - { - __m512i _src = Load(src); - __m512i magnitude = _mm512_srli_epi16(_src, 1); - __mmask32 direction = _mm512_cmpeq_epi16_mask(_mm512_and_si512(_src, K16_0001), K16_0001); - __mmask32 vertical = AnchorComponent(src, 1, magnitude, threshold) & direction; - __mmask32 horizontal = AnchorComponent(src, stride, magnitude, threshold) & (~direction); - return _mm512_cmpneq_epi16_mask(magnitude, K_ZERO) & (vertical | horizontal); - } - - template SIMD_INLINE void Anchor(const int16_t * src, size_t stride, const __m512i & threshold, uint8_t * dst) - { - __mmask32 lo = Anchor(src + 00, stride, threshold); - __mmask32 hi = Anchor(src + HA, stride, threshold); - Store(dst, _mm512_maskz_set1_epi8(__mmask64(lo) | (__mmask64(hi) << 32), -1)); - } - - template void ContourAnchors(const int16_t * src, size_t srcStride, size_t width, size_t height, - size_t step, int16_t threshold, uint8_t * dst, size_t dstStride) - { - assert(width > A); - if (align) - assert(Aligned(src) && Aligned(srcStride, HA) && Aligned(dst) && Aligned(dstStride)); - - size_t bodyWidth = Simd::AlignHi(width, A) - A; - __m512i _threshold = _mm512_set1_epi16(threshold); - memset(dst, 0, width); - memset(dst + dstStride*(height - 1), 0, width); - src += srcStride; - dst += dstStride; - for (size_t row = 1; row < height - 1; row += step) - { - dst[0] = 0; - Anchor(src + 1, srcStride, _threshold, dst + 1); - for (size_t col = A; col < bodyWidth; col += A) - Anchor(src + col, srcStride, _threshold, dst + col); - Anchor(src + width - A - 1, srcStride, _threshold, dst + width - A - 1); - dst[width - 1] = 0; - src += step*srcStride; - dst += step*dstStride; - } - } - - void ContourAnchors(const uint8_t * src, size_t srcStride, size_t width, size_t height, - size_t step, int16_t threshold, uint8_t * dst, size_t dstStride) - { - assert(srcStride % sizeof(int16_t) == 0); - - if (Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride)) - ContourAnchors((const int16_t *)src, srcStride / sizeof(int16_t), width, height, step, threshold, dst, dstStride); - else - ContourAnchors((const int16_t *)src, srcStride / sizeof(int16_t), width, height, step, threshold, dst, dstStride); - } - } -#endif// SIMD_AVX512BW_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx512bwSquaredDifferenceSum.cpp b/src/3rd/Simd/Simd/SimdAvx512bwSquaredDifferenceSum.cpp deleted file mode 100644 index 4acf40e2..00000000 --- a/src/3rd/Simd/Simd/SimdAvx512bwSquaredDifferenceSum.cpp +++ /dev/null @@ -1,170 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdExtract.h" -#include "Simd/SimdSet.h" -#include "Simd/SimdStore.h" - -namespace Simd -{ -#ifdef SIMD_AVX512BW_ENABLE - namespace Avx512bw - { - SIMD_INLINE __m512i SquaredDifference(const __m512i & a, const __m512i & b) - { - const __m512i lo = SubUnpackedU8<0>(a, b); - const __m512i hi = SubUnpackedU8<1>(a, b); - return _mm512_add_epi32(_mm512_madd_epi16(lo, lo), _mm512_madd_epi16(hi, hi)); - } - - template SIMD_INLINE void SquaredDifferenceSum(const uint8_t * a, const uint8_t * b, __m512i * sums, __mmask64 tail = -1) - { - const __m512i _a = Load(a, tail); - const __m512i _b = Load(b, tail); - sums[0] = _mm512_add_epi32(sums[0], SquaredDifference(_a, _b)); - } - - template SIMD_INLINE void SquaredDifferenceSum4(const uint8_t * a, const uint8_t * b, __m512i * sums) - { - sums[0] = _mm512_add_epi32(sums[0], SquaredDifference(Load(a + A * 0), Load(b + A * 0))); - sums[1] = _mm512_add_epi32(sums[1], SquaredDifference(Load(a + A * 1), Load(b + A * 1))); - sums[2] = _mm512_add_epi32(sums[2], SquaredDifference(Load(a + A * 2), Load(b + A * 2))); - sums[3] = _mm512_add_epi32(sums[3], SquaredDifference(Load(a + A * 3), Load(b + A * 3))); - } - - template void SquaredDifferenceSum( - const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride, - size_t width, size_t height, uint64_t * sum) - { - assert(width < 256 * 256 * F); - if (align) - assert(Aligned(a) && Aligned(aStride) && Aligned(b) && Aligned(bStride)); - - size_t alignedWidth = Simd::AlignLo(width, A); - size_t fullAlignedWidth = Simd::AlignLo(width, QA); - __mmask64 tailMask = TailMask64(width - alignedWidth); - - size_t blockSize = (256 * 256 * F) / width; - size_t blockCount = height / blockSize + 1; - __m512i _sum = _mm512_setzero_si512(); - for (size_t block = 0; block < blockCount; ++block) - { - __m512i sums[4] = { _mm512_setzero_si512(), _mm512_setzero_si512(), _mm512_setzero_si512(), _mm512_setzero_si512() }; - for (size_t row = block*blockSize, endRow = Simd::Min(row + blockSize, height); row < endRow; ++row) - { - size_t col = 0; - for (; col < fullAlignedWidth; col += QA) - SquaredDifferenceSum4(a + col, b + col, sums); - for (; col < alignedWidth; col += A) - SquaredDifferenceSum(a + col, b + col, sums); - if (col < width) - SquaredDifferenceSum(a + col, b + col, sums, tailMask); - a += aStride; - b += bStride; - } - sums[0] = _mm512_add_epi32(_mm512_add_epi32(sums[0], sums[1]), _mm512_add_epi32(sums[2], 
sums[3])); - _sum = _mm512_add_epi64(_sum, _mm512_add_epi64(_mm512_unpacklo_epi32(sums[0], K_ZERO), _mm512_unpackhi_epi32(sums[0], K_ZERO))); - } - *sum = ExtractSum(_sum); - } - - void SquaredDifferenceSum(const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride, - size_t width, size_t height, uint64_t * sum) - { - if (Aligned(a) && Aligned(aStride) && Aligned(b) && Aligned(bStride)) - SquaredDifferenceSum(a, aStride, b, bStride, width, height, sum); - else - SquaredDifferenceSum(a, aStride, b, bStride, width, height, sum); - } - - template SIMD_INLINE void SquaredDifferenceSumMasked(const uint8_t * a, const uint8_t * b, const uint8_t * m, const __m512i & index, __m512i * sums, __mmask64 tail) - { - const __mmask64 mask = _mm512_cmpeq_epi8_mask((Load(m, tail)), index) & tail; - const __m512i _a = Load(a, mask); - const __m512i _b = Load(b, mask); - sums[0] = _mm512_add_epi32(sums[0], SquaredDifference(_a, _b)); - } - - template SIMD_INLINE void SquaredDifferenceSumMasked(const uint8_t * a, const uint8_t * b, const uint8_t * m, const __m512i & index, __m512i * sums) - { - const __mmask64 mask = _mm512_cmpeq_epi8_mask((Load(m + A * idx)), index); - const __m512i _a = Load(a + A * idx, mask); - const __m512i _b = Load(b + A * idx, mask); - sums[idx] = _mm512_add_epi32(sums[idx], SquaredDifference(_b, _a)); - } - - template void SquaredDifferenceSumMasked( - const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, - const uint8_t * mask, size_t maskStride, uint8_t index, size_t width, size_t height, uint64_t * sum) - { - assert(width < 256 * 256 * F); - if (align) - assert(Aligned(a) && Aligned(aStride) && Aligned(b) && Aligned(bStride) && Aligned(mask) && Aligned(maskStride)); - - size_t alignedWidth = Simd::AlignLo(width, A); - size_t fullAlignedWidth = Simd::AlignLo(width, QA); - __mmask64 tailMask = TailMask64(width - alignedWidth); - __m512i _index = _mm512_set1_epi8(index); - size_t blockSize = (256 * 256 * F) / width; - size_t blockCount = height / blockSize + 1; - __m512i _sum = _mm512_setzero_si512(); - for (size_t block = 0; block < blockCount; ++block) - { - __m512i sums[4] = { _mm512_setzero_si512(), _mm512_setzero_si512(), _mm512_setzero_si512(), _mm512_setzero_si512() }; - for (size_t row = block*blockSize, endRow = Simd::Min(row + blockSize, height); row < endRow; ++row) - { - size_t col = 0; - for (; col < fullAlignedWidth; col += QA) - { - - SquaredDifferenceSumMasked(a + col, b + col, mask + col, _index, sums); - SquaredDifferenceSumMasked(a + col, b + col, mask + col, _index, sums); - SquaredDifferenceSumMasked(a + col, b + col, mask + col, _index, sums); - SquaredDifferenceSumMasked(a + col, b + col, mask + col, _index, sums); - } - for (; col < alignedWidth; col += A) - SquaredDifferenceSumMasked(a + col, b + col, mask + col, _index, sums, -1); - if (col < width) - SquaredDifferenceSumMasked(a + col, b + col, mask + col, _index, sums, tailMask); - a += aStride; - b += bStride; - mask += maskStride; - } - sums[0] = _mm512_add_epi32(_mm512_add_epi32(sums[0], sums[1]), _mm512_add_epi32(sums[2], sums[3])); - _sum = _mm512_add_epi64(_sum, _mm512_add_epi64(_mm512_unpacklo_epi32(sums[0], K_ZERO), _mm512_unpackhi_epi32(sums[0], K_ZERO))); - } - *sum = ExtractSum(_sum); - } - - void SquaredDifferenceSumMasked(const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride, - const uint8_t *mask, size_t maskStride, uint8_t index, size_t width, size_t height, uint64_t * sum) - { - if (Aligned(a) && Aligned(aStride) && Aligned(b) && Aligned(bStride) && 
Aligned(mask) && Aligned(maskStride)) - SquaredDifferenceSumMasked(a, aStride, b, bStride, mask, maskStride, index, width, height, sum); - else - SquaredDifferenceSumMasked(a, aStride, b, bStride, mask, maskStride, index, width, height, sum); - } - } -#endif// SIMD_AVX2_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx512bwStatistic.cpp b/src/3rd/Simd/Simd/SimdAvx512bwStatistic.cpp deleted file mode 100644 index a05f5d4f..00000000 --- a/src/3rd/Simd/Simd/SimdAvx512bwStatistic.cpp +++ /dev/null @@ -1,506 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2019 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdExtract.h" -#include "Simd/SimdSet.h" - -namespace Simd -{ -#ifdef SIMD_AVX512BW_ENABLE - namespace Avx512bw - { - template SIMD_INLINE void GetStatistic(const uint8_t * src, __m512i & min, __m512i & max, __m512i & sum) - { - const __m512i _src = Load(src); - min = _mm512_min_epu8(min, _src); - max = _mm512_max_epu8(max, _src); - sum = _mm512_add_epi64(_mm512_sad_epu8(_src, K_ZERO), sum); - } - - template SIMD_INLINE void GetStatistic(const uint8_t * src, __m512i & min, __m512i & max, __m512i & sum, __mmask64 tail) - { - const __m512i _src = Load(src, tail); - min = _mm512_mask_min_epu8(min, tail, min, _src); - max = _mm512_mask_max_epu8(max, tail, max, _src); - sum = _mm512_add_epi64(_mm512_sad_epu8(_src, K_ZERO), sum); - } - - template void GetStatistic(const uint8_t * src, size_t stride, size_t width, size_t height, - uint8_t * min, uint8_t * max, uint8_t * average) - { - assert(width*height && width >= A); - if (align) - assert(Aligned(src) && Aligned(stride)); - - size_t alignedWidth = Simd::AlignLo(width, A); - __mmask64 tailMask = TailMask64(width - alignedWidth); - - __m512i sum = _mm512_setzero_si512(); - __m512i min512 = _mm512_set1_epi8(-1); - __m512i max512 = _mm512_set1_epi8(0); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < alignedWidth; col += A) - GetStatistic(src + col, min512, max512, sum); - if (col < width) - GetStatistic(src + col, min512, max512, sum, tailMask); - src += stride; - } - - __m128i min128 = _mm_min_epu8(_mm_min_epu8(_mm512_extracti32x4_epi32(min512, 0), _mm512_extracti32x4_epi32(min512, 1)), - _mm_min_epu8(_mm512_extracti32x4_epi32(min512, 2), _mm512_extracti32x4_epi32(min512, 3))); - __m128i max128 = _mm_max_epu8(_mm_max_epu8(_mm512_extracti32x4_epi32(max512, 0), 
_mm512_extracti32x4_epi32(max512, 1)), - _mm_max_epu8(_mm512_extracti32x4_epi32(max512, 2), _mm512_extracti32x4_epi32(max512, 3))); - - uint8_t min_buffer[Sse2::A], max_buffer[Sse2::A]; - Sse2::Store((__m128i*)min_buffer, min128); - Sse2::Store((__m128i*)max_buffer, max128); - *min = UCHAR_MAX; - *max = 0; - for (size_t i = 0; i < Sse2::A; ++i) - { - *min = Base::MinU8(min_buffer[i], *min); - *max = Base::MaxU8(max_buffer[i], *max); - } - *average = (uint8_t)((ExtractSum(sum) + width*height / 2) / (width*height)); - } - - void GetStatistic(const uint8_t * src, size_t stride, size_t width, size_t height, - uint8_t * min, uint8_t * max, uint8_t * average) - { - if (Aligned(src) && Aligned(stride)) - GetStatistic(src, stride, width, height, min, max, average); - else - GetStatistic(src, stride, width, height, min, max, average); - } - - template void GetRowSums(const uint8_t * src, size_t stride, size_t width, size_t height, uint32_t * sums) - { - size_t alignedWidth = AlignLo(width, A); - __mmask64 tailMask = TailMask64(width - alignedWidth); - - memset(sums, 0, sizeof(uint32_t)*height); - for (size_t row = 0; row < height; ++row) - { - __m512i sum = _mm512_setzero_si512(); - size_t col = 0; - for (; col < alignedWidth; col += A) - sum = _mm512_add_epi32(sum, _mm512_sad_epu8(Load(src + col), K_ZERO)); - if (col < width) - sum = _mm512_add_epi32(sum, _mm512_sad_epu8(Load(src + col, tailMask), K_ZERO)); - sums[row] = ExtractSum(sum); - src += stride; - } - } - - void GetRowSums(const uint8_t * src, size_t stride, size_t width, size_t height, uint32_t * sums) - { - if (Aligned(src) && Aligned(stride)) - GetRowSums(src, stride, width, height, sums); - else - GetRowSums(src, stride, width, height, sums); - } - - namespace - { - struct Buffer - { - Buffer(size_t width) - { - _p = Allocate(sizeof(uint16_t)*width + sizeof(uint32_t)*width); - sums16 = (uint16_t*)_p; - sums32 = (uint32_t*)(sums16 + width); - } - - ~Buffer() - { - Free(_p); - } - - uint16_t * sums16; - uint32_t * sums32; - private: - void *_p; - }; - } - - const __m512i K32_PERMUTE_FOR_COL_SUMS = SIMD_MM512_SETR_EPI32(0x0, 0x8, 0x4, 0xC, 0x1, 0x9, 0x5, 0xD, 0x2, 0xA, 0x6, 0xE, 0x3, 0xB, 0x7, 0xF); - - template SIMD_INLINE void GetColSum16(const uint8_t * src, uint16_t * dst, __mmask64 tail = -1) - { - __m512i _src = _mm512_permutexvar_epi32(K32_PERMUTE_FOR_COL_SUMS, (Load(src, tail))); - Store(dst + 00, _mm512_add_epi16(Load(dst + 00), _mm512_unpacklo_epi8(_src, K_ZERO))); - Store(dst + HA, _mm512_add_epi16(Load(dst + HA), _mm512_unpackhi_epi8(_src, K_ZERO))); - } - - SIMD_INLINE void Sum16To32(const uint16_t * src, uint32_t * dst) - { - __m512i lo = Load(src + 00); - __m512i hi = Load(src + HA); - Store(dst + 0 * F, _mm512_add_epi32(Load(dst + 0 * F), _mm512_unpacklo_epi16(lo, K_ZERO))); - Store(dst + 1 * F, _mm512_add_epi32(Load(dst + 1 * F), _mm512_unpacklo_epi16(hi, K_ZERO))); - Store(dst + 2 * F, _mm512_add_epi32(Load(dst + 2 * F), _mm512_unpackhi_epi16(lo, K_ZERO))); - Store(dst + 3 * F, _mm512_add_epi32(Load(dst + 3 * F), _mm512_unpackhi_epi16(hi, K_ZERO))); - } - - template void GetColSums(const uint8_t * src, size_t stride, size_t width, size_t height, uint32_t * sums) - { - size_t alignedLoWidth = AlignLo(width, A); - __mmask64 tailMask = TailMask64(width - alignedLoWidth); - size_t alignedHiWidth = AlignHi(width, A); - size_t stepSize = SCHAR_MAX + 1; - size_t stepCount = (height + SCHAR_MAX) / stepSize; - - Buffer buffer(alignedHiWidth); - memset(buffer.sums32, 0, sizeof(uint32_t)*alignedHiWidth); - for (size_t step = 0; 
step < stepCount; ++step) - { - size_t rowStart = step*stepSize; - size_t rowEnd = Min(rowStart + stepSize, height); - memset(buffer.sums16, 0, sizeof(uint16_t)*alignedHiWidth); - for (size_t row = rowStart; row < rowEnd; ++row) - { - size_t col = 0; - for (; col < alignedLoWidth; col += A) - GetColSum16(src + col, buffer.sums16 + col); - if (col < width) - GetColSum16(src + col, buffer.sums16 + col, tailMask); - src += stride; - } - for (size_t col = 0; col < alignedHiWidth; col += A) - Sum16To32(buffer.sums16 + col, buffer.sums32 + col); - } - memcpy(sums, buffer.sums32, sizeof(uint32_t)*width); - } - - void GetColSums(const uint8_t * src, size_t stride, size_t width, size_t height, uint32_t * sums) - { - if (Aligned(src) && Aligned(stride)) - GetColSums(src, stride, width, height, sums); - else - GetColSums(src, stride, width, height, sums); - } - - template void GetAbsDyRowSums(const uint8_t * src0, const uint8_t * src1, __m512i & sum, __mmask64 tail = -1) - { - __m512i _src0 = Load(src0, tail); - __m512i _src1 = Load(src1, tail); - sum = _mm512_add_epi32(sum, _mm512_sad_epu8(_src0, _src1)); - } - - template void GetAbsDyRowSums(const uint8_t * src, size_t stride, size_t width, size_t height, uint32_t * sums) - { - size_t alignedWidth = AlignLo(width, A); - __mmask64 tailMask = TailMask64(width - alignedWidth); - - memset(sums, 0, sizeof(uint32_t)*height); - const uint8_t * src0 = src; - const uint8_t * src1 = src + stride; - height--; - for (size_t row = 0; row < height; ++row) - { - __m512i sum = _mm512_setzero_si512(); - size_t col = 0; - for (; col < alignedWidth; col += A) - GetAbsDyRowSums(src0 + col, src1 + col, sum); - if (col < width) - GetAbsDyRowSums(src0 + col, src1 + col, sum, tailMask); - sums[row] = ExtractSum(sum); - src0 += stride; - src1 += stride; - } - } - - void GetAbsDyRowSums(const uint8_t * src, size_t stride, size_t width, size_t height, uint32_t * sums) - { - if (Aligned(src) && Aligned(stride)) - GetAbsDyRowSums(src, stride, width, height, sums); - else - GetAbsDyRowSums(src, stride, width, height, sums); - } - - template SIMD_INLINE void GetAbsDxColSum16(const uint8_t * src, uint16_t * dst, __mmask64 tail = -1) - { - __m512i src0 = Load(src + 0, tail); - __m512i src1 = Load(src + 1, tail); - __m512i absDiff = _mm512_permutexvar_epi32(K32_PERMUTE_FOR_COL_SUMS, AbsDifferenceU8(src0, src1)); - Store(dst + 00, _mm512_add_epi16(Load(dst + 00), _mm512_unpacklo_epi8(absDiff, K_ZERO))); - Store(dst + HA, _mm512_add_epi16(Load(dst + HA), _mm512_unpackhi_epi8(absDiff, K_ZERO))); - } - - template void GetAbsDxColSums(const uint8_t * src, size_t stride, size_t width, size_t height, uint32_t * sums) - { - width--; - size_t alignedLoWidth = AlignLo(width, A); - __mmask64 tailMask = TailMask64(width - alignedLoWidth); - size_t alignedHiWidth = AlignHi(width, A); - size_t stepSize = SCHAR_MAX + 1; - size_t stepCount = (height + SCHAR_MAX) / stepSize; - - Buffer buffer(alignedHiWidth); - memset(buffer.sums32, 0, sizeof(uint32_t)*alignedHiWidth); - for (size_t step = 0; step < stepCount; ++step) - { - size_t rowStart = step*stepSize; - size_t rowEnd = Min(rowStart + stepSize, height); - memset(buffer.sums16, 0, sizeof(uint16_t)*alignedHiWidth); - for (size_t row = rowStart; row < rowEnd; ++row) - { - size_t col = 0; - for (; col < alignedLoWidth; col += A) - GetAbsDxColSum16(src + col, buffer.sums16 + col); - if (col < width) - GetAbsDxColSum16(src + col, buffer.sums16 + col, tailMask); - src += stride; - } - for (size_t col = 0; col < alignedHiWidth; col += A) - 
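/* [Editorial note, not part of the deleted file.] The two-level Buffer above exists to
   avoid overflow: stepSize = SCHAR_MAX + 1 = 128, so each uint16_t counter in
   buffer.sums16 accumulates at most 128 rows of 8-bit values, and 128 * 255 = 32640 fits
   comfortably in 16 bits. After each band of up to 128 rows, Sum16To32 (the call just
   below) widens the 16-bit partial sums into the uint32_t accumulators in buffer.sums32. */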
Sum16To32(buffer.sums16 + col, buffer.sums32 + col); - } - memcpy(sums, buffer.sums32, sizeof(uint32_t)*width); - sums[width] = 0; - } - - void GetAbsDxColSums(const uint8_t * src, size_t stride, size_t width, size_t height, uint32_t * sums) - { - if (Aligned(src) && Aligned(stride)) - GetAbsDxColSums(src, stride, width, height, sums); - else - GetAbsDxColSums(src, stride, width, height, sums); - } - - template void ValueSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum) - { - if (align) - assert(Aligned(src) && Aligned(stride)); - - size_t alignedWidth = AlignLo(width, A); - size_t fullAlignedWidth = AlignLo(width, QA); - __mmask64 tailMask = TailMask64(width - alignedWidth); - __m512i sums[4] = { _mm512_setzero_si512(), _mm512_setzero_si512(), _mm512_setzero_si512(), _mm512_setzero_si512() }; - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < fullAlignedWidth; col += QA) - { - sums[0] = _mm512_add_epi64(sums[0], _mm512_sad_epu8(Load(src + col + 0 * A), K_ZERO)); - sums[1] = _mm512_add_epi64(sums[1], _mm512_sad_epu8(Load(src + col + 1 * A), K_ZERO)); - sums[2] = _mm512_add_epi64(sums[2], _mm512_sad_epu8(Load(src + col + 2 * A), K_ZERO)); - sums[3] = _mm512_add_epi64(sums[3], _mm512_sad_epu8(Load(src + col + 3 * A), K_ZERO)); - } - for (; col < alignedWidth; col += A) - sums[0] = _mm512_add_epi64(sums[0], _mm512_sad_epu8(Load(src + col), K_ZERO)); - if (col < width) - sums[0] = _mm512_add_epi64(sums[0], _mm512_sad_epu8(Load(src + col, tailMask), K_ZERO)); - src += stride; - } - *sum = ExtractSum(_mm512_add_epi64(_mm512_add_epi64(sums[0], sums[1]), _mm512_add_epi64(sums[2], sums[3]))); - } - - void ValueSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum) - { - if (Aligned(src) && Aligned(stride)) - ValueSum(src, stride, width, height, sum); - else - ValueSum(src, stride, width, height, sum); - } - - SIMD_INLINE __m512i SquareSum(__m512i value) - { - const __m512i lo = _mm512_unpacklo_epi8(value, K_ZERO); - const __m512i hi = _mm512_unpackhi_epi8(value, K_ZERO); - return _mm512_add_epi32(_mm512_madd_epi16(lo, lo), _mm512_madd_epi16(hi, hi)); - } - - template void SquareSum(const uint8_t * src, __m512i * sums, __mmask64 tail = -1) - { - sums[0] = _mm512_add_epi32(sums[0], SquareSum(Load(src, tail))); - } - - template void SquareSum4(const uint8_t * src, __m512i * sums) - { - sums[0] = _mm512_add_epi32(sums[0], SquareSum(Load(src + 0 * A))); - sums[1] = _mm512_add_epi32(sums[1], SquareSum(Load(src + 1 * A))); - sums[2] = _mm512_add_epi32(sums[2], SquareSum(Load(src + 2 * A))); - sums[3] = _mm512_add_epi32(sums[3], SquareSum(Load(src + 3 * A))); - } - - template void SquareSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum) - { - assert(width < 256 * 256 * F); - if (align) - assert(Aligned(src) && Aligned(stride)); - - size_t alignedWidth = Simd::AlignLo(width, A); - size_t fullAlignedWidth = Simd::AlignLo(width, QA); - __mmask64 tailMask = TailMask64(width - alignedWidth); - size_t blockSize = (256 * 256 * F) / width; - size_t blockCount = height / blockSize + 1; - __m512i _sum = _mm512_setzero_si512(); - for (size_t block = 0; block < blockCount; ++block) - { - __m512i sums[4] = { _mm512_setzero_si512(), _mm512_setzero_si512(), _mm512_setzero_si512(), _mm512_setzero_si512() }; - for (size_t row = block*blockSize, endRow = Simd::Min(row + blockSize, height); row < endRow; ++row) - { - size_t col = 0; - for (; col < fullAlignedWidth; col += QA) - SquareSum4(src + 
col, sums); - for (; col < alignedWidth; col += A) - SquareSum(src + col, sums); - if (col < width) - SquareSum(src + col, sums, tailMask); - src += stride; - } - _sum = _mm512_add_epi64(_sum, HorizontalSum32(_mm512_add_epi32(_mm512_add_epi32(sums[0], sums[1]), _mm512_add_epi32(sums[2], sums[3])))); - } - *sum = ExtractSum(_sum); - } - - void SquareSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum) - { - if (Aligned(src) && Aligned(stride)) - SquareSum(src, stride, width, height, sum); - else - SquareSum(src, stride, width, height, sum); - } - - template void ValueSquareSum(const __m512i & value, __m512i * valueSums, __m512i * squareSums) - { - valueSums[index] = _mm512_add_epi64(valueSums[index], _mm512_sad_epu8(value, K_ZERO)); - squareSums[index] = _mm512_add_epi32(squareSums[index], SquareSum(value)); - } - - template void ValueSquareSum4(const uint8_t * src, __m512i * valueSums, __m512i * squareSums) - { - ValueSquareSum<0>(Load(src + 0 * A), valueSums, squareSums); - ValueSquareSum<1>(Load(src + 1 * A), valueSums, squareSums); - ValueSquareSum<2>(Load(src + 2 * A), valueSums, squareSums); - ValueSquareSum<3>(Load(src + 3 * A), valueSums, squareSums); - } - - template void ValueSquareSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * valueSum, uint64_t * squareSum) - { - assert(width < 256 * 256 * F); - if (align) - assert(Aligned(src) && Aligned(stride)); - - size_t alignedWidth = Simd::AlignLo(width, A); - size_t fullAlignedWidth = Simd::AlignLo(width, QA); - __mmask64 tailMask = TailMask64(width - alignedWidth); - size_t blockSize = (256 * 256 * F) / width; - size_t blockCount = height / blockSize + 1; - __m512i valueSums[4] = { _mm512_setzero_si512(), _mm512_setzero_si512(), _mm512_setzero_si512(), _mm512_setzero_si512() }; - __m512i fullSquareSum = _mm512_setzero_si512(); - for (size_t block = 0; block < blockCount; ++block) - { - __m512i squareSums[4] = { _mm512_setzero_si512(), _mm512_setzero_si512(), _mm512_setzero_si512(), _mm512_setzero_si512() }; - for (size_t row = block * blockSize, endRow = Simd::Min(row + blockSize, height); row < endRow; ++row) - { - size_t col = 0; - for (; col < fullAlignedWidth; col += QA) - ValueSquareSum4(src + col, valueSums, squareSums); - for (; col < alignedWidth; col += A) - ValueSquareSum<0>(Load(src + col), valueSums, squareSums); - if (col < width) - ValueSquareSum<0>(Load(src + col, tailMask), valueSums, squareSums); - src += stride; - } - fullSquareSum = _mm512_add_epi64(fullSquareSum, HorizontalSum32( - _mm512_add_epi32(_mm512_add_epi32(squareSums[0], squareSums[1]), _mm512_add_epi32(squareSums[2], squareSums[3])))); - } - *valueSum = ExtractSum(_mm512_add_epi64(_mm512_add_epi64(valueSums[0], valueSums[1]), _mm512_add_epi64(valueSums[2], valueSums[3]))); - *squareSum = ExtractSum(fullSquareSum); - } - - void ValueSquareSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * valueSum, uint64_t * squareSum) - { - if (Aligned(src) && Aligned(stride)) - ValueSquareSum(src, stride, width, height, valueSum, squareSum); - else - ValueSquareSum(src, stride, width, height, valueSum, squareSum); - } - - SIMD_INLINE __m512i CorrelationSum(__m512i a, __m512i b) - { - const __m512i lo = _mm512_madd_epi16(_mm512_unpacklo_epi8(a, _mm512_setzero_si512()), _mm512_unpacklo_epi8(b, _mm512_setzero_si512())); - const __m512i hi = _mm512_madd_epi16(_mm512_unpackhi_epi8(a, _mm512_setzero_si512()), _mm512_unpackhi_epi8(b, _mm512_setzero_si512())); - return 
_mm512_add_epi32(lo, hi); - } - - template void CorrelationSum(const uint8_t * a, const uint8_t * b, __m512i * sums, __mmask64 tail = -1) - { - sums[0] = _mm512_add_epi32(sums[0], CorrelationSum(Load(a, tail), Load(b, tail))); - } - - template void CorrelationSum4(const uint8_t * a, const uint8_t * b, __m512i * sums) - { - sums[0] = _mm512_add_epi32(sums[0], CorrelationSum(Load(a + 0 * A), Load(b + 0 * A))); - sums[1] = _mm512_add_epi32(sums[1], CorrelationSum(Load(a + 1 * A), Load(b + 1 * A))); - sums[2] = _mm512_add_epi32(sums[2], CorrelationSum(Load(a + 2 * A), Load(b + 2 * A))); - sums[3] = _mm512_add_epi32(sums[3], CorrelationSum(Load(a + 3 * A), Load(b + 3 * A))); - } - - template void CorrelationSum(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, size_t width, size_t height, uint64_t * sum) - { - assert(width < 256 * 256 * F); - if (align) - assert(Aligned(a) && Aligned(aStride) && Aligned(b) && Aligned(bStride)); - - size_t alignedWidth = Simd::AlignLo(width, A); - size_t fullAlignedWidth = Simd::AlignLo(width, QA); - __mmask64 tailMask = TailMask64(width - alignedWidth); - size_t blockSize = (256 * 256 * F) / width; - size_t blockCount = height / blockSize + 1; - __m512i _sum = _mm512_setzero_si512(); - for (size_t block = 0; block < blockCount; ++block) - { - __m512i sums[4] = { _mm512_setzero_si512(), _mm512_setzero_si512(), _mm512_setzero_si512(), _mm512_setzero_si512() }; - for (size_t row = block*blockSize, endRow = Simd::Min(row + blockSize, height); row < endRow; ++row) - { - size_t col = 0; - for (; col < fullAlignedWidth; col += QA) - CorrelationSum4(a + col, b + col, sums); - for (; col < alignedWidth; col += A) - CorrelationSum(a + col, b + col, sums); - if (col < width) - CorrelationSum(a + col, b + col, sums, tailMask); - a += aStride; - b += bStride; - } - _sum = _mm512_add_epi64(_sum, HorizontalSum32(_mm512_add_epi32(_mm512_add_epi32(sums[0], sums[1]), _mm512_add_epi32(sums[2], sums[3])))); - } - *sum = ExtractSum(_sum); - } - - void CorrelationSum(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, size_t width, size_t height, uint64_t * sum) - { - if (Aligned(a) && Aligned(aStride) && Aligned(b) && Aligned(bStride)) - CorrelationSum(a, aStride, b, bStride, width, height, sum); - else - CorrelationSum(a, aStride, b, bStride, width, height, sum); - } - } -#endif// SIMD_AVX512BW_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx512bwStatisticMoments.cpp b/src/3rd/Simd/Simd/SimdAvx512bwStatisticMoments.cpp deleted file mode 100644 index 18fe324f..00000000 --- a/src/3rd/Simd/Simd/SimdAvx512bwStatisticMoments.cpp +++ /dev/null @@ -1,195 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2019 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdExtract.h" -#include "Simd/SimdSet.h" - -namespace Simd -{ -#ifdef SIMD_AVX512BW_ENABLE - namespace Avx512bw - { - const __m512i K16_I = SIMD_MM512_SETR_EPI16( - 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, - 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, - 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, - 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37); - - SIMD_INLINE void GetObjectMoments16(__m512i src, __m512i col, __m512i & sx, __m512i & sxx) - { - sx = _mm512_add_epi32(sx, _mm512_madd_epi16(col, src)); - sxx = _mm512_add_epi32(sxx, _mm512_madd_epi16(src, _mm512_mullo_epi16(col, col))); - } - - SIMD_INLINE void GetObjectMoments8(__m512i src, __mmask64 mask, __m512i& col, uint64_t & n, __m512i & s, __m512i & sx, __m512i & sxx) - { - n += Popcnt64(mask); - __m512i _mask = _mm512_maskz_set1_epi8(mask, -1); - src = _mm512_and_si512(src, _mask); - s = _mm512_add_epi64(s, _mm512_sad_epu8(src, K_ZERO)); - GetObjectMoments16(_mm512_unpacklo_epi8(src, K_ZERO), col, sx, sxx); - col = _mm512_add_epi16(col, K16_0008); - GetObjectMoments16(_mm512_unpackhi_epi8(src, K_ZERO), col, sx, sxx); - col = _mm512_add_epi16(col, K16_0038); - } - - template void GetObjectMoments(const uint8_t* src, size_t srcStride, size_t width, size_t height, const uint8_t * mask, size_t maskStride, uint8_t index, - uint64_t & n, __m512i & s, __m512i & sx, __m512i & sy, __m512i & sxx, __m512i& sxy, __m512i& syy) - { - size_t widthA = AlignLo(width, A); - const size_t B = AlignLo(181, A); - size_t widthB = AlignLoAny(width, B); - __mmask64 tail = TailMask64(width - widthA); - const __m512i _index = _mm512_set1_epi8(index); - - for (size_t row = 0; row < height; ++row) - { - for (size_t colB = 0; colB < width;) - { - size_t colE = Simd::Min(colB + B, widthA); - __m512i _col = K16_I; - __m512i _s = _mm512_setzero_si512(); - __m512i _sx = _mm512_setzero_si512(); - __m512i _sxx = _mm512_setzero_si512(); - if (mask == NULL) - { - for (size_t col = colB; col < colE; col += A) - { - __m512i _src = Load(src + col); - GetObjectMoments8(_src, -1, _col, n, _s, _sx, _sxx); - } - if (colB == widthB && widthA < width) - { - __m512i _src = Load(src + widthA, tail); - GetObjectMoments8(_src, tail, _col, n, _s, _sx, _sxx); - colE = width; - } - } - else if (src == NULL) - { - for (size_t col = colB; col < colE; col += A) - { - __mmask64 _mask = _mm512_cmpeq_epi8_mask(Load(mask + col), _index); - GetObjectMoments8(K8_01, _mask, _col, n, _s, _sx, _sxx); - } - if (colB == widthB && widthA < width) - { - __mmask64 _mask = _mm512_cmpeq_epi8_mask((Load(mask + widthA, tail)), _index)&tail; - GetObjectMoments8(K8_01, _mask, _col, n, _s, _sx, _sxx); - colE = width; - } - } - else - { - for (size_t col = colB; col < colE; col += A) - { - __m512i _src = Load(src + col); - __mmask64 _mask = _mm512_cmpeq_epi8_mask(Load(mask + col), _index); - GetObjectMoments8(_src, _mask, _col, n, _s, _sx, _sxx); - } - if (colB == widthB && widthA < width) - { - __m512i _src = Load(src + widthA, tail); - __mmask64 _mask = _mm512_cmpeq_epi8_mask((Load(mask + widthA, tail)), _index) & tail; - GetObjectMoments8(_src, _mask, _col, n, _s, _sx, _sxx); - colE = width; - } - } - _sx = HorizontalSum32(_sx); - _sxx = 
HorizontalSum32(_sxx); - - __m512i _y = _mm512_set1_epi32((int32_t)row); - __m512i _x0 = _mm512_set1_epi32((int32_t)colB); - - s = _mm512_add_epi64(s, _s); - - sx = _mm512_add_epi64(sx, _sx); - __m512i _sx0 = _mm512_mul_epu32(_s, _x0); - sx = _mm512_add_epi64(sx, _sx0); - - __m512i _sy = _mm512_mul_epu32(_s, _y); - sy = _mm512_add_epi64(sy, _sy); - - sxx = _mm512_add_epi64(sxx, _sxx); - sxx = _mm512_add_epi64(sxx, _mm512_mul_epu32(_sx, _mm512_add_epi64(_x0, _x0))); - sxx = _mm512_add_epi64(sxx, _mm512_mul_epu32(_sx0, _x0)); - - sxy = _mm512_add_epi64(sxy, _mm512_mul_epu32(_sx, _y)); - sxy = _mm512_add_epi64(sxy, _mm512_mul_epu32(_sx0, _y)); - - syy = _mm512_add_epi64(syy, _mm512_mul_epu32(_sy, _y)); - - colB = colE; - } - if(src) - src += srcStride; - if(mask) - mask += maskStride; - } - } - - template void GetObjectMoments(const uint8_t* src, size_t srcStride, size_t width, size_t height, const uint8_t* mask, size_t maskStride, uint8_t index, - uint64_t* n, uint64_t* s, uint64_t* sx, uint64_t* sy, uint64_t* sxx, uint64_t* sxy, uint64_t* syy) - { - assert(src || mask); - if (align) - assert((src == NULL || (Aligned(src) && Aligned(srcStride))) && (mask == NULL || (Aligned(mask) && Aligned(maskStride)))); - - *n = 0; - __m512i _s = _mm512_setzero_si512(); - __m512i _sx = _mm512_setzero_si512(); - __m512i _sy = _mm512_setzero_si512(); - __m512i _sxx = _mm512_setzero_si512(); - __m512i _sxy = _mm512_setzero_si512(); - __m512i _syy = _mm512_setzero_si512(); - - GetObjectMoments(src, srcStride, width, height, mask, maskStride, index, *n, _s, _sx, _sy, _sxx, _sxy, _syy); - - *s = ExtractSum(_s); - *sx = ExtractSum(_sx); - *sy = ExtractSum(_sy); - *sxx = ExtractSum(_sxx); - *sxy = ExtractSum(_sxy); - *syy = ExtractSum(_syy); - } - - void GetObjectMoments(const uint8_t* src, size_t srcStride, size_t width, size_t height, const uint8_t* mask, size_t maskStride, uint8_t index, - uint64_t* n, uint64_t* s, uint64_t* sx, uint64_t* sy, uint64_t* sxx, uint64_t* sxy, uint64_t* syy) - { - if ((src == NULL || (Aligned(src) && Aligned(srcStride))) && (mask == NULL || (Aligned(mask) && Aligned(maskStride)))) - GetObjectMoments(src, srcStride, width, height, mask, maskStride, index, n, s, sx, sy, sxx, sxy, syy); - else - GetObjectMoments(src, srcStride, width, height, mask, maskStride, index, n, s, sx, sy, sxx, sxy, syy); - } - - void GetMoments(const uint8_t* mask, size_t stride, size_t width, size_t height, uint8_t index, - uint64_t* area, uint64_t* x, uint64_t* y, uint64_t* xx, uint64_t* xy, uint64_t* yy) - { - uint64_t stub; - GetObjectMoments(NULL, 0, width, height, mask, stride, index, &stub, area, x, y, xx, xy, yy); - } - } -#endif// SIMD_AVX512BW_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx512bwStretchGray2x2.cpp b/src/3rd/Simd/Simd/SimdAvx512bwStretchGray2x2.cpp deleted file mode 100644 index ddd12e31..00000000 --- a/src/3rd/Simd/Simd/SimdAvx512bwStretchGray2x2.cpp +++ /dev/null @@ -1,80 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. 
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#include "Simd/SimdMemory.h"
-#include "Simd/SimdStore.h"
-
-namespace Simd
-{
-#ifdef SIMD_AVX512BW_ENABLE
-    namespace Avx512bw
-    {
-        template<bool align, bool mask> SIMD_INLINE void StretchGray2x2(const uint8_t * src, uint8_t * dst0, uint8_t * dst1, const __mmask64 * tails)
-        {
-            __m512i _src = _mm512_permutexvar_epi64(K64_PERMUTE_FOR_UNPACK, (Load<align, mask>(src, tails[2])));
-            __m512i lo = _mm512_unpacklo_epi8(_src, _src);
-            __m512i hi = _mm512_unpackhi_epi8(_src, _src);
-            Store<align, mask>(dst0 + 0, lo, tails[0]);
-            Store<align, mask>(dst0 + A, hi, tails[1]);
-            Store<align, mask>(dst1 + 0, lo, tails[0]);
-            Store<align, mask>(dst1 + A, hi, tails[1]);
-        }
-
-        template <bool align> void StretchGray2x2(
-            const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride,
-            uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride)
-        {
-            assert(srcWidth * 2 == dstWidth && srcHeight * 2 == dstHeight);
-            if (align)
-                assert(Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride));
-
-            size_t alignedWidth = AlignLo(srcWidth, A);
-            __mmask64 tailMasks[3];
-            for (size_t c = 0; c < 2; ++c)
-                tailMasks[c] = TailMask64((srcWidth - alignedWidth) * 2 - A*c);
-            tailMasks[2] = TailMask64(srcWidth - alignedWidth);
-
-            for (size_t srcRow = 0; srcRow < srcHeight; ++srcRow)
-            {
-                uint8_t * dst1 = dst + dstStride;
-                size_t srcOffset = 0, dstOffset = 0;
-                for (; srcOffset < alignedWidth; srcOffset += A, dstOffset += DA)
-                    StretchGray2x2<align, false>(src + srcOffset, dst + dstOffset, dst1 + dstOffset, tailMasks);
-                if (srcOffset < srcWidth)
-                    StretchGray2x2<align, true>(src + srcOffset, dst + dstOffset, dst1 + dstOffset, tailMasks);
-                src += srcStride;
-                dst += 2 * dstStride;
-            }
-        }
-
-        void StretchGray2x2(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride,
-            uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride)
-        {
-            if (Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride))
-                StretchGray2x2<true>(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride);
-            else
-                StretchGray2x2<false>(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride);
-        }
-    }
-#endif// SIMD_AVX512BW_ENABLE
-}
diff --git a/src/3rd/Simd/Simd/SimdAvx512bwSynetConversion.cpp b/src/3rd/Simd/Simd/SimdAvx512bwSynetConversion.cpp
deleted file mode 100644
index b28dcc16..00000000
--- a/src/3rd/Simd/Simd/SimdAvx512bwSynetConversion.cpp
+++ /dev/null
@@ -1,673 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2020 Yermalayeu Ihar.
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdConversion.h" -#include "Simd/SimdAvx2.h" -#include "Simd/SimdLog.h" -#include "Simd/SimdSynet.h" - -namespace Simd -{ -#ifdef SIMD_AVX512BW_ENABLE - namespace Avx512bw - { - template SIMD_INLINE void SynetConvert32fTo8u(const float* src, __m512 scale, __m512 shift, uint8_t* dst, __mmask16 tail = -1) - { - __m512i i32 = _mm512_cvtps_epi32(Fmadd(Avx512f::Load(src, tail), scale, shift)); - __m512i u8 = _mm512_permutexvar_epi32(K32_PERMUTE_FOR_TWO_UNPACK, _mm512_packus_epi16(_mm512_packs_epi32(i32, K_ZERO), K_ZERO)); - Store(dst, _mm512_extracti32x4_epi32(u8, 0), tail); - } - - template SIMD_INLINE void SynetConvert32fTo8u(const float* src, const float* scale, const float* shift, uint8_t* dst, __mmask16 tail = -1) - { - SynetConvert32fTo8u(src, Avx512f::Load(scale, tail), Avx512f::Load(shift, tail), dst, tail); - } - - template void SynetConvert32fTo8uNchw(const float* src, size_t batch, size_t channels, size_t height, size_t width, const float* scale, const float* shift, uint8_t* dst) - { - if (align) - assert(Aligned(src) && Aligned(dst) && Aligned(width, A)); - - size_t widthF = AlignLo(width, F); - __mmask16 tailF = TailMask16(width - widthF); - for (size_t b = 0; b < batch; ++b) - { - for (size_t c = 0; c < channels; ++c) - { - __m512 _scale = _mm512_set1_ps(scale[c]); - __m512 _shift = _mm512_set1_ps(shift[c]); - for (size_t h = 0; h < height; ++h) - { - size_t w = 0; - for (; w < widthF; w += F) - SynetConvert32fTo8u(src + w, _scale, _shift, dst + w); - if( w < width) - SynetConvert32fTo8u(src + w, _scale, _shift, dst + w, tailF); - src += width; - dst += width; - } - } - } - } - - template void SynetConvert32fTo8uNchw(const float* src, size_t batch, size_t channels, size_t height, size_t width, const float* scale, const float* shift, uint8_t* dst) - { - if (Aligned(src) && Aligned(dst) && Aligned(width, A)) - SynetConvert32fTo8uNchw(src, batch, channels, height, width, scale, shift, dst); - else - SynetConvert32fTo8uNchw(src, batch, channels, height, width, scale, shift, dst); - } - - template void SynetConvert32fTo8uNhwc(const float* src, size_t batch, size_t channels, size_t height, size_t width, const float* scale, const float* shift, uint8_t* dst) - { - if (align) - assert(Aligned(src) && Aligned(dst) && Aligned(channels, A) && Aligned(scale) && Aligned(shift)); - - size_t channelsF = AlignLo(channels, F); - size_t widthF = AlignLo(width, F); - 
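/* [Editorial sketch, not part of the deleted file; the helper name is hypothetical and
   assumes <cmath>/<cstdint>.] SynetConvert32fTo8u above maps each float to
   u8 = saturate(round(src * scale + shift)): _mm512_cvtps_epi32 rounds (to nearest even
   in the default MXCSR mode), and _mm512_packs_epi32 followed by _mm512_packus_epi16
   clamps the result to [0, 255]. Scalar equivalent for one element:

   static inline uint8_t ScalarConvert32fTo8u(float src, float scale, float shift)
   {
       int i = (int)std::nearbyintf(src * scale + shift);
       return (uint8_t)(i < 0 ? 0 : (i > 255 ? 255 : i));
   }
*/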
__mmask16 tailF = TailMask16(channels - channelsF); - for (size_t b = 0; b < batch; ++b) - { - for (size_t h = 0; h < height; ++h) - { - size_t w = 0; - for (; w < widthF; ++w) - { - size_t c = 0; - for (; c < channelsF; c += F) - SynetConvert32fTo8u(src + c, scale + c, shift + c, dst + c); - if (c < channels) - SynetConvert32fTo8u(src + c, scale + c, shift + c, dst + c, tailF); - src += channels; - dst += channels; - } - for (; w < width; ++w) - { - size_t c = 0; - for (; c < channelsF; c += F) - SynetConvert32fTo8u(src + c, scale + c, shift + c, dst + c); - if (c < channels) - SynetConvert32fTo8u(src + c, scale + c, shift + c, dst + c, tailF); - src += channels; - dst += channels; - } - } - } - } - - template void SynetConvert32fTo8uNhwc(const float* src, size_t batch, size_t channels, size_t height, size_t width, const float* scale, const float* shift, uint8_t* dst) - { - if (Aligned(src) && Aligned(dst) && Aligned(channels, A) && Aligned(scale) && Aligned(shift)) - SynetConvert32fTo8uNhwc(src, batch, channels, height, width, scale, shift, dst); - else - SynetConvert32fTo8uNhwc(src, batch, channels, height, width, scale, shift, dst); - } - - template void SynetConvert32fTo8uNhwc3(const float* src, size_t batch, size_t height, size_t width, const float* scale, const float* shift, uint8_t* dst) - { - if (align) - assert(Aligned(src) && Aligned(dst) && Aligned(width, A)); - - size_t width3 = width * 3; - size_t width3F = AlignLo(width, F) * 3; - - float _scale[F * 3], _shift[F * 3]; - for (size_t i = 0; i < F; ++i) - for (size_t c = 0; c < 3; ++c) - _scale[i * 3 + c] = scale[c], _shift[i * 3 + c] = shift[c]; - - __m512 _scale0 = Avx512f::Load(_scale + 0 * F); - __m512 _scale1 = Avx512f::Load(_scale + 1 * F); - __m512 _scale2 = Avx512f::Load(_scale + 2 * F); - __m512 _shift0 = Avx512f::Load(_shift + 0 * F); - __m512 _shift1 = Avx512f::Load(_shift + 1 * F); - __m512 _shift2 = Avx512f::Load(_shift + 2 * F); - - for (size_t b = 0; b < batch; ++b) - { - for (size_t h = 0; h < height; ++h) - { - size_t w = 0; - for (; w < width3F; w += 3 * F) - { - SynetConvert32fTo8u(src + 0 * F, _scale0, _shift0, dst + 0 * F); - SynetConvert32fTo8u(src + 1 * F, _scale1, _shift1, dst + 1 * F); - SynetConvert32fTo8u(src + 2 * F, _scale2, _shift2, dst + 2 * F); - src += 3 * F; - dst += 3 * F; - } - for (; w < width3; w += 3) - { - dst[0] = Base::SynetConvert32fTo8u(src[0], scale[0], shift[0]); - dst[1] = Base::SynetConvert32fTo8u(src[1], scale[1], shift[1]); - dst[2] = Base::SynetConvert32fTo8u(src[2], scale[2], shift[2]); - src += 3; - dst += 3; - } - } - } - } - - template void SynetConvert32fTo8uNhwc3(const float* src, size_t batch, size_t height, size_t width, const float* scale, const float* shift, uint8_t* dst) - { - if (Aligned(src) && Aligned(dst) && Aligned(width, A)) - SynetConvert32fTo8uNhwc3(src, batch, height, width, scale, shift, dst); - else - SynetConvert32fTo8uNhwc3(src, batch, height, width, scale, shift, dst); - } - - void SynetConvert32fTo8u(const float* src, size_t batch, size_t channels, size_t height, size_t width, SimdTensorFormatType format, const float* scale, const float* shift, uint8_t* dst, SimdSynetCompatibilityType compatibility) - { - if (!(compatibility & SimdSynetCompatibilityNoFmaTail)) - { - width = height * width; - height = 1; - } - size_t spatial = height * width; - if (Base::NchwCompatible(channels, spatial, format)) - { - if (compatibility & SimdSynetCompatibilityNoFma) - SynetConvert32fTo8uNchw(src, batch, channels, height, width, scale, shift, dst); - else - 
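/* [Editorial note, not part of the deleted file.] The SimdSynetCompatibilityNoFma branch
   above selects the kernel instantiation that evaluates src * scale + shift without fused
   multiply-add, trading some speed for bit-exact agreement with non-FMA reference code;
   the else branch that follows takes the FMA-enabled instantiation. */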
SynetConvert32fTo8uNchw(src, batch, channels, height, width, scale, shift, dst); - } - else if (Base::NhwcCompatible(channels, spatial, format)) - { - if (channels == 3) - { - if (compatibility & SimdSynetCompatibilityNoFma) - SynetConvert32fTo8uNhwc3(src, batch, height, width, scale, shift, dst); - else - SynetConvert32fTo8uNhwc3(src, batch, height, width, scale, shift, dst); - } - else - { - if (compatibility & SimdSynetCompatibilityNoFma) - SynetConvert32fTo8uNhwc(src, batch, channels, height, width, scale, shift, dst); - else if (compatibility & SimdSynetCompatibilityNoFmaTail) - SynetConvert32fTo8uNhwc(src, batch, channels, height, width, scale, shift, dst); - else - SynetConvert32fTo8uNhwc(src, batch, channels, height, width, scale, shift, dst); - } - } - else - assert(0); - } - - //--------------------------------------------------------------------- - - - template SIMD_INLINE void StoreScaled(float * ptr, __m512i value32, __m512 scale, __m512 shift) - { - Avx512f::Store(ptr, _mm512_fmadd_ps(_mm512_cvtepi32_ps(value32), scale, shift)); - } - - const __m512i K16_BLUE_RED = SIMD_MM512_SET2_EPI16(Base::BLUE_TO_GRAY_WEIGHT, Base::RED_TO_GRAY_WEIGHT); - const __m512i K16_GREEN_0000 = SIMD_MM512_SET2_EPI16(Base::GREEN_TO_GRAY_WEIGHT, 0x0000); - const __m512i K32_ROUND_TERM = SIMD_MM512_SET1_EPI32(Base::BGR_TO_GRAY_ROUND_TERM); - - SIMD_INLINE __m512i BgraToGray32(__m512i bgra) - { - const __m512i g0a0 = _mm512_shuffle_epi8(bgra, K8_SUFFLE_BGRA_TO_G0A0); - const __m512i b0r0 = _mm512_and_si512(bgra, K16_00FF); - const __m512i weightedSum = _mm512_add_epi32(_mm512_madd_epi16(g0a0, K16_GREEN_0000), _mm512_madd_epi16(b0r0, K16_BLUE_RED)); - return _mm512_srli_epi32(_mm512_add_epi32(weightedSum, K32_ROUND_TERM), Base::BGR_TO_GRAY_AVERAGING_SHIFT); - } - - template SIMD_INLINE void SynetSetInput1(const uint8_t * src, __m512 scale, __m512 shift, float * dst); - - template<> SIMD_INLINE void SynetSetInput1(const uint8_t * src, __m512 scale, __m512 shift, float * dst) - { - StoreScaled(dst + 0 * F, _mm512_cvtepu8_epi32(Sse2::Load((__m128i*)src + 0)), scale, shift); - StoreScaled(dst + 1 * F, _mm512_cvtepu8_epi32(Sse2::Load((__m128i*)src + 1)), scale, shift); - StoreScaled(dst + 2 * F, _mm512_cvtepu8_epi32(Sse2::Load((__m128i*)src + 2)), scale, shift); - StoreScaled(dst + 3 * F, _mm512_cvtepu8_epi32(Sse2::Load((__m128i*)src + 3)), scale, shift); - } - - const __m512i K8_SHUFFLE_BGR_TO_BGRA = SIMD_MM512_SETR_EPI8( - 0x0, 0x1, 0x2, -1, 0x3, 0x4, 0x5, -1, 0x6, 0x7, 0x8, -1, 0x9, 0xA, 0xB, -1, - 0x0, 0x1, 0x2, -1, 0x3, 0x4, 0x5, -1, 0x6, 0x7, 0x8, -1, 0x9, 0xA, 0xB, -1, - 0x0, 0x1, 0x2, -1, 0x3, 0x4, 0x5, -1, 0x6, 0x7, 0x8, -1, 0x9, 0xA, 0xB, -1, - 0x0, 0x1, 0x2, -1, 0x3, 0x4, 0x5, -1, 0x6, 0x7, 0x8, -1, 0x9, 0xA, 0xB, -1); - - template<> SIMD_INLINE void SynetSetInput1(const uint8_t * src, __m512 scale, __m512 shift, float * dst) - { - __m512i bgr0 = Load(src + 0 * A); - __m512i bgr1 = Load(src + 1 * A); - __m512i bgr2 = Load(src + 2 * A); - const __m512i bgra0 = _mm512_permutexvar_epi32(K32_PERMUTE_BGR_TO_BGRA_0, bgr0); - const __m512i bgra1 = _mm512_permutex2var_epi32(bgr0, K32_PERMUTE_BGR_TO_BGRA_1, bgr1); - const __m512i bgra2 = _mm512_permutex2var_epi32(bgr1, K32_PERMUTE_BGR_TO_BGRA_2, bgr2); - const __m512i bgra3 = _mm512_permutexvar_epi32(K32_PERMUTE_BGR_TO_BGRA_3, bgr2); - StoreScaled(dst + 0 * F, BgraToGray32(_mm512_shuffle_epi8(bgra0, K8_SHUFFLE_BGR_TO_BGRA)), scale, shift); - StoreScaled(dst + 1 * F, BgraToGray32(_mm512_shuffle_epi8(bgra1, K8_SHUFFLE_BGR_TO_BGRA)), scale, shift); - 
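/* [Editorial note, not part of the deleted file.] BgraToGray32, used by the SynetSetInput1
   specializations above, is the usual fixed-point gray conversion:
   gray = (B * BLUE_TO_GRAY_WEIGHT + G * GREEN_TO_GRAY_WEIGHT + R * RED_TO_GRAY_WEIGHT
           + BGR_TO_GRAY_ROUND_TERM) >> BGR_TO_GRAY_AVERAGING_SHIFT.
   The B/R pair is weighted by one _mm512_madd_epi16 (K16_BLUE_RED) and the G/A pair by
   another (K16_GREEN_0000, whose second weight is zero, so the alpha byte never
   contributes). */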
-        template<SimdPixelFormatType format> SIMD_INLINE void SynetSetInput1(const uint8_t * src, __m512 scale, __m512 shift, float * dst);
-
-        template<> SIMD_INLINE void SynetSetInput1<SimdPixelFormatGray8>(const uint8_t * src, __m512 scale, __m512 shift, float * dst)
-        {
-            StoreScaled<false>(dst + 0 * F, _mm512_cvtepu8_epi32(Sse2::Load<false>((__m128i*)src + 0)), scale, shift);
-            StoreScaled<false>(dst + 1 * F, _mm512_cvtepu8_epi32(Sse2::Load<false>((__m128i*)src + 1)), scale, shift);
-            StoreScaled<false>(dst + 2 * F, _mm512_cvtepu8_epi32(Sse2::Load<false>((__m128i*)src + 2)), scale, shift);
-            StoreScaled<false>(dst + 3 * F, _mm512_cvtepu8_epi32(Sse2::Load<false>((__m128i*)src + 3)), scale, shift);
-        }
-
-        const __m512i K8_SHUFFLE_BGR_TO_BGRA = SIMD_MM512_SETR_EPI8(
-            0x0, 0x1, 0x2, -1, 0x3, 0x4, 0x5, -1, 0x6, 0x7, 0x8, -1, 0x9, 0xA, 0xB, -1,
-            0x0, 0x1, 0x2, -1, 0x3, 0x4, 0x5, -1, 0x6, 0x7, 0x8, -1, 0x9, 0xA, 0xB, -1,
-            0x0, 0x1, 0x2, -1, 0x3, 0x4, 0x5, -1, 0x6, 0x7, 0x8, -1, 0x9, 0xA, 0xB, -1,
-            0x0, 0x1, 0x2, -1, 0x3, 0x4, 0x5, -1, 0x6, 0x7, 0x8, -1, 0x9, 0xA, 0xB, -1);
-
-        template<> SIMD_INLINE void SynetSetInput1<SimdPixelFormatBgr24>(const uint8_t * src, __m512 scale, __m512 shift, float * dst)
-        {
-            __m512i bgr0 = Load<false>(src + 0 * A);
-            __m512i bgr1 = Load<false>(src + 1 * A);
-            __m512i bgr2 = Load<false>(src + 2 * A);
-            const __m512i bgra0 = _mm512_permutexvar_epi32(K32_PERMUTE_BGR_TO_BGRA_0, bgr0);
-            const __m512i bgra1 = _mm512_permutex2var_epi32(bgr0, K32_PERMUTE_BGR_TO_BGRA_1, bgr1);
-            const __m512i bgra2 = _mm512_permutex2var_epi32(bgr1, K32_PERMUTE_BGR_TO_BGRA_2, bgr2);
-            const __m512i bgra3 = _mm512_permutexvar_epi32(K32_PERMUTE_BGR_TO_BGRA_3, bgr2);
-            StoreScaled<false>(dst + 0 * F, BgraToGray32(_mm512_shuffle_epi8(bgra0, K8_SHUFFLE_BGR_TO_BGRA)), scale, shift);
-            StoreScaled<false>(dst + 1 * F, BgraToGray32(_mm512_shuffle_epi8(bgra1, K8_SHUFFLE_BGR_TO_BGRA)), scale, shift);
-            StoreScaled<false>(dst + 2 * F, BgraToGray32(_mm512_shuffle_epi8(bgra2, K8_SHUFFLE_BGR_TO_BGRA)), scale, shift);
-            StoreScaled<false>(dst + 3 * F, BgraToGray32(_mm512_shuffle_epi8(bgra3, K8_SHUFFLE_BGR_TO_BGRA)), scale, shift);
-        }
-
-        template<> SIMD_INLINE void SynetSetInput1<SimdPixelFormatBgra32>(const uint8_t * src, __m512 scale, __m512 shift, float * dst)
-        {
-            StoreScaled<false>(dst + 0 * F, BgraToGray32(Load<false>((__m512i*)src + 0)), scale, shift);
-            StoreScaled<false>(dst + 1 * F, BgraToGray32(Load<false>((__m512i*)src + 1)), scale, shift);
-            StoreScaled<false>(dst + 2 * F, BgraToGray32(Load<false>((__m512i*)src + 2)), scale, shift);
-            StoreScaled<false>(dst + 3 * F, BgraToGray32(Load<false>((__m512i*)src + 3)), scale, shift);
-        }
-
-        const __m512i K8_SHUFFLE_RGB_TO_BGRA = SIMD_MM512_SETR_EPI8(
-            0x2, 0x1, 0x0, -1, 0x5, 0x4, 0x3, -1, 0x8, 0x7, 0x6, -1, 0xB, 0xA, 0x9, -1,
-            0x2, 0x1, 0x0, -1, 0x5, 0x4, 0x3, -1, 0x8, 0x7, 0x6, -1, 0xB, 0xA, 0x9, -1,
-            0x2, 0x1, 0x0, -1, 0x5, 0x4, 0x3, -1, 0x8, 0x7, 0x6, -1, 0xB, 0xA, 0x9, -1,
-            0x2, 0x1, 0x0, -1, 0x5, 0x4, 0x3, -1, 0x8, 0x7, 0x6, -1, 0xB, 0xA, 0x9, -1);
-
-        template<> SIMD_INLINE void SynetSetInput1<SimdPixelFormatRgb24>(const uint8_t * src, __m512 scale, __m512 shift, float * dst)
-        {
-            __m512i bgr0 = Load<false>(src + 0 * A);
-            __m512i bgr1 = Load<false>(src + 1 * A);
-            __m512i bgr2 = Load<false>(src + 2 * A);
-            const __m512i bgra0 = _mm512_permutexvar_epi32(K32_PERMUTE_BGR_TO_BGRA_0, bgr0);
-            const __m512i bgra1 = _mm512_permutex2var_epi32(bgr0, K32_PERMUTE_BGR_TO_BGRA_1, bgr1);
-            const __m512i bgra2 = _mm512_permutex2var_epi32(bgr1, K32_PERMUTE_BGR_TO_BGRA_2, bgr2);
-            const __m512i bgra3 = _mm512_permutexvar_epi32(K32_PERMUTE_BGR_TO_BGRA_3, bgr2);
-            StoreScaled<false>(dst + 0 * F, BgraToGray32(_mm512_shuffle_epi8(bgra0, K8_SHUFFLE_RGB_TO_BGRA)), scale, shift);
-            StoreScaled<false>(dst + 1 * F, BgraToGray32(_mm512_shuffle_epi8(bgra1, K8_SHUFFLE_RGB_TO_BGRA)), scale, shift);
-            StoreScaled<false>(dst + 2 * F, BgraToGray32(_mm512_shuffle_epi8(bgra2, K8_SHUFFLE_RGB_TO_BGRA)), scale, shift);
-            StoreScaled<false>(dst + 3 * F, BgraToGray32(_mm512_shuffle_epi8(bgra3, K8_SHUFFLE_RGB_TO_BGRA)), scale, shift);
-        }
-
-        template<SimdPixelFormatType format, size_t step> void SynetSetInput1(const uint8_t * src, size_t width, size_t height, size_t stride, const float * scale, const float * shift, float * dst)
-        {
-            __m512 _scale = _mm512_set1_ps(scale[0]);
-            __m512 _shift = _mm512_set1_ps(shift[0]);
-            size_t aligned = AlignLo(width, A);
-            for (size_t y = 0; y < height; ++y)
-            {
-                for (size_t x = 0; x < aligned; x += A)
-                    SynetSetInput1<format>(src + step * x, _scale, _shift, dst + x);
-                if (aligned < width)
-                    SynetSetInput1<format>(src + step * (width - A), _scale, _shift, dst + width - A);
-                src += stride;
-                dst += width;
-            }
-        }
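The row loop above avoids a scalar tail entirely: when width is not a multiple of A it reprocesses the last full vector anchored at the row end, overwriting a few floats that were already computed. A sketch of the pattern for the step = 1 case, with a hypothetical ProcessBlock standing in for the per-format kernel (valid only for width >= A, which SynetSetInput asserts below):

    #include <cstddef>
    #include <cstdint>

    void ProcessBlock(const uint8_t* src, float* dst); // stands in for SynetSetInput1<format>

    void ProcessRow(const uint8_t* src, float* dst, size_t width, size_t A)
    {
        size_t aligned = width / A * A;
        for (size_t x = 0; x < aligned; x += A)
            ProcessBlock(src + x, dst + x);                 // full vector blocks
        if (aligned < width)
            ProcessBlock(src + width - A, dst + width - A); // overlapped final block
    }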
-        template<SimdPixelFormatType format> SIMD_INLINE void SynetSetInputNchw3A(const uint8_t * src, const __m512 * scale, const __m512 * shift, float * dst, size_t channel);
-
-        template<> SIMD_INLINE void SynetSetInputNchw3A<SimdPixelFormatGray8>(const uint8_t * src, const __m512 * scale, const __m512 * shift, float * dst, size_t channel)
-        {
-            __m512i gray0 = _mm512_cvtepu8_epi32(Sse2::Load<false>((__m128i*)src + 0));
-            __m512i gray1 = _mm512_cvtepu8_epi32(Sse2::Load<false>((__m128i*)src + 1));
-            __m512i gray2 = _mm512_cvtepu8_epi32(Sse2::Load<false>((__m128i*)src + 2));
-            __m512i gray3 = _mm512_cvtepu8_epi32(Sse2::Load<false>((__m128i*)src + 3));
-            StoreScaled<false>(dst + 0 * F, gray0, scale[0], shift[0]);
-            StoreScaled<false>(dst + 1 * F, gray1, scale[0], shift[0]);
-            StoreScaled<false>(dst + 2 * F, gray2, scale[0], shift[0]);
-            StoreScaled<false>(dst + 3 * F, gray3, scale[0], shift[0]);
-            dst += channel;
-            StoreScaled<false>(dst + 0 * F, gray0, scale[1], shift[1]);
-            StoreScaled<false>(dst + 1 * F, gray1, scale[1], shift[1]);
-            StoreScaled<false>(dst + 2 * F, gray2, scale[1], shift[1]);
-            StoreScaled<false>(dst + 3 * F, gray3, scale[1], shift[1]);
-            dst += channel;
-            StoreScaled<false>(dst + 0 * F, gray0, scale[2], shift[2]);
-            StoreScaled<false>(dst + 1 * F, gray1, scale[2], shift[2]);
-            StoreScaled<false>(dst + 2 * F, gray2, scale[2], shift[2]);
-            StoreScaled<false>(dst + 3 * F, gray3, scale[2], shift[2]);
-        }
-
-        const __m512i K8_SHUFFLE_BGR_TO_B32 = SIMD_MM512_SETR_EPI8(
-            0x0, -1, -1, -1, 0x3, -1, -1, -1, 0x6, -1, -1, -1, 0x9, -1, -1, -1,
-            0x0, -1, -1, -1, 0x3, -1, -1, -1, 0x6, -1, -1, -1, 0x9, -1, -1, -1,
-            0x0, -1, -1, -1, 0x3, -1, -1, -1, 0x6, -1, -1, -1, 0x9, -1, -1, -1,
-            0x0, -1, -1, -1, 0x3, -1, -1, -1, 0x6, -1, -1, -1, 0x9, -1, -1, -1);
-
-        const __m512i K8_SHUFFLE_BGR_TO_G32 = SIMD_MM512_SETR_EPI8(
-            0x1, -1, -1, -1, 0x4, -1, -1, -1, 0x7, -1, -1, -1, 0xA, -1, -1, -1,
-            0x1, -1, -1, -1, 0x4, -1, -1, -1, 0x7, -1, -1, -1, 0xA, -1, -1, -1,
-            0x1, -1, -1, -1, 0x4, -1, -1, -1, 0x7, -1, -1, -1, 0xA, -1, -1, -1,
-            0x1, -1, -1, -1, 0x4, -1, -1, -1, 0x7, -1, -1, -1, 0xA, -1, -1, -1);
-
-        const __m512i K8_SHUFFLE_BGR_TO_R32 = SIMD_MM512_SETR_EPI8(
-            0x2, -1, -1, -1, 0x5, -1, -1, -1, 0x8, -1, -1, -1, 0xB, -1, -1, -1,
-            0x2, -1, -1, -1, 0x5, -1, -1, -1, 0x8, -1, -1, -1, 0xB, -1, -1, -1,
-            0x2, -1, -1, -1, 0x5, -1, -1, -1, 0x8, -1, -1, -1, 0xB, -1, -1, -1,
-            0x2, -1, -1, -1, 0x5, -1, -1, -1, 0x8, -1, -1, -1, 0xB, -1, -1, -1);
-
-        SIMD_INLINE void SynetSetInputNchw3Bgr(__m512i bgra, const __m512 * scale, const __m512 * shift, float * dst, size_t channel)
-        {
-            StoreScaled<false>(dst + 0 * channel, _mm512_shuffle_epi8(bgra, K8_SHUFFLE_BGR_TO_B32), scale[0], shift[0]);
-            StoreScaled<false>(dst + 1 * channel, _mm512_shuffle_epi8(bgra, K8_SHUFFLE_BGR_TO_G32), scale[1], shift[1]);
-            StoreScaled<false>(dst + 2 * channel, _mm512_shuffle_epi8(bgra, K8_SHUFFLE_BGR_TO_R32), scale[2], shift[2]);
-        }
-
-        const __m512i K32_PERMUTE_BGR_TO_BGRA_BEG = SIMD_MM512_SETR_EPI32(0x0, 0x1, 0x2, -1, 0x3, 0x4, 0x5, -1, 0x6, 0x7, 0x8, -1, 0x9, 0xA, 0xB, -1);
-        const __m512i K32_PERMUTE_BGR_TO_BGRA_END = SIMD_MM512_SETR_EPI32(0x4, 0x5, 0x6, -1, 0x7, 0x8, 0x9, -1, 0xA, 0xB, 0xC, -1, 0xD, 0xE, 0xF, -1);
-
-        template<> SIMD_INLINE void SynetSetInputNchw3A<SimdPixelFormatBgr24>(const uint8_t * src, const __m512 * scale, const __m512 * shift, float * dst, size_t channel)
-        {
-            SynetSetInputNchw3Bgr(_mm512_permutexvar_epi32(K32_PERMUTE_BGR_TO_BGRA_BEG, Load<false>(src + 0 * F)), scale, shift, dst + 0 * F, channel);
-            SynetSetInputNchw3Bgr(_mm512_permutexvar_epi32(K32_PERMUTE_BGR_TO_BGRA_BEG, Load<false>(src + 3 * F)), scale, shift, dst + 1 * F, channel);
-            SynetSetInputNchw3Bgr(_mm512_permutexvar_epi32(K32_PERMUTE_BGR_TO_BGRA_BEG, Load<false>(src + 6 * F)), scale, shift, dst + 2 * F, channel);
-            SynetSetInputNchw3Bgr(_mm512_permutexvar_epi32(K32_PERMUTE_BGR_TO_BGRA_END, Load<false>(src + 8 * F)), scale, shift, dst + 3 * F, channel);
-        }
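The -1 entries in the shuffle tables above exploit a property of _mm512_shuffle_epi8: a control byte with its high bit set produces 0, which is how one byte plane is widened into 32-bit slots in a single instruction. A per-16-byte-lane scalar model of that intrinsic (illustrative):

    #include <cstdint>

    inline void ShuffleLane(const uint8_t lane[16], const int8_t ctrl[16], uint8_t out[16])
    {
        for (int i = 0; i < 16; ++i)
            out[i] = (ctrl[i] & 0x80) ? 0 : lane[ctrl[i] & 0x0F]; // high bit set -> zero
    }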
-        const __m512i K8_SHUFFLE_BGRA_TO_B32 = SIMD_MM512_SETR_EPI8(
-            0x0, -1, -1, -1, 0x4, -1, -1, -1, 0x8, -1, -1, -1, 0xC, -1, -1, -1,
-            0x0, -1, -1, -1, 0x4, -1, -1, -1, 0x8, -1, -1, -1, 0xC, -1, -1, -1,
-            0x0, -1, -1, -1, 0x4, -1, -1, -1, 0x8, -1, -1, -1, 0xC, -1, -1, -1,
-            0x0, -1, -1, -1, 0x4, -1, -1, -1, 0x8, -1, -1, -1, 0xC, -1, -1, -1);
-
-        const __m512i K8_SHUFFLE_BGRA_TO_G32 = SIMD_MM512_SETR_EPI8(
-            0x1, -1, -1, -1, 0x5, -1, -1, -1, 0x9, -1, -1, -1, 0xD, -1, -1, -1,
-            0x1, -1, -1, -1, 0x5, -1, -1, -1, 0x9, -1, -1, -1, 0xD, -1, -1, -1,
-            0x1, -1, -1, -1, 0x5, -1, -1, -1, 0x9, -1, -1, -1, 0xD, -1, -1, -1,
-            0x1, -1, -1, -1, 0x5, -1, -1, -1, 0x9, -1, -1, -1, 0xD, -1, -1, -1);
-
-        const __m512i K8_SHUFFLE_BGRA_TO_R32 = SIMD_MM512_SETR_EPI8(
-            0x2, -1, -1, -1, 0x6, -1, -1, -1, 0xA, -1, -1, -1, 0xE, -1, -1, -1,
-            0x2, -1, -1, -1, 0x6, -1, -1, -1, 0xA, -1, -1, -1, 0xE, -1, -1, -1,
-            0x2, -1, -1, -1, 0x6, -1, -1, -1, 0xA, -1, -1, -1, 0xE, -1, -1, -1,
-            0x2, -1, -1, -1, 0x6, -1, -1, -1, 0xA, -1, -1, -1, 0xE, -1, -1, -1);
-
-        SIMD_INLINE void SynetSetInputNchw3Bgra(__m512i bgra, const __m512 * scale, const __m512 * shift, float * dst, size_t channel)
-        {
-            StoreScaled<false>(dst + 0 * channel, _mm512_shuffle_epi8(bgra, K8_SHUFFLE_BGRA_TO_B32), scale[0], shift[0]);
-            StoreScaled<false>(dst + 1 * channel, _mm512_shuffle_epi8(bgra, K8_SHUFFLE_BGRA_TO_G32), scale[1], shift[1]);
-            StoreScaled<false>(dst + 2 * channel, _mm512_shuffle_epi8(bgra, K8_SHUFFLE_BGRA_TO_R32), scale[2], shift[2]);
-        }
-
-        template<> SIMD_INLINE void SynetSetInputNchw3A<SimdPixelFormatBgra32>(const uint8_t * src, const __m512 * scale, const __m512 * shift, float * dst, size_t channel)
-        {
-            SynetSetInputNchw3Bgra(Load<false>(src + 0 * A), scale, shift, dst + 0 * F, channel);
-            SynetSetInputNchw3Bgra(Load<false>(src + 1 * A), scale, shift, dst + 1 * F, channel);
-            SynetSetInputNchw3Bgra(Load<false>(src + 2 * A), scale, shift, dst + 2 * F, channel);
-            SynetSetInputNchw3Bgra(Load<false>(src + 3 * A), scale, shift, dst + 3 * F, channel);
-        }
-
-        SIMD_INLINE void SynetSetInputNchw3Rgb(__m512i bgra, const __m512 * scale, const __m512 * shift, float * dst, size_t channel)
-        {
-            StoreScaled<false>(dst + 0 * channel, _mm512_shuffle_epi8(bgra, K8_SHUFFLE_BGR_TO_R32), scale[0], shift[0]);
-            StoreScaled<false>(dst + 1 * channel, _mm512_shuffle_epi8(bgra, K8_SHUFFLE_BGR_TO_G32), scale[1], shift[1]);
-            StoreScaled<false>(dst + 2 * channel, _mm512_shuffle_epi8(bgra, K8_SHUFFLE_BGR_TO_B32), scale[2], shift[2]);
-        }
-
-        template<> SIMD_INLINE void SynetSetInputNchw3A<SimdPixelFormatRgb24>(const uint8_t * src, const __m512 * scale, const __m512 * shift, float * dst, size_t channel)
-        {
-            SynetSetInputNchw3Rgb(_mm512_permutexvar_epi32(K32_PERMUTE_BGR_TO_BGRA_BEG, Load<false>(src + 0 * F)), scale, shift, dst + 0 * F, channel);
-            SynetSetInputNchw3Rgb(_mm512_permutexvar_epi32(K32_PERMUTE_BGR_TO_BGRA_BEG, Load<false>(src + 3 * F)), scale, shift, dst + 1 * F, channel);
-            SynetSetInputNchw3Rgb(_mm512_permutexvar_epi32(K32_PERMUTE_BGR_TO_BGRA_BEG, Load<false>(src + 6 * F)), scale, shift, dst + 2 * F, channel);
-            SynetSetInputNchw3Rgb(_mm512_permutexvar_epi32(K32_PERMUTE_BGR_TO_BGRA_END, Load<false>(src + 8 * F)), scale, shift, dst + 3 * F, channel);
-        }
-
-        template<SimdPixelFormatType format> SIMD_INLINE void SynetSetInputNchw3F(const uint8_t * src, const __m512 * scale, const __m512 * shift, float * dst, size_t channel);
-
-        template<> SIMD_INLINE void SynetSetInputNchw3F<SimdPixelFormatGray8>(const uint8_t * src, const __m512 * scale, const __m512 * shift, float * dst, size_t channel)
-        {
-            __m512i gray = _mm512_cvtepu8_epi32(Sse2::Load<false>((__m128i*)src));
-            StoreScaled<false>(dst + 0 * channel, gray, scale[0], shift[0]);
-            StoreScaled<false>(dst + 1 * channel, gray, scale[1], shift[1]);
-            StoreScaled<false>(dst + 2 * channel, gray, scale[2], shift[2]);
-        }
-
-        template<> SIMD_INLINE void SynetSetInputNchw3F<SimdPixelFormatBgr24>(const uint8_t * src, const __m512 * scale, const __m512 * shift, float * dst, size_t channel)
-        {
-            SynetSetInputNchw3Bgr(_mm512_permutexvar_epi32(K32_PERMUTE_BGR_TO_BGRA_END, Load<false>(src - F)), scale, shift, dst, channel);
-        }
-
-        template<> SIMD_INLINE void SynetSetInputNchw3F<SimdPixelFormatBgra32>(const uint8_t * src, const __m512 * scale, const __m512 * shift, float * dst, size_t channel)
-        {
-            SynetSetInputNchw3Bgra(Load<false>(src), scale, shift, dst, channel);
-        }
-
-        template<> SIMD_INLINE void SynetSetInputNchw3F<SimdPixelFormatRgb24>(const uint8_t * src, const __m512 * scale, const __m512 * shift, float * dst, size_t channel)
-        {
-            SynetSetInputNchw3Rgb(_mm512_permutexvar_epi32(K32_PERMUTE_BGR_TO_BGRA_END, Load<false>(src - F)), scale, shift, dst, channel);
-        }
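Note the Load(src - F) in the Bgr24 and Rgb24 single-register variants above: 16 three-byte pixels occupy only 48 bytes, so the code loads the 64 bytes that end at src + 48 and lets K32_PERMUTE_BGR_TO_BGRA_END select the upper 48. A sketch of why this backward read stays in bounds, assuming A = 64 and F = 16 as in this file (illustrative check, not part of the source):

    #include <cassert>
    #include <cstddef>

    // The driver below only calls the F-wide kernel at x >= AlignLo(width, A) >= A,
    // so the 16 bytes before the block's source pointer always lie inside the row.
    inline void CheckBackwardLoad(size_t x, size_t width)
    {
        assert(width >= 64 && x >= 64 && x + 16 <= width);
        assert(3 * x >= 16); // byte offset of pixel x leaves room for src - 16
    }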
-        template<SimdPixelFormatType format, size_t step> void SynetSetInputNchw3(const uint8_t * src, size_t width, size_t height, size_t stride, const float * scale, const float * shift, float * dst)
-        {
-            size_t widthF = AlignLo(width, F), widthA = AlignLo(width, A), channel = width * height;
-            __m512 _scale[3], _shift[3];
-            for (size_t i = 0; i < 3; ++i)
-            {
-                _scale[i] = _mm512_set1_ps(scale[i]);
-                _shift[i] = _mm512_set1_ps(shift[i]);
-            }
-            for (size_t y = 0; y < height; ++y)
-            {
-                size_t x = 0;
-                for (; x < widthA; x += A)
-                    SynetSetInputNchw3A<format>(src + step * x, _scale, _shift, dst + x, channel);
-                for (; x < widthF; x += F)
-                    SynetSetInputNchw3F<format>(src + step * x, _scale, _shift, dst + x, channel);
-                if (widthF < width)
-                    SynetSetInputNchw3F<format>(src + step * (width - F), _scale, _shift, dst + width - F, channel);
-                src += stride;
-                dst += width;
-            }
-        }
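In the NCHW output produced above, the three planes sit width * height floats apart, which is why the kernels receive a single `channel` stride and write the same spatial offset three times. The addressing, as a one-line helper (illustrative):

    #include <cstddef>

    inline size_t NchwOffset(size_t c, size_t y, size_t x, size_t width, size_t height)
    {
        return c * width * height + y * width + x; // plane stride = width * height
    }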
-        template<SimdPixelFormatType format> SIMD_INLINE void SynetSetInputNhwc3A(const uint8_t * src, const __m512 * scale, const __m512 * shift, float * dst);
-
-        template<> SIMD_INLINE void SynetSetInputNhwc3A<SimdPixelFormatGray8>(const uint8_t * src, const __m512 * scale, const __m512 * shift, float * dst)
-        {
-            __m128i gray0 = Sse2::Load<false>((__m128i*)src + 0);
-            StoreScaled<false>(dst + 0x0 * F, _mm512_cvtepu8_epi32(_mm_shuffle_epi8(gray0, Ssse3::K8_SHUFFLE_GRAY_TO_BGR0)), scale[0], shift[0]);
-            StoreScaled<false>(dst + 0x1 * F, _mm512_cvtepu8_epi32(_mm_shuffle_epi8(gray0, Ssse3::K8_SHUFFLE_GRAY_TO_BGR1)), scale[1], shift[1]);
-            StoreScaled<false>(dst + 0x2 * F, _mm512_cvtepu8_epi32(_mm_shuffle_epi8(gray0, Ssse3::K8_SHUFFLE_GRAY_TO_BGR2)), scale[2], shift[2]);
-            __m128i gray1 = Sse2::Load<false>((__m128i*)src + 1);
-            StoreScaled<false>(dst + 0x3 * F, _mm512_cvtepu8_epi32(_mm_shuffle_epi8(gray1, Ssse3::K8_SHUFFLE_GRAY_TO_BGR0)), scale[0], shift[0]);
-            StoreScaled<false>(dst + 0x4 * F, _mm512_cvtepu8_epi32(_mm_shuffle_epi8(gray1, Ssse3::K8_SHUFFLE_GRAY_TO_BGR1)), scale[1], shift[1]);
-            StoreScaled<false>(dst + 0x5 * F, _mm512_cvtepu8_epi32(_mm_shuffle_epi8(gray1, Ssse3::K8_SHUFFLE_GRAY_TO_BGR2)), scale[2], shift[2]);
-            __m128i gray2 = Sse2::Load<false>((__m128i*)src + 2);
-            StoreScaled<false>(dst + 0x6 * F, _mm512_cvtepu8_epi32(_mm_shuffle_epi8(gray2, Ssse3::K8_SHUFFLE_GRAY_TO_BGR0)), scale[0], shift[0]);
-            StoreScaled<false>(dst + 0x7 * F, _mm512_cvtepu8_epi32(_mm_shuffle_epi8(gray2, Ssse3::K8_SHUFFLE_GRAY_TO_BGR1)), scale[1], shift[1]);
-            StoreScaled<false>(dst + 0x8 * F, _mm512_cvtepu8_epi32(_mm_shuffle_epi8(gray2, Ssse3::K8_SHUFFLE_GRAY_TO_BGR2)), scale[2], shift[2]);
-            __m128i gray3 = Sse2::Load<false>((__m128i*)src + 3);
-            StoreScaled<false>(dst + 0x9 * F, _mm512_cvtepu8_epi32(_mm_shuffle_epi8(gray3, Ssse3::K8_SHUFFLE_GRAY_TO_BGR0)), scale[0], shift[0]);
-            StoreScaled<false>(dst + 0xA * F, _mm512_cvtepu8_epi32(_mm_shuffle_epi8(gray3, Ssse3::K8_SHUFFLE_GRAY_TO_BGR1)), scale[1], shift[1]);
-            StoreScaled<false>(dst + 0xB * F, _mm512_cvtepu8_epi32(_mm_shuffle_epi8(gray3, Ssse3::K8_SHUFFLE_GRAY_TO_BGR2)), scale[2], shift[2]);
-        }
-
-        template<> SIMD_INLINE void SynetSetInputNhwc3A<SimdPixelFormatBgr24>(const uint8_t * src, const __m512 * scale, const __m512 * shift, float * dst)
-        {
-            __m512i src0 = Load<false>((__m512i*)src + 0);
-            StoreScaled<false>(dst + 0x0 * F, _mm512_cvtepu8_epi32(_mm512_extracti32x4_epi32(src0, 0)), scale[0], shift[0]);
-            StoreScaled<false>(dst + 0x1 * F, _mm512_cvtepu8_epi32(_mm512_extracti32x4_epi32(src0, 1)), scale[1], shift[1]);
-            StoreScaled<false>(dst + 0x2 * F, _mm512_cvtepu8_epi32(_mm512_extracti32x4_epi32(src0, 2)), scale[2], shift[2]);
-            StoreScaled<false>(dst + 0x3 * F, _mm512_cvtepu8_epi32(_mm512_extracti32x4_epi32(src0, 3)), scale[0], shift[0]);
-            __m512i src1 = Load<false>((__m512i*)src + 1);
-            StoreScaled<false>(dst + 0x4 * F, _mm512_cvtepu8_epi32(_mm512_extracti32x4_epi32(src1, 0)), scale[1], shift[1]);
-            StoreScaled<false>(dst + 0x5 * F, _mm512_cvtepu8_epi32(_mm512_extracti32x4_epi32(src1, 1)), scale[2], shift[2]);
-            StoreScaled<false>(dst + 0x6 * F, _mm512_cvtepu8_epi32(_mm512_extracti32x4_epi32(src1, 2)), scale[0], shift[0]);
-            StoreScaled<false>(dst + 0x7 * F, _mm512_cvtepu8_epi32(_mm512_extracti32x4_epi32(src1, 3)), scale[1], shift[1]);
-            __m512i src2 = Load<false>((__m512i*)src + 2);
-            StoreScaled<false>(dst + 0x8 * F, _mm512_cvtepu8_epi32(_mm512_extracti32x4_epi32(src2, 0)), scale[2], shift[2]);
-            StoreScaled<false>(dst + 0x9 * F, _mm512_cvtepu8_epi32(_mm512_extracti32x4_epi32(src2, 1)), scale[0], shift[0]);
-            StoreScaled<false>(dst + 0xA * F, _mm512_cvtepu8_epi32(_mm512_extracti32x4_epi32(src2, 2)), scale[1], shift[1]);
-            StoreScaled<false>(dst + 0xB * F, _mm512_cvtepu8_epi32(_mm512_extracti32x4_epi32(src2, 3)), scale[2], shift[2]);
-        }
-
-        const __m512i K8_SUFFLE_BGRA_TO_BGR = SIMD_MM512_SETR_EPI8(
-            0x0, 0x1, 0x2, 0x4, 0x5, 0x6, 0x8, 0x9, 0xA, 0xC, 0xD, 0xE, -1, -1, -1, -1,
-            0x0, 0x1, 0x2, 0x4, 0x5, 0x6, 0x8, 0x9, 0xA, 0xC, 0xD, 0xE, -1, -1, -1, -1,
-            0x0, 0x1, 0x2, 0x4, 0x5, 0x6, 0x8, 0x9, 0xA, 0xC, 0xD, 0xE, -1, -1, -1, -1,
-            0x0, 0x1, 0x2, 0x4, 0x5, 0x6, 0x8, 0x9, 0xA, 0xC, 0xD, 0xE, -1, -1, -1, -1);
-
-        const __m512i K32_PERMUTE_BGRA_TO_BGR = SIMD_MM512_SETR_EPI32(0x0, 0x1, 0x2, 0x4, 0x5, 0x6, 0x8, 0x9, 0xA, 0xC, 0xD, 0xE, -1, -1, -1, -1);
-
-        template<> SIMD_INLINE void SynetSetInputNhwc3A<SimdPixelFormatBgra32>(const uint8_t * src, const __m512 * scale, const __m512 * shift, float * dst)
-        {
-            __m512i bgr0 = _mm512_permutexvar_epi32(K32_PERMUTE_BGRA_TO_BGR, _mm512_shuffle_epi8(Load<false>(src + 0 * A), K8_SUFFLE_BGRA_TO_BGR));
-            StoreScaled<false>(dst + 0x0 * F, _mm512_cvtepu8_epi32(_mm512_extracti32x4_epi32(bgr0, 0)), scale[0], shift[0]);
-            StoreScaled<false>(dst + 0x1 * F, _mm512_cvtepu8_epi32(_mm512_extracti32x4_epi32(bgr0, 1)), scale[1], shift[1]);
-            StoreScaled<false>(dst + 0x2 * F, _mm512_cvtepu8_epi32(_mm512_extracti32x4_epi32(bgr0, 2)), scale[2], shift[2]);
-            __m512i bgr1 = _mm512_permutexvar_epi32(K32_PERMUTE_BGRA_TO_BGR, _mm512_shuffle_epi8(Load<false>(src + 1 * A), K8_SUFFLE_BGRA_TO_BGR));
-            StoreScaled<false>(dst + 0x3 * F, _mm512_cvtepu8_epi32(_mm512_extracti32x4_epi32(bgr1, 0)), scale[0], shift[0]);
-            StoreScaled<false>(dst + 0x4 * F, _mm512_cvtepu8_epi32(_mm512_extracti32x4_epi32(bgr1, 1)), scale[1], shift[1]);
-            StoreScaled<false>(dst + 0x5 * F, _mm512_cvtepu8_epi32(_mm512_extracti32x4_epi32(bgr1, 2)), scale[2], shift[2]);
-            __m512i bgr2 = _mm512_permutexvar_epi32(K32_PERMUTE_BGRA_TO_BGR, _mm512_shuffle_epi8(Load<false>(src + 2 * A), K8_SUFFLE_BGRA_TO_BGR));
-            StoreScaled<false>(dst + 0x6 * F, _mm512_cvtepu8_epi32(_mm512_extracti32x4_epi32(bgr2, 0)), scale[0], shift[0]);
-            StoreScaled<false>(dst + 0x7 * F, _mm512_cvtepu8_epi32(_mm512_extracti32x4_epi32(bgr2, 1)), scale[1], shift[1]);
-            StoreScaled<false>(dst + 0x8 * F, _mm512_cvtepu8_epi32(_mm512_extracti32x4_epi32(bgr2, 2)), scale[2], shift[2]);
-            __m512i bgr3 = _mm512_permutexvar_epi32(K32_PERMUTE_BGRA_TO_BGR, _mm512_shuffle_epi8(Load<false>(src + 3 * A), K8_SUFFLE_BGRA_TO_BGR));
-            StoreScaled<false>(dst + 0x9 * F, _mm512_cvtepu8_epi32(_mm512_extracti32x4_epi32(bgr3, 0)), scale[0], shift[0]);
-            StoreScaled<false>(dst + 0xA * F, _mm512_cvtepu8_epi32(_mm512_extracti32x4_epi32(bgr3, 1)), scale[1], shift[1]);
-            StoreScaled<false>(dst + 0xB * F, _mm512_cvtepu8_epi32(_mm512_extracti32x4_epi32(bgr3, 2)), scale[2], shift[2]);
-        }
-
-        const __m512i K8_SUFFLE_RGB_TO_BGR = SIMD_MM512_SETR_EPI8(
-            0x2, 0x1, 0x0, 0x5, 0x4, 0x3, 0x8, 0x7, 0x6, 0xB, 0xA, 0x9, -1, -1, -1, -1,
-            0x2, 0x1, 0x0, 0x5, 0x4, 0x3, 0x8, 0x7, 0x6, 0xB, 0xA, 0x9, -1, -1, -1, -1,
-            0x2, 0x1, 0x0, 0x5, 0x4, 0x3, 0x8, 0x7, 0x6, 0xB, 0xA, 0x9, -1, -1, -1, -1,
-            0x2, 0x1, 0x0, 0x5, 0x4, 0x3, 0x8, 0x7, 0x6, 0xB, 0xA, 0x9, -1, -1, -1, -1);
-
-        template<> SIMD_INLINE void SynetSetInputNhwc3A<SimdPixelFormatRgb24>(const uint8_t * src, const __m512 * scale, const __m512 * shift, float * dst)
-        {
-            __m512i bgr0 = _mm512_permutexvar_epi32(K32_PERMUTE_BGRA_TO_BGR, _mm512_shuffle_epi8(_mm512_permutexvar_epi32(K32_PERMUTE_BGR_TO_BGRA_BEG, Load<false>(src + 0 * F)), K8_SUFFLE_RGB_TO_BGR));
-            StoreScaled<false>(dst + 0x0 * F, _mm512_cvtepu8_epi32(_mm512_extracti32x4_epi32(bgr0, 0)), scale[0], shift[0]);
-            StoreScaled<false>(dst + 0x1 * F, _mm512_cvtepu8_epi32(_mm512_extracti32x4_epi32(bgr0, 1)), scale[1], shift[1]);
-            StoreScaled<false>(dst + 0x2 * F, _mm512_cvtepu8_epi32(_mm512_extracti32x4_epi32(bgr0, 2)), scale[2], shift[2]);
-            __m512i bgr1 = _mm512_permutexvar_epi32(K32_PERMUTE_BGRA_TO_BGR, _mm512_shuffle_epi8(_mm512_permutexvar_epi32(K32_PERMUTE_BGR_TO_BGRA_BEG, Load<false>(src + 3 * F)), K8_SUFFLE_RGB_TO_BGR));
-            StoreScaled<false>(dst + 0x3 * F, _mm512_cvtepu8_epi32(_mm512_extracti32x4_epi32(bgr1, 0)), scale[0], shift[0]);
-            StoreScaled<false>(dst + 0x4 * F, _mm512_cvtepu8_epi32(_mm512_extracti32x4_epi32(bgr1, 1)), scale[1], shift[1]);
-            StoreScaled<false>(dst + 0x5 * F, _mm512_cvtepu8_epi32(_mm512_extracti32x4_epi32(bgr1, 2)), scale[2], shift[2]);
-            __m512i bgr2 = _mm512_permutexvar_epi32(K32_PERMUTE_BGRA_TO_BGR, _mm512_shuffle_epi8(_mm512_permutexvar_epi32(K32_PERMUTE_BGR_TO_BGRA_BEG, Load<false>(src + 6 * F)), K8_SUFFLE_RGB_TO_BGR));
-            StoreScaled<false>(dst + 0x6 * F, _mm512_cvtepu8_epi32(_mm512_extracti32x4_epi32(bgr2, 0)), scale[0], shift[0]);
-            StoreScaled<false>(dst + 0x7 * F, _mm512_cvtepu8_epi32(_mm512_extracti32x4_epi32(bgr2, 1)), scale[1], shift[1]);
-            StoreScaled<false>(dst + 0x8 * F, _mm512_cvtepu8_epi32(_mm512_extracti32x4_epi32(bgr2, 2)), scale[2], shift[2]);
-            __m512i bgr3 = _mm512_permutexvar_epi32(K32_PERMUTE_BGRA_TO_BGR, _mm512_shuffle_epi8(_mm512_permutexvar_epi32(K32_PERMUTE_BGR_TO_BGRA_END, Load<false>(src + 8 * F)), K8_SUFFLE_RGB_TO_BGR));
-            StoreScaled<false>(dst + 0x9 * F, _mm512_cvtepu8_epi32(_mm512_extracti32x4_epi32(bgr3, 0)), scale[0], shift[0]);
-            StoreScaled<false>(dst + 0xA * F, _mm512_cvtepu8_epi32(_mm512_extracti32x4_epi32(bgr3, 1)), scale[1], shift[1]);
-            StoreScaled<false>(dst + 0xB * F, _mm512_cvtepu8_epi32(_mm512_extracti32x4_epi32(bgr3, 2)), scale[2], shift[2]);
-        }
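The shuffle and permute pairs above implement two pure byte-level conversions, BGRA to BGR (drop the alpha byte) and RGB to BGR (reverse the channel order), 48 output bytes at a time. The same operations stated per pixel in plain scalar form (illustrative):

    #include <cstdint>

    inline void BgraToBgr(const uint8_t* bgra, uint8_t* bgr) // drop every 4th byte
    {
        bgr[0] = bgra[0]; bgr[1] = bgra[1]; bgr[2] = bgra[2];
    }

    inline void RgbToBgr(const uint8_t* rgb, uint8_t* bgr) // reverse channel order
    {
        bgr[0] = rgb[2]; bgr[1] = rgb[1]; bgr[2] = rgb[0];
    }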
-        template<SimdPixelFormatType format> SIMD_INLINE void SynetSetInputNhwc3F(const uint8_t * src, const __m512 * scale, const __m512 * shift, float * dst);
-
-        template<> SIMD_INLINE void SynetSetInputNhwc3F<SimdPixelFormatGray8>(const uint8_t * src, const __m512 * scale, const __m512 * shift, float * dst)
-        {
-            __m128i gray0 = Sse2::Load<false>((__m128i*)src + 0);
-            StoreScaled<false>(dst + 0x0 * F, _mm512_cvtepu8_epi32(_mm_shuffle_epi8(gray0, Ssse3::K8_SHUFFLE_GRAY_TO_BGR0)), scale[0], shift[0]);
-            StoreScaled<false>(dst + 0x1 * F, _mm512_cvtepu8_epi32(_mm_shuffle_epi8(gray0, Ssse3::K8_SHUFFLE_GRAY_TO_BGR1)), scale[1], shift[1]);
-            StoreScaled<false>(dst + 0x2 * F, _mm512_cvtepu8_epi32(_mm_shuffle_epi8(gray0, Ssse3::K8_SHUFFLE_GRAY_TO_BGR2)), scale[2], shift[2]);
-        }
-
-        template<> SIMD_INLINE void SynetSetInputNhwc3F<SimdPixelFormatBgr24>(const uint8_t * src, const __m512 * scale, const __m512 * shift, float * dst)
-        {
-            StoreScaled<false>(dst + 0x0 * F, _mm512_cvtepu8_epi32(Sse2::Load<false>((__m128i*)src + 0x0)), scale[0], shift[0]);
-            StoreScaled<false>(dst + 0x1 * F, _mm512_cvtepu8_epi32(Sse2::Load<false>((__m128i*)src + 0x1)), scale[1], shift[1]);
-            StoreScaled<false>(dst + 0x2 * F, _mm512_cvtepu8_epi32(Sse2::Load<false>((__m128i*)src + 0x2)), scale[2], shift[2]);
-        }
-
-        template<> SIMD_INLINE void SynetSetInputNhwc3F<SimdPixelFormatBgra32>(const uint8_t * src, const __m512 * scale, const __m512 * shift, float * dst)
-        {
-            __m512i bgr = _mm512_permutexvar_epi32(K32_PERMUTE_BGRA_TO_BGR, _mm512_shuffle_epi8(Load<false>(src), K8_SUFFLE_BGRA_TO_BGR));
-            StoreScaled<false>(dst + 0x0 * F, _mm512_cvtepu8_epi32(_mm512_extracti32x4_epi32(bgr, 0)), scale[0], shift[0]);
-            StoreScaled<false>(dst + 0x1 * F, _mm512_cvtepu8_epi32(_mm512_extracti32x4_epi32(bgr, 1)), scale[1], shift[1]);
-            StoreScaled<false>(dst + 0x2 * F, _mm512_cvtepu8_epi32(_mm512_extracti32x4_epi32(bgr, 2)), scale[2], shift[2]);
-        }
-
-        template<> SIMD_INLINE void SynetSetInputNhwc3F<SimdPixelFormatRgb24>(const uint8_t * src, const __m512 * scale, const __m512 * shift, float * dst)
-        {
-            __m512i bgr = _mm512_permutexvar_epi32(K32_PERMUTE_BGRA_TO_BGR, _mm512_shuffle_epi8(_mm512_permutexvar_epi32(K32_PERMUTE_BGR_TO_BGRA_END, Load<false>(src - F)), K8_SUFFLE_RGB_TO_BGR));
-            StoreScaled<false>(dst + 0x0 * F, _mm512_cvtepu8_epi32(_mm512_extracti32x4_epi32(bgr, 0)), scale[0], shift[0]);
-            StoreScaled<false>(dst + 0x1 * F, _mm512_cvtepu8_epi32(_mm512_extracti32x4_epi32(bgr, 1)), scale[1], shift[1]);
-            StoreScaled<false>(dst + 0x2 * F, _mm512_cvtepu8_epi32(_mm512_extracti32x4_epi32(bgr, 2)), scale[2], shift[2]);
-        }
-
-        template<SimdPixelFormatType format, size_t step> void SynetSetInputNhwc3(const uint8_t * src, size_t width, size_t height, size_t stride, const float * scale, const float * shift, float * dst)
-        {
-            size_t widthF = AlignLo(width, F);
-            size_t widthA = AlignLo(width, A);
-            __m512 _scale[3], _shift[3];
-            for (float *sc = (float*)_scale, *sh = (float*)_shift, *end = sc + 48; sc < end; sc += 3, sh += 3)
-            {
-                sc[0] = scale[0]; sc[1] = scale[1]; sc[2] = scale[2];
-                sh[0] = shift[0]; sh[1] = shift[1]; sh[2] = shift[2];
-            }
-            for (size_t y = 0; y < height; ++y)
-            {
-                size_t x = 0;
-                for (; x < widthA; x += A)
-                    SynetSetInputNhwc3A<format>(src + step * x, _scale, _shift, dst + 3 * x);
-                for (; x < widthF; x += F)
-                    SynetSetInputNhwc3F<format>(src + step * x, _scale, _shift, dst + 3 * x);
-                if (widthF < width)
-                    SynetSetInputNhwc3F<format>(src + step * (width - F), _scale, _shift, dst + 3 * (width - F));
-                src += stride;
-                dst += 3 * width;
-            }
-        }
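The pointer loop above tiles the 3-element scale and shift as one repeating B,G,R pattern across 48 floats, that is, three 16-lane registers, so lane k of register r carries channel (16 * r + k) % 3. A plain-array version of the same initialization (illustrative):

    void PackNhwc3(const float scale[3], float packed[48])
    {
        for (int i = 0; i < 48; ++i)
            packed[i] = scale[i % 3]; // B,G,R,B,G,R,... across three registers
    }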
-        void SynetSetInput(const uint8_t * src, size_t width, size_t height, size_t stride, SimdPixelFormatType srcFormat,
-            const float * lower, const float * upper, float * dst, size_t channels, SimdTensorFormatType dstFormat)
-        {
-            assert(width >= A);
-
-            float scale[3];
-            for (size_t i = 0; i < channels; ++i)
-                scale[i] = (upper[i] - lower[i]) / 255.0f;
-            switch (channels)
-            {
-            case 1:
-                switch (srcFormat)
-                {
-                case SimdPixelFormatGray8: SynetSetInput1<SimdPixelFormatGray8, 1>(src, width, height, stride, scale, lower, dst); return;
-                case SimdPixelFormatBgr24: SynetSetInput1<SimdPixelFormatBgr24, 3>(src, width, height, stride, scale, lower, dst); return;
-                case SimdPixelFormatBgra32: SynetSetInput1<SimdPixelFormatBgra32, 4>(src, width, height, stride, scale, lower, dst); return;
-                case SimdPixelFormatRgb24: SynetSetInput1<SimdPixelFormatRgb24, 3>(src, width, height, stride, scale, lower, dst); return;
-                default: assert(0);
-                }
-                break;
-            case 3:
-                switch (dstFormat)
-                {
-                case SimdTensorFormatNchw:
-                    switch (srcFormat)
-                    {
-                    case SimdPixelFormatGray8: SynetSetInputNchw3<SimdPixelFormatGray8, 1>(src, width, height, stride, scale, lower, dst); return;
-                    case SimdPixelFormatBgr24: SynetSetInputNchw3<SimdPixelFormatBgr24, 3>(src, width, height, stride, scale, lower, dst); return;
-                    case SimdPixelFormatBgra32: SynetSetInputNchw3<SimdPixelFormatBgra32, 4>(src, width, height, stride, scale, lower, dst); return;
-                    case SimdPixelFormatRgb24: SynetSetInputNchw3<SimdPixelFormatRgb24, 3>(src, width, height, stride, scale, lower, dst); return;
-                    default: assert(0);
-                    }
-                    break;
-                case SimdTensorFormatNhwc:
-                    switch (srcFormat)
-                    {
-                    case SimdPixelFormatGray8: SynetSetInputNhwc3<SimdPixelFormatGray8, 1>(src, width, height, stride, scale, lower, dst); return;
-                    case SimdPixelFormatBgr24: SynetSetInputNhwc3<SimdPixelFormatBgr24, 3>(src, width, height, stride, scale, lower, dst); return;
-                    case SimdPixelFormatBgra32: SynetSetInputNhwc3<SimdPixelFormatBgra32, 4>(src, width, height, stride, scale, lower, dst); return;
-                    case SimdPixelFormatRgb24: SynetSetInputNhwc3<SimdPixelFormatRgb24, 3>(src, width, height, stride, scale, lower, dst); return;
-                    default: assert(0);
-                    }
-                    break;
-                default: assert(0);
-                }
-            default: assert(0);
-            }
-        }
-    }
-#endif//SIMD_AVX512BW_ENABLE
-}
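Before the next deleted file, note how SynetSetInput maps 8-bit input into the caller's [lower, upper] range per channel: dst = src * scale + lower with scale = (upper - lower) / 255, so 0 lands on lower and 255 lands, up to float rounding, on upper. A scalar restatement (illustrative helper, not part of the source):

    #include <cstdint>

    inline float SetInputScalar(uint8_t src, float lower, float upper)
    {
        float scale = (upper - lower) / 255.0f;
        return src * scale + lower; // 0 -> lower, 255 -> upper (up to rounding)
    }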
diff --git a/src/3rd/Simd/Simd/SimdAvx512bwSynetConvolution8i.cpp b/src/3rd/Simd/Simd/SimdAvx512bwSynetConvolution8i.cpp
deleted file mode 100644
index 3a43e231..00000000
--- a/src/3rd/Simd/Simd/SimdAvx512bwSynetConvolution8i.cpp
+++ /dev/null
@@ -1,980 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2020 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#include "Simd/SimdSynetConvolution8i.h"
-#include "Simd/SimdSynetConvolution8iCommon.h"
-#include "Simd/SimdSynet.h"
-#include "Simd/SimdMath.h"
-#include "Simd/SimdBase.h"
-#include "Simd/SimdAvx512bw.h"
-#include "Simd/SimdCpu.h"
-
-namespace Simd
-{
-#ifdef SIMD_AVX512BW_ENABLE
-    namespace Avx512bw
-    {
-        using AlgParam = SynetConvolution8iNhwcDirect::AlgParam;
-        using ConvolutionPtr = SynetConvolution8iNhwcDirect::ConvolutionPtr;
-        using Term8iType = Base::SynetConvolution8iNhwcDirect::Term8iType;
-
-        SIMD_INLINE __m512i Set4(const uint8_t* src)
-        {
-            return _mm512_set1_epi32(*(int32_t*)src);
-        }
-
-        template<bool overflow> void Madd4(__m512i& i32, __m512i u8, __m512i i8);
-
-        template<> SIMD_INLINE void Madd4<true>(__m512i& i32, __m512i u8, __m512i i8)
-        {
-            i32 = _mm512_add_epi32(i32, _mm512_madd_epi16(_mm512_maddubs_epi16(u8, i8), Avx512bw::K16_0001));
-        }
-
-        template<> SIMD_INLINE void Madd4<false>(__m512i& i32, __m512i u8, __m512i i8)
-        {
-            __m512i lo = _mm512_madd_epi16(Cvt8uTo16i<0>(u8), Cvt8iTo16i<0>(i8));
-            __m512i hi = _mm512_madd_epi16(Cvt8uTo16i<1>(u8), Cvt8iTo16i<1>(i8));
-            i32 = _mm512_add_epi32(i32, Hadd32(lo, hi));
-        }
-
-        template<Term8iType term, SimdConvolutionActivationType type, bool overflow> void ConvolutionNhwcDirect_2x1(const uint8_t * src0,
-            const ConvParam8i& p, const AlgParam & a, size_t dy, size_t dx, size_t srcC, size_t dstC, const int8_t * weight0,
-            const __m512i * bias, const __m512i * params, const __m512 * scale, const __m512* shift, int32_t * buf, uint8_t* dst)
-        {
-            __m512i d00, d01, s0, w0, w1;
-            size_t dW = (DivHi(p.srcC, 4) - DivHi(srcC, 4)) * A, dY = p.srcW * p.srcC, dX = p.srcC, dS = p.srcC * p.strideX, dWz = DivHi(srcC, 4) * A;
-            const int8_t* weight1 = weight0 + p.kernelY * p.kernelX * DivHi(p.srcC, 4) * A;
-            __m512i norm = _mm512_set1_epi32(a.norm);
-            size_t sy = dy * p.strideY - p.padY;
-            size_t sx = dx * p.strideX - p.padX;
-            size_t kY = p.kernelY * p.dilationY;
-            size_t kX = p.kernelX * p.dilationX;
-            if (dstC > F)
-            {
-                d00 = _mm512_setzero_si512(), d01 = _mm512_setzero_si512();
-                for (size_t ky = 0; ky < kY; ky += p.dilationY)
-                {
-                    for (size_t kx = 0; kx < kX; kx += p.dilationX)
-                    {
-                        if (sy + ky < p.srcH && sx + kx < p.srcW)
-                        {
-                            size_t offs = (sy + ky) * dY + (sx + kx) * dX, end = offs + srcC;
-                            for (; offs < end; offs += 4)
-                            {
-                                w0 = _mm512_loadu_si512((__m512i*)weight0);
-                                w1 = _mm512_loadu_si512((__m512i*)weight1);
-                                s0 = Set4(src0 + offs);
-                                Madd4<overflow>(d00, s0, w0);
-                                Madd4<overflow>(d01, s0, w1);
-                                weight0 += A, weight1 += A;
-                            }
-                        }
-                        else
-                        {
-                            if (a.zero)
-                            {
-                                s0 = _mm512_set1_epi32(a.zero);
-                                for (size_t offs = 0, end = srcC; offs < end; offs += 4)
-                                {
-                                    w0 = _mm512_loadu_si512((__m512i*)weight0);
-                                    w1 = _mm512_loadu_si512((__m512i*)weight1);
-                                    Madd4<overflow>(d00, s0, w0);
-                                    Madd4<overflow>(d01, s0, w1);
-                                    weight0 += A, weight1 += A;
-                                }
-                            }
-                            else
-                                weight0 += dWz, weight1 += dWz;
-                        }
-                        weight0 += dW, weight1 += dW;
-                    }
-                }
-                __mmask16 tail = TailMask16(dstC - F);
-                Save2<term, type>(dst, buf, d00, d01, norm, bias, params, scale, shift, tail);
-            }
-            else
-            {
-                d00 = _mm512_setzero_si512();
-                for (size_t ky = 0; ky < kY; ky += p.dilationY)
-                {
-                    for (size_t kx = 0; kx < kX; kx += p.dilationX)
-                    {
-                        if (sy + ky < p.srcH && sx + kx < p.srcW)
-                        {
-                            size_t offs = (sy + ky) * dY + (sx + kx) * dX, end = offs + srcC;
-                            for (; offs < end; offs += 4)
-                            {
-                                w0 = _mm512_loadu_si512((__m512i*)weight0);
-                                s0 = Set4(src0 + offs);
-                                Madd4<overflow>(d00, s0, w0);
-                                weight0 += A;
-                            }
-                        }
-                        else
-                        {
-                            if (a.zero)
-                            {
-                                s0 = _mm512_set1_epi32(a.zero);
-                                for (size_t offs = 0, end = srcC; offs < end; offs += 4)
-                                {
-                                    w0 = _mm512_loadu_si512((__m512i*)weight0);
-                                    Madd4<overflow>(d00, s0, w0);
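/* Editorial annotation, not part of the deleted source: Madd4<true> above relies on
   _mm512_maddubs_epi16, which saturates the sum of each pair of adjacent u8*i8
   products to int16, so it is the fast path for callers that accept 16-bit overflow;
   Madd4<false> widens both operands to 16 bits first and is overflow-safe. Per
   32-bit lane, both variants accumulate:
   i32 += u8[0]*i8[0] + u8[1]*i8[1] + u8[2]*i8[2] + u8[3]*i8[3]. */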
weight0 += A; - } - } - else - weight0 += dWz; - } - weight0 += dW; - } - } - __mmask16 tail = TailMask16(dstC); - Save1(dst, buf, d00, norm, bias, params, scale, shift, tail); - } - } - - template void ConvolutionNhwcDirect_2x12(const uint8_t* src0, - const ConvParam8i& p, const AlgParam& a, size_t dy, size_t dx, size_t srcC, size_t dstC, const int8_t* weight0, - const __m512i* bias, const __m512i* params, const __m512* scale, const __m512* shift, int32_t* buf, uint8_t* dst) - { - __m512i d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, d60, d61, d70, d71, d80, d81, d90, d91, dA0, dA1, dB0, dB1, s0, w0, w1; - size_t dW = (DivHi(p.srcC, 4) - DivHi(srcC, 4)) * A, dY = p.srcW * p.srcC, dX = p.srcC, dS = p.srcC * p.strideX, dD = p.dstC * a.size, dB = p.dstC, dWz = (DivHi(srcC, 4) * A + dW) * p.kernelX; - const int8_t * weight1 = weight0 + p.kernelY * p.kernelX * DivHi(p.srcC, 4) * A; - const uint8_t* src1 = src0 + 1 * dS; - const uint8_t* src2 = src0 + 2 * dS; - const uint8_t* src3 = src0 + 3 * dS; - const uint8_t* src4 = src0 + 4 * dS; - const uint8_t* src5 = src0 + 5 * dS; - __m512i norm = _mm512_set1_epi32(a.norm); - size_t sy = dy * p.strideY - p.padY; - size_t sx = dx * p.strideX - p.padX; - size_t kY = p.kernelY * p.dilationY; - size_t kX = p.kernelX * p.dilationX; - if (dstC > F) - { - d00 = _mm512_setzero_si512(), d01 = _mm512_setzero_si512(); - d10 = _mm512_setzero_si512(), d11 = _mm512_setzero_si512(); - d20 = _mm512_setzero_si512(), d21 = _mm512_setzero_si512(); - d30 = _mm512_setzero_si512(), d31 = _mm512_setzero_si512(); - d40 = _mm512_setzero_si512(), d41 = _mm512_setzero_si512(); - d50 = _mm512_setzero_si512(), d51 = _mm512_setzero_si512(); - d60 = _mm512_setzero_si512(), d61 = _mm512_setzero_si512(); - d70 = _mm512_setzero_si512(), d71 = _mm512_setzero_si512(); - d80 = _mm512_setzero_si512(), d81 = _mm512_setzero_si512(); - d90 = _mm512_setzero_si512(), d91 = _mm512_setzero_si512(); - dA0 = _mm512_setzero_si512(), dA1 = _mm512_setzero_si512(); - dB0 = _mm512_setzero_si512(), dB1 = _mm512_setzero_si512(); - for (size_t ky = 0; ky < kY; ky += p.dilationY) - { - if (sy + ky < p.srcH) - { - for (size_t kx = 0; kx < kX; kx += p.dilationX) - { - assert(sx + kx < p.srcW && sx + kx + 12 <= p.srcW); - size_t offs0 = (sy + ky) * dY + (sx + kx) * dX, end = offs0 + srcC, offs6 = offs0 + 6 * dS; - for (; offs0 < end; offs0 += 4, offs6 += 4) - { - w0 = _mm512_loadu_si512((__m512i*)weight0); - w1 = _mm512_loadu_si512((__m512i*)weight1); - s0 = Set4(src0 + offs0), Madd4(d00, s0, w0), Madd4(d01, s0, w1); - s0 = Set4(src1 + offs0), Madd4(d10, s0, w0), Madd4(d11, s0, w1); - s0 = Set4(src2 + offs0), Madd4(d20, s0, w0), Madd4(d21, s0, w1); - s0 = Set4(src3 + offs0), Madd4(d30, s0, w0), Madd4(d31, s0, w1); - s0 = Set4(src4 + offs0), Madd4(d40, s0, w0), Madd4(d41, s0, w1); - s0 = Set4(src5 + offs0), Madd4(d50, s0, w0), Madd4(d51, s0, w1); - s0 = Set4(src0 + offs6), Madd4(d60, s0, w0), Madd4(d61, s0, w1); - s0 = Set4(src1 + offs6), Madd4(d70, s0, w0), Madd4(d71, s0, w1); - s0 = Set4(src2 + offs6), Madd4(d80, s0, w0), Madd4(d81, s0, w1); - s0 = Set4(src3 + offs6), Madd4(d90, s0, w0), Madd4(d91, s0, w1); - s0 = Set4(src4 + offs6), Madd4(dA0, s0, w0), Madd4(dA1, s0, w1); - s0 = Set4(src5 + offs6), Madd4(dB0, s0, w0), Madd4(dB1, s0, w1); - weight0 += A, weight1 += A; - } - weight0 += dW, weight1 += dW; - } - } - else if (a.zero) - { - s0 = _mm512_set1_epi32(a.zero); - for (size_t kx = 0; kx < kX; kx += p.dilationX) - { - for (size_t offs = 0, end = srcC; offs < end; offs += 4) - { - w0 = 
_mm512_loadu_si512((__m512i*)weight0); - w1 = _mm512_loadu_si512((__m512i*)weight1); - Madd4(d00, s0, w0), Madd4(d01, s0, w1); - Madd4(d10, s0, w0), Madd4(d11, s0, w1); - Madd4(d20, s0, w0), Madd4(d21, s0, w1); - Madd4(d30, s0, w0), Madd4(d31, s0, w1); - Madd4(d40, s0, w0), Madd4(d41, s0, w1); - Madd4(d50, s0, w0), Madd4(d51, s0, w1); - Madd4(d60, s0, w0), Madd4(d61, s0, w1); - Madd4(d70, s0, w0), Madd4(d71, s0, w1); - Madd4(d80, s0, w0), Madd4(d81, s0, w1); - Madd4(d90, s0, w0), Madd4(d91, s0, w1); - Madd4(dA0, s0, w0), Madd4(dA1, s0, w1); - Madd4(dB0, s0, w0), Madd4(dB1, s0, w1); - weight0 += A, weight1 += A; - } - weight0 += dW, weight1 += dW; - } - } - else - weight0 += dWz, weight1 += dWz; - } - __mmask16 tail = TailMask16(dstC - F); - Save2(dst, buf, d00, d01, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save2(dst, buf, d10, d11, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save2(dst, buf, d20, d21, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save2(dst, buf, d30, d31, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save2(dst, buf, d40, d41, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save2(dst, buf, d50, d51, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save2(dst, buf, d60, d61, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save2(dst, buf, d70, d71, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save2(dst, buf, d80, d81, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save2(dst, buf, d90, d91, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save2(dst, buf, dA0, dA1, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save2(dst, buf, dB0, dB1, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - } - else - { - d00 = _mm512_setzero_si512(); - d10 = _mm512_setzero_si512(); - d20 = _mm512_setzero_si512(); - d30 = _mm512_setzero_si512(); - d40 = _mm512_setzero_si512(); - d50 = _mm512_setzero_si512(); - d60 = _mm512_setzero_si512(); - d70 = _mm512_setzero_si512(); - d80 = _mm512_setzero_si512(); - d90 = _mm512_setzero_si512(); - dA0 = _mm512_setzero_si512(); - dB0 = _mm512_setzero_si512(); - for (size_t ky = 0; ky < kY; ky += p.dilationY) - { - if (sy + ky < p.srcH) - { - for (size_t kx = 0; kx < kX; kx += p.dilationX) - { - assert(sx + kx < p.srcW && sx + kx + 12 <= p.srcW); - size_t offs0 = (sy + ky) * dY + (sx + kx) * dX, end = offs0 + srcC, offs6 = offs0 + 6 * dS; - for (; offs0 < end; offs0 += 4, offs6 += 4) - { - w0 = _mm512_loadu_si512((__m512i*)weight0); - s0 = Set4(src0 + offs0), Madd4(d00, s0, w0); - s0 = Set4(src1 + offs0), Madd4(d10, s0, w0); - s0 = Set4(src2 + offs0), Madd4(d20, s0, w0); - s0 = Set4(src3 + offs0), Madd4(d30, s0, w0); - s0 = Set4(src4 + offs0), Madd4(d40, s0, w0); - s0 = Set4(src5 + offs0), Madd4(d50, s0, w0); - s0 = Set4(src0 + offs6), Madd4(d60, s0, w0); - s0 = Set4(src1 + offs6), Madd4(d70, s0, w0); - s0 = Set4(src2 + offs6), Madd4(d80, s0, w0); - s0 = Set4(src3 + offs6), Madd4(d90, s0, w0); - s0 = Set4(src4 + offs6), Madd4(dA0, s0, w0); - s0 = Set4(src5 + offs6), Madd4(dB0, s0, w0); - weight0 += A; - } - weight0 += dW; - } - } - else if (a.zero) - { - s0 = _mm512_set1_epi32(a.zero); - for (size_t kx = 0; kx < kX; kx += p.dilationX) - { - for (size_t offs = 0, end = srcC; offs < end; offs += 4) - { - w0 = _mm512_loadu_si512((__m512i*)weight0); - Madd4(d00, s0, w0); - Madd4(d10, s0, w0); - Madd4(d20, s0, w0); - Madd4(d30, s0, w0); - Madd4(d40, s0, 
w0); - Madd4(d50, s0, w0); - Madd4(d60, s0, w0); - Madd4(d70, s0, w0); - Madd4(d80, s0, w0); - Madd4(d90, s0, w0); - Madd4(dA0, s0, w0); - Madd4(dB0, s0, w0); - weight0 += A; - } - weight0 += dW; - } - } - else - weight0 += dWz; - } - __mmask16 tail = TailMask16(dstC); - Save1(dst, buf, d00, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save1(dst, buf, d10, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save1(dst, buf, d20, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save1(dst, buf, d30, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save1(dst, buf, d40, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save1(dst, buf, d50, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save1(dst, buf, d60, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save1(dst, buf, d70, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save1(dst, buf, d80, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save1(dst, buf, d90, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save1(dst, buf, dA0, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save1(dst, buf, dB0, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - } - } - - template void ConvolutionNhwcDirect_2xM(const uint8_t* src0, - const ConvParam8i& p, const AlgParam& a, size_t dy, size_t dx, size_t srcC, size_t dstC, const int8_t* weight0, - const __m512i* bias, const __m512i* params, const __m512* scale, const __m512* shift, int32_t* buf, uint8_t* dst) - { - __m512i d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, d60, d61, d70, d71, d80, d81, d90, d91, dA0, dA1, dB0, dB1, s0, w0, w1; - size_t dW = (DivHi(p.srcC, 4) - DivHi(srcC, 4)) * A, dY = p.srcW * p.srcC, dX = p.srcC, dS = p.srcC * p.strideX, dD = p.dstC * a.size, dB = p.dstC, dWz = (DivHi(srcC, 4) * A + dW) * p.kernelX; - const int8_t* weight1 = weight0 + p.kernelY * p.kernelX * DivHi(p.srcC, 4) * A; - const uint8_t* src1 = src0 + 1 * dS; - const uint8_t* src2 = src0 + 2 * dS; - const uint8_t* src3 = src0 + 3 * dS; - const uint8_t* src4 = src0 + 4 * dS; - const uint8_t* src5 = src0 + 5 * dS; - __m512i norm = _mm512_set1_epi32(a.norm); - size_t sy = dy * p.strideY - p.padY; - size_t sx = dx * p.strideX - p.padX; - size_t kY = p.kernelY * p.dilationY; - size_t kX = p.kernelX * p.dilationX; - if (dstC > F) - { - if (M > 0x0) d00 = _mm512_setzero_si512(), d01 = _mm512_setzero_si512(); - if (M > 0x1) d10 = _mm512_setzero_si512(), d11 = _mm512_setzero_si512(); - if (M > 0x2) d20 = _mm512_setzero_si512(), d21 = _mm512_setzero_si512(); - if (M > 0x3) d30 = _mm512_setzero_si512(), d31 = _mm512_setzero_si512(); - if (M > 0x4) d40 = _mm512_setzero_si512(), d41 = _mm512_setzero_si512(); - if (M > 0x5) d50 = _mm512_setzero_si512(), d51 = _mm512_setzero_si512(); - if (M > 0x6) d60 = _mm512_setzero_si512(), d61 = _mm512_setzero_si512(); - if (M > 0x7) d70 = _mm512_setzero_si512(), d71 = _mm512_setzero_si512(); - if (M > 0x8) d80 = _mm512_setzero_si512(), d81 = _mm512_setzero_si512(); - if (M > 0x9) d90 = _mm512_setzero_si512(), d91 = _mm512_setzero_si512(); - if (M > 0xA) dA0 = _mm512_setzero_si512(), dA1 = _mm512_setzero_si512(); - if (M > 0xB) dB0 = _mm512_setzero_si512(), dB1 = _mm512_setzero_si512(); - for (size_t ky = 0; ky < kY; ky += p.dilationY) - { - if (sy + ky < p.srcH) - { - for (size_t kx = 0; kx < kX; kx += p.dilationX) - { - assert(sx + kx < p.srcW && sx + kx + M <= p.srcW); - size_t offs0 = (sy + 
ky) * dY + (sx + kx) * dX, end = offs0 + srcC, offs6 = offs0 + 6 * dS; - for (; offs0 < end; offs0 += 4, offs6 += 4) - { - w0 = _mm512_loadu_si512((__m512i*)weight0); - w1 = _mm512_loadu_si512((__m512i*)weight1); - if (M > 0x0) s0 = Set4(src0 + offs0), Madd4(d00, s0, w0), Madd4(d01, s0, w1); - if (M > 0x1) s0 = Set4(src1 + offs0), Madd4(d10, s0, w0), Madd4(d11, s0, w1); - if (M > 0x2) s0 = Set4(src2 + offs0), Madd4(d20, s0, w0), Madd4(d21, s0, w1); - if (M > 0x3) s0 = Set4(src3 + offs0), Madd4(d30, s0, w0), Madd4(d31, s0, w1); - if (M > 0x4) s0 = Set4(src4 + offs0), Madd4(d40, s0, w0), Madd4(d41, s0, w1); - if (M > 0x5) s0 = Set4(src5 + offs0), Madd4(d50, s0, w0), Madd4(d51, s0, w1); - if (M > 0x6) s0 = Set4(src0 + offs6), Madd4(d60, s0, w0), Madd4(d61, s0, w1); - if (M > 0x7) s0 = Set4(src1 + offs6), Madd4(d70, s0, w0), Madd4(d71, s0, w1); - if (M > 0x8) s0 = Set4(src2 + offs6), Madd4(d80, s0, w0), Madd4(d81, s0, w1); - if (M > 0x9) s0 = Set4(src3 + offs6), Madd4(d90, s0, w0), Madd4(d91, s0, w1); - if (M > 0xA) s0 = Set4(src4 + offs6), Madd4(dA0, s0, w0), Madd4(dA1, s0, w1); - if (M > 0xB) s0 = Set4(src5 + offs6), Madd4(dB0, s0, w0), Madd4(dB1, s0, w1); - weight0 += A, weight1 += A; - } - weight0 += dW, weight1 += dW; - } - } - else if (a.zero) - { - s0 = _mm512_set1_epi32(a.zero); - for (size_t kx = 0; kx < kX; kx += p.dilationX) - { - for (size_t offs = 0, end = srcC; offs < end; offs += 4) - { - w0 = _mm512_loadu_si512((__m512i*)weight0); - w1 = _mm512_loadu_si512((__m512i*)weight1); - if (M > 0x0) Madd4(d00, s0, w0), Madd4(d01, s0, w1); - if (M > 0x1) Madd4(d10, s0, w0), Madd4(d11, s0, w1); - if (M > 0x2) Madd4(d20, s0, w0), Madd4(d21, s0, w1); - if (M > 0x3) Madd4(d30, s0, w0), Madd4(d31, s0, w1); - if (M > 0x4) Madd4(d40, s0, w0), Madd4(d41, s0, w1); - if (M > 0x5) Madd4(d50, s0, w0), Madd4(d51, s0, w1); - if (M > 0x6) Madd4(d60, s0, w0), Madd4(d61, s0, w1); - if (M > 0x7) Madd4(d70, s0, w0), Madd4(d71, s0, w1); - if (M > 0x8) Madd4(d80, s0, w0), Madd4(d81, s0, w1); - if (M > 0x9) Madd4(d90, s0, w0), Madd4(d91, s0, w1); - if (M > 0xA) Madd4(dA0, s0, w0), Madd4(dA1, s0, w1); - if (M > 0xB) Madd4(dB0, s0, w0), Madd4(dB1, s0, w1); - weight0 += A, weight1 += A; - } - weight0 += dW, weight1 += dW; - } - } - else - weight0 += dWz, weight1 += dWz; - } - __mmask16 tail = TailMask16(dstC - F); - if (M > 0x0) Save2(dst, buf, d00, d01, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0x1) Save2(dst, buf, d10, d11, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0x2) Save2(dst, buf, d20, d21, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0x3) Save2(dst, buf, d30, d31, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0x4) Save2(dst, buf, d40, d41, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0x5) Save2(dst, buf, d50, d51, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0x6) Save2(dst, buf, d60, d61, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0x7) Save2(dst, buf, d70, d71, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0x8) Save2(dst, buf, d80, d81, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0x9) Save2(dst, buf, d90, d91, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0xA) Save2(dst, buf, dA0, dA1, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0xB) Save2(dst, buf, dB0, dB1, norm, bias, params, scale, shift, 
tail), dst += dD, buf += dB; - } - else - { - if (M > 0x0) d00 = _mm512_setzero_si512(); - if (M > 0x1) d10 = _mm512_setzero_si512(); - if (M > 0x2) d20 = _mm512_setzero_si512(); - if (M > 0x3) d30 = _mm512_setzero_si512(); - if (M > 0x4) d40 = _mm512_setzero_si512(); - if (M > 0x5) d50 = _mm512_setzero_si512(); - if (M > 0x6) d60 = _mm512_setzero_si512(); - if (M > 0x7) d70 = _mm512_setzero_si512(); - if (M > 0x8) d80 = _mm512_setzero_si512(); - if (M > 0x9) d90 = _mm512_setzero_si512(); - if (M > 0xA) dA0 = _mm512_setzero_si512(); - if (M > 0xB) dB0 = _mm512_setzero_si512(); - for (size_t ky = 0; ky < kY; ky += p.dilationY) - { - if (sy + ky < p.srcH) - { - for (size_t kx = 0; kx < kX; kx += p.dilationX) - { - assert(sx + kx < p.srcW && sx + kx + M <= p.srcW); - size_t offs0 = (sy + ky) * dY + (sx + kx) * dX, end = offs0 + srcC, offs6 = offs0 + 6 * dS; - for (; offs0 < end; offs0 += 4, offs6 += 4) - { - w0 = _mm512_loadu_si512((__m512i*)weight0); - if (M > 0x0) s0 = Set4(src0 + offs0), Madd4(d00, s0, w0); - if (M > 0x1) s0 = Set4(src1 + offs0), Madd4(d10, s0, w0); - if (M > 0x2) s0 = Set4(src2 + offs0), Madd4(d20, s0, w0); - if (M > 0x3) s0 = Set4(src3 + offs0), Madd4(d30, s0, w0); - if (M > 0x4) s0 = Set4(src4 + offs0), Madd4(d40, s0, w0); - if (M > 0x5) s0 = Set4(src5 + offs0), Madd4(d50, s0, w0); - if (M > 0x6) s0 = Set4(src0 + offs6), Madd4(d60, s0, w0); - if (M > 0x7) s0 = Set4(src1 + offs6), Madd4(d70, s0, w0); - if (M > 0x8) s0 = Set4(src2 + offs6), Madd4(d80, s0, w0); - if (M > 0x9) s0 = Set4(src3 + offs6), Madd4(d90, s0, w0); - if (M > 0xA) s0 = Set4(src4 + offs6), Madd4(dA0, s0, w0); - if (M > 0xB) s0 = Set4(src5 + offs6), Madd4(dB0, s0, w0); - weight0 += A; - } - weight0 += dW; - } - } - else if (a.zero) - { - s0 = _mm512_set1_epi32(a.zero); - for (size_t kx = 0; kx < kX; kx += p.dilationX) - { - for (size_t offs = 0, end = srcC; offs < end; offs += 4) - { - w0 = _mm512_loadu_si512((__m512i*)weight0); - if (M > 0x0) Madd4(d00, s0, w0); - if (M > 0x1) Madd4(d10, s0, w0); - if (M > 0x2) Madd4(d20, s0, w0); - if (M > 0x3) Madd4(d30, s0, w0); - if (M > 0x4) Madd4(d40, s0, w0); - if (M > 0x5) Madd4(d50, s0, w0); - if (M > 0x6) Madd4(d60, s0, w0); - if (M > 0x7) Madd4(d70, s0, w0); - if (M > 0x8) Madd4(d80, s0, w0); - if (M > 0x9) Madd4(d90, s0, w0); - if (M > 0xA) Madd4(dA0, s0, w0); - if (M > 0xB) Madd4(dB0, s0, w0); - weight0 += A; - } - weight0 += dW; - } - } - else - weight0 += dWz; - } - __mmask16 tail = TailMask16(dstC); - if (M > 0x0) Save1(dst, buf, d00, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0x1) Save1(dst, buf, d10, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0x2) Save1(dst, buf, d20, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0x3) Save1(dst, buf, d30, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0x4) Save1(dst, buf, d40, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0x5) Save1(dst, buf, d50, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0x6) Save1(dst, buf, d60, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0x7) Save1(dst, buf, d70, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0x8) Save1(dst, buf, d80, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0x9) Save1(dst, buf, d90, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0xA) Save1(dst, buf, dA0, norm, bias, params, scale, shift, tail), dst += 
dD, buf += dB;
-            if (M > 0xB) Save1<term, type>(dst, buf, dB0, norm, bias, params, scale, shift, tail), dst += dD, buf += dB;
-            }
-        }
-
-        typedef void(*ConvolutionNhwcDirect_2xM_Ptr)(const uint8_t* src0, const ConvParam8i& p, const AlgParam& a, size_t dy, size_t dx, size_t srcC, size_t dstC,
-            const int8_t* weight0, const __m512i* bias, const __m512i* params, const __m512* scale, const __m512* shift, int32_t* buf, uint8_t* dst);
-
-        template<Term8iType term, SimdConvolutionActivationType type, bool overflow> ConvolutionNhwcDirect_2xM_Ptr GetConvolutionNhwcDirect_2xM(size_t M)
-        {
-            switch (M)
-            {
-            case 0x0: return NULL;
-            case 0x1: return ConvolutionNhwcDirect_2xM<term, type, overflow, 0x1>;
-            case 0x2: return ConvolutionNhwcDirect_2xM<term, type, overflow, 0x2>;
-            case 0x3: return ConvolutionNhwcDirect_2xM<term, type, overflow, 0x3>;
-            case 0x4: return ConvolutionNhwcDirect_2xM<term, type, overflow, 0x4>;
-            case 0x5: return ConvolutionNhwcDirect_2xM<term, type, overflow, 0x5>;
-            case 0x6: return ConvolutionNhwcDirect_2xM<term, type, overflow, 0x6>;
-            case 0x7: return ConvolutionNhwcDirect_2xM<term, type, overflow, 0x7>;
-            case 0x8: return ConvolutionNhwcDirect_2xM<term, type, overflow, 0x8>;
-            case 0x9: return ConvolutionNhwcDirect_2xM<term, type, overflow, 0x9>;
-            case 0xA: return ConvolutionNhwcDirect_2xM<term, type, overflow, 0xA>;
-            case 0xB: return ConvolutionNhwcDirect_2xM<term, type, overflow, 0xB>;
-            }
-            assert(0);
-            return NULL;
-        }
-
-        template<Term8iType term, SimdConvolutionActivationType type, bool overflow> void ConvolutionNhwcDirect_2(const uint8_t* src,
-            const ConvParam8i & p, const AlgParam & a, size_t dstC, size_t yBeg, size_t yEnd, size_t srcC, const int8_t* weight,
-            const int32_t* bias, const int32_t * params, const float * scale, const float* shift, int32_t* buf, uint8_t* dst)
-        {
-            size_t noseH = p.NoseH(), noseW = p.NoseW(), bodyH = p.BodyH(), bodyW = p.BodyW();
-            size_t n = 12, bodyWn = AlignLoAny(bodyW - noseW, n) + noseW, m = bodyW - bodyWn;
-            ConvolutionNhwcDirect_2xM_Ptr convolutionNhwcDirect_2xM = GetConvolutionNhwcDirect_2xM<term, type, overflow>(m);
-            size_t tailH = p.dstH, tailW = p.dstW;
-            size_t kY = p.kernelY - noseH, kX = p.kernelX - noseW, kH = bodyH + p.kernelY - 1, kW = bodyW + p.kernelX - 1;
-            __m512i _params[2], _bias[2];
-            _params[0] = _mm512_setzero_si512();
-            if (type == ::SimdConvolutionActivationRestrictRange)
-                _params[1] = _mm512_set1_epi32(a.high);
-            __m512 _scale[2], _shift[2];
-
-            for (size_t dc = 0; dc < dstC; dc += DF)
-            {
-                size_t dC = Simd::Min(DF, dstC - dc);
-                _bias[0] = _mm512_loadu_si512((__m512i*)(bias + dc + 0));
-                _bias[1] = _mm512_loadu_si512((__m512i*)(bias + dc + F));
-                _scale[0] = _mm512_loadu_ps(scale + dc + 0);
-                _scale[1] = _mm512_loadu_ps(scale + dc + F);
-                _shift[0] = _mm512_loadu_ps(shift + dc + 0);
-                _shift[1] = _mm512_loadu_ps(shift + dc + F);
-
-                uint8_t * d = dst + (dc + yBeg * p.dstW * p.dstC) * a.size;
-                int32_t * b = buf + dc + yBeg * p.dstW * p.dstC;
-                size_t dy = yBeg;
-                for (; dy < noseH && dy < yEnd; dy++)
-                {
-                    size_t dx = 0;
-                    for (; dx < noseW; dx++, b += p.dstC, d += p.dstC * a.size)
-                        ConvolutionNhwcDirect_2x1<term, type, overflow>(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, _scale, _shift, b, d);
-                    for (; dx < bodyWn; dx += n, b += p.dstC * n, d += p.dstC * a.size * n)
-                        ConvolutionNhwcDirect_2x12<term, type, overflow>(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, _scale, _shift, b, d);
-                    for (; dx < bodyW; dx += m, b += p.dstC * m, d += p.dstC * a.size * m)
-                        convolutionNhwcDirect_2xM(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, _scale, _shift, b, d);
-                    for (; dx < tailW; dx++, b += p.dstC, d += p.dstC * a.size)
-                        ConvolutionNhwcDirect_2x1<term, type, overflow>(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, _scale, _shift, b, d);
-                }
-                for (; dy < bodyH && dy < yEnd; dy++)
-                {
-                    size_t dx = 0;
-                    for (; dx < noseW; dx++, b += p.dstC, d += p.dstC * a.size)
-                        ConvolutionNhwcDirect_2x1<term, type, overflow>(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, _scale, _shift, b, d);
-                    for (; dx < bodyWn; dx += n, b
+= p.dstC * n, d += p.dstC * a.size * n) - ConvolutionNhwcDirect_2x12(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, _scale, _shift, b, d); - for (; dx < bodyW; dx += m, b += p.dstC * m, d += p.dstC * a.size * m) - convolutionNhwcDirect_2xM(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, _scale, _shift, b, d); - for (; dx < tailW; dx++, b += p.dstC, d += p.dstC * a.size) - ConvolutionNhwcDirect_2x1(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, _scale, _shift, b, d); - } - for (; dy < tailH && dy < yEnd; dy++) - { - size_t dx = 0; - for (; dx < noseW; dx++, b += p.dstC, d += p.dstC * a.size) - ConvolutionNhwcDirect_2x1(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, _scale, _shift, b, d); - for (; dx < bodyWn; dx += n, b += p.dstC * n, d += p.dstC * a.size * n) - ConvolutionNhwcDirect_2x12(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, _scale, _shift, b, d); - for (; dx < bodyW; dx += m, b += p.dstC * m, d += p.dstC * a.size * m) - convolutionNhwcDirect_2xM(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, _scale, _shift, b, d); - for (; dx < tailW; dx++, b += p.dstC, d += p.dstC * a.size) - ConvolutionNhwcDirect_2x1(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, _scale, _shift, b, d); - } - weight += p.kernelY * p.kernelX * DivHi(p.srcC, 4) * DA; - } - } - - //--------------------------------------------------------------------- - - template void ConvolutionNhwcDirect1x1_2x12( - const uint8_t* src0, const ConvParam8i& p, const AlgParam& a, size_t srcC, size_t dstC, const int8_t* weight0, - const __m512i* bias, const __m512i* params, const __m512* scale, const __m512* shift, int32_t* buf, uint8_t* dst) - { - __m512i d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, d60, d61, d70, d71, d80, d81, d90, d91, dA0, dA1, dB0, dB1, s0, w0, w1; - size_t dS = p.srcC * p.strideX, dD = p.dstC * a.size, dB = p.dstC; - const int8_t* weight1 = weight0 + DivHi(p.srcC, 4) * A; - const uint8_t* src1 = src0 + 1 * dS; - const uint8_t* src2 = src0 + 2 * dS; - const uint8_t* src3 = src0 + 3 * dS; - const uint8_t* src4 = src0 + 4 * dS; - const uint8_t* src5 = src0 + 5 * dS; - __m512i norm = _mm512_set1_epi32(a.norm); - if (dstC > F) - { - d00 = _mm512_setzero_si512(), d01 = _mm512_setzero_si512(); - d10 = _mm512_setzero_si512(), d11 = _mm512_setzero_si512(); - d20 = _mm512_setzero_si512(), d21 = _mm512_setzero_si512(); - d30 = _mm512_setzero_si512(), d31 = _mm512_setzero_si512(); - d40 = _mm512_setzero_si512(), d41 = _mm512_setzero_si512(); - d50 = _mm512_setzero_si512(), d51 = _mm512_setzero_si512(); - d60 = _mm512_setzero_si512(), d61 = _mm512_setzero_si512(); - d70 = _mm512_setzero_si512(), d71 = _mm512_setzero_si512(); - d80 = _mm512_setzero_si512(), d81 = _mm512_setzero_si512(); - d90 = _mm512_setzero_si512(), d91 = _mm512_setzero_si512(); - dA0 = _mm512_setzero_si512(), dA1 = _mm512_setzero_si512(); - dB0 = _mm512_setzero_si512(), dB1 = _mm512_setzero_si512(); - for (size_t offs0 = 0, offs6 = offs0 + 6 * dS; offs0 < srcC; offs0 += 4, offs6 += 4) - { - w0 = _mm512_loadu_si512((__m512i*)weight0); - w1 = _mm512_loadu_si512((__m512i*)weight1); - s0 = Set4(src0 + offs0), Madd4(d00, s0, w0), Madd4(d01, s0, w1); - s0 = Set4(src1 + offs0), Madd4(d10, s0, w0), Madd4(d11, s0, w1); - s0 = Set4(src2 + offs0), Madd4(d20, s0, w0), Madd4(d21, s0, w1); - s0 = Set4(src3 + offs0), Madd4(d30, s0, w0), Madd4(d31, s0, w1); - s0 = Set4(src4 + offs0), Madd4(d40, s0, w0), Madd4(d41, s0, w1); - s0 = Set4(src5 + offs0), Madd4(d50, s0, w0), Madd4(d51, s0, w1); - s0 = Set4(src0 + offs6), 
Madd4(d60, s0, w0), Madd4(d61, s0, w1); - s0 = Set4(src1 + offs6), Madd4(d70, s0, w0), Madd4(d71, s0, w1); - s0 = Set4(src2 + offs6), Madd4(d80, s0, w0), Madd4(d81, s0, w1); - s0 = Set4(src3 + offs6), Madd4(d90, s0, w0), Madd4(d91, s0, w1); - s0 = Set4(src4 + offs6), Madd4(dA0, s0, w0), Madd4(dA1, s0, w1); - s0 = Set4(src5 + offs6), Madd4(dB0, s0, w0), Madd4(dB1, s0, w1); - weight0 += A, weight1 += A; - } - __mmask16 tail = TailMask16(dstC - F); - Save2(dst, buf, d00, d01, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save2(dst, buf, d10, d11, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save2(dst, buf, d20, d21, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save2(dst, buf, d30, d31, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save2(dst, buf, d40, d41, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save2(dst, buf, d50, d51, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save2(dst, buf, d60, d61, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save2(dst, buf, d70, d71, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save2(dst, buf, d80, d81, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save2(dst, buf, d90, d91, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save2(dst, buf, dA0, dA1, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save2(dst, buf, dB0, dB1, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - } - else - { - d00 = _mm512_setzero_si512(); - d10 = _mm512_setzero_si512(); - d20 = _mm512_setzero_si512(); - d30 = _mm512_setzero_si512(); - d40 = _mm512_setzero_si512(); - d50 = _mm512_setzero_si512(); - d60 = _mm512_setzero_si512(); - d70 = _mm512_setzero_si512(); - d80 = _mm512_setzero_si512(); - d90 = _mm512_setzero_si512(); - dA0 = _mm512_setzero_si512(); - dB0 = _mm512_setzero_si512(); - for (size_t offs0 = 0, offs6 = offs0 + 6 * dS; offs0 < srcC; offs0 += 4, offs6 += 4) - { - w0 = _mm512_loadu_si512((__m512i*)weight0); - s0 = Set4(src0 + offs0), Madd4(d00, s0, w0); - s0 = Set4(src1 + offs0), Madd4(d10, s0, w0); - s0 = Set4(src2 + offs0), Madd4(d20, s0, w0); - s0 = Set4(src3 + offs0), Madd4(d30, s0, w0); - s0 = Set4(src4 + offs0), Madd4(d40, s0, w0); - s0 = Set4(src5 + offs0), Madd4(d50, s0, w0); - s0 = Set4(src0 + offs6), Madd4(d60, s0, w0); - s0 = Set4(src1 + offs6), Madd4(d70, s0, w0); - s0 = Set4(src2 + offs6), Madd4(d80, s0, w0); - s0 = Set4(src3 + offs6), Madd4(d90, s0, w0); - s0 = Set4(src4 + offs6), Madd4(dA0, s0, w0); - s0 = Set4(src5 + offs6), Madd4(dB0, s0, w0); - weight0 += A; - } - __mmask16 tail = TailMask16(dstC); - Save1(dst, buf, d00, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save1(dst, buf, d10, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save1(dst, buf, d20, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save1(dst, buf, d30, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save1(dst, buf, d40, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save1(dst, buf, d50, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save1(dst, buf, d60, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save1(dst, buf, d70, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save1(dst, buf, d80, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save1(dst, buf, d90, norm, bias, params, scale, shift, tail), dst += dD, buf += 
dB; - Save1(dst, buf, dA0, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save1(dst, buf, dB0, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - } - } - - template void ConvolutionNhwcDirect1x1_2xM( - const uint8_t* src0, const ConvParam8i& p, const AlgParam& a, size_t srcC, size_t dstC, const int8_t* weight0, - const __m512i* bias, const __m512i* params, const __m512* scale, const __m512* shift, int32_t* buf, uint8_t* dst) - { - __m512i d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, d60, d61, d70, d71, d80, d81, d90, d91, dA0, dA1, dB0, dB1, s0, w0, w1; - size_t dS = p.srcC * p.strideX, dD = p.dstC * a.size, dB = p.dstC; - const int8_t* weight1 = weight0 + DivHi(p.srcC, 4) * A; - const uint8_t* src1 = src0 + 1 * dS; - const uint8_t* src2 = src0 + 2 * dS; - const uint8_t* src3 = src0 + 3 * dS; - const uint8_t* src4 = src0 + 4 * dS; - const uint8_t* src5 = src0 + 5 * dS; - __m512i norm = _mm512_set1_epi32(a.norm); - if (dstC > F) - { - if (M > 0x0) d00 = _mm512_setzero_si512(), d01 = _mm512_setzero_si512(); - if (M > 0x1) d10 = _mm512_setzero_si512(), d11 = _mm512_setzero_si512(); - if (M > 0x2) d20 = _mm512_setzero_si512(), d21 = _mm512_setzero_si512(); - if (M > 0x3) d30 = _mm512_setzero_si512(), d31 = _mm512_setzero_si512(); - if (M > 0x4) d40 = _mm512_setzero_si512(), d41 = _mm512_setzero_si512(); - if (M > 0x5) d50 = _mm512_setzero_si512(), d51 = _mm512_setzero_si512(); - if (M > 0x6) d60 = _mm512_setzero_si512(), d61 = _mm512_setzero_si512(); - if (M > 0x7) d70 = _mm512_setzero_si512(), d71 = _mm512_setzero_si512(); - if (M > 0x8) d80 = _mm512_setzero_si512(), d81 = _mm512_setzero_si512(); - if (M > 0x9) d90 = _mm512_setzero_si512(), d91 = _mm512_setzero_si512(); - if (M > 0xA) dA0 = _mm512_setzero_si512(), dA1 = _mm512_setzero_si512(); - if (M > 0xB) dB0 = _mm512_setzero_si512(), dB1 = _mm512_setzero_si512(); - for (size_t offs0 = 0, offs6 = offs0 + 6 * dS; offs0 < srcC; offs0 += 4, offs6 += 4) - { - w0 = _mm512_loadu_si512((__m512i*)weight0); - w1 = _mm512_loadu_si512((__m512i*)weight1); - if (M > 0x0) s0 = Set4(src0 + offs0), Madd4(d00, s0, w0), Madd4(d01, s0, w1); - if (M > 0x1) s0 = Set4(src1 + offs0), Madd4(d10, s0, w0), Madd4(d11, s0, w1); - if (M > 0x2) s0 = Set4(src2 + offs0), Madd4(d20, s0, w0), Madd4(d21, s0, w1); - if (M > 0x3) s0 = Set4(src3 + offs0), Madd4(d30, s0, w0), Madd4(d31, s0, w1); - if (M > 0x4) s0 = Set4(src4 + offs0), Madd4(d40, s0, w0), Madd4(d41, s0, w1); - if (M > 0x5) s0 = Set4(src5 + offs0), Madd4(d50, s0, w0), Madd4(d51, s0, w1); - if (M > 0x6) s0 = Set4(src0 + offs6), Madd4(d60, s0, w0), Madd4(d61, s0, w1); - if (M > 0x7) s0 = Set4(src1 + offs6), Madd4(d70, s0, w0), Madd4(d71, s0, w1); - if (M > 0x8) s0 = Set4(src2 + offs6), Madd4(d80, s0, w0), Madd4(d81, s0, w1); - if (M > 0x9) s0 = Set4(src3 + offs6), Madd4(d90, s0, w0), Madd4(d91, s0, w1); - if (M > 0xA) s0 = Set4(src4 + offs6), Madd4(dA0, s0, w0), Madd4(dA1, s0, w1); - if (M > 0xB) s0 = Set4(src5 + offs6), Madd4(dB0, s0, w0), Madd4(dB1, s0, w1); - weight0 += A, weight1 += A; - } - __mmask16 tail = TailMask16(dstC - F); - if (M > 0x0) Save2(dst, buf, d00, d01, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0x1) Save2(dst, buf, d10, d11, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0x2) Save2(dst, buf, d20, d21, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0x3) Save2(dst, buf, d30, d31, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0x4) Save2(dst, 
buf, d40, d41, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0x5) Save2(dst, buf, d50, d51, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0x6) Save2(dst, buf, d60, d61, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0x7) Save2(dst, buf, d70, d71, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0x8) Save2(dst, buf, d80, d81, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0x9) Save2(dst, buf, d90, d91, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0xA) Save2(dst, buf, dA0, dA1, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0xB) Save2(dst, buf, dB0, dB1, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - } - else - { - if (M > 0x0) d00 = _mm512_setzero_si512(); - if (M > 0x1) d10 = _mm512_setzero_si512(); - if (M > 0x2) d20 = _mm512_setzero_si512(); - if (M > 0x3) d30 = _mm512_setzero_si512(); - if (M > 0x4) d40 = _mm512_setzero_si512(); - if (M > 0x5) d50 = _mm512_setzero_si512(); - if (M > 0x6) d60 = _mm512_setzero_si512(); - if (M > 0x7) d70 = _mm512_setzero_si512(); - if (M > 0x8) d80 = _mm512_setzero_si512(); - if (M > 0x9) d90 = _mm512_setzero_si512(); - if (M > 0xA) dA0 = _mm512_setzero_si512(); - if (M > 0xB) dB0 = _mm512_setzero_si512(); - for (size_t offs0 = 0, offs6 = offs0 + 6 * dS; offs0 < srcC; offs0 += 4, offs6 += 4) - { - w0 = _mm512_loadu_si512((__m512i*)weight0); - if (M > 0x0) s0 = Set4(src0 + offs0), Madd4(d00, s0, w0); - if (M > 0x1) s0 = Set4(src1 + offs0), Madd4(d10, s0, w0); - if (M > 0x2) s0 = Set4(src2 + offs0), Madd4(d20, s0, w0); - if (M > 0x3) s0 = Set4(src3 + offs0), Madd4(d30, s0, w0); - if (M > 0x4) s0 = Set4(src4 + offs0), Madd4(d40, s0, w0); - if (M > 0x5) s0 = Set4(src5 + offs0), Madd4(d50, s0, w0); - if (M > 0x6) s0 = Set4(src0 + offs6), Madd4(d60, s0, w0); - if (M > 0x7) s0 = Set4(src1 + offs6), Madd4(d70, s0, w0); - if (M > 0x8) s0 = Set4(src2 + offs6), Madd4(d80, s0, w0); - if (M > 0x9) s0 = Set4(src3 + offs6), Madd4(d90, s0, w0); - if (M > 0xA) s0 = Set4(src4 + offs6), Madd4(dA0, s0, w0); - if (M > 0xB) s0 = Set4(src5 + offs6), Madd4(dB0, s0, w0); - weight0 += A; - } - __mmask16 tail = TailMask16(dstC); - if (M > 0x0) Save1(dst, buf, d00, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0x1) Save1(dst, buf, d10, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0x2) Save1(dst, buf, d20, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0x3) Save1(dst, buf, d30, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0x4) Save1(dst, buf, d40, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0x5) Save1(dst, buf, d50, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0x6) Save1(dst, buf, d60, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0x7) Save1(dst, buf, d70, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0x8) Save1(dst, buf, d80, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0x9) Save1(dst, buf, d90, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0xA) Save1(dst, buf, dA0, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0xB) Save1(dst, buf, dB0, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - } - } - - typedef void(*ConvolutionNhwcDirect1x1_2xM_Ptr)(const uint8_t* src0, const 
ConvParam8i& p, const AlgParam& a, size_t srcC, size_t dstC, - const int8_t* weight0, const __m512i* bias, const __m512i* params, const __m512* scale, const __m512* shift, int32_t* buf, uint8_t* dst); - - template ConvolutionNhwcDirect1x1_2xM_Ptr GetConvolutionNhwcDirect1x1_2xM(size_t M) - { - switch (M) - { - case 0x0: return NULL; - case 0x1: return ConvolutionNhwcDirect1x1_2xM; - case 0x2: return ConvolutionNhwcDirect1x1_2xM; - case 0x3: return ConvolutionNhwcDirect1x1_2xM; - case 0x4: return ConvolutionNhwcDirect1x1_2xM; - case 0x5: return ConvolutionNhwcDirect1x1_2xM; - case 0x6: return ConvolutionNhwcDirect1x1_2xM; - case 0x7: return ConvolutionNhwcDirect1x1_2xM; - case 0x8: return ConvolutionNhwcDirect1x1_2xM; - case 0x9: return ConvolutionNhwcDirect1x1_2xM; - case 0xA: return ConvolutionNhwcDirect1x1_2xM; - case 0xB: return ConvolutionNhwcDirect1x1_2xM; - } - assert(0); - return NULL; - } - - template void ConvolutionNhwcDirect1x1_2(const uint8_t* src, - const ConvParam8i& p, const AlgParam& a, size_t dstC, size_t yBeg, size_t yEnd, size_t srcC, const int8_t* weight, - const int32_t* bias, const int32_t* params, const float* scale, const float* shift, int32_t* buf, uint8_t* dst) - { - size_t n1 = (yEnd - yBeg) * p.dstW, n12 = AlignLoAny(n1, 12), m = n1 - n12; - ConvolutionNhwcDirect1x1_2xM_Ptr convolutionNhwcDirect1x1_2xM = GetConvolutionNhwcDirect1x1_2xM(m); - __m512i _params[2], _bias[2]; - _params[0] = _mm512_setzero_si512(); - if (type == ::SimdConvolutionActivationRestrictRange) - _params[1] = _mm512_set1_epi32(a.high); - __m512 _scale[2], _shift[2]; - - for (size_t dc = 0; dc < dstC; dc += DF) - { - size_t dC = Simd::Min(DF, dstC - dc); - _bias[0] = _mm512_loadu_si512((__m512i*)(bias + dc + 0)); - _bias[1] = _mm512_loadu_si512((__m512i*)(bias + dc + F)); - _scale[0] = _mm512_loadu_ps(scale + dc + 0); - _scale[1] = _mm512_loadu_ps(scale + dc + F); - _shift[0] = _mm512_loadu_ps(shift + dc + 0); - _shift[1] = _mm512_loadu_ps(shift + dc + F); - const uint8_t* s = src + yBeg * p.srcW * p.srcC; - uint8_t* d = dst + (dc + yBeg * p.dstW * p.dstC) * a.size; - int32_t* b = buf + dc + yBeg * p.dstW * p.dstC; - size_t i = 0; - for (; i < n12; i += 12, s += p.srcC * 12, b += p.dstC * 12, d += p.dstC * a.size * 12) - ConvolutionNhwcDirect1x1_2x12(s, p, a, srcC, dC, weight, _bias, _params, _scale, _shift, b, d); - for (; i < n1; i += m, s += p.srcC * m, b += p.dstC * m, d += p.dstC * a.size * m) - convolutionNhwcDirect1x1_2xM(s, p, a, srcC, dC, weight, _bias, _params, _scale, _shift, b, d); - weight += DivHi(p.srcC, 4) * DA; - } - } - - //--------------------------------------------------------------------- - - template void Set(const ConvParam8i& p, const AlgParam & a, ConvolutionPtr * d) - { - if (p.Is1x1()) - { - switch (a.microD) - { - case 2 * F: d[term] = ConvolutionNhwcDirect1x1_2; break; - default: - assert(0); - } - } - else - { - switch (a.microD) - { - case 2 * F: d[term] = ConvolutionNhwcDirect_2; break; - default: - assert(0); - } - } - } - - template void Set(const ConvParam8i& p, const AlgParam& a, ConvolutionPtr* d) - { - if (p.compatibility & SimdSynetCompatibilityNoFma) - Set(p, a, d); - else - Set(p, a, d); - } - - template void Set(const ConvParam8i& p, const AlgParam& a, ConvolutionPtr* d) - { - if (p.compatibility & SimdSynetCompatibilityOverflow16i) - Set(p, a, d); - else - Set(p, a, d); - } - - template void Set(const ConvParam8i& p, const AlgParam& a, ConvolutionPtr* d) - { - Set(p, a, d); - Set(p, a, d); - Set(p, a, d); - Set(p, a, d); - Set(p, a, d); - Set(p, a, 
d); - } - - static void Set(const ConvParam8i& p, const AlgParam& a, ConvolutionPtr * d) - { - switch (p.activation) - { - case SimdConvolutionActivationIdentity: Set(p, a, d); break; - case SimdConvolutionActivationRelu: Set(p, a, d); break; - case SimdConvolutionActivationRestrictRange: Set(p, a, d); break; - default: assert(0); - } - } - - SynetConvolution8iNhwcDirect::SynetConvolution8iNhwcDirect(const ConvParam8i& p) - : Avx2::SynetConvolution8iNhwcDirect(p) - { - SetAlgParam(F, 2 * F, Base::AlgCacheL1(), Base::AlgCacheL2(), Base::AlgCacheL3()); - Set(p, _alg, _convolutions); - _convertSrc = Avx512bw::SynetConvert32fTo8u; - } - - //--------------------------------------------------------------------- - - void * SynetConvolution8iInit(size_t batch, const SimdConvolutionParameters * conv, SimdSynetCompatibilityType compatibility) - { - ConvParam8i param(batch, conv, compatibility); - if (!param.Valid()) - return NULL; - else if (SynetConvolution8iNhwcDirect::Preferable(param)) - return new SynetConvolution8iNhwcDirect(param); - else - return new Base::SynetConvolution8iGemmNN(param); - } - } -#endif -} diff --git a/src/3rd/Simd/Simd/SimdAvx512bwSynetPooling.cpp b/src/3rd/Simd/Simd/SimdAvx512bwSynetPooling.cpp deleted file mode 100644 index cc3fd346..00000000 --- a/src/3rd/Simd/Simd/SimdAvx512bwSynetPooling.cpp +++ /dev/null @@ -1,175 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE.
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdBase.h" -#include "Simd/SimdSse41.h" -#include "Simd/SimdAvx2.h" -#include "Simd/SimdAvx512bw.h" - -namespace Simd -{ -#ifdef SIMD_AVX512BW_ENABLE - namespace Avx512bw - { - SIMD_INLINE void PoolingMaxNhwc1(const uint8_t* src, size_t srcS, size_t srcC, size_t kH, size_t kW, const __m512i& min, uint8_t* dst, __mmask64 tail = -1) - { - __m512i max0 = min; - for (size_t h = 0; h < kH; ++h) - { - for (size_t w = 0; w < kW; ++w) - { - max0 = _mm512_max_epu8(max0, _mm512_maskz_loadu_epi8(tail, src + w * srcC)); - } - src += srcS; - } - _mm512_mask_storeu_epi8(dst, tail, max0); - } - - SIMD_INLINE void PoolingMaxNhwc2(const uint8_t* src, size_t srcS, size_t srcC, size_t kH, size_t kW, const __m512i& min, uint8_t* dst) - { - __m512i max0 = min; - __m512i max1 = min; - for (size_t h = 0; h < kH; ++h) - { - for (size_t w = 0; w < kW; ++w) - { - const __m512i* ps = (__m512i*)(src + w * srcC); - max0 = _mm512_max_epu8(max0, _mm512_loadu_si512(ps + 0)); - max1 = _mm512_max_epu8(max1, _mm512_loadu_si512(ps + 1)); - } - src += srcS; - } - _mm512_storeu_si512((__m512i*)dst + 0, max0); - _mm512_storeu_si512((__m512i*)dst + 1, max1); - } - - SIMD_INLINE void PoolingMaxNhwc4(const uint8_t* src, size_t srcS, size_t srcC, size_t kH, size_t kW, const __m512i& min, uint8_t* dst) - { - __m512i max0 = min; - __m512i max1 = min; - __m512i max2 = min; - __m512i max3 = min; - for (size_t h = 0; h < kH; ++h) - { - for (size_t w = 0; w < kW; ++w) - { - const __m512i* ps = (__m512i*)(src + w * srcC); - max0 = _mm512_max_epu8(max0, _mm512_loadu_si512(ps + 0)); - max1 = _mm512_max_epu8(max1, _mm512_loadu_si512(ps + 1)); - max2 = _mm512_max_epu8(max2, _mm512_loadu_si512(ps + 2)); - max3 = _mm512_max_epu8(max3, _mm512_loadu_si512(ps + 3)); - } - src += srcS; - } - _mm512_storeu_si512((__m512i*)dst + 0, max0); - _mm512_storeu_si512((__m512i*)dst + 1, max1); - _mm512_storeu_si512((__m512i*)dst + 2, max2); - _mm512_storeu_si512((__m512i*)dst + 3, max3); - } - - SIMD_INLINE void PoolingMaxNhwc8(const uint8_t* src, size_t srcS, size_t srcC, size_t kH, size_t kW, const __m512i& min, uint8_t* dst) - { - __m512i max0 = min; - __m512i max1 = min; - __m512i max2 = min; - __m512i max3 = min; - __m512i max4 = min; - __m512i max5 = min; - __m512i max6 = min; - __m512i max7 = min; - for (size_t h = 0; h < kH; ++h) - { - for (size_t w = 0; w < kW; ++w) - { - const __m512i* ps = (__m512i*)(src + w * srcC); - max0 = _mm512_max_epu8(max0, _mm512_loadu_si512(ps + 0)); - max1 = _mm512_max_epu8(max1, _mm512_loadu_si512(ps + 1)); - max2 = _mm512_max_epu8(max2, _mm512_loadu_si512(ps + 2)); - max3 = _mm512_max_epu8(max3, _mm512_loadu_si512(ps + 3)); - max4 = _mm512_max_epu8(max4, _mm512_loadu_si512(ps + 4)); - max5 = _mm512_max_epu8(max5, _mm512_loadu_si512(ps + 5)); - max6 = _mm512_max_epu8(max6, _mm512_loadu_si512(ps + 6)); - max7 = _mm512_max_epu8(max7, _mm512_loadu_si512(ps + 7)); - } - src += srcS; - } - _mm512_storeu_si512((__m512i*)dst + 0, max0); - _mm512_storeu_si512((__m512i*)dst + 1, max1); - _mm512_storeu_si512((__m512i*)dst + 2, max2); - _mm512_storeu_si512((__m512i*)dst + 3, max3); - _mm512_storeu_si512((__m512i*)dst + 4, max4); - _mm512_storeu_si512((__m512i*)dst + 5, max5); - _mm512_storeu_si512((__m512i*)dst + 6, max6); - _mm512_storeu_si512((__m512i*)dst + 7, max7); - } - - void SynetPoolingForwardMax8u(const uint8_t* src, size_t srcC, size_t srcH, size_t srcW, size_t kernelY, size_t kernelX, - size_t strideY, size_t strideX, size_t padY, size_t 
padX, uint8_t* dst, size_t dstH, size_t dstW, SimdTensorFormatType format) - { - if (format == SimdTensorFormatNhwc) - { - size_t srcS = srcW * srcC; - size_t srcCA1 = AlignLo(srcC, 1 * A); - size_t srcCA2 = AlignLo(srcC, 2 * A); - size_t srcCA4 = AlignLo(srcC, 4 * A); - size_t srcCA8 = AlignLo(srcC, 8 * A); - __mmask64 tail = TailMask64(srcC - srcCA1); - __m512i min = _mm512_set1_epi8(0); - for (size_t ph = 0; ph < dstH; ++ph) - { - size_t hStart = ph * strideY - padY; - size_t hEnd = Simd::Min(hStart + kernelY, srcH); - hStart = Simd::Max(0, hStart); - for (size_t pw = 0; pw < dstW; ++pw) - { - size_t wStart = pw * strideX - padX; - size_t wEnd = Simd::Min(wStart + kernelX, srcW); - wStart = Simd::Max(0, wStart); - const uint8_t* ps = src + hStart * srcS + wStart * srcC; - size_t c = 0; - for (; c < srcCA8; c += 8 * A) - PoolingMaxNhwc8(ps + c, srcS, srcC, hEnd - hStart, wEnd - wStart, min, dst + c); - for (; c < srcCA4; c += 4 * A) - PoolingMaxNhwc4(ps + c, srcS, srcC, hEnd - hStart, wEnd - wStart, min, dst + c); - for (; c < srcCA2; c += 2 * A) - PoolingMaxNhwc2(ps + c, srcS, srcC, hEnd - hStart, wEnd - wStart, min, dst + c); - for (; c < srcCA1; c += 1 * A) - PoolingMaxNhwc1(ps + c, srcS, srcC, hEnd - hStart, wEnd - wStart, min, dst + c); - if (c < srcC) - PoolingMaxNhwc1(ps + c, srcS, srcC, hEnd - hStart, wEnd - wStart, min, dst + c, tail); - dst += srcC; - } - } - } - else if (format == SimdTensorFormatNchw) - { - Base::SynetPoolingForwardMax8u(src, srcC, srcH, srcW, kernelY, kernelX, strideY, strideX, padY, padX, dst, dstH, dstW, format); - } - else - assert(0); - } - } -#endif// SIMD_AVX512BW_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx512bwTexture.cpp b/src/3rd/Simd/Simd/SimdAvx512bwTexture.cpp deleted file mode 100644 index fbeefcbf..00000000 --- a/src/3rd/Simd/Simd/SimdAvx512bwTexture.cpp +++ /dev/null @@ -1,280 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE.
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdSet.h" -#include "Simd/SimdExtract.h" -#include "Simd/SimdBase.h" - -namespace Simd -{ -#ifdef SIMD_AVX512BW_ENABLE - namespace Avx512bw - { - SIMD_INLINE __m512i TextureBoostedSaturatedGradient16(const __m512i & difference, const __m512i & saturation, const __m512i & boost) - { - return _mm512_mullo_epi16(_mm512_max_epi16(K_ZERO, _mm512_add_epi16(saturation, _mm512_min_epi16(difference, saturation))), boost); - } - - SIMD_INLINE __m512i TextureBoostedSaturatedGradient8(const __m512i & a, const __m512i & b, const __m512i & saturation, const __m512i & boost) - { - __m512i lo = TextureBoostedSaturatedGradient16(SubUnpackedU8<0>(b, a), saturation, boost); - __m512i hi = TextureBoostedSaturatedGradient16(SubUnpackedU8<1>(b, a), saturation, boost); - return _mm512_packus_epi16(lo, hi); - } - - template SIMD_INLINE void TextureBoostedSaturatedGradient(const uint8_t * src, uint8_t * dx, uint8_t * dy, - size_t stride, const __m512i & saturation, const __m512i & boost, __mmask64 tail = -1) - { - const __m512i s10 = Load(src - 1, tail); - const __m512i s12 = Load(src + 1, tail); - const __m512i s01 = Load(src - stride, tail); - const __m512i s21 = Load(src + stride, tail); - Store(dx, TextureBoostedSaturatedGradient8(s10, s12, saturation, boost), tail); - Store(dy, TextureBoostedSaturatedGradient8(s01, s21, saturation, boost), tail); - } - - template void TextureBoostedSaturatedGradient(const uint8_t * src, size_t srcStride, size_t width, size_t height, - uint8_t saturation, uint8_t boost, uint8_t * dx, size_t dxStride, uint8_t * dy, size_t dyStride) - { - assert(int(2)*saturation*boost <= 0xFF); - if (align) - assert(Aligned(src) && Aligned(srcStride) && Aligned(dx) && Aligned(dxStride) && Aligned(dy) && Aligned(dyStride)); - - size_t alignedWidth = AlignLo(width, A); - __mmask64 tailMask = TailMask64(width - alignedWidth); - __m512i _saturation = _mm512_set1_epi16(saturation); - __m512i _boost = _mm512_set1_epi16(boost); - - memset(dx, 0, width); - memset(dy, 0, width); - src += srcStride; - dx += dxStride; - dy += dyStride; - for (size_t row = 2; row < height; ++row) - { - size_t col = 0; - for (; col < alignedWidth; col += A) - TextureBoostedSaturatedGradient(src + col, dx + col, dy + col, srcStride, _saturation, _boost); - if (col < width) - TextureBoostedSaturatedGradient(src + col, dx + col, dy + col, srcStride, _saturation, _boost, tailMask); - - dx[0] = 0; - dy[0] = 0; - dx[width - 1] = 0; - dy[width - 1] = 0; - - src += srcStride; - dx += dxStride; - dy += dyStride; - } - memset(dx, 0, width); - memset(dy, 0, width); - } - - void TextureBoostedSaturatedGradient(const uint8_t * src, size_t srcStride, size_t width, size_t height, - uint8_t saturation, uint8_t boost, uint8_t * dx, size_t dxStride, uint8_t * dy, size_t dyStride) - { - if (Aligned(src) && Aligned(srcStride) && Aligned(dx) && Aligned(dxStride) && Aligned(dy) && Aligned(dyStride)) - TextureBoostedSaturatedGradient(src, srcStride, width, height, saturation, boost, dx, dxStride, dy, dyStride); - else - TextureBoostedSaturatedGradient(src, srcStride, width, height, saturation, boost, dx, dxStride, dy, dyStride); - } - - template SIMD_INLINE void TextureBoostedUv(const uint8_t * src, uint8_t * dst, - const __m512i & min8, const __m512i & max8, const __m512i & boost16, __mmask64 tail = -1) - { - const __m512i _src = Load(src, tail); - const __m512i saturated = _mm512_sub_epi8(_mm512_max_epu8(min8, _mm512_min_epu8(max8, _src)), min8); - const __m512i lo 
= _mm512_mullo_epi16(_mm512_unpacklo_epi8(saturated, K_ZERO), boost16); - const __m512i hi = _mm512_mullo_epi16(_mm512_unpackhi_epi8(saturated, K_ZERO), boost16); - Store(dst, _mm512_packus_epi16(lo, hi), tail); - } - - template void TextureBoostedUv(const uint8_t * src, size_t srcStride, size_t width, size_t height, - uint8_t boost, uint8_t * dst, size_t dstStride) - { - assert(boost < 0x80); - if (align) - assert(Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride)); - - size_t alignedWidth = AlignLo(width, A); - __mmask64 tailMask = TailMask64(width - alignedWidth); - int min = 128 - (128 / boost); - int max = 255 - min; - __m512i min8 = _mm512_set1_epi8(min); - __m512i max8 = _mm512_set1_epi8(max); - __m512i boost16 = _mm512_set1_epi16(boost); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < alignedWidth; col += A) - TextureBoostedUv(src + col, dst + col, min8, max8, boost16); - if (col < width) - TextureBoostedUv(src + col, dst + col, min8, max8, boost16, tailMask); - src += srcStride; - dst += dstStride; - } - } - - void TextureBoostedUv(const uint8_t * src, size_t srcStride, size_t width, size_t height, - uint8_t boost, uint8_t * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride)) - TextureBoostedUv(src, srcStride, width, height, boost, dst, dstStride); - else - TextureBoostedUv(src, srcStride, width, height, boost, dst, dstStride); - } - - SIMD_INLINE void TextureGetDifferenceSum(const __m512i & current, const __m512i & average, __m512i & positive, __m512i & negative) - { - positive = _mm512_add_epi64(positive, _mm512_sad_epu8(_mm512_subs_epu8(current, average), K_ZERO)); - negative = _mm512_add_epi64(negative, _mm512_sad_epu8(_mm512_subs_epu8(average, current), K_ZERO)); - } - - template SIMD_INLINE void TextureGetDifferenceSum(const uint8_t * src, const uint8_t * lo, const uint8_t * hi, - __m512i & positive, __m512i & negative, __mmask64 tail = -1) - { - const __m512i current = Load(src, tail); - const __m512i _lo = Load(lo, tail); - const __m512i _hi = Load(hi, tail); - const __m512i average = _mm512_avg_epu8(_lo, _hi); - TextureGetDifferenceSum(current, average, positive, negative); - } - - template SIMD_INLINE void TextureGetDifferenceSum4(const uint8_t * src, const uint8_t * lo, const uint8_t * hi, __m512i & positive, __m512i & negative) - { - TextureGetDifferenceSum(Load(src + 0 * A), _mm512_avg_epu8(Load(hi + 0 * A), Load(lo + 0 * A)), positive, negative); - TextureGetDifferenceSum(Load(src + 1 * A), _mm512_avg_epu8(Load(hi + 1 * A), Load(lo + 1 * A)), positive, negative); - TextureGetDifferenceSum(Load(src + 2 * A), _mm512_avg_epu8(Load(hi + 2 * A), Load(lo + 2 * A)), positive, negative); - TextureGetDifferenceSum(Load(src + 3 * A), _mm512_avg_epu8(Load(hi + 3 * A), Load(lo + 3 * A)), positive, negative); - } - - template void TextureGetDifferenceSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * lo, size_t loStride, const uint8_t * hi, size_t hiStride, int64_t * sum) - { - assert(sum != NULL); - if (align) - assert(Aligned(src) && Aligned(srcStride) && Aligned(lo) && Aligned(loStride) && Aligned(hi) && Aligned(hiStride)); - - size_t alignedWidth = AlignLo(width, A); - size_t fullAlignedWidth = AlignLo(width, QA); - __mmask64 tailMask = TailMask64(width - alignedWidth); - __m512i positive = _mm512_setzero_si512(); - __m512i negative = _mm512_setzero_si512(); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for 
(; col < fullAlignedWidth; col += QA) - TextureGetDifferenceSum4(src + col, lo + col, hi + col, positive, negative); - for (; col < alignedWidth; col += A) - TextureGetDifferenceSum(src + col, lo + col, hi + col, positive, negative); - if (col < width) - TextureGetDifferenceSum(src + col, lo + col, hi + col, positive, negative, tailMask); - src += srcStride; - lo += loStride; - hi += hiStride; - } - *sum = ExtractSum(positive) - ExtractSum(negative); - } - - void TextureGetDifferenceSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * lo, size_t loStride, const uint8_t * hi, size_t hiStride, int64_t * sum) - { - if (Aligned(src) && Aligned(srcStride) && Aligned(lo) && Aligned(loStride) && Aligned(hi) && Aligned(hiStride)) - TextureGetDifferenceSum(src, srcStride, width, height, lo, loStride, hi, hiStride, sum); - else - TextureGetDifferenceSum(src, srcStride, width, height, lo, loStride, hi, hiStride, sum); - } - - template void TexturePerformCompensation(const uint8_t * src, size_t srcStride, size_t width, size_t height, int shift, uint8_t * dst, size_t dstStride) - { - assert(shift > -0xFF && shift < 0xFF && shift != 0); - if (align) - assert(Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride)); - - size_t alignedWidth = AlignLo(width, A); - size_t fullAlignedWidth = AlignLo(width, QA); - __mmask64 tailMask = TailMask64(width - alignedWidth); - if (shift > 0) - { - __m512i _shift = _mm512_set1_epi8((char)shift); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < fullAlignedWidth; col += QA) - { - Store(dst + col + 0 * A, _mm512_adds_epu8(Load(src + col + 0 * A), _shift)); - Store(dst + col + 1 * A, _mm512_adds_epu8(Load(src + col + 1 * A), _shift)); - Store(dst + col + 2 * A, _mm512_adds_epu8(Load(src + col + 2 * A), _shift)); - Store(dst + col + 3 * A, _mm512_adds_epu8(Load(src + col + 3 * A), _shift)); - } - for (; col < alignedWidth; col += A) - Store(dst + col, _mm512_adds_epu8(Load(src + col), _shift)); - if (col < width) - Store(dst + col, _mm512_adds_epu8((Load(src + col, tailMask)), _shift), tailMask); - src += srcStride; - dst += dstStride; - } - } - if (shift < 0) - { - __m512i _shift = _mm512_set1_epi8((char)-shift); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < fullAlignedWidth; col += QA) - { - Store(dst + col + 0 * A, _mm512_subs_epu8(Load(src + col + 0 * A), _shift)); - Store(dst + col + 1 * A, _mm512_subs_epu8(Load(src + col + 1 * A), _shift)); - Store(dst + col + 2 * A, _mm512_subs_epu8(Load(src + col + 2 * A), _shift)); - Store(dst + col + 3 * A, _mm512_subs_epu8(Load(src + col + 3 * A), _shift)); - } - for (; col < alignedWidth; col += A) - Store(dst + col, _mm512_subs_epu8(Load(src + col), _shift)); - if (col < width) - Store(dst + col, _mm512_subs_epu8((Load(src + col, tailMask)), _shift), tailMask); - src += srcStride; - dst += dstStride; - } - } - } - - void TexturePerformCompensation(const uint8_t * src, size_t srcStride, size_t width, size_t height, - int shift, uint8_t * dst, size_t dstStride) - { - if (shift == 0) - { - if (src != dst) - Base::Copy(src, srcStride, width, height, 1, dst, dstStride); - return; - } - if (Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride)) - TexturePerformCompensation(src, srcStride, width, height, shift, dst, dstStride); - else - TexturePerformCompensation(src, srcStride, width, height, shift, dst, dstStride); - } - } -#endif// SIMD_AVX512BW_ENABLE -} diff --git 
a/src/3rd/Simd/Simd/SimdAvx512bwYuvToBgr.cpp b/src/3rd/Simd/Simd/SimdAvx512bwYuvToBgr.cpp deleted file mode 100644 index bfde8698..00000000 --- a/src/3rd/Simd/Simd/SimdAvx512bwYuvToBgr.cpp +++ /dev/null @@ -1,360 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdConversion.h" - -namespace Simd -{ -#ifdef SIMD_AVX512BW_ENABLE - namespace Avx512bw - { - template SIMD_INLINE void YuvToBgr(const __m512i & y, const __m512i & u, const __m512i & v, uint8_t * bgr, const __mmask64 * tails) - { - __m512i blue = YuvToBlue(y, u); - __m512i green = YuvToGreen(y, u, v); - __m512i red = YuvToRed(y, v); - Store(bgr + 0 * A, InterleaveBgr<0>(blue, green, red), tails[0]); - Store(bgr + 1 * A, InterleaveBgr<1>(blue, green, red), tails[1]); - Store(bgr + 2 * A, InterleaveBgr<2>(blue, green, red), tails[2]); - } - - template SIMD_INLINE void Yuv420pToBgr(const uint8_t * y0, const uint8_t * y1, const uint8_t * u, const uint8_t * v, uint8_t * bgr0, uint8_t * bgr1, const __mmask64 * tails) - { - __m512i _u = _mm512_permutexvar_epi64(K64_PERMUTE_FOR_UNPACK, (Load(u, tails[0]))); - __m512i u0 = UnpackU8<0>(_u, _u); - __m512i u1 = UnpackU8<1>(_u, _u); - __m512i _v = _mm512_permutexvar_epi64(K64_PERMUTE_FOR_UNPACK, (Load(v, tails[0]))); - __m512i v0 = UnpackU8<0>(_v, _v); - __m512i v1 = UnpackU8<1>(_v, _v); - YuvToBgr(Load(y0 + 0, tails[1]), u0, v0, bgr0 + 0 * A, tails + 3); - YuvToBgr(Load(y0 + A, tails[2]), u1, v1, bgr0 + 3 * A, tails + 6); - YuvToBgr(Load(y1 + 0, tails[1]), u0, v0, bgr1 + 0 * A, tails + 3); - YuvToBgr(Load(y1 + A, tails[2]), u1, v1, bgr1 + 3 * A, tails + 6); - } - - template void Yuv420pToBgr(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * bgr, size_t bgrStride) - { - assert((width % 2 == 0) && (height % 2 == 0)); - if (align) - { - assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride)); - assert(Aligned(v) && Aligned(vStride) && Aligned(bgr) && Aligned(bgrStride)); - } - - width /= 2; - size_t alignedWidth = AlignLo(width, A); - size_t tail = width - alignedWidth; - __mmask64 tailMasks[9]; - tailMasks[0] = TailMask64(tail); - for (size_t i = 0; i < 2; ++i) - tailMasks[1 + i] = TailMask64(tail * 2 - A * i); - for (size_t i = 0; i < 6; ++i) - tailMasks[3 + i] = TailMask64(tail * 6 - A * i); 
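/* A reading of the mask setup above, kept as a sketch inferred from the
   TailMask64 calls: in 4:2:0 layout one 64-byte u/v vector covers two y
   vectors and six interleaved BGR vectors, so for the final partial block
   tailMasks[0] guards the half-width chroma load (tail bytes),
   tailMasks[1..2] the two luma loads (tail * 2 bytes), and tailMasks[3..8]
   the six BGR stores (tail * 6 bytes), one mask per 64-byte vector. */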
- for (size_t row = 0; row < height; row += 2) - { - size_t col = 0; - for (; col < alignedWidth; col += A) - Yuv420pToBgr(y + col * 2, y + yStride + col * 2, u + col, v + col, bgr + col * 6, bgr + bgrStride + col * 6, tailMasks); - if (col < width) - Yuv420pToBgr(y + col * 2, y + yStride + col * 2, u + col, v + col, bgr + col * 6, bgr + bgrStride + col * 6, tailMasks); - y += 2 * yStride; - u += uStride; - v += vStride; - bgr += 2 * bgrStride; - } - } - - void Yuv420pToBgr(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * bgr, size_t bgrStride) - { - if (Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride) - && Aligned(v) && Aligned(vStride) && Aligned(bgr) && Aligned(bgrStride)) - Yuv420pToBgr(y, yStride, u, uStride, v, vStride, width, height, bgr, bgrStride); - else - Yuv420pToBgr(y, yStride, u, uStride, v, vStride, width, height, bgr, bgrStride); - } - - template SIMD_INLINE void Yuv422pToBgr(const uint8_t * y, const uint8_t * u, const uint8_t * v, uint8_t * bgr, const __mmask64 * tails) - { - __m512i _u = _mm512_permutexvar_epi64(K64_PERMUTE_FOR_UNPACK, (Load(u, tails[0]))); - __m512i _v = _mm512_permutexvar_epi64(K64_PERMUTE_FOR_UNPACK, (Load(v, tails[0]))); - YuvToBgr(Load(y + 0, tails[1]), _mm512_unpacklo_epi8(_u, _u), _mm512_unpacklo_epi8(_v, _v), bgr + 0 * A, tails + 3); - YuvToBgr(Load(y + A, tails[2]), _mm512_unpackhi_epi8(_u, _u), _mm512_unpackhi_epi8(_v, _v), bgr + 3 * A, tails + 6); - } - - template void Yuv422pToBgr(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * bgr, size_t bgrStride) - { - assert((width % 2 == 0)); - if (align) - { - assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride)); - assert(Aligned(v) && Aligned(vStride) && Aligned(bgr) && Aligned(bgrStride)); - } - - width /= 2; - size_t alignedWidth = AlignLo(width, A); - size_t tail = width - alignedWidth; - __mmask64 tailMasks[9]; - tailMasks[0] = TailMask64(tail); - for (size_t i = 0; i < 2; ++i) - tailMasks[1 + i] = TailMask64(tail * 2 - A * i); - for (size_t i = 0; i < 6; ++i) - tailMasks[3 + i] = TailMask64(tail * 6 - A * i); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < alignedWidth; col += A) - Yuv422pToBgr(y + col * 2, u + col, v + col, bgr + col * 6, tailMasks); - if (col < width) - Yuv422pToBgr(y + col * 2, u + col, v + col, bgr + col * 6, tailMasks); - y += yStride; - u += uStride; - v += vStride; - bgr += bgrStride; - } - } - - void Yuv422pToBgr(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * bgr, size_t bgrStride) - { - if (Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride) - && Aligned(v) && Aligned(vStride) && Aligned(bgr) && Aligned(bgrStride)) - Yuv422pToBgr(y, yStride, u, uStride, v, vStride, width, height, bgr, bgrStride); - else - Yuv422pToBgr(y, yStride, u, uStride, v, vStride, width, height, bgr, bgrStride); - } - - template SIMD_INLINE void Yuv444pToBgr(const uint8_t * y, const uint8_t * u, const uint8_t * v, uint8_t * bgr, const __mmask64 * tails) - { - YuvToBgr(Load(y, tails[0]), Load(u, tails[0]), Load(v, tails[0]), bgr, tails + 1); - } - - template void Yuv444pToBgr(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * bgr, size_t 
bgrStride) - { - if (align) - { - assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride)); - assert(Aligned(v) && Aligned(vStride) && Aligned(bgr) && Aligned(bgrStride)); - } - - size_t alignedWidth = AlignLo(width, A); - size_t tail = width - alignedWidth; - __mmask64 tailMasks[4]; - tailMasks[0] = TailMask64(tail); - for (size_t i = 0; i < 3; ++i) - tailMasks[1 + i] = TailMask64(tail * 3 - A * i); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < alignedWidth; col += A) - Yuv444pToBgr(y + col, u + col, v + col, bgr + col * 3, tailMasks); - if (col < width) - Yuv444pToBgr(y + col, u + col, v + col, bgr + col * 3, tailMasks); - y += yStride; - u += uStride; - v += vStride; - bgr += bgrStride; - } - } - - void Yuv444pToBgr(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * bgr, size_t bgrStride) - { - if (Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride) - && Aligned(v) && Aligned(vStride) && Aligned(bgr) && Aligned(bgrStride)) - Yuv444pToBgr(y, yStride, u, uStride, v, vStride, width, height, bgr, bgrStride); - else - Yuv444pToBgr(y, yStride, u, uStride, v, vStride, width, height, bgr, bgrStride); - } - - //--------------------------------------------------------------------- - - template SIMD_INLINE void YuvToRgb(const __m512i& y, const __m512i& u, const __m512i& v, uint8_t* rgb, const __mmask64* tails) - { - __m512i blue = YuvToBlue(y, u); - __m512i green = YuvToGreen(y, u, v); - __m512i red = YuvToRed(y, v); - Store(rgb + 0 * A, InterleaveBgr<0>(red, green, blue), tails[0]); - Store(rgb + 1 * A, InterleaveBgr<1>(red, green, blue), tails[1]); - Store(rgb + 2 * A, InterleaveBgr<2>(red, green, blue), tails[2]); - } - - template SIMD_INLINE void Yuv420pToRgb(const uint8_t* y0, const uint8_t* y1, const uint8_t* u, const uint8_t* v, uint8_t* rgb0, uint8_t* rgb1, const __mmask64* tails) - { - __m512i _u = _mm512_permutexvar_epi64(K64_PERMUTE_FOR_UNPACK, (Load(u, tails[0]))); - __m512i u0 = UnpackU8<0>(_u, _u); - __m512i u1 = UnpackU8<1>(_u, _u); - __m512i _v = _mm512_permutexvar_epi64(K64_PERMUTE_FOR_UNPACK, (Load(v, tails[0]))); - __m512i v0 = UnpackU8<0>(_v, _v); - __m512i v1 = UnpackU8<1>(_v, _v); - YuvToRgb(Load(y0 + 0, tails[1]), u0, v0, rgb0 + 0 * A, tails + 3); - YuvToRgb(Load(y0 + A, tails[2]), u1, v1, rgb0 + 3 * A, tails + 6); - YuvToRgb(Load(y1 + 0, tails[1]), u0, v0, rgb1 + 0 * A, tails + 3); - YuvToRgb(Load(y1 + A, tails[2]), u1, v1, rgb1 + 3 * A, tails + 6); - } - - template void Yuv420pToRgb(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride, - size_t width, size_t height, uint8_t* rgb, size_t rgbStride) - { - assert((width % 2 == 0) && (height % 2 == 0)); - if (align) - { - assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride)); - assert(Aligned(v) && Aligned(vStride) && Aligned(rgb) && Aligned(rgbStride)); - } - - width /= 2; - size_t alignedWidth = AlignLo(width, A); - size_t tail = width - alignedWidth; - __mmask64 tailMasks[9]; - tailMasks[0] = TailMask64(tail); - for (size_t i = 0; i < 2; ++i) - tailMasks[1 + i] = TailMask64(tail * 2 - A * i); - for (size_t i = 0; i < 6; ++i) - tailMasks[3 + i] = TailMask64(tail * 6 - A * i); - for (size_t row = 0; row < height; row += 2) - { - size_t col = 0; - for (; col < alignedWidth; col += A) - Yuv420pToRgb(y + col * 2, y + yStride + col * 2, u + col, v + col, rgb + col * 6, rgb + rgbStride + col * 6, tailMasks); 
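/* 4:2:0 traversal, as the strides below confirm: each pass of the row loop
   consumes two luma rows against one shared chroma row and emits two RGB
   rows; the masked call just below mops up the last partial column block
   using the tail masks prepared above. */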
- if (col < width) - Yuv420pToRgb(y + col * 2, y + yStride + col * 2, u + col, v + col, rgb + col * 6, rgb + rgbStride + col * 6, tailMasks); - y += 2 * yStride; - u += uStride; - v += vStride; - rgb += 2 * rgbStride; - } - } - - void Yuv420pToRgb(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride, - size_t width, size_t height, uint8_t* rgb, size_t rgbStride) - { - if (Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride) - && Aligned(v) && Aligned(vStride) && Aligned(rgb) && Aligned(rgbStride)) - Yuv420pToRgb(y, yStride, u, uStride, v, vStride, width, height, rgb, rgbStride); - else - Yuv420pToRgb(y, yStride, u, uStride, v, vStride, width, height, rgb, rgbStride); - } - - template SIMD_INLINE void Yuv422pToRgb(const uint8_t* y, const uint8_t* u, const uint8_t* v, uint8_t* rgb, const __mmask64* tails) - { - __m512i _u = _mm512_permutexvar_epi64(K64_PERMUTE_FOR_UNPACK, (Load(u, tails[0]))); - __m512i _v = _mm512_permutexvar_epi64(K64_PERMUTE_FOR_UNPACK, (Load(v, tails[0]))); - YuvToRgb(Load(y + 0, tails[1]), _mm512_unpacklo_epi8(_u, _u), _mm512_unpacklo_epi8(_v, _v), rgb + 0 * A, tails + 3); - YuvToRgb(Load(y + A, tails[2]), _mm512_unpackhi_epi8(_u, _u), _mm512_unpackhi_epi8(_v, _v), rgb + 3 * A, tails + 6); - } - - template void Yuv422pToRgb(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride, - size_t width, size_t height, uint8_t* rgb, size_t rgbStride) - { - assert((width % 2 == 0)); - if (align) - { - assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride)); - assert(Aligned(v) && Aligned(vStride) && Aligned(rgb) && Aligned(rgbStride)); - } - - width /= 2; - size_t alignedWidth = AlignLo(width, A); - size_t tail = width - alignedWidth; - __mmask64 tailMasks[9]; - tailMasks[0] = TailMask64(tail); - for (size_t i = 0; i < 2; ++i) - tailMasks[1 + i] = TailMask64(tail * 2 - A * i); - for (size_t i = 0; i < 6; ++i) - tailMasks[3 + i] = TailMask64(tail * 6 - A * i); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < alignedWidth; col += A) - Yuv422pToRgb(y + col * 2, u + col, v + col, rgb + col * 6, tailMasks); - if (col < width) - Yuv422pToRgb(y + col * 2, u + col, v + col, rgb + col * 6, tailMasks); - y += yStride; - u += uStride; - v += vStride; - rgb += rgbStride; - } - } - - void Yuv422pToRgb(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride, - size_t width, size_t height, uint8_t* rgb, size_t rgbStride) - { - if (Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride) - && Aligned(v) && Aligned(vStride) && Aligned(rgb) && Aligned(rgbStride)) - Yuv422pToRgb(y, yStride, u, uStride, v, vStride, width, height, rgb, rgbStride); - else - Yuv422pToRgb(y, yStride, u, uStride, v, vStride, width, height, rgb, rgbStride); - } - - template SIMD_INLINE void Yuv444pToRgb(const uint8_t* y, const uint8_t* u, const uint8_t* v, uint8_t* rgb, const __mmask64* tails) - { - YuvToRgb(Load(y, tails[0]), Load(u, tails[0]), Load(v, tails[0]), rgb, tails + 1); - } - - template void Yuv444pToRgb(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride, - size_t width, size_t height, uint8_t* rgb, size_t rgbStride) - { - if (align) - { - assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride)); - assert(Aligned(v) && Aligned(vStride) && Aligned(rgb) && Aligned(rgbStride)); - } - - size_t alignedWidth = AlignLo(width, A); - size_t tail = 
width - alignedWidth; - __mmask64 tailMasks[4]; - tailMasks[0] = TailMask64(tail); - for (size_t i = 0; i < 3; ++i) - tailMasks[1 + i] = TailMask64(tail * 3 - A * i); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < alignedWidth; col += A) - Yuv444pToRgb(y + col, u + col, v + col, rgb + col * 3, tailMasks); - if (col < width) - Yuv444pToRgb(y + col, u + col, v + col, rgb + col * 3, tailMasks); - y += yStride; - u += uStride; - v += vStride; - rgb += rgbStride; - } - } - - void Yuv444pToRgb(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride, - size_t width, size_t height, uint8_t* rgb, size_t rgbStride) - { - if (Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride) - && Aligned(v) && Aligned(vStride) && Aligned(rgb) && Aligned(rgbStride)) - Yuv444pToRgb(y, yStride, u, uStride, v, vStride, width, height, rgb, rgbStride); - else - Yuv444pToRgb(y, yStride, u, uStride, v, vStride, width, height, rgb, rgbStride); - } - } -#endif// SIMD_AVX512BW_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx512bwYuvToBgra.cpp b/src/3rd/Simd/Simd/SimdAvx512bwYuvToBgra.cpp deleted file mode 100644 index 124a2103..00000000 --- a/src/3rd/Simd/Simd/SimdAvx512bwYuvToBgra.cpp +++ /dev/null @@ -1,266 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2019 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE.
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdConversion.h" - -namespace Simd -{ -#ifdef SIMD_AVX512BW_ENABLE - namespace Avx512bw - { - template SIMD_INLINE void YuvToBgra(const __m512i & y, const __m512i & u, const __m512i & v, const __m512i & a, uint8_t * bgra, const __mmask64 * tails) - { - __m512i b = _mm512_permutexvar_epi32(K32_PERMUTE_FOR_TWO_UNPACK, YuvToBlue(y, u)); - __m512i g = _mm512_permutexvar_epi32(K32_PERMUTE_FOR_TWO_UNPACK, YuvToGreen(y, u, v)); - __m512i r = _mm512_permutexvar_epi32(K32_PERMUTE_FOR_TWO_UNPACK, YuvToRed(y, v)); - __m512i bg0 = UnpackU8<0>(b, g); - __m512i bg1 = UnpackU8<1>(b, g); - __m512i ra0 = UnpackU8<0>(r, a); - __m512i ra1 = UnpackU8<1>(r, a); - Store(bgra + 0 * A, UnpackU16<0>(bg0, ra0), tails[0]); - Store(bgra + 1 * A, UnpackU16<1>(bg0, ra0), tails[1]); - Store(bgra + 2 * A, UnpackU16<0>(bg1, ra1), tails[2]); - Store(bgra + 3 * A, UnpackU16<1>(bg1, ra1), tails[3]); - } - - template SIMD_INLINE void Yuva420pToBgra(const uint8_t * y0, const uint8_t * y1, const uint8_t * u, const uint8_t * v, - const uint8_t * a0, const uint8_t * a1, uint8_t * bgra0, uint8_t * bgra1, const __mmask64 * tails) - { - __m512i _u = _mm512_permutexvar_epi64(K64_PERMUTE_FOR_UNPACK, (Load(u, tails[0]))); - __m512i u0 = UnpackU8<0>(_u, _u); - __m512i u1 = UnpackU8<1>(_u, _u); - __m512i _v = _mm512_permutexvar_epi64(K64_PERMUTE_FOR_UNPACK, (Load(v, tails[0]))); - __m512i v0 = UnpackU8<0>(_v, _v); - __m512i v1 = UnpackU8<1>(_v, _v); - YuvToBgra(Load(y0 + 0, tails[1]), u0, v0, _mm512_permutexvar_epi32(K32_PERMUTE_FOR_TWO_UNPACK, (Load(a0 + 0, tails[1]))), bgra0 + 00, tails + 3); - YuvToBgra(Load(y0 + A, tails[2]), u1, v1, _mm512_permutexvar_epi32(K32_PERMUTE_FOR_TWO_UNPACK, (Load(a0 + A, tails[2]))), bgra0 + QA, tails + 7); - YuvToBgra(Load(y1 + 0, tails[1]), u0, v0, _mm512_permutexvar_epi32(K32_PERMUTE_FOR_TWO_UNPACK, (Load(a1 + 0, tails[1]))), bgra1 + 00, tails + 3); - YuvToBgra(Load(y1 + A, tails[2]), u1, v1, _mm512_permutexvar_epi32(K32_PERMUTE_FOR_TWO_UNPACK, (Load(a1 + A, tails[2]))), bgra1 + QA, tails + 7); - } - - template void Yuva420pToBgra(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - const uint8_t * a, size_t aStride, size_t width, size_t height, uint8_t * bgra, size_t bgraStride) - { - assert((width % 2 == 0) && (height % 2 == 0)); - if (align) - { - assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride) && Aligned(v) && Aligned(vStride)); - assert(Aligned(a) && Aligned(aStride) && Aligned(bgra) && Aligned(bgraStride)); - } - - width /= 2; - size_t alignedWidth = AlignLo(width, A); - size_t tail = width - alignedWidth; - __mmask64 tailMasks[11]; - tailMasks[0] = TailMask64(tail); - for (size_t i = 0; i < 2; ++i) - tailMasks[1 + i] = TailMask64(tail * 2 - A * i); - for (size_t i = 0; i < 8; ++i) - tailMasks[3 + i] = TailMask64(tail * 8 - A * i); - for (size_t row = 0; row < height; row += 2) - { - size_t col = 0; - for (; col < alignedWidth; col += A) - Yuva420pToBgra(y + col * 2, y + yStride + col * 2, u + col, v + col, a + col * 2, a + aStride + col * 2, - bgra + col * 8, bgra + bgraStride + col * 8, tailMasks); - if (col < width) - Yuva420pToBgra(y + col * 2, y + yStride + col * 2, u + col, v + col, a + col * 2, a + aStride + col * 2, - bgra + col * 8, bgra + bgraStride + col * 8, tailMasks); - y += 2 * yStride; - u += uStride; - v += vStride; - a += 2 * aStride; - bgra += 2 * bgraStride; - } - } - - void Yuva420pToBgra(const uint8_t * y, size_t 
yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - const uint8_t * a, size_t aStride, size_t width, size_t height, uint8_t * bgra, size_t bgraStride) - { - if (Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride) && Aligned(v) && Aligned(vStride) - && Aligned(a) && Aligned(aStride) && Aligned(bgra) && Aligned(bgraStride)) - Yuva420pToBgra(y, yStride, u, uStride, v, vStride, a, aStride, width, height, bgra, bgraStride); - else - Yuva420pToBgra(y, yStride, u, uStride, v, vStride, a, aStride, width, height, bgra, bgraStride); - } - - template SIMD_INLINE void Yuv420pToBgra(const uint8_t * y0, const uint8_t * y1, const uint8_t * u, const uint8_t * v, - const __m512i & a, uint8_t * bgra0, uint8_t * bgra1, const __mmask64 * tails) - { - __m512i _u = _mm512_permutexvar_epi64(K64_PERMUTE_FOR_UNPACK, (Load(u, tails[0]))); - __m512i u0 = UnpackU8<0>(_u, _u); - __m512i u1 = UnpackU8<1>(_u, _u); - __m512i _v = _mm512_permutexvar_epi64(K64_PERMUTE_FOR_UNPACK, (Load(v, tails[0]))); - __m512i v0 = UnpackU8<0>(_v, _v); - __m512i v1 = UnpackU8<1>(_v, _v); - YuvToBgra(Load(y0 + 0, tails[1]), u0, v0, a, bgra0 + 00, tails + 3); - YuvToBgra(Load(y0 + A, tails[2]), u1, v1, a, bgra0 + QA, tails + 7); - YuvToBgra(Load(y1 + 0, tails[1]), u0, v0, a, bgra1 + 00, tails + 3); - YuvToBgra(Load(y1 + A, tails[2]), u1, v1, a, bgra1 + QA, tails + 7); - } - - template void Yuv420pToBgra(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * bgra, size_t bgraStride, uint8_t alpha) - { - assert((width % 2 == 0) && (height % 2 == 0)); - if (align) - { - assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride)); - assert(Aligned(v) && Aligned(vStride) && Aligned(bgra) && Aligned(bgraStride)); - } - - __m512i a = _mm512_set1_epi8(alpha); - width /= 2; - size_t alignedWidth = AlignLo(width, A); - size_t tail = width - alignedWidth; - __mmask64 tailMasks[11]; - tailMasks[0] = TailMask64(tail); - for (size_t i = 0; i < 2; ++i) - tailMasks[1 + i] = TailMask64(tail * 2 - A * i); - for (size_t i = 0; i < 8; ++i) - tailMasks[3 + i] = TailMask64(tail * 8 - A * i); - for (size_t row = 0; row < height; row += 2) - { - size_t col = 0; - for (; col < alignedWidth; col += A) - Yuv420pToBgra(y + col * 2, y + yStride + col * 2, u + col, v + col, a, bgra + col * 8, bgra + bgraStride + col * 8, tailMasks); - if (col < width) - Yuv420pToBgra(y + col * 2, y + yStride + col * 2, u + col, v + col, a, bgra + col * 8, bgra + bgraStride + col * 8, tailMasks); - y += 2 * yStride; - u += uStride; - v += vStride; - bgra += 2 * bgraStride; - } - } - - void Yuv420pToBgra(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * bgra, size_t bgraStride, uint8_t alpha) - { - if (Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride) - && Aligned(v) && Aligned(vStride) && Aligned(bgra) && Aligned(bgraStride)) - Yuv420pToBgra(y, yStride, u, uStride, v, vStride, width, height, bgra, bgraStride, alpha); - else - Yuv420pToBgra(y, yStride, u, uStride, v, vStride, width, height, bgra, bgraStride, alpha); - } - - template SIMD_INLINE void Yuv422pToBgra(const uint8_t * y, const uint8_t * u, const uint8_t * v, const __m512i & a, uint8_t * bgra, const __mmask64 * tails) - { - __m512i _u = _mm512_permutexvar_epi64(K64_PERMUTE_FOR_UNPACK, (Load(u, tails[0]))); - __m512i _v = _mm512_permutexvar_epi64(K64_PERMUTE_FOR_UNPACK, 
(Load(v, tails[0]))); - YuvToBgra(Load(y + 0, tails[1]), UnpackU8<0>(_u, _u), UnpackU8<0>(_v, _v), a, bgra + 00, tails + 3); - YuvToBgra(Load(y + A, tails[2]), UnpackU8<1>(_u, _u), UnpackU8<1>(_v, _v), a, bgra + QA, tails + 7); - } - - template void Yuv422pToBgra(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * bgra, size_t bgraStride, uint8_t alpha) - { - assert((width % 2 == 0)); - if (align) - { - assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride)); - assert(Aligned(v) && Aligned(vStride) && Aligned(bgra) && Aligned(bgraStride)); - } - - __m512i a = _mm512_set1_epi8(alpha); - width /= 2; - size_t alignedWidth = AlignLo(width, A); - size_t tail = width - alignedWidth; - __mmask64 tailMasks[11]; - tailMasks[0] = TailMask64(tail); - for (size_t i = 0; i < 2; ++i) - tailMasks[1 + i] = TailMask64(tail * 2 - A * i); - for (size_t i = 0; i < 8; ++i) - tailMasks[3 + i] = TailMask64(tail * 8 - A * i); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < alignedWidth; col += A) - Yuv422pToBgra(y + col * 2, u + col, v + col, a, bgra + col * 8, tailMasks); - if (col < width) - Yuv422pToBgra(y + col * 2, u + col, v + col, a, bgra + col * 8, tailMasks); - y += yStride; - u += uStride; - v += vStride; - bgra += bgraStride; - } - } - - void Yuv422pToBgra(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * bgra, size_t bgraStride, uint8_t alpha) - { - if (Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride) - && Aligned(v) && Aligned(vStride) && Aligned(bgra) && Aligned(bgraStride)) - Yuv422pToBgra(y, yStride, u, uStride, v, vStride, width, height, bgra, bgraStride, alpha); - else - Yuv422pToBgra(y, yStride, u, uStride, v, vStride, width, height, bgra, bgraStride, alpha); - } - - template SIMD_INLINE void Yuv444pToBgra(const uint8_t * y, const uint8_t * u, const uint8_t * v, const __m512i & a, uint8_t * bgra, const __mmask64 * tails) - { - YuvToBgra(Load(y, tails[0]), Load(u, tails[0]), Load(v, tails[0]), a, bgra, tails + 1); - } - - template void Yuv444pToBgra(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * bgra, size_t bgraStride, uint8_t alpha) - { - if (align) - { - assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride)); - assert(Aligned(v) && Aligned(vStride) && Aligned(bgra) && Aligned(bgraStride)); - } - - __m512i a = _mm512_set1_epi8(alpha); - size_t alignedWidth = AlignLo(width, A); - size_t tail = width - alignedWidth; - __mmask64 tailMasks[5]; - tailMasks[0] = TailMask64(tail); - for (size_t i = 0; i < 4; ++i) - tailMasks[1 + i] = TailMask64(tail * 4 - A * i); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < alignedWidth; col += A) - Yuv444pToBgra(y + col, u + col, v + col, a, bgra + col * 4, tailMasks); - if (col < width) - Yuv444pToBgra(y + col, u + col, v + col, a, bgra + col * 4, tailMasks); - y += yStride; - u += uStride; - v += vStride; - bgra += bgraStride; - } - } - - void Yuv444pToBgra(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * bgra, size_t bgraStride, uint8_t alpha) - { - if (Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride) - && Aligned(v) && Aligned(vStride) && 
- template <bool align, bool mask> SIMD_INLINE void Yuv444pToBgra(const uint8_t * y, const uint8_t * u, const uint8_t * v, const __m512i & a, uint8_t * bgra, const __mmask64 * tails) - { - YuvToBgra<align, mask>(Load<align, mask>(y, tails[0]), Load<align, mask>(u, tails[0]), Load<align, mask>(v, tails[0]), a, bgra, tails + 1); - } - - template <bool align> void Yuv444pToBgra(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * bgra, size_t bgraStride, uint8_t alpha) - { - if (align) - { - assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride)); - assert(Aligned(v) && Aligned(vStride) && Aligned(bgra) && Aligned(bgraStride)); - } - - __m512i a = _mm512_set1_epi8(alpha); - size_t alignedWidth = AlignLo(width, A); - size_t tail = width - alignedWidth; - __mmask64 tailMasks[5]; - tailMasks[0] = TailMask64(tail); - for (size_t i = 0; i < 4; ++i) - tailMasks[1 + i] = TailMask64(tail * 4 - A * i); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < alignedWidth; col += A) - Yuv444pToBgra<align, false>(y + col, u + col, v + col, a, bgra + col * 4, tailMasks); - if (col < width) - Yuv444pToBgra<align, true>(y + col, u + col, v + col, a, bgra + col * 4, tailMasks); - y += yStride; - u += uStride; - v += vStride; - bgra += bgraStride; - } - } - - void Yuv444pToBgra(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * bgra, size_t bgraStride, uint8_t alpha) - { - if (Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride) - && Aligned(v) && Aligned(vStride) && Aligned(bgra) && Aligned(bgraStride)) - Yuv444pToBgra<true>(y, yStride, u, uStride, v, vStride, width, height, bgra, bgraStride, alpha); - else - Yuv444pToBgra<false>(y, yStride, u, uStride, v, vStride, width, height, bgra, bgraStride, alpha); - } - } -#endif// SIMD_AVX512BW_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx512bwYuvToHue.cpp b/src/3rd/Simd/Simd/SimdAvx512bwYuvToHue.cpp deleted file mode 100644 index fbfa0778..00000000 --- a/src/3rd/Simd/Simd/SimdAvx512bwYuvToHue.cpp +++ /dev/null @@ -1,171 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdStore.h" -#include "Simd/SimdMemory.h" -#include "Simd/SimdConversion.h" - -namespace Simd -{ -#ifdef SIMD_AVX512BW_ENABLE - namespace Avx512bw - { - SIMD_INLINE __m512i MulDiv32(const __m512i & dividend, const __m512i & divisor, const __m512 & KF_255_DIV_6) - { - return _mm512_cvttps_epi32(_mm512_div_ps(_mm512_mul_ps(KF_255_DIV_6, _mm512_cvtepi32_ps(dividend)), _mm512_cvtepi32_ps(divisor))); - } - - SIMD_INLINE __m512i MulDiv16(const __m512i & dividend, const __m512i & divisor, const __m512 & KF_255_DIV_6) - { - const __m512i quotientLo = MulDiv32(_mm512_unpacklo_epi16(dividend, K_ZERO), _mm512_unpacklo_epi16(divisor, K_ZERO), KF_255_DIV_6); - const __m512i quotientHi = MulDiv32(_mm512_unpackhi_epi16(dividend, K_ZERO), _mm512_unpackhi_epi16(divisor, K_ZERO), KF_255_DIV_6); - return _mm512_packs_epi32(quotientLo, quotientHi); - } - - SIMD_INLINE __m512i AdjustedYuvToHue16(const __m512i & y, const __m512i & u, const __m512i & v, const __m512 & KF_255_DIV_6) - { - const __m512i red = AdjustedYuvToRed16(y, v); - const __m512i green = AdjustedYuvToGreen16(y, u, v); - const __m512i blue = AdjustedYuvToBlue16(y, u); - const __m512i max = MaxI16(red, green, blue); - const __m512i range = _mm512_subs_epi16(max, MinI16(red, green, blue)); - - const __mmask32 redMaxMask = _mm512_cmpeq_epi16_mask(red, max); - const __mmask32 greenMaxMask = (~redMaxMask)&_mm512_cmpeq_epi16_mask(green, max); - const __mmask32 blueMaxMask = ~(redMaxMask | greenMaxMask); - - __m512i dividend = _mm512_maskz_add_epi16(redMaxMask, _mm512_sub_epi16(green, blue), _mm512_mullo_epi16(range, K16_0006)); - dividend = _mm512_mask_add_epi16(dividend, greenMaxMask, _mm512_sub_epi16(blue, red), _mm512_mullo_epi16(range, K16_0002)); - dividend = _mm512_mask_add_epi16(dividend, blueMaxMask, _mm512_sub_epi16(red, green), _mm512_mullo_epi16(range, K16_0004)); - - return _mm512_and_si512(MulDiv16(dividend, range, KF_255_DIV_6), _mm512_maskz_set1_epi16(_mm512_cmpneq_epi16_mask(range, K_ZERO), 0xFF)); - }
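AdjustedYuvToHue16 reduces hue to integer arithmetic: pick the sextant by the largest of R/G/B, bias the numerator by 6, 2, or 4 times the range so it stays non-negative, scale by 255/6 with truncation, and force grey pixels (range == 0) to hue 0 via the final mask. The same arithmetic in scalar form, as a sketch (RgbToHue is a name introduced here, not the library's Base helper):

    #include <algorithm>
    #include <cstdint>

    static uint8_t RgbToHue(int r, int g, int b)
    {
        int max = std::max(r, std::max(g, b));
        int range = max - std::min(r, std::min(g, b));
        if (range == 0)
            return 0;                          // grey: hue defined as 0
        int dividend;
        if (max == r)      dividend = g - b + 6 * range;   // red-max sextants
        else if (max == g) dividend = b - r + 2 * range;   // green-max sextants
        else               dividend = r - g + 4 * range;   // blue-max sextants
        // Truncating divide, then the uint8_t cast wraps modulo 256,
        // matching the final _mm512_and_si512 with 0xFF above.
        return (uint8_t)(int)(255.0f / 6.0f * dividend / range);
    }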
- - template <bool align, bool mask> SIMD_INLINE void YuvToHue(const __m512i & y, const __m512i & u, const __m512i & v, const __m512 & KF_255_DIV_6, uint8_t * hue, __mmask64 tail) - { - __m512i lo = AdjustedYuvToHue16(AdjustY16(UnpackU8<0>(y)), AdjustUV16(UnpackU8<0>(u)), AdjustUV16(UnpackU8<0>(v)), KF_255_DIV_6); - __m512i hi = AdjustedYuvToHue16(AdjustY16(UnpackU8<1>(y)), AdjustUV16(UnpackU8<1>(u)), AdjustUV16(UnpackU8<1>(v)), KF_255_DIV_6); - Store<align, mask>(hue, _mm512_packus_epi16(lo, hi), tail); - } - - template <bool align, bool mask> SIMD_INLINE void Yuv420pToHue(const uint8_t * y0, const uint8_t * y1, const uint8_t * u, const uint8_t * v, - const __m512 & KF_255_DIV_6, uint8_t * hue0, uint8_t * hue1, const __mmask64 * tails) - { - __m512i _u = _mm512_permutexvar_epi64(K64_PERMUTE_FOR_UNPACK, (Load<align, mask>(u, tails[0]))); - __m512i u0 = UnpackU8<0>(_u, _u); - __m512i u1 = UnpackU8<1>(_u, _u); - __m512i _v = _mm512_permutexvar_epi64(K64_PERMUTE_FOR_UNPACK, (Load<align, mask>(v, tails[0]))); - __m512i v0 = UnpackU8<0>(_v, _v); - __m512i v1 = UnpackU8<1>(_v, _v); - YuvToHue<align, mask>(Load<align, mask>(y0 + 0, tails[1]), u0, v0, KF_255_DIV_6, hue0 + 0, tails[1]); - YuvToHue<align, mask>(Load<align, mask>(y0 + A, tails[2]), u1, v1, KF_255_DIV_6, hue0 + A, tails[2]); - YuvToHue<align, mask>(Load<align, mask>(y1 + 0, tails[1]), u0, v0, KF_255_DIV_6, hue1 + 0, tails[1]); - YuvToHue<align, mask>(Load<align, mask>(y1 + A, tails[2]), u1, v1, KF_255_DIV_6, hue1 + A, tails[2]); - } - - template <bool align> void Yuv420pToHue(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * hue, size_t hueStride) - { - assert((width % 2 == 0) && (height % 2 == 0)); - if (align) - { - assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride)); - assert(Aligned(v) && Aligned(vStride) && Aligned(hue) && Aligned(hueStride)); - } - - const __m512 KF_255_DIV_6 = _mm512_set1_ps(Base::KF_255_DIV_6); - - width /= 2; - size_t alignedWidth = AlignLo(width, A); - size_t tail = width - alignedWidth; - __mmask64 tailMasks[3]; - tailMasks[0] = TailMask64(tail); - for (size_t i = 0; i < 2; ++i) - tailMasks[1 + i] = TailMask64(tail * 2 - A * i); - for (size_t row = 0; row < height; row += 2) - { - size_t col = 0; - for (; col < alignedWidth; col += A) - Yuv420pToHue<align, false>(y + col * 2, y + yStride + col * 2, u + col, v + col, KF_255_DIV_6, hue + col * 2, hue + hueStride + col * 2, tailMasks); - if (col < width) - Yuv420pToHue<align, true>(y + col * 2, y + yStride + col * 2, u + col, v + col, KF_255_DIV_6, hue + col * 2, hue + hueStride + col * 2, tailMasks); - y += 2 * yStride; - u += uStride; - v += vStride; - hue += 2 * hueStride; - } - } - - void Yuv420pToHue(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, size_t width, size_t height, uint8_t * hue, size_t hueStride) - { - if (Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride) && Aligned(v) && Aligned(vStride) && Aligned(hue) && Aligned(hueStride)) - Yuv420pToHue<true>(y, yStride, u, uStride, v, vStride, width, height, hue, hueStride); - else - Yuv420pToHue<false>(y, yStride, u, uStride, v, vStride, width, height, hue, hueStride); - }
- - template <bool align, bool mask> SIMD_INLINE void Yuv444pToHue(const uint8_t * y, const uint8_t * u, const uint8_t * v, const __m512 & KF_255_DIV_6, uint8_t * hue, __mmask64 tail = -1) - { - YuvToHue<align, mask>(Load<align, mask>(y, tail), Load<align, mask>(u, tail), Load<align, mask>(v, tail), KF_255_DIV_6, hue, tail); - } - - template <bool align> void Yuv444pToHue(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * hue, size_t hueStride) - { - assert(width >= A); - if (align) - { - assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride)); - assert(Aligned(v) && Aligned(vStride) && Aligned(hue) && Aligned(hueStride)); - } - - const __m512 KF_255_DIV_6 = _mm512_set1_ps(Base::KF_255_DIV_6); - - size_t alignedWidth = AlignLo(width, A); - size_t tail = width - alignedWidth; - __mmask64 tailMask = TailMask64(tail); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < alignedWidth; col += A) - Yuv444pToHue<align, false>(y + col, u + col, v + col, KF_255_DIV_6, hue + col); - if (col < width) - Yuv444pToHue<align, true>(y + col, u + col, v + col, KF_255_DIV_6, hue + col, tailMask); - y += yStride; - u += uStride; - v += vStride; - hue += hueStride; - } - } - - void Yuv444pToHue(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * hue, size_t hueStride) - { - if (Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride) && Aligned(v) && Aligned(vStride) && Aligned(hue) && Aligned(hueStride)) - Yuv444pToHue<true>(y, yStride, u, uStride, v, vStride, width, height, hue, hueStride); - else - Yuv444pToHue<false>(y, yStride, u, uStride, v, vStride, width, height, hue, hueStride); - } - } -#endif// SIMD_AVX512BW_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx512f.h b/src/3rd/Simd/Simd/SimdAvx512f.h deleted file mode 100644 index f98276d0..00000000 --- a/src/3rd/Simd/Simd/SimdAvx512f.h +++ /dev/null @@ -1,213 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE.
-*/ -#ifndef __SimdAvx512f_h__ -#define __SimdAvx512f_h__ - -#include "Simd/SimdDefs.h" - -namespace Simd -{ -#ifdef SIMD_AVX512F_ENABLE - namespace Avx512f - { - void Fill32f(float * dst, size_t size, const float * value); - - void Gemm32fNN(size_t M, size_t N, size_t K, const float * alpha, const float * A, size_t lda, const float * B, size_t ldb, const float * beta, float * C, size_t ldc); - - void Gemm32fNT(size_t M, size_t N, size_t K, const float * alpha, const float * A, size_t lda, const float * B, size_t ldb, const float * beta, float * C, size_t ldc); - - void NeuralProductSum(const float * a, const float * b, size_t size, float * sum); - - void NeuralAddVectorMultipliedByValue(const float * src, size_t size, const float * value, float * dst); - - void NeuralAddVector(const float * src, size_t size, float * dst); - - void NeuralAddValue(const float * value, float * dst, size_t size); - - void NeuralRoughSigmoid(const float * src, size_t size, const float * slope, float * dst); - - void NeuralRoughSigmoid2(const float * src, size_t size, const float * slope, float * dst); - - void NeuralDerivativeSigmoid(const float * src, size_t size, const float * slope, float * dst); - - void NeuralRoughTanh(const float * src, size_t size, const float * slope, float * dst); - - void NeuralDerivativeTanh(const float * src, size_t size, const float * slope, float * dst); - - void NeuralDerivativeRelu(const float * src, size_t size, const float * slope, float * dst); - - void NeuralPow(const float * src, size_t size, const float * exponent, float * dst); - - void NeuralUpdateWeights(const float * x, size_t size, const float * a, const float * b, float * d, float * w); - - void NeuralAdaptiveGradientUpdate(const float * delta, size_t size, size_t batch, const float * alpha, const float * epsilon, float * gradient, float * weight); - - void NeuralAddConvolution2x2Forward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride); - - void NeuralAddConvolution3x3Forward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride); - - void NeuralAddConvolution4x4Forward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride); - - void NeuralAddConvolution5x5Forward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride); - - void NeuralAddConvolution2x2Backward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride); - - void NeuralAddConvolution3x3Backward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride); - - void NeuralAddConvolution4x4Backward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride); - - void NeuralAddConvolution5x5Backward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride); - - void NeuralAddConvolution2x2Sum(const float * src, size_t srcStride, const float * dst, size_t dstStride, size_t width, size_t height, float * sums); - - void NeuralAddConvolution3x3Sum(const float * src, size_t srcStride, const float * dst, size_t dstStride, size_t width, size_t height, float * sums); - - void NeuralAddConvolution4x4Sum(const float * src, size_t srcStride, const float * dst, size_t 
dstStride, size_t width, size_t height, float * sums); - - void NeuralAddConvolution5x5Sum(const float * src, size_t srcStride, const float * dst, size_t dstStride, size_t width, size_t height, float * sums); - - void NeuralPooling1x1Max3x3(const float * src, size_t srcStride, size_t width, size_t height, float * dst, size_t dstStride); - - void NeuralPooling2x2Max2x2(const float * src, size_t srcStride, size_t width, size_t height, float * dst, size_t dstStride); - - void NeuralPooling2x2Max3x3(const float * src, size_t srcStride, size_t width, size_t height, float * dst, size_t dstStride); - - void NeuralConvolutionForward(const float * src, size_t srcWidth, size_t srcHeight, size_t srcDepth, const float * weight, - size_t kernelX, size_t kernelY, size_t padX, size_t padY, size_t strideX, size_t strideY, size_t dilationX, size_t dilationY, - void * buffer, size_t * size, float * dst, size_t dstWidth, size_t dstHeight, size_t dstDepth, int add); - - void SquaredDifferenceSum32f(const float * a, const float * b, size_t size, float * sum); - - void SquaredDifferenceKahanSum32f(const float * a, const float * b, size_t size, float * sum); - - void SvmSumLinear(const float * x, const float * svs, const float * weights, size_t length, size_t count, float * sum); - - void SynetAddBias(const float * bias, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format); - - void SynetEltwiseLayerForward(float const * const * src, const float * weight, size_t count, size_t size, SimdSynetEltwiseOperationType type, float * dst); - - void SynetElu32f(const float * src, size_t size, const float * alpha, float * dst); - - void SynetFusedLayerForward0(const float * src, const float * bias, const float * scale, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format); - - void SynetFusedLayerForward1(const float * src, const float * bias0, const float * scale1, const float * bias1, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format); - - void SynetFusedLayerForward2(const float * src, const float * scale, const float * bias, size_t channels, size_t spatial, const float * slope, float * dst, SimdTensorFormatType format); - - void SynetFusedLayerForward3(const float * src, const float * bias, const float * scale, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format); - - void SynetFusedLayerForward4(const float * src, const float * bias0, const float * scale1, const float * bias1, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format); - - void SynetFusedLayerForward8(const float * src0, const float * src1, const float * src2, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format); - - void SynetFusedLayerForward9(const float * src0, const float * src1, const float * scale, const float * bias, size_t channels0, size_t channels1, size_t spatial, float * dst0, float * dst1, SimdTensorFormatType format); - - void SynetHswish32f(const float * src, size_t size, const float * shift, const float * scale, float * dst); - - void SynetInnerProductLayerForward(const float * src, const float * weight, const float * bias, size_t count, size_t size, float * dst); - - void SynetLrnLayerCrossChannels(const float * src, size_t half, size_t channels, size_t spatial, const float * k, float * dst, SimdTensorFormatType format); - - void SynetPoolingForwardAverage(const float* src, size_t srcC, size_t srcH, size_t srcW, size_t kernelY, size_t kernelX, - size_t strideY, size_t strideX, size_t padY, size_t padX, float* 
dst, size_t dstH, size_t dstW, SimdBool excludePad, SimdTensorFormatType format); - - void SynetPoolingForwardMax32f(const float * src, size_t srcC, size_t srcH, size_t srcW, size_t kernelY, size_t kernelX, - size_t strideY, size_t strideX, size_t padY, size_t padX, float * dst, size_t dstH, size_t dstW, SimdTensorFormatType format); - - void SynetPreluLayerForward(const float * src, const float * slope, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format); - - void SynetRelu32f(const float* src, size_t size, const float* slope, float* dst); - - void SynetReorderImage(size_t batch, size_t channels, size_t spatial, const float* src, SimdTensorFormatType srcFormat, float* dst, SimdTensorFormatType dstFormat); - - void SynetReorderFilter(size_t output, size_t input, size_t kernel, const float* src, SimdTensorFormatType srcFormat, float* dst, SimdTensorFormatType dstFormat); - - void SynetRestrictRange32f(const float * src, size_t size, const float * lower, const float * upper, float * dst); - - void SynetScaleLayerForward(const float* src, const float* scale, const float* bias, size_t channels, size_t height, size_t width, float* dst, SimdTensorFormatType format, SimdSynetCompatibilityType compatibility); - - void SynetShuffleLayerForward(const float* src0, const float* src1, size_t channels0, size_t channels1, size_t spatial, float* dst0, float* dst1, SimdTensorFormatType format, int type); - - void SynetSigmoid32f(const float* src, size_t size, const float* slope, float* dst); - - void SynetSoftmaxLayerForward(const float * src, size_t outer, size_t size, size_t inner, float * dst); - - void SynetSoftplus32f(const float* src, size_t size, const float* beta, const float* threshold, float* dst); - - void SynetTanh32f(const float* src, size_t size, const float* slope, float* dst); - - void SynetUnaryOperation32fLayerForward(const float* src, size_t size, SimdSynetUnaryOperation32fType type, float* dst); - - void WinogradKernel1x3Block1x4SetFilter(const float* src, size_t size, float* dst, SimdBool trans); - - void WinogradKernel1x3Block1x4SetInput(const float* src, size_t srcChannels, size_t srcHeight, size_t srcWidth, - size_t padY, size_t padX, size_t padH, size_t padW, float* dst, size_t dstStride, SimdBool trans); - - void WinogradKernel1x3Block1x4SetOutput(const float* src, size_t srcStride, float* dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans); - - void WinogradKernel1x5Block1x4SetFilter(const float* src, size_t size, float* dst, SimdBool trans); - - void WinogradKernel1x5Block1x4SetInput(const float* src, size_t srcChannels, size_t srcHeight, size_t srcWidth, - size_t padY, size_t padX, size_t padH, size_t padW, float* dst, size_t dstStride, SimdBool trans); - - void WinogradKernel1x5Block1x4SetOutput(const float* src, size_t srcStride, float* dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans); - - void WinogradKernel2x2Block2x2SetFilter(const float* src, size_t size, float* dst, SimdBool trans); - - void WinogradKernel2x2Block2x2SetInput(const float* src, size_t srcChannels, size_t srcHeight, size_t srcWidth, - size_t padY, size_t padX, size_t padH, size_t padW, float* dst, size_t dstStride, SimdBool trans); - - void WinogradKernel2x2Block2x2SetOutput(const float* src, size_t srcStride, float* dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans); - - void WinogradKernel2x2Block4x4SetFilter(const float* src, size_t size, float* dst, SimdBool trans); - - void 
WinogradKernel2x2Block4x4SetInput(const float* src, size_t srcChannels, size_t srcHeight, size_t srcWidth, - size_t padY, size_t padX, size_t padH, size_t padW, float* dst, size_t dstStride, SimdBool trans); - - void WinogradKernel2x2Block4x4SetOutput(const float* src, size_t srcStride, float* dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans); - - void WinogradKernel3x3Block2x2SetFilter(const float * src, size_t size, float * dst, SimdBool trans); - - void WinogradKernel3x3Block2x2SetInput(const float* src, size_t srcChannels, size_t srcHeight, size_t srcWidth, - size_t padY, size_t padX, size_t padH, size_t padW, float* dst, size_t dstStride, SimdBool trans); - - void WinogradKernel3x3Block2x2SetOutput(const float * src, size_t srcStride, float * dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans); - - void WinogradKernel3x3Block3x3SetFilter(const float * src, size_t size, float * dst, SimdBool trans); - - void WinogradKernel3x3Block3x3SetInput(const float* src, size_t srcChannels, size_t srcHeight, size_t srcWidth, - size_t padY, size_t padX, size_t padH, size_t padW, float* dst, size_t dstStride, SimdBool trans); - - void WinogradKernel3x3Block3x3SetOutput(const float * src, size_t srcStride, float * dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans); - - void WinogradKernel3x3Block4x4SetFilter(const float * src, size_t size, float * dst, SimdBool trans); - - void WinogradKernel3x3Block4x4SetInput(const float* src, size_t srcChannels, size_t srcHeight, size_t srcWidth, - size_t padY, size_t padX, size_t padH, size_t padW, float* dst, size_t dstStride, SimdBool trans); - - void WinogradKernel3x3Block4x4SetOutput(const float * src, size_t srcStride, float * dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans); - } -#endif// SIMD_AVX512F_ENABLE -} -#endif//__SimdAvx512f_h__ diff --git a/src/3rd/Simd/Simd/SimdAvx512fFill.cpp b/src/3rd/Simd/Simd/SimdAvx512fFill.cpp deleted file mode 100644 index c7c22961..00000000 --- a/src/3rd/Simd/Simd/SimdAvx512fFill.cpp +++ /dev/null @@ -1,62 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2019 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" - -namespace Simd -{ -#ifdef SIMD_AVX512F_ENABLE - namespace Avx512f - { - void Fill32f(float * dst, size_t size, const float * value) - { - if (value == 0 || value[0] == 0) - memset(dst, 0, size * sizeof(float)); - else - { - float v = value[0]; - const float * nose = (float*)AlignHi(dst, F * sizeof(float)); - for (; dst < nose && size; --size) - *dst++ = v; - const float * end = dst + size; - const float * endF = dst + AlignLo(size, F); - const float * endQF = dst + AlignLo(size, QF); - size_t i = 0; - __m512 _v = _mm512_set1_ps(v); - for (; dst < endQF; dst += QF) - { - _mm512_storeu_ps(dst + 0 * F, _v); - _mm512_storeu_ps(dst + 1 * F, _v); - _mm512_storeu_ps(dst + 2 * F, _v); - _mm512_storeu_ps(dst + 3 * F, _v); - } - for (; dst < endF; dst += F) - _mm512_storeu_ps(dst, _v); - for (; dst < end;) - *dst++ = v; - } - } - } -#endif// SIMD_AVX512F_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx512fGemm32f.cpp b/src/3rd/Simd/Simd/SimdAvx512fGemm32f.cpp deleted file mode 100644 index 8e6f8d0d..00000000 --- a/src/3rd/Simd/Simd/SimdAvx512fGemm32f.cpp +++ /dev/null @@ -1,3256 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdStore.h" -#include "Simd/SimdExtract.h" -#include "Simd/SimdGemm.h" -#include "Simd/SimdAvx2.h" -#include "Simd/SimdCpu.h" - -namespace Simd -{ -#ifdef SIMD_AVX512F_ENABLE - namespace Avx512f - { - SIMD_INLINE void AddProduct(float * ptr, __m512 value, __m512 alpha) - { - _mm512_storeu_ps(ptr, _mm512_fmadd_ps(value, alpha, _mm512_loadu_ps(ptr))); - } - - SIMD_INLINE void AddProduct(float * ptr, __m512 value, __m512 alpha, __mmask16 mask) - { - _mm512_mask_storeu_ps(ptr, mask, _mm512_fmadd_ps(value, alpha, _mm512_maskz_loadu_ps(mask, ptr))); - } - - void GemmKernel4x48nn(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, __mmask16 mask) - { - __m512 c00 = _mm512_setzero_ps(); - __m512 c10 = _mm512_setzero_ps(); - __m512 c20 = _mm512_setzero_ps(); - __m512 c30 = _mm512_setzero_ps(); - __m512 c01 = _mm512_setzero_ps(); - __m512 c11 = _mm512_setzero_ps(); - __m512 c21 = _mm512_setzero_ps(); - __m512 c31 = _mm512_setzero_ps(); - __m512 c02 = _mm512_setzero_ps(); - __m512 c12 = _mm512_setzero_ps(); - __m512 c22 = _mm512_setzero_ps(); - __m512 c32 = _mm512_setzero_ps(); - const size_t oa0 = lda * 0; - const size_t oa1 = lda * 1; - const size_t oa2 = lda * 2; - const size_t oa3 = lda * 3; - const size_t sa = lda == 1 ? 4 : 1; - const size_t ob0 = ldb * 0; - const size_t ob1 = ldb * 1; - const size_t ob2 = ldb * 2; - __m512 b0, b1, b2, a0; - for (size_t k = 0; k < K; k++) - { - b0 = _mm512_loadu_ps(B + ob0); - b1 = _mm512_loadu_ps(B + ob1); - b2 = _mm512_loadu_ps(B + ob2); - a0 = _mm512_set1_ps(A[oa0]); - c00 = _mm512_fmadd_ps(a0, b0, c00); - c01 = _mm512_fmadd_ps(a0, b1, c01); - c02 = _mm512_fmadd_ps(a0, b2, c02); - a0 = _mm512_set1_ps(A[oa1]); - c10 = _mm512_fmadd_ps(a0, b0, c10); - c11 = _mm512_fmadd_ps(a0, b1, c11); - c12 = _mm512_fmadd_ps(a0, b2, c12); - a0 = _mm512_set1_ps(A[oa2]); - c20 = _mm512_fmadd_ps(a0, b0, c20); - c21 = _mm512_fmadd_ps(a0, b1, c21); - c22 = _mm512_fmadd_ps(a0, b2, c22); - a0 = _mm512_set1_ps(A[oa3]); - c30 = _mm512_fmadd_ps(a0, b0, c30); - c31 = _mm512_fmadd_ps(a0, b1, c31); - c32 = _mm512_fmadd_ps(a0, b2, c32); - B += sb; - A += sa; - } - __m512 _alpha = _mm512_set1_ps(alpha); - AddProduct(C + 0 * F, _alpha, c00); - AddProduct(C + 1 * F, _alpha, c01); - AddProduct(C + 2 * F, _alpha, c02, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c10); - AddProduct(C + 1 * F, _alpha, c11); - AddProduct(C + 2 * F, _alpha, c12, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c20); - AddProduct(C + 1 * F, _alpha, c21); - AddProduct(C + 2 * F, _alpha, c22, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c30); - AddProduct(C + 1 * F, _alpha, c31); - AddProduct(C + 2 * F, _alpha, c32, mask); - } - - void GemmKernel4x32nn(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, __mmask16 mask) - { - __m512 c00 = _mm512_setzero_ps(); - __m512 c10 = _mm512_setzero_ps(); - __m512 c20 = _mm512_setzero_ps(); - __m512 c30 = _mm512_setzero_ps(); - __m512 c01 = _mm512_setzero_ps(); - __m512 c11 = _mm512_setzero_ps(); - __m512 c21 = _mm512_setzero_ps(); - __m512 c31 = _mm512_setzero_ps(); - const size_t oa0 = lda * 0; - const size_t oa1 = lda * 1; - const size_t oa2 = lda * 2; - const size_t oa3 = lda * 3; - const size_t sa = lda == 1 ? 
4 : 1; - const size_t ob0 = ldb * 0; - const size_t ob1 = ldb * 1; - __m512 b0, b1, a0; - for (size_t k = 0; k < K; k++) - { - b0 = _mm512_loadu_ps(B + ob0); - b1 = _mm512_loadu_ps(B + ob1); - a0 = _mm512_set1_ps(A[oa0]); - c00 = _mm512_fmadd_ps(a0, b0, c00); - c01 = _mm512_fmadd_ps(a0, b1, c01); - a0 = _mm512_set1_ps(A[oa1]); - c10 = _mm512_fmadd_ps(a0, b0, c10); - c11 = _mm512_fmadd_ps(a0, b1, c11); - a0 = _mm512_set1_ps(A[oa2]); - c20 = _mm512_fmadd_ps(a0, b0, c20); - c21 = _mm512_fmadd_ps(a0, b1, c21); - a0 = _mm512_set1_ps(A[oa3]); - c30 = _mm512_fmadd_ps(a0, b0, c30); - c31 = _mm512_fmadd_ps(a0, b1, c31); - B += sb; - A += sa; - } - __m512 _alpha = _mm512_set1_ps(alpha); - AddProduct(C + 0 * F, _alpha, c00); - AddProduct(C + 1 * F, _alpha, c01, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c10); - AddProduct(C + 1 * F, _alpha, c11, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c20); - AddProduct(C + 1 * F, _alpha, c21, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c30); - AddProduct(C + 1 * F, _alpha, c31, mask); - } - - void GemmKernel4x16nn(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, __mmask16 mask) - { - __m512 c0 = _mm512_setzero_ps(); - __m512 c1 = _mm512_setzero_ps(); - __m512 c2 = _mm512_setzero_ps(); - __m512 c3 = _mm512_setzero_ps(); - const size_t oa0 = lda * 0; - const size_t oa1 = lda * 1; - const size_t oa2 = lda * 2; - const size_t oa3 = lda * 3; - const size_t sa = lda == 1 ? 4 : 1; - const size_t ob0 = ldb * 0; - __m512 b0; - for (size_t k = 0; k < K; k++) - { - b0 = _mm512_loadu_ps(B + ob0); - c0 = _mm512_fmadd_ps(b0, _mm512_set1_ps(A[oa0]), c0); - c1 = _mm512_fmadd_ps(b0, _mm512_set1_ps(A[oa1]), c1); - c2 = _mm512_fmadd_ps(b0, _mm512_set1_ps(A[oa2]), c2); - c3 = _mm512_fmadd_ps(b0, _mm512_set1_ps(A[oa3]), c3); - B += sb; - A += sa; - } - __m512 _alpha = _mm512_set1_ps(alpha); - AddProduct(C + 0 * ldc, _alpha, c0, mask); - AddProduct(C + 1 * ldc, _alpha, c1, mask); - AddProduct(C + 2 * ldc, _alpha, c2, mask); - AddProduct(C + 3 * ldc, _alpha, c3, mask); - } - - void GemmKernel6x64nn(size_t K, float alpha, const float* A, size_t lda, const float* B, size_t ldb, size_t sb, float* C, size_t ldc, __mmask16 mask) - { - __m512 c00 = _mm512_setzero_ps(); - __m512 c10 = _mm512_setzero_ps(); - __m512 c20 = _mm512_setzero_ps(); - __m512 c30 = _mm512_setzero_ps(); - __m512 c40 = _mm512_setzero_ps(); - __m512 c50 = _mm512_setzero_ps(); - __m512 c01 = _mm512_setzero_ps(); - __m512 c11 = _mm512_setzero_ps(); - __m512 c21 = _mm512_setzero_ps(); - __m512 c31 = _mm512_setzero_ps(); - __m512 c41 = _mm512_setzero_ps(); - __m512 c51 = _mm512_setzero_ps(); - __m512 c02 = _mm512_setzero_ps(); - __m512 c12 = _mm512_setzero_ps(); - __m512 c22 = _mm512_setzero_ps(); - __m512 c32 = _mm512_setzero_ps(); - __m512 c42 = _mm512_setzero_ps(); - __m512 c52 = _mm512_setzero_ps(); - __m512 c03 = _mm512_setzero_ps(); - __m512 c13 = _mm512_setzero_ps(); - __m512 c23 = _mm512_setzero_ps(); - __m512 c33 = _mm512_setzero_ps(); - __m512 c43 = _mm512_setzero_ps(); - __m512 c53 = _mm512_setzero_ps(); - const size_t oa0 = lda * 0; - const size_t oa1 = lda * 1; - const size_t oa2 = lda * 2; - const size_t oa3 = lda * 3; - const size_t oa4 = lda * 4; - const size_t oa5 = lda * 5; - const size_t sa = lda == 1 ? 
6 : 1; - const size_t ob0 = ldb * 0; - const size_t ob1 = ldb * 1; - const size_t ob2 = ldb * 2; - const size_t ob3 = ldb * 3; - __m512 b0, b1, b2, b3, a0; - for (size_t k = 0; k < K; k++) - { - b0 = _mm512_loadu_ps(B + ob0); - b1 = _mm512_loadu_ps(B + ob1); - b2 = _mm512_loadu_ps(B + ob2); - b3 = _mm512_loadu_ps(B + ob3); - a0 = _mm512_set1_ps(A[oa0]); - c00 = _mm512_fmadd_ps(a0, b0, c00); - c01 = _mm512_fmadd_ps(a0, b1, c01); - c02 = _mm512_fmadd_ps(a0, b2, c02); - c03 = _mm512_fmadd_ps(a0, b3, c03); - a0 = _mm512_set1_ps(A[oa1]); - c10 = _mm512_fmadd_ps(a0, b0, c10); - c11 = _mm512_fmadd_ps(a0, b1, c11); - c12 = _mm512_fmadd_ps(a0, b2, c12); - c13 = _mm512_fmadd_ps(a0, b3, c13); - a0 = _mm512_set1_ps(A[oa2]); - c20 = _mm512_fmadd_ps(a0, b0, c20); - c21 = _mm512_fmadd_ps(a0, b1, c21); - c22 = _mm512_fmadd_ps(a0, b2, c22); - c23 = _mm512_fmadd_ps(a0, b3, c23); - a0 = _mm512_set1_ps(A[oa3]); - c30 = _mm512_fmadd_ps(a0, b0, c30); - c31 = _mm512_fmadd_ps(a0, b1, c31); - c32 = _mm512_fmadd_ps(a0, b2, c32); - c33 = _mm512_fmadd_ps(a0, b3, c33); - a0 = _mm512_set1_ps(A[oa4]); - c40 = _mm512_fmadd_ps(a0, b0, c40); - c41 = _mm512_fmadd_ps(a0, b1, c41); - c42 = _mm512_fmadd_ps(a0, b2, c42); - c43 = _mm512_fmadd_ps(a0, b3, c43); - a0 = _mm512_set1_ps(A[oa5]); - c50 = _mm512_fmadd_ps(a0, b0, c50); - c51 = _mm512_fmadd_ps(a0, b1, c51); - c52 = _mm512_fmadd_ps(a0, b2, c52); - c53 = _mm512_fmadd_ps(a0, b3, c53); - B += sb; - A += sa; - } - __m512 _alpha = _mm512_set1_ps(alpha); - AddProduct(C + 0 * F, _alpha, c00); - AddProduct(C + 1 * F, _alpha, c01); - AddProduct(C + 2 * F, _alpha, c02); - AddProduct(C + 3 * F, _alpha, c03, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c10); - AddProduct(C + 1 * F, _alpha, c11); - AddProduct(C + 2 * F, _alpha, c12); - AddProduct(C + 3 * F, _alpha, c13, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c20); - AddProduct(C + 1 * F, _alpha, c21); - AddProduct(C + 2 * F, _alpha, c22); - AddProduct(C + 3 * F, _alpha, c23, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c30); - AddProduct(C + 1 * F, _alpha, c31); - AddProduct(C + 2 * F, _alpha, c32); - AddProduct(C + 3 * F, _alpha, c33, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c40); - AddProduct(C + 1 * F, _alpha, c41); - AddProduct(C + 2 * F, _alpha, c42); - AddProduct(C + 3 * F, _alpha, c43, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c50); - AddProduct(C + 1 * F, _alpha, c51); - AddProduct(C + 2 * F, _alpha, c52); - AddProduct(C + 3 * F, _alpha, c53, mask); - } - - void GemmKernel6x48nn(size_t K, float alpha, const float* A, size_t lda, const float* B, size_t ldb, size_t sb, float* C, size_t ldc, __mmask16 mask) - { - __m512 c00 = _mm512_setzero_ps(); - __m512 c10 = _mm512_setzero_ps(); - __m512 c20 = _mm512_setzero_ps(); - __m512 c30 = _mm512_setzero_ps(); - __m512 c40 = _mm512_setzero_ps(); - __m512 c50 = _mm512_setzero_ps(); - __m512 c01 = _mm512_setzero_ps(); - __m512 c11 = _mm512_setzero_ps(); - __m512 c21 = _mm512_setzero_ps(); - __m512 c31 = _mm512_setzero_ps(); - __m512 c41 = _mm512_setzero_ps(); - __m512 c51 = _mm512_setzero_ps(); - __m512 c02 = _mm512_setzero_ps(); - __m512 c12 = _mm512_setzero_ps(); - __m512 c22 = _mm512_setzero_ps(); - __m512 c32 = _mm512_setzero_ps(); - __m512 c42 = _mm512_setzero_ps(); - __m512 c52 = _mm512_setzero_ps(); - const size_t oa0 = lda * 0; - const size_t oa1 = lda * 1; - const size_t oa2 = lda * 2; - const size_t oa3 = lda * 3; - const size_t oa4 = lda * 4; - const size_t oa5 = lda * 5; - const size_t sa = lda == 1 ? 
6 : 1; - const size_t ob0 = ldb * 0; - const size_t ob1 = ldb * 1; - const size_t ob2 = ldb * 2; - __m512 b0, b1, b2, a0; - for (size_t k = 0; k < K; k++) - { - b0 = _mm512_loadu_ps(B + ob0); - b1 = _mm512_loadu_ps(B + ob1); - b2 = _mm512_loadu_ps(B + ob2); - a0 = _mm512_set1_ps(A[oa0]); - c00 = _mm512_fmadd_ps(a0, b0, c00); - c01 = _mm512_fmadd_ps(a0, b1, c01); - c02 = _mm512_fmadd_ps(a0, b2, c02); - a0 = _mm512_set1_ps(A[oa1]); - c10 = _mm512_fmadd_ps(a0, b0, c10); - c11 = _mm512_fmadd_ps(a0, b1, c11); - c12 = _mm512_fmadd_ps(a0, b2, c12); - a0 = _mm512_set1_ps(A[oa2]); - c20 = _mm512_fmadd_ps(a0, b0, c20); - c21 = _mm512_fmadd_ps(a0, b1, c21); - c22 = _mm512_fmadd_ps(a0, b2, c22); - a0 = _mm512_set1_ps(A[oa3]); - c30 = _mm512_fmadd_ps(a0, b0, c30); - c31 = _mm512_fmadd_ps(a0, b1, c31); - c32 = _mm512_fmadd_ps(a0, b2, c32); - a0 = _mm512_set1_ps(A[oa4]); - c40 = _mm512_fmadd_ps(a0, b0, c40); - c41 = _mm512_fmadd_ps(a0, b1, c41); - c42 = _mm512_fmadd_ps(a0, b2, c42); - a0 = _mm512_set1_ps(A[oa5]); - c50 = _mm512_fmadd_ps(a0, b0, c50); - c51 = _mm512_fmadd_ps(a0, b1, c51); - c52 = _mm512_fmadd_ps(a0, b2, c52); - B += sb; - A += sa; - } - __m512 _alpha = _mm512_set1_ps(alpha); - AddProduct(C + 0 * F, _alpha, c00); - AddProduct(C + 1 * F, _alpha, c01); - AddProduct(C + 2 * F, _alpha, c02, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c10); - AddProduct(C + 1 * F, _alpha, c11); - AddProduct(C + 2 * F, _alpha, c12, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c20); - AddProduct(C + 1 * F, _alpha, c21); - AddProduct(C + 2 * F, _alpha, c22, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c30); - AddProduct(C + 1 * F, _alpha, c31); - AddProduct(C + 2 * F, _alpha, c32, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c40); - AddProduct(C + 1 * F, _alpha, c41); - AddProduct(C + 2 * F, _alpha, c42, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c50); - AddProduct(C + 1 * F, _alpha, c51); - AddProduct(C + 2 * F, _alpha, c52, mask); - } - - void GemmKernel6x32nn(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, __mmask16 mask) - { - __m512 c00 = _mm512_setzero_ps(); - __m512 c10 = _mm512_setzero_ps(); - __m512 c20 = _mm512_setzero_ps(); - __m512 c30 = _mm512_setzero_ps(); - __m512 c40 = _mm512_setzero_ps(); - __m512 c50 = _mm512_setzero_ps(); - __m512 c01 = _mm512_setzero_ps(); - __m512 c11 = _mm512_setzero_ps(); - __m512 c21 = _mm512_setzero_ps(); - __m512 c31 = _mm512_setzero_ps(); - __m512 c41 = _mm512_setzero_ps(); - __m512 c51 = _mm512_setzero_ps(); - const size_t oa0 = lda * 0; - const size_t oa1 = lda * 1; - const size_t oa2 = lda * 2; - const size_t oa3 = lda * 3; - const size_t oa4 = lda * 4; - const size_t oa5 = lda * 5; - const size_t sa = lda == 1 ? 
6 : 1; - const size_t ob0 = ldb * 0; - const size_t ob1 = ldb * 1; - __m512 b0, b1, a0; - for (size_t k = 0; k < K; k++) - { - b0 = _mm512_loadu_ps(B + ob0); - b1 = _mm512_loadu_ps(B + ob1); - a0 = _mm512_set1_ps(A[oa0]); - c00 = _mm512_fmadd_ps(a0, b0, c00); - c01 = _mm512_fmadd_ps(a0, b1, c01); - a0 = _mm512_set1_ps(A[oa1]); - c10 = _mm512_fmadd_ps(a0, b0, c10); - c11 = _mm512_fmadd_ps(a0, b1, c11); - a0 = _mm512_set1_ps(A[oa2]); - c20 = _mm512_fmadd_ps(a0, b0, c20); - c21 = _mm512_fmadd_ps(a0, b1, c21); - a0 = _mm512_set1_ps(A[oa3]); - c30 = _mm512_fmadd_ps(a0, b0, c30); - c31 = _mm512_fmadd_ps(a0, b1, c31); - a0 = _mm512_set1_ps(A[oa4]); - c40 = _mm512_fmadd_ps(a0, b0, c40); - c41 = _mm512_fmadd_ps(a0, b1, c41); - a0 = _mm512_set1_ps(A[oa5]); - c50 = _mm512_fmadd_ps(a0, b0, c50); - c51 = _mm512_fmadd_ps(a0, b1, c51); - B += sb; - A += sa; - } - __m512 _alpha = _mm512_set1_ps(alpha); - AddProduct(C + 0 * F, _alpha, c00); - AddProduct(C + 1 * F, _alpha, c01, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c10); - AddProduct(C + 1 * F, _alpha, c11, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c20); - AddProduct(C + 1 * F, _alpha, c21, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c30); - AddProduct(C + 1 * F, _alpha, c31, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c40); - AddProduct(C + 1 * F, _alpha, c41, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c50); - AddProduct(C + 1 * F, _alpha, c51, mask); - } - - void GemmKernel6x16nn(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, __mmask16 mask) - { - __m512 c00 = _mm512_setzero_ps(); - __m512 c10 = _mm512_setzero_ps(); - __m512 c20 = _mm512_setzero_ps(); - __m512 c30 = _mm512_setzero_ps(); - __m512 c40 = _mm512_setzero_ps(); - __m512 c50 = _mm512_setzero_ps(); - const size_t oa0 = lda * 0; - const size_t oa1 = lda * 1; - const size_t oa2 = lda * 2; - const size_t oa3 = lda * 3; - const size_t oa4 = lda * 4; - const size_t oa5 = lda * 5; - const size_t sa = lda == 1 ? 
6 : 1; - const size_t ob0 = ldb * 0; - __m512 b0, a0; - for (size_t k = 0; k < K; k++) - { - b0 = _mm512_loadu_ps(B + ob0); - a0 = _mm512_set1_ps(A[oa0]); - c00 = _mm512_fmadd_ps(a0, b0, c00); - a0 = _mm512_set1_ps(A[oa1]); - c10 = _mm512_fmadd_ps(a0, b0, c10); - a0 = _mm512_set1_ps(A[oa2]); - c20 = _mm512_fmadd_ps(a0, b0, c20); - a0 = _mm512_set1_ps(A[oa3]); - c30 = _mm512_fmadd_ps(a0, b0, c30); - a0 = _mm512_set1_ps(A[oa4]); - c40 = _mm512_fmadd_ps(a0, b0, c40); - a0 = _mm512_set1_ps(A[oa5]); - c50 = _mm512_fmadd_ps(a0, b0, c50); - B += sb; - A += sa; - } - __m512 _alpha = _mm512_set1_ps(alpha); - AddProduct(C + 0 * F, _alpha, c00, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c10, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c20, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c30, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c40, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c50, mask); - } - - void GemmKernel8x48nn(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, __mmask16 mask) - { - __m512 c00 = _mm512_setzero_ps(); - __m512 c01 = _mm512_setzero_ps(); - __m512 c02 = _mm512_setzero_ps(); - __m512 c10 = _mm512_setzero_ps(); - __m512 c11 = _mm512_setzero_ps(); - __m512 c12 = _mm512_setzero_ps(); - __m512 c20 = _mm512_setzero_ps(); - __m512 c21 = _mm512_setzero_ps(); - __m512 c22 = _mm512_setzero_ps(); - __m512 c30 = _mm512_setzero_ps(); - __m512 c31 = _mm512_setzero_ps(); - __m512 c32 = _mm512_setzero_ps(); - __m512 c40 = _mm512_setzero_ps(); - __m512 c41 = _mm512_setzero_ps(); - __m512 c42 = _mm512_setzero_ps(); - __m512 c50 = _mm512_setzero_ps(); - __m512 c51 = _mm512_setzero_ps(); - __m512 c52 = _mm512_setzero_ps(); - __m512 c60 = _mm512_setzero_ps(); - __m512 c61 = _mm512_setzero_ps(); - __m512 c62 = _mm512_setzero_ps(); - __m512 c70 = _mm512_setzero_ps(); - __m512 c71 = _mm512_setzero_ps(); - __m512 c72 = _mm512_setzero_ps(); - const size_t oa0 = lda * 0; - const size_t oa1 = lda * 1; - const size_t oa2 = lda * 2; - const size_t oa3 = lda * 3; - const size_t oa4 = lda * 4; - const size_t oa5 = lda * 5; - const size_t oa6 = lda * 6; - const size_t oa7 = lda * 7; - const size_t sa = lda == 1 ? 
8 : 1; - const size_t ob0 = ldb * 0; - const size_t ob1 = ldb * 1; - const size_t ob2 = ldb * 2; - __m512 b0, b1, b2, a0; - for (size_t k = 0; k < K; k++) - { - b0 = _mm512_loadu_ps(B + ob0); - b1 = _mm512_loadu_ps(B + ob1); - b2 = _mm512_loadu_ps(B + ob2); - a0 = _mm512_set1_ps(A[oa0]); - c00 = _mm512_fmadd_ps(a0, b0, c00); - c01 = _mm512_fmadd_ps(a0, b1, c01); - c02 = _mm512_fmadd_ps(a0, b2, c02); - a0 = _mm512_set1_ps(A[oa1]); - c10 = _mm512_fmadd_ps(a0, b0, c10); - c11 = _mm512_fmadd_ps(a0, b1, c11); - c12 = _mm512_fmadd_ps(a0, b2, c12); - a0 = _mm512_set1_ps(A[oa2]); - c20 = _mm512_fmadd_ps(a0, b0, c20); - c21 = _mm512_fmadd_ps(a0, b1, c21); - c22 = _mm512_fmadd_ps(a0, b2, c22); - a0 = _mm512_set1_ps(A[oa3]); - c30 = _mm512_fmadd_ps(a0, b0, c30); - c31 = _mm512_fmadd_ps(a0, b1, c31); - c32 = _mm512_fmadd_ps(a0, b2, c32); - a0 = _mm512_set1_ps(A[oa4]); - c40 = _mm512_fmadd_ps(a0, b0, c40); - c41 = _mm512_fmadd_ps(a0, b1, c41); - c42 = _mm512_fmadd_ps(a0, b2, c42); - a0 = _mm512_set1_ps(A[oa5]); - c50 = _mm512_fmadd_ps(a0, b0, c50); - c51 = _mm512_fmadd_ps(a0, b1, c51); - c52 = _mm512_fmadd_ps(a0, b2, c52); - a0 = _mm512_set1_ps(A[oa6]); - c60 = _mm512_fmadd_ps(a0, b0, c60); - c61 = _mm512_fmadd_ps(a0, b1, c61); - c62 = _mm512_fmadd_ps(a0, b2, c62); - a0 = _mm512_set1_ps(A[oa7]); - c70 = _mm512_fmadd_ps(a0, b0, c70); - c71 = _mm512_fmadd_ps(a0, b1, c71); - c72 = _mm512_fmadd_ps(a0, b2, c72); - B += sb; - A += sa; - } - __m512 _alpha = _mm512_set1_ps(alpha); - AddProduct(C + 0 * F, _alpha, c00); - AddProduct(C + 1 * F, _alpha, c01); - AddProduct(C + 2 * F, _alpha, c02, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c10); - AddProduct(C + 1 * F, _alpha, c11); - AddProduct(C + 2 * F, _alpha, c12, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c20); - AddProduct(C + 1 * F, _alpha, c21); - AddProduct(C + 2 * F, _alpha, c22, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c30); - AddProduct(C + 1 * F, _alpha, c31); - AddProduct(C + 2 * F, _alpha, c32, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c40); - AddProduct(C + 1 * F, _alpha, c41); - AddProduct(C + 2 * F, _alpha, c42, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c50); - AddProduct(C + 1 * F, _alpha, c51); - AddProduct(C + 2 * F, _alpha, c52, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c60); - AddProduct(C + 1 * F, _alpha, c61); - AddProduct(C + 2 * F, _alpha, c62, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c70); - AddProduct(C + 1 * F, _alpha, c71); - AddProduct(C + 2 * F, _alpha, c72, mask); - } - - void GemmKernel8x32nn(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, __mmask16 mask) - { - __m512 c00 = _mm512_setzero_ps(); - __m512 c01 = _mm512_setzero_ps(); - __m512 c10 = _mm512_setzero_ps(); - __m512 c11 = _mm512_setzero_ps(); - __m512 c20 = _mm512_setzero_ps(); - __m512 c21 = _mm512_setzero_ps(); - __m512 c30 = _mm512_setzero_ps(); - __m512 c31 = _mm512_setzero_ps(); - __m512 c40 = _mm512_setzero_ps(); - __m512 c41 = _mm512_setzero_ps(); - __m512 c50 = _mm512_setzero_ps(); - __m512 c51 = _mm512_setzero_ps(); - __m512 c60 = _mm512_setzero_ps(); - __m512 c61 = _mm512_setzero_ps(); - __m512 c70 = _mm512_setzero_ps(); - __m512 c71 = _mm512_setzero_ps(); - const size_t oa0 = lda * 0; - const size_t oa1 = lda * 1; - const size_t oa2 = lda * 2; - const size_t oa3 = lda * 3; - const size_t oa4 = lda * 4; - const size_t oa5 = lda * 5; - const size_t oa6 = lda * 6; - const size_t oa7 = lda * 7; - const size_t sa = lda == 1 ? 
8 : 1; - const size_t ob0 = ldb * 0; - const size_t ob1 = ldb * 1; - __m512 b0, b1, a0; - for (size_t k = 0; k < K; k++) - { - b0 = _mm512_loadu_ps(B + ob0); - b1 = _mm512_loadu_ps(B + ob1); - a0 = _mm512_set1_ps(A[oa0]); - c00 = _mm512_fmadd_ps(a0, b0, c00); - c01 = _mm512_fmadd_ps(a0, b1, c01); - a0 = _mm512_set1_ps(A[oa1]); - c10 = _mm512_fmadd_ps(a0, b0, c10); - c11 = _mm512_fmadd_ps(a0, b1, c11); - a0 = _mm512_set1_ps(A[oa2]); - c20 = _mm512_fmadd_ps(a0, b0, c20); - c21 = _mm512_fmadd_ps(a0, b1, c21); - a0 = _mm512_set1_ps(A[oa3]); - c30 = _mm512_fmadd_ps(a0, b0, c30); - c31 = _mm512_fmadd_ps(a0, b1, c31); - a0 = _mm512_set1_ps(A[oa4]); - c40 = _mm512_fmadd_ps(a0, b0, c40); - c41 = _mm512_fmadd_ps(a0, b1, c41); - a0 = _mm512_set1_ps(A[oa5]); - c50 = _mm512_fmadd_ps(a0, b0, c50); - c51 = _mm512_fmadd_ps(a0, b1, c51); - a0 = _mm512_set1_ps(A[oa6]); - c60 = _mm512_fmadd_ps(a0, b0, c60); - c61 = _mm512_fmadd_ps(a0, b1, c61); - a0 = _mm512_set1_ps(A[oa7]); - c70 = _mm512_fmadd_ps(a0, b0, c70); - c71 = _mm512_fmadd_ps(a0, b1, c71); - B += sb; - A += sa; - } - __m512 _alpha = _mm512_set1_ps(alpha); - AddProduct(C + 0 * F, _alpha, c00); - AddProduct(C + 1 * F, _alpha, c01, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c10); - AddProduct(C + 1 * F, _alpha, c11, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c20); - AddProduct(C + 1 * F, _alpha, c21, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c30); - AddProduct(C + 1 * F, _alpha, c31, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c40); - AddProduct(C + 1 * F, _alpha, c41, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c50); - AddProduct(C + 1 * F, _alpha, c51, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c60); - AddProduct(C + 1 * F, _alpha, c61, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c70); - AddProduct(C + 1 * F, _alpha, c71, mask); - } - - void GemmKernel8x16nn(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, __mmask16 mask) - { - __m512 c00 = _mm512_setzero_ps(); - __m512 c10 = _mm512_setzero_ps(); - __m512 c20 = _mm512_setzero_ps(); - __m512 c30 = _mm512_setzero_ps(); - __m512 c40 = _mm512_setzero_ps(); - __m512 c50 = _mm512_setzero_ps(); - __m512 c60 = _mm512_setzero_ps(); - __m512 c70 = _mm512_setzero_ps(); - const size_t oa0 = lda * 0; - const size_t oa1 = lda * 1; - const size_t oa2 = lda * 2; - const size_t oa3 = lda * 3; - const size_t oa4 = lda * 4; - const size_t oa5 = lda * 5; - const size_t oa6 = lda * 6; - const size_t oa7 = lda * 7; - const size_t sa = lda == 1 ? 
8 : 1; - const size_t ob0 = ldb * 0; - __m512 b0, a0; - for (size_t k = 0; k < K; k++) - { - b0 = _mm512_loadu_ps(B + ob0); - a0 = _mm512_set1_ps(A[oa0]); - c00 = _mm512_fmadd_ps(a0, b0, c00); - a0 = _mm512_set1_ps(A[oa1]); - c10 = _mm512_fmadd_ps(a0, b0, c10); - a0 = _mm512_set1_ps(A[oa2]); - c20 = _mm512_fmadd_ps(a0, b0, c20); - a0 = _mm512_set1_ps(A[oa3]); - c30 = _mm512_fmadd_ps(a0, b0, c30); - a0 = _mm512_set1_ps(A[oa4]); - c40 = _mm512_fmadd_ps(a0, b0, c40); - a0 = _mm512_set1_ps(A[oa5]); - c50 = _mm512_fmadd_ps(a0, b0, c50); - a0 = _mm512_set1_ps(A[oa6]); - c60 = _mm512_fmadd_ps(a0, b0, c60); - a0 = _mm512_set1_ps(A[oa7]); - c70 = _mm512_fmadd_ps(a0, b0, c70); - B += sb; - A += sa; - } - __m512 _alpha = _mm512_set1_ps(alpha); - AddProduct(C + 0 * F, _alpha, c00, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c10, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c20, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c30, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c40, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c50, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c60, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c70, mask); - } - - void GemmKernel9x48nn(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, __mmask16 mask) - { - __m512 c00 = _mm512_setzero_ps(); - __m512 c01 = _mm512_setzero_ps(); - __m512 c02 = _mm512_setzero_ps(); - __m512 c10 = _mm512_setzero_ps(); - __m512 c11 = _mm512_setzero_ps(); - __m512 c12 = _mm512_setzero_ps(); - __m512 c20 = _mm512_setzero_ps(); - __m512 c21 = _mm512_setzero_ps(); - __m512 c22 = _mm512_setzero_ps(); - __m512 c30 = _mm512_setzero_ps(); - __m512 c31 = _mm512_setzero_ps(); - __m512 c32 = _mm512_setzero_ps(); - __m512 c40 = _mm512_setzero_ps(); - __m512 c41 = _mm512_setzero_ps(); - __m512 c42 = _mm512_setzero_ps(); - __m512 c50 = _mm512_setzero_ps(); - __m512 c51 = _mm512_setzero_ps(); - __m512 c52 = _mm512_setzero_ps(); - __m512 c60 = _mm512_setzero_ps(); - __m512 c61 = _mm512_setzero_ps(); - __m512 c62 = _mm512_setzero_ps(); - __m512 c70 = _mm512_setzero_ps(); - __m512 c71 = _mm512_setzero_ps(); - __m512 c72 = _mm512_setzero_ps(); - __m512 c80 = _mm512_setzero_ps(); - __m512 c81 = _mm512_setzero_ps(); - __m512 c82 = _mm512_setzero_ps(); - const size_t oa0 = lda * 0; - const size_t oa1 = lda * 1; - const size_t oa2 = lda * 2; - const size_t oa3 = lda * 3; - const size_t oa4 = lda * 4; - const size_t oa5 = lda * 5; - const size_t oa6 = lda * 6; - const size_t oa7 = lda * 7; - const size_t oa8 = lda * 8; - const size_t sa = lda == 1 ? 
9 : 1; - const size_t ob0 = ldb * 0; - const size_t ob1 = ldb * 1; - const size_t ob2 = ldb * 2; - __m512 b0, b1, b2, a0; - for (size_t k = 0; k < K; k++) - { - b0 = _mm512_loadu_ps(B + ob0); - b1 = _mm512_loadu_ps(B + ob1); - b2 = _mm512_loadu_ps(B + ob2); - a0 = _mm512_set1_ps(A[oa0]); - c00 = _mm512_fmadd_ps(a0, b0, c00); - c01 = _mm512_fmadd_ps(a0, b1, c01); - c02 = _mm512_fmadd_ps(a0, b2, c02); - a0 = _mm512_set1_ps(A[oa1]); - c10 = _mm512_fmadd_ps(a0, b0, c10); - c11 = _mm512_fmadd_ps(a0, b1, c11); - c12 = _mm512_fmadd_ps(a0, b2, c12); - a0 = _mm512_set1_ps(A[oa2]); - c20 = _mm512_fmadd_ps(a0, b0, c20); - c21 = _mm512_fmadd_ps(a0, b1, c21); - c22 = _mm512_fmadd_ps(a0, b2, c22); - a0 = _mm512_set1_ps(A[oa3]); - c30 = _mm512_fmadd_ps(a0, b0, c30); - c31 = _mm512_fmadd_ps(a0, b1, c31); - c32 = _mm512_fmadd_ps(a0, b2, c32); - a0 = _mm512_set1_ps(A[oa4]); - c40 = _mm512_fmadd_ps(a0, b0, c40); - c41 = _mm512_fmadd_ps(a0, b1, c41); - c42 = _mm512_fmadd_ps(a0, b2, c42); - a0 = _mm512_set1_ps(A[oa5]); - c50 = _mm512_fmadd_ps(a0, b0, c50); - c51 = _mm512_fmadd_ps(a0, b1, c51); - c52 = _mm512_fmadd_ps(a0, b2, c52); - a0 = _mm512_set1_ps(A[oa6]); - c60 = _mm512_fmadd_ps(a0, b0, c60); - c61 = _mm512_fmadd_ps(a0, b1, c61); - c62 = _mm512_fmadd_ps(a0, b2, c62); - a0 = _mm512_set1_ps(A[oa7]); - c70 = _mm512_fmadd_ps(a0, b0, c70); - c71 = _mm512_fmadd_ps(a0, b1, c71); - c72 = _mm512_fmadd_ps(a0, b2, c72); - a0 = _mm512_set1_ps(A[oa8]); - c80 = _mm512_fmadd_ps(a0, b0, c80); - c81 = _mm512_fmadd_ps(a0, b1, c81); - c82 = _mm512_fmadd_ps(a0, b2, c82); - B += sb; - A += sa; - } - __m512 _alpha = _mm512_set1_ps(alpha); - AddProduct(C + 0 * F, _alpha, c00); - AddProduct(C + 1 * F, _alpha, c01); - AddProduct(C + 2 * F, _alpha, c02, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c10); - AddProduct(C + 1 * F, _alpha, c11); - AddProduct(C + 2 * F, _alpha, c12, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c20); - AddProduct(C + 1 * F, _alpha, c21); - AddProduct(C + 2 * F, _alpha, c22, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c30); - AddProduct(C + 1 * F, _alpha, c31); - AddProduct(C + 2 * F, _alpha, c32, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c40); - AddProduct(C + 1 * F, _alpha, c41); - AddProduct(C + 2 * F, _alpha, c42, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c50); - AddProduct(C + 1 * F, _alpha, c51); - AddProduct(C + 2 * F, _alpha, c52, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c60); - AddProduct(C + 1 * F, _alpha, c61); - AddProduct(C + 2 * F, _alpha, c62, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c70); - AddProduct(C + 1 * F, _alpha, c71); - AddProduct(C + 2 * F, _alpha, c72, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c80); - AddProduct(C + 1 * F, _alpha, c81); - AddProduct(C + 2 * F, _alpha, c82, mask); - } - - void GemmKernel9x32nn(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, __mmask16 mask) - { - __m512 c00 = _mm512_setzero_ps(); - __m512 c01 = _mm512_setzero_ps(); - __m512 c10 = _mm512_setzero_ps(); - __m512 c11 = _mm512_setzero_ps(); - __m512 c20 = _mm512_setzero_ps(); - __m512 c21 = _mm512_setzero_ps(); - __m512 c30 = _mm512_setzero_ps(); - __m512 c31 = _mm512_setzero_ps(); - __m512 c40 = _mm512_setzero_ps(); - __m512 c41 = _mm512_setzero_ps(); - __m512 c50 = _mm512_setzero_ps(); - __m512 c51 = _mm512_setzero_ps(); - __m512 c60 = _mm512_setzero_ps(); - __m512 c61 = _mm512_setzero_ps(); - __m512 c70 = _mm512_setzero_ps(); - __m512 c71 = _mm512_setzero_ps(); - 
__m512 c80 = _mm512_setzero_ps(); - __m512 c81 = _mm512_setzero_ps(); - const size_t oa0 = lda * 0; - const size_t oa1 = lda * 1; - const size_t oa2 = lda * 2; - const size_t oa3 = lda * 3; - const size_t oa4 = lda * 4; - const size_t oa5 = lda * 5; - const size_t oa6 = lda * 6; - const size_t oa7 = lda * 7; - const size_t oa8 = lda * 8; - const size_t sa = lda == 1 ? 9 : 1; - const size_t ob0 = ldb * 0; - const size_t ob1 = ldb * 1; - __m512 b0, b1, a0; - for (size_t k = 0; k < K; k++) - { - b0 = _mm512_loadu_ps(B + ob0); - b1 = _mm512_loadu_ps(B + ob1); - a0 = _mm512_set1_ps(A[oa0]); - c00 = _mm512_fmadd_ps(a0, b0, c00); - c01 = _mm512_fmadd_ps(a0, b1, c01); - a0 = _mm512_set1_ps(A[oa1]); - c10 = _mm512_fmadd_ps(a0, b0, c10); - c11 = _mm512_fmadd_ps(a0, b1, c11); - a0 = _mm512_set1_ps(A[oa2]); - c20 = _mm512_fmadd_ps(a0, b0, c20); - c21 = _mm512_fmadd_ps(a0, b1, c21); - a0 = _mm512_set1_ps(A[oa3]); - c30 = _mm512_fmadd_ps(a0, b0, c30); - c31 = _mm512_fmadd_ps(a0, b1, c31); - a0 = _mm512_set1_ps(A[oa4]); - c40 = _mm512_fmadd_ps(a0, b0, c40); - c41 = _mm512_fmadd_ps(a0, b1, c41); - a0 = _mm512_set1_ps(A[oa5]); - c50 = _mm512_fmadd_ps(a0, b0, c50); - c51 = _mm512_fmadd_ps(a0, b1, c51); - a0 = _mm512_set1_ps(A[oa6]); - c60 = _mm512_fmadd_ps(a0, b0, c60); - c61 = _mm512_fmadd_ps(a0, b1, c61); - a0 = _mm512_set1_ps(A[oa7]); - c70 = _mm512_fmadd_ps(a0, b0, c70); - c71 = _mm512_fmadd_ps(a0, b1, c71); - a0 = _mm512_set1_ps(A[oa8]); - c80 = _mm512_fmadd_ps(a0, b0, c80); - c81 = _mm512_fmadd_ps(a0, b1, c81); - B += sb; - A += sa; - } - __m512 _alpha = _mm512_set1_ps(alpha); - AddProduct(C + 0 * F, _alpha, c00); - AddProduct(C + 1 * F, _alpha, c01, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c10); - AddProduct(C + 1 * F, _alpha, c11, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c20); - AddProduct(C + 1 * F, _alpha, c21, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c30); - AddProduct(C + 1 * F, _alpha, c31, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c40); - AddProduct(C + 1 * F, _alpha, c41, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c50); - AddProduct(C + 1 * F, _alpha, c51, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c60); - AddProduct(C + 1 * F, _alpha, c61, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c70); - AddProduct(C + 1 * F, _alpha, c71, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c80); - AddProduct(C + 1 * F, _alpha, c81, mask); - } - - void GemmKernel9x16nn(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, __mmask16 mask) - { - __m512 c00 = _mm512_setzero_ps(); - __m512 c10 = _mm512_setzero_ps(); - __m512 c20 = _mm512_setzero_ps(); - __m512 c30 = _mm512_setzero_ps(); - __m512 c40 = _mm512_setzero_ps(); - __m512 c50 = _mm512_setzero_ps(); - __m512 c60 = _mm512_setzero_ps(); - __m512 c70 = _mm512_setzero_ps(); - __m512 c80 = _mm512_setzero_ps(); - const size_t oa0 = lda * 0; - const size_t oa1 = lda * 1; - const size_t oa2 = lda * 2; - const size_t oa3 = lda * 3; - const size_t oa4 = lda * 4; - const size_t oa5 = lda * 5; - const size_t oa6 = lda * 6; - const size_t oa7 = lda * 7; - const size_t oa8 = lda * 8; - const size_t sa = lda == 1 ? 
9 : 1; - const size_t ob0 = ldb * 0; - __m512 b0, a0; - for (size_t k = 0; k < K; k++) - { - b0 = _mm512_loadu_ps(B + ob0); - a0 = _mm512_set1_ps(A[oa0]); - c00 = _mm512_fmadd_ps(a0, b0, c00); - a0 = _mm512_set1_ps(A[oa1]); - c10 = _mm512_fmadd_ps(a0, b0, c10); - a0 = _mm512_set1_ps(A[oa2]); - c20 = _mm512_fmadd_ps(a0, b0, c20); - a0 = _mm512_set1_ps(A[oa3]); - c30 = _mm512_fmadd_ps(a0, b0, c30); - a0 = _mm512_set1_ps(A[oa4]); - c40 = _mm512_fmadd_ps(a0, b0, c40); - a0 = _mm512_set1_ps(A[oa5]); - c50 = _mm512_fmadd_ps(a0, b0, c50); - a0 = _mm512_set1_ps(A[oa6]); - c60 = _mm512_fmadd_ps(a0, b0, c60); - a0 = _mm512_set1_ps(A[oa7]); - c70 = _mm512_fmadd_ps(a0, b0, c70); - a0 = _mm512_set1_ps(A[oa8]); - c80 = _mm512_fmadd_ps(a0, b0, c80); - B += sb; - A += sa; - } - __m512 _alpha = _mm512_set1_ps(alpha); - AddProduct(C + 0 * F, _alpha, c00, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c10, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c20, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c30, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c40, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c50, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c60, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c70, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c80, mask); - } - - void GemmKernel12x32nn(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, __mmask16 mask) - { - __m512 c00 = _mm512_setzero_ps(); - __m512 c10 = _mm512_setzero_ps(); - __m512 c20 = _mm512_setzero_ps(); - __m512 c30 = _mm512_setzero_ps(); - __m512 c40 = _mm512_setzero_ps(); - __m512 c50 = _mm512_setzero_ps(); - __m512 c01 = _mm512_setzero_ps(); - __m512 c11 = _mm512_setzero_ps(); - __m512 c21 = _mm512_setzero_ps(); - __m512 c31 = _mm512_setzero_ps(); - __m512 c41 = _mm512_setzero_ps(); - __m512 c51 = _mm512_setzero_ps(); - __m512 c60 = _mm512_setzero_ps(); - __m512 c70 = _mm512_setzero_ps(); - __m512 c80 = _mm512_setzero_ps(); - __m512 c90 = _mm512_setzero_ps(); - __m512 cA0 = _mm512_setzero_ps(); - __m512 cB0 = _mm512_setzero_ps(); - __m512 c61 = _mm512_setzero_ps(); - __m512 c71 = _mm512_setzero_ps(); - __m512 c81 = _mm512_setzero_ps(); - __m512 c91 = _mm512_setzero_ps(); - __m512 cA1 = _mm512_setzero_ps(); - __m512 cB1 = _mm512_setzero_ps(); - const float * A0 = A, *A6 = A + 6 * lda; - const size_t oa0 = lda * 0; - const size_t oa1 = lda * 1; - const size_t oa2 = lda * 2; - const size_t oa3 = lda * 3; - const size_t oa4 = lda * 4; - const size_t oa5 = lda * 5; - const size_t sa = lda == 1 ? 
12 : 1; - const size_t ob0 = ldb * 0; - const size_t ob1 = ldb * 1; - __m512 b0, b1, a0; - for (size_t k = 0; k < K; k++) - { - b0 = _mm512_loadu_ps(B + ob0); - b1 = _mm512_loadu_ps(B + ob1); - a0 = _mm512_set1_ps(A0[oa0]); - c00 = _mm512_fmadd_ps(a0, b0, c00); - c01 = _mm512_fmadd_ps(a0, b1, c01); - a0 = _mm512_set1_ps(A0[oa1]); - c10 = _mm512_fmadd_ps(a0, b0, c10); - c11 = _mm512_fmadd_ps(a0, b1, c11); - a0 = _mm512_set1_ps(A0[oa2]); - c20 = _mm512_fmadd_ps(a0, b0, c20); - c21 = _mm512_fmadd_ps(a0, b1, c21); - a0 = _mm512_set1_ps(A0[oa3]); - c30 = _mm512_fmadd_ps(a0, b0, c30); - c31 = _mm512_fmadd_ps(a0, b1, c31); - a0 = _mm512_set1_ps(A0[oa4]); - c40 = _mm512_fmadd_ps(a0, b0, c40); - c41 = _mm512_fmadd_ps(a0, b1, c41); - a0 = _mm512_set1_ps(A0[oa5]); - c50 = _mm512_fmadd_ps(a0, b0, c50); - c51 = _mm512_fmadd_ps(a0, b1, c51); - a0 = _mm512_set1_ps(A6[oa0]); - c60 = _mm512_fmadd_ps(a0, b0, c60); - c61 = _mm512_fmadd_ps(a0, b1, c61); - a0 = _mm512_set1_ps(A6[oa1]); - c70 = _mm512_fmadd_ps(a0, b0, c70); - c71 = _mm512_fmadd_ps(a0, b1, c71); - a0 = _mm512_set1_ps(A6[oa2]); - c80 = _mm512_fmadd_ps(a0, b0, c80); - c81 = _mm512_fmadd_ps(a0, b1, c81); - a0 = _mm512_set1_ps(A6[oa3]); - c90 = _mm512_fmadd_ps(a0, b0, c90); - c91 = _mm512_fmadd_ps(a0, b1, c91); - a0 = _mm512_set1_ps(A6[oa4]); - cA0 = _mm512_fmadd_ps(a0, b0, cA0); - cA1 = _mm512_fmadd_ps(a0, b1, cA1); - a0 = _mm512_set1_ps(A6[oa5]); - cB0 = _mm512_fmadd_ps(a0, b0, cB0); - cB1 = _mm512_fmadd_ps(a0, b1, cB1); - B += sb; - A0 += sa; - A6 += sa; - } - __m512 _alpha = _mm512_set1_ps(alpha); - AddProduct(C + 0 * F, _alpha, c00); - AddProduct(C + 1 * F, _alpha, c01, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c10); - AddProduct(C + 1 * F, _alpha, c11, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c20); - AddProduct(C + 1 * F, _alpha, c21, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c30); - AddProduct(C + 1 * F, _alpha, c31, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c40); - AddProduct(C + 1 * F, _alpha, c41, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c50); - AddProduct(C + 1 * F, _alpha, c51, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c60); - AddProduct(C + 1 * F, _alpha, c61, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c70); - AddProduct(C + 1 * F, _alpha, c71, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c80); - AddProduct(C + 1 * F, _alpha, c81, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c90); - AddProduct(C + 1 * F, _alpha, c91, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, cA0); - AddProduct(C + 1 * F, _alpha, cA1, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, cB0); - AddProduct(C + 1 * F, _alpha, cB1, mask); - } - - void GemmKernel12x16nn(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, __mmask16 mask) - { - __m512 c00 = _mm512_setzero_ps(); - __m512 c10 = _mm512_setzero_ps(); - __m512 c20 = _mm512_setzero_ps(); - __m512 c30 = _mm512_setzero_ps(); - __m512 c40 = _mm512_setzero_ps(); - __m512 c50 = _mm512_setzero_ps(); - __m512 c60 = _mm512_setzero_ps(); - __m512 c70 = _mm512_setzero_ps(); - __m512 c80 = _mm512_setzero_ps(); - __m512 c90 = _mm512_setzero_ps(); - __m512 cA0 = _mm512_setzero_ps(); - __m512 cB0 = _mm512_setzero_ps(); - const float * A0 = A, *A6 = A + 6 * lda; - const size_t oa0 = lda * 0; - const size_t oa1 = lda * 1; - const size_t oa2 = lda * 2; - const size_t oa3 = lda * 3; - const size_t oa4 = lda * 4; - const size_t oa5 = lda * 5; - const size_t sa = lda == 1 ? 
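/*
 * The __mmask16 argument exists so that the right-most 16-float column of a
 * tile can be partial: full columns call AddProduct unmasked, and only the
 * last one passes the mask through, where the load and store touch just the
 * first N mod 16 lanes of C. A sketch of that masked read-modify-write,
 * assuming AddProduct is implemented along these lines:
 *
 *   #include <immintrin.h>
 *
 *   void AddProductSketch(float* c, __m512 alpha, __m512 sum, __mmask16 m)
 *   {
 *       __m512 old = _mm512_maskz_loadu_ps(m, c);   // inactive lanes read as 0
 *       _mm512_mask_storeu_ps(c, m, _mm512_fmadd_ps(alpha, sum, old));
 *   }
 *
 *   // TailMask16(r) presumably yields a mask with the first r lanes active,
 *   // i.e. something like (__mmask16)((1u << r) - 1) for r in 1..16.
 */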
12 : 1; - const size_t ob0 = ldb * 0; - __m512 b0, a0; - for (size_t k = 0; k < K; k++) - { - b0 = _mm512_loadu_ps(B + ob0); - a0 = _mm512_set1_ps(A0[oa0]); - c00 = _mm512_fmadd_ps(a0, b0, c00); - a0 = _mm512_set1_ps(A0[oa1]); - c10 = _mm512_fmadd_ps(a0, b0, c10); - a0 = _mm512_set1_ps(A0[oa2]); - c20 = _mm512_fmadd_ps(a0, b0, c20); - a0 = _mm512_set1_ps(A0[oa3]); - c30 = _mm512_fmadd_ps(a0, b0, c30); - a0 = _mm512_set1_ps(A0[oa4]); - c40 = _mm512_fmadd_ps(a0, b0, c40); - a0 = _mm512_set1_ps(A0[oa5]); - c50 = _mm512_fmadd_ps(a0, b0, c50); - a0 = _mm512_set1_ps(A6[oa0]); - c60 = _mm512_fmadd_ps(a0, b0, c60); - a0 = _mm512_set1_ps(A6[oa1]); - c70 = _mm512_fmadd_ps(a0, b0, c70); - a0 = _mm512_set1_ps(A6[oa2]); - c80 = _mm512_fmadd_ps(a0, b0, c80); - a0 = _mm512_set1_ps(A6[oa3]); - c90 = _mm512_fmadd_ps(a0, b0, c90); - a0 = _mm512_set1_ps(A6[oa4]); - cA0 = _mm512_fmadd_ps(a0, b0, cA0); - a0 = _mm512_set1_ps(A6[oa5]); - cB0 = _mm512_fmadd_ps(a0, b0, cB0); - B += sb; - A0 += sa; - A6 += sa; - } - __m512 _alpha = _mm512_set1_ps(alpha); - AddProduct(C + 0 * F, _alpha, c00, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c10, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c20, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c30, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c40, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c50, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c60, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c70, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c80, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c90, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, cA0, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, cB0, mask); - } - - void GemmKernel14x32nn(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, __mmask16 mask) - { - __m512 c00 = _mm512_setzero_ps(); - __m512 c10 = _mm512_setzero_ps(); - __m512 c20 = _mm512_setzero_ps(); - __m512 c30 = _mm512_setzero_ps(); - __m512 c40 = _mm512_setzero_ps(); - __m512 c50 = _mm512_setzero_ps(); - __m512 c01 = _mm512_setzero_ps(); - __m512 c11 = _mm512_setzero_ps(); - __m512 c21 = _mm512_setzero_ps(); - __m512 c31 = _mm512_setzero_ps(); - __m512 c41 = _mm512_setzero_ps(); - __m512 c51 = _mm512_setzero_ps(); - __m512 c60 = _mm512_setzero_ps(); - __m512 c70 = _mm512_setzero_ps(); - __m512 c80 = _mm512_setzero_ps(); - __m512 c90 = _mm512_setzero_ps(); - __m512 cA0 = _mm512_setzero_ps(); - __m512 cB0 = _mm512_setzero_ps(); - __m512 c61 = _mm512_setzero_ps(); - __m512 c71 = _mm512_setzero_ps(); - __m512 c81 = _mm512_setzero_ps(); - __m512 c91 = _mm512_setzero_ps(); - __m512 cA1 = _mm512_setzero_ps(); - __m512 cB1 = _mm512_setzero_ps(); - __m512 cC0 = _mm512_setzero_ps(); - __m512 cC1 = _mm512_setzero_ps(); - __m512 cD0 = _mm512_setzero_ps(); - __m512 cD1 = _mm512_setzero_ps(); - const float * A0 = A, *A7 = A + 7 * lda; - const size_t oa0 = lda * 0; - const size_t oa1 = lda * 1; - const size_t oa2 = lda * 2; - const size_t oa3 = lda * 3; - const size_t oa4 = lda * 4; - const size_t oa5 = lda * 5; - const size_t oa6 = lda * 6; - const size_t sa = lda == 1 ? 
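/*
 * Two details of the 12- and 14-row kernels: A is walked through two pointers
 * (A0 and A6, or A0 and A7) stepped in lockstep, so only six or seven oa
 * offsets need to be precomputed instead of twelve or fourteen; and the tile
 * shape is sized to the register file: a 14x32 tile uses 14 * 2 = 28
 * accumulators plus two B vectors and one A broadcast, 31 of the 32 zmm
 * registers, the largest shape that still fits.
 */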
14 : 1; - const size_t ob0 = ldb * 0; - const size_t ob1 = ldb * 1; - __m512 b0, b1, a0; - for (size_t k = 0; k < K; k++) - { - b0 = _mm512_loadu_ps(B + ob0); - b1 = _mm512_loadu_ps(B + ob1); - a0 = _mm512_set1_ps(A0[oa0]); - c00 = _mm512_fmadd_ps(a0, b0, c00); - c01 = _mm512_fmadd_ps(a0, b1, c01); - a0 = _mm512_set1_ps(A0[oa1]); - c10 = _mm512_fmadd_ps(a0, b0, c10); - c11 = _mm512_fmadd_ps(a0, b1, c11); - a0 = _mm512_set1_ps(A0[oa2]); - c20 = _mm512_fmadd_ps(a0, b0, c20); - c21 = _mm512_fmadd_ps(a0, b1, c21); - a0 = _mm512_set1_ps(A0[oa3]); - c30 = _mm512_fmadd_ps(a0, b0, c30); - c31 = _mm512_fmadd_ps(a0, b1, c31); - a0 = _mm512_set1_ps(A0[oa4]); - c40 = _mm512_fmadd_ps(a0, b0, c40); - c41 = _mm512_fmadd_ps(a0, b1, c41); - a0 = _mm512_set1_ps(A0[oa5]); - c50 = _mm512_fmadd_ps(a0, b0, c50); - c51 = _mm512_fmadd_ps(a0, b1, c51); - a0 = _mm512_set1_ps(A0[oa6]); - c60 = _mm512_fmadd_ps(a0, b0, c60); - c61 = _mm512_fmadd_ps(a0, b1, c61); - a0 = _mm512_set1_ps(A7[oa0]); - c70 = _mm512_fmadd_ps(a0, b0, c70); - c71 = _mm512_fmadd_ps(a0, b1, c71); - a0 = _mm512_set1_ps(A7[oa1]); - c80 = _mm512_fmadd_ps(a0, b0, c80); - c81 = _mm512_fmadd_ps(a0, b1, c81); - a0 = _mm512_set1_ps(A7[oa2]); - c90 = _mm512_fmadd_ps(a0, b0, c90); - c91 = _mm512_fmadd_ps(a0, b1, c91); - a0 = _mm512_set1_ps(A7[oa3]); - cA0 = _mm512_fmadd_ps(a0, b0, cA0); - cA1 = _mm512_fmadd_ps(a0, b1, cA1); - a0 = _mm512_set1_ps(A7[oa4]); - cB0 = _mm512_fmadd_ps(a0, b0, cB0); - cB1 = _mm512_fmadd_ps(a0, b1, cB1); - a0 = _mm512_set1_ps(A7[oa5]); - cC0 = _mm512_fmadd_ps(a0, b0, cC0); - cC1 = _mm512_fmadd_ps(a0, b1, cC1); - a0 = _mm512_set1_ps(A7[oa6]); - cD0 = _mm512_fmadd_ps(a0, b0, cD0); - cD1 = _mm512_fmadd_ps(a0, b1, cD1); - B += sb; - A0 += sa; - A7 += sa; - } - __m512 _alpha = _mm512_set1_ps(alpha); - AddProduct(C + 0 * F, _alpha, c00); - AddProduct(C + 1 * F, _alpha, c01, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c10); - AddProduct(C + 1 * F, _alpha, c11, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c20); - AddProduct(C + 1 * F, _alpha, c21, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c30); - AddProduct(C + 1 * F, _alpha, c31, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c40); - AddProduct(C + 1 * F, _alpha, c41, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c50); - AddProduct(C + 1 * F, _alpha, c51, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c60); - AddProduct(C + 1 * F, _alpha, c61, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c70); - AddProduct(C + 1 * F, _alpha, c71, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c80); - AddProduct(C + 1 * F, _alpha, c81, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, c90); - AddProduct(C + 1 * F, _alpha, c91, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, cA0); - AddProduct(C + 1 * F, _alpha, cA1, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, cB0); - AddProduct(C + 1 * F, _alpha, cB1, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, cC0); - AddProduct(C + 1 * F, _alpha, cC1, mask); - C += ldc; - AddProduct(C + 0 * F, _alpha, cD0); - AddProduct(C + 1 * F, _alpha, cD1, mask); - } - - void GemmKernel14x16nn(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, __mmask16 mask) - { - __m512 c0 = _mm512_setzero_ps(); - __m512 c1 = _mm512_setzero_ps(); - __m512 c2 = _mm512_setzero_ps(); - __m512 c3 = _mm512_setzero_ps(); - __m512 c4 = _mm512_setzero_ps(); - __m512 c5 = _mm512_setzero_ps(); - __m512 c6 = _mm512_setzero_ps(); - __m512 c7 = _mm512_setzero_ps(); - __m512 c8 = 
_mm512_setzero_ps(); - __m512 c9 = _mm512_setzero_ps(); - __m512 cA = _mm512_setzero_ps(); - __m512 cB = _mm512_setzero_ps(); - __m512 cC = _mm512_setzero_ps(); - __m512 cD = _mm512_setzero_ps(); - const float * A0 = A, * A7 = A + 7*lda; - const size_t oa0 = lda * 0; - const size_t oa1 = lda * 1; - const size_t oa2 = lda * 2; - const size_t oa3 = lda * 3; - const size_t oa4 = lda * 4; - const size_t oa5 = lda * 5; - const size_t oa6 = lda * 6; - const size_t sa = lda == 1 ? 14 : 1; - const size_t ob0 = ldb * 0; - __m512 b0, a0; - for (size_t k = 0; k < K; k++) - { - b0 = _mm512_loadu_ps(B + ob0); - a0 = _mm512_set1_ps(A0[oa0]); - c0 = _mm512_fmadd_ps(a0, b0, c0); - a0 = _mm512_set1_ps(A0[oa1]); - c1 = _mm512_fmadd_ps(a0, b0, c1); - a0 = _mm512_set1_ps(A0[oa2]); - c2 = _mm512_fmadd_ps(a0, b0, c2); - a0 = _mm512_set1_ps(A0[oa3]); - c3 = _mm512_fmadd_ps(a0, b0, c3); - a0 = _mm512_set1_ps(A0[oa4]); - c4 = _mm512_fmadd_ps(a0, b0, c4); - a0 = _mm512_set1_ps(A0[oa5]); - c5 = _mm512_fmadd_ps(a0, b0, c5); - a0 = _mm512_set1_ps(A0[oa6]); - c6 = _mm512_fmadd_ps(a0, b0, c6); - a0 = _mm512_set1_ps(A7[oa0]); - c7 = _mm512_fmadd_ps(a0, b0, c7); - a0 = _mm512_set1_ps(A7[oa1]); - c8 = _mm512_fmadd_ps(a0, b0, c8); - a0 = _mm512_set1_ps(A7[oa2]); - c9 = _mm512_fmadd_ps(a0, b0, c9); - a0 = _mm512_set1_ps(A7[oa3]); - cA = _mm512_fmadd_ps(a0, b0, cA); - a0 = _mm512_set1_ps(A7[oa4]); - cB = _mm512_fmadd_ps(a0, b0, cB); - a0 = _mm512_set1_ps(A7[oa5]); - cC = _mm512_fmadd_ps(a0, b0, cC); - a0 = _mm512_set1_ps(A7[oa6]); - cD = _mm512_fmadd_ps(a0, b0, cD); - B += sb; - A0 += sa; - A7 += sa; - } - __m512 _alpha = _mm512_set1_ps(alpha); - AddProduct(C, _alpha, c0, mask); - C += ldc; - AddProduct(C, _alpha, c1, mask); - C += ldc; - AddProduct(C, _alpha, c2, mask); - C += ldc; - AddProduct(C, _alpha, c3, mask); - C += ldc; - AddProduct(C, _alpha, c4, mask); - C += ldc; - AddProduct(C, _alpha, c5, mask); - C += ldc; - AddProduct(C, _alpha, c6, mask); - C += ldc; - AddProduct(C, _alpha, c7, mask); - C += ldc; - AddProduct(C, _alpha, c8, mask); - C += ldc; - AddProduct(C, _alpha, c9, mask); - C += ldc; - AddProduct(C, _alpha, cA, mask); - C += ldc; - AddProduct(C, _alpha, cB, mask); - C += ldc; - AddProduct(C, _alpha, cC, mask); - C += ldc; - AddProduct(C, _alpha, cD, mask); - } - - void GemmKernelMx48nn(size_t M, size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, __mmask16 mask) - { -#if SIMD_ZMM_COUNT == 32 - __m512 c[9][3]; - size_t oa[9]; -#else - __m512 c[4][3]; - size_t oa[4]; -#endif - const size_t sa = lda == 1 ? 
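/*
 * The GemmKernelMx* functions are the row-remainder kernels: here M is a
 * runtime value (the last M mod microM rows of C), so the named accumulator
 * registers become small arrays walked by an inner loop, with the array bound
 * sized to the register file chosen by SIMD_ZMM_COUNT (e.g. 9 rows by 3
 * vectors when 32 zmm registers are available, 4 by 3 otherwise). The
 * per-iteration loop overhead is why they are used only for the remainder and
 * not for the bulk of the matrix.
 */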
M : 1; - const size_t ob0 = ldb * 0; - const size_t ob1 = ldb * 1; - const size_t ob2 = ldb * 2; - for (size_t i = 0; i < M; ++i) - { - c[i][0] = _mm512_setzero_ps(); - c[i][1] = _mm512_setzero_ps(); - c[i][2] = _mm512_setzero_ps(); - oa[i] = lda * i; - } - __m512 b0, b1, b2, a0; - for (size_t k = 0; k < K; k++) - { - b0 = _mm512_loadu_ps(B + ob0); - b1 = _mm512_loadu_ps(B + ob1); - b2 = _mm512_loadu_ps(B + ob2); - for (size_t i = 0; i < M; ++i) - { - a0 = _mm512_set1_ps(A[oa[i]]); - c[i][0] = _mm512_add_ps(_mm512_mul_ps(b0, a0), c[i][0]); - c[i][1] = _mm512_add_ps(_mm512_mul_ps(b1, a0), c[i][1]); - c[i][2] = _mm512_add_ps(_mm512_mul_ps(b2, a0), c[i][2]); - } - B += sb; - A += sa; - } - __m512 _alpha = _mm512_set1_ps(alpha); - for (size_t i = 0; i < M; ++i) - { - AddProduct(C + 0 * F, _alpha, c[i][0]); - AddProduct(C + 1 * F, _alpha, c[i][1]); - AddProduct(C + 2 * F, _alpha, c[i][2], mask); - C += ldc; - } - } - - void GemmKernelMx32nn(size_t M, size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, __mmask16 mask) - { -#if SIMD_ZMM_COUNT == 32 - __m512 c[14][2]; - size_t oa[14]; -#else - __m512 c[6][2]; - size_t oa[6]; -#endif - const size_t sa = lda == 1 ? M : 1; - const size_t ob0 = ldb * 0; - const size_t ob1 = ldb * 1; - for (size_t i = 0; i < M; ++i) - { - c[i][0] = _mm512_setzero_ps(); - c[i][1] = _mm512_setzero_ps(); - oa[i] = lda * i; - } - __m512 b0, b1, a0; - for (size_t k = 0; k < K; k++) - { - b0 = _mm512_loadu_ps(B + ob0); - b1 = _mm512_loadu_ps(B + ob1); - for (size_t i = 0; i < M; ++i) - { - a0 = _mm512_set1_ps(A[oa[i]]); - c[i][0] = _mm512_fmadd_ps(b0, a0, c[i][0]); - c[i][1] = _mm512_fmadd_ps(b1, a0, c[i][1]); - } - B += sb; - A += sa; - } - __m512 _alpha = _mm512_set1_ps(alpha); - for (size_t i = 0; i < M; ++i) - { - AddProduct(C + 0 * F, _alpha, c[i][0]); - AddProduct(C + 1 * F, _alpha, c[i][1], mask); - C += ldc; - } - } - - void GemmKernelMx16nn(size_t M, size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, __mmask16 mask) - { - //SIMD_PERF_BEG(Simd::ToStr(M) + "-" + Simd::ToStr(N) + "-" + Simd::ToStr(K)); - -#if SIMD_ZMM_COUNT == 32 - __m512 c[14]; - size_t oa[14]; -#elif SIMD_ZMM_COUNT == 16 - __m512 c[6]; - size_t oa[6]; -#else - __m512 c[4]; - size_t oa[4]; -#endif - const size_t sa = lda == 1 ? 
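/*
 * One inconsistency worth noting: the k-loop of GemmKernelMx48nn above
 * accumulates with a separate _mm512_mul_ps followed by _mm512_add_ps, while
 * every other kernel in this file uses _mm512_fmadd_ps. The two are not
 * bit-identical: the fused form rounds once, the unfused form twice, so the
 * last ulp of the result can differ between the Mx48 tail and the other
 * paths. The scalar analogue:
 *
 *   #include <cmath>
 *
 *   float Fused(float a, float b, float c)   { return std::fma(a, b, c); }
 *   float Unfused(float a, float b, float c) { return a * b + c; }
 *   // Fused() rounds a * b + c once; Unfused() typically rounds a * b first,
 *   // then rounds the addition.
 */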
-M : 1;
-            const size_t ob0 = ldb * 0;
-            for (size_t k = 0; k < K; k++)
-            {
-                b0 = _mm512_loadu_ps(B + ob0);
-                for (size_t i = 0; i < M; ++i)
-                {
-                    a0 = _mm512_set1_ps(A[oa[i]]);
-                    c[i] = _mm512_fmadd_ps(b0, a0, c[i]);
-                }
-                B += sb;
-                A += sa;
-            }
-            __m512 _alpha = _mm512_set1_ps(alpha);
-            for (size_t i = 0; i < M; ++i)
-                AddProduct(C + i * ldc, _alpha, c[i], mask);
-        }
-
-        template<int M> void GemmKernelMx64nnT(size_t, size_t K, float alpha, const float* A, size_t lda, const float* B, size_t ldb, size_t sb, float* C, size_t ldc, __mmask16 mask)
-        {
-            __m512 c00, c01, c02, c03, c04, c05, c10, c11, c12, c13, c14, c15, c20, c21, c22, c23, c24, c25, c30, c31, c32, c33, c34, c35, b0, b1, b2, b3, a0;
-            if (M > 0x0) c00 = _mm512_setzero_ps(), c10 = _mm512_setzero_ps(), c20 = _mm512_setzero_ps(), c30 = _mm512_setzero_ps();
-            if (M > 0x1) c01 = _mm512_setzero_ps(), c11 = _mm512_setzero_ps(), c21 = _mm512_setzero_ps(), c31 = _mm512_setzero_ps();
-            if (M > 0x2) c02 = _mm512_setzero_ps(), c12 = _mm512_setzero_ps(), c22 = _mm512_setzero_ps(), c32 = _mm512_setzero_ps();
-            if (M > 0x3) c03 = _mm512_setzero_ps(), c13 = _mm512_setzero_ps(), c23 = _mm512_setzero_ps(), c33 = _mm512_setzero_ps();
-            if (M > 0x4) c04 = _mm512_setzero_ps(), c14 = _mm512_setzero_ps(), c24 = _mm512_setzero_ps(), c34 = _mm512_setzero_ps();
-            if (M > 0x5) c05 = _mm512_setzero_ps(), c15 = _mm512_setzero_ps(), c25 = _mm512_setzero_ps(), c35 = _mm512_setzero_ps();
-            size_t oa0, oa1, oa2, oa3, oa4, oa5;
-            if (M > 0) oa0 = lda * 0;
-            if (M > 1) oa1 = lda * 1;
-            if (M > 2) oa2 = lda * 2;
-            if (M > 3) oa3 = lda * 3;
-            if (M > 4) oa4 = lda * 4;
-            if (M > 5) oa5 = lda * 5;
-            const size_t sa = lda == 1 ? M : 1;
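/*
 * In the GemmKernelMx*nnT templates, M is a compile-time constant (a non-type
 * template parameter; the first, unnamed size_t argument exists only so the
 * Tail signature matches the runtime-M kernels). Every "if (M > ...)" below
 * therefore folds away at compile time, and each instantiation is a fully
 * unrolled kernel for exactly M rows. A minimal sketch of the idiom:
 *
 *   template<int M> void TailSketch(float* acc)
 *   {
 *       if (M > 0) acc[0] += 1.0f;   // removed entirely when M == 0
 *       if (M > 1) acc[1] += 1.0f;   // removed entirely when M < 2
 *   }
 *   // TailSketch<1> compiles down to just the first increment.
 */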
-            const size_t ob0 = ldb * 0;
-            const size_t ob1 = ldb * 1;
-            const size_t ob2 = ldb * 2;
-            const size_t ob3 = ldb * 3;
-            for (size_t k = 0; k < K; k++)
-            {
-                b0 = _mm512_loadu_ps(B + ob0);
-                b1 = _mm512_loadu_ps(B + ob1);
-                b2 = _mm512_loadu_ps(B + ob2);
-                b3 = _mm512_loadu_ps(B + ob3);
-                if (M > 0x0) a0 = _mm512_set1_ps(A[oa0]), c00 = _mm512_fmadd_ps(a0, b0, c00), c10 = _mm512_fmadd_ps(a0, b1, c10), c20 = _mm512_fmadd_ps(a0, b2, c20), c30 = _mm512_fmadd_ps(a0, b3, c30);
-                if (M > 0x1) a0 = _mm512_set1_ps(A[oa1]), c01 = _mm512_fmadd_ps(a0, b0, c01), c11 = _mm512_fmadd_ps(a0, b1, c11), c21 = _mm512_fmadd_ps(a0, b2, c21), c31 = _mm512_fmadd_ps(a0, b3, c31);
-                if (M > 0x2) a0 = _mm512_set1_ps(A[oa2]), c02 = _mm512_fmadd_ps(a0, b0, c02), c12 = _mm512_fmadd_ps(a0, b1, c12), c22 = _mm512_fmadd_ps(a0, b2, c22), c32 = _mm512_fmadd_ps(a0, b3, c32);
-                if (M > 0x3) a0 = _mm512_set1_ps(A[oa3]), c03 = _mm512_fmadd_ps(a0, b0, c03), c13 = _mm512_fmadd_ps(a0, b1, c13), c23 = _mm512_fmadd_ps(a0, b2, c23), c33 = _mm512_fmadd_ps(a0, b3, c33);
-                if (M > 0x4) a0 = _mm512_set1_ps(A[oa4]), c04 = _mm512_fmadd_ps(a0, b0, c04), c14 = _mm512_fmadd_ps(a0, b1, c14), c24 = _mm512_fmadd_ps(a0, b2, c24), c34 = _mm512_fmadd_ps(a0, b3, c34);
-                if (M > 0x5) a0 = _mm512_set1_ps(A[oa5]), c05 = _mm512_fmadd_ps(a0, b0, c05), c15 = _mm512_fmadd_ps(a0, b1, c15), c25 = _mm512_fmadd_ps(a0, b2, c25), c35 = _mm512_fmadd_ps(a0, b3, c35);
-                B += sb;
-                A += sa;
-            }
-            __m512 _alpha = _mm512_set1_ps(alpha);
-            if (M > 0x0) AddProduct(C + 0 * F, _alpha, c00), AddProduct(C + 1 * F, _alpha, c10), AddProduct(C + 2 * F, _alpha, c20), AddProduct(C + 3 * F, _alpha, c30, mask), C += ldc;
-            if (M > 0x1) AddProduct(C + 0 * F, _alpha, c01), AddProduct(C + 1 * F, _alpha, c11), AddProduct(C + 2 * F, _alpha, c21), AddProduct(C + 3 * F, _alpha, c31, mask), C += ldc;
-            if (M > 0x2) AddProduct(C + 0 * F, _alpha, c02), AddProduct(C + 1 * F, _alpha, c12), AddProduct(C + 2 * F, _alpha, c22), AddProduct(C + 3 * F, _alpha, c32, mask), C += ldc;
-            if (M > 0x3) AddProduct(C + 0 * F, _alpha, c03), AddProduct(C + 1 * F, _alpha, c13), AddProduct(C + 2 * F, _alpha, c23), AddProduct(C + 3 * F, _alpha, c33, mask), C += ldc;
-            if (M > 0x4) AddProduct(C + 0 * F, _alpha, c04), AddProduct(C + 1 * F, _alpha, c14), AddProduct(C + 2 * F, _alpha, c24), AddProduct(C + 3 * F, _alpha, c34, mask), C += ldc;
-            if (M > 0x5) AddProduct(C + 0 * F, _alpha, c05), AddProduct(C + 1 * F, _alpha, c15), AddProduct(C + 2 * F, _alpha, c25), AddProduct(C + 3 * F, _alpha, c35, mask), C += ldc;
-        }
-
-        template<int M> void GemmKernelMx48nnT(size_t, size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, __mmask16 mask)
-        {
-            __m512 c00, c01, c02, c03, c04, c05, c06, c07, c08, c10, c11, c12, c13, c14, c15, c16, c17, c18, c20, c21, c22, c23, c24, c25, c26, c27, c28, b0, b1, b2, a0;
-            if (M > 0x0) c00 = _mm512_setzero_ps(), c10 = _mm512_setzero_ps(), c20 = _mm512_setzero_ps();
-            if (M > 0x1) c01 = _mm512_setzero_ps(), c11 = _mm512_setzero_ps(), c21 = _mm512_setzero_ps();
-            if (M > 0x2) c02 = _mm512_setzero_ps(), c12 = _mm512_setzero_ps(), c22 = _mm512_setzero_ps();
-            if (M > 0x3) c03 = _mm512_setzero_ps(), c13 = _mm512_setzero_ps(), c23 = _mm512_setzero_ps();
-            if (M > 0x4) c04 = _mm512_setzero_ps(), c14 = _mm512_setzero_ps(), c24 = _mm512_setzero_ps();
-            if (M > 0x5) c05 = _mm512_setzero_ps(), c15 = _mm512_setzero_ps(), c25 = _mm512_setzero_ps();
-            if (M > 0x6) c06 = _mm512_setzero_ps(), c16 = _mm512_setzero_ps(), c26 = _mm512_setzero_ps();
-            if (M > 0x7) c07 = _mm512_setzero_ps(), c17 = _mm512_setzero_ps(), c27 = _mm512_setzero_ps();
-            if (M > 0x8) c08 = _mm512_setzero_ps(), c18 = _mm512_setzero_ps(), c28 = _mm512_setzero_ps();
-            const float * A0 = A, *A5 = A + 5 * lda;
-            size_t oa0, oa1, oa2, oa3, oa4;
-            if (M > 0) oa0 = lda * 0;
-            if (M > 1) oa1 = lda * 1;
-            if (M > 2) oa2 = lda * 2;
-            if (M > 3) oa3 = lda * 3;
-            if (M > 4) oa4 = lda * 4;
-            const size_t sa = lda == 1 ? M : 1;
-            const size_t ob0 = ldb * 0;
-            const size_t ob1 = ldb * 1;
-            const size_t ob2 = ldb * 2;
-            for (size_t k = 0; k < K; k++)
-            {
-                b0 = _mm512_loadu_ps(B + ob0);
-                b1 = _mm512_loadu_ps(B + ob1);
-                b2 = _mm512_loadu_ps(B + ob2);
-                if (M > 0x0) a0 = _mm512_set1_ps(A0[oa0]), c00 = _mm512_fmadd_ps(a0, b0, c00), c10 = _mm512_fmadd_ps(a0, b1, c10), c20 = _mm512_fmadd_ps(a0, b2, c20);
-                if (M > 0x1) a0 = _mm512_set1_ps(A0[oa1]), c01 = _mm512_fmadd_ps(a0, b0, c01), c11 = _mm512_fmadd_ps(a0, b1, c11), c21 = _mm512_fmadd_ps(a0, b2, c21);
-                if (M > 0x2) a0 = _mm512_set1_ps(A0[oa2]), c02 = _mm512_fmadd_ps(a0, b0, c02), c12 = _mm512_fmadd_ps(a0, b1, c12), c22 = _mm512_fmadd_ps(a0, b2, c22);
-                if (M > 0x3) a0 = _mm512_set1_ps(A0[oa3]), c03 = _mm512_fmadd_ps(a0, b0, c03), c13 = _mm512_fmadd_ps(a0, b1, c13), c23 = _mm512_fmadd_ps(a0, b2, c23);
-                if (M > 0x4) a0 = _mm512_set1_ps(A0[oa4]), c04 = _mm512_fmadd_ps(a0, b0, c04), c14 = _mm512_fmadd_ps(a0, b1, c14), c24 = _mm512_fmadd_ps(a0, b2, c24);
-                if (M > 0x5) a0 = _mm512_set1_ps(A5[oa0]), c05 = _mm512_fmadd_ps(a0, b0, c05), c15 = _mm512_fmadd_ps(a0, b1, c15), c25 = _mm512_fmadd_ps(a0, b2, c25);
-                if (M > 0x6) a0 = _mm512_set1_ps(A5[oa1]), c06 = _mm512_fmadd_ps(a0, b0, c06), c16 = _mm512_fmadd_ps(a0, b1, c16), c26 = _mm512_fmadd_ps(a0, b2, c26);
-                if (M > 0x7) a0 = _mm512_set1_ps(A5[oa2]), c07 = _mm512_fmadd_ps(a0, b0, c07), c17 = _mm512_fmadd_ps(a0, b1, c17), c27 = _mm512_fmadd_ps(a0, b2, c27);
-                if (M > 0x8) a0 = _mm512_set1_ps(A5[oa3]), c08 = _mm512_fmadd_ps(a0, b0, c08), c18 = _mm512_fmadd_ps(a0, b1, c18), c28 = _mm512_fmadd_ps(a0, b2, c28);
-                B += sb;
-                A0 += sa;
-                A5 += sa;
-            }
-            __m512 _alpha = _mm512_set1_ps(alpha);
-            if (M > 0x0) AddProduct(C + 0 * F, _alpha, c00), AddProduct(C + 1 * F, _alpha, c10), AddProduct(C + 2 * F, _alpha, c20, mask), C += ldc;
-            if (M > 0x1) AddProduct(C + 0 * F, _alpha, c01), AddProduct(C + 1 * F, _alpha, c11), AddProduct(C + 2 * F, _alpha, c21, mask), C += ldc;
-            if (M > 0x2) AddProduct(C + 0 * F, _alpha, c02), AddProduct(C + 1 * F, _alpha, c12), AddProduct(C + 2 * F, _alpha, c22, mask), C += ldc;
-            if (M > 0x3) AddProduct(C + 0 * F, _alpha, c03), AddProduct(C + 1 * F, _alpha, c13), AddProduct(C + 2 * F, _alpha, c23, mask), C += ldc;
-            if (M > 0x4) AddProduct(C + 0 * F, _alpha, c04), AddProduct(C + 1 * F, _alpha, c14), AddProduct(C + 2 * F, _alpha, c24, mask), C += ldc;
-            if (M > 0x5) AddProduct(C + 0 * F, _alpha, c05), AddProduct(C + 1 * F, _alpha, c15), AddProduct(C + 2 * F, _alpha, c25, mask), C += ldc;
-            if (M > 0x6) AddProduct(C + 0 * F, _alpha, c06), AddProduct(C + 1 * F, _alpha, c16), AddProduct(C + 2 * F, _alpha, c26, mask), C += ldc;
-            if (M > 0x7) AddProduct(C + 0 * F, _alpha, c07), AddProduct(C + 1 * F, _alpha, c17), AddProduct(C + 2 * F, _alpha, c27, mask), C += ldc;
-            if (M > 0x8) AddProduct(C + 0 * F, _alpha, c08), AddProduct(C + 1 * F, _alpha, c18), AddProduct(C + 2 * F, _alpha, c28, mask), C += ldc;
-        }
-
-        template<int M> void GemmKernelMx32nnT(size_t, size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C,
size_t ldc, __mmask16 mask) - { - __m512 c00, c01, c02, c03, c04, c05, c06, c07, c08, c09, c0A, c0B, c0C, c0D, c10, c11, c12, c13, c14, c15, c16, c17, c18, c19, c1A, c1B, c1C, c1D, b0, b1, a0; - if (M > 0x0) c00 = _mm512_setzero_ps(), c10 = _mm512_setzero_ps(); - if (M > 0x1) c01 = _mm512_setzero_ps(), c11 = _mm512_setzero_ps(); - if (M > 0x2) c02 = _mm512_setzero_ps(), c12 = _mm512_setzero_ps(); - if (M > 0x3) c03 = _mm512_setzero_ps(), c13 = _mm512_setzero_ps(); - if (M > 0x4) c04 = _mm512_setzero_ps(), c14 = _mm512_setzero_ps(); - if (M > 0x5) c05 = _mm512_setzero_ps(), c15 = _mm512_setzero_ps(); - if (M > 0x6) c06 = _mm512_setzero_ps(), c16 = _mm512_setzero_ps(); - if (M > 0x7) c07 = _mm512_setzero_ps(), c17 = _mm512_setzero_ps(); - if (M > 0x8) c08 = _mm512_setzero_ps(), c18 = _mm512_setzero_ps(); - if (M > 0x9) c09 = _mm512_setzero_ps(), c19 = _mm512_setzero_ps(); - if (M > 0xA) c0A = _mm512_setzero_ps(), c1A = _mm512_setzero_ps(); - if (M > 0xB) c0B = _mm512_setzero_ps(), c1B = _mm512_setzero_ps(); - if (M > 0xC) c0C = _mm512_setzero_ps(), c1C = _mm512_setzero_ps(); - if (M > 0xD) c0D = _mm512_setzero_ps(), c1D = _mm512_setzero_ps(); - const float * A0 = A, *A7 = A + 7 * lda; - size_t oa0, oa1, oa2, oa3, oa4, oa5, oa6; - if (M > 0) oa0 = lda * 0; - if (M > 1) oa1 = lda * 1; - if (M > 2) oa2 = lda * 2; - if (M > 3) oa3 = lda * 3; - if (M > 4) oa4 = lda * 4; - if (M > 5) oa5 = lda * 5; - if (M > 6) oa6 = lda * 6; - const size_t sa = lda == 1 ? M : 1; - const size_t ob0 = ldb * 0; - const size_t ob1 = ldb * 1; - for (size_t k = 0; k < K; k++) - { - b0 = _mm512_loadu_ps(B + ob0); - b1 = _mm512_loadu_ps(B + ob1); - if (M > 0x0) a0 = _mm512_set1_ps(A0[oa0]), c00 = _mm512_fmadd_ps(a0, b0, c00), c10 = _mm512_fmadd_ps(a0, b1, c10); - if (M > 0x1) a0 = _mm512_set1_ps(A0[oa1]), c01 = _mm512_fmadd_ps(a0, b0, c01), c11 = _mm512_fmadd_ps(a0, b1, c11); - if (M > 0x2) a0 = _mm512_set1_ps(A0[oa2]), c02 = _mm512_fmadd_ps(a0, b0, c02), c12 = _mm512_fmadd_ps(a0, b1, c12); - if (M > 0x3) a0 = _mm512_set1_ps(A0[oa3]), c03 = _mm512_fmadd_ps(a0, b0, c03), c13 = _mm512_fmadd_ps(a0, b1, c13); - if (M > 0x4) a0 = _mm512_set1_ps(A0[oa4]), c04 = _mm512_fmadd_ps(a0, b0, c04), c14 = _mm512_fmadd_ps(a0, b1, c14); - if (M > 0x5) a0 = _mm512_set1_ps(A0[oa5]), c05 = _mm512_fmadd_ps(a0, b0, c05), c15 = _mm512_fmadd_ps(a0, b1, c15); - if (M > 0x6) a0 = _mm512_set1_ps(A0[oa6]), c06 = _mm512_fmadd_ps(a0, b0, c06), c16 = _mm512_fmadd_ps(a0, b1, c16); - if (M > 0x7) a0 = _mm512_set1_ps(A7[oa0]), c07 = _mm512_fmadd_ps(a0, b0, c07), c17 = _mm512_fmadd_ps(a0, b1, c17); - if (M > 0x8) a0 = _mm512_set1_ps(A7[oa1]), c08 = _mm512_fmadd_ps(a0, b0, c08), c18 = _mm512_fmadd_ps(a0, b1, c18); - if (M > 0x9) a0 = _mm512_set1_ps(A7[oa2]), c09 = _mm512_fmadd_ps(a0, b0, c09), c19 = _mm512_fmadd_ps(a0, b1, c19); - if (M > 0xA) a0 = _mm512_set1_ps(A7[oa3]), c0A = _mm512_fmadd_ps(a0, b0, c0A), c1A = _mm512_fmadd_ps(a0, b1, c1A); - if (M > 0xB) a0 = _mm512_set1_ps(A7[oa4]), c0B = _mm512_fmadd_ps(a0, b0, c0B), c1B = _mm512_fmadd_ps(a0, b1, c1B); - if (M > 0xC) a0 = _mm512_set1_ps(A7[oa5]), c0C = _mm512_fmadd_ps(a0, b0, c0C), c1C = _mm512_fmadd_ps(a0, b1, c1C); - if (M > 0xD) a0 = _mm512_set1_ps(A7[oa6]), c0D = _mm512_fmadd_ps(a0, b0, c0D), c1D = _mm512_fmadd_ps(a0, b1, c1D); - B += sb; - A0 += sa; - A7 += sa; - } - __m512 _alpha = _mm512_set1_ps(alpha); - if (M > 0x0) AddProduct(C + 0 * F, _alpha, c00), AddProduct(C + 1 * F, _alpha, c10, mask), C += ldc; - if (M > 0x1) AddProduct(C + 0 * F, _alpha, c01), AddProduct(C + 1 * F, _alpha, c11, 
-mask), C += ldc;
-            if (M > 0x2) AddProduct(C + 0 * F, _alpha, c02), AddProduct(C + 1 * F, _alpha, c12, mask), C += ldc;
-            if (M > 0x3) AddProduct(C + 0 * F, _alpha, c03), AddProduct(C + 1 * F, _alpha, c13, mask), C += ldc;
-            if (M > 0x4) AddProduct(C + 0 * F, _alpha, c04), AddProduct(C + 1 * F, _alpha, c14, mask), C += ldc;
-            if (M > 0x5) AddProduct(C + 0 * F, _alpha, c05), AddProduct(C + 1 * F, _alpha, c15, mask), C += ldc;
-            if (M > 0x6) AddProduct(C + 0 * F, _alpha, c06), AddProduct(C + 1 * F, _alpha, c16, mask), C += ldc;
-            if (M > 0x7) AddProduct(C + 0 * F, _alpha, c07), AddProduct(C + 1 * F, _alpha, c17, mask), C += ldc;
-            if (M > 0x8) AddProduct(C + 0 * F, _alpha, c08), AddProduct(C + 1 * F, _alpha, c18, mask), C += ldc;
-            if (M > 0x9) AddProduct(C + 0 * F, _alpha, c09), AddProduct(C + 1 * F, _alpha, c19, mask), C += ldc;
-            if (M > 0xA) AddProduct(C + 0 * F, _alpha, c0A), AddProduct(C + 1 * F, _alpha, c1A, mask), C += ldc;
-            if (M > 0xB) AddProduct(C + 0 * F, _alpha, c0B), AddProduct(C + 1 * F, _alpha, c1B, mask), C += ldc;
-            if (M > 0xC) AddProduct(C + 0 * F, _alpha, c0C), AddProduct(C + 1 * F, _alpha, c1C, mask), C += ldc;
-            if (M > 0xD) AddProduct(C + 0 * F, _alpha, c0D), AddProduct(C + 1 * F, _alpha, c1D, mask), C += ldc;
-        }
-
-        template<int M> void GemmKernelMx16nnT(size_t, size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, __mmask16 mask)
-        {
-            __m512 c00, c01, c02, c03, c04, c05, c06, c07, c08, c09, c0A, c0B, c0C, c0D, b0, a0;
-            if (M > 0x0) c00 = _mm512_setzero_ps();
-            if (M > 0x1) c01 = _mm512_setzero_ps();
-            if (M > 0x2) c02 = _mm512_setzero_ps();
-            if (M > 0x3) c03 = _mm512_setzero_ps();
-            if (M > 0x4) c04 = _mm512_setzero_ps();
-            if (M > 0x5) c05 = _mm512_setzero_ps();
-            if (M > 0x6) c06 = _mm512_setzero_ps();
-            if (M > 0x7) c07 = _mm512_setzero_ps();
-            if (M > 0x8) c08 = _mm512_setzero_ps();
-            if (M > 0x9) c09 = _mm512_setzero_ps();
-            if (M > 0xA) c0A = _mm512_setzero_ps();
-            if (M > 0xB) c0B = _mm512_setzero_ps();
-            if (M > 0xC) c0C = _mm512_setzero_ps();
-            if (M > 0xD) c0D = _mm512_setzero_ps();
-            const float * A0 = A, *A7 = A + 7 * lda;
-            size_t oa0, oa1, oa2, oa3, oa4, oa5, oa6;
-            if (M > 0) oa0 = lda * 0;
-            if (M > 1) oa1 = lda * 1;
-            if (M > 2) oa2 = lda * 2;
-            if (M > 3) oa3 = lda * 3;
-            if (M > 4) oa4 = lda * 4;
-            if (M > 5) oa5 = lda * 5;
-            if (M > 6) oa6 = lda * 6;
-            const size_t sa = lda == 1 ?
M : 1; - const size_t ob0 = ldb * 0; - for (size_t k = 0; k < K; k++) - { - b0 = _mm512_loadu_ps(B + ob0); - if (M > 0x0) a0 = _mm512_set1_ps(A0[oa0]), c00 = _mm512_fmadd_ps(a0, b0, c00); - if (M > 0x1) a0 = _mm512_set1_ps(A0[oa1]), c01 = _mm512_fmadd_ps(a0, b0, c01); - if (M > 0x2) a0 = _mm512_set1_ps(A0[oa2]), c02 = _mm512_fmadd_ps(a0, b0, c02); - if (M > 0x3) a0 = _mm512_set1_ps(A0[oa3]), c03 = _mm512_fmadd_ps(a0, b0, c03); - if (M > 0x4) a0 = _mm512_set1_ps(A0[oa4]), c04 = _mm512_fmadd_ps(a0, b0, c04); - if (M > 0x5) a0 = _mm512_set1_ps(A0[oa5]), c05 = _mm512_fmadd_ps(a0, b0, c05); - if (M > 0x6) a0 = _mm512_set1_ps(A0[oa6]), c06 = _mm512_fmadd_ps(a0, b0, c06); - if (M > 0x7) a0 = _mm512_set1_ps(A7[oa0]), c07 = _mm512_fmadd_ps(a0, b0, c07); - if (M > 0x8) a0 = _mm512_set1_ps(A7[oa1]), c08 = _mm512_fmadd_ps(a0, b0, c08); - if (M > 0x9) a0 = _mm512_set1_ps(A7[oa2]), c09 = _mm512_fmadd_ps(a0, b0, c09); - if (M > 0xA) a0 = _mm512_set1_ps(A7[oa3]), c0A = _mm512_fmadd_ps(a0, b0, c0A); - if (M > 0xB) a0 = _mm512_set1_ps(A7[oa4]), c0B = _mm512_fmadd_ps(a0, b0, c0B); - if (M > 0xC) a0 = _mm512_set1_ps(A7[oa5]), c0C = _mm512_fmadd_ps(a0, b0, c0C); - if (M > 0xD) a0 = _mm512_set1_ps(A7[oa6]), c0D = _mm512_fmadd_ps(a0, b0, c0D); - B += sb; - A0 += sa; - A7 += sa; - } - __m512 _alpha = _mm512_set1_ps(alpha); - if (M > 0x0) AddProduct(C, _alpha, c00, mask), C += ldc; - if (M > 0x1) AddProduct(C, _alpha, c01, mask), C += ldc; - if (M > 0x2) AddProduct(C, _alpha, c02, mask), C += ldc; - if (M > 0x3) AddProduct(C, _alpha, c03, mask), C += ldc; - if (M > 0x4) AddProduct(C, _alpha, c04, mask), C += ldc; - if (M > 0x5) AddProduct(C, _alpha, c05, mask), C += ldc; - if (M > 0x6) AddProduct(C, _alpha, c06, mask), C += ldc; - if (M > 0x7) AddProduct(C, _alpha, c07, mask), C += ldc; - if (M > 0x8) AddProduct(C, _alpha, c08, mask), C += ldc; - if (M > 0x9) AddProduct(C, _alpha, c09, mask), C += ldc; - if (M > 0xA) AddProduct(C, _alpha, c0A, mask), C += ldc; - if (M > 0xB) AddProduct(C, _alpha, c0B, mask), C += ldc; - if (M > 0xC) AddProduct(C, _alpha, c0C, mask), C += ldc; - if (M > 0xD) AddProduct(C, _alpha, c0D, mask), C += ldc; - } - - SIMD_INLINE Simd::GemmNN::Tail GetGemmTail(size_t M, size_t N) - { - if (N <= 16) - { - switch (M) - { - case 0: return GemmKernelMx16nnT<0>; - case 1: return GemmKernelMx16nnT<1>; - case 2: return GemmKernelMx16nnT<2>; - case 3: return GemmKernelMx16nnT<3>; - case 4: return GemmKernelMx16nnT<4>; - case 5: return GemmKernelMx16nnT<5>; - case 6: return GemmKernelMx16nnT<6>; - case 7: return GemmKernelMx16nnT<7>; - case 8: return GemmKernelMx16nnT<8>; - case 9: return GemmKernelMx16nnT<9>; - case 10: return GemmKernelMx16nnT<10>; - case 11: return GemmKernelMx16nnT<11>; - case 12: return GemmKernelMx16nnT<12>; - case 13: return GemmKernelMx16nnT<13>; - case 14: return GemmKernelMx16nnT<14>; - } - } - else if (N <= 32) - { - switch (M) - { - case 0: return GemmKernelMx32nnT<0>; - case 1: return GemmKernelMx32nnT<1>; - case 2: return GemmKernelMx32nnT<2>; - case 3: return GemmKernelMx32nnT<3>; - case 4: return GemmKernelMx32nnT<4>; - case 5: return GemmKernelMx32nnT<5>; - case 6: return GemmKernelMx32nnT<6>; - case 7: return GemmKernelMx32nnT<7>; - case 8: return GemmKernelMx32nnT<8>; - case 9: return GemmKernelMx32nnT<9>; - case 10: return GemmKernelMx32nnT<10>; - case 11: return GemmKernelMx32nnT<11>; - case 12: return GemmKernelMx32nnT<12>; - case 13: return GemmKernelMx32nnT<13>; - case 14: return GemmKernelMx32nnT<14>; - } - } - else if (N <= 48) - { - switch (M) - { - case 
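/*
 * GetGemmTail maps the shape (M, N) of the bottom-right remainder of C to the
 * matching compile-time instantiation: N picks the tail width (16, 32, 48 or
 * 64 columns) and M picks the unrolled row count, with the per-width row
 * limits mirroring the register budget (up to 14 rows at 16 and 32 columns,
 * 8 at 48, 5 at 64). Expected use (illustrative):
 *
 *   Simd::GemmNN::Tail tail = GetGemmTail(M % microM, N % microN);
 *   // later: tail(m, K, alpha, A, lda, pB, F, sb, C, ldc, mask);
 */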
0: return GemmKernelMx48nnT<0>; - case 1: return GemmKernelMx48nnT<1>; - case 2: return GemmKernelMx48nnT<2>; - case 3: return GemmKernelMx48nnT<3>; - case 4: return GemmKernelMx48nnT<4>; - case 5: return GemmKernelMx48nnT<5>; - case 6: return GemmKernelMx48nnT<6>; - case 7: return GemmKernelMx48nnT<7>; - case 8: return GemmKernelMx48nnT<8>; - } - } - else if (N <= 64) - { - switch (M) - { - case 0: return GemmKernelMx64nnT<0>; - case 1: return GemmKernelMx64nnT<1>; - case 2: return GemmKernelMx64nnT<2>; - case 3: return GemmKernelMx64nnT<3>; - case 4: return GemmKernelMx64nnT<4>; - case 5: return GemmKernelMx64nnT<5>; - } - } - assert(0); - return NULL; - } - - SIMD_INLINE void GemmPackA_4x16(const float* src, size_t stride, float* dst) - { - __m512 s0 = _mm512_loadu_ps(src + 0 * stride); - __m512 s1 = _mm512_loadu_ps(src + 1 * stride); - __m512 s2 = _mm512_loadu_ps(src + 2 * stride); - __m512 s3 = _mm512_loadu_ps(src + 3 * stride); - __m512 s020 = Interleave<0>(s0, s2); - __m512 s021 = Interleave<1>(s0, s2); - __m512 s130 = Interleave<0>(s1, s3); - __m512 s131 = Interleave<1>(s1, s3); - _mm512_storeu_ps(dst + 0x00, Interleave<0>(s020, s130)); - _mm512_storeu_ps(dst + 0x10, Interleave<1>(s020, s130)); - _mm512_storeu_ps(dst + 0x20, Interleave<0>(s021, s131)); - _mm512_storeu_ps(dst + 0x30, Interleave<1>(s021, s131)); - } - - SIMD_INLINE void GemmPackA_4x8(const float* src, size_t stride, float* dst) - { - __m256 s0 = _mm256_loadu_ps(src + 0 * stride); - __m256 s1 = _mm256_loadu_ps(src + 1 * stride); - __m256 s2 = _mm256_loadu_ps(src + 2 * stride); - __m256 s3 = _mm256_loadu_ps(src + 3 * stride); - __m256 s00 = _mm256_unpacklo_ps(s0, s2); - __m256 s01 = _mm256_unpacklo_ps(s1, s3); - __m256 s10 = _mm256_unpackhi_ps(s0, s2); - __m256 s11 = _mm256_unpackhi_ps(s1, s3); - __m256 d0 = _mm256_unpacklo_ps(s00, s01); - __m256 d1 = _mm256_unpackhi_ps(s00, s01); - __m256 d2 = _mm256_unpacklo_ps(s10, s11); - __m256 d3 = _mm256_unpackhi_ps(s10, s11); - _mm256_storeu_ps(dst + 0x00, _mm256_permute2f128_ps(d0, d1, 0x20)); - _mm256_storeu_ps(dst + 0x08, _mm256_permute2f128_ps(d2, d3, 0x20)); - _mm256_storeu_ps(dst + 0x10, _mm256_permute2f128_ps(d0, d1, 0x31)); - _mm256_storeu_ps(dst + 0x18, _mm256_permute2f128_ps(d2, d3, 0x31)); - } - - SIMD_INLINE void GemmPackA_4x4(const float* src, size_t stride, float* dst) - { - __m128 s0 = _mm_loadu_ps(src + 0 * stride); - __m128 s1 = _mm_loadu_ps(src + 1 * stride); - __m128 s2 = _mm_loadu_ps(src + 2 * stride); - __m128 s3 = _mm_loadu_ps(src + 3 * stride); - __m128 s00 = _mm_unpacklo_ps(s0, s2); - __m128 s01 = _mm_unpacklo_ps(s1, s3); - __m128 s10 = _mm_unpackhi_ps(s0, s2); - __m128 s11 = _mm_unpackhi_ps(s1, s3); - _mm_storeu_ps(dst + 0, _mm_unpacklo_ps(s00, s01)); - _mm_storeu_ps(dst + 4, _mm_unpackhi_ps(s00, s01)); - _mm_storeu_ps(dst + 8, _mm_unpacklo_ps(s10, s11)); - _mm_storeu_ps(dst + 12, _mm_unpackhi_ps(s10, s11)); - } - - const __m512i K32_PACKA6_0 = SIMD_MM512_SETR_EPI32(0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x04, 0x05, 0x06, 0x07, 0x12, 0x13, 0x08, 0x09, 0x0A, 0x0B); - const __m512i K32_PACKA6_1 = SIMD_MM512_SETR_EPI32(0x06, 0x07, 0x12, 0x13, 0x08, 0x09, 0x0A, 0x0B, 0x14, 0x15, 0x0C, 0x0D, 0x0E, 0x0F, 0x16, 0x17); - const __m512i K32_PACKA6_2 = SIMD_MM512_SETR_EPI32(0x00, 0x01, 0x02, 0x03, 0x18, 0x19, 0x04, 0x05, 0x06, 0x07, 0x1A, 0x1B, 0x08, 0x09, 0x0A, 0x0B); - const __m512i K32_PACKA6_3 = SIMD_MM512_SETR_EPI32(0x06, 0x07, 0x1A, 0x1B, 0x08, 0x09, 0x0A, 0x0B, 0x1C, 0x1D, 0x0C, 0x0D, 0x0E, 0x0F, 0x1E, 0x1F); - - SIMD_INLINE void GemmPackA_6x16(const float* 
src, size_t stride, float* dst) - { - __m512 s0 = _mm512_loadu_ps(src + 0 * stride); - __m512 s1 = _mm512_loadu_ps(src + 1 * stride); - __m512 s2 = _mm512_loadu_ps(src + 2 * stride); - __m512 s3 = _mm512_loadu_ps(src + 3 * stride); - __m512 s4 = _mm512_loadu_ps(src + 4 * stride); - __m512 s5 = _mm512_loadu_ps(src + 5 * stride); - __m512 s02_0 = Interleave<0>(s0, s2); - __m512 s02_1 = Interleave<1>(s0, s2); - __m512 s13_0 = Interleave<0>(s1, s3); - __m512 s13_1 = Interleave<1>(s1, s3); - __m512 s45_0 = Interleave<0>(s4, s5); - __m512 s45_1 = Interleave<1>(s4, s5); - __m512 s0123_0 = Interleave<0>(s02_0, s13_0); - __m512 s0123_1 = Interleave<1>(s02_0, s13_0); - __m512 s0123_2 = Interleave<0>(s02_1, s13_1); - __m512 s0123_3 = Interleave<1>(s02_1, s13_1); - _mm512_mask_storeu_ps(dst + 0x00, 0x0FFF, _mm512_permutex2var_ps(s0123_0, K32_PACKA6_0, s45_0)); - _mm512_mask_storeu_ps(dst + 0x08, 0xFFF0, _mm512_permutex2var_ps(s0123_0, K32_PACKA6_1, s45_0)); - _mm512_mask_storeu_ps(dst + 0x18, 0x0FFF, _mm512_permutex2var_ps(s0123_1, K32_PACKA6_2, s45_0)); - _mm512_mask_storeu_ps(dst + 0x20, 0xFFF0, _mm512_permutex2var_ps(s0123_1, K32_PACKA6_3, s45_0)); - _mm512_mask_storeu_ps(dst + 0x30, 0x0FFF, _mm512_permutex2var_ps(s0123_2, K32_PACKA6_0, s45_1)); - _mm512_mask_storeu_ps(dst + 0x38, 0xFFF0, _mm512_permutex2var_ps(s0123_2, K32_PACKA6_1, s45_1)); - _mm512_mask_storeu_ps(dst + 0x48, 0x0FFF, _mm512_permutex2var_ps(s0123_3, K32_PACKA6_2, s45_1)); - _mm512_mask_storeu_ps(dst + 0x50, 0xFFF0, _mm512_permutex2var_ps(s0123_3, K32_PACKA6_3, s45_1)); - } - - SIMD_INLINE void GemmPackA_6x4(const float* src, size_t stride, float* dst) - { - __m128 s0 = _mm_loadu_ps(src + 0 * stride); - __m128 s1 = _mm_loadu_ps(src + 1 * stride); - __m128 s2 = _mm_loadu_ps(src + 2 * stride); - __m128 s3 = _mm_loadu_ps(src + 3 * stride); - __m128 s4 = _mm_loadu_ps(src + 4 * stride); - __m128 s5 = _mm_loadu_ps(src + 5 * stride); - __m128 s00 = _mm_unpacklo_ps(s0, s2); - __m128 s01 = _mm_unpacklo_ps(s1, s3); - __m128 s10 = _mm_unpackhi_ps(s0, s2); - __m128 s11 = _mm_unpackhi_ps(s1, s3); - __m128 s20 = _mm_unpacklo_ps(s4, s5); - __m128 s21 = _mm_unpackhi_ps(s4, s5); - _mm_storeu_ps(dst + 0, _mm_unpacklo_ps(s00, s01)); - _mm_storel_pi((__m64*)(dst + 4), s20); - _mm_storeu_ps(dst + 6, _mm_unpackhi_ps(s00, s01)); - _mm_storeh_pi((__m64*)(dst + 10), s20); - _mm_storeu_ps(dst + 12, _mm_unpacklo_ps(s10, s11)); - _mm_storel_pi((__m64*)(dst + 16), s21); - _mm_storeu_ps(dst + 18, _mm_unpackhi_ps(s10, s11)); - _mm_storeh_pi((__m64*)(dst + 22), s21); - } - - SIMD_INLINE void GemmPackA_8x16(const float* src, size_t stride, float* dst) - { - __m512 s0 = _mm512_loadu_ps(src + 0 * stride); - __m512 s1 = _mm512_loadu_ps(src + 1 * stride); - __m512 s2 = _mm512_loadu_ps(src + 2 * stride); - __m512 s3 = _mm512_loadu_ps(src + 3 * stride); - __m512 s4 = _mm512_loadu_ps(src + 4 * stride); - __m512 s5 = _mm512_loadu_ps(src + 5 * stride); - __m512 s6 = _mm512_loadu_ps(src + 6 * stride); - __m512 s7 = _mm512_loadu_ps(src + 7 * stride); - __m512 s04_0 = Interleave<0>(s0, s4); - __m512 s04_1 = Interleave<1>(s0, s4); - __m512 s15_0 = Interleave<0>(s1, s5); - __m512 s15_1 = Interleave<1>(s1, s5); - __m512 s26_0 = Interleave<0>(s2, s6); - __m512 s26_1 = Interleave<1>(s2, s6); - __m512 s37_0 = Interleave<0>(s3, s7); - __m512 s37_1 = Interleave<1>(s3, s7); - __m512 s0246_0 = Interleave<0>(s04_0, s26_0); - __m512 s0246_1 = Interleave<1>(s04_0, s26_0); - __m512 s0246_2 = Interleave<0>(s04_1, s26_1); - __m512 s0246_3 = Interleave<1>(s04_1, s26_1); - __m512 s1357_0 = 
Interleave<0>(s15_0, s37_0); - __m512 s1357_1 = Interleave<1>(s15_0, s37_0); - __m512 s1357_2 = Interleave<0>(s15_1, s37_1); - __m512 s1357_3 = Interleave<1>(s15_1, s37_1); - _mm512_storeu_ps(dst + 0x00, Interleave<0>(s0246_0, s1357_0)); - _mm512_storeu_ps(dst + 0x10, Interleave<1>(s0246_0, s1357_0)); - _mm512_storeu_ps(dst + 0x20, Interleave<0>(s0246_1, s1357_1)); - _mm512_storeu_ps(dst + 0x30, Interleave<1>(s0246_1, s1357_1)); - _mm512_storeu_ps(dst + 0x40, Interleave<0>(s0246_2, s1357_2)); - _mm512_storeu_ps(dst + 0x50, Interleave<1>(s0246_2, s1357_2)); - _mm512_storeu_ps(dst + 0x60, Interleave<0>(s0246_3, s1357_3)); - _mm512_storeu_ps(dst + 0x70, Interleave<1>(s0246_3, s1357_3)); - } - - SIMD_INLINE void GemmPackA_8x4(const float* src, size_t stride, float* dst) - { - __m128 s0 = _mm_loadu_ps(src + 0 * stride); - __m128 s1 = _mm_loadu_ps(src + 1 * stride); - __m128 s2 = _mm_loadu_ps(src + 2 * stride); - __m128 s3 = _mm_loadu_ps(src + 3 * stride); - __m128 s4 = _mm_loadu_ps(src + 4 * stride); - __m128 s5 = _mm_loadu_ps(src + 5 * stride); - __m128 s6 = _mm_loadu_ps(src + 6 * stride); - __m128 s7 = _mm_loadu_ps(src + 7 * stride); - __m128 s02_0 = _mm_unpacklo_ps(s0, s2); - __m128 s02_1 = _mm_unpackhi_ps(s0, s2); - __m128 s13_0 = _mm_unpacklo_ps(s1, s3); - __m128 s13_1 = _mm_unpackhi_ps(s1, s3); - __m128 s46_0 = _mm_unpacklo_ps(s4, s6); - __m128 s46_1 = _mm_unpackhi_ps(s4, s6); - __m128 s57_0 = _mm_unpacklo_ps(s5, s7); - __m128 s57_1 = _mm_unpackhi_ps(s5, s7); - _mm_storeu_ps(dst + 0x00, _mm_unpacklo_ps(s02_0, s13_0)); - _mm_storeu_ps(dst + 0x04, _mm_unpacklo_ps(s46_0, s57_0)); - _mm_storeu_ps(dst + 0x08, _mm_unpackhi_ps(s02_0, s13_0)); - _mm_storeu_ps(dst + 0x0C, _mm_unpackhi_ps(s46_0, s57_0)); - _mm_storeu_ps(dst + 0x10, _mm_unpacklo_ps(s02_1, s13_1)); - _mm_storeu_ps(dst + 0x14, _mm_unpacklo_ps(s46_1, s57_1)); - _mm_storeu_ps(dst + 0x18, _mm_unpackhi_ps(s02_1, s13_1)); - _mm_storeu_ps(dst + 0x1C, _mm_unpackhi_ps(s46_1, s57_1)); - } - - const __m512i K32_PACKA9_0 = SIMD_MM512_SETR_EPI32(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x10, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E); - const __m512i K32_PACKA9_1 = SIMD_MM512_SETR_EPI32(0x00, 0x11, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x12, 0x09, 0x0A, 0x0B, 0x0C, 0x0D); - const __m512i K32_PACKA9_2 = SIMD_MM512_SETR_EPI32(0x00, 0x01, 0x02, 0x13, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x14, 0x0B, 0x0C, 0x0D); - const __m512i K32_PACKA9_3 = SIMD_MM512_SETR_EPI32(0x00, 0x01, 0x02, 0x03, 0x04, 0x15, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x16, 0x0D); - const __m512i K32_PACKA9_4 = SIMD_MM512_SETR_EPI32(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x17, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E); - const __m512i K32_PACKA9_5 = SIMD_MM512_SETR_EPI32(0x18, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x19, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D); - const __m512i K32_PACKA9_6 = SIMD_MM512_SETR_EPI32(0x00, 0x01, 0x1A, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x1B, 0x0A, 0x0B, 0x0C, 0x0D); - const __m512i K32_PACKA9_7 = SIMD_MM512_SETR_EPI32(0x00, 0x01, 0x02, 0x03, 0x1C, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x1D, 0x0C, 0x0D); - const __m512i K32_PACKA9_8 = SIMD_MM512_SETR_EPI32(0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x1E, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x1F); - - SIMD_INLINE void GemmPackA_9x16(const float* src, size_t stride, float* dst) - { - __m512 a[9], b[8]; - a[0] = _mm512_loadu_ps(src + 0 * stride); - a[1] = _mm512_loadu_ps(src + 1 * stride); - a[2] = _mm512_loadu_ps(src + 2 * 
stride); - a[3] = _mm512_loadu_ps(src + 3 * stride); - a[4] = _mm512_loadu_ps(src + 4 * stride); - a[5] = _mm512_loadu_ps(src + 5 * stride); - a[6] = _mm512_loadu_ps(src + 6 * stride); - a[7] = _mm512_loadu_ps(src + 7 * stride); - a[8] = _mm512_loadu_ps(src + 8 * stride); - b[0] = Interleave<0>(a[0], a[4]); - b[1] = Interleave<1>(a[0], a[4]); - b[2] = Interleave<0>(a[1], a[5]); - b[3] = Interleave<1>(a[1], a[5]); - b[4] = Interleave<0>(a[2], a[6]); - b[5] = Interleave<1>(a[2], a[6]); - b[6] = Interleave<0>(a[3], a[7]); - b[7] = Interleave<1>(a[3], a[7]); - a[0] = Interleave<0>(b[0], b[4]); - a[1] = Interleave<1>(b[0], b[4]); - a[2] = Interleave<0>(b[1], b[5]); - a[3] = Interleave<1>(b[1], b[5]); - a[4] = Interleave<0>(b[2], b[6]); - a[5] = Interleave<1>(b[2], b[6]); - a[6] = Interleave<0>(b[3], b[7]); - a[7] = Interleave<1>(b[3], b[7]); - b[0] = Interleave<0>(a[0], a[4]); - b[1] = Interleave<1>(a[0], a[4]); - b[2] = Interleave<0>(a[1], a[5]); - b[3] = Interleave<1>(a[1], a[5]); - b[4] = Interleave<0>(a[2], a[6]); - b[5] = Interleave<1>(a[2], a[6]); - b[6] = Interleave<0>(a[3], a[7]); - b[7] = Interleave<1>(a[3], a[7]); - _mm512_storeu_ps(dst + 0x00, _mm512_permutex2var_ps(Alignr<0x0>(b[0], b[1]), K32_PACKA9_0, a[8])); - _mm512_storeu_ps(dst + 0x10, _mm512_permutex2var_ps(Alignr<0xF>(b[0], b[1]), K32_PACKA9_1, a[8])); - _mm512_storeu_ps(dst + 0x20, _mm512_permutex2var_ps(Alignr<0xD>(b[1], b[2]), K32_PACKA9_2, a[8])); - _mm512_storeu_ps(dst + 0x30, _mm512_permutex2var_ps(Alignr<0xB>(b[2], b[3]), K32_PACKA9_3, a[8])); - _mm512_storeu_ps(dst + 0x40, _mm512_permutex2var_ps(Alignr<0x9>(b[3], b[4]), K32_PACKA9_4, a[8])); - _mm512_storeu_ps(dst + 0x50, _mm512_permutex2var_ps(Alignr<0x8>(b[4], b[5]), K32_PACKA9_5, a[8])); - _mm512_storeu_ps(dst + 0x60, _mm512_permutex2var_ps(Alignr<0x6>(b[5], b[6]), K32_PACKA9_6, a[8])); - _mm512_storeu_ps(dst + 0x70, _mm512_permutex2var_ps(Alignr<0x4>(b[6], b[7]), K32_PACKA9_7, a[8])); - _mm512_storeu_ps(dst + 0x80, _mm512_permutex2var_ps(Alignr<0x0>(b[7], b[7]), K32_PACKA9_8, a[8])); - } - - SIMD_INLINE void GemmPackA_9x4(const float* src, size_t stride, float* dst) - { - __m128 s0 = _mm_loadu_ps(src + 0 * stride); - __m128 s1 = _mm_loadu_ps(src + 1 * stride); - __m128 s2 = _mm_loadu_ps(src + 2 * stride); - __m128 s3 = _mm_loadu_ps(src + 3 * stride); - __m128 s4 = _mm_loadu_ps(src + 4 * stride); - __m128 s5 = _mm_loadu_ps(src + 5 * stride); - __m128 s6 = _mm_loadu_ps(src + 6 * stride); - __m128 s7 = _mm_loadu_ps(src + 7 * stride); - __m128 s02_0 = _mm_unpacklo_ps(s0, s2); - __m128 s02_1 = _mm_unpackhi_ps(s0, s2); - __m128 s13_0 = _mm_unpacklo_ps(s1, s3); - __m128 s13_1 = _mm_unpackhi_ps(s1, s3); - __m128 s46_0 = _mm_unpacklo_ps(s4, s6); - __m128 s46_1 = _mm_unpackhi_ps(s4, s6); - __m128 s57_0 = _mm_unpacklo_ps(s5, s7); - __m128 s57_1 = _mm_unpackhi_ps(s5, s7); - src += 8 * stride; - _mm_storeu_ps(dst + 0x00, _mm_unpacklo_ps(s02_0, s13_0)); - _mm_storeu_ps(dst + 0x04, _mm_unpacklo_ps(s46_0, s57_0)); - dst[0x08] = src[0]; - _mm_storeu_ps(dst + 0x09, _mm_unpackhi_ps(s02_0, s13_0)); - _mm_storeu_ps(dst + 0x0D, _mm_unpackhi_ps(s46_0, s57_0)); - dst[0x11] = src[1]; - _mm_storeu_ps(dst + 0x12, _mm_unpacklo_ps(s02_1, s13_1)); - _mm_storeu_ps(dst + 0x16, _mm_unpacklo_ps(s46_1, s57_1)); - dst[0x1A] = src[2]; - _mm_storeu_ps(dst + 0x1B, _mm_unpackhi_ps(s02_1, s13_1)); - _mm_storeu_ps(dst + 0x1F, _mm_unpackhi_ps(s46_1, s57_1)); - dst[0x23] = src[3]; - } - - const __m512i K32_PACKA12_0 = SIMD_MM512_SETR_EPI32(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x10, 0x11, 0x12, 
0x13, 0x08, 0x09, 0x0A, 0x0B); - const __m512i K32_PACKA12_1 = SIMD_MM512_SETR_EPI32(0x10, 0x11, 0x12, 0x13, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x14, 0x15, 0x16, 0x17); - const __m512i K32_PACKA12_2 = SIMD_MM512_SETR_EPI32(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x18, 0x19, 0x1A, 0x1B, 0x08, 0x09, 0x0A, 0x0B); - const __m512i K32_PACKA12_3 = SIMD_MM512_SETR_EPI32(0x18, 0x19, 0x1A, 0x1B, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F); - - SIMD_INLINE void GemmPackA_12x16(const float* src, size_t stride, float* dst) - { - __m512 a[12], b[12]; - a[0] = _mm512_loadu_ps(src + 0 * stride); - a[1] = _mm512_loadu_ps(src + 1 * stride); - a[2] = _mm512_loadu_ps(src + 2 * stride); - a[3] = _mm512_loadu_ps(src + 3 * stride); - a[4] = _mm512_loadu_ps(src + 4 * stride); - a[5] = _mm512_loadu_ps(src + 5 * stride); - a[6] = _mm512_loadu_ps(src + 6 * stride); - a[7] = _mm512_loadu_ps(src + 7 * stride); - a[8] = _mm512_loadu_ps(src + 8 * stride); - a[9] = _mm512_loadu_ps(src + 9 * stride); - a[10] = _mm512_loadu_ps(src + 10 * stride); - a[11] = _mm512_loadu_ps(src + 11 * stride); - b[0] = Interleave<0>(a[0], a[4]); - b[1] = Interleave<1>(a[0], a[4]); - b[2] = Interleave<0>(a[1], a[5]); - b[3] = Interleave<1>(a[1], a[5]); - b[4] = Interleave<0>(a[2], a[6]); - b[5] = Interleave<1>(a[2], a[6]); - b[6] = Interleave<0>(a[3], a[7]); - b[7] = Interleave<1>(a[3], a[7]); - b[8] = Interleave<0>(a[8], a[10]); - b[9] = Interleave<1>(a[8], a[10]); - b[10] = Interleave<0>(a[9], a[11]); - b[11] = Interleave<1>(a[9], a[11]); - a[0] = Interleave<0>(b[0], b[4]); - a[1] = Interleave<1>(b[0], b[4]); - a[2] = Interleave<0>(b[1], b[5]); - a[3] = Interleave<1>(b[1], b[5]); - a[4] = Interleave<0>(b[2], b[6]); - a[5] = Interleave<1>(b[2], b[6]); - a[6] = Interleave<0>(b[3], b[7]); - a[7] = Interleave<1>(b[3], b[7]); - a[8] = Interleave<0>(b[8], b[10]); - a[9] = Interleave<1>(b[8], b[10]); - a[10] = Interleave<0>(b[9], b[11]); - a[11] = Interleave<1>(b[9], b[11]); - b[0] = Interleave<0>(a[0], a[4]); - b[1] = Interleave<1>(a[0], a[4]); - b[2] = Interleave<0>(a[1], a[5]); - b[3] = Interleave<1>(a[1], a[5]); - b[4] = Interleave<0>(a[2], a[6]); - b[5] = Interleave<1>(a[2], a[6]); - b[6] = Interleave<0>(a[3], a[7]); - b[7] = Interleave<1>(a[3], a[7]); - _mm512_mask_storeu_ps(dst + 0x00, 0x0FFF, _mm512_permutex2var_ps(b[0], K32_PACKA12_0, a[8])); - _mm512_mask_storeu_ps(dst + 0x08, 0xFFF0, _mm512_permutex2var_ps(b[0], K32_PACKA12_1, a[8])); - _mm512_mask_storeu_ps(dst + 0x18, 0x0FFF, _mm512_permutex2var_ps(b[1], K32_PACKA12_2, a[8])); - _mm512_mask_storeu_ps(dst + 0x20, 0xFFF0, _mm512_permutex2var_ps(b[1], K32_PACKA12_3, a[8])); - _mm512_mask_storeu_ps(dst + 0x30, 0x0FFF, _mm512_permutex2var_ps(b[2], K32_PACKA12_0, a[9])); - _mm512_mask_storeu_ps(dst + 0x38, 0xFFF0, _mm512_permutex2var_ps(b[2], K32_PACKA12_1, a[9])); - _mm512_mask_storeu_ps(dst + 0x48, 0x0FFF, _mm512_permutex2var_ps(b[3], K32_PACKA12_2, a[9])); - _mm512_mask_storeu_ps(dst + 0x50, 0xFFF0, _mm512_permutex2var_ps(b[3], K32_PACKA12_3, a[9])); - _mm512_mask_storeu_ps(dst + 0x60, 0x0FFF, _mm512_permutex2var_ps(b[4], K32_PACKA12_0, a[10])); - _mm512_mask_storeu_ps(dst + 0x68, 0xFFF0, _mm512_permutex2var_ps(b[4], K32_PACKA12_1, a[10])); - _mm512_mask_storeu_ps(dst + 0x78, 0x0FFF, _mm512_permutex2var_ps(b[5], K32_PACKA12_2, a[10])); - _mm512_mask_storeu_ps(dst + 0x80, 0xFFF0, _mm512_permutex2var_ps(b[5], K32_PACKA12_3, a[10])); - _mm512_mask_storeu_ps(dst + 0x90, 0x0FFF, _mm512_permutex2var_ps(b[6], K32_PACKA12_0, a[11])); - 
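/*
 * The store pattern above writes twelve-float groups out of sixteen-float
 * registers: each permutex2var merges eight transposed floats of the
 * eight-row block with four floats of the four extra rows, and the paired
 * masks 0x0FFF and 0xFFF0, with float-offset destinations staggered by eight
 * (0x00/0x08, 0x18/0x20, ...), lay the groups down back to back without
 * overwriting the neighbouring group's lanes. In effect dst receives, for
 * each k, the twelve rows of the micro-panel contiguously, which is exactly
 * the layout the lda == 1 / sa == 12 kernels above consume.
 */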
_mm512_mask_storeu_ps(dst + 0x98, 0xFFF0, _mm512_permutex2var_ps(b[6], K32_PACKA12_1, a[11])); - _mm512_mask_storeu_ps(dst + 0xA8, 0x0FFF, _mm512_permutex2var_ps(b[7], K32_PACKA12_2, a[11])); - _mm512_mask_storeu_ps(dst + 0xB0, 0xFFF0, _mm512_permutex2var_ps(b[7], K32_PACKA12_3, a[11])); - } - - SIMD_INLINE void GemmPackA_12x4(const float * src, size_t stride, float * dst) - { - __m128 a[4], b[4]; - for (size_t j = 0; j < 3; ++j) - { - a[0] = _mm_loadu_ps(src + 0 * stride); - a[1] = _mm_loadu_ps(src + 1 * stride); - a[2] = _mm_loadu_ps(src + 2 * stride); - a[3] = _mm_loadu_ps(src + 3 * stride); - b[0] = _mm_unpacklo_ps(a[0], a[2]); - b[1] = _mm_unpackhi_ps(a[0], a[2]); - b[2] = _mm_unpacklo_ps(a[1], a[3]); - b[3] = _mm_unpackhi_ps(a[1], a[3]); - _mm_storeu_ps(dst + 0x00, _mm_unpacklo_ps(b[0], b[2])); - _mm_storeu_ps(dst + 0x0C, _mm_unpackhi_ps(b[0], b[2])); - _mm_storeu_ps(dst + 0x18, _mm_unpacklo_ps(b[1], b[3])); - _mm_storeu_ps(dst + 0x24, _mm_unpackhi_ps(b[1], b[3])); - src += 4 * stride; - dst += 4; - } - } - - SIMD_INLINE void GemmPackA_14x16(const float* src, size_t stride, float* dst) - { - __m512 a[16], b[4]; - a[0] = _mm512_loadu_ps(src + 0 * stride); - a[1] = _mm512_loadu_ps(src + 1 * stride); - a[2] = _mm512_loadu_ps(src + 2 * stride); - a[3] = _mm512_loadu_ps(src + 3 * stride); - a[4] = _mm512_loadu_ps(src + 4 * stride); - a[5] = _mm512_loadu_ps(src + 5 * stride); - a[6] = _mm512_loadu_ps(src + 6 * stride); - a[7] = _mm512_loadu_ps(src + 7 * stride); - a[8] = _mm512_loadu_ps(src + 8 * stride); - a[9] = _mm512_loadu_ps(src + 9 * stride); - a[10] = _mm512_loadu_ps(src + 10 * stride); - a[11] = _mm512_loadu_ps(src + 11 * stride); - a[12] = _mm512_loadu_ps(src + 12 * stride); - a[13] = _mm512_loadu_ps(src + 13 * stride); - a[14] = _mm512_setzero_ps(); - a[15] = _mm512_setzero_ps(); - for (size_t i = 0; i < 4; ++i) - { - __m512* c = a + i; - b[0] = Interleave<0>(c[0], c[8]); - b[1] = Interleave<1>(c[0], c[8]); - b[2] = Interleave<0>(c[4], c[12]); - b[3] = Interleave<1>(c[4], c[12]); - c[0] = Interleave<0>(b[0], b[2]); - c[4] = Interleave<1>(b[0], b[2]); - c[8] = Interleave<0>(b[1], b[3]); - c[12] = Interleave<1>(b[1], b[3]); - } - for (size_t i = 0; i < 4; ++i) - { - const __m512 * c = a + i * 4; - b[0] = Interleave<0>(c[0], c[2]); - b[1] = Interleave<1>(c[0], c[2]); - b[2] = Interleave<0>(c[1], c[3]); - b[3] = Interleave<1>(c[1], c[3]); - _mm512_mask_storeu_ps(dst + 00, 0x3FFF, Interleave<0>(b[0], b[2])); - _mm512_mask_storeu_ps(dst + 14, 0x3FFF, Interleave<1>(b[0], b[2])); - _mm512_mask_storeu_ps(dst + 28, 0x3FFF, Interleave<0>(b[1], b[3])); - _mm512_mask_storeu_ps(dst + 42, 0x3FFF, Interleave<1>(b[1], b[3])); - dst += 56; - } - } - - SIMD_INLINE void GemmPackA_14x4(const float* src, size_t stride, float* dst) - { - __m128 a[4], b[4]; - for (size_t j = 0; j < 3; ++j) - { - a[0] = _mm_loadu_ps(src + 0 * stride); - a[1] = _mm_loadu_ps(src + 1 * stride); - a[2] = _mm_loadu_ps(src + 2 * stride); - a[3] = _mm_loadu_ps(src + 3 * stride); - b[0] = _mm_unpacklo_ps(a[0], a[2]); - b[1] = _mm_unpackhi_ps(a[0], a[2]); - b[2] = _mm_unpacklo_ps(a[1], a[3]); - b[3] = _mm_unpackhi_ps(a[1], a[3]); - _mm_storeu_ps(dst + 0x00, _mm_unpacklo_ps(b[0], b[2])); - _mm_storeu_ps(dst + 0x0E, _mm_unpackhi_ps(b[0], b[2])); - _mm_storeu_ps(dst + 0x1C, _mm_unpacklo_ps(b[1], b[3])); - _mm_storeu_ps(dst + 0x2A, _mm_unpackhi_ps(b[1], b[3])); - src += 4 * stride; - dst += 4; - } - a[0] = _mm_loadu_ps(src + 0 * stride); - a[1] = _mm_loadu_ps(src + 1 * stride); - b[0] = _mm_unpacklo_ps(a[0], a[1]); - b[1] = 
_mm_unpackhi_ps(a[0], a[1]); - _mm_storel_pi((__m64*)(dst + 0x00), b[0]); - _mm_storeh_pi((__m64*)(dst + 0x0E), b[0]); - _mm_storel_pi((__m64*)(dst + 0x1C), b[1]); - _mm_storeh_pi((__m64*)(dst + 0x2A), b[1]); - } - - void GemmPackA(const float * src, size_t stride, size_t M, size_t K, size_t cell, float* dst) - { - size_t K4 = AlignLo(K, 4), K8 = AlignLo(K, 8), K16 = AlignLo(K, 16); - //for (size_t i = 0; i < 16; i++) - // for (size_t j = 0; j < 14; j++) - // ((float*)src)[j * stride + i] = i * 0.010001f + j; - for (size_t i = 0; i < M; i += cell) - { - size_t m = Simd::Min(cell, M - i), k = 0; - if (cell == 4 && m == 4) - { - for (; k < K16; k += 16, dst += 64) - GemmPackA_4x16(src + k, stride, dst); - for (; k < K8; k += 8, dst += 32) - GemmPackA_4x8(src + k, stride, dst); - for (; k < K4; k += 4, dst += 16) - GemmPackA_4x4(src + k, stride, dst); - } - else if (cell == 6 && m == 6) - { - for (; k < K16; k += 16, dst += 96) - GemmPackA_6x16(src + k, stride, dst); - for (; k < K4; k += 4, dst += 24) - GemmPackA_6x4(src + k, stride, dst); - } - else if (cell == 8 && m == 8) - { - for (; k < K16; k += 16, dst += 128) - GemmPackA_8x16(src + k, stride, dst); - for (; k < K4; k += 4, dst += 32) - GemmPackA_8x4(src + k, stride, dst); - } - else if (cell == 9 && m == 9) - { - for (; k < K16; k += 16, dst += 144) - GemmPackA_9x16(src + k, stride, dst); - for (; k < K4; k += 4, dst += 36) - GemmPackA_9x4(src + k, stride, dst); - } - else if (cell == 12 && m == 12) - { - for (; k < K16; k += 16, dst += 192) - GemmPackA_12x16(src + k, stride, dst); - for (; k < K4; k += 4, dst += 48) - GemmPackA_12x4(src + k, stride, dst); - } - else if (cell == 14 && m == 14) - { - for (; k < K16; k += 16, dst += 224) - GemmPackA_14x16(src + k, stride, dst); - for (; k < K4; k += 4, dst += 56) - GemmPackA_14x4(src + k, stride, dst); - } - for (; k < K; ++k) - { - for (size_t c = 0; c < m; ++c) - *(dst++) = src[c*stride + k]; - } - src += cell * stride; - } - } - - void GemmPackB(const float * B, size_t ldb, size_t K, size_t N, size_t microN, float * pB) - { - for (size_t j = 0; j < N; j += microN) - { - size_t n = Simd::Min(microN, N - j); - if (microN == 1 * F) - { - __mmask16 mask0 = TailMask16(n - 0 * F); - for (size_t k = 0; k < K; ++k) - { - const float * b = B + k * ldb; - _mm512_storeu_ps(pB + 0 * F, _mm512_maskz_loadu_ps(mask0, b + 0 * F)); - pB += microN; - } - } - else if (microN == 2 * F) - { - __mmask16 mask0 = TailMask16(n - 0 * F); - __mmask16 mask1 = TailMask16(n - 1 * F); - for (size_t k = 0; k < K; ++k) - { - const float * b = B + k * ldb; - _mm512_storeu_ps(pB + 0 * F, _mm512_maskz_loadu_ps(mask0, b + 0 * F)); - _mm512_storeu_ps(pB + 1 * F, _mm512_maskz_loadu_ps(mask1, b + 1 * F)); - pB += microN; - } - } - else if (microN == 3 * F) - { - __mmask16 mask0 = TailMask16(n - 0 * F); - __mmask16 mask1 = TailMask16(n - 1 * F); - __mmask16 mask2 = TailMask16(n - 2 * F); - for (size_t k = 0; k < K; ++k) - { - const float * b = B + k * ldb; - _mm512_storeu_ps(pB + 0 * F, _mm512_maskz_loadu_ps(mask0, b + 0 * F)); - _mm512_storeu_ps(pB + 1 * F, _mm512_maskz_loadu_ps(mask1, b + 1 * F)); - _mm512_storeu_ps(pB + 2 * F, _mm512_maskz_loadu_ps(mask2, b + 2 * F)); - pB += microN; - } - } - else - { - for (size_t k = 0; k < K; ++k) - { - const float * b = B + k * ldb; - size_t c = 0; - for (; c < n; ++c) - *(pB++) = *(b++); - for (; c < microN; ++c) - *(pB++) = 0; - } - } - B += microN; - } - } - - SIMD_INLINE void ScaleC(float * ptr, __m512 beta, __mmask16 mask = -1) - { - _mm512_mask_storeu_ps(ptr, mask, 
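GemmPackB above repacks B into microN-wide column panels, zero-padded on the right edge so the kernels never need a column tail check. A scalar restatement of the generic fallback path (helper name is illustrative):

#include <cstddef>
#include <algorithm>

// Scalar reference for GemmPackB: B (K x N, leading dimension ldb) is split
// into panels of microN columns; inside a panel, each row of K is contiguous
// and the last panel is padded with zeros to the full microN width.
static void PackB_Reference(const float* B, size_t ldb, size_t K, size_t N,
                            size_t microN, float* pB)
{
    for (size_t j = 0; j < N; j += microN)
    {
        size_t n = std::min(microN, N - j);
        for (size_t k = 0; k < K; ++k)
        {
            const float* b = B + k * ldb + j;
            size_t c = 0;
            for (; c < n; ++c)
                *pB++ = b[c];
            for (; c < microN; ++c)   // right-edge zero padding
                *pB++ = 0.0f;
        }
    }
}

The vectorized branches get the same padding for free: _mm512_maskz_loadu_ps zeroes every lane outside the tail mask before the full-width store.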
_mm512_mul_ps(_mm512_maskz_loadu_ps(mask, ptr), beta)); - } - - void GemmScaleC(size_t M, size_t N, float beta, float * C, size_t ldc) - { - if (beta == 1.0f) - return; - else if (beta == 0.0f) - { - for (size_t i = 0; i < M; ++i) - memset(C + i * ldc, 0, N * sizeof(float)); - } - else - { - size_t NQF = AlignLo(N, QF); - size_t NF = AlignLo(N, F); - __m512 _beta = _mm512_set1_ps(beta); - __mmask16 tail = TailMask16(N - NF); - for (size_t i = 0; i < M; ++i) - { - size_t j = 0; - for (; j < NQF; j += QF) - { - ScaleC(C + j + F * 0, _beta); - ScaleC(C + j + F * 1, _beta); - ScaleC(C + j + F * 2, _beta); - ScaleC(C + j + F * 3, _beta); - } - for (; j < NF; j += F) - ScaleC(C + j, _beta); - if (j < N) - ScaleC(C + j, _beta, tail); - C += ldc; - } - } - } - - void Gemm32fNN(size_t M, size_t N, size_t K, const float * alpha, const float * A, size_t lda, const float * B, size_t ldb, const float * beta, float * C, size_t ldc) - { - SIMD_PERF_BEGF(Simd::ToStr(M) + "-" + Simd::ToStr(N) + "-" + Simd::ToStr(K), M*N*K * 2); - - typedef Simd::GemmNN GemmNN; - GemmNN::Main kernelMM, kernelMT; - GemmNN::Tail kernelTM, kernelTT; - size_t microM, microN; - if (N <= 8) - { - Avx2::Gemm32fNN(M, N, K, alpha, A, lda, B, ldb, beta, C, ldc); - return; - } -#if SIMD_ZMM_COUNT == 32 - if (N < K || M * 8 < N) - { - microM = 14; - microN = 32; - size_t tail = N - AlignLoAny(N, microN); - kernelMM = GemmKernel14x32nn; - kernelMT = tail > F ? GemmKernel14x32nn : GemmKernel14x16nn; - kernelTM = GemmKernelMx32nn; - kernelTT = tail > F ? GemmKernelMx32nn : GemmKernelMx16nn; - } - else - { - microM = 9; - microN = 48; - size_t tail = N - AlignLoAny(N, microN); - kernelMM = GemmKernel9x48nn; - kernelMT = tail > DF ? GemmKernel9x48nn : (tail > F ? GemmKernel9x32nn : GemmKernel9x16nn); - kernelTM = GemmKernelMx48nn; - kernelTT = tail > DF ? GemmKernelMx48nn : (tail > F ? GemmKernelMx32nn : GemmKernelMx16nn); - } - if (M == 16) - { - microM = 8; - microN = 48; - size_t tail = N - AlignLoAny(N, microN); - kernelMM = GemmKernel8x48nn; - kernelMT = tail > DF ? GemmKernel8x48nn : (tail > F ? GemmKernel8x32nn : GemmKernel8x16nn); - kernelTM = GemmKernelMx48nn; - kernelTT = tail > DF ? GemmKernelMx48nn : (tail > F ? GemmKernelMx32nn : GemmKernelMx16nn); - } -#elif SIMD_ZMM_COUNT == 16 - if (N < K || M * 8 < N) - { - microM = 6; - microN = 32; - size_t tail = N - AlignLoAny(N, microN); - kernelMM = GemmKernel6x32nn; - kernelMT = tail > F ? GemmKernel6x32nn : GemmKernel6x16nn; - kernelTM = GemmKernelMx32nn; - kernelTT = tail > F ? GemmKernelMx32nn : GemmKernelMx16nn; - } - else - { - microM = 4; - microN = 48; - size_t tail = N - AlignLoAny(N, microN); - kernelMM = GemmKernel4x48nn; - kernelMT = tail > DF ? GemmKernel4x48nn : (tail > F ? GemmKernel4x32nn : GemmKernel4x16nn); - kernelTM = GemmKernelMx48nn; - kernelTT = tail > DF ? GemmKernelMx48nn : (tail > F ? GemmKernelMx32nn : GemmKernelMx16nn); - } -#else - microM = 4; - microN = 16; - kernelMM = GemmKernel4x16nn; - kernelMT = GemmKernel4x16nn; - kernelTM = GemmKernelMx16nn; - kernelTT = GemmKernelMx16nn; -#endif -#if SIMD_ZMM_COUNT >= 16 - if (M == 4) - { - microM = 4; - microN = 48; - size_t tail = N - AlignLoAny(N, microN); - kernelMM = GemmKernel4x48nn; - kernelMT = tail > DF ? GemmKernel4x48nn : (tail > F ? GemmKernel4x32nn : GemmKernel4x16nn); - kernelTM = GemmKernelMx48nn; - kernelTT = tail > DF ? GemmKernelMx48nn : (tail > F ? GemmKernelMx32nn : GemmKernelMx16nn); - } -#endif - GemmNN::PackA packA = (microM > 6 && M*N*K > 700*700*700) ? 
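GemmScaleC implements the beta pass of C = alpha*A*B + beta*C, with the two cheap cases handled up front (beta == 1 is a no-op, beta == 0 is a memset) and everything else done as a masked multiply. The tail mask used throughout this file selects the first n of 16 lanes; a sketch of the contract the code relies on (the body below is an assumption consistent with how TailMask16 is used here, not the library's verbatim definition):

#include <immintrin.h>
#include <cstddef>

// Assumed behavior of TailMask16: a mask with the low min(tail, 16) bits
// set, so masked loads/stores touch only the valid trailing elements.
inline __mmask16 TailMask16_Sketch(ptrdiff_t tail)
{
    if (tail <= 0)  return __mmask16(0);
    if (tail >= 16) return __mmask16(0xFFFF);
    return __mmask16((1u << tail) - 1);
}

This is the same mask the kernels build inline as __mmask16(-1) >> (16 + k - K): both forms leave exactly the remainder lanes enabled.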
Avx::GemmPackA : NULL; - GemmNN gemmNN(M, N, K, microM, microN, Base::AlgCacheL1(), Base::AlgCacheL2(), Base::AlgCacheL3(), F, - kernelMM, kernelMT, kernelTM, kernelTT, packA, Avx512f::GemmPackB, Avx512f::GemmScaleC, TailMask16); - gemmNN.Run(alpha, A, lda, B, ldb, beta, C, ldc); - } - - //--------------------------------------------------------------------- - - typedef Simd::GemmNNcb Gemm32fNNcb; - - SIMD_INLINE Gemm32fNNcb CreateGemm32fNNcb(size_t M, size_t N, size_t K, GemmKernelType type, bool compatibility) - { - Gemm32fNNcb::Main kernelMM, kernelMT; - Gemm32fNNcb::Tail kernelTM, kernelTT; - size_t microM, microN; -#if SIMD_ZMM_COUNT == 32 - if (type == GemmKernelF4 || (type == GemmKernelAny && (M == 6 || N == 64))) - { - microN = 64; - size_t tail = N - AlignLoAny(N, microN); - { - microM = 6; - kernelMM = Avx512f::GemmKernel6x64nn; - kernelMT = tail > 3*F ? Avx512f::GemmKernel6x64nn : (tail > DF ? Avx512f::GemmKernel6x48nn : (tail > F ? Avx512f::GemmKernel6x32nn : Avx512f::GemmKernel6x16nn)); - kernelTM = Avx512f::GetGemmTail(M % microM, microN); - kernelTT = Avx512f::GetGemmTail(M % microM, tail); - } - type = GemmKernelF4; - } - if (type == GemmKernelF3 || (type == GemmKernelAny && (M == 4 || M == 8 || M == 9 || M == 16 || M == 18 || M == 32 || N == 48 || N == 96 || (M < 14 && M != 6 && M != 12)) && N > 32)) - { - microN = 48; - size_t tail = N - AlignLoAny(N, microN); - if (M == 4) - { - microM = 4; - kernelMM = Avx512f::GemmKernel4x48nn; - kernelMT = tail > DF ? Avx512f::GemmKernel4x48nn : (tail > F ? Avx512f::GemmKernel4x32nn : Avx512f::GemmKernel4x16nn); - kernelTM = Avx512f::GetGemmTail(M%microM, microN); - kernelTT = Avx512f::GetGemmTail(M%microM, tail); - } - else if (M == 8 || M == 16 || M == 32) - { - microM = 8; - kernelMM = Avx512f::GemmKernel8x48nn; - kernelMT = tail > DF ? Avx512f::GemmKernel8x48nn : (tail > F ? Avx512f::GemmKernel8x32nn : Avx512f::GemmKernel8x16nn); - kernelTM = Avx512f::GetGemmTail(M%microM, microN); - kernelTT = Avx512f::GetGemmTail(M%microM, tail); - } - else - { - microM = 9; - kernelMM = Avx512f::GemmKernel9x48nn; - kernelMT = tail > DF ? Avx512f::GemmKernel9x48nn : (tail > F ? Avx512f::GemmKernel9x32nn : Avx512f::GemmKernel9x16nn); - kernelTM = Avx512f::GetGemmTail(M%microM, microN); - kernelTT = Avx512f::GetGemmTail(M%microM, tail); - } - type = GemmKernelF3; - } - if (type == GemmKernelF2 || (type == GemmKernelF3 && N <= 32) || (type == GemmKernelAny && N > 16)) - { - microN = 32; - size_t tail = N - AlignLoAny(N, microN); - if (M <= 6) - { - microM = 6; - kernelMM = Avx512f::GemmKernel6x32nn; - kernelMT = tail > F ? Avx512f::GemmKernel6x32nn : Avx512f::GemmKernel6x16nn; - kernelTM = Avx512f::GetGemmTail(M%microM, microN); - kernelTT = Avx512f::GetGemmTail(M%microM, tail); - } - else if (M <= 12 || M == 24) - { - microM = 12; - kernelMM = Avx512f::GemmKernel12x32nn; - kernelMT = tail > F ? Avx512f::GemmKernel12x32nn : Avx512f::GemmKernel12x16nn; - kernelTM = Avx512f::GetGemmTail(M%microM, microN); - kernelTT = Avx512f::GetGemmTail(M%microM, tail); - } - else - { - microM = 14; - kernelMM = Avx512f::GemmKernel14x32nn; - kernelMT = tail > F ? 
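The microM x microN pairs selected above are register-budget driven: an m x n micro-kernel keeps m*(n/16) C accumulators live in ZMM registers, plus n/16 registers for the current B row and one for the broadcast A element. The exact allocation is up to the compiler, but under that assumed scheme the chosen shapes exactly fill the two budgets (illustrative check, not library code):

#include <cstddef>

// ZMM registers needed by an m x n single-precision micro-kernel:
// m*(n/16) accumulators + n/16 B vectors + 1 broadcast of A.
constexpr size_t ZmmBudget(size_t m, size_t n)
{
    return m * (n / 16) + (n / 16) + 1;
}

static_assert(ZmmBudget(14, 32) == 31, "fits SIMD_ZMM_COUNT == 32");
static_assert(ZmmBudget(9, 48)  == 31, "fits SIMD_ZMM_COUNT == 32");
static_assert(ZmmBudget(6, 32)  == 15, "fits SIMD_ZMM_COUNT == 16");
static_assert(ZmmBudget(4, 48)  == 16, "fits SIMD_ZMM_COUNT == 16");

This is why the 32-register x64 build can afford 14x32 and 9x48 tiles while the 16-register build falls back to 6x32 and 4x48.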
Avx512f::GemmKernel14x32nn : Avx512f::GemmKernel14x16nn; - kernelTM = Avx512f::GetGemmTail(M%microM, microN); - kernelTT = Avx512f::GetGemmTail(M%microM, tail); - } - type = GemmKernelF2; - } - if (type == GemmKernelF1 || (type == GemmKernelF2 && N <= 16) || type == GemmKernelAny) - { - microM = 14; - microN = 16; - kernelMM = Avx512f::GemmKernel14x16nn; - kernelMT = Avx512f::GemmKernel14x16nn; - kernelTM = Avx512f::GetGemmTail(M%microM, microN); - kernelTT = Avx512f::GetGemmTail(M%microM, microN); - type = GemmKernelF1; - } -#elif SIMD_ZMM_COUNT == 16 - if (type == GemmKernelF3 || (type == GemmKernelAny && (M == 4 || M == 8 || M == 16 || N == 48 || N == 96) && N > 32)) - { - microM = 4; - microN = 48; - size_t tail = N - AlignLoAny(N, microN); - kernelMM = Avx512f::GemmKernel4x48nn; - kernelMT = tail > DF ? Avx512f::GemmKernel4x48nn : (tail > F ? Avx512f::GemmKernel4x32nn : Avx512f::GemmKernel4x16nn); - kernelTM = Avx512f::GetGemmTail(M%microM, microN); - kernelTT = Avx512f::GetGemmTail(M%microM, tail); - type = GemmKernelF3; - } - if (type == GemmKernelF2 || (type == GemmKernelF3 && N <= 32) || (type == GemmKernelAny && N > 16)) - { - microM = 6; - microN = 32; - size_t tail = N - AlignLoAny(N, microN); - kernelMM = Avx512f::GemmKernel6x32nn; - kernelMT = tail > F ? Avx512f::GemmKernel6x32nn : Avx512f::GemmKernel6x16nn; - kernelTM = Avx512f::GetGemmTail(M%microM, microN); - kernelTT = Avx512f::GetGemmTail(M%microM, tail); - type = GemmKernelF2; - } - if (type == GemmKernelF1 || (type == GemmKernelF2 && N <= 16) || type == GemmKernelAny) - { - microM = 6; - microN = 16; - kernelMM = Avx512f::GemmKernel6x16nn; - kernelMT = Avx512f::GemmKernel6x16nn; - kernelTM = Avx512f::GetGemmTail(M%microM, microN); - kernelTT = Avx512f::GetGemmTail(M%microM, microN); - type = GemmKernelF1; - } -#else - microM = 4; - microN = 16; - kernelMM = Avx512f::GemmKernel4x16nn; - kernelMT = Avx512f::GemmKernel4x16nn; - kernelTM = Avx512f::GetGemmTail(M%microM, microN); - kernelTT = Avx512f::GetGemmTail(M%microM, microN); -#endif - Gemm32fNNcb::PackA packA = ((M * 3 < N && N >= 512 && K >= 128 && M > 16) || (K >= 256 && M > 256)) ? 
Avx512f::GemmPackA : NULL; - return Gemm32fNNcb(M, N, K, microM, microN, Base::AlgCacheL1(), Base::AlgCacheL2(), Base::AlgCacheL3(), F, - kernelMM, kernelMT, kernelTM, kernelTT, packA, Avx512f::GemmPackB, Avx512f::GemmScaleC, TailMask16, compatibility); - } - - size_t Gemm32fNNcbBufferSize(size_t M, size_t N, size_t K, GemmKernelType type, bool compatibility) - { - if (N > Avx::F) - { - Gemm32fNNcb gemm = CreateGemm32fNNcb(M, N, K, type, compatibility); - return gemm.BufferSize(); - } - else - return Avx2::Gemm32fNNcbBufferSize(M, N, K, type, compatibility); - } - - void Gemm32fNNcbReorderB(size_t M, size_t N, size_t K, const float * B, float * pB, GemmKernelType type, bool compatibility) - { - if (N > Avx::F) - { - Gemm32fNNcb gemm = CreateGemm32fNNcb(M, N, K, type, compatibility); - gemm.ReorderB(B, N, pB); - } - else - Avx2::Gemm32fNNcbReorderB(M, N, K, B, pB, type, compatibility); - } - - void Gemm32fNNcbRun(size_t M, size_t N, size_t K, const float * A, const float * pB, float * C, GemmKernelType type, bool compatibility) - { - //SIMD_PERF_BEGF(Simd::ToStr(M) + "-" + Simd::ToStr(N) + "-" + Simd::ToStr(K), M * N * K * 2); - if (N > Avx::F) - { - Gemm32fNNcb gemm = CreateGemm32fNNcb(M, N, K, type, compatibility); - gemm.Run(A, K, pB, C, N); - } - else - Avx2::Gemm32fNNcbRun(M, N, K, A, pB, C, type, compatibility); - } - - //--------------------------------------------------------------------- - - SIMD_INLINE void Add4ExtractedSums(const __m512 & sum0, const __m512 & sum1, const __m512 & sum2, const __m512 & sum3, const __m128 & alpha, float * dst) - { - __m512 sum02 = _mm512_add_ps(_mm512_unpacklo_ps(sum0, sum2), _mm512_unpackhi_ps(sum0, sum2)); - __m512 sum13 = _mm512_add_ps(_mm512_unpacklo_ps(sum1, sum3), _mm512_unpackhi_ps(sum1, sum3)); - __m512 sum512 = _mm512_add_ps(_mm512_unpacklo_ps(sum02, sum13), _mm512_unpackhi_ps(sum02, sum13)); - __m128 sum128 = _mm_add_ps(_mm_add_ps(_mm512_extractf32x4_ps(sum512, 0), _mm512_extractf32x4_ps(sum512, 1)), - _mm_add_ps(_mm512_extractf32x4_ps(sum512, 2), _mm512_extractf32x4_ps(sum512, 3))); - _mm_storeu_ps(dst, _mm_fmadd_ps(alpha, sum128, _mm_loadu_ps(dst))); - } - - static void Kernel1x1x16nt(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc) - { - size_t K16 = K & (~15); - const float * A0 = A + 0 * lda; - const float * B0 = B + 0 * ldb; - __m512 c00 = _mm512_setzero_ps(); - __m512 a0, b0; - size_t k = 0; - for (; k < K16; k += 16) - { - a0 = _mm512_loadu_ps(A0 + k); - b0 = _mm512_loadu_ps(B0 + k); - c00 = _mm512_fmadd_ps(a0, b0, c00); - } - if (k < K) - { - __mmask16 tail = __mmask16(-1) >> (16 + k - K); - a0 = _mm512_maskz_loadu_ps(tail, A0 + k); - b0 = _mm512_maskz_loadu_ps(tail, B0 + k); - c00 = _mm512_fmadd_ps(a0, b0, c00); - } - C[0] += alpha * Avx512f::ExtractSum(c00); - } - - static void Kernel1x4x16nt(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc) - { - size_t K16 = K & (~15); - const float * A0 = A + 0 * lda; - const float * B0 = B + 0 * ldb; - const float * B1 = B + 1 * ldb; - const float * B2 = B + 2 * ldb; - const float * B3 = B + 3 * ldb; - __m512 c00 = _mm512_setzero_ps(); - __m512 c01 = _mm512_setzero_ps(); - __m512 c02 = _mm512_setzero_ps(); - __m512 c03 = _mm512_setzero_ps(); - __m512 a0, b0; - size_t k = 0; - for (; k < K16; k += 16) - { - a0 = _mm512_loadu_ps(A0 + k); - b0 = _mm512_loadu_ps(B0 + k); - c00 = _mm512_fmadd_ps(a0, b0, c00); - b0 = _mm512_loadu_ps(B1 + k); - c01 = _mm512_fmadd_ps(a0, b0, c01); - b0 = 
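The three Gemm32fNNcb entry points above split a GEMM with a constant B (typical for convolution lowered to GEMM) into one-time repacking and repeated runs. A hedged sketch of that flow, assuming these internal Avx512f entry points and the GemmKernelType enum are visible at the call site and that BufferSize() is measured in floats:

#include <vector>
#include <cstddef>

// Pack B once per weight matrix, then reuse the packed copy on every call.
void RunCachedGemm(size_t M, size_t N, size_t K,
                   const float* A, const float* B, float* C)
{
    using namespace Simd;
    size_t size = Avx512f::Gemm32fNNcbBufferSize(M, N, K, GemmKernelAny, true);
    std::vector<float> pB(size);
    Avx512f::Gemm32fNNcbReorderB(M, N, K, B, pB.data(), GemmKernelAny, true);
    Avx512f::Gemm32fNNcbRun(M, N, K, A, pB.data(), C, GemmKernelAny, true);
}

Note that CreateGemm32fNNcb is re-evaluated inside each call with the same (M, N, K, type) inputs, so the kernel selection stays consistent between ReorderB and Run.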
_mm512_loadu_ps(B2 + k); - c02 = _mm512_fmadd_ps(a0, b0, c02); - b0 = _mm512_loadu_ps(B3 + k); - c03 = _mm512_fmadd_ps(a0, b0, c03); - } - if (k < K) - { - __mmask16 tail = __mmask16(-1) >> (16 + k - K); - a0 = _mm512_maskz_loadu_ps(tail, A0 + k); - b0 = _mm512_maskz_loadu_ps(tail, B0 + k); - c00 = _mm512_fmadd_ps(a0, b0, c00); - b0 = _mm512_maskz_loadu_ps(tail, B1 + k); - c01 = _mm512_fmadd_ps(a0, b0, c01); - b0 = _mm512_maskz_loadu_ps(tail, B2 + k); - c02 = _mm512_fmadd_ps(a0, b0, c02); - b0 = _mm512_maskz_loadu_ps(tail, B3 + k); - c03 = _mm512_fmadd_ps(a0, b0, c03); - } - __m128 _alpha = _mm_set1_ps(alpha); - Add4ExtractedSums(c00, c01, c02, c03, _alpha, C + 0 * ldc); - } - - static void Kernel2x1x16nt(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc) - { - size_t K16 = K & (~15); - const float * A0 = A + 0 * lda; - const float * A1 = A + 1 * lda; - const float * B0 = B + 0 * ldb; - __m512 c00 = _mm512_setzero_ps(); - __m512 c10 = _mm512_setzero_ps(); - __m512 a0, a1, b0; - size_t k = 0; - for (; k < K16; k += 16) - { - a0 = _mm512_loadu_ps(A0 + k); - a1 = _mm512_loadu_ps(A1 + k); - b0 = _mm512_loadu_ps(B0 + k); - c00 = _mm512_fmadd_ps(a0, b0, c00); - c10 = _mm512_fmadd_ps(a1, b0, c10); - } - if (k < K) - { - __mmask16 tail = __mmask16(-1) >> (16 + k - K); - a0 = _mm512_maskz_loadu_ps(tail, A0 + k); - a1 = _mm512_maskz_loadu_ps(tail, A1 + k); - b0 = _mm512_maskz_loadu_ps(tail, B0 + k); - c00 = _mm512_fmadd_ps(a0, b0, c00); - c10 = _mm512_fmadd_ps(a1, b0, c10); - } - C[0 * ldc] += alpha * Avx512f::ExtractSum(c00); - C[1 * ldc] += alpha * Avx512f::ExtractSum(c10); - } - - static void Kernel2x4x16nt(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc) - { - size_t K16 = K & (~15); - const float * A0 = A + 0 * lda; - const float * A1 = A + 1 * lda; - const float * B0 = B + 0 * ldb; - const float * B1 = B + 1 * ldb; - const float * B2 = B + 2 * ldb; - const float * B3 = B + 3 * ldb; - __m512 c00 = _mm512_setzero_ps(); - __m512 c01 = _mm512_setzero_ps(); - __m512 c02 = _mm512_setzero_ps(); - __m512 c03 = _mm512_setzero_ps(); - __m512 c10 = _mm512_setzero_ps(); - __m512 c11 = _mm512_setzero_ps(); - __m512 c12 = _mm512_setzero_ps(); - __m512 c13 = _mm512_setzero_ps(); - __m512 a0, a1, b0; - size_t k = 0; - for (; k < K16; k += 16) - { - a0 = _mm512_loadu_ps(A0 + k); - a1 = _mm512_loadu_ps(A1 + k); - b0 = _mm512_loadu_ps(B0 + k); - c00 = _mm512_fmadd_ps(a0, b0, c00); - c10 = _mm512_fmadd_ps(a1, b0, c10); - b0 = _mm512_loadu_ps(B1 + k); - c01 = _mm512_fmadd_ps(a0, b0, c01); - c11 = _mm512_fmadd_ps(a1, b0, c11); - b0 = _mm512_loadu_ps(B2 + k); - c02 = _mm512_fmadd_ps(a0, b0, c02); - c12 = _mm512_fmadd_ps(a1, b0, c12); - b0 = _mm512_loadu_ps(B3 + k); - c03 = _mm512_fmadd_ps(a0, b0, c03); - c13 = _mm512_fmadd_ps(a1, b0, c13); - } - if (k < K) - { - __mmask16 tail = __mmask16(-1) >> (16 + k - K); - a0 = _mm512_maskz_loadu_ps(tail, A0 + k); - a1 = _mm512_maskz_loadu_ps(tail, A1 + k); - b0 = _mm512_maskz_loadu_ps(tail, B0 + k); - c00 = _mm512_fmadd_ps(a0, b0, c00); - c10 = _mm512_fmadd_ps(a1, b0, c10); - b0 = _mm512_maskz_loadu_ps(tail, B1 + k); - c01 = _mm512_fmadd_ps(a0, b0, c01); - c11 = _mm512_fmadd_ps(a1, b0, c11); - b0 = _mm512_maskz_loadu_ps(tail, B2 + k); - c02 = _mm512_fmadd_ps(a0, b0, c02); - c12 = _mm512_fmadd_ps(a1, b0, c12); - b0 = _mm512_maskz_loadu_ps(tail, B3 + k); - c03 = _mm512_fmadd_ps(a0, b0, c03); - c13 = _mm512_fmadd_ps(a1, b0, c13); - } - __m128 _alpha = _mm_set1_ps(alpha); - 
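Each KernelRxCx16nt above is R*C simultaneous dot products over K, vectorized 16 floats at a time with FMA and reduced horizontally at the end; Add4ExtractedSums folds four 16-lane accumulators into four adjacent C entries in one transpose-and-add pass. A scalar reference for the 2x4 case (illustrative):

#include <cstddef>

// What Kernel2x4x16nt computes: a 2x4 tile of C accumulates
// alpha * dot(A row i, B row j). B is accessed by rows because the
// second operand is transposed, hence the "NT" suffix.
static void Kernel2x4_Reference(size_t K, float alpha,
                                const float* A, size_t lda,
                                const float* B, size_t ldb,
                                float* C, size_t ldc)
{
    for (size_t i = 0; i < 2; ++i)
        for (size_t j = 0; j < 4; ++j)
        {
            float sum = 0.0f;
            for (size_t k = 0; k < K; ++k)
                sum += A[i * lda + k] * B[j * ldb + k];
            C[i * ldc + j] += alpha * sum;
        }
}

The 3x4 and 6x4 variants that follow are the same pattern with more rows of A held live per pass.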
Add4ExtractedSums(c00, c01, c02, c03, _alpha, C + 0 * ldc); - Add4ExtractedSums(c10, c11, c12, c13, _alpha, C + 1 * ldc); - } - - static void Kernel3x1x16nt(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc) - { - size_t K16 = K & (~15); - const float * A0 = A + 0 * lda; - const float * A1 = A + 1 * lda; - const float * A2 = A + 2 * lda; - const float * B0 = B + 0 * ldb; - __m512 c00 = _mm512_setzero_ps(); - __m512 c10 = _mm512_setzero_ps(); - __m512 c20 = _mm512_setzero_ps(); - __m512 a0, a1, a2, b0; - size_t k = 0; - for (; k < K16; k += 16) - { - a0 = _mm512_loadu_ps(A0 + k); - a1 = _mm512_loadu_ps(A1 + k); - a2 = _mm512_loadu_ps(A2 + k); - b0 = _mm512_loadu_ps(B0 + k); - c00 = _mm512_fmadd_ps(a0, b0, c00); - c10 = _mm512_fmadd_ps(a1, b0, c10); - c20 = _mm512_fmadd_ps(a2, b0, c20); - } - if (k < K) - { - __mmask16 tail = __mmask16(-1) >> (16 + k - K); - a0 = _mm512_maskz_loadu_ps(tail, A0 + k); - a1 = _mm512_maskz_loadu_ps(tail, A1 + k); - a2 = _mm512_maskz_loadu_ps(tail, A2 + k); - b0 = _mm512_maskz_loadu_ps(tail, B0 + k); - c00 = _mm512_fmadd_ps(a0, b0, c00); - c10 = _mm512_fmadd_ps(a1, b0, c10); - c20 = _mm512_fmadd_ps(a2, b0, c20); - } - C[0 * ldc] += alpha * Avx512f::ExtractSum(c00); - C[1 * ldc] += alpha * Avx512f::ExtractSum(c10); - C[2 * ldc] += alpha * Avx512f::ExtractSum(c20); - } - - static void Kernel3x4x16nt(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc) - { - size_t K16 = K & (~15); - const float * A0 = A + 0 * lda; - const float * A1 = A + 1 * lda; - const float * A2 = A + 2 * lda; - const float * B0 = B + 0 * ldb; - const float * B1 = B + 1 * ldb; - const float * B2 = B + 2 * ldb; - const float * B3 = B + 3 * ldb; - __m512 c00 = _mm512_setzero_ps(); - __m512 c01 = _mm512_setzero_ps(); - __m512 c02 = _mm512_setzero_ps(); - __m512 c03 = _mm512_setzero_ps(); - __m512 c10 = _mm512_setzero_ps(); - __m512 c11 = _mm512_setzero_ps(); - __m512 c12 = _mm512_setzero_ps(); - __m512 c13 = _mm512_setzero_ps(); - __m512 c20 = _mm512_setzero_ps(); - __m512 c21 = _mm512_setzero_ps(); - __m512 c22 = _mm512_setzero_ps(); - __m512 c23 = _mm512_setzero_ps(); - __m512 a0, a1, a2, b0; - size_t k = 0; - for (; k < K16; k += 16) - { - a0 = _mm512_loadu_ps(A0 + k); - a1 = _mm512_loadu_ps(A1 + k); - a2 = _mm512_loadu_ps(A2 + k); - b0 = _mm512_loadu_ps(B0 + k); - c00 = _mm512_fmadd_ps(a0, b0, c00); - c10 = _mm512_fmadd_ps(a1, b0, c10); - c20 = _mm512_fmadd_ps(a2, b0, c20); - b0 = _mm512_loadu_ps(B1 + k); - c01 = _mm512_fmadd_ps(a0, b0, c01); - c11 = _mm512_fmadd_ps(a1, b0, c11); - c21 = _mm512_fmadd_ps(a2, b0, c21); - b0 = _mm512_loadu_ps(B2 + k); - c02 = _mm512_fmadd_ps(a0, b0, c02); - c12 = _mm512_fmadd_ps(a1, b0, c12); - c22 = _mm512_fmadd_ps(a2, b0, c22); - b0 = _mm512_loadu_ps(B3 + k); - c03 = _mm512_fmadd_ps(a0, b0, c03); - c13 = _mm512_fmadd_ps(a1, b0, c13); - c23 = _mm512_fmadd_ps(a2, b0, c23); - } - if (k < K) - { - __mmask16 tail = __mmask16(-1) >> (16 + k - K); - a0 = _mm512_maskz_loadu_ps(tail, A0 + k); - a1 = _mm512_maskz_loadu_ps(tail, A1 + k); - a2 = _mm512_maskz_loadu_ps(tail, A2 + k); - b0 = _mm512_maskz_loadu_ps(tail, B0 + k); - c00 = _mm512_fmadd_ps(a0, b0, c00); - c10 = _mm512_fmadd_ps(a1, b0, c10); - c20 = _mm512_fmadd_ps(a2, b0, c20); - b0 = _mm512_maskz_loadu_ps(tail, B1 + k); - c01 = _mm512_fmadd_ps(a0, b0, c01); - c11 = _mm512_fmadd_ps(a1, b0, c11); - c21 = _mm512_fmadd_ps(a2, b0, c21); - b0 = _mm512_maskz_loadu_ps(tail, B2 + k); - c02 = _mm512_fmadd_ps(a0, b0, 
c02); - c12 = _mm512_fmadd_ps(a1, b0, c12); - c22 = _mm512_fmadd_ps(a2, b0, c22); - b0 = _mm512_maskz_loadu_ps(tail, B3 + k); - c03 = _mm512_fmadd_ps(a0, b0, c03); - c13 = _mm512_fmadd_ps(a1, b0, c13); - c23 = _mm512_fmadd_ps(a2, b0, c23); - } - __m128 _alpha = _mm_set1_ps(alpha); - Add4ExtractedSums(c00, c01, c02, c03, _alpha, C + 0 * ldc); - Add4ExtractedSums(c10, c11, c12, c13, _alpha, C + 1 * ldc); - Add4ExtractedSums(c20, c21, c22, c23, _alpha, C + 2 * ldc); - } - - static void Kernel6x1x16nt(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc) - { - size_t K16 = K & (~15); - const float * A0 = A + 0 * lda; - const float * A1 = A + 1 * lda; - const float * A2 = A + 2 * lda; - const float * A3 = A + 3 * lda; - const float * A4 = A + 4 * lda; - const float * A5 = A + 5 * lda; - const float * B0 = B + 0 * ldb; - __m512 c00 = _mm512_setzero_ps(); - __m512 c10 = _mm512_setzero_ps(); - __m512 c20 = _mm512_setzero_ps(); - __m512 c30 = _mm512_setzero_ps(); - __m512 c40 = _mm512_setzero_ps(); - __m512 c50 = _mm512_setzero_ps(); - __m512 a0, a1, a2, a3, a4, a5, b0; - size_t k = 0; - for (; k < K16; k += 16) - { - a0 = _mm512_loadu_ps(A0 + k); - a1 = _mm512_loadu_ps(A1 + k); - a2 = _mm512_loadu_ps(A2 + k); - a3 = _mm512_loadu_ps(A3 + k); - a4 = _mm512_loadu_ps(A4 + k); - a5 = _mm512_loadu_ps(A5 + k); - b0 = _mm512_loadu_ps(B0 + k); - c00 = _mm512_fmadd_ps(a0, b0, c00); - c10 = _mm512_fmadd_ps(a1, b0, c10); - c20 = _mm512_fmadd_ps(a2, b0, c20); - c30 = _mm512_fmadd_ps(a3, b0, c30); - c40 = _mm512_fmadd_ps(a4, b0, c40); - c50 = _mm512_fmadd_ps(a5, b0, c50); - } - if (k < K) - { - __mmask16 tail = __mmask16(-1) >> (16 + k - K); - a0 = _mm512_maskz_loadu_ps(tail, A0 + k); - a1 = _mm512_maskz_loadu_ps(tail, A1 + k); - a2 = _mm512_maskz_loadu_ps(tail, A2 + k); - a3 = _mm512_maskz_loadu_ps(tail, A3 + k); - a4 = _mm512_maskz_loadu_ps(tail, A4 + k); - a5 = _mm512_maskz_loadu_ps(tail, A5 + k); - b0 = _mm512_maskz_loadu_ps(tail, B0 + k); - c00 = _mm512_fmadd_ps(a0, b0, c00); - c10 = _mm512_fmadd_ps(a1, b0, c10); - c20 = _mm512_fmadd_ps(a2, b0, c20); - c30 = _mm512_fmadd_ps(a3, b0, c30); - c40 = _mm512_fmadd_ps(a4, b0, c40); - c50 = _mm512_fmadd_ps(a5, b0, c50); - } - C[0 * ldc] += alpha * Avx512f::ExtractSum(c00); - C[1 * ldc] += alpha * Avx512f::ExtractSum(c10); - C[2 * ldc] += alpha * Avx512f::ExtractSum(c20); - C[3 * ldc] += alpha * Avx512f::ExtractSum(c30); - C[4 * ldc] += alpha * Avx512f::ExtractSum(c40); - C[5 * ldc] += alpha * Avx512f::ExtractSum(c50); - } - - static void Kernel6x4x16nt(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc) - { - size_t K16 = K & (~15); - const float * A0 = A + 0 * lda; - const float * A1 = A + 1 * lda; - const float * A2 = A + 2 * lda; - const float * A3 = A + 3 * lda; - const float * A4 = A + 4 * lda; - const float * A5 = A + 5 * lda; - const float * B0 = B + 0 * ldb; - const float * B1 = B + 1 * ldb; - const float * B2 = B + 2 * ldb; - const float * B3 = B + 3 * ldb; - __m512 c00 = _mm512_setzero_ps(); - __m512 c01 = _mm512_setzero_ps(); - __m512 c02 = _mm512_setzero_ps(); - __m512 c03 = _mm512_setzero_ps(); - __m512 c10 = _mm512_setzero_ps(); - __m512 c11 = _mm512_setzero_ps(); - __m512 c12 = _mm512_setzero_ps(); - __m512 c13 = _mm512_setzero_ps(); - __m512 c20 = _mm512_setzero_ps(); - __m512 c21 = _mm512_setzero_ps(); - __m512 c22 = _mm512_setzero_ps(); - __m512 c23 = _mm512_setzero_ps(); - __m512 c30 = _mm512_setzero_ps(); - __m512 c31 = _mm512_setzero_ps(); - 
__m512 c32 = _mm512_setzero_ps(); - __m512 c33 = _mm512_setzero_ps(); - __m512 c40 = _mm512_setzero_ps(); - __m512 c41 = _mm512_setzero_ps(); - __m512 c42 = _mm512_setzero_ps(); - __m512 c43 = _mm512_setzero_ps(); - __m512 c50 = _mm512_setzero_ps(); - __m512 c51 = _mm512_setzero_ps(); - __m512 c52 = _mm512_setzero_ps(); - __m512 c53 = _mm512_setzero_ps(); - __m512 a0, a1, a2, a3, a4, a5, b0; - size_t k = 0; - for (; k < K16; k += 16) - { - a0 = _mm512_loadu_ps(A0 + k); - a1 = _mm512_loadu_ps(A1 + k); - a2 = _mm512_loadu_ps(A2 + k); - a3 = _mm512_loadu_ps(A3 + k); - a4 = _mm512_loadu_ps(A4 + k); - a5 = _mm512_loadu_ps(A5 + k); - b0 = _mm512_loadu_ps(B0 + k); - c00 = _mm512_fmadd_ps(a0, b0, c00); - c10 = _mm512_fmadd_ps(a1, b0, c10); - c20 = _mm512_fmadd_ps(a2, b0, c20); - c30 = _mm512_fmadd_ps(a3, b0, c30); - c40 = _mm512_fmadd_ps(a4, b0, c40); - c50 = _mm512_fmadd_ps(a5, b0, c50); - b0 = _mm512_loadu_ps(B1 + k); - c01 = _mm512_fmadd_ps(a0, b0, c01); - c11 = _mm512_fmadd_ps(a1, b0, c11); - c21 = _mm512_fmadd_ps(a2, b0, c21); - c31 = _mm512_fmadd_ps(a3, b0, c31); - c41 = _mm512_fmadd_ps(a4, b0, c41); - c51 = _mm512_fmadd_ps(a5, b0, c51); - b0 = _mm512_loadu_ps(B2 + k); - c02 = _mm512_fmadd_ps(a0, b0, c02); - c12 = _mm512_fmadd_ps(a1, b0, c12); - c22 = _mm512_fmadd_ps(a2, b0, c22); - c32 = _mm512_fmadd_ps(a3, b0, c32); - c42 = _mm512_fmadd_ps(a4, b0, c42); - c52 = _mm512_fmadd_ps(a5, b0, c52); - b0 = _mm512_loadu_ps(B3 + k); - c03 = _mm512_fmadd_ps(a0, b0, c03); - c13 = _mm512_fmadd_ps(a1, b0, c13); - c23 = _mm512_fmadd_ps(a2, b0, c23); - c33 = _mm512_fmadd_ps(a3, b0, c33); - c43 = _mm512_fmadd_ps(a4, b0, c43); - c53 = _mm512_fmadd_ps(a5, b0, c53); - } - if (k < K) - { - __mmask16 tail = __mmask16(-1) >> (16 + k - K); - a0 = _mm512_maskz_loadu_ps(tail, A0 + k); - a1 = _mm512_maskz_loadu_ps(tail, A1 + k); - a2 = _mm512_maskz_loadu_ps(tail, A2 + k); - a3 = _mm512_maskz_loadu_ps(tail, A3 + k); - a4 = _mm512_maskz_loadu_ps(tail, A4 + k); - a5 = _mm512_maskz_loadu_ps(tail, A5 + k); - b0 = _mm512_maskz_loadu_ps(tail, B0 + k); - c00 = _mm512_fmadd_ps(a0, b0, c00); - c10 = _mm512_fmadd_ps(a1, b0, c10); - c20 = _mm512_fmadd_ps(a2, b0, c20); - c30 = _mm512_fmadd_ps(a3, b0, c30); - c40 = _mm512_fmadd_ps(a4, b0, c40); - c50 = _mm512_fmadd_ps(a5, b0, c50); - b0 = _mm512_maskz_loadu_ps(tail, B1 + k); - c01 = _mm512_fmadd_ps(a0, b0, c01); - c11 = _mm512_fmadd_ps(a1, b0, c11); - c21 = _mm512_fmadd_ps(a2, b0, c21); - c31 = _mm512_fmadd_ps(a3, b0, c31); - c41 = _mm512_fmadd_ps(a4, b0, c41); - c51 = _mm512_fmadd_ps(a5, b0, c51); - b0 = _mm512_maskz_loadu_ps(tail, B2 + k); - c02 = _mm512_fmadd_ps(a0, b0, c02); - c12 = _mm512_fmadd_ps(a1, b0, c12); - c22 = _mm512_fmadd_ps(a2, b0, c22); - c32 = _mm512_fmadd_ps(a3, b0, c32); - c42 = _mm512_fmadd_ps(a4, b0, c42); - c52 = _mm512_fmadd_ps(a5, b0, c52); - b0 = _mm512_maskz_loadu_ps(tail, B3 + k); - c03 = _mm512_fmadd_ps(a0, b0, c03); - c13 = _mm512_fmadd_ps(a1, b0, c13); - c23 = _mm512_fmadd_ps(a2, b0, c23); - c33 = _mm512_fmadd_ps(a3, b0, c33); - c43 = _mm512_fmadd_ps(a4, b0, c43); - c53 = _mm512_fmadd_ps(a5, b0, c53); - } - __m128 _alpha = _mm_set1_ps(alpha); - Add4ExtractedSums(c00, c01, c02, c03, _alpha, C + 0 * ldc); - Add4ExtractedSums(c10, c11, c12, c13, _alpha, C + 1 * ldc); - Add4ExtractedSums(c20, c21, c22, c23, _alpha, C + 2 * ldc); - Add4ExtractedSums(c30, c31, c32, c33, _alpha, C + 3 * ldc); - Add4ExtractedSums(c40, c41, c42, c43, _alpha, C + 4 * ldc); - Add4ExtractedSums(c50, c51, c52, c53, _alpha, C + 5 * ldc); - } - - void Gemm32fNT(size_t M, size_t N, 
size_t K, const float * alpha, const float * A, size_t lda, const float * B, size_t ldb, const float * beta, float * C, size_t ldc) - { - //SIMD_PERF_BEGF(Simd::ToStr(M) + "-" + Simd::ToStr(N) + "-" + Simd::ToStr(K), M*N*K * 2); - if (K <= Avx2::F) - { - Avx2::Gemm32fNT(M, N, K, alpha, A, lda, B, ldb, beta, C, ldc); - return; - } - typedef Simd::GemmNT GemmNT; -#if SIMD_ZMM_COUNT == 32 - GemmNT gemmNT(M, N, K, Base::AlgCacheL1(), Base::AlgCacheL2(), Base::AlgCacheL3(), F, Avx::GemmScaleC, - Kernel1x1x16nt, Kernel1x4x16nt, Kernel2x1x16nt, Kernel2x4x16nt, Kernel3x1x16nt, Kernel3x4x16nt, Kernel6x1x16nt, Kernel6x4x16nt); -#elif defined(SIMD_X64_ENABLE) - GemmNT gemmNT(M, N, K, Base::AlgCacheL1(), Base::AlgCacheL2(), Base::AlgCacheL3(), F, Avx::GemmScaleC, - Kernel1x1x16nt, Kernel1x4x16nt, Kernel2x1x16nt, Kernel2x4x16nt, Kernel3x1x16nt, Kernel3x4x16nt, NULL, NULL); -#else - GemmNT gemmNT(M, N, K, Base::AlgCacheL1(), Base::AlgCacheL2(), Base::AlgCacheL3(), F, Sse::GemmScaleC, - Kernel1x1x16nt, Kernel1x4x16nt, NULL, NULL, NULL, NULL, NULL, NULL); -#endif - gemmNT.Run(alpha, A, lda, B, ldb, beta, C, ldc); - } - } -#endif// SIMD_AVX512F_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx512fNeural.cpp b/src/3rd/Simd/Simd/SimdAvx512fNeural.cpp deleted file mode 100644 index 00a65156..00000000 --- a/src/3rd/Simd/Simd/SimdAvx512fNeural.cpp +++ /dev/null @@ -1,2911 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2019 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdExtract.h" -#include "Simd/SimdStream.h" -#include "Simd/SimdNeural.h" -#include "Simd/SimdAvx2.h" -#include "Simd/SimdPow.h" -#include "Simd/SimdExp.h" - -namespace Simd -{ -#ifdef SIMD_AVX512F_ENABLE - namespace Avx512f - { - template <bool align, bool mask> SIMD_INLINE void NeuralProductSum(const float * a, const float * b, size_t offset, __m512 & sum, __mmask16 m = -1) - { - __m512 _a = Load<align, mask>(a + offset, m); - __m512 _b = Load<align, mask>(b + offset, m); - sum = _mm512_fmadd_ps(_a, _b, sum); - } - - template <bool align> SIMD_INLINE void NeuralProductSum(const float * a, const float * b, size_t size, float * sum) - { - if (align) - assert(Aligned(a) && Aligned(b)); - - size_t partialAlignedSize = AlignLo(size, F); - size_t fullAlignedSize = AlignLo(size, QF); - size_t i = 0; - __m512 sum0 = _mm512_setzero_ps(); - if (fullAlignedSize) - { - __m512 sum1 = _mm512_setzero_ps(); - __m512 sum2 = _mm512_setzero_ps(); - __m512 sum3 = _mm512_setzero_ps(); - for (; i < fullAlignedSize; i += QF) - { - NeuralProductSum<align, false>(a, b, i + F * 0, sum0); - NeuralProductSum<align, false>(a, b, i + F * 1, sum1); - NeuralProductSum<align, false>(a, b, i + F * 2, sum2); - NeuralProductSum<align, false>(a, b, i + F * 3, sum3); - } - sum0 = _mm512_add_ps(_mm512_add_ps(sum0, sum1), _mm512_add_ps(sum2, sum3)); - } - for (; i < partialAlignedSize; i += F) - NeuralProductSum<align, false>(a, b, i, sum0); - if (i < size) - { - __mmask16 tailMask = __mmask16(-1) >> (F + i - size); - NeuralProductSum<align, true>(a, b, i, sum0, tailMask); - } - *sum = ExtractSum(sum0); - } - - void NeuralProductSum(const float * a, const float * b, size_t size, float * sum) - { - if (Aligned(a) && Aligned(b)) - NeuralProductSum<true>(a, b, size, sum); - else - NeuralProductSum<false>(a, b, size, sum); - } - - template <bool align, bool mask> SIMD_INLINE void AddMultiplied(const float * src, const __m512 & value, float * dst, __mmask16 m = -1) - { - __m512 _src = Load<align, mask>(src, m); - __m512 _dst = Load<align, mask>(dst, m); - Store<align, mask>(dst, _mm512_fmadd_ps(value, _src, _dst), m); - } - - template <bool align> SIMD_INLINE void AddMultiplied(const float * src, size_t aligned, size_t partial, size_t full, float value, float * dst) - { - size_t i = 0; - __m512 _value = _mm512_set1_ps(value); - for (; i < aligned; i += QF) - { - AddMultiplied<align, false>(src + i + F * 0, _value, dst + i + F * 0); - AddMultiplied<align, false>(src + i + F * 1, _value, dst + i + F * 1); - AddMultiplied<align, false>(src + i + F * 2, _value, dst + i + F * 2); - AddMultiplied<align, false>(src + i + F * 3, _value, dst + i + F * 3); - } - for (; i < partial; i += F) - AddMultiplied<align, false>(src + i, _value, dst + i); - if (i < full) - { - __mmask16 tailMask = __mmask16(-1) >> (F + i - full); - AddMultiplied<align, true>(src + i, _value, dst + i, tailMask); - } - } - - void NeuralAddVectorMultipliedByValue(const float * src, size_t size, const float * value, float * dst) - { - size_t aligned = AlignLo(size, QF); - size_t partial = AlignLo(size, F); - if (Aligned(src) && Aligned(dst)) - AddMultiplied<true>(src, aligned, partial, size, *value, dst); - else - AddMultiplied<false>(src, aligned, partial, size, *value, dst); - } - - template <bool align, bool mask> SIMD_INLINE void AddVector(const float * src, float * dst, __mmask16 m = -1) - { - __m512 _src = Load<align, mask>(src, m); - __m512 _dst = Load<align, mask>(dst, m); - Store<align, mask>(dst, _mm512_add_ps(_src, _dst), m); - } - - template <bool align> SIMD_INLINE void AddVector(const float * src, size_t aligned, size_t partial, size_t full, float * dst) - { - size_t i = 0; - for (; i < aligned; i += QF) - { - AddVector<align, false>(src + i + F * 0, dst + i + F * 0); - AddVector<align, false>(src + i + F * 1, dst + i + F * 1); - AddVector<align, false>(src + i + F * 2, dst + i + F * 2); - AddVector<align, false>(src + i + F * 3, dst 
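All of these primitives follow one template scheme: `align` selects aligned versus unaligned loads and stores, and `mask` routes the remainder of the array through a single masked step instead of a scalar loop. A minimal standalone version of the same tail strategy, using plain intrinsics and no library helpers:

#include <immintrin.h>
#include <cstddef>

// Standalone dot product with the same structure as NeuralProductSum:
// full 16-float FMA steps, then one masked step for the remainder.
float DotProduct_Sketch(const float* a, const float* b, size_t size)
{
    __m512 sum = _mm512_setzero_ps();
    size_t i = 0, aligned = size & ~size_t(15);
    for (; i < aligned; i += 16)
        sum = _mm512_fmadd_ps(_mm512_loadu_ps(a + i), _mm512_loadu_ps(b + i), sum);
    if (i < size)
    {
        __mmask16 tail = __mmask16((1u << (size - i)) - 1);
        sum = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(tail, a + i),
                              _mm512_maskz_loadu_ps(tail, b + i), sum);
    }
    return _mm512_reduce_add_ps(sum);  // horizontal sum, as ExtractSum does
}

The four-accumulator unrolling in the library version exists to hide FMA latency; the masked epilogue is correct regardless.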
+ i + F * 3); - } - for (; i < partial; i += F) - AddVector(src + i, dst + i); - if (i < full) - { - __mmask16 tailMask = __mmask16(-1) >> (F + i - full); - AddVector(src + i, dst + i, tailMask); - } - } - - void NeuralAddVector(const float * src, size_t size, float * dst) - { - size_t aligned = AlignLo(size, QF); - size_t partial = AlignLo(size, F); - if (Aligned(src) && Aligned(dst)) - AddVector(src, aligned, partial, size, dst); - else - AddVector(src, aligned, partial, size, dst); - } - - template SIMD_INLINE void AddValue(const __m512 & value, float * dst, __mmask16 m = -1) - { - __m512 _dst = Load(dst, m); - Store(dst, _mm512_add_ps(_dst, value), m); - } - - template SIMD_INLINE void AddValue(const float * value, float * dst, size_t aligned, size_t partial, size_t full) - { - size_t i = 0; - __m512 _value = _mm512_set1_ps(value[0]); - for (; i < aligned; i += QF) - { - AddValue(_value, dst + i + F * 0); - AddValue(_value, dst + i + F * 1); - AddValue(_value, dst + i + F * 2); - AddValue(_value, dst + i + F * 3); - } - for (; i < partial; i += F) - AddValue(_value, dst + i); - if (i < full) - { - __mmask16 tailMask = __mmask16(-1) >> (F + i - full); - AddValue(_value, dst + i, tailMask); - } - } - - void NeuralAddValue(const float * value, float * dst, size_t size) - { - size_t aligned = AlignLo(size, QF); - size_t partial = AlignLo(size, F); - if (Aligned(dst)) - AddValue(value, dst, aligned, partial, size); - else - AddValue(value, dst, aligned, partial, size); - } - - template SIMD_INLINE void NeuralRoughSigmoid(const float * src, const __m512 & _0, const __m512 & _1, - const __m512 & a, const __m512 & b, const __m512 & slope, float * dst, __mmask16 m = -1) - { - __m512 _src = Load(src, m); - __m512 x = AndNot(_0, _mm512_mul_ps(_src, slope)); - __m512 x2 = _mm512_mul_ps(x, x); - __m512 x4 = _mm512_mul_ps(x2, x2); - __m512 series = _mm512_add_ps(_mm512_fmadd_ps(x2, a, _1), _mm512_fmadd_ps(x4, b, x)); - __m512 exp = _mm512_mask_blend_ps(_mm512_cmp_ps_mask(_src, _0, _CMP_GT_OS), series, Rcp14(series)); - __m512 sigmoid = Rcp14(_mm512_add_ps(_1, exp)); - Store(dst, sigmoid, m); - } - - template SIMD_INLINE void NeuralRoughSigmoid(const float * src, size_t size, const float * slope, float * dst) - { - __m512 _slope = _mm512_set1_ps(*slope); - __m512 _0 = _mm512_set1_ps(-0.0f); - __m512 _1 = _mm512_set1_ps(1.0f); - __m512 _a = _mm512_set1_ps(0.5417f); - __m512 _b = _mm512_set1_ps(0.1460f); - size_t i = 0; - size_t partialAlignedSize = Simd::AlignLo(size, F); - size_t fullAlignedSize = Simd::AlignLo(size, QF); - for (; i < fullAlignedSize; i += QF) - { - NeuralRoughSigmoid(src + i + 0 * F, _0, _1, _a, _b, _slope, dst + i + 0 * F); - NeuralRoughSigmoid(src + i + 1 * F, _0, _1, _a, _b, _slope, dst + i + 1 * F); - NeuralRoughSigmoid(src + i + 2 * F, _0, _1, _a, _b, _slope, dst + i + 2 * F); - NeuralRoughSigmoid(src + i + 3 * F, _0, _1, _a, _b, _slope, dst + i + 3 * F); - } - for (; i < partialAlignedSize; i += F) - NeuralRoughSigmoid(src + i, _0, _1, _a, _b, _slope, dst + i); - if (i < size) - { - __mmask16 tailMask = __mmask16(-1) >> (F + i - size); - NeuralRoughSigmoid(src + i, _0, _1, _a, _b, _slope, dst + i, tailMask); - } - } - - void NeuralRoughSigmoid(const float * src, size_t size, const float * slope, float * dst) - { - if (Aligned(src) && Aligned(dst)) - NeuralRoughSigmoid(src, size, slope, dst); - else - NeuralRoughSigmoid(src, size, slope, dst); - } - - template SIMD_INLINE void NeuralRoughSigmoid2(const float * src, const __m512 & k, - const __m512 & _1, const __m512 & _05, 
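NeuralRoughSigmoid (the first variant above) evaluates the sigmoid through a fitted polynomial: for t = |slope*x| it uses 1 + t + 0.5417*t^2 + 0.1460*t^4 as a stand-in for exp(t), takes the reciprocal on the positive side to get exp(-t), and finishes with 1/(1 + exp(-t)). A scalar restatement (illustrative; the coefficients are taken from the code above):

#include <cmath>

// Scalar restatement of NeuralRoughSigmoid: a fitted polynomial replaces
// exp(); Rcp14 in the vector code becomes an exact division here.
float RoughSigmoid_Reference(float x, float slope)
{
    float t = std::fabs(slope * x);
    float series = 1.0f + t + 0.5417f * t * t + 0.1460f * t * t * t * t;
    float e = (x > 0.0f) ? 1.0f / series : series;   // ~= exp(-slope*x)
    return 1.0f / (1.0f + e);
}

The branch on the sign matches the _mm512_mask_blend_ps on _CMP_GT_OS in the vector code: the polynomial is only accurate for non-negative arguments, so the negative side is obtained by reciprocal symmetry.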
float * dst, __mmask16 m = -1) - { - __m512 _src = Load(src, m); - __m512 e1 = _mm512_max_ps(_05, _mm512_fmadd_ps(_src, k, _1)); - __m512 e2 = _mm512_mul_ps(e1, e1); - __m512 e4 = _mm512_mul_ps(e2, e2); - __m512 e8 = _mm512_mul_ps(e4, e4); - __m512 e16 = _mm512_mul_ps(e8, e8); - __m512 e32 = _mm512_mul_ps(e16, e16); - __m512 e64 = _mm512_mul_ps(e32, e32); - __m512 sigmoid = Rcp14(_mm512_fmadd_ps(e64, e64, _1)); - Store(dst, sigmoid, m); - } - - template SIMD_INLINE void NeuralRoughSigmoid2(const float * src, size_t size, const float * slope, float * dst) - { - size_t partialAlignedSize = Simd::AlignLo(size, F); - size_t fullAlignedSize = Simd::AlignLo(size, QF); - __m512 _k = _mm512_set1_ps(-(*slope)*0.0078125f); - __m512 _1 = _mm512_set1_ps(1.0f); - __m512 _05 = _mm512_set1_ps(0.5f); - size_t i = 0; - for (; i < fullAlignedSize; i += QF) - { - NeuralRoughSigmoid2(src + i + 0 * F, _k, _1, _05, dst + i + 0 * F); - NeuralRoughSigmoid2(src + i + 1 * F, _k, _1, _05, dst + i + 1 * F); - NeuralRoughSigmoid2(src + i + 2 * F, _k, _1, _05, dst + i + 2 * F); - NeuralRoughSigmoid2(src + i + 3 * F, _k, _1, _05, dst + i + 3 * F); - } - for (; i < partialAlignedSize; i += F) - NeuralRoughSigmoid2(src + i, _k, _1, _05, dst + i); - if (i < size) - { - __mmask16 tailMask = __mmask16(-1) >> (F + i - size); - NeuralRoughSigmoid2(src + i, _k, _1, _05, dst + i, tailMask); - } - } - - void NeuralRoughSigmoid2(const float * src, size_t size, const float * slope, float * dst) - { - if (Aligned(src) && Aligned(dst)) - NeuralRoughSigmoid2(src, size, slope, dst); - else - NeuralRoughSigmoid2(src, size, slope, dst); - } - - template SIMD_INLINE void NeuralDerivativeSigmoid(const float * src, const __m512 & _1, const __m512 & slope, float * dst, __mmask16 m = -1) - { - __m512 _src = Load(src, m); - __m512 _dst = Load(dst, m); - Store(dst, _mm512_mul_ps(_mm512_mul_ps(_dst, slope), _mm512_mul_ps(_mm512_sub_ps(_1, _src), _src)), m); - } - - template SIMD_INLINE void NeuralDerivativeSigmoid(const float * src, size_t size, const float * slope, float * dst) - { - size_t partialAlignedSize = Simd::AlignLo(size, F); - size_t fullAlignedSize = Simd::AlignLo(size, QF); - __m512 _1 = _mm512_set1_ps(1.0f); - __m512 _slope = _mm512_set1_ps(*slope); - size_t i = 0; - for (; i < fullAlignedSize; i += QF) - { - NeuralDerivativeSigmoid(src + i + 0 * F, _1, _slope, dst + i + 0 * F); - NeuralDerivativeSigmoid(src + i + 1 * F, _1, _slope, dst + i + 1 * F); - NeuralDerivativeSigmoid(src + i + 2 * F, _1, _slope, dst + i + 2 * F); - NeuralDerivativeSigmoid(src + i + 3 * F, _1, _slope, dst + i + 3 * F); - } - for (; i < partialAlignedSize; i += F) - NeuralDerivativeSigmoid(src + i, _1, _slope, dst + i); - if (i < size) - { - __mmask16 tailMask = __mmask16(-1) >> (F + i - size); - NeuralDerivativeSigmoid(src + i, _1, _slope, dst + i, tailMask); - } - } - - void NeuralDerivativeSigmoid(const float * src, size_t size, const float * slope, float * dst) - { - if (Aligned(src) && Aligned(dst)) - NeuralDerivativeSigmoid(src, size, slope, dst); - else - NeuralDerivativeSigmoid(src, size, slope, dst); - } - - template SIMD_INLINE void NeuralRoughTanh(const float * src, const __m512 & _0, const __m512 & _1, - const __m512 & a, const __m512 & b, const __m512 & slope, float * dst, __mmask16 m = -1) - { - __m512 _src = Load(src, m); - __m512 x = AndNot(_0, _mm512_mul_ps(_src, slope)); - __m512 x2 = _mm512_mul_ps(x, x); - __m512 x4 = _mm512_mul_ps(x2, x2); - __m512 pe = _mm512_add_ps(_mm512_fmadd_ps(x2, a, _1), _mm512_fmadd_ps(x4, b, x)); - __m512 ne = 
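NeuralRoughSigmoid2 is worth decoding: with k = -slope/128 (0.0078125 = 1/128), it forms e1 = max(0.5, 1 + k*x) and squares it seven times in total, so e1^128 approximates exp(-slope*x) via the compound-interest limit (1 + t/n)^n -> exp(t); the final Rcp14(e64*e64 + 1) is then approximately 1/(1 + exp(-slope*x)). Scalar restatement (illustrative):

#include <algorithm>

// Scalar restatement of NeuralRoughSigmoid2:
// (1 - s*x/128)^128 ~= exp(-s*x); the max() keeps the base positive.
float RoughSigmoid2_Reference(float x, float slope)
{
    float e = std::max(0.5f, 1.0f - slope * x / 128.0f);
    for (int i = 0; i < 7; ++i)
        e *= e;                       // e = (1 - s*x/128)^128
    return 1.0f / (1.0f + e);         // ~= sigmoid(slope * x)
}

The vector version replaces the division with Rcp14, a roughly 14-bit-accurate reciprocal, which is why this whole family of functions is named "rough".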
Rcp14(pe); - __m512 absTanh = _mm512_mul_ps(_mm512_sub_ps(pe, ne), Rcp14(_mm512_add_ps(pe, ne))); - __m512 tanh = Xor(absTanh, AndMaskZ(_0, _0, _mm512_cmp_ps_mask(_0, _src, _CMP_GT_OS))); - Store(dst, tanh, m); - } - - template SIMD_INLINE void NeuralRoughTanh(const float * src, size_t size, const float * slope, float * dst) - { - __m512 _slope = _mm512_set1_ps(*slope); - __m512 _0 = _mm512_set1_ps(-0.0f); - __m512 _1 = _mm512_set1_ps(1.0f); - __m512 _a = _mm512_set1_ps(0.5658f); - __m512 _b = _mm512_set1_ps(0.1430f); - size_t i = 0; - size_t partialAlignedSize = Simd::AlignLo(size, F); - size_t fullAlignedSize = Simd::AlignLo(size, QF); - for (; i < fullAlignedSize; i += QF) - { - NeuralRoughTanh(src + i + 0 * F, _0, _1, _a, _b, _slope, dst + i + 0 * F); - NeuralRoughTanh(src + i + 1 * F, _0, _1, _a, _b, _slope, dst + i + 1 * F); - NeuralRoughTanh(src + i + 2 * F, _0, _1, _a, _b, _slope, dst + i + 2 * F); - NeuralRoughTanh(src + i + 3 * F, _0, _1, _a, _b, _slope, dst + i + 3 * F); - } - for (; i < partialAlignedSize; i += F) - NeuralRoughTanh(src + i, _0, _1, _a, _b, _slope, dst + i); - if (i < size) - { - __mmask16 tailMask = __mmask16(-1) >> (F + i - size); - NeuralRoughTanh(src + i, _0, _1, _a, _b, _slope, dst + i, tailMask); - } - } - - void NeuralRoughTanh(const float * src, size_t size, const float * slope, float * dst) - { - if (Aligned(src) && Aligned(dst)) - NeuralRoughTanh(src, size, slope, dst); - else - NeuralRoughTanh(src, size, slope, dst); - } - - template SIMD_INLINE void NeuralDerivativeTanh(const float * src, const __m512 & _1, const __m512 & slope, float * dst, __mmask16 m = -1) - { - __m512 _src = Load(src, m); - __m512 _dst = Load(dst, m); - Store(dst, _mm512_mul_ps(_mm512_mul_ps(_dst, slope), _mm512_sub_ps(_1, _mm512_mul_ps(_src, _src))), m); - } - - template SIMD_INLINE void NeuralDerivativeTanh(const float * src, size_t size, const float * slope, float * dst) - { - size_t partialAlignedSize = Simd::AlignLo(size, F); - size_t fullAlignedSize = Simd::AlignLo(size, QF); - __m512 _1 = _mm512_set1_ps(1.0f); - __m512 _slope = _mm512_set1_ps(*slope); - size_t i = 0; - for (; i < fullAlignedSize; i += QF) - { - NeuralDerivativeTanh(src + i + 0 * F, _1, _slope, dst + i + 0 * F); - NeuralDerivativeTanh(src + i + 1 * F, _1, _slope, dst + i + 1 * F); - NeuralDerivativeTanh(src + i + 2 * F, _1, _slope, dst + i + 2 * F); - NeuralDerivativeTanh(src + i + 3 * F, _1, _slope, dst + i + 3 * F); - } - for (; i < partialAlignedSize; i += F) - NeuralDerivativeTanh(src + i, _1, _slope, dst + i); - if (i < size) - { - __mmask16 tailMask = __mmask16(-1) >> (F + i - size); - NeuralDerivativeTanh(src + i, _1, _slope, dst + i, tailMask); - } - } - - void NeuralDerivativeTanh(const float * src, size_t size, const float * slope, float * dst) - { - if (Aligned(src) && Aligned(dst)) - NeuralDerivativeTanh(src, size, slope, dst); - else - NeuralDerivativeTanh(src, size, slope, dst); - } - - template SIMD_INLINE void NeuralDerivativeRelu(const float * src, const __m512 & _0, const __m512 & _1, const __m512 & slope, float * dst, __mmask16 m = -1) - { - __m512 _src = Load(src, m); - __mmask16 positive = _mm512_cmp_ps_mask(_src, _0, _CMP_GT_OS); - __m512 _dst = Load(dst, m); - Store(dst, _mm512_mul_ps(_mm512_mask_blend_ps(positive, slope, _1), _dst), m); - } - - template SIMD_INLINE void NeuralDerivativeRelu(const float * src, size_t size, const float * slope, float * dst) - { - __m512 _0 = _mm512_set1_ps(0.0f); - __m512 _1 = _mm512_set1_ps(1.0f); - __m512 _slope = _mm512_set1_ps(slope[0]); - size_t 
partialAlignedSize = Simd::AlignLo(size, F); - size_t fullAlignedSize = Simd::AlignLo(size, QF); - size_t i = 0; - for (; i < fullAlignedSize; i += QF) - { - NeuralDerivativeRelu(src + i + 0 * F, _0, _1, _slope, dst + i + 0 * F); - NeuralDerivativeRelu(src + i + 1 * F, _0, _1, _slope, dst + i + 1 * F); - NeuralDerivativeRelu(src + i + 2 * F, _0, _1, _slope, dst + i + 2 * F); - NeuralDerivativeRelu(src + i + 3 * F, _0, _1, _slope, dst + i + 3 * F); - } - for (; i < partialAlignedSize; i += F) - NeuralDerivativeRelu(src + i, _0, _1, _slope, dst + i); - if (i < size) - { - __mmask16 tailMask = __mmask16(-1) >> (F + i - size); - NeuralDerivativeRelu(src + i, _0, _1, _slope, dst + i, tailMask); - } - } - - void NeuralDerivativeRelu(const float * src, size_t size, const float * slope, float * dst) - { - if (Aligned(src) && Aligned(dst)) - NeuralDerivativeRelu(src, size, slope, dst); - else - NeuralDerivativeRelu(src, size, slope, dst); - } - - template void NeuralPow(const float * src, size_t size, const float * exponent, float * dst) - { - if (align) - assert(Aligned(src) && Aligned(dst)); - - float e = exponent[0]; - size_t aligned = AlignLo(size, F); - __m512 _e = _mm512_set1_ps(e); - Pow pow; - size_t i = 0; - for (; i < aligned; i += F) - Avx512f::Store(dst + i, pow(Avx512f::Load(src + i), _e)); - if (i < size) - { - __mmask16 tail = TailMask16(size - i); - Avx512f::Store(dst + i, pow(Avx512f::Load(src + i, tail), _e), tail); - } - } - - void NeuralPow(const float * src, size_t size, const float * exponent, float * dst) - { -#if defined(_MSC_VER) && _MSC_VER <= 1912 - Avx2::NeuralPow(src, size, exponent, dst); -#else - if (Aligned(src) && Aligned(dst)) - NeuralPow(src, size, exponent, dst); - else - NeuralPow(src, size, exponent, dst); -#endif - } - - template SIMD_INLINE void NeuralUpdateWeights(const float * x, const __m512 & a, const __m512 & b, float * d, float * w, __mmask16 m) - { - __m512 _x = Load(x, m); - __m512 _d = Load(d, m); - _d = _mm512_fmadd_ps(a, _d, _mm512_mul_ps(b, _x)); - Store(d, _d, m); - __m512 _w = Load(w, m); - Store(w, _mm512_add_ps(_w, _d), m); - } - - template SIMD_INLINE void NeuralUpdateWeights(const float * x, size_t offset, const __m512 & a, const __m512 & b, float * d, float * w, __mmask16 m = -1) - { - NeuralUpdateWeights(x + offset, a, b, d + offset, w + offset, m); - } - - template SIMD_INLINE void NeuralUpdateWeights(const float * x, size_t size, const float & a, const float & b, float * d, float * w) - { - if (align) - assert(Aligned(x) && Aligned(d) && Aligned(w)); - - __m512 _a = _mm512_set1_ps(a); - __m512 _b = _mm512_set1_ps(b); - size_t partialAlignedSize = AlignLo(size, F); - size_t fullAlignedSize = AlignLo(size, QF); - size_t i = 0; - for (; i < fullAlignedSize; i += QF) - { - NeuralUpdateWeights(x, i + F * 0, _a, _b, d, w); - NeuralUpdateWeights(x, i + F * 1, _a, _b, d, w); - NeuralUpdateWeights(x, i + F * 2, _a, _b, d, w); - NeuralUpdateWeights(x, i + F * 3, _a, _b, d, w); - } - for (; i < partialAlignedSize; i += F) - NeuralUpdateWeights(x, i, _a, _b, d, w); - if (i < size) - { - __mmask16 tailMask = __mmask16(-1) >> (F + i - size); - NeuralUpdateWeights(x, i, _a, _b, d, w, tailMask); - } - } - - void NeuralUpdateWeights(const float * x, size_t size, const float * a, const float * b, float * d, float * w) - { - if (Aligned(x) && Aligned(d) && Aligned(w)) - NeuralUpdateWeights(x, size, *a, *b, d, w); - else - NeuralUpdateWeights(x, size, *a, *b, d, w); - } - - template SIMD_INLINE void AdaptiveGradientUpdate(const float * delta, const __m512 & 
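Per element, NeuralUpdateWeights above is the classic momentum-style delta update: the velocity d is decayed by a, fed by b*x, and then added into the weight. Scalar form (illustrative):

#include <cstddef>

// Scalar restatement of NeuralUpdateWeights: d = a*d + b*x; w += d.
void UpdateWeights_Reference(const float* x, size_t size, float a, float b,
                             float* d, float* w)
{
    for (size_t i = 0; i < size; ++i)
    {
        d[i] = a * d[i] + b * x[i];
        w[i] += d[i];
    }
}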
norm, const __m512 & alpha, const __m512 & epsilon, float * gradient, float * weight, __mmask16 m) - { - __m512 _delta = Load(delta, m); - __m512 d = _mm512_mul_ps(_delta, norm); - __m512 _gradient = Load(gradient, m); - _gradient = _mm512_fmadd_ps(d, d, _gradient); - Store(gradient, _gradient, m); - __m512 _weight = Load(weight, m); - Store(weight, _mm512_sub_ps(_weight, _mm512_mul_ps(_mm512_mul_ps(alpha, d), Rsqrt14(_mm512_add_ps(_gradient, epsilon)))), m); - } - - template SIMD_INLINE void AdaptiveGradientUpdate(const float * delta, size_t offset, const __m512 & norm, const __m512 & alpha, const __m512 & epsilon, float * gradient, float * weight, __mmask16 m = -1) - { - AdaptiveGradientUpdate(delta + offset, norm, alpha, epsilon, gradient + offset, weight + offset, m); - } - - template void NeuralAdaptiveGradientUpdate(const float * delta, size_t size, size_t batch, const float * alpha, const float * epsilon, float * gradient, float * weight) - { - if (align) - assert(Aligned(delta) && Aligned(gradient) && Aligned(weight)); - - const float norm = (float)(1.0 / batch); - __m512 _norm = _mm512_set1_ps(norm); - __m512 _alpha = _mm512_set1_ps(*alpha); - __m512 _epsilon = _mm512_set1_ps(*epsilon); - size_t partialAlignedSize = AlignLo(size, F); - size_t fullAlignedSize = AlignLo(size, QF); - size_t i = 0; - for (; i < fullAlignedSize; i += QF) - { - AdaptiveGradientUpdate(delta, i + F * 0, _norm, _alpha, _epsilon, gradient, weight); - AdaptiveGradientUpdate(delta, i + F * 1, _norm, _alpha, _epsilon, gradient, weight); - AdaptiveGradientUpdate(delta, i + F * 2, _norm, _alpha, _epsilon, gradient, weight); - AdaptiveGradientUpdate(delta, i + F * 3, _norm, _alpha, _epsilon, gradient, weight); - } - for (; i < partialAlignedSize; i += F) - AdaptiveGradientUpdate(delta, i, _norm, _alpha, _epsilon, gradient, weight); - if (i < size) - { - __mmask16 tailMask = __mmask16(-1) >> (F + i - size); - AdaptiveGradientUpdate(delta, i, _norm, _alpha, _epsilon, gradient, weight, tailMask); - } - } - - void NeuralAdaptiveGradientUpdate(const float * delta, size_t size, size_t batch, const float * alpha, const float * epsilon, float * gradient, float * weight) - { - if (Aligned(delta) && Aligned(gradient) && Aligned(weight)) - NeuralAdaptiveGradientUpdate(delta, size, batch, alpha, epsilon, gradient, weight); - else - NeuralAdaptiveGradientUpdate(delta, size, batch, alpha, epsilon, gradient, weight); - } - - template SIMD_INLINE void LoadWeightsForward(const float * src, __m512 * dst) - { - for (size_t i = 0; i < size; ++i) - dst[i] = _mm512_set1_ps(src[i]); - } - - template SIMD_INLINE void LoadWeightsBackward(const float * src, __m512 * dst) - { - for (size_t i = 0; i < size; ++i) - dst[i] = _mm512_set1_ps(src[size - i - 1]); - } - - namespace - { - template struct Buffer - { - Buffer(size_t width) - { - _size = width * sizeof(float); - size_t stride = AlignHi(width + 2 * (count - 1), F); - size_t full = count*stride * sizeof(float); - _ptr = Allocate(full); - memset(_ptr, 0, full); - rows[0] = (float*)_ptr; - for (size_t i = 1; i < count; ++i) - rows[i] = rows[i - 1] + stride; - } - - void Update(const float * src) - { - float * tmp = rows[0]; - if (src == NULL) - memset(tmp + count - 1, 0, _size); - else - memcpy(tmp + count - 1, src, _size); - for (size_t i = 0; i < count - 1; ++i) - rows[i] = rows[i + 1]; - rows[count - 1] = tmp; - } - - ~Buffer() - { - Free(_ptr); - } - - float * rows[count]; - private: - size_t _size; - void * _ptr; - }; - } - - template struct Convolution - { - template static 
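NeuralAdaptiveGradientUpdate is AdaGrad: the squared, batch-normalized gradient is accumulated per weight, and the step is scaled by the inverse square root of that history (Rsqrt14 in the vector code). Scalar form (illustrative):

#include <cmath>
#include <cstddef>

// Scalar restatement of NeuralAdaptiveGradientUpdate (AdaGrad):
// with d = delta / batch: g += d*d; w -= alpha * d / sqrt(g + epsilon).
void AdaGrad_Reference(const float* delta, size_t size, size_t batch,
                       float alpha, float epsilon,
                       float* gradient, float* weight)
{
    const float norm = 1.0f / float(batch);
    for (size_t i = 0; i < size; ++i)
    {
        float d = delta[i] * norm;
        gradient[i] += d * d;
        weight[i] -= alpha * d / std::sqrt(gradient[i] + epsilon);
    }
}

Because gradient[] grows monotonically, the effective step size shrinks over time, which is the defining property of AdaGrad.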
SIMD_INLINE __m512 Forward(const float * src, size_t stride, const __m512 * weights, __mmask16 m = -1); - - template static SIMD_INLINE __m512 Backward(const Buffer & buffer, size_t offset, const __m512 * weights, __mmask16 m = -1); - - template static SIMD_INLINE void Sum1x1(const float * src0, size_t srcStride, const float * dst0, __m512 * sums, __mmask16 m = -1); - - template static SIMD_INLINE void Sum2x1(const float * src0, size_t srcStride, const float * dst0, size_t dstStride, __m512 * sums, __mmask16 m = -1); - - template static SIMD_INLINE void Sum1x2(const float * src0, size_t srcStride, const float * dst0, __m512 * sums); - - template static SIMD_INLINE void Sum2x2(const float * src0, size_t srcStride, const float * dst0, size_t dstStride, __m512 * sums); - }; - - template<> struct Convolution<2, 2> - { - template static SIMD_INLINE __m512 RowConvolution(const float * src, const __m512 * weights, __mmask16 m = -1) - { - __m512 src0 = Load(src, m); - __m512 src1 = Load(src + 1, m); - return _mm512_fmadd_ps(src0, weights[0], _mm512_mul_ps(src1, weights[1])); - } - - template static SIMD_INLINE __m512 Forward(const float * src, size_t stride, const __m512 * weights, __mmask16 m = -1) - { - __m512 row0 = RowConvolution(src, weights, m); - __m512 row1 = RowConvolution(src + stride, weights + 2, m); - return _mm512_add_ps(row0, row1); - } - - template static SIMD_INLINE __m512 Backward(const Buffer<2> & buffer, size_t offset, const __m512 * weights, __mmask16 m = -1) - { - __m512 row0 = RowConvolution(buffer.rows[0] + offset, weights + 0, m); - __m512 row1 = RowConvolution(buffer.rows[1] + offset, weights + 2, m); - return _mm512_add_ps(row0, row1); - } - - template static SIMD_INLINE void Sum1x1(const float * src0, size_t srcStride, const float * dst0, __m512 * sums, __mmask16 m = -1) - { - const float * src1 = src0 + srcStride; - __m512 dst00 = Load(dst0, m); - sums[0] = _mm512_fmadd_ps(dst00, (Load(src0 + 0, m)), sums[0]); - sums[1] = _mm512_fmadd_ps(dst00, (Load(src0 + 1, m)), sums[1]); - sums[2] = _mm512_fmadd_ps(dst00, (Load(src1 + 0, m)), sums[2]); - sums[3] = _mm512_fmadd_ps(dst00, (Load(src1 + 1, m)), sums[3]); - } - - template static SIMD_INLINE void Sum2x1(const float * src0, size_t srcStride, const float * dst0, size_t dstStride, __m512 * sums, __mmask16 m = -1) - { - const float * src1 = src0 + srcStride; - const float * src2 = src1 + srcStride; - const float * dst1 = dst0 + dstStride; - __m512 dst00 = Load(dst0, m); - __m512 src00 = Load(src0, m); - __m512 src01 = Load(src0 + 1, m); - __m512 src10 = Load(src1, m); - __m512 src11 = Load(src1 + 1, m); - sums[0] = _mm512_fmadd_ps(dst00, src00, sums[0]); - sums[1] = _mm512_fmadd_ps(dst00, src01, sums[1]); - sums[2] = _mm512_fmadd_ps(dst00, src10, sums[2]); - sums[3] = _mm512_fmadd_ps(dst00, src11, sums[3]); - __m512 dst10 = Load(dst1, m); - __m512 src20 = Load(src2, m); - __m512 src21 = Load(src2 + 1, m); - sums[0] = _mm512_fmadd_ps(dst10, src10, sums[0]); - sums[1] = _mm512_fmadd_ps(dst10, src11, sums[1]); - sums[2] = _mm512_fmadd_ps(dst10, src20, sums[2]); - sums[3] = _mm512_fmadd_ps(dst10, src21, sums[3]); - } - - template static SIMD_INLINE void Sum1x2(const float * src0, size_t srcStride, const float * dst0, __m512 * sums) - { - const float * src1 = src0 + srcStride; - __m512 dst00 = Load(dst0); - __m512 src00 = Load(src0); - __m512 src01 = Load(src0 + F); - __m512 src10 = Load(src1); - __m512 src11 = Load(src1 + F); - sums[0] = _mm512_fmadd_ps(dst00, src00, sums[0]); - sums[1] = _mm512_fmadd_ps(dst00, Alignr<1>(src00, 
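Convolution<2, 2>::Forward above evaluates 16 adjacent outputs of a 2x2 correlation at once; RowConvolution handles one kernel row using two loads shifted by a single element, and each weights[i] is a broadcast scalar (see LoadWeightsForward). Per output pixel it reduces to (scalar form; illustrative):

#include <cstddef>

// Scalar restatement of Convolution<2,2>::Forward for one output row of
// `width` pixels: each output is the 2x2 window dotted with weights[4].
void Conv2x2Forward_Reference(const float* src, size_t stride,
                              const float* weights, float* dst, size_t width)
{
    for (size_t x = 0; x < width; ++x)
        dst[x] = src[x]              * weights[0]
               + src[x + 1]          * weights[1]
               + src[stride + x]     * weights[2]
               + src[stride + x + 1] * weights[3];
}

The Sum* members run the same stencil in reverse for training: they accumulate per-tap weight gradients from a destination-gradient row, which is why they hold one __m512 accumulator per kernel tap.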
src01), sums[1]); - sums[2] = _mm512_fmadd_ps(dst00, src10, sums[2]); - sums[3] = _mm512_fmadd_ps(dst00, Alignr<1>(src10, src11), sums[3]); - __m512 dst10 = Load(dst0 + F); - __m512 src02 = Load(src0 + F + 1); - __m512 src12 = Load(src1 + F + 1); - sums[0] = _mm512_fmadd_ps(dst10, src01, sums[0]); - sums[1] = _mm512_fmadd_ps(dst10, src02, sums[1]); - sums[2] = _mm512_fmadd_ps(dst10, src11, sums[2]); - sums[3] = _mm512_fmadd_ps(dst10, src12, sums[3]); - } - - template static SIMD_INLINE void Sum2x2(const float * src0, size_t srcStride, const float * dst0, size_t dstStride, __m512 * sums) - { - const float * src1 = src0 + srcStride; - const float * src2 = src1 + srcStride; - const float * dst1 = dst0 + dstStride; - - __m512 dst00 = Load(dst0); - __m512 src000 = Load(src0); - __m512 src010 = Load(src0 + F); - __m512 src100 = Load(src1); - __m512 src110 = Load(src1 + F); - __m512 src101 = Alignr<1>(src100, src110); - sums[0] = _mm512_fmadd_ps(dst00, src000, sums[0]); - sums[1] = _mm512_fmadd_ps(dst00, Alignr<1>(src000, src010), sums[1]); - sums[2] = _mm512_fmadd_ps(dst00, src100, sums[2]); - sums[3] = _mm512_fmadd_ps(dst00, src101, sums[3]); - - __m512 dst01 = Load(dst0 + F); - __m512 src011 = Load(src0 + F + 1); - __m512 src111 = Load(src1 + F + 1); - sums[0] = _mm512_fmadd_ps(dst01, src010, sums[0]); - sums[1] = _mm512_fmadd_ps(dst01, src011, sums[1]); - sums[2] = _mm512_fmadd_ps(dst01, src110, sums[2]); - sums[3] = _mm512_fmadd_ps(dst01, src111, sums[3]); - - __m512 dst10 = Load(dst1); - __m512 src200 = Load(src2); - __m512 src210 = Load(src2 + F); - sums[0] = _mm512_fmadd_ps(dst10, src100, sums[0]); - sums[1] = _mm512_fmadd_ps(dst10, src101, sums[1]); - sums[2] = _mm512_fmadd_ps(dst10, src200, sums[2]); - sums[3] = _mm512_fmadd_ps(dst10, Alignr<1>(src200, src210), sums[3]); - - __m512 dst11 = Load(dst1 + F); - __m512 src211 = Load(src2 + F + 1); - sums[0] = _mm512_fmadd_ps(dst11, src110, sums[0]); - sums[1] = _mm512_fmadd_ps(dst11, src111, sums[1]); - sums[2] = _mm512_fmadd_ps(dst11, src210, sums[2]); - sums[3] = _mm512_fmadd_ps(dst11, src211, sums[3]); - } - }; - - template<> struct Convolution<3, 3> - { - template static SIMD_INLINE __m512 RowConvolution(const float * src, const __m512 * weights, __mmask16 m = -1) - { - __m512 src0 = Load(src, m); - __m512 src1 = Load(src + 1, m); - __m512 src2 = Load(src + 2, m); - return _mm512_fmadd_ps(src0, weights[0], _mm512_fmadd_ps(src1, weights[1], _mm512_mul_ps(src2, weights[2]))); - } - - template static SIMD_INLINE __m512 Forward(const float * src, size_t stride, const __m512 * weights, __mmask16 m = -1) - { - __m512 row0 = RowConvolution(src, weights, m); - __m512 row1 = RowConvolution(src + stride, weights + 3, m); - __m512 row2 = RowConvolution(src + 2 * stride, weights + 6, m); - return _mm512_add_ps(_mm512_add_ps(row0, row1), row2); - } - - template static SIMD_INLINE __m512 Backward(const Buffer<3> & buffer, size_t offset, const __m512 * weights, __mmask16 m = -1) - { - __m512 row0 = RowConvolution(buffer.rows[0] + offset, weights + 0, m); - __m512 row1 = RowConvolution(buffer.rows[1] + offset, weights + 3, m); - __m512 row2 = RowConvolution(buffer.rows[2] + offset, weights + 6, m); - return _mm512_add_ps(_mm512_add_ps(row0, row1), row2); - } - - template static SIMD_INLINE void Sum1x1(const float * src0, size_t srcStride, const float * dst0, __m512 * sums, __mmask16 m = -1) - { - const float * src1 = src0 + srcStride; - const float * src2 = src1 + srcStride; - __m512 dst00 = Load(dst0, m); - __m512 src00 = Load(src0); - __m512 src0f = 
Load(src0 + F); - sums[0] = _mm512_fmadd_ps(dst00, (Alignr<0, mask>(src00, src0f, m)), sums[0]); - sums[1] = _mm512_fmadd_ps(dst00, (Alignr<1, mask>(src00, src0f, m)), sums[1]); - sums[2] = _mm512_fmadd_ps(dst00, (Alignr<2, mask>(src00, src0f, m)), sums[2]); - __m512 src10 = Load(src1); - __m512 src1f = Load(src1 + F); - sums[3] = _mm512_fmadd_ps(dst00, (Alignr<0, mask>(src10, src1f, m)), sums[3]); - sums[4] = _mm512_fmadd_ps(dst00, (Alignr<1, mask>(src10, src1f, m)), sums[4]); - sums[5] = _mm512_fmadd_ps(dst00, (Alignr<2, mask>(src10, src1f, m)), sums[5]); - __m512 src20 = Load(src2); - __m512 src2f = Load(src2 + F); - sums[6] = _mm512_fmadd_ps(dst00, (Alignr<0, mask>(src20, src2f, m)), sums[6]); - sums[7] = _mm512_fmadd_ps(dst00, (Alignr<1, mask>(src20, src2f, m)), sums[7]); - sums[8] = _mm512_fmadd_ps(dst00, (Alignr<2, mask>(src20, src2f, m)), sums[8]); - } - - template static SIMD_INLINE void Sum2x1(const float * src0, size_t srcStride, const float * dst0, size_t dstStride, __m512 * sums, __mmask16 m = -1) - { - const float * dst1 = dst0 + dstStride; - const float * src1 = src0 + srcStride; - const float * src2 = src1 + srcStride; - const float * src3 = src2 + srcStride; - __m512 dst00 = Load(dst0, m); - __m512 src00 = Load(src0); - __m512 src0f = Load(src0 + F); - sums[0] = _mm512_fmadd_ps(dst00, (Alignr<0, mask>(src00, src0f, m)), sums[0]); - sums[1] = _mm512_fmadd_ps(dst00, (Alignr<1, mask>(src00, src0f, m)), sums[1]); - sums[2] = _mm512_fmadd_ps(dst00, (Alignr<2, mask>(src00, src0f, m)), sums[2]); - __m512 dst10 = Load(dst1, m); - __m512 src10 = Load(src1); - __m512 src1f = Load(src1 + F); - sums[0] = _mm512_fmadd_ps(dst10, Mask(src10, m), sums[0]); - sums[3] = _mm512_fmadd_ps(dst00, Mask(src10, m), sums[3]); - __m512 src11 = Alignr<1, mask>(src10, src1f, m); - sums[1] = _mm512_fmadd_ps(dst10, src11, sums[1]); - sums[4] = _mm512_fmadd_ps(dst00, src11, sums[4]); - __m512 src12 = Alignr<2, mask>(src10, src1f, m); - sums[2] = _mm512_fmadd_ps(dst10, src12, sums[2]); - sums[5] = _mm512_fmadd_ps(dst00, src12, sums[5]); - __m512 src20 = Load(src2); - __m512 src2f = Load(src2 + F); - sums[3] = _mm512_fmadd_ps(dst10, Mask(src20, m), sums[3]); - sums[6] = _mm512_fmadd_ps(dst00, Mask(src20, m), sums[6]); - __m512 src21 = Alignr<1, mask>(src20, src2f, m); - sums[4] = _mm512_fmadd_ps(dst10, src21, sums[4]); - sums[7] = _mm512_fmadd_ps(dst00, src21, sums[7]); - __m512 src22 = Alignr<2, mask>(src20, src2f, m); - sums[5] = _mm512_fmadd_ps(dst10, src22, sums[5]); - sums[8] = _mm512_fmadd_ps(dst00, src22, sums[8]); - __m512 src30 = Load(src3); - __m512 src3f = Load(src3 + F); - sums[6] = _mm512_fmadd_ps(dst10, (Alignr<0, mask>(src30, src3f, m)), sums[6]); - sums[7] = _mm512_fmadd_ps(dst10, (Alignr<1, mask>(src30, src3f, m)), sums[7]); - sums[8] = _mm512_fmadd_ps(dst10, (Alignr<2, mask>(src30, src3f, m)), sums[8]); - } - }; - - template<> struct Convolution<4, 4> - { - template static SIMD_INLINE __m512 RowConvolution(const float * src, const __m512 * weights, __mmask16 m = -1) - { - __m512 src0 = Load(src); - __m512 srcf = Load(src + F); - __m512 sum0 = _mm512_fmadd_ps(Alignr<0>(src0, srcf), weights[0], _mm512_mul_ps(Alignr<1>(src0, srcf), weights[1])); - __m512 sum1 = _mm512_fmadd_ps(Alignr<2>(src0, srcf), weights[2], _mm512_mul_ps(Alignr<3>(src0, srcf), weights[3])); - return _mm512_add_ps(sum0, sum1); - } - - template static SIMD_INLINE __m512 Forward(const float * src, size_t stride, const __m512 * weights, __mmask16 m = -1) - { - __m512 row0 = RowConvolution(src, weights, m); - __m512 row1 = 
RowConvolution(src + stride, weights + 4, m); - __m512 row2 = RowConvolution(src + 2 * stride, weights + 8, m); - __m512 row3 = RowConvolution(src + 3 * stride, weights + 12, m); - return _mm512_add_ps(_mm512_add_ps(row0, row1), _mm512_add_ps(row2, row3)); - } - - template static SIMD_INLINE __m512 Backward(const Buffer<4> & buffer, size_t offset, const __m512 * weights, __mmask16 m = -1) - { - __m512 row0 = RowConvolution(buffer.rows[0] + offset, weights + 0, m); - __m512 row1 = RowConvolution(buffer.rows[1] + offset, weights + 4, m); - __m512 row2 = RowConvolution(buffer.rows[2] + offset, weights + 8, m); - __m512 row3 = RowConvolution(buffer.rows[3] + offset, weights + 12, m); - return _mm512_add_ps(_mm512_add_ps(row0, row1), _mm512_add_ps(row2, row3)); - } - - template static SIMD_INLINE void Sum1x1(const float * src0, size_t srcStride, const float * dst0, __m512 * sums, __mmask16 m = -1) - { - const float * src1 = src0 + srcStride; - const float * src2 = src1 + srcStride; - const float * src3 = src2 + srcStride; - __m512 dst00 = Load(dst0, m); - __m512 src00 = Load(src0); - __m512 src0f = Load(src0 + F); - sums[0] = _mm512_fmadd_ps(dst00, (Alignr<0, mask>(src00, src0f, m)), sums[0]); - sums[1] = _mm512_fmadd_ps(dst00, (Alignr<1, mask>(src00, src0f, m)), sums[1]); - sums[2] = _mm512_fmadd_ps(dst00, (Alignr<2, mask>(src00, src0f, m)), sums[2]); - sums[3] = _mm512_fmadd_ps(dst00, (Alignr<3, mask>(src00, src0f, m)), sums[3]); - __m512 src10 = Load(src1); - __m512 src1f = Load(src1 + F); - sums[4] = _mm512_fmadd_ps(dst00, (Alignr<0, mask>(src10, src1f, m)), sums[4]); - sums[5] = _mm512_fmadd_ps(dst00, (Alignr<1, mask>(src10, src1f, m)), sums[5]); - sums[6] = _mm512_fmadd_ps(dst00, (Alignr<2, mask>(src10, src1f, m)), sums[6]); - sums[7] = _mm512_fmadd_ps(dst00, (Alignr<3, mask>(src10, src1f, m)), sums[7]); - __m512 src20 = Load(src2); - __m512 src2f = Load(src2 + F); - sums[8] = _mm512_fmadd_ps(dst00, (Alignr<0, mask>(src20, src2f, m)), sums[8]); - sums[9] = _mm512_fmadd_ps(dst00, (Alignr<1, mask>(src20, src2f, m)), sums[9]); - sums[10] = _mm512_fmadd_ps(dst00, (Alignr<2, mask>(src20, src2f, m)), sums[10]); - sums[11] = _mm512_fmadd_ps(dst00, (Alignr<3, mask>(src20, src2f, m)), sums[11]); - __m512 src30 = Load(src3); - __m512 src3f = Load(src3 + F); - sums[12] = _mm512_fmadd_ps(dst00, (Alignr<0, mask>(src30, src3f, m)), sums[12]); - sums[13] = _mm512_fmadd_ps(dst00, (Alignr<1, mask>(src30, src3f, m)), sums[13]); - sums[14] = _mm512_fmadd_ps(dst00, (Alignr<2, mask>(src30, src3f, m)), sums[14]); - sums[15] = _mm512_fmadd_ps(dst00, (Alignr<3, mask>(src30, src3f, m)), sums[15]); - } - - template static SIMD_INLINE void Sum2x1(const float * src0, size_t srcStride, const float * dst0, size_t dstStride, __m512 * sums, __mmask16 m = -1) - { - const float * dst1 = dst0 + dstStride; - const float * src1 = src0 + srcStride; - const float * src2 = src1 + srcStride; - const float * src3 = src2 + srcStride; - const float * src4 = src3 + srcStride; - __m512 dst00 = Load(dst0, m); - __m512 src00 = Load(src0); - __m512 src0f = Load(src0 + F); - sums[0] = _mm512_fmadd_ps(dst00, (Alignr<0, mask>(src00, src0f, m)), sums[0]); - sums[1] = _mm512_fmadd_ps(dst00, (Alignr<1, mask>(src00, src0f, m)), sums[1]); - sums[2] = _mm512_fmadd_ps(dst00, (Alignr<2, mask>(src00, src0f, m)), sums[2]); - sums[3] = _mm512_fmadd_ps(dst00, (Alignr<3, mask>(src00, src0f, m)), sums[3]); - __m512 dst10 = Load(dst1, m); - __m512 src10 = Load(src1); - __m512 src1f = Load(src1 + F); - sums[0] = _mm512_fmadd_ps(dst10, Mask(src10, m), 
sums[0]); - sums[4] = _mm512_fmadd_ps(dst00, Mask(src10, m), sums[4]); - __m512 src11 = Alignr<1, mask>(src10, src1f, m); - sums[1] = _mm512_fmadd_ps(dst10, src11, sums[1]); - sums[5] = _mm512_fmadd_ps(dst00, src11, sums[5]); - __m512 src12 = Alignr<2, mask>(src10, src1f, m); - sums[2] = _mm512_fmadd_ps(dst10, src12, sums[2]); - sums[6] = _mm512_fmadd_ps(dst00, src12, sums[6]); - __m512 src13 = Alignr<3, mask>(src10, src1f, m); - sums[3] = _mm512_fmadd_ps(dst10, src13, sums[3]); - sums[7] = _mm512_fmadd_ps(dst00, src13, sums[7]); - __m512 src20 = Load(src2); - __m512 src2f = Load(src2 + F); - sums[4] = _mm512_fmadd_ps(dst10, Mask(src20, m), sums[4]); - sums[8] = _mm512_fmadd_ps(dst00, Mask(src20, m), sums[8]); - __m512 src21 = Alignr<1, mask>(src20, src2f, m); - sums[5] = _mm512_fmadd_ps(dst10, src21, sums[5]); - sums[9] = _mm512_fmadd_ps(dst00, src21, sums[9]); - __m512 src22 = Alignr<2, mask>(src20, src2f, m); - sums[6] = _mm512_fmadd_ps(dst10, src22, sums[6]); - sums[10] = _mm512_fmadd_ps(dst00, src22, sums[10]); - __m512 src23 = Alignr<3, mask>(src20, src2f, m); - sums[7] = _mm512_fmadd_ps(dst10, src23, sums[7]); - sums[11] = _mm512_fmadd_ps(dst00, src23, sums[11]); - __m512 src30 = Load(src3); - __m512 src3f = Load(src3 + F); - sums[8] = _mm512_fmadd_ps(dst10, Mask(src30, m), sums[8]); - sums[12] = _mm512_fmadd_ps(dst00, Mask(src30, m), sums[12]); - __m512 src31 = Alignr<1, mask>(src30, src3f, m); - sums[9] = _mm512_fmadd_ps(dst10, src31, sums[9]); - sums[13] = _mm512_fmadd_ps(dst00, src31, sums[13]); - __m512 src32 = Alignr<2, mask>(src30, src3f, m); - sums[10] = _mm512_fmadd_ps(dst10, src32, sums[10]); - sums[14] = _mm512_fmadd_ps(dst00, src32, sums[14]); - __m512 src33 = Alignr<3, mask>(src30, src3f, m); - sums[11] = _mm512_fmadd_ps(dst10, src33, sums[11]); - sums[15] = _mm512_fmadd_ps(dst00, src33, sums[15]); - __m512 src40 = Load(src4); - __m512 src4f = Load(src4 + F); - sums[12] = _mm512_fmadd_ps(dst10, (Alignr<0, mask>(src40, src4f, m)), sums[12]); - sums[13] = _mm512_fmadd_ps(dst10, (Alignr<1, mask>(src40, src4f, m)), sums[13]); - sums[14] = _mm512_fmadd_ps(dst10, (Alignr<2, mask>(src40, src4f, m)), sums[14]); - sums[15] = _mm512_fmadd_ps(dst10, (Alignr<3, mask>(src40, src4f, m)), sums[15]); - } - }; - - template<> struct Convolution<5, 5> - { - template static SIMD_INLINE __m512 RowConvolution(const float * src, const __m512 * weights, __mmask16 m = -1) - { - __m512 src0 = Load(src); - __m512 srcf = Load(src + F); - __m512 sum0 = _mm512_fmadd_ps(Alignr<0>(src0, srcf), weights[0], _mm512_mul_ps(Alignr<1>(src0, srcf), weights[1])); - __m512 sum1 = _mm512_fmadd_ps(Alignr<2>(src0, srcf), weights[2], _mm512_mul_ps(Alignr<3>(src0, srcf), weights[3])); - return _mm512_fmadd_ps(Alignr<4>(src0, srcf), weights[4], _mm512_add_ps(sum0, sum1)); - } - - template static SIMD_INLINE __m512 Forward(const float * src, size_t stride, const __m512 * weights, __mmask16 m = -1) - { - return _mm512_add_ps((RowConvolution(src, weights, m)), - _mm512_add_ps(_mm512_add_ps((RowConvolution(src + stride, weights + 5, m)), - (RowConvolution(src + 2 * stride, weights + 10, m))), - _mm512_add_ps((RowConvolution(src + 3 * stride, weights + 15, m)), - (RowConvolution(src + 4 * stride, weights + 20, m))))); - } - - template static SIMD_INLINE __m512 Backward(const Buffer<5> & buffer, size_t offset, const __m512 * weights, __mmask16 m = -1) - { - return _mm512_add_ps((RowConvolution(buffer.rows[0] + offset, weights, m)), - _mm512_add_ps(_mm512_add_ps((RowConvolution(buffer.rows[1] + offset, weights + 5, m)), - 
(RowConvolution(buffer.rows[2] + offset, weights + 10, m))), - _mm512_add_ps((RowConvolution(buffer.rows[3] + offset, weights + 15, m)), - (RowConvolution(buffer.rows[4] + offset, weights + 20, m))))); - } - - template static SIMD_INLINE void Sum1x1(const float * src0, size_t srcStride, const float * dst0, __m512 * sums, __mmask16 m = -1) - { - const float * src1 = src0 + srcStride; - const float * src2 = src1 + srcStride; - const float * src3 = src2 + srcStride; - const float * src4 = src3 + srcStride; - __m512 dst00 = Load(dst0, m); - __m512 src00 = Load(src0); - __m512 src0f = Load(src0 + F); - sums[0] = _mm512_fmadd_ps(dst00, (Alignr<0, mask>(src00, src0f, m)), sums[0]); - sums[1] = _mm512_fmadd_ps(dst00, (Alignr<1, mask>(src00, src0f, m)), sums[1]); - sums[2] = _mm512_fmadd_ps(dst00, (Alignr<2, mask>(src00, src0f, m)), sums[2]); - sums[3] = _mm512_fmadd_ps(dst00, (Alignr<3, mask>(src00, src0f, m)), sums[3]); - sums[4] = _mm512_fmadd_ps(dst00, (Alignr<4, mask>(src00, src0f, m)), sums[4]); - __m512 src10 = Load(src1); - __m512 src1f = Load(src1 + F); - sums[5] = _mm512_fmadd_ps(dst00, (Alignr<0, mask>(src10, src1f, m)), sums[5]); - sums[6] = _mm512_fmadd_ps(dst00, (Alignr<1, mask>(src10, src1f, m)), sums[6]); - sums[7] = _mm512_fmadd_ps(dst00, (Alignr<2, mask>(src10, src1f, m)), sums[7]); - sums[8] = _mm512_fmadd_ps(dst00, (Alignr<3, mask>(src10, src1f, m)), sums[8]); - sums[9] = _mm512_fmadd_ps(dst00, (Alignr<4, mask>(src10, src1f, m)), sums[9]); - __m512 src20 = Load(src2); - __m512 src2f = Load(src2 + F); - sums[10] = _mm512_fmadd_ps(dst00, (Alignr<0, mask>(src20, src2f, m)), sums[10]); - sums[11] = _mm512_fmadd_ps(dst00, (Alignr<1, mask>(src20, src2f, m)), sums[11]); - sums[12] = _mm512_fmadd_ps(dst00, (Alignr<2, mask>(src20, src2f, m)), sums[12]); - sums[13] = _mm512_fmadd_ps(dst00, (Alignr<3, mask>(src20, src2f, m)), sums[13]); - sums[14] = _mm512_fmadd_ps(dst00, (Alignr<4, mask>(src20, src2f, m)), sums[14]); - __m512 src30 = Load(src3); - __m512 src3f = Load(src3 + F); - sums[15] = _mm512_fmadd_ps(dst00, (Alignr<0, mask>(src30, src3f, m)), sums[15]); - sums[16] = _mm512_fmadd_ps(dst00, (Alignr<1, mask>(src30, src3f, m)), sums[16]); - sums[17] = _mm512_fmadd_ps(dst00, (Alignr<2, mask>(src30, src3f, m)), sums[17]); - sums[18] = _mm512_fmadd_ps(dst00, (Alignr<3, mask>(src30, src3f, m)), sums[18]); - sums[19] = _mm512_fmadd_ps(dst00, (Alignr<4, mask>(src30, src3f, m)), sums[19]); - __m512 src40 = Load(src4); - __m512 src4f = Load(src4 + F); - sums[20] = _mm512_fmadd_ps(dst00, (Alignr<0, mask>(src40, src4f, m)), sums[20]); - sums[21] = _mm512_fmadd_ps(dst00, (Alignr<1, mask>(src40, src4f, m)), sums[21]); - sums[22] = _mm512_fmadd_ps(dst00, (Alignr<2, mask>(src40, src4f, m)), sums[22]); - sums[23] = _mm512_fmadd_ps(dst00, (Alignr<3, mask>(src40, src4f, m)), sums[23]); - sums[24] = _mm512_fmadd_ps(dst00, (Alignr<4, mask>(src40, src4f, m)), sums[24]); - } - - template static SIMD_INLINE void SumRow1(const float * src, const __m512 & dst, __m512 * sums, __mmask16 m) - { - __m512 src0 = Load(src + 0); - __m512 srcf = Load(src + F); - sums[0] = _mm512_fmadd_ps(dst, (Alignr<0, mask>(src0, srcf, m)), sums[0]); - sums[1] = _mm512_fmadd_ps(dst, (Alignr<1, mask>(src0, srcf, m)), sums[1]); - sums[2] = _mm512_fmadd_ps(dst, (Alignr<2, mask>(src0, srcf, m)), sums[2]); - sums[3] = _mm512_fmadd_ps(dst, (Alignr<3, mask>(src0, srcf, m)), sums[3]); - sums[4] = _mm512_fmadd_ps(dst, (Alignr<4, mask>(src0, srcf, m)), sums[4]); - } - - template static SIMD_INLINE void SumRow2(const float * src, const __m512 
& dst0, const __m512 & dst1, __m512 * sums, __mmask16 m) - { - __m512 src0 = Load<align>(src + 0); - __m512 srcf = Load<align>(src + F); - sums[0] = _mm512_fmadd_ps(dst1, Mask<mask>(src0, m), sums[0]); - sums[5] = _mm512_fmadd_ps(dst0, Mask<mask>(src0, m), sums[5]); - __m512 src1 = Alignr<1, mask>(src0, srcf, m); - sums[1] = _mm512_fmadd_ps(dst1, src1, sums[1]); - sums[6] = _mm512_fmadd_ps(dst0, src1, sums[6]); - __m512 src2 = Alignr<2, mask>(src0, srcf, m); - sums[2] = _mm512_fmadd_ps(dst1, src2, sums[2]); - sums[7] = _mm512_fmadd_ps(dst0, src2, sums[7]); - __m512 src3 = Alignr<3, mask>(src0, srcf, m); - sums[3] = _mm512_fmadd_ps(dst1, src3, sums[3]); - sums[8] = _mm512_fmadd_ps(dst0, src3, sums[8]); - __m512 src4 = Alignr<4, mask>(src0, srcf, m); - sums[4] = _mm512_fmadd_ps(dst1, src4, sums[4]); - sums[9] = _mm512_fmadd_ps(dst0, src4, sums[9]); - } - -
template <bool align, bool mask> static SIMD_INLINE void Sum2x1(const float * src, size_t srcStride, const float * dst, size_t dstStride, __m512 * sums, __mmask16 m = -1) - { - __m512 dst0 = Load<align, mask>(dst, m); - SumRow1<align, mask>(src, dst0, sums + 0, m); - __m512 dst1 = Load<align, mask>(dst + dstStride, m); - SumRow2<align, mask>(src + srcStride, dst0, dst1, sums + 0, m); - SumRow2<align, mask>(src + 2 * srcStride, dst0, dst1, sums + 5, m); - SumRow2<align, mask>(src + 3 * srcStride, dst0, dst1, sums + 10, m); - SumRow2<align, mask>(src + 4 * srcStride, dst0, dst1, sums + 15, m); - SumRow1<align, mask>(src + 5 * srcStride, dst1, sums + 20, m); - } - }; - -
template <bool align, size_t coreX, size_t coreY> void NeuralAddConvolutionForward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride) - { - size_t alignedWidth = AlignLo(width, F); - __mmask16 tailMask = __mmask16(-1) >> (F + alignedWidth - width); - __m512 _weights[coreX*coreY]; - LoadWeightsForward<coreX*coreY>(weights, _weights); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < alignedWidth; col += F) - { - __m512 sum = Convolution<coreX, coreY>::template Forward<align, false>(src + col, srcStride, _weights); - __m512 _dst = Load<align>(dst + col); - Store<align>(dst + col, _mm512_add_ps(_dst, sum)); - } - if (col < width) - { - __m512 sum = Convolution<coreX, coreY>::template Forward<align, true>(src + col, srcStride, _weights, tailMask); - __m512 _dst = Load<align, true>(dst + col, tailMask); - Store<align, true>(dst + col, _mm512_add_ps(_dst, sum), tailMask); - } - src += srcStride; - dst += dstStride; - } - } - -
void NeuralAddConvolution2x2Forward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride, F) && Aligned(dst) && Aligned(dstStride, F)) - NeuralAddConvolutionForward<true, 2, 2>(src, srcStride, width, height, weights, dst, dstStride); - else - NeuralAddConvolutionForward<false, 2, 2>(src, srcStride, width, height, weights, dst, dstStride); - } - -
void NeuralAddConvolution3x3Forward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride, F) && Aligned(dst) && Aligned(dstStride, F)) - NeuralAddConvolutionForward<true, 3, 3>(src, srcStride, width, height, weights, dst, dstStride); - else - NeuralAddConvolutionForward<false, 3, 3>(src, srcStride, width, height, weights, dst, dstStride); - } - -
void NeuralAddConvolution4x4Forward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride, F) && Aligned(dst) && Aligned(dstStride, F)) - NeuralAddConvolutionForward<true, 4, 4>(src, srcStride, width, height, weights, dst, dstStride); - else - NeuralAddConvolutionForward<false, 4, 4>(src, srcStride, width, height, weights, dst, dstStride); - } - -
void NeuralAddConvolution5x5Forward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride, F) && Aligned(dst) && Aligned(dstStride, F)) - NeuralAddConvolutionForward<true, 5, 5>(src, srcStride, width, height, weights, dst, dstStride); - else - NeuralAddConvolutionForward<false, 5, 5>(src, srcStride, width, height, weights, dst, dstStride); - } - -
template <bool condition> struct If - { - template <bool align> static SIMD_INLINE void AddMultiplied(const float * src, size_t aligned, size_t partial, size_t full, float value, float * dst) - { - Avx512f::AddMultiplied<align>(src, aligned, partial, full, value, dst); - } - }; - -
template<> struct If<false> - { - template <bool align> static SIMD_INLINE void AddMultiplied(const float * src, size_t aligned, size_t partial, size_t full, float value, float * dst) - { - } - }; - -
template <bool align, size_t coreX, size_t coreY> void NeuralAddConvolutionBackwardSmall(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride) - { - size_t aligned = AlignLo(width, QF); - size_t partial = AlignLo(width, F); - for (size_t row = 0; row < height; ++row) - { - for (size_t dy = 0; dy < coreY; ++dy) - { - const float * w = weights + dy * coreX; - float * d = dst + dy*dstStride; - If < 0 < coreX > ::template AddMultiplied<align>(src, aligned, partial, width, w[0], d + 0); - If < 1 < coreX > ::template AddMultiplied<align>(src, aligned, partial, width, w[1], d + 1); - If < 2 < coreX > ::template AddMultiplied<align>(src, aligned, partial, width, w[2], d + 2); - If < 3 < coreX > ::template AddMultiplied<align>(src, aligned, partial, width, w[3], d + 3); - If < 4 < coreX > ::template AddMultiplied<align>(src, aligned, partial, width, w[4], d + 4); - } - src += srcStride; - dst += dstStride; - } - } - -
template <bool align, size_t coreX, size_t coreY> void NeuralAddConvolutionBackwardLarge(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride) - { - Buffer<coreY> buffer(width); - height += coreY - 1; - width += coreX - 1; - size_t alignedWidth = AlignLo(width, F); - __mmask16 tailMask = __mmask16(-1) >> (F + alignedWidth - width); - __m512 _weights[coreX*coreY]; - LoadWeightsBackward<coreX*coreY>(weights, _weights); - for (size_t row = 0; row < height; ++row) - { - buffer.Update(row <= height - coreY ? src : NULL); - size_t col = 0; - for (; col < alignedWidth; col += F) - { - __m512 sum = Convolution<coreX, coreY>::template Backward<align, false>(buffer, col, _weights); - __m512 _dst = Load<align>(dst + col); - Store<align>(dst + col, _mm512_add_ps(_dst, sum)); - } - if (col < width) - { - __m512 sum = Convolution<coreX, coreY>::template Backward<align, true>(buffer, col, _weights, tailMask); - __m512 _dst = Load<align, true>(dst + col, tailMask); - Store<align, true>(dst + col, _mm512_add_ps(_dst, sum), tailMask); - } - src += srcStride; - dst += dstStride; - } - } - -
template <bool align, size_t coreX, size_t coreY> void NeuralAddConvolutionBackward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride) - { - if (width*height < 1024) - NeuralAddConvolutionBackwardSmall<align, coreX, coreY>(src, srcStride, width, height, weights, dst, dstStride); - else - NeuralAddConvolutionBackwardLarge<align, coreX, coreY>(src, srcStride, width, height, weights, dst, dstStride); - } - -
void NeuralAddConvolution2x2Backward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride, F) && Aligned(dst) && Aligned(dstStride, F)) - NeuralAddConvolutionBackward<true, 2, 2>(src, srcStride, width, height, weights, dst, dstStride); - else - NeuralAddConvolutionBackward<false, 2, 2>(src, srcStride, width, height, weights, dst, dstStride); - } - -
void NeuralAddConvolution3x3Backward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride, F) && Aligned(dst) && Aligned(dstStride, F)) - NeuralAddConvolutionBackward<true, 3, 3>(src, srcStride, width, height, weights, dst, dstStride); - else - NeuralAddConvolutionBackward<false, 3, 3>(src, srcStride, width, height, weights, dst, dstStride); - } - -
void NeuralAddConvolution4x4Backward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride, F) && Aligned(dst) && Aligned(dstStride, F)) - NeuralAddConvolutionBackward<true, 4, 4>(src, srcStride, width, height, weights, dst, dstStride); - else - NeuralAddConvolutionBackward<false, 4, 4>(src, srcStride, width, height, weights, dst, dstStride); - } - -
void NeuralAddConvolution5x5Backward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride, F) && Aligned(dst) && Aligned(dstStride, F)) - NeuralAddConvolutionBackward<true, 5, 5>(src, srcStride, width, height, weights, dst, dstStride); - else - NeuralAddConvolutionBackward<false, 5, 5>(src, srcStride, width, height, weights, dst, dstStride); - } - -
SIMD_INLINE __m128 PartialSum(const __m512 & src) - { - __m128 lo = _mm_add_ps(_mm512_extractf32x4_ps(src, 0), _mm512_extractf32x4_ps(src, 1)); - __m128 hi = _mm_add_ps(_mm512_extractf32x4_ps(src, 2), _mm512_extractf32x4_ps(src, 3)); - return _mm_add_ps(lo, hi); - } - -
SIMD_INLINE void Add4ExtractedSums(const __m512 * src, float * dst) - { - __m128 s0 = PartialSum(src[0]); - __m128 s1 = PartialSum(src[1]); - __m128 s2 = PartialSum(src[2]); - __m128 s3 = PartialSum(src[3]); - __m128 sums = _mm_hadd_ps(_mm_hadd_ps(s0, s1), _mm_hadd_ps(s2, s3)); - _mm_storeu_ps(dst, _mm_add_ps(_mm_loadu_ps(dst), sums)); - } - -
template <bool align, size_t coreX, size_t coreY> SIMD_INLINE void NeuralAddConvolutionSum1x1(const float * src, size_t srcStride, const float * dst, size_t dstStride, size_t width, size_t height, float * sums) - { - size_t alignedWidth = Simd::AlignLo(width, F); - __mmask16 tailMask = __mmask16(-1) >> (F + alignedWidth - width); - __m512 
_sums[coreX*coreY]; - memset(_sums, 0, sizeof(_sums)); - size_t row = 0; - for (; row < height; ++row) - { - size_t col = 0; - for (; col < alignedWidth; col += F) - Convolution<coreX, coreY>::template Sum1x1<align, false>(src + col, srcStride, dst + col, _sums); - if (col < width) - Convolution<coreX, coreY>::template Sum1x1<align, true>(src + col, srcStride, dst + col, _sums, tailMask); - src += srcStride; - dst += dstStride; - } - size_t i = 0, n = Simd::AlignLo(coreX*coreY, 4); -#ifndef _MSC_VER - for (; i < n; i += 4) - Add4ExtractedSums(_sums + i, sums + i); -#endif - for (; i < coreX*coreY; ++i) - sums[i] += ExtractSum(_sums[i]); - } - -
template <bool align, size_t coreX, size_t coreY> SIMD_INLINE void NeuralAddConvolutionSum2x1(const float * src, size_t srcStride, const float * dst, size_t dstStride, size_t width, size_t height, float * sums) - { - size_t alignedHeight = Simd::AlignLo(height, 2); - size_t alignedWidth = Simd::AlignLo(width, F); - __mmask16 tailMask = __mmask16(-1) >> (F + alignedWidth - width); - __m512 _sums[coreX*coreY]; - memset(_sums, 0, sizeof(_sums)); - size_t row = 0; - for (; row < alignedHeight; row += 2) - { - size_t col = 0; - for (; col < alignedWidth; col += F) - Convolution<coreX, coreY>::template Sum2x1<align, false>(src + col, srcStride, dst + col, dstStride, _sums); - if (col < width) - Convolution<coreX, coreY>::template Sum2x1<align, true>(src + col, srcStride, dst + col, dstStride, _sums, tailMask); - src += 2 * srcStride; - dst += 2 * dstStride; - } - for (; row < height; ++row) - { - size_t col = 0; - for (; col < alignedWidth; col += F) - Convolution<coreX, coreY>::template Sum1x1<align, false>(src + col, srcStride, dst + col, _sums); - if (col < width) - Convolution<coreX, coreY>::template Sum1x1<align, true>(src + col, srcStride, dst + col, _sums, tailMask); - src += srcStride; - dst += dstStride; - } - size_t i = 0, n = Simd::AlignLo(coreX*coreY, 4); -#ifndef _MSC_VER - for (; i < n; i += 4) - Add4ExtractedSums(_sums + i, sums + i); -#endif - for (; i < coreX*coreY; ++i) - sums[i] += ExtractSum(_sums[i]); - } - -
template <bool align, size_t coreX, size_t coreY> SIMD_INLINE void NeuralAddConvolutionSum2x2(const float * src, size_t srcStride, const float * dst, size_t dstStride, size_t width, size_t height, float * sums) - { - size_t alignedHeight = Simd::AlignLo(height, 2); - size_t fullAlignedWidth = Simd::AlignLo(width - 1, DF); - size_t partialAlignedWidth = Simd::AlignLo(width, F); - __mmask16 tailMask = __mmask16(-1) >> (F + partialAlignedWidth - width); - __m512 _sums[coreX*coreY]; - memset(_sums, 0, sizeof(_sums)); - size_t row = 0; - for (; row < alignedHeight; row += 2) - { - size_t col = 0; - for (; col < fullAlignedWidth; col += DF) - Convolution<coreX, coreY>::template Sum2x2<align>(src + col, srcStride, dst + col, dstStride, _sums); - for (; col < partialAlignedWidth; col += F) - Convolution<coreX, coreY>::template Sum2x1<align, false>(src + col, srcStride, dst + col, dstStride, _sums); - if (col < width) - Convolution<coreX, coreY>::template Sum2x1<align, true>(src + col, srcStride, dst + col, dstStride, _sums, tailMask); - src += 2 * srcStride; - dst += 2 * dstStride; - } - for (; row < height; ++row) - { - size_t col = 0; - for (; col < fullAlignedWidth; col += DF) - Convolution<coreX, coreY>::template Sum1x2<align>(src + col, srcStride, dst + col, _sums); - for (; col < partialAlignedWidth; col += F) - Convolution<coreX, coreY>::template Sum1x1<align, false>(src + col, srcStride, dst + col, _sums); - if (col < width) - Convolution<coreX, coreY>::template Sum1x1<align, true>(src + col, srcStride, dst + col, _sums, tailMask); - src += srcStride; - dst += dstStride; - } - size_t i = 0, n = Simd::AlignLo(coreX*coreY, 4); -#ifndef _MSC_VER - for (; i < n; i += 4) - Add4ExtractedSums(_sums + i, sums + i); -#endif - for (; i < coreX*coreY; ++i) - sums[i] += ExtractSum(_sums[i]); - } - -
void NeuralAddConvolution2x2Sum(const float * src, size_t srcStride, const float * dst, size_t dstStride, size_t width, size_t height, float * sums) - { - if (Aligned(src) && Aligned(srcStride, F) && Aligned(dst) && Aligned(dstStride, F)) - NeuralAddConvolutionSum2x2<true, 2, 2>(src, srcStride, dst, dstStride, width, height, sums); - else - NeuralAddConvolutionSum2x2<false, 2, 2>(src, srcStride, dst, dstStride, width, height, sums); - } - -
void NeuralAddConvolution3x3Sum(const float * src, size_t srcStride, const float * dst, size_t dstStride, size_t width, size_t height, float * sums) - { - if (Aligned(src) && Aligned(srcStride, F) && Aligned(dst) && Aligned(dstStride, F)) - NeuralAddConvolutionSum2x1<true, 3, 3>(src, srcStride, dst, dstStride, width, height, sums); - else - NeuralAddConvolutionSum2x1<false, 3, 3>(src, srcStride, dst, dstStride, width, height, sums); - } - -
void NeuralAddConvolution4x4Sum(const float * src, size_t srcStride, const float * dst, size_t dstStride, size_t width, size_t height, float * sums) - { - if (Aligned(src) && Aligned(srcStride, F) && Aligned(dst) && Aligned(dstStride, F)) - NeuralAddConvolutionSum2x1<true, 4, 4>(src, srcStride, dst, dstStride, width, height, sums); - else - NeuralAddConvolutionSum2x1<false, 4, 4>(src, srcStride, dst, dstStride, width, height, sums); - } - -
void NeuralAddConvolution5x5Sum(const float * src, size_t srcStride, const float * dst, size_t dstStride, size_t width, size_t height, float * sums) - { - if (Aligned(src) && Aligned(srcStride, F) && Aligned(dst) && Aligned(dstStride, F)) - NeuralAddConvolutionSum2x1<true, 5, 5>(src, srcStride, dst, dstStride, width, height, sums); - else - NeuralAddConvolutionSum2x1<false, 5, 5>(src, srcStride, dst, dstStride, width, height, sums); - } - -
template <bool align> SIMD_INLINE __m512 Pooling1x1Max3x1Body(const float * src) - { - return _mm512_max_ps(_mm512_max_ps(Load<false>(src - 1), Load<align>(src)), Load<false>(src + 1)); - } - -
template <bool align> SIMD_INLINE void Pooling1x1Max3x3Body(const float * src, size_t stride, float * dst) - { - __m512 src0 = Pooling1x1Max3x1Body<align>(src - stride); - __m512 src1 = Pooling1x1Max3x1Body<align>(src); - __m512 src2 = Pooling1x1Max3x1Body<align>(src + stride); - Store<align>(dst, _mm512_max_ps(_mm512_max_ps(src0, src1), src2)); - } - -
template <bool align> SIMD_INLINE void Pooling1x1Max3x2Body(const float * src, size_t stride, float * dst) - { - __m512 src0 = Pooling1x1Max3x1Body<align>(src); - __m512 src1 = Pooling1x1Max3x1Body<align>(src + stride); - Store<align>(dst, _mm512_max_ps(src0, src1)); - } - -
__m512i K32_PERMUTE_NOSE = SIMD_MM512_SETR_EPI32(0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14); - -
template <bool align> SIMD_INLINE __m512 Pooling1x1Max3x1Nose(const float * src) - { - __m512 src1 = Load<align>(src); - __m512 src0 = _mm512_permutexvar_ps(K32_PERMUTE_NOSE, src1); - __m512 src2 = Load<false>(src + 1); - return _mm512_max_ps(_mm512_max_ps(src0, src1), src2); - } - -
template <bool align> SIMD_INLINE void Pooling1x1Max3x3Nose(const float * src, size_t stride, float * dst) - { - __m512 src0 = Pooling1x1Max3x1Nose<align>(src - stride); - __m512 src1 = Pooling1x1Max3x1Nose<align>(src); - __m512 src2 = Pooling1x1Max3x1Nose<align>(src + stride); - Store<align>(dst, _mm512_max_ps(_mm512_max_ps(src0, src1), src2)); - } - template <bool align> SIMD_INLINE void Pooling1x1Max3x2Nose(const float * src, size_t stride, float * dst) - { - __m512 src0 = Pooling1x1Max3x1Nose<align>(src); - __m512 src1 = Pooling1x1Max3x1Nose<align>(src + stride); - Store<align>(dst, _mm512_max_ps(src0, src1)); - } - -
__m512i K32_PERMUTE_TAIL = SIMD_MM512_SETR_EPI32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15); - -
template <bool align> SIMD_INLINE __m512 Pooling1x1Max3x1Tail(const float * src) - { - __m512 src0 = Load<false>(src - 1); - __m512 src1 = Load<align>(src); - 
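// The next statement builds the in-register "right neighbour" for the last vector of a row:
// K32_PERMUTE_TAIL = {1, 2, ..., 15, 15} shifts every lane one position left and duplicates
// lane 15, so for the final column max(src[i - 1], src[i], src[i + 1]) clamps to the image
// border instead of reading past it. It mirrors K32_PERMUTE_NOSE = {0, 0, 1, ..., 14} above,
// which duplicates lane 0 to clamp the left border in Pooling1x1Max3x1Nose.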
__m512 src2 = _mm512_permutexvar_ps(K32_PERMUTE_TAIL, src1); - return _mm512_max_ps(_mm512_max_ps(src0, src1), src2); - } - - template SIMD_INLINE void Pooling1x1Max3x3Tail(const float * src, size_t stride, float * dst) - { - __m512 src0 = Pooling1x1Max3x1Tail(src - stride); - __m512 src1 = Pooling1x1Max3x1Tail(src); - __m512 src2 = Pooling1x1Max3x1Tail(src + stride); - Store(dst, _mm512_max_ps(_mm512_max_ps(src0, src1), src2)); - } - - template SIMD_INLINE void Pooling1x1Max3x2Tail(const float * src, size_t stride, float * dst) - { - __m512 src0 = Pooling1x1Max3x1Tail(src); - __m512 src1 = Pooling1x1Max3x1Tail(src + stride); - Store(dst, _mm512_max_ps(src0, src1)); - } - - template void NeuralPooling1x1Max3x3(const float * src, size_t srcStride, size_t width, size_t height, float * dst, size_t dstStride) - { - assert(width > F && height > 1); - - size_t alignedWidth = AlignHi(width, F) - F; - height -= 1; - - Pooling1x1Max3x2Nose(src, srcStride, dst); - for (size_t col = F; col < alignedWidth; col += F) - Pooling1x1Max3x2Body(src + col, srcStride, dst + col); - Pooling1x1Max3x2Tail(src + width - F, srcStride, dst + width - F); - - for (size_t row = 1; row < height; ++row) - { - src += srcStride; - dst += dstStride; - Pooling1x1Max3x3Nose(src, srcStride, dst); - for (size_t col = F; col < alignedWidth; col += F) - Pooling1x1Max3x3Body(src + col, srcStride, dst + col); - Pooling1x1Max3x3Tail(src + width - F, srcStride, dst + width - F); - } - - dst += dstStride; - Pooling1x1Max3x2Nose(src, srcStride, dst); - for (size_t col = F; col < alignedWidth; col += F) - Pooling1x1Max3x2Body(src + col, srcStride, dst + col); - Pooling1x1Max3x2Tail(src + width - F, srcStride, dst + width - F); - } - - void NeuralPooling1x1Max3x3(const float * src, size_t srcStride, size_t width, size_t height, float * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride, F) && Aligned(dst) && Aligned(dstStride, F)) - NeuralPooling1x1Max3x3(src, srcStride, width, height, dst, dstStride); - else - NeuralPooling1x1Max3x3(src, srcStride, width, height, dst, dstStride); - } - - __m512i K32_PERMUTE_2_0 = SIMD_MM512_SETR_EPI32(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30); - __m512i K32_PERMUTE_2_1 = SIMD_MM512_SETR_EPI32(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31); - __m512i K32_PERMUTE_2_2 = SIMD_MM512_SETR_EPI32(2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 0); - - template SIMD_INLINE __m512 Pooling2x2Max2x2(const float * src, size_t stride) - { - __m512 lo = _mm512_max_ps(Load(src + 0), Load(src + stride + 0)); - __m512 hi = _mm512_max_ps(Load(src + F), Load(src + stride + F)); - __m512 _lo = _mm512_shuffle_f32x4(lo, hi, 0x88); - __m512 _hi = _mm512_shuffle_f32x4(lo, hi, 0xDD); - return _mm512_max_ps(_mm512_shuffle_ps(_lo, _hi, 0x88), _mm512_shuffle_ps(_lo, _hi, 0xDD)); - } - - template SIMD_INLINE __m512 Pooling2x2Max2(const float * src) - { - __m512 lo = Load(src + 0); - __m512 hi = Load(src + F); - __m512 _lo = _mm512_shuffle_f32x4(lo, hi, 0x88); - __m512 _hi = _mm512_shuffle_f32x4(lo, hi, 0xDD); - return _mm512_max_ps(_mm512_shuffle_ps(_lo, _hi, 0x88), _mm512_shuffle_ps(_lo, _hi, 0xDD)); - } - - template void NeuralPooling2x2Max2x2(const float * src, size_t srcStride, size_t width, size_t height, float * dst, size_t dstStride) - { - size_t heightEven = Simd::AlignLo(height, 2); - size_t widthEven = Simd::AlignLo(width, 2); - size_t alignedWidth = AlignLo(width, DF); - for (size_t row = 0; row < heightEven; row += 2) - { - for (size_t col = 0; col < alignedWidth; col 
+= DF) - Store(dst + (col >> 1), Pooling2x2Max2x2(src + col, srcStride)); - if (widthEven - alignedWidth) - { - size_t col = widthEven - DF; - Store(dst + (col >> 1), Pooling2x2Max2x2(src + col, srcStride)); - } - if (width - widthEven) - dst[widthEven >> 1] = Simd::Max(src[widthEven], src[widthEven + srcStride]); - src += 2 * srcStride; - dst += dstStride; - } - if (height - heightEven) - { - for (size_t col = 0; col < alignedWidth; col += DF) - Store(dst + (col >> 1), Pooling2x2Max2(src + col)); - if (widthEven - alignedWidth) - { - size_t col = widthEven - DF; - Store(dst + (col >> 1), Pooling2x2Max2(src + col)); - } - if (width - widthEven) - dst[widthEven >> 1] = src[widthEven]; - } - } - - void NeuralPooling2x2Max2x2(const float * src, size_t srcStride, size_t width, size_t height, float * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride, F) && Aligned(dst) && Aligned(dstStride, F)) - NeuralPooling2x2Max2x2(src, srcStride, width, height, dst, dstStride); - else - NeuralPooling2x2Max2x2(src, srcStride, width, height, dst, dstStride); - } - - template SIMD_INLINE __m512 Pooling2x2Max1x3(const float * src, size_t stride) - { - return _mm512_max_ps(_mm512_max_ps(Load(src), Load(src + stride)), Load(src + 2 * stride)); - } - - template SIMD_INLINE __m512 Pooling2x2Max3x3(const float * src, size_t stride) - { - __m512 s0 = Pooling2x2Max1x3(src + 0, stride); - __m512 sf = Pooling2x2Max1x3(src + F, stride); - __m512 p0 = _mm512_permutex2var_ps(s0, K32_PERMUTE_2_0, sf); - __m512 p1 = _mm512_permutex2var_ps(s0, K32_PERMUTE_2_1, sf); - __m512 p2 = _mm512_permutex2var_ps(s0, K32_PERMUTE_2_2, sf); - return _mm512_max_ps(_mm512_max_ps(p0, p1), p2); - } - - template SIMD_INLINE __m512 Pooling2x2Max1x2(const float * src, size_t stride) - { - return _mm512_max_ps(Load(src), Load(src + stride)); - } - - template SIMD_INLINE __m512 Pooling2x2Max3x2(const float * src, size_t stride) - { - __m512 s0 = Pooling2x2Max1x2(src + 0, stride); - __m512 sf = Pooling2x2Max1x2(src + F, stride); - __m512 p0 = _mm512_permutex2var_ps(s0, K32_PERMUTE_2_0, sf); - __m512 p1 = _mm512_permutex2var_ps(s0, K32_PERMUTE_2_1, sf); - __m512 p2 = _mm512_permutex2var_ps(s0, K32_PERMUTE_2_2, sf); - return _mm512_max_ps(_mm512_max_ps(p0, p1), p2); - } - - template void NeuralPooling2x2Max3x3(const float * src, size_t srcStride, size_t width, size_t height, float * dst, size_t dstStride) - { - height -= 1; - width -= 1; - size_t heightEven = Simd::AlignLo(height, 2); - size_t widthEven = Simd::AlignLo(width, 2); - size_t step = DF - 2; - size_t alignedWidth = width / step*step; - for (size_t row = 0; row < heightEven; row += 2) - { - for (size_t col = 0; col < alignedWidth; col += step) - Store(dst + (col >> 1), Pooling2x2Max3x3(src + col, srcStride), __mmask16(0x7FFF)); - if (widthEven - alignedWidth) - { - size_t col = widthEven - step; - Store(dst + (col >> 1), Pooling2x2Max3x3(src + col, srcStride), __mmask16(0x7FFF)); - } - if (width - widthEven) - Sse::Max2x3s(src + widthEven, srcStride, dst + (widthEven >> 1)); - src += 2 * srcStride; - dst += dstStride; - } - if (height - heightEven) - { - for (size_t col = 0; col < alignedWidth; col += step) - Store(dst + (col >> 1), Pooling2x2Max3x2(src + col, srcStride), __mmask16(0x7FFF)); - if (widthEven - alignedWidth) - { - size_t col = widthEven - step; - Store(dst + (col >> 1), Pooling2x2Max3x2(src + col, srcStride), __mmask16(0x7FFF)); - } - if (width - widthEven) - Sse::Max2x2s(src + widthEven, srcStride, dst + (widthEven >> 1)); - } - } - - void 
NeuralPooling2x2Max3x3(const float * src, size_t srcStride, size_t width, size_t height, float * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride, F) && Aligned(dst) && Aligned(dstStride, F)) - NeuralPooling2x2Max3x3(src, srcStride, width, height, dst, dstStride); - else - NeuralPooling2x2Max3x3(src, srcStride, width, height, dst, dstStride); - } - - namespace Ncf - { - namespace Ver0 - { - void PrepareB(const float * src, size_t srcWidth, size_t srcHeight, size_t srcDepth, size_t kernelX, size_t kernelY, - size_t padX, size_t padY, size_t strideX, size_t strideY, size_t dilationX, size_t dilationY, size_t dstWidth, size_t dstHeight, float * dst) - { - const size_t K = kernelX*kernelY*srcDepth, N = dstHeight*dstWidth; - if (dilationX*dilationY*strideX*strideY != 1) - { - for (size_t dstRow = 0; dstRow < dstHeight; ++dstRow) - { - size_t srcRow0 = dstRow*strideY - padY; - for (size_t dstCol = 0; dstCol < dstWidth; ++dstCol) - { - size_t srcCol0 = dstCol*strideX - padX; - for (size_t channel = 0; channel < srcDepth; ++channel) - { - for (size_t kernelRow = 0; kernelRow < kernelY; ++kernelRow) - { - size_t srcRow = srcRow0 + kernelRow*dilationY; - if (srcRow < srcHeight) - { - const float * psrc = src + (channel*srcHeight + srcRow)*srcWidth; - for (size_t kernelCol = 0; kernelCol < kernelX; ++kernelCol) - { - size_t srcCol = srcCol0 + kernelCol*dilationX; - if (srcCol < srcWidth) - *(dst++) = psrc[srcCol]; - else - *(dst++) = 0; - } - } - else - { - for (size_t kernelCol = 0; kernelCol < kernelX; ++kernelCol) - *(dst++) = 0; - } - } - } - } - } - } - else if (kernelX*kernelY != 1) - { - for (size_t dstRow = 0; dstRow < dstHeight; ++dstRow) - { - size_t srcRow0 = dstRow - padY; - for (size_t dstCol = 0; dstCol < dstWidth; ++dstCol) - { - size_t srcCol0 = dstCol - padX; - for (size_t channel = 0; channel < srcDepth; ++channel) - { - for (size_t kernelRow = 0; kernelRow < kernelY; ++kernelRow) - { - size_t srcRow = srcRow0 + kernelRow; - if (srcRow < srcHeight) - { - const float * psrc = src + (channel*srcHeight + srcRow)*srcWidth; - for (size_t kernelCol = 0; kernelCol < kernelX; ++kernelCol) - { - size_t srcCol = srcCol0 + kernelCol; - if (srcCol < srcWidth) - *(dst++) = psrc[srcCol]; - else - *(dst++) = 0; - } - } - else - { - for (size_t kernelCol = 0; kernelCol < kernelX; ++kernelCol) - *(dst++) = 0; - } - } - } - } - } - } - else - { - for (size_t i = 0; i < N; ++i) - { - for (size_t k = 0; k < K; ++k) - *(dst++) = src[k*N + i]; - } - } - } - - template static SIMD_INLINE void Kernel1x4x16(const __m512 & a, size_t K, const float * b, __m512 * sums) - { - sums[0] = _mm512_fmadd_ps(a, Load(b + 0 * K), sums[0]); - sums[1] = _mm512_fmadd_ps(a, Load(b + 1 * K), sums[1]); - sums[2] = _mm512_fmadd_ps(a, Load(b + 2 * K), sums[2]); - sums[3] = _mm512_fmadd_ps(a, Load(b + 3 * K), sums[3]); - } - - template static SIMD_INLINE void Kernel1x1x16(const __m512 & a, const float * b, __m512 & sum) - { - sum = _mm512_fmadd_ps(a, Load(b), sum); - } - - SIMD_INLINE void Add4ExtractedSums(const __m512 * src, float * dst) - { - __m512 sum02 = _mm512_add_ps(_mm512_unpacklo_ps(src[0], src[2]), _mm512_unpackhi_ps(src[0], src[2])); - __m512 sum13 = _mm512_add_ps(_mm512_unpacklo_ps(src[1], src[3]), _mm512_unpackhi_ps(src[1], src[3])); - __m512 sum512 = _mm512_add_ps(_mm512_unpacklo_ps(sum02, sum13), _mm512_unpackhi_ps(sum02, sum13)); - __m128 sum128 = _mm_add_ps(_mm_add_ps(_mm512_extractf32x4_ps(sum512, 0), _mm512_extractf32x4_ps(sum512, 1)), - _mm_add_ps(_mm512_extractf32x4_ps(sum512, 2), 
_mm512_extractf32x4_ps(sum512, 3))); - _mm_storeu_ps(dst, _mm_add_ps(_mm_loadu_ps(dst), sum128)); - } - - template static SIMD_INLINE void Kernel6x4x16(const __m512 * a, size_t K, const float * b, __m512 * sums) - { - __m512 _b; - _b = Load(b + 0 * K); - sums[0x00] = _mm512_fmadd_ps(a[0], _b, sums[0x00]); - sums[0x04] = _mm512_fmadd_ps(a[1], _b, sums[0x04]); - sums[0x08] = _mm512_fmadd_ps(a[2], _b, sums[0x08]); - sums[0x0C] = _mm512_fmadd_ps(a[3], _b, sums[0x0C]); - sums[0x10] = _mm512_fmadd_ps(a[4], _b, sums[0x10]); - sums[0x14] = _mm512_fmadd_ps(a[5], _b, sums[0x14]); - _b = Load(b + 1 * K); - sums[0x01] = _mm512_fmadd_ps(a[0], _b, sums[0x01]); - sums[0x05] = _mm512_fmadd_ps(a[1], _b, sums[0x05]); - sums[0x09] = _mm512_fmadd_ps(a[2], _b, sums[0x09]); - sums[0x0D] = _mm512_fmadd_ps(a[3], _b, sums[0x0D]); - sums[0x11] = _mm512_fmadd_ps(a[4], _b, sums[0x11]); - sums[0x15] = _mm512_fmadd_ps(a[5], _b, sums[0x15]); - _b = Load(b + 2 * K); - sums[0x02] = _mm512_fmadd_ps(a[0], _b, sums[0x02]); - sums[0x06] = _mm512_fmadd_ps(a[1], _b, sums[0x06]); - sums[0x0A] = _mm512_fmadd_ps(a[2], _b, sums[0x0A]); - sums[0x0E] = _mm512_fmadd_ps(a[3], _b, sums[0x0E]); - sums[0x12] = _mm512_fmadd_ps(a[4], _b, sums[0x12]); - sums[0x16] = _mm512_fmadd_ps(a[5], _b, sums[0x16]); - _b = Load(b + 3 * K); - sums[0x03] = _mm512_fmadd_ps(a[0], _b, sums[0x03]); - sums[0x07] = _mm512_fmadd_ps(a[1], _b, sums[0x07]); - sums[0x0B] = _mm512_fmadd_ps(a[2], _b, sums[0x0B]); - sums[0x0F] = _mm512_fmadd_ps(a[3], _b, sums[0x0F]); - sums[0x13] = _mm512_fmadd_ps(a[4], _b, sums[0x13]); - sums[0x17] = _mm512_fmadd_ps(a[5], _b, sums[0x17]); - } - - template static SIMD_INLINE void Kernel6x1x16(const __m512 * a, const float * b, __m512 * sums) - { - __m512 b0 = Load(b); - sums[0] = _mm512_fmadd_ps(a[0], b0, sums[0]); - sums[1] = _mm512_fmadd_ps(a[1], b0, sums[1]); - sums[2] = _mm512_fmadd_ps(a[2], b0, sums[2]); - sums[3] = _mm512_fmadd_ps(a[3], b0, sums[3]); - sums[4] = _mm512_fmadd_ps(a[4], b0, sums[4]); - sums[5] = _mm512_fmadd_ps(a[5], b0, sums[5]); - } - - template static SIMD_INLINE void Kernel3x4x16(const __m512 * a, size_t K, const float * b, __m512 * sums) - { - __m512 _b; - _b = Load(b + 0 * K); - sums[0x0] = _mm512_fmadd_ps(a[0], _b, sums[0x0]); - sums[0x4] = _mm512_fmadd_ps(a[1], _b, sums[0x4]); - sums[0x8] = _mm512_fmadd_ps(a[2], _b, sums[0x8]); - _b = Load(b + 1 * K); - sums[0x1] = _mm512_fmadd_ps(a[0], _b, sums[0x1]); - sums[0x5] = _mm512_fmadd_ps(a[1], _b, sums[0x5]); - sums[0x9] = _mm512_fmadd_ps(a[2], _b, sums[0x9]); - _b = Load(b + 2 * K); - sums[0x2] = _mm512_fmadd_ps(a[0], _b, sums[0x2]); - sums[0x6] = _mm512_fmadd_ps(a[1], _b, sums[0x6]); - sums[0xA] = _mm512_fmadd_ps(a[2], _b, sums[0xA]); - _b = Load(b + 3 * K); - sums[0x3] = _mm512_fmadd_ps(a[0], _b, sums[0x3]); - sums[0x7] = _mm512_fmadd_ps(a[1], _b, sums[0x7]); - sums[0xB] = _mm512_fmadd_ps(a[2], _b, sums[0xB]); - } - - template static SIMD_INLINE void Kernel3x1x16(const __m512 * a, const float * b, __m512 * sums) - { - __m512 _b = Load(b); - sums[0x0] = _mm512_fmadd_ps(a[0], _b, sums[0x0]); - sums[0x1] = _mm512_fmadd_ps(a[1], _b, sums[0x1]); - sums[0x2] = _mm512_fmadd_ps(a[2], _b, sums[0x2]); - } - - template static SIMD_INLINE void Load6(const float * p, __m512 * a, size_t step, __mmask16 tail = -1) - { - a[0] = Load(p + 0 * step, tail); - a[1] = Load(p + 1 * step, tail); - a[2] = Load(p + 2 * step, tail); - a[3] = Load(p + 3 * step, tail); - a[4] = Load(p + 4 * step, tail); - a[5] = Load(p + 5 * step, tail); - } - - template static SIMD_INLINE void Load3(const 
float * p, __m512 * a, size_t step, __mmask16 tail = -1) - { - a[0] = Load(p + 0 * step, tail); - a[1] = Load(p + 1 * step, tail); - a[2] = Load(p + 2 * step, tail); - } - - template void Execute(size_t M, size_t N, size_t K, const float * a, const float * b, float * c) - { - size_t M3 = M / 3 * 3; - size_t M6 = M / 6 * 6; - size_t N4 = Simd::AlignLo(N, 4); - size_t K16 = Simd::AlignLo(K, 16); - __mmask16 tailMask = TailMask16(K - K16); - size_t i = 0; -#if SIMD_ZMM_COUNT == 32 - for (; i < M6; i += 6) - { - const float * pa = a + i*K; - float * pc = c + i*N; - size_t j = 0; - __m512 _a[6]; - for (; j < N4; j += 4) - { - const float * pb = b + j*K; - __m512 sums[24] = { - _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), - _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), - _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), - _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), - _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), - _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps() }; - size_t k = 0; - for (; k < K16; k += 16) - { - Load6(pa + k, _a, K); - Kernel6x4x16(_a, K, pb + k, sums); - } - if (k < K) - { - Load6(pa + k, _a, K, tailMask); - Kernel6x4x16(_a, K, pb + k, sums); - } - Add4ExtractedSums(sums + 0x00, pc + 0 * N + j); - Add4ExtractedSums(sums + 0x04, pc + 1 * N + j); - Add4ExtractedSums(sums + 0x08, pc + 2 * N + j); - Add4ExtractedSums(sums + 0x0C, pc + 3 * N + j); - Add4ExtractedSums(sums + 0x10, pc + 4 * N + j); - Add4ExtractedSums(sums + 0x14, pc + 5 * N + j); - } - for (; j < N; ++j) - { - const float * pb = b + j*K; - __m512 sums[6] = { - _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), - _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps() }; - size_t k = 0; - for (; k < K16; k += 16) - { - Load6(pa + k, _a, K); - Kernel6x1x16(_a, pb + k, sums); - } - if (k < K) - { - Load6(pa + k, _a, K, tailMask); - Kernel6x1x16(_a, pb + k, sums); - } - pc[0 * N + j] += ExtractSum(sums[0]); - pc[1 * N + j] += ExtractSum(sums[1]); - pc[2 * N + j] += ExtractSum(sums[2]); - pc[3 * N + j] += ExtractSum(sums[3]); - pc[4 * N + j] += ExtractSum(sums[4]); - pc[5 * N + j] += ExtractSum(sums[5]); - } - } -#endif - for (; i < M3; i += 3) - { - const float * pa = a + i*K; - float * pc = c + i*N; - size_t j = 0; - __m512 _a[3]; - for (; j < N4; j += 4) - { - const float * pb = b + j*K; - __m512 sums[12] = { - _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), - _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), - _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps() }; - size_t k = 0; - for (; k < K16; k += 16) - { - Load3(pa + k, _a, K); - Kernel3x4x16(_a, K, pb + k, sums); - } - if (k < K) - { - Load3(pa + k, _a, K, tailMask); - Kernel3x4x16(_a, K, pb + k, sums); - } - Add4ExtractedSums(sums + 0x0, pc + 0 * N + j); - Add4ExtractedSums(sums + 0x4, pc + 1 * N + j); - Add4ExtractedSums(sums + 0x8, pc + 2 * N + j); - } - for (; j < N; ++j) - { - const float * pb = b + j*K; - __m512 sums[3] = { _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps() }; - size_t k = 0; - for (; k < K16; k += 16) - { - Load3(pa + k, _a, K); - Kernel3x1x16(_a, pb + k, sums); - } - if (k < K) - { - Load3(pa + k, _a, K, tailMask); - Kernel3x1x16(_a, pb + k, sums); - } - pc[0 * N + j] += 
ExtractSum(sums[0]); - pc[1 * N + j] += ExtractSum(sums[1]); - pc[2 * N + j] += ExtractSum(sums[2]); - } - } - for (; i < M; ++i) - { - const float * pa = a + i*K; - float * pc = c + i*N; - size_t j = 0; - for (; j < N4; j += 4) - { - const float * pb = b + j*K; - __m512 sums[4] = { _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps() }; - size_t k = 0; - for (; k < K16; k += 16) - { - __m512 _a = Load(pa + k); - Kernel1x4x16(_a, K, pb + k, sums); - } - if (k < K) - { - __m512 _a = Load(pa + k, tailMask); - Kernel1x4x16(_a, K, pb + k, sums); - } - Add4ExtractedSums(sums + 0, pc + j); - } - for (; j < N; ++j) - { - const float * pb = b + j*K; - __m512 sum = _mm512_setzero_ps(); - size_t k = 0; - for (; k < K16; k += 16) - { - __m512 _a = Load(pa + k); - Kernel1x1x16(_a, pb + k, sum); - } - if (k < K) - { - __m512 _a = Load(pa + k, tailMask); - Kernel1x1x16(_a, pb + k, sum); - } - pc[j] += ExtractSum(sum); - } - } - } - - void Execute(size_t M, size_t N, size_t K, const float * a, const float * b, float * c) - { - if (Aligned(K, F)) - Execute(M, N, K, a, b, c); - else - Execute(M, N, K, a, b, c); - } - } - - namespace Ver1 - { - void PrepareA(const float * src, size_t M, size_t K, size_t cell, float * dst) - { - size_t K4 = AlignLo(K, 4), K8 = AlignLo(K, 8); - for (size_t i = 0; i < M; i += cell) - { - size_t n = Simd::Min(cell, M - i), k = 0; - if (cell == 4 && n == 4) - { - for (; k < K8; k += 8) - { - const float * ps = src + k; - __m256 s0 = Avx::Load(ps + 0 * K); - __m256 s1 = Avx::Load(ps + 1 * K); - __m256 s2 = Avx::Load(ps + 2 * K); - __m256 s3 = Avx::Load(ps + 3 * K); - __m256 s00 = _mm256_unpacklo_ps(s0, s2); - __m256 s01 = _mm256_unpacklo_ps(s1, s3); - __m256 s10 = _mm256_unpackhi_ps(s0, s2); - __m256 s11 = _mm256_unpackhi_ps(s1, s3); - __m256 d0 = _mm256_unpacklo_ps(s00, s01); - __m256 d1 = _mm256_unpackhi_ps(s00, s01); - __m256 d2 = _mm256_unpacklo_ps(s10, s11); - __m256 d3 = _mm256_unpackhi_ps(s10, s11); - Avx::Store(dst + 0, _mm256_permute2f128_ps(d0, d1, 0x20)); - Avx::Store(dst + 8, _mm256_permute2f128_ps(d2, d3, 0x20)); - Avx::Store(dst + 16, _mm256_permute2f128_ps(d0, d1, 0x31)); - Avx::Store(dst + 24, _mm256_permute2f128_ps(d2, d3, 0x31)); - dst += 32; - } - for (; k < K4; k += 4) - { - const float * ps = src + k; - __m128 s0 = Sse::Load(ps + 0 * K); - __m128 s1 = Sse::Load(ps + 1 * K); - __m128 s2 = Sse::Load(ps + 2 * K); - __m128 s3 = Sse::Load(ps + 3 * K); - __m128 s00 = _mm_unpacklo_ps(s0, s2); - __m128 s01 = _mm_unpacklo_ps(s1, s3); - __m128 s10 = _mm_unpackhi_ps(s0, s2); - __m128 s11 = _mm_unpackhi_ps(s1, s3); - Sse::Store(dst + 0, _mm_unpacklo_ps(s00, s01)); - Sse::Store(dst + 4, _mm_unpackhi_ps(s00, s01)); - Sse::Store(dst + 8, _mm_unpacklo_ps(s10, s11)); - Sse::Store(dst + 12, _mm_unpackhi_ps(s10, s11)); - dst += 16; - } - } - for (; k < K; ++k) - { - for (size_t c = 0; c < n; ++c) - *(dst++) = src[c*K + k]; - } - src += cell*K; - } - } - - void PrepareB(const float * src, size_t srcWidth, size_t srcHeight, size_t srcDepth, size_t kernelX, size_t kernelY, size_t padX, size_t padY, - size_t strideX, size_t strideY, size_t dilationX, size_t dilationY, size_t dstWidth, size_t dstHeight, size_t cell, float * tmp, float * dst) - { - const size_t K = kernelX*kernelY*srcDepth, N = dstHeight*dstWidth; - if (kernelX*kernelY != 1) - { - float * dst = tmp; - size_t channelSize = srcHeight * srcWidth; - if (dilationX*dilationY*strideX*strideY != 1) - { - for (size_t channel = 0, k = 0; channel < srcDepth; ++channel, src += channelSize) - { - for 
(size_t kernelRow = 0; kernelRow < kernelY; ++kernelRow) - { - for (size_t kernelCol = 0; kernelCol < kernelX; ++kernelCol, ++k) - { - size_t srcRow = kernelRow*dilationY - padY; - for (size_t dstRow = 0; dstRow < dstHeight; ++dstRow) - { - if (srcRow < srcHeight) - { - size_t srcCol = kernelCol*dilationX - padX; - for (size_t dstCol = 0; dstCol < dstWidth; ++dstCol) - { - if (srcCol < srcWidth) - *(dst++) = src[srcRow*srcWidth + srcCol]; - else - *(dst++) = 0; - srcCol += strideX; - } - } - else - { - for (size_t dstCol = 0; dstCol < dstWidth; ++dstCol) - *(dst++) = 0; - } - srcRow += strideY; - } - } - } - } - } - else - { - const size_t bodySize = dstWidth - padX * 2; - for (size_t channel = 0, k = 0; channel < srcDepth; ++channel, src += channelSize) - { - for (size_t kernelRow = 0; kernelRow < kernelY; ++kernelRow) - { - for (size_t kernelCol = 0; kernelCol < kernelX; ++kernelCol, ++k) - { - size_t srcRow = kernelRow - padY; - for (size_t dstRow = 0; dstRow < dstHeight; ++dstRow, ++srcRow) - { - if (srcRow < srcHeight) - { - size_t srcCol = kernelCol - padX, dstCol = 0; - const float * psrc = src + srcRow*srcWidth; - for (; dstCol < padX; ++dstCol, ++srcCol) - { - if (srcCol < srcWidth) - *(dst++) = psrc[srcCol]; - else - *(dst++) = 0; - } - memcpy(dst, psrc + srcCol, bodySize * 4); - dst += bodySize; - dstCol += bodySize; - srcCol += bodySize; - for (; dstCol < dstWidth; ++dstCol, ++srcCol) - { - if (srcCol < srcWidth) - *(dst++) = psrc[srcCol]; - else - *(dst++) = 0; - } - } - else - { - memset(dst, 0, dstWidth * 4); - dst += dstWidth; - } - } - } - } - } - } - src = tmp; - } - if (cell == 48) - { - for (size_t j = 0; j < N; j += cell) - { - size_t n = Simd::Min(cell, N - j); - if (n == cell) - { - for (size_t k = 0; k < K; ++k) - { - const float * psrc = src + k * N; - Store(dst + 0 * F, Load(psrc + 0 * F)); - Store(dst + 1 * F, Load(psrc + 1 * F)); - Store(dst + 2 * F, Load(psrc + 2 * F)); - dst += 48; - } - } - else - { - for (size_t k = 0; k < K; ++k) - { - const float * psrc = src + k * N; - size_t c = 0; - for (; c < n; ++c) - *(dst++) = *(psrc++); - for (; c < cell; ++c) - *(dst++) = 0; - } - } - src += cell; - } - } - else if (cell == 16) - { - for (size_t j = 0; j < N; j += cell) - { - size_t n = Simd::Min(cell, N - j); - if (n == cell) - { - for (size_t k = 0; k < K; ++k) - { - const float * psrc = src + k*N; - Store(dst, Load(psrc)); - dst += 16; - } - } - else - { - for (size_t k = 0; k < K; ++k) - { - const float * psrc = src + k*N; - size_t c = 0; - for (; c < n; ++c) - *(dst++) = *(psrc++); - for (; c < cell; ++c) - *(dst++) = 0; - } - } - src += cell; - } - } - else - { - for (size_t j = 0; j < N; j += cell) - { - size_t n = Simd::Min(cell, N - j); - for (size_t k = 0; k < K; ++k) - { - const float * psrc = src + k*N; - size_t c = 0; - for (; c < n; ++c) - *(dst++) = *(psrc++); - for (; c < cell; ++c) - *(dst++) = 0; - } - src += cell; - } - } - } - - SIMD_INLINE void AddSum(__m512 sum, float * dst) - { - _mm512_storeu_ps(dst, _mm512_add_ps(_mm512_loadu_ps(dst), sum)); - } - - template SIMD_INLINE void AddSum(__m512 sum, float * dst, __mmask16 tail = -1) - { - Store(dst, _mm512_add_ps((Load(dst, tail)), sum), tail); - } - - template SIMD_INLINE void AddSums16(const __m512 * sums, size_t size, float * dst, size_t stride, __mmask16 tail = -1) - { - for (size_t i = 0; i < size; ++i, dst += stride) - AddSum(sums[i], dst, tail); - } - - template SIMD_INLINE void KernelMx16(size_t N, size_t K, const float * a, const float * b, float * c, size_t m, __mmask16 tail = -1) - { 
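// GEMM micro-kernel for an m x 16 tile of C (m <= 4): PrepareA packs A so that each k-step
// stores the m row scalars contiguously (hence a += m), and PrepareB packs B into 16-wide
// column strips (hence b += 16). Every iteration loads one vector of B, broadcasts each
// scalar a[s] and applies the rank-1 update sums[s] += a[s] * b via FMA; AddSums16 then adds
// the accumulators into C row by row, masking the final columns with `tail` when N is not a
// multiple of 16.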
-        SIMD_INLINE void AddSum(__m512 sum, float * dst)
-        {
-            _mm512_storeu_ps(dst, _mm512_add_ps(_mm512_loadu_ps(dst), sum));
-        }
-
-        template <bool align, bool mask> SIMD_INLINE void AddSum(__m512 sum, float * dst, __mmask16 tail = -1)
-        {
-            Store<align, mask>(dst, _mm512_add_ps((Load<align, mask>(dst, tail)), sum), tail);
-        }
-
-        template <bool align, bool mask> SIMD_INLINE void AddSums16(const __m512 * sums, size_t size, float * dst, size_t stride, __mmask16 tail = -1)
-        {
-            for (size_t i = 0; i < size; ++i, dst += stride)
-                AddSum<align, mask>(sums[i], dst, tail);
-        }
-
-        template <bool align, bool mask> SIMD_INLINE void KernelMx16(size_t N, size_t K, const float * a, const float * b, float * c, size_t m, __mmask16 tail = -1)
-        {
-            __m512 sums[4] = { _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps() };
-            for (size_t k = 0; k < K; ++k)
-            {
-                __m512 b0 = Load<align>(b);
-                for (size_t s = 0; s < m; ++s)
-                {
-                    __m512 a0 = _mm512_set1_ps(a[s]);
-                    sums[s] = _mm512_fmadd_ps(b0, a0, sums[s]);
-                }
-                b += 16;
-                a += m;
-            }
-            AddSums16<align, mask>(sums, m, c, N, tail);
-        }
-
-        template <bool align, bool mask> SIMD_INLINE void Kernel4x16(size_t N, size_t K, const float * a, const float * b, float * c, __mmask16 tail = -1)
-        {
-            __m512 sums[4] = { _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps() };
-            for (size_t k = 0; k < K; ++k)
-            {
-                __m512 b0 = Load<align>(b);
-                __m512 a0 = _mm512_set1_ps(a[0]);
-                sums[0] = _mm512_fmadd_ps(b0, a0, sums[0]);
-                __m512 a1 = _mm512_set1_ps(a[1]);
-                sums[1] = _mm512_fmadd_ps(b0, a1, sums[1]);
-                __m512 a2 = _mm512_set1_ps(a[2]);
-                sums[2] = _mm512_fmadd_ps(b0, a2, sums[2]);
-                __m512 a3 = _mm512_set1_ps(a[3]);
-                sums[3] = _mm512_fmadd_ps(b0, a3, sums[3]);
-                b += 16;
-                a += 4;
-            }
-            AddSums16<align, mask>(sums, 4, c, N, tail);
-        }
-
-        template <bool align> void Execute4x16(size_t M, size_t N, size_t K, const float * a, const float * b, float * c)
-        {
-            size_t M4 = Simd::AlignLo(M, 4);
-            size_t N16 = Simd::AlignLo(N, 16);
-            __mmask16 tailMask = TailMask16(N - N16);
-            size_t i = 0;
-            for (; i < M4; i += 4)
-            {
-                size_t j = 0;
-                for (; j < N16; j += 16)
-                    Kernel4x16<align, false>(N, K, a + i*K, b + j*K, c + i*N + j);
-                if (j < N)
-                    Kernel4x16<align, true>(N, K, a + i*K, b + j*K, c + i*N + j, tailMask);
-            }
-            if (i < M)
-            {
-                size_t j = 0;
-                for (; j < N16; j += 16)
-                    KernelMx16<align, false>(N, K, a + i*K, b + j*K, c + i*N + j, M - M4);
-                if (j < N)
-                    KernelMx16<align, true>(N, K, a + i*K, b + j*K, c + i*N + j, M - M4, tailMask);
-            }
-        }
-
-        template <bool align, bool mask> SIMD_INLINE void KernelMx48(size_t N, size_t K, const float * a, const float * b, float * c, size_t m, const __mmask16 * tails)
-        {
-            __m512 sums[12] = {
-                _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(),
-                _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(),
-                _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps() };
-            for (size_t k = 0; k < K; ++k)
-            {
-                __m512 b0 = Load<align>(b + 00);
-                __m512 b1 = Load<align>(b + 16);
-                __m512 b2 = Load<align>(b + 32);
-                for (size_t s = 0; s < m; ++s)
-                {
-                    __m512 a0 = _mm512_set1_ps(a[s]);
-                    sums[s + 0] = _mm512_fmadd_ps(b0, a0, sums[s + 0]);
-                    sums[s + 4] = _mm512_fmadd_ps(b1, a0, sums[s + 4]);
-                    sums[s + 8] = _mm512_fmadd_ps(b2, a0, sums[s + 8]);
-                }
-                b += 48;
-                a += m;
-            }
-            for (size_t i = 0; i < m; ++i, c += N)
-            {
-                AddSum<align, mask>(sums[i + 0], c + 00, tails[0]);
-                AddSum<align, mask>(sums[i + 4], c + 16, tails[1]);
-                AddSum<align, mask>(sums[i + 8], c + 32, tails[2]);
-            }
-        }
-
-        void Kernel4x48(size_t N, size_t K, const float * a, const float * b, float * c)
-        {
-            __m512 _a, b0, b1, b2, c00, c01, c02, c10, c11, c12, c20, c21, c22, c30, c31, c32;
-
-            c00 = _mm512_setzero_ps();
-            c01 = _mm512_setzero_ps();
-            c02 = _mm512_setzero_ps();
-            c10 = _mm512_setzero_ps();
-            c11 = _mm512_setzero_ps();
-            c12 = _mm512_setzero_ps();
-            c20 = _mm512_setzero_ps();
-            c21 = _mm512_setzero_ps();
-            c22 = _mm512_setzero_ps();
-            c30 = _mm512_setzero_ps();
-            c31 = _mm512_setzero_ps();
-            c32 = _mm512_setzero_ps();
-
-            for (size_t k = 0; k < K; ++k)
-            {
-                b0 = _mm512_loadu_ps(b + 0 * F);
-                b1 = _mm512_loadu_ps(b + 1 * F);
-                b2 = _mm512_loadu_ps(b + 2 * F);
-                _a = _mm512_set1_ps(a[0]);
-                c00 = _mm512_fmadd_ps(b0, _a, c00);
-                c01 = _mm512_fmadd_ps(b1, _a, c01);
-                c02 = _mm512_fmadd_ps(b2, _a, c02);
-                _a = _mm512_set1_ps(a[1]);
-                c10 = _mm512_fmadd_ps(b0, _a,
c10); - c11 = _mm512_fmadd_ps(b1, _a, c11); - c12 = _mm512_fmadd_ps(b2, _a, c12); - _a = _mm512_set1_ps(a[2]); - c20 = _mm512_fmadd_ps(b0, _a, c20); - c21 = _mm512_fmadd_ps(b1, _a, c21); - c22 = _mm512_fmadd_ps(b2, _a, c22); - _a = _mm512_set1_ps(a[3]); - c30 = _mm512_fmadd_ps(b0, _a, c30); - c31 = _mm512_fmadd_ps(b1, _a, c31); - c32 = _mm512_fmadd_ps(b2, _a, c32); - b += 48; - a += 4; - } - - AddSum(c00, c + 0 * F); - AddSum(c01, c + 1 * F); - AddSum(c02, c + 2 * F); - c += N; - AddSum(c10, c + 0 * F); - AddSum(c11, c + 1 * F); - AddSum(c12, c + 2 * F); - c += N; - AddSum(c20, c + 0 * F); - AddSum(c21, c + 1 * F); - AddSum(c22, c + 2 * F); - c += N; - AddSum(c30, c + 0 * F); - AddSum(c31, c + 1 * F); - AddSum(c32, c + 2 * F); - } - - template void Execute4x48(size_t M, size_t N, size_t K, const float * a, const float * b, float * c) - { - size_t M4 = Simd::AlignLo(M, 4); - size_t N48 = N/48*48; - __mmask16 tailMasks[3]; - for (size_t i = 0; i < 3; ++i) - tailMasks[i] = TailMask16(N - N48 - F*i); - if (M > N) - { - size_t i = 0; - for (; i < M4; i += 4) - { - size_t j = 0; - for (; j < N48; j += 48) - Kernel4x48(N, K, a + i * K, b + j * K, c + i * N + j); - if (j < N) - KernelMx48(N, K, a + i * K, b + j * K, c + i * N + j, 4, tailMasks); - } - if (i < M) - { - size_t j = 0; - for (; j < N48; j += 48) - KernelMx48(N, K, a + i * K, b + j * K, c + i * N + j, M - M4, tailMasks); - if (j < N) - KernelMx48(N, K, a + i * K, b + j * K, c + i * N + j, M - M4, tailMasks); - } - } - else - { - size_t j = 0; - for (; j < N48; j += 48) - { - size_t i = 0; - for (; i < M4; i += 4) - Kernel4x48(N, K, a + i * K, b + j * K, c + i * N + j); - if (M4 < M) - KernelMx48(N, K, a + i * K, b + j * K, c + i * N + j, M - M4, tailMasks); - } - if (N48 < N) - { - size_t i = 0; - for (; i < M4; i += 4) - KernelMx48(N, K, a + i * K, b + j * K, c + i * N + j, 4, tailMasks); - if (M4 < M) - KernelMx48(N, K, a + i * K, b + j * K, c + i * N + j, M - M4, tailMasks); - } - } - } - - void Execute(size_t M, size_t N, size_t K, const float * a, const float * b, float * c, size_t cellA, size_t cellB) - { - if (cellA == 4) - { - if (cellB == 16) - Execute4x16(M, N, K, a, b, c); - if (cellB == 48) - Execute4x48(M, N, K, a, b, c); - } - } - } - - namespace Ver2 - { - void PrepareB(const float * src, size_t srcWidth, size_t srcHeight, size_t srcDepth, size_t padX, size_t padY, float * dst, size_t dstWidth, size_t dstHeight) - { - for (size_t channel = 0; channel < srcDepth; ++channel) - { - const float * s = src; - float * d = dst; - memset(d, 0, padY*dstWidth * 4); - d += padY*dstWidth; - for (size_t row = padY; row < dstHeight - padY; ++row) - { - memset(d, 0, padX * 4); - memcpy(d + padX, s, srcWidth * 4); - memset(d + padX + srcWidth, 0, padX * 4); - d += dstWidth; - s += srcWidth; - } - memset(d, 0, padY*dstWidth * 4); - src += srcWidth*srcHeight; - dst += dstWidth*dstHeight; - } - } - - template void AddConvolution8x8(const float * src, size_t srcWidth, size_t srcHeight, size_t srcDepth, - const float * weight, float * dst, size_t dstDepth) - { - __m256 _weight[kernelX*kernelY]; - for (size_t dstChannel = 0; dstChannel < dstDepth; ++dstChannel) - { - __m256 _dst[8]; - float * pdst = dst; - for (size_t row = 0; row < 8; ++row, pdst += 8) - _dst[row] = Avx::Load(pdst); - if (kernelY < 4) - { - for (size_t srcChannel = 0; srcChannel < srcDepth; ++srcChannel) - { - const float * psrc = src + srcWidth*srcHeight*srcChannel; - Avx2::LoadWeightsForward(weight, _weight); - for (size_t row = 0; row < 8; ++row) - { - _dst[row] = 
_mm256_add_ps(_dst[row], Avx2::Convolution::template Forward(psrc, srcWidth, _weight)); - psrc += srcWidth; - } - weight += kernelX*kernelY; - } - } - else - { - for (size_t srcChannel = 0; srcChannel < srcDepth; ++srcChannel) - { - const float * psrc = src + srcWidth*srcHeight*srcChannel; - for (size_t dy = 0; dy < kernelY; dy++) - { - const float * ps = psrc + dy*srcWidth; - Avx2::LoadWeightsForward(weight, _weight); - for (size_t row = 0; row < 8; ++row) - { - _dst[row] = _mm256_add_ps(_dst[row], Avx2::Convolution::template RowConvolution(ps, _weight)); - ps += srcWidth; - } - weight += kernelX; - } - } - } - for (size_t row = 0; row < 8; ++row, dst += 8) - Avx::Store(dst, _dst[row]); - } - } - - template void AddConvolution16x16(const float * src, size_t srcWidth, size_t srcHeight, size_t srcDepth, - const float * weight, float * dst, size_t dstDepth) - { - __m512 _weight[kernelX*kernelY]; - for (size_t dstChannel = 0; dstChannel < dstDepth; ++dstChannel) - { - __m512 _dst[16]; - float * pdst = dst; - for (size_t row = 0; row < 16; ++row, pdst += 16) - _dst[row] = Load(pdst); - if (kernelY < 4) - { - for (size_t srcChannel = 0; srcChannel < srcDepth; ++srcChannel) - { - const float * psrc = src + srcWidth*srcHeight*srcChannel; - LoadWeightsForward(weight, _weight); - for (size_t row = 0; row < 16; ++row) - { - _dst[row] = _mm512_add_ps(_dst[row], (Convolution::template Forward(psrc, srcWidth, _weight))); - psrc += srcWidth; - } - weight += kernelX*kernelY; - } - } - else - { - for (size_t srcChannel = 0; srcChannel < srcDepth; ++srcChannel) - { - const float * psrc = src + srcWidth*srcHeight*srcChannel; - for (size_t dy = 0; dy < kernelY; dy++) - { - const float * ps = psrc + dy*srcWidth; - LoadWeightsForward(weight, _weight); - for (size_t row = 0; row < 16; ++row) - { - _dst[row] = _mm512_add_ps(_dst[row], (Convolution::template RowConvolution(ps, _weight))); - ps += srcWidth; - } - weight += kernelX; - } - } - } - for (size_t row = 0; row < 16; ++row, dst += 16) - Store(dst, _dst[row]); - } - } - - template void AddConvolution(const float * src, size_t srcWidth, size_t srcHeight, size_t srcDepth, - const float * weight, float * dst, size_t dstWidth, size_t dstHeight, size_t dstDepth) - { - if (dstWidth == 8 && dstHeight == 8) - { - AddConvolution8x8(src, srcWidth, srcHeight, srcDepth, weight, dst, dstDepth); - return; - } - if (dstWidth == 16 && dstHeight == 16) - { - AddConvolution16x16(src, srcWidth, srcHeight, srcDepth, weight, dst, dstDepth); - return; - } - size_t alignedWidth = AlignLo(dstWidth, F); - __mmask16 tailMask = TailMask16(dstWidth - alignedWidth); - __m512 _weight[kernelX*kernelY]; - for (size_t dstChannel = 0; dstChannel < dstDepth; ++dstChannel) - { - for (size_t srcChannel = 0; srcChannel < srcDepth; ++srcChannel) - { - const float * psrc = src + srcWidth*srcHeight*srcChannel; - const float * pweight = weight + (dstChannel*srcDepth + srcChannel)*kernelX*kernelY; - float * pdst = dst + dstWidth*dstHeight*dstChannel; - LoadWeightsForward(pweight, _weight); - for (size_t row = 0; row < dstHeight; ++row) - { - size_t col = 0; - for (; col < alignedWidth; col += F) - { - __m512 _dst = Load(pdst + col); - _dst = _mm512_add_ps(_dst, (Convolution::template Forward(psrc + col, srcWidth, _weight))); - Store(pdst + col, _dst); - } - if (col < dstWidth) - { - __m512 _dst = Load(pdst + col, tailMask); - _dst = _mm512_add_ps(_dst, (Convolution::template Forward(psrc + col, srcWidth, _weight, tailMask))); - Store(pdst + col, _dst, tailMask); - } - psrc += srcWidth; - pdst += 
dstWidth; - } - } - } - } - - void AddConvolution1x1x16(const float * src, size_t srcDepth, const float * weight, float * dst, size_t dstDepth) - { - size_t dstDepth4 = dstDepth / 4 * 4; - size_t dstChannel = 0; - for (; dstChannel < dstDepth4; dstChannel += 4) - { - __m512 dst00 = _mm512_loadu_ps(dst + 0 * F); - __m512 dst10 = _mm512_loadu_ps(dst + 1 * F); - __m512 dst20 = _mm512_loadu_ps(dst + 2 * F); - __m512 dst30 = _mm512_loadu_ps(dst + 3 * F); - const float * psrc = src; - const float * pw0 = weight; - const float * pw1 = pw0 + srcDepth; - const float * pw2 = pw1 + srcDepth; - const float * pw3 = pw2 + srcDepth; - for (size_t srcChannel = 0; srcChannel < srcDepth; ++srcChannel) - { - __m512 _weight; - __m512 src0 = _mm512_loadu_ps(psrc + 0 * F); - _weight = _mm512_set1_ps(pw0[srcChannel]); - dst00 = _mm512_fmadd_ps(_weight, src0, dst00); - _weight = _mm512_set1_ps(pw1[srcChannel]); - dst10 = _mm512_fmadd_ps(_weight, src0, dst10); - _weight = _mm512_set1_ps(pw2[srcChannel]); - dst20 = _mm512_fmadd_ps(_weight, src0, dst20); - _weight = _mm512_set1_ps(pw3[srcChannel]); - dst30 = _mm512_fmadd_ps(_weight, src0, dst30); - psrc += 16; - } - _mm512_storeu_ps(dst + 0 * F, dst00); - _mm512_storeu_ps(dst + 1 * F, dst10); - _mm512_storeu_ps(dst + 2 * F, dst20); - _mm512_storeu_ps(dst + 3 * F, dst30); - dst += 16 * 4; - weight += srcDepth * 4; - } - for (; dstChannel < dstDepth; ++dstChannel) - { - __m512 dst0 = _mm512_loadu_ps(dst + 0 * F); - const float * psrc = src; - for (size_t srcChannel = 0; srcChannel < srcDepth; ++srcChannel) - { - __m512 weight0 = _mm512_set1_ps(*weight++); - dst0 = _mm512_fmadd_ps(weight0, _mm512_loadu_ps(psrc + 0 * F), dst0); - psrc += 16; - } - _mm512_storeu_ps(dst + 0 * F, dst0); - dst += 16; - } - } - - void Execute(const float * src, size_t srcWidth, size_t srcHeight, size_t srcDepth, - const float * weight, size_t kernelX, size_t kernelY, float * dst, size_t dstWidth, size_t dstHeight, size_t dstDepth) - { - assert(kernelX == kernelY); - if (kernelX == 1 && dstWidth*dstHeight == 16) - AddConvolution1x1x16(src, srcDepth, weight, dst, dstDepth); - else if (kernelX == 2) - AddConvolution(src, srcWidth, srcHeight, srcDepth, weight, dst, dstWidth, dstHeight, dstDepth); - else if (kernelX == 3) - AddConvolution(src, srcWidth, srcHeight, srcDepth, weight, dst, dstWidth, dstHeight, dstDepth); - else if (kernelX == 4) - AddConvolution(src, srcWidth, srcHeight, srcDepth, weight, dst, dstWidth, dstHeight, dstDepth); - else if (kernelX == 5) - AddConvolution(src, srcWidth, srcHeight, srcDepth, weight, dst, dstWidth, dstHeight, dstDepth); - else - assert(0); - } - - bool Preferable(size_t srcDepth, size_t kernelX, size_t kernelY, size_t strideX, size_t strideY, size_t dilationX, size_t dilationY, size_t dstWidth, size_t dstHeight, size_t dstDepth) - { - if (kernelX == kernelY && strideX*strideY*dilationX*dilationY == 1) - { - if (kernelX >= 2 && kernelX <= 5)// && dstWidth*dstHeight*kernelX*kernelY >= 8 * 8 * 3 * 3) - return true; - if (kernelX == 1 && (dstWidth*dstHeight == 16))// || dstWidth * dstHeight == 64)) - return true; - } - return false; - } - } - - struct Opt - { - enum Alg - { - None, - Ver0, - Ver1, - Ver2, - } alg; - - size_t sizeA; - size_t sizeB; - size_t sizeT; - - size_t cellA; - size_t cellB; - - size_t M, N, K; - size_t strideB; - size_t paddedW; - size_t paddedH; - - Opt(size_t srcWidth, size_t srcHeight, size_t srcDepth, size_t kernelX, size_t kernelY, size_t padX, size_t padY, size_t strideX, size_t strideY, size_t dilationX, size_t dilationY, size_t 
dstWidth, size_t dstHeight, size_t dstDepth)
-            {
-                alg = None;
-                sizeA = 0;
-                sizeB = 0;
-                sizeT = 0;
-                cellA = 1;
-                cellB = 1;
-
-                M = dstDepth;
-                N = dstHeight*dstWidth;
-                K = kernelX*kernelY*srcDepth;
-
-                if (dstWidth*dstHeight / kernelX <= 1000)
-                    alg = Ver0;
-                else
-                    alg = Ver1;
-                if (Ver2::Preferable(srcDepth, kernelX, kernelY, strideX, strideY, dilationX, dilationY, dstWidth, dstHeight, dstDepth))
-                    alg = Ver2;
-
-                switch (alg)
-                {
-                case Ver0:
-                    sizeB = N*K;
-                    break;
-                case Ver1:
-                    cellA = 4;
-                    cellB = 48;
-                    sizeA = M*K;
-                    strideB = (N + cellB - 1)/cellB*cellB;
-                    sizeB = strideB*K;
-                    if (kernelX*kernelY > 1)
-                        sizeT = sizeB;
-                    break;
-                case Ver2:
-                    if (padX > 0 || padY > 0)
-                    {
-                        paddedW = Simd::AlignHi(srcWidth + 2 * padX, F);
-                        paddedH = srcHeight + 2 * padY;
-                        sizeB = paddedW*paddedH*srcDepth;
-                    }
-                    else
-                    {
-                        paddedW = srcWidth;
-                        paddedH = srcHeight;
-                    }
-                    break;
-                default:
-                    assert(0);
-                    break;
-                }
-            }
-        };
-
-        struct Data
-        {
-            float * a;
-            float * b;
-            float * t;
-
-            Data(size_t sizeA, size_t sizeB, size_t sizeT, void * externalData, size_t * externalSize)
-                : a(0)
-                , b(0)
-                , t(0)
-                , _data(0)
-            {
-                sizeA = AlignHi(sizeA, F);
-                sizeB = AlignHi(sizeB, F);
-                sizeT = AlignHi(sizeT, F);
-                size_t size = (sizeA + sizeB + sizeT) * sizeof(float);
-                if (size == 0)
-                    return;
-                if (externalData != AlignHi(externalData, SIMD_ALIGN))
-                    size += SIMD_ALIGN;
-                float * data = NULL;
-                if (externalData == NULL || externalSize == NULL || *externalSize < size)
-                {
-                    _data = Simd::Allocate(size);
-                    if (externalSize)
-                        *externalSize = size;
-                    data = (float*)_data;
-                }
-                else
-                    data = (float*)AlignHi(externalData, SIMD_ALIGN);
-                if (sizeA)
-                    a = data;
-                if (sizeB)
-                    b = data + sizeA;
-                if (sizeT)
-                    t = data + sizeA + sizeB;
-            }
-
-            ~Data()
-            {
-                if (_data)
-                    Simd::Free(_data);
-            }
-
-        private:
-            void * _data;
-        };
-    }
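Opt chooses among the three code paths and sizes the workspace; Data then either reuses a caller-supplied buffer or allocates one, reporting the required byte count back through the size pointer. A hypothetical usage sketch of that query-then-reuse contract (shapes are arbitrary; SimdAllocate, SimdFree and SimdAlignment are assumed here to be the library's public allocator helpers):

    #include <cstddef>
    #include <vector>

    static void Example(const float * weights)
    {
        std::vector<float> src(16 * 16 * 32), dst(16 * 16 * 64);
        size_t size = 0;
        // First call: no external buffer, so Data allocates internally and
        // writes the required workspace size (in bytes) to `size`.
        Avx512f::NeuralConvolutionForward(src.data(), 16, 16, 32, weights, 3, 3,
            1, 1, 1, 1, 1, 1, NULL, &size, dst.data(), 16, 16, 64, 0);
        // Later calls with the same shapes can reuse a caller-owned workspace
        // and skip the per-call allocation.
        void * buffer = SimdAllocate(size, SimdAlignment());
        Avx512f::NeuralConvolutionForward(src.data(), 16, 16, 32, weights, 3, 3,
            1, 1, 1, 1, 1, 1, buffer, &size, dst.data(), 16, 16, 64, 0);
        SimdFree(buffer);
    }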
-
-    void NeuralConvolutionForward(const float * src, size_t srcWidth, size_t srcHeight, size_t srcDepth,
-        const float * weight, size_t kernelX, size_t kernelY, size_t padX, size_t padY, size_t strideX, size_t strideY, size_t dilationX, size_t dilationY,
-        void * buffer, size_t * size, float * dst, size_t dstWidth, size_t dstHeight, size_t dstDepth, int add)
-    {
-        using namespace Ncf;
-
-        assert(dstWidth == (srcWidth + 2 * padX - (dilationX * (kernelX - 1) + 1)) / strideX + 1);
-        assert(dstHeight == (srcHeight + 2 * padY - (dilationY * (kernelY - 1) + 1)) / strideY + 1);
-
-        if (dstWidth < F && srcDepth <= 32)
-        {
-            Avx2::NeuralConvolutionForward(src, srcWidth, srcHeight, srcDepth, weight, kernelX, kernelY, padX, padY,
-                strideX, strideY, dilationX, dilationY, buffer, size, dst, dstWidth, dstHeight, dstDepth, add);
-            return;
-        }
-
-        if (!add)
-            memset(dst, 0, dstWidth*dstHeight*dstDepth * sizeof(float));
-
-        Opt opt(srcWidth, srcHeight, srcDepth, kernelX, kernelY, padX, padY, strideX, strideY, dilationX, dilationY, dstWidth, dstHeight, dstDepth);
-
-        Data data(opt.sizeA, opt.sizeB, opt.sizeT, buffer, size);
-
-        if (opt.sizeA)
-        {
-            switch (opt.alg)
-            {
-            case Opt::Ver1: Ver1::PrepareA(weight, opt.M, opt.K, opt.cellA, data.a); break;
-            default:
-                break;
-            }
-        }
-        else
-            data.a = (float*)weight;
-
-        if (opt.sizeB)
-        {
-            switch (opt.alg)
-            {
-            case Opt::Ver0: Ver0::PrepareB(src, srcWidth, srcHeight, srcDepth, kernelX, kernelY, padX, padY, strideX, strideY, dilationX, dilationY, dstWidth, dstHeight, data.b); break;
-            case Opt::Ver1: Ver1::PrepareB(src, srcWidth, srcHeight, srcDepth, kernelX, kernelY, padX, padY, strideX, strideY, dilationX, dilationY, dstWidth, dstHeight, opt.cellB, data.t, data.b); break;
-            case Opt::Ver2: Ver2::PrepareB(src, srcWidth, srcHeight, srcDepth, padX, padY, data.b, opt.paddedW, opt.paddedH); break;
-            default: break;
-            }
-        }
-        else
-            data.b = (float*)src;
-
-        switch (opt.alg)
-        {
-        case Opt::Ver0: Ver0::Execute(opt.M, opt.N, opt.K, data.a, data.b, dst); break;
-        case Opt::Ver1: Ver1::Execute(opt.M, opt.N, opt.K, data.a, data.b, dst, opt.cellA, opt.cellB); break;
-        case Opt::Ver2: Ver2::Execute(data.b, opt.paddedW, opt.paddedH, srcDepth, weight, kernelX, kernelY, dst, dstWidth, dstHeight, dstDepth); break;
-        default: break;
-        }
-    }
-    }
-#endif// SIMD_AVX512F_ENABLE
-}
diff --git a/src/3rd/Simd/Simd/SimdAvx512fResizer.cpp b/src/3rd/Simd/Simd/SimdAvx512fResizer.cpp
deleted file mode 100644
index 9017f0fe..00000000
--- a/src/3rd/Simd/Simd/SimdAvx512fResizer.cpp
+++ /dev/null
@@ -1,181 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2020 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdResizer.h" -#include "Simd/SimdStore.h" - -namespace Simd -{ -#ifdef SIMD_AVX512F_ENABLE - namespace Avx512f - { - const __m512i K64_PERMUTE_FOR_PACK = SIMD_MM512_SETR_EPI64(0, 2, 4, 6, 1, 3, 5, 7); - - ResizerFloatBilinear::ResizerFloatBilinear(const ResParam & param) - : Base::ResizerFloatBilinear(param) - { - } - - void ResizerFloatBilinear::Run(const float * src, size_t srcStride, float * dst, size_t dstStride) - { - size_t cn = _param.channels; - size_t rs = _param.dstW * cn; - float * pbx[2] = { _bx[0].data, _bx[1].data }; - int32_t prev = -2; - size_t rsa = AlignLo(rs, Avx512f::F); - __mmask16 tail = TailMask16(rs - rsa); - for (size_t dy = 0; dy < _param.dstH; dy++, dst += dstStride) - { - float fy1 = _ay[dy]; - float fy0 = 1.0f - fy1; - int32_t sy = _iy[dy]; - int32_t k = 0; - - if (sy == prev) - k = 2; - else if (sy == prev + 1) - { - Swap(pbx[0], pbx[1]); - k = 1; - } - - prev = sy; - - for (; k < 2; k++) - { - float * pb = pbx[k]; - const float * ps = src + (sy + k)*srcStride; - size_t dx = 0; - if (cn == 1) - { - __m512 _1 = _mm512_set1_ps(1.0f); - for (; dx < rsa; dx += Avx512f::F) - { - __m512i idx = _mm512_permutexvar_epi64(K64_PERMUTE_FOR_PACK, _mm512_load_si512(_ix.data + dx)); - __m512 sp0 = _mm512_castpd_ps(_mm512_i32gather_pd(_mm512_extracti64x4_epi64(idx, 0), (double*)ps, 4)); - __m512 sp1 = _mm512_castpd_ps(_mm512_i32gather_pd(_mm512_extracti64x4_epi64(idx, 1), (double*)ps, 4)); - __m512 fx1 = _mm512_load_ps(_ax.data + dx); - __m512 fx0 = _mm512_sub_ps(_1, fx1); - __m512 s0 = _mm512_shuffle_ps(sp0, sp1, 0x88); - __m512 s1 = _mm512_shuffle_ps(sp0, sp1, 0xDD); - _mm512_store_ps(pb + dx, _mm512_fmadd_ps(s0, fx0, _mm512_mul_ps(s1, fx1))); - } - if (dx < rs) - { - __m512i idx = _mm512_permutexvar_epi64(K64_PERMUTE_FOR_PACK, _mm512_maskz_load_epi32(tail, _ix.data + dx)); - __m512 sp0 = _mm512_castpd_ps(_mm512_i32gather_pd(_mm512_extracti64x4_epi64(idx, 0), (double*)ps, 4)); - __m512 sp1 = _mm512_castpd_ps(_mm512_i32gather_pd(_mm512_extracti64x4_epi64(idx, 1), (double*)ps, 4)); - __m512 fx1 = _mm512_maskz_load_ps(tail, _ax.data + dx); - __m512 fx0 = _mm512_sub_ps(_1, fx1); - __m512 s0 = _mm512_shuffle_ps(sp0, sp1, 0x88); - __m512 s1 = _mm512_shuffle_ps(sp0, sp1, 0xDD); - _mm512_mask_store_ps(pb + dx, tail, _mm512_fmadd_ps(s0, fx0, _mm512_mul_ps(s1, fx1))); - } - } - else if (cn == 3 && rs > 3) - { - __m256 _1 = _mm256_set1_ps(1.0f); - size_t rs3 = rs - 3; - size_t rs6 = AlignLoAny(rs3, 6); - for (; dx < rs6; dx += 6) - { - __m256 s0 = Avx::Load(ps + _ix[dx + 0] + 0, ps + _ix[dx + 3] + 0); - __m256 s1 = Avx::Load(ps + _ix[dx + 0] + 3, ps + _ix[dx + 3] + 3); - __m256 fx1 = Avx::Load(_ax.data + dx + 0, _ax.data + dx + 3); - __m256 fx0 = _mm256_sub_ps(_1, fx1); - Avx::Store(pb + dx + 0, pb + dx + 3, _mm256_fmadd_ps(fx0, s0, _mm256_mul_ps(fx1, s1))); - } - for (; dx < rs3; dx += 3) - { - __m128 s0 = _mm_loadu_ps(ps + _ix[dx] + 0); - __m128 s1 = _mm_loadu_ps(ps + _ix[dx] + 3); - __m128 fx1 = _mm_set1_ps(_ax.data[dx]); - __m128 fx0 = _mm_sub_ps(_mm256_castps256_ps128(_1), fx1); - _mm_storeu_ps(pb + dx, _mm_add_ps(_mm_mul_ps(fx0, s0), _mm_mul_ps(fx1, s1))); - } - for (; dx < rs; dx++) - { - int32_t sx = _ix[dx]; - float fx = _ax[dx]; - pb[dx] = ps[sx] * (1.0f - fx) + ps[sx + cn] * fx; - } - } - else - { - __m512 _1 = _mm512_set1_ps(1.0f); - __m512i _cn = _mm512_set1_epi32((int)cn); - for (; dx < rsa; dx += Avx512f::F) - { - __m512i i0 = _mm512_load_si512(_ix.data + dx); - __m512i i1 = _mm512_add_epi32(i0, _cn); - __m512 s0 = 
_mm512_i32gather_ps(i0, ps, 4); - __m512 s1 = _mm512_i32gather_ps(i1, ps, 4); - __m512 fx1 = _mm512_load_ps(_ax.data + dx); - __m512 fx0 = _mm512_sub_ps(_1, fx1); - _mm512_store_ps(pb + dx, _mm512_fmadd_ps(s0, fx0, _mm512_mul_ps(s1, fx1))); - } - if (dx < rs) - { - __m512i i0 = _mm512_maskz_load_epi32(tail, _ix.data + dx); - __m512i i1 = _mm512_add_epi32(i0, _cn); - __m512 s0 = _mm512_i32gather_ps(i0, ps, 4); - __m512 s1 = _mm512_i32gather_ps(i1, ps, 4); - __m512 fx1 = _mm512_maskz_load_ps(tail, _ax.data + dx); - __m512 fx0 = _mm512_sub_ps(_1, fx1); - _mm512_mask_store_ps(pb + dx, tail, _mm512_fmadd_ps(s0, fx0, _mm512_mul_ps(s1, fx1))); - } - } - } - size_t dx = 0; - __m512 _fy0 = _mm512_set1_ps(fy0); - __m512 _fy1 = _mm512_set1_ps(fy1); - for (; dx < rsa; dx += Avx512f::F) - { - __m512 b0 = Load(pbx[0] + dx); - __m512 b1 = Load(pbx[1] + dx); - Store(dst + dx, _mm512_fmadd_ps(b0, _fy0, _mm512_mul_ps(b1, _fy1))); - } - if (dx < rs) - { - __m512 b0 = Load(pbx[0] + dx, tail); - __m512 b1 = Load(pbx[1] + dx, tail); - Store(dst + dx, _mm512_fmadd_ps(b0, _fy0, _mm512_mul_ps(b1, _fy1)), tail); - } - } - } - - //--------------------------------------------------------------------- - - void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method) - { - ResParam param(srcX, srcY, dstX, dstY, channels, type, method, sizeof(__m512)); - if (param.IsFloatBilinear()) - return new ResizerFloatBilinear(param); - else - return Avx2::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method); - } - } -#endif //SIMD_AVX512f_ENABLE -} - diff --git a/src/3rd/Simd/Simd/SimdAvx512fSquaredDifferenceSum.cpp b/src/3rd/Simd/Simd/SimdAvx512fSquaredDifferenceSum.cpp deleted file mode 100644 index ac5f7bd7..00000000 --- a/src/3rd/Simd/Simd/SimdAvx512fSquaredDifferenceSum.cpp +++ /dev/null @@ -1,139 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/
-#include "Simd/SimdMemory.h"
-#include "Simd/SimdStore.h"
-#include "Simd/SimdExtract.h"
-
-namespace Simd
-{
-#ifdef SIMD_AVX512F_ENABLE
-    namespace Avx512f
-    {
-        template <bool align, bool mask> SIMD_INLINE void SquaredDifferenceSum32f(const float * a, const float * b, size_t offset, __m512 & sum, __mmask16 tail = -1)
-        {
-            __m512 _a = Load<align, mask>(a + offset, tail);
-            __m512 _b = Load<align, mask>(b + offset, tail);
-            __m512 _d = _mm512_sub_ps(_a, _b);
-            sum = _mm512_fmadd_ps(_d, _d, sum);
-        }
-
-        template <bool align> void SquaredDifferenceSum32f(const float * a, const float * b, size_t size, float * sum)
-        {
-            if (align)
-                assert(Aligned(a) && Aligned(b));
-
-            *sum = 0;
-            size_t alignedSize = AlignLo(size, F);
-            __mmask16 tailMask = TailMask16(size - alignedSize);
-            size_t fullAlignedSize = AlignLo(size, QF);
-            size_t i = 0;
-            __m512 sums[4] = { _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps() };
-            if (fullAlignedSize)
-            {
-                for (; i < fullAlignedSize; i += QF)
-                {
-                    SquaredDifferenceSum32f<align, false>(a, b, i + 0 * F, sums[0]);
-                    SquaredDifferenceSum32f<align, false>(a, b, i + 1 * F, sums[1]);
-                    SquaredDifferenceSum32f<align, false>(a, b, i + 2 * F, sums[2]);
-                    SquaredDifferenceSum32f<align, false>(a, b, i + 3 * F, sums[3]);
-                }
-                sums[0] = _mm512_add_ps(_mm512_add_ps(sums[0], sums[1]), _mm512_add_ps(sums[2], sums[3]));
-            }
-            for (; i < alignedSize; i += F)
-                SquaredDifferenceSum32f<align, false>(a, b, i, sums[0]);
-#if defined (NDEBUG) && defined(_MSC_VER)
-            *sum = ExtractSum(sums[0]);
-            for (; i < size; ++i)
-                *sum += Simd::Square(a[i] - b[i]);
-#else
-            if (i < size)
-                SquaredDifferenceSum32f<align, true>(a, b, i, sums[0], tailMask);
-            *sum = ExtractSum(sums[0]);
-#endif
-        }
-
-        void SquaredDifferenceSum32f(const float * a, const float * b, size_t size, float * sum)
-        {
-            if (Aligned(a) && Aligned(b))
-                SquaredDifferenceSum32f<true>(a, b, size, sum);
-            else
-                SquaredDifferenceSum32f<false>(a, b, size, sum);
-        }
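The compensated (Kahan) variant that follows carries a per-lane correction term so that low-order bits lost in the running sum are fed back into the next addition; _mm512_fmsub_ps computes d*d - correction in a single rounding. A scalar sketch of the same recurrence, under a hypothetical name, mirroring the per-lane operations of the vector code below:

    #include <cstddef>

    // Kahan (compensated) summation of squared differences: `correction`
    // captures the low-order bits lost when `term` is folded into `sum`,
    // and subtracts that error from the next term.
    static float SquaredDifferenceKahanSum(const float * a, const float * b, size_t size)
    {
        float sum = 0.0f, correction = 0.0f;
        for (size_t i = 0; i < size; ++i)
        {
            float d = a[i] - b[i];
            float term = d * d - correction;   // feed the running error back in
            float temp = sum + term;           // low bits of term may be lost here
            correction = (temp - sum) - term;  // ...and are recovered for the next step
            sum = temp;
        }
        return sum;
    }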
-
-        template <bool align, bool mask> SIMD_INLINE void SquaredDifferenceKahanSum32f(const float * a, const float * b, size_t offset, __m512 & sum, __m512 & correction, __mmask16 tail = -1)
-        {
-            __m512 _a = Load<align, mask>(a + offset, tail);
-            __m512 _b = Load<align, mask>(b + offset, tail);
-            __m512 _d = _mm512_sub_ps(_a, _b);
-            __m512 term = _mm512_fmsub_ps(_d, _d, correction);
-            __m512 temp = _mm512_add_ps(sum, term);
-            correction = _mm512_sub_ps(_mm512_sub_ps(temp, sum), term);
-            sum = temp;
-        }
-
-        template <bool align> void SquaredDifferenceKahanSum32f(const float * a, const float * b, size_t size, float * sum)
-        {
-            if (align)
-                assert(Aligned(a) && Aligned(b));
-
-            size_t alignedSize = AlignLo(size, F);
-            __mmask16 tailMask = TailMask16(size - alignedSize);
-            size_t fullAlignedSize = AlignLo(size, QF);
-            size_t i = 0;
-            __m512 sums[4] = { _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps() };
-            __m512 corrections[4] = { _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps() };
-            if (fullAlignedSize)
-            {
-                for (; i < fullAlignedSize; i += QF)
-                {
-                    SquaredDifferenceKahanSum32f<align, false>(a, b, i + 0 * F, sums[0], corrections[0]);
-                    SquaredDifferenceKahanSum32f<align, false>(a, b, i + 1 * F, sums[1], corrections[1]);
-                    SquaredDifferenceKahanSum32f<align, false>(a, b, i + 2 * F, sums[2], corrections[2]);
-                    SquaredDifferenceKahanSum32f<align, false>(a, b, i + 3 * F, sums[3], corrections[3]);
-                }
-                sums[0] = _mm512_add_ps(_mm512_add_ps(sums[0], sums[1]), _mm512_add_ps(sums[2], sums[3]));
-            }
-            for (; i < alignedSize; i += F)
-                SquaredDifferenceKahanSum32f<align, false>(a, b, i, sums[0], corrections[0]);
-#if defined (NDEBUG) && defined(_MSC_VER)
-            *sum = ExtractSum(sums[0]);
-            for (; i < size; ++i)
-                *sum += Simd::Square(a[i] - b[i]);
-#else
-            if (i < size)
-                SquaredDifferenceKahanSum32f<align, true>(a, b, i, sums[0], corrections[0], tailMask);
-            *sum = ExtractSum(sums[0]);
-#endif
-        }
-
-        void SquaredDifferenceKahanSum32f(const float * a, const float * b, size_t size, float * sum)
-        {
-            if (Aligned(a) && Aligned(b))
-                SquaredDifferenceKahanSum32f<true>(a, b, size, sum);
-            else
-                SquaredDifferenceKahanSum32f<false>(a, b, size, sum);
-        }
-    }
-#endif// SIMD_AVX512F_ENABLE
-}
diff --git a/src/3rd/Simd/Simd/SimdAvx512fSvm.cpp b/src/3rd/Simd/Simd/SimdAvx512fSvm.cpp
deleted file mode 100644
index 709e0950..00000000
--- a/src/3rd/Simd/Simd/SimdAvx512fSvm.cpp
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#include "Simd/SimdMemory.h"
-#include "Simd/SimdStore.h"
-#include "Simd/SimdExtract.h"
-
-namespace Simd
-{
-#ifdef SIMD_AVX512F_ENABLE
-    namespace Avx512f
-    {
-        namespace
-        {
-            struct Buffer
-            {
-                Buffer(size_t count)
-                {
-                    size_t size = sizeof(float)*count;
-                    _p = Allocate(size);
-                    memset(_p, 0, size);
-                    sums = (float*)_p;
-                }
-
-                ~Buffer()
-                {
-                    Free(_p);
-                }
-
-                float * sums;
-            private:
-                void *_p;
-            };
-        }
-
-        void SvmSumLinear(const float * x, const float * svs, const float * weights, size_t length, size_t count, float * sum)
-        {
-            Buffer buffer(count);
-            size_t alignedCount = AlignLo(count, F);
-            __mmask16 tailMask = TailMask16(count - alignedCount);
-
-            for (size_t j = 0; j < length; ++j)
-            {
-                size_t i = 0;
-                float v = x[j];
-                __m512 _v = _mm512_set1_ps(v);
-                for (; i < alignedCount; i += F)
-                    Store<true>(buffer.sums + i, _mm512_fmadd_ps(_v, Load<false>(svs + i), Load<true>(buffer.sums + i)));
-                if (i < count)
-                    Store<true, true>(buffer.sums + i, _mm512_fmadd_ps(_v, (Load<false, true>(svs + i, tailMask)), (Load<true, true>(buffer.sums + i, tailMask))), tailMask);
-                svs += count;
-            }
-
-            size_t i = 0;
-            __m512 _sum = _mm512_setzero_ps();
-            for (; i < alignedCount; i += F)
-                _sum = _mm512_fmadd_ps(Load<true>(buffer.sums + i), Load<false>(weights + i), _sum);
-            if (i < count)
-                _sum = _mm512_fmadd_ps((Load<true, true>(buffer.sums + i, tailMask)), (Load<false, true>(weights + i, tailMask)), _sum);
-            *sum = ExtractSum(_sum);
-        }
-    }
-#endif// SIMD_AVX512F_ENABLE
-}
diff --git a/src/3rd/Simd/Simd/SimdAvx512fSynet.cpp b/src/3rd/Simd/Simd/SimdAvx512fSynet.cpp
deleted file mode 100644
index d4a724d8..00000000
--- a/src/3rd/Simd/Simd/SimdAvx512fSynet.cpp
+++ /dev/null
@@ -1,1244 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2020 Yermalayeu Ihar.
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdExtract.h" -#include "Simd/SimdSynet.h" -#include "Simd/SimdPow.h" -#include "Simd/SimdExp.h" -#include "Simd/SimdBase.h" -#include "Simd/SimdSse1.h" -#include "Simd/SimdAvx1.h" -#include "Simd/SimdAvx2.h" -#include "Simd/SimdAvx512f.h" -#include "Simd/SimdArray.h" - -namespace Simd -{ -#ifdef SIMD_AVX512F_ENABLE - namespace Avx512f - { - template SIMD_INLINE void SynetAddBias(const __m512 & bias, float * dst, __mmask16 tail = -1) - { - Store(dst, _mm512_add_ps((Load(dst, tail)), bias), tail); - } - - template SIMD_INLINE void SynetAddBias(const float * bias, float * dst, __mmask16 tail = -1) - { - __m512 _bias = Load(bias, tail); - __m512 _dst = Load(dst, tail); - Store(dst, _mm512_add_ps(_dst, _bias), tail); - } - - template void SynetAddBiasNchw(const float * bias, size_t channels, size_t spatial, float * dst) - { - if (align) - assert(Aligned(spatial, F) && Aligned(dst)); - - size_t aligned = AlignLo(spatial, QF); - size_t partial = AlignLo(spatial, F); - __mmask16 tail = TailMask16(spatial - partial); - for (size_t c = 0; c < channels; ++c) - { - size_t s = 0; - __m512 _bias = _mm512_set1_ps(bias[c]); - for (; s < aligned; s += QF) - { - SynetAddBias(_bias, dst + s + F * 0); - SynetAddBias(_bias, dst + s + F * 1); - SynetAddBias(_bias, dst + s + F * 2); - SynetAddBias(_bias, dst + s + F * 3); - } - for (; s < partial; s += F) - SynetAddBias(_bias, dst + s); - if (s < spatial) - SynetAddBias(_bias, dst + s, tail); - dst += spatial; - } - } - - SIMD_INLINE void SynetAddBiasNchw(const float * bias, size_t channels, size_t spatial, float * dst) - { - if (Aligned(spatial, F) && Aligned(dst)) - SynetAddBiasNchw(bias, channels, spatial, dst); - else - SynetAddBiasNchw(bias, channels, spatial, dst); - } - - template void SynetAddBiasNhwc(const float * bias, size_t channels, size_t spatial, float * dst) - { - if (align) - assert(Aligned(channels, F) && Aligned(bias) && Aligned(dst)); - - size_t aligned = AlignLo(channels, QF); - size_t partial = AlignLo(channels, F); - __mmask16 tail = TailMask16(channels - partial); - for (size_t s = 0; s < spatial; ++s) - { - size_t c = 0; - for (; c < aligned; c += QF) - { - SynetAddBias(bias + c + F * 0, dst + c + F * 0); - SynetAddBias(bias + c + F * 1, dst + c + F * 1); - SynetAddBias(bias + c + F * 2, dst + c + F * 2); - SynetAddBias(bias + c + F * 3, dst + c + F * 3); - } - for (; c < partial; c += F) - 
SynetAddBias(bias + c, dst + c); - if (c < channels) - SynetAddBias(bias + c, dst + c, tail); - dst += channels; - } - } - - SIMD_INLINE void SynetAddBiasNhwc(const float * bias, size_t channels, size_t spatial, float * dst) - { - if (Aligned(bias) && Aligned(channels, F) && Aligned(dst)) - SynetAddBiasNhwc(bias, channels, spatial, dst); - else - SynetAddBiasNhwc(bias, channels, spatial, dst); - } - - template void SynetAddBiasNchw16c(const float * bias, size_t channels, size_t spatial, float * dst) - { - if (align) - assert(Aligned(dst)); - - size_t spatial4 = AlignLo(spatial, 4); - for (size_t c = 0; c < channels; c += F) - { - __m512 _bias = Load(bias + c); - size_t s = 0; - for (; s < spatial4; s += 4, dst += 4 * F) - { - SynetAddBias(_bias, dst + 0 * F); - SynetAddBias(_bias, dst + 1 * F); - SynetAddBias(_bias, dst + 2 * F); - SynetAddBias(_bias, dst + 3 * F); - } - for (; s < spatial; ++s, dst += F) - SynetAddBias(_bias, dst); - } - } - - SIMD_INLINE void SynetAddBiasNchw16c(const float * bias, size_t channels, size_t spatial, float * dst) - { - if (Aligned(dst)) - SynetAddBiasNchw16c(bias, channels, spatial, dst); - else - SynetAddBiasNchw16c(bias, channels, spatial, dst); - } - - void SynetAddBias(const float * bias, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format) - { - if (Base::NchwCompatible(channels, spatial, format)) - SynetAddBiasNchw(bias, channels, spatial, dst); - else if (Base::NhwcCompatible(channels, spatial, format)) - SynetAddBiasNhwc(bias, channels, spatial, dst); - else if (format == SimdTensorFormatNchw4c) - Sse::SynetAddBias(bias, channels, spatial, dst, format); - else if (format == SimdTensorFormatNchw8c) - Avx::SynetAddBias(bias, channels, spatial, dst, format); - else if (format == SimdTensorFormatNchw16c) - SynetAddBiasNchw16c(bias, channels, spatial, dst); - else - Base::SynetAddBias(bias, channels, spatial, dst, format); - } - - //--------------------------------------------------------------------- - - template __m512 SynetEltwiseLayerForward(__m512 src0, __m512 src1); - - template <> SIMD_INLINE __m512 SynetEltwiseLayerForward(__m512 src0, __m512 src1) - { - return _mm512_mul_ps(src0, src1); - } - - template <> SIMD_INLINE __m512 SynetEltwiseLayerForward(__m512 src0, __m512 src1) - { - return _mm512_max_ps(src0, src1); - } - - template <> SIMD_INLINE __m512 SynetEltwiseLayerForward(__m512 src0, __m512 src1) - { - return _mm512_min_ps(src0, src1); - } - - template SIMD_INLINE void SynetEltwiseLayerForward(const float * src0, const float * src1, float * dst, size_t offset, __mmask16 tail = -1) - { - Store(dst + offset, SynetEltwiseLayerForward((Load(src0 + offset, tail)), (Load(src1 + offset, tail))), tail); - } - - template void SynetEltwiseLayerForward(float const * const * src, size_t count, size_t size, float * dst) - { - size_t aligned = AlignLo(size, QF); - size_t partial = AlignLo(size, F); - __mmask16 tail = __mmask16(-1) >> (F + partial - size); - const float * src0 = src[0]; - const float * src1 = src[1]; - size_t j = 0; - for (; j < aligned; j += QF) - { - SynetEltwiseLayerForward(src0, src1, dst, j + F * 0); - SynetEltwiseLayerForward(src0, src1, dst, j + F * 1); - SynetEltwiseLayerForward(src0, src1, dst, j + F * 2); - SynetEltwiseLayerForward(src0, src1, dst, j + F * 3); - } - for (; j < partial; j += F) - SynetEltwiseLayerForward(src0, src1, dst, j); - if (j < size) - SynetEltwiseLayerForward(src0, src1, dst, j, tail); - for (size_t i = 2; i < count; ++i) - { - const float * srci = src[i]; - for (j = 0; j < aligned; 
j += QF) - { - SynetEltwiseLayerForward(dst, srci, dst, j + F * 0); - SynetEltwiseLayerForward(dst, srci, dst, j + F * 1); - SynetEltwiseLayerForward(dst, srci, dst, j + F * 2); - SynetEltwiseLayerForward(dst, srci, dst, j + F * 3); - } - for (; j < partial; j += F) - SynetEltwiseLayerForward(dst, srci, dst, j); - if (j < size) - SynetEltwiseLayerForward(dst, srci, dst, j, tail); - } - } - - template void SynetEltwiseLayerForwardSum(const float * src0, const __m512 & weight0, const float * src1, const __m512 & weight1, float * dst, size_t offset, __mmask16 tail = -1) - { - Store(dst + offset, _mm512_fmadd_ps((Load(src0 + offset, tail)), weight0, _mm512_mul_ps((Load(src1 + offset, tail)), weight1)), tail); - } - - template void SynetEltwiseLayerForwardSum(const float * src, const __m512 & weight, float * dst, size_t offset, __mmask16 tail = -1) - { - Store(dst + offset, _mm512_fmadd_ps((Load(src + offset, tail)), weight, (Load(dst + offset, tail))), tail); - } - - template void SynetEltwiseLayerForwardSum(float const * const * src, const float * weight, size_t count, size_t size, float * dst) - { - size_t aligned = AlignLo(size, QF); - size_t partial = AlignLo(size, F); - __mmask16 tail = __mmask16(-1) >> (F + partial - size); - const float * src0 = src[0]; - const float * src1 = src[1]; - __m512 weight0 = _mm512_set1_ps(weight[0]); - __m512 weight1 = _mm512_set1_ps(weight[1]); - size_t j = 0; - for (; j < aligned; j += QF) - { - SynetEltwiseLayerForwardSum(src0, weight0, src1, weight1, dst, j + F * 0); - SynetEltwiseLayerForwardSum(src0, weight0, src1, weight1, dst, j + F * 1); - SynetEltwiseLayerForwardSum(src0, weight0, src1, weight1, dst, j + F * 2); - SynetEltwiseLayerForwardSum(src0, weight0, src1, weight1, dst, j + F * 3); - } - for (; j < partial; j += F) - SynetEltwiseLayerForwardSum(src0, weight0, src1, weight1, dst, j); - if (j < size) - SynetEltwiseLayerForwardSum(src0, weight0, src1, weight1, dst, j, tail); - for (size_t i = 2; i < count; ++i) - { - const float * srci = src[i]; - __m512 weighti = _mm512_set1_ps(weight[i]); - for (j = 0; j < aligned; j += QF) - { - SynetEltwiseLayerForwardSum(srci, weighti, dst, j + F * 0); - SynetEltwiseLayerForwardSum(srci, weighti, dst, j + F * 1); - SynetEltwiseLayerForwardSum(srci, weighti, dst, j + F * 2); - SynetEltwiseLayerForwardSum(srci, weighti, dst, j + F * 3); - } - for (; j < partial; j += F) - SynetEltwiseLayerForwardSum(srci, weighti, dst, j); - if (j < size) - SynetEltwiseLayerForwardSum(srci, weighti, dst, j, tail); - } - } - - template void SynetEltwiseLayerForward(float const * const * src, const float * weight, size_t count, size_t size, SimdSynetEltwiseOperationType type, float * dst) - { - switch (type) - { - case SimdSynetEltwiseOperationProduct: - SynetEltwiseLayerForward(src, count, size, dst); - break; - case SimdSynetEltwiseOperationSum: - SynetEltwiseLayerForwardSum(src, weight, count, size, dst); - break; - case SimdSynetEltwiseOperationMax: - SynetEltwiseLayerForward(src, count, size, dst); - break; - case SimdSynetEltwiseOperationMin: - SynetEltwiseLayerForward(src, count, size, dst); - break; - default: - assert(0); - } - } - - void SynetEltwiseLayerForward(float const * const * src, const float * weight, size_t count, size_t size, SimdSynetEltwiseOperationType type, float * dst) - { - assert(count >= 2); - bool aligned = Aligned(dst) && Aligned(src[0]) && Aligned(src[1]); - for (size_t i = 2; i < count; ++i) - aligned = aligned && Aligned(src[i]); - if (aligned) - SynetEltwiseLayerForward(src, weight, count, 
size, type, dst); - else - SynetEltwiseLayerForward(src, weight, count, size, type, dst); - } - - //--------------------------------------------------------------------- - - void SynetInnerProductLayerForward1(const float * S0, const float * W, const float * B, size_t K, float * D) - { - size_t K16 = K & (~15); - size_t K64 = K & (~63); - const float * W0 = W + 0 * K; - __m512 d00, d01, d02, d03; - __m512 s0, s1, s2, s3, w0, w1, w2, w3; - size_t k = 0; - d00 = _mm512_setzero_ps(); - if (K64) - { - d01 = _mm512_setzero_ps(); - d02 = _mm512_setzero_ps(); - d03 = _mm512_setzero_ps(); - for (; k < K64; k += 64) - { - s0 = _mm512_loadu_ps(S0 + k + 0 * F); - s1 = _mm512_loadu_ps(S0 + k + 1 * F); - w0 = _mm512_loadu_ps(W0 + k + 0 * F); - w1 = _mm512_loadu_ps(W0 + k + 1 * F); - d00 = _mm512_fmadd_ps(s0, w0, d00); - d01 = _mm512_fmadd_ps(s1, w1, d01); - s2 = _mm512_loadu_ps(S0 + k + 2 * F); - s3 = _mm512_loadu_ps(S0 + k + 3 * F); - w2 = _mm512_loadu_ps(W0 + k + 2 * F); - w3 = _mm512_loadu_ps(W0 + k + 3 * F); - d02 = _mm512_fmadd_ps(s2, w2, d02); - d03 = _mm512_fmadd_ps(s3, w3, d03); - } - d00 = _mm512_add_ps(_mm512_add_ps(d00, d01), _mm512_add_ps(d02, d03)); - } - for (; k < K16; k += 16) - { - s0 = _mm512_loadu_ps(S0 + k); - w0 = _mm512_loadu_ps(W0 + k); - d00 = _mm512_fmadd_ps(s0, w0, d00); - } - if (k < K) - { - __mmask16 tail = __mmask16(-1) >> (16 + k - K); - s0 = _mm512_maskz_loadu_ps(tail, S0 + k); - w0 = _mm512_maskz_loadu_ps(tail, W0 + k); - d00 = _mm512_fmadd_ps(s0, w0, d00); - } - D[0] = Avx512f::ExtractSum(d00) + B[0]; - } - - void SynetInnerProductLayerForward4(const float * S0, const float * W, const float * B, size_t K, float * D) - { - size_t K16 = K & (~15); - size_t K32 = K & (~31); - const float * W0 = W + 0 * K; - const float * W1 = W + 1 * K; - const float * W2 = W + 2 * K; - const float * W3 = W + 3 * K; - __m512 d00, d01, d10, d11, d20, d21, d30, d31; - __m512 s0, s1, w0, w1; - size_t k = 0; - d00 = _mm512_setzero_ps(); - d10 = _mm512_setzero_ps(); - d20 = _mm512_setzero_ps(); - d30 = _mm512_setzero_ps(); - if (K32) - { - d01 = _mm512_setzero_ps(); - d11 = _mm512_setzero_ps(); - d21 = _mm512_setzero_ps(); - d31 = _mm512_setzero_ps(); - for (; k < K32; k += 32) - { - s0 = _mm512_loadu_ps(S0 + k + 0 * F); - s1 = _mm512_loadu_ps(S0 + k + 1 * F); - w0 = _mm512_loadu_ps(W0 + k + 0 * F); - w1 = _mm512_loadu_ps(W0 + k + 1 * F); - d00 = _mm512_fmadd_ps(s0, w0, d00); - d01 = _mm512_fmadd_ps(s1, w1, d01); - w0 = _mm512_loadu_ps(W1 + k + 0 * F); - w1 = _mm512_loadu_ps(W1 + k + 1 * F); - d10 = _mm512_fmadd_ps(s0, w0, d10); - d11 = _mm512_fmadd_ps(s1, w1, d11); - w0 = _mm512_loadu_ps(W2 + k + 0 * F); - w1 = _mm512_loadu_ps(W2 + k + 1 * F); - d20 = _mm512_fmadd_ps(s0, w0, d20); - d21 = _mm512_fmadd_ps(s1, w1, d21); - w0 = _mm512_loadu_ps(W3 + k + 0 * F); - w1 = _mm512_loadu_ps(W3 + k + 1 * F); - d30 = _mm512_fmadd_ps(s0, w0, d30); - d31 = _mm512_fmadd_ps(s1, w1, d31); - } - d00 = _mm512_add_ps(d00, d01); - d10 = _mm512_add_ps(d10, d11); - d20 = _mm512_add_ps(d20, d21); - d30 = _mm512_add_ps(d30, d31); - } - for (; k < K16; k += 16) - { - s0 = _mm512_loadu_ps(S0 + k + 0 * F); - w0 = _mm512_loadu_ps(W0 + k + 0 * F); - d00 = _mm512_fmadd_ps(s0, w0, d00); - w0 = _mm512_loadu_ps(W1 + k + 0 * F); - d10 = _mm512_fmadd_ps(s0, w0, d10); - w0 = _mm512_loadu_ps(W2 + k + 0 * F); - d20 = _mm512_fmadd_ps(s0, w0, d20); - w0 = _mm512_loadu_ps(W3 + k + 0 * F); - d30 = _mm512_fmadd_ps(s0, w0, d30); - } - if (k < K) - { - __mmask16 tail = __mmask16(-1) >> (16 + k - K); - s0 = _mm512_maskz_loadu_ps(tail, S0 + 
k); - w0 = _mm512_maskz_loadu_ps(tail, W0 + k); - d00 = _mm512_fmadd_ps(s0, w0, d00); - w0 = _mm512_maskz_loadu_ps(tail, W1 + k); - d10 = _mm512_fmadd_ps(s0, w0, d10); - w0 = _mm512_maskz_loadu_ps(tail, W2 + k); - d20 = _mm512_fmadd_ps(s0, w0, d20); - w0 = _mm512_maskz_loadu_ps(tail, W3 + k); - d30 = _mm512_fmadd_ps(s0, w0, d30); - } - _mm_storeu_ps(D, _mm_add_ps(Avx512f::Extract4Sums(d00, d10, d20, d30), _mm_loadu_ps(B))); - } - - void SynetInnerProductLayerForward(const float * src, const float * weight, const float * bias, size_t count, size_t size, float * dst) - { - float _bias[4] = { 0, 0, 0, 0 }; - size_t count4 = AlignLo(count, 4); - size_t i = 0; - for (; i < count4; i += 4) - SynetInnerProductLayerForward4(src, weight + i * size, (bias ? bias + i : _bias), size, dst + i); - for (; i < count; ++i) - SynetInnerProductLayerForward1(src, weight + i * size, (bias ? bias + i : _bias), size, dst + i); - } - - //--------------------------------------------------------------------- - - SIMD_INLINE __m512 NoseSquareSum(const float * src) - { - __m512 s0 = _mm512_maskz_loadu_ps(0xFFFC, src - 2); - __m512 s1 = _mm512_maskz_loadu_ps(0xFFFE, src - 1); - __m512 s2 = _mm512_loadu_ps(src); - __m512 s3 = _mm512_loadu_ps(src + 1); - __m512 s4 = _mm512_loadu_ps(src + 2); - return _mm512_add_ps(_mm512_fmadd_ps(s0, s0, _mm512_mul_ps(s1, s1)), _mm512_fmadd_ps(s2, s2, _mm512_fmadd_ps(s3, s3, _mm512_mul_ps(s4, s4)))); - } - - SIMD_INLINE __m512 BodySquareSum(const float * src) - { - __m512 s0 = _mm512_loadu_ps(src - 2); - __m512 s1 = _mm512_loadu_ps(src - 1); - __m512 s2 = _mm512_loadu_ps(src); - __m512 s3 = _mm512_loadu_ps(src + 1); - __m512 s4 = _mm512_loadu_ps(src + 2); - return _mm512_add_ps(_mm512_fmadd_ps(s0, s0, _mm512_mul_ps(s1, s1)), _mm512_fmadd_ps(s2, s2, _mm512_fmadd_ps(s3, s3, _mm512_mul_ps(s4, s4)))); - } - - SIMD_INLINE __m512 TailSquareSum(const float * src) - { - __m512 s0 = _mm512_loadu_ps(src - 2); - __m512 s1 = _mm512_loadu_ps(src - 1); - __m512 s2 = _mm512_loadu_ps(src); - __m512 s3 = _mm512_maskz_loadu_ps(0x7FFF, src + 1); - __m512 s4 = _mm512_maskz_loadu_ps(0x3FFF, src + 2); - return _mm512_add_ps(_mm512_fmadd_ps(s0, s0, _mm512_mul_ps(s1, s1)), _mm512_fmadd_ps(s2, s2, _mm512_fmadd_ps(s3, s3, _mm512_mul_ps(s4, s4)))); - } - - template void SynetLrnLayerCrossChannelsNchw(const float * src, size_t half, size_t channels, size_t spatial, const float * k, float * dst) - { - __m512 k0 = _mm512_set1_ps(k[0]); - __m512 k1 = _mm512_set1_ps(k[1]); - __m512 k2 = _mm512_set1_ps(k[2]); - Avx512f::Pow pow; - Array32f sum(spatial, true), zero(spatial, true); - size_t aligned = AlignLo(spatial, F); - __mmask16 tail = TailMask16(spatial - aligned); - for (size_t c = 0; c < half; ++c) - { - const float * pos = src + c * spatial; - size_t s = 0; - for (; s < aligned; s += F) - { - __m512 _pos = Avx512f::Load(pos + s); - Avx512f::Store(sum.data + s, _mm512_fmadd_ps(_pos, _pos, Avx512f::Load(sum.data + s))); - } - if (s < spatial) - { - __m512 _pos = Avx512f::Load(pos + s, tail); - __m512 _sum = Avx512f::Load(sum.data + s, tail); - Avx512f::Store(sum.data + s, _mm512_fmadd_ps(_pos, _pos, _sum), tail); - } - } - for (size_t c = 0; c < channels; ++c) - { - const float * pos = (c < channels - half) ? src + half * spatial : zero.data; - const float * neg = (c > half) ? 
src - (half + 1) * spatial : zero.data; - size_t s = 0; - for (; s < aligned; s += F) - { - __m512 _pos = Avx512f::Load(pos + s); - __m512 _neg = Avx512f::Load(neg + s); - __m512 _sum = Avx512f::Load(sum.data + s); - _sum = _mm512_fmadd_ps(_pos, _pos, _mm512_fnmadd_ps(_neg, _neg, _sum)); - __m512 _src = Avx512f::Load(src + s); - Avx512f::Store(sum.data + s, _sum); - Avx512f::Store(dst + s, _mm512_mul_ps(_src, pow(_mm512_fmadd_ps(k1, _sum, k0), k2))); - } - if (s < spatial) - { - __m512 _pos = Avx512f::Load(pos + s, tail); - __m512 _neg = Avx512f::Load(neg + s, tail); - __m512 _sum = Avx512f::Load(sum.data + s, tail); - _sum = _mm512_fmadd_ps(_pos, _pos, _mm512_fnmadd_ps(_neg, _neg, _sum)); - __m512 _src = Avx512f::Load(src + s, tail); - Avx512f::Store(sum.data + s, _sum, tail); - Avx512f::Store(dst + s, _mm512_mul_ps(_src, pow(_mm512_fmadd_ps(k1, _sum, k0), k2)), tail); - } - src += spatial; - dst += spatial; - } - } - - SIMD_INLINE void SynetLrnLayerCrossChannelsNchw(const float * src, size_t half, size_t channels, size_t spatial, const float * k, float * dst) - { - if (Aligned(src) && Aligned(dst) && Aligned(spatial, F)) - SynetLrnLayerCrossChannelsNchw(src, half, channels, spatial, k, dst); - else - SynetLrnLayerCrossChannelsNchw(src, half, channels, spatial, k, dst); - } - - template void SynetLrnLayerCrossChannelsNhwc2h(const float * src, size_t half, size_t channels, size_t spatial, const float * k, float * dst) - { - __m512 k0 = _mm512_set1_ps(k[0]); - __m512 k1 = _mm512_set1_ps(k[1]); - __m512 k2 = _mm512_set1_ps(k[2]); - Avx512f::Pow pow; - size_t aligned = AlignLo(channels - half, F); - for (size_t s = 0; s < spatial; ++s) - { - Avx512f::Store(dst + 0, _mm512_mul_ps(Avx512f::Load(src + 0), pow(_mm512_add_ps(k0, _mm512_mul_ps(k1, NoseSquareSum(src + 0))), k2))); - for (size_t c = F; c < aligned; c += F) - Avx512f::Store(dst + c, _mm512_mul_ps(Avx512f::Load(src + c), pow(_mm512_add_ps(k0, _mm512_mul_ps(k1, BodySquareSum(src + c))), k2))); - if (aligned != channels - half) - { - size_t c = channels - half - F; - Avx512f::Store(dst + c, _mm512_mul_ps(Avx512f::Load(src + c), pow(_mm512_add_ps(k0, _mm512_mul_ps(k1, BodySquareSum(src + c))), k2))); - } - size_t c = channels - F; - Avx512f::Store(dst + c, _mm512_mul_ps(Avx512f::Load(src + c), pow(_mm512_add_ps(k0, _mm512_mul_ps(k1, TailSquareSum(src + c))), k2))); - src += channels; - dst += channels; - } - } - - SIMD_INLINE void SynetLrnLayerCrossChannelsNhwc(const float * src, size_t half, size_t channels, size_t spatial, const float * k, float * dst) - { - if (half == 2 && channels >= F + half) - { - if (Aligned(src) && Aligned(dst) && Aligned(channels, F)) - SynetLrnLayerCrossChannelsNhwc2h(src, half, channels, spatial, k, dst); - else - SynetLrnLayerCrossChannelsNhwc2h(src, half, channels, spatial, k, dst); - } - else - Avx512f::SynetLrnLayerCrossChannels(src, half, channels, spatial, k, dst, SimdTensorFormatNhwc); - } - - void SynetLrnLayerCrossChannels(const float * src, size_t half, size_t channels, size_t spatial, const float * k, float * dst, SimdTensorFormatType format) - { - if (format == SimdTensorFormatNchw) - SynetLrnLayerCrossChannelsNchw(src, half, channels, spatial, k, dst); - else if (format == SimdTensorFormatNhwc) - SynetLrnLayerCrossChannelsNhwc(src, half, channels, spatial, k, dst); - else - Base::SynetLrnLayerCrossChannels(src, half, channels, spatial, k, dst, format); - } - - //--------------------------------------------------------------------- - - template SIMD_INLINE void SynetScaleLayerForward(const float * 
template <bool align, bool mask, bool nofma> SIMD_INLINE void SynetScaleLayerForward(const float * src, const float * scale, const float * bias, float * dst, size_t offset, __mmask16 tail = -1) - { - __m512 _src = Load<align, mask>(src + offset, tail); - __m512 _scale = Load<align, mask>(scale + offset, tail); - __m512 _bias = Load<align, mask>(bias + offset, tail); - Store<align, mask>(dst + offset, Fmadd<nofma>(_src, _scale, _bias), tail); - }
- - template <bool align, bool mask> SIMD_INLINE void SynetScaleLayerForward(const float * src, const float * scale, float * dst, size_t offset, __mmask16 tail = -1) - { - __m512 _src = Load<align, mask>(src + offset, tail); - __m512 _scale = Load<align, mask>(scale + offset, tail); - Store<align, mask>(dst + offset, _mm512_mul_ps(_src, _scale), tail); - }
- - template <bool align, bool mask, bool nofma> SIMD_INLINE void SynetScaleLayerForward(const float* src, const __m512& scale, const __m512& bias, float* dst, size_t offset, __mmask16 tail = -1) - { - __m512 _src = Load<align, mask>(src + offset, tail); - Store<align, mask>(dst + offset, Fmadd<nofma>(_src, scale, bias), tail); - }
- - template <bool align, bool mask> SIMD_INLINE void SynetScaleLayerForward(const float * src, const __m512 & scale, float * dst, size_t offset, __mmask16 tail = -1) - { - __m512 _src = Load<align, mask>(src + offset, tail); - Store<align, mask>(dst + offset, _mm512_mul_ps(_src, scale), tail); - }
- - template <bool align, bool nofma> void SynetScaleLayerForwardNchw(const float* src, const float* scale, const float* bias, size_t channels, size_t height, size_t width, float* dst) - { - if (align) - assert(Aligned(src) && Aligned(width, F) && Aligned(dst)); - - size_t widthQF = AlignLo(width, QF); - size_t widthF = AlignLo(width, F); - __mmask16 tail = TailMask16(width - widthF); - if (bias) - { - for (size_t c = 0; c < channels; ++c) - { - __m512 _scale = _mm512_set1_ps(scale[c]); - __m512 _bias = _mm512_set1_ps(bias[c]); - for (size_t h = 0; h < height; ++h) - { - size_t w = 0; - for (; w < widthQF; w += QF) - { - SynetScaleLayerForward<align, false, nofma>(src, _scale, _bias, dst, w + F * 0); - SynetScaleLayerForward<align, false, nofma>(src, _scale, _bias, dst, w + F * 1); - SynetScaleLayerForward<align, false, nofma>(src, _scale, _bias, dst, w + F * 2); - SynetScaleLayerForward<align, false, nofma>(src, _scale, _bias, dst, w + F * 3); - } - for (; w < widthF; w += F) - SynetScaleLayerForward<align, false, nofma>(src, _scale, _bias, dst, w); - if (w < width) - SynetScaleLayerForward<align, true, true>(src, _scale, _bias, dst, w, tail); - src += width; - dst += width; - } - } - } - else - { - for (size_t c = 0; c < channels; ++c) - { - __m512 _scale = _mm512_set1_ps(scale[c]); - for (size_t h = 0; h < height; ++h) - { - size_t w = 0; - for (; w < widthQF; w += QF) - { - SynetScaleLayerForward<align, false>(src, _scale, dst, w + F * 0); - SynetScaleLayerForward<align, false>(src, _scale, dst, w + F * 1); - SynetScaleLayerForward<align, false>(src, _scale, dst, w + F * 2); - SynetScaleLayerForward<align, false>(src, _scale, dst, w + F * 3); - } - for (; w < widthF; w += F) - SynetScaleLayerForward<align, false>(src, _scale, dst, w); - if (w < width) - SynetScaleLayerForward<align, true>(src, _scale, dst, w, tail); - src += width; - dst += width; - } - } - } - }
- - template <bool nofma> SIMD_INLINE void SynetScaleLayerForwardNchw(const float* src, const float* scale, const float* bias, size_t channels, size_t height, size_t width, float* dst) - { - if (Aligned(src) && Aligned(width, F) && Aligned(dst)) - SynetScaleLayerForwardNchw<true, nofma>(src, scale, bias, channels, height, width, dst); - else - SynetScaleLayerForwardNchw<false, nofma>(src, scale, bias, channels, height, width, dst); - }
- - SIMD_INLINE void SynetScaleLayerForwardNchw(const float* src, const float* scale, const float* bias, size_t channels, size_t height, size_t width, float* dst, SimdSynetCompatibilityType compatibility) - { - if((compatibility & SimdSynetCompatibilityNoFma) && bias) - SynetScaleLayerForwardNchw<true>(src, scale, bias, channels, height, width, dst); - else if ((compatibility & SimdSynetCompatibilityNoFmaTail) && bias) - SynetScaleLayerForwardNchw<false>(src, scale, bias, channels, height, width, dst); - else - SynetScaleLayerForwardNchw<false>(src, scale, bias, channels, 1, height*width, dst); - }
- - template <bool align, bool nofma> void SynetScaleLayerForwardNhwc(const float * src, const float * scale, const float * bias, size_t channels, size_t height, size_t width, float * dst) - { - if (align) - assert(Aligned(src) && Aligned(scale) && Aligned(bias) && Aligned(channels, F) && Aligned(dst)); - - size_t channelsQF = AlignLo(channels, QF); - size_t channelsF = AlignLo(channels, F); - __mmask16 tail = TailMask16(channels - channelsF); - if (bias) - { - size_t widthF = AlignLo(width, F); - for (size_t h = 0; h < height; ++h) - { - size_t w = 0; - for (; w < widthF; ++w) - { - size_t c = 0; - for (; c < channelsQF; c += QF) - { - SynetScaleLayerForward<align, false, nofma>(src, scale, bias, dst, c + F * 0); - SynetScaleLayerForward<align, false, nofma>(src, scale, bias, dst, c + F * 1); - SynetScaleLayerForward<align, false, nofma>(src, scale, bias, dst, c + F * 2); - SynetScaleLayerForward<align, false, nofma>(src, scale, bias, dst, c + F * 3); - } - for (; c < channelsF; c += F) - SynetScaleLayerForward<align, false, nofma>(src, scale, bias, dst, c); - if (c < channels) - SynetScaleLayerForward<align, true, true>(src, scale, bias, dst, c, tail); - src += channels; - dst += channels; - } - for (; w < width; ++w) - { - size_t c = 0; - for (; c < channelsQF; c += QF) - { - SynetScaleLayerForward<align, false, true>(src, scale, bias, dst, c + F * 0); - SynetScaleLayerForward<align, false, true>(src, scale, bias, dst, c + F * 1); - SynetScaleLayerForward<align, false, true>(src, scale, bias, dst, c + F * 2); - SynetScaleLayerForward<align, false, true>(src, scale, bias, dst, c + F * 3); - } - for (; c < channelsF; c += F) - SynetScaleLayerForward<align, false, true>(src, scale, bias, dst, c); - if (c < channels) - SynetScaleLayerForward<align, true, true>(src, scale, bias, dst, c, tail); - src += channels; - dst += channels; - } - } - } - else - { - for (size_t h = 0; h < height; ++h) - { - for (size_t w = 0; w < width; ++w) - { - size_t c = 0; - for (; c < channelsQF; c += QF) - { - SynetScaleLayerForward<align, false>(src, scale, dst, c + F * 0); - SynetScaleLayerForward<align, false>(src, scale, dst, c + F * 1); - SynetScaleLayerForward<align, false>(src, scale, dst, c + F * 2); - SynetScaleLayerForward<align, false>(src, scale, dst, c + F * 3); - } - for (; c < channelsF; c += F) - SynetScaleLayerForward<align, false>(src, scale, dst, c); - if (c < channels) - SynetScaleLayerForward<align, true>(src, scale, dst, c, tail); - src += channels; - dst += channels; - } - } - } - }
- - template <bool align, bool nofma> void SynetScaleLayerForwardNhwc3(const float* src, const float* scale, const float* bias, size_t height, size_t width, float* dst) - { - if (align) - assert(Aligned(src) && Aligned(dst) && Aligned(width)); - - size_t width3 = width * 3; - size_t widthF3 = AlignLo(width, F) * 3; - if (bias) - { - float _scale[F * 3], _bias[F * 3]; - for (size_t i = 0; i < F; ++i) - for (size_t c = 0; c < 3; ++c) - _scale[i * 3 + c] = scale[c], _bias[i * 3 + c] = bias[c]; - __m512 _scale0 = Load<false>(_scale + 0 * F); - __m512 _scale1 = Load<false>(_scale + 1 * F); - __m512 _scale2 = Load<false>(_scale + 2 * F); - __m512 _bias0 = Load<false>(_bias + 0 * F); - __m512 _bias1 = Load<false>(_bias + 1 * F); - __m512 _bias2 = Load<false>(_bias + 2 * F); - for (size_t h = 0; h < height; ++h) - { - size_t w = 0; - for (; w < widthF3; w += F * 3) - { - SynetScaleLayerForward<align, false, nofma>(src, _scale0, _bias0, dst, w + F * 0); - SynetScaleLayerForward<align, false, nofma>(src, _scale1, _bias1, dst, w + F * 1); - SynetScaleLayerForward<align, false, nofma>(src, _scale2, _bias2, dst, w + F * 2); - } - for (; w < width3; w += 3) - { - dst[w + 0] = src[w + 0] * scale[0] + bias[0]; - dst[w + 1] = src[w + 1] * scale[1] + bias[1]; - dst[w + 2] = src[w + 2] * scale[2] + bias[2]; - } - src += width3; - dst += width3; - } - } - else - { - float _scale[F * 3]; - for (size_t i = 0; i < F; ++i) - for (size_t c = 0; c < 3; ++c) - _scale[i * 3 + c] = scale[c]; - __m512 _scale0 = Load<false>(_scale + 0 * F); - __m512 _scale1 = Load<false>(_scale + 1 * F); - __m512 _scale2 = Load<false>(_scale + 2 * F); - for (size_t h = 0; h < height; ++h) - { - size_t w = 0; - for (; w < widthF3; w += F * 3) - { - SynetScaleLayerForward<align, false>(src, _scale0, dst, w + F * 0); - SynetScaleLayerForward<align, false>(src, _scale1, dst, w + F * 1); - SynetScaleLayerForward<align, false>(src, _scale2, dst, w + F * 2); - } - for (; w < width3; w += 3) - { - dst[w + 0] = src[w + 0] * scale[0]; - dst[w + 1] = src[w + 1] * scale[1]; - dst[w + 2] = src[w + 2] * scale[2]; - } - src += width3; - dst += width3; - } - } - }
- - template <bool nofma> SIMD_INLINE void SynetScaleLayerForwardNhwc(const float* src, const float* scale, const float* bias, size_t channels, size_t height, size_t width, float* dst) - { - if (channels == 3) - { - if (Aligned(src) && Aligned(dst) && Aligned(width)) - SynetScaleLayerForwardNhwc3<true, nofma>(src, scale, bias, height, width, dst); - else - SynetScaleLayerForwardNhwc3<false, nofma>(src, scale, bias, height, width, dst); - } - else - { - if (Aligned(src) && Aligned(scale) && Aligned(bias) && Aligned(channels, F) && Aligned(dst)) - SynetScaleLayerForwardNhwc<true, nofma>(src, scale, bias, channels, height, width, dst); - else - SynetScaleLayerForwardNhwc<false, nofma>(src, scale, bias, channels, height, width, dst); - } - }
- - SIMD_INLINE void SynetScaleLayerForwardNhwc(const float* src, const float* scale, const float* bias, size_t channels, size_t height, size_t width, float* dst, SimdSynetCompatibilityType compatibility) - { - if ((compatibility & SimdSynetCompatibilityNoFma) && bias) - SynetScaleLayerForwardNhwc<true>(src, scale, bias, channels, 1, height * width, dst); - else if ((compatibility & SimdSynetCompatibilityNoFmaTail) && bias) - SynetScaleLayerForwardNhwc<false>(src, scale, bias, channels, height, width, dst); - else - SynetScaleLayerForwardNhwc<false>(src, scale, bias, channels, 1, height * width, dst); - }
- - template <bool align, bool nofma> void SynetScaleLayerForwardNchw16c(const float * src, const float * scale, const float * bias, size_t channels, size_t spatial, float * dst) - { - if (align) - assert(Aligned(src) && Aligned(dst)); - - size_t spatialF = spatial * F; - size_t spatial4F = AlignLo(spatial, 4)*F; - if (bias) - { - for (size_t c = 0; c < channels; c += F) - { - __m512 _scale = Load<false>(scale + c); - __m512 _bias = Load<false>(bias + c); - size_t s = 0; - for (; s < spatial4F; s += 4 * F) - { - SynetScaleLayerForward<align, false, nofma>(src, _scale, _bias, dst, s + F * 0); - SynetScaleLayerForward<align, false, nofma>(src, _scale, _bias, dst, s + F * 1); - SynetScaleLayerForward<align, false, nofma>(src, _scale, _bias, dst, s + F * 2); - SynetScaleLayerForward<align, false, nofma>(src, _scale, _bias, dst, s + F * 3); - } - for (; s < spatialF; s += F) - SynetScaleLayerForward<align, false, nofma>(src, _scale, _bias, dst, s); - src += spatialF; - dst += spatialF; - } - } - else - { - for (size_t c = 0; c < channels; c += F) - { - __m512 _scale = Load<false>(scale + c); - size_t s = 0; - for (; s < spatial4F; s += 4 * F) - { - SynetScaleLayerForward<align, false>(src, _scale, dst, s + F * 0); - SynetScaleLayerForward<align, false>(src, _scale, dst, s + F * 1); - SynetScaleLayerForward<align, false>(src, _scale, dst, s + F * 2); - SynetScaleLayerForward<align, false>(src, _scale, dst, s + F * 3); - } - for (; s < spatialF; s += F) - SynetScaleLayerForward<align, false>(src, _scale, dst, s); - src += spatialF; - dst += spatialF; - } - } - }
- - SIMD_INLINE void SynetScaleLayerForwardNchw16c(const float * src, const float * scale, const float * bias, size_t channels, size_t spatial, float * dst, SimdSynetCompatibilityType compatibility) - { - if ((compatibility & SimdSynetCompatibilityNoFma) && bias) - { - if (Aligned(src) && Aligned(dst)) - SynetScaleLayerForwardNchw16c<true, true>(src, scale, bias, channels, spatial, dst); - else - SynetScaleLayerForwardNchw16c<false, true>(src, scale, bias, channels, spatial, dst); - } - else - { - if (Aligned(src) && Aligned(dst)) - SynetScaleLayerForwardNchw16c<true, false>(src, scale, bias, channels, spatial, dst); - else - SynetScaleLayerForwardNchw16c<false, false>(src, scale, bias, channels, spatial, dst); - } - }
- - void SynetScaleLayerForward(const float* src, const float* scale, const float* bias, size_t channels, size_t height, size_t width, float* dst, SimdTensorFormatType format, SimdSynetCompatibilityType compatibility) - { - size_t spatial = height * width; - if (Base::NchwCompatible(channels, spatial, format)) - SynetScaleLayerForwardNchw(src, scale, bias, channels, height, width, dst, compatibility); - else if (Base::NhwcCompatible(channels, spatial, format)) - SynetScaleLayerForwardNhwc(src, scale, bias, channels, height, width, dst, compatibility); - else if (format == SimdTensorFormatNchw4c) - Sse::SynetScaleLayerForward(src, scale, bias, channels, height, width, dst, format, compatibility); - else if (format == SimdTensorFormatNchw8c) - Avx2::SynetScaleLayerForward(src, scale, bias, channels, height, width, dst, format, compatibility); - else if (format == SimdTensorFormatNchw16c) - SynetScaleLayerForwardNchw16c(src, scale, bias, channels, spatial, dst, compatibility); - else - Base::SynetScaleLayerForward(src, scale, bias, channels, height, width, dst, format, compatibility); - }
- - //---------------------------------------------------------------------
- - void SynetShuffleLayerForward(const float* src0, const float* src1, size_t channels0, size_t channels1, size_t spatial, float* dst0, float* dst1, SimdTensorFormatType format, int type) - { - if (format == SimdTensorFormatNchw) - Base::SynetShuffleLayerForward(src0, src1, channels0, channels1, spatial, dst0, dst1, format, type); - else if (format == SimdTensorFormatNhwc) - { - size_t channels = (channels0 + channels1) / 2; - size_t channels0DF = AlignLo(channels0, DF); - __mmask16 tail00 = TailMask16(channels0 - channels0DF); - __mmask16 tail0F = TailMask16(channels0 - channels0DF - F); - size_t channels0t = (channels0 - channels0DF) / 2; - __mmask16 tail0 = TailMask16(channels0t); - size_t channels1DF = AlignLo(channels1, DF); - __mmask16 tail10 = TailMask16(channels1 - channels1DF); - __mmask16 tail1F = TailMask16(channels1 - channels1DF - F); - size_t channels1t = (channels1 - channels1DF) / 2; - __mmask16 tail1 = TailMask16(channels1t); - if (type == 0) - { - for (size_t s = 0; s < spatial; ++s) - { - size_t cd = 0, cs0 = 0, cs1 = 0; - for (; cs0 < channels0DF; cs0 += DF, cd += F) - { - __m512 s0 = _mm512_loadu_ps(src0 + cs0 + 0); - __m512 s1 = _mm512_loadu_ps(src0 + cs0 + F); - _mm512_storeu_ps(dst0 + cd, Deinterleave<0>(s0, s1)); - _mm512_storeu_ps(dst1 + cd, Deinterleave<1>(s0, s1)); - } - if (channels0DF < channels0) - { - __m512 s0 = _mm512_maskz_loadu_ps(tail00, src0 + cs0 + 0); - __m512 s1 = _mm512_maskz_loadu_ps(tail0F, src0 + cs0 + F); - _mm512_mask_storeu_ps(dst0 + cd, tail0, Deinterleave<0>(s0, s1)); - _mm512_mask_storeu_ps(dst1 + cd, tail0, Deinterleave<1>(s0, s1)); - cd += channels0t; - } - for (; cs1 < channels1DF; cs1 += DF, cd += F) - { - __m512 s0 = _mm512_loadu_ps(src1 + cs1 + 0); - __m512 s1 = _mm512_loadu_ps(src1 + cs1 + F); - _mm512_storeu_ps(dst0 + cd, 
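/* Context for the shuffle kernel in progress here (a scalar sketch; illustrative only, not Simd API): for NHWC input, type 0 treats the channels of src0 and src1 as one concatenated vector and deals them out alternately, even positions to dst0 and odd positions to dst1 (sketch assumes channels0 and channels1 are even):

   void ShuffleType0Ref(const float* src0, const float* src1, size_t channels0, size_t channels1, size_t spatial, float* dst0, float* dst1)
   {
       size_t channels = (channels0 + channels1) / 2; // channels per output tensor
       for (size_t s = 0; s < spatial; ++s)
       {
           for (size_t i = 0; i < channels0 + channels1; ++i)
           {
               float v = i < channels0 ? src0[i] : src1[i - channels0]; // virtual concatenation
               (i & 1 ? dst1 : dst0)[i / 2] = v; // deal out even/odd channels
           }
           src0 += channels0; src1 += channels1; dst0 += channels; dst1 += channels;
       }
   }

   Deinterleave<0>/<1> perform exactly this even/odd split 16 floats at a time; type 1 (below) inverts it with Interleave<0>/<1>. */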
Deinterleave<0>(s0, s1)); - _mm512_storeu_ps(dst1 + cd, Deinterleave<1>(s0, s1)); - } - if (channels1DF < channels1) - { - __m512 s0 = _mm512_maskz_loadu_ps(tail10, src1 + cs1 + 0); - __m512 s1 = _mm512_maskz_loadu_ps(tail1F, src1 + cs1 + F); - _mm512_mask_storeu_ps(dst0 + cd, tail1, Deinterleave<0>(s0, s1)); - _mm512_mask_storeu_ps(dst1 + cd, tail1, Deinterleave<1>(s0, s1)); - cd += channels1t; - } - src0 += channels0; - src1 += channels1; - dst0 += channels; - dst1 += channels; - } - } - else if (type == 1) - { - for (size_t s = 0; s < spatial; ++s) - { - size_t cs = 0, cd0 = 0, cd1 = 0; - for (; cd0 < channels0DF; cd0 += DF, cs += F) - { - __m512 s0 = _mm512_loadu_ps(src0 + cs); - __m512 s1 = _mm512_loadu_ps(src1 + cs); - _mm512_storeu_ps(dst0 + cd0 + 0, Interleave<0>(s0, s1)); - _mm512_storeu_ps(dst0 + cd0 + F, Interleave<1>(s0, s1)); - } - if (channels0DF < channels0) - { - __m512 s0 = _mm512_maskz_loadu_ps(tail0, src0 + cs); - __m512 s1 = _mm512_maskz_loadu_ps(tail0, src1 + cs); - _mm512_mask_storeu_ps(dst0 + cd0 + 0, tail00, Interleave<0>(s0, s1)); - _mm512_mask_storeu_ps(dst0 + cd0 + F, tail0F, Interleave<1>(s0, s1)); - cs += channels0t; - } - for (; cd1 < channels1DF; cd1 += DF, cs += F) - { - __m512 s0 = _mm512_loadu_ps(src0 + cs); - __m512 s1 = _mm512_loadu_ps(src1 + cs); - _mm512_storeu_ps(dst1 + cd1 + 0, Interleave<0>(s0, s1)); - _mm512_storeu_ps(dst1 + cd1 + F, Interleave<1>(s0, s1)); - } - if (channels1DF < channels1) - { - __m512 s0 = _mm512_maskz_loadu_ps(tail1, src0 + cs); - __m512 s1 = _mm512_maskz_loadu_ps(tail1, src1 + cs); - _mm512_mask_storeu_ps(dst1 + cd1 + 0, tail10, Interleave<0>(s0, s1)); - _mm512_mask_storeu_ps(dst1 + cd1 + F, tail1F, Interleave<1>(s0, s1)); - cs += channels1t; - } - src0 += channels; - src1 += channels; - dst0 += channels0; - dst1 += channels1; - } - } - else - assert(0); - } - else - assert(0); - } - - //--------------------------------------------------------------------- - - void SynetSoftmaxLayerForward(const float * src, size_t outer, size_t count, size_t inner, float * dst) - { - Avx512f::Exp exp; - if (inner == 1 && count == 2) - { - size_t aligned = Simd::AlignLo(outer, F); - size_t o = 0; - for (; o < aligned; o += F) - { - __m512 s0 = _mm512_loadu_ps(src + 0); - __m512 s1 = _mm512_loadu_ps(src + F); - __m512 ss0 = _mm512_shuffle_ps(s0, s1, 0x88); - __m512 ss1 = _mm512_shuffle_ps(s0, s1, 0xDD); - __m512 max = _mm512_max_ps(ss0, ss1); - __m512 exp0 = exp.Exponent(_mm512_sub_ps(ss0, max)); - __m512 exp1 = exp.Exponent(_mm512_sub_ps(ss1, max)); - __m512 sum = _mm512_add_ps(exp0, exp1); - __m512 d0 = _mm512_div_ps(exp0, sum); - __m512 d1 = _mm512_div_ps(exp1, sum); - _mm512_storeu_ps(dst + 0, _mm512_unpacklo_ps(d0, d1)); - _mm512_storeu_ps(dst + F, _mm512_unpackhi_ps(d0, d1)); - src += DF; - dst += DF; - } - for (; o < outer; ++o) - { - float max = Simd::Max(src[0], src[1]); - float exp0 = ::exp(src[0] - max); - float exp1 = ::exp(src[1] - max); - float sum = exp0 + exp1; - dst[0] = exp0 / sum; - dst[1] = exp1 / sum; - src += 2; - dst += 2; - } - } - else - { - size_t aligned = Simd::AlignLo(inner, F); - __mmask16 tail = TailMask16(inner - aligned); - Array32f tmp(inner * 2); - const float * s; - float * max = tmp.data, *sum = tmp.data + inner, *d; - for (size_t o = 0; o < outer; ++o) - { - memcpy(max, src, inner * sizeof(float)); - s = src + inner; - for (size_t c = 1; c < count; ++c) - { - size_t i = 0; - for (; i < aligned; i += F) - _mm512_storeu_ps(max + i, _mm512_max_ps(_mm512_loadu_ps(s + i), _mm512_loadu_ps(max + i))); - if(i < 
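/* The softmax path around this point follows the standard numerically stable scheme: shift by the running maximum, exponentiate, then normalize, vectorized across the 'inner' stride. A scalar sketch (illustrative only; expf is from <math.h>):

   void SoftmaxRef(const float* src, size_t outer, size_t count, size_t inner, float* dst)
   {
       for (size_t o = 0; o < outer; ++o, src += count * inner, dst += count * inner)
           for (size_t i = 0; i < inner; ++i)
           {
               float max = src[i];
               for (size_t c = 1; c < count; ++c)
                   max = src[c * inner + i] > max ? src[c * inner + i] : max;
               float sum = 0;
               for (size_t c = 0; c < count; ++c)
                   sum += (dst[c * inner + i] = expf(src[c * inner + i] - max));
               for (size_t c = 0; c < count; ++c)
                   dst[c * inner + i] /= sum;
           }
   }

   Subtracting the per-position maximum does not change the result (the factor e^-max cancels between numerator and denominator) but keeps expf away from overflow. */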
inner) - _mm512_mask_storeu_ps(max + i, tail, _mm512_max_ps(_mm512_maskz_loadu_ps(tail, s + i), _mm512_maskz_loadu_ps(tail, max + i))); - s += inner; - } - - s = src; - d = dst; - memset(sum, 0, inner * sizeof(float)); - for (size_t c = 0; c < count; ++c) - { - size_t i = 0; - for (; i < aligned; i += F) - { - __m512 _d = exp.Exponent(_mm512_sub_ps(_mm512_loadu_ps(s + i), _mm512_loadu_ps(max + i))); - _mm512_storeu_ps(d + i, _d); - _mm512_storeu_ps(sum + i, _mm512_add_ps(_d, _mm512_loadu_ps(sum + i))); - } - if(i < inner) - { - __m512 _d = exp.Exponent(_mm512_sub_ps(_mm512_maskz_loadu_ps(tail, s + i), _mm512_maskz_loadu_ps(tail, max + i))); - _mm512_mask_storeu_ps(d + i, tail, _d); - _mm512_mask_storeu_ps(sum + i, tail, _mm512_add_ps(_d, _mm512_maskz_loadu_ps(tail, sum + i))); - } - s += inner; - d += inner; - } - - d = dst; - for (size_t c = 0; c < count; ++c) - { - size_t i = 0; - for (; i < aligned; i += F) - _mm512_storeu_ps(d + i, _mm512_div_ps(_mm512_loadu_ps(d + i), _mm512_loadu_ps(sum + i))); - if(i < inner) - _mm512_mask_storeu_ps(d + i, tail, _mm512_div_ps(_mm512_maskz_loadu_ps(tail, d + i), _mm512_maskz_loadu_ps(tail, sum + i))); - d += inner; - } - src += count * inner; - dst += count * inner; - } - } - } - - //--------------------------------------------------------------------- - - template __m512 SynetUnaryOperation32f(__m512 value); - - template<> SIMD_INLINE __m512 SynetUnaryOperation32f(__m512 value) - { - return AndNot(_mm512_set1_ps(-0.0f), value); - } - - template<> SIMD_INLINE __m512 SynetUnaryOperation32f(__m512 value) - { - return Exponent(value); - } - - template<> SIMD_INLINE __m512 SynetUnaryOperation32f(__m512 value) - { - return Logarithm(value); - } - - template<> SIMD_INLINE __m512 SynetUnaryOperation32f(__m512 value) - { - return _mm512_sub_ps(_mm512_setzero_ps(), value); - } - - template<> SIMD_INLINE __m512 SynetUnaryOperation32f(__m512 value) - { - return _mm512_rsqrt14_ps(value); - } - - template<> SIMD_INLINE __m512 SynetUnaryOperation32f(__m512 value) - { - return _mm512_sqrt_ps(value); - } - - template<> SIMD_INLINE __m512 SynetUnaryOperation32f(__m512 value) - { - return Tanh(value); - } - - template<> SIMD_INLINE __m512 SynetUnaryOperation32f(__m512 value) - { - return _mm512_setzero_ps(); - } - - template void SynetUnaryOperation32fLayerForward(const float* src, size_t size, float* dst) - { - size_t sizeF = AlignLo(size, F); - size_t sizeQF = AlignLo(size, QF); - size_t i = 0; - for (; i < sizeQF; i += QF) - { - Avx512f::Store(dst + i + 0 * F, SynetUnaryOperation32f(Avx512f::Load(src + i + 0 * F))); - Avx512f::Store(dst + i + 1 * F, SynetUnaryOperation32f(Avx512f::Load(src + i + 1 * F))); - Avx512f::Store(dst + i + 2 * F, SynetUnaryOperation32f(Avx512f::Load(src + i + 2 * F))); - Avx512f::Store(dst + i + 3 * F, SynetUnaryOperation32f(Avx512f::Load(src + i + 3 * F))); - } - for (; i < sizeF; i += F) - Avx512f::Store(dst + i, SynetUnaryOperation32f(Avx512f::Load(src + i))); - if (i < size) - { - __mmask16 tail = TailMask16(size - sizeF); - Avx512f::Store(dst + i, SynetUnaryOperation32f(Avx512f::Load(src + i, tail)), tail); - } - } - - template void SynetUnaryOperation32fLayerForward(const float* src, size_t size, SimdSynetUnaryOperation32fType type, float* dst) - { - switch (type) - { - case SimdSynetUnaryOperation32fAbs: SynetUnaryOperation32fLayerForward(src, size, dst); break; - case SimdSynetUnaryOperation32fExp: SynetUnaryOperation32fLayerForward(src, size, dst); break; - case SimdSynetUnaryOperation32fLog: 
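/* The switch that continues below maps a runtime SimdSynetUnaryOperation32fType value onto compile-time template specializations, so each operation gets its own fully inlined vector loop with no per-element branching. The same pattern in miniature (an illustrative scalar analogue, not Simd API):

   template<int op> float UnaryRef(float x);
   template<> float UnaryRef<0>(float x) { return x < 0 ? -x : x; } // Abs
   template<> float UnaryRef<1>(float x) { return -x; }             // Neg

   template<int op> void UnaryForwardRef(const float* src, size_t size, float* dst)
   {
       for (size_t i = 0; i < size; ++i)
           dst[i] = UnaryRef<op>(src[i]); // op is a compile-time constant, so this call inlines
   }

   void UnaryForwardRef(const float* src, size_t size, int op, float* dst)
   {
       switch (op) // one runtime branch selects the specialized loop
       {
       case 0: UnaryForwardRef<0>(src, size, dst); break;
       case 1: UnaryForwardRef<1>(src, size, dst); break;
       }
   }
*/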
SynetUnaryOperation32fLayerForward(src, size, dst); break; - case SimdSynetUnaryOperation32fNeg: SynetUnaryOperation32fLayerForward(src, size, dst); break; - case SimdSynetUnaryOperation32fRsqrt: SynetUnaryOperation32fLayerForward(src, size, dst); break; - case SimdSynetUnaryOperation32fSqrt: SynetUnaryOperation32fLayerForward(src, size, dst); break; - case SimdSynetUnaryOperation32fTanh: SynetUnaryOperation32fLayerForward(src, size, dst); break; - case SimdSynetUnaryOperation32fZero: SynetUnaryOperation32fLayerForward(src, size, dst); break; - default: - assert(0); - } - } - - void SynetUnaryOperation32fLayerForward(const float* src, size_t size, SimdSynetUnaryOperation32fType type, float* dst) - { - if (Aligned(src) && Aligned(dst)) - SynetUnaryOperation32fLayerForward(src, size, type, dst); - else - SynetUnaryOperation32fLayerForward(src, size, type, dst); - } - } -#endif// SIMD_AVX512F_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx512fSynetActivation.cpp b/src/3rd/Simd/Simd/SimdAvx512fSynetActivation.cpp deleted file mode 100644 index e3603204..00000000 --- a/src/3rd/Simd/Simd/SimdAvx512fSynetActivation.cpp +++ /dev/null @@ -1,504 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2019 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdArray.h" -#include "Simd/SimdPow.h" -#include "Simd/SimdExp.h" -#include "Simd/SimdBase.h" -#include "Simd/SimdSse1.h" -#include "Simd/SimdAvx1.h" -#include "Simd/SimdSynet.h" - -namespace Simd -{ -#ifdef SIMD_AVX512F_ENABLE - namespace Avx512f - { - template SIMD_INLINE void SynetElu32f(const float * src, const Avx512f::Exp & exp, __m512 alpha, float * dst, size_t offset, __mmask16 tail = -1) - { - Avx512f::Store(dst + offset, exp.Elu(Avx512f::Load(src + offset, tail), alpha), tail); - } - - template void SynetElu32f(const float * src, size_t size, const float * alpha, float * dst) - { - __m512 _alpha = _mm512_set1_ps(alpha[0]); - Avx512f::Exp exp; - size_t sizeF = AlignLo(size, F); - size_t sizeQF = AlignLo(size, QF); - __mmask16 tail = TailMask16(size - sizeF); - size_t i = 0; - for (; i < sizeQF; i += QF) - { - SynetElu32f(src, exp, _alpha, dst, i + 0 * F); - SynetElu32f(src, exp, _alpha, dst, i + 1 * F); - SynetElu32f(src, exp, _alpha, dst, i + 2 * F); - SynetElu32f(src, exp, _alpha, dst, i + 3 * F); - } - for (; i < sizeF; i += F) - SynetElu32f(src, exp, _alpha, dst, i); - if(i < size) - SynetElu32f(src, exp, _alpha, dst, i, tail); - - } - - void SynetElu32f(const float * src, size_t size, const float * alpha, float * dst) - { - if (Aligned(src) && Aligned(dst)) - SynetElu32f(src, size, alpha, dst); - else - SynetElu32f(src, size, alpha, dst); - } - - //------------------------------------------------------------------------- - - template SIMD_INLINE void SynetHswish32f(const float * src, __m512 shift, __m512 scale, float * dst, size_t offset, __mmask16 tail = -1) - { - __m512 _src = Load(src + offset, tail); - __m512 _dst = _mm512_mul_ps(_mm512_mul_ps(_mm512_max_ps(_mm512_add_ps(_mm512_min_ps(_src, shift), shift), _mm512_setzero_ps()), scale), _src); - Store(dst + offset, _dst, tail); - } - - template void SynetHswish32f(const float * src, size_t size, const float * shift, const float * scale, float * dst) - { - __m512 _shift = _mm512_set1_ps(shift[0]); - __m512 _scale = _mm512_set1_ps(scale[0]); - size_t sizeF = AlignLo(size, F); - size_t sizeQF = AlignLo(size, QF); - __mmask16 tail = TailMask16(size - sizeF); - size_t i = 0; - for (; i < sizeQF; i += QF) - { - SynetHswish32f(src, _shift, _scale, dst, i + 0 * F); - SynetHswish32f(src, _shift, _scale, dst, i + 1 * F); - SynetHswish32f(src, _shift, _scale, dst, i + 2 * F); - SynetHswish32f(src, _shift, _scale, dst, i + 3 * F); - } - for (; i < sizeF; i += F) - SynetHswish32f(src, _shift, _scale, dst, i); - if (i < size) - SynetHswish32f(src, _shift, _scale, dst, i, tail); - } - - void SynetHswish32f(const float * src, size_t size, const float * shift, const float * scale, float * dst) - { - if (Aligned(src) && Aligned(dst)) - SynetHswish32f(src, size, shift, scale, dst); - else - SynetHswish32f(src, size, shift, scale, dst); - } - - //--------------------------------------------------------------------- - - template SIMD_INLINE void SynetPreluLayerForward(const float* src, const float* slope, float* dst, size_t offset, __mmask16 tail = -1) - { - __m512 _src = Load(src + offset, tail); - __m512 _slope = Load(slope + offset, tail); - __m512 pos = _mm512_max_ps(_mm512_setzero_ps(), _src); - __m512 neg = _mm512_min_ps(_mm512_setzero_ps(), _src); - Store(dst + offset, _mm512_add_ps(pos, _mm512_mul_ps(_slope, neg)), tail); - } - - template SIMD_INLINE void SynetPreluLayerForward(const float* src, __m512 slope, float* dst, size_t offset, __mmask16 tail = -1) 
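/* PReLU, implemented by the helpers that follow: f(x) = max(x, 0) + slope * min(x, 0), which equals x for x >= 0 and slope * x otherwise, with 'slope' either per-channel or broadcast. A scalar equivalent (illustrative only):

   float PreluRef(float x, float slope) { return x >= 0 ? x : slope * x; }

   The max/min decomposition is used instead of a comparison so the vector path stays branch-free. */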
- { - __m512 _src = Load(src + offset, tail); - __m512 pos = _mm512_max_ps(_mm512_setzero_ps(), _src); - __m512 neg = _mm512_min_ps(_mm512_setzero_ps(), _src); - Store(dst + offset, _mm512_add_ps(pos, _mm512_mul_ps(slope, neg)), tail); - }
- - template <bool align> void SynetPreluLayerForward(const float* src, const float* slope, size_t count, size_t size, float* dst, SimdBool trans) - { - if (align) - assert(((trans || size == 1) && count != 1 ? Aligned(count) && Aligned(slope) : Aligned(size)) && Aligned(src) && Aligned(dst)); - if ((trans || size == 1) && count != 1) - { - size_t aligned = AlignLo(count, QF); - size_t partial = AlignLo(count, F); - __mmask16 tail = __mmask16(-1) >> (F + partial - count); - for (size_t j = 0; j < size; ++j) - { - size_t i = 0; - for (; i < aligned; i += QF) - { - SynetPreluLayerForward(src, slope, dst, i + F * 0); - SynetPreluLayerForward(src, slope, dst, i + F * 1); - SynetPreluLayerForward(src, slope, dst, i + F * 2); - SynetPreluLayerForward(src, slope, dst, i + F * 3); - } - for (; i < partial; i += F) - SynetPreluLayerForward(src, slope, dst, i); - if (i < count) - SynetPreluLayerForward(src, slope, dst, i, tail); - src += count; - dst += count; - } - } - else - { - size_t aligned = AlignLo(size, QF); - size_t partial = AlignLo(size, F); - __mmask16 tail = __mmask16(-1) >> (F + partial - size); - for (size_t i = 0; i < count; ++i) - { - size_t j = 0; - __m512 _slope = _mm512_set1_ps(slope[i]); - for (; j < aligned; j += QF) - { - SynetPreluLayerForward(src, _slope, dst, j + F * 0); - SynetPreluLayerForward(src, _slope, dst, j + F * 1); - SynetPreluLayerForward(src, _slope, dst, j + F * 2); - SynetPreluLayerForward(src, _slope, dst, j + F * 3); - } - for (; j < partial; j += F) - SynetPreluLayerForward(src, _slope, dst, j); - if (j < size) - SynetPreluLayerForward(src, _slope, dst, j, tail); - src += size; - dst += size; - } - } - }
- - template <bool align> void SynetPreluLayerForwardNchw(const float* src, const float* slope, size_t channels, size_t spatial, float* dst) - { - if (align) - assert(Aligned(src) && Aligned(spatial, F) && Aligned(dst)); - - size_t aligned = AlignLo(spatial, QF); - size_t partial = AlignLo(spatial, F); - __mmask16 tail = TailMask16(spatial - partial); - for (size_t c = 0; c < channels; ++c) - { - size_t s = 0; - __m512 _slope = _mm512_set1_ps(slope[c]); - for (; s < aligned; s += QF) - { - SynetPreluLayerForward(src, _slope, dst, s + F * 0); - SynetPreluLayerForward(src, _slope, dst, s + F * 1); - SynetPreluLayerForward(src, _slope, dst, s + F * 2); - SynetPreluLayerForward(src, _slope, dst, s + F * 3); - } - for (; s < partial; s += F) - SynetPreluLayerForward(src, _slope, dst, s); - if (s < spatial) - SynetPreluLayerForward(src, _slope, dst, s, tail); - src += spatial; - dst += spatial; - } - }
- - SIMD_INLINE void SynetPreluLayerForwardNchw(const float* src, const float* slope, size_t channels, size_t spatial, float* dst) - { - if (Aligned(src) && Aligned(spatial, F) && Aligned(dst)) - SynetPreluLayerForwardNchw<true>(src, slope, channels, spatial, dst); - else - SynetPreluLayerForwardNchw<false>(src, slope, channels, spatial, dst); - }
- - template <bool align> void SynetPreluLayerForwardNhwc(const float* src, const float* slope, size_t channels, size_t spatial, float* dst) - { - if (align) - assert(Aligned(src) && Aligned(slope) && Aligned(channels, F) && Aligned(dst)); - - size_t aligned = AlignLo(channels, QF); - size_t partial = AlignLo(channels, F); - __mmask16 tail = TailMask16(channels - partial); - for (size_t s = 0; s < spatial; ++s) - { - size_t c = 0; - for 
(; c < aligned; c += QF) - { - SynetPreluLayerForward(src, slope, dst, c + F * 0); - SynetPreluLayerForward(src, slope, dst, c + F * 1); - SynetPreluLayerForward(src, slope, dst, c + F * 2); - SynetPreluLayerForward(src, slope, dst, c + F * 3); - } - for (; c < partial; c += F) - SynetPreluLayerForward(src, slope, dst, c); - if (c < channels) - SynetPreluLayerForward(src, slope, dst, c, tail); - src += channels; - dst += channels; - } - } - - SIMD_INLINE void SynetPreluLayerForwardNhwc(const float* src, const float* slope, size_t channels, size_t spatial, float* dst) - { - if (Aligned(src) && Aligned(slope) && Aligned(channels, F) && Aligned(dst)) - SynetPreluLayerForwardNhwc(src, slope, channels, spatial, dst); - else - SynetPreluLayerForwardNhwc(src, slope, channels, spatial, dst); - } - - template void SynetPreluLayerForwardNchw16c(const float* src, const float* slope, size_t channels, size_t spatial, float* dst) - { - if (align) - assert(Aligned(src) && Aligned(dst)); - - size_t spatialF = spatial * F; - size_t spatial4F = AlignLo(spatial, 4) * F; - for (size_t c = 0; c < channels; c += F) - { - __m512 _slope = Load(slope + c); - size_t s = 0; - for (; s < spatial4F; s += 4 * F) - { - SynetPreluLayerForward(src, _slope, dst, s + F * 0); - SynetPreluLayerForward(src, _slope, dst, s + F * 1); - SynetPreluLayerForward(src, _slope, dst, s + F * 2); - SynetPreluLayerForward(src, _slope, dst, s + F * 3); - } - for (; s < spatialF; s += F) - SynetPreluLayerForward(src, _slope, dst, s); - src += spatialF; - dst += spatialF; - } - } - - SIMD_INLINE void SynetPreluLayerForwardNchw16c(const float* src, const float* slope, size_t channels, size_t spatial, float* dst) - { - if (Aligned(src) && Aligned(dst)) - SynetPreluLayerForwardNchw16c(src, slope, channels, spatial, dst); - else - SynetPreluLayerForwardNchw16c(src, slope, channels, spatial, dst); - } - - void SynetPreluLayerForward(const float* src, const float* slope, size_t channels, size_t spatial, float* dst, SimdTensorFormatType format) - { - if (Base::NchwCompatible(channels, spatial, format)) - SynetPreluLayerForwardNchw(src, slope, channels, spatial, dst); - else if (Base::NhwcCompatible(channels, spatial, format)) - SynetPreluLayerForwardNhwc(src, slope, channels, spatial, dst); - else if (format == SimdTensorFormatNchw4c) - Sse::SynetPreluLayerForward(src, slope, channels, spatial, dst, format); - else if (format == SimdTensorFormatNchw8c) - Avx::SynetPreluLayerForward(src, slope, channels, spatial, dst, format); - else if (format == SimdTensorFormatNchw16c) - SynetPreluLayerForwardNchw16c(src, slope, channels, spatial, dst); - else - Base::SynetPreluLayerForward(src, slope, channels, spatial, dst, format); - } - - //------------------------------------------------------------------------- - - template SIMD_INLINE void SynetRelu32f(const float* src, __m512 slope, float* dst, size_t offset, __mmask16 tail = -1) - { - __m512 _src = Load(src + offset, tail); - __m512 _dst = SynetRelu32f(_src, slope); - Store(dst + offset, _dst, tail); - } - - template void SynetRelu32f(const float* src, size_t size, const float* slope, float* dst) - { - __m512 _slope = _mm512_set1_ps(slope[0]); - size_t sizeF = AlignLo(size, F); - size_t sizeQF = AlignLo(size, QF); - __mmask16 tail = TailMask16(size - sizeF); - size_t i = 0; - for (; i < sizeQF; i += QF) - { - SynetRelu32f(src, _slope, dst, i + 0 * F); - SynetRelu32f(src, _slope, dst, i + 1 * F); - SynetRelu32f(src, _slope, dst, i + 2 * F); - SynetRelu32f(src, _slope, dst, i + 3 * F); - } - for (; i < 
sizeF; i += F) - SynetRelu32f(src, _slope, dst, i); - if (i < size) - SynetRelu32f(src, _slope, dst, i, tail); - } - - void SynetRelu32f(const float* src, size_t size, const float* slope, float* dst) - { - if (Aligned(src) && Aligned(dst)) - SynetRelu32f(src, size, slope, dst); - else - SynetRelu32f(src, size, slope, dst); - } - - //--------------------------------------------------------------------- - - template void SynetRestrictRange32f(const float * src, size_t size, const float * lower, const float * upper, float * dst) - { - assert(lower[0] <= upper[0]); - if (align) - assert(Aligned(src) && Aligned(dst)); - float min = *lower; - float max = *upper; - __m512 _min = _mm512_set1_ps(min); - __m512 _max = _mm512_set1_ps(max); - size_t sizeF = Simd::AlignLo(size, F); - size_t sizeQF = Simd::AlignLo(size, QF); - size_t i = 0; - for (; i < sizeQF; i += QF) - { - Store(dst + i + 0 * F, _mm512_min_ps(_mm512_max_ps(_min, Load(src + i + 0 * F)), _max)); - Store(dst + i + 1 * F, _mm512_min_ps(_mm512_max_ps(_min, Load(src + i + 1 * F)), _max)); - Store(dst + i + 2 * F, _mm512_min_ps(_mm512_max_ps(_min, Load(src + i + 2 * F)), _max)); - Store(dst + i + 3 * F, _mm512_min_ps(_mm512_max_ps(_min, Load(src + i + 3 * F)), _max)); - } - for (; i < sizeF; i += F) - Store(dst + i, _mm512_min_ps(_mm512_max_ps(_min, Load(src + i)), _max)); - if (i < size) - { - __mmask16 tail = TailMask16(size - i); - Store(dst + i, _mm512_min_ps(_mm512_max_ps(_min, (Load(src + i, tail))), _max), tail); - } - } - - void SynetRestrictRange32f(const float * src, size_t size, const float * lower, const float * upper, float * dst) - { - if (Aligned(src) && Aligned(dst)) - SynetRestrictRange32f(src, size, lower, upper, dst); - else - SynetRestrictRange32f(src, size, lower, upper, dst); - } - - //--------------------------------------------------------------------- - - template SIMD_INLINE void SynetSigmoid32f(const float* src, const Avx512f::Exp& exp, float* dst, size_t offset, __mmask16 tail = -1) - { - __m512 _src = Load(src + offset, tail); - __m512 _dst = exp.Sigmoid(_src); - Store(dst + offset, _dst, tail); - } - - template void SynetSigmoid32f(const float* src, size_t size, const float* slope, float* dst) - { - if (align) - assert(Aligned(src) && Aligned(dst)); - - Exp exp(-slope[0]); - size_t sizeF = AlignLo(size, F); - size_t sizeQF = AlignLo(size, QF); - size_t i = 0; - for (; i < sizeQF; i += QF) - { - SynetSigmoid32f(src, exp, dst, i + 0 * F); - SynetSigmoid32f(src, exp, dst, i + 1 * F); - SynetSigmoid32f(src, exp, dst, i + 2 * F); - SynetSigmoid32f(src, exp, dst, i + 3 * F); - } - for (; i < sizeF; i += F) - SynetSigmoid32f(src, exp, dst, i); - if (i < size) - { - __mmask16 tail = TailMask16(size - i); - SynetSigmoid32f(src, exp, dst, i, tail); - } - } - - void SynetSigmoid32f(const float* src, size_t size, const float* slope, float* dst) - { - if (Aligned(src) && Aligned(dst)) - SynetSigmoid32f(src, size, slope, dst); - else - SynetSigmoid32f(src, size, slope, dst); - } - - //--------------------------------------------------------------------- - - template SIMD_INLINE void SynetSoftplus32f(const float* src, __m512 beta, __m512 threshold, float* dst, size_t offset, __mmask16 tail = -1) - { - __m512 _src = Load(src + offset, tail); - __m512 _dst = Softplus(_src, beta, threshold); - Store(dst + offset, _dst, tail); - } - - template void SynetSoftplus32f(const float* src, size_t size, const float* beta, const float* threshold, float* dst) - { - __m512 _beta = _mm512_set1_ps(beta[0]); - __m512 _threshold = 
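/* Softplus as computed by this function: f(x) = log(1 + exp(beta * x)) / beta, with a threshold shortcut that returns x unchanged once it is large enough that exp would overflow and log(1 + e^t) is ~= t anyway. A scalar sketch (illustrative only, assuming the same value-vs-threshold test as the Base version; logf/expf from <math.h>):

   float SoftplusRef(float x, float beta, float threshold)
   {
       return x > threshold ? x : logf(1.0f + expf(x * beta)) / beta;
   }
*/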
_mm512_set1_ps(threshold[0]); - size_t sizeF = AlignLo(size, F); - size_t sizeQF = AlignLo(size, QF); - size_t i = 0; - for (; i < sizeQF; i += QF) - { - SynetSoftplus32f(src, _beta, _threshold, dst, i + 0 * F); - SynetSoftplus32f(src, _beta, _threshold, dst, i + 1 * F); - SynetSoftplus32f(src, _beta, _threshold, dst, i + 2 * F); - SynetSoftplus32f(src, _beta, _threshold, dst, i + 3 * F); - } - for (; i < sizeF; i += F) - SynetSoftplus32f(src, _beta, _threshold, dst, i); - if (i < size) - { - __mmask16 tail = TailMask16(size - i); - SynetSoftplus32f(src, _beta, _threshold, dst, i, tail); - } - } - - void SynetSoftplus32f(const float* src, size_t size, const float* beta, const float* threshold, float* dst) - { - if (Aligned(src) && Aligned(dst)) - SynetSoftplus32f(src, size, beta, threshold, dst); - else - SynetSoftplus32f(src, size, beta, threshold, dst); - } - - //--------------------------------------------------------------------- - - template SIMD_INLINE void SynetTanh32f(const float* src, const Avx512f::Exp& exp, float* dst, size_t offset, __mmask16 tail = -1) - { - __m512 _src = Load(src + offset, tail); - __m512 _dst = exp.Tanh(_src); - Store(dst + offset, _dst, tail); - } - - template void SynetTanh32f(const float* src, size_t size, const float* slope, float* dst) - { - if (align) - assert(Aligned(src) && Aligned(dst)); - - Exp exp(-2.0f*slope[0]); - size_t sizeF = AlignLo(size, F); - size_t sizeQF = AlignLo(size, QF); - size_t i = 0; - for (; i < sizeQF; i += QF) - { - SynetTanh32f(src, exp, dst, i + 0 * F); - SynetTanh32f(src, exp, dst, i + 1 * F); - SynetTanh32f(src, exp, dst, i + 2 * F); - SynetTanh32f(src, exp, dst, i + 3 * F); - } - for (; i < sizeF; i += F) - SynetTanh32f(src, exp, dst, i); - if (i < size) - { - __mmask16 tail = TailMask16(size - i); - SynetTanh32f(src, exp, dst, i, tail); - } - } - - void SynetTanh32f(const float* src, size_t size, const float* slope, float* dst) - { - if (Aligned(src) && Aligned(dst)) - SynetTanh32f(src, size, slope, dst); - else - SynetTanh32f(src, size, slope, dst); - } - } -#endif// SIMD_AVX512F_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx512fSynetConversion.cpp b/src/3rd/Simd/Simd/SimdAvx512fSynetConversion.cpp deleted file mode 100644 index 34793635..00000000 --- a/src/3rd/Simd/Simd/SimdAvx512fSynetConversion.cpp +++ /dev/null @@ -1,625 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdTranspose.h" -#include "Simd/SimdBase.h" -#include "Simd/SimdAvx1.h" - -namespace Simd -{ -#ifdef SIMD_AVX512F_ENABLE - namespace Avx512f - { - template void SynetReorderImage_Chw_Hwc(size_t channels, size_t spatial, const float * src, float * dst) - { - size_t channels8 = AlignLo(channels, 8); - size_t spatial8 = AlignLo(spatial, 8); - size_t channels16 = AlignLo(channels, 16); - size_t spatial16 = AlignLo(spatial, 16); - size_t s = 0; - for (; s < spatial16; s += 16, src += 16, dst += 16 * channels) - { - size_t c = 0; - const float * ps = src; - float * pd = dst; - for (; c < channels16; c += 16, ps += 16 * spatial, pd += 16) - Transpose16x16(ps, spatial, pd, channels); - for (; c < channels8; c += 8, ps += 8 * spatial, pd += 8) - Transpose16x8(ps, spatial, pd, channels); - for (; c < channels; ++c, ps += spatial, pd += 1) - { - pd[0x0 * channels] = ps[0x0]; - pd[0x1 * channels] = ps[0x1]; - pd[0x2 * channels] = ps[0x2]; - pd[0x3 * channels] = ps[0x3]; - pd[0x4 * channels] = ps[0x4]; - pd[0x5 * channels] = ps[0x5]; - pd[0x6 * channels] = ps[0x6]; - pd[0x7 * channels] = ps[0x7]; - pd[0x8 * channels] = ps[0x8]; - pd[0x9 * channels] = ps[0x9]; - pd[0xA * channels] = ps[0xA]; - pd[0xB * channels] = ps[0xB]; - pd[0xC * channels] = ps[0xC]; - pd[0xD * channels] = ps[0xD]; - pd[0xE * channels] = ps[0xE]; - pd[0xF * channels] = ps[0xF]; - } - } - for (; s < spatial8; s += 8, src += 8, dst += 8 * channels) - { - size_t c = 0; - const float * ps = src; - float * pd = dst; - for (; c < channels16; c += 16, ps += 16 * spatial, pd += 16) - Transpose8x16(ps, spatial, pd, channels); - for (; c < channels8; c += 8, ps += 8 * spatial, pd += 8) - Avx::Transpose8x8(ps, spatial, pd, channels); - for (; c < channels; ++c, ps += spatial, pd += 1) - { - pd[0x0 * channels] = ps[0x0]; - pd[0x1 * channels] = ps[0x1]; - pd[0x2 * channels] = ps[0x2]; - pd[0x3 * channels] = ps[0x3]; - pd[0x4 * channels] = ps[0x4]; - pd[0x5 * channels] = ps[0x5]; - pd[0x6 * channels] = ps[0x6]; - pd[0x7 * channels] = ps[0x7]; - } - } - for (; s < spatial; ++s, src += 1, dst += channels) - for (size_t c = 0; c < channels; ++c) - dst[c] = src[c*spatial]; - } - - template void SynetReorderImage_Chw_Chw16c(size_t channels, size_t spatial, const float * src, float * dst) - { - size_t spatial8 = AlignLo(spatial, 8); - size_t channels16 = AlignLo(channels, 16); - size_t spatial16 = AlignLo(spatial, 16); - size_t tail = channels - channels16; - size_t c = 0; - for (; c < channels16; c += 16, src += 16 * spatial) - { - size_t s = 0; - const float * ps = src; - for (; s < spatial16; s += 16, dst += 16 * F, ps += 16) - Transpose16x16(ps, spatial, dst, 16); - for (; s < spatial8; s += 8, dst += 8 * F, ps += 8) - Transpose8x16(ps, spatial, dst, 16); - for (; s < spatial; ++s, dst += F, ps += 1) - { - dst[0x0] = ps[0x0 * spatial]; - dst[0x1] = ps[0x1 * spatial]; - dst[0x2] = ps[0x2 * spatial]; - dst[0x3] = ps[0x3 * spatial]; - dst[0x4] = ps[0x4 * spatial]; - dst[0x5] = ps[0x5 * spatial]; - dst[0x6] = ps[0x6 * spatial]; - dst[0x7] = ps[0x7 * spatial]; - dst[0x8] = ps[0x8 * spatial]; - dst[0x9] = ps[0x9 * spatial]; - dst[0xA] = ps[0xA * spatial]; - dst[0xB] = ps[0xB * spatial]; - dst[0xC] = ps[0xC * spatial]; - dst[0xD] = ps[0xD * spatial]; - dst[0xE] = ps[0xE * spatial]; - dst[0xF] = ps[0xF * spatial]; - } - } - if (tail) - { - const float * ps = src; - for (size_t s = 0; s < spatial; ++s, dst += F, ps += 1) - { - size_t i = 0; - for (; i < tail; ++i) - dst[i] = ps[i*spatial]; - for (; i < F; 
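/* Layout context for this repack (illustrative index math, not Simd API): NCHW16c groups channels into blocks of 16 and makes the block the innermost dimension, so dst[(c / 16) * spatial * 16 + s * 16 + (c % 16)] = src[c * spatial + s]; the loop in progress here zero-fills the unused lanes of the final partial channel block. Scalar sketch:

   for (size_t c = 0; c < ((channels + 15) / 16) * 16; ++c)
       for (size_t s = 0; s < spatial; ++s)
           dst[(c / 16) * spatial * 16 + s * 16 + (c % 16)] = c < channels ? src[c * spatial + s] : 0.0f;
*/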
++i) - dst[i] = 0; - } - } - } - - template void SynetReorderImage_Hwc_Chw(size_t channels, size_t spatial, const float * src, float * dst) - { - SynetReorderImage_Chw_Hwc(spatial, channels, src, dst); - } - - template void SynetReorderImage_Hwc_Chw16c(size_t channels, size_t spatial, const float * src, float * dst) - { - size_t channelsF = AlignLo(channels, F); - size_t channelsF4 = AlignLo(channels, 4 * F); - size_t tail = channels - channelsF; - size_t spatial4 = AlignLo(spatial, 4); - size_t stride = spatial * F; - size_t c = 0; - for (; c < channelsF4; c += 4 * F, src += 4 * F) - { - const float * ps = src; - float * pd = dst; - size_t i = 0; - for (; i < spatial4; i += 4, pd += 4 * F, ps += 4 * channels) - Transpose4x4xF(ps, channels, pd, stride); - for (; i < spatial; ++i, pd += F, ps += channels) - { - Copy(ps + 0 * F, pd + 0 * stride); - Copy(ps + 1 * F, pd + 1 * stride); - Copy(ps + 2 * F, pd + 2 * stride); - Copy(ps + 3 * F, pd + 3 * stride); - } - dst += 4 * stride; - } - for (; c < channelsF; c += F, src += F) - { - const float * ps = src; - for (size_t s = 0; s < spatial; ++s, ps += channels, dst += F) - Copy(ps, dst); - } - if (tail) - { - __mmask16 mask = TailMask16(tail); - const float * ps = src; - for (size_t s = 0; s < spatial; ++s, ps += channels, dst += F) - CopyZP(ps, dst, mask); - } - } - - template void SynetReorderImage_Chw16c_Chw(size_t channels, size_t spatial, const float * src, float * dst) - { - size_t spatial8 = AlignLo(spatial, 8); - size_t channels16 = AlignLo(channels, 16); - size_t spatial16 = AlignLo(spatial, 16); - size_t tail = channels - channels16; - size_t c = 0; - for (; c < channels16; c += 16, dst += 16 * spatial, src += 16 * spatial) - { - const float * ps = src; - size_t s = 0; - for (; s < spatial16; s += 16, ps += 16 * F) - Transpose16x16(ps, 16, dst + s, spatial); - for (; s < spatial8; s += 8, ps += 8 * F) - Transpose16x8(ps, 16, dst + s, spatial); - for (; s < spatial; ++s, ps += 16) - { - dst[s + 0x0 * spatial] = ps[0x0]; - dst[s + 0x1 * spatial] = ps[0x1]; - dst[s + 0x2 * spatial] = ps[0x2]; - dst[s + 0x3 * spatial] = ps[0x3]; - dst[s + 0x4 * spatial] = ps[0x4]; - dst[s + 0x5 * spatial] = ps[0x5]; - dst[s + 0x6 * spatial] = ps[0x6]; - dst[s + 0x7 * spatial] = ps[0x7]; - dst[s + 0x8 * spatial] = ps[0x8]; - dst[s + 0x9 * spatial] = ps[0x9]; - dst[s + 0xA * spatial] = ps[0xA]; - dst[s + 0xB * spatial] = ps[0xB]; - dst[s + 0xC * spatial] = ps[0xC]; - dst[s + 0xD * spatial] = ps[0xD]; - dst[s + 0xE * spatial] = ps[0xE]; - dst[s + 0xF * spatial] = ps[0xF]; - } - } - if (tail) - { - const float * ps = src; - for (size_t i = 0; i < tail; ++i, ps += 1, dst += spatial) - { - for (size_t s = 0; s < spatial; ++s) - dst[s] = ps[s*F]; - } - } - } - - template void SynetReorderImage_Chw16c_Hwc(size_t channels, size_t spatial, const float * src, float * dst) - { - size_t stride = F * spatial; - size_t channelsF = AlignLo(channels, F); - size_t channelsF4 = AlignLo(channels, 4 * F); - size_t tail = channels - channelsF; - __mmask16 mask = TailMask16(tail); - size_t spatial4 = AlignLo(spatial, 4); - size_t s = 0; - for (; s < spatial4; s += 4, src += 4 * F, dst += 4 * channels) - { - const float * ps = src; - float * pd = dst; - size_t c = 0; - for (; c < channelsF4; c += 4 * F, ps += 4 * stride, pd += 4 * F) - Transpose4x4xF(ps, stride, pd, channels); - for (; c < channelsF; c += F, ps += stride, pd += F) - { - Copy(ps + 0 * F, pd + 0 * channels); - Copy(ps + 1 * F, pd + 1 * channels); - Copy(ps + 2 * F, pd + 2 * channels); - Copy(ps + 3 * F, pd + 3 * 
channels); - } - if (tail) - { - Copy(ps + 0 * F, pd + 0 * channels, mask); - Copy(ps + 1 * F, pd + 1 * channels, mask); - Copy(ps + 2 * F, pd + 2 * channels, mask); - Copy(ps + 3 * F, pd + 3 * channels, mask); - } - } - for (; s < spatial; ++s, src += F) - { - const float * ps = src; - for (size_t c = 0; c < channelsF; c += F, ps += stride, dst += F) - Copy(ps, dst); - if (tail) - Copy(ps, dst, mask), dst += tail; - } - } - - typedef void(*SynetImageConverterPtr)(size_t channels, size_t spatial, const float * src, float * dst); - SynetImageConverterPtr GetImageConverter(SimdTensorFormatType src, SimdTensorFormatType dst) - { - if (src == SimdTensorFormatNchw) - { - if (dst == SimdTensorFormatNhwc) - return SynetReorderImage_Chw_Hwc; - if (dst == SimdTensorFormatNchw16c) - return SynetReorderImage_Chw_Chw16c; - } - if (src == SimdTensorFormatNhwc) - { - if (dst == SimdTensorFormatNchw) - return SynetReorderImage_Hwc_Chw; - if (dst == SimdTensorFormatNchw16c) - return SynetReorderImage_Hwc_Chw16c; - } - if (src == SimdTensorFormatNchw16c) - { - if (dst == SimdTensorFormatNchw) - return SynetReorderImage_Chw16c_Chw; - if (dst == SimdTensorFormatNhwc) - return SynetReorderImage_Chw16c_Hwc; - } - return NULL; - } - - void SynetReorderImage(size_t batch, size_t channels, size_t spatial, const float * src, SimdTensorFormatType srcFormat, float * dst, SimdTensorFormatType dstFormat) - { - SynetImageConverterPtr imageConverter = GetImageConverter(srcFormat, dstFormat); - if (imageConverter) - { - size_t srcStride = AlignHi(channels, Base::SynetTensorAlignment(srcFormat))*spatial; - size_t dstStride = AlignHi(channels, Base::SynetTensorAlignment(dstFormat))*spatial; - for (size_t n = 0; n < batch; ++n) - { - imageConverter(channels, spatial, src, dst); - src += srcStride; - dst += dstStride; - } - } - else - return Avx::SynetReorderImage(batch, channels, spatial, src, srcFormat, dst, dstFormat); - } - - template void SynetReorderFilter_Oiyx_Yxio(size_t output, size_t input, size_t kernel, const float * src, float * dst) - { - if (kernel == 1) - { - SynetReorderImage_Chw_Hwc(output, input, src, dst); - return; - } - size_t output8 = AlignLo(output, 8); - size_t kernel8 = AlignLo(kernel, 8); - size_t output16 = AlignLo(output, 16); - size_t kernel16 = AlignLo(kernel, 16); - size_t ik = input * kernel, oi = output * input; - for (size_t i = 0; i < input; ++i, src += kernel, dst += output) - { - const float * ps = src; - float * pd = dst; - size_t k = 0; - for (; k < kernel16; k += 16, ps += 16, pd += 16 * oi) - { - size_t o = 0; - for (; o < output16; o += 16) - Transpose16x16(ps + o * ik, ik, pd + o, oi); - for (; o < output8; o += 8) - Transpose16x8(ps + o * ik, ik, pd + o, oi); - for (; o < output; ++o) - { - pd[0x0 * oi + o] = ps[o * ik + 0x0]; - pd[0x1 * oi + o] = ps[o * ik + 0x1]; - pd[0x2 * oi + o] = ps[o * ik + 0x2]; - pd[0x3 * oi + o] = ps[o * ik + 0x3]; - pd[0x4 * oi + o] = ps[o * ik + 0x4]; - pd[0x5 * oi + o] = ps[o * ik + 0x5]; - pd[0x6 * oi + o] = ps[o * ik + 0x6]; - pd[0x7 * oi + o] = ps[o * ik + 0x7]; - pd[0x8 * oi + o] = ps[o * ik + 0x8]; - pd[0x9 * oi + o] = ps[o * ik + 0x9]; - pd[0xA * oi + o] = ps[o * ik + 0xA]; - pd[0xB * oi + o] = ps[o * ik + 0xB]; - pd[0xC * oi + o] = ps[o * ik + 0xC]; - pd[0xD * oi + o] = ps[o * ik + 0xD]; - pd[0xE * oi + o] = ps[o * ik + 0xE]; - pd[0xF * oi + o] = ps[o * ik + 0xF]; - } - } - for (; k < kernel8; k += 8, ps += 8, pd += 8 * oi) - { - size_t o = 0; - for (; o < output16; o += 16) - Transpose8x16(ps + o * ik, ik, pd + o, oi); - for (; o < output8; o 
+= 8) - Avx::Transpose8x8(ps + o * ik, ik, pd + o, oi); - for (; o < output; ++o) - { - pd[0x0 * oi + o] = ps[o * ik + 0x0]; - pd[0x1 * oi + o] = ps[o * ik + 0x1]; - pd[0x2 * oi + o] = ps[o * ik + 0x2]; - pd[0x3 * oi + o] = ps[o * ik + 0x3]; - pd[0x4 * oi + o] = ps[o * ik + 0x4]; - pd[0x5 * oi + o] = ps[o * ik + 0x5]; - pd[0x6 * oi + o] = ps[o * ik + 0x6]; - pd[0x7 * oi + o] = ps[o * ik + 0x7]; - } - } - for (; k < kernel; ++k, ps += 1, pd += oi) - for (size_t o = 0; o < output; ++o) - pd[o] = ps[o*ik]; - } - } - - template void SynetReorderFilter_Oiyx_Oyxi16o(size_t output, size_t input, size_t kernel, const float * src, float * dst) - { - if (kernel == 1) - { - SynetReorderImage_Chw_Chw16c(output, input, src, dst); - return; - } - size_t output16 = AlignLo(output, 16); - size_t kernel8 = AlignLo(kernel, 8); - size_t tail = output - output16; - size_t ik = input * kernel; - size_t stride = input * 16; - for (size_t o = 0; o < output16; o += F) - { - for (size_t i = 0; i < input; ++i) - { - const float * ps = src + o * ik + i * kernel; - float * pd = dst + o * ik + i * 16; - size_t k = 0; - for (; k < kernel8; k += 8, ps += 8, pd += 8 * stride) - Transpose8x16(ps, ik, pd, stride); - for (; k < kernel; ++k, ps += 1, pd += stride) - for (size_t j = 0; j < 16; ++j) - pd[j] = ps[j*ik]; - } - } - if (tail) - { - - __mmask16 mask = TailMask16(tail); - for (size_t i = 0; i < input; ++i) - { - const float * ps = src + output16 * ik + i * kernel; - float * pd = dst + output16 * ik + i * 16; - for (size_t k = 0; k < kernel; ++k, ps += 1, pd += stride) - { - size_t j = 0; - for (; j < tail; ++j) - pd[j] = ps[j*ik]; - for (; j < 16; ++j) - pd[j] = 0; - } - } - } - } - - template void SynetReorderFilter_Yxio_Oiyx(size_t output, size_t input, size_t kernel, const float * src, float * dst) - { - if (kernel == 1) - { - SynetReorderImage_Chw_Hwc(input, output, src, dst); - return; - } - SynetReorderFilter_Oiyx_Yxio(kernel, input, output, src, dst); - } - - template void SynetReorderFilter_Yxio_Oyxi16o(size_t output, size_t input, size_t kernel, const float * src, float * dst) - { - size_t outputF = AlignLo(output, F); - size_t outputF4 = AlignLo(output, F * 4); - size_t ki = kernel * input; - size_t stride = ki * F; - size_t ki4 = AlignLo(ki, 4); - size_t o = 0; - for (; o < outputF4; o += 4 * F, src += 4 * F) - { - const float * ps = src; - float * pd = dst; - size_t i = 0; - for (; i < ki4; i += 4, pd += 4 * F, ps += 4 * output) - Transpose4x4xF(ps, output, pd, stride); - for (; i < ki; ++i, pd += F, ps += output) - { - Copy(ps + 0 * F, pd + 0 * stride); - Copy(ps + 1 * F, pd + 1 * stride); - Copy(ps + 2 * F, pd + 2 * stride); - Copy(ps + 3 * F, pd + 3 * stride); - } - dst += 4 * stride; - } - for (; o < outputF; o += F, src += F) - { - const float * ps = src; - float * pd = dst; - size_t i = 0; - for (; i < ki; ++i, pd += F, ps += output) - Copy(ps, pd); - dst += stride; - } - if (outputF < output) - { - size_t tail = output - outputF; - __mmask16 mask = TailMask16(tail); - for (size_t k = 0; k < kernel; ++k) - for (size_t i = 0; i < input; ++i, src += output, dst += F) - CopyZP(src, dst, mask); - } - } - - template void SynetReorderFilter_Oyxi16o_Oiyx(size_t output, size_t input, size_t kernel, const float * src, float * dst) - { - if (kernel == 1) - { - SynetReorderImage_Chw16c_Chw(output, input, src, dst); - return; - } - size_t output16 = AlignLo(output, 16); - size_t tail = output - output16; - size_t kernel8 = AlignLo(kernel, 8); - size_t ik = input * kernel; - size_t stride = 16 * input; - size_t 
o = 0; - for (; o < output16; o += 16, src += 16 * ik) - { - const float * ps = src; - float * pd = dst; - for (size_t i = 0; i < input; ++i, ps += 16) - { - size_t k = 0; - for (; k < kernel8; k += 8, pd += 8) - Transpose16x8(ps + k * stride, stride, pd, ik); - for (; k < kernel; ++k, pd++) - { - pd[0x0 * ik] = ps[k*stride + 0x0]; - pd[0x1 * ik] = ps[k*stride + 0x1]; - pd[0x2 * ik] = ps[k*stride + 0x2]; - pd[0x3 * ik] = ps[k*stride + 0x3]; - pd[0x4 * ik] = ps[k*stride + 0x4]; - pd[0x5 * ik] = ps[k*stride + 0x5]; - pd[0x6 * ik] = ps[k*stride + 0x6]; - pd[0x7 * ik] = ps[k*stride + 0x7]; - pd[0x8 * ik] = ps[k*stride + 0x8]; - pd[0x9 * ik] = ps[k*stride + 0x9]; - pd[0xA * ik] = ps[k*stride + 0xA]; - pd[0xB * ik] = ps[k*stride + 0xB]; - pd[0xC * ik] = ps[k*stride + 0xC]; - pd[0xD * ik] = ps[k*stride + 0xD]; - pd[0xE * ik] = ps[k*stride + 0xE]; - pd[0xF * ik] = ps[k*stride + 0xF]; - } - } - dst += 16 * ik; - } - if (tail) - { - for (size_t j = 0; j < tail; ++j) - { - const float * ps = src + j; - for (size_t i = 0; i < input; ++i, ps += 16) - for (size_t k = 0; k < kernel; ++k) - *(dst++) = ps[k*stride]; - } - } - } - - template void SynetReorderFilter_Oyxi16o_Yxio(size_t output, size_t input, size_t kernel, const float * src, float * dst) - { - size_t outputF = AlignLo(output, F); - size_t outputF4 = AlignLo(output, 4 * F); - size_t tail = output - outputF; - __mmask16 mask = TailMask16(tail); - size_t ki = kernel * input; - size_t ki4 = AlignLo(ki, 4); - size_t stride = ki * F; - size_t i = 0; - for (; i < ki4; i += 4, src += 4 * F) - { - const float * ps = src; - float * pd = dst; - size_t o = 0; - for (; o < outputF4; o += 4 * F, ps += 4 * stride, pd += 4 * F) - Transpose4x4xF(ps, stride, pd, output); - for (; o < outputF; o += F, ps += stride, pd += F) - { - Copy(ps + 0 * F, pd + 0 * output); - Copy(ps + 1 * F, pd + 1 * output); - Copy(ps + 2 * F, pd + 2 * output); - Copy(ps + 3 * F, pd + 3 * output); - } - if (tail) - { - Copy(ps + 0 * F, pd + 0 * output, mask); - Copy(ps + 1 * F, pd + 1 * output, mask); - Copy(ps + 2 * F, pd + 2 * output, mask); - Copy(ps + 3 * F, pd + 3 * output, mask); - } - dst += 4 * output; - } - for (; i < ki; ++i, src += F) - { - const float * ps = src; - for (size_t o = 0; o < outputF; o += F, ps += stride, dst += F) - Copy(ps, dst); - if (tail) - Copy(ps, dst, mask), dst += tail; - } - } - - typedef void(*SynetFilterConverterPtr)(size_t output, size_t input, size_t kernel, const float * src, float * dst); - SynetFilterConverterPtr GetFilterConverter(SimdTensorFormatType src, SimdTensorFormatType dst) - { - if (src == SimdTensorFormatOiyx) - { - if (dst == SimdTensorFormatYxio) - return SynetReorderFilter_Oiyx_Yxio; - if (dst == SimdTensorFormatOyxi16o) - return SynetReorderFilter_Oiyx_Oyxi16o; - } - if (src == SimdTensorFormatYxio) - { - if (dst == SimdTensorFormatOiyx) - return SynetReorderFilter_Yxio_Oiyx; - if (dst == SimdTensorFormatOyxi16o) - return SynetReorderFilter_Yxio_Oyxi16o; - } - if (src == SimdTensorFormatOyxi16o) - { - if (dst == SimdTensorFormatOiyx) - return SynetReorderFilter_Oyxi16o_Oiyx; - if (dst == SimdTensorFormatYxio) - return SynetReorderFilter_Oyxi16o_Yxio; - } - return NULL; - } - - void SynetReorderFilter(size_t output, size_t input, size_t kernel, const float * src, SimdTensorFormatType srcFormat, float * dst, SimdTensorFormatType dstFormat) - { - SynetFilterConverterPtr filterConverter = GetFilterConverter(srcFormat, dstFormat); - if (filterConverter) - filterConverter(output, input, kernel, src, dst); - else - 
Avx::SynetReorderFilter(output, input, kernel, src, srcFormat, dst, dstFormat); - } - } -#endif// SIMD_AVX512F_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx512fSynetConvolution32f.cpp b/src/3rd/Simd/Simd/SimdAvx512fSynetConvolution32f.cpp deleted file mode 100644 index e940594a..00000000 --- a/src/3rd/Simd/Simd/SimdAvx512fSynetConvolution32f.cpp +++ /dev/null @@ -1,2404 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdSynetConvolution32f.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdSynet.h" -#include "Simd/SimdAvx512f.h" -#include "Simd/SimdGemm.h" -#include "Simd/SimdExp.h" - -#if defined(SIMD_X86_ENABLE) && defined(_MSC_VER) && _MSC_VER < 1924 -#define SIMD_MSVS2017_WIN32_RELEASE_COMPILER_ERROR -#endif - -namespace Simd -{ -#ifdef SIMD_AVX512F_ENABLE - namespace Avx512f - { - void ConvolutionBiasAndActivation(const float * bias, size_t count, size_t size, ::SimdConvolutionActivationType activation, const float * params, ::SimdBool trans, float * dst) - { -#ifdef SIMD_MSVS2017_WIN32_RELEASE_COMPILER_ERROR - Avx::ConvolutionBiasAndActivation(bias, count, size, activation, params, trans, dst); -#else - size_t aligned = AlignLo(trans ? count : size, F); - __mmask16 tail = __mmask16(-1) >> (F + aligned - (trans ? 
count : size)); - if (activation == ::SimdConvolutionActivationIdentity) - { - if (bias) - SynetAddBias(bias, count, size, dst, (SimdTensorFormatType)trans); - } - else if (activation == ::SimdConvolutionActivationRelu) - { - if (bias) - { - __m512 _0 = _mm512_set1_ps(0.0f); - if (trans) - { - for (size_t j = 0; j < size; ++j) - { - size_t i = 0; - for (; i < aligned; i += F) - { - __m512 _dst = _mm512_loadu_ps(dst + i); - __m512 _bias = _mm512_loadu_ps(bias + i); - _mm512_storeu_ps(dst + i, _mm512_max_ps(_0, _mm512_add_ps(_dst, _bias))); - } - if (i < count) - { - __m512 _dst = _mm512_maskz_loadu_ps(tail, dst + i); - __m512 _bias = _mm512_maskz_loadu_ps(tail, bias + i); - _mm512_mask_storeu_ps(dst + i, tail, _mm512_max_ps(_0, _mm512_add_ps(_dst, _bias))); - } - dst += count; - } - } - else - { - for (size_t i = 0; i < count; ++i) - { - __m512 _bias = _mm512_set1_ps(bias[i]); - size_t j = 0; - for (; j < aligned; j += F) - { - __m512 _dst = _mm512_loadu_ps(dst + j); - _mm512_storeu_ps(dst + j, _mm512_max_ps(_0, _mm512_add_ps(_dst, _bias))); - } - if (j < size) - { - __m512 _dst = _mm512_maskz_loadu_ps(tail, dst + j); - _mm512_mask_storeu_ps(dst + j, tail, _mm512_max_ps(_0, _mm512_add_ps(_dst, _bias))); - } - dst += size; - } - } - } - else - { - float slope = 0; - SynetRelu32f(dst, size*count, &slope, dst); - } - } - else if (activation == ::SimdConvolutionActivationLeakyRelu) - { - float slope = params[0]; - if (bias) - { - __m512 _slope = _mm512_set1_ps(slope); - if (trans) - { - for (size_t j = 0; j < size; ++j) - { - size_t i = 0; - for (; i < aligned; i += F) - { - __m512 _dst = _mm512_loadu_ps(dst + i); - __m512 _bias = _mm512_loadu_ps(bias + i); - _mm512_storeu_ps(dst + i, SynetRelu32f(_mm512_add_ps(_dst, _bias), _slope)); - } - if (i < count) - { - __m512 _dst = _mm512_maskz_loadu_ps(tail, dst + i); - __m512 _bias = _mm512_maskz_loadu_ps(tail, bias + i); - _mm512_mask_storeu_ps(dst + i, tail, SynetRelu32f(_mm512_add_ps(_dst, _bias), _slope)); - } - dst += count; - } - } - else - { - for (size_t i = 0; i < count; ++i) - { - __m512 _bias = _mm512_set1_ps(bias[i]); - size_t j = 0; - for (; j < aligned; j += F) - { - __m512 value = _mm512_add_ps(_mm512_loadu_ps(dst + j), _bias); - _mm512_storeu_ps(dst + j, SynetRelu32f(value, _slope)); - } - if (j < size) - { - __m512 value = _mm512_add_ps(_mm512_maskz_loadu_ps(tail, dst + j), _bias); - _mm512_mask_storeu_ps(dst + j, tail, SynetRelu32f(value, _slope)); - } - dst += size; - } - } - } - else - SynetRelu32f(dst, size*count, &slope, dst); - } - else if (activation == ::SimdConvolutionActivationRestrictRange) - { - float lower = params[0]; - float upper = params[1]; - if (bias) - { - __m512 _lower = _mm512_set1_ps(lower); - __m512 _upper = _mm512_set1_ps(upper); - if (trans) - { - for (size_t j = 0; j < size; ++j) - { - size_t i = 0; - for (; i < aligned; i += F) - { - __m512 value = _mm512_add_ps(_mm512_loadu_ps(dst + i), _mm512_loadu_ps(bias + i)); - _mm512_storeu_ps(dst + i, _mm512_min_ps(_mm512_max_ps(_lower, value), _upper)); - } - if (i < count) - { - __m512 value = _mm512_add_ps(_mm512_maskz_loadu_ps(tail, dst + i), _mm512_maskz_loadu_ps(tail, bias + i)); - _mm512_mask_storeu_ps(dst + i, tail, _mm512_min_ps(_mm512_max_ps(_lower, value), _upper)); - } - dst += count; - } - } - else - { - for (size_t i = 0; i < count; ++i) - { - __m512 _bias = _mm512_set1_ps(bias[i]); - size_t j = 0; - for (; j < aligned; j += F) - { - __m512 value = _mm512_add_ps(_mm512_loadu_ps(dst + j), _bias); - _mm512_storeu_ps(dst + j, 
_mm512_min_ps(_mm512_max_ps(_lower, value), _upper)); - } - if (j < size) - { - __m512 value = _mm512_add_ps(_mm512_maskz_loadu_ps(tail, dst + j), _bias); - _mm512_mask_storeu_ps(dst + j, tail, _mm512_min_ps(_mm512_max_ps(_lower, value), _upper)); - } - dst += size; - } - } - } - else - SynetRestrictRange32f(dst, size*count, &lower, &upper, dst); - } - else if (activation == ::SimdConvolutionActivationPrelu) - { - if (bias) - { - if (trans) - { - if (count == 1 || count == 2 || count == 4 || count == 8 || count == 16) - { - __m512 _bias, _slope; - if (count == 1) - { - _bias = _mm512_broadcast_f32x4(_mm_set1_ps(bias[0])); - _slope = _mm512_broadcast_f32x4(_mm_set1_ps(params[0])); - } - else if (count == 2) - { - _bias = _mm512_broadcast_f32x4(_mm_setr_ps(bias[0], bias[1], bias[0], bias[1])); - _slope = _mm512_broadcast_f32x4(_mm_setr_ps(params[0], params[1], params[0], params[1])); - } - else if (count == 4) - { - _bias = _mm512_broadcast_f32x4(_mm_loadu_ps(bias)); - _slope = _mm512_broadcast_f32x4(_mm_loadu_ps(params)); - } - else if (count == 8) - { - _bias = _mm512_setr_ps(bias[0], bias[1], bias[2], bias[3], bias[4], bias[5], bias[6], bias[7], - bias[0], bias[1], bias[2], bias[3], bias[4], bias[5], bias[6], bias[7]); - _slope = _mm512_setr_ps(params[0], params[1], params[2], params[3], params[4], params[5], params[6], params[7], - params[0], params[1], params[2], params[3], params[4], params[5], params[6], params[7]); - } - else if (count == 16) - { - _bias = _mm512_loadu_ps(bias); - _slope = _mm512_loadu_ps(params); - } - else - assert(0); - size_t n = size * count, nF = AlignLo(n, F), i = 0; - for (; i < nF; i += F) - { - __m512 value = _mm512_add_ps(_mm512_loadu_ps(dst + i), _bias); - _mm512_storeu_ps(dst + i, SynetRelu32f(value, _slope)); - } - if (i < n) - { - __mmask16 tail = TailMask16(n - nF); - __m512 value = _mm512_add_ps(_mm512_maskz_loadu_ps(tail, dst + i), _bias); - _mm512_mask_storeu_ps(dst + i, tail, SynetRelu32f(value, _slope)); - } - } - else - { - for (size_t j = 0; j < size; ++j) - { - size_t i = 0; - for (; i < aligned; i += F) - { - __m512 value = _mm512_add_ps(_mm512_loadu_ps(dst + i), _mm512_loadu_ps(bias + i)); - _mm512_storeu_ps(dst + i, SynetRelu32f(value, _mm512_loadu_ps(params + i))); - } - if (i < count) - { - __m512 value = _mm512_add_ps(_mm512_maskz_loadu_ps(tail, dst + i), _mm512_maskz_loadu_ps(tail, bias + i)); - _mm512_mask_storeu_ps(dst + i, tail, SynetRelu32f(value, _mm512_maskz_loadu_ps(tail, params + i))); - } - dst += count; - } - } - } - else - { - for (size_t i = 0; i < count; ++i) - { - __m512 _bias = _mm512_set1_ps(bias[i]); - __m512 _slope = _mm512_set1_ps(params[i]); - size_t j = 0; - for (; j < aligned; j += F) - { - __m512 value = _mm512_add_ps(_mm512_loadu_ps(dst + j), _bias); - _mm512_storeu_ps(dst + j, SynetRelu32f(value, _slope)); - } - if (j < size) - { - __m512 value = _mm512_add_ps(_mm512_maskz_loadu_ps(tail, dst + j), _bias); - _mm512_mask_storeu_ps(dst + j, tail, SynetRelu32f(value, _slope)); - } - dst += size; - } - } - } - else - Avx512f::SynetPreluLayerForward(dst, params, count, size, dst, (SimdTensorFormatType)trans); - } - else if (activation == ::SimdConvolutionActivationElu) - { - float alpha = params[0]; - if (bias) - { - __m512 _alpha = _mm512_set1_ps(alpha); - if (trans) - { - for (size_t j = 0; j < size; ++j) - { - size_t i = 0; - for (; i < aligned; i += F) - { - __m512 _dst = _mm512_loadu_ps(dst + i); - __m512 _bias = _mm512_loadu_ps(bias + i); - _mm512_storeu_ps(dst + i, Avx512f::Elu(_mm512_add_ps(_dst, _bias), 
_alpha)); - } - if (i < count) - { - __m512 _dst = _mm512_maskz_loadu_ps(tail, dst + i); - __m512 _bias = _mm512_maskz_loadu_ps(tail, bias + i); - _mm512_mask_storeu_ps(dst + i, tail, Avx512f::Elu(_mm512_add_ps(_dst, _bias), _alpha)); - } - dst += count; - } - } - else - { - for (size_t i = 0; i < count; ++i) - { - __m512 _bias = _mm512_set1_ps(bias[i]); - size_t j = 0; - for (; j < aligned; j += F) - { - __m512 value = _mm512_add_ps(_mm512_loadu_ps(dst + j), _bias); - _mm512_storeu_ps(dst + j, Avx512f::Elu(value, _alpha)); - } - if (j < size) - { - __m512 value = _mm512_add_ps(_mm512_maskz_loadu_ps(tail, dst + j), _bias); - _mm512_mask_storeu_ps(dst + j, tail, Avx512f::Elu(value, _alpha)); - } - dst += size; - } - } - } - else - SynetElu32f(dst, size*count, &alpha, dst); - } - else if (activation == ::SimdConvolutionActivationHswish) - { - float shift = params[0]; - float scale = params[1]; - if (bias) - { - __m512 _shift = _mm512_set1_ps(shift); - __m512 _scale = _mm512_set1_ps(scale); - if (trans) - { - for (size_t j = 0; j < size; ++j) - { - size_t i = 0; - for (; i < aligned; i += F) - { - __m512 _dst = _mm512_loadu_ps(dst + i); - __m512 _bias = _mm512_loadu_ps(bias + i); - _mm512_storeu_ps(dst + i, Avx512f::SynetHswish32f(_mm512_add_ps(_dst, _bias), _shift, _scale)); - } - if (i < count) - { - __m512 _dst = _mm512_maskz_loadu_ps(tail, dst + i); - __m512 _bias = _mm512_maskz_loadu_ps(tail, bias + i); - _mm512_mask_storeu_ps(dst + i, tail, Avx512f::SynetHswish32f(_mm512_add_ps(_dst, _bias), _shift, _scale)); - } - dst += count; - } - } - else - { - for (size_t i = 0; i < count; ++i) - { - __m512 _bias = _mm512_set1_ps(bias[i]); - size_t j = 0; - for (; j < aligned; j += F) - { - __m512 value = _mm512_add_ps(_mm512_loadu_ps(dst + j), _bias); - _mm512_storeu_ps(dst + j, Avx512f::SynetHswish32f(value, _shift, _scale)); - } - if (j < size) - { - __m512 value = _mm512_add_ps(_mm512_maskz_loadu_ps(tail, dst + j), _bias); - _mm512_mask_storeu_ps(dst + j, tail, Avx512f::SynetHswish32f(value, _shift, _scale)); - } - dst += size; - } - } - } - else - SynetHswish32f(dst, size*count, &shift, &scale, dst); - } - else - assert(0); -#endif - } - - //--------------------------------------------------------------------- - - SynetConvolution32fGemmNN::SynetConvolution32fGemmNN(const ConvParam32f & p) - : Avx2::SynetConvolution32fGemmNN(p) - { - _index.Resize(F); - for (size_t i = 0; i < F; ++i) - _index[i] = int(i * p.strideX); - _nose.Resize(p.kernelX); - _tail.Resize(p.kernelX); - ptrdiff_t aligned = AlignHi(p.dstW, F) - F; - for (size_t kx = 0; kx < p.kernelX; ++kx) - { - _nose[kx] = 0; - _tail[kx] = 0; - ptrdiff_t sx = kx * p.dilationX - p.padX; - for (size_t dx = 0; dx < p.dstW; ++dx) - { - if (sx >= 0 && sx < ptrdiff_t(p.srcW) && dx < F) - _nose[kx] |= 1 << dx; - if (sx < ptrdiff_t(p.srcW) && ptrdiff_t(dx) >= aligned) - _tail[kx] |= 1 << (dx - aligned); - sx += p.strideX; - } - } - if (p.dstC == 8) - return; - _gemm.Init(InitGemmFuncs(Avx512f::Gemm32fNN, "Avx512f", p.gemm, "Ext")); - if (_param.trans && _param.group == 1) - { - if (NHWC_GEMM_RUNTIME) - { - _gemmCb.Init(InitGemmCbFuncs(Avx512f::Gemm32fNNcbBufferSize, Avx512f::Gemm32fNNcbReorderB, Avx512f::Gemm32fNNcbRun, "Avx512f", GemmKernelF2, GemmKernelF3)); - _nhwcWeight.Resize(_gemmCb.At(0).BufferSize(_M*_merge, _N, _K)); - } - else - _nhwcWeight.Resize(Avx512f::Gemm32fNNcbBufferSize(_M*_merge, _N, _K, GemmKernelAny, NHWC_GEMM_COMPATIBLE)); - _nhwcRun = Avx512f::Gemm32fNNcbRun; - _nhwcReorderB = Avx512f::Gemm32fNNcbReorderB; - } - 
_biasAndActivation = _N > Avx::F ? Avx512f::ConvolutionBiasAndActivation : Avx::ConvolutionBiasAndActivation; - } - - void SynetConvolution32fGemmNN::ImgToCol(const float * src, float * dst) - { - const ConvParam32f & p = _param; - size_t srcSize = p.srcW * p.srcH; - if (p.dilationX == 1 && p.dilationY == 1 && p.strideX == 2 && p.strideY == 2 && p.padX == 0 && p.padY == 0 && p.padW == 0 && p.padH == 0 && p.kernelX == 1 && p.kernelY == 1) - { - for (size_t c = 0; c < p.srcC; ++c) - { - for (size_t dy = 0; dy < p.dstH; ++dy) - { - const float * psrc = src + 2 * dy*p.srcW; - for (size_t dx = 0, sx = 0; dx < p.dstW; ++dx, sx += 2) - *(dst++) = psrc[sx]; - } - src += srcSize; - } - } - else if (p.dilationX*p.dilationY*p.strideX*p.strideY != 1) - { - __m512 _0 = _mm512_setzero_ps(); - __m512i index = _mm512_loadu_si512(_index.data); - size_t aligned = AlignHi(p.dstW, F) - F; - __mmask16 storeTail = TailMask16(p.dstW - aligned); - __mmask16 storeNose = aligned ? __mmask16(-1) : storeTail; - for (size_t c = 0; c < p.srcC; ++c) - { - for (size_t ky = 0; ky < p.kernelY; ky++) - { - for (size_t kx = 0; kx < p.kernelX; kx++) - { - __mmask16 nose = _nose[kx]; - __mmask16 tail = _tail[kx]; - size_t sx0 = kx * p.dilationX - p.padX; - size_t sy = ky * p.dilationY - p.padY; - for (size_t dy = 0; dy < p.dstH; ++dy) - { - if (sy < p.srcH) - { - size_t dx = 0, sx = sx0 + sy * p.srcW; - _mm512_mask_storeu_ps(dst + dx, storeNose, _mm512_mask_i32gather_ps(_0, nose, index, (src + sx), 4)); - dx += F, sx += p.strideX*F; - //if (p.strideX == 3) - //{ - // for (; dx < aligned; dx += F, sx += p.strideX*F) - // _mm512_storeu_ps(dst + dx, Avx512f::Gather<3>(src + sx)); - //} - //else - //{ - for (; dx < aligned; dx += F, sx += p.strideX*F) - _mm512_storeu_ps(dst + dx, _mm512_i32gather_ps(index, (src + sx), 4)); - //} - if (aligned) - _mm512_mask_storeu_ps(dst + dx, storeTail, _mm512_mask_i32gather_ps(_0, tail, index, (src + sx), 4)); - } - else - { - memset(dst, 0, p.dstW * sizeof(float)); - } - dst += p.dstW; - sy += p.strideY; - } - } - } - src += srcSize; - } - } - else - { - Base::SynetConvolution32fGemmNN::ImgToCol(src, dst); - } - } - - //--------------------------------------------------------------------- - - SynetConvolution32fGemmNT::SynetConvolution32fGemmNT(const ConvParam32f & p) - : Avx2::SynetConvolution32fGemmNT(p) - { - _gemm.Init(InitGemmFuncs(Avx512f::Gemm32fNT, "Avx512f")); - _biasAndActivation = Avx512f::ConvolutionBiasAndActivation; - } - - //--------------------------------------------------------------------- - - SynetConvolution32fWinograd::SynetConvolution32fWinograd(const ConvParam32f & p) - : Avx2::SynetConvolution32fWinograd(p) - { - if (p.dstC == 8) - return; - if (p.kernelY == 1 && p.kernelX == 3) - { - { - SetBlock(1, 4); - _setFilter = Avx512f::WinogradKernel1x3Block1x4SetFilter; - _setInput = Avx512f::WinogradKernel1x3Block1x4SetInput; - _setOutput = Avx512f::WinogradKernel1x3Block1x4SetOutput; - } - } - else if (p.kernelY == 1 && p.kernelX == 5) - { - { - SetBlock(1, 4); - _setFilter = Avx512f::WinogradKernel1x5Block1x4SetFilter; - _setInput = Avx512f::WinogradKernel1x5Block1x4SetInput; - _setOutput = Avx512f::WinogradKernel1x5Block1x4SetOutput; - } - } - else if (p.kernelY == 2 && p.kernelX == 2) - { - if (_blockY == 4 && _blockX == 4) - { - SetBlock(4, 4); - _setFilter = Avx512f::WinogradKernel2x2Block4x4SetFilter; - _setInput = Avx512f::WinogradKernel2x2Block4x4SetInput; - _setOutput = Avx512f::WinogradKernel2x2Block4x4SetOutput; - } - else if (_blockY == 2 && _blockX == 2) - { - 
SetBlock(2, 2); - _setFilter = Avx512f::WinogradKernel2x2Block2x2SetFilter; - _setInput = Avx512f::WinogradKernel2x2Block2x2SetInput; - _setOutput = Avx512f::WinogradKernel2x2Block2x2SetOutput; - } - else - assert(0); - } - else if (p.kernelY == 3 && p.kernelX == 3) - { - if (_blockY == 4 && _blockX == 4) - { - _setFilter = Avx512f::WinogradKernel3x3Block4x4SetFilter; - _setInput = Avx512f::WinogradKernel3x3Block4x4SetInput; - _setOutput = Avx512f::WinogradKernel3x3Block4x4SetOutput; - } - else if (_blockY == 3 && _blockX == 3) - { - _setFilter = Avx512f::WinogradKernel3x3Block3x3SetFilter; - _setInput = Avx512f::WinogradKernel3x3Block3x3SetInput; - _setOutput = Avx512f::WinogradKernel3x3Block3x3SetOutput; - } - else if (_blockY == 2 && _blockX == 2) - { - _setFilter = Avx512f::WinogradKernel3x3Block2x2SetFilter; - _setInput = Avx512f::WinogradKernel3x3Block2x2SetInput; - _setOutput = Avx512f::WinogradKernel3x3Block2x2SetOutput; - } - else - assert(0); - } - else - assert(0); - _gemm.Init(InitGemmFuncs(Avx512f::Gemm32fNN, "Avx512f", p.gemm, "Ext")); - if (_param.trans) - { - if (NHWC_GEMM_RUNTIME) - { - _gemmCb.Init(InitGemmCbFuncs(Avx512f::Gemm32fNNcbBufferSize, Avx512f::Gemm32fNNcbReorderB, Avx512f::Gemm32fNNcbRun, "Avx512f", GemmKernelF2, GemmKernelF3)); - _nhwcStrideW = _gemmCb.At(0).BufferSize(_M*_merge, _N, _K); - } - else - _nhwcStrideW = Avx512f::Gemm32fNNcbBufferSize(_M*_merge, _N, _K, GemmKernelAny, NHWC_GEMM_COMPATIBLE); - _nhwcWeight.Resize(_nhwcStrideW*_count); - _nhwcRun = Avx512f::Gemm32fNNcbRun; - _nhwcReorderB = Avx512f::Gemm32fNNcbReorderB; - } - _biasAndActivation = Avx512f::ConvolutionBiasAndActivation; - } - - //--------------------------------------------------------------------- - - SynetConvolution32fDirectNchw::SynetConvolution32fDirectNchw(const ConvParam32f & p) - : Avx2::SynetConvolution32fDirectNchw(p) - { - _convolutionBiasActivation = SetConvolutionBiasActivation(); - } - - template<int size> SIMD_INLINE void LoadWeight(const float * src, __m512 * dst) - { - for (size_t i = 0; i < size; ++i) - dst[i] = _mm512_set1_ps(src[i]); - } - - template<int kernel, int stride> struct Kernel - { - static __m512 SynetConvolution32f(const float * src, size_t step, const __m512 * weight); - }; - - template<> struct Kernel<1, 1> - { - static SIMD_INLINE __m512 SynetConvolution32f(const float * src, size_t step, const __m512 * weight) - { - return _mm512_mul_ps(_mm512_loadu_ps(src), weight[0]); - } - }; - - template<> struct Kernel<1, 2> - { - static SIMD_INLINE __m512 SynetConvolution32f(const float * src, size_t step, const __m512 * weight) - { - __m512 s0 = _mm512_loadu_ps(src + 0); - __m512 s1 = _mm512_loadu_ps(src + F); - return _mm512_permutexvar_ps(K32_PERMUTE_FOR_PACK, _mm512_mul_ps(_mm512_shuffle_ps(s0, s1, 0x88), weight[0])); - } - }; - - template<> struct Kernel<2, 1> - { - static SIMD_INLINE __m512 RowConv(const float * src, const __m512 * weight) - { - return _mm512_fmadd_ps(_mm512_loadu_ps(src), weight[0], - _mm512_mul_ps(_mm512_loadu_ps(src + 1), weight[1])); - } - - static SIMD_INLINE __m512 SynetConvolution32f(const float * src, size_t step, const __m512 * weight) - { - return _mm512_add_ps(RowConv(src, weight), RowConv(src + step, weight + 2)); - } - }; - - template<> struct Kernel<2, 2> - { - static SIMD_INLINE __m512 RowConv(const float * src, const __m512 * weight) - { - __m512 s0 = _mm512_loadu_ps(src + 0); - __m512 s1 = _mm512_loadu_ps(src + F); - return _mm512_fmadd_ps(_mm512_shuffle_ps(s0, s1, 0x88), weight[0], - _mm512_mul_ps(_mm512_shuffle_ps(s0, s1, 0xDD), weight[1])); - } - - 
static SIMD_INLINE __m512 SynetConvolution32f(const float * src, size_t step, const __m512 * weight) - { - return _mm512_permutexvar_ps(K32_PERMUTE_FOR_PACK, _mm512_add_ps(RowConv(src, weight), RowConv(src + step, weight + 2))); - } - }; - - template<> struct Kernel<3, 1> - { - static SIMD_INLINE __m512 RowConv(const float * src, const __m512 * weight) - { - return _mm512_fmadd_ps(_mm512_loadu_ps(src), weight[0], - _mm512_fmadd_ps(_mm512_loadu_ps(src + 1), weight[1], - _mm512_mul_ps(_mm512_loadu_ps(src + 2), weight[2]))); - } - - static SIMD_INLINE __m512 SynetConvolution32f(const float * src, size_t step, const __m512 * weight) - { - return _mm512_add_ps(RowConv(src, weight), - _mm512_add_ps(RowConv(src + step, weight + 3), - RowConv(src + 2 * step, weight + 6))); - } - }; - - template<> struct Kernel<3, 2> - { - static SIMD_INLINE __m512 RowConv(const float * src, const __m512 * weight) - { - __m512 s00 = _mm512_loadu_ps(src); - __m512 s10 = _mm512_loadu_ps(src + F); - __m512 s02 = _mm512_loadu_ps(src + 2); - __m512 s12 = _mm512_loadu_ps(src + 2 + F); - return _mm512_fmadd_ps(_mm512_shuffle_ps(s00, s10, 0x88), weight[0], - _mm512_fmadd_ps(_mm512_shuffle_ps(s00, s10, 0xDD), weight[1], - _mm512_mul_ps(_mm512_shuffle_ps(s02, s12, 0x88), weight[2]))); - } - - static SIMD_INLINE __m512 SynetConvolution32f(const float * src, size_t step, const __m512 * weight) - { - return _mm512_permutexvar_ps(K32_PERMUTE_FOR_PACK, _mm512_add_ps(RowConv(src, weight), - _mm512_add_ps(RowConv(src + step, weight + 3), RowConv(src + 2 * step, weight + 6)))); - } - }; - - const __m512i K32_IDX_3_0A = SIMD_MM512_SETR_EPI32(0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 0, 0, 0, 0, 0); - const __m512i K32_IDX_3_0B = SIMD_MM512_SETR_EPI32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 4, 7, 10, 13); - const __m512i K32_IDX_3_1A = SIMD_MM512_SETR_EPI32(1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 34, 37, 40, 43, 46); - const __m512i K32_IDX_3_1B = SIMD_MM512_SETR_EPI32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 5, 8, 11, 14); - const __m512i K32_IDX_3_2A = SIMD_MM512_SETR_EPI32(2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 32, 35, 38, 41, 44, 47); - const __m512i K32_IDX_3_2B = SIMD_MM512_SETR_EPI32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 6, 9, 12, 15); - - template<> struct Kernel<3, 3> - { - - static SIMD_INLINE __m512 RowConv(const float * src, const __m512 * weight) - { - __m512 src0 = _mm512_loadu_ps(src + 0 * F); - __m512 src1 = _mm512_loadu_ps(src + 1 * F); - __m512 src2 = _mm512_loadu_ps(src + 2 * F); - __m512 s0 = _mm512_mask_permutexvar_ps(_mm512_maskz_permutex2var_ps(0xFFFF, src0, K32_IDX_3_0A, src1), 0xF800, K32_IDX_3_0B, src2); - __m512 s1 = _mm512_mask_permutexvar_ps(_mm512_maskz_permutex2var_ps(0xFFFF, src0, K32_IDX_3_1A, src1), 0xF800, K32_IDX_3_1B, src2); - __m512 s2 = _mm512_mask_permutexvar_ps(_mm512_maskz_permutex2var_ps(0xFFFF, src0, K32_IDX_3_2A, src1), 0xFC00, K32_IDX_3_2B, src2); - return _mm512_fmadd_ps(s0, weight[0], _mm512_fmadd_ps(s1, weight[1], _mm512_mul_ps(s2, weight[2]))); - } - - static SIMD_INLINE __m512 SynetConvolution32f(const float * src, size_t step, const __m512 * weight) - { - return _mm512_add_ps(RowConv(src, weight), _mm512_add_ps(RowConv(src + step, weight + 3), RowConv(src + 2 * step, weight + 6))); - } - }; - - template<> struct Kernel<4, 1> - { - static SIMD_INLINE __m512 RowConv(const float * src, const __m512 * weight) - { - return _mm512_fmadd_ps(_mm512_loadu_ps(src), weight[0], _mm512_fmadd_ps(_mm512_loadu_ps(src + 1), weight[1], - _mm512_fmadd_ps(_mm512_loadu_ps(src + 2), weight[2], 
_mm512_mul_ps(_mm512_loadu_ps(src + 3), weight[3])))); - } - - static SIMD_INLINE __m512 SynetConvolution32f(const float * src, size_t step, const __m512 * weight) - { - return _mm512_add_ps(RowConv(src, weight), _mm512_add_ps(RowConv(src + step, weight + 4), - _mm512_add_ps(RowConv(src + 2 * step, weight + 8), RowConv(src + 3 * step, weight + 12)))); - } - }; - - template<> struct Kernel<4, 2> - { - static SIMD_INLINE __m512 RowConv(const float * src, const __m512 * weight) - { - __m512 s00 = _mm512_loadu_ps(src); - __m512 s10 = _mm512_loadu_ps(src + F); - __m512 s02 = _mm512_loadu_ps(src + 2); - __m512 s12 = _mm512_loadu_ps(src + 2 + F); - return _mm512_fmadd_ps(_mm512_shuffle_ps(s00, s10, 0x88), weight[0], _mm512_fmadd_ps(_mm512_shuffle_ps(s00, s10, 0xDD), weight[1], - _mm512_fmadd_ps(_mm512_shuffle_ps(s02, s12, 0x88), weight[2], _mm512_mul_ps(_mm512_shuffle_ps(s02, s12, 0xDD), weight[3])))); - } - - static SIMD_INLINE __m512 SynetConvolution32f(const float * src, size_t step, const __m512 * weight) - { - return _mm512_permutexvar_ps(K32_PERMUTE_FOR_PACK, _mm512_add_ps(RowConv(src, weight), - _mm512_add_ps(RowConv(src + step, weight + 4), _mm512_add_ps(RowConv(src + 2 * step, weight + 8), RowConv(src + 3 * step, weight + 12))))); - } - }; - - template<> struct Kernel<5, 1> - { - static SIMD_INLINE __m512 RowConv(const float * src, const __m512 * weight) - { - return _mm512_fmadd_ps(_mm512_loadu_ps(src), weight[0], _mm512_fmadd_ps(_mm512_loadu_ps(src + 1), weight[1], - _mm512_fmadd_ps(_mm512_loadu_ps(src + 2), weight[2], _mm512_fmadd_ps(_mm512_loadu_ps(src + 3), weight[3], - _mm512_mul_ps(_mm512_loadu_ps(src + 4), weight[4]))))); - } - - static SIMD_INLINE __m512 SynetConvolution32f(const float * src, size_t step, const __m512 * weight) - { - return _mm512_add_ps(RowConv(src, weight), _mm512_add_ps(RowConv(src + step, weight + 5), - _mm512_add_ps(RowConv(src + 2 * step, weight + 10), _mm512_add_ps(RowConv(src + 3 * step, weight + 15), - RowConv(src + 4 * step, weight + 20))))); - } - }; - - template<> struct Kernel<5, 2> - { - static SIMD_INLINE __m512 RowConv(const float * src, const __m512 * weight) - { - __m512 s00 = _mm512_loadu_ps(src); - __m512 s10 = _mm512_loadu_ps(src + F); - __m512 s02 = _mm512_loadu_ps(src + 2); - __m512 s12 = _mm512_loadu_ps(src + 2 + F); - __m512 s04 = _mm512_loadu_ps(src + 4); - __m512 s14 = _mm512_loadu_ps(src + 4 + F); - return _mm512_fmadd_ps(_mm512_shuffle_ps(s00, s10, 0x88), weight[0], _mm512_fmadd_ps(_mm512_shuffle_ps(s00, s10, 0xDD), weight[1], - _mm512_fmadd_ps(_mm512_shuffle_ps(s02, s12, 0x88), weight[2], _mm512_fmadd_ps(_mm512_shuffle_ps(s02, s12, 0xDD), weight[3], - _mm512_mul_ps(_mm512_shuffle_ps(s04, s14, 0x88), weight[4]))))); - } - - static SIMD_INLINE __m512 SynetConvolution32f(const float * src, size_t step, const __m512 * weight) - { - return _mm512_permutexvar_ps(K32_PERMUTE_FOR_PACK, _mm512_add_ps(RowConv(src, weight), _mm512_add_ps(RowConv(src + step, weight + 5), - _mm512_add_ps(RowConv(src + 2 * step, weight + 10), _mm512_add_ps(RowConv(src + 3 * step, weight + 15), RowConv(src + 4 * step, weight + 20)))))); - } - }; - - template<::SimdConvolutionActivationType type> SIMD_INLINE __m512 Activate(__m512 value, const __m512 * params); - - template<> SIMD_INLINE __m512 Activate<::SimdConvolutionActivationIdentity>(__m512 value, const __m512 * params) - { - return value; - } - - template<> SIMD_INLINE __m512 Activate<::SimdConvolutionActivationRelu>(__m512 value, const __m512 * params) - { - return _mm512_max_ps(_mm512_setzero_ps(), value); 
- } - - template<> SIMD_INLINE __m512 Activate<::SimdConvolutionActivationLeakyRelu>(__m512 value, const __m512 * params) - { - return _mm512_add_ps(_mm512_max_ps(_mm512_setzero_ps(), value), _mm512_mul_ps(params[0], _mm512_min_ps(_mm512_setzero_ps(), value))); - } - - template<> SIMD_INLINE __m512 Activate<::SimdConvolutionActivationRestrictRange>(__m512 value, const __m512 * params) - { - return _mm512_min_ps(_mm512_max_ps(params[0], value), params[1]); - } - - template<> SIMD_INLINE __m512 Activate<::SimdConvolutionActivationPrelu>(__m512 value, const __m512 * params) - { - return _mm512_add_ps(_mm512_max_ps(_mm512_setzero_ps(), value), _mm512_mul_ps(params[0], _mm512_min_ps(_mm512_setzero_ps(), value))); - } - - template<> SIMD_INLINE __m512 Activate<::SimdConvolutionActivationElu>(__m512 value, const __m512 * params) - { - return Avx512f::Elu(value, params[0]); - } - - template<> SIMD_INLINE __m512 Activate<::SimdConvolutionActivationHswish>(__m512 value, const __m512 * params) - { - return Avx512f::SynetHswish32f(value, params[0], params[1]); - } - - template<int kernel, int stride, ::SimdConvolutionActivationType type> - void ConvolutionBiasActivation(const float * src, size_t srcC, size_t srcH, size_t srcW, const float * weight, - const float * bias, const float * params, float * dst, size_t dstC, size_t dstH, size_t dstW) - { - __m512 _weight[kernel*kernel]; - __m512 _params[2]; - _params[0] = _mm512_set1_ps(params[0]); - if (type == ::SimdConvolutionActivationRestrictRange || type == ::SimdConvolutionActivationHswish) - _params[1] = _mm512_set1_ps(params[1]); - size_t dstWF = Simd::AlignLo(dstW, F); - __mmask16 tail = TailMask16(dstW - dstWF); - for (size_t dc = 0; dc < dstC; ++dc) - { - if (type == ::SimdConvolutionActivationPrelu) - _params[0] = _mm512_set1_ps(params[dc]); - if (srcC == 1) - { - const float * ps = src; - float * pd = dst; - LoadWeight<kernel*kernel>(weight, _weight); - __m512 _bias = bias ? _mm512_set1_ps(bias[dc]) : _mm512_setzero_ps(); - for (size_t y = 0; y < dstH; ++y) - { - size_t x = 0; - for (; x < dstWF; x += F) - { - __m512 conv = Kernel<kernel, stride>::SynetConvolution32f(ps + x * stride, srcW, _weight); - _mm512_storeu_ps(pd + x, Activate<type>(_mm512_add_ps(_bias, conv), _params)); - } - if (x < dstW) - { - __m512 conv = Kernel<kernel, stride>::SynetConvolution32f(ps + x * stride, srcW, _weight); - _mm512_mask_storeu_ps(pd + x, tail, Activate<type>(_mm512_add_ps(_bias, conv), _params)); - } - ps += srcW * stride; - pd += dstW; - } - weight += kernel * kernel; - } - else - { - size_t sc = 0; - for (; sc < 1; ++sc) - { - const float * ps = src; - float * pd = dst; - LoadWeight<kernel*kernel>(weight, _weight); - __m512 _bias = bias ? _mm512_set1_ps(bias[dc]) : _mm512_setzero_ps(); - for (size_t y = 0; y < dstH; ++y) - { - size_t x = 0; - for (; x < dstWF; x += F) - { - __m512 conv = Kernel<kernel, stride>::SynetConvolution32f(ps + x * stride, srcW, _weight); - _mm512_storeu_ps(pd + x, _mm512_add_ps(_bias, conv)); - } - if (x < dstW) - { - __m512 conv = Kernel<kernel, stride>::SynetConvolution32f(ps + x * stride, srcW, _weight); - _mm512_mask_storeu_ps(pd + x, tail, _mm512_add_ps(_bias, conv)); - } - ps += srcW * stride; - pd += dstW; - } - weight += kernel * kernel; - } - for (; sc < srcC - 1; ++sc) - { - const float * ps = src + sc * srcW * srcH; - float * pd = dst; - LoadWeight<kernel*kernel>(weight, _weight); - for (size_t y = 0; y < dstH; ++y) - { - size_t x = 0; - for (; x < dstWF; x += F) - { - __m512 _dst = _mm512_loadu_ps(pd + x); - __m512 conv = Kernel<kernel, stride>::SynetConvolution32f(ps + x * stride, srcW, _weight); - _mm512_storeu_ps(pd + x, _mm512_add_ps(_dst, conv)); - } - if (x < dstW) - { - __m512 _dst = _mm512_maskz_loadu_ps(tail, pd + x); - __m512 conv = Kernel<kernel, stride>::SynetConvolution32f(ps + x * stride, srcW, _weight); - _mm512_mask_storeu_ps(pd + x, tail, _mm512_add_ps(_dst, conv)); - } - ps += srcW * stride; - pd += dstW; - } - weight += kernel * kernel; - } - for (; sc < srcC; ++sc) - { - const float * ps = src + sc * srcW * srcH; - float * pd = dst; - LoadWeight<kernel*kernel>(weight, _weight); - for (size_t y = 0; y < dstH; ++y) - { - size_t x = 0; - for (; x < dstWF; x += F) - { - __m512 _dst = _mm512_loadu_ps(pd + x); - __m512 conv = Kernel<kernel, stride>::SynetConvolution32f(ps + x * stride, srcW, _weight); - _mm512_storeu_ps(pd + x, Activate<type>(_mm512_add_ps(_dst, conv), _params)); - } - if (x < dstW) - { - __m512 _dst = _mm512_maskz_loadu_ps(tail, pd + x); - __m512 conv = Kernel<kernel, stride>::SynetConvolution32f(ps + x * stride, srcW, _weight); - _mm512_mask_storeu_ps(pd + x, tail, Activate<type>(_mm512_add_ps(_dst, conv), _params)); - } - ps += srcW * stride; - pd += dstW; - } - weight += kernel * kernel; - } - } - dst += dstH * dstW; - } - }
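The template just above is the core of the direct NCHW path: all kernel*kernel weights are broadcast into zmm registers once per output channel, the first input channel initializes dst with bias plus convolution, middle channels accumulate without activation, and the last channel applies the activation on store. A scalar reference makes the traversal explicit (a minimal illustrative sketch, not library code: DirectConvRefNchw is a hypothetical name, and it assumes dilation 1, no padding, a square kernel, and ReLU):

    #include <cstddef>

    // Scalar reference for the fused direct NCHW convolution (illustrative only).
    static void DirectConvRefNchw(const float* src, size_t srcC, size_t srcH, size_t srcW,
        const float* weight /* [dstC][srcC][kernel][kernel] */, const float* bias,
        float* dst, size_t dstC, size_t dstH, size_t dstW, size_t kernel, size_t stride)
    {
        for (size_t dc = 0; dc < dstC; ++dc)
            for (size_t dy = 0; dy < dstH; ++dy)
                for (size_t dx = 0; dx < dstW; ++dx)
                {
                    float sum = bias ? bias[dc] : 0.0f;
                    for (size_t sc = 0; sc < srcC; ++sc)
                        for (size_t ky = 0; ky < kernel; ++ky)
                            for (size_t kx = 0; kx < kernel; ++kx)
                                sum += src[(sc * srcH + dy * stride + ky) * srcW + dx * stride + kx]
                                     * weight[((dc * srcC + sc) * kernel + ky) * kernel + kx];
                    dst[(dc * dstH + dy) * dstW + dx] = sum > 0.0f ? sum : 0.0f; // fused ReLU
                }
    }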
bool SynetConvolution32fDirectNchw::Preferable(const ConvParam32f & p) - { - if (!p.IsDilation(1)) - return false; - if (!(p.IsStride(1) || p.IsStride(2) || p.IsStride(3))) - return false; - double k = double(p.srcC) / p.group * p.strideX * p.strideX * p.strideY / p.kernelX / p.kernelY; - return k < 2.0 && ((p.IsStride(1) && p.IsKernel(1)) || p.IsKernel(2) || p.IsKernel(3) -#if SIMD_ZMM_COUNT == 32 || 1 - || ((p.IsKernel(4) || p.IsKernel(5)) && p.dstW > F) -#endif - ) && p.trans == 0; - } - - template<int kernel, int stride> SynetConvolution32fDirectNchw::ConvolutionBiasActivationPtr SetConvolutionBiasActivation(::SimdConvolutionActivationType type) - { - switch (type) - { - case ::SimdConvolutionActivationIdentity: return ConvolutionBiasActivation<kernel, stride, ::SimdConvolutionActivationIdentity>; - case ::SimdConvolutionActivationRelu: return ConvolutionBiasActivation<kernel, stride, ::SimdConvolutionActivationRelu>; - case ::SimdConvolutionActivationLeakyRelu: return ConvolutionBiasActivation<kernel, stride, ::SimdConvolutionActivationLeakyRelu>; - case ::SimdConvolutionActivationRestrictRange: return ConvolutionBiasActivation<kernel, stride, ::SimdConvolutionActivationRestrictRange>; - case ::SimdConvolutionActivationPrelu: return ConvolutionBiasActivation<kernel, stride, ::SimdConvolutionActivationPrelu>; - case ::SimdConvolutionActivationElu: return ConvolutionBiasActivation<kernel, stride, ::SimdConvolutionActivationElu>; - case ::SimdConvolutionActivationHswish: return ConvolutionBiasActivation<kernel, stride, ::SimdConvolutionActivationHswish>; - default: - assert(0); - return NULL; - } - } - - SynetConvolution32fDirectNchw::ConvolutionBiasActivationPtr SynetConvolution32fDirectNchw::SetConvolutionBiasActivation() - { - const ConvParam32f & p = _param; - if (p.dstW <= HF && p.kernelX <= 3) - return Avx2::SynetConvolution32fDirectNchw::SetConvolutionBiasActivation(); - switch (p.strideX) - { - case 1: - if (p.kernelX == 1) - return
Avx512f::SetConvolutionBiasActivation<1, 1>(p.activation); - if (p.kernelX == 2) - return Avx512f::SetConvolutionBiasActivation<2, 1>(p.activation); - if (p.kernelX == 3) - return Avx512f::SetConvolutionBiasActivation<3, 1>(p.activation); - if (p.kernelX == 4) - return Avx512f::SetConvolutionBiasActivation<4, 1>(p.activation); - if (p.kernelX == 5) - return Avx512f::SetConvolutionBiasActivation<5, 1>(p.activation); - break; - case 2: - if (p.kernelX == 2) - return Avx512f::SetConvolutionBiasActivation<2, 2>(p.activation); - if (p.kernelX == 3) - return Avx512f::SetConvolutionBiasActivation<3, 2>(p.activation); - if (p.kernelX == 4) - return Avx512f::SetConvolutionBiasActivation<4, 2>(p.activation); - if (p.kernelX == 5) - return Avx512f::SetConvolutionBiasActivation<5, 2>(p.activation); - break; - case 3: - if (p.kernelX == 3) - return Avx512f::SetConvolutionBiasActivation<3, 3>(p.activation); - break; - } - return Avx2::SynetConvolution32fDirectNchw::SetConvolutionBiasActivation(); - } - - //--------------------------------------------------------------------- - - SynetConvolution32fDirectNhwc::SynetConvolution32fDirectNhwc(const ConvParam32f & p) - : Avx2::SynetConvolution32fDirectNhwc(p) - { - _convolutionBiasActivation = SetConvolutionBiasActivation(); - } - - template<::SimdConvolutionActivationType type> SIMD_INLINE __m512 Activate(__m512 value, const float * params, size_t offset, __mmask16 tail = -1); - - template<> SIMD_INLINE __m512 Activate<::SimdConvolutionActivationIdentity>(__m512 value, const float * params, size_t offset, __mmask16 tail) - { - return value; - } - - template<> SIMD_INLINE __m512 Activate<::SimdConvolutionActivationRelu>(__m512 value, const float * params, size_t offset, __mmask16 tail) - { - return _mm512_max_ps(_mm512_setzero_ps(), value); - } - - template<> SIMD_INLINE __m512 Activate<::SimdConvolutionActivationLeakyRelu>(__m512 value, const float * params, size_t offset, __mmask16 tail) - { - return _mm512_add_ps(_mm512_max_ps(_mm512_setzero_ps(), value), _mm512_mul_ps(_mm512_set1_ps(params[0]), _mm512_min_ps(_mm512_setzero_ps(), value))); - } - - template<> SIMD_INLINE __m512 Activate<::SimdConvolutionActivationRestrictRange>(__m512 value, const float * params, size_t offset, __mmask16 tail) - { - return _mm512_min_ps(_mm512_max_ps(_mm512_set1_ps(params[0]), value), _mm512_set1_ps(params[1])); - } - - template<> SIMD_INLINE __m512 Activate<::SimdConvolutionActivationPrelu>(__m512 value, const float * params, size_t offset, __mmask16 tail) - { - return _mm512_add_ps(_mm512_max_ps(_mm512_setzero_ps(), value), _mm512_mul_ps(_mm512_maskz_loadu_ps(tail, params + offset), _mm512_min_ps(_mm512_setzero_ps(), value))); - } - - template<> SIMD_INLINE __m512 Activate<::SimdConvolutionActivationElu>(__m512 value, const float * params, size_t offset, __mmask16 tail) - { - return Avx512f::Elu(value, _mm512_set1_ps(params[0])); - } - - template<> SIMD_INLINE __m512 Activate<::SimdConvolutionActivationHswish>(__m512 value, const float * params, size_t offset, __mmask16 tail) - { - return Avx512f::SynetHswish32f(value, _mm512_set1_ps(params[0]), _mm512_set1_ps(params[1])); - } - - SIMD_INLINE void KernelHwcDefaultEdge(const float * src, const ConvParam32f & p, size_t kH, size_t kW, const float * weight, __m512 & sum, __mmask16 tail = -1) - { - size_t size = kW * p.srcC, rest = (p.kernelX - kW)*p.srcC*p.dstC, dstC = p.dstC, stride = p.srcW * p.srcC; - for (size_t ky = 0; ky < kH; ++ky) - { - for (size_t i = 0; i < size; ++i, weight += dstC) - sum = 
_mm512_fmadd_ps(_mm512_set1_ps(src[i]), _mm512_maskz_loadu_ps(tail, weight), sum); - weight += rest; - src += stride; - } - } - - template<::SimdConvolutionActivationType type> - SIMD_INLINE void KernelHwcDefaultEdge(const float * src, const ConvParam32f & p, size_t kH, size_t kW, const float * weight, const float * bias, const float * params, float * dst) - { - size_t dstC = p.dstC; - size_t dstCF = AlignLo(dstC, F); - - size_t dc = 0; - for (; dc < dstCF; dc += F) - { - __m512 conv = bias ? _mm512_loadu_ps(bias + dc) : _mm512_setzero_ps(); - KernelHwcDefaultEdge(src, p, kH, kW, weight + dc, conv); - _mm512_storeu_ps(dst + dc, Activate<type>(conv, params, dc)); - } - if (dc < dstC) - { - __mmask16 tail = TailMask16(dstC - dstCF); - __m512 conv = bias ? _mm512_maskz_loadu_ps(tail, bias + dc) : _mm512_setzero_ps(); - KernelHwcDefaultEdge(src, p, kH, kW, weight + dc, conv, tail); - _mm512_mask_storeu_ps(dst + dc, tail, Activate<type>(conv, params, dc, tail)); - } - }
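KernelHwcDefaultEdge above shows the tail-mask idiom this whole file relies on: the channel loop runs in full 16-float steps, and the final partial vector is handled with an __mmask16 so no load or store touches memory past dstC. A self-contained sketch of the same idiom (TailMask16Sketch stands in for the library's TailMask16; ScaleTail is purely illustrative):

    #include <immintrin.h>
    #include <cstddef>

    // Low `tail` bits set; e.g. tail = 5 gives 0x001F.
    static inline __mmask16 TailMask16Sketch(ptrdiff_t tail)
    {
        return tail <= 0 ? __mmask16(0) : (tail >= 16 ? __mmask16(-1) : __mmask16(__mmask16(-1) >> (16 - tail)));
    }

    static void ScaleTail(const float* src, size_t n, float k, float* dst)
    {
        size_t aligned = n & ~size_t(15); // n rounded down to a multiple of 16
        __m512 _k = _mm512_set1_ps(k);
        size_t i = 0;
        for (; i < aligned; i += 16)
            _mm512_storeu_ps(dst + i, _mm512_mul_ps(_mm512_loadu_ps(src + i), _k));
        if (i < n) // masked load/store covers exactly the last n - aligned floats
        {
            __mmask16 tail = TailMask16Sketch(ptrdiff_t(n - aligned));
            _mm512_mask_storeu_ps(dst + i, tail, _mm512_mul_ps(_mm512_maskz_loadu_ps(tail, src + i), _k));
        }
    }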
SIMD_INLINE void KernelHwcDefaultBody2x2(const float * src, const ConvParam32f & p, const float * weight, __m512 sums[2][2]) - { - size_t size = p.kernelX * p.srcC, dstC = p.dstC, stride = p.srcW * p.srcC, step = p.srcC * p.strideX; - const float * src0 = src + 0 * step; - const float * src1 = src + 1 * step; - __m512 w0, w1, s0; - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t offset = ky * stride; - for (size_t end = offset + size; offset < end; ++offset) - { - w0 = _mm512_loadu_ps(weight + 0 * F); - w1 = _mm512_loadu_ps(weight + 1 * F); - s0 = _mm512_set1_ps(src0[offset]); - sums[0][0] = _mm512_fmadd_ps(s0, w0, sums[0][0]); - sums[0][1] = _mm512_fmadd_ps(s0, w1, sums[0][1]); - s0 = _mm512_set1_ps(src1[offset]); - sums[1][0] = _mm512_fmadd_ps(s0, w0, sums[1][0]); - sums[1][1] = _mm512_fmadd_ps(s0, w1, sums[1][1]); - weight += dstC; - } - } - } - - SIMD_INLINE void KernelHwcDefaultBody2x1(const float * src, const ConvParam32f & p, const float * weight, __m512 sums[2][1], __mmask16 tail = -1) - { - size_t size = p.kernelX * p.srcC, dstC = p.dstC, stride = p.srcW * p.srcC, step = p.srcC * p.strideX; - const float * src0 = src + 0 * step; - const float * src1 = src + 1 * step; - __m512 w0, s0; - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t offset = ky * stride; - for (size_t end = offset + size; offset < end; ++offset) - { - w0 = _mm512_maskz_loadu_ps(tail, weight + 0 * F); - s0 = _mm512_set1_ps(src0[offset]); - sums[0][0] = _mm512_fmadd_ps(s0, w0, sums[0][0]); - s0 = _mm512_set1_ps(src1[offset]); - sums[1][0] = _mm512_fmadd_ps(s0, w0, sums[1][0]); - weight += dstC; - } - } - } - - template<::SimdConvolutionActivationType type> - SIMD_INLINE void KernelHwcDefaultBody2(const float * src, const ConvParam32f & p, const float * weight, const float * bias, const float * params, float * dst) - { - size_t dstC = p.dstC; - size_t dstCF1 = AlignLo(dstC, 1 * F); - size_t dstCF2 = AlignLo(dstC, 2 * F); - size_t dc = 0; - for (; dc < dstCF2; dc += 2 * F) - { - __m512 sums[2][2]; - __m512 bias0 = bias ? _mm512_loadu_ps(bias + dc + 0 * F) : _mm512_setzero_ps(); - __m512 bias1 = bias ? _mm512_loadu_ps(bias + dc + 1 * F) : _mm512_setzero_ps(); - sums[0][0] = bias0; - sums[0][1] = bias1; - sums[1][0] = bias0; - sums[1][1] = bias1; - KernelHwcDefaultBody2x2(src, p, weight + dc, sums); - _mm512_storeu_ps(dst + dc + 0 * dstC + 0 * F, Activate<type>(sums[0][0], params, dc + 0 * F)); - _mm512_storeu_ps(dst + dc + 0 * dstC + 1 * F, Activate<type>(sums[0][1], params, dc + 1 * F)); - _mm512_storeu_ps(dst + dc + 1 * dstC + 0 * F, Activate<type>(sums[1][0], params, dc + 0 * F)); - _mm512_storeu_ps(dst + dc + 1 * dstC + 1 * F, Activate<type>(sums[1][1], params, dc + 1 * F)); - } - for (; dc < dstCF1; dc += 1 * F) - { - __m512 sums[2][1]; - __m512 bias0 = bias ? _mm512_loadu_ps(bias + dc) : _mm512_setzero_ps(); - sums[0][0] = bias0; - sums[1][0] = bias0; - KernelHwcDefaultBody2x1(src, p, weight + dc, sums); - _mm512_storeu_ps(dst + dc + 0 * dstC, Activate<type>(sums[0][0], params, dc)); - _mm512_storeu_ps(dst + dc + 1 * dstC, Activate<type>(sums[1][0], params, dc)); - } - if (dc < dstC) - { - __mmask16 tail = TailMask16(dstC - dstCF1); - __m512 sums[2][1]; - __m512 bias0 = bias ? _mm512_maskz_loadu_ps(tail, bias + dc) : _mm512_setzero_ps(); - sums[0][0] = bias0; - sums[1][0] = bias0; - KernelHwcDefaultBody2x1(src, p, weight + dc, sums, tail); - _mm512_mask_storeu_ps(dst + dc + 0 * dstC, tail, Activate<type>(sums[0][0], params, dc, tail)); - _mm512_mask_storeu_ps(dst + dc + 1 * dstC, tail, Activate<type>(sums[1][0], params, dc, tail)); - } - } - - SIMD_INLINE void KernelHwcDefaultBody6x2(const float * src, const ConvParam32f & p, const float * weight, __m512 sums[6][2]) - { - size_t size = p.kernelX * p.srcC, dstC = p.dstC, stride = p.srcW * p.srcC, step = p.srcC * p.strideX; - const float * src0 = src + 0 * step; - const float * src1 = src + 1 * step; - const float * src2 = src + 2 * step; - const float * src3 = src + 3 * step; - const float * src4 = src + 4 * step; - const float * src5 = src + 5 * step; - __m512 w0, w1, s0; - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t offset = ky * stride; - for (size_t end = offset + size; offset < end; ++offset) - { - w0 = _mm512_loadu_ps(weight + 0 * F); - w1 = _mm512_loadu_ps(weight + 1 * F); - s0 = _mm512_set1_ps(src0[offset]); - sums[0][0] = _mm512_fmadd_ps(s0, w0, sums[0][0]); - sums[0][1] = _mm512_fmadd_ps(s0, w1, sums[0][1]); - s0 = _mm512_set1_ps(src1[offset]); - sums[1][0] = _mm512_fmadd_ps(s0, w0, sums[1][0]); - sums[1][1] = _mm512_fmadd_ps(s0, w1, sums[1][1]); - s0 = _mm512_set1_ps(src2[offset]); - sums[2][0] = _mm512_fmadd_ps(s0, w0, sums[2][0]); - sums[2][1] = _mm512_fmadd_ps(s0, w1, sums[2][1]); - s0 = _mm512_set1_ps(src3[offset]); - sums[3][0] = _mm512_fmadd_ps(s0, w0, sums[3][0]); - sums[3][1] = _mm512_fmadd_ps(s0, w1, sums[3][1]); - s0 = _mm512_set1_ps(src4[offset]); - sums[4][0] = _mm512_fmadd_ps(s0, w0, sums[4][0]); - sums[4][1] = _mm512_fmadd_ps(s0, w1, sums[4][1]); - s0 = _mm512_set1_ps(src5[offset]); - sums[5][0] = _mm512_fmadd_ps(s0, w0, sums[5][0]); - sums[5][1] = _mm512_fmadd_ps(s0, w1, sums[5][1]); - weight += dstC; - } - } - } - - SIMD_INLINE void KernelHwcDefaultBody6x1(const float * src, const ConvParam32f & p, const float * weight, __m512 sums[6][1], __mmask16 tail = -1) - { - size_t size = p.kernelX * p.srcC, dstC = p.dstC, stride = p.srcW * p.srcC, step = p.srcC * p.strideX; - const float * src0 = src + 0 * step; - const float * src1 = src + 1 * step; - const float * src2 = src + 2 * step; - const float * src3 = src + 3 * step; - const float * src4 = src + 4 * step; - const float * src5 = src + 5 * step; - __m512 w0, s0; - for (size_t ky = 0; ky < p.kernelY; 
++ky) - { - size_t offset = ky * stride; - for (size_t end = offset + size; offset < end; ++offset) - { - w0 = _mm512_maskz_loadu_ps(tail, weight + 0 * F); - s0 = _mm512_set1_ps(src0[offset]); - sums[0][0] = _mm512_fmadd_ps(s0, w0, sums[0][0]); - s0 = _mm512_set1_ps(src1[offset]); - sums[1][0] = _mm512_fmadd_ps(s0, w0, sums[1][0]); - s0 = _mm512_set1_ps(src2[offset]); - sums[2][0] = _mm512_fmadd_ps(s0, w0, sums[2][0]); - s0 = _mm512_set1_ps(src3[offset]); - sums[3][0] = _mm512_fmadd_ps(s0, w0, sums[3][0]); - s0 = _mm512_set1_ps(src4[offset]); - sums[4][0] = _mm512_fmadd_ps(s0, w0, sums[4][0]); - s0 = _mm512_set1_ps(src5[offset]); - sums[5][0] = _mm512_fmadd_ps(s0, w0, sums[5][0]); - weight += dstC; - } - } - } - - template<::SimdConvolutionActivationType type> - SIMD_INLINE void KernelHwcDefaultBody6(const float * src, const ConvParam32f & p, const float * weight, const float * bias, const float * params, float * dst) - { - size_t dstC = p.dstC; - size_t dstCF1 = AlignLo(dstC, 1 * F); - size_t dstCF2 = AlignLo(dstC, 2 * F); - size_t dc = 0; - for (; dc < dstCF2; dc += 2 * F) - { - __m512 sums[6][2]; - __m512 bias0 = bias ? _mm512_loadu_ps(bias + dc + 0 * F) : _mm512_setzero_ps(); - __m512 bias1 = bias ? _mm512_loadu_ps(bias + dc + 1 * F) : _mm512_setzero_ps(); - sums[0][0] = bias0; - sums[0][1] = bias1; - sums[1][0] = bias0; - sums[1][1] = bias1; - sums[2][0] = bias0; - sums[2][1] = bias1; - sums[3][0] = bias0; - sums[3][1] = bias1; - sums[4][0] = bias0; - sums[4][1] = bias1; - sums[5][0] = bias0; - sums[5][1] = bias1; - KernelHwcDefaultBody6x2(src, p, weight + dc, sums); - _mm512_storeu_ps(dst + dc + 0 * dstC + 0 * F, Activate<type>(sums[0][0], params, dc + 0 * F)); - _mm512_storeu_ps(dst + dc + 0 * dstC + 1 * F, Activate<type>(sums[0][1], params, dc + 1 * F)); - _mm512_storeu_ps(dst + dc + 1 * dstC + 0 * F, Activate<type>(sums[1][0], params, dc + 0 * F)); - _mm512_storeu_ps(dst + dc + 1 * dstC + 1 * F, Activate<type>(sums[1][1], params, dc + 1 * F)); - _mm512_storeu_ps(dst + dc + 2 * dstC + 0 * F, Activate<type>(sums[2][0], params, dc + 0 * F)); - _mm512_storeu_ps(dst + dc + 2 * dstC + 1 * F, Activate<type>(sums[2][1], params, dc + 1 * F)); - _mm512_storeu_ps(dst + dc + 3 * dstC + 0 * F, Activate<type>(sums[3][0], params, dc + 0 * F)); - _mm512_storeu_ps(dst + dc + 3 * dstC + 1 * F, Activate<type>(sums[3][1], params, dc + 1 * F)); - _mm512_storeu_ps(dst + dc + 4 * dstC + 0 * F, Activate<type>(sums[4][0], params, dc + 0 * F)); - _mm512_storeu_ps(dst + dc + 4 * dstC + 1 * F, Activate<type>(sums[4][1], params, dc + 1 * F)); - _mm512_storeu_ps(dst + dc + 5 * dstC + 0 * F, Activate<type>(sums[5][0], params, dc + 0 * F)); - _mm512_storeu_ps(dst + dc + 5 * dstC + 1 * F, Activate<type>(sums[5][1], params, dc + 1 * F)); - } - for (; dc < dstCF1; dc += 1 * F) - { - __m512 sums[6][1]; - __m512 bias0 = bias ? 
_mm512_loadu_ps(bias + dc) : _mm512_setzero_ps(); - sums[0][0] = bias0; - sums[1][0] = bias0; - sums[2][0] = bias0; - sums[3][0] = bias0; - sums[4][0] = bias0; - sums[5][0] = bias0; - KernelHwcDefaultBody6x1(src, p, weight + dc, sums); - _mm512_storeu_ps(dst + dc + 0 * dstC, Activate<type>(sums[0][0], params, dc)); - _mm512_storeu_ps(dst + dc + 1 * dstC, Activate<type>(sums[1][0], params, dc)); - _mm512_storeu_ps(dst + dc + 2 * dstC, Activate<type>(sums[2][0], params, dc)); - _mm512_storeu_ps(dst + dc + 3 * dstC, Activate<type>(sums[3][0], params, dc)); - _mm512_storeu_ps(dst + dc + 4 * dstC, Activate<type>(sums[4][0], params, dc)); - _mm512_storeu_ps(dst + dc + 5 * dstC, Activate<type>(sums[5][0], params, dc)); - } - if (dc < dstC) - { - __mmask16 tail = TailMask16(dstC - dstCF1); - __m512 sums[6][1]; - __m512 bias0 = bias ? _mm512_maskz_loadu_ps(tail, bias + dc) : _mm512_setzero_ps(); - sums[0][0] = bias0; - sums[1][0] = bias0; - sums[2][0] = bias0; - sums[3][0] = bias0; - sums[4][0] = bias0; - sums[5][0] = bias0; - KernelHwcDefaultBody6x1(src, p, weight + dc, sums, tail); - _mm512_mask_storeu_ps(dst + dc + 0 * dstC, tail, Activate<type>(sums[0][0], params, dc, tail)); - _mm512_mask_storeu_ps(dst + dc + 1 * dstC, tail, Activate<type>(sums[1][0], params, dc, tail)); - _mm512_mask_storeu_ps(dst + dc + 2 * dstC, tail, Activate<type>(sums[2][0], params, dc, tail)); - _mm512_mask_storeu_ps(dst + dc + 3 * dstC, tail, Activate<type>(sums[3][0], params, dc, tail)); - _mm512_mask_storeu_ps(dst + dc + 4 * dstC, tail, Activate<type>(sums[4][0], params, dc, tail)); - _mm512_mask_storeu_ps(dst + dc + 5 * dstC, tail, Activate<type>(sums[5][0], params, dc, tail)); - } - }
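All of the Body kernels here share one inner pattern: broadcast a single source float, multiply it by a vector of 16 consecutive output-channel weights, and accumulate with FMA, so each weight load is reused across every pixel row held in registers. A cut-down 2-row variant of the pattern (an illustrative sketch; the real kernels additionally walk the kernelY/kernelX window and apply bias and activation):

    #include <immintrin.h>
    #include <cstddef>

    // Accumulate two NHWC pixels against the same 16 output channels.
    // sums[0], sums[1] must be initialized by the caller (e.g. to bias).
    static void MicroKernel2x16(const float* src0, const float* src1, size_t srcC,
        const float* weight /* [srcC][16] */, __m512 sums[2])
    {
        for (size_t c = 0; c < srcC; ++c, weight += 16)
        {
            __m512 w0 = _mm512_loadu_ps(weight);             // one weight load ...
            sums[0] = _mm512_fmadd_ps(_mm512_set1_ps(src0[c]), w0, sums[0]); // ... reused
            sums[1] = _mm512_fmadd_ps(_mm512_set1_ps(src1[c]), w0, sums[1]); // ... twice
        }
    }

KernelHwcDefaultBody6x2 is the same loop unrolled to 6 pixels and 2 weight vectors, i.e. 12 accumulator registers, which amortizes each pair of weight loads over six FMA chains.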
SIMD_INLINE void KernelHwcDefaultBody8x3(const float * src, const ConvParam32f & p, const float * weight, __m512 sums[8][3]) - { - size_t size = p.kernelX * p.srcC, dstC = p.dstC, stride = p.srcW * p.srcC, step = p.srcC * p.strideX; - const float * src0 = src + 0 * step; - const float * src1 = src + 1 * step; - const float * src2 = src + 2 * step; - const float * src3 = src + 3 * step; - const float * src4 = src + 4 * step; - const float * src5 = src + 5 * step; - const float * src6 = src + 6 * step; - const float * src7 = src + 7 * step; - __m512 w0, w1, w2, s0; - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t offset = ky * stride; - for (size_t end = offset + size; offset < end; ++offset) - { - w0 = _mm512_loadu_ps(weight + 0 * F); - w1 = _mm512_loadu_ps(weight + 1 * F); - w2 = _mm512_loadu_ps(weight + 2 * F); - s0 = _mm512_set1_ps(src0[offset]); - sums[0][0] = _mm512_fmadd_ps(s0, w0, sums[0][0]); - sums[0][1] = _mm512_fmadd_ps(s0, w1, sums[0][1]); - sums[0][2] = _mm512_fmadd_ps(s0, w2, sums[0][2]); - s0 = _mm512_set1_ps(src1[offset]); - sums[1][0] = _mm512_fmadd_ps(s0, w0, sums[1][0]); - sums[1][1] = _mm512_fmadd_ps(s0, w1, sums[1][1]); - sums[1][2] = _mm512_fmadd_ps(s0, w2, sums[1][2]); - s0 = _mm512_set1_ps(src2[offset]); - sums[2][0] = _mm512_fmadd_ps(s0, w0, sums[2][0]); - sums[2][1] = _mm512_fmadd_ps(s0, w1, sums[2][1]); - sums[2][2] = _mm512_fmadd_ps(s0, w2, sums[2][2]); - s0 = _mm512_set1_ps(src3[offset]); - sums[3][0] = _mm512_fmadd_ps(s0, w0, sums[3][0]); - sums[3][1] = _mm512_fmadd_ps(s0, w1, sums[3][1]); - sums[3][2] = _mm512_fmadd_ps(s0, w2, sums[3][2]); - s0 = _mm512_set1_ps(src4[offset]); - sums[4][0] = _mm512_fmadd_ps(s0, w0, sums[4][0]); - sums[4][1] = _mm512_fmadd_ps(s0, w1, sums[4][1]); - sums[4][2] = _mm512_fmadd_ps(s0, w2, sums[4][2]); - s0 = _mm512_set1_ps(src5[offset]); - sums[5][0] = _mm512_fmadd_ps(s0, w0, sums[5][0]); - sums[5][1] = _mm512_fmadd_ps(s0, w1, sums[5][1]); - sums[5][2] = _mm512_fmadd_ps(s0, w2, sums[5][2]); - s0 = _mm512_set1_ps(src6[offset]); - sums[6][0] = _mm512_fmadd_ps(s0, w0, sums[6][0]); - sums[6][1] = _mm512_fmadd_ps(s0, w1, sums[6][1]); - sums[6][2] = _mm512_fmadd_ps(s0, w2, sums[6][2]); - s0 = _mm512_set1_ps(src7[offset]); - sums[7][0] = _mm512_fmadd_ps(s0, w0, sums[7][0]); - sums[7][1] = _mm512_fmadd_ps(s0, w1, sums[7][1]); - sums[7][2] = _mm512_fmadd_ps(s0, w2, sums[7][2]); - weight += dstC; - } - } - } - - SIMD_INLINE void KernelHwcDefaultBody8x1(const float * src, const ConvParam32f & p, const float * weight, __m512 sums[8][1], __mmask16 tail = -1) - { - size_t size = p.kernelX * p.srcC, dstC = p.dstC, stride = p.srcW * p.srcC, step = p.srcC * p.strideX; - const float * src0 = src + 0 * step; - const float * src1 = src + 1 * step; - const float * src2 = src + 2 * step; - const float * src3 = src + 3 * step; - const float * src4 = src + 4 * step; - const float * src5 = src + 5 * step; - const float * src6 = src + 6 * step; - const float * src7 = src + 7 * step; - __m512 w0, s0; - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t offset = ky * stride; - for (size_t end = offset + size; offset < end; ++offset) - { - w0 = _mm512_maskz_loadu_ps(tail, weight + 0 * F); - s0 = _mm512_set1_ps(src0[offset]); - sums[0][0] = _mm512_fmadd_ps(s0, w0, sums[0][0]); - s0 = _mm512_set1_ps(src1[offset]); - sums[1][0] = _mm512_fmadd_ps(s0, w0, sums[1][0]); - s0 = _mm512_set1_ps(src2[offset]); - sums[2][0] = _mm512_fmadd_ps(s0, w0, sums[2][0]); - s0 = _mm512_set1_ps(src3[offset]); - sums[3][0] = _mm512_fmadd_ps(s0, w0, sums[3][0]); - s0 = _mm512_set1_ps(src4[offset]); - sums[4][0] = _mm512_fmadd_ps(s0, w0, sums[4][0]); - s0 = _mm512_set1_ps(src5[offset]); - sums[5][0] = _mm512_fmadd_ps(s0, w0, sums[5][0]); - s0 = _mm512_set1_ps(src6[offset]); - sums[6][0] = _mm512_fmadd_ps(s0, w0, sums[6][0]); - s0 = _mm512_set1_ps(src7[offset]); - sums[7][0] = _mm512_fmadd_ps(s0, w0, sums[7][0]); - weight += dstC; - } - } - }
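The 8x3 shape of KernelHwcDefaultBody8x3 is picked for p.dstC == 48: three zmm vectors exactly cover the 48 output channels, so the main loop needs no tail mask at all. The register budget works out as follows (a back-of-the-envelope count, assuming the 32-register zmm file of 64-bit AVX-512 code):

    // accumulators: 8 pixel rows x 3 channel vectors = 24 zmm
    // weights:      w0, w1, w2                       =  3 zmm
    // broadcast:    s0                               =  1 zmm
    // total:        28 of 32 zmm registers, leaving headroom for the compiler.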
template<::SimdConvolutionActivationType type> - SIMD_INLINE void KernelHwcDefaultBody8(const float * src, const ConvParam32f & p, const float * weight, const float * bias, const float * params, float * dst) - { - size_t dstC = p.dstC; - size_t dstCF1 = AlignLo(dstC, 1 * F); - size_t dstCF3 = AlignLoAny(dstC, 3 * F); - size_t dc = 0; - for (; dc < dstCF3; dc += 3 * F) - { - __m512 sums[8][3]; - __m512 bias0 = bias ? _mm512_loadu_ps(bias + dc + 0 * F) : _mm512_setzero_ps(); - __m512 bias1 = bias ? _mm512_loadu_ps(bias + dc + 1 * F) : _mm512_setzero_ps(); - __m512 bias2 = bias ? _mm512_loadu_ps(bias + dc + 2 * F) : _mm512_setzero_ps(); - sums[0][0] = bias0; - sums[0][1] = bias1; - sums[0][2] = bias2; - sums[1][0] = bias0; - sums[1][1] = bias1; - sums[1][2] = bias2; - sums[2][0] = bias0; - sums[2][1] = bias1; - sums[2][2] = bias2; - sums[3][0] = bias0; - sums[3][1] = bias1; - sums[3][2] = bias2; - sums[4][0] = bias0; - sums[4][1] = bias1; - sums[4][2] = bias2; - sums[5][0] = bias0; - sums[5][1] = bias1; - sums[5][2] = bias2; - sums[6][0] = bias0; - sums[6][1] = bias1; - sums[6][2] = bias2; - sums[7][0] = bias0; - sums[7][1] = bias1; - sums[7][2] = bias2; - KernelHwcDefaultBody8x3(src, p, weight + dc, sums); - _mm512_storeu_ps(dst + dc + 0 * dstC + 0 * F, Activate<type>(sums[0][0], params, dc + 0 * F)); - _mm512_storeu_ps(dst + dc + 0 * dstC + 1 * F, Activate<type>(sums[0][1], params, dc + 1 * F)); - _mm512_storeu_ps(dst + dc + 0 * dstC + 2 * F, Activate<type>(sums[0][2], params, dc + 2 * F)); - _mm512_storeu_ps(dst + dc + 1 * dstC + 0 * F, Activate<type>(sums[1][0], params, dc + 0 * F)); - _mm512_storeu_ps(dst + dc + 1 * dstC + 1 * F, Activate<type>(sums[1][1], params, dc + 1 * F)); - _mm512_storeu_ps(dst + dc + 1 * dstC + 2 * F, Activate<type>(sums[1][2], params, dc + 2 * F)); - _mm512_storeu_ps(dst + dc + 2 * dstC + 0 * F, Activate<type>(sums[2][0], params, dc + 0 * F)); - _mm512_storeu_ps(dst + dc + 2 * dstC + 1 * F, Activate<type>(sums[2][1], params, dc + 1 * F)); - _mm512_storeu_ps(dst + dc + 2 * dstC + 2 * F, Activate<type>(sums[2][2], params, dc + 2 * F)); - _mm512_storeu_ps(dst + dc + 3 * dstC + 0 * F, Activate<type>(sums[3][0], params, dc + 0 * F)); - _mm512_storeu_ps(dst + dc + 3 * dstC + 1 * F, Activate<type>(sums[3][1], params, dc + 1 * F)); - _mm512_storeu_ps(dst + dc + 3 * dstC + 2 * F, Activate<type>(sums[3][2], params, dc + 2 * F)); - _mm512_storeu_ps(dst + dc + 4 * dstC + 0 * F, Activate<type>(sums[4][0], params, dc + 0 * F)); - _mm512_storeu_ps(dst + dc + 4 * dstC + 1 * F, Activate<type>(sums[4][1], params, dc + 1 * F)); - _mm512_storeu_ps(dst + dc + 4 * dstC + 2 * F, Activate<type>(sums[4][2], params, dc + 2 * F)); - _mm512_storeu_ps(dst + dc + 5 * dstC + 0 * F, Activate<type>(sums[5][0], params, dc + 0 * F)); - _mm512_storeu_ps(dst + dc + 5 * dstC + 1 * F, Activate<type>(sums[5][1], params, dc + 1 * F)); - _mm512_storeu_ps(dst + dc + 5 * dstC + 2 * F, Activate<type>(sums[5][2], params, dc + 2 * F)); - _mm512_storeu_ps(dst + dc + 6 * dstC + 0 * F, Activate<type>(sums[6][0], params, dc + 0 * F)); - _mm512_storeu_ps(dst + dc + 6 * dstC + 1 * F, Activate<type>(sums[6][1], params, dc + 1 * F)); - _mm512_storeu_ps(dst + dc + 6 * dstC + 2 * F, Activate<type>(sums[6][2], params, dc + 2 * F)); - _mm512_storeu_ps(dst + dc + 7 * dstC + 0 * F, Activate<type>(sums[7][0], params, dc + 0 * F)); - _mm512_storeu_ps(dst + dc + 7 * dstC + 1 * F, Activate<type>(sums[7][1], params, dc + 1 * F)); - _mm512_storeu_ps(dst + dc + 7 * dstC + 2 * F, Activate<type>(sums[7][2], params, dc + 2 * F)); - } - for (; dc < dstCF1; dc += 1 * F) - { - __m512 sums[8][1]; - __m512 bias0 = bias ? 
_mm512_loadu_ps(bias + dc) : _mm512_setzero_ps(); - sums[0][0] = bias0; - sums[1][0] = bias0; - sums[2][0] = bias0; - sums[3][0] = bias0; - sums[4][0] = bias0; - sums[5][0] = bias0; - sums[6][0] = bias0; - sums[7][0] = bias0; - KernelHwcDefaultBody8x1(src, p, weight + dc, sums); - _mm512_storeu_ps(dst + dc + 0 * dstC, Activate<type>(sums[0][0], params, dc)); - _mm512_storeu_ps(dst + dc + 1 * dstC, Activate<type>(sums[1][0], params, dc)); - _mm512_storeu_ps(dst + dc + 2 * dstC, Activate<type>(sums[2][0], params, dc)); - _mm512_storeu_ps(dst + dc + 3 * dstC, Activate<type>(sums[3][0], params, dc)); - _mm512_storeu_ps(dst + dc + 4 * dstC, Activate<type>(sums[4][0], params, dc)); - _mm512_storeu_ps(dst + dc + 5 * dstC, Activate<type>(sums[5][0], params, dc)); - _mm512_storeu_ps(dst + dc + 6 * dstC, Activate<type>(sums[6][0], params, dc)); - _mm512_storeu_ps(dst + dc + 7 * dstC, Activate<type>(sums[7][0], params, dc)); - } - if (dc < dstC) - { - __mmask16 tail = TailMask16(dstC - dstCF1); - __m512 sums[8][1]; - __m512 bias0 = bias ? _mm512_maskz_loadu_ps(tail, bias + dc) : _mm512_setzero_ps(); - sums[0][0] = bias0; - sums[1][0] = bias0; - sums[2][0] = bias0; - sums[3][0] = bias0; - sums[4][0] = bias0; - sums[5][0] = bias0; - sums[6][0] = bias0; - sums[7][0] = bias0; - KernelHwcDefaultBody8x1(src, p, weight + dc, sums, tail); - _mm512_mask_storeu_ps(dst + dc + 0 * dstC, tail, Activate<type>(sums[0][0], params, dc, tail)); - _mm512_mask_storeu_ps(dst + dc + 1 * dstC, tail, Activate<type>(sums[1][0], params, dc, tail)); - _mm512_mask_storeu_ps(dst + dc + 2 * dstC, tail, Activate<type>(sums[2][0], params, dc, tail)); - _mm512_mask_storeu_ps(dst + dc + 3 * dstC, tail, Activate<type>(sums[3][0], params, dc, tail)); - _mm512_mask_storeu_ps(dst + dc + 4 * dstC, tail, Activate<type>(sums[4][0], params, dc, tail)); - _mm512_mask_storeu_ps(dst + dc + 5 * dstC, tail, Activate<type>(sums[5][0], params, dc, tail)); - _mm512_mask_storeu_ps(dst + dc + 6 * dstC, tail, Activate<type>(sums[6][0], params, dc, tail)); - _mm512_mask_storeu_ps(dst + dc + 7 * dstC, tail, Activate<type>(sums[7][0], params, dc, tail)); - } - }
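With a 1x1 kernel the NHWC convolution degenerates into a small matrix product, which is what KernelHwcDefaultBody6_1x1x16 below exploits: dst[6][16] = src[6][srcC] * weight[srcC][16], plus bias. A scalar equivalent (illustrative; Body6_1x1x16_Ref is a hypothetical name, and the activation step is omitted):

    #include <cstddef>

    static void Body6_1x1x16_Ref(const float* src, size_t srcC, size_t strideX,
        const float* weight /* [srcC][16] */, const float* bias, float* dst)
    {
        for (size_t row = 0; row < 6; ++row)           // 6 consecutive output pixels
            for (size_t oc = 0; oc < 16; ++oc)         // 16 output channels
            {
                float sum = bias ? bias[oc] : 0.0f;
                for (size_t c = 0; c < srcC; ++c)      // inner product over input channels
                    sum += src[row * strideX * srcC + c] * weight[c * 16 + oc];
                dst[row * 16 + oc] = sum;
            }
    }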
template<::SimdConvolutionActivationType type> - SIMD_INLINE void KernelHwcDefaultBody6_1x1x16(const float * src, const ConvParam32f & p, const float * weight, const float * bias, const float * params, float * dst) - { - size_t size = p.srcC, step = p.srcC * p.strideX; - const float * src0 = src + 0 * step; - const float * src1 = src + 1 * step; - const float * src2 = src + 2 * step; - const float * src3 = src + 3 * step; - const float * src4 = src + 4 * step; - const float * src5 = src + 5 * step; - __m512 w0, w1, s0, s1; - __m512 sums[6]; - __m512 bias0 = bias ? _mm512_loadu_ps(bias) : _mm512_setzero_ps(); - sums[0] = bias0; - sums[1] = bias0; - sums[2] = bias0; - sums[3] = bias0; - sums[4] = bias0; - sums[5] = bias0; - size_t offset = 0, size2 = size & (~1); - for (; offset < size2; offset += 2) - { - w0 = _mm512_loadu_ps(weight + 0 * F); - w1 = _mm512_loadu_ps(weight + 1 * F); - s0 = _mm512_set1_ps(src0[offset + 0]); - s1 = _mm512_set1_ps(src1[offset + 0]); - sums[0] = _mm512_fmadd_ps(s0, w0, sums[0]); - sums[1] = _mm512_fmadd_ps(s1, w0, sums[1]); - s0 = _mm512_set1_ps(src0[offset + 1]); - s1 = _mm512_set1_ps(src1[offset + 1]); - sums[0] = _mm512_fmadd_ps(s0, w1, sums[0]); - sums[1] = _mm512_fmadd_ps(s1, w1, sums[1]); - s0 = _mm512_set1_ps(src2[offset + 0]); - s1 = _mm512_set1_ps(src3[offset + 0]); - sums[2] = _mm512_fmadd_ps(s0, w0, sums[2]); - sums[3] = _mm512_fmadd_ps(s1, w0, sums[3]); - s0 = _mm512_set1_ps(src2[offset + 1]); - s1 = _mm512_set1_ps(src3[offset + 1]); - sums[2] = _mm512_fmadd_ps(s0, w1, sums[2]); - sums[3] = _mm512_fmadd_ps(s1, w1, sums[3]); - s0 = _mm512_set1_ps(src4[offset + 0]); - s1 = _mm512_set1_ps(src5[offset + 0]); - sums[4] = _mm512_fmadd_ps(s0, w0, sums[4]); - sums[5] = _mm512_fmadd_ps(s1, w0, sums[5]); - s0 = _mm512_set1_ps(src4[offset + 1]); - s1 = _mm512_set1_ps(src5[offset + 1]); - sums[4] = _mm512_fmadd_ps(s0, w1, sums[4]); - sums[5] = _mm512_fmadd_ps(s1, w1, sums[5]); - weight += 2 * F; - } - for (; offset < size; ++offset) - { - w0 = _mm512_loadu_ps(weight + 0 * F); - s0 = _mm512_set1_ps(src0[offset]); - s1 = _mm512_set1_ps(src1[offset]); - sums[0] = _mm512_fmadd_ps(s0, w0, sums[0]); - sums[1] = _mm512_fmadd_ps(s1, w0, sums[1]); - s0 = _mm512_set1_ps(src2[offset]); - s1 = _mm512_set1_ps(src3[offset]); - sums[2] = _mm512_fmadd_ps(s0, w0, sums[2]); - sums[3] = _mm512_fmadd_ps(s1, w0, sums[3]); - s0 = _mm512_set1_ps(src4[offset]); - s1 = _mm512_set1_ps(src5[offset]); - sums[4] = _mm512_fmadd_ps(s0, w0, sums[4]); - sums[5] = _mm512_fmadd_ps(s1, w0, sums[5]); - weight += F; - } - _mm512_storeu_ps(dst + 0 * F, Activate<type>(sums[0], params, 0)); - _mm512_storeu_ps(dst + 1 * F, Activate<type>(sums[1], params, 0)); - _mm512_storeu_ps(dst + 2 * F, Activate<type>(sums[2], params, 0)); - _mm512_storeu_ps(dst + 3 * F, Activate<type>(sums[3], params, 0)); - _mm512_storeu_ps(dst + 4 * F, Activate<type>(sums[4], params, 0)); - _mm512_storeu_ps(dst + 5 * F, Activate<type>(sums[5], params, 0)); - }
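ConvolutionDirectNhwcConvolutionBiasActivationDefault below splits the output plane into nose (top/left padding), body, and tail (bottom/right padding) regions, so that only border pixels pay for kernel clipping. Worked numbers for a hypothetical configuration (srcH = 8, kernelY = 3, strideY = 1, padY = padH = 1):

    // noseH = padY = 1                   -> output rows [0, 1): kernel clipped at the top
    // bodyH = srcH - kernelY + 1 + noseH
    //       = 8 - 3 + 1 + 1 = 7          -> output rows [1, 7): all kernelY taps in range
    // tailH = bodyH + padH = 8           -> output rows [7, 8): kernel clipped at the bottom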
KernelHwcDefaultEdge(src + (sx - noseW) * p.srcC, p, kY + sy, p.kernelX, w, bias, params, dst); - for (; sx < tailW; sx += p.strideX, dst += p.dstC) - KernelHwcDefaultEdge(src + (sx - noseW) * p.srcC, p, kY + sy, kW - sx, w, bias, params, dst); - } - src += (sy - noseH)*p.srcW*p.srcC; - for (; sy < bodyH; sy += p.strideY) - { - size_t sx = 0; - for (; sx < noseW; sx += p.strideX, dst += p.dstC) - KernelHwcDefaultEdge(src, p, p.kernelY, kX + sx, weight + (noseW - sx)*wS, bias, params, dst); - if (is1x1x16) - { - for (; sx < bodyW6; sx += 6 * p.strideX, dst += 6 * p.dstC) - KernelHwcDefaultBody6_1x1x16(src + (sx - noseW) * p.srcC, p, weight, bias, params, dst); - } - else if (p.dstC == 48) - { - for (; sx < bodyW8; sx += 8 * p.strideX, dst += 8 * p.dstC) - KernelHwcDefaultBody8(src + (sx - noseW) * p.srcC, p, weight, bias, params, dst); - } - else - { - for (; sx < bodyW6; sx += 6 * p.strideX, dst += 6 * p.dstC) - KernelHwcDefaultBody6(src + (sx - noseW) * p.srcC, p, weight, bias, params, dst); - } - for (; sx < bodyW2; sx += 2 * p.strideX, dst += 2 * p.dstC) - KernelHwcDefaultBody2(src + (sx - noseW) * p.srcC, p, weight, bias, params, dst); - for (; sx < bodyW; sx += p.strideX, dst += p.dstC) - KernelHwcDefaultEdge(src + (sx - noseW) * p.srcC, p, p.kernelY, p.kernelX, weight, bias, params, dst); - for (; sx < tailW; sx += p.strideX, dst += p.dstC) - KernelHwcDefaultEdge(src + (sx - noseW) * p.srcC, p, p.kernelY, kW - sx, weight, bias, params, dst); - src += p.strideY*p.srcW*p.srcC; - } - for (; sy < tailH; sy += p.strideY) - { - size_t sx = 0; - for (; sx < noseW; sx += p.strideX, dst += p.dstC) - KernelHwcDefaultEdge(src, p, kH - sy, kX + sx, weight + (noseW - sx)*wS, bias, params, dst); - for (; sx < bodyW; sx += p.strideX, dst += p.dstC) - KernelHwcDefaultEdge(src + (sx - noseW) * p.srcC, p, kH - sy, p.kernelX, weight, bias, params, dst); - for (; sx < tailW; sx += p.strideX, dst += p.dstC) - KernelHwcDefaultEdge(src + (sx - noseW) * p.srcC, p, kH - sy, kW - sx, weight, bias, params, dst); - src += p.strideY*p.srcW*p.srcC; - } - } - - template<::SimdConvolutionActivationType type> void ConvolutionDirectNhwcConvolutionBiasActivationDepthwise(const float * src, const ConvParam32f & p, const float * weight, const float * bias, const float * params, float * dst) - { - size_t size = p.group; - size_t sizeF = AlignLo(size, F); - size_t size2F = AlignLo(size, 2 * F); - size_t size4F = AlignLo(size, 4 * F); - size_t size8F = AlignLo(size, 8 * F); - for (size_t dy = 0; dy < p.dstH; ++dy) - { - for (size_t dx = 0; dx < p.dstW; ++dx) - { - size_t i = 0; - for (; i < size8F; i += 8 * F) - { - __m512 sums[8]; - if (bias) - { - sums[0] = _mm512_loadu_ps(bias + i + 0 * F); - sums[1] = _mm512_loadu_ps(bias + i + 1 * F); - sums[2] = _mm512_loadu_ps(bias + i + 2 * F); - sums[3] = _mm512_loadu_ps(bias + i + 3 * F); - sums[4] = _mm512_loadu_ps(bias + i + 4 * F); - sums[5] = _mm512_loadu_ps(bias + i + 5 * F); - sums[6] = _mm512_loadu_ps(bias + i + 6 * F); - sums[7] = _mm512_loadu_ps(bias + i + 7 * F); - } - else - { - sums[0] = _mm512_setzero_ps(); - sums[1] = _mm512_setzero_ps(); - sums[2] = _mm512_setzero_ps(); - sums[3] = _mm512_setzero_ps(); - sums[4] = _mm512_setzero_ps(); - sums[5] = _mm512_setzero_ps(); - sums[6] = _mm512_setzero_ps(); - sums[7] = _mm512_setzero_ps(); - } - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t sy = dy * p.strideY + ky * p.dilationY - p.padY; - if (sy < p.srcH) - { - for (size_t kx = 0; kx < p.kernelX; ++kx) - { - size_t sx = dx * p.strideX + kx * p.dilationX - 
p.padX; - if (sx < p.srcW) - { - const float * pw = weight + (ky*p.kernelX + kx)*size + i; - const float * ps = src + (sy*p.srcW + sx)*size + i; - sums[0] = _mm512_fmadd_ps(_mm512_loadu_ps(ps + 0 * F), _mm512_loadu_ps(pw + 0 * F), sums[0]); - sums[1] = _mm512_fmadd_ps(_mm512_loadu_ps(ps + 1 * F), _mm512_loadu_ps(pw + 1 * F), sums[1]); - sums[2] = _mm512_fmadd_ps(_mm512_loadu_ps(ps + 2 * F), _mm512_loadu_ps(pw + 2 * F), sums[2]); - sums[3] = _mm512_fmadd_ps(_mm512_loadu_ps(ps + 3 * F), _mm512_loadu_ps(pw + 3 * F), sums[3]); - sums[4] = _mm512_fmadd_ps(_mm512_loadu_ps(ps + 4 * F), _mm512_loadu_ps(pw + 4 * F), sums[4]); - sums[5] = _mm512_fmadd_ps(_mm512_loadu_ps(ps + 5 * F), _mm512_loadu_ps(pw + 5 * F), sums[5]); - sums[6] = _mm512_fmadd_ps(_mm512_loadu_ps(ps + 6 * F), _mm512_loadu_ps(pw + 6 * F), sums[6]); - sums[7] = _mm512_fmadd_ps(_mm512_loadu_ps(ps + 7 * F), _mm512_loadu_ps(pw + 7 * F), sums[7]); - } - } - } - } - _mm512_storeu_ps(dst + i + 0 * F, Activate(sums[0], params, i + 0 * F)); - _mm512_storeu_ps(dst + i + 1 * F, Activate(sums[1], params, i + 1 * F)); - _mm512_storeu_ps(dst + i + 2 * F, Activate(sums[2], params, i + 2 * F)); - _mm512_storeu_ps(dst + i + 3 * F, Activate(sums[3], params, i + 3 * F)); - _mm512_storeu_ps(dst + i + 4 * F, Activate(sums[4], params, i + 4 * F)); - _mm512_storeu_ps(dst + i + 5 * F, Activate(sums[5], params, i + 5 * F)); - _mm512_storeu_ps(dst + i + 6 * F, Activate(sums[6], params, i + 6 * F)); - _mm512_storeu_ps(dst + i + 7 * F, Activate(sums[7], params, i + 7 * F)); - } - for (; i < size4F; i += 4 * F) - { - __m512 sums[4]; - if (bias) - { - sums[0] = _mm512_loadu_ps(bias + i + 0 * F); - sums[1] = _mm512_loadu_ps(bias + i + 1 * F); - sums[2] = _mm512_loadu_ps(bias + i + 2 * F); - sums[3] = _mm512_loadu_ps(bias + i + 3 * F); - } - else - { - sums[0] = _mm512_setzero_ps(); - sums[1] = _mm512_setzero_ps(); - sums[2] = _mm512_setzero_ps(); - sums[3] = _mm512_setzero_ps(); - } - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t sy = dy * p.strideY + ky * p.dilationY - p.padY; - if (sy < p.srcH) - { - for (size_t kx = 0; kx < p.kernelX; ++kx) - { - size_t sx = dx * p.strideX + kx * p.dilationX - p.padX; - if (sx < p.srcW) - { - const float * pw = weight + (ky*p.kernelX + kx)*size + i; - const float * ps = src + (sy*p.srcW + sx)*size + i; - sums[0] = _mm512_fmadd_ps(_mm512_loadu_ps(ps + 0 * F), _mm512_loadu_ps(pw + 0 * F), sums[0]); - sums[1] = _mm512_fmadd_ps(_mm512_loadu_ps(ps + 1 * F), _mm512_loadu_ps(pw + 1 * F), sums[1]); - sums[2] = _mm512_fmadd_ps(_mm512_loadu_ps(ps + 2 * F), _mm512_loadu_ps(pw + 2 * F), sums[2]); - sums[3] = _mm512_fmadd_ps(_mm512_loadu_ps(ps + 3 * F), _mm512_loadu_ps(pw + 3 * F), sums[3]); - } - } - } - } - _mm512_storeu_ps(dst + i + 0 * F, Activate(sums[0], params, i + 0 * F)); - _mm512_storeu_ps(dst + i + 1 * F, Activate(sums[1], params, i + 1 * F)); - _mm512_storeu_ps(dst + i + 2 * F, Activate(sums[2], params, i + 2 * F)); - _mm512_storeu_ps(dst + i + 3 * F, Activate(sums[3], params, i + 3 * F)); - } - for (; i < size2F; i += 2 * F) - { - __m512 sums[2]; - if (bias) - { - sums[0] = _mm512_loadu_ps(bias + i + 0 * F); - sums[1] = _mm512_loadu_ps(bias + i + 1 * F); - } - else - { - sums[0] = _mm512_setzero_ps(); - sums[1] = _mm512_setzero_ps(); - } - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t sy = dy * p.strideY + ky * p.dilationY - p.padY; - if (sy < p.srcH) - { - for (size_t kx = 0; kx < p.kernelX; ++kx) - { - size_t sx = dx * p.strideX + kx * p.dilationX - p.padX; - if (sx < p.srcW) - { - const float * pw = weight 
+ (ky*p.kernelX + kx)*size + i; - const float * ps = src + (sy*p.srcW + sx)*size + i; - sums[0] = _mm512_fmadd_ps(_mm512_loadu_ps(ps + 0 * F), _mm512_loadu_ps(pw + 0 * F), sums[0]); - sums[1] = _mm512_fmadd_ps(_mm512_loadu_ps(ps + 1 * F), _mm512_loadu_ps(pw + 1 * F), sums[1]); - } - } - } - } - _mm512_storeu_ps(dst + i + 0 * F, Activate(sums[0], params, i + 0 * F)); - _mm512_storeu_ps(dst + i + 1 * F, Activate(sums[1], params, i + 1 * F)); - } - for (; i < size; i += F) - { - __mmask16 tail = i < sizeF ? __mmask16(-1) : TailMask16(size - i); - __m512 sum = bias ? _mm512_maskz_loadu_ps(tail, bias + i) : _mm512_setzero_ps(); - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t sy = dy * p.strideY + ky * p.dilationY - p.padY; - if (sy < p.srcH) - { - for (size_t kx = 0; kx < p.kernelX; ++kx) - { - size_t sx = dx * p.strideX + kx * p.dilationX - p.padX; - if (sx < p.srcW) - { - const float * pw = weight + (ky*p.kernelX + kx)*size + i; - const float * ps = src + (sy*p.srcW + sx)*size + i; - sum = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(tail, ps), _mm512_maskz_loadu_ps(tail, pw), sum); - } - } - } - } - _mm512_mask_storeu_ps(dst + i, tail, Activate(sum, params, i, tail)); - } - dst += p.dstC; - } - } - } - - template<::SimdConvolutionActivationType type> - SIMD_INLINE void ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge(const float * src, const ConvParam32f & p, size_t dy, size_t dx, const float * weight, const float * bias, const float * params, float * dst) - { - size_t srcC = p.srcC; - size_t srcCF = AlignLo(srcC, F); - size_t c = 0; - for (; c < srcCF; c += F) - { - __m512 sum = bias ? _mm512_loadu_ps(bias + c) : _mm512_setzero_ps(); - for (size_t ky = 0; ky < 3; ++ky) - { - size_t sy = dy * p.strideY + ky - p.padY; - if (sy < p.srcH) - { - for (size_t kx = 0; kx < 3; ++kx) - { - size_t sx = dx * p.strideX + kx - p.padX; - if (sx < p.srcW) - { - const float * pw = weight + (ky * 3 + kx) * srcC; - const float * ps = src + (sy*p.srcW + sx) * srcC; - sum = _mm512_fmadd_ps(_mm512_loadu_ps(ps), _mm512_loadu_ps(pw), sum); - } - } - } - } - _mm512_storeu_ps(dst + c, Activate(sum, params, c)); - src += F; - weight += F; - } - if (c < srcC) - { - __mmask16 tail = TailMask16(srcC - c); - __m512 sum = bias ? _mm512_maskz_loadu_ps(tail, bias + c) : _mm512_setzero_ps(); - for (size_t ky = 0; ky < 3; ++ky) - { - size_t sy = dy * p.strideY + ky - p.padY; - if (sy < p.srcH) - { - for (size_t kx = 0; kx < 3; ++kx) - { - size_t sx = dx * p.strideX + kx - p.padX; - if (sx < p.srcW) - { - const float * pw = weight + (ky*3 + kx) * srcC; - const float * ps = src + (sy*p.srcW + sx) * srcC; - sum = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(tail, ps), _mm512_maskz_loadu_ps(tail, pw), sum); - } - } - } - } - _mm512_mask_storeu_ps(dst + c, tail, Activate(sum, params, c, tail)); - } - } - - template<::SimdConvolutionActivationType type> - SIMD_INLINE void ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main1(const float * src, size_t srcS, size_t srcC, const float * weight, const float * bias, const float * params, float * dst) - { - size_t srcCF = AlignLo(srcC, F); - size_t c = 0; - for (; c < srcCF; c += F) - { - __m512 sum = bias ? 
_mm512_loadu_ps(bias + c) : _mm512_setzero_ps(); - for (size_t ky = 0; ky < 3; ++ky) - { - const float * ps = src + ky * srcS; - const float * pw = weight + ky * 3 * srcC; - sum = _mm512_fmadd_ps(_mm512_loadu_ps(ps + 0 * srcC), _mm512_loadu_ps(pw + 0 * srcC), sum); - sum = _mm512_fmadd_ps(_mm512_loadu_ps(ps + 1 * srcC), _mm512_loadu_ps(pw + 1 * srcC), sum); - sum = _mm512_fmadd_ps(_mm512_loadu_ps(ps + 2 * srcC), _mm512_loadu_ps(pw + 2 * srcC), sum); - } - _mm512_storeu_ps(dst + c, Activate(sum, params, c)); - src += F; - weight += F; - } - if (c < srcC) - { - __mmask16 tail = TailMask16(srcC - c); - __m512 sum = bias ? _mm512_maskz_loadu_ps(tail, bias + c) : _mm512_setzero_ps(); - for (size_t ky = 0; ky < 3; ++ky) - { - const float * ps = src + ky * srcS; - const float * pw = weight + ky * 3 * srcC; - sum = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(tail, ps + 0 * srcC), _mm512_maskz_loadu_ps(tail, pw + 0 * srcC), sum); - sum = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(tail, ps + 1 * srcC), _mm512_maskz_loadu_ps(tail, pw + 1 * srcC), sum); - sum = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(tail, ps + 2 * srcC), _mm512_maskz_loadu_ps(tail, pw + 2 * srcC), sum); - } - _mm512_mask_storeu_ps(dst + c, tail, Activate(sum, params, c, tail)); - } - } - - template<::SimdConvolutionActivationType type> - SIMD_INLINE void ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main2(const float * src, size_t srcS, size_t srcX, size_t srcC, const float * weight, const float * bias, const float * params, float * dst) - { - size_t srcCF = AlignLo(srcC, F); - size_t c = 0; - __m512 sum0, sum1, w0; - for (; c < srcCF; c += F) - { - sum0 = bias ? _mm512_loadu_ps(bias + c) : _mm512_setzero_ps(); - sum1 = sum0; - const float * pw = weight + c; - for (size_t ky = 0; ky < 3; ++ky) - { - const float * ps0 = src + ky * srcS; - const float * ps1 = ps0 + srcX; - w0 = _mm512_loadu_ps(pw); - sum0 = _mm512_fmadd_ps(_mm512_loadu_ps(ps0 + 0 * srcC), w0, sum0); - sum1 = _mm512_fmadd_ps(_mm512_loadu_ps(ps1 + 0 * srcC), w0, sum1); - pw += srcC; - w0 = _mm512_loadu_ps(pw); - sum0 = _mm512_fmadd_ps(_mm512_loadu_ps(ps0 + 1 * srcC), w0, sum0); - sum1 = _mm512_fmadd_ps(_mm512_loadu_ps(ps1 + 1 * srcC), w0, sum1); - pw += srcC; - w0 = _mm512_loadu_ps(pw); - sum0 = _mm512_fmadd_ps(_mm512_loadu_ps(ps0 + 2 * srcC), w0, sum0); - sum1 = _mm512_fmadd_ps(_mm512_loadu_ps(ps1 + 2 * srcC), w0, sum1); - pw += srcC; - } - _mm512_storeu_ps(dst + c, Activate(sum0, params, c)); - _mm512_storeu_ps(dst + c + srcC, Activate(sum1, params, c)); - src += F; - } - if (c < srcC) - { - __mmask16 tail = TailMask16(srcC - c); - sum0 = bias ? 
_mm512_maskz_loadu_ps(tail, bias + c) : _mm512_setzero_ps(); - sum1 = sum0; - const float * pw = weight + c; - for (size_t ky = 0; ky < 3; ++ky) - { - const float * ps0 = src + ky * srcS; - const float * ps1 = ps0 + srcX; - w0 = _mm512_maskz_loadu_ps(tail, pw); - sum0 = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(tail, ps0 + 0 * srcC), w0, sum0); - sum1 = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(tail, ps1 + 0 * srcC), w0, sum1); - pw += srcC; - w0 = _mm512_maskz_loadu_ps(tail, pw); - sum0 = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(tail, ps0 + 1 * srcC), w0, sum0); - sum1 = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(tail, ps1 + 1 * srcC), w0, sum1); - pw += srcC; - w0 = _mm512_maskz_loadu_ps(tail, pw); - sum0 = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(tail, ps0 + 2 * srcC), w0, sum0); - sum1 = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(tail, ps1 + 2 * srcC), w0, sum1); - pw += srcC; - } - _mm512_mask_storeu_ps(dst + c, tail, Activate(sum0, params, c, tail)); - _mm512_mask_storeu_ps(dst + c + srcC, tail, Activate(sum1, params, c, tail)); - } - } - - template<::SimdConvolutionActivationType type> - SIMD_INLINE void ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4(const float * src, size_t srcS, size_t srcX, size_t srcC, const float * weight, const float * bias, const float * params, float * dst) - { - size_t srcCF = AlignLo(srcC, F); - size_t c = 0; - for (; c < srcCF; c += F) - { - __m512 sum0, sum1, sum2, sum3, w0; - sum0 = bias ? _mm512_loadu_ps(bias + c) : _mm512_setzero_ps(); - sum1 = sum0; - sum2 = sum0; - sum3 = sum0; - const float * pw = weight + c; - const float * ps0 = src + 0 * srcX; - const float * ps1 = src + 1 * srcX; - const float * ps2 = src + 2 * srcX; - const float * ps3 = src + 3 * srcX; - for (size_t ky = 0; ky < 3; ++ky) - { - size_t offset = ky * srcS; - w0 = _mm512_loadu_ps(pw); - sum0 = _mm512_fmadd_ps(_mm512_loadu_ps(ps0 + offset), w0, sum0); - sum1 = _mm512_fmadd_ps(_mm512_loadu_ps(ps1 + offset), w0, sum1); - sum2 = _mm512_fmadd_ps(_mm512_loadu_ps(ps2 + offset), w0, sum2); - sum3 = _mm512_fmadd_ps(_mm512_loadu_ps(ps3 + offset), w0, sum3); - pw += srcC, offset += srcC; - w0 = _mm512_loadu_ps(pw); - sum0 = _mm512_fmadd_ps(_mm512_loadu_ps(ps0 + offset), w0, sum0); - sum1 = _mm512_fmadd_ps(_mm512_loadu_ps(ps1 + offset), w0, sum1); - sum2 = _mm512_fmadd_ps(_mm512_loadu_ps(ps2 + offset), w0, sum2); - sum3 = _mm512_fmadd_ps(_mm512_loadu_ps(ps3 + offset), w0, sum3); - pw += srcC, offset += srcC; - w0 = _mm512_loadu_ps(pw); - sum0 = _mm512_fmadd_ps(_mm512_loadu_ps(ps0 + offset), w0, sum0); - sum1 = _mm512_fmadd_ps(_mm512_loadu_ps(ps1 + offset), w0, sum1); - sum2 = _mm512_fmadd_ps(_mm512_loadu_ps(ps2 + offset), w0, sum2); - sum3 = _mm512_fmadd_ps(_mm512_loadu_ps(ps3 + offset), w0, sum3); - pw += srcC, offset += srcC; - } - _mm512_storeu_ps(dst + 0 * srcC, Activate(sum0, params, c)); - _mm512_storeu_ps(dst + 1 * srcC, Activate(sum1, params, c)); - _mm512_storeu_ps(dst + 2 * srcC, Activate(sum2, params, c)); - _mm512_storeu_ps(dst + 3 * srcC, Activate(sum3, params, c)); - src += F; - dst += F; - } - if (c < srcC) - { - __mmask16 tail = TailMask16(srcC - c); - __m512 sum0, sum1, sum2, sum3, w0; - sum0 = bias ? 
_mm512_maskz_loadu_ps(tail, bias + c) : _mm512_setzero_ps(); - sum1 = sum0; - sum2 = sum0; - sum3 = sum0; - const float * pw = weight + c; - const float * ps0 = src + 0 * srcX; - const float * ps1 = src + 1 * srcX; - const float * ps2 = src + 2 * srcX; - const float * ps3 = src + 3 * srcX; - for (size_t ky = 0; ky < 3; ++ky) - { - size_t offset = ky * srcS; - w0 = _mm512_maskz_loadu_ps(tail, pw); - sum0 = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(tail, ps0 + offset), w0, sum0); - sum1 = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(tail, ps1 + offset), w0, sum1); - sum2 = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(tail, ps2 + offset), w0, sum2); - sum3 = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(tail, ps3 + offset), w0, sum3); - pw += srcC, offset += srcC; - w0 = _mm512_maskz_loadu_ps(tail, pw); - sum0 = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(tail, ps0 + offset), w0, sum0); - sum1 = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(tail, ps1 + offset), w0, sum1); - sum2 = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(tail, ps2 + offset), w0, sum2); - sum3 = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(tail, ps3 + offset), w0, sum3); - pw += srcC, offset += srcC; - w0 = _mm512_maskz_loadu_ps(tail, pw); - sum0 = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(tail, ps0 + offset), w0, sum0); - sum1 = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(tail, ps1 + offset), w0, sum1); - sum2 = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(tail, ps2 + offset), w0, sum2); - sum3 = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(tail, ps3 + offset), w0, sum3); - pw += srcC, offset += srcC; - } - _mm512_mask_storeu_ps(dst + 0 * srcC, tail, Activate(sum0, params, c, tail)); - _mm512_mask_storeu_ps(dst + 1 * srcC, tail, Activate(sum1, params, c, tail)); - _mm512_mask_storeu_ps(dst + 2 * srcC, tail, Activate(sum2, params, c, tail)); - _mm512_mask_storeu_ps(dst + 3 * srcC, tail, Activate(sum3, params, c, tail)); - } - } - - template<::SimdConvolutionActivationType type> - SIMD_INLINE void ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge16(const float * src, const ConvParam32f & p, size_t dy, size_t dx, const __m512 * weight, __m512 bias, const float * params, float * dst) - { - __m512 sum = bias; - for (size_t ky = 0; ky < 3; ++ky) - { - size_t sy = dy * p.strideY + ky - p.padY; - if (sy < p.srcH) - { - for (size_t kx = 0; kx < 3; ++kx) - { - size_t sx = dx * p.strideX + kx - p.padX; - if (sx < p.srcW) - { - const float * ps = src + (sy*p.srcW + sx) * F; - sum = _mm512_fmadd_ps(_mm512_loadu_ps(ps), weight[ky * 3 + kx], sum); - } - } - } - } - _mm512_storeu_ps(dst, Activate(sum, params, 0)); - } - - template<::SimdConvolutionActivationType type> - SIMD_INLINE void ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main16x1(const float * src, size_t srcS, const __m512 * weight, __m512 bias, const float * params, float * dst) - { - __m512 sum = bias; - sum = _mm512_fmadd_ps(_mm512_loadu_ps(src + 0 * F), weight[0], sum); - sum = _mm512_fmadd_ps(_mm512_loadu_ps(src + 1 * F), weight[1], sum); - sum = _mm512_fmadd_ps(_mm512_loadu_ps(src + 2 * F), weight[2], sum); - src += srcS; - sum = _mm512_fmadd_ps(_mm512_loadu_ps(src + 0 * F), weight[3], sum); - sum = _mm512_fmadd_ps(_mm512_loadu_ps(src + 1 * F), weight[4], sum); - sum = _mm512_fmadd_ps(_mm512_loadu_ps(src + 2 * F), weight[5], sum); - src += srcS; - sum = _mm512_fmadd_ps(_mm512_loadu_ps(src + 0 * F), weight[6], sum); - sum = _mm512_fmadd_ps(_mm512_loadu_ps(src + 1 * F), weight[7], sum); - sum = _mm512_fmadd_ps(_mm512_loadu_ps(src + 2 * F), weight[8], sum); - _mm512_storeu_ps(dst, Activate(sum, params, 0)); - } - - 
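/* Reference sketch (illustrative, not from the Simd sources): a minimal scalar
   NHWC depthwise 3x3 convolution equivalent to what the Edge16 / Main16x1 /
   Main16x2 kernels above compute, assuming stride 1, dilation 1 and "same"
   padding (so dstH == srcH, dstW == srcW); with C == 16 the inner channel loop
   corresponds to a single __m512 register. All names here are hypothetical. */
#include <cstddef>
static void DepthwiseConv3x3NhwcRef(const float* src, size_t srcH, size_t srcW, size_t C,
    const float* weight /* layout [3 * 3 * C], as used by the kernels above */,
    const float* bias, float* dst, size_t padY, size_t padX)
{
    for (size_t dy = 0; dy < srcH; ++dy)
        for (size_t dx = 0; dx < srcW; ++dx)
            for (size_t c = 0; c < C; ++c)
            {
                float sum = bias ? bias[c] : 0.0f;
                for (size_t ky = 0; ky < 3; ++ky)
                {
                    size_t sy = dy + ky - padY; /* wraps below zero: one unsigned bounds check */
                    if (sy >= srcH) continue;
                    for (size_t kx = 0; kx < 3; ++kx)
                    {
                        size_t sx = dx + kx - padX;
                        if (sx >= srcW) continue;
                        sum += src[(sy * srcW + sx) * C + c] * weight[(ky * 3 + kx) * C + c];
                    }
                }
                dst[(dy * srcW + dx) * C + c] = sum; /* the SIMD kernels apply Activate() here */
            }
}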
template<::SimdConvolutionActivationType type> - SIMD_INLINE void ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main16x2(const float * src, size_t srcS, const __m512 * weight, __m512 bias, const float * params, float * dst) - { - __m512 sum0 = bias; - __m512 sum1 = bias; - for (size_t ky = 0; ky < 3; ++ky) - { - __m512 s0 = _mm512_loadu_ps(src + 0 * F); - __m512 s1 = _mm512_loadu_ps(src + 1 * F); - __m512 s2 = _mm512_loadu_ps(src + 2 * F); - __m512 s3 = _mm512_loadu_ps(src + 3 * F); - sum0 = _mm512_fmadd_ps(s0, weight[0], sum0); - sum1 = _mm512_fmadd_ps(s1, weight[0], sum1); - sum0 = _mm512_fmadd_ps(s1, weight[1], sum0); - sum1 = _mm512_fmadd_ps(s2, weight[1], sum1); - sum0 = _mm512_fmadd_ps(s2, weight[2], sum0); - sum1 = _mm512_fmadd_ps(s3, weight[2], sum1); - src += srcS; - weight += 3; - } - _mm512_storeu_ps(dst + 0, Activate(sum0, params, 0)); - _mm512_storeu_ps(dst + F, Activate(sum1, params, 0)); - } - - template<::SimdConvolutionActivationType type> - SIMD_INLINE void ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main48(const float * src, size_t srcS, const __m512 * weight, const float * bias, const float * params, float * dst) - { - __m512 sum0, sum1, sum2; - if (bias) - { - sum0 = _mm512_loadu_ps(bias + 0 * F); - sum1 = _mm512_loadu_ps(bias + 1 * F); - sum2 = _mm512_loadu_ps(bias + 2 * F); - } - else - { - sum0 = _mm512_setzero_ps(); - sum1 = _mm512_setzero_ps(); - sum2 = _mm512_setzero_ps(); - } - sum0 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 0 * F), weight[0], sum0); - sum1 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 1 * F), weight[1], sum1); - sum2 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 2 * F), weight[2], sum2); - sum0 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 3 * F), weight[3], sum0); - sum1 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 4 * F), weight[4], sum1); - sum2 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 5 * F), weight[5], sum2); - sum0 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 6 * F), weight[6], sum0); - sum1 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 7 * F), weight[7], sum1); - sum2 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 8 * F), weight[8], sum2); - src += srcS; - weight += 9; - sum0 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 0 * F), weight[0], sum0); - sum1 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 1 * F), weight[1], sum1); - sum2 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 2 * F), weight[2], sum2); - sum0 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 3 * F), weight[3], sum0); - sum1 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 4 * F), weight[4], sum1); - sum2 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 5 * F), weight[5], sum2); - sum0 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 6 * F), weight[6], sum0); - sum1 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 7 * F), weight[7], sum1); - sum2 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 8 * F), weight[8], sum2); - src += srcS; - weight += 9; - sum0 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 0 * F), weight[0], sum0); - sum1 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 1 * F), weight[1], sum1); - sum2 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 2 * F), weight[2], sum2); - sum0 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 3 * F), weight[3], sum0); - sum1 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 4 * F), weight[4], sum1); - sum2 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 5 * F), weight[5], sum2); - sum0 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 6 * F), weight[6], sum0); - sum1 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 7 * F), weight[7], sum1); - sum2 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 8 * F), weight[8], sum2); - _mm512_storeu_ps(dst + 0 * F, Activate(sum0, params, 0 * F)); - 
_mm512_storeu_ps(dst + 1 * F, Activate(sum1, params, 1 * F)); - _mm512_storeu_ps(dst + 2 * F, Activate(sum2, params, 2 * F)); - } - - template<::SimdConvolutionActivationType type> void ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3(const float * src, const ConvParam32f & p, const float * weight, const float * bias, const float * params, float * dst) - { - size_t srcS = p.srcC*p.srcW; - size_t srcX = p.srcC*p.strideX; - size_t dstH = p.dstH - p.padH; - size_t dstW = p.dstW - p.padW; - size_t dstW2 = AlignLo(dstW - p.padX, 2) + p.padX; - size_t dstW4 = AlignLo(dstW - p.padX, 4) + p.padX; - if (p.dstC == F && p.strideX == 1) - { - __m512 _weight[9]; - for (size_t i = 0; i < 9; ++i) - _weight[i] = _mm512_loadu_ps(weight + i * F); - __m512 _bias = bias ? _mm512_loadu_ps(bias) : _mm512_setzero_ps(); - size_t dy = 0; - for (; dy < p.padY; ++dy) - for (size_t dx = 0; dx < p.dstW; ++dx) - ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge16(src, p, dy, dx, _weight, _bias, params, dst), dst += F; - for (; dy < dstH; ++dy) - { - size_t dx = 0; - for (; dx < p.padX; ++dx) - ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge16(src, p, dy, dx, _weight, _bias, params, dst), dst += F; - size_t offset = ((dy * p.strideY - p.padY)*p.srcW + dx * p.strideX - p.padX)*p.srcC; - for (; dx < dstW2; dx += 2) - ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main16x2(src + offset, srcS, _weight, _bias, params, dst), offset += 2*F, dst += 2*F; - for (; dx < dstW; ++dx) - ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main16x1(src + offset, srcS, _weight, _bias, params, dst), offset += F, dst += F; - for (; dx < p.dstW; ++dx) - ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge16(src, p, dy, dx, _weight, _bias, params, dst), dst += F; - } - for (; dy < p.dstH; ++dy) - for (size_t dx = 0; dx < p.dstW; ++dx) - ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge16(src, p, dy, dx, _weight, _bias, params, dst), dst += F; - } - else - { - size_t dy = 0; - for (; dy < p.padY; ++dy) - for (size_t dx = 0; dx < p.dstW; ++dx) - ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge(src, p, dy, dx, weight, bias, params, dst), dst += p.dstC; - for (; dy < dstH; ++dy) - { - size_t dx = 0; - for (; dx < p.padX; ++dx) - ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge(src, p, dy, dx, weight, bias, params, dst), dst += p.dstC; - size_t offset = ((dy * p.strideY - p.padY)*p.srcW + dx * p.strideX - p.padX)*p.srcC; - if (p.srcC == 48) - { - __m512 _weight[27]; - for (size_t i = 0; i < 27; ++i) - _weight[i] = _mm512_loadu_ps(weight + i * F); - for (; dx < dstW; ++dx) - ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main48(src + offset, srcS, _weight, bias, params, dst), dst += p.dstC, offset += srcX; - } - else - for (; dx < dstW4; dx += 4) - ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4(src + offset, srcS, srcX, p.srcC, weight, bias, params, dst), dst += 4 * p.dstC, offset += 4 * srcX; - for (; dx < dstW2; dx += 2) - ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main2(src + offset, srcS, srcX, p.srcC, weight, bias, params, dst), dst += 2 * p.dstC, offset += 2 * srcX; - for (; dx < dstW; ++dx) - ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main1(src + offset, srcS, p.srcC, weight, bias, params, dst), dst += p.dstC, offset += srcX; - for (; dx < p.dstW; ++dx) - ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge(src, p, dy, dx, weight, bias, params, dst), dst += p.dstC; - } 
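/* The dy loop above covers the unpadded middle rows (4x/2x/1x column unrolling
   plus a dedicated 48-channel path with fully preloaded weights); the loop
   below handles the padded bottom rows with the generic Edge kernel, mirroring
   the padded top rows processed before the middle band. */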
- for (; dy < p.dstH; ++dy) - for (size_t dx = 0; dx < p.dstW; ++dx) - ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge(src, p, dy, dx, weight, bias, params, dst), dst += p.dstC; - } - } - - template <::SimdConvolutionActivationType type> SynetConvolution32fDirectNhwc::ConvolutionBiasActivationPtr GetConvolutionBiasActivation(const ConvParam32f & p) - { - if (p.group == 1) - return ConvolutionDirectNhwcConvolutionBiasActivationDefault; - else if (p.IsDepthwise()) - { - if(p.IsKernel(3) && p.IsDilation(1)) - return ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3; - else - return ConvolutionDirectNhwcConvolutionBiasActivationDepthwise; - } - return NULL; - } - - SynetConvolution32fDirectNhwc::ConvolutionBiasActivationPtr SynetConvolution32fDirectNhwc::SetConvolutionBiasActivation() - { - const ConvParam32f & p = _param; - SynetConvolution32fDirectNhwc::ConvolutionBiasActivationPtr func = NULL; - if (p.dstC > HF && p.dstC != 24 && p.dstH >= p.padY + p.padH && p.dstW >= p.padX + p.padW) - { - switch (p.activation) - { - case ::SimdConvolutionActivationIdentity: func = GetConvolutionBiasActivation<::SimdConvolutionActivationIdentity>(p); break; - case ::SimdConvolutionActivationRelu: func = GetConvolutionBiasActivation<::SimdConvolutionActivationRelu>(p); break; - case ::SimdConvolutionActivationLeakyRelu: func = GetConvolutionBiasActivation<::SimdConvolutionActivationLeakyRelu>(p); break; - case ::SimdConvolutionActivationRestrictRange: func = GetConvolutionBiasActivation<::SimdConvolutionActivationRestrictRange>(p); break; - case ::SimdConvolutionActivationPrelu: func = GetConvolutionBiasActivation<::SimdConvolutionActivationPrelu>(p); break; - case ::SimdConvolutionActivationElu: func = GetConvolutionBiasActivation<::SimdConvolutionActivationElu>(p); break; - case ::SimdConvolutionActivationHswish: func = GetConvolutionBiasActivation<::SimdConvolutionActivationHswish>(p); break; - } - } - return func ? 
func : Avx2::SynetConvolution32fDirectNhwc::SetConvolutionBiasActivation(); - }; - - //--------------------------------------------------------------------- - - SynetConvolution32fNhwcDirect::SynetConvolution32fNhwcDirect(const ConvParam32f& p) - : Avx2::SynetConvolution32fNhwcDirect(p) - { - if (p.dstC <= Avx::F) - return; -#ifdef SIMD_SYNET_CONVOLUTION_NHWC_DIRECT_OLD - //_old.enable = true; - if (_old.enable) - { - if (Set2f(p, _old.convolution)) - OldSetAlgParam(F); - } - else -#endif - { - RunFuncs funcs; - for (size_t n = 2; n <= 3; ++n) - { - funcs.push_back(RunFunc(Ext() + "-" + ToStr(n))); - SetAlgParam(F, n, funcs.back().alg); - if (!SetRt(p, funcs.back().alg)) - return; - } - _run.Init(funcs); - } - } - - bool SynetConvolution32fNhwcDirect::SetRt(const ConvParam32f& p, AlgParam& a) - { - switch (a.microD) - { - case 2 * F: return Set2r(p, a); - case 3 * F: return Set3r(p, a); - default: - return false; - } - } - - //--------------------------------------------------------------------- - - void * SynetConvolution32fInit(size_t batch, const SimdConvolutionParameters * conv, SimdGemm32fNNPtr gemm) - { - ConvParam32f param(batch, conv, gemm); - if (!param.Valid()) - return NULL; - else if (Avx::SynetConvolution32fDepthwiseDotProduct::Preferable(param)) - return new Avx::SynetConvolution32fDepthwiseDotProduct(param); - else if (SynetConvolution32fWinograd::Preferable(param)) - return new SynetConvolution32fWinograd(param); - else if (SynetConvolution32fGemmNT::Preferable(param)) - return new SynetConvolution32fGemmNT(param); - else if (SynetConvolution32fDirectNchw::Preferable(param)) - return new Avx512f::SynetConvolution32fDirectNchw(param); - else if (SynetConvolution32fNhwcDirect::Preferable(param)) - return new SynetConvolution32fNhwcDirect(param); - else if (SynetConvolution32fDirectNhwc::Preferable(param)) - return new SynetConvolution32fDirectNhwc(param); - else - return new SynetConvolution32fGemmNN(param); - } - } -#endif//SIMD_AVX512F_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx512fSynetConvolution32fNhwcDirect2f.cpp b/src/3rd/Simd/Simd/SimdAvx512fSynetConvolution32fNhwcDirect2f.cpp deleted file mode 100644 index bcf52abc..00000000 --- a/src/3rd/Simd/Simd/SimdAvx512fSynetConvolution32fNhwcDirect2f.cpp +++ /dev/null @@ -1,1021 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdSynetConvolution32f.h" -#include "Simd/SimdSynetConvolution32fCommon.h" -#include "Simd/SimdCpu.h" - -namespace Simd -{ -#ifdef SIMD_AVX512F_ENABLE - namespace Avx512f - { - using AlgParam = SynetConvolution32fNhwcDirect::AlgParam; - - template void ConvolutionNhwcDirect_2x12(const float* src0, const ConvParam32f& p, - size_t kernelH, size_t kernelW, size_t srcC, const float* weight, const __m512* bias, const __m512* params, float* dst, const __mmask16 tails[2]) - { - __m512 d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, d60, d61, d70, d71, d80, d81, d90, d91, da0, da1, db0, db1, s0, w0, w1; - size_t dS = p.srcC * p.strideX, dW = DF * (p.kernelX - kernelW) * srcC, dY = p.srcW * p.srcC, dX = p.srcC, dD = p.dstC; - const float* src1 = src0 + 1 * dS; - const float* src2 = src0 + 2 * dS; - const float* src3 = src0 + 3 * dS; - const float* src4 = src0 + 4 * dS; - const float* src5 = src0 + 5 * dS; - if (tails[1]) - { - d00 = _mm512_setzero_ps(); d01 = _mm512_setzero_ps(); - d10 = _mm512_setzero_ps(); d11 = _mm512_setzero_ps(); - d20 = _mm512_setzero_ps(); d21 = _mm512_setzero_ps(); - d30 = _mm512_setzero_ps(); d31 = _mm512_setzero_ps(); - d40 = _mm512_setzero_ps(); d41 = _mm512_setzero_ps(); - d50 = _mm512_setzero_ps(); d51 = _mm512_setzero_ps(); - d60 = _mm512_setzero_ps(); d61 = _mm512_setzero_ps(); - d70 = _mm512_setzero_ps(); d71 = _mm512_setzero_ps(); - d80 = _mm512_setzero_ps(); d81 = _mm512_setzero_ps(); - d90 = _mm512_setzero_ps(); d91 = _mm512_setzero_ps(); - da0 = _mm512_setzero_ps(); da1 = _mm512_setzero_ps(); - db0 = _mm512_setzero_ps(); db1 = _mm512_setzero_ps(); - for (size_t ky = 0; ky < kernelH; ++ky) - { - for (size_t kx = 0; kx < kernelW; ++kx) - { - for (size_t offset0 = ky * dY + kx * dX, offset6 = offset0 + 6 * dS, end0 = offset0 + srcC; offset0 < end0; ++offset0, ++offset6) - { - w0 = _mm512_loadu_ps(weight + 0); - w1 = _mm512_loadu_ps(weight + F); - s0 = _mm512_set1_ps(src0[offset0]); - d00 = _mm512_fmadd_ps(s0, w0, d00); - d01 = _mm512_fmadd_ps(s0, w1, d01); - s0 = _mm512_set1_ps(src1[offset0]); - d10 = _mm512_fmadd_ps(s0, w0, d10); - d11 = _mm512_fmadd_ps(s0, w1, d11); - s0 = _mm512_set1_ps(src2[offset0]); - d20 = _mm512_fmadd_ps(s0, w0, d20); - d21 = _mm512_fmadd_ps(s0, w1, d21); - s0 = _mm512_set1_ps(src3[offset0]); - d30 = _mm512_fmadd_ps(s0, w0, d30); - d31 = _mm512_fmadd_ps(s0, w1, d31); - s0 = _mm512_set1_ps(src4[offset0]); - d40 = _mm512_fmadd_ps(s0, w0, d40); - d41 = _mm512_fmadd_ps(s0, w1, d41); - s0 = _mm512_set1_ps(src5[offset0]); - d50 = _mm512_fmadd_ps(s0, w0, d50); - d51 = _mm512_fmadd_ps(s0, w1, d51); - s0 = _mm512_set1_ps(src0[offset6]); - d60 = _mm512_fmadd_ps(s0, w0, d60); - d61 = _mm512_fmadd_ps(s0, w1, d61); - s0 = _mm512_set1_ps(src1[offset6]); - d70 = _mm512_fmadd_ps(s0, w0, d70); - d71 = _mm512_fmadd_ps(s0, w1, d71); - s0 = _mm512_set1_ps(src2[offset6]); - d80 = _mm512_fmadd_ps(s0, w0, d80); - d81 = _mm512_fmadd_ps(s0, w1, d81); - s0 = _mm512_set1_ps(src3[offset6]); - d90 = _mm512_fmadd_ps(s0, w0, d90); - d91 = _mm512_fmadd_ps(s0, w1, d91); - s0 = _mm512_set1_ps(src4[offset6]); - da0 = _mm512_fmadd_ps(s0, w0, da0); - da1 = _mm512_fmadd_ps(s0, w1, da1); - s0 = _mm512_set1_ps(src5[offset6]); - db0 = _mm512_fmadd_ps(s0, w0, db0); - db1 = _mm512_fmadd_ps(s0, w1, db1); - weight += DF; - } - } - weight += dW; - } - Term::template Save(dst + 0, d00, bias, params); - Term::template Save(dst + F, d01, bias, params, tails[1]); - dst += dD; - Term::template Save(dst + 0, d10, bias, params); - Term::template Save(dst + F, 
d11, bias, params, tails[1]); - dst += dD; - Term::template Save(dst + 0, d20, bias, params); - Term::template Save(dst + F, d21, bias, params, tails[1]); - dst += dD; - Term::template Save(dst + 0, d30, bias, params); - Term::template Save(dst + F, d31, bias, params, tails[1]); - dst += dD; - Term::template Save(dst + 0, d40, bias, params); - Term::template Save(dst + F, d41, bias, params, tails[1]); - dst += dD; - Term::template Save(dst + 0, d50, bias, params); - Term::template Save(dst + F, d51, bias, params, tails[1]); - dst += dD; - Term::template Save(dst + 0, d60, bias, params); - Term::template Save(dst + F, d61, bias, params, tails[1]); - dst += dD; - Term::template Save(dst + 0, d70, bias, params); - Term::template Save(dst + F, d71, bias, params, tails[1]); - dst += dD; - Term::template Save(dst + 0, d80, bias, params); - Term::template Save(dst + F, d81, bias, params, tails[1]); - dst += dD; - Term::template Save(dst + 0, d90, bias, params); - Term::template Save(dst + F, d91, bias, params, tails[1]); - dst += dD; - Term::template Save(dst + 0, da0, bias, params); - Term::template Save(dst + F, da1, bias, params, tails[1]); - dst += dD; - Term::template Save(dst + 0, db0, bias, params); - Term::template Save(dst + F, db1, bias, params, tails[1]); - } - else - { - d00 = _mm512_setzero_ps(); - d10 = _mm512_setzero_ps(); - d20 = _mm512_setzero_ps(); - d30 = _mm512_setzero_ps(); - d40 = _mm512_setzero_ps(); - d50 = _mm512_setzero_ps(); - d60 = _mm512_setzero_ps(); - d70 = _mm512_setzero_ps(); - d80 = _mm512_setzero_ps(); - d90 = _mm512_setzero_ps(); - da0 = _mm512_setzero_ps(); - db0 = _mm512_setzero_ps(); - for (size_t ky = 0; ky < kernelH; ++ky) - { - for (size_t kx = 0; kx < kernelW; ++kx) - { - for (size_t offset0 = ky * dY + kx * dX, offset6 = offset0 + 6 * dS, end0 = offset0 + srcC; offset0 < end0; ++offset0, ++offset6) - { - w0 = _mm512_loadu_ps(weight + 0); - s0 = _mm512_set1_ps(src0[offset0]); - d00 = _mm512_fmadd_ps(s0, w0, d00); - s0 = _mm512_set1_ps(src1[offset0]); - d10 = _mm512_fmadd_ps(s0, w0, d10); - s0 = _mm512_set1_ps(src2[offset0]); - d20 = _mm512_fmadd_ps(s0, w0, d20); - s0 = _mm512_set1_ps(src3[offset0]); - d30 = _mm512_fmadd_ps(s0, w0, d30); - s0 = _mm512_set1_ps(src4[offset0]); - d40 = _mm512_fmadd_ps(s0, w0, d40); - s0 = _mm512_set1_ps(src5[offset0]); - d50 = _mm512_fmadd_ps(s0, w0, d50); - s0 = _mm512_set1_ps(src0[offset6]); - d60 = _mm512_fmadd_ps(s0, w0, d60); - s0 = _mm512_set1_ps(src1[offset6]); - d70 = _mm512_fmadd_ps(s0, w0, d70); - s0 = _mm512_set1_ps(src2[offset6]); - d80 = _mm512_fmadd_ps(s0, w0, d80); - s0 = _mm512_set1_ps(src3[offset6]); - d90 = _mm512_fmadd_ps(s0, w0, d90); - s0 = _mm512_set1_ps(src4[offset6]); - da0 = _mm512_fmadd_ps(s0, w0, da0); - s0 = _mm512_set1_ps(src5[offset6]); - db0 = _mm512_fmadd_ps(s0, w0, db0); - weight += DF; - } - } - weight += dW; - } - Term::template Save(dst + 0, d00, bias, params, tails[0]); - dst += dD; - Term::template Save(dst + 0, d10, bias, params, tails[0]); - dst += dD; - Term::template Save(dst + 0, d20, bias, params, tails[0]); - dst += dD; - Term::template Save(dst + 0, d30, bias, params, tails[0]); - dst += dD; - Term::template Save(dst + 0, d40, bias, params, tails[0]); - dst += dD; - Term::template Save(dst + 0, d50, bias, params, tails[0]); - dst += dD; - Term::template Save(dst + 0, d60, bias, params, tails[0]); - dst += dD; - Term::template Save(dst + 0, d70, bias, params, tails[0]); - dst += dD; - Term::template Save(dst + 0, d80, bias, params, tails[0]); - dst += dD; - Term::template Save(dst 
+ 0, d90, bias, params, tails[0]); - dst += dD; - Term::template Save(dst + 0, da0, bias, params, tails[0]); - dst += dD; - Term::template Save(dst + 0, db0, bias, params, tails[0]); - } - } - - template void ConvolutionNhwcDirect_2x6(const float* src0, const ConvParam32f& p, - size_t kernelH, size_t kernelW, size_t srcC, const float* weight, const __m512* bias, const __m512* params, float* dst, const __mmask16 tails[2]) - { - __m512 d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, s0, w0, w1; - size_t dS = p.srcC * p.strideX, dW = DF * (p.kernelX - kernelW) * srcC, dY = p.srcW * p.srcC, dX = p.srcC, dD = p.dstC; - const float* src1 = src0 + 1 * dS; - const float* src2 = src0 + 2 * dS; - const float* src3 = src0 + 3 * dS; - const float* src4 = src0 + 4 * dS; - const float* src5 = src0 + 5 * dS; - if (tails[1]) - { - d00 = _mm512_setzero_ps(); d01 = _mm512_setzero_ps(); - d10 = _mm512_setzero_ps(); d11 = _mm512_setzero_ps(); - d20 = _mm512_setzero_ps(); d21 = _mm512_setzero_ps(); - d30 = _mm512_setzero_ps(); d31 = _mm512_setzero_ps(); - d40 = _mm512_setzero_ps(); d41 = _mm512_setzero_ps(); - d50 = _mm512_setzero_ps(); d51 = _mm512_setzero_ps(); - for (size_t ky = 0; ky < kernelH; ++ky) - { - for (size_t kx = 0; kx < kernelW; ++kx) - { - for (size_t offset = ky * dY + kx * dX, end = offset + srcC; offset < end; ++offset) - { - w0 = _mm512_loadu_ps(weight + 0); - w1 = _mm512_loadu_ps(weight + F); - s0 = _mm512_set1_ps(src0[offset]); - d00 = _mm512_fmadd_ps(s0, w0, d00); - d01 = _mm512_fmadd_ps(s0, w1, d01); - s0 = _mm512_set1_ps(src1[offset]); - d10 = _mm512_fmadd_ps(s0, w0, d10); - d11 = _mm512_fmadd_ps(s0, w1, d11); - s0 = _mm512_set1_ps(src2[offset]); - d20 = _mm512_fmadd_ps(s0, w0, d20); - d21 = _mm512_fmadd_ps(s0, w1, d21); - s0 = _mm512_set1_ps(src3[offset]); - d30 = _mm512_fmadd_ps(s0, w0, d30); - d31 = _mm512_fmadd_ps(s0, w1, d31); - s0 = _mm512_set1_ps(src4[offset]); - d40 = _mm512_fmadd_ps(s0, w0, d40); - d41 = _mm512_fmadd_ps(s0, w1, d41); - s0 = _mm512_set1_ps(src5[offset]); - d50 = _mm512_fmadd_ps(s0, w0, d50); - d51 = _mm512_fmadd_ps(s0, w1, d51); - weight += DF; - } - } - weight += dW; - } - Term::template Save(dst + 0, d00, bias, params); - Term::template Save(dst + F, d01, bias, params, tails[1]); - dst += dD; - Term::template Save(dst + 0, d10, bias, params); - Term::template Save(dst + F, d11, bias, params, tails[1]); - dst += dD; - Term::template Save(dst + 0, d20, bias, params); - Term::template Save(dst + F, d21, bias, params, tails[1]); - dst += dD; - Term::template Save(dst + 0, d30, bias, params); - Term::template Save(dst + F, d31, bias, params, tails[1]); - dst += dD; - Term::template Save(dst + 0, d40, bias, params); - Term::template Save(dst + F, d41, bias, params, tails[1]); - dst += dD; - Term::template Save(dst + 0, d50, bias, params); - Term::template Save(dst + F, d51, bias, params, tails[1]); - } - else - { - d00 = _mm512_setzero_ps(); - d10 = _mm512_setzero_ps(); - d20 = _mm512_setzero_ps(); - d30 = _mm512_setzero_ps(); - d40 = _mm512_setzero_ps(); - d50 = _mm512_setzero_ps(); - for (size_t ky = 0; ky < kernelH; ++ky) - { - for (size_t kx = 0; kx < kernelW; ++kx) - { - for (size_t offset = ky * dY + kx * dX, end = offset + srcC; offset < end; ++offset) - { - w0 = _mm512_loadu_ps(weight + 0); - s0 = _mm512_set1_ps(src0[offset]); - d00 = _mm512_fmadd_ps(s0, w0, d00); - s0 = _mm512_set1_ps(src1[offset]); - d10 = _mm512_fmadd_ps(s0, w0, d10); - s0 = _mm512_set1_ps(src2[offset]); - d20 = _mm512_fmadd_ps(s0, w0, d20); - s0 = 
_mm512_set1_ps(src3[offset]); - d30 = _mm512_fmadd_ps(s0, w0, d30); - s0 = _mm512_set1_ps(src4[offset]); - d40 = _mm512_fmadd_ps(s0, w0, d40); - s0 = _mm512_set1_ps(src5[offset]); - d50 = _mm512_fmadd_ps(s0, w0, d50); - weight += DF; - } - } - weight += dW; - } - Term::template Save(dst + 0, d00, bias, params, tails[0]); - dst += dD; - Term::template Save(dst + 0, d10, bias, params, tails[0]); - dst += dD; - Term::template Save(dst + 0, d20, bias, params, tails[0]); - dst += dD; - Term::template Save(dst + 0, d30, bias, params, tails[0]); - dst += dD; - Term::template Save(dst + 0, d40, bias, params, tails[0]); - dst += dD; - Term::template Save(dst + 0, d50, bias, params, tails[0]); - } - } - - template void ConvolutionNhwcDirect_2x3(const float* src0, const ConvParam32f& p, - size_t kernelH, size_t kernelW, size_t srcC, const float* weight, const __m512* bias, const __m512* params, float* dst, const __mmask16 tails[2]) - { - __m512 d00, d01, d10, d11, d20, d21, s0, w0, w1; - size_t dS = p.srcC * p.strideX, dW = DF * (p.kernelX - kernelW) * srcC, dY = p.srcW * p.srcC, dX = p.srcC, dD = p.dstC; - const float* src1 = src0 + 1 * dS; - const float* src2 = src0 + 2 * dS; - if (tails[1]) - { - d00 = _mm512_setzero_ps(); d01 = _mm512_setzero_ps(); - d10 = _mm512_setzero_ps(); d11 = _mm512_setzero_ps(); - d20 = _mm512_setzero_ps(); d21 = _mm512_setzero_ps(); - for (size_t ky = 0; ky < kernelH; ++ky) - { - for (size_t kx = 0; kx < kernelW; ++kx) - { - for (size_t offset = ky * dY + kx * dX, end = offset + srcC; offset < end; ++offset) - { - w0 = _mm512_loadu_ps(weight + 0); - w1 = _mm512_loadu_ps(weight + F); - s0 = _mm512_set1_ps(src0[offset]); - d00 = _mm512_fmadd_ps(s0, w0, d00); - d01 = _mm512_fmadd_ps(s0, w1, d01); - s0 = _mm512_set1_ps(src1[offset]); - d10 = _mm512_fmadd_ps(s0, w0, d10); - d11 = _mm512_fmadd_ps(s0, w1, d11); - s0 = _mm512_set1_ps(src2[offset]); - d20 = _mm512_fmadd_ps(s0, w0, d20); - d21 = _mm512_fmadd_ps(s0, w1, d21); - weight += DF; - } - } - weight += dW; - } - Term::template Save(dst + 0, d00, bias, params); - Term::template Save(dst + F, d01, bias, params, tails[1]); - dst += dD; - Term::template Save(dst + 0, d10, bias, params); - Term::template Save(dst + F, d11, bias, params, tails[1]); - dst += dD; - Term::template Save(dst + 0, d20, bias, params); - Term::template Save(dst + F, d21, bias, params, tails[1]); - } - else - { - d00 = _mm512_setzero_ps(); - d10 = _mm512_setzero_ps(); - d20 = _mm512_setzero_ps(); - for (size_t ky = 0; ky < kernelH; ++ky) - { - for (size_t kx = 0; kx < kernelW; ++kx) - { - for (size_t offset = ky * dY + kx * dX, end = offset + srcC; offset < end; ++offset) - { - w0 = _mm512_loadu_ps(weight + 0); - s0 = _mm512_set1_ps(src0[offset]); - d00 = _mm512_fmadd_ps(s0, w0, d00); - s0 = _mm512_set1_ps(src1[offset]); - d10 = _mm512_fmadd_ps(s0, w0, d10); - s0 = _mm512_set1_ps(src2[offset]); - d20 = _mm512_fmadd_ps(s0, w0, d20); - weight += DF; - } - } - weight += dW; - } - Term::template Save(dst + 0, d00, bias, params, tails[0]); - dst += dD; - Term::template Save(dst + 0, d10, bias, params, tails[0]); - dst += dD; - Term::template Save(dst + 0, d20, bias, params, tails[0]); - } - } - - template void ConvolutionNhwcDirect_2x1(const float* src0, const ConvParam32f& p, - size_t kernelH, size_t kernelW, size_t srcC, const float* weight, const __m512* bias, const __m512* params, float* dst, const __mmask16 tails[2]) - { - __m512 d00, d01, s0, w0, w1; - size_t dW = DF * (p.kernelX - kernelW) * srcC, dY = p.srcW * p.srcC, dX = p.srcC; - if (tails[1]) - { 
- d00 = _mm512_setzero_ps(); - d01 = _mm512_setzero_ps(); - for (size_t ky = 0; ky < kernelH; ++ky) - { - for (size_t kx = 0; kx < kernelW; ++kx) - { - for (size_t offset = ky * dY + kx * dX, end = offset + srcC; offset < end; ++offset) - { - w0 = _mm512_loadu_ps(weight + 0); - w1 = _mm512_loadu_ps(weight + F); - s0 = _mm512_set1_ps(src0[offset]); - d00 = _mm512_fmadd_ps(s0, w0, d00); - d01 = _mm512_fmadd_ps(s0, w1, d01); - weight += DF; - } - } - weight += dW; - } - Term::template Save(dst + 0, d00, bias, params); - Term::template Save(dst + F, d01, bias, params, tails[1]); - } - else - { - d00 = _mm512_setzero_ps(); - for (size_t ky = 0; ky < kernelH; ++ky) - { - for (size_t kx = 0; kx < kernelW; ++kx) - { - for (size_t offset = ky * dY + kx * dX, end = offset + srcC; offset < end; ++offset) - { - w0 = _mm512_loadu_ps(weight + 0); - s0 = _mm512_set1_ps(src0[offset]); - d00 = _mm512_fmadd_ps(s0, w0, d00); - weight += DF; - } - } - weight += dW; - } - Term::template Save(dst + 0, d00, bias, params, tails[0]); - } - } - - template void ConvolutionNhwcDirect_2(const float* src, const ConvParam32f& p, - size_t dstC, size_t yBeg, size_t yEnd, size_t srcC, const float* weight, const float* bias, const float* params, float* dst) - { - size_t noseH = p.padY, noseW = p.padX; - size_t bodyH = p.srcH - p.kernelY + 1 + noseH, bodyW = p.srcW - p.kernelX + 1 + noseW; - size_t bodyW3 = bodyW < noseW ? 0 : AlignLoAny(bodyW - noseW, 3 * p.strideX) + noseW; - size_t bodyW6 = bodyW < noseW ? 0 : AlignLoAny(bodyW - noseW, 6 * p.strideX) + noseW; - size_t bodyW12 = 0;// bodyW < noseW ? 0 : AlignLoAny(bodyW - noseW, 12 * p.strideX) + noseW; - size_t tailH = bodyH + p.padH, tailW = bodyW + p.padW; - size_t kY = p.kernelY - noseH, kX = p.kernelX - noseW, kH = bodyH + p.kernelY - 1, kW = bodyW + p.kernelX - 1; - - __m512 _params[2], _bias[2]; - _params[0] = _mm512_set1_ps(params[0]); - if (type == ::SimdConvolutionActivationRestrictRange || type == ::SimdConvolutionActivationHswish) - _params[1] = _mm512_set1_ps(params[1]); - - for (size_t dc = 0; dc < dstC; dc += DF) - { - size_t tail = Simd::Min(DF, dstC - dc); - __mmask16 tails[2] = { TailMask16(tail), TailMask16(tail - F) }; - _bias[0] = _mm512_loadu_ps(bias + dc + 0); - _bias[1] = _mm512_loadu_ps(bias + dc + F); - if (type == ::SimdConvolutionActivationPrelu) - { - _params[0] = _mm512_loadu_ps(params + dc + 0); - _params[1] = _mm512_loadu_ps(params + dc + F); - } - float* d = dst + dc + yBeg * p.dstW * p.dstC; - size_t dy = yBeg, sy = dy * p.strideY; - for (; sy < noseH && dy < yEnd; sy += p.strideY, dy++) - { - size_t sx = 0; - const float* s = src; - const float* w = weight + (noseH - sy) * p.kernelX * srcC * DF; - for (; sx < noseW; sx += p.strideX, d += p.dstC) - ConvolutionNhwcDirect_2x1(s, p, kY + sy, kX + sx, srcC, w + (noseW - sx) * srcC * DF, _bias, _params, d, tails); - for (; sx < bodyW12; sx += 12 * p.strideX, d += 12 * p.dstC) - ConvolutionNhwcDirect_2x12(s + (sx - noseW) * p.srcC, p, kY + sy, p.kernelX, srcC, w, _bias, _params, d, tails); - for (; sx < bodyW6; sx += 6 * p.strideX, d += 6 * p.dstC) - ConvolutionNhwcDirect_2x6(s + (sx - noseW) * p.srcC, p, kY + sy, p.kernelX, srcC, w, _bias, _params, d, tails); - for (; sx < bodyW3; sx += 3 * p.strideX, d += 3 * p.dstC) - ConvolutionNhwcDirect_2x3(s + (sx - noseW) * p.srcC, p, kY + sy, p.kernelX, srcC, w, _bias, _params, d, tails); - for (; sx < bodyW; sx += p.strideX, d += p.dstC) - ConvolutionNhwcDirect_2x1(s + (sx - noseW) * p.srcC, p, kY + sy, p.kernelX, srcC, w, _bias, _params, d, tails); - 
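/* Within each output row the sx loop splits into nose (left padding, kernel
   truncated via kX + sx), body (full kernel, unrolled 6/3/1 columns wide; the
   12-wide variant is compiled but disabled via bodyW12 = 0) and tail (right
   padding, kernel truncated via kW - sx). Worked example: with kernelX = 3,
   padX = padW = 1, strideX = 1, srcW = 8 the formulas above give noseW = 1,
   bodyW = 7, tailW = 8, kW = 9, so the last column sx = 7 uses kW - sx = 2
   kernel columns. */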
for (; sx < tailW; sx += p.strideX, d += p.dstC) - ConvolutionNhwcDirect_2x1(s + (sx - noseW) * p.srcC, p, kY + sy, kW - sx, srcC, w, _bias, _params, d, tails); - } - for (; sy < bodyH && dy < yEnd; sy += p.strideY, dy++) - { - size_t sx = 0; - const float* s = src + (sy - noseH) * p.srcW * p.srcC; - const float* w = weight; - for (; sx < noseW; sx += p.strideX, d += p.dstC) - ConvolutionNhwcDirect_2x1(s, p, p.kernelY, kX + sx, srcC, w + (noseW - sx) * srcC * DF, _bias, _params, d, tails); - for (; sx < bodyW12; sx += 12 * p.strideX, d += 12 * p.dstC) - ConvolutionNhwcDirect_2x12(s + (sx - noseW) * p.srcC, p, p.kernelY, p.kernelX, srcC, w, _bias, _params, d, tails); - for (; sx < bodyW6; sx += 6 * p.strideX, d += 6 * p.dstC) - ConvolutionNhwcDirect_2x6(s + (sx - noseW) * p.srcC, p, p.kernelY, p.kernelX, srcC, w, _bias, _params, d, tails); - for (; sx < bodyW3; sx += 3 * p.strideX, d += 3 * p.dstC) - ConvolutionNhwcDirect_2x3(s + (sx - noseW) * p.srcC, p, p.kernelY, p.kernelX, srcC, w, _bias, _params, d, tails); - for (; sx < bodyW; sx += p.strideX, d += p.dstC) - ConvolutionNhwcDirect_2x1(s + (sx - noseW) * p.srcC, p, p.kernelY, p.kernelX, srcC, w, _bias, _params, d, tails); - for (; sx < tailW; sx += p.strideX, d += p.dstC) - ConvolutionNhwcDirect_2x1(s + (sx - noseW) * p.srcC, p, p.kernelY, kW - sx, srcC, w, _bias, _params, d, tails); - } - for (; sy < tailH && dy < yEnd; sy += p.strideY, dy++) - { - size_t sx = 0; - const float* s = src + (sy - noseH) * p.srcW * p.srcC; - const float* w = weight; - for (; sx < noseW; sx += p.strideX, d += p.dstC) - ConvolutionNhwcDirect_2x1(s, p, kH - sy, kX + sx, srcC, w + (noseW - sx) * srcC * DF, _bias, _params, d, tails); - for (; sx < bodyW12; sx += 12 * p.strideX, d += 12 * p.dstC) - ConvolutionNhwcDirect_2x12(s + (sx - noseW) * p.srcC, p, kH - sy, p.kernelX, srcC, w, _bias, _params, d, tails); - for (; sx < bodyW6; sx += 6 * p.strideX, d += 6 * p.dstC) - ConvolutionNhwcDirect_2x6(s + (sx - noseW) * p.srcC, p, kH - sy, p.kernelX, srcC, w, _bias, _params, d, tails); - for (; sx < bodyW3; sx += 3 * p.strideX, d += 3 * p.dstC) - ConvolutionNhwcDirect_2x3(s + (sx - noseW) * p.srcC, p, kH - sy, p.kernelX, srcC, w, _bias, _params, d, tails); - for (; sx < bodyW; sx += p.strideX, d += p.dstC) - ConvolutionNhwcDirect_2x1(s + (sx - noseW) * p.srcC, p, kH - sy, p.kernelX, srcC, w, _bias, _params, d, tails); - for (; sx < tailW; sx += p.strideX, d += p.dstC) - ConvolutionNhwcDirect_2x1(s + (sx - noseW) * p.srcC, p, kH - sy, kW - sx, srcC, w, _bias, _params, d, tails); - } - weight += p.kernelY * p.kernelX * srcC * DF; - } - } - - template void ConvolutionNhwcDirect_2(const float* src, const ConvParam32f& p, - const SynetConvolution32fNhwcDirect::AlgParam& a, const float* weight, const float* bias, const float* params, float* dst) - { - for (size_t dc = 0; dc < p.dstC; dc += a.macroD) - { - size_t macroD = Simd::Min(p.dstC, dc + a.macroD) - dc; - for (size_t sc = 0; sc < p.srcC; sc += a.macroC) - { - size_t macroC = Simd::Min(p.srcC, sc + a.macroC) - sc; - size_t macroK = p.kernelY * p.kernelX * macroC; - for (size_t yBeg = 0; yBeg < p.dstH;) - { - size_t yEnd = Simd::Min(yBeg + a.macroH, p.dstH); - if (a.macroC == p.srcC) - ConvolutionNhwcDirect_2(src + sc, p, macroD, yBeg, yEnd, macroC, weight, bias + dc, params, dst + dc); - else if (sc == 0) - ConvolutionNhwcDirect_2(src + sc, p, macroD, yBeg, yEnd, macroC, weight, bias + dc, params, dst + dc); - else if (sc + macroC == p.srcC) - ConvolutionNhwcDirect_2(src + sc, p, macroD, yBeg, yEnd, macroC, weight, 
bias + dc, params, dst + dc); - else - ConvolutionNhwcDirect_2(src + sc, p, macroD, yBeg, yEnd, macroC, weight, bias + dc, params, dst + dc); - yBeg = yEnd; - } - weight += AlignHiAny(macroD, a.microD) * macroK; - } - if (type == ::SimdConvolutionActivationPrelu) - params += macroD; - } - } - - //--------------------------------------------------------------------- - - template void ConvolutionNhwcDirect1x1_2x12(const float* src0, const ConvParam32f& p, - size_t srcC, const float* weight, const __m512* bias, const __m512* params, float* dst, const __mmask16 tails[2]) - { - __m512 d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, d60, d61, d70, d71, d80, d81, d90, d91, da0, da1, db0, db1, s0, w0, w1; - size_t dS = p.srcC, dD = p.dstC; - const float* src1 = src0 + 1 * dS; - const float* src2 = src0 + 2 * dS; - const float* src3 = src0 + 3 * dS; - const float* src4 = src0 + 4 * dS; - const float* src5 = src0 + 5 * dS; - if (tails[1]) - { - d00 = _mm512_setzero_ps(); d01 = _mm512_setzero_ps(); - d10 = _mm512_setzero_ps(); d11 = _mm512_setzero_ps(); - d20 = _mm512_setzero_ps(); d21 = _mm512_setzero_ps(); - d30 = _mm512_setzero_ps(); d31 = _mm512_setzero_ps(); - d40 = _mm512_setzero_ps(); d41 = _mm512_setzero_ps(); - d50 = _mm512_setzero_ps(); d51 = _mm512_setzero_ps(); - d60 = _mm512_setzero_ps(); d61 = _mm512_setzero_ps(); - d70 = _mm512_setzero_ps(); d71 = _mm512_setzero_ps(); - d80 = _mm512_setzero_ps(); d81 = _mm512_setzero_ps(); - d90 = _mm512_setzero_ps(); d91 = _mm512_setzero_ps(); - da0 = _mm512_setzero_ps(); da1 = _mm512_setzero_ps(); - db0 = _mm512_setzero_ps(); db1 = _mm512_setzero_ps(); - for (size_t offset0 = 0, offset6 = 6 * dS; offset0 < srcC; ++offset0, ++offset6) - { - w0 = _mm512_loadu_ps(weight + 0); - w1 = _mm512_loadu_ps(weight + F); - s0 = _mm512_set1_ps(src0[offset0]); - d00 = _mm512_fmadd_ps(s0, w0, d00); - d01 = _mm512_fmadd_ps(s0, w1, d01); - s0 = _mm512_set1_ps(src1[offset0]); - d10 = _mm512_fmadd_ps(s0, w0, d10); - d11 = _mm512_fmadd_ps(s0, w1, d11); - s0 = _mm512_set1_ps(src2[offset0]); - d20 = _mm512_fmadd_ps(s0, w0, d20); - d21 = _mm512_fmadd_ps(s0, w1, d21); - s0 = _mm512_set1_ps(src3[offset0]); - d30 = _mm512_fmadd_ps(s0, w0, d30); - d31 = _mm512_fmadd_ps(s0, w1, d31); - s0 = _mm512_set1_ps(src4[offset0]); - d40 = _mm512_fmadd_ps(s0, w0, d40); - d41 = _mm512_fmadd_ps(s0, w1, d41); - s0 = _mm512_set1_ps(src5[offset0]); - d50 = _mm512_fmadd_ps(s0, w0, d50); - d51 = _mm512_fmadd_ps(s0, w1, d51); - s0 = _mm512_set1_ps(src0[offset6]); - d60 = _mm512_fmadd_ps(s0, w0, d60); - d61 = _mm512_fmadd_ps(s0, w1, d61); - s0 = _mm512_set1_ps(src1[offset6]); - d70 = _mm512_fmadd_ps(s0, w0, d70); - d71 = _mm512_fmadd_ps(s0, w1, d71); - s0 = _mm512_set1_ps(src2[offset6]); - d80 = _mm512_fmadd_ps(s0, w0, d80); - d81 = _mm512_fmadd_ps(s0, w1, d81); - s0 = _mm512_set1_ps(src3[offset6]); - d90 = _mm512_fmadd_ps(s0, w0, d90); - d91 = _mm512_fmadd_ps(s0, w1, d91); - s0 = _mm512_set1_ps(src4[offset6]); - da0 = _mm512_fmadd_ps(s0, w0, da0); - da1 = _mm512_fmadd_ps(s0, w1, da1); - s0 = _mm512_set1_ps(src5[offset6]); - db0 = _mm512_fmadd_ps(s0, w0, db0); - db1 = _mm512_fmadd_ps(s0, w1, db1); - weight += DF; - } - Term::template Save(dst + 0, d00, bias, params); - Term::template Save(dst + F, d01, bias, params, tails[1]); - dst += dD; - Term::template Save(dst + 0, d10, bias, params); - Term::template Save(dst + F, d11, bias, params, tails[1]); - dst += dD; - Term::template Save(dst + 0, d20, bias, params); - Term::template Save(dst + F, d21, bias, params, tails[1]); - dst += dD; - 
Term::template Save(dst + 0, d30, bias, params); - Term::template Save(dst + F, d31, bias, params, tails[1]); - dst += dD; - Term::template Save(dst + 0, d40, bias, params); - Term::template Save(dst + F, d41, bias, params, tails[1]); - dst += dD; - Term::template Save(dst + 0, d50, bias, params); - Term::template Save(dst + F, d51, bias, params, tails[1]); - dst += dD; - Term::template Save(dst + 0, d60, bias, params); - Term::template Save(dst + F, d61, bias, params, tails[1]); - dst += dD; - Term::template Save(dst + 0, d70, bias, params); - Term::template Save(dst + F, d71, bias, params, tails[1]); - dst += dD; - Term::template Save(dst + 0, d80, bias, params); - Term::template Save(dst + F, d81, bias, params, tails[1]); - dst += dD; - Term::template Save(dst + 0, d90, bias, params); - Term::template Save(dst + F, d91, bias, params, tails[1]); - dst += dD; - Term::template Save(dst + 0, da0, bias, params); - Term::template Save(dst + F, da1, bias, params, tails[1]); - dst += dD; - Term::template Save(dst + 0, db0, bias, params); - Term::template Save(dst + F, db1, bias, params, tails[1]); - } - else - { - d00 = _mm512_setzero_ps(); - d10 = _mm512_setzero_ps(); - d20 = _mm512_setzero_ps(); - d30 = _mm512_setzero_ps(); - d40 = _mm512_setzero_ps(); - d50 = _mm512_setzero_ps(); - d60 = _mm512_setzero_ps(); - d70 = _mm512_setzero_ps(); - d80 = _mm512_setzero_ps(); - d90 = _mm512_setzero_ps(); - da0 = _mm512_setzero_ps(); - db0 = _mm512_setzero_ps(); - for (size_t offset0 = 0, offset6 = 6 * dS; offset0 < srcC; ++offset0, ++offset6) - { - w0 = _mm512_loadu_ps(weight + 0); - s0 = _mm512_set1_ps(src0[offset0]); - d00 = _mm512_fmadd_ps(s0, w0, d00); - s0 = _mm512_set1_ps(src1[offset0]); - d10 = _mm512_fmadd_ps(s0, w0, d10); - s0 = _mm512_set1_ps(src2[offset0]); - d20 = _mm512_fmadd_ps(s0, w0, d20); - s0 = _mm512_set1_ps(src3[offset0]); - d30 = _mm512_fmadd_ps(s0, w0, d30); - s0 = _mm512_set1_ps(src4[offset0]); - d40 = _mm512_fmadd_ps(s0, w0, d40); - s0 = _mm512_set1_ps(src5[offset0]); - d50 = _mm512_fmadd_ps(s0, w0, d50); - s0 = _mm512_set1_ps(src0[offset6]); - d60 = _mm512_fmadd_ps(s0, w0, d60); - s0 = _mm512_set1_ps(src1[offset6]); - d70 = _mm512_fmadd_ps(s0, w0, d70); - s0 = _mm512_set1_ps(src2[offset6]); - d80 = _mm512_fmadd_ps(s0, w0, d80); - s0 = _mm512_set1_ps(src3[offset6]); - d90 = _mm512_fmadd_ps(s0, w0, d90); - s0 = _mm512_set1_ps(src4[offset6]); - da0 = _mm512_fmadd_ps(s0, w0, da0); - s0 = _mm512_set1_ps(src5[offset6]); - db0 = _mm512_fmadd_ps(s0, w0, db0); - weight += DF; - } - Term::template Save(dst + 0, d00, bias, params, tails[0]); - dst += dD; - Term::template Save(dst + 0, d10, bias, params, tails[0]); - dst += dD; - Term::template Save(dst + 0, d20, bias, params, tails[0]); - dst += dD; - Term::template Save(dst + 0, d30, bias, params, tails[0]); - dst += dD; - Term::template Save(dst + 0, d40, bias, params, tails[0]); - dst += dD; - Term::template Save(dst + 0, d50, bias, params, tails[0]); - dst += dD; - Term::template Save(dst + 0, d60, bias, params, tails[0]); - dst += dD; - Term::template Save(dst + 0, d70, bias, params, tails[0]); - dst += dD; - Term::template Save(dst + 0, d80, bias, params, tails[0]); - dst += dD; - Term::template Save(dst + 0, d90, bias, params, tails[0]); - dst += dD; - Term::template Save(dst + 0, da0, bias, params, tails[0]); - dst += dD; - Term::template Save(dst + 0, db0, bias, params, tails[0]); - } - } - - template void ConvolutionNhwcDirect1x1_2x6(const float* src0, const ConvParam32f& p, - size_t srcC, const float* weight, const __m512* 
bias, const __m512* params, float* dst, const __mmask16 tails[2]) - { - __m512 d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, s0, w0, w1; - size_t dS = p.srcC, dD = p.dstC; - const float* src1 = src0 + 1 * dS; - const float* src2 = src0 + 2 * dS; - const float* src3 = src0 + 3 * dS; - const float* src4 = src0 + 4 * dS; - const float* src5 = src0 + 5 * dS; - if (tails[1]) - { - d00 = _mm512_setzero_ps(); d01 = _mm512_setzero_ps(); - d10 = _mm512_setzero_ps(); d11 = _mm512_setzero_ps(); - d20 = _mm512_setzero_ps(); d21 = _mm512_setzero_ps(); - d30 = _mm512_setzero_ps(); d31 = _mm512_setzero_ps(); - d40 = _mm512_setzero_ps(); d41 = _mm512_setzero_ps(); - d50 = _mm512_setzero_ps(); d51 = _mm512_setzero_ps(); - for (size_t offset = 0; offset < srcC; ++offset) - { - w0 = _mm512_loadu_ps(weight + 0); - w1 = _mm512_loadu_ps(weight + F); - s0 = _mm512_set1_ps(src0[offset]); - d00 = _mm512_fmadd_ps(s0, w0, d00); - d01 = _mm512_fmadd_ps(s0, w1, d01); - s0 = _mm512_set1_ps(src1[offset]); - d10 = _mm512_fmadd_ps(s0, w0, d10); - d11 = _mm512_fmadd_ps(s0, w1, d11); - s0 = _mm512_set1_ps(src2[offset]); - d20 = _mm512_fmadd_ps(s0, w0, d20); - d21 = _mm512_fmadd_ps(s0, w1, d21); - s0 = _mm512_set1_ps(src3[offset]); - d30 = _mm512_fmadd_ps(s0, w0, d30); - d31 = _mm512_fmadd_ps(s0, w1, d31); - s0 = _mm512_set1_ps(src4[offset]); - d40 = _mm512_fmadd_ps(s0, w0, d40); - d41 = _mm512_fmadd_ps(s0, w1, d41); - s0 = _mm512_set1_ps(src5[offset]); - d50 = _mm512_fmadd_ps(s0, w0, d50); - d51 = _mm512_fmadd_ps(s0, w1, d51); - weight += DF; - } - Term::template Save(dst + 0, d00, bias, params); - Term::template Save(dst + F, d01, bias, params, tails[1]); - dst += dD; - Term::template Save(dst + 0, d10, bias, params); - Term::template Save(dst + F, d11, bias, params, tails[1]); - dst += dD; - Term::template Save(dst + 0, d20, bias, params); - Term::template Save(dst + F, d21, bias, params, tails[1]); - dst += dD; - Term::template Save(dst + 0, d30, bias, params); - Term::template Save(dst + F, d31, bias, params, tails[1]); - dst += dD; - Term::template Save(dst + 0, d40, bias, params); - Term::template Save(dst + F, d41, bias, params, tails[1]); - dst += dD; - Term::template Save(dst + 0, d50, bias, params); - Term::template Save(dst + F, d51, bias, params, tails[1]); - } - else - { - d00 = _mm512_setzero_ps(); - d10 = _mm512_setzero_ps(); - d20 = _mm512_setzero_ps(); - d30 = _mm512_setzero_ps(); - d40 = _mm512_setzero_ps(); - d50 = _mm512_setzero_ps(); - for (size_t offset = 0; offset < srcC; ++offset) - { - w0 = _mm512_loadu_ps(weight + 0); - s0 = _mm512_set1_ps(src0[offset]); - d00 = _mm512_fmadd_ps(s0, w0, d00); - s0 = _mm512_set1_ps(src1[offset]); - d10 = _mm512_fmadd_ps(s0, w0, d10); - s0 = _mm512_set1_ps(src2[offset]); - d20 = _mm512_fmadd_ps(s0, w0, d20); - s0 = _mm512_set1_ps(src3[offset]); - d30 = _mm512_fmadd_ps(s0, w0, d30); - s0 = _mm512_set1_ps(src4[offset]); - d40 = _mm512_fmadd_ps(s0, w0, d40); - s0 = _mm512_set1_ps(src5[offset]); - d50 = _mm512_fmadd_ps(s0, w0, d50); - weight += DF; - } - Term::template Save(dst + 0, d00, bias, params, tails[0]); - dst += dD; - Term::template Save(dst + 0, d10, bias, params, tails[0]); - dst += dD; - Term::template Save(dst + 0, d20, bias, params, tails[0]); - dst += dD; - Term::template Save(dst + 0, d30, bias, params, tails[0]); - dst += dD; - Term::template Save(dst + 0, d40, bias, params, tails[0]); - dst += dD; - Term::template Save(dst + 0, d50, bias, params, tails[0]); - } - } - - template void ConvolutionNhwcDirect1x1_2xM(const float* src0, const 
ConvParam32f& p, - size_t srcC, const float* weight, const __m512* bias, const __m512* params, float* dst, const __mmask16 tails[2]) - { - __m512 d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, s0, w0, w1; - size_t dS = p.srcC, dD = p.dstC; - const float* src1 = src0 + 1 * dS; - const float* src2 = src0 + 2 * dS; - const float* src3 = src0 + 3 * dS; - const float* src4 = src0 + 4 * dS; - const float* src5 = src0 + 5 * dS; - if (tails[1]) - { - if (M > 0) d00 = _mm512_setzero_ps(), d01 = _mm512_setzero_ps(); - if (M > 1) d10 = _mm512_setzero_ps(), d11 = _mm512_setzero_ps(); - if (M > 2) d20 = _mm512_setzero_ps(), d21 = _mm512_setzero_ps(); - if (M > 3) d30 = _mm512_setzero_ps(), d31 = _mm512_setzero_ps(); - if (M > 4) d40 = _mm512_setzero_ps(), d41 = _mm512_setzero_ps(); - if (M > 5) d50 = _mm512_setzero_ps(), d51 = _mm512_setzero_ps(); - for (size_t offset = 0; offset < srcC; ++offset) - { - w0 = _mm512_loadu_ps(weight + 0); - w1 = _mm512_loadu_ps(weight + F); - if (M > 0) s0 = _mm512_set1_ps(src0[offset]), d00 = _mm512_fmadd_ps(s0, w0, d00), d01 = _mm512_fmadd_ps(s0, w1, d01); - if (M > 1) s0 = _mm512_set1_ps(src1[offset]), d10 = _mm512_fmadd_ps(s0, w0, d10), d11 = _mm512_fmadd_ps(s0, w1, d11); - if (M > 2) s0 = _mm512_set1_ps(src2[offset]), d20 = _mm512_fmadd_ps(s0, w0, d20), d21 = _mm512_fmadd_ps(s0, w1, d21); - if (M > 3) s0 = _mm512_set1_ps(src3[offset]), d30 = _mm512_fmadd_ps(s0, w0, d30), d31 = _mm512_fmadd_ps(s0, w1, d31); - if (M > 4) s0 = _mm512_set1_ps(src4[offset]), d40 = _mm512_fmadd_ps(s0, w0, d40), d41 = _mm512_fmadd_ps(s0, w1, d41); - if (M > 5) s0 = _mm512_set1_ps(src5[offset]), d50 = _mm512_fmadd_ps(s0, w0, d50), d51 = _mm512_fmadd_ps(s0, w1, d51); - weight += DF; - } - if (M > 0) Term::template Save(dst + 0, d00, bias, params), Term::template Save(dst + F, d01, bias, params, tails[1]), dst += dD; - if (M > 1) Term::template Save(dst + 0, d10, bias, params), Term::template Save(dst + F, d11, bias, params, tails[1]), dst += dD; - if (M > 2) Term::template Save(dst + 0, d20, bias, params), Term::template Save(dst + F, d21, bias, params, tails[1]), dst += dD; - if (M > 3) Term::template Save(dst + 0, d30, bias, params), Term::template Save(dst + F, d31, bias, params, tails[1]), dst += dD; - if (M > 4) Term::template Save(dst + 0, d40, bias, params), Term::template Save(dst + F, d41, bias, params, tails[1]), dst += dD; - if (M > 5) Term::template Save(dst + 0, d50, bias, params), Term::template Save(dst + F, d51, bias, params, tails[1]), dst += dD; - } - else - { - if (M > 0) d00 = _mm512_setzero_ps(); - if (M > 1) d10 = _mm512_setzero_ps(); - if (M > 2) d20 = _mm512_setzero_ps(); - if (M > 3) d30 = _mm512_setzero_ps(); - if (M > 4) d40 = _mm512_setzero_ps(); - if (M > 5) d50 = _mm512_setzero_ps(); - for (size_t offset = 0; offset < srcC; ++offset) - { - w0 = _mm512_loadu_ps(weight + 0); - if (M > 0) s0 = _mm512_set1_ps(src0[offset]), d00 = _mm512_fmadd_ps(s0, w0, d00); - if (M > 1) s0 = _mm512_set1_ps(src1[offset]), d10 = _mm512_fmadd_ps(s0, w0, d10); - if (M > 2) s0 = _mm512_set1_ps(src2[offset]), d20 = _mm512_fmadd_ps(s0, w0, d20); - if (M > 3) s0 = _mm512_set1_ps(src3[offset]), d30 = _mm512_fmadd_ps(s0, w0, d30); - if (M > 4) s0 = _mm512_set1_ps(src4[offset]), d40 = _mm512_fmadd_ps(s0, w0, d40); - if (M > 5) s0 = _mm512_set1_ps(src5[offset]), d50 = _mm512_fmadd_ps(s0, w0, d50); - weight += DF; - } - if (M > 0) Term::template Save(dst + 0, d00, bias, params, tails[0]), dst += dD; - if (M > 1) Term::template Save(dst + 0, d10, bias, params, tails[0]), dst += dD; 
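The "if (M > k)" chains above are not runtime branches: M is a template parameter, so each specialization compiles down to exactly M unrolled rows, and the switch in GetConvolutionNhwcDirect1x1_2xM (just below) merely picks the specialization matching the leftover pixel count. A reduced sketch of the same dispatch pattern, with hypothetical names and a trivial row operation standing in for the convolution:

#include <stddef.h>

template<int M> static void RowsSketch(const float* src, size_t stride, float* dst)
{
    // M is a compile-time constant: dead "if (M > k)" rows are removed,
    // leaving a branch-free body for exactly M leftover pixels.
    if (M > 0) dst[0] = 2.0f * src[0 * stride];
    if (M > 1) dst[1] = 2.0f * src[1 * stride];
    if (M > 2) dst[2] = 2.0f * src[2 * stride];
}

typedef void (*RowsSketchPtr)(const float*, size_t, float*);

static RowsSketchPtr GetRowsSketch(size_t m) // m: remainder after unrolling, 0..3 here
{
    switch (m)
    {
    case 1: return RowsSketch<1>;
    case 2: return RowsSketch<2>;
    case 3: return RowsSketch<3>;
    }
    return 0; // m == 0: no tail call is needed
}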
- if (M > 2) Term::template Save(dst + 0, d20, bias, params, tails[0]), dst += dD; - if (M > 3) Term::template Save(dst + 0, d30, bias, params, tails[0]), dst += dD; - if (M > 4) Term::template Save(dst + 0, d40, bias, params, tails[0]), dst += dD; - if (M > 5) Term::template Save(dst + 0, d50, bias, params, tails[0]), dst += dD; - } - } - - typedef void(*ConvolutionNhwcDirect1x1_2xM_Ptr)(const float* src0, const ConvParam32f& p, size_t srcC, const float* weight, const __m512* bias, const __m512* params, float* dst, const __mmask16 tails[2]); - - template ConvolutionNhwcDirect1x1_2xM_Ptr GetConvolutionNhwcDirect1x1_2xM(size_t M) - { - switch (M) - { - case 0: return ConvolutionNhwcDirect1x1_2xM; - case 1: return ConvolutionNhwcDirect1x1_2xM; - case 2: return ConvolutionNhwcDirect1x1_2xM; - case 3: return ConvolutionNhwcDirect1x1_2xM; - case 4: return ConvolutionNhwcDirect1x1_2xM; - case 5: return ConvolutionNhwcDirect1x1_2xM; - } - assert(0); - return NULL; - } - - template void ConvolutionNhwcDirect1x1_2(const float* src, const ConvParam32f& p, - size_t dstC, size_t yBeg, size_t yEnd, size_t srcC, const float* weight, const float* bias, const float* params, float* dst) - { - size_t n1 = (yEnd - yBeg) * p.dstW; - size_t n6 = AlignLoAny(n1, 6); - size_t n12 = AlignLoAny(n1, 12); - size_t nTail = n1 - n6; - ConvolutionNhwcDirect1x1_2xM_Ptr tailN = GetConvolutionNhwcDirect1x1_2xM(nTail); - - __m512 _params[2], _bias[2]; - _params[0] = _mm512_set1_ps(params[0]); - if (type == ::SimdConvolutionActivationRestrictRange || type == ::SimdConvolutionActivationHswish) - _params[1] = _mm512_set1_ps(params[1]); - - for (size_t dc = 0; dc < dstC; dc += DF) - { - size_t tail = Simd::Min(DF, dstC - dc); - __mmask16 tails[2] = { TailMask16(tail), TailMask16(tail - F) }; - _bias[0] = _mm512_loadu_ps(bias + dc + 0); - _bias[1] = _mm512_loadu_ps(bias + dc + F); - if (type == ::SimdConvolutionActivationPrelu) - { - _params[0] = _mm512_loadu_ps(params + dc + 0); - _params[1] = _mm512_loadu_ps(params + dc + F); - } - const float* ps = src + yBeg * p.srcW * p.srcC; - float* pd = dst + dc + yBeg * p.dstW * p.dstC; - size_t i = 0; - for (; i < n12; i += 12, ps += 12 * p.srcC, pd += 12 * p.dstC) - ConvolutionNhwcDirect1x1_2x12(ps, p, srcC, weight, _bias, _params, pd, tails); - for (; i < n6; i += 6, ps += 6 * p.srcC, pd += 6 * p.dstC) - ConvolutionNhwcDirect1x1_2x6(ps, p, srcC, weight, _bias, _params, pd, tails); - if (nTail) - tailN(ps, p, srcC, weight, _bias, _params, pd, tails), ps += nTail * p.srcC, pd += nTail * p.dstC; - weight += srcC * DF; - } - } - - template void ConvolutionNhwcDirect1x1_2(const float* src, const ConvParam32f& p, - const SynetConvolution32fNhwcDirect::AlgParam& a, const float* weight, const float* bias, const float* params, float* dst) - { - for (size_t dc = 0; dc < p.dstC; dc += a.macroD) - { - size_t macroD = Simd::Min(p.dstC, dc + a.macroD) - dc; - for (size_t sc = 0; sc < p.srcC; sc += a.macroC) - { - size_t macroC = Simd::Min(p.srcC, sc + a.macroC) - sc; - for (size_t yBeg = 0; yBeg < p.dstH;) - { - size_t yEnd = Simd::Min(yBeg + a.macroH, p.dstH); - if (a.macroC == p.srcC) - ConvolutionNhwcDirect1x1_2(src + sc, p, macroD, yBeg, yEnd, macroC, weight, bias + dc, params, dst + dc); - else if (sc == 0) - ConvolutionNhwcDirect1x1_2(src + sc, p, macroD, yBeg, yEnd, macroC, weight, bias + dc, params, dst + dc); - else if (sc + macroC == p.srcC) - ConvolutionNhwcDirect1x1_2(src + sc, p, macroD, yBeg, yEnd, macroC, weight, bias + dc, params, dst + dc); - else - ConvolutionNhwcDirect1x1_2(src 
+ sc, p, macroD, yBeg, yEnd, macroC, weight, bias + dc, params, dst + dc); - yBeg = yEnd; - } - weight += AlignHiAny(macroD, a.microD) * macroC; - } - if (type == ::SimdConvolutionActivationPrelu) - params += macroD; - } - } - - //--------------------------------------------------------------------- - - template SIMD_INLINE void Set(const ConvParam32f& p, SynetConvolution32fNhwcDirect::OldConvolutionPtr& convolution) - { - if (p.Is1x1()) - convolution = ConvolutionNhwcDirect1x1_2; - else - convolution = ConvolutionNhwcDirect_2; - } - - bool SynetConvolution32fNhwcDirect::Set2f(const ConvParam32f& p, OldConvolutionPtr& convolution) - { - switch (p.activation) - { - case SimdConvolutionActivationIdentity: Set(p, convolution); break; - case SimdConvolutionActivationRelu: Set(p, convolution); break; - case SimdConvolutionActivationLeakyRelu: Set(p, convolution); break; - case SimdConvolutionActivationRestrictRange: Set(p, convolution); break; - case SimdConvolutionActivationPrelu: Set(p, convolution); break; - case SimdConvolutionActivationElu: Set(p, convolution); break; - case SimdConvolutionActivationHswish: Set(p, convolution); break; - default: assert(0); - } - return true; - } - } -#endif//SIMD_AVX512F_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx512fSynetConvolution32fNhwcDirect2r.cpp b/src/3rd/Simd/Simd/SimdAvx512fSynetConvolution32fNhwcDirect2r.cpp deleted file mode 100644 index 03befa1b..00000000 --- a/src/3rd/Simd/Simd/SimdAvx512fSynetConvolution32fNhwcDirect2r.cpp +++ /dev/null @@ -1,798 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdSynetConvolution32f.h" -#include "Simd/SimdSynetConvolution32fCommon.h" -#include "Simd/SimdCpu.h" - -namespace Simd -{ -#ifdef SIMD_AVX512F_ENABLE - namespace Avx512f - { - using AlgParam = SynetConvolution32fNhwcDirect::AlgParam; - - typedef void(*ConvolutionNhwcDirect_NxM_Ptr)(const float* src0, const ConvParam32f& p, const AlgParam& a, size_t dy, size_t dx, size_t srcC, const float* weight0, const __m512* bias, const __m512* params, float* dst, const __mmask16* tails); - typedef void(*ConvolutionNhwcDirect1x1_NxM_Ptr)(const float* src0, const ConvParam32f& p, const AlgParam& a, size_t srcC, const float* weight0, const __m512* bias, const __m512* params, float* dst, const __mmask16* tails); - - //--------------------------------------------------------------------- - - template void ConvolutionNhwcDirect_2x1(const float* src0, const ConvParam32f& p, - const AlgParam& a, size_t dy, size_t dx, size_t srcC, const float* weight0, const __m512* bias, const __m512* params, float* dst, const __mmask16* tails) - { - __m512 d00, d01, s0, w0, w1; - size_t srcH = p.srcH, srcW = p.srcW, dilY = p.dilationY, dilX = p.dilationX; - size_t dY = p.srcW * p.srcC, dX = p.srcC, dS = p.srcC * p.strideX, dW = p.srcC * F; - size_t sy = dy * p.strideY - p.padY, sx = dx * p.strideX - p.padX; - size_t kY = p.kernelY * p.dilationY, kX = p.kernelX * p.dilationX; - const float* weight1 = weight0 + a.stepW; - if (tails[1]) - { - d00 = _mm512_setzero_ps(), d01 = _mm512_setzero_ps(); - for (size_t ky = 0; ky < kY; ky += dilY) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - if (sy + ky < srcH && sx + kx < srcW) - { - size_t offs = beg + kx * dX, end = offs + srcC, offw = 0; - for (; offs < end; ++offs, offw += F) - { - w0 = _mm512_loadu_ps(weight0 + offw); - w1 = _mm512_loadu_ps(weight1 + offw); - s0 = _mm512_set1_ps(src0[offs]), d00 = _mm512_fmadd_ps(s0, w0, d00), d01 = _mm512_fmadd_ps(s0, w1, d01); - } - } - weight0 += dW, weight1 += dW; - } - } - Save2(dst, d00, d01, bias, params, tails); - } - else - { - d00 = _mm512_setzero_ps(); - for (size_t ky = 0; ky < kY; ky += dilY) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - if (sy + ky < srcH && sx + kx < srcW) - { - size_t offs = beg + kx * dX, end = offs + srcC, offw = 0; - for (; offs < end; ++offs, offw += F) - { - w0 = _mm512_loadu_ps(weight0 + offw); - s0 = _mm512_set1_ps(src0[offs]), d00 = _mm512_fmadd_ps(s0, w0, d00); - } - } - weight0 += dW; - } - } - Save1(dst, d00, bias, params, tails); - } - } - - template void ConvolutionNhwcDirect_2x14(const float* src0, const ConvParam32f& p, - const AlgParam& a, size_t dy, size_t dx, size_t srcC, const float* weight0, const __m512* bias, const __m512* params, float* dst, const __mmask16* tails) - { - __m512 d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, d60, d61, d70, d71, d80, d81, d90, d91, da0, da1, db0, db1, dc0, dc1, dd0, dd1, s0, w0, w1; - size_t srcH = p.srcH, srcW = p.srcW, dilY = p.dilationY, dilX = p.dilationX; - size_t dY = p.srcW * p.srcC, dX = p.srcC, dS = p.srcC * p.strideX, dW = p.srcC * F, dWz = p.kernelX * p.srcC * F, dD = p.dstC; - size_t sy = dy * p.strideY - p.padY, sx = dx * p.strideX - p.padX; - size_t kY = p.kernelY * p.dilationY, kX = p.kernelX * p.dilationX; - const float* weight1 = weight0 + a.stepW; - const float* src1 = src0 + 1 * dS; - const float* src2 = src0 + 2 * dS; - const float* src3 = src0 + 3 * dS; - const float* src4 = src0 + 4 * dS; - const float* src5 = 
src0 + 5 * dS; - const float* src6 = src0 + 6 * dS; - if (tails[1]) - { - d00 = _mm512_setzero_ps(), d01 = _mm512_setzero_ps(); - d10 = _mm512_setzero_ps(), d11 = _mm512_setzero_ps(); - d20 = _mm512_setzero_ps(), d21 = _mm512_setzero_ps(); - d30 = _mm512_setzero_ps(), d31 = _mm512_setzero_ps(); - d40 = _mm512_setzero_ps(), d41 = _mm512_setzero_ps(); - d50 = _mm512_setzero_ps(), d51 = _mm512_setzero_ps(); - d60 = _mm512_setzero_ps(), d61 = _mm512_setzero_ps(); - d70 = _mm512_setzero_ps(), d71 = _mm512_setzero_ps(); - d80 = _mm512_setzero_ps(), d81 = _mm512_setzero_ps(); - d90 = _mm512_setzero_ps(), d91 = _mm512_setzero_ps(); - da0 = _mm512_setzero_ps(), da1 = _mm512_setzero_ps(); - db0 = _mm512_setzero_ps(), db1 = _mm512_setzero_ps(); - dc0 = _mm512_setzero_ps(), dc1 = _mm512_setzero_ps(); - dd0 = _mm512_setzero_ps(), dd1 = _mm512_setzero_ps(); - for (size_t ky = 0; ky < kY; ky += dilY) - { - if (sy + ky < srcH) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - assert(sx + kx < srcW && sx + kx + 14 <= srcW); - size_t off0 = beg + kx * dX, end = off0 + srcC, off7 = off0 + 7 * dS, offw = 0; - for (; off0 < end; ++off0, ++off7, offw += F) - { - w0 = _mm512_loadu_ps(weight0 + offw); - w1 = _mm512_loadu_ps(weight1 + offw); - s0 = _mm512_set1_ps(src0[off0]), d00 = _mm512_fmadd_ps(s0, w0, d00), d01 = _mm512_fmadd_ps(s0, w1, d01); - s0 = _mm512_set1_ps(src1[off0]), d10 = _mm512_fmadd_ps(s0, w0, d10), d11 = _mm512_fmadd_ps(s0, w1, d11); - s0 = _mm512_set1_ps(src2[off0]), d20 = _mm512_fmadd_ps(s0, w0, d20), d21 = _mm512_fmadd_ps(s0, w1, d21); - s0 = _mm512_set1_ps(src3[off0]), d30 = _mm512_fmadd_ps(s0, w0, d30), d31 = _mm512_fmadd_ps(s0, w1, d31); - s0 = _mm512_set1_ps(src4[off0]), d40 = _mm512_fmadd_ps(s0, w0, d40), d41 = _mm512_fmadd_ps(s0, w1, d41); - s0 = _mm512_set1_ps(src5[off0]), d50 = _mm512_fmadd_ps(s0, w0, d50), d51 = _mm512_fmadd_ps(s0, w1, d51); - s0 = _mm512_set1_ps(src6[off0]), d60 = _mm512_fmadd_ps(s0, w0, d60), d61 = _mm512_fmadd_ps(s0, w1, d61); - s0 = _mm512_set1_ps(src0[off7]), d70 = _mm512_fmadd_ps(s0, w0, d70), d71 = _mm512_fmadd_ps(s0, w1, d71); - s0 = _mm512_set1_ps(src1[off7]), d80 = _mm512_fmadd_ps(s0, w0, d80), d81 = _mm512_fmadd_ps(s0, w1, d81); - s0 = _mm512_set1_ps(src2[off7]), d90 = _mm512_fmadd_ps(s0, w0, d90), d91 = _mm512_fmadd_ps(s0, w1, d91); - s0 = _mm512_set1_ps(src3[off7]), da0 = _mm512_fmadd_ps(s0, w0, da0), da1 = _mm512_fmadd_ps(s0, w1, da1); - s0 = _mm512_set1_ps(src4[off7]), db0 = _mm512_fmadd_ps(s0, w0, db0), db1 = _mm512_fmadd_ps(s0, w1, db1); - s0 = _mm512_set1_ps(src5[off7]), dc0 = _mm512_fmadd_ps(s0, w0, dc0), dc1 = _mm512_fmadd_ps(s0, w1, dc1); - s0 = _mm512_set1_ps(src6[off7]), dd0 = _mm512_fmadd_ps(s0, w0, dd0), dd1 = _mm512_fmadd_ps(s0, w1, dd1); - } - weight0 += dW, weight1 += dW; - } - } - else - weight0 += dWz, weight1 += dWz; - } - Save2(dst, d00, d01, bias, params, tails), dst += dD; - Save2(dst, d10, d11, bias, params, tails), dst += dD; - Save2(dst, d20, d21, bias, params, tails), dst += dD; - Save2(dst, d30, d31, bias, params, tails), dst += dD; - Save2(dst, d40, d41, bias, params, tails), dst += dD; - Save2(dst, d50, d51, bias, params, tails), dst += dD; - Save2(dst, d60, d61, bias, params, tails), dst += dD; - Save2(dst, d70, d71, bias, params, tails), dst += dD; - Save2(dst, d80, d81, bias, params, tails), dst += dD; - Save2(dst, d90, d91, bias, params, tails), dst += dD; - Save2(dst, da0, da1, bias, params, tails), dst += dD; - Save2(dst, db0, db1, bias, params, tails), dst += dD; - Save2(dst, 
dc0, dc1, bias, params, tails), dst += dD; - Save2(dst, dd0, dd1, bias, params, tails), dst += dD; - } - else - { - d00 = _mm512_setzero_ps(); - d10 = _mm512_setzero_ps(); - d20 = _mm512_setzero_ps(); - d30 = _mm512_setzero_ps(); - d40 = _mm512_setzero_ps(); - d50 = _mm512_setzero_ps(); - d60 = _mm512_setzero_ps(); - d70 = _mm512_setzero_ps(); - d80 = _mm512_setzero_ps(); - d90 = _mm512_setzero_ps(); - da0 = _mm512_setzero_ps(); - db0 = _mm512_setzero_ps(); - dc0 = _mm512_setzero_ps(); - dd0 = _mm512_setzero_ps(); - for (size_t ky = 0; ky < kY; ky += dilY) - { - if (sy + ky < srcH) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - assert(sx + kx < srcW && sx + kx + 14 <= srcW); - size_t off0 = beg + kx * dX, end = off0 + srcC, off7 = off0 + 7 * dS, offw = 0; - for (; off0 < end; ++off0, ++off7, offw += F) - { - w0 = _mm512_loadu_ps(weight0 + offw); - s0 = _mm512_set1_ps(src0[off0]), d00 = _mm512_fmadd_ps(s0, w0, d00); - s0 = _mm512_set1_ps(src1[off0]), d10 = _mm512_fmadd_ps(s0, w0, d10); - s0 = _mm512_set1_ps(src2[off0]), d20 = _mm512_fmadd_ps(s0, w0, d20); - s0 = _mm512_set1_ps(src3[off0]), d30 = _mm512_fmadd_ps(s0, w0, d30); - s0 = _mm512_set1_ps(src4[off0]), d40 = _mm512_fmadd_ps(s0, w0, d40); - s0 = _mm512_set1_ps(src5[off0]), d50 = _mm512_fmadd_ps(s0, w0, d50); - s0 = _mm512_set1_ps(src6[off0]), d60 = _mm512_fmadd_ps(s0, w0, d60); - s0 = _mm512_set1_ps(src0[off7]), d70 = _mm512_fmadd_ps(s0, w0, d70); - s0 = _mm512_set1_ps(src1[off7]), d80 = _mm512_fmadd_ps(s0, w0, d80); - s0 = _mm512_set1_ps(src2[off7]), d90 = _mm512_fmadd_ps(s0, w0, d90); - s0 = _mm512_set1_ps(src3[off7]), da0 = _mm512_fmadd_ps(s0, w0, da0); - s0 = _mm512_set1_ps(src4[off7]), db0 = _mm512_fmadd_ps(s0, w0, db0); - s0 = _mm512_set1_ps(src5[off7]), dc0 = _mm512_fmadd_ps(s0, w0, dc0); - s0 = _mm512_set1_ps(src6[off7]), dd0 = _mm512_fmadd_ps(s0, w0, dd0); - } - weight0 += dW; - } - } - else - weight0 += dWz; - } - Save1(dst, d00, bias, params, tails), dst += dD; - Save1(dst, d10, bias, params, tails), dst += dD; - Save1(dst, d20, bias, params, tails), dst += dD; - Save1(dst, d30, bias, params, tails), dst += dD; - Save1(dst, d40, bias, params, tails), dst += dD; - Save1(dst, d50, bias, params, tails), dst += dD; - Save1(dst, d60, bias, params, tails), dst += dD; - Save1(dst, d70, bias, params, tails), dst += dD; - Save1(dst, d80, bias, params, tails), dst += dD; - Save1(dst, d90, bias, params, tails), dst += dD; - Save1(dst, da0, bias, params, tails), dst += dD; - Save1(dst, db0, bias, params, tails), dst += dD; - Save1(dst, dc0, bias, params, tails), dst += dD; - Save1(dst, dd0, bias, params, tails), dst += dD; - } - } - - template void ConvolutionNhwcDirect_2xM(const float* src0, const ConvParam32f& p, - const AlgParam& a, size_t dy, size_t dx, size_t srcC, const float* weight0, const __m512* bias, const __m512* params, float* dst, const __mmask16* tails) - { - __m512 d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, d60, d61, d70, d71, d80, d81, d90, d91, da0, da1, db0, db1, dc0, dc1, dd0, dd1, s0, w0, w1; - size_t srcH = p.srcH, srcW = p.srcW, dilY = p.dilationY, dilX = p.dilationX; - size_t dY = p.srcW * p.srcC, dX = p.srcC, dS = p.srcC * p.strideX, dW = p.srcC * F, dWz = p.kernelX * p.srcC * F, dD = p.dstC; - size_t sy = dy * p.strideY - p.padY, sx = dx * p.strideX - p.padX; - size_t kY = p.kernelY * p.dilationY, kX = p.kernelX * p.dilationX; - const float* weight1 = weight0 + a.stepW; - const float* src1 = src0 + 1 * dS; - const float* src2 = src0 + 2 * dS; - 
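Worth noting about the padding test used throughout these spatial kernels: sy = dy * strideY - padY is computed in size_t, so for output rows whose receptive field starts in the top padding the subtraction wraps around to a huge value, and the single unsigned comparison "sy + ky < srcH" rejects kernel rows above and below the image alike; the weight pointer is then advanced by a whole kernel row (dWz = kernelX * srcC * F) so the repacked weights stay in step with the skipped input. A sketch of just that test, under those assumptions:

#include <stddef.h>

// Returns nonzero when kernel row offset ky of output row dy lands inside
// the image; ky is assumed to be pre-scaled by the dilation, as above.
static inline int KernelRowInsideImage(size_t dy, size_t strideY, size_t padY,
    size_t ky, size_t srcH)
{
    size_t sy = dy * strideY - padY; // wraps around for top-padding rows
    return (sy + ky) < srcH;         // false above and below the image
}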
const float* src3 = src0 + 3 * dS; - const float* src4 = src0 + 4 * dS; - const float* src5 = src0 + 5 * dS; - const float* src6 = src0 + 6 * dS; - if (tails[1]) - { - if (M > 0x0) d00 = _mm512_setzero_ps(), d01 = _mm512_setzero_ps(); - if (M > 0x1) d10 = _mm512_setzero_ps(), d11 = _mm512_setzero_ps(); - if (M > 0x2) d20 = _mm512_setzero_ps(), d21 = _mm512_setzero_ps(); - if (M > 0x3) d30 = _mm512_setzero_ps(), d31 = _mm512_setzero_ps(); - if (M > 0x4) d40 = _mm512_setzero_ps(), d41 = _mm512_setzero_ps(); - if (M > 0x5) d50 = _mm512_setzero_ps(), d51 = _mm512_setzero_ps(); - if (M > 0x6) d60 = _mm512_setzero_ps(), d61 = _mm512_setzero_ps(); - if (M > 0x7) d70 = _mm512_setzero_ps(), d71 = _mm512_setzero_ps(); - if (M > 0x8) d80 = _mm512_setzero_ps(), d81 = _mm512_setzero_ps(); - if (M > 0x9) d90 = _mm512_setzero_ps(), d91 = _mm512_setzero_ps(); - if (M > 0xa) da0 = _mm512_setzero_ps(), da1 = _mm512_setzero_ps(); - if (M > 0xb) db0 = _mm512_setzero_ps(), db1 = _mm512_setzero_ps(); - if (M > 0xc) dc0 = _mm512_setzero_ps(), dc1 = _mm512_setzero_ps(); - if (M > 0xd) dd0 = _mm512_setzero_ps(), dd1 = _mm512_setzero_ps(); - for (size_t ky = 0; ky < kY; ky += dilY) - { - if (sy + ky < srcH) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - assert(sx + kx < srcW && sx + kx + M <= srcW); - size_t off0 = beg + kx * dX, end = off0 + srcC, off7 = off0 + 7 * dS, offw = 0; - for (; off0 < end; ++off0, ++off7, offw += F) - { - w0 = _mm512_loadu_ps(weight0 + offw); - w1 = _mm512_loadu_ps(weight1 + offw); - if (M > 0x0) s0 = _mm512_set1_ps(src0[off0]), d00 = _mm512_fmadd_ps(s0, w0, d00), d01 = _mm512_fmadd_ps(s0, w1, d01); - if (M > 0x1) s0 = _mm512_set1_ps(src1[off0]), d10 = _mm512_fmadd_ps(s0, w0, d10), d11 = _mm512_fmadd_ps(s0, w1, d11); - if (M > 0x2) s0 = _mm512_set1_ps(src2[off0]), d20 = _mm512_fmadd_ps(s0, w0, d20), d21 = _mm512_fmadd_ps(s0, w1, d21); - if (M > 0x3) s0 = _mm512_set1_ps(src3[off0]), d30 = _mm512_fmadd_ps(s0, w0, d30), d31 = _mm512_fmadd_ps(s0, w1, d31); - if (M > 0x4) s0 = _mm512_set1_ps(src4[off0]), d40 = _mm512_fmadd_ps(s0, w0, d40), d41 = _mm512_fmadd_ps(s0, w1, d41); - if (M > 0x5) s0 = _mm512_set1_ps(src5[off0]), d50 = _mm512_fmadd_ps(s0, w0, d50), d51 = _mm512_fmadd_ps(s0, w1, d51); - if (M > 0x6) s0 = _mm512_set1_ps(src6[off0]), d60 = _mm512_fmadd_ps(s0, w0, d60), d61 = _mm512_fmadd_ps(s0, w1, d61); - if (M > 0x7) s0 = _mm512_set1_ps(src0[off7]), d70 = _mm512_fmadd_ps(s0, w0, d70), d71 = _mm512_fmadd_ps(s0, w1, d71); - if (M > 0x8) s0 = _mm512_set1_ps(src1[off7]), d80 = _mm512_fmadd_ps(s0, w0, d80), d81 = _mm512_fmadd_ps(s0, w1, d81); - if (M > 0x9) s0 = _mm512_set1_ps(src2[off7]), d90 = _mm512_fmadd_ps(s0, w0, d90), d91 = _mm512_fmadd_ps(s0, w1, d91); - if (M > 0xa) s0 = _mm512_set1_ps(src3[off7]), da0 = _mm512_fmadd_ps(s0, w0, da0), da1 = _mm512_fmadd_ps(s0, w1, da1); - if (M > 0xb) s0 = _mm512_set1_ps(src4[off7]), db0 = _mm512_fmadd_ps(s0, w0, db0), db1 = _mm512_fmadd_ps(s0, w1, db1); - if (M > 0xc) s0 = _mm512_set1_ps(src5[off7]), dc0 = _mm512_fmadd_ps(s0, w0, dc0), dc1 = _mm512_fmadd_ps(s0, w1, dc1); - if (M > 0xd) s0 = _mm512_set1_ps(src6[off7]), dd0 = _mm512_fmadd_ps(s0, w0, dd0), dd1 = _mm512_fmadd_ps(s0, w1, dd1); - } - weight0 += dW, weight1 += dW; - } - } - else - weight0 += dWz, weight1 += dWz; - } - if (M > 0x0) Save2(dst, d00, d01, bias, params, tails), dst += dD; - if (M > 0x1) Save2(dst, d10, d11, bias, params, tails), dst += dD; - if (M > 0x2) Save2(dst, d20, d21, bias, params, tails), dst += dD; - if (M > 0x3) 
Save2(dst, d30, d31, bias, params, tails), dst += dD; - if (M > 0x4) Save2(dst, d40, d41, bias, params, tails), dst += dD; - if (M > 0x5) Save2(dst, d50, d51, bias, params, tails), dst += dD; - if (M > 0x6) Save2(dst, d60, d61, bias, params, tails), dst += dD; - if (M > 0x7) Save2(dst, d70, d71, bias, params, tails), dst += dD; - if (M > 0x8) Save2(dst, d80, d81, bias, params, tails), dst += dD; - if (M > 0x9) Save2(dst, d90, d91, bias, params, tails), dst += dD; - if (M > 0xa) Save2(dst, da0, da1, bias, params, tails), dst += dD; - if (M > 0xb) Save2(dst, db0, db1, bias, params, tails), dst += dD; - if (M > 0xc) Save2(dst, dc0, dc1, bias, params, tails), dst += dD; - if (M > 0xd) Save2(dst, dd0, dd1, bias, params, tails), dst += dD; - } - else - { - if (M > 0x0) d00 = _mm512_setzero_ps(); - if (M > 0x1) d10 = _mm512_setzero_ps(); - if (M > 0x2) d20 = _mm512_setzero_ps(); - if (M > 0x3) d30 = _mm512_setzero_ps(); - if (M > 0x4) d40 = _mm512_setzero_ps(); - if (M > 0x5) d50 = _mm512_setzero_ps(); - if (M > 0x6) d60 = _mm512_setzero_ps(); - if (M > 0x7) d70 = _mm512_setzero_ps(); - if (M > 0x8) d80 = _mm512_setzero_ps(); - if (M > 0x9) d90 = _mm512_setzero_ps(); - if (M > 0xa) da0 = _mm512_setzero_ps(); - if (M > 0xb) db0 = _mm512_setzero_ps(); - if (M > 0xc) dc0 = _mm512_setzero_ps(); - if (M > 0xd) dd0 = _mm512_setzero_ps(); - for (size_t ky = 0; ky < kY; ky += dilY) - { - if (sy + ky < srcH) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - assert(sx + kx < srcW && sx + kx + M <= srcW); - size_t off0 = beg + kx * dX, end = off0 + srcC, off7 = off0 + 7 * dS, offw = 0; - for (; off0 < end; ++off0, ++off7, offw += F) - { - w0 = _mm512_loadu_ps(weight0 + offw); - if (M > 0x0) s0 = _mm512_set1_ps(src0[off0]), d00 = _mm512_fmadd_ps(s0, w0, d00); - if (M > 0x1) s0 = _mm512_set1_ps(src1[off0]), d10 = _mm512_fmadd_ps(s0, w0, d10); - if (M > 0x2) s0 = _mm512_set1_ps(src2[off0]), d20 = _mm512_fmadd_ps(s0, w0, d20); - if (M > 0x3) s0 = _mm512_set1_ps(src3[off0]), d30 = _mm512_fmadd_ps(s0, w0, d30); - if (M > 0x4) s0 = _mm512_set1_ps(src4[off0]), d40 = _mm512_fmadd_ps(s0, w0, d40); - if (M > 0x5) s0 = _mm512_set1_ps(src5[off0]), d50 = _mm512_fmadd_ps(s0, w0, d50); - if (M > 0x6) s0 = _mm512_set1_ps(src6[off0]), d60 = _mm512_fmadd_ps(s0, w0, d60); - if (M > 0x7) s0 = _mm512_set1_ps(src0[off7]), d70 = _mm512_fmadd_ps(s0, w0, d70); - if (M > 0x8) s0 = _mm512_set1_ps(src1[off7]), d80 = _mm512_fmadd_ps(s0, w0, d80); - if (M > 0x9) s0 = _mm512_set1_ps(src2[off7]), d90 = _mm512_fmadd_ps(s0, w0, d90); - if (M > 0xa) s0 = _mm512_set1_ps(src3[off7]), da0 = _mm512_fmadd_ps(s0, w0, da0); - if (M > 0xb) s0 = _mm512_set1_ps(src4[off7]), db0 = _mm512_fmadd_ps(s0, w0, db0); - if (M > 0xc) s0 = _mm512_set1_ps(src5[off7]), dc0 = _mm512_fmadd_ps(s0, w0, dc0); - if (M > 0xd) s0 = _mm512_set1_ps(src6[off7]), dd0 = _mm512_fmadd_ps(s0, w0, dd0); - } - weight0 += dW; - } - } - else - weight0 += dWz; - } - if (M > 0x0) Save1(dst, d00, bias, params, tails), dst += dD; - if (M > 0x1) Save1(dst, d10, bias, params, tails), dst += dD; - if (M > 0x2) Save1(dst, d20, bias, params, tails), dst += dD; - if (M > 0x3) Save1(dst, d30, bias, params, tails), dst += dD; - if (M > 0x4) Save1(dst, d40, bias, params, tails), dst += dD; - if (M > 0x5) Save1(dst, d50, bias, params, tails), dst += dD; - if (M > 0x6) Save1(dst, d60, bias, params, tails), dst += dD; - if (M > 0x7) Save1(dst, d70, bias, params, tails), dst += dD; - if (M > 0x8) Save1(dst, d80, bias, params, tails), dst += dD; - if (M > 
0x9) Save1(dst, d90, bias, params, tails), dst += dD; - if (M > 0xa) Save1(dst, da0, bias, params, tails), dst += dD; - if (M > 0xb) Save1(dst, db0, bias, params, tails), dst += dD; - if (M > 0xc) Save1(dst, dc0, bias, params, tails), dst += dD; - if (M > 0xd) Save1(dst, dd0, bias, params, tails), dst += dD; - } - } - - template ConvolutionNhwcDirect_NxM_Ptr GetConvolutionNhwcDirect_2xM(size_t M) - { - switch (M) - { - case 0x0: return NULL; - case 0x1: return ConvolutionNhwcDirect_2xM; - case 0x2: return ConvolutionNhwcDirect_2xM; - case 0x3: return ConvolutionNhwcDirect_2xM; - case 0x4: return ConvolutionNhwcDirect_2xM; - case 0x5: return ConvolutionNhwcDirect_2xM; - case 0x6: return ConvolutionNhwcDirect_2xM; - case 0x7: return ConvolutionNhwcDirect_2xM; - case 0x8: return ConvolutionNhwcDirect_2xM; - case 0x9: return ConvolutionNhwcDirect_2xM; - case 0xa: return ConvolutionNhwcDirect_2xM; - case 0xb: return ConvolutionNhwcDirect_2xM; - case 0xc: return ConvolutionNhwcDirect_2xM; - case 0xd: return ConvolutionNhwcDirect_2xM; - } - assert(0); - return NULL; - } - - template void ConvolutionNhwcDirect_2(const float* src, const ConvParam32f& p, const AlgParam& a, - size_t dstC, size_t yBeg, size_t yEnd, size_t srcC, const float* weight, const float* bias, const float* params, float* dst) - { - size_t noseH = p.NoseH(), noseW = p.NoseW(), bodyH = p.BodyH(), bodyW = p.BodyW(); - size_t n = 14, bodyWn = AlignLoAny(bodyW - noseW, n) + noseW, m = bodyW - bodyWn; - ConvolutionNhwcDirect_NxM_Ptr convolutionNhwcDirect_2x1 = ConvolutionNhwcDirect_2x1; - ConvolutionNhwcDirect_NxM_Ptr convolutionNhwcDirect_2xN = ConvolutionNhwcDirect_2x14; - ConvolutionNhwcDirect_NxM_Ptr convolutionNhwcDirect_2xM = GetConvolutionNhwcDirect_2xM(m); - size_t tailH = p.dstH, tailW = p.dstW; - size_t kY = p.kernelY - noseH, kX = p.kernelX - noseW, kH = bodyH + p.kernelY - 1, kW = bodyW + p.kernelX - 1; - - __m512 _params[2], _bias[2]; - _params[0] = _mm512_set1_ps(params[0]); - if (type == ::SimdConvolutionActivationRestrictRange || type == ::SimdConvolutionActivationHswish) - _params[1] = _mm512_set1_ps(params[1]); - - for (size_t dc = 0; dc < dstC; dc += a.microD) - { - size_t dC = Simd::Min(a.microD, dstC - dc); - __mmask16 tails[2] = { TailMask16(dC), TailMask16(dC - F) }; - if (dC > 0 * F) _bias[0] = _mm512_loadu_ps(bias + dc + 0 * F); - if (dC > 1 * F) _bias[1] = _mm512_loadu_ps(bias + dc + 1 * F); - if (type == ::SimdConvolutionActivationPrelu) - { - if (dC > 0 * F) _params[0] = _mm512_loadu_ps(params + dc + 0 * F); - if (dC > 1 * F) _params[1] = _mm512_loadu_ps(params + dc + 1 * F); - } - float* d = dst + dc + yBeg * p.dstW * p.dstC; - size_t dy = yBeg; - for (; dy < noseH && dy < yEnd; dy++) - { - size_t dx = 0; - for (; dx < noseW; dx++, d += p.dstC) - convolutionNhwcDirect_2x1(src, p, a, dy, dx, srcC, weight, _bias, _params, d, tails); - for (; dx < bodyWn; dx += n, d += p.dstC * n) - convolutionNhwcDirect_2xN(src, p, a, dy, dx, srcC, weight, _bias, _params, d, tails); - for (; dx < bodyW; dx += m, d += p.dstC * m) - convolutionNhwcDirect_2xM(src, p, a, dy, dx, srcC, weight, _bias, _params, d, tails); - for (; dx < tailW; dx++, d += p.dstC) - convolutionNhwcDirect_2x1(src, p, a, dy, dx, srcC, weight, _bias, _params, d, tails); - } - for (; dy < bodyH && dy < yEnd; dy++) - { - size_t dx = 0; - for (; dx < noseW; dx++, d += p.dstC) - convolutionNhwcDirect_2x1(src, p, a, dy, dx, srcC, weight, _bias, _params, d, tails); - for (; dx < bodyWn; dx += n, d += p.dstC * n) - convolutionNhwcDirect_2xN(src, p, a, dy, dx, 
srcC, weight, _bias, _params, d, tails); - for (; dx < bodyW; dx += m, d += p.dstC * m) - convolutionNhwcDirect_2xM(src, p, a, dy, dx, srcC, weight, _bias, _params, d, tails); - for (; dx < tailW; dx++, d += p.dstC) - convolutionNhwcDirect_2x1(src, p, a, dy, dx, srcC, weight, _bias, _params, d, tails); - } - for (; dy < tailH && dy < yEnd; dy++) - { - size_t dx = 0; - for (; dx < noseW; dx++, d += p.dstC) - convolutionNhwcDirect_2x1(src, p, a, dy, dx, srcC, weight, _bias, _params, d, tails); - for (; dx < bodyWn; dx += n, d += p.dstC * n) - convolutionNhwcDirect_2xN(src, p, a, dy, dx, srcC, weight, _bias, _params, d, tails); - for (; dx < bodyW; dx += m, d += p.dstC * m) - convolutionNhwcDirect_2xM(src, p, a, dy, dx, srcC, weight, _bias, _params, d, tails); - for (; dx < tailW; dx++, d += p.dstC) - convolutionNhwcDirect_2x1(src, p, a, dy, dx, srcC, weight, _bias, _params, d, tails); - } - weight += p.kernelY * p.kernelX * p.srcC * a.microD; - } - } - - //--------------------------------------------------------------------- - - template void ConvolutionNhwcDirect1x1_2x14(const float* src0, const ConvParam32f& p, - const AlgParam& a, size_t srcC, const float* weight0, const __m512* bias, const __m512* params, float* dst, const __mmask16* tails) - { - __m512 d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, d60, d61, d70, d71, d80, d81, d90, d91, da0, da1, db0, db1, dc0, dc1, dd0, dd1, s0, w0, w1; - size_t dS = p.srcC, dD = p.dstC; - const float* weight1 = weight0 + a.stepW; - const float* src1 = src0 + 1 * dS; - const float* src2 = src0 + 2 * dS; - const float* src3 = src0 + 3 * dS; - const float* src4 = src0 + 4 * dS; - const float* src5 = src0 + 5 * dS; - const float* src6 = src0 + 6 * dS; - if (tails[1]) - { - d00 = _mm512_setzero_ps(), d01 = _mm512_setzero_ps(); - d10 = _mm512_setzero_ps(), d11 = _mm512_setzero_ps(); - d20 = _mm512_setzero_ps(), d21 = _mm512_setzero_ps(); - d30 = _mm512_setzero_ps(), d31 = _mm512_setzero_ps(); - d40 = _mm512_setzero_ps(), d41 = _mm512_setzero_ps(); - d50 = _mm512_setzero_ps(), d51 = _mm512_setzero_ps(); - d60 = _mm512_setzero_ps(), d61 = _mm512_setzero_ps(); - d70 = _mm512_setzero_ps(), d71 = _mm512_setzero_ps(); - d80 = _mm512_setzero_ps(), d81 = _mm512_setzero_ps(); - d90 = _mm512_setzero_ps(), d91 = _mm512_setzero_ps(); - da0 = _mm512_setzero_ps(), da1 = _mm512_setzero_ps(); - db0 = _mm512_setzero_ps(), db1 = _mm512_setzero_ps(); - dc0 = _mm512_setzero_ps(), dc1 = _mm512_setzero_ps(); - dd0 = _mm512_setzero_ps(), dd1 = _mm512_setzero_ps(); - for (size_t off0 = 0, off7 = 7 * dS, offw = 0; off0 < srcC; ++off0, ++off7, offw += F) - { - w0 = _mm512_loadu_ps(weight0 + offw); - w1 = _mm512_loadu_ps(weight1 + offw); - s0 = _mm512_set1_ps(src0[off0]), d00 = _mm512_fmadd_ps(s0, w0, d00), d01 = _mm512_fmadd_ps(s0, w1, d01); - s0 = _mm512_set1_ps(src1[off0]), d10 = _mm512_fmadd_ps(s0, w0, d10), d11 = _mm512_fmadd_ps(s0, w1, d11); - s0 = _mm512_set1_ps(src2[off0]), d20 = _mm512_fmadd_ps(s0, w0, d20), d21 = _mm512_fmadd_ps(s0, w1, d21); - s0 = _mm512_set1_ps(src3[off0]), d30 = _mm512_fmadd_ps(s0, w0, d30), d31 = _mm512_fmadd_ps(s0, w1, d31); - s0 = _mm512_set1_ps(src4[off0]), d40 = _mm512_fmadd_ps(s0, w0, d40), d41 = _mm512_fmadd_ps(s0, w1, d41); - s0 = _mm512_set1_ps(src5[off0]), d50 = _mm512_fmadd_ps(s0, w0, d50), d51 = _mm512_fmadd_ps(s0, w1, d51); - s0 = _mm512_set1_ps(src6[off0]), d60 = _mm512_fmadd_ps(s0, w0, d60), d61 = _mm512_fmadd_ps(s0, w1, d61); - s0 = _mm512_set1_ps(src0[off7]), d70 = _mm512_fmadd_ps(s0, w0, d70), d71 = _mm512_fmadd_ps(s0, 
w1, d71); - s0 = _mm512_set1_ps(src1[off7]), d80 = _mm512_fmadd_ps(s0, w0, d80), d81 = _mm512_fmadd_ps(s0, w1, d81); - s0 = _mm512_set1_ps(src2[off7]), d90 = _mm512_fmadd_ps(s0, w0, d90), d91 = _mm512_fmadd_ps(s0, w1, d91); - s0 = _mm512_set1_ps(src3[off7]), da0 = _mm512_fmadd_ps(s0, w0, da0), da1 = _mm512_fmadd_ps(s0, w1, da1); - s0 = _mm512_set1_ps(src4[off7]), db0 = _mm512_fmadd_ps(s0, w0, db0), db1 = _mm512_fmadd_ps(s0, w1, db1); - s0 = _mm512_set1_ps(src5[off7]), dc0 = _mm512_fmadd_ps(s0, w0, dc0), dc1 = _mm512_fmadd_ps(s0, w1, dc1); - s0 = _mm512_set1_ps(src6[off7]), dd0 = _mm512_fmadd_ps(s0, w0, dd0), dd1 = _mm512_fmadd_ps(s0, w1, dd1); - } - Save2(dst, d00, d01, bias, params, tails), dst += dD; - Save2(dst, d10, d11, bias, params, tails), dst += dD; - Save2(dst, d20, d21, bias, params, tails), dst += dD; - Save2(dst, d30, d31, bias, params, tails), dst += dD; - Save2(dst, d40, d41, bias, params, tails), dst += dD; - Save2(dst, d50, d51, bias, params, tails), dst += dD; - Save2(dst, d60, d61, bias, params, tails), dst += dD; - Save2(dst, d70, d71, bias, params, tails), dst += dD; - Save2(dst, d80, d81, bias, params, tails), dst += dD; - Save2(dst, d90, d91, bias, params, tails), dst += dD; - Save2(dst, da0, da1, bias, params, tails), dst += dD; - Save2(dst, db0, db1, bias, params, tails), dst += dD; - Save2(dst, dc0, dc1, bias, params, tails), dst += dD; - Save2(dst, dd0, dd1, bias, params, tails), dst += dD; - } - else - { - d00 = _mm512_setzero_ps(); - d10 = _mm512_setzero_ps(); - d20 = _mm512_setzero_ps(); - d30 = _mm512_setzero_ps(); - d40 = _mm512_setzero_ps(); - d50 = _mm512_setzero_ps(); - d60 = _mm512_setzero_ps(); - d70 = _mm512_setzero_ps(); - d80 = _mm512_setzero_ps(); - d90 = _mm512_setzero_ps(); - da0 = _mm512_setzero_ps(); - db0 = _mm512_setzero_ps(); - dc0 = _mm512_setzero_ps(); - dd0 = _mm512_setzero_ps(); - for (size_t off0 = 0, off7 = 7 * dS, offw = 0; off0 < srcC; ++off0, ++off7, offw += F) - { - w0 = _mm512_loadu_ps(weight0 + offw); - s0 = _mm512_set1_ps(src0[off0]), d00 = _mm512_fmadd_ps(s0, w0, d00); - s0 = _mm512_set1_ps(src1[off0]), d10 = _mm512_fmadd_ps(s0, w0, d10); - s0 = _mm512_set1_ps(src2[off0]), d20 = _mm512_fmadd_ps(s0, w0, d20); - s0 = _mm512_set1_ps(src3[off0]), d30 = _mm512_fmadd_ps(s0, w0, d30); - s0 = _mm512_set1_ps(src4[off0]), d40 = _mm512_fmadd_ps(s0, w0, d40); - s0 = _mm512_set1_ps(src5[off0]), d50 = _mm512_fmadd_ps(s0, w0, d50); - s0 = _mm512_set1_ps(src6[off0]), d60 = _mm512_fmadd_ps(s0, w0, d60); - s0 = _mm512_set1_ps(src0[off7]), d70 = _mm512_fmadd_ps(s0, w0, d70); - s0 = _mm512_set1_ps(src1[off7]), d80 = _mm512_fmadd_ps(s0, w0, d80); - s0 = _mm512_set1_ps(src2[off7]), d90 = _mm512_fmadd_ps(s0, w0, d90); - s0 = _mm512_set1_ps(src3[off7]), da0 = _mm512_fmadd_ps(s0, w0, da0); - s0 = _mm512_set1_ps(src4[off7]), db0 = _mm512_fmadd_ps(s0, w0, db0); - s0 = _mm512_set1_ps(src5[off7]), dc0 = _mm512_fmadd_ps(s0, w0, dc0); - s0 = _mm512_set1_ps(src6[off7]), dd0 = _mm512_fmadd_ps(s0, w0, dd0); - } - Save1(dst, d00, bias, params, tails), dst += dD; - Save1(dst, d10, bias, params, tails), dst += dD; - Save1(dst, d20, bias, params, tails), dst += dD; - Save1(dst, d30, bias, params, tails), dst += dD; - Save1(dst, d40, bias, params, tails), dst += dD; - Save1(dst, d50, bias, params, tails), dst += dD; - Save1(dst, d60, bias, params, tails), dst += dD; - Save1(dst, d70, bias, params, tails), dst += dD; - Save1(dst, d80, bias, params, tails), dst += dD; - Save1(dst, d90, bias, params, tails), dst += dD; - Save1(dst, da0, bias, params, tails), dst += dD; - 
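The tails[2] masks that every Save1/Save2 call receives are plain AVX-512 write masks: TailMask16(n) sets the low min(n, 16) bits, and the second entry, TailMask16(dC - F), covers channels 16..31 of the 32-wide block, so partial dstC blocks are stored without scalar cleanup. A self-contained sketch of the assumed semantics, with hypothetical names:

#include <immintrin.h>
#include <stddef.h>

// Assumed TailMask16 semantics: the low min(max(n, 0), 16) bits are set.
static inline __mmask16 TailMask16Sketch(ptrdiff_t n)
{
    if (n <= 0) return (__mmask16)0;
    if (n >= 16) return (__mmask16)0xFFFF;
    return (__mmask16)((1u << n) - 1);
}

// Masked tail store: lanes at or past channelsLeft are left untouched in memory.
static inline void SaveTailSketch(float* dst, __m512 value, ptrdiff_t channelsLeft)
{
    _mm512_mask_storeu_ps(dst, TailMask16Sketch(channelsLeft), value);
}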
Save1(dst, db0, bias, params, tails), dst += dD; - Save1(dst, dc0, bias, params, tails), dst += dD; - Save1(dst, dd0, bias, params, tails), dst += dD; - } - } - - template void ConvolutionNhwcDirect1x1_2xM(const float* src0, const ConvParam32f& p, - const AlgParam& a, size_t srcC, const float* weight0, const __m512* bias, const __m512* params, float* dst, const __mmask16* tails) - { - __m512 d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, d60, d61, d70, d71, d80, d81, d90, d91, da0, da1, db0, db1, dc0, dc1, dd0, dd1, s0, w0, w1; - size_t dS = p.srcC, dD = p.dstC; - const float* weight1 = weight0 + a.stepW; - const float* src1 = src0 + 1 * dS; - const float* src2 = src0 + 2 * dS; - const float* src3 = src0 + 3 * dS; - const float* src4 = src0 + 4 * dS; - const float* src5 = src0 + 5 * dS; - const float* src6 = src0 + 6 * dS; - if (tails[1]) - { - if (M > 0x0) d00 = _mm512_setzero_ps(), d01 = _mm512_setzero_ps(); - if (M > 0x1) d10 = _mm512_setzero_ps(), d11 = _mm512_setzero_ps(); - if (M > 0x2) d20 = _mm512_setzero_ps(), d21 = _mm512_setzero_ps(); - if (M > 0x3) d30 = _mm512_setzero_ps(), d31 = _mm512_setzero_ps(); - if (M > 0x4) d40 = _mm512_setzero_ps(), d41 = _mm512_setzero_ps(); - if (M > 0x5) d50 = _mm512_setzero_ps(), d51 = _mm512_setzero_ps(); - if (M > 0x6) d60 = _mm512_setzero_ps(), d61 = _mm512_setzero_ps(); - if (M > 0x7) d70 = _mm512_setzero_ps(), d71 = _mm512_setzero_ps(); - if (M > 0x8) d80 = _mm512_setzero_ps(), d81 = _mm512_setzero_ps(); - if (M > 0x9) d90 = _mm512_setzero_ps(), d91 = _mm512_setzero_ps(); - if (M > 0xa) da0 = _mm512_setzero_ps(), da1 = _mm512_setzero_ps(); - if (M > 0xb) db0 = _mm512_setzero_ps(), db1 = _mm512_setzero_ps(); - if (M > 0xc) dc0 = _mm512_setzero_ps(), dc1 = _mm512_setzero_ps(); - if (M > 0xd) dd0 = _mm512_setzero_ps(), dd1 = _mm512_setzero_ps(); - for (size_t off0 = 0, off7 = 7 * dS, offw = 0; off0 < srcC; ++off0, ++off7, offw += F) - { - w0 = _mm512_loadu_ps(weight0 + offw); - w1 = _mm512_loadu_ps(weight1 + offw); - if (M > 0x0) s0 = _mm512_set1_ps(src0[off0]), d00 = _mm512_fmadd_ps(s0, w0, d00), d01 = _mm512_fmadd_ps(s0, w1, d01); - if (M > 0x1) s0 = _mm512_set1_ps(src1[off0]), d10 = _mm512_fmadd_ps(s0, w0, d10), d11 = _mm512_fmadd_ps(s0, w1, d11); - if (M > 0x2) s0 = _mm512_set1_ps(src2[off0]), d20 = _mm512_fmadd_ps(s0, w0, d20), d21 = _mm512_fmadd_ps(s0, w1, d21); - if (M > 0x3) s0 = _mm512_set1_ps(src3[off0]), d30 = _mm512_fmadd_ps(s0, w0, d30), d31 = _mm512_fmadd_ps(s0, w1, d31); - if (M > 0x4) s0 = _mm512_set1_ps(src4[off0]), d40 = _mm512_fmadd_ps(s0, w0, d40), d41 = _mm512_fmadd_ps(s0, w1, d41); - if (M > 0x5) s0 = _mm512_set1_ps(src5[off0]), d50 = _mm512_fmadd_ps(s0, w0, d50), d51 = _mm512_fmadd_ps(s0, w1, d51); - if (M > 0x6) s0 = _mm512_set1_ps(src6[off0]), d60 = _mm512_fmadd_ps(s0, w0, d60), d61 = _mm512_fmadd_ps(s0, w1, d61); - if (M > 0x7) s0 = _mm512_set1_ps(src0[off7]), d70 = _mm512_fmadd_ps(s0, w0, d70), d71 = _mm512_fmadd_ps(s0, w1, d71); - if (M > 0x8) s0 = _mm512_set1_ps(src1[off7]), d80 = _mm512_fmadd_ps(s0, w0, d80), d81 = _mm512_fmadd_ps(s0, w1, d81); - if (M > 0x9) s0 = _mm512_set1_ps(src2[off7]), d90 = _mm512_fmadd_ps(s0, w0, d90), d91 = _mm512_fmadd_ps(s0, w1, d91); - if (M > 0xa) s0 = _mm512_set1_ps(src3[off7]), da0 = _mm512_fmadd_ps(s0, w0, da0), da1 = _mm512_fmadd_ps(s0, w1, da1); - if (M > 0xb) s0 = _mm512_set1_ps(src4[off7]), db0 = _mm512_fmadd_ps(s0, w0, db0), db1 = _mm512_fmadd_ps(s0, w1, db1); - if (M > 0xc) s0 = _mm512_set1_ps(src5[off7]), dc0 = _mm512_fmadd_ps(s0, w0, dc0), dc1 = 
_mm512_fmadd_ps(s0, w1, dc1); - if (M > 0xd) s0 = _mm512_set1_ps(src6[off7]), dd0 = _mm512_fmadd_ps(s0, w0, dd0), dd1 = _mm512_fmadd_ps(s0, w1, dd1); - } - if (M > 0x0) Save2(dst, d00, d01, bias, params, tails), dst += dD; - if (M > 0x1) Save2(dst, d10, d11, bias, params, tails), dst += dD; - if (M > 0x2) Save2(dst, d20, d21, bias, params, tails), dst += dD; - if (M > 0x3) Save2(dst, d30, d31, bias, params, tails), dst += dD; - if (M > 0x4) Save2(dst, d40, d41, bias, params, tails), dst += dD; - if (M > 0x5) Save2(dst, d50, d51, bias, params, tails), dst += dD; - if (M > 0x6) Save2(dst, d60, d61, bias, params, tails), dst += dD; - if (M > 0x7) Save2(dst, d70, d71, bias, params, tails), dst += dD; - if (M > 0x8) Save2(dst, d80, d81, bias, params, tails), dst += dD; - if (M > 0x9) Save2(dst, d90, d91, bias, params, tails), dst += dD; - if (M > 0xa) Save2(dst, da0, da1, bias, params, tails), dst += dD; - if (M > 0xb) Save2(dst, db0, db1, bias, params, tails), dst += dD; - if (M > 0xc) Save2(dst, dc0, dc1, bias, params, tails), dst += dD; - if (M > 0xd) Save2(dst, dd0, dd1, bias, params, tails), dst += dD; - } - else - { - if (M > 0x0) d00 = _mm512_setzero_ps(); - if (M > 0x1) d10 = _mm512_setzero_ps(); - if (M > 0x2) d20 = _mm512_setzero_ps(); - if (M > 0x3) d30 = _mm512_setzero_ps(); - if (M > 0x4) d40 = _mm512_setzero_ps(); - if (M > 0x5) d50 = _mm512_setzero_ps(); - if (M > 0x6) d60 = _mm512_setzero_ps(); - if (M > 0x7) d70 = _mm512_setzero_ps(); - if (M > 0x8) d80 = _mm512_setzero_ps(); - if (M > 0x9) d90 = _mm512_setzero_ps(); - if (M > 0xa) da0 = _mm512_setzero_ps(); - if (M > 0xb) db0 = _mm512_setzero_ps(); - if (M > 0xc) dc0 = _mm512_setzero_ps(); - if (M > 0xd) dd0 = _mm512_setzero_ps(); - for (size_t off0 = 0, off7 = 7 * dS, offw = 0; off0 < srcC; ++off0, ++off7, offw += F) - { - w0 = _mm512_loadu_ps(weight0 + offw); - if (M > 0x0) s0 = _mm512_set1_ps(src0[off0]), d00 = _mm512_fmadd_ps(s0, w0, d00); - if (M > 0x1) s0 = _mm512_set1_ps(src1[off0]), d10 = _mm512_fmadd_ps(s0, w0, d10); - if (M > 0x2) s0 = _mm512_set1_ps(src2[off0]), d20 = _mm512_fmadd_ps(s0, w0, d20); - if (M > 0x3) s0 = _mm512_set1_ps(src3[off0]), d30 = _mm512_fmadd_ps(s0, w0, d30); - if (M > 0x4) s0 = _mm512_set1_ps(src4[off0]), d40 = _mm512_fmadd_ps(s0, w0, d40); - if (M > 0x5) s0 = _mm512_set1_ps(src5[off0]), d50 = _mm512_fmadd_ps(s0, w0, d50); - if (M > 0x6) s0 = _mm512_set1_ps(src6[off0]), d60 = _mm512_fmadd_ps(s0, w0, d60); - if (M > 0x7) s0 = _mm512_set1_ps(src0[off7]), d70 = _mm512_fmadd_ps(s0, w0, d70); - if (M > 0x8) s0 = _mm512_set1_ps(src1[off7]), d80 = _mm512_fmadd_ps(s0, w0, d80); - if (M > 0x9) s0 = _mm512_set1_ps(src2[off7]), d90 = _mm512_fmadd_ps(s0, w0, d90); - if (M > 0xa) s0 = _mm512_set1_ps(src3[off7]), da0 = _mm512_fmadd_ps(s0, w0, da0); - if (M > 0xb) s0 = _mm512_set1_ps(src4[off7]), db0 = _mm512_fmadd_ps(s0, w0, db0); - if (M > 0xc) s0 = _mm512_set1_ps(src5[off7]), dc0 = _mm512_fmadd_ps(s0, w0, dc0); - if (M > 0xd) s0 = _mm512_set1_ps(src6[off7]), dd0 = _mm512_fmadd_ps(s0, w0, dd0); - } - if (M > 0x0) Save1(dst, d00, bias, params, tails), dst += dD; - if (M > 0x1) Save1(dst, d10, bias, params, tails), dst += dD; - if (M > 0x2) Save1(dst, d20, bias, params, tails), dst += dD; - if (M > 0x3) Save1(dst, d30, bias, params, tails), dst += dD; - if (M > 0x4) Save1(dst, d40, bias, params, tails), dst += dD; - if (M > 0x5) Save1(dst, d50, bias, params, tails), dst += dD; - if (M > 0x6) Save1(dst, d60, bias, params, tails), dst += dD; - if (M > 0x7) Save1(dst, d70, bias, params, tails), dst += dD; - if 
(M > 0x8) Save1(dst, d80, bias, params, tails), dst += dD; - if (M > 0x9) Save1(dst, d90, bias, params, tails), dst += dD; - if (M > 0xa) Save1(dst, da0, bias, params, tails), dst += dD; - if (M > 0xb) Save1(dst, db0, bias, params, tails), dst += dD; - if (M > 0xc) Save1(dst, dc0, bias, params, tails), dst += dD; - if (M > 0xd) Save1(dst, dd0, bias, params, tails), dst += dD; - } - } - - template ConvolutionNhwcDirect1x1_NxM_Ptr GetConvolutionNhwcDirect1x1_2xM(size_t M) - { - switch (M) - { - case 0: return NULL; - case 0x1: return ConvolutionNhwcDirect1x1_2xM; - case 0x2: return ConvolutionNhwcDirect1x1_2xM; - case 0x3: return ConvolutionNhwcDirect1x1_2xM; - case 0x4: return ConvolutionNhwcDirect1x1_2xM; - case 0x5: return ConvolutionNhwcDirect1x1_2xM; - case 0x6: return ConvolutionNhwcDirect1x1_2xM; - case 0x7: return ConvolutionNhwcDirect1x1_2xM; - case 0x8: return ConvolutionNhwcDirect1x1_2xM; - case 0x9: return ConvolutionNhwcDirect1x1_2xM; - case 0xa: return ConvolutionNhwcDirect1x1_2xM; - case 0xb: return ConvolutionNhwcDirect1x1_2xM; - case 0xc: return ConvolutionNhwcDirect1x1_2xM; - case 0xd: return ConvolutionNhwcDirect1x1_2xM; - } - assert(0); - return NULL; - } - - template void ConvolutionNhwcDirect1x1_2(const float* src, const ConvParam32f& p, const AlgParam& a, - size_t dstC, size_t yBeg, size_t yEnd, size_t srcC, const float* weight, const float* bias, const float* params, float* dst) - { - size_t n = 14, n1 = (yEnd - yBeg) * p.dstW, nn = AlignLoAny(n1, n), m = n1 - nn; - ConvolutionNhwcDirect1x1_NxM_Ptr convolutionNhwcDirect1x1_2xN = ConvolutionNhwcDirect1x1_2x14; - ConvolutionNhwcDirect1x1_NxM_Ptr convolutionNhwcDirect1x1_2xM = GetConvolutionNhwcDirect1x1_2xM(m); - - __m512 _params[2], _bias[2]; - _params[0] = _mm512_set1_ps(params[0]); - if (type == ::SimdConvolutionActivationRestrictRange || type == ::SimdConvolutionActivationHswish) - _params[1] = _mm512_set1_ps(params[1]); - - for (size_t dc = 0; dc < dstC; dc += a.microD) - { - size_t dC = Simd::Min(a.microD, dstC - dc); - __mmask16 tails[2] = { TailMask16(dC), TailMask16(dC - F) }; - if (dC > 0 * F) _bias[0] = _mm512_loadu_ps(bias + dc + 0 * F); - if (dC > 1 * F) _bias[1] = _mm512_loadu_ps(bias + dc + 1 * F); - if (type == ::SimdConvolutionActivationPrelu) - { - if (dC > 0 * F) _params[0] = _mm512_loadu_ps(params + dc + 0 * F); - if (dC > 1 * F) _params[1] = _mm512_loadu_ps(params + dc + 1 * F); - } - const float* ps = src + yBeg * p.srcW * p.srcC; - float* pd = dst + dc + yBeg * p.dstW * p.dstC; - size_t i = 0; - for (; i < nn; i += n, ps += n * p.srcC, pd += n * p.dstC) - convolutionNhwcDirect1x1_2xN(ps, p, a, srcC, weight, _bias, _params, pd, tails); - for (; i < n1; i += m, ps += m * p.srcC, pd += m * p.dstC) - convolutionNhwcDirect1x1_2xM(ps, p, a, srcC, weight, _bias, _params, pd, tails); - weight += p.srcC * a.microD; - } - } - - //--------------------------------------------------------------------- - - template static SIMD_INLINE void Set(const ConvParam32f& p, AlgParam& a) - { - a.convolutions[term] = p.Is1x1() ? 
ConvolutionNhwcDirect1x1_2 : ConvolutionNhwcDirect_2; - } - - template static SIMD_INLINE void Set(const ConvParam32f& p, AlgParam& a) - { - Set(p, a); - Set(p, a); - Set(p, a); - Set(p, a); - } - - bool SynetConvolution32fNhwcDirect::Set2r(const ConvParam32f& p, AlgParam& a) - { - assert(a.microD == 2 * F); - switch (p.activation) - { - case SimdConvolutionActivationIdentity: Set(p, a); break; - case SimdConvolutionActivationRelu: Set(p, a); break; - case SimdConvolutionActivationLeakyRelu: Set(p, a); break; - case SimdConvolutionActivationRestrictRange: Set(p, a); break; - case SimdConvolutionActivationPrelu: Set(p, a); break; - case SimdConvolutionActivationElu: Set(p, a); break; - case SimdConvolutionActivationHswish: Set(p, a); break; - default: assert(0); - } - return true; - } - } -#endif//SIMD_AVX512F_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx512fSynetConvolution32fNhwcDirect3r.cpp b/src/3rd/Simd/Simd/SimdAvx512fSynetConvolution32fNhwcDirect3r.cpp deleted file mode 100644 index 87c111d8..00000000 --- a/src/3rd/Simd/Simd/SimdAvx512fSynetConvolution32fNhwcDirect3r.cpp +++ /dev/null @@ -1,869 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdSynetConvolution32f.h" -#include "Simd/SimdSynetConvolution32fCommon.h" -#include "Simd/SimdCpu.h" - -namespace Simd -{ -#ifdef SIMD_AVX512F_ENABLE - namespace Avx512f - { - using AlgParam = SynetConvolution32fNhwcDirect::AlgParam; - - typedef void(*ConvolutionNhwcDirect_NxM_Ptr)(const float* src0, const ConvParam32f& p, const AlgParam& a, size_t dy, size_t dx, size_t srcC, const float* weight0, const __m512* bias, const __m512* params, float* dst, const __mmask16* tails); - typedef void(*ConvolutionNhwcDirect1x1_NxM_Ptr)(const float* src0, const ConvParam32f& p, const AlgParam& a, size_t srcC, const float* weight0, const __m512* bias, const __m512* params, float* dst, const __mmask16* tails); - - //--------------------------------------------------------------------- - - template void ConvolutionNhwcDirect_3x1(const float* src0, const ConvParam32f& p, - const AlgParam& a, size_t dy, size_t dx, size_t srcC, const float* weight0, const __m512* bias, const __m512* params, float* dst, const __mmask16* tails) - { - __m512 d00, d01, d02, s0, w0, w1, w2; - size_t srcH = p.srcH, srcW = p.srcW, dilY = p.dilationY, dilX = p.dilationX; - size_t dY = p.srcW * p.srcC, dX = p.srcC, dS = p.srcC * p.strideX, dW = p.srcC * F; - size_t sy = dy * p.strideY - p.padY, sx = dx * p.strideX - p.padX; - size_t kY = p.kernelY * p.dilationY, kX = p.kernelX * p.dilationX; - const float* weight1 = weight0 + a.stepW; - const float* weight2 = weight1 + a.stepW; - if (tails[2]) - { - d00 = _mm512_setzero_ps(), d01 = _mm512_setzero_ps(), d02 = _mm512_setzero_ps(); - for (size_t ky = 0; ky < kY; ky += dilY) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - if (sy + ky < srcH && sx + kx < srcW) - { - size_t offs = beg + kx * dX, end = offs + srcC, offw = 0; - for (; offs < end; ++offs, offw += F) - { - w0 = _mm512_loadu_ps(weight0 + offw); - w1 = _mm512_loadu_ps(weight1 + offw); - w2 = _mm512_loadu_ps(weight2 + offw); - s0 = _mm512_set1_ps(src0[offs]), d00 = _mm512_fmadd_ps(s0, w0, d00), d01 = _mm512_fmadd_ps(s0, w1, d01), d02 = _mm512_fmadd_ps(s0, w2, d02); - } - } - weight0 += dW, weight1 += dW, weight2 += dW; - } - } - Save3(dst, d00, d01, d02, bias, params, tails); - } - else if (tails[1]) - { - d00 = _mm512_setzero_ps(), d01 = _mm512_setzero_ps(); - for (size_t ky = 0; ky < kY; ky += dilY) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - if (sy + ky < srcH && sx + kx < srcW) - { - size_t offs = beg + kx * dX, end = offs + srcC, offw = 0; - for (; offs < end; ++offs, offw += F) - { - w0 = _mm512_loadu_ps(weight0 + offw); - w1 = _mm512_loadu_ps(weight1 + offw); - s0 = _mm512_set1_ps(src0[offs]), d00 = _mm512_fmadd_ps(s0, w0, d00), d01 = _mm512_fmadd_ps(s0, w1, d01); - } - } - weight0 += dW, weight1 += dW; - } - } - Save2(dst, d00, d01, bias, params, tails); - } - else - { - d00 = _mm512_setzero_ps(); - for (size_t ky = 0; ky < kY; ky += dilY) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - if (sy + ky < srcH && sx + kx < srcW) - { - size_t offs = beg + kx * dX, end = offs + srcC, offw = 0; - for (; offs < end; ++offs, offw += F) - { - w0 = _mm512_loadu_ps(weight0 + offw); - s0 = _mm512_set1_ps(src0[offs]), d00 = _mm512_fmadd_ps(s0, w0, d00); - } - } - weight0 += dW; - } - } - Save1(dst, d00, bias, params, tails); - } - } - - template void ConvolutionNhwcDirect_3x9(const float* src0, const ConvParam32f& p, - const AlgParam& a, size_t dy, size_t dx, size_t srcC, 
const float* weight0, const __m512* bias, const __m512* params, float* dst, const __mmask16* tails) - { - __m512 d00, d01, d02, d10, d11, d12, d20, d21, d22, d30, d31, d32, d40, d41, d42, d50, d51, d52, d60, d61, d62, d70, d71, d72, d80, d81, d82, s0, w0, w1, w2; - size_t srcH = p.srcH, srcW = p.srcW, dilY = p.dilationY, dilX = p.dilationX; - size_t dY = p.srcW * p.srcC, dX = p.srcC, dS = p.srcC * p.strideX, dW = p.srcC * F, dWz = p.kernelX * p.srcC * F, dD = p.dstC; - size_t sy = dy * p.strideY - p.padY, sx = dx * p.strideX - p.padX; - size_t kY = p.kernelY * p.dilationY, kX = p.kernelX * p.dilationX; - const float* weight1 = weight0 + a.stepW; - const float* weight2 = weight1 + a.stepW; - const float* src1 = src0 + 1 * dS; - const float* src2 = src0 + 2 * dS; - const float* src3 = src0 + 3 * dS; - const float* src4 = src0 + 4 * dS; - if (tails[2]) - { - d00 = _mm512_setzero_ps(), d01 = _mm512_setzero_ps(), d02 = _mm512_setzero_ps(); - d10 = _mm512_setzero_ps(), d11 = _mm512_setzero_ps(), d12 = _mm512_setzero_ps(); - d20 = _mm512_setzero_ps(), d21 = _mm512_setzero_ps(), d22 = _mm512_setzero_ps(); - d30 = _mm512_setzero_ps(), d31 = _mm512_setzero_ps(), d32 = _mm512_setzero_ps(); - d40 = _mm512_setzero_ps(), d41 = _mm512_setzero_ps(), d42 = _mm512_setzero_ps(); - d50 = _mm512_setzero_ps(), d51 = _mm512_setzero_ps(), d52 = _mm512_setzero_ps(); - d60 = _mm512_setzero_ps(), d61 = _mm512_setzero_ps(), d62 = _mm512_setzero_ps(); - d70 = _mm512_setzero_ps(), d71 = _mm512_setzero_ps(), d72 = _mm512_setzero_ps(); - d80 = _mm512_setzero_ps(), d81 = _mm512_setzero_ps(), d82 = _mm512_setzero_ps(); - for (size_t ky = 0; ky < kY; ky += dilY) - { - if (sy + ky < srcH) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - assert(sx + kx < srcW && sx + kx + 8 <= srcW); - size_t off0 = beg + kx * dX, end = off0 + srcC, off5 = off0 + 5 * dS, offw = 0; - for (; off0 < end; ++off0, ++off5, offw += F) - { - w0 = _mm512_loadu_ps(weight0 + offw); - w1 = _mm512_loadu_ps(weight1 + offw); - w2 = _mm512_loadu_ps(weight2 + offw); - s0 = _mm512_set1_ps(src0[off0]), d00 = _mm512_fmadd_ps(s0, w0, d00), d01 = _mm512_fmadd_ps(s0, w1, d01), d02 = _mm512_fmadd_ps(s0, w2, d02); - s0 = _mm512_set1_ps(src1[off0]), d10 = _mm512_fmadd_ps(s0, w0, d10), d11 = _mm512_fmadd_ps(s0, w1, d11), d12 = _mm512_fmadd_ps(s0, w2, d12); - s0 = _mm512_set1_ps(src2[off0]), d20 = _mm512_fmadd_ps(s0, w0, d20), d21 = _mm512_fmadd_ps(s0, w1, d21), d22 = _mm512_fmadd_ps(s0, w2, d22); - s0 = _mm512_set1_ps(src3[off0]), d30 = _mm512_fmadd_ps(s0, w0, d30), d31 = _mm512_fmadd_ps(s0, w1, d31), d32 = _mm512_fmadd_ps(s0, w2, d32); - s0 = _mm512_set1_ps(src4[off0]), d40 = _mm512_fmadd_ps(s0, w0, d40), d41 = _mm512_fmadd_ps(s0, w1, d41), d42 = _mm512_fmadd_ps(s0, w2, d42); - s0 = _mm512_set1_ps(src0[off5]), d50 = _mm512_fmadd_ps(s0, w0, d50), d51 = _mm512_fmadd_ps(s0, w1, d51), d52 = _mm512_fmadd_ps(s0, w2, d52); - s0 = _mm512_set1_ps(src1[off5]), d60 = _mm512_fmadd_ps(s0, w0, d60), d61 = _mm512_fmadd_ps(s0, w1, d61), d62 = _mm512_fmadd_ps(s0, w2, d62); - s0 = _mm512_set1_ps(src2[off5]), d70 = _mm512_fmadd_ps(s0, w0, d70), d71 = _mm512_fmadd_ps(s0, w1, d71), d72 = _mm512_fmadd_ps(s0, w2, d72); - s0 = _mm512_set1_ps(src3[off5]), d80 = _mm512_fmadd_ps(s0, w0, d80), d81 = _mm512_fmadd_ps(s0, w1, d81), d82 = _mm512_fmadd_ps(s0, w2, d82); - } - weight0 += dW, weight1 += dW, weight2 += dW; - } - } - else - weight0 += dWz, weight1 += dWz, weight2 += dWz; - } - Save3(dst, d00, d01, d02, bias, params, tails), dst += dD; - 
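// Note on the tile above: ConvolutionNhwcDirect_3x9 keeps a 9x3 grid of
// accumulators (d00..d82), i.e. nine consecutive output pixels times three
// 16-float channel groups. Together with w0..w2 and the broadcast s0 that
// occupies 31 of the 32 available ZMM registers, which is why nine pixels
// is the widest variant in this file. Each Save3 call applies bias and the
// selected activation to one pixel's 48 channels (masked by tails[]) and
// advances dst by dD = p.dstC.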
Save3(dst, d10, d11, d12, bias, params, tails), dst += dD; - Save3(dst, d20, d21, d22, bias, params, tails), dst += dD; - Save3(dst, d30, d31, d32, bias, params, tails), dst += dD; - Save3(dst, d40, d41, d42, bias, params, tails), dst += dD; - Save3(dst, d50, d51, d52, bias, params, tails), dst += dD; - Save3(dst, d60, d61, d62, bias, params, tails), dst += dD; - Save3(dst, d70, d71, d72, bias, params, tails), dst += dD; - Save3(dst, d80, d81, d82, bias, params, tails), dst += dD; - } - else if (tails[1]) - { - d00 = _mm512_setzero_ps(), d01 = _mm512_setzero_ps(); - d10 = _mm512_setzero_ps(), d11 = _mm512_setzero_ps(); - d20 = _mm512_setzero_ps(), d21 = _mm512_setzero_ps(); - d30 = _mm512_setzero_ps(), d31 = _mm512_setzero_ps(); - d40 = _mm512_setzero_ps(), d41 = _mm512_setzero_ps(); - d50 = _mm512_setzero_ps(), d51 = _mm512_setzero_ps(); - d60 = _mm512_setzero_ps(), d61 = _mm512_setzero_ps(); - d70 = _mm512_setzero_ps(), d71 = _mm512_setzero_ps(); - d80 = _mm512_setzero_ps(), d81 = _mm512_setzero_ps(); - for (size_t ky = 0; ky < kY; ky += dilY) - { - if (sy + ky < srcH) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - assert(sx + kx < srcW && sx + kx + 8 <= srcW); - size_t off0 = beg + kx * dX, end = off0 + srcC, off5 = off0 + 5 * dS, offw = 0; - for (; off0 < end; ++off0, ++off5, offw += F) - { - w0 = _mm512_loadu_ps(weight0 + offw); - w1 = _mm512_loadu_ps(weight1 + offw); - s0 = _mm512_set1_ps(src0[off0]), d00 = _mm512_fmadd_ps(s0, w0, d00), d01 = _mm512_fmadd_ps(s0, w1, d01); - s0 = _mm512_set1_ps(src1[off0]), d10 = _mm512_fmadd_ps(s0, w0, d10), d11 = _mm512_fmadd_ps(s0, w1, d11); - s0 = _mm512_set1_ps(src2[off0]), d20 = _mm512_fmadd_ps(s0, w0, d20), d21 = _mm512_fmadd_ps(s0, w1, d21); - s0 = _mm512_set1_ps(src3[off0]), d30 = _mm512_fmadd_ps(s0, w0, d30), d31 = _mm512_fmadd_ps(s0, w1, d31); - s0 = _mm512_set1_ps(src4[off0]), d40 = _mm512_fmadd_ps(s0, w0, d40), d41 = _mm512_fmadd_ps(s0, w1, d41); - s0 = _mm512_set1_ps(src0[off5]), d50 = _mm512_fmadd_ps(s0, w0, d50), d51 = _mm512_fmadd_ps(s0, w1, d51); - s0 = _mm512_set1_ps(src1[off5]), d60 = _mm512_fmadd_ps(s0, w0, d60), d61 = _mm512_fmadd_ps(s0, w1, d61); - s0 = _mm512_set1_ps(src2[off5]), d70 = _mm512_fmadd_ps(s0, w0, d70), d71 = _mm512_fmadd_ps(s0, w1, d71); - s0 = _mm512_set1_ps(src3[off5]), d80 = _mm512_fmadd_ps(s0, w0, d80), d81 = _mm512_fmadd_ps(s0, w1, d81); - } - weight0 += dW, weight1 += dW; - } - } - else - weight0 += dWz, weight1 += dWz; - } - Save2(dst, d00, d01, bias, params, tails), dst += dD; - Save2(dst, d10, d11, bias, params, tails), dst += dD; - Save2(dst, d20, d21, bias, params, tails), dst += dD; - Save2(dst, d30, d31, bias, params, tails), dst += dD; - Save2(dst, d40, d41, bias, params, tails), dst += dD; - Save2(dst, d50, d51, bias, params, tails), dst += dD; - Save2(dst, d60, d61, bias, params, tails), dst += dD; - Save2(dst, d70, d71, bias, params, tails), dst += dD; - Save2(dst, d80, d81, bias, params, tails), dst += dD; - } - else - { - d00 = _mm512_setzero_ps(); - d10 = _mm512_setzero_ps(); - d20 = _mm512_setzero_ps(); - d30 = _mm512_setzero_ps(); - d40 = _mm512_setzero_ps(); - d50 = _mm512_setzero_ps(); - d60 = _mm512_setzero_ps(); - d70 = _mm512_setzero_ps(); - d80 = _mm512_setzero_ps(); - for (size_t ky = 0; ky < kY; ky += dilY) - { - if (sy + ky < srcH) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - assert(sx + kx < srcW && sx + kx + 8 <= srcW); - size_t off0 = beg + kx * dX, end = off0 + srcC, off5 = off0 + 5 * 
dS, offw = 0; - for (; off0 < end; ++off0, ++off5, offw += F) - { - w0 = _mm512_loadu_ps(weight0 + offw); - s0 = _mm512_set1_ps(src0[off0]), d00 = _mm512_fmadd_ps(s0, w0, d00); - s0 = _mm512_set1_ps(src1[off0]), d10 = _mm512_fmadd_ps(s0, w0, d10); - s0 = _mm512_set1_ps(src2[off0]), d20 = _mm512_fmadd_ps(s0, w0, d20); - s0 = _mm512_set1_ps(src3[off0]), d30 = _mm512_fmadd_ps(s0, w0, d30); - s0 = _mm512_set1_ps(src4[off0]), d40 = _mm512_fmadd_ps(s0, w0, d40); - s0 = _mm512_set1_ps(src0[off5]), d50 = _mm512_fmadd_ps(s0, w0, d50); - s0 = _mm512_set1_ps(src1[off5]), d60 = _mm512_fmadd_ps(s0, w0, d60); - s0 = _mm512_set1_ps(src2[off5]), d70 = _mm512_fmadd_ps(s0, w0, d70); - s0 = _mm512_set1_ps(src3[off5]), d80 = _mm512_fmadd_ps(s0, w0, d80); - } - weight0 += dW; - } - } - else - weight0 += dWz; - } - Save1(dst, d00, bias, params, tails), dst += dD; - Save1(dst, d10, bias, params, tails), dst += dD; - Save1(dst, d20, bias, params, tails), dst += dD; - Save1(dst, d30, bias, params, tails), dst += dD; - Save1(dst, d40, bias, params, tails), dst += dD; - Save1(dst, d50, bias, params, tails), dst += dD; - Save1(dst, d60, bias, params, tails), dst += dD; - Save1(dst, d70, bias, params, tails), dst += dD; - Save1(dst, d80, bias, params, tails), dst += dD; - } - } - - template void ConvolutionNhwcDirect_3xM(const float* src0, const ConvParam32f& p, - const AlgParam& a, size_t dy, size_t dx, size_t srcC, const float* weight0, const __m512* bias, const __m512* params, float* dst, const __mmask16* tails) - { - __m512 d00, d01, d02, d10, d11, d12, d20, d21, d22, d30, d31, d32, d40, d41, d42, d50, d51, d52, d60, d61, d62, d70, d71, d72, d80, d81, d82, s0, w0, w1, w2; - size_t srcH = p.srcH, srcW = p.srcW, dilY = p.dilationY, dilX = p.dilationX; - size_t dY = p.srcW * p.srcC, dX = p.srcC, dS = p.srcC * p.strideX, dW = p.srcC * F, dWz = p.kernelX * p.srcC * F, dD = p.dstC; - size_t sy = dy * p.strideY - p.padY, sx = dx * p.strideX - p.padX; - size_t kY = p.kernelY * p.dilationY, kX = p.kernelX * p.dilationX; - const float* weight1 = weight0 + a.stepW; - const float* weight2 = weight1 + a.stepW; - const float* src1 = src0 + 1 * dS; - const float* src2 = src0 + 2 * dS; - const float* src3 = src0 + 3 * dS; - const float* src4 = src0 + 4 * dS; - if (tails[2]) - { - if (M > 0) d00 = _mm512_setzero_ps(), d01 = _mm512_setzero_ps(), d02 = _mm512_setzero_ps(); - if (M > 1) d10 = _mm512_setzero_ps(), d11 = _mm512_setzero_ps(), d12 = _mm512_setzero_ps(); - if (M > 2) d20 = _mm512_setzero_ps(), d21 = _mm512_setzero_ps(), d22 = _mm512_setzero_ps(); - if (M > 3) d30 = _mm512_setzero_ps(), d31 = _mm512_setzero_ps(), d32 = _mm512_setzero_ps(); - if (M > 4) d40 = _mm512_setzero_ps(), d41 = _mm512_setzero_ps(), d42 = _mm512_setzero_ps(); - if (M > 5) d50 = _mm512_setzero_ps(), d51 = _mm512_setzero_ps(), d52 = _mm512_setzero_ps(); - if (M > 6) d60 = _mm512_setzero_ps(), d61 = _mm512_setzero_ps(), d62 = _mm512_setzero_ps(); - if (M > 7) d70 = _mm512_setzero_ps(), d71 = _mm512_setzero_ps(), d72 = _mm512_setzero_ps(); - if (M > 8) d80 = _mm512_setzero_ps(), d81 = _mm512_setzero_ps(), d82 = _mm512_setzero_ps(); - for (size_t ky = 0; ky < kY; ky += dilY) - { - if (sy + ky < srcH) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - assert(sx + kx < srcW && sx + kx + M <= srcW); - size_t off0 = beg + kx * dX, end = off0 + srcC, off5 = off0 + 5 * dS, offw = 0; - for (; off0 < end; ++off0, ++off5, offw += F) - { - w0 = _mm512_loadu_ps(weight0 + offw); - w1 = _mm512_loadu_ps(weight1 + offw); - w2 = 
_mm512_loadu_ps(weight2 + offw); - if (M > 0) s0 = _mm512_set1_ps(src0[off0]), d00 = _mm512_fmadd_ps(s0, w0, d00), d01 = _mm512_fmadd_ps(s0, w1, d01), d02 = _mm512_fmadd_ps(s0, w2, d02); - if (M > 1) s0 = _mm512_set1_ps(src1[off0]), d10 = _mm512_fmadd_ps(s0, w0, d10), d11 = _mm512_fmadd_ps(s0, w1, d11), d12 = _mm512_fmadd_ps(s0, w2, d12); - if (M > 2) s0 = _mm512_set1_ps(src2[off0]), d20 = _mm512_fmadd_ps(s0, w0, d20), d21 = _mm512_fmadd_ps(s0, w1, d21), d22 = _mm512_fmadd_ps(s0, w2, d22); - if (M > 3) s0 = _mm512_set1_ps(src3[off0]), d30 = _mm512_fmadd_ps(s0, w0, d30), d31 = _mm512_fmadd_ps(s0, w1, d31), d32 = _mm512_fmadd_ps(s0, w2, d32); - if (M > 4) s0 = _mm512_set1_ps(src4[off0]), d40 = _mm512_fmadd_ps(s0, w0, d40), d41 = _mm512_fmadd_ps(s0, w1, d41), d42 = _mm512_fmadd_ps(s0, w2, d42); - if (M > 5) s0 = _mm512_set1_ps(src0[off5]), d50 = _mm512_fmadd_ps(s0, w0, d50), d51 = _mm512_fmadd_ps(s0, w1, d51), d52 = _mm512_fmadd_ps(s0, w2, d52); - if (M > 6) s0 = _mm512_set1_ps(src1[off5]), d60 = _mm512_fmadd_ps(s0, w0, d60), d61 = _mm512_fmadd_ps(s0, w1, d61), d62 = _mm512_fmadd_ps(s0, w2, d62); - if (M > 7) s0 = _mm512_set1_ps(src2[off5]), d70 = _mm512_fmadd_ps(s0, w0, d70), d71 = _mm512_fmadd_ps(s0, w1, d71), d72 = _mm512_fmadd_ps(s0, w2, d72); - if (M > 8) s0 = _mm512_set1_ps(src3[off5]), d80 = _mm512_fmadd_ps(s0, w0, d80), d81 = _mm512_fmadd_ps(s0, w1, d81), d82 = _mm512_fmadd_ps(s0, w2, d82); - } - weight0 += dW, weight1 += dW, weight2 += dW; - } - } - else - weight0 += dWz, weight1 += dWz, weight2 += dWz; - } - if (M > 0) Save3(dst, d00, d01, d02, bias, params, tails), dst += dD; - if (M > 1) Save3(dst, d10, d11, d12, bias, params, tails), dst += dD; - if (M > 2) Save3(dst, d20, d21, d22, bias, params, tails), dst += dD; - if (M > 3) Save3(dst, d30, d31, d32, bias, params, tails), dst += dD; - if (M > 4) Save3(dst, d40, d41, d42, bias, params, tails), dst += dD; - if (M > 5) Save3(dst, d50, d51, d52, bias, params, tails), dst += dD; - if (M > 6) Save3(dst, d60, d61, d62, bias, params, tails), dst += dD; - if (M > 7) Save3(dst, d70, d71, d72, bias, params, tails), dst += dD; - if (M > 8) Save3(dst, d80, d81, d82, bias, params, tails), dst += dD; - } - else if (tails[1]) - { - if (M > 0) d00 = _mm512_setzero_ps(), d01 = _mm512_setzero_ps(); - if (M > 1) d10 = _mm512_setzero_ps(), d11 = _mm512_setzero_ps(); - if (M > 2) d20 = _mm512_setzero_ps(), d21 = _mm512_setzero_ps(); - if (M > 3) d30 = _mm512_setzero_ps(), d31 = _mm512_setzero_ps(); - if (M > 4) d40 = _mm512_setzero_ps(), d41 = _mm512_setzero_ps(); - if (M > 5) d50 = _mm512_setzero_ps(), d51 = _mm512_setzero_ps(); - if (M > 6) d60 = _mm512_setzero_ps(), d61 = _mm512_setzero_ps(); - if (M > 7) d70 = _mm512_setzero_ps(), d71 = _mm512_setzero_ps(); - if (M > 8) d80 = _mm512_setzero_ps(), d81 = _mm512_setzero_ps(); - for (size_t ky = 0; ky < kY; ky += dilY) - { - if (sy + ky < srcH) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - assert(sx + kx < srcW && sx + kx + M <= srcW); - size_t off0 = beg + kx * dX, end = off0 + srcC, off5 = off0 + 5 * dS, offw = 0; - for (; off0 < end; ++off0, ++off5, offw += F) - { - w0 = _mm512_loadu_ps(weight0 + offw); - w1 = _mm512_loadu_ps(weight1 + offw); - if (M > 0) s0 = _mm512_set1_ps(src0[off0]), d00 = _mm512_fmadd_ps(s0, w0, d00), d01 = _mm512_fmadd_ps(s0, w1, d01); - if (M > 1) s0 = _mm512_set1_ps(src1[off0]), d10 = _mm512_fmadd_ps(s0, w0, d10), d11 = _mm512_fmadd_ps(s0, w1, d11); - if (M > 2) s0 = _mm512_set1_ps(src2[off0]), d20 = _mm512_fmadd_ps(s0, w0, 
d20), d21 = _mm512_fmadd_ps(s0, w1, d21); - if (M > 3) s0 = _mm512_set1_ps(src3[off0]), d30 = _mm512_fmadd_ps(s0, w0, d30), d31 = _mm512_fmadd_ps(s0, w1, d31); - if (M > 4) s0 = _mm512_set1_ps(src4[off0]), d40 = _mm512_fmadd_ps(s0, w0, d40), d41 = _mm512_fmadd_ps(s0, w1, d41); - if (M > 5) s0 = _mm512_set1_ps(src0[off5]), d50 = _mm512_fmadd_ps(s0, w0, d50), d51 = _mm512_fmadd_ps(s0, w1, d51); - if (M > 6) s0 = _mm512_set1_ps(src1[off5]), d60 = _mm512_fmadd_ps(s0, w0, d60), d61 = _mm512_fmadd_ps(s0, w1, d61); - if (M > 7) s0 = _mm512_set1_ps(src2[off5]), d70 = _mm512_fmadd_ps(s0, w0, d70), d71 = _mm512_fmadd_ps(s0, w1, d71); - if (M > 8) s0 = _mm512_set1_ps(src3[off5]), d80 = _mm512_fmadd_ps(s0, w0, d80), d81 = _mm512_fmadd_ps(s0, w1, d81); - } - weight0 += dW, weight1 += dW; - } - } - else - weight0 += dWz, weight1 += dWz; - } - if (M > 0) Save2(dst, d00, d01, bias, params, tails), dst += dD; - if (M > 1) Save2(dst, d10, d11, bias, params, tails), dst += dD; - if (M > 2) Save2(dst, d20, d21, bias, params, tails), dst += dD; - if (M > 3) Save2(dst, d30, d31, bias, params, tails), dst += dD; - if (M > 4) Save2(dst, d40, d41, bias, params, tails), dst += dD; - if (M > 5) Save2(dst, d50, d51, bias, params, tails), dst += dD; - if (M > 6) Save2(dst, d60, d61, bias, params, tails), dst += dD; - if (M > 7) Save2(dst, d70, d71, bias, params, tails), dst += dD; - if (M > 8) Save2(dst, d80, d81, bias, params, tails), dst += dD; - } - else - { - if (M > 0) d00 = _mm512_setzero_ps(); - if (M > 1) d10 = _mm512_setzero_ps(); - if (M > 2) d20 = _mm512_setzero_ps(); - if (M > 3) d30 = _mm512_setzero_ps(); - if (M > 4) d40 = _mm512_setzero_ps(); - if (M > 5) d50 = _mm512_setzero_ps(); - if (M > 6) d60 = _mm512_setzero_ps(); - if (M > 7) d70 = _mm512_setzero_ps(); - if (M > 8) d80 = _mm512_setzero_ps(); - for (size_t ky = 0; ky < kY; ky += dilY) - { - if (sy + ky < srcH) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - assert(sx + kx < srcW && sx + kx + M <= srcW); - size_t off0 = beg + kx * dX, end = off0 + srcC, off5 = off0 + 5 * dS, offw = 0; - for (; off0 < end; ++off0, ++off5, offw += F) - { - w0 = _mm512_loadu_ps(weight0 + offw); - if (M > 0) s0 = _mm512_set1_ps(src0[off0]), d00 = _mm512_fmadd_ps(s0, w0, d00); - if (M > 1) s0 = _mm512_set1_ps(src1[off0]), d10 = _mm512_fmadd_ps(s0, w0, d10); - if (M > 2) s0 = _mm512_set1_ps(src2[off0]), d20 = _mm512_fmadd_ps(s0, w0, d20); - if (M > 3) s0 = _mm512_set1_ps(src3[off0]), d30 = _mm512_fmadd_ps(s0, w0, d30); - if (M > 4) s0 = _mm512_set1_ps(src4[off0]), d40 = _mm512_fmadd_ps(s0, w0, d40); - if (M > 5) s0 = _mm512_set1_ps(src0[off5]), d50 = _mm512_fmadd_ps(s0, w0, d50); - if (M > 6) s0 = _mm512_set1_ps(src1[off5]), d60 = _mm512_fmadd_ps(s0, w0, d60); - if (M > 7) s0 = _mm512_set1_ps(src2[off5]), d70 = _mm512_fmadd_ps(s0, w0, d70); - if (M > 8) s0 = _mm512_set1_ps(src3[off5]), d80 = _mm512_fmadd_ps(s0, w0, d80); - } - weight0 += dW; - } - } - else - weight0 += dWz; - } - if (M > 0) Save1(dst, d00, bias, params, tails), dst += dD; - if (M > 1) Save1(dst, d10, bias, params, tails), dst += dD; - if (M > 2) Save1(dst, d20, bias, params, tails), dst += dD; - if (M > 3) Save1(dst, d30, bias, params, tails), dst += dD; - if (M > 4) Save1(dst, d40, bias, params, tails), dst += dD; - if (M > 5) Save1(dst, d50, bias, params, tails), dst += dD; - if (M > 6) Save1(dst, d60, bias, params, tails), dst += dD; - if (M > 7) Save1(dst, d70, bias, params, tails), dst += dD; - if (M > 8) Save1(dst, d80, bias, params, tails), dst += dD; 
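// In ConvolutionNhwcDirect_3xM, M is a template parameter, so every
// "if (M > k)" above is resolved at instantiation time: each M yields a
// dedicated remainder kernel holding exactly M pixels of state, with no
// runtime branching in the hot loop. GetConvolutionNhwcDirect_3xM below
// simply selects the instantiation that matches the leftover row width.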
- } - } - - template ConvolutionNhwcDirect_NxM_Ptr GetConvolutionNhwcDirect_3xM(size_t M) - { - switch (M) - { - case 0: return NULL; - case 1: return ConvolutionNhwcDirect_3xM; - case 2: return ConvolutionNhwcDirect_3xM; - case 3: return ConvolutionNhwcDirect_3xM; - case 4: return ConvolutionNhwcDirect_3xM; - case 5: return ConvolutionNhwcDirect_3xM; - case 6: return ConvolutionNhwcDirect_3xM; - case 7: return ConvolutionNhwcDirect_3xM; - case 8: return ConvolutionNhwcDirect_3xM; - case 9: return ConvolutionNhwcDirect_3xM; - } - assert(0); - return NULL; - } - - template void ConvolutionNhwcDirect_3(const float* src, const ConvParam32f& p, const AlgParam& a, - size_t dstC, size_t yBeg, size_t yEnd, size_t srcC, const float* weight, const float* bias, const float* params, float* dst) - { - size_t noseH = p.NoseH(), noseW = p.NoseW(), bodyH = p.BodyH(), bodyW = p.BodyW(); - size_t n = 9, bodyWn = AlignLoAny(bodyW - noseW, n) + noseW, m = bodyW - bodyWn; - ConvolutionNhwcDirect_NxM_Ptr convolutionNhwcDirect_3x1 = ConvolutionNhwcDirect_3x1; - ConvolutionNhwcDirect_NxM_Ptr convolutionNhwcDirect_3xN = ConvolutionNhwcDirect_3x9; - ConvolutionNhwcDirect_NxM_Ptr convolutionNhwcDirect_3xM = GetConvolutionNhwcDirect_3xM(m); - size_t tailH = p.dstH, tailW = p.dstW; - size_t kY = p.kernelY - noseH, kX = p.kernelX - noseW, kH = bodyH + p.kernelY - 1, kW = bodyW + p.kernelX - 1; - - __m512 _params[3], _bias[3]; - _params[0] = _mm512_set1_ps(params[0]); - if (type == ::SimdConvolutionActivationRestrictRange || type == ::SimdConvolutionActivationHswish) - _params[1] = _mm512_set1_ps(params[1]); - - for (size_t dc = 0; dc < dstC; dc += a.microD) - { - size_t dC = Simd::Min(a.microD, dstC - dc); - __mmask16 tails[3] = { TailMask16(dC - 0 * F), TailMask16(dC - 1 * F), TailMask16(dC - 2 * F) }; - if (dC > 0 * F) _bias[0] = _mm512_loadu_ps(bias + dc + 0 * F); - if (dC > 1 * F) _bias[1] = _mm512_loadu_ps(bias + dc + 1 * F); - if (dC > 2 * F) _bias[2] = _mm512_loadu_ps(bias + dc + 2 * F); - if (type == ::SimdConvolutionActivationPrelu) - { - if (dC > 0 * F) _params[0] = _mm512_loadu_ps(params + dc + 0 * F); - if (dC > 1 * F) _params[1] = _mm512_loadu_ps(params + dc + 1 * F); - if (dC > 2 * F) _params[2] = _mm512_loadu_ps(params + dc + 2 * F); - } - float* d = dst + dc + yBeg * p.dstW * p.dstC; - size_t dy = yBeg; - for (; dy < noseH && dy < yEnd; dy++) - { - size_t dx = 0; - for (; dx < noseW; dx++, d += p.dstC) - convolutionNhwcDirect_3x1(src, p, a, dy, dx, srcC, weight, _bias, _params, d, tails); - for (; dx < bodyWn; dx += n, d += p.dstC * n) - convolutionNhwcDirect_3xN(src, p, a, dy, dx, srcC, weight, _bias, _params, d, tails); - for (; dx < bodyW; dx += m, d += p.dstC * m) - convolutionNhwcDirect_3xM(src, p, a, dy, dx, srcC, weight, _bias, _params, d, tails); - for (; dx < tailW; dx++, d += p.dstC) - convolutionNhwcDirect_3x1(src, p, a, dy, dx, srcC, weight, _bias, _params, d, tails); - } - for (; dy < bodyH && dy < yEnd; dy++) - { - size_t dx = 0; - for (; dx < noseW; dx++, d += p.dstC) - convolutionNhwcDirect_3x1(src, p, a, dy, dx, srcC, weight, _bias, _params, d, tails); - for (; dx < bodyWn; dx += n, d += p.dstC * n) - convolutionNhwcDirect_3xN(src, p, a, dy, dx, srcC, weight, _bias, _params, d, tails); - for (; dx < bodyW; dx += m, d += p.dstC * m) - convolutionNhwcDirect_3xM(src, p, a, dy, dx, srcC, weight, _bias, _params, d, tails); - for (; dx < tailW; dx++, d += p.dstC) - convolutionNhwcDirect_3x1(src, p, a, dy, dx, srcC, weight, _bias, _params, d, tails); - } - for (; dy < tailH && dy < yEnd; 
dy++) - { - size_t dx = 0; - for (; dx < noseW; dx++, d += p.dstC) - convolutionNhwcDirect_3x1(src, p, a, dy, dx, srcC, weight, _bias, _params, d, tails); - for (; dx < bodyWn; dx += n, d += p.dstC * n) - convolutionNhwcDirect_3xN(src, p, a, dy, dx, srcC, weight, _bias, _params, d, tails); - for (; dx < bodyW; dx += m, d += p.dstC * m) - convolutionNhwcDirect_3xM(src, p, a, dy, dx, srcC, weight, _bias, _params, d, tails); - for (; dx < tailW; dx++, d += p.dstC) - convolutionNhwcDirect_3x1(src, p, a, dy, dx, srcC, weight, _bias, _params, d, tails); - } - weight += p.kernelY * p.kernelX * p.srcC * a.microD; - } - } - - //--------------------------------------------------------------------- - - template void ConvolutionNhwcDirect1x1_3x9(const float* src0, const ConvParam32f& p, - const AlgParam& a, size_t srcC, const float* weight0, const __m512* bias, const __m512* params, float* dst, const __mmask16* tails) - { - __m512 d00, d01, d02, d10, d11, d12, d20, d21, d22, d30, d31, d32, d40, d41, d42, d50, d51, d52, d60, d61, d62, d70, d71, d72, d80, d81, d82, s0, w0, w1, w2; - size_t dS = p.srcC, dD = p.dstC; - const float* weight1 = weight0 + a.stepW; - const float* weight2 = weight1 + a.stepW; - const float* src1 = src0 + 1 * dS; - const float* src2 = src0 + 2 * dS; - const float* src3 = src0 + 3 * dS; - const float* src4 = src0 + 4 * dS; - if (tails[2]) - { - d00 = _mm512_setzero_ps(), d01 = _mm512_setzero_ps(), d02 = _mm512_setzero_ps(); - d10 = _mm512_setzero_ps(), d11 = _mm512_setzero_ps(), d12 = _mm512_setzero_ps(); - d20 = _mm512_setzero_ps(), d21 = _mm512_setzero_ps(), d22 = _mm512_setzero_ps(); - d30 = _mm512_setzero_ps(), d31 = _mm512_setzero_ps(), d32 = _mm512_setzero_ps(); - d40 = _mm512_setzero_ps(), d41 = _mm512_setzero_ps(), d42 = _mm512_setzero_ps(); - d50 = _mm512_setzero_ps(), d51 = _mm512_setzero_ps(), d52 = _mm512_setzero_ps(); - d60 = _mm512_setzero_ps(), d61 = _mm512_setzero_ps(), d62 = _mm512_setzero_ps(); - d70 = _mm512_setzero_ps(), d71 = _mm512_setzero_ps(), d72 = _mm512_setzero_ps(); - d80 = _mm512_setzero_ps(), d81 = _mm512_setzero_ps(), d82 = _mm512_setzero_ps(); - for (size_t off0 = 0, off5 = 5 * dS, offw = 0; off0 < srcC; ++off0, ++off5, offw += F) - { - w0 = _mm512_loadu_ps(weight0 + offw); - w1 = _mm512_loadu_ps(weight1 + offw); - w2 = _mm512_loadu_ps(weight2 + offw); - s0 = _mm512_set1_ps(src0[off0]), d00 = _mm512_fmadd_ps(s0, w0, d00), d01 = _mm512_fmadd_ps(s0, w1, d01), d02 = _mm512_fmadd_ps(s0, w2, d02); - s0 = _mm512_set1_ps(src1[off0]), d10 = _mm512_fmadd_ps(s0, w0, d10), d11 = _mm512_fmadd_ps(s0, w1, d11), d12 = _mm512_fmadd_ps(s0, w2, d12); - s0 = _mm512_set1_ps(src2[off0]), d20 = _mm512_fmadd_ps(s0, w0, d20), d21 = _mm512_fmadd_ps(s0, w1, d21), d22 = _mm512_fmadd_ps(s0, w2, d22); - s0 = _mm512_set1_ps(src3[off0]), d30 = _mm512_fmadd_ps(s0, w0, d30), d31 = _mm512_fmadd_ps(s0, w1, d31), d32 = _mm512_fmadd_ps(s0, w2, d32); - s0 = _mm512_set1_ps(src4[off0]), d40 = _mm512_fmadd_ps(s0, w0, d40), d41 = _mm512_fmadd_ps(s0, w1, d41), d42 = _mm512_fmadd_ps(s0, w2, d42); - s0 = _mm512_set1_ps(src0[off5]), d50 = _mm512_fmadd_ps(s0, w0, d50), d51 = _mm512_fmadd_ps(s0, w1, d51), d52 = _mm512_fmadd_ps(s0, w2, d52); - s0 = _mm512_set1_ps(src1[off5]), d60 = _mm512_fmadd_ps(s0, w0, d60), d61 = _mm512_fmadd_ps(s0, w1, d61), d62 = _mm512_fmadd_ps(s0, w2, d62); - s0 = _mm512_set1_ps(src2[off5]), d70 = _mm512_fmadd_ps(s0, w0, d70), d71 = _mm512_fmadd_ps(s0, w1, d71), d72 = _mm512_fmadd_ps(s0, w2, d72); - s0 = _mm512_set1_ps(src3[off5]), d80 = _mm512_fmadd_ps(s0, w0, d80), d81 
= _mm512_fmadd_ps(s0, w1, d81), d82 = _mm512_fmadd_ps(s0, w2, d82); - } - Save3(dst, d00, d01, d02, bias, params, tails), dst += dD; - Save3(dst, d10, d11, d12, bias, params, tails), dst += dD; - Save3(dst, d20, d21, d22, bias, params, tails), dst += dD; - Save3(dst, d30, d31, d32, bias, params, tails), dst += dD; - Save3(dst, d40, d41, d42, bias, params, tails), dst += dD; - Save3(dst, d50, d51, d52, bias, params, tails), dst += dD; - Save3(dst, d60, d61, d62, bias, params, tails), dst += dD; - Save3(dst, d70, d71, d72, bias, params, tails), dst += dD; - Save3(dst, d80, d81, d82, bias, params, tails), dst += dD; - } - else if (tails[1]) - { - d00 = _mm512_setzero_ps(), d01 = _mm512_setzero_ps(); - d10 = _mm512_setzero_ps(), d11 = _mm512_setzero_ps(); - d20 = _mm512_setzero_ps(), d21 = _mm512_setzero_ps(); - d30 = _mm512_setzero_ps(), d31 = _mm512_setzero_ps(); - d40 = _mm512_setzero_ps(), d41 = _mm512_setzero_ps(); - d50 = _mm512_setzero_ps(), d51 = _mm512_setzero_ps(); - d60 = _mm512_setzero_ps(), d61 = _mm512_setzero_ps(); - d70 = _mm512_setzero_ps(), d71 = _mm512_setzero_ps(); - d80 = _mm512_setzero_ps(), d81 = _mm512_setzero_ps(); - for (size_t off0 = 0, off5 = 5 * dS, offw = 0; off0 < srcC; ++off0, ++off5, offw += F) - { - w0 = _mm512_loadu_ps(weight0 + offw); - w1 = _mm512_loadu_ps(weight1 + offw); - s0 = _mm512_set1_ps(src0[off0]), d00 = _mm512_fmadd_ps(s0, w0, d00), d01 = _mm512_fmadd_ps(s0, w1, d01); - s0 = _mm512_set1_ps(src1[off0]), d10 = _mm512_fmadd_ps(s0, w0, d10), d11 = _mm512_fmadd_ps(s0, w1, d11); - s0 = _mm512_set1_ps(src2[off0]), d20 = _mm512_fmadd_ps(s0, w0, d20), d21 = _mm512_fmadd_ps(s0, w1, d21); - s0 = _mm512_set1_ps(src3[off0]), d30 = _mm512_fmadd_ps(s0, w0, d30), d31 = _mm512_fmadd_ps(s0, w1, d31); - s0 = _mm512_set1_ps(src4[off0]), d40 = _mm512_fmadd_ps(s0, w0, d40), d41 = _mm512_fmadd_ps(s0, w1, d41); - s0 = _mm512_set1_ps(src0[off5]), d50 = _mm512_fmadd_ps(s0, w0, d50), d51 = _mm512_fmadd_ps(s0, w1, d51); - s0 = _mm512_set1_ps(src1[off5]), d60 = _mm512_fmadd_ps(s0, w0, d60), d61 = _mm512_fmadd_ps(s0, w1, d61); - s0 = _mm512_set1_ps(src2[off5]), d70 = _mm512_fmadd_ps(s0, w0, d70), d71 = _mm512_fmadd_ps(s0, w1, d71); - s0 = _mm512_set1_ps(src3[off5]), d80 = _mm512_fmadd_ps(s0, w0, d80), d81 = _mm512_fmadd_ps(s0, w1, d81); - } - Save2(dst, d00, d01, bias, params, tails), dst += dD; - Save2(dst, d10, d11, bias, params, tails), dst += dD; - Save2(dst, d20, d21, bias, params, tails), dst += dD; - Save2(dst, d30, d31, bias, params, tails), dst += dD; - Save2(dst, d40, d41, bias, params, tails), dst += dD; - Save2(dst, d50, d51, bias, params, tails), dst += dD; - Save2(dst, d60, d61, bias, params, tails), dst += dD; - Save2(dst, d70, d71, bias, params, tails), dst += dD; - Save2(dst, d80, d81, bias, params, tails), dst += dD; - } - else - { - d00 = _mm512_setzero_ps(); - d10 = _mm512_setzero_ps(); - d20 = _mm512_setzero_ps(); - d30 = _mm512_setzero_ps(); - d40 = _mm512_setzero_ps(); - d50 = _mm512_setzero_ps(); - d60 = _mm512_setzero_ps(); - d70 = _mm512_setzero_ps(); - d80 = _mm512_setzero_ps(); - for (size_t off0 = 0, off5 = 5 * dS, offw = 0; off0 < srcC; ++off0, ++off5, offw += F) - { - w0 = _mm512_loadu_ps(weight0 + offw); - s0 = _mm512_set1_ps(src0[off0]), d00 = _mm512_fmadd_ps(s0, w0, d00); - s0 = _mm512_set1_ps(src1[off0]), d10 = _mm512_fmadd_ps(s0, w0, d10); - s0 = _mm512_set1_ps(src2[off0]), d20 = _mm512_fmadd_ps(s0, w0, d20); - s0 = _mm512_set1_ps(src3[off0]), d30 = _mm512_fmadd_ps(s0, w0, d30); - s0 = _mm512_set1_ps(src4[off0]), d40 = _mm512_fmadd_ps(s0, 
w0, d40); - s0 = _mm512_set1_ps(src0[off5]), d50 = _mm512_fmadd_ps(s0, w0, d50); - s0 = _mm512_set1_ps(src1[off5]), d60 = _mm512_fmadd_ps(s0, w0, d60); - s0 = _mm512_set1_ps(src2[off5]), d70 = _mm512_fmadd_ps(s0, w0, d70); - s0 = _mm512_set1_ps(src3[off5]), d80 = _mm512_fmadd_ps(s0, w0, d80); - } - Save1(dst, d00, bias, params, tails), dst += dD; - Save1(dst, d10, bias, params, tails), dst += dD; - Save1(dst, d20, bias, params, tails), dst += dD; - Save1(dst, d30, bias, params, tails), dst += dD; - Save1(dst, d40, bias, params, tails), dst += dD; - Save1(dst, d50, bias, params, tails), dst += dD; - Save1(dst, d60, bias, params, tails), dst += dD; - Save1(dst, d70, bias, params, tails), dst += dD; - Save1(dst, d80, bias, params, tails), dst += dD; - } - } - - template void ConvolutionNhwcDirect1x1_3xM(const float* src0, const ConvParam32f& p, - const AlgParam& a, size_t srcC, const float* weight0, const __m512* bias, const __m512* params, float* dst, const __mmask16* tails) - { - __m512 d00, d01, d02, d10, d11, d12, d20, d21, d22, d30, d31, d32, d40, d41, d42, d50, d51, d52, d60, d61, d62, d70, d71, d72, d80, d81, d82, s0, w0, w1, w2; - size_t dS = p.srcC, dD = p.dstC; - const float* weight1 = weight0 + a.stepW; - const float* weight2 = weight1 + a.stepW; - const float* src1 = src0 + 1 * dS; - const float* src2 = src0 + 2 * dS; - const float* src3 = src0 + 3 * dS; - const float* src4 = src0 + 4 * dS; - if (tails[2]) - { - if (M > 0) d00 = _mm512_setzero_ps(), d01 = _mm512_setzero_ps(), d02 = _mm512_setzero_ps(); - if (M > 1) d10 = _mm512_setzero_ps(), d11 = _mm512_setzero_ps(), d12 = _mm512_setzero_ps(); - if (M > 2) d20 = _mm512_setzero_ps(), d21 = _mm512_setzero_ps(), d22 = _mm512_setzero_ps(); - if (M > 3) d30 = _mm512_setzero_ps(), d31 = _mm512_setzero_ps(), d32 = _mm512_setzero_ps(); - if (M > 4) d40 = _mm512_setzero_ps(), d41 = _mm512_setzero_ps(), d42 = _mm512_setzero_ps(); - if (M > 5) d50 = _mm512_setzero_ps(), d51 = _mm512_setzero_ps(), d52 = _mm512_setzero_ps(); - if (M > 6) d60 = _mm512_setzero_ps(), d61 = _mm512_setzero_ps(), d62 = _mm512_setzero_ps(); - if (M > 7) d70 = _mm512_setzero_ps(), d71 = _mm512_setzero_ps(), d72 = _mm512_setzero_ps(); - if (M > 8) d80 = _mm512_setzero_ps(), d81 = _mm512_setzero_ps(), d82 = _mm512_setzero_ps(); - for (size_t off0 = 0, off5 = 5 * dS, offw = 0; off0 < srcC; ++off0, ++off5, offw += F) - { - w0 = _mm512_loadu_ps(weight0 + offw); - w1 = _mm512_loadu_ps(weight1 + offw); - w2 = _mm512_loadu_ps(weight2 + offw); - if (M > 0) s0 = _mm512_set1_ps(src0[off0]), d00 = _mm512_fmadd_ps(s0, w0, d00), d01 = _mm512_fmadd_ps(s0, w1, d01), d02 = _mm512_fmadd_ps(s0, w2, d02); - if (M > 1) s0 = _mm512_set1_ps(src1[off0]), d10 = _mm512_fmadd_ps(s0, w0, d10), d11 = _mm512_fmadd_ps(s0, w1, d11), d12 = _mm512_fmadd_ps(s0, w2, d12); - if (M > 2) s0 = _mm512_set1_ps(src2[off0]), d20 = _mm512_fmadd_ps(s0, w0, d20), d21 = _mm512_fmadd_ps(s0, w1, d21), d22 = _mm512_fmadd_ps(s0, w2, d22); - if (M > 3) s0 = _mm512_set1_ps(src3[off0]), d30 = _mm512_fmadd_ps(s0, w0, d30), d31 = _mm512_fmadd_ps(s0, w1, d31), d32 = _mm512_fmadd_ps(s0, w2, d32); - if (M > 4) s0 = _mm512_set1_ps(src4[off0]), d40 = _mm512_fmadd_ps(s0, w0, d40), d41 = _mm512_fmadd_ps(s0, w1, d41), d42 = _mm512_fmadd_ps(s0, w2, d42); - if (M > 5) s0 = _mm512_set1_ps(src0[off5]), d50 = _mm512_fmadd_ps(s0, w0, d50), d51 = _mm512_fmadd_ps(s0, w1, d51), d52 = _mm512_fmadd_ps(s0, w2, d52); - if (M > 6) s0 = _mm512_set1_ps(src1[off5]), d60 = _mm512_fmadd_ps(s0, w0, d60), d61 = _mm512_fmadd_ps(s0, w1, d61), d62 = 
_mm512_fmadd_ps(s0, w2, d62); - if (M > 7) s0 = _mm512_set1_ps(src2[off5]), d70 = _mm512_fmadd_ps(s0, w0, d70), d71 = _mm512_fmadd_ps(s0, w1, d71), d72 = _mm512_fmadd_ps(s0, w2, d72); - if (M > 8) s0 = _mm512_set1_ps(src3[off5]), d80 = _mm512_fmadd_ps(s0, w0, d80), d81 = _mm512_fmadd_ps(s0, w1, d81), d82 = _mm512_fmadd_ps(s0, w2, d82); - } - if (M > 0) Save3(dst, d00, d01, d02, bias, params, tails), dst += dD; - if (M > 1) Save3(dst, d10, d11, d12, bias, params, tails), dst += dD; - if (M > 2) Save3(dst, d20, d21, d22, bias, params, tails), dst += dD; - if (M > 3) Save3(dst, d30, d31, d32, bias, params, tails), dst += dD; - if (M > 4) Save3(dst, d40, d41, d42, bias, params, tails), dst += dD; - if (M > 5) Save3(dst, d50, d51, d52, bias, params, tails), dst += dD; - if (M > 6) Save3(dst, d60, d61, d62, bias, params, tails), dst += dD; - if (M > 7) Save3(dst, d70, d71, d72, bias, params, tails), dst += dD; - if (M > 8) Save3(dst, d80, d81, d82, bias, params, tails), dst += dD; - } - else if (tails[1]) - { - if (M > 0) d00 = _mm512_setzero_ps(), d01 = _mm512_setzero_ps(); - if (M > 1) d10 = _mm512_setzero_ps(), d11 = _mm512_setzero_ps(); - if (M > 2) d20 = _mm512_setzero_ps(), d21 = _mm512_setzero_ps(); - if (M > 3) d30 = _mm512_setzero_ps(), d31 = _mm512_setzero_ps(); - if (M > 4) d40 = _mm512_setzero_ps(), d41 = _mm512_setzero_ps(); - if (M > 5) d50 = _mm512_setzero_ps(), d51 = _mm512_setzero_ps(); - if (M > 6) d60 = _mm512_setzero_ps(), d61 = _mm512_setzero_ps(); - if (M > 7) d70 = _mm512_setzero_ps(), d71 = _mm512_setzero_ps(); - if (M > 8) d80 = _mm512_setzero_ps(), d81 = _mm512_setzero_ps(); - for (size_t off0 = 0, off5 = 5 * dS, offw = 0; off0 < srcC; ++off0, ++off5, offw += F) - { - w0 = _mm512_loadu_ps(weight0 + offw); - w1 = _mm512_loadu_ps(weight1 + offw); - if (M > 0) s0 = _mm512_set1_ps(src0[off0]), d00 = _mm512_fmadd_ps(s0, w0, d00), d01 = _mm512_fmadd_ps(s0, w1, d01); - if (M > 1) s0 = _mm512_set1_ps(src1[off0]), d10 = _mm512_fmadd_ps(s0, w0, d10), d11 = _mm512_fmadd_ps(s0, w1, d11); - if (M > 2) s0 = _mm512_set1_ps(src2[off0]), d20 = _mm512_fmadd_ps(s0, w0, d20), d21 = _mm512_fmadd_ps(s0, w1, d21); - if (M > 3) s0 = _mm512_set1_ps(src3[off0]), d30 = _mm512_fmadd_ps(s0, w0, d30), d31 = _mm512_fmadd_ps(s0, w1, d31); - if (M > 4) s0 = _mm512_set1_ps(src4[off0]), d40 = _mm512_fmadd_ps(s0, w0, d40), d41 = _mm512_fmadd_ps(s0, w1, d41); - if (M > 5) s0 = _mm512_set1_ps(src0[off5]), d50 = _mm512_fmadd_ps(s0, w0, d50), d51 = _mm512_fmadd_ps(s0, w1, d51); - if (M > 6) s0 = _mm512_set1_ps(src1[off5]), d60 = _mm512_fmadd_ps(s0, w0, d60), d61 = _mm512_fmadd_ps(s0, w1, d61); - if (M > 7) s0 = _mm512_set1_ps(src2[off5]), d70 = _mm512_fmadd_ps(s0, w0, d70), d71 = _mm512_fmadd_ps(s0, w1, d71); - if (M > 8) s0 = _mm512_set1_ps(src3[off5]), d80 = _mm512_fmadd_ps(s0, w0, d80), d81 = _mm512_fmadd_ps(s0, w1, d81); - } - if (M > 0) Save2(dst, d00, d01, bias, params, tails), dst += dD; - if (M > 1) Save2(dst, d10, d11, bias, params, tails), dst += dD; - if (M > 2) Save2(dst, d20, d21, bias, params, tails), dst += dD; - if (M > 3) Save2(dst, d30, d31, bias, params, tails), dst += dD; - if (M > 4) Save2(dst, d40, d41, bias, params, tails), dst += dD; - if (M > 5) Save2(dst, d50, d51, bias, params, tails), dst += dD; - if (M > 6) Save2(dst, d60, d61, bias, params, tails), dst += dD; - if (M > 7) Save2(dst, d70, d71, bias, params, tails), dst += dD; - if (M > 8) Save2(dst, d80, d81, bias, params, tails), dst += dD; - } - else - { - if (M > 0) d00 = _mm512_setzero_ps(); - if (M > 1) d10 = 
_mm512_setzero_ps(); - if (M > 2) d20 = _mm512_setzero_ps(); - if (M > 3) d30 = _mm512_setzero_ps(); - if (M > 4) d40 = _mm512_setzero_ps(); - if (M > 5) d50 = _mm512_setzero_ps(); - if (M > 6) d60 = _mm512_setzero_ps(); - if (M > 7) d70 = _mm512_setzero_ps(); - if (M > 8) d80 = _mm512_setzero_ps(); - for (size_t off0 = 0, off5 = 5 * dS, offw = 0; off0 < srcC; ++off0, ++off5, offw += F) - { - w0 = _mm512_loadu_ps(weight0 + offw); - if (M > 0) s0 = _mm512_set1_ps(src0[off0]), d00 = _mm512_fmadd_ps(s0, w0, d00); - if (M > 1) s0 = _mm512_set1_ps(src1[off0]), d10 = _mm512_fmadd_ps(s0, w0, d10); - if (M > 2) s0 = _mm512_set1_ps(src2[off0]), d20 = _mm512_fmadd_ps(s0, w0, d20); - if (M > 3) s0 = _mm512_set1_ps(src3[off0]), d30 = _mm512_fmadd_ps(s0, w0, d30); - if (M > 4) s0 = _mm512_set1_ps(src4[off0]), d40 = _mm512_fmadd_ps(s0, w0, d40); - if (M > 5) s0 = _mm512_set1_ps(src0[off5]), d50 = _mm512_fmadd_ps(s0, w0, d50); - if (M > 6) s0 = _mm512_set1_ps(src1[off5]), d60 = _mm512_fmadd_ps(s0, w0, d60); - if (M > 7) s0 = _mm512_set1_ps(src2[off5]), d70 = _mm512_fmadd_ps(s0, w0, d70); - if (M > 8) s0 = _mm512_set1_ps(src3[off5]), d80 = _mm512_fmadd_ps(s0, w0, d80); - } - if (M > 0) Save1(dst, d00, bias, params, tails), dst += dD; - if (M > 1) Save1(dst, d10, bias, params, tails), dst += dD; - if (M > 2) Save1(dst, d20, bias, params, tails), dst += dD; - if (M > 3) Save1(dst, d30, bias, params, tails), dst += dD; - if (M > 4) Save1(dst, d40, bias, params, tails), dst += dD; - if (M > 5) Save1(dst, d50, bias, params, tails), dst += dD; - if (M > 6) Save1(dst, d60, bias, params, tails), dst += dD; - if (M > 7) Save1(dst, d70, bias, params, tails), dst += dD; - if (M > 8) Save1(dst, d80, bias, params, tails), dst += dD; - } - } - - template ConvolutionNhwcDirect1x1_NxM_Ptr GetConvolutionNhwcDirect1x1_3xM(size_t M) - { - switch (M) - { - case 0: return NULL; - case 1: return ConvolutionNhwcDirect1x1_3xM; - case 2: return ConvolutionNhwcDirect1x1_3xM; - case 3: return ConvolutionNhwcDirect1x1_3xM; - case 4: return ConvolutionNhwcDirect1x1_3xM; - case 5: return ConvolutionNhwcDirect1x1_3xM; - case 6: return ConvolutionNhwcDirect1x1_3xM; - case 7: return ConvolutionNhwcDirect1x1_3xM; - case 8: return ConvolutionNhwcDirect1x1_3xM; - case 9: return ConvolutionNhwcDirect1x1_3xM; - } - assert(0); - return NULL; - } - - template void ConvolutionNhwcDirect1x1_3(const float* src, const ConvParam32f& p, const AlgParam& a, - size_t dstC, size_t yBeg, size_t yEnd, size_t srcC, const float* weight, const float* bias, const float* params, float* dst) - { - size_t n = 9, n1 = (yEnd - yBeg) * p.dstW, nn = AlignLoAny(n1, n), m = n1 - nn; - ConvolutionNhwcDirect1x1_NxM_Ptr convolutionNhwcDirect1x1_3xN = ConvolutionNhwcDirect1x1_3x9; - ConvolutionNhwcDirect1x1_NxM_Ptr convolutionNhwcDirect1x1_3xM = GetConvolutionNhwcDirect1x1_3xM(m); - - __m512 _params[3], _bias[3]; - _params[0] = _mm512_set1_ps(params[0]); - if (type == ::SimdConvolutionActivationRestrictRange || type == ::SimdConvolutionActivationHswish) - _params[1] = _mm512_set1_ps(params[1]); - - for (size_t dc = 0; dc < dstC; dc += a.microD) - { - size_t dC = Simd::Min(a.microD, dstC - dc); - __mmask16 tails[3] = { TailMask16(dC - 0 * F), TailMask16(dC - 1 * F), TailMask16(dC - 2 * F) }; - if (dC > 0 * F) _bias[0] = _mm512_loadu_ps(bias + dc + 0 * F); - if (dC > 1 * F) _bias[1] = _mm512_loadu_ps(bias + dc + 1 * F); - if (dC > 2 * F) _bias[2] = _mm512_loadu_ps(bias + dc + 2 * F); - if (type == ::SimdConvolutionActivationPrelu) - { - if (dC > 0 * F) _params[0] = 
_mm512_loadu_ps(params + dc + 0 * F); - if (dC > 1 * F) _params[1] = _mm512_loadu_ps(params + dc + 1 * F); - if (dC > 2 * F) _params[2] = _mm512_loadu_ps(params + dc + 2 * F); - } - const float* ps = src + yBeg * p.srcW * p.srcC; - float* pd = dst + dc + yBeg * p.dstW * p.dstC; - size_t i = 0; - for (; i < nn; i += n, ps += n * p.srcC, pd += n * p.dstC) - convolutionNhwcDirect1x1_3xN(ps, p, a, srcC, weight, _bias, _params, pd, tails); - for (; i < n1; i += m, ps += m * p.srcC, pd += m * p.dstC) - convolutionNhwcDirect1x1_3xM(ps, p, a, srcC, weight, _bias, _params, pd, tails); - weight += p.srcC * a.microD; - } - } - - //--------------------------------------------------------------------- - - template static SIMD_INLINE void Set(const ConvParam32f& p, AlgParam& a) - { - a.convolutions[term] = p.Is1x1() ? ConvolutionNhwcDirect1x1_3 : ConvolutionNhwcDirect_3; - } - - template static SIMD_INLINE void Set(const ConvParam32f& p, AlgParam& a) - { - Set(p, a); - Set(p, a); - Set(p, a); - Set(p, a); - } - - bool SynetConvolution32fNhwcDirect::Set3r(const ConvParam32f& p, AlgParam& a) - { - assert(a.microD == 3 * F); - switch (p.activation) - { - case SimdConvolutionActivationIdentity: Set(p, a); break; - case SimdConvolutionActivationRelu: Set(p, a); break; - case SimdConvolutionActivationLeakyRelu: Set(p, a); break; - case SimdConvolutionActivationRestrictRange: Set(p, a); break; - case SimdConvolutionActivationPrelu: Set(p, a); break; - case SimdConvolutionActivationElu: Set(p, a); break; - case SimdConvolutionActivationHswish: Set(p, a); break; - default: assert(0); - } - return true; - } - } -#endif//SIMD_AVX512F_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx512fSynetDeconvolution32f.cpp b/src/3rd/Simd/Simd/SimdAvx512fSynetDeconvolution32f.cpp deleted file mode 100644 index ecb06c3f..00000000 --- a/src/3rd/Simd/Simd/SimdAvx512fSynetDeconvolution32f.cpp +++ /dev/null @@ -1,549 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2019 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
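The next deleted file, SimdAvx512fSynetDeconvolution32f.cpp, first routes SynetDeconvolution32fGemmNN through the AVX-512 GEMM and then provides direct NHWC kernels for the common kernel = 2, stride = 2 case, where each source pixel expands into two adjacent output pixels (the d*0/d*1 register pairs below). A rough standalone sketch of that expansion for a single source pixel follows; F = 16 and the name DeconvPixel2 are assumptions for illustration, not the library's API.

#include <immintrin.h>
#include <cstddef>

// 2x2/stride-2 deconvolution core for one input pixel: the two kernel
// columns produce the left and right halves of a 2-wide output patch.
static void DeconvPixel2(const float* src, const float* weight0,
    const float* weight1, size_t srcC, float* dst0, float* dst1)
{
    const size_t F = 16; // floats per ZMM register
    __m512 d0 = _mm512_setzero_ps(), d1 = _mm512_setzero_ps();
    for (size_t c = 0; c < srcC; ++c, weight0 += F, weight1 += F)
    {
        __m512 s = _mm512_set1_ps(src[c]); // broadcast one input channel
        d0 = _mm512_fmadd_ps(s, _mm512_loadu_ps(weight0), d0);
        d1 = _mm512_fmadd_ps(s, _mm512_loadu_ps(weight1), d1);
    }
    _mm512_storeu_ps(dst0, d0); // left output pixel of the 2-wide patch
    _mm512_storeu_ps(dst1, d1); // right output pixel of the 2-wide patch
}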
-*/ -#include "Simd/SimdSynetDeconvolution32f.h" -#include "Simd/SimdSynetConvolution32f.h" -#include "Simd/SimdSynetConvolution32fCommon.h" -#include "Simd/SimdExtract.h" -#include "Simd/SimdSynet.h" -#include "Simd/SimdAvx512f.h" -#include "Simd/SimdGemm.h" -#include "Simd/SimdExp.h" -#include "Simd/SimdCpu.h" - -namespace Simd -{ -#ifdef SIMD_AVX512F_ENABLE - namespace Avx512f - { - SynetDeconvolution32fGemmNN::SynetDeconvolution32fGemmNN(const DeconvParam32f & p) - : Avx2::SynetDeconvolution32fGemmNN(p) - { - _gemm.Init(InitGemmFuncs(Avx512f::Gemm32fNN, "Avx512f", p.gemm, "Ext")); - if (_param.trans && _param.group == 1) - { - if (NHWC_GEMM_RUNTIME) - { - _gemmCb.Init(InitGemmCbFuncs(Avx512f::Gemm32fNNcbBufferSize, Avx512f::Gemm32fNNcbReorderB, Avx512f::Gemm32fNNcbRun, "Avx512f", GemmKernelF2, GemmKernelF3)); - _nhwcWeight.Resize(_gemmCb.At(0).BufferSize(_M*_merge, _N, _K)); - } - else - _nhwcWeight.Resize(Avx512f::Gemm32fNNcbBufferSize(_M*_merge, _N, _K, GemmKernelAny, NHWC_GEMM_COMPATIBLE)); - _nhwcRun = Avx512f::Gemm32fNNcbRun; - _nhwcReorderB = Avx512f::Gemm32fNNcbReorderB; - } - _biasAndActivation = Avx512f::ConvolutionBiasAndActivation; - } - - //--------------------------------------------------------------------- - - - typedef void(*DeconvolutionNhwcDirect2x2_Ptr) (const float * src0, const DeconvParam32f & p, size_t srcC, size_t dstC, const float * weight, const __m512 * bias, const __m512 * params, float * ds); - -#if SIMD_ZMM_COUNT == 32 - template void DeconvolutionNhwcDirect2x2_14(const float * src0, - const DeconvParam32f & p, size_t srcC, size_t dstC, const float * weight0, const __m512 * bias, const __m512 * params, float * dst) - { - size_t dS = p.srcC, dD = p.dstC; - const float * weight1 = weight0 + srcC * F; - const float * src1 = src0 + 1 * dS; - const float * src2 = src0 + 2 * dS; - const float * src3 = src0 + 3 * dS; - const float * src4 = src0 + 4 * dS; - const float * src5 = src0 + 5 * dS; - const float * src6 = src0 + 6 * dS; - __m512 d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, d60, d61, d70, d71, d80, d81, d90, d91, dA0, dA1, dB0, dB1, dC0, dC1, dD0, dD1, s0, w0, w1; - d00 = _mm512_setzero_ps(); d01 = _mm512_setzero_ps(); - d10 = _mm512_setzero_ps(); d11 = _mm512_setzero_ps(); - d20 = _mm512_setzero_ps(); d21 = _mm512_setzero_ps(); - d30 = _mm512_setzero_ps(); d31 = _mm512_setzero_ps(); - d40 = _mm512_setzero_ps(); d41 = _mm512_setzero_ps(); - d50 = _mm512_setzero_ps(); d51 = _mm512_setzero_ps(); - d60 = _mm512_setzero_ps(); d61 = _mm512_setzero_ps(); - d70 = _mm512_setzero_ps(); d71 = _mm512_setzero_ps(); - d80 = _mm512_setzero_ps(); d81 = _mm512_setzero_ps(); - d90 = _mm512_setzero_ps(); d91 = _mm512_setzero_ps(); - dA0 = _mm512_setzero_ps(); dA1 = _mm512_setzero_ps(); - dB0 = _mm512_setzero_ps(); dB1 = _mm512_setzero_ps(); - dC0 = _mm512_setzero_ps(); dC1 = _mm512_setzero_ps(); - dD0 = _mm512_setzero_ps(); dD1 = _mm512_setzero_ps(); - for (size_t sc0 = 0, sc7 = 7 * dS; sc0 < srcC; ++sc0, ++sc7) - { - w0 = _mm512_loadu_ps(weight0); - w1 = _mm512_loadu_ps(weight1); - s0 = _mm512_set1_ps(src0[sc0]); - d00 = _mm512_fmadd_ps(s0, w0, d00); - d01 = _mm512_fmadd_ps(s0, w1, d01); - s0 = _mm512_set1_ps(src1[sc0]); - d10 = _mm512_fmadd_ps(s0, w0, d10); - d11 = _mm512_fmadd_ps(s0, w1, d11); - s0 = _mm512_set1_ps(src2[sc0]); - d20 = _mm512_fmadd_ps(s0, w0, d20); - d21 = _mm512_fmadd_ps(s0, w1, d21); - s0 = _mm512_set1_ps(src3[sc0]); - d30 = _mm512_fmadd_ps(s0, w0, d30); - d31 = _mm512_fmadd_ps(s0, w1, d31); - s0 = _mm512_set1_ps(src4[sc0]); - d40 = 
_mm512_fmadd_ps(s0, w0, d40); - d41 = _mm512_fmadd_ps(s0, w1, d41); - s0 = _mm512_set1_ps(src5[sc0]); - d50 = _mm512_fmadd_ps(s0, w0, d50); - d51 = _mm512_fmadd_ps(s0, w1, d51); - s0 = _mm512_set1_ps(src6[sc0]); - d60 = _mm512_fmadd_ps(s0, w0, d60); - d61 = _mm512_fmadd_ps(s0, w1, d61); - s0 = _mm512_set1_ps(src0[sc7]); - d70 = _mm512_fmadd_ps(s0, w0, d70); - d71 = _mm512_fmadd_ps(s0, w1, d71); - s0 = _mm512_set1_ps(src1[sc7]); - d80 = _mm512_fmadd_ps(s0, w0, d80); - d81 = _mm512_fmadd_ps(s0, w1, d81); - s0 = _mm512_set1_ps(src2[sc7]); - d90 = _mm512_fmadd_ps(s0, w0, d90); - d91 = _mm512_fmadd_ps(s0, w1, d91); - s0 = _mm512_set1_ps(src3[sc7]); - dA0 = _mm512_fmadd_ps(s0, w0, dA0); - dA1 = _mm512_fmadd_ps(s0, w1, dA1); - s0 = _mm512_set1_ps(src4[sc7]); - dB0 = _mm512_fmadd_ps(s0, w0, dB0); - dB1 = _mm512_fmadd_ps(s0, w1, dB1); - s0 = _mm512_set1_ps(src5[sc7]); - dC0 = _mm512_fmadd_ps(s0, w0, dC0); - dC1 = _mm512_fmadd_ps(s0, w1, dC1); - s0 = _mm512_set1_ps(src6[sc7]); - dD0 = _mm512_fmadd_ps(s0, w0, dD0); - dD1 = _mm512_fmadd_ps(s0, w1, dD1); - weight0 += F; - weight1 += F; - } - if (dstC == F) - { - Term::template Save(dst + 0x00 * dD, d00, bias, params); - Term::template Save(dst + 0x01 * dD, d01, bias, params); - Term::template Save(dst + 0x02 * dD, d10, bias, params); - Term::template Save(dst + 0x03 * dD, d11, bias, params); - Term::template Save(dst + 0x04 * dD, d20, bias, params); - Term::template Save(dst + 0x05 * dD, d21, bias, params); - Term::template Save(dst + 0x06 * dD, d30, bias, params); - Term::template Save(dst + 0x07 * dD, d31, bias, params); - Term::template Save(dst + 0x08 * dD, d40, bias, params); - Term::template Save(dst + 0x09 * dD, d41, bias, params); - Term::template Save(dst + 0x0A * dD, d50, bias, params); - Term::template Save(dst + 0x0B * dD, d51, bias, params); - Term::template Save(dst + 0x0C * dD, d60, bias, params); - Term::template Save(dst + 0x0D * dD, d61, bias, params); - Term::template Save(dst + 0x0E * dD, d70, bias, params); - Term::template Save(dst + 0x0F * dD, d71, bias, params); - Term::template Save(dst + 0x10 * dD, d80, bias, params); - Term::template Save(dst + 0x11 * dD, d81, bias, params); - Term::template Save(dst + 0x12 * dD, d90, bias, params); - Term::template Save(dst + 0x13 * dD, d91, bias, params); - Term::template Save(dst + 0x14 * dD, dA0, bias, params); - Term::template Save(dst + 0x15 * dD, dA1, bias, params); - Term::template Save(dst + 0x16 * dD, dB0, bias, params); - Term::template Save(dst + 0x17 * dD, dB1, bias, params); - Term::template Save(dst + 0x18 * dD, dC0, bias, params); - Term::template Save(dst + 0x19 * dD, dC1, bias, params); - Term::template Save(dst + 0x1A * dD, dD0, bias, params); - Term::template Save(dst + 0x1B * dD, dD1, bias, params); - } - else - { - __mmask16 mask = __mmask16(-1) >> (16 - dstC); - Term::template Save(dst + 0x00 * dD, d00, bias, params, mask); - Term::template Save(dst + 0x01 * dD, d01, bias, params, mask); - Term::template Save(dst + 0x02 * dD, d10, bias, params, mask); - Term::template Save(dst + 0x03 * dD, d11, bias, params, mask); - Term::template Save(dst + 0x04 * dD, d20, bias, params, mask); - Term::template Save(dst + 0x05 * dD, d21, bias, params, mask); - Term::template Save(dst + 0x06 * dD, d30, bias, params, mask); - Term::template Save(dst + 0x07 * dD, d31, bias, params, mask); - Term::template Save(dst + 0x08 * dD, d40, bias, params, mask); - Term::template Save(dst + 0x09 * dD, d41, bias, params, mask); - Term::template Save(dst + 0x0A * dD, d50, bias, params, mask); - 
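// Tail handling here: __mmask16(-1) >> (16 - dstC) sets the low dstC bits,
// so each masked Term::Save writes only the valid trailing output channels
// when dstC < F; the dstC == F branch above uses full-width stores instead.
// Every source pixel yields a d*0/d*1 pair, i.e. two adjacent output pixels
// of the 2-wide deconvolution patch, hence the dst + 0x00..0x1B * dD stores.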
Term::template Save(dst + 0x0B * dD, d51, bias, params, mask); - Term::template Save(dst + 0x0C * dD, d60, bias, params, mask); - Term::template Save(dst + 0x0D * dD, d61, bias, params, mask); - Term::template Save(dst + 0x0E * dD, d70, bias, params, mask); - Term::template Save(dst + 0x0F * dD, d71, bias, params, mask); - Term::template Save(dst + 0x10 * dD, d80, bias, params, mask); - Term::template Save(dst + 0x11 * dD, d81, bias, params, mask); - Term::template Save(dst + 0x12 * dD, d90, bias, params, mask); - Term::template Save(dst + 0x13 * dD, d91, bias, params, mask); - Term::template Save(dst + 0x14 * dD, dA0, bias, params, mask); - Term::template Save(dst + 0x15 * dD, dA1, bias, params, mask); - Term::template Save(dst + 0x16 * dD, dB0, bias, params, mask); - Term::template Save(dst + 0x17 * dD, dB1, bias, params, mask); - Term::template Save(dst + 0x18 * dD, dC0, bias, params, mask); - Term::template Save(dst + 0x19 * dD, dC1, bias, params, mask); - Term::template Save(dst + 0x1A * dD, dD0, bias, params, mask); - Term::template Save(dst + 0x1B * dD, dD1, bias, params, mask); - } - } - - template void DeconvolutionNhwcDirect2x2_M(const float * src0, - const DeconvParam32f & p, size_t srcC, size_t dstC, const float * weight0, const __m512 * bias, const __m512 * params, float * dst) - { - size_t dS = p.srcC, dD = p.dstC; - const float * weight1 = weight0 + srcC * F, *src1, *src2, *src3, *src4, *src5, *src6; - if (tail > 1) src1 = src0 + 1 * dS; - if (tail > 2) src2 = src0 + 2 * dS; - if (tail > 3) src3 = src0 + 3 * dS; - if (tail > 4) src4 = src0 + 4 * dS; - if (tail > 5) src5 = src0 + 5 * dS; - if (tail > 6) src6 = src0 + 6 * dS; - __m512 d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, d60, d61, d70, d71, d80, d81, d90, d91, dA0, dA1, dB0, dB1, dC0, dC1, dD0, dD1, s0, w0, w1; - if (tail > 0x0) d00 = _mm512_setzero_ps(), d01 = _mm512_setzero_ps(); - if (tail > 0x1) d10 = _mm512_setzero_ps(), d11 = _mm512_setzero_ps(); - if (tail > 0x2) d20 = _mm512_setzero_ps(), d21 = _mm512_setzero_ps(); - if (tail > 0x3) d30 = _mm512_setzero_ps(), d31 = _mm512_setzero_ps(); - if (tail > 0x4) d40 = _mm512_setzero_ps(), d41 = _mm512_setzero_ps(); - if (tail > 0x5) d50 = _mm512_setzero_ps(), d51 = _mm512_setzero_ps(); - if (tail > 0x6) d60 = _mm512_setzero_ps(), d61 = _mm512_setzero_ps(); - if (tail > 0x7) d70 = _mm512_setzero_ps(), d71 = _mm512_setzero_ps(); - if (tail > 0x8) d80 = _mm512_setzero_ps(), d81 = _mm512_setzero_ps(); - if (tail > 0x9) d90 = _mm512_setzero_ps(), d91 = _mm512_setzero_ps(); - if (tail > 0xA) dA0 = _mm512_setzero_ps(), dA1 = _mm512_setzero_ps(); - if (tail > 0xB) dB0 = _mm512_setzero_ps(), dB1 = _mm512_setzero_ps(); - if (tail > 0xC) dC0 = _mm512_setzero_ps(), dC1 = _mm512_setzero_ps(); - if (tail > 0xD) dD0 = _mm512_setzero_ps(), dD1 = _mm512_setzero_ps(); - for (size_t sc0 = 0, sc7 = 7 * dS; sc0 < srcC; ++sc0, ++sc7) - { - w0 = _mm512_loadu_ps(weight0); - w1 = _mm512_loadu_ps(weight1); - if (tail > 0x0) s0 = _mm512_set1_ps(src0[sc0]), d00 = _mm512_fmadd_ps(s0, w0, d00), d01 = _mm512_fmadd_ps(s0, w1, d01); - if (tail > 0x1) s0 = _mm512_set1_ps(src1[sc0]), d10 = _mm512_fmadd_ps(s0, w0, d10), d11 = _mm512_fmadd_ps(s0, w1, d11); - if (tail > 0x2) s0 = _mm512_set1_ps(src2[sc0]), d20 = _mm512_fmadd_ps(s0, w0, d20), d21 = _mm512_fmadd_ps(s0, w1, d21); - if (tail > 0x3) s0 = _mm512_set1_ps(src3[sc0]), d30 = _mm512_fmadd_ps(s0, w0, d30), d31 = _mm512_fmadd_ps(s0, w1, d31); - if (tail > 0x4) s0 = _mm512_set1_ps(src4[sc0]), d40 = _mm512_fmadd_ps(s0, w0, d40), d41 = 
_mm512_fmadd_ps(s0, w1, d41); - if (tail > 0x5) s0 = _mm512_set1_ps(src5[sc0]), d50 = _mm512_fmadd_ps(s0, w0, d50), d51 = _mm512_fmadd_ps(s0, w1, d51); - if (tail > 0x6) s0 = _mm512_set1_ps(src6[sc0]), d60 = _mm512_fmadd_ps(s0, w0, d60), d61 = _mm512_fmadd_ps(s0, w1, d61); - if (tail > 0x7) s0 = _mm512_set1_ps(src0[sc7]), d70 = _mm512_fmadd_ps(s0, w0, d70), d71 = _mm512_fmadd_ps(s0, w1, d71); - if (tail > 0x8) s0 = _mm512_set1_ps(src1[sc7]), d80 = _mm512_fmadd_ps(s0, w0, d80), d81 = _mm512_fmadd_ps(s0, w1, d81); - if (tail > 0x9) s0 = _mm512_set1_ps(src2[sc7]), d90 = _mm512_fmadd_ps(s0, w0, d90), d91 = _mm512_fmadd_ps(s0, w1, d91); - if (tail > 0xA) s0 = _mm512_set1_ps(src3[sc7]), dA0 = _mm512_fmadd_ps(s0, w0, dA0), dA1 = _mm512_fmadd_ps(s0, w1, dA1); - if (tail > 0xB) s0 = _mm512_set1_ps(src4[sc7]), dB0 = _mm512_fmadd_ps(s0, w0, dB0), dB1 = _mm512_fmadd_ps(s0, w1, dB1); - if (tail > 0xC) s0 = _mm512_set1_ps(src5[sc7]), dC0 = _mm512_fmadd_ps(s0, w0, dC0), dC1 = _mm512_fmadd_ps(s0, w1, dC1); - if (tail > 0xD) s0 = _mm512_set1_ps(src6[sc7]), dD0 = _mm512_fmadd_ps(s0, w0, dD0), dD1 = _mm512_fmadd_ps(s0, w1, dD1); - weight0 += F; - weight1 += F; - } - if (dstC == F) - { - if (tail > 0x0) Term::template Save(dst + 0x00 * dD, d00, bias, params), Term::template Save(dst + 0x01 * dD, d01, bias, params); - if (tail > 0x1) Term::template Save(dst + 0x02 * dD, d10, bias, params), Term::template Save(dst + 0x03 * dD, d11, bias, params); - if (tail > 0x2) Term::template Save(dst + 0x04 * dD, d20, bias, params), Term::template Save(dst + 0x05 * dD, d21, bias, params); - if (tail > 0x3) Term::template Save(dst + 0x06 * dD, d30, bias, params), Term::template Save(dst + 0x07 * dD, d31, bias, params); - if (tail > 0x4) Term::template Save(dst + 0x08 * dD, d40, bias, params), Term::template Save(dst + 0x09 * dD, d41, bias, params); - if (tail > 0x5) Term::template Save(dst + 0x0A * dD, d50, bias, params), Term::template Save(dst + 0x0B * dD, d51, bias, params); - if (tail > 0x6) Term::template Save(dst + 0x0C * dD, d60, bias, params), Term::template Save(dst + 0x0D * dD, d61, bias, params); - if (tail > 0x7) Term::template Save(dst + 0x0E * dD, d70, bias, params), Term::template Save(dst + 0x0F * dD, d71, bias, params); - if (tail > 0x8) Term::template Save(dst + 0x10 * dD, d80, bias, params), Term::template Save(dst + 0x11 * dD, d81, bias, params); - if (tail > 0x9) Term::template Save(dst + 0x12 * dD, d90, bias, params), Term::template Save(dst + 0x13 * dD, d91, bias, params); - if (tail > 0xA) Term::template Save(dst + 0x14 * dD, dA0, bias, params), Term::template Save(dst + 0x15 * dD, dA1, bias, params); - if (tail > 0xB) Term::template Save(dst + 0x16 * dD, dB0, bias, params), Term::template Save(dst + 0x17 * dD, dB1, bias, params); - if (tail > 0xC) Term::template Save(dst + 0x18 * dD, dC0, bias, params), Term::template Save(dst + 0x19 * dD, dC1, bias, params); - if (tail > 0xD) Term::template Save(dst + 0x1A * dD, dD0, bias, params), Term::template Save(dst + 0x1B * dD, dD1, bias, params); - } - else - { - __mmask16 mask = __mmask16(-1) >> (16 - dstC); - if (tail > 0x0) Term::template Save(dst + 0x00 * dD, d00, bias, params, mask), Term::template Save(dst + 0x01 * dD, d01, bias, params, mask); - if (tail > 0x1) Term::template Save(dst + 0x02 * dD, d10, bias, params, mask), Term::template Save(dst + 0x03 * dD, d11, bias, params, mask); - if (tail > 0x2) Term::template Save(dst + 0x04 * dD, d20, bias, params, mask), Term::template Save(dst + 0x05 * dD, d21, bias, params, mask); - if (tail > 0x3) 
-#else
-        template<TermType term, SimdConvolutionActivationType type> void DeconvolutionNhwcDirect2x2_6(const float * src0,
-            const DeconvParam32f & p, size_t srcC, size_t dstC, const float * weight0, const __m512 * bias, const __m512 * params, float * dst)
-        {
-            size_t dS = p.srcC, dD = p.dstC;
-            const float * weight1 = weight0 + srcC * F;
-            const float * src1 = src0 + 1 * dS;
-            const float * src2 = src0 + 2 * dS;
-            const float * src3 = src0 + 3 * dS;
-            const float * src4 = src0 + 4 * dS;
-            const float * src5 = src0 + 5 * dS;
-            __m512 d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, s0, w0, w1;
-            d00 = _mm512_setzero_ps(); d01 = _mm512_setzero_ps();
-            d10 = _mm512_setzero_ps(); d11 = _mm512_setzero_ps();
-            d20 = _mm512_setzero_ps(); d21 = _mm512_setzero_ps();
-            d30 = _mm512_setzero_ps(); d31 = _mm512_setzero_ps();
-            d40 = _mm512_setzero_ps(); d41 = _mm512_setzero_ps();
-            d50 = _mm512_setzero_ps(); d51 = _mm512_setzero_ps();
-            for (size_t sc = 0; sc < srcC; ++sc)
-            {
-                w0 = _mm512_loadu_ps(weight0);
-                w1 = _mm512_loadu_ps(weight1);
-                s0 = _mm512_set1_ps(src0[sc]);
-                d00 = _mm512_fmadd_ps(s0, w0, d00);
-                d01 = _mm512_fmadd_ps(s0, w1, d01);
-                s0 = _mm512_set1_ps(src1[sc]);
-                d10 = _mm512_fmadd_ps(s0, w0, d10);
-                d11 = _mm512_fmadd_ps(s0, w1, d11);
-                s0 = _mm512_set1_ps(src2[sc]);
-                d20 = _mm512_fmadd_ps(s0, w0, d20);
-                d21 = _mm512_fmadd_ps(s0, w1, d21);
-                s0 = _mm512_set1_ps(src3[sc]);
-                d30 = _mm512_fmadd_ps(s0, w0, d30);
-                d31 = _mm512_fmadd_ps(s0, w1, d31);
-                s0 = _mm512_set1_ps(src4[sc]);
-                d40 = _mm512_fmadd_ps(s0, w0, d40);
-                d41 = _mm512_fmadd_ps(s0, w1, d41);
-                s0 = _mm512_set1_ps(src5[sc]);
-                d50 = _mm512_fmadd_ps(s0, w0, d50);
-                d51 = _mm512_fmadd_ps(s0, w1, d51);
-                weight0 += F;
-                weight1 += F;
-            }
-            if (dstC == F)
-            {
-                Term<term>::template Save<type, 0>(dst + 0x0 * dD, d00, bias, params);
-                Term<term>::template Save<type, 0>(dst + 0x1 * dD, d01, bias, params);
-                Term<term>::template Save<type, 0>(dst + 0x2 * dD, d10, bias, params);
-                Term<term>::template Save<type, 0>(dst + 0x3 * dD, d11, bias, params);
-                Term<term>::template Save<type, 0>(dst + 0x4 * dD, d20, bias, params);
-                Term<term>::template Save<type, 0>(dst + 0x5 * dD, d21, bias, params);
-                Term<term>::template Save<type, 0>(dst + 0x6 * dD, d30, bias, params);
-                Term<term>::template Save<type, 0>(dst + 0x7 * dD, d31, bias, params);
-                Term<term>::template Save<type, 0>(dst + 0x8 * dD, d40, bias, params);
-                Term<term>::template Save<type, 0>(dst + 0x9 * dD, d41, bias, params);
-                Term<term>::template Save<type, 0>(dst + 0xA * dD, d50, bias, params);
-                Term<term>::template Save<type, 0>(dst + 0xB * dD, d51, bias, params);
-            }
-            else
-            {
-                __mmask16 mask = __mmask16(-1) >> (16 - dstC);
-                Term<term>::template Save<type, 0>(dst + 0x0 * dD, d00, bias, params, mask);
-                Term<term>::template Save<type, 0>(dst + 0x1 * dD, d01, bias, params, mask);
-                Term<term>::template Save<type, 0>(dst + 0x2 * dD, d10, bias, params, mask);
-                Term<term>::template Save<type, 0>(dst + 0x3 * dD, d11, bias, params, mask);
-                Term<term>::template Save<type, 0>(dst + 0x4 * dD, d20, bias, params, mask);
-                Term<term>::template Save<type, 0>(dst + 0x5 * dD, d21, bias, params, mask);
-                Term<term>::template Save<type, 0>(dst + 0x6 * dD, d30, bias, params, mask);
-                Term<term>::template Save<type, 0>(dst + 0x7 * dD, d31, bias, params, mask);
-                Term<term>::template Save<type, 0>(dst + 0x8 * dD, d40, bias, params, mask);
-                Term<term>::template Save<type, 0>(dst + 0x9 * dD, d41, bias, params, mask);
-                Term<term>::template Save<type, 0>(dst + 0xA * dD, d50, bias, params, mask);
-                Term<term>::template Save<type, 0>(dst + 0xB * dD, d51, bias, params, mask);
-            }
-        }
-
-        template<TermType term, SimdConvolutionActivationType type, size_t tail> void DeconvolutionNhwcDirect2x2_M(const float * src0,
-            const DeconvParam32f & p, size_t srcC, size_t dstC, const float * weight0, const __m512 * bias, const __m512 * params, float * dst)
-        {
-            size_t dS = p.srcC, dD = p.dstC;
-            const float * weight1 = weight0 + srcC * F, *src1, *src2, *src3, *src4, *src5;
-            if (tail > 1) src1 = src0 + 1 * dS;
-            if (tail > 2) src2 = src0 + 2 * dS;
-            if (tail > 3) src3 = src0 + 3 * dS;
-            if (tail > 4) src4 = src0 + 4 * dS;
-            if (tail > 5) src5 = src0 + 5 * dS;
-            __m512 d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, s0, w0, w1;
-            if (tail > 0) d00 = _mm512_setzero_ps(), d01 = _mm512_setzero_ps();
-            if (tail > 1) d10 = _mm512_setzero_ps(), d11 = _mm512_setzero_ps();
-            if (tail > 2) d20 = _mm512_setzero_ps(), d21 = _mm512_setzero_ps();
-            if (tail > 3) d30 = _mm512_setzero_ps(), d31 = _mm512_setzero_ps();
-            if (tail > 4) d40 = _mm512_setzero_ps(), d41 = _mm512_setzero_ps();
-            if (tail > 5) d50 = _mm512_setzero_ps(), d51 = _mm512_setzero_ps();
-            for (size_t sc = 0; sc < srcC; ++sc)
-            {
-                w0 = _mm512_loadu_ps(weight0);
-                w1 = _mm512_loadu_ps(weight1);
-                if (tail > 0) s0 = _mm512_set1_ps(src0[sc]), d00 = _mm512_fmadd_ps(s0, w0, d00), d01 = _mm512_fmadd_ps(s0, w1, d01);
-                if (tail > 1) s0 = _mm512_set1_ps(src1[sc]), d10 = _mm512_fmadd_ps(s0, w0, d10), d11 = _mm512_fmadd_ps(s0, w1, d11);
-                if (tail > 2) s0 = _mm512_set1_ps(src2[sc]), d20 = _mm512_fmadd_ps(s0, w0, d20), d21 = _mm512_fmadd_ps(s0, w1, d21);
-                if (tail > 3) s0 = _mm512_set1_ps(src3[sc]), d30 = _mm512_fmadd_ps(s0, w0, d30), d31 = _mm512_fmadd_ps(s0, w1, d31);
-                if (tail > 4) s0 = _mm512_set1_ps(src4[sc]), d40 = _mm512_fmadd_ps(s0, w0, d40), d41 = _mm512_fmadd_ps(s0, w1, d41);
-                if (tail > 5) s0 = _mm512_set1_ps(src5[sc]), d50 = _mm512_fmadd_ps(s0, w0, d50), d51 = _mm512_fmadd_ps(s0, w1, d51);
-                weight0 += F;
-                weight1 += F;
-            }
-            if (dstC == F)
-            {
-                if (tail > 0) Term<term>::template Save<type, 0>(dst + 0x0 * dD, d00, bias, params), Term<term>::template Save<type, 0>(dst + 0x1 * dD, d01, bias, params);
-                if (tail > 1) Term<term>::template Save<type, 0>(dst + 0x2 * dD, d10, bias, params), Term<term>::template Save<type, 0>(dst + 0x3 * dD, d11, bias, params);
-                if (tail > 2) Term<term>::template Save<type, 0>(dst + 0x4 * dD, d20, bias, params), Term<term>::template Save<type, 0>(dst + 0x5 * dD, d21, bias, params);
-                if (tail > 3) Term<term>::template Save<type, 0>(dst + 0x6 * dD, d30, bias, params), Term<term>::template Save<type, 0>(dst + 0x7 * dD, d31, bias, params);
-                if (tail > 4) Term<term>::template Save<type, 0>(dst + 0x8 * dD, d40, bias, params), Term<term>::template Save<type, 0>(dst + 0x9 * dD, d41, bias, params);
-                if (tail > 5) Term<term>::template Save<type, 0>(dst + 0xA * dD, d50, bias, params), Term<term>::template Save<type, 0>(dst + 0xB * dD, d51, bias, params);
-            }
-            else
-            {
-                __mmask16 mask = __mmask16(-1) >> (16 - dstC);
-                if (tail > 0) Term<term>::template Save<type, 0>(dst + 0x0 * dD, d00, bias, params, mask), Term<term>::template Save<type, 0>(dst + 0x1 * dD, d01, bias, params, mask);
-                if (tail > 1) Term<term>::template Save<type, 0>(dst + 0x2 * dD, d10, bias, params, mask), Term<term>::template Save<type, 0>(dst + 0x3 * dD, d11, bias, params, mask);
-                if (tail > 2) Term<term>::template Save<type, 0>(dst + 0x4 * dD, d20, bias, params, mask), Term<term>::template Save<type, 0>(dst + 0x5 * dD, d21, bias, params, mask);
-                if (tail > 3) Term<term>::template Save<type, 0>(dst + 0x6 * dD, d30, bias, params, mask), Term<term>::template Save<type, 0>(dst + 0x7 * dD, d31, bias, params, mask);
-                if (tail > 4) Term<term>::template Save<type, 0>(dst + 0x8 * dD, d40, bias, params, mask), Term<term>::template Save<type, 0>(dst + 0x9 * dD, d41, bias, params, mask);
-                if (tail > 5) Term<term>::template Save<type, 0>(dst + 0xA * dD, d50, bias, params, mask), Term<term>::template Save<type, 0>(dst + 0xB * dD, d51, bias, params, mask);
-            }
-        }
-#endif
-
-        template<TermType term, SimdConvolutionActivationType type> SIMD_INLINE DeconvolutionNhwcDirect2x2_Ptr GetTailKernel(size_t tail)
-        {
-            switch (tail)
-            {
-            case 0: return DeconvolutionNhwcDirect2x2_M<term, type, 0>;
-            case 1: return DeconvolutionNhwcDirect2x2_M<term, type, 1>;
-            case 2: return DeconvolutionNhwcDirect2x2_M<term, type, 2>;
-            case 3: return DeconvolutionNhwcDirect2x2_M<term, type, 3>;
-            case 4: return DeconvolutionNhwcDirect2x2_M<term, type, 4>;
-            case 5: return DeconvolutionNhwcDirect2x2_M<term, type, 5>;
-#if SIMD_ZMM_COUNT == 32
-            case 6: return DeconvolutionNhwcDirect2x2_M<term, type, 6>;
-            case 7: return DeconvolutionNhwcDirect2x2_M<term, type, 7>;
-            case 8: return DeconvolutionNhwcDirect2x2_M<term, type, 8>;
-            case 9: return DeconvolutionNhwcDirect2x2_M<term, type, 9>;
-            case 10: return DeconvolutionNhwcDirect2x2_M<term, type, 10>;
-            case 11: return DeconvolutionNhwcDirect2x2_M<term, type, 11>;
-            case 12: return DeconvolutionNhwcDirect2x2_M<term, type, 12>;
-            case 13: return DeconvolutionNhwcDirect2x2_M<term, type, 13>;
-#endif
-            default:
-                assert(0);
-                return NULL;
-            }
-        }
-
-        template<TermType term, SimdConvolutionActivationType type> void DeconvolutionNhwcDirect2x2(const float * src, const DeconvParam32f & p,
-            size_t dstC, size_t yBeg, size_t yEnd, size_t srcC, const float * weight, const float * bias, const float * params, float * dst)
-        {
-#if SIMD_ZMM_COUNT == 32
-            size_t step = 14;
-            DeconvolutionNhwcDirect2x2_Ptr bodyKernel = DeconvolutionNhwcDirect2x2_14<term, type>;
-#else
-            size_t step = 6;
-            DeconvolutionNhwcDirect2x2_Ptr bodyKernel = DeconvolutionNhwcDirect2x2_6<term, type>;
-#endif
-            size_t body = AlignLoAny(p.srcW, step), tail = p.srcW - body;
-            DeconvolutionNhwcDirect2x2_Ptr tailKernel = GetTailKernel<term, type>(tail);
-
-            __m512 _params[2], _bias[1];
-            _params[0] = _mm512_set1_ps(params[0]);
-            if (type == ::SimdConvolutionActivationRestrictRange || type == ::SimdConvolutionActivationHswish)
-                _params[1] = _mm512_set1_ps(params[1]);
-
-            for (size_t dc = 0; dc < dstC; dc += F)
-            {
-                size_t dC = Simd::Min(F, dstC - dc);
-                _bias[0] = _mm512_loadu_ps(bias + dc);
-                if (type == ::SimdConvolutionActivationPrelu)
-                    _params[0] = _mm512_loadu_ps(params + dc);
-                const float * s = src + yBeg * p.srcW * p.srcC;
-                float * d = dst + yBeg * p.strideY * p.dstW * p.dstC;
-                const float * w0 = weight + 0 * p.kernelX * p.srcC * F;
-                const float * w1 = weight + 1 * p.kernelX * p.srcC * F;
-                for (size_t sy = yBeg; sy < yEnd; sy += 1, s += p.srcW * p.srcC)
-                {
-                    for (size_t sx = 0; sx < body; sx += step)
-                        bodyKernel(s + sx * p.srcC, p, srcC, dC, w0, _bias, _params, d), d += step * p.strideX * p.dstC;
-                    if (tail)
-                        tailKernel(s + body * p.srcC, p, srcC, dC, w0, _bias, _params, d), d += tail * p.strideX * p.dstC;
-                    for (size_t sx = 0; sx < body; sx += step)
-                        bodyKernel(s + sx * p.srcC, p, srcC, dC, w1, _bias, _params, d), d += step * p.strideX * p.dstC;
-                    if (tail)
-                        tailKernel(s + body * p.srcC, p, srcC, dC, w1, _bias, _params, d), d += tail * p.strideX * p.dstC;
-                }
-                weight += p.kernelY * p.kernelX * srcC * F;
-                dst += F;
-            }
-        }
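
Editor's note: the driver splits each source row into fixed-width groups plus one remainder group; a small sketch of the arithmetic (AlignLoAny is assumed to round down to a multiple of its second argument):

    #include <cstddef>

    // How the driver partitions a source row into body + tail pixels.
    void SplitRow(size_t srcW, size_t step, size_t& body, size_t& tail)
    {
        body = srcW / step * step; // pixels covered by the fixed-width body kernel
        tail = srcW - body;        // remainder covered by the _M kernel specialized on 'tail'
    }
    // e.g. srcW = 33, step = 14  ->  body = 28, tail = 5
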
-
-        template<SimdConvolutionActivationType type> void DeconvolutionNhwcDirect2x2(const float * src, const DeconvParam32f & p,
-            const SynetDeconvolution32fNhwcDirect2x2::AlgParam & a, const float * weight, const float * bias, const float * params, float * dst)
-        {
-            for (size_t dc = 0; dc < p.dstC; dc += a.macroD)
-            {
-                size_t macroD = Simd::Min(p.dstC, dc + a.macroD) - dc;
-                for (size_t sc = 0; sc < p.srcC; sc += a.macroC)
-                {
-                    size_t macroC = Simd::Min(p.srcC, sc + a.macroC) - sc;
-                    size_t macroK = p.kernelY * p.kernelX * macroC;
-                    for (size_t yBeg = 0; yBeg < p.srcH;)
-                    {
-                        size_t yEnd = Simd::Min(yBeg + a.macroH, p.srcH);
-                        if (a.macroC == p.srcC)
-                            DeconvolutionNhwcDirect2x2<TermSingle, type>(src + sc, p, macroD, yBeg, yEnd, macroC, weight, bias + dc, params, dst + dc);
-                        else if (sc == 0)
-                            DeconvolutionNhwcDirect2x2<TermFirst, type>(src + sc, p, macroD, yBeg, yEnd, macroC, weight, bias + dc, params, dst + dc);
-                        else if (sc + macroC == p.srcC)
-                            DeconvolutionNhwcDirect2x2<TermLast, type>(src + sc, p, macroD, yBeg, yEnd, macroC, weight, bias + dc, params, dst + dc);
-                        else
-                            DeconvolutionNhwcDirect2x2<TermIterim, type>(src + sc, p, macroD, yBeg, yEnd, macroC, weight, bias + dc, params, dst + dc);
-                        yBeg = yEnd;
-                    }
-                    weight += AlignHiAny(macroD, a.microD) * macroK;
-                }
-                if (type == ::SimdConvolutionActivationPrelu)
-                    params += macroD;
-            }
-        }
-
-        SynetDeconvolution32fNhwcDirect2x2::SynetDeconvolution32fNhwcDirect2x2(const DeconvParam32f & p)
-            : Avx2::SynetDeconvolution32fNhwcDirect2x2(p)
-        {
-            if (p.dstC > HF)
-            {
-                switch (p.activation)
-                {
-                case SimdConvolutionActivationIdentity: _deconvolution = DeconvolutionNhwcDirect2x2<SimdConvolutionActivationIdentity>; break;
-                case SimdConvolutionActivationRelu: _deconvolution = DeconvolutionNhwcDirect2x2<SimdConvolutionActivationRelu>; break;
-                case SimdConvolutionActivationLeakyRelu: _deconvolution = DeconvolutionNhwcDirect2x2<SimdConvolutionActivationLeakyRelu>; break;
-                case SimdConvolutionActivationRestrictRange: _deconvolution = DeconvolutionNhwcDirect2x2<SimdConvolutionActivationRestrictRange>; break;
-                case SimdConvolutionActivationPrelu: _deconvolution = DeconvolutionNhwcDirect2x2<SimdConvolutionActivationPrelu>; break;
-                case SimdConvolutionActivationElu: _deconvolution = DeconvolutionNhwcDirect2x2<SimdConvolutionActivationElu>; break;
-                case SimdConvolutionActivationHswish: _deconvolution = DeconvolutionNhwcDirect2x2<SimdConvolutionActivationHswish>; break;
-                default: assert(0);
-                }
-                SetAlgParam(F, Base::AlgCacheL1(), Base::AlgCacheL2(), Base::AlgCacheL3());
-            }
-        }
-
-        //---------------------------------------------------------------------
-
-        void * SynetDeconvolution32fInit(size_t batch, const SimdConvolutionParameters * conv, SimdGemm32fNNPtr gemm)
-        {
-            DeconvParam32f param(batch, conv, gemm);
-            if (!param.Valid())
-                return NULL;
-            if (SynetDeconvolution32fNhwcDirect2x2::Preferable(param))
-                return new SynetDeconvolution32fNhwcDirect2x2(param);
-            else
-                return new SynetDeconvolution32fGemmNN(param);
-        }
-    }
-#endif//SIMD_AVX512F_ENABLE
-}
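
Editor's note: the init routine above follows the library's usual factory shape: validate the parameter block, probe the specialized implementation's Preferable predicate, and fall back to the generic GEMM path. A condensed sketch of that pattern with stand-in type names (not the library's actual classes):

    // Illustrative factory pattern; Param, Direct2x2 and GemmNN are hypothetical stand-ins.
    struct Param { bool valid; };
    struct Engine { virtual ~Engine() = default; };
    struct Direct2x2 : Engine { static bool Preferable(const Param&) { return true; } explicit Direct2x2(const Param&) {} };
    struct GemmNN : Engine { explicit GemmNN(const Param&) {} };

    Engine* Init(const Param& p)
    {
        if (!p.valid)
            return nullptr;          // reject malformed descriptors first
        if (Direct2x2::Preferable(p))
            return new Direct2x2(p); // specialized path when the shape fits
        return new GemmNN(p);        // generic fallback
    }
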
diff --git a/src/3rd/Simd/Simd/SimdAvx512fSynetFused.cpp b/src/3rd/Simd/Simd/SimdAvx512fSynetFused.cpp
deleted file mode 100644
index 2617f6bd..00000000
--- a/src/3rd/Simd/Simd/SimdAvx512fSynetFused.cpp
+++ /dev/null
@@ -1,1234 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2020 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#include "Simd/SimdMemory.h"
-#include "Simd/SimdStore.h"
-#include "Simd/SimdExtract.h"
-#include "Simd/SimdSynet.h"
-#include "Simd/SimdPow.h"
-#include "Simd/SimdExp.h"
-#include "Simd/SimdBase.h"
-#include "Simd/SimdSse1.h"
-#include "Simd/SimdAvx1.h"
-#include "Simd/SimdAvx2.h"
-#include "Simd/SimdAvx512f.h"
-#include "Simd/SimdArray.h"
-
-namespace Simd
-{
-#ifdef SIMD_AVX512F_ENABLE
-    namespace Avx512f
-    {
-        template <bool align, bool mask> SIMD_INLINE void SynetFusedLayerForward0(const float * src, const float * bias, const float * scale, __m512 sign, float * dst, size_t offset, __mmask16 tail = -1)
-        {
-            __m512 _bias = Load<align, mask>(bias + offset, tail);
-            __m512 x = _mm512_add_ps((Load<align, mask>(src + offset, tail)), _bias);
-            __m512 _scale = Load<align, mask>(scale + offset, tail);
-            Store<align, mask>(dst + offset, _mm512_add_ps(_mm512_mul_ps(_mm512_sub_ps(x, AndNot(sign, x)), _scale), _mm512_max_ps(_mm512_setzero_ps(), x)), tail);
-        }
-
-        template <bool align, bool mask> SIMD_INLINE void SynetFusedLayerForward0(const float * src, __m512 bias, __m512 scale, __m512 sign, float * dst, size_t offset, __mmask16 tail = -1)
-        {
-            __m512 x = _mm512_add_ps((Load<align, mask>(src + offset, tail)), bias);
-            Store<align, mask>(dst + offset, _mm512_add_ps(_mm512_mul_ps(_mm512_sub_ps(x, AndNot(sign, x)), scale), _mm512_max_ps(_mm512_setzero_ps(), x)), tail);
-        }
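
Editor's note: these helpers lean on AVX-512 masking for the ragged end of a row. A standalone sketch with raw intrinsics; TailMask16(n) is assumed to build the low-n-bits mask shown here:

    #include <immintrin.h>

    // Process n floats (n <= 16) with one masked load/store instead of a scalar loop.
    void ScaledCopyTail(const float* src, float* dst, size_t n)
    {
        __mmask16 tail = (__mmask16)((1u << n) - 1);  // what TailMask16(n) is assumed to return
        __m512 x = _mm512_maskz_loadu_ps(tail, src);  // lanes >= n read as 0.0f
        _mm512_mask_storeu_ps(dst, tail,
            _mm512_mul_ps(x, _mm512_set1_ps(2.0f)));  // lanes >= n left untouched in memory
    }
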
-
-        template <bool align> void SynetFusedLayerForward0Nchw(const float * src, const float * bias, const float * scale, size_t channels, size_t spatial, float * dst)
-        {
-            if (align)
-                assert(Aligned(src) && Aligned(spatial, F) && Aligned(dst));
-
-            size_t aligned = AlignLo(spatial, QF);
-            size_t partial = AlignLo(spatial, F);
-            __m512 sign = _mm512_set1_ps(-0.0f);
-            __mmask16 tail = TailMask16(spatial - partial);
-            for (size_t c = 0; c < channels; ++c)
-            {
-                size_t s = 0;
-                __m512 _bias = _mm512_set1_ps(bias[c]);
-                __m512 _scale = _mm512_set1_ps(scale[c]);
-                for (; s < aligned; s += QF)
-                {
-                    SynetFusedLayerForward0<align, false>(src, _bias, _scale, sign, dst, s + F * 0);
-                    SynetFusedLayerForward0<align, false>(src, _bias, _scale, sign, dst, s + F * 1);
-                    SynetFusedLayerForward0<align, false>(src, _bias, _scale, sign, dst, s + F * 2);
-                    SynetFusedLayerForward0<align, false>(src, _bias, _scale, sign, dst, s + F * 3);
-                }
-                for (; s < partial; s += F)
-                    SynetFusedLayerForward0<align, false>(src, _bias, _scale, sign, dst, s);
-                if (s < spatial)
-                    SynetFusedLayerForward0<align, true>(src, _bias, _scale, sign, dst, s, tail);
-                src += spatial;
-                dst += spatial;
-            }
-        }
-
-        SIMD_INLINE void SynetFusedLayerForward0Nchw(const float * src, const float * bias, const float * scale, size_t channels, size_t spatial, float * dst)
-        {
-            if (Aligned(src) && Aligned(spatial, F) && Aligned(dst))
-                SynetFusedLayerForward0Nchw<true>(src, bias, scale, channels, spatial, dst);
-            else
-                SynetFusedLayerForward0Nchw<false>(src, bias, scale, channels, spatial, dst);
-        }
-
-        template <bool align> void SynetFusedLayerForward0Nhwc(const float * src, const float * bias, const float * scale, size_t channels, size_t spatial, float * dst)
-        {
-            if (align)
-                assert(Aligned(src) && Aligned(bias) && Aligned(scale) && Aligned(channels, F) && Aligned(dst));
-
-            size_t aligned = AlignLo(channels, QF);
-            size_t partial = AlignLo(channels, F);
-            __m512 sign = _mm512_set1_ps(-0.0f);
-            __mmask16 tail = TailMask16(channels - partial);
-            for (size_t s = 0; s < spatial; ++s)
-            {
-                size_t c = 0;
-                for (; c < aligned; c += QF)
-                {
-                    SynetFusedLayerForward0<align, false>(src, bias, scale, sign, dst, c + F * 0);
-                    SynetFusedLayerForward0<align, false>(src, bias, scale, sign, dst, c + F * 1);
-                    SynetFusedLayerForward0<align, false>(src, bias, scale, sign, dst, c + F * 2);
-                    SynetFusedLayerForward0<align, false>(src, bias, scale, sign, dst, c + F * 3);
-                }
-                for (; c < partial; c += F)
-                    SynetFusedLayerForward0<align, false>(src, bias, scale, sign, dst, c);
-                if (c < channels)
-                    SynetFusedLayerForward0<align, true>(src, bias, scale, sign, dst, c, tail);
-                src += channels;
-                dst += channels;
-            }
-        }
-
-        SIMD_INLINE void SynetFusedLayerForward0Nhwc(const float * src, const float * bias, const float * scale, size_t channels, size_t spatial, float * dst)
-        {
-            if (Aligned(src) && Aligned(bias) && Aligned(scale) && Aligned(channels, F) && Aligned(dst))
-                SynetFusedLayerForward0Nhwc<true>(src, bias, scale, channels, spatial, dst);
-            else
-                SynetFusedLayerForward0Nhwc<false>(src, bias, scale, channels, spatial, dst);
-        }
-
-        template <bool align> void SynetFusedLayerForward0Nchw16c(const float * src, const float * bias, const float * scale, size_t channels, size_t spatial, float * dst)
-        {
-            if (align)
-                assert(Aligned(src) && Aligned(dst));
-
-            size_t spatialF = spatial * F;
-            size_t spatial4F = AlignLo(spatial, 4) * F;
-            __m512 sign = _mm512_set1_ps(-0.0f);
-            for (size_t c = 0; c < channels; c += F)
-            {
-                __m512 _bias = Load<false>(bias + c);
-                __m512 _scale = Load<false>(scale + c);
-                size_t s = 0;
-                for (; s < spatial4F; s += 4 * F)
-                {
-                    SynetFusedLayerForward0<align, false>(src, _bias, _scale, sign, dst, s + F * 0);
-                    SynetFusedLayerForward0<align, false>(src, _bias, _scale, sign, dst, s + F * 1);
-                    SynetFusedLayerForward0<align, false>(src, _bias, _scale, sign, dst, s + F * 2);
-                    SynetFusedLayerForward0<align, false>(src, _bias, _scale, sign, dst, s + F * 3);
-                }
-                for (; s < spatialF; s += F)
-                    SynetFusedLayerForward0<align, false>(src, _bias, _scale, sign, dst, s);
-                src += spatialF;
-                dst += spatialF;
-            }
-        }
-
-        SIMD_INLINE void SynetFusedLayerForward0Nchw16c(const float * src, const float * bias, const float * scale, size_t channels, size_t spatial, float * dst)
-        {
-            if (Aligned(src) && Aligned(dst))
-                SynetFusedLayerForward0Nchw16c<true>(src, bias, scale, channels, spatial, dst);
-            else
-                SynetFusedLayerForward0Nchw16c<false>(src, bias, scale, channels, spatial, dst);
-        }
-
-        void SynetFusedLayerForward0(const float * src, const float * bias, const float * scale, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format)
-        {
-            if (Base::NchwCompatible(channels, spatial, format))
-                SynetFusedLayerForward0Nchw(src, bias, scale, channels, spatial, dst);
-            else if (Base::NhwcCompatible(channels, spatial, format))
-                SynetFusedLayerForward0Nhwc(src, bias, scale, channels, spatial, dst);
-            else if (format == SimdTensorFormatNchw4c)
-                Sse::SynetFusedLayerForward0(src, bias, scale, channels, spatial, dst, format);
-            else if (format == SimdTensorFormatNchw8c)
-                Avx::SynetFusedLayerForward0(src, bias, scale, channels, spatial, dst, format);
-            else if (format == SimdTensorFormatNchw16c)
-                SynetFusedLayerForward0Nchw16c(src, bias, scale, channels, spatial, dst);
-            else
-                Base::SynetFusedLayerForward0(src, bias, scale, channels, spatial, dst, format);
-        }
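
Editor's note: per element, the fused layer 0 kernel evaluates, with x = src + bias, dst = (x - |x|)·scale + max(0, x); since x - |x| is 0 for x >= 0 and 2x otherwise, this is a ReLU whose negative branch is scaled. A scalar reference derived directly from the SIMD code above:

    #include <algorithm>
    #include <cmath>

    // Scalar reference for SynetFusedLayerForward0 (sketch, not library code).
    float FusedForward0(float src, float bias, float scale)
    {
        float x = src + bias;
        return (x - std::abs(x)) * scale + std::max(0.0f, x); // negative part weighted by 2*scale
    }
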
-
-        //---------------------------------------------------------------------
-
-        template <bool align, bool mask> SIMD_INLINE void SynetFusedLayerForward1(const float * src, const float * bias0, const float * scale1, const float * bias1, float * dst, size_t offset, __mmask16 tail = -1)
-        {
-            __m512 _bias0 = Load<align, mask>(bias0 + offset, tail);
-            __m512 x = _mm512_add_ps((Load<align, mask>(src + offset, tail)), _bias0);
-            __m512 _scale1 = Load<align, mask>(scale1 + offset, tail);
-            __m512 _bias1 = Load<align, mask>(bias1 + offset, tail);
-            Store<align, mask>(dst + offset, _mm512_add_ps(_mm512_fmadd_ps(_mm512_max_ps(_mm512_setzero_ps(), _mm512_sub_ps(_mm512_setzero_ps(), x)), _scale1, _bias1), _mm512_max_ps(_mm512_setzero_ps(), x)), tail);
-        }
-
-        template <bool align, bool mask> SIMD_INLINE void SynetFusedLayerForward1(const float * src, __m512 bias0, __m512 scale1, __m512 bias1, float * dst, size_t offset, __mmask16 tail = -1)
-        {
-            __m512 x = _mm512_add_ps((Load<align, mask>(src + offset, tail)), bias0);
-            Store<align, mask>(dst + offset, _mm512_add_ps(_mm512_fmadd_ps(_mm512_max_ps(_mm512_setzero_ps(), _mm512_sub_ps(_mm512_setzero_ps(), x)), scale1, bias1), _mm512_max_ps(_mm512_setzero_ps(), x)), tail);
-        }
-
-        template <bool align> void SynetFusedLayerForward1Nchw(const float * src, const float * bias0, const float * scale1, const float * bias1, size_t channels, size_t spatial, float * dst)
-        {
-            if (align)
-                assert(Aligned(src) && Aligned(spatial, F) && Aligned(dst));
-
-            size_t aligned = AlignLo(spatial, QF);
-            size_t partial = AlignLo(spatial, F);
-            __mmask16 tail = TailMask16(spatial - partial);
-            for (size_t c = 0; c < channels; ++c)
-            {
-                size_t s = 0;
-                __m512 _bias0 = _mm512_set1_ps(bias0[c]);
-                __m512 _scale1 = _mm512_set1_ps(scale1[c]);
-                __m512 _bias1 = _mm512_set1_ps(bias1[c]);
-                for (; s < aligned; s += QF)
-                {
-                    SynetFusedLayerForward1<align, false>(src, _bias0, _scale1, _bias1, dst, s + F * 0);
-                    SynetFusedLayerForward1<align, false>(src, _bias0, _scale1, _bias1, dst, s + F * 1);
-                    SynetFusedLayerForward1<align, false>(src, _bias0, _scale1, _bias1, dst, s + F * 2);
-                    SynetFusedLayerForward1<align, false>(src, _bias0, _scale1, _bias1, dst, s + F * 3);
-                }
-                for (; s < partial; s += F)
-                    SynetFusedLayerForward1<align, false>(src, _bias0, _scale1, _bias1, dst, s);
-                if (s < spatial)
-                    SynetFusedLayerForward1<align, true>(src, _bias0, _scale1, _bias1, dst, s, tail);
-                src += spatial;
-                dst += spatial;
-            }
-        }
-
-        SIMD_INLINE void SynetFusedLayerForward1Nchw(const float * src, const float * bias0, const float * scale1, const float * bias1, size_t channels, size_t spatial, float * dst)
-        {
-            if (Aligned(src) && Aligned(spatial, F) && Aligned(dst))
-                SynetFusedLayerForward1Nchw<true>(src, bias0, scale1, bias1, channels, spatial, dst);
-            else
-                SynetFusedLayerForward1Nchw<false>(src, bias0, scale1, bias1, channels, spatial, dst);
-        }
-
-        template <bool align> void SynetFusedLayerForward1Nhwc(const float * src, const float * bias0, const float * scale1, const float * bias1, size_t channels, size_t spatial, float * dst)
-        {
-            if (align)
-                assert(Aligned(src) && Aligned(bias0) && Aligned(scale1) && Aligned(bias1) && Aligned(channels, F) && Aligned(dst));
-
-            size_t aligned = AlignLo(channels, QF);
-            size_t partial = AlignLo(channels, F);
-            __mmask16 tail = TailMask16(channels - partial);
-            for (size_t s = 0; s < spatial; ++s)
-            {
-                size_t c = 0;
-                for (; c < aligned; c += QF)
-                {
-                    SynetFusedLayerForward1<align, false>(src, bias0, scale1, bias1, dst, c + F * 0);
-                    SynetFusedLayerForward1<align, false>(src, bias0, scale1, bias1, dst, c + F * 1);
-                    SynetFusedLayerForward1<align, false>(src, bias0, scale1, bias1, dst, c + F * 2);
-                    SynetFusedLayerForward1<align, false>(src, bias0, scale1, bias1, dst, c + F * 3);
-                }
-                for (; c < partial; c += F)
-                    SynetFusedLayerForward1<align, false>(src, bias0, scale1, bias1, dst, c);
-                if (c < channels)
-                    SynetFusedLayerForward1<align, true>(src, bias0, scale1, bias1, dst, c, tail);
-                src += channels;
-                dst += channels;
-            }
-        }
-
-        SIMD_INLINE void SynetFusedLayerForward1Nhwc(const float * src, const float * bias0, const float * scale1, const float * bias1, size_t channels, size_t spatial, float * dst)
-        {
-            if (Aligned(src) && Aligned(bias0) && Aligned(scale1) && Aligned(bias1) && Aligned(channels, F) && Aligned(dst))
-                SynetFusedLayerForward1Nhwc<true>(src, bias0, scale1, bias1, channels, spatial, dst);
-            else
-                SynetFusedLayerForward1Nhwc<false>(src, bias0, scale1, bias1, channels, spatial, dst);
-        }
-
-        template <bool align> void SynetFusedLayerForward1Nchw16c(const float * src, const float * bias0, const float * scale1, const float * bias1, size_t channels, size_t spatial, float * dst)
-        {
-            if (align)
-                assert(Aligned(src) && Aligned(dst));
-
-            size_t spatialF = spatial * F;
-            size_t spatial4F = AlignLo(spatial, 4) * F;
-            for (size_t c = 0; c < channels; c += F)
-            {
-                __m512 _bias0 = Load<false>(bias0 + c);
-                __m512 _scale1 = Load<false>(scale1 + c);
-                __m512 _bias1 = Load<false>(bias1 + c);
-                size_t s = 0;
-                for (; s < spatial4F; s += 4 * F)
-                {
-                    SynetFusedLayerForward1<align, false>(src, _bias0, _scale1, _bias1, dst, s + F * 0);
-                    SynetFusedLayerForward1<align, false>(src, _bias0, _scale1, _bias1, dst, s + F * 1);
-                    SynetFusedLayerForward1<align, false>(src, _bias0, _scale1, _bias1, dst, s + F * 2);
-                    SynetFusedLayerForward1<align, false>(src, _bias0, _scale1, _bias1, dst, s + F * 3);
-                }
-                for (; s < spatialF; s += F)
-                    SynetFusedLayerForward1<align, false>(src, _bias0, _scale1, _bias1, dst, s);
-                src += spatialF;
-                dst += spatialF;
-            }
-        }
-
-        SIMD_INLINE void SynetFusedLayerForward1Nchw16c(const float * src, const float * bias0, const float * scale1, const float * bias1, size_t channels, size_t spatial, float * dst)
-        {
-            if (Aligned(src) && Aligned(dst))
-                SynetFusedLayerForward1Nchw16c<true>(src, bias0, scale1, bias1, channels, spatial, dst);
-            else
-                SynetFusedLayerForward1Nchw16c<false>(src, bias0, scale1, bias1, channels, spatial, dst);
-        }
-
-        void SynetFusedLayerForward1(const float * src, const float * bias0, const float * scale1, const float * bias1, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format)
-        {
-            if (Base::NchwCompatible(channels, spatial, format))
-                SynetFusedLayerForward1Nchw(src, bias0, scale1, bias1, channels, spatial, dst);
-            else if (Base::NhwcCompatible(channels, spatial, format))
-                SynetFusedLayerForward1Nhwc(src, bias0, scale1, bias1, channels, spatial, dst);
-            else if (format == SimdTensorFormatNchw4c)
-                Sse::SynetFusedLayerForward1(src, bias0, scale1, bias1, channels, spatial, dst, format);
-            else if (format == SimdTensorFormatNchw8c)
-                Avx::SynetFusedLayerForward1(src, bias0, scale1, bias1, channels, spatial, dst, format);
-            else if (format == SimdTensorFormatNchw16c)
-                SynetFusedLayerForward1Nchw16c(src, bias0, scale1, bias1, channels, spatial, dst);
-            else
-                Base::SynetFusedLayerForward1(src, bias0, scale1, bias1, channels, spatial, dst, format);
-        }
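
Editor's note: fused layer 1 is a ReLU plus an affine map of the clipped negative part. Scalar reference derived from the SIMD code above (sketch only):

    #include <algorithm>

    // Scalar reference for SynetFusedLayerForward1.
    float FusedForward1(float src, float bias0, float scale1, float bias1)
    {
        float x = src + bias0;
        return std::max(0.0f, -x) * scale1 + bias1 + std::max(0.0f, x);
    }
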
-
-        //---------------------------------------------------------------------
-
-        template <bool align, bool mask> SIMD_INLINE void SynetFusedLayerForward2(const float * src, const float * scale, const float * bias, __m512 slope, float * dst, size_t offset, __mmask16 tail = -1)
-        {
-            __m512 _src = Load<align, mask>(src + offset, tail);
-            __m512 _scale = Load<align, mask>(scale + offset, tail);
-            __m512 _bias = Load<align, mask>(bias + offset, tail);
-            __m512 x = _mm512_fmadd_ps(_src, _scale, _bias);
-            __m512 _dst = _mm512_add_ps(_mm512_max_ps(_mm512_setzero_ps(), x), _mm512_mul_ps(_mm512_min_ps(_mm512_setzero_ps(), x), slope));
-            Store<align, mask>(dst + offset, _dst, tail);
-        }
-
-        template <bool align, bool mask> SIMD_INLINE void SynetFusedLayerForward2(const float * src, __m512 scale, __m512 bias, __m512 slope, float * dst, size_t offset, __mmask16 tail = -1)
-        {
-            __m512 _src = Load<align, mask>(src + offset, tail);
-            __m512 x = _mm512_fmadd_ps(_src, scale, bias);
-            __m512 _dst = _mm512_add_ps(_mm512_max_ps(_mm512_setzero_ps(), x), _mm512_mul_ps(_mm512_min_ps(_mm512_setzero_ps(), x), slope));
-            Store<align, mask>(dst + offset, _dst, tail);
-        }
-
-        template <bool align> void SynetFusedLayerForward2Nchw(const float * src, const float * scale, const float * bias, size_t channels, size_t spatial, const float * slope, float * dst)
-        {
-            if (align)
-                assert(Aligned(src) && Aligned(spatial, F) && Aligned(dst));
-
-            __m512 _slope = _mm512_set1_ps(slope[0]);
-            size_t aligned = AlignLo(spatial, QF);
-            size_t partial = AlignLo(spatial, F);
-            __mmask16 tail = TailMask16(spatial - partial);
-            for (size_t c = 0; c < channels; ++c)
-            {
-                size_t s = 0;
-                __m512 _scale = _mm512_set1_ps(scale[c]);
-                __m512 _bias = _mm512_set1_ps(bias[c]);
-                for (; s < aligned; s += QF)
-                {
-                    SynetFusedLayerForward2<align, false>(src, _scale, _bias, _slope, dst, s + F * 0);
-                    SynetFusedLayerForward2<align, false>(src, _scale, _bias, _slope, dst, s + F * 1);
-                    SynetFusedLayerForward2<align, false>(src, _scale, _bias, _slope, dst, s + F * 2);
-                    SynetFusedLayerForward2<align, false>(src, _scale, _bias, _slope, dst, s + F * 3);
-                }
-                for (; s < partial; s += F)
-                    SynetFusedLayerForward2<align, false>(src, _scale, _bias, _slope, dst, s);
-                if (s < spatial)
-                    SynetFusedLayerForward2<align, true>(src, _scale, _bias, _slope, dst, s, tail);
-                src += spatial;
-                dst += spatial;
-            }
-        }
-
-        SIMD_INLINE void SynetFusedLayerForward2Nchw(const float * src, const float * scale, const float * bias, size_t channels, size_t spatial, const float * slope, float * dst)
-        {
-            if (Aligned(src) && Aligned(spatial, F) && Aligned(dst))
-                SynetFusedLayerForward2Nchw<true>(src, scale, bias, channels, spatial, slope, dst);
-            else
-                SynetFusedLayerForward2Nchw<false>(src, scale, bias, channels, spatial, slope, dst);
-        }
-
-        template <bool align> void SynetFusedLayerForward2Nhwc(const float * src, const float * scale, const float * bias, size_t channels, size_t spatial, const float * slope, float * dst)
-        {
-            if (align)
-                assert(Aligned(src) && Aligned(scale) && Aligned(bias) && Aligned(channels, F) && Aligned(dst));
-
-            __m512 _slope = _mm512_set1_ps(slope[0]);
-            size_t aligned = AlignLo(channels, QF);
-            size_t partial = AlignLo(channels, F);
-            __mmask16 tail = TailMask16(channels - partial);
-            for (size_t s = 0; s < spatial; ++s)
-            {
-                size_t c = 0;
-                for (; c < aligned; c += QF)
-                {
-                    SynetFusedLayerForward2<align, false>(src, scale, bias, _slope, dst, c + F * 0);
-                    SynetFusedLayerForward2<align, false>(src, scale, bias, _slope, dst, c + F * 1);
-                    SynetFusedLayerForward2<align, false>(src, scale, bias, _slope, dst, c + F * 2);
-                    SynetFusedLayerForward2<align, false>(src, scale, bias, _slope, dst, c + F * 3);
-                }
-                for (; c < partial; c += F)
-                    SynetFusedLayerForward2<align, false>(src, scale, bias, _slope, dst, c);
-                if (c < channels)
-                    SynetFusedLayerForward2<align, true>(src, scale, bias, _slope, dst, c, tail);
-                src += channels;
-                dst += channels;
-            }
-        }
-
-        SIMD_INLINE void SynetFusedLayerForward2Nhwc(const float * src, const float * scale, const float * bias, size_t channels, size_t spatial, const float * slope, float * dst)
-        {
-            if (Aligned(src) && Aligned(scale) && Aligned(bias) && Aligned(channels, F) && Aligned(dst))
-                SynetFusedLayerForward2Nhwc<true>(src, scale, bias, channels, spatial, slope, dst);
-            else
-                SynetFusedLayerForward2Nhwc<false>(src, scale, bias, channels, spatial, slope, dst);
-        }
-
-        template <bool align> void SynetFusedLayerForward2Nchw16c(const float * src, const float * scale, const float * bias, size_t channels, size_t spatial, const float * slope, float * dst)
-        {
-            if (align)
-                assert(Aligned(src) && Aligned(dst));
-
-            __m512 _slope = _mm512_set1_ps(slope[0]);
-            size_t spatialF = spatial * F;
-            size_t spatial4F = AlignLo(spatial, 4) * F;
-            for (size_t c = 0; c < channels; c += F)
-            {
-                __m512 _scale = Load<false>(scale + c);
-                __m512 _bias = Load<false>(bias + c);
-                size_t s = 0;
-                for (; s < spatial4F; s += 4 * F)
-                {
-                    SynetFusedLayerForward2<align, false>(src, _scale, _bias, _slope, dst, s + F * 0);
-                    SynetFusedLayerForward2<align, false>(src, _scale, _bias, _slope, dst, s + F * 1);
-                    SynetFusedLayerForward2<align, false>(src, _scale, _bias, _slope, dst, s + F * 2);
-                    SynetFusedLayerForward2<align, false>(src, _scale, _bias, _slope, dst, s + F * 3);
-                }
-                for (; s < spatialF; s += F)
-                    SynetFusedLayerForward2<align, false>(src, _scale, _bias, _slope, dst, s);
-                src += spatialF;
-                dst += spatialF;
-            }
-        }
-
-        SIMD_INLINE void SynetFusedLayerForward2Nchw16c(const float * src, const float * scale, const float * bias, size_t channels, size_t spatial, const float * slope, float * dst)
-        {
-            if (Aligned(src) && Aligned(dst))
-                SynetFusedLayerForward2Nchw16c<true>(src, scale, bias, channels, spatial, slope, dst);
-            else
-                SynetFusedLayerForward2Nchw16c<false>(src, scale, bias, channels, spatial, slope, dst);
-        }
-
-        void SynetFusedLayerForward2(const float * src, const float * scale, const float * bias, size_t channels, size_t spatial, const float * slope, float * dst, SimdTensorFormatType format)
-        {
-            if (Base::NchwCompatible(channels, spatial, format))
-                SynetFusedLayerForward2Nchw(src, scale, bias, channels, spatial, slope, dst);
-            else if (Base::NhwcCompatible(channels, spatial, format))
-                SynetFusedLayerForward2Nhwc(src, scale, bias, channels, spatial, slope, dst);
-            else if (format == SimdTensorFormatNchw4c)
-                Sse::SynetFusedLayerForward2(src, scale, bias, channels, spatial, slope, dst, format);
-            else if (format == SimdTensorFormatNchw8c)
-                Avx::SynetFusedLayerForward2(src, scale, bias, channels, spatial, slope, dst, format);
-            else if (format == SimdTensorFormatNchw16c)
-                SynetFusedLayerForward2Nchw16c(src, scale, bias, channels, spatial, slope, dst);
-            else
-                Base::SynetFusedLayerForward2(src, scale, bias, channels, spatial, slope, dst, format);
-        }
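
Editor's note: fused layer 2 is a per-channel scale and shift followed by a leaky ReLU with a single shared slope. Scalar reference derived from the SIMD code above (sketch only):

    #include <algorithm>

    // Scalar reference for SynetFusedLayerForward2.
    float FusedForward2(float src, float scale, float bias, float slope)
    {
        float x = src * scale + bias;
        return std::max(0.0f, x) + slope * std::min(0.0f, x);
    }
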
-
-        //---------------------------------------------------------------------
-
-        template <bool align, bool mask> SIMD_INLINE void SynetFusedLayerForward3(const float * src, const float * bias, const float * scale, float * dst, size_t offset, __mmask16 tail = -1)
-        {
-            __m512 _bias = Load<align, mask>(bias + offset, tail);
-            __m512 x = _mm512_add_ps((Load<align, mask>(src + offset, tail)), _bias);
-            __m512 _scale = Load<align, mask>(scale + offset, tail);
-            __m512 pos = _mm512_max_ps(_mm512_setzero_ps(), x);
-            __m512 neg = _mm512_min_ps(_mm512_setzero_ps(), x);
-            Store<align, mask>(dst + offset, _mm512_add_ps(pos, _mm512_mul_ps(_scale, neg)), tail);
-        }
-
-        template <bool align, bool mask> SIMD_INLINE void SynetFusedLayerForward3(const float * src, __m512 bias, __m512 scale, float * dst, size_t offset, __mmask16 tail = -1)
-        {
-            __m512 x = _mm512_add_ps((Load<align, mask>(src + offset, tail)), bias);
-            __m512 pos = _mm512_max_ps(_mm512_setzero_ps(), x);
-            __m512 neg = _mm512_min_ps(_mm512_setzero_ps(), x);
-            Store<align, mask>(dst + offset, _mm512_add_ps(pos, _mm512_mul_ps(scale, neg)), tail);
-        }
-
-        template <bool align> void SynetFusedLayerForward3Nchw(const float * src, const float * bias, const float * scale, size_t channels, size_t spatial, float * dst)
-        {
-            if (align)
-                assert(Aligned(src) && Aligned(spatial, F) && Aligned(dst));
-
-            size_t aligned = AlignLo(spatial, QF);
-            size_t partial = AlignLo(spatial, F);
-            __mmask16 tail = TailMask16(spatial - partial);
-            for (size_t c = 0; c < channels; ++c)
-            {
-                size_t s = 0;
-                __m512 _bias = _mm512_set1_ps(bias[c]);
-                __m512 _scale = _mm512_set1_ps(scale[c]);
-                for (; s < aligned; s += QF)
-                {
-                    SynetFusedLayerForward3<align, false>(src, _bias, _scale, dst, s + F * 0);
-                    SynetFusedLayerForward3<align, false>(src, _bias, _scale, dst, s + F * 1);
-                    SynetFusedLayerForward3<align, false>(src, _bias, _scale, dst, s + F * 2);
-                    SynetFusedLayerForward3<align, false>(src, _bias, _scale, dst, s + F * 3);
-                }
-                for (; s < partial; s += F)
-                    SynetFusedLayerForward3<align, false>(src, _bias, _scale, dst, s);
-                if (s < spatial)
-                    SynetFusedLayerForward3<align, true>(src, _bias, _scale, dst, s, tail);
-                src += spatial;
-                dst += spatial;
-            }
-        }
-
-        SIMD_INLINE void SynetFusedLayerForward3Nchw(const float * src, const float * bias, const float * scale, size_t channels, size_t spatial, float * dst)
-        {
-            if (Aligned(src) && Aligned(spatial, F) && Aligned(dst))
-                SynetFusedLayerForward3Nchw<true>(src, bias, scale, channels, spatial, dst);
-            else
-                SynetFusedLayerForward3Nchw<false>(src, bias, scale, channels, spatial, dst);
-        }
-
-        template <bool align> void SynetFusedLayerForward3Nhwc(const float * src, const float * bias, const float * scale, size_t channels, size_t spatial, float * dst)
-        {
-            if (align)
-                assert(Aligned(src) && Aligned(bias) && Aligned(scale) && Aligned(channels, F) && Aligned(dst));
-
-            size_t aligned = AlignLo(channels, QF);
-            size_t partial = AlignLo(channels, F);
-            __mmask16 tail = TailMask16(channels - partial);
-            for (size_t s = 0; s < spatial; ++s)
-            {
-                size_t c = 0;
-                for (; c < aligned; c += QF)
-                {
-                    SynetFusedLayerForward3<align, false>(src, bias, scale, dst, c + F * 0);
-                    SynetFusedLayerForward3<align, false>(src, bias, scale, dst, c + F * 1);
-                    SynetFusedLayerForward3<align, false>(src, bias, scale, dst, c + F * 2);
-                    SynetFusedLayerForward3<align, false>(src, bias, scale, dst, c + F * 3);
-                }
-                for (; c < partial; c += F)
-                    SynetFusedLayerForward3<align, false>(src, bias, scale, dst, c);
-                if (c < channels)
-                    SynetFusedLayerForward3<align, true>(src, bias, scale, dst, c, tail);
-                src += channels;
-                dst += channels;
-            }
-        }
-
-        SIMD_INLINE void SynetFusedLayerForward3Nhwc(const float * src, const float * bias, const float * scale, size_t channels, size_t spatial, float * dst)
-        {
-            if (Aligned(src) && Aligned(bias) && Aligned(scale) && Aligned(channels, F) && Aligned(dst))
-                SynetFusedLayerForward3Nhwc<true>(src, bias, scale, channels, spatial, dst);
-            else
-                SynetFusedLayerForward3Nhwc<false>(src, bias, scale, channels, spatial, dst);
-        }
-
-        template <bool align> void SynetFusedLayerForward3Nchw16c(const float * src, const float * bias, const float * scale, size_t channels, size_t spatial, float * dst)
-        {
-            if (align)
-                assert(Aligned(src) && Aligned(dst));
-
-            size_t spatialF = spatial * F;
-            size_t spatial4F = AlignLo(spatial, 4) * F;
-            for (size_t c = 0; c < channels; c += F)
-            {
-                __m512 _bias = Load<false>(bias + c);
-                __m512 _scale = Load<false>(scale + c);
-                size_t s = 0;
-                for (; s < spatial4F; s += 4 * F)
-                {
-                    SynetFusedLayerForward3<align, false>(src, _bias, _scale, dst, s + F * 0);
-                    SynetFusedLayerForward3<align, false>(src, _bias, _scale, dst, s + F * 1);
-                    SynetFusedLayerForward3<align, false>(src, _bias, _scale, dst, s + F * 2);
-                    SynetFusedLayerForward3<align, false>(src, _bias, _scale, dst, s + F * 3);
-                }
-                for (; s < spatialF; s += F)
-                    SynetFusedLayerForward3<align, false>(src, _bias, _scale, dst, s);
-                src += spatialF;
-                dst += spatialF;
-            }
-        }
-
-        SIMD_INLINE void SynetFusedLayerForward3Nchw16c(const float * src, const float * bias, const float * scale, size_t channels, size_t spatial, float * dst)
-        {
-            if (Aligned(src) && Aligned(dst))
-                SynetFusedLayerForward3Nchw16c<true>(src, bias, scale, channels, spatial, dst);
-            else
-                SynetFusedLayerForward3Nchw16c<false>(src, bias, scale, channels, spatial, dst);
-        }
-
-        void SynetFusedLayerForward3(const float * src, const float * bias, const float * scale, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format)
-        {
-            if (Base::NchwCompatible(channels, spatial, format))
-                SynetFusedLayerForward3Nchw(src, bias, scale, channels, spatial, dst);
-            else if (Base::NhwcCompatible(channels, spatial, format))
-                SynetFusedLayerForward3Nhwc(src, bias, scale, channels, spatial, dst);
-            else if (format == SimdTensorFormatNchw4c)
-                Sse::SynetFusedLayerForward3(src, bias, scale, channels, spatial, dst, format);
-            else if (format == SimdTensorFormatNchw8c)
-                Avx::SynetFusedLayerForward3(src, bias, scale, channels, spatial, dst, format);
-            else if (format == SimdTensorFormatNchw16c)
-                SynetFusedLayerForward3Nchw16c(src, bias, scale, channels, spatial, dst);
-            else
-                Base::SynetFusedLayerForward3(src, bias, scale, channels, spatial, dst, format);
-        }
-
-        //---------------------------------------------------------------------
-
-        template <bool align, bool mask> SIMD_INLINE void SynetFusedLayerForward4(const float * src, const float * bias0, __m512 scale1, __m512 bias1, float * dst0, float * dst1, size_t offset, __mmask16 tail = -1)
-        {
-            __m512 x = _mm512_add_ps((Load<align, mask>(src + offset, tail)), (Load<align, mask>(bias0 + offset, tail)));
-            Store<align, mask>(dst0 + offset, _mm512_max_ps(_mm512_setzero_ps(), x), tail);
-            Store<align, mask>(dst1 + offset, _mm512_max_ps(_mm512_setzero_ps(), _mm512_fmadd_ps(x, scale1, bias1)), tail);
-        }
-
-        template <bool align, bool mask> SIMD_INLINE void SynetFusedLayerForward4(const float * src, __m512 bias0, __m512 scale1, __m512 bias1, float * dst0, float * dst1, size_t offset, __mmask16 tail = -1)
-        {
-            __m512 x = _mm512_add_ps((Load<align, mask>(src + offset, tail)), bias0);
-            Store<align, mask>(dst0 + offset, _mm512_max_ps(_mm512_setzero_ps(), x), tail);
-            Store<align, mask>(dst1 + offset, _mm512_max_ps(_mm512_setzero_ps(), _mm512_fmadd_ps(x, scale1, bias1)), tail);
-        }
-
-        template <bool align> void SynetFusedLayerForward4Nchw(const float * src, const float * bias0, const float * scale1, const float * bias1, size_t channels, size_t spatial, float * dst0)
-        {
-            if (align)
-                assert(Aligned(src) && Aligned(spatial, F) && Aligned(dst0));
-
-            __m512 _bias1 = _mm512_set1_ps(bias1[0]);
-            __m512 _scale1 = _mm512_set1_ps(scale1[0]);
-            size_t aligned = AlignLo(spatial, QF);
-            size_t partial = AlignLo(spatial, F);
-            __mmask16 tail = TailMask16(spatial - partial);
-            float * dst1 = dst0 + channels * spatial;
-            for (size_t c = 0; c < channels; ++c)
-            {
-                size_t s = 0;
-                __m512 _bias0 = _mm512_set1_ps(bias0[c]);
-                for (; s < aligned; s += QF)
-                {
-                    SynetFusedLayerForward4<align, false>(src, _bias0, _scale1, _bias1, dst0, dst1, s + F * 0);
-                    SynetFusedLayerForward4<align, false>(src, _bias0, _scale1, _bias1, dst0, dst1, s + F * 1);
-                    SynetFusedLayerForward4<align, false>(src, _bias0, _scale1, _bias1, dst0, dst1, s + F * 2);
-                    SynetFusedLayerForward4<align, false>(src, _bias0, _scale1, _bias1, dst0, dst1, s + F * 3);
-                }
-                for (; s < partial; s += F)
-                    SynetFusedLayerForward4<align, false>(src, _bias0, _scale1, _bias1, dst0, dst1, s);
-                if (s < spatial)
-                    SynetFusedLayerForward4<align, true>(src, _bias0, _scale1, _bias1, dst0, dst1, s, tail);
-                src += spatial;
-                dst0 += spatial;
-                dst1 += spatial;
-            }
-        }
-
-        SIMD_INLINE void SynetFusedLayerForward4Nchw(const float * src, const float * bias0, const float * scale1, const float * bias1, size_t channels, size_t spatial, float * dst)
-        {
-            if (Aligned(src) && Aligned(spatial, F) && Aligned(dst))
-                SynetFusedLayerForward4Nchw<true>(src, bias0, scale1, bias1, channels, spatial, dst);
-            else
-                SynetFusedLayerForward4Nchw<false>(src, bias0, scale1, bias1, channels, spatial, dst);
-        }
-
-        template <bool align> void SynetFusedLayerForward4Nhwc(const float * src, const float * bias0, const float * scale1, const float * bias1, size_t channels, size_t spatial, float * dst0)
-        {
-            if (align)
-                assert(Aligned(src) && Aligned(bias0) && Aligned(channels, F) && Aligned(dst0));
-
-            __m512 _bias1 = _mm512_set1_ps(bias1[0]);
-            __m512 _scale1 = _mm512_set1_ps(scale1[0]);
-            size_t aligned = AlignLo(channels, QF);
-            size_t partial = AlignLo(channels, F);
-            __mmask16 tail = TailMask16(channels - partial);
-            float * dst1 = dst0 + channels;
-            for (size_t s = 0; s < spatial; ++s)
-            {
-                size_t c = 0;
-                for (; c < aligned; c += QF)
-                {
-                    SynetFusedLayerForward4<align, false>(src, bias0, _scale1, _bias1, dst0, dst1, c + F * 0);
-                    SynetFusedLayerForward4<align, false>(src, bias0, _scale1, _bias1, dst0, dst1, c + F * 1);
-                    SynetFusedLayerForward4<align, false>(src, bias0, _scale1, _bias1, dst0, dst1, c + F * 2);
-                    SynetFusedLayerForward4<align, false>(src, bias0, _scale1, _bias1, dst0, dst1, c + F * 3);
-                }
-                for (; c < partial; c += F)
-                    SynetFusedLayerForward4<align, false>(src, bias0, _scale1, _bias1, dst0, dst1, c);
-                if (c < channels)
-                    SynetFusedLayerForward4<align, true>(src, bias0, _scale1, _bias1, dst0, dst1, c, tail);
-                src += channels;
-                dst0 += 2 * channels;
-                dst1 += 2 * channels;
-            }
-        }
-
-        SIMD_INLINE void SynetFusedLayerForward4Nhwc(const float * src, const float * bias0, const float * scale1, const float * bias1, size_t channels, size_t spatial, float * dst)
-        {
-            if (Aligned(src) && Aligned(bias0) && Aligned(channels, F) && Aligned(dst))
-                SynetFusedLayerForward4Nhwc<true>(src, bias0, scale1, bias1, channels, spatial, dst);
-            else
-                SynetFusedLayerForward4Nhwc<false>(src, bias0, scale1, bias1, channels, spatial, dst);
-        }
-
-        template <bool align> void SynetFusedLayerForward4Nchw16cA(const float * src, const float * bias0, const float * scale1, const float * bias1, size_t channels, size_t spatial, float * dst0)
-        {
-            if (align)
-                assert(Aligned(src) && Aligned(dst0));
-
-            __m512 _bias1 = _mm512_set1_ps(bias1[0]);
-            __m512 _scale1 = _mm512_set1_ps(scale1[0]);
-            size_t spatialF = spatial * F;
-            size_t spatial4F = AlignLo(spatial, 4) * F;
-            float * dst1 = dst0 + channels * spatial;
-            for (size_t c = 0; c < channels; c += F)
-            {
-                __m512 _bias0 = Load<false>(bias0 + c);
-                size_t s = 0;
-                for (; s < spatial4F; s += 4 * F)
-                {
-                    SynetFusedLayerForward4<align, false>(src, _bias0, _scale1, _bias1, dst0, dst1, s + F * 0);
-                    SynetFusedLayerForward4<align, false>(src, _bias0, _scale1, _bias1, dst0, dst1, s + F * 1);
-                    SynetFusedLayerForward4<align, false>(src, _bias0, _scale1, _bias1, dst0, dst1, s + F * 2);
-                    SynetFusedLayerForward4<align, false>(src, _bias0, _scale1, _bias1, dst0, dst1, s + F * 3);
-                }
-                for (; s < spatialF; s += F)
-                    SynetFusedLayerForward4<align, false>(src, _bias0, _scale1, _bias1, dst0, dst1, s);
-                src += spatialF;
-                dst0 += spatialF;
-                dst1 += spatialF;
-            }
-        }
-
-        SIMD_INLINE void SynetFusedLayerForward4Nchw16cA(const float * src, const float * bias0, const float * scale1, const float * bias1, size_t channels, size_t spatial, float * dst)
-        {
-            assert(Aligned(channels, F));
-            if (Aligned(src) && Aligned(dst))
-                SynetFusedLayerForward4Nchw16cA<true>(src, bias0, scale1, bias1, channels, spatial, dst);
-            else
-                SynetFusedLayerForward4Nchw16cA<false>(src, bias0, scale1, bias1, channels, spatial, dst);
-        }
-
-        void SynetFusedLayerForward4(const float * src, const float * bias0, const float * scale1, const float * bias1, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format)
-        {
-            if (Base::NchwCompatible(channels, spatial, format))
-                SynetFusedLayerForward4Nchw(src, bias0, scale1, bias1, channels, spatial, dst);
-            else if (Base::NhwcCompatible(channels, spatial, format))
-                SynetFusedLayerForward4Nhwc(src, bias0, scale1, bias1, channels, spatial, dst);
-            else if (format == SimdTensorFormatNchw4c)
-                Sse::SynetFusedLayerForward4(src, bias0, scale1, bias1, channels, spatial, dst, format);
-            else if (format == SimdTensorFormatNchw8c)
-                Avx::SynetFusedLayerForward4(src, bias0, scale1, bias1, channels, spatial, dst, format);
-            else if (format == SimdTensorFormatNchw16c && Aligned(channels, F))
-                SynetFusedLayerForward4Nchw16cA(src, bias0, scale1, bias1, channels, spatial, dst);
-            else
-                Base::SynetFusedLayerForward4(src, bias0, scale1, bias1, channels, spatial, dst, format);
-        }
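
Editor's note: fused layer 4 doubles the channel count, writing each input channel to two output channels (dst1 starts channels*spatial floats past dst0 in NCHW, channels floats past it in NHWC). Scalar reference for one element, derived from the SIMD code above (sketch only):

    #include <algorithm>

    // Scalar reference for SynetFusedLayerForward4: one input feeds two outputs.
    void FusedForward4(float src, float bias0, float scale1, float bias1, float& dst0, float& dst1)
    {
        float x = src + bias0;
        dst0 = std::max(0.0f, x);                  // first half of the doubled channel range
        dst1 = std::max(0.0f, x * scale1 + bias1); // second half, offset as described above
    }
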
-
-        //---------------------------------------------------------------------
-
-        template <bool align, bool mask> SIMD_INLINE void SynetFusedLayerForward8(const float * src0, const float * src1, const float * src2, float * dst, size_t offset, __mmask16 tail = -1)
-        {
-            Store<align, mask>(dst + offset, _mm512_add_ps((Load<align, mask>(src0 + offset, tail)),
-                _mm512_mul_ps((Load<align, mask>(src1 + offset, tail)), (Load<align, mask>(src2 + offset, tail)))), tail);
-        }
-
-        template <bool align, bool mask> SIMD_INLINE void SynetFusedLayerForward8(const float * src0, const float * src1, const __m512 & src2, float * dst, size_t offset, __mmask16 tail = -1)
-        {
-            Store<align, mask>(dst + offset, _mm512_add_ps((Load<align, mask>(src0 + offset, tail)),
-                _mm512_mul_ps((Load<align, mask>(src1 + offset, tail)), src2)), tail);
-        }
-
-        template <bool align> void SynetFusedLayerForward8Nchw(const float * src0, const float * src1, const float * src2, size_t channels, size_t spatial, float * dst)
-        {
-            if (align)
-                assert(Aligned(src0) && Aligned(src1) && Aligned(spatial, F) && Aligned(dst));
-
-            size_t aligned = AlignLo(spatial, QF);
-            size_t partial = AlignLo(spatial, F);
-            __mmask16 tail = TailMask16(spatial - partial);
-            for (size_t c = 0; c < channels; ++c)
-            {
-                size_t s = 0;
-                __m512 _src2 = _mm512_set1_ps(src2[c]);
-                for (; s < aligned; s += QF)
-                {
-                    SynetFusedLayerForward8<align, false>(src0, src1, _src2, dst, s + F * 0);
-                    SynetFusedLayerForward8<align, false>(src0, src1, _src2, dst, s + F * 1);
-                    SynetFusedLayerForward8<align, false>(src0, src1, _src2, dst, s + F * 2);
-                    SynetFusedLayerForward8<align, false>(src0, src1, _src2, dst, s + F * 3);
-                }
-                for (; s < partial; s += F)
-                    SynetFusedLayerForward8<align, false>(src0, src1, _src2, dst, s);
-                if (s < spatial)
-                    SynetFusedLayerForward8<align, true>(src0, src1, _src2, dst, s, tail);
-                src0 += spatial;
-                src1 += spatial;
-                dst += spatial;
-            }
-        }
-
-        SIMD_INLINE void SynetFusedLayerForward8Nchw(const float * src0, const float * src1, const float * src2, size_t channels, size_t spatial, float * dst)
-        {
-            if (Aligned(src0) && Aligned(src1) && Aligned(spatial, F) && Aligned(dst))
-                SynetFusedLayerForward8Nchw<true>(src0, src1, src2, channels, spatial, dst);
-            else
-                SynetFusedLayerForward8Nchw<false>(src0, src1, src2, channels, spatial, dst);
-        }
-
-        template <bool align> void SynetFusedLayerForward8Nhwc(const float * src0, const float * src1, const float * src2, size_t channels, size_t spatial, float * dst)
-        {
-            if (align)
-                assert(Aligned(src0) && Aligned(src1) && Aligned(src2) && Aligned(channels, F) && Aligned(dst));
-
-            size_t aligned = AlignLo(channels, QF);
-            size_t partial = AlignLo(channels, F);
-            __mmask16 tail = TailMask16(channels - partial);
-            for (size_t s = 0; s < spatial; ++s)
-            {
-                size_t c = 0;
-                for (; c < aligned; c += QF)
-                {
-                    SynetFusedLayerForward8<align, false>(src0, src1, src2, dst, c + F * 0);
-                    SynetFusedLayerForward8<align, false>(src0, src1, src2, dst, c + F * 1);
-                    SynetFusedLayerForward8<align, false>(src0, src1, src2, dst, c + F * 2);
-                    SynetFusedLayerForward8<align, false>(src0, src1, src2, dst, c + F * 3);
-                }
-                for (; c < partial; c += F)
-                    SynetFusedLayerForward8<align, false>(src0, src1, src2, dst, c);
-                if (c < channels)
-                    SynetFusedLayerForward8<align, true>(src0, src1, src2, dst, c, tail);
-                src0 += channels;
-                src1 += channels;
-                dst += channels;
-            }
-        }
-
-        SIMD_INLINE void SynetFusedLayerForward8Nhwc(const float * src0, const float * src1, const float * src2, size_t channels, size_t spatial, float * dst)
-        {
-            if (Aligned(src0) && Aligned(src1) && Aligned(src2) && Aligned(channels, F) && Aligned(dst))
-                SynetFusedLayerForward8Nhwc<true>(src0, src1, src2, channels, spatial, dst);
-            else
-                SynetFusedLayerForward8Nhwc<false>(src0, src1, src2, channels, spatial, dst);
-        }
-
-        template <bool align> void SynetFusedLayerForward8Nchw16c(const float * src0, const float * src1, const float * src2, size_t channels, size_t spatial, float * dst)
-        {
-            if (align)
-                assert(Aligned(src0) && Aligned(src1) && Aligned(dst));
-
-            size_t spatialF = spatial * F;
-            size_t spatial4F = AlignLo(spatial, 4) * F;
-            for (size_t c = 0; c < channels; c += F)
-            {
-                __m512 _src2 = Load<false>(src2 + c);
-                size_t s = 0;
-                for (; s < spatial4F; s += 4 * F)
-                {
-                    SynetFusedLayerForward8<align, false>(src0, src1, _src2, dst, s + F * 0);
-                    SynetFusedLayerForward8<align, false>(src0, src1, _src2, dst, s + F * 1);
-                    SynetFusedLayerForward8<align, false>(src0, src1, _src2, dst, s + F * 2);
-                    SynetFusedLayerForward8<align, false>(src0, src1, _src2, dst, s + F * 3);
-                }
-                for (; s < spatialF; s += F)
-                    SynetFusedLayerForward8<align, false>(src0, src1, _src2, dst, s);
-                src0 += spatialF;
-                src1 += spatialF;
-                dst += spatialF;
-            }
-        }
-
-        SIMD_INLINE void SynetFusedLayerForward8Nchw16c(const float * src0, const float * src1, const float * src2, size_t channels, size_t spatial, float * dst)
-        {
-            if (Aligned(src0) && Aligned(src1) && Aligned(dst))
-                SynetFusedLayerForward8Nchw16c<true>(src0, src1, src2, channels, spatial, dst);
-            else
-                SynetFusedLayerForward8Nchw16c<false>(src0, src1, src2, channels, spatial, dst);
-        }
-
-        void SynetFusedLayerForward8(const float * src0, const float * src1, const float * src2, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format)
-        {
-            if (Base::NchwCompatible(channels, spatial, format))
-                SynetFusedLayerForward8Nchw(src0, src1, src2, channels, spatial, dst);
-            else if (Base::NhwcCompatible(channels, spatial, format))
-                SynetFusedLayerForward8Nhwc(src0, src1, src2, channels, spatial, dst);
-            else if (format == SimdTensorFormatNchw4c)
-                Sse::SynetFusedLayerForward8(src0, src1, src2, channels, spatial, dst, format);
-            else if (format == SimdTensorFormatNchw8c)
-                Avx::SynetFusedLayerForward8(src0, src1, src2, channels, spatial, dst, format);
-            else if (format == SimdTensorFormatNchw16c)
-                SynetFusedLayerForward8Nchw16c(src0, src1, src2, channels, spatial, dst);
-            else
-                Base::SynetFusedLayerForward8(src0, src1, src2, channels, spatial, dst, format);
-        }
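
Editor's note: fused layer 8 is an elementwise multiply-add with a per-channel factor. Scalar reference derived from the SIMD code above (sketch only):

    // Scalar reference for SynetFusedLayerForward8: dst = src0 + src1 * src2[c].
    float FusedForward8(float src0, float src1, float src2_c)
    {
        return src0 + src1 * src2_c;
    }
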
-
-        //---------------------------------------------------------------------
-
-        template <bool align, bool mask> SIMD_INLINE void SynetFusedLayerForward9(const float * src, const float * scale, const float * bias, float * dst0, float * dst1, size_t offset, __mmask16 tail = -1)
-        {
-            __m512 _src = Load<align, mask>(src + offset, tail);
-            __m512 _scale = Load<align, mask>(scale + offset, tail);
-            __m512 _bias = Load<align, mask>(bias + offset, tail);
-            Store<align, mask>(dst0 + offset, _mm512_max_ps(_mm512_setzero_ps(), _mm512_fmadd_ps(_src, _scale, _bias)), tail);
-            Store<align, mask>(dst1 + offset, _src, tail);
-        }
-
-        template <bool align, bool mask> SIMD_INLINE void SynetFusedLayerForward9(const float * src, const float * scale, const float * bias, float * dst0, size_t offset, __mmask16 tail = -1)
-        {
-            __m512 _src = Load<align, mask>(src + offset, tail);
-            __m512 _scale = Load<align, mask>(scale + offset, tail);
-            __m512 _bias = Load<align, mask>(bias + offset, tail);
-            Store<align, mask>(dst0 + offset, _mm512_max_ps(_mm512_setzero_ps(), _mm512_fmadd_ps(_src, _scale, _bias)), tail);
-        }
-
-        template <bool align, bool mask> SIMD_INLINE void SynetFusedLayerForward9(const float * src, const __m512 & scale, const __m512 & bias, float * dst0, float * dst1, size_t offset, __mmask16 tail = -1)
-        {
-            __m512 _src = Load<align, mask>(src + offset, tail);
-            Store<align, mask>(dst0 + offset, _mm512_max_ps(_mm512_setzero_ps(), _mm512_fmadd_ps(_src, scale, bias)), tail);
-            Store<align, mask>(dst1 + offset, _src, tail);
-        }
-
-        template <bool align, bool mask> SIMD_INLINE void SynetFusedLayerForward9(const float * src, const __m512 & scale, const __m512 & bias, float * dst0, size_t offset, __mmask16 tail = -1)
-        {
-            __m512 _src = Load<align, mask>(src + offset, tail);
-            Store<align, mask>(dst0 + offset, _mm512_max_ps(_mm512_setzero_ps(), _mm512_fmadd_ps(_src, scale, bias)), tail);
-        }
-
-        template <bool align> void SynetFusedLayerForward9Nchw(const float * src0, const float * src1, const float * scale0, const float * bias0, size_t channels0, size_t channels1, size_t spatial, float * dst0, float * dst1)
-        {
-            if (align)
-                assert(Aligned(src0) && Aligned(src1) && Aligned(spatial, F) && Aligned(dst0) && Aligned(dst1));
-            const float * scale1 = scale0 + channels0;
-            const float * bias1 = bias0 + channels0;
-            size_t aligned = AlignLo(spatial, QF);
-            size_t partial = AlignLo(spatial, F);
-            __mmask16 tail = TailMask16(spatial - partial);
-            if (dst1)
-            {
-                for (size_t c = 0; c < channels0; ++c)
-                {
-                    size_t s = 0;
-                    __m512 _scale0 = _mm512_set1_ps(scale0[c]);
-                    __m512 _bias0 = _mm512_set1_ps(bias0[c]);
-                    for (; s < aligned; s += QF)
-                    {
-                        SynetFusedLayerForward9<align, false>(src0, _scale0, _bias0, dst0, dst1, s + 0 * F);
-                        SynetFusedLayerForward9<align, false>(src0, _scale0, _bias0, dst0, dst1, s + 1 * F);
-                        SynetFusedLayerForward9<align, false>(src0, _scale0, _bias0, dst0, dst1, s + 2 * F);
-                        SynetFusedLayerForward9<align, false>(src0, _scale0, _bias0, dst0, dst1, s + 3 * F);
-                    }
-                    for (; s < partial; s += F)
-                        SynetFusedLayerForward9<align, false>(src0, _scale0, _bias0, dst0, dst1, s);
-                    if (s < spatial)
-                        SynetFusedLayerForward9<align, true>(src0, _scale0, _bias0, dst0, dst1, s, tail);
-                    src0 += spatial;
-                    dst0 += spatial;
-                    dst1 += spatial;
-                }
-                for (size_t c = 0; c < channels1; ++c)
-                {
-                    size_t s = 0;
-                    __m512 _scale1 = _mm512_set1_ps(scale1[c]);
-                    __m512 _bias1 = _mm512_set1_ps(bias1[c]);
-                    for (; s < aligned; s += QF)
-                    {
-                        SynetFusedLayerForward9<align, false>(src1, _scale1, _bias1, dst0, dst1, s + 0 * F);
-                        SynetFusedLayerForward9<align, false>(src1, _scale1, _bias1, dst0, dst1, s + 1 * F);
-                        SynetFusedLayerForward9<align, false>(src1, _scale1, _bias1, dst0, dst1, s + 2 * F);
-                        SynetFusedLayerForward9<align, false>(src1, _scale1, _bias1, dst0, dst1, s + 3 * F);
-                    }
-                    for (; s < partial; s += F)
-                        SynetFusedLayerForward9<align, false>(src1, _scale1, _bias1, dst0, dst1, s);
-                    if (s < spatial)
-                        SynetFusedLayerForward9<align, true>(src1, _scale1, _bias1, dst0, dst1, s, tail);
-                    src1 += spatial;
-                    dst0 += spatial;
-                    dst1 += spatial;
-                }
-            }
-            else
-            {
-                for (size_t c = 0; c < channels0; ++c)
-                {
-                    size_t s = 0;
-                    __m512 _scale0 = _mm512_set1_ps(scale0[c]);
-                    __m512 _bias0 = _mm512_set1_ps(bias0[c]);
-                    for (; s < aligned; s += QF)
-                    {
-                        SynetFusedLayerForward9<align, false>(src0, _scale0, _bias0, dst0, s + 0 * F);
-                        SynetFusedLayerForward9<align, false>(src0, _scale0, _bias0, dst0, s + 1 * F);
-                        SynetFusedLayerForward9<align, false>(src0, _scale0, _bias0, dst0, s + 2 * F);
-                        SynetFusedLayerForward9<align, false>(src0, _scale0, _bias0, dst0, s + 3 * F);
-                    }
-                    for (; s < partial; s += F)
-                        SynetFusedLayerForward9<align, false>(src0, _scale0, _bias0, dst0, s);
-                    if (s < spatial)
-                        SynetFusedLayerForward9<align, true>(src0, _scale0, _bias0, dst0, s, tail);
-                    src0 += spatial;
-                    dst0 += spatial;
-                }
-                for (size_t c = 0; c < channels1; ++c)
-                {
-                    size_t s = 0;
-                    __m512 _scale1 = _mm512_set1_ps(scale1[c]);
-                    __m512 _bias1 = _mm512_set1_ps(bias1[c]);
-                    for (; s < aligned; s += QF)
-                    {
-                        SynetFusedLayerForward9<align, false>(src1, _scale1, _bias1, dst0, s + 0 * F);
-                        SynetFusedLayerForward9<align, false>(src1, _scale1, _bias1, dst0, s + 1 * F);
-                        SynetFusedLayerForward9<align, false>(src1, _scale1, _bias1, dst0, s + 2 * F);
-                        SynetFusedLayerForward9<align, false>(src1, _scale1, _bias1, dst0, s + 3 * F);
-                    }
-                    for (; s < partial; s += F)
-                        SynetFusedLayerForward9<align, false>(src1, _scale1, _bias1, dst0, s);
-                    if (s < spatial)
-                        SynetFusedLayerForward9<align, true>(src1, _scale1, _bias1, dst0, s, tail);
-                    src1 += spatial;
-                    dst0 += spatial;
-                }
-            }
-        }
-
-        SIMD_INLINE void SynetFusedLayerForward9Nchw(const float * src0, const float * src1, const float * scale, const float * bias, size_t channels0, size_t channels1, size_t spatial, float * dst0, float * dst1)
-        {
-            if (Aligned(src0) && Aligned(src1) && Aligned(spatial, F) && Aligned(dst0) && Aligned(dst1))
-                SynetFusedLayerForward9Nchw<true>(src0, src1, scale, bias, channels0, channels1, spatial, dst0, dst1);
-            else
-                SynetFusedLayerForward9Nchw<false>(src0, src1, scale, bias, channels0, channels1, spatial, dst0, dst1);
-        }
-
-        template <bool align> void SynetFusedLayerForward9Nhwc(const float * src0, const float * src1, const float * scale0, const float * bias0, size_t channels0, size_t channels1, size_t spatial, float * dst0, float * dst1)
-        {
-            if (align)
-                assert(Aligned(src0) && Aligned(src1) && Aligned(scale0) && Aligned(bias0) && Aligned(channels0, F) && Aligned(channels1, F) && Aligned(dst0) && Aligned(dst1));
-            const float * scale1 = scale0 + channels0;
-            const float * bias1 = bias0 + channels0;
-            size_t aligned0 = AlignLo(channels0, QF);
-            size_t partial0 = AlignLo(channels0, F);
-            __mmask16 tail0 = TailMask16(channels0 - partial0);
-            size_t aligned1 = AlignLo(channels1, QF);
-            size_t partial1 = AlignLo(channels1, F);
-            __mmask16 tail1 = TailMask16(channels1 - partial1);
-            if (dst1)
-            {
-                for (size_t s = 0; s < spatial; ++s)
-                {
-                    size_t c = 0;
-                    for (; c < aligned0; c += QF)
-                    {
-                        SynetFusedLayerForward9<align, false>(src0, scale0, bias0, dst0, dst1, c + 0 * F);
-                        SynetFusedLayerForward9<align, false>(src0, scale0, bias0, dst0, dst1, c + 1 * F);
-                        SynetFusedLayerForward9<align, false>(src0, scale0, bias0, dst0, dst1, c + 2 * F);
-                        SynetFusedLayerForward9<align, false>(src0, scale0, bias0, dst0, dst1, c + 3 * F);
-                    }
-                    for (; c < partial0; c += F)
-                        SynetFusedLayerForward9<align, false>(src0, scale0, bias0, dst0, dst1, c);
-                    if (c < channels0)
-                        SynetFusedLayerForward9<align, true>(src0, scale0, bias0, dst0, dst1, c, tail0);
-                    src0 += channels0;
-                    dst0 += channels0;
-                    dst1 += channels0;
-                    c = 0;
-                    for (; c < aligned1; c += QF)
-                    {
-                        SynetFusedLayerForward9<align, false>(src1, scale1, bias1, dst0, dst1, c + 0 * F);
-                        SynetFusedLayerForward9<align, false>(src1, scale1, bias1, dst0, dst1, c + 1 * F);
-                        SynetFusedLayerForward9<align, false>(src1, scale1, bias1, dst0, dst1, c + 2 * F);
-                        SynetFusedLayerForward9<align, false>(src1, scale1, bias1, dst0, dst1, c + 3 * F);
-                    }
-                    for (; c < partial1; c += F)
-                        SynetFusedLayerForward9<align, false>(src1, scale1, bias1, dst0, dst1, c);
-                    if (c < channels1)
-                        SynetFusedLayerForward9<align, true>(src1, scale1, bias1, dst0, dst1, c, tail1);
-                    src1 += channels1;
-                    dst0 += channels1;
-                    dst1 += channels1;
-                }
-            }
-            else
-            {
-                for (size_t s = 0; s < spatial; ++s)
-                {
-                    size_t c = 0;
-                    for (; c < aligned0; c += QF)
-                    {
-                        SynetFusedLayerForward9<align, false>(src0, scale0, bias0, dst0, c + 0 * F);
-                        SynetFusedLayerForward9<align, false>(src0, scale0, bias0, dst0, c + 1 * F);
-                        SynetFusedLayerForward9<align, false>(src0, scale0, bias0, dst0, c + 2 * F);
-                        SynetFusedLayerForward9<align, false>(src0, scale0, bias0, dst0, c + 3 * F);
-                    }
-                    for (; c < partial0; c += F)
-                        SynetFusedLayerForward9<align, false>(src0, scale0, bias0, dst0, c);
-                    if (c < channels0)
-                        SynetFusedLayerForward9<align, true>(src0, scale0, bias0, dst0, c, tail0);
-                    src0 += channels0;
-                    dst0 += channels0;
-                    c = 0;
-                    for (; c < aligned1; c += QF)
-                    {
-                        SynetFusedLayerForward9<align, false>(src1, scale1, bias1, dst0, c + 0 * F);
-                        SynetFusedLayerForward9<align, false>(src1, scale1, bias1, dst0, c + 1 * F);
-                        SynetFusedLayerForward9<align, false>(src1, scale1, bias1, dst0, c + 2 * F);
-                        SynetFusedLayerForward9<align, false>(src1, scale1, bias1, dst0, c + 3 * F);
-                    }
-                    for (; c < partial1; c += F)
-                        SynetFusedLayerForward9<align, false>(src1, scale1, bias1, dst0, c);
-                    if (c < channels1)
-                        SynetFusedLayerForward9<align, true>(src1, scale1, bias1, dst0, c, tail1);
-                    src1 += channels1;
-                    dst0 += channels1;
-                }
-            }
-        }
-
-        SIMD_INLINE void SynetFusedLayerForward9Nhwc(const float * src0, const float * src1, const float * scale, const float * bias, size_t channels0, size_t channels1, size_t spatial, float * dst0, float * dst1)
-        {
-            if (Aligned(src0) && Aligned(src1) && Aligned(scale) && Aligned(bias) && Aligned(channels0, F) && Aligned(channels1, F) && Aligned(dst0) && Aligned(dst1))
-                SynetFusedLayerForward9Nhwc<true>(src0, src1, scale, bias, channels0, channels1, spatial, dst0, dst1);
-            else
-                SynetFusedLayerForward9Nhwc<false>(src0, src1, scale, bias, channels0, channels1, spatial, dst0, dst1);
-        }
-
-        template <bool align> void SynetFusedLayerForward9Nchw16cA(const float * src0, const float * src1, const float * scale0, const float * bias0, size_t channels0, size_t channels1, size_t spatial, float * dst0, float * dst1)
-        {
-            if (align)
-                assert(Aligned(src0) && Aligned(src1) && Aligned(dst0) && Aligned(dst1));
-            const float * scale1 = scale0 + channels0;
-            const float * bias1 = bias0 + channels0;
-            size_t spatialF = spatial * F;
-            size_t spatial4F = AlignLo(spatial, 4) * F;
-            if (dst1)
-            {
-                for (size_t c = 0; c < channels0; c += F)
-                {
-                    __m512 _scale0 = Load<false>(scale0 + c);
-                    __m512 _bias0 = Load<false>(bias0 + c);
-                    size_t s = 0;
-                    for (; s < spatial4F; s += 4 * F)
-                    {
-                        SynetFusedLayerForward9<align, false>(src0, _scale0, _bias0, dst0, dst1, s + F * 0);
-                        SynetFusedLayerForward9<align, false>(src0, _scale0, _bias0, dst0, dst1, s + F * 1);
-                        SynetFusedLayerForward9<align, false>(src0, _scale0, _bias0, dst0, dst1, s + F * 2);
-                        SynetFusedLayerForward9<align, false>(src0, _scale0, _bias0, dst0, dst1, s + F * 3);
-                    }
-                    for (; s < spatialF; s += F)
-                        SynetFusedLayerForward9<align, false>(src0, _scale0, _bias0, dst0, dst1, s);
-                    src0 += spatialF;
-                    dst0 += spatialF;
-                    dst1 += spatialF;
-                }
-                for (size_t c = 0; c < channels1; c += F)
-                {
-                    __m512 _scale1 = Load<false>(scale1 + c);
-                    __m512 _bias1 = Load<false>(bias1 + c);
-                    size_t s = 0;
-                    for (; s < spatial4F; s += 4 * F)
-                    {
-                        SynetFusedLayerForward9<align, false>(src1, _scale1, _bias1, dst0, dst1, s + F * 0);
-                        SynetFusedLayerForward9<align, false>(src1, _scale1, _bias1, dst0, dst1, s + F * 1);
-                        SynetFusedLayerForward9<align, false>(src1, _scale1, _bias1, dst0, dst1, s + F * 2);
-                        SynetFusedLayerForward9<align, false>(src1, _scale1, _bias1, dst0, dst1, s + F * 3);
-                    }
-                    for (; s < spatialF; s += F)
-                        SynetFusedLayerForward9<align, false>(src1, _scale1, _bias1, dst0, dst1, s);
-                    src1 += spatialF;
-                    dst0 += spatialF;
-                    dst1 += spatialF;
-                }
-            }
-            else
-            {
-                for (size_t c = 0; c < channels0; c += F)
-                {
-                    __m512 _scale0 = Load<false>(scale0 + c);
-                    __m512 _bias0 = Load<false>(bias0 + c);
-                    size_t s = 0;
-                    for (; s < spatial4F; s += 4 * F)
-                    {
-                        SynetFusedLayerForward9<align, false>(src0, _scale0, _bias0, dst0, s + F * 0);
-                        SynetFusedLayerForward9<align, false>(src0, _scale0, _bias0, dst0, s + F * 1);
-                        SynetFusedLayerForward9<align, false>(src0, _scale0, _bias0, dst0, s + F * 2);
-                        SynetFusedLayerForward9<align, false>(src0, _scale0, _bias0, dst0, s + F * 3);
-                    }
-                    for (; s < spatialF; s += F)
-                        SynetFusedLayerForward9<align, false>(src0, _scale0, _bias0, dst0, s);
-                    src0 += spatialF;
-                    dst0 += spatialF;
-                }
-                for (size_t c = 0; c < channels1; c += F)
-                {
-                    __m512 _scale1 = Load<false>(scale1 + c);
-                    __m512 _bias1 = Load<false>(bias1 + c);
-                    size_t s = 0;
-                    for (; s < spatial4F; s += 4 * F)
-                    {
-                        SynetFusedLayerForward9<align, false>(src1, _scale1, _bias1, dst0, s + F * 0);
-                        SynetFusedLayerForward9<align, false>(src1, _scale1, _bias1, dst0, s + F * 1);
-                        SynetFusedLayerForward9<align, false>(src1, _scale1, _bias1, dst0, s + F * 2);
-                        SynetFusedLayerForward9<align, false>(src1, _scale1, _bias1, dst0, s + F * 3);
-                    }
-                    for (; s < spatialF; s += F)
-                        SynetFusedLayerForward9<align, false>(src1, _scale1, _bias1, dst0, s);
-                    src1 += spatialF;
-                    dst0 += spatialF;
-                }
-            }
-        }
-
-        SIMD_INLINE void SynetFusedLayerForward9Nchw16cA(const float * src0, const float * src1, const float * scale, const float * bias, size_t channels0, size_t channels1, size_t spatial, float * dst0, float * dst1)
-        {
-            assert(Aligned(channels0, F));
-            if (Aligned(src0) && Aligned(src1) && Aligned(dst0) && Aligned(dst1))
-                SynetFusedLayerForward9Nchw16cA<true>(src0, src1, scale, bias, channels0, channels1, spatial, dst0, dst1);
-            else
-                SynetFusedLayerForward9Nchw16cA<false>(src0, src1, scale, bias, channels0, channels1, spatial, dst0, dst1);
-        }
-
-        void SynetFusedLayerForward9(const float * src0, const float * src1, const float * scale, const float * bias, size_t channels0, size_t channels1, size_t spatial, float * dst0, float * dst1, SimdTensorFormatType format)
-        {
-            if (Base::NchwCompatible(channels0 + channels1, spatial, format))
-                SynetFusedLayerForward9Nchw(src0, src1, scale, bias, channels0, channels1, spatial, dst0, dst1);
-            else if (Base::NhwcCompatible(channels0 + channels1, spatial, format))
-                SynetFusedLayerForward9Nhwc(src0, src1, scale, bias, channels0, channels1, spatial, dst0, dst1);
-            else if (format == SimdTensorFormatNchw4c)
-                Sse::SynetFusedLayerForward9(src0, src1, scale, bias, channels0, channels1, spatial, dst0, dst1, format);
-            else if (format == SimdTensorFormatNchw8c)
-                Avx::SynetFusedLayerForward9(src0, src1, scale, bias, channels0, channels1, spatial, dst0, dst1, format);
-            else if (format == SimdTensorFormatNchw16c && Aligned(channels0, F))
-                SynetFusedLayerForward9Nchw16cA(src0, src1, scale, bias, channels0, channels1, spatial, dst0, dst1);
-            else
-                Base::SynetFusedLayerForward9(src0, src1, scale, bias, channels0, channels1, spatial, dst0, dst1, format);
-        }
-    }
-#endif// SIMD_AVX512F_ENABLE
-}
diff --git a/src/3rd/Simd/Simd/SimdAvx512fSynetMergedConvolution32f.cpp b/src/3rd/Simd/Simd/SimdAvx512fSynetMergedConvolution32f.cpp
deleted file mode 100644
index 3e1d6770..00000000
--- a/src/3rd/Simd/Simd/SimdAvx512fSynetMergedConvolution32f.cpp
+++ /dev/null
@@ -1,1512 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2020 Yermalayeu Ihar.
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdSynetMergedConvolution32f.h" -#include "Simd/SimdSynetConvolution32fCommon.h" -#include "Simd/SimdUpdate.h" -#include "Simd/SimdCpu.h" - -namespace Simd -{ -#if defined(SIMD_AVX512F_ENABLE) - namespace Avx512f - { - template SIMD_INLINE void InputConvolution1x1_2x12(const float* src0, size_t srcC, - const float* weight, const __m512* bias, const __m512* params, float* dst0, float* dst1) - { - __m512 d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, d60, d61, d70, d71, d80, d81, d90, d91, da0, da1, db0, db1, s0, w0, w1; - d00 = bias[0], d01 = bias[1]; - d10 = bias[0], d11 = bias[1]; - d20 = bias[0], d21 = bias[1]; - d30 = bias[0], d31 = bias[1]; - d40 = bias[0], d41 = bias[1]; - d50 = bias[0], d51 = bias[1]; - d60 = bias[0], d61 = bias[1]; - d70 = bias[0], d71 = bias[1]; - d80 = bias[0], d81 = bias[1]; - d90 = bias[0], d91 = bias[1]; - da0 = bias[0], da1 = bias[1]; - db0 = bias[0], db1 = bias[1]; - const float* src1 = src0 + 1 * srcC; - const float* src2 = src0 + 2 * srcC; - const float* src3 = src0 + 3 * srcC; - const float* src4 = src0 + 4 * srcC; - const float* src5 = src0 + 5 * srcC; - for (size_t sc0 = 0, sc6 = 6 * srcC; sc0 < srcC; ++sc0, ++sc6) - { - w0 = _mm512_loadu_ps(weight + 0); - w1 = _mm512_loadu_ps(weight + F); - s0 = _mm512_set1_ps(src0[sc0]); - d00 = _mm512_fmadd_ps(s0, w0, d00); - d01 = _mm512_fmadd_ps(s0, w1, d01); - s0 = _mm512_set1_ps(src1[sc0]); - d10 = _mm512_fmadd_ps(s0, w0, d10); - d11 = _mm512_fmadd_ps(s0, w1, d11); - s0 = _mm512_set1_ps(src2[sc0]); - d20 = _mm512_fmadd_ps(s0, w0, d20); - d21 = _mm512_fmadd_ps(s0, w1, d21); - s0 = _mm512_set1_ps(src3[sc0]); - d30 = _mm512_fmadd_ps(s0, w0, d30); - d31 = _mm512_fmadd_ps(s0, w1, d31); - s0 = _mm512_set1_ps(src4[sc0]); - d40 = _mm512_fmadd_ps(s0, w0, d40); - d41 = _mm512_fmadd_ps(s0, w1, d41); - s0 = _mm512_set1_ps(src5[sc0]); - d50 = _mm512_fmadd_ps(s0, w0, d50); - d51 = _mm512_fmadd_ps(s0, w1, d51); - s0 = _mm512_set1_ps(src0[sc6]); - d60 = _mm512_fmadd_ps(s0, w0, d60); - d61 = _mm512_fmadd_ps(s0, w1, d61); - s0 = _mm512_set1_ps(src1[sc6]); - d70 = _mm512_fmadd_ps(s0, w0, d70); - d71 = _mm512_fmadd_ps(s0, w1, d71); - s0 = _mm512_set1_ps(src2[sc6]); - d80 = _mm512_fmadd_ps(s0, w0, d80); - d81 = _mm512_fmadd_ps(s0, w1, d81); - s0 = _mm512_set1_ps(src3[sc6]); - d90 = _mm512_fmadd_ps(s0, w0, d90); - d91 = _mm512_fmadd_ps(s0, w1, d91); - s0 = _mm512_set1_ps(src4[sc6]); - da0 = _mm512_fmadd_ps(s0, w0, da0); - da1 = _mm512_fmadd_ps(s0, 
w1, da1); - s0 = _mm512_set1_ps(src5[sc6]); - db0 = _mm512_fmadd_ps(s0, w0, db0); - db1 = _mm512_fmadd_ps(s0, w1, db1); - weight += DF; - } - _mm512_storeu_ps(dst0 + 0 * F, Activate(d00, params, 0)); - _mm512_storeu_ps(dst1 + 0 * F, Activate(d01, params, 1)); - _mm512_storeu_ps(dst0 + 1 * F, Activate(d10, params, 0)); - _mm512_storeu_ps(dst1 + 1 * F, Activate(d11, params, 1)); - _mm512_storeu_ps(dst0 + 2 * F, Activate(d20, params, 0)); - _mm512_storeu_ps(dst1 + 2 * F, Activate(d21, params, 1)); - _mm512_storeu_ps(dst0 + 3 * F, Activate(d30, params, 0)); - _mm512_storeu_ps(dst1 + 3 * F, Activate(d31, params, 1)); - _mm512_storeu_ps(dst0 + 4 * F, Activate(d40, params, 0)); - _mm512_storeu_ps(dst1 + 4 * F, Activate(d41, params, 1)); - _mm512_storeu_ps(dst0 + 5 * F, Activate(d50, params, 0)); - _mm512_storeu_ps(dst1 + 5 * F, Activate(d51, params, 1)); - _mm512_storeu_ps(dst0 + 6 * F, Activate(d60, params, 0)); - _mm512_storeu_ps(dst1 + 6 * F, Activate(d61, params, 1)); - _mm512_storeu_ps(dst0 + 7 * F, Activate(d70, params, 0)); - _mm512_storeu_ps(dst1 + 7 * F, Activate(d71, params, 1)); - _mm512_storeu_ps(dst0 + 8 * F, Activate(d80, params, 0)); - _mm512_storeu_ps(dst1 + 8 * F, Activate(d81, params, 1)); - _mm512_storeu_ps(dst0 + 9 * F, Activate(d90, params, 0)); - _mm512_storeu_ps(dst1 + 9 * F, Activate(d91, params, 1)); - _mm512_storeu_ps(dst0 + 10 * F, Activate(da0, params, 0)); - _mm512_storeu_ps(dst1 + 10 * F, Activate(da1, params, 1)); - _mm512_storeu_ps(dst0 + 11 * F, Activate(db0, params, 0)); - _mm512_storeu_ps(dst1 + 11 * F, Activate(db1, params, 1)); - } - - template SIMD_INLINE void InputConvolution1x1_2x6(const float * src0, size_t srcC, - const float * weight, const __m512 * bias, const __m512 * params, float * dst0, float * dst1) - { - __m512 d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, s0, w0, w1; - d00 = bias[0], d01 = bias[1]; - d10 = bias[0], d11 = bias[1]; - d20 = bias[0], d21 = bias[1]; - d30 = bias[0], d31 = bias[1]; - d40 = bias[0], d41 = bias[1]; - d50 = bias[0], d51 = bias[1]; - const float * src1 = src0 + 1 * srcC; - const float * src2 = src0 + 2 * srcC; - const float * src3 = src0 + 3 * srcC; - const float * src4 = src0 + 4 * srcC; - const float * src5 = src0 + 5 * srcC; - for (size_t sc = 0; sc < srcC; ++sc) - { - w0 = _mm512_loadu_ps(weight + 0); - w1 = _mm512_loadu_ps(weight + F); - s0 = _mm512_set1_ps(src0[sc]); - d00 = _mm512_fmadd_ps(s0, w0, d00); - d01 = _mm512_fmadd_ps(s0, w1, d01); - s0 = _mm512_set1_ps(src1[sc]); - d10 = _mm512_fmadd_ps(s0, w0, d10); - d11 = _mm512_fmadd_ps(s0, w1, d11); - s0 = _mm512_set1_ps(src2[sc]); - d20 = _mm512_fmadd_ps(s0, w0, d20); - d21 = _mm512_fmadd_ps(s0, w1, d21); - s0 = _mm512_set1_ps(src3[sc]); - d30 = _mm512_fmadd_ps(s0, w0, d30); - d31 = _mm512_fmadd_ps(s0, w1, d31); - s0 = _mm512_set1_ps(src4[sc]); - d40 = _mm512_fmadd_ps(s0, w0, d40); - d41 = _mm512_fmadd_ps(s0, w1, d41); - s0 = _mm512_set1_ps(src5[sc]); - d50 = _mm512_fmadd_ps(s0, w0, d50); - d51 = _mm512_fmadd_ps(s0, w1, d51); - weight += DF; - } - _mm512_storeu_ps(dst0 + 0 * F, Activate(d00, params, 0)); - _mm512_storeu_ps(dst1 + 0 * F, Activate(d01, params, 1)); - _mm512_storeu_ps(dst0 + 1 * F, Activate(d10, params, 0)); - _mm512_storeu_ps(dst1 + 1 * F, Activate(d11, params, 1)); - _mm512_storeu_ps(dst0 + 2 * F, Activate(d20, params, 0)); - _mm512_storeu_ps(dst1 + 2 * F, Activate(d21, params, 1)); - _mm512_storeu_ps(dst0 + 3 * F, Activate(d30, params, 0)); - _mm512_storeu_ps(dst1 + 3 * F, Activate(d31, params, 1)); - _mm512_storeu_ps(dst0 + 4 * F, 
Activate(d40, params, 0)); - _mm512_storeu_ps(dst1 + 4 * F, Activate(d41, params, 1)); - _mm512_storeu_ps(dst0 + 5 * F, Activate(d50, params, 0)); - _mm512_storeu_ps(dst1 + 5 * F, Activate(d51, params, 1)); - } - - template SIMD_INLINE void InputConvolution1x1_2xM(const float * src0, size_t srcC, - const float * weight, const __m512 * bias, const __m512 * params, float * dst0, float * dst1) - { - __m512 d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, s0, w0, w1; - if (M > 0) d00 = bias[0], d01 = bias[1]; - if (M > 1) d10 = bias[0], d11 = bias[1]; - if (M > 2) d20 = bias[0], d21 = bias[1]; - if (M > 3) d30 = bias[0], d31 = bias[1]; - if (M > 4) d40 = bias[0], d41 = bias[1]; - if (M > 5) d50 = bias[0], d51 = bias[1]; - const float * src1 = src0 + 1 * srcC; - const float * src2 = src0 + 2 * srcC; - const float * src3 = src0 + 3 * srcC; - const float * src4 = src0 + 4 * srcC; - const float * src5 = src0 + 5 * srcC; - for (size_t sc = 0; sc < srcC; ++sc) - { - w0 = _mm512_loadu_ps(weight + 0); - w1 = _mm512_loadu_ps(weight + F); - if (M > 0) s0 = _mm512_set1_ps(src0[sc]), d00 = _mm512_fmadd_ps(s0, w0, d00), d01 = _mm512_fmadd_ps(s0, w1, d01); - if (M > 1) s0 = _mm512_set1_ps(src1[sc]), d10 = _mm512_fmadd_ps(s0, w0, d10), d11 = _mm512_fmadd_ps(s0, w1, d11); - if (M > 2) s0 = _mm512_set1_ps(src2[sc]), d20 = _mm512_fmadd_ps(s0, w0, d20), d21 = _mm512_fmadd_ps(s0, w1, d21); - if (M > 3) s0 = _mm512_set1_ps(src3[sc]), d30 = _mm512_fmadd_ps(s0, w0, d30), d31 = _mm512_fmadd_ps(s0, w1, d31); - if (M > 4) s0 = _mm512_set1_ps(src4[sc]), d40 = _mm512_fmadd_ps(s0, w0, d40), d41 = _mm512_fmadd_ps(s0, w1, d41); - if (M > 5) s0 = _mm512_set1_ps(src5[sc]), d50 = _mm512_fmadd_ps(s0, w0, d50), d51 = _mm512_fmadd_ps(s0, w1, d51); - weight += DF; - } - if (M > 0) _mm512_storeu_ps(dst0 + 0 * F, Activate(d00, params, 0)), _mm512_storeu_ps(dst1 + 0 * F, Activate(d01, params, 1)); - if (M > 1) _mm512_storeu_ps(dst0 + 1 * F, Activate(d10, params, 0)), _mm512_storeu_ps(dst1 + 1 * F, Activate(d11, params, 1)); - if (M > 2) _mm512_storeu_ps(dst0 + 2 * F, Activate(d20, params, 0)), _mm512_storeu_ps(dst1 + 2 * F, Activate(d21, params, 1)); - if (M > 3) _mm512_storeu_ps(dst0 + 3 * F, Activate(d30, params, 0)), _mm512_storeu_ps(dst1 + 3 * F, Activate(d31, params, 1)); - if (M > 4) _mm512_storeu_ps(dst0 + 4 * F, Activate(d40, params, 0)), _mm512_storeu_ps(dst1 + 4 * F, Activate(d41, params, 1)); - if (M > 5) _mm512_storeu_ps(dst0 + 5 * F, Activate(d50, params, 0)), _mm512_storeu_ps(dst1 + 5 * F, Activate(d51, params, 1)); - } - - typedef void(*InputConvolution1x1_2xM_Ptr)(const float * src0, size_t srcC, const float * weight, const __m512 * bias, const __m512 * params, float * dst0, float * dst1); - - template InputConvolution1x1_2xM_Ptr GetInputConvolution1x1_2xM(size_t M) - { - switch (M) - { - case 0: return InputConvolution1x1_2xM; - case 1: return InputConvolution1x1_2xM; - case 2: return InputConvolution1x1_2xM; - case 3: return InputConvolution1x1_2xM; - case 4: return InputConvolution1x1_2xM; - case 5: return InputConvolution1x1_2xM; - } - assert(0); - return NULL; - } - - template SIMD_INLINE void InputConvolution1x1_1x6(const float * src0, size_t srcC, - const float * weight, const __m512 * bias, const __m512 * params, float * dst0) - { - __m512 d00, d10, d20, d30, d40, d50, s0, w0; - d00 = bias[0]; - d10 = bias[0]; - d20 = bias[0]; - d30 = bias[0]; - d40 = bias[0]; - d50 = bias[0]; - const float * src1 = src0 + 1 * srcC; - const float * src2 = src0 + 2 * srcC; - const float * src3 = src0 + 3 * srcC; - 
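// Sketch: the deleted 1x1 kernels around here all share one inner pattern --
// broadcast a single input scalar, then FMA it against a 16-wide vector of
// weights, keeping several output rows resident in registers. A reduced,
// self-contained sketch of that pattern (names are illustrative, not the
// library's API):
#include <immintrin.h>
#include <cstddef>

// dst[0..15] += sum over c of src[c] * weight[c*16 + 0..15]
inline void Conv1x1MicroKernel(const float* src, const float* weight, size_t srcC, float* dst)
{
    __m512 acc = _mm512_loadu_ps(dst);
    for (size_t c = 0; c < srcC; ++c)
    {
        __m512 w = _mm512_loadu_ps(weight + c * 16); // weights for 16 output channels
        __m512 s = _mm512_set1_ps(src[c]);           // broadcast one input value
        acc = _mm512_fmadd_ps(s, w, acc);            // acc += s * w
    }
    _mm512_storeu_ps(dst, acc);
}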
const float * src4 = src0 + 4 * srcC; - const float * src5 = src0 + 5 * srcC; - for (size_t sc = 0; sc < srcC; ++sc) - { - w0 = _mm512_loadu_ps(weight + 0); - s0 = _mm512_set1_ps(src0[sc]); - d00 = _mm512_fmadd_ps(s0, w0, d00); - s0 = _mm512_set1_ps(src1[sc]); - d10 = _mm512_fmadd_ps(s0, w0, d10); - s0 = _mm512_set1_ps(src2[sc]); - d20 = _mm512_fmadd_ps(s0, w0, d20); - s0 = _mm512_set1_ps(src3[sc]); - d30 = _mm512_fmadd_ps(s0, w0, d30); - s0 = _mm512_set1_ps(src4[sc]); - d40 = _mm512_fmadd_ps(s0, w0, d40); - s0 = _mm512_set1_ps(src5[sc]); - d50 = _mm512_fmadd_ps(s0, w0, d50); - weight += DF; - } - _mm512_storeu_ps(dst0 + 0 * F, Activate(d00, params, 0)); - _mm512_storeu_ps(dst0 + 1 * F, Activate(d10, params, 0)); - _mm512_storeu_ps(dst0 + 2 * F, Activate(d20, params, 0)); - _mm512_storeu_ps(dst0 + 3 * F, Activate(d30, params, 0)); - _mm512_storeu_ps(dst0 + 4 * F, Activate(d40, params, 0)); - _mm512_storeu_ps(dst0 + 5 * F, Activate(d50, params, 0)); - } - - template SIMD_INLINE void InputConvolution1x1_1xM(const float * src0, size_t srcC, - const float * weight, const __m512 * bias, const __m512 * params, float * dst0) - { - __m512 d00, d10, d20, d30, d40, d50, s0, w0; - if (M > 0) d00 = bias[0]; - if (M > 1) d10 = bias[0]; - if (M > 2) d20 = bias[0]; - if (M > 3) d30 = bias[0]; - if (M > 4) d40 = bias[0]; - if (M > 5) d50 = bias[0]; - const float * src1 = src0 + 1 * srcC; - const float * src2 = src0 + 2 * srcC; - const float * src3 = src0 + 3 * srcC; - const float * src4 = src0 + 4 * srcC; - const float * src5 = src0 + 5 * srcC; - for (size_t sc = 0; sc < srcC; ++sc) - { - w0 = _mm512_loadu_ps(weight + 0); - if (M > 0) s0 = _mm512_set1_ps(src0[sc]), d00 = _mm512_fmadd_ps(s0, w0, d00); - if (M > 1) s0 = _mm512_set1_ps(src1[sc]), d10 = _mm512_fmadd_ps(s0, w0, d10); - if (M > 2) s0 = _mm512_set1_ps(src2[sc]), d20 = _mm512_fmadd_ps(s0, w0, d20); - if (M > 3) s0 = _mm512_set1_ps(src3[sc]), d30 = _mm512_fmadd_ps(s0, w0, d30); - if (M > 4) s0 = _mm512_set1_ps(src4[sc]), d40 = _mm512_fmadd_ps(s0, w0, d40); - if (M > 5) s0 = _mm512_set1_ps(src5[sc]), d50 = _mm512_fmadd_ps(s0, w0, d50); - weight += DF; - } - if (M > 0) _mm512_storeu_ps(dst0 + 0 * F, Activate(d00, params, 0)); - if (M > 1) _mm512_storeu_ps(dst0 + 1 * F, Activate(d10, params, 0)); - if (M > 2) _mm512_storeu_ps(dst0 + 2 * F, Activate(d20, params, 0)); - if (M > 3) _mm512_storeu_ps(dst0 + 3 * F, Activate(d30, params, 0)); - if (M > 4) _mm512_storeu_ps(dst0 + 4 * F, Activate(d40, params, 0)); - if (M > 5) _mm512_storeu_ps(dst0 + 5 * F, Activate(d50, params, 0)); - } - - typedef void(*InputConvolution1x1_1xM_Ptr)(const float * src0, size_t srcC, const float * weight, const __m512 * bias, const __m512 * params, float * dst0); - - template InputConvolution1x1_1xM_Ptr GetInputConvolution1x1_1xM(size_t M) - { - switch (M) - { - case 0: return InputConvolution1x1_1xM; - case 1: return InputConvolution1x1_1xM; - case 2: return InputConvolution1x1_1xM; - case 3: return InputConvolution1x1_1xM; - case 4: return InputConvolution1x1_1xM; - case 5: return InputConvolution1x1_1xM; - } - assert(0); - return NULL; - } - - template void InputConvolution1x1(const float * src, const SimdConvolutionParameters & p, - size_t dstC, size_t yBeg, size_t yEnd, const size_t bufH[2], const float * weight, const float * bias, const float * params, float * dst) - { - size_t srcH = p.srcH, srcW = p.srcW, srcC = p.srcC, dstW = p.dstW; - size_t dstM = (bufH[0] - 1), dstS = bufH[0] * dstW *F; - size_t dstCDF = AlignLo(dstC, DF); - __m512 _params[2], _bias[2]; - 
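// Sketch: GetInputConvolution1x1_1xM/2xM above map a runtime tail length M
// onto a kernel where M is a compile-time template argument, so every
// "if (M > k)" guard is resolved by the compiler and unused rows vanish.
// A minimal illustration of that dispatch idiom (hypothetical names):
#include <cassert>
#include <cstddef>

template<int M> void TailRows(float* dst)
{
    if (M > 0) dst[0] += 1.0f; // compiled out entirely when M == 0
    if (M > 1) dst[1] += 1.0f;
    if (M > 2) dst[2] += 1.0f;
}

typedef void (*TailRowsPtr)(float*);

inline TailRowsPtr GetTailRows(size_t m) // m must be < 3 in this sketch
{
    switch (m)
    {
    case 0: return TailRows<0>;
    case 1: return TailRows<1>;
    case 2: return TailRows<2>;
    }
    assert(0);
    return nullptr;
}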
_params[0] = _mm512_set1_ps(params[0]); - if (type == ::SimdConvolutionActivationRestrictRange || type == ::SimdConvolutionActivationHswish) - _params[1] = _mm512_set1_ps(params[1]); -#ifdef SIMD_MERGECONV_MERGE_INPUT_ROWS_1X1 - size_t yInt = Simd::Max(yBeg, yEnd&(~dstM)), nBeg = yBeg * dstW, nInt = yInt * dstW, nEnd = yEnd * dstW; - size_t nInt6 = AlignLoAny(nInt - nBeg, 6) + nBeg, nEnd6 = AlignLoAny(nEnd - nInt, 6) + nInt, nIntTail = nInt - nInt6, nEndTail = nEnd - nEnd6; - size_t nInt12 = AlignLoAny(nInt - nBeg, 12) + nBeg, nEnd12 = AlignLoAny(nEnd - nInt, 12) + nInt; - InputConvolution1x1_2xM_Ptr tailInt_2 = GetInputConvolution1x1_2xM(nIntTail); - InputConvolution1x1_2xM_Ptr tailEnd_2 = GetInputConvolution1x1_2xM(nEndTail); -#else - size_t dstW6 = AlignLoAny(dstW, 6), wTail = dstW - dstW6; - InputConvolution1x1_2xM_Ptr tailW_2 = GetInputConvolution1x1_2xM(wTail); - InputConvolution1x1_1xM_Ptr tailW_1 = GetInputConvolution1x1_1xM(wTail); -#endif - - size_t dc = 0; - for (; dc < dstC; dc += DF) - { - _bias[0] = bias ? _mm512_loadu_ps(bias + dc + 0) : _mm512_setzero_ps(); - _bias[1] = bias ? _mm512_loadu_ps(bias + dc + F) : _mm512_setzero_ps(); - if (type == ::SimdConvolutionActivationPrelu) - { - _params[0] = _mm512_loadu_ps(params + dc + 0); - _params[1] = _mm512_loadu_ps(params + dc + F); - } - const float * pS = src + yBeg * srcW*srcC; - const float * pW = weight + dc * srcC; - float * pD = dst + (dc / F)*dstS; -#ifdef SIMD_MERGECONV_MERGE_INPUT_ROWS_1X1 - float * dst0 = pD + (yBeg&dstM)*dstW*F; - float * dst1 = pD + (yInt&dstM)*dstW*F; - size_t dn = nBeg; - if (dstC - dc > F) - { - for (; dn < nInt12; dn += 12, pS += 12 * srcC, dst0 += 12 * F) - InputConvolution1x1_2x12(pS, srcC, pW, _bias, _params, dst0, dst0 + dstS); - for (; dn < nInt6; dn += 6, pS += 6 * srcC, dst0 += 6 * F) - InputConvolution1x1_2x6(pS, srcC, pW, _bias, _params, dst0, dst0 + dstS); - if (nIntTail) - tailInt_2(pS, srcC, pW, _bias, _params, dst0, dst0 + dstS), pS += nIntTail * srcC, dn += nIntTail; - for (; dn < nEnd12; dn += 12, pS += 12 * srcC, dst1 += 12 * F) - InputConvolution1x1_2x12(pS, srcC, pW, _bias, _params, dst1, dst1 + dstS); - for (; dn < nEnd6; dn += 6, pS += 6 * srcC, dst1 += 6 * F) - InputConvolution1x1_2x6(pS, srcC, pW, _bias, _params, dst1, dst1 + dstS); - if (nEndTail) - tailEnd_2(pS, srcC, pW, _bias, _params, dst1, dst1 + dstS), pS += nEndTail * srcC, dn += nEndTail; - } - else - { - InputConvolution1x1_1xM_Ptr tailInt_1 = GetInputConvolution1x1_1xM(nIntTail); - InputConvolution1x1_1xM_Ptr tailEnd_1 = GetInputConvolution1x1_1xM(nEndTail); - for (; dn < nInt6; dn += 6, pS += 6 * srcC, dst0 += 6 * F) - InputConvolution1x1_1x6(pS, srcC, pW, _bias, _params, dst0); - if (nIntTail) - tailInt_1(pS, srcC, pW, _bias, _params, dst0), pS += nIntTail * srcC, dn += nIntTail; - for (; dn < nEnd6; dn += 6, pS += 6 * srcC, dst1 += 6 * F) - InputConvolution1x1_1x6(pS, srcC, pW, _bias, _params, dst1); - if (nEndTail) - tailEnd_1(pS, srcC, pW, _bias, _params, dst1), pS += nEndTail * srcC, dn += nEndTail; - } -#else - for (size_t dy = yBeg; dy < yEnd; ++dy) - { - float * dst0 = pD + (dy&dstM)*dstW*F; - size_t dx = 0; - if (dstC - dc > F) - { - for (; dx < dstW6; dx += 6, pS += 6 * srcC, dst0 += 6 * F) - InputConvolution1x1_2x6(pS, srcC, pW, _bias, _params, dst0, dst0 + dstS); - if (wTail) - tailW_2(pS, srcC, pW, _bias, _params, dst0, dst0 + dstS), pS += wTail * srcC, dx += wTail; - } - else - { - for (; dx < dstW6; dx += 6, pS += 6 * srcC, dst0 += 6 * F) - InputConvolution1x1_1x6(pS, srcC, pW, _bias, _params, 
dst0); - if (wTail) - tailW_1(pS, srcC, pW, _bias, _params, dst0), pS += wTail * srcC, dx += wTail; - } - } -#endif - } - } - - template SIMD_INLINE void InputConvolution_2x1(const float * src0, const SimdConvolutionParameters & p, - size_t kH, size_t kW, const float * weight, const __m512 * bias, const __m512 * params, float * dst0, float * dst1) - { - __m512 d00, d01, s0, w0, w1; - d00 = bias[0]; - d01 = bias[1]; - size_t size = kW * p.srcC, tail = DF * (p.kernelX - kW)*p.srcC, stride = p.srcW * p.srcC; - for (size_t ky = 0; ky < kH; ++ky) - { - for (size_t i = 0; i < size; ++i) - { - w0 = _mm512_loadu_ps(weight + 0); - w1 = _mm512_loadu_ps(weight + F); - s0 = _mm512_set1_ps(src0[i]); - d00 = _mm512_fmadd_ps(s0, w0, d00); - d01 = _mm512_fmadd_ps(s0, w1, d01); - weight += DF; - } - weight += tail; - src0 += stride; - } - _mm512_storeu_ps(dst0, Activate(d00, params, 0)); - _mm512_storeu_ps(dst1, Activate(d01, params, 1)); - } - - template SIMD_INLINE void InputConvolution_1x1(const float * src0, const SimdConvolutionParameters & p, - size_t kH, size_t kW, const float * weight, const __m512 * bias, const __m512 * params, float * dst0) - { - __m512 d00, s0, w0; - d00 = bias[0]; - size_t size = kW * p.srcC, tail = DF * (p.kernelX - kW)*p.srcC, stride = p.srcW * p.srcC; - for (size_t ky = 0; ky < kH; ++ky) - { - for (size_t i = 0; i < size; ++i) - { - w0 = _mm512_loadu_ps(weight + 0); - s0 = _mm512_set1_ps(src0[i]); - d00 = _mm512_fmadd_ps(s0, w0, d00); - weight += DF; - } - weight += tail; - src0 += stride; - } - _mm512_storeu_ps(dst0, Activate(d00, params, 0)); - } - - template SIMD_INLINE void InputConvolution_2x6(const float * src0, const SimdConvolutionParameters & p, - size_t kH, size_t kW, const float * weight, const __m512 * bias, const __m512 * params, float * dst0, float * dst1) - { - __m512 d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, s0, w0, w1; - d00 = bias[0], d01 = bias[1]; - d10 = bias[0], d11 = bias[1]; - d20 = bias[0], d21 = bias[1]; - d30 = bias[0], d31 = bias[1]; - d40 = bias[0], d41 = bias[1]; - d50 = bias[0], d51 = bias[1]; - size_t size = kW * p.srcC, tail = DF * (p.kernelX - kW)*p.srcC, stride = p.srcW * p.srcC, step = p.srcC*p.strideX; - const float * src1 = src0 + 1 * step; - const float * src2 = src0 + 2 * step; - const float * src3 = src0 + 3 * step; - const float * src4 = src0 + 4 * step; - const float * src5 = src0 + 5 * step; - for (size_t ky = 0; ky < kH; ++ky) - { - size_t offset = ky * stride; - for (size_t end = offset + size; offset < end; ++offset) - { - w0 = _mm512_loadu_ps(weight + 0); - w1 = _mm512_loadu_ps(weight + F); - s0 = _mm512_set1_ps(src0[offset]); - d00 = _mm512_fmadd_ps(s0, w0, d00); - d01 = _mm512_fmadd_ps(s0, w1, d01); - s0 = _mm512_set1_ps(src1[offset]); - d10 = _mm512_fmadd_ps(s0, w0, d10); - d11 = _mm512_fmadd_ps(s0, w1, d11); - s0 = _mm512_set1_ps(src2[offset]); - d20 = _mm512_fmadd_ps(s0, w0, d20); - d21 = _mm512_fmadd_ps(s0, w1, d21); - s0 = _mm512_set1_ps(src3[offset]); - d30 = _mm512_fmadd_ps(s0, w0, d30); - d31 = _mm512_fmadd_ps(s0, w1, d31); - s0 = _mm512_set1_ps(src4[offset]); - d40 = _mm512_fmadd_ps(s0, w0, d40); - d41 = _mm512_fmadd_ps(s0, w1, d41); - s0 = _mm512_set1_ps(src5[offset]); - d50 = _mm512_fmadd_ps(s0, w0, d50); - d51 = _mm512_fmadd_ps(s0, w1, d51); - weight += DF; - } - weight += tail; - } - _mm512_storeu_ps(dst0 + 0 * F, Activate(d00, params, 0)); - _mm512_storeu_ps(dst1 + 0 * F, Activate(d01, params, 1)); - _mm512_storeu_ps(dst0 + 1 * F, Activate(d10, params, 0)); - _mm512_storeu_ps(dst1 + 1 * F, 
Activate(d11, params, 1)); - _mm512_storeu_ps(dst0 + 2 * F, Activate(d20, params, 0)); - _mm512_storeu_ps(dst1 + 2 * F, Activate(d21, params, 1)); - _mm512_storeu_ps(dst0 + 3 * F, Activate(d30, params, 0)); - _mm512_storeu_ps(dst1 + 3 * F, Activate(d31, params, 1)); - _mm512_storeu_ps(dst0 + 4 * F, Activate(d40, params, 0)); - _mm512_storeu_ps(dst1 + 4 * F, Activate(d41, params, 1)); - _mm512_storeu_ps(dst0 + 5 * F, Activate(d50, params, 0)); - _mm512_storeu_ps(dst1 + 5 * F, Activate(d51, params, 1)); - } - - template SIMD_INLINE void InputConvolution_1x6(const float * src0, const SimdConvolutionParameters & p, - size_t kH, size_t kW, const float * weight, const __m512 * bias, const __m512 * params, float * dst0) - { - __m512 d00, d10, d20, d30, d40, d50, s0, w0; - d00 = bias[0]; - d10 = bias[0]; - d20 = bias[0]; - d30 = bias[0]; - d40 = bias[0]; - d50 = bias[0]; - size_t size = kW * p.srcC, tail = DF * (p.kernelX - kW)*p.srcC, stride = p.srcW * p.srcC, step = p.srcC*p.strideX; - const float * src1 = src0 + 1 * step; - const float * src2 = src0 + 2 * step; - const float * src3 = src0 + 3 * step; - const float * src4 = src0 + 4 * step; - const float * src5 = src0 + 5 * step; - for (size_t ky = 0; ky < kH; ++ky) - { - size_t offset = ky * stride; - for (size_t end = offset + size; offset < end; ++offset) - { - w0 = _mm512_loadu_ps(weight + 0); - s0 = _mm512_set1_ps(src0[offset]); - d00 = _mm512_fmadd_ps(s0, w0, d00); - s0 = _mm512_set1_ps(src1[offset]); - d10 = _mm512_fmadd_ps(s0, w0, d10); - s0 = _mm512_set1_ps(src2[offset]); - d20 = _mm512_fmadd_ps(s0, w0, d20); - s0 = _mm512_set1_ps(src3[offset]); - d30 = _mm512_fmadd_ps(s0, w0, d30); - s0 = _mm512_set1_ps(src4[offset]); - d40 = _mm512_fmadd_ps(s0, w0, d40); - s0 = _mm512_set1_ps(src5[offset]); - d50 = _mm512_fmadd_ps(s0, w0, d50); - weight += DF; - } - weight += tail; - } - _mm512_storeu_ps(dst0 + 0 * F, Activate(d00, params, 0)); - _mm512_storeu_ps(dst0 + 1 * F, Activate(d10, params, 0)); - _mm512_storeu_ps(dst0 + 2 * F, Activate(d20, params, 0)); - _mm512_storeu_ps(dst0 + 3 * F, Activate(d30, params, 0)); - _mm512_storeu_ps(dst0 + 4 * F, Activate(d40, params, 0)); - _mm512_storeu_ps(dst0 + 5 * F, Activate(d50, params, 0)); - } - - template void InputConvolution(const float * src, const SimdConvolutionParameters & p, - size_t dstC, size_t yBeg, size_t yEnd, const size_t bufH[2], const float * weight, const float * bias, const float * params, float * dst) - { - size_t srcH = p.srcH, srcW = p.srcW, srcC = p.srcC, dstW = p.dstW; - size_t kernelY = p.kernelY, kernelX = p.kernelX, strideY = p.strideY, strideX = p.strideX; - size_t dstM = (bufH[0] - 1), dstS = bufH[0] * dstW * F; - size_t dstCDF = AlignLo(dstC, DF); - if (dstC - F > dstCDF) - dstCDF += DF; - - size_t noseH = p.padY, noseW = p.padX; - size_t bodyH = p.srcH - p.kernelY + 1 + noseH, bodyW = p.srcW - p.kernelX + 1 + noseW; - size_t bodyW6 = AlignLoAny(bodyW - noseW, 6 * p.strideX) + noseW; - size_t tailH = bodyH + p.padH, tailW = bodyW + p.padW; - size_t wS = p.srcC*p.dstC; - size_t kY = p.kernelY - noseH, kX = p.kernelX - noseW, kH = bodyH + p.kernelY - 1, kW = bodyW + p.kernelX - 1; - - __m512 _params[2], _bias[2]; - _params[0] = _mm512_set1_ps(params[0]); - if (type == ::SimdConvolutionActivationRestrictRange || type == ::SimdConvolutionActivationHswish) - _params[1] = _mm512_set1_ps(params[1]); - - size_t dc = 0; - for (; dc < dstCDF; dc += DF) - { - _bias[0] = bias ? _mm512_loadu_ps(bias + dc + 0) : _mm512_setzero_ps(); - _bias[1] = bias ? 
_mm512_loadu_ps(bias + dc + F) : _mm512_setzero_ps(); - if (type == ::SimdConvolutionActivationPrelu) - { - _params[0] = _mm512_loadu_ps(params + dc + 0); - _params[1] = _mm512_loadu_ps(params + dc + F); - } - size_t dy = yBeg, sy = dy * strideY; - for (; sy < noseH && dy < yEnd; sy += strideY, dy++) - { - float * dst0 = dst + (dy&dstM)*dstW*F + (dc / F)*dstS, *dst1 = dst0 + dstS; - size_t sx = 0; - const float * s = src; - const float * w = weight + (noseH - sy) * kernelX * DF * srcC; - for (; sx < noseW; sx += strideX, dst0 += F, dst1 += F) - InputConvolution_2x1(s, p, kY + sy, kX + sx, w + (noseW - sx)*srcC*DF, _bias, _params, dst0, dst1); - for (; sx < bodyW6; sx += 6 * strideX, dst0 += 6 * F, dst1 += 6 * F) - InputConvolution_2x6(s + (sx - noseW) * srcC, p, kY + sy, kernelX, w, _bias, _params, dst0, dst1); - for (; sx < bodyW; sx += strideX, dst0 += F, dst1 += F) - InputConvolution_2x1(s + (sx - noseW) * srcC, p, kY + sy, kernelX, w, _bias, _params, dst0, dst1); - for (; sx < tailW; sx += strideX, dst0 += F, dst1 += F) - InputConvolution_2x1(s + (sx - noseW) * srcC, p, kY + sy, kW - sx, w, _bias, _params, dst0, dst1); - } - for (; sy < bodyH && dy < yEnd; sy += strideY, dy++) - { - float * dst0 = dst + (dy&dstM)*dstW*F + (dc / F)*dstS, *dst1 = dst0 + dstS; - size_t sx = 0; - const float * s = src + (sy - noseH)*srcW*srcC; - const float * w = weight; - for (; sx < noseW; sx += strideX, dst0 += F, dst1 += F) - InputConvolution_2x1(s, p, kernelY, kX + sx, w + (noseW - sx)*srcC*DF, _bias, _params, dst0, dst1); - for (; sx < bodyW6; sx += 6 * strideX, dst0 += 6 * F, dst1 += 6 * F) - InputConvolution_2x6(s + (sx - noseW) * srcC, p, kernelY, kernelX, w, _bias, _params, dst0, dst1); - for (; sx < bodyW; sx += strideX, dst0 += F, dst1 += F) - InputConvolution_2x1(s + (sx - noseW) * srcC, p, kernelY, kernelX, w, _bias, _params, dst0, dst1); - for (; sx < tailW; sx += strideX, dst0 += F, dst1 += F) - InputConvolution_2x1(s + (sx - noseW) * srcC, p, kernelY, kW - sx, w, _bias, _params, dst0, dst1); - } - for (; sy < tailH && dy < yEnd; sy += strideY, dy++) - { - float * dst0 = dst + (dy&dstM)*dstW*F + (dc / F)*dstS, *dst1 = dst0 + dstS; - size_t sx = 0; - const float * s = src + (sy - noseH)*srcW*srcC; - const float * w = weight; - for (; sx < noseW; sx += strideX, dst0 += F, dst1 += F) - InputConvolution_2x1(s, p, kH - sy, kX + sx, w + (noseW - sx)*srcC*DF, _bias, _params, dst0, dst1); - for (; sx < bodyW6; sx += 6 * strideX, dst0 += 6 * F, dst1 += 6 * F) - InputConvolution_2x6(s + (sx - noseW) * srcC, p, kH - sy, kernelX, w, _bias, _params, dst0, dst1); - for (; sx < bodyW; sx += strideX, dst0 += F, dst1 += F) - InputConvolution_2x1(s + (sx - noseW) * srcC, p, kH - sy, kernelX, w, _bias, _params, dst0, dst1); - for (; sx < tailW; sx += strideX, dst0 += F, dst1 += F) - InputConvolution_2x1(s + (sx - noseW) * srcC, p, kH - sy, kW - sx, w, _bias, _params, dst0, dst1); - } - weight += kernelY * kernelX*srcC*DF; - } - if (dc < dstC) - { - _bias[0] = bias ? 
_mm512_loadu_ps(bias + dc) : _mm512_setzero_ps(); - if (type == ::SimdConvolutionActivationPrelu) - _params[0] = _mm512_loadu_ps(params + dc); - size_t dy = yBeg, sy = dy * strideY; - for (; sy < noseH && dy < yEnd; sy += strideY, dy++) - { - float * dst0 = dst + (dy&dstM)*dstW*F + (dc / F)*dstS; - size_t sx = 0; - const float * s = src; - const float * w = weight + (noseH - sy) * kernelX * DF * srcC; - for (; sx < noseW; sx += strideX, dst0 += F) - InputConvolution_1x1(s, p, kY + sy, kX + sx, w + (noseW - sx)*srcC*DF, _bias, _params, dst0); - for (; sx < bodyW6; sx += 6 * strideX, dst0 += 6 * F) - InputConvolution_1x6(s + (sx - noseW) * srcC, p, kY + sy, kernelX, w, _bias, _params, dst0); - for (; sx < bodyW; sx += strideX, dst0 += F) - InputConvolution_1x1(s + (sx - noseW) * srcC, p, kY + sy, kernelX, w, _bias, _params, dst0); - for (; sx < tailW; sx += strideX, dst0 += F) - InputConvolution_1x1(s + (sx - noseW) * srcC, p, kY + sy, kW - sx, w, _bias, _params, dst0); - } - for (; sy < bodyH && dy < yEnd; sy += strideY, dy++) - { - float * dst0 = dst + (dy&dstM)*dstW*F + (dc / F)*dstS; - size_t sx = 0; - const float * s = src + (sy - noseH)*srcW*srcC; - const float * w = weight; - for (; sx < noseW; sx += strideX, dst0 += F) - InputConvolution_1x1(s, p, kernelY, kX + sx, w + (noseW - sx)*srcC*DF, _bias, _params, dst0); - for (; sx < bodyW6; sx += 6 * strideX, dst0 += 6 * F) - InputConvolution_1x6(s + (sx - noseW) * srcC, p, kernelY, kernelX, w, _bias, _params, dst0); - for (; sx < bodyW; sx += strideX, dst0 += F) - InputConvolution_1x1(s + (sx - noseW) * srcC, p, kernelY, kernelX, w, _bias, _params, dst0); - for (; sx < tailW; sx += strideX, dst0 += F) - InputConvolution_1x1(s + (sx - noseW) * srcC, p, kernelY, kW - sx, w, _bias, _params, dst0); - } - for (; sy < tailH && dy < yEnd; sy += strideY, dy++) - { - float * dst0 = dst + (dy&dstM)*dstW*F + (dc / F)*dstS; - size_t sx = 0; - const float * s = src + (sy - noseH)*srcW*srcC; - const float * w = weight; - for (; sx < noseW; sx += strideX, dst0 += F) - InputConvolution_1x1(s, p, kH - sy, kX + sx, w + (noseW - sx)*srcC*DF, _bias, _params, dst0); - for (; sx < bodyW6; sx += 6 * strideX, dst0 += 6 * F) - InputConvolution_1x6(s + (sx - noseW) * srcC, p, kH - sy, kernelX, w, _bias, _params, dst0); - for (; sx < bodyW; sx += strideX, dst0 += F) - InputConvolution_1x1(s + (sx - noseW) * srcC, p, kH - sy, kernelX, w, _bias, _params, dst0); - for (; sx < tailW; sx += strideX, dst0 += F) - InputConvolution_1x1(s + (sx - noseW) * srcC, p, kH - sy, kW - sx, w, _bias, _params, dst0); - } - } - } - - template void DepthwiseConvolution(const float* src, const SimdConvolutionParameters& p, - size_t srcC, size_t yBeg, size_t yEnd, const size_t bufH[2], const float* weight, const float* bias, const float* params, float* dst) - { - size_t strideY = p.strideY, strideX = p.strideX, padY = p.padY, padX = p.padX, padH = p.padH, padW = p.padW; - size_t srcW = p.srcW * F, dstW = p.dstW * F, weightS = p.kernelY * p.kernelX * F, strideXF = strideX * F; - size_t srcM = (bufH[0] - 1), dstM = (bufH[1] - 1), srcS = bufH[0] * srcW, dstS = bufH[1] * dstW; - size_t noseY = (p.padY + p.strideY - 1) / p.strideY; - size_t bodyY = (p.srcH + p.padY + p.strideY - p.kernelY) / p.strideY; - size_t noseX = (p.padX + p.strideX - 1) / p.strideX; - size_t bodyX = (p.srcW + p.padX + p.strideX - p.kernelX) / p.strideX; - size_t bodyX2 = AlignLo(bodyX - noseX, 2) + noseX; - size_t bodyX4 = AlignLo(bodyX - noseX, 4) + noseX; - size_t bodyX8 = AlignLo(bodyX - noseX, 8) + noseX; - - 
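// Sketch: noseX/bodyX/bodyX2/4/8 above partition each output row into an
// edge zone that needs per-tap bounds checks and an interior zone that does
// not, with the interior further tiled by 8/4/2/1 pixels. How such bounds
// fall out of the padding arithmetic for a 1-D case (illustrative only; the
// body formula mirrors the deleted code's bodyX expression):
#include <algorithm>
#include <cstddef>

struct RowZones { size_t nose, body; };

// Output x reads inputs [x*stride - pad, x*stride - pad + kernel).
inline RowZones SplitRow(size_t srcW, size_t kernel, size_t stride, size_t pad, size_t dstW)
{
    size_t nose = (pad + stride - 1) / stride;             // first x with no left overhang
    size_t body = (srcW + pad + stride - kernel) / stride; // first x past the right edge
    return RowZones{ std::min(nose, dstW), std::min(body, dstW) };
}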
__m512 _params[2]; - _params[0] = _mm512_set1_ps(params[0]); - if (type == ::SimdConvolutionActivationRestrictRange || type == ::SimdConvolutionActivationHswish) - _params[1] = _mm512_set1_ps(params[1]); - for (size_t c = 0; c < srcC; c += F) - { - __m512 _bias = bias ? _mm512_loadu_ps(bias + c) : _mm512_setzero_ps(); - if (type == ::SimdConvolutionActivationPrelu) - _params[0] = _mm512_loadu_ps(params + c); - - for (size_t dy = yBeg; dy < yEnd; ++dy) - { - float* pd = dst + (dy & dstM) * dstW; - if (dy >= noseY && dy < bodyY) - { - size_t dx = 0; - for (; dx < noseX; ++dx, pd += F) - { - __m512 sum = _bias; - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t sy = dy * p.strideY + ky - padY; - for (size_t kx = 0; kx < p.kernelX; ++kx) - { - size_t sx = dx * p.strideX + kx - padX; - if (sx < p.srcW) - { - const float* pw = weight + (ky * p.kernelX + kx) * F; - const float* ps = src + ((sy & srcM) * p.srcW + sx) * F; - sum = _mm512_fmadd_ps(_mm512_loadu_ps(ps), _mm512_loadu_ps(pw), sum); - } - } - } - _mm512_storeu_ps(pd, Activate(sum, _params, 0)); - } - for (; dx < bodyX8; dx += 8, pd += 8 * F) - { - __m512 sum0 = _bias; - __m512 sum1 = _bias; - __m512 sum2 = _bias; - __m512 sum3 = _bias; - __m512 sum4 = _bias; - __m512 sum5 = _bias; - __m512 sum6 = _bias; - __m512 sum7 = _bias; - const float* pw = weight; - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t sy = dy * strideY + ky - padY; - const float* ps = src + ((sy & srcM) * p.srcW + dx * strideX - padX) * F; - for (size_t kx = 0; kx < p.kernelX; ++kx, ps += F, pw += F) - { - __m512 w0 = _mm512_loadu_ps(pw); - sum0 = _mm512_fmadd_ps(_mm512_loadu_ps(ps + 0 * strideXF), w0, sum0); - sum1 = _mm512_fmadd_ps(_mm512_loadu_ps(ps + 1 * strideXF), w0, sum1); - sum2 = _mm512_fmadd_ps(_mm512_loadu_ps(ps + 2 * strideXF), w0, sum2); - sum3 = _mm512_fmadd_ps(_mm512_loadu_ps(ps + 3 * strideXF), w0, sum3); - sum4 = _mm512_fmadd_ps(_mm512_loadu_ps(ps + 4 * strideXF), w0, sum4); - sum5 = _mm512_fmadd_ps(_mm512_loadu_ps(ps + 5 * strideXF), w0, sum5); - sum6 = _mm512_fmadd_ps(_mm512_loadu_ps(ps + 6 * strideXF), w0, sum6); - sum7 = _mm512_fmadd_ps(_mm512_loadu_ps(ps + 7 * strideXF), w0, sum7); - } - } - _mm512_storeu_ps(pd + 0 * F, Activate(sum0, _params, 0)); - _mm512_storeu_ps(pd + 1 * F, Activate(sum1, _params, 0)); - _mm512_storeu_ps(pd + 2 * F, Activate(sum2, _params, 0)); - _mm512_storeu_ps(pd + 3 * F, Activate(sum3, _params, 0)); - _mm512_storeu_ps(pd + 4 * F, Activate(sum4, _params, 0)); - _mm512_storeu_ps(pd + 5 * F, Activate(sum5, _params, 0)); - _mm512_storeu_ps(pd + 6 * F, Activate(sum6, _params, 0)); - _mm512_storeu_ps(pd + 7 * F, Activate(sum7, _params, 0)); - } - for (; dx < bodyX4; dx += 4, pd += 4 * F) - { - __m512 sum0 = _bias; - __m512 sum1 = _bias; - __m512 sum2 = _bias; - __m512 sum3 = _bias; - const float* pw = weight; - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t sy = dy * strideY + ky - padY; - const float* ps = src + ((sy & srcM) * p.srcW + dx * strideX - padX) * F; - for (size_t kx = 0; kx < p.kernelX; ++kx, ps += F, pw += F) - { - __m512 w0 = _mm512_loadu_ps(pw); - sum0 = _mm512_fmadd_ps(_mm512_loadu_ps(ps + 0 * strideXF), w0, sum0); - sum1 = _mm512_fmadd_ps(_mm512_loadu_ps(ps + 1 * strideXF), w0, sum1); - sum2 = _mm512_fmadd_ps(_mm512_loadu_ps(ps + 2 * strideXF), w0, sum2); - sum3 = _mm512_fmadd_ps(_mm512_loadu_ps(ps + 3 * strideXF), w0, sum3); - } - } - _mm512_storeu_ps(pd + 0 * F, Activate(sum0, _params, 0)); - _mm512_storeu_ps(pd + 1 * F, Activate(sum1, _params, 0)); - _mm512_storeu_ps(pd + 2 * F, 
Activate(sum2, _params, 0)); - _mm512_storeu_ps(pd + 3 * F, Activate(sum3, _params, 0)); - } - for (; dx < bodyX2; dx += 2, pd += 2 * F) - { - __m512 sum0 = _bias; - __m512 sum1 = _bias; - const float* pw = weight; - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t sy = dy * strideY + ky - padY; - const float* ps = src + ((sy & srcM) * p.srcW + dx * strideX - padX) * F; - for (size_t kx = 0; kx < p.kernelX; ++kx, ps += F, pw += F) - { - __m512 w0 = _mm512_loadu_ps(pw); - sum0 = _mm512_fmadd_ps(_mm512_loadu_ps(ps + 0 * strideXF), w0, sum0); - sum1 = _mm512_fmadd_ps(_mm512_loadu_ps(ps + 1 * strideXF), w0, sum1); - } - } - _mm512_storeu_ps(pd + 0 * F, Activate(sum0, _params, 0)); - _mm512_storeu_ps(pd + 1 * F, Activate(sum1, _params, 0)); - } - for (; dx < bodyX; ++dx, pd += F) - { - __m512 sum = _bias; - const float* pw = weight; - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t sy = dy * strideY + ky - padY; - const float* ps = src + ((sy & srcM) * p.srcW + dx * strideX - padX) * F; - for (size_t kx = 0; kx < p.kernelX; ++kx, ps += F, pw += F) - { - __m512 w0 = _mm512_loadu_ps(pw); - sum = _mm512_fmadd_ps(_mm512_loadu_ps(ps), w0, sum); - } - } - _mm512_storeu_ps(pd, Activate(sum, _params, 0)); - } - for (; dx < p.dstW; ++dx, pd += F) - { - __m512 sum = _bias; - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t sy = dy * strideY + ky - padY; - for (size_t kx = 0; kx < p.kernelX; ++kx) - { - size_t sx = dx * strideX + kx - padX; - if (sx < p.srcW) - { - const float* pw = weight + (ky * p.kernelX + kx) * F; - const float* ps = src + ((sy & srcM) * p.srcW + sx) * F; - sum = _mm512_fmadd_ps(_mm512_loadu_ps(ps), _mm512_loadu_ps(pw), sum); - } - } - } - _mm512_storeu_ps(pd, Activate(sum, _params, 0)); - } - } - else - { - for (size_t dx = 0; dx < p.dstW; ++dx, pd += F) - { - __m512 sum = _bias; - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t sy = dy * strideY + ky - padY; - if (sy < p.srcH) - { - for (size_t kx = 0; kx < p.kernelX; ++kx) - { - size_t sx = dx * strideX + kx - padX; - if (sx < p.srcW) - { - const float* pw = weight + (ky * p.kernelX + kx) * F; - const float* ps = src + ((sy & srcM) * p.srcW + sx) * F; - sum = _mm512_fmadd_ps(_mm512_loadu_ps(ps), _mm512_loadu_ps(pw), sum); - } - } - } - } - _mm512_storeu_ps(pd, Activate(sum, _params, 0)); - } - } - } - src += srcS; - dst += dstS; - weight += weightS; - } - } - - template SIMD_INLINE void ConvolutionDepthwise3x3Edge2x2( - const float * src0, const float * src1, const __m512 * weight, const __m512 & bias, const __m512 * params, float * dst) - { - __m512 sum0 = bias, sum1 = _mm512_setzero_ps(); - sum0 = _mm512_fmadd_ps(_mm512_loadu_ps(src0 + 0 * F), weight[0], sum0); - sum1 = _mm512_fmadd_ps(_mm512_loadu_ps(src0 + 1 * F), weight[1], sum1); - sum0 = _mm512_fmadd_ps(_mm512_loadu_ps(src1 + 0 * F), weight[3], sum0); - sum1 = _mm512_fmadd_ps(_mm512_loadu_ps(src1 + 1 * F), weight[4], sum1); - _mm512_storeu_ps(dst, Activate(_mm512_add_ps(sum0, sum1), params, 0)); - } - - template SIMD_INLINE void ConvolutionDepthwise3x3Edge2x3( - const float * src0, const float * src1, const __m512 * weight, const __m512 & bias, const __m512 * params, float * dst) - { - __m512 sum0 = bias, sum1 = _mm512_setzero_ps(), sum2 = _mm512_setzero_ps(); - sum0 = _mm512_fmadd_ps(_mm512_loadu_ps(src0 + 0 * F), weight[0], sum0); - sum1 = _mm512_fmadd_ps(_mm512_loadu_ps(src0 + 1 * F), weight[1], sum1); - sum2 = _mm512_fmadd_ps(_mm512_loadu_ps(src0 + 2 * F), weight[2], sum2); - sum0 = _mm512_fmadd_ps(_mm512_loadu_ps(src1 + 0 * F), weight[3], 
sum0); - sum1 = _mm512_fmadd_ps(_mm512_loadu_ps(src1 + 1 * F), weight[4], sum1); - sum2 = _mm512_fmadd_ps(_mm512_loadu_ps(src1 + 2 * F), weight[5], sum2); - _mm512_storeu_ps(dst, Activate(_mm512_add_ps(_mm512_add_ps(sum0, sum1), sum2), params, 0)); - } - - template SIMD_INLINE void ConvolutionDepthwise3x3Edge3x2( - const float * src0, const float * src1, const float * src2, const __m512 * weight, const __m512 & bias, const __m512 * params, float * dst) - { - __m512 sum0 = bias, sum1 = _mm512_setzero_ps(); - sum0 = _mm512_fmadd_ps(_mm512_loadu_ps(src0 + 0 * F), weight[0], sum0); - sum1 = _mm512_fmadd_ps(_mm512_loadu_ps(src0 + 1 * F), weight[1], sum1); - sum0 = _mm512_fmadd_ps(_mm512_loadu_ps(src1 + 0 * F), weight[3], sum0); - sum1 = _mm512_fmadd_ps(_mm512_loadu_ps(src1 + 1 * F), weight[4], sum1); - sum0 = _mm512_fmadd_ps(_mm512_loadu_ps(src2 + 0 * F), weight[6], sum0); - sum1 = _mm512_fmadd_ps(_mm512_loadu_ps(src2 + 1 * F), weight[7], sum1); - _mm512_storeu_ps(dst, Activate(_mm512_add_ps(sum0, sum1), params, 0)); - } - - template SIMD_INLINE void ConvolutionDepthwise3x3Main1x1( - const float * src0, const float * src1, const float * src2, const __m512 * weight, const __m512 & bias, const __m512 * params, float * dst) - { - __m512 sum0 = bias, sum1 = _mm512_setzero_ps(), sum2 = _mm512_setzero_ps(); - sum0 = _mm512_fmadd_ps(_mm512_loadu_ps(src0 + 0 * F), weight[0], sum0); - sum1 = _mm512_fmadd_ps(_mm512_loadu_ps(src0 + 1 * F), weight[1], sum1); - sum2 = _mm512_fmadd_ps(_mm512_loadu_ps(src0 + 2 * F), weight[2], sum2); - sum0 = _mm512_fmadd_ps(_mm512_loadu_ps(src1 + 0 * F), weight[3], sum0); - sum1 = _mm512_fmadd_ps(_mm512_loadu_ps(src1 + 1 * F), weight[4], sum1); - sum2 = _mm512_fmadd_ps(_mm512_loadu_ps(src1 + 2 * F), weight[5], sum2); - sum0 = _mm512_fmadd_ps(_mm512_loadu_ps(src2 + 0 * F), weight[6], sum0); - sum1 = _mm512_fmadd_ps(_mm512_loadu_ps(src2 + 1 * F), weight[7], sum1); - sum2 = _mm512_fmadd_ps(_mm512_loadu_ps(src2 + 2 * F), weight[8], sum2); - _mm512_storeu_ps(dst, Activate(_mm512_add_ps(_mm512_add_ps(sum0, sum1), sum2), params, 0)); - } - - template void DepthwiseConvolution3x3(const float * src, const SimdConvolutionParameters & p, - size_t srcC, size_t yBeg, size_t yEnd, const size_t bufH[2], const float * weight, const float * bias, const float * params, float * dst) - { - size_t strideY = p.strideY, padY = p.padY, padX = p.padX, padH = p.padH, padW = p.padW; - size_t srcW = p.srcW * F, dstW = p.dstW * F, weightS = p.kernelY * p.kernelX * F; - size_t srcM = (bufH[0] - 1), dstM = (bufH[1] - 1), srcS = bufH[0] * srcW, dstS = bufH[1] * dstW; - size_t xStep = F * p.strideX, xStep0 = (p.strideX - p.padX)*F; - size_t xMainEnd = p.dstW - p.padW, yMainEnd = yEnd == p.dstH && p.padH ? yEnd - 1 : yEnd; - - __m512 _params[2]; - _params[0] = _mm512_set1_ps(params[0]); - if (type == ::SimdConvolutionActivationRestrictRange || type == ::SimdConvolutionActivationHswish) - _params[1] = _mm512_set1_ps(params[1]); - for (size_t c = 0; c < srcC; c += F) - { - __m512 _weight[9]; - for (size_t i = 0; i < 9; ++i) - _weight[i] = _mm512_loadu_ps(weight + i * F); - __m512 _bias = bias ? 
_mm512_loadu_ps(bias + c) : _mm512_setzero_ps(); - if (type == ::SimdConvolutionActivationPrelu) - _params[0] = _mm512_loadu_ps(params + c); - - size_t dy = yBeg; - if (yBeg == 0 && padY) - { - size_t sy = 0, dx = 0; - const float * src0 = src + ((sy + 0)&srcM)*srcW; - const float * src1 = src + ((sy + 1)&srcM)*srcW; - float * pDst = dst + (dy&dstM)*dstW; - if (padX) - ConvolutionDepthwise3x3Edge2x2(src0, src1, _weight + 4, _bias, _params, pDst), pDst += F, dx++, src0 += xStep0, src1 += xStep0; - for (; dx < xMainEnd; dx++, pDst += F, src0 += xStep, src1 += xStep) - ConvolutionDepthwise3x3Edge2x3(src0, src1, _weight + 3, _bias, _params, pDst); - if (padW) - ConvolutionDepthwise3x3Edge2x2(src0, src1, _weight + 3, _bias, _params, pDst); - dy++; - } - for (; dy < yMainEnd; ++dy) - { - size_t sy = dy * strideY - padY, dx = 0; - const float * src0 = src + ((sy + 0)&srcM)*srcW; - const float * src1 = src + ((sy + 1)&srcM)*srcW; - const float * src2 = src + ((sy + 2)&srcM)*srcW; - float * pDst = dst + (dy&dstM)*dstW; - if (padX) - ConvolutionDepthwise3x3Edge3x2(src0, src1, src2, _weight + 1, _bias, _params, pDst), pDst += F, dx++, src0 += xStep0, src1 += xStep0, src2 += xStep0; - for (; dx < xMainEnd; dx++, pDst += F, src0 += xStep, src1 += xStep, src2 += xStep) - ConvolutionDepthwise3x3Main1x1(src0, src1, src2, _weight + 0, _bias, _params, pDst); - if (padW) - ConvolutionDepthwise3x3Edge3x2(src0, src1, src2, _weight + 0, _bias, _params, pDst); - } - if (dy < yEnd) - { - size_t sy = dy * strideY - padY, dx = 0; - const float * src0 = src + ((sy + 0)&srcM)*srcW; - const float * src1 = src + ((sy + 1)&srcM)*srcW; - float * pDst = dst + (dy&dstM)*dstW; - if (padX) - ConvolutionDepthwise3x3Edge2x2(src0, src1, _weight + 1, _bias, _params, pDst), pDst += F, dx++, src0 += xStep0, src1 += xStep0; - for (; dx < xMainEnd; dx++, pDst += F, src0 += xStep, src1 += xStep) - ConvolutionDepthwise3x3Edge2x3(src0, src1, _weight + 0, _bias, _params, pDst); - if (padW) - ConvolutionDepthwise3x3Edge2x2(src0, src1, _weight + 0, _bias, _params, pDst); - } - src += srcS; - dst += dstS; - weight += weightS; - } - } - - template void OutputConvolution_2x12(const float* src, size_t srcC, size_t srcS, - const float* weight, const __m512* bias, const __m512* params, float* dst, size_t dstC, const __mmask16 tails[2]) - { - __m512 d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, d60, d61, d70, d71, d80, d81, d90, d91, da0, da1, db0, db1, s0, w0, w1; - if (tails[1]) - { - d00 = _mm512_setzero_ps(), d01 = _mm512_setzero_ps(); - d10 = _mm512_setzero_ps(), d11 = _mm512_setzero_ps(); - d20 = _mm512_setzero_ps(), d21 = _mm512_setzero_ps(); - d30 = _mm512_setzero_ps(), d31 = _mm512_setzero_ps(); - d40 = _mm512_setzero_ps(), d41 = _mm512_setzero_ps(); - d50 = _mm512_setzero_ps(), d51 = _mm512_setzero_ps(); - d60 = _mm512_setzero_ps(), d61 = _mm512_setzero_ps(); - d70 = _mm512_setzero_ps(), d71 = _mm512_setzero_ps(); - d80 = _mm512_setzero_ps(), d81 = _mm512_setzero_ps(); - d90 = _mm512_setzero_ps(), d91 = _mm512_setzero_ps(); - da0 = _mm512_setzero_ps(), da1 = _mm512_setzero_ps(); - db0 = _mm512_setzero_ps(), db1 = _mm512_setzero_ps(); - for (size_t c = 0; c < srcC; c += F) - { - size_t n = Simd::Min(F, srcC - c); - for (size_t i = 0; i < n; ++i, weight += DF) - { - w0 = _mm512_loadu_ps(weight + 0); - w1 = _mm512_loadu_ps(weight + F); - s0 = _mm512_set1_ps(src[i + 0 * F]); - d00 = _mm512_fmadd_ps(s0, w0, d00); - d01 = _mm512_fmadd_ps(s0, w1, d01); - s0 = _mm512_set1_ps(src[i + 1 * F]); - d10 = _mm512_fmadd_ps(s0, w0, d10); 
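// Note on the block shape in OutputConvolution_2x12 here: it keeps 24
// accumulators (d00..db1) resident in the 32 zmm registers, so each pair of
// weight loads (w0, w1) is reused across twelve rows of FMAs. A reduced 2x2
// sketch of the same blocking (hypothetical helper; caller zeroes acc[]):
#include <immintrin.h>
#include <cstddef>

inline void Block2x2(const float* src0, const float* src1, const float* w, size_t n, __m512 acc[4])
{
    for (size_t i = 0; i < n; ++i)
    {
        __m512 w0 = _mm512_loadu_ps(w + i * 32 + 0);  // column 0 weights
        __m512 w1 = _mm512_loadu_ps(w + i * 32 + 16); // column 1 weights
        __m512 s0 = _mm512_set1_ps(src0[i]);          // row 0 input
        acc[0] = _mm512_fmadd_ps(s0, w0, acc[0]);
        acc[1] = _mm512_fmadd_ps(s0, w1, acc[1]);
        __m512 s1 = _mm512_set1_ps(src1[i]);          // row 1 input
        acc[2] = _mm512_fmadd_ps(s1, w0, acc[2]);
        acc[3] = _mm512_fmadd_ps(s1, w1, acc[3]);
    }
}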
- d11 = _mm512_fmadd_ps(s0, w1, d11); - s0 = _mm512_set1_ps(src[i + 2 * F]); - d20 = _mm512_fmadd_ps(s0, w0, d20); - d21 = _mm512_fmadd_ps(s0, w1, d21); - s0 = _mm512_set1_ps(src[i + 3 * F]); - d30 = _mm512_fmadd_ps(s0, w0, d30); - d31 = _mm512_fmadd_ps(s0, w1, d31); - s0 = _mm512_set1_ps(src[i + 4 * F]); - d40 = _mm512_fmadd_ps(s0, w0, d40); - d41 = _mm512_fmadd_ps(s0, w1, d41); - s0 = _mm512_set1_ps(src[i + 5 * F]); - d50 = _mm512_fmadd_ps(s0, w0, d50); - d51 = _mm512_fmadd_ps(s0, w1, d51); - s0 = _mm512_set1_ps(src[i + 6 * F]); - d60 = _mm512_fmadd_ps(s0, w0, d60); - d61 = _mm512_fmadd_ps(s0, w1, d61); - s0 = _mm512_set1_ps(src[i + 7 * F]); - d70 = _mm512_fmadd_ps(s0, w0, d70); - d71 = _mm512_fmadd_ps(s0, w1, d71); - s0 = _mm512_set1_ps(src[i + 8 * F]); - d80 = _mm512_fmadd_ps(s0, w0, d80); - d81 = _mm512_fmadd_ps(s0, w1, d81); - s0 = _mm512_set1_ps(src[i + 9 * F]); - d90 = _mm512_fmadd_ps(s0, w0, d90); - d91 = _mm512_fmadd_ps(s0, w1, d91); - s0 = _mm512_set1_ps(src[i + 10 * F]); - da0 = _mm512_fmadd_ps(s0, w0, da0); - da1 = _mm512_fmadd_ps(s0, w1, da1); - s0 = _mm512_set1_ps(src[i + 11 * F]); - db0 = _mm512_fmadd_ps(s0, w0, db0); - db1 = _mm512_fmadd_ps(s0, w1, db1); - } - src += srcS; - } - Term::template Save(dst + 0, d00, bias, params); - Term::template Save(dst + F, d01, bias, params, tails[1]); - dst += dstC; - Term::template Save(dst + 0, d10, bias, params); - Term::template Save(dst + F, d11, bias, params, tails[1]); - dst += dstC; - Term::template Save(dst + 0, d20, bias, params); - Term::template Save(dst + F, d21, bias, params, tails[1]); - dst += dstC; - Term::template Save(dst + 0, d30, bias, params); - Term::template Save(dst + F, d31, bias, params, tails[1]); - dst += dstC; - Term::template Save(dst + 0, d40, bias, params); - Term::template Save(dst + F, d41, bias, params, tails[1]); - dst += dstC; - Term::template Save(dst + 0, d50, bias, params); - Term::template Save(dst + F, d51, bias, params, tails[1]); - dst += dstC; - Term::template Save(dst + 0, d60, bias, params); - Term::template Save(dst + F, d61, bias, params, tails[1]); - dst += dstC; - Term::template Save(dst + 0, d70, bias, params); - Term::template Save(dst + F, d71, bias, params, tails[1]); - dst += dstC; - Term::template Save(dst + 0, d80, bias, params); - Term::template Save(dst + F, d81, bias, params, tails[1]); - dst += dstC; - Term::template Save(dst + 0, d90, bias, params); - Term::template Save(dst + F, d91, bias, params, tails[1]); - dst += dstC; - Term::template Save(dst + 0, da0, bias, params); - Term::template Save(dst + F, da1, bias, params, tails[1]); - dst += dstC; - Term::template Save(dst + 0, db0, bias, params); - Term::template Save(dst + F, db1, bias, params, tails[1]); - } - else - { - d00 = _mm512_setzero_ps(); - d10 = _mm512_setzero_ps(); - d20 = _mm512_setzero_ps(); - d30 = _mm512_setzero_ps(); - d40 = _mm512_setzero_ps(); - d50 = _mm512_setzero_ps(); - d60 = _mm512_setzero_ps(); - d70 = _mm512_setzero_ps(); - d80 = _mm512_setzero_ps(); - d90 = _mm512_setzero_ps(); - da0 = _mm512_setzero_ps(); - db0 = _mm512_setzero_ps(); - for (size_t c = 0; c < srcC; c += F) - { - size_t n = Simd::Min(F, srcC - c); - for (size_t i = 0; i < n; ++i, weight += DF) - { - w0 = _mm512_loadu_ps(weight + 0); - s0 = _mm512_set1_ps(src[i + 0 * F]); - d00 = _mm512_fmadd_ps(s0, w0, d00); - s0 = _mm512_set1_ps(src[i + 1 * F]); - d10 = _mm512_fmadd_ps(s0, w0, d10); - s0 = _mm512_set1_ps(src[i + 2 * F]); - d20 = _mm512_fmadd_ps(s0, w0, d20); - s0 = _mm512_set1_ps(src[i + 3 * F]); - d30 = _mm512_fmadd_ps(s0, w0, 
d30); - s0 = _mm512_set1_ps(src[i + 4 * F]); - d40 = _mm512_fmadd_ps(s0, w0, d40); - s0 = _mm512_set1_ps(src[i + 5 * F]); - d50 = _mm512_fmadd_ps(s0, w0, d50); - s0 = _mm512_set1_ps(src[i + 6 * F]); - d60 = _mm512_fmadd_ps(s0, w0, d60); - s0 = _mm512_set1_ps(src[i + 7 * F]); - d70 = _mm512_fmadd_ps(s0, w0, d70); - s0 = _mm512_set1_ps(src[i + 8 * F]); - d80 = _mm512_fmadd_ps(s0, w0, d80); - s0 = _mm512_set1_ps(src[i + 9 * F]); - d90 = _mm512_fmadd_ps(s0, w0, d90); - s0 = _mm512_set1_ps(src[i + 10 * F]); - da0 = _mm512_fmadd_ps(s0, w0, da0); - s0 = _mm512_set1_ps(src[i + 11 * F]); - db0 = _mm512_fmadd_ps(s0, w0, db0); - } - src += srcS; - } - Term::template Save(dst + 0, d00, bias, params, tails[0]); - dst += dstC; - Term::template Save(dst + 0, d10, bias, params, tails[0]); - dst += dstC; - Term::template Save(dst + 0, d20, bias, params, tails[0]); - dst += dstC; - Term::template Save(dst + 0, d30, bias, params, tails[0]); - dst += dstC; - Term::template Save(dst + 0, d40, bias, params, tails[0]); - dst += dstC; - Term::template Save(dst + 0, d50, bias, params, tails[0]); - dst += dstC; - Term::template Save(dst + 0, d60, bias, params, tails[0]); - dst += dstC; - Term::template Save(dst + 0, d70, bias, params, tails[0]); - dst += dstC; - Term::template Save(dst + 0, d80, bias, params, tails[0]); - dst += dstC; - Term::template Save(dst + 0, d90, bias, params, tails[0]); - dst += dstC; - Term::template Save(dst + 0, da0, bias, params, tails[0]); - dst += dstC; - Term::template Save(dst + 0, db0, bias, params, tails[0]); - } - } - - template void OutputConvolution_2x6(const float * src, size_t srcC, size_t srcS, - const float * weight, const __m512 * bias, const __m512 * params, float * dst, size_t dstC, const __mmask16 tails[2]) - { - __m512 d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, s0, w0, w1; - if (tails[1]) - { - d00 = _mm512_setzero_ps(), d01 = _mm512_setzero_ps(); - d10 = _mm512_setzero_ps(), d11 = _mm512_setzero_ps(); - d20 = _mm512_setzero_ps(), d21 = _mm512_setzero_ps(); - d30 = _mm512_setzero_ps(), d31 = _mm512_setzero_ps(); - d40 = _mm512_setzero_ps(), d41 = _mm512_setzero_ps(); - d50 = _mm512_setzero_ps(), d51 = _mm512_setzero_ps(); - for (size_t c = 0; c < srcC; c += F) - { - size_t n = Simd::Min(F, srcC - c); - for (size_t i = 0; i < n; ++i, weight += DF) - { - w0 = _mm512_loadu_ps(weight + 0); - w1 = _mm512_loadu_ps(weight + F); - s0 = _mm512_set1_ps(src[i + 0 * F]); - d00 = _mm512_fmadd_ps(s0, w0, d00); - d01 = _mm512_fmadd_ps(s0, w1, d01); - s0 = _mm512_set1_ps(src[i + 1 * F]); - d10 = _mm512_fmadd_ps(s0, w0, d10); - d11 = _mm512_fmadd_ps(s0, w1, d11); - s0 = _mm512_set1_ps(src[i + 2 * F]); - d20 = _mm512_fmadd_ps(s0, w0, d20); - d21 = _mm512_fmadd_ps(s0, w1, d21); - s0 = _mm512_set1_ps(src[i + 3 * F]); - d30 = _mm512_fmadd_ps(s0, w0, d30); - d31 = _mm512_fmadd_ps(s0, w1, d31); - s0 = _mm512_set1_ps(src[i + 4 * F]); - d40 = _mm512_fmadd_ps(s0, w0, d40); - d41 = _mm512_fmadd_ps(s0, w1, d41); - s0 = _mm512_set1_ps(src[i + 5 * F]); - d50 = _mm512_fmadd_ps(s0, w0, d50); - d51 = _mm512_fmadd_ps(s0, w1, d51); - } - src += srcS; - } - Term::template Save(dst + 0, d00, bias, params); - Term::template Save(dst + F, d01, bias, params, tails[1]); - dst += dstC; - Term::template Save(dst + 0, d10, bias, params); - Term::template Save(dst + F, d11, bias, params, tails[1]); - dst += dstC; - Term::template Save(dst + 0, d20, bias, params); - Term::template Save(dst + F, d21, bias, params, tails[1]); - dst += dstC; - Term::template Save(dst + 0, d30, bias, params); - 
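// Note on tails[] here: the second mask covers the final partial group of 16
// output channels, so the same kernel serves any dstC not divisible by 16.
// The masked-tail idiom in isolation (illustrative helpers, not the
// library's exact TailMask16):
#include <immintrin.h>
#include <cstddef>

inline __mmask16 TailMask(size_t n)        // n in [0, 16]
{
    return (__mmask16)((1u << n) - 1u);    // low n lanes set; n == 16 gives 0xFFFF
}

inline void AddTail(const float* src, float* dst, size_t n)
{
    __mmask16 m = TailMask(n);
    __m512 s = _mm512_maskz_loadu_ps(m, src);           // disabled lanes read as 0
    __m512 d = _mm512_maskz_loadu_ps(m, dst);
    _mm512_mask_storeu_ps(dst, m, _mm512_add_ps(s, d)); // only n lanes written
}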
Term::template Save(dst + F, d31, bias, params, tails[1]); - dst += dstC; - Term::template Save(dst + 0, d40, bias, params); - Term::template Save(dst + F, d41, bias, params, tails[1]); - dst += dstC; - Term::template Save(dst + 0, d50, bias, params); - Term::template Save(dst + F, d51, bias, params, tails[1]); - } - else - { - d00 = _mm512_setzero_ps(); - d10 = _mm512_setzero_ps(); - d20 = _mm512_setzero_ps(); - d30 = _mm512_setzero_ps(); - d40 = _mm512_setzero_ps(); - d50 = _mm512_setzero_ps(); - for (size_t c = 0; c < srcC; c += F) - { - size_t n = Simd::Min(F, srcC - c); - for (size_t i = 0; i < n; ++i, weight += DF) - { - w0 = _mm512_loadu_ps(weight + 0); - s0 = _mm512_set1_ps(src[i + 0 * F]); - d00 = _mm512_fmadd_ps(s0, w0, d00); - s0 = _mm512_set1_ps(src[i + 1 * F]); - d10 = _mm512_fmadd_ps(s0, w0, d10); - s0 = _mm512_set1_ps(src[i + 2 * F]); - d20 = _mm512_fmadd_ps(s0, w0, d20); - s0 = _mm512_set1_ps(src[i + 3 * F]); - d30 = _mm512_fmadd_ps(s0, w0, d30); - s0 = _mm512_set1_ps(src[i + 4 * F]); - d40 = _mm512_fmadd_ps(s0, w0, d40); - s0 = _mm512_set1_ps(src[i + 5 * F]); - d50 = _mm512_fmadd_ps(s0, w0, d50); - } - src += srcS; - } - Term::template Save(dst + 0, d00, bias, params, tails[0]); - dst += dstC; - Term::template Save(dst + 0, d10, bias, params, tails[0]); - dst += dstC; - Term::template Save(dst + 0, d20, bias, params, tails[0]); - dst += dstC; - Term::template Save(dst + 0, d30, bias, params, tails[0]); - dst += dstC; - Term::template Save(dst + 0, d40, bias, params, tails[0]); - dst += dstC; - Term::template Save(dst + 0, d50, bias, params, tails[0]); - } - } - - template void OutputConvolution_2xM(const float* src, size_t srcC, size_t srcS, - const float* weight, const __m512* bias, const __m512* params, float* dst, size_t dstC, const __mmask16 tails[2]) - { - __m512 d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, s0, w0, w1; - if (tails[1]) - { - if (M > 0) d00 = _mm512_setzero_ps(), d01 = _mm512_setzero_ps(); - if (M > 1) d10 = _mm512_setzero_ps(), d11 = _mm512_setzero_ps(); - if (M > 2) d20 = _mm512_setzero_ps(), d21 = _mm512_setzero_ps(); - if (M > 3) d30 = _mm512_setzero_ps(), d31 = _mm512_setzero_ps(); - if (M > 4) d40 = _mm512_setzero_ps(), d41 = _mm512_setzero_ps(); - if (M > 5) d50 = _mm512_setzero_ps(), d51 = _mm512_setzero_ps(); - for (size_t c = 0; c < srcC; c += F) - { - size_t n = Simd::Min(F, srcC - c); - for (size_t i = 0; i < n; ++i, weight += DF) - { - w0 = _mm512_loadu_ps(weight + 0); - w1 = _mm512_loadu_ps(weight + F); - if (M > 0) s0 = _mm512_set1_ps(src[i + 0 * F]), d00 = _mm512_fmadd_ps(s0, w0, d00), d01 = _mm512_fmadd_ps(s0, w1, d01); - if (M > 1) s0 = _mm512_set1_ps(src[i + 1 * F]), d10 = _mm512_fmadd_ps(s0, w0, d10), d11 = _mm512_fmadd_ps(s0, w1, d11); - if (M > 2) s0 = _mm512_set1_ps(src[i + 2 * F]), d20 = _mm512_fmadd_ps(s0, w0, d20), d21 = _mm512_fmadd_ps(s0, w1, d21); - if (M > 3) s0 = _mm512_set1_ps(src[i + 3 * F]), d30 = _mm512_fmadd_ps(s0, w0, d30), d31 = _mm512_fmadd_ps(s0, w1, d31); - if (M > 4) s0 = _mm512_set1_ps(src[i + 4 * F]), d40 = _mm512_fmadd_ps(s0, w0, d40), d41 = _mm512_fmadd_ps(s0, w1, d41); - if (M > 5) s0 = _mm512_set1_ps(src[i + 5 * F]), d50 = _mm512_fmadd_ps(s0, w0, d50), d51 = _mm512_fmadd_ps(s0, w1, d51); - } - src += srcS; - } - if (M > 0) Term::template Save(dst + 0, d00, bias, params), Term::template Save(dst + F, d01, bias, params, tails[1]), dst += dstC; - if (M > 1) Term::template Save(dst + 0, d10, bias, params), Term::template Save(dst + F, d11, bias, params, tails[1]), dst += dstC; - if (M > 2) 
Term::template Save(dst + 0, d20, bias, params), Term::template Save(dst + F, d21, bias, params, tails[1]), dst += dstC; - if (M > 3) Term::template Save(dst + 0, d30, bias, params), Term::template Save(dst + F, d31, bias, params, tails[1]), dst += dstC; - if (M > 4) Term::template Save(dst + 0, d40, bias, params), Term::template Save(dst + F, d41, bias, params, tails[1]), dst += dstC; - if (M > 5) Term::template Save(dst + 0, d50, bias, params), Term::template Save(dst + F, d51, bias, params, tails[1]), dst += dstC; - } - else - { - if (M > 0) d00 = _mm512_setzero_ps(); - if (M > 1) d10 = _mm512_setzero_ps(); - if (M > 2) d20 = _mm512_setzero_ps(); - if (M > 3) d30 = _mm512_setzero_ps(); - if (M > 4) d40 = _mm512_setzero_ps(); - if (M > 5) d50 = _mm512_setzero_ps(); - for (size_t c = 0; c < srcC; c += F) - { - size_t n = Simd::Min(F, srcC - c); - for (size_t i = 0; i < n; ++i, weight += DF) - { - w0 = _mm512_loadu_ps(weight + 0); - if (M > 0) s0 = _mm512_set1_ps(src[i + 0 * F]), d00 = _mm512_fmadd_ps(s0, w0, d00); - if (M > 1) s0 = _mm512_set1_ps(src[i + 1 * F]), d10 = _mm512_fmadd_ps(s0, w0, d10); - if (M > 2) s0 = _mm512_set1_ps(src[i + 2 * F]), d20 = _mm512_fmadd_ps(s0, w0, d20); - if (M > 3) s0 = _mm512_set1_ps(src[i + 3 * F]), d30 = _mm512_fmadd_ps(s0, w0, d30); - if (M > 4) s0 = _mm512_set1_ps(src[i + 4 * F]), d40 = _mm512_fmadd_ps(s0, w0, d40); - if (M > 5) s0 = _mm512_set1_ps(src[i + 5 * F]), d50 = _mm512_fmadd_ps(s0, w0, d50); - } - src += srcS; - } - if (M > 0) Term::template Save(dst + 0, d00, bias, params, tails[0]), dst += dstC; - if (M > 1) Term::template Save(dst + 0, d10, bias, params, tails[0]), dst += dstC; - if (M > 2) Term::template Save(dst + 0, d20, bias, params, tails[0]), dst += dstC; - if (M > 3) Term::template Save(dst + 0, d30, bias, params, tails[0]), dst += dstC; - if (M > 4) Term::template Save(dst + 0, d40, bias, params, tails[0]), dst += dstC; - if (M > 5) Term::template Save(dst + 0, d50, bias, params, tails[0]), dst += dstC; - } - } - - typedef void(*OutputConvolution_2xM_Ptr)(const float* src, size_t srcC, size_t srcS, const float* weight, const __m512* bias, const __m512* params, float* dst, size_t dstC, const __mmask16 tails[2]); - - template OutputConvolution_2xM_Ptr GetOutputConvolution_2xM(size_t M) - { - switch (M) - { - case 0: return OutputConvolution_2xM; - case 1: return OutputConvolution_2xM; - case 2: return OutputConvolution_2xM; - case 3: return OutputConvolution_2xM; - case 4: return OutputConvolution_2xM; - case 5: return OutputConvolution_2xM; - } - assert(0); - return NULL; - } - - template void OutputConvolution(const float * src, const SimdConvolutionParameters & p, - size_t srcC, size_t yBeg, size_t yEnd, const size_t bufH[2], const float * weight, const float * bias, const float * params, float * dst) - { - assert(p.group == 1 && p.kernelY == 1 && p.strideY == 1); - size_t srcH = p.srcH, srcW = p.srcW, dstW = p.dstW, dstC = p.dstC; - size_t srcM = (bufH[1] - 1), srcS = bufH[1] * srcW*F; -#ifdef SIMD_MERGECONV_MERGE_OUTPUT_ROWS - size_t yInt = Simd::Max(yBeg, yEnd & (~srcM)), nBeg = yBeg * srcW, nInt = yInt * srcW, nEnd = yEnd * srcW; - size_t nInt6 = AlignLoAny(nInt - nBeg, 6) + nBeg, nEnd6 = AlignLoAny(nEnd - nInt, 6) + nInt, nIntTail = nInt - nInt6, nEndTail = nEnd - nEnd6; - size_t nInt12 = AlignLoAny(nInt - nBeg, 12) + nBeg, nEnd12 = AlignLoAny(nEnd - nInt, 12) + nInt; - OutputConvolution_2xM_Ptr tailInt = GetOutputConvolution_2xM(nIntTail); - OutputConvolution_2xM_Ptr tailEnd = GetOutputConvolution_2xM(nEndTail); -#else - 
-            size_t dstW6 = AlignLoAny(dstW, 6), wTail = dstW - dstW6;
-            OutputConvolution_2xM_Ptr tailW = GetOutputConvolution_2xM<term, type>(wTail);
-#endif
-
-            __m512 _params[2], _bias[2];
-            _params[0] = _mm512_set1_ps(params[0]);
-            if (type == ::SimdConvolutionActivationRestrictRange || type == ::SimdConvolutionActivationHswish)
-                _params[1] = _mm512_set1_ps(params[1]);
-
-            dst += yBeg * p.dstW * p.dstC;
-            size_t dc = 0;
-            for (; dc < dstC; dc += DF)
-            {
-                size_t tail = Simd::Min(DF, dstC - dc);
-                __mmask16 tails[2] = { TailMask16(tail), TailMask16(tail - F) };
-                _bias[0] = _mm512_loadu_ps(bias + dc + 0);
-                _bias[1] = _mm512_loadu_ps(bias + dc + F);
-                if (type == ::SimdConvolutionActivationPrelu)
-                {
-                    _params[0] = _mm512_loadu_ps(params + dc + 0);
-                    _params[1] = _mm512_loadu_ps(params + dc + F);
-                }
-                float * pDst = dst + dc;
-#ifdef SIMD_MERGECONV_MERGE_OUTPUT_ROWS
-                const float* src0 = src + (yBeg & srcM) * srcW * F;
-                const float* src1 = src + (yInt & srcM) * srcW * F;
-                size_t dn = nBeg;
-                for (; dn < nInt12; dn += 12, pDst += 12 * dstC, src0 += 12 * F)
-                    OutputConvolution_2x12<term, type>(src0, srcC, srcS, weight, _bias, _params, pDst, dstC, tails);
-                for (; dn < nInt6; dn += 6, pDst += 6 * dstC, src0 += 6 * F)
-                    OutputConvolution_2x6<term, type>(src0, srcC, srcS, weight, _bias, _params, pDst, dstC, tails);
-                if (nIntTail)
-                    tailInt(src0, srcC, srcS, weight, _bias, _params, pDst, dstC, tails), dn += nIntTail, pDst += nIntTail * dstC, src0 += nIntTail * F;
-                for (; dn < nEnd12; dn += 12, pDst += 12 * dstC, src1 += 12 * F)
-                    OutputConvolution_2x12<term, type>(src1, srcC, srcS, weight, _bias, _params, pDst, dstC, tails);
-                for (; dn < nEnd6; dn += 6, pDst += 6 * dstC, src1 += 6 * F)
-                    OutputConvolution_2x6<term, type>(src1, srcC, srcS, weight, _bias, _params, pDst, dstC, tails);
-                if (nEndTail)
-                    tailEnd(src1, srcC, srcS, weight, _bias, _params, pDst, dstC, tails), dn += nEndTail, pDst += nEndTail * dstC, src1 += nEndTail * F;
-#else
-                for (size_t y = yBeg; y < yEnd; ++y)
-                {
-                    const float * pSrc = src + (y & srcM) * srcW * F;
-                    size_t x = 0;
-                    for (; x < dstW6; x += 6, pDst += 6 * dstC, pSrc += 6 * F)
-                        OutputConvolution_2x6<term, type>(pSrc, srcC, srcS, weight, _bias, _params, pDst, dstC, tails);
-                    if (wTail)
-                        tailW(pSrc, srcC, srcS, weight, _bias, _params, pDst, dstC, tails), pDst += wTail * dstC, pSrc += wTail * F;
-                }
-#endif
-                weight += srcC * DF;
-            }
-        }
-
-        template<SimdConvolutionActivationType type> void SetConvolutionPtr(const MergConvParam32f & p, size_t index, SynetMergedConvolution32f::ConvolutionPtr convolution[3])
-        {
-            switch (index)
-            {
-            case 0:
-                if (p.conv[0].kernelY == 1 && p.conv[0].strideY == 1)
-                    convolution[0] = InputConvolution1x1<type>;
-                else
-                    convolution[0] = InputConvolution<type>;
-                break;
-            case 1:
-                if (p.conv[1].kernelY == 3)
-                    convolution[1] = DepthwiseConvolution3x3<type>;
-                else
-                    convolution[1] = DepthwiseConvolution<type>;
-                break;
-            case 2:
-                if (p.add)
-                {
-                    convolution[2] = OutputConvolution<TermSingle, SimdConvolutionActivationIdentity>;
-                    convolution[3] = OutputConvolution<TermFirst, SimdConvolutionActivationIdentity>;
-                    convolution[4] = OutputConvolution<TermIterim, SimdConvolutionActivationIdentity>;
-                    convolution[5] = OutputConvolution<TermLast, SimdConvolutionActivationIdentity>;
-                }
-                else
-                {
-                    convolution[2] = OutputConvolution<TermSingle, type>;
-                    convolution[3] = OutputConvolution<TermFirst, SimdConvolutionActivationIdentity>;
-                    convolution[4] = OutputConvolution<TermIterim, SimdConvolutionActivationIdentity>;
-                    convolution[5] = OutputConvolution<TermLast, type>;
-                }
-                break;
-            default:
-                assert(0);
-            }
-        }
-
-        SynetMergedConvolution32f::SynetMergedConvolution32f(const MergConvParam32f & p)
-            : Avx2::SynetMergedConvolution32f(p)
-        {
-            SetSize(Base::AlgCacheL1(), Base::AlgCacheL2(), Base::AlgCacheL3(), Avx512f::F);
-            for (size_t i = 0; i < _param.count; ++i)
-            {
-                switch (p.conv[i].activation)
-                {
-                case SimdConvolutionActivationIdentity: SetConvolutionPtr<SimdConvolutionActivationIdentity>(_param, i, _convolution); break;
-                case SimdConvolutionActivationRelu: SetConvolutionPtr<SimdConvolutionActivationRelu>(_param, i, _convolution); break;
-                case SimdConvolutionActivationLeakyRelu: SetConvolutionPtr<SimdConvolutionActivationLeakyRelu>(_param, i, _convolution); break;
-                case SimdConvolutionActivationRestrictRange: SetConvolutionPtr<SimdConvolutionActivationRestrictRange>(_param, i, _convolution); break;
-                case SimdConvolutionActivationPrelu: SetConvolutionPtr<SimdConvolutionActivationPrelu>(_param, i, _convolution); break;
-                case SimdConvolutionActivationElu: SetConvolutionPtr<SimdConvolutionActivationElu>(_param, i, _convolution); break;
-                case SimdConvolutionActivationHswish: SetConvolutionPtr<SimdConvolutionActivationHswish>(_param, i, _convolution); break;
-                default: assert(0);
-                }
-            }
-        }
-
-        //---------------------------------------------------------------------
-
-        void * SynetMergedConvolution32fInit(size_t batch, const SimdConvolutionParameters * convs, size_t count, SimdBool add)
-        {
-            MergConvParam32f param(batch, convs, count, add);
-            if (!param.Valid())
-                return NULL;
-            if (param.conv[1].dstC <= HF && param.conv[2].dstC <= HF)
-                return new Avx2::SynetMergedConvolution32f(param);
-            else
-                return new Avx512f::SynetMergedConvolution32f(param);
-        }
-    }
-#endif//SIMD_AVX512F_ENABLE
-}
diff --git a/src/3rd/Simd/Simd/SimdAvx512fSynetPooling.cpp b/src/3rd/Simd/Simd/SimdAvx512fSynetPooling.cpp
deleted file mode 100644
index 5e236a62..00000000
--- a/src/3rd/Simd/Simd/SimdAvx512fSynetPooling.cpp
+++ /dev/null
@@ -1,375 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2020 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdAvx1.h" -#include "Simd/SimdAvx2.h" -#include "Simd/SimdAvx512f.h" - -namespace Simd -{ -#ifdef SIMD_AVX512F_ENABLE - namespace Avx512f - { - SIMD_INLINE void PoolingAverageNhwc1(const float* src, size_t srcS, size_t srcC, size_t kH, size_t kW, const __m512 & norm, float * dst, __mmask16 tail = -1) - { - __m512 sum0 = _mm512_setzero_ps(); - for (size_t h = 0; h < kH; ++h) - { - for (size_t w = 0; w < kW; ++w) - { - const float* ps = src + w * srcC; - sum0 = _mm512_add_ps(sum0, _mm512_maskz_loadu_ps(tail, ps + 0 * F)); - } - src += srcS; - } - _mm512_mask_storeu_ps(dst + 0 * F, tail, _mm512_mul_ps(sum0, norm)); - } - - SIMD_INLINE void PoolingAverageNhwc2(const float* src, size_t srcS, size_t srcC, size_t kH, size_t kW, const __m512& norm, float* dst) - { - __m512 sum0 = _mm512_setzero_ps(); - __m512 sum1 = _mm512_setzero_ps(); - for (size_t h = 0; h < kH; ++h) - { - for (size_t w = 0; w < kW; ++w) - { - const float* ps = src + w * srcC; - sum0 = _mm512_add_ps(sum0, _mm512_loadu_ps(ps + 0 * F)); - sum1 = _mm512_add_ps(sum1, _mm512_loadu_ps(ps + 1 * F)); - } - src += srcS; - } - _mm512_storeu_ps(dst + 0 * F, _mm512_mul_ps(sum0, norm)); - _mm512_storeu_ps(dst + 1 * F, _mm512_mul_ps(sum1, norm)); - } - - SIMD_INLINE void PoolingAverageNhwc4(const float* src, size_t srcS, size_t srcC, size_t kH, size_t kW, const __m512& norm, float* dst) - { - __m512 sum0 = _mm512_setzero_ps(); - __m512 sum1 = _mm512_setzero_ps(); - __m512 sum2 = _mm512_setzero_ps(); - __m512 sum3 = _mm512_setzero_ps(); - for (size_t h = 0; h < kH; ++h) - { - for (size_t w = 0; w < kW; ++w) - { - const float* ps = src + w * srcC; - sum0 = _mm512_add_ps(sum0, _mm512_loadu_ps(ps + 0 * F)); - sum1 = _mm512_add_ps(sum1, _mm512_loadu_ps(ps + 1 * F)); - sum2 = _mm512_add_ps(sum2, _mm512_loadu_ps(ps + 2 * F)); - sum3 = _mm512_add_ps(sum3, _mm512_loadu_ps(ps + 3 * F)); - } - src += srcS; - } - _mm512_storeu_ps(dst + 0 * F, _mm512_mul_ps(sum0, norm)); - _mm512_storeu_ps(dst + 1 * F, _mm512_mul_ps(sum1, norm)); - _mm512_storeu_ps(dst + 2 * F, _mm512_mul_ps(sum2, norm)); - _mm512_storeu_ps(dst + 3 * F, _mm512_mul_ps(sum3, norm)); - } - - SIMD_INLINE void PoolingAverageNhwc8(const float* src, size_t srcS, size_t srcC, size_t kH, size_t kW, const __m512& norm, float* dst) - { - __m512 sum0 = _mm512_setzero_ps(); - __m512 sum1 = _mm512_setzero_ps(); - __m512 sum2 = _mm512_setzero_ps(); - __m512 sum3 = _mm512_setzero_ps(); - __m512 sum4 = _mm512_setzero_ps(); - __m512 sum5 = _mm512_setzero_ps(); - __m512 sum6 = _mm512_setzero_ps(); - __m512 sum7 = _mm512_setzero_ps(); - for (size_t h = 0; h < kH; ++h) - { - for (size_t w = 0; w < kW; ++w) - { - const float* ps = src + w * srcC; - sum0 = _mm512_add_ps(sum0, _mm512_loadu_ps(ps + 0 * F)); - sum1 = _mm512_add_ps(sum1, _mm512_loadu_ps(ps + 1 * F)); - sum2 = _mm512_add_ps(sum2, _mm512_loadu_ps(ps + 2 * F)); - sum3 = _mm512_add_ps(sum3, _mm512_loadu_ps(ps + 3 * F)); - sum4 = _mm512_add_ps(sum4, _mm512_loadu_ps(ps + 4 * F)); - sum5 = _mm512_add_ps(sum5, _mm512_loadu_ps(ps + 5 * F)); - sum6 = _mm512_add_ps(sum6, _mm512_loadu_ps(ps + 6 * F)); - sum7 = _mm512_add_ps(sum7, _mm512_loadu_ps(ps + 7 * F)); - } - src += srcS; - } - _mm512_storeu_ps(dst + 0 * F, _mm512_mul_ps(sum0, norm)); - _mm512_storeu_ps(dst + 1 * F, _mm512_mul_ps(sum1, norm)); - _mm512_storeu_ps(dst + 2 * F, _mm512_mul_ps(sum2, norm)); - _mm512_storeu_ps(dst + 3 * F, _mm512_mul_ps(sum3, norm)); - _mm512_storeu_ps(dst + 4 * F, _mm512_mul_ps(sum4, norm)); - 
_mm512_storeu_ps(dst + 5 * F, _mm512_mul_ps(sum5, norm)); - _mm512_storeu_ps(dst + 6 * F, _mm512_mul_ps(sum6, norm)); - _mm512_storeu_ps(dst + 7 * F, _mm512_mul_ps(sum7, norm)); - } - - SIMD_INLINE void PoolingAverageNhwc(const float* src, size_t srcS, size_t srcC, size_t srcCF1, - size_t srcCF2, size_t srcCF4, size_t srcCF8, size_t kernelY, size_t kernelX, const __m512& norm, float* dst, __mmask16 tail) - { - size_t c = 0; - for (; c < srcCF8; c += 8 * F) - PoolingAverageNhwc8(src + c, srcS, srcC, kernelY, kernelX, norm, dst + c); - for (; c < srcCF4; c += 4 * F) - PoolingAverageNhwc4(src + c, srcS, srcC, kernelY, kernelX, norm, dst + c); - for (; c < srcCF2; c += 2 * F) - PoolingAverageNhwc2(src + c, srcS, srcC, kernelY, kernelX, norm, dst + c); - for (; c < srcCF1; c += 1 * F) - PoolingAverageNhwc1(src + c, srcS, srcC, kernelY, kernelX, norm, dst + c); - if (c < srcC) - PoolingAverageNhwc1(src + c, srcS, srcC, kernelY, kernelX, norm, dst + c, tail); - } - - void SynetPoolingForwardAverage(const float* src, size_t srcC, size_t srcH, size_t srcW, size_t kernelY, size_t kernelX, - size_t strideY, size_t strideX, size_t padY, size_t padX, float* dst, size_t dstH, size_t dstW, SimdBool excludePad, SimdTensorFormatType format) - { - if (format == SimdTensorFormatNhwc) - { - if (srcC > Avx::F) - { - size_t srcS = srcW * srcC; - size_t srcCF1 = AlignLo(srcC, 1 * F); - size_t srcCF2 = AlignLo(srcC, 2 * F); - size_t srcCF4 = AlignLo(srcC, 4 * F); - size_t srcCF8 = AlignLo(srcC, 8 * F); - __mmask16 tail = TailMask16(srcC - srcCF1); - if (padX == 0 && padY == 0 && (dstW - 1) * strideX + kernelX == srcW && (dstH - 1) * strideY + kernelY == srcH) - { - size_t stepY = srcW * srcC * strideY, stepX = strideX * srcC; - __m512 norm = _mm512_set1_ps(1.0f / (kernelY * kernelX)); - for (size_t ph = 0; ph < dstH; ++ph) - { - const float* ps = src + ph * stepY; - for (size_t pw = 0; pw < dstW; ++pw, ps += stepX, dst += srcC) - PoolingAverageNhwc(ps, srcS, srcC, srcCF1, srcCF2, srcCF4, srcCF8, kernelY, kernelX, norm, dst, tail); - } - } - else if (excludePad) - { - for (size_t ph = 0; ph < dstH; ++ph) - { - size_t hStart = ph * strideY - padY; - size_t hEnd = Simd::Min(hStart + kernelY, srcH); - hStart = Simd::Max(0, hStart); - size_t kH = hEnd - hStart; - for (size_t pw = 0; pw < dstW; ++pw) - { - size_t wStart = pw * strideX - padX; - size_t wEnd = Simd::Min(wStart + kernelX, srcW); - wStart = Simd::Max(0, wStart); - size_t kW = wEnd - wStart; - const float* ps = src + hStart * srcS + wStart * srcC; - __m512 norm = _mm512_set1_ps(1.0f / (kH * kW)); - PoolingAverageNhwc(ps, srcS, srcC, srcCF1, srcCF2, srcCF4, srcCF8, kH, kW, norm, dst, tail); - dst += srcC; - } - } - } - else - { - __m512 norm = _mm512_set1_ps(1.0f / (kernelY * kernelX)); - for (size_t ph = 0; ph < dstH; ++ph) - { - size_t hStart = ph * strideY - padY; - size_t hEnd = Simd::Min(hStart + kernelY, srcH); - hStart = Simd::Max(0, hStart); - size_t kH = hEnd - hStart; - for (size_t pw = 0; pw < dstW; ++pw) - { - size_t wStart = pw * strideX - padX; - size_t wEnd = Simd::Min(wStart + kernelX, srcW); - wStart = Simd::Max(0, wStart); - size_t kW = wEnd - wStart; - const float* ps = src + hStart * srcS + wStart * srcC; - PoolingAverageNhwc(ps, srcS, srcC, srcCF1, srcCF2, srcCF4, srcCF8, kH, kW, norm, dst, tail); - dst += srcC; - } - } - } - return; - } - } - else if (format == SimdTensorFormatNchw) - { - } - Avx::SynetPoolingForwardAverage(src, srcC, srcH, srcW, kernelY, kernelX, strideY, strideX, padY, padX, dst, dstH, dstW, excludePad, format); - } - 
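The deleted SynetPoolingForwardAverage above walks the NHWC tensor channel-blocks first (8/4/2/1 ZMM registers per step) and guards the final partial block of 16 channels with a __mmask16, so no lane ever reads or writes past srcC. Below is a minimal self-contained sketch of that masking pattern, assuming AVX-512F; the function name AveragePoolNhwcBlock and the explicit mask computation are illustrative stand-ins for the library's TailMask16 helper, not part of the removed source.

#include <immintrin.h>
#include <cstddef>

// Averages one kH x kW window for every channel of an NHWC pixel.
// rowStride is srcW * srcC floats; the mask guards the last partial
// block of channels, like TailMask16(srcC - srcCF1) in the code above.
static void AveragePoolNhwcBlock(const float* src, size_t rowStride, size_t srcC,
    size_t kH, size_t kW, float* dst)
{
    const size_t F = 16; // floats per __m512 register
    const __m512 norm = _mm512_set1_ps(1.0f / float(kH * kW));
    for (size_t c = 0; c < srcC; c += F)
    {
        size_t tail = srcC - c;
        __mmask16 mask = tail >= F ? __mmask16(-1) : __mmask16((1u << tail) - 1);
        __m512 sum = _mm512_setzero_ps();
        for (size_t h = 0; h < kH; ++h)
            for (size_t w = 0; w < kW; ++w) // masked-off lanes load as zero
                sum = _mm512_add_ps(sum, _mm512_maskz_loadu_ps(mask, src + h * rowStride + w * srcC + c));
        _mm512_mask_storeu_ps(dst + c, mask, _mm512_mul_ps(sum, norm));
    }
}

The unrolled PoolingAverageNhwc2/4/8 variants in the deleted file apply the same idea to 2, 4, and 8 full registers at a time and fall back to the masked single-register path only for the remainder.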
- //--------------------------------------------------------------------- - - SIMD_INLINE void PoolingMaxHwc1(const float * src, size_t srcS, size_t srcC, size_t kH, size_t kW, const __m512 & min, float * dst, __mmask16 tail = -1) - { - __m512 max0 = min; - for (size_t h = 0; h < kH; ++h) - { - for (size_t w = 0; w < kW; ++w) - { - max0 = _mm512_max_ps(max0, _mm512_maskz_loadu_ps(tail, src + w * srcC + 0 * F)); - } - src += srcS; - } - _mm512_mask_storeu_ps(dst + 0 * F, tail, max0); - } - - SIMD_INLINE void PoolingMaxHwc2(const float * src, size_t srcS, size_t srcC, size_t kH, size_t kW, const __m512 & min, float * dst) - { - __m512 max0 = min; - __m512 max1 = min; - for (size_t h = 0; h < kH; ++h) - { - for (size_t w = 0; w < kW; ++w) - { - max0 = _mm512_max_ps(max0, _mm512_loadu_ps(src + w * srcC + 0 * F)); - max1 = _mm512_max_ps(max1, _mm512_loadu_ps(src + w * srcC + 1 * F)); - } - src += srcS; - } - _mm512_storeu_ps(dst + 0 * F, max0); - _mm512_storeu_ps(dst + 1 * F, max1); - } - - SIMD_INLINE void PoolingMaxHwc4(const float * src, size_t srcS, size_t srcC, size_t kH, size_t kW, const __m512 & min, float * dst) - { - __m512 max0 = min; - __m512 max1 = min; - __m512 max2 = min; - __m512 max3 = min; - for (size_t h = 0; h < kH; ++h) - { - for (size_t w = 0; w < kW; ++w) - { - max0 = _mm512_max_ps(max0, _mm512_loadu_ps(src + w * srcC + 0 * F)); - max1 = _mm512_max_ps(max1, _mm512_loadu_ps(src + w * srcC + 1 * F)); - max2 = _mm512_max_ps(max2, _mm512_loadu_ps(src + w * srcC + 2 * F)); - max3 = _mm512_max_ps(max3, _mm512_loadu_ps(src + w * srcC + 3 * F)); - } - src += srcS; - } - _mm512_storeu_ps(dst + 0 * F, max0); - _mm512_storeu_ps(dst + 1 * F, max1); - _mm512_storeu_ps(dst + 2 * F, max2); - _mm512_storeu_ps(dst + 3 * F, max3); - } - - SIMD_INLINE void PoolingMaxHwc8(const float * src, size_t srcS, size_t srcC, size_t kH, size_t kW, const __m512 & min, float * dst) - { - __m512 max0 = min; - __m512 max1 = min; - __m512 max2 = min; - __m512 max3 = min; - __m512 max4 = min; - __m512 max5 = min; - __m512 max6 = min; - __m512 max7 = min; - for (size_t h = 0; h < kH; ++h) - { - for (size_t w = 0; w < kW; ++w) - { - max0 = _mm512_max_ps(max0, _mm512_loadu_ps(src + w * srcC + 0 * F)); - max1 = _mm512_max_ps(max1, _mm512_loadu_ps(src + w * srcC + 1 * F)); - max2 = _mm512_max_ps(max2, _mm512_loadu_ps(src + w * srcC + 2 * F)); - max3 = _mm512_max_ps(max3, _mm512_loadu_ps(src + w * srcC + 3 * F)); - max4 = _mm512_max_ps(max4, _mm512_loadu_ps(src + w * srcC + 4 * F)); - max5 = _mm512_max_ps(max5, _mm512_loadu_ps(src + w * srcC + 5 * F)); - max6 = _mm512_max_ps(max6, _mm512_loadu_ps(src + w * srcC + 6 * F)); - max7 = _mm512_max_ps(max7, _mm512_loadu_ps(src + w * srcC + 7 * F)); - } - src += srcS; - } - _mm512_storeu_ps(dst + 0 * F, max0); - _mm512_storeu_ps(dst + 1 * F, max1); - _mm512_storeu_ps(dst + 2 * F, max2); - _mm512_storeu_ps(dst + 3 * F, max3); - _mm512_storeu_ps(dst + 4 * F, max4); - _mm512_storeu_ps(dst + 5 * F, max5); - _mm512_storeu_ps(dst + 6 * F, max6); - _mm512_storeu_ps(dst + 7 * F, max7); - } - - void SynetPoolingForwardMax32f(const float * src, size_t srcC, size_t srcH, size_t srcW, size_t kernelY, size_t kernelX, - size_t strideY, size_t strideX, size_t padY, size_t padX, float * dst, size_t dstH, size_t dstW, SimdTensorFormatType format) - { - if (format == SimdTensorFormatNhwc) - { - size_t srcS = srcW * srcC; - size_t srcCF1 = AlignLo(srcC, 1 * F); - size_t srcCF2 = AlignLo(srcC, 2 * F); - size_t srcCF4 = AlignLo(srcC, 4 * F); - size_t srcCF8 = AlignLo(srcC, 8 * F); - __m512 
min = _mm512_set1_ps(-FLT_MAX); - __mmask16 tail = TailMask16(srcC - srcCF1); - for (size_t ph = 0; ph < dstH; ++ph) - { - size_t hStart = ph * strideY - padY; - size_t hEnd = Simd::Min(hStart + kernelY, srcH); - hStart = Simd::Max(0, hStart); - for (size_t pw = 0; pw < dstW; ++pw) - { - size_t wStart = pw * strideX - padX; - size_t wEnd = Simd::Min(wStart + kernelX, srcW); - wStart = Simd::Max(0, wStart); - const float* ps = src + hStart * srcS + wStart * srcC; - size_t c = 0; - for (; c < srcCF8; c += 8 * F) - PoolingMaxHwc8(ps + c, srcS, srcC, hEnd - hStart, wEnd - wStart, min, dst + c); - for (; c < srcCF4; c += 4 * F) - PoolingMaxHwc4(ps + c, srcS, srcC, hEnd - hStart, wEnd - wStart, min, dst + c); - for (; c < srcCF2; c += 2 * F) - PoolingMaxHwc2(ps + c, srcS, srcC, hEnd - hStart, wEnd - wStart, min, dst + c); - for (; c < srcCF1; c += 1 * F) - PoolingMaxHwc1(ps + c, srcS, srcC, hEnd - hStart, wEnd - wStart, min, dst + c); - if (c < srcC) - PoolingMaxHwc1(ps + c, srcS, srcC, hEnd - hStart, wEnd - wStart, min, dst + c, tail); - dst += srcC; - } - } - } - else if (format == SimdTensorFormatNchw) - { - if (strideY == 1 && strideX == 1 && kernelY == 3 && kernelX == 3 && srcH == dstH && srcW == dstW && dstW > F) - { - for (size_t c = 0; c < srcC; ++c, src += srcH * srcW, dst += dstH * dstW) - Avx512f::NeuralPooling1x1Max3x3(src, srcW, srcW, srcH, dst, dstW); - return; - } - if (strideY == 2 && strideX == 2 && kernelY == 2 && kernelX == 2 && padY == 0 && padX == 0 && dstW >= F) - { - for (size_t c = 0; c < srcC; ++c, src += srcH * srcW, dst += dstH * dstW) - Avx512f::NeuralPooling2x2Max2x2(src, srcW, srcW, srcH, dst, dstW); - return; - } - if (strideY == 2 && strideX == 2 && kernelY == 3 && kernelX == 3 && padY == 0 && padX == 0 && dstW > F) - { - for (size_t c = 0; c < srcC; ++c, src += srcH * srcW, dst += dstH * dstW) - Avx512f::NeuralPooling2x2Max3x3(src, srcW, srcW, srcH, dst, dstW); - return; - } - Avx2::SynetPoolingForwardMax32f(src, srcC, srcH, srcW, kernelY, kernelX, strideY, strideX, padY, padX, dst, dstH, dstW, format); - } - else - assert(0); - } - } -#endif// SIMD_AVX512F_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx512fWinograd.cpp b/src/3rd/Simd/Simd/SimdAvx512fWinograd.cpp deleted file mode 100644 index 89a332f0..00000000 --- a/src/3rd/Simd/Simd/SimdAvx512fWinograd.cpp +++ /dev/null @@ -1,2977 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdExtract.h" -#include "Simd/SimdSet.h" -#include "Simd/SimdBase.h" -#include "Simd/SimdSse1.h" -#include "Simd/SimdAvx1.h" - -namespace Simd -{ -#ifdef SIMD_AVX512F_ENABLE - namespace Avx512f - { - SIMD_INLINE void WinogradKernel1x3Block1x4SetFilter(const __m512* t, float* dst, size_t stride, __mmask16 tail) - { - const __m512 r4 = _mm512_set1_ps(1.0f / 4.0f); - const __m512 r6 = _mm512_set1_ps(1.0f / 6.0f); - const __m512 mr6 = _mm512_set1_ps(-1.0f / 6.0f); - const __m512 r12 = _mm512_set1_ps(1.0f / 12.0f); - const __m512 r24 = _mm512_set1_ps(1.0f / 24.0f); - _mm512_mask_storeu_ps(dst + 0 * stride, tail, _mm512_mul_ps(r4, t[0])); - __m512 t0 = _mm512_add_ps(t[0], t[2]); - _mm512_mask_storeu_ps(dst + 1 * stride, tail, _mm512_mul_ps(mr6, _mm512_add_ps(t0, t[1]))); - _mm512_mask_storeu_ps(dst + 2 * stride, tail, _mm512_mul_ps(mr6, _mm512_sub_ps(t0, t[1]))); - __m512 t1 = _mm512_add_ps(_mm512_mul_ps(r24, t[0]), _mm512_mul_ps(r6, t[2])); - __m512 t2 = _mm512_mul_ps(r12, t[1]); - _mm512_mask_storeu_ps(dst + 3 * stride, tail, _mm512_add_ps(t1, t2)); - _mm512_mask_storeu_ps(dst + 4 * stride, tail, _mm512_sub_ps(t1, t2)); - _mm512_mask_storeu_ps(dst + 5 * stride, tail, t[2]); - } - - SIMD_INLINE void WinogradKernel1x3Block1x4SetFilter16t(const float* src, float* dst, size_t stride, __mmask16 tail = -1) - { - __m512 s[3]; - s[0] = _mm512_maskz_loadu_ps(tail, src + 0 * stride); - s[1] = _mm512_maskz_loadu_ps(tail, src + 1 * stride); - s[2] = _mm512_maskz_loadu_ps(tail, src + 2 * stride); - WinogradKernel1x3Block1x4SetFilter(s, dst + 0 * stride, stride, tail); - } - - void WinogradKernel1x3Block1x4SetFilter(const float* src, size_t size, float* dst, SimdBool trans) - { - size_t sizeF = AlignLo(size, F), i = 0; - if (trans) - { - for (; i < sizeF; i += F) - WinogradKernel1x3Block1x4SetFilter16t(src + i, dst + i, size); - if (i < size) - { - __mmask16 tail = TailMask16(size - sizeF); - WinogradKernel1x3Block1x4SetFilter16t(src + i, dst + i, size, tail); - } - } - else - { - Sse::WinogradKernel1x3Block1x4SetFilter(src, size, dst, trans); - } - } - - //----------------------------------------------------------------------- - - SIMD_INLINE void WinogradKernel1x3Block1x4SetInput8Store(const __m512 src[6], float* dst, size_t stride, __mmask16 tail = -1) - { - __m512 _2 = _mm512_set1_ps(2.0f); - __m512 _4 = _mm512_set1_ps(4.0f); - __m512 _5 = _mm512_set1_ps(5.0f); - _mm512_mask_storeu_ps(dst + 0 * stride, tail, _mm512_add_ps(_mm512_sub_ps(_mm512_mul_ps(_4, src[0]), _mm512_mul_ps(_5, src[2])), src[4])); - _mm512_mask_storeu_ps(dst + 1 * stride, tail, _mm512_sub_ps(_mm512_add_ps(src[3], src[4]), _mm512_mul_ps(_4, _mm512_add_ps(src[1], src[2])))); - _mm512_mask_storeu_ps(dst + 2 * stride, tail, _mm512_add_ps(_mm512_mul_ps(_4, _mm512_sub_ps(src[1], src[2])), _mm512_sub_ps(src[4], src[3]))); - _mm512_mask_storeu_ps(dst + 3 * stride, tail, _mm512_add_ps(_mm512_mul_ps(_2, _mm512_sub_ps(src[3], src[1])), _mm512_sub_ps(src[4], src[2]))); - _mm512_mask_storeu_ps(dst + 4 * stride, tail, _mm512_add_ps(_mm512_mul_ps(_2, _mm512_sub_ps(src[1], src[3])), _mm512_sub_ps(src[4], src[2]))); - _mm512_mask_storeu_ps(dst + 5 * stride, tail, _mm512_add_ps(_mm512_sub_ps(_mm512_mul_ps(_4, src[1]), _mm512_mul_ps(_5, src[3])), src[5])); - } - - SIMD_INLINE void WinogradKernel1x3Block1x4SetInput8t(const float* src, size_t srcC, __m512 dst[6], __mmask16 tail = -1) - { - dst[0] = _mm512_maskz_loadu_ps(tail, src + 0 * srcC); - dst[1] = 
_mm512_maskz_loadu_ps(tail, src + 1 * srcC); - dst[2] = _mm512_maskz_loadu_ps(tail, src + 2 * srcC); - dst[3] = _mm512_maskz_loadu_ps(tail, src + 3 * srcC); - dst[4] = _mm512_maskz_loadu_ps(tail, src + 4 * srcC); - dst[5] = _mm512_maskz_loadu_ps(tail, src + 5 * srcC); - } - - SIMD_INLINE void WinogradKernel1x3Block1x4SetInput8t(const float* src, size_t srcC, float* dst, size_t dstStride) - { - size_t srcCF = AlignLo(srcC, F); - for (size_t c = 0; c < srcCF; c += F) - { - __m512 tmp[6]; - WinogradKernel1x3Block1x4SetInput8t(src + c, srcC, tmp); - WinogradKernel1x3Block1x4SetInput8Store(tmp, dst + c, dstStride); - } - if (srcCF < srcC) - { - __m512 tmp[6]; - __mmask16 tail = TailMask16(srcC - srcCF); - WinogradKernel1x3Block1x4SetInput8t(src + srcCF, srcC, tmp, tail); - WinogradKernel1x3Block1x4SetInput8Store(tmp, dst + srcCF, dstStride, tail); - } - } - - SIMD_INLINE void WinogradKernel1x3Block1x4SetInput8t(const float* src, size_t srcC, size_t colB, size_t colE, __m512 dst[6], __mmask16 tail = -1) - { - for (size_t col = 0; col < colB; ++col) - dst[col] = _mm512_setzero_ps(); - for (size_t col = colB; col < colE; ++col) - dst[col] = _mm512_maskz_loadu_ps(tail, src + col * srcC); - for (size_t col = colE; col < 6; ++col) - dst[col] = _mm512_setzero_ps(); - } - - SIMD_INLINE void WinogradKernel1x3Block1x4SetInput8t(const float* src, size_t srcC, size_t colB, size_t colE, float* dst, size_t dstStride) - { - size_t srcCF = AlignLo(srcC, F); - for (size_t c = 0; c < srcCF; c += F) - { - __m512 tmp[6]; - WinogradKernel1x3Block1x4SetInput8t(src + c, srcC, colB, colE, tmp); - WinogradKernel1x3Block1x4SetInput8Store(tmp, dst + c, dstStride); - } - if (srcCF < srcC) - { - __m512 tmp[6]; - __mmask16 tail = TailMask16(srcC - srcCF); - WinogradKernel1x3Block1x4SetInput8t(src + srcCF, srcC, colB, colE, tmp, tail); - WinogradKernel1x3Block1x4SetInput8Store(tmp, dst + srcCF, dstStride, tail); - } - } - - void WinogradKernel1x3Block1x4SetInput(const float* src, size_t srcChannels, size_t srcHeight, size_t srcWidth, - size_t padY, size_t padX, size_t padH, size_t padW, float* dst, size_t dstStride, SimdBool trans) - { - assert(padX == padW && padY == 0 && padH == 0 && (padX == 0 || padX == 1)); - if (trans ? (srcChannels < 4) : (srcWidth < 12)) - { - Base::WinogradKernel1x3Block1x4SetInput(src, srcChannels, srcHeight, srcWidth, padY, padX, padH, padW, dst, dstStride, trans); - return; - } - size_t dstH = srcHeight; - size_t dstW = padX ? srcWidth : srcWidth - 2; - size_t tileW = (dstW + 3) / 4; - size_t dstW4 = AlignLo(dstW, 4); - if (trans) - { - size_t noseW = Simd::Min(6, dstW + 1); - size_t startX = padX ? 4 : 0; - if (padX) - { - if (dstW == dstW4) - dstW4 -= 4; - src -= srcChannels; - } - size_t tailW = dstW - dstW4 + (padX ? 
1 : 2); - for (size_t row = 0; row < dstH; row += 1) - { - size_t col = 0; - if (padX) - WinogradKernel1x3Block1x4SetInput8t(src, srcChannels, 1, noseW, dst, dstStride), dst += srcChannels; - for (col = startX; col < dstW4; col += 4) - WinogradKernel1x3Block1x4SetInput8t(src + col * srcChannels, srcChannels, dst, dstStride), dst += srcChannels; - if (col < dstW) - WinogradKernel1x3Block1x4SetInput8t(src + col * srcChannels, srcChannels, 0, tailW, dst, dstStride), dst += srcChannels; - src += srcWidth * srcChannels; - } - } - else - { - Base::WinogradKernel1x3Block1x4SetInput(src, srcChannels, srcHeight, srcWidth, padY, padX, padH, padW, dst, dstStride, trans); - } - } - - //----------------------------------------------------------------------- - - SIMD_INLINE void WinogradKernel1x3Block1x4SetOutputLoad6(const float* src, size_t stride, __m512 dst[4], __mmask16 tail = -1) - { - __m512 s[6]; - s[0] = _mm512_maskz_loadu_ps(tail, src + 0 * stride); - s[1] = _mm512_maskz_loadu_ps(tail, src + 1 * stride); - s[2] = _mm512_maskz_loadu_ps(tail, src + 2 * stride); - s[3] = _mm512_maskz_loadu_ps(tail, src + 3 * stride); - s[4] = _mm512_maskz_loadu_ps(tail, src + 4 * stride); - s[5] = _mm512_maskz_loadu_ps(tail, src + 5 * stride); - __m512 _2 = _mm512_set1_ps(2.0f); - __m512 _4 = _mm512_set1_ps(4.0f); - __m512 _8 = _mm512_set1_ps(8.0f); - dst[0] = _mm512_add_ps(_mm512_add_ps(_mm512_add_ps(s[0], s[1]), _mm512_add_ps(s[2], s[3])), s[4]); - dst[1] = _mm512_add_ps(_mm512_sub_ps(s[1], s[2]), _mm512_mul_ps(_2, _mm512_sub_ps(s[3], s[4]))); - dst[2] = _mm512_add_ps(_mm512_add_ps(s[1], s[2]), _mm512_mul_ps(_4, _mm512_add_ps(s[3], s[4]))); - dst[3] = _mm512_add_ps(_mm512_add_ps(_mm512_sub_ps(s[1], s[2]), _mm512_mul_ps(_8, _mm512_sub_ps(s[3], s[4]))), s[5]); - } - - SIMD_INLINE void WinogradKernel1x3Block1x4SetOutputStore4(const __m512 src[4], float* dst, size_t dstC, __mmask16 tail = -1) - { - _mm512_mask_storeu_ps(dst + 0 * dstC, tail, src[0]); - _mm512_mask_storeu_ps(dst + 1 * dstC, tail, src[1]); - _mm512_mask_storeu_ps(dst + 2 * dstC, tail, src[2]); - _mm512_mask_storeu_ps(dst + 3 * dstC, tail, src[3]); - } - - SIMD_INLINE void WinogradKernel1x3Block1x4SetOutput16t(const float* src, size_t srcStride, float* dst, size_t dstC) - { - size_t dstCF = AlignLo(dstC, F); - for (size_t d = 0; d < dstCF; d += F) - { - __m512 tmp[4]; - WinogradKernel1x3Block1x4SetOutputLoad6(src + d, srcStride, tmp); - WinogradKernel1x3Block1x4SetOutputStore4(tmp, dst + d, dstC); - } - if (dstCF < dstC) - { - __m512 tmp[4]; - __mmask16 tail = TailMask16(dstC - dstCF); - WinogradKernel1x3Block1x4SetOutputLoad6(src + dstCF, srcStride, tmp, tail); - WinogradKernel1x3Block1x4SetOutputStore4(tmp, dst + dstCF, dstC, tail); - } - } - - SIMD_INLINE void WinogradKernel1x3Block1x4SetOutputStore4(const __m512 src[4], float* dst, size_t dstC, size_t colE, __mmask16 tail = -1) - { - for (size_t col = 0; col < colE; ++col) - _mm512_mask_storeu_ps(dst + col * dstC, tail, src[col]); - } - - SIMD_INLINE void WinogradKernel1x3Block1x4SetOutput16t(const float* src, size_t srcStride, float* dst, size_t dstC, size_t colE) - { - size_t dstCF = AlignLo(dstC, F); - for (size_t d = 0; d < dstCF; d += F) - { - __m512 tmp[4]; - WinogradKernel1x3Block1x4SetOutputLoad6(src + d, srcStride, tmp); - WinogradKernel1x3Block1x4SetOutputStore4(tmp, dst + d, dstC, colE); - } - if (dstCF < dstC) - { - __m512 tmp[4]; - __mmask16 tail = TailMask16(dstC - dstCF); - WinogradKernel1x3Block1x4SetOutputLoad6(src + dstCF, srcStride, tmp, tail); - 
WinogradKernel1x3Block1x4SetOutputStore4(tmp, dst + dstCF, dstC, colE, tail); - } - } - - void WinogradKernel1x3Block1x4SetOutput(const float* src, size_t srcStride, float* dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans) - { - if (trans ? false : (dstWidth < 16)) - { - Avx::WinogradKernel1x3Block1x4SetOutput(src, srcStride, dst, dstChannels, dstHeight, dstWidth, trans); - return; - } - size_t tileW = (dstWidth + 3) / 4; - size_t dstW4 = AlignLo(dstWidth, 4); - if (trans) - { - for (size_t row = 0; row < dstHeight; row += 1) - { - size_t col; - for (col = 0; col < dstW4; col += 4) - WinogradKernel1x3Block1x4SetOutput16t(src, srcStride, dst + col * dstChannels, dstChannels), src += dstChannels; - if (col < dstWidth) - WinogradKernel1x3Block1x4SetOutput16t(src, srcStride, dst + col * dstChannels, dstChannels, dstWidth - col), src += dstChannels; - dst += dstWidth * dstChannels; - } - } - else - { - Base::WinogradKernel1x3Block1x4SetOutput(src, srcStride, dst, dstChannels, dstHeight, dstWidth, trans); - } - } - - //----------------------------------------------------------------------- - - SIMD_INLINE void WinogradKernel1x5Block1x4SetFilter(const __m512* t, float* dst, size_t stride, __mmask16 tail) - { - const __m512 r36 = _mm512_set1_ps(1.0f / 36.0f); - const __m512 r48 = _mm512_set1_ps(1.0f / 48.0f); - const __m512 mr120 = _mm512_set1_ps(-1.0f / 120.0f); - const __m512 r720 = _mm512_set1_ps(1.0f / 720.0f); - const __m512 _2 = _mm512_set1_ps(2.0f); - const __m512 _3 = _mm512_set1_ps(3.0f); - const __m512 _4 = _mm512_set1_ps(4.0f); - const __m512 _9 = _mm512_set1_ps(9.0f); - _mm512_mask_storeu_ps(dst + 0 * stride, tail, _mm512_mul_ps(r36, t[0])); - __m512 a[2]; - a[0] = _mm512_add_ps(_mm512_add_ps(t[0], t[2]), t[4]); - a[1] = _mm512_add_ps(t[1], t[3]); - _mm512_mask_storeu_ps(dst + 1 * stride, tail, _mm512_mul_ps(r48, _mm512_add_ps(a[0], a[1]))); - _mm512_mask_storeu_ps(dst + 2 * stride, tail, _mm512_mul_ps(r48, _mm512_sub_ps(a[0], a[1]))); - a[0] = _mm512_add_ps(t[0], _mm512_mul_ps(_4, _mm512_add_ps(t[2], _mm512_mul_ps(_4, t[4])))); - a[1] = _mm512_mul_ps(_2, _mm512_add_ps(t[1], _mm512_mul_ps(_4, t[3]))); - _mm512_mask_storeu_ps(dst + 3 * stride, tail, _mm512_mul_ps(mr120, _mm512_add_ps(a[0], a[1]))); - _mm512_mask_storeu_ps(dst + 4 * stride, tail, _mm512_mul_ps(mr120, _mm512_sub_ps(a[0], a[1]))); - a[0] = _mm512_add_ps(t[0], _mm512_mul_ps(_9, _mm512_add_ps(t[2], _mm512_mul_ps(_9, t[4])))); - a[1] = _mm512_mul_ps(_3, _mm512_add_ps(t[1], _mm512_mul_ps(_9, t[3]))); - _mm512_mask_storeu_ps(dst + 5 * stride, tail, _mm512_mul_ps(r720, _mm512_add_ps(a[0], a[1]))); - _mm512_mask_storeu_ps(dst + 6 * stride, tail, _mm512_mul_ps(r720, _mm512_sub_ps(a[0], a[1]))); - _mm512_mask_storeu_ps(dst + 7 * stride, tail, t[4]); - } - - SIMD_INLINE void WinogradKernel1x5Block1x4SetFilter16t(const float* src, float* dst, size_t stride, __mmask16 tail = -1) - { - __m512 s[5]; - s[0] = _mm512_maskz_loadu_ps(tail, src + 0 * stride); - s[1] = _mm512_maskz_loadu_ps(tail, src + 1 * stride); - s[2] = _mm512_maskz_loadu_ps(tail, src + 2 * stride); - s[3] = _mm512_maskz_loadu_ps(tail, src + 3 * stride); - s[4] = _mm512_maskz_loadu_ps(tail, src + 4 * stride); - WinogradKernel1x5Block1x4SetFilter(s, dst, stride, tail); - } - - void WinogradKernel1x5Block1x4SetFilter(const float* src, size_t size, float* dst, SimdBool trans) - { - size_t sizeF = AlignLo(size, F), i = 0; - if (trans) - { - for (; i < sizeF; i += F) - WinogradKernel1x5Block1x4SetFilter16t(src + i, dst + i, size); - if (i < size) - { 
- __mmask16 tail = TailMask16(size - sizeF); - WinogradKernel1x5Block1x4SetFilter16t(src + i, dst + i, size, tail); - } - } - else - { - Sse::WinogradKernel1x5Block1x4SetFilter(src, size, dst, trans); - } - } - - - //----------------------------------------------------------------------- - - SIMD_INLINE void WinogradKernel1x5Block1x4SetInput16Store(const __m512 src[8], float* dst, size_t stride, __mmask16 tail = -1) - { - __m512 _2 = _mm512_set1_ps(2.0f); - __m512 _3 = _mm512_set1_ps(3.0f); - __m512 _4 = _mm512_set1_ps(4.0f); - __m512 _5 = _mm512_set1_ps(5.0f); - __m512 _9 = _mm512_set1_ps(9.0f); - __m512 _10 = _mm512_set1_ps(10.0f); - __m512 _13 = _mm512_set1_ps(13.0f); - __m512 _14 = _mm512_set1_ps(14.0f); - __m512 _36 = _mm512_set1_ps(36.0f); - __m512 _49 = _mm512_set1_ps(49.0f); - _mm512_mask_storeu_ps(dst + 0 * stride, tail, _mm512_add_ps(_mm512_sub_ps(_mm512_mul_ps(_36, src[0]), _mm512_mul_ps(_49, src[2])), _mm512_sub_ps(_mm512_mul_ps(_14, src[4]), src[6]))); - __m512 a[2]; - a[0] = _mm512_add_ps(_mm512_sub_ps(_mm512_mul_ps(_36, src[2]), _mm512_mul_ps(_13, src[4])), src[6]); - a[1] = _mm512_add_ps(_mm512_sub_ps(_mm512_mul_ps(_36, src[1]), _mm512_mul_ps(_13, src[3])), src[5]); - _mm512_mask_storeu_ps(dst + 1 * stride, tail, _mm512_add_ps(a[0], a[1])); - _mm512_mask_storeu_ps(dst + 2 * stride, tail, _mm512_sub_ps(a[0], a[1])); - a[0] = _mm512_add_ps(_mm512_sub_ps(_mm512_mul_ps(_9, src[2]), _mm512_mul_ps(_10, src[4])), src[6]); - a[1] = _mm512_mul_ps(_2, _mm512_add_ps(_mm512_sub_ps(_mm512_mul_ps(_9, src[1]), _mm512_mul_ps(_10, src[3])), src[5])); - _mm512_mask_storeu_ps(dst + 3 * stride, tail, _mm512_add_ps(a[0], a[1])); - _mm512_mask_storeu_ps(dst + 4 * stride, tail, _mm512_sub_ps(a[0], a[1])); - a[0] = _mm512_add_ps(_mm512_sub_ps(_mm512_mul_ps(_4, src[2]), _mm512_mul_ps(_5, src[4])), src[6]); - a[1] = _mm512_mul_ps(_3, _mm512_add_ps(_mm512_sub_ps(_mm512_mul_ps(_4, src[1]), _mm512_mul_ps(_5, src[3])), src[5])); - _mm512_mask_storeu_ps(dst + 5 * stride, tail, _mm512_add_ps(a[0], a[1])); - _mm512_mask_storeu_ps(dst + 6 * stride, tail, _mm512_sub_ps(a[0], a[1])); - _mm512_mask_storeu_ps(dst + 7 * stride, tail, _mm512_add_ps(_mm512_sub_ps(_mm512_mul_ps(_49, src[3]), _mm512_mul_ps(_36, src[1])), _mm512_sub_ps(src[7], _mm512_mul_ps(_14, src[5])))); - } - - SIMD_INLINE void WinogradKernel1x5Block1x4SetInput16t(const float* src, size_t srcC, __m512 dst[8], __mmask16 tail = -1) - { - dst[0] = _mm512_maskz_loadu_ps(tail, src + 0 * srcC); - dst[1] = _mm512_maskz_loadu_ps(tail, src + 1 * srcC); - dst[2] = _mm512_maskz_loadu_ps(tail, src + 2 * srcC); - dst[3] = _mm512_maskz_loadu_ps(tail, src + 3 * srcC); - dst[4] = _mm512_maskz_loadu_ps(tail, src + 4 * srcC); - dst[5] = _mm512_maskz_loadu_ps(tail, src + 5 * srcC); - dst[6] = _mm512_maskz_loadu_ps(tail, src + 6 * srcC); - dst[7] = _mm512_maskz_loadu_ps(tail, src + 7 * srcC); - } - - SIMD_INLINE void WinogradKernel1x5Block1x4SetInput16t(const float* src, size_t srcC, float* dst, size_t dstStride) - { - size_t srcCF = AlignLo(srcC, F); - for (size_t c = 0; c < srcCF; c += F) - { - __m512 tmp[8]; - WinogradKernel1x5Block1x4SetInput16t(src + c, srcC, tmp); - WinogradKernel1x5Block1x4SetInput16Store(tmp, dst + c, dstStride); - } - if (srcCF < srcC) - { - __m512 tmp[8]; - __mmask16 tail = TailMask16(srcC - srcCF); - WinogradKernel1x5Block1x4SetInput16t(src + srcCF, srcC, tmp, tail); - WinogradKernel1x5Block1x4SetInput16Store(tmp, dst + srcCF, dstStride, tail); - } - } - - SIMD_INLINE void WinogradKernel1x5Block1x4SetInput16t(const float* src, size_t 
srcC, size_t colB, size_t colE, __m512 dst[8], __mmask16 tail = -1) - { - for (size_t col = 0; col < colB; ++col) - dst[col] = _mm512_setzero_ps(); - for (size_t col = colB; col < colE; ++col) - dst[col] = _mm512_maskz_loadu_ps(tail, src + col * srcC); - for (size_t col = colE; col < 8; ++col) - dst[col] = _mm512_setzero_ps(); - } - - SIMD_INLINE void WinogradKernel1x5Block1x4SetInput16t(const float* src, size_t srcC, size_t colB, size_t colE, float* dst, size_t dstStride) - { - size_t srcCF = AlignLo(srcC, F); - for (size_t c = 0; c < srcCF; c += F) - { - __m512 tmp[8]; - WinogradKernel1x5Block1x4SetInput16t(src + c, srcC, colB, colE, tmp); - WinogradKernel1x5Block1x4SetInput16Store(tmp, dst + c, dstStride); - } - if (srcCF < srcC) - { - __m512 tmp[8]; - __mmask16 tail = TailMask16(srcC - srcCF); - WinogradKernel1x5Block1x4SetInput16t(src + srcCF, srcC, colB, colE, tmp, tail); - WinogradKernel1x5Block1x4SetInput16Store(tmp, dst + srcCF, dstStride, tail); - } - } - - void WinogradKernel1x5Block1x4SetInput(const float* src, size_t srcChannels, size_t srcHeight, size_t srcWidth, - size_t padY, size_t padX, size_t padH, size_t padW, float* dst, size_t dstStride, SimdBool trans) - { - assert(padX == padW && padY == 0 && padH == 0 && (padX == 0 || padX == 2)); - if (!trans) - { - Base::WinogradKernel1x5Block1x4SetInput(src, srcChannels, srcHeight, srcWidth, padY, padX, padH, padW, dst, dstStride, trans); - return; - } - size_t dstH = srcHeight; - size_t dstW = padX ? srcWidth : srcWidth - 4; - size_t tileW = (dstW + 3) / 4; - size_t dstW4 = AlignLo(dstW, 4); - size_t noseW = Simd::Min(8, dstW + 2); - size_t startX = padX ? 4 : 0; - if (padX) - { - if (dstW == dstW4 || dstW == dstW4 + 1) - dstW4 -= 4; - src -= 2 * srcChannels; - } - size_t tailW = dstW - dstW4 + (padX ? 
2 : 4); - for (size_t row = 0; row < dstH; row += 1) - { - size_t col = 0; - if (padX) - WinogradKernel1x5Block1x4SetInput16t(src, srcChannels, 2, noseW, dst, dstStride), dst += srcChannels; - for (col = startX; col < dstW4; col += 4) - WinogradKernel1x5Block1x4SetInput16t(src + col * srcChannels, srcChannels, dst, dstStride), dst += srcChannels; - for (size_t tail = tailW; col < dstW; col += 4, tail -= 4) - WinogradKernel1x5Block1x4SetInput16t(src + col * srcChannels, srcChannels, 0, tail, dst, dstStride), dst += srcChannels; - src += srcWidth * srcChannels; - } - } - - //----------------------------------------------------------------------- - - SIMD_INLINE void WinogradKernel1x5Block1x4SetOutputLoad8(const float* src, size_t stride, __m512 dst[4], __mmask16 tail = -1) - { - const __m512 _2 = _mm512_set1_ps(2.0f); - const __m512 _3 = _mm512_set1_ps(3.0f); - const __m512 _4 = _mm512_set1_ps(4.0f); - const __m512 _9 = _mm512_set1_ps(9.0f); - __m512 s[8]; - s[0] = _mm512_maskz_loadu_ps(tail, src + 1 * stride); - s[7] = _mm512_maskz_loadu_ps(tail, src + 2 * stride); - s[1] = _mm512_add_ps(s[0], s[7]); - s[2] = _mm512_sub_ps(s[0], s[7]); - s[0] = _mm512_maskz_loadu_ps(tail, src + 3 * stride); - s[7] = _mm512_maskz_loadu_ps(tail, src + 4 * stride); - s[3] = _mm512_add_ps(s[0], s[7]); - s[4] = _mm512_mul_ps(_2, _mm512_sub_ps(s[0], s[7])); - s[0] = _mm512_maskz_loadu_ps(tail, src + 5 * stride); - s[7] = _mm512_maskz_loadu_ps(tail, src + 6 * stride); - s[5] = _mm512_add_ps(s[0], s[7]); - s[6] = _mm512_mul_ps(_3, _mm512_sub_ps(s[0], s[7])); - dst[0] = _mm512_add_ps(_mm512_maskz_loadu_ps(tail, src + 0 * stride), _mm512_add_ps(_mm512_add_ps(s[1], s[3]), s[5])); - dst[1] = _mm512_add_ps(s[2], _mm512_add_ps(s[4], s[6])); - dst[2] = _mm512_add_ps(s[1], _mm512_add_ps(_mm512_mul_ps(_4, s[3]), _mm512_mul_ps(_9, s[5]))); - dst[3] = _mm512_add_ps(_mm512_maskz_loadu_ps(tail, src + 7 * stride), _mm512_add_ps(_mm512_add_ps(s[2], _mm512_mul_ps(_4, s[4])), _mm512_mul_ps(_9, s[6]))); - } - - SIMD_INLINE void WinogradKernel1x5Block1x4SetOutputStore4(const __m512 src[4], float* dst, size_t dstC, __mmask16 tail = -1) - { - _mm512_mask_storeu_ps(dst + 0 * dstC, tail, src[0]); - _mm512_mask_storeu_ps(dst + 1 * dstC, tail, src[1]); - _mm512_mask_storeu_ps(dst + 2 * dstC, tail, src[2]); - _mm512_mask_storeu_ps(dst + 3 * dstC, tail, src[3]); - } - - SIMD_INLINE void WinogradKernel1x5Block1x4SetOutput4t(const float* src, size_t srcStride, float* dst, size_t dstC) - { - size_t dstCF = AlignLo(dstC, F); - for (size_t d = 0; d < dstCF; d += F) - { - __m512 tmp[4]; - WinogradKernel1x5Block1x4SetOutputLoad8(src + d, srcStride, tmp); - WinogradKernel1x5Block1x4SetOutputStore4(tmp, dst + d, dstC); - } - if (dstCF < dstC) - { - __m512 tmp[4]; - __mmask16 tail = TailMask16(dstC - dstCF); - WinogradKernel1x5Block1x4SetOutputLoad8(src + dstCF, srcStride, tmp, tail); - WinogradKernel1x5Block1x4SetOutputStore4(tmp, dst + dstCF, dstC, tail); - } - } - - SIMD_INLINE void WinogradKernel1x5Block1x4SetOutputStore4(const __m512 src[4], float* dst, size_t dstC, size_t colE, __mmask16 tail = -1) - { - for (size_t col = 0; col < colE; ++col) - _mm512_mask_storeu_ps(dst + col * dstC, tail, src[col]); - } - - SIMD_INLINE void WinogradKernel1x5Block1x4SetOutput4t(const float* src, size_t srcStride, float* dst, size_t dstC, size_t colE) - { - size_t dstCF = AlignLo(dstC, F); - for (size_t d = 0; d < dstCF; d += F) - { - __m512 tmp[4]; - WinogradKernel1x5Block1x4SetOutputLoad8(src + d, srcStride, tmp); - 
WinogradKernel1x5Block1x4SetOutputStore4(tmp, dst + d, dstC, colE); - } - if (dstCF < dstC) - { - __m512 tmp[4]; - __mmask16 tail = TailMask16(dstC - dstCF); - WinogradKernel1x5Block1x4SetOutputLoad8(src + dstCF, srcStride, tmp, tail); - WinogradKernel1x5Block1x4SetOutputStore4(tmp, dst + dstCF, dstC, colE, tail); - } - } - - void WinogradKernel1x5Block1x4SetOutput(const float* src, size_t srcStride, float* dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans) - { - if (!trans) - { - Base::WinogradKernel1x5Block1x4SetOutput(src, srcStride, dst, dstChannels, dstHeight, dstWidth, trans); - return; - } - size_t tileW = (dstWidth + 3) / 4; - size_t dstW4 = AlignLo(dstWidth, 4); - for (size_t row = 0; row < dstHeight; row += 1) - { - size_t col; - for (col = 0; col < dstW4; col += 4) - WinogradKernel1x5Block1x4SetOutput4t(src, srcStride, dst + col * dstChannels, dstChannels), src += dstChannels; - if (col < dstWidth) - WinogradKernel1x5Block1x4SetOutput4t(src, srcStride, dst + col * dstChannels, dstChannels, dstWidth - col), src += dstChannels; - dst += dstWidth * dstChannels; - } - } - - //----------------------------------------------------------------------- - - SIMD_INLINE void WinogradKernel2x2Block2x2SetFilter(const __m512 src[4], float* dst, size_t stride, __mmask16 tail) - { - _mm512_mask_storeu_ps(dst + 0 * stride, tail, src[0]); - _mm512_mask_storeu_ps(dst + 1 * stride, tail, _mm512_add_ps(src[0], src[1])); - _mm512_mask_storeu_ps(dst + 2 * stride, tail, src[1]); - - _mm512_mask_storeu_ps(dst + 3 * stride, tail, _mm512_add_ps(src[0], src[2])); - _mm512_mask_storeu_ps(dst + 4 * stride, tail, _mm512_add_ps(_mm512_add_ps(src[0], src[1]), _mm512_add_ps(src[2], src[3]))); - _mm512_mask_storeu_ps(dst + 5 * stride, tail, _mm512_add_ps(src[1], src[3])); - - _mm512_mask_storeu_ps(dst + 6 * stride, tail, src[2]); - _mm512_mask_storeu_ps(dst + 7 * stride, tail, _mm512_add_ps(src[2], src[3])); - _mm512_mask_storeu_ps(dst + 8 * stride, tail, src[3]); - } - - SIMD_INLINE void WinogradKernel2x2Block2x2SetFilter16t(const float* src, float* dst, size_t stride, __mmask16 tail = -1) - { - __m512 _src[4]; - _src[0] = _mm512_maskz_loadu_ps(tail, src + 0 * stride); - _src[1] = _mm512_maskz_loadu_ps(tail, src + 1 * stride); - _src[2] = _mm512_maskz_loadu_ps(tail, src + 2 * stride); - _src[3] = _mm512_maskz_loadu_ps(tail, src + 3 * stride); - WinogradKernel2x2Block2x2SetFilter(_src, dst, stride, tail); - } - - void WinogradKernel2x2Block2x2SetFilter(const float* src, size_t size, float* dst, SimdBool trans) - { - size_t sizeF = AlignLo(size, F), i = 0; - if (trans) - { - for (; i < sizeF; i += F) - WinogradKernel2x2Block2x2SetFilter16t(src + i, dst + i, size); - if (i < size) - { - __mmask16 tail = TailMask16(size - sizeF); - WinogradKernel2x2Block2x2SetFilter16t(src + i, dst + i, size, tail); - } - } - else - { - Sse::WinogradKernel2x2Block2x2SetFilter(src, size, dst, trans); - } - } - - //----------------------------------------------------------------------- - - SIMD_INLINE void WinogradKernel2x2Block2x2SetInput16Store(const __m512* src, float* dst, size_t stride, __mmask16 tail = -1) - { - _mm512_mask_storeu_ps(dst + 0 * stride, tail, _mm512_add_ps(_mm512_sub_ps(src[0], src[1]), _mm512_sub_ps(src[4], src[3]))); - _mm512_mask_storeu_ps(dst + 1 * stride, tail, _mm512_sub_ps(src[1], src[4])); - _mm512_mask_storeu_ps(dst + 2 * stride, tail, _mm512_add_ps(_mm512_sub_ps(src[2], src[1]), _mm512_sub_ps(src[4], src[5]))); - _mm512_mask_storeu_ps(dst + 3 * stride, tail, _mm512_sub_ps(src[3], 
src[4])); - _mm512_mask_storeu_ps(dst + 4 * stride, tail, src[4]); - _mm512_mask_storeu_ps(dst + 5 * stride, tail, _mm512_sub_ps(src[5], src[4])); - _mm512_mask_storeu_ps(dst + 6 * stride, tail, _mm512_add_ps(_mm512_sub_ps(src[4], src[3]), _mm512_sub_ps(src[6], src[7]))); - _mm512_mask_storeu_ps(dst + 7 * stride, tail, _mm512_sub_ps(src[7], src[4])); - _mm512_mask_storeu_ps(dst + 8 * stride, tail, _mm512_add_ps(_mm512_sub_ps(src[4], src[5]), _mm512_sub_ps(src[8], src[7]))); - } - - SIMD_INLINE void WinogradKernel2x2Block2x2SetInput16t(const float* src, size_t srcS, size_t srcC, __m512 dst[9], __mmask16 tail = -1) - { - dst[0] = _mm512_maskz_loadu_ps(tail, src + 0 * srcS + 0 * srcC); - dst[1] = _mm512_maskz_loadu_ps(tail, src + 0 * srcS + 1 * srcC); - dst[2] = _mm512_maskz_loadu_ps(tail, src + 0 * srcS + 2 * srcC); - dst[3] = _mm512_maskz_loadu_ps(tail, src + 1 * srcS + 0 * srcC); - dst[4] = _mm512_maskz_loadu_ps(tail, src + 1 * srcS + 1 * srcC); - dst[5] = _mm512_maskz_loadu_ps(tail, src + 1 * srcS + 2 * srcC); - dst[6] = _mm512_maskz_loadu_ps(tail, src + 2 * srcS + 0 * srcC); - dst[7] = _mm512_maskz_loadu_ps(tail, src + 2 * srcS + 1 * srcC); - dst[8] = _mm512_maskz_loadu_ps(tail, src + 2 * srcS + 2 * srcC); - } - - SIMD_INLINE void WinogradKernel2x2Block2x2SetInput16t(const float* src, size_t srcW, size_t srcC, float* dst, size_t dstStride) - { - size_t srcS = srcW * srcC; - size_t srcCF = AlignLo(srcC, F); - size_t c = 0; - for (; c < srcCF; c += F) - { - __m512 tmp[9]; - WinogradKernel2x2Block2x2SetInput16t(src + c, srcS, srcC, tmp); - WinogradKernel2x2Block2x2SetInput16Store(tmp, dst + c, dstStride); - } - if (c < srcC) - { - __mmask16 tail = TailMask16(srcC - srcCF); - __m512 tmp[9]; - WinogradKernel2x2Block2x2SetInput16t(src + c, srcS, srcC, tmp, tail); - WinogradKernel2x2Block2x2SetInput16Store(tmp, dst + c, dstStride, tail); - } - } - - SIMD_INLINE void WinogradKernel2x2Block2x2SetInput16t(const float* src, size_t srcS, size_t srcC, size_t rowB, size_t rowE, size_t colB, size_t colE, __m512 dst[9], __mmask16 tail = -1) - { - for (size_t row = 0; row < rowB; ++row) - { - for (size_t col = 0; col < 3; ++col) - dst[col] = _mm512_setzero_ps(); - dst += 3; - } - for (size_t row = rowB; row < rowE; ++row) - { - for (size_t col = 0; col < colB; ++col) - dst[col] = _mm512_setzero_ps(); - for (size_t col = colB; col < colE; ++col) - dst[col] = _mm512_maskz_loadu_ps(tail, src + row * srcS + col * srcC); - for (size_t col = colE; col < 3; ++col) - dst[col] = _mm512_setzero_ps(); - dst += 3; - } - for (size_t row = rowE; row < 3; ++row) - { - for (size_t col = 0; col < 3; ++col) - dst[col] = _mm512_setzero_ps(); - dst += 3; - } - } - - SIMD_INLINE void WinogradKernel2x2Block2x2SetInput16t(const float* src, size_t srcW, size_t srcC, size_t rowB, size_t rowE, size_t colB, size_t colE, float* dst, size_t dstStride) - { - size_t srcS = srcW * srcC; - size_t srcCF = AlignLo(srcC, F); - size_t c = 0; - for (; c < srcCF; c += F) - { - __m512 tmp[9]; - WinogradKernel2x2Block2x2SetInput16t(src + c, srcS, srcC, rowB, rowE, colB, colE, tmp); - WinogradKernel2x2Block2x2SetInput16Store(tmp, dst + c, dstStride); - } - if (c < srcC) - { - __mmask16 tail = TailMask16(srcC - srcCF); - __m512 tmp[9]; - WinogradKernel2x2Block2x2SetInput16t(src + c, srcS, srcC, rowB, rowE, colB, colE, tmp, tail); - WinogradKernel2x2Block2x2SetInput16Store(tmp, dst + c, dstStride, tail); - } - } - - void WinogradKernel2x2Block2x2SetInput(const float* src, size_t srcChannels, size_t srcHeight, size_t srcWidth, - size_t padY, size_t 
padX, size_t padH, size_t padW, float* dst, size_t dstStride, SimdBool trans) - { - assert(padY == padX && padW == padH && (padY + padH == 0 || padY + padH == 1)); - if (trans ? false : true) - { - Base::WinogradKernel2x2Block2x2SetInput(src, srcChannels, srcHeight, srcWidth, padY, padX, padH, padW, dst, dstStride, trans); - return; - } - size_t dstH = srcHeight - 1 + padY + padH; - size_t dstW = srcWidth - 1 + padX + padW; - size_t dstH2 = AlignLo(dstH, 2); - size_t dstW2 = AlignLo(dstW, 2); - size_t noseW = Simd::Min(3, dstW + 1); - size_t noseH = Simd::Min(3, dstH + 1); - size_t startY = padY ? 2 : 0; - size_t startX = padX ? 2 : 0; - if (padY || padH) - { - if (dstH == dstH2) - dstH2 -= 2; - if (dstW == dstW2) - dstW2 -= 2; - if (padY) - src -= (srcWidth + 1) * (trans ? srcChannels : 1); - } - size_t tailW = dstW - dstW2 + (padW ? 0 : 1); - size_t tailH = dstH - dstH2 + (padH ? 0 : 1); - size_t row = 0, col = 0; - if (padY) - { - if (padX) - WinogradKernel2x2Block2x2SetInput16t(src, srcWidth, srcChannels, 1, noseH, 1, noseW, dst, dstStride), dst += srcChannels; - for (col = startX; col < dstW2; col += 2) - WinogradKernel2x2Block2x2SetInput16t(src + col * srcChannels, srcWidth, srcChannels, 1, noseH, 0, 3, dst, dstStride), dst += srcChannels; - if (col < dstW) - WinogradKernel2x2Block2x2SetInput16t(src + col * srcChannels, srcWidth, srcChannels, 1, noseH, 0, tailW, dst, dstStride), dst += srcChannels; - } - for (row = startY; row < dstH2; row += 2) - { - if (padX) - WinogradKernel2x2Block2x2SetInput16t(src + row * srcWidth * srcChannels, srcWidth, srcChannels, 0, 3, 1, noseW, dst, dstStride), dst += srcChannels; - for (col = startX; col < dstW2; col += 2) - WinogradKernel2x2Block2x2SetInput16t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, dst, dstStride), dst += srcChannels; - if (col < dstW) - WinogradKernel2x2Block2x2SetInput16t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, 0, 3, 0, tailW, dst, dstStride), dst += srcChannels; - } - if (row < dstH) - { - if (padX) - WinogradKernel2x2Block2x2SetInput16t(src + row * srcWidth * srcChannels, srcWidth, srcChannels, 0, tailH, 1, noseW, dst, dstStride), dst += srcChannels; - for (col = startX; col < dstW2; col += 2) - WinogradKernel2x2Block2x2SetInput16t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, 0, tailH, 0, 3, dst, dstStride), dst += srcChannels; - if (col < dstW) - WinogradKernel2x2Block2x2SetInput16t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, 0, tailH, 0, tailW, dst, dstStride), dst += srcChannels; - } - } - - //----------------------------------------------------------------------- - - SIMD_INLINE void WinogradKernel2x2Block2x2SetOutputLoad9(const float* src, size_t stride, __m512* dst, __mmask16 tail = -1) - { - __m512 s[9]; - s[0] = _mm512_maskz_loadu_ps(tail, src + 0 * stride); - s[1] = _mm512_maskz_loadu_ps(tail, src + 1 * stride); - s[2] = _mm512_maskz_loadu_ps(tail, src + 2 * stride); - s[3] = _mm512_maskz_loadu_ps(tail, src + 3 * stride); - s[4] = _mm512_maskz_loadu_ps(tail, src + 4 * stride); - s[5] = _mm512_maskz_loadu_ps(tail, src + 5 * stride); - s[6] = _mm512_maskz_loadu_ps(tail, src + 6 * stride); - s[7] = _mm512_maskz_loadu_ps(tail, src + 7 * stride); - s[8] = _mm512_maskz_loadu_ps(tail, src + 8 * stride); - dst[0] = _mm512_add_ps(_mm512_add_ps(s[0], s[1]), _mm512_add_ps(s[3], s[4])); - dst[1] = _mm512_add_ps(_mm512_add_ps(s[1], s[2]), _mm512_add_ps(s[4], s[5])); - dst[2] = _mm512_add_ps(_mm512_add_ps(s[3], s[4]), _mm512_add_ps(s[6], 
s[7])); - dst[3] = _mm512_add_ps(_mm512_add_ps(s[4], s[5]), _mm512_add_ps(s[7], s[8])); - } - - SIMD_INLINE void WinogradKernel2x2Block2x2SetOutputStore16(const __m512 src[4], float* dst, size_t dstS, size_t dstC, __mmask16 tail = -1) - { - _mm512_mask_storeu_ps(dst + 0 * dstS + 0 * dstC, tail, src[0]); - _mm512_mask_storeu_ps(dst + 0 * dstS + 1 * dstC, tail, src[1]); - _mm512_mask_storeu_ps(dst + 1 * dstS + 0 * dstC, tail, src[2]); - _mm512_mask_storeu_ps(dst + 1 * dstS + 1 * dstC, tail, src[3]); - } - - SIMD_INLINE void WinogradKernel2x2Block2x2SetOutput16t(const float* src, size_t srcStride, float* dst, size_t dstW, size_t dstC) - { - size_t dstS = dstW * dstC, dstCF = AlignLo(dstC, F); - for (size_t d = 0; d < dstCF; d += F) - { - __m512 tmp[4]; - WinogradKernel2x2Block2x2SetOutputLoad9(src + d, srcStride, tmp); - WinogradKernel2x2Block2x2SetOutputStore16(tmp, dst + d, dstS, dstC); - } - if (dstCF < dstC) - { - __mmask16 tail = TailMask16(dstC - dstCF); - __m512 tmp[4]; - WinogradKernel2x2Block2x2SetOutputLoad9(src + dstCF, srcStride, tmp, tail); - WinogradKernel2x2Block2x2SetOutputStore16(tmp, dst + dstCF, dstS, dstC, tail); - } - } - - SIMD_INLINE void WinogradKernel2x2Block2x2SetOutputStore16(const __m512 src[4], float* dst, size_t dstS, size_t dstC, size_t rowE, size_t colE, __mmask16 tail = -1) - { - for (size_t row = 0; row < rowE; ++row) - for (size_t col = 0; col < colE; ++col) - _mm512_mask_storeu_ps(dst + row * dstS + col * dstC, tail, src[row * 2 + col]); - } - - SIMD_INLINE void WinogradKernel2x2Block2x2SetOutput16t(const float* src, size_t srcStride, float* dst, size_t dstW, size_t dstC, size_t rowE, size_t colE) - { - size_t dstS = dstW * dstC, dstCF = AlignLo(dstC, F); - for (size_t d = 0; d < dstCF; d += F) - { - __m512 tmp[4]; - WinogradKernel2x2Block2x2SetOutputLoad9(src + d, srcStride, tmp); - WinogradKernel2x2Block2x2SetOutputStore16(tmp, dst + d, dstS, dstC, rowE, colE); - } - if (dstCF < dstC) - { - __mmask16 tail = TailMask16(dstC - dstCF); - __m512 tmp[4]; - WinogradKernel2x2Block2x2SetOutputLoad9(src + dstCF, srcStride, tmp, tail); - WinogradKernel2x2Block2x2SetOutputStore16(tmp, dst + dstCF, dstS, dstC, rowE, colE, tail); - } - } - - void WinogradKernel2x2Block2x2SetOutput(const float* src, size_t srcStride, float* dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans) - { - if (trans ? 
false : true) - { - Base::WinogradKernel2x2Block2x2SetOutput(src, srcStride, dst, dstChannels, dstHeight, dstWidth, trans); - return; - } - size_t tileH = (dstHeight + 1) / 2; - size_t tileW = (dstWidth + 1) / 2; - size_t dstH2 = AlignLo(dstHeight, 2); - size_t dstW2 = AlignLo(dstWidth, 2); - size_t row, col; - for (row = 0; row < dstH2; row += 2) - { - for (col = 0; col < dstW2; col += 2) - WinogradKernel2x2Block2x2SetOutput16t(src, srcStride, dst + (row * dstWidth + col) * dstChannels, dstWidth, dstChannels), src += dstChannels; - if (col < dstWidth) - WinogradKernel2x2Block2x2SetOutput16t(src, srcStride, dst + (row * dstWidth + col) * dstChannels, dstWidth, dstChannels, 2, dstWidth - col), src += dstChannels; - } - if (row < dstHeight) - { - for (col = 0; col < dstW2; col += 2) - WinogradKernel2x2Block2x2SetOutput16t(src, srcStride, dst + (row * dstWidth + col) * dstChannels, dstWidth, dstChannels, dstHeight - row, 2), src += dstChannels; - if (col < dstWidth) - WinogradKernel2x2Block2x2SetOutput16t(src, srcStride, dst + (row * dstWidth + col) * dstChannels, dstWidth, dstChannels, dstHeight - row, dstWidth - col), src += dstChannels; - } - } - - //----------------------------------------------------------------------- - - SIMD_INLINE void WinogradKernel2x2Block4x4SetFilterRow(const __m512* t, float* dst, size_t stride, __mmask16 tail) - { - const __m512 r2 = _mm512_set1_ps(1.0f / 2.0f); - const __m512 r3 = _mm512_set1_ps(1.0f / 3.0f); - const __m512 r6 = _mm512_set1_ps(1.0f / 6.0f); - const __m512 mr2 = _mm512_set1_ps(-1.0f / 2.0f); - - _mm512_mask_storeu_ps(dst + 0 * stride, tail, _mm512_mul_ps(r2, t[0])); - _mm512_mask_storeu_ps(dst + 1 * stride, tail, _mm512_mul_ps(mr2, _mm512_add_ps(t[0], t[1]))); - _mm512_mask_storeu_ps(dst + 2 * stride, tail, _mm512_mul_ps(r6, _mm512_sub_ps(t[1], t[0]))); - _mm512_mask_storeu_ps(dst + 3 * stride, tail, _mm512_add_ps(_mm512_mul_ps(r6, t[0]), _mm512_mul_ps(r3, t[1]))); - _mm512_mask_storeu_ps(dst + 4 * stride, tail, t[1]); - } - - SIMD_INLINE void WinogradKernel2x2Block4x4SetFilter(const __m512 src[4], float* dst, size_t stride, __mmask16 tail) - { - const __m512 r2 = _mm512_set1_ps(1.0f / 2.0f); - const __m512 r3 = _mm512_set1_ps(1.0f / 3.0f); - const __m512 r6 = _mm512_set1_ps(1.0f / 6.0f); - const __m512 mr2 = _mm512_set1_ps(-1.0f / 2.0f); - - __m512 t[2]; - t[0] = _mm512_mul_ps(r2, src[0]); - t[1] = _mm512_mul_ps(r2, src[1]); - WinogradKernel2x2Block4x4SetFilterRow(t, dst + 0 * stride, stride, tail); - - t[0] = _mm512_mul_ps(mr2, _mm512_add_ps(src[0], src[2])); - t[1] = _mm512_mul_ps(mr2, _mm512_add_ps(src[1], src[3])); - WinogradKernel2x2Block4x4SetFilterRow(t, dst + 5 * stride, stride, tail); - - t[0] = _mm512_mul_ps(r6, _mm512_sub_ps(src[2], src[0])); - t[1] = _mm512_mul_ps(r6, _mm512_sub_ps(src[3], src[1])); - WinogradKernel2x2Block4x4SetFilterRow(t, dst + 10 * stride, stride, tail); - - t[0] = _mm512_add_ps(_mm512_mul_ps(r6, src[0]), _mm512_mul_ps(r3, src[2])); - t[1] = _mm512_add_ps(_mm512_mul_ps(r6, src[1]), _mm512_mul_ps(r3, src[3])); - WinogradKernel2x2Block4x4SetFilterRow(t, dst + 15 * stride, stride, tail); - - t[0] = src[2]; - t[1] = src[3]; - WinogradKernel2x2Block4x4SetFilterRow(t, dst + 20 * stride, stride, tail); - } - - SIMD_INLINE void WinogradKernel2x2Block4x4SetFilter16t(const float* src, float* dst, size_t stride, __mmask16 tail = -1) - { - __m512 _src[4]; - _src[0] = _mm512_loadu_ps(src + 0 * stride); - _src[1] = _mm512_loadu_ps(src + 1 * stride); - _src[2] = _mm512_loadu_ps(src + 2 * stride); - _src[3] = 
_mm512_loadu_ps(src + 3 * stride); - WinogradKernel2x2Block4x4SetFilter(_src, dst, stride, tail); - } - - void WinogradKernel2x2Block4x4SetFilter(const float* src, size_t size, float* dst, SimdBool trans) - { - size_t sizeF = AlignLo(size, F), i = 0; - if (trans) - { - for (; i < sizeF; i += F) - WinogradKernel2x2Block4x4SetFilter16t(src + i, dst + i, size); - if (i < size) - { - __mmask16 tail = TailMask16(size - sizeF); - WinogradKernel2x2Block4x4SetFilter16t(src + i, dst + i, size, tail); - } - } - else - { - Sse::WinogradKernel2x2Block4x4SetFilter(src, size, dst, trans); - } - } - - //----------------------------------------------------------------------- - - SIMD_INLINE void WinogradKernel2x2Block4x4SetInputStoreRow(const __m512 tmp[5], __m512 _2, __m512 _3, float* dst, size_t stride, __mmask16 tail) - { - _mm512_mask_storeu_ps(dst + 0 * stride, tail, _mm512_add_ps(_mm512_sub_ps(_mm512_mul_ps(_2, tmp[0]), tmp[1]), _mm512_sub_ps(tmp[3], _mm512_mul_ps(_2, tmp[2])))); - _mm512_mask_storeu_ps(dst + 1 * stride, tail, _mm512_sub_ps(tmp[3], _mm512_add_ps(_mm512_mul_ps(_2, tmp[1]), tmp[2]))); - _mm512_mask_storeu_ps(dst + 2 * stride, tail, _mm512_add_ps(_mm512_sub_ps(_mm512_mul_ps(_2, tmp[1]), _mm512_mul_ps(_3, tmp[2])), tmp[3])); - _mm512_mask_storeu_ps(dst + 3 * stride, tail, _mm512_sub_ps(tmp[3], tmp[1])); - _mm512_mask_storeu_ps(dst + 4 * stride, tail, _mm512_add_ps(_mm512_sub_ps(_mm512_mul_ps(_2, tmp[1]), tmp[2]), _mm512_sub_ps(tmp[4], _mm512_mul_ps(_2, tmp[3])))); - } - - SIMD_INLINE void WinogradKernel2x2Block4x4SetInputStore(const __m512* src, float* dst, size_t stride, __mmask16 tail = -1) - { - const __m512 _2 = _mm512_set1_ps(2.0f); - const __m512 _3 = _mm512_set1_ps(3.0f); - __m512 tmp[5]; - tmp[0] = _mm512_add_ps(_mm512_sub_ps(_mm512_mul_ps(_2, src[0]), src[5]), _mm512_sub_ps(src[15], _mm512_mul_ps(_2, src[10]))); - tmp[1] = _mm512_add_ps(_mm512_sub_ps(_mm512_mul_ps(_2, src[1]), src[6]), _mm512_sub_ps(src[16], _mm512_mul_ps(_2, src[11]))); - tmp[2] = _mm512_add_ps(_mm512_sub_ps(_mm512_mul_ps(_2, src[2]), src[7]), _mm512_sub_ps(src[17], _mm512_mul_ps(_2, src[12]))); - tmp[3] = _mm512_add_ps(_mm512_sub_ps(_mm512_mul_ps(_2, src[3]), src[8]), _mm512_sub_ps(src[18], _mm512_mul_ps(_2, src[13]))); - tmp[4] = _mm512_add_ps(_mm512_sub_ps(_mm512_mul_ps(_2, src[4]), src[9]), _mm512_sub_ps(src[19], _mm512_mul_ps(_2, src[14]))); - WinogradKernel2x2Block4x4SetInputStoreRow(tmp, _2, _3, dst + 0 * stride, stride, tail); - - tmp[0] = _mm512_sub_ps(src[15], _mm512_add_ps(_mm512_mul_ps(_2, src[5]), src[10])); - tmp[1] = _mm512_sub_ps(src[16], _mm512_add_ps(_mm512_mul_ps(_2, src[6]), src[11])); - tmp[2] = _mm512_sub_ps(src[17], _mm512_add_ps(_mm512_mul_ps(_2, src[7]), src[12])); - tmp[3] = _mm512_sub_ps(src[18], _mm512_add_ps(_mm512_mul_ps(_2, src[8]), src[13])); - tmp[4] = _mm512_sub_ps(src[19], _mm512_add_ps(_mm512_mul_ps(_2, src[9]), src[14])); - WinogradKernel2x2Block4x4SetInputStoreRow(tmp, _2, _3, dst + 5 * stride, stride, tail); - - tmp[0] = _mm512_add_ps(_mm512_sub_ps(_mm512_mul_ps(_2, src[5]), _mm512_mul_ps(_3, src[10])), src[15]); - tmp[1] = _mm512_add_ps(_mm512_sub_ps(_mm512_mul_ps(_2, src[6]), _mm512_mul_ps(_3, src[11])), src[16]); - tmp[2] = _mm512_add_ps(_mm512_sub_ps(_mm512_mul_ps(_2, src[7]), _mm512_mul_ps(_3, src[12])), src[17]); - tmp[3] = _mm512_add_ps(_mm512_sub_ps(_mm512_mul_ps(_2, src[8]), _mm512_mul_ps(_3, src[13])), src[18]); - tmp[4] = _mm512_add_ps(_mm512_sub_ps(_mm512_mul_ps(_2, src[9]), _mm512_mul_ps(_3, src[14])), src[19]); - 
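/* The five tmp[] groups in this function apply the one-dimensional input
   transform of Winograd F(4x4, 2x2) down the tile rows, and each
   WinogradKernel2x2Block4x4SetInputStoreRow() call applies the same 5-tap
   pattern across the columns, so the stored block is d = Bt * g * Bt^T with
       Bt = | 2 -1 -2  1  0 |
            | 0 -2 -1  1  0 |
            | 0  2 -3  1  0 |
            | 0 -1  0  1  0 |
            | 0  2 -1 -2  1 |
   (the coefficients can be read directly off the arithmetic above and below). */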
WinogradKernel2x2Block4x4SetInputStoreRow(tmp, _2, _3, dst + 10 * stride, stride, tail); - - tmp[0] = _mm512_sub_ps(src[15], src[5]); - tmp[1] = _mm512_sub_ps(src[16], src[6]); - tmp[2] = _mm512_sub_ps(src[17], src[7]); - tmp[3] = _mm512_sub_ps(src[18], src[8]); - tmp[4] = _mm512_sub_ps(src[19], src[9]); - WinogradKernel2x2Block4x4SetInputStoreRow(tmp, _2, _3, dst + 15 * stride, stride, tail); - - tmp[0] = _mm512_add_ps(_mm512_sub_ps(_mm512_mul_ps(_2, src[5]), src[10]), _mm512_sub_ps(src[20], _mm512_mul_ps(_2, src[15]))); - tmp[1] = _mm512_add_ps(_mm512_sub_ps(_mm512_mul_ps(_2, src[6]), src[11]), _mm512_sub_ps(src[21], _mm512_mul_ps(_2, src[16]))); - tmp[2] = _mm512_add_ps(_mm512_sub_ps(_mm512_mul_ps(_2, src[7]), src[12]), _mm512_sub_ps(src[22], _mm512_mul_ps(_2, src[17]))); - tmp[3] = _mm512_add_ps(_mm512_sub_ps(_mm512_mul_ps(_2, src[8]), src[13]), _mm512_sub_ps(src[23], _mm512_mul_ps(_2, src[18]))); - tmp[4] = _mm512_add_ps(_mm512_sub_ps(_mm512_mul_ps(_2, src[9]), src[14]), _mm512_sub_ps(src[24], _mm512_mul_ps(_2, src[19]))); - WinogradKernel2x2Block4x4SetInputStoreRow(tmp, _2, _3, dst + 20 * stride, stride, tail); - } - - SIMD_INLINE void WinogradKernel2x2Block4x4SetInput16t(const float* src, size_t srcS, size_t srcC, __m512 dst[25], __mmask16 tail = -1) - { - dst[0] = _mm512_maskz_loadu_ps(tail, src + 0 * srcS + 0 * srcC); - dst[1] = _mm512_maskz_loadu_ps(tail, src + 0 * srcS + 1 * srcC); - dst[2] = _mm512_maskz_loadu_ps(tail, src + 0 * srcS + 2 * srcC); - dst[3] = _mm512_maskz_loadu_ps(tail, src + 0 * srcS + 3 * srcC); - dst[4] = _mm512_maskz_loadu_ps(tail, src + 0 * srcS + 4 * srcC); - dst[5] = _mm512_maskz_loadu_ps(tail, src + 1 * srcS + 0 * srcC); - dst[6] = _mm512_maskz_loadu_ps(tail, src + 1 * srcS + 1 * srcC); - dst[7] = _mm512_maskz_loadu_ps(tail, src + 1 * srcS + 2 * srcC); - dst[8] = _mm512_maskz_loadu_ps(tail, src + 1 * srcS + 3 * srcC); - dst[9] = _mm512_maskz_loadu_ps(tail, src + 1 * srcS + 4 * srcC); - dst[10] = _mm512_maskz_loadu_ps(tail, src + 2 * srcS + 0 * srcC); - dst[11] = _mm512_maskz_loadu_ps(tail, src + 2 * srcS + 1 * srcC); - dst[12] = _mm512_maskz_loadu_ps(tail, src + 2 * srcS + 2 * srcC); - dst[13] = _mm512_maskz_loadu_ps(tail, src + 2 * srcS + 3 * srcC); - dst[14] = _mm512_maskz_loadu_ps(tail, src + 2 * srcS + 4 * srcC); - dst[15] = _mm512_maskz_loadu_ps(tail, src + 3 * srcS + 0 * srcC); - dst[16] = _mm512_maskz_loadu_ps(tail, src + 3 * srcS + 1 * srcC); - dst[17] = _mm512_maskz_loadu_ps(tail, src + 3 * srcS + 2 * srcC); - dst[18] = _mm512_maskz_loadu_ps(tail, src + 3 * srcS + 3 * srcC); - dst[19] = _mm512_maskz_loadu_ps(tail, src + 3 * srcS + 4 * srcC); - dst[20] = _mm512_maskz_loadu_ps(tail, src + 4 * srcS + 0 * srcC); - dst[21] = _mm512_maskz_loadu_ps(tail, src + 4 * srcS + 1 * srcC); - dst[22] = _mm512_maskz_loadu_ps(tail, src + 4 * srcS + 2 * srcC); - dst[23] = _mm512_maskz_loadu_ps(tail, src + 4 * srcS + 3 * srcC); - dst[24] = _mm512_maskz_loadu_ps(tail, src + 4 * srcS + 4 * srcC); - } - - SIMD_INLINE void WinogradKernel2x2Block4x4SetInput16t(const float* src, size_t srcS, size_t srcC, float* dst, size_t stride, __mmask16 tail) - { - __m512 s[25], t[5]; - const __m512 _2 = _mm512_set1_ps(2.0f); - const __m512 _3 = _mm512_set1_ps(3.0f); - - s[5] = _mm512_maskz_loadu_ps(tail, src + 1 * srcS + 0 * srcC); - s[6] = _mm512_maskz_loadu_ps(tail, src + 1 * srcS + 1 * srcC); - s[7] = _mm512_maskz_loadu_ps(tail, src + 1 * srcS + 2 * srcC); - s[8] = _mm512_maskz_loadu_ps(tail, src + 1 * srcS + 3 * srcC); - s[9] = _mm512_maskz_loadu_ps(tail, src + 1 * srcS + 4 * 
srcC); - s[10] = _mm512_maskz_loadu_ps(tail, src + 2 * srcS + 0 * srcC); - s[11] = _mm512_maskz_loadu_ps(tail, src + 2 * srcS + 1 * srcC); - s[12] = _mm512_maskz_loadu_ps(tail, src + 2 * srcS + 2 * srcC); - s[13] = _mm512_maskz_loadu_ps(tail, src + 2 * srcS + 3 * srcC); - s[14] = _mm512_maskz_loadu_ps(tail, src + 2 * srcS + 4 * srcC); - s[15] = _mm512_maskz_loadu_ps(tail, src + 3 * srcS + 0 * srcC); - s[16] = _mm512_maskz_loadu_ps(tail, src + 3 * srcS + 1 * srcC); - s[17] = _mm512_maskz_loadu_ps(tail, src + 3 * srcS + 2 * srcC); - s[18] = _mm512_maskz_loadu_ps(tail, src + 3 * srcS + 3 * srcC); - s[19] = _mm512_maskz_loadu_ps(tail, src + 3 * srcS + 4 * srcC); - - t[0] = _mm512_add_ps(_mm512_sub_ps(_mm512_mul_ps(_2, _mm512_maskz_loadu_ps(tail, src + 0 * srcS + 0 * srcC)), s[5]), _mm512_sub_ps(s[15], _mm512_mul_ps(_2, s[10]))); - t[1] = _mm512_add_ps(_mm512_sub_ps(_mm512_mul_ps(_2, _mm512_maskz_loadu_ps(tail, src + 0 * srcS + 1 * srcC)), s[6]), _mm512_sub_ps(s[16], _mm512_mul_ps(_2, s[11]))); - t[2] = _mm512_add_ps(_mm512_sub_ps(_mm512_mul_ps(_2, _mm512_maskz_loadu_ps(tail, src + 0 * srcS + 2 * srcC)), s[7]), _mm512_sub_ps(s[17], _mm512_mul_ps(_2, s[12]))); - t[3] = _mm512_add_ps(_mm512_sub_ps(_mm512_mul_ps(_2, _mm512_maskz_loadu_ps(tail, src + 0 * srcS + 3 * srcC)), s[8]), _mm512_sub_ps(s[18], _mm512_mul_ps(_2, s[13]))); - t[4] = _mm512_add_ps(_mm512_sub_ps(_mm512_mul_ps(_2, _mm512_maskz_loadu_ps(tail, src + 0 * srcS + 4 * srcC)), s[9]), _mm512_sub_ps(s[19], _mm512_mul_ps(_2, s[14]))); - WinogradKernel2x2Block4x4SetInputStoreRow(t, _2, _3, dst + 0 * stride, stride, tail); - - t[0] = _mm512_sub_ps(s[15], _mm512_add_ps(_mm512_mul_ps(_2, s[5]), s[10])); - t[1] = _mm512_sub_ps(s[16], _mm512_add_ps(_mm512_mul_ps(_2, s[6]), s[11])); - t[2] = _mm512_sub_ps(s[17], _mm512_add_ps(_mm512_mul_ps(_2, s[7]), s[12])); - t[3] = _mm512_sub_ps(s[18], _mm512_add_ps(_mm512_mul_ps(_2, s[8]), s[13])); - t[4] = _mm512_sub_ps(s[19], _mm512_add_ps(_mm512_mul_ps(_2, s[9]), s[14])); - WinogradKernel2x2Block4x4SetInputStoreRow(t, _2, _3, dst + 5 * stride, stride, tail); - - t[0] = _mm512_add_ps(_mm512_sub_ps(_mm512_mul_ps(_2, s[5]), _mm512_mul_ps(_3, s[10])), s[15]); - t[1] = _mm512_add_ps(_mm512_sub_ps(_mm512_mul_ps(_2, s[6]), _mm512_mul_ps(_3, s[11])), s[16]); - t[2] = _mm512_add_ps(_mm512_sub_ps(_mm512_mul_ps(_2, s[7]), _mm512_mul_ps(_3, s[12])), s[17]); - t[3] = _mm512_add_ps(_mm512_sub_ps(_mm512_mul_ps(_2, s[8]), _mm512_mul_ps(_3, s[13])), s[18]); - t[4] = _mm512_add_ps(_mm512_sub_ps(_mm512_mul_ps(_2, s[9]), _mm512_mul_ps(_3, s[14])), s[19]); - WinogradKernel2x2Block4x4SetInputStoreRow(t, _2, _3, dst + 10 * stride, stride, tail); - - t[0] = _mm512_sub_ps(s[15], s[5]); - t[1] = _mm512_sub_ps(s[16], s[6]); - t[2] = _mm512_sub_ps(s[17], s[7]); - t[3] = _mm512_sub_ps(s[18], s[8]); - t[4] = _mm512_sub_ps(s[19], s[9]); - WinogradKernel2x2Block4x4SetInputStoreRow(t, _2, _3, dst + 15 * stride, stride, tail); - - t[0] = _mm512_add_ps(_mm512_sub_ps(_mm512_mul_ps(_2, s[5]), s[10]), _mm512_sub_ps(_mm512_maskz_loadu_ps(tail, src + 4 * srcS + 0 * srcC), _mm512_mul_ps(_2, s[15]))); - t[1] = _mm512_add_ps(_mm512_sub_ps(_mm512_mul_ps(_2, s[6]), s[11]), _mm512_sub_ps(_mm512_maskz_loadu_ps(tail, src + 4 * srcS + 1 * srcC), _mm512_mul_ps(_2, s[16]))); - t[2] = _mm512_add_ps(_mm512_sub_ps(_mm512_mul_ps(_2, s[7]), s[12]), _mm512_sub_ps(_mm512_maskz_loadu_ps(tail, src + 4 * srcS + 2 * srcC), _mm512_mul_ps(_2, s[17]))); - t[3] = _mm512_add_ps(_mm512_sub_ps(_mm512_mul_ps(_2, s[8]), s[13]), _mm512_sub_ps(_mm512_maskz_loadu_ps(tail, src + 4 
* srcS + 3 * srcC), _mm512_mul_ps(_2, s[18]))); - t[4] = _mm512_add_ps(_mm512_sub_ps(_mm512_mul_ps(_2, s[9]), s[14]), _mm512_sub_ps(_mm512_maskz_loadu_ps(tail, src + 4 * srcS + 4 * srcC), _mm512_mul_ps(_2, s[19]))); - WinogradKernel2x2Block4x4SetInputStoreRow(t, _2, _3, dst + 20 * stride, stride, tail); - } - - SIMD_INLINE void WinogradKernel2x2Block4x4SetInput16t(const float* src, size_t srcW, size_t srcC, float* dst, size_t dstStride) - { - size_t srcS = srcW * srcC; - size_t srcCF = AlignLo(srcC, F); - for (size_t c = 0; c < srcCF; c += F) - WinogradKernel2x2Block4x4SetInput16t(src + c, srcS, srcC, dst + c, dstStride, __mmask16(-1)); - if (srcCF < srcC) - { - __mmask16 tail = TailMask16(srcC - srcCF); - WinogradKernel2x2Block4x4SetInput16t(src + srcCF, srcS, srcC, dst + srcCF, dstStride, tail); - } - } - - SIMD_INLINE void WinogradKernel2x2Block4x4SetInput16t(const float* src, size_t srcS, size_t srcC, size_t rowB, size_t rowE, size_t colB, size_t colE, __m512 * dst, __mmask16 tail = -1) - { - for (size_t row = 0; row < rowB; ++row) - { - for (size_t col = 0; col < 5; ++col) - dst[col] = _mm512_setzero_ps(); - dst += 5; - } - for (size_t row = rowB; row < rowE; ++row) - { - for (size_t col = 0; col < colB; ++col) - dst[col] = _mm512_setzero_ps(); - for (size_t col = colB; col < colE; ++col) - dst[col] = _mm512_maskz_loadu_ps(tail, src + row * srcS + col * srcC); - for (size_t col = colE; col < 5; ++col) - dst[col] = _mm512_setzero_ps(); - dst += 5; - } - for (size_t row = rowE; row < 5; ++row) - { - for (size_t col = 0; col < 5; ++col) - dst[col] = _mm512_setzero_ps(); - dst += 5; - } - } - - SIMD_INLINE void WinogradKernel2x2Block4x4SetInput16t(const float* src, size_t srcW, size_t srcC, size_t rowB, size_t rowE, size_t colB, size_t colE, float* dst, size_t dstStride) - { - size_t srcS = srcW * srcC; - size_t srcCF = AlignLo(srcC, F); - for (size_t c = 0; c < srcCF; c += F) - { - __m512 tmp[25]; - WinogradKernel2x2Block4x4SetInput16t(src + c, srcS, srcC, rowB, rowE, colB, colE, tmp); - WinogradKernel2x2Block4x4SetInputStore(tmp, dst + c, dstStride); - } - if (srcCF < srcC) - { - __mmask16 tail = TailMask16(srcC - srcCF); - __m512 tmp[25]; - WinogradKernel2x2Block4x4SetInput16t(src + srcC - F, srcS, srcC, rowB, rowE, colB, colE, tmp, tail); - WinogradKernel2x2Block4x4SetInputStore(tmp, dst + srcC - F, dstStride, tail); - } - } - - void WinogradKernel2x2Block4x4SetInput(const float* src, size_t srcChannels, size_t srcHeight, size_t srcWidth, - size_t padY, size_t padX, size_t padH, size_t padW, float* dst, size_t dstStride, SimdBool trans) - { - assert(padY == padX && padW == padH && (padY + padH == 0 || padY + padH == 1)); - if (trans ? false : true) - { - Base::WinogradKernel2x2Block4x4SetInput(src, srcChannels, srcHeight, srcWidth, padY, padX, padH, padW, dst, dstStride, trans); - return; - } - size_t dstH = srcHeight - 1 + padY + padH; - size_t dstW = srcWidth - 1 + padX + padW; - size_t dstH4 = AlignLo(dstH, 4); - size_t dstW4 = AlignLo(dstW, 4); - size_t noseW = Simd::Min(5, dstW + 1); - size_t noseH = Simd::Min(5, dstH + 1); - size_t startY = padY ? 4 : 0; - size_t startX = padX ? 4 : 0; - if (padY || padH) - { - if (dstH == dstH4) - dstH4 -= 4; - if (dstW == dstW4) - dstW4 -= 4; - if (padY) - src -= (srcWidth + 1) * (trans ? srcChannels : 1); - } - size_t tailW = dstW - dstW4 + (padW ? 0 : 1); - size_t tailH = dstH - dstH4 + (padH ? 
0 : 1); - size_t row = 0, col = 0; - if (padY) - { - if (padX) - WinogradKernel2x2Block4x4SetInput16t(src, srcWidth, srcChannels, 1, noseH, 1, noseW, dst, dstStride), dst += srcChannels; - for (col = startX; col < dstW4; col += 4) - WinogradKernel2x2Block4x4SetInput16t(src + col * srcChannels, srcWidth, srcChannels, 1, noseH, 0, 5, dst, dstStride), dst += srcChannels; - if (col < dstW) - WinogradKernel2x2Block4x4SetInput16t(src + col * srcChannels, srcWidth, srcChannels, 1, noseH, 0, tailW, dst, dstStride), dst += srcChannels; - } - for (row = startY; row < dstH4; row += 4) - { - if (padX) - WinogradKernel2x2Block4x4SetInput16t(src + row * srcWidth * srcChannels, srcWidth, srcChannels, 0, 5, 1, noseW, dst, dstStride), dst += srcChannels; - for (col = startX; col < dstW4; col += 4) - WinogradKernel2x2Block4x4SetInput16t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, dst, dstStride), dst += srcChannels; - if (col < dstW) - WinogradKernel2x2Block4x4SetInput16t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, 0, 5, 0, tailW, dst, dstStride), dst += srcChannels; - } - if (row < dstH) - { - if (padX) - WinogradKernel2x2Block4x4SetInput16t(src + row * srcWidth * srcChannels, srcWidth, srcChannels, 0, tailH, 1, noseW, dst, dstStride), dst += srcChannels; - for (col = startX; col < dstW4; col += 4) - WinogradKernel2x2Block4x4SetInput16t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, 0, tailH, 0, 5, dst, dstStride), dst += srcChannels; - if (col < dstW) - WinogradKernel2x2Block4x4SetInput16t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, 0, tailH, 0, tailW, dst, dstStride), dst += srcChannels; - } - } - - //----------------------------------------------------------------------- - - SIMD_INLINE void WinogradKernel2x2Block4x4SetOutputGetRow(const __m512 t[5], __m512 _2, __m512 _4, __m512 _8, __m512* d) - { - d[0] = _mm512_add_ps(_mm512_add_ps(t[0], t[1]), _mm512_add_ps(t[2], t[3])); - d[1] = _mm512_add_ps(_mm512_sub_ps(t[1], t[2]), _mm512_mul_ps(_2, t[3])); - d[2] = _mm512_add_ps(_mm512_add_ps(t[1], t[2]), _mm512_mul_ps(_4, t[3])); - d[3] = _mm512_add_ps(_mm512_sub_ps(t[1], t[2]), _mm512_add_ps(_mm512_mul_ps(_8, t[3]), t[4])); - } - - SIMD_INLINE void WinogradKernel2x2Block4x4SetOutputSaveRow(const __m512 t[5], __m512 _2, __m512 _4, __m512 _8, float* dst, size_t dstC, __mmask16 tail) - { - _mm512_mask_storeu_ps(dst + 0 * dstC, tail, _mm512_add_ps(_mm512_add_ps(t[0], t[1]), _mm512_add_ps(t[2], t[3]))); - _mm512_mask_storeu_ps(dst + 1 * dstC, tail, _mm512_add_ps(_mm512_sub_ps(t[1], t[2]), _mm512_mul_ps(_2, t[3]))); - _mm512_mask_storeu_ps(dst + 2 * dstC, tail, _mm512_add_ps(_mm512_add_ps(t[1], t[2]), _mm512_mul_ps(_4, t[3]))); - _mm512_mask_storeu_ps(dst + 3 * dstC, tail, _mm512_add_ps(_mm512_sub_ps(t[1], t[2]), _mm512_add_ps(_mm512_mul_ps(_8, t[3]), t[4]))); - } - - SIMD_INLINE void WinogradKernel2x2Block4x4SetOutput(const float* src, size_t stride, float * dst, size_t dstS, size_t dstC, size_t rowE, __mmask16 tail) - { - __m512 s[25], t[5]; - s[5] = _mm512_maskz_loadu_ps(tail, src + 5 * stride); - s[6] = _mm512_maskz_loadu_ps(tail, src + 6 * stride); - s[7] = _mm512_maskz_loadu_ps(tail, src + 7 * stride); - s[8] = _mm512_maskz_loadu_ps(tail, src + 8 * stride); - s[9] = _mm512_maskz_loadu_ps(tail, src + 9 * stride); - s[10] = _mm512_maskz_loadu_ps(tail, src + 10 * stride); - s[11] = _mm512_maskz_loadu_ps(tail, src + 11 * stride); - s[12] = _mm512_maskz_loadu_ps(tail, src + 12 * stride); - s[13] = _mm512_maskz_loadu_ps(tail, 
src + 13 * stride); - s[14] = _mm512_maskz_loadu_ps(tail, src + 14 * stride); - s[15] = _mm512_maskz_loadu_ps(tail, src + 15 * stride); - s[16] = _mm512_maskz_loadu_ps(tail, src + 16 * stride); - s[17] = _mm512_maskz_loadu_ps(tail, src + 17 * stride); - s[18] = _mm512_maskz_loadu_ps(tail, src + 18 * stride); - s[19] = _mm512_maskz_loadu_ps(tail, src + 19 * stride); - - const __m512 _2 = _mm512_set1_ps(2.0f); - const __m512 _4 = _mm512_set1_ps(4.0f); - const __m512 _8 = _mm512_set1_ps(8.0f); - t[0] = _mm512_add_ps(_mm512_add_ps(_mm512_maskz_loadu_ps(tail, src + 0 * stride), s[5]), _mm512_add_ps(s[10], s[15])); - t[1] = _mm512_add_ps(_mm512_add_ps(_mm512_maskz_loadu_ps(tail, src + 1 * stride), s[6]), _mm512_add_ps(s[11], s[16])); - t[2] = _mm512_add_ps(_mm512_add_ps(_mm512_maskz_loadu_ps(tail, src + 2 * stride), s[7]), _mm512_add_ps(s[12], s[17])); - t[3] = _mm512_add_ps(_mm512_add_ps(_mm512_maskz_loadu_ps(tail, src + 3 * stride), s[8]), _mm512_add_ps(s[13], s[18])); - t[4] = _mm512_add_ps(_mm512_add_ps(_mm512_maskz_loadu_ps(tail, src + 4 * stride), s[9]), _mm512_add_ps(s[14], s[19])); - WinogradKernel2x2Block4x4SetOutputSaveRow(t, _2, _4, _8, dst + 0 * dstS, dstC, tail); - if (rowE == 1) return; - - t[0] = _mm512_add_ps(_mm512_sub_ps(s[5], s[10]), _mm512_mul_ps(_2, s[15])); - t[1] = _mm512_add_ps(_mm512_sub_ps(s[6], s[11]), _mm512_mul_ps(_2, s[16])); - t[2] = _mm512_add_ps(_mm512_sub_ps(s[7], s[12]), _mm512_mul_ps(_2, s[17])); - t[3] = _mm512_add_ps(_mm512_sub_ps(s[8], s[13]), _mm512_mul_ps(_2, s[18])); - t[4] = _mm512_add_ps(_mm512_sub_ps(s[9], s[14]), _mm512_mul_ps(_2, s[19])); - WinogradKernel2x2Block4x4SetOutputSaveRow(t, _2, _4, _8, dst + 1 * dstS, dstC, tail); - if (rowE == 2) return; - - t[0] = _mm512_add_ps(_mm512_add_ps(s[5], s[10]), _mm512_mul_ps(_4, s[15])); - t[1] = _mm512_add_ps(_mm512_add_ps(s[6], s[11]), _mm512_mul_ps(_4, s[16])); - t[2] = _mm512_add_ps(_mm512_add_ps(s[7], s[12]), _mm512_mul_ps(_4, s[17])); - t[3] = _mm512_add_ps(_mm512_add_ps(s[8], s[13]), _mm512_mul_ps(_4, s[18])); - t[4] = _mm512_add_ps(_mm512_add_ps(s[9], s[14]), _mm512_mul_ps(_4, s[19])); - WinogradKernel2x2Block4x4SetOutputSaveRow(t, _2, _4, _8, dst + 2 * dstS, dstC, tail); - if (rowE == 3) return; - - t[0] = _mm512_add_ps(_mm512_sub_ps(s[5], s[10]), _mm512_add_ps(_mm512_mul_ps(_8, s[15]), _mm512_maskz_loadu_ps(tail, src + 20 * stride))); - t[1] = _mm512_add_ps(_mm512_sub_ps(s[6], s[11]), _mm512_add_ps(_mm512_mul_ps(_8, s[16]), _mm512_maskz_loadu_ps(tail, src + 21 * stride))); - t[2] = _mm512_add_ps(_mm512_sub_ps(s[7], s[12]), _mm512_add_ps(_mm512_mul_ps(_8, s[17]), _mm512_maskz_loadu_ps(tail, src + 22 * stride))); - t[3] = _mm512_add_ps(_mm512_sub_ps(s[8], s[13]), _mm512_add_ps(_mm512_mul_ps(_8, s[18]), _mm512_maskz_loadu_ps(tail, src + 23 * stride))); - t[4] = _mm512_add_ps(_mm512_sub_ps(s[9], s[14]), _mm512_add_ps(_mm512_mul_ps(_8, s[19]), _mm512_maskz_loadu_ps(tail, src + 24 * stride))); - WinogradKernel2x2Block4x4SetOutputSaveRow(t, _2, _4, _8, dst + 3 * dstS, dstC, tail); - } - - SIMD_INLINE void WinogradKernel2x2Block4x4SetOutput16t(const float* src, size_t srcStride, float* dst, size_t dstW, size_t dstC, size_t rowE) - { - size_t dstS = dstW * dstC, dstCF = AlignLo(dstC, F); - for (size_t d = 0; d < dstCF; d += F) - WinogradKernel2x2Block4x4SetOutput(src + d, srcStride, dst + d, dstS, dstC, rowE, __mmask16(-1)); - if (dstCF < dstC) - { - __mmask16 tail = TailMask16(dstC - dstCF); - WinogradKernel2x2Block4x4SetOutput(src + dstCF, srcStride, dst + dstCF, dstS, dstC, rowE, tail); - } - } - - 
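/* For reference, the rows stored by WinogradKernel2x2Block4x4SetOutputSaveRow()
   above realize the F(4x4, 2x2) output transform Y = At * S * At^T, where S is
   the 5x5 tile of accumulated Winograd products. A minimal scalar sketch of the
   same computation (illustrative only, not part of the library; the helper name
   is hypothetical): */
static void WinogradKernel2x2Block4x4SetOutputRef(const float S[5][5], float Y[4][4])
{
    /* Output transform matrix read off the vector arithmetic above. */
    static const float At[4][5] = {
        { 1.0f, 1.0f,  1.0f, 1.0f, 0.0f },
        { 0.0f, 1.0f, -1.0f, 2.0f, 0.0f },
        { 0.0f, 1.0f,  1.0f, 4.0f, 0.0f },
        { 0.0f, 1.0f, -1.0f, 8.0f, 1.0f } };
    float T[4][5];
    for (int i = 0; i < 4; ++i) /* T = At * S : transform tile rows */
        for (int j = 0; j < 5; ++j)
        {
            T[i][j] = 0.0f;
            for (int k = 0; k < 5; ++k)
                T[i][j] += At[i][k] * S[k][j];
        }
    for (int i = 0; i < 4; ++i) /* Y = T * At^T : transform tile columns */
        for (int j = 0; j < 4; ++j)
        {
            Y[i][j] = 0.0f;
            for (int k = 0; k < 5; ++k)
                Y[i][j] += T[i][k] * At[j][k];
        }
}
/* At has integer-only entries because all fractional scaling (the 1/2, 1/3 and
   1/6 factors) is folded into the filter transform above. */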
SIMD_INLINE void WinogradKernel2x2Block4x4SetOutputLoad25(const float* src, size_t stride, __m512* dst, __mmask16 tail = -1) - { - __m512 s[25], t[5]; - s[5] = _mm512_maskz_loadu_ps(tail, src + 5 * stride); - s[6] = _mm512_maskz_loadu_ps(tail, src + 6 * stride); - s[7] = _mm512_maskz_loadu_ps(tail, src + 7 * stride); - s[8] = _mm512_maskz_loadu_ps(tail, src + 8 * stride); - s[9] = _mm512_maskz_loadu_ps(tail, src + 9 * stride); - s[10] = _mm512_maskz_loadu_ps(tail, src + 10 * stride); - s[11] = _mm512_maskz_loadu_ps(tail, src + 11 * stride); - s[12] = _mm512_maskz_loadu_ps(tail, src + 12 * stride); - s[13] = _mm512_maskz_loadu_ps(tail, src + 13 * stride); - s[14] = _mm512_maskz_loadu_ps(tail, src + 14 * stride); - s[15] = _mm512_maskz_loadu_ps(tail, src + 15 * stride); - s[16] = _mm512_maskz_loadu_ps(tail, src + 16 * stride); - s[17] = _mm512_maskz_loadu_ps(tail, src + 17 * stride); - s[18] = _mm512_maskz_loadu_ps(tail, src + 18 * stride); - s[19] = _mm512_maskz_loadu_ps(tail, src + 19 * stride); - - const __m512 _2 = _mm512_set1_ps(2.0f); - const __m512 _4 = _mm512_set1_ps(4.0f); - const __m512 _8 = _mm512_set1_ps(8.0f); - t[0] = _mm512_add_ps(_mm512_add_ps(_mm512_maskz_loadu_ps(tail, src + 0 * stride), s[5]), _mm512_add_ps(s[10], s[15])); - t[1] = _mm512_add_ps(_mm512_add_ps(_mm512_maskz_loadu_ps(tail, src + 1 * stride), s[6]), _mm512_add_ps(s[11], s[16])); - t[2] = _mm512_add_ps(_mm512_add_ps(_mm512_maskz_loadu_ps(tail, src + 2 * stride), s[7]), _mm512_add_ps(s[12], s[17])); - t[3] = _mm512_add_ps(_mm512_add_ps(_mm512_maskz_loadu_ps(tail, src + 3 * stride), s[8]), _mm512_add_ps(s[13], s[18])); - t[4] = _mm512_add_ps(_mm512_add_ps(_mm512_maskz_loadu_ps(tail, src + 4 * stride), s[9]), _mm512_add_ps(s[14], s[19])); - WinogradKernel2x2Block4x4SetOutputGetRow(t, _2, _4, _8, dst + 0); - - t[0] = _mm512_add_ps(_mm512_sub_ps(s[5], s[10]), _mm512_mul_ps(_2, s[15])); - t[1] = _mm512_add_ps(_mm512_sub_ps(s[6], s[11]), _mm512_mul_ps(_2, s[16])); - t[2] = _mm512_add_ps(_mm512_sub_ps(s[7], s[12]), _mm512_mul_ps(_2, s[17])); - t[3] = _mm512_add_ps(_mm512_sub_ps(s[8], s[13]), _mm512_mul_ps(_2, s[18])); - t[4] = _mm512_add_ps(_mm512_sub_ps(s[9], s[14]), _mm512_mul_ps(_2, s[19])); - WinogradKernel2x2Block4x4SetOutputGetRow(t, _2, _4, _8, dst + 4); - - t[0] = _mm512_add_ps(_mm512_add_ps(s[5], s[10]), _mm512_mul_ps(_4, s[15])); - t[1] = _mm512_add_ps(_mm512_add_ps(s[6], s[11]), _mm512_mul_ps(_4, s[16])); - t[2] = _mm512_add_ps(_mm512_add_ps(s[7], s[12]), _mm512_mul_ps(_4, s[17])); - t[3] = _mm512_add_ps(_mm512_add_ps(s[8], s[13]), _mm512_mul_ps(_4, s[18])); - t[4] = _mm512_add_ps(_mm512_add_ps(s[9], s[14]), _mm512_mul_ps(_4, s[19])); - WinogradKernel2x2Block4x4SetOutputGetRow(t, _2, _4, _8, dst + 8); - - t[0] = _mm512_add_ps(_mm512_sub_ps(s[5], s[10]), _mm512_add_ps(_mm512_mul_ps(_8, s[15]), _mm512_maskz_loadu_ps(tail, src + 20 * stride))); - t[1] = _mm512_add_ps(_mm512_sub_ps(s[6], s[11]), _mm512_add_ps(_mm512_mul_ps(_8, s[16]), _mm512_maskz_loadu_ps(tail, src + 21 * stride))); - t[2] = _mm512_add_ps(_mm512_sub_ps(s[7], s[12]), _mm512_add_ps(_mm512_mul_ps(_8, s[17]), _mm512_maskz_loadu_ps(tail, src + 22 * stride))); - t[3] = _mm512_add_ps(_mm512_sub_ps(s[8], s[13]), _mm512_add_ps(_mm512_mul_ps(_8, s[18]), _mm512_maskz_loadu_ps(tail, src + 23 * stride))); - t[4] = _mm512_add_ps(_mm512_sub_ps(s[9], s[14]), _mm512_add_ps(_mm512_mul_ps(_8, s[19]), _mm512_maskz_loadu_ps(tail, src + 24 * stride))); - WinogradKernel2x2Block4x4SetOutputGetRow(t, _2, _4, _8, dst + 12); - } - - SIMD_INLINE void 
WinogradKernel2x2Block4x4SetOutputStore16(const __m512 src[16], float* dst, size_t dstS, size_t dstC, __mmask16 tail = -1) - { - _mm512_mask_storeu_ps(dst + 0 * dstS + 0 * dstC, tail, src[0]); - _mm512_mask_storeu_ps(dst + 0 * dstS + 1 * dstC, tail, src[1]); - _mm512_mask_storeu_ps(dst + 0 * dstS + 2 * dstC, tail, src[2]); - _mm512_mask_storeu_ps(dst + 0 * dstS + 3 * dstC, tail, src[3]); - _mm512_mask_storeu_ps(dst + 1 * dstS + 0 * dstC, tail, src[4]); - _mm512_mask_storeu_ps(dst + 1 * dstS + 1 * dstC, tail, src[5]); - _mm512_mask_storeu_ps(dst + 1 * dstS + 2 * dstC, tail, src[6]); - _mm512_mask_storeu_ps(dst + 1 * dstS + 3 * dstC, tail, src[7]); - _mm512_mask_storeu_ps(dst + 2 * dstS + 0 * dstC, tail, src[8]); - _mm512_mask_storeu_ps(dst + 2 * dstS + 1 * dstC, tail, src[9]); - _mm512_mask_storeu_ps(dst + 2 * dstS + 2 * dstC, tail, src[10]); - _mm512_mask_storeu_ps(dst + 2 * dstS + 3 * dstC, tail, src[11]); - _mm512_mask_storeu_ps(dst + 3 * dstS + 0 * dstC, tail, src[12]); - _mm512_mask_storeu_ps(dst + 3 * dstS + 1 * dstC, tail, src[13]); - _mm512_mask_storeu_ps(dst + 3 * dstS + 2 * dstC, tail, src[14]); - _mm512_mask_storeu_ps(dst + 3 * dstS + 3 * dstC, tail, src[15]); - } - - SIMD_INLINE void WinogradKernel2x2Block4x4SetOutputStore16(const __m512 src[16], float* dst, size_t dstS, size_t dstC, size_t rowE, size_t colE, __mmask16 tail = -1) - { - for (size_t row = 0; row < rowE; ++row) - for (size_t col = 0; col < colE; ++col) - _mm512_mask_storeu_ps(dst + row * dstS + col * dstC, tail, src[row * 4 + col]); - } - - SIMD_INLINE void WinogradKernel2x2Block4x4SetOutput16t(const float* src, size_t srcStride, float* dst, size_t dstW, size_t dstC, size_t rowE, size_t colE) - { - size_t dstS = dstW * dstC, dstCF = AlignLo(dstC, F); - for (size_t d = 0; d < dstCF; d += F) - { - __m512 tmp[16]; - WinogradKernel2x2Block4x4SetOutputLoad25(src + d, srcStride, tmp); - WinogradKernel2x2Block4x4SetOutputStore16(tmp, dst + d, dstS, dstC, rowE, colE); - } - if (dstCF < dstC) - { - __mmask16 tail = TailMask16(dstC - dstCF); - __m512 tmp[16]; - WinogradKernel2x2Block4x4SetOutputLoad25(src + dstCF, srcStride, tmp, tail); - WinogradKernel2x2Block4x4SetOutputStore16(tmp, dst + dstCF, dstS, dstC, rowE, colE, tail); - } - } - - void WinogradKernel2x2Block4x4SetOutput(const float* src, size_t srcStride, float* dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans) - { - if (trans ? 
false : true) - { - Base::WinogradKernel2x2Block4x4SetOutput(src, srcStride, dst, dstChannels, dstHeight, dstWidth, trans); - return; - } - size_t tileH = (dstHeight + 3) / 4; - size_t tileW = (dstWidth + 3) / 4; - size_t dstH4 = AlignLo(dstHeight, 4); - size_t dstW4 = AlignLo(dstWidth, 4); - size_t row, col; - for (row = 0; row < dstH4; row += 4) - { - for (col = 0; col < dstW4; col += 4) - WinogradKernel2x2Block4x4SetOutput16t(src, srcStride, dst + (row * dstWidth + col) * dstChannels, dstWidth, dstChannels, 4), src += dstChannels; - if (col < dstWidth) - WinogradKernel2x2Block4x4SetOutput16t(src, srcStride, dst + (row * dstWidth + col) * dstChannels, dstWidth, dstChannels, 4, dstWidth - col), src += dstChannels; - } - if (row < dstHeight) - { - for (col = 0; col < dstW4; col += 4) - WinogradKernel2x2Block4x4SetOutput16t(src, srcStride, dst + (row * dstWidth + col) * dstChannels, dstWidth, dstChannels, dstHeight - row), src += dstChannels; - if (col < dstWidth) - WinogradKernel2x2Block4x4SetOutput16t(src, srcStride, dst + (row * dstWidth + col) * dstChannels, dstWidth, dstChannels, dstHeight - row, dstWidth - col), src += dstChannels; - } - } - - //----------------------------------------------------------------------- - - SIMD_INLINE void WinogradKernel3x3Block2x2SetFilter16t(const float * src, float * dst, size_t stride, __mmask16 tail = -1) - { - const __m512 r2 = _mm512_set1_ps(1.0f / 2.0f); - const __m512 r4 = _mm512_set1_ps(1.0f / 4.0f); - - __m512 s[9]; - s[0] = _mm512_maskz_loadu_ps(tail, src + 0 * stride); - s[1] = _mm512_maskz_loadu_ps(tail, src + 1 * stride); - s[2] = _mm512_maskz_loadu_ps(tail, src + 2 * stride); - s[3] = _mm512_maskz_loadu_ps(tail, src + 3 * stride); - s[4] = _mm512_maskz_loadu_ps(tail, src + 4 * stride); - s[5] = _mm512_maskz_loadu_ps(tail, src + 5 * stride); - s[6] = _mm512_maskz_loadu_ps(tail, src + 6 * stride); - s[7] = _mm512_maskz_loadu_ps(tail, src + 7 * stride); - s[8] = _mm512_maskz_loadu_ps(tail, src + 8 * stride); - - _mm512_mask_storeu_ps(dst + 0 * stride, tail, s[0]); - __m512 _0a2 = _mm512_add_ps(s[0], s[2]); - _mm512_mask_storeu_ps(dst + 1 * stride, tail, _mm512_mul_ps(_mm512_add_ps(_0a2, s[1]), r2)); - _mm512_mask_storeu_ps(dst + 2 * stride, tail, _mm512_mul_ps(_mm512_sub_ps(_0a2, s[1]), r2)); - _mm512_mask_storeu_ps(dst + 3 * stride, tail, s[2]); - - __m512 _0a6a3 = _mm512_add_ps(_mm512_add_ps(s[0], s[6]), s[3]); - _mm512_mask_storeu_ps(dst + 4 * stride, tail, _mm512_mul_ps(_0a6a3, r2)); - __m512 _2a8a5 = _mm512_add_ps(_mm512_add_ps(s[2], s[8]), s[5]); - __m512 _1a7a4 = _mm512_add_ps(_mm512_add_ps(s[1], s[7]), s[4]); - _mm512_mask_storeu_ps(dst + 5 * stride, tail, _mm512_mul_ps(_mm512_add_ps(_mm512_add_ps(_0a6a3, _2a8a5), _1a7a4), r4)); - _mm512_mask_storeu_ps(dst + 6 * stride, tail, _mm512_mul_ps(_mm512_sub_ps(_mm512_add_ps(_0a6a3, _2a8a5), _1a7a4), r4)); - _mm512_mask_storeu_ps(dst + 7 * stride, tail, _mm512_mul_ps(_2a8a5, r2)); - - __m512 _0a6s3 = _mm512_sub_ps(_mm512_add_ps(s[0], s[6]), s[3]); - _mm512_mask_storeu_ps(dst + 8 * stride, tail, _mm512_mul_ps(_0a6s3, r2)); - __m512 _2a8s5 = _mm512_sub_ps(_mm512_add_ps(s[2], s[8]), s[5]); - __m512 _1a7s4 = _mm512_sub_ps(_mm512_add_ps(s[1], s[7]), s[4]); - _mm512_mask_storeu_ps(dst + 9 * stride, tail, _mm512_mul_ps(_mm512_add_ps(_mm512_add_ps(_0a6s3, _2a8s5), _1a7s4), r4)); - _mm512_mask_storeu_ps(dst + 10 * stride, tail, _mm512_mul_ps(_mm512_sub_ps(_mm512_add_ps(_0a6s3, _2a8s5), _1a7s4), r4)); - _mm512_mask_storeu_ps(dst + 11 * stride, tail, _mm512_mul_ps(_2a8s5, r2)); - - 
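/* Rows 0..11 above and 12..15 below apply the F(2x2, 3x3) filter transform
   dst = G * g * G^T to the 3x3 kernel g held in s[0..8], with
       G = |  1    0    0   |
           |  1/2  1/2  1/2 |
           |  1/2 -1/2  1/2 |
           |  0    0    1   |
   r2 and r4 are the precomputed factors 1/2 and 1/4. */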
_mm512_mask_storeu_ps(dst + 12 * stride, tail, s[6]);
- __m512 _6a8 = _mm512_add_ps(s[6], s[8]);
- _mm512_mask_storeu_ps(dst + 13 * stride, tail, _mm512_mul_ps(_mm512_add_ps(_6a8, s[7]), r2));
- _mm512_mask_storeu_ps(dst + 14 * stride, tail, _mm512_mul_ps(_mm512_sub_ps(_6a8, s[7]), r2));
- _mm512_mask_storeu_ps(dst + 15 * stride, tail, s[8]);
- }
-
- void WinogradKernel3x3Block2x2SetFilter(const float * src, size_t size, float * dst, SimdBool trans)
- {
- if (trans)
- {
- size_t sizeF = AlignLo(size, F), i = 0;
- for (; i < sizeF; i += F)
- WinogradKernel3x3Block2x2SetFilter16t(src + i, dst + i, size);
- if (i < size)
- {
- __mmask16 tail = TailMask16(size - sizeF);
- WinogradKernel3x3Block2x2SetFilter16t(src + i, dst + i, size, tail);
- }
- }
- else
- {
- Sse::WinogradKernel3x3Block2x2SetFilter(src, size, dst, trans);
- }
- }
-
- //-----------------------------------------------------------------------
-
- template<bool mask> SIMD_INLINE void WinogradKernel3x3Block2x2SetInputLoad16n(const float * src, __m512 * dst, const __mmask16 * tails)
- {
- __m512 a0 = Load<false, mask>(src + 0, tails[0]);
- __m512 a1 = Load<false, mask>(src + 2, tails[1]);
- __m512 a2 = Load<false, mask>(src + 16, tails[2]);
- __m512 a3 = Load<false, mask>(src + 18, tails[3]);
- dst[0] = Deinterleave<0>(a0, a2);
- dst[1] = Deinterleave<1>(a0, a2);
- dst[2] = Deinterleave<0>(a1, a3);
- dst[3] = Deinterleave<1>(a1, a3);
- }
-
- SIMD_INLINE void WinogradKernel3x3Block2x2SetInputLoad16z(__m512 * dst)
- {
- dst[0] = _mm512_setzero_ps();
- dst[1] = _mm512_setzero_ps();
- dst[2] = _mm512_setzero_ps();
- dst[3] = _mm512_setzero_ps();
- }
-
- template<bool mask> SIMD_INLINE void WinogradKernel3x3Block2x2SetInput16Store(const __m512 * src, float * dst, size_t stride, __mmask16 tail = -1)
- {
- Store<false, mask>(dst + 0 * stride, _mm512_sub_ps(_mm512_sub_ps(src[0], src[8]), _mm512_sub_ps(src[2], src[10])), tail);
- Store<false, mask>(dst + 1 * stride, _mm512_add_ps(_mm512_sub_ps(src[1], src[9]), _mm512_sub_ps(src[2], src[10])), tail);
- Store<false, mask>(dst + 2 * stride, _mm512_sub_ps(_mm512_sub_ps(src[2], src[10]), _mm512_sub_ps(src[1], src[9])), tail);
- Store<false, mask>(dst + 3 * stride, _mm512_sub_ps(_mm512_sub_ps(src[1], src[9]), _mm512_sub_ps(src[3], src[11])), tail);
- Store<false, mask>(dst + 4 * stride, _mm512_sub_ps(_mm512_add_ps(src[4], src[8]), _mm512_add_ps(src[6], src[10])), tail);
- Store<false, mask>(dst + 5 * stride, _mm512_add_ps(_mm512_add_ps(src[5], src[9]), _mm512_add_ps(src[6], src[10])), tail);
- Store<false, mask>(dst + 6 * stride, _mm512_sub_ps(_mm512_add_ps(src[6], src[10]), _mm512_add_ps(src[5], src[9])), tail);
- Store<false, mask>(dst + 7 * stride, _mm512_sub_ps(_mm512_add_ps(src[5], src[9]), _mm512_add_ps(src[7], src[11])), tail);
- Store<false, mask>(dst + 8 * stride, _mm512_sub_ps(_mm512_sub_ps(src[8], src[4]), _mm512_sub_ps(src[10], src[6])), tail);
- Store<false, mask>(dst + 9 * stride, _mm512_add_ps(_mm512_sub_ps(src[9], src[5]), _mm512_sub_ps(src[10], src[6])), tail);
- Store<false, mask>(dst + 10 * stride, _mm512_sub_ps(_mm512_sub_ps(src[10], src[6]), _mm512_sub_ps(src[9], src[5])), tail);
- Store<false, mask>(dst + 11 * stride, _mm512_sub_ps(_mm512_sub_ps(src[9], src[5]), _mm512_sub_ps(src[11], src[7])), tail);
- Store<false, mask>(dst + 12 * stride, _mm512_sub_ps(_mm512_sub_ps(src[4], src[12]), _mm512_sub_ps(src[6], src[14])), tail);
- Store<false, mask>(dst + 13 * stride, _mm512_add_ps(_mm512_sub_ps(src[5], src[13]), _mm512_sub_ps(src[6], src[14])), tail);
- Store<false, mask>(dst + 14 * stride, _mm512_sub_ps(_mm512_sub_ps(src[6], src[14]), _mm512_sub_ps(src[5], src[13])), tail);
- Store<false, mask>(dst + 15 * stride, _mm512_sub_ps(_mm512_sub_ps(src[5], src[13]), _mm512_sub_ps(src[7], src[15])), tail);
- }
-
- SIMD_INLINE void WinogradKernel3x3Block2x2SetInput16n(const float * src, size_t srcStride, float * dst, size_t dstStride, const __mmask16 * tails)
- {
- __m512 t[16];
- WinogradKernel3x3Block2x2SetInputLoad16n<false>(src + 0 * srcStride, t + 0, tails);
- WinogradKernel3x3Block2x2SetInputLoad16n<false>(src + 1 * srcStride, t + 4, tails);
- WinogradKernel3x3Block2x2SetInputLoad16n<false>(src + 2 * srcStride, t + 8, tails);
- WinogradKernel3x3Block2x2SetInputLoad16n<false>(src + 3 * srcStride, t + 12, tails);
- WinogradKernel3x3Block2x2SetInput16Store<false>(t, dst, dstStride, tails[4]);
- }
-
- template<bool mask> SIMD_INLINE void WinogradKernel3x3Block2x2SetInput16n(const float * src, size_t srcStride, PadType rowPad, float * dst, size_t dstStride, const __mmask16 * tails)
- {
- __m512 t[16];
- if (rowPad == PadNose1)
- WinogradKernel3x3Block2x2SetInputLoad16z(t + 0);
- else
- WinogradKernel3x3Block2x2SetInputLoad16n<mask>(src + 0 * srcStride, t + 0, tails);
- WinogradKernel3x3Block2x2SetInputLoad16n<mask>(src + 1 * srcStride, t + 4, tails);
- if (rowPad == PadTail2)
- WinogradKernel3x3Block2x2SetInputLoad16z(t + 8);
- else
- WinogradKernel3x3Block2x2SetInputLoad16n<mask>(src + 2 * srcStride, t + 8, tails);
- if (rowPad >= PadTail1)
- WinogradKernel3x3Block2x2SetInputLoad16z(t + 12);
- else
- WinogradKernel3x3Block2x2SetInputLoad16n<mask>(src + 3 * srcStride, t + 12, tails);
- WinogradKernel3x3Block2x2SetInput16Store<mask>(t, dst, dstStride, tails[4]);
- }
-
- SIMD_INLINE void WinogradKernel3x3Block2x2SetInput16t(const float * src, size_t srcS, size_t srcC, __m512 dst[16], __mmask16 tail = -1)
- {
- dst[0] = _mm512_maskz_loadu_ps(tail, src + 0 * srcS + 0 * srcC);
- dst[1] = _mm512_maskz_loadu_ps(tail, src + 0 * srcS + 1 * srcC);
- dst[2] = _mm512_maskz_loadu_ps(tail, src + 0 * srcS + 2 * srcC);
- dst[3] = _mm512_maskz_loadu_ps(tail, src + 0 * srcS + 3 * srcC);
- dst[4] = _mm512_maskz_loadu_ps(tail, src + 1 * srcS + 0 * srcC);
- dst[5] = _mm512_maskz_loadu_ps(tail, src + 1 * srcS + 1 * srcC);
- dst[6] = _mm512_maskz_loadu_ps(tail, src + 1 * srcS + 2 * srcC);
- dst[7] = _mm512_maskz_loadu_ps(tail, src + 1 * srcS + 3 * srcC);
- dst[8] = _mm512_maskz_loadu_ps(tail, src + 2 * srcS + 0 * srcC);
- dst[9] = _mm512_maskz_loadu_ps(tail, src + 2 * srcS + 1 * srcC);
- dst[10] = _mm512_maskz_loadu_ps(tail, src + 2 * srcS + 2 * srcC);
- dst[11] = _mm512_maskz_loadu_ps(tail, src + 2 * srcS + 3 * srcC);
- dst[12] = _mm512_maskz_loadu_ps(tail, src + 3 * srcS + 0 * srcC);
- dst[13] = _mm512_maskz_loadu_ps(tail, src + 3 * srcS + 1 * srcC);
- dst[14] = _mm512_maskz_loadu_ps(tail, src + 3 * srcS + 2 * srcC);
- dst[15] = _mm512_maskz_loadu_ps(tail, src + 3 * srcS + 3 * srcC);
- }
-
- SIMD_INLINE void WinogradKernel3x3Block2x2SetInput16t(const float * src, size_t srcW, size_t srcC, float * dst, size_t dstStride)
- {
- size_t srcS = srcW * srcC, srcCF = AlignLo(srcC, F), c = 0;
- for (; c < srcCF; c += F)
- {
- __m512 tmp[16];
- WinogradKernel3x3Block2x2SetInput16t(src + c, srcS, srcC, tmp);
- WinogradKernel3x3Block2x2SetInput16Store<false>(tmp, dst + c, dstStride);
- }
- if (c < srcC)
- {
- __mmask16 tail = TailMask16(srcC - c);
- __m512 tmp[16];
- WinogradKernel3x3Block2x2SetInput16t(src + c, srcS, srcC, tmp, tail);
- WinogradKernel3x3Block2x2SetInput16Store<true>(tmp, dst + c, dstStride, tail);
- }
- }
-
- SIMD_INLINE void WinogradKernel3x3Block2x2SetInput16t(const float * src, size_t srcS, size_t srcC, size_t rowB, size_t rowE, size_t colB, size_t colE, __m512 * dst, __mmask16 tail = -1)
- {
- for (size_t row = 0; row < rowB; ++row)
- {
- for (size_t col = 0; col < 4; ++col)
- dst[col] = _mm512_setzero_ps();
- dst += 4;
- }
- for (size_t row = rowB; row < rowE; ++row)
- {
- for (size_t col = 0; col < colB; ++col)
- dst[col] = _mm512_setzero_ps();
- for (size_t col = colB; col < colE; ++col)
- dst[col] = _mm512_maskz_loadu_ps(tail, src + row * srcS + col * srcC);
- for (size_t col = colE; col < 4; ++col)
- dst[col] = _mm512_setzero_ps();
- dst += 4;
- }
- for (size_t row = rowE; row < 4; ++row)
- {
- for (size_t col = 0; col < 4; ++col)
- dst[col] = _mm512_setzero_ps();
- dst += 4;
- }
- }
-
- SIMD_INLINE void WinogradKernel3x3Block2x2SetInput16t(const float * src, size_t srcW, size_t srcC, size_t rowB, size_t rowE, size_t colB, size_t colE, float * dst, size_t dstStride)
- {
- size_t srcS = srcW * srcC, srcCF = AlignLo(srcC, F), c = 0;
- for (; c < srcCF; c += F)
- {
- __m512 tmp[16];
- WinogradKernel3x3Block2x2SetInput16t(src + c, srcS, srcC, rowB, rowE, colB, colE, tmp);
- WinogradKernel3x3Block2x2SetInput16Store<false>(tmp, dst + c, dstStride);
- }
- if (c < srcC)
- {
- __mmask16 tail = TailMask16(srcC - c);
- __m512 tmp[16];
- WinogradKernel3x3Block2x2SetInput16t(src + c, srcS, srcC, rowB, rowE, colB, colE, tmp, tail);
- WinogradKernel3x3Block2x2SetInput16Store<true>(tmp, dst + c, dstStride, tail);
- }
- }
-
- void WinogradKernel3x3Block2x2SetInput(const float* src, size_t srcChannels, size_t srcHeight, size_t srcWidth,
- size_t padY, size_t padX, size_t padH, size_t padW, float* dst, size_t dstStride, SimdBool trans)
- {
- assert(padY == padX && padY == padH && padY == padW && (padY == 0 || padY == 1));
- SimdBool pad = padY > 0 ? SimdTrue : SimdFalse;
- if (trans ? (false) : (srcHeight < 4 || srcWidth < 4))
- {
- Avx::WinogradKernel3x3Block2x2SetInput(src, srcChannels, srcHeight, srcWidth, padY, padX, padH, padW, dst, dstStride, trans);
- return;
- }
- size_t dstH = pad ? srcHeight : srcHeight - 2;
- size_t dstW = pad ? srcWidth : srcWidth - 2;
- size_t tileH = (dstH + 1) / 2;
- size_t tileW = (dstW + 1) / 2;
- size_t dstH2 = AlignLo(dstH, 2);
- size_t dstW2 = AlignLo(dstW, 2);
- if (trans)
- {
- size_t noseW = Simd::Min<size_t>(4, dstW + 1);
- size_t noseH = Simd::Min<size_t>(4, dstH + 1);
- size_t start = pad ? 2 : 0;
- if (pad)
- {
- if (dstH == dstH2)
- dstH2 -= 2;
- if (dstW == dstW2)
- dstW2 -= 2;
- src -= (srcWidth + 1)*srcChannels;
- }
- size_t tailW = dstW - dstW2 + (pad ?
1 : 2); - size_t row = 0, col = 0; - if (pad) - { - if (pad) - WinogradKernel3x3Block2x2SetInput16t(src, srcWidth, srcChannels, 1, noseH, 1, noseW, dst, dstStride), dst += srcChannels; - for (col = start; col < dstW2; col += 2) - WinogradKernel3x3Block2x2SetInput16t(src + col * srcChannels, srcWidth, srcChannels, 1, noseH, 0, 4, dst, dstStride), dst += srcChannels; - if (col < dstW) - WinogradKernel3x3Block2x2SetInput16t(src + col * srcChannels, srcWidth, srcChannels, 1, noseH, 0, tailW, dst, dstStride), dst += srcChannels; - } - for (row = start; row < dstH2; row += 2) - { - if (pad) - WinogradKernel3x3Block2x2SetInput16t(src + row * srcWidth * srcChannels, srcWidth, srcChannels, 0, 4, 1, noseW, dst, dstStride), dst += srcChannels; - for (col = start; col < dstW2; col += 2) - WinogradKernel3x3Block2x2SetInput16t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, dst, dstStride), dst += srcChannels; - if (col < dstW) - WinogradKernel3x3Block2x2SetInput16t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, 0, 4, 0, tailW, dst, dstStride), dst += srcChannels; - } - if (row < dstH) - { - if (pad) - WinogradKernel3x3Block2x2SetInput16t(src + row * srcWidth* srcChannels, srcWidth, srcChannels, 0, tailH, 1, noseW, dst, dstStride), dst += srcChannels; - for (col = start; col < dstW2; col += 2) - WinogradKernel3x3Block2x2SetInput16t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, 0, tailH, 0, 4, dst, dstStride), dst += srcChannels; - if (col < dstW) - WinogradKernel3x3Block2x2SetInput16t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, 0, tailH, 0, tailW, dst, dstStride), dst += srcChannels; - } - } - else - { - size_t dstW32 = AlignLo(dstW, 32); - if (pad && dstW32 == dstW) - dstW32 -= 32; - PadType rowPad = dstH2 < dstH ? PadTail1 : PadNone; - size_t tailRow = dstH2 < dstH ? dstH - 1 : dstH - 2; - bool specialRowTail = dstH2 < dstH || (pad && dstH2); - bool specialColTail = pad ? dstW32 : (dstW32 < dstW); - - __mmask16 tails[5], noses[5]; - for (size_t c = 0; c < 2; ++c) - { - noses[c * 2 + 0] = TailMask16(dstW - F * c - 0 + (pad ? 1 : 2)); - noses[c * 2 + 1] = TailMask16(dstW - F * c - 2 + (pad ? 1 : 2)); - tails[c * 2 + 0] = TailMask16(dstW - dstW32 - F * c - 0 + (pad ? 1 : 2)); - tails[c * 2 + 1] = TailMask16(dstW - dstW32 - F * c - 2 + (pad ? 1 : 2)); - } - noses[4] = TailMask16(tileW); - tails[4] = TailMask16(tileW - dstW32 / 2); - - if (pad) - { - src -= srcWidth + 1; - rowPad = dstH2 < dstH ? 
PadTail2 : PadTail1;
- noses[0] = noses[0] & (~1);
- if (dstH2 == dstH)
- dstH2 -= 2;
- }
- for (size_t c = 0; c < srcChannels; ++c)
- {
- size_t row = 0, tileY = 0;
- if (pad)
- {
- size_t col = 0, tileX = 0;
- const float * s = src + row * srcWidth;
- float * d = dst + tileY * tileW;
- if (pad)
- WinogradKernel3x3Block2x2SetInput16n<true>(s + col, srcWidth, PadNose1, d + tileX, dstStride, noses), col += 32, tileX += 16;
- for (; col < dstW32; col += 32, tileX += 16)
- WinogradKernel3x3Block2x2SetInput16n<false>(s + col, srcWidth, PadNose1, d + tileX, dstStride, tails);
- if (specialColTail)
- WinogradKernel3x3Block2x2SetInput16n<true>(s + col, srcWidth, PadNose1, d + tileX, dstStride, tails);
- row += 2, tileY += 1;
- }
- for (; row < dstH2; row += 2, tileY += 1)
- {
- size_t col = 0, tileX = 0;
- const float * s = src + row * srcWidth;
- float * d = dst + tileY * tileW;
- if (pad)
- WinogradKernel3x3Block2x2SetInput16n<true>(s + col, srcWidth, PadNone, d + tileX, dstStride, noses), col += 32, tileX += 16;
- for (; col < dstW32; col += 32, tileX += 16)
- WinogradKernel3x3Block2x2SetInput16n(s + col, srcWidth, d + tileX, dstStride, tails);
- if (specialColTail)
- WinogradKernel3x3Block2x2SetInput16n<true>(s + col, srcWidth, PadNone, d + tileX, dstStride, tails);
- }
- if (specialRowTail)
- {
- size_t col = 0, tileX = 0;
- const float * s = src + tailRow * srcWidth;
- float * d = dst + (tileH - 1) * tileW;
- if (pad)
- WinogradKernel3x3Block2x2SetInput16n<true>(s + col, srcWidth, rowPad, d + tileX, dstStride, noses), col += 32, tileX += 16;
- for (; col < dstW32; col += 32, tileX += 16)
- WinogradKernel3x3Block2x2SetInput16n<false>(s + col, srcWidth, rowPad, d + tileX, dstStride, tails);
- if (specialColTail)
- WinogradKernel3x3Block2x2SetInput16n<true>(s + col, srcWidth, rowPad, d + tileX, dstStride, tails);
- }
- src += srcWidth * srcHeight;
- dst += tileW * tileH;
- }
- }
- }
-
- //-----------------------------------------------------------------------
-
- template<bool mask> SIMD_INLINE void WinogradKernel3x3Block2x2SetOutputLoad4(const float * src, size_t srcStride, __m512 * dst, __mmask16 tail)
- {
- __m512 s0 = Load<false, mask>(src + 0 * srcStride, tail);
- __m512 s1 = Load<false, mask>(src + 1 * srcStride, tail);
- __m512 s2 = Load<false, mask>(src + 2 * srcStride, tail);
- __m512 s3 = Load<false, mask>(src + 3 * srcStride, tail);
- dst[0] = _mm512_add_ps(_mm512_add_ps(s0, s1), s2);
- dst[1] = _mm512_sub_ps(_mm512_sub_ps(s1, s2), s3);
- }
-
- template<bool main, bool mask> SIMD_INLINE void WinogradKernel3x3Block2x2SetOutput16n(const float * src, size_t srcStride, float * dst, size_t dstStride, const __mmask16 * tails)
- {
- __m512 t[8], d[4];
- WinogradKernel3x3Block2x2SetOutputLoad4<mask>(src + 0 * srcStride, srcStride, t + 0, tails[0]);
- WinogradKernel3x3Block2x2SetOutputLoad4<mask>(src + 4 * srcStride, srcStride, t + 2, tails[0]);
- WinogradKernel3x3Block2x2SetOutputLoad4<mask>(src + 8 * srcStride, srcStride, t + 4, tails[0]);
- d[0] = _mm512_add_ps(_mm512_add_ps(t[0], t[2]), t[4]);
- d[1] = _mm512_add_ps(_mm512_add_ps(t[1], t[3]), t[5]);
- Store<false, mask>(dst + 0, Interleave<0>(d[0], d[1]), tails[1]);
- Store<false, mask>(dst + F, Interleave<1>(d[0], d[1]), tails[2]);
- if (main)
- {
- dst += dstStride;
- WinogradKernel3x3Block2x2SetOutputLoad4<mask>(src + 12 * srcStride, srcStride, t + 6, tails[0]);
- d[2] = _mm512_sub_ps(_mm512_sub_ps(t[2], t[4]), t[6]);
- d[3] = _mm512_sub_ps(_mm512_sub_ps(t[3], t[5]), t[7]);
- Store<false, mask>(dst + 0, Interleave<0>(d[2], d[3]), tails[1]);
- Store<false, mask>(dst + F, Interleave<1>(d[2], d[3]), tails[2]);
- }
- }
-
- SIMD_INLINE void WinogradKernel3x3Block2x2SetOutputLoad16(const float * src, size_t stride, __m512 * dst, __mmask16 tail = -1)
- {
- __m512 tmp[8];
- WinogradKernel3x3Block2x2SetOutputLoad4<true>(src + 0 * stride, stride, tmp + 0, tail);
- WinogradKernel3x3Block2x2SetOutputLoad4<true>(src + 4 * stride, stride, tmp + 2, tail);
- WinogradKernel3x3Block2x2SetOutputLoad4<true>(src + 8 * stride, stride, tmp + 4, tail);
- WinogradKernel3x3Block2x2SetOutputLoad4<true>(src + 12 * stride, stride, tmp + 6, tail);
- dst[0] = _mm512_add_ps(_mm512_add_ps(tmp[0], tmp[2]), tmp[4]);
- dst[1] = _mm512_add_ps(_mm512_add_ps(tmp[1], tmp[3]), tmp[5]);
- dst[2] = _mm512_sub_ps(_mm512_sub_ps(tmp[2], tmp[4]), tmp[6]);
- dst[3] = _mm512_sub_ps(_mm512_sub_ps(tmp[3], tmp[5]), tmp[7]);
- }
-
- SIMD_INLINE void WinogradKernel3x3Block2x2SetOutputStore4(const __m512 src[4], float * dst, size_t dstS, size_t dstC, __mmask16 tail = -1)
- {
- _mm512_mask_storeu_ps(dst + 0 * dstS + 0 * dstC, tail, src[0]);
- _mm512_mask_storeu_ps(dst + 0 * dstS + 1 * dstC, tail, src[1]);
- _mm512_mask_storeu_ps(dst + 1 * dstS + 0 * dstC, tail, src[2]);
- _mm512_mask_storeu_ps(dst + 1 * dstS + 1 * dstC, tail, src[3]);
- }
-
- SIMD_INLINE void WinogradKernel3x3Block2x2SetOutput16t(const float * src, size_t srcStride, float * dst, size_t dstW, size_t dstC)
- {
- size_t dstS = dstW * dstC, dstCF = AlignLo(dstC, F), d = 0;
- for (; d < dstCF; d += F)
- {
- __m512 tmp[4];
- WinogradKernel3x3Block2x2SetOutputLoad16(src + d, srcStride, tmp);
- WinogradKernel3x3Block2x2SetOutputStore4(tmp, dst + d, dstS, dstC);
- }
- if (d < dstC)
- {
- __mmask16 tail = TailMask16(dstC - dstCF);
- __m512 tmp[4];
- WinogradKernel3x3Block2x2SetOutputLoad16(src + d, srcStride, tmp, tail);
- WinogradKernel3x3Block2x2SetOutputStore4(tmp, dst + d, dstS, dstC, tail);
- }
- }
-
- SIMD_INLINE void WinogradKernel3x3Block2x2SetOutputStore4(const __m512 src[4], float * dst, size_t dstS, size_t dstC, size_t rowE, size_t colE, __mmask16 tail = -1)
- {
- for (size_t row = 0; row < rowE; ++row)
- for (size_t col = 0; col < colE; ++col)
- _mm512_mask_storeu_ps(dst + row * dstS + col * dstC, tail, src[row * 2 + col]);
- }
-
- SIMD_INLINE void WinogradKernel3x3Block2x2SetOutput16t(const float * src, size_t srcStride, float * dst, size_t dstW, size_t dstC, size_t rowE, size_t colE)
- {
- size_t dstS = dstW * dstC, dstCF = AlignLo(dstC, F), d = 0;
- for (; d < dstCF; d += F)
- {
- __m512 tmp[4];
- WinogradKernel3x3Block2x2SetOutputLoad16(src + d, srcStride, tmp);
- WinogradKernel3x3Block2x2SetOutputStore4(tmp, dst + d, dstS, dstC, rowE, colE);
- }
- if (d < dstC)
- {
- __mmask16 tail = TailMask16(dstC - dstCF);
- __m512 tmp[4];
- WinogradKernel3x3Block2x2SetOutputLoad16(src + d, srcStride, tmp, tail);
- WinogradKernel3x3Block2x2SetOutputStore4(tmp, dst + d, dstS, dstC, rowE, colE, tail);
- }
- }
-
- void WinogradKernel3x3Block2x2SetOutput(const float * src, size_t srcStride, float * dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans)
- {
- size_t tileH = (dstHeight + 1) / 2;
- size_t tileW = (dstWidth + 1) / 2;
- size_t dstH2 = AlignLo(dstHeight, 2);
- size_t dstW2 = AlignLo(dstWidth, 2);
- if (trans)
- {
- size_t row, col;
- for (row = 0; row < dstH2; row += 2)
- {
- for (col = 0; col < dstW2; col += 2)
- WinogradKernel3x3Block2x2SetOutput16t(src, srcStride, dst + (row * dstWidth + col)*dstChannels, dstWidth, dstChannels), src += dstChannels;
- if (col < dstWidth)
- WinogradKernel3x3Block2x2SetOutput16t(src, srcStride, dst + (row * dstWidth + col)*dstChannels, dstWidth, dstChannels, 2, dstWidth - col), src += dstChannels;
- }
- if (row < dstHeight)
- {
- for (col = 0; col < dstW2; col += 2)
- WinogradKernel3x3Block2x2SetOutput16t(src, srcStride, dst + (row * dstWidth + col)*dstChannels, dstWidth, dstChannels, dstHeight - row, 2), src += dstChannels;
- if (col < dstWidth)
- WinogradKernel3x3Block2x2SetOutput16t(src, srcStride, dst + (row * dstWidth + col)*dstChannels, dstWidth, dstChannels, dstHeight - row, dstWidth - col), src += dstChannels;
- }
- }
- else
- {
- size_t dstW32 = AlignLo(dstWidth, 32);
- __mmask16 tails[3];
- tails[0] = TailMask16(tileW - dstW32 / 2);
- for (size_t c = 0; c < 2; ++c)
- tails[1 + c] = TailMask16(dstWidth - dstW32 - F * c);
- for (size_t c = 0; c < dstChannels; ++c)
- {
- size_t row = 0, tileY = 0;
- for (; row < dstH2; row += 2, tileY += 1)
- {
- size_t col = 0, tileX = 0;
- const float * s = src + tileY * tileW;
- float * d = dst + row * dstWidth;
- for (; col < dstW32; col += 32, tileX += 16)
- WinogradKernel3x3Block2x2SetOutput16n<true, false>(s + tileX, srcStride, d + col, dstWidth, tails);
- if (col < dstWidth)
- WinogradKernel3x3Block2x2SetOutput16n<true, true>(s + tileX, srcStride, d + col, dstWidth, tails);
- }
- if (row < dstHeight)
- {
- size_t col = 0, tileX = 0;
- const float * s = src + tileY * tileW;
- float * d = dst + row * dstWidth;
- for (col = 0; col < dstW32; col += 32, tileX += 16)
- WinogradKernel3x3Block2x2SetOutput16n<false, false>(s + tileX, srcStride, d + col, dstWidth, tails);
- if (col < dstWidth)
- WinogradKernel3x3Block2x2SetOutput16n<false, true>(s + tileX, srcStride, d + col, dstWidth, tails);
- }
- src += tileW * tileH;
- dst += dstHeight * dstWidth;
- }
- }
- }
-
- //-----------------------------------------------------------------------
-
- SIMD_INLINE void WinogradKernel3x3Block3x3SetFilter16Row(const __m512 * t, float * dst, size_t stride, __mmask16 tail)
- {
- const __m512 r6 = _mm512_set1_ps(1.0f / 6.0f);
- const __m512 r3 = _mm512_set1_ps(1.0f / 3.0f);
- const __m512 r2 = _mm512_set1_ps(1.0f / 2.0f);
- const __m512 f2_3 = _mm512_set1_ps(2.0f / 3.0f);
- const __m512 mr2 = _mm512_set1_ps(-1.0f / 2.0f);
-
- _mm512_mask_storeu_ps(dst + 0 * stride, tail, _mm512_mul_ps(r2, t[0]));
- __m512 t0 = _mm512_add_ps(t[0], t[2]);
- _mm512_mask_storeu_ps(dst + 1 * stride, tail, _mm512_mul_ps(mr2, _mm512_add_ps(t0, t[1])));
- _mm512_mask_storeu_ps(dst + 2 * stride, tail, _mm512_mul_ps(r6, _mm512_sub_ps(t[1], t0)));
- _mm512_mask_storeu_ps(dst + 3 * stride, tail, _mm512_add_ps(_mm512_mul_ps(r6, t[0]), _mm512_add_ps(_mm512_mul_ps(r3, t[1]), _mm512_mul_ps(f2_3, t[2]))));
- _mm512_mask_storeu_ps(dst + 4 * stride, tail, t[2]);
- }
-
- SIMD_INLINE void WinogradKernel3x3Block3x3SetFilter16All(const __m512 * s, float * dst, size_t stride, __mmask16 tail)
- {
- const __m512 r6 = _mm512_set1_ps(1.0f / 6.0f);
- const __m512 r3 = _mm512_set1_ps(1.0f / 3.0f);
- const __m512 r2 = _mm512_set1_ps(1.0f / 2.0f);
- const __m512 f2_3 = _mm512_set1_ps(2.0f / 3.0f);
- const __m512 mr2 = _mm512_set1_ps(-1.0f / 2.0f);
-
- __m512 t[3];
- t[0] = _mm512_mul_ps(r2, s[0]);
- t[1] = _mm512_mul_ps(r2, s[1]);
- t[2] = _mm512_mul_ps(r2, s[2]);
- WinogradKernel3x3Block3x3SetFilter16Row(t, dst + 0 * stride, stride, tail);
-
- t[0] = _mm512_mul_ps(mr2, _mm512_add_ps(_mm512_add_ps(s[0], s[6]), s[3]));
- t[1] = _mm512_mul_ps(mr2, _mm512_add_ps(_mm512_add_ps(s[1], s[7]), s[4]));
- t[2] = _mm512_mul_ps(mr2, _mm512_add_ps(_mm512_add_ps(s[2], s[8]), s[5]));
- WinogradKernel3x3Block3x3SetFilter16Row(t, dst + 5 * stride, stride, tail);
-
- t[0] = _mm512_mul_ps(r6, _mm512_sub_ps(s[3], _mm512_add_ps(s[0], s[6])));
- t[1] = _mm512_mul_ps(r6, _mm512_sub_ps(s[4], _mm512_add_ps(s[1], s[7])));
- t[2] = _mm512_mul_ps(r6,
_mm512_sub_ps(s[5], _mm512_add_ps(s[2], s[8]))); - WinogradKernel3x3Block3x3SetFilter16Row(t, dst + 10 * stride, stride, tail); - - t[0] = _mm512_add_ps(_mm512_mul_ps(r6, s[0]), _mm512_add_ps(_mm512_mul_ps(r3, s[3]), _mm512_mul_ps(f2_3, s[6]))); - t[1] = _mm512_add_ps(_mm512_mul_ps(r6, s[1]), _mm512_add_ps(_mm512_mul_ps(r3, s[4]), _mm512_mul_ps(f2_3, s[7]))); - t[2] = _mm512_add_ps(_mm512_mul_ps(r6, s[2]), _mm512_add_ps(_mm512_mul_ps(r3, s[5]), _mm512_mul_ps(f2_3, s[8]))); - WinogradKernel3x3Block3x3SetFilter16Row(t, dst + 15 * stride, stride, tail); - - WinogradKernel3x3Block3x3SetFilter16Row(s + 6, dst + 20 * stride, stride, tail); - } - - SIMD_INLINE void WinogradKernel3x3Block3x3SetFilter16t(const float * src, float * dst, size_t stride, __mmask16 tail = -1) - { - __m512 s[9]; - s[0] = _mm512_maskz_loadu_ps(tail, src + 0 * stride); - s[1] = _mm512_maskz_loadu_ps(tail, src + 1 * stride); - s[2] = _mm512_maskz_loadu_ps(tail, src + 2 * stride); - s[3] = _mm512_maskz_loadu_ps(tail, src + 3 * stride); - s[4] = _mm512_maskz_loadu_ps(tail, src + 4 * stride); - s[5] = _mm512_maskz_loadu_ps(tail, src + 5 * stride); - s[6] = _mm512_maskz_loadu_ps(tail, src + 6 * stride); - s[7] = _mm512_maskz_loadu_ps(tail, src + 7 * stride); - s[8] = _mm512_maskz_loadu_ps(tail, src + 8 * stride); - WinogradKernel3x3Block3x3SetFilter16All(s, dst + 0 * stride, stride, tail); - } - - void WinogradKernel3x3Block3x3SetFilter(const float * src, size_t size, float * dst, SimdBool trans) - { - if (trans) - { - size_t sizeF = AlignLo(size, F), i = 0; - for (; i < sizeF; i += F) - WinogradKernel3x3Block3x3SetFilter16t(src + i, dst + i, size); - if (i < size) - { - __mmask16 tail = TailMask16(size - sizeF); - WinogradKernel3x3Block3x3SetFilter16t(src + i, dst + i, size, tail); - } - } - else - { - Sse::WinogradKernel3x3Block3x3SetFilter(src, size, dst, trans); - } - } - - //----------------------------------------------------------------------- - - SIMD_INLINE void WinogradKernel3x3Block3x3SetInput16Store(const __m512 src[25], float * dst, size_t stride, __mmask16 tail = -1) - { - __m512 _2 = _mm512_set1_ps(2.0f); - __m512 _3 = _mm512_set1_ps(3.0f); - __m512 tmp[5]; - - tmp[0] = _mm512_add_ps(_mm512_mul_ps(_2, _mm512_sub_ps(src[0], src[10])), _mm512_sub_ps(src[15], src[5])); - tmp[1] = _mm512_add_ps(_mm512_mul_ps(_2, _mm512_sub_ps(src[1], src[11])), _mm512_sub_ps(src[16], src[6])); - tmp[2] = _mm512_add_ps(_mm512_mul_ps(_2, _mm512_sub_ps(src[2], src[12])), _mm512_sub_ps(src[17], src[7])); - tmp[3] = _mm512_add_ps(_mm512_mul_ps(_2, _mm512_sub_ps(src[3], src[13])), _mm512_sub_ps(src[18], src[8])); - tmp[4] = _mm512_add_ps(_mm512_mul_ps(_2, _mm512_sub_ps(src[4], src[14])), _mm512_sub_ps(src[19], src[9])); - _mm512_mask_storeu_ps(dst + 0 * stride, tail, _mm512_add_ps(_mm512_mul_ps(_2, _mm512_sub_ps(tmp[0], tmp[2])), _mm512_sub_ps(tmp[3], tmp[1]))); - _mm512_mask_storeu_ps(dst + 1 * stride, tail, _mm512_sub_ps(_mm512_sub_ps(tmp[3], tmp[2]), _mm512_mul_ps(_2, tmp[1]))); - _mm512_mask_storeu_ps(dst + 2 * stride, tail, _mm512_add_ps(_mm512_mul_ps(_2, tmp[1]), _mm512_sub_ps(tmp[3], _mm512_mul_ps(_3, tmp[2])))); - _mm512_mask_storeu_ps(dst + 3 * stride, tail, _mm512_sub_ps(tmp[3], tmp[1])); - _mm512_mask_storeu_ps(dst + 4 * stride, tail, _mm512_add_ps(_mm512_mul_ps(_2, _mm512_sub_ps(tmp[1], tmp[3])), _mm512_sub_ps(tmp[4], tmp[2]))); - - tmp[0] = _mm512_sub_ps(_mm512_sub_ps(src[15], src[10]), _mm512_mul_ps(_2, src[5])); - tmp[1] = _mm512_sub_ps(_mm512_sub_ps(src[16], src[11]), _mm512_mul_ps(_2, src[6])); - tmp[2] = 
_mm512_sub_ps(_mm512_sub_ps(src[17], src[12]), _mm512_mul_ps(_2, src[7])); - tmp[3] = _mm512_sub_ps(_mm512_sub_ps(src[18], src[13]), _mm512_mul_ps(_2, src[8])); - tmp[4] = _mm512_sub_ps(_mm512_sub_ps(src[19], src[14]), _mm512_mul_ps(_2, src[9])); - _mm512_mask_storeu_ps(dst + 5 * stride, tail, _mm512_add_ps(_mm512_mul_ps(_2, _mm512_sub_ps(tmp[0], tmp[2])), _mm512_sub_ps(tmp[3], tmp[1]))); - _mm512_mask_storeu_ps(dst + 6 * stride, tail, _mm512_sub_ps(_mm512_sub_ps(tmp[3], tmp[2]), _mm512_mul_ps(_2, tmp[1]))); - _mm512_mask_storeu_ps(dst + 7 * stride, tail, _mm512_add_ps(_mm512_mul_ps(_2, tmp[1]), _mm512_sub_ps(tmp[3], _mm512_mul_ps(_3, tmp[2])))); - _mm512_mask_storeu_ps(dst + 8 * stride, tail, _mm512_sub_ps(tmp[3], tmp[1])); - _mm512_mask_storeu_ps(dst + 9 * stride, tail, _mm512_add_ps(_mm512_mul_ps(_2, _mm512_sub_ps(tmp[1], tmp[3])), _mm512_sub_ps(tmp[4], tmp[2]))); - - tmp[0] = _mm512_add_ps(_mm512_mul_ps(_2, src[5]), _mm512_sub_ps(src[15], _mm512_mul_ps(_3, src[10]))); - tmp[1] = _mm512_add_ps(_mm512_mul_ps(_2, src[6]), _mm512_sub_ps(src[16], _mm512_mul_ps(_3, src[11]))); - tmp[2] = _mm512_add_ps(_mm512_mul_ps(_2, src[7]), _mm512_sub_ps(src[17], _mm512_mul_ps(_3, src[12]))); - tmp[3] = _mm512_add_ps(_mm512_mul_ps(_2, src[8]), _mm512_sub_ps(src[18], _mm512_mul_ps(_3, src[13]))); - tmp[4] = _mm512_add_ps(_mm512_mul_ps(_2, src[9]), _mm512_sub_ps(src[19], _mm512_mul_ps(_3, src[14]))); - _mm512_mask_storeu_ps(dst + 10 * stride, tail, _mm512_add_ps(_mm512_mul_ps(_2, _mm512_sub_ps(tmp[0], tmp[2])), _mm512_sub_ps(tmp[3], tmp[1]))); - _mm512_mask_storeu_ps(dst + 11 * stride, tail, _mm512_sub_ps(_mm512_sub_ps(tmp[3], tmp[2]), _mm512_mul_ps(_2, tmp[1]))); - _mm512_mask_storeu_ps(dst + 12 * stride, tail, _mm512_add_ps(_mm512_mul_ps(_2, tmp[1]), _mm512_sub_ps(tmp[3], _mm512_mul_ps(_3, tmp[2])))); - _mm512_mask_storeu_ps(dst + 13 * stride, tail, _mm512_sub_ps(tmp[3], tmp[1])); - _mm512_mask_storeu_ps(dst + 14 * stride, tail, _mm512_add_ps(_mm512_mul_ps(_2, _mm512_sub_ps(tmp[1], tmp[3])), _mm512_sub_ps(tmp[4], tmp[2]))); - - tmp[0] = _mm512_sub_ps(src[15], src[5]); - tmp[1] = _mm512_sub_ps(src[16], src[6]); - tmp[2] = _mm512_sub_ps(src[17], src[7]); - tmp[3] = _mm512_sub_ps(src[18], src[8]); - tmp[4] = _mm512_sub_ps(src[19], src[9]); - _mm512_mask_storeu_ps(dst + 15 * stride, tail, _mm512_add_ps(_mm512_mul_ps(_2, _mm512_sub_ps(tmp[0], tmp[2])), _mm512_sub_ps(tmp[3], tmp[1]))); - _mm512_mask_storeu_ps(dst + 16 * stride, tail, _mm512_sub_ps(_mm512_sub_ps(tmp[3], tmp[2]), _mm512_mul_ps(_2, tmp[1]))); - _mm512_mask_storeu_ps(dst + 17 * stride, tail, _mm512_add_ps(_mm512_mul_ps(_2, tmp[1]), _mm512_sub_ps(tmp[3], _mm512_mul_ps(_3, tmp[2])))); - _mm512_mask_storeu_ps(dst + 18 * stride, tail, _mm512_sub_ps(tmp[3], tmp[1])); - _mm512_mask_storeu_ps(dst + 19 * stride, tail, _mm512_add_ps(_mm512_mul_ps(_2, _mm512_sub_ps(tmp[1], tmp[3])), _mm512_sub_ps(tmp[4], tmp[2]))); - - tmp[0] = _mm512_add_ps(_mm512_mul_ps(_2, _mm512_sub_ps(src[5], src[15])), _mm512_sub_ps(src[20], src[10])); - tmp[1] = _mm512_add_ps(_mm512_mul_ps(_2, _mm512_sub_ps(src[6], src[16])), _mm512_sub_ps(src[21], src[11])); - tmp[2] = _mm512_add_ps(_mm512_mul_ps(_2, _mm512_sub_ps(src[7], src[17])), _mm512_sub_ps(src[22], src[12])); - tmp[3] = _mm512_add_ps(_mm512_mul_ps(_2, _mm512_sub_ps(src[8], src[18])), _mm512_sub_ps(src[23], src[13])); - tmp[4] = _mm512_add_ps(_mm512_mul_ps(_2, _mm512_sub_ps(src[9], src[19])), _mm512_sub_ps(src[24], src[14])); - _mm512_mask_storeu_ps(dst + 20 * stride, tail, _mm512_add_ps(_mm512_mul_ps(_2, _mm512_sub_ps(tmp[0], 
tmp[2])), _mm512_sub_ps(tmp[3], tmp[1]))); - _mm512_mask_storeu_ps(dst + 21 * stride, tail, _mm512_sub_ps(_mm512_sub_ps(tmp[3], tmp[2]), _mm512_mul_ps(_2, tmp[1]))); - _mm512_mask_storeu_ps(dst + 22 * stride, tail, _mm512_add_ps(_mm512_mul_ps(_2, tmp[1]), _mm512_sub_ps(tmp[3], _mm512_mul_ps(_3, tmp[2])))); - _mm512_mask_storeu_ps(dst + 23 * stride, tail, _mm512_sub_ps(tmp[3], tmp[1])); - _mm512_mask_storeu_ps(dst + 24 * stride, tail, _mm512_add_ps(_mm512_mul_ps(_2, _mm512_sub_ps(tmp[1], tmp[3])), _mm512_sub_ps(tmp[4], tmp[2]))); - } - - SIMD_INLINE void WinogradKernel3x3Block3x3SetInput16t(const float * src, size_t srcS, size_t srcC, __m512 dst[25], __mmask16 tail = -1) - { - dst[0] = _mm512_maskz_loadu_ps(tail, src + 0 * srcS + 0 * srcC); - dst[1] = _mm512_maskz_loadu_ps(tail, src + 0 * srcS + 1 * srcC); - dst[2] = _mm512_maskz_loadu_ps(tail, src + 0 * srcS + 2 * srcC); - dst[3] = _mm512_maskz_loadu_ps(tail, src + 0 * srcS + 3 * srcC); - dst[4] = _mm512_maskz_loadu_ps(tail, src + 0 * srcS + 4 * srcC); - dst[5] = _mm512_maskz_loadu_ps(tail, src + 1 * srcS + 0 * srcC); - dst[6] = _mm512_maskz_loadu_ps(tail, src + 1 * srcS + 1 * srcC); - dst[7] = _mm512_maskz_loadu_ps(tail, src + 1 * srcS + 2 * srcC); - dst[8] = _mm512_maskz_loadu_ps(tail, src + 1 * srcS + 3 * srcC); - dst[9] = _mm512_maskz_loadu_ps(tail, src + 1 * srcS + 4 * srcC); - dst[10] = _mm512_maskz_loadu_ps(tail, src + 2 * srcS + 0 * srcC); - dst[11] = _mm512_maskz_loadu_ps(tail, src + 2 * srcS + 1 * srcC); - dst[12] = _mm512_maskz_loadu_ps(tail, src + 2 * srcS + 2 * srcC); - dst[13] = _mm512_maskz_loadu_ps(tail, src + 2 * srcS + 3 * srcC); - dst[14] = _mm512_maskz_loadu_ps(tail, src + 2 * srcS + 4 * srcC); - dst[15] = _mm512_maskz_loadu_ps(tail, src + 3 * srcS + 0 * srcC); - dst[16] = _mm512_maskz_loadu_ps(tail, src + 3 * srcS + 1 * srcC); - dst[17] = _mm512_maskz_loadu_ps(tail, src + 3 * srcS + 2 * srcC); - dst[18] = _mm512_maskz_loadu_ps(tail, src + 3 * srcS + 3 * srcC); - dst[19] = _mm512_maskz_loadu_ps(tail, src + 3 * srcS + 4 * srcC); - dst[20] = _mm512_maskz_loadu_ps(tail, src + 4 * srcS + 0 * srcC); - dst[21] = _mm512_maskz_loadu_ps(tail, src + 4 * srcS + 1 * srcC); - dst[22] = _mm512_maskz_loadu_ps(tail, src + 4 * srcS + 2 * srcC); - dst[23] = _mm512_maskz_loadu_ps(tail, src + 4 * srcS + 3 * srcC); - dst[24] = _mm512_maskz_loadu_ps(tail, src + 4 * srcS + 4 * srcC); - } - - SIMD_INLINE void WinogradKernel3x3Block3x3SetInput16t(const float * src, size_t srcW, size_t srcC, float * dst, size_t dstStride) - { - size_t srcS = srcW * srcC; - size_t srcCF = AlignLo(srcC, F); - for (size_t c = 0; c < srcCF; c += F) - { - __m512 tmp[25]; - WinogradKernel3x3Block3x3SetInput16t(src + c, srcS, srcC, tmp); - WinogradKernel3x3Block3x3SetInput16Store(tmp, dst + c, dstStride); - } - if (srcCF < srcC) - { - __m512 tmp[25]; - __mmask16 tail = TailMask16(srcC - srcCF); - WinogradKernel3x3Block3x3SetInput16t(src + srcCF, srcS, srcC, tmp, tail); - WinogradKernel3x3Block3x3SetInput16Store(tmp, dst + srcCF, dstStride, tail); - } - } - - SIMD_INLINE void WinogradKernel3x3Block3x3SetInput16t(const float * src, size_t srcS, size_t srcC, size_t rowB, size_t rowE, size_t colB, size_t colE, __m512 * dst, __mmask16 tail = -1) - { - for (size_t row = 0; row < rowB; ++row) - { - for (size_t col = 0; col < 5; ++col) - dst[col] = _mm512_setzero_ps(); - dst += 5; - } - for (size_t row = rowB; row < rowE; ++row) - { - for (size_t col = 0; col < colB; ++col) - dst[col] = _mm512_setzero_ps(); - for (size_t col = colB; col < colE; ++col) - dst[col] = 
_mm512_maskz_loadu_ps(tail, src + row * srcS + col * srcC); - for (size_t col = colE; col < 5; ++col) - dst[col] = _mm512_setzero_ps(); - dst += 5; - } - for (size_t row = rowE; row < 5; ++row) - { - for (size_t col = 0; col < 5; ++col) - dst[col] = _mm512_setzero_ps(); - dst += 5; - } - } - - SIMD_INLINE void WinogradKernel3x3Block3x3SetInput16t(const float * src, size_t srcW, size_t srcC, size_t rowB, size_t rowE, size_t colB, size_t colE, float * dst, size_t dstStride) - { - size_t srcS = srcW * srcC; - size_t srcCF = AlignLo(srcC, F); - for (size_t c = 0; c < srcCF; c += F) - { - __m512 tmp[25]; - WinogradKernel3x3Block3x3SetInput16t(src + c, srcS, srcC, rowB, rowE, colB, colE, tmp); - WinogradKernel3x3Block3x3SetInput16Store(tmp, dst + c, dstStride); - } - if (srcCF < srcC) - { - __m512 tmp[25]; - __mmask16 tail = TailMask16(srcC - srcCF); - WinogradKernel3x3Block3x3SetInput16t(src + srcCF, srcS, srcC, rowB, rowE, colB, colE, tmp, tail); - WinogradKernel3x3Block3x3SetInput16Store(tmp, dst + srcCF, dstStride, tail); - } - } - - void WinogradKernel3x3Block3x3SetInput(const float* src, size_t srcChannels, size_t srcHeight, size_t srcWidth, - size_t padY, size_t padX, size_t padH, size_t padW, float* dst, size_t dstStride, SimdBool trans) - { - assert(padY == padX && padY == padH && padY == padW && (padY == 0 || padY == 1)); - SimdBool pad = padY > 0 ? SimdTrue : SimdFalse; - if (trans ? (false) : (srcHeight < 5 || srcWidth < 5)) - { - Avx::WinogradKernel3x3Block3x3SetInput(src, srcChannels, srcHeight, srcWidth, padY, padX, padH, padW, dst, dstStride, trans); - return; - } - size_t dstH = pad ? srcHeight : srcHeight - 2; - size_t dstW = pad ? srcWidth : srcWidth - 2; - size_t tileH = (dstH + 2) / 3; - size_t tileW = (dstW + 2) / 3; - size_t dstH3 = AlignLoAny(dstH, 3); - size_t dstW3 = AlignLoAny(dstW, 3); - if (trans) - { - size_t noseW = Simd::Min(5, dstW + 1); - size_t noseH = Simd::Min(5, dstH + 1); - size_t start = pad ? 3 : 0; - if (pad) - { - if (dstH == dstH3) - dstH3 -= 3; - if (dstW == dstW3) - dstW3 -= 3; - src -= (srcWidth + 1)*srcChannels; - } - size_t tailW = dstW - dstW3 + (pad ? 1 : 2); - size_t tailH = dstH - dstH3 + (pad ? 
1 : 2); - size_t row = 0, col = 0; - if (pad) - { - if (pad) - WinogradKernel3x3Block3x3SetInput16t(src, srcWidth, srcChannels, 1, noseH, 1, noseW, dst, dstStride), dst += srcChannels; - for (col = start; col < dstW3; col += 3) - WinogradKernel3x3Block3x3SetInput16t(src + col * srcChannels, srcWidth, srcChannels, 1, noseH, 0, 5, dst, dstStride), dst += srcChannels; - if (col < dstW) - WinogradKernel3x3Block3x3SetInput16t(src + col * srcChannels, srcWidth, srcChannels, 1, noseH, 0, tailW, dst, dstStride), dst += srcChannels; - } - for (row = start; row < dstH3; row += 3) - { - if (pad) - WinogradKernel3x3Block3x3SetInput16t(src + row * srcWidth * srcChannels, srcWidth, srcChannels, 0, 5, 1, noseW, dst, dstStride), dst += srcChannels; - for (col = start; col < dstW3; col += 3) - WinogradKernel3x3Block3x3SetInput16t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, dst, dstStride), dst += srcChannels; - if (col < dstW) - WinogradKernel3x3Block3x3SetInput16t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, 0, 5, 0, tailW, dst, dstStride), dst += srcChannels; - } - if (row < dstH) - { - if (pad) - WinogradKernel3x3Block3x3SetInput16t(src + row * srcWidth* srcChannels, srcWidth, srcChannels, 0, tailH, 1, noseW, dst, dstStride), dst += srcChannels; - for (col = start; col < dstW3; col += 3) - WinogradKernel3x3Block3x3SetInput16t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, 0, tailH, 0, 5, dst, dstStride), dst += srcChannels; - if (col < dstW) - WinogradKernel3x3Block3x3SetInput16t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, 0, tailH, 0, tailW, dst, dstStride), dst += srcChannels; - } - } - else - { - Base::WinogradKernel3x3Block3x3SetInput(src, srcChannels, srcHeight, srcWidth, padY, padX, padH, padW, dst, dstStride, trans); - } - } - - //----------------------------------------------------------------------- - - SIMD_INLINE void WinogradKernel3x3Block3x3SetOutputLoad25(const float * src, size_t stride, __m512 dst[9], __mmask16 tail = -1) - { - __m512 s[25]; - s[0] = _mm512_maskz_loadu_ps(tail,src + 0 * stride); - s[1] = _mm512_maskz_loadu_ps(tail,src + 1 * stride); - s[2] = _mm512_maskz_loadu_ps(tail,src + 2 * stride); - s[3] = _mm512_maskz_loadu_ps(tail,src + 3 * stride); - s[4] = _mm512_maskz_loadu_ps(tail,src + 4 * stride); - s[5] = _mm512_maskz_loadu_ps(tail,src + 5 * stride); - s[6] = _mm512_maskz_loadu_ps(tail,src + 6 * stride); - s[7] = _mm512_maskz_loadu_ps(tail,src + 7 * stride); - s[8] = _mm512_maskz_loadu_ps(tail,src + 8 * stride); - s[9] = _mm512_maskz_loadu_ps(tail,src + 9 * stride); - s[10] = _mm512_maskz_loadu_ps(tail,src + 10 * stride); - s[11] = _mm512_maskz_loadu_ps(tail,src + 11 * stride); - s[12] = _mm512_maskz_loadu_ps(tail,src + 12 * stride); - s[13] = _mm512_maskz_loadu_ps(tail,src + 13 * stride); - s[14] = _mm512_maskz_loadu_ps(tail,src + 14 * stride); - s[15] = _mm512_maskz_loadu_ps(tail,src + 15 * stride); - s[16] = _mm512_maskz_loadu_ps(tail,src + 16 * stride); - s[17] = _mm512_maskz_loadu_ps(tail,src + 17 * stride); - s[18] = _mm512_maskz_loadu_ps(tail,src + 18 * stride); - s[19] = _mm512_maskz_loadu_ps(tail,src + 19 * stride); - s[20] = _mm512_maskz_loadu_ps(tail,src + 20 * stride); - s[21] = _mm512_maskz_loadu_ps(tail,src + 21 * stride); - s[22] = _mm512_maskz_loadu_ps(tail,src + 22 * stride); - s[23] = _mm512_maskz_loadu_ps(tail,src + 23 * stride); - s[24] = _mm512_maskz_loadu_ps(tail,src + 24 * stride); - - __m512 _2 = _mm512_set1_ps(2.0f); - __m512 _4 = _mm512_set1_ps(4.0f); - __m512 
t[5]; - t[0] = _mm512_add_ps(_mm512_add_ps(s[0], s[5]), _mm512_add_ps(s[10], s[15])); - t[1] = _mm512_add_ps(_mm512_add_ps(s[1], s[6]), _mm512_add_ps(s[11], s[16])); - t[2] = _mm512_add_ps(_mm512_add_ps(s[2], s[7]), _mm512_add_ps(s[12], s[17])); - t[3] = _mm512_add_ps(_mm512_add_ps(s[3], s[8]), _mm512_add_ps(s[13], s[18])); - t[4] = _mm512_add_ps(_mm512_add_ps(s[4], s[9]), _mm512_add_ps(s[14], s[19])); - dst[0] = _mm512_add_ps(_mm512_add_ps(t[0], t[1]), _mm512_add_ps(t[2], t[3])); - dst[1] = _mm512_add_ps(_mm512_sub_ps(t[1], t[2]), _mm512_mul_ps(_2, t[3])); - dst[2] = _mm512_add_ps(_mm512_add_ps(t[1], t[2]), _mm512_add_ps(_mm512_mul_ps(_4, t[3]), t[4])); - - t[0] = _mm512_add_ps(_mm512_sub_ps(s[5], s[10]), _mm512_mul_ps(_2, s[15])); - t[1] = _mm512_add_ps(_mm512_sub_ps(s[6], s[11]), _mm512_mul_ps(_2, s[16])); - t[2] = _mm512_add_ps(_mm512_sub_ps(s[7], s[12]), _mm512_mul_ps(_2, s[17])); - t[3] = _mm512_add_ps(_mm512_sub_ps(s[8], s[13]), _mm512_mul_ps(_2, s[18])); - t[4] = _mm512_add_ps(_mm512_sub_ps(s[9], s[14]), _mm512_mul_ps(_2, s[19])); - dst[3] = _mm512_add_ps(_mm512_add_ps(t[0], t[1]), _mm512_add_ps(t[2], t[3])); - dst[4] = _mm512_add_ps(_mm512_sub_ps(t[1], t[2]), _mm512_mul_ps(_2, t[3])); - dst[5] = _mm512_add_ps(_mm512_add_ps(t[1], t[2]), _mm512_add_ps(_mm512_mul_ps(_4, t[3]), t[4])); - - t[0] = _mm512_add_ps(_mm512_add_ps(s[5], s[10]), _mm512_add_ps(_mm512_mul_ps(_4, s[15]), s[20])); - t[1] = _mm512_add_ps(_mm512_add_ps(s[6], s[11]), _mm512_add_ps(_mm512_mul_ps(_4, s[16]), s[21])); - t[2] = _mm512_add_ps(_mm512_add_ps(s[7], s[12]), _mm512_add_ps(_mm512_mul_ps(_4, s[17]), s[22])); - t[3] = _mm512_add_ps(_mm512_add_ps(s[8], s[13]), _mm512_add_ps(_mm512_mul_ps(_4, s[18]), s[23])); - t[4] = _mm512_add_ps(_mm512_add_ps(s[9], s[14]), _mm512_add_ps(_mm512_mul_ps(_4, s[19]), s[24])); - dst[6] = _mm512_add_ps(_mm512_add_ps(t[0], t[1]), _mm512_add_ps(t[2], t[3])); - dst[7] = _mm512_add_ps(_mm512_sub_ps(t[1], t[2]), _mm512_mul_ps(_2, t[3])); - dst[8] = _mm512_add_ps(_mm512_add_ps(t[1], t[2]), _mm512_add_ps(_mm512_mul_ps(_4, t[3]), t[4])); - } - - SIMD_INLINE void WinogradKernel3x3Block3x3SetOutputStore9(const __m512 src[9], float * dst, size_t dstS, size_t dstC, __mmask16 tail = -1) - { - _mm512_mask_storeu_ps(dst + 0 * dstS + 0 * dstC, tail, src[0]); - _mm512_mask_storeu_ps(dst + 0 * dstS + 1 * dstC, tail, src[1]); - _mm512_mask_storeu_ps(dst + 0 * dstS + 2 * dstC, tail, src[2]); - _mm512_mask_storeu_ps(dst + 1 * dstS + 0 * dstC, tail, src[3]); - _mm512_mask_storeu_ps(dst + 1 * dstS + 1 * dstC, tail, src[4]); - _mm512_mask_storeu_ps(dst + 1 * dstS + 2 * dstC, tail, src[5]); - _mm512_mask_storeu_ps(dst + 2 * dstS + 0 * dstC, tail, src[6]); - _mm512_mask_storeu_ps(dst + 2 * dstS + 1 * dstC, tail, src[7]); - _mm512_mask_storeu_ps(dst + 2 * dstS + 2 * dstC, tail, src[8]); - } - - SIMD_INLINE void WinogradKernel3x3Block3x3SetOutput16t(const float * src, size_t srcStride, float * dst, size_t dstW, size_t dstC) - { - size_t dstS = dstW * dstC, dstCF = AlignLo(dstC, F); - for (size_t d = 0; d < dstCF; d += F) - { - __m512 tmp[9]; - WinogradKernel3x3Block3x3SetOutputLoad25(src + d, srcStride, tmp); - WinogradKernel3x3Block3x3SetOutputStore9(tmp, dst + d, dstS, dstC); - } - if (dstCF < dstC) - { - __m512 tmp[9]; - __mmask16 tail = TailMask16(dstC - dstCF); - WinogradKernel3x3Block3x3SetOutputLoad25(src + dstCF, srcStride, tmp, tail); - WinogradKernel3x3Block3x3SetOutputStore9(tmp, dst + dstCF, dstS, dstC, tail); - } - } - - SIMD_INLINE void WinogradKernel3x3Block3x3SetOutputStore9(const __m512 src[16], 
float * dst, size_t dstS, size_t dstC, size_t rowE, size_t colE, __mmask16 tail = -1) - { - for (size_t row = 0; row < rowE; ++row) - for (size_t col = 0; col < colE; ++col) - _mm512_mask_storeu_ps(dst + row * dstS + col * dstC, tail, src[row * 3 + col]); - } - - SIMD_INLINE void WinogradKernel3x3Block3x3SetOutput16t(const float * src, size_t srcStride, float * dst, size_t dstW, size_t dstC, size_t rowE, size_t colE) - { - size_t dstS = dstW * dstC, dstCF = AlignLo(dstC, F); - for (size_t d = 0; d < dstCF; d += F) - { - __m512 tmp[9]; - WinogradKernel3x3Block3x3SetOutputLoad25(src + d, srcStride, tmp); - WinogradKernel3x3Block3x3SetOutputStore9(tmp, dst + d, dstS, dstC, rowE, colE); - } - if (dstCF < dstC) - { - __m512 tmp[9]; - __mmask16 tail = TailMask16(dstC - dstCF); - WinogradKernel3x3Block3x3SetOutputLoad25(src + dstCF, srcStride, tmp, tail); - WinogradKernel3x3Block3x3SetOutputStore9(tmp, dst + dstCF, dstS, dstC, rowE, colE, tail); - } - } - - void WinogradKernel3x3Block3x3SetOutput(const float * src, size_t srcStride, float * dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans) - { - if (trans ? (false) : (dstHeight < 4 || dstWidth < 16)) - { - Avx::WinogradKernel3x3Block3x3SetOutput(src, srcStride, dst, dstChannels, dstHeight, dstWidth, trans); - return; - } - size_t tileH = (dstHeight + 2) / 3; - size_t tileW = (dstWidth + 2) / 3; - size_t dstH3 = AlignLoAny(dstHeight, 3); - size_t dstW3 = AlignLoAny(dstWidth, 3); - if (trans) - { - size_t row, col; - for (row = 0; row < dstH3; row += 3) - { - for (col = 0; col < dstW3; col += 3) - WinogradKernel3x3Block3x3SetOutput16t(src, srcStride, dst + (row * dstWidth + col)*dstChannels, dstWidth, dstChannels), src += dstChannels; - if (col < dstWidth) - WinogradKernel3x3Block3x3SetOutput16t(src, srcStride, dst + (row * dstWidth + col)*dstChannels, dstWidth, dstChannels, 3, dstWidth - col), src += dstChannels; - } - if (row < dstHeight) - { - for (col = 0; col < dstW3; col += 3) - WinogradKernel3x3Block3x3SetOutput16t(src, srcStride, dst + (row * dstWidth + col)*dstChannels, dstWidth, dstChannels, dstHeight - row, 3), src += dstChannels; - if (col < dstWidth) - WinogradKernel3x3Block3x3SetOutput16t(src, srcStride, dst + (row * dstWidth + col)*dstChannels, dstWidth, dstChannels, dstHeight - row, dstWidth - col), src += dstChannels; - } - } - else - { - Base::WinogradKernel3x3Block3x3SetOutput(src, srcStride, dst, dstChannels, dstHeight, dstWidth, trans); - } - } - - //----------------------------------------------------------------------- - - SIMD_INLINE void WinogradKernel3x3Block4x4SetFilter16Row(const __m512 * t, float * dst, size_t stride, __mmask16 tail) - { - const __m512 r4 = _mm512_set1_ps(1.0f / 4.0f); - const __m512 r6 = _mm512_set1_ps(1.0f / 6.0f); - const __m512 mr6 = _mm512_set1_ps(-1.0f / 6.0f); - const __m512 r12 = _mm512_set1_ps(1.0f / 12.0f); - const __m512 r24 = _mm512_set1_ps(1.0f / 24.0f); - _mm512_mask_storeu_ps(dst + 0 * stride, tail, _mm512_mul_ps(r4, t[0])); - __m512 t0 = _mm512_add_ps(t[0], t[2]); - _mm512_mask_storeu_ps(dst + 1 * stride, tail, _mm512_mul_ps(mr6, _mm512_add_ps(t0, t[1]))); - _mm512_mask_storeu_ps(dst + 2 * stride, tail, _mm512_mul_ps(mr6, _mm512_sub_ps(t0, t[1]))); - __m512 t1 = _mm512_add_ps(_mm512_mul_ps(r24, t[0]), _mm512_mul_ps(r6, t[2])); - __m512 t2 = _mm512_mul_ps(r12, t[1]); - _mm512_mask_storeu_ps(dst + 3 * stride, tail, _mm512_add_ps(t1, t2)); - _mm512_mask_storeu_ps(dst + 4 * stride, tail, _mm512_sub_ps(t1, t2)); - _mm512_mask_storeu_ps(dst + 5 * stride, tail, t[2]); 
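The six masked stores above are one application of the Winograd F(4x4, 3x3) filter-transform matrix G along a single dimension (each __m512 carries 16 independent filter slots). Reading the constants r4, mr6, r24 and r12 back out of the code, the transform appears to be \( \tilde f = G\, f\, G^{T} \) with

\[
G = \begin{pmatrix}
 1/4 & 0 & 0 \\
 -1/6 & -1/6 & -1/6 \\
 -1/6 & 1/6 & -1/6 \\
 1/24 & 1/12 & 1/6 \\
 1/24 & -1/12 & 1/6 \\
 0 & 0 & 1
\end{pmatrix},
\]

i.e. WinogradKernel3x3Block4x4SetFilter16All below first combines the three filter rows with these weights and then calls this helper once per combined row to apply the same weights across columns.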
- } - - SIMD_INLINE void WinogradKernel3x3Block4x4SetFilter16All(const __m512 * s, float * dst, size_t stride, __mmask16 tail) - { - const __m512 r4 = _mm512_set1_ps(1.0f / 4.0f); - const __m512 r6 = _mm512_set1_ps(1.0f / 6.0f); - const __m512 mr6 = _mm512_set1_ps(-1.0f / 6.0f); - const __m512 r12 = _mm512_set1_ps(1.0f / 12.0f); - const __m512 r24 = _mm512_set1_ps(1.0f / 24.0f); - - __m512 t[3]; - t[0] = _mm512_mul_ps(r4, s[0]); - t[1] = _mm512_mul_ps(r4, s[1]); - t[2] = _mm512_mul_ps(r4, s[2]); - WinogradKernel3x3Block4x4SetFilter16Row(t, dst + 0 * stride, stride, tail); - - t[0] = _mm512_mul_ps(mr6, _mm512_add_ps(_mm512_add_ps(s[0], s[3]), s[6])); - t[1] = _mm512_mul_ps(mr6, _mm512_add_ps(_mm512_add_ps(s[1], s[4]), s[7])); - t[2] = _mm512_mul_ps(mr6, _mm512_add_ps(_mm512_add_ps(s[2], s[5]), s[8])); - WinogradKernel3x3Block4x4SetFilter16Row(t, dst + 6 * stride, stride, tail); - - t[0] = _mm512_mul_ps(mr6, _mm512_add_ps(_mm512_sub_ps(s[0], s[3]), s[6])); - t[1] = _mm512_mul_ps(mr6, _mm512_add_ps(_mm512_sub_ps(s[1], s[4]), s[7])); - t[2] = _mm512_mul_ps(mr6, _mm512_add_ps(_mm512_sub_ps(s[2], s[5]), s[8])); - WinogradKernel3x3Block4x4SetFilter16Row(t, dst + 12 * stride, stride, tail); - - t[0] = _mm512_add_ps(_mm512_add_ps(_mm512_mul_ps(r24, s[0]), _mm512_mul_ps(r12, s[3])), _mm512_mul_ps(r6, s[6])); - t[1] = _mm512_add_ps(_mm512_add_ps(_mm512_mul_ps(r24, s[1]), _mm512_mul_ps(r12, s[4])), _mm512_mul_ps(r6, s[7])); - t[2] = _mm512_add_ps(_mm512_add_ps(_mm512_mul_ps(r24, s[2]), _mm512_mul_ps(r12, s[5])), _mm512_mul_ps(r6, s[8])); - WinogradKernel3x3Block4x4SetFilter16Row(t, dst + 18 * stride, stride, tail); - - t[0] = _mm512_add_ps(_mm512_sub_ps(_mm512_mul_ps(r24, s[0]), _mm512_mul_ps(r12, s[3])), _mm512_mul_ps(r6, s[6])); - t[1] = _mm512_add_ps(_mm512_sub_ps(_mm512_mul_ps(r24, s[1]), _mm512_mul_ps(r12, s[4])), _mm512_mul_ps(r6, s[7])); - t[2] = _mm512_add_ps(_mm512_sub_ps(_mm512_mul_ps(r24, s[2]), _mm512_mul_ps(r12, s[5])), _mm512_mul_ps(r6, s[8])); - WinogradKernel3x3Block4x4SetFilter16Row(t, dst + 24 * stride, stride, tail); - - WinogradKernel3x3Block4x4SetFilter16Row(s + 6, dst + 30 * stride, stride, tail); - } - - SIMD_INLINE void WinogradKernel3x3Block4x4SetFilter16t(const float * src, float * dst, size_t stride, __mmask16 tail = -1) - { - __m512 s[9]; - s[0] = _mm512_maskz_loadu_ps(tail, src + 0 * stride); - s[1] = _mm512_maskz_loadu_ps(tail, src + 1 * stride); - s[2] = _mm512_maskz_loadu_ps(tail, src + 2 * stride); - s[3] = _mm512_maskz_loadu_ps(tail, src + 3 * stride); - s[4] = _mm512_maskz_loadu_ps(tail, src + 4 * stride); - s[5] = _mm512_maskz_loadu_ps(tail, src + 5 * stride); - s[6] = _mm512_maskz_loadu_ps(tail, src + 6 * stride); - s[7] = _mm512_maskz_loadu_ps(tail, src + 7 * stride); - s[8] = _mm512_maskz_loadu_ps(tail, src + 8 * stride); - WinogradKernel3x3Block4x4SetFilter16All(s, dst + 0 * stride, stride, tail); - } - - void WinogradKernel3x3Block4x4SetFilter(const float * src, size_t size, float * dst, SimdBool trans) - { - if (trans) - { - size_t sizeF = AlignLo(size, F), i = 0; - for (; i < sizeF; i += F) - WinogradKernel3x3Block4x4SetFilter16t(src + i, dst + i, size); - if (i < size) - { - __mmask16 tail = TailMask16(size - sizeF); - WinogradKernel3x3Block4x4SetFilter16t(src + i, dst + i, size, tail); - } - } - else - { - Sse::WinogradKernel3x3Block4x4SetFilter(src, size, dst, trans); - } - } - - //----------------------------------------------------------------------- - - SIMD_INLINE void WinogradKernel3x3Block4x4SetInput16Store(const __m512 src[36], float * dst, size_t 
stride, __mmask16 tail = -1) - { - __m512 _2 = _mm512_set1_ps(2.0f); - __m512 _4 = _mm512_set1_ps(4.0f); - __m512 _5 = _mm512_set1_ps(5.0f); - __m512 tmp[36]; - tmp[0] = _mm512_add_ps(_mm512_sub_ps(_mm512_mul_ps(_4, src[0]), _mm512_mul_ps(_5, src[12])), src[24]); - tmp[1] = _mm512_add_ps(_mm512_sub_ps(_mm512_mul_ps(_4, src[1]), _mm512_mul_ps(_5, src[13])), src[25]); - tmp[2] = _mm512_add_ps(_mm512_sub_ps(_mm512_mul_ps(_4, src[2]), _mm512_mul_ps(_5, src[14])), src[26]); - tmp[3] = _mm512_add_ps(_mm512_sub_ps(_mm512_mul_ps(_4, src[3]), _mm512_mul_ps(_5, src[15])), src[27]); - tmp[4] = _mm512_add_ps(_mm512_sub_ps(_mm512_mul_ps(_4, src[4]), _mm512_mul_ps(_5, src[16])), src[28]); - tmp[5] = _mm512_add_ps(_mm512_sub_ps(_mm512_mul_ps(_4, src[5]), _mm512_mul_ps(_5, src[17])), src[29]); - tmp[6] = _mm512_sub_ps(_mm512_add_ps(src[18], src[24]), _mm512_mul_ps(_4, _mm512_add_ps(src[6], src[12]))); - tmp[7] = _mm512_sub_ps(_mm512_add_ps(src[19], src[25]), _mm512_mul_ps(_4, _mm512_add_ps(src[7], src[13]))); - tmp[8] = _mm512_sub_ps(_mm512_add_ps(src[20], src[26]), _mm512_mul_ps(_4, _mm512_add_ps(src[8], src[14]))); - tmp[9] = _mm512_sub_ps(_mm512_add_ps(src[21], src[27]), _mm512_mul_ps(_4, _mm512_add_ps(src[9], src[15]))); - tmp[10] = _mm512_sub_ps(_mm512_add_ps(src[22], src[28]), _mm512_mul_ps(_4, _mm512_add_ps(src[10], src[16]))); - tmp[11] = _mm512_sub_ps(_mm512_add_ps(src[23], src[29]), _mm512_mul_ps(_4, _mm512_add_ps(src[11], src[17]))); - tmp[12] = _mm512_add_ps(_mm512_mul_ps(_4, _mm512_sub_ps(src[6], src[12])), _mm512_sub_ps(src[24], src[18])); - tmp[13] = _mm512_add_ps(_mm512_mul_ps(_4, _mm512_sub_ps(src[7], src[13])), _mm512_sub_ps(src[25], src[19])); - tmp[14] = _mm512_add_ps(_mm512_mul_ps(_4, _mm512_sub_ps(src[8], src[14])), _mm512_sub_ps(src[26], src[20])); - tmp[15] = _mm512_add_ps(_mm512_mul_ps(_4, _mm512_sub_ps(src[9], src[15])), _mm512_sub_ps(src[27], src[21])); - tmp[16] = _mm512_add_ps(_mm512_mul_ps(_4, _mm512_sub_ps(src[10], src[16])), _mm512_sub_ps(src[28], src[22])); - tmp[17] = _mm512_add_ps(_mm512_mul_ps(_4, _mm512_sub_ps(src[11], src[17])), _mm512_sub_ps(src[29], src[23])); - tmp[18] = _mm512_add_ps(_mm512_mul_ps(_2, _mm512_sub_ps(src[18], src[6])), _mm512_sub_ps(src[24], src[12])); - tmp[19] = _mm512_add_ps(_mm512_mul_ps(_2, _mm512_sub_ps(src[19], src[7])), _mm512_sub_ps(src[25], src[13])); - tmp[20] = _mm512_add_ps(_mm512_mul_ps(_2, _mm512_sub_ps(src[20], src[8])), _mm512_sub_ps(src[26], src[14])); - tmp[21] = _mm512_add_ps(_mm512_mul_ps(_2, _mm512_sub_ps(src[21], src[9])), _mm512_sub_ps(src[27], src[15])); - tmp[22] = _mm512_add_ps(_mm512_mul_ps(_2, _mm512_sub_ps(src[22], src[10])), _mm512_sub_ps(src[28], src[16])); - tmp[23] = _mm512_add_ps(_mm512_mul_ps(_2, _mm512_sub_ps(src[23], src[11])), _mm512_sub_ps(src[29], src[17])); - tmp[24] = _mm512_add_ps(_mm512_mul_ps(_2, _mm512_sub_ps(src[6], src[18])), _mm512_sub_ps(src[24], src[12])); - tmp[25] = _mm512_add_ps(_mm512_mul_ps(_2, _mm512_sub_ps(src[7], src[19])), _mm512_sub_ps(src[25], src[13])); - tmp[26] = _mm512_add_ps(_mm512_mul_ps(_2, _mm512_sub_ps(src[8], src[20])), _mm512_sub_ps(src[26], src[14])); - tmp[27] = _mm512_add_ps(_mm512_mul_ps(_2, _mm512_sub_ps(src[9], src[21])), _mm512_sub_ps(src[27], src[15])); - tmp[28] = _mm512_add_ps(_mm512_mul_ps(_2, _mm512_sub_ps(src[10], src[22])), _mm512_sub_ps(src[28], src[16])); - tmp[29] = _mm512_add_ps(_mm512_mul_ps(_2, _mm512_sub_ps(src[11], src[23])), _mm512_sub_ps(src[29], src[17])); - tmp[30] = _mm512_add_ps(_mm512_sub_ps(_mm512_mul_ps(_4, src[6]), _mm512_mul_ps(_5, src[18])), 
src[30]); - tmp[31] = _mm512_add_ps(_mm512_sub_ps(_mm512_mul_ps(_4, src[7]), _mm512_mul_ps(_5, src[19])), src[31]); - tmp[32] = _mm512_add_ps(_mm512_sub_ps(_mm512_mul_ps(_4, src[8]), _mm512_mul_ps(_5, src[20])), src[32]); - tmp[33] = _mm512_add_ps(_mm512_sub_ps(_mm512_mul_ps(_4, src[9]), _mm512_mul_ps(_5, src[21])), src[33]); - tmp[34] = _mm512_add_ps(_mm512_sub_ps(_mm512_mul_ps(_4, src[10]), _mm512_mul_ps(_5, src[22])), src[34]); - tmp[35] = _mm512_add_ps(_mm512_sub_ps(_mm512_mul_ps(_4, src[11]), _mm512_mul_ps(_5, src[23])), src[35]); - - _mm512_mask_storeu_ps(dst + 0 * stride, tail, _mm512_add_ps(_mm512_sub_ps(_mm512_mul_ps(_4, tmp[0]), _mm512_mul_ps(_5, tmp[2])), tmp[4])); - _mm512_mask_storeu_ps(dst + 1 * stride, tail, _mm512_sub_ps(_mm512_add_ps(tmp[3], tmp[4]), _mm512_mul_ps(_4, _mm512_add_ps(tmp[1], tmp[2])))); - _mm512_mask_storeu_ps(dst + 2 * stride, tail, _mm512_add_ps(_mm512_mul_ps(_4, _mm512_sub_ps(tmp[1], tmp[2])), _mm512_sub_ps(tmp[4], tmp[3]))); - _mm512_mask_storeu_ps(dst + 3 * stride, tail, _mm512_add_ps(_mm512_mul_ps(_2, _mm512_sub_ps(tmp[3], tmp[1])), _mm512_sub_ps(tmp[4], tmp[2]))); - _mm512_mask_storeu_ps(dst + 4 * stride, tail, _mm512_add_ps(_mm512_mul_ps(_2, _mm512_sub_ps(tmp[1], tmp[3])), _mm512_sub_ps(tmp[4], tmp[2]))); - _mm512_mask_storeu_ps(dst + 5 * stride, tail, _mm512_add_ps(_mm512_sub_ps(_mm512_mul_ps(_4, tmp[1]), _mm512_mul_ps(_5, tmp[3])), tmp[5])); - _mm512_mask_storeu_ps(dst + 6 * stride, tail, _mm512_add_ps(_mm512_sub_ps(_mm512_mul_ps(_4, tmp[6]), _mm512_mul_ps(_5, tmp[8])), tmp[10])); - _mm512_mask_storeu_ps(dst + 7 * stride, tail, _mm512_sub_ps(_mm512_add_ps(tmp[9], tmp[10]), _mm512_mul_ps(_4, _mm512_add_ps(tmp[7], tmp[8])))); - _mm512_mask_storeu_ps(dst + 8 * stride, tail, _mm512_add_ps(_mm512_mul_ps(_4, _mm512_sub_ps(tmp[7], tmp[8])), _mm512_sub_ps(tmp[10], tmp[9]))); - _mm512_mask_storeu_ps(dst + 9 * stride, tail, _mm512_add_ps(_mm512_mul_ps(_2, _mm512_sub_ps(tmp[9], tmp[7])), _mm512_sub_ps(tmp[10], tmp[8]))); - _mm512_mask_storeu_ps(dst + 10 * stride, tail, _mm512_add_ps(_mm512_mul_ps(_2, _mm512_sub_ps(tmp[7], tmp[9])), _mm512_sub_ps(tmp[10], tmp[8]))); - _mm512_mask_storeu_ps(dst + 11 * stride, tail, _mm512_add_ps(_mm512_sub_ps(_mm512_mul_ps(_4, tmp[7]), _mm512_mul_ps(_5, tmp[9])), tmp[11])); - _mm512_mask_storeu_ps(dst + 12 * stride, tail, _mm512_add_ps(_mm512_sub_ps(_mm512_mul_ps(_4, tmp[12]), _mm512_mul_ps(_5, tmp[14])), tmp[16])); - _mm512_mask_storeu_ps(dst + 13 * stride, tail, _mm512_sub_ps(_mm512_add_ps(tmp[15], tmp[16]), _mm512_mul_ps(_4, _mm512_add_ps(tmp[13], tmp[14])))); - _mm512_mask_storeu_ps(dst + 14 * stride, tail, _mm512_add_ps(_mm512_mul_ps(_4, _mm512_sub_ps(tmp[13], tmp[14])), _mm512_sub_ps(tmp[16], tmp[15]))); - _mm512_mask_storeu_ps(dst + 15 * stride, tail, _mm512_add_ps(_mm512_mul_ps(_2, _mm512_sub_ps(tmp[15], tmp[13])), _mm512_sub_ps(tmp[16], tmp[14]))); - _mm512_mask_storeu_ps(dst + 16 * stride, tail, _mm512_add_ps(_mm512_mul_ps(_2, _mm512_sub_ps(tmp[13], tmp[15])), _mm512_sub_ps(tmp[16], tmp[14]))); - _mm512_mask_storeu_ps(dst + 17 * stride, tail, _mm512_add_ps(_mm512_sub_ps(_mm512_mul_ps(_4, tmp[13]), _mm512_mul_ps(_5, tmp[15])), tmp[17])); - _mm512_mask_storeu_ps(dst + 18 * stride, tail, _mm512_add_ps(_mm512_sub_ps(_mm512_mul_ps(_4, tmp[18]), _mm512_mul_ps(_5, tmp[20])), tmp[22])); - _mm512_mask_storeu_ps(dst + 19 * stride, tail, _mm512_sub_ps(_mm512_add_ps(tmp[21], tmp[22]), _mm512_mul_ps(_4, _mm512_add_ps(tmp[19], tmp[20])))); - _mm512_mask_storeu_ps(dst + 20 * stride, tail, _mm512_add_ps(_mm512_mul_ps(_4, 
_mm512_sub_ps(tmp[19], tmp[20])), _mm512_sub_ps(tmp[22], tmp[21]))); - _mm512_mask_storeu_ps(dst + 21 * stride, tail, _mm512_add_ps(_mm512_mul_ps(_2, _mm512_sub_ps(tmp[21], tmp[19])), _mm512_sub_ps(tmp[22], tmp[20]))); - _mm512_mask_storeu_ps(dst + 22 * stride, tail, _mm512_add_ps(_mm512_mul_ps(_2, _mm512_sub_ps(tmp[19], tmp[21])), _mm512_sub_ps(tmp[22], tmp[20]))); - _mm512_mask_storeu_ps(dst + 23 * stride, tail, _mm512_add_ps(_mm512_sub_ps(_mm512_mul_ps(_4, tmp[19]), _mm512_mul_ps(_5, tmp[21])), tmp[23])); - _mm512_mask_storeu_ps(dst + 24 * stride, tail, _mm512_add_ps(_mm512_sub_ps(_mm512_mul_ps(_4, tmp[24]), _mm512_mul_ps(_5, tmp[26])), tmp[28])); - _mm512_mask_storeu_ps(dst + 25 * stride, tail, _mm512_sub_ps(_mm512_add_ps(tmp[27], tmp[28]), _mm512_mul_ps(_4, _mm512_add_ps(tmp[25], tmp[26])))); - _mm512_mask_storeu_ps(dst + 26 * stride, tail, _mm512_add_ps(_mm512_mul_ps(_4, _mm512_sub_ps(tmp[25], tmp[26])), _mm512_sub_ps(tmp[28], tmp[27]))); - _mm512_mask_storeu_ps(dst + 27 * stride, tail, _mm512_add_ps(_mm512_mul_ps(_2, _mm512_sub_ps(tmp[27], tmp[25])), _mm512_sub_ps(tmp[28], tmp[26]))); - _mm512_mask_storeu_ps(dst + 28 * stride, tail, _mm512_add_ps(_mm512_mul_ps(_2, _mm512_sub_ps(tmp[25], tmp[27])), _mm512_sub_ps(tmp[28], tmp[26]))); - _mm512_mask_storeu_ps(dst + 29 * stride, tail, _mm512_add_ps(_mm512_sub_ps(_mm512_mul_ps(_4, tmp[25]), _mm512_mul_ps(_5, tmp[27])), tmp[29])); - _mm512_mask_storeu_ps(dst + 30 * stride, tail, _mm512_add_ps(_mm512_sub_ps(_mm512_mul_ps(_4, tmp[30]), _mm512_mul_ps(_5, tmp[32])), tmp[34])); - _mm512_mask_storeu_ps(dst + 31 * stride, tail, _mm512_sub_ps(_mm512_add_ps(tmp[33], tmp[34]), _mm512_mul_ps(_4, _mm512_add_ps(tmp[31], tmp[32])))); - _mm512_mask_storeu_ps(dst + 32 * stride, tail, _mm512_add_ps(_mm512_mul_ps(_4, _mm512_sub_ps(tmp[31], tmp[32])), _mm512_sub_ps(tmp[34], tmp[33]))); - _mm512_mask_storeu_ps(dst + 33 * stride, tail, _mm512_add_ps(_mm512_mul_ps(_2, _mm512_sub_ps(tmp[33], tmp[31])), _mm512_sub_ps(tmp[34], tmp[32]))); - _mm512_mask_storeu_ps(dst + 34 * stride, tail, _mm512_add_ps(_mm512_mul_ps(_2, _mm512_sub_ps(tmp[31], tmp[33])), _mm512_sub_ps(tmp[34], tmp[32]))); - _mm512_mask_storeu_ps(dst + 35 * stride, tail, _mm512_add_ps(_mm512_sub_ps(_mm512_mul_ps(_4, tmp[31]), _mm512_mul_ps(_5, tmp[33])), tmp[35])); - } - - SIMD_INLINE void WinogradKernel3x3Block4x4SetInput16t(const float * src, size_t srcS, size_t srcC, __m512 dst[36], __mmask16 tail = -1) - { - dst[0] = _mm512_maskz_loadu_ps(tail, src + 0 * srcS + 0 * srcC); - dst[1] = _mm512_maskz_loadu_ps(tail, src + 0 * srcS + 1 * srcC); - dst[2] = _mm512_maskz_loadu_ps(tail, src + 0 * srcS + 2 * srcC); - dst[3] = _mm512_maskz_loadu_ps(tail, src + 0 * srcS + 3 * srcC); - dst[4] = _mm512_maskz_loadu_ps(tail, src + 0 * srcS + 4 * srcC); - dst[5] = _mm512_maskz_loadu_ps(tail, src + 0 * srcS + 5 * srcC); - dst[6] = _mm512_maskz_loadu_ps(tail, src + 1 * srcS + 0 * srcC); - dst[7] = _mm512_maskz_loadu_ps(tail, src + 1 * srcS + 1 * srcC); - dst[8] = _mm512_maskz_loadu_ps(tail, src + 1 * srcS + 2 * srcC); - dst[9] = _mm512_maskz_loadu_ps(tail, src + 1 * srcS + 3 * srcC); - dst[10] = _mm512_maskz_loadu_ps(tail, src + 1 * srcS + 4 * srcC); - dst[11] = _mm512_maskz_loadu_ps(tail, src + 1 * srcS + 5 * srcC); - dst[12] = _mm512_maskz_loadu_ps(tail, src + 2 * srcS + 0 * srcC); - dst[13] = _mm512_maskz_loadu_ps(tail, src + 2 * srcS + 1 * srcC); - dst[14] = _mm512_maskz_loadu_ps(tail, src + 2 * srcS + 2 * srcC); - dst[15] = _mm512_maskz_loadu_ps(tail, src + 2 * srcS + 3 * srcC); - dst[16] = 
_mm512_maskz_loadu_ps(tail, src + 2 * srcS + 4 * srcC); - dst[17] = _mm512_maskz_loadu_ps(tail, src + 2 * srcS + 5 * srcC); - dst[18] = _mm512_maskz_loadu_ps(tail, src + 3 * srcS + 0 * srcC); - dst[19] = _mm512_maskz_loadu_ps(tail, src + 3 * srcS + 1 * srcC); - dst[20] = _mm512_maskz_loadu_ps(tail, src + 3 * srcS + 2 * srcC); - dst[21] = _mm512_maskz_loadu_ps(tail, src + 3 * srcS + 3 * srcC); - dst[22] = _mm512_maskz_loadu_ps(tail, src + 3 * srcS + 4 * srcC); - dst[23] = _mm512_maskz_loadu_ps(tail, src + 3 * srcS + 5 * srcC); - dst[24] = _mm512_maskz_loadu_ps(tail, src + 4 * srcS + 0 * srcC); - dst[25] = _mm512_maskz_loadu_ps(tail, src + 4 * srcS + 1 * srcC); - dst[26] = _mm512_maskz_loadu_ps(tail, src + 4 * srcS + 2 * srcC); - dst[27] = _mm512_maskz_loadu_ps(tail, src + 4 * srcS + 3 * srcC); - dst[28] = _mm512_maskz_loadu_ps(tail, src + 4 * srcS + 4 * srcC); - dst[29] = _mm512_maskz_loadu_ps(tail, src + 4 * srcS + 5 * srcC); - dst[30] = _mm512_maskz_loadu_ps(tail, src + 5 * srcS + 0 * srcC); - dst[31] = _mm512_maskz_loadu_ps(tail, src + 5 * srcS + 1 * srcC); - dst[32] = _mm512_maskz_loadu_ps(tail, src + 5 * srcS + 2 * srcC); - dst[33] = _mm512_maskz_loadu_ps(tail, src + 5 * srcS + 3 * srcC); - dst[34] = _mm512_maskz_loadu_ps(tail, src + 5 * srcS + 4 * srcC); - dst[35] = _mm512_maskz_loadu_ps(tail, src + 5 * srcS + 5 * srcC); - } - - SIMD_INLINE void WinogradKernel3x3Block4x4SetInput16t(const float * src, size_t srcW, size_t srcC, float * dst, size_t dstStride) - { - size_t srcS = srcW * srcC; - size_t srcCF = AlignLo(srcC, F); - size_t c = 0; - for (; c < srcCF; c += F) - { - __m512 tmp[36]; - WinogradKernel3x3Block4x4SetInput16t(src + c, srcS, srcC, tmp); - WinogradKernel3x3Block4x4SetInput16Store(tmp, dst + c, dstStride); - } - if (c < srcC) - { - __mmask16 tail = TailMask16(srcC - c); - __m512 tmp[36]; - WinogradKernel3x3Block4x4SetInput16t(src + c, srcS, srcC, tmp, tail); - WinogradKernel3x3Block4x4SetInput16Store(tmp, dst + c, dstStride, tail); - } - } - - SIMD_INLINE void WinogradKernel3x3Block4x4SetInput16t(const float * src, size_t srcS, size_t srcC, size_t rowB, size_t rowE, size_t colB, size_t colE, __m512 * dst, __mmask16 tail = -1) - { - for (size_t row = 0; row < rowB; ++row) - { - for (size_t col = 0; col < 6; ++col) - dst[col] = _mm512_setzero_ps(); - dst += 6; - } - for (size_t row = rowB; row < rowE; ++row) - { - for (size_t col = 0; col < colB; ++col) - dst[col] = _mm512_setzero_ps(); - for (size_t col = colB; col < colE; ++col) - dst[col] = _mm512_maskz_loadu_ps(tail, src + row * srcS + col * srcC); - for (size_t col = colE; col < 6; ++col) - dst[col] = _mm512_setzero_ps(); - dst += 6; - } - for (size_t row = rowE; row < 6; ++row) - { - for (size_t col = 0; col < 6; ++col) - dst[col] = _mm512_setzero_ps(); - dst += 6; - } - } - - SIMD_INLINE void WinogradKernel3x3Block4x4SetInput16t(const float * src, size_t srcW, size_t srcC, size_t rowB, size_t rowE, size_t colB, size_t colE, float * dst, size_t dstStride) - { - size_t srcS = srcW * srcC; - size_t srcCF = AlignLo(srcC, F); - size_t c = 0; - for (; c < srcCF; c += F) - { - __m512 tmp[36]; - WinogradKernel3x3Block4x4SetInput16t(src + c, srcS, srcC, rowB, rowE, colB, colE, tmp); - WinogradKernel3x3Block4x4SetInput16Store(tmp, dst + c, dstStride); - } - if (c < srcC) - { - __mmask16 tail = TailMask16(srcC - c); - __m512 tmp[36]; - WinogradKernel3x3Block4x4SetInput16t(src + c, srcS, srcC, rowB, rowE, colB, colE, tmp, tail); - WinogradKernel3x3Block4x4SetInput16Store(tmp, dst + c, dstStride, tail); - } - } - - void 
WinogradKernel3x3Block4x4SetInput(const float* src, size_t srcChannels, size_t srcHeight, size_t srcWidth, - size_t padY, size_t padX, size_t padH, size_t padW, float* dst, size_t dstStride, SimdBool trans) - { - if (trans ? (false) : (srcHeight < 6 || srcWidth < 14)) - { - Avx::WinogradKernel3x3Block4x4SetInput(src, srcChannels, srcHeight, srcWidth, padY, padX, padH, padW, dst, dstStride, trans); - return; - } - if (trans) - { - assert(padY + padH <= 2 && padX + padW <= 2); - size_t dstH = srcHeight - 2 + padY + padH; - size_t dstW = srcWidth - 2 + padX + padW; - size_t dstH4 = dstH / 4 * 4; - size_t dstW4 = dstW / 4 * 4; - size_t noseW = Simd::Min(6, srcWidth + padX); - size_t noseH = Simd::Min(6, srcHeight + padY); - size_t startY = padY ? 4 : 0; - size_t startX = padX ? 4 : 0; - if (padH && dstH == dstH4) - dstH4 -= 4; - if (padY) - src -= srcWidth * srcChannels; - if (padW && dstW == dstW4) - dstW4 -= 4; - if (padX) - src -= srcChannels; - size_t tailW = dstW - dstW4 + (padW ? 1 : 2); - size_t tailH = dstH - dstH4 + (padH ? 1 : 2); - size_t row = 0, col = 0; - if (padY) - { - if (padX) - WinogradKernel3x3Block4x4SetInput16t(src, srcWidth, srcChannels, 1, noseH, 1, noseW, dst, dstStride), dst += srcChannels; - for (col = startX; col < dstW4; col += 4) - WinogradKernel3x3Block4x4SetInput16t(src + col * srcChannels, srcWidth, srcChannels, 1, noseH, 0, 6, dst, dstStride), dst += srcChannels; - if (col < dstW) - WinogradKernel3x3Block4x4SetInput16t(src + col * srcChannels, srcWidth, srcChannels, 1, noseH, 0, tailW, dst, dstStride), dst += srcChannels; - } - for (row = startY; row < dstH4; row += 4) - { - if (padX) - WinogradKernel3x3Block4x4SetInput16t(src + row * srcWidth * srcChannels, srcWidth, srcChannels, 0, 6, 1, noseW, dst, dstStride), dst += srcChannels; - for (col = startX; col < dstW4; col += 4) - WinogradKernel3x3Block4x4SetInput16t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, dst, dstStride), dst += srcChannels; - if (col < dstW) - WinogradKernel3x3Block4x4SetInput16t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, 0, 6, 0, tailW, dst, dstStride), dst += srcChannels; - } - if (row < dstH) - { - if (padX) - WinogradKernel3x3Block4x4SetInput16t(src + row * srcWidth* srcChannels, srcWidth, srcChannels, 0, tailH, 1, noseW, dst, dstStride), dst += srcChannels; - for (col = startX; col < dstW4; col += 4) - WinogradKernel3x3Block4x4SetInput16t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, 0, tailH, 0, 6, dst, dstStride), dst += srcChannels; - if (col < dstW) - WinogradKernel3x3Block4x4SetInput16t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, 0, tailH, 0, tailW, dst, dstStride), dst += srcChannels; - } - } - else - { - Base::WinogradKernel3x3Block4x4SetInput(src, srcChannels, srcHeight, srcWidth, padY, padX, padH, padW, dst, dstStride, trans); - } - } - - //----------------------------------------------------------------------- - - SIMD_INLINE void WinogradKernel3x3Block4x4SetOutputLoad36(const float * src, size_t stride, __m512 dst[16], __mmask16 tail = -1) - { - __m512 s[36]; - s[0] = _mm512_maskz_loadu_ps(tail, src + 0 * stride); - s[1] = _mm512_maskz_loadu_ps(tail, src + 1 * stride); - s[2] = _mm512_maskz_loadu_ps(tail, src + 2 * stride); - s[3] = _mm512_maskz_loadu_ps(tail, src + 3 * stride); - s[4] = _mm512_maskz_loadu_ps(tail, src + 4 * stride); - s[5] = _mm512_maskz_loadu_ps(tail, src + 5 * stride); - s[6] = _mm512_maskz_loadu_ps(tail, src + 6 * stride); - s[7] = _mm512_maskz_loadu_ps(tail, src + 
7 * stride); - s[8] = _mm512_maskz_loadu_ps(tail, src + 8 * stride); - s[9] = _mm512_maskz_loadu_ps(tail, src + 9 * stride); - s[10] = _mm512_maskz_loadu_ps(tail, src + 10 * stride); - s[11] = _mm512_maskz_loadu_ps(tail, src + 11 * stride); - s[12] = _mm512_maskz_loadu_ps(tail, src + 12 * stride); - s[13] = _mm512_maskz_loadu_ps(tail, src + 13 * stride); - s[14] = _mm512_maskz_loadu_ps(tail, src + 14 * stride); - s[15] = _mm512_maskz_loadu_ps(tail, src + 15 * stride); - s[16] = _mm512_maskz_loadu_ps(tail, src + 16 * stride); - s[17] = _mm512_maskz_loadu_ps(tail, src + 17 * stride); - s[18] = _mm512_maskz_loadu_ps(tail, src + 18 * stride); - s[19] = _mm512_maskz_loadu_ps(tail, src + 19 * stride); - s[20] = _mm512_maskz_loadu_ps(tail, src + 20 * stride); - s[21] = _mm512_maskz_loadu_ps(tail, src + 21 * stride); - s[22] = _mm512_maskz_loadu_ps(tail, src + 22 * stride); - s[23] = _mm512_maskz_loadu_ps(tail, src + 23 * stride); - s[24] = _mm512_maskz_loadu_ps(tail, src + 24 * stride); - s[25] = _mm512_maskz_loadu_ps(tail, src + 25 * stride); - s[26] = _mm512_maskz_loadu_ps(tail, src + 26 * stride); - s[27] = _mm512_maskz_loadu_ps(tail, src + 27 * stride); - s[28] = _mm512_maskz_loadu_ps(tail, src + 28 * stride); - s[29] = _mm512_maskz_loadu_ps(tail, src + 29 * stride); - s[30] = _mm512_maskz_loadu_ps(tail, src + 30 * stride); - s[31] = _mm512_maskz_loadu_ps(tail, src + 31 * stride); - s[32] = _mm512_maskz_loadu_ps(tail, src + 32 * stride); - s[33] = _mm512_maskz_loadu_ps(tail, src + 33 * stride); - s[34] = _mm512_maskz_loadu_ps(tail, src + 34 * stride); - s[35] = _mm512_maskz_loadu_ps(tail, src + 35 * stride); - - __m512 _2 = _mm512_set1_ps(2.0f); - __m512 _4 = _mm512_set1_ps(4.0f); - __m512 _8 = _mm512_set1_ps(8.0f); - __m512 t[24]; - t[0] = _mm512_add_ps(_mm512_add_ps(_mm512_add_ps(s[0], s[6]), _mm512_add_ps(s[12], s[18])), s[24]); - t[1] = _mm512_add_ps(_mm512_add_ps(_mm512_add_ps(s[1], s[7]), _mm512_add_ps(s[13], s[19])), s[25]); - t[2] = _mm512_add_ps(_mm512_add_ps(_mm512_add_ps(s[2], s[8]), _mm512_add_ps(s[14], s[20])), s[26]); - t[3] = _mm512_add_ps(_mm512_add_ps(_mm512_add_ps(s[3], s[9]), _mm512_add_ps(s[15], s[21])), s[27]); - t[4] = _mm512_add_ps(_mm512_add_ps(_mm512_add_ps(s[4], s[10]), _mm512_add_ps(s[16], s[22])), s[28]); - t[5] = _mm512_add_ps(_mm512_add_ps(_mm512_add_ps(s[5], s[11]), _mm512_add_ps(s[17], s[23])), s[29]); - t[6] = _mm512_add_ps(_mm512_sub_ps(s[6], s[12]), _mm512_mul_ps(_2, _mm512_sub_ps(s[18], s[24]))); - t[7] = _mm512_add_ps(_mm512_sub_ps(s[7], s[13]), _mm512_mul_ps(_2, _mm512_sub_ps(s[19], s[25]))); - t[8] = _mm512_add_ps(_mm512_sub_ps(s[8], s[14]), _mm512_mul_ps(_2, _mm512_sub_ps(s[20], s[26]))); - t[9] = _mm512_add_ps(_mm512_sub_ps(s[9], s[15]), _mm512_mul_ps(_2, _mm512_sub_ps(s[21], s[27]))); - t[10] = _mm512_add_ps(_mm512_sub_ps(s[10], s[16]), _mm512_mul_ps(_2, _mm512_sub_ps(s[22], s[28]))); - t[11] = _mm512_add_ps(_mm512_sub_ps(s[11], s[17]), _mm512_mul_ps(_2, _mm512_sub_ps(s[23], s[29]))); - t[12] = _mm512_add_ps(_mm512_add_ps(s[6], s[12]), _mm512_mul_ps(_4, _mm512_add_ps(s[18], s[24]))); - t[13] = _mm512_add_ps(_mm512_add_ps(s[7], s[13]), _mm512_mul_ps(_4, _mm512_add_ps(s[19], s[25]))); - t[14] = _mm512_add_ps(_mm512_add_ps(s[8], s[14]), _mm512_mul_ps(_4, _mm512_add_ps(s[20], s[26]))); - t[15] = _mm512_add_ps(_mm512_add_ps(s[9], s[15]), _mm512_mul_ps(_4, _mm512_add_ps(s[21], s[27]))); - t[16] = _mm512_add_ps(_mm512_add_ps(s[10], s[16]), _mm512_mul_ps(_4, _mm512_add_ps(s[22], s[28]))); - t[17] = _mm512_add_ps(_mm512_add_ps(s[11], s[17]), _mm512_mul_ps(_4, 
_mm512_add_ps(s[23], s[29]))); - t[18] = _mm512_add_ps(_mm512_add_ps(_mm512_sub_ps(s[6], s[12]), _mm512_mul_ps(_8, _mm512_sub_ps(s[18], s[24]))), s[30]); - t[19] = _mm512_add_ps(_mm512_add_ps(_mm512_sub_ps(s[7], s[13]), _mm512_mul_ps(_8, _mm512_sub_ps(s[19], s[25]))), s[31]); - t[20] = _mm512_add_ps(_mm512_add_ps(_mm512_sub_ps(s[8], s[14]), _mm512_mul_ps(_8, _mm512_sub_ps(s[20], s[26]))), s[32]); - t[21] = _mm512_add_ps(_mm512_add_ps(_mm512_sub_ps(s[9], s[15]), _mm512_mul_ps(_8, _mm512_sub_ps(s[21], s[27]))), s[33]); - t[22] = _mm512_add_ps(_mm512_add_ps(_mm512_sub_ps(s[10], s[16]), _mm512_mul_ps(_8, _mm512_sub_ps(s[22], s[28]))), s[34]); - t[23] = _mm512_add_ps(_mm512_add_ps(_mm512_sub_ps(s[11], s[17]), _mm512_mul_ps(_8, _mm512_sub_ps(s[23], s[29]))), s[35]); - - dst[0] = _mm512_add_ps(_mm512_add_ps(_mm512_add_ps(t[0], t[1]), _mm512_add_ps(t[2], t[3])), t[4]); - dst[1] = _mm512_add_ps(_mm512_sub_ps(t[1], t[2]), _mm512_mul_ps(_2, _mm512_sub_ps(t[3], t[4]))); - dst[2] = _mm512_add_ps(_mm512_add_ps(t[1], t[2]), _mm512_mul_ps(_4, _mm512_add_ps(t[3], t[4]))); - dst[3] = _mm512_add_ps(_mm512_add_ps(_mm512_sub_ps(t[1], t[2]), _mm512_mul_ps(_8, _mm512_sub_ps(t[3], t[4]))), t[5]); - dst[4] = _mm512_add_ps(_mm512_add_ps(_mm512_add_ps(t[6], t[7]), _mm512_add_ps(t[8], t[9])), t[10]); - dst[5] = _mm512_add_ps(_mm512_sub_ps(t[7], t[8]), _mm512_mul_ps(_2, _mm512_sub_ps(t[9], t[10]))); - dst[6] = _mm512_add_ps(_mm512_add_ps(t[7], t[8]), _mm512_mul_ps(_4, _mm512_add_ps(t[9], t[10]))); - dst[7] = _mm512_add_ps(_mm512_add_ps(_mm512_sub_ps(t[7], t[8]), _mm512_mul_ps(_8, _mm512_sub_ps(t[9], t[10]))), t[11]); - dst[8] = _mm512_add_ps(_mm512_add_ps(_mm512_add_ps(t[12], t[13]), _mm512_add_ps(t[14], t[15])), t[16]); - dst[9] = _mm512_add_ps(_mm512_sub_ps(t[13], t[14]), _mm512_mul_ps(_2, _mm512_sub_ps(t[15], t[16]))); - dst[10] = _mm512_add_ps(_mm512_add_ps(t[13], t[14]), _mm512_mul_ps(_4, _mm512_add_ps(t[15], t[16]))); - dst[11] = _mm512_add_ps(_mm512_add_ps(_mm512_sub_ps(t[13], t[14]), _mm512_mul_ps(_8, _mm512_sub_ps(t[15], t[16]))), t[17]); - dst[12] = _mm512_add_ps(_mm512_add_ps(_mm512_add_ps(t[18], t[19]), _mm512_add_ps(t[20], t[21])), t[22]); - dst[13] = _mm512_add_ps(_mm512_sub_ps(t[19], t[20]), _mm512_mul_ps(_2, _mm512_sub_ps(t[21], t[22]))); - dst[14] = _mm512_add_ps(_mm512_add_ps(t[19], t[20]), _mm512_mul_ps(_4, _mm512_add_ps(t[21], t[22]))); - dst[15] = _mm512_add_ps(_mm512_add_ps(_mm512_sub_ps(t[19], t[20]), _mm512_mul_ps(_8, _mm512_sub_ps(t[21], t[22]))), t[23]); - } - - SIMD_INLINE void WinogradKernel3x3Block4x4SetOutputStore16(const __m512 src[16], float * dst, size_t dstS, size_t dstC, __mmask16 tail = -1) - { - _mm512_mask_storeu_ps(dst + 0 * dstS + 0 * dstC, tail, src[0]); - _mm512_mask_storeu_ps(dst + 0 * dstS + 1 * dstC, tail, src[1]); - _mm512_mask_storeu_ps(dst + 0 * dstS + 2 * dstC, tail, src[2]); - _mm512_mask_storeu_ps(dst + 0 * dstS + 3 * dstC, tail, src[3]); - _mm512_mask_storeu_ps(dst + 1 * dstS + 0 * dstC, tail, src[4]); - _mm512_mask_storeu_ps(dst + 1 * dstS + 1 * dstC, tail, src[5]); - _mm512_mask_storeu_ps(dst + 1 * dstS + 2 * dstC, tail, src[6]); - _mm512_mask_storeu_ps(dst + 1 * dstS + 3 * dstC, tail, src[7]); - _mm512_mask_storeu_ps(dst + 2 * dstS + 0 * dstC, tail, src[8]); - _mm512_mask_storeu_ps(dst + 2 * dstS + 1 * dstC, tail, src[9]); - _mm512_mask_storeu_ps(dst + 2 * dstS + 2 * dstC, tail, src[10]); - _mm512_mask_storeu_ps(dst + 2 * dstS + 3 * dstC, tail, src[11]); - _mm512_mask_storeu_ps(dst + 3 * dstS + 0 * dstC, tail, src[12]); - _mm512_mask_storeu_ps(dst + 3 * dstS + 1 * 
dstC, tail, src[13]); - _mm512_mask_storeu_ps(dst + 3 * dstS + 2 * dstC, tail, src[14]); - _mm512_mask_storeu_ps(dst + 3 * dstS + 3 * dstC, tail, src[15]); - } - - SIMD_INLINE void WinogradKernel3x3Block4x4SetOutput16tSaveRow(const __m512 * t, float * dst, size_t dstC, __mmask16 tail) - { - _mm512_mask_storeu_ps(dst + 0 * dstC, tail, _mm512_add_ps(_mm512_add_ps(_mm512_add_ps(t[0], t[1]), _mm512_add_ps(t[2], t[3])), t[4])); - _mm512_mask_storeu_ps(dst + 1 * dstC, tail, _mm512_add_ps(_mm512_sub_ps(t[1], t[2]), _mm512_mul_ps(_mm512_set1_ps(2.0f), _mm512_sub_ps(t[3], t[4])))); - _mm512_mask_storeu_ps(dst + 2 * dstC, tail, _mm512_add_ps(_mm512_add_ps(t[1], t[2]), _mm512_mul_ps(_mm512_set1_ps(4.0f), _mm512_add_ps(t[3], t[4])))); - _mm512_mask_storeu_ps(dst + 3 * dstC, tail, _mm512_add_ps(_mm512_add_ps(_mm512_sub_ps(t[1], t[2]), _mm512_mul_ps(_mm512_set1_ps(8.0f), _mm512_sub_ps(t[3], t[4]))), t[5])); - } - - SIMD_INLINE void WinogradKernel3x3Block4x4SetOutput16t(const float * src, size_t srcStride, float * dst, size_t dstS, size_t dstC, size_t rowE, __mmask16 tail = -1) - { - __m512 s[36], t[6]; - s[6] = _mm512_maskz_loadu_ps(tail, src + 6 * srcStride); - s[7] = _mm512_maskz_loadu_ps(tail, src + 7 * srcStride); - s[8] = _mm512_maskz_loadu_ps(tail, src + 8 * srcStride); - s[9] = _mm512_maskz_loadu_ps(tail, src + 9 * srcStride); - s[10] = _mm512_maskz_loadu_ps(tail, src + 10 * srcStride); - s[11] = _mm512_maskz_loadu_ps(tail, src + 11 * srcStride); - s[12] = _mm512_maskz_loadu_ps(tail, src + 12 * srcStride); - s[13] = _mm512_maskz_loadu_ps(tail, src + 13 * srcStride); - s[14] = _mm512_maskz_loadu_ps(tail, src + 14 * srcStride); - s[15] = _mm512_maskz_loadu_ps(tail, src + 15 * srcStride); - s[16] = _mm512_maskz_loadu_ps(tail, src + 16 * srcStride); - s[17] = _mm512_maskz_loadu_ps(tail, src + 17 * srcStride); - s[18] = _mm512_maskz_loadu_ps(tail, src + 18 * srcStride); - s[19] = _mm512_maskz_loadu_ps(tail, src + 19 * srcStride); - s[20] = _mm512_maskz_loadu_ps(tail, src + 20 * srcStride); - s[21] = _mm512_maskz_loadu_ps(tail, src + 21 * srcStride); - s[22] = _mm512_maskz_loadu_ps(tail, src + 22 * srcStride); - s[23] = _mm512_maskz_loadu_ps(tail, src + 23 * srcStride); - s[24] = _mm512_maskz_loadu_ps(tail, src + 24 * srcStride); - s[25] = _mm512_maskz_loadu_ps(tail, src + 25 * srcStride); - s[26] = _mm512_maskz_loadu_ps(tail, src + 26 * srcStride); - s[27] = _mm512_maskz_loadu_ps(tail, src + 27 * srcStride); - s[28] = _mm512_maskz_loadu_ps(tail, src + 28 * srcStride); - s[29] = _mm512_maskz_loadu_ps(tail, src + 29 * srcStride); - - t[0] = _mm512_add_ps(_mm512_add_ps(_mm512_add_ps(_mm512_maskz_loadu_ps(tail, src + 0 * srcStride), s[6]), _mm512_add_ps(s[12], s[18])), s[24]); - t[1] = _mm512_add_ps(_mm512_add_ps(_mm512_add_ps(_mm512_maskz_loadu_ps(tail, src + 1 * srcStride), s[7]), _mm512_add_ps(s[13], s[19])), s[25]); - t[2] = _mm512_add_ps(_mm512_add_ps(_mm512_add_ps(_mm512_maskz_loadu_ps(tail, src + 2 * srcStride), s[8]), _mm512_add_ps(s[14], s[20])), s[26]); - t[3] = _mm512_add_ps(_mm512_add_ps(_mm512_add_ps(_mm512_maskz_loadu_ps(tail, src + 3 * srcStride), s[9]), _mm512_add_ps(s[15], s[21])), s[27]); - t[4] = _mm512_add_ps(_mm512_add_ps(_mm512_add_ps(_mm512_maskz_loadu_ps(tail, src + 4 * srcStride), s[10]), _mm512_add_ps(s[16], s[22])), s[28]); - t[5] = _mm512_add_ps(_mm512_add_ps(_mm512_add_ps(_mm512_maskz_loadu_ps(tail, src + 5 * srcStride), s[11]), _mm512_add_ps(s[17], s[23])), s[29]); - WinogradKernel3x3Block4x4SetOutput16tSaveRow(t, dst, dstC, tail); - dst += dstS; if (rowE == 1) return; - - 
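If I am reading the constants correctly, the four rows emitted by this function are the rows of the F(4x4, 3x3) inverse (output) transform

\[
A^{T} = \begin{pmatrix}
 1 & 1 & 1 & 1 & 1 & 0 \\
 0 & 1 & -1 & 2 & -2 & 0 \\
 0 & 1 & 1 & 4 & 4 & 0 \\
 0 & 1 & -1 & 8 & -8 & 1
\end{pmatrix}
\]

applied down the six rows of the 6x6 tile, with WinogradKernel3x3Block4x4SetOutput16tSaveRow applying the same weights across columns. Output row 0 (the all-ones row) was stored just above; the early returns after each remaining row below are what clip partial blocks at the bottom image border to rowE rows.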
__m512 _2 = _mm512_set1_ps(2.0f); - t[0] = _mm512_add_ps(_mm512_sub_ps(s[6], s[12]), _mm512_mul_ps(_2, _mm512_sub_ps(s[18], s[24]))); - t[1] = _mm512_add_ps(_mm512_sub_ps(s[7], s[13]), _mm512_mul_ps(_2, _mm512_sub_ps(s[19], s[25]))); - t[2] = _mm512_add_ps(_mm512_sub_ps(s[8], s[14]), _mm512_mul_ps(_2, _mm512_sub_ps(s[20], s[26]))); - t[3] = _mm512_add_ps(_mm512_sub_ps(s[9], s[15]), _mm512_mul_ps(_2, _mm512_sub_ps(s[21], s[27]))); - t[4] = _mm512_add_ps(_mm512_sub_ps(s[10], s[16]), _mm512_mul_ps(_2, _mm512_sub_ps(s[22], s[28]))); - t[5] = _mm512_add_ps(_mm512_sub_ps(s[11], s[17]), _mm512_mul_ps(_2, _mm512_sub_ps(s[23], s[29]))); - WinogradKernel3x3Block4x4SetOutput16tSaveRow(t, dst, dstC, tail); - dst += dstS; if (rowE == 2) return; - - __m512 _4 = _mm512_set1_ps(4.0f); - t[0] = _mm512_add_ps(_mm512_add_ps(s[6], s[12]), _mm512_mul_ps(_4, _mm512_add_ps(s[18], s[24]))); - t[1] = _mm512_add_ps(_mm512_add_ps(s[7], s[13]), _mm512_mul_ps(_4, _mm512_add_ps(s[19], s[25]))); - t[2] = _mm512_add_ps(_mm512_add_ps(s[8], s[14]), _mm512_mul_ps(_4, _mm512_add_ps(s[20], s[26]))); - t[3] = _mm512_add_ps(_mm512_add_ps(s[9], s[15]), _mm512_mul_ps(_4, _mm512_add_ps(s[21], s[27]))); - t[4] = _mm512_add_ps(_mm512_add_ps(s[10], s[16]), _mm512_mul_ps(_4, _mm512_add_ps(s[22], s[28]))); - t[5] = _mm512_add_ps(_mm512_add_ps(s[11], s[17]), _mm512_mul_ps(_4, _mm512_add_ps(s[23], s[29]))); - WinogradKernel3x3Block4x4SetOutput16tSaveRow(t, dst, dstC, tail); - dst += dstS; if (rowE == 3) return; - - __m512 _8 = _mm512_set1_ps(8.0f); - t[0] = _mm512_add_ps(_mm512_add_ps(_mm512_sub_ps(s[6], s[12]), _mm512_mul_ps(_8, _mm512_sub_ps(s[18], s[24]))), _mm512_maskz_loadu_ps(tail, src + 30 * srcStride)); - t[1] = _mm512_add_ps(_mm512_add_ps(_mm512_sub_ps(s[7], s[13]), _mm512_mul_ps(_8, _mm512_sub_ps(s[19], s[25]))), _mm512_maskz_loadu_ps(tail, src + 31 * srcStride)); - t[2] = _mm512_add_ps(_mm512_add_ps(_mm512_sub_ps(s[8], s[14]), _mm512_mul_ps(_8, _mm512_sub_ps(s[20], s[26]))), _mm512_maskz_loadu_ps(tail, src + 32 * srcStride)); - t[3] = _mm512_add_ps(_mm512_add_ps(_mm512_sub_ps(s[9], s[15]), _mm512_mul_ps(_8, _mm512_sub_ps(s[21], s[27]))), _mm512_maskz_loadu_ps(tail, src + 33 * srcStride)); - t[4] = _mm512_add_ps(_mm512_add_ps(_mm512_sub_ps(s[10], s[16]), _mm512_mul_ps(_8, _mm512_sub_ps(s[22], s[28]))), _mm512_maskz_loadu_ps(tail, src + 34 * srcStride)); - t[5] = _mm512_add_ps(_mm512_add_ps(_mm512_sub_ps(s[11], s[17]), _mm512_mul_ps(_8, _mm512_sub_ps(s[23], s[29]))), _mm512_maskz_loadu_ps(tail, src + 35 * srcStride)); - WinogradKernel3x3Block4x4SetOutput16tSaveRow(t, dst, dstC, tail); - } - - SIMD_INLINE void WinogradKernel3x3Block4x4SetOutputT(const float * src, size_t srcStride, float * dst, size_t dstW, size_t dstC, size_t rowE) - { - size_t dstS = dstW * dstC, dstCF = AlignLo(dstC, F), d = 0; - for (; d < dstCF; d += F) - WinogradKernel3x3Block4x4SetOutput16t(src + d, srcStride, dst + d, dstS, dstC, rowE); - if (d < dstC) - WinogradKernel3x3Block4x4SetOutput16t(src + d, srcStride, dst + d, dstS, dstC, rowE, TailMask16(dstC - d)); - } - - SIMD_INLINE void WinogradKernel3x3Block4x4SetOutputStore16(const __m512 src[16], float * dst, size_t dstS, size_t dstC, size_t rowE, size_t colE, __mmask16 tail = -1) - { - for (size_t row = 0; row < rowE; ++row) - for (size_t col = 0; col < colE; ++col) - _mm512_mask_storeu_ps(dst + row * dstS + col * dstC, tail, src[row * 4 + col]); - } - - SIMD_INLINE void WinogradKernel3x3Block4x4SetOutputT(const float * src, size_t srcStride, float * dst, size_t dstW, size_t dstC, size_t rowE, size_t 
colE) - { - size_t dstS = dstW * dstC, dstCF = AlignLo(dstC, F), d = 0; - for (; d < dstCF; d += F) - { - __m512 tmp[16]; - WinogradKernel3x3Block4x4SetOutputLoad36(src + d, srcStride, tmp); - WinogradKernel3x3Block4x4SetOutputStore16(tmp, dst + d, dstS, dstC, rowE, colE); - } - if (d < dstC) - { - __mmask16 tail = TailMask16(dstC - d); - __m512 tmp[16]; - WinogradKernel3x3Block4x4SetOutputLoad36(src + d, srcStride, tmp, tail); - WinogradKernel3x3Block4x4SetOutputStore16(tmp, dst + d, dstS, dstC, rowE, colE, tail); - } - } - - void WinogradKernel3x3Block4x4SetOutput(const float * src, size_t srcStride, float * dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans) - { - if (trans ? (false) : (dstHeight < 4 || dstWidth < 16)) - { - Avx::WinogradKernel3x3Block4x4SetOutput(src, srcStride, dst, dstChannels, dstHeight, dstWidth, trans); - return; - } - size_t tileH = (dstHeight + 3) / 4; - size_t tileW = (dstWidth + 3) / 4; - size_t dstH4 = AlignLo(dstHeight, 4); - size_t dstW4 = AlignLo(dstWidth, 4); - if (trans) - { - size_t row, col; - for (row = 0; row < dstH4; row += 4) - { - for (col = 0; col < dstW4; col += 4) - WinogradKernel3x3Block4x4SetOutputT(src, srcStride, dst + (row * dstWidth + col)*dstChannels, dstWidth, dstChannels, 4), src += dstChannels; - if (col < dstWidth) - WinogradKernel3x3Block4x4SetOutputT(src, srcStride, dst + (row * dstWidth + col)*dstChannels, dstWidth, dstChannels, 4, dstWidth - col), src += dstChannels; - } - if (row < dstHeight) - { - for (col = 0; col < dstW4; col += 4) - WinogradKernel3x3Block4x4SetOutputT(src, srcStride, dst + (row * dstWidth + col)*dstChannels, dstWidth, dstChannels, dstHeight - row), src += dstChannels; - if (col < dstWidth) - WinogradKernel3x3Block4x4SetOutputT(src, srcStride, dst + (row * dstWidth + col)*dstChannels, dstWidth, dstChannels, dstHeight - row, dstWidth - col), src += dstChannels; - } - } - else - { - Base::WinogradKernel3x3Block4x4SetOutput(src, srcStride, dst, dstChannels, dstHeight, dstWidth, trans); - } - } - } -#endif// SIMD_AVX512F_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdAvx512vnni.h b/src/3rd/Simd/Simd/SimdAvx512vnni.h deleted file mode 100644 index 43930559..00000000 --- a/src/3rd/Simd/Simd/SimdAvx512vnni.h +++ /dev/null @@ -1,37 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#ifndef __SimdAvx512vnni_h__ -#define __SimdAvx512vnni_h__ - -#include "Simd/SimdDefs.h" - -namespace Simd -{ -#ifdef SIMD_AVX512VNNI_ENABLE - namespace Avx512vnni - { - } -#endif// SIMD_AVX512VNNI_ENABLE -} -#endif//__SimdAvx512vnni_h__ diff --git a/src/3rd/Simd/Simd/SimdAvx512vnniSynetConvolution8i.cpp b/src/3rd/Simd/Simd/SimdAvx512vnniSynetConvolution8i.cpp deleted file mode 100644 index 10fc8a1a..00000000 --- a/src/3rd/Simd/Simd/SimdAvx512vnniSynetConvolution8i.cpp +++ /dev/null @@ -1,979 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdSynetConvolution8i.h" -#include "Simd/SimdSynetConvolution8iCommon.h" -#include "Simd/SimdSynet.h" -#include "Simd/SimdMath.h" -#include "Simd/SimdBase.h" -#include "Simd/SimdAvx512bw.h" -#include "Simd/SimdAvx512vnni.h" -#include "Simd/SimdCpu.h" - -namespace Simd -{ -#ifdef SIMD_AVX512VNNI_ENABLE - namespace Avx512vnni - { - using AlgParam = SynetConvolution8iNhwcDirect::AlgParam; - using ConvolutionPtr = SynetConvolution8iNhwcDirect::ConvolutionPtr; - using Term8iType = Base::SynetConvolution8iNhwcDirect::Term8iType; - - SIMD_INLINE __m512i Set4(const uint8_t* src) - { - return _mm512_set1_epi32(*(int32_t*)src); - } - - template void Madd4(__m512i& i32, __m512i u8, __m512i i8); - - template<> SIMD_INLINE void Madd4(__m512i& i32, __m512i u8, __m512i i8) - { - i32 = _mm512_add_epi32(i32, _mm512_madd_epi16(_mm512_maddubs_epi16(u8, i8), Avx512bw::K16_0001)); - } - - template<> SIMD_INLINE void Madd4(__m512i& i32, __m512i u8, __m512i i8) - { - i32 = _mm512_dpbusd_epi32(i32, u8, i8); - } - - template void ConvolutionNhwcDirect_2x1(const uint8_t * src0, - const ConvParam8i& p, const AlgParam & a, size_t dy, size_t dx, size_t srcC, size_t dstC, const int8_t * weight0, - const __m512i * bias, const __m512i * params, const __m512 * scale, const __m512* shift, int32_t * buf, uint8_t* dst) - { - __m512i d00, d01, s0, w0, w1; - size_t dW = (DivHi(p.srcC, 4) - DivHi(srcC, 4)) * A, dY = p.srcW * p.srcC, dX = p.srcC, dS = p.srcC * p.strideX, dWz = DivHi(srcC, 4) * A; - const int8_t* weight1 = weight0 + p.kernelY * p.kernelX * DivHi(p.srcC, 4) * A; - __m512i norm = _mm512_set1_epi32(a.norm); - size_t sy = dy * p.strideY - p.padY; - size_t sx = dx * p.strideX - p.padX; - size_t kY = p.kernelY * p.dilationY; - size_t kX = p.kernelX * p.dilationX; - if (dstC > F) - { - d00 = _mm512_setzero_si512(), d01 = _mm512_setzero_si512(); 
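The two Madd4 specializations above compute the same 4-way u8 x i8 dot product per 32-bit lane: the pre-VNNI emulation chains _mm512_maddubs_epi16 with _mm512_madd_epi16 against a vector of 16-bit ones, while the native path is a single _mm512_dpbusd_epi32. The difference is intermediate precision: maddubs saturates its signed 16-bit pair sums (for example 255*127 + 255*127 = 64770 overflows int16), whereas dpbusd accumulates exactly in 32 bits. A scalar model of the exact (dpbusd) semantics, with a hypothetical name:

    // acc += sum of four unsigned-by-signed byte products, no intermediate saturation.
    static int32_t Madd4Scalar(int32_t acc, const uint8_t u[4], const int8_t w[4])
    {
        for (int k = 0; k < 4; ++k)
            acc += (int32_t)u[k] * (int32_t)w[k];
        return acc;
    }

This is why the library exposes an overflow-compatibility switch (see the Set overloads testing SimdSynetCompatibilityOverflow16i near the end of this file): the emulated path matches VNNI bit for bit only when the data cannot drive the 16-bit pair sums into saturation. Note also that in the accumulation loops below, taps that fall outside the source image contribute the broadcast quantization zero point a.zero rather than zero, keeping the integer sums consistent with the affine dequantization applied later in Save1/Save2.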
- for (size_t ky = 0; ky < kY; ky += p.dilationY) - { - for (size_t kx = 0; kx < kX; kx += p.dilationX) - { - if (sy + ky < p.srcH && sx + kx < p.srcW) - { - size_t offs = (sy + ky) * dY + (sx + kx) * dX, end = offs + srcC; - for (; offs < end; offs += 4) - { - w0 = _mm512_loadu_si512((__m512i*)weight0); - w1 = _mm512_loadu_si512((__m512i*)weight1); - s0 = Set4(src0 + offs); - Madd4(d00, s0, w0); - Madd4(d01, s0, w1); - weight0 += A, weight1 += A; - } - } - else - { - if (a.zero) - { - s0 = _mm512_set1_epi32(a.zero); - for (size_t offs = 0, end = srcC; offs < end; offs += 4) - { - w0 = _mm512_loadu_si512((__m512i*)weight0); - w1 = _mm512_loadu_si512((__m512i*)weight1); - Madd4(d00, s0, w0); - Madd4(d01, s0, w1); - weight0 += A, weight1 += A; - } - } - else - weight0 += dWz, weight1 += dWz; - } - weight0 += dW, weight1 += dW; - } - } - __mmask16 tail = TailMask16(dstC - F); - Save2(dst, buf, d00, d01, norm, bias, params, scale, shift, tail); - } - else - { - d00 = _mm512_setzero_si512(); - for (size_t ky = 0; ky < kY; ky += p.dilationY) - { - for (size_t kx = 0; kx < kX; kx += p.dilationX) - { - if (sy + ky < p.srcH && sx + kx < p.srcW) - { - size_t offs = (sy + ky) * dY + (sx + kx) * dX, end = offs + srcC; - for (; offs < end; offs += 4) - { - w0 = _mm512_loadu_si512((__m512i*)weight0); - s0 = Set4(src0 + offs); - Madd4(d00, s0, w0); - weight0 += A; - } - } - else - { - if (a.zero) - { - s0 = _mm512_set1_epi32(a.zero); - for (size_t offs = 0, end = srcC; offs < end; offs += 4) - { - w0 = _mm512_loadu_si512((__m512i*)weight0); - Madd4(d00, s0, w0); - weight0 += A; - } - } - else - weight0 += dWz; - } - weight0 += dW; - } - } - __mmask16 tail = TailMask16(dstC); - Save1(dst, buf, d00, norm, bias, params, scale, shift, tail); - } - } - - template void ConvolutionNhwcDirect_2x12(const uint8_t* src0, - const ConvParam8i& p, const AlgParam& a, size_t dy, size_t dx, size_t srcC, size_t dstC, const int8_t* weight0, - const __m512i* bias, const __m512i* params, const __m512* scale, const __m512* shift, int32_t* buf, uint8_t* dst) - { - __m512i d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, d60, d61, d70, d71, d80, d81, d90, d91, dA0, dA1, dB0, dB1, s0, w0, w1; - size_t dW = (DivHi(p.srcC, 4) - DivHi(srcC, 4)) * A, dY = p.srcW * p.srcC, dX = p.srcC, dS = p.srcC * p.strideX, dD = p.dstC * a.size, dB = p.dstC, dWz = (DivHi(srcC, 4) * A + dW) * p.kernelX; - const int8_t * weight1 = weight0 + p.kernelY * p.kernelX * DivHi(p.srcC, 4) * A; - const uint8_t* src1 = src0 + 1 * dS; - const uint8_t* src2 = src0 + 2 * dS; - const uint8_t* src3 = src0 + 3 * dS; - const uint8_t* src4 = src0 + 4 * dS; - const uint8_t* src5 = src0 + 5 * dS; - __m512i norm = _mm512_set1_epi32(a.norm); - size_t sy = dy * p.strideY - p.padY; - size_t sx = dx * p.strideX - p.padX; - size_t kY = p.kernelY * p.dilationY; - size_t kX = p.kernelX * p.dilationX; - if (dstC > F) - { - d00 = _mm512_setzero_si512(), d01 = _mm512_setzero_si512(); - d10 = _mm512_setzero_si512(), d11 = _mm512_setzero_si512(); - d20 = _mm512_setzero_si512(), d21 = _mm512_setzero_si512(); - d30 = _mm512_setzero_si512(), d31 = _mm512_setzero_si512(); - d40 = _mm512_setzero_si512(), d41 = _mm512_setzero_si512(); - d50 = _mm512_setzero_si512(), d51 = _mm512_setzero_si512(); - d60 = _mm512_setzero_si512(), d61 = _mm512_setzero_si512(); - d70 = _mm512_setzero_si512(), d71 = _mm512_setzero_si512(); - d80 = _mm512_setzero_si512(), d81 = _mm512_setzero_si512(); - d90 = _mm512_setzero_si512(), d91 = _mm512_setzero_si512(); - dA0 = _mm512_setzero_si512(), dA1 = 
_mm512_setzero_si512(); - dB0 = _mm512_setzero_si512(), dB1 = _mm512_setzero_si512(); - for (size_t ky = 0; ky < kY; ky += p.dilationY) - { - if (sy + ky < p.srcH) - { - for (size_t kx = 0; kx < kX; kx += p.dilationX) - { - assert(sx + kx < p.srcW && sx + kx + 12 <= p.srcW); - size_t offs0 = (sy + ky) * dY + (sx + kx) * dX, end = offs0 + srcC, offs6 = offs0 + 6 * dS; - for (; offs0 < end; offs0 += 4, offs6 += 4) - { - w0 = _mm512_loadu_si512((__m512i*)weight0); - w1 = _mm512_loadu_si512((__m512i*)weight1); - s0 = Set4(src0 + offs0), Madd4(d00, s0, w0), Madd4(d01, s0, w1); - s0 = Set4(src1 + offs0), Madd4(d10, s0, w0), Madd4(d11, s0, w1); - s0 = Set4(src2 + offs0), Madd4(d20, s0, w0), Madd4(d21, s0, w1); - s0 = Set4(src3 + offs0), Madd4(d30, s0, w0), Madd4(d31, s0, w1); - s0 = Set4(src4 + offs0), Madd4(d40, s0, w0), Madd4(d41, s0, w1); - s0 = Set4(src5 + offs0), Madd4(d50, s0, w0), Madd4(d51, s0, w1); - s0 = Set4(src0 + offs6), Madd4(d60, s0, w0), Madd4(d61, s0, w1); - s0 = Set4(src1 + offs6), Madd4(d70, s0, w0), Madd4(d71, s0, w1); - s0 = Set4(src2 + offs6), Madd4(d80, s0, w0), Madd4(d81, s0, w1); - s0 = Set4(src3 + offs6), Madd4(d90, s0, w0), Madd4(d91, s0, w1); - s0 = Set4(src4 + offs6), Madd4(dA0, s0, w0), Madd4(dA1, s0, w1); - s0 = Set4(src5 + offs6), Madd4(dB0, s0, w0), Madd4(dB1, s0, w1); - weight0 += A, weight1 += A; - } - weight0 += dW, weight1 += dW; - } - } - else if (a.zero) - { - s0 = _mm512_set1_epi32(a.zero); - for (size_t kx = 0; kx < kX; kx += p.dilationX) - { - for (size_t offs = 0, end = srcC; offs < end; offs += 4) - { - w0 = _mm512_loadu_si512((__m512i*)weight0); - w1 = _mm512_loadu_si512((__m512i*)weight1); - Madd4(d00, s0, w0), Madd4(d01, s0, w1); - Madd4(d10, s0, w0), Madd4(d11, s0, w1); - Madd4(d20, s0, w0), Madd4(d21, s0, w1); - Madd4(d30, s0, w0), Madd4(d31, s0, w1); - Madd4(d40, s0, w0), Madd4(d41, s0, w1); - Madd4(d50, s0, w0), Madd4(d51, s0, w1); - Madd4(d60, s0, w0), Madd4(d61, s0, w1); - Madd4(d70, s0, w0), Madd4(d71, s0, w1); - Madd4(d80, s0, w0), Madd4(d81, s0, w1); - Madd4(d90, s0, w0), Madd4(d91, s0, w1); - Madd4(dA0, s0, w0), Madd4(dA1, s0, w1); - Madd4(dB0, s0, w0), Madd4(dB1, s0, w1); - weight0 += A, weight1 += A; - } - weight0 += dW, weight1 += dW; - } - } - else - weight0 += dWz, weight1 += dWz; - } - __mmask16 tail = TailMask16(dstC - F); - Save2(dst, buf, d00, d01, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save2(dst, buf, d10, d11, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save2(dst, buf, d20, d21, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save2(dst, buf, d30, d31, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save2(dst, buf, d40, d41, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save2(dst, buf, d50, d51, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save2(dst, buf, d60, d61, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save2(dst, buf, d70, d71, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save2(dst, buf, d80, d81, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save2(dst, buf, d90, d91, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save2(dst, buf, dA0, dA1, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save2(dst, buf, dB0, dB1, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - } - else - { - d00 = _mm512_setzero_si512(); - d10 = _mm512_setzero_si512(); - d20 = _mm512_setzero_si512(); - d30 = _mm512_setzero_si512(); 
- d40 = _mm512_setzero_si512(); - d50 = _mm512_setzero_si512(); - d60 = _mm512_setzero_si512(); - d70 = _mm512_setzero_si512(); - d80 = _mm512_setzero_si512(); - d90 = _mm512_setzero_si512(); - dA0 = _mm512_setzero_si512(); - dB0 = _mm512_setzero_si512(); - for (size_t ky = 0; ky < kY; ky += p.dilationY) - { - if (sy + ky < p.srcH) - { - for (size_t kx = 0; kx < kX; kx += p.dilationX) - { - assert(sx + kx < p.srcW && sx + kx + 12 <= p.srcW); - size_t offs0 = (sy + ky) * dY + (sx + kx) * dX, end = offs0 + srcC, offs6 = offs0 + 6 * dS; - for (; offs0 < end; offs0 += 4, offs6 += 4) - { - w0 = _mm512_loadu_si512((__m512i*)weight0); - s0 = Set4(src0 + offs0), Madd4(d00, s0, w0); - s0 = Set4(src1 + offs0), Madd4(d10, s0, w0); - s0 = Set4(src2 + offs0), Madd4(d20, s0, w0); - s0 = Set4(src3 + offs0), Madd4(d30, s0, w0); - s0 = Set4(src4 + offs0), Madd4(d40, s0, w0); - s0 = Set4(src5 + offs0), Madd4(d50, s0, w0); - s0 = Set4(src0 + offs6), Madd4(d60, s0, w0); - s0 = Set4(src1 + offs6), Madd4(d70, s0, w0); - s0 = Set4(src2 + offs6), Madd4(d80, s0, w0); - s0 = Set4(src3 + offs6), Madd4(d90, s0, w0); - s0 = Set4(src4 + offs6), Madd4(dA0, s0, w0); - s0 = Set4(src5 + offs6), Madd4(dB0, s0, w0); - weight0 += A; - } - weight0 += dW; - } - } - else if (a.zero) - { - s0 = _mm512_set1_epi32(a.zero); - for (size_t kx = 0; kx < kX; kx += p.dilationX) - { - for (size_t offs = 0, end = srcC; offs < end; offs += 4) - { - w0 = _mm512_loadu_si512((__m512i*)weight0); - Madd4(d00, s0, w0); - Madd4(d10, s0, w0); - Madd4(d20, s0, w0); - Madd4(d30, s0, w0); - Madd4(d40, s0, w0); - Madd4(d50, s0, w0); - Madd4(d60, s0, w0); - Madd4(d70, s0, w0); - Madd4(d80, s0, w0); - Madd4(d90, s0, w0); - Madd4(dA0, s0, w0); - Madd4(dB0, s0, w0); - weight0 += A; - } - weight0 += dW; - } - } - else - weight0 += dWz; - } - __mmask16 tail = TailMask16(dstC); - Save1(dst, buf, d00, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save1(dst, buf, d10, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save1(dst, buf, d20, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save1(dst, buf, d30, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save1(dst, buf, d40, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save1(dst, buf, d50, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save1(dst, buf, d60, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save1(dst, buf, d70, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save1(dst, buf, d80, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save1(dst, buf, d90, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save1(dst, buf, dA0, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save1(dst, buf, dB0, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - } - } - - template void ConvolutionNhwcDirect_2xM(const uint8_t* src0, - const ConvParam8i& p, const AlgParam& a, size_t dy, size_t dx, size_t srcC, size_t dstC, const int8_t* weight0, - const __m512i* bias, const __m512i* params, const __m512* scale, const __m512* shift, int32_t* buf, uint8_t* dst) - { - __m512i d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, d60, d61, d70, d71, d80, d81, d90, d91, dA0, dA1, dB0, dB1, s0, w0, w1; - size_t dW = (DivHi(p.srcC, 4) - DivHi(srcC, 4)) * A, dY = p.srcW * p.srcC, dX = p.srcC, dS = p.srcC * p.strideX, dD = p.dstC * a.size, dB = p.dstC, dWz = (DivHi(srcC, 4) * A + dW) * p.kernelX; - const int8_t* weight1 = weight0 + 
p.kernelY * p.kernelX * DivHi(p.srcC, 4) * A; - const uint8_t* src1 = src0 + 1 * dS; - const uint8_t* src2 = src0 + 2 * dS; - const uint8_t* src3 = src0 + 3 * dS; - const uint8_t* src4 = src0 + 4 * dS; - const uint8_t* src5 = src0 + 5 * dS; - __m512i norm = _mm512_set1_epi32(a.norm); - size_t sy = dy * p.strideY - p.padY; - size_t sx = dx * p.strideX - p.padX; - size_t kY = p.kernelY * p.dilationY; - size_t kX = p.kernelX * p.dilationX; - if (dstC > F) - { - if (M > 0x0) d00 = _mm512_setzero_si512(), d01 = _mm512_setzero_si512(); - if (M > 0x1) d10 = _mm512_setzero_si512(), d11 = _mm512_setzero_si512(); - if (M > 0x2) d20 = _mm512_setzero_si512(), d21 = _mm512_setzero_si512(); - if (M > 0x3) d30 = _mm512_setzero_si512(), d31 = _mm512_setzero_si512(); - if (M > 0x4) d40 = _mm512_setzero_si512(), d41 = _mm512_setzero_si512(); - if (M > 0x5) d50 = _mm512_setzero_si512(), d51 = _mm512_setzero_si512(); - if (M > 0x6) d60 = _mm512_setzero_si512(), d61 = _mm512_setzero_si512(); - if (M > 0x7) d70 = _mm512_setzero_si512(), d71 = _mm512_setzero_si512(); - if (M > 0x8) d80 = _mm512_setzero_si512(), d81 = _mm512_setzero_si512(); - if (M > 0x9) d90 = _mm512_setzero_si512(), d91 = _mm512_setzero_si512(); - if (M > 0xA) dA0 = _mm512_setzero_si512(), dA1 = _mm512_setzero_si512(); - if (M > 0xB) dB0 = _mm512_setzero_si512(), dB1 = _mm512_setzero_si512(); - for (size_t ky = 0; ky < kY; ky += p.dilationY) - { - if (sy + ky < p.srcH) - { - for (size_t kx = 0; kx < kX; kx += p.dilationX) - { - assert(sx + kx < p.srcW && sx + kx + M <= p.srcW); - size_t offs0 = (sy + ky) * dY + (sx + kx) * dX, end = offs0 + srcC, offs6 = offs0 + 6 * dS; - for (; offs0 < end; offs0 += 4, offs6 += 4) - { - w0 = _mm512_loadu_si512((__m512i*)weight0); - w1 = _mm512_loadu_si512((__m512i*)weight1); - if (M > 0x0) s0 = Set4(src0 + offs0), Madd4(d00, s0, w0), Madd4(d01, s0, w1); - if (M > 0x1) s0 = Set4(src1 + offs0), Madd4(d10, s0, w0), Madd4(d11, s0, w1); - if (M > 0x2) s0 = Set4(src2 + offs0), Madd4(d20, s0, w0), Madd4(d21, s0, w1); - if (M > 0x3) s0 = Set4(src3 + offs0), Madd4(d30, s0, w0), Madd4(d31, s0, w1); - if (M > 0x4) s0 = Set4(src4 + offs0), Madd4(d40, s0, w0), Madd4(d41, s0, w1); - if (M > 0x5) s0 = Set4(src5 + offs0), Madd4(d50, s0, w0), Madd4(d51, s0, w1); - if (M > 0x6) s0 = Set4(src0 + offs6), Madd4(d60, s0, w0), Madd4(d61, s0, w1); - if (M > 0x7) s0 = Set4(src1 + offs6), Madd4(d70, s0, w0), Madd4(d71, s0, w1); - if (M > 0x8) s0 = Set4(src2 + offs6), Madd4(d80, s0, w0), Madd4(d81, s0, w1); - if (M > 0x9) s0 = Set4(src3 + offs6), Madd4(d90, s0, w0), Madd4(d91, s0, w1); - if (M > 0xA) s0 = Set4(src4 + offs6), Madd4(dA0, s0, w0), Madd4(dA1, s0, w1); - if (M > 0xB) s0 = Set4(src5 + offs6), Madd4(dB0, s0, w0), Madd4(dB1, s0, w1); - weight0 += A, weight1 += A; - } - weight0 += dW, weight1 += dW; - } - } - else if (a.zero) - { - s0 = _mm512_set1_epi32(a.zero); - for (size_t kx = 0; kx < kX; kx += p.dilationX) - { - for (size_t offs = 0, end = srcC; offs < end; offs += 4) - { - w0 = _mm512_loadu_si512((__m512i*)weight0); - w1 = _mm512_loadu_si512((__m512i*)weight1); - if (M > 0x0) Madd4(d00, s0, w0), Madd4(d01, s0, w1); - if (M > 0x1) Madd4(d10, s0, w0), Madd4(d11, s0, w1); - if (M > 0x2) Madd4(d20, s0, w0), Madd4(d21, s0, w1); - if (M > 0x3) Madd4(d30, s0, w0), Madd4(d31, s0, w1); - if (M > 0x4) Madd4(d40, s0, w0), Madd4(d41, s0, w1); - if (M > 0x5) Madd4(d50, s0, w0), Madd4(d51, s0, w1); - if (M > 0x6) Madd4(d60, s0, w0), Madd4(d61, s0, w1); - if (M > 0x7) Madd4(d70, s0, w0), Madd4(d71, s0, w1); - if (M > 0x8) Madd4(d80, 
s0, w0), Madd4(d81, s0, w1); - if (M > 0x9) Madd4(d90, s0, w0), Madd4(d91, s0, w1); - if (M > 0xA) Madd4(dA0, s0, w0), Madd4(dA1, s0, w1); - if (M > 0xB) Madd4(dB0, s0, w0), Madd4(dB1, s0, w1); - weight0 += A, weight1 += A; - } - weight0 += dW, weight1 += dW; - } - } - else - weight0 += dWz, weight1 += dWz; - } - __mmask16 tail = TailMask16(dstC - F); - if (M > 0x0) Save2(dst, buf, d00, d01, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0x1) Save2(dst, buf, d10, d11, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0x2) Save2(dst, buf, d20, d21, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0x3) Save2(dst, buf, d30, d31, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0x4) Save2(dst, buf, d40, d41, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0x5) Save2(dst, buf, d50, d51, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0x6) Save2(dst, buf, d60, d61, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0x7) Save2(dst, buf, d70, d71, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0x8) Save2(dst, buf, d80, d81, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0x9) Save2(dst, buf, d90, d91, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0xA) Save2(dst, buf, dA0, dA1, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0xB) Save2(dst, buf, dB0, dB1, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - } - else - { - if (M > 0x0) d00 = _mm512_setzero_si512(); - if (M > 0x1) d10 = _mm512_setzero_si512(); - if (M > 0x2) d20 = _mm512_setzero_si512(); - if (M > 0x3) d30 = _mm512_setzero_si512(); - if (M > 0x4) d40 = _mm512_setzero_si512(); - if (M > 0x5) d50 = _mm512_setzero_si512(); - if (M > 0x6) d60 = _mm512_setzero_si512(); - if (M > 0x7) d70 = _mm512_setzero_si512(); - if (M > 0x8) d80 = _mm512_setzero_si512(); - if (M > 0x9) d90 = _mm512_setzero_si512(); - if (M > 0xA) dA0 = _mm512_setzero_si512(); - if (M > 0xB) dB0 = _mm512_setzero_si512(); - for (size_t ky = 0; ky < kY; ky += p.dilationY) - { - if (sy + ky < p.srcH) - { - for (size_t kx = 0; kx < kX; kx += p.dilationX) - { - assert(sx + kx < p.srcW && sx + kx + M <= p.srcW); - size_t offs0 = (sy + ky) * dY + (sx + kx) * dX, end = offs0 + srcC, offs6 = offs0 + 6 * dS; - for (; offs0 < end; offs0 += 4, offs6 += 4) - { - w0 = _mm512_loadu_si512((__m512i*)weight0); - if (M > 0x0) s0 = Set4(src0 + offs0), Madd4(d00, s0, w0); - if (M > 0x1) s0 = Set4(src1 + offs0), Madd4(d10, s0, w0); - if (M > 0x2) s0 = Set4(src2 + offs0), Madd4(d20, s0, w0); - if (M > 0x3) s0 = Set4(src3 + offs0), Madd4(d30, s0, w0); - if (M > 0x4) s0 = Set4(src4 + offs0), Madd4(d40, s0, w0); - if (M > 0x5) s0 = Set4(src5 + offs0), Madd4(d50, s0, w0); - if (M > 0x6) s0 = Set4(src0 + offs6), Madd4(d60, s0, w0); - if (M > 0x7) s0 = Set4(src1 + offs6), Madd4(d70, s0, w0); - if (M > 0x8) s0 = Set4(src2 + offs6), Madd4(d80, s0, w0); - if (M > 0x9) s0 = Set4(src3 + offs6), Madd4(d90, s0, w0); - if (M > 0xA) s0 = Set4(src4 + offs6), Madd4(dA0, s0, w0); - if (M > 0xB) s0 = Set4(src5 + offs6), Madd4(dB0, s0, w0); - weight0 += A; - } - weight0 += dW; - } - } - else if (a.zero) - { - s0 = _mm512_set1_epi32(a.zero); - for (size_t kx = 0; kx < kX; kx += p.dilationX) - { - for (size_t offs = 0, end = srcC; offs < end; offs += 4) - { - w0 = _mm512_loadu_si512((__m512i*)weight0); - if (M > 0x0) Madd4(d00, s0, 
w0); - if (M > 0x1) Madd4(d10, s0, w0); - if (M > 0x2) Madd4(d20, s0, w0); - if (M > 0x3) Madd4(d30, s0, w0); - if (M > 0x4) Madd4(d40, s0, w0); - if (M > 0x5) Madd4(d50, s0, w0); - if (M > 0x6) Madd4(d60, s0, w0); - if (M > 0x7) Madd4(d70, s0, w0); - if (M > 0x8) Madd4(d80, s0, w0); - if (M > 0x9) Madd4(d90, s0, w0); - if (M > 0xA) Madd4(dA0, s0, w0); - if (M > 0xB) Madd4(dB0, s0, w0); - weight0 += A; - } - weight0 += dW; - } - } - else - weight0 += dWz; - } - __mmask16 tail = TailMask16(dstC); - if (M > 0x0) Save1(dst, buf, d00, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0x1) Save1(dst, buf, d10, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0x2) Save1(dst, buf, d20, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0x3) Save1(dst, buf, d30, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0x4) Save1(dst, buf, d40, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0x5) Save1(dst, buf, d50, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0x6) Save1(dst, buf, d60, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0x7) Save1(dst, buf, d70, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0x8) Save1(dst, buf, d80, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0x9) Save1(dst, buf, d90, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0xA) Save1(dst, buf, dA0, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0xB) Save1(dst, buf, dB0, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - } - } - - typedef void(*ConvolutionNhwcDirect_2xM_Ptr)(const uint8_t* src0, const ConvParam8i& p, const AlgParam& a, size_t dy, size_t dx, size_t srcC, size_t dstC, - const int8_t* weight0, const __m512i* bias, const __m512i* params, const __m512* scale, const __m512* shift, int32_t* buf, uint8_t* dst); - - template ConvolutionNhwcDirect_2xM_Ptr GetConvolutionNhwcDirect_2xM(size_t M) - { - switch (M) - { - case 0x0: return NULL; - case 0x1: return ConvolutionNhwcDirect_2xM; - case 0x2: return ConvolutionNhwcDirect_2xM; - case 0x3: return ConvolutionNhwcDirect_2xM; - case 0x4: return ConvolutionNhwcDirect_2xM; - case 0x5: return ConvolutionNhwcDirect_2xM; - case 0x6: return ConvolutionNhwcDirect_2xM; - case 0x7: return ConvolutionNhwcDirect_2xM; - case 0x8: return ConvolutionNhwcDirect_2xM; - case 0x9: return ConvolutionNhwcDirect_2xM; - case 0xA: return ConvolutionNhwcDirect_2xM; - case 0xB: return ConvolutionNhwcDirect_2xM; - } - assert(0); - return NULL; - } - - template void ConvolutionNhwcDirect_2(const uint8_t* src, - const ConvParam8i & p, const AlgParam & a, size_t dstC, size_t yBeg, size_t yEnd, size_t srcC, const int8_t* weight, - const int32_t* bias, const int32_t * params, const float * scale, const float* shift, int32_t* buf, uint8_t* dst) - { - size_t noseH = p.NoseH(), noseW = p.NoseW(), bodyH = p.BodyH(), bodyW = p.BodyW(); - size_t n = 12, bodyWn = AlignLoAny(bodyW - noseW, n) + noseW, m = bodyW - bodyWn; - ConvolutionNhwcDirect_2xM_Ptr convolutionNhwcDirect_2xM = GetConvolutionNhwcDirect_2xM(m); - size_t tailH = p.dstH, tailW = p.dstW; - size_t kY = p.kernelY - noseH, kX = p.kernelX - noseW, kH = bodyH + p.kernelY - 1, kW = bodyW + p.kernelX - 1; - __m512i _params[2], _bias[2]; - _params[0] = _mm512_setzero_si512(); - if (type == ::SimdConvolutionActivationRestrictRange) - _params[1] = _mm512_set1_epi32(a.high); - __m512 
_scale[2], _shift[2]; - - for (size_t dc = 0; dc < dstC; dc += DF) - { - size_t dC = Simd::Min(DF, dstC - dc); - _bias[0] = _mm512_loadu_si512((__m512i*)(bias + dc + 0)); - _bias[1] = _mm512_loadu_si512((__m512i*)(bias + dc + F)); - _scale[0] = _mm512_loadu_ps(scale + dc + 0); - _scale[1] = _mm512_loadu_ps(scale + dc + F); - _shift[0] = _mm512_loadu_ps(shift + dc + 0); - _shift[1] = _mm512_loadu_ps(shift + dc + F); - - uint8_t * d = dst + (dc + yBeg * p.dstW * p.dstC) * a.size; - int32_t * b = buf + dc + yBeg * p.dstW * p.dstC; - size_t dy = yBeg; - for (; dy < noseH && dy < yEnd; dy++) - { - size_t dx = 0; - for (; dx < noseW; dx++, b += p.dstC, d += p.dstC * a.size) - ConvolutionNhwcDirect_2x1(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, _scale, _shift, b, d); - for (; dx < bodyWn; dx += n, b += p.dstC * n, d += p.dstC * a.size * n) - ConvolutionNhwcDirect_2x12(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, _scale, _shift, b, d); - for (; dx < bodyW; dx += m, b += p.dstC * m, d += p.dstC * a.size * m) - convolutionNhwcDirect_2xM(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, _scale, _shift, b, d); - for (; dx < tailW; dx++, b += p.dstC, d += p.dstC * a.size) - ConvolutionNhwcDirect_2x1(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, _scale, _shift, b, d); - } - for (; dy < bodyH && dy < yEnd; dy++) - { - size_t dx = 0; - for (; dx < noseW; dx++, b += p.dstC, d += p.dstC * a.size) - ConvolutionNhwcDirect_2x1(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, _scale, _shift, b, d); - for (; dx < bodyWn; dx += n, b += p.dstC * n, d += p.dstC * a.size * n) - ConvolutionNhwcDirect_2x12(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, _scale, _shift, b, d); - for (; dx < bodyW; dx += m, b += p.dstC * m, d += p.dstC * a.size * m) - convolutionNhwcDirect_2xM(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, _scale, _shift, b, d); - for (; dx < tailW; dx++, b += p.dstC, d += p.dstC * a.size) - ConvolutionNhwcDirect_2x1(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, _scale, _shift, b, d); - } - for (; dy < tailH && dy < yEnd; dy++) - { - size_t dx = 0; - for (; dx < noseW; dx++, b += p.dstC, d += p.dstC * a.size) - ConvolutionNhwcDirect_2x1(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, _scale, _shift, b, d); - for (; dx < bodyWn; dx += n, b += p.dstC * n, d += p.dstC * a.size * n) - ConvolutionNhwcDirect_2x12(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, _scale, _shift, b, d); - for (; dx < bodyW; dx += m, b += p.dstC * m, d += p.dstC * a.size * m) - convolutionNhwcDirect_2xM(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, _scale, _shift, b, d); - for (; dx < tailW; dx++, b += p.dstC, d += p.dstC * a.size) - ConvolutionNhwcDirect_2x1(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, _scale, _shift, b, d); - } - weight += p.kernelY * p.kernelX * DivHi(p.srcC, 4) * DA; - } - } - - //--------------------------------------------------------------------- - - template void ConvolutionNhwcDirect1x1_2x12( - const uint8_t* src0, const ConvParam8i& p, const AlgParam& a, size_t srcC, size_t dstC, const int8_t* weight0, - const __m512i* bias, const __m512i* params, const __m512* scale, const __m512* shift, int32_t* buf, uint8_t* dst) - { - __m512i d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, d60, d61, d70, d71, d80, d81, d90, d91, dA0, dA1, dB0, dB1, s0, w0, w1; - size_t dS = p.srcC * p.strideX, dD = p.dstC * a.size, dB = p.dstC; - const int8_t* weight1 = weight0 + DivHi(p.srcC, 4) * A; - const uint8_t* src1 = src0 + 1 * dS; - const 
uint8_t* src2 = src0 + 2 * dS; - const uint8_t* src3 = src0 + 3 * dS; - const uint8_t* src4 = src0 + 4 * dS; - const uint8_t* src5 = src0 + 5 * dS; - __m512i norm = _mm512_set1_epi32(a.norm); - if (dstC > F) - { - d00 = _mm512_setzero_si512(), d01 = _mm512_setzero_si512(); - d10 = _mm512_setzero_si512(), d11 = _mm512_setzero_si512(); - d20 = _mm512_setzero_si512(), d21 = _mm512_setzero_si512(); - d30 = _mm512_setzero_si512(), d31 = _mm512_setzero_si512(); - d40 = _mm512_setzero_si512(), d41 = _mm512_setzero_si512(); - d50 = _mm512_setzero_si512(), d51 = _mm512_setzero_si512(); - d60 = _mm512_setzero_si512(), d61 = _mm512_setzero_si512(); - d70 = _mm512_setzero_si512(), d71 = _mm512_setzero_si512(); - d80 = _mm512_setzero_si512(), d81 = _mm512_setzero_si512(); - d90 = _mm512_setzero_si512(), d91 = _mm512_setzero_si512(); - dA0 = _mm512_setzero_si512(), dA1 = _mm512_setzero_si512(); - dB0 = _mm512_setzero_si512(), dB1 = _mm512_setzero_si512(); - for (size_t offs0 = 0, offs6 = offs0 + 6 * dS; offs0 < srcC; offs0 += 4, offs6 += 4) - { - w0 = _mm512_loadu_si512((__m512i*)weight0); - w1 = _mm512_loadu_si512((__m512i*)weight1); - s0 = Set4(src0 + offs0), Madd4(d00, s0, w0), Madd4(d01, s0, w1); - s0 = Set4(src1 + offs0), Madd4(d10, s0, w0), Madd4(d11, s0, w1); - s0 = Set4(src2 + offs0), Madd4(d20, s0, w0), Madd4(d21, s0, w1); - s0 = Set4(src3 + offs0), Madd4(d30, s0, w0), Madd4(d31, s0, w1); - s0 = Set4(src4 + offs0), Madd4(d40, s0, w0), Madd4(d41, s0, w1); - s0 = Set4(src5 + offs0), Madd4(d50, s0, w0), Madd4(d51, s0, w1); - s0 = Set4(src0 + offs6), Madd4(d60, s0, w0), Madd4(d61, s0, w1); - s0 = Set4(src1 + offs6), Madd4(d70, s0, w0), Madd4(d71, s0, w1); - s0 = Set4(src2 + offs6), Madd4(d80, s0, w0), Madd4(d81, s0, w1); - s0 = Set4(src3 + offs6), Madd4(d90, s0, w0), Madd4(d91, s0, w1); - s0 = Set4(src4 + offs6), Madd4(dA0, s0, w0), Madd4(dA1, s0, w1); - s0 = Set4(src5 + offs6), Madd4(dB0, s0, w0), Madd4(dB1, s0, w1); - weight0 += A, weight1 += A; - } - __mmask16 tail = TailMask16(dstC - F); - Save2(dst, buf, d00, d01, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save2(dst, buf, d10, d11, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save2(dst, buf, d20, d21, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save2(dst, buf, d30, d31, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save2(dst, buf, d40, d41, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save2(dst, buf, d50, d51, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save2(dst, buf, d60, d61, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save2(dst, buf, d70, d71, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save2(dst, buf, d80, d81, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save2(dst, buf, d90, d91, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save2(dst, buf, dA0, dA1, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save2(dst, buf, dB0, dB1, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - } - else - { - d00 = _mm512_setzero_si512(); - d10 = _mm512_setzero_si512(); - d20 = _mm512_setzero_si512(); - d30 = _mm512_setzero_si512(); - d40 = _mm512_setzero_si512(); - d50 = _mm512_setzero_si512(); - d60 = _mm512_setzero_si512(); - d70 = _mm512_setzero_si512(); - d80 = _mm512_setzero_si512(); - d90 = _mm512_setzero_si512(); - dA0 = _mm512_setzero_si512(); - dB0 = _mm512_setzero_si512(); - for (size_t offs0 = 0, offs6 = offs0 + 6 * 
dS; offs0 < srcC; offs0 += 4, offs6 += 4) - { - w0 = _mm512_loadu_si512((__m512i*)weight0); - s0 = Set4(src0 + offs0), Madd4(d00, s0, w0); - s0 = Set4(src1 + offs0), Madd4(d10, s0, w0); - s0 = Set4(src2 + offs0), Madd4(d20, s0, w0); - s0 = Set4(src3 + offs0), Madd4(d30, s0, w0); - s0 = Set4(src4 + offs0), Madd4(d40, s0, w0); - s0 = Set4(src5 + offs0), Madd4(d50, s0, w0); - s0 = Set4(src0 + offs6), Madd4(d60, s0, w0); - s0 = Set4(src1 + offs6), Madd4(d70, s0, w0); - s0 = Set4(src2 + offs6), Madd4(d80, s0, w0); - s0 = Set4(src3 + offs6), Madd4(d90, s0, w0); - s0 = Set4(src4 + offs6), Madd4(dA0, s0, w0); - s0 = Set4(src5 + offs6), Madd4(dB0, s0, w0); - weight0 += A; - } - __mmask16 tail = TailMask16(dstC); - Save1(dst, buf, d00, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save1(dst, buf, d10, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save1(dst, buf, d20, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save1(dst, buf, d30, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save1(dst, buf, d40, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save1(dst, buf, d50, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save1(dst, buf, d60, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save1(dst, buf, d70, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save1(dst, buf, d80, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save1(dst, buf, d90, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save1(dst, buf, dA0, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - Save1(dst, buf, dB0, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - } - } - - template void ConvolutionNhwcDirect1x1_2xM( - const uint8_t* src0, const ConvParam8i& p, const AlgParam& a, size_t srcC, size_t dstC, const int8_t* weight0, - const __m512i* bias, const __m512i* params, const __m512* scale, const __m512* shift, int32_t* buf, uint8_t* dst) - { - __m512i d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, d60, d61, d70, d71, d80, d81, d90, d91, dA0, dA1, dB0, dB1, s0, w0, w1; - size_t dS = p.srcC * p.strideX, dD = p.dstC * a.size, dB = p.dstC; - const int8_t* weight1 = weight0 + DivHi(p.srcC, 4) * A; - const uint8_t* src1 = src0 + 1 * dS; - const uint8_t* src2 = src0 + 2 * dS; - const uint8_t* src3 = src0 + 3 * dS; - const uint8_t* src4 = src0 + 4 * dS; - const uint8_t* src5 = src0 + 5 * dS; - __m512i norm = _mm512_set1_epi32(a.norm); - if (dstC > F) - { - if (M > 0x0) d00 = _mm512_setzero_si512(), d01 = _mm512_setzero_si512(); - if (M > 0x1) d10 = _mm512_setzero_si512(), d11 = _mm512_setzero_si512(); - if (M > 0x2) d20 = _mm512_setzero_si512(), d21 = _mm512_setzero_si512(); - if (M > 0x3) d30 = _mm512_setzero_si512(), d31 = _mm512_setzero_si512(); - if (M > 0x4) d40 = _mm512_setzero_si512(), d41 = _mm512_setzero_si512(); - if (M > 0x5) d50 = _mm512_setzero_si512(), d51 = _mm512_setzero_si512(); - if (M > 0x6) d60 = _mm512_setzero_si512(), d61 = _mm512_setzero_si512(); - if (M > 0x7) d70 = _mm512_setzero_si512(), d71 = _mm512_setzero_si512(); - if (M > 0x8) d80 = _mm512_setzero_si512(), d81 = _mm512_setzero_si512(); - if (M > 0x9) d90 = _mm512_setzero_si512(), d91 = _mm512_setzero_si512(); - if (M > 0xA) dA0 = _mm512_setzero_si512(), dA1 = _mm512_setzero_si512(); - if (M > 0xB) dB0 = _mm512_setzero_si512(), dB1 = _mm512_setzero_si512(); - for (size_t offs0 = 0, offs6 = offs0 + 6 * dS; offs0 < srcC; offs0 += 4, offs6 += 4) - { - 
w0 = _mm512_loadu_si512((__m512i*)weight0); - w1 = _mm512_loadu_si512((__m512i*)weight1); - if (M > 0x0) s0 = Set4(src0 + offs0), Madd4(d00, s0, w0), Madd4(d01, s0, w1); - if (M > 0x1) s0 = Set4(src1 + offs0), Madd4(d10, s0, w0), Madd4(d11, s0, w1); - if (M > 0x2) s0 = Set4(src2 + offs0), Madd4(d20, s0, w0), Madd4(d21, s0, w1); - if (M > 0x3) s0 = Set4(src3 + offs0), Madd4(d30, s0, w0), Madd4(d31, s0, w1); - if (M > 0x4) s0 = Set4(src4 + offs0), Madd4(d40, s0, w0), Madd4(d41, s0, w1); - if (M > 0x5) s0 = Set4(src5 + offs0), Madd4(d50, s0, w0), Madd4(d51, s0, w1); - if (M > 0x6) s0 = Set4(src0 + offs6), Madd4(d60, s0, w0), Madd4(d61, s0, w1); - if (M > 0x7) s0 = Set4(src1 + offs6), Madd4(d70, s0, w0), Madd4(d71, s0, w1); - if (M > 0x8) s0 = Set4(src2 + offs6), Madd4(d80, s0, w0), Madd4(d81, s0, w1); - if (M > 0x9) s0 = Set4(src3 + offs6), Madd4(d90, s0, w0), Madd4(d91, s0, w1); - if (M > 0xA) s0 = Set4(src4 + offs6), Madd4(dA0, s0, w0), Madd4(dA1, s0, w1); - if (M > 0xB) s0 = Set4(src5 + offs6), Madd4(dB0, s0, w0), Madd4(dB1, s0, w1); - weight0 += A, weight1 += A; - } - __mmask16 tail = TailMask16(dstC - F); - if (M > 0x0) Save2(dst, buf, d00, d01, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0x1) Save2(dst, buf, d10, d11, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0x2) Save2(dst, buf, d20, d21, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0x3) Save2(dst, buf, d30, d31, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0x4) Save2(dst, buf, d40, d41, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0x5) Save2(dst, buf, d50, d51, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0x6) Save2(dst, buf, d60, d61, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0x7) Save2(dst, buf, d70, d71, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0x8) Save2(dst, buf, d80, d81, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0x9) Save2(dst, buf, d90, d91, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0xA) Save2(dst, buf, dA0, dA1, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0xB) Save2(dst, buf, dB0, dB1, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - } - else - { - if (M > 0x0) d00 = _mm512_setzero_si512(); - if (M > 0x1) d10 = _mm512_setzero_si512(); - if (M > 0x2) d20 = _mm512_setzero_si512(); - if (M > 0x3) d30 = _mm512_setzero_si512(); - if (M > 0x4) d40 = _mm512_setzero_si512(); - if (M > 0x5) d50 = _mm512_setzero_si512(); - if (M > 0x6) d60 = _mm512_setzero_si512(); - if (M > 0x7) d70 = _mm512_setzero_si512(); - if (M > 0x8) d80 = _mm512_setzero_si512(); - if (M > 0x9) d90 = _mm512_setzero_si512(); - if (M > 0xA) dA0 = _mm512_setzero_si512(); - if (M > 0xB) dB0 = _mm512_setzero_si512(); - for (size_t offs0 = 0, offs6 = offs0 + 6 * dS; offs0 < srcC; offs0 += 4, offs6 += 4) - { - w0 = _mm512_loadu_si512((__m512i*)weight0); - if (M > 0x0) s0 = Set4(src0 + offs0), Madd4(d00, s0, w0); - if (M > 0x1) s0 = Set4(src1 + offs0), Madd4(d10, s0, w0); - if (M > 0x2) s0 = Set4(src2 + offs0), Madd4(d20, s0, w0); - if (M > 0x3) s0 = Set4(src3 + offs0), Madd4(d30, s0, w0); - if (M > 0x4) s0 = Set4(src4 + offs0), Madd4(d40, s0, w0); - if (M > 0x5) s0 = Set4(src5 + offs0), Madd4(d50, s0, w0); - if (M > 0x6) s0 = Set4(src0 + offs6), Madd4(d60, s0, w0); - if (M > 0x7) s0 = Set4(src1 + offs6), Madd4(d70, s0, w0); - if (M > 
0x8) s0 = Set4(src2 + offs6), Madd4(d80, s0, w0); - if (M > 0x9) s0 = Set4(src3 + offs6), Madd4(d90, s0, w0); - if (M > 0xA) s0 = Set4(src4 + offs6), Madd4(dA0, s0, w0); - if (M > 0xB) s0 = Set4(src5 + offs6), Madd4(dB0, s0, w0); - weight0 += A; - } - __mmask16 tail = TailMask16(dstC); - if (M > 0x0) Save1(dst, buf, d00, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0x1) Save1(dst, buf, d10, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0x2) Save1(dst, buf, d20, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0x3) Save1(dst, buf, d30, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0x4) Save1(dst, buf, d40, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0x5) Save1(dst, buf, d50, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0x6) Save1(dst, buf, d60, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0x7) Save1(dst, buf, d70, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0x8) Save1(dst, buf, d80, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0x9) Save1(dst, buf, d90, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0xA) Save1(dst, buf, dA0, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - if (M > 0xB) Save1(dst, buf, dB0, norm, bias, params, scale, shift, tail), dst += dD, buf += dB; - } - } - - typedef void(*ConvolutionNhwcDirect1x1_2xM_Ptr)(const uint8_t* src0, const ConvParam8i& p, const AlgParam& a, size_t srcC, size_t dstC, - const int8_t* weight0, const __m512i* bias, const __m512i* params, const __m512* scale, const __m512* shift, int32_t* buf, uint8_t* dst); - - template ConvolutionNhwcDirect1x1_2xM_Ptr GetConvolutionNhwcDirect1x1_2xM(size_t M) - { - switch (M) - { - case 0x0: return NULL; - case 0x1: return ConvolutionNhwcDirect1x1_2xM; - case 0x2: return ConvolutionNhwcDirect1x1_2xM; - case 0x3: return ConvolutionNhwcDirect1x1_2xM; - case 0x4: return ConvolutionNhwcDirect1x1_2xM; - case 0x5: return ConvolutionNhwcDirect1x1_2xM; - case 0x6: return ConvolutionNhwcDirect1x1_2xM; - case 0x7: return ConvolutionNhwcDirect1x1_2xM; - case 0x8: return ConvolutionNhwcDirect1x1_2xM; - case 0x9: return ConvolutionNhwcDirect1x1_2xM; - case 0xA: return ConvolutionNhwcDirect1x1_2xM; - case 0xB: return ConvolutionNhwcDirect1x1_2xM; - } - assert(0); - return NULL; - } - - template void ConvolutionNhwcDirect1x1_2(const uint8_t* src, - const ConvParam8i& p, const AlgParam& a, size_t dstC, size_t yBeg, size_t yEnd, size_t srcC, const int8_t* weight, - const int32_t* bias, const int32_t* params, const float* scale, const float* shift, int32_t* buf, uint8_t* dst) - { - size_t n1 = (yEnd - yBeg) * p.dstW, n12 = AlignLoAny(n1, 12), m = n1 - n12; - ConvolutionNhwcDirect1x1_2xM_Ptr convolutionNhwcDirect1x1_2xM = GetConvolutionNhwcDirect1x1_2xM(m); - __m512i _params[2], _bias[2]; - _params[0] = _mm512_setzero_si512(); - if (type == ::SimdConvolutionActivationRestrictRange) - _params[1] = _mm512_set1_epi32(a.high); - __m512 _scale[2], _shift[2]; - - for (size_t dc = 0; dc < dstC; dc += DF) - { - size_t dC = Simd::Min(DF, dstC - dc); - _bias[0] = _mm512_loadu_si512((__m512i*)(bias + dc + 0)); - _bias[1] = _mm512_loadu_si512((__m512i*)(bias + dc + F)); - _scale[0] = _mm512_loadu_ps(scale + dc + 0); - _scale[1] = _mm512_loadu_ps(scale + dc + F); - _shift[0] = _mm512_loadu_ps(shift + dc + 0); - _shift[1] = _mm512_loadu_ps(shift + dc + F); - 
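Both GetConvolutionNhwcDirect_2xM and the 1x1 variant above use the same tail-dispatch idiom: the leftover m = n1 - AlignLoAny(n1, 12) pixels are handled by a kernel instantiated at compile time for exactly that m, selected once per call, so the hot loops stay fully unrolled and the twelve row accumulators times two channel blocks, plus s0, w0 and w1, stay within the 32 zmm registers. A minimal sketch of the idiom, with hypothetical names:

    #include <cassert>
    #include <cstddef>

    // One fully unrolled body per possible tail size M.
    template<size_t M> static void KernelM() { /* unrolled work for M rows */ }

    typedef void (*KernelPtr)();

    // Picked once per stripe, so the per-pixel loops carry no runtime branch on M.
    static KernelPtr GetKernelM(size_t m)
    {
        switch (m)
        {
        case 0: return NULL; // no tail to process
        case 1: return KernelM<1>;
        case 2: return KernelM<2>;
        /* ... one case per supported tail, as in the switches above ... */
        }
        assert(0);
        return NULL;
    }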
const uint8_t* s = src + yBeg * p.srcW * p.srcC; - uint8_t* d = dst + (dc + yBeg * p.dstW * p.dstC) * a.size; - int32_t* b = buf + dc + yBeg * p.dstW * p.dstC; - size_t i = 0; - for (; i < n12; i += 12, s += p.srcC * 12, b += p.dstC * 12, d += p.dstC * a.size * 12) - ConvolutionNhwcDirect1x1_2x12(s, p, a, srcC, dC, weight, _bias, _params, _scale, _shift, b, d); - for (; i < n1; i += m, s += p.srcC * m, b += p.dstC * m, d += p.dstC * a.size * m) - convolutionNhwcDirect1x1_2xM(s, p, a, srcC, dC, weight, _bias, _params, _scale, _shift, b, d); - weight += DivHi(p.srcC, 4) * DA; - } - } - - //--------------------------------------------------------------------- - - template void Set(const ConvParam8i& p, const AlgParam & a, ConvolutionPtr * d) - { - if (p.Is1x1()) - { - switch (a.microD) - { - case 2 * F: d[term] = ConvolutionNhwcDirect1x1_2; break; - default: - assert(0); - } - } - else - { - switch (a.microD) - { - case 2 * F: d[term] = ConvolutionNhwcDirect_2; break; - default: - assert(0); - } - } - } - - template void Set(const ConvParam8i& p, const AlgParam& a, ConvolutionPtr* d) - { - if (p.compatibility & SimdSynetCompatibilityNoFma) - Set(p, a, d); - else - Set(p, a, d); - } - - template void Set(const ConvParam8i& p, const AlgParam& a, ConvolutionPtr* d) - { - if (p.compatibility & SimdSynetCompatibilityOverflow16i) - Set(p, a, d); - else - Set(p, a, d); - } - - template void Set(const ConvParam8i& p, const AlgParam& a, ConvolutionPtr* d) - { - Set(p, a, d); - Set(p, a, d); - Set(p, a, d); - Set(p, a, d); - Set(p, a, d); - Set(p, a, d); - } - - static void Set(const ConvParam8i& p, const AlgParam& a, ConvolutionPtr * d) - { - switch (p.activation) - { - case SimdConvolutionActivationIdentity: Set(p, a, d); break; - case SimdConvolutionActivationRelu: Set(p, a, d); break; - case SimdConvolutionActivationRestrictRange: Set(p, a, d); break; - default: assert(0); - } - } - - SynetConvolution8iNhwcDirect::SynetConvolution8iNhwcDirect(const ConvParam8i& p) - : Avx512bw::SynetConvolution8iNhwcDirect(p) - { - SetAlgParam(F, 2 * F, Base::AlgCacheL1(), Base::AlgCacheL2(), Base::AlgCacheL3()); - Set(p, _alg, _convolutions); - _convertSrc = Avx512bw::SynetConvert32fTo8u; - } - - //--------------------------------------------------------------------- - - void * SynetConvolution8iInit(size_t batch, const SimdConvolutionParameters * conv, SimdSynetCompatibilityType compatibility) - { - ConvParam8i param(batch, conv, compatibility); - if (!param.Valid()) - return NULL; - else if (SynetConvolution8iNhwcDirect::Preferable(param)) - return new SynetConvolution8iNhwcDirect(param); - else - return new Base::SynetConvolution8iGemmNN(param); - } - } -#endif -} diff --git a/src/3rd/Simd/Simd/SimdBase.h b/src/3rd/Simd/Simd/SimdBase.h deleted file mode 100644 index 2156e3e0..00000000 --- a/src/3rd/Simd/Simd/SimdBase.h +++ /dev/null @@ -1,713 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar, -* 2014-2016 Antonenka Mikhail, -* 2019-2019 Facundo Galan. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#ifndef __SimdBase_h__ -#define __SimdBase_h__ - -#include "Simd/SimdDefs.h" - -namespace Simd -{ - namespace Base - { - size_t GetThreadNumber(); - - void SetThreadNumber(size_t threadNumber); - - uint32_t Crc32c(const void * src, size_t size); - - void AbsDifference(const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride, uint8_t *c, size_t cStride, - size_t width, size_t height); - - void AbsDifferenceSum(const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride, - size_t width, size_t height, uint64_t * sum); - - void AbsDifferenceSumMasked(const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride, - const uint8_t *mask, size_t maskStride, uint8_t index, size_t width, size_t height, uint64_t * sum); - - void AbsDifferenceSums3x3(const uint8_t *current, size_t currentStride, const uint8_t * background, size_t backgroundStride, - size_t width, size_t height, uint64_t * sums); - - void AbsDifferenceSums3x3Masked(const uint8_t *current, size_t currentStride, const uint8_t *background, size_t backgroundStride, - const uint8_t *mask, size_t maskStride, uint8_t index, size_t width, size_t height, uint64_t * sums); - - void AbsGradientSaturatedSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride); - - void AddFeatureDifference(const uint8_t * value, size_t valueStride, size_t width, size_t height, - const uint8_t * lo, size_t loStride, const uint8_t * hi, size_t hiStride, - uint16_t weight, uint8_t * difference, size_t differenceStride); - - void AlphaBlending(const uint8_t *src, size_t srcStride, size_t width, size_t height, size_t channelCount, - const uint8_t *alpha, size_t alphaStride, uint8_t *dst, size_t dstStride); - - void AlphaFilling(uint8_t * dst, size_t dstStride, size_t width, size_t height, const uint8_t * channel, size_t channelCount, const uint8_t * alpha, size_t alphaStride); - - void BackgroundGrowRangeSlow(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * lo, size_t loStride, uint8_t * hi, size_t hiStride); - - void BackgroundGrowRangeFast(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * lo, size_t loStride, uint8_t * hi, size_t hiStride); - - void BackgroundIncrementCount(const uint8_t * value, size_t valueStride, size_t width, size_t height, - const uint8_t * loValue, size_t loValueStride, const uint8_t * hiValue, size_t hiValueStride, - uint8_t * loCount, size_t 
loCountStride, uint8_t * hiCount, size_t hiCountStride); - - void BackgroundAdjustRange(uint8_t * loCount, size_t loCountStride, size_t width, size_t height, - uint8_t * loValue, size_t loValueStride, uint8_t * hiCount, size_t hiCountStride, - uint8_t * hiValue, size_t hiValueStride, uint8_t threshold); - - void BackgroundAdjustRangeMasked(uint8_t * loCount, size_t loCountStride, size_t width, size_t height, - uint8_t * loValue, size_t loValueStride, uint8_t * hiCount, size_t hiCountStride, - uint8_t * hiValue, size_t hiValueStride, uint8_t threshold, const uint8_t * mask, size_t maskStride); - - void BackgroundShiftRange(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * lo, size_t loStride, uint8_t * hi, size_t hiStride); - - void BackgroundShiftRangeMasked(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * lo, size_t loStride, uint8_t * hi, size_t hiStride, const uint8_t * mask, size_t maskStride); - - void BackgroundInitMask(const uint8_t * src, size_t srcStride, size_t width, size_t height, - uint8_t index, uint8_t value, uint8_t * dst, size_t dstStride); - - void BayerToBgr(const uint8_t * bayer, size_t width, size_t height, size_t bayerStride, SimdPixelFormatType bayerFormat, uint8_t * bgr, size_t bgrStride); - - void BayerToBgra(const uint8_t * bayer, size_t width, size_t height, size_t bayerStride, SimdPixelFormatType bayerFormat, uint8_t * bgra, size_t bgraStride, uint8_t alpha); - - void BgraToBayer(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * bayer, size_t bayerStride, SimdPixelFormatType bayerFormat); - - void BgraToBgr(const uint8_t * bgra, size_t size, uint8_t * bgr, bool lastRow); - - void BgraToBgr(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * bgr, size_t bgrStride); - - void BgraToGray(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * gray, size_t grayStride); - - void BgraToRgb(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgb, size_t rgbStride); - - void BgraToYuv420p(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * y, size_t yStride, uint8_t * u, size_t uStride, uint8_t * v, size_t vStride); - - void BgraToYuv422p(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * y, size_t yStride, uint8_t * u, size_t uStride, uint8_t * v, size_t vStride); - - void BgraToYuv444p(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * y, size_t yStride, uint8_t * u, size_t uStride, uint8_t * v, size_t vStride); - - void BgraToYuva420p(const uint8_t * bgra, size_t bgraStride, size_t width, size_t height, - uint8_t * y, size_t yStride, uint8_t * u, size_t uStride, uint8_t * v, size_t vStride, uint8_t * a, size_t aStride); - - void BgrToBayer(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bayer, size_t bayerStride, SimdPixelFormatType bayerFormat); - - void BgrToBgra(const uint8_t * bgr, size_t size, uint8_t * bgra, bool fillAlpha, bool lastRow, uint8_t alpha); - - void BgrToBgra(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha); - - void Bgr48pToBgra32(const uint8_t * blue, size_t blueStride, size_t width, size_t height, - const uint8_t * green, size_t greenStride, const uint8_t * red, size_t redStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha); - - void BgrToGray(const uint8_t * bgr, size_t width, size_t 
height, size_t bgrStride, uint8_t * gray, size_t grayStride); - - void BgrToHsl(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * hsl, size_t hslStride); - - void BgrToHsv(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * hsv, size_t hsvStride); - - void BgrToRgb(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * rgb, size_t rgbStride); - - void BgrToYuv420p(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * y, size_t yStride, uint8_t * u, size_t uStride, uint8_t * v, size_t vStride); - - void BgrToYuv422p(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * y, size_t yStride, uint8_t * u, size_t uStride, uint8_t * v, size_t vStride); - - void BgrToYuv444p(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * y, size_t yStride, uint8_t * u, size_t uStride, uint8_t * v, size_t vStride); - - void Binarization(const uint8_t * src, size_t srcStride, size_t width, size_t height, - uint8_t value, uint8_t positive, uint8_t negative, uint8_t * dst, size_t dstStride, SimdCompareType compareType); - - void AveragingBinarization(const uint8_t * src, size_t srcStride, size_t width, size_t height, - uint8_t value, size_t neighborhood, uint8_t threshold, uint8_t positive, uint8_t negative, - uint8_t * dst, size_t dstStride, SimdCompareType compareType); - - void ConditionalCount8u(const uint8_t * src, size_t stride, size_t width, size_t height, - uint8_t value, SimdCompareType compareType, uint32_t * count); - - void ConditionalCount16i(const uint8_t * src, size_t stride, size_t width, size_t height, - int16_t value, SimdCompareType compareType, uint32_t * count); - - void ConditionalSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * mask, size_t maskStride, uint8_t value, SimdCompareType compareType, uint64_t * sum); - - void ConditionalSquareSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * mask, size_t maskStride, uint8_t value, SimdCompareType compareType, uint64_t * sum); - - void ConditionalSquareGradientSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * mask, size_t maskStride, uint8_t value, SimdCompareType compareType, uint64_t * sum); - - void ConditionalFill(const uint8_t * src, size_t srcStride, size_t width, size_t height, - uint8_t threshold, SimdCompareType compareType, uint8_t value, uint8_t * dst, size_t dstStride); - - void Copy(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t pixelSize, uint8_t * dst, size_t dstStride); - - void CopyFrame(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t pixelSize, - size_t frameLeft, size_t frameTop, size_t frameRight, size_t frameBottom, uint8_t * dst, size_t dstStride); - - void DeinterleaveUv(const uint8_t * uv, size_t uvStride, size_t width, size_t height, uint8_t * u, size_t uStride, uint8_t * v, size_t vStride); - - void DeinterleaveBgr(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, - uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride); - - void DeinterleaveBgra(const uint8_t * bgra, size_t bgraStride, size_t width, size_t height, - uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride, uint8_t * a, size_t aStride); - - void * DetectionLoadStringXml(char * xml, const char * path = NULL); - - void * DetectionLoadA(const char * path); - - 
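SimdBase.h declares the portable scalar reference for every exported primitive; the SIMD namespaces seen earlier (Avx, Avx512bw, Avx512vnni and the rest) re-declare the same signatures so that callers can be routed to the best implementation at run time, as SynetConvolution8iInit does above. A usage sketch for one of the declarations above, assuming this header is on the include path (the function name ExampleSad is hypothetical):

    #include "Simd/SimdBase.h"
    #include <vector>

    // Sum of absolute differences between two 8-bit gray images of equal size.
    uint64_t ExampleSad()
    {
        const size_t width = 640, height = 480, stride = width;
        std::vector<uint8_t> a(stride * height, 0), b(stride * height, 1);
        uint64_t sum = 0;
        Simd::Base::AbsDifferenceSum(a.data(), stride, b.data(), stride, width, height, &sum);
        return sum; // every pixel differs by 1, so sum == width * height
    }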
void DetectionInfo(const void * data, size_t * width, size_t * height, SimdDetectionInfoFlags * flags); - - void * DetectionInit(const void * data, uint8_t * sum, size_t sumStride, size_t width, size_t height, - uint8_t * sqsum, size_t sqsumStride, uint8_t * tilted, size_t tiltedStride, int throughColumn, int int16); - - void DetectionPrepare(void * hid); - - void DetectionHaarDetect32fp(const void * hid, const uint8_t * mask, size_t maskStride, - ptrdiff_t left, ptrdiff_t top, ptrdiff_t right, ptrdiff_t bottom, uint8_t * dst, size_t dstStride); - - void DetectionHaarDetect32fi(const void * hid, const uint8_t * mask, size_t maskStride, - ptrdiff_t left, ptrdiff_t top, ptrdiff_t right, ptrdiff_t bottom, uint8_t * dst, size_t dstStride); - - void DetectionLbpDetect32fp(const void * hid, const uint8_t * mask, size_t maskStride, - ptrdiff_t left, ptrdiff_t top, ptrdiff_t right, ptrdiff_t bottom, uint8_t * dst, size_t dstStride); - - void DetectionLbpDetect32fi(const void * hid, const uint8_t * mask, size_t maskStride, - ptrdiff_t left, ptrdiff_t top, ptrdiff_t right, ptrdiff_t bottom, uint8_t * dst, size_t dstStride); - - void DetectionLbpDetect16ip(const void * hid, const uint8_t * mask, size_t maskStride, - ptrdiff_t left, ptrdiff_t top, ptrdiff_t right, ptrdiff_t bottom, uint8_t * dst, size_t dstStride); - - void DetectionLbpDetect16ii(const void * hid, const uint8_t * mask, size_t maskStride, - ptrdiff_t left, ptrdiff_t top, ptrdiff_t right, ptrdiff_t bottom, uint8_t * dst, size_t dstStride); - - void EdgeBackgroundGrowRangeSlow(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * background, size_t backgroundStride); - - void EdgeBackgroundGrowRangeFast(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * background, size_t backgroundStride); - - void EdgeBackgroundIncrementCount(const uint8_t * value, size_t valueStride, size_t width, size_t height, - const uint8_t * backgroundValue, size_t backgroundValueStride, uint8_t * backgroundCount, size_t backgroundCountStride); - - void EdgeBackgroundAdjustRange(uint8_t * backgroundCount, size_t backgroundCountStride, size_t width, size_t height, - uint8_t * backgroundValue, size_t backgroundValueStride, uint8_t threshold); - - void EdgeBackgroundAdjustRangeMasked(uint8_t * backgroundCount, size_t backgroundCountStride, size_t width, size_t height, - uint8_t * backgroundValue, size_t backgroundValueStride, uint8_t threshold, const uint8_t * mask, size_t maskStride); - - void EdgeBackgroundShiftRange(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * background, size_t backgroundStride); - - void EdgeBackgroundShiftRangeMasked(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * background, size_t backgroundStride, const uint8_t * mask, size_t maskStride); - - void Fill(uint8_t * dst, size_t stride, size_t width, size_t height, size_t pixelSize, uint8_t value); - - void FillFrame(uint8_t * dst, size_t stride, size_t width, size_t height, size_t pixelSize, - size_t frameLeft, size_t frameTop, size_t frameRight, size_t frameBottom, uint8_t value); - - void FillBgr(uint8_t * dst, size_t stride, size_t width, size_t height, uint8_t blue, uint8_t green, uint8_t red); - - void FillBgra(uint8_t * dst, size_t stride, size_t width, size_t height, uint8_t blue, uint8_t green, uint8_t red, uint8_t alpha); - - void FillPixel(uint8_t * dst, size_t stride, size_t width, size_t height, const uint8_t * pixel, size_t pixelSize); - - void 
Fill32f(float * dst, size_t size, const float * value); - - void Float32ToFloat16(const float * src, size_t size, uint16_t * dst); - - void Float16ToFloat32(const uint16_t * src, size_t size, float * dst); - - void SquaredDifferenceSum16f(const uint16_t * a, const uint16_t * b, size_t size, float * sum); - - void CosineDistance16f(const uint16_t * a, const uint16_t * b, size_t size, float * distance); - - void CosineDistancesMxNa16f(size_t M, size_t N, size_t K, const uint16_t * const * A, const uint16_t * const * B, float * distances); - - void Float32ToUint8(const float * src, size_t size, const float * lower, const float * upper, uint8_t * dst); - - void Uint8ToFloat32(const uint8_t * src, size_t size, const float * lower, const float * upper, float * dst); - - void CosineDistance32f(const float * a, const float * b, size_t size, float * distance); - - void GaussianBlur3x3(const uint8_t * src, size_t srcStride, size_t width, size_t height, - size_t channelCount, uint8_t * dst, size_t dstStride); - - void Gemm32fNN(size_t M, size_t N, size_t K, const float * alpha, const float * A, size_t lda, const float * B, size_t ldb, const float * beta, float * C, size_t ldc); - - void Gemm32fNT(size_t M, size_t N, size_t K, const float * alpha, const float * A, size_t lda, const float * B, size_t ldb, const float * beta, float * C, size_t ldc); - - void GrayToBgr(const uint8_t *gray, size_t width, size_t height, size_t grayStride, uint8_t *bgr, size_t bgrStride); - - void GrayToBgra(const uint8_t *gray, size_t width, size_t height, size_t grayStride, uint8_t *bgra, size_t bgraStride, uint8_t alpha); - - void AbsSecondDerivativeHistogram(const uint8_t *src, size_t width, size_t height, size_t stride, - size_t step, size_t indent, uint32_t * histogram); - - void Histogram(const uint8_t *src, size_t width, size_t height, size_t stride, uint32_t * histogram); - - void HistogramMasked(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * mask, size_t maskStride, uint8_t index, uint32_t * histogram); - - void HistogramConditional(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * mask, size_t maskStride, uint8_t value, SimdCompareType compareType, uint32_t * histogram); - - void NormalizedColors(const uint32_t * histogram, uint8_t * colors); - - void ChangeColors(const uint8_t * src, size_t srcStride, size_t width, size_t height, const uint8_t * colors, uint8_t * dst, size_t dstStride); - - void NormalizeHistogram(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride); - - void AddRowToHistograms(int * indexes, float * values, size_t row, size_t width, size_t height, - size_t cellX, size_t cellY, size_t quantization, float * histograms); - - void HogDirectionHistograms(const uint8_t * src, size_t stride, size_t width, size_t height, - size_t cellX, size_t cellY, size_t quantization, float * histograms); - - void HogExtractFeatures(const uint8_t * src, size_t stride, size_t width, size_t height, float * features); - - void HogDeinterleave(const float * src, size_t srcStride, size_t width, size_t height, size_t count, float ** dst, size_t dstStride); - - void HogFilterSeparable(const float * src, size_t srcStride, size_t width, size_t height, const float * rowFilter, size_t rowSize, const float * colFilter, size_t colSize, float * dst, size_t dstStride, int add); - - void HogLiteExtractFeatures(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t cell, float * features, 
size_t featuresStride); - - void HogLiteFilterFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride); - - void HogLiteResizeFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, float * dst, size_t dstStride, size_t dstWidth, size_t dstHeight); - - void HogLiteCompressFeatures(const float * src, size_t srcStride, size_t width, size_t height, const float * pca, float * dst, size_t dstStride); - - void HogLiteFilterSeparable(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * hFilter, size_t hSize, const float * vFilter, size_t vSize, float * dst, size_t dstStride, int add); - - void HogLiteFindMax7x7(const float * a, size_t aStride, const float * b, size_t bStride, size_t height, float * value, size_t * col, size_t * row); - - void HogLiteCreateMask(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, const float * threshold, size_t scale, size_t size, uint32_t * dst, size_t dstStride); - - void Int16ToGray(const uint8_t * src, size_t width, size_t height, size_t srcStride, uint8_t * dst, size_t dstStride); - - void Integral(const uint8_t * src, size_t srcStride, size_t width, size_t height, - uint8_t * sum, size_t sumStride, uint8_t * sqsum, size_t sqsumStride, uint8_t * tilted, size_t tiltedStride, - SimdPixelFormatType sumFormat, SimdPixelFormatType sqsumFormat); - - void InterferenceIncrement(uint8_t * statistic, size_t stride, size_t width, size_t height, uint8_t increment, int16_t saturation); - - void InterferenceIncrementMasked(uint8_t * statistic, size_t statisticStride, size_t width, size_t height, - uint8_t increment, int16_t saturation, const uint8_t * mask, size_t maskStride, uint8_t index); - - void InterferenceDecrement(uint8_t * statistic, size_t stride, size_t width, size_t height, uint8_t decrement, int16_t saturation); - - void InterferenceDecrementMasked(uint8_t * statistic, size_t statisticStride, size_t width, size_t height, - uint8_t decrement, int16_t saturation, const uint8_t * mask, size_t maskStride, uint8_t index); - - void InterleaveUv(const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, size_t width, size_t height, uint8_t * uv, size_t uvStride); - - void InterleaveBgr(const uint8_t * b, size_t bStride, const uint8_t * g, size_t gStride, const uint8_t * r, size_t rStride, - size_t width, size_t height, uint8_t * bgr, size_t bgrStride); - - void InterleaveBgra(const uint8_t * b, size_t bStride, const uint8_t * g, size_t gStride, const uint8_t * r, size_t rStride, const uint8_t * a, size_t aStride, - size_t width, size_t height, uint8_t * bgra, size_t bgraStride); - - void Laplace(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride); - - void LaplaceAbs(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride); - - void LaplaceAbsSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum); - - void LbpEstimate(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride); - - void MeanFilter3x3(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t channelCount, uint8_t * dst, size_t dstStride); - - void MedianFilterRhomb3x3(const uint8_t * src, size_t srcStride, size_t width, 
size_t height, - size_t channelCount, uint8_t * dst, size_t dstStride); - - void MedianFilterRhomb5x5(const uint8_t * src, size_t srcStride, size_t width, size_t height, - size_t channelCount, uint8_t * dst, size_t dstStride); - - void MedianFilterSquare3x3(const uint8_t * src, size_t srcStride, size_t width, size_t height, - size_t channelCount, uint8_t * dst, size_t dstStride); - - void MedianFilterSquare5x5(const uint8_t * src, size_t srcStride, size_t width, size_t height, - size_t channelCount, uint8_t * dst, size_t dstStride); - - void NeuralConvert(const uint8_t * src, size_t srcStride, size_t width, size_t height, float * dst, size_t dstStride, int inversion); - - void NeuralProductSum(const float * a, const float * b, size_t size, float * sum); - - void NeuralAddVectorMultipliedByValue(const float * src, size_t size, const float * value, float * dst); - - void NeuralAddVector(const float * src, size_t size, float * dst); - - void NeuralAddValue(const float * value, float * dst, size_t size); - - void NeuralRoughSigmoid(const float * src, size_t size, const float * slope, float * dst); - - void NeuralRoughSigmoid2(const float * src, size_t size, const float * slope, float * dst); - - void NeuralDerivativeSigmoid(const float * src, size_t size, const float * slope, float * dst); - - void NeuralRoughTanh(const float * src, size_t size, const float * slope, float * dst); - - void NeuralDerivativeTanh(const float * src, size_t size, const float * slope, float * dst); - - void NeuralDerivativeRelu(const float * src, size_t size, const float * slope, float * dst); - - void NeuralPow(const float * src, size_t size, const float * exponent, float * dst); - - void NeuralUpdateWeights(const float * x, size_t size, const float * a, const float * b, float * d, float * w); - - void NeuralAdaptiveGradientUpdate(const float * delta, size_t size, size_t batch, const float * alpha, const float * epsilon, float * gradient, float * weight); - - void NeuralAddConvolution2x2Forward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride); - - void NeuralAddConvolution3x3Forward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride); - - void NeuralAddConvolution4x4Forward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride); - - void NeuralAddConvolution5x5Forward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride); - - void NeuralAddConvolution2x2Backward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride); - - void NeuralAddConvolution3x3Backward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride); - - void NeuralAddConvolution4x4Backward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride); - - void NeuralAddConvolution5x5Backward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride); - - void NeuralAddConvolution2x2Sum(const float * src, size_t srcStride, const float * dst, size_t dstStride, size_t width, size_t height, float * sums); - - void NeuralAddConvolution3x3Sum(const float * src, size_t srcStride, const float * dst, size_t dstStride, size_t width, size_t height, float 
* sums); - - void NeuralAddConvolution4x4Sum(const float * src, size_t srcStride, const float * dst, size_t dstStride, size_t width, size_t height, float * sums); - - void NeuralAddConvolution5x5Sum(const float * src, size_t srcStride, const float * dst, size_t dstStride, size_t width, size_t height, float * sums); - - void NeuralPooling1x1Max3x3(const float * src, size_t srcStride, size_t width, size_t height, float * dst, size_t dstStride); - - void NeuralPooling2x2Max2x2(const float * src, size_t srcStride, size_t width, size_t height, float * dst, size_t dstStride); - - void NeuralPooling2x2Max3x3(const float * src, size_t srcStride, size_t width, size_t height, float * dst, size_t dstStride); - - void NeuralConvolutionForward(const float * src, size_t srcWidth, size_t srcHeight, size_t srcDepth, const float * weight, - size_t kernelX, size_t kernelY, size_t padX, size_t padY, size_t strideX, size_t strideY, size_t dilationX, size_t dilationY, - void * buffer, size_t * size, float * dst, size_t dstWidth, size_t dstHeight, size_t dstDepth, int add); - - void OperationBinary8u(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, - size_t width, size_t height, size_t channelCount, uint8_t * dst, size_t dstStride, SimdOperationBinary8uType type); - - void OperationBinary16i(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, - size_t width, size_t height, uint8_t * dst, size_t dstStride, SimdOperationBinary16iType type); - - void VectorProduct(const uint8_t * vertical, const uint8_t * horizontal, uint8_t * dst, size_t stride, size_t width, size_t height); - - void ReduceColor2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount); - - void ReduceGray2x2(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride); - - void ReduceGray3x3(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride, int compensation); - - void ReduceGray4x4(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride); - - void ReduceGray5x5(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride, int compensation); - - void Reorder16bit(const uint8_t * src, size_t size, uint8_t * dst); - - void Reorder32bit(const uint8_t * src, size_t size, uint8_t * dst); - - void Reorder64bit(const uint8_t * src, size_t size, uint8_t * dst); - - void EstimateAlphaIndex(size_t srcSize, size_t dstSize, int * indexes, int * alphas, size_t channelCount); - - void ResizeBilinear(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount); - - void RgbToBgra(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha); - - void RgbToGray(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* gray, size_t grayStride); - - void SegmentationChangeIndex(uint8_t * mask, size_t stride, size_t width, size_t height, uint8_t oldIndex, uint8_t newIndex); - - void SegmentationFillSingleHoles(uint8_t * mask, size_t stride, size_t width, size_t height, uint8_t index); - - void 
SegmentationPropagate2x2(const uint8_t * parent, size_t parentStride, size_t width, size_t height, - uint8_t * child, size_t childStride, const uint8_t * difference, size_t differenceStride, - uint8_t currentIndex, uint8_t invalidIndex, uint8_t emptyIndex, uint8_t differenceThreshold); - - void SegmentationShrinkRegion(const uint8_t * mask, size_t stride, size_t width, size_t height, uint8_t index, - ptrdiff_t * left, ptrdiff_t * top, ptrdiff_t * right, ptrdiff_t * bottom); - - void CommonShiftAction(const uint8_t * & src, size_t srcStride, size_t & width, size_t & height, size_t channelCount, - const uint8_t * bkg, size_t bkgStride, const double * shiftX, const double * shiftY, - size_t cropLeft, size_t cropTop, size_t cropRight, size_t cropBottom, uint8_t * & dst, size_t dstStride, int & fDx, int & fDy); - - void ShiftBilinear(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t channelCount, - int fDx, int fDy, uint8_t * dst, size_t dstStride); - - void ShiftBilinear(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t channelCount, - const uint8_t * bkg, size_t bkgStride, const double * shiftX, const double * shiftY, - size_t cropLeft, size_t cropTop, size_t cropRight, size_t cropBottom, uint8_t * dst, size_t dstStride); - - void SobelDx(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride); - - void SobelDxAbs(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride); - - void SobelDxAbsSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum); - - void SobelDy(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride); - - void SobelDyAbs(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride); - - void SobelDyAbsSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum); - - void ContourMetrics(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride); - - void ContourMetricsMasked(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * mask, size_t maskStride, uint8_t indexMin, uint8_t * dst, size_t dstStride); - - void ContourAnchors(const uint8_t * src, size_t srcStride, size_t width, size_t height, - size_t step, int16_t threshold, uint8_t * dst, size_t dstStride); - - void SquaredDifferenceSum(const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride, - size_t width, size_t height, uint64_t * sum); - - void SquaredDifferenceSumMasked(const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride, - const uint8_t *mask, size_t maskStride, uint8_t index, size_t width, size_t height, uint64_t * sum); - - void SquaredDifferenceSum32f(const float * a, const float * b, size_t size, float * sum); - - void SquaredDifferenceKahanSum32f(const float * a, const float * b, size_t size, float * sum); - - void GetStatistic(const uint8_t * src, size_t stride, size_t width, size_t height, - uint8_t * min, uint8_t * max, uint8_t * average); - - void GetMoments(const uint8_t * mask, size_t stride, size_t width, size_t height, uint8_t index, - uint64_t * area, uint64_t * x, uint64_t * y, uint64_t * xx, uint64_t * xy, uint64_t * yy); - - void GetObjectMoments(const uint8_t * src, size_t srcStride, size_t width, size_t height, const uint8_t * mask, size_t maskStride, uint8_t index, - uint64_t * n, uint64_t * s, uint64_t * sx, uint64_t * sy, 
uint64_t * sxx, uint64_t * sxy, uint64_t * syy); - - void GetRowSums(const uint8_t * src, size_t stride, size_t width, size_t height, uint32_t * sums); - - void GetColSums(const uint8_t * src, size_t stride, size_t width, size_t height, uint32_t * sums); - - void GetAbsDyRowSums(const uint8_t * src, size_t stride, size_t width, size_t height, uint32_t * sums); - - void GetAbsDxColSums(const uint8_t * src, size_t stride, size_t width, size_t height, uint32_t * sums); - - void ValueSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum); - - void SquareSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum); - - void ValueSquareSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * valueSum, uint64_t * squareSum); - - void CorrelationSum(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, size_t width, size_t height, uint64_t * sum); - - void StretchGray2x2(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride); - - void SvmSumLinear(const float * x, const float * svs, const float * weights, size_t length, size_t count, float * sum); - - void SynetAddBias(const float * bias, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format); - - void SynetConvert32fTo8u(const float* src, size_t batch, size_t channels, size_t height, size_t width, SimdTensorFormatType format, const float* scale, const float* shift, uint8_t* dst, SimdSynetCompatibilityType compatibility); - - void SynetEltwiseLayerForward(float const * const * src, const float * weight, size_t count, size_t size, SimdSynetEltwiseOperationType type, float * dst); - - void SynetElu32f(const float * src, size_t size, const float * alpha, float * dst); - - void SynetFusedLayerForward0(const float * src, const float * bias, const float * scale, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format); - - void SynetFusedLayerForward1(const float * src, const float * bias0, const float * scale1, const float * bias1, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format); - - void SynetFusedLayerForward2(const float * src, const float * scale, const float * bias, size_t channels, size_t spatial, const float * slope, float * dst, SimdTensorFormatType format); - - void SynetFusedLayerForward3(const float * src, const float * bias, const float * scale, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format); - - void SynetFusedLayerForward4(const float * src, const float * bias0, const float * scale1, const float * bias1, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format); - - void SynetFusedLayerForward8(const float * src0, const float * src1, const float * src2, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format); - - void SynetFusedLayerForward9(const float * src0, const float * src1, const float * scale, const float * bias, size_t channels0, size_t channels1, size_t spatial, float * dst0, float * dst1, SimdTensorFormatType format); - - void SynetHswish32f(const float * src, size_t size, const float * shift, const float * scale, float * dst); - - void SynetInnerProductLayerForward(const float * src, const float * weight, const float * bias, size_t count, size_t size, float * dst); - - void SynetLrnLayerCrossChannels(const float * src, size_t half, size_t channels, size_t spatial, const float * k, float * dst, SimdTensorFormatType format); 
- - void SynetPoolingForwardAverage(const float * src, size_t srcC, size_t srcH, size_t srcW, size_t kernelY, size_t kernelX, - size_t strideY, size_t strideX, size_t padY, size_t padX, float* dst, size_t dstH, size_t dstW, SimdBool excludePad, SimdTensorFormatType format); - - void SynetPoolingForwardMax32f(const float * src, size_t srcC, size_t srcH, size_t srcW, size_t kernelY, size_t kernelX, - size_t strideY, size_t strideX, size_t padY, size_t padX, float * dst, size_t dstH, size_t dstW, SimdTensorFormatType format); - - void SynetPoolingForwardMax8u(const uint8_t* src, size_t srcC, size_t srcH, size_t srcW, size_t kernelY, size_t kernelX, - size_t strideY, size_t strideX, size_t padY, size_t padX, uint8_t* dst, size_t dstH, size_t dstW, SimdTensorFormatType format); - - void SynetPreluLayerForward(const float * src, const float * slope, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format); - - void SynetRelu32f(const float* src, size_t size, const float* slope, float* dst); - - void SynetReorderImage(size_t batch, size_t channels, size_t spatial, const float * src, SimdTensorFormatType srcFormat, float * dst, SimdTensorFormatType dstFormat); - - void SynetReorderFilter(size_t output, size_t input, size_t kernel, const float * src, SimdTensorFormatType srcFormat, float * dst, SimdTensorFormatType dstFormat); - - void SynetRestrictRange32f(const float * src, size_t size, const float * lower, const float * upper, float * dst); - - void SynetScaleLayerForward(const float* src, const float* scale, const float* bias, size_t channels, size_t height, size_t width, float* dst, SimdTensorFormatType format, SimdSynetCompatibilityType compatibility); - - void SynetSetInput(const uint8_t * src, size_t width, size_t height, size_t stride, SimdPixelFormatType srcFormat, - const float * lower, const float * upper, float * dst, size_t channels, SimdTensorFormatType dstFormat); - - void SynetShuffleLayerForward(const float* src0, const float* src1, size_t channels0, size_t channels1, size_t spatial, float* dst0, float* dst1, SimdTensorFormatType format, int type); - - void SynetSigmoid32f(const float* src, size_t size, const float* slope, float* dst); - - void SynetSoftmaxLayerForward(const float * src, size_t outer, size_t size, size_t inner, float * dst); - - void SynetSoftplus32f(const float* src, size_t size, const float* beta, const float* threshold, float* dst); - - SimdTensorFormatType SynetSpecifyTensorFormat(SimdTensorFormatType format); - - void SynetTanh32f(const float* src, size_t size, const float* slope, float* dst); - - size_t SynetTensorAlignment(SimdTensorFormatType format); - - void SynetUnaryOperation32fLayerForward(const float* src, size_t size, SimdSynetUnaryOperation32fType type, float* dst); - - void TextureBoostedSaturatedGradient(const uint8_t * src, size_t srcStride, size_t width, size_t height, - uint8_t saturation, uint8_t boost, uint8_t * dx, size_t dxStride, uint8_t * dy, size_t dyStride); - - void TextureBoostedUv(const uint8_t * src, size_t srcStride, size_t width, size_t height, - uint8_t boost, uint8_t * dst, size_t dstStride); - - void TextureGetDifferenceSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * lo, size_t loStride, const uint8_t * hi, size_t hiStride, int64_t * sum); - - void TexturePerformCompensation(const uint8_t * src, size_t srcStride, size_t width, size_t height, - int shift, uint8_t * dst, size_t dstStride); - - void TransformImage(const uint8_t * src, size_t srcStride, size_t width, 
size_t height, size_t pixelSize, SimdTransformType transform, uint8_t * dst, size_t dstStride); - - void WinogradKernel1x3Block1x4SetFilter(const float* src, size_t size, float* dst, SimdBool trans); - - void WinogradKernel1x3Block1x4SetInput(const float* src, size_t srcChannels, size_t srcHeight, size_t srcWidth, - size_t padY, size_t padX, size_t padH, size_t padW, float* dst, size_t dstStride, SimdBool trans); - - void WinogradKernel1x3Block1x4SetOutput(const float* src, size_t srcStride, float* dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans); - - void WinogradKernel1x5Block1x4SetFilter(const float* src, size_t size, float* dst, SimdBool trans); - - void WinogradKernel1x5Block1x4SetInput(const float* src, size_t srcChannels, size_t srcHeight, size_t srcWidth, - size_t padY, size_t padX, size_t padH, size_t padW, float* dst, size_t dstStride, SimdBool trans); - - void WinogradKernel1x5Block1x4SetOutput(const float* src, size_t srcStride, float* dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans); - - void WinogradKernel2x2Block2x2SetFilter(const float* src, size_t size, float* dst, SimdBool trans); - - void WinogradKernel2x2Block2x2SetInput(const float* src, size_t srcChannels, size_t srcHeight, size_t srcWidth, - size_t padY, size_t padX, size_t padH, size_t padW, float* dst, size_t dstStride, SimdBool trans); - - void WinogradKernel2x2Block2x2SetOutput(const float* src, size_t srcStride, float* dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans); - - void WinogradKernel2x2Block4x4SetFilter(const float* src, size_t size, float* dst, SimdBool trans); - - void WinogradKernel2x2Block4x4SetInput(const float* src, size_t srcChannels, size_t srcHeight, size_t srcWidth, - size_t padY, size_t padX, size_t padH, size_t padW, float* dst, size_t dstStride, SimdBool trans); - - void WinogradKernel2x2Block4x4SetOutput(const float* src, size_t srcStride, float* dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans); - - void WinogradKernel3x3Block2x2SetFilter(const float * src, size_t size, float * dst, SimdBool trans); - - void WinogradKernel3x3Block2x2SetInput(const float* src, size_t srcChannels, size_t srcHeight, size_t srcWidth, - size_t padY, size_t padX, size_t padH, size_t padW, float* dst, size_t dstStride, SimdBool trans); - - void WinogradKernel3x3Block2x2SetOutput(const float * src, size_t srcStride, float * dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans); - - void WinogradKernel3x3Block3x3SetFilter(const float * src, size_t size, float * dst, SimdBool trans); - - void WinogradKernel3x3Block3x3SetInput(const float* src, size_t srcChannels, size_t srcHeight, size_t srcWidth, - size_t padY, size_t padX, size_t padH, size_t padW, float* dst, size_t dstStride, SimdBool trans); - - void WinogradKernel3x3Block3x3SetOutput(const float * src, size_t srcStride, float * dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans); - - void WinogradKernel3x3Block4x4SetFilter(const float * src, size_t size, float * dst, SimdBool trans); - - void WinogradKernel3x3Block4x4SetInput(const float* src, size_t srcChannels, size_t srcHeight, size_t srcWidth, - size_t padY, size_t padX, size_t padH, size_t padW, float* dst, size_t dstStride, SimdBool trans); - - void WinogradKernel3x3Block4x4SetOutput(const float * src, size_t srcStride, float * dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans); - - void Yuva420pToBgra(const uint8_t * y, size_t 
yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - const uint8_t * a, size_t aStride, size_t width, size_t height, uint8_t * bgra, size_t bgraStride); - - void Yuv420pToBgr(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * bgr, size_t bgrStride); - - void Yuv422pToBgr(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * bgr, size_t bgrStride); - - void Yuv444pToBgr(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * bgr, size_t bgrStride); - - void Yuv420pToBgra(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * bgra, size_t bgraStride, uint8_t alpha); - - void Yuv422pToBgra(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * bgra, size_t bgraStride, uint8_t alpha); - - void Yuv444pToBgra(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * bgra, size_t bgraStride, uint8_t alpha); - - void Yuv444pToHsl(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * hsl, size_t hslStride); - - void Yuv444pToHsv(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * hsv, size_t hsvStride); - - void Yuv420pToHue(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * hue, size_t hueStride); - - void Yuv444pToHue(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * hue, size_t hueStride); - - void Yuv420pToRgb(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride, - size_t width, size_t height, uint8_t* rgb, size_t rgbStride); - - void Yuv422pToRgb(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride, - size_t width, size_t height, uint8_t* rgb, size_t rgbStride); - - void Yuv444pToRgb(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride, - size_t width, size_t height, uint8_t* rgb, size_t rgbStride); - } -} -#endif//__SimdBase_h__
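Every function declared in this header follows the same plane-plus-stride convention: an image plane is passed as a pointer and a row stride in bytes, so views into padded or cropped buffers work without copying. A minimal usage sketch of one of the converters (buffer names and sizes are illustrative, assuming the Simd sources are on the include path; Yuv420pToBgr additionally requires even width and height for its 4:2:0 chroma subsampling):

    #include "Simd/SimdBase.h"
    #include <vector>

    // Convert one tightly packed I420 frame to packed BGR.
    void I420ToBgrFrame(size_t width, size_t height)
    {
        std::vector<uint8_t> y(width * height);       // luma plane, stride == width
        std::vector<uint8_t> u(width * height / 4);   // chroma planes are half size
        std::vector<uint8_t> v(width * height / 4);   //   in both dimensions
        std::vector<uint8_t> bgr(width * height * 3); // 3 bytes per output pixel
        Simd::Base::Yuv420pToBgr(y.data(), width, u.data(), width / 2, v.data(), width / 2,
            width, height, bgr.data(), width * 3);
    }

diff --git a/src/3rd/Simd/Simd/SimdBaseAbsDifference.cpp deleted file mode 100644 index ce6a8f39..00000000 --- a/src/3rd/Simd/Simd/SimdBaseAbsDifference.cpp +++ /dev/null @@ -1,47 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2019 Yermalayeu Ihar, -* 2019-2019 Facundo Galan.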
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMath.h" - -namespace Simd -{ - namespace Base - { - void AbsDifference(const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride, uint8_t *c, size_t cStride, - size_t width, size_t height) - { - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < width; ++col) - { - c[col] = AbsDifferenceU8(a[col], b[col]); - } - a += aStride; - b += bStride; - c += cStride; - } - } - } -}
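The scalar kernel above is the portable reference for the per-pixel absolute difference, c[x] = |a[x] - b[x]|; the SIMD back-ends implement the same contract. A short call sketch (illustrative names, assuming tightly packed 8-bit gray images so every stride equals the width, and that diff is pre-sized to width * height bytes):

    #include "Simd/SimdBase.h"
    #include <vector>

    // Per-pixel |frame0 - frame1| for two packed grayscale frames.
    void FrameDiff(const std::vector<uint8_t>& frame0, const std::vector<uint8_t>& frame1,
        std::vector<uint8_t>& diff, size_t width, size_t height)
    {
        Simd::Base::AbsDifference(frame0.data(), width, frame1.data(), width,
            diff.data(), width, width, height);
    }

diff --git a/src/3rd/Simd/Simd/SimdBaseAbsDifferenceSum.cpp deleted file mode 100644 index 64243424..00000000 --- a/src/3rd/Simd/Simd/SimdBaseAbsDifferenceSum.cpp +++ /dev/null @@ -1,151 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE.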
-*/ -#include "Simd/SimdMath.h" - -namespace Simd -{ - namespace Base - { - void AbsDifferenceSum(const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride, - size_t width, size_t height, uint64_t * sum) - { - *sum = 0; - for (size_t row = 0; row < height; ++row) - { - int rowSum = 0; - for (size_t col = 0; col < width; ++col) - { - rowSum += AbsDifferenceU8(a[col], b[col]); - } - *sum += rowSum; - a += aStride; - b += bStride; - } - } - - void AbsDifferenceSumMasked(const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride, - const uint8_t *mask, size_t maskStride, uint8_t index, size_t width, size_t height, uint64_t * sum) - { - *sum = 0; - for (size_t row = 0; row < height; ++row) - { - int rowSum = 0; - for (size_t col = 0; col < width; ++col) - { - if (mask[col] == index) - rowSum += AbsDifferenceU8(a[col], b[col]); - } - *sum += rowSum; - a += aStride; - b += bStride; - mask += maskStride; - } - } - - void AbsDifferenceSums3x3(const uint8_t * current, size_t currentStride, const uint8_t * background, size_t backgroundStride, - size_t width, size_t height, uint64_t * sums) - { - assert(width > 2 && height > 2); - - for (size_t i = 0; i < 9; ++i) - sums[i] = 0; - - height -= 2; - width -= 2; - current += 1 + currentStride; - background += 1 + backgroundStride; - for (size_t row = 0; row < height; ++row) - { - int rowSums[9]; - for (size_t i = 0; i < 9; ++i) - rowSums[i] = 0; - - for (size_t col = 0; col < width; ++col) - { - int value = current[col]; - rowSums[0] += AbsDifferenceU8(value, background[col - backgroundStride - 1]); - rowSums[1] += AbsDifferenceU8(value, background[col - backgroundStride]); - rowSums[2] += AbsDifferenceU8(value, background[col - backgroundStride + 1]); - rowSums[3] += AbsDifferenceU8(value, background[col - 1]); - rowSums[4] += AbsDifferenceU8(value, background[col]); - rowSums[5] += AbsDifferenceU8(value, background[col + 1]); - rowSums[6] += AbsDifferenceU8(value, background[col + backgroundStride - 1]); - rowSums[7] += AbsDifferenceU8(value, background[col + backgroundStride]); - rowSums[8] += AbsDifferenceU8(value, background[col + backgroundStride + 1]); - } - - for (size_t i = 0; i < 9; ++i) - sums[i] += rowSums[i]; - - current += currentStride; - background += backgroundStride; - } - } - - void AbsDifferenceSums3x3Masked(const uint8_t *current, size_t currentStride, const uint8_t *background, size_t backgroundStride, - const uint8_t *mask, size_t maskStride, uint8_t index, size_t width, size_t height, uint64_t * sums) - { - assert(width > 2 && height > 2); - - for (size_t i = 0; i < 9; ++i) - sums[i] = 0; - - height -= 2; - width -= 2; - current += 1 + currentStride; - background += 1 + backgroundStride; - mask += 1 + maskStride; - for (size_t row = 0; row < height; ++row) - { - int rowSums[9]; - for (size_t i = 0; i < 9; ++i) - rowSums[i] = 0; - - for (size_t col = 0; col < width; ++col) - { - if (mask[col] == index) - { - int value = current[col]; - rowSums[0] += AbsDifferenceU8(value, background[col - backgroundStride - 1]); - rowSums[1] += AbsDifferenceU8(value, background[col - backgroundStride]); - rowSums[2] += AbsDifferenceU8(value, background[col - backgroundStride + 1]); - rowSums[3] += AbsDifferenceU8(value, background[col - 1]); - rowSums[4] += AbsDifferenceU8(value, background[col]); - rowSums[5] += AbsDifferenceU8(value, background[col + 1]); - rowSums[6] += AbsDifferenceU8(value, background[col + backgroundStride - 1]); - rowSums[7] += AbsDifferenceU8(value, background[col + backgroundStride]); - rowSums[8] += 
AbsDifferenceU8(value, background[col + backgroundStride + 1]); - } - } - - for (size_t i = 0; i < 9; ++i) - sums[i] += rowSums[i]; - - current += currentStride; - background += backgroundStride; - mask += maskStride; - } - } - } -} diff --git a/src/3rd/Simd/Simd/SimdBaseAbsGradientSaturatedSum.cpp b/src/3rd/Simd/Simd/SimdBaseAbsGradientSaturatedSum.cpp deleted file mode 100644 index f854a8d2..00000000 --- a/src/3rd/Simd/Simd/SimdBaseAbsGradientSaturatedSum.cpp +++ /dev/null @@ -1,52 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMath.h" - -namespace Simd -{ - namespace Base - { - void AbsGradientSaturatedSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride) - { - memset(dst, 0, width); - src += srcStride; - dst += dstStride; - for (size_t row = 2; row < height; ++row) - { - dst[0] = 0; - for (size_t col = 1; col < width - 1; ++col) - { - const int dy = AbsDifferenceU8(src[col - srcStride], src[col + srcStride]); - const int dx = AbsDifferenceU8(src[col - 1], src[col + 1]); - dst[col] = MinU8(dx + dy, 0xFF); - } - dst[width - 1] = 0; - - src += srcStride; - dst += dstStride; - } - memset(dst, 0, width); - } - } -} diff --git a/src/3rd/Simd/Simd/SimdBaseAddFeatureDifference.cpp b/src/3rd/Simd/Simd/SimdBaseAddFeatureDifference.cpp deleted file mode 100644 index 8077b1a1..00000000 --- a/src/3rd/Simd/Simd/SimdBaseAddFeatureDifference.cpp +++ /dev/null @@ -1,61 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMath.h" - -namespace Simd -{ - namespace Base - { - const int SHIFT = 16; - - SIMD_INLINE uint32_t ShiftedWeightedSquare(int difference, int weight) - { - return difference*difference*weight >> SHIFT; - } - - SIMD_INLINE int FeatureDifference(int value, int lo, int hi) - { - return Max(0, Max(value - hi, lo - value)); - } - - void AddFeatureDifference(const uint8_t * value, size_t valueStride, size_t width, size_t height, - const uint8_t * lo, size_t loStride, const uint8_t * hi, size_t hiStride, - uint16_t weight, uint8_t * difference, size_t differenceStride) - { - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < width; ++col) - { - int featureDifference = FeatureDifference(value[col], lo[col], hi[col]); - int sum = difference[col] + ShiftedWeightedSquare(featureDifference, weight); - difference[col] = Min(sum, 0xFF); - } - value += valueStride; - lo += loStride; - hi += hiStride; - difference += differenceStride; - } - } - } -}
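AddFeatureDifference accumulates, per pixel, a saturated weighted squared excursion of value outside the background band [lo, hi]: d = max(0, max(value - hi, lo - value)), then difference += (d*d*weight) >> 16, clamped to 255, with weight acting as a Q16 fixed-point factor. A worked scalar check of that arithmetic (the numbers are illustrative, not from the library's tests):

    #include <algorithm>
    #include <cassert>

    int main()
    {
        const int value = 200, lo = 50, hi = 180; // pixel sits 20 above the band
        const int weight = 0x8000;                // 0.5 in Q16 fixed point
        const int d = std::max(0, std::max(value - hi, lo - value));
        const int add = d * d * weight >> 16;     // 400 * 0.5 = 200
        assert(d == 20 && add == 200);
        return 0;
    }

diff --git a/src/3rd/Simd/Simd/SimdBaseAlphaBlending.cpp deleted file mode 100644 index 91df7c55..00000000 --- a/src/3rd/Simd/Simd/SimdBaseAlphaBlending.cpp +++ /dev/null @@ -1,114 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE.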
-*/ -#include "Simd/SimdMath.h" - -namespace Simd -{ - namespace Base - { - SIMD_INLINE int AlphaBlending(int src, int dst, int alpha) - { - return DivideBy255(src*alpha + dst*(0xFF - alpha)); - } - - template <size_t channelCount> void AlphaBlending(const uint8_t * src, int alpha, uint8_t * dst); - - template <> SIMD_INLINE void AlphaBlending<1>(const uint8_t * src, int alpha, uint8_t * dst) - { - dst[0] = AlphaBlending(src[0], dst[0], alpha); - } - - template <> SIMD_INLINE void AlphaBlending<2>(const uint8_t * src, int alpha, uint8_t * dst) - { - dst[0] = AlphaBlending(src[0], dst[0], alpha); - dst[1] = AlphaBlending(src[1], dst[1], alpha); - } - - template <> SIMD_INLINE void AlphaBlending<3>(const uint8_t * src, int alpha, uint8_t * dst) - { - dst[0] = AlphaBlending(src[0], dst[0], alpha); - dst[1] = AlphaBlending(src[1], dst[1], alpha); - dst[2] = AlphaBlending(src[2], dst[2], alpha); - } - - template <> SIMD_INLINE void AlphaBlending<4>(const uint8_t * src, int alpha, uint8_t * dst) - { - dst[0] = AlphaBlending(src[0], dst[0], alpha); - dst[1] = AlphaBlending(src[1], dst[1], alpha); - dst[2] = AlphaBlending(src[2], dst[2], alpha); - dst[3] = AlphaBlending(src[3], dst[3], alpha); - } - - template <size_t channelCount> void AlphaBlending(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * alpha, size_t alphaStride, uint8_t * dst, size_t dstStride) - { - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0, offset = 0; col < width; ++col, offset += channelCount) - AlphaBlending<channelCount>(src + offset, alpha[col], dst + offset); - src += srcStride; - alpha += alphaStride; - dst += dstStride; - } - } - - void AlphaBlending(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t channelCount, - const uint8_t * alpha, size_t alphaStride, uint8_t * dst, size_t dstStride) - { - assert(channelCount >= 1 && channelCount <= 4); - - switch (channelCount) - { - case 1: AlphaBlending<1>(src, srcStride, width, height, alpha, alphaStride, dst, dstStride); break; - case 2: AlphaBlending<2>(src, srcStride, width, height, alpha, alphaStride, dst, dstStride); break; - case 3: AlphaBlending<3>(src, srcStride, width, height, alpha, alphaStride, dst, dstStride); break; - case 4: AlphaBlending<4>(src, srcStride, width, height, alpha, alphaStride, dst, dstStride); break; - } - } - - template <size_t channelCount> void AlphaFilling(uint8_t * dst, size_t dstStride, size_t width, size_t height, const uint8_t * channel, const uint8_t * alpha, size_t alphaStride) - { - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0, offset = 0; col < width; ++col, offset += channelCount) - AlphaBlending<channelCount>(channel, alpha[col], dst + offset); - alpha += alphaStride; - dst += dstStride; - } - } - - void AlphaFilling(uint8_t * dst, size_t dstStride, size_t width, size_t height, const uint8_t * channel, size_t channelCount, const uint8_t * alpha, size_t alphaStride) - { - assert(channelCount >= 1 && channelCount <= 4); - - switch (channelCount) - { - case 1: AlphaFilling<1>(dst, dstStride, width, height, channel, alpha, alphaStride); break; - case 2: AlphaFilling<2>(dst, dstStride, width, height, channel, alpha, alphaStride); break; - case 3: AlphaFilling<3>(dst, dstStride, width, height, channel, alpha, alphaStride); break; - case 4: AlphaFilling<4>(dst, dstStride, width, height, channel, alpha, alphaStride); break; - } - } - } -}
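The blend above is the classic integer alpha composite, dst = (src*alpha + dst*(255 - alpha)) / 255, with DivideBy255 standing in for a rounded division by 255. A quick scalar sanity check of the formula (Blend is a local stand-in, using one common rounding trick; the library's DivideBy255 may differ in its exact rounding):

    #include <cassert>
    #include <cstdint>

    // Rounded (x + 128 + ((x + 128) >> 8)) >> 8 approximation of x / 255.
    static uint8_t Blend(int src, int dst, int alpha)
    {
        const int x = src * alpha + dst * (0xFF - alpha);
        return uint8_t((x + 128 + ((x + 128) >> 8)) >> 8);
    }

    int main()
    {
        assert(Blend(200, 10, 0xFF) == 200); // fully opaque: source wins
        assert(Blend(200, 10, 0) == 10);     // fully transparent: destination kept
        assert(Blend(200, 10, 128) == 105);  // ~midpoint blend of 200 and 10
        return 0;
    }

diff --git a/src/3rd/Simd/Simd/SimdBaseBackground.cpp deleted file mode 100644 index 5c426fd6..00000000 --- a/src/3rd/Simd/Simd/SimdBaseBackground.cpp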
+++ /dev/null @@ -1,220 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMath.h" - -namespace Simd -{ - namespace Base - { - void BackgroundGrowRangeSlow(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * lo, size_t loStride, uint8_t * hi, size_t hiStride) - { - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < width; ++col) - { - if (value[col] < lo[col]) - lo[col]--; - if (value[col] > hi[col]) - hi[col]++; - } - value += valueStride; - lo += loStride; - hi += hiStride; - } - } - - void BackgroundGrowRangeFast(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * lo, size_t loStride, uint8_t * hi, size_t hiStride) - { - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < width; ++col) - { - if (value[col] < lo[col]) - lo[col] = value[col]; - if (value[col] > hi[col]) - hi[col] = value[col]; - } - value += valueStride; - lo += loStride; - hi += hiStride; - } - } - - void BackgroundIncrementCount(const uint8_t * value, size_t valueStride, size_t width, size_t height, - const uint8_t * loValue, size_t loValueStride, const uint8_t * hiValue, size_t hiValueStride, - uint8_t * loCount, size_t loCountStride, uint8_t * hiCount, size_t hiCountStride) - { - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < width; ++col) - { - if (value[col] < loValue[col] && loCount[col] < 0xFF) - loCount[col]++; - if (value[col] > hiValue[col] && hiCount[col] < 0xFF) - hiCount[col]++; - } - value += valueStride; - loValue += loValueStride; - hiValue += hiValueStride; - loCount += loCountStride; - hiCount += hiCountStride; - } - } - - SIMD_INLINE void AdjustLo(const uint8_t & count, uint8_t & value, int threshold) - { - if (count > threshold) - { - if (value > 0) - value--; - } - else if (count < threshold) - { - if (value < 0xFF) - value++; - } - } - - SIMD_INLINE void AdjustHi(const uint8_t & count, uint8_t & value, int threshold) - { - if (count > threshold) - { - if (value < 0xFF) - value++; - } - else if (count < threshold) - { - if (value > 0) - value--; - } - } - - void BackgroundAdjustRange(uint8_t * loCount, size_t loCountStride, size_t width, size_t height, - uint8_t * loValue, size_t loValueStride, uint8_t * hiCount, size_t hiCountStride, - uint8_t * hiValue, size_t hiValueStride, uint8_t threshold) - { - for (size_t row = 0; row < 
height; ++row) - { - for (size_t col = 0; col < width; ++col) - { - AdjustLo(loCount[col], loValue[col], threshold); - AdjustHi(hiCount[col], hiValue[col], threshold); - loCount[col] = 0; - hiCount[col] = 0; - } - loValue += loValueStride; - hiValue += hiValueStride; - loCount += loCountStride; - hiCount += hiCountStride; - } - } - - void BackgroundAdjustRangeMasked(uint8_t * loCount, size_t loCountStride, size_t width, size_t height, - uint8_t * loValue, size_t loValueStride, uint8_t * hiCount, size_t hiCountStride, - uint8_t * hiValue, size_t hiValueStride, uint8_t threshold, const uint8_t * mask, size_t maskStride) - { - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < width; ++col) - { - if (mask[col]) - { - AdjustLo(loCount[col], loValue[col], threshold); - AdjustHi(hiCount[col], hiValue[col], threshold); - } - loCount[col] = 0; - hiCount[col] = 0; - } - loValue += loValueStride; - hiValue += hiValueStride; - loCount += loCountStride; - hiCount += hiCountStride; - mask += maskStride; - } - } - - SIMD_INLINE void BackgroundShiftRange(const uint8_t & value, uint8_t & lo, uint8_t & hi) - { - int add = int(value) - int(hi); - int sub = int(lo) - int(value); - if (add > 0) - { - lo = Min(lo + add, 0xFF); - hi = Min(hi + add, 0xFF); - } - if (sub > 0) - { - lo = Max(lo - sub, 0); - hi = Max(hi - sub, 0); - } - } - - void BackgroundShiftRange(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * lo, size_t loStride, uint8_t * hi, size_t hiStride) - { - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < width; ++col) - BackgroundShiftRange(value[col], lo[col], hi[col]); - value += valueStride; - lo += loStride; - hi += hiStride; - } - } - - void BackgroundShiftRangeMasked(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * lo, size_t loStride, uint8_t * hi, size_t hiStride, const uint8_t * mask, size_t maskStride) - { - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < width; ++col) - { - if (mask[col]) - BackgroundShiftRange(value[col], lo[col], hi[col]); - } - value += valueStride; - lo += loStride; - hi += hiStride; - mask += maskStride; - } - } - - void BackgroundInitMask(const uint8_t * src, size_t srcStride, size_t width, size_t height, - uint8_t index, uint8_t value, uint8_t * dst, size_t dstStride) - { - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < width; ++col) - { - if (src[col] == index) - dst[col] = value; - } - src += srcStride; - dst += dstStride; - } - } - } -} diff --git a/src/3rd/Simd/Simd/SimdBaseBayerToBgr.cpp b/src/3rd/Simd/Simd/SimdBaseBayerToBgr.cpp deleted file mode 100644 index 928c2378..00000000 --- a/src/3rd/Simd/Simd/SimdBaseBayerToBgr.cpp +++ /dev/null @@ -1,85 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2018 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. 
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#include "Simd/SimdBayer.h"
-
-namespace Simd
-{
-    namespace Base
-    {
-        template <SimdPixelFormatType bayerFormat> void BayerToBgr(const uint8_t * src[6],
-            size_t col0, size_t col2, size_t col4, uint8_t * dst, size_t stride)
-        {
-            BayerToBgr<bayerFormat>(src,
-                col0, col0 + 1, col2, col2 + 1, col4, col4 + 1,
-                dst, dst + 3, dst + stride, dst + stride + 3);
-        }
-
-        template <SimdPixelFormatType bayerFormat> void BayerToBgr(const uint8_t * bayer, size_t width, size_t height, size_t bayerStride, uint8_t * bgr, size_t bgrStride)
-        {
-            const uint8_t * src[6];
-            for (size_t row = 0; row < height; row += 2)
-            {
-                src[0] = (row == 0 ? bayer : bayer - 2 * bayerStride);
-                src[1] = src[0] + bayerStride;
-                src[2] = bayer;
-                src[3] = src[2] + bayerStride;
-                src[4] = (row == height - 2 ? bayer : bayer + 2 * bayerStride);
-                src[5] = src[4] + bayerStride;
-
-                BayerToBgr<bayerFormat>(src, 0, 0, 2, bgr, bgrStride);
-
-                for (size_t col = 2; col < width - 2; col += 2)
-                    BayerToBgr<bayerFormat>(src, col - 2, col, col + 2, bgr + 3 * col, bgrStride);
-
-                BayerToBgr<bayerFormat>(src, width - 4, width - 2, width - 2, bgr + 3 * (width - 2), bgrStride);
-
-                bayer += 2 * bayerStride;
-                bgr += 2 * bgrStride;
-            }
-        }
-
-        void BayerToBgr(const uint8_t * bayer, size_t width, size_t height, size_t bayerStride, SimdPixelFormatType bayerFormat, uint8_t * bgr, size_t bgrStride)
-        {
-            assert((width % 2 == 0) && (height % 2 == 0));
-
-            switch (bayerFormat)
-            {
-            case SimdPixelFormatBayerGrbg:
-                BayerToBgr<SimdPixelFormatBayerGrbg>(bayer, width, height, bayerStride, bgr, bgrStride);
-                break;
-            case SimdPixelFormatBayerGbrg:
-                BayerToBgr<SimdPixelFormatBayerGbrg>(bayer, width, height, bayerStride, bgr, bgrStride);
-                break;
-            case SimdPixelFormatBayerRggb:
-                BayerToBgr<SimdPixelFormatBayerRggb>(bayer, width, height, bayerStride, bgr, bgrStride);
-                break;
-            case SimdPixelFormatBayerBggr:
-                BayerToBgr<SimdPixelFormatBayerBggr>(bayer, width, height, bayerStride, bgr, bgrStride);
-                break;
-            default:
-                assert(0);
-            }
-        }
-    }
-}
diff --git a/src/3rd/Simd/Simd/SimdBaseBayerToBgra.cpp b/src/3rd/Simd/Simd/SimdBaseBayerToBgra.cpp
deleted file mode 100644
index 846bb5bd..00000000
--- a/src/3rd/Simd/Simd/SimdBaseBayerToBgra.cpp
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2018 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#include "Simd/SimdBayer.h"
-
-namespace Simd
-{
-    namespace Base
-    {
-        template <SimdPixelFormatType bayerFormat> void BayerToBgra(const uint8_t * src[6],
-            size_t col0, size_t col2, size_t col4, uint8_t * dst0, size_t stride, uint8_t alpha)
-        {
-            uint8_t * dst1 = dst0 + stride;
-            BayerToBgr<bayerFormat>(src, col0, col0 + 1, col2, col2 + 1, col4, col4 + 1, dst0, dst0 + 4, dst1, dst1 + 4);
-            dst0[3] = alpha;
-            dst0[7] = alpha;
-            dst1[3] = alpha;
-            dst1[7] = alpha;
-        }
-
-        template <SimdPixelFormatType bayerFormat> void BayerToBgra(const uint8_t * bayer, size_t width, size_t height, size_t bayerStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha)
-        {
-            const uint8_t * src[6];
-            for (size_t row = 0; row < height; row += 2)
-            {
-                src[0] = (row == 0 ? bayer : bayer - 2 * bayerStride);
-                src[1] = src[0] + bayerStride;
-                src[2] = bayer;
-                src[3] = src[2] + bayerStride;
-                src[4] = (row == height - 2 ? bayer : bayer + 2 * bayerStride);
-                src[5] = src[4] + bayerStride;
-
-                BayerToBgra<bayerFormat>(src, 0, 0, 2, bgra, bgraStride, alpha);
-
-                for (size_t col = 2; col < width - 2; col += 2)
-                    BayerToBgra<bayerFormat>(src, col - 2, col, col + 2, bgra + 4 * col, bgraStride, alpha);
-
-                BayerToBgra<bayerFormat>(src, width - 4, width - 2, width - 2, bgra + 4 * (width - 2), bgraStride, alpha);
-
-                bayer += 2 * bayerStride;
-                bgra += 2 * bgraStride;
-            }
-        }
-
-        void BayerToBgra(const uint8_t * bayer, size_t width, size_t height, size_t bayerStride, SimdPixelFormatType bayerFormat, uint8_t * bgra, size_t bgraStride, uint8_t alpha)
-        {
-            assert((width % 2 == 0) && (height % 2 == 0));
-
-            switch (bayerFormat)
-            {
-            case SimdPixelFormatBayerGrbg:
-                BayerToBgra<SimdPixelFormatBayerGrbg>(bayer, width, height, bayerStride, bgra, bgraStride, alpha);
-                break;
-            case SimdPixelFormatBayerGbrg:
-                BayerToBgra<SimdPixelFormatBayerGbrg>(bayer, width, height, bayerStride, bgra, bgraStride, alpha);
-                break;
-            case SimdPixelFormatBayerRggb:
-                BayerToBgra<SimdPixelFormatBayerRggb>(bayer, width, height, bayerStride, bgra, bgraStride, alpha);
-                break;
-            case SimdPixelFormatBayerBggr:
-                BayerToBgra<SimdPixelFormatBayerBggr>(bayer, width, height, bayerStride, bgra, bgraStride, alpha);
-                break;
-            default:
-                assert(0);
-            }
-        }
-    }
-}
diff --git a/src/3rd/Simd/Simd/SimdBaseBgrToBayer.cpp b/src/3rd/Simd/Simd/SimdBaseBgrToBayer.cpp
deleted file mode 100644
index 43ad926a..00000000
--- a/src/3rd/Simd/Simd/SimdBaseBgrToBayer.cpp
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#include "Simd/SimdDefs.h"
-
-namespace Simd
-{
-    namespace Base
-    {
-        template <SimdPixelFormatType bayerFormat> void BgrToBayer(const uint8_t * bgr0, const uint8_t * bgr1, uint8_t * bayer0, uint8_t * bayer1);
-
-        template <> SIMD_INLINE void BgrToBayer<SimdPixelFormatBayerGrbg>(const uint8_t * bgr0, const uint8_t * bgr1, uint8_t * bayer0, uint8_t * bayer1)
-        {
-            bayer0[0] = bgr0[1];
-            bayer0[1] = bgr0[5];
-            bayer1[0] = bgr1[0];
-            bayer1[1] = bgr1[4];
-        }
-
-        template <> SIMD_INLINE void BgrToBayer<SimdPixelFormatBayerGbrg>(const uint8_t * bgr0, const uint8_t * bgr1, uint8_t * bayer0, uint8_t * bayer1)
-        {
-            bayer0[0] = bgr0[1];
-            bayer0[1] = bgr0[3];
-            bayer1[0] = bgr1[2];
-            bayer1[1] = bgr1[4];
-        }
-
-        template <> SIMD_INLINE void BgrToBayer<SimdPixelFormatBayerRggb>(const uint8_t * bgr0, const uint8_t * bgr1, uint8_t * bayer0, uint8_t * bayer1)
-        {
-            bayer0[0] = bgr0[2];
-            bayer0[1] = bgr0[4];
-            bayer1[0] = bgr1[1];
-            bayer1[1] = bgr1[3];
-        }
-
-        template <> SIMD_INLINE void BgrToBayer<SimdPixelFormatBayerBggr>(const uint8_t * bgr0, const uint8_t * bgr1, uint8_t * bayer0, uint8_t * bayer1)
-        {
-            bayer0[0] = bgr0[0];
-            bayer0[1] = bgr0[4];
-            bayer1[0] = bgr1[1];
-            bayer1[1] = bgr1[5];
-        }
-
-        template <SimdPixelFormatType bayerFormat> SIMD_INLINE void BgrToBayer(const uint8_t * bgr, size_t bgrStride, uint8_t * bayer, size_t bayerStride)
-        {
-            BgrToBayer<bayerFormat>(bgr, bgr + bgrStride, bayer, bayer + bayerStride);
-        }
-
-        template <SimdPixelFormatType bayerFormat> void BgrToBayer(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bayer, size_t bayerStride)
-        {
-            for (size_t row = 0; row < height; row += 2)
-            {
-                for (size_t col = 0, offset = 0; col < width; col += 2, offset += 6)
-                    BgrToBayer<bayerFormat>(bgr + offset, bgrStride, bayer + col, bayerStride);
-                bgr += 2 * bgrStride;
-                bayer += 2 * bayerStride;
-            }
-        }
-
-        void BgrToBayer(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bayer, size_t bayerStride, SimdPixelFormatType bayerFormat)
-        {
-            assert((width % 2 == 0) && (height % 2 == 0));
-
-            switch (bayerFormat)
-            {
-            case SimdPixelFormatBayerGrbg:
-                BgrToBayer<SimdPixelFormatBayerGrbg>(bgr, width, height, bgrStride, bayer, bayerStride);
-                break;
-            case SimdPixelFormatBayerGbrg:
-                BgrToBayer<SimdPixelFormatBayerGbrg>(bgr, width, height, bgrStride, bayer, bayerStride);
-                break;
-            case SimdPixelFormatBayerRggb:
-                BgrToBayer<SimdPixelFormatBayerRggb>(bgr, width, height, bgrStride, bayer, bayerStride);
-                break;
-            case SimdPixelFormatBayerBggr:
-                BgrToBayer<SimdPixelFormatBayerBggr>(bgr, width, height, bgrStride, bayer, bayerStride);
-                break;
-            default:
-                assert(0);
-            }
-        }
-    }
-}
diff --git a/src/3rd/Simd/Simd/SimdBaseBgrToBgra.cpp b/src/3rd/Simd/Simd/SimdBaseBgrToBgra.cpp
deleted file mode 100644
index d888664c..00000000
--- a/src/3rd/Simd/Simd/SimdBaseBgrToBgra.cpp
+++ /dev/null
@@ -1,128 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2020 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdDefs.h" - -namespace Simd -{ - namespace Base - { - void BgrToBgra(const uint8_t *bgr, size_t size, uint8_t *bgra, bool fillAlpha, bool lastRow, uint8_t alpha) - { - if (fillAlpha) - { -#ifdef SIMD_BIG_ENDIAN - const int32_t alphaMask = alpha; -#else - const int32_t alphaMask = alpha << 24; -#endif - for (size_t i = (lastRow ? 1 : 0); i < size; ++i, bgr += 3, bgra += 4) - { - *(int32_t*)bgra = (*(int32_t*)bgr) | alphaMask; - } - if (lastRow) - { - bgra[0] = bgr[0]; - bgra[1] = bgr[1]; - bgra[2] = bgr[2]; - bgra[3] = alpha; - } - } - else - { - for (size_t i = (lastRow ? 1 : 0); i < size; ++i, bgr += 3, bgra += 4) - { - *(int32_t*)bgra = (*(int32_t*)bgr); - } - if (lastRow) - { - bgra[0] = bgr[0]; - bgra[1] = bgr[1]; - bgra[2] = bgr[2]; - } - } - } - - void BgrToBgra(const uint8_t *bgr, size_t width, size_t height, size_t bgrStride, uint8_t *bgra, size_t bgraStride, uint8_t alpha) - { - for (size_t row = 1; row < height; ++row) - { - BgrToBgra(bgr, width, bgra, true, false, alpha); - bgr += bgrStride; - bgra += bgraStride; - } - BgrToBgra(bgr, width, bgra, true, true, alpha); - } - - void Bgr48pToBgra32(const uint8_t * blue, size_t blueStride, size_t width, size_t height, - const uint8_t * green, size_t greenStride, const uint8_t * red, size_t redStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha) - { - for (size_t row = 0; row < height; ++row) - { - const uint8_t * pBlue = blue; - const uint8_t * pGreen = green; - const uint8_t * pRed = red; - uint8_t * pBgra = bgra; - for (size_t col = 0; col < width; ++col) - { -#ifdef SIMD_BIG_ENDIAN - pBgra[0] = pBlue[1]; - pBgra[1] = pGreen[1]; - pBgra[2] = pRed[1]; -#else - pBgra[0] = pBlue[0]; - pBgra[1] = pGreen[0]; - pBgra[2] = pRed[0]; -#endif - pBgra[3] = alpha; - pBlue += 2; - pGreen += 2; - pRed += 2; - pBgra += 4; - } - blue += blueStride; - green += greenStride; - red += redStride; - bgra += bgraStride; - } - } - - void RgbToBgra(const uint8_t * rgb, size_t width, size_t height, size_t rgbStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha) - { - size_t rgbGap = rgbStride - width * 3; - size_t bgraGap = bgraStride - width * 4; - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < width; ++col, rgb += 3, bgra += 4) - { - bgra[0] = rgb[2]; - bgra[1] = rgb[1]; - bgra[2] = rgb[0]; - bgra[3] = alpha; - } - rgb += rgbGap; - bgra += bgraGap; - } - } - } -} diff --git a/src/3rd/Simd/Simd/SimdBaseBgrToGray.cpp b/src/3rd/Simd/Simd/SimdBaseBgrToGray.cpp deleted file mode 100644 index 1e5165d6..00000000 --- a/src/3rd/Simd/Simd/SimdBaseBgrToGray.cpp +++ /dev/null @@ -1,56 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdConversion.h" - -namespace Simd -{ - namespace Base - { - void BgrToGray(const uint8_t *bgr, size_t width, size_t height, size_t bgrStride, uint8_t *gray, size_t grayStride) - { - for (size_t row = 0; row < height; ++row) - { - const uint8_t * pBgr = bgr + row*bgrStride; - uint8_t * pGray = gray + row*grayStride; - for (const uint8_t *pGrayEnd = pGray + width; pGray < pGrayEnd; pGray += 1, pBgr += 3) - { - *pGray = BgrToGray(pBgr[0], pBgr[1], pBgr[2]); - } - } - } - - void RgbToGray(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* gray, size_t grayStride) - { - for (size_t row = 0; row < height; ++row) - { - const uint8_t* pRgb = rgb + row * rgbStride; - uint8_t* pGray = gray + row * grayStride; - for (const uint8_t* pGrayEnd = pGray + width; pGray < pGrayEnd; pGray += 1, pRgb += 3) - { - *pGray = BgrToGray(pRgb[2], pRgb[1], pRgb[0]); - } - } - } - } -} diff --git a/src/3rd/Simd/Simd/SimdBaseBgrToHsl.cpp b/src/3rd/Simd/Simd/SimdBaseBgrToHsl.cpp deleted file mode 100644 index 602523be..00000000 --- a/src/3rd/Simd/Simd/SimdBaseBgrToHsl.cpp +++ /dev/null @@ -1,43 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdConversion.h" - -namespace Simd -{ - namespace Base - { - void BgrToHsl(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * hsl, size_t hslStride) - { - for (size_t row = 0; row < height; ++row) - { - const uint8_t * pBgr = bgr + row*bgrStride; - uint8_t * pHsl = hsl + row*hslStride; - for (const uint8_t * pBgrEnd = pBgr + width * 3; pBgr < pBgrEnd; pBgr += 3, pHsl += 3) - { - BgrToHsl(pBgr[0], pBgr[1], pBgr[2], pHsl); - } - } - } - } -} diff --git a/src/3rd/Simd/Simd/SimdBaseBgrToHsv.cpp b/src/3rd/Simd/Simd/SimdBaseBgrToHsv.cpp deleted file mode 100644 index 45c643c3..00000000 --- a/src/3rd/Simd/Simd/SimdBaseBgrToHsv.cpp +++ /dev/null @@ -1,43 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdConversion.h" - -namespace Simd -{ - namespace Base - { - void BgrToHsv(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * hsv, size_t hsvStride) - { - for (size_t row = 0; row < height; ++row) - { - const uint8_t * pBgr = bgr + row*bgrStride; - uint8_t * pHsv = hsv + row*hsvStride; - for (const uint8_t * pBgrEnd = pBgr + width * 3; pBgr < pBgrEnd; pBgr += 3, pHsv += 3) - { - BgrToHsv(pBgr[0], pBgr[1], pBgr[2], pHsv); - } - } - } - } -} diff --git a/src/3rd/Simd/Simd/SimdBaseBgrToRgb.cpp b/src/3rd/Simd/Simd/SimdBaseBgrToRgb.cpp deleted file mode 100644 index b45cd2a9..00000000 --- a/src/3rd/Simd/Simd/SimdBaseBgrToRgb.cpp +++ /dev/null @@ -1,46 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2019 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdDefs.h" - -namespace Simd -{ - namespace Base - { - void BgrToRgb(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * rgb, size_t rgbStride) - { - size_t size = width * 3; - for (size_t row = 0; row < height; ++row) - { - for (size_t i = 0; i < size; i += 3) - { - rgb[i + 0] = bgr[i + 2]; - rgb[i + 1] = bgr[i + 1]; - rgb[i + 2] = bgr[i + 0]; - } - bgr += bgrStride; - rgb += rgbStride; - } - } - } -} diff --git a/src/3rd/Simd/Simd/SimdBaseBgrToYuv.cpp b/src/3rd/Simd/Simd/SimdBaseBgrToYuv.cpp deleted file mode 100644 index eb34ed9a..00000000 --- a/src/3rd/Simd/Simd/SimdBaseBgrToYuv.cpp +++ /dev/null @@ -1,118 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar, -* 2014-2015 Antonenka Mikhail. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdConversion.h" - -namespace Simd -{ - namespace Base - { - SIMD_INLINE void BgrToYuv420p(const uint8_t * bgr0, size_t bgrStride, uint8_t * y0, size_t yStride, uint8_t * u, uint8_t * v) - { - const uint8_t * bgr1 = bgr0 + bgrStride; - uint8_t * y1 = y0 + yStride; - - y0[0] = BgrToY(bgr0[0], bgr0[1], bgr0[2]); - y0[1] = BgrToY(bgr0[3], bgr0[4], bgr0[5]); - y1[0] = BgrToY(bgr1[0], bgr1[1], bgr1[2]); - y1[1] = BgrToY(bgr1[3], bgr1[4], bgr1[5]); - - int blue = Average(bgr0[0], bgr0[3], bgr1[0], bgr1[3]); - int green = Average(bgr0[1], bgr0[4], bgr1[1], bgr1[4]); - int red = Average(bgr0[2], bgr0[5], bgr1[2], bgr1[5]); - - u[0] = BgrToU(blue, green, red); - v[0] = BgrToV(blue, green, red); - } - - void BgrToYuv420p(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * y, size_t yStride, - uint8_t * u, size_t uStride, uint8_t * v, size_t vStride) - { - assert((width % 2 == 0) && (height % 2 == 0) && (width >= 2) && (height >= 2)); - - for (size_t row = 0; row < height; row += 2) - { - for (size_t colUV = 0, colY = 0, colBgr = 0; colY < width; colY += 2, colUV++, colBgr += 6) - { - BgrToYuv420p(bgr + colBgr, bgrStride, y + colY, yStride, u + colUV, v + colUV); - } - y += 2 * yStride; - u += uStride; - v += vStride; - bgr += 2 * bgrStride; - } - } - - SIMD_INLINE void BgrToYuv422p(const uint8_t * bgr, uint8_t * y, uint8_t * u, uint8_t * v) - { - y[0] = BgrToY(bgr[0], bgr[1], bgr[2]); - y[1] = BgrToY(bgr[3], bgr[4], bgr[5]); - - int blue = Average(bgr[0], bgr[3]); - int green = Average(bgr[1], bgr[4]); - int red = Average(bgr[2], bgr[5]); - - u[0] = BgrToU(blue, green, red); - v[0] = BgrToV(blue, green, red); - } - - void BgrToYuv422p(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * y, size_t yStride, - uint8_t * u, size_t uStride, uint8_t * v, size_t vStride) - { - assert((width % 2 == 0) && (width >= 2)); - - for (size_t row = 0; row < height; ++row) - { - for (size_t colUV = 0, colY = 0, colBgr = 0; colY < width; colY += 2, colUV++, colBgr += 6) - BgrToYuv422p(bgr + colBgr, y + colY, u + colUV, v + colUV); - y += yStride; - u += uStride; - v += vStride; - bgr += bgrStride; - } - } - - SIMD_INLINE void BgrToYuv444p(const uint8_t * bgr, uint8_t * y, uint8_t * u, uint8_t * v) - { - const int blue = bgr[0], green = bgr[1], red = bgr[2]; - y[0] = BgrToY(blue, green, red); - u[0] = BgrToU(blue, green, red); - v[0] = BgrToV(blue, green, red); - } - - void BgrToYuv444p(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * y, size_t yStride, - uint8_t * u, size_t uStride, uint8_t * v, size_t vStride) - { - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0, colBgr = 0; col < width; ++col, colBgr += 3) - BgrToYuv444p(bgr + colBgr, y + col, u + col, v + col); - y += yStride; - u += uStride; - v += vStride; - bgr += bgrStride; - } - } - } -} diff --git a/src/3rd/Simd/Simd/SimdBaseBgraToBayer.cpp b/src/3rd/Simd/Simd/SimdBaseBgraToBayer.cpp deleted file mode 100644 index b908caf2..00000000 --- a/src/3rd/Simd/Simd/SimdBaseBgraToBayer.cpp +++ /dev/null @@ -1,103 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. 
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#include "Simd/SimdDefs.h"
-
-namespace Simd
-{
-    namespace Base
-    {
-        template <SimdPixelFormatType bayerFormat> void BgraToBayer(const uint8_t * bgra0, const uint8_t * bgra1, uint8_t * bayer0, uint8_t * bayer1);
-
-        template <> SIMD_INLINE void BgraToBayer<SimdPixelFormatBayerGrbg>(const uint8_t * bgra0, const uint8_t * bgra1, uint8_t * bayer0, uint8_t * bayer1)
-        {
-            bayer0[0] = bgra0[1];
-            bayer0[1] = bgra0[6];
-            bayer1[0] = bgra1[0];
-            bayer1[1] = bgra1[5];
-        }
-
-        template <> SIMD_INLINE void BgraToBayer<SimdPixelFormatBayerGbrg>(const uint8_t * bgra0, const uint8_t * bgra1, uint8_t * bayer0, uint8_t * bayer1)
-        {
-            bayer0[0] = bgra0[1];
-            bayer0[1] = bgra0[4];
-            bayer1[0] = bgra1[2];
-            bayer1[1] = bgra1[5];
-        }
-
-        template <> SIMD_INLINE void BgraToBayer<SimdPixelFormatBayerRggb>(const uint8_t * bgra0, const uint8_t * bgra1, uint8_t * bayer0, uint8_t * bayer1)
-        {
-            bayer0[0] = bgra0[2];
-            bayer0[1] = bgra0[5];
-            bayer1[0] = bgra1[1];
-            bayer1[1] = bgra1[4];
-        }
-
-        template <> SIMD_INLINE void BgraToBayer<SimdPixelFormatBayerBggr>(const uint8_t * bgra0, const uint8_t * bgra1, uint8_t * bayer0, uint8_t * bayer1)
-        {
-            bayer0[0] = bgra0[0];
-            bayer0[1] = bgra0[5];
-            bayer1[0] = bgra1[1];
-            bayer1[1] = bgra1[6];
-        }
-
-        template <SimdPixelFormatType bayerFormat> SIMD_INLINE void BgraToBayer(const uint8_t * bgra, size_t bgraStride, uint8_t * bayer, size_t bayerStride)
-        {
-            BgraToBayer<bayerFormat>(bgra, bgra + bgraStride, bayer, bayer + bayerStride);
-        }
-
-        template <SimdPixelFormatType bayerFormat> void BgraToBayer(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * bayer, size_t bayerStride)
-        {
-            for (size_t row = 0; row < height; row += 2)
-            {
-                for (size_t col = 0, offset = 0; col < width; col += 2, offset += 8)
-                    BgraToBayer<bayerFormat>(bgra + offset, bgraStride, bayer + col, bayerStride);
-                bgra += 2 * bgraStride;
-                bayer += 2 * bayerStride;
-            }
-        }
-
-        void BgraToBayer(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * bayer, size_t bayerStride, SimdPixelFormatType bayerFormat)
-        {
-            assert((width % 2 == 0) && (height % 2 == 0));
-
-            switch (bayerFormat)
-            {
-            case SimdPixelFormatBayerGrbg:
-                BgraToBayer<SimdPixelFormatBayerGrbg>(bgra, width, height, bgraStride, bayer, bayerStride);
-                break;
-            case SimdPixelFormatBayerGbrg:
-                BgraToBayer<SimdPixelFormatBayerGbrg>(bgra, width, height, bgraStride, bayer, bayerStride);
-                break;
-            case SimdPixelFormatBayerRggb:
-                BgraToBayer<SimdPixelFormatBayerRggb>(bgra, width, height, bgraStride, bayer, bayerStride);
-                break;
-            case SimdPixelFormatBayerBggr:
-                BgraToBayer<SimdPixelFormatBayerBggr>(bgra, width, height, bgraStride, bayer, bayerStride);
-                break;
-            default:
-                assert(0);
-            }
-        }
-    }
-}
diff --git a/src/3rd/Simd/Simd/SimdBaseBgraToBgr.cpp
b/src/3rd/Simd/Simd/SimdBaseBgraToBgr.cpp deleted file mode 100644 index 619faac1..00000000 --- a/src/3rd/Simd/Simd/SimdBaseBgraToBgr.cpp +++ /dev/null @@ -1,72 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdDefs.h" - -namespace Simd -{ - namespace Base - { - void BgraToBgr(const uint8_t *bgra, size_t size, uint8_t *bgr, bool lastRow) - { - for (size_t i = (lastRow ? 1 : 0); i < size; ++i, bgr += 3, bgra += 4) - { - *(int32_t*)bgr = (*(int32_t*)bgra); - } - if (lastRow) - { - bgr[0] = bgra[0]; - bgr[1] = bgra[1]; - bgr[2] = bgra[2]; - } - } - - void BgraToBgr(const uint8_t *bgra, size_t width, size_t height, size_t bgraStride, uint8_t *bgr, size_t bgrStride) - { - for (size_t row = 1; row < height; ++row) - { - BgraToBgr(bgra, width, bgr, false); - bgr += bgrStride; - bgra += bgraStride; - } - BgraToBgr(bgra, width, bgr, true); - } - - void BgraToRgb(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgb, size_t rgbStride) - { - size_t bgraGap = bgraStride - width * 4; - size_t rgbGap = rgbStride - width * 3; - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < width; ++col, bgra += 4, rgb += 3) - { - rgb[2] = bgra[0]; - rgb[1] = bgra[1]; - rgb[0] = bgra[2]; - } - bgra += bgraGap; - rgb += rgbGap; - } - } - } -} diff --git a/src/3rd/Simd/Simd/SimdBaseBgraToGray.cpp b/src/3rd/Simd/Simd/SimdBaseBgraToGray.cpp deleted file mode 100644 index bf35860a..00000000 --- a/src/3rd/Simd/Simd/SimdBaseBgraToGray.cpp +++ /dev/null @@ -1,43 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdConversion.h" - -namespace Simd -{ - namespace Base - { - void BgraToGray(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * gray, size_t grayStride) - { - for (size_t row = 0; row < height; ++row) - { - const uint8_t * pBgra = bgra + row*bgraStride; - uint8_t * pGray = gray + row*grayStride; - for (const uint8_t *pGrayEnd = pGray + width; pGray < pGrayEnd; pGray += 1, pBgra += 4) - { - *pGray = BgrToGray(pBgra[0], pBgra[1], pBgra[2]); - } - } - } - } -} diff --git a/src/3rd/Simd/Simd/SimdBaseBgraToYuv.cpp b/src/3rd/Simd/Simd/SimdBaseBgraToYuv.cpp deleted file mode 100644 index cf0f7d12..00000000 --- a/src/3rd/Simd/Simd/SimdBaseBgraToYuv.cpp +++ /dev/null @@ -1,157 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2019 Yermalayeu Ihar, -* 2014-2015 Antonenka Mikhail. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdConversion.h" - -namespace Simd -{ - namespace Base - { - SIMD_INLINE void BgraToYuv420p(const uint8_t * bgra0, size_t bgraStride, uint8_t * y0, size_t yStride, uint8_t * u, uint8_t * v) - { - const uint8_t * bgra1 = bgra0 + bgraStride; - uint8_t * y1 = y0 + yStride; - - y0[0] = BgrToY(bgra0[0], bgra0[1], bgra0[2]); - y0[1] = BgrToY(bgra0[4], bgra0[5], bgra0[6]); - y1[0] = BgrToY(bgra1[0], bgra1[1], bgra1[2]); - y1[1] = BgrToY(bgra1[4], bgra1[5], bgra1[6]); - - int blue = Average(bgra0[0], bgra0[4], bgra1[0], bgra1[4]); - int green = Average(bgra0[1], bgra0[5], bgra1[1], bgra1[5]); - int red = Average(bgra0[2], bgra0[6], bgra1[2], bgra1[6]); - - u[0] = BgrToU(blue, green, red); - v[0] = BgrToV(blue, green, red); - } - - void BgraToYuv420p(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * y, size_t yStride, - uint8_t * u, size_t uStride, uint8_t * v, size_t vStride) - { - assert((width % 2 == 0) && (height % 2 == 0) && (width >= 2) && (height >= 2)); - - for (size_t row = 0; row < height; row += 2) - { - for (size_t colUV = 0, colY = 0, colBgra = 0; colY < width; colY += 2, colUV++, colBgra += 8) - BgraToYuv420p(bgra + colBgra, bgraStride, y + colY, yStride, u + colUV, v + colUV); - y += 2 * yStride; - u += uStride; - v += vStride; - bgra += 2 * bgraStride; - } - } - - SIMD_INLINE void BgraToYuv422p(const uint8_t * bgra, uint8_t * y, uint8_t * u, uint8_t * v) - { - y[0] = BgrToY(bgra[0], bgra[1], bgra[2]); - y[1] = BgrToY(bgra[4], bgra[5], bgra[6]); - - int blue = Average(bgra[0], bgra[4]); - int green = Average(bgra[1], bgra[5]); - int red = Average(bgra[2], bgra[6]); - - u[0] = BgrToU(blue, green, red); - v[0] = BgrToV(blue, green, red); - } - - void BgraToYuv422p(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * y, size_t yStride, - uint8_t * u, size_t uStride, uint8_t * v, size_t vStride) - { - assert((width % 2 == 0) && (width >= 2)); - - for (size_t row = 0; row < height; ++row) - { - for (size_t colUV = 0, colY = 0, colBgra = 0; colY < width; colY += 2, colUV++, colBgra += 8) - BgraToYuv422p(bgra + colBgra, y + colY, u + colUV, v + colUV); - y += yStride; - u += uStride; - v += vStride; - bgra += bgraStride; - } - } - - SIMD_INLINE void BgraToYuv444p(const uint8_t * bgra, uint8_t * y, uint8_t * u, uint8_t * v) - { - const int blue = bgra[0], green = bgra[1], red = bgra[2]; - y[0] = BgrToY(blue, green, red); - u[0] = BgrToU(blue, green, red); - v[0] = BgrToV(blue, green, red); - } - - void BgraToYuv444p(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * y, size_t yStride, - uint8_t * u, size_t uStride, uint8_t * v, size_t vStride) - { - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0, colBgra = 0; col < width; ++col, colBgra += 4) - BgraToYuv444p(bgra + colBgra, y + col, u + col, v + col); - y += yStride; - u += uStride; - v += vStride; - bgra += bgraStride; - } - } - - SIMD_INLINE void BgraToYuva420p(const uint8_t * bgra0, size_t bgraStride, uint8_t * y0, size_t yStride, uint8_t * u, uint8_t * v, uint8_t * a0, size_t aStride) - { - const uint8_t * bgra1 = bgra0 + bgraStride; - uint8_t * y1 = y0 + yStride; - uint8_t * a1 = a0 + aStride; - - y0[0] = BgrToY(bgra0[0], bgra0[1], bgra0[2]); - y0[1] = BgrToY(bgra0[4], bgra0[5], bgra0[6]); - y1[0] = BgrToY(bgra1[0], bgra1[1], bgra1[2]); - y1[1] = BgrToY(bgra1[4], bgra1[5], bgra1[6]); - - int blue = Average(bgra0[0], bgra0[4], bgra1[0], bgra1[4]); - int green = Average(bgra0[1], bgra0[5], bgra1[1], bgra1[5]); - 
int red = Average(bgra0[2], bgra0[6], bgra1[2], bgra1[6]);
-
-            u[0] = BgrToU(blue, green, red);
-            v[0] = BgrToV(blue, green, red);
-
-            a0[0] = bgra0[3];
-            a0[1] = bgra0[7];
-            a1[0] = bgra1[3];
-            a1[1] = bgra1[7];
-        }
-
-        void BgraToYuva420p(const uint8_t * bgra, size_t bgraStride, size_t width, size_t height, uint8_t * y, size_t yStride,
-            uint8_t * u, size_t uStride, uint8_t * v, size_t vStride, uint8_t * a, size_t aStride)
-        {
-            assert((width % 2 == 0) && (height % 2 == 0) && (width >= 2) && (height >= 2));
-
-            for (size_t row = 0; row < height; row += 2)
-            {
-                for (size_t colUV = 0, colYA = 0, colBgra = 0; colYA < width; colYA += 2, colUV++, colBgra += 8)
-                    BgraToYuva420p(bgra + colBgra, bgraStride, y + colYA, yStride, u + colUV, v + colUV, a + colYA, aStride);
-                y += 2 * yStride;
-                u += uStride;
-                v += vStride;
-                a += 2 * aStride;
-                bgra += 2 * bgraStride;
-            }
-        }
-    }
-}
diff --git a/src/3rd/Simd/Simd/SimdBaseBinarization.cpp b/src/3rd/Simd/Simd/SimdBaseBinarization.cpp
deleted file mode 100644
index dccf064f..00000000
--- a/src/3rd/Simd/Simd/SimdBaseBinarization.cpp
+++ /dev/null
@@ -1,177 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#include "Simd/SimdMemory.h"
-#include "Simd/SimdCompare.h"
-
-namespace Simd
-{
-    namespace Base
-    {
-        template <SimdCompareType compareType>
-        void Binarization(const uint8_t * src, size_t srcStride, size_t width, size_t height,
-            uint8_t value, uint8_t positive, uint8_t negative, uint8_t * dst, size_t dstStride)
-        {
-            for (size_t row = 0; row < height; ++row)
-            {
-                for (size_t col = 0; col < width; ++col)
-                    dst[col] = Compare8u<compareType>(src[col], value) ? positive : negative;
-                src += srcStride;
-                dst += dstStride;
-            }
-        }
-
-        void Binarization(const uint8_t * src, size_t srcStride, size_t width, size_t height,
-            uint8_t value, uint8_t positive, uint8_t negative, uint8_t * dst, size_t dstStride, SimdCompareType compareType)
-        {
-            switch (compareType)
-            {
-            case SimdCompareEqual:
-                return Binarization<SimdCompareEqual>(src, srcStride, width, height, value, positive, negative, dst, dstStride);
-            case SimdCompareNotEqual:
-                return Binarization<SimdCompareNotEqual>(src, srcStride, width, height, value, positive, negative, dst, dstStride);
-            case SimdCompareGreater:
-                return Binarization<SimdCompareGreater>(src, srcStride, width, height, value, positive, negative, dst, dstStride);
-            case SimdCompareGreaterOrEqual:
-                return Binarization<SimdCompareGreaterOrEqual>(src, srcStride, width, height, value, positive, negative, dst, dstStride);
-            case SimdCompareLesser:
-                return Binarization<SimdCompareLesser>(src, srcStride, width, height, value, positive, negative, dst, dstStride);
-            case SimdCompareLesserOrEqual:
-                return Binarization<SimdCompareLesserOrEqual>(src, srcStride, width, height, value, positive, negative, dst, dstStride);
-            default:
-                assert(0);
-            }
-        }
-
-        namespace
-        {
-            struct Buffer
-            {
-                Buffer(size_t width, size_t edge)
-                {
-                    size_t size = sizeof(uint32_t)*(width + 2 * edge);
-                    _p = Allocate(size);
-                    memset(_p, 0, size);
-                    sa = (uint32_t*)_p + edge;
-                }
-
-                ~Buffer()
-                {
-                    Free(_p);
-                }
-
-                uint32_t * sa;
-            private:
-                void *_p;
-            };
-        }
-
-        template <SimdCompareType compareType> SIMD_INLINE uint32_t GetSa(uint8_t src, uint8_t value)
-        {
-#ifdef SIMD_BIG_ENDIAN
-            return Compare8u<compareType>(src, value) ? 0x00010001 : 0x00000001;
-#else
-            return Compare8u<compareType>(src, value) ? 0x00010001 : 0x00010000;
-#endif
-        }
-
-        template <SimdCompareType compareType>
-        void AveragingBinarization(const uint8_t * src, size_t srcStride, size_t width, size_t height,
-            uint8_t value, size_t neighborhood, uint8_t threshold, uint8_t positive, uint8_t negative, uint8_t * dst, size_t dstStride)
-        {
-            assert(width > neighborhood && height > neighborhood && neighborhood < 0x80);
-
-            Buffer buffer(width, neighborhood + 1);
-
-            union SaSum
-            {
-                uint32_t sum;
-                uint16_t sa[2];
-            };
-
-            for (size_t row = 0; row < neighborhood; ++row)
-            {
-                const uint8_t * s = src + row*srcStride;
-                for (size_t col = 0; col < width; ++col)
-                {
-                    buffer.sa[col] += GetSa<compareType>(s[col], value);
-                }
-            }
-
-            for (size_t row = 0; row < height; ++row)
-            {
-                if (row < height - neighborhood)
-                {
-                    const uint8_t * s = src + (row + neighborhood)*srcStride;
-                    for (size_t col = 0; col < width; ++col)
-                    {
-                        buffer.sa[col] += GetSa<compareType>(s[col], value);
-                    }
-                }
-
-                if (row > neighborhood)
-                {
-                    const uint8_t * s = src + (row - neighborhood - 1)*srcStride;
-                    for (size_t col = 0; col < width; ++col)
-                    {
-                        buffer.sa[col] -= GetSa<compareType>(s[col], value);
-                    }
-                }
-
-                SaSum saSum = { 0 };
-                for (size_t col = 0; col < neighborhood; ++col)
-                    saSum.sum += buffer.sa[col];
-                for (size_t col = 0; col < width; ++col)
-                {
-                    saSum.sum += buffer.sa[col + neighborhood];
-                    saSum.sum -= buffer.sa[col - neighborhood - 1];
-                    dst[col] = (saSum.sa[0] * 0xFF > threshold*saSum.sa[1]) ? positive : negative;
-                }
-                dst += dstStride;
-            }
-        }
-
-        void AveragingBinarization(const uint8_t * src, size_t srcStride, size_t width, size_t height,
-            uint8_t value, size_t neighborhood, uint8_t threshold, uint8_t positive, uint8_t negative,
-            uint8_t * dst, size_t dstStride, SimdCompareType compareType)
-        {
-            switch (compareType)
-            {
-            case SimdCompareEqual:
-                return AveragingBinarization<SimdCompareEqual>(src, srcStride, width, height, value, neighborhood, threshold, positive, negative, dst, dstStride);
-            case SimdCompareNotEqual:
-                return AveragingBinarization<SimdCompareNotEqual>(src, srcStride, width, height, value, neighborhood, threshold, positive, negative, dst, dstStride);
-            case SimdCompareGreater:
-                return AveragingBinarization<SimdCompareGreater>(src, srcStride, width, height, value, neighborhood, threshold, positive, negative, dst, dstStride);
-            case SimdCompareGreaterOrEqual:
-                return AveragingBinarization<SimdCompareGreaterOrEqual>(src, srcStride, width, height, value, neighborhood, threshold, positive, negative, dst, dstStride);
-            case SimdCompareLesser:
-                return AveragingBinarization<SimdCompareLesser>(src, srcStride, width, height, value, neighborhood, threshold, positive, negative, dst, dstStride);
-            case SimdCompareLesserOrEqual:
-                return AveragingBinarization<SimdCompareLesserOrEqual>(src, srcStride, width, height, value, neighborhood, threshold, positive, negative, dst, dstStride);
-            default:
-                assert(0);
-            }
-        }
-    }
-}
diff --git a/src/3rd/Simd/Simd/SimdBaseConditional.cpp b/src/3rd/Simd/Simd/SimdBaseConditional.cpp
deleted file mode 100644
index 164a667b..00000000
--- a/src/3rd/Simd/Simd/SimdBaseConditional.cpp
+++ /dev/null
@@ -1,277 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2017 Yermalayeu Ihar,
-*               2014-2016 Antonenka Mikhail.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#include "Simd/SimdMath.h"
-#include "Simd/SimdCompare.h"
-#include "Simd/SimdMemory.h"
-
-namespace Simd
-{
-    namespace Base
-    {
-        template <SimdCompareType compareType>
-        void ConditionalCount8u(const uint8_t * src, size_t stride, size_t width, size_t height, uint8_t value, uint32_t * count)
-        {
-            *count = 0;
-            for (size_t row = 0; row < height; ++row)
-            {
-                for (size_t col = 0; col < width; ++col)
-                {
-                    if (Compare8u<compareType>(src[col], value))
-                        (*count)++;
-                }
-                src += stride;
-            }
-        }
-
-        void ConditionalCount8u(const uint8_t * src, size_t stride, size_t width, size_t height,
-            uint8_t value, SimdCompareType compareType, uint32_t * count)
-        {
-            switch (compareType)
-            {
-            case SimdCompareEqual:
-                return ConditionalCount8u<SimdCompareEqual>(src, stride, width, height, value, count);
-            case SimdCompareNotEqual:
-                return ConditionalCount8u<SimdCompareNotEqual>(src, stride, width, height, value, count);
-            case SimdCompareGreater:
-                return ConditionalCount8u<SimdCompareGreater>(src, stride, width, height, value, count);
-            case SimdCompareGreaterOrEqual:
-                return ConditionalCount8u<SimdCompareGreaterOrEqual>(src, stride, width, height, value, count);
-            case SimdCompareLesser:
-                return ConditionalCount8u<SimdCompareLesser>(src, stride, width, height, value, count);
-            case SimdCompareLesserOrEqual:
-                return ConditionalCount8u<SimdCompareLesserOrEqual>(src, stride, width, height, value, count);
-            default:
-                assert(0);
-            }
-        }
-
-        template <SimdCompareType compareType>
-        void ConditionalCount16i(const uint8_t * src, size_t stride, size_t width, size_t height, int16_t value, uint32_t * count)
-        {
-            *count = 0;
-            for (size_t row = 0; row < height; ++row)
-            {
-                const int16_t * s = (const int16_t *)src;
-                for (size_t col = 0; col < width; ++col)
-                {
-                    if (Compare16i<compareType>(s[col], value))
-                        (*count)++;
-                }
-                src += stride;
-            }
-        }
-
-        void ConditionalCount16i(const uint8_t * src, size_t stride, size_t width, size_t height,
-            int16_t value, SimdCompareType compareType, uint32_t * count)
-        {
-            switch (compareType)
-            {
-            case SimdCompareEqual:
-                return ConditionalCount16i<SimdCompareEqual>(src, stride, width, height, value, count);
-            case SimdCompareNotEqual:
-                return ConditionalCount16i<SimdCompareNotEqual>(src, stride, width, height, value, count);
-            case SimdCompareGreater:
-                return ConditionalCount16i<SimdCompareGreater>(src, stride, width, height, value, count);
-            case SimdCompareGreaterOrEqual:
-                return ConditionalCount16i<SimdCompareGreaterOrEqual>(src, stride, width, height, value, count);
-            case SimdCompareLesser:
-                return ConditionalCount16i<SimdCompareLesser>(src, stride, width, height, value, count);
-            case SimdCompareLesserOrEqual:
-                return ConditionalCount16i<SimdCompareLesserOrEqual>(src, stride, width, height, value, count);
-            default:
-                assert(0);
-            }
-        }
-
-        template <SimdCompareType compareType>
-        void ConditionalSum(const uint8_t * src, size_t srcStride, size_t width, size_t height,
-            const uint8_t * mask, size_t maskStride, uint8_t value, uint64_t * sum)
-        {
-            *sum = 0;
-            for (size_t row = 0; row < height; ++row)
-            {
-                uint32_t rowSum = 0;
-                for (size_t col = 0; col < width; ++col)
-                {
-                    if (Compare8u<compareType>(mask[col], value))
-                        rowSum += src[col];
-                }
-                *sum += rowSum;
-                src += srcStride;
-                mask += maskStride;
-            }
-        }
-
-        void ConditionalSum(const uint8_t * src, size_t srcStride, size_t width, size_t height,
-            const uint8_t * mask, size_t maskStride, uint8_t value, SimdCompareType compareType, uint64_t * sum)
-        {
-            switch (compareType)
-            {
-            case SimdCompareEqual:
-                return ConditionalSum<SimdCompareEqual>(src, srcStride, width, height, mask, maskStride, value, sum);
-            case SimdCompareNotEqual:
-                return ConditionalSum<SimdCompareNotEqual>(src, srcStride, width, height, mask, maskStride, value, sum);
-            case SimdCompareGreater:
-                return ConditionalSum<SimdCompareGreater>(src, srcStride, width, height, mask, maskStride, value, sum);
-            case SimdCompareGreaterOrEqual:
-                return ConditionalSum<SimdCompareGreaterOrEqual>(src, srcStride, width, height, mask, maskStride, value, sum);
-            case SimdCompareLesser:
-                return ConditionalSum<SimdCompareLesser>(src, srcStride, width, height, mask, maskStride, value, sum);
-            case SimdCompareLesserOrEqual:
-                return ConditionalSum<SimdCompareLesserOrEqual>(src, srcStride, width, height, mask, maskStride, value, sum);
-            default:
-                assert(0);
-            }
-        }
-
-        template <SimdCompareType compareType>
-        void ConditionalSquareSum(const uint8_t * src, size_t srcStride, size_t width, size_t height,
-            const uint8_t * mask, size_t maskStride, uint8_t value, uint64_t * sum)
-        {
-            *sum = 0;
-            for (size_t row = 0; row < height; ++row)
-            {
-                uint32_t rowSum = 0;
-                for (size_t col = 0; col < width; ++col)
-                {
-                    if (Compare8u<compareType>(mask[col], value))
-                        rowSum += Square(src[col]);
-                }
-                *sum += rowSum;
-                src += srcStride;
-                mask += maskStride;
-            }
-        }
-
-        void ConditionalSquareSum(const uint8_t * src, size_t srcStride, size_t width, size_t height,
-            const uint8_t * mask, size_t maskStride, uint8_t value, SimdCompareType compareType, uint64_t * sum)
-        {
-            switch (compareType)
-            {
-            case SimdCompareEqual:
-                return ConditionalSquareSum<SimdCompareEqual>(src, srcStride, width, height, mask, maskStride, value, sum);
-            case SimdCompareNotEqual:
-                return ConditionalSquareSum<SimdCompareNotEqual>(src, srcStride, width, height, mask, maskStride, value, sum);
-            case SimdCompareGreater:
-                return ConditionalSquareSum<SimdCompareGreater>(src, srcStride, width, height, mask, maskStride, value, sum);
-            case SimdCompareGreaterOrEqual:
-                return ConditionalSquareSum<SimdCompareGreaterOrEqual>(src, srcStride, width, height, mask, maskStride, value, sum);
-            case SimdCompareLesser:
-                return ConditionalSquareSum<SimdCompareLesser>(src, srcStride, width, height, mask, maskStride, value, sum);
-            case SimdCompareLesserOrEqual:
-                return ConditionalSquareSum<SimdCompareLesserOrEqual>(src, srcStride, width, height, mask, maskStride, value, sum);
-            default:
-                assert(0);
-            }
-        }
-
-        template <SimdCompareType compareType>
-        void ConditionalSquareGradientSum(const uint8_t * src, size_t srcStride, size_t width, size_t height,
-            const uint8_t * mask, size_t maskStride, uint8_t value, uint64_t * sum)
-        {
-            src += srcStride + 1;
-            mask += maskStride + 1;
-            width -= 2;
-            height -= 2;
-
-            *sum = 0;
-            for (size_t row = 0; row < height; ++row)
-            {
-                uint32_t rowSum = 0;
-                for (size_t col = 0; col < width; ++col)
-                {
-                    if (Compare8u<compareType>(mask[col], value))
-                    {
-                        rowSum += SquaredDifference(src[col + 1], src[col - 1]);
-                        rowSum += SquaredDifference(src[col + srcStride], src[col - srcStride]);
-                    }
-                }
-                *sum += rowSum;
-                src += srcStride;
-                mask += maskStride;
-            }
-        }
-
-        void ConditionalSquareGradientSum(const uint8_t * src, size_t srcStride, size_t width, size_t height,
-            const uint8_t * mask, size_t maskStride, uint8_t value, SimdCompareType compareType, uint64_t * sum)
-        {
-            switch (compareType)
-            {
-            case SimdCompareEqual:
-                return ConditionalSquareGradientSum<SimdCompareEqual>(src, srcStride, width, height, mask, maskStride, value, sum);
-            case SimdCompareNotEqual:
-                return ConditionalSquareGradientSum<SimdCompareNotEqual>(src, srcStride, width, height, mask, maskStride, value, sum);
-            case SimdCompareGreater:
-                return ConditionalSquareGradientSum<SimdCompareGreater>(src, srcStride, width, height, mask, maskStride, value, sum);
-            case SimdCompareGreaterOrEqual:
-                return ConditionalSquareGradientSum<SimdCompareGreaterOrEqual>(src, srcStride, width, height, mask, maskStride, value, sum);
-            case SimdCompareLesser:
-                return ConditionalSquareGradientSum<SimdCompareLesser>(src, srcStride, width, height, mask, maskStride, value, sum);
-            case SimdCompareLesserOrEqual:
-                return ConditionalSquareGradientSum<SimdCompareLesserOrEqual>(src, srcStride, width, height, mask, maskStride, value, sum);
-            default:
-                assert(0);
-            }
-        }
-
-        template <SimdCompareType compareType>
-        void ConditionalFill(const uint8_t * src, size_t srcStride, size_t width, size_t height,
-            uint8_t threshold, uint8_t value, uint8_t * dst, size_t dstStride)
-        {
-            for (size_t row = 0; row < height; ++row)
-            {
-                for (size_t col = 0; col < width; ++col)
-                {
-                    if (Compare8u<compareType>(src[col], threshold))
-                        dst[col] = value;
-                }
-                src += srcStride;
-                dst += dstStride;
-            }
-        }
-
-        void ConditionalFill(const uint8_t * src, size_t srcStride, size_t width, size_t height,
-            uint8_t threshold, SimdCompareType compareType, uint8_t value, uint8_t * dst, size_t dstStride)
-        {
-            switch (compareType)
-            {
-            case SimdCompareEqual:
-                return ConditionalFill<SimdCompareEqual>(src, srcStride, width, height, threshold, value, dst, dstStride);
-            case SimdCompareNotEqual:
-                return ConditionalFill<SimdCompareNotEqual>(src, srcStride, width, height, threshold, value, dst, dstStride);
-            case SimdCompareGreater:
-                return ConditionalFill<SimdCompareGreater>(src, srcStride, width, height, threshold, value, dst, dstStride);
-            case SimdCompareGreaterOrEqual:
-                return ConditionalFill<SimdCompareGreaterOrEqual>(src, srcStride, width, height, threshold, value, dst, dstStride);
-            case SimdCompareLesser:
-                return ConditionalFill<SimdCompareLesser>(src, srcStride, width, height, threshold, value, dst, dstStride);
-            case SimdCompareLesserOrEqual:
-                return ConditionalFill<SimdCompareLesserOrEqual>(src, srcStride, width, height, threshold, value, dst, dstStride);
-            default:
-                assert(0);
-            }
-        }
-    }
-}
diff --git a/src/3rd/Simd/Simd/SimdBaseCopy.cpp b/src/3rd/Simd/Simd/SimdBaseCopy.cpp
deleted file mode 100644
index 0af3059d..00000000
--- a/src/3rd/Simd/Simd/SimdBaseCopy.cpp
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
diff --git a/src/3rd/Simd/Simd/SimdBaseCopy.cpp b/src/3rd/Simd/Simd/SimdBaseCopy.cpp
deleted file mode 100644
index 0af3059d..00000000
--- a/src/3rd/Simd/Simd/SimdBaseCopy.cpp
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#include "Simd/SimdDefs.h"
-
-namespace Simd
-{
-    namespace Base
-    {
-        void Copy(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t pixelSize, uint8_t * dst, size_t dstStride)
-        {
-            size_t rowSize = width*pixelSize;
-            for (size_t row = 0; row < height; ++row)
-            {
-                memcpy(dst, src, rowSize);
-                src += srcStride;
-                dst += dstStride;
-            }
-        }
-
-        void CopyFrame(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t pixelSize,
-            size_t frameLeft, size_t frameTop, size_t frameRight, size_t frameBottom, uint8_t * dst, size_t dstStride)
-        {
-            if (frameTop > frameBottom || frameBottom > height || frameLeft > frameRight || frameRight > width)
-                return;
-
-            if (frameTop > 0)
-            {
-                size_t srcOffset = 0;
-                size_t dstOffset = 0;
-                size_t size = width*pixelSize;
-                for (size_t row = 0; row < frameTop; ++row)
-                {
-                    memcpy(dst + dstOffset, src + srcOffset, size);
-                    srcOffset += srcStride;
-                    dstOffset += dstStride;
-                }
-            }
-            if (frameBottom < height)
-            {
-                size_t srcOffset = frameBottom*srcStride;
-                size_t dstOffset = frameBottom*dstStride;
-                size_t size = width*pixelSize;
-                for (size_t row = frameBottom; row < height; ++row)
-                {
-                    memcpy(dst + dstOffset, src + srcOffset, size);
-                    srcOffset += srcStride;
-                    dstOffset += dstStride;
-                }
-            }
-            if (frameLeft > 0)
-            {
-                size_t srcOffset = frameTop*srcStride;
-                size_t dstOffset = frameTop*dstStride;
-                size_t size = frameLeft*pixelSize;
-                for (size_t row = frameTop; row < frameBottom; ++row)
-                {
-                    memcpy(dst + dstOffset, src + srcOffset, size);
-                    srcOffset += srcStride;
-                    dstOffset += dstStride;
-                }
-            }
-            if (frameRight < width)
-            {
-                size_t srcOffset = frameTop*srcStride + frameRight*pixelSize;
-                size_t dstOffset = frameTop*dstStride + frameRight*pixelSize;
-                size_t size = (width - frameRight)*pixelSize;
-                for (size_t row = frameTop; row < frameBottom; ++row)
-                {
-                    memcpy(dst + dstOffset, src + srcOffset, size);
-                    srcOffset += srcStride;
-                    dstOffset += dstStride;
-                }
-            }
-        }
-    }
-}
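CopyFrame copies only the border: every pixel outside the half-open rectangle [frameLeft, frameRight) x [frameTop, frameBottom) is transferred with row-wise memcpy, while the interior of dst is left untouched (note the early return that rejects inverted or out-of-range rectangles). A hypothetical caller, assuming the deleted function above is still available; the image size and the 8-pixel frame are made up for illustration:

#include <cstdint>
#include <cstddef>
#include <vector>

// Declaration matching the deleted definition above.
namespace Simd { namespace Base {
    void CopyFrame(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t pixelSize,
        size_t frameLeft, size_t frameTop, size_t frameRight, size_t frameBottom, uint8_t * dst, size_t dstStride);
} }

int main()
{
    const size_t width = 640, height = 480, pixelSize = 1;
    std::vector<uint8_t> src(width * height, 255), dst(width * height, 0);
    // Copy only an 8-pixel frame from src to dst; dst's interior stays zero.
    Simd::Base::CopyFrame(src.data(), width, width, height, pixelSize,
        8, 8, width - 8, height - 8, dst.data(), width);
    return 0;
}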
diff --git a/src/3rd/Simd/Simd/SimdBaseCpu.cpp b/src/3rd/Simd/Simd/SimdBaseCpu.cpp
deleted file mode 100644
index 2999e504..00000000
--- a/src/3rd/Simd/Simd/SimdBaseCpu.cpp
+++ /dev/null
@@ -1,153 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#include "Simd/SimdCpu.h"
-#include "Simd/SimdEnable.h"
-
-#include <vector>
-#include <thread>
-
-#ifdef __GNUC__
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <limits.h>
-#endif
-
-namespace Simd
-{
-    namespace Base
-    {
-        size_t CpuThreadNumber()
-        {
-            return std::thread::hardware_concurrency();
-        }
-
-#if defined(_MSC_VER)
-        typedef SYSTEM_LOGICAL_PROCESSOR_INFORMATION Info;
-
-        void GetLogicalProcessorInformation(std::vector<Info> & info)
-        {
-            DWORD size = 0;
-            ::GetLogicalProcessorInformation(0, &size);
-            info.resize(size / sizeof(Info));
-            ::GetLogicalProcessorInformation(info.data(), &size);
-        }
-
-        size_t CpuSocketNumber()
-        {
-            std::vector<Info> info;
-            GetLogicalProcessorInformation(info);
-            size_t number = 0;
-            for (size_t i = 0; i < info.size(); ++i)
-                if (info[i].Relationship == ::RelationNumaNode)
-                    number++;
-            return number;
-        }
-
-        size_t CpuCoreNumber()
-        {
-            std::vector<Info> info;
-            GetLogicalProcessorInformation(info);
-            size_t number = 0;
-            for (size_t i = 0; i < info.size(); ++i)
-                if (info[i].Relationship == ::RelationProcessorCore)
-                    number++;
-            return number;
-        }
-
-        size_t CpuCacheSize(size_t level)
-        {
-            std::vector<Info> info;
-            GetLogicalProcessorInformation(info);
-            for (size_t i = 0; i < info.size(); ++i)
-                if (info[i].Relationship == ::RelationCache && info[i].Cache.Level == level && (info[i].Cache.Type == ::CacheData || info[i].Cache.Type == ::CacheUnified))
-                    return info[i].Cache.Size;
-            return 0;
-        }
-#elif defined(__GNUC__)
-        size_t CpuSocketNumber()
-        {
-            uint32_t number = 0;
-            ::FILE * p = ::popen("lscpu -b -p=Socket | grep -v '^#' | sort -u | wc -l", "r");
-            if (p)
-            {
-                char buffer[PATH_MAX];
-                while (::fgets(buffer, PATH_MAX, p));
-                number = ::atoi(buffer);
-                ::pclose(p);
-            }
-            return number;
-        }
-
-        size_t CpuCoreNumber()
-        {
-            uint32_t number = 0;
-            ::FILE * p = ::popen("lscpu -b -p=Core | grep -v '^#' | sort -u | wc -l", "r");
-            if (p)
-            {
-                char buffer[PATH_MAX];
-                while (::fgets(buffer, PATH_MAX, p));
-                number = ::atoi(buffer);
-                ::pclose(p);
-            }
-            return number;
-        }
-
-        SIMD_INLINE size_t CorrectIfZero(size_t value, size_t otherwise)
-        {
-            return value ? value : otherwise;
-        }
-
-#if defined(_SC_LEVEL1_DCACHE_SIZE) && defined(_SC_LEVEL2_CACHE_SIZE) && defined(_SC_LEVEL3_CACHE_SIZE)
-        size_t CpuCacheSize(size_t level)
-        {
-            switch (level)
-            {
-            case 1: return CorrectIfZero(::sysconf(_SC_LEVEL1_DCACHE_SIZE), 32 * 1024);
-            case 2: return CorrectIfZero(::sysconf(_SC_LEVEL2_CACHE_SIZE), 256 * 1024);
-            case 3: return CorrectIfZero(::sysconf(_SC_LEVEL3_CACHE_SIZE), 2048 * 1024);
-            default:
-                return 0;
-            }
-        }
-#else
-        size_t CpuCacheSize(size_t level)
-        {
-            switch (level)
-            {
-            case 1: return 32 * 1024;
-            case 2: return 256 * 1024;
-            case 3: return 2048 * 1024;
-            default:
-                return 0;
-            }
-        }
-#endif
-
-#else
-#error This platform is unsupported!
-#endif
-    }
-}
diff --git a/src/3rd/Simd/Simd/SimdBaseCrc32.cpp b/src/3rd/Simd/Simd/SimdBaseCrc32.cpp
deleted file mode 100644
index 5d7ee475..00000000
--- a/src/3rd/Simd/Simd/SimdBaseCrc32.cpp
+++ /dev/null
@@ -1,640 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdDefs.h" - -namespace Simd -{ - namespace Base - { - // Precalculated CRC32c lookup table for polynomial 0x1EDC6F41 (castagnoli-crc). - static const uint32_t Crc32cTable[8][256] = - { -#ifdef SIMD_BIG_ENDIAN - { - 0x00000000, 0x03836bf2, 0xf7703be1, 0xf4f35013, 0x1f979ac7, 0x1c14f135, 0xe8e7a126, 0xeb64cad4, - 0xcf58d98a, 0xccdbb278, 0x3828e26b, 0x3bab8999, 0xd0cf434d, 0xd34c28bf, 0x27bf78ac, 0x243c135e, - 0x6fc75e10, 0x6c4435e2, 0x98b765f1, 0x9b340e03, 0x7050c4d7, 0x73d3af25, 0x8720ff36, 0x84a394c4, - 0xa09f879a, 0xa31cec68, 0x57efbc7b, 0x546cd789, 0xbf081d5d, 0xbc8b76af, 0x487826bc, 0x4bfb4d4e, - 0xde8ebd20, 0xdd0dd6d2, 0x29fe86c1, 0x2a7ded33, 0xc11927e7, 0xc29a4c15, 0x36691c06, 0x35ea77f4, - 0x11d664aa, 0x12550f58, 0xe6a65f4b, 0xe52534b9, 0x0e41fe6d, 0x0dc2959f, 0xf931c58c, 0xfab2ae7e, - 0xb149e330, 0xb2ca88c2, 0x4639d8d1, 0x45bab323, 0xaede79f7, 0xad5d1205, 0x59ae4216, 0x5a2d29e4, - 0x7e113aba, 0x7d925148, 0x8961015b, 0x8ae26aa9, 0x6186a07d, 0x6205cb8f, 0x96f69b9c, 0x9575f06e, - 0xbc1d7b41, 0xbf9e10b3, 0x4b6d40a0, 0x48ee2b52, 0xa38ae186, 0xa0098a74, 0x54fada67, 0x5779b195, - 0x7345a2cb, 0x70c6c939, 0x8435992a, 0x87b6f2d8, 0x6cd2380c, 0x6f5153fe, 0x9ba203ed, 0x9821681f, - 0xd3da2551, 0xd0594ea3, 0x24aa1eb0, 0x27297542, 0xcc4dbf96, 0xcfced464, 0x3b3d8477, 0x38beef85, - 0x1c82fcdb, 0x1f019729, 0xebf2c73a, 0xe871acc8, 0x0315661c, 0x00960dee, 0xf4655dfd, 0xf7e6360f, - 0x6293c661, 0x6110ad93, 0x95e3fd80, 0x96609672, 0x7d045ca6, 0x7e873754, 0x8a746747, 0x89f70cb5, - 0xadcb1feb, 0xae487419, 0x5abb240a, 0x59384ff8, 0xb25c852c, 0xb1dfeede, 0x452cbecd, 0x46afd53f, - 0x0d549871, 0x0ed7f383, 0xfa24a390, 0xf9a7c862, 0x12c302b6, 0x11406944, 0xe5b33957, 0xe63052a5, - 0xc20c41fb, 0xc18f2a09, 0x357c7a1a, 0x36ff11e8, 0xdd9bdb3c, 0xde18b0ce, 0x2aebe0dd, 0x29688b2f, - 0x783bf682, 0x7bb89d70, 0x8f4bcd63, 0x8cc8a691, 0x67ac6c45, 0x642f07b7, 0x90dc57a4, 0x935f3c56, - 0xb7632f08, 0xb4e044fa, 0x401314e9, 0x43907f1b, 0xa8f4b5cf, 0xab77de3d, 0x5f848e2e, 0x5c07e5dc, - 0x17fca892, 0x147fc360, 0xe08c9373, 0xe30ff881, 0x086b3255, 0x0be859a7, 0xff1b09b4, 0xfc986246, - 0xd8a47118, 0xdb271aea, 0x2fd44af9, 0x2c57210b, 0xc733ebdf, 0xc4b0802d, 0x3043d03e, 0x33c0bbcc, - 0xa6b54ba2, 0xa5362050, 0x51c57043, 0x52461bb1, 0xb922d165, 0xbaa1ba97, 0x4e52ea84, 0x4dd18176, - 0x69ed9228, 0x6a6ef9da, 0x9e9da9c9, 0x9d1ec23b, 0x767a08ef, 0x75f9631d, 0x810a330e, 0x828958fc, - 0xc97215b2, 0xcaf17e40, 0x3e022e53, 0x3d8145a1, 0xd6e58f75, 0xd566e487, 0x2195b494, 
0x2216df66, - 0x062acc38, 0x05a9a7ca, 0xf15af7d9, 0xf2d99c2b, 0x19bd56ff, 0x1a3e3d0d, 0xeecd6d1e, 0xed4e06ec, - 0xc4268dc3, 0xc7a5e631, 0x3356b622, 0x30d5ddd0, 0xdbb11704, 0xd8327cf6, 0x2cc12ce5, 0x2f424717, - 0x0b7e5449, 0x08fd3fbb, 0xfc0e6fa8, 0xff8d045a, 0x14e9ce8e, 0x176aa57c, 0xe399f56f, 0xe01a9e9d, - 0xabe1d3d3, 0xa862b821, 0x5c91e832, 0x5f1283c0, 0xb4764914, 0xb7f522e6, 0x430672f5, 0x40851907, - 0x64b90a59, 0x673a61ab, 0x93c931b8, 0x904a5a4a, 0x7b2e909e, 0x78adfb6c, 0x8c5eab7f, 0x8fddc08d, - 0x1aa830e3, 0x192b5b11, 0xedd80b02, 0xee5b60f0, 0x053faa24, 0x06bcc1d6, 0xf24f91c5, 0xf1ccfa37, - 0xd5f0e969, 0xd673829b, 0x2280d288, 0x2103b97a, 0xca6773ae, 0xc9e4185c, 0x3d17484f, 0x3e9423bd, - 0x756f6ef3, 0x76ec0501, 0x821f5512, 0x819c3ee0, 0x6af8f434, 0x697b9fc6, 0x9d88cfd5, 0x9e0ba427, - 0xba37b779, 0xb9b4dc8b, 0x4d478c98, 0x4ec4e76a, 0xa5a02dbe, 0xa623464c, 0x52d0165f, 0x51537dad - }, - { - 0x00000000, 0x7798a213, 0xee304527, 0x99a8e734, 0xdc618a4e, 0xabf9285d, 0x3251cf69, 0x45c96d7a, - 0xb8c3149d, 0xcf5bb68e, 0x56f351ba, 0x216bf3a9, 0x64a29ed3, 0x133a3cc0, 0x8a92dbf4, 0xfd0a79e7, - 0x81f1c53f, 0xf669672c, 0x6fc18018, 0x1859220b, 0x5d904f71, 0x2a08ed62, 0xb3a00a56, 0xc438a845, - 0x3932d1a2, 0x4eaa73b1, 0xd7029485, 0xa09a3696, 0xe5535bec, 0x92cbf9ff, 0x0b631ecb, 0x7cfbbcd8, - 0x02e38b7f, 0x757b296c, 0xecd3ce58, 0x9b4b6c4b, 0xde820131, 0xa91aa322, 0x30b24416, 0x472ae605, - 0xba209fe2, 0xcdb83df1, 0x5410dac5, 0x238878d6, 0x664115ac, 0x11d9b7bf, 0x8871508b, 0xffe9f298, - 0x83124e40, 0xf48aec53, 0x6d220b67, 0x1abaa974, 0x5f73c40e, 0x28eb661d, 0xb1438129, 0xc6db233a, - 0x3bd15add, 0x4c49f8ce, 0xd5e11ffa, 0xa279bde9, 0xe7b0d093, 0x90287280, 0x098095b4, 0x7e1837a7, - 0x04c617ff, 0x735eb5ec, 0xeaf652d8, 0x9d6ef0cb, 0xd8a79db1, 0xaf3f3fa2, 0x3697d896, 0x410f7a85, - 0xbc050362, 0xcb9da171, 0x52354645, 0x25ade456, 0x6064892c, 0x17fc2b3f, 0x8e54cc0b, 0xf9cc6e18, - 0x8537d2c0, 0xf2af70d3, 0x6b0797e7, 0x1c9f35f4, 0x5956588e, 0x2ecefa9d, 0xb7661da9, 0xc0febfba, - 0x3df4c65d, 0x4a6c644e, 0xd3c4837a, 0xa45c2169, 0xe1954c13, 0x960dee00, 0x0fa50934, 0x783dab27, - 0x06259c80, 0x71bd3e93, 0xe815d9a7, 0x9f8d7bb4, 0xda4416ce, 0xaddcb4dd, 0x347453e9, 0x43ecf1fa, - 0xbee6881d, 0xc97e2a0e, 0x50d6cd3a, 0x274e6f29, 0x62870253, 0x151fa040, 0x8cb74774, 0xfb2fe567, - 0x87d459bf, 0xf04cfbac, 0x69e41c98, 0x1e7cbe8b, 0x5bb5d3f1, 0x2c2d71e2, 0xb58596d6, 0xc21d34c5, - 0x3f174d22, 0x488fef31, 0xd1270805, 0xa6bfaa16, 0xe376c76c, 0x94ee657f, 0x0d46824b, 0x7ade2058, - 0xf9fac3fb, 0x8e6261e8, 0x17ca86dc, 0x605224cf, 0x259b49b5, 0x5203eba6, 0xcbab0c92, 0xbc33ae81, - 0x4139d766, 0x36a17575, 0xaf099241, 0xd8913052, 0x9d585d28, 0xeac0ff3b, 0x7368180f, 0x04f0ba1c, - 0x780b06c4, 0x0f93a4d7, 0x963b43e3, 0xe1a3e1f0, 0xa46a8c8a, 0xd3f22e99, 0x4a5ac9ad, 0x3dc26bbe, - 0xc0c81259, 0xb750b04a, 0x2ef8577e, 0x5960f56d, 0x1ca99817, 0x6b313a04, 0xf299dd30, 0x85017f23, - 0xfb194884, 0x8c81ea97, 0x15290da3, 0x62b1afb0, 0x2778c2ca, 0x50e060d9, 0xc94887ed, 0xbed025fe, - 0x43da5c19, 0x3442fe0a, 0xadea193e, 0xda72bb2d, 0x9fbbd657, 0xe8237444, 0x718b9370, 0x06133163, - 0x7ae88dbb, 0x0d702fa8, 0x94d8c89c, 0xe3406a8f, 0xa68907f5, 0xd111a5e6, 0x48b942d2, 0x3f21e0c1, - 0xc22b9926, 0xb5b33b35, 0x2c1bdc01, 0x5b837e12, 0x1e4a1368, 0x69d2b17b, 0xf07a564f, 0x87e2f45c, - 0xfd3cd404, 0x8aa47617, 0x130c9123, 0x64943330, 0x215d5e4a, 0x56c5fc59, 0xcf6d1b6d, 0xb8f5b97e, - 0x45ffc099, 0x3267628a, 0xabcf85be, 0xdc5727ad, 0x999e4ad7, 0xee06e8c4, 0x77ae0ff0, 0x0036ade3, - 0x7ccd113b, 0x0b55b328, 0x92fd541c, 0xe565f60f, 0xa0ac9b75, 0xd7343966, 0x4e9cde52, 0x39047c41, - 
0xc40e05a6, 0xb396a7b5, 0x2a3e4081, 0x5da6e292, 0x186f8fe8, 0x6ff72dfb, 0xf65fcacf, 0x81c768dc, - 0xffdf5f7b, 0x8847fd68, 0x11ef1a5c, 0x6677b84f, 0x23bed535, 0x54267726, 0xcd8e9012, 0xba163201, - 0x471c4be6, 0x3084e9f5, 0xa92c0ec1, 0xdeb4acd2, 0x9b7dc1a8, 0xece563bb, 0x754d848f, 0x02d5269c, - 0x7e2e9a44, 0x09b63857, 0x901edf63, 0xe7867d70, 0xa24f100a, 0xd5d7b219, 0x4c7f552d, 0x3be7f73e, - 0xc6ed8ed9, 0xb1752cca, 0x28ddcbfe, 0x5f4569ed, 0x1a8c0497, 0x6d14a684, 0xf4bc41b0, 0x8324e3a3 - }, - { - 0x00000000, 0x7e9241a5, 0x0d526f4f, 0x73c02eea, 0x1aa4de9e, 0x64369f3b, 0x17f6b1d1, 0x6964f074, - 0xc53e5138, 0xbbac109d, 0xc86c3e77, 0xb6fe7fd2, 0xdf9a8fa6, 0xa108ce03, 0xd2c8e0e9, 0xac5aa14c, - 0x8a7da270, 0xf4efe3d5, 0x872fcd3f, 0xf9bd8c9a, 0x90d97cee, 0xee4b3d4b, 0x9d8b13a1, 0xe3195204, - 0x4f43f348, 0x31d1b2ed, 0x42119c07, 0x3c83dda2, 0x55e72dd6, 0x2b756c73, 0x58b54299, 0x2627033c, - 0x14fb44e1, 0x6a690544, 0x19a92bae, 0x673b6a0b, 0x0e5f9a7f, 0x70cddbda, 0x030df530, 0x7d9fb495, - 0xd1c515d9, 0xaf57547c, 0xdc977a96, 0xa2053b33, 0xcb61cb47, 0xb5f38ae2, 0xc633a408, 0xb8a1e5ad, - 0x9e86e691, 0xe014a734, 0x93d489de, 0xed46c87b, 0x8422380f, 0xfab079aa, 0x89705740, 0xf7e216e5, - 0x5bb8b7a9, 0x252af60c, 0x56ead8e6, 0x28789943, 0x411c6937, 0x3f8e2892, 0x4c4e0678, 0x32dc47dd, - 0xd98065c7, 0xa7122462, 0xd4d20a88, 0xaa404b2d, 0xc324bb59, 0xbdb6fafc, 0xce76d416, 0xb0e495b3, - 0x1cbe34ff, 0x622c755a, 0x11ec5bb0, 0x6f7e1a15, 0x061aea61, 0x7888abc4, 0x0b48852e, 0x75dac48b, - 0x53fdc7b7, 0x2d6f8612, 0x5eafa8f8, 0x203de95d, 0x49591929, 0x37cb588c, 0x440b7666, 0x3a9937c3, - 0x96c3968f, 0xe851d72a, 0x9b91f9c0, 0xe503b865, 0x8c674811, 0xf2f509b4, 0x8135275e, 0xffa766fb, - 0xcd7b2126, 0xb3e96083, 0xc0294e69, 0xbebb0fcc, 0xd7dfffb8, 0xa94dbe1d, 0xda8d90f7, 0xa41fd152, - 0x0845701e, 0x76d731bb, 0x05171f51, 0x7b855ef4, 0x12e1ae80, 0x6c73ef25, 0x1fb3c1cf, 0x6121806a, - 0x47068356, 0x3994c2f3, 0x4a54ec19, 0x34c6adbc, 0x5da25dc8, 0x23301c6d, 0x50f03287, 0x2e627322, - 0x8238d26e, 0xfcaa93cb, 0x8f6abd21, 0xf1f8fc84, 0x989c0cf0, 0xe60e4d55, 0x95ce63bf, 0xeb5c221a, - 0x4377278b, 0x3de5662e, 0x4e2548c4, 0x30b70961, 0x59d3f915, 0x2741b8b0, 0x5481965a, 0x2a13d7ff, - 0x864976b3, 0xf8db3716, 0x8b1b19fc, 0xf5895859, 0x9ceda82d, 0xe27fe988, 0x91bfc762, 0xef2d86c7, - 0xc90a85fb, 0xb798c45e, 0xc458eab4, 0xbacaab11, 0xd3ae5b65, 0xad3c1ac0, 0xdefc342a, 0xa06e758f, - 0x0c34d4c3, 0x72a69566, 0x0166bb8c, 0x7ff4fa29, 0x16900a5d, 0x68024bf8, 0x1bc26512, 0x655024b7, - 0x578c636a, 0x291e22cf, 0x5ade0c25, 0x244c4d80, 0x4d28bdf4, 0x33bafc51, 0x407ad2bb, 0x3ee8931e, - 0x92b23252, 0xec2073f7, 0x9fe05d1d, 0xe1721cb8, 0x8816eccc, 0xf684ad69, 0x85448383, 0xfbd6c226, - 0xddf1c11a, 0xa36380bf, 0xd0a3ae55, 0xae31eff0, 0xc7551f84, 0xb9c75e21, 0xca0770cb, 0xb495316e, - 0x18cf9022, 0x665dd187, 0x159dff6d, 0x6b0fbec8, 0x026b4ebc, 0x7cf90f19, 0x0f3921f3, 0x71ab6056, - 0x9af7424c, 0xe46503e9, 0x97a52d03, 0xe9376ca6, 0x80539cd2, 0xfec1dd77, 0x8d01f39d, 0xf393b238, - 0x5fc91374, 0x215b52d1, 0x529b7c3b, 0x2c093d9e, 0x456dcdea, 0x3bff8c4f, 0x483fa2a5, 0x36ade300, - 0x108ae03c, 0x6e18a199, 0x1dd88f73, 0x634aced6, 0x0a2e3ea2, 0x74bc7f07, 0x077c51ed, 0x79ee1048, - 0xd5b4b104, 0xab26f0a1, 0xd8e6de4b, 0xa6749fee, 0xcf106f9a, 0xb1822e3f, 0xc24200d5, 0xbcd04170, - 0x8e0c06ad, 0xf09e4708, 0x835e69e2, 0xfdcc2847, 0x94a8d833, 0xea3a9996, 0x99fab77c, 0xe768f6d9, - 0x4b325795, 0x35a01630, 0x466038da, 0x38f2797f, 0x5196890b, 0x2f04c8ae, 0x5cc4e644, 0x2256a7e1, - 0x0471a4dd, 0x7ae3e578, 0x0923cb92, 0x77b18a37, 0x1ed57a43, 0x60473be6, 0x1387150c, 0x6d1554a9, - 0xc14ff5e5, 
0xbfddb440, 0xcc1d9aaa, 0xb28fdb0f, 0xdbeb2b7b, 0xa5796ade, 0xd6b94434, 0xa82b0591 - }, - { - 0x00000000, 0xb8aa45dd, 0x812367bf, 0x39892262, 0xf331227b, 0x4b9b67a6, 0x721245c4, 0xcab80019, - 0xe66344f6, 0x5ec9012b, 0x67402349, 0xdfea6694, 0x1552668d, 0xadf82350, 0x94710132, 0x2cdb44ef, - 0x3db164e9, 0x851b2134, 0xbc920356, 0x0438468b, 0xce804692, 0x762a034f, 0x4fa3212d, 0xf70964f0, - 0xdbd2201f, 0x637865c2, 0x5af147a0, 0xe25b027d, 0x28e30264, 0x904947b9, 0xa9c065db, 0x116a2006, - 0x8b1425d7, 0x33be600a, 0x0a374268, 0xb29d07b5, 0x782507ac, 0xc08f4271, 0xf9066013, 0x41ac25ce, - 0x6d776121, 0xd5dd24fc, 0xec54069e, 0x54fe4343, 0x9e46435a, 0x26ec0687, 0x1f6524e5, 0xa7cf6138, - 0xb6a5413e, 0x0e0f04e3, 0x37862681, 0x8f2c635c, 0x45946345, 0xfd3e2698, 0xc4b704fa, 0x7c1d4127, - 0x50c605c8, 0xe86c4015, 0xd1e56277, 0x694f27aa, 0xa3f727b3, 0x1b5d626e, 0x22d4400c, 0x9a7e05d1, - 0xe75fa6ab, 0x5ff5e376, 0x667cc114, 0xded684c9, 0x146e84d0, 0xacc4c10d, 0x954de36f, 0x2de7a6b2, - 0x013ce25d, 0xb996a780, 0x801f85e2, 0x38b5c03f, 0xf20dc026, 0x4aa785fb, 0x732ea799, 0xcb84e244, - 0xdaeec242, 0x6244879f, 0x5bcda5fd, 0xe367e020, 0x29dfe039, 0x9175a5e4, 0xa8fc8786, 0x1056c25b, - 0x3c8d86b4, 0x8427c369, 0xbdaee10b, 0x0504a4d6, 0xcfbca4cf, 0x7716e112, 0x4e9fc370, 0xf63586ad, - 0x6c4b837c, 0xd4e1c6a1, 0xed68e4c3, 0x55c2a11e, 0x9f7aa107, 0x27d0e4da, 0x1e59c6b8, 0xa6f38365, - 0x8a28c78a, 0x32828257, 0x0b0ba035, 0xb3a1e5e8, 0x7919e5f1, 0xc1b3a02c, 0xf83a824e, 0x4090c793, - 0x51fae795, 0xe950a248, 0xd0d9802a, 0x6873c5f7, 0xa2cbc5ee, 0x1a618033, 0x23e8a251, 0x9b42e78c, - 0xb799a363, 0x0f33e6be, 0x36bac4dc, 0x8e108101, 0x44a88118, 0xfc02c4c5, 0xc58be6a7, 0x7d21a37a, - 0x3fc9a052, 0x8763e58f, 0xbeeac7ed, 0x06408230, 0xccf88229, 0x7452c7f4, 0x4ddbe596, 0xf571a04b, - 0xd9aae4a4, 0x6100a179, 0x5889831b, 0xe023c6c6, 0x2a9bc6df, 0x92318302, 0xabb8a160, 0x1312e4bd, - 0x0278c4bb, 0xbad28166, 0x835ba304, 0x3bf1e6d9, 0xf149e6c0, 0x49e3a31d, 0x706a817f, 0xc8c0c4a2, - 0xe41b804d, 0x5cb1c590, 0x6538e7f2, 0xdd92a22f, 0x172aa236, 0xaf80e7eb, 0x9609c589, 0x2ea38054, - 0xb4dd8585, 0x0c77c058, 0x35fee23a, 0x8d54a7e7, 0x47eca7fe, 0xff46e223, 0xc6cfc041, 0x7e65859c, - 0x52bec173, 0xea1484ae, 0xd39da6cc, 0x6b37e311, 0xa18fe308, 0x1925a6d5, 0x20ac84b7, 0x9806c16a, - 0x896ce16c, 0x31c6a4b1, 0x084f86d3, 0xb0e5c30e, 0x7a5dc317, 0xc2f786ca, 0xfb7ea4a8, 0x43d4e175, - 0x6f0fa59a, 0xd7a5e047, 0xee2cc225, 0x568687f8, 0x9c3e87e1, 0x2494c23c, 0x1d1de05e, 0xa5b7a583, - 0xd89606f9, 0x603c4324, 0x59b56146, 0xe11f249b, 0x2ba72482, 0x930d615f, 0xaa84433d, 0x122e06e0, - 0x3ef5420f, 0x865f07d2, 0xbfd625b0, 0x077c606d, 0xcdc46074, 0x756e25a9, 0x4ce707cb, 0xf44d4216, - 0xe5276210, 0x5d8d27cd, 0x640405af, 0xdcae4072, 0x1616406b, 0xaebc05b6, 0x973527d4, 0x2f9f6209, - 0x034426e6, 0xbbee633b, 0x82674159, 0x3acd0484, 0xf075049d, 0x48df4140, 0x71566322, 0xc9fc26ff, - 0x5382232e, 0xeb2866f3, 0xd2a14491, 0x6a0b014c, 0xa0b30155, 0x18194488, 0x219066ea, 0x993a2337, - 0xb5e167d8, 0x0d4b2205, 0x34c20067, 0x8c6845ba, 0x46d045a3, 0xfe7a007e, 0xc7f3221c, 0x7f5967c1, - 0x6e3347c7, 0xd699021a, 0xef102078, 0x57ba65a5, 0x9d0265bc, 0x25a82061, 0x1c210203, 0xa48b47de, - 0x88500331, 0x30fa46ec, 0x0973648e, 0xb1d92153, 0x7b61214a, 0xc3cb6497, 0xfa4246f5, 0x42e80328 - }, - { - 0x00000000, 0xac6f1138, 0x58df2270, 0xf4b03348, 0xb0be45e0, 0x1cd154d8, 0xe8616790, 0x440e76a8, - 0x910b67c5, 0x3d6476fd, 0xc9d445b5, 0x65bb548d, 0x21b52225, 0x8dda331d, 0x796a0055, 0xd505116d, - 0xd361228f, 0x7f0e33b7, 0x8bbe00ff, 0x27d111c7, 0x63df676f, 0xcfb07657, 0x3b00451f, 0x976f5427, - 0x426a454a, 
0xee055472, 0x1ab5673a, 0xb6da7602, 0xf2d400aa, 0x5ebb1192, 0xaa0b22da, 0x066433e2, - 0x57b5a81b, 0xfbdab923, 0x0f6a8a6b, 0xa3059b53, 0xe70bedfb, 0x4b64fcc3, 0xbfd4cf8b, 0x13bbdeb3, - 0xc6becfde, 0x6ad1dee6, 0x9e61edae, 0x320efc96, 0x76008a3e, 0xda6f9b06, 0x2edfa84e, 0x82b0b976, - 0x84d48a94, 0x28bb9bac, 0xdc0ba8e4, 0x7064b9dc, 0x346acf74, 0x9805de4c, 0x6cb5ed04, 0xc0dafc3c, - 0x15dfed51, 0xb9b0fc69, 0x4d00cf21, 0xe16fde19, 0xa561a8b1, 0x090eb989, 0xfdbe8ac1, 0x51d19bf9, - 0xae6a5137, 0x0205400f, 0xf6b57347, 0x5ada627f, 0x1ed414d7, 0xb2bb05ef, 0x460b36a7, 0xea64279f, - 0x3f6136f2, 0x930e27ca, 0x67be1482, 0xcbd105ba, 0x8fdf7312, 0x23b0622a, 0xd7005162, 0x7b6f405a, - 0x7d0b73b8, 0xd1646280, 0x25d451c8, 0x89bb40f0, 0xcdb53658, 0x61da2760, 0x956a1428, 0x39050510, - 0xec00147d, 0x406f0545, 0xb4df360d, 0x18b02735, 0x5cbe519d, 0xf0d140a5, 0x046173ed, 0xa80e62d5, - 0xf9dff92c, 0x55b0e814, 0xa100db5c, 0x0d6fca64, 0x4961bccc, 0xe50eadf4, 0x11be9ebc, 0xbdd18f84, - 0x68d49ee9, 0xc4bb8fd1, 0x300bbc99, 0x9c64ada1, 0xd86adb09, 0x7405ca31, 0x80b5f979, 0x2cdae841, - 0x2abedba3, 0x86d1ca9b, 0x7261f9d3, 0xde0ee8eb, 0x9a009e43, 0x366f8f7b, 0xc2dfbc33, 0x6eb0ad0b, - 0xbbb5bc66, 0x17daad5e, 0xe36a9e16, 0x4f058f2e, 0x0b0bf986, 0xa764e8be, 0x53d4dbf6, 0xffbbcace, - 0x5cd5a26e, 0xf0bab356, 0x040a801e, 0xa8659126, 0xec6be78e, 0x4004f6b6, 0xb4b4c5fe, 0x18dbd4c6, - 0xcddec5ab, 0x61b1d493, 0x9501e7db, 0x396ef6e3, 0x7d60804b, 0xd10f9173, 0x25bfa23b, 0x89d0b303, - 0x8fb480e1, 0x23db91d9, 0xd76ba291, 0x7b04b3a9, 0x3f0ac501, 0x9365d439, 0x67d5e771, 0xcbbaf649, - 0x1ebfe724, 0xb2d0f61c, 0x4660c554, 0xea0fd46c, 0xae01a2c4, 0x026eb3fc, 0xf6de80b4, 0x5ab1918c, - 0x0b600a75, 0xa70f1b4d, 0x53bf2805, 0xffd0393d, 0xbbde4f95, 0x17b15ead, 0xe3016de5, 0x4f6e7cdd, - 0x9a6b6db0, 0x36047c88, 0xc2b44fc0, 0x6edb5ef8, 0x2ad52850, 0x86ba3968, 0x720a0a20, 0xde651b18, - 0xd80128fa, 0x746e39c2, 0x80de0a8a, 0x2cb11bb2, 0x68bf6d1a, 0xc4d07c22, 0x30604f6a, 0x9c0f5e52, - 0x490a4f3f, 0xe5655e07, 0x11d56d4f, 0xbdba7c77, 0xf9b40adf, 0x55db1be7, 0xa16b28af, 0x0d043997, - 0xf2bff359, 0x5ed0e261, 0xaa60d129, 0x060fc011, 0x4201b6b9, 0xee6ea781, 0x1ade94c9, 0xb6b185f1, - 0x63b4949c, 0xcfdb85a4, 0x3b6bb6ec, 0x9704a7d4, 0xd30ad17c, 0x7f65c044, 0x8bd5f30c, 0x27bae234, - 0x21ded1d6, 0x8db1c0ee, 0x7901f3a6, 0xd56ee29e, 0x91609436, 0x3d0f850e, 0xc9bfb646, 0x65d0a77e, - 0xb0d5b613, 0x1cbaa72b, 0xe80a9463, 0x4465855b, 0x006bf3f3, 0xac04e2cb, 0x58b4d183, 0xf4dbc0bb, - 0xa50a5b42, 0x09654a7a, 0xfdd57932, 0x51ba680a, 0x15b41ea2, 0xb9db0f9a, 0x4d6b3cd2, 0xe1042dea, - 0x34013c87, 0x986e2dbf, 0x6cde1ef7, 0xc0b10fcf, 0x84bf7967, 0x28d0685f, 0xdc605b17, 0x700f4a2f, - 0x766b79cd, 0xda0468f5, 0x2eb45bbd, 0x82db4a85, 0xc6d53c2d, 0x6aba2d15, 0x9e0a1e5d, 0x32650f65, - 0xe7601e08, 0x4b0f0f30, 0xbfbf3c78, 0x13d02d40, 0x57de5be8, 0xfbb14ad0, 0x0f017998, 0xa36e68a0 - }, - { - 0x00000000, 0x196b30ef, 0xc3a08cdb, 0xdacbbc34, 0x7737f5b2, 0x6e5cc55d, 0xb4977969, 0xadfc4986, - 0x1f180660, 0x0673368f, 0xdcb88abb, 0xc5d3ba54, 0x682ff3d2, 0x7144c33d, 0xab8f7f09, 0xb2e44fe6, - 0x3e300cc0, 0x275b3c2f, 0xfd90801b, 0xe4fbb0f4, 0x4907f972, 0x506cc99d, 0x8aa775a9, 0x93cc4546, - 0x21280aa0, 0x38433a4f, 0xe288867b, 0xfbe3b694, 0x561fff12, 0x4f74cffd, 0x95bf73c9, 0x8cd44326, - 0x8d16f485, 0x947dc46a, 0x4eb6785e, 0x57dd48b1, 0xfa210137, 0xe34a31d8, 0x39818dec, 0x20eabd03, - 0x920ef2e5, 0x8b65c20a, 0x51ae7e3e, 0x48c54ed1, 0xe5390757, 0xfc5237b8, 0x26998b8c, 0x3ff2bb63, - 0xb326f845, 0xaa4dc8aa, 0x7086749e, 0x69ed4471, 0xc4110df7, 0xdd7a3d18, 0x07b1812c, 0x1edab1c3, - 0xac3efe25, 0xb555ceca, 
0x6f9e72fe, 0x76f54211, 0xdb090b97, 0xc2623b78, 0x18a9874c, 0x01c2b7a3, - 0xeb5b040e, 0xf23034e1, 0x28fb88d5, 0x3190b83a, 0x9c6cf1bc, 0x8507c153, 0x5fcc7d67, 0x46a74d88, - 0xf443026e, 0xed283281, 0x37e38eb5, 0x2e88be5a, 0x8374f7dc, 0x9a1fc733, 0x40d47b07, 0x59bf4be8, - 0xd56b08ce, 0xcc003821, 0x16cb8415, 0x0fa0b4fa, 0xa25cfd7c, 0xbb37cd93, 0x61fc71a7, 0x78974148, - 0xca730eae, 0xd3183e41, 0x09d38275, 0x10b8b29a, 0xbd44fb1c, 0xa42fcbf3, 0x7ee477c7, 0x678f4728, - 0x664df08b, 0x7f26c064, 0xa5ed7c50, 0xbc864cbf, 0x117a0539, 0x081135d6, 0xd2da89e2, 0xcbb1b90d, - 0x7955f6eb, 0x603ec604, 0xbaf57a30, 0xa39e4adf, 0x0e620359, 0x170933b6, 0xcdc28f82, 0xd4a9bf6d, - 0x587dfc4b, 0x4116cca4, 0x9bdd7090, 0x82b6407f, 0x2f4a09f9, 0x36213916, 0xecea8522, 0xf581b5cd, - 0x4765fa2b, 0x5e0ecac4, 0x84c576f0, 0x9dae461f, 0x30520f99, 0x29393f76, 0xf3f28342, 0xea99b3ad, - 0xd6b7081c, 0xcfdc38f3, 0x151784c7, 0x0c7cb428, 0xa180fdae, 0xb8ebcd41, 0x62207175, 0x7b4b419a, - 0xc9af0e7c, 0xd0c43e93, 0x0a0f82a7, 0x1364b248, 0xbe98fbce, 0xa7f3cb21, 0x7d387715, 0x645347fa, - 0xe88704dc, 0xf1ec3433, 0x2b278807, 0x324cb8e8, 0x9fb0f16e, 0x86dbc181, 0x5c107db5, 0x457b4d5a, - 0xf79f02bc, 0xeef43253, 0x343f8e67, 0x2d54be88, 0x80a8f70e, 0x99c3c7e1, 0x43087bd5, 0x5a634b3a, - 0x5ba1fc99, 0x42cacc76, 0x98017042, 0x816a40ad, 0x2c96092b, 0x35fd39c4, 0xef3685f0, 0xf65db51f, - 0x44b9faf9, 0x5dd2ca16, 0x87197622, 0x9e7246cd, 0x338e0f4b, 0x2ae53fa4, 0xf02e8390, 0xe945b37f, - 0x6591f059, 0x7cfac0b6, 0xa6317c82, 0xbf5a4c6d, 0x12a605eb, 0x0bcd3504, 0xd1068930, 0xc86db9df, - 0x7a89f639, 0x63e2c6d6, 0xb9297ae2, 0xa0424a0d, 0x0dbe038b, 0x14d53364, 0xce1e8f50, 0xd775bfbf, - 0x3dec0c12, 0x24873cfd, 0xfe4c80c9, 0xe727b026, 0x4adbf9a0, 0x53b0c94f, 0x897b757b, 0x90104594, - 0x22f40a72, 0x3b9f3a9d, 0xe15486a9, 0xf83fb646, 0x55c3ffc0, 0x4ca8cf2f, 0x9663731b, 0x8f0843f4, - 0x03dc00d2, 0x1ab7303d, 0xc07c8c09, 0xd917bce6, 0x74ebf560, 0x6d80c58f, 0xb74b79bb, 0xae204954, - 0x1cc406b2, 0x05af365d, 0xdf648a69, 0xc60fba86, 0x6bf3f300, 0x7298c3ef, 0xa8537fdb, 0xb1384f34, - 0xb0faf897, 0xa991c878, 0x735a744c, 0x6a3144a3, 0xc7cd0d25, 0xdea63dca, 0x046d81fe, 0x1d06b111, - 0xafe2fef7, 0xb689ce18, 0x6c42722c, 0x752942c3, 0xd8d50b45, 0xc1be3baa, 0x1b75879e, 0x021eb771, - 0x8ecaf457, 0x97a1c4b8, 0x4d6a788c, 0x54014863, 0xf9fd01e5, 0xe096310a, 0x3a5d8d3e, 0x2336bdd1, - 0x91d2f237, 0x88b9c2d8, 0x52727eec, 0x4b194e03, 0xe6e50785, 0xff8e376a, 0x25458b5e, 0x3c2ebbb1 - }, - { - 0x00000000, 0xc82c0368, 0x905906d0, 0x587505b8, 0xd1c5e0a5, 0x19e9e3cd, 0x419ce675, 0x89b0e51d, - 0x53fd2d4e, 0x9bd12e26, 0xc3a42b9e, 0x0b8828f6, 0x8238cdeb, 0x4a14ce83, 0x1261cb3b, 0xda4dc853, - 0xa6fa5b9c, 0x6ed658f4, 0x36a35d4c, 0xfe8f5e24, 0x773fbb39, 0xbf13b851, 0xe766bde9, 0x2f4abe81, - 0xf50776d2, 0x3d2b75ba, 0x655e7002, 0xad72736a, 0x24c29677, 0xecee951f, 0xb49b90a7, 0x7cb793cf, - 0xbd835b3d, 0x75af5855, 0x2dda5ded, 0xe5f65e85, 0x6c46bb98, 0xa46ab8f0, 0xfc1fbd48, 0x3433be20, - 0xee7e7673, 0x2652751b, 0x7e2770a3, 0xb60b73cb, 0x3fbb96d6, 0xf79795be, 0xafe29006, 0x67ce936e, - 0x1b7900a1, 0xd35503c9, 0x8b200671, 0x430c0519, 0xcabce004, 0x0290e36c, 0x5ae5e6d4, 0x92c9e5bc, - 0x48842def, 0x80a82e87, 0xd8dd2b3f, 0x10f12857, 0x9941cd4a, 0x516dce22, 0x0918cb9a, 0xc134c8f2, - 0x7a07b77a, 0xb22bb412, 0xea5eb1aa, 0x2272b2c2, 0xabc257df, 0x63ee54b7, 0x3b9b510f, 0xf3b75267, - 0x29fa9a34, 0xe1d6995c, 0xb9a39ce4, 0x718f9f8c, 0xf83f7a91, 0x301379f9, 0x68667c41, 0xa04a7f29, - 0xdcfdece6, 0x14d1ef8e, 0x4ca4ea36, 0x8488e95e, 0x0d380c43, 0xc5140f2b, 0x9d610a93, 0x554d09fb, - 0x8f00c1a8, 0x472cc2c0, 0x1f59c778, 
0xd775c410, 0x5ec5210d, 0x96e92265, 0xce9c27dd, 0x06b024b5, - 0xc784ec47, 0x0fa8ef2f, 0x57ddea97, 0x9ff1e9ff, 0x16410ce2, 0xde6d0f8a, 0x86180a32, 0x4e34095a, - 0x9479c109, 0x5c55c261, 0x0420c7d9, 0xcc0cc4b1, 0x45bc21ac, 0x8d9022c4, 0xd5e5277c, 0x1dc92414, - 0x617eb7db, 0xa952b4b3, 0xf127b10b, 0x390bb263, 0xb0bb577e, 0x78975416, 0x20e251ae, 0xe8ce52c6, - 0x32839a95, 0xfaaf99fd, 0xa2da9c45, 0x6af69f2d, 0xe3467a30, 0x2b6a7958, 0x731f7ce0, 0xbb337f88, - 0xf40e6ef5, 0x3c226d9d, 0x64576825, 0xac7b6b4d, 0x25cb8e50, 0xede78d38, 0xb5928880, 0x7dbe8be8, - 0xa7f343bb, 0x6fdf40d3, 0x37aa456b, 0xff864603, 0x7636a31e, 0xbe1aa076, 0xe66fa5ce, 0x2e43a6a6, - 0x52f43569, 0x9ad83601, 0xc2ad33b9, 0x0a8130d1, 0x8331d5cc, 0x4b1dd6a4, 0x1368d31c, 0xdb44d074, - 0x01091827, 0xc9251b4f, 0x91501ef7, 0x597c1d9f, 0xd0ccf882, 0x18e0fbea, 0x4095fe52, 0x88b9fd3a, - 0x498d35c8, 0x81a136a0, 0xd9d43318, 0x11f83070, 0x9848d56d, 0x5064d605, 0x0811d3bd, 0xc03dd0d5, - 0x1a701886, 0xd25c1bee, 0x8a291e56, 0x42051d3e, 0xcbb5f823, 0x0399fb4b, 0x5becfef3, 0x93c0fd9b, - 0xef776e54, 0x275b6d3c, 0x7f2e6884, 0xb7026bec, 0x3eb28ef1, 0xf69e8d99, 0xaeeb8821, 0x66c78b49, - 0xbc8a431a, 0x74a64072, 0x2cd345ca, 0xe4ff46a2, 0x6d4fa3bf, 0xa563a0d7, 0xfd16a56f, 0x353aa607, - 0x8e09d98f, 0x4625dae7, 0x1e50df5f, 0xd67cdc37, 0x5fcc392a, 0x97e03a42, 0xcf953ffa, 0x07b93c92, - 0xddf4f4c1, 0x15d8f7a9, 0x4dadf211, 0x8581f179, 0x0c311464, 0xc41d170c, 0x9c6812b4, 0x544411dc, - 0x28f38213, 0xe0df817b, 0xb8aa84c3, 0x708687ab, 0xf93662b6, 0x311a61de, 0x696f6466, 0xa143670e, - 0x7b0eaf5d, 0xb322ac35, 0xeb57a98d, 0x237baae5, 0xaacb4ff8, 0x62e74c90, 0x3a924928, 0xf2be4a40, - 0x338a82b2, 0xfba681da, 0xa3d38462, 0x6bff870a, 0xe24f6217, 0x2a63617f, 0x721664c7, 0xba3a67af, - 0x6077affc, 0xa85bac94, 0xf02ea92c, 0x3802aa44, 0xb1b24f59, 0x799e4c31, 0x21eb4989, 0xe9c74ae1, - 0x9570d92e, 0x5d5cda46, 0x0529dffe, 0xcd05dc96, 0x44b5398b, 0x8c993ae3, 0xd4ec3f5b, 0x1cc03c33, - 0xc68df460, 0x0ea1f708, 0x56d4f2b0, 0x9ef8f1d8, 0x174814c5, 0xdf6417ad, 0x87111215, 0x4f3d117d - }, - { - 0x00000000, 0x277d3c49, 0x4efa7892, 0x698744db, 0x6d821d21, 0x4aff2168, 0x237865b3, 0x040559fa, - 0xda043b42, 0xfd79070b, 0x94fe43d0, 0xb3837f99, 0xb7862663, 0x90fb1a2a, 0xf97c5ef1, 0xde0162b8, - 0xb4097684, 0x93744acd, 0xfaf30e16, 0xdd8e325f, 0xd98b6ba5, 0xfef657ec, 0x97711337, 0xb00c2f7e, - 0x6e0d4dc6, 0x4970718f, 0x20f73554, 0x078a091d, 0x038f50e7, 0x24f26cae, 0x4d752875, 0x6a08143c, - 0x9965000d, 0xbe183c44, 0xd79f789f, 0xf0e244d6, 0xf4e71d2c, 0xd39a2165, 0xba1d65be, 0x9d6059f7, - 0x43613b4f, 0x641c0706, 0x0d9b43dd, 0x2ae67f94, 0x2ee3266e, 0x099e1a27, 0x60195efc, 0x476462b5, - 0x2d6c7689, 0x0a114ac0, 0x63960e1b, 0x44eb3252, 0x40ee6ba8, 0x679357e1, 0x0e14133a, 0x29692f73, - 0xf7684dcb, 0xd0157182, 0xb9923559, 0x9eef0910, 0x9aea50ea, 0xbd976ca3, 0xd4102878, 0xf36d1431, - 0x32cb001a, 0x15b63c53, 0x7c317888, 0x5b4c44c1, 0x5f491d3b, 0x78342172, 0x11b365a9, 0x36ce59e0, - 0xe8cf3b58, 0xcfb20711, 0xa63543ca, 0x81487f83, 0x854d2679, 0xa2301a30, 0xcbb75eeb, 0xecca62a2, - 0x86c2769e, 0xa1bf4ad7, 0xc8380e0c, 0xef453245, 0xeb406bbf, 0xcc3d57f6, 0xa5ba132d, 0x82c72f64, - 0x5cc64ddc, 0x7bbb7195, 0x123c354e, 0x35410907, 0x314450fd, 0x16396cb4, 0x7fbe286f, 0x58c31426, - 0xabae0017, 0x8cd33c5e, 0xe5547885, 0xc22944cc, 0xc62c1d36, 0xe151217f, 0x88d665a4, 0xafab59ed, - 0x71aa3b55, 0x56d7071c, 0x3f5043c7, 0x182d7f8e, 0x1c282674, 0x3b551a3d, 0x52d25ee6, 0x75af62af, - 0x1fa77693, 0x38da4ada, 0x515d0e01, 0x76203248, 0x72256bb2, 0x555857fb, 0x3cdf1320, 0x1ba22f69, - 0xc5a34dd1, 0xe2de7198, 0x8b593543, 0xac24090a, 
0xa82150f0, 0x8f5c6cb9, 0xe6db2862, 0xc1a6142b, - 0x64960134, 0x43eb3d7d, 0x2a6c79a6, 0x0d1145ef, 0x09141c15, 0x2e69205c, 0x47ee6487, 0x609358ce, - 0xbe923a76, 0x99ef063f, 0xf06842e4, 0xd7157ead, 0xd3102757, 0xf46d1b1e, 0x9dea5fc5, 0xba97638c, - 0xd09f77b0, 0xf7e24bf9, 0x9e650f22, 0xb918336b, 0xbd1d6a91, 0x9a6056d8, 0xf3e71203, 0xd49a2e4a, - 0x0a9b4cf2, 0x2de670bb, 0x44613460, 0x631c0829, 0x671951d3, 0x40646d9a, 0x29e32941, 0x0e9e1508, - 0xfdf30139, 0xda8e3d70, 0xb30979ab, 0x947445e2, 0x90711c18, 0xb70c2051, 0xde8b648a, 0xf9f658c3, - 0x27f73a7b, 0x008a0632, 0x690d42e9, 0x4e707ea0, 0x4a75275a, 0x6d081b13, 0x048f5fc8, 0x23f26381, - 0x49fa77bd, 0x6e874bf4, 0x07000f2f, 0x207d3366, 0x24786a9c, 0x030556d5, 0x6a82120e, 0x4dff2e47, - 0x93fe4cff, 0xb48370b6, 0xdd04346d, 0xfa790824, 0xfe7c51de, 0xd9016d97, 0xb086294c, 0x97fb1505, - 0x565d012e, 0x71203d67, 0x18a779bc, 0x3fda45f5, 0x3bdf1c0f, 0x1ca22046, 0x7525649d, 0x525858d4, - 0x8c593a6c, 0xab240625, 0xc2a342fe, 0xe5de7eb7, 0xe1db274d, 0xc6a61b04, 0xaf215fdf, 0x885c6396, - 0xe25477aa, 0xc5294be3, 0xacae0f38, 0x8bd33371, 0x8fd66a8b, 0xa8ab56c2, 0xc12c1219, 0xe6512e50, - 0x38504ce8, 0x1f2d70a1, 0x76aa347a, 0x51d70833, 0x55d251c9, 0x72af6d80, 0x1b28295b, 0x3c551512, - 0xcf380123, 0xe8453d6a, 0x81c279b1, 0xa6bf45f8, 0xa2ba1c02, 0x85c7204b, 0xec406490, 0xcb3d58d9, - 0x153c3a61, 0x32410628, 0x5bc642f3, 0x7cbb7eba, 0x78be2740, 0x5fc31b09, 0x36445fd2, 0x1139639b, - 0x7b3177a7, 0x5c4c4bee, 0x35cb0f35, 0x12b6337c, 0x16b36a86, 0x31ce56cf, 0x58491214, 0x7f342e5d, - 0xa1354ce5, 0x864870ac, 0xefcf3477, 0xc8b2083e, 0xccb751c4, 0xebca6d8d, 0x824d2956, 0xa530151f - } -#else//SIMD_BIG_ENDIAN - { - 0x00000000, 0xf26b8303, 0xe13b70f7, 0x1350f3f4, 0xc79a971f, 0x35f1141c, 0x26a1e7e8, 0xd4ca64eb, - 0x8ad958cf, 0x78b2dbcc, 0x6be22838, 0x9989ab3b, 0x4d43cfd0, 0xbf284cd3, 0xac78bf27, 0x5e133c24, - 0x105ec76f, 0xe235446c, 0xf165b798, 0x030e349b, 0xd7c45070, 0x25afd373, 0x36ff2087, 0xc494a384, - 0x9a879fa0, 0x68ec1ca3, 0x7bbcef57, 0x89d76c54, 0x5d1d08bf, 0xaf768bbc, 0xbc267848, 0x4e4dfb4b, - 0x20bd8ede, 0xd2d60ddd, 0xc186fe29, 0x33ed7d2a, 0xe72719c1, 0x154c9ac2, 0x061c6936, 0xf477ea35, - 0xaa64d611, 0x580f5512, 0x4b5fa6e6, 0xb93425e5, 0x6dfe410e, 0x9f95c20d, 0x8cc531f9, 0x7eaeb2fa, - 0x30e349b1, 0xc288cab2, 0xd1d83946, 0x23b3ba45, 0xf779deae, 0x05125dad, 0x1642ae59, 0xe4292d5a, - 0xba3a117e, 0x4851927d, 0x5b016189, 0xa96ae28a, 0x7da08661, 0x8fcb0562, 0x9c9bf696, 0x6ef07595, - 0x417b1dbc, 0xb3109ebf, 0xa0406d4b, 0x522bee48, 0x86e18aa3, 0x748a09a0, 0x67dafa54, 0x95b17957, - 0xcba24573, 0x39c9c670, 0x2a993584, 0xd8f2b687, 0x0c38d26c, 0xfe53516f, 0xed03a29b, 0x1f682198, - 0x5125dad3, 0xa34e59d0, 0xb01eaa24, 0x42752927, 0x96bf4dcc, 0x64d4cecf, 0x77843d3b, 0x85efbe38, - 0xdbfc821c, 0x2997011f, 0x3ac7f2eb, 0xc8ac71e8, 0x1c661503, 0xee0d9600, 0xfd5d65f4, 0x0f36e6f7, - 0x61c69362, 0x93ad1061, 0x80fde395, 0x72966096, 0xa65c047d, 0x5437877e, 0x4767748a, 0xb50cf789, - 0xeb1fcbad, 0x197448ae, 0x0a24bb5a, 0xf84f3859, 0x2c855cb2, 0xdeeedfb1, 0xcdbe2c45, 0x3fd5af46, - 0x7198540d, 0x83f3d70e, 0x90a324fa, 0x62c8a7f9, 0xb602c312, 0x44694011, 0x5739b3e5, 0xa55230e6, - 0xfb410cc2, 0x092a8fc1, 0x1a7a7c35, 0xe811ff36, 0x3cdb9bdd, 0xceb018de, 0xdde0eb2a, 0x2f8b6829, - 0x82f63b78, 0x709db87b, 0x63cd4b8f, 0x91a6c88c, 0x456cac67, 0xb7072f64, 0xa457dc90, 0x563c5f93, - 0x082f63b7, 0xfa44e0b4, 0xe9141340, 0x1b7f9043, 0xcfb5f4a8, 0x3dde77ab, 0x2e8e845f, 0xdce5075c, - 0x92a8fc17, 0x60c37f14, 0x73938ce0, 0x81f80fe3, 0x55326b08, 0xa759e80b, 0xb4091bff, 0x466298fc, - 0x1871a4d8, 0xea1a27db, 0xf94ad42f, 
0x0b21572c, 0xdfeb33c7, 0x2d80b0c4, 0x3ed04330, 0xccbbc033, - 0xa24bb5a6, 0x502036a5, 0x4370c551, 0xb11b4652, 0x65d122b9, 0x97baa1ba, 0x84ea524e, 0x7681d14d, - 0x2892ed69, 0xdaf96e6a, 0xc9a99d9e, 0x3bc21e9d, 0xef087a76, 0x1d63f975, 0x0e330a81, 0xfc588982, - 0xb21572c9, 0x407ef1ca, 0x532e023e, 0xa145813d, 0x758fe5d6, 0x87e466d5, 0x94b49521, 0x66df1622, - 0x38cc2a06, 0xcaa7a905, 0xd9f75af1, 0x2b9cd9f2, 0xff56bd19, 0x0d3d3e1a, 0x1e6dcdee, 0xec064eed, - 0xc38d26c4, 0x31e6a5c7, 0x22b65633, 0xd0ddd530, 0x0417b1db, 0xf67c32d8, 0xe52cc12c, 0x1747422f, - 0x49547e0b, 0xbb3ffd08, 0xa86f0efc, 0x5a048dff, 0x8ecee914, 0x7ca56a17, 0x6ff599e3, 0x9d9e1ae0, - 0xd3d3e1ab, 0x21b862a8, 0x32e8915c, 0xc083125f, 0x144976b4, 0xe622f5b7, 0xf5720643, 0x07198540, - 0x590ab964, 0xab613a67, 0xb831c993, 0x4a5a4a90, 0x9e902e7b, 0x6cfbad78, 0x7fab5e8c, 0x8dc0dd8f, - 0xe330a81a, 0x115b2b19, 0x020bd8ed, 0xf0605bee, 0x24aa3f05, 0xd6c1bc06, 0xc5914ff2, 0x37faccf1, - 0x69e9f0d5, 0x9b8273d6, 0x88d28022, 0x7ab90321, 0xae7367ca, 0x5c18e4c9, 0x4f48173d, 0xbd23943e, - 0xf36e6f75, 0x0105ec76, 0x12551f82, 0xe03e9c81, 0x34f4f86a, 0xc69f7b69, 0xd5cf889d, 0x27a40b9e, - 0x79b737ba, 0x8bdcb4b9, 0x988c474d, 0x6ae7c44e, 0xbe2da0a5, 0x4c4623a6, 0x5f16d052, 0xad7d5351 - }, - { - 0x00000000, 0x13a29877, 0x274530ee, 0x34e7a899, 0x4e8a61dc, 0x5d28f9ab, 0x69cf5132, 0x7a6dc945, - 0x9d14c3b8, 0x8eb65bcf, 0xba51f356, 0xa9f36b21, 0xd39ea264, 0xc03c3a13, 0xf4db928a, 0xe7790afd, - 0x3fc5f181, 0x2c6769f6, 0x1880c16f, 0x0b225918, 0x714f905d, 0x62ed082a, 0x560aa0b3, 0x45a838c4, - 0xa2d13239, 0xb173aa4e, 0x859402d7, 0x96369aa0, 0xec5b53e5, 0xfff9cb92, 0xcb1e630b, 0xd8bcfb7c, - 0x7f8be302, 0x6c297b75, 0x58ced3ec, 0x4b6c4b9b, 0x310182de, 0x22a31aa9, 0x1644b230, 0x05e62a47, - 0xe29f20ba, 0xf13db8cd, 0xc5da1054, 0xd6788823, 0xac154166, 0xbfb7d911, 0x8b507188, 0x98f2e9ff, - 0x404e1283, 0x53ec8af4, 0x670b226d, 0x74a9ba1a, 0x0ec4735f, 0x1d66eb28, 0x298143b1, 0x3a23dbc6, - 0xdd5ad13b, 0xcef8494c, 0xfa1fe1d5, 0xe9bd79a2, 0x93d0b0e7, 0x80722890, 0xb4958009, 0xa737187e, - 0xff17c604, 0xecb55e73, 0xd852f6ea, 0xcbf06e9d, 0xb19da7d8, 0xa23f3faf, 0x96d89736, 0x857a0f41, - 0x620305bc, 0x71a19dcb, 0x45463552, 0x56e4ad25, 0x2c896460, 0x3f2bfc17, 0x0bcc548e, 0x186eccf9, - 0xc0d23785, 0xd370aff2, 0xe797076b, 0xf4359f1c, 0x8e585659, 0x9dface2e, 0xa91d66b7, 0xbabffec0, - 0x5dc6f43d, 0x4e646c4a, 0x7a83c4d3, 0x69215ca4, 0x134c95e1, 0x00ee0d96, 0x3409a50f, 0x27ab3d78, - 0x809c2506, 0x933ebd71, 0xa7d915e8, 0xb47b8d9f, 0xce1644da, 0xddb4dcad, 0xe9537434, 0xfaf1ec43, - 0x1d88e6be, 0x0e2a7ec9, 0x3acdd650, 0x296f4e27, 0x53028762, 0x40a01f15, 0x7447b78c, 0x67e52ffb, - 0xbf59d487, 0xacfb4cf0, 0x981ce469, 0x8bbe7c1e, 0xf1d3b55b, 0xe2712d2c, 0xd69685b5, 0xc5341dc2, - 0x224d173f, 0x31ef8f48, 0x050827d1, 0x16aabfa6, 0x6cc776e3, 0x7f65ee94, 0x4b82460d, 0x5820de7a, - 0xfbc3faf9, 0xe861628e, 0xdc86ca17, 0xcf245260, 0xb5499b25, 0xa6eb0352, 0x920cabcb, 0x81ae33bc, - 0x66d73941, 0x7575a136, 0x419209af, 0x523091d8, 0x285d589d, 0x3bffc0ea, 0x0f186873, 0x1cbaf004, - 0xc4060b78, 0xd7a4930f, 0xe3433b96, 0xf0e1a3e1, 0x8a8c6aa4, 0x992ef2d3, 0xadc95a4a, 0xbe6bc23d, - 0x5912c8c0, 0x4ab050b7, 0x7e57f82e, 0x6df56059, 0x1798a91c, 0x043a316b, 0x30dd99f2, 0x237f0185, - 0x844819fb, 0x97ea818c, 0xa30d2915, 0xb0afb162, 0xcac27827, 0xd960e050, 0xed8748c9, 0xfe25d0be, - 0x195cda43, 0x0afe4234, 0x3e19eaad, 0x2dbb72da, 0x57d6bb9f, 0x447423e8, 0x70938b71, 0x63311306, - 0xbb8de87a, 0xa82f700d, 0x9cc8d894, 0x8f6a40e3, 0xf50789a6, 0xe6a511d1, 0xd242b948, 0xc1e0213f, - 0x26992bc2, 0x353bb3b5, 0x01dc1b2c, 0x127e835b, 
0x68134a1e, 0x7bb1d269, 0x4f567af0, 0x5cf4e287, - 0x04d43cfd, 0x1776a48a, 0x23910c13, 0x30339464, 0x4a5e5d21, 0x59fcc556, 0x6d1b6dcf, 0x7eb9f5b8, - 0x99c0ff45, 0x8a626732, 0xbe85cfab, 0xad2757dc, 0xd74a9e99, 0xc4e806ee, 0xf00fae77, 0xe3ad3600, - 0x3b11cd7c, 0x28b3550b, 0x1c54fd92, 0x0ff665e5, 0x759baca0, 0x663934d7, 0x52de9c4e, 0x417c0439, - 0xa6050ec4, 0xb5a796b3, 0x81403e2a, 0x92e2a65d, 0xe88f6f18, 0xfb2df76f, 0xcfca5ff6, 0xdc68c781, - 0x7b5fdfff, 0x68fd4788, 0x5c1aef11, 0x4fb87766, 0x35d5be23, 0x26772654, 0x12908ecd, 0x013216ba, - 0xe64b1c47, 0xf5e98430, 0xc10e2ca9, 0xd2acb4de, 0xa8c17d9b, 0xbb63e5ec, 0x8f844d75, 0x9c26d502, - 0x449a2e7e, 0x5738b609, 0x63df1e90, 0x707d86e7, 0x0a104fa2, 0x19b2d7d5, 0x2d557f4c, 0x3ef7e73b, - 0xd98eedc6, 0xca2c75b1, 0xfecbdd28, 0xed69455f, 0x97048c1a, 0x84a6146d, 0xb041bcf4, 0xa3e32483 - }, - { - 0x00000000, 0xa541927e, 0x4f6f520d, 0xea2ec073, 0x9edea41a, 0x3b9f3664, 0xd1b1f617, 0x74f06469, - 0x38513ec5, 0x9d10acbb, 0x773e6cc8, 0xd27ffeb6, 0xa68f9adf, 0x03ce08a1, 0xe9e0c8d2, 0x4ca15aac, - 0x70a27d8a, 0xd5e3eff4, 0x3fcd2f87, 0x9a8cbdf9, 0xee7cd990, 0x4b3d4bee, 0xa1138b9d, 0x045219e3, - 0x48f3434f, 0xedb2d131, 0x079c1142, 0xa2dd833c, 0xd62de755, 0x736c752b, 0x9942b558, 0x3c032726, - 0xe144fb14, 0x4405696a, 0xae2ba919, 0x0b6a3b67, 0x7f9a5f0e, 0xdadbcd70, 0x30f50d03, 0x95b49f7d, - 0xd915c5d1, 0x7c5457af, 0x967a97dc, 0x333b05a2, 0x47cb61cb, 0xe28af3b5, 0x08a433c6, 0xade5a1b8, - 0x91e6869e, 0x34a714e0, 0xde89d493, 0x7bc846ed, 0x0f382284, 0xaa79b0fa, 0x40577089, 0xe516e2f7, - 0xa9b7b85b, 0x0cf62a25, 0xe6d8ea56, 0x43997828, 0x37691c41, 0x92288e3f, 0x78064e4c, 0xdd47dc32, - 0xc76580d9, 0x622412a7, 0x880ad2d4, 0x2d4b40aa, 0x59bb24c3, 0xfcfab6bd, 0x16d476ce, 0xb395e4b0, - 0xff34be1c, 0x5a752c62, 0xb05bec11, 0x151a7e6f, 0x61ea1a06, 0xc4ab8878, 0x2e85480b, 0x8bc4da75, - 0xb7c7fd53, 0x12866f2d, 0xf8a8af5e, 0x5de93d20, 0x29195949, 0x8c58cb37, 0x66760b44, 0xc337993a, - 0x8f96c396, 0x2ad751e8, 0xc0f9919b, 0x65b803e5, 0x1148678c, 0xb409f5f2, 0x5e273581, 0xfb66a7ff, - 0x26217bcd, 0x8360e9b3, 0x694e29c0, 0xcc0fbbbe, 0xb8ffdfd7, 0x1dbe4da9, 0xf7908dda, 0x52d11fa4, - 0x1e704508, 0xbb31d776, 0x511f1705, 0xf45e857b, 0x80aee112, 0x25ef736c, 0xcfc1b31f, 0x6a802161, - 0x56830647, 0xf3c29439, 0x19ec544a, 0xbcadc634, 0xc85da25d, 0x6d1c3023, 0x8732f050, 0x2273622e, - 0x6ed23882, 0xcb93aafc, 0x21bd6a8f, 0x84fcf8f1, 0xf00c9c98, 0x554d0ee6, 0xbf63ce95, 0x1a225ceb, - 0x8b277743, 0x2e66e53d, 0xc448254e, 0x6109b730, 0x15f9d359, 0xb0b84127, 0x5a968154, 0xffd7132a, - 0xb3764986, 0x1637dbf8, 0xfc191b8b, 0x595889f5, 0x2da8ed9c, 0x88e97fe2, 0x62c7bf91, 0xc7862def, - 0xfb850ac9, 0x5ec498b7, 0xb4ea58c4, 0x11abcaba, 0x655baed3, 0xc01a3cad, 0x2a34fcde, 0x8f756ea0, - 0xc3d4340c, 0x6695a672, 0x8cbb6601, 0x29faf47f, 0x5d0a9016, 0xf84b0268, 0x1265c21b, 0xb7245065, - 0x6a638c57, 0xcf221e29, 0x250cde5a, 0x804d4c24, 0xf4bd284d, 0x51fcba33, 0xbbd27a40, 0x1e93e83e, - 0x5232b292, 0xf77320ec, 0x1d5de09f, 0xb81c72e1, 0xccec1688, 0x69ad84f6, 0x83834485, 0x26c2d6fb, - 0x1ac1f1dd, 0xbf8063a3, 0x55aea3d0, 0xf0ef31ae, 0x841f55c7, 0x215ec7b9, 0xcb7007ca, 0x6e3195b4, - 0x2290cf18, 0x87d15d66, 0x6dff9d15, 0xc8be0f6b, 0xbc4e6b02, 0x190ff97c, 0xf321390f, 0x5660ab71, - 0x4c42f79a, 0xe90365e4, 0x032da597, 0xa66c37e9, 0xd29c5380, 0x77ddc1fe, 0x9df3018d, 0x38b293f3, - 0x7413c95f, 0xd1525b21, 0x3b7c9b52, 0x9e3d092c, 0xeacd6d45, 0x4f8cff3b, 0xa5a23f48, 0x00e3ad36, - 0x3ce08a10, 0x99a1186e, 0x738fd81d, 0xd6ce4a63, 0xa23e2e0a, 0x077fbc74, 0xed517c07, 0x4810ee79, - 0x04b1b4d5, 0xa1f026ab, 0x4bdee6d8, 0xee9f74a6, 0x9a6f10cf, 
0x3f2e82b1, 0xd50042c2, 0x7041d0bc, - 0xad060c8e, 0x08479ef0, 0xe2695e83, 0x4728ccfd, 0x33d8a894, 0x96993aea, 0x7cb7fa99, 0xd9f668e7, - 0x9557324b, 0x3016a035, 0xda386046, 0x7f79f238, 0x0b899651, 0xaec8042f, 0x44e6c45c, 0xe1a75622, - 0xdda47104, 0x78e5e37a, 0x92cb2309, 0x378ab177, 0x437ad51e, 0xe63b4760, 0x0c158713, 0xa954156d, - 0xe5f54fc1, 0x40b4ddbf, 0xaa9a1dcc, 0x0fdb8fb2, 0x7b2bebdb, 0xde6a79a5, 0x3444b9d6, 0x91052ba8 - }, - { - 0x00000000, 0xdd45aab8, 0xbf672381, 0x62228939, 0x7b2231f3, 0xa6679b4b, 0xc4451272, 0x1900b8ca, - 0xf64463e6, 0x2b01c95e, 0x49234067, 0x9466eadf, 0x8d665215, 0x5023f8ad, 0x32017194, 0xef44db2c, - 0xe964b13d, 0x34211b85, 0x560392bc, 0x8b463804, 0x924680ce, 0x4f032a76, 0x2d21a34f, 0xf06409f7, - 0x1f20d2db, 0xc2657863, 0xa047f15a, 0x7d025be2, 0x6402e328, 0xb9474990, 0xdb65c0a9, 0x06206a11, - 0xd725148b, 0x0a60be33, 0x6842370a, 0xb5079db2, 0xac072578, 0x71428fc0, 0x136006f9, 0xce25ac41, - 0x2161776d, 0xfc24ddd5, 0x9e0654ec, 0x4343fe54, 0x5a43469e, 0x8706ec26, 0xe524651f, 0x3861cfa7, - 0x3e41a5b6, 0xe3040f0e, 0x81268637, 0x5c632c8f, 0x45639445, 0x98263efd, 0xfa04b7c4, 0x27411d7c, - 0xc805c650, 0x15406ce8, 0x7762e5d1, 0xaa274f69, 0xb327f7a3, 0x6e625d1b, 0x0c40d422, 0xd1057e9a, - 0xaba65fe7, 0x76e3f55f, 0x14c17c66, 0xc984d6de, 0xd0846e14, 0x0dc1c4ac, 0x6fe34d95, 0xb2a6e72d, - 0x5de23c01, 0x80a796b9, 0xe2851f80, 0x3fc0b538, 0x26c00df2, 0xfb85a74a, 0x99a72e73, 0x44e284cb, - 0x42c2eeda, 0x9f874462, 0xfda5cd5b, 0x20e067e3, 0x39e0df29, 0xe4a57591, 0x8687fca8, 0x5bc25610, - 0xb4868d3c, 0x69c32784, 0x0be1aebd, 0xd6a40405, 0xcfa4bccf, 0x12e11677, 0x70c39f4e, 0xad8635f6, - 0x7c834b6c, 0xa1c6e1d4, 0xc3e468ed, 0x1ea1c255, 0x07a17a9f, 0xdae4d027, 0xb8c6591e, 0x6583f3a6, - 0x8ac7288a, 0x57828232, 0x35a00b0b, 0xe8e5a1b3, 0xf1e51979, 0x2ca0b3c1, 0x4e823af8, 0x93c79040, - 0x95e7fa51, 0x48a250e9, 0x2a80d9d0, 0xf7c57368, 0xeec5cba2, 0x3380611a, 0x51a2e823, 0x8ce7429b, - 0x63a399b7, 0xbee6330f, 0xdcc4ba36, 0x0181108e, 0x1881a844, 0xc5c402fc, 0xa7e68bc5, 0x7aa3217d, - 0x52a0c93f, 0x8fe56387, 0xedc7eabe, 0x30824006, 0x2982f8cc, 0xf4c75274, 0x96e5db4d, 0x4ba071f5, - 0xa4e4aad9, 0x79a10061, 0x1b838958, 0xc6c623e0, 0xdfc69b2a, 0x02833192, 0x60a1b8ab, 0xbde41213, - 0xbbc47802, 0x6681d2ba, 0x04a35b83, 0xd9e6f13b, 0xc0e649f1, 0x1da3e349, 0x7f816a70, 0xa2c4c0c8, - 0x4d801be4, 0x90c5b15c, 0xf2e73865, 0x2fa292dd, 0x36a22a17, 0xebe780af, 0x89c50996, 0x5480a32e, - 0x8585ddb4, 0x58c0770c, 0x3ae2fe35, 0xe7a7548d, 0xfea7ec47, 0x23e246ff, 0x41c0cfc6, 0x9c85657e, - 0x73c1be52, 0xae8414ea, 0xcca69dd3, 0x11e3376b, 0x08e38fa1, 0xd5a62519, 0xb784ac20, 0x6ac10698, - 0x6ce16c89, 0xb1a4c631, 0xd3864f08, 0x0ec3e5b0, 0x17c35d7a, 0xca86f7c2, 0xa8a47efb, 0x75e1d443, - 0x9aa50f6f, 0x47e0a5d7, 0x25c22cee, 0xf8878656, 0xe1873e9c, 0x3cc29424, 0x5ee01d1d, 0x83a5b7a5, - 0xf90696d8, 0x24433c60, 0x4661b559, 0x9b241fe1, 0x8224a72b, 0x5f610d93, 0x3d4384aa, 0xe0062e12, - 0x0f42f53e, 0xd2075f86, 0xb025d6bf, 0x6d607c07, 0x7460c4cd, 0xa9256e75, 0xcb07e74c, 0x16424df4, - 0x106227e5, 0xcd278d5d, 0xaf050464, 0x7240aedc, 0x6b401616, 0xb605bcae, 0xd4273597, 0x09629f2f, - 0xe6264403, 0x3b63eebb, 0x59416782, 0x8404cd3a, 0x9d0475f0, 0x4041df48, 0x22635671, 0xff26fcc9, - 0x2e238253, 0xf36628eb, 0x9144a1d2, 0x4c010b6a, 0x5501b3a0, 0x88441918, 0xea669021, 0x37233a99, - 0xd867e1b5, 0x05224b0d, 0x6700c234, 0xba45688c, 0xa345d046, 0x7e007afe, 0x1c22f3c7, 0xc167597f, - 0xc747336e, 0x1a0299d6, 0x782010ef, 0xa565ba57, 0xbc65029d, 0x6120a825, 0x0302211c, 0xde478ba4, - 0x31035088, 0xec46fa30, 0x8e647309, 0x5321d9b1, 0x4a21617b, 0x9764cbc3, 
0xf54642fa, 0x2803e842 - }, - { - 0x00000000, 0x38116fac, 0x7022df58, 0x4833b0f4, 0xe045beb0, 0xd854d11c, 0x906761e8, 0xa8760e44, - 0xc5670b91, 0xfd76643d, 0xb545d4c9, 0x8d54bb65, 0x2522b521, 0x1d33da8d, 0x55006a79, 0x6d1105d5, - 0x8f2261d3, 0xb7330e7f, 0xff00be8b, 0xc711d127, 0x6f67df63, 0x5776b0cf, 0x1f45003b, 0x27546f97, - 0x4a456a42, 0x725405ee, 0x3a67b51a, 0x0276dab6, 0xaa00d4f2, 0x9211bb5e, 0xda220baa, 0xe2336406, - 0x1ba8b557, 0x23b9dafb, 0x6b8a6a0f, 0x539b05a3, 0xfbed0be7, 0xc3fc644b, 0x8bcfd4bf, 0xb3debb13, - 0xdecfbec6, 0xe6ded16a, 0xaeed619e, 0x96fc0e32, 0x3e8a0076, 0x069b6fda, 0x4ea8df2e, 0x76b9b082, - 0x948ad484, 0xac9bbb28, 0xe4a80bdc, 0xdcb96470, 0x74cf6a34, 0x4cde0598, 0x04edb56c, 0x3cfcdac0, - 0x51eddf15, 0x69fcb0b9, 0x21cf004d, 0x19de6fe1, 0xb1a861a5, 0x89b90e09, 0xc18abefd, 0xf99bd151, - 0x37516aae, 0x0f400502, 0x4773b5f6, 0x7f62da5a, 0xd714d41e, 0xef05bbb2, 0xa7360b46, 0x9f2764ea, - 0xf236613f, 0xca270e93, 0x8214be67, 0xba05d1cb, 0x1273df8f, 0x2a62b023, 0x625100d7, 0x5a406f7b, - 0xb8730b7d, 0x806264d1, 0xc851d425, 0xf040bb89, 0x5836b5cd, 0x6027da61, 0x28146a95, 0x10050539, - 0x7d1400ec, 0x45056f40, 0x0d36dfb4, 0x3527b018, 0x9d51be5c, 0xa540d1f0, 0xed736104, 0xd5620ea8, - 0x2cf9dff9, 0x14e8b055, 0x5cdb00a1, 0x64ca6f0d, 0xccbc6149, 0xf4ad0ee5, 0xbc9ebe11, 0x848fd1bd, - 0xe99ed468, 0xd18fbbc4, 0x99bc0b30, 0xa1ad649c, 0x09db6ad8, 0x31ca0574, 0x79f9b580, 0x41e8da2c, - 0xa3dbbe2a, 0x9bcad186, 0xd3f96172, 0xebe80ede, 0x439e009a, 0x7b8f6f36, 0x33bcdfc2, 0x0badb06e, - 0x66bcb5bb, 0x5eadda17, 0x169e6ae3, 0x2e8f054f, 0x86f90b0b, 0xbee864a7, 0xf6dbd453, 0xcecabbff, - 0x6ea2d55c, 0x56b3baf0, 0x1e800a04, 0x269165a8, 0x8ee76bec, 0xb6f60440, 0xfec5b4b4, 0xc6d4db18, - 0xabc5decd, 0x93d4b161, 0xdbe70195, 0xe3f66e39, 0x4b80607d, 0x73910fd1, 0x3ba2bf25, 0x03b3d089, - 0xe180b48f, 0xd991db23, 0x91a26bd7, 0xa9b3047b, 0x01c50a3f, 0x39d46593, 0x71e7d567, 0x49f6bacb, - 0x24e7bf1e, 0x1cf6d0b2, 0x54c56046, 0x6cd40fea, 0xc4a201ae, 0xfcb36e02, 0xb480def6, 0x8c91b15a, - 0x750a600b, 0x4d1b0fa7, 0x0528bf53, 0x3d39d0ff, 0x954fdebb, 0xad5eb117, 0xe56d01e3, 0xdd7c6e4f, - 0xb06d6b9a, 0x887c0436, 0xc04fb4c2, 0xf85edb6e, 0x5028d52a, 0x6839ba86, 0x200a0a72, 0x181b65de, - 0xfa2801d8, 0xc2396e74, 0x8a0ade80, 0xb21bb12c, 0x1a6dbf68, 0x227cd0c4, 0x6a4f6030, 0x525e0f9c, - 0x3f4f0a49, 0x075e65e5, 0x4f6dd511, 0x777cbabd, 0xdf0ab4f9, 0xe71bdb55, 0xaf286ba1, 0x9739040d, - 0x59f3bff2, 0x61e2d05e, 0x29d160aa, 0x11c00f06, 0xb9b60142, 0x81a76eee, 0xc994de1a, 0xf185b1b6, - 0x9c94b463, 0xa485dbcf, 0xecb66b3b, 0xd4a70497, 0x7cd10ad3, 0x44c0657f, 0x0cf3d58b, 0x34e2ba27, - 0xd6d1de21, 0xeec0b18d, 0xa6f30179, 0x9ee26ed5, 0x36946091, 0x0e850f3d, 0x46b6bfc9, 0x7ea7d065, - 0x13b6d5b0, 0x2ba7ba1c, 0x63940ae8, 0x5b856544, 0xf3f36b00, 0xcbe204ac, 0x83d1b458, 0xbbc0dbf4, - 0x425b0aa5, 0x7a4a6509, 0x3279d5fd, 0x0a68ba51, 0xa21eb415, 0x9a0fdbb9, 0xd23c6b4d, 0xea2d04e1, - 0x873c0134, 0xbf2d6e98, 0xf71ede6c, 0xcf0fb1c0, 0x6779bf84, 0x5f68d028, 0x175b60dc, 0x2f4a0f70, - 0xcd796b76, 0xf56804da, 0xbd5bb42e, 0x854adb82, 0x2d3cd5c6, 0x152dba6a, 0x5d1e0a9e, 0x650f6532, - 0x081e60e7, 0x300f0f4b, 0x783cbfbf, 0x402dd013, 0xe85bde57, 0xd04ab1fb, 0x9879010f, 0xa0686ea3 - }, - { - 0x00000000, 0xef306b19, 0xdb8ca0c3, 0x34bccbda, 0xb2f53777, 0x5dc55c6e, 0x697997b4, 0x8649fcad, - 0x6006181f, 0x8f367306, 0xbb8ab8dc, 0x54bad3c5, 0xd2f32f68, 0x3dc34471, 0x097f8fab, 0xe64fe4b2, - 0xc00c303e, 0x2f3c5b27, 0x1b8090fd, 0xf4b0fbe4, 0x72f90749, 0x9dc96c50, 0xa975a78a, 0x4645cc93, - 0xa00a2821, 0x4f3a4338, 0x7b8688e2, 0x94b6e3fb, 0x12ff1f56, 0xfdcf744f, 
0xc973bf95, 0x2643d48c, - 0x85f4168d, 0x6ac47d94, 0x5e78b64e, 0xb148dd57, 0x370121fa, 0xd8314ae3, 0xec8d8139, 0x03bdea20, - 0xe5f20e92, 0x0ac2658b, 0x3e7eae51, 0xd14ec548, 0x570739e5, 0xb83752fc, 0x8c8b9926, 0x63bbf23f, - 0x45f826b3, 0xaac84daa, 0x9e748670, 0x7144ed69, 0xf70d11c4, 0x183d7add, 0x2c81b107, 0xc3b1da1e, - 0x25fe3eac, 0xcace55b5, 0xfe729e6f, 0x1142f576, 0x970b09db, 0x783b62c2, 0x4c87a918, 0xa3b7c201, - 0x0e045beb, 0xe13430f2, 0xd588fb28, 0x3ab89031, 0xbcf16c9c, 0x53c10785, 0x677dcc5f, 0x884da746, - 0x6e0243f4, 0x813228ed, 0xb58ee337, 0x5abe882e, 0xdcf77483, 0x33c71f9a, 0x077bd440, 0xe84bbf59, - 0xce086bd5, 0x213800cc, 0x1584cb16, 0xfab4a00f, 0x7cfd5ca2, 0x93cd37bb, 0xa771fc61, 0x48419778, - 0xae0e73ca, 0x413e18d3, 0x7582d309, 0x9ab2b810, 0x1cfb44bd, 0xf3cb2fa4, 0xc777e47e, 0x28478f67, - 0x8bf04d66, 0x64c0267f, 0x507ceda5, 0xbf4c86bc, 0x39057a11, 0xd6351108, 0xe289dad2, 0x0db9b1cb, - 0xebf65579, 0x04c63e60, 0x307af5ba, 0xdf4a9ea3, 0x5903620e, 0xb6330917, 0x828fc2cd, 0x6dbfa9d4, - 0x4bfc7d58, 0xa4cc1641, 0x9070dd9b, 0x7f40b682, 0xf9094a2f, 0x16392136, 0x2285eaec, 0xcdb581f5, - 0x2bfa6547, 0xc4ca0e5e, 0xf076c584, 0x1f46ae9d, 0x990f5230, 0x763f3929, 0x4283f2f3, 0xadb399ea, - 0x1c08b7d6, 0xf338dccf, 0xc7841715, 0x28b47c0c, 0xaefd80a1, 0x41cdebb8, 0x75712062, 0x9a414b7b, - 0x7c0eafc9, 0x933ec4d0, 0xa7820f0a, 0x48b26413, 0xcefb98be, 0x21cbf3a7, 0x1577387d, 0xfa475364, - 0xdc0487e8, 0x3334ecf1, 0x0788272b, 0xe8b84c32, 0x6ef1b09f, 0x81c1db86, 0xb57d105c, 0x5a4d7b45, - 0xbc029ff7, 0x5332f4ee, 0x678e3f34, 0x88be542d, 0x0ef7a880, 0xe1c7c399, 0xd57b0843, 0x3a4b635a, - 0x99fca15b, 0x76ccca42, 0x42700198, 0xad406a81, 0x2b09962c, 0xc439fd35, 0xf08536ef, 0x1fb55df6, - 0xf9fab944, 0x16cad25d, 0x22761987, 0xcd46729e, 0x4b0f8e33, 0xa43fe52a, 0x90832ef0, 0x7fb345e9, - 0x59f09165, 0xb6c0fa7c, 0x827c31a6, 0x6d4c5abf, 0xeb05a612, 0x0435cd0b, 0x308906d1, 0xdfb96dc8, - 0x39f6897a, 0xd6c6e263, 0xe27a29b9, 0x0d4a42a0, 0x8b03be0d, 0x6433d514, 0x508f1ece, 0xbfbf75d7, - 0x120cec3d, 0xfd3c8724, 0xc9804cfe, 0x26b027e7, 0xa0f9db4a, 0x4fc9b053, 0x7b757b89, 0x94451090, - 0x720af422, 0x9d3a9f3b, 0xa98654e1, 0x46b63ff8, 0xc0ffc355, 0x2fcfa84c, 0x1b736396, 0xf443088f, - 0xd200dc03, 0x3d30b71a, 0x098c7cc0, 0xe6bc17d9, 0x60f5eb74, 0x8fc5806d, 0xbb794bb7, 0x544920ae, - 0xb206c41c, 0x5d36af05, 0x698a64df, 0x86ba0fc6, 0x00f3f36b, 0xefc39872, 0xdb7f53a8, 0x344f38b1, - 0x97f8fab0, 0x78c891a9, 0x4c745a73, 0xa344316a, 0x250dcdc7, 0xca3da6de, 0xfe816d04, 0x11b1061d, - 0xf7fee2af, 0x18ce89b6, 0x2c72426c, 0xc3422975, 0x450bd5d8, 0xaa3bbec1, 0x9e87751b, 0x71b71e02, - 0x57f4ca8e, 0xb8c4a197, 0x8c786a4d, 0x63480154, 0xe501fdf9, 0x0a3196e0, 0x3e8d5d3a, 0xd1bd3623, - 0x37f2d291, 0xd8c2b988, 0xec7e7252, 0x034e194b, 0x8507e5e6, 0x6a378eff, 0x5e8b4525, 0xb1bb2e3c - }, - { - 0x00000000, 0x68032cc8, 0xd0065990, 0xb8057558, 0xa5e0c5d1, 0xcde3e919, 0x75e69c41, 0x1de5b089, - 0x4e2dfd53, 0x262ed19b, 0x9e2ba4c3, 0xf628880b, 0xebcd3882, 0x83ce144a, 0x3bcb6112, 0x53c84dda, - 0x9c5bfaa6, 0xf458d66e, 0x4c5da336, 0x245e8ffe, 0x39bb3f77, 0x51b813bf, 0xe9bd66e7, 0x81be4a2f, - 0xd27607f5, 0xba752b3d, 0x02705e65, 0x6a7372ad, 0x7796c224, 0x1f95eeec, 0xa7909bb4, 0xcf93b77c, - 0x3d5b83bd, 0x5558af75, 0xed5dda2d, 0x855ef6e5, 0x98bb466c, 0xf0b86aa4, 0x48bd1ffc, 0x20be3334, - 0x73767eee, 0x1b755226, 0xa370277e, 0xcb730bb6, 0xd696bb3f, 0xbe9597f7, 0x0690e2af, 0x6e93ce67, - 0xa100791b, 0xc90355d3, 0x7106208b, 0x19050c43, 0x04e0bcca, 0x6ce39002, 0xd4e6e55a, 0xbce5c992, - 0xef2d8448, 0x872ea880, 0x3f2bddd8, 0x5728f110, 0x4acd4199, 0x22ce6d51, 0x9acb1809, 
0xf2c834c1, - 0x7ab7077a, 0x12b42bb2, 0xaab15eea, 0xc2b27222, 0xdf57c2ab, 0xb754ee63, 0x0f519b3b, 0x6752b7f3, - 0x349afa29, 0x5c99d6e1, 0xe49ca3b9, 0x8c9f8f71, 0x917a3ff8, 0xf9791330, 0x417c6668, 0x297f4aa0, - 0xe6ecfddc, 0x8eefd114, 0x36eaa44c, 0x5ee98884, 0x430c380d, 0x2b0f14c5, 0x930a619d, 0xfb094d55, - 0xa8c1008f, 0xc0c22c47, 0x78c7591f, 0x10c475d7, 0x0d21c55e, 0x6522e996, 0xdd279cce, 0xb524b006, - 0x47ec84c7, 0x2fefa80f, 0x97eadd57, 0xffe9f19f, 0xe20c4116, 0x8a0f6dde, 0x320a1886, 0x5a09344e, - 0x09c17994, 0x61c2555c, 0xd9c72004, 0xb1c40ccc, 0xac21bc45, 0xc422908d, 0x7c27e5d5, 0x1424c91d, - 0xdbb77e61, 0xb3b452a9, 0x0bb127f1, 0x63b20b39, 0x7e57bbb0, 0x16549778, 0xae51e220, 0xc652cee8, - 0x959a8332, 0xfd99affa, 0x459cdaa2, 0x2d9ff66a, 0x307a46e3, 0x58796a2b, 0xe07c1f73, 0x887f33bb, - 0xf56e0ef4, 0x9d6d223c, 0x25685764, 0x4d6b7bac, 0x508ecb25, 0x388de7ed, 0x808892b5, 0xe88bbe7d, - 0xbb43f3a7, 0xd340df6f, 0x6b45aa37, 0x034686ff, 0x1ea33676, 0x76a01abe, 0xcea56fe6, 0xa6a6432e, - 0x6935f452, 0x0136d89a, 0xb933adc2, 0xd130810a, 0xccd53183, 0xa4d61d4b, 0x1cd36813, 0x74d044db, - 0x27180901, 0x4f1b25c9, 0xf71e5091, 0x9f1d7c59, 0x82f8ccd0, 0xeafbe018, 0x52fe9540, 0x3afdb988, - 0xc8358d49, 0xa036a181, 0x1833d4d9, 0x7030f811, 0x6dd54898, 0x05d66450, 0xbdd31108, 0xd5d03dc0, - 0x8618701a, 0xee1b5cd2, 0x561e298a, 0x3e1d0542, 0x23f8b5cb, 0x4bfb9903, 0xf3feec5b, 0x9bfdc093, - 0x546e77ef, 0x3c6d5b27, 0x84682e7f, 0xec6b02b7, 0xf18eb23e, 0x998d9ef6, 0x2188ebae, 0x498bc766, - 0x1a438abc, 0x7240a674, 0xca45d32c, 0xa246ffe4, 0xbfa34f6d, 0xd7a063a5, 0x6fa516fd, 0x07a63a35, - 0x8fd9098e, 0xe7da2546, 0x5fdf501e, 0x37dc7cd6, 0x2a39cc5f, 0x423ae097, 0xfa3f95cf, 0x923cb907, - 0xc1f4f4dd, 0xa9f7d815, 0x11f2ad4d, 0x79f18185, 0x6414310c, 0x0c171dc4, 0xb412689c, 0xdc114454, - 0x1382f328, 0x7b81dfe0, 0xc384aab8, 0xab878670, 0xb66236f9, 0xde611a31, 0x66646f69, 0x0e6743a1, - 0x5daf0e7b, 0x35ac22b3, 0x8da957eb, 0xe5aa7b23, 0xf84fcbaa, 0x904ce762, 0x2849923a, 0x404abef2, - 0xb2828a33, 0xda81a6fb, 0x6284d3a3, 0x0a87ff6b, 0x17624fe2, 0x7f61632a, 0xc7641672, 0xaf673aba, - 0xfcaf7760, 0x94ac5ba8, 0x2ca92ef0, 0x44aa0238, 0x594fb2b1, 0x314c9e79, 0x8949eb21, 0xe14ac7e9, - 0x2ed97095, 0x46da5c5d, 0xfedf2905, 0x96dc05cd, 0x8b39b544, 0xe33a998c, 0x5b3fecd4, 0x333cc01c, - 0x60f48dc6, 0x08f7a10e, 0xb0f2d456, 0xd8f1f89e, 0xc5144817, 0xad1764df, 0x15121187, 0x7d113d4f - }, - { - 0x00000000, 0x493c7d27, 0x9278fa4e, 0xdb448769, 0x211d826d, 0x6821ff4a, 0xb3657823, 0xfa590504, - 0x423b04da, 0x0b0779fd, 0xd043fe94, 0x997f83b3, 0x632686b7, 0x2a1afb90, 0xf15e7cf9, 0xb86201de, - 0x847609b4, 0xcd4a7493, 0x160ef3fa, 0x5f328edd, 0xa56b8bd9, 0xec57f6fe, 0x37137197, 0x7e2f0cb0, - 0xc64d0d6e, 0x8f717049, 0x5435f720, 0x1d098a07, 0xe7508f03, 0xae6cf224, 0x7528754d, 0x3c14086a, - 0x0d006599, 0x443c18be, 0x9f789fd7, 0xd644e2f0, 0x2c1de7f4, 0x65219ad3, 0xbe651dba, 0xf759609d, - 0x4f3b6143, 0x06071c64, 0xdd439b0d, 0x947fe62a, 0x6e26e32e, 0x271a9e09, 0xfc5e1960, 0xb5626447, - 0x89766c2d, 0xc04a110a, 0x1b0e9663, 0x5232eb44, 0xa86bee40, 0xe1579367, 0x3a13140e, 0x732f6929, - 0xcb4d68f7, 0x827115d0, 0x593592b9, 0x1009ef9e, 0xea50ea9a, 0xa36c97bd, 0x782810d4, 0x31146df3, - 0x1a00cb32, 0x533cb615, 0x8878317c, 0xc1444c5b, 0x3b1d495f, 0x72213478, 0xa965b311, 0xe059ce36, - 0x583bcfe8, 0x1107b2cf, 0xca4335a6, 0x837f4881, 0x79264d85, 0x301a30a2, 0xeb5eb7cb, 0xa262caec, - 0x9e76c286, 0xd74abfa1, 0x0c0e38c8, 0x453245ef, 0xbf6b40eb, 0xf6573dcc, 0x2d13baa5, 0x642fc782, - 0xdc4dc65c, 0x9571bb7b, 0x4e353c12, 0x07094135, 0xfd504431, 0xb46c3916, 0x6f28be7f, 0x2614c358, - 
0x1700aeab, 0x5e3cd38c, 0x857854e5, 0xcc4429c2, 0x361d2cc6, 0x7f2151e1, 0xa465d688, 0xed59abaf, - 0x553baa71, 0x1c07d756, 0xc743503f, 0x8e7f2d18, 0x7426281c, 0x3d1a553b, 0xe65ed252, 0xaf62af75, - 0x9376a71f, 0xda4ada38, 0x010e5d51, 0x48322076, 0xb26b2572, 0xfb575855, 0x2013df3c, 0x692fa21b, - 0xd14da3c5, 0x9871dee2, 0x4335598b, 0x0a0924ac, 0xf05021a8, 0xb96c5c8f, 0x6228dbe6, 0x2b14a6c1, - 0x34019664, 0x7d3deb43, 0xa6796c2a, 0xef45110d, 0x151c1409, 0x5c20692e, 0x8764ee47, 0xce589360, - 0x763a92be, 0x3f06ef99, 0xe44268f0, 0xad7e15d7, 0x572710d3, 0x1e1b6df4, 0xc55fea9d, 0x8c6397ba, - 0xb0779fd0, 0xf94be2f7, 0x220f659e, 0x6b3318b9, 0x916a1dbd, 0xd856609a, 0x0312e7f3, 0x4a2e9ad4, - 0xf24c9b0a, 0xbb70e62d, 0x60346144, 0x29081c63, 0xd3511967, 0x9a6d6440, 0x4129e329, 0x08159e0e, - 0x3901f3fd, 0x703d8eda, 0xab7909b3, 0xe2457494, 0x181c7190, 0x51200cb7, 0x8a648bde, 0xc358f6f9, - 0x7b3af727, 0x32068a00, 0xe9420d69, 0xa07e704e, 0x5a27754a, 0x131b086d, 0xc85f8f04, 0x8163f223, - 0xbd77fa49, 0xf44b876e, 0x2f0f0007, 0x66337d20, 0x9c6a7824, 0xd5560503, 0x0e12826a, 0x472eff4d, - 0xff4cfe93, 0xb67083b4, 0x6d3404dd, 0x240879fa, 0xde517cfe, 0x976d01d9, 0x4c2986b0, 0x0515fb97, - 0x2e015d56, 0x673d2071, 0xbc79a718, 0xf545da3f, 0x0f1cdf3b, 0x4620a21c, 0x9d642575, 0xd4585852, - 0x6c3a598c, 0x250624ab, 0xfe42a3c2, 0xb77edee5, 0x4d27dbe1, 0x041ba6c6, 0xdf5f21af, 0x96635c88, - 0xaa7754e2, 0xe34b29c5, 0x380faeac, 0x7133d38b, 0x8b6ad68f, 0xc256aba8, 0x19122cc1, 0x502e51e6, - 0xe84c5038, 0xa1702d1f, 0x7a34aa76, 0x3308d751, 0xc951d255, 0x806daf72, 0x5b29281b, 0x1215553c, - 0x230138cf, 0x6a3d45e8, 0xb179c281, 0xf845bfa6, 0x021cbaa2, 0x4b20c785, 0x906440ec, 0xd9583dcb, - 0x613a3c15, 0x28064132, 0xf342c65b, 0xba7ebb7c, 0x4027be78, 0x091bc35f, 0xd25f4436, 0x9b633911, - 0xa777317b, 0xee4b4c5c, 0x350fcb35, 0x7c33b612, 0x866ab316, 0xcf56ce31, 0x14124958, 0x5d2e347f, - 0xe54c35a1, 0xac704886, 0x7734cfef, 0x3e08b2c8, 0xc451b7cc, 0x8d6dcaeb, 0x56294d82, 0x1f1530a5 - } -#endif//SIMD_BIG_ENDIAN - }; - - uint32_t Crc32c(const void * src, size_t size) - { - const uint8_t * p = (const uint8_t*)src; - uint32_t crc = 0xFFFFFFFF; - - for (; ((uintptr_t)p & (sizeof(uint32_t) - 1)) != 0 && size > 0; ++p, --size) - { -#ifdef SIMD_BIG_ENDIAN - crc = Crc32cTable[0][((crc >> 24) ^ *p) & 0xFF] ^ (crc << 8); -#else - crc = Crc32cTable[0][(crc ^ *p) & 0xFF] ^ (crc >> 8); -#endif - } - - for (; size >= sizeof(uint64_t); size -= sizeof(uint64_t)) - { - crc ^= *(uint32_t *)p; - p += sizeof(uint32_t); - uint32_t next = *(uint32_t *)p; - p += sizeof(uint32_t); - crc = -#ifdef SIMD_BIG_ENDIAN - Crc32cTable[4][(crc) & 0xFF] ^ - Crc32cTable[5][(crc >> 8) & 0xFF] ^ - Crc32cTable[6][(crc >> 16) & 0xFF] ^ - Crc32cTable[7][(crc >> 24)] ^ - Crc32cTable[0][(next) & 0xFF] ^ - Crc32cTable[1][(next >> 8) & 0xFF] ^ - Crc32cTable[2][(next >> 16) & 0xFF] ^ - Crc32cTable[3][(next >> 24)]; -#else - Crc32cTable[7][(crc) & 0xFF] ^ - Crc32cTable[6][(crc >> 8) & 0xFF] ^ - Crc32cTable[5][(crc >> 16) & 0xFF] ^ - Crc32cTable[4][(crc >> 24)] ^ - Crc32cTable[3][(next) & 0xFF] ^ - Crc32cTable[2][(next >> 8) & 0xFF] ^ - Crc32cTable[1][(next >> 16) & 0xFF] ^ - Crc32cTable[0][(next >> 24)]; -#endif - } - -#ifdef SIMD_BIG_ENDIAN - for (; size > 0; ++p, size--) - crc = Crc32cTable[0][((crc >> 24) ^ *p) & 0xFF] ^ (crc << 8); - - crc = - ((crc << 24) & 0xFF000000) | - ((crc << 8) & 0x00FF0000) | - ((crc >> 8) & 0x0000FF00) | - ((crc >> 24) & 0x000000FF); -#else - for (; size > 0; ++p, size--) - crc = Crc32cTable[0][(crc ^ *p) & 0xFF] ^ (crc >> 8); -#endif - return (~crc); - } - } -} 
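The Crc32c routine above is a slicing-by-8 CRC-32C (Castagnoli polynomial 0x1EDC6F41): it first consumes bytes until the pointer is 4-byte aligned, then takes 8 input bytes per iteration with two 32-bit loads and eight lookups into the 8x256 tables, and finishes the tail byte-wise. Any replacement can be validated against the standard CRC-32C test vector: the CRC of the ASCII string "123456789" is 0xE3069283. A minimal bit-at-a-time reference, self-contained and intentionally slow, for validation only (this sketch is not the library's code):

#include <cstdint>
#include <cstddef>
#include <cstdio>
#include <cstring>

// Bit-at-a-time CRC-32C using the reflected polynomial 0x82F63B78.
static uint32_t Crc32cBitwise(const void * src, size_t size)
{
    const uint8_t * p = (const uint8_t *)src;
    uint32_t crc = 0xFFFFFFFF;
    for (size_t i = 0; i < size; ++i)
    {
        crc ^= p[i];
        for (int k = 0; k < 8; ++k)
            crc = (crc >> 1) ^ (0x82F63B78 & (0u - (crc & 1))); // branch-free conditional XOR
    }
    return ~crc;
}

int main()
{
    const char check[] = "123456789";
    printf("0x%08X\n", Crc32cBitwise(check, strlen(check))); // prints 0xE3069283
    return 0;
}

The same polynomial is implemented in hardware by the SSE4.2 crc32 instruction and the ARMv8 CRC32C extension, which makes hardware acceleration a drop-in alternative to the table-driven path where available.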
diff --git a/src/3rd/Simd/Simd/SimdBaseDeinterleave.cpp b/src/3rd/Simd/Simd/SimdBaseDeinterleave.cpp deleted file mode 100644 index 01563d1b..00000000 --- a/src/3rd/Simd/Simd/SimdBaseDeinterleave.cpp +++ /dev/null @@ -1,84 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdDefs.h" - -namespace Simd -{ - namespace Base - { - void DeinterleaveUv(const uint8_t * uv, size_t uvStride, size_t width, size_t height, - uint8_t * u, size_t uStride, uint8_t * v, size_t vStride) - { - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0, offset = 0; col < width; ++col, offset += 2) - { - u[col] = uv[offset]; - v[col] = uv[offset + 1]; - } - uv += uvStride; - u += uStride; - v += vStride; - } - } - - void DeinterleaveBgr(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, - uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride) - { - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0, offset = 0; col < width; ++col, offset += 3) - { - b[col] = bgr[offset + 0]; - g[col] = bgr[offset + 1]; - r[col] = bgr[offset + 2]; - } - bgr += bgrStride; - b += bStride; - g += gStride; - r += rStride; - } - } - - void DeinterleaveBgra(const uint8_t * bgra, size_t bgraStride, size_t width, size_t height, - uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride, uint8_t * a, size_t aStride) - { - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0, offset = 0; col < width; ++col, offset += 4) - { - b[col] = bgra[offset + 0]; - g[col] = bgra[offset + 1]; - r[col] = bgra[offset + 2]; - a[col] = bgra[offset + 3]; - } - bgra += bgraStride; - b += bStride; - g += gStride; - r += rStride; - a += aStride; - } - } - } -} diff --git a/src/3rd/Simd/Simd/SimdBaseDetection.cpp b/src/3rd/Simd/Simd/SimdBaseDetection.cpp deleted file mode 100644 index 4edeab50..00000000 --- a/src/3rd/Simd/Simd/SimdBaseDetection.cpp +++ /dev/null @@ -1,918 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2019 Yermalayeu Ihar, -* 2019-2019 Facundo Galan. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdDetection.h" -#include "Simd/SimdXml.hpp" - -#include -#include -#include -#include - -#define SIMD_EX(message) \ -{ \ - std::stringstream __ss; \ - __ss << message; \ - std::cerr << __ss.str().c_str() << std::endl; \ - throw std::runtime_error(__ss.str().c_str()); \ -} - -namespace Simd -{ - namespace Base - { - namespace Xml - { - typedef Simd::Xml::File File; - typedef Simd::Xml::Document Document; - typedef Simd::Xml::Node Node; - - template T FromString(const std::string & s) - { - T t; - std::stringstream(s) >> t; - return t; - } - - template inline InputIterator FindNotSpace(InputIterator first, InputIterator last) - { - while (first != last) - { - if (!isspace(*first)) - return first; - ++first; - } - return last; - } - - template <> inline std::string FromString(const std::string & s) - { - std::string str(s); - str.erase(str.begin(), FindNotSpace(str.begin(), str.end())); - str.erase(FindNotSpace(str.rbegin(), str.rend()).base(), str.end()); - return str; - } - - template inline T GetValue(Node * parent) - { - if (parent == NULL) - SIMD_EX("Invalid element!"); - Node * child = parent->FirstNode(); - if (child == NULL) - SIMD_EX("Invalid node!"); - return FromString(child->Value()); - } - - template inline T GetValue(Node * parent, const char * name) - { - if (parent == NULL) - SIMD_EX("Invalid element!"); - return GetValue(parent->FirstNode(name)); - } - - template inline std::vector GetValues(Node * parent) - { - if (parent == NULL) - SIMD_EX("Invalid element!"); - Node * child = parent->FirstNode(); - if (child == NULL) - SIMD_EX("Invalid node!"); - std::stringstream ss(child->Value()); - std::vector values; - while (!ss.eof()) - { - T value; - ss >> value; - values.push_back(value); - } - return values; - } - - template inline std::vector GetValues(Node * parent, const char * name) - { - if (parent == NULL) - SIMD_EX("Invalid element!"); - return GetValues(parent->FirstNode(name)); - } - - inline size_t GetSize(Node * parent) - { - return Simd::Xml::CountChildren(parent); - } - } - - namespace Names - { - const char * cascade = "cascade"; - const char * BOOST = "BOOST"; - const char * stageType = "stageType"; - const char * featureType = "featureType"; - const char * HAAR = "HAAR"; - const char * LBP = "LBP"; - const char * HOG = "HOG"; - const char * width = "width"; - const char * height = "height"; - const char * stageParams = "stageParams"; - const char * maxDepth = 
"maxDepth"; - const char * featureParams = "featureParams"; - const char * maxCatCount = "maxCatCount"; - const char * stages = "stages"; - const char * stageThreshold = "stageThreshold"; - const char * weakClassifiers = "weakClassifiers"; - const char * internalNodes = "internalNodes"; - const char * leafValues = "leafValues"; - const char * features = "features"; - const char * rects = "rects"; - const char * tilted = "tilted"; - const char * rect = "rect"; - } - - void * DetectionLoadStringXml(char * xml, const char * path) - { - static const float THRESHOLD_EPS = 1e-5f; - - Data * data = NULL; - try - { - Xml::Document doc; - doc.Parse<0>(xml); - - Xml::Node * root = doc.FirstNode(); - if (root == NULL) { - if (path == NULL) { - SIMD_EX("Invalid format of XML string!"); - } - else { - SIMD_EX("Invalid format of XML file '" << path << "'!"); - } - } - - Xml::Node * cascade = root->FirstNode(Names::cascade); - if (cascade == NULL) - return data; - - data = new Data(); - - if (Xml::GetValue(cascade, Names::stageType) != Names::BOOST) - SIMD_EX("Invalid cascade stage type!"); - data->stageType = 0; - - std::string featureType = Xml::GetValue(cascade, Names::featureType); - if (featureType == Names::HAAR) - data->featureType = SimdDetectionInfoFeatureHaar; - else if (featureType == Names::LBP) - data->featureType = SimdDetectionInfoFeatureLbp; - else if (featureType == Names::HOG) - SIMD_EX("HOG feature type is not supported!") - else - SIMD_EX("Invalid cascade feature type!"); - - data->origWinSize.x = Xml::GetValue(cascade, Names::width); - data->origWinSize.y = Xml::GetValue(cascade, Names::height); - if (data->origWinSize.x <= 0 || data->origWinSize.y <= 0) - SIMD_EX("Invalid cascade width or height!"); - - Xml::Node * stageParams = cascade->FirstNode(Names::stageParams); - if (stageParams && stageParams->FirstNode(Names::maxDepth)) - data->isStumpBased = Xml::GetValue(stageParams, Names::maxDepth) == 1 ? true : false; - else - data->isStumpBased = true; - - if (!data->isStumpBased) - SIMD_EX("Tree classifier cascades are not supported!"); - - Xml::Node * featureParams = cascade->FirstNode(Names::featureParams); - data->ncategories = Xml::GetValue(featureParams, Names::maxCatCount); - int subsetSize = (data->ncategories + 31) / 32; - int nodeStep = 3 + (data->ncategories > 0 ? 
subsetSize : 1); - - Xml::Node * stages = cascade->FirstNode(Names::stages); - if (stages == NULL) - SIMD_EX("Invalid stages count!"); - data->stages.reserve(Xml::GetSize(stages)); - int stageIndex = 0; - for (Xml::Node * stageNode = stages->FirstNode(); stageNode != NULL; stageNode = stageNode->NextSibling(), ++stageIndex) - { - Data::Stage stage; - stage.threshold = Xml::GetValue(stageNode, Names::stageThreshold) - THRESHOLD_EPS; - - Xml::Node * weakClassifiers = stageNode->FirstNode(Names::weakClassifiers); - if (weakClassifiers == NULL) - SIMD_EX("Invalid weak classifiers count!"); - stage.ntrees = (int)Xml::GetSize(weakClassifiers); - stage.first = (int)data->classifiers.size(); - data->stages.push_back(stage); - data->classifiers.reserve(data->stages[stageIndex].first + data->stages[stageIndex].ntrees); - - for (Xml::Node * weakClassifier = weakClassifiers->FirstNode(); weakClassifier != NULL; weakClassifier = weakClassifier->NextSibling()) - { - std::vector internalNodes = Xml::GetValues(weakClassifier, Names::internalNodes); - std::vector leafValues = Xml::GetValues(weakClassifier, Names::leafValues); - - Data::DTree tree; - tree.nodeCount = (int)internalNodes.size() / nodeStep; - if (tree.nodeCount > 1) - data->isStumpBased = false; - data->classifiers.push_back(tree); - - data->nodes.reserve(data->nodes.size() + tree.nodeCount); - data->leaves.reserve(data->leaves.size() + leafValues.size()); - if (subsetSize) - data->subsets.reserve(data->subsets.size() + tree.nodeCount*subsetSize); - - for (int n = 0; n < tree.nodeCount; ++n) - { - Data::DTreeNode node; - node.left = (int)internalNodes[n*nodeStep + 0]; - node.right = (int)internalNodes[n*nodeStep + 1]; - node.featureIdx = (int)internalNodes[n*nodeStep + 2]; - if (subsetSize) - { - for (int j = 0; j < subsetSize; j++) - data->subsets.push_back((int)internalNodes[n*nodeStep + 3 + j]); - node.threshold = 0.f; - } - else - { - node.threshold = (float)internalNodes[n*nodeStep + 3]; - } - data->nodes.push_back(node); - } - - for (size_t i = 0; i < leafValues.size(); ++i) - data->leaves.push_back(leafValues[i]); - } - } - - Xml::Node * featureNodes = cascade->FirstNode(Names::features); - if (data->featureType == SimdDetectionInfoFeatureHaar) - { - data->hasTilted = false; - data->haarFeatures.reserve(Xml::GetSize(featureNodes)); - for (Xml::Node * featureNode = featureNodes->FirstNode(); featureNode != NULL; featureNode = featureNode->NextSibling()) - { - Data::HaarFeature feature; - int rectIndex = 0; - Xml::Node * rectsNode = featureNode->FirstNode(Names::rects); - for (Xml::Node * rectNode = rectsNode->FirstNode(); rectNode != NULL; rectNode = rectNode->NextSibling(), rectIndex++) - { - std::vector values = Xml::GetValues(rectNode); - feature.rect[rectIndex].r.x = (int)values[0]; - feature.rect[rectIndex].r.y = (int)values[1]; - feature.rect[rectIndex].r.width = (int)values[2]; - feature.rect[rectIndex].r.height = (int)values[3]; - feature.rect[rectIndex].weight = (float)values[4]; - } - feature.tilted = featureNode->FirstNode(Names::tilted) && Xml::GetValue(featureNode, Names::tilted) != 0; - if (feature.tilted) - data->hasTilted = true; - data->haarFeatures.push_back(feature); - } - } - - if (data->featureType == SimdDetectionInfoFeatureLbp) - { - data->canInt16 = true; - data->lbpFeatures.reserve(Xml::GetSize(featureNodes)); - for (Xml::Node * featureNode = featureNodes->FirstNode(); featureNode != NULL; featureNode = featureNode->NextSibling()) - { - Data::LbpFeature feature; - std::vector values = Xml::GetValues(featureNode, 
Names::rect); - feature.rect.x = values[0]; - feature.rect.y = values[1]; - feature.rect.width = values[2]; - feature.rect.height = values[3]; - if (feature.rect.width*feature.rect.height > 256) - data->canInt16 = false; - data->lbpFeatures.push_back(feature); - } - } - } - catch (...) - { - delete data; - data = NULL; - } - - return data; - } - - void * DetectionLoadA(const char * path) - { - Xml::File file; - if (!file.Open(path)) - SIMD_EX("Can't load XML file '" << path << "'!"); - - return DetectionLoadStringXml(file.Data(), path); - } - - void DetectionInfo(const void * _data, size_t * width, size_t * height, SimdDetectionInfoFlags * flags) - { - Data * data = (Data*)_data; - if (data) - { - if (width) - *width = data->origWinSize.x; - if (height) - *height = data->origWinSize.y; - if (flags) - *flags = SimdDetectionInfoFlags(data->featureType | - (data->hasTilted ? SimdDetectionInfoHasTilted : 0) | - (data->canInt16 ? SimdDetectionInfoCanInt16 : 0)); - } - } - - HidHaarCascade * CreateHidHaar(const Data & data) - { - if (data.featureType != SimdDetectionInfoFeatureHaar) - SIMD_EX("It is not HAAR cascade!"); - - HidHaarCascade * hid = new HidHaarCascade(); - - hid->isThroughColumn = false; - hid->isStumpBased = data.isStumpBased; - hid->origWinSize = data.origWinSize; - - hid->trees.resize(data.classifiers.size()); - for (size_t i = 0; i < data.classifiers.size(); ++i) - hid->trees[i].nodeCount = data.classifiers[i].nodeCount; - - hid->nodes.resize(data.nodes.size()); - for (size_t i = 0; i < data.nodes.size(); ++i) - { - hid->nodes[i].featureIdx = data.nodes[i].featureIdx; - hid->nodes[i].left = data.nodes[i].left; - hid->nodes[i].right = data.nodes[i].right; - hid->nodes[i].threshold = data.nodes[i].threshold; - } - - hid->stages.resize(data.stages.size()); - hid->leaves.resize(data.leaves.size()); - for (size_t i = 0; i < data.stages.size(); ++i) - { - hid->stages[i].first = data.stages[i].first; - hid->stages[i].ntrees = data.stages[i].ntrees; - hid->stages[i].threshold = data.stages[i].threshold; - hid->stages[i].hasThree = false; - for (int j = data.stages[i].first, n = data.stages[i].first + data.stages[i].ntrees; j < n; ++j) - { - hid->leaves[2 * j + 0] = data.leaves[2 * j + 0]; - hid->leaves[2 * j + 1] = data.leaves[2 * j + 1]; - if (data.haarFeatures[data.nodes[j].featureIdx].rect[2].weight != 0) - hid->stages[i].hasThree = true; - } - } - - hid->features.resize(data.haarFeatures.size()); - for (size_t i = 0; i < hid->features.size(); ++i) - { - for (int j = 0; j < Data::HaarFeature::RECT_NUM; ++j) - hid->features[i].rect[j].weight = data.haarFeatures[i].rect[j].weight; - if (data.haarFeatures[i].tilted) - hid->hasTilted = true; - } - - return hid; - } - - template SIMD_INLINE T * SumElemPtr(const Image & view, ptrdiff_t row, ptrdiff_t col, bool throughColumn) - { - assert(view.ChannelCount() == 1 && view.ChannelSize() == sizeof(T)); - assert(row >= 0 && col >= 0 && col < (ptrdiff_t)view.width && row < (ptrdiff_t)view.height); - - if (throughColumn) - { - if (col & 1) - return (T*)& view.At(col / 2 + (view.width + 1) / 2, row); - else - return (T*)& view.At(col / 2, row); - } - else - return (T*)& view.At(col, row); - } - - static void InitBase(HidHaarCascade * hid, const Image & sum, const Image & sqsum, const Image & tilted) - { - Rect rect(1, 1, hid->origWinSize.x - 1, hid->origWinSize.y - 1); - hid->windowArea = (float)rect.Area(); - - hid->p[0] = SumElemPtr(sum, rect.top, rect.left, false); - hid->p[1] = SumElemPtr(sum, rect.top, rect.right, false); - hid->p[2] = 
SumElemPtr(sum, rect.bottom, rect.left, false); - hid->p[3] = SumElemPtr(sum, rect.bottom, rect.right, false); - - hid->pq[0] = SumElemPtr(sqsum, rect.top, rect.left, false); - hid->pq[1] = SumElemPtr(sqsum, rect.top, rect.right, false); - hid->pq[2] = SumElemPtr(sqsum, rect.bottom, rect.left, false); - hid->pq[3] = SumElemPtr(sqsum, rect.bottom, rect.right, false); - - hid->sum = sum; - hid->sqsum = sum; - hid->tilted = tilted; - } - - template SIMD_INLINE void UpdateFeaturePtrs(HidHaarCascade * hid, const Data & data) - { - Image sum = hid->isThroughColumn ? hid->isum : hid->sum; - Image tilted = hid->isThroughColumn ? hid->itilted : hid->tilted; - for (size_t i = 0; i < hid->features.size(); i++) - { - const Data::HaarFeature & df = data.haarFeatures[i]; - HidHaarCascade::Feature & hf = hid->features[i]; - for (int j = 0; j < Data::HaarFeature::RECT_NUM; ++j) - { - const Data::Rect & dr = df.rect[j].r; - WeightedRect & hr = hf.rect[j]; - if (hr.weight != 0.0) - { - if (df.tilted) - { - hr.p0 = SumElemPtr(tilted, dr.y, dr.x, hid->isThroughColumn); - hr.p1 = SumElemPtr(tilted, dr.y + dr.height, dr.x - dr.height, hid->isThroughColumn); - hr.p2 = SumElemPtr(tilted, dr.y + dr.width, dr.x + dr.width, hid->isThroughColumn); - hr.p3 = SumElemPtr(tilted, dr.y + dr.width + dr.height, dr.x + dr.width - dr.height, hid->isThroughColumn); - } - else - { - hr.p0 = SumElemPtr(sum, dr.y, dr.x, hid->isThroughColumn); - hr.p1 = SumElemPtr(sum, dr.y, dr.x + dr.width, hid->isThroughColumn); - hr.p2 = SumElemPtr(sum, dr.y + dr.height, dr.x, hid->isThroughColumn); - hr.p3 = SumElemPtr(sum, dr.y + dr.height, dr.x + dr.width, hid->isThroughColumn); - } - } - else - { - hr.p0 = NULL; - hr.p1 = NULL; - hr.p2 = NULL; - hr.p3 = NULL; - } - } - } - } - - HidHaarCascade * InitHaar(const Data & data, const Image & sum, const Image & sqsum, const Image & tilted, bool throughColumn) - { - if (!data.isStumpBased) - SIMD_EX("Can't use tree classfier for vector haar classifier!"); - - HidHaarCascade * hid = CreateHidHaar(data); - InitBase(hid, sum, sqsum, tilted); - if (throughColumn) - { - hid->isThroughColumn = true; - hid->isum.Recreate(sum.width, sum.height, Image::Int32, NULL, Image::PixelSize(Image::Int32)); - if (hid->hasTilted) - hid->itilted.Recreate(tilted.width, tilted.height, Image::Int32, NULL, Image::PixelSize(Image::Int32)); - } - UpdateFeaturePtrs(hid, data); - return hid; - } - - template void InitLbp(const Data & data, size_t index, HidLbpStage * stages, T * leaves); - - template<> void InitLbp(const Data & data, size_t index, HidLbpStage * stages, float * leaves) - { - stages[index].first = data.stages[index].first; - stages[index].ntrees = data.stages[index].ntrees; - stages[index].threshold = data.stages[index].threshold; - for (int i = stages[index].first * 2, n = (stages[index].first + stages[index].ntrees) * 2; i < n; ++i) - leaves[i] = data.leaves[i]; - } - - template<> void InitLbp(const Data & data, size_t index, HidLbpStage * stages, int * leaves) - { - float min = 0, max = 0; - for (int i = 0; i < data.stages[index].ntrees; ++i) - { - const float * leave = data.leaves.data() + (data.stages[index].first + i) * 2; - min += std::min(leave[0], leave[1]); - max += std::max(leave[0], leave[1]); - } - float k = float(SHRT_MAX)*0.9f / Simd::Max(Simd::Abs(min), Simd::Abs(max)); - - stages[index].first = data.stages[index].first; - stages[index].ntrees = data.stages[index].ntrees; - stages[index].threshold = Simd::Round(data.stages[index].threshold*k); - for (int i = stages[index].first * 2, n = 
(stages[index].first + stages[index].ntrees) * 2; i < n; ++i) - leaves[i] = Simd::Round(data.leaves[i] * k); -#if 0 - std::cout - << "stage = " << index - << "; ntrees = " << data.stages[index].ntrees - << "; threshold = " << data.stages[index].threshold - << "; min = " << min - << "; max = " << max - << "; k = " << k - << "." << std::endl; -#endif - } - - template HidLbpCascade * CreateHidLbp(const Data & data) - { - HidLbpCascade * hid = new HidLbpCascade(); - - hid->isInt16 = (sizeof(TSum) == 2); - hid->isThroughColumn = false; - - hid->isStumpBased = data.isStumpBased; - //hid->stageType = data.stageType; - hid->featureType = data.featureType; - hid->ncategories = data.ncategories; - hid->origWinSize = data.origWinSize; - - hid->trees.resize(data.classifiers.size()); - for (size_t i = 0; i < data.classifiers.size(); ++i) - { - hid->trees[i].nodeCount = data.classifiers[i].nodeCount; - } - - hid->nodes.resize(data.nodes.size()); - for (size_t i = 0; i < data.nodes.size(); ++i) - { - hid->nodes[i].featureIdx = data.nodes[i].featureIdx; - hid->nodes[i].left = data.nodes[i].left; - hid->nodes[i].right = data.nodes[i].right; - } - - hid->stages.resize(data.stages.size()); - hid->leaves.resize(data.leaves.size()); - for (size_t i = 0; i < data.stages.size(); ++i) - { - InitLbp(data, i, hid->stages.data(), hid->leaves.data()); - } - - hid->subsets.resize(data.subsets.size()); - for (size_t i = 0; i < data.subsets.size(); ++i) - { - hid->subsets[i] = data.subsets[i]; - } - - hid->features.resize(data.lbpFeatures.size()); - for (size_t i = 0; i < hid->features.size(); ++i) - { - hid->features[i].rect.left = data.lbpFeatures[i].rect.x; - hid->features[i].rect.top = data.lbpFeatures[i].rect.y; - hid->features[i].rect.right = data.lbpFeatures[i].rect.x + data.lbpFeatures[i].rect.width; - hid->features[i].rect.bottom = data.lbpFeatures[i].rect.y + data.lbpFeatures[i].rect.height; - } - - return hid; - } - - template SIMD_INLINE void UpdateFeaturePtrs(HidLbpCascade * hid) - { - Image sum = (hid->isThroughColumn || hid->isInt16) ? 
hid->isum : hid->sum; - for (size_t i = 0; i < hid->features.size(); i++) - { - typename HidLbpCascade::Feature& feature = hid->features[i]; - for (size_t row = 0; row < 4; ++row) - { - for (size_t col = 0; col < 4; ++col) - { - feature.p[row * 4 + col] = SumElemPtr(sum, - feature.rect.top + feature.rect.Height()*row, - feature.rect.left + feature.rect.Width()*col, hid->isThroughColumn); - } - } - } - } - - HidBase * InitLbp(const Data & data, const Image & sum, bool throughColumn, bool int16) - { - assert(sum.format == Image::Int32); - if (int16 && data.canInt16) - { - HidLbpCascade * hid = CreateHidLbp(data); - hid->isThroughColumn = throughColumn; - hid->sum = sum; - hid->isum.Recreate(sum.Size(), Image::Int16); - UpdateFeaturePtrs(hid); - return hid; - } - else - { - HidLbpCascade * hid = CreateHidLbp(data); - hid->isThroughColumn = throughColumn; - hid->sum = sum; - if (throughColumn) - hid->isum.Recreate(sum.Size(), Image::Int32); - UpdateFeaturePtrs(hid); - return hid; - } - } - - void * DetectionInit(const void * _data, uint8_t * sum, size_t sumStride, size_t width, size_t height, - uint8_t * sqsum, size_t sqsumStride, uint8_t * tilted, size_t tiltedStride, int throughColumn, int int16) - { - Data & data = *(Data*)_data; - switch (data.featureType) - { - case SimdDetectionInfoFeatureHaar: - return InitHaar(data, - Image(width, height, sumStride, Image::Int32, sum), - Image(width, height, sqsumStride, Image::Int32, sqsum), - Image(width, height, tiltedStride, Image::Int32, tilted), - throughColumn != 0); - case SimdDetectionInfoFeatureLbp: - return InitLbp(data, - Image(width, height, sumStride, Image::Int32, sum), - throughColumn != 0, - int16 != 0); - default: - return NULL; - } - } - - void PrepareThroughColumn32i(const Image & src, Image & dst) - { - assert(Simd::Compatible(src, dst) && src.format == Image::Int32); - - for (size_t row = 0; row < src.height; ++row) - { - const uint32_t * s = &src.At(0, row); - - uint32_t * evenDst = &dst.At(0, row); - for (size_t col = 0; col < src.width; col += 2) - evenDst[col >> 1] = s[col]; - - uint32_t * oddDst = &dst.At((dst.width + 1) >> 1, row); - for (size_t col = 1; col < src.width; col += 2) - oddDst[col >> 1] = s[col]; - } - } - - void Prepare16i(const Image & src, bool throughColumn, Image & dst) - { - assert(Simd::EqualSize(src, dst) && src.format == Image::Int32 && dst.format == Image::Int16); - - if (throughColumn) - { - for (size_t row = 0; row < src.height; ++row) - { - const uint32_t * s = &src.At(0, row); - - uint16_t * evenDst = &dst.At(0, row); - for (size_t col = 0; col < src.width; col += 2) - evenDst[col >> 1] = (uint16_t)s[col]; - - uint16_t * oddDst = &dst.At((dst.width + 1) >> 1, row); - for (size_t col = 1; col < src.width; col += 2) - oddDst[col >> 1] = (uint16_t)s[col]; - } - } - else - { - for (size_t row = 0; row < src.height; ++row) - { - const uint32_t * s = &src.At(0, row); - uint16_t * d = &dst.At(0, row); - for (size_t col = 0; col < src.width; ++col) - d[col] = (uint16_t)s[col]; - } - } - } - - void DetectionPrepare(void * _hid) - { - HidBase * hidBase = (HidBase*)_hid; - if (hidBase->featureType == SimdDetectionInfoFeatureHaar && hidBase->isThroughColumn) - { - HidHaarCascade * hid = (HidHaarCascade*)hidBase; - PrepareThroughColumn32i(hid->sum, hid->isum); - if (hid->hasTilted) - PrepareThroughColumn32i(hid->tilted, hid->itilted); - } - else if (hidBase->featureType == SimdDetectionInfoFeatureLbp) - { - if (hidBase->isInt16) - { - HidLbpCascade * hid = (HidLbpCascade*)hidBase; - Prepare16i(hid->sum, 
hid->isThroughColumn, hid->isum); - } - else if (hidBase->isThroughColumn) - { - HidLbpCascade * hid = (HidLbpCascade*)hidBase; - PrepareThroughColumn32i(hid->sum, hid->isum); - } - } - } - - int Detect32f(const HidHaarCascade & hid, size_t offset, int startStage, float norm) - { - typedef HidHaarCascade Hid; - const Hid::Stage * stages = hid.stages.data(); - if (startStage >= (int)hid.stages.size()) - return 1; - const Hid::Node * node = hid.nodes.data() + stages[startStage].first; - const float * leaves = hid.leaves.data() + stages[startStage].first * 2; - for (int i = startStage, n = (int)hid.stages.size(); i < n; ++i) - { - const Hid::Stage & stage = stages[i]; - if (stage.canSkip) - continue; - const Hid::Node * end = node + stage.ntrees; - float stageSum = 0.0; - if (stage.hasThree) - { - for (; node < end; ++node, leaves += 2) - { - const Hid::Feature & feature = hid.features[node->featureIdx]; - float sum = WeightedSum32f(feature.rect[0], offset) + WeightedSum32f(feature.rect[1], offset); - if (feature.rect[2].p0) - sum += WeightedSum32f(feature.rect[2], offset); - stageSum += leaves[sum >= node->threshold*norm]; - } - } - else - { - for (; node < end; ++node, leaves += 2) - { - const Hid::Feature & feature = hid.features[node->featureIdx]; - float sum = WeightedSum32f(feature.rect[0], offset) + WeightedSum32f(feature.rect[1], offset); - stageSum += leaves[sum >= node->threshold*norm]; - } - } - if (stageSum < stage.threshold) - return -i; - } - return 1; - } - - void DetectionHaarDetect32fp(const HidHaarCascade & hid, const Image & mask, const Rect & rect, Image & dst) - { - for (ptrdiff_t row = rect.top; row < rect.bottom; row += 1) - { - size_t p_offset = row * hid.sum.stride / sizeof(uint32_t); - size_t pq_offset = row * hid.sqsum.stride / sizeof(uint32_t); - for (ptrdiff_t col = rect.left; col < rect.right; col += 1) - { - if (mask.At(col, row) == 0) - continue; - float norm = Norm32f(hid, pq_offset + col); - if (Detect32f(hid, p_offset + col, 0, norm) > 0) - dst.At(col, row) = 1; - } - } - } - - void DetectionHaarDetect32fp(const void * _hid, const uint8_t * mask, size_t maskStride, - ptrdiff_t left, ptrdiff_t top, ptrdiff_t right, ptrdiff_t bottom, uint8_t * dst, size_t dstStride) - { - const HidHaarCascade & hid = *(HidHaarCascade*)_hid; - return DetectionHaarDetect32fp(hid, - Image(hid.sum.width - 1, hid.sum.height - 1, maskStride, Image::Gray8, (uint8_t*)mask), - Rect(left, top, right, bottom), - Image(hid.sum.width - 1, hid.sum.height - 1, dstStride, Image::Gray8, dst).Ref()); - } - - void DetectionHaarDetect32fi(const HidHaarCascade & hid, const Image & mask, const Rect & rect, Image & dst) - { - for (ptrdiff_t row = rect.top; row < rect.bottom; row += 2) - { - size_t p_offset = row * hid.isum.stride / sizeof(uint32_t); - size_t pq_offset = row * hid.sqsum.stride / sizeof(uint32_t); - for (ptrdiff_t col = rect.left; col < rect.right; col += 2) - { - if (mask.At(col, row) == 0) - continue; - float norm = Norm32f(hid, pq_offset + col); - if (Detect32f(hid, p_offset + col / 2, 0, norm) > 0) - dst.At(col, row) = 1; - } - } - } - - void DetectionHaarDetect32fi(const void * _hid, const uint8_t * mask, size_t maskStride, - ptrdiff_t left, ptrdiff_t top, ptrdiff_t right, ptrdiff_t bottom, uint8_t * dst, size_t dstStride) - { - const HidHaarCascade & hid = *(HidHaarCascade*)_hid; - return DetectionHaarDetect32fi(hid, - Image(hid.sum.width - 1, hid.sum.height - 1, maskStride, Image::Gray8, (uint8_t*)mask), - Rect(left, top, right, bottom), - Image(hid.sum.width - 1, 
hid.sum.height - 1, dstStride, Image::Gray8, dst).Ref()); - } - - void DetectionLbpDetect32fp(const HidLbpCascade & hid, const Image & mask, const Rect & rect, Image & dst) - { - for (ptrdiff_t row = rect.top; row < rect.bottom; row += 1) - { - size_t offset = row * hid.sum.stride / sizeof(int); - for (ptrdiff_t col = rect.left; col < rect.right; col += 1) - { - if (mask.At(col, row) == 0) - continue; - if (Detect(hid, offset + col, 0) > 0) - dst.At(col, row) = 1; - } - } - } - - void DetectionLbpDetect32fp(const void * _hid, const uint8_t * mask, size_t maskStride, - ptrdiff_t left, ptrdiff_t top, ptrdiff_t right, ptrdiff_t bottom, uint8_t * dst, size_t dstStride) - { - const HidLbpCascade & hid = *(HidLbpCascade*)_hid; - return DetectionLbpDetect32fp(hid, - Image(hid.sum.width - 1, hid.sum.height - 1, maskStride, Image::Gray8, (uint8_t*)mask), - Rect(left, top, right, bottom), - Image(hid.sum.width - 1, hid.sum.height - 1, dstStride, Image::Gray8, dst).Ref()); - } - - void DetectionLbpDetect32fi(const HidLbpCascade & hid, const Image & mask, const Rect & rect, Image & dst) - { - for (ptrdiff_t row = rect.top; row < rect.bottom; row += 2) - { - size_t offset = row * hid.isum.stride / sizeof(int); - for (ptrdiff_t col = rect.left; col < rect.right; col += 2) - { - if (mask.At(col, row) == 0) - continue; - if (Detect(hid, offset + col / 2, 0) > 0) - dst.At(col, row) = 1; - } - } - } - - void DetectionLbpDetect32fi(const void * _hid, const uint8_t * mask, size_t maskStride, - ptrdiff_t left, ptrdiff_t top, ptrdiff_t right, ptrdiff_t bottom, uint8_t * dst, size_t dstStride) - { - const HidLbpCascade & hid = *(HidLbpCascade*)_hid; - return DetectionLbpDetect32fi(hid, - Image(hid.sum.width - 1, hid.sum.height - 1, maskStride, Image::Gray8, (uint8_t*)mask), - Rect(left, top, right, bottom), - Image(hid.sum.width - 1, hid.sum.height - 1, dstStride, Image::Gray8, dst).Ref()); - } - - void DetectionLbpDetect16ip(const HidLbpCascade & hid, const Image & mask, const Rect & rect, Image & dst) - { - for (ptrdiff_t row = rect.top; row < rect.bottom; row += 1) - { - size_t offset = row * hid.isum.stride / sizeof(uint16_t); - for (ptrdiff_t col = rect.left; col < rect.right; col += 1) - { - if (mask.At(col, row) == 0) - continue; - if (Detect(hid, offset + col, 0) > 0) - dst.At(col, row) = 1; - } - } - } - - void DetectionLbpDetect16ip(const void * _hid, const uint8_t * mask, size_t maskStride, - ptrdiff_t left, ptrdiff_t top, ptrdiff_t right, ptrdiff_t bottom, uint8_t * dst, size_t dstStride) - { - const HidLbpCascade & hid = *(HidLbpCascade*)_hid; - return DetectionLbpDetect16ip(hid, - Image(hid.sum.width - 1, hid.sum.height - 1, maskStride, Image::Gray8, (uint8_t*)mask), - Rect(left, top, right, bottom), - Image(hid.sum.width - 1, hid.sum.height - 1, dstStride, Image::Gray8, dst).Ref()); - } - - void DetectionLbpDetect16ii(const HidLbpCascade & hid, const Image & mask, const Rect & rect, Image & dst) - { - for (ptrdiff_t row = rect.top; row < rect.bottom; row += 2) - { - size_t offset = row * hid.isum.stride / sizeof(uint16_t); - for (ptrdiff_t col = rect.left; col < rect.right; col += 2) - { - if (mask.At(col, row) == 0) - continue; - if (Detect(hid, offset + col / 2, 0) > 0) - dst.At(col, row) = 1; - } - } - } - - void DetectionLbpDetect16ii(const void * _hid, const uint8_t * mask, size_t maskStride, - ptrdiff_t left, ptrdiff_t top, ptrdiff_t right, ptrdiff_t bottom, uint8_t * dst, size_t dstStride) - { - const HidLbpCascade & hid = *(HidLbpCascade*)_hid; - return DetectionLbpDetect16ii(hid, - 
Image(hid.sum.width - 1, hid.sum.height - 1, maskStride, Image::Gray8, (uint8_t*)mask), - Rect(left, top, right, bottom), - Image(hid.sum.width - 1, hid.sum.height - 1, dstStride, Image::Gray8, dst).Ref()); - } - } -} diff --git a/src/3rd/Simd/Simd/SimdBaseEdgeBackground.cpp b/src/3rd/Simd/Simd/SimdBaseEdgeBackground.cpp deleted file mode 100644 index 12c83c1c..00000000 --- a/src/3rd/Simd/Simd/SimdBaseEdgeBackground.cpp +++ /dev/null @@ -1,144 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMath.h" -#include "Simd/SimdBase.h" - -namespace Simd -{ - namespace Base - { - void EdgeBackgroundGrowRangeSlow(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * background, size_t backgroundStride) - { - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < width; ++col) - { - if (value[col] > background[col]) - background[col]++; - } - value += valueStride; - background += backgroundStride; - } - } - - void EdgeBackgroundGrowRangeFast(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * background, size_t backgroundStride) - { - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < width; ++col) - { - background[col] = MaxU8(value[col], background[col]); - } - value += valueStride; - background += backgroundStride; - } - } - - void EdgeBackgroundIncrementCount(const uint8_t * value, size_t valueStride, size_t width, size_t height, - const uint8_t * backgroundValue, size_t backgroundValueStride, uint8_t * backgroundCount, size_t backgroundCountStride) - { - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < width; ++col) - { - if (value[col] > backgroundValue[col] && backgroundCount[col] < 0xFF) - backgroundCount[col]++; - } - value += valueStride; - backgroundValue += backgroundValueStride; - backgroundCount += backgroundCountStride; - } - } - - SIMD_INLINE void AdjustEdge(const uint8_t & count, uint8_t & value, uint8_t threshold) - { - if (count < threshold) - { - if (value > 0) - value--; - } - else if (count > threshold) - { - if (value < 0xFF) - value++; - } - } - - void EdgeBackgroundAdjustRange(uint8_t * backgroundCount, size_t backgroundCountStride, size_t width, size_t height, - uint8_t * backgroundValue, size_t backgroundValueStride, uint8_t threshold) - { - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 
0; col < width; ++col) - { - AdjustEdge(backgroundCount[col], backgroundValue[col], threshold); - backgroundCount[col] = 0; - } - backgroundValue += backgroundValueStride; - backgroundCount += backgroundCountStride; - } - } - - void EdgeBackgroundAdjustRangeMasked(uint8_t * backgroundCount, size_t backgroundCountStride, size_t width, size_t height, - uint8_t * backgroundValue, size_t backgroundValueStride, uint8_t threshold, const uint8_t * mask, size_t maskStride) - { - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < width; ++col) - { - if (mask[col]) - AdjustEdge(backgroundCount[col], backgroundValue[col], threshold); - backgroundCount[col] = 0; - } - backgroundValue += backgroundValueStride; - backgroundCount += backgroundCountStride; - mask += maskStride; - } - } - - void EdgeBackgroundShiftRange(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * background, size_t backgroundStride) - { - Copy(value, valueStride, width, height, 1, background, backgroundStride); - } - - void EdgeBackgroundShiftRangeMasked(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * background, size_t backgroundStride, const uint8_t * mask, size_t maskStride) - { - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < width; ++col) - { - if (mask[col]) - background[col] = value[col]; - } - value += valueStride; - background += backgroundStride; - mask += maskStride; - } - } - } -} diff --git a/src/3rd/Simd/Simd/SimdBaseFill.cpp b/src/3rd/Simd/Simd/SimdBaseFill.cpp deleted file mode 100644 index 49d3efa7..00000000 --- a/src/3rd/Simd/Simd/SimdBaseFill.cpp +++ /dev/null @@ -1,245 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2019 Yermalayeu Ihar, -* 2014-2019 Antonenka Mikhail. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdMemory.h" - -namespace Simd -{ - namespace Base - { - void Fill(uint8_t * dst, size_t stride, size_t width, size_t height, size_t pixelSize, uint8_t value) - { - size_t rowSize = width*pixelSize; - for (size_t row = 0; row < height; ++row) - { - memset(dst, value, rowSize); - dst += stride; - } - } - - void FillFrame(uint8_t * dst, size_t stride, size_t width, size_t height, size_t pixelSize, - size_t frameLeft, size_t frameTop, size_t frameRight, size_t frameBottom, uint8_t value) - { - if (frameTop) - { - size_t offset = 0; - size_t size = width*pixelSize; - for (size_t row = 0; row < frameTop; ++row) - { - memset(dst + offset, value, size); - offset += stride; - } - } - if (height - frameBottom) - { - size_t offset = frameBottom*stride; - size_t size = width*pixelSize; - for (size_t row = frameBottom; row < height; ++row) - { - memset(dst + offset, value, size); - offset += stride; - } - } - if (frameLeft) - { - size_t offset = frameTop*stride; - size_t size = frameLeft*pixelSize; - for (size_t row = frameTop; row < frameBottom; ++row) - { - memset(dst + offset, value, size); - offset += stride; - } - } - if (width - frameRight) - { - size_t offset = frameTop*stride + frameRight*pixelSize; - size_t size = (width - frameRight)*pixelSize; - for (size_t row = frameTop; row < frameBottom; ++row) - { - memset(dst + offset, value, size); - offset += stride; - } - } - } - - SIMD_INLINE uint64_t Fill64(uint8_t a, uint8_t b, uint8_t c) - { -#ifdef SIMD_BIG_ENDIAN - return (uint64_t(a) << 56) | (uint64_t(b) << 48) | (uint64_t(c) << 40) | (uint64_t(a) << 32) | - (uint64_t(b) << 24) | (uint64_t(c) << 16) | (uint64_t(a) << 8) | uint64_t(b); -#else - return uint64_t(a) | (uint64_t(b) << 8) | (uint64_t(c) << 16) | (uint64_t(a) << 24) | - (uint64_t(b) << 32) | (uint64_t(c) << 40) | (uint64_t(a) << 48) | (uint64_t(b) << 56); -#endif - } - - SIMD_INLINE uint32_t Fill32(uint8_t a, uint8_t b, uint8_t c) - { -#ifdef SIMD_BIG_ENDIAN - return (uint32_t(a) << 24) | (uint32_t(b) << 16) | (uint32_t(c) << 8) | uint32_t(a); -#else - return uint32_t(a) | (uint32_t(b) << 8) | (uint32_t(c) << 16) | (uint32_t(a) << 24); -#endif - } - - void FillBgr(uint8_t * dst, size_t stride, size_t width, size_t height, uint8_t blue, uint8_t green, uint8_t red) - { - size_t size = width * 3; - size_t step = sizeof(size_t) * 3; - size_t alignedSize = AlignLo(width, sizeof(size_t)) * 3; - size_t bgrs[3]; -#if defined(SIMD_X64_ENABLE) || defined(SIMD_PPC64_ENABLE) || defined(SIMD_ARM64_ENABLE) - bgrs[0] = Fill64(blue, green, red); - bgrs[1] = Fill64(red, blue, green); - bgrs[2] = Fill64(green, red, blue); - -#else - bgrs[0] = Fill32(blue, green, red); - bgrs[1] = Fill32(green, red, blue); - bgrs[2] = Fill32(red, blue, green); -#endif - for (size_t row = 0; row < height; ++row) - { - size_t offset = 0; - for (; offset < alignedSize; offset += step) - { - ((size_t*)(dst + offset))[0] = bgrs[0]; - ((size_t*)(dst + offset))[1] = bgrs[1]; - ((size_t*)(dst + offset))[2] = bgrs[2]; - } - for (; offset < size; offset += 3) - { - (dst + offset)[0] = blue; - (dst + offset)[1] = green; - (dst + offset)[2] = red; - } - dst += stride; - } - } - -#if defined(__GNUC__) && (defined(SIMD_X86_ENABLE) || defined(SIMD_X64_ENABLE)) -#pragma GCC push_options -#pragma GCC optimize ("O2") -#endif - void FillBgra(uint8_t * dst, size_t stride, size_t width, size_t height, uint8_t blue, uint8_t green, uint8_t red, uint8_t alpha) - { -#ifdef SIMD_BIG_ENDIAN - uint32_t bgra32 = uint32_t(alpha) | (uint32_t(red) << 8) | (uint32_t(green) << 
16) | (uint32_t(blue) << 24); -#else - uint32_t bgra32 = uint32_t(blue) | (uint32_t(green) << 8) | (uint32_t(red) << 16) | (uint32_t(alpha) << 24); -#endif - -#if defined(SIMD_X64_ENABLE) || defined(SIMD_PPC64_ENABLE) || defined(SIMD_ARM64_ENABLE) - uint64_t bgra64 = uint64_t(bgra32) | (uint64_t(bgra32) << 32); - size_t alignedWidth = AlignLo(width, 2); - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += 2) - *((uint64_t*)((uint32_t*)dst + col)) = bgra64; - if (width != alignedWidth) - ((uint32_t*)dst)[width - 1] = bgra32; - dst += stride; - } -#else - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < width; ++col) - ((uint32_t*)dst)[col] = bgra32; - dst += stride; - } -#endif - } - - void FillUv(uint8_t * dst, size_t stride, size_t width, size_t height, uint8_t u, uint8_t v) - { -#ifdef SIMD_BIG_ENDIAN - uint16_t uv16 = uint32_t(v) | (uint32_t(u) << 8); -#else - uint16_t uv16 = uint32_t(u) | (uint32_t(v) << 8); -#endif - -#if defined(SIMD_X64_ENABLE) || defined(SIMD_PPC64_ENABLE) || defined(SIMD_ARM64_ENABLE) - uint64_t uv64 = uint64_t(uv16) | (uint64_t(uv16) << 16) | (uint64_t(uv16) << 32) | (uint64_t(uv16) << 48); - size_t alignedWidth = AlignLo(width, 4); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < alignedWidth; col += 4) - *((uint64_t*)((uint16_t*)dst + col)) = uv64; - for (; col < width; col += 1) - ((uint16_t*)dst)[col] = uv16; - dst += stride; - } -#else - uint32_t uv32 = uint32_t(uv16) | (uint32_t(uv16) << 16); - size_t alignedWidth = AlignLo(width, 2); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < alignedWidth; col += 2) - *((uint32_t*)((uint16_t*)dst + col)) = uv32; - for (; col < width; col += 1) - ((uint16_t*)dst)[col] = uv16; - dst += stride; - } -#endif - } -#if defined(__GNUC__) && (defined(SIMD_X86_ENABLE) || defined(SIMD_X64_ENABLE)) -#pragma GCC pop_options -#endif - - void FillPixel(uint8_t * dst, size_t stride, size_t width, size_t height, const uint8_t * pixel, size_t pixelSize) - { - switch (pixelSize) - { - case 1: - Fill(dst, stride, width, height, 1, pixel[0]); - break; - case 2: - FillUv(dst, stride, width, height, pixel[0], pixel[1]); - break; - case 3: - FillBgr(dst, stride, width, height, pixel[0], pixel[1], pixel[2]); - break; - case 4: - FillBgra(dst, stride, width, height, pixel[0], pixel[1], pixel[2], pixel[3]); - break; - default: - assert(0); - } - } - - void Fill32f(float * dst, size_t size, const float * value) - { - if (value == 0 || value[0] == 0) - memset(dst, 0, size*sizeof(float)); - else - { - float v = value[0]; - for (; size; --size) - *dst++ = v; - } - } - } -} diff --git a/src/3rd/Simd/Simd/SimdBaseFloat16.cpp b/src/3rd/Simd/Simd/SimdBaseFloat16.cpp deleted file mode 100644 index 44dfdfc9..00000000 --- a/src/3rd/Simd/Simd/SimdBaseFloat16.cpp +++ /dev/null @@ -1,171 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2019 Yermalayeu Ihar. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMath.h" -#include "Simd/SimdMemory.h" - -namespace Simd -{ - namespace Base - { - union Bits - { - float f; - int32_t si; - uint32_t ui; - }; - - const int SHIFT = 13; - const int SHIFT_SIGN = 16; - - const int32_t INF_N = 0x7F800000; // flt32 infinity - const int32_t MAX_N = 0x477FE000; // max flt16 normal as a flt32 - const int32_t MIN_N = 0x38800000; // min flt16 normal as a flt32 - const int32_t SIGN_N = 0x80000000; // flt32 sign bit - - const int32_t INF_C = INF_N >> SHIFT; - const int32_t NAN_N = (INF_C + 1) << SHIFT; // minimum flt16 nan as a flt32 - const int32_t MAX_C = MAX_N >> SHIFT; - const int32_t MIN_C = MIN_N >> SHIFT; - const int32_t SIGN_C = SIGN_N >> SHIFT_SIGN; // flt16 sign bit - - const int32_t MUL_N = 0x52000000; // (1 << 23) / MIN_N - const int32_t MUL_C = 0x33800000; // MIN_N / (1 << (23 - shift)) - - const int32_t SUB_C = 0x003FF; // max flt32 subnormal down shifted - const int32_t NOR_C = 0x00400; // min flt32 normal down shifted - - const int32_t MAX_D = INF_C - MAX_C - 1; - const int32_t MIN_D = MIN_C - SUB_C - 1; - - SIMD_INLINE uint16_t Float32ToFloat16(float value) - { - Bits v, s; - v.f = value; - uint32_t sign = v.si & SIGN_N; - v.si ^= sign; - sign >>= SHIFT_SIGN; // logical shift - s.si = MUL_N; - s.si = int32_t(s.f * v.f); // correct subnormals - v.si ^= (s.si ^ v.si) & -(MIN_N > v.si); - v.si ^= (INF_N ^ v.si) & -((INF_N > v.si) & (v.si > MAX_N)); - v.si ^= (NAN_N ^ v.si) & -((NAN_N > v.si) & (v.si > INF_N)); - v.ui >>= SHIFT; // logical shift - v.si ^= ((v.si - MAX_D) ^ v.si) & -(v.si > MAX_C); - v.si ^= ((v.si - MIN_D) ^ v.si) & -(v.si > SUB_C); - return v.ui | sign; - } - - SIMD_INLINE float Float16ToFloat32(uint16_t value) - { - Bits v; - v.ui = value; - int32_t sign = v.si & SIGN_C; - v.si ^= sign; - sign <<= SHIFT_SIGN; - v.si ^= ((v.si + MIN_D) ^ v.si) & -(v.si > SUB_C); - v.si ^= ((v.si + MAX_D) ^ v.si) & -(v.si > MAX_C); - Bits s; - s.si = MUL_C; - s.f *= v.si; - int32_t mask = -(NOR_C > v.si); - v.si <<= SHIFT; - v.si ^= (s.si ^ v.si) & mask; - v.si |= sign; - return v.f; - } - - void Float32ToFloat16(const float * src, size_t size, uint16_t * dst) - { - size_t alignedSize = Simd::AlignLo(size, 4); - size_t i = 0; - for (; i < alignedSize; i += 4) - { - dst[i + 0] = Float32ToFloat16(src[i + 0]); - dst[i + 1] = Float32ToFloat16(src[i + 1]); - dst[i + 2] = Float32ToFloat16(src[i + 2]); - dst[i + 3] = Float32ToFloat16(src[i + 3]); - } - for (; i < size; ++i) - dst[i] = 
Float32ToFloat16(src[i]); - } - - void Float16ToFloat32(const uint16_t * src, size_t size, float * dst) - { - size_t alignedSize = Simd::AlignLo(size, 4); - size_t i = 0; - for (; i < alignedSize; i += 4) - { - dst[i + 0] = Float16ToFloat32(src[i + 0]); - dst[i + 1] = Float16ToFloat32(src[i + 1]); - dst[i + 2] = Float16ToFloat32(src[i + 2]); - dst[i + 3] = Float16ToFloat32(src[i + 3]); - } - for (; i < size; ++i) - dst[i] = Float16ToFloat32(src[i]); - } - - SIMD_INLINE float SquaredDifference16f(uint16_t a, uint16_t b) - { - return Simd::Square(Float16ToFloat32(a) - Float16ToFloat32(b)); - } - - void SquaredDifferenceSum16f(const uint16_t * a, const uint16_t * b, size_t size, float * sum) - { - size_t alignedSize = Simd::AlignLo(size, 4); - float sums[4] = { 0, 0, 0, 0 }; - size_t i = 0; - for (; i < alignedSize; i += 4) - { - sums[0] += SquaredDifference16f(a[i + 0], b[i + 0]); - sums[1] += SquaredDifference16f(a[i + 1], b[i + 1]); - sums[2] += SquaredDifference16f(a[i + 2], b[i + 2]); - sums[3] += SquaredDifference16f(a[i + 3], b[i + 3]); - } - for (; i < size; ++i) - sums[0] += SquaredDifference16f(a[i], b[i]); - *sum = sums[0] + sums[1] + sums[2] + sums[3]; - } - - void CosineDistance16f(const uint16_t * a, const uint16_t * b, size_t size, float * distance) - { - float aa = 0, ab = 0, bb = 0; - for (size_t i = 0; i < size; ++i) - { - float _a = Float16ToFloat32(a[i]); - float _b = Float16ToFloat32(b[i]); - aa += _a * _a; - ab += _a * _b; - bb += _b * _b; - } - *distance = 1.0f - ab / ::sqrt(aa*bb); - } - - void CosineDistancesMxNa16f(size_t M, size_t N, size_t K, const uint16_t * const * A, const uint16_t * const * B, float * distances) - { - for (size_t i = 0; i < M; ++i) - for (size_t j = 0; j < N; ++j) - CosineDistance16f(A[i], B[j], K, distances + i * N + j); - } - } -} diff --git a/src/3rd/Simd/Simd/SimdBaseFloat32.cpp b/src/3rd/Simd/Simd/SimdBaseFloat32.cpp deleted file mode 100644 index 23f19a2a..00000000 --- a/src/3rd/Simd/Simd/SimdBaseFloat32.cpp +++ /dev/null @@ -1,87 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2018 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdMath.h" -#include "Simd/SimdMemory.h" - -namespace Simd -{ - namespace Base - { - SIMD_INLINE uint8_t Float32ToUint8(float value, float lower, float upper, float boost) - { - return uint8_t((Simd::Min(Simd::Max(value, lower), upper) - lower)*boost); - } - - void Float32ToUint8(const float * src, size_t size, const float * lower, const float * upper, uint8_t * dst) - { - float _lower = lower[0], _upper = upper[0], boost = 255.0f / (upper[0] - lower[0]); - size_t alignedSize = Simd::AlignLo(size, 4); - size_t i = 0; - for (; i < alignedSize; i += 4) - { - dst[i + 0] = Float32ToUint8(src[i + 0], _lower, _upper, boost); - dst[i + 1] = Float32ToUint8(src[i + 1], _lower, _upper, boost); - dst[i + 2] = Float32ToUint8(src[i + 2], _lower, _upper, boost); - dst[i + 3] = Float32ToUint8(src[i + 3], _lower, _upper, boost); - } - for (; i < size; ++i) - dst[i] = Float32ToUint8(src[i], _lower, _upper, boost); - } - - SIMD_INLINE float Uint8ToFloat32(int value, float lower, float boost) - { - return value*boost + lower; - } - - void Uint8ToFloat32(const uint8_t * src, size_t size, const float * lower, const float * upper, float * dst) - { - float _lower = lower[0], boost = (upper[0] - lower[0]) / 255.0f; - size_t alignedSize = Simd::AlignLo(size, 4); - size_t i = 0; - for (; i < alignedSize; i += 4) - { - dst[i + 0] = Uint8ToFloat32(src[i + 0], _lower, boost); - dst[i + 1] = Uint8ToFloat32(src[i + 1], _lower, boost); - dst[i + 2] = Uint8ToFloat32(src[i + 2], _lower, boost); - dst[i + 3] = Uint8ToFloat32(src[i + 3], _lower, boost); - } - for (; i < size; ++i) - dst[i] = Uint8ToFloat32(src[i], _lower, boost); - } - - void CosineDistance32f(const float * a, const float * b, size_t size, float * distance) - { - float aa = 0, ab = 0, bb = 0; - for (size_t i = 0; i < size; ++i) - { - float _a = a[i]; - float _b = b[i]; - aa += _a * _a; - ab += _a * _b; - bb += _b * _b; - } - *distance = 1.0f - ab / ::sqrt(aa*bb); - } - } -} diff --git a/src/3rd/Simd/Simd/SimdBaseGaussianBlur3x3.cpp b/src/3rd/Simd/Simd/SimdBaseGaussianBlur3x3.cpp deleted file mode 100644 index 6e709bc7..00000000 --- a/src/3rd/Simd/Simd/SimdBaseGaussianBlur3x3.cpp +++ /dev/null @@ -1,70 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdDefs.h" - -namespace Simd -{ - namespace Base - { - SIMD_INLINE int DivideBy16(int value) - { - return (value + 8) >> 4; - } - - SIMD_INLINE int GaussianBlur(const uint8_t *s0, const uint8_t *s1, const uint8_t *s2, size_t x0, size_t x1, size_t x2) - { - return DivideBy16(s0[x0] + 2 * s0[x1] + s0[x2] + (s1[x0] + 2 * s1[x1] + s1[x2]) * 2 + s2[x0] + 2 * s2[x1] + s2[x2]); - } - - void GaussianBlur3x3(const uint8_t * src, size_t srcStride, size_t width, size_t height, - size_t channelCount, uint8_t * dst, size_t dstStride) - { - const uint8_t *src0, *src1, *src2; - - size_t size = channelCount*width; - for (size_t row = 0; row < height; ++row) - { - src0 = src + srcStride*(row - 1); - src1 = src0 + srcStride; - src2 = src1 + srcStride; - if (row == 0) - src0 = src1; - if (row == height - 1) - src2 = src1; - - size_t col = 0; - for (; col < channelCount; col++) - dst[col] = GaussianBlur(src0, src1, src2, col, col, col + channelCount); - - for (; col < size - channelCount; ++col) - dst[col] = GaussianBlur(src0, src1, src2, col - channelCount, col, col + channelCount); - - for (; col < size; col++) - dst[col] = GaussianBlur(src0, src1, src2, col - channelCount, col, col); - - dst += dstStride; - } - } - } -} diff --git a/src/3rd/Simd/Simd/SimdBaseGemm32f.cpp b/src/3rd/Simd/Simd/SimdBaseGemm32f.cpp deleted file mode 100644 index 37436244..00000000 --- a/src/3rd/Simd/Simd/SimdBaseGemm32f.cpp +++ /dev/null @@ -1,68 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2018 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdDefs.h" - -namespace Simd -{ - namespace Base - { - void Gemm32fNN(size_t M, size_t N, size_t K, const float * alpha, const float * A, size_t lda, const float * B, size_t ldb, const float * beta, float * C, size_t ldc) - { - float b = beta[0]; - for (size_t i = 0; i < M; ++i) - { - float * pC = C + i * ldc; - for (size_t j = 0; j < N; ++j) - pC[j] = b * pC[j]; - for (size_t k = 0; k < K; ++k) - { - const float * pB = B + k * ldb; - float a = alpha[0] * A[i*lda + k]; - for (size_t j = 0; j < N; ++j) - pC[j] = a * pB[j] + pC[j]; - } - } - } - - void Gemm32fNT(size_t M, size_t N, size_t K, const float * alpha, const float * A, size_t lda, const float * B, size_t ldb, const float * beta, float * C, size_t ldc) - { - float b = beta[0]; - for (size_t i = 0; i < M; ++i) - { - float * pC = C + i * ldc; - for (size_t j = 0; j < N; ++j) - pC[j] = b * pC[j]; - for (size_t j = 0; j < N; ++j) - { - const float * pA = A + i * K; - const float * pB = B + j * K; - float sum = 0; - for (size_t k = 0; k < K; ++k) - sum += pA[k] * pB[k]; - pC[j] += sum*alpha[0]; - } - } - } - } -} diff --git a/src/3rd/Simd/Simd/SimdBaseGrayToBgr.cpp b/src/3rd/Simd/Simd/SimdBaseGrayToBgr.cpp deleted file mode 100644 index 3f410c69..00000000 --- a/src/3rd/Simd/Simd/SimdBaseGrayToBgr.cpp +++ /dev/null @@ -1,48 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdDefs.h" - -namespace Simd -{ - namespace Base - { - SIMD_INLINE void GrayToBgr(const uint8_t & gray, uint8_t * bgr) - { - bgr[0] = gray; - bgr[1] = gray; - bgr[2] = gray; - } - - void GrayToBgr(const uint8_t * gray, size_t width, size_t height, size_t grayStride, uint8_t * bgr, size_t bgrStride) - { - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0, offset = 0; col < width; ++col, offset += 3) - GrayToBgr(gray[col], bgr + offset); - gray += grayStride; - bgr += bgrStride; - } - } - } -} diff --git a/src/3rd/Simd/Simd/SimdBaseGrayToBgra.cpp b/src/3rd/Simd/Simd/SimdBaseGrayToBgra.cpp deleted file mode 100644 index f89b06cf..00000000 --- a/src/3rd/Simd/Simd/SimdBaseGrayToBgra.cpp +++ /dev/null @@ -1,50 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdDefs.h" - -namespace Simd -{ - namespace Base - { - SIMD_INLINE uint32_t GrayToBgra(uint32_t gray, uint32_t alpha) - { -#ifdef SIMD_BIG_ENDIAN - return alpha | (gray << 8) | (gray << 16) | (gray << 24); -#else - return gray | (gray << 8) | (gray << 16) | (alpha << 24); -#endif - } - - void GrayToBgra(const uint8_t *gray, size_t width, size_t height, size_t grayStride, uint8_t *bgra, size_t bgraStride, uint8_t alpha) - { - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < width; ++col) - ((uint32_t*)bgra)[col] = GrayToBgra(gray[col], alpha); - gray += grayStride; - bgra += bgraStride; - } - } - } -} diff --git a/src/3rd/Simd/Simd/SimdBaseHistogram.cpp b/src/3rd/Simd/Simd/SimdBaseHistogram.cpp deleted file mode 100644 index 780bcc44..00000000 --- a/src/3rd/Simd/Simd/SimdBaseHistogram.cpp +++ /dev/null @@ -1,212 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE.
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdCompare.h" - -namespace Simd -{ - namespace Base - { - SIMD_INLINE int AbsSecondDerivative(const uint8_t * src, ptrdiff_t step) - { - return AbsDifferenceU8(Average(src[step], src[-step]), src[0]); - } - - void AbsSecondDerivativeHistogram(const uint8_t *src, size_t width, size_t height, size_t stride, - size_t step, size_t indent, uint32_t * histogram) - { - assert(width > 2 * indent && height > 2 * indent && indent >= step); - - memset(histogram, 0, sizeof(uint32_t)*HISTOGRAM_SIZE); - - src += indent*(stride + 1); - height -= 2 * indent; - width -= 2 * indent; - - size_t rowStep = step*stride; - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < width; ++col) - { - const int sdX = AbsSecondDerivative(src + col, step); - const int sdY = AbsSecondDerivative(src + col, rowStep); - const int sd = MaxU8(sdY, sdX); - ++histogram[sd]; - } - src += stride; - } - } - - void Histogram(const uint8_t * src, size_t width, size_t height, size_t stride, uint32_t * histogram) - { - uint32_t histograms[4][HISTOGRAM_SIZE]; - memset(histograms, 0, sizeof(uint32_t)*HISTOGRAM_SIZE * 4); - size_t alignedWidth = Simd::AlignLo(width, 4); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < alignedWidth; col += 4) - { - ++histograms[0][src[col + 0]]; - ++histograms[1][src[col + 1]]; - ++histograms[2][src[col + 2]]; - ++histograms[3][src[col + 3]]; - } - for (; col < width; ++col) - ++histograms[0][src[col + 0]]; - - src += stride; - } - - for (size_t i = 0; i < HISTOGRAM_SIZE; ++i) - histogram[i] = histograms[0][i] + histograms[1][i] + histograms[2][i] + histograms[3][i]; - } - - void HistogramMasked(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * mask, size_t maskStride, uint8_t index, uint32_t * histogram) - { - uint32_t histograms[4][HISTOGRAM_SIZE + 4]; - memset(histograms, 0, sizeof(uint32_t)*(HISTOGRAM_SIZE + 4) * 4); - size_t alignedWidth = Simd::AlignLo(width, 4); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < alignedWidth; col += 4) - { - ++histograms[0][(4 + src[col + 0])*(mask[col + 0] == index)]; - ++histograms[1][(4 + src[col + 1])*(mask[col + 1] == index)]; - ++histograms[2][(4 + src[col + 2])*(mask[col + 2] == index)]; - ++histograms[3][(4 + src[col + 3])*(mask[col + 3] == index)]; - } - for (; col < width; ++col) - ++histograms[0][(4 + src[col + 0])*(mask[col + 0] == index)]; - - src += srcStride; - mask += maskStride; - } - for (size_t i = 0; i < HISTOGRAM_SIZE; ++i) - histogram[i] = histograms[0][4 + i] + histograms[1][4 + i] + histograms[2][4 + i] + histograms[3][4 + i]; - } - - template - void HistogramConditional(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * mask, size_t maskStride, uint8_t value, uint32_t * histogram) - { - uint32_t histograms[4][HISTOGRAM_SIZE + 4]; - memset(histograms, 0, sizeof(uint32_t)*(HISTOGRAM_SIZE + 4) * 4); - size_t alignedWidth = Simd::AlignLo(width, 4); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < alignedWidth; col += 4) - { - ++histograms[0][(4 + src[col + 0])*Compare8u(mask[col + 0], value)]; - ++histograms[1][(4 + src[col + 1])*Compare8u(mask[col + 1], value)]; - ++histograms[2][(4 + src[col + 2])*Compare8u(mask[col + 2], value)]; - ++histograms[3][(4 + src[col + 3])*Compare8u(mask[col + 3], value)]; - } - for (; col < width; ++col) - ++histograms[0][(4 + src[col + 0])*Compare8u(mask[col + 0], value)]; - - src 
+= srcStride; - mask += maskStride; - } - for (size_t i = 0; i < HISTOGRAM_SIZE; ++i) - histogram[i] = histograms[0][4 + i] + histograms[1][4 + i] + histograms[2][4 + i] + histograms[3][4 + i]; - } - - void HistogramConditional(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * mask, size_t maskStride, uint8_t value, SimdCompareType compareType, uint32_t * histogram) - { - switch (compareType) - { - case SimdCompareEqual: - return HistogramConditional(src, srcStride, width, height, mask, maskStride, value, histogram); - case SimdCompareNotEqual: - return HistogramConditional(src, srcStride, width, height, mask, maskStride, value, histogram); - case SimdCompareGreater: - return HistogramConditional(src, srcStride, width, height, mask, maskStride, value, histogram); - case SimdCompareGreaterOrEqual: - return HistogramConditional(src, srcStride, width, height, mask, maskStride, value, histogram); - case SimdCompareLesser: - return HistogramConditional(src, srcStride, width, height, mask, maskStride, value, histogram); - case SimdCompareLesserOrEqual: - return HistogramConditional(src, srcStride, width, height, mask, maskStride, value, histogram); - default: - assert(0); - } - } - - void NormalizedColors(const uint32_t * histogram, uint8_t * colors) - { - uint32_t integral[HISTOGRAM_SIZE], sum = 0, minCount = 0, minColor = 0; - for (size_t i = 0; i < HISTOGRAM_SIZE; ++i) - { - if (sum == 0 && histogram[i] != 0) - { - minCount = histogram[i]; - minColor = (uint32_t)i; - } - sum += histogram[i]; - integral[i] = sum; - } - - uint32_t norm = sum - minCount, term = (sum - minCount) / 2; - for (size_t i = 0; i < HISTOGRAM_SIZE; ++i) - colors[i] = i < minColor ? 0 : (norm ? (255 * (integral[i] - minCount) + term) / norm : minColor); - } - - void ChangeColors(const uint8_t * src, size_t srcStride, size_t width, size_t height, const uint8_t * colors, uint8_t * dst, size_t dstStride) - { - size_t alignedWidth = Simd::AlignLo(width, 4); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < alignedWidth; col += 4) - { - dst[col + 0] = colors[src[col + 0]]; - dst[col + 1] = colors[src[col + 1]]; - dst[col + 2] = colors[src[col + 2]]; - dst[col + 3] = colors[src[col + 3]]; - } - for (; col < width; ++col) - dst[col] = colors[src[col]]; - - src += srcStride; - dst += dstStride; - } - } - - void NormalizeHistogram(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride) - { - uint32_t histogram[HISTOGRAM_SIZE]; - Histogram(src, width, height, srcStride, histogram); - - uint8_t colors[HISTOGRAM_SIZE]; - NormalizedColors(histogram, colors); - - ChangeColors(src, srcStride, width, height, colors, dst, dstStride); - } - } -} diff --git a/src/3rd/Simd/Simd/SimdBaseHog.cpp b/src/3rd/Simd/Simd/SimdBaseHog.cpp deleted file mode 100644 index fa26e4fe..00000000 --- a/src/3rd/Simd/Simd/SimdBaseHog.cpp +++ /dev/null @@ -1,549 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2019 Yermalayeu Ihar. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdArray.h" - -namespace Simd -{ - namespace Base - { - namespace - { - struct Buffer - { - const int size; - float * cos, *sin; - int * index; - float * value; - - Buffer(size_t width, size_t quantization) - : size((int)quantization / 2) - { - _p = Allocate(width*(sizeof(int) + sizeof(float)) + sizeof(float) * 2 * size); - index = (int*)_p; - value = (float*)index + width; - cos = value + width; - sin = cos + size; - for (int i = 0; i < size; ++i) - { - cos[i] = (float)::cos(i*M_PI / size); - sin[i] = (float)::sin(i*M_PI / size); - } - } - - ~Buffer() - { - Free(_p); - } - - private: - void *_p; - }; - } - - void AddRowToHistograms(int * indexes, float * values, size_t row, size_t width, size_t height, size_t cellX, size_t cellY, size_t quantization, float * histograms) - { - int blockX = int(width / cellX); - int blockY = int(height / cellY); - int blockStride = int(quantization*blockX); - - float yp = ((float)row + 0.5f) / (float)cellY - 0.5f; - int iyp = (int)floor(yp); - float vy0 = yp - iyp; - float vy1 = 1.0f - vy0; - - size_t noseEnd = cellX / 2; - size_t bodyEnd = width - cellX / 2; - - if (iyp < 0) - { - float * h = histograms + (iyp + 1)*blockStride; - for (size_t col = 1; col < width - 1; ++col) - { - float value = values[col]; - int index = indexes[col]; - - float xp = ((float)col + 0.5f) / (float)cellX - 0.5f; - int ixp = (int)floor(xp); - float vx0 = xp - ixp; - float vx1 = 1.0f - vx0; - - if (ixp >= 0) - h[ixp*quantization + index] += vx1 * vy0*value; - if (ixp + 1 < blockX) - h[(ixp + 1)*quantization + index] += vx0 * vy0*value; - } - } - else if (iyp + 1 == blockY) - { - float * h = histograms + iyp * blockStride; - for (size_t col = 1; col < width - 1; ++col) - { - float value = values[col]; - int index = indexes[col]; - - float xp = ((float)col + 0.5f) / (float)cellX - 0.5f; - int ixp = (int)floor(xp); - float vx0 = xp - ixp; - float vx1 = 1.0f - vx0; - - if (ixp >= 0) - h[ixp*quantization + index] += vx1 * vy1*value; - if (ixp + 1 < blockX) - h[(ixp + 1)*quantization + index] += vx0 * vy1*value; - } - } - else - { - float * h0 = histograms + iyp * blockStride; - float * h1 = histograms + (iyp + 1)*blockStride; - size_t col = 1; - for (; col < noseEnd; ++col) - { - float value = values[col]; - int index = indexes[col]; - - float xp = ((float)col + 0.5f) / (float)cellX - 0.5f; - int ixp = (int)floor(xp); - float vx0 = xp - ixp; - - h0[(ixp + 1)*quantization + index] += vx0 * vy1*value; - h1[(ixp + 1)*quantization + index] += vx0 * vy0*value; - } - - for (; col < bodyEnd; ++col) - { - float value = values[col]; - int index = indexes[col]; - - float xp = ((float)col + 0.5f) / (float)cellX - 0.5f; - int ixp = (int)floor(xp); - float vx0 = xp - ixp; - float vx1 = 1.0f - vx0; - - h0[ixp*quantization + index] += vx1 * vy1*value; - h1[ixp*quantization + index] += vx1 * vy0*value; - h0[(ixp + 1)*quantization + index] += vx0 * vy1*value; - h1[(ixp + 1)*quantization + index] += vx0 * vy0*value; - } - - for (; col < width - 1; ++col) - { - float value = values[col]; - int index = indexes[col]; - - float xp = ((float)col + 0.5f) / (float)cellX - 0.5f; - int ixp = (int)floor(xp); - float vx0 = xp - ixp; - float vx1 = 1.0f - vx0; - - h0[ixp*quantization + index] += vx1 * vy1*value; - h1[ixp*quantization + index] += vx1 * vy0*value; - } - } - } - - void HogDirectionHistograms(const uint8_t * src, size_t stride, size_t width, size_t height, - size_t cellX, size_t cellY, size_t quantization, float * histograms) - { - assert(width%cellX == 0 && height%cellY == 0 && quantization % 2 == 0); - - Buffer buffer(width, quantization); - - memset(histograms, 0, quantization*(width / cellX)*(height / cellY) * sizeof(float)); - - for (size_t row = 1; row < height - 1; ++row) - { - const uint8_t * src1 = src + stride * row; - const uint8_t * src0 = src1 - stride; - const uint8_t * src2 = src1 + stride; - -#if 1 - for (size_t col = 1; col < width - 1; ++col) - { - float dy = (float)(src2[col] - src0[col]); - float dx = (float)(src1[col + 1] - src1[col - 1]); - float value = (float)::sqrt(dx*dx + dy * dy); - - float bestDot = 0; - int index = 0; - for (int direction = 0; direction < buffer.size; direction++) - { - float dot = buffer.cos[direction] * dx + buffer.sin[direction] * dy; - if (dot > bestDot) - { - bestDot = dot; - index = direction; - } - else if (-dot > bestDot) - { - bestDot = -dot; - index = direction + buffer.size; - } - } - - buffer.value[col] = value; - buffer.index[col] = index; - } -#else - size_t size = (buffer.size + 1) / 2; - for (size_t col = 1; col < width - 1; ++col) - { - float dy = (float)(src2[col] - src0[col]); - float dx = (float)(src1[col + 1] - src1[col - 1]); - float value = (float)::sqrt(dx*dx + dy * dy); - float ady = Simd::Abs(dy); - float adx = Simd::Abs(dx); - - float bestDot = 0; - int index = 0; - for (int direction = 0; direction < size; direction++) - { - float dot = buffer.cos[direction] * adx + buffer.sin[direction] * ady; - if (dot > bestDot) - { - bestDot = dot; - index = direction; - } - } - if (dx < 0) - index = buffer.size - index; - if (dy < 0 && index != 0) - index = buffer.size * 2 - index - (dx == 0); - - buffer.value[col] = value; - buffer.index[col] = index; - } -#endif - - AddRowToHistograms(buffer.index, buffer.value, row, width, height, cellX, cellY, quantization, histograms); - } - } - - class HogFeatureExtractor - { - static const size_t C = 8; - static const size_t Q = 9; - static const size_t Q2 = 18; - - size_t _sx, _sy, _hs; - - float _cos[5]; - float _sin[5]; - float _k[C]; - - Array32i _index; - Array32f _value; - Array32f _histogram; - Array32f _norm; - - void Init(size_t w, size_t h) - { - _sx = w / C; - _hs = _sx + 2; - _sy = h / C; - for (int i = 0; i < 5; ++i) - { - _cos[i] = (float)::cos(i*M_PI / Q); - _sin[i] = (float)::sin(i*M_PI / Q); - } - for (int i = 0; i < C; ++i) - _k[i] = float((1 + i * 2) / 16.0f); - _index.Resize(w); - _value.Resize(w); - _histogram.Resize((_sx + 2)*(_sy + 2)*Q2); - _norm.Resize((_sx + 2)*(_sy + 2)); - } - - void AddRowToHistogram(size_t row, size_t width, size_t height) - { - size_t iyp = (row - 4) / C; - float vy0 = _k[(row + 4) & 7]; - float vy1 = 1.0f - vy0; - float * h0 = _histogram.data + ((iyp + 1)*_hs + 0)*Q2; - float * h1 = _histogram.data + ((iyp + 2)*_hs + 0)*Q2; - for (size_t col = 1, n = C, i = 5; col < width - 1; i = 0, n = Simd::Min(C, width - col - 1)) - { - for (; i < n; ++i, ++col) - { - float value = _value[col]; - int index = _index[col]; - float vx0 = _k[i]; - float vx1 = 1.0f - vx0; - h0[index] += vx1 * vy1*value; - h1[index] += vx1 * vy0*value; - h0[Q2 + index] += vx0 * vy1*value; - h1[Q2 + index] += vx0 * vy0*value; - } - h0 += Q2; - h1 += Q2; - } - } - - void EstimateHistogram(const uint8_t * src, size_t stride, size_t width, size_t height) - { - _histogram.Clear(); - for (size_t row = 1; row < height - 1; ++row) - { - const uint8_t * src1 = src + stride * row; - const uint8_t * src0 = src1 - stride; - const uint8_t * src2 = src1 + stride; - - for (size_t col = 1; col < width - 1; ++col) - { - float dy = (float)(src2[col] - src0[col]); - float dx = (float)(src1[col + 1] - src1[col - 1]); - float value = (float)::sqrt(dx*dx + dy * dy); - float ady = Simd::Abs(dy); - float adx = Simd::Abs(dx); - - float bestDot = 0; - int index = 0; - for (int direction = 0; direction < 5; direction++) - { - float dot = _cos[direction] * adx + _sin[direction] * ady; - if (dot > bestDot) - { - bestDot = dot; - index = direction; - } - } - if (dx < 0) - index = Q - index; - if (dy < 0 && index != 0) - index = Q2 - index - (dx == 0); - - _value[col] = value; - _index[col] = index; - } - - AddRowToHistogram(row, width, height); - } - } - - void EstimateNorm() - { - _norm.Clear(); - for (size_t y = 0; y < _sy; ++y) - { - const float * ph = _histogram.data + ((y + 1)*_hs + 1)*Q2; - float * pn = _norm.data + (y + 1)*_hs + 1; - for (size_t x = 0; x < _sx; ++x) - { - const float * h = ph + x * Q2; - for (int o = 0; o < Q; ++o) - pn[x] += Simd::Square(h[o] + h[o + Q]); - } - } - } - - void ExtractFeatures(float * features) - { - float eps = 0.0001f; - for (size_t y = 0; y < _sy; y++) - { - for (size_t x = 0; x < _sx; x++) - { - float * dst = features + (y*_sx + x) * 31; - - float *psrc, n1, n2, n3, n4; - - float * p0 = _norm.data + y * _hs + x; - float * p1 = p0 + _hs; - float * p2 = p1 + _hs; - - n1 = 1.0f / sqrt(p1[1] + p1[2] + p2[1] + p2[2] + eps); - n2 = 1.0f / sqrt(p0[1] + p0[2] + p1[1] + p1[2] + eps); - n3 = 1.0f / sqrt(p1[0] + p1[1] + p2[0] + p2[1] + eps); - n4 = 1.0f / sqrt(p0[0] + p0[1] + p1[0] + p1[1] + eps); - - float t1 = 0; - float t2 = 0; - float t3 = 0; - float t4 = 0; - - psrc = _histogram.data + ((y + 1)*_hs + x + 1)*Q2; - for (int o = 0; o < Q2; o++) - { - float h1 = Simd::Min(*psrc * n1, 0.2f); - float h2 = Simd::Min(*psrc * n2, 0.2f); - float h3 = Simd::Min(*psrc * n3, 0.2f); - float h4 = Simd::Min(*psrc * n4, 0.2f); - *dst = 0.5f * (h1 + h2 + h3 + h4); - t1 += h1; - t2 += h2; - t3 += h3; - t4 += h4; - dst++; - psrc++; - } - - psrc = _histogram.data + ((y + 1)*_hs + x + 1)*Q2; - for (int o = 0; o < Q; o++) - { - float sum = *psrc + *(psrc + Q); - float h1 = Simd::Min(sum * n1, 0.2f); - float h2 = Simd::Min(sum * n2, 0.2f); - float h3 = Simd::Min(sum * n3, 0.2f); - float h4 = Simd::Min(sum * n4, 0.2f); - *dst = 0.5f * (h1 + h2 + h3 + h4); - dst++; - psrc++; - } - - *dst = 0.2357f * t1; - dst++; - *dst = 0.2357f * t2; - dst++; - *dst = 0.2357f * t3; - dst++; - *dst = 0.2357f * t4; - } - } - } - - public: - void Run(const uint8_t * src, size_t stride, size_t width, size_t height, float * features) - { - Init(width, height); - - EstimateHistogram(src, stride, width, height); - - EstimateNorm(); - - ExtractFeatures(features); - } - }; - - void HogExtractFeatures(const uint8_t * src, size_t stride, size_t width, size_t height, float * features) - { - assert(width % 8 == 0 && height % 8 == 0 && width >= 16 && height >= 16); - - HogFeatureExtractor extractor; - extractor.Run(src, stride, width, height, features); - } - - namespace HogSeparableFilter_Detail - { - template <int add> void Set(float & dst, float value); - - template <> SIMD_INLINE void Set<0>(float & dst, float value) - { - dst = value; - } - - template <> SIMD_INLINE void Set<1>(float & dst, float value) - { - dst += value; - } - } - - void HogDeinterleave(const float * src, size_t srcStride, size_t width, size_t height, size_t count, float ** dst, size_t dstStride) - { - for (size_t row = 0; row < height; ++row) - { - const float * psrc = src + row * srcStride; - size_t offset = row * dstStride; - for (size_t col = 0; col < width; ++col) - { - for (size_t i = 0; i < count; ++i) - dst[i][offset + col] = *psrc++; - } - } - } - - class HogSeparableFilter - { - typedef Array<float> Array32f; - - size_t _w, _h; - Array32f _buffer; - - void Init(size_t w, size_t h, size_t rs, size_t cs) - { - _w = w - rs + 1; - _h = h - cs + 1; - _buffer.Resize(_w*h); - } - - void FilterRows(const float * src, size_t srcStride, size_t width, size_t height, const float * filter, size_t size, float * dst, size_t dstStride) - { - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < width; ++col) - { - const float * s = src + col; - float sum = 0; - for (size_t i = 0; i < size; ++i) - sum += s[i] * filter[i]; - dst[col] = sum; - } - src += srcStride; - dst += dstStride; - } - } - - - template <int add> void FilterCols(const float * src, size_t srcStride, size_t width, size_t height, const float * filter, size_t size, float * dst, size_t dstStride) - { - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < width; ++col) - { - const float * s = src + col; - float sum = 0; - for (size_t i = 0; i < size; ++i) - sum += s[i*srcStride] * filter[i]; - HogSeparableFilter_Detail::Set<add>(dst[col], sum); - } - src += srcStride; - dst += dstStride; - } - } - - public: - - void Run(const float * src, size_t srcStride, size_t width, size_t height, - const float * rowFilter, size_t rowSize, const float * colFilter, size_t colSize, float * dst, size_t dstStride, int add) - { - Init(width, height, rowSize, colSize); - - FilterRows(src, srcStride, _w, height, rowFilter, rowSize, _buffer.data, _w); - - if (add) - FilterCols<1>(_buffer.data, _w, _w, _h, colFilter, colSize, dst, dstStride); - else - FilterCols<0>(_buffer.data, _w, _w, _h, colFilter, colSize, dst, dstStride); - } - }; - - void HogFilterSeparable(const float * src, size_t srcStride, size_t width, size_t height, - const float * rowFilter, size_t rowSize, const float * colFilter, size_t colSize, float * dst, size_t dstStride, int add) - { - assert(width >= rowSize - 1 && height >= colSize - 1); - - HogSeparableFilter filter; - filter.Run(src, srcStride, width, height, rowFilter, rowSize, colFilter, colSize, dst, dstStride, add); - } - } -} diff --git a/src/3rd/Simd/Simd/SimdBaseHogLite.cpp b/src/3rd/Simd/Simd/SimdBaseHogLite.cpp deleted file mode 100644 index e89f36dc..00000000 --- a/src/3rd/Simd/Simd/SimdBaseHogLite.cpp +++ /dev/null @@ -1,547 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2018 Yermalayeu Ihar.
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdArray.h" -#include "Simd/SimdUpdate.h" - -namespace Simd -{ - namespace Base - { - template <size_t cell> class HogLiteFeatureExtractor - { - static const size_t FQ = 8; - static const size_t HQ = FQ/2; - - typedef Array<int> Ints; - typedef Array<float> Floats; - - size_t _hx, _fx; - Ints _hi[2]; - Floats _hf[2], _nf[4]; - int _k0[cell], _k1[cell]; - - SIMD_INLINE void Init(size_t width) - { - _hx = width / cell; - _fx = _hx - 2; - for (size_t i = 0; i < cell; ++i) - { - _k0[i] = int(cell - i - 1) * 2 + 1; - _k1[i] = int(i) * 2 + 1; - } - for (size_t i = 0; i < 2; ++i) - { - _hi[i].Resize(_hx*FQ, true); - _hf[i].Resize(_hx*FQ); - } - for (size_t i = 0; i < 4; ++i) - _nf[i].Resize(_hx); - } - - SIMD_INLINE void UpdateIntegerHistogram(const uint8_t * src, size_t stride, size_t width, size_t rowI, size_t rowF) - { - int * h0 = _hi[(rowI + 0) & 1].data; - int * h1 = _hi[(rowI + 1) & 1].data; - int ky0 = _k0[rowF]; - int ky1 = _k1[rowF]; - for (size_t col = 0; col < width;) - { - for (size_t colF = 0; colF < cell; ++colF, ++col) - { - int dy = src[col + stride] - src[col - stride]; - int dx = src[col + 1] - src[col - 1]; - int adx = Abs(dx); - int ady = Abs(dy); - int value = RestrictRange(Max(adx, ady) + (Min(adx, ady) + 1) / 2); - - size_t index = (adx > ady ? 0 : 1); - index = (dx > 0 ? index : (HQ - 1) - index); - index = (dy > 0 ? index : (FQ - 1) - index); - - h0[00 + index] += value*_k0[colF] * ky0; - h1[00 + index] += value*_k0[colF] * ky1; - h0[FQ + index] += value*_k1[colF] * ky0; - h1[FQ + index] += value*_k1[colF] * ky1; - } - h0 += FQ; - h1 += FQ; - } - } - - SIMD_INLINE void UpdateFloatHistogram(size_t rowI) - { - const float k = 1.0f / Simd::Square(cell * 2); - Ints & hi = _hi[rowI & 1]; - Floats & hf = _hf[rowI & 1]; - Floats & nf = _nf[rowI & 3]; - - for (size_t i = 0; i < hi.size; ++i) - hf.data[i] = float(hi.data[i])*k; - hi.Clear(); - - const float * h = hf.data; - for (size_t x = 0; x < _hx; ++x, h += FQ) - { - float sum = 0; - for (int i = 0; i < HQ; ++i) - sum += Simd::Square(h[i] + h[i + HQ]); - nf.data[x] = sum; - } - } - - SIMD_INLINE void SetFeatures(size_t rowI, float * dst) - { - const float eps = 0.0001f; - float * hf = _hf[(rowI - 1) & 1].data + FQ; - float * p0 = _nf[(rowI - 2) & 3].data; - float * p1 = _nf[(rowI - 1) & 3].data; - float * p2 = _nf[(rowI - 0) & 3].data; - for (size_t x = 0; x < _fx; ++x, ++p0, ++p1, ++p2) - { - float n1 = 1.0f / sqrt(p1[1] + p1[2] + p2[1] + p2[2] + eps); - float n2 = 1.0f / sqrt(p0[1] + p0[2] + p1[1] + p1[2] + eps); - float n3 = 1.0f / sqrt(p1[0] + p1[1] + p2[0] + p2[1] + eps); - float n4 = 1.0f / sqrt(p0[0] + p0[1] + p1[0] + p1[1] + eps); - - float t1 = 0; - float t2 = 0; - float t3 = 0; - float t4 = 0; - - float * src = hf + FQ*x; - for (size_t o = 0; o < FQ; o++) - { - float h1 = Simd::Min(*src * n1, 0.2f); - float h2 = Simd::Min(*src * n2, 0.2f); - float h3 = Simd::Min(*src * n3, 0.2f); - float h4 = Simd::Min(*src * n4, 0.2f); - *dst++ = 0.5f * (h1 + h2 + h3 + h4); - t1 += h1; - t2 += h2; - t3 += h3; - t4 += h4; - src++; - } - - src = hf + FQ*x; - for (size_t o = 0; o < HQ; o++) - { - float sum = *src + *(src + HQ); - float h1 = Simd::Min(sum * n1, 0.2f); - float h2 = Simd::Min(sum * n2, 0.2f); - float h3 = Simd::Min(sum * n3, 0.2f); - float h4 = Simd::Min(sum * n4, 0.2f); - *dst++ = 0.5f * (h1 + h2 + h3 + h4); - src++; - } - - *dst++ = 0.2357f * t1; - *dst++ = 0.2357f * t2; - *dst++ = 0.2357f * t3; - *dst++ = 0.2357f * t4; - } - } - - public: - - void Run(const uint8_t * src, size_t srcStride, size_t width, size_t height, float * features, size_t featuresStride) - { - assert(cell == 8 || cell == 4); - assert(width >= cell * 3 && height >= cell * 3); - - Init(width); - - src += (srcStride + 1)*cell / 2; - height = (height/cell - 1)*cell; - width = (width/cell - 1)*cell; - - for (size_t row = 0; row < height; ++row) - { - size_t rowI = row / cell; - size_t rowF = row & (cell - 1); - UpdateIntegerHistogram(src, srcStride, width, rowI, rowF); - if (rowF == cell - 1) - { - UpdateFloatHistogram(rowI); - if (rowI >= 2) - { - SetFeatures(rowI, features); - features += featuresStride; - } - } - src += srcStride; - } - size_t rowI = height/cell; - UpdateFloatHistogram(rowI); - SetFeatures(rowI, features); - } - }; - - void HogLiteExtractFeatures(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t cell, float * features, size_t featuresStride) - { - if (cell == 4) - { - HogLiteFeatureExtractor<4> extractor; - extractor.Run(src, srcStride, width, height, features, featuresStride); - } - else - { - HogLiteFeatureExtractor<8> extractor; - extractor.Run(src, srcStride, width, height, features, featuresStride); - } - } - - class HogLiteFeatureFilter - { - void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, float * dst, size_t dstStride) - { - size_t filterStride = featureSize*filterWidth; - for (size_t dstRow = 0; dstRow < dstHeight; ++dstRow) - { - for (size_t dstCol = 0; dstCol < dstWidth; ++dstCol) - { - float sum = 0; - const float * pSrc = src + dstRow*srcStride + dstCol*featureSize; - const float * pFilter = filter; - for (size_t filterRow = 0; filterRow < filterHeight; ++filterRow) - { - for (size_t filterCol = 0; filterCol < filterStride; ++filterCol) - sum += pSrc[filterCol] * pFilter[filterCol]; - pSrc += srcStride; - pFilter += filterStride; - } - dst[dstCol] = sum; - } - dst += dstStride; - } - } - - void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) - { - size_t filterStride = featureSize*filterWidth; - for (size_t dstRow = 0; dstRow < dstHeight; ++dstRow) - { - for (size_t dstCol = 0; dstCol < dstWidth; ++dstCol) - { - if (mask[dstCol]) - { - float sum = 0; - const float * pSrc = src + dstRow*srcStride + dstCol*featureSize; - const float * pFilter = filter; - for (size_t filterRow = 0; filterRow < filterHeight; ++filterRow) - { - for (size_t filterCol = 0; filterCol < filterStride; ++filterCol) - sum += pSrc[filterCol] * pFilter[filterCol]; - pSrc += srcStride; - pFilter += filterStride; - } - dst[dstCol] = sum; - } - else - dst[dstCol] = -FLT_MAX; - } - dst += dstStride; - mask += maskStride; - } - } - public: - void Run(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) - { - assert(featureSize == 8 || featureSize == 16); - assert(srcWidth >= filterWidth && srcHeight >= filterHeight); - - size_t dstWidth = srcWidth - filterWidth + 1; - size_t dstHeight = srcHeight - filterHeight + 1; - if (mask) - Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterWidth, filterHeight, mask, maskStride, dst, dstStride); - else - Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterWidth, filterHeight, dst, dstStride); - } - }; - - void HogLiteFilterFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) - { - HogLiteFeatureFilter featureFilter; - featureFilter.Run(src, srcStride, srcWidth, srcHeight, featureSize, filter, filterWidth, filterHeight, mask, maskStride, dst, dstStride); - } - - class HogLiteFeatureResizer - { - typedef Array<int> Ints; - typedef Array<float> Floats; - - Ints _iy, _ix; - Floats _ky, _kx; - - void InitIndexWeight(size_t srcSize, size_t dstSize, size_t dstStep, Ints & indexes, Floats & weights) - { - indexes.Resize(dstSize); - weights.Resize(dstSize); - - float scale = float(srcSize) / float(dstSize); - for (size_t i = 0; i < dstSize; ++i) - { - float weight = (float)((i + 0.5f)*scale - 0.5f); - int index = (int)::floor(weight); - weight -= index; - if (index < 0) - { - index = 0; - weight = 0.0f; - } - if (index > (int)srcSize - 2) - { - index = (int)srcSize - 2; - weight = 1.0f; - } - indexes[i] = int(index*dstStep); - weights[i] = weight; - } - } - - void Resize(const float * src, size_t srcStride, size_t featureSize, float * dst, size_t dstStride, size_t dstWidth, size_t dstHeight) - { - for (size_t rowDst = 0; rowDst < dstHeight; ++rowDst) - { - float ky1 = _ky[rowDst]; - float ky0 = 1.0f - ky1; - const float * pSrc = src + _iy[rowDst]; - float * pDst = dst + rowDst*dstStride; - for (size_t colDst = 0; colDst < dstWidth; ++colDst, pDst += featureSize) - { - float kx1 = _kx[colDst]; - float kx0 = 1.0f - kx1; - float k00 = ky0*kx0, k01 = ky0*kx1, k10 = ky1*kx0, k11 = ky1*kx1; - const float * pSrc0 = pSrc + _ix[colDst]; - const float * pSrc1 = pSrc0 + srcStride; - for (size_t i = 0; i < featureSize; ++i) - pDst[i] = pSrc0[i] * k00 + pSrc0[i + featureSize] * k01 + pSrc1[i] * k10 + pSrc1[i + featureSize] * k11; - } - } - } - - public: - void Run(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, float * dst, size_t dstStride, size_t dstWidth, size_t dstHeight) - { - assert(featureSize == 8 || featureSize == 16); - - if (srcWidth == dstWidth && srcHeight == dstHeight) - { - size_t size = sizeof(float)*srcWidth*featureSize; - for (size_t row = 0; row < dstHeight; ++row) - memcpy(dst + row*dstStride, src + row*srcStride, size); - return; - } - - InitIndexWeight(srcWidth, dstWidth, featureSize, _ix, _kx); - InitIndexWeight(srcHeight, dstHeight, srcStride, _iy, _ky); - - Resize(src, srcStride, featureSize, dst, dstStride, dstWidth, dstHeight); - } - }; - - void HogLiteResizeFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, float * dst, size_t dstStride, size_t dstWidth, size_t dstHeight) - { - HogLiteFeatureResizer featureResizer; - featureResizer.Run(src, srcStride, srcWidth, srcHeight, featureSize, dst, dstStride, dstWidth, dstHeight); - } - - void HogLiteCompressFeatures(const float * src, size_t srcStride, size_t width, size_t height, const float * pca, float * dst, size_t dstStride) - { - for (size_t row = 0; row < height; ++row) - { - const float * s = src; - float * d = dst; - for (size_t col = 0; col < width; ++col) - { - const float * p = pca; - for (size_t i = 0; i < 8; ++i, p += 16) - { - float sum = 0; - for (size_t j = 0; j < 16; ++j) - sum += s[j] * p[j]; - d[i] = sum; - } - s += 16; - d += 8; - } - src += srcStride; - dst += dstStride; - } - } - - class HogLiteSeparableFilter - { - typedef Array<float> Array32f; - - size_t _dstWidth, _dstHeight; - Array32f _buffer; - - void Init(size_t srcWidth, size_t srcHeight, size_t hSize, size_t vSize) - { - _dstWidth = srcWidth - hSize + 1; - _dstHeight = srcHeight - vSize + 1; - _buffer.Resize(_dstWidth*srcHeight); - } - - void FilterH(const float * src, size_t srcStride, size_t width, size_t height, size_t step, const float * filter, size_t size, float * dst, size_t dstStride) - { - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < width; ++col) - { - const float * s = src + col*step; - float sum = 0; - for (size_t i = 0; i < size; ++i) - sum += s[i] * filter[i]; - dst[col] = sum; - } - src += srcStride; - dst += dstStride; - } - } - - template <UpdateType update> void FilterV(const float * src, size_t srcStride, size_t width, size_t height, const float * filter, size_t size, float * dst, size_t dstStride) - { - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < width; ++col) - { - const float * s = src + col; - float sum = 0; - for (size_t i = 0; i < size; ++i) - sum += s[i*srcStride] * filter[i]; - Update<update>(dst + col, sum); - } - src += srcStride; - dst += dstStride; - } - } - - public: - void Run(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * hFilter, size_t hSize, const float * vFilter, size_t vSize, float * dst, size_t dstStride, int add) - { - assert(featureSize == 8 || featureSize == 16); - assert(srcWidth >= hSize && srcHeight >= vSize); - - Init(srcWidth, srcHeight, hSize, vSize); - - FilterH(src, srcStride, _dstWidth, srcHeight, featureSize, hFilter, hSize*featureSize, _buffer.data, _dstWidth); - - if(add) - FilterV<UpdateAdd>(_buffer.data, _dstWidth, _dstWidth, _dstHeight, vFilter, vSize, dst, dstStride); - else - FilterV<UpdateSet>(_buffer.data, _dstWidth, _dstWidth, _dstHeight, vFilter, vSize, dst, dstStride); - } - }; - - void HogLiteFilterSeparable(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * hFilter, size_t hSize, const float * vFilter, size_t vSize, float * dst, size_t dstStride, int add) - { - HogLiteSeparableFilter filter; - filter.Run(src, srcStride, srcWidth, srcHeight, featureSize, hFilter, hSize, vFilter, vSize, dst, dstStride, add); - } - - void HogLiteFindMax7x7(const float * a, size_t aStride, const float * b, size_t bStride, size_t height, float * pValue, size_t * pCol, size_t * pRow) - { - *pValue = -FLT_MAX; - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < 7; ++col) - { - float value = a[col] + b[col]; - if (value > *pValue) - { - *pValue = value; - *pCol = col; - *pRow = row; - } - } - a += aStride; - b += bStride; - } - } - - template <size_t size> void HogLiteCreateMask(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, const float * threshold, size_t scale, uint32_t * dst, size_t dstStride) - { - size_t dstStartEnd = size - scale; - size_t dstRowSize = (srcWidth*scale + size - scale) * sizeof(uint32_t); - for (size_t srcRow = 0; srcRow < srcHeight; ++srcRow) - { - for (size_t dstRow = 0; dstRow < scale; ++dstRow) - memset(dst + (dstStartEnd + dstRow)*dstStride, 0, dstRowSize); - - for (size_t srcCol = 0; srcCol < srcWidth; ++srcCol) - { - if (src[srcCol] > *threshold) - { - uint32_t * pDst = dst + srcCol * scale; - for (size_t dstRow = 0; dstRow < size; ++dstRow) - { - memset(pDst, -1, size * sizeof(uint32_t)); - pDst += dstStride; - } - } - } - src += srcStride; - dst += dstStride*scale; - } - } - - void HogLiteCreateMask(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, const float * threshold, size_t scale, size_t size, uint32_t * dst, size_t dstStride) - { - size_t dstStartEnd = size - scale; - size_t dstRowSize = (srcWidth*scale + size - scale) * sizeof(uint32_t); - for (size_t dstRow = 0; dstRow < dstStartEnd; ++dstRow) - memset(dst + dstRow*dstStride, 0, dstRowSize); - - switch (size) - { - case 7: HogLiteCreateMask<7>(src, srcStride, srcWidth, srcHeight, threshold, scale, dst, dstStride); return; - default: break; - } - - for (size_t srcRow = 0; srcRow < srcHeight; ++srcRow) - { - for (size_t dstRow = 0; dstRow < scale; ++dstRow) - memset(dst + (dstStartEnd + dstRow)*dstStride, 0, dstRowSize); - - for (size_t srcCol = 0; srcCol < srcWidth; ++srcCol) - { - if (src[srcCol] > *threshold) - { - uint32_t * pDst = dst + srcCol * scale; - for (size_t dstRow = 0; dstRow < size; ++dstRow) - { - for (size_t dstCol = 0; dstCol < size; ++dstCol) - pDst[dstCol] = -1; - pDst += dstStride; - } - } - } - src += srcStride; - dst += dstStride*scale; - } - } - } -} diff --git a/src/3rd/Simd/Simd/SimdBaseInt16ToGray.cpp b/src/3rd/Simd/Simd/SimdBaseInt16ToGray.cpp deleted file mode 100644 index 3e89b1b2..00000000 --- a/src/3rd/Simd/Simd/SimdBaseInt16ToGray.cpp +++ /dev/null @@ -1,47 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd).
-* -* Copyright (c) 2011-2017 Yermalayeu Ihar, -* 2014-2016 Antonenka Mikhail. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMath.h" - -namespace Simd -{ - namespace Base - { - static void Int16ToGray(const int16_t * src, size_t width, size_t height, size_t srcStride, uint8_t * dst, size_t dstStride) - { - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < width; ++col) - dst[col] = RestrictRange(src[col]); - src += srcStride; - dst += dstStride; - } - } - - void Int16ToGray(const uint8_t * src, size_t width, size_t height, size_t srcStride, uint8_t * dst, size_t dstStride) - { - Int16ToGray((const int16_t *)src, width, height, srcStride / sizeof(int16_t), dst, dstStride); - } - } -} diff --git a/src/3rd/Simd/Simd/SimdBaseIntegral.cpp b/src/3rd/Simd/Simd/SimdBaseIntegral.cpp deleted file mode 100644 index 5a85e4af..00000000 --- a/src/3rd/Simd/Simd/SimdBaseIntegral.cpp +++ /dev/null @@ -1,87 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE.
-*/ -#include "Simd/SimdIntegral.h" - -namespace Simd -{ - namespace Base - { - void Integral(const uint8_t * src, size_t srcStride, size_t width, size_t height, - uint8_t * sum, size_t sumStride, uint8_t * sqsum, size_t sqsumStride, uint8_t * tilted, size_t tiltedStride, - SimdPixelFormatType sumFormat, SimdPixelFormatType sqsumFormat) - { - assert(sumFormat == SimdPixelFormatInt32 && sumStride % sizeof(uint32_t) == 0); - if (tilted) - assert(tiltedStride % sizeof(uint32_t) == 0); - - if (sqsum) - { - if (tilted) - { - switch (sqsumFormat) - { - case SimdPixelFormatInt32: - IntegralSumSqsumTilted(src, srcStride, width, height, - (uint32_t*)sum, sumStride / sizeof(uint32_t), (uint32_t*)sqsum, sqsumStride / sizeof(uint32_t), (uint32_t*)tilted, tiltedStride / sizeof(uint32_t)); - break; - case SimdPixelFormatDouble: - IntegralSumSqsumTilted(src, srcStride, width, height, - (uint32_t*)sum, sumStride / sizeof(uint32_t), (double*)sqsum, sqsumStride / sizeof(double), (uint32_t*)tilted, tiltedStride / sizeof(uint32_t)); - break; - default: - assert(0); - } - } - else - { - switch (sqsumFormat) - { - case SimdPixelFormatInt32: - IntegralSumSqsum(src, srcStride, width, height, - (uint32_t*)sum, sumStride / sizeof(uint32_t), (uint32_t*)sqsum, sqsumStride / sizeof(uint32_t)); - break; - case SimdPixelFormatDouble: - IntegralSumSqsum(src, srcStride, width, height, - (uint32_t*)sum, sumStride / sizeof(uint32_t), (double*)sqsum, sqsumStride / sizeof(double)); - break; - default: - assert(0); - } - } - } - else - { - if (tilted) - { - IntegralSumTilted(src, srcStride, width, height, - (uint32_t*)sum, sumStride / sizeof(uint32_t), (uint32_t*)tilted, tiltedStride / sizeof(uint32_t)); - } - else - { - IntegralSum(src, srcStride, width, height, (uint32_t*)sum, sumStride / sizeof(uint32_t)); - } - } - } - } -} diff --git a/src/3rd/Simd/Simd/SimdBaseInterference.cpp b/src/3rd/Simd/Simd/SimdBaseInterference.cpp deleted file mode 100644 index 2cd93865..00000000 --- a/src/3rd/Simd/Simd/SimdBaseInterference.cpp +++ /dev/null @@ -1,78 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdMath.h" - -namespace Simd -{ - namespace Base - { - void InterferenceIncrement(uint8_t * statistic, size_t stride, size_t width, size_t height, uint8_t increment, int16_t saturation) - { - for (size_t row = 0; row < height; ++row) - { - int16_t * s = (int16_t *)statistic; - for (size_t col = 0; col < width; ++col) - s[col] = Min(s[col] + increment, saturation); - statistic += stride; - } - } - - void InterferenceIncrementMasked(uint8_t * statistic, size_t statisticStride, size_t width, size_t height, - uint8_t increment, int16_t saturation, const uint8_t * mask, size_t maskStride, uint8_t index) - { - for (size_t row = 0; row < height; ++row) - { - int16_t * s = (int16_t *)statistic; - for (size_t col = 0; col < width; ++col) - s[col] = Min(s[col] + (mask[col] == index ? increment : 0), saturation); - statistic += statisticStride; - mask += maskStride; - } - } - - void InterferenceDecrement(uint8_t * statistic, size_t stride, size_t width, size_t height, uint8_t decrement, int16_t saturation) - { - for (size_t row = 0; row < height; ++row) - { - int16_t * s = (int16_t *)statistic; - for (size_t col = 0; col < width; ++col) - s[col] = Max(s[col] - decrement, saturation); - statistic += stride; - } - } - - void InterferenceDecrementMasked(uint8_t * statistic, size_t statisticStride, size_t width, size_t height, - uint8_t decrement, int16_t saturation, const uint8_t * mask, size_t maskStride, uint8_t index) - { - for (size_t row = 0; row < height; ++row) - { - int16_t * s = (int16_t *)statistic; - for (size_t col = 0; col < width; ++col) - s[col] = Max(s[col] - (mask[col] == index ? decrement : 0), saturation); - statistic += statisticStride; - mask += maskStride; - } - } - } -} diff --git a/src/3rd/Simd/Simd/SimdBaseInterleave.cpp b/src/3rd/Simd/Simd/SimdBaseInterleave.cpp deleted file mode 100644 index 2fda8bd0..00000000 --- a/src/3rd/Simd/Simd/SimdBaseInterleave.cpp +++ /dev/null @@ -1,81 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdDefs.h" - -namespace Simd -{ - namespace Base - { - void InterleaveUv(const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, size_t width, size_t height, uint8_t * uv, size_t uvStride) - { - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0, offset = 0; col < width; ++col, offset += 2) - { - uv[offset] = u[col]; - uv[offset + 1] = v[col]; - } - u += uStride; - v += vStride; - uv += uvStride; - } - } - - void InterleaveBgr(const uint8_t * b, size_t bStride, const uint8_t * g, size_t gStride, const uint8_t * r, size_t rStride, size_t width, size_t height, uint8_t * bgr, size_t bgrStride) - { - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0, offset = 0; col < width; ++col, offset += 3) - { - bgr[offset + 0] = b[col]; - bgr[offset + 1] = g[col]; - bgr[offset + 2] = r[col]; - } - b += bStride; - g += gStride; - r += rStride; - bgr += bgrStride; - } - } - - void InterleaveBgra(const uint8_t * b, size_t bStride, const uint8_t * g, size_t gStride, const uint8_t * r, size_t rStride, const uint8_t * a, size_t aStride, size_t width, size_t height, uint8_t * bgra, size_t bgraStride) - { - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0, offset = 0; col < width; ++col, offset += 4) - { - bgra[offset + 0] = b[col]; - bgra[offset + 1] = g[col]; - bgra[offset + 2] = r[col]; - bgra[offset + 3] = a[col]; - } - b += bStride; - g += gStride; - r += rStride; - a += aStride; - bgra += bgraStride; - } - } - } -} diff --git a/src/3rd/Simd/Simd/SimdBaseLaplace.cpp b/src/3rd/Simd/Simd/SimdBaseLaplace.cpp deleted file mode 100644 index 2f33521f..00000000 --- a/src/3rd/Simd/Simd/SimdBaseLaplace.cpp +++ /dev/null @@ -1,117 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdMath.h" - -namespace Simd -{ - namespace Base - { - template int Laplace(const uint8_t *s0, const uint8_t *s1, const uint8_t *s2, size_t x0, size_t x1, size_t x2); - - template <> SIMD_INLINE int Laplace(const uint8_t *s0, const uint8_t *s1, const uint8_t *s2, size_t x0, size_t x1, size_t x2) - { - return 8 * s1[x1] - (s0[x0] + s0[x1] + s0[x2] + s1[x0] + s1[x2] + s2[x0] + s2[x1] + s2[x2]); - } - - template <> SIMD_INLINE int Laplace(const uint8_t *s0, const uint8_t *s1, const uint8_t *s2, size_t x0, size_t x1, size_t x2) - { - return Simd::Abs(Laplace(s0, s1, s2, x0, x1, x2)); - } - - template void Laplace(const uint8_t * src, size_t srcStride, size_t width, size_t height, int16_t * dst, size_t dstStride) - { - assert(width > 1); - - const uint8_t *src0, *src1, *src2; - - for (size_t row = 0; row < height; ++row) - { - src0 = src + srcStride*(row - 1); - src1 = src0 + srcStride; - src2 = src1 + srcStride; - if (row == 0) - src0 = src1; - if (row == height - 1) - src2 = src1; - - dst[0] = Laplace(src0, src1, src2, 0, 0, 1); - - for (size_t col = 1; col < width - 1; ++col) - dst[col] = Laplace(src0, src1, src2, col - 1, col, col + 1); - - dst[width - 1] = Laplace(src0, src1, src2, width - 2, width - 1, width - 1); - - dst += dstStride; - } - } - - void Laplace(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride) - { - assert(dstStride % sizeof(int16_t) == 0); - - Laplace(src, srcStride, width, height, (int16_t *)dst, dstStride / sizeof(int16_t)); - } - - void LaplaceAbs(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride) - { - assert(dstStride % sizeof(int16_t) == 0); - - Laplace(src, srcStride, width, height, (int16_t *)dst, dstStride / sizeof(int16_t)); - } - - void LaplaceAbsSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum) - { - assert(width > 1); - - const uint8_t *src0, *src1, *src2; - - *sum = 0; - for (size_t row = 0; row < height; ++row) - { - src0 = src + stride*(row - 1); - src1 = src0 + stride; - src2 = src1 + stride; - if (row == 0) - src0 = src1; - if (row == height - 1) - src2 = src1; - -#ifdef __GNUC__ - size_t rowSum = 0; -#else - uint32_t rowSum = 0; -#endif - - rowSum += Laplace(src0, src1, src2, 0, 0, 1); - - for (size_t col = 1; col < width - 1; ++col) - rowSum += Laplace(src0, src1, src2, col - 1, col, col + 1); - - rowSum += Laplace(src0, src1, src2, width - 2, width - 1, width - 1); - - *sum += rowSum; - } - } - } -} diff --git a/src/3rd/Simd/Simd/SimdBaseLbp.cpp b/src/3rd/Simd/Simd/SimdBaseLbp.cpp deleted file mode 100644 index 59d833c6..00000000 --- a/src/3rd/Simd/Simd/SimdBaseLbp.cpp +++ /dev/null @@ -1,65 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdDefs.h" - -namespace Simd -{ - namespace Base - { - SIMD_INLINE int LbpEstimate(const uint8_t * src, ptrdiff_t stride) - { - int threshold = src[0]; - int lbp = 0; - lbp |= (src[-stride - 1] >= threshold ? 0x01 : 0); - lbp |= (src[-stride] >= threshold ? 0x02 : 0); - lbp |= (src[-stride + 1] >= threshold ? 0x04 : 0); - lbp |= (src[1] >= threshold ? 0x08 : 0); - lbp |= (src[stride + 1] >= threshold ? 0x10 : 0); - lbp |= (src[stride] >= threshold ? 0x20 : 0); - lbp |= (src[stride - 1] >= threshold ? 0x40 : 0); - lbp |= (src[-1] >= threshold ? 0x80 : 0); - return lbp; - } - - void LbpEstimate(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride) - { - memset(dst, 0, width); - src += srcStride; - dst += dstStride; - for (size_t row = 2; row < height; ++row) - { - dst[0] = 0; - for (size_t col = 1; col < width - 1; ++col) - { - dst[col] = LbpEstimate(src + col, srcStride); - } - dst[width - 1] = 0; - - src += srcStride; - dst += dstStride; - } - memset(dst, 0, width); - } - } -} diff --git a/src/3rd/Simd/Simd/SimdBaseMeanFilter3x3.cpp b/src/3rd/Simd/Simd/SimdBaseMeanFilter3x3.cpp deleted file mode 100644 index 39f5a3c1..00000000 --- a/src/3rd/Simd/Simd/SimdBaseMeanFilter3x3.cpp +++ /dev/null @@ -1,70 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdConst.h" - -namespace Simd -{ - namespace Base - { - SIMD_INLINE int DivideBy9(int value) - { - return ((value + 5)*DIVISION_BY_9_FACTOR) >> DIVISION_BY_9_SHIFT; - } - - SIMD_INLINE int MeanFilter3x3(const uint8_t *s0, const uint8_t *s1, const uint8_t *s2, size_t x0, size_t x1, size_t x2) - { - return DivideBy9(s0[x0] + s0[x1] + s0[x2] + s1[x0] + s1[x1] + s1[x2] + s2[x0] + s2[x1] + s2[x2]); - } - - void MeanFilter3x3(const uint8_t * src, size_t srcStride, size_t width, size_t height, - size_t channelCount, uint8_t * dst, size_t dstStride) - { - const uint8_t *src0, *src1, *src2; - - size_t size = channelCount*width; - for (size_t row = 0; row < height; ++row) - { - src0 = src + srcStride*(row - 1); - src1 = src0 + srcStride; - src2 = src1 + srcStride; - if (row == 0) - src0 = src1; - if (row == height - 1) - src2 = src1; - - size_t col = 0; - for (; col < channelCount; col++) - dst[col] = MeanFilter3x3(src0, src1, src2, col, col, col + channelCount); - - for (; col < size - channelCount; ++col) - dst[col] = MeanFilter3x3(src0, src1, src2, col - channelCount, col, col + channelCount); - - for (; col < size; col++) - dst[col] = MeanFilter3x3(src0, src1, src2, col - channelCount, col, col); - - dst += dstStride; - } - } - } -} diff --git a/src/3rd/Simd/Simd/SimdBaseMedianFilter.cpp b/src/3rd/Simd/Simd/SimdBaseMedianFilter.cpp deleted file mode 100644 index 44501310..00000000 --- a/src/3rd/Simd/Simd/SimdBaseMedianFilter.cpp +++ /dev/null @@ -1,394 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdMath.h" - -namespace Simd -{ - namespace Base - { - SIMD_INLINE void LoadRhomb3x3(const uint8_t * y[3], size_t x[3], int a[5]) - { - a[0] = y[0][x[1]]; - a[1] = y[1][x[0]]; a[2] = y[1][x[1]]; a[3] = y[1][x[2]]; - a[4] = y[2][x[1]]; - } - - SIMD_INLINE void PartialSort5(int a[5]) - { - SortU8(a[2], a[3]); - SortU8(a[1], a[2]); - SortU8(a[2], a[3]); - a[4] = MaxU8(a[1], a[4]); - a[0] = MinU8(a[0], a[3]); - SortU8(a[2], a[0]); - a[2] = MaxU8(a[4], a[2]); - a[2] = MinU8(a[2], a[0]); - } - - void MedianFilterRhomb3x3(const uint8_t * src, size_t srcStride, size_t width, size_t height, - size_t channelCount, uint8_t * dst, size_t dstStride) - { - int a[5]; - const uint8_t * y[3]; - size_t x[3]; - - size_t size = channelCount*width; - for (size_t row = 0; row < height; ++row, dst += dstStride) - { - y[0] = src + srcStride*(row - 1); - y[1] = y[0] + srcStride; - y[2] = y[1] + srcStride; - if (row < 1) - y[0] = y[1]; - if (row >= height - 1) - y[2] = y[1]; - - for (size_t col = 0; col < 2 * channelCount; col++) - { - x[0] = col < channelCount ? col : size - 3 * channelCount + col; - x[2] = col < channelCount ? col + channelCount : size - 2 * channelCount + col; - x[1] = col < channelCount ? x[0] : x[2]; - - LoadRhomb3x3(y, x, a); - PartialSort5(a); - dst[x[1]] = (uint8_t)a[2]; - } - - for (size_t col = channelCount; col < size - channelCount; ++col) - { - x[0] = col - channelCount; - x[1] = col; - x[2] = col + channelCount; - - LoadRhomb3x3(y, x, a); - PartialSort5(a); - dst[col] = (uint8_t)a[2]; - } - } - } - - SIMD_INLINE void LoadSquare3x3(const uint8_t * y[3], size_t x[3], int a[9]) - { - a[0] = y[0][x[0]]; a[1] = y[0][x[1]]; a[2] = y[0][x[2]]; - a[3] = y[1][x[0]]; a[4] = y[1][x[1]]; a[5] = y[1][x[2]]; - a[6] = y[2][x[0]]; a[7] = y[2][x[1]]; a[8] = y[2][x[2]]; - } - - SIMD_INLINE void PartialSort9(int a[9]) - { - SortU8(a[1], a[2]); SortU8(a[4], a[5]); SortU8(a[7], a[8]); - SortU8(a[0], a[1]); SortU8(a[3], a[4]); SortU8(a[6], a[7]); - SortU8(a[1], a[2]); SortU8(a[4], a[5]); SortU8(a[7], a[8]); - a[3] = MaxU8(a[0], a[3]); - a[5] = MinU8(a[5], a[8]); - SortU8(a[4], a[7]); - a[6] = MaxU8(a[3], a[6]); - a[4] = MaxU8(a[1], a[4]); - a[2] = MinU8(a[2], a[5]); - a[4] = MinU8(a[4], a[7]); - SortU8(a[4], a[2]); - a[4] = MaxU8(a[6], a[4]); - a[4] = MinU8(a[4], a[2]); - } - - void MedianFilterSquare3x3(const uint8_t * src, size_t srcStride, size_t width, size_t height, - size_t channelCount, uint8_t * dst, size_t dstStride) - { - int a[9]; - const uint8_t * y[3]; - size_t x[3]; - - size_t size = channelCount*width; - for (size_t row = 0; row < height; ++row, dst += dstStride) - { - y[0] = src + srcStride*(row - 1); - y[1] = y[0] + srcStride; - y[2] = y[1] + srcStride; - if (row < 1) - y[0] = y[1]; - if (row >= height - 1) - y[2] = y[1]; - - for (size_t col = 0; col < 2 * channelCount; col++) - { - x[0] = col < channelCount ? col : size - 3 * channelCount + col; - x[2] = col < channelCount ? col + channelCount : size - 2 * channelCount + col; - x[1] = col < channelCount ? 
x[0] : x[2]; - - LoadSquare3x3(y, x, a); - PartialSort9(a); - dst[x[1]] = (uint8_t)a[4]; - } - - for (size_t col = channelCount; col < size - channelCount; ++col) - { - x[0] = col - channelCount; - x[1] = col; - x[2] = col + channelCount; - - LoadSquare3x3(y, x, a); - PartialSort9(a); - dst[col] = (uint8_t)a[4]; - } - } - } - - SIMD_INLINE void LoadRhomb5x5(const uint8_t * y[5], size_t x[5], int a[13]) - { - a[0] = y[0][x[2]]; - a[1] = y[1][x[1]]; a[2] = y[1][x[2]]; a[3] = y[1][x[3]]; - a[4] = y[2][x[0]]; a[5] = y[2][x[1]]; a[6] = y[2][x[2]]; a[7] = y[2][x[3]]; a[8] = y[2][x[4]]; - a[9] = y[3][x[1]]; a[10] = y[3][x[2]]; a[11] = y[3][x[3]]; - a[12] = y[4][x[2]]; - } - - SIMD_INLINE void PartialSort13(int a[13]) - { - SortU8(a[0], a[1]); SortU8(a[3], a[4]); SortU8(a[2], a[4]); - SortU8(a[2], a[3]); SortU8(a[6], a[7]); SortU8(a[5], a[7]); - SortU8(a[5], a[6]); SortU8(a[9], a[10]); SortU8(a[8], a[10]); - SortU8(a[8], a[9]); SortU8(a[11], a[12]); SortU8(a[5], a[8]); - SortU8(a[2], a[8]); SortU8(a[2], a[5]); SortU8(a[6], a[9]); - SortU8(a[3], a[9]); SortU8(a[3], a[6]); SortU8(a[7], a[10]); - SortU8(a[4], a[10]); SortU8(a[4], a[7]); SortU8(a[3], a[12]); - SortU8(a[0], a[9]); - a[1] = MinU8(a[1], a[10]); - a[1] = MinU8(a[1], a[7]); - a[1] = MinU8(a[1], a[9]); - a[11] = MaxU8(a[5], a[11]); - a[11] = MaxU8(a[3], a[11]); - a[11] = MaxU8(a[2], a[11]); - SortU8(a[0], a[6]); SortU8(a[1], a[8]); SortU8(a[6], a[8]); - a[4] = MinU8(a[4], a[8]); - SortU8(a[0], a[1]); SortU8(a[4], a[6]); SortU8(a[0], a[4]); - a[11] = MaxU8(a[0], a[11]); - SortU8(a[6], a[11]); - a[1] = MinU8(a[1], a[11]); - SortU8(a[1], a[4]); SortU8(a[6], a[12]); - a[6] = MaxU8(a[1], a[6]); - a[4] = MinU8(a[4], a[12]); - a[6] = MaxU8(a[4], a[6]); - } - - void MedianFilterRhomb5x5(const uint8_t * src, size_t srcStride, size_t width, size_t height, - size_t channelCount, uint8_t * dst, size_t dstStride) - { - int a[13]; - const uint8_t * y[5]; - size_t x[5]; - - size_t size = channelCount*width; - for (size_t row = 0; row < height; ++row, dst += dstStride) - { - y[0] = src + srcStride*(row - 2); - y[1] = y[0] + srcStride; - y[2] = y[1] + srcStride; - y[3] = y[2] + srcStride; - y[4] = y[3] + srcStride; - if (row < 2) - { - if (row < 1) - y[1] = y[2]; - y[0] = y[1]; - } - if (row >= height - 2) - { - if (row >= height - 1) - y[3] = y[2]; - y[4] = y[3]; - } - - for (size_t col = 0; col < 4 * channelCount; col++) - { - if (col < 2 * channelCount) - { - x[0] = col < channelCount ? col : col - channelCount; - x[1] = x[0]; - x[2] = col; - x[3] = x[2] + channelCount; - x[4] = x[3] + channelCount; - } - else - { - x[0] = size - 6 * channelCount + col; - x[1] = x[0] + channelCount; - x[2] = x[1] + channelCount; - x[3] = col < 3 * channelCount ? 
x[2] + channelCount : x[2]; - x[4] = x[3]; - } - - LoadRhomb5x5(y, x, a); - PartialSort13(a); - dst[x[2]] = (uint8_t)a[6]; - } - - for (size_t col = 2 * channelCount; col < size - 2 * channelCount; ++col) - { - x[0] = col - 2 * channelCount; - x[1] = col - channelCount; - x[2] = col; - x[3] = col + channelCount; - x[4] = col + 2 * channelCount; - - LoadRhomb5x5(y, x, a); - PartialSort13(a); - dst[col] = (uint8_t)a[6]; - } - } - } - - SIMD_INLINE void LoadSquare5x5(const uint8_t * y[5], size_t x[5], int a[25]) - { - a[0] = y[0][x[0]]; a[1] = y[0][x[1]]; a[2] = y[0][x[2]]; a[3] = y[0][x[3]]; a[4] = y[0][x[4]]; - a[5] = y[1][x[0]]; a[6] = y[1][x[1]]; a[7] = y[1][x[2]]; a[8] = y[1][x[3]]; a[9] = y[1][x[4]]; - a[10] = y[2][x[0]]; a[11] = y[2][x[1]]; a[12] = y[2][x[2]]; a[13] = y[2][x[3]]; a[14] = y[2][x[4]]; - a[15] = y[3][x[0]]; a[16] = y[3][x[1]]; a[17] = y[3][x[2]]; a[18] = y[3][x[3]]; a[19] = y[3][x[4]]; - a[20] = y[4][x[0]]; a[21] = y[4][x[1]]; a[22] = y[4][x[2]]; a[23] = y[4][x[3]]; a[24] = y[4][x[4]]; - } - - SIMD_INLINE void PartialSort25(int a[25]) - { - SortU8(a[0], a[1]); SortU8(a[3], a[4]); SortU8(a[2], a[4]); - SortU8(a[2], a[3]); SortU8(a[6], a[7]); SortU8(a[5], a[7]); - SortU8(a[5], a[6]); SortU8(a[9], a[10]); SortU8(a[8], a[10]); - SortU8(a[8], a[9]); SortU8(a[12], a[13]); SortU8(a[11], a[13]); - SortU8(a[11], a[12]); SortU8(a[15], a[16]); SortU8(a[14], a[16]); - SortU8(a[14], a[15]); SortU8(a[18], a[19]); SortU8(a[17], a[19]); - SortU8(a[17], a[18]); SortU8(a[21], a[22]); SortU8(a[20], a[22]); - SortU8(a[20], a[21]); SortU8(a[23], a[24]); SortU8(a[2], a[5]); - SortU8(a[3], a[6]); SortU8(a[0], a[6]); SortU8(a[0], a[3]); - SortU8(a[4], a[7]); SortU8(a[1], a[7]); SortU8(a[1], a[4]); - SortU8(a[11], a[14]); SortU8(a[8], a[14]); SortU8(a[8], a[11]); - SortU8(a[12], a[15]); SortU8(a[9], a[15]); SortU8(a[9], a[12]); - SortU8(a[13], a[16]); SortU8(a[10], a[16]); SortU8(a[10], a[13]); - SortU8(a[20], a[23]); SortU8(a[17], a[23]); SortU8(a[17], a[20]); - SortU8(a[21], a[24]); SortU8(a[18], a[24]); SortU8(a[18], a[21]); - SortU8(a[19], a[22]); SortU8(a[9], a[18]); SortU8(a[0], a[18]); - a[17] = MaxU8(a[8], a[17]); - a[9] = MaxU8(a[0], a[9]); - SortU8(a[10], a[19]); SortU8(a[1], a[19]); SortU8(a[1], a[10]); - SortU8(a[11], a[20]); SortU8(a[2], a[20]); SortU8(a[12], a[21]); - a[11] = MaxU8(a[2], a[11]); - SortU8(a[3], a[21]); SortU8(a[3], a[12]); SortU8(a[13], a[22]); - a[4] = MinU8(a[4], a[22]); - SortU8(a[4], a[13]); SortU8(a[14], a[23]); - SortU8(a[5], a[23]); SortU8(a[5], a[14]); SortU8(a[15], a[24]); - a[6] = MinU8(a[6], a[24]); - SortU8(a[6], a[15]); - a[7] = MinU8(a[7], a[16]); - a[7] = MinU8(a[7], a[19]); - a[13] = MinU8(a[13], a[21]); - a[15] = MinU8(a[15], a[23]); - a[7] = MinU8(a[7], a[13]); - a[7] = MinU8(a[7], a[15]); - a[9] = MaxU8(a[1], a[9]); - a[11] = MaxU8(a[3], a[11]); - a[17] = MaxU8(a[5], a[17]); - a[17] = MaxU8(a[11], a[17]); - a[17] = MaxU8(a[9], a[17]); - SortU8(a[4], a[10]); - SortU8(a[6], a[12]); SortU8(a[7], a[14]); SortU8(a[4], a[6]); - a[7] = MaxU8(a[4], a[7]); - SortU8(a[12], a[14]); - a[10] = MinU8(a[10], a[14]); - SortU8(a[6], a[7]); SortU8(a[10], a[12]); SortU8(a[6], a[10]); - a[17] = MaxU8(a[6], a[17]); - SortU8(a[12], a[17]); - a[7] = MinU8(a[7], a[17]); - SortU8(a[7], a[10]); SortU8(a[12], a[18]); - a[12] = MaxU8(a[7], a[12]); - a[10] = MinU8(a[10], a[18]); - SortU8(a[12], a[20]); - a[10] = MinU8(a[10], a[20]); - a[12] = MaxU8(a[10], a[12]); - } - - void MedianFilterSquare5x5(const uint8_t * src, size_t srcStride, size_t width, size_t height, - size_t 
channelCount, uint8_t * dst, size_t dstStride) - { - int a[25]; - const uint8_t * y[5]; - size_t x[5]; - - size_t size = channelCount*width; - for (size_t row = 0; row < height; ++row, dst += dstStride) - { - y[0] = src + srcStride*(row - 2); - y[1] = y[0] + srcStride; - y[2] = y[1] + srcStride; - y[3] = y[2] + srcStride; - y[4] = y[3] + srcStride; - if (row < 2) - { - if (row < 1) - y[1] = y[2]; - y[0] = y[1]; - } - if (row >= height - 2) - { - if (row >= height - 1) - y[3] = y[2]; - y[4] = y[3]; - } - - for (size_t col = 0; col < 4 * channelCount; col++) - { - if (col < 2 * channelCount) - { - x[0] = col < channelCount ? col : col - channelCount; - x[1] = x[0]; - x[2] = col; - x[3] = x[2] + channelCount; - x[4] = x[3] + channelCount; - } - else - { - x[0] = size - 6 * channelCount + col; - x[1] = x[0] + channelCount; - x[2] = x[1] + channelCount; - x[3] = col < 3 * channelCount ? x[2] + channelCount : x[2]; - x[4] = x[3]; - } - - LoadSquare5x5(y, x, a); - PartialSort25(a); - dst[x[2]] = (uint8_t)a[12]; - } - - for (size_t col = 2 * channelCount; col < size - 2 * channelCount; ++col) - { - x[0] = col - 2 * channelCount; - x[1] = col - channelCount; - x[2] = col; - x[3] = col + channelCount; - x[4] = col + 2 * channelCount; - - LoadSquare5x5(y, x, a); - PartialSort25(a); - dst[col] = (uint8_t)a[12]; - } - } - } - } -} diff --git a/src/3rd/Simd/Simd/SimdBaseNeural.cpp b/src/3rd/Simd/Simd/SimdBaseNeural.cpp deleted file mode 100644 index e344ede3..00000000 --- a/src/3rd/Simd/Simd/SimdBaseNeural.cpp +++ /dev/null @@ -1,634 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2019 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdPow.h" - -namespace Simd -{ - namespace Base - { - void NeuralConvert(const uint8_t * src, size_t srcStride, size_t width, size_t height, float * dst, size_t dstStride, int inversion) - { - const float k = 1.0f / 255.0f; - for (size_t row = 0; row < height; ++row) - { - if (inversion) - { - for (size_t col = 0; col < width; ++col) - dst[col] = (255 - src[col])* k; - } - else - { - for (size_t col = 0; col < width; ++col) - dst[col] = src[col] * k; - } - src += srcStride; - dst += dstStride; - } - } - - SIMD_INLINE float ProductSum(const float * a, const float * b, size_t aligned, size_t full) - { - size_t i = 0; - float sums[4] = { 0, 0, 0, 0 }; - for (; i < aligned; i += 4) - { - sums[0] += a[i + 0] * b[i + 0]; - sums[1] += a[i + 1] * b[i + 1]; - sums[2] += a[i + 2] * b[i + 2]; - sums[3] += a[i + 3] * b[i + 3]; - } - for (; i < full; ++i) - sums[0] += a[i] * b[i]; - return sums[0] + sums[1] + sums[2] + sums[3]; - } - - void NeuralProductSum(const float * a, const float * b, size_t size, float * sum) - { - *sum = ProductSum(a, b, Simd::AlignLo(size, 4), size); - } - - SIMD_INLINE void AddMultiplied(const float * src, size_t aligned, size_t full, float value, float * dst) - { - size_t i = 0; - for (; i < aligned; i += 4) - { - dst[i + 0] += src[i + 0] * value; - dst[i + 1] += src[i + 1] * value; - dst[i + 2] += src[i + 2] * value; - dst[i + 3] += src[i + 3] * value; - } - for (; i < full; ++i) - dst[i] += src[i] * value; - } - - void NeuralAddVectorMultipliedByValue(const float * src, size_t size, const float * value, float * dst) - { - AddMultiplied(src, Simd::AlignLo(size, 4), size, *value, dst); - } - - void NeuralAddVector(const float * src, size_t size, float * dst) - { - size_t aligned = Simd::AlignLo(size, 4); - size_t i = 0; - for (; i < aligned; i += 4) - { - dst[i + 0] += src[i + 0]; - dst[i + 1] += src[i + 1]; - dst[i + 2] += src[i + 2]; - dst[i + 3] += src[i + 3]; - } - for (; i < size; ++i) - dst[i] += src[i]; - } - - void NeuralAddValue(const float * value, float * dst, size_t size) - { - const float val = value[0]; - size_t aligned = Simd::AlignLo(size, 4); - size_t i = 0; - for (; i < aligned; i += 4) - { - dst[i + 0] += val; - dst[i + 1] += val; - dst[i + 2] += val; - dst[i + 3] += val; - } - for (; i < size; ++i) - dst[i] += val; - } - - void NeuralRoughSigmoid(const float * src, size_t size, const float * slope, float * dst) - { - float s = slope[0]; - for (size_t i = 0; i < size; ++i) - dst[i] = RoughSigmoid(src[i] * s); - } - - void NeuralRoughSigmoid2(const float * src, size_t size, const float * slope, float * dst) - { - float s = slope[0]; - for (size_t i = 0; i < size; ++i) - dst[i] = RoughSigmoid2(src[i] * s); - } - - void NeuralDerivativeSigmoid(const float * src, size_t size, const float * slope, float * dst) - { - float s = slope[0]; - for (size_t i = 0; i < size; ++i) - dst[i] *= s*DerivativeSigmoid(src[i]); - } - - void NeuralRoughTanh(const float * src, size_t size, const float * slope, float * dst) - { - float s = slope[0]; - for (size_t i = 0; i < size; ++i) - dst[i] = RoughTanh(src[i] * s); - } - - void NeuralDerivativeTanh(const float * src, size_t size, const float * slope, float * dst) - { - float s = slope[0]; - for (size_t i = 0; i < size; ++i) - dst[i] *= s*DerivativeTanh(src[i]); - } - - void NeuralDerivativeRelu(const float * src, size_t size, const float * slope, float * dst) - { - float s = slope[0]; - for (size_t i = 0; i < size; ++i) - dst[i] *= src[i] > 0 ? 
1.0f : s; - } - - void NeuralPow(const float * src, size_t size, const float * exponent, float * dst) - { - float e = exponent[0]; - for (size_t i = 0; i < size; ++i) - dst[i] = Pow(src[i], e); - } - - void NeuralUpdateWeights(const float * x, size_t size, const float * a, const float * b, float * d, float * w) - { - float _a = a[0], _b = b[0]; - size_t alignedSize = Simd::AlignLo(size, 4); - size_t i = 0; - for (; i < alignedSize; i += 4) - { - UpdateWeights(x, i + 0, _a, _b, d, w); - UpdateWeights(x, i + 1, _a, _b, d, w); - UpdateWeights(x, i + 2, _a, _b, d, w); - UpdateWeights(x, i + 3, _a, _b, d, w); - } - for (; i < size; ++i) - UpdateWeights(x, i, _a, _b, d, w); - } - - void NeuralAdaptiveGradientUpdate(const float * delta, size_t size, size_t batch, const float * alpha, const float * epsilon, float * gradient, float * weight) - { - float norm = (float)(1.0 / batch), _alpha = alpha[0], _epsilon = epsilon[0]; - size_t alignedSize = Simd::AlignLo(size, 4); - size_t i = 0; - for (; i < alignedSize; i += 4) - { - AdaptiveGradientUpdate(delta, i + 0, norm, _alpha, _epsilon, gradient, weight); - AdaptiveGradientUpdate(delta, i + 1, norm, _alpha, _epsilon, gradient, weight); - AdaptiveGradientUpdate(delta, i + 2, norm, _alpha, _epsilon, gradient, weight); - AdaptiveGradientUpdate(delta, i + 3, norm, _alpha, _epsilon, gradient, weight); - } - for (; i < size; ++i) - AdaptiveGradientUpdate(delta, i, norm, _alpha, _epsilon, gradient, weight); - } - - SIMD_INLINE float Convolution2(const float * src, const float * weights) - { - return src[0] * weights[0] + src[1] * weights[1]; - } - - SIMD_INLINE float Convolution2x2Forward(const float * src, size_t stride, const float * weights) - { - return - Convolution2(src, weights) + - Convolution2(src + stride, weights + 2); - } - - void NeuralAddConvolution2x2Forward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride) - { - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < width; ++col) - dst[col] += Convolution2x2Forward(src + col, srcStride, weights); - src += srcStride; - dst += dstStride; - } - } - - SIMD_INLINE float Convolution3(const float * src, const float * weights) - { - return src[0] * weights[0] + src[1] * weights[1] + src[2] * weights[2]; - } - - SIMD_INLINE float Convolution3x3Forward(const float * src, size_t stride, const float * weights) - { - return - Convolution3(src, weights) + - Convolution3(src + stride, weights + 3) + - Convolution3(src + 2 * stride, weights + 6); - } - - void NeuralAddConvolution3x3Forward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride) - { - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < width; ++col) - dst[col] += Convolution3x3Forward(src + col, srcStride, weights); - src += srcStride; - dst += dstStride; - } - } - - SIMD_INLINE float Convolution4(const float * src, const float * weights) - { - return src[0] * weights[0] + src[1] * weights[1] + src[2] * weights[2] + src[3] * weights[3]; - } - - SIMD_INLINE float Convolution4x4Forward(const float * src, size_t stride, const float * weights) - { - return - Convolution4(src, weights) + - Convolution4(src + stride, weights + 4) + - Convolution4(src + 2 * stride, weights + 8) + - Convolution4(src + 3 * stride, weights + 12); - } - - void NeuralAddConvolution4x4Forward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, 
size_t dstStride) - { - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < width; ++col) - dst[col] += Convolution4x4Forward(src + col, srcStride, weights); - src += srcStride; - dst += dstStride; - } - } - - SIMD_INLINE float Convolution5(const float * src, const float * weights) - { - return src[0] * weights[0] + src[1] * weights[1] + src[2] * weights[2] + src[3] * weights[3] + src[4] * weights[4]; - } - - SIMD_INLINE float Convolution5x5Forward(const float * src, size_t stride, const float * weights) - { - return - Convolution5(src, weights) + - Convolution5(src + stride, weights + 5) + - Convolution5(src + 2 * stride, weights + 10) + - Convolution5(src + 3 * stride, weights + 15) + - Convolution5(src + 4 * stride, weights + 20); - } - - void NeuralAddConvolution5x5Forward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride) - { - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < width; ++col) - dst[col] += Convolution5x5Forward(src + col, srcStride, weights); - src += srcStride; - dst += dstStride; - } - } - - template <size_t coreX, size_t coreY> SIMD_INLINE void NeuralAddConvolutionBackward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride) - { - size_t aligned = Simd::AlignLo(width, 4); - for (size_t row = 0; row < height; ++row) - { - for (size_t dy = 0; dy < coreY; ++dy) - { - const float * w = weights + dy * coreX; - float * d = dst + dy*dstStride; - for (size_t dx = 0; dx < coreX; ++dx) - AddMultiplied(src, aligned, width, w[dx], d + dx); - } - src += srcStride; - dst += dstStride; - } - } - - void NeuralAddConvolution2x2Backward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride) - { - NeuralAddConvolutionBackward<2, 2>(src, srcStride, width, height, weights, dst, dstStride); - } - - void NeuralAddConvolution3x3Backward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride) - { - NeuralAddConvolutionBackward<3, 3>(src, srcStride, width, height, weights, dst, dstStride); - } - - void NeuralAddConvolution4x4Backward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride) - { - NeuralAddConvolutionBackward<4, 4>(src, srcStride, width, height, weights, dst, dstStride); - } - - void NeuralAddConvolution5x5Backward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride) - { - NeuralAddConvolutionBackward<5, 5>(src, srcStride, width, height, weights, dst, dstStride); - } - - template <size_t coreX, size_t coreY> SIMD_INLINE void NeuralAddConvolutionSum(const float * src, size_t srcStride, const float * dst, size_t dstStride, size_t width, size_t height, float * sums) - { - size_t aligned = Simd::AlignLo(width, 4); - for (size_t row = 0; row < height; ++row) - { - for (size_t dy = 0; dy < coreY; ++dy) - { - const float * s = src + dy*srcStride; - float * sum = sums + dy * coreX; - for (size_t dx = 0; dx < coreX; ++dx) - sum[dx] += ProductSum(s + dx, dst, aligned, width); - } - src += srcStride; - dst += dstStride; - } - } - - void NeuralAddConvolution2x2Sum(const float * src, size_t srcStride, const float * dst, size_t dstStride, size_t width, size_t height, float * sums) - { - NeuralAddConvolutionSum<2, 2>(src, srcStride, dst, dstStride, width, height, sums); - } - - void NeuralAddConvolution3x3Sum(const
float * src, size_t srcStride, const float * dst, size_t dstStride, size_t width, size_t height, float * sums) - { - NeuralAddConvolutionSum<3, 3>(src, srcStride, dst, dstStride, width, height, sums); - } - - void NeuralAddConvolution4x4Sum(const float * src, size_t srcStride, const float * dst, size_t dstStride, size_t width, size_t height, float * sums) - { - NeuralAddConvolutionSum<4, 4>(src, srcStride, dst, dstStride, width, height, sums); - } - - void NeuralAddConvolution5x5Sum(const float * src, size_t srcStride, const float * dst, size_t dstStride, size_t width, size_t height, float * sums) - { - NeuralAddConvolutionSum<5, 5>(src, srcStride, dst, dstStride, width, height, sums); - } - - SIMD_INLINE float Max2(const float * src) - { - return Simd::Max(src[0], src[1]); - } - - SIMD_INLINE float Max2x2(const float * src, size_t stride) - { - return Simd::Max(Max2(src), Max2(src + stride)); - } - - SIMD_INLINE float Max3(const float * src) - { - return Simd::Max(src[0], Simd::Max(src[1], src[2])); - } - - SIMD_INLINE float Max3x3(const float * src, size_t stride) - { - return Simd::Max(Max3(src), Simd::Max(Max3(src + stride), Max3(src + 2 * stride))); - } - - SIMD_INLINE float Max2x3(const float * src, size_t stride) - { - return Simd::Max(Max2(src), Simd::Max(Max2(src + stride), Max2(src + 2 * stride))); - } - - SIMD_INLINE float Max3x2(const float * src, size_t stride) - { - return Simd::Max(Max3(src), Max3(src + stride)); - } - - void NeuralPooling1x1Max3x3(const float * src, size_t srcStride, size_t width, size_t height, float * dst, size_t dstStride) - { - height -= 1; - width -= 1; - src -= 1; - - dst[0] = Max2x2(src + 1, srcStride); - for (size_t col = 1; col < width; ++col) - dst[col] = Max3x2(src + col, srcStride); - dst[width] = Max2x2(src + width, srcStride); - dst += dstStride; - - for (size_t row = 1; row < height; ++row) - { - dst[0] = Max2x3(src + 1, srcStride); - for (size_t col = 1; col < width; ++col) - dst[col] = Max3x3(src + col, srcStride); - dst[width] = Max2x3(src + width, srcStride); - src += srcStride; - dst += dstStride; - } - - dst[0] = Max2x2(src + 1, srcStride); - for (size_t col = 1; col < width; ++col) - dst[col] = Max3x2(src + col, srcStride); - dst[width] = Max2x2(src + width, srcStride); - } - - void NeuralPooling2x2Max2x2(const float * src, size_t srcStride, size_t width, size_t height, float * dst, size_t dstStride) - { - size_t heightEven = Simd::AlignLo(height, 2); - size_t widthEven = Simd::AlignLo(width, 2); - for (size_t row = 0; row < heightEven; row += 2) - { - for (size_t col = 0; col < widthEven; col += 2) - dst[col >> 1] = Max2x2(src + col, srcStride); - if (width - widthEven) - dst[widthEven >> 1] = Simd::Max(src[widthEven], src[widthEven + srcStride]); - src += 2 * srcStride; - dst += dstStride; - } - if (height - heightEven) - { - for (size_t col = 0; col < widthEven; col += 2) - dst[col >> 1] = Simd::Max(src[col], src[col + 1]); - if (width - widthEven) - dst[widthEven >> 1] = src[widthEven]; - } - } - - void NeuralPooling2x2Max3x3(const float * src, size_t srcStride, size_t width, size_t height, float * dst, size_t dstStride) - { - height -= 1; - width -= 1; - size_t heightEven = Simd::AlignLo(height, 2); - size_t widthEven = Simd::AlignLo(width, 2); - for (size_t row = 0; row < heightEven; row += 2) - { - for (size_t col = 0; col < widthEven; col += 2) - dst[col >> 1] = Max3x3(src + col, srcStride); - if (width - widthEven) - dst[widthEven >> 1] = Max2x3(src + widthEven, srcStride); - src += 2 * srcStride; - dst += dstStride; - } - if 
(height - heightEven) - { - for (size_t col = 0; col < widthEven; col += 2) - dst[col >> 1] = Max3x2(src + col, srcStride); - if (width - widthEven) - dst[widthEven >> 1] = Max2x2(src + widthEven, srcStride); - } - } - - SIMD_INLINE bool NeuralConvolutionForwardValid(ptrdiff_t a, ptrdiff_t b) - { - return size_t(a) < size_t(b); - } - - void NeuralConvolutionForwardConvertN(const float * src, ptrdiff_t srcWidth, ptrdiff_t srcHeight, ptrdiff_t srcDepth, ptrdiff_t kernelX, ptrdiff_t kernelY, - ptrdiff_t padX, ptrdiff_t padY, ptrdiff_t strideX, ptrdiff_t strideY, ptrdiff_t dilationX, ptrdiff_t dilationY, float * dst) - { - const ptrdiff_t dstHeight = (srcHeight + 2 * padY - (dilationY * (kernelY - 1) + 1)) / strideY + 1; - const ptrdiff_t dstWidth = (srcWidth + 2 * padX - (dilationX * (kernelX - 1) + 1)) / strideX + 1; - const ptrdiff_t channelSize = srcHeight * srcWidth; - for (ptrdiff_t channel = 0; channel < srcDepth; ++channel, src += channelSize) - { - for (ptrdiff_t kernelRow = 0; kernelRow < kernelY; ++kernelRow) - { - for (ptrdiff_t kernelCol = 0; kernelCol < kernelX; ++kernelCol) - { - ptrdiff_t srcRow = kernelRow*dilationY - padY; - for (ptrdiff_t dstRow = 0; dstRow < dstHeight; ++dstRow) - { - if (!NeuralConvolutionForwardValid(srcRow, srcHeight)) - { - for (ptrdiff_t dstCol = 0; dstCol < dstWidth; ++dstCol) - *(dst++) = 0; - } - else - { - ptrdiff_t srcCol = kernelCol*dilationX - padX; - for (ptrdiff_t dstCol = 0; dstCol < dstWidth; ++dstCol) - { - if (NeuralConvolutionForwardValid(srcCol, srcWidth)) - *(dst++) = src[srcRow*srcWidth + srcCol]; - else - *(dst++) = 0; - srcCol += strideX; - } - } - srcRow += strideY; - } - } - } - } - } - - void NeuralConvolutionForwardConvertT(const float * src, ptrdiff_t srcWidth, ptrdiff_t srcHeight, ptrdiff_t srcDepth, ptrdiff_t kernelX, ptrdiff_t kernelY, - ptrdiff_t padX, ptrdiff_t padY, ptrdiff_t strideX, ptrdiff_t strideY, ptrdiff_t dilationX, ptrdiff_t dilationY, float * dst) - { - const ptrdiff_t dstHeight = (srcHeight + 2 * padY - (dilationY * (kernelY - 1) + 1)) / strideY + 1; - const ptrdiff_t dstWidth = (srcWidth + 2 * padX - (dilationX * (kernelX - 1) + 1)) / strideX + 1; - for (ptrdiff_t dstRow = 0; dstRow < dstHeight; ++dstRow) - { - ptrdiff_t srcRow0 = dstRow*strideY - padY; - for (ptrdiff_t dstCol = 0; dstCol < dstWidth; ++dstCol) - { - ptrdiff_t srcCol0 = dstCol*strideX - padX; - for (ptrdiff_t channel = 0; channel < srcDepth; ++channel) - { - ptrdiff_t dstChannelOffset = ((dstRow*dstWidth + dstCol)*srcDepth + channel)*kernelY*kernelX; - for (ptrdiff_t kernelRow = 0; kernelRow < kernelY; ++kernelRow) - { - ptrdiff_t srcRow = srcRow0 + kernelRow*dilationY; - for (ptrdiff_t kernelCol = 0; kernelCol < kernelX; ++kernelCol) - { - ptrdiff_t srcCol = srcCol0 + kernelCol*dilationX; - ptrdiff_t dstOffset = dstChannelOffset + kernelRow*kernelX + kernelCol; - if (NeuralConvolutionForwardValid(srcRow, srcHeight) && NeuralConvolutionForwardValid(srcCol, srcWidth)) - dst[dstOffset] = src[(channel*srcHeight + srcRow)*srcWidth + srcCol]; - else - dst[dstOffset] = 0; - } - } - } - } - } - } - - void NeuralConvolutionForwardGemmNN(size_t M, size_t N, size_t K, const float * a, const float * b, float * c) - { - for (size_t i = 0; i < M; ++i) - { - for (size_t k = 0; k < K; ++k) - { - float va = a[i*K + k]; - const float * pb = b + k*N; - float * pc = c + i*N; - for (size_t j = 0; j < N; ++j) - pc[j] += va*pb[j]; - } - } - } - - void NeuralConvolutionForwardGemmNT(size_t M, size_t N, size_t K, const float * a, const float * b, float * c) - { - for 
(size_t i = 0; i < M; ++i) - { - for (size_t j = 0; j < N; ++j) - { - float s = 0; - const float * pa = a + i*K; - const float * pb = b + j*K; - for (size_t k = 0; k < K; ++k) - s += pa[k] * pb[k]; - c[i*N + j] += s; - } - } - } - - void NeuralConvolutionForward(const float * src, size_t srcWidth, size_t srcHeight, size_t srcDepth, - const float * weight, size_t kernelX, size_t kernelY, size_t padX, size_t padY, size_t strideX, size_t strideY, size_t dilationX, size_t dilationY, - void * buffer, size_t * size, float * dst, size_t dstWidth, size_t dstHeight, size_t dstDepth, int add) - { - assert(dstWidth == (srcWidth + 2 * padX - (dilationX * (kernelX - 1) + 1)) / strideX + 1); - assert(dstHeight == (srcHeight + 2 * padY - (dilationY * (kernelY - 1) + 1)) / strideY + 1); - - if (!add) - memset(dst, 0, dstWidth*dstHeight*dstDepth * sizeof(float)); - - float * temporal = NULL; - void * internal = NULL; - - bool transpose = dstWidth*dstHeight <= 1024;// && srcDepth > 128; - - if (kernelX == 1 && kernelY == 1 && !transpose) - temporal = (float*)src; - else - { - size_t required = dstWidth*dstHeight*srcDepth*kernelX*kernelY * sizeof(float); - if (buffer != AlignHi(buffer, SIMD_ALIGN)) - required += SIMD_ALIGN; - if (buffer == NULL || size == NULL || *size < required) - { - internal = Allocate(required); - if (size) - *size = required; - temporal = (float*)internal; - } - else - temporal = (float*)AlignHi(buffer, SIMD_ALIGN); - - if (transpose) - NeuralConvolutionForwardConvertT(src, srcWidth, srcHeight, srcDepth, kernelX, kernelY, padX, padY, strideX, strideY, dilationX, dilationY, temporal); - else - NeuralConvolutionForwardConvertN(src, srcWidth, srcHeight, srcDepth, kernelX, kernelY, padX, padY, strideX, strideY, dilationX, dilationY, temporal); - } - - size_t M = dstDepth, N = dstHeight*dstWidth, K = kernelX*kernelY*srcDepth; - if (transpose) - NeuralConvolutionForwardGemmNT(M, N, K, weight, temporal, dst); - else - NeuralConvolutionForwardGemmNN(M, N, K, weight, temporal, dst); - - if (internal) - Free(internal); - } - } -} diff --git a/src/3rd/Simd/Simd/SimdBaseOperation.cpp b/src/3rd/Simd/Simd/SimdBaseOperation.cpp deleted file mode 100644 index c76d9e6b..00000000 --- a/src/3rd/Simd/Simd/SimdBaseOperation.cpp +++ /dev/null @@ -1,159 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdMath.h" - -namespace Simd -{ - namespace Base - { - template SIMD_INLINE uint8_t OperationBinary8u(const uint8_t & a, const uint8_t & b); - - template <> SIMD_INLINE uint8_t OperationBinary8u(const uint8_t & a, const uint8_t & b) - { - return Average(a, b); - } - - template <> SIMD_INLINE uint8_t OperationBinary8u(const uint8_t & a, const uint8_t & b) - { - return a & b; - } - - template <> SIMD_INLINE uint8_t OperationBinary8u(const uint8_t & a, const uint8_t & b) - { - return a | b; - } - - template <> SIMD_INLINE uint8_t OperationBinary8u(const uint8_t & a, const uint8_t & b) - { - return MaxU8(a, b); - } - - template <> SIMD_INLINE uint8_t OperationBinary8u(const uint8_t & a, const uint8_t & b) - { - return MinU8(a, b); - } - - template <> SIMD_INLINE uint8_t OperationBinary8u(const uint8_t & a, const uint8_t & b) - { - return SaturatedSubtractionU8(a, b); - } - - template <> SIMD_INLINE uint8_t OperationBinary8u(const uint8_t & a, const uint8_t & b) - { - return MinU8((int)a + (int)b, 255); - } - - template void OperationBinary8u(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, - size_t width, size_t height, size_t channelCount, uint8_t * dst, size_t dstStride) - { - size_t size = width*channelCount; - for (size_t row = 0; row < height; ++row) - { - for (size_t offset = 0; offset < size; ++offset) - dst[offset] = OperationBinary8u(a[offset], b[offset]); - a += aStride; - b += bStride; - dst += dstStride; - } - } - - void OperationBinary8u(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, - size_t width, size_t height, size_t channelCount, uint8_t * dst, size_t dstStride, SimdOperationBinary8uType type) - { - switch (type) - { - case SimdOperationBinary8uAverage: - return OperationBinary8u(a, aStride, b, bStride, width, height, channelCount, dst, dstStride); - case SimdOperationBinary8uAnd: - return OperationBinary8u(a, aStride, b, bStride, width, height, channelCount, dst, dstStride); - case SimdOperationBinary8uOr: - return OperationBinary8u(a, aStride, b, bStride, width, height, channelCount, dst, dstStride); - case SimdOperationBinary8uMaximum: - return OperationBinary8u(a, aStride, b, bStride, width, height, channelCount, dst, dstStride); - case SimdOperationBinary8uMinimum: - return OperationBinary8u(a, aStride, b, bStride, width, height, channelCount, dst, dstStride); - case SimdOperationBinary8uSaturatedSubtraction: - return OperationBinary8u(a, aStride, b, bStride, width, height, channelCount, dst, dstStride); - case SimdOperationBinary8uSaturatedAddition: - return OperationBinary8u(a, aStride, b, bStride, width, height, channelCount, dst, dstStride); - default: - assert(0); - } - } - - template SIMD_INLINE int16_t OperationBinary16i(const int16_t & a, const int16_t & b); - - template <> SIMD_INLINE int16_t OperationBinary16i(const int16_t & a, const int16_t & b) - { - return a + b; - } - - template <> SIMD_INLINE int16_t OperationBinary16i(const int16_t & a, const int16_t & b) - { - return a - b; - } - - template void OperationBinary16i(const int16_t * a, size_t aStride, const int16_t * b, size_t bStride, - size_t width, size_t height, int16_t * dst, size_t dstStride) - { - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < width; ++col) - dst[col] = OperationBinary16i(a[col], b[col]); - a += aStride; - b += bStride; - dst += dstStride; - } - } - - void OperationBinary16i(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, - size_t width, size_t height, uint8_t * dst, 
size_t dstStride, SimdOperationBinary16iType type) - { - assert(aStride % sizeof(int16_t) == 0 && bStride % sizeof(int16_t) == 0 && dstStride % sizeof(int16_t) == 0); - - switch (type) - { - case SimdOperationBinary16iAddition: - return OperationBinary16i( - (const int16_t*)a, aStride / sizeof(int16_t), (const int16_t*)b, bStride / sizeof(int16_t), width, height, (int16_t*)dst, dstStride / sizeof(int16_t)); - case SimdOperationBinary16iSubtraction: - return OperationBinary16i( - (const int16_t*)a, aStride / sizeof(int16_t), (const int16_t*)b, bStride / sizeof(int16_t), width, height, (int16_t*)dst, dstStride / sizeof(int16_t)); - default: - assert(0); - } - } - - void VectorProduct(const uint8_t * vertical, const uint8_t * horizontal, uint8_t * dst, size_t stride, size_t width, size_t height) - { - for (size_t row = 0; row < height; ++row) - { - int _vertical = vertical[row]; - for (size_t col = 0; col < width; ++col) - dst[col] = DivideBy255(_vertical * horizontal[col]); - dst += stride; - } - } - } -} diff --git a/src/3rd/Simd/Simd/SimdBasePerformance.cpp b/src/3rd/Simd/Simd/SimdBasePerformance.cpp deleted file mode 100644 index fcf07905..00000000 --- a/src/3rd/Simd/Simd/SimdBasePerformance.cpp +++ /dev/null @@ -1,155 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2019 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdPerformance.h" - -#if defined(SIMD_PERFORMANCE_STATISTIC) && defined(NDEBUG) -namespace Simd -{ - namespace Base - { - - SIMD_INLINE double Miliseconds(int64_t count) - { - return double(count) / double(TimeFrequency()) * 1000.0; - } - - PerformanceMeasurer::PerformanceMeasurer(const String& name, int64_t flop) - : _name(name) - , _flop(flop) - , _count(0) - , _current(0) - , _total(0) - , _min(std::numeric_limits::max()) - , _max(std::numeric_limits::min()) - , _entered(false) - , _paused(false) - { - } - - PerformanceMeasurer::PerformanceMeasurer(const PerformanceMeasurer & pm) - : _name(pm._name) - , _flop(pm._flop) - , _count(pm._count) - , _start(pm._start) - , _current(pm._current) - , _total(pm._total) - , _min(pm._min) - , _max(pm._max) - , _entered(pm._entered) - , _paused(pm._paused) - { - } - - void PerformanceMeasurer::Enter() - { - if (!_entered) - { - _entered = true; - _paused = false; - _start = TimeCounter(); - } - } - - void PerformanceMeasurer::Leave(bool pause) - { - if (_entered || _paused) - { - if (_entered) - { - _entered = false; - _current += TimeCounter() - _start; - } - if (!pause) - { - _total += _current; - _min = std::min(_min, _current); - _max = std::max(_max, _current); - ++_count; - _current = 0; - } - _paused = pause; - } - } - - String PerformanceMeasurer::Statistic() const - { - std::stringstream ss; - ss << _name << ": "; - ss << std::setprecision(0) << std::fixed << Miliseconds(_total) << " ms"; - ss << " / " << _count << " = "; - ss << std::setprecision(3) << std::fixed << Average() << " ms"; - ss << std::setprecision(3) << " {min=" << Miliseconds(_min) << "; max=" << Miliseconds(_max) << "}"; - if (_flop) - ss << " " << std::setprecision(1) << GFlops() << " GFlops"; - return ss.str(); - } - - void PerformanceMeasurer::Combine(const PerformanceMeasurer& other) - { - _count += other._count; - _total += other._total; - _min = std::min(_min, other._min); - _max = std::max(_max, other._max); - } - - double PerformanceMeasurer::Average() const - { - return _count ? (Miliseconds(_total) / _count) : 0; - } - - double PerformanceMeasurer::GFlops() const - { - return _count && _flop && _total > 0 ? 
(double(_flop) * _count / Miliseconds(_total) / 1000000.0) : 0; - } - - //--------------------------------------------------------------------- - - PerformanceMeasurerStorage PerformanceMeasurerStorage::s_storage; - - const char * PerformanceMeasurerStorage::PerformanceStatistic() - { - if (_map.empty()) - return ""; - FunctionMap combined; - std::lock_guard<std::mutex> lock(_mutex); - for (ThreadMap::const_iterator thread = _map.begin(); thread != _map.end(); ++thread) - { - for (FunctionMap::const_iterator function = thread->second.begin(); function != thread->second.end(); ++function) - { - if (combined.find(function->first) == combined.end()) - combined[function->first].reset(new PerformanceMeasurer(*function->second)); - else - combined[function->first]->Combine(*function->second); - } - } - std::stringstream report; - report << std::endl << "Simd Library Internal Performance Statistics:" << std::endl; - for (FunctionMap::const_iterator it = combined.begin(); it != combined.end(); ++it) - report << it->second->Statistic() << std::endl; - _report = report.str(); - return _report.c_str(); - } - } -} -#endif//SIMD_PERFORMANCE_STATISTIC diff --git a/src/3rd/Simd/Simd/SimdBaseReduce.cpp b/src/3rd/Simd/Simd/SimdBaseReduce.cpp deleted file mode 100644 index bbb8ea1b..00000000 --- a/src/3rd/Simd/Simd/SimdBaseReduce.cpp +++ /dev/null @@ -1,98 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2018 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE.
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdMath.h" - -namespace Simd -{ - namespace Base - { - template void ReduceColor2x2(const uint8_t * src0, const uint8_t * src1, uint8_t * dst); - - template <> void ReduceColor2x2<1>(const uint8_t * src0, const uint8_t * src1, uint8_t * dst) - { - dst[0] = Average(src0[0], src0[1], src1[0], src1[1]); - } - - template <> void ReduceColor2x2<2>(const uint8_t * src0, const uint8_t * src1, uint8_t * dst) - { - dst[0] = Average(src0[0], src0[2], src1[0], src1[2]); - dst[1] = Average(src0[1], src0[3], src1[1], src1[3]); - } - - template <> void ReduceColor2x2<3>(const uint8_t * src0, const uint8_t * src1, uint8_t * dst) - { - dst[0] = Average(src0[0], src0[3], src1[0], src1[3]); - dst[1] = Average(src0[1], src0[4], src1[1], src1[4]); - dst[2] = Average(src0[2], src0[5], src1[2], src1[5]); - } - - - template <> void ReduceColor2x2<4>(const uint8_t * src0, const uint8_t * src1, uint8_t * dst) - { - dst[0] = Average(src0[0], src0[4], src1[0], src1[4]); - dst[1] = Average(src0[1], src0[5], src1[1], src1[5]); - dst[2] = Average(src0[2], src0[6], src1[2], src1[6]); - dst[3] = Average(src0[3], src0[7], src1[3], src1[7]); - } - - template void ReduceColor2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, uint8_t * dst, size_t dstStride) - { - size_t evenWidth = AlignLo(srcWidth, 2); - size_t evenSize = evenWidth * channelCount; - size_t srcStep = 2 * channelCount; - for (size_t srcRow = 0; srcRow < srcHeight; srcRow += 2) - { - const uint8_t * s0 = src; - const uint8_t * s1 = (srcRow == srcHeight - 1 ? src : src + srcStride); - const uint8_t * end = src + evenSize; - uint8_t * d = dst; - for (; s0 < end; s0 += srcStep, s1 += srcStep, d += channelCount) - ReduceColor2x2(s0, s1, d); - if (evenWidth != srcWidth) - { - for(size_t c = 0; c < channelCount; ++c) - d[c] = Average(s0[c], s1[c]); - } - src += 2 * srcStride; - dst += dstStride; - } - } - - void ReduceColor2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount) - { - assert((srcWidth + 1) / 2 == dstWidth && (srcHeight + 1) / 2 == dstHeight); - - switch (channelCount) - { - case 1: ReduceColor2x2<1>(src, srcWidth, srcHeight, srcStride, dst, dstStride); break; - case 2: ReduceColor2x2<2>(src, srcWidth, srcHeight, srcStride, dst, dstStride); break; - case 3: ReduceColor2x2<3>(src, srcWidth, srcHeight, srcStride, dst, dstStride); break; - case 4: ReduceColor2x2<4>(src, srcWidth, srcHeight, srcStride, dst, dstStride); break; - default: assert(0); - } - } - } -} diff --git a/src/3rd/Simd/Simd/SimdBaseReduceGray2x2.cpp b/src/3rd/Simd/Simd/SimdBaseReduceGray2x2.cpp deleted file mode 100644 index cd49c91f..00000000 --- a/src/3rd/Simd/Simd/SimdBaseReduceGray2x2.cpp +++ /dev/null @@ -1,56 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdMath.h" - -namespace Simd -{ - namespace Base - { - void ReduceGray2x2(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride) - { - assert((srcWidth + 1) / 2 == dstWidth && (srcHeight + 1) / 2 == dstHeight); - - size_t evenWidth = AlignLo(srcWidth, 2); - for (size_t srcRow = 0; srcRow < srcHeight; srcRow += 2) - { - const uint8_t *s0 = src; - const uint8_t *s1 = (srcRow == srcHeight - 1 ? src : src + srcStride); - const uint8_t *end = src + evenWidth; - uint8_t *d = dst; - for (; s0 < end; s0 += 2, s1 += 2, d += 1) - { - d[0] = Average(s0[0], s0[1], s1[0], s1[1]); - } - if (evenWidth != srcWidth) - { - d[0] = Average(s0[0], s1[0]); - } - src += 2 * srcStride; - dst += dstStride; - } - } - } -} diff --git a/src/3rd/Simd/Simd/SimdBaseReduceGray3x3.cpp b/src/3rd/Simd/Simd/SimdBaseReduceGray3x3.cpp deleted file mode 100644 index 87aad7b3..00000000 --- a/src/3rd/Simd/Simd/SimdBaseReduceGray3x3.cpp +++ /dev/null @@ -1,67 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdMath.h" - -namespace Simd -{ - namespace Base - { - template void ReduceGray3x3(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride) - { - assert((srcWidth + 1) / 2 == dstWidth && (srcHeight + 1) / 2 == dstHeight); - - for (size_t col = 0; col < srcHeight; col += 2, dst += dstStride) - { - const uint8_t *src0 = src + srcStride*(col - 1); - const uint8_t *src1 = src0 + srcStride; - const uint8_t *src2 = src1 + srcStride; - if (col == 0) - src0 = src1; - if (col == srcHeight - 1) - src2 = src1; - - uint8_t *pDst = dst; - size_t row; - - *pDst++ = GaussianBlur3x3(src0, src1, src2, 0, 0, 1); - - for (row = 2; row < srcWidth - 1; row += 2) - *pDst++ = GaussianBlur3x3(src0, src1, src2, row - 1, row, row + 1); - - if (row == srcWidth - 1) - *pDst++ = GaussianBlur3x3(src0, src1, src2, srcWidth - 2, srcWidth - 1, srcWidth - 1); - } - } - - void ReduceGray3x3(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride, int compensation) - { - if (compensation) - ReduceGray3x3(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); - else - ReduceGray3x3(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); - } - } -} diff --git a/src/3rd/Simd/Simd/SimdBaseReduceGray4x4.cpp b/src/3rd/Simd/Simd/SimdBaseReduceGray4x4.cpp deleted file mode 100644 index 143d6c3b..00000000 --- a/src/3rd/Simd/Simd/SimdBaseReduceGray4x4.cpp +++ /dev/null @@ -1,110 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdMemory.h" - -namespace Simd -{ - namespace Base - { - namespace - { - struct Buffer - { - Buffer(size_t width) - { - _p = Allocate(sizeof(int) * 2 * width); - src0 = (int*)_p; - src1 = src0 + width; - } - - ~Buffer() - { - Free(_p); - } - - int * src0; - int * src1; - private: - void *_p; - }; - } - - SIMD_INLINE int DivideBy64(int value) - { - return (value + 32) >> 6; - } - - SIMD_INLINE int GaussianBlur(const uint8_t *src, size_t x0, size_t x1, size_t x2, size_t x3) - { - return src[x0] + 3 * (src[x1] + src[x2]) + src[x3]; - } - - SIMD_INLINE void ProcessFirstRow(const uint8_t *src, size_t x0, size_t x1, size_t x2, size_t x3, Buffer & buffer, size_t offset) - { - int tmp = GaussianBlur(src, x0, x1, x2, x3); - buffer.src0[offset] = tmp; - buffer.src1[offset] = tmp; - } - - SIMD_INLINE void ProcessMainRow(const uint8_t *s2, const uint8_t *s3, size_t x0, size_t x1, size_t x2, size_t x3, Buffer & buffer, uint8_t* dst, size_t offset) - { - int tmp2 = GaussianBlur(s2, x0, x1, x2, x3); - int tmp3 = GaussianBlur(s3, x0, x1, x2, x3); - dst[offset] = DivideBy64(buffer.src0[offset] + 3 * (buffer.src1[offset] + tmp2) + tmp3); - buffer.src0[offset] = tmp2; - buffer.src1[offset] = tmp3; - } - - void ReduceGray4x4(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride) - { - assert((srcWidth + 1) / 2 == dstWidth && (srcHeight + 1) / 2 == dstHeight && srcWidth > 2); - - Buffer buffer(dstWidth); - - ProcessFirstRow(src, 0, 0, 1, 2, buffer, 0); - size_t srcCol = 2, dstCol = 1; - for (; srcCol < srcWidth - 2; srcCol += 2, dstCol++) - ProcessFirstRow(src, srcCol - 1, srcCol, srcCol + 1, srcCol + 2, buffer, dstCol); - ProcessFirstRow(src, srcCol - 1, srcCol, srcWidth - 1, srcWidth - 1, buffer, dstCol); - - for (size_t row = 0; row < srcHeight; row += 2, dst += dstStride) - { - const uint8_t *src2 = src + srcStride*(row + 1); - const uint8_t *src3 = src2 + srcStride; - if (row >= srcHeight - 2) - { - src2 = src + srcStride*(srcHeight - 1); - src3 = src2; - } - - ProcessMainRow(src2, src3, 0, 0, 1, 2, buffer, dst, 0); - size_t srcCol = 2, dstCol = 1; - for (; srcCol < srcWidth - 2; srcCol += 2, dstCol++) - ProcessMainRow(src2, src3, srcCol - 1, srcCol, srcCol + 1, srcCol + 2, buffer, dst, dstCol); - ProcessMainRow(src2, src3, srcCol - 1, srcCol, srcWidth - 1, srcWidth - 1, buffer, dst, dstCol); - } - } - } -} diff --git a/src/3rd/Simd/Simd/SimdBaseReduceGray5x5.cpp b/src/3rd/Simd/Simd/SimdBaseReduceGray5x5.cpp deleted file mode 100644 index a95c56d5..00000000 --- a/src/3rd/Simd/Simd/SimdBaseReduceGray5x5.cpp +++ /dev/null @@ -1,351 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2018 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" - -namespace Simd -{ - namespace Base - { - namespace - { - struct Buffer - { - Buffer(size_t width) - { - _p = Allocate(sizeof(uint16_t) * 3 * width); - isc0 = (uint16_t*)_p; - isc1 = isc0 + width; - iscp = isc1 + width; - } - - ~Buffer() - { - Free(_p); - } - - uint16_t * isc0; - uint16_t * isc1; - uint16_t * iscp; - private: - void *_p; - }; - } - - - /************************************************************************************************** - * The Burt & Adelson Reduce operation. This function use 2-D version of algorithm; - * - * Reference: - * Frederick M. Waltz and John W.V. Miller. An efficient algorithm for Gaussian blur using - * finite-state machines. - * SPIE Conf. on Machine Vision Systems for Inspection and Metrology VII. November 1998. - * - * - * 2-D explanation: - * - * src image pixels: A B C D E dst image pixels: a b c - * F G H I J - * K L M N O d e f - * P Q R S T - * U V W X Y g h i - * - * Algorithm visits all src image pixels from left to right and top to bottom. - * When visiting src pixel Y, the value of e will be written to the dst image. - * - * State variables before visiting Y: - * sr0 = W - * sr1 = U + 4V - * srp = 4X - * sc0[2] = K + 4L + 6M + 4N + O - * sc1[2] = (A + 4B + 6C + 4D + E) + 4*(F + 4G + 6H + 4I + J) - * scp[2] = 4*(P + 4Q + 6R + 4S + T) - * - * State variables after visiting Y: - * sr0 = Y - * sr1 = W + 4X - * srp = 4X - * sc0[2] = U + 4V + 6W + 4X + Y - * sc1[2] = (K + 4L + 6M + 4N + O) + 4*(P + 4Q + 6R + 4S + T) - * scp[2] = 4*(P + 4Q + 6R + 4S + T) - * e = 1 * (A + 4B + 6C + 4D + E) - * + 4 * (F + 4G + 6H + 4I + J) - * + 6 * (K + 4L + 6M + 4N + O) - * + 4 * (P + 4Q + 6R + 4S + T) - * + 1 * (U + 4V + 6W + 4X + Y) - * - * Updates when visiting (even x, even y) source pixel: - * (all updates occur in parallel) - * sr0 <= current - * sr1 <= sr0 + srp - * sc0[x] <= sr1 + 6*sr0 + srp + current - * sc1[x] <= sc0[x] + scp[x] - * dst(-1,-1) <= sc1[x] + 6*sc0[x] + scp + (new sc0[x]) - * - * Updates when visiting (odd x, even y) source pixel: - * srp <= 4*current - * - * Updates when visiting (even x, odd y) source pixel: - * sr0 <= current - * sr1 <= sr0 + srp - * scp[x] <= 4*(sr1 + 6*sr0 + srp + current) - * - * Updates when visting (odd x, odd y) source pixel: - * srp <= 4*current - **************************************************************************************************/ - template SIMD_INLINE int DivideBy256(int value); - - template <> SIMD_INLINE int DivideBy256(int value) - { - return (value + 128) >> 8; - } - - template <> SIMD_INLINE int DivideBy256(int value) - { - return value >> 8; - } - - template void ReduceGray5x5(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride) - { - assert((srcWidth + 1) / 2 == dstWidth && (srcHeight + 1) / 2 == dstHeight); - - Buffer buffer(dstWidth + 1); - - unsigned short isr0, isr1, isrp; - - const short zeroPixel = 0; - - uint8_t * dy = dst; - uint8_t * dx = dy; - const uint8_t * sy = src; - const 
uint8_t * sx = sy; - - bool evenY = true; - bool evenX = true; - size_t srcy = 0; - size_t srcx = 0; - size_t dstx = 0; - - // First row - { - isr0 = *sy; - isr1 = zeroPixel; - isrp = (unsigned short)(*sy) * 4; - - // Main pixels in first row - for (sx = sy, evenX = true, srcx = 0, dstx = 0; srcx < srcWidth; ++srcx, ++sx) - { - unsigned short icurrent(*sx); - - if (evenX) - { - buffer.isc0[dstx] = isr1 + 6 * isr0 + isrp + icurrent; - buffer.isc1[dstx] = 5 * buffer.isc0[dstx]; - isr1 = isr0 + isrp; - isr0 = icurrent; - } - else - { - isrp = icurrent * 4; - ++dstx; - } - evenX = !evenX; - } - - // Last entries in first row - if (!evenX) - { - // previous srcx was even - ++dstx; - buffer.isc0[dstx] = isr1 + 11 * isr0; - buffer.isc1[dstx] = 5 * buffer.isc0[dstx]; - } - else - { - // previous srcx was odd - buffer.isc0[dstx] = isr1 + 6 * isr0 + isrp + (isrp >> 2); - buffer.isc1[dstx] = 5 * buffer.isc0[dstx]; - } - } - sy += srcStride; - - // Main Rows - { - for (evenY = false, srcy = 1; srcy < srcHeight; ++srcy, sy += srcStride) - { - isr0 = (unsigned short)(*sy); - isr1 = zeroPixel; - isrp = (unsigned short)(*sy) * 4; - - if (evenY) - { - // Even-numbered row - // First entry in row - sx = sy; - isr1 = isr0 + isrp; - isr0 = (unsigned short)(*sx); - ++sx; - dx = dy; - - unsigned short * p_isc0 = buffer.isc0; - unsigned short * p_isc1 = buffer.isc1; - unsigned short * p_iscp = buffer.iscp; - - // Main entries in row - for (evenX = false, srcx = 1, dstx = 0; srcx < (srcWidth - 1); srcx += 2, ++sx) - { - p_isc0++; - p_isc1++; - p_iscp++; - - unsigned short icurrent = (unsigned short)(*sx); - - isrp = icurrent * 4; - icurrent = (unsigned short)(*(++sx)); - - unsigned short ip; - ip = *p_isc1 + 6 * (*p_isc0) + *p_iscp; - *p_isc1 = *p_isc0 + *p_iscp; - *p_isc0 = isr1 + 6 * isr0 + isrp + icurrent; - isr1 = isr0 + isrp; - isr0 = icurrent; - ip = ip + *p_isc0; - *dx = DivideBy256(ip); - ++dx; - } - dstx += p_isc0 - buffer.isc0; - - //doing the last operation due to even number of operations in previous cycle - if (!(srcWidth & 1)) - { - unsigned short icurrent = (unsigned short)(*sx); - isrp = icurrent * 4; - ++dstx; - evenX = !evenX; - ++sx; - } - - // Last entries in row - if (!evenX) - { - // previous srcx was even - ++dstx; - - unsigned short ip; - ip = buffer.isc1[dstx] + 6 * buffer.isc0[dstx] + buffer.iscp[dstx]; - buffer.isc1[dstx] = buffer.isc0[dstx] + buffer.iscp[dstx]; - buffer.isc0[dstx] = isr1 + 11 * isr0; - ip = ip + buffer.isc0[dstx]; - *dx = DivideBy256(ip); - } - else - { - // Previous srcx was odd - unsigned short ip; - ip = buffer.isc1[dstx] + 6 * buffer.isc0[dstx] + buffer.iscp[dstx]; - buffer.isc1[dstx] = buffer.isc0[dstx] + buffer.iscp[dstx]; - buffer.isc0[dstx] = isr1 + 6 * isr0 + isrp + (isrp >> 2); - ip = ip + buffer.isc0[dstx]; - *dx = DivideBy256(ip); - } - - dy += dstStride; - } - else - { - // First entry in odd-numbered row - sx = sy; - isr1 = isr0 + isrp; - isr0 = (unsigned short)(*sx); - ++sx; - - // Main entries in odd-numbered row - unsigned short * p_iscp = buffer.iscp; - - for (evenX = false, srcx = 1, dstx = 0; srcx < (srcWidth - 1); srcx += 2, ++sx) - { - unsigned short icurrent = (unsigned short)(*sx); - isrp = icurrent * 4; - - p_iscp++; - - icurrent = (unsigned short)(*(++sx)); - - *p_iscp = (isr1 + 6 * isr0 + isrp + icurrent) * 4; - isr1 = isr0 + isrp; - isr0 = icurrent; - } - dstx += p_iscp - buffer.iscp; - - //doing the last operation due to even number of operations in previous cycle - if (!(srcWidth & 1)) - { - unsigned short icurrent = (unsigned short)(*sx); 
- isrp = icurrent * 4; - ++dstx; - evenX = !evenX; - ++sx; - } - - // Last entries in row - if (!evenX) - { - // previous srcx was even - ++dstx; - buffer.iscp[dstx] = (isr1 + 11 * isr0) * 4; - } - else - { - buffer.iscp[dstx] = (isr1 + 6 * isr0 + isrp + (isrp >> 2)) * 4; - } - } - evenY = !evenY; - } - } - - // Last Rows - { - if (!evenY) - { - for (dstx = 1, dx = dy; dstx < (dstWidth + 1); ++dstx, ++dx) - *dx = DivideBy256(buffer.isc1[dstx] + 11 * buffer.isc0[dstx]); - } - else - { - for (dstx = 1, dx = dy; dstx < (dstWidth + 1); ++dstx, ++dx) - *dx = DivideBy256(buffer.isc1[dstx] + 6 * buffer.isc0[dstx] + buffer.iscp[dstx] + (buffer.iscp[dstx] >> 2)); - } - } - } - - void ReduceGray5x5(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride, int compensation) - { - if (compensation) - ReduceGray5x5(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); - else - ReduceGray5x5(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); - } - } -} diff --git a/src/3rd/Simd/Simd/SimdBaseReorder.cpp b/src/3rd/Simd/Simd/SimdBaseReorder.cpp deleted file mode 100644 index bffcfacc..00000000 --- a/src/3rd/Simd/Simd/SimdBaseReorder.cpp +++ /dev/null @@ -1,82 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
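The finite-state-machine formulation above is efficient but hard to follow. A direct (and much slower) statement of the same Burt-Adelson reduce computes each output pixel as a 5x5 convolution with the separable kernel (1,4,6,4,1), whose weights sum to 16 * 16 = 256, hence DivideBy256 with the compensated (+128) rounding variant. A hypothetical reference sketch; the clamped border handling here is an assumption for illustration, not an exact reproduction of the FSM's edge terms:

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>

static uint8_t Reduce5x5Ref(const uint8_t* src, ptrdiff_t w, ptrdiff_t h,
                            ptrdiff_t stride, ptrdiff_t dx, ptrdiff_t dy)
{
    static const int k[5] = { 1, 4, 6, 4, 1 }; // separable Gaussian weights
    int sum = 0;
    for (int i = 0; i < 5; ++i)
        for (int j = 0; j < 5; ++j)
        {
            ptrdiff_t y = std::min(std::max(2 * dy + i - 2, (ptrdiff_t)0), h - 1);
            ptrdiff_t x = std::min(std::max(2 * dx + j - 2, (ptrdiff_t)0), w - 1);
            sum += k[i] * k[j] * src[y * stride + x];
        }
    return (uint8_t)((sum + 128) >> 8); // compensated rounding by the kernel sum 256
}

int main()
{
    uint8_t img[16];
    for (int i = 0; i < 16; ++i) img[i] = 200;
    assert(Reduce5x5Ref(img, 4, 4, 4, 1, 1) == 200); // a constant image is preserved
    return 0;
}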
-*/ -#include "Simd/SimdMemory.h" - -namespace Simd -{ - namespace Base - { - SIMD_INLINE void Reorder16bitX(const uint8_t * src, uint8_t * dst) - { - size_t value = *(size_t*)src; -#if defined (SIMD_X64_ENABLE) || defined(SIMD_PPC64_ENABLE) || defined (SIMD_ARM64_ENABLE) - *(size_t*)dst = (value & 0xFF00FF00FF00FF00) >> 8 | (value & 0x00FF00FF00FF00FF) << 8; -#else - *(size_t*)dst = (value & 0xFF00FF00) >> 8 | (value & 0x00FF00FF) << 8; -#endif - } - - void Reorder16bit(const uint8_t * src, size_t size, uint8_t * dst) - { - assert(size % 2 == 0); - - size_t alignedSize = AlignLo(size, sizeof(size_t)); - for (size_t i = 0; i < alignedSize; i += sizeof(size_t)) - Reorder16bitX(src + i, dst + i); - for (size_t i = alignedSize; i < size; i += 2) - Reorder16bit(src + i, dst + i); - } - - SIMD_INLINE void Reorder32bitX(const uint8_t * src, uint8_t * dst) - { -#if defined (SIMD_X64_ENABLE) || defined(SIMD_PPC64_ENABLE) || defined (SIMD_ARM64_ENABLE) - size_t value = *(size_t*)src; - *(size_t*)dst = - (value & 0x000000FF000000FF) << 24 | (value & 0x0000FF000000FF00) << 8 | - (value & 0x00FF000000FF0000) >> 8 | (value & 0xFF000000FF000000) >> 24; -#else - Reorder32bit(src, dst); -#endif - } - - void Reorder32bit(const uint8_t * src, size_t size, uint8_t * dst) - { - assert(size % 4 == 0); - - size_t alignedSize = AlignLo(size, sizeof(size_t)); - for (size_t i = 0; i < alignedSize; i += sizeof(size_t)) - Reorder32bitX(src + i, dst + i); - for (size_t i = alignedSize; i < size; i += 4) - Reorder32bit(src + i, dst + i); - } - - void Reorder64bit(const uint8_t * src, size_t size, uint8_t * dst) - { - assert(size % 8 == 0); - - for (size_t i = 0; i < size; i += 8) - Reorder64bit(src + i, dst + i); - } - } -} diff --git a/src/3rd/Simd/Simd/SimdBaseResizeBilinear.cpp b/src/3rd/Simd/Simd/SimdBaseResizeBilinear.cpp deleted file mode 100644 index 79acc0bd..00000000 --- a/src/3rd/Simd/Simd/SimdBaseResizeBilinear.cpp +++ /dev/null @@ -1,154 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdMemory.h" - -namespace Simd -{ - namespace Base - { - namespace - { - struct Buffer - { - Buffer(size_t width, size_t height) - { - _p = Allocate(2 * sizeof(int)*(2 * width + height)); - ix = (int*)_p; - ax = ix + width; - iy = ax + width; - ay = iy + height; - pbx[0] = (int*)(ay + height); - pbx[1] = pbx[0] + width; - } - - ~Buffer() - { - Free(_p); - } - - int * ix; - int * ax; - int * iy; - int * ay; - int * pbx[2]; - private: - void *_p; - }; - } - - void EstimateAlphaIndex(size_t srcSize, size_t dstSize, int * indexes, int * alphas, size_t channelCount) - { - float scale = (float)srcSize / dstSize; - - for (size_t i = 0; i < dstSize; ++i) - { - float alpha = (float)((i + 0.5)*scale - 0.5); - ptrdiff_t index = (ptrdiff_t)::floor(alpha); - alpha -= index; - - if (index < 0) - { - index = 0; - alpha = 0; - } - - if (index > (ptrdiff_t)srcSize - 2) - { - index = srcSize - 2; - alpha = 1; - } - - for (size_t c = 0; c < channelCount; c++) - { - size_t offset = i*channelCount + c; - indexes[offset] = (int)(channelCount*index + c); - alphas[offset] = (int)(alpha * FRACTION_RANGE + 0.5); - } - } - } - - void ResizeBilinear( - const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount) - { - assert(channelCount >= 1 && channelCount <= 4); - - size_t dstRowSize = channelCount*dstWidth; - - Buffer buffer(dstRowSize, dstHeight); - - EstimateAlphaIndex(srcHeight, dstHeight, buffer.iy, buffer.ay, 1); - - EstimateAlphaIndex(srcWidth, dstWidth, buffer.ix, buffer.ax, channelCount); - - ptrdiff_t previous = -2; - - for (size_t yDst = 0; yDst < dstHeight; yDst++, dst += dstStride) - { - int fy = buffer.ay[yDst]; - ptrdiff_t sy = buffer.iy[yDst]; - int k = 0; - - if (sy == previous) - k = 2; - else if (sy == previous + 1) - { - Swap(buffer.pbx[0], buffer.pbx[1]); - k = 1; - } - - previous = sy; - - for (; k < 2; k++) - { - int* pb = buffer.pbx[k]; - const uint8_t* ps = src + (sy + k)*srcStride; - for (size_t x = 0; x < dstRowSize; x++) - { - size_t sx = buffer.ix[x]; - int fx = buffer.ax[x]; - int t = ps[sx]; - pb[x] = (t << LINEAR_SHIFT) + (ps[sx + channelCount] - t)*fx; - } - } - - if (fy == 0) - for (size_t xDst = 0; xDst < dstRowSize; xDst++) - dst[xDst] = ((buffer.pbx[0][xDst] << LINEAR_SHIFT) + BILINEAR_ROUND_TERM) >> BILINEAR_SHIFT; - else if (fy == FRACTION_RANGE) - for (size_t xDst = 0; xDst < dstRowSize; xDst++) - dst[xDst] = ((buffer.pbx[1][xDst] << LINEAR_SHIFT) + BILINEAR_ROUND_TERM) >> BILINEAR_SHIFT; - else - { - for (size_t xDst = 0; xDst < dstRowSize; xDst++) - { - int t = buffer.pbx[0][xDst]; - dst[xDst] = ((t << LINEAR_SHIFT) + (buffer.pbx[1][xDst] - t)*fy + BILINEAR_ROUND_TERM) >> BILINEAR_SHIFT; - } - } - } - } - } -} - diff --git a/src/3rd/Simd/Simd/SimdBaseResizer.cpp b/src/3rd/Simd/Simd/SimdBaseResizer.cpp deleted file mode 100644 index 3185c297..00000000 --- a/src/3rd/Simd/Simd/SimdBaseResizer.cpp +++ /dev/null @@ -1,392 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdResizer.h" - -namespace Simd -{ - namespace Base - { - ResizerByteBilinear::ResizerByteBilinear(const ResParam & param) - : Resizer(param) - { - _ay.Resize(_param.dstH); - _iy.Resize(_param.dstH); - EstimateIndexAlpha(_param.srcH, _param.dstH, 1, _iy.data, _ay.data); - } - - void ResizerByteBilinear::EstimateIndexAlpha(size_t srcSize, size_t dstSize, size_t channels, int32_t * indices, int32_t * alphas) - { - float scale = (float)srcSize / dstSize; - - for (size_t i = 0; i < dstSize; ++i) - { - float alpha = (float)((i + 0.5f)*scale - 0.5f); - ptrdiff_t index = (ptrdiff_t)::floor(alpha); - alpha -= index; - - if (index < 0) - { - index = 0; - alpha = 0; - } - - if (index >(ptrdiff_t)srcSize - 2) - { - index = srcSize - 2; - alpha = 1; - } - - for (size_t c = 0; c < channels; c++) - { - size_t offset = i * channels + c; - indices[offset] = (int32_t)(channels*index + c); - alphas[offset] = (int32_t)(alpha * FRACTION_RANGE + 0.5f); - } - } - } - - void ResizerByteBilinear::Run(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride) - { - size_t cn = _param.channels; - size_t rs = _param.dstW * cn; - if (_ax.data == 0) - { - _ax.Resize(rs); - _ix.Resize(rs); - EstimateIndexAlpha(_param.srcW, _param.dstW, cn, _ix.data, _ax.data); - _bx[0].Resize(rs); - _bx[1].Resize(rs); - } - int32_t * pbx[2] = { _bx[0].data, _bx[1].data }; - int32_t prev = -2; - for (size_t dy = 0; dy < _param.dstH; dy++, dst += dstStride) - { - int32_t fy = _ay[dy]; - int32_t sy = _iy[dy]; - int32_t k = 0; - - if (sy == prev) - k = 2; - else if (sy == prev + 1) - { - Swap(pbx[0], pbx[1]); - k = 1; - } - - prev = sy; - - for (; k < 2; k++) - { - int32_t * pb = pbx[k]; - const uint8_t * ps = src + (sy + k)*srcStride; - for (size_t dx = 0; dx < rs; dx++) - { - int32_t sx = _ix[dx]; - int32_t fx = _ax[dx]; - int32_t t = ps[sx]; - pb[dx] = (t << LINEAR_SHIFT) + (ps[sx + cn] - t)*fx; - } - } - - if (fy == 0) - for (size_t dx = 0; dx < rs; dx++) - dst[dx] = ((pbx[0][dx] << LINEAR_SHIFT) + BILINEAR_ROUND_TERM) >> BILINEAR_SHIFT; - else if (fy == FRACTION_RANGE) - for (size_t dx = 0; dx < rs; dx++) - dst[dx] = ((pbx[1][dx] << LINEAR_SHIFT) + BILINEAR_ROUND_TERM) >> BILINEAR_SHIFT; - else - { - for (size_t dx = 0; dx < rs; dx++) - { - int32_t t = pbx[0][dx]; - dst[dx] = ((t << LINEAR_SHIFT) + (pbx[1][dx] - t)*fy + BILINEAR_ROUND_TERM) >> BILINEAR_SHIFT; - } - } - } - } - - 
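A minimal sketch of how the Base-level resizers in this file could be driven through ResizerInit, which is defined at the bottom of this file. The SimdResizeChannelByte enumerator, the virtual Resizer::Run signature, and releasing the context with delete are assumptions inferred from the overrides above, not verified library API:

#include "Simd/SimdResizer.h"
#include <cstddef>
#include <cstdint>

void ExampleResize(const uint8_t* src, size_t srcW, size_t srcH,
                   uint8_t* dst, size_t dstW, size_t dstH)
{
    void* r = Simd::Base::ResizerInit(srcW, srcH, dstW, dstH, 1,
        SimdResizeChannelByte, SimdResizeMethodBilinear);
    if (r)
    {
        // stride == width for tightly packed single-channel gray images
        ((Simd::Resizer*)r)->Run(src, srcW, dst, dstW);
        delete (Simd::Resizer*)r;
    }
}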
//--------------------------------------------------------------------- - - ResizerByteArea::ResizerByteArea(const ResParam & param) - : Resizer(param) - { - double scale = Simd::Max(float(_param.srcW) / _param.dstW, float(_param.srcH) / _param.dstH); - - _ay.Resize(_param.dstH + 1); - _iy.Resize(_param.dstH + 1); - EstimateParams(_param.srcH, _param.dstH, Base::AREA_RANGE, _ay.data, _iy.data); - - _ax.Resize(_param.dstW + 1); - _ix.Resize(_param.dstW + 1); - EstimateParams(_param.srcW, _param.dstW, Base::AREA_RANGE, _ax.data, _ix.data); - } - - void ResizerByteArea::EstimateParams(size_t srcSize, size_t dstSize, size_t range, int32_t * alpha, int32_t * index) - { - float scale = (float)srcSize / dstSize; - - for (size_t ds = 0; ds <= dstSize; ++ds) - { - float a = (float)ds*scale; - size_t i = (size_t)::floor(a); - a -= i; - if (i == srcSize) - { - i--; - a = 1.0f; - } - alpha[ds] = int32_t(range * (1.0f - a) / scale); - index[ds] = int32_t(i); - } - } - - template SIMD_INLINE void ResizerByteAreaSet(const T * src, int32_t value, int32_t * dst) - { - for (size_t c = 0; c < N; ++c) - dst[c] = src[c] * value; - } - - template SIMD_INLINE void ResizerByteAreaAdd(const T * src, int32_t value, int32_t * dst) - { - for (size_t c = 0; c < N; ++c) - dst[c] += src[c] * value; - } - - template SIMD_INLINE void ResizerByteAreaPixelRowSum(const uint8_t * src, size_t size, int32_t nose, int32_t body, int32_t tail, int32_t * dst) - { - ResizerByteAreaSet(src, nose, dst); - for (size_t i = 0; i < size; ++i) - { - src += N; - ResizerByteAreaAdd(src, body, dst); - } - ResizerByteAreaAdd(src, tail, dst); - } - - template SIMD_INLINE void ResizerByteAreaRes(const int32_t * src, uint8_t * dst) - { - for (size_t c = 0; c < N; ++c) - dst[c] = uint8_t((src[c] + Base::AREA_ROUND) >> Base::AREA_SHIFT); - } - - template void ResizerByteArea::Run(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride) - { - int32_t ts[N], rs[N]; - int32_t ayb = _ay.data[0], axb = _ax.data[0]; - for (size_t dy = 0; dy < _param.dstH; dy++, dst += dstStride) - { - size_t by = _iy.data[dy], ey = _iy.data[dy + 1]; - int32_t ayn = _ay.data[dy], ayt = - _ay.data[dy + 1]; - for (size_t dx = 0; dx < _param.dstW; dx++) - { - size_t bx = _ix.data[dx], sx = _ix.data[dx + 1] - bx; - int32_t axn = _ax.data[dx], axt = - _ax.data[dx + 1]; - const uint8_t * s = src + by * srcStride + bx * N; - ResizerByteAreaPixelRowSum(s, sx, axn, axb, axt, rs); - ResizerByteAreaSet(rs, ayn, ts); - for (size_t sy = by; sy < ey; sy++) - { - s += srcStride; - ResizerByteAreaPixelRowSum(s, sx, axn, axb, axt, rs); - ResizerByteAreaAdd(rs, ayb, ts); - } - ResizerByteAreaPixelRowSum(s, sx, axn, axb, axt, rs); - ResizerByteAreaAdd(rs, ayt, ts); - ResizerByteAreaRes(ts, dst + dx * N); - } - } - } - - void ResizerByteArea::Run(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride) - { - switch (_param.channels) - { - case 1: Run<1>(src, srcStride, dst, dstStride); return; - case 2: Run<2>(src, srcStride, dst, dstStride); return; - case 3: Run<3>(src, srcStride, dst, dstStride); return; - case 4: Run<4>(src, srcStride, dst, dstStride); return; - default: - assert(0); - } - } - - //--------------------------------------------------------------------- - - ResizerFloatBilinear::ResizerFloatBilinear(const ResParam & param) - : Resizer(param) - { - _ay.Resize(_param.dstH, false, _param.align); - _iy.Resize(_param.dstH, false, _param.align); - EstimateIndexAlpha(_param.srcH, _param.dstH, 1, _iy.data, _ay.data); - size_t rs = _param.dstW * 
_param.channels; - _ax.Resize(rs, false, _param.align); - _ix.Resize(rs, false, _param.align); - EstimateIndexAlpha(_param.srcW, _param.dstW, _param.channels, _ix.data, _ax.data); - _bx[0].Resize(rs, false, _param.align); - _bx[1].Resize(rs, false, _param.align); - } - - void ResizerFloatBilinear::EstimateIndexAlpha(size_t srcSize, size_t dstSize, size_t channels, int32_t * indices, float * alphas) - { - if (_param.method == SimdResizeMethodBilinear) - { - float scale = (float)srcSize / dstSize; - for (size_t i = 0; i < dstSize; ++i) - { - float alpha = (float)((i + 0.5f) * scale - 0.5f); - ptrdiff_t index = (ptrdiff_t)::floor(alpha); - alpha -= index; - if (index < 0) - { - index = 0; - alpha = 0; - } - if (index > (ptrdiff_t)srcSize - 2) - { - index = srcSize - 2; - alpha = 1; - } - for (size_t c = 0; c < channels; c++) - { - size_t offset = i * channels + c; - indices[offset] = (int32_t)(channels * index + c); - alphas[offset] = alpha; - } - } - } - else if (_param.method == SimdResizeMethodCaffeInterp) - { - float scale = dstSize > 1 ? float(srcSize - 1) / float(dstSize - 1) : 0.0f; - for (size_t i = 0; i < dstSize; ++i) - { - float alpha = float(i) * scale; - ptrdiff_t index = (ptrdiff_t)::floor(alpha); - alpha -= index; - if (index > (ptrdiff_t)srcSize - 2) - { - index = srcSize - 2; - alpha = 1; - } - for (size_t c = 0; c < channels; c++) - { - size_t offset = i * channels + c; - indices[offset] = (int32_t)(channels * index + c); - alphas[offset] = alpha; - } - } - } - else if (_param.method == SimdResizeMethodInferenceEngineInterp) - { - float scale = (float)srcSize / dstSize; - for (size_t i = 0; i < dstSize; ++i) - { - float alpha = float(i) * scale; - ptrdiff_t index = (ptrdiff_t)::floor(alpha); - alpha -= index; - if (index < 0) - { - index = 0; - alpha = 0; - } - if (index > (ptrdiff_t)srcSize - 2) - { - index = srcSize - 2; - alpha = 1; - } - for (size_t c = 0; c < channels; c++) - { - size_t offset = i * channels + c; - indices[offset] = (int32_t)(channels * index + c); - alphas[offset] = alpha; - } - } - } - else - assert(0); - - } - - void ResizerFloatBilinear::Run(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride) - { - Run((const float*)src, srcStride / sizeof(float), (float*)dst, dstStride / sizeof(float)); - } - - void ResizerFloatBilinear::Run(const float * src, size_t srcStride, float * dst, size_t dstStride) - { - size_t cn = _param.channels; - size_t rs = _param.dstW * cn; - float * pbx[2] = { _bx[0].data, _bx[1].data }; - int32_t prev = -2; - for (size_t dy = 0; dy < _param.dstH; dy++, dst += dstStride) - { - float fy1 = _ay[dy]; - float fy0 = 1.0f - fy1; - int32_t sy = _iy[dy]; - int32_t k = 0; - - if (sy == prev) - k = 2; - else if (sy == prev + 1) - { - Swap(pbx[0], pbx[1]); - k = 1; - } - - prev = sy; - - for (; k < 2; k++) - { - float * pb = pbx[k]; - const float * ps = src + (sy + k)*srcStride; - for (size_t dx = 0; dx < rs; dx++) - { - int32_t sx = _ix[dx]; - float fx = _ax[dx]; - pb[dx] = ps[sx]*(1.0f - fx) + ps[sx + cn]*fx; - } - } - - for (size_t dx = 0; dx < rs; dx++) - dst[dx] = pbx[0][dx]*fy0 + pbx[1][dx]*fy1; - } - } - - //--------------------------------------------------------------------- - - void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method) - { - ResParam param(srcX, srcY, dstX, dstY, channels, type, method, sizeof(void*)); - if (param.IsByteBilinear()) - return new ResizerByteBilinear(param); - else if (param.IsByteArea()) - return 
new ResizerByteArea(param); - else if (param.IsFloatBilinear()) - return new ResizerFloatBilinear(param); - else - return NULL; - } - } -} - diff --git a/src/3rd/Simd/Simd/SimdBaseSegmentation.cpp b/src/3rd/Simd/Simd/SimdBaseSegmentation.cpp deleted file mode 100644 index bfaeb919..00000000 --- a/src/3rd/Simd/Simd/SimdBaseSegmentation.cpp +++ /dev/null @@ -1,195 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdDefs.h" - -namespace Simd -{ - namespace Base - { - SIMD_INLINE void FillSingleHole(uint8_t * mask, ptrdiff_t stride, uint8_t index) - { - if (mask[-stride] == index && mask[stride] == index && mask[-1] == index && mask[1] == index) - mask[0] = index; - } - - void SegmentationFillSingleHoles(uint8_t * mask, size_t stride, size_t width, size_t height, uint8_t index) - { - assert(width > 2 && height > 2); - - mask += stride + 1; - height -= 2; - width -= 2; - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < width; ++col) - { - FillSingleHole(mask + col, stride, index); - } - mask += stride; - } - } - - void SegmentationChangeIndex(uint8_t * mask, size_t stride, size_t width, size_t height, uint8_t oldIndex, uint8_t newIndex) - { - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < width; ++col) - { - if (mask[col] == oldIndex) - mask[col] = newIndex; - } - mask += stride; - } - } - - void SegmentationPropagate2x2(const uint8_t * parent, size_t parentStride, size_t width, size_t height, - uint8_t * child, size_t childStride, const uint8_t * difference, size_t differenceStride, - uint8_t currentIndex, uint8_t invalidIndex, uint8_t emptyIndex, uint8_t differenceThreshold) - { - assert(width >= 2 && height >= 2); - - width--; - height--; - for (size_t parentRow = 0, childRow = 1; parentRow < height; ++parentRow, childRow += 2) - { - const uint8_t * parent0 = parent + parentRow*parentStride; - const uint8_t * parent1 = parent0 + parentStride; - const uint8_t * difference0 = difference + childRow*differenceStride; - const uint8_t * difference1 = difference0 + differenceStride; - uint8_t * child0 = child + childRow*childStride; - uint8_t * child1 = child0 + childStride; - for (size_t parentCol = 0, childCol = 1; parentCol < width; ++parentCol, childCol += 2) - { - const bool parent00 = parent0[parentCol] == currentIndex; - const bool parent01 = parent0[parentCol + 1] == currentIndex; - const bool parent10 
= parent1[parentCol] == currentIndex; - const bool parent11 = parent1[parentCol + 1] == currentIndex; - - const bool parentOne = parent00 || parent01 || parent10 || parent11; - const bool parentAll = parent00 && parent01 && parent10 && parent11; - - const bool difference00 = difference0[childCol] > differenceThreshold; - const bool difference01 = difference0[childCol + 1] > differenceThreshold; - const bool difference10 = difference1[childCol] > differenceThreshold; - const bool difference11 = difference1[childCol + 1] > differenceThreshold; - - uint8_t & child00 = child0[childCol]; - uint8_t & child01 = child0[childCol + 1]; - uint8_t & child10 = child1[childCol]; - uint8_t & child11 = child1[childCol + 1]; - - if (child00 < invalidIndex) - child00 = parentAll || (parentOne && difference00) ? currentIndex : emptyIndex; - - if (child01 < invalidIndex) - child01 = parentAll || (parentOne && difference01) ? currentIndex : emptyIndex; - - if (child10 < invalidIndex) - child10 = parentAll || (parentOne && difference10) ? currentIndex : emptyIndex; - - if (child11 < invalidIndex) - child11 = parentAll || (parentOne && difference11) ? currentIndex : emptyIndex; - } - } - } - - void SegmentationShrinkRegion(const uint8_t * mask, size_t stride, size_t width, size_t height, uint8_t index, - ptrdiff_t * left, ptrdiff_t * top, ptrdiff_t * right, ptrdiff_t * bottom) - { - assert(*left < *right && *top < *bottom); - assert(*left >= 0 && *right <= (ptrdiff_t)width && *top >= 0 && *bottom <= (ptrdiff_t)height); - - bool search = true; - for (ptrdiff_t row = *top; search && row < *bottom; ++row) - { - const uint8_t * _mask = mask + row*stride; - for (ptrdiff_t col = *left; col < *right; ++col) - { - if (_mask[col] == index) - { - search = false; - *top = row; - break; - } - } - } - - if (search) - { - *left = 0; - *top = 0; - *right = 0; - *bottom = 0; - return; - } - - search = true; - for (ptrdiff_t row = *bottom - 1; search && row >= *top; --row) - { - const uint8_t * _mask = mask + row*stride; - for (ptrdiff_t col = *left; col < *right; ++col) - { - if (_mask[col] == index) - { - search = false; - *bottom = row + 1; - break; - } - } - } - - search = true; - for (ptrdiff_t col = *left; search && col < *right; ++col) - { - const uint8_t * _mask = mask + (*top)*stride + col; - for (ptrdiff_t row = *top; row < *bottom; ++row) - { - if (*_mask == index) - { - search = false; - *left = col; - break; - } - _mask += stride; - } - } - - search = true; - for (ptrdiff_t col = *right - 1; search && col >= *left; --col) - { - const uint8_t * _mask = mask + (*top)*stride + col; - for (ptrdiff_t row = *top; row < *bottom; ++row) - { - if (*_mask == index) - { - search = false; - *right = col + 1; - break; - } - _mask += stride; - } - } - } - } -} diff --git a/src/3rd/Simd/Simd/SimdBaseShiftBilinear.cpp b/src/3rd/Simd/Simd/SimdBaseShiftBilinear.cpp deleted file mode 100644 index 64325d1c..00000000 --- a/src/3rd/Simd/Simd/SimdBaseShiftBilinear.cpp +++ /dev/null @@ -1,287 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. 
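SegmentationFillSingleHoles above assigns `index` to a pixel when all four of its 4-neighbours already carry `index`. A tiny standalone check of that predicate (a sketch mirroring the deleted FillSingleHole helper):

#include <cassert>
#include <cstdint>

int main()
{
    uint8_t m[9] = { 0, 1, 0,
                     1, 0, 1,
                     0, 1, 0 };   // the centre pixel is a single hole in index 1
    uint8_t* c = m + 4;           // centre of a 3x3 mask, stride == 3
    if (c[-3] == 1 && c[3] == 1 && c[-1] == 1 && c[1] == 1)
        c[0] = 1;                 // fill the hole, as FillSingleHole does
    assert(m[4] == 1);
    return 0;
}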
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMath.h" -#include "Simd/SimdBase.h" - -namespace Simd -{ - namespace Base - { - SIMD_INLINE int Interpolate(int s[2][2], int k[2][2]) - { - return (s[0][0] * k[0][0] + s[0][1] * k[0][1] + - s[1][0] * k[1][0] + s[1][1] * k[1][1] + BILINEAR_ROUND_TERM) >> BILINEAR_SHIFT; - } - - SIMD_INLINE int Interpolate(const unsigned char *src, size_t dx, size_t dy, int k[2][2]) - { - return (src[0] * k[0][0] + src[dx] * k[0][1] + - src[dy] * k[1][0] + src[dx + dy] * k[1][1] + BILINEAR_ROUND_TERM) >> BILINEAR_SHIFT; - } - - SIMD_INLINE int Interpolate(const unsigned char *src, size_t dr, int k[2]) - { - return (src[0] * k[0] + src[dr] * k[1] + LINEAR_ROUND_TERM) >> LINEAR_SHIFT; - } - - void MixBorder(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t channelCount, - const uint8_t * bkg, size_t bkgStride, ptrdiff_t iDx, ptrdiff_t iDy, int fDx, int fDy, uint8_t * dst, size_t dstStride) - { - size_t bkgWidth = Abs(iDx) - (iDx < 0 && fDx ? 1 : 0); - size_t bkgHeight = Abs(iDy) - (iDy < 0 && fDy ? 1 : 0); - - size_t mainWidth = width - bkgWidth - (fDx ? 1 : 0); - size_t mainHeight = height - bkgHeight - (fDy ? 1 : 0); - - int k[2][2]; - k[0][0] = (FRACTION_RANGE - fDx)*(FRACTION_RANGE - fDy); - k[0][1] = fDx*(FRACTION_RANGE - fDy); - k[1][0] = (FRACTION_RANGE - fDx)*fDy; - k[1][1] = fDx*fDy; - - if (fDx) - { - const uint8_t * ps[2][2]; - size_t xOffset = (iDx >= 0 ? width - 1 - iDx : -iDx - 1)*channelCount; - size_t bkgOffset = (iDy > 0 ? 0 : -iDy)*bkgStride + xOffset; - size_t dstOffset = (iDy > 0 ? 0 : -iDy)*dstStride + xOffset; - - if (iDx < 0) - { - ps[0][0] = bkg + bkgOffset; - ps[0][1] = src + (iDy < 0 ? 0 : iDy)*srcStride; - ps[1][0] = bkg + bkgOffset; - ps[1][1] = src + ((iDy < 0 ? 0 : iDy) + (fDy ? 1 : 0))*srcStride; - } - else - { - ps[0][0] = src + (iDy < 0 ? 0 : iDy)*srcStride + (width - 1)*channelCount; - ps[0][1] = bkg + bkgOffset; - ps[1][0] = src + ((iDy < 0 ? 0 : iDy) + (fDy ? 
1 : 0))*srcStride + (width - 1)*channelCount; - ps[1][1] = bkg + bkgOffset; - } - - for (size_t row = 0; row < mainHeight; ++row) - { - for (size_t channel = 0; channel < channelCount; channel++) - { - int s[2][2]; - s[0][0] = ps[0][0][channel]; - s[0][1] = ps[0][1][channel]; - s[1][0] = ps[1][0][channel]; - s[1][1] = ps[1][1][channel]; - dst[dstOffset + channel] = Interpolate(s, k); - } - ps[0][0] += srcStride; - ps[0][1] += bkgStride; - ps[1][0] += srcStride; - ps[1][1] += bkgStride; - dstOffset += dstStride; - } - } - - if (fDy) - { - const uint8_t * ps[2][2]; - size_t bkgOffset = (iDy >= 0 ? height - 1 - iDy : -iDy - 1)*bkgStride + (iDx > 0 ? 0 : -iDx)*channelCount; - size_t dstOffset = (iDy >= 0 ? height - 1 - iDy : -iDy - 1)*dstStride + (iDx > 0 ? 0 : -iDx)*channelCount; - - if (iDy < 0) - { - ps[0][0] = bkg + bkgOffset; - ps[0][1] = bkg + bkgOffset; - ps[1][0] = src + (iDx < 0 ? 0 : iDx)*channelCount; - ps[1][1] = src + ((iDx < 0 ? 0 : iDx) + (fDx ? 1 : 0))*channelCount; - } - else - { - ps[0][0] = src + (height - 1)*srcStride + (iDx < 0 ? 0 : iDx)*channelCount; - ps[0][1] = src + (height - 1)*srcStride + ((iDx < 0 ? 0 : iDx) + (fDx ? 1 : 0))*channelCount; - ps[1][0] = bkg + bkgOffset; - ps[1][1] = bkg + bkgOffset; - } - - for (size_t col = 0; col < mainWidth; ++col) - { - for (size_t channel = 0; channel < channelCount; channel++) - { - int s[2][2]; - s[0][0] = ps[0][0][channel]; - s[0][1] = ps[0][1][channel]; - s[1][0] = ps[1][0][channel]; - s[1][1] = ps[1][1][channel]; - dst[dstOffset + channel] = Interpolate(s, k); - } - ps[0][0] += channelCount; - ps[0][1] += channelCount; - ps[1][0] += channelCount; - ps[1][1] += channelCount; - dstOffset += channelCount; - } - } - - if (fDx && fDy) - { - const uint8_t * ps[2][2]; - size_t xOffset = (iDx >= 0 ? width - 1 - iDx : -iDx - 1)*channelCount; - size_t bkgOffset = (iDy >= 0 ? height - 1 - iDy : -iDy - 1)*bkgStride + xOffset; - size_t dstOffset = (iDy >= 0 ? height - 1 - iDy : -iDy - 1)*dstStride + xOffset; - - ps[0][0] = (iDx >= 0 && iDy >= 0) ? (src + (height - 1)*srcStride + (width - 1)*channelCount) : bkg + bkgOffset; - ps[0][1] = (iDx < 0 && iDy >= 0) ? (src + (height - 1)*srcStride) : bkg + bkgOffset; - ps[1][0] = (iDx >= 0 && iDy < 0) ? (src + (width - 1)*channelCount) : bkg + bkgOffset; - ps[1][1] = (iDx < 0 && iDy < 0) ? 
(src) : bkg + bkgOffset; - - for (size_t channel = 0; channel < channelCount; channel++) - { - int s[2][2]; - s[0][0] = ps[0][0][channel]; - s[0][1] = ps[0][1][channel]; - s[1][0] = ps[1][0][channel]; - s[1][1] = ps[1][1][channel]; - dst[dstOffset + channel] = Interpolate(s, k); - } - } - } - - void CommonShiftAction( - const uint8_t * & src, size_t srcStride, size_t & width, size_t & height, size_t channelCount, - const uint8_t * bkg, size_t bkgStride, const double * shiftX, const double * shiftY, - size_t cropLeft, size_t cropTop, size_t cropRight, size_t cropBottom, uint8_t * & dst, size_t dstStride, - int & fDx, int & fDy) - { - assert(cropLeft <= cropRight && cropTop <= cropBottom && cropRight <= width && cropBottom <= height); - assert(*shiftX < cropRight - cropLeft && *shiftY < cropBottom - cropTop); - - Base::CopyFrame(src, srcStride, width, height, channelCount, cropLeft, cropTop, cropRight, cropBottom, dst, dstStride); - - dst += dstStride*cropTop + cropLeft*channelCount; - src += srcStride*cropTop + cropLeft*channelCount; - bkg += bkgStride*cropTop + cropLeft*channelCount; - width = cropRight - cropLeft; - height = cropBottom - cropTop; - - ptrdiff_t iDx = (ptrdiff_t)floor(*shiftX + FRACTION_ROUND_TERM); - ptrdiff_t iDy = (ptrdiff_t)floor(*shiftY + FRACTION_ROUND_TERM); - fDx = (int)floor((*shiftX + FRACTION_ROUND_TERM - iDx)*FRACTION_RANGE); - fDy = (int)floor((*shiftY + FRACTION_ROUND_TERM - iDy)*FRACTION_RANGE); - - ptrdiff_t left = (iDx < 0 ? (-iDx - (fDx ? 1 : 0)) : 0); - ptrdiff_t top = (iDy < 0 ? (-iDy - (fDy ? 1 : 0)) : 0); - ptrdiff_t right = (iDx < 0 ? width : width - iDx); - ptrdiff_t bottom = (iDy < 0 ? height : height - iDy); - - Base::CopyFrame(bkg, bkgStride, width, height, channelCount, left, top, right, bottom, dst, dstStride); - - MixBorder(src, srcStride, width, height, channelCount, bkg, bkgStride, iDx, iDy, fDx, fDy, dst, dstStride); - - src += Simd::Max((ptrdiff_t)0, iDy)*srcStride + Simd::Max((ptrdiff_t)0, iDx)*channelCount; - dst += Simd::Max((ptrdiff_t)0, -iDy)*dstStride + Simd::Max((ptrdiff_t)0, -iDx)*channelCount; - - width = width - Abs(iDx) + (iDx < 0 && fDx ? 1 : 0) - (fDx ? 1 : 0); - height = height - Abs(iDy) + (iDy < 0 && fDy ? 1 : 0) - (fDy ? 
1 : 0); - } - - void ShiftBilinear(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t channelCount, - int fDx, int fDy, uint8_t * dst, size_t dstStride) - { - size_t size = width*channelCount; - if (fDy) - { - if (fDx) - { - int k[2][2]; - k[0][0] = (FRACTION_RANGE - fDx)*(FRACTION_RANGE - fDy); - k[0][1] = fDx*(FRACTION_RANGE - fDy); - k[1][0] = (FRACTION_RANGE - fDx)*fDy; - k[1][1] = fDx*fDy; - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < size; col++) - { - dst[col] = Interpolate(src + col, channelCount, srcStride, k); - } - src += srcStride; - dst += dstStride; - } - } - else - { - int k[2]; - k[0] = FRACTION_RANGE - fDy; - k[1] = fDy; - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < size; col++) - { - dst[col] = Interpolate(src + col, srcStride, k); - } - src += srcStride; - dst += dstStride; - } - } - } - else - { - if (fDx) - { - int k[2]; - k[0] = FRACTION_RANGE - fDx; - k[1] = fDx; - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < size; col++) - { - dst[col] = Interpolate(src + col, channelCount, k); - } - src += srcStride; - dst += dstStride; - } - } - else - { - for (size_t row = 0; row < height; ++row) - { - memcpy(dst, src, size); - src += srcStride; - dst += dstStride; - } - } - } - } - - void ShiftBilinear( - const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t channelCount, - const uint8_t * bkg, size_t bkgStride, const double * shiftX, const double * shiftY, - size_t cropLeft, size_t cropTop, size_t cropRight, size_t cropBottom, uint8_t * dst, size_t dstStride) - { - int fDx, fDy; - CommonShiftAction(src, srcStride, width, height, channelCount, bkg, bkgStride, shiftX, shiftY, - cropLeft, cropTop, cropRight, cropBottom, dst, dstStride, fDx, fDy); - - ShiftBilinear(src, srcStride, width, height, channelCount, fDx, fDy, dst, dstStride); - } - } -} - diff --git a/src/3rd/Simd/Simd/SimdBaseSobel.cpp b/src/3rd/Simd/Simd/SimdBaseSobel.cpp deleted file mode 100644 index 817131da..00000000 --- a/src/3rd/Simd/Simd/SimdBaseSobel.cpp +++ /dev/null @@ -1,318 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
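SobelDx in the SimdBaseSobel.cpp deletion that follows is the standard 3x3 Sobel x-derivative kernel [-1 0 +1; -2 0 +2; -1 0 +1], written as a difference of two weighted column sums. A tiny equivalence check (a sketch, not library code):

#include <cassert>
#include <cstddef>
#include <cstdint>

static int SobelDxRef(const uint8_t* s0, const uint8_t* s1, const uint8_t* s2,
                      size_t x0, size_t x2)
{
    // difference of the right and left 1-2-1 column sums, as in the file below
    return (s0[x2] + 2 * s1[x2] + s2[x2]) - (s0[x0] + 2 * s1[x0] + s2[x0]);
}

int main()
{
    const uint8_t r0[3] = { 1, 5, 9 }, r1[3] = { 2, 6, 10 }, r2[3] = { 3, 7, 11 };
    // full 3x3 convolution with the Sobel kernel, spelled out term by term
    int full = -r0[0] + r0[2] - 2 * r1[0] + 2 * r1[2] - r2[0] + r2[2];
    assert(SobelDxRef(r0, r1, r2, 0, 2) == full); // both give 32
    return 0;
}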
-*/ -#include "Simd/SimdMath.h" - -namespace Simd -{ - namespace Base - { - template int SobelDx(const uint8_t *s0, const uint8_t *s1, const uint8_t *s2, size_t x0, size_t x2); - - template <> SIMD_INLINE int SobelDx(const uint8_t *s0, const uint8_t *s1, const uint8_t *s2, size_t x0, size_t x2) - { - return (s0[x2] + 2 * s1[x2] + s2[x2]) - (s0[x0] + 2 * s1[x0] + s2[x0]); - } - - template <> SIMD_INLINE int SobelDx(const uint8_t *s0, const uint8_t *s1, const uint8_t *s2, size_t x0, size_t x2) - { - return Simd::Abs(SobelDx(s0, s1, s2, x0, x2)); - } - - template void SobelDx(const uint8_t * src, size_t srcStride, size_t width, size_t height, int16_t * dst, size_t dstStride) - { - assert(width > 1); - - const uint8_t *src0, *src1, *src2; - - for (size_t row = 0; row < height; ++row) - { - src0 = src + srcStride*(row - 1); - src1 = src0 + srcStride; - src2 = src1 + srcStride; - if (row == 0) - src0 = src1; - if (row == height - 1) - src2 = src1; - - dst[0] = SobelDx(src0, src1, src2, 0, 1); - - for (size_t col = 1; col < width - 1; ++col) - dst[col] = SobelDx(src0, src1, src2, col - 1, col + 1); - - dst[width - 1] = SobelDx(src0, src1, src2, width - 2, width - 1); - - dst += dstStride; - } - } - - void SobelDx(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride) - { - assert(dstStride % sizeof(int16_t) == 0); - - SobelDx(src, srcStride, width, height, (int16_t *)dst, dstStride / sizeof(int16_t)); - } - - void SobelDxAbs(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride) - { - assert(dstStride % sizeof(int16_t) == 0); - - SobelDx(src, srcStride, width, height, (int16_t *)dst, dstStride / sizeof(int16_t)); - } - - void SobelDxAbsSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum) - { - assert(width > 1); - - const uint8_t *src0, *src1, *src2; - - *sum = 0; - for (size_t row = 0; row < height; ++row) - { - src0 = src + stride*(row - 1); - src1 = src0 + stride; - src2 = src1 + stride; - if (row == 0) - src0 = src1; - if (row == height - 1) - src2 = src1; - -#ifdef __GNUC__ - size_t rowSum = 0; -#else - uint32_t rowSum = 0; -#endif - - rowSum += SobelDx(src0, src1, src2, 0, 1); - - for (size_t col = 1; col < width - 1; ++col) - rowSum += SobelDx(src0, src1, src2, col - 1, col + 1); - - rowSum += SobelDx(src0, src1, src2, width - 2, width - 1); - - *sum += rowSum; - } - } - - template SIMD_INLINE int SobelDy(const uint8_t *s0, const uint8_t *s2, size_t x0, size_t x1, size_t x2); - - template <> SIMD_INLINE int SobelDy(const uint8_t *s0, const uint8_t *s2, size_t x0, size_t x1, size_t x2) - { - return (s2[x0] + 2 * s2[x1] + s2[x2]) - (s0[x0] + 2 * s0[x1] + s0[x2]); - } - - template <> SIMD_INLINE int SobelDy(const uint8_t *s0, const uint8_t *s2, size_t x0, size_t x1, size_t x2) - { - return Simd::Abs(SobelDy(s0, s2, x0, x1, x2)); - } - - template void SobelDy(const uint8_t * src, size_t srcStride, size_t width, size_t height, int16_t * dst, size_t dstStride) - { - assert(width > 1); - - const uint8_t *src0, *src1, *src2; - - for (size_t row = 0; row < height; ++row) - { - src0 = src + srcStride*(row - 1); - src1 = src0 + srcStride; - src2 = src1 + srcStride; - if (row == 0) - src0 = src1; - if (row == height - 1) - src2 = src1; - - dst[0] = SobelDy(src0, src2, 0, 0, 1); - - for (size_t col = 1; col < width - 1; ++col) - dst[col] = SobelDy(src0, src2, col - 1, col, col + 1); - - dst[width - 1] = SobelDy(src0, src2, width - 2, width - 1, width - 1); - - dst += dstStride; - 
} - } - - void SobelDy(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride) - { - assert(dstStride % sizeof(int16_t) == 0); - - SobelDy(src, srcStride, width, height, (int16_t *)dst, dstStride / sizeof(int16_t)); - } - - void SobelDyAbs(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride) - { - assert(dstStride % sizeof(int16_t) == 0); - - SobelDy(src, srcStride, width, height, (int16_t *)dst, dstStride / sizeof(int16_t)); - } - - void SobelDyAbsSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum) - { - assert(width > 1); - - const uint8_t *src0, *src1, *src2; - - *sum = 0; - for (size_t row = 0; row < height; ++row) - { - src0 = src + stride*(row - 1); - src1 = src0 + stride; - src2 = src1 + stride; - if (row == 0) - src0 = src1; - if (row == height - 1) - src2 = src1; - -#ifdef __GNUC__ - size_t rowSum = 0; -#else - uint32_t rowSum = 0; -#endif - - rowSum += SobelDy(src0, src2, 0, 0, 1); - - for (size_t col = 1; col < width - 1; ++col) - rowSum += SobelDy(src0, src2, col - 1, col, col + 1); - - rowSum += SobelDy(src0, src2, width - 2, width - 1, width - 1); - - *sum += rowSum; - } - } - - SIMD_INLINE int ContourMetrics(const uint8_t *s0, const uint8_t *s1, const uint8_t *s2, size_t x0, size_t x1, size_t x2) - { - int dx = SobelDx(s0, s1, s2, x0, x2); - int dy = SobelDy(s0, s2, x0, x1, x2); - return (dx + dy) * 2 + (dx >= dy ? 0 : 1); - } - - void ContourMetrics(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint16_t * dst, size_t dstStride) - { - assert(width > 1); - - const uint8_t *src0, *src1, *src2; - - for (size_t row = 0; row < height; ++row) - { - src0 = src + srcStride*(row - 1); - src1 = src0 + srcStride; - src2 = src1 + srcStride; - if (row == 0) - src0 = src1; - if (row == height - 1) - src2 = src1; - - dst[0] = ContourMetrics(src0, src1, src2, 0, 0, 1); - - for (size_t col = 1; col < width - 1; ++col) - dst[col] = ContourMetrics(src0, src1, src2, col - 1, col, col + 1); - - dst[width - 1] = ContourMetrics(src0, src1, src2, width - 2, width - 1, width - 1); - - dst += dstStride; - } - } - - void ContourMetrics(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride) - { - assert(dstStride % sizeof(int16_t) == 0); - - ContourMetrics(src, srcStride, width, height, (uint16_t *)dst, dstStride / sizeof(int16_t)); - } - - void ContourMetricsMasked(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * mask, size_t maskStride, uint8_t indexMin, uint16_t * dst, size_t dstStride) - { - assert(width > 1); - - const uint8_t *src0, *src1, *src2; - - for (size_t row = 0; row < height; ++row) - { - src0 = src + srcStride*(row - 1); - src1 = src0 + srcStride; - src2 = src1 + srcStride; - if (row == 0) - src0 = src1; - if (row == height - 1) - src2 = src1; - - dst[0] = mask[0] < indexMin ? 0 : ContourMetrics(src0, src1, src2, 0, 0, 1); - - for (size_t col = 1; col < width - 1; ++col) - dst[col] = mask[col] < indexMin ? 0 : ContourMetrics(src0, src1, src2, col - 1, col, col + 1); - - dst[width - 1] = mask[width - 1] < indexMin ? 
0 : ContourMetrics(src0, src1, src2, width - 2, width - 1, width - 1); - - dst += dstStride; - mask += maskStride; - } - } - - void ContourMetricsMasked(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * mask, size_t maskStride, uint8_t indexMin, uint8_t * dst, size_t dstStride) - { - assert(dstStride % sizeof(int16_t) == 0); - - ContourMetricsMasked(src, srcStride, width, height, mask, maskStride, indexMin, (uint16_t *)dst, dstStride / sizeof(int16_t)); - } - - SIMD_INLINE uint8_t Anchor(const uint16_t * src, ptrdiff_t stride, int16_t threshold) - { - uint16_t s = src[0]; - uint16_t a = s / 2; - if (s & 1) - return ((a > 0) && (a - src[+1] / 2 >= threshold) && (a - src[-1] / 2 >= threshold)) ? 255 : 0; - else - return ((a > 0) && (a - src[+stride] / 2 >= threshold) && (a - src[-stride] / 2 >= threshold)) ? 255 : 0; - } - - void ContourAnchors(const uint16_t * src, size_t srcStride, size_t width, size_t height, - size_t step, int16_t threshold, uint8_t * dst, size_t dstStride) - { - memset(dst, 0, width); - memset(dst + dstStride*(height - 1), 0, width); - dst += dstStride; - src += srcStride; - for (size_t row = 1; row < height - 1; row += step) - { - dst[0] = 0; - for (size_t col = 1; col < width - 1; ++col) - dst[col] = Anchor(src + col, srcStride, threshold); - dst[width - 1] = 0; - dst += step*dstStride; - src += step*srcStride; - } - } - - void ContourAnchors(const uint8_t * src, size_t srcStride, size_t width, size_t height, - size_t step, int16_t threshold, uint8_t * dst, size_t dstStride) - { - assert(srcStride % sizeof(int16_t) == 0); - - ContourAnchors((const uint16_t *)src, srcStride / sizeof(int16_t), width, height, step, threshold, dst, dstStride); - } - } -} diff --git a/src/3rd/Simd/Simd/SimdBaseSquaredDifferenceSum.cpp b/src/3rd/Simd/Simd/SimdBaseSquaredDifferenceSum.cpp deleted file mode 100644 index 74805a3c..00000000 --- a/src/3rd/Simd/Simd/SimdBaseSquaredDifferenceSum.cpp +++ /dev/null @@ -1,133 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdMath.h" -#include "Simd/SimdMemory.h" - -namespace Simd -{ - namespace Base - { - void SquaredDifferenceSum(const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride, - size_t width, size_t height, uint64_t * sum) - { - assert(width < 0x10000); - - *sum = 0; - for (size_t row = 0; row < height; ++row) - { - int rowSum = 0; - for (size_t col = 0; col < width; ++col) - { - rowSum += SquaredDifference(a[col], b[col]); - } - *sum += rowSum; - a += aStride; - b += bStride; - } - } - - void SquaredDifferenceSumMasked(const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride, - const uint8_t *mask, size_t maskStride, uint8_t index, size_t width, size_t height, uint64_t * sum) - { - assert(width < 0x10000); - - *sum = 0; - for (size_t row = 0; row < height; ++row) - { - int rowSum = 0; - for (size_t col = 0; col < width; ++col) - { - if (mask[col] == index) - rowSum += SquaredDifference(a[col], b[col]); - } - *sum += rowSum; - a += aStride; - b += bStride; - mask += maskStride; - } - } - - void SquaredDifferenceSum32f(const float * a, const float * b, size_t size, float * sum) - { - size_t alignedSize = Simd::AlignLo(size, 4); - float sums[4] = { 0, 0, 0, 0 }; - size_t i = 0; - for (; i < alignedSize; i += 4) - { - sums[0] += Simd::Square(a[i + 0] - b[i + 0]); - sums[1] += Simd::Square(a[i + 1] - b[i + 1]); - sums[2] += Simd::Square(a[i + 2] - b[i + 2]); - sums[3] += Simd::Square(a[i + 3] - b[i + 3]); - } - for (; i < size; ++i) - sums[0] += Simd::Square(a[i] - b[i]); - *sum = sums[0] + sums[1] + sums[2] + sums[3]; - } - - SIMD_INLINE void KahanSum(float value, float & sum, float & correction) - { - float term = value - correction; - float temp = sum + term; - correction = (temp - sum) - term; - sum = temp; - } - -#if defined(__GNUC__) && (defined(SIMD_X86_ENABLE) || defined(SIMD_X64_ENABLE)) -#ifdef __clang__ -#pragma clang optimize off -#else -#pragma GCC push_options -#pragma GCC optimize ("O1") -#endif -#elif defined(_MSC_VER) && (_MSC_VER >= 1914) -#pragma optimize ("", off) -#endif - void SquaredDifferenceKahanSum32f(const float * a, const float * b, size_t size, float * sum) - { - size_t alignedSize = Simd::AlignLo(size, 4); - float sums[4] = { 0, 0, 0, 0 }; - float corrections[4] = { 0, 0, 0, 0 }; - size_t i = 0; - for (; i < alignedSize; i += 4) - { - KahanSum(Simd::Square(a[i + 0] - b[i + 0]), sums[0], corrections[0]); - KahanSum(Simd::Square(a[i + 1] - b[i + 1]), sums[1], corrections[1]); - KahanSum(Simd::Square(a[i + 2] - b[i + 2]), sums[2], corrections[2]); - KahanSum(Simd::Square(a[i + 3] - b[i + 3]), sums[3], corrections[3]); - } - for (; i < size; ++i) - KahanSum(Simd::Square(a[i + 0] - b[i + 0]), sums[0], corrections[0]); - *sum = sums[0] + sums[1] + sums[2] + sums[3]; - } -#if defined(__GNUC__) && (defined(SIMD_X86_ENABLE) || defined(SIMD_X64_ENABLE)) -#ifdef __clang__ -#pragma clang optimize on -#else -#pragma GCC pop_options -#endif -#elif defined(_MSC_VER) && (_MSC_VER >= 1920) -#pragma optimize ("", on) -#endif - } -} diff --git a/src/3rd/Simd/Simd/SimdBaseStatistic.cpp b/src/3rd/Simd/Simd/SimdBaseStatistic.cpp deleted file mode 100644 index 9d03ee53..00000000 --- a/src/3rd/Simd/Simd/SimdBaseStatistic.cpp +++ /dev/null @@ -1,177 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2019 Yermalayeu Ihar. 
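A note on the file above: SquaredDifferenceKahanSum32f is fenced with pragmas that lower optimization because a compiler allowed to reassociate floating-point arithmetic can fold the compensation term ((temp - sum) - term) to zero, silently turning Kahan summation back into plain summation. (The MSVC fences also test different versions, >= 1914 to disable and >= 1920 to re-enable, which looks unintentional: a 1914-1919 compiler would leave optimization off.) A self-contained sketch of the compensated accumulation, with illustrative names:

#include <cstddef>

// Compensated (Kahan) accumulation: 'correction' carries the low-order
// bits that plain summation would round away.
struct KahanAccumulator
{
    float sum = 0.0f;
    float correction = 0.0f;

    void Add(float value)
    {
        float term = value - correction;  // re-inject the previously lost low bits
        float temp = sum + term;          // high-order bits land in 'temp'
        correction = (temp - sum) - term; // recover what this addition rounded away
        sum = temp;
    }
};

// Reference equivalent of SquaredDifferenceKahanSum32f (without unrolling).
float SquaredDifferenceKahanRef(const float* a, const float* b, size_t size)
{
    KahanAccumulator acc;
    for (size_t i = 0; i < size; ++i)
    {
        float d = a[i] - b[i];
        acc.Add(d * d);
    }
    return acc.sum;
}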
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMath.h" - -namespace Simd -{ - namespace Base - { - void GetStatistic(const uint8_t * src, size_t stride, size_t width, size_t height, - uint8_t * min, uint8_t * max, uint8_t * average) - { - assert(width*height); - - uint64_t sum = 0; - int min_ = UCHAR_MAX; - int max_ = 0; - for (size_t row = 0; row < height; ++row) - { - int rowSum = 0; - for (size_t col = 0; col < width; ++col) - { - int value = src[col]; - max_ = MaxU8(value, max_); - min_ = MinU8(value, min_); - rowSum += value; - } - sum += rowSum; - src += stride; - } - *average = (uint8_t)((sum + width*height / 2) / (width*height)); - *min = min_; - *max = max_; - } - - void GetRowSums(const uint8_t * src, size_t stride, size_t width, size_t height, uint32_t * sums) - { - for (size_t row = 0; row < height; ++row) - { - uint32_t sum = 0; - for (size_t col = 0; col < width; ++col) - sum += src[col]; - sums[row] = sum; - src += stride; - } - } - - void GetColSums(const uint8_t * src, size_t stride, size_t width, size_t height, uint32_t * sums) - { - memset(sums, 0, sizeof(uint32_t)*width); - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < width; ++col) - sums[col] += src[col]; - src += stride; - } - } - - void GetAbsDyRowSums(const uint8_t * src, size_t stride, size_t width, size_t height, uint32_t * sums) - { - const uint8_t * src0 = src; - const uint8_t * src1 = src + stride; - height--; - sums[height] = 0; - for (size_t row = 0; row < height; ++row) - { - uint32_t sum = 0; - for (size_t col = 0; col < width; ++col) - sum += AbsDifferenceU8(src0[col], src1[col]); - sums[row] = sum; - src0 += stride; - src1 += stride; - } - } - - void GetAbsDxColSums(const uint8_t * src, size_t stride, size_t width, size_t height, uint32_t * sums) - { - const uint8_t * src0 = src; - const uint8_t * src1 = src + 1; - memset(sums, 0, sizeof(uint32_t)*width); - width--; - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < width; ++col) - sums[col] += AbsDifferenceU8(src0[col], src1[col]); - src0 += stride; - src1 += stride; - } - } - - void ValueSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum) - { - *sum = 0; - for (size_t row = 0; row < height; ++row) - { - int rowSum = 0; - for (size_t col = 0; col < width; ++col) - rowSum += src[col]; - *sum += rowSum; - src += stride; - } - } - - void SquareSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum) - { - 
assert(width < 0x10000); - - *sum = 0; - for (size_t row = 0; row < height; ++row) - { - int rowSum = 0; - for (size_t col = 0; col < width; ++col) - rowSum += Square(src[col]); - *sum += rowSum; - src += stride; - } - } - - void ValueSquareSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * valueSum, uint64_t * squareSum) - { - assert(width < 0x10000); - - *valueSum = 0; - *squareSum = 0; - for (size_t row = 0; row < height; ++row) - { - int rowValueSum = 0; - int rowSquareSum = 0; - for (size_t col = 0; col < width; ++col) - { - int value = src[col]; - rowValueSum += value; - rowSquareSum += Square(value); - } - *valueSum += rowValueSum; - *squareSum += rowSquareSum; - src += stride; - } - } - - void CorrelationSum(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, size_t width, size_t height, uint64_t * sum) - { - assert(width < 0x10000); - - *sum = 0; - for (size_t row = 0; row < height; ++row) - { - int rowSum = 0; - for (size_t col = 0; col < width; ++col) - rowSum += a[col] * b[col]; - *sum += rowSum; - a += aStride; - b += bStride; - } - } - } -} diff --git a/src/3rd/Simd/Simd/SimdBaseStatisticMoments.cpp b/src/3rd/Simd/Simd/SimdBaseStatisticMoments.cpp deleted file mode 100644 index 673d1099..00000000 --- a/src/3rd/Simd/Simd/SimdBaseStatisticMoments.cpp +++ /dev/null @@ -1,108 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2019 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
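For context: ValueSquareSum above produces S1 = sum(x) and S2 = sum(x^2) in a single pass, from which the mean and the population variance follow as E[x] and E[x^2] - E[x]^2. The assert(width < 0x10000) in these kernels bounds the per-row partials before they are flushed into the 64-bit totals; note, though, that for SquareSum an all-255 row at the maximal width gives 65535 * 255^2 (about 4.26e9), which fits 32-bit unsigned arithmetic but would overflow the declared signed int accumulator. A sketch of the follow-up computation, with illustrative names:

#include <cstddef>
#include <cstdint>

// One-pass mean/variance from the two sums produced by ValueSquareSum:
//   mean = S1 / n,  variance = S2 / n - mean^2  (population variance).
void MeanVariance(uint64_t valueSum, uint64_t squareSum, size_t count,
                  double* mean, double* variance)
{
    double n = (double)count;
    double m = (double)valueSum / n;
    *mean = m;
    *variance = (double)squareSum / n - m * m;
}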
-*/ -#include "Simd/SimdMemory.h" - -namespace Simd -{ - namespace Base - { - SIMD_INLINE void GetObjectMoments(uint32_t src, uint32_t col, uint32_t & n, uint32_t & s, uint32_t& sx, uint32_t& sxx) - { - n += 1; - s += src; - sx += src * col; - sxx += src * col * col; - } - - void GetObjectMoments(const uint8_t* src, size_t srcStride, size_t width, size_t height, const uint8_t* mask, size_t maskStride, uint8_t index, - uint64_t* n, uint64_t* s, uint64_t* sx, uint64_t* sy, uint64_t* sxx, uint64_t* sxy, uint64_t* syy) - { - assert(src || mask); - - *n = 0; - *s = 0; - *sx = 0; - *sy = 0; - *sxx = 0; - *sxy = 0; - *syy = 0; - - const size_t B = 181; - - for (size_t row = 0; row < height; ++row) - { - for (size_t colB = 0; colB < width;) - { - uint32_t colE = (uint32_t)Simd::Min(colB + B, width); - uint32_t _n = 0; - uint32_t _s = 0; - uint32_t _sx = 0; - uint32_t _sxx = 0; - if (mask == NULL) - { - for (uint32_t col = (uint32_t)colB; col < colE; ++col) - GetObjectMoments(src[col], col - (uint32_t)colB, _n, _s, _sx, _sxx); - } - else if (src == NULL) - { - for (uint32_t col = (uint32_t)colB; col < colE; ++col) - if(mask[col] == index) - GetObjectMoments(1, col - (uint32_t)colB, _n, _s, _sx, _sxx); - } - else - { - for (uint32_t col = (uint32_t)colB; col < colE; ++col) - if (mask[col] == index) - GetObjectMoments(src[col], col - (uint32_t)colB, _n, _s, _sx, _sxx); - } - uint64_t _y = row; - uint64_t _x = colB; - - *n += _n; - *s += _s; - - *sx += _sx + _s * _x; - *sy += _s * _y; - - *sxx += _sxx + _sx * _x * 2 + _s * _x * _x; - *sxy += _sx * _y + _s * _x * _y; - *syy += _s * _y * _y; - - colB = colE; - } - if (src) - src += srcStride; - if (mask) - mask += maskStride; - } - } - - void GetMoments(const uint8_t* mask, size_t stride, size_t width, size_t height, uint8_t index, - uint64_t* area, uint64_t* x, uint64_t* y, uint64_t* xx, uint64_t* xy, uint64_t* yy) - { - uint64_t stub; - GetObjectMoments(NULL, 0, width, height, mask, stride, index, &stub, area, x, y, xx, xy, yy); - } - } -} diff --git a/src/3rd/Simd/Simd/SimdBaseStretchGray2x2.cpp b/src/3rd/Simd/Simd/SimdBaseStretchGray2x2.cpp deleted file mode 100644 index 6d3e07df..00000000 --- a/src/3rd/Simd/Simd/SimdBaseStretchGray2x2.cpp +++ /dev/null @@ -1,52 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
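For context: GetObjectMoments above accumulates n, s, sx, sxx in 32-bit registers over column blocks of B = 181 (apparently chosen so the worst-case partial 255 * sum_{x<181} x^2, about 5.0e8, stays comfortably inside uint32), then shifts each block's raw moments to the global origin before adding them to the 64-bit totals. The shift relies on sum(v*(x0+x)) = x0*sum(v) + sum(v*x) and sum(v*(x0+x)^2) = x0^2*sum(v) + 2*x0*sum(v*x) + sum(v*x^2). A small self-check of those identities (all names illustrative):

#include <cassert>
#include <cstdint>

int main()
{
    const uint32_t v[6] = { 3, 0, 7, 1, 0, 5 }; // pixel values of one block row
    const uint64_t x0 = 4;                      // global column of the block start

    // Block-local raw moments (what the inner loop accumulates in 32 bits).
    uint32_t s = 0, sx = 0, sxx = 0;
    for (uint32_t x = 0; x < 6; ++x)
    {
        s += v[x];
        sx += v[x] * x;
        sxx += v[x] * x * x;
    }

    // Global moments via the origin-shift identities used by the deleted code.
    uint64_t gsx = sx + (uint64_t)s * x0;
    uint64_t gsxx = sxx + 2 * x0 * (uint64_t)sx + x0 * x0 * (uint64_t)s;

    // Direct recomputation at global coordinates must agree.
    uint64_t rsx = 0, rsxx = 0;
    for (uint64_t x = 0; x < 6; ++x)
    {
        rsx += v[x] * (x0 + x);
        rsxx += v[x] * (x0 + x) * (x0 + x);
    }
    assert(gsx == rsx && gsxx == rsxx);
    return 0;
}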
-*/ -#include "Simd/SimdDefs.h" - -namespace Simd -{ - namespace Base - { - void StretchGray2x2(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride) - { - assert(srcWidth * 2 == dstWidth && srcHeight * 2 == dstHeight); - - for (size_t row = 0; row < srcHeight; ++row) - { - uint8_t * dstEven = dst; - uint8_t * dstOdd = dst + dstStride; - for (size_t srcCol = 0; srcCol < srcWidth; srcCol += 1, dstEven += 2, dstOdd += 2) - { - uint8_t value = src[srcCol]; - dstEven[0] = value; - dstEven[1] = value; - dstOdd[0] = value; - dstOdd[1] = value; - } - src += srcStride; - dst += 2 * dstStride; - } - } - } -} diff --git a/src/3rd/Simd/Simd/SimdBaseSvm.cpp b/src/3rd/Simd/Simd/SimdBaseSvm.cpp deleted file mode 100644 index 4467024a..00000000 --- a/src/3rd/Simd/Simd/SimdBaseSvm.cpp +++ /dev/null @@ -1,68 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" - -namespace Simd -{ - namespace Base - { - namespace - { - struct Buffer - { - Buffer(size_t count) - { - size_t size = sizeof(float)*count; - _p = Allocate(size); - memset(_p, 0, size); - sums = (float*)_p; - } - - ~Buffer() - { - Free(_p); - } - - float * sums; - private: - void *_p; - }; - } - - void SvmSumLinear(const float * x, const float * svs, const float * weights, size_t length, size_t count, float * sum) - { - Buffer buffer(count); - for (size_t j = 0; j < length; ++j) - { - float v = x[j]; - for (size_t i = 0; i < count; ++i) - buffer.sums[i] += v*svs[i]; - svs += count; - } - *sum = 0; - for (size_t i = 0; i < count; ++i) - *sum += buffer.sums[i] * weights[i]; - } - } -} diff --git a/src/3rd/Simd/Simd/SimdBaseSynet.cpp b/src/3rd/Simd/Simd/SimdBaseSynet.cpp deleted file mode 100644 index b9d06945..00000000 --- a/src/3rd/Simd/Simd/SimdBaseSynet.cpp +++ /dev/null @@ -1,670 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. 
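For context: SvmSumLinear above computes sum_i weights[i] * dot(x, sv_i), but iterates feature-major so the support-vector matrix (stored with `count` values per feature row, i.e. svs[j * count + i] is feature j of support vector i) is read sequentially; the temporary buffer holds one running dot product per support vector. An equivalent naive reference, with illustrative names:

#include <cstddef>
#include <vector>

// Reference for SvmSumLinear: sum = SUM_i weights[i] * dot(x, sv_i).
float SvmSumLinearRef(const float* x, const float* svs, const float* weights,
                      size_t length, size_t count)
{
    std::vector<float> dots(count, 0.0f); // one partial dot product per SV
    for (size_t j = 0; j < length; ++j)   // feature-major: svs is read sequentially
        for (size_t i = 0; i < count; ++i)
            dots[i] += x[j] * svs[j * count + i];

    float sum = 0.0f;
    for (size_t i = 0; i < count; ++i)
        sum += dots[i] * weights[i];
    return sum;
}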
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdArray.h" -#include "Simd/SimdPow.h" -#include "Simd/SimdSynet.h" -#include "Simd/SimdEnable.h" -#include "Simd/SimdExp.h" - -namespace Simd -{ - namespace Base - { - SimdTensorFormatType SynetSpecifyTensorFormat(SimdTensorFormatType format) - { - if (format == SimdTensorFormatNchwXc) - { - switch (Simd::ALIGNMENT) - { - case 16: return SimdTensorFormatNchw4c; - case 32: return SimdTensorFormatNchw8c; - case 64: return SimdTensorFormatNchw16c; - } - } - if (format == SimdTensorFormatOyxiXo) - { - switch (Simd::ALIGNMENT) - { - case 16: return SimdTensorFormatOyxi4o; - case 32: return SimdTensorFormatOyxi8o; - case 64: return SimdTensorFormatOyxi16o; - } - } - return SimdTensorFormatUnknown; - } - - size_t SynetTensorAlignment(SimdTensorFormatType format) - { - switch (format) - { - case SimdTensorFormatNchw: return 1; - case SimdTensorFormatNhwc: return 1; - case SimdTensorFormatNchw4c: return 4; - case SimdTensorFormatNchw8c: return 8; - case SimdTensorFormatNchw16c: return 16; - case SimdTensorFormatOiyx: return 1; - case SimdTensorFormatYxio: return 1; - case SimdTensorFormatOyxi4o: return 4; - case SimdTensorFormatOyxi8o: return 8; - case SimdTensorFormatOyxi16o: return 16; - } - assert(0); - return 0; - } - - void SynetAddBiasNchw(const float * bias, size_t channels, size_t spatial, float * dst) - { - size_t aligned = Simd::AlignLo(spatial, 4); - for (size_t c = 0; c < channels; ++c) - { - float value = bias[c]; - size_t s = 0; - for (; s < aligned; s += 4) - { - dst[s + 0] += value; - dst[s + 1] += value; - dst[s + 2] += value; - dst[s + 3] += value; - } - for (; s < spatial; ++s) - dst[s] += value; - dst += spatial; - } - } - - void SynetAddBiasNhwc(const float * bias, size_t channels, size_t spatial, float * dst) - { - size_t aligned = Simd::AlignLo(channels, 4); - for (size_t s = 0; s < spatial; ++s) - { - size_t c = 0; - for (; c < aligned; c += 4) - { - dst[c + 0] += bias[c + 0]; - dst[c + 1] += bias[c + 1]; - dst[c + 2] += bias[c + 2]; - dst[c + 3] += bias[c + 3]; - } - for (; c < channels; ++c) - dst[c] += bias[c]; - dst += channels; - } - } - - template void SynetAddBiasNchwXc(const float * bias, size_t channels, size_t spatial, float * dst) - { - for (size_t c = 0; c < channels; c += N) - { - for (size_t s = 0; s < spatial; ++s) - { - for (size_t i = 0; i < N; ++i) - dst[i] += bias[i]; - dst += N; - } - bias += N; - } - } - - void SynetAddBias(const float * bias, size_t channels, size_t spatial, 
float * dst, SimdTensorFormatType format) - { - if (Base::NchwCompatible(channels, spatial, format)) - SynetAddBiasNchw(bias, channels, spatial, dst); - else if (Base::NhwcCompatible(channels, spatial, format)) - SynetAddBiasNhwc(bias, channels, spatial, dst); - else if(format == SimdTensorFormatNchw4c) - SynetAddBiasNchwXc<4>(bias, channels, spatial, dst); - else if (format == SimdTensorFormatNchw8c) - SynetAddBiasNchwXc<8>(bias, channels, spatial, dst); - else if (format == SimdTensorFormatNchw16c) - SynetAddBiasNchwXc<16>(bias, channels, spatial, dst); - else - assert(0); - } - - //--------------------------------------------------------------------- - - template void SynetEltwiseLayerForward(float const * const * src, size_t count, size_t size, float * dst) - { - size_t aligned = Simd::AlignLo(size, 4); - const float * src0 = src[0]; - const float * src1 = src[1]; - size_t j = 0; - for (; j < aligned; j += 4) - { - dst[j + 0] = SynetEltwiseLayerForward(src0[j + 0], src1[j + 0]); - dst[j + 1] = SynetEltwiseLayerForward(src0[j + 1], src1[j + 1]); - dst[j + 2] = SynetEltwiseLayerForward(src0[j + 2], src1[j + 2]); - dst[j + 3] = SynetEltwiseLayerForward(src0[j + 3], src1[j + 3]); - } - for (; j < size; ++j) - dst[j] = SynetEltwiseLayerForward(src0[j], src1[j]); - for (size_t i = 2; i < count; ++i) - { - const float * srci = src[i]; - for (j = 0; j < aligned; j += 4) - { - dst[j + 0] = SynetEltwiseLayerForward(dst[j + 0], srci[j + 0]); - dst[j + 1] = SynetEltwiseLayerForward(dst[j + 1], srci[j + 1]); - dst[j + 2] = SynetEltwiseLayerForward(dst[j + 2], srci[j + 2]); - dst[j + 3] = SynetEltwiseLayerForward(dst[j + 3], srci[j + 3]); - } - for (; j < size; ++j) - dst[j] = SynetEltwiseLayerForward(dst[j], srci[j]); - } - } - - void SynetEltwiseLayerForwardSum(float const * const * src, const float * weight, size_t count, size_t size, float * dst) - { - size_t aligned = Simd::AlignLo(size, 4); - const float * src0 = src[0]; - const float * src1 = src[1]; - float weight0 = weight[0], weight1 = weight[1]; - size_t j = 0; - for (; j < aligned; j += 4) - { - dst[j + 0] = src0[j + 0] * weight0 + src1[j + 0] * weight1; - dst[j + 1] = src0[j + 1] * weight0 + src1[j + 1] * weight1; - dst[j + 2] = src0[j + 2] * weight0 + src1[j + 2] * weight1; - dst[j + 3] = src0[j + 3] * weight0 + src1[j + 3] * weight1; - } - for (; j < size; ++j) - dst[j] = src0[j] * weight0 + src1[j] * weight1; - for (size_t i = 2; i < count; ++i) - { - const float * srci = src[i]; - float weighti = weight[i]; - for (j = 0; j < aligned; j += 4) - { - dst[j + 0] += srci[j + 0] * weighti; - dst[j + 1] += srci[j + 1] * weighti; - dst[j + 2] += srci[j + 2] * weighti; - dst[j + 3] += srci[j + 3] * weighti; - } - for (; j < size; ++j) - dst[j] += srci[j] * weighti; - } - } - - void SynetEltwiseLayerForward(float const * const * src, const float * weight, size_t count, size_t size, SimdSynetEltwiseOperationType type, float * dst) - { - switch (type) - { - case SimdSynetEltwiseOperationProduct: - SynetEltwiseLayerForward(src, count, size, dst); - break; - case SimdSynetEltwiseOperationSum: - SynetEltwiseLayerForwardSum(src, weight, count, size, dst); - break; - case SimdSynetEltwiseOperationMax: - SynetEltwiseLayerForward(src, count, size, dst); - break; - case SimdSynetEltwiseOperationMin: - SynetEltwiseLayerForward(src, count, size, dst); - break; - default: - assert(0); - } - } - - //--------------------------------------------------------------------- - - void SynetInnerProductLayerForward(const float * src, const float * weight, const 
float * bias, size_t count, size_t size, float * dst) - { - size_t aligned = Simd::AlignLo(size, 4); - for (size_t i = 0; i < count; ++i) - { - size_t j = 0; - float sums[4] = { 0, 0, 0, 0 }; - for (; j < aligned; j += 4) - { - sums[0] += src[j + 0] * weight[j + 0]; - sums[1] += src[j + 1] * weight[j + 1]; - sums[2] += src[j + 2] * weight[j + 2]; - sums[3] += src[j + 3] * weight[j + 3]; - } - for (; j < size; ++j) - sums[0] += src[j] * weight[j]; - dst[i] = sums[0] + sums[1] + sums[2] + sums[3] + (bias ? bias[i] : 0); - weight += size; - } - } - - //--------------------------------------------------------------------- - - void SynetLrnLayerCrossChannelsNchw(const float * src, size_t half, size_t channels, size_t spatial, const float * k, float * dst) - { - float k0 = k[0], k1 = k[1], k2 = k[2]; - Array32f sum(spatial, true), zero(spatial, true); - for (size_t c = 0; c < half; ++c) - { - const float * pos = src + c * spatial; - for (size_t s = 0; s < spatial; ++s) - sum[s] += Simd::Square(pos[s]); - } - for (size_t c = 0; c < channels; ++c) - { - const float * pos = (c < channels - half) ? src + half * spatial : zero.data; - const float * neg = (c > half) ? src - (half + 1) * spatial : zero.data; - for (size_t s = 0; s < spatial; ++s) - { - sum[s] += Simd::Square(pos[s]); - sum[s] -= Simd::Square(neg[s]); - dst[s] = src[s] * Pow(k0 + k1 * sum[s], k2); - } - src += spatial; - dst += spatial; - } - } - - void SynetLrnLayerCrossChannelsNhwc(const float * src, size_t half, size_t channels, size_t spatial, const float * k, float * dst) - { - float k0 = k[0], k1 = k[1], k2 = k[2]; - size_t beg = half + 1; - size_t end = channels - half; - for (size_t s = 0; s < spatial; ++s) - { - float sum = 0; - for (size_t c = 0; c < half; ++c) - sum += Simd::Square(src[c]); - for (size_t c = 0; c < beg; ++c) - { - sum += Simd::Square(src[c + half]); - dst[c] = src[c] * Pow(k0 + k1 * sum, k2); - } - for (size_t c = beg; c < end; ++c) - { - sum += Simd::Square(src[c + half]); - sum -= Simd::Square(src[c - half - 1]); - dst[c] = src[c] * Pow(k0 + k1 * sum, k2); - } - for (size_t c = end; c < channels; ++c) - { - sum -= Simd::Square(src[c - half - 1]); - dst[c] = src[c] * Pow(k0 + k1 * sum, k2); - } - src += channels; - dst += channels; - } - } - - template void SynetLrnLayerCrossChannelsNchwXc(const float * src, size_t half, size_t channels, size_t spatial, const float * k, float * dst) - { - assert(0); - } - - void SynetLrnLayerCrossChannels(const float * src, size_t half, size_t channels, size_t spatial, const float * k, float * dst, SimdTensorFormatType format) - { - if (format == SimdTensorFormatNchw) - SynetLrnLayerCrossChannelsNchw(src, half, channels, spatial, k, dst); - else if (format == SimdTensorFormatNhwc) - SynetLrnLayerCrossChannelsNhwc(src, half, channels, spatial, k, dst); - else if (format == SimdTensorFormatNchw4c) - SynetLrnLayerCrossChannelsNchwXc<4>(src, half, channels, spatial, k, dst); - else if (format == SimdTensorFormatNchw8c) - SynetLrnLayerCrossChannelsNchwXc<8>(src, half, channels, spatial, k, dst); - else if (format == SimdTensorFormatNchw16c) - SynetLrnLayerCrossChannelsNchwXc<16>(src, half, channels, spatial, k, dst); - else - assert(0); - } - - //--------------------------------------------------------------------- - - void SynetScaleLayerForwardNchw(const float * src, const float * scale, const float * bias, size_t channels, size_t spatial, float * dst) - { - size_t aligned = Simd::AlignLo(spatial, 4); - if (bias) - { - for (size_t c = 0; c < channels; ++c) - { - float _scale = 
scale[c]; - float _bias = bias[c]; - size_t s = 0; - for (; s < aligned; s += 4) - { - dst[s + 0] = src[s + 0] * _scale + _bias; - dst[s + 1] = src[s + 1] * _scale + _bias; - dst[s + 2] = src[s + 2] * _scale + _bias; - dst[s + 3] = src[s + 3] * _scale + _bias; - } - for (; s < spatial; ++s) - dst[s] = src[s] * _scale + _bias; - src += spatial; - dst += spatial; - } - } - else - { - for (size_t c = 0; c < channels; ++c) - { - float _scale = scale[c]; - size_t s = 0; - for (; s < aligned; s += 4) - { - dst[s + 0] = src[s + 0] * _scale; - dst[s + 1] = src[s + 1] * _scale; - dst[s + 2] = src[s + 2] * _scale; - dst[s + 3] = src[s + 3] * _scale; - } - for (; s < spatial; ++s) - dst[s] = src[s] * _scale; - src += spatial; - dst += spatial; - } - } - } - - void SynetScaleLayerForwardNhwc(const float * src, const float * scale, const float * bias, size_t channels, size_t spatial, float * dst) - { - size_t aligned = Simd::AlignLo(channels, 4); - if (bias) - { - for (size_t s = 0; s < spatial; ++s) - { - size_t c = 0; - for (; c < aligned; c += 4) - { - dst[c + 0] = src[c + 0] * scale[c + 0] + bias[c + 0]; - dst[c + 1] = src[c + 1] * scale[c + 1] + bias[c + 1]; - dst[c + 2] = src[c + 2] * scale[c + 2] + bias[c + 2]; - dst[c + 3] = src[c + 3] * scale[c + 3] + bias[c + 3]; - } - for (; c < channels; ++c) - dst[c] = src[c] * scale[c] + bias[c]; - src += channels; - dst += channels; - - } - } - else - { - for (size_t s = 0; s < spatial; ++s) - { - size_t c = 0; - for (; c < aligned; c += 4) - { - dst[c + 0] = src[c + 0] * scale[c + 0]; - dst[c + 1] = src[c + 1] * scale[c + 1]; - dst[c + 2] = src[c + 2] * scale[c + 2]; - dst[c + 3] = src[c + 3] * scale[c + 3]; - } - for (; c < channels; ++c) - dst[c] = src[c] * scale[c]; - src += channels; - dst += channels; - } - } - } - - template void SynetScaleLayerForwardNchwXc(const float * src, const float * scale, const float * bias, size_t channels, size_t spatial, float * dst) - { - if (bias) - { - for (size_t c = 0; c < channels; c += N) - { - for (size_t s = 0; s < spatial; ++s) - { - for (size_t i = 0; i < N; ++i) - dst[i] = src[i]*scale[i] + bias[i]; - src += N; - dst += N; - } - scale += N; - bias += N; - } - } - else - { - for (size_t c = 0; c < channels; c += N) - { - for (size_t s = 0; s < spatial; ++s) - { - for (size_t i = 0; i < N; ++i) - dst[i] = src[i] * scale[i]; - src += N; - dst += N; - } - scale += N; - } - } - } - - void SynetScaleLayerForward(const float* src, const float* scale, const float* bias, size_t channels, size_t height, size_t width, float* dst, SimdTensorFormatType format, SimdSynetCompatibilityType compatibility) - { - size_t spatial = height * width; - if (Base::NchwCompatible(channels, spatial, format)) - SynetScaleLayerForwardNchw(src, scale, bias, channels, spatial, dst); - else if (Base::NhwcCompatible(channels, spatial, format)) - SynetScaleLayerForwardNhwc(src, scale, bias, channels, spatial, dst); - else if (format == SimdTensorFormatNchw4c) - SynetScaleLayerForwardNchwXc<4>(src, scale, bias, channels, spatial, dst); - else if (format == SimdTensorFormatNchw8c) - SynetScaleLayerForwardNchwXc<8>(src, scale, bias, channels, spatial, dst); - else if (format == SimdTensorFormatNchw16c) - SynetScaleLayerForwardNchwXc<16>(src, scale, bias, channels, spatial, dst); - else - assert(0); - } - - //--------------------------------------------------------------------- - - void SynetShuffleLayerForward(const float* src0, const float* src1, size_t channels0, size_t channels1, size_t spatial, float* dst0, float* dst1, SimdTensorFormatType 
format, int type) - { - size_t channels = (channels0 + channels1) / 2, size = sizeof(float) * spatial; - switch (type) - { - case 0: - if (format == SimdTensorFormatNchw) - { - size_t cd = 0; - for (size_t cs = 0; cs < channels0; cs += 2, cd += 1) - { - memcpy(dst0, src0 + 0 * spatial, size); - memcpy(dst1, src0 + 1 * spatial, size); - src0 += 2 * spatial; - dst0 += spatial; - dst1 += spatial; - } - for (size_t cs = 0; cs < channels1; cs += 2, cd += 1) - { - memcpy(dst0, src1 + 0 * spatial, size); - memcpy(dst1, src1 + 1 * spatial, size); - src1 += 2 * spatial; - dst0 += spatial; - dst1 += spatial; - } - } - else if (format == SimdTensorFormatNhwc) - { - for (size_t s = 0; s < spatial; ++s) - { - size_t cd = 0; - for (size_t cs = 0; cs < channels0; cs += 2, cd += 1) - { - dst0[cd] = src0[cs + 0]; - dst1[cd] = src0[cs + 1]; - } - for (size_t cs = 0; cs < channels1; cs += 2, cd += 1) - { - dst0[cd] = src1[cs + 0]; - dst1[cd] = src1[cs + 1]; - } - src0 += channels0; - src1 += channels1; - dst0 += channels; - dst1 += channels; - } - } - else - assert(0); - break; - case 1: - if (format == SimdTensorFormatNchw) - { - size_t cs = 0; - for (size_t cd = 0; cd < channels0; cs += 1, cd += 2) - { - memcpy(dst0 + 0 * spatial, src0, size); - memcpy(dst0 + 1 * spatial, src1, size); - src0 += spatial; - src1 += spatial; - dst0 += 2 * spatial; - } - for (size_t cd = 0; cd < channels1; cs += 1, cd += 2) - { - memcpy(dst1 + 0 * spatial, src0, size); - memcpy(dst1 + 1 * spatial, src1, size); - src0 += spatial; - src1 += spatial; - dst1 += 2 * spatial; - } - } - else if (format == SimdTensorFormatNhwc) - { - for (size_t s = 0; s < spatial; ++s) - { - size_t cs = 0; - for (size_t cd = 0; cd < channels0; cd += 2, cs += 1) - { - dst0[cd + 0] = src0[cs]; - dst0[cd + 1] = src1[cs]; - } - for (size_t cd = 0; cd < channels1; cd += 2, cs += 1) - { - dst1[cd + 0] = src0[cs]; - dst1[cd + 1] = src1[cs]; - } - src0 += channels; - src1 += channels; - dst0 += channels0; - dst1 += channels1; - } - } - else - assert(0); - break; - default: - assert(0); - } - - } - - //--------------------------------------------------------------------- - - void SynetSoftmaxLayerForward(const float * src, size_t outer, size_t count, size_t inner, float * dst) - { - if (inner == 1 && count == 2) - { - for (size_t o = 0; o < outer; ++o) - { - float max = Simd::Max(src[0], src[1]); - float exp0 = ::exp(src[0] - max); - float exp1 = ::exp(src[1] - max); - float sum = exp0 + exp1; - dst[0] = exp0 / sum; - dst[1] = exp1 / sum; - src += 2; - dst += 2; - } - } - else - { - Array32f tmp(inner * 2); - const float * s; - float * max = tmp.data, *sum = tmp.data + inner, *d; - for (size_t o = 0; o < outer; ++o) - { - for (size_t i = 0; i < inner; ++i) - max[i] = src[i]; - s = src + inner; - for (size_t c = 1; c < count; ++c) - { - for (size_t i = 0; i < inner; ++i) - max[i] = Simd::Max(max[i], s[i]); - s += inner; - } - - s = src; - d = dst; - for (size_t i = 0; i < inner; ++i) - sum[i] = 0; - for (size_t c = 0; c < count; ++c) - { - for (size_t i = 0; i < inner; ++i) - { - d[i] = ::exp(s[i] - max[i]); - sum[i] += d[i]; - } - s += inner; - d += inner; - } - - d = dst; - for (size_t c = 0; c < count; ++c) - { - for (size_t i = 0; i < inner; ++i) - d[i] /= sum[i]; - d += inner; - } - src += count * inner; - dst += count * inner; - } - } - } - - //--------------------------------------------------------------------- - - template void SynetUnaryOperation32fLayerForward(const float* src, size_t size, float* dst) - { - size_t size4 = AlignLo(size, 4); - size_t 
i = 0;
-            for (; i < size4; i += 4)
-            {
-                dst[i + 0] = SynetUnaryOperation32f<type>(src[i + 0]);
-                dst[i + 1] = SynetUnaryOperation32f<type>(src[i + 1]);
-                dst[i + 2] = SynetUnaryOperation32f<type>(src[i + 2]);
-                dst[i + 3] = SynetUnaryOperation32f<type>(src[i + 3]);
-            }
-            for (; i < size; ++i)
-                dst[i] = SynetUnaryOperation32f<type>(src[i]);
-        }
-
-        void SynetUnaryOperation32fLayerForward(const float * src, size_t size, SimdSynetUnaryOperation32fType type, float * dst)
-        {
-            switch (type)
-            {
-            case SimdSynetUnaryOperation32fAbs: SynetUnaryOperation32fLayerForward<SimdSynetUnaryOperation32fAbs>(src, size, dst); break;
-            case SimdSynetUnaryOperation32fExp: SynetUnaryOperation32fLayerForward<SimdSynetUnaryOperation32fExp>(src, size, dst); break;
-            case SimdSynetUnaryOperation32fLog: SynetUnaryOperation32fLayerForward<SimdSynetUnaryOperation32fLog>(src, size, dst); break;
-            case SimdSynetUnaryOperation32fNeg: SynetUnaryOperation32fLayerForward<SimdSynetUnaryOperation32fNeg>(src, size, dst); break;
-            case SimdSynetUnaryOperation32fRsqrt: SynetUnaryOperation32fLayerForward<SimdSynetUnaryOperation32fRsqrt>(src, size, dst); break;
-            case SimdSynetUnaryOperation32fSqrt: SynetUnaryOperation32fLayerForward<SimdSynetUnaryOperation32fSqrt>(src, size, dst); break;
-            case SimdSynetUnaryOperation32fTanh: SynetUnaryOperation32fLayerForward<SimdSynetUnaryOperation32fTanh>(src, size, dst); break;
-            case SimdSynetUnaryOperation32fZero: SynetUnaryOperation32fLayerForward<SimdSynetUnaryOperation32fZero>(src, size, dst); break;
-            default:
-                assert(0);
-            }
-        }
-    }
-}
diff --git a/src/3rd/Simd/Simd/SimdBaseSynetActivation.cpp b/src/3rd/Simd/Simd/SimdBaseSynetActivation.cpp
deleted file mode 100644
index d009c82b..00000000
--- a/src/3rd/Simd/Simd/SimdBaseSynetActivation.cpp
+++ /dev/null
@@ -1,234 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
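For context: SynetSoftmaxLayerForward above subtracts the per-position maximum before exponentiating; the common factor exp(-max) cancels during normalization, so the result is unchanged while exp can no longer overflow. A minimal sketch for the inner == 1 case, with illustrative names:

#include <cmath>
#include <cstddef>

// Numerically stable softmax over 'count' values, matching the
// max-subtract / exponentiate / normalize structure of the deleted code.
void SoftmaxRef(const float* src, size_t count, float* dst)
{
    float max = src[0];
    for (size_t c = 1; c < count; ++c)
        max = src[c] > max ? src[c] : max;

    float sum = 0.0f;
    for (size_t c = 0; c < count; ++c)
    {
        dst[c] = std::exp(src[c] - max); // argument is <= 0, so exp cannot overflow
        sum += dst[c];
    }
    for (size_t c = 0; c < count; ++c)
        dst[c] /= sum;
}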
-*/ -#include "Simd/SimdArray.h" -#include "Simd/SimdExp.h" -#include "Simd/SimdSynet.h" - -namespace Simd -{ - namespace Base - { - void SynetElu32f(const float * src, size_t size, const float * alpha, float * dst) - { - float _alpha = alpha[0]; - size_t size4 = Simd::AlignLo(size, 4); - size_t i = 0; - for (; i < size4; i += 4) - { - dst[i + 0] = SynetElu32f(src[i + 0], _alpha); - dst[i + 1] = SynetElu32f(src[i + 1], _alpha); - dst[i + 2] = SynetElu32f(src[i + 2], _alpha); - dst[i + 3] = SynetElu32f(src[i + 3], _alpha); - } - for (; i < size; ++i) - dst[i] = SynetElu32f(src[i], _alpha); - } - - //--------------------------------------------------------------------- - - void SynetHswish32f(const float * src, size_t size, const float * shift, const float * scale, float * dst) - { - float _shift = shift[0]; - float _scale = scale[0]; - size_t size4 = Simd::AlignLo(size, 4); - size_t i = 0; - for (; i < size4; i += 4) - { - dst[i + 0] = SynetHswish32f(src[i + 0], _shift, _scale); - dst[i + 1] = SynetHswish32f(src[i + 1], _shift, _scale); - dst[i + 2] = SynetHswish32f(src[i + 2], _shift, _scale); - dst[i + 3] = SynetHswish32f(src[i + 3], _shift, _scale); - } - for (; i < size; ++i) - dst[i] = SynetHswish32f(src[i], _shift, _scale); - } - - //--------------------------------------------------------------------- - - void SynetPreluLayerForwardNchw(const float* src, const float* slope, size_t channels, size_t spatial, float* dst) - { - size_t aligned = Simd::AlignLo(spatial, 4); - for (size_t c = 0; c < channels; ++c) - { - float _slope = slope[c]; - size_t s = 0; - for (; s < aligned; s += 4) - { - dst[s + 0] = SynetRelu32f(src[s + 0], _slope); - dst[s + 1] = SynetRelu32f(src[s + 1], _slope); - dst[s + 2] = SynetRelu32f(src[s + 2], _slope); - dst[s + 3] = SynetRelu32f(src[s + 3], _slope); - } - for (; s < spatial; ++s) - dst[s] = SynetRelu32f(src[s], _slope); - src += spatial; - dst += spatial; - } - } - - void SynetPreluLayerForwardNhwc(const float* src, const float* slope, size_t channels, size_t spatial, float* dst) - { - size_t aligned = Simd::AlignLo(channels, 4); - for (size_t s = 0; s < spatial; ++s) - { - size_t c = 0; - for (; c < aligned; c += 4) - { - dst[c + 0] = SynetRelu32f(src[c + 0], slope[c + 0]); - dst[c + 1] = SynetRelu32f(src[c + 1], slope[c + 1]); - dst[c + 2] = SynetRelu32f(src[c + 2], slope[c + 2]); - dst[c + 3] = SynetRelu32f(src[c + 3], slope[c + 3]); - } - for (; c < channels; ++c) - dst[c] = SynetRelu32f(src[c], slope[c]); - src += channels; - dst += channels; - - } - } - - template void SynetPreluLayerForwardNchwXc(const float* src, const float* slope, size_t channels, size_t spatial, float* dst) - { - for (size_t c = 0; c < channels; c += N) - { - for (size_t s = 0; s < spatial; ++s) - { - for (size_t i = 0; i < N; ++i) - dst[i] = SynetRelu32f(src[i], slope[i]); - src += N; - dst += N; - } - slope += N; - } - } - - void SynetPreluLayerForward(const float* src, const float* slope, size_t channels, size_t spatial, float* dst, SimdTensorFormatType format) - { - if (Base::NchwCompatible(channels, spatial, format)) - SynetPreluLayerForwardNchw(src, slope, channels, spatial, dst); - else if (Base::NhwcCompatible(channels, spatial, format)) - SynetPreluLayerForwardNhwc(src, slope, channels, spatial, dst); - else if (format == SimdTensorFormatNchw4c) - SynetPreluLayerForwardNchwXc<4>(src, slope, channels, spatial, dst); - else if (format == SimdTensorFormatNchw8c) - SynetPreluLayerForwardNchwXc<8>(src, slope, channels, spatial, dst); - else if (format == 
SimdTensorFormatNchw16c) - SynetPreluLayerForwardNchwXc<16>(src, slope, channels, spatial, dst); - else - assert(0); - } - - //--------------------------------------------------------------------- - - void SynetRelu32f(const float* src, size_t size, const float* slope, float* dst) - { - float _slope = slope[0]; - size_t size4 = Simd::AlignLo(size, 4); - size_t i = 0; - for (; i < size4; i += 4) - { - dst[i + 0] = SynetRelu32f(src[i + 0], _slope); - dst[i + 1] = SynetRelu32f(src[i + 1], _slope); - dst[i + 2] = SynetRelu32f(src[i + 2], _slope); - dst[i + 3] = SynetRelu32f(src[i + 3], _slope); - } - for (; i < size; ++i) - dst[i] = SynetRelu32f(src[i], _slope); - } - - //--------------------------------------------------------------------- - - void SynetRestrictRange32f(const float * src, size_t size, const float * lower, const float * upper, float * dst) - { - float min = *lower; - float max = *upper; - size_t size4 = Simd::AlignLo(size, 4); - size_t i = 0; - for (; i < size4; i += 4) - { - dst[i + 0] = Simd::RestrictRange(src[i + 0], min, max); - dst[i + 1] = Simd::RestrictRange(src[i + 1], min, max); - dst[i + 2] = Simd::RestrictRange(src[i + 2], min, max); - dst[i + 3] = Simd::RestrictRange(src[i + 3], min, max); - } - for (; i < size; ++i) - dst[i] = Simd::RestrictRange(src[i], min, max); - } - - //--------------------------------------------------------------------- - - void SynetSigmoid32f(const float* src, size_t size, const float* slope, float* dst) - { - float _slope = slope[0]; - size_t size4 = Simd::AlignLo(size, 4); - size_t i = 0; - for (; i < size4; i += 4) - { - dst[i + 0] = SynetSigmoid32f(src[i + 0], _slope); - dst[i + 1] = SynetSigmoid32f(src[i + 1], _slope); - dst[i + 2] = SynetSigmoid32f(src[i + 2], _slope); - dst[i + 3] = SynetSigmoid32f(src[i + 3], _slope); - } - for (; i < size; ++i) - dst[i] = SynetSigmoid32f(src[i], _slope); - } - - //--------------------------------------------------------------------- - - void SynetSoftplus32f(const float* src, size_t size, const float * beta, const float * threshold, float* dst) - { - float _beta = beta[0]; - float _threshold = threshold[0]; - size_t size4 = Simd::AlignLo(size, 4); - size_t i = 0; - for (; i < size4; i += 4) - { - dst[i + 0] = SynetSoftplus32f(src[i + 0], _beta, _threshold); - dst[i + 1] = SynetSoftplus32f(src[i + 1], _beta, _threshold); - dst[i + 2] = SynetSoftplus32f(src[i + 2], _beta, _threshold); - dst[i + 3] = SynetSoftplus32f(src[i + 3], _beta, _threshold); - } - for (; i < size; ++i) - dst[i] = SynetSoftplus32f(src[i], _beta, _threshold); - } - - //--------------------------------------------------------------------- - - void SynetTanh32f(const float* src, size_t size, const float* slope, float* dst) - { - float _slope = slope[0]; - size_t size4 = Simd::AlignLo(size, 4); - size_t i = 0; - for (; i < size4; i += 4) - { - dst[i + 0] = SynetTanh32f(src[i + 0], _slope); - dst[i + 1] = SynetTanh32f(src[i + 1], _slope); - dst[i + 2] = SynetTanh32f(src[i + 2], _slope); - dst[i + 3] = SynetTanh32f(src[i + 3], _slope); - } - for (; i < size; ++i) - dst[i] = SynetTanh32f(src[i], _slope); - } - } -} diff --git a/src/3rd/Simd/Simd/SimdBaseSynetConversion.cpp b/src/3rd/Simd/Simd/SimdBaseSynetConversion.cpp deleted file mode 100644 index 94369f53..00000000 --- a/src/3rd/Simd/Simd/SimdBaseSynetConversion.cpp +++ /dev/null @@ -1,523 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. 
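For context: every activation deleted above follows the same skeleton: broadcast the scalar parameter, process AlignLo(size, 4) elements in a 4x-unrolled loop, then finish the tail element-wise. A sketch of that skeleton using leaky ReLU (max(x, 0) + slope * min(x, 0)) as the element operation; the names are illustrative:

#include <algorithm>
#include <cstddef>

static inline float LeakyRelu(float x, float slope)
{
    return std::max(x, 0.0f) + slope * std::min(x, 0.0f);
}

// The 4x-unrolled element-wise skeleton shared by the deleted activations:
// process the aligned body in groups of four, then the scalar tail.
void LeakyReluForward(const float* src, size_t size, float slope, float* dst)
{
    size_t size4 = size & ~(size_t)3; // equivalent of Simd::AlignLo(size, 4)
    size_t i = 0;
    for (; i < size4; i += 4)
    {
        dst[i + 0] = LeakyRelu(src[i + 0], slope);
        dst[i + 1] = LeakyRelu(src[i + 1], slope);
        dst[i + 2] = LeakyRelu(src[i + 2], slope);
        dst[i + 3] = LeakyRelu(src[i + 3], slope);
    }
    for (; i < size; ++i)
        dst[i] = LeakyRelu(src[i], slope);
}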
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdBase.h" -#include "Simd/SimdConversion.h" -#include "Simd/SimdSynet.h" - -namespace Simd -{ - namespace Base - { - void SynetConvert32fTo8u(const float* src, size_t batch, size_t channels, size_t height, size_t width, SimdTensorFormatType format, const float* scale, const float* shift, uint8_t* dst, SimdSynetCompatibilityType compatibility) - { - for (size_t b = 0; b < batch; ++b) - { - if (format == SimdTensorFormatNchw) - { - for (size_t c = 0; c < channels; ++c) - { - float _scale = scale[c]; - float _shift = shift[c]; - for (size_t h = 0; h < height; ++h) - { - for (size_t w = 0; w < width; ++w) - dst[w] = SynetConvert32fTo8u(src[w], _scale, _shift); - src += width; - dst += width; - } - } - } - else if (format == SimdTensorFormatNhwc) - { - for (size_t h = 0; h < height; ++h) - { - for (size_t w = 0; w < width; ++w) - { - for (size_t c = 0; c < channels; ++c) - dst[c] = SynetConvert32fTo8u(src[c], scale[c], shift[c]); - src += channels; - dst += channels; - } - } - } - else - assert(0); - } - } - - //--------------------------------------------------------------------- - - SIMD_INLINE float ToFloat(int value, float scale, float shift) - { - return value * scale + shift; - } - - template SIMD_INLINE int ToGray(const uint8_t* src); - - template<> SIMD_INLINE int ToGray(const uint8_t* src) - { - return src[0]; - } - - template<> SIMD_INLINE int ToGray(const uint8_t* src) - { - return BgrToGray(src[0], src[1], src[2]); - } - - template<> SIMD_INLINE int ToGray(const uint8_t* src) - { - return BgrToGray(src[2], src[1], src[0]); - } - - template void SynetSetInput1(const uint8_t* src, size_t width, size_t height, size_t stride, const float* scale, const float* shift, float* dst) - { - for (size_t y = 0; y < height; ++y) - { - for (size_t x = 0; x < width; ++x, src += step) - *dst++ = ToFloat(ToGray(src), scale[0], shift[0]); - src += (stride - width * step); - } - } - - template SIMD_INLINE int ToBgr(const uint8_t* src, size_t channel); - - template<> SIMD_INLINE int ToBgr(const uint8_t* src, size_t channel) - { - return src[0]; - } - - template<> SIMD_INLINE int ToBgr(const uint8_t* src, size_t channel) - { - return src[channel]; - } - - template<> SIMD_INLINE int ToBgr(const uint8_t* src, size_t channel) - { - return src[2 - channel]; - } - - template void SynetSetInputNchw3(const uint8_t* src, size_t width, size_t height, size_t stride, const float* scale, const float* shift, 
float* dst0) - { - float* dst1 = dst0 + width * height; - float* dst2 = dst1 + width * height; - for (size_t y = 0; y < height; ++y) - { - for (size_t x = 0; x < width; ++x, src += step) - { - *dst0++ = ToFloat(ToBgr(src, 0), scale[0], shift[0]); - *dst1++ = ToFloat(ToBgr(src, 1), scale[1], shift[1]); - *dst2++ = ToFloat(ToBgr(src, 2), scale[2], shift[2]); - } - src += (stride - width * step); - } - } - - template void SynetSetInputNhwc3(const uint8_t* src, size_t width, size_t height, size_t stride, const float* scale, const float* shift, float* dst) - { - for (size_t y = 0; y < height; ++y) - { - for (size_t x = 0; x < width; ++x, src += step) - { - *dst++ = ToFloat(ToBgr(src, 0), scale[0], shift[0]); - *dst++ = ToFloat(ToBgr(src, 1), scale[1], shift[1]); - *dst++ = ToFloat(ToBgr(src, 2), scale[2], shift[2]); - } - src += (stride - width * step); - } - } - - void SynetSetInput(const uint8_t* src, size_t width, size_t height, size_t stride, SimdPixelFormatType srcFormat, - const float* lower, const float* upper, float* dst, size_t channels, SimdTensorFormatType dstFormat) - { - float scale[3]; - for (size_t i = 0; i < channels; ++i) - scale[i] = (upper[i] - lower[i]) / 255.0f; - switch (channels) - { - case 1: - switch (srcFormat) - { - case SimdPixelFormatGray8: SynetSetInput1(src, width, height, stride, scale, lower, dst); return; - case SimdPixelFormatBgr24: SynetSetInput1(src, width, height, stride, scale, lower, dst); return; - case SimdPixelFormatBgra32: SynetSetInput1(src, width, height, stride, scale, lower, dst); return; - case SimdPixelFormatRgb24: SynetSetInput1(src, width, height, stride, scale, lower, dst); return; - default: assert(0); - } - break; - case 3: - switch (dstFormat) - { - case SimdTensorFormatNchw: - switch (srcFormat) - { - case SimdPixelFormatGray8: SynetSetInputNchw3(src, width, height, stride, scale, lower, dst); return; - case SimdPixelFormatBgr24: SynetSetInputNchw3(src, width, height, stride, scale, lower, dst); return; - case SimdPixelFormatBgra32: SynetSetInputNchw3(src, width, height, stride, scale, lower, dst); return; - case SimdPixelFormatRgb24: SynetSetInputNchw3(src, width, height, stride, scale, lower, dst); return; - default: assert(0); - } - break; - case SimdTensorFormatNhwc: - switch (srcFormat) - { - case SimdPixelFormatGray8: SynetSetInputNhwc3(src, width, height, stride, scale, lower, dst); return; - case SimdPixelFormatBgr24: SynetSetInputNhwc3(src, width, height, stride, scale, lower, dst); return; - case SimdPixelFormatBgra32: SynetSetInputNhwc3(src, width, height, stride, scale, lower, dst); return; - case SimdPixelFormatRgb24: SynetSetInputNhwc3(src, width, height, stride, scale, lower, dst); return; - default: assert(0); - } - break; - default: assert(0); - } - default: assert(0); - } - } - - //--------------------------------------------------------------------- - - template SIMD_INLINE void Copy(const float * src, float * dst) - { - for (size_t i = 0; i < N; ++i) - dst[i] = src[i]; - } - - void SynetReorderImage_Chw_Hwc(size_t channels, size_t spatial, const float * src, float * dst) - { - for (size_t s = 0; s < spatial; ++s, src += 1, dst += channels) - for (size_t c = 0; c < channels; ++c) - dst[c] = src[c*spatial]; - } - - template void SynetReorderImage_Chw_ChwXc(size_t channels, size_t spatial, const float * src, float * dst) - { - for (size_t c = 0; c < channels; c += N, src += N*spatial) - { - size_t n = Simd::Min(channels, c + N) - c; - const float * ps = src; - for (size_t s = 0; s < spatial; ++s, dst += N, ps += 1) - { - 
size_t i = 0; - for (; i < n; ++i) - dst[i] = ps[i*spatial]; - for (; i < N; ++i) - dst[i] = 0; - } - } - } - - void SynetReorderImage_Hwc_Chw(size_t channels, size_t spatial, const float * src, float * dst) - { - SynetReorderImage_Chw_Hwc(spatial, channels, src, dst); - } - - template void SynetReorderImage_Hwc_ChwXc(size_t channels, size_t spatial, const float * src, float * dst) - { - size_t channelsN = AlignLo(channels, N); - size_t tail = channels - channelsN; - for (size_t c = 0; c < channelsN; c += N, src += N) - { - const float * psrc = src; - for (size_t s = 0; s < spatial; ++s, psrc += channels, dst += N) - Copy(psrc, dst); - } - if(tail) - { - const float * psrc = src; - for (size_t s = 0; s < spatial; ++s, psrc += channels, dst += N) - { - size_t i = 0; - for (; i < tail; ++i) - dst[i] = psrc[i]; - for (; i < N; ++i) - dst[i] = 0; - } - } - } - - template void SynetReorderImage_ChwXc_Chw(size_t channels, size_t spatial, const float * src, float * dst) - { - for (size_t c = 0; c < channels; c += N, src += N * spatial) - { - const float * ps = src; - for (size_t i = 0, n = Simd::Min(channels, c + N) - c; i < n; ++i, ps += 1, dst += spatial) - { - for (size_t s = 0; s < spatial; ++s) - dst[s] = ps[s*N]; - } - } - } - - template void SynetReorderImage_ChwXc_Hwc(size_t channels, size_t spatial, const float * src, float * dst) - { - size_t stride = N * spatial; - size_t channelsN = AlignLo(channels, N); - size_t tail = channels - channelsN; - for (size_t s = 0; s < spatial; ++s, src += N) - { - const float * psrc = src; - for (size_t c = 0; c < channelsN; c += N, psrc += stride, dst += N) - Copy(psrc, dst); - if (tail) - { - for (size_t i = 0; i < tail; ++i) - *(dst++) = psrc[i]; - } - } - } - - typedef void(*SynetImageConverterPtr)(size_t channels, size_t spatial, const float * src, float * dst); - SynetImageConverterPtr GetImageConverter(SimdTensorFormatType src, SimdTensorFormatType dst) - { - if (src == SimdTensorFormatNchw) - { - if(dst == SimdTensorFormatNhwc) - return SynetReorderImage_Chw_Hwc; - if (dst == SimdTensorFormatNchw4c) - return SynetReorderImage_Chw_ChwXc<4>; - if (dst == SimdTensorFormatNchw8c) - return SynetReorderImage_Chw_ChwXc<8>; - if (dst == SimdTensorFormatNchw16c) - return SynetReorderImage_Chw_ChwXc<16>; - } - if (src == SimdTensorFormatNhwc) - { - if(dst == SimdTensorFormatNchw) - return SynetReorderImage_Hwc_Chw; - if (dst == SimdTensorFormatNchw4c) - return SynetReorderImage_Hwc_ChwXc<4>; - if (dst == SimdTensorFormatNchw8c) - return SynetReorderImage_Hwc_ChwXc<8>; - if (dst == SimdTensorFormatNchw16c) - return SynetReorderImage_Hwc_ChwXc<16>; - } - if (src == SimdTensorFormatNchw4c) - { - if (dst == SimdTensorFormatNchw) - return SynetReorderImage_ChwXc_Chw<4>; - if (dst == SimdTensorFormatNhwc) - return SynetReorderImage_ChwXc_Hwc<4>; - } - if (src == SimdTensorFormatNchw8c) - { - if (dst == SimdTensorFormatNchw) - return SynetReorderImage_ChwXc_Chw<8>; - if (dst == SimdTensorFormatNhwc) - return SynetReorderImage_ChwXc_Hwc<8>; - } - if (src == SimdTensorFormatNchw16c) - { - if (dst == SimdTensorFormatNchw) - return SynetReorderImage_ChwXc_Chw<16>; - if (dst == SimdTensorFormatNhwc) - return SynetReorderImage_ChwXc_Hwc<16>; - } - return NULL; - } - - void SynetReorderImage(size_t batch, size_t channels, size_t spatial, const float * src, SimdTensorFormatType srcFormat, float * dst, SimdTensorFormatType dstFormat) - { - SynetImageConverterPtr imageConverter = GetImageConverter(srcFormat, dstFormat); - size_t srcStride = AlignHi(channels, 
SynetTensorAlignment(srcFormat))*spatial; - size_t dstStride = AlignHi(channels, SynetTensorAlignment(dstFormat))*spatial; - for (size_t n = 0; n < batch; ++n) - { - if (srcFormat == dstFormat) - memcpy(dst, src, srcStride*sizeof(float)); - else - { - assert(imageConverter); - imageConverter(channels, spatial, src, dst); - } - src += srcStride; - dst += dstStride; - } - } - - //--------------------------------------------------------------------- - - void SynetReorderFilter_Oiyx_Yxio(size_t output, size_t input, size_t kernel, const float * src, float * dst) - { - size_t stride = input * kernel; - for (size_t k = 0; k < kernel; ++k, src += 1) - { - const float * ps = src; - for (size_t i = 0; i < input; ++i, ps += kernel) - { - for (size_t o = 0; o < output; ++o) - *(dst++) = ps[o * stride]; - } - } - } - - template void SynetReorderFilter_Oiyx_OyxiXo(size_t output, size_t input, size_t kernel, const float * src, float * dst) - { - for (size_t o = 0; o < output; o += N) - { - size_t n = Simd::Min(output, o + N) - o; - for (size_t k = 0; k < kernel; ++k) - { - for (size_t i = 0; i < input; ++i) - { - size_t j = 0; - for (; j < n; ++j) - *(dst++) = src[((o + j) * input + i)*kernel + k]; - for (; j < N; ++j) - *(dst++) = 0; - } - } - } - } - - void SynetReorderFilter_Yxio_Oiyx(size_t output, size_t input, size_t kernel, const float * src, float * dst) - { - SynetReorderFilter_Oiyx_Yxio(kernel, input, output, src, dst); - } - - template void SynetReorderFilter_Yxio_OyxiXo(size_t output, size_t input, size_t kernel, const float * src, float * dst) - { - size_t outputN = AlignLo(output, N); - for (size_t o = 0; o < outputN; o += N, src += N) - { - const float * psrc = src; - for (size_t k = 0; k < kernel; ++k) - for (size_t i = 0; i < input; ++i, dst += N, psrc += output) - Copy(psrc, dst); - } - if(outputN < output) - { - size_t tail = output - outputN; - for (size_t k = 0; k < kernel; ++k) - { - for (size_t i = 0; i < input; ++i, src += output) - { - size_t j = 0; - for (; j < tail; ++j) - *(dst++) = src[j]; - for (; j < N; ++j) - *(dst++) = 0; - } - } - } - } - - template void SynetReorderFilter_OyxiXo_Oiyx(size_t output, size_t input, size_t kernel, const float * src, float * dst) - { - for (size_t o = 0; o < output; o += N, src += N*kernel*input) - { - for (size_t j = 0, n = Simd::Min(output, o + N) - o; j < n; ++j) - { - for (size_t i = 0; i < input; ++i) - { - for (size_t k = 0; k < kernel; ++k) - *(dst++) = src[ (k*input + i)*N + j]; - } - } - } - } - - template void SynetReorderFilter_OyxiXo_Yxio(size_t output, size_t input, size_t kernel, const float * src, float * dst) - { - size_t outputN = AlignLo(output, N); - size_t tail = output - outputN; - size_t stride = kernel * input * N; - for (size_t k = 0; k < kernel; ++k) - { - for (size_t i = 0; i < input; ++i, src += N) - { - const float * psrc = src; - for (size_t o = 0; o < outputN; o += N, psrc += stride, dst += N) - Copy(psrc, dst); - if(outputN < output) - { - for (size_t j = 0; j < tail; ++j) - *(dst++) = psrc[j]; - } - } - } - } - - typedef void(*SynetFilterConverterPtr)(size_t output, size_t input, size_t kernel, const float * src, float * dst); - SynetFilterConverterPtr GetFilterConverter(SimdTensorFormatType src, SimdTensorFormatType dst) - { - if (src == SimdTensorFormatOiyx) - { - if (dst == SimdTensorFormatYxio) - return SynetReorderFilter_Oiyx_Yxio; - if (dst == SimdTensorFormatOyxi4o) - return SynetReorderFilter_Oiyx_OyxiXo<4>; - if (dst == SimdTensorFormatOyxi8o) - return SynetReorderFilter_Oiyx_OyxiXo<8>; - if (dst == 
SimdTensorFormatOyxi16o) - return SynetReorderFilter_Oiyx_OyxiXo<16>; - } - if (src == SimdTensorFormatYxio) - { - if (dst == SimdTensorFormatOiyx) - return SynetReorderFilter_Yxio_Oiyx; - if (dst == SimdTensorFormatOyxi4o) - return SynetReorderFilter_Yxio_OyxiXo<4>; - if (dst == SimdTensorFormatOyxi8o) - return SynetReorderFilter_Yxio_OyxiXo<8>; - if (dst == SimdTensorFormatOyxi16o) - return SynetReorderFilter_Yxio_OyxiXo<16>; - } - if (src == SimdTensorFormatOyxi4o) - { - if (dst == SimdTensorFormatOiyx) - return SynetReorderFilter_OyxiXo_Oiyx<4>; - if (dst == SimdTensorFormatYxio) - return SynetReorderFilter_OyxiXo_Yxio<4>; - } - if (src == SimdTensorFormatOyxi8o) - { - if (dst == SimdTensorFormatOiyx) - return SynetReorderFilter_OyxiXo_Oiyx<8>; - if (dst == SimdTensorFormatYxio) - return SynetReorderFilter_OyxiXo_Yxio<8>; - } - if (src == SimdTensorFormatOyxi16o) - { - if (dst == SimdTensorFormatOiyx) - return SynetReorderFilter_OyxiXo_Oiyx<16>; - if (dst == SimdTensorFormatYxio) - return SynetReorderFilter_OyxiXo_Yxio<16>; - } - return NULL; - } - - void SynetReorderFilter(size_t output, size_t input, size_t kernel, const float * src, SimdTensorFormatType srcFormat, float * dst, SimdTensorFormatType dstFormat) - { - if (srcFormat == dstFormat) - { - size_t aligned = AlignHi(output, SynetTensorAlignment(srcFormat)); - memcpy(dst, src, aligned * input * kernel * sizeof(float)); - return; - } - SynetFilterConverterPtr filterConverter = GetFilterConverter(srcFormat, dstFormat); - assert(filterConverter); - filterConverter(output, input, kernel, src, dst); - } - } -} diff --git a/src/3rd/Simd/Simd/SimdBaseSynetConvolution32f.cpp b/src/3rd/Simd/Simd/SimdBaseSynetConvolution32f.cpp deleted file mode 100644 index f1c43e0f..00000000 --- a/src/3rd/Simd/Simd/SimdBaseSynetConvolution32f.cpp +++ /dev/null @@ -1,1553 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdSynetConvolution32f.h" -#include "Simd/SimdSynetConvolution32fCommon.h" -#include "Simd/SimdSynet.h" -#include "Simd/SimdBase.h" -#include "Simd/SimdCpu.h" - -namespace Simd -{ -#if defined(SIMD_PERFORMANCE_STATISTIC) - Base::PerformanceMeasurer * SynetConvolution32f::Perf(const String& func) - { - if (_perf == NULL) - _perf = Simd::Base::PerformanceMeasurerStorage::s_storage.Get(func, Param().Info() + " " + Desc(), Param().Flop()); - return _perf; - } -#endif - - namespace Base - { - void ConvolutionBiasAndActivation(const float * bias, size_t count, size_t size, ::SimdConvolutionActivationType activation, const float * params, SimdBool trans, float * dst) - { - if (activation == ::SimdConvolutionActivationIdentity) - { - if(bias) - SynetAddBias(bias, count, size, dst, (SimdTensorFormatType)trans); - } - else if (activation == ::SimdConvolutionActivationRelu) - { - if (bias) - { - if (trans) - { - for (size_t j = 0; j < size; ++j) - { - for (size_t i = 0; i < count; ++i) - dst[i] = Simd::Max(0.0f, dst[i] + bias[i]); - dst += count; - } - } - else - { - for (size_t i = 0; i < count; ++i) - { - for (size_t j = 0; j < size; ++j) - dst[j] = Simd::Max(0.0f, dst[j] + bias[i]); - dst += size; - } - } - } - else - { - float slope = 0; - SynetRelu32f(dst, size*count, &slope, dst); - } - } - else if (activation == ::SimdConvolutionActivationLeakyRelu) - { - float slope = params[0]; - if (bias) - { - if (trans) - { - for (size_t j = 0; j < size; ++j) - { - for (size_t i = 0; i < count; ++i) - dst[i] = SynetRelu32f(dst[i] + bias[i], slope); - dst += count; - } - } - else - { - for (size_t i = 0; i < count; ++i) - { - for (size_t j = 0; j < size; ++j) - dst[j] = SynetRelu32f(dst[j] + bias[i], slope); - dst += size; - } - } - } - else - SynetRelu32f(dst, size*count, &slope, dst); - } - else if (activation == ::SimdConvolutionActivationRestrictRange) - { - float lower = params[0]; - float upper = params[1]; - if (bias) - { - if (trans) - { - for (size_t j = 0; j < size; ++j) - { - for (size_t i = 0; i < count; ++i) - dst[i] = Simd::RestrictRange(dst[i] + bias[i], lower, upper); - dst += count; - } - } - else - { - for (size_t i = 0; i < count; ++i) - { - for (size_t j = 0; j < size; ++j) - dst[j] = Simd::RestrictRange(dst[j] + bias[i], lower, upper); - dst += size; - } - } - } - else - SynetRestrictRange32f(dst, size*count, &lower, &upper, dst); - } - else if (activation == ::SimdConvolutionActivationPrelu) - { - if (bias) - { - if (trans) - { - for (size_t j = 0; j < size; ++j) - { - for (size_t i = 0; i < count; ++i) - dst[i] = SynetRelu32f(dst[i] + bias[i], params[i]); - dst += count; - } - } - else - { - for (size_t i = 0; i < count; ++i) - { - for (size_t j = 0; j < size; ++j) - dst[j] = SynetRelu32f(dst[j] + bias[i], params[i]); - dst += size; - } - } - } - else - Base::SynetPreluLayerForward(dst, params, count, size, dst, (SimdTensorFormatType)trans); - } - else if (activation == ::SimdConvolutionActivationElu) - { - float alpha = params[0]; - if (bias) - { - if (trans) - { - for (size_t j = 0; j < size; ++j) - { - for (size_t i = 0; i < count; ++i) - dst[i] = SynetElu32f(dst[i] + bias[i], alpha); - dst += count; - } - } - else - { - for (size_t i = 0; i < count; ++i) - { - for (size_t j = 0; j < size; ++j) - dst[j] = SynetElu32f(dst[j] + bias[i], alpha); - dst += size; - } - } - } - else - SynetElu32f(dst, size*count, &alpha, dst); - } - else if (activation == ::SimdConvolutionActivationHswish) - { - float shift = params[0]; - float scale = params[1]; - if (bias) - { - if 
(trans) - { - for (size_t j = 0; j < size; ++j) - { - for (size_t i = 0; i < count; ++i) - dst[i] = SynetHswish32f(dst[i] + bias[i], shift, scale); - dst += count; - } - } - else - { - for (size_t i = 0; i < count; ++i) - { - for (size_t j = 0; j < size; ++j) - dst[j] = SynetHswish32f(dst[j] + bias[i], shift, scale); - dst += size; - } - } - } - else - SynetHswish32f(dst, size*count, &shift, &scale, dst); - } - else - assert(0); - } - - SynetConvolution32fGemmNN::SynetConvolution32fGemmNN(const ConvParam32f & p) - : SynetConvolution32f(p) - { - if (p.IsDilation(1) && p.IsStride(1) && p.IsPad(0)) - { - _skipConv = p.IsKernel(1) || (p.srcH == p.kernelY && p.srcW == p.kernelX); - } - else - _skipConv = false; - if (p.trans) - { - _M = p.dstH * p.dstW; - _N = p.dstC / p.group; - _K = p.srcC * p.kernelY * p.kernelX / p.group; - _ldS = _K * (p.Is1x1() ? p.group : 1); - _ldW = p.dstC; - _ldD = p.dstC; - _grW = _N; - _grS = _K * (p.Is1x1() ? 1 : _M); - _grD = _N; - } - else - { - _M = p.dstC / p.group; - _N = p.dstH * p.dstW; - _K = p.srcC * p.kernelY * p.kernelX / p.group; - _ldW = _K; - _ldS = _N; - _ldD = _N; - _grW = _M * _K; - _grS = _K * _N; - _grD = _M * _N; - } - _batch = p.batch; - _sizeS = p.srcC*p.srcH*p.srcW; - _sizeB = p.srcC*p.kernelY*p.kernelX*p.dstH*p.dstW; - _sizeD = p.dstC*p.dstH*p.dstW; - _merge = 1; - if (p.trans && p.group == 1 && _batch > 1) - { - for (size_t merge = 1; merge <= _batch; ++merge) - if (_batch%merge == 0 && _M*merge*_K*sizeof(float) <= Base::AlgCacheL2()) - _merge = merge; - } - _gemm.Init(InitGemmFuncs(Base::Gemm32fNN, "Base", p.gemm, "Ext")); - _biasAndActivation = Base::ConvolutionBiasAndActivation; - } - - size_t SynetConvolution32fGemmNN::ExternalBufferSize() const - { - if (_skipConv) - return 1; - else - return _sizeB*_merge; - }; - - void SynetConvolution32fGemmNN::SetParams(const float * weight, SimdBool * internal, const float * bias, const float * params) - { - Simd::SynetConvolution32f::SetParams(weight, internal, bias, params); - if (_nhwcWeight.data) - { - if (_gemmCb.Size()) - _gemmCb.At(0).ReorderB(_M*_merge, _N, _K, weight, _nhwcWeight.data); - else - _nhwcReorderB(_M*_merge, _N, _K, weight, _nhwcWeight.data, GemmKernelAny, NHWC_GEMM_COMPATIBLE); - if (internal) - *internal = SimdTrue; - } - } - - void SynetConvolution32fGemmNN::Forward(const float * src, float * buf, float * dst) - { - const ConvParam32f & p = _param; - if (!_skipConv) - buf = Buffer(buf); - if (_merge > 1) - { - for (size_t b = 0; b < _batch; b += _merge) - { - const float * tmp = src; - if (!_skipConv) - { - for (size_t m = 0; m < _merge; ++m) - ImgToRow(src + m * _sizeS, buf + m * _sizeB); - tmp = buf; - } - if (_nhwcWeight.data) - { - if (_gemmCb.Size()) - _gemmCb.Run(GemmCbArgs(_M*_merge, _N, _K, tmp, _nhwcWeight.data, dst)); - else - _nhwcRun(_M*_merge, _N, _K, tmp, _nhwcWeight.data, dst, GemmKernelAny, NHWC_GEMM_COMPATIBLE); - } - else - _gemm.Run(GemmArgs(_M*_merge, _N, _K, &_1, tmp, _ldS, _weight, _ldW, &_0, dst, _ldD)); - for (size_t m = 0; m < _merge; ++m) - _biasAndActivation(_bias, p.dstC, p.dstH*p.dstW, p.activation, _params, p.trans, dst + m * _sizeD); - src += _sizeS * _merge; - dst += _sizeD * _merge; - } - } - else - { - for (size_t b = 0; b < _batch; ++b) - { - const float * tmp = src; - if (!_skipConv) - { - if (_param.trans) - ImgToRow(src, buf); - else - ImgToCol(src, buf); - tmp = buf; - } - for (size_t g = 0; g < p.group; ++g) - { - if (p.trans) - { - if (_nhwcWeight.data) - { - if (_gemmCb.Size()) - _gemmCb.Run(GemmCbArgs(_M, _N, _K, tmp, 
_nhwcWeight.data, dst)); - else - _nhwcRun(_M, _N, _K, tmp, _nhwcWeight.data, dst, GemmKernelAny, NHWC_GEMM_COMPATIBLE); - } - else - _gemm.Run(GemmArgs(_M, _N, _K, &_1, tmp + _grS * g, _ldS, _weight + _grW * g, _ldW, &_0, dst + _grD * g, _ldD)); - } - else - _gemm.Run(GemmArgs(_M, _N, _K, &_1, _weight + _grW * g, _ldW, tmp + _grS * g, _ldS, &_0, dst + _grD * g, _ldD)); - } - _biasAndActivation(_bias, p.dstC, p.dstH*p.dstW, p.activation, _params, p.trans, dst); - src += _sizeS; - dst += _sizeD; - } - } - } - - void SynetConvolution32fGemmNN::ImgToCol(const float * src, float * dst) - { - const ConvParam32f & p = _param; - assert(!p.trans); - size_t srcSize = p.srcW * p.srcH; - if (p.IsDilation(1) && p.IsStride(2) && p.IsPad(0) && p.IsKernel(1)) - { - for (size_t c = 0; c < p.srcC; ++c) - { - for (size_t dy = 0; dy < p.dstH; ++dy) - { - const float * psrc = src + 2 * dy*p.srcW; - for (size_t dx = 0, sx = 0; dx < p.dstW; ++dx, sx += 2) - *(dst++) = psrc[sx]; - } - src += srcSize; - } - } - else if (p.IsDilation(1) && p.IsStride(1)) - { - const ptrdiff_t bodySize = p.dstW - p.padX - p.padW; - for (size_t c = 0; c < p.srcC; ++c) - { - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - for (size_t kx = 0; kx < p.kernelX; ++kx) - { - size_t sy = ky - p.padY; - for (size_t dy = 0; dy < p.dstH; ++dy, ++sy) - { - if (sy < p.srcH) - { - size_t sx = kx - p.padX, dx = 0; - const float* psrc = src + sy * p.srcW; - for (; dx < p.padX; ++dx, ++sx) - { - if (sx < p.srcW) - *(dst++) = psrc[sx]; - else - *(dst++) = 0; - } - if (bodySize > 0) - { - memcpy(dst, psrc + sx, bodySize * sizeof(float)); - dst += bodySize; - dx += bodySize; - sx += bodySize; - } - for (; dx < p.dstW; ++dx, ++sx) - { - if (sx < p.srcW) - *(dst++) = psrc[sx]; - else - *(dst++) = 0; - } - } - else - { - memset(dst, 0, p.dstW * sizeof(float)); - dst += p.dstW; - } - } - } - } - src += srcSize; - } - } - else - { - for (size_t c = 0; c < p.srcC; ++c) - { - for (size_t ky = 0; ky < p.kernelY; ky++) - { - for (size_t kx = 0; kx < p.kernelX; kx++) - { - size_t sy = ky * p.dilationY - p.padY; - for (size_t dy = 0; dy < p.dstH; ++dy) - { - if (sy < p.srcH) - { - size_t sx = kx * p.dilationX - p.padX; - for (size_t dx = 0; dx < p.dstW; ++dx) - { - if (sx < p.srcW) - *(dst++) = src[sy * p.srcW + sx]; - else - *(dst++) = 0; - sx += p.strideX; - } - } - else - { - for (size_t dx = 0; dx < p.dstW; ++dx) - *(dst++) = 0; - } - sy += p.strideY; - } - } - } - src += srcSize; - } - } - } - - void SynetConvolution32fGemmNN::ImgToRow(const float * src, float * dst) - { - const ConvParam32f & p = _param; - assert(p.trans); - size_t size = p.srcC / p.group; - for (size_t g = 0; g < p.group; ++g) - { - for (size_t dy = 0; dy < p.dstH; ++dy) - { - for (size_t dx = 0; dx < p.dstW; ++dx) - { - for (size_t ky = 0; ky < p.kernelY; ky++) - { - size_t sy = dy * p.strideY + ky * p.dilationY - p.padY; - if (sy < p.srcH) - { - for (size_t kx = 0; kx < p.kernelX; kx++) - { - size_t sx = dx * p.strideX + kx * p.dilationX - p.padX; - if (sx < p.srcW) - { - memcpy(dst, src + (sy * p.srcW + sx)*p.srcC, size * sizeof(float)); - dst += size; - } - else - { - memset(dst, 0, size * sizeof(float)); - dst += size; - } - } - } - else - { - memset(dst, 0, p.kernelX * size * sizeof(float)); - dst += p.kernelX * size; - } - } - } - } - src += size; - } - } - - //--------------------------------------------------------------------- - - SynetConvolution32fGemmNT::SynetConvolution32fGemmNT(const ConvParam32f & p) - : SynetConvolution32f(p) - { - assert(p.group == 1); - if (p.trans) - 
assert(p.dstC == 1 && p.Is1x1()); - _M = p.dstC; - _N = p.dstH * p.dstW; - _K = p.srcC * p.kernelY * p.kernelX; - _batch = p.batch; - _sizeS = p.srcC*p.srcH*p.srcW; - _sizeB = p.srcC*p.kernelY*p.kernelX*p.dstH*p.dstW; - _sizeD = p.dstC*p.dstH*p.dstW; - _gemm.Init(InitGemmFuncs(Base::Gemm32fNT, "Base")); - _biasAndActivation = Base::ConvolutionBiasAndActivation; - } - - size_t SynetConvolution32fGemmNT::ExternalBufferSize() const - { - return _param.trans ? 1 : _sizeB; - }; - - void SynetConvolution32fGemmNT::Forward(const float * src, float * buf, float * dst) - { - const ConvParam32f& p = _param; - if (p.trans == 0) - buf = Buffer(buf); - for (size_t b = 0; b < _batch; ++b) - { - if (p.trans) - { - _gemm.Run(GemmArgs(_M, _N, _K, &_1, _weight, _K, src, _K, &_0, dst, _N)); - _biasAndActivation(_bias, 1, p.dstH * p.dstW, p.activation, _params, SimdFalse, dst); - } - else - { - ImgToRow(src, _param, buf); - _gemm.Run(GemmArgs(_M, _N, _K, &_1, _weight, _K, buf, _K, &_0, dst, _N)); - _biasAndActivation(_bias, p.dstC, p.dstH * p.dstW, p.activation, _params, SimdFalse, dst); - } - src += _sizeS; - dst += _sizeD; - } - } - - bool SynetConvolution32fGemmNT::Preferable(const ConvParam32f & p) - { - if (p.group != 1) - return false; - if (p.trans) - return p.Is1x1() && p.dstC == 1; - else - return p.srcH < 6 && p.srcW < 6; - } - - void SynetConvolution32fGemmNT::ImgToRow(const float * src, const ConvParam32f & p, float * dst) - { - const size_t K = p.kernelX * p.kernelY*p.srcC, N = p.dstH * p.dstW; - if (p.IsDilation(1) && p.IsStride(1)) - { - if (p.IsKernel(1)) - { - for (size_t i = 0; i < N; ++i) - { - for (size_t k = 0; k < K; ++k) - *(dst++) = src[k*N + i]; - } - } - else - { - for (size_t dstRow = 0; dstRow < p.dstH; ++dstRow) - { - size_t srcRow0 = dstRow - p.padY; - for (size_t dstCol = 0; dstCol < p.dstW; ++dstCol) - { - size_t srcCol0 = dstCol - p.padX; - for (size_t channel = 0; channel < p.srcC; ++channel) - { - for (size_t kernelRow = 0; kernelRow < p.kernelY; ++kernelRow) - { - size_t srcRow = srcRow0 + kernelRow; - if (srcRow < p.srcH) - { - const float * psrc = src + (channel*p.srcH + srcRow)*p.srcW; - for (size_t kernelCol = 0; kernelCol < p.kernelX; ++kernelCol) - { - size_t srcCol = srcCol0 + kernelCol; - if (srcCol < p.srcW) - *(dst++) = psrc[srcCol]; - else - *(dst++) = 0; - } - } - else - { - for (size_t kernelCol = 0; kernelCol < p.kernelX; ++kernelCol) - *(dst++) = 0; - } - } - } - } - } - } - } - else - { - for (size_t dstRow = 0; dstRow < p.dstH; ++dstRow) - { - size_t srcRow0 = dstRow * p.strideY - p.padY; - for (size_t dstCol = 0; dstCol < p.dstW; ++dstCol) - { - size_t srcCol0 = dstCol * p.strideX - p.padX; - for (size_t channel = 0; channel < p.srcC; ++channel) - { - for (size_t kernelRow = 0; kernelRow < p.kernelY; ++kernelRow) - { - size_t srcRow = srcRow0 + kernelRow * p.dilationY; - if (srcRow < p.srcH) - { - const float * psrc = src + (channel*p.srcH + srcRow)*p.srcW; - for (size_t kernelCol = 0; kernelCol < p.kernelX; ++kernelCol) - { - size_t srcCol = srcCol0 + kernelCol * p.dilationX; - if (srcCol < p.srcW) - *(dst++) = psrc[srcCol]; - else - *(dst++) = 0; - } - } - else - { - for (size_t kernelCol = 0; kernelCol < p.kernelX; ++kernelCol) - *(dst++) = 0; - } - } - } - } - } - } - } - - //--------------------------------------------------------------------- - - SynetConvolution32fWinograd::SynetConvolution32fWinograd(const ConvParam32f& p) - : SynetConvolution32f(p) - { - if (p.kernelY == 1 && p.kernelX == 3) - { - { - SetBlock(1, 4); - _setFilter = 
Base::WinogradKernel1x3Block1x4SetFilter; - _setInput = Base::WinogradKernel1x3Block1x4SetInput; - _setOutput = Base::WinogradKernel1x3Block1x4SetOutput; - } - } - else if (p.kernelY == 1 && p.kernelX == 5) - { - { - SetBlock(1, 4); - _setFilter = Base::WinogradKernel1x5Block1x4SetFilter; - _setInput = Base::WinogradKernel1x5Block1x4SetInput; - _setOutput = Base::WinogradKernel1x5Block1x4SetOutput; - } - } - else if (p.kernelY == 2 && p.kernelX == 2) - { - if (p.trans && p.srcH >= 8 && p.srcW >= 8 && p.srcH * p.srcW * p.batch >= 144) - { - SetBlock(4, 4); - _setFilter = Base::WinogradKernel2x2Block4x4SetFilter; - _setInput = Base::WinogradKernel2x2Block4x4SetInput; - _setOutput = Base::WinogradKernel2x2Block4x4SetOutput; - } - else - { - SetBlock(2, 2); - _setFilter = Base::WinogradKernel2x2Block2x2SetFilter; - _setInput = Base::WinogradKernel2x2Block2x2SetInput; - _setOutput = Base::WinogradKernel2x2Block2x2SetOutput; - } - } - else if (p.kernelY == 3 && p.kernelX == 3) - { - if (p.trans && p.srcH >= 8 && p.srcW >= 8 && p.srcH * p.srcW * p.batch >= 144) - { - SetBlock(4, 4); - _setFilter = Base::WinogradKernel3x3Block4x4SetFilter; - _setInput = Base::WinogradKernel3x3Block4x4SetInput; - _setOutput = Base::WinogradKernel3x3Block4x4SetOutput; - } - else if (p.trans && p.srcH >= 6 && p.srcW >= 6 && p.srcH * p.srcW * p.batch >= 81 && p.dstH % 3 == 0 && p.dstW % 3 == 0) - { - SetBlock(3, 3); - _setFilter = Base::WinogradKernel3x3Block3x3SetFilter; - _setInput = Base::WinogradKernel3x3Block3x3SetInput; - _setOutput = Base::WinogradKernel3x3Block3x3SetOutput; - } - else - { - SetBlock(2, 2); - _setFilter = Base::WinogradKernel3x3Block2x2SetFilter; - _setInput = Base::WinogradKernel3x3Block2x2SetInput; - _setOutput = Base::WinogradKernel3x3Block2x2SetOutput; - } - } - else - assert(0); - _gemm.Init(InitGemmFuncs(Base::Gemm32fNN, "Base", p.gemm, "Ext")); - _biasAndActivation = Base::ConvolutionBiasAndActivation; - } - - String SynetConvolution32fWinograd::Desc() const - { - const ConvParam32f& p = this->Param(); - return Ext() + "::Winograd F(" + ToStr(_blockY) + "x" + ToStr(_blockX) + "," + ToStr(p.kernelY) + "x" + ToStr(p.kernelX) + ")" - + (_merge > 1 ? "*" + ToStr(_merge) : "") + (_split > 1 ? 
"/" + ToStr(_split) : ""); - } - - size_t SynetConvolution32fWinograd::ExternalBufferSize() const - { - return (_strideS + _strideD)*_count*_merge; - } - - size_t SynetConvolution32fWinograd::InternalBufferSize() const - { - return Simd::SynetConvolution32f::InternalBufferSize() + _winogradWeight.size; - } - - void SynetConvolution32fWinograd::SetParams(const float * weight, SimdBool * internal, const float * bias, const float * params) - { - Simd::SynetConvolution32f::SetParams(weight, internal, bias, params); - _winogradWeight.Resize(_strideW*_count); - _setFilter(weight, _param.srcC*_param.dstC, _winogradWeight.data, _param.trans); - if (_nhwcWeight.data) - { - for (size_t i = 0; i < _count; ++i) - { - if (_gemmCb.Size()) - _gemmCb.At(0).ReorderB(_M * _merge, _N, _K, _winogradWeight.data + i * _strideW, _nhwcWeight.data + i * _nhwcStrideW); - else - _nhwcReorderB(_M * _merge, _N, _K, _winogradWeight.data + i * _strideW, _nhwcWeight.data + i * _nhwcStrideW, GemmKernelAny, NHWC_GEMM_COMPATIBLE); - } - _winogradWeight.Resize(0); - } - if (internal) - *internal = SimdTrue; - } - - void SynetConvolution32fWinograd::Forward(const float * src, float * buf, float * dst) - { - const ConvParam32f & p = _param; - float * bufS = Buffer(buf); - float * bufD = bufS + _strideS * _count * _merge; - if (p.trans) - { - if (_split > 1) - ForwardSplitted(src, bufS, bufD, dst); - else - ForwardMerged(src, bufS, bufD, dst); - } - else - { - for (size_t b = 0; b < _batch; ++b) - { - _setInput(src, p.srcC, p.srcH, p.srcW, p.padY, p.padX, p.padH, p.padW, bufS, _strideS, p.trans); - for (size_t i = 0; i < _count; ++i) - _gemm.Run(GemmArgs(_M, _N, _K, &_1, _winogradWeight.data + i * _strideW, _K, bufS + i * _strideS, _N, &_0, bufD + i * _strideD, _N)); - _setOutput(bufD, _strideD, dst, p.dstC, p.dstH, p.dstW, p.trans); - _biasAndActivation(_bias, p.dstC, p.dstH*p.dstW, p.activation, _params, p.trans, dst); - src += _sizeS; - dst += _sizeD; - } - } - } - - bool SynetConvolution32fWinograd::Preferable(const ConvParam32f & p) - { - if (!p.IsDilation(1) || !p.IsStride(1) || p.group != 1 || p.srcC <= 16) - return false; - - if (p.IsKernel(1, 3)) - { - if (!(p.IsPad(0) || (p.padX == 1 && p.padW == 1)) ) - return false; - if (p.srcC <= 32) - return false; - return p.trans && p.srcW >= 8 && p.srcH * p.srcW * p.batch >= 36; - } - else if (p.IsKernel(1, 5)) - { - if (!(p.IsPad(0) || (p.padX == 2 && p.padW == 2))) - return false; - return p.trans && p.srcW >= 8 && p.srcH * p.srcW * p.batch >= 36; - } - else if (p.IsKernel(2)) - { - if (!(p.IsPad(0) || (p.padY + p.padH == 1 && p.padX + p.padW == 1))) - return false; - return p.trans && p.srcH >= 4 && p.srcW >= 4 && p.srcH * p.srcW * p.batch >= 36; - } - else if (p.IsKernel(3)) - { - if (!(p.IsPad(0) || p.IsPad(1))) - return false; - if (p.trans) - return p.srcH >= 4 && p.srcW >= 4 && p.srcH * p.srcW * p.batch >= 36; - else - return p.srcH >= 6 && p.srcW >= 6; - } - return false; - } - - void SynetConvolution32fWinograd::SetBlock(size_t blockY, size_t blockX) - { - const ConvParam32f & p = _param; - _blockY = blockY; - _blockX = blockX; - _count = (_blockY + p.kernelY - 1) * (_blockX + p.kernelX - 1); - _tileH = (p.dstH + _blockY - 1) / _blockY; - _tileW = (p.dstW + _blockX - 1) / _blockX; - _strideW = p.srcC * p.dstC; - _M = p.trans ? _tileW * _tileH : p.dstC; - _N = p.trans ? 
p.dstC : _tileW * _tileH; - _K = p.srcC; - _batch = p.batch; - _sizeS = p.srcC*p.srcH*p.srcW; - _sizeD = p.dstC*p.dstH*p.dstW; - _merge = 1; - _split = 1; - _tileHs = _tileH; - if (p.trans) - { - if (_batch > 1) - { - for (size_t merge = 1; merge <= _batch; ++merge) - if (_batch % merge == 0 && _M * merge <= 128) - _merge = merge; - } - if (_merge == 1 && _blockY == 4) - { - size_t cacheL2 = Base::AlgCacheL2() / sizeof(float); - size_t cacheL3 = Base::AlgCacheL3() / sizeof(float); - size_t bufferSize = _count * (p.srcC + p.dstC) * _tileW * _tileH; - size_t weightSize = _count * p.srcC * p.dstC; - if (bufferSize > cacheL2) - { - _tileHs = Simd::RestrictRange(size_t(cacheL2*0.5) * _tileH / bufferSize, 1, _tileH); - _split = DivHi(_tileH, _tileHs); - while (_split * _tileHs >= _tileH + _split) - _tileHs--; - if (_split > 1 && weightSize > cacheL3) - { - _split = DivHi(bufferSize, weightSize); - _tileHs = DivHi(_tileH, _split); - while (_split * _tileHs >= _tileH + _tileHs) - _split--; - } - } - } - } - _strideS = p.srcC * _tileHs * _tileW; - _strideD = p.dstC * _tileHs * _tileW; - } - - void SynetConvolution32fWinograd::ForwardMerged(const float * src, float * bufS, float * bufD, float * dst) - { - const ConvParam32f & p = _param; - for (size_t b = 0; b < _batch; b += _merge) - { - for (size_t m = 0; m < _merge; ++m) - _setInput(src + m * _sizeS, p.srcC, p.srcH, p.srcW, p.padY, p.padX, p.padH, p.padW, bufS + m * _strideS, _strideS * _merge, p.trans); - for (size_t i = 0; i < _count; ++i) - { - if (_nhwcWeight.data) - { - if (_gemmCb.Size()) - _gemmCb.Run(GemmCbArgs(_M * _merge, _N, _K, bufS + i * _strideS * _merge, _nhwcWeight.data + i * _nhwcStrideW, bufD + i * _strideD * _merge)); - else - _nhwcRun(_M * _merge, _N, _K, bufS + i * _strideS * _merge, _nhwcWeight.data + i * _nhwcStrideW, bufD + i * _strideD * _merge, GemmKernelAny, NHWC_GEMM_COMPATIBLE); - } - else - _gemm.Run(GemmArgs(_M * _merge, _N, _K, &_1, bufS + i * _strideS * _merge, _K, _winogradWeight.data + i * _strideW, _N, &_0, bufD + i * _strideD * _merge, _N)); - } - for (size_t m = 0; m < _merge; ++m) - { - _setOutput(bufD + m * _strideD, _strideD * _merge, dst + m * _sizeD, p.dstC, p.dstH, p.dstW, p.trans); - _biasAndActivation(_bias, p.dstC, p.dstH*p.dstW, p.activation, _params, p.trans, dst + m * _sizeD); - } - src += _sizeS * _merge; - dst += _sizeD * _merge; - } - } - - void SynetConvolution32fWinograd::ForwardSplitted(const float* src, float* bufS, float* bufD, float* dst) - { - const ConvParam32f& p = _param; - for (size_t b = 0; b < _batch; ++b) - { - for (size_t s = 0; s < _split; ++s) - { - size_t padY = s ? 0 : p.padY; - size_t padH = s == _split - 1 ? 
p.padH : 0; - size_t srcY = s * _tileHs * _blockY + padY - p.padY; - size_t srcH = Simd::Min(_tileHs * _blockY + p.kernelY - 1 - padY - padH, p.srcH - srcY); - size_t M = _tileW * Simd::Min(_tileHs, _tileH - s * _tileHs); - size_t dstY = s * _tileHs * _blockY; - size_t dstH = Simd::Min(_tileHs * _blockY, p.dstH - dstY); - _setInput(src + srcY * p.srcC * p.srcW, p.srcC, srcH, p.srcW, padY, p.padX, padH, p.padW, bufS, _strideS, p.trans); - for (size_t i = 0; i < _count; ++i) - { - if (_nhwcWeight.data) - { - if (_gemmCb.Size()) - _gemmCb.Run(GemmCbArgs(M, _N, _K, bufS + i * _strideS, _nhwcWeight.data + i * _nhwcStrideW, bufD + i * _strideD)); - else - _nhwcRun(M, _N, _K, bufS + i * _strideS, _nhwcWeight.data + i * _nhwcStrideW, bufD + i * _strideD, GemmKernelAny, NHWC_GEMM_COMPATIBLE); - } - else - _gemm.Run(GemmArgs(M, _N, _K, &_1, bufS + i * _strideS, _K, _winogradWeight.data + i * _strideW, _N, &_0, bufD + i * _strideD, _N)); - } - _setOutput(bufD, _strideD, dst + dstY * p.dstC * p.dstW, p.dstC, dstH, p.dstW, p.trans); - _biasAndActivation(_bias, p.dstC, dstH * p.dstW, p.activation, _params, p.trans, dst + dstY * p.dstC * p.dstW); - } - src += _sizeS; - dst += _sizeD; - } - } - - //--------------------------------------------------------------------- - - SynetConvolution32fDirectNchw::SynetConvolution32fDirectNchw(const ConvParam32f & p) - : SynetConvolution32f(p) - { - _srcC = p.srcC / p.group; - _srcH = p.padY + p.srcH + p.padH; - _srcW = p.padX + p.srcW + p.padW; - _dstC = p.dstC / p.group; - _grW = _srcC * _dstC * p.kernelY * p.kernelX; - _grS = _srcC * p.srcH * p.srcW; - _grD = _dstC * p.dstH * p.dstW; - _pad = p.IsPad(0) ? 0 : 1; - _convolutionBiasActivation = SetConvolutionBiasActivation(); - } - - size_t SynetConvolution32fDirectNchw::ExternalBufferSize() const - { - if (_pad) - return _srcC*_srcH*_srcW; - else - return 1; - } - - void SynetConvolution32fDirectNchw::Forward(const float * src, float * buf, float * dst) - { - const ConvParam32f & p = _param; - if(_pad) - buf = Buffer(buf); - for (size_t b = 0; b < p.batch; ++b) - { - const float * weight = _weight; - const float * bias = _bias; - const float * params = _params; - for (size_t g = 0; g < p.group; ++g) - { - if (_pad) - { - Pad(src, buf); - _convolutionBiasActivation(buf, _srcC, _srcH, _srcW, weight, bias, params, dst, _dstC, p.dstH, p.dstW); - } - else - _convolutionBiasActivation(src, _srcC, _srcH, _srcW, weight, bias, params, dst, _dstC, p.dstH, p.dstW); - weight += _grW; - if (bias) - bias += _dstC; - if (p.activation == ::SimdConvolutionActivationPrelu) - params += _dstC; - src += _grS; - dst += _grD; - } - } - } - - bool SynetConvolution32fDirectNchw::Preferable(const ConvParam32f & p) - { - if (!p.IsDilation(1)) - return false; - if (!(p.IsStride(1) || p.IsStride(2) || p.IsStride(3))) - return false; - double k = double(p.srcC) / p.group * p.strideX * p.strideY / p.kernelX / p.kernelY; - return k < 2.0 && (p.IsKernel(2) || p.IsKernel(3)) && p.trans == 0; - } - - void SynetConvolution32fDirectNchw::Pad(const float * src, float * dst) const - { - const ConvParam32f & p = _param; - for (size_t c = 0; c < _srcC; ++c) - { - if (p.padY) - { - memset(dst, 0, p.padY*_srcW * sizeof(float)); - dst += p.padY*_srcW; - } - for (size_t row = 0; row < p.srcH; ++row) - { - for (size_t col = 0; col < p.padX; ++col) - *dst++ = 0; - memcpy(dst, src, p.srcW * sizeof(float)); - dst += p.srcW; - src += p.srcW; - for (size_t col = 0; col < p.padW; ++col) - *dst++ = 0; - } - if (p.padH) - { - memset(dst, 0, p.padH*_srcW * 
sizeof(float)); - dst += p.padH*_srcW; - } - } - } - - SIMD_INLINE void AddConvolutionKernel1x1(const float * src, size_t srcW, size_t strideY, size_t strideX, const float * weight, float * dst, size_t dstH, size_t dstW) - { - for (size_t dy = 0; dy < dstH; ++dy) - { - for (size_t dx = 0, sx = 0; dx < dstW; ++dx, sx += strideX) - dst[dx] += src[sx]*weight[0]; - src += srcW * strideY; - dst += dstW; - } - } - - SIMD_INLINE float ConvolutionKernel2(const float * src, const float * weight) - { - return src[0] * weight[0] + src[1] * weight[1]; - } - - SIMD_INLINE float ConvolutionKernel2x2(const float * src, size_t srcW, const float * weight) - { - return - ConvolutionKernel2(src, weight) + - ConvolutionKernel2(src + srcW, weight + 2); - } - - SIMD_INLINE void AddConvolutionKernel2x2(const float * src, size_t srcW, size_t strideY, size_t strideX, const float * weight, float * dst, size_t dstH, size_t dstW) - { - for (size_t dy = 0; dy < dstH; ++dy) - { - for (size_t dx = 0, sx = 0; dx < dstW; ++dx, sx += strideX) - dst[dx] += ConvolutionKernel2x2(src + sx, srcW, weight); - src += srcW * strideY; - dst += dstW; - } - } - - SIMD_INLINE float ConvolutionKernel3(const float * src, const float * weight) - { - return src[0] * weight[0] + src[1] * weight[1] + src[2] * weight[2]; - } - - SIMD_INLINE float ConvolutionKernel3x3(const float * src, size_t srcW, const float * weight) - { - return - ConvolutionKernel3(src, weight) + - ConvolutionKernel3(src + srcW, weight + 3) + - ConvolutionKernel3(src + 2 * srcW, weight + 6); - } - - SIMD_INLINE void AddConvolutionKernel3x3(const float * src, size_t srcW, size_t strideY, size_t strideX, const float * weight, float * dst, size_t dstH, size_t dstW) - { - for (size_t dy = 0; dy < dstH; ++dy) - { - for (size_t dx = 0, sx = 0; dx < dstW; ++dx, sx += strideX) - dst[dx] += ConvolutionKernel3x3(src + sx, srcW, weight); - src += srcW * strideY; - dst += dstW; - } - } - - template<int kernel, int stride, ::SimdConvolutionActivationType type> - void ConvolutionBiasActivation(const float * src, size_t srcC, size_t srcH, size_t srcW, const float * weight, - const float * bias, const float * params, float * dst, size_t dstC, size_t dstH, size_t dstW) - { - for (size_t dc = 0; dc < dstC; ++dc) - { - Fill32f(dst, dstW * dstH, bias ? 
bias + dc : NULL); - for (size_t sc = 0; sc < srcC; ++sc) - { - const float * ps = src + sc * srcW * srcH; - const float * pw = weight + (dc*srcC + sc)*kernel*kernel; - float * pd = dst; - if (kernel == 1) - AddConvolutionKernel1x1(ps, srcW, stride, stride, pw, pd, dstH, dstW); - else if (kernel == 2) - AddConvolutionKernel2x2(ps, srcW, stride, stride, pw, pd, dstH, dstW); - else if (kernel == 3) - AddConvolutionKernel3x3(ps, srcW, stride, stride, pw, pd, dstH, dstW); - else - { - for (size_t dy = 0; dy < dstH; ++dy) - { - for (size_t dx = 0, sx = 0; dx < dstW; ++dx, sx += stride) - { - float sum = 0; - for (size_t ky = 0; ky < kernel; ++ky) - { - const float * s = ps + ky * srcW + sx; - const float * w = pw + kernel*ky; - for (size_t kx = 0; kx < kernel; ++kx) - sum += s[kx] * w[kx]; - } - pd[dx] += sum; - } - ps += srcW * stride; - pd += dstW; - } - } - } - ConvolutionBiasAndActivation(NULL, 1, dstH*dstW, type, params, ::SimdFalse, dst); - if (type == ::SimdConvolutionActivationPrelu) - params++; - dst += dstW * dstH; - } - } - - template<int kernel, int stride> SynetConvolution32fDirectNchw::ConvolutionBiasActivationPtr SetConvolutionBiasActivation(::SimdConvolutionActivationType type) - { - switch (type) - { - case ::SimdConvolutionActivationIdentity: return ConvolutionBiasActivation<kernel, stride, ::SimdConvolutionActivationIdentity>; - case ::SimdConvolutionActivationRelu: return ConvolutionBiasActivation<kernel, stride, ::SimdConvolutionActivationRelu>; - case ::SimdConvolutionActivationLeakyRelu: return ConvolutionBiasActivation<kernel, stride, ::SimdConvolutionActivationLeakyRelu>; - case ::SimdConvolutionActivationRestrictRange: return ConvolutionBiasActivation<kernel, stride, ::SimdConvolutionActivationRestrictRange>; - case ::SimdConvolutionActivationPrelu: return ConvolutionBiasActivation<kernel, stride, ::SimdConvolutionActivationPrelu>; - case ::SimdConvolutionActivationElu: return ConvolutionBiasActivation<kernel, stride, ::SimdConvolutionActivationElu>; - case ::SimdConvolutionActivationHswish: return ConvolutionBiasActivation<kernel, stride, ::SimdConvolutionActivationHswish>; - default: - assert(0); - return NULL; - } - } - - SynetConvolution32fDirectNchw::ConvolutionBiasActivationPtr SynetConvolution32fDirectNchw::SetConvolutionBiasActivation() - { - const ConvParam32f & p = _param; - switch (p.strideX) - { - case 1: - if (p.kernelX == 1) - return Base::SetConvolutionBiasActivation<1, 1>(p.activation); - if (p.kernelX == 2) - return Base::SetConvolutionBiasActivation<2, 1>(p.activation); - if (p.kernelX == 3) - return Base::SetConvolutionBiasActivation<3, 1>(p.activation); - break; - case 2: - if (p.kernelX == 2) - return Base::SetConvolutionBiasActivation<2, 2>(p.activation); - if (p.kernelX == 3) - return Base::SetConvolutionBiasActivation<3, 2>(p.activation); - break; - case 3: - if (p.kernelX == 3) - return Base::SetConvolutionBiasActivation<3, 3>(p.activation); - break; - } - return NULL; - } - - //--------------------------------------------------------------------- - - SynetConvolution32fDirectNhwc::SynetConvolution32fDirectNhwc(const ConvParam32f & p) - : SynetConvolution32f(p) - { - _batch = p.batch; - _sizeS = p.srcC*p.srcH*p.srcW; - _sizeD = p.dstC*p.dstH*p.dstW; - _convolutionBiasActivation = SetConvolutionBiasActivation(); - } - - void SynetConvolution32fDirectNhwc::Forward(const float * src, float * buf, float * dst) - { - for (size_t b = 0; b < _batch; ++b) - { - _convolutionBiasActivation(src, _param, _weight, _bias, _params, dst); - src += _sizeS; - dst += _sizeD; - } - } - - bool SynetConvolution32fDirectNhwc::Preferable(const ConvParam32f & p) - { - if (p.trans == 0) - return false; - if (p.group == 1) - { - double k = double(p.srcC) / p.group * p.strideX * p.strideY / p.kernelX / p.kernelY; - return k < 2.0; - } - return p.IsDepthwise(); - } - - static void ConvolutionDirectNhwcConvolutionBiasActivationDefault(const float * src, 
const ConvParam32f & p, const float * weight, const float * bias, const float * params, float * dst) - { - size_t group = p.group; - size_t srcC = p.srcC / group; - size_t dstC = p.dstC / group; - for (size_t dy = 0; dy < p.dstH; ++dy) - { - for (size_t dx = 0; dx < p.dstW; ++dx) - { - memset(dst, 0, p.dstC * sizeof(float)); - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t sy = dy * p.strideY + ky * p.dilationY - p.padY; - if (sy < p.srcH) - { - for (size_t kx = 0; kx < p.kernelX; ++kx) - { - size_t sx = dx * p.strideX + kx * p.dilationX - p.padX; - if (sx < p.srcW) - { - - const float * pw = weight + (ky*p.kernelX + kx)*srcC*p.dstC; - const float * ps = src + (sy*p.srcW + sx)*p.srcC; - if (group == 1) - { - for (size_t sc = 0; sc < srcC; ++sc) - { - for (size_t dc = 0; dc < dstC; ++dc) - dst[dc] += ps[0] * pw[dc]; - ps += 1; - pw += dstC; - } - } - else - { - for (size_t g = 0; g < group; ++g) - dst[g] += ps[g] * pw[g]; - } - } - } - } - } - ConvolutionBiasAndActivation(bias, p.dstC, 1, p.activation, params, ::SimdTrue, dst); - dst += p.dstC; - } - } - } - - SynetConvolution32fDirectNhwc::ConvolutionBiasActivationPtr SynetConvolution32fDirectNhwc::SetConvolutionBiasActivation() - { - return ConvolutionDirectNhwcConvolutionBiasActivationDefault; - } - - //--------------------------------------------------------------------- - - SynetConvolution32fDepthwiseDotProduct::SynetConvolution32fDepthwiseDotProduct(const ConvParam32f & p) - : SynetConvolution32f(p) - { - _count = p.srcC; - _size = p.srcH*p.srcW; - _batch = p.batch; - _sizeS = p.srcC*p.srcH*p.srcW; - _sizeD = p.dstC*p.dstH*p.dstW; - } - - SIMD_INLINE float DotProduct(const float * a, const float * b, size_t size) - { - size_t i = 0, aligned = size&(~3); - float sums[4] = { 0, 0, 0, 0 }; - for (; i < aligned; i += 4) - { - sums[0] += a[i + 0] * b[i + 0]; - sums[1] += a[i + 1] * b[i + 1]; - sums[2] += a[i + 2] * b[i + 2]; - sums[3] += a[i + 3] * b[i + 3]; - } - for (; i < size; ++i) - sums[0] += a[i] * b[i]; - return sums[0] + sums[1] + sums[2] + sums[3]; - } - - void SynetConvolution32fDepthwiseDotProduct::Forward(const float * src, float * buf, float * dst) - { - for (size_t b = 0; b < _batch; ++b) - { - if (_bias) - { - for (size_t i = 0; i < _count; ++i) - dst[i] = DotProduct(src + i * _size, _weight + i * _size, _size) + _bias[i]; - } - else - { - for (size_t i = 0; i < _count; ++i) - dst[i] = DotProduct(src + i * _size, _weight + i * _size, _size); - } - if (_param.activation) - ConvolutionBiasAndActivation(NULL, _count, 1, _param.activation, _params, ::SimdFalse, dst); - src += _sizeS; - dst += _sizeD; - } - } - - bool SynetConvolution32fDepthwiseDotProduct::Preferable(const ConvParam32f & p) - { - if (!(p.IsPad(0) && p.IsDilation(1) && p.IsStride(1))) - return false; - if (!(p.dstC == p.srcC && p.dstC == p.group && p.srcW == p.kernelX && p.srcH == p.kernelY)) - return false; - return p.trans == 0; - } - - //--------------------------------------------------------------------- - - SynetConvolution32fNhwcDirect::SynetConvolution32fNhwcDirect(const ConvParam32f & p) - : SynetConvolution32f(p) - { - _sizeS = p.srcC*p.srcH*p.srcW; - _sizeD = p.dstC*p.dstH*p.dstW; -#ifdef SIMD_SYNET_CONVOLUTION_NHWC_DIRECT_OLD - _old.enable = (p.srcC <= 3 && p.IsDilation(1)); - _old.convolution = NULL; -#endif - } - - size_t SynetConvolution32fNhwcDirect::InternalBufferSize() const - { - size_t size = _buffer.size + _rWeight.size + _rBias.size + _rParams.size; -#ifdef SIMD_SYNET_CONVOLUTION_NHWC_DIRECT_OLD - size += _old.weight.size; -#endif 
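The SynetConvolution32fDepthwiseDotProduct specialization above exploits a degenerate geometry: with zero padding, unit stride and dilation, group == srcC == dstC, and a kernel that exactly covers the source plane (srcH == kernelY, srcW == kernelX), each output element collapses to a single per-channel dot product. A minimal standalone sketch of that case (illustrative names, not the library's API):

    // Reference for the case checked by SynetConvolution32fDepthwiseDotProduct::Preferable:
    // one dot product of kernelY*kernelX terms per channel, plus optional bias.
    static void DepthwiseDotProductRef(const float* src, const float* weight,
        const float* bias, size_t channels, size_t size, float* dst)
    {
        for (size_t c = 0; c < channels; ++c)
        {
            float sum = bias ? bias[c] : 0.0f;
            for (size_t i = 0; i < size; ++i) // size = srcH * srcW = kernelY * kernelX
                sum += src[c * size + i] * weight[c * size + i];
            dst[c] = sum;
        }
    }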
- return size; - } - - void SynetConvolution32fNhwcDirect::SetParams(const float * weight, SimdBool * internal, const float * bias, const float * params) - { - SynetConvolution32f::SetParams(weight, internal, bias, params); -#ifdef SIMD_SYNET_CONVOLUTION_NHWC_DIRECT_OLD - if (_old.enable && _old.weight.data) - { - OldReorderWeight(weight, _old.weight.data); - _weight = _old.weight.data; - if (internal) - *internal = SimdTrue; - } - else -#endif - if (_rWeight.data) - { - ReorderWeight(weight, _rWeight.data); - _weight = _rWeight.data; - if (internal) - *internal = SimdTrue; - } - if (_rBias.data) - { - if (bias) - memcpy(_rBias.data, bias, _param.dstC * sizeof(float)); - _bias = _rBias.data; - } - if (_rParams.data && _param.activation == ::SimdConvolutionActivationPrelu) - { - memcpy(_rParams.data, params, _param.dstC * sizeof(float)); - _params = _rParams.data; - } - } - - void SynetConvolution32fNhwcDirect::Forward(const float * src, float * buf, float * dst) - { - const ConvParam32f & p = _param; - for (size_t b = 0; b < p.batch; ++b) - { -#ifdef SIMD_SYNET_CONVOLUTION_NHWC_DIRECT_OLD - if(_old.enable) - _old.convolution(src, _param, _old.alg, _weight, _bias, _params, dst); - else -#endif - _run.Run(RunArgs(src, _param, _weight, _bias, _params, dst)); - src += _sizeS; - dst += _sizeD; - } - } - - void SynetConvolution32fNhwcDirect::Forward(const float* src, const ConvParam32f& p, const AlgParam& a, const float* weight, const float* bias, const float* params, float* dst) - { - for (size_t dc = 0; dc < p.dstC; dc += a.macroD) - { - size_t macroD = Simd::Min(p.dstC, dc + a.macroD) - dc; - for (size_t sc = 0; sc < p.srcC; sc += a.macroC) - { - size_t macroC = Simd::Min(p.srcC, sc + a.macroC) - sc; - for (size_t yBeg = 0; yBeg < p.dstH;) - { - size_t yEnd = Simd::Min(yBeg + a.macroH, p.dstH); - if (a.macroC == p.srcC) - a.convolutions[TermSingle](src + sc, p, a, macroD, yBeg, yEnd, macroC, weight, bias + dc, params, dst + dc); - else if (sc == 0) - a.convolutions[TermFirst](src + sc, p, a, macroD, yBeg, yEnd, macroC, weight, bias + dc, params, dst + dc); - else if (sc + macroC == p.srcC) - a.convolutions[TermLast](src + sc, p, a, macroD, yBeg, yEnd, macroC, weight, bias + dc, params, dst + dc); - else - a.convolutions[TermIterim](src + sc, p, a, macroD, yBeg, yEnd, macroC, weight, bias + dc, params, dst + dc); - yBeg = yEnd; - } - weight += a.F * macroC; - } - if (p.activation == ::SimdConvolutionActivationPrelu) - params += macroD; - weight += p.kernelY * p.kernelY * p.srcC * macroD - p.srcC * a.F; - } - } - - void SynetConvolution32fNhwcDirect::SetAlgParam(size_t F, size_t N, AlgParam & alg) - { - const ConvParam32f& p = _param; - alg.F = F; - alg.microD = F*N; - alg.macroC = Simd::Min(Base::AlgCacheL1() / sizeof(float) / p.kernelY / p.kernelX / alg.microD, p.srcC); - for (size_t macroH = p.dstH; macroH >= 1; macroH--) - { - alg.macroH = macroH; - if (alg.macroC * p.srcW * (alg.macroH * p.strideY + p.kernelY * p.dilationY - 1) * sizeof(float) <= Base::AlgCacheL2()) - break; - } - alg.macroD = Simd::Min(AlignLoAny(Base::AlgCacheL3() / sizeof(float) / p.kernelY / p.kernelX / alg.macroC, alg.microD), AlignHiAny(p.dstC, alg.microD)); - alg.stepW = p.kernelY * p.kernelX * p.srcC * alg.F; - _rWeight.Resize(DivHi(p.dstC, alg.F)*alg.stepW); - _rBias.Resize(AlignHiAny(p.dstC, alg.F), true); - if (p.activation == ::SimdConvolutionActivationPrelu) - _rParams.Resize(AlignHiAny(p.dstC, alg.F)); - } - - void SynetConvolution32fNhwcDirect::ReorderWeight(const float* src, float* dst) - { - const 
ConvParam32f& p = _param; - const AlgParam & a = _run.At(0).alg; - for (size_t dc = 0; dc < p.dstC; dc += a.F) - { - size_t F = Simd::Min(p.dstC, dc + a.F) - dc; - const float* psrc = src; - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - for (size_t kx = 0; kx < p.kernelX; ++kx) - { - for (size_t sc = 0; sc < p.srcC; ++sc) - { - size_t f = 0; - for (; f < F; ++f) - *(dst++) = psrc[f]; - for (; f < a.F; ++f) - *(dst++) = 0.0f; - psrc += p.dstC; - } - } - } - src += F; - } - } - -#ifdef SIMD_SYNET_CONVOLUTION_NHWC_DIRECT_OLD - void SynetConvolution32fNhwcDirect::OldSetAlgParam(size_t F) - { - const ConvParam32f& p = _param; - AlgParam & a = _old.alg; - a.F = F; - a.microD = a.F*2; - a.macroC = Simd::Min(Base::AlgCacheL1() / sizeof(float) / p.kernelY / p.kernelX / a.microD, p.srcC); - for (size_t macroH = p.dstH; macroH >= 1; macroH--) - { - a.macroH = macroH; - if (a.macroC * p.srcW * (a.macroH * p.strideY + p.kernelY * p.dilationY - 1) * sizeof(float) <= Base::AlgCacheL2()) - break; - } - a.macroD = Simd::Min(AlignLoAny(Base::AlgCacheL3() / sizeof(float) / p.kernelY / p.kernelX / a.macroC, a.microD), AlignHiAny(p.dstC, a.microD)); - _old.weight.Resize(AlignHiAny(p.dstC, a.microD) * p.kernelY * p.kernelX * p.srcC); - _rBias.Resize(AlignHiAny(p.dstC, a.microD), true); - if (p.activation == ::SimdConvolutionActivationPrelu) - _rParams.Resize(AlignHiAny(p.dstC, a.microD)); - } - - void SynetConvolution32fNhwcDirect::OldReorderWeight(const float* src, float* dst) - { - const ConvParam32f& p = _param; - const AlgParam& a = _old.alg; - for (size_t da = 0; da < p.dstC; da += a.macroD) - { - size_t macroD = Simd::Min(p.dstC, da + a.macroD) - da; - for (size_t sa = 0; sa < p.srcC; sa += a.macroC) - { - size_t macroC = Simd::Min(p.srcC, sa + a.macroC) - sa; - for (size_t di = 0; di < macroD; di += a.microD) - { - size_t microD = Simd::Min(macroD, di + a.microD) - di; - for (size_t ky = 0; ky < p.kernelY; ky++) - { - for (size_t kx = 0; kx < p.kernelX; kx++) - { - for (size_t si = 0; si < macroC; si++) - { - const float* s = src + ((ky * p.kernelX + kx) * p.srcC + sa + si) * p.dstC + da + di; - size_t i = 0; - for (; i < microD; i++) - dst[i] = s[i]; - for (; i < a.microD; i++) - dst[i] = 0; - dst += a.microD; - } - } - } - } - } - } - } -#endif - - bool SynetConvolution32fNhwcDirect::Preferable(const ConvParam32f & p) - { - return false; - } - - //--------------------------------------------------------------------- - -//#define SIMD_BASE_ONLY_GEMM_NN - - void * SynetConvolution32fInit(size_t batch, const SimdConvolutionParameters * conv, SimdGemm32fNNPtr gemm) - { - ConvParam32f param(batch, conv, gemm); - if (!param.Valid()) - return NULL; -#if !defined(SIMD_BASE_ONLY_GEMM_NN) - else if (SynetConvolution32fDepthwiseDotProduct::Preferable(param)) - return new SynetConvolution32fDepthwiseDotProduct(param); - else if(SynetConvolution32fWinograd::Preferable(param)) - return new SynetConvolution32fWinograd(param); - else if (SynetConvolution32fGemmNT::Preferable(param)) - return new SynetConvolution32fGemmNT(param); - else if (SynetConvolution32fDirectNchw::Preferable(param)) - return new SynetConvolution32fDirectNchw(param); - else if (SynetConvolution32fNhwcDirect::Preferable(param)) - return new SynetConvolution32fNhwcDirect(param); - else if (SynetConvolution32fDirectNhwc::Preferable(param)) - return new SynetConvolution32fDirectNhwc(param); -#endif - else - return new SynetConvolution32fGemmNN(param); - } - } -} diff --git a/src/3rd/Simd/Simd/SimdBaseSynetConvolution8i.cpp 
b/src/3rd/Simd/Simd/SimdBaseSynetConvolution8i.cpp deleted file mode 100644 index 87bd39cb..00000000 --- a/src/3rd/Simd/Simd/SimdBaseSynetConvolution8i.cpp +++ /dev/null @@ -1,762 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdSynetConvolution8i.h" -#include "Simd/SimdSynet.h" -#include "Simd/SimdMath.h" -#include "Simd/SimdBase.h" -#include "Simd/SimdCpu.h" -#include "Simd/SimdLog.h" - -namespace Simd -{ - SynetConvolution8i::SynetConvolution8i(const ConvParam8i& p) - : _param(p) -#if defined(SIMD_PERFORMANCE_STATISTIC) - , _perf(NULL) -#endif - { - _sizeS = p.srcC * p.srcH * p.srcW; - _sizeD = p.dstC * p.dstH * p.dstW; - _merge = 1; - _src8u = p.srcT == SimdTensorData8u; - _dst8u = p.dstT == SimdTensorData8u; - _overflow16i = (p.compatibility & SimdSynetCompatibilityOverflow16i) != 0; - _weight8i.Resize(p.kernelY * p.kernelX * p.srcC / p.group * p.dstC); - _norm32i.Resize(2 * p.dstC); - _norm32f.Resize(2 * p.dstC); - _convertSrc = Base::SynetConvert32fTo8u; - } - - size_t SynetConvolution8i::ExternalBufferSize() const - { - size_t size = SIMD_ALIGN; - if (!_src8u) - size += AlignHi(_sizeS * _merge * sizeof(uint8_t), SIMD_ALIGN); - return size; - } - - size_t SynetConvolution8i::InternalBufferSize() const - { - return _buffer.size * sizeof(uint8_t) + _srcCvt.Size() + _dstCvt.Size() + - _weight8i.size * sizeof(int8_t) + _norm32i.size * sizeof(int32_t) + _norm32f.size * sizeof(float); - } - - void SynetConvolution8i::SetParams(const float* weight, const float* bias, const float* params, const float* const* stats) - { - const ConvParam8i& p = _param; - _srcCvt.Init(stats[0], stats[1], p.srcC); - _dstCvt.Init(stats[2], stats[3], p.dstC); - size_t G = p.group, D = p.dstC / G, C = p.srcC / G, K = p.kernelY * p.kernelX, CK = C * K, GD = G * D; - Array32f normW(CK); - const float* pSrcW = weight; - const float* pSrcB = bias; - const float* pSrcScaleInv = _srcCvt.iScale.data; - const float* pSrcScale = _srcCvt.scale.data; - const float* pSrcShift = _srcCvt.shift.data; - const float* pDstScale = _dstCvt.iScale.data; - const float* pDstScaleInv = _dstCvt.scale.data; - const float* pDstShift = _dstCvt.iShift.data; - float* pNormW = normW.data; - int8_t* pDstW = _weight8i.data; - int32_t* pDstS = _norm32i.data; - int32_t* pDstB = pDstS + p.dstC; - float* pNormScale = _norm32f.data; - float* pNormShift = pNormScale + p.dstC; - for (size_t g = 0; g < G; ++g) - { - 
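The per-channel loop that follows implements symmetric int8 weight quantization: weights are first normalized by the source scale, the channel's largest magnitude is mapped to 127, and the float bias is folded into an int32 shift. A simplified sketch of the core scale/round/clamp step (hypothetical helper; bias folding and the overflow16i path are omitted):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // scale = 127 / max|w|; w8[i] = round(w[i] * scale), clamped to [-128, 127].
    static float QuantizeChannelWeights(const float* w, size_t n, int8_t* w8)
    {
        float maxAbs = 1e-12f; // guard against an all-zero channel
        for (size_t i = 0; i < n; ++i)
            maxAbs = std::max(maxAbs, std::abs(w[i]));
        float scale = 127.0f / maxAbs;
        for (size_t i = 0; i < n; ++i)
        {
            int v = (int)std::nearbyint(w[i] * scale);
            w8[i] = (int8_t)std::min(127, std::max(-128, v));
        }
        return scale; // kept so int32 outputs can be rescaled back to float
    }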
for (size_t d = 0; d < D; ++d) - { - float normB = 0, minW = FLT_MAX, maxW = -FLT_MAX, scale = 1.0f; - if (p.trans) - { - for (size_t k = 0, kc = 0; k < K; ++k) - for (size_t c = 0; c < C; ++c, ++kc) - { - pNormW[kc] = pSrcW[kc * GD + d] * pSrcScaleInv[c]; - minW = Simd::Min(minW, pNormW[kc]); - maxW = Simd::Max(maxW, pNormW[kc]); - } - float abs = Simd::Max(::abs(maxW), ::abs(minW)); - if(pSrcB) - abs = Simd::Max(abs, ::abs(pSrcB[d]) / float(128 * 256 * 256)); - scale = 127.0f / abs; - for (size_t k = 0, kc = 0; k < K; ++k) - for (size_t c = 0; c < C; ++c, ++kc) - if (_srcCvt.neg && (p.compatibility & SimdSynetCompatibilityOverflow16i)) - { - int w = Base::SynetConvert32fTo8i(pNormW[kc], scale, 0.0f); - if (w & 1) - w = Round(w * 0.25f) * 4; - pDstW[kc * GD + d] = w / 2; - normB -= w * pSrcShift[c]; - } - else - { - pDstW[kc * GD + d] = Base::SynetConvert32fTo8i(pNormW[kc], scale, 0.0f); - normB -= pDstW[kc * GD + d] * pSrcShift[c]; - } - } - else - { - for (size_t c = 0, ck = 0; c < C; ++c) - for (size_t k = 0; k < K; ++k, ++ck) - { - pNormW[ck] = pSrcW[d * CK + ck] * pSrcScaleInv[c]; - minW = Simd::Min(minW, pNormW[ck]); - maxW = Simd::Max(maxW, pNormW[ck]); - } - float abs = Simd::Max(::abs(maxW), ::abs(minW)); - if (pSrcB) - abs = Simd::Max(abs, ::abs(pSrcB[d]) / float(128 * 256 * 256)); - scale = 127.0f / abs; - for (size_t c = 0, ck = 0; c < C; ++c) - for (size_t k = 0; k < K; ++k, ++ck) - if (_srcCvt.neg && (p.compatibility & SimdSynetCompatibilityOverflow16i)) - { - int w = Base::SynetConvert32fTo8i(pNormW[ck], scale, 0.0f); - if (w & 1) - w = Round(w * 0.25f) * 4; - pDstW[d * CK + ck] = w / 2; - normB -= w * pSrcShift[c]; - } - else - { - pDstW[d * CK + ck] = Base::SynetConvert32fTo8i(pNormW[ck], scale, 0.0f); - normB -= pDstW[d * CK + ck] * pSrcShift[c]; - } - } - pDstS[d] = _srcCvt.neg && (p.compatibility & SimdSynetCompatibilityOverflow16i) ? 2 : 1; - if (pSrcB) - normB += pSrcB[d] * scale; - pDstB[d] = Round(normB); - if (_dst8u) - { - pNormScale[d] = (1.0f / scale) * pDstScaleInv[d]; - pNormShift[d] = -pDstShift[d] / pDstScale[d]; - } - else - { - pNormScale[d] = 1.0f / scale; - pNormShift[d] = 0; - } - } - if (p.trans) - { - pSrcW += D; - pDstW += D; - } - else - { - pSrcW += CK * D; - pDstW += CK * D; - } - if(pSrcB) - pSrcB += D; - pDstB += D; - pDstS += D; - pSrcScale += C; - pSrcScaleInv += C; - pSrcShift += C; - pDstScale += D; - pDstScaleInv += D; - pDstShift += D; - pNormScale += D; - pNormShift += D; - } - } - - void SynetConvolution8i::Forward(const uint8_t* src, uint8_t* buf, uint8_t* dst) - { - if (buf == NULL) - { - _buffer.Resize(ExternalBufferSize()); - buf = _buffer.data; - } - const ConvParam8i& p = _param; - uint8_t* src8u = _src8u ? NULL : Allocate(buf, _sizeS * _merge); - for (size_t b = 0; b < p.batch; b += _merge) - { - if (!_src8u) - _convertSrc((float*)src + b * _sizeS, _merge, p.srcC, p.srcH, p.srcW, p.srcF, _srcCvt.scale.data, _srcCvt.shift.data, src8u, p.compatibility); - Forward8u(_src8u ? src + b * _sizeS : src8u, buf, dst + b * (_dst8u ? 
sizeof(uint8_t) : sizeof(float))); - } - } - -#if defined(SIMD_PERFORMANCE_STATISTIC) - Base::PerformanceMeasurer * SynetConvolution8i::Perf(const String& func) - { - if (_perf == NULL) - _perf = Simd::Base::PerformanceMeasurerStorage::s_storage.Get(func, Param().Info() + " " + Desc(), Param().Flop()); - return _perf; - } -#endif - - //------------------------------------------------------------------------- - - namespace Base - { - template<class S, class D, class F> SIMD_INLINE D Convert(S value, F scale, F shift) - { - return (D)(F(value) * scale + shift); - } - - template<> SIMD_INLINE uint8_t Convert<int32_t, uint8_t, float>(int32_t value, float scale, float shift) - { - return (uint8_t)Simd::RestrictRange(Round(float(value) * scale + shift), 0, 255); - } - - template<> SIMD_INLINE uint8_t Convert<float, uint8_t, float>(float value, float scale, float shift) - { - return (uint8_t)Simd::RestrictRange(Round(value * scale + shift), 0, 255); - } - - template<> SIMD_INLINE int8_t Convert<float, int8_t, float>(float value, float scale, float shift) - { - return (int8_t)Simd::RestrictRange(Round(value * scale + shift), -128, 127); - } - - template<class S, class D, class F> void Convert(const S * src, size_t batch, size_t channels, size_t height, size_t width, SimdTensorFormatType format, const F* scale, const F* shift, D * dst) - { - for (size_t b = 0; b < batch; ++b) - { - if (format == SimdTensorFormatNchw) - { - for (size_t c = 0; c < channels; ++c) - { - F _scale = scale[c]; - F _shift = shift[c]; - for (size_t h = 0; h < height; ++h) - { - for (size_t w = 0; w < width; ++w) - dst[w] = Convert<S, D, F>(src[w], _scale, _shift); - src += width; - dst += width; - } - } - } - else if (format == SimdTensorFormatNhwc) - { - for (size_t h = 0; h < height; ++h) - { - for (size_t w = 0; w < width; ++w) - { - for (size_t c = 0; c < channels; ++c) - dst[c] = Convert<S, D, F>(src[c], scale[c], shift[c]); - src += channels; - dst += channels; - } - } - } - else - assert(0); - } - } - - SynetConvolution8iGemmNN::SynetConvolution8iGemmNN(const ConvParam8i& p) - : SynetConvolution8i(p) - { - if (p.IsDilation(1) && p.IsStride(1) && p.IsPad(0)) - { - _skipConv = p.IsKernel(1) || (p.srcH == p.kernelY && p.srcW == p.kernelX); - } - else - _skipConv = false; - _sizeB = p.srcC * p.kernelY * p.kernelX * p.dstH * p.dstW; - if (p.trans) - { - _ldS = p.srcC * p.kernelY * p.kernelX / p.group * (_skipConv ? p.group : 1); - _ldW = p.dstC; - _ldD = p.dstC; - _grW = p.dstC / p.group; - _grS = p.srcC * p.kernelY * p.kernelX / p.group * (_skipConv ? 
1 : p.dstH * p.dstW); - _grD = p.dstC / p.group; - } - else - { - _ldW = p.srcC * p.kernelY * p.kernelX / p.group; - _ldS = p.dstH * p.dstW; - _ldD = p.dstH * p.dstW; - _grW = p.dstC / p.group * p.srcC * p.kernelY * p.kernelX / p.group; - _grS = p.srcC * p.kernelY * p.kernelX / p.group * p.dstH * p.dstW; - _grD = p.dstH * p.dstW *p.dstC / p.group; - } - _siK = p.kernelY * p.kernelX; - _siC = p.srcC / p.group; - _siD = p.dstC / p.group; - _siS = p.dstH * p.dstW; - } - - size_t SynetConvolution8iGemmNN::ExternalBufferSize() const - { - size_t size = SynetConvolution8i::ExternalBufferSize(); - size += AlignHi(_sizeD * _merge * sizeof(int32_t), SIMD_ALIGN); - if(!_skipConv) - size += AlignHi(_sizeB * _merge * sizeof(uint8_t), SIMD_ALIGN); - return size; - } - - template<class T> void Relu(T* data, size_t size) - { - for (size_t i = 0; i < size; ++i) - data[i] = Simd::Max(data[i], T(0)); - } - - void SynetConvolution8iGemmNN::Forward8u(const uint8_t* src, uint8_t* buf, uint8_t* dst) - { - const ConvParam8i& p = _param; - const int8_t * weight = _weight8i.data; - int32_t * sum = Allocate<int32_t>(buf, _sizeD * _merge); - if (!_skipConv) - { - if(p.trans) - for (size_t m = 0; m < _merge; ++m) - ImgToRow(src + m * _sizeS, buf + m * _sizeB); - else - for (size_t m = 0; m < _merge; ++m) - ImgToCol(src + m * _sizeS, buf + m * _sizeB); - src = buf; - } - if (_merge > 1) - { - assert(0); - } - else - { - for (size_t g = 0; g < p.group; ++g) - { - if (p.trans) - GemmNhwc(_siS, _siD, _siK, _siC, src + _grS * g, _ldS, weight + _grW * g, _ldW, sum + _grD * g, _ldD); - else - GemmNchw(_siD, _siS, _siC, _siK, weight + _grW * g, _ldW, src + _grS * g, _ldS, sum + _grD * g, _ldD); - } - } - Convert<int32_t, int32_t, int32_t>(sum, _merge, p.dstC, p.dstH, p.dstW, p.dstF, _norm32i.data, _norm32i.data + p.dstC, sum); - switch (p.activation) - { - case SimdConvolutionActivationIdentity: - break; - case SimdConvolutionActivationRelu: - Relu(sum, _sizeD * _merge); - break; - default: - assert(0); - } - if (_dst8u) - Convert<int32_t, uint8_t, float>(sum, _merge, p.dstC, p.dstH, p.dstW, p.dstF, _norm32f.data, _norm32f.data + p.dstC, dst); - else - Convert<int32_t, float, float>(sum, _merge, p.dstC, p.dstH, p.dstW, p.dstF, _norm32f.data, _norm32f.data + p.dstC, (float*)dst); - } - - void SynetConvolution8iGemmNN::ImgToCol(const uint8_t* src, uint8_t* dst) - { - const ConvParam8i& p = _param; - assert(!p.trans); - size_t srcSize = p.srcW * p.srcH; - const uint8_t* zero = _srcCvt.zero.data; - if (p.IsDilation(1) && p.IsStride(2) && p.IsPad(0) && p.IsKernel(1)) - { - for (size_t channel = 0; channel < p.srcC; ++channel) - { - for (size_t dy = 0; dy < p.dstH; ++dy) - { - const uint8_t * psrc = src + 2 * dy * p.srcW; - for (size_t dx = 0, sx = 0; dx < p.dstW; ++dx, sx += 2) - *(dst++) = psrc[sx]; - } - src += srcSize; - } - } - else if (p.IsDilation(1) && p.IsStride(1)) - { - const ptrdiff_t bodySize = p.dstW - p.padX - p.padW; - for (size_t channel = 0; channel < p.srcC; ++channel) - { - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - for (size_t kx = 0; kx < p.kernelX; ++kx) - { - size_t sy = ky - p.padY; - for (size_t dy = 0; dy < p.dstH; ++dy, ++sy) - { - if (sy < p.srcH) - { - size_t sx = kx - p.padX, dx = 0; - const uint8_t * psrc = src + sy * p.srcW; - for (; dx < p.padX; ++dx, ++sx) - { - if (sx < p.srcW) - *(dst++) = psrc[sx]; - else - *(dst++) = zero[channel]; - } - if (bodySize > 0) - { - memcpy(dst, psrc + sx, bodySize * sizeof(uint8_t)); - dst += bodySize; - dx += bodySize; - sx += bodySize; - } - for (; dx < p.dstW; ++dx, ++sx) - { - if (sx < p.srcW) - *(dst++) = psrc[sx]; - else - *(dst++) = 
zero[channel]; - } - } - else - { - for (size_t dx = 0; dx < p.dstW; ++dx) - *(dst++) = zero[channel]; - } - } - } - } - src += srcSize; - } - } - else - { - for (size_t channel = 0; channel < p.srcC; ++channel) - { - for (size_t ky = 0; ky < p.kernelY; ky++) - { - for (size_t kx = 0; kx < p.kernelX; kx++) - { - size_t sy = ky * p.dilationY - p.padY; - for (size_t dy = 0; dy < p.dstH; ++dy) - { - if (sy < p.srcH) - { - size_t sx = kx * p.dilationX - p.padX; - for (size_t dx = 0; dx < p.dstW; ++dx) - { - if (sx < p.srcW) - *(dst++) = src[sy * p.srcW + sx]; - else - *(dst++) = zero[channel]; - sx += p.strideX; - } - } - else - { - for (size_t dx = 0; dx < p.dstW; ++dx) - *(dst++) = zero[channel]; - } - sy += p.strideY; - } - } - } - src += srcSize; - } - } - } - - void SynetConvolution8iGemmNN::ImgToRow(const uint8_t* src, uint8_t* dst) - { - const ConvParam8i& p = _param; - assert(p.trans); - size_t size = p.srcC / p.group; - const uint8_t* zero = _srcCvt.zero.data; - for (size_t g = 0; g < p.group; ++g) - { - for (size_t dy = 0; dy < p.dstH; ++dy) - { - for (size_t dx = 0; dx < p.dstW; ++dx) - { - for (size_t ky = 0; ky < p.kernelY; ky++) - { - size_t sy = dy * p.strideY + ky * p.dilationY - p.padY; - if (sy < p.srcH) - { - for (size_t kx = 0; kx < p.kernelX; kx++) - { - size_t sx = dx * p.strideX + kx * p.dilationX - p.padX; - if (sx < p.srcW) - { - memcpy(dst, src + (sy * p.srcW + sx) * p.srcC, size * sizeof(uint8_t)); - dst += size; - } - else - { - memcpy(dst, zero, size * sizeof(uint8_t)); - dst += size; - } - } - } - else - { - for (size_t kx = 0; kx < p.kernelX; kx++) - { - memcpy(dst, zero, size * sizeof(uint8_t)); - dst += size; - } - } - } - } - } - src += size; - zero += size; - } - } - - void SynetConvolution8iGemmNN::GemmNchw(size_t D, size_t S, size_t C, size_t K, const int8_t* wgt, size_t ldw, const uint8_t* src, size_t lds, int32_t* dst, size_t ldd) - { - const size_t C2 = _overflow16i ? AlignLo(C, 2) : 0; - for (size_t i = 0; i < D; ++i) - { - for (size_t j = 0; j < S; ++j) - dst[j] = 0; - size_t c = 0; - for (; c < C2; c += 2) - { - for (size_t k = 0; k < K; k++) - { - int32_t w0 = wgt[(c + 0) * K + k]; - int32_t w1 = wgt[(c + 1) * K + k]; - const uint8_t* s0 = src + ((c + 0) * K + k) * lds; - const uint8_t* s1 = src + ((c + 1) * K + k) * lds; - for (size_t j = 0; j < S; ++j) - dst[j] += Simd::RestrictRange(s0[j] * w0 + s1[j] * w1, SHRT_MIN, SHRT_MAX); - } - } - for (; c < C; ++c) - { - for (size_t k = 0; k < K; k++) - { - int32_t w0 = wgt[(c + 0) * K + k]; - const uint8_t* s0 = src + ((c + 0) * K + k) * lds; - for (size_t j = 0; j < S; ++j) - dst[j] += s0[j] * w0; - } - } - wgt += ldw; - dst += ldd; - } - } - - void SynetConvolution8iGemmNN::GemmNhwc(size_t S, size_t D, size_t K, size_t C, const uint8_t* src, size_t lds, const int8_t* wgt, size_t ldw, int32_t* dst, size_t ldd) - { - const size_t C2 = _overflow16i ? 
AlignLo(C, 2) : 0; - for (size_t i = 0; i < S; ++i) - { - for (size_t j = 0; j < D; ++j) - dst[j] = 0; - for (size_t k = 0, o = 0; k < K; k++) - { - size_t c = 0; - for (; c < C2; c += 2, o += 2) - { - int32_t s0 = src[o + 0]; - int32_t s1 = src[o + 1]; - const int8_t* w0 = wgt + (o + 0) * ldw; - const int8_t* w1 = wgt + (o + 1) * ldw; - for (size_t j = 0; j < D; ++j) - dst[j] += Simd::RestrictRange(s0 * w0[j] + s1 * w1[j], SHRT_MIN, SHRT_MAX); - } - for (; c < C; ++c, ++o) - { - int32_t s0 = src[o]; - const int8_t* w0 = wgt + o * ldw; - for (size_t j = 0; j < D; ++j) - dst[j] += s0 * w0[j]; - } - } - src += lds; - dst += ldd; - } - } - - //--------------------------------------------------------------------- - - SynetConvolution8iNhwcDirect::SynetConvolution8iNhwcDirect(const ConvParam8i& p) - : SynetConvolution8i(p) - { - for (size_t i = 0; i < Term8iSize; ++i) - _convolutions[i] = NULL; - } - - String SynetConvolution8iNhwcDirect::Desc() const - { - return Ext() + "::NhwcDirect" + ((_param.compatibility& SimdSynetCompatibilityOverflow16i) ? "-o" : "-e"); - } - - size_t SynetConvolution8iNhwcDirect::InternalBufferSize() const - { - size_t size = SynetConvolution8i::InternalBufferSize(); - return size; - } - - size_t SynetConvolution8iNhwcDirect::ExternalBufferSize() const - { - const ConvParam8i& p = _param; - size_t size = SynetConvolution8i::ExternalBufferSize(); - if (_alg.macroC < p.srcC) - size += AlignHi(_sizeD*sizeof(int32_t), SIMD_ALIGN); - return size; - } - - void SynetConvolution8iNhwcDirect::SetParams(const float* weight, const float* bias, const float* params, const float* const* stats) - { - SynetConvolution8i::SetParams(weight, bias, params, stats); - ReorderWeight(); - _alg.norm = _srcCvt.neg && (_param.compatibility & SimdSynetCompatibilityOverflow16i) ? 2 : 1; - _alg.zero = _srcCvt.neg ? 0x80808080 : 0; - } - - bool SynetConvolution8iNhwcDirect::Preferable(const ConvParam8i& p) - { - return false; - } - - void SynetConvolution8iNhwcDirect::SetAlgParam(size_t F, size_t microD, size_t L1, size_t L2, size_t L3) - { - const ConvParam8i& p = _param; - _alg.F = F; - _alg.microD = microD; - _alg.macroC = Simd::Min(AlignLoAny(L1 / p.kernelY / p.kernelX / microD, 4), p.srcC); - for (size_t macroH = p.dstH; macroH >= 1; macroH--) - { - _alg.macroH = macroH; - if (_alg.macroC * p.srcW * (_alg.macroH * p.strideY + p.kernelY * p.dilationY - 1) <= L2) - break; - } - _alg.macroD = Simd::Min(AlignLoAny(L3 / p.kernelY / p.kernelX / _alg.macroC, _alg.microD), AlignHiAny(p.dstC, _alg.microD)); - _alg.size = (p.dstT == SimdTensorData32f ? 4 : 1); - } - - void SynetConvolution8iNhwcDirect::ReorderWeight() - { - const ConvParam8i& p = _param; - size_t C = DivHi(p.srcC, 4), D = DivHi(p.dstC, _alg.F); - Array8i weight8i(p.kernelY * p.kernelX * C * D * _alg.F * 4); - int8_t* dst = weight8i.data; - for (size_t d = 0; d < D; d++) - { - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - for (size_t kx = 0; kx < p.kernelX; ++kx) - { - for (size_t c = 0; c < C; ++c) - { - const int8_t* src = _weight8i.data + ((ky*p.kernelX + kx)*p.srcC + c*4)*p.dstC + d*_alg.F; - for (size_t f = 0; f < _alg.F; ++f) - { - for (size_t i = 0; i < 4; ++i) - { - if (d * _alg.F + f < p.dstC && c * 4 + i < p.srcC) - *(dst++) = src[i * p.dstC]; - else - *(dst++) = 0; - } - src++; - } - } - } - } - } - _weight8i.Swap(weight8i); - } - - void SynetConvolution8iNhwcDirect::Forward8u(const uint8_t* src, uint8_t* buf, uint8_t* dst) - { - const ConvParam8i& p = _param; - int32_t* sum = _alg.macroC < p.srcC ? 
Allocate(buf, _sizeD) : NULL; - for (size_t m = 0; m < _merge; ++m) - { - Forward8u(src, sum, dst); - src += _sizeS; - dst += _sizeD*(_dst8u ? sizeof(uint8_t) : sizeof(float)); - } - } - - void SynetConvolution8iNhwcDirect::Forward8u(const uint8_t* src, int32_t* buf, uint8_t* dst) - { - const ConvParam8i& p = _param; - const int8_t* weight = _weight8i.data; - const int32_t* bias = _norm32i.data + p.dstC; - const int32_t* params = NULL; - const float* scale = _norm32f.data; - const float* shift = _norm32f.data + p.dstC; - for (size_t dc = 0; dc < p.dstC; dc += _alg.macroD) - { - size_t macroD = Simd::Min(p.dstC, dc + _alg.macroD) - dc; - for (size_t sc = 0; sc < p.srcC; sc += _alg.macroC) - { - size_t macroC = Simd::Min(p.srcC, sc + _alg.macroC) - sc; - for (size_t yBeg = 0; yBeg < p.dstH;) - { - size_t yEnd = Simd::Min(yBeg + _alg.macroH, p.dstH); - if (_alg.macroC == p.srcC) - { - if (_alg.size == 1) - _convolutions[Term8iSingle8u](src + sc, p, _alg, macroD, yBeg, yEnd, macroC, weight, bias, params, scale, shift, buf, dst); - else - _convolutions[Term8iSingle32f](src + sc, p, _alg, macroD, yBeg, yEnd, macroC, weight, bias, params, scale, shift, buf, dst); - } - else if (sc == 0) - _convolutions[Term8iFirst](src + sc, p, _alg, macroD, yBeg, yEnd, macroC, weight, bias, params, scale, shift, buf, dst); - else if (sc + macroC == p.srcC) - { - if (_alg.size == 1) - _convolutions[Term8iLast8u](src + sc, p, _alg, macroD, yBeg, yEnd, macroC, weight, bias, params, scale, shift, buf, dst); - else - _convolutions[Term8iLast32f](src + sc, p, _alg, macroD, yBeg, yEnd, macroC, weight, bias, params, scale, shift, buf, dst); - } - else - _convolutions[Term8iIterim](src + sc, p, _alg, macroD, yBeg, yEnd, macroC, weight, bias, params, scale, shift, buf, dst); - yBeg = yEnd; - } - weight += DivHi(macroC, 4) * _alg.F * 4; - } - weight += p.kernelY * p.kernelX * DivHi(p.srcC, 4) * macroD * 4 - DivHi(p.srcC, 4) * _alg.F * 4; - bias += _alg.macroD; - //if (type == ::SimdConvolutionActivationPrelu) - // params += macroD; - shift += _alg.macroD; - scale += _alg.macroD; - if (buf) - buf += _alg.macroD; - dst += _alg.macroD * _alg.size; - } - } - - //--------------------------------------------------------------------- - -//#define SIMD_BASE_ONLY_GEMM_NN - - void * SynetConvolution8iInit(size_t batch, const SimdConvolutionParameters * conv, SimdSynetCompatibilityType compatibility) - { - ConvParam8i param(batch, conv, compatibility); - if (!param.Valid()) - return NULL; -#if !defined(SIMD_BASE_ONLY_GEMM_NN) - else if (SynetConvolution8iNhwcDirect::Preferable(param)) - return new SynetConvolution8iNhwcDirect(param); -#endif - else - return new SynetConvolution8iGemmNN(param); - } - } -} diff --git a/src/3rd/Simd/Simd/SimdBaseSynetDeconvolution32f.cpp b/src/3rd/Simd/Simd/SimdBaseSynetDeconvolution32f.cpp deleted file mode 100644 index 6d8301c4..00000000 --- a/src/3rd/Simd/Simd/SimdBaseSynetDeconvolution32f.cpp +++ /dev/null @@ -1,402 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2019 Yermalayeu Ihar. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdSynetDeconvolution32f.h" -#include "Simd/SimdSynetConvolution32f.h" -#include "Simd/SimdSynetConvolution32fCommon.h" -#include "Simd/SimdSynet.h" -#include "Simd/SimdBase.h" - -namespace Simd -{ -#if defined(SIMD_PERFORMANCE_STATISTIC) - Base::PerformanceMeasurer * SynetDeconvolution32f::Perf(const String& func) - { - if (_perf == NULL) - _perf = Simd::Base::PerformanceMeasurerStorage::s_storage.Get(func, Param().Info() + " " + Desc(), Param().Flop()); - return _perf; - } -#endif - - namespace Base - { - SynetDeconvolution32fGemmNN::SynetDeconvolution32fGemmNN(const DeconvParam32f & p) - : SynetDeconvolution32f(p) - { - _is1x1 = p.Is1x1(); - if (p.trans) - { - assert(p.group == 1); - - _M = p.srcH * p.srcW; - _N = p.kernelY * p.kernelX * p.dstC; - _K = p.srcC; - _ldS = _K; - _ldW = _N; - _ldD = _N; - _grW = 0; - _grS = 0; - _grD = 0; - } - else - { - _M = p.kernelY * p.kernelX * p.dstC / p.group; - _N = p.srcH * p.srcW; - _K = p.srcC / p.group; - _ldW = _K; - _ldS = _N; - _ldD = _N; - _grW = _M * _K; - _grS = _K * _N; - _grD = _M * _N; - } - _batch = p.batch; - _sizeS = p.srcC*p.srcH*p.srcW; - _sizeB = p.dstC*p.kernelY*p.kernelX*p.srcH*p.srcW; - _sizeD = p.dstC*p.dstH*p.dstW; - _merge = 1; - if (p.trans) - { - if (p.group == 1 && _batch > 1) - { - for (size_t merge = 1; merge <= _batch; ++merge) - if (_batch%merge == 0 && _M*merge <= 256) - _merge = merge; - } - } - else - _weightT.Resize(p.srcC * p.kernelY * p.kernelX * p.dstC / p.group); - _gemm.Init(InitGemmFuncs(Base::Gemm32fNN, "Base", p.gemm, "Ext")); - _biasAndActivation = Base::ConvolutionBiasAndActivation; - } - - size_t SynetDeconvolution32fGemmNN::ExternalBufferSize() const - { - if (_is1x1) - return 1; - else - return _sizeB*_merge; - }; - - void SynetDeconvolution32fGemmNN::SetParams(const float * weight, SimdBool * internal, const float * bias, const float * params) - { - Simd::SynetDeconvolution32f::SetParams(weight, internal, bias, params); - if (_nhwcWeight.data) - { - if (_gemmCb.Size()) - _gemmCb.At(0).ReorderB(_M*_merge, _N, _K, weight, _nhwcWeight.data); - else - _nhwcReorderB(_M*_merge, _N, _K, weight, _nhwcWeight.data, GemmKernelAny, NHWC_GEMM_COMPATIBLE); - if (internal) - *internal = SimdTrue; - } - if (_weightT.data) - { - const float * src = weight; - float * dst = _weightT.data; - for (size_t g = 0; g < _param.group; ++g) - { - for (size_t i = 0; i < _M; ++i) - for (size_t k = 0; k < _K; ++k) - dst[i * _K + k] = src[k * _M + i]; - src 
+= _grW; - dst += _grW; - } - if (internal) - *internal = SimdTrue; - } - } - - void SynetDeconvolution32fGemmNN::Forward(const float * src, float * buf, float * dst) - { - const DeconvParam32f & p = _param; - if (!_is1x1) - buf = Buffer(buf); - if (_merge > 1) - { - for (size_t b = 0; b < _batch; b += _merge) - { - float * tmp = _is1x1 ? dst : buf; - if (_nhwcWeight.data) - { - if (_gemmCb.Size()) - _gemmCb.Run(GemmCbArgs(_M*_merge, _N, _K, src, _nhwcWeight.data, tmp)); - else - _nhwcRun(_M*_merge, _N, _K, src, _nhwcWeight.data, tmp, GemmKernelAny, NHWC_GEMM_COMPATIBLE); - } - else - _gemm.Run(GemmArgs(_M*_merge, _N, _K, &_1, src, _ldS, _weight, _ldW, &_0, tmp, _ldD)); - if (!_is1x1) - { - for (size_t m = 0; m < _merge; ++m) - RowToImg(tmp + m * _sizeS, dst + m * _sizeB); - } - for (size_t m = 0; m < _merge; ++m) - _biasAndActivation(_bias, p.dstC, p.dstH*p.dstW, p.activation, _params, p.trans, dst + m * _sizeD); - src += _sizeS * _merge; - dst += _sizeD * _merge; - } - } - else - { - for (size_t b = 0; b < _batch; ++b) - { - float * tmp = _is1x1 ? dst : buf; - for (size_t g = 0; g < p.group; ++g) - { - if (p.trans) - { - if (_nhwcWeight.data) - { - if (_gemmCb.Size()) - _gemmCb.Run(GemmCbArgs(_M, _N, _K, src, _nhwcWeight.data, tmp)); - else - _nhwcRun(_M, _N, _K, src, _nhwcWeight.data, tmp, GemmKernelAny, NHWC_GEMM_COMPATIBLE); - } - else - _gemm.Run(GemmArgs(_M, _N, _K, &_1, src + _grS * g, _ldS, _weight + _grW * g, _ldW, &_0, tmp + _grD * g, _ldD)); - } - else - _gemm.Run(GemmArgs(_M, _N, _K, &_1, _weightT.data + _grW * g, _ldW, src + _grS * g, _ldS, &_0, tmp + _grD * g, _ldD)); - } - if (!_is1x1) - { - if (_param.trans) - RowToImg(tmp, dst); - else - ColToImg(tmp, dst); - } - _biasAndActivation(_bias, p.dstC, p.dstH*p.dstW, p.activation, _params, p.trans, dst); - src += _sizeS; - dst += _sizeD; - } - } - } - - void SynetDeconvolution32fGemmNN::ColToImg(const float * src, float * dst) - { - const DeconvParam32f & p = _param; - assert(!p.trans); - size_t dstSize = p.dstW * p.dstH; - for (size_t cd = 0; cd < p.dstC; ++cd) - { - memset(dst, 0, dstSize * sizeof(float)); - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - for (size_t kx = 0; kx < p.kernelX; ++kx) - { - size_t dy = ky * p.dilationY - p.padY; - for (size_t sy = 0; sy < p.srcH; ++sy, dy += p.strideY) - { - if (dy < p.dstH) - { - size_t dx = kx * p.dilationX - p.padX; - for (size_t sx = 0; sx < p.srcW; ++sx, dx += p.strideX) - { - if (dx < p.dstW) - dst[dy * p.dstW + dx] += *src; - src++; - } - } - else - src += p.srcW; - } - } - } - dst += dstSize; - } - } - - void SynetDeconvolution32fGemmNN::RowToImg(const float * src, float * dst) - { - const DeconvParam32f & p = _param; - assert(p.trans && p.group == 1); - if (p.IsPad(0) && p.IsDilation(1) && p.kernelY == p.strideX && p.kernelX == p.strideX) - { - for (size_t sy = 0; sy < p.srcH; ++sy) - { - for (size_t sx = 0; sx < p.srcW; ++sx) - { - size_t dy = sy * p.strideY; - for (size_t ky = 0; ky < p.kernelY; ky++, dy += 1) - { - size_t dx = sx * p.strideX; - for (size_t kx = 0; kx < p.kernelX; kx++, dx += 1) - { - memcpy(dst + (dy * p.dstW + dx)*p.dstC, src, p.dstC * sizeof(float)); - src += p.dstC; - } - } - } - } - } - else - { - for (size_t dy = 0; dy < p.dstH; ++dy) - for (size_t dx = 0; dx < p.dstW; ++dx) - memset(dst + (dy*p.dstW + dx)*p.dstC, 0, p.dstC * sizeof(float)); - for (size_t sy = 0; sy < p.srcH; ++sy) - { - for (size_t sx = 0; sx < p.srcW; ++sx) - { - size_t dy = sy * p.strideY - p.padY; - for (size_t ky = 0; ky < p.kernelY; ky++, dy += p.dilationY) - { - if (dy < 
p.dstH) - { - size_t dx = sx * p.strideX - p.padX; - for (size_t kx = 0; kx < p.kernelX; kx++, dx += p.dilationX) - { - if (dx < p.dstW) - { - float * d = dst + (dy * p.dstW + dx)*p.dstC; - for (size_t dc = 0; dc < p.dstC; ++dc) - d[dc] += src[dc]; - } - src += p.dstC; - } - } - else - src += p.kernelX * p.dstC; - } - } - } - } - } - - //--------------------------------------------------------------------- - - SynetDeconvolution32fNhwcDirect2x2::SynetDeconvolution32fNhwcDirect2x2(const DeconvParam32f & p) - : SynetDeconvolution32f(p) - { - _sizeS = p.srcC*p.srcH*p.srcW; - _sizeD = p.dstC*p.dstH*p.dstW; - _deconvolution = NULL; - } - - void SynetDeconvolution32fNhwcDirect2x2::SetAlgParam(size_t F, size_t L1, size_t L2, size_t L3) - { - const DeconvParam32f & p = _param; - _alg.microD = F; - _alg.macroC = Simd::Min(L1 / sizeof(float) / p.kernelX / _alg.microD, p.srcC); - _alg.macroH = Simd::Min(L2 / sizeof(float) / _alg.macroC / p.srcW, p.srcH); - _alg.macroD = Simd::Min(AlignLoAny(L3 / sizeof(float) / p.kernelY / _alg.macroC, _alg.microD), AlignHiAny(p.dstC, _alg.microD)); - _rWeight.Resize(AlignHiAny(p.dstC, _alg.microD) * p.kernelY * p.kernelX * p.srcC); - _rBias.Resize(AlignHiAny(p.dstC, _alg.microD), true); - if (p.activation == ::SimdConvolutionActivationPrelu) - _rParams.Resize(AlignHiAny(p.dstC, _alg.microD)); - } - - void SynetDeconvolution32fNhwcDirect2x2::ReorderWeight(const float * src, float * dst) - { - const DeconvParam32f & p = _param; - const AlgParam & a = _alg; - for (size_t da = 0; da < p.dstC; da += a.macroD) - { - size_t macroD = Simd::Min(p.dstC, da + a.macroD) - da; - for (size_t sa = 0; sa < p.srcC; sa += a.macroC) - { - size_t macroC = Simd::Min(p.srcC, sa + a.macroC) - sa; - for (size_t di = 0; di < macroD; di += a.microD) - { - size_t microD = Simd::Min(macroD, di + a.microD) - di; - for (size_t ky = 0; ky < p.kernelY; ky++) - { - for (size_t kx = 0; kx < p.kernelX; kx++) - { - for (size_t si = 0; si < macroC; si++) - { - const float * s = src + (((sa + si)*p.kernelY + ky) * p.kernelX + kx) * p.dstC + da + di; - size_t i = 0; - for (; i < microD; i++) - dst[i] = s[i]; - for (; i < a.microD; i++) - dst[i] = 0; - dst += a.microD; - } - } - } - } - } - } - } - - size_t SynetDeconvolution32fNhwcDirect2x2::InternalBufferSize() const - { - return _buffer.size + _rWeight.size + _rBias.size + _rParams.size; - } - - void SynetDeconvolution32fNhwcDirect2x2::SetParams(const float * weight, SimdBool * internal, const float * bias, const float * params) - { - SynetDeconvolution32f::SetParams(weight, internal, bias, params); - if (_rWeight.data) - { - const DeconvParam32f & p = _param; - ReorderWeight(weight, _rWeight.data); - _weight = _rWeight.data; - if (internal) - *internal = SimdTrue; - } - if (_rBias.data) - { - if (bias) - memcpy(_rBias.data, bias, _param.dstC * sizeof(float)); - _bias = _rBias.data; - } - if (_rParams.data && _param.activation == ::SimdConvolutionActivationPrelu) - { - memcpy(_rParams.data, params, _param.dstC * sizeof(float)); - _params = _rParams.data; - } - } - - void SynetDeconvolution32fNhwcDirect2x2::Forward(const float * src, float * buf, float * dst) - { - const DeconvParam32f & p = _param; - for (size_t b = 0; b < p.batch; ++b) - { - _deconvolution(src, _param, _alg, _weight, _bias, _params, dst); - src += _sizeS; - dst += _sizeD; - } - } - - bool SynetDeconvolution32fNhwcDirect2x2::Preferable(const DeconvParam32f & p) - { - return false; - } - - //--------------------------------------------------------------------- - - void * 
SynetDeconvolution32fInit(size_t batch, const SimdConvolutionParameters * conv, SimdGemm32fNNPtr gemm) - { - DeconvParam32f param(batch, conv, gemm); - if (!param.Valid()) - return NULL; - if (SynetDeconvolution32fNhwcDirect2x2::Preferable(param)) - return new SynetDeconvolution32fNhwcDirect2x2(param); - else - return new SynetDeconvolution32fGemmNN(param); - } - } -} diff --git a/src/3rd/Simd/Simd/SimdBaseSynetFused.cpp b/src/3rd/Simd/Simd/SimdBaseSynetFused.cpp deleted file mode 100644 index c8c2fadd..00000000 --- a/src/3rd/Simd/Simd/SimdBaseSynetFused.cpp +++ /dev/null @@ -1,760 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2019 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdSynet.h" - -namespace Simd -{ - namespace Base - { - void SynetFusedLayerForward0Nchw(const float * src, const float * bias, const float * scale, size_t channels, size_t spatial, float * dst) - { - size_t aligned = Simd::AlignLo(spatial, 4); - for (size_t c = 0; c < channels; ++c) - { - float _bias = bias[c]; - float _scale = scale[c]; - size_t s = 0; - for (; s < aligned; s += 4) - { - dst[s + 0] = SynetFusedLayerForward0(src[s + 0] + _bias, _scale); - dst[s + 1] = SynetFusedLayerForward0(src[s + 1] + _bias, _scale); - dst[s + 2] = SynetFusedLayerForward0(src[s + 2] + _bias, _scale); - dst[s + 3] = SynetFusedLayerForward0(src[s + 3] + _bias, _scale); - } - for (; s < spatial; ++s) - dst[s] = SynetFusedLayerForward0(src[s] + _bias, _scale); - src += spatial; - dst += spatial; - } - } - - void SynetFusedLayerForward0Nhwc(const float * src, const float * bias, const float * scale, size_t channels, size_t spatial, float * dst) - { - size_t aligned = Simd::AlignLo(channels, 4); - for (size_t s = 0; s < spatial; ++s) - { - size_t c = 0; - for (; c < aligned; c += 4) - { - dst[c + 0] = SynetFusedLayerForward0(src[c + 0] + bias[c + 0], scale[c + 0]); - dst[c + 1] = SynetFusedLayerForward0(src[c + 1] + bias[c + 1], scale[c + 1]); - dst[c + 2] = SynetFusedLayerForward0(src[c + 2] + bias[c + 2], scale[c + 2]); - dst[c + 3] = SynetFusedLayerForward0(src[c + 3] + bias[c + 3], scale[c + 3]); - } - for (; c < channels; ++c) - dst[c] = SynetFusedLayerForward0(src[c] + bias[c], scale[c]); - src += channels; - dst += channels; - } - } - - template void SynetFusedLayerForward0NchwXc(const float * src, const float * bias, const float * scale, size_t channels, size_t spatial, float * dst) - { - for (size_t c = 0; c < 
channels; c += N) - { - for (size_t s = 0; s < spatial; ++s) - { - for (size_t i = 0; i < N; ++i) - dst[i] = SynetFusedLayerForward0(src[i] + bias[i], scale[i]); - src += N; - dst += N; - } - bias += N; - scale += N; - } - } - - void SynetFusedLayerForward0(const float * src, const float * bias, const float * scale, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format) - { - if (Base::NchwCompatible(channels, spatial, format)) - SynetFusedLayerForward0Nchw(src, bias, scale, channels, spatial, dst); - else if (Base::NhwcCompatible(channels, spatial, format)) - SynetFusedLayerForward0Nhwc(src, bias, scale, channels, spatial, dst); - else if (format == SimdTensorFormatNchw4c) - SynetFusedLayerForward0NchwXc<4>(src, bias, scale, channels, spatial, dst); - else if (format == SimdTensorFormatNchw8c) - SynetFusedLayerForward0NchwXc<8>(src, bias, scale, channels, spatial, dst); - else if (format == SimdTensorFormatNchw16c) - SynetFusedLayerForward0NchwXc<16>(src, bias, scale, channels, spatial, dst); - else - assert(0); - } - - //--------------------------------------------------------------------- - - void SynetFusedLayerForward1Nchw(const float * src, const float * bias0, const float * scale1, const float * bias1, size_t channels, size_t spatial, float * dst) - { - size_t aligned = Simd::AlignLo(spatial, 4); - for (size_t c = 0; c < channels; ++c) - { - float _bias0 = bias0[c]; - float _scale1 = scale1[c]; - float _bias1 = bias1[c]; - size_t s = 0; - for (; s < aligned; s += 4) - { - dst[s + 0] = SynetFusedLayerForward1(src[s + 0] + _bias0, _scale1, _bias1); - dst[s + 1] = SynetFusedLayerForward1(src[s + 1] + _bias0, _scale1, _bias1); - dst[s + 2] = SynetFusedLayerForward1(src[s + 2] + _bias0, _scale1, _bias1); - dst[s + 3] = SynetFusedLayerForward1(src[s + 3] + _bias0, _scale1, _bias1); - } - for (; s < spatial; ++s) - dst[s] = SynetFusedLayerForward1(src[s] + _bias0, _scale1, _bias1); - src += spatial; - dst += spatial; - } - } - - void SynetFusedLayerForward1Nhwc(const float * src, const float * bias0, const float * scale1, const float * bias1, size_t channels, size_t spatial, float * dst) - { - size_t aligned = Simd::AlignLo(channels, 4); - for (size_t s = 0; s < spatial; ++s) - { - size_t c = 0; - for (; c < aligned; c += 4) - { - dst[c + 0] = SynetFusedLayerForward1(src[c + 0] + bias0[c + 0], scale1[c + 0], bias1[c + 0]); - dst[c + 1] = SynetFusedLayerForward1(src[c + 1] + bias0[c + 1], scale1[c + 1], bias1[c + 1]); - dst[c + 2] = SynetFusedLayerForward1(src[c + 2] + bias0[c + 2], scale1[c + 2], bias1[c + 2]); - dst[c + 3] = SynetFusedLayerForward1(src[c + 3] + bias0[c + 3], scale1[c + 3], bias1[c + 3]); - } - for (; c < channels; ++c) - dst[c] = SynetFusedLayerForward1(src[c] + bias0[c], scale1[c], bias1[c]); - src += channels; - dst += channels; - } - } - - template void SynetFusedLayerForward1NchwXc(const float * src, const float * bias0, const float * scale1, const float * bias1, size_t channels, size_t spatial, float * dst) - { - for (size_t c = 0; c < channels; c += N) - { - for (size_t s = 0; s < spatial; ++s) - { - for (size_t i = 0; i < N; ++i) - dst[i] = SynetFusedLayerForward1(src[i] + bias0[i], scale1[i], bias1[i]); - src += N; - dst += N; - } - bias0 += N; - scale1 += N; - bias1 += N; - } - } - - void SynetFusedLayerForward1(const float * src, const float * bias0, const float * scale1, const float * bias1, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format) - { - if (Base::NchwCompatible(channels, spatial, format)) - 
SynetFusedLayerForward1Nchw(src, bias0, scale1, bias1, channels, spatial, dst); - else if (Base::NhwcCompatible(channels, spatial, format)) - SynetFusedLayerForward1Nhwc(src, bias0, scale1, bias1, channels, spatial, dst); - else if (format == SimdTensorFormatNchw4c) - SynetFusedLayerForward1NchwXc<4>(src, bias0, scale1, bias1, channels, spatial, dst); - else if (format == SimdTensorFormatNchw8c) - SynetFusedLayerForward1NchwXc<8>(src, bias0, scale1, bias1, channels, spatial, dst); - else if (format == SimdTensorFormatNchw16c) - SynetFusedLayerForward1NchwXc<16>(src, bias0, scale1, bias1, channels, spatial, dst); - else - assert(0); - } - - //--------------------------------------------------------------------- - - void SynetFusedLayerForward2Nchw(const float * src, const float * scale, const float * bias, size_t channels, size_t spatial, const float * slope, float * dst) - { - float _slope = slope[0]; - size_t aligned = Simd::AlignLo(spatial, 4); - for (size_t c = 0; c < channels; ++c) - { - float _scale = scale[c]; - float _bias = bias[c]; - size_t s = 0; - for (; s < aligned; s += 4) - { - dst[s + 0] = SynetFusedLayerForward2(src[s + 0], _scale, _bias, _slope); - dst[s + 1] = SynetFusedLayerForward2(src[s + 1], _scale, _bias, _slope); - dst[s + 2] = SynetFusedLayerForward2(src[s + 2], _scale, _bias, _slope); - dst[s + 3] = SynetFusedLayerForward2(src[s + 3], _scale, _bias, _slope); - } - for (; s < spatial; ++s) - dst[s] = SynetFusedLayerForward2(src[s], _scale, _bias, _slope); - src += spatial; - dst += spatial; - } - } - - void SynetFusedLayerForward2Nhwc(const float * src, const float * scale, const float * bias, size_t channels, size_t spatial, const float * slope, float * dst) - { - float _slope = slope[0]; - size_t aligned = Simd::AlignLo(channels, 4); - for (size_t s = 0; s < spatial; ++s) - { - size_t c = 0; - for (; c < aligned; c += 4) - { - dst[c + 0] = SynetFusedLayerForward2(src[c + 0], scale[c + 0], bias[c + 0], _slope); - dst[c + 1] = SynetFusedLayerForward2(src[c + 1], scale[c + 1], bias[c + 1], _slope); - dst[c + 2] = SynetFusedLayerForward2(src[c + 2], scale[c + 2], bias[c + 2], _slope); - dst[c + 3] = SynetFusedLayerForward2(src[c + 3], scale[c + 3], bias[c + 3], _slope); - } - for (; c < channels; ++c) - dst[c] = SynetFusedLayerForward2(src[c], scale[c], bias[c], _slope); - src += channels; - dst += channels; - } - } - - template void SynetFusedLayerForward2NchwXc(const float * src, const float * scale, const float * bias, size_t channels, size_t spatial, const float * slope, float * dst) - { - float _slope = slope[0]; - for (size_t c = 0; c < channels; c += N) - { - for (size_t s = 0; s < spatial; ++s) - { - for (size_t i = 0; i < N; ++i) - dst[i] = SynetFusedLayerForward2(src[i], scale[i], bias[i], _slope); - src += N; - dst += N; - } - scale += N; - bias += N; - } - } - - void SynetFusedLayerForward2(const float * src, const float * scale, const float * bias, size_t channels, size_t spatial, const float * slope, float * dst, SimdTensorFormatType format) - { - if (Base::NchwCompatible(channels, spatial, format)) - SynetFusedLayerForward2Nchw(src, scale, bias, channels, spatial, slope, dst); - else if (Base::NhwcCompatible(channels, spatial, format)) - SynetFusedLayerForward2Nhwc(src, scale, bias, channels, spatial, slope, dst); - else if (format == SimdTensorFormatNchw4c) - SynetFusedLayerForward2NchwXc<4>(src, scale, bias, channels, spatial, slope, dst); - else if (format == SimdTensorFormatNchw8c) - SynetFusedLayerForward2NchwXc<8>(src, scale, bias, channels, 
spatial, slope, dst); - else if (format == SimdTensorFormatNchw16c) - SynetFusedLayerForward2NchwXc<16>(src, scale, bias, channels, spatial, slope, dst); - else - assert(0); - } - - //--------------------------------------------------------------------- - - void SynetFusedLayerForward3Nchw(const float * src, const float * bias, const float * scale, size_t channels, size_t spatial, float * dst) - { - size_t aligned = Simd::AlignLo(spatial, 4); - for (size_t c = 0; c < channels; ++c) - { - float _bias = bias[c]; - float _scale = scale[c]; - size_t s = 0; - for (; s < aligned; s += 4) - { - dst[s + 0] = SynetFusedLayerForward3(src[s + 0] + _bias, _scale); - dst[s + 1] = SynetFusedLayerForward3(src[s + 1] + _bias, _scale); - dst[s + 2] = SynetFusedLayerForward3(src[s + 2] + _bias, _scale); - dst[s + 3] = SynetFusedLayerForward3(src[s + 3] + _bias, _scale); - } - for (; s < spatial; ++s) - dst[s] = SynetFusedLayerForward3(src[s] + _bias, _scale); - src += spatial; - dst += spatial; - } - } - - void SynetFusedLayerForward3Nhwc(const float * src, const float * bias, const float * scale, size_t channels, size_t spatial, float * dst) - { - size_t aligned = Simd::AlignLo(channels, 4); - for (size_t s = 0; s < spatial; ++s) - { - size_t c = 0; - for (; c < aligned; c += 4) - { - dst[c + 0] = SynetFusedLayerForward3(src[c + 0] + bias[c + 0], scale[c + 0]); - dst[c + 1] = SynetFusedLayerForward3(src[c + 1] + bias[c + 1], scale[c + 1]); - dst[c + 2] = SynetFusedLayerForward3(src[c + 2] + bias[c + 2], scale[c + 2]); - dst[c + 3] = SynetFusedLayerForward3(src[c + 3] + bias[c + 3], scale[c + 3]); - } - for (; c < channels; ++c) - dst[c] = SynetFusedLayerForward3(src[c] + bias[c], scale[c]); - src += channels; - dst += channels; - } - } - - template void SynetFusedLayerForward3NchwXc(const float * src, const float * bias, const float * scale, size_t channels, size_t spatial, float * dst) - { - for (size_t c = 0; c < channels; c += N) - { - for (size_t s = 0; s < spatial; ++s) - { - for (size_t i = 0; i < N; ++i) - dst[i] = SynetFusedLayerForward3(src[i] + bias[i], scale[i]); - src += N; - dst += N; - } - bias += N; - scale += N; - } - } - - void SynetFusedLayerForward3(const float * src, const float * bias, const float * scale, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format) - { - if (Base::NchwCompatible(channels, spatial, format)) - SynetFusedLayerForward3Nchw(src, bias, scale, channels, spatial, dst); - else if (Base::NhwcCompatible(channels, spatial, format)) - SynetFusedLayerForward3Nhwc(src, bias, scale, channels, spatial, dst); - else if (format == SimdTensorFormatNchw4c) - SynetFusedLayerForward3NchwXc<4>(src, bias, scale, channels, spatial, dst); - else if (format == SimdTensorFormatNchw8c) - SynetFusedLayerForward3NchwXc<8>(src, bias, scale, channels, spatial, dst); - else if (format == SimdTensorFormatNchw16c) - SynetFusedLayerForward3NchwXc<16>(src, bias, scale, channels, spatial, dst); - else - assert(0); - } - - //--------------------------------------------------------------------- - - void SynetFusedLayerForward4Nchw(const float * src, const float * bias0, const float * scale1, const float * bias1, size_t channels, size_t spatial, float * dst0) - { - float _scale1 = scale1[0]; - float _bias1 = bias1[0]; - float * dst1 = dst0 + channels * spatial; - size_t aligned = Simd::AlignLo(spatial, 4); - for (size_t c = 0; c < channels; ++c) - { - float _bias0 = bias0[c]; - size_t s = 0; - for (; s < aligned; s += 4) - { - SynetFusedLayerForward4(src[s + 0], _bias0, _scale1, 
_bias1, dst0 + s + 0, dst1 + s + 0); - SynetFusedLayerForward4(src[s + 1], _bias0, _scale1, _bias1, dst0 + s + 1, dst1 + s + 1); - SynetFusedLayerForward4(src[s + 2], _bias0, _scale1, _bias1, dst0 + s + 2, dst1 + s + 2); - SynetFusedLayerForward4(src[s + 3], _bias0, _scale1, _bias1, dst0 + s + 3, dst1 + s + 3); - } - for (; s < spatial; ++s) - SynetFusedLayerForward4(src[s], _bias0, _scale1, _bias1, dst0 + s, dst1 + s); - src += spatial; - dst0 += spatial; - dst1 += spatial; - } - } - - void SynetFusedLayerForward4Nhwc(const float * src, const float * bias0, const float * scale1, const float * bias1, size_t channels, size_t spatial, float * dst0) - { - float _scale1 = scale1[0]; - float _bias1 = bias1[0]; - float * dst1 = dst0 + channels; - size_t aligned = Simd::AlignLo(channels, 4); - for (size_t s = 0; s < spatial; ++s) - { - size_t c = 0; - for (; c < aligned; c += 4) - { - SynetFusedLayerForward4(src[c + 0], bias0[c + 0], _scale1, _bias1, dst0 + c + 0, dst1 + c + 0); - SynetFusedLayerForward4(src[c + 1], bias0[c + 1], _scale1, _bias1, dst0 + c + 1, dst1 + c + 1); - SynetFusedLayerForward4(src[c + 2], bias0[c + 2], _scale1, _bias1, dst0 + c + 2, dst1 + c + 2); - SynetFusedLayerForward4(src[c + 3], bias0[c + 3], _scale1, _bias1, dst0 + c + 3, dst1 + c + 3); - } - for (; c < channels; ++c) - SynetFusedLayerForward4(src[c], bias0[c], _scale1, _bias1, dst0 + c, dst1 + c); - src += channels; - dst0 += 2 * channels; - dst1 += 2 * channels; - } - } - - template void SynetFusedLayerForward4NchwXcA(const float * src, const float * bias0, const float * scale1, const float * bias1, size_t channels, size_t spatial, float * dst0) - { - assert(Aligned(channels, N)); - float _scale1 = scale1[0]; - float _bias1 = bias1[0]; - float * dst1 = dst0 + channels * spatial; - for (size_t c = 0; c < channels; c += N) - { - for (size_t s = 0; s < spatial; ++s) - { - for (size_t i = 0; i < N; ++i) - SynetFusedLayerForward4(src[i], bias0[i], _scale1, _bias1, dst0 + i, dst1 + i); - src += N; - dst0 += N; - dst1 += N; - } - bias0 += N; - } - } - - template void SynetFusedLayerForward4NchwXcU(const float * src, const float * bias0, const float * scale1, const float * bias1, size_t channels, size_t spatial, float * dst0) - { - assert(0); - } - - template void SynetFusedLayerForward4NchwXc(const float * src, const float * bias0, const float * scale1, const float * bias1, size_t channels, size_t spatial, float * dst) - { - if (Aligned(channels, N)) - SynetFusedLayerForward4NchwXcA(src, bias0, scale1, bias1, channels, spatial, dst); - else - SynetFusedLayerForward4NchwXcU(src, bias0, scale1, bias1, channels, spatial, dst); - } - - void SynetFusedLayerForward4(const float * src, const float * bias0, const float * scale1, const float * bias1, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format) - { - if (Base::NchwCompatible(channels, spatial, format)) - SynetFusedLayerForward4Nchw(src, bias0, scale1, bias1, channels, spatial, dst); - else if (Base::NhwcCompatible(channels, spatial, format)) - SynetFusedLayerForward4Nhwc(src, bias0, scale1, bias1, channels, spatial, dst); - else if (format == SimdTensorFormatNchw4c) - SynetFusedLayerForward4NchwXc<4>(src, bias0, scale1, bias1, channels, spatial, dst); - else if (format == SimdTensorFormatNchw8c) - SynetFusedLayerForward4NchwXc<8>(src, bias0, scale1, bias1, channels, spatial, dst); - else if (format == SimdTensorFormatNchw16c) - SynetFusedLayerForward4NchwXc<16>(src, bias0, scale1, bias1, channels, spatial, dst); - else - assert(0); - } - - 
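As context for the `*NchwXc<N>` kernels deleted in this file: they all walk tensors in the blocked SimdTensorFormatNchw4c/8c/16c layouts, where channels are grouped into blocks of N and the N channel values of one pixel are stored contiguously. That is why the loops advance src/dst by N per pixel while per-channel parameters (bias, scale) advance by N only per channel block. A minimal standalone sketch of the index math these loops assume follows; `NchwXcOffset` and `Example` are hypothetical illustration helpers, not part of the Simd API.

#include <cstddef>
#include <cassert>

// Hypothetical helper (not in the Simd sources): offset of element (channel c, pixel s)
// in one NCHWxC image with 'spatial' pixels and channel-block size N.
template<size_t N> size_t NchwXcOffset(size_t c, size_t s, size_t spatial)
{
    return (c / N) * spatial * N + s * N + (c % N);
}

// Usage sketch: visiting a Nchw8c tensor in the same block/pixel/lane order
// as the SynetFusedLayerForward*NchwXc<8> kernels above.
void Example(const float* src, float* dst, size_t channels, size_t spatial)
{
    constexpr size_t N = 8;
    assert(channels % N == 0); // the aligned case the NchwXcA<N> variants require
    for (size_t c = 0; c < channels; c += N)   // channel blocks
        for (size_t s = 0; s < spatial; ++s)   // pixels
            for (size_t i = 0; i < N; ++i)     // lanes within one block
                dst[NchwXcOffset<N>(c + i, s, spatial)] = src[NchwXcOffset<N>(c + i, s, spatial)];
}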
//--------------------------------------------------------------------- - - void SynetFusedLayerForward8Nchw(const float * src0, const float * src1, const float * src2, size_t channels, size_t spatial, float * dst) - { - size_t aligned = Simd::AlignLo(spatial, 4); - for (size_t c = 0; c < channels; ++c) - { - float _src2 = src2[c]; - size_t s = 0; - for (; s < aligned; s += 4) - { - dst[s + 0] = SynetFusedLayerForward8(src0[s + 0], src1[s + 0], _src2); - dst[s + 1] = SynetFusedLayerForward8(src0[s + 1], src1[s + 1], _src2); - dst[s + 2] = SynetFusedLayerForward8(src0[s + 2], src1[s + 2], _src2); - dst[s + 3] = SynetFusedLayerForward8(src0[s + 3], src1[s + 3], _src2); - } - for (; s < spatial; ++s) - dst[s] = SynetFusedLayerForward8(src0[s], src1[s], _src2); - src0 += spatial; - src1 += spatial; - dst += spatial; - } - } - - void SynetFusedLayerForward8Nhwc(const float * src0, const float * src1, const float * src2, size_t channels, size_t spatial, float * dst) - { - size_t aligned = Simd::AlignLo(channels, 4); - for (size_t s = 0; s < spatial; ++s) - { - size_t c = 0; - for (; c < aligned; c += 4) - { - dst[c + 0] = SynetFusedLayerForward8(src0[c + 0], src1[c + 0], src2[c + 0]); - dst[c + 1] = SynetFusedLayerForward8(src0[c + 1], src1[c + 1], src2[c + 1]); - dst[c + 2] = SynetFusedLayerForward8(src0[c + 2], src1[c + 2], src2[c + 2]); - dst[c + 3] = SynetFusedLayerForward8(src0[c + 3], src1[c + 3], src2[c + 3]); - } - for (; c < channels; ++c) - dst[c] = SynetFusedLayerForward8(src0[c], src1[c], src2[c]); - src0 += channels; - src1 += channels; - dst += channels; - } - } - - template void SynetFusedLayerForward8NchwXc(const float * src0, const float * src1, const float * src2, size_t channels, size_t spatial, float * dst) - { - for (size_t c = 0; c < channels; c += N) - { - for (size_t s = 0; s < spatial; ++s) - { - for (size_t i = 0; i < N; ++i) - dst[i] = SynetFusedLayerForward8(src0[i], src1[i], src2[i]); - src0 += N; - src1 += N; - dst += N; - } - src2 += N; - } - } - - void SynetFusedLayerForward8(const float * src0, const float * src1, const float * src2, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format) - { - if (Base::NchwCompatible(channels, spatial, format)) - SynetFusedLayerForward8Nchw(src0, src1, src2, channels, spatial, dst); - else if (Base::NhwcCompatible(channels, spatial, format)) - SynetFusedLayerForward8Nhwc(src0, src1, src2, channels, spatial, dst); - else if (format == SimdTensorFormatNchw4c) - SynetFusedLayerForward8NchwXc<4>(src0, src1, src2, channels, spatial, dst); - else if (format == SimdTensorFormatNchw8c) - SynetFusedLayerForward8NchwXc<8>(src0, src1, src2, channels, spatial, dst); - else if (format == SimdTensorFormatNchw16c) - SynetFusedLayerForward8NchwXc<16>(src0, src1, src2, channels, spatial, dst); - else - assert(0); - } - - //--------------------------------------------------------------------- - - void SynetFusedLayerForward9Nchw(const float * src0, const float * src1, const float * scale0, const float * bias0, size_t channels0, size_t channels1, size_t spatial, float * dst0, float * dst1) - { - const float * scale1 = scale0 + channels0; - const float * bias1 = bias0 + channels0; - size_t aligned = Simd::AlignLo(spatial, 4); - if (dst1) - { - for (size_t c = 0; c < channels0; ++c) - { - float _scale0 = scale0[c]; - float _bias0 = bias0[c]; - size_t s = 0; - for (; s < aligned; ++s) - { - dst0[s + 0] = SynetFusedLayerForward9(src0[s + 0], _scale0, _bias0), dst1[s + 0] = src0[s + 0]; - dst0[s + 1] = SynetFusedLayerForward9(src0[s 
+ 1], _scale0, _bias0), dst1[s + 1] = src0[s + 1]; - dst0[s + 2] = SynetFusedLayerForward9(src0[s + 2], _scale0, _bias0), dst1[s + 2] = src0[s + 2]; - dst0[s + 3] = SynetFusedLayerForward9(src0[s + 3], _scale0, _bias0), dst1[s + 3] = src0[s + 3]; - } - for (; s < spatial; ++s) - dst0[s] = SynetFusedLayerForward9(src0[s], _scale0, _bias0), dst1[s] = src0[s]; - src0 += spatial; - dst0 += spatial; - dst1 += spatial; - } - for (size_t c = 0; c < channels1; ++c) - { - float _scale1 = scale1[c]; - float _bias1 = bias1[c]; - size_t s = 0; - for (; s < aligned; ++s) - { - dst0[s + 0] = SynetFusedLayerForward9(src1[s + 0], _scale1, _bias1), dst1[s + 0] = src1[s + 0]; - dst0[s + 1] = SynetFusedLayerForward9(src1[s + 1], _scale1, _bias1), dst1[s + 1] = src1[s + 1]; - dst0[s + 2] = SynetFusedLayerForward9(src1[s + 2], _scale1, _bias1), dst1[s + 2] = src1[s + 2]; - dst0[s + 3] = SynetFusedLayerForward9(src1[s + 3], _scale1, _bias1), dst1[s + 3] = src1[s + 3]; - } - for (; s < spatial; ++s) - dst0[s] = SynetFusedLayerForward9(src1[s], _scale1, _bias1), dst1[s] = src1[s]; - src1 += spatial; - dst0 += spatial; - dst1 += spatial; - } - } - else - { - for (size_t c = 0; c < channels0; ++c) - { - float _scale0 = scale0[c]; - float _bias0 = bias0[c]; - size_t s = 0; - for (; s < aligned; ++s) - { - dst0[s + 0] = SynetFusedLayerForward9(src0[s + 0], _scale0, _bias0); - dst0[s + 1] = SynetFusedLayerForward9(src0[s + 1], _scale0, _bias0); - dst0[s + 2] = SynetFusedLayerForward9(src0[s + 2], _scale0, _bias0); - dst0[s + 3] = SynetFusedLayerForward9(src0[s + 3], _scale0, _bias0); - } - for (; s < spatial; ++s) - dst0[s] = SynetFusedLayerForward9(src0[s], _scale0, _bias0); - src0 += spatial; - dst0 += spatial; - } - for (size_t c = 0; c < channels1; ++c) - { - float _scale1 = scale1[c]; - float _bias1 = bias1[c]; - size_t s = 0; - for (; s < aligned; ++s) - { - dst0[s + 0] = SynetFusedLayerForward9(src1[s + 0], _scale1, _bias1); - dst0[s + 1] = SynetFusedLayerForward9(src1[s + 1], _scale1, _bias1); - dst0[s + 2] = SynetFusedLayerForward9(src1[s + 2], _scale1, _bias1); - dst0[s + 3] = SynetFusedLayerForward9(src1[s + 3], _scale1, _bias1); - } - for (; s < spatial; ++s) - dst0[s] = SynetFusedLayerForward9(src1[s], _scale1, _bias1); - src1 += spatial; - dst0 += spatial; - } - } - } - - void SynetFusedLayerForward9Nhwc(const float * src0, const float * src1, const float * scale0, const float * bias0, size_t channels0, size_t channels1, size_t spatial, float * dst0, float * dst1) - { - const float * scale1 = scale0 + channels0; - const float * bias1 = bias0 + channels0; - size_t aligned0 = Simd::AlignLo(channels0, 4); - size_t aligned1 = Simd::AlignLo(channels1, 4); - if (dst1) - { - for (size_t s = 0; s < spatial; ++s) - { - size_t c; - for (c = 0; c < channels0; c += 4) - { - dst0[c + 0] = SynetFusedLayerForward9(src0[c + 0], scale0[c + 0], bias0[c + 0]), dst1[c + 0] = src0[c + 0]; - dst0[c + 1] = SynetFusedLayerForward9(src0[c + 1], scale0[c + 1], bias0[c + 1]), dst1[c + 1] = src0[c + 1]; - dst0[c + 2] = SynetFusedLayerForward9(src0[c + 2], scale0[c + 2], bias0[c + 2]), dst1[c + 2] = src0[c + 2]; - dst0[c + 3] = SynetFusedLayerForward9(src0[c + 3], scale0[c + 3], bias0[c + 3]), dst1[c + 3] = src0[c + 3]; - } - for (; c < channels0; ++c) - dst0[c] = SynetFusedLayerForward9(src0[c], scale0[c], bias0[c]), dst1[c] = src0[c]; - src0 += channels0; - dst0 += channels0; - dst1 += channels0; - for (c = 0; c < channels1; c += 4) - { - dst0[c + 0] = SynetFusedLayerForward9(src1[c + 0], scale1[c + 0], bias1[c + 0]), dst1[c + 0] 
= src1[c + 0]; - dst0[c + 1] = SynetFusedLayerForward9(src1[c + 1], scale1[c + 1], bias1[c + 1]), dst1[c + 1] = src1[c + 1]; - dst0[c + 2] = SynetFusedLayerForward9(src1[c + 2], scale1[c + 2], bias1[c + 2]), dst1[c + 2] = src1[c + 2]; - dst0[c + 3] = SynetFusedLayerForward9(src1[c + 3], scale1[c + 3], bias1[c + 3]), dst1[c + 3] = src1[c + 3]; - } - for (; c < channels1; ++c) - dst0[c] = SynetFusedLayerForward9(src1[c], scale1[c], bias1[c]), dst1[c] = src1[c]; - src1 += channels1; - dst0 += channels1; - dst1 += channels1; - } - } - else - { - for (size_t s = 0; s < spatial; ++s) - { - size_t c; - for (c = 0; c < channels0; c += 4) - { - dst0[c + 0] = SynetFusedLayerForward9(src0[c + 0], scale0[c + 0], bias0[c + 0]); - dst0[c + 1] = SynetFusedLayerForward9(src0[c + 1], scale0[c + 1], bias0[c + 1]); - dst0[c + 2] = SynetFusedLayerForward9(src0[c + 2], scale0[c + 2], bias0[c + 2]); - dst0[c + 3] = SynetFusedLayerForward9(src0[c + 3], scale0[c + 3], bias0[c + 3]); - } - for (; c < channels0; ++c) - dst0[c] = SynetFusedLayerForward9(src0[c], scale0[c], bias0[c]); - src0 += channels0; - dst0 += channels0; - for (c = 0; c < channels1; c += 4) - { - dst0[c + 0] = SynetFusedLayerForward9(src1[c + 0], scale1[c + 0], bias1[c + 0]); - dst0[c + 1] = SynetFusedLayerForward9(src1[c + 1], scale1[c + 1], bias1[c + 1]); - dst0[c + 2] = SynetFusedLayerForward9(src1[c + 2], scale1[c + 2], bias1[c + 2]); - dst0[c + 3] = SynetFusedLayerForward9(src1[c + 3], scale1[c + 3], bias1[c + 3]); - } - for (; c < channels1; ++c) - dst0[c] = SynetFusedLayerForward9(src1[c], scale1[c], bias1[c]); - src1 += channels1; - dst0 += channels1; - } - } - } - - template void SynetFusedLayerForward9NchwXcA(const float * src0, const float * src1, const float * scale0, const float * bias0, size_t channels0, size_t channels1, size_t spatial, float * dst0, float * dst1) - { - assert(Aligned(channels0, N)); - const float * scale1 = scale0 + channels0; - const float * bias1 = bias0 + channels0; - if (dst1) - { - for (size_t c = 0; c < channels0; c += N) - { - for (size_t s = 0; s < spatial; ++s) - { - for (size_t i = 0; i < N; ++i) - dst0[i] = SynetFusedLayerForward9(src0[i], scale0[i], bias0[i]), dst1[i] = src0[i]; - src0 += N; - dst0 += N; - dst1 += N; - } - scale0 += N; - bias0 += N; - } - for (size_t c = 0; c < channels1; c += N) - { - for (size_t s = 0; s < spatial; ++s) - { - for (size_t i = 0; i < N; ++i) - dst0[i] = SynetFusedLayerForward9(src1[i], scale1[i], bias1[i]), dst1[i] = src1[i]; - src1 += N; - dst0 += N; - dst1 += N; - } - scale1 += N; - bias1 += N; - } - } - else - { - for (size_t c = 0; c < channels0; c += N) - { - for (size_t s = 0; s < spatial; ++s) - { - for (size_t i = 0; i < N; ++i) - dst0[i] = SynetFusedLayerForward9(src0[i], scale0[i], bias0[i]); - src0 += N; - dst0 += N; - } - scale0 += N; - bias0 += N; - } - for (size_t c = 0; c < channels1; c += N) - { - for (size_t s = 0; s < spatial; ++s) - { - for (size_t i = 0; i < N; ++i) - dst0[i] = SynetFusedLayerForward9(src1[i], scale1[i], bias1[i]); - src1 += N; - dst0 += N; - } - scale1 += N; - bias1 += N; - } - } - } - - template void SynetFusedLayerForward9NchwXcU(const float * src0, const float * src1, const float * scale, const float * bias, size_t channels0, size_t channels1, size_t spatial, float * dst0, float * dst1) - { - assert(0); - } - - template void SynetFusedLayerForward9NchwXc(const float * src0, const float * src1, const float * scale, const float * bias, size_t channels0, size_t channels1, size_t spatial, float * dst0, float * dst1) - { - if 
(Aligned(channels0, N)) - SynetFusedLayerForward9NchwXcA(src0, src1, scale, bias, channels0, channels1, spatial, dst0, dst1); - else - SynetFusedLayerForward9NchwXcU(src0, src1, scale, bias, channels0, channels1, spatial, dst0, dst1); - } - - void SynetFusedLayerForward9(const float * src0, const float * src1, const float * scale, const float * bias, size_t channels0, size_t channels1, size_t spatial, float * dst0, float * dst1, SimdTensorFormatType format) - { - if (Base::NchwCompatible(channels0 + channels1, spatial, format)) - SynetFusedLayerForward9Nchw(src0, src1, scale, bias, channels0, channels1, spatial, dst0, dst1); - else if (Base::NhwcCompatible(channels0 + channels1, spatial, format)) - SynetFusedLayerForward9Nhwc(src0, src1, scale, bias, channels0, channels1, spatial, dst0, dst1); - else if (format == SimdTensorFormatNchw4c) - SynetFusedLayerForward9NchwXc<4>(src0, src1, scale, bias, channels0, channels1, spatial, dst0, dst1); - else if (format == SimdTensorFormatNchw8c) - SynetFusedLayerForward9NchwXc<8>(src0, src1, scale, bias, channels0, channels1, spatial, dst0, dst1); - else if (format == SimdTensorFormatNchw16c) - SynetFusedLayerForward9NchwXc<16>(src0, src1, scale, bias, channels0, channels1, spatial, dst0, dst1); - else - assert(0); - } - } -} diff --git a/src/3rd/Simd/Simd/SimdBaseSynetMergedConvolution32f.cpp b/src/3rd/Simd/Simd/SimdBaseSynetMergedConvolution32f.cpp deleted file mode 100644 index ae4d2943..00000000 --- a/src/3rd/Simd/Simd/SimdBaseSynetMergedConvolution32f.cpp +++ /dev/null @@ -1,391 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2019 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/
-#include "Simd/SimdSynetMergedConvolution32f.h"
-#include "Simd/SimdSynetConvolution32f.h"
-#include "Simd/SimdSynetConvolution32fCommon.h"
-#include "Simd/SimdUpdate.h"
-#include "Simd/SimdSynet.h"
-#include "Simd/SimdBase.h"
-
-namespace Simd
-{
-#if defined(SIMD_PERFORMANCE_STATISTIC)
-    Base::PerformanceMeasurer * SynetMergedConvolution32f::Perf(const String& func)
-    {
-        if (_perf == NULL)
-            _perf = Simd::Base::PerformanceMeasurerStorage::s_storage.Get(func, Param().Info() + " " + Desc(), Param().Flop());
-        return _perf;
-    }
-#endif
-
-    namespace Base
-    {
-        template<SimdConvolutionActivationType type, UpdateType update> void DirectConvolution(const float * src, const SimdConvolutionParameters & p,
-            size_t maC, size_t yBeg, size_t yEnd, const size_t bufH[2], const float * weight, const float * bias, const float * params, float * dst)
-        {
-            size_t srcH = p.srcH, srcW = p.srcW, srcC = p.srcC, dstW = p.dstW, dstC = p.dstC;
-            size_t kernelY = p.kernelY, kernelX = p.kernelX, strideY = p.strideY, strideX = p.strideX, padY = p.padY, padX = p.padX;
-            Array32f buf(dstC);
-            for (size_t dy = yBeg; dy < yEnd; ++dy)
-            {
-                for (size_t dx = 0; dx < dstW; ++dx)
-                {
-                    if (bias)
-                        memcpy(buf.data, bias, dstC * sizeof(float));
-                    else
-                        memset(buf.data, 0, dstC * sizeof(float));
-                    for (size_t ky = 0; ky < kernelY; ++ky)
-                    {
-                        size_t sy = dy * strideY + ky - padY;
-                        if (sy < p.srcH)
-                        {
-                            for (size_t kx = 0; kx < kernelX; ++kx)
-                            {
-                                size_t sx = dx * strideX + kx - padX;
-                                if (sx < p.srcW)
-                                {
-                                    const float * pw = weight + (ky*kernelX + kx)*srcC*dstC;
-                                    const float * ps = src + (sy*srcW + sx)*srcC;
-                                    for (size_t sc = 0; sc < srcC; ++sc)
-                                    {
-                                        for (size_t dc = 0; dc < dstC; ++dc)
-                                            buf[dc] += ps[sc] * pw[dc];
-                                        pw += dstC;
-                                    }
-                                }
-                            }
-                        }
-                    }
-                    for (size_t dc = 0; dc < dstC; ++dc)
-                        Update<update>(dst + dc, Activate<type>(buf[dc], params, dc));
-                    dst += p.dstC;
-                }
-            }
-        }
-
-        template<SimdConvolutionActivationType type> void DepthwiseConvolution(const float * src, const SimdConvolutionParameters & p,
-            size_t maC, size_t yBeg, size_t yEnd, const size_t bufH[2], const float * weight, const float * bias, const float * params, float * dst)
-        {
-            assert(p.group == p.srcC && p.group == p.dstC);
-            size_t srcH = p.srcH, srcW = p.srcW, srcC = p.srcC, dstW = p.dstW;
-            size_t kernelY = p.kernelY, kernelX = p.kernelX, strideY = p.strideY, strideX = p.strideX, padY = p.padY, padX = p.padX;
-            for (size_t dy = yBeg; dy < yEnd; ++dy)
-            {
-                for (size_t dx = 0; dx < dstW; ++dx)
-                {
-                    for (size_t c = 0; c < srcC; ++c)
-                    {
-                        float sum = bias ? bias[c] : 0;
-                        for (size_t ky = 0; ky < kernelY; ++ky)
-                        {
-                            size_t sy = dy * strideY + ky - padY;
-                            if (sy < srcH)
-                            {
-                                for (size_t kx = 0; kx < kernelX; ++kx)
-                                {
-                                    size_t sx = dx * strideX + kx - padX;
-                                    if (sx < srcW)
-                                    {
-                                        const float * pw = weight + (ky * kernelX + kx) * srcC + c;
-                                        const float * ps = src + (sy * srcW + sx) * srcC + c;
-                                        sum += ps[0] * pw[0];
-                                    }
-                                }
-                            }
-                        }
-                        dst[c] = Activate<type>(sum, params, c);
-                    }
-                    dst += srcC;
-                }
-            }
-        }
-
-        template<SimdConvolutionActivationType type> void SetConvolutionPtr(const MergConvParam32f & p, size_t index, SynetMergedConvolution32f::ConvolutionPtr convolution[3])
-        {
-            switch (index)
-            {
-            case 0:
-                convolution[0] = DirectConvolution<type, UpdateSet>;
-                break;
-            case 1:
-                convolution[1] = DepthwiseConvolution<type>;
-                break;
-            case 2:
-                if (p.add)
-                    convolution[2] = DirectConvolution<type, UpdateAdd>;
-                else
-                    convolution[2] = DirectConvolution<type, UpdateSet>;
-                break;
-            default:
-                assert(0);
-            }
-        }
-
-        SynetMergedConvolution32f::SynetMergedConvolution32f(const MergConvParam32f & p)
-            : _param(p), _base(true)
-        {
-            _sizeS = p.conv[0].srcH*p.conv[0].srcW*p.conv[0].srcC;
-            _sizeD = p.conv[2].dstH*p.conv[2].dstW*p.conv[2].dstC;
-            _sizeB[0] = p.conv[1].srcH*p.conv[1].srcW*p.conv[1].srcC;
-            _sizeB[1] = p.conv[1].dstH*p.conv[1].dstW*p.conv[1].dstC;
-            for (size_t i = 0; i < _param.count; ++i)
-            {
-                switch (p.conv[i].activation)
-                {
-                case SimdConvolutionActivationIdentity: SetConvolutionPtr<SimdConvolutionActivationIdentity>(_param, i, _convolution); break;
-                case SimdConvolutionActivationRelu: SetConvolutionPtr<SimdConvolutionActivationRelu>(_param, i, _convolution); break;
-                case SimdConvolutionActivationLeakyRelu: SetConvolutionPtr<SimdConvolutionActivationLeakyRelu>(_param, i, _convolution); break;
-                case SimdConvolutionActivationRestrictRange: SetConvolutionPtr<SimdConvolutionActivationRestrictRange>(_param, i, _convolution); break;
-                case SimdConvolutionActivationPrelu: SetConvolutionPtr<SimdConvolutionActivationPrelu>(_param, i, _convolution); break;
-                case SimdConvolutionActivationElu: SetConvolutionPtr<SimdConvolutionActivationElu>(_param, i, _convolution); break;
-                case SimdConvolutionActivationHswish: SetConvolutionPtr<SimdConvolutionActivationHswish>(_param, i, _convolution); break;
-                default: assert(0);
-                }
-            }
-        }
-
-        void SynetMergedConvolution32f::SetSize(size_t L1, size_t L2, size_t L3, size_t F)
-        {
-            const MergConvParam32f & p = _param;
-            _miC = F;
-            size_t size = 0;
-            for (size_t i = 0; i < 3; ++i)
-                size += p.conv[i].kernelY * p.conv[i].kernelX * p.conv[i].srcC * p.conv[i].dstC / p.conv[i].group;
-            size_t count = size * sizeof(float) / (L3 / 2) + 1;
-            _maC = AlignHiAny(p.conv[0].dstC / count, 2 * _miC);
-            for (size_t yStep = p.conv[1].dstH; yStep >= 1; yStep--)
-            {
-                _yStep[1] = Simd::Max<size_t>(1, yStep);
-                for (_bufH[1] = 1; _bufH[1] < _yStep[1]; _bufH[1] *= 2);
-                _yStep[0] = _yStep[1] * p.conv[1].strideY;
-                for (_bufH[0] = 1; _bufH[0] < (_yStep[1] - 1) * p.conv[1].strideY + p.conv[1].kernelY; _bufH[0] *= 2);
-                _sizeB[0] = _bufH[0] * p.conv[0].dstW * _maC;
-                _sizeB[1] = _bufH[1] * p.conv[1].dstW * _maC;
-                if ((_sizeB[0] + _sizeB[1]) * sizeof(float) <= L2)
-                    break;
-            }
-            for (size_t i = 0; i < 3; ++i)
-            {
-                size_t dstC = AlignHiAny(p.conv[i].dstC, i == 1 ? _miC : 2 * _miC);
-                _rWeight[i].Resize(dstC*p.conv[i].kernelY*p.conv[i].kernelX*p.conv[i].srcC);
-                _rBias[i].Resize(dstC, true);
-                if (p.conv[i].activation == ::SimdConvolutionActivationPrelu)
-                    _rParams[i].Resize(dstC, true);
-            }
-            _dp[0] = p.conv[0].activation == ::SimdConvolutionActivationPrelu ? 1 : 0;
-            _dp[1] = p.conv[1].activation == ::SimdConvolutionActivationPrelu ?
1 : 0; - _dw[0] = p.conv[0].kernelY*p.conv[0].kernelX*p.conv[0].srcC; - _dw[1] = p.conv[1].kernelY*p.conv[1].kernelX; - _dw[2] = AlignHiAny(p.conv[2].dstC, 2 * _miC); - _base = false; - } - - float * SynetMergedConvolution32f::GetBuffer(float * buffer) - { - if (buffer) - return buffer; - else - { - _buffer.Resize(ExternalBufferSize()); - return _buffer.data; - } - } - - void SynetMergedConvolution32f::ReorderInputWeight(const float * src, float * dst) const - { - const SimdConvolutionParameters & p = _param.conv[0]; - size_t size = p.kernelY*p.kernelX*p.srcC, dstC = p.dstC, micD = _miC*2; - for (size_t c = 0; c < dstC; c += micD) - { - size_t n = Simd::Min(micD, dstC - c); - for (size_t s = 0; s < size; s++) - { - size_t i = 0; - for (; i < n; ++i) - dst[i] = src[s*dstC + c + i]; - for (; i < micD; ++i) - dst[i] = 0; - dst += micD; - } - } - } - - void SynetMergedConvolution32f::ReorderDepthwiseWeight(const float * src, float * dst) const - { - const SimdConvolutionParameters & p = _param.conv[1]; - size_t dstC = p.dstC, size = p.kernelY*p.kernelX, micD = _miC; - for (size_t c = 0; c < dstC; c += micD) - { - size_t n = Simd::Min(micD, dstC - c); - for (size_t s = 0; s < size; s++) - { - size_t i = 0; - for (; i < n; ++i) - dst[i] = src[s*dstC + c + i]; - for (; i < micD; ++i) - dst[i] = 0; - dst += micD; - } - } - } - - void SynetMergedConvolution32f::ReorderOutputWeight(const float * src, float * dst) const - { - const SimdConvolutionParameters & p = _param.conv[2]; - size_t srcC = p.srcC, dstC = p.dstC, micD = _miC * 2; - for (size_t m = 0; m < srcC; m += _maC) - { - size_t maC = Simd::Min(srcC, m + _maC) - m; - for (size_t d = 0; d < dstC; d += micD) - { - size_t n = Simd::Min(micD, dstC - d); - for (size_t s = 0; s < maC; s++) - { - size_t i = 0; - for (; i < n; ++i) - dst[i] = src[s*dstC + d + i]; - for (; i < micD; ++i) - dst[i] = 0; - dst += micD; - } - } - src += dstC*maC; - } - } - - size_t SynetMergedConvolution32f::ExternalBufferSize() const - { - return _sizeB[0] + _sizeB[1]; - } - - size_t SynetMergedConvolution32f::InternalBufferSize() const - { - size_t size = _buffer.size; - for (size_t i = 0; i < 3; ++i) - size += _rWeight[i].size + _rBias[i].size + _rParams[i].size; - return size; - } - - void SynetMergedConvolution32f::SetParams(const float * const * weight, SimdBool * internal, const float * const * bias, const float * const * params) - { - const MergConvParam32f & p = _param; - for (size_t i = 0; i < p.count; ++i) - { - if (_rWeight[i].data) - { - switch (i) - { - case 0: ReorderInputWeight(weight[i], _rWeight[i].data); break; - case 1: ReorderDepthwiseWeight(weight[i], _rWeight[i].data); break; - case 2: ReorderOutputWeight(weight[i], _rWeight[i].data); break; - default: assert(0); - } - _weight[i] = _rWeight[i].data; - if (internal) - internal[i] = SimdTrue; - } - else - { - _weight[i] = weight[i]; - if (internal) - internal[i] = SimdFalse; - } - if (_rBias[i].data) - { - if (bias[i]) - memcpy(_rBias[i].data, bias[i], p.conv[i].dstC * sizeof(float)); - _bias[i] = _rBias[i].data; - } - else - _bias[i] = bias[i]; - if (_rParams[i].size) - { - memcpy(_rParams[i].data, params[i], p.conv[i].dstC * sizeof(float)); - _params[i] = _rParams[i].data; - } - else - _params[i] = params[i]; - } - } - - void SynetMergedConvolution32f::Forward(const float * src, float * buf, float * dst) - { - const MergConvParam32f & p = _param; - float * buf0 = GetBuffer(buf); - float * buf1 = buf0 + _sizeB[0]; - for (size_t b = 0; b < p.batch; ++b) - { - if (_base) - { - _convolution[0](src, 
p.conv[0], 0, 0, p.conv[0].dstH, _bufH, _weight[0], _bias[0], _params[0], buf0); - _convolution[1](buf0, p.conv[1], 0, 0, p.conv[1].dstH, _bufH, _weight[1], _bias[1], _params[1], buf1); - if (p.add) - memcpy(dst, src, sizeof(float)*_sizeS); - _convolution[2](buf1, p.conv[2], 0, 0, p.conv[2].dstH, _bufH, _weight[2], _bias[2], _params[2], dst); - } - else - { - for (size_t c = 0, C = p.conv[1].dstC; c < C; c += _maC) - { - size_t maC = Simd::Min(C, c + _maC) - c; - for (size_t yBeg1 = 0, yBeg0 = 0; yBeg1 < p.conv[1].dstH;) - { - size_t yEnd1 = Simd::Min(yBeg1 + _yStep[1], p.conv[1].dstH); - size_t yEnd0 = Simd::Min(Simd::Max(yBeg0 + _yStep[0], (_yStep[1] - 1)*p.conv[1].strideY + p.conv[1].kernelY - p.conv[1].padY), p.conv[0].dstH); - _convolution[0](src, p.conv[0], maC, yBeg0, yEnd0, _bufH, _weight[0] + c * _dw[0], _bias[0] + c, _params[0] + c * _dp[0], buf0); - _convolution[1](buf0, p.conv[1], maC, yBeg1, yEnd1, _bufH, _weight[1] + c * _dw[1], _bias[1] + c, _params[1] + c * _dp[1], buf1); - if (p.add && c == 0) - { - size_t offset = yBeg1 * p.conv[2].dstW * p.conv[2].dstC, size = (yEnd1 - yBeg1)*p.conv[2].dstW * p.conv[2].dstC; - memcpy(dst + offset, src + offset, sizeof(float)*size); - } - if(maC == C) - _convolution[2](buf1, p.conv[2], maC, yBeg1, yEnd1, _bufH, _weight[2] + c * _dw[2], _bias[2], _params[2], dst); - else if (c == 0) - _convolution[3](buf1, p.conv[2], maC, yBeg1, yEnd1, _bufH, _weight[2] + c * _dw[2], _bias[2], _params[2], dst); - else if (c + maC < C) - _convolution[4](buf1, p.conv[2], maC, yBeg1, yEnd1, _bufH, _weight[2] + c * _dw[2], _bias[2], _params[2], dst); - else - _convolution[5](buf1, p.conv[2], maC, yBeg1, yEnd1, _bufH, _weight[2] + c * _dw[2], _bias[2], _params[2], dst); - yBeg1 = yEnd1; - yBeg0 = yEnd0; - } - } - } - src += _sizeS; - dst += _sizeD; - } - } - - //--------------------------------------------------------------------- - - void * SynetMergedConvolution32fInit(size_t batch, const SimdConvolutionParameters * convs, size_t count, SimdBool add) - { - MergConvParam32f param(batch, convs, count, add); - if (!param.Valid()) - return NULL; - return new Base::SynetMergedConvolution32f(param); - } - } -} diff --git a/src/3rd/Simd/Simd/SimdBaseSynetPooling.cpp b/src/3rd/Simd/Simd/SimdBaseSynetPooling.cpp deleted file mode 100644 index 3c7061f0..00000000 --- a/src/3rd/Simd/Simd/SimdBaseSynetPooling.cpp +++ /dev/null @@ -1,174 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#include "Simd/SimdArray.h"
-#include "Simd/SimdPow.h"
-#include "Simd/SimdSynet.h"
-#include "Simd/SimdEnable.h"
-
-namespace Simd
-{
-    namespace Base
-    {
-        void SynetPoolingForwardAverage(const float* src, size_t srcC, size_t srcH, size_t srcW, size_t kernelY, size_t kernelX,
-            size_t strideY, size_t strideX, size_t padY, size_t padX, float* dst, size_t dstH, size_t dstW, SimdBool excludePad, SimdTensorFormatType format)
-        {
-            if (format == SimdTensorFormatNhwc)
-            {
-                for (size_t ph = 0; ph < dstH; ++ph)
-                {
-                    size_t hStart = ph * strideY - padY;
-                    size_t hEnd = Simd::Min(hStart + kernelY, srcH);
-                    hStart = Simd::Max<ptrdiff_t>(0, hStart);
-                    for (size_t pw = 0; pw < dstW; ++pw)
-                    {
-                        size_t wStart = pw * strideX - padX;
-                        size_t wEnd = Simd::Min(wStart + kernelX, srcW);
-                        wStart = Simd::Max<ptrdiff_t>(0, wStart);
-                        for (size_t c = 0; c < srcC; ++c)
-                            dst[c] = 0.0f;
-                        for (size_t h = hStart; h < hEnd; ++h)
-                        {
-                            for (size_t w = wStart; w < wEnd; ++w)
-                            {
-                                const float* pc = src + (h * srcW + w) * srcC;
-                                for (size_t c = 0; c < srcC; ++c)
-                                    dst[c] += pc[c];
-                            }
-                        }
-                        if (excludePad)
-                            for (size_t c = 0; c < srcC; ++c)
-                                dst[c] = dst[c] / float((hEnd - hStart) * (wEnd - wStart));
-                        else
-                            for (size_t c = 0; c < srcC; ++c)
-                                dst[c] = dst[c] / float(kernelY * kernelX);
-                        dst += srcC;
-                    }
-                }
-            }
-            else if (format == SimdTensorFormatNchw)
-            {
-                for (size_t c = 0; c < srcC; ++c)
-                {
-                    for (size_t ph = 0; ph < dstH; ++ph)
-                    {
-                        size_t hStart = ph * strideY - padY;
-                        size_t hEnd = Simd::Min(hStart + kernelY, srcH);
-                        hStart = Simd::Max<ptrdiff_t>(0, hStart);
-                        for (size_t pw = 0; pw < dstW; ++pw)
-                        {
-                            size_t wStart = pw * strideX - padX;
-                            size_t wEnd = Simd::Min(wStart + kernelX, srcW);
-                            wStart = Simd::Max<ptrdiff_t>(0, wStart);
-                            float sum = 0.0f;
-                            for (size_t h = hStart; h < hEnd; ++h)
-                                for (size_t w = wStart; w < wEnd; ++w)
-                                    sum += src[h * srcW + w];
-                            if (excludePad)
-                                dst[ph * dstW + pw] = sum / float((hEnd - hStart) * (wEnd - wStart));
-                            else
-                                dst[ph * dstW + pw] = sum / float(kernelY * kernelX);
-                        }
-                    }
-                    src += srcW * srcH;
-                    dst += dstW * dstH;
-                }
-            }
-            else
-                assert(0);
-        }
-
-        //---------------------------------------------------------------------
-
-        template<class T> void SynetPoolingForwardMax(const T* src, size_t srcC, size_t srcH, size_t srcW, size_t kernelY, size_t kernelX,
-            size_t strideY, size_t strideX, size_t padY, size_t padX, T* dst, size_t dstH, size_t dstW, SimdTensorFormatType format)
-        {
-            if (format == SimdTensorFormatNhwc)
-            {
-                for (size_t ph = 0; ph < dstH; ++ph)
-                {
-                    size_t hStart = ph * strideY - padY;
-                    size_t hEnd = Simd::Min(hStart + kernelY, srcH);
-                    hStart = Simd::Max<ptrdiff_t>(0, hStart);
-                    for (size_t pw = 0; pw < dstW; ++pw)
-                    {
-                        size_t wStart = pw * strideX - padX;
-                        size_t wEnd = Simd::Min(wStart + kernelX, srcW);
-                        wStart = Simd::Max<ptrdiff_t>(0, wStart);
-                        for (size_t c = 0; c < srcC; ++c)
-                            dst[c] = std::numeric_limits<T>::lowest();
-                        for (size_t h = hStart; h < hEnd; ++h)
-                        {
-                            for (size_t w = wStart; w < wEnd; ++w)
-                            {
-                                const T * ps = src + (h * srcW + w) * srcC;
-                                for (size_t c = 0; c < srcC; ++c)
-                                    dst[c] = Simd::Max(dst[c], ps[c]);
-                            }
-                        }
-                        dst += srcC;
-                    }
-                }
-            }
-            else if (format == SimdTensorFormatNchw)
-            {
-                for (size_t c = 0; c < srcC; ++c)
-                {
-                    for (size_t ph = 0; ph < dstH; ++ph)
-                    {
-                        size_t hStart = ph * strideY - padY;
-                        size_t hEnd = Simd::Min(hStart + kernelY, srcH);
-                        hStart = Simd::Max<ptrdiff_t>(0, hStart);
-                        for (size_t pw = 0; pw < dstW; ++pw)
-                        {
-                            size_t wStart = pw * strideX - padX;
-                            size_t wEnd = Simd::Min(wStart + kernelX, srcW);
-                            wStart = Simd::Max<ptrdiff_t>(0, wStart);
-                            T max = std::numeric_limits<T>::lowest();
-                            for (size_t h = hStart; h < hEnd; ++h)
-                                for (size_t w = wStart; w < wEnd; ++w)
-                                    max = Simd::Max(max, src[h * srcW + w]);
-                            dst[ph * dstW + pw] = max;
-                        }
-                    }
-                    src += srcW * srcH;
-                    dst += dstW * dstH;
-                }
-            }
-            else
-                assert(0);
-        }
-
-        void SynetPoolingForwardMax32f(const float * src, size_t srcC, size_t srcH, size_t srcW, size_t kernelY, size_t kernelX,
-            size_t strideY, size_t strideX, size_t padY, size_t padX, float * dst, size_t dstH, size_t dstW, SimdTensorFormatType format)
-        {
-            SynetPoolingForwardMax(src, srcC, srcH, srcW, kernelY, kernelX, strideY, strideX, padY, padX, dst, dstH, dstW, format);
-        }
-
-        void SynetPoolingForwardMax8u(const uint8_t* src, size_t srcC, size_t srcH, size_t srcW, size_t kernelY, size_t kernelX,
-            size_t strideY, size_t strideX, size_t padY, size_t padX, uint8_t* dst, size_t dstH, size_t dstW, SimdTensorFormatType format)
-        {
-            SynetPoolingForwardMax(src, srcC, srcH, srcW, kernelY, kernelX, strideY, strideX, padY, padX, dst, dstH, dstW, format);
-        }
-    }
-}
diff --git a/src/3rd/Simd/Simd/SimdBaseTexture.cpp b/src/3rd/Simd/Simd/SimdBaseTexture.cpp
deleted file mode 100644
index 5e2ddda9..00000000
--- a/src/3rd/Simd/Simd/SimdBaseTexture.cpp
+++ /dev/null
@@ -1,134 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
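A note on the pooling kernels in SimdBaseSynetPooling.cpp above: the window bounds are computed in unsigned size_t arithmetic, so for padded rows and columns ph * strideY - padY deliberately wraps to a huge value; adding kernelY back undoes the wrap before Min caps the window end, and the signed clamp restores the start to zero. A minimal self-contained sketch of that idiom (PoolWindow is an illustrative name, not part of the Simd API):

#include <algorithm>
#include <cstddef>

// Sketch of the pooling window-bound arithmetic used above: unsigned wrap
// for the padded edge, then a signed clamp back to zero.
inline void PoolWindow(size_t p, size_t stride, size_t pad, size_t kernel,
    size_t srcSize, size_t & beg, size_t & end)
{
    size_t start = p * stride - pad;          // wraps when pad > p * stride
    end = std::min(start + kernel, srcSize);  // adding kernel undoes the wrap
    beg = (size_t)std::max<ptrdiff_t>(0, (ptrdiff_t)start); // signed clamp
}

For example, with stride 1, pad 1, kernel 3 and p = 0 this yields beg = 0 and end = 2, which is exactly the (hEnd - hStart) * (wEnd - wStart) divisor used above when excludePad is set.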
-*/ -#include "Simd/SimdMath.h" -#include "Simd/SimdBase.h" - -namespace Simd -{ - namespace Base - { - SIMD_INLINE int TextureBoostedSaturatedGradient(const uint8_t * src, ptrdiff_t step, int saturation, int boost) - { - return (saturation + RestrictRange((int)src[step] - (int)src[-step], -saturation, saturation))*boost; - } - - void TextureBoostedSaturatedGradient(const uint8_t * src, size_t srcStride, size_t width, size_t height, - uint8_t saturation, uint8_t boost, uint8_t * dx, size_t dxStride, uint8_t * dy, size_t dyStride) - { - assert(int(2)*saturation*boost <= 0xFF); - - memset(dx, 0, width); - memset(dy, 0, width); - src += srcStride; - dx += dxStride; - dy += dyStride; - for (size_t row = 2; row < height; ++row) - { - dx[0] = 0; - dy[0] = 0; - for (size_t col = 1; col < width - 1; ++col) - { - dy[col] = TextureBoostedSaturatedGradient(src + col, srcStride, saturation, boost); - dx[col] = TextureBoostedSaturatedGradient(src + col, 1, saturation, boost); - } - dx[width - 1] = 0; - dy[width - 1] = 0; - src += srcStride; - dx += dxStride; - dy += dyStride; - } - memset(dx, 0, width); - memset(dy, 0, width); - } - - void TextureBoostedUv(const uint8_t * src, size_t srcStride, size_t width, size_t height, - uint8_t boost, uint8_t * dst, size_t dstStride) - { - assert(boost < 128); - - int min = 128 - (128 / boost); - int max = 255 - min; - - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < width; ++col) - { - dst[col] = (RestrictRange(src[col], min, max) - min)*boost; - } - src += srcStride; - dst += dstStride; - } - } - - void TextureGetDifferenceSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * lo, size_t loStride, const uint8_t * hi, size_t hiStride, int64_t * sum) - { - *sum = 0; - for (size_t row = 0; row < height; ++row) - { - int rowSum = 0; - for (size_t col = 0; col < width; ++col) - rowSum += src[col] - Average(lo[col], hi[col]); - *sum += rowSum; - - src += srcStride; - lo += loStride; - hi += hiStride; - } - } - - void TexturePerformCompensation(const uint8_t * src, size_t srcStride, size_t width, size_t height, - int shift, uint8_t * dst, size_t dstStride) - { - assert(shift > -0xFF && shift < 0xFF); - - if (shift == 0) - { - if (src != dst) - Base::Copy(src, srcStride, width, height, 1, dst, dstStride); - return; - } - else if (shift > 0) - { - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < width; ++col) - dst[col] = Min(src[col] + shift, 0xFF); - src += srcStride; - dst += dstStride; - } - } - else if (shift < 0) - { - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < width; ++col) - dst[col] = Max(src[col] + shift, 0); - src += srcStride; - dst += dstStride; - } - } - } - } -} diff --git a/src/3rd/Simd/Simd/SimdBaseThread.cpp b/src/3rd/Simd/Simd/SimdBaseThread.cpp deleted file mode 100644 index fb000a65..00000000 --- a/src/3rd/Simd/Simd/SimdBaseThread.cpp +++ /dev/null @@ -1,45 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2018 Yermalayeu Ihar. 
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#include "Simd/SimdMath.h"
-#include "Simd/SimdBase.h"
-
-#include <thread>
-
-namespace Simd
-{
-    namespace Base
-    {
-        size_t g_threadNumber = 1;
-
-        size_t GetThreadNumber()
-        {
-            return g_threadNumber;
-        }
-
-        void SetThreadNumber(size_t threadNumber)
-        {
-            g_threadNumber = Simd::RestrictRange<size_t>(threadNumber, 1, std::thread::hardware_concurrency());
-        }
-    }
-}
diff --git a/src/3rd/Simd/Simd/SimdBaseTransform.cpp b/src/3rd/Simd/Simd/SimdBaseTransform.cpp
deleted file mode 100644
index 79484272..00000000
--- a/src/3rd/Simd/Simd/SimdBaseTransform.cpp
+++ /dev/null
@@ -1,171 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
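SetThreadNumber in SimdBaseThread.cpp above clamps the requested worker count to the range [1, std::thread::hardware_concurrency()]. A standalone sketch of the same guard (ClampThreadNumber is a hypothetical name; only the clamping logic mirrors the code above):

#include <algorithm>
#include <cstddef>
#include <thread>

inline size_t ClampThreadNumber(size_t requested)
{
    // hardware_concurrency() may return 0 when the value is unknown,
    // so keep at least one worker in that case as well.
    size_t maxThreads = std::max<size_t>(1, std::thread::hardware_concurrency());
    return std::min(std::max<size_t>(1, requested), maxThreads);
}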
-*/
-#include "Simd/SimdDefs.h"
-
-namespace Simd
-{
-    namespace Base
-    {
-        template<size_t N> SIMD_INLINE void CopyPixel(const uint8_t * src, uint8_t * dst)
-        {
-            for (size_t i = 0; i < N; ++i)
-                dst[i] = src[i];
-        }
-
-        template<> SIMD_INLINE void CopyPixel<1>(const uint8_t * src, uint8_t * dst)
-        {
-            dst[0] = src[0];
-        }
-
-        template<> SIMD_INLINE void CopyPixel<2>(const uint8_t * src, uint8_t * dst)
-        {
-            ((uint16_t*)dst)[0] = ((uint16_t*)src)[0];
-        }
-
-        template<> SIMD_INLINE void CopyPixel<3>(const uint8_t * src, uint8_t * dst)
-        {
-            ((uint16_t*)dst)[0] = ((uint16_t*)src)[0];
-            dst[2] = src[2];
-        }
-
-        template<> SIMD_INLINE void CopyPixel<4>(const uint8_t * src, uint8_t * dst)
-        {
-            ((uint32_t*)dst)[0] = ((uint32_t*)src)[0];
-        }
-
-        template<size_t N> void TransformImageRotate0(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride)
-        {
-            size_t rowSize = width * N;
-            for (size_t row = 0; row < height; ++row)
-            {
-                memcpy(dst, src, rowSize);
-                src += srcStride;
-                dst += dstStride;
-            }
-        }
-
-        template<size_t N> void TransformImageRotate90(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride)
-        {
-            dst += (width - 1)*dstStride;
-            for (size_t row = 0; row < height; ++row)
-            {
-                for (size_t col = 0; col < width; ++col)
-                    CopyPixel<N>(src + col * N, dst - col * dstStride);
-                src += srcStride;
-                dst += N;
-            }
-        }
-
-        template<size_t N> void TransformImageRotate180(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride)
-        {
-            dst += (height - 1)*dstStride + (width - 1)*N;
-            for (size_t row = 0; row < height; ++row)
-            {
-                for (size_t col = 0; col < width; ++col)
-                    CopyPixel<N>(src + col * N, dst - col * N);
-                src += srcStride;
-                dst -= dstStride;
-            }
-        }
-
-        template<size_t N> void TransformImageRotate270(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride)
-        {
-            dst += (height - 1)*N;
-            for (size_t row = 0; row < height; ++row)
-            {
-                for (size_t col = 0; col < width; ++col)
-                    CopyPixel<N>(src + col * N, dst + col * dstStride);
-                src += srcStride;
-                dst -= N;
-            }
-        }
-
-        template<size_t N> void TransformImageTransposeRotate0(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride)
-        {
-            for (size_t row = 0; row < height; ++row)
-            {
-                for (size_t col = 0; col < width; ++col)
-                    CopyPixel<N>(src + col * N, dst + col * dstStride);
-                src += srcStride;
-                dst += N;
-            }
-        }
-
-        template<size_t N> void TransformImageTransposeRotate90(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride)
-        {
-            dst += (width - 1)*N;
-            for (size_t row = 0; row < height; ++row)
-            {
-                for (size_t col = 0; col < width; ++col)
-                    CopyPixel<N>(src + col * N, dst - col * N);
-                src += srcStride;
-                dst += dstStride;
-            }
-        }
-
-        template<size_t N> void TransformImageTransposeRotate180(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride)
-        {
-            dst += (width - 1)*dstStride + (height - 1)*N;
-            for (size_t row = 0; row < height; ++row)
-            {
-                for (size_t col = 0; col < width; ++col)
-                    CopyPixel<N>(src + col * N, dst - col * dstStride);
-                src += srcStride;
-                dst -= N;
-            }
-        }
-
-        template<size_t N> void TransformImageTransposeRotate270(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride)
-        {
-            size_t rowSize = width * N;
-            dst += (height - 1)*dstStride;
-            for (size_t row = 0; row < height; ++row)
-            {
-                memcpy(dst, src, rowSize);
-                src += srcStride;
-                dst -= dstStride;
-            }
-        }
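// Note on TransformImageTransposeRotate270 above: a transpose followed by a
// 270-degree rotation reduces to a vertical flip, so the kernel can memcpy
// whole rows (src walked top-down, dst bottom-up) instead of copying pixel
// by pixel; only TransformImageRotate0 shares this whole-row fast path.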
-
-        template<size_t N> void TransformImage(const uint8_t * src, size_t srcStride, size_t width, size_t height, SimdTransformType transform, uint8_t * dst, size_t dstStride)
-        {
-            typedef void(*TransformImagePtr)(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride);
-            static const TransformImagePtr transformImage[8] = { TransformImageRotate0<N>, TransformImageRotate90<N>, TransformImageRotate180<N>, TransformImageRotate270<N>,
-                TransformImageTransposeRotate0<N>, TransformImageTransposeRotate90<N>, TransformImageTransposeRotate180<N>, TransformImageTransposeRotate270<N> };
-            transformImage[(int)transform](src, srcStride, width, height, dst, dstStride);
-        };
-
-        void TransformImage(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t pixelSize, SimdTransformType transform, uint8_t * dst, size_t dstStride)
-        {
-            switch (pixelSize)
-            {
-            case 1: TransformImage<1>(src, srcStride, width, height, transform, dst, dstStride); break;
-            case 2: TransformImage<2>(src, srcStride, width, height, transform, dst, dstStride); break;
-            case 3: TransformImage<3>(src, srcStride, width, height, transform, dst, dstStride); break;
-            case 4: TransformImage<4>(src, srcStride, width, height, transform, dst, dstStride); break;
-            default: assert(0);
-            }
-        }
-    }
-}
diff --git a/src/3rd/Simd/Simd/SimdBaseWinograd.cpp b/src/3rd/Simd/Simd/SimdBaseWinograd.cpp
deleted file mode 100644
index fc99b3ff..00000000
--- a/src/3rd/Simd/Simd/SimdBaseWinograd.cpp
+++ /dev/null
@@ -1,2392 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2020 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
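The Winograd kernels in the file that follows come in three phases per kernel/block shape: SetFilter rearranges the weights into the transform domain once, SetInput transforms each input tile (writing its components dstStride apart), and SetOutput applies the inverse transform after an element-wise product accumulated over input channels (done elsewhere in the library). The payoff is fewer multiplies: F(4,3), the 1x3-kernel/1x4-block case below, needs 6 transform-domain products per 4 outputs instead of 12 for direct convolution. A minimal textbook F(2,3) demo of the same three-phase idea (this is the classic small instance, not the library's F(4,3) formulas):

#include <cassert>
#include <cmath>

// Textbook Winograd F(2,3): 2 outputs of a 3-tap convolution from 4 inputs
// using 4 multiplies instead of 6. Same phase order as the Set* code below.
void WinogradF23Demo()
{
    float d[4] = { 1, 2, 3, 4 };            // input tile
    float g[3] = { 0.5f, 0.25f, -1.0f };    // filter
    // Filter transform: u = G * g
    float u[4] = { g[0], (g[0] + g[1] + g[2]) * 0.5f, (g[0] - g[1] + g[2]) * 0.5f, g[2] };
    // Input transform: v = B^T * d
    float v[4] = { d[0] - d[2], d[1] + d[2], d[2] - d[1], d[1] - d[3] };
    // Element-wise product in the transform domain (the only multiplies)
    float m[4] = { u[0] * v[0], u[1] * v[1], u[2] * v[2], u[3] * v[3] };
    // Output transform: y = A^T * m
    float y[2] = { m[0] + m[1] + m[2], m[1] - m[2] - m[3] };
    // Check against direct convolution
    for (int i = 0; i < 2; ++i)
        assert(std::fabs(y[i] - (d[i] * g[0] + d[i + 1] * g[1] + d[i + 2] * g[2])) < 1e-5f);
}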
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdWinograd.h" - -namespace Simd -{ - namespace Base - { - void WinogradKernel1x3Block1x4SetFilter(const float* src, size_t size, float* dst, SimdBool trans) - { - if (trans) - { - for (size_t i = 0; i < size; i += 1) - Base::WinogradKernel1x3Block1x4SetFilter1t(src + i, dst + i, size); - } - else - { - for (size_t i = 0; i < size; i += 1, src += 3, dst += 1) - Base::WinogradKernel1x3Block1x4SetFilter1n(src, dst, size); - } - } - - //----------------------------------------------------------------------- - - SIMD_INLINE void WinogradKernel1x3Block1x4SetInput1n(const float src[6], float* dst, size_t stride) - { - dst[0 * stride] = src[0] * 4 - src[2] * 5 + src[4]; - dst[1 * stride] = -src[1] * 4 - src[2] * 4 + src[3] + src[4]; - dst[2 * stride] = src[1] * 4 - src[2] * 4 - src[3] + src[4]; - dst[3 * stride] = -src[1] * 2 - src[2] + src[3] * 2 + src[4]; - dst[4 * stride] = src[1] * 2 - src[2] - src[3] * 2 + src[4]; - dst[5 * stride] = src[1] * 4 - src[3] * 5 + src[5]; - } - - SIMD_INLINE void WinogradKernel1x3Block1x4SetInput1n(const float* src, size_t colB, size_t colE, float* dst, size_t dstStride) - { - float tmp[6] = { 0 }; - for (size_t col = colB; col < colE; ++col) - tmp[col] = src[col]; - WinogradKernel1x3Block1x4SetInput1n(tmp, dst, dstStride); - } - - SIMD_INLINE void WinogradKernel1x3Block1x4SetInput1t(const float* src, size_t srcC, float* dst, size_t dstStride) - { - for (size_t c = 0; c < srcC; ++c, src++, dst++) - { - float tmp[6]; - tmp[0] = src[0 * srcC]; - tmp[1] = src[1 * srcC]; - tmp[2] = src[2 * srcC]; - tmp[3] = src[3 * srcC]; - tmp[4] = src[4 * srcC]; - tmp[5] = src[5 * srcC]; - WinogradKernel1x3Block1x4SetInput1n(tmp, dst, dstStride); - } - } - - SIMD_INLINE void WinogradKernel1x3Block1x4SetInput1t(const float* src, size_t srcC, size_t colB, size_t colE, float* dst, size_t dstStride) - { - for (size_t c = 0; c < srcC; ++c, src++, dst++) - { - float tmp[6] = { 0 }; - for (size_t col = colB; col < colE; ++col) - tmp[col] = src[col * srcC]; - WinogradKernel1x3Block1x4SetInput1n(tmp, dst, dstStride); - } - } - - void WinogradKernel1x3Block1x4SetInput(const float* src, size_t srcChannels, size_t srcHeight, size_t srcWidth, - size_t padY, size_t padX, size_t padH, size_t padW, float* dst, size_t dstStride, SimdBool trans) - { - assert(padX == padW && padY == 0 && padH == 0 && (padX == 0 || padX == 1)); - size_t dstH = srcHeight; - size_t dstW = padX ? srcWidth : srcWidth - 2; - size_t dstW4 = dstW / 4 * 4; - size_t noseW = Simd::Min(6, dstW + 1); - size_t startX = padX ? 4 : 0; - if (padX) - { - if (dstW == dstW4) - dstW4 -= 4; - src -= 1 * (trans ? srcChannels : 1); - } - size_t tailW = dstW - dstW4 + (padX ? 
1 : 2); - if (trans) - { - for (size_t row = 0; row < dstH; row += 1) - { - size_t col = 0; - if (padX) - WinogradKernel1x3Block1x4SetInput1t(src, srcChannels, 1, noseW, dst, dstStride), dst += srcChannels; - for (col = startX; col < dstW4; col += 4) - WinogradKernel1x3Block1x4SetInput1t(src + col * srcChannels, srcChannels, dst, dstStride), dst += srcChannels; - if (col < dstW) - WinogradKernel1x3Block1x4SetInput1t(src + col * srcChannels, srcChannels, 0, tailW, dst, dstStride), dst += srcChannels; - src += srcWidth * srcChannels; - } - } - else - { - for (size_t c = 0; c < srcChannels; ++c) - { - for (size_t row = 0; row < dstH; row += 1) - { - size_t col = 0; - if (padX) - WinogradKernel1x3Block1x4SetInput1n(src, 1, noseW, dst++, dstStride); - for (col = startX; col < dstW4; col += 4) - WinogradKernel1x3Block1x4SetInput1n(src + col, dst++, dstStride); - if (col < dstW) - WinogradKernel1x3Block1x4SetInput1n(src + col, 0, tailW, dst++, dstStride); - src += srcWidth; - } - } - } - } - - //----------------------------------------------------------------------- - - SIMD_INLINE void WinogradKernel1x3Block1x4SetOutput1n(const float* src, size_t stride, float * dst) - { - float s[6]; - s[0] = src[0 * stride]; - s[1] = src[1 * stride]; - s[2] = src[2 * stride]; - s[3] = src[3 * stride]; - s[4] = src[4 * stride]; - s[5] = src[5 * stride]; - - dst[0] = s[0] + s[1] + s[2] + s[3] + s[4]; - dst[1] = s[1] - s[2] + 2 * s[3] - 2 * s[4]; - dst[2] = s[1] + s[2] + 4 * s[3] + 4 * s[4]; - dst[3] = s[1] - s[2] + 8 * s[3] - 8 * s[4] + s[5]; - } - - SIMD_INLINE void WinogradKernel1x3Block1x4SetOutput1n(const float* src, size_t srcStride, float* dst, size_t colE) - { - float tmp[4]; - WinogradKernel1x3Block1x4SetOutput1n(src, srcStride, tmp); - for (size_t col = 0; col < colE; ++col) - dst[col] = tmp[col]; - } - - SIMD_INLINE void WinogradKernel1x3Block1x4SetOutput1t(const float* src, size_t srcStride, float* dst, size_t dstC) - { - for (size_t d = 0; d < dstC; ++d, src++, dst++) - { - float tmp[4]; - WinogradKernel1x3Block1x4SetOutput1n(src, srcStride, tmp); - dst[0 * dstC] = tmp[0]; - dst[1 * dstC] = tmp[1]; - dst[2 * dstC] = tmp[2]; - dst[3 * dstC] = tmp[3]; - } - } - - SIMD_INLINE void WinogradKernel1x3Block1x4SetOutput1t(const float* src, size_t srcStride, float* dst, size_t dstC, size_t colE) - { - for (size_t d = 0; d < dstC; ++d, src++, dst++) - { - float tmp[4]; - WinogradKernel1x3Block1x4SetOutput1n(src, srcStride, tmp); - for (size_t col = 0; col < colE; ++col) - dst[col * dstC] = tmp[col]; - } - } - - void WinogradKernel1x3Block1x4SetOutput(const float* src, size_t srcStride, float* dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans) - { - size_t dstWidthFull = dstWidth / 4 * 4; - if (trans) - { - for (size_t row = 0; row < dstHeight; row += 1) - { - size_t col; - for (col = 0; col < dstWidthFull; col += 4) - WinogradKernel1x3Block1x4SetOutput1t(src, srcStride, dst + col * dstChannels, dstChannels), src += dstChannels; - if (col < dstWidth) - WinogradKernel1x3Block1x4SetOutput1t(src, srcStride, dst + col * dstChannels, dstChannels, dstWidth - col), src += dstChannels; - dst += dstWidth * dstChannels; - } - } - else - { - for (size_t c = 0; c < dstChannels; ++c) - { - for (size_t row = 0; row < dstHeight; row += 1) - { - size_t col; - for (col = 0; col < dstWidthFull; col += 4) - WinogradKernel1x3Block1x4SetOutput1n(src++, srcStride, dst + col); - if (col < dstWidth) - WinogradKernel1x3Block1x4SetOutput1n(src++, srcStride, dst + col, dstWidth - col); - dst += dstWidth; - } - } 
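// Layout note: in the transform domain the six components of one tile sit
// srcStride apart (s[i] = src[i * stride] in the helpers above), so these
// loops consume consecutive tiles simply by advancing src one element (or
// one channel block in the trans/NHWC branch) at a time.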
- } - } - - //----------------------------------------------------------------------- - - void WinogradKernel1x5Block1x4SetFilter(const float* src, size_t size, float* dst, SimdBool trans) - { - if (trans) - { - for (size_t i = 0; i < size; i += 1) - Base::WinogradKernel1x5Block1x4SetFilter1t(src + i, dst + i, size); - } - else - { - for (size_t i = 0; i < size; i += 1, src += 5, dst += 1) - Base::WinogradKernel1x5Block1x4SetFilter1n(src, dst, size); - } - } - - //----------------------------------------------------------------------- - - SIMD_INLINE void WinogradKernel1x5Block1x4SetInput1n(const float src[8], float* dst, size_t stride) - { - dst[0 * stride] = 36 * src[0] - 49 * src[2] + 14 * src[4] - src[6]; - dst[1 * stride] = 36 * (src[2] + src[1]) - 13 * (src[4] + src[3]) + src[6] + src[5]; - dst[2 * stride] = 36 * (src[2] - src[1]) - 13 * (src[4] - src[3]) + src[6] - src[5]; - dst[3 * stride] = 9 * (src[2] + 2 * src[1]) - 10 * (src[4] + 2 * src[3]) + src[6] + 2 * src[5]; - dst[4 * stride] = 9 * (src[2] - 2 * src[1]) - 10 * (src[4] - 2 * src[3]) + src[6] - 2 * src[5]; - dst[5 * stride] = 4 * (src[2] + 3 * src[1]) - 5 * (src[4] + 3 * src[3]) + src[6] + 3 * src[5]; - dst[6 * stride] = 4 * (src[2] - 3 * src[1]) - 5 * (src[4] - 3 * src[3]) + src[6] - 3 * src[5]; - dst[7 * stride] = -(36 * src[1] - 49 * src[3] + 14 * src[5] - src[7]); - } - - SIMD_INLINE void WinogradKernel1x5Block1x4SetInput1n(const float* src, size_t colB, size_t colE, float* dst, size_t dstStride) - { - float tmp[8] = { 0 }; - for (size_t col = colB; col < colE; ++col) - tmp[col] = src[col]; - WinogradKernel1x5Block1x4SetInput1n(tmp, dst, dstStride); - } - - SIMD_INLINE void WinogradKernel1x5Block1x4SetInput1t(const float* src, size_t srcC, float* dst, size_t dstStride) - { - for (size_t c = 0; c < srcC; ++c, src++, dst++) - { - float tmp[8]; - tmp[0] = src[0 * srcC]; - tmp[1] = src[1 * srcC]; - tmp[2] = src[2 * srcC]; - tmp[3] = src[3 * srcC]; - tmp[4] = src[4 * srcC]; - tmp[5] = src[5 * srcC]; - tmp[6] = src[6 * srcC]; - tmp[7] = src[7 * srcC]; - WinogradKernel1x5Block1x4SetInput1n(tmp, dst, dstStride); - } - } - - SIMD_INLINE void WinogradKernel1x5Block1x4SetInput1t(const float* src, size_t srcC, size_t colB, size_t colE, float* dst, size_t dstStride) - { - for (size_t c = 0; c < srcC; ++c, src++, dst++) - { - float tmp[8] = { 0 }; - for (size_t col = colB; col < colE; ++col) - tmp[col] = src[col * srcC]; - WinogradKernel1x5Block1x4SetInput1n(tmp, dst, dstStride); - } - } - - void WinogradKernel1x5Block1x4SetInput(const float* src, size_t srcChannels, size_t srcHeight, size_t srcWidth, - size_t padY, size_t padX, size_t padH, size_t padW, float* dst, size_t dstStride, SimdBool trans) - { - assert(padX == padW && padY == 0 && padH == 0 && (padX == 0 || padX == 2)); - size_t dstH = srcHeight; - size_t dstW = padX ? srcWidth : srcWidth - 4; - size_t dstW4 = dstW / 4 * 4; - size_t noseW = Simd::Min(8, dstW + 2); - size_t startX = padX ? 4 : 0; - if (padX) - { - if (dstW == dstW4 || dstW == dstW4 + 1) - dstW4 -= 4; - src -= 2 * (trans ? srcChannels : 1); - } - size_t tailW = dstW - dstW4 + (padX ? 
2 : 4); - if (trans) - { - for (size_t row = 0; row < dstH; row += 1) - { - size_t col = 0; - if (padX) - WinogradKernel1x5Block1x4SetInput1t(src, srcChannels, 2, noseW, dst, dstStride), dst += srcChannels; - for (col = startX; col < dstW4; col += 4) - WinogradKernel1x5Block1x4SetInput1t(src + col * srcChannels, srcChannels, dst, dstStride), dst += srcChannels; - for (size_t tail = tailW; col < dstW; col += 4, tail -= 4) - WinogradKernel1x5Block1x4SetInput1t(src + col * srcChannels, srcChannels, 0, tail, dst, dstStride), dst += srcChannels; - src += srcWidth * srcChannels; - } - } - else - { - for (size_t c = 0; c < srcChannels; ++c) - { - for (size_t row = 0; row < dstH; row += 1) - { - size_t col = 0; - if (padX) - WinogradKernel1x5Block1x4SetInput1n(src, 2, noseW, dst++, dstStride); - for (col = startX; col < dstW4; col += 4) - WinogradKernel1x5Block1x4SetInput1n(src + col, dst++, dstStride); - for (size_t tail = tailW; col < dstW; col += 4, tail -= 4) - WinogradKernel1x5Block1x4SetInput1n(src + col, 0, tail, dst++, dstStride); - src += srcWidth; - } - } - } - } - - //----------------------------------------------------------------------- - - SIMD_INLINE void WinogradKernel1x5Block1x4SetOutput1n(const float* src, size_t stride, float* dst) - { - float s[8]; - s[0] = src[0 * stride]; - s[1] = src[1 * stride]; - s[2] = src[2 * stride]; - s[3] = src[3 * stride]; - s[4] = src[4 * stride]; - s[5] = src[5 * stride]; - s[6] = src[6 * stride]; - s[7] = src[7 * stride]; - - dst[0] = s[0] + s[1] + s[2] + s[3] + s[4] + s[5] + s[6]; - dst[1] = s[1] - s[2] + 2 * s[3] - 2 * s[4] + 3 * s[5] - 3 * s[6]; - dst[2] = s[1] + s[2] + 4 * s[3] + 4 * s[4] + 9 * s[5] + 9 * s[6]; - dst[3] = s[1] - s[2] + 8 * s[3] - 8 * s[4] + 27 * s[5] - 27 * s[6] + s[7]; - } - - SIMD_INLINE void WinogradKernel1x5Block1x4SetOutput1n(const float* src, size_t srcStride, float* dst, size_t colE) - { - float tmp[4]; - WinogradKernel1x5Block1x4SetOutput1n(src, srcStride, tmp); - for (size_t col = 0; col < colE; ++col) - dst[col] = tmp[col]; - } - - SIMD_INLINE void WinogradKernel1x5Block1x4SetOutput1t(const float* src, size_t srcStride, float* dst, size_t dstC) - { - for (size_t d = 0; d < dstC; ++d, src++, dst++) - { - float tmp[4]; - WinogradKernel1x5Block1x4SetOutput1n(src, srcStride, tmp); - dst[0 * dstC] = tmp[0]; - dst[1 * dstC] = tmp[1]; - dst[2 * dstC] = tmp[2]; - dst[3 * dstC] = tmp[3]; - } - } - - SIMD_INLINE void WinogradKernel1x5Block1x4SetOutput1t(const float* src, size_t srcStride, float* dst, size_t dstC, size_t colE) - { - for (size_t d = 0; d < dstC; ++d, src++, dst++) - { - float tmp[4]; - WinogradKernel1x5Block1x4SetOutput1n(src, srcStride, tmp); - for (size_t col = 0; col < colE; ++col) - dst[col * dstC] = tmp[col]; - } - } - - void WinogradKernel1x5Block1x4SetOutput(const float* src, size_t srcStride, float* dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans) - { - size_t dstWidthFull = dstWidth / 4 * 4; - if (trans) - { - for (size_t row = 0; row < dstHeight; row += 1) - { - size_t col; - for (col = 0; col < dstWidthFull; col += 4) - WinogradKernel1x5Block1x4SetOutput1t(src, srcStride, dst + col * dstChannels, dstChannels), src += dstChannels; - if (col < dstWidth) - WinogradKernel1x5Block1x4SetOutput1t(src, srcStride, dst + col * dstChannels, dstChannels, dstWidth - col), src += dstChannels; - dst += dstWidth * dstChannels; - } - } - else - { - for (size_t c = 0; c < dstChannels; ++c) - { - for (size_t row = 0; row < dstHeight; row += 1) - { - size_t col; - for (col = 0; col < 
dstWidthFull; col += 4) - WinogradKernel1x5Block1x4SetOutput1n(src++, srcStride, dst + col); - if (col < dstWidth) - WinogradKernel1x5Block1x4SetOutput1n(src++, srcStride, dst + col, dstWidth - col); - dst += dstWidth; - } - } - } - } - - //----------------------------------------------------------------------- - - void WinogradKernel2x2Block2x2SetFilter(const float* src, size_t size, float* dst, SimdBool trans) - { - if (trans) - { - for (size_t i = 0; i < size; i += 1) - Base::WinogradKernel2x2Block2x2SetFilter1t(src + i, dst + i, size); - } - else - { - for (size_t i = 0; i < size; i += 1, src += 4, dst += 1) - Base::WinogradKernel2x2Block2x2SetFilter1n(src, dst, size); - } - } - - //----------------------------------------------------------------------- - - SIMD_INLINE void WinogradKernel2x2Block2x2SetInput1(const float src[16], float* dst, size_t stride) - { - dst[0 * stride] = src[0] - src[1] - src[3] + src[4]; - dst[1 * stride] = src[1] - src[4]; - dst[2 * stride] = src[2] - src[1] + src[4] - src[5]; - dst[3 * stride] = src[3] - src[4]; - dst[4 * stride] = src[4]; - dst[5 * stride] = src[5] - src[4]; - dst[6 * stride] = src[4] - src[3] + src[6] - src[7]; - dst[7 * stride] = src[7] - src[4]; - dst[8 * stride] = src[4] - src[5] + src[8] - src[7]; - } - - SIMD_INLINE void WinogradKernel2x2Block2x2SetInput1n(const float* src, size_t srcStride, float* dst, size_t dstStride) - { - float tmp[9]; - tmp[0] = src[0 * srcStride + 0]; - tmp[1] = src[0 * srcStride + 1]; - tmp[2] = src[0 * srcStride + 2]; - tmp[3] = src[1 * srcStride + 0]; - tmp[4] = src[1 * srcStride + 1]; - tmp[5] = src[1 * srcStride + 2]; - tmp[6] = src[2 * srcStride + 0]; - tmp[7] = src[2 * srcStride + 1]; - tmp[8] = src[2 * srcStride + 2]; - - WinogradKernel2x2Block2x2SetInput1(tmp, dst, dstStride); - } - - SIMD_INLINE void WinogradKernel2x2Block2x2SetInput1n(const float* src, size_t srcStride, size_t rowB, size_t rowE, size_t colB, size_t colE, float* dst, size_t dstStride) - { - float tmp[9] = { 0 }; - for (size_t row = rowB; row < rowE; ++row) - for (size_t col = colB; col < colE; ++col) - tmp[row * 3 + col] = src[row * srcStride + col]; - WinogradKernel2x2Block2x2SetInput1(tmp, dst, dstStride); - } - - SIMD_INLINE void WinogradKernel2x2Block2x2SetInput1t(const float* src, size_t srcW, size_t srcC, float* dst, size_t dstStride) - { - size_t srcS = srcW * srcC; - for (size_t c = 0; c < srcC; ++c, src++, dst++) - { - float tmp[9]; - tmp[0] = src[0 * srcS + 0 * srcC]; - tmp[1] = src[0 * srcS + 1 * srcC]; - tmp[2] = src[0 * srcS + 2 * srcC]; - tmp[3] = src[1 * srcS + 0 * srcC]; - tmp[4] = src[1 * srcS + 1 * srcC]; - tmp[5] = src[1 * srcS + 2 * srcC]; - tmp[6] = src[2 * srcS + 0 * srcC]; - tmp[7] = src[2 * srcS + 1 * srcC]; - tmp[8] = src[2 * srcS + 2 * srcC]; - WinogradKernel2x2Block2x2SetInput1(tmp, dst, dstStride); - } - } - - SIMD_INLINE void WinogradKernel2x2Block2x2SetInput1t(const float* src, size_t srcW, size_t srcC, size_t rowB, size_t rowE, size_t colB, size_t colE, float* dst, size_t dstStride) - { - size_t srcS = srcW * srcC; - for (size_t c = 0; c < srcC; ++c, src++, dst++) - { - float tmp[9] = { 0 }; - for (size_t row = rowB; row < rowE; ++row) - for (size_t col = colB; col < colE; ++col) - tmp[row * 3 + col] = src[row * srcS + col * srcC]; - WinogradKernel2x2Block2x2SetInput1(tmp, dst, dstStride); - } - } - - void WinogradKernel2x2Block2x2SetInput(const float* src, size_t srcChannels, size_t srcHeight, size_t srcWidth, - size_t padY, size_t padX, size_t padH, size_t padW, float* dst, size_t dstStride, SimdBool 
trans) - { - assert(padY == padX && padW == padH && (padY + padH == 0 || padY + padH == 1)); - size_t dstHeight = srcHeight - 1 + padY + padH; - size_t dstWidth = srcWidth - 1 + padX + padW; - size_t dstHeightFull = AlignLo(dstHeight, 2); - size_t dstWidthFull = AlignLo(dstWidth, 2); - size_t noseW = Simd::Min(3, dstWidth + 1); - size_t noseH = Simd::Min(3, dstHeight + 1); - size_t startY = padY ? 2 : 0; - size_t startX = padX ? 2 : 0; - if (padY || padH) - { - if (dstHeight == dstHeightFull) - dstHeightFull -= 2; - if (dstWidth == dstWidthFull) - dstWidthFull -= 2; - if(padY) - src -= (srcWidth + 1) * (trans ? srcChannels : 1); - } - size_t tailW = dstWidth - dstWidthFull + (padW ? 0 : 1); - size_t tailH = dstHeight - dstHeightFull + (padH ? 0 : 1); - if (trans) - { - size_t row = 0, col = 0; - if (padY) - { - if (padX) - WinogradKernel2x2Block2x2SetInput1t(src, srcWidth, srcChannels, 1, noseH, 1, noseW, dst, dstStride), dst += srcChannels; - for (col = startX; col < dstWidthFull; col += 2) - WinogradKernel2x2Block2x2SetInput1t(src + col * srcChannels, srcWidth, srcChannels, 1, noseH, 0, 3, dst, dstStride), dst += srcChannels; - if (col < dstWidth) - WinogradKernel2x2Block2x2SetInput1t(src + col * srcChannels, srcWidth, srcChannels, 1, noseH, 0, tailW, dst, dstStride), dst += srcChannels; - } - for (row = startY; row < dstHeightFull; row += 2) - { - if (padX) - WinogradKernel2x2Block2x2SetInput1t(src + row * srcWidth * srcChannels, srcWidth, srcChannels, 0, 3, 1, noseW, dst, dstStride), dst += srcChannels; - for (col = startX; col < dstWidthFull; col += 2) - WinogradKernel2x2Block2x2SetInput1t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, dst, dstStride), dst += srcChannels; - if (col < dstWidth) - WinogradKernel2x2Block2x2SetInput1t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, 0, 3, 0, tailW, dst, dstStride), dst += srcChannels; - } - if (row < dstHeight) - { - if (padX) - WinogradKernel2x2Block2x2SetInput1t(src + row * srcWidth * srcChannels, srcWidth, srcChannels, 0, tailH, 1, noseW, dst, dstStride), dst += srcChannels; - for (col = startX; col < dstWidthFull; col += 2) - WinogradKernel2x2Block2x2SetInput1t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, 0, tailH, 0, 3, dst, dstStride), dst += srcChannels; - if (col < dstWidth) - WinogradKernel2x2Block2x2SetInput1t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, 0, tailH, 0, tailW, dst, dstStride), dst += srcChannels; - } - } - else - { - for (size_t c = 0; c < srcChannels; ++c) - { - size_t row = 0, col = 0; - if (padY) - { - if (padX) - WinogradKernel2x2Block2x2SetInput1n(src, srcWidth, 1, noseH, 1, noseW, dst++, dstStride); - for (col = startX; col < dstWidthFull; col += 2) - WinogradKernel2x2Block2x2SetInput1n(src + col, srcWidth, 1, noseH, 0, 3, dst++, dstStride); - if (col < dstWidth) - WinogradKernel2x2Block2x2SetInput1n(src + col, srcWidth, 1, noseH, 0, tailW, dst++, dstStride); - } - for (row = startY; row < dstHeightFull; row += 2) - { - if (padX) - WinogradKernel2x2Block2x2SetInput1n(src + row * srcWidth, srcWidth, 0, 3, 1, noseW, dst++, dstStride); - for (col = startX; col < dstWidthFull; col += 2) - WinogradKernel2x2Block2x2SetInput1n(src + row * srcWidth + col, srcWidth, dst++, dstStride); - if (col < dstWidth) - WinogradKernel2x2Block2x2SetInput1n(src + row * srcWidth + col, srcWidth, 0, 3, 0, tailW, dst++, dstStride); - } - if (row < dstHeight) - { - if (padX) - WinogradKernel2x2Block2x2SetInput1n(src + row * srcWidth, srcWidth, 0, tailH, 
1, noseW, dst++, dstStride); - for (col = startX; col < dstWidthFull; col += 2) - WinogradKernel2x2Block2x2SetInput1n(src + row * srcWidth + col, srcWidth, 0, tailH, 0, 3, dst++, dstStride); - if (col < dstWidth) - WinogradKernel2x2Block2x2SetInput1n(src + row * srcWidth + col, srcWidth, 0, tailH, 0, tailW, dst++, dstStride); - } - src += srcWidth * srcHeight; - } - } - } - - //----------------------------------------------------------------------- - - SIMD_INLINE void WinogradKernel2x2Block2x2SetOutput1(const float* src, size_t stride, float dst[4]) - { - float s[9]; - s[0] = src[0 * stride]; - s[1] = src[1 * stride]; - s[2] = src[2 * stride]; - s[3] = src[3 * stride]; - s[4] = src[4 * stride]; - s[5] = src[5 * stride]; - s[6] = src[6 * stride]; - s[7] = src[7 * stride]; - s[8] = src[8 * stride]; - - dst[0] = s[0] + s[1] + s[3] + s[4]; - dst[1] = s[1] + s[2] + s[4] + s[5]; - dst[2] = s[3] + s[4] + s[6] + s[7]; - dst[3] = s[4] + s[5] + s[7] + s[8]; - } - - SIMD_INLINE void WinogradKernel2x2Block2x2SetOutput1n(const float* src, size_t srcStride, float* dst, size_t dstStride) - { - float tmp[4]; - WinogradKernel2x2Block2x2SetOutput1(src, srcStride, tmp); - dst[0 * dstStride + 0] = tmp[0]; - dst[0 * dstStride + 1] = tmp[1]; - dst[1 * dstStride + 0] = tmp[2]; - dst[1 * dstStride + 1] = tmp[3]; - } - - SIMD_INLINE void WinogradKernel2x2Block2x2SetOutput1n(const float* src, size_t srcStride, float* dst, size_t dstStride, size_t rowE, size_t colE) - { - float tmp[4]; - WinogradKernel2x2Block2x2SetOutput1(src, srcStride, tmp); - for (size_t row = 0; row < rowE; ++row) - for (size_t col = 0; col < colE; ++col) - dst[row * dstStride + col] = tmp[row * 2 + col]; - } - - SIMD_INLINE void WinogradKernel2x2Block2x2SetOutput1t(const float* src, size_t srcStride, float* dst, size_t dstW, size_t dstC) - { - size_t dstS = dstW * dstC; - for (size_t d = 0; d < dstC; ++d, src++, dst++) - { - float tmp[4]; - WinogradKernel2x2Block2x2SetOutput1(src, srcStride, tmp); - dst[0 * dstS + 0 * dstC] = tmp[0]; - dst[0 * dstS + 1 * dstC] = tmp[1]; - dst[1 * dstS + 0 * dstC] = tmp[2]; - dst[1 * dstS + 1 * dstC] = tmp[3]; - } - } - - SIMD_INLINE void WinogradKernel2x2Block2x2SetOutput1t(const float* src, size_t srcStride, float* dst, size_t dstW, size_t dstC, size_t rowE, size_t colE) - { - size_t dstS = dstW * dstC; - for (size_t d = 0; d < dstC; ++d, src++, dst++) - { - float tmp[4]; - WinogradKernel2x2Block2x2SetOutput1(src, srcStride, tmp); - for (size_t row = 0; row < rowE; ++row) - for (size_t col = 0; col < colE; ++col) - dst[row * dstS + col * dstC] = tmp[row * 2 + col]; - } - } - - void WinogradKernel2x2Block2x2SetOutput(const float* src, size_t srcStride, float* dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans) - { - size_t dstHeightFull = AlignLo(dstHeight, 2); - size_t dstWidthFull = AlignLo(dstWidth, 2); - if (trans) - { - size_t row, col; - for (row = 0; row < dstHeightFull; row += 2) - { - for (col = 0; col < dstWidthFull; col += 2) - WinogradKernel2x2Block2x2SetOutput1t(src, srcStride, dst + (row * dstWidth + col) * dstChannels, dstWidth, dstChannels), src += dstChannels; - if (col < dstWidth) - WinogradKernel2x2Block2x2SetOutput1t(src, srcStride, dst + (row * dstWidth + col) * dstChannels, dstWidth, dstChannels, 2, dstWidth - col), src += dstChannels; - } - if (row < dstHeight) - { - for (col = 0; col < dstWidthFull; col += 2) - WinogradKernel2x2Block2x2SetOutput1t(src, srcStride, dst + (row * dstWidth + col) * dstChannels, dstWidth, dstChannels, dstHeight - row, 2), src += 
dstChannels; - if (col < dstWidth) - WinogradKernel2x2Block2x2SetOutput1t(src, srcStride, dst + (row * dstWidth + col) * dstChannels, dstWidth, dstChannels, dstHeight - row, dstWidth - col), src += dstChannels; - } - } - else - { - for (size_t c = 0; c < dstChannels; ++c) - { - size_t row, col; - for (row = 0; row < dstHeightFull; row += 2) - { - for (col = 0; col < dstWidthFull; col += 2) - WinogradKernel2x2Block2x2SetOutput1n(src++, srcStride, dst + row * dstWidth + col, dstWidth); - if (col < dstWidth) - WinogradKernel2x2Block2x2SetOutput1n(src++, srcStride, dst + row * dstWidth + col, dstWidth, 2, dstWidth - col); - } - if (row < dstHeight) - { - for (col = 0; col < dstWidthFull; col += 2) - WinogradKernel2x2Block2x2SetOutput1n(src++, srcStride, dst + row * dstWidth + col, dstWidth, dstHeight - row, 2); - if (col < dstWidth) - WinogradKernel2x2Block2x2SetOutput1n(src++, srcStride, dst + row * dstWidth + col, dstWidth, dstHeight - row, dstWidth - col); - } - dst += dstHeight * dstWidth; - } - } - } - - //----------------------------------------------------------------------- - - void WinogradKernel2x2Block4x4SetFilter(const float* src, size_t size, float* dst, SimdBool trans) - { - if (trans) - { - for (size_t i = 0; i < size; i += 1) - Base::WinogradKernel2x2Block4x4SetFilter1t(src + i, dst + i, size); - } - else - { - for (size_t i = 0; i < size; i += 1, src += 4, dst += 1) - Base::WinogradKernel2x2Block4x4SetFilter1n(src, dst, size); - } - } - - //----------------------------------------------------------------------- - - SIMD_INLINE void WinogradKernel2x2Block4x4SetInput1(const float src[25], float* dst, size_t stride) - { - float tmp[25]; - tmp[0] = 2 * src[0] - src[5] - 2 * src[10] + src[15]; - tmp[1] = 2 * src[1] - src[6] - 2 * src[11] + src[16]; - tmp[2] = 2 * src[2] - src[7] - 2 * src[12] + src[17]; - tmp[3] = 2 * src[3] - src[8] - 2 * src[13] + src[18]; - tmp[4] = 2 * src[4] - src[9] - 2 * src[14] + src[19]; - tmp[5] = src[15] - 2 * src[5] - src[10]; - tmp[6] = src[16] - 2 * src[6] - src[11]; - tmp[7] = src[17] - 2 * src[7] - src[12]; - tmp[8] = src[18] - 2 * src[8] - src[13]; - tmp[9] = src[19] - 2 * src[9] - src[14]; - tmp[10] = 2 * src[5] - 3 * src[10] + src[15]; - tmp[11] = 2 * src[6] - 3 * src[11] + src[16]; - tmp[12] = 2 * src[7] - 3 * src[12] + src[17]; - tmp[13] = 2 * src[8] - 3 * src[13] + src[18]; - tmp[14] = 2 * src[9] - 3 * src[14] + src[19]; - tmp[15] = src[15] - src[5]; - tmp[16] = src[16] - src[6]; - tmp[17] = src[17] - src[7]; - tmp[18] = src[18] - src[8]; - tmp[19] = src[19] - src[9]; - tmp[20] = 2 * src[5] - src[10] - 2 * src[15] + src[20]; - tmp[21] = 2 * src[6] - src[11] - 2 * src[16] + src[21]; - tmp[22] = 2 * src[7] - src[12] - 2 * src[17] + src[22]; - tmp[23] = 2 * src[8] - src[13] - 2 * src[18] + src[23]; - tmp[24] = 2 * src[9] - src[14] - 2 * src[19] + src[24]; - - dst[0 * stride] = 2 * tmp[0] - tmp[1] - 2 * tmp[2] + tmp[3]; - dst[1 * stride] = tmp[3] - 2 * tmp[1] - tmp[2]; - dst[2 * stride] = 2 * tmp[1] - 3 * tmp[2] + tmp[3]; - dst[3 * stride] = tmp[3] - tmp[1]; - dst[4 * stride] = 2 * tmp[1] - tmp[2] - 2 * tmp[3] + tmp[4]; - dst[5 * stride] = 2 * tmp[5] - tmp[6] - 2 * tmp[7] + tmp[8]; - dst[6 * stride] = tmp[8] - 2 * tmp[6] - tmp[7]; - dst[7 * stride] = 2 * tmp[6] - 3 * tmp[7] + tmp[8]; - dst[8 * stride] = tmp[8] - tmp[6]; - dst[9 * stride] = 2 * tmp[6] - tmp[7] - 2 * tmp[8] + tmp[9]; - dst[10 * stride] = 2 * tmp[10] - tmp[11] - 2 * tmp[12] + tmp[13]; - dst[11 * stride] = tmp[13] - 2 * tmp[11] - tmp[12]; - dst[12 * stride] = 2 * tmp[11] - 3 * tmp[12] 
+ tmp[13]; - dst[13 * stride] = tmp[13] - tmp[11]; - dst[14 * stride] = 2 * tmp[11] - tmp[12] - 2 * tmp[13] + tmp[14]; - dst[15 * stride] = 2 * tmp[15] - tmp[16] - 2 * tmp[17] + tmp[18]; - dst[16 * stride] = tmp[18] - 2 * tmp[16] - tmp[17]; - dst[17 * stride] = 2 * tmp[16] - 3 * tmp[17] + tmp[18]; - dst[18 * stride] = tmp[18] - tmp[16]; - dst[19 * stride] = 2 * tmp[16] - tmp[17] - 2 * tmp[18] + tmp[19]; - dst[20 * stride] = 2 * tmp[20] - tmp[21] - 2 * tmp[22] + tmp[23]; - dst[21 * stride] = tmp[23] - 2 * tmp[21] - tmp[22]; - dst[22 * stride] = 2 * tmp[21] - 3 * tmp[22] + tmp[23]; - dst[23 * stride] = tmp[23] - tmp[21]; - dst[24 * stride] = 2 * tmp[21] - tmp[22] - 2 * tmp[23] + tmp[24]; - } - - SIMD_INLINE void WinogradKernel2x2Block4x4SetInput1n(const float* src, size_t srcStride, float* dst, size_t dstStride) - { - float tmp[25]; - tmp[0] = src[0 * srcStride + 0]; - tmp[1] = src[0 * srcStride + 1]; - tmp[2] = src[0 * srcStride + 2]; - tmp[3] = src[0 * srcStride + 3]; - tmp[4] = src[0 * srcStride + 4]; - tmp[5] = src[1 * srcStride + 0]; - tmp[6] = src[1 * srcStride + 1]; - tmp[7] = src[1 * srcStride + 2]; - tmp[8] = src[1 * srcStride + 3]; - tmp[9] = src[1 * srcStride + 4]; - tmp[10] = src[2 * srcStride + 0]; - tmp[11] = src[2 * srcStride + 1]; - tmp[12] = src[2 * srcStride + 2]; - tmp[13] = src[2 * srcStride + 3]; - tmp[14] = src[2 * srcStride + 4]; - tmp[15] = src[3 * srcStride + 0]; - tmp[16] = src[3 * srcStride + 1]; - tmp[17] = src[3 * srcStride + 2]; - tmp[18] = src[3 * srcStride + 3]; - tmp[19] = src[3 * srcStride + 4]; - tmp[20] = src[4 * srcStride + 0]; - tmp[21] = src[4 * srcStride + 1]; - tmp[22] = src[4 * srcStride + 2]; - tmp[23] = src[4 * srcStride + 3]; - tmp[24] = src[4 * srcStride + 4]; - WinogradKernel2x2Block4x4SetInput1(tmp, dst, dstStride); - } - - SIMD_INLINE void WinogradKernel2x2Block4x4SetInput1n(const float* src, size_t srcStride, size_t rowB, size_t rowE, size_t colB, size_t colE, float* dst, size_t dstStride) - { - float tmp[25] = { 0 }; - for (size_t row = rowB; row < rowE; ++row) - for (size_t col = colB; col < colE; ++col) - tmp[row * 5 + col] = src[row * srcStride + col]; - WinogradKernel2x2Block4x4SetInput1(tmp, dst, dstStride); - } - - SIMD_INLINE void WinogradKernel2x2Block4x4SetInput1t(const float* src, size_t srcW, size_t srcC, float* dst, size_t dstStride) - { - size_t srcS = srcW * srcC; - for (size_t c = 0; c < srcC; ++c, src++, dst++) - { - float tmp[25]; - tmp[0] = src[0 * srcS + 0 * srcC]; - tmp[1] = src[0 * srcS + 1 * srcC]; - tmp[2] = src[0 * srcS + 2 * srcC]; - tmp[3] = src[0 * srcS + 3 * srcC]; - tmp[4] = src[0 * srcS + 4 * srcC]; - tmp[5] = src[1 * srcS + 0 * srcC]; - tmp[6] = src[1 * srcS + 1 * srcC]; - tmp[7] = src[1 * srcS + 2 * srcC]; - tmp[8] = src[1 * srcS + 3 * srcC]; - tmp[9] = src[1 * srcS + 4 * srcC]; - tmp[10] = src[2 * srcS + 0 * srcC]; - tmp[11] = src[2 * srcS + 1 * srcC]; - tmp[12] = src[2 * srcS + 2 * srcC]; - tmp[13] = src[2 * srcS + 3 * srcC]; - tmp[14] = src[2 * srcS + 4 * srcC]; - tmp[15] = src[3 * srcS + 0 * srcC]; - tmp[16] = src[3 * srcS + 1 * srcC]; - tmp[17] = src[3 * srcS + 2 * srcC]; - tmp[18] = src[3 * srcS + 3 * srcC]; - tmp[19] = src[3 * srcS + 4 * srcC]; - tmp[20] = src[4 * srcS + 0 * srcC]; - tmp[21] = src[4 * srcS + 1 * srcC]; - tmp[22] = src[4 * srcS + 2 * srcC]; - tmp[23] = src[4 * srcS + 3 * srcC]; - tmp[24] = src[4 * srcS + 4 * srcC]; - WinogradKernel2x2Block4x4SetInput1(tmp, dst, dstStride); - } - } - - SIMD_INLINE void WinogradKernel2x2Block4x4SetInput1t(const float* src, size_t srcW, size_t srcC, size_t 
rowB, size_t rowE, size_t colB, size_t colE, float* dst, size_t dstStride) - { - size_t srcS = srcW * srcC; - for (size_t c = 0; c < srcC; ++c, src++, dst++) - { - float tmp[25] = { 0 }; - for (size_t row = rowB; row < rowE; ++row) - for (size_t col = colB; col < colE; ++col) - tmp[row * 5 + col] = src[row * srcS + col * srcC]; - WinogradKernel2x2Block4x4SetInput1(tmp, dst, dstStride); - } - } - - void WinogradKernel2x2Block4x4SetInput(const float* src, size_t srcChannels, size_t srcHeight, size_t srcWidth, - size_t padY, size_t padX, size_t padH, size_t padW, float* dst, size_t dstStride, SimdBool trans) - { - assert(padY == padX && padW == padH && (padY + padH == 0 || padY + padH == 1)); - size_t dstHeight = srcHeight - 1 + padY + padH; - size_t dstWidth = srcWidth - 1 + padX + padW; - size_t dstHeightFull = AlignLo(dstHeight, 4); - size_t dstWidthFull = AlignLo(dstWidth, 4); - size_t noseW = Simd::Min(5, dstWidth + 1); - size_t noseH = Simd::Min(5, dstHeight + 1); - size_t startY = padY ? 4 : 0; - size_t startX = padX ? 4 : 0; - if (padY || padH) - { - if (dstHeight == dstHeightFull) - dstHeightFull -= 4; - if (dstWidth == dstWidthFull) - dstWidthFull -= 4; - if (padY) - src -= (srcWidth + 1) * (trans ? srcChannels : 1); - } - size_t tailW = dstWidth - dstWidthFull + (padW ? 0 : 1); - size_t tailH = dstHeight - dstHeightFull + (padH ? 0 : 1); - if (trans) - { - size_t row = 0, col = 0; - if (padY) - { - if (padX) - WinogradKernel2x2Block4x4SetInput1t(src, srcWidth, srcChannels, 1, noseH, 1, noseW, dst, dstStride), dst += srcChannels; - for (col = startX; col < dstWidthFull; col += 4) - WinogradKernel2x2Block4x4SetInput1t(src + col * srcChannels, srcWidth, srcChannels, 1, noseH, 0, 5, dst, dstStride), dst += srcChannels; - if (col < dstWidth) - WinogradKernel2x2Block4x4SetInput1t(src + col * srcChannels, srcWidth, srcChannels, 1, noseH, 0, tailW, dst, dstStride), dst += srcChannels; - } - for (row = startY; row < dstHeightFull; row += 4) - { - if (padX) - WinogradKernel2x2Block4x4SetInput1t(src + row * srcWidth * srcChannels, srcWidth, srcChannels, 0, 5, 1, noseW, dst, dstStride), dst += srcChannels; - for (col = startX; col < dstWidthFull; col += 4) - WinogradKernel2x2Block4x4SetInput1t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, dst, dstStride), dst += srcChannels; - if (col < dstWidth) - WinogradKernel2x2Block4x4SetInput1t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, 0, 5, 0, tailW, dst, dstStride), dst += srcChannels; - } - if (row < dstHeight) - { - if (padX) - WinogradKernel2x2Block4x4SetInput1t(src + row * srcWidth * srcChannels, srcWidth, srcChannels, 0, tailH, 1, noseW, dst, dstStride), dst += srcChannels; - for (col = startX; col < dstWidthFull; col += 4) - WinogradKernel2x2Block4x4SetInput1t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, 0, tailH, 0, 5, dst, dstStride), dst += srcChannels; - if (col < dstWidth) - WinogradKernel2x2Block4x4SetInput1t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, 0, tailH, 0, tailW, dst, dstStride), dst += srcChannels; - } - } - else - { - for (size_t c = 0; c < srcChannels; ++c) - { - size_t row = 0, col = 0; - if (padY) - { - if (padX) - WinogradKernel2x2Block4x4SetInput1n(src, srcWidth, 1, noseH, 1, noseW, dst++, dstStride); - for (col = startX; col < dstWidthFull; col += 4) - WinogradKernel2x2Block4x4SetInput1n(src + col, srcWidth, 1, noseH, 0, 5, dst++, dstStride); - if (col < dstWidth) - WinogradKernel2x2Block4x4SetInput1n(src + col, srcWidth, 1, noseH, 0, 
tailW, dst++, dstStride); - } - for (row = startY; row < dstHeightFull; row += 4) - { - if (padX) - WinogradKernel2x2Block4x4SetInput1n(src + row * srcWidth, srcWidth, 0, 5, 1, noseW, dst++, dstStride); - for (col = startX; col < dstWidthFull; col += 4) - WinogradKernel2x2Block4x4SetInput1n(src + row * srcWidth + col, srcWidth, dst++, dstStride); - if (col < dstWidth) - WinogradKernel2x2Block4x4SetInput1n(src + row * srcWidth + col, srcWidth, 0, 5, 0, tailW, dst++, dstStride); - } - if (row < dstHeight) - { - if (padX) - WinogradKernel2x2Block4x4SetInput1n(src + row * srcWidth, srcWidth, 0, tailH, 1, noseW, dst++, dstStride); - for (col = startX; col < dstWidthFull; col += 4) - WinogradKernel2x2Block4x4SetInput1n(src + row * srcWidth + col, srcWidth, 0, tailH, 0, 5, dst++, dstStride); - if (col < dstWidth) - WinogradKernel2x2Block4x4SetInput1n(src + row * srcWidth + col, srcWidth, 0, tailH, 0, tailW, dst++, dstStride); - } - src += srcWidth * srcHeight; - } - } - } - - //----------------------------------------------------------------------- - - SIMD_INLINE void WinogradKernel2x2Block4x4SetOutput1(const float* src, size_t stride, float dst[4]) - { - float s[25]; - s[0] = src[0 * stride]; - s[1] = src[1 * stride]; - s[2] = src[2 * stride]; - s[3] = src[3 * stride]; - s[4] = src[4 * stride]; - s[5] = src[5 * stride]; - s[6] = src[6 * stride]; - s[7] = src[7 * stride]; - s[8] = src[8 * stride]; - s[9] = src[9 * stride]; - s[10] = src[10 * stride]; - s[11] = src[11 * stride]; - s[12] = src[12 * stride]; - s[13] = src[13 * stride]; - s[14] = src[14 * stride]; - s[15] = src[15 * stride]; - s[16] = src[16 * stride]; - s[17] = src[17 * stride]; - s[18] = src[18 * stride]; - s[19] = src[19 * stride]; - s[20] = src[20 * stride]; - s[21] = src[21 * stride]; - s[22] = src[22 * stride]; - s[23] = src[23 * stride]; - s[24] = src[24 * stride]; - - float t[20]; - t[0] = s[0] + s[5] + s[10] + s[15]; - t[1] = s[1] + s[6] + s[11] + s[16]; - t[2] = s[2] + s[7] + s[12] + s[17]; - t[3] = s[3] + s[8] + s[13] + s[18]; - t[4] = s[4] + s[9] + s[14] + s[19]; - t[5] = s[5] - s[10] + 2 * s[15]; - t[6] = s[6] - s[11] + 2 * s[16]; - t[7] = s[7] - s[12] + 2 * s[17]; - t[8] = s[8] - s[13] + 2 * s[18]; - t[9] = s[9] - s[14] + 2 * s[19]; - t[10] = s[5] + s[10] + 4 * s[15]; - t[11] = s[6] + s[11] + 4 * s[16]; - t[12] = s[7] + s[12] + 4 * s[17]; - t[13] = s[8] + s[13] + 4 * s[18]; - t[14] = s[9] + s[14] + 4 * s[19]; - t[15] = s[5] - s[10] + 8 * s[15] + s[20]; - t[16] = s[6] - s[11] + 8 * s[16] + s[21]; - t[17] = s[7] - s[12] + 8 * s[17] + s[22]; - t[18] = s[8] - s[13] + 8 * s[18] + s[23]; - t[19] = s[9] - s[14] + 8 * s[19] + s[24]; - - dst[0] = t[0] + t[1] + t[2] + t[3]; - dst[1] = t[1] - t[2] + 2 * t[3]; - dst[2] = t[1] + t[2] + 4 * t[3]; - dst[3] = t[1] - t[2] + 8 * t[3] + t[4]; - dst[4] = t[5] + t[6] + t[7] + t[8]; - dst[5] = t[6] - t[7] + 2 * t[8]; - dst[6] = t[6] + t[7] + 4 * t[8]; - dst[7] = t[6] - t[7] + 8 * t[8] + t[9]; - dst[8] = t[10] + t[11] + t[12] + t[13]; - dst[9] = t[11] - t[12] + 2 * t[13]; - dst[10] = t[11] + t[12] + 4 * t[13]; - dst[11] = t[11] - t[12] + 8 * t[13] + t[14]; - dst[12] = t[15] + t[16] + t[17] + t[18]; - dst[13] = t[16] - t[17] + 2 * t[18]; - dst[14] = t[16] + t[17] + 4 * t[18]; - dst[15] = t[16] - t[17] + 8 * t[18] + t[19]; - } - - SIMD_INLINE void WinogradKernel2x2Block4x4SetOutput1n(const float* src, size_t srcStride, float* dst, size_t dstStride) - { - float tmp[16]; - WinogradKernel2x2Block4x4SetOutput1(src, srcStride, tmp); - dst[0 * dstStride + 0] = tmp[0]; - dst[0 * dstStride + 1] = 
tmp[1]; - dst[0 * dstStride + 2] = tmp[2]; - dst[0 * dstStride + 3] = tmp[3]; - dst[1 * dstStride + 0] = tmp[4]; - dst[1 * dstStride + 1] = tmp[5]; - dst[1 * dstStride + 2] = tmp[6]; - dst[1 * dstStride + 3] = tmp[7]; - dst[2 * dstStride + 0] = tmp[8]; - dst[2 * dstStride + 1] = tmp[9]; - dst[2 * dstStride + 2] = tmp[10]; - dst[2 * dstStride + 3] = tmp[11]; - dst[3 * dstStride + 0] = tmp[12]; - dst[3 * dstStride + 1] = tmp[13]; - dst[3 * dstStride + 2] = tmp[14]; - dst[3 * dstStride + 3] = tmp[15]; - } - - SIMD_INLINE void WinogradKernel2x2Block4x4SetOutput1n(const float* src, size_t srcStride, float* dst, size_t dstStride, size_t rowE, size_t colE) - { - float tmp[16]; - WinogradKernel2x2Block4x4SetOutput1(src, srcStride, tmp); - for (size_t row = 0; row < rowE; ++row) - for (size_t col = 0; col < colE; ++col) - dst[row * dstStride + col] = tmp[row * 4 + col]; - } - - SIMD_INLINE void WinogradKernel2x2Block4x4SetOutput1t(const float* src, size_t srcStride, float* dst, size_t dstW, size_t dstC) - { - size_t dstS = dstW * dstC; - for (size_t d = 0; d < dstC; ++d, src++, dst++) - { - float tmp[16]; - WinogradKernel2x2Block4x4SetOutput1(src, srcStride, tmp); - dst[0 * dstS + 0 * dstC] = tmp[0]; - dst[0 * dstS + 1 * dstC] = tmp[1]; - dst[0 * dstS + 2 * dstC] = tmp[2]; - dst[0 * dstS + 3 * dstC] = tmp[3]; - dst[1 * dstS + 0 * dstC] = tmp[4]; - dst[1 * dstS + 1 * dstC] = tmp[5]; - dst[1 * dstS + 2 * dstC] = tmp[6]; - dst[1 * dstS + 3 * dstC] = tmp[7]; - dst[2 * dstS + 0 * dstC] = tmp[8]; - dst[2 * dstS + 1 * dstC] = tmp[9]; - dst[2 * dstS + 2 * dstC] = tmp[10]; - dst[2 * dstS + 3 * dstC] = tmp[11]; - dst[3 * dstS + 0 * dstC] = tmp[12]; - dst[3 * dstS + 1 * dstC] = tmp[13]; - dst[3 * dstS + 2 * dstC] = tmp[14]; - dst[3 * dstS + 3 * dstC] = tmp[15]; - } - } - - SIMD_INLINE void WinogradKernel2x2Block4x4SetOutput1t(const float* src, size_t srcStride, float* dst, size_t dstW, size_t dstC, size_t rowE, size_t colE) - { - size_t dstS = dstW * dstC; - for (size_t d = 0; d < dstC; ++d, src++, dst++) - { - float tmp[16]; - WinogradKernel2x2Block4x4SetOutput1(src, srcStride, tmp); - for (size_t row = 0; row < rowE; ++row) - for (size_t col = 0; col < colE; ++col) - dst[row * dstS + col * dstC] = tmp[row * 4 + col]; - } - } - - void WinogradKernel2x2Block4x4SetOutput(const float* src, size_t srcStride, float* dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans) - { - size_t dstHeightFull = AlignLo(dstHeight, 4); - size_t dstWidthFull = AlignLo(dstWidth, 4); - if (trans) - { - size_t row, col; - for (row = 0; row < dstHeightFull; row += 4) - { - for (col = 0; col < dstWidthFull; col += 4) - WinogradKernel2x2Block4x4SetOutput1t(src, srcStride, dst + (row * dstWidth + col) * dstChannels, dstWidth, dstChannels), src += dstChannels; - if (col < dstWidth) - WinogradKernel2x2Block4x4SetOutput1t(src, srcStride, dst + (row * dstWidth + col) * dstChannels, dstWidth, dstChannels, 4, dstWidth - col), src += dstChannels; - } - if (row < dstHeight) - { - for (col = 0; col < dstWidthFull; col += 4) - WinogradKernel2x2Block4x4SetOutput1t(src, srcStride, dst + (row * dstWidth + col) * dstChannels, dstWidth, dstChannels, dstHeight - row, 4), src += dstChannels; - if (col < dstWidth) - WinogradKernel2x2Block4x4SetOutput1t(src, srcStride, dst + (row * dstWidth + col) * dstChannels, dstWidth, dstChannels, dstHeight - row, dstWidth - col), src += dstChannels; - } - } - else - { - for (size_t c = 0; c < dstChannels; ++c) - { - size_t row, col; - for (row = 0; row < dstHeightFull; row += 4) - { - for (col = 
0; col < dstWidthFull; col += 4) - WinogradKernel2x2Block4x4SetOutput1n(src++, srcStride, dst + row * dstWidth + col, dstWidth); - if (col < dstWidth) - WinogradKernel2x2Block4x4SetOutput1n(src++, srcStride, dst + row * dstWidth + col, dstWidth, 4, dstWidth - col); - } - if (row < dstHeight) - { - for (col = 0; col < dstWidthFull; col += 4) - WinogradKernel2x2Block4x4SetOutput1n(src++, srcStride, dst + row * dstWidth + col, dstWidth, dstHeight - row, 4); - if (col < dstWidth) - WinogradKernel2x2Block4x4SetOutput1n(src++, srcStride, dst + row * dstWidth + col, dstWidth, dstHeight - row, dstWidth - col); - } - dst += dstHeight * dstWidth; - } - } - } - - //----------------------------------------------------------------------- - - void WinogradKernel3x3Block2x2SetFilter(const float * src, size_t size, float * dst, SimdBool trans) - { - if (trans) - { - for (size_t i = 0; i < size; i += 1) - Base::WinogradKernel3x3Block2x2SetFilter1t(src + i, dst + i, size); - } - else - { - for (size_t i = 0; i < size; i += 1, src += 9, dst += 1) - Base::WinogradKernel3x3Block2x2SetFilter1n(src, dst, size); - } - } - - //----------------------------------------------------------------------- - - SIMD_INLINE void WinogradKernel3x3Block2x2SetInput1(const float src[16], float * dst, size_t stride) - { - dst[0 * stride] = (src[0] - src[8]) - (src[2] - src[10]); - dst[1 * stride] = (src[1] - src[9]) + (src[2] - src[10]); - dst[2 * stride] = (src[2] - src[10]) - (src[1] - src[9]); - dst[3 * stride] = (src[1] - src[9]) - (src[3] - src[11]); - dst[4 * stride] = (src[4] + src[8]) - (src[6] + src[10]); - dst[5 * stride] = (src[5] + src[9]) + (src[6] + src[10]); - dst[6 * stride] = (src[6] + src[10]) - (src[5] + src[9]); - dst[7 * stride] = (src[5] + src[9]) - (src[7] + src[11]); - dst[8 * stride] = (src[8] - src[4]) - (src[10] - src[6]); - dst[9 * stride] = (src[9] - src[5]) + (src[10] - src[6]); - dst[10 * stride] = (src[10] - src[6]) - (src[9] - src[5]); - dst[11 * stride] = (src[9] - src[5]) - (src[11] - src[7]); - dst[12 * stride] = (src[4] - src[12]) - (src[6] - src[14]); - dst[13 * stride] = (src[5] - src[13]) + (src[6] - src[14]); - dst[14 * stride] = (src[6] - src[14]) - (src[5] - src[13]); - dst[15 * stride] = (src[5] - src[13]) - (src[7] - src[15]); - } - - SIMD_INLINE void WinogradKernel3x3Block2x2SetInput1n(const float * src, size_t srcStride, float * dst, size_t dstStride) - { - float tmp[16]; - tmp[0] = src[0 * srcStride + 0]; - tmp[1] = src[0 * srcStride + 1]; - tmp[2] = src[0 * srcStride + 2]; - tmp[3] = src[0 * srcStride + 3]; - tmp[4] = src[1 * srcStride + 0]; - tmp[5] = src[1 * srcStride + 1]; - tmp[6] = src[1 * srcStride + 2]; - tmp[7] = src[1 * srcStride + 3]; - tmp[8] = src[2 * srcStride + 0]; - tmp[9] = src[2 * srcStride + 1]; - tmp[10] = src[2 * srcStride + 2]; - tmp[11] = src[2 * srcStride + 3]; - tmp[12] = src[3 * srcStride + 0]; - tmp[13] = src[3 * srcStride + 1]; - tmp[14] = src[3 * srcStride + 2]; - tmp[15] = src[3 * srcStride + 3]; - WinogradKernel3x3Block2x2SetInput1(tmp, dst, dstStride); - } - - SIMD_INLINE void WinogradKernel3x3Block2x2SetInput1n(const float * src, size_t srcStride, size_t rowB, size_t rowE, size_t colB, size_t colE, float * dst, size_t dstStride) - { - float tmp[16] = { 0 }; - for (size_t row = rowB; row < rowE; ++row) - for (size_t col = colB; col < colE; ++col) - tmp[row * 4 + col] = src[row * srcStride + col]; - WinogradKernel3x3Block2x2SetInput1(tmp, dst, dstStride); - } - - SIMD_INLINE void WinogradKernel3x3Block2x2SetInput1t(const float * src, size_t srcW, 
size_t srcC, float * dst, size_t dstStride) - { - size_t srcS = srcW * srcC; - for (size_t c = 0; c < srcC; ++c, src++, dst++) - { - float tmp[16]; - tmp[0] = src[0 * srcS + 0 * srcC]; - tmp[1] = src[0 * srcS + 1 * srcC]; - tmp[2] = src[0 * srcS + 2 * srcC]; - tmp[3] = src[0 * srcS + 3 * srcC]; - tmp[4] = src[1 * srcS + 0 * srcC]; - tmp[5] = src[1 * srcS + 1 * srcC]; - tmp[6] = src[1 * srcS + 2 * srcC]; - tmp[7] = src[1 * srcS + 3 * srcC]; - tmp[8] = src[2 * srcS + 0 * srcC]; - tmp[9] = src[2 * srcS + 1 * srcC]; - tmp[10] = src[2 * srcS + 2 * srcC]; - tmp[11] = src[2 * srcS + 3 * srcC]; - tmp[12] = src[3 * srcS + 0 * srcC]; - tmp[13] = src[3 * srcS + 1 * srcC]; - tmp[14] = src[3 * srcS + 2 * srcC]; - tmp[15] = src[3 * srcS + 3 * srcC]; - WinogradKernel3x3Block2x2SetInput1(tmp, dst, dstStride); - } - } - - SIMD_INLINE void WinogradKernel3x3Block2x2SetInput1t(const float * src, size_t srcW, size_t srcC, size_t rowB, size_t rowE, size_t colB, size_t colE, float * dst, size_t dstStride) - { - size_t srcS = srcW * srcC; - for (size_t c = 0; c < srcC; ++c, src++, dst++) - { - float tmp[16] = { 0 }; - for (size_t row = rowB; row < rowE; ++row) - for (size_t col = colB; col < colE; ++col) - tmp[row * 4 + col] = src[row * srcS + col * srcC]; - WinogradKernel3x3Block2x2SetInput1(tmp, dst, dstStride); - } - } - - void WinogradKernel3x3Block2x2SetInput(const float* src, size_t srcChannels, size_t srcHeight, size_t srcWidth, - size_t padY, size_t padX, size_t padH, size_t padW, float* dst, size_t dstStride, SimdBool trans) - { - assert(padY == padX && padY == padH && padY == padW && (padY == 0 || padY == 1)); - SimdBool pad = padY > 0 ? SimdTrue : SimdFalse; - size_t dstHeight = pad ? srcHeight : srcHeight - 2; - size_t dstWidth = pad ? srcWidth : srcWidth - 2; - size_t dstHeightFull = AlignLo(dstHeight, 2); - size_t dstWidthFull = AlignLo(dstWidth, 2); - size_t noseW = Simd::Min<size_t>(4, dstWidth + 1); - size_t noseH = Simd::Min<size_t>(4, dstHeight + 1); - size_t start = pad ? 2 : 0; - if (pad) - { - if (dstHeight == dstHeightFull) - dstHeightFull -= 2; - if (dstWidth == dstWidthFull) - dstWidthFull -= 2; - src -= (srcWidth + 1)*(trans ? srcChannels : 1); - } - size_t tailW = dstWidth - dstWidthFull + (pad ? 1 : 2); - size_t tailH = dstHeight - dstHeightFull + (pad ? 
1 : 2); - if (trans) - { - size_t row = 0, col = 0; - if (pad) - { - if (pad) - WinogradKernel3x3Block2x2SetInput1t(src, srcWidth, srcChannels, 1, noseH, 1, noseW, dst, dstStride), dst += srcChannels; - for (col = start; col < dstWidthFull; col += 2) - WinogradKernel3x3Block2x2SetInput1t(src + col * srcChannels, srcWidth, srcChannels, 1, noseH, 0, 4, dst, dstStride), dst += srcChannels; - if (col < dstWidth) - WinogradKernel3x3Block2x2SetInput1t(src + col * srcChannels, srcWidth, srcChannels, 1, noseH, 0, tailW, dst, dstStride), dst += srcChannels; - } - for (row = start; row < dstHeightFull; row += 2) - { - if (pad) - WinogradKernel3x3Block2x2SetInput1t(src + row * srcWidth * srcChannels, srcWidth, srcChannels, 0, 4, 1, noseW, dst, dstStride), dst += srcChannels; - for (col = start; col < dstWidthFull; col += 2) - WinogradKernel3x3Block2x2SetInput1t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, dst, dstStride), dst += srcChannels; - if (col < dstWidth) - WinogradKernel3x3Block2x2SetInput1t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, 0, 4, 0, tailW, dst, dstStride), dst += srcChannels; - } - if (row < dstHeight) - { - if (pad) - WinogradKernel3x3Block2x2SetInput1t(src + row * srcWidth* srcChannels, srcWidth, srcChannels, 0, tailH, 1, noseW, dst, dstStride), dst += srcChannels; - for (col = start; col < dstWidthFull; col += 2) - WinogradKernel3x3Block2x2SetInput1t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, 0, tailH, 0, 4, dst, dstStride), dst += srcChannels; - if (col < dstWidth) - WinogradKernel3x3Block2x2SetInput1t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, 0, tailH, 0, tailW, dst, dstStride), dst += srcChannels; - } - } - else - { - for (size_t c = 0; c < srcChannels; ++c) - { - size_t row = 0, col = 0; - if (pad) - { - if (pad) - WinogradKernel3x3Block2x2SetInput1n(src, srcWidth, 1, noseH, 1, noseW, dst++, dstStride); - for (col = start; col < dstWidthFull; col += 2) - WinogradKernel3x3Block2x2SetInput1n(src + col, srcWidth, 1, noseH, 0, 4, dst++, dstStride); - if (col < dstWidth) - WinogradKernel3x3Block2x2SetInput1n(src + col, srcWidth, 1, noseH, 0, tailW, dst++, dstStride); - } - for (row = start; row < dstHeightFull; row += 2) - { - if (pad) - WinogradKernel3x3Block2x2SetInput1n(src + row * srcWidth, srcWidth, 0, 4, 1, noseW, dst++, dstStride); - for (col = start; col < dstWidthFull; col += 2) - WinogradKernel3x3Block2x2SetInput1n(src + row * srcWidth + col, srcWidth, dst++, dstStride); - if (col < dstWidth) - WinogradKernel3x3Block2x2SetInput1n(src + row * srcWidth + col, srcWidth, 0, 4, 0, tailW, dst++, dstStride); - } - if (row < dstHeight) - { - if (pad) - WinogradKernel3x3Block2x2SetInput1n(src + row * srcWidth, srcWidth, 0, tailH, 1, noseW, dst++, dstStride); - for (col = start; col < dstWidthFull; col += 2) - WinogradKernel3x3Block2x2SetInput1n(src + row * srcWidth + col, srcWidth, 0, tailH, 0, 4, dst++, dstStride); - if (col < dstWidth) - WinogradKernel3x3Block2x2SetInput1n(src + row * srcWidth + col, srcWidth, 0, tailH, 0, tailW, dst++, dstStride); - } - src += srcWidth * srcHeight; - } - } - } - - //----------------------------------------------------------------------- - - SIMD_INLINE void WinogradKernel3x3Block2x2SetOutput1(const float * src, size_t stride, float dst[4]) - { - float s[16]; - s[0] = src[0 * stride]; - s[1] = src[1 * stride]; - s[2] = src[2 * stride]; - s[3] = src[3 * stride]; - s[4] = src[4 * stride]; - s[5] = src[5 * stride]; - s[6] = src[6 * stride]; - s[7] = 
src[7 * stride]; - s[8] = src[8 * stride]; - s[9] = src[9 * stride]; - s[10] = src[10 * stride]; - s[11] = src[11 * stride]; - s[12] = src[12 * stride]; - s[13] = src[13 * stride]; - s[14] = src[14 * stride]; - s[15] = src[15 * stride]; - - float tmp[8]; - tmp[0] = s[0] + s[1] + s[2]; - tmp[1] = s[1] - s[2] - s[3]; - tmp[2] = s[4] + s[5] + s[6]; - tmp[3] = s[5] - s[6] - s[7]; - tmp[4] = s[8] + s[9] + s[10]; - tmp[5] = s[9] - s[10] - s[11]; - tmp[6] = s[12] + s[13] + s[14]; - tmp[7] = s[13] - s[14] - s[15]; - - dst[0] = tmp[0] + tmp[2] + tmp[4]; - dst[1] = tmp[1] + tmp[3] + tmp[5]; - dst[2] = tmp[2] - tmp[4] - tmp[6]; - dst[3] = tmp[3] - tmp[5] - tmp[7]; - } - - SIMD_INLINE void WinogradKernel3x3Block2x2SetOutput1n(const float * src, size_t srcStride, float * dst, size_t dstStride) - { - float tmp[4]; - WinogradKernel3x3Block2x2SetOutput1(src, srcStride, tmp); - dst[0 * dstStride + 0] = tmp[0]; - dst[0 * dstStride + 1] = tmp[1]; - dst[1 * dstStride + 0] = tmp[2]; - dst[1 * dstStride + 1] = tmp[3]; - } - - SIMD_INLINE void WinogradKernel3x3Block2x2SetOutput1n(const float * src, size_t srcStride, float * dst, size_t dstStride, size_t rowE, size_t colE) - { - float tmp[4]; - WinogradKernel3x3Block2x2SetOutput1(src, srcStride, tmp); - for (size_t row = 0; row < rowE; ++row) - for (size_t col = 0; col < colE; ++col) - dst[row*dstStride + col] = tmp[row * 2 + col]; - } - - SIMD_INLINE void WinogradKernel3x3Block2x2SetOutput1t(const float * src, size_t srcStride, float * dst, size_t dstW, size_t dstC) - { - size_t dstS = dstW * dstC; - for (size_t d = 0; d < dstC; ++d, src++, dst++) - { - float tmp[4]; - WinogradKernel3x3Block2x2SetOutput1(src, srcStride, tmp); - dst[0 * dstS + 0 * dstC] = tmp[0]; - dst[0 * dstS + 1 * dstC] = tmp[1]; - dst[1 * dstS + 0 * dstC] = tmp[2]; - dst[1 * dstS + 1 * dstC] = tmp[3]; - } - } - - SIMD_INLINE void WinogradKernel3x3Block2x2SetOutput1t(const float * src, size_t srcStride, float * dst, size_t dstW, size_t dstC, size_t rowE, size_t colE) - { - size_t dstS = dstW * dstC; - for (size_t d = 0; d < dstC; ++d, src++, dst++) - { - float tmp[4]; - WinogradKernel3x3Block2x2SetOutput1(src, srcStride, tmp); - for (size_t row = 0; row < rowE; ++row) - for (size_t col = 0; col < colE; ++col) - dst[row*dstS + col*dstC] = tmp[row * 2 + col]; - } - } - - void WinogradKernel3x3Block2x2SetOutput(const float * src, size_t srcStride, float * dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans) - { - size_t dstHeightFull = AlignLo(dstHeight, 2); - size_t dstWidthFull = AlignLo(dstWidth, 2); - if (trans) - { - size_t row, col; - for (row = 0; row < dstHeightFull; row += 2) - { - for (col = 0; col < dstWidthFull; col += 2) - WinogradKernel3x3Block2x2SetOutput1t(src, srcStride, dst + (row * dstWidth + col)*dstChannels, dstWidth, dstChannels), src += dstChannels; - if (col < dstWidth) - WinogradKernel3x3Block2x2SetOutput1t(src, srcStride, dst + (row * dstWidth + col)*dstChannels, dstWidth, dstChannels, 2, dstWidth - col), src += dstChannels; - } - if (row < dstHeight) - { - for (col = 0; col < dstWidthFull; col += 2) - WinogradKernel3x3Block2x2SetOutput1t(src, srcStride, dst + (row * dstWidth + col)*dstChannels, dstWidth, dstChannels, dstHeight - row, 2), src += dstChannels; - if (col < dstWidth) - WinogradKernel3x3Block2x2SetOutput1t(src, srcStride, dst + (row * dstWidth + col)*dstChannels, dstWidth, dstChannels, dstHeight - row, dstWidth - col), src += dstChannels; - } - } - else - { - for (size_t c = 0; c < dstChannels; ++c) - { - size_t row, col; - for (row = 
0; row < dstHeightFull; row += 2) - { - for (col = 0; col < dstWidthFull; col += 2) - WinogradKernel3x3Block2x2SetOutput1n(src++, srcStride, dst + row * dstWidth + col, dstWidth); - if (col < dstWidth) - WinogradKernel3x3Block2x2SetOutput1n(src++, srcStride, dst + row * dstWidth + col, dstWidth, 2, dstWidth - col); - } - if (row < dstHeight) - { - for (col = 0; col < dstWidthFull; col += 2) - WinogradKernel3x3Block2x2SetOutput1n(src++, srcStride, dst + row * dstWidth + col, dstWidth, dstHeight - row, 2); - if (col < dstWidth) - WinogradKernel3x3Block2x2SetOutput1n(src++, srcStride, dst + row * dstWidth + col, dstWidth, dstHeight - row, dstWidth - col); - } - dst += dstHeight * dstWidth; - } - } - } - - //----------------------------------------------------------------------- - - void WinogradKernel3x3Block3x3SetFilter(const float * src, size_t size, float * dst, SimdBool trans) - { - if (trans) - { - for (size_t i = 0; i < size; i += 1) - Base::WinogradKernel3x3Block3x3SetFilter1t(src + i, dst + i, size); - } - else - { - for (size_t i = 0; i < size; i += 1, src += 9, dst += 1) - Base::WinogradKernel3x3Block3x3SetFilter1n(src, dst, size); - } - } - - //----------------------------------------------------------------------- - - SIMD_INLINE void WinogradKernel3x3Block3x3SetInput1(const float src[25], float * dst, size_t stride) - { - float tmp[25]; - tmp[0] = 2 * src[0] - src[5] - 2 * src[10] + src[15]; - tmp[1] = 2 * src[1] - src[6] - 2 * src[11] + src[16]; - tmp[2] = 2 * src[2] - src[7] - 2 * src[12] + src[17]; - tmp[3] = 2 * src[3] - src[8] - 2 * src[13] + src[18]; - tmp[4] = 2 * src[4] - src[9] - 2 * src[14] + src[19]; - tmp[5] = -2 * src[5] - src[10] + src[15]; - tmp[6] = -2 * src[6] - src[11] + src[16]; - tmp[7] = -2 * src[7] - src[12] + src[17]; - tmp[8] = -2 * src[8] - src[13] + src[18]; - tmp[9] = -2 * src[9] - src[14] + src[19]; - tmp[10] = 2 * src[5] - 3 * src[10] + src[15]; - tmp[11] = 2 * src[6] - 3 * src[11] + src[16]; - tmp[12] = 2 * src[7] - 3 * src[12] + src[17]; - tmp[13] = 2 * src[8] - 3 * src[13] + src[18]; - tmp[14] = 2 * src[9] - 3 * src[14] + src[19]; - tmp[15] = -src[5] + src[15]; - tmp[16] = -src[6] + src[16]; - tmp[17] = -src[7] + src[17]; - tmp[18] = -src[8] + src[18]; - tmp[19] = -src[9] + src[19]; - tmp[20] = 2 * src[5] - src[10] - 2 * src[15] + src[20]; - tmp[21] = 2 * src[6] - src[11] - 2 * src[16] + src[21]; - tmp[22] = 2 * src[7] - src[12] - 2 * src[17] + src[22]; - tmp[23] = 2 * src[8] - src[13] - 2 * src[18] + src[23]; - tmp[24] = 2 * src[9] - src[14] - 2 * src[19] + src[24]; - - dst[0 * stride] = 2 * tmp[0] - tmp[1] - 2 * tmp[2] + tmp[3]; - dst[1 * stride] = -2 * tmp[1] - tmp[2] + tmp[3]; - dst[2 * stride] = 2 * tmp[1] - 3 * tmp[2] + tmp[3]; - dst[3 * stride] = - tmp[1] + tmp[3]; - dst[4 * stride] = 2 * tmp[1] - tmp[2] - 2 * tmp[3] + tmp[4]; - dst[5 * stride] = 2 * tmp[5] - tmp[6] - 2 * tmp[7] + tmp[8]; - dst[6 * stride] = -2 * tmp[6] - tmp[7] + tmp[8]; - dst[7 * stride] = 2 * tmp[6] - 3 * tmp[7] + tmp[8]; - dst[8 * stride] = -tmp[6] + tmp[8]; - dst[9 * stride] = 2 * tmp[6] - tmp[7] - 2 * tmp[8] + tmp[9]; - dst[10 * stride] = 2 * tmp[10] - tmp[11] - 2 * tmp[12] + tmp[13]; - dst[11 * stride] = -2 * tmp[11] - tmp[12] + tmp[13]; - dst[12 * stride] = 2 * tmp[11] - 3 * tmp[12] + tmp[13]; - dst[13 * stride] = -tmp[11] + tmp[13]; - dst[14 * stride] = 2 * tmp[11] - tmp[12] - 2 * tmp[13] + tmp[14]; - dst[15 * stride] = 2 * tmp[15] - tmp[16] - 2 * tmp[17] + tmp[18]; - dst[16 * stride] = -2 * tmp[16] - tmp[17] + tmp[18]; - dst[17 * stride] = 2 * tmp[16] - 3 * tmp[17] 
+ tmp[18]; - dst[18 * stride] = -tmp[16] + tmp[18]; - dst[19 * stride] = 2 * tmp[16] - tmp[17] - 2 * tmp[18] + tmp[19]; - dst[20 * stride] = 2 * tmp[20] - tmp[21] - 2 * tmp[22] + tmp[23]; - dst[21 * stride] = -2 * tmp[21] - tmp[22] + tmp[23]; - dst[22 * stride] = 2 * tmp[21] - 3 * tmp[22] + tmp[23]; - dst[23 * stride] = -tmp[21] + tmp[23]; - dst[24 * stride] = 2 * tmp[21] - tmp[22] - 2 * tmp[23] + tmp[24]; - } - - SIMD_INLINE void WinogradKernel3x3Block3x3SetInput1n(const float * src, size_t srcStride, float * dst, size_t dstStride) - { - float tmp[25]; - tmp[0] = src[0 * srcStride + 0]; - tmp[1] = src[0 * srcStride + 1]; - tmp[2] = src[0 * srcStride + 2]; - tmp[3] = src[0 * srcStride + 3]; - tmp[4] = src[0 * srcStride + 4]; - tmp[5] = src[1 * srcStride + 0]; - tmp[6] = src[1 * srcStride + 1]; - tmp[7] = src[1 * srcStride + 2]; - tmp[8] = src[1 * srcStride + 3]; - tmp[9] = src[1 * srcStride + 4]; - tmp[10] = src[2 * srcStride + 0]; - tmp[11] = src[2 * srcStride + 1]; - tmp[12] = src[2 * srcStride + 2]; - tmp[13] = src[2 * srcStride + 3]; - tmp[14] = src[2 * srcStride + 4]; - tmp[15] = src[3 * srcStride + 0]; - tmp[16] = src[3 * srcStride + 1]; - tmp[17] = src[3 * srcStride + 2]; - tmp[18] = src[3 * srcStride + 3]; - tmp[19] = src[3 * srcStride + 4]; - tmp[20] = src[4 * srcStride + 0]; - tmp[21] = src[4 * srcStride + 1]; - tmp[22] = src[4 * srcStride + 2]; - tmp[23] = src[4 * srcStride + 3]; - tmp[24] = src[4 * srcStride + 4]; - WinogradKernel3x3Block3x3SetInput1(tmp, dst, dstStride); - } - - SIMD_INLINE void WinogradKernel3x3Block3x3SetInput1n(const float * src, size_t srcStride, size_t rowB, size_t rowE, size_t colB, size_t colE, float * dst, size_t dstStride) - { - float tmp[5 * 5] = { 0 }; - for (size_t row = rowB; row < rowE; ++row) - for (size_t col = colB; col < colE; ++col) - tmp[row * 5 + col] = src[row * srcStride + col]; - WinogradKernel3x3Block3x3SetInput1(tmp, dst, dstStride); - } - - SIMD_INLINE void WinogradKernel3x3Block3x3SetInput1t(const float * src, size_t srcW, size_t srcC, float * dst, size_t dstStride) - { - size_t srcS = srcW * srcC; - for (size_t c = 0; c < srcC; ++c, src++, dst++) - { - float tmp[25]; - tmp[0] = src[0 * srcS + 0 * srcC]; - tmp[1] = src[0 * srcS + 1 * srcC]; - tmp[2] = src[0 * srcS + 2 * srcC]; - tmp[3] = src[0 * srcS + 3 * srcC]; - tmp[4] = src[0 * srcS + 4 * srcC]; - tmp[5] = src[1 * srcS + 0 * srcC]; - tmp[6] = src[1 * srcS + 1 * srcC]; - tmp[7] = src[1 * srcS + 2 * srcC]; - tmp[8] = src[1 * srcS + 3 * srcC]; - tmp[9] = src[1 * srcS + 4 * srcC]; - tmp[10] = src[2 * srcS + 0 * srcC]; - tmp[11] = src[2 * srcS + 1 * srcC]; - tmp[12] = src[2 * srcS + 2 * srcC]; - tmp[13] = src[2 * srcS + 3 * srcC]; - tmp[14] = src[2 * srcS + 4 * srcC]; - tmp[15] = src[3 * srcS + 0 * srcC]; - tmp[16] = src[3 * srcS + 1 * srcC]; - tmp[17] = src[3 * srcS + 2 * srcC]; - tmp[18] = src[3 * srcS + 3 * srcC]; - tmp[19] = src[3 * srcS + 4 * srcC]; - tmp[20] = src[4 * srcS + 0 * srcC]; - tmp[21] = src[4 * srcS + 1 * srcC]; - tmp[22] = src[4 * srcS + 2 * srcC]; - tmp[23] = src[4 * srcS + 3 * srcC]; - tmp[24] = src[4 * srcS + 4 * srcC]; - WinogradKernel3x3Block3x3SetInput1(tmp, dst, dstStride); - } - } - - SIMD_INLINE void WinogradKernel3x3Block3x3SetInput1t(const float * src, size_t srcW, size_t srcC, size_t rowB, size_t rowE, size_t colB, size_t colE, float * dst, size_t dstStride) - { - size_t srcS = srcW * srcC; - for (size_t c = 0; c < srcC; ++c, src++, dst++) - { - float tmp[25] = { 0 }; - for (size_t row = rowB; row < rowE; ++row) - for (size_t col = colB; col < colE; 
++col) - tmp[row * 5 + col] = src[row * srcS + col * srcC]; - WinogradKernel3x3Block3x3SetInput1(tmp, dst, dstStride); - } - } - - void WinogradKernel3x3Block3x3SetInput(const float* src, size_t srcChannels, size_t srcHeight, size_t srcWidth, - size_t padY, size_t padX, size_t padH, size_t padW, float* dst, size_t dstStride, SimdBool trans) - { - assert(padY == padX && padY == padH && padY == padW && (padY == 0 || padY == 1)); - SimdBool pad = padY > 0 ? SimdTrue : SimdFalse; - size_t dstHeight = pad ? srcHeight : srcHeight - 2; - size_t dstWidth = pad ? srcWidth : srcWidth - 2; - size_t dstHeightFull = dstHeight / 3 * 3; - size_t dstWidthFull = dstWidth / 3 * 3; - size_t noseW = Simd::Min<size_t>(5, dstWidth + 1); - size_t noseH = Simd::Min<size_t>(5, dstHeight + 1); - size_t start = pad ? 3 : 0; - if (pad) - { - if (dstHeight == dstHeightFull) - dstHeightFull -= 3; - if (dstWidth == dstWidthFull) - dstWidthFull -= 3; - src -= (srcWidth + 1)*(trans ? srcChannels : 1); - } - size_t tailW = dstWidth - dstWidthFull + (pad ? 1 : 2); - size_t tailH = dstHeight - dstHeightFull + (pad ? 1 : 2); - if (trans) - { - size_t row = 0, col = 0; - if (pad) - { - if (pad) - WinogradKernel3x3Block3x3SetInput1t(src, srcWidth, srcChannels, 1, noseH, 1, noseW, dst, dstStride), dst += srcChannels; - for (col = start; col < dstWidthFull; col += 3) - WinogradKernel3x3Block3x3SetInput1t(src + col * srcChannels, srcWidth, srcChannels, 1, noseH, 0, 5, dst, dstStride), dst += srcChannels; - if (col < dstWidth) - WinogradKernel3x3Block3x3SetInput1t(src + col * srcChannels, srcWidth, srcChannels, 1, noseH, 0, tailW, dst, dstStride), dst += srcChannels; - } - for (row = start; row < dstHeightFull; row += 3) - { - if (pad) - WinogradKernel3x3Block3x3SetInput1t(src + row * srcWidth * srcChannels, srcWidth, srcChannels, 0, 5, 1, noseW, dst, dstStride), dst += srcChannels; - for (col = start; col < dstWidthFull; col += 3) - WinogradKernel3x3Block3x3SetInput1t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, dst, dstStride), dst += srcChannels; - if (col < dstWidth) - WinogradKernel3x3Block3x3SetInput1t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, 0, 5, 0, tailW, dst, dstStride), dst += srcChannels; - } - if (row < dstHeight) - { - if (pad) - WinogradKernel3x3Block3x3SetInput1t(src + row * srcWidth* srcChannels, srcWidth, srcChannels, 0, tailH, 1, noseW, dst, dstStride), dst += srcChannels; - for (col = start; col < dstWidthFull; col += 3) - WinogradKernel3x3Block3x3SetInput1t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, 0, tailH, 0, 5, dst, dstStride), dst += srcChannels; - if (col < dstWidth) - WinogradKernel3x3Block3x3SetInput1t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, 0, tailH, 0, tailW, dst, dstStride), dst += srcChannels; - } - } - else - { - for (size_t c = 0; c < srcChannels; ++c) - { - size_t row = 0, col = 0; - if (pad) - { - if (pad) - WinogradKernel3x3Block3x3SetInput1n(src, srcWidth, 1, noseH, 1, noseW, dst++, dstStride); - for (col = start; col < dstWidthFull; col += 3) - WinogradKernel3x3Block3x3SetInput1n(src + col, srcWidth, 1, noseH, 0, 5, dst++, dstStride); - if (col < dstWidth) - WinogradKernel3x3Block3x3SetInput1n(src + col, srcWidth, 1, noseH, 0, tailW, dst++, dstStride); - } - for (row = start; row < dstHeightFull; row += 3) - { - if (pad) - WinogradKernel3x3Block3x3SetInput1n(src + row * srcWidth, srcWidth, 0, 5, 1, noseW, dst++, dstStride); - for (col = start; col < dstWidthFull; col += 3) - 
WinogradKernel3x3Block3x3SetInput1n(src + row * srcWidth + col, srcWidth, dst++, dstStride); - if (col < dstWidth) - WinogradKernel3x3Block3x3SetInput1n(src + row * srcWidth + col, srcWidth, 0, 5, 0, tailW, dst++, dstStride); - } - if (row < dstHeight) - { - if (pad) - WinogradKernel3x3Block3x3SetInput1n(src + row * srcWidth, srcWidth, 0, tailH, 1, noseW, dst++, dstStride); - for (col = start; col < dstWidthFull; col += 3) - WinogradKernel3x3Block3x3SetInput1n(src + row * srcWidth + col, srcWidth, 0, tailH, 0, 5, dst++, dstStride); - if (col < dstWidth) - WinogradKernel3x3Block3x3SetInput1n(src + row * srcWidth + col, srcWidth, 0, tailH, 0, tailW, dst++, dstStride); - } - src += srcWidth * srcHeight; - } - } - } - - //----------------------------------------------------------------------- - - SIMD_INLINE void WinogradKernel3x3Block3x3SetOutput1(const float * src, size_t stride, float dst[9]) - { - float s[25]; - s[0] = src[0 * stride]; - s[1] = src[1 * stride]; - s[2] = src[2 * stride]; - s[3] = src[3 * stride]; - s[4] = src[4 * stride]; - s[5] = src[5 * stride]; - s[6] = src[6 * stride]; - s[7] = src[7 * stride]; - s[8] = src[8 * stride]; - s[9] = src[9 * stride]; - s[10] = src[10 * stride]; - s[11] = src[11 * stride]; - s[12] = src[12 * stride]; - s[13] = src[13 * stride]; - s[14] = src[14 * stride]; - s[15] = src[15 * stride]; - s[16] = src[16 * stride]; - s[17] = src[17 * stride]; - s[18] = src[18 * stride]; - s[19] = src[19 * stride]; - s[20] = src[20 * stride]; - s[21] = src[21 * stride]; - s[22] = src[22 * stride]; - s[23] = src[23 * stride]; - s[24] = src[24 * stride]; - - float t[15]; - t[0] = s[0] + s[5] + s[10] + s[15]; - t[1] = s[1] + s[6] + s[11] + s[16]; - t[2] = s[2] + s[7] + s[12] + s[17]; - t[3] = s[3] + s[8] + s[13] + s[18]; - t[4] = s[4] + s[9] + s[14] + s[19]; - t[5] = s[5] - s[10] + 2 * s[15]; - t[6] = s[6] - s[11] + 2 * s[16]; - t[7] = s[7] - s[12] + 2 * s[17]; - t[8] = s[8] - s[13] + 2 * s[18]; - t[9] = s[9] - s[14] + 2 * s[19]; - t[10] = s[5] + s[10] + 4 * s[15] + s[20]; - t[11] = s[6] + s[11] + 4 * s[16] + s[21]; - t[12] = s[7] + s[12] + 4 * s[17] + s[22]; - t[13] = s[8] + s[13] + 4 * s[18] + s[23]; - t[14] = s[9] + s[14] + 4 * s[19] + s[24]; - - dst[0] = t[0] + t[1] + t[2] + t[3]; - dst[1] = t[1] - t[2] + 2 * t[3]; - dst[2] = t[1] + t[2] + 4 * t[3] + t[4]; - dst[3] = t[5] + t[6] + t[7] + t[8]; - dst[4] = t[6] - t[7] + 2 * t[8]; - dst[5] = t[6] + t[7] + 4 * t[8] + t[9]; - dst[6] = t[10] + t[11] + t[12] + t[13]; - dst[7] = t[11] - t[12] + 2 * t[13]; - dst[8] = t[11] + t[12] + 4 * t[13] + t[14]; - } - - SIMD_INLINE void WinogradKernel3x3Block3x3SetOutput1n(const float * src, size_t srcStride, float * dst, size_t dstStride) - { - float tmp[9]; - WinogradKernel3x3Block3x3SetOutput1(src, srcStride, tmp); - dst[0 * dstStride + 0] = tmp[0]; - dst[0 * dstStride + 1] = tmp[1]; - dst[0 * dstStride + 2] = tmp[2]; - dst[1 * dstStride + 0] = tmp[3]; - dst[1 * dstStride + 1] = tmp[4]; - dst[1 * dstStride + 2] = tmp[5]; - dst[2 * dstStride + 0] = tmp[6]; - dst[2 * dstStride + 1] = tmp[7]; - dst[2 * dstStride + 2] = tmp[8]; - } - - SIMD_INLINE void WinogradKernel3x3Block3x3SetOutput1n(const float * src, size_t srcStride, float * dst, size_t dstStride, size_t rowE, size_t colE) - { - float tmp[9]; - WinogradKernel3x3Block3x3SetOutput1(src, srcStride, tmp); - for (size_t row = 0; row < rowE; ++row) - for (size_t col = 0; col < colE; ++col) - dst[row*dstStride + col] = tmp[row * 3 + col]; - } - - SIMD_INLINE void WinogradKernel3x3Block3x3SetOutput1t(const float * src, size_t 
srcStride, float * dst, size_t dstW, size_t dstC) - { - size_t dstS = dstW * dstC; - for (size_t d = 0; d < dstC; ++d, src++, dst++) - { - float tmp[9]; - WinogradKernel3x3Block3x3SetOutput1(src, srcStride, tmp); - dst[0 * dstS + 0 * dstC] = tmp[0]; - dst[0 * dstS + 1 * dstC] = tmp[1]; - dst[0 * dstS + 2 * dstC] = tmp[2]; - dst[1 * dstS + 0 * dstC] = tmp[3]; - dst[1 * dstS + 1 * dstC] = tmp[4]; - dst[1 * dstS + 2 * dstC] = tmp[5]; - dst[2 * dstS + 0 * dstC] = tmp[6]; - dst[2 * dstS + 1 * dstC] = tmp[7]; - dst[2 * dstS + 2 * dstC] = tmp[8]; - } - } - - SIMD_INLINE void WinogradKernel3x3Block3x3SetOutput1t(const float * src, size_t srcStride, float * dst, size_t dstW, size_t dstC, size_t rowE, size_t colE) - { - size_t dstS = dstW * dstC; - for (size_t d = 0; d < dstC; ++d, src++, dst++) - { - float tmp[9]; - WinogradKernel3x3Block3x3SetOutput1(src, srcStride, tmp); - for (size_t row = 0; row < rowE; ++row) - for (size_t col = 0; col < colE; ++col) - dst[row*dstS + col * dstC] = tmp[row * 3 + col]; - } - } - - void WinogradKernel3x3Block3x3SetOutput(const float * src, size_t srcStride, float * dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans) - { - size_t dstHeightFull = dstHeight / 3 * 3; - size_t dstWidthFull = dstWidth / 3 * 3; - if (trans) - { - size_t row, col; - for (row = 0; row < dstHeightFull; row += 3) - { - for (col = 0; col < dstWidthFull; col += 3) - WinogradKernel3x3Block3x3SetOutput1t(src, srcStride, dst + (row * dstWidth + col)*dstChannels, dstWidth, dstChannels), src += dstChannels; - if (col < dstWidth) - WinogradKernel3x3Block3x3SetOutput1t(src, srcStride, dst + (row * dstWidth + col)*dstChannels, dstWidth, dstChannels, 3, dstWidth - col), src += dstChannels; - } - if (row < dstHeight) - { - for (col = 0; col < dstWidthFull; col += 3) - WinogradKernel3x3Block3x3SetOutput1t(src, srcStride, dst + (row * dstWidth + col)*dstChannels, dstWidth, dstChannels, dstHeight - row, 3), src += dstChannels; - if (col < dstWidth) - WinogradKernel3x3Block3x3SetOutput1t(src, srcStride, dst + (row * dstWidth + col)*dstChannels, dstWidth, dstChannels, dstHeight - row, dstWidth - col), src += dstChannels; - } - } - else - { - for (size_t c = 0; c < dstChannels; ++c) - { - size_t row, col; - for (row = 0; row < dstHeightFull; row += 3) - { - for (col = 0; col < dstWidthFull; col += 3) - WinogradKernel3x3Block3x3SetOutput1n(src++, srcStride, dst + row * dstWidth + col, dstWidth); - if (col < dstWidth) - WinogradKernel3x3Block3x3SetOutput1n(src++, srcStride, dst + row * dstWidth + col, dstWidth, 3, dstWidth - col); - } - if (row < dstHeight) - { - for (col = 0; col < dstWidthFull; col += 3) - WinogradKernel3x3Block3x3SetOutput1n(src++, srcStride, dst + row * dstWidth + col, dstWidth, dstHeight - row, 3); - if (col < dstWidth) - WinogradKernel3x3Block3x3SetOutput1n(src++, srcStride, dst + row * dstWidth + col, dstWidth, dstHeight - row, dstWidth - col); - } - dst += dstHeight * dstWidth; - } - } - } - - //----------------------------------------------------------------------- - - void WinogradKernel3x3Block4x4SetFilter(const float * src, size_t size, float * dst, SimdBool trans) - { - if (trans) - { - for (size_t i = 0; i < size; i += 1) - Base::WinogradKernel3x3Block4x4SetFilter1t(src + i, dst + i, size); - } - else - { - for (size_t i = 0; i < size; i += 1, src += 9, dst += 1) - Base::WinogradKernel3x3Block4x4SetFilter1n(src, dst, size); - } - } - - //----------------------------------------------------------------------- - - SIMD_INLINE void 
WinogradKernel3x3Block4x4SetInput1(const float src[36], float * dst, size_t stride) - { - float tmp[36]; - tmp[0] = 4 * src[0] - 5 * src[12] + src[24]; - tmp[1] = 4 * src[1] - 5 * src[13] + src[25]; - tmp[2] = 4 * src[2] - 5 * src[14] + src[26]; - tmp[3] = 4 * src[3] - 5 * src[15] + src[27]; - tmp[4] = 4 * src[4] - 5 * src[16] + src[28]; - tmp[5] = 4 * src[5] - 5 * src[17] + src[29]; - tmp[6] = -4 * src[6] - 4 * src[12] + src[18] + src[24]; - tmp[7] = -4 * src[7] - 4 * src[13] + src[19] + src[25]; - tmp[8] = -4 * src[8] - 4 * src[14] + src[20] + src[26]; - tmp[9] = -4 * src[9] - 4 * src[15] + src[21] + src[27]; - tmp[10] = -4 * src[10] - 4 * src[16] + src[22] + src[28]; - tmp[11] = -4 * src[11] - 4 * src[17] + src[23] + src[29]; - tmp[12] = 4 * src[6] - 4 * src[12] - src[18] + src[24]; - tmp[13] = 4 * src[7] - 4 * src[13] - src[19] + src[25]; - tmp[14] = 4 * src[8] - 4 * src[14] - src[20] + src[26]; - tmp[15] = 4 * src[9] - 4 * src[15] - src[21] + src[27]; - tmp[16] = 4 * src[10] - 4 * src[16] - src[22] + src[28]; - tmp[17] = 4 * src[11] - 4 * src[17] - src[23] + src[29]; - tmp[18] = -2 * src[6] - src[12] + 2 * src[18] + src[24]; - tmp[19] = -2 * src[7] - src[13] + 2 * src[19] + src[25]; - tmp[20] = -2 * src[8] - src[14] + 2 * src[20] + src[26]; - tmp[21] = -2 * src[9] - src[15] + 2 * src[21] + src[27]; - tmp[22] = -2 * src[10] - src[16] + 2 * src[22] + src[28]; - tmp[23] = -2 * src[11] - src[17] + 2 * src[23] + src[29]; - tmp[24] = 2 * src[6] - src[12] - 2 * src[18] + src[24]; - tmp[25] = 2 * src[7] - src[13] - 2 * src[19] + src[25]; - tmp[26] = 2 * src[8] - src[14] - 2 * src[20] + src[26]; - tmp[27] = 2 * src[9] - src[15] - 2 * src[21] + src[27]; - tmp[28] = 2 * src[10] - src[16] - 2 * src[22] + src[28]; - tmp[29] = 2 * src[11] - src[17] - 2 * src[23] + src[29]; - tmp[30] = 4 * src[6] - 5 * src[18] + src[30]; - tmp[31] = 4 * src[7] - 5 * src[19] + src[31]; - tmp[32] = 4 * src[8] - 5 * src[20] + src[32]; - tmp[33] = 4 * src[9] - 5 * src[21] + src[33]; - tmp[34] = 4 * src[10] - 5 * src[22] + src[34]; - tmp[35] = 4 * src[11] - 5 * src[23] + src[35]; - - dst[0 * stride] = tmp[0] * 4 - tmp[2] * 5 + tmp[4]; - dst[1 * stride] = -tmp[1] * 4 - tmp[2] * 4 + tmp[3] + tmp[4]; - dst[2 * stride] = tmp[1] * 4 - tmp[2] * 4 - tmp[3] + tmp[4]; - dst[3 * stride] = -tmp[1] * 2 - tmp[2] + tmp[3] * 2 + tmp[4]; - dst[4 * stride] = tmp[1] * 2 - tmp[2] - tmp[3] * 2 + tmp[4]; - dst[5 * stride] = tmp[1] * 4 - tmp[3] * 5 + tmp[5]; - dst[6 * stride] = tmp[6] * 4 - tmp[8] * 5 + tmp[10]; - dst[7 * stride] = -tmp[7] * 4 - tmp[8] * 4 + tmp[9] + tmp[10]; - dst[8 * stride] = tmp[7] * 4 - tmp[8] * 4 - tmp[9] + tmp[10]; - dst[9 * stride] = -tmp[7] * 2 - tmp[8] + tmp[9] * 2 + tmp[10]; - dst[10 * stride] = tmp[7] * 2 - tmp[8] - tmp[9] * 2 + tmp[10]; - dst[11 * stride] = tmp[7] * 4 - tmp[9] * 5 + tmp[11]; - dst[12 * stride] = tmp[12] * 4 - tmp[14] * 5 + tmp[16]; - dst[13 * stride] = -tmp[13] * 4 - tmp[14] * 4 + tmp[15] + tmp[16]; - dst[14 * stride] = tmp[13] * 4 - tmp[14] * 4 - tmp[15] + tmp[16]; - dst[15 * stride] = -tmp[13] * 2 - tmp[14] + tmp[15] * 2 + tmp[16]; - dst[16 * stride] = tmp[13] * 2 - tmp[14] - tmp[15] * 2 + tmp[16]; - dst[17 * stride] = tmp[13] * 4 - tmp[15] * 5 + tmp[17]; - dst[18 * stride] = tmp[18] * 4 - tmp[20] * 5 + tmp[22]; - dst[19 * stride] = -tmp[19] * 4 - tmp[20] * 4 + tmp[21] + tmp[22]; - dst[20 * stride] = tmp[19] * 4 - tmp[20] * 4 - tmp[21] + tmp[22]; - dst[21 * stride] = -tmp[19] * 2 - tmp[20] + tmp[21] * 2 + tmp[22]; - dst[22 * stride] = tmp[19] * 2 - tmp[20] - tmp[21] * 2 + tmp[22]; - dst[23 * 
stride] = tmp[19] * 4 - tmp[21] * 5 + tmp[23]; - dst[24 * stride] = tmp[24] * 4 - tmp[26] * 5 + tmp[28]; - dst[25 * stride] = -tmp[25] * 4 - tmp[26] * 4 + tmp[27] + tmp[28]; - dst[26 * stride] = tmp[25] * 4 - tmp[26] * 4 - tmp[27] + tmp[28]; - dst[27 * stride] = -tmp[25] * 2 - tmp[26] + tmp[27] * 2 + tmp[28]; - dst[28 * stride] = tmp[25] * 2 - tmp[26] - tmp[27] * 2 + tmp[28]; - dst[29 * stride] = tmp[25] * 4 - tmp[27] * 5 + tmp[29]; - dst[30 * stride] = tmp[30] * 4 - tmp[32] * 5 + tmp[34]; - dst[31 * stride] = -tmp[31] * 4 - tmp[32] * 4 + tmp[33] + tmp[34]; - dst[32 * stride] = tmp[31] * 4 - tmp[32] * 4 - tmp[33] + tmp[34]; - dst[33 * stride] = -tmp[31] * 2 - tmp[32] + tmp[33] * 2 + tmp[34]; - dst[34 * stride] = tmp[31] * 2 - tmp[32] - tmp[33] * 2 + tmp[34]; - dst[35 * stride] = tmp[31] * 4 - tmp[33] * 5 + tmp[35]; - } - - SIMD_INLINE void WinogradKernel3x3Block4x4SetInput1n(const float * src, size_t srcStride, float * dst, size_t dstStride) - { - float tmp[36]; - tmp[0] = src[0 * srcStride + 0]; - tmp[1] = src[0 * srcStride + 1]; - tmp[2] = src[0 * srcStride + 2]; - tmp[3] = src[0 * srcStride + 3]; - tmp[4] = src[0 * srcStride + 4]; - tmp[5] = src[0 * srcStride + 5]; - tmp[6] = src[1 * srcStride + 0]; - tmp[7] = src[1 * srcStride + 1]; - tmp[8] = src[1 * srcStride + 2]; - tmp[9] = src[1 * srcStride + 3]; - tmp[10] = src[1 * srcStride + 4]; - tmp[11] = src[1 * srcStride + 5]; - tmp[12] = src[2 * srcStride + 0]; - tmp[13] = src[2 * srcStride + 1]; - tmp[14] = src[2 * srcStride + 2]; - tmp[15] = src[2 * srcStride + 3]; - tmp[16] = src[2 * srcStride + 4]; - tmp[17] = src[2 * srcStride + 5]; - tmp[18] = src[3 * srcStride + 0]; - tmp[19] = src[3 * srcStride + 1]; - tmp[20] = src[3 * srcStride + 2]; - tmp[21] = src[3 * srcStride + 3]; - tmp[22] = src[3 * srcStride + 4]; - tmp[23] = src[3 * srcStride + 5]; - tmp[24] = src[4 * srcStride + 0]; - tmp[25] = src[4 * srcStride + 1]; - tmp[26] = src[4 * srcStride + 2]; - tmp[27] = src[4 * srcStride + 3]; - tmp[28] = src[4 * srcStride + 4]; - tmp[29] = src[4 * srcStride + 5]; - tmp[30] = src[5 * srcStride + 0]; - tmp[31] = src[5 * srcStride + 1]; - tmp[32] = src[5 * srcStride + 2]; - tmp[33] = src[5 * srcStride + 3]; - tmp[34] = src[5 * srcStride + 4]; - tmp[35] = src[5 * srcStride + 5]; - WinogradKernel3x3Block4x4SetInput1(tmp, dst, dstStride); - } - - SIMD_INLINE void WinogradKernel3x3Block4x4SetInput1n(const float * src, size_t srcStride, size_t rowB, size_t rowE, size_t colB, size_t colE, float * dst, size_t dstStride) - { - float tmp[6 * 6] = { 0 }; - for (size_t row = rowB; row < rowE; ++row) - for (size_t col = colB; col < colE; ++col) - tmp[row * 6 + col] = src[row * srcStride + col]; - WinogradKernel3x3Block4x4SetInput1(tmp, dst, dstStride); - } - - SIMD_INLINE void WinogradKernel3x3Block4x4SetInput1t(const float * src, size_t srcW, size_t srcC, float * dst, size_t dstStride) - { - size_t srcS = srcW * srcC; - for (size_t c = 0; c < srcC; ++c, src++, dst++) - { - float tmp[36]; - tmp[0] = src[0 * srcS + 0 * srcC]; - tmp[1] = src[0 * srcS + 1 * srcC]; - tmp[2] = src[0 * srcS + 2 * srcC]; - tmp[3] = src[0 * srcS + 3 * srcC]; - tmp[4] = src[0 * srcS + 4 * srcC]; - tmp[5] = src[0 * srcS + 5 * srcC]; - tmp[6] = src[1 * srcS + 0 * srcC]; - tmp[7] = src[1 * srcS + 1 * srcC]; - tmp[8] = src[1 * srcS + 2 * srcC]; - tmp[9] = src[1 * srcS + 3 * srcC]; - tmp[10] = src[1 * srcS + 4 * srcC]; - tmp[11] = src[1 * srcS + 5 * srcC]; - tmp[12] = src[2 * srcS + 0 * srcC]; - tmp[13] = src[2 * srcS + 1 * srcC]; - tmp[14] = src[2 * srcS + 2 * srcC]; - tmp[15] = src[2 
* srcS + 3 * srcC]; - tmp[16] = src[2 * srcS + 4 * srcC]; - tmp[17] = src[2 * srcS + 5 * srcC]; - tmp[18] = src[3 * srcS + 0 * srcC]; - tmp[19] = src[3 * srcS + 1 * srcC]; - tmp[20] = src[3 * srcS + 2 * srcC]; - tmp[21] = src[3 * srcS + 3 * srcC]; - tmp[22] = src[3 * srcS + 4 * srcC]; - tmp[23] = src[3 * srcS + 5 * srcC]; - tmp[24] = src[4 * srcS + 0 * srcC]; - tmp[25] = src[4 * srcS + 1 * srcC]; - tmp[26] = src[4 * srcS + 2 * srcC]; - tmp[27] = src[4 * srcS + 3 * srcC]; - tmp[28] = src[4 * srcS + 4 * srcC]; - tmp[29] = src[4 * srcS + 5 * srcC]; - tmp[30] = src[5 * srcS + 0 * srcC]; - tmp[31] = src[5 * srcS + 1 * srcC]; - tmp[32] = src[5 * srcS + 2 * srcC]; - tmp[33] = src[5 * srcS + 3 * srcC]; - tmp[34] = src[5 * srcS + 4 * srcC]; - tmp[35] = src[5 * srcS + 5 * srcC]; - WinogradKernel3x3Block4x4SetInput1(tmp, dst, dstStride); - } - } - - SIMD_INLINE void WinogradKernel3x3Block4x4SetInput1t(const float * src, size_t srcW, size_t srcC, size_t rowB, size_t rowE, size_t colB, size_t colE, float * dst, size_t dstStride) - { - size_t srcS = srcW * srcC; - for (size_t c = 0; c < srcC; ++c, src++, dst++) - { - float tmp[36] = { 0 }; - for (size_t row = rowB; row < rowE; ++row) - for (size_t col = colB; col < colE; ++col) - tmp[row * 6 + col] = src[row * srcS + col * srcC]; - WinogradKernel3x3Block4x4SetInput1(tmp, dst, dstStride); - } - } - - void WinogradKernel3x3Block4x4SetInput(const float* src, size_t srcChannels, size_t srcHeight, size_t srcWidth, - size_t padY, size_t padX, size_t padH, size_t padW, float* dst, size_t dstStride, SimdBool trans) - { - assert(padY + padH <= 2 && padX + padW <= 2); - size_t dstH = srcHeight - 2 + padY + padH; - size_t dstW = srcWidth - 2 + padX + padW; - size_t dstH4 = dstH / 4 * 4; - size_t dstW4 = dstW / 4 * 4; - size_t noseW = Simd::Min<size_t>(6, srcWidth + padX); - size_t noseH = Simd::Min<size_t>(6, srcHeight + padY); - size_t startY = padY ? 4 : 0; - size_t startX = padX ? 4 : 0; - if (padH && dstH == dstH4) - dstH4 -= 4; - if(padY) - src -= srcWidth * (trans ? srcChannels : 1); - if (padW && dstW == dstW4) - dstW4 -= 4; - if(padX) - src -= 1 * (trans ? srcChannels : 1); - size_t tailW = dstW - dstW4 + (padW ? 1 : 2); - size_t tailH = dstH - dstH4 + (padH ? 
1 : 2); - if (trans) - { - size_t row = 0, col = 0; - if (padY) - { - if (padX) - WinogradKernel3x3Block4x4SetInput1t(src, srcWidth, srcChannels, 1, noseH, 1, noseW, dst, dstStride), dst += srcChannels; - for (col = startX; col < dstW4; col += 4) - WinogradKernel3x3Block4x4SetInput1t(src + col * srcChannels, srcWidth, srcChannels, 1, noseH, 0, 6, dst, dstStride), dst += srcChannels; - if (col < dstW) - WinogradKernel3x3Block4x4SetInput1t(src + col * srcChannels, srcWidth, srcChannels, 1, noseH, 0, tailW, dst, dstStride), dst += srcChannels; - } - for (row = startY; row < dstH4; row += 4) - { - if (padX) - WinogradKernel3x3Block4x4SetInput1t(src + row * srcWidth * srcChannels, srcWidth, srcChannels, 0, 6, 1, noseW, dst, dstStride), dst += srcChannels; - for (col = startX; col < dstW4; col += 4) - WinogradKernel3x3Block4x4SetInput1t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, dst, dstStride), dst += srcChannels; - if (col < dstW) - WinogradKernel3x3Block4x4SetInput1t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, 0, 6, 0, tailW, dst, dstStride), dst += srcChannels; - } - if (row < dstH) - { - if (padX) - WinogradKernel3x3Block4x4SetInput1t(src + row * srcWidth* srcChannels, srcWidth, srcChannels, 0, tailH, 1, noseW, dst, dstStride), dst += srcChannels; - for (col = startX; col < dstW4; col += 4) - WinogradKernel3x3Block4x4SetInput1t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, 0, tailH, 0, 6, dst, dstStride), dst += srcChannels; - if (col < dstW) - WinogradKernel3x3Block4x4SetInput1t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, 0, tailH, 0, tailW, dst, dstStride), dst += srcChannels; - } - } - else - { - for (size_t c = 0; c < srcChannels; ++c) - { - size_t row = 0, col = 0; - if (padY) - { - if (padX) - WinogradKernel3x3Block4x4SetInput1n(src, srcWidth, 1, noseH, 1, noseW, dst++, dstStride); - for (col = startX; col < dstW4; col += 4) - WinogradKernel3x3Block4x4SetInput1n(src + col, srcWidth, 1, noseH, 0, 6, dst++, dstStride); - if (col < dstW) - WinogradKernel3x3Block4x4SetInput1n(src + col, srcWidth, 1, noseH, 0, tailW, dst++, dstStride); - } - for (row = startY; row < dstH4; row += 4) - { - if (padX) - WinogradKernel3x3Block4x4SetInput1n(src + row * srcWidth, srcWidth, 0, 6, 1, noseW, dst++, dstStride); - for (col = startX; col < dstW4; col += 4) - WinogradKernel3x3Block4x4SetInput1n(src + row * srcWidth + col, srcWidth, dst++, dstStride); - if (col < dstW) - WinogradKernel3x3Block4x4SetInput1n(src + row * srcWidth + col, srcWidth, 0, 6, 0, tailW, dst++, dstStride); - } - if (row < dstH) - { - if (padX) - WinogradKernel3x3Block4x4SetInput1n(src + row * srcWidth, srcWidth, 0, tailH, 1, noseW, dst++, dstStride); - for (col = startX; col < dstW4; col += 4) - WinogradKernel3x3Block4x4SetInput1n(src + row * srcWidth + col, srcWidth, 0, tailH, 0, 6, dst++, dstStride); - if (col < dstW) - WinogradKernel3x3Block4x4SetInput1n(src + row * srcWidth + col, srcWidth, 0, tailH, 0, tailW, dst++, dstStride); - } - src += srcWidth * srcHeight; - } - } - } - - //----------------------------------------------------------------------- - - SIMD_INLINE void WinogradKernel3x3Block4x4SetOutput1(const float * src, size_t stride, float dst[16]) - { - float s[36]; - s[0] = src[0 * stride]; - s[1] = src[1 * stride]; - s[2] = src[2 * stride]; - s[3] = src[3 * stride]; - s[4] = src[4 * stride]; - s[5] = src[5 * stride]; - s[6] = src[6 * stride]; - s[7] = src[7 * stride]; - s[8] = src[8 * stride]; - s[9] = src[9 * stride]; - s[10] = 
src[10 * stride]; - s[11] = src[11 * stride]; - s[12] = src[12 * stride]; - s[13] = src[13 * stride]; - s[14] = src[14 * stride]; - s[15] = src[15 * stride]; - s[16] = src[16 * stride]; - s[17] = src[17 * stride]; - s[18] = src[18 * stride]; - s[19] = src[19 * stride]; - s[20] = src[20 * stride]; - s[21] = src[21 * stride]; - s[22] = src[22 * stride]; - s[23] = src[23 * stride]; - s[24] = src[24 * stride]; - s[25] = src[25 * stride]; - s[26] = src[26 * stride]; - s[27] = src[27 * stride]; - s[28] = src[28 * stride]; - s[29] = src[29 * stride]; - s[30] = src[30 * stride]; - s[31] = src[31 * stride]; - s[32] = src[32 * stride]; - s[33] = src[33 * stride]; - s[34] = src[34 * stride]; - s[35] = src[35 * stride]; - - float t[24]; - t[0] = s[0] + s[6] + s[12] + s[18] + s[24]; - t[1] = s[1] + s[7] + s[13] + s[19] + s[25]; - t[2] = s[2] + s[8] + s[14] + s[20] + s[26]; - t[3] = s[3] + s[9] + s[15] + s[21] + s[27]; - t[4] = s[4] + s[10] + s[16] + s[22] + s[28]; - t[5] = s[5] + s[11] + s[17] + s[23] + s[29]; - t[6] = s[6] - s[12] + 2 * s[18] - 2 * s[24]; - t[7] = s[7] - s[13] + 2 * s[19] - 2 * s[25]; - t[8] = s[8] - s[14] + 2 * s[20] - 2 * s[26]; - t[9] = s[9] - s[15] + 2 * s[21] - 2 * s[27]; - t[10] = s[10] - s[16] + 2 * s[22] - 2 * s[28]; - t[11] = s[11] - s[17] + 2 * s[23] - 2 * s[29]; - t[12] = s[6] + s[12] + 4 * s[18] + 4 * s[24]; - t[13] = s[7] + s[13] + 4 * s[19] + 4 * s[25]; - t[14] = s[8] + s[14] + 4 * s[20] + 4 * s[26]; - t[15] = s[9] + s[15] + 4 * s[21] + 4 * s[27]; - t[16] = s[10] + s[16] + 4 * s[22] + 4 * s[28]; - t[17] = s[11] + s[17] + 4 * s[23] + 4 * s[29]; - t[18] = s[6] - s[12] + 8 * s[18] - 8 * s[24] + s[30]; - t[19] = s[7] - s[13] + 8 * s[19] - 8 * s[25] + s[31]; - t[20] = s[8] - s[14] + 8 * s[20] - 8 * s[26] + s[32]; - t[21] = s[9] - s[15] + 8 * s[21] - 8 * s[27] + s[33]; - t[22] = s[10] - s[16] + 8 * s[22] - 8 * s[28] + s[34]; - t[23] = s[11] - s[17] + 8 * s[23] - 8 * s[29] + s[35]; - - dst[0] = t[0] + t[1] + t[2] + t[3] + t[4]; - dst[1] = t[1] - t[2] + 2 * t[3] - 2 * t[4]; - dst[2] = t[1] + t[2] + 4 * t[3] + 4 * t[4]; - dst[3] = t[1] - t[2] + 8 * t[3] - 8 * t[4] + t[5]; - dst[4] = t[6] + t[7] + t[8] + t[9] + t[10]; - dst[5] = t[7] - t[8] + 2 * t[9] - 2 * t[10]; - dst[6] = t[7] + t[8] + 4 * t[9] + 4 * t[10]; - dst[7] = t[7] - t[8] + 8 * t[9] - 8 * t[10] + t[11]; - dst[8] = t[12] + t[13] + t[14] + t[15] + t[16]; - dst[9] = t[13] - t[14] + 2 * t[15] - 2 * t[16]; - dst[10] = t[13] + t[14] + 4 * t[15] + 4 * t[16]; - dst[11] = t[13] - t[14] + 8 * t[15] - 8 * t[16] + t[17]; - dst[12] = t[18] + t[19] + t[20] + t[21] + t[22]; - dst[13] = t[19] - t[20] + 2 * t[21] - 2 * t[22]; - dst[14] = t[19] + t[20] + 4 * t[21] + 4 * t[22]; - dst[15] = t[19] - t[20] + 8 * t[21] - 8 * t[22] + t[23]; - } - - SIMD_INLINE void WinogradKernel3x3Block4x4SetOutput1n(const float * src, size_t srcStride, float * dst, size_t dstStride) - { - float tmp[16]; - WinogradKernel3x3Block4x4SetOutput1(src, srcStride, tmp); - dst[0 * dstStride + 0] = tmp[0]; - dst[0 * dstStride + 1] = tmp[1]; - dst[0 * dstStride + 2] = tmp[2]; - dst[0 * dstStride + 3] = tmp[3]; - dst[1 * dstStride + 0] = tmp[4]; - dst[1 * dstStride + 1] = tmp[5]; - dst[1 * dstStride + 2] = tmp[6]; - dst[1 * dstStride + 3] = tmp[7]; - dst[2 * dstStride + 0] = tmp[8]; - dst[2 * dstStride + 1] = tmp[9]; - dst[2 * dstStride + 2] = tmp[10]; - dst[2 * dstStride + 3] = tmp[11]; - dst[3 * dstStride + 0] = tmp[12]; - dst[3 * dstStride + 1] = tmp[13]; - dst[3 * dstStride + 2] = tmp[14]; - dst[3 * dstStride + 3] = tmp[15]; - } - - SIMD_INLINE void 
WinogradKernel3x3Block4x4SetOutput1n(const float * src, size_t srcStride, float * dst, size_t dstStride, size_t rowE, size_t colE) - { - float tmp[16]; - WinogradKernel3x3Block4x4SetOutput1(src, srcStride, tmp); - for (size_t row = 0; row < rowE; ++row) - for (size_t col = 0; col < colE; ++col) - dst[row*dstStride + col] = tmp[row * 4 + col]; - } - - SIMD_INLINE void WinogradKernel3x3Block4x4SetOutput1t(const float * src, size_t srcStride, float * dst, size_t dstW, size_t dstC) - { - size_t dstS = dstW * dstC; - for (size_t d = 0; d < dstC; ++d, src++, dst++) - { - float tmp[16]; - WinogradKernel3x3Block4x4SetOutput1(src, srcStride, tmp); - dst[0 * dstS + 0 * dstC] = tmp[0]; - dst[0 * dstS + 1 * dstC] = tmp[1]; - dst[0 * dstS + 2 * dstC] = tmp[2]; - dst[0 * dstS + 3 * dstC] = tmp[3]; - dst[1 * dstS + 0 * dstC] = tmp[4]; - dst[1 * dstS + 1 * dstC] = tmp[5]; - dst[1 * dstS + 2 * dstC] = tmp[6]; - dst[1 * dstS + 3 * dstC] = tmp[7]; - dst[2 * dstS + 0 * dstC] = tmp[8]; - dst[2 * dstS + 1 * dstC] = tmp[9]; - dst[2 * dstS + 2 * dstC] = tmp[10]; - dst[2 * dstS + 3 * dstC] = tmp[11]; - dst[3 * dstS + 0 * dstC] = tmp[12]; - dst[3 * dstS + 1 * dstC] = tmp[13]; - dst[3 * dstS + 2 * dstC] = tmp[14]; - dst[3 * dstS + 3 * dstC] = tmp[15]; - } - } - - SIMD_INLINE void WinogradKernel3x3Block4x4SetOutput1t(const float * src, size_t srcStride, float * dst, size_t dstW, size_t dstC, size_t rowE, size_t colE) - { - size_t dstS = dstW * dstC; - for (size_t d = 0; d < dstC; ++d, src++, dst++) - { - float tmp[16]; - WinogradKernel3x3Block4x4SetOutput1(src, srcStride, tmp); - for (size_t row = 0; row < rowE; ++row) - for (size_t col = 0; col < colE; ++col) - dst[row*dstS + col * dstC] = tmp[row * 4 + col]; - } - } - - void WinogradKernel3x3Block4x4SetOutput(const float * src, size_t srcStride, float * dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans) - { - size_t dstHeightFull = dstHeight / 4 * 4; - size_t dstWidthFull = dstWidth / 4 * 4; - if (trans) - { - size_t row, col; - for (row = 0; row < dstHeightFull; row += 4) - { - for (col = 0; col < dstWidthFull; col += 4) - WinogradKernel3x3Block4x4SetOutput1t(src, srcStride, dst + (row * dstWidth + col)*dstChannels, dstWidth, dstChannels), src += dstChannels; - if (col < dstWidth) - WinogradKernel3x3Block4x4SetOutput1t(src, srcStride, dst + (row * dstWidth + col)*dstChannels, dstWidth, dstChannels, 4, dstWidth - col), src += dstChannels; - } - if (row < dstHeight) - { - for (col = 0; col < dstWidthFull; col += 4) - WinogradKernel3x3Block4x4SetOutput1t(src, srcStride, dst + (row * dstWidth + col)*dstChannels, dstWidth, dstChannels, dstHeight - row, 4), src += dstChannels; - if (col < dstWidth) - WinogradKernel3x3Block4x4SetOutput1t(src, srcStride, dst + (row * dstWidth + col)*dstChannels, dstWidth, dstChannels, dstHeight - row, dstWidth - col), src += dstChannels; - } - } - else - { - for (size_t c = 0; c < dstChannels; ++c) - { - size_t row, col; - for (row = 0; row < dstHeightFull; row += 4) - { - for (col = 0; col < dstWidthFull; col += 4) - WinogradKernel3x3Block4x4SetOutput1n(src++, srcStride, dst + row * dstWidth + col, dstWidth); - if (col < dstWidth) - WinogradKernel3x3Block4x4SetOutput1n(src++, srcStride, dst + row * dstWidth + col, dstWidth, 4, dstWidth - col); - } - if (row < dstHeight) - { - for (col = 0; col < dstWidthFull; col += 4) - WinogradKernel3x3Block4x4SetOutput1n(src++, srcStride, dst + row * dstWidth + col, dstWidth, dstHeight - row, 4); - if (col < dstWidth) - WinogradKernel3x3Block4x4SetOutput1n(src++, srcStride, 
dst + row * dstWidth + col, dstWidth, dstHeight - row, dstWidth - col); - } - dst += dstHeight * dstWidth; - } - } - } - } -} diff --git a/src/3rd/Simd/Simd/SimdBaseYuvToBgr.cpp b/src/3rd/Simd/Simd/SimdBaseYuvToBgr.cpp deleted file mode 100644 index c9b18528..00000000 --- a/src/3rd/Simd/Simd/SimdBaseYuvToBgr.cpp +++ /dev/null @@ -1,146 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdConversion.h" - -namespace Simd -{ - namespace Base - { - SIMD_INLINE void Yuv422pToBgr(const uint8_t *y, int u, int v, uint8_t * bgr) - { - YuvToBgr(y[0], u, v, bgr); - YuvToBgr(y[1], u, v, bgr + 3); - } - - void Yuv420pToBgr(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * bgr, size_t bgrStride) - { - assert((width % 2 == 0) && (height % 2 == 0) && (width >= 2) && (height >= 2)); - - for (size_t row = 0; row < height; row += 2) - { - for (size_t colUV = 0, colY = 0, colBgr = 0; colY < width; colY += 2, colUV++, colBgr += 6) - { - int u_ = u[colUV]; - int v_ = v[colUV]; - Yuv422pToBgr(y + colY, u_, v_, bgr + colBgr); - Yuv422pToBgr(y + yStride + colY, u_, v_, bgr + bgrStride + colBgr); - } - y += 2 * yStride; - u += uStride; - v += vStride; - bgr += 2 * bgrStride; - } - } - - void Yuv422pToBgr(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * bgr, size_t bgrStride) - { - assert((width % 2 == 0) && (width >= 2)); - - for (size_t row = 0; row < height; ++row) - { - for (size_t colUV = 0, colY = 0, colBgr = 0; colY < width; colY += 2, colUV++, colBgr += 6) - Yuv422pToBgr(y + colY, u[colUV], v[colUV], bgr + colBgr); - y += yStride; - u += uStride; - v += vStride; - bgr += bgrStride; - } - } - - void Yuv444pToBgr(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * bgr, size_t bgrStride) - { - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0, colBgr = 0; col < width; col++, colBgr += 3) - YuvToBgr(y[col], u[col], v[col], bgr + colBgr); - y += yStride; - u += uStride; - v += vStride; - bgr += bgrStride; - } - } - - //--------------------------------------------------------------------- - - SIMD_INLINE void Yuv422pToRgb(const uint8_t* y, int u, int v, uint8_t* rgb) - { - 
YuvToRgb(y[0], u, v, rgb); - YuvToRgb(y[1], u, v, rgb + 3); - } - - void Yuv420pToRgb(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride, - size_t width, size_t height, uint8_t* rgb, size_t rgbStride) - { - assert((width % 2 == 0) && (height % 2 == 0) && (width >= 2) && (height >= 2)); - - for (size_t row = 0; row < height; row += 2) - { - for (size_t colUV = 0, colY = 0, colRgb = 0; colY < width; colY += 2, colUV++, colRgb += 6) - { - int u_ = u[colUV]; - int v_ = v[colUV]; - Yuv422pToRgb(y + colY, u_, v_, rgb + colRgb); - Yuv422pToRgb(y + yStride + colY, u_, v_, rgb + rgbStride + colRgb); - } - y += 2 * yStride; - u += uStride; - v += vStride; - rgb += 2 * rgbStride; - } - } - - void Yuv422pToRgb(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride, - size_t width, size_t height, uint8_t* rgb, size_t rgbStride) - { - assert((width % 2 == 0) && (width >= 2)); - - for (size_t row = 0; row < height; ++row) - { - for (size_t colUV = 0, colY = 0, colRgb = 0; colY < width; colY += 2, colUV++, colRgb += 6) - Yuv422pToRgb(y + colY, u[colUV], v[colUV], rgb + colRgb); - y += yStride; - u += uStride; - v += vStride; - rgb += rgbStride; - } - } - - void Yuv444pToRgb(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride, - size_t width, size_t height, uint8_t* rgb, size_t rgbStride) - { - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0, colRgb = 0; col < width; col++, colRgb += 3) - YuvToRgb(y[col], u[col], v[col], rgb + colRgb); - y += yStride; - u += uStride; - v += vStride; - rgb += rgbStride; - } - } - } -} diff --git a/src/3rd/Simd/Simd/SimdBaseYuvToBgra.cpp b/src/3rd/Simd/Simd/SimdBaseYuvToBgra.cpp deleted file mode 100644 index 921f3bae..00000000 --- a/src/3rd/Simd/Simd/SimdBaseYuvToBgra.cpp +++ /dev/null @@ -1,115 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2019 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdConversion.h" - -namespace Simd -{ - namespace Base - { - SIMD_INLINE void Yuva422pToBgra(const uint8_t * y, int u, int v, const uint8_t * a, uint8_t * bgra) - { - YuvToBgra(y[0], u, v, a[0], bgra + 0); - YuvToBgra(y[1], u, v, a[1], bgra + 4); - } - - void Yuva420pToBgra(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - const uint8_t * a, size_t aStride, size_t width, size_t height, uint8_t * bgra, size_t bgraStride) - { - assert((width % 2 == 0) && (height % 2 == 0) && (width >= 2) && (height >= 2)); - - for (size_t row = 0; row < height; row += 2) - { - for (size_t colUV = 0, colY = 0, colBgra = 0; colY < width; colY += 2, colUV++, colBgra += 8) - { - int u_ = u[colUV]; - int v_ = v[colUV]; - Yuva422pToBgra(y + colY, u_, v_, a + colY, bgra + colBgra); - Yuva422pToBgra(y + yStride + colY, u_, v_, a + aStride + colY, bgra + bgraStride + colBgra); - } - y += 2 * yStride; - u += uStride; - v += vStride; - a += 2 * aStride; - bgra += 2 * bgraStride; - } - } - - SIMD_INLINE void Yuv422pToBgra(const uint8_t *y, int u, int v, int alpha, uint8_t * bgra) - { - YuvToBgra(y[0], u, v, alpha, bgra + 0); - YuvToBgra(y[1], u, v, alpha, bgra + 4); - } - - void Yuv420pToBgra(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * bgra, size_t bgraStride, uint8_t alpha) - { - assert((width % 2 == 0) && (height % 2 == 0) && (width >= 2) && (height >= 2)); - - for (size_t row = 0; row < height; row += 2) - { - for (size_t colUV = 0, colY = 0, colBgra = 0; colY < width; colY += 2, colUV++, colBgra += 8) - { - int u_ = u[colUV]; - int v_ = v[colUV]; - Yuv422pToBgra(y + colY, u_, v_, alpha, bgra + colBgra); - Yuv422pToBgra(y + yStride + colY, u_, v_, alpha, bgra + bgraStride + colBgra); - } - y += 2 * yStride; - u += uStride; - v += vStride; - bgra += 2 * bgraStride; - } - } - - void Yuv422pToBgra(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * bgra, size_t bgraStride, uint8_t alpha) - { - assert((width % 2 == 0) && (width >= 2)); - - for (size_t row = 0; row < height; ++row) - { - for (size_t colUV = 0, colY = 0, colBgra = 0; colY < width; colY += 2, colUV++, colBgra += 8) - Yuv422pToBgra(y + colY, u[colUV], v[colUV], alpha, bgra + colBgra); - y += yStride; - u += uStride; - v += vStride; - bgra += bgraStride; - } - } - - void Yuv444pToBgra(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * bgra, size_t bgraStride, uint8_t alpha) - { - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0, colBgra = 0; col < width; col++, colBgra += 4) - YuvToBgra(y[col], u[col], v[col], alpha, bgra + colBgra); - y += yStride; - u += uStride; - v += vStride; - bgra += bgraStride; - } - } - } -} diff --git a/src/3rd/Simd/Simd/SimdBaseYuvToHsl.cpp b/src/3rd/Simd/Simd/SimdBaseYuvToHsl.cpp deleted file mode 100644 index 5a373a47..00000000 --- a/src/3rd/Simd/Simd/SimdBaseYuvToHsl.cpp +++ /dev/null @@ -1,44 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdConversion.h" - -namespace Simd -{ - namespace Base - { - void Yuv444pToHsl(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * hsl, size_t hslStride) - { - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0, colHsl = 0; col < width; col++, colHsl += 3) - YuvToHsl(y[col], u[col], v[col], hsl + colHsl); - y += yStride; - u += uStride; - v += vStride; - hsl += hslStride; - } - } - } -} diff --git a/src/3rd/Simd/Simd/SimdBaseYuvToHsv.cpp b/src/3rd/Simd/Simd/SimdBaseYuvToHsv.cpp deleted file mode 100644 index ac12b921..00000000 --- a/src/3rd/Simd/Simd/SimdBaseYuvToHsv.cpp +++ /dev/null @@ -1,44 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdConversion.h" - -namespace Simd -{ - namespace Base - { - void Yuv444pToHsv(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * hsv, size_t hsvStride) - { - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0, colHsv = 0; col < width; col++, colHsv += 3) - YuvToHsv(y[col], u[col], v[col], hsv + colHsv); - y += yStride; - u += uStride; - v += vStride; - hsv += hsvStride; - } - } - } -} diff --git a/src/3rd/Simd/Simd/SimdBaseYuvToHue.cpp b/src/3rd/Simd/Simd/SimdBaseYuvToHue.cpp deleted file mode 100644 index b98fc00e..00000000 --- a/src/3rd/Simd/Simd/SimdBaseYuvToHue.cpp +++ /dev/null @@ -1,99 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdConversion.h" - -namespace Simd -{ - namespace Base - { - SIMD_INLINE int YuvToHue(int y, int u, int v) - { - int red = YuvToRed(y, v); - int green = YuvToGreen(y, u, v); - int blue = YuvToBlue(y, u); - - int max = Max(red, Max(green, blue)); - int min = Min(red, Min(green, blue)); - int range = max - min; - - if (range) - { - int dividend; - - if (red == max) - dividend = green - blue + 6 * range; - else if (green == max) - dividend = blue - red + 2 * range; - else - dividend = red - green + 4 * range; - - return int(KF_255_DIV_6*float(dividend) / float(range) -#if defined(_MSC_VER) - +0.00001f -#endif - ); - } - return 0; - } - - void Yuv420pToHue(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * hue, size_t hueStride) - { - assert((width % 2 == 0) && (height % 2 == 0) && (width >= 2) && (height >= 2)); - - for (size_t row = 0; row < height; row += 2) - { - for (size_t col1 = 0, col2 = 0; col2 < width; col2 += 2, col1++) - { - int u_ = u[col1]; - int v_ = v[col1]; - hue[col2] = YuvToHue(y[col2], u_, v_); - hue[col2 + 1] = YuvToHue(y[col2 + 1], u_, v_); - hue[col2 + hueStride] = YuvToHue(y[col2 + yStride], u_, v_); - hue[col2 + hueStride + 1] = YuvToHue(y[col2 + yStride + 1], u_, v_); - } - y += 2 * yStride; - u += uStride; - v += vStride; - hue += 2 * hueStride; - } - } - - void Yuv444pToHue(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * hue, size_t hueStride) - { - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < width; ++col) - { - hue[col] = YuvToHue(y[col], u[col], v[col]); - } - y += yStride; - u += uStride; - v += vStride; - hue += hueStride; - } - } - } -} diff --git a/src/3rd/Simd/Simd/SimdBayer.h b/src/3rd/Simd/Simd/SimdBayer.h deleted file mode 100644 index ccdc2dd8..00000000 --- a/src/3rd/Simd/Simd/SimdBayer.h +++ /dev/null @@ -1,630 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2019 Yermalayeu Ihar, -* 2014-2015 Antonenka Mikhail. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#ifndef __SimdBayer_h__ -#define __SimdBayer_h__ - -#include "Simd/SimdConst.h" -#include "Simd/SimdMath.h" -#include "Simd/SimdLoad.h" - -namespace Simd -{ - namespace Base - { - SIMD_INLINE uint8_t BayerToGreen(uint8_t greenLeft, uint8_t greenTop, uint8_t greenRight, uint8_t greenBottom, - uint8_t blueOrRedLeft, uint8_t blueOrRedTop, uint8_t blueOrRedRight, uint8_t blueOrRedBottom) - { - int verticalAbsDifference = AbsDifference(blueOrRedTop, blueOrRedBottom); - int horizontalAbsDifference = AbsDifference(blueOrRedLeft, blueOrRedRight); - if (verticalAbsDifference < horizontalAbsDifference) - return Average(greenTop, greenBottom); - else if (verticalAbsDifference > horizontalAbsDifference) - return Average(greenRight, greenLeft); - else - return Average(greenLeft, greenTop, greenRight, greenBottom); - } - - template <SimdPixelFormatType bayerFormat> void BayerToBgr(const uint8_t * src[6], - size_t col0, size_t col1, size_t col2, size_t col3, size_t col4, size_t col5, - uint8_t * dst00, uint8_t * dst01, uint8_t * dst10, uint8_t * dst11); - - template <> SIMD_INLINE void BayerToBgr<SimdPixelFormatBayerGrbg>(const uint8_t * src[6], - size_t col0, size_t col1, size_t col2, size_t col3, size_t col4, size_t col5, - uint8_t * dst00, uint8_t * dst01, uint8_t * dst10, uint8_t * dst11) - { - dst00[0] = Average(src[1][col2], src[3][col2]); - dst00[1] = src[2][col2]; - dst00[2] = Average(src[2][col1], src[2][col3]); - - dst01[0] = Average(src[1][col2], src[1][col4], src[3][col2], src[3][col4]); - dst01[1] = BayerToGreen(src[2][col2], src[1][col3], src[2][col4], src[3][col3], src[2][col1], src[0][col3], src[2][col5], src[4][col3]); - dst01[2] = src[2][col3]; - - dst10[0] = src[3][col2]; - dst10[1] = BayerToGreen(src[3][col1], src[2][col2], src[3][col3], src[4][col2], src[3][col0], src[1][col2], src[3][col4], src[5][col2]); - dst10[2] = Average(src[2][col1], src[2][col3], src[4][col1], src[4][col3]); - - dst11[0] = Average(src[3][col2], src[3][col4]); - dst11[1] = src[3][col3]; - dst11[2] = Average(src[2][col3], src[4][col3]); - } - - template <> SIMD_INLINE void BayerToBgr<SimdPixelFormatBayerGbrg>(const uint8_t * src[6], - size_t col0, size_t col1, size_t col2, size_t col3, size_t col4, size_t col5, - uint8_t * dst00, uint8_t * dst01, uint8_t * dst10, uint8_t * dst11) - { - dst00[0] = Average(src[2][col1], src[2][col3]); - dst00[1] = src[2][col2]; - dst00[2] = Average(src[1][col2], src[3][col2]); - - dst01[0] = src[2][col3]; - dst01[1] = BayerToGreen(src[2][col2], src[1][col3], src[2][col4], src[3][col3], src[2][col1], src[0][col3], src[2][col5], src[4][col3]); - dst01[2] = Average(src[1][col2], src[1][col4], src[3][col2], src[3][col4]); - - dst10[0] = Average(src[2][col1], src[2][col3], src[4][col1], src[4][col3]); - dst10[1] = BayerToGreen(src[3][col1], src[2][col2], src[3][col3], src[4][col2], src[3][col0], src[1][col2], src[3][col4], src[5][col2]); - dst10[2] = src[3][col2]; - - dst11[0] = Average(src[2][col3], src[4][col3]); - dst11[1] = src[3][col3]; - dst11[2] = Average(src[3][col2], src[3][col4]); - } - - template <> SIMD_INLINE void BayerToBgr<SimdPixelFormatBayerRggb>(const uint8_t * src[6], - size_t col0, size_t col1, size_t col2, size_t col3, size_t col4, size_t col5, - uint8_t * dst00, uint8_t * dst01, uint8_t * dst10, uint8_t * dst11) - { - dst00[0] = Average(src[1][col1], src[1][col3], src[3][col1], src[3][col3]); - dst00[1] = BayerToGreen(src[2][col1], src[1][col2], src[2][col3], src[3][col2], src[2][col0], src[0][col2], src[2][col4], src[4][col2]); - dst00[2] = src[2][col2]; - - dst01[0] = Average(src[1][col3], src[3][col3]); - dst01[1] = src[2][col3]; - dst01[2] = Average(src[2][col2],
src[2][col4]); - - dst10[0] = Average(src[3][col1], src[3][col3]); - dst10[1] = src[3][col2]; - dst10[2] = Average(src[2][col2], src[4][col2]); - - dst11[0] = src[3][col3]; - dst11[1] = BayerToGreen(src[3][col2], src[2][col3], src[3][col4], src[4][col3], src[3][col1], src[1][col3], src[3][col5], src[5][col3]); - dst11[2] = Average(src[2][col2], src[2][col4], src[4][col2], src[4][col4]); - } - - template <> SIMD_INLINE void BayerToBgr(const uint8_t * src[6], - size_t col0, size_t col1, size_t col2, size_t col3, size_t col4, size_t col5, - uint8_t * dst00, uint8_t * dst01, uint8_t * dst10, uint8_t * dst11) - { - dst00[0] = src[2][col2]; - dst00[1] = BayerToGreen(src[2][col1], src[1][col2], src[2][col3], src[3][col2], src[2][col0], src[0][col2], src[2][col4], src[4][col2]); - dst00[2] = Average(src[1][col1], src[1][col3], src[3][col1], src[3][col3]); - - dst01[0] = Average(src[2][col2], src[2][col4]); - dst01[1] = src[2][col3]; - dst01[2] = Average(src[1][col3], src[3][col3]); - - dst10[0] = Average(src[2][col2], src[4][col2]); - dst10[1] = src[3][col2]; - dst10[2] = Average(src[3][col1], src[3][col3]); - - dst11[0] = Average(src[2][col2], src[2][col4], src[4][col2], src[4][col4]); - dst11[1] = BayerToGreen(src[3][col2], src[2][col3], src[3][col4], src[4][col3], src[3][col1], src[1][col3], src[3][col5], src[5][col3]); - dst11[2] = src[3][col3]; - } - } - -#ifdef SIMD_SSE2_ENABLE - namespace Sse2 - { - SIMD_INLINE void LoadBayerNose(const uint8_t * src, __m128i dst[3]) - { - dst[2] = _mm_loadu_si128((__m128i*)(src + 1)); - dst[0] = _mm_or_si128(_mm_slli_si128(_mm_loadu_si128((__m128i*)src), 1), _mm_and_si128(dst[2], _mm_srli_si128(K_INV_ZERO, A - 1))); - } - - SIMD_INLINE void LoadBayerTail(const uint8_t * src, __m128i dst[3]) - { - dst[0] = _mm_loadu_si128((__m128i*)(src - 1)); - dst[2] = _mm_or_si128(_mm_srli_si128(_mm_loadu_si128((__m128i*)src), 1), _mm_and_si128(dst[0], _mm_slli_si128(K_INV_ZERO, A - 1))); - } - - template SIMD_INLINE void LoadBayerNose(const uint8_t * src[3], size_t offset, size_t stride, __m128i dst[12]) - { - dst[1] = Load((__m128i*)(src[0] + offset)); - LoadBayerNose(src[0] + offset + stride, dst + 0); - LoadNose3(src[1] + offset, dst + 3); - LoadNose3(src[1] + offset + stride, dst + 6); - LoadBayerNose(src[2] + offset, dst + 9); - dst[10] = Load((__m128i*)(src[2] + offset + stride)); - } - - template SIMD_INLINE void LoadBayerBody(const uint8_t * src[3], size_t offset, size_t stride, __m128i dst[12]) - { - dst[1] = Load((__m128i*)(src[0] + offset)); - LoadBodyDx(src[0] + offset + stride, dst + 0); - LoadBody3(src[1] + offset, dst + 3); - LoadBody3(src[1] + offset + stride, dst + 6); - LoadBodyDx(src[2] + offset, dst + 9); - dst[10] = Load((__m128i*)(src[2] + offset + stride)); - } - - template SIMD_INLINE void LoadBayerTail(const uint8_t * src[3], size_t offset, size_t stride, __m128i dst[12]) - { - dst[1] = Load((__m128i*)(src[0] + offset)); - LoadBayerTail(src[0] + offset + stride, dst + 0); - LoadTail3(src[1] + offset, dst + 3); - LoadTail3(src[1] + offset + stride, dst + 6); - LoadBayerTail(src[2] + offset, dst + 9); - dst[10] = Load((__m128i*)(src[2] + offset + stride)); - } - - template SIMD_INLINE __m128i Get(const __m128i src[12]) - { - return U8To16(src[index]); - } - - SIMD_INLINE __m128i BayerToGreen(const __m128i & greenLeft, const __m128i & greenTop, const __m128i & greenRight, const __m128i & greenBottom, - const __m128i & blueOrRedLeft, const __m128i & blueOrRedTop, const __m128i & blueOrRedRight, const __m128i & blueOrRedBottom) - { - __m128i 
verticalAbsDifference = AbsDifferenceI16(blueOrRedTop, blueOrRedBottom); - __m128i horizontalAbsDifference = AbsDifferenceI16(blueOrRedLeft, blueOrRedRight); - __m128i green = Average16(greenLeft, greenTop, greenRight, greenBottom); - green = Combine(_mm_cmplt_epi16(verticalAbsDifference, horizontalAbsDifference), _mm_avg_epu16(greenTop, greenBottom), green); - return Combine(_mm_cmpgt_epi16(verticalAbsDifference, horizontalAbsDifference), _mm_avg_epu16(greenRight, greenLeft), green); - } - - template void BayerToBgr(const __m128i s[12], __m128i d[6]); - - template <> SIMD_INLINE void BayerToBgr(const __m128i s[12], __m128i d[6]) - { - d[0] = Merge16(_mm_avg_epu16(Get<0, 1>(s), Get<7, 0>(s)), Average16(Get<0, 1>(s), Get<2, 1>(s), Get<7, 0>(s), Get<8, 0>(s))); - d[1] = Merge16(Get<4, 0>(s), BayerToGreen(Get<4, 0>(s), Get<2, 0>(s), Get<5, 0>(s), Get<7, 1>(s), Get<3, 1>(s), Get<1, 1>(s), Get<5, 1>(s), Get<11, 0>(s))); - d[2] = Merge16(_mm_avg_epu16(Get<3, 1>(s), Get<4, 1>(s)), Get<4, 1>(s)); - d[3] = Merge16(Get<7, 0>(s), _mm_avg_epu16(Get<7, 0>(s), Get<8, 0>(s))); - d[4] = Merge16(BayerToGreen(Get<6, 1>(s), Get<4, 0>(s), Get<7, 1>(s), Get<9, 1>(s), Get<6, 0>(s), Get<0, 1>(s), Get<8, 0>(s), Get<10, 0>(s)), Get<7, 1>(s)); - d[5] = Merge16(Average16(Get<3, 1>(s), Get<4, 1>(s), Get<9, 0>(s), Get<11, 0>(s)), _mm_avg_epu16(Get<4, 1>(s), Get<11, 0>(s))); - } - - template <> SIMD_INLINE void BayerToBgr(const __m128i s[12], __m128i d[6]) - { - d[0] = Merge16(_mm_avg_epu16(Get<3, 1>(s), Get<4, 1>(s)), Get<4, 1>(s)); - d[1] = Merge16(Get<4, 0>(s), BayerToGreen(Get<4, 0>(s), Get<2, 0>(s), Get<5, 0>(s), Get<7, 1>(s), Get<3, 1>(s), Get<1, 1>(s), Get<5, 1>(s), Get<11, 0>(s))); - d[2] = Merge16(_mm_avg_epu16(Get<0, 1>(s), Get<7, 0>(s)), Average16(Get<0, 1>(s), Get<2, 1>(s), Get<7, 0>(s), Get<8, 0>(s))); - d[3] = Merge16(Average16(Get<3, 1>(s), Get<4, 1>(s), Get<9, 0>(s), Get<11, 0>(s)), _mm_avg_epu16(Get<4, 1>(s), Get<11, 0>(s))); - d[4] = Merge16(BayerToGreen(Get<6, 1>(s), Get<4, 0>(s), Get<7, 1>(s), Get<9, 1>(s), Get<6, 0>(s), Get<0, 1>(s), Get<8, 0>(s), Get<10, 0>(s)), Get<7, 1>(s)); - d[5] = Merge16(Get<7, 0>(s), _mm_avg_epu16(Get<7, 0>(s), Get<8, 0>(s))); - } - - template <> SIMD_INLINE void BayerToBgr(const __m128i s[12], __m128i d[6]) - { - d[0] = Merge16(Average16(Get<0, 0>(s), Get<2, 0>(s), Get<6, 1>(s), Get<7, 1>(s)), _mm_avg_epu16(Get<2, 0>(s), Get<7, 1>(s))); - d[1] = Merge16(BayerToGreen(Get<3, 1>(s), Get<0, 1>(s), Get<4, 1>(s), Get<7, 0>(s), Get<3, 0>(s), Get<1, 0>(s), Get<5, 0>(s), Get<9, 1>(s)), Get<4, 1>(s)); - d[2] = Merge16(Get<4, 0>(s), _mm_avg_epu16(Get<4, 0>(s), Get<5, 0>(s))); - d[3] = Merge16(_mm_avg_epu16(Get<6, 1>(s), Get<7, 1>(s)), Get<7, 1>(s)); - d[4] = Merge16(Get<7, 0>(s), BayerToGreen(Get<7, 0>(s), Get<4, 1>(s), Get<8, 0>(s), Get<11, 0>(s), Get<6, 1>(s), Get<2, 0>(s), Get<8, 1>(s), Get<10, 1>(s))); - d[5] = Merge16(_mm_avg_epu16(Get<4, 0>(s), Get<9, 1>(s)), Average16(Get<4, 0>(s), Get<5, 0>(s), Get<9, 1>(s), Get<11, 1>(s))); - } - - template <> SIMD_INLINE void BayerToBgr(const __m128i s[12], __m128i d[6]) - { - d[0] = Merge16(Get<4, 0>(s), _mm_avg_epu16(Get<4, 0>(s), Get<5, 0>(s))); - d[1] = Merge16(BayerToGreen(Get<3, 1>(s), Get<0, 1>(s), Get<4, 1>(s), Get<7, 0>(s), Get<3, 0>(s), Get<1, 0>(s), Get<5, 0>(s), Get<9, 1>(s)), Get<4, 1>(s)); - d[2] = Merge16(Average16(Get<0, 0>(s), Get<2, 0>(s), Get<6, 1>(s), Get<7, 1>(s)), _mm_avg_epu16(Get<2, 0>(s), Get<7, 1>(s))); - d[3] = Merge16(_mm_avg_epu16(Get<4, 0>(s), Get<9, 1>(s)), Average16(Get<4, 0>(s), Get<5, 0>(s), Get<9, 1>(s), 
Get<11, 1>(s))); - d[4] = Merge16(Get<7, 0>(s), BayerToGreen(Get<7, 0>(s), Get<4, 1>(s), Get<8, 0>(s), Get<11, 0>(s), Get<6, 1>(s), Get<2, 0>(s), Get<8, 1>(s), Get<10, 1>(s))); - d[5] = Merge16(_mm_avg_epu16(Get<6, 1>(s), Get<7, 1>(s)), Get<7, 1>(s)); - } - } -#endif//SIMD_SSE2_ENABLE - -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - SIMD_INLINE void LoadBayerNose(const uint8_t * src, __m256i dst[3]) - { - dst[2] = _mm256_loadu_si256((__m256i*)(src + 1)); - __m128i lo = _mm_or_si128(_mm_slli_si128(_mm_loadu_si128((__m128i*)src), 1), - _mm_and_si128(_mm256_castsi256_si128(dst[2]), _mm_srli_si128(Sse2::K_INV_ZERO, HA - 1))); - __m128i hi = _mm_loadu_si128((__m128i*)(src + HA - 1)); - dst[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 0x1); - } - - SIMD_INLINE void LoadBayerTail(const uint8_t * src, __m256i dst[3]) - { - dst[0] = _mm256_loadu_si256((__m256i*)(src - 1)); - __m128i lo = _mm_loadu_si128((__m128i*)(src + 1)); - __m128i hi = _mm_or_si128(_mm_srli_si128(_mm_loadu_si128((__m128i*)src + 1), 1), - _mm_and_si128(_mm256_extracti128_si256(dst[0], 1), _mm_slli_si128(Sse2::K_INV_ZERO, HA - 1))); - dst[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 0x1); - } - - template SIMD_INLINE void LoadBayerNose(const uint8_t * src[3], size_t offset, size_t stride, __m256i dst[12]) - { - dst[1] = Load((__m256i*)(src[0] + offset)); - LoadBayerNose(src[0] + offset + stride, dst + 0); - LoadNose3(src[1] + offset, dst + 3); - LoadNose3(src[1] + offset + stride, dst + 6); - LoadBayerNose(src[2] + offset, dst + 9); - dst[10] = Load((__m256i*)(src[2] + offset + stride)); - } - - template SIMD_INLINE void LoadBayerBody(const uint8_t * src[3], size_t offset, size_t stride, __m256i dst[12]) - { - dst[1] = Load((__m256i*)(src[0] + offset)); - LoadBodyDx(src[0] + offset + stride, dst + 0); - LoadBody3(src[1] + offset, dst + 3); - LoadBody3(src[1] + offset + stride, dst + 6); - LoadBodyDx(src[2] + offset, dst + 9); - dst[10] = Load((__m256i*)(src[2] + offset + stride)); - } - - template SIMD_INLINE void LoadBayerTail(const uint8_t * src[3], size_t offset, size_t stride, __m256i dst[12]) - { - dst[1] = Load((__m256i*)(src[0] + offset)); - LoadBayerTail(src[0] + offset + stride, dst + 0); - LoadTail3(src[1] + offset, dst + 3); - LoadTail3(src[1] + offset + stride, dst + 6); - - LoadBayerTail(src[2] + offset, dst + 9); - dst[10] = Load((__m256i*)(src[2] + offset + stride)); - } - - template SIMD_INLINE __m256i Get(const __m256i src[12]) - { - return U8To16(src[index]); - } - - SIMD_INLINE __m256i BayerToGreen(const __m256i & greenLeft, const __m256i & greenTop, const __m256i & greenRight, const __m256i & greenBottom, - const __m256i & blueOrRedLeft, const __m256i & blueOrRedTop, const __m256i & blueOrRedRight, const __m256i & blueOrRedBottom) - { - __m256i verticalAbsDifference = AbsDifferenceI16(blueOrRedTop, blueOrRedBottom); - __m256i horizontalAbsDifference = AbsDifferenceI16(blueOrRedLeft, blueOrRedRight); - __m256i green = Average16(greenLeft, greenTop, greenRight, greenBottom); - green = _mm256_blendv_epi8(green, _mm256_avg_epu16(greenTop, greenBottom), _mm256_cmpgt_epi16(horizontalAbsDifference, verticalAbsDifference)); - return _mm256_blendv_epi8(green, _mm256_avg_epu16(greenRight, greenLeft), _mm256_cmpgt_epi16(verticalAbsDifference, horizontalAbsDifference)); - } - - template void BayerToBgr(const __m256i s[12], __m256i d[6]); - - template <> SIMD_INLINE void BayerToBgr(const __m256i s[12], __m256i d[6]) - { - d[0] = Merge16(_mm256_avg_epu16(Get<0, 1>(s), Get<7, 0>(s)), 
Average16(Get<0, 1>(s), Get<2, 1>(s), Get<7, 0>(s), Get<8, 0>(s))); - d[1] = Merge16(Get<4, 0>(s), BayerToGreen(Get<4, 0>(s), Get<2, 0>(s), Get<5, 0>(s), Get<7, 1>(s), Get<3, 1>(s), Get<1, 1>(s), Get<5, 1>(s), Get<11, 0>(s))); - d[2] = Merge16(_mm256_avg_epu16(Get<3, 1>(s), Get<4, 1>(s)), Get<4, 1>(s)); - d[3] = Merge16(Get<7, 0>(s), _mm256_avg_epu16(Get<7, 0>(s), Get<8, 0>(s))); - d[4] = Merge16(BayerToGreen(Get<6, 1>(s), Get<4, 0>(s), Get<7, 1>(s), Get<9, 1>(s), Get<6, 0>(s), Get<0, 1>(s), Get<8, 0>(s), Get<10, 0>(s)), Get<7, 1>(s)); - d[5] = Merge16(Average16(Get<3, 1>(s), Get<4, 1>(s), Get<9, 0>(s), Get<11, 0>(s)), _mm256_avg_epu16(Get<4, 1>(s), Get<11, 0>(s))); - } - - template <> SIMD_INLINE void BayerToBgr(const __m256i s[12], __m256i d[6]) - { - d[0] = Merge16(_mm256_avg_epu16(Get<3, 1>(s), Get<4, 1>(s)), Get<4, 1>(s)); - d[1] = Merge16(Get<4, 0>(s), BayerToGreen(Get<4, 0>(s), Get<2, 0>(s), Get<5, 0>(s), Get<7, 1>(s), Get<3, 1>(s), Get<1, 1>(s), Get<5, 1>(s), Get<11, 0>(s))); - d[2] = Merge16(_mm256_avg_epu16(Get<0, 1>(s), Get<7, 0>(s)), Average16(Get<0, 1>(s), Get<2, 1>(s), Get<7, 0>(s), Get<8, 0>(s))); - d[3] = Merge16(Average16(Get<3, 1>(s), Get<4, 1>(s), Get<9, 0>(s), Get<11, 0>(s)), _mm256_avg_epu16(Get<4, 1>(s), Get<11, 0>(s))); - d[4] = Merge16(BayerToGreen(Get<6, 1>(s), Get<4, 0>(s), Get<7, 1>(s), Get<9, 1>(s), Get<6, 0>(s), Get<0, 1>(s), Get<8, 0>(s), Get<10, 0>(s)), Get<7, 1>(s)); - d[5] = Merge16(Get<7, 0>(s), _mm256_avg_epu16(Get<7, 0>(s), Get<8, 0>(s))); - } - - template <> SIMD_INLINE void BayerToBgr(const __m256i s[12], __m256i d[6]) - { - d[0] = Merge16(Average16(Get<0, 0>(s), Get<2, 0>(s), Get<6, 1>(s), Get<7, 1>(s)), _mm256_avg_epu16(Get<2, 0>(s), Get<7, 1>(s))); - d[1] = Merge16(BayerToGreen(Get<3, 1>(s), Get<0, 1>(s), Get<4, 1>(s), Get<7, 0>(s), Get<3, 0>(s), Get<1, 0>(s), Get<5, 0>(s), Get<9, 1>(s)), Get<4, 1>(s)); - d[2] = Merge16(Get<4, 0>(s), _mm256_avg_epu16(Get<4, 0>(s), Get<5, 0>(s))); - d[3] = Merge16(_mm256_avg_epu16(Get<6, 1>(s), Get<7, 1>(s)), Get<7, 1>(s)); - d[4] = Merge16(Get<7, 0>(s), BayerToGreen(Get<7, 0>(s), Get<4, 1>(s), Get<8, 0>(s), Get<11, 0>(s), Get<6, 1>(s), Get<2, 0>(s), Get<8, 1>(s), Get<10, 1>(s))); - d[5] = Merge16(_mm256_avg_epu16(Get<4, 0>(s), Get<9, 1>(s)), Average16(Get<4, 0>(s), Get<5, 0>(s), Get<9, 1>(s), Get<11, 1>(s))); - } - - template <> SIMD_INLINE void BayerToBgr(const __m256i s[12], __m256i d[6]) - { - d[0] = Merge16(Get<4, 0>(s), _mm256_avg_epu16(Get<4, 0>(s), Get<5, 0>(s))); - d[1] = Merge16(BayerToGreen(Get<3, 1>(s), Get<0, 1>(s), Get<4, 1>(s), Get<7, 0>(s), Get<3, 0>(s), Get<1, 0>(s), Get<5, 0>(s), Get<9, 1>(s)), Get<4, 1>(s)); - d[2] = Merge16(Average16(Get<0, 0>(s), Get<2, 0>(s), Get<6, 1>(s), Get<7, 1>(s)), _mm256_avg_epu16(Get<2, 0>(s), Get<7, 1>(s))); - d[3] = Merge16(_mm256_avg_epu16(Get<4, 0>(s), Get<9, 1>(s)), Average16(Get<4, 0>(s), Get<5, 0>(s), Get<9, 1>(s), Get<11, 1>(s))); - d[4] = Merge16(Get<7, 0>(s), BayerToGreen(Get<7, 0>(s), Get<4, 1>(s), Get<8, 0>(s), Get<11, 0>(s), Get<6, 1>(s), Get<2, 0>(s), Get<8, 1>(s), Get<10, 1>(s))); - d[5] = Merge16(_mm256_avg_epu16(Get<6, 1>(s), Get<7, 1>(s)), Get<7, 1>(s)); - } - } -#endif//SIMD_AVX2_ENABLE - -#ifdef SIMD_AVX512BW_ENABLE - namespace Avx512bw - { - SIMD_INLINE void LoadBayerNose(const uint8_t * src, __m512i dst[3]) - { - dst[2] = _mm512_loadu_si512((__m512i*)(src + 1)); - __mmask64 m = __mmask64(-1) << 1; - __m512i src0 = Load(src - 1, m); - __m128i so = _mm512_extracti32x4_epi32(src0, 0); - __m128i ss = _mm_srli_si128(so, 2); - dst[0] = 
_mm512_mask_blend_epi8(m, _mm512_inserti32x4(src0, ss, 0), src0); - } - - SIMD_INLINE void LoadBayerTail(const uint8_t * src, __m512i dst[3]) - { - dst[0] = _mm512_loadu_si512((__m512i*)(src - 1)); - __mmask64 m = __mmask64(-1) >> 1; - __m512i src2 = Load(src + 1, m); - __m128i so = _mm512_extracti32x4_epi32(src2, 3); - __m128i ss = _mm_slli_si128(so, 2); - dst[2] = _mm512_mask_blend_epi8(m, _mm512_inserti32x4(src2, ss, 3), src2); - } - - template SIMD_INLINE void LoadBayerNose(const uint8_t * src[3], size_t offset, size_t stride, __m512i dst[12]) - { - dst[1] = Load((__m512i*)(src[0] + offset)); - LoadBayerNose(src[0] + offset + stride, dst + 0); - LoadNose3(src[1] + offset, dst + 3); - LoadNose3(src[1] + offset + stride, dst + 6); - LoadBayerNose(src[2] + offset, dst + 9); - dst[10] = Load((__m512i*)(src[2] + offset + stride)); - } - - template SIMD_INLINE void LoadBayerBody(const uint8_t * src[3], size_t offset, size_t stride, __m512i dst[12]) - { - dst[1] = Load((__m512i*)(src[0] + offset)); - LoadBodyDx(src[0] + offset + stride, dst + 0); - LoadBody3(src[1] + offset, dst + 3); - LoadBody3(src[1] + offset + stride, dst + 6); - LoadBodyDx(src[2] + offset, dst + 9); - dst[10] = Load((__m512i*)(src[2] + offset + stride)); - } - - template SIMD_INLINE void LoadBayerTail(const uint8_t * src[3], size_t offset, size_t stride, __m512i dst[12]) - { - dst[1] = Load((__m512i*)(src[0] + offset)); - LoadBayerTail(src[0] + offset + stride, dst + 0); - LoadTail3(src[1] + offset, dst + 3); - LoadTail3(src[1] + offset + stride, dst + 6); - - LoadBayerTail(src[2] + offset, dst + 9); - dst[10] = Load((__m512i*)(src[2] + offset + stride)); - } - - template SIMD_INLINE __m512i Get(const __m512i src[12]) - { - return U8To16(src[index]); - } - - SIMD_INLINE __m512i BayerToGreen(const __m512i & greenLeft, const __m512i & greenTop, const __m512i & greenRight, const __m512i & greenBottom, - const __m512i & blueOrRedLeft, const __m512i & blueOrRedTop, const __m512i & blueOrRedRight, const __m512i & blueOrRedBottom) - { - __m512i verticalAbsDifference = AbsDifferenceI16(blueOrRedTop, blueOrRedBottom); - __m512i horizontalAbsDifference = AbsDifferenceI16(blueOrRedLeft, blueOrRedRight); - __m512i green = Average16(greenLeft, greenTop, greenRight, greenBottom); - green = _mm512_mask_blend_epi8(_mm512_cmpgt_epu8_mask(horizontalAbsDifference, verticalAbsDifference), green, Average16(greenTop, greenBottom)); - return _mm512_mask_blend_epi8(_mm512_cmpgt_epu8_mask(verticalAbsDifference, horizontalAbsDifference), green, Average16(greenRight, greenLeft)); - } - - template void BayerToBgr(const __m512i s[12], __m512i d[6]); - - template <> SIMD_INLINE void BayerToBgr(const __m512i s[12], __m512i d[6]) - { - d[0] = Merge16(Average16(Get<0, 1>(s), Get<7, 0>(s)), Average16(Get<0, 1>(s), Get<2, 1>(s), Get<7, 0>(s), Get<8, 0>(s))); - d[1] = Merge16(Get<4, 0>(s), BayerToGreen(Get<4, 0>(s), Get<2, 0>(s), Get<5, 0>(s), Get<7, 1>(s), Get<3, 1>(s), Get<1, 1>(s), Get<5, 1>(s), Get<11, 0>(s))); - d[2] = Merge16(Average16(Get<3, 1>(s), Get<4, 1>(s)), Get<4, 1>(s)); - d[3] = Merge16(Get<7, 0>(s), Average16(Get<7, 0>(s), Get<8, 0>(s))); - d[4] = Merge16(BayerToGreen(Get<6, 1>(s), Get<4, 0>(s), Get<7, 1>(s), Get<9, 1>(s), Get<6, 0>(s), Get<0, 1>(s), Get<8, 0>(s), Get<10, 0>(s)), Get<7, 1>(s)); - d[5] = Merge16(Average16(Get<3, 1>(s), Get<4, 1>(s), Get<9, 0>(s), Get<11, 0>(s)), Average16(Get<4, 1>(s), Get<11, 0>(s))); - } - - template <> SIMD_INLINE void BayerToBgr(const __m512i s[12], __m512i d[6]) - { - d[0] = Merge16(Average16(Get<3, 
1>(s), Get<4, 1>(s)), Get<4, 1>(s)); - d[1] = Merge16(Get<4, 0>(s), BayerToGreen(Get<4, 0>(s), Get<2, 0>(s), Get<5, 0>(s), Get<7, 1>(s), Get<3, 1>(s), Get<1, 1>(s), Get<5, 1>(s), Get<11, 0>(s))); - d[2] = Merge16(Average16(Get<0, 1>(s), Get<7, 0>(s)), Average16(Get<0, 1>(s), Get<2, 1>(s), Get<7, 0>(s), Get<8, 0>(s))); - d[3] = Merge16(Average16(Get<3, 1>(s), Get<4, 1>(s), Get<9, 0>(s), Get<11, 0>(s)), Average16(Get<4, 1>(s), Get<11, 0>(s))); - d[4] = Merge16(BayerToGreen(Get<6, 1>(s), Get<4, 0>(s), Get<7, 1>(s), Get<9, 1>(s), Get<6, 0>(s), Get<0, 1>(s), Get<8, 0>(s), Get<10, 0>(s)), Get<7, 1>(s)); - d[5] = Merge16(Get<7, 0>(s), Average16(Get<7, 0>(s), Get<8, 0>(s))); - } - - template <> SIMD_INLINE void BayerToBgr(const __m512i s[12], __m512i d[6]) - { - d[0] = Merge16(Average16(Get<0, 0>(s), Get<2, 0>(s), Get<6, 1>(s), Get<7, 1>(s)), Average16(Get<2, 0>(s), Get<7, 1>(s))); - d[1] = Merge16(BayerToGreen(Get<3, 1>(s), Get<0, 1>(s), Get<4, 1>(s), Get<7, 0>(s), Get<3, 0>(s), Get<1, 0>(s), Get<5, 0>(s), Get<9, 1>(s)), Get<4, 1>(s)); - d[2] = Merge16(Get<4, 0>(s), Average16(Get<4, 0>(s), Get<5, 0>(s))); - d[3] = Merge16(Average16(Get<6, 1>(s), Get<7, 1>(s)), Get<7, 1>(s)); - d[4] = Merge16(Get<7, 0>(s), BayerToGreen(Get<7, 0>(s), Get<4, 1>(s), Get<8, 0>(s), Get<11, 0>(s), Get<6, 1>(s), Get<2, 0>(s), Get<8, 1>(s), Get<10, 1>(s))); - d[5] = Merge16(Average16(Get<4, 0>(s), Get<9, 1>(s)), Average16(Get<4, 0>(s), Get<5, 0>(s), Get<9, 1>(s), Get<11, 1>(s))); - } - - template <> SIMD_INLINE void BayerToBgr(const __m512i s[12], __m512i d[6]) - { - d[0] = Merge16(Get<4, 0>(s), Average16(Get<4, 0>(s), Get<5, 0>(s))); - d[1] = Merge16(BayerToGreen(Get<3, 1>(s), Get<0, 1>(s), Get<4, 1>(s), Get<7, 0>(s), Get<3, 0>(s), Get<1, 0>(s), Get<5, 0>(s), Get<9, 1>(s)), Get<4, 1>(s)); - d[2] = Merge16(Average16(Get<0, 0>(s), Get<2, 0>(s), Get<6, 1>(s), Get<7, 1>(s)), Average16(Get<2, 0>(s), Get<7, 1>(s))); - d[3] = Merge16(Average16(Get<4, 0>(s), Get<9, 1>(s)), Average16(Get<4, 0>(s), Get<5, 0>(s), Get<9, 1>(s), Get<11, 1>(s))); - d[4] = Merge16(Get<7, 0>(s), BayerToGreen(Get<7, 0>(s), Get<4, 1>(s), Get<8, 0>(s), Get<11, 0>(s), Get<6, 1>(s), Get<2, 0>(s), Get<8, 1>(s), Get<10, 1>(s))); - d[5] = Merge16(Average16(Get<6, 1>(s), Get<7, 1>(s)), Get<7, 1>(s)); - } - } -#endif//SIMD_AVX512BW_ENABLE - -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - SIMD_INLINE void LoadBayerNose2(const uint8_t * src, uint8x8x2_t dst[3]) - { - dst[2] = LoadHalf2(src + 1); - dst[0].val[0] = LoadBeforeFirst<1>(dst[2].val[0]); - dst[0].val[1] = LoadHalf2(src).val[0]; - } - - template SIMD_INLINE void LoadBayerNose3(const uint8_t * src, uint8x8x2_t dst[3]) - { - dst[1] = LoadHalf2(src); - dst[0].val[0] = LoadBeforeFirst<1>(dst[1].val[0]); - dst[0].val[1] = LoadBeforeFirst<1>(dst[1].val[1]); - dst[2] = LoadHalf2(src + 2); - } - - template SIMD_INLINE void LoadBayerNose(const uint8_t * src[3], size_t offset, size_t stride, uint8x8x2_t dst[12]) - { - dst[1] = LoadHalf2(src[0] + offset); - LoadBayerNose2(src[0] + offset + stride, dst + 0); - LoadBayerNose3(src[1] + offset, dst + 3); - LoadBayerNose3(src[1] + offset + stride, dst + 6); - LoadBayerNose2(src[2] + offset, dst + 9); - dst[10] = LoadHalf2(src[2] + offset + stride); - } - - SIMD_INLINE void LoadBayerBody2(const uint8_t * src, uint8x8x2_t dst[3]) - { - dst[0] = LoadHalf2(src - 1); - dst[2] = LoadHalf2(src + 1); - } - - template SIMD_INLINE void LoadBayerBody3(const uint8_t * src, uint8x8x2_t dst[3]) - { - dst[0] = LoadHalf2(src - 2); - dst[1] = LoadHalf2(src); - dst[2] = LoadHalf2(src + 2); 
- } - - template SIMD_INLINE void LoadBayerBody(const uint8_t * src[3], size_t offset, size_t stride, uint8x8x2_t dst[12]) - { - dst[1] = LoadHalf2(src[0] + offset); - LoadBayerBody2(src[0] + offset + stride, dst + 0); - LoadBayerBody3(src[1] + offset, dst + 3); - LoadBayerBody3(src[1] + offset + stride, dst + 6); - LoadBayerBody2(src[2] + offset, dst + 9); - dst[10] = LoadHalf2(src[2] + offset + stride); - } - - SIMD_INLINE void LoadBayerTail2(const uint8_t * src, uint8x8x2_t dst[3]) - { - dst[0] = LoadHalf2(src - 1); - dst[2].val[0] = LoadHalf2(src).val[1]; - dst[2].val[1] = LoadAfterLast<1>(dst[0].val[1]); - } - - template SIMD_INLINE void LoadBayerTail3(const uint8_t * src, uint8x8x2_t dst[3]) - { - dst[0] = LoadHalf2(src - 2); - dst[1] = LoadHalf2(src); - dst[2].val[0] = LoadAfterLast<1>(dst[1].val[0]); - dst[2].val[1] = LoadAfterLast<1>(dst[1].val[1]); - } - - template SIMD_INLINE void LoadBayerTail(const uint8_t * src[3], size_t offset, size_t stride, uint8x8x2_t dst[12]) - { - dst[1] = LoadHalf2(src[0] + offset); - LoadBayerTail2(src[0] + offset + stride, dst + 0); - LoadBayerTail3(src[1] + offset, dst + 3); - LoadBayerTail3(src[1] + offset + stride, dst + 6); - LoadBayerTail2(src[2] + offset, dst + 9); - dst[10] = LoadHalf2(src[2] + offset + stride); - } - - SIMD_INLINE uint8x8_t Average(uint8x8_t s0, uint8x8_t s1) - { - return vrhadd_u8(s0, s1); - } - - SIMD_INLINE uint8x8_t Average(const uint8x8_t & s0, const uint8x8_t & s1, const uint8x8_t & s2, const uint8x8_t & s3) - { - return vshrn_n_u16(vaddq_u16(vaddq_u16(vaddl_u8(s0, s1), vaddl_u8(s2, s3)), vdupq_n_u16(2)), 2); - } - - SIMD_INLINE uint8x8_t BayerToGreen(const uint8x8_t & greenLeft, const uint8x8_t & greenTop, const uint8x8_t & greenRight, const uint8x8_t & greenBottom, - const uint8x8_t & blueOrRedLeft, const uint8x8_t & blueOrRedTop, const uint8x8_t & blueOrRedRight, const uint8x8_t & blueOrRedBottom) - { - uint8x8_t verticalAbsDifference = vabd_u8(blueOrRedTop, blueOrRedBottom); - uint8x8_t horizontalAbsDifference = vabd_u8(blueOrRedLeft, blueOrRedRight); - uint8x8_t green = Average(greenLeft, greenTop, greenRight, greenBottom); - green = vbsl_u8(vclt_u8(verticalAbsDifference, horizontalAbsDifference), Average(greenTop, greenBottom), green); - return vbsl_u8(vcgt_u8(verticalAbsDifference, horizontalAbsDifference), Average(greenRight, greenLeft), green); - } - - template void BayerToBgr(const uint8x8x2_t s[12], uint8x8x2_t d[6]); - - template <> SIMD_INLINE void BayerToBgr(const uint8x8x2_t s[12], uint8x8x2_t d[6]) - { - d[0].val[0] = Average(s[0].val[1], s[7].val[0]); - d[0].val[1] = Average(s[0].val[1], s[2].val[1], s[7].val[0], s[8].val[0]); - d[1].val[0] = s[4].val[0]; - d[1].val[1] = BayerToGreen(s[4].val[0], s[2].val[0], s[5].val[0], s[7].val[1], s[3].val[1], s[1].val[1], s[5].val[1], s[11].val[0]); - d[2].val[0] = Average(s[3].val[1], s[4].val[1]); - d[2].val[1] = s[4].val[1]; - d[3].val[0] = s[7].val[0]; - d[3].val[1] = Average(s[7].val[0], s[8].val[0]); - d[4].val[0] = BayerToGreen(s[6].val[1], s[4].val[0], s[7].val[1], s[9].val[1], s[6].val[0], s[0].val[1], s[8].val[0], s[10].val[0]); - d[4].val[1] = s[7].val[1]; - d[5].val[0] = Average(s[3].val[1], s[4].val[1], s[9].val[0], s[11].val[0]); - d[5].val[1] = Average(s[4].val[1], s[11].val[0]); - } - - template <> SIMD_INLINE void BayerToBgr(const uint8x8x2_t s[12], uint8x8x2_t d[6]) - { - d[0].val[0] = Average(s[3].val[1], s[4].val[1]); - d[0].val[1] = s[4].val[1]; - d[1].val[0] = s[4].val[0]; - d[1].val[1] = BayerToGreen(s[4].val[0], s[2].val[0], s[5].val[0], 
s[7].val[1], s[3].val[1], s[1].val[1], s[5].val[1], s[11].val[0]); - d[2].val[0] = Average(s[0].val[1], s[7].val[0]); - d[2].val[1] = Average(s[0].val[1], s[2].val[1], s[7].val[0], s[8].val[0]); - d[3].val[0] = Average(s[3].val[1], s[4].val[1], s[9].val[0], s[11].val[0]); - d[3].val[1] = Average(s[4].val[1], s[11].val[0]); - d[4].val[0] = BayerToGreen(s[6].val[1], s[4].val[0], s[7].val[1], s[9].val[1], s[6].val[0], s[0].val[1], s[8].val[0], s[10].val[0]); - d[4].val[1] = s[7].val[1]; - d[5].val[0] = s[7].val[0]; - d[5].val[1] = Average(s[7].val[0], s[8].val[0]); - } - - template <> SIMD_INLINE void BayerToBgr(const uint8x8x2_t s[12], uint8x8x2_t d[6]) - { - d[0].val[0] = Average(s[0].val[0], s[2].val[0], s[6].val[1], s[7].val[1]); - d[0].val[1] = Average(s[2].val[0], s[7].val[1]); - d[1].val[0] = BayerToGreen(s[3].val[1], s[0].val[1], s[4].val[1], s[7].val[0], s[3].val[0], s[1].val[0], s[5].val[0], s[9].val[1]); - d[1].val[1] = s[4].val[1]; - d[2].val[0] = s[4].val[0]; - d[2].val[1] = Average(s[4].val[0], s[5].val[0]); - d[3].val[0] = Average(s[6].val[1], s[7].val[1]); - d[3].val[1] = s[7].val[1]; - d[4].val[0] = s[7].val[0]; - d[4].val[1] = BayerToGreen(s[7].val[0], s[4].val[1], s[8].val[0], s[11].val[0], s[6].val[1], s[2].val[0], s[8].val[1], s[10].val[1]); - d[5].val[0] = Average(s[4].val[0], s[9].val[1]); - d[5].val[1] = Average(s[4].val[0], s[5].val[0], s[9].val[1], s[11].val[1]); - } - - template <> SIMD_INLINE void BayerToBgr(const uint8x8x2_t s[12], uint8x8x2_t d[6]) - { - d[0].val[0] = s[4].val[0]; - d[0].val[1] = Average(s[4].val[0], s[5].val[0]); - d[1].val[0] = BayerToGreen(s[3].val[1], s[0].val[1], s[4].val[1], s[7].val[0], s[3].val[0], s[1].val[0], s[5].val[0], s[9].val[1]); - d[1].val[1] = s[4].val[1]; - d[2].val[0] = Average(s[0].val[0], s[2].val[0], s[6].val[1], s[7].val[1]); - d[2].val[1] = Average(s[2].val[0], s[7].val[1]); - d[3].val[0] = Average(s[4].val[0], s[9].val[1]); - d[3].val[1] = Average(s[4].val[0], s[5].val[0], s[9].val[1], s[11].val[1]); - d[4].val[0] = s[7].val[0]; - d[4].val[1] = BayerToGreen(s[7].val[0], s[4].val[1], s[8].val[0], s[11].val[0], s[6].val[1], s[2].val[0], s[8].val[1], s[10].val[1]); - d[5].val[0] = Average(s[6].val[1], s[7].val[1]); - d[5].val[1] = s[7].val[1]; - } - } -#endif//SIMD_NEON_ENABLE -} -#endif//__SimdBayer_h__ diff --git a/src/3rd/Simd/Simd/SimdCompare.h b/src/3rd/Simd/Simd/SimdCompare.h deleted file mode 100644 index ff973dab..00000000 --- a/src/3rd/Simd/Simd/SimdCompare.h +++ /dev/null @@ -1,529 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2019 Yermalayeu Ihar, -* 2018-2019 Radchenko Andrey. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#ifndef __SimdCompare_h__ -#define __SimdCompare_h__ - -#include "Simd/SimdConst.h" - -namespace Simd -{ - namespace Base - { - template SIMD_INLINE bool Compare8u(const uint8_t & src, const uint8_t & b); - - template <> SIMD_INLINE bool Compare8u(const uint8_t & a, const uint8_t & b) - { - return a == b; - } - - template <> SIMD_INLINE bool Compare8u(const uint8_t & a, const uint8_t & b) - { - return a != b; - } - - template <> SIMD_INLINE bool Compare8u(const uint8_t & a, const uint8_t & b) - { - return a > b; - } - - template <> SIMD_INLINE bool Compare8u(const uint8_t & a, const uint8_t & b) - { - return a >= b; - } - - template <> SIMD_INLINE bool Compare8u(const uint8_t & a, const uint8_t & b) - { - return a < b; - } - - template <> SIMD_INLINE bool Compare8u(const uint8_t & a, const uint8_t & b) - { - return a <= b; - } - - template SIMD_INLINE bool Compare16i(const int16_t & src, const int16_t & b); - - template <> SIMD_INLINE bool Compare16i(const int16_t & a, const int16_t & b) - { - return a == b; - } - - template <> SIMD_INLINE bool Compare16i(const int16_t & a, const int16_t & b) - { - return a != b; - } - - template <> SIMD_INLINE bool Compare16i(const int16_t & a, const int16_t & b) - { - return a > b; - } - - template <> SIMD_INLINE bool Compare16i(const int16_t & a, const int16_t & b) - { - return a >= b; - } - - template <> SIMD_INLINE bool Compare16i(const int16_t & a, const int16_t & b) - { - return a < b; - } - - template <> SIMD_INLINE bool Compare16i(const int16_t & a, const int16_t & b) - { - return a <= b; - } - } - -#ifdef SIMD_SSE2_ENABLE - namespace Sse2 - { - SIMD_INLINE __m128i NotEqual8u(__m128i a, __m128i b) - { - return _mm_andnot_si128(_mm_cmpeq_epi8(a, b), K_INV_ZERO); - } - - SIMD_INLINE __m128i Greater8u(__m128i a, __m128i b) - { - return _mm_andnot_si128(_mm_cmpeq_epi8(_mm_min_epu8(a, b), a), K_INV_ZERO); - } - - SIMD_INLINE __m128i GreaterOrEqual8u(__m128i a, __m128i b) - { - return _mm_cmpeq_epi8(_mm_max_epu8(a, b), a); - } - - SIMD_INLINE __m128i Lesser8u(__m128i a, __m128i b) - { - return _mm_andnot_si128(_mm_cmpeq_epi8(_mm_max_epu8(a, b), a), K_INV_ZERO); - } - - SIMD_INLINE __m128i LesserOrEqual8u(__m128i a, __m128i b) - { - return _mm_cmpeq_epi8(_mm_min_epu8(a, b), a); - } - - template SIMD_INLINE __m128i Compare8u(__m128i a, __m128i b); - - template<> SIMD_INLINE __m128i Compare8u(__m128i a, __m128i b) - { - return _mm_cmpeq_epi8(a, b); - } - - template<> SIMD_INLINE __m128i Compare8u(__m128i a, __m128i b) - { - return NotEqual8u(a, b); - } - - template<> SIMD_INLINE __m128i Compare8u(__m128i a, __m128i b) - { - return Greater8u(a, b); - } - - template<> SIMD_INLINE __m128i Compare8u(__m128i a, __m128i b) - { - return GreaterOrEqual8u(a, b); - } - - template<> SIMD_INLINE __m128i Compare8u(__m128i a, __m128i b) - { - return Lesser8u(a, b); - } - - template<> SIMD_INLINE __m128i Compare8u(__m128i a, __m128i b) - { - return LesserOrEqual8u(a, b); - } - - SIMD_INLINE __m128i NotEqual16i(__m128i a, __m128i b) - { - return _mm_andnot_si128(_mm_cmpeq_epi16(a, b), K_INV_ZERO); - } - - SIMD_INLINE __m128i GreaterOrEqual16i_m128(__m128i a, __m128i b) - { - return _mm_andnot_si128(_mm_cmplt_epi16(a, b), K_INV_ZERO); - } - - SIMD_INLINE __m128i LesserOrEqual16i(__m128i a, __m128i b) - 
{ - return _mm_andnot_si128(_mm_cmpgt_epi16(a, b), K_INV_ZERO); - } - - template SIMD_INLINE __m128i Compare16i(__m128i a, __m128i b); - - template<> SIMD_INLINE __m128i Compare16i(__m128i a, __m128i b) - { - return _mm_cmpeq_epi16(a, b); - } - - template<> SIMD_INLINE __m128i Compare16i(__m128i a, __m128i b) - { - return NotEqual16i(a, b); - } - - template<> SIMD_INLINE __m128i Compare16i(__m128i a, __m128i b) - { - return _mm_cmpgt_epi16(a, b); - } - - template<> SIMD_INLINE __m128i Compare16i(__m128i a, __m128i b) - { - return GreaterOrEqual16i_m128(a, b); - } - - template<> SIMD_INLINE __m128i Compare16i(__m128i a, __m128i b) - { - return _mm_cmplt_epi16(a, b); - } - - template<> SIMD_INLINE __m128i Compare16i(__m128i a, __m128i b) - { - return LesserOrEqual16i(a, b); - } - } -#endif// SIMD_SSE2_ENABLE - -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - SIMD_INLINE __m256i NotEqual8u(__m256i a, __m256i b) - { - return _mm256_andnot_si256(_mm256_cmpeq_epi8(a, b), K_INV_ZERO); - } - - SIMD_INLINE __m256i Greater8u(__m256i a, __m256i b) - { - return _mm256_andnot_si256(_mm256_cmpeq_epi8(_mm256_min_epu8(a, b), a), K_INV_ZERO); - } - - SIMD_INLINE __m256i GreaterOrEqual8u(__m256i a, __m256i b) - { - return _mm256_cmpeq_epi8(_mm256_max_epu8(a, b), a); - } - - SIMD_INLINE __m256i Lesser8u(__m256i a, __m256i b) - { - return _mm256_andnot_si256(_mm256_cmpeq_epi8(_mm256_max_epu8(a, b), a), K_INV_ZERO); - } - - SIMD_INLINE __m256i LesserOrEqual8u(__m256i a, __m256i b) - { - return _mm256_cmpeq_epi8(_mm256_min_epu8(a, b), a); - } - - template SIMD_INLINE __m256i Compare8u(__m256i a, __m256i b); - - template<> SIMD_INLINE __m256i Compare8u(__m256i a, __m256i b) - { - return _mm256_cmpeq_epi8(a, b); - } - - template<> SIMD_INLINE __m256i Compare8u(__m256i a, __m256i b) - { - return NotEqual8u(a, b); - } - - template<> SIMD_INLINE __m256i Compare8u(__m256i a, __m256i b) - { - return Greater8u(a, b); - } - - template<> SIMD_INLINE __m256i Compare8u(__m256i a, __m256i b) - { - return GreaterOrEqual8u(a, b); - } - - template<> SIMD_INLINE __m256i Compare8u(__m256i a, __m256i b) - { - return Lesser8u(a, b); - } - - template<> SIMD_INLINE __m256i Compare8u(__m256i a, __m256i b) - { - return LesserOrEqual8u(a, b); - } - - SIMD_INLINE __m256i NotEqual16i(__m256i a, __m256i b) - { - return _mm256_andnot_si256(_mm256_cmpeq_epi16(a, b), K_INV_ZERO); - } - - SIMD_INLINE __m256i GreaterOrEqual16i_m256(__m256i a, __m256i b) - { - return _mm256_andnot_si256(_mm256_cmpgt_epi16(b, a), K_INV_ZERO); - } - - SIMD_INLINE __m256i LesserOrEqual16i(__m256i a, __m256i b) - { - return _mm256_andnot_si256(_mm256_cmpgt_epi16(a, b), K_INV_ZERO); - } - - template SIMD_INLINE __m256i Compare16i(__m256i a, __m256i b); - - template<> SIMD_INLINE __m256i Compare16i(__m256i a, __m256i b) - { - return _mm256_cmpeq_epi16(a, b); - } - - template<> SIMD_INLINE __m256i Compare16i(__m256i a, __m256i b) - { - return NotEqual16i(a, b); - } - - template<> SIMD_INLINE __m256i Compare16i(__m256i a, __m256i b) - { - return _mm256_cmpgt_epi16(a, b); - } - - template<> SIMD_INLINE __m256i Compare16i(__m256i a, __m256i b) - { - return GreaterOrEqual16i_m256(a, b); - } - - template<> SIMD_INLINE __m256i Compare16i(__m256i a, __m256i b) - { - return _mm256_cmpgt_epi16(b, a); - } - - template<> SIMD_INLINE __m256i Compare16i(__m256i a, __m256i b) - { - return LesserOrEqual16i(a, b); - } - } -#endif// SIMD_AVX2_ENABLE - -#ifdef SIMD_AVX512BW_ENABLE - namespace Avx512bw - { - template SIMD_INLINE __mmask64 Compare8u(__m512i a, __m512i b); - - template<> 
-
-#ifdef SIMD_AVX2_ENABLE
-    namespace Avx2
-    {
-        SIMD_INLINE __m256i NotEqual8u(__m256i a, __m256i b)
-        {
-            return _mm256_andnot_si256(_mm256_cmpeq_epi8(a, b), K_INV_ZERO);
-        }
-
-        SIMD_INLINE __m256i Greater8u(__m256i a, __m256i b)
-        {
-            return _mm256_andnot_si256(_mm256_cmpeq_epi8(_mm256_min_epu8(a, b), a), K_INV_ZERO);
-        }
-
-        SIMD_INLINE __m256i GreaterOrEqual8u(__m256i a, __m256i b)
-        {
-            return _mm256_cmpeq_epi8(_mm256_max_epu8(a, b), a);
-        }
-
-        SIMD_INLINE __m256i Lesser8u(__m256i a, __m256i b)
-        {
-            return _mm256_andnot_si256(_mm256_cmpeq_epi8(_mm256_max_epu8(a, b), a), K_INV_ZERO);
-        }
-
-        SIMD_INLINE __m256i LesserOrEqual8u(__m256i a, __m256i b)
-        {
-            return _mm256_cmpeq_epi8(_mm256_min_epu8(a, b), a);
-        }
-
-        template <SimdCompareType compareType> SIMD_INLINE __m256i Compare8u(__m256i a, __m256i b);
-
-        template<> SIMD_INLINE __m256i Compare8u<SimdCompareEqual>(__m256i a, __m256i b)
-        {
-            return _mm256_cmpeq_epi8(a, b);
-        }
-
-        template<> SIMD_INLINE __m256i Compare8u<SimdCompareNotEqual>(__m256i a, __m256i b)
-        {
-            return NotEqual8u(a, b);
-        }
-
-        template<> SIMD_INLINE __m256i Compare8u<SimdCompareGreater>(__m256i a, __m256i b)
-        {
-            return Greater8u(a, b);
-        }
-
-        template<> SIMD_INLINE __m256i Compare8u<SimdCompareGreaterOrEqual>(__m256i a, __m256i b)
-        {
-            return GreaterOrEqual8u(a, b);
-        }
-
-        template<> SIMD_INLINE __m256i Compare8u<SimdCompareLesser>(__m256i a, __m256i b)
-        {
-            return Lesser8u(a, b);
-        }
-
-        template<> SIMD_INLINE __m256i Compare8u<SimdCompareLesserOrEqual>(__m256i a, __m256i b)
-        {
-            return LesserOrEqual8u(a, b);
-        }
-
-        SIMD_INLINE __m256i NotEqual16i(__m256i a, __m256i b)
-        {
-            return _mm256_andnot_si256(_mm256_cmpeq_epi16(a, b), K_INV_ZERO);
-        }
-
-        SIMD_INLINE __m256i GreaterOrEqual16i_m256(__m256i a, __m256i b)
-        {
-            return _mm256_andnot_si256(_mm256_cmpgt_epi16(b, a), K_INV_ZERO);
-        }
-
-        SIMD_INLINE __m256i LesserOrEqual16i(__m256i a, __m256i b)
-        {
-            return _mm256_andnot_si256(_mm256_cmpgt_epi16(a, b), K_INV_ZERO);
-        }
-
-        template <SimdCompareType compareType> SIMD_INLINE __m256i Compare16i(__m256i a, __m256i b);
-
-        template<> SIMD_INLINE __m256i Compare16i<SimdCompareEqual>(__m256i a, __m256i b)
-        {
-            return _mm256_cmpeq_epi16(a, b);
-        }
-
-        template<> SIMD_INLINE __m256i Compare16i<SimdCompareNotEqual>(__m256i a, __m256i b)
-        {
-            return NotEqual16i(a, b);
-        }
-
-        template<> SIMD_INLINE __m256i Compare16i<SimdCompareGreater>(__m256i a, __m256i b)
-        {
-            return _mm256_cmpgt_epi16(a, b);
-        }
-
-        template<> SIMD_INLINE __m256i Compare16i<SimdCompareGreaterOrEqual>(__m256i a, __m256i b)
-        {
-            return GreaterOrEqual16i_m256(a, b);
-        }
-
-        template<> SIMD_INLINE __m256i Compare16i<SimdCompareLesser>(__m256i a, __m256i b)
-        {
-            return _mm256_cmpgt_epi16(b, a);
-        }
-
-        template<> SIMD_INLINE __m256i Compare16i<SimdCompareLesserOrEqual>(__m256i a, __m256i b)
-        {
-            return LesserOrEqual16i(a, b);
-        }
-    }
-#endif// SIMD_AVX2_ENABLE
-
-#ifdef SIMD_AVX512BW_ENABLE
-    namespace Avx512bw
-    {
-        template <SimdCompareType compareType> SIMD_INLINE __mmask64 Compare8u(__m512i a, __m512i b);
-
-        template<> SIMD_INLINE __mmask64 Compare8u<SimdCompareEqual>(__m512i a, __m512i b)
-        {
-            return _mm512_cmpeq_epu8_mask(a, b);
-        }
-
-        template<> SIMD_INLINE __mmask64 Compare8u<SimdCompareNotEqual>(__m512i a, __m512i b)
-        {
-            return _mm512_cmpneq_epu8_mask(a, b);
-        }
-
-        template<> SIMD_INLINE __mmask64 Compare8u<SimdCompareGreater>(__m512i a, __m512i b)
-        {
-            return _mm512_cmpgt_epu8_mask(a, b);
-        }
-
-        template<> SIMD_INLINE __mmask64 Compare8u<SimdCompareGreaterOrEqual>(__m512i a, __m512i b)
-        {
-            return _mm512_cmpge_epu8_mask(a, b);
-        }
-
-        template<> SIMD_INLINE __mmask64 Compare8u<SimdCompareLesser>(__m512i a, __m512i b)
-        {
-            return _mm512_cmplt_epu8_mask(a, b);
-        }
-
-        template<> SIMD_INLINE __mmask64 Compare8u<SimdCompareLesserOrEqual>(__m512i a, __m512i b)
-        {
-            return _mm512_cmple_epu8_mask(a, b);
-        }
-
-        template <SimdCompareType compareType> SIMD_INLINE __mmask32 Compare16i(__m512i a, __m512i b);
-
-        template<> SIMD_INLINE __mmask32 Compare16i<SimdCompareEqual>(__m512i a, __m512i b)
-        {
-            return _mm512_cmpeq_epi16_mask(a, b);
-        }
-
-        template<> SIMD_INLINE __mmask32 Compare16i<SimdCompareNotEqual>(__m512i a, __m512i b)
-        {
-            return _mm512_cmpneq_epi16_mask(a, b);
-        }
-
-        template<> SIMD_INLINE __mmask32 Compare16i<SimdCompareGreater>(__m512i a, __m512i b)
-        {
-            return _mm512_cmpgt_epi16_mask(a, b);
-        }
-
-        template<> SIMD_INLINE __mmask32 Compare16i<SimdCompareGreaterOrEqual>(__m512i a, __m512i b)
-        {
-            return _mm512_cmpge_epi16_mask(a, b);
-        }
-
-        template<> SIMD_INLINE __mmask32 Compare16i<SimdCompareLesser>(__m512i a, __m512i b)
-        {
-            return _mm512_cmplt_epi16_mask(a, b);
-        }
-
-        template<> SIMD_INLINE __mmask32 Compare16i<SimdCompareLesserOrEqual>(__m512i a, __m512i b)
-        {
-            return _mm512_cmple_epi16_mask(a, b);
-        }
-    }
-#endif// SIMD_AVX512BW_ENABLE
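Unlike the vector-mask variants above, the Avx512bw specializations return predicate registers (__mmask64 / __mmask32), which feed directly into masked loads, blends, or population counts. A hedged sketch of the typical consumption pattern (CountGreater is illustrative, not a library function; assumes size is a multiple of 64):

#include <immintrin.h>
#include <cstddef>
#include <cstdint>

// Count bytes strictly greater than a threshold using an AVX-512BW mask.
size_t CountGreater(const uint8_t * src, size_t size, uint8_t threshold)
{
    const __m512i t = _mm512_set1_epi8((char)threshold);
    size_t count = 0;
    for (size_t i = 0; i < size; i += 64)
    {
        __m512i v = _mm512_loadu_si512(src + i);
        // Same intrinsic Compare8u<SimdCompareGreater> resolves to above.
        __mmask64 m = _mm512_cmpgt_epu8_mask(v, t);
        count += (size_t)_mm_popcnt_u64((uint64_t)m);
    }
    return count;
}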
-
-#ifdef SIMD_VMX_ENABLE
-    namespace Vmx
-    {
-        SIMD_INLINE v128_u8 GreaterOrEqual(v128_u8 a, v128_u8 b)
-        {
-            return (v128_u8)vec_cmpeq(vec_max(a, b), a);
-        }
-
-        template <SimdCompareType compareType> SIMD_INLINE v128_u8 Compare8u(v128_u8 a, v128_u8 b);
-
-        template<> SIMD_INLINE v128_u8 Compare8u<SimdCompareEqual>(v128_u8 a, v128_u8 b)
-        {
-            return (v128_u8)vec_cmpeq(a, b);
-        }
-
-        template<> SIMD_INLINE v128_u8 Compare8u<SimdCompareNotEqual>(v128_u8 a, v128_u8 b)
-        {
-            return vec_xor((v128_u8)vec_cmpeq(a, b), K8_FF);
-        }
-
-        template<> SIMD_INLINE v128_u8 Compare8u<SimdCompareGreater>(v128_u8 a, v128_u8 b)
-        {
-            return (v128_u8)vec_cmpgt(a, b);
-        }
-
-        template<> SIMD_INLINE v128_u8 Compare8u<SimdCompareGreaterOrEqual>(v128_u8 a, v128_u8 b)
-        {
-            return GreaterOrEqual(a, b);
-        }
-
-        template<> SIMD_INLINE v128_u8 Compare8u<SimdCompareLesser>(v128_u8 a, v128_u8 b)
-        {
-            return (v128_u8)vec_cmplt(a, b);
-        }
-
-        template<> SIMD_INLINE v128_u8 Compare8u<SimdCompareLesserOrEqual>(v128_u8 a, v128_u8 b)
-        {
-            return vec_xor((v128_u8)vec_cmpgt(a, b), K8_FF);
-        }
-
-        template <SimdCompareType compareType> SIMD_INLINE v128_s16 Compare16i(v128_s16 a, v128_s16 b);
-
-        template<> SIMD_INLINE v128_s16 Compare16i<SimdCompareEqual>(v128_s16 a, v128_s16 b)
-        {
-            return (v128_s16)vec_cmpeq(a, b);
-        }
-
-        template<> SIMD_INLINE v128_s16 Compare16i<SimdCompareNotEqual>(v128_s16 a, v128_s16 b)
-        {
-            return (v128_s16)vec_xor((v128_u16)vec_cmpeq(a, b), K16_FFFF);
-        }
-
-        template<> SIMD_INLINE v128_s16 Compare16i<SimdCompareGreater>(v128_s16 a, v128_s16 b)
-        {
-            return (v128_s16)vec_cmpgt(a, b);
-        }
-
-        template<> SIMD_INLINE v128_s16 Compare16i<SimdCompareGreaterOrEqual>(v128_s16 a, v128_s16 b)
-        {
-            return (v128_s16)vec_cmpeq(vec_max(a, b), a);
-        }
-
-        template<> SIMD_INLINE v128_s16 Compare16i<SimdCompareLesser>(v128_s16 a, v128_s16 b)
-        {
-            return (v128_s16)vec_cmplt(a, b);
-        }
-
-        template<> SIMD_INLINE v128_s16 Compare16i<SimdCompareLesserOrEqual>(v128_s16 a, v128_s16 b)
-        {
-            return (v128_s16)vec_xor((v128_u16)vec_cmpgt(a, b), K16_FFFF);
-        }
-    }
-#endif// SIMD_VMX_ENABLE
-
-#ifdef SIMD_NEON_ENABLE
-    namespace Neon
-    {
-        template <SimdCompareType compareType> SIMD_INLINE uint8x16_t Compare8u(const uint8x16_t & a, const uint8x16_t & b);
-
-        template<> SIMD_INLINE uint8x16_t Compare8u<SimdCompareEqual>(const uint8x16_t & a, const uint8x16_t & b)
-        {
-            return vceqq_u8(a, b);
-        }
-
-        template<> SIMD_INLINE uint8x16_t Compare8u<SimdCompareNotEqual>(const uint8x16_t & a, const uint8x16_t & b)
-        {
-            return vmvnq_u8(vceqq_u8(a, b));
-        }
-
-        template<> SIMD_INLINE uint8x16_t Compare8u<SimdCompareGreater>(const uint8x16_t & a, const uint8x16_t & b)
-        {
-            return vcgtq_u8(a, b);
-        }
-
-        template<> SIMD_INLINE uint8x16_t Compare8u<SimdCompareGreaterOrEqual>(const uint8x16_t & a, const uint8x16_t & b)
-        {
-            return vcgeq_u8(a, b);
-        }
-
-        template<> SIMD_INLINE uint8x16_t Compare8u<SimdCompareLesser>(const uint8x16_t & a, const uint8x16_t & b)
-        {
-            return vcltq_u8(a, b);
-        }
-
-        template<> SIMD_INLINE uint8x16_t Compare8u<SimdCompareLesserOrEqual>(const uint8x16_t & a, const uint8x16_t & b)
-        {
-            return vcleq_u8(a, b);
-        }
-
-        template <SimdCompareType compareType> SIMD_INLINE uint16x8_t Compare16i(const int16x8_t & a, const int16x8_t & b);
-
-        template<> SIMD_INLINE uint16x8_t Compare16i<SimdCompareEqual>(const int16x8_t & a, const int16x8_t & b)
-        {
-            return vceqq_s16(a, b);
-        }
-
-        template<> SIMD_INLINE uint16x8_t Compare16i<SimdCompareNotEqual>(const int16x8_t & a, const int16x8_t & b)
-        {
-            return vmvnq_u16(vceqq_s16(a, b));
-        }
-
-        template<> SIMD_INLINE uint16x8_t Compare16i<SimdCompareGreater>(const int16x8_t & a, const int16x8_t & b)
-        {
-            return vcgtq_s16(a, b);
-        }
-
-        template<> SIMD_INLINE uint16x8_t Compare16i<SimdCompareGreaterOrEqual>(const int16x8_t & a, const int16x8_t & b)
-        {
-            return vcgeq_s16(a, b);
-        }
-
-        template<> SIMD_INLINE uint16x8_t Compare16i<SimdCompareLesser>(const int16x8_t & a, const int16x8_t & b)
-        {
-            return vcltq_s16(a, b);
-        }
-
-        template<> SIMD_INLINE uint16x8_t Compare16i<SimdCompareLesserOrEqual>(const int16x8_t & a, const int16x8_t & b)
-        {
-            return vcleq_s16(a, b);
-        }
-    }
-#endif// SIMD_NEON_ENABLE
-}
-#endif//__SimdCompare_h__
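The header supplies only compile-time specializations, while a public entry point typically receives the comparison as a runtime SimdCompareType value; the usual bridge is a switch that instantiates one kernel per enumerator. A sketch of that dispatch shape (Binarize is a hypothetical kernel template, not a function declared above):

#include <cassert>
#include <cstddef>
#include <cstdint>

// Hypothetical kernel template parameterized by the comparison type.
template <SimdCompareType compareType>
void Binarize(const uint8_t * src, size_t size, uint8_t value, uint8_t * dst);

// Runtime-to-compile-time dispatch: one instantiation per comparison.
void Binarize(const uint8_t * src, size_t size, uint8_t value, uint8_t * dst, SimdCompareType compareType)
{
    switch (compareType)
    {
    case SimdCompareEqual: return Binarize<SimdCompareEqual>(src, size, value, dst);
    case SimdCompareNotEqual: return Binarize<SimdCompareNotEqual>(src, size, value, dst);
    case SimdCompareGreater: return Binarize<SimdCompareGreater>(src, size, value, dst);
    case SimdCompareGreaterOrEqual: return Binarize<SimdCompareGreaterOrEqual>(src, size, value, dst);
    case SimdCompareLesser: return Binarize<SimdCompareLesser>(src, size, value, dst);
    case SimdCompareLesserOrEqual: return Binarize<SimdCompareLesserOrEqual>(src, size, value, dst);
    default: assert(0);
    }
}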
diff --git a/src/3rd/Simd/Simd/SimdConfig.h b/src/3rd/Simd/Simd/SimdConfig.h
deleted file mode 100644
index 1a050380..00000000
--- a/src/3rd/Simd/Simd/SimdConfig.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2020 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#ifndef __SimdConfig_h__
-#define __SimdConfig_h__
-
-//#define SIMD_SSE_DISABLE
-
-//#define SIMD_SSE2_DISABLE
-
-//#define SIMD_SSE3_DISABLE
-
-//#define SIMD_SSSE3_DISABLE
-
-//#define SIMD_SSE41_DISABLE
-
-//#define SIMD_SSE42_DISABLE
-
-//#define SIMD_AVX_DISABLE
-
-//#define SIMD_AVX2_DISABLE
-
-//#define SIMD_AVX512F_DISABLE
-
-//#define SIMD_AVX512BW_DISABLE
-
-//#define SIMD_AVX512VNNI_DISABLE
-
-//#define SIMD_VMX_DISABLE
-
-//#define SIMD_VSX_DISABLE
-
-//#define SIMD_NEON_DISABLE
-
-//#define SIMD_NEON_FP16_DISABLE
-
-//#define SIMD_MSA_DISABLE
-
-//#define SIMD_STATIC
-
-#define SIMD_LOG_ENABLE
-
-#define SIMD_ALLOCATE_ERROR_MESSAGE
-
-#define SIMD_ALLOCATE_ASSERT
-
-#define SIMD_NO_MANS_LAND 64
-
-#define SIMD_NEON_RCP_ITER -1
-
-#define SIMD_NEON_ASM_DISABLE
-
-#define SIMD_NEON_PREFECH_SIZE 384
-
-//#define SIMD_OPENCV_ENABLE
-
-//#define SIMD_PERFORMANCE_STATISTIC
-
-//#define SIMD_RUNTIME_STATISTIC
-
-//#define SIMD_FUTURE_DISABLE
-
-#endif//__SimdConfig_h__
diff --git a/src/3rd/Simd/Simd/SimdConst.h b/src/3rd/Simd/Simd/SimdConst.h
deleted file mode 100644
index 6b260e80..00000000
--- a/src/3rd/Simd/Simd/SimdConst.h
+++ /dev/null
@@ -1,836 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2020 Yermalayeu Ihar,
-*               2014-2015 Antonenka Mikhail.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/ -#ifndef __SimdConst_h__ -#define __SimdConst_h__ - -#include "Simd/SimdInit.h" - -namespace Simd -{ - const size_t HISTOGRAM_SIZE = UCHAR_MAX + 1; - - namespace Base - { - const int LINEAR_SHIFT = 4; - const int LINEAR_ROUND_TERM = 1 << (LINEAR_SHIFT - 1); - - const int BILINEAR_SHIFT = LINEAR_SHIFT * 2; - const int BILINEAR_ROUND_TERM = 1 << (BILINEAR_SHIFT - 1); - - const int FRACTION_RANGE = 1 << LINEAR_SHIFT; - const double FRACTION_ROUND_TERM = 0.5 / FRACTION_RANGE; - - const float KF_255_DIV_6 = 255.0f / 6.0f; - - const int BGR_TO_GRAY_AVERAGING_SHIFT = 14; - const int BGR_TO_GRAY_ROUND_TERM = 1 << (BGR_TO_GRAY_AVERAGING_SHIFT - 1); - const int BLUE_TO_GRAY_WEIGHT = int(0.114*(1 << BGR_TO_GRAY_AVERAGING_SHIFT) + 0.5); - const int GREEN_TO_GRAY_WEIGHT = int(0.587*(1 << BGR_TO_GRAY_AVERAGING_SHIFT) + 0.5); - const int RED_TO_GRAY_WEIGHT = int(0.299*(1 << BGR_TO_GRAY_AVERAGING_SHIFT) + 0.5); - - const int Y_ADJUST = 16; - const int UV_ADJUST = 128; - const int YUV_TO_BGR_AVERAGING_SHIFT = 13; - const int YUV_TO_BGR_ROUND_TERM = 1 << (YUV_TO_BGR_AVERAGING_SHIFT - 1); - const int Y_TO_RGB_WEIGHT = int(1.164*(1 << YUV_TO_BGR_AVERAGING_SHIFT) + 0.5); - const int U_TO_BLUE_WEIGHT = int(2.018*(1 << YUV_TO_BGR_AVERAGING_SHIFT) + 0.5); - const int U_TO_GREEN_WEIGHT = -int(0.391*(1 << YUV_TO_BGR_AVERAGING_SHIFT) + 0.5); - const int V_TO_GREEN_WEIGHT = -int(0.813*(1 << YUV_TO_BGR_AVERAGING_SHIFT) + 0.5); - const int V_TO_RED_WEIGHT = int(1.596*(1 << YUV_TO_BGR_AVERAGING_SHIFT) + 0.5); - - const int BGR_TO_YUV_AVERAGING_SHIFT = 14; - const int BGR_TO_YUV_ROUND_TERM = 1 << (BGR_TO_YUV_AVERAGING_SHIFT - 1); - const int BLUE_TO_Y_WEIGHT = int(0.098*(1 << BGR_TO_YUV_AVERAGING_SHIFT) + 0.5); - const int GREEN_TO_Y_WEIGHT = int(0.504*(1 << BGR_TO_YUV_AVERAGING_SHIFT) + 0.5); - const int RED_TO_Y_WEIGHT = int(0.257*(1 << BGR_TO_YUV_AVERAGING_SHIFT) + 0.5); - const int BLUE_TO_U_WEIGHT = int(0.439*(1 << BGR_TO_YUV_AVERAGING_SHIFT) + 0.5); - const int GREEN_TO_U_WEIGHT = -int(0.291*(1 << BGR_TO_YUV_AVERAGING_SHIFT) + 0.5); - const int RED_TO_U_WEIGHT = -int(0.148*(1 << BGR_TO_YUV_AVERAGING_SHIFT) + 0.5); - const int BLUE_TO_V_WEIGHT = -int(0.071*(1 << BGR_TO_YUV_AVERAGING_SHIFT) + 0.5); - const int GREEN_TO_V_WEIGHT = -int(0.368*(1 << BGR_TO_YUV_AVERAGING_SHIFT) + 0.5); - const int RED_TO_V_WEIGHT = int(0.439*(1 << BGR_TO_YUV_AVERAGING_SHIFT) + 0.5); - - const int DIVISION_BY_9_SHIFT = 16; - const int DIVISION_BY_9_FACTOR = (1 << DIVISION_BY_9_SHIFT) / 9; - } - -#ifdef SIMD_SSE_ENABLE - namespace Sse - { - const size_t F = sizeof(__m128) / sizeof(float); - const size_t DF = 2 * F; - const size_t QF = 4 * F; - const size_t HF = F / 2; - } -#endif// SIMD_SSE_ENABLE - -#ifdef SIMD_SSE2_ENABLE - namespace Sse2 - { - using namespace Sse; -#if defined(_MSC_VER) && _MSC_VER >= 1700 && _MSC_VER < 1900 // Visual Studio 2012/2013 compiler bug - using Sse::F; - using Sse::DF; - using Sse::QF; -#endif - - const size_t A = sizeof(__m128i); - const size_t DA = 2 * A; - const size_t QA = 4 * A; - const size_t OA = 8 * A; - const size_t HA = A / 2; - - const __m128i K_ZERO = SIMD_MM_SET1_EPI8(0); - const __m128i K_INV_ZERO = SIMD_MM_SET1_EPI8(0xFF); - - const __m128i K8_01 = SIMD_MM_SET1_EPI8(0x01); - const __m128i K8_02 = SIMD_MM_SET1_EPI8(0x02); - const __m128i K8_03 = SIMD_MM_SET1_EPI8(0x03); - const __m128i K8_04 = SIMD_MM_SET1_EPI8(0x04); - const __m128i K8_07 = SIMD_MM_SET1_EPI8(0x07); - const __m128i K8_08 = SIMD_MM_SET1_EPI8(0x08); - const __m128i K8_10 = SIMD_MM_SET1_EPI8(0x10); - const __m128i K8_20 = 
SIMD_MM_SET1_EPI8(0x20); - const __m128i K8_40 = SIMD_MM_SET1_EPI8(0x40); - const __m128i K8_80 = SIMD_MM_SET1_EPI8(0x80); - - const __m128i K8_01_FF = SIMD_MM_SET2_EPI8(0x01, 0xFF); - - const __m128i K16_0001 = SIMD_MM_SET1_EPI16(0x0001); - const __m128i K16_0002 = SIMD_MM_SET1_EPI16(0x0002); - const __m128i K16_0003 = SIMD_MM_SET1_EPI16(0x0003); - const __m128i K16_0004 = SIMD_MM_SET1_EPI16(0x0004); - const __m128i K16_0005 = SIMD_MM_SET1_EPI16(0x0005); - const __m128i K16_0006 = SIMD_MM_SET1_EPI16(0x0006); - const __m128i K16_0008 = SIMD_MM_SET1_EPI16(0x0008); - const __m128i K16_0020 = SIMD_MM_SET1_EPI16(0x0020); - const __m128i K16_0080 = SIMD_MM_SET1_EPI16(0x0080); - const __m128i K16_00FF = SIMD_MM_SET1_EPI16(0x00FF); - const __m128i K16_FF00 = SIMD_MM_SET1_EPI16(0xFF00); - - const __m128i K32_00000001 = SIMD_MM_SET1_EPI32(0x00000001); - const __m128i K32_00000002 = SIMD_MM_SET1_EPI32(0x00000002); - const __m128i K32_00000004 = SIMD_MM_SET1_EPI32(0x00000004); - const __m128i K32_00000008 = SIMD_MM_SET1_EPI32(0x00000008); - const __m128i K32_000000FF = SIMD_MM_SET1_EPI32(0x000000FF); - const __m128i K32_0000FFFF = SIMD_MM_SET1_EPI32(0x0000FFFF); - const __m128i K32_00010000 = SIMD_MM_SET1_EPI32(0x00010000); - const __m128i K32_01000000 = SIMD_MM_SET1_EPI32(0x01000000); - const __m128i K32_00FFFFFF = SIMD_MM_SET1_EPI32(0x00FFFFFF); - const __m128i K32_FFFFFF00 = SIMD_MM_SET1_EPI32(0xFFFFFF00); - - const __m128i K64_00000000FFFFFFFF = SIMD_MM_SET2_EPI32(0xFFFFFFFF, 0); - - const __m128i K16_Y_ADJUST = SIMD_MM_SET1_EPI16(Base::Y_ADJUST); - const __m128i K16_UV_ADJUST = SIMD_MM_SET1_EPI16(Base::UV_ADJUST); - - const __m128i K16_YRGB_RT = SIMD_MM_SET2_EPI16(Base::Y_TO_RGB_WEIGHT, Base::YUV_TO_BGR_ROUND_TERM); - const __m128i K16_VR_0 = SIMD_MM_SET2_EPI16(Base::V_TO_RED_WEIGHT, 0); - const __m128i K16_UG_VG = SIMD_MM_SET2_EPI16(Base::U_TO_GREEN_WEIGHT, Base::V_TO_GREEN_WEIGHT); - const __m128i K16_UB_0 = SIMD_MM_SET2_EPI16(Base::U_TO_BLUE_WEIGHT, 0); - - const __m128i K16_BY_RY = SIMD_MM_SET2_EPI16(Base::BLUE_TO_Y_WEIGHT, Base::RED_TO_Y_WEIGHT); - const __m128i K16_GY_RT = SIMD_MM_SET2_EPI16(Base::GREEN_TO_Y_WEIGHT, Base::BGR_TO_YUV_ROUND_TERM); - const __m128i K16_BU_RU = SIMD_MM_SET2_EPI16(Base::BLUE_TO_U_WEIGHT, Base::RED_TO_U_WEIGHT); - const __m128i K16_GU_RT = SIMD_MM_SET2_EPI16(Base::GREEN_TO_U_WEIGHT, Base::BGR_TO_YUV_ROUND_TERM); - const __m128i K16_BV_RV = SIMD_MM_SET2_EPI16(Base::BLUE_TO_V_WEIGHT, Base::RED_TO_V_WEIGHT); - const __m128i K16_GV_RT = SIMD_MM_SET2_EPI16(Base::GREEN_TO_V_WEIGHT, Base::BGR_TO_YUV_ROUND_TERM); - - const __m128i K16_DIVISION_BY_9_FACTOR = SIMD_MM_SET1_EPI16(Base::DIVISION_BY_9_FACTOR); - } -#endif// SIMD_SSE2_ENABLE - -#ifdef SIMD_SSE3_ENABLE - namespace Sse3 - { - using namespace Sse2; -#if defined(_MSC_VER) && _MSC_VER >= 1700 && _MSC_VER < 1900 // Visual Studio 2012/2013 compiler bug - using Sse::F; - using Sse::DF; - using Sse::QF; -#endif - } -#endif// SIMD_SSE3_ENABLE - -#ifdef SIMD_SSSE3_ENABLE - namespace Ssse3 - { - using namespace Sse3; - - const __m128i K8_SHUFFLE_GRAY_TO_BGR0 = SIMD_MM_SETR_EPI8(0x0, 0x0, 0x0, 0x1, 0x1, 0x1, 0x2, 0x2, 0x2, 0x3, 0x3, 0x3, 0x4, 0x4, 0x4, 0x5); - const __m128i K8_SHUFFLE_GRAY_TO_BGR1 = SIMD_MM_SETR_EPI8(0x5, 0x5, 0x6, 0x6, 0x6, 0x7, 0x7, 0x7, 0x8, 0x8, 0x8, 0x9, 0x9, 0x9, 0xA, 0xA); - const __m128i K8_SHUFFLE_GRAY_TO_BGR2 = SIMD_MM_SETR_EPI8(0xA, 0xB, 0xB, 0xB, 0xC, 0xC, 0xC, 0xD, 0xD, 0xD, 0xE, 0xE, 0xE, 0xF, 0xF, 0xF); - - const __m128i K8_SHUFFLE_BLUE_TO_BGR0 = SIMD_MM_SETR_EPI8(0x0, -1, -1, 0x1, -1, -1, 
0x2, -1, -1, 0x3, -1, -1, 0x4, -1, -1, 0x5); - const __m128i K8_SHUFFLE_BLUE_TO_BGR1 = SIMD_MM_SETR_EPI8(-1, -1, 0x6, -1, -1, 0x7, -1, -1, 0x8, -1, -1, 0x9, -1, -1, 0xA, -1); - const __m128i K8_SHUFFLE_BLUE_TO_BGR2 = SIMD_MM_SETR_EPI8(-1, 0xB, -1, -1, 0xC, -1, -1, 0xD, -1, -1, 0xE, -1, -1, 0xF, -1, -1); - - const __m128i K8_SHUFFLE_GREEN_TO_BGR0 = SIMD_MM_SETR_EPI8(-1, 0x0, -1, -1, 0x1, -1, -1, 0x2, -1, -1, 0x3, -1, -1, 0x4, -1, -1); - const __m128i K8_SHUFFLE_GREEN_TO_BGR1 = SIMD_MM_SETR_EPI8(0x5, -1, -1, 0x6, -1, -1, 0x7, -1, -1, 0x8, -1, -1, 0x9, -1, -1, 0xA); - const __m128i K8_SHUFFLE_GREEN_TO_BGR2 = SIMD_MM_SETR_EPI8(-1, -1, 0xB, -1, -1, 0xC, -1, -1, 0xD, -1, -1, 0xE, -1, -1, 0xF, -1); - - const __m128i K8_SHUFFLE_RED_TO_BGR0 = SIMD_MM_SETR_EPI8(-1, -1, 0x0, -1, -1, 0x1, -1, -1, 0x2, -1, -1, 0x3, -1, -1, 0x4, -1); - const __m128i K8_SHUFFLE_RED_TO_BGR1 = SIMD_MM_SETR_EPI8(-1, 0x5, -1, -1, 0x6, -1, -1, 0x7, -1, -1, 0x8, -1, -1, 0x9, -1, -1); - const __m128i K8_SHUFFLE_RED_TO_BGR2 = SIMD_MM_SETR_EPI8(0xA, -1, -1, 0xB, -1, -1, 0xC, -1, -1, 0xD, -1, -1, 0xE, -1, -1, 0xF); - - const __m128i K8_SHUFFLE_BGR0_TO_BLUE = SIMD_MM_SETR_EPI8(0x0, 0x3, 0x6, 0x9, 0xC, 0xF, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); - const __m128i K8_SHUFFLE_BGR1_TO_BLUE = SIMD_MM_SETR_EPI8(-1, -1, -1, -1, -1, -1, 0x2, 0x5, 0x8, 0xB, 0xE, -1, -1, -1, -1, -1); - const __m128i K8_SHUFFLE_BGR2_TO_BLUE = SIMD_MM_SETR_EPI8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x1, 0x4, 0x7, 0xA, 0xD); - - const __m128i K8_SHUFFLE_BGR0_TO_GREEN = SIMD_MM_SETR_EPI8(0x1, 0x4, 0x7, 0xA, 0xD, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); - const __m128i K8_SHUFFLE_BGR1_TO_GREEN = SIMD_MM_SETR_EPI8(-1, -1, -1, -1, -1, 0x0, 0x3, 0x6, 0x9, 0xC, 0xF, -1, -1, -1, -1, -1); - const __m128i K8_SHUFFLE_BGR2_TO_GREEN = SIMD_MM_SETR_EPI8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x2, 0x5, 0x8, 0xB, 0xE); - - const __m128i K8_SHUFFLE_BGR0_TO_RED = SIMD_MM_SETR_EPI8(0x2, 0x5, 0x8, 0xB, 0xE, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); - const __m128i K8_SHUFFLE_BGR1_TO_RED = SIMD_MM_SETR_EPI8(-1, -1, -1, -1, -1, 0x1, 0x4, 0x7, 0xA, 0xD, -1, -1, -1, -1, -1, -1); - const __m128i K8_SHUFFLE_BGR2_TO_RED = SIMD_MM_SETR_EPI8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x0, 0x3, 0x6, 0x9, 0xC, 0xF); - } -#endif// SIMD_SSSE3_ENABLE - -#ifdef SIMD_SSE41_ENABLE - namespace Sse41 - { - using namespace Ssse3; -#if defined(_MSC_VER) && _MSC_VER >= 1700 && _MSC_VER < 1900 // Visual Studio 2012/2013 compiler bug - using Sse::F; - using Sse::DF; - using Sse::QF; -#endif - } -#endif// SIMD_SSE41_ENABLE - -#ifdef SIMD_SSE42_ENABLE - namespace Sse42 - { - using namespace Sse41; - } -#endif// SIMD_SSE42_ENABLE - -#ifdef SIMD_AVX_ENABLE - namespace Avx - { - const size_t F = sizeof(__m256) / sizeof(float); - const size_t DF = 2 * F; - const size_t QF = 4 * F; - const size_t HF = F / 2; - } -#endif// SIMD_AVX_ENABLE - -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - using namespace Avx; -#if defined(_MSC_VER) && _MSC_VER >= 1700 && _MSC_VER < 1900 // Visual Studio 2012/2013 compiler bug - using Avx::F; - using Avx::DF; - using Avx::QF; -#endif - - const size_t A = sizeof(__m256i); - const size_t DA = 2 * A; - const size_t QA = 4 * A; - const size_t OA = 8 * A; - const size_t HA = A / 2; - - const __m256i K_ZERO = SIMD_MM256_SET1_EPI8(0); - const __m256i K_INV_ZERO = SIMD_MM256_SET1_EPI8(0xFF); - - const __m256i K8_01 = SIMD_MM256_SET1_EPI8(0x01); - const __m256i K8_02 = SIMD_MM256_SET1_EPI8(0x02); - const __m256i K8_03 = SIMD_MM256_SET1_EPI8(0x03); - const __m256i K8_04 = 
SIMD_MM256_SET1_EPI8(0x04); - const __m256i K8_07 = SIMD_MM256_SET1_EPI8(0x07); - const __m256i K8_08 = SIMD_MM256_SET1_EPI8(0x08); - const __m256i K8_10 = SIMD_MM256_SET1_EPI8(0x10); - const __m256i K8_20 = SIMD_MM256_SET1_EPI8(0x20); - const __m256i K8_40 = SIMD_MM256_SET1_EPI8(0x40); - const __m256i K8_80 = SIMD_MM256_SET1_EPI8(0x80); - - const __m256i K8_01_FF = SIMD_MM256_SET2_EPI8(0x01, 0xFF); - - const __m256i K16_0001 = SIMD_MM256_SET1_EPI16(0x0001); - const __m256i K16_0002 = SIMD_MM256_SET1_EPI16(0x0002); - const __m256i K16_0003 = SIMD_MM256_SET1_EPI16(0x0003); - const __m256i K16_0004 = SIMD_MM256_SET1_EPI16(0x0004); - const __m256i K16_0005 = SIMD_MM256_SET1_EPI16(0x0005); - const __m256i K16_0006 = SIMD_MM256_SET1_EPI16(0x0006); - const __m256i K16_0008 = SIMD_MM256_SET1_EPI16(0x0008); - const __m256i K16_0010 = SIMD_MM256_SET1_EPI16(0x0010); - const __m256i K16_0018 = SIMD_MM256_SET1_EPI16(0x0018); - const __m256i K16_0020 = SIMD_MM256_SET1_EPI16(0x0020); - const __m256i K16_0080 = SIMD_MM256_SET1_EPI16(0x0080); - const __m256i K16_00FF = SIMD_MM256_SET1_EPI16(0x00FF); - const __m256i K16_FF00 = SIMD_MM256_SET1_EPI16(0xFF00); - - const __m256i K32_00000001 = SIMD_MM256_SET1_EPI32(0x00000001); - const __m256i K32_00000002 = SIMD_MM256_SET1_EPI32(0x00000002); - const __m256i K32_00000004 = SIMD_MM256_SET1_EPI32(0x00000004); - const __m256i K32_00000008 = SIMD_MM256_SET1_EPI32(0x00000008); - const __m256i K32_000000FF = SIMD_MM256_SET1_EPI32(0x000000FF); - const __m256i K32_0000FFFF = SIMD_MM256_SET1_EPI32(0x0000FFFF); - const __m256i K32_00010000 = SIMD_MM256_SET1_EPI32(0x00010000); - const __m256i K32_01000000 = SIMD_MM256_SET1_EPI32(0x01000000); - const __m256i K32_FFFFFF00 = SIMD_MM256_SET1_EPI32(0xFFFFFF00); - - const __m256i K16_Y_ADJUST = SIMD_MM256_SET1_EPI16(Base::Y_ADJUST); - const __m256i K16_UV_ADJUST = SIMD_MM256_SET1_EPI16(Base::UV_ADJUST); - - const __m256i K16_YRGB_RT = SIMD_MM256_SET2_EPI16(Base::Y_TO_RGB_WEIGHT, Base::YUV_TO_BGR_ROUND_TERM); - const __m256i K16_VR_0 = SIMD_MM256_SET2_EPI16(Base::V_TO_RED_WEIGHT, 0); - const __m256i K16_UG_VG = SIMD_MM256_SET2_EPI16(Base::U_TO_GREEN_WEIGHT, Base::V_TO_GREEN_WEIGHT); - const __m256i K16_UB_0 = SIMD_MM256_SET2_EPI16(Base::U_TO_BLUE_WEIGHT, 0); - - const __m256i K16_BY_RY = SIMD_MM256_SET2_EPI16(Base::BLUE_TO_Y_WEIGHT, Base::RED_TO_Y_WEIGHT); - const __m256i K16_GY_RT = SIMD_MM256_SET2_EPI16(Base::GREEN_TO_Y_WEIGHT, Base::BGR_TO_YUV_ROUND_TERM); - const __m256i K16_BU_RU = SIMD_MM256_SET2_EPI16(Base::BLUE_TO_U_WEIGHT, Base::RED_TO_U_WEIGHT); - const __m256i K16_GU_RT = SIMD_MM256_SET2_EPI16(Base::GREEN_TO_U_WEIGHT, Base::BGR_TO_YUV_ROUND_TERM); - const __m256i K16_BV_RV = SIMD_MM256_SET2_EPI16(Base::BLUE_TO_V_WEIGHT, Base::RED_TO_V_WEIGHT); - const __m256i K16_GV_RT = SIMD_MM256_SET2_EPI16(Base::GREEN_TO_V_WEIGHT, Base::BGR_TO_YUV_ROUND_TERM); - - const __m256i K16_DIVISION_BY_9_FACTOR = SIMD_MM256_SET1_EPI16(Base::DIVISION_BY_9_FACTOR); - - const __m256i K8_SHUFFLE_0 = SIMD_MM256_SETR_EPI8( - 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, - 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0); - - const __m256i K8_SHUFFLE_1 = SIMD_MM256_SETR_EPI8( - 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, - 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70); - - const __m256i K8_SHUFFLE_GRAY_TO_BGR0 = SIMD_MM256_SETR_EPI8( - 0x0, 0x0, 0x0, 
0x1, 0x1, 0x1, 0x2, 0x2, 0x2, 0x3, 0x3, 0x3, 0x4, 0x4, 0x4, 0x5, - 0x5, 0x5, 0x6, 0x6, 0x6, 0x7, 0x7, 0x7, 0x8, 0x8, 0x8, 0x9, 0x9, 0x9, 0xA, 0xA); - const __m256i K8_SHUFFLE_GRAY_TO_BGR1 = SIMD_MM256_SETR_EPI8( - 0x2, 0x3, 0x3, 0x3, 0x4, 0x4, 0x4, 0x5, 0x5, 0x5, 0x6, 0x6, 0x6, 0x7, 0x7, 0x7, - 0x8, 0x8, 0x8, 0x9, 0x9, 0x9, 0xA, 0xA, 0xA, 0xB, 0xB, 0xB, 0xC, 0xC, 0xC, 0xD); - const __m256i K8_SHUFFLE_GRAY_TO_BGR2 = SIMD_MM256_SETR_EPI8( - 0x5, 0x5, 0x6, 0x6, 0x6, 0x7, 0x7, 0x7, 0x8, 0x8, 0x8, 0x9, 0x9, 0x9, 0xA, 0xA, - 0xA, 0xB, 0xB, 0xB, 0xC, 0xC, 0xC, 0xD, 0xD, 0xD, 0xE, 0xE, 0xE, 0xF, 0xF, 0xF); - - const __m256i K8_SHUFFLE_PERMUTED_BLUE_TO_BGR0 = SIMD_MM256_SETR_EPI8( - 0x0, -1, -1, 0x1, -1, -1, 0x2, -1, -1, 0x3, -1, -1, 0x4, -1, -1, 0x5, - -1, -1, 0x6, -1, -1, 0x7, -1, -1, 0x8, -1, -1, 0x9, -1, -1, 0xA, -1); - const __m256i K8_SHUFFLE_PERMUTED_BLUE_TO_BGR1 = SIMD_MM256_SETR_EPI8( - -1, 0x3, -1, -1, 0x4, -1, -1, 0x5, -1, -1, 0x6, -1, -1, 0x7, -1, -1, - 0x8, -1, -1, 0x9, -1, -1, 0xA, -1, -1, 0xB, -1, -1, 0xC, -1, -1, 0xD); - const __m256i K8_SHUFFLE_PERMUTED_BLUE_TO_BGR2 = SIMD_MM256_SETR_EPI8( - -1, -1, 0x6, -1, -1, 0x7, -1, -1, 0x8, -1, -1, 0x9, -1, -1, 0xA, -1, - -1, 0xB, -1, -1, 0xC, -1, -1, 0xD, -1, -1, 0xE, -1, -1, 0xF, -1, -1); - - const __m256i K8_SHUFFLE_PERMUTED_GREEN_TO_BGR0 = SIMD_MM256_SETR_EPI8( - -1, 0x0, -1, -1, 0x1, -1, -1, 0x2, -1, -1, 0x3, -1, -1, 0x4, -1, -1, - 0x5, -1, -1, 0x6, -1, -1, 0x7, -1, -1, 0x8, -1, -1, 0x9, -1, -1, 0xA); - const __m256i K8_SHUFFLE_PERMUTED_GREEN_TO_BGR1 = SIMD_MM256_SETR_EPI8( - -1, -1, 0x3, -1, -1, 0x4, -1, -1, 0x5, -1, -1, 0x6, -1, -1, 0x7, -1, - -1, 0x8, -1, -1, 0x9, -1, -1, 0xA, -1, -1, 0xB, -1, -1, 0xC, -1, -1); - const __m256i K8_SHUFFLE_PERMUTED_GREEN_TO_BGR2 = SIMD_MM256_SETR_EPI8( - 0x5, -1, -1, 0x6, -1, -1, 0x7, -1, -1, 0x8, -1, -1, 0x9, -1, -1, 0xA, - -1, -1, 0xB, -1, -1, 0xC, -1, -1, 0xD, -1, -1, 0xE, -1, -1, 0xF, -1); - - const __m256i K8_SHUFFLE_PERMUTED_RED_TO_BGR0 = SIMD_MM256_SETR_EPI8( - -1, -1, 0x0, -1, -1, 0x1, -1, -1, 0x2, -1, -1, 0x3, -1, -1, 0x4, -1, - -1, 0x5, -1, -1, 0x6, -1, -1, 0x7, -1, -1, 0x8, -1, -1, 0x9, -1, -1); - const __m256i K8_SHUFFLE_PERMUTED_RED_TO_BGR1 = SIMD_MM256_SETR_EPI8( - 0x2, -1, -1, 0x3, -1, -1, 0x4, -1, -1, 0x5, -1, -1, 0x6, -1, -1, 0x7, - -1, -1, 0x8, -1, -1, 0x9, -1, -1, 0xA, -1, -1, 0xB, -1, -1, 0xC, -1); - const __m256i K8_SHUFFLE_PERMUTED_RED_TO_BGR2 = SIMD_MM256_SETR_EPI8( - -1, 0x5, -1, -1, 0x6, -1, -1, 0x7, -1, -1, 0x8, -1, -1, 0x9, -1, -1, - 0xA, -1, -1, 0xB, -1, -1, 0xC, -1, -1, 0xD, -1, -1, 0xE, -1, -1, 0xF); - - const __m256i K8_SHUFFLE_BGR0_TO_BLUE = SIMD_MM256_SETR_EPI8( - 0x0, 0x3, 0x6, 0x9, 0xC, 0xF, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, 0x2, 0x5, 0x8, 0xB, 0xE, -1, -1, -1, -1, -1); - const __m256i K8_SHUFFLE_BGR1_TO_BLUE = SIMD_MM256_SETR_EPI8( - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x1, 0x4, 0x7, 0xA, 0xD, - 0x0, 0x3, 0x6, 0x9, 0xC, 0xF, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); - const __m256i K8_SHUFFLE_BGR2_TO_BLUE = SIMD_MM256_SETR_EPI8( - -1, -1, -1, -1, -1, -1, 0x2, 0x5, 0x8, 0xB, 0xE, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x1, 0x4, 0x7, 0xA, 0xD); - - const __m256i K8_SHUFFLE_BGR0_TO_GREEN = SIMD_MM256_SETR_EPI8( - 0x1, 0x4, 0x7, 0xA, 0xD, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, 0x0, 0x3, 0x6, 0x9, 0xC, 0xF, -1, -1, -1, -1, -1); - const __m256i K8_SHUFFLE_BGR1_TO_GREEN = SIMD_MM256_SETR_EPI8( - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x2, 0x5, 0x8, 0xB, 0xE, - 0x1, 0x4, 0x7, 
0xA, 0xD, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); - const __m256i K8_SHUFFLE_BGR2_TO_GREEN = SIMD_MM256_SETR_EPI8( - -1, -1, -1, -1, -1, 0x0, 0x3, 0x6, 0x9, 0xC, 0xF, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x2, 0x5, 0x8, 0xB, 0xE); - - const __m256i K8_SHUFFLE_BGR0_TO_RED = SIMD_MM256_SETR_EPI8( - 0x2, 0x5, 0x8, 0xB, 0xE, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, 0x1, 0x4, 0x7, 0xA, 0xD, -1, -1, -1, -1, -1, -1); - const __m256i K8_SHUFFLE_BGR1_TO_RED = SIMD_MM256_SETR_EPI8( - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x0, 0x3, 0x6, 0x9, 0xC, 0xF, - 0x2, 0x5, 0x8, 0xB, 0xE, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); - const __m256i K8_SHUFFLE_BGR2_TO_RED = SIMD_MM256_SETR_EPI8( - -1, -1, -1, -1, -1, 0x1, 0x4, 0x7, 0xA, 0xD, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x0, 0x3, 0x6, 0x9, 0xC, 0xF); - - const __m256i K8_BGR_TO_BGRA_SHUFFLE = SIMD_MM256_SETR_EPI8( - 0x0, 0x1, 0x2, -1, 0x3, 0x4, 0x5, -1, 0x6, 0x7, 0x8, -1, 0x9, 0xA, 0xB, -1, - 0x4, 0x5, 0x6, -1, 0x7, 0x8, 0x9, -1, 0xA, 0xB, 0xC, -1, 0xD, 0xE, 0xF, -1); - - const __m256i K8_RGB_TO_BGRA_SHUFFLE = SIMD_MM256_SETR_EPI8( - 0x2, 0x1, 0x0, -1, 0x5, 0x4, 0x3, -1, 0x8, 0x7, 0x6, -1, 0xB, 0xA, 0x9, -1, - 0x6, 0x5, 0x4, -1, 0x9, 0x8, 0x7, -1, 0xC, 0xB, 0xA, -1, 0xF, 0xE, 0xD, -1); - - const __m256i K32_TWO_UNPACK_PERMUTE = SIMD_MM256_SETR_EPI32(0, 2, 4, 6, 1, 3, 5, 7); - } -#endif// SIMD_AVX2_ENABLE - -#ifdef SIMD_AVX512F_ENABLE - namespace Avx512f - { - const size_t F = sizeof(__m512) / sizeof(float); - const size_t DF = 2 * F; - const size_t QF = 4 * F; - const size_t HF = F / 2; - - const __m512i K32_INTERLEAVE_0 = SIMD_MM512_SETR_EPI32(0x00, 0x10, 0x01, 0x11, 0x02, 0x12, 0x03, 0x13, 0x04, 0x14, 0x05, 0x15, 0x06, 0x16, 0x07, 0x17); - const __m512i K32_INTERLEAVE_1 = SIMD_MM512_SETR_EPI32(0x08, 0x18, 0x09, 0x19, 0x0A, 0x1A, 0x0B, 0x1B, 0x0C, 0x1C, 0x0D, 0x1D, 0x0E, 0x1E, 0x0F, 0x1F); - - const __m512i K32_DEINTERLEAVE_0 = SIMD_MM512_SETR_EPI32(0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E, 0x10, 0x12, 0x14, 0x16, 0x18, 0x1A, 0x1C, 0x1E); - const __m512i K32_DEINTERLEAVE_1 = SIMD_MM512_SETR_EPI32(0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F, 0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, 0x1D, 0x1F); - - const __m512i K32_PERMUTE_FOR_PACK = SIMD_MM512_SETR_EPI32(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15); - const __m512i K32_PERMUTE_FOR_UNPACK = SIMD_MM512_SETR_EPI32(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15); - } -#endif// SIMD_AVX512F_ENABLE - -#ifdef SIMD_AVX512BW_ENABLE - namespace Avx512bw - { - using namespace Avx512f; - - const size_t A = sizeof(__m512i); - const size_t DA = 2 * A; - const size_t QA = 4 * A; - const size_t HA = A / 2; - - const __m512i K_ZERO = SIMD_MM512_SET1_EPI8(0); - const __m512i K_INV_ZERO = SIMD_MM512_SET1_EPI8(0xFF); - - const __m512i K8_01 = SIMD_MM512_SET1_EPI8(0x01); - const __m512i K8_02 = SIMD_MM512_SET1_EPI8(0x02); - const __m512i K8_03 = SIMD_MM512_SET1_EPI8(0x03); - const __m512i K8_07 = SIMD_MM512_SET1_EPI8(0x07); - - const __m512i K8_01_FF = SIMD_MM512_SET2_EPI8(0x01, 0xFF); - - const __m512i K16_0001 = SIMD_MM512_SET1_EPI16(0x0001); - const __m512i K16_0002 = SIMD_MM512_SET1_EPI16(0x0002); - const __m512i K16_0003 = SIMD_MM512_SET1_EPI16(0x0003); - const __m512i K16_0004 = SIMD_MM512_SET1_EPI16(0x0004); - const __m512i K16_0005 = SIMD_MM512_SET1_EPI16(0x0005); - const __m512i K16_0006 = SIMD_MM512_SET1_EPI16(0x0006); - const __m512i K16_0008 = SIMD_MM512_SET1_EPI16(0x0008); - const __m512i K16_0010 = 
SIMD_MM512_SET1_EPI16(0x0010); - const __m512i K16_0020 = SIMD_MM512_SET1_EPI16(0x0020); - const __m512i K16_0038 = SIMD_MM512_SET1_EPI16(0x0038); - const __m512i K16_0080 = SIMD_MM512_SET1_EPI16(0x0080); - const __m512i K16_00FF = SIMD_MM512_SET1_EPI16(0x00FF); - const __m512i K16_FF00 = SIMD_MM512_SET1_EPI16(0xFF00); - - const __m512i K32_00000001 = SIMD_MM512_SET1_EPI32(0x00000001); - const __m512i K32_000000FF = SIMD_MM512_SET1_EPI32(0x000000FF); - const __m512i K32_0000FFFF = SIMD_MM512_SET1_EPI32(0x0000FFFF); - const __m512i K32_00010000 = SIMD_MM512_SET1_EPI32(0x00010000); - const __m512i K32_FFFFFF00 = SIMD_MM512_SET1_EPI32(0xFFFFFF00); - - const __m512i K16_Y_ADJUST = SIMD_MM512_SET1_EPI16(Base::Y_ADJUST); - const __m512i K16_UV_ADJUST = SIMD_MM512_SET1_EPI16(Base::UV_ADJUST); - - const __m512i K16_YRGB_RT = SIMD_MM512_SET2_EPI16(Base::Y_TO_RGB_WEIGHT, Base::YUV_TO_BGR_ROUND_TERM); - const __m512i K16_VR_0 = SIMD_MM512_SET2_EPI16(Base::V_TO_RED_WEIGHT, 0); - const __m512i K16_UG_VG = SIMD_MM512_SET2_EPI16(Base::U_TO_GREEN_WEIGHT, Base::V_TO_GREEN_WEIGHT); - const __m512i K16_UB_0 = SIMD_MM512_SET2_EPI16(Base::U_TO_BLUE_WEIGHT, 0); - - const __m512i K16_BY_RY = SIMD_MM512_SET2_EPI16(Base::BLUE_TO_Y_WEIGHT, Base::RED_TO_Y_WEIGHT); - const __m512i K16_GY_RT = SIMD_MM512_SET2_EPI16(Base::GREEN_TO_Y_WEIGHT, Base::BGR_TO_YUV_ROUND_TERM); - const __m512i K16_BU_RU = SIMD_MM512_SET2_EPI16(Base::BLUE_TO_U_WEIGHT, Base::RED_TO_U_WEIGHT); - const __m512i K16_GU_RT = SIMD_MM512_SET2_EPI16(Base::GREEN_TO_U_WEIGHT, Base::BGR_TO_YUV_ROUND_TERM); - const __m512i K16_BV_RV = SIMD_MM512_SET2_EPI16(Base::BLUE_TO_V_WEIGHT, Base::RED_TO_V_WEIGHT); - const __m512i K16_GV_RT = SIMD_MM512_SET2_EPI16(Base::GREEN_TO_V_WEIGHT, Base::BGR_TO_YUV_ROUND_TERM); - - const __m512i K16_DIVISION_BY_9_FACTOR = SIMD_MM512_SET1_EPI16(Base::DIVISION_BY_9_FACTOR); - - const __m512i K8_SUFFLE_BGRA_TO_G0A0 = SIMD_MM512_SETR_EPI8( - 0x1, -1, 0x3, -1, 0x5, -1, 0x7, -1, 0x9, -1, 0xB, -1, 0xD, -1, 0xF, -1, - 0x1, -1, 0x3, -1, 0x5, -1, 0x7, -1, 0x9, -1, 0xB, -1, 0xD, -1, 0xF, -1, - 0x1, -1, 0x3, -1, 0x5, -1, 0x7, -1, 0x9, -1, 0xB, -1, 0xD, -1, 0xF, -1, - 0x1, -1, 0x3, -1, 0x5, -1, 0x7, -1, 0x9, -1, 0xB, -1, 0xD, -1, 0xF, -1); - - const __m512i K8_SUFFLE_BGRA_TO_G000 = SIMD_MM512_SETR_EPI8( - 0x1, -1, -1, -1, 0x5, -1, -1, -1, 0x9, -1, -1, -1, 0xD, -1, -1, -1, - 0x1, -1, -1, -1, 0x5, -1, -1, -1, 0x9, -1, -1, -1, 0xD, -1, -1, -1, - 0x1, -1, -1, -1, 0x5, -1, -1, -1, 0x9, -1, -1, -1, 0xD, -1, -1, -1, - 0x1, -1, -1, -1, 0x5, -1, -1, -1, 0x9, -1, -1, -1, 0xD, -1, -1, -1); - - const __m512i K8_SUFFLE_BGRA_TO_A000 = SIMD_MM512_SETR_EPI8( - 0x3, -1, -1, -1, 0x7, -1, -1, -1, 0xB, -1, -1, -1, 0xF, -1, -1, -1, - 0x3, -1, -1, -1, 0x7, -1, -1, -1, 0xB, -1, -1, -1, 0xF, -1, -1, -1, - 0x3, -1, -1, -1, 0x7, -1, -1, -1, 0xB, -1, -1, -1, 0xF, -1, -1, -1, - 0x3, -1, -1, -1, 0x7, -1, -1, -1, 0xB, -1, -1, -1, 0xF, -1, -1, -1); - - const __m512i K8_SUFFLE_BGR_TO_B0R0 = SIMD_MM512_SETR_EPI8( - 0x0, -1, 0x2, -1, 0x3, -1, 0x5, -1, 0x6, -1, 0x8, -1, 0x9, -1, 0xB, -1, - 0x0, -1, 0x2, -1, 0x3, -1, 0x5, -1, 0x6, -1, 0x8, -1, 0x9, -1, 0xB, -1, - 0x0, -1, 0x2, -1, 0x3, -1, 0x5, -1, 0x6, -1, 0x8, -1, 0x9, -1, 0xB, -1, - 0x0, -1, 0x2, -1, 0x3, -1, 0x5, -1, 0x6, -1, 0x8, -1, 0x9, -1, 0xB, -1); - - const __m512i K8_SUFFLE_BGR_TO_G000 = SIMD_MM512_SETR_EPI8( - 0x1, -1, -1, -1, 0x4, -1, -1, -1, 0x7, -1, -1, -1, 0xA, -1, -1, -1, - 0x1, -1, -1, -1, 0x4, -1, -1, -1, 0x7, -1, -1, -1, 0xA, -1, -1, -1, - 0x1, -1, -1, -1, 0x4, -1, -1, -1, 0x7, -1, -1, -1, 0xA, -1, -1, -1, 
- 0x1, -1, -1, -1, 0x4, -1, -1, -1, 0x7, -1, -1, -1, 0xA, -1, -1, -1); - - const __m512i K8_SUFFLE_BGR_TO_G010 = SIMD_MM512_SETR_EPI8( - 0x1, -1, 0xC, -1, 0x4, -1, 0xD, -1, 0x7, -1, 0xE, -1, 0xA, -1, 0xF, -1, - 0x1, -1, 0xC, -1, 0x4, -1, 0xD, -1, 0x7, -1, 0xE, -1, 0xA, -1, 0xF, -1, - 0x1, -1, 0xC, -1, 0x4, -1, 0xD, -1, 0x7, -1, 0xE, -1, 0xA, -1, 0xF, -1, - 0x1, -1, 0xC, -1, 0x4, -1, 0xD, -1, 0x7, -1, 0xE, -1, 0xA, -1, 0xF, -1); - - const __m512i K8_SHUFFLE_GRAY_TO_BGR0 = SIMD_MM512_SETR_EPI8( - 0x0, 0x0, 0x0, 0x1, 0x1, 0x1, 0x2, 0x2, 0x2, 0x3, 0x3, 0x3, 0x4, 0x4, 0x4, 0x5, - 0x5, 0x5, 0x6, 0x6, 0x6, 0x7, 0x7, 0x7, 0x8, 0x8, 0x8, 0x9, 0x9, 0x9, 0xA, 0xA, - 0xA, 0xB, 0xB, 0xB, 0xC, 0xC, 0xC, 0xD, 0xD, 0xD, 0xE, 0xE, 0xE, 0xF, 0xF, 0xF, - 0x0, 0x0, 0x0, 0x1, 0x1, 0x1, 0x2, 0x2, 0x2, 0x3, 0x3, 0x3, 0x4, 0x4, 0x4, 0x5); - const __m512i K8_SHUFFLE_GRAY_TO_BGR1 = SIMD_MM512_SETR_EPI8( - 0x5, 0x5, 0x6, 0x6, 0x6, 0x7, 0x7, 0x7, 0x8, 0x8, 0x8, 0x9, 0x9, 0x9, 0xA, 0xA, - 0xA, 0xB, 0xB, 0xB, 0xC, 0xC, 0xC, 0xD, 0xD, 0xD, 0xE, 0xE, 0xE, 0xF, 0xF, 0xF, - 0x0, 0x0, 0x0, 0x1, 0x1, 0x1, 0x2, 0x2, 0x2, 0x3, 0x3, 0x3, 0x4, 0x4, 0x4, 0x5, - 0x5, 0x5, 0x6, 0x6, 0x6, 0x7, 0x7, 0x7, 0x8, 0x8, 0x8, 0x9, 0x9, 0x9, 0xA, 0xA); - const __m512i K8_SHUFFLE_GRAY_TO_BGR2 = SIMD_MM512_SETR_EPI8( - 0xA, 0xB, 0xB, 0xB, 0xC, 0xC, 0xC, 0xD, 0xD, 0xD, 0xE, 0xE, 0xE, 0xF, 0xF, 0xF, - 0x0, 0x0, 0x0, 0x1, 0x1, 0x1, 0x2, 0x2, 0x2, 0x3, 0x3, 0x3, 0x4, 0x4, 0x4, 0x5, - 0x5, 0x5, 0x6, 0x6, 0x6, 0x7, 0x7, 0x7, 0x8, 0x8, 0x8, 0x9, 0x9, 0x9, 0xA, 0xA, - 0xA, 0xB, 0xB, 0xB, 0xC, 0xC, 0xC, 0xD, 0xD, 0xD, 0xE, 0xE, 0xE, 0xF, 0xF, 0xF); - - const __m512i K16_PERMUTE_FOR_HADD_0 = SIMD_MM512_SETR_EPI16( - 0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E, 0x10, 0x12, 0x14, 0x16, 0x18, 0x1A, 0x1C, 0x1E, - 0x20, 0x22, 0x24, 0x26, 0x28, 0x2A, 0x2C, 0x2E, 0x30, 0x32, 0x34, 0x36, 0x38, 0x3A, 0x3C, 0x3E); - const __m512i K16_PERMUTE_FOR_HADD_1 = SIMD_MM512_SETR_EPI16( - 0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F, 0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, 0x1D, 0x1F, - 0x21, 0x23, 0x25, 0x27, 0x29, 0x2B, 0x2D, 0x2F, 0x31, 0x33, 0x35, 0x37, 0x39, 0x3B, 0x3D, 0x3F); - - const __m512i K32_PERMUTE_FOR_TWO_UNPACK = SIMD_MM512_SETR_EPI32(0x0, 0x4, 0x8, 0xC, 0x1, 0x5, 0x9, 0xD, 0x2, 0x6, 0xA, 0xE, 0x3, 0x7, 0xB, 0xF); - - const __m512i K64_PERMUTE_FOR_PACK = SIMD_MM512_SETR_EPI64(0, 2, 4, 6, 1, 3, 5, 7); - const __m512i K64_PERMUTE_FOR_UNPACK = SIMD_MM512_SETR_EPI64(0, 4, 1, 5, 2, 6, 3, 7); - - const __m512i K32_PERMUTE_BGR_TO_BGRA = SIMD_MM512_SETR_EPI32(0x00, 0x01, 0x02, 0x10, 0x03, 0x04, 0x05, 0x11, 0x06, 0x07, 0x08, 0x12, 0x09, 0x0A, 0x0B, 0x13); - const __m512i K32_PERMUTE_BGR_TO_BGRA_0 = SIMD_MM512_SETR_EPI32(0x00, 0x01, 0x02, -1, 0x03, 0x04, 0x05, -1, 0x06, 0x07, 0x08, -1, 0x09, 0x0A, 0x0B, -1); - const __m512i K32_PERMUTE_BGR_TO_BGRA_1 = SIMD_MM512_SETR_EPI32(0x0C, 0x0D, 0x0E, -1, 0x0F, 0x10, 0x11, -1, 0x12, 0x13, 0x14, -1, 0x15, 0x16, 0x17, -1); - const __m512i K32_PERMUTE_BGR_TO_BGRA_2 = SIMD_MM512_SETR_EPI32(0x08, 0x09, 0x0A, -1, 0x0B, 0x0C, 0x0D, -1, 0x0E, 0x0F, 0x10, -1, 0x11, 0x12, 0x13, -1); - const __m512i K32_PERMUTE_BGR_TO_BGRA_3 = SIMD_MM512_SETR_EPI32(0x04, 0x05, 0x06, -1, 0x07, 0x08, 0x09, -1, 0x0A, 0x0B, 0x0C, -1, 0x0D, 0x0E, 0x0F, -1); - - const __m512i K8_SHUFFLE_BLUE_TO_BGR0 = SIMD_MM512_SETR_EPI8( - 0x0, -1, -1, 0x1, -1, -1, 0x2, -1, -1, 0x3, -1, -1, 0x4, -1, -1, 0x5, - -1, -1, 0x6, -1, -1, 0x7, -1, -1, 0x8, -1, -1, 0x9, -1, -1, 0xA, -1, - -1, 0xB, -1, -1, 0xC, -1, -1, 0xD, -1, -1, 0xE, -1, -1, 0xF, -1, -1, - 0x0, -1, -1, 0x1, -1, -1, 
0x2, -1, -1, 0x3, -1, -1, 0x4, -1, -1, 0x5); - const __m512i K8_SHUFFLE_BLUE_TO_BGR1 = SIMD_MM512_SETR_EPI8( - -1, -1, 0x6, -1, -1, 0x7, -1, -1, 0x8, -1, -1, 0x9, -1, -1, 0xA, -1, - -1, 0xB, -1, -1, 0xC, -1, -1, 0xD, -1, -1, 0xE, -1, -1, 0xF, -1, -1, - 0x0, -1, -1, 0x1, -1, -1, 0x2, -1, -1, 0x3, -1, -1, 0x4, -1, -1, 0x5, - -1, -1, 0x6, -1, -1, 0x7, -1, -1, 0x8, -1, -1, 0x9, -1, -1, 0xA, -1); - const __m512i K8_SHUFFLE_BLUE_TO_BGR2 = SIMD_MM512_SETR_EPI8( - -1, 0xB, -1, -1, 0xC, -1, -1, 0xD, -1, -1, 0xE, -1, -1, 0xF, -1, -1, - 0x0, -1, -1, 0x1, -1, -1, 0x2, -1, -1, 0x3, -1, -1, 0x4, -1, -1, 0x5, - -1, -1, 0x6, -1, -1, 0x7, -1, -1, 0x8, -1, -1, 0x9, -1, -1, 0xA, -1, - -1, 0xB, -1, -1, 0xC, -1, -1, 0xD, -1, -1, 0xE, -1, -1, 0xF, -1, -1); - - const __m512i K8_SHUFFLE_GREEN_TO_BGR0 = SIMD_MM512_SETR_EPI8( - -1, 0x0, -1, -1, 0x1, -1, -1, 0x2, -1, -1, 0x3, -1, -1, 0x4, -1, -1, - 0x5, -1, -1, 0x6, -1, -1, 0x7, -1, -1, 0x8, -1, -1, 0x9, -1, -1, 0xA, - -1, -1, 0xB, -1, -1, 0xC, -1, -1, 0xD, -1, -1, 0xE, -1, -1, 0xF, -1, - -1, 0x0, -1, -1, 0x1, -1, -1, 0x2, -1, -1, 0x3, -1, -1, 0x4, -1, -1); - const __m512i K8_SHUFFLE_GREEN_TO_BGR1 = SIMD_MM512_SETR_EPI8( - 0x5, -1, -1, 0x6, -1, -1, 0x7, -1, -1, 0x8, -1, -1, 0x9, -1, -1, 0xA, - -1, -1, 0xB, -1, -1, 0xC, -1, -1, 0xD, -1, -1, 0xE, -1, -1, 0xF, -1, - -1, 0x0, -1, -1, 0x1, -1, -1, 0x2, -1, -1, 0x3, -1, -1, 0x4, -1, -1, - 0x5, -1, -1, 0x6, -1, -1, 0x7, -1, -1, 0x8, -1, -1, 0x9, -1, -1, 0xA); - const __m512i K8_SHUFFLE_GREEN_TO_BGR2 = SIMD_MM512_SETR_EPI8( - -1, -1, 0xB, -1, -1, 0xC, -1, -1, 0xD, -1, -1, 0xE, -1, -1, 0xF, -1, - -1, 0x0, -1, -1, 0x1, -1, -1, 0x2, -1, -1, 0x3, -1, -1, 0x4, -1, -1, - 0x5, -1, -1, 0x6, -1, -1, 0x7, -1, -1, 0x8, -1, -1, 0x9, -1, -1, 0xA, - -1, -1, 0xB, -1, -1, 0xC, -1, -1, 0xD, -1, -1, 0xE, -1, -1, 0xF, -1); - - const __m512i K8_SHUFFLE_RED_TO_BGR0 = SIMD_MM512_SETR_EPI8( - -1, -1, 0x0, -1, -1, 0x1, -1, -1, 0x2, -1, -1, 0x3, -1, -1, 0x4, -1, - -1, 0x5, -1, -1, 0x6, -1, -1, 0x7, -1, -1, 0x8, -1, -1, 0x9, -1, -1, - 0xA, -1, -1, 0xB, -1, -1, 0xC, -1, -1, 0xD, -1, -1, 0xE, -1, -1, 0xF, - -1, -1, 0x0, -1, -1, 0x1, -1, -1, 0x2, -1, -1, 0x3, -1, -1, 0x4, -1); - const __m512i K8_SHUFFLE_RED_TO_BGR1 = SIMD_MM512_SETR_EPI8( - -1, 0x5, -1, -1, 0x6, -1, -1, 0x7, -1, -1, 0x8, -1, -1, 0x9, -1, -1, - 0xA, -1, -1, 0xB, -1, -1, 0xC, -1, -1, 0xD, -1, -1, 0xE, -1, -1, 0xF, - -1, -1, 0x0, -1, -1, 0x1, -1, -1, 0x2, -1, -1, 0x3, -1, -1, 0x4, -1, - -1, 0x5, -1, -1, 0x6, -1, -1, 0x7, -1, -1, 0x8, -1, -1, 0x9, -1, -1); - const __m512i K8_SHUFFLE_RED_TO_BGR2 = SIMD_MM512_SETR_EPI8( - 0xA, -1, -1, 0xB, -1, -1, 0xC, -1, -1, 0xD, -1, -1, 0xE, -1, -1, 0xF, - -1, -1, 0x0, -1, -1, 0x1, -1, -1, 0x2, -1, -1, 0x3, -1, -1, 0x4, -1, - -1, 0x5, -1, -1, 0x6, -1, -1, 0x7, -1, -1, 0x8, -1, -1, 0x9, -1, -1, - 0xA, -1, -1, 0xB, -1, -1, 0xC, -1, -1, 0xD, -1, -1, 0xE, -1, -1, 0xF); - - const __m512i K32_PERMUTE_COLOR_TO_BGR0 = SIMD_MM512_SETR_EPI32(0x0, 0x1, 0x2, 0x3, 0x0, 0x1, 0x2, 0x3, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7); - const __m512i K32_PERMUTE_COLOR_TO_BGR1 = SIMD_MM512_SETR_EPI32(0x4, 0x5, 0x6, 0x7, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xA, 0xB, 0x8, 0x9, 0xA, 0xB); - const __m512i K32_PERMUTE_COLOR_TO_BGR2 = SIMD_MM512_SETR_EPI32(0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, 0xC, 0xD, 0xE, 0xF, 0xC, 0xD, 0xE, 0xF); - } -#endif// SIMD_AVX512F_ENABLE - -#ifdef SIMD_AVX512VNNI_ENABLE - namespace Avx512vnni - { - using namespace Avx512bw; - } -#endif//SIMD_AVX512VNNI_ENABLE - -#ifdef SIMD_VMX_ENABLE - namespace Vmx - { - typedef __vector int8_t v128_s8; - typedef __vector 
uint8_t v128_u8; - typedef __vector int16_t v128_s16; - typedef __vector uint16_t v128_u16; - typedef __vector int32_t v128_s32; - typedef __vector uint32_t v128_u32; - typedef __vector float v128_f32; - - const size_t A = sizeof(v128_u8); - const size_t DA = 2 * A; - const size_t QA = 4 * A; - const size_t OA = 8 * A; - const size_t HA = A / 2; - - const size_t F = sizeof(v128_f32) / sizeof(float); - const size_t DF = 2 * F; - const size_t QF = 4 * F; - const size_t HF = F / 2; - - const v128_u8 K8_00 = SIMD_VEC_SET1_EPI8(0x00); - const v128_u8 K8_01 = SIMD_VEC_SET1_EPI8(0x01); - const v128_u8 K8_02 = SIMD_VEC_SET1_EPI8(0x02); - const v128_u8 K8_04 = SIMD_VEC_SET1_EPI8(0x04); - const v128_u8 K8_08 = SIMD_VEC_SET1_EPI8(0x08); - const v128_u8 K8_10 = SIMD_VEC_SET1_EPI8(0x10); - const v128_u8 K8_20 = SIMD_VEC_SET1_EPI8(0x20); - const v128_u8 K8_40 = SIMD_VEC_SET1_EPI8(0x40); - const v128_u8 K8_80 = SIMD_VEC_SET1_EPI8(0x80); - const v128_u8 K8_FF = SIMD_VEC_SET1_EPI8(0xFF); - - const v128_u16 K16_0000 = SIMD_VEC_SET1_EPI16(0x0000); - const v128_u16 K16_0001 = SIMD_VEC_SET1_EPI16(0x0001); - const v128_u16 K16_0002 = SIMD_VEC_SET1_EPI16(0x0002); - const v128_u16 K16_0003 = SIMD_VEC_SET1_EPI16(0x0003); - const v128_u16 K16_0004 = SIMD_VEC_SET1_EPI16(0x0004); - const v128_u16 K16_0005 = SIMD_VEC_SET1_EPI16(0x0005); - const v128_u16 K16_0006 = SIMD_VEC_SET1_EPI16(0x0006); - const v128_u16 K16_0008 = SIMD_VEC_SET1_EPI16(0x0008); - const v128_u16 K16_0010 = SIMD_VEC_SET1_EPI16(0x0010); - const v128_u16 K16_0020 = SIMD_VEC_SET1_EPI16(0x0020); - const v128_u16 K16_0080 = SIMD_VEC_SET1_EPI16(0x0080); - const v128_u16 K16_00FF = SIMD_VEC_SET1_EPI16(0x00FF); - const v128_u16 K16_FFFF = SIMD_VEC_SET1_EPI16(0xFFFF); - - const v128_u32 K32_00000000 = SIMD_VEC_SET1_EPI32(0x00000000); - - const v128_s16 K16_Y_ADJUST = SIMD_VEC_SET1_EPI16(Base::Y_ADJUST); - const v128_s16 K16_UV_ADJUST = SIMD_VEC_SET1_EPI16(Base::UV_ADJUST); - - const v128_s16 K16_YRGB_RT = SIMD_VEC_SET2_EPI16(Base::Y_TO_RGB_WEIGHT, Base::YUV_TO_BGR_ROUND_TERM); - const v128_s16 K16_VR_0 = SIMD_VEC_SET2_EPI16(Base::V_TO_RED_WEIGHT, 0); - const v128_s16 K16_UG_VG = SIMD_VEC_SET2_EPI16(Base::U_TO_GREEN_WEIGHT, Base::V_TO_GREEN_WEIGHT); - const v128_s16 K16_UB_0 = SIMD_VEC_SET2_EPI16(Base::U_TO_BLUE_WEIGHT, 0); - - const v128_u32 K32_YUV_TO_BGR_AVERAGING_SHIFT = SIMD_VEC_SET1_EPI32(Base::YUV_TO_BGR_AVERAGING_SHIFT); - - const v128_s16 K16_BY_RY = SIMD_VEC_SET2_EPI16(Base::BLUE_TO_Y_WEIGHT, Base::RED_TO_Y_WEIGHT); - const v128_s16 K16_GY_RT = SIMD_VEC_SET2_EPI16(Base::GREEN_TO_Y_WEIGHT, Base::BGR_TO_YUV_ROUND_TERM); - const v128_s16 K16_BU_RU = SIMD_VEC_SET2_EPI16(Base::BLUE_TO_U_WEIGHT, Base::RED_TO_U_WEIGHT); - const v128_s16 K16_GU_RT = SIMD_VEC_SET2_EPI16(Base::GREEN_TO_U_WEIGHT, Base::BGR_TO_YUV_ROUND_TERM); - const v128_s16 K16_BV_RV = SIMD_VEC_SET2_EPI16(Base::BLUE_TO_V_WEIGHT, Base::RED_TO_V_WEIGHT); - const v128_s16 K16_GV_RT = SIMD_VEC_SET2_EPI16(Base::GREEN_TO_V_WEIGHT, Base::BGR_TO_YUV_ROUND_TERM); - - const v128_u32 K32_BGR_TO_YUV_AVERAGING_SHIFT = SIMD_VEC_SET1_EPI32(Base::BGR_TO_YUV_AVERAGING_SHIFT); - - const v128_u16 K16_DIVISION_BY_9_FACTOR = SIMD_VEC_SET1_EPI16(Base::DIVISION_BY_9_FACTOR); - - //(0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF); - const v128_u8 K8_PERM_LOAD_BEFORE_FIRST_1 = SIMD_VEC_SETR_EPI8(0x0, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE); - const v128_u8 K8_PERM_LOAD_BEFORE_FIRST_2 = SIMD_VEC_SETR_EPI8(0x0, 0x1, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 
0x7, 0x8, 0x9, 0xA, 0xB, 0xC, 0xD); - const v128_u8 K8_PERM_LOAD_BEFORE_FIRST_3 = SIMD_VEC_SETR_EPI8(0x0, 0x1, 0x2, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xA, 0xB, 0xC); - const v128_u8 K8_PERM_LOAD_BEFORE_FIRST_4 = SIMD_VEC_SETR_EPI8(0x0, 0x1, 0x2, 0x3, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xA, 0xB); - - const v128_u8 K8_PERM_LOAD_AFTER_LAST_1 = SIMD_VEC_SETR_EPI8(0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, 0xF); - const v128_u8 K8_PERM_LOAD_AFTER_LAST_2 = SIMD_VEC_SETR_EPI8(0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, 0xE, 0xF); - const v128_u8 K8_PERM_LOAD_AFTER_LAST_3 = SIMD_VEC_SETR_EPI8(0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, 0xD, 0xE, 0xF); - const v128_u8 K8_PERM_LOAD_AFTER_LAST_4 = SIMD_VEC_SETR_EPI8(0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, 0xC, 0xD, 0xE, 0xF); - - const v128_u8 K8_PERM_UNPACK_LO_U8 = SIMD_VEC_SETR_EPI8(0x10, 0x00, 0x11, 0x01, 0x12, 0x02, 0x13, 0x03, 0x14, 0x04, 0x15, 0x05, 0x16, 0x06, 0x17, 0x07); - const v128_u8 K8_PERM_UNPACK_HI_U8 = SIMD_VEC_SETR_EPI8(0x18, 0x08, 0x19, 0x09, 0x1A, 0x0A, 0x1B, 0x0B, 0x1C, 0x0C, 0x1D, 0x0D, 0x1E, 0x0E, 0x1F, 0x0F); - - const v128_u8 K8_PERM_UNPACK_LO_U16 = SIMD_VEC_SETR_EPI8(0x10, 0x11, 0x00, 0x01, 0x12, 0x13, 0x02, 0x03, 0x14, 0x15, 0x04, 0x05, 0x16, 0x17, 0x06, 0x07); - const v128_u8 K8_PERM_UNPACK_HI_U16 = SIMD_VEC_SETR_EPI8(0x18, 0x19, 0x08, 0x09, 0x1A, 0x1B, 0x0A, 0x0B, 0x1C, 0x1D, 0x0C, 0x0D, 0x1E, 0x1F, 0x0E, 0x0F); - - const v128_u8 K8_PERM_MUL_HI_U16 = SIMD_VEC_SETR_EPI8(0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x08, 0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D); - - const v128_u8 K8_PERM_INTERLEAVE_BGR_00 = SIMD_VEC_SETR_EPI8(0x00, 0x10, 0x10, 0x01, 0x11, 0x11, 0x02, 0x12, 0x12, 0x03, 0x13, 0x13, 0x04, 0x14, 0x14, 0x05); - const v128_u8 K8_PERM_INTERLEAVE_BGR_01 = SIMD_VEC_SETR_EPI8(0x00, 0x01, 0x10, 0x03, 0x04, 0x11, 0x06, 0x07, 0x12, 0x09, 0x0A, 0x13, 0x0C, 0x0D, 0x14, 0x0F); - const v128_u8 K8_PERM_INTERLEAVE_BGR_10 = SIMD_VEC_SETR_EPI8(0x15, 0x15, 0x06, 0x16, 0x16, 0x07, 0x17, 0x17, 0x08, 0x18, 0x18, 0x09, 0x19, 0x19, 0x0A, 0x1A); - const v128_u8 K8_PERM_INTERLEAVE_BGR_11 = SIMD_VEC_SETR_EPI8(0x00, 0x15, 0x02, 0x03, 0x16, 0x05, 0x06, 0x17, 0x08, 0x09, 0x18, 0x0B, 0x0C, 0x19, 0x0E, 0x0F); - const v128_u8 K8_PERM_INTERLEAVE_BGR_20 = SIMD_VEC_SETR_EPI8(0x1A, 0x0B, 0x1B, 0x1B, 0x0C, 0x1C, 0x1C, 0x0D, 0x1D, 0x1D, 0x0E, 0x1E, 0x1E, 0x0F, 0x1F, 0x1F); - const v128_u8 K8_PERM_INTERLEAVE_BGR_21 = SIMD_VEC_SETR_EPI8(0x1A, 0x01, 0x02, 0x1B, 0x04, 0x05, 0x1C, 0x07, 0x08, 0x1D, 0x0A, 0x0B, 0x1E, 0x0D, 0x0E, 0x1F); - - const v128_u8 K8_PERM_BGR_TO_BLUE_0 = SIMD_VEC_SETR_EPI8(0x00, 0x03, 0x06, 0x09, 0x0C, 0x0F, 0x12, 0x15, 0x18, 0x1B, 0x1E, 0x00, 0x00, 0x00, 0x00, 0x00); - const v128_u8 K8_PERM_BGR_TO_BLUE_1 = SIMD_VEC_SETR_EPI8(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x11, 0x14, 0x17, 0x1A, 0x1D); - const v128_u8 K8_PERM_BGR_TO_GREEN_0 = SIMD_VEC_SETR_EPI8(0x01, 0x04, 0x07, 0x0A, 0x0D, 0x10, 0x13, 0x16, 0x19, 0x1C, 0x1F, 0x00, 0x00, 0x00, 0x00, 0x00); - const v128_u8 K8_PERM_BGR_TO_GREEN_1 = SIMD_VEC_SETR_EPI8(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x12, 0x15, 0x18, 0x1B, 0x1E); - const v128_u8 K8_PERM_BGR_TO_RED_0 = SIMD_VEC_SETR_EPI8(0x02, 0x05, 0x08, 0x0B, 0x0E, 0x11, 0x14, 0x17, 0x1A, 0x1D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); - const v128_u8 K8_PERM_BGR_TO_RED_1 = SIMD_VEC_SETR_EPI8(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 
0x10, 0x13, 0x16, 0x19, 0x1C, 0x1F); - - const v128_u8 K8_PERM_GRAY_TO_BGR_0 = SIMD_VEC_SETR_EPI8(0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x04, 0x04, 0x04, 0x05); - const v128_u8 K8_PERM_GRAY_TO_BGR_1 = SIMD_VEC_SETR_EPI8(0x05, 0x05, 0x06, 0x06, 0x06, 0x07, 0x07, 0x07, 0x08, 0x08, 0x08, 0x09, 0x09, 0x09, 0x0A, 0x0A); - const v128_u8 K8_PERM_GRAY_TO_BGR_2 = SIMD_VEC_SETR_EPI8(0x0A, 0x0B, 0x0B, 0x0B, 0x0C, 0x0C, 0x0C, 0x0D, 0x0D, 0x0D, 0x0E, 0x0E, 0x0E, 0x0F, 0x0F, 0x0F); - } -#endif//SIMD_VMX_ENABLE - -#ifdef SIMD_VSX_ENABLE - namespace Vsx - { - using namespace Vmx; - - const v128_f32 K_0_0f = SIMD_VEC_SET1_PS(0.0f); - } -#endif//SIMD_VSX_ENABLE - -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - const size_t A = sizeof(uint8x16_t); - const size_t DA = 2 * A; - const size_t QA = 4 * A; - const size_t OA = 8 * A; - const size_t HA = A / 2; - - const size_t F = sizeof(float32x4_t) / sizeof(float); - const size_t DF = 2 * F; - const size_t QF = 4 * F; - const size_t HF = F / 2; - - const uint8x16_t K8_00 = SIMD_VEC_SET1_EPI8(0x00); - const uint8x16_t K8_01 = SIMD_VEC_SET1_EPI8(0x01); - const uint8x16_t K8_02 = SIMD_VEC_SET1_EPI8(0x02); - const uint8x16_t K8_03 = SIMD_VEC_SET1_EPI8(0x03); - const uint8x16_t K8_04 = SIMD_VEC_SET1_EPI8(0x04); - const uint8x16_t K8_07 = SIMD_VEC_SET1_EPI8(0x07); - const uint8x16_t K8_08 = SIMD_VEC_SET1_EPI8(0x08); - const uint8x16_t K8_10 = SIMD_VEC_SET1_EPI8(0x10); - const uint8x16_t K8_20 = SIMD_VEC_SET1_EPI8(0x20); - const uint8x16_t K8_40 = SIMD_VEC_SET1_EPI8(0x40); - const uint8x16_t K8_80 = SIMD_VEC_SET1_EPI8(0x80); - const uint8x16_t K8_FF = SIMD_VEC_SET1_EPI8(0xFF); - - const uint16x8_t K16_0000 = SIMD_VEC_SET1_EPI16(0x0000); - const uint16x8_t K16_0001 = SIMD_VEC_SET1_EPI16(0x0001); - const uint16x8_t K16_0002 = SIMD_VEC_SET1_EPI16(0x0002); - const uint16x8_t K16_0003 = SIMD_VEC_SET1_EPI16(0x0003); - const uint16x8_t K16_0004 = SIMD_VEC_SET1_EPI16(0x0004); - const uint16x8_t K16_0005 = SIMD_VEC_SET1_EPI16(0x0005); - const uint16x8_t K16_0006 = SIMD_VEC_SET1_EPI16(0x0006); - const uint16x8_t K16_0008 = SIMD_VEC_SET1_EPI16(0x0008); - const uint16x8_t K16_0010 = SIMD_VEC_SET1_EPI16(0x0010); - const uint16x8_t K16_0020 = SIMD_VEC_SET1_EPI16(0x0020); - const uint16x8_t K16_0080 = SIMD_VEC_SET1_EPI16(0x0080); - const uint16x8_t K16_00FF = SIMD_VEC_SET1_EPI16(0x00FF); - const uint16x8_t K16_0101 = SIMD_VEC_SET1_EPI16(0x0101); - const uint16x8_t K16_0800 = SIMD_VEC_SET1_EPI16(0x0800); - const uint16x8_t K16_FF00 = SIMD_VEC_SET1_EPI16(0xFF00); - - const uint32x4_t K32_00000000 = SIMD_VEC_SET1_EPI32(0x00000000); - const uint32x4_t K32_00000001 = SIMD_VEC_SET1_EPI32(0x00000001); - const uint32x4_t K32_00000002 = SIMD_VEC_SET1_EPI32(0x00000002); - const uint32x4_t K32_00000003 = SIMD_VEC_SET1_EPI32(0x00000003); - const uint32x4_t K32_00000004 = SIMD_VEC_SET1_EPI32(0x00000004); - const uint32x4_t K32_00000005 = SIMD_VEC_SET1_EPI32(0x00000005); - const uint32x4_t K32_00000008 = SIMD_VEC_SET1_EPI32(0x00000008); - const uint32x4_t K32_00000010 = SIMD_VEC_SET1_EPI32(0x00000010); - const uint32x4_t K32_000000FF = SIMD_VEC_SET1_EPI32(0x000000FF); - const uint32x4_t K32_0000FFFF = SIMD_VEC_SET1_EPI32(0x0000FFFF); - const uint32x4_t K32_00010000 = SIMD_VEC_SET1_EPI32(0x00010000); - const uint32x4_t K32_01000000 = SIMD_VEC_SET1_EPI32(0x01000000); - const uint32x4_t K32_08080800 = SIMD_VEC_SET1_EPI32(0x08080800); - const uint32x4_t K32_FFFFFF00 = SIMD_VEC_SET1_EPI32(0xFFFFFF00); - const uint32x4_t K32_FFFFFFFF = SIMD_VEC_SET1_EPI32(0xFFFFFFFF); - 
const uint32x4_t K32_0123 = SIMD_VEC_SETR_EPI32(0, 1, 2, 3);
-
-        const uint64x2_t K64_0000000000000000 = SIMD_VEC_SET1_EPI64(0x0000000000000000);
-
-        const uint16x4_t K16_BLUE_TO_GRAY_WEIGHT = SIMD_VEC_SET1_PI16(Base::BLUE_TO_GRAY_WEIGHT);
-        const uint16x4_t K16_GREEN_TO_GRAY_WEIGHT = SIMD_VEC_SET1_PI16(Base::GREEN_TO_GRAY_WEIGHT);
-        const uint16x4_t K16_RED_TO_GRAY_WEIGHT = SIMD_VEC_SET1_PI16(Base::RED_TO_GRAY_WEIGHT);
-        const uint32x4_t K32_BGR_TO_GRAY_ROUND_TERM = SIMD_VEC_SET1_EPI32(Base::BGR_TO_GRAY_ROUND_TERM);
-
-        const int16x8_t K16_Y_ADJUST = SIMD_VEC_SET1_EPI16(Base::Y_ADJUST);
-        const int16x8_t K16_UV_ADJUST = SIMD_VEC_SET1_EPI16(Base::UV_ADJUST);
-
-        const int16x4_t K16_BLUE_TO_Y_WEIGHT = SIMD_VEC_SET1_PI16(Base::BLUE_TO_Y_WEIGHT);
-        const int16x4_t K16_GREEN_TO_Y_WEIGHT = SIMD_VEC_SET1_PI16(Base::GREEN_TO_Y_WEIGHT);
-        const int16x4_t K16_RED_TO_Y_WEIGHT = SIMD_VEC_SET1_PI16(Base::RED_TO_Y_WEIGHT);
-
-        const int16x4_t K16_BLUE_TO_U_WEIGHT = SIMD_VEC_SET1_PI16(Base::BLUE_TO_U_WEIGHT);
-        const int16x4_t K16_GREEN_TO_U_WEIGHT = SIMD_VEC_SET1_PI16(Base::GREEN_TO_U_WEIGHT);
-        const int16x4_t K16_RED_TO_U_WEIGHT = SIMD_VEC_SET1_PI16(Base::RED_TO_U_WEIGHT);
-
-        const int16x4_t K16_BLUE_TO_V_WEIGHT = SIMD_VEC_SET1_PI16(Base::BLUE_TO_V_WEIGHT);
-        const int16x4_t K16_GREEN_TO_V_WEIGHT = SIMD_VEC_SET1_PI16(Base::GREEN_TO_V_WEIGHT);
-        const int16x4_t K16_RED_TO_V_WEIGHT = SIMD_VEC_SET1_PI16(Base::RED_TO_V_WEIGHT);
-
-        const int32x4_t K32_BGR_TO_YUV_ROUND_TERM = SIMD_VEC_SET1_EPI32(Base::BGR_TO_YUV_ROUND_TERM);
-
-        const int16x4_t K16_Y_TO_RGB_WEIGHT = SIMD_VEC_SET1_PI16(Base::Y_TO_RGB_WEIGHT);
-
-        const int16x4_t K16_U_TO_BLUE_WEIGHT = SIMD_VEC_SET1_PI16(Base::U_TO_BLUE_WEIGHT);
-        const int16x4_t K16_U_TO_GREEN_WEIGHT = SIMD_VEC_SET1_PI16(Base::U_TO_GREEN_WEIGHT);
-
-        const int16x4_t K16_V_TO_GREEN_WEIGHT = SIMD_VEC_SET1_PI16(Base::V_TO_GREEN_WEIGHT);
-        const int16x4_t K16_V_TO_RED_WEIGHT = SIMD_VEC_SET1_PI16(Base::V_TO_RED_WEIGHT);
-
-        const int32x4_t K32_YUV_TO_BGR_ROUND_TERM = SIMD_VEC_SET1_EPI32(Base::YUV_TO_BGR_ROUND_TERM);
-    }
-#endif//SIMD_NEON_ENABLE
-
-#ifdef SIMD_MSA_ENABLE
-    namespace Msa
-    {
-        const size_t A = sizeof(v16u8);
-        const size_t DA = 2 * A;
-        const size_t QA = 4 * A;
-        const size_t OA = 8 * A;
-        const size_t HA = A / 2;
-    }
-#endif//SIMD_MSA_ENABLE
-}
-#endif//__SimdConst_h__
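The Base constants referenced throughout SimdConst.h encode the color-conversion weights in fixed point: each fractional weight is scaled by 2^SHIFT and rounded, and a half-unit round term restores correct rounding before the final shift. With BGR_TO_GRAY_AVERAGING_SHIFT = 14, for example, gray = (B*BLUE_TO_GRAY_WEIGHT + G*GREEN_TO_GRAY_WEIGHT + R*RED_TO_GRAY_WEIGHT + BGR_TO_GRAY_ROUND_TERM) >> 14. A standalone scalar sketch of that computation (the constants are recomputed locally rather than taken from the header):

#include <cstdint>

// Scalar sketch of the fixed-point BGR -> gray conversion implied by the
// Base:: weights (0.114 / 0.587 / 0.299 scaled by 2^14 with rounding).
uint8_t BgrToGray(uint8_t b, uint8_t g, uint8_t r)
{
    const int shift = 14;                           // BGR_TO_GRAY_AVERAGING_SHIFT
    const int round = 1 << (shift - 1);             // BGR_TO_GRAY_ROUND_TERM
    const int wb = int(0.114 * (1 << shift) + 0.5); // BLUE_TO_GRAY_WEIGHT
    const int wg = int(0.587 * (1 << shift) + 0.5); // GREEN_TO_GRAY_WEIGHT
    const int wr = int(0.299 * (1 << shift) + 0.5); // RED_TO_GRAY_WEIGHT
    return uint8_t((b * wb + g * wg + r * wr + round) >> shift);
}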
diff --git a/src/3rd/Simd/Simd/SimdContour.hpp b/src/3rd/Simd/Simd/SimdContour.hpp
deleted file mode 100644
index cb882b9c..00000000
--- a/src/3rd/Simd/Simd/SimdContour.hpp
+++ /dev/null
@@ -1,398 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2019 Yermalayeu Ihar,
-*               2014-2019 Antonenka Mikhail.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#ifndef __SimdContour_hpp__
-#define __SimdContour_hpp__
-
-#include "Simd/SimdLib.hpp"
-
-#include <vector>
-
-namespace Simd
-{
-    /*! @ingroup cpp_contour
-
-        \short ContourDetector structure provides detection of contours in an image.
-
-        Using example:
-        \verbatim
-        #include "Simd/SimdContour.hpp"
-        #include "Simd/SimdDrawing.hpp"
-
-        int main()
-        {
-            typedef Simd::ContourDetector<Simd::Allocator> ContourDetector;
-
-            ContourDetector::View image;
-            image.Load("../../data/image/face/lena.pgm");
-
-            ContourDetector contourDetector;
-
-            contourDetector.Init(image.Size());
-
-            ContourDetector::Contours contours;
-            contourDetector.Detect(image, contours);
-
-            for (size_t i = 0; i < contours.size(); ++i)
-            {
-                for (size_t j = 1; j < contours[i].size(); ++j)
-                    Simd::DrawLine(image, contours[i][j - 1], contours[i][j], uint8_t(255));
-            }
-            image.Save("result.pgm");
-
-            return 0;
-        }
-        \endverbatim
-    */
-    template <template <class> class A>
-    struct ContourDetector
-    {
-        typedef A<uint8_t> Allocator; /*!< Allocator type definition. */
-        typedef Simd::View<A> View; /*!< An image type definition. */
-        typedef Simd::Point<ptrdiff_t> Size; /*!< An image size type definition. */
-        typedef Simd::Point<ptrdiff_t> Point; /*!< A point type definition. */
-        typedef Rectangle<ptrdiff_t> Rect; /*!< A rectangle type definition. */
-        typedef std::vector<Point> Contour; /*!< A contour type definition. */
-        typedef std::vector<Contour> Contours; /*!< A vector of contours type definition. */
-
-        /*!
-            Prepares the ContourDetector structure to work with an image of the given size.
-
-            \param [in] size - a size of the input image.
-        */
-        void Init(Size size)
-        {
-            _m.Recreate(size, View::Int16);
-            _a.Recreate(size, View::Gray8);
-            _e.Recreate(size, View::Gray8);
-        }
-
-        /*!
-            Detects contours in the given image.
-
-            \param [in] src - an input image.
-            \param [out] contours - detected contours.
-            \param [in] mask - an image with the mask. It is used to restrict the region of contour detection. By default it is not used.
-            \param [in] indexMin - a minimal index in the mask. By default it is equal to 3.
-            \param [in] roi - a Region Of Interest. This is another way to restrict the region of contour detection. By default it is not used.
-            \param [in] gradientThreshold - a gradient threshold for contour detection. If this parameter is negative, it will be estimated automatically. By default it is equal to 40.
-            \param [in] anchorThreshold - an anchor threshold for contour detection. By default it is equal to 0.
-            \param [in] anchorScanInterval - the anchor scan interval. This parameter affects performance. By default it is equal to 2.
-            \param [in] minSegmentLength - the minimal length of a detected contour. By default it is equal to 2.
-            \return a result of this operation.
-        */
-        bool Detect(const View & src, Contours & contours, const View & mask = View(), uint8_t indexMin = 3, const Rect & roi = Rect(),
-            int gradientThreshold = 40, int anchorThreshold = 0, int anchorScanInterval = 2, int minSegmentLength = 2)
-        {
-            if (!Simd::Compatible(src, _a))
-                return false;
-            if (mask.format != View::None && !Simd::Compatible(mask, _a))
-                return false;
-
-            _roi = roi.Empty() ? Rect(src.Size()) : roi;
-            _roi.Intersection(Rect(src.Size()));
-
-            ContourMetrics(src, mask, indexMin);
-
-            if (gradientThreshold < 0)
-                gradientThreshold = EstimateAdaptiveThreshold();
-
-            ContourAnchors(anchorThreshold, anchorScanInterval);
-
-            PerformSmartRouting(contours, minSegmentLength, gradientThreshold * 2);
-
-            return true;
-        }
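Because every optional argument of Detect has a default, restricting detection to a sub-region or requesting automatic threshold estimation only requires overriding the relevant parameters. A hedged usage sketch in the style of the class example above (the ROI coordinates are illustrative, and image is assumed to be a loaded view as in that example):

typedef Simd::ContourDetector<Simd::Allocator> ContourDetector;

ContourDetector detector;
ContourDetector::Contours contours;
detector.Init(image.Size());
// No mask, default indexMin; detect only inside a central ROI and let a
// negative gradientThreshold trigger automatic estimation.
detector.Detect(image, contours, ContourDetector::View(), 3,
    ContourDetector::Rect(64, 64, 448, 448), -1);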
Rect(src.Size()) : roi; - _roi.Intersection(Rect(src.Size())); - - ContourMetrics(src, mask, indexMin); - - if (gradientThreshold < 0) - gradientThreshold = EstimateAdaptiveThreshold(); - - ContourAnchors(anchorThreshold, anchorScanInterval); - - PerformSmartRouting(contours, minSegmentLength, gradientThreshold * 2); - - return true; - } - - private: - - enum Direction - { - Unknown = -1, - Up, - Down, - Right, - Left, - }; - - struct Anchor - { - Point p; - uint16_t val; - Anchor(const Point & p_, uint16_t val_) - : p(p_) - , val(val_) - {} - - static SIMD_INLINE bool Compare(const Anchor & a, const Anchor & b) - { - return a.val > b.val; - } - }; - typedef std::vector Anchors; - - Rect _roi; - View _m; - View _a; - View _e; - Anchors _anchors; - - void ContourMetrics(const View & src, const View & mask, uint8_t indexMin) - { - if (mask.format == View::Gray8) - Simd::ContourMetrics(src.Region(_roi), mask.Region(_roi), indexMin, _m.Region(_roi).Ref()); - else - Simd::ContourMetrics(src.Region(_roi), _m.Region(_roi).Ref()); - } - - void ContourAnchors(int anchorThreshold, int anchorScanInterval) - { - Simd::ContourAnchors(_m.Region(_roi), anchorScanInterval, anchorThreshold, _a.Region(_roi).Ref()); - - _anchors.clear(); - for (ptrdiff_t row = _roi.Top() + 1; row < _roi.Bottom() - 1; row += anchorScanInterval) - { - const uint8_t * a = &At(_a, 0, row); - for (ptrdiff_t col = _roi.Left() + 1; col < _roi.Right() - 1; col += anchorScanInterval) - { - if (a[col]) - _anchors.push_back(Anchor(Point(col, row), At(_m, col, row) / 2)); - } - } - - std::stable_sort(_anchors.begin(), _anchors.end(), Anchor::Compare); - } - - void PerformSmartRouting(Contours & contours, size_t minSegmentLength, uint16_t gradientThreshold) - { - View e = _e.Region(_roi); - Rect frame(1, 1, e.width - 1, e.height - 1); - Simd::Fill(e.Region(frame).Ref(), 0); - Simd::FillFrame(e, frame, 255); - - for (size_t i = 0; i < _anchors.size(); i++) - { - const Anchor & anchor = _anchors[i]; - if (anchor.val > 0) - { - Contour contour; - contour.reserve(200); - SmartRoute(contours, contour, anchor.p.x, anchor.p.y, minSegmentLength, gradientThreshold, Unknown); - if (contour.size() > minSegmentLength) - contours.push_back(contour); - } - } - } - - void SmartRoute(Contours & contours, Contour & contour, ptrdiff_t x, ptrdiff_t y, size_t minSegmentLength, uint16_t gradientThreshold, Direction direction) - { - switch (direction) - { - case Unknown: - break; - case Left: - while (CheckMetricsForMagnitudeAndDirection(x, y, gradientThreshold, 1)) - { - if (At(_e, x, y) == 0) - { - At(_e, x, y) = 255; - if (!contour.empty() && (std::abs(contour.back().x - x) > 1 || std::abs(contour.back().y - y) > 1)) - { - if (contour.size() > minSegmentLength) - contours.push_back(contour); - contour.clear(); - } - contour.push_back(Point(x, y)); - } - if (CheckMetricsForMagnitudeMaximum(x - 1, y - 1, x - 1, y, x - 1, y + 1)) - { - x--; - y--; - } - else if (CheckMetricsForMagnitudeMaximum(x - 1, y + 1, x - 1, y, x - 1, y - 1)) - { - x--; - y++; - } - else - x--; - if (At(_e, x, y) != 0) - break; - } - break; - case Right: - while (CheckMetricsForMagnitudeAndDirection(x, y, gradientThreshold, 1)) - { - if (At(_e, x, y) == 0) - { - At(_e, x, y) = 255; - if (!contour.empty() && (std::abs(contour.back().x - x) > 1 || std::abs(contour.back().y - y) > 1)) - { - if (contour.size() > minSegmentLength) - contours.push_back(contour); - contour.clear(); - } - contour.push_back(Point(x, y)); - } - if (CheckMetricsForMagnitudeMaximum(x + 1, y - 1, x + 1, y, x + 1, y 
+ 1)) - { - x++; - y--; - } - else if (CheckMetricsForMagnitudeMaximum(x + 1, y + 1, x + 1, y, x + 1, y - 1)) - { - x++; - y++; - } - else - x++; - if (At(_e, x, y) != 0) - break; - } - break; - case Up: - while (CheckMetricsForMagnitudeAndDirection(x, y, gradientThreshold, 0)) - { - if (At(_e, x, y) == 0) - { - At(_e, x, y) = 255; - if (!contour.empty() && (std::abs(contour.back().x - x) > 1 || std::abs(contour.back().y - y) > 1)) - { - if (contour.size() > minSegmentLength) - contours.push_back(contour); - contour.clear(); - } - contour.push_back(Point(x, y)); - } - if (CheckMetricsForMagnitudeMaximum(x - 1, y - 1, x, y - 1, x + 1, y - 1)) - { - x--; - y--; - } - else if (CheckMetricsForMagnitudeMaximum(x + 1, y - 1, x, y - 1, x - 1, y - 1)) - { - x++; - y--; - } - else - y--; - if (At(_e, x, y) != 0) - break; - } - break; - case Down: - while (CheckMetricsForMagnitudeAndDirection(x, y, gradientThreshold, 0)) - { - if (At(_e, x, y) == 0) - { - At(_e, x, y) = 255; - if (!contour.empty() && (std::abs(contour.back().x - x) > 1 || std::abs(contour.back().y - y) > 1)) - { - if (contour.size() > minSegmentLength) - contours.push_back(contour); - contour.clear(); - } - contour.push_back(Point(x, y)); - } - if (CheckMetricsForMagnitudeMaximum(x + 1, y + 1, x, y + 1, x - 1, y + 1)) - { - x++; - y++; - } - else if (CheckMetricsForMagnitudeMaximum(x - 1, y + 1, x, y + 1, x + 1, y + 1)) - { - x--; - y++; - } - else - y++; - if (At(_e, x, y) != 0) - break; - } - break; - } - - if (At(_e, x, y) != 0 || At(_m, x, y) < gradientThreshold) - return; - - uint16_t d = At(_m, x, y) & 1; - if (d == 0) - { - SmartRoute(contours, contour, x, y, minSegmentLength, gradientThreshold, Up); - SmartRoute(contours, contour, x, y, minSegmentLength, gradientThreshold, Down); - } - else if (d == 1) - { - SmartRoute(contours, contour, x, y, minSegmentLength, gradientThreshold, Right); - SmartRoute(contours, contour, x, y, minSegmentLength, gradientThreshold, Left); - } - } - - bool CheckMetricsForMagnitudeAndDirection(ptrdiff_t x, ptrdiff_t y, int16_t gradientThreshold, int16_t direction) const - { - const uint16_t & m = At(_m, x, y); - return m >= gradientThreshold && (m & 1) == direction; - } - - bool CheckMetricsForMagnitudeMaximum(ptrdiff_t x0, ptrdiff_t y0, ptrdiff_t x1, ptrdiff_t y1, ptrdiff_t x2, ptrdiff_t y2) const - { - const uint16_t m0 = At(_m, x0, y0) | 1; - const uint16_t m1 = At(_m, x1, y1) | 1; - const uint16_t m2 = At(_m, x2, y2) | 1; - return m0 > m1 && m0 > m2; - } - - uint16_t EstimateAdaptiveThreshold() - { - Point roiSize = _roi.Size(); - Point mSize = _m.Size(); - if (roiSize.x >= mSize.x || roiSize.y >= mSize.y) - assert(0); - - View m = _m.Region(_roi); - Point size = m.Size(); - uint16_t value; - uint32_t sum = 0; - int count = 0; - for (ptrdiff_t i = 0; i < size.x; ++i) - { - for (ptrdiff_t j = 0; j < size.y; ++j) - { - value = At(m, i, j); - if (value) - { - count++; - value = value >> 1; - sum += value; - } - } - } - - uint16_t meanThreshold = (uint16_t)((double)sum / count); - return meanThreshold; - } - }; -} -#endif//__SimdContour_hpp__ diff --git a/src/3rd/Simd/Simd/SimdConversion.h b/src/3rd/Simd/Simd/SimdConversion.h deleted file mode 100644 index 68a360ca..00000000 --- a/src/3rd/Simd/Simd/SimdConversion.h +++ /dev/null @@ -1,1194 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar, -* 2014-2015 Antonenka Mikhail. 
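// EstimateAdaptiveThreshold() above averages the non-zero gradient
// magnitudes; each metric value stores the magnitude shifted left by one,
// with bit 0 holding the gradient direction (SmartRoute reads it via & 1).
// A minimal standalone sketch of the same estimate, assuming the metrics
// are a flat vector of uint16_t packed that way -- names here are
// illustrative, not library API:

#include <cstdint>
#include <cstdio>
#include <vector>

uint16_t EstimateMeanThreshold(const std::vector<uint16_t>& metrics)
{
    uint32_t sum = 0;
    uint32_t count = 0;
    for (uint16_t value : metrics)
    {
        if (value)
        {
            sum += value >> 1; // drop the direction bit, keep the magnitude
            ++count;
        }
    }
    return count ? (uint16_t)(sum / count) : 0; // guard added; the original assumes count > 0
}

int main()
{
    std::vector<uint16_t> metrics = { 0, 80, 81, 0, 200, 33 };
    std::printf("adaptive threshold = %u\n", EstimateMeanThreshold(metrics)); // 49
}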
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#ifndef __SimdConversion_h__ -#define __SimdConversion_h__ - -#include "Simd/SimdConst.h" -#include "Simd/SimdMath.h" -#include "Simd/SimdLoad.h" - -namespace Simd -{ - namespace Base - { - SIMD_INLINE int BgrToGray(int blue, int green, int red) - { - return (BLUE_TO_GRAY_WEIGHT*blue + GREEN_TO_GRAY_WEIGHT * green + - RED_TO_GRAY_WEIGHT * red + BGR_TO_GRAY_ROUND_TERM) >> BGR_TO_GRAY_AVERAGING_SHIFT; - } - - SIMD_INLINE int BgrToY(int blue, int green, int red) - { - return RestrictRange(((BLUE_TO_Y_WEIGHT*blue + GREEN_TO_Y_WEIGHT * green + RED_TO_Y_WEIGHT * red + - BGR_TO_YUV_ROUND_TERM) >> BGR_TO_YUV_AVERAGING_SHIFT) + Y_ADJUST); - } - - SIMD_INLINE int BgrToU(int blue, int green, int red) - { - return RestrictRange(((BLUE_TO_U_WEIGHT*blue + GREEN_TO_U_WEIGHT * green + RED_TO_U_WEIGHT * red + - BGR_TO_YUV_ROUND_TERM) >> BGR_TO_YUV_AVERAGING_SHIFT) + UV_ADJUST); - } - - SIMD_INLINE int BgrToV(int blue, int green, int red) - { - return RestrictRange(((BLUE_TO_V_WEIGHT*blue + GREEN_TO_V_WEIGHT * green + RED_TO_V_WEIGHT * red + - BGR_TO_YUV_ROUND_TERM) >> BGR_TO_YUV_AVERAGING_SHIFT) + UV_ADJUST); - } - - SIMD_INLINE int YuvToBlue(int y, int u) - { - return RestrictRange((Y_TO_RGB_WEIGHT*(y - Y_ADJUST) + U_TO_BLUE_WEIGHT * (u - UV_ADJUST) + - YUV_TO_BGR_ROUND_TERM) >> YUV_TO_BGR_AVERAGING_SHIFT); - } - - SIMD_INLINE int YuvToGreen(int y, int u, int v) - { - return RestrictRange((Y_TO_RGB_WEIGHT*(y - Y_ADJUST) + U_TO_GREEN_WEIGHT * (u - UV_ADJUST) + - V_TO_GREEN_WEIGHT * (v - UV_ADJUST) + YUV_TO_BGR_ROUND_TERM) >> YUV_TO_BGR_AVERAGING_SHIFT); - } - - SIMD_INLINE int YuvToRed(int y, int v) - { - return RestrictRange((Y_TO_RGB_WEIGHT*(y - Y_ADJUST) + V_TO_RED_WEIGHT * (v - UV_ADJUST) + - YUV_TO_BGR_ROUND_TERM) >> YUV_TO_BGR_AVERAGING_SHIFT); - } - - SIMD_INLINE void YuvToBgr(int y, int u, int v, uint8_t * bgr) - { - bgr[0] = YuvToBlue(y, u); - bgr[1] = YuvToGreen(y, u, v); - bgr[2] = YuvToRed(y, v); - } - - SIMD_INLINE void YuvToBgra(int y, int u, int v, int alpha, uint8_t * bgra) - { - bgra[0] = YuvToBlue(y, u); - bgra[1] = YuvToGreen(y, u, v); - bgra[2] = YuvToRed(y, v); - bgra[3] = alpha; - } - - SIMD_INLINE void YuvToRgb(int y, int u, int v, uint8_t* rgb) - { - rgb[0] = YuvToRed(y, v); - rgb[1] = YuvToGreen(y, u, v); - rgb[2] = YuvToBlue(y, u); - } - - SIMD_INLINE void BgrToHsv(int blue, int green, int red, uint8_t * hsv) - { - int max = Max(red, Max(green, blue)); - int min = Min(red, Min(green, blue)); - int range = max - min; - - 
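// The Base converters above (BgrToGray, BgrToY/U/V, YuvToBlue/Green/Red)
// all share one fixed-point pattern: weights pre-scaled by 2^shift, a round
// term of 2^(shift - 1), an arithmetic shift back down, and a clamp. The
// real weights and shifts live in SimdConst.h and are not part of this
// diff; the sketch below assumes BT.601 luma weights and a 13-bit scale
// purely for illustration.

#include <cstdint>
#include <cstdio>

const int SHIFT = 13;                     // assumed averaging shift
const int ROUND = 1 << (SHIFT - 1);       // rounding term
const int WB = 934, WG = 4809, WR = 2449; // 0.114, 0.587, 0.299 scaled by 2^13

static inline uint8_t Clamp(int v) { return v < 0 ? 0 : (v > 255 ? 255 : (uint8_t)v); }

static inline uint8_t BgrToGraySketch(int b, int g, int r)
{
    return Clamp((WB * b + WG * g + WR * r + ROUND) >> SHIFT);
}

int main()
{
    // The weights sum to exactly 2^13, so a neutral pixel maps to itself.
    std::printf("gray(128,128,128) = %d\n", BgrToGraySketch(128, 128, 128)); // 128
    std::printf("gray(255,255,255) = %d\n", BgrToGraySketch(255, 255, 255)); // 255
}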
if (range) - { - int dividend; - - if (red == max) - dividend = green - blue + 6 * range; - else if (green == max) - dividend = blue - red + 2 * range; - else - dividend = red - green + 4 * range; - - hsv[0] = int(KF_255_DIV_6*dividend / range); - } - else - hsv[0] = 0; - - hsv[1] = max ? 255 * range / max : 0; - - hsv[2] = max; - } - - SIMD_INLINE void YuvToHsv(int y, int u, int v, uint8_t * hsv) - { - int blue = YuvToBlue(y, u); - int green = YuvToGreen(y, u, v); - int red = YuvToRed(y, v); - BgrToHsv(blue, green, red, hsv); - } - - SIMD_INLINE void BgrToHsl(int blue, int green, int red, uint8_t * hsl) - { - int max = Max(red, Max(green, blue)); - int min = Min(red, Min(green, blue)); - int range = max - min; - int sum = max + min; - - if (range) - { - int dividend; - - if (red == max) - dividend = green - blue + 6 * range; - else if (green == max) - dividend = blue - red + 2 * range; - else - dividend = red - green + 4 * range; - - hsl[0] = int(KF_255_DIV_6*dividend / range); - } - else - hsl[0] = 0; - - if (sum == 0 || sum == 510) - hsl[1] = 0; - else if (sum <= 255) - hsl[1] = range * 255 / sum; - else - hsl[1] = range * 255 / (510 - sum); - - hsl[2] = sum / 2; - } - - SIMD_INLINE void YuvToHsl(int y, int u, int v, uint8_t * hsl) - { - int blue = YuvToBlue(y, u); - int green = YuvToGreen(y, u, v); - int red = YuvToRed(y, v); - BgrToHsl(blue, green, red, hsl); - } - - SIMD_INLINE void HsvToBgr(int hue, int saturation, int value, uint8_t * bgr) - { - if (saturation) - { - int sector = hue * 6 / 255; - int min = (255 - saturation)*value / 255; - int delta = (value - min)*(hue * 6 - sector * 255) / 255; - - switch (sector) - { - case 0: - bgr[0] = min; - bgr[1] = min + delta; - bgr[2] = value; - break; - case 1: - bgr[0] = min; - bgr[1] = value; - bgr[2] = value - delta; - break; - case 2: - bgr[0] = min + delta; - bgr[1] = value; - bgr[2] = min; - break; - case 3: - bgr[0] = value; - bgr[1] = value - delta; - bgr[2] = min; - break; - case 4: - bgr[0] = value; - bgr[1] = min; - bgr[2] = min + delta; - break; - case 5: - bgr[0] = value - delta; - bgr[1] = min; - bgr[2] = value; - break; - default: - assert(0); - } - } - else - { - bgr[0] = value; - bgr[1] = value; - bgr[2] = value; - } - } - - SIMD_INLINE void HslToBgr(int hue, int saturation, int lightness, uint8_t * bgr) - { - if (saturation) - { - int sector = hue * 6 / 255; - int max; - if (lightness <= 128) - max = lightness * (255 + saturation) / 255; - else - max = ((255 - lightness)*saturation + lightness * 255) / 255; - int min = (255 - saturation)*max / 255; - int delta = (max - min)*(hue * 6 - sector * 255) / 255; - - switch (sector) - { - case 0: - bgr[0] = min; - bgr[1] = min + delta; - bgr[2] = max; - break; - case 1: - bgr[0] = min; - bgr[1] = max; - bgr[2] = max - delta; - break; - case 2: - bgr[0] = min + delta; - bgr[1] = max; - bgr[2] = min; - break; - case 3: - bgr[0] = max; - bgr[1] = max - delta; - bgr[2] = min; - break; - case 4: - bgr[0] = max; - bgr[1] = min; - bgr[2] = min + delta; - break; - case 5: - bgr[0] = max - delta; - bgr[1] = min; - bgr[2] = max; - break; - default: - assert(0); - } - } - else - { - bgr[0] = lightness; - bgr[1] = lightness; - bgr[2] = lightness; - } - } - } - -#ifdef SIMD_SSE2_ENABLE - namespace Sse2 - { - SIMD_INLINE __m128i AdjustY16(__m128i y16) - { - return _mm_subs_epi16(y16, K16_Y_ADJUST); - } - - SIMD_INLINE __m128i AdjustUV16(__m128i uv16) - { - return _mm_subs_epi16(uv16, K16_UV_ADJUST); - } - - SIMD_INLINE __m128i AdjustedYuvToRed32(__m128i y16_1, __m128i v16_0) - { - return 
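// BgrToHsv/HsvToBgr above use a six-sector integer hue: hue * 6 / 255 picks
// the sector and delta interpolates between min and max within it. Below is
// a self-contained round trip with the same formulas; KF_255_DIV_6 is
// assumed to equal 255.0 / 6.0 (its definition is outside this diff), and
// the round trip is only approximate because of the integer arithmetic.

#include <algorithm>
#include <cstdint>
#include <cstdio>

const double KF_255_DIV_6 = 255.0 / 6.0; // assumed value

static void BgrToHsvSketch(int b, int g, int r, uint8_t hsv[3])
{
    int max = std::max(r, std::max(g, b));
    int min = std::min(r, std::min(g, b));
    int range = max - min;
    if (range)
    {
        int dividend;
        if (r == max)      dividend = g - b + 6 * range;
        else if (g == max) dividend = b - r + 2 * range;
        else               dividend = r - g + 4 * range;
        hsv[0] = (uint8_t)(int)(KF_255_DIV_6 * dividend / range); // hue wraps mod 256
    }
    else
        hsv[0] = 0;
    hsv[1] = (uint8_t)(max ? 255 * range / max : 0);
    hsv[2] = (uint8_t)max;
}

static void HsvToBgrSketch(int hue, int sat, int val, uint8_t bgr[3])
{
    if (!sat) { bgr[0] = bgr[1] = bgr[2] = (uint8_t)val; return; }
    int sector = hue * 6 / 255;
    int min = (255 - sat) * val / 255;
    int delta = (val - min) * (hue * 6 - sector * 255) / 255;
    switch (sector)
    {
    case 0: bgr[0] = min;         bgr[1] = min + delta; bgr[2] = val;         break;
    case 1: bgr[0] = min;         bgr[1] = val;         bgr[2] = val - delta; break;
    case 2: bgr[0] = min + delta; bgr[1] = val;         bgr[2] = min;         break;
    case 3: bgr[0] = val;         bgr[1] = val - delta; bgr[2] = min;         break;
    case 4: bgr[0] = val;         bgr[1] = min;         bgr[2] = min + delta; break;
    default: bgr[0] = val - delta; bgr[1] = min;        bgr[2] = val;         break; // sector 5
    }
}

int main()
{
    uint8_t hsv[3], bgr[3];
    BgrToHsvSketch(40, 90, 200, hsv);
    HsvToBgrSketch(hsv[0], hsv[1], hsv[2], bgr);
    std::printf("(40, 90, 200) -> hsv(%u, %u, %u) -> (%u, %u, %u)\n",
        hsv[0], hsv[1], hsv[2], bgr[0], bgr[1], bgr[2]); // -> hsv(12, 204, 200) -> (40, 85, 200)
}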
_mm_srai_epi32(_mm_add_epi32(_mm_madd_epi16(y16_1, K16_YRGB_RT), - _mm_madd_epi16(v16_0, K16_VR_0)), Base::YUV_TO_BGR_AVERAGING_SHIFT); - } - - SIMD_INLINE __m128i AdjustedYuvToRed16(__m128i y16, __m128i v16) - { - return SaturateI16ToU8(_mm_packs_epi32( - AdjustedYuvToRed32(_mm_unpacklo_epi16(y16, K16_0001), _mm_unpacklo_epi16(v16, K_ZERO)), - AdjustedYuvToRed32(_mm_unpackhi_epi16(y16, K16_0001), _mm_unpackhi_epi16(v16, K_ZERO)))); - } - - SIMD_INLINE __m128i AdjustedYuvToGreen32(__m128i y16_1, __m128i u16_v16) - { - return _mm_srai_epi32(_mm_add_epi32(_mm_madd_epi16(y16_1, K16_YRGB_RT), - _mm_madd_epi16(u16_v16, K16_UG_VG)), Base::YUV_TO_BGR_AVERAGING_SHIFT); - } - - SIMD_INLINE __m128i AdjustedYuvToGreen16(__m128i y16, __m128i u16, __m128i v16) - { - return SaturateI16ToU8(_mm_packs_epi32( - AdjustedYuvToGreen32(_mm_unpacklo_epi16(y16, K16_0001), _mm_unpacklo_epi16(u16, v16)), - AdjustedYuvToGreen32(_mm_unpackhi_epi16(y16, K16_0001), _mm_unpackhi_epi16(u16, v16)))); - } - - SIMD_INLINE __m128i AdjustedYuvToBlue32(__m128i y16_1, __m128i u16_0) - { - return _mm_srai_epi32(_mm_add_epi32(_mm_madd_epi16(y16_1, K16_YRGB_RT), - _mm_madd_epi16(u16_0, K16_UB_0)), Base::YUV_TO_BGR_AVERAGING_SHIFT); - } - - SIMD_INLINE __m128i AdjustedYuvToBlue16(__m128i y16, __m128i u16) - { - return SaturateI16ToU8(_mm_packs_epi32( - AdjustedYuvToBlue32(_mm_unpacklo_epi16(y16, K16_0001), _mm_unpacklo_epi16(u16, K_ZERO)), - AdjustedYuvToBlue32(_mm_unpackhi_epi16(y16, K16_0001), _mm_unpackhi_epi16(u16, K_ZERO)))); - } - - SIMD_INLINE __m128i YuvToRed(__m128i y, __m128i v) - { - __m128i lo = AdjustedYuvToRed16( - AdjustY16(_mm_unpacklo_epi8(y, K_ZERO)), - AdjustUV16(_mm_unpacklo_epi8(v, K_ZERO))); - __m128i hi = AdjustedYuvToRed16( - AdjustY16(_mm_unpackhi_epi8(y, K_ZERO)), - AdjustUV16(_mm_unpackhi_epi8(v, K_ZERO))); - return _mm_packus_epi16(lo, hi); - } - - SIMD_INLINE __m128i YuvToGreen(__m128i y, __m128i u, __m128i v) - { - __m128i lo = AdjustedYuvToGreen16( - AdjustY16(_mm_unpacklo_epi8(y, K_ZERO)), - AdjustUV16(_mm_unpacklo_epi8(u, K_ZERO)), - AdjustUV16(_mm_unpacklo_epi8(v, K_ZERO))); - __m128i hi = AdjustedYuvToGreen16( - AdjustY16(_mm_unpackhi_epi8(y, K_ZERO)), - AdjustUV16(_mm_unpackhi_epi8(u, K_ZERO)), - AdjustUV16(_mm_unpackhi_epi8(v, K_ZERO))); - return _mm_packus_epi16(lo, hi); - } - - SIMD_INLINE __m128i YuvToBlue(__m128i y, __m128i u) - { - __m128i lo = AdjustedYuvToBlue16( - AdjustY16(_mm_unpacklo_epi8(y, K_ZERO)), - AdjustUV16(_mm_unpacklo_epi8(u, K_ZERO))); - __m128i hi = AdjustedYuvToBlue16( - AdjustY16(_mm_unpackhi_epi8(y, K_ZERO)), - AdjustUV16(_mm_unpackhi_epi8(u, K_ZERO))); - return _mm_packus_epi16(lo, hi); - } - - SIMD_INLINE __m128i BgrToY32(__m128i b16_r16, __m128i g16_1) - { - return _mm_srai_epi32(_mm_add_epi32(_mm_madd_epi16(b16_r16, K16_BY_RY), - _mm_madd_epi16(g16_1, K16_GY_RT)), Base::BGR_TO_YUV_AVERAGING_SHIFT); - } - - SIMD_INLINE __m128i BgrToY16(__m128i b16, __m128i g16, __m128i r16) - { - return SaturateI16ToU8(_mm_add_epi16(K16_Y_ADJUST, _mm_packs_epi32( - BgrToY32(_mm_unpacklo_epi16(b16, r16), _mm_unpacklo_epi16(g16, K16_0001)), - BgrToY32(_mm_unpackhi_epi16(b16, r16), _mm_unpackhi_epi16(g16, K16_0001))))); - } - - SIMD_INLINE __m128i BgrToY8(__m128i b8, __m128i g8, __m128i r8) - { - return _mm_packus_epi16( - BgrToY16(_mm_unpacklo_epi8(b8, K_ZERO), _mm_unpacklo_epi8(g8, K_ZERO), _mm_unpacklo_epi8(r8, K_ZERO)), - BgrToY16(_mm_unpackhi_epi8(b8, K_ZERO), _mm_unpackhi_epi8(g8, K_ZERO), _mm_unpackhi_epi8(r8, K_ZERO))); - } - - SIMD_INLINE __m128i BgrToU32(__m128i b16_r16, 
__m128i g16_1) - { - return _mm_srai_epi32(_mm_add_epi32(_mm_madd_epi16(b16_r16, K16_BU_RU), - _mm_madd_epi16(g16_1, K16_GU_RT)), Base::BGR_TO_YUV_AVERAGING_SHIFT); - } - - SIMD_INLINE __m128i BgrToU16(__m128i b16, __m128i g16, __m128i r16) - { - return SaturateI16ToU8(_mm_add_epi16(K16_UV_ADJUST, _mm_packs_epi32( - BgrToU32(_mm_unpacklo_epi16(b16, r16), _mm_unpacklo_epi16(g16, K16_0001)), - BgrToU32(_mm_unpackhi_epi16(b16, r16), _mm_unpackhi_epi16(g16, K16_0001))))); - } - - SIMD_INLINE __m128i BgrToU8(__m128i b8, __m128i g8, __m128i r8) - { - return _mm_packus_epi16( - BgrToU16(_mm_unpacklo_epi8(b8, K_ZERO), _mm_unpacklo_epi8(g8, K_ZERO), _mm_unpacklo_epi8(r8, K_ZERO)), - BgrToU16(_mm_unpackhi_epi8(b8, K_ZERO), _mm_unpackhi_epi8(g8, K_ZERO), _mm_unpackhi_epi8(r8, K_ZERO))); - } - - SIMD_INLINE __m128i BgrToV32(__m128i b16_r16, __m128i g16_1) - { - return _mm_srai_epi32(_mm_add_epi32(_mm_madd_epi16(b16_r16, K16_BV_RV), - _mm_madd_epi16(g16_1, K16_GV_RT)), Base::BGR_TO_YUV_AVERAGING_SHIFT); - } - - SIMD_INLINE __m128i BgrToV16(__m128i b16, __m128i g16, __m128i r16) - { - return SaturateI16ToU8(_mm_add_epi16(K16_UV_ADJUST, _mm_packs_epi32( - BgrToV32(_mm_unpacklo_epi16(b16, r16), _mm_unpacklo_epi16(g16, K16_0001)), - BgrToV32(_mm_unpackhi_epi16(b16, r16), _mm_unpackhi_epi16(g16, K16_0001))))); - } - - SIMD_INLINE __m128i BgrToV8(__m128i b8, __m128i g8, __m128i r8) - { - return _mm_packus_epi16( - BgrToV16(_mm_unpacklo_epi8(b8, K_ZERO), _mm_unpacklo_epi8(g8, K_ZERO), _mm_unpacklo_epi8(r8, K_ZERO)), - BgrToV16(_mm_unpackhi_epi8(b8, K_ZERO), _mm_unpackhi_epi8(g8, K_ZERO), _mm_unpackhi_epi8(r8, K_ZERO))); - } - } -#endif// SIMD_SSE2_ENABLE - -#ifdef SIMD_SSSE3_ENABLE - namespace Ssse3 - { - template __m128i InterleaveBgr(__m128i blue, __m128i green, __m128i red); - - template<> SIMD_INLINE __m128i InterleaveBgr<0>(__m128i blue, __m128i green, __m128i red) - { - return - _mm_or_si128(_mm_shuffle_epi8(blue, K8_SHUFFLE_BLUE_TO_BGR0), - _mm_or_si128(_mm_shuffle_epi8(green, K8_SHUFFLE_GREEN_TO_BGR0), - _mm_shuffle_epi8(red, K8_SHUFFLE_RED_TO_BGR0))); - } - - template<> SIMD_INLINE __m128i InterleaveBgr<1>(__m128i blue, __m128i green, __m128i red) - { - return - _mm_or_si128(_mm_shuffle_epi8(blue, K8_SHUFFLE_BLUE_TO_BGR1), - _mm_or_si128(_mm_shuffle_epi8(green, K8_SHUFFLE_GREEN_TO_BGR1), - _mm_shuffle_epi8(red, K8_SHUFFLE_RED_TO_BGR1))); - } - - template<> SIMD_INLINE __m128i InterleaveBgr<2>(__m128i blue, __m128i green, __m128i red) - { - return - _mm_or_si128(_mm_shuffle_epi8(blue, K8_SHUFFLE_BLUE_TO_BGR2), - _mm_or_si128(_mm_shuffle_epi8(green, K8_SHUFFLE_GREEN_TO_BGR2), - _mm_shuffle_epi8(red, K8_SHUFFLE_RED_TO_BGR2))); - } - - SIMD_INLINE __m128i BgrToBlue(__m128i bgr[3]) - { - return - _mm_or_si128(_mm_shuffle_epi8(bgr[0], K8_SHUFFLE_BGR0_TO_BLUE), - _mm_or_si128(_mm_shuffle_epi8(bgr[1], K8_SHUFFLE_BGR1_TO_BLUE), - _mm_shuffle_epi8(bgr[2], K8_SHUFFLE_BGR2_TO_BLUE))); - } - - SIMD_INLINE __m128i BgrToGreen(__m128i bgr[3]) - { - return - _mm_or_si128(_mm_shuffle_epi8(bgr[0], K8_SHUFFLE_BGR0_TO_GREEN), - _mm_or_si128(_mm_shuffle_epi8(bgr[1], K8_SHUFFLE_BGR1_TO_GREEN), - _mm_shuffle_epi8(bgr[2], K8_SHUFFLE_BGR2_TO_GREEN))); - } - - SIMD_INLINE __m128i BgrToRed(__m128i bgr[3]) - { - return - _mm_or_si128(_mm_shuffle_epi8(bgr[0], K8_SHUFFLE_BGR0_TO_RED), - _mm_or_si128(_mm_shuffle_epi8(bgr[1], K8_SHUFFLE_BGR1_TO_RED), - _mm_shuffle_epi8(bgr[2], K8_SHUFFLE_BGR2_TO_RED))); - } - } -#endif//SIMD_SSSE3_ENABLE - -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - SIMD_INLINE __m256i AdjustY16(__m256i y16) - { 
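// The 32-bit kernels above (BgrToY32/U32/V32 and the Yuv* counterparts)
// lean on one trick: interleave two channels per 32-bit lane so a single
// _mm_madd_epi16 applies two weights at once, and pair the third channel
// with the constant 1 so the same madd folds in the rounding term
// (e.g. K16_GY_RT holds (green weight, round term) pairs). A minimal
// sketch of the two-weight half with made-up weights:

#include <emmintrin.h> // SSE2
#include <cstdint>
#include <cstdio>

int main()
{
    __m128i b = _mm_setr_epi16(10, 20, 30, 40, 0, 0, 0, 0);
    __m128i r = _mm_setr_epi16(1, 2, 3, 4, 0, 0, 0, 0);
    __m128i b_r = _mm_unpacklo_epi16(b, r);      // b0,r0,b1,r1,b2,r2,b3,r3

    const int WB = 3, WR = 5;                    // illustrative weights
    __m128i w = _mm_set1_epi32((WR << 16) | WB); // (WB, WR) per 32-bit lane

    __m128i sum = _mm_madd_epi16(b_r, w);        // WB*b + WR*r in each lane

    int32_t out[4];
    _mm_storeu_si128((__m128i*)out, sum);
    std::printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]); // 35 70 105 140
}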
- return _mm256_subs_epi16(y16, K16_Y_ADJUST); - } - - SIMD_INLINE __m256i AdjustUV16(__m256i uv16) - { - return _mm256_subs_epi16(uv16, K16_UV_ADJUST); - } - - SIMD_INLINE __m256i AdjustedYuvToRed32(__m256i y16_1, __m256i v16_0) - { - return _mm256_srai_epi32(_mm256_add_epi32(_mm256_madd_epi16(y16_1, K16_YRGB_RT), - _mm256_madd_epi16(v16_0, K16_VR_0)), Base::YUV_TO_BGR_AVERAGING_SHIFT); - } - - SIMD_INLINE __m256i AdjustedYuvToRed16(__m256i y16, __m256i v16) - { - return SaturateI16ToU8(_mm256_packs_epi32( - AdjustedYuvToRed32(_mm256_unpacklo_epi16(y16, K16_0001), _mm256_unpacklo_epi16(v16, K_ZERO)), - AdjustedYuvToRed32(_mm256_unpackhi_epi16(y16, K16_0001), _mm256_unpackhi_epi16(v16, K_ZERO)))); - } - - SIMD_INLINE __m256i AdjustedYuvToGreen32(__m256i y16_1, __m256i u16_v16) - { - return _mm256_srai_epi32(_mm256_add_epi32(_mm256_madd_epi16(y16_1, K16_YRGB_RT), - _mm256_madd_epi16(u16_v16, K16_UG_VG)), Base::YUV_TO_BGR_AVERAGING_SHIFT); - } - - SIMD_INLINE __m256i AdjustedYuvToGreen16(__m256i y16, __m256i u16, __m256i v16) - { - return SaturateI16ToU8(_mm256_packs_epi32( - AdjustedYuvToGreen32(_mm256_unpacklo_epi16(y16, K16_0001), _mm256_unpacklo_epi16(u16, v16)), - AdjustedYuvToGreen32(_mm256_unpackhi_epi16(y16, K16_0001), _mm256_unpackhi_epi16(u16, v16)))); - } - - SIMD_INLINE __m256i AdjustedYuvToBlue32(__m256i y16_1, __m256i u16_0) - { - return _mm256_srai_epi32(_mm256_add_epi32(_mm256_madd_epi16(y16_1, K16_YRGB_RT), - _mm256_madd_epi16(u16_0, K16_UB_0)), Base::YUV_TO_BGR_AVERAGING_SHIFT); - } - - SIMD_INLINE __m256i AdjustedYuvToBlue16(__m256i y16, __m256i u16) - { - return SaturateI16ToU8(_mm256_packs_epi32( - AdjustedYuvToBlue32(_mm256_unpacklo_epi16(y16, K16_0001), _mm256_unpacklo_epi16(u16, K_ZERO)), - AdjustedYuvToBlue32(_mm256_unpackhi_epi16(y16, K16_0001), _mm256_unpackhi_epi16(u16, K_ZERO)))); - } - - SIMD_INLINE __m256i YuvToRed(__m256i y, __m256i v) - { - __m256i lo = AdjustedYuvToRed16( - AdjustY16(_mm256_unpacklo_epi8(y, K_ZERO)), - AdjustUV16(_mm256_unpacklo_epi8(v, K_ZERO))); - __m256i hi = AdjustedYuvToRed16( - AdjustY16(_mm256_unpackhi_epi8(y, K_ZERO)), - AdjustUV16(_mm256_unpackhi_epi8(v, K_ZERO))); - return _mm256_packus_epi16(lo, hi); - } - - SIMD_INLINE __m256i YuvToGreen(__m256i y, __m256i u, __m256i v) - { - __m256i lo = AdjustedYuvToGreen16( - AdjustY16(_mm256_unpacklo_epi8(y, K_ZERO)), - AdjustUV16(_mm256_unpacklo_epi8(u, K_ZERO)), - AdjustUV16(_mm256_unpacklo_epi8(v, K_ZERO))); - __m256i hi = AdjustedYuvToGreen16( - AdjustY16(_mm256_unpackhi_epi8(y, K_ZERO)), - AdjustUV16(_mm256_unpackhi_epi8(u, K_ZERO)), - AdjustUV16(_mm256_unpackhi_epi8(v, K_ZERO))); - return _mm256_packus_epi16(lo, hi); - } - - SIMD_INLINE __m256i YuvToBlue(__m256i y, __m256i u) - { - __m256i lo = AdjustedYuvToBlue16( - AdjustY16(_mm256_unpacklo_epi8(y, K_ZERO)), - AdjustUV16(_mm256_unpacklo_epi8(u, K_ZERO))); - __m256i hi = AdjustedYuvToBlue16( - AdjustY16(_mm256_unpackhi_epi8(y, K_ZERO)), - AdjustUV16(_mm256_unpackhi_epi8(u, K_ZERO))); - return _mm256_packus_epi16(lo, hi); - } - - SIMD_INLINE __m256i BgrToY32(__m256i b16_r16, __m256i g16_1) - { - return _mm256_srai_epi32(_mm256_add_epi32(_mm256_madd_epi16(b16_r16, K16_BY_RY), - _mm256_madd_epi16(g16_1, K16_GY_RT)), Base::BGR_TO_YUV_AVERAGING_SHIFT); - } - - SIMD_INLINE __m256i BgrToY16(__m256i b16, __m256i g16, __m256i r16) - { - return SaturateI16ToU8(_mm256_add_epi16(K16_Y_ADJUST, _mm256_packs_epi32( - BgrToY32(_mm256_unpacklo_epi16(b16, r16), _mm256_unpacklo_epi16(g16, K16_0001)), - BgrToY32(_mm256_unpackhi_epi16(b16, r16), 
_mm256_unpackhi_epi16(g16, K16_0001))))); - } - - SIMD_INLINE __m256i BgrToY8(__m256i b8, __m256i g8, __m256i r8) - { - return _mm256_packus_epi16( - BgrToY16(_mm256_unpacklo_epi8(b8, K_ZERO), _mm256_unpacklo_epi8(g8, K_ZERO), _mm256_unpacklo_epi8(r8, K_ZERO)), - BgrToY16(_mm256_unpackhi_epi8(b8, K_ZERO), _mm256_unpackhi_epi8(g8, K_ZERO), _mm256_unpackhi_epi8(r8, K_ZERO))); - } - - SIMD_INLINE __m256i BgrToU32(__m256i b16_r16, __m256i g16_1) - { - return _mm256_srai_epi32(_mm256_add_epi32(_mm256_madd_epi16(b16_r16, K16_BU_RU), - _mm256_madd_epi16(g16_1, K16_GU_RT)), Base::BGR_TO_YUV_AVERAGING_SHIFT); - } - - SIMD_INLINE __m256i BgrToU16(__m256i b16, __m256i g16, __m256i r16) - { - return SaturateI16ToU8(_mm256_add_epi16(K16_UV_ADJUST, _mm256_packs_epi32( - BgrToU32(_mm256_unpacklo_epi16(b16, r16), _mm256_unpacklo_epi16(g16, K16_0001)), - BgrToU32(_mm256_unpackhi_epi16(b16, r16), _mm256_unpackhi_epi16(g16, K16_0001))))); - } - - SIMD_INLINE __m256i BgrToU8(__m256i b8, __m256i g8, __m256i r8) - { - return _mm256_packus_epi16( - BgrToU16(_mm256_unpacklo_epi8(b8, K_ZERO), _mm256_unpacklo_epi8(g8, K_ZERO), _mm256_unpacklo_epi8(r8, K_ZERO)), - BgrToU16(_mm256_unpackhi_epi8(b8, K_ZERO), _mm256_unpackhi_epi8(g8, K_ZERO), _mm256_unpackhi_epi8(r8, K_ZERO))); - } - - SIMD_INLINE __m256i BgrToV32(__m256i b16_r16, __m256i g16_1) - { - return _mm256_srai_epi32(_mm256_add_epi32(_mm256_madd_epi16(b16_r16, K16_BV_RV), - _mm256_madd_epi16(g16_1, K16_GV_RT)), Base::BGR_TO_YUV_AVERAGING_SHIFT); - } - - SIMD_INLINE __m256i BgrToV16(__m256i b16, __m256i g16, __m256i r16) - { - return SaturateI16ToU8(_mm256_add_epi16(K16_UV_ADJUST, _mm256_packs_epi32( - BgrToV32(_mm256_unpacklo_epi16(b16, r16), _mm256_unpacklo_epi16(g16, K16_0001)), - BgrToV32(_mm256_unpackhi_epi16(b16, r16), _mm256_unpackhi_epi16(g16, K16_0001))))); - } - - SIMD_INLINE __m256i BgrToV8(__m256i b8, __m256i g8, __m256i r8) - { - return _mm256_packus_epi16( - BgrToV16(_mm256_unpacklo_epi8(b8, K_ZERO), _mm256_unpacklo_epi8(g8, K_ZERO), _mm256_unpacklo_epi8(r8, K_ZERO)), - BgrToV16(_mm256_unpackhi_epi8(b8, K_ZERO), _mm256_unpackhi_epi8(g8, K_ZERO), _mm256_unpackhi_epi8(r8, K_ZERO))); - } - - template __m256i GrayToBgr(__m256i gray); - - template<> SIMD_INLINE __m256i GrayToBgr<0>(__m256i gray) - { - return _mm256_shuffle_epi8(_mm256_permute4x64_epi64(gray, 0x44), K8_SHUFFLE_GRAY_TO_BGR0); - } - - template<> SIMD_INLINE __m256i GrayToBgr<1>(__m256i gray) - { - return _mm256_shuffle_epi8(_mm256_permute4x64_epi64(gray, 0x99), K8_SHUFFLE_GRAY_TO_BGR1); - } - - template<> SIMD_INLINE __m256i GrayToBgr<2>(__m256i gray) - { - return _mm256_shuffle_epi8(_mm256_permute4x64_epi64(gray, 0xEE), K8_SHUFFLE_GRAY_TO_BGR2); - } - - template __m256i InterleaveBgr(__m256i blue, __m256i green, __m256i red); - - template<> SIMD_INLINE __m256i InterleaveBgr<0>(__m256i blue, __m256i green, __m256i red) - { - return - _mm256_or_si256(_mm256_shuffle_epi8(_mm256_permute4x64_epi64(blue, 0x44), K8_SHUFFLE_PERMUTED_BLUE_TO_BGR0), - _mm256_or_si256(_mm256_shuffle_epi8(_mm256_permute4x64_epi64(green, 0x44), K8_SHUFFLE_PERMUTED_GREEN_TO_BGR0), - _mm256_shuffle_epi8(_mm256_permute4x64_epi64(red, 0x44), K8_SHUFFLE_PERMUTED_RED_TO_BGR0))); - } - - template<> SIMD_INLINE __m256i InterleaveBgr<1>(__m256i blue, __m256i green, __m256i red) - { - return - _mm256_or_si256(_mm256_shuffle_epi8(_mm256_permute4x64_epi64(blue, 0x99), K8_SHUFFLE_PERMUTED_BLUE_TO_BGR1), - _mm256_or_si256(_mm256_shuffle_epi8(_mm256_permute4x64_epi64(green, 0x99), K8_SHUFFLE_PERMUTED_GREEN_TO_BGR1), - 
_mm256_shuffle_epi8(_mm256_permute4x64_epi64(red, 0x99), K8_SHUFFLE_PERMUTED_RED_TO_BGR1))); - } - - template<> SIMD_INLINE __m256i InterleaveBgr<2>(__m256i blue, __m256i green, __m256i red) - { - return - _mm256_or_si256(_mm256_shuffle_epi8(_mm256_permute4x64_epi64(blue, 0xEE), K8_SHUFFLE_PERMUTED_BLUE_TO_BGR2), - _mm256_or_si256(_mm256_shuffle_epi8(_mm256_permute4x64_epi64(green, 0xEE), K8_SHUFFLE_PERMUTED_GREEN_TO_BGR2), - _mm256_shuffle_epi8(_mm256_permute4x64_epi64(red, 0xEE), K8_SHUFFLE_PERMUTED_RED_TO_BGR2))); - } - - SIMD_INLINE __m256i BgrToBlue(__m256i bgr[3]) - { - __m256i b0 = _mm256_shuffle_epi8(bgr[0], K8_SHUFFLE_BGR0_TO_BLUE); - __m256i b2 = _mm256_shuffle_epi8(bgr[2], K8_SHUFFLE_BGR2_TO_BLUE); - return - _mm256_or_si256(_mm256_permute2x128_si256(b0, b2, 0x20), - _mm256_or_si256(_mm256_shuffle_epi8(bgr[1], K8_SHUFFLE_BGR1_TO_BLUE), - _mm256_permute2x128_si256(b0, b2, 0x31))); - } - - SIMD_INLINE __m256i BgrToGreen(__m256i bgr[3]) - { - __m256i g0 = _mm256_shuffle_epi8(bgr[0], K8_SHUFFLE_BGR0_TO_GREEN); - __m256i g2 = _mm256_shuffle_epi8(bgr[2], K8_SHUFFLE_BGR2_TO_GREEN); - return - _mm256_or_si256(_mm256_permute2x128_si256(g0, g2, 0x20), - _mm256_or_si256(_mm256_shuffle_epi8(bgr[1], K8_SHUFFLE_BGR1_TO_GREEN), - _mm256_permute2x128_si256(g0, g2, 0x31))); - } - - SIMD_INLINE __m256i BgrToRed(__m256i bgr[3]) - { - __m256i r0 = _mm256_shuffle_epi8(bgr[0], K8_SHUFFLE_BGR0_TO_RED); - __m256i r2 = _mm256_shuffle_epi8(bgr[2], K8_SHUFFLE_BGR2_TO_RED); - return - _mm256_or_si256(_mm256_permute2x128_si256(r0, r2, 0x20), - _mm256_or_si256(_mm256_shuffle_epi8(bgr[1], K8_SHUFFLE_BGR1_TO_RED), - _mm256_permute2x128_si256(r0, r2, 0x31))); - } - - template __m256i BgrToBgra(const __m256i & bgr, const __m256i & alpha); - - template<> SIMD_INLINE __m256i BgrToBgra(const __m256i & bgr, const __m256i & alpha) - { - return _mm256_or_si256(_mm256_shuffle_epi8(_mm256_permute4x64_epi64(bgr, 0x94), K8_BGR_TO_BGRA_SHUFFLE), alpha); - } - - template<> SIMD_INLINE __m256i BgrToBgra(const __m256i & bgr, const __m256i & alpha) - { - return _mm256_or_si256(_mm256_shuffle_epi8(_mm256_permute4x64_epi64(bgr, 0xE9), K8_BGR_TO_BGRA_SHUFFLE), alpha); - } - - template __m256i RgbToBgra(const __m256i & rgb, const __m256i & alpha); - - template<> SIMD_INLINE __m256i RgbToBgra(const __m256i & rgb, const __m256i & alpha) - { - return _mm256_or_si256(_mm256_shuffle_epi8(_mm256_permute4x64_epi64(rgb, 0x94), K8_RGB_TO_BGRA_SHUFFLE), alpha); - } - - template<> SIMD_INLINE __m256i RgbToBgra(const __m256i & rgb, const __m256i & alpha) - { - return _mm256_or_si256(_mm256_shuffle_epi8(_mm256_permute4x64_epi64(rgb, 0xE9), K8_RGB_TO_BGRA_SHUFFLE), alpha); - } - } -#endif// SIMD_AVX2_ENABLE - -#ifdef SIMD_AVX512BW_ENABLE - namespace Avx512bw - { - SIMD_INLINE __m512i AdjustY16(__m512i y16) - { - return _mm512_subs_epi16(y16, K16_Y_ADJUST); - } - - SIMD_INLINE __m512i AdjustUV16(__m512i uv16) - { - return _mm512_subs_epi16(uv16, K16_UV_ADJUST); - } - - SIMD_INLINE __m512i AdjustedYuvToRed32(__m512i y16_1, __m512i v16_0) - { - return _mm512_srai_epi32(_mm512_add_epi32(_mm512_madd_epi16(y16_1, K16_YRGB_RT), - _mm512_madd_epi16(v16_0, K16_VR_0)), Base::YUV_TO_BGR_AVERAGING_SHIFT); - } - - SIMD_INLINE __m512i AdjustedYuvToRed16(__m512i y16, __m512i v16) - { - return SaturateI16ToU8(_mm512_packs_epi32( - AdjustedYuvToRed32(_mm512_unpacklo_epi16(y16, K16_0001), _mm512_unpacklo_epi16(v16, K_ZERO)), - AdjustedYuvToRed32(_mm512_unpackhi_epi16(y16, K16_0001), _mm512_unpackhi_epi16(v16, K_ZERO)))); - } - - SIMD_INLINE __m512i 
AdjustedYuvToGreen32(__m512i y16_1, __m512i u16_v16) - { - return _mm512_srai_epi32(_mm512_add_epi32(_mm512_madd_epi16(y16_1, K16_YRGB_RT), - _mm512_madd_epi16(u16_v16, K16_UG_VG)), Base::YUV_TO_BGR_AVERAGING_SHIFT); - } - - SIMD_INLINE __m512i AdjustedYuvToGreen16(__m512i y16, __m512i u16, __m512i v16) - { - return SaturateI16ToU8(_mm512_packs_epi32( - AdjustedYuvToGreen32(_mm512_unpacklo_epi16(y16, K16_0001), _mm512_unpacklo_epi16(u16, v16)), - AdjustedYuvToGreen32(_mm512_unpackhi_epi16(y16, K16_0001), _mm512_unpackhi_epi16(u16, v16)))); - } - - SIMD_INLINE __m512i AdjustedYuvToBlue32(__m512i y16_1, __m512i u16_0) - { - return _mm512_srai_epi32(_mm512_add_epi32(_mm512_madd_epi16(y16_1, K16_YRGB_RT), - _mm512_madd_epi16(u16_0, K16_UB_0)), Base::YUV_TO_BGR_AVERAGING_SHIFT); - } - - SIMD_INLINE __m512i AdjustedYuvToBlue16(__m512i y16, __m512i u16) - { - return SaturateI16ToU8(_mm512_packs_epi32( - AdjustedYuvToBlue32(_mm512_unpacklo_epi16(y16, K16_0001), _mm512_unpacklo_epi16(u16, K_ZERO)), - AdjustedYuvToBlue32(_mm512_unpackhi_epi16(y16, K16_0001), _mm512_unpackhi_epi16(u16, K_ZERO)))); - } - - SIMD_INLINE __m512i YuvToRed(__m512i y, __m512i v) - { - __m512i lo = AdjustedYuvToRed16( - AdjustY16(_mm512_unpacklo_epi8(y, K_ZERO)), - AdjustUV16(_mm512_unpacklo_epi8(v, K_ZERO))); - __m512i hi = AdjustedYuvToRed16( - AdjustY16(_mm512_unpackhi_epi8(y, K_ZERO)), - AdjustUV16(_mm512_unpackhi_epi8(v, K_ZERO))); - return _mm512_packus_epi16(lo, hi); - } - - SIMD_INLINE __m512i YuvToGreen(__m512i y, __m512i u, __m512i v) - { - __m512i lo = AdjustedYuvToGreen16( - AdjustY16(_mm512_unpacklo_epi8(y, K_ZERO)), - AdjustUV16(_mm512_unpacklo_epi8(u, K_ZERO)), - AdjustUV16(_mm512_unpacklo_epi8(v, K_ZERO))); - __m512i hi = AdjustedYuvToGreen16( - AdjustY16(_mm512_unpackhi_epi8(y, K_ZERO)), - AdjustUV16(_mm512_unpackhi_epi8(u, K_ZERO)), - AdjustUV16(_mm512_unpackhi_epi8(v, K_ZERO))); - return _mm512_packus_epi16(lo, hi); - } - - SIMD_INLINE __m512i YuvToBlue(__m512i y, __m512i u) - { - __m512i lo = AdjustedYuvToBlue16( - AdjustY16(_mm512_unpacklo_epi8(y, K_ZERO)), - AdjustUV16(_mm512_unpacklo_epi8(u, K_ZERO))); - __m512i hi = AdjustedYuvToBlue16( - AdjustY16(_mm512_unpackhi_epi8(y, K_ZERO)), - AdjustUV16(_mm512_unpackhi_epi8(u, K_ZERO))); - return _mm512_packus_epi16(lo, hi); - } - - template __m512i GrayToBgr(__m512i gray); - - template<> SIMD_INLINE __m512i GrayToBgr<0>(__m512i gray) - { - return _mm512_shuffle_epi8(_mm512_shuffle_i64x2(gray, gray, 0x40), K8_SHUFFLE_GRAY_TO_BGR0); - } - - template<> SIMD_INLINE __m512i GrayToBgr<1>(__m512i gray) - { - return _mm512_shuffle_epi8(_mm512_shuffle_i64x2(gray, gray, 0xA5), K8_SHUFFLE_GRAY_TO_BGR1); - } - - template<> SIMD_INLINE __m512i GrayToBgr<2>(__m512i gray) - { - return _mm512_shuffle_epi8(_mm512_shuffle_i64x2(gray, gray, 0xFE), K8_SHUFFLE_GRAY_TO_BGR2); - } - - SIMD_INLINE __m512i BgrToY32(__m512i b16_r16, __m512i g16_1) - { - return _mm512_srai_epi32(_mm512_add_epi32(_mm512_madd_epi16(b16_r16, K16_BY_RY), - _mm512_madd_epi16(g16_1, K16_GY_RT)), Base::BGR_TO_YUV_AVERAGING_SHIFT); - } - - SIMD_INLINE __m512i BgrToU32(__m512i b16_r16, __m512i g16_1) - { - return _mm512_srai_epi32(_mm512_add_epi32(_mm512_madd_epi16(b16_r16, K16_BU_RU), - _mm512_madd_epi16(g16_1, K16_GU_RT)), Base::BGR_TO_YUV_AVERAGING_SHIFT); - } - - SIMD_INLINE __m512i BgrToV32(__m512i b16_r16, __m512i g16_1) - { - return _mm512_srai_epi32(_mm512_add_epi32(_mm512_madd_epi16(b16_r16, K16_BV_RV), - _mm512_madd_epi16(g16_1, K16_GV_RT)), Base::BGR_TO_YUV_AVERAGING_SHIFT); - } - - template __m512i 
InterleaveBgr(__m512i blue, __m512i green, __m512i red); - - template<> SIMD_INLINE __m512i InterleaveBgr<0>(__m512i blue, __m512i green, __m512i red) - { - return - _mm512_or_si512(_mm512_shuffle_epi8(_mm512_permutexvar_epi32(K32_PERMUTE_COLOR_TO_BGR0, blue), K8_SHUFFLE_BLUE_TO_BGR0), - _mm512_or_si512(_mm512_shuffle_epi8(_mm512_permutexvar_epi32(K32_PERMUTE_COLOR_TO_BGR0, green), K8_SHUFFLE_GREEN_TO_BGR0), - _mm512_shuffle_epi8(_mm512_permutexvar_epi32(K32_PERMUTE_COLOR_TO_BGR0, red), K8_SHUFFLE_RED_TO_BGR0))); - } - - template<> SIMD_INLINE __m512i InterleaveBgr<1>(__m512i blue, __m512i green, __m512i red) - { - return - _mm512_or_si512(_mm512_shuffle_epi8(_mm512_permutexvar_epi32(K32_PERMUTE_COLOR_TO_BGR1, blue), K8_SHUFFLE_BLUE_TO_BGR1), - _mm512_or_si512(_mm512_shuffle_epi8(_mm512_permutexvar_epi32(K32_PERMUTE_COLOR_TO_BGR1, green), K8_SHUFFLE_GREEN_TO_BGR1), - _mm512_shuffle_epi8(_mm512_permutexvar_epi32(K32_PERMUTE_COLOR_TO_BGR1, red), K8_SHUFFLE_RED_TO_BGR1))); - } - - template<> SIMD_INLINE __m512i InterleaveBgr<2>(__m512i blue, __m512i green, __m512i red) - { - return - _mm512_or_si512(_mm512_shuffle_epi8(_mm512_permutexvar_epi32(K32_PERMUTE_COLOR_TO_BGR2, blue), K8_SHUFFLE_BLUE_TO_BGR2), - _mm512_or_si512(_mm512_shuffle_epi8(_mm512_permutexvar_epi32(K32_PERMUTE_COLOR_TO_BGR2, green), K8_SHUFFLE_GREEN_TO_BGR2), - _mm512_shuffle_epi8(_mm512_permutexvar_epi32(K32_PERMUTE_COLOR_TO_BGR2, red), K8_SHUFFLE_RED_TO_BGR2))); - } - } -#endif//SIMD_AVX512BW_ENABLE - -#ifdef SIMD_VMX_ENABLE - namespace Vmx - { - SIMD_INLINE v128_s16 AdjustY(v128_u16 y) - { - return vec_sub((v128_s16)y, K16_Y_ADJUST); - } - - SIMD_INLINE v128_s16 AdjustUV(v128_u16 uv) - { - return vec_sub((v128_s16)uv, K16_UV_ADJUST); - } - - SIMD_INLINE v128_s32 PreparedYuvToRed(v128_s16 y_1, v128_s16 v_0) - { - return vec_sra(vec_msum(y_1, K16_YRGB_RT, vec_msum(v_0, K16_VR_0, (v128_s32)K32_00000000)), K32_YUV_TO_BGR_AVERAGING_SHIFT); - } - - SIMD_INLINE v128_u16 AdjustedYuvToRed(v128_s16 y, v128_s16 v) - { - v128_s32 lo = PreparedYuvToRed((v128_s16)UnpackLoU16(K16_0001, (v128_u16)y), (v128_s16)UnpackLoU16(K16_0000, (v128_u16)v)); - v128_s32 hi = PreparedYuvToRed((v128_s16)UnpackHiU16(K16_0001, (v128_u16)y), (v128_s16)UnpackHiU16(K16_0000, (v128_u16)v)); - return SaturateI16ToU8(vec_pack(lo, hi)); - } - - SIMD_INLINE v128_u8 YuvToRed(v128_u8 y, v128_u8 v) - { - v128_u16 lo = AdjustedYuvToRed(AdjustY(UnpackLoU8(y)), AdjustUV(UnpackLoU8(v))); - v128_u16 hi = AdjustedYuvToRed(AdjustY(UnpackHiU8(y)), AdjustUV(UnpackHiU8(v))); - return vec_pack(lo, hi); - } - - SIMD_INLINE v128_s32 PreparedYuvToGreen(v128_s16 y_1, v128_s16 u_v) - { - return vec_sra(vec_msum(y_1, K16_YRGB_RT, vec_msum(u_v, K16_UG_VG, (v128_s32)K32_00000000)), K32_YUV_TO_BGR_AVERAGING_SHIFT); - } - - SIMD_INLINE v128_u16 AdjustedYuvToGreen(v128_s16 y, v128_s16 u, v128_s16 v) - { - v128_s32 lo = PreparedYuvToGreen((v128_s16)UnpackLoU16(K16_0001, (v128_u16)y), (v128_s16)UnpackLoU16((v128_u16)v, (v128_u16)u)); - v128_s32 hi = PreparedYuvToGreen((v128_s16)UnpackHiU16(K16_0001, (v128_u16)y), (v128_s16)UnpackHiU16((v128_u16)v, (v128_u16)u)); - return SaturateI16ToU8(vec_pack(lo, hi)); - } - - SIMD_INLINE v128_u8 YuvToGreen(v128_u8 y, v128_u8 u, v128_u8 v) - { - v128_u16 lo = AdjustedYuvToGreen(AdjustY(UnpackLoU8(y)), AdjustUV(UnpackLoU8(u)), AdjustUV(UnpackLoU8(v))); - v128_u16 hi = AdjustedYuvToGreen(AdjustY(UnpackHiU8(y)), AdjustUV(UnpackHiU8(u)), AdjustUV(UnpackHiU8(v))); - return vec_pack(lo, hi); - } - - SIMD_INLINE v128_s32 PreparedYuvToBlue(v128_s16 y_1, v128_s16 
u_0) - { - return vec_sra(vec_msum(y_1, K16_YRGB_RT, vec_msum(u_0, K16_UB_0, (v128_s32)K32_00000000)), K32_YUV_TO_BGR_AVERAGING_SHIFT); - } - - SIMD_INLINE v128_u16 AdjustedYuvToBlue(v128_s16 y, v128_s16 u) - { - v128_s32 lo = PreparedYuvToBlue((v128_s16)UnpackLoU16(K16_0001, (v128_u16)y), (v128_s16)UnpackLoU16(K16_0000, (v128_u16)u)); - v128_s32 hi = PreparedYuvToBlue((v128_s16)UnpackHiU16(K16_0001, (v128_u16)y), (v128_s16)UnpackHiU16(K16_0000, (v128_u16)u)); - return SaturateI16ToU8(vec_pack(lo, hi)); - } - - SIMD_INLINE v128_u8 YuvToBlue(v128_u8 y, v128_u8 u) - { - v128_u16 lo = AdjustedYuvToBlue(AdjustY(UnpackLoU8(y)), AdjustUV(UnpackLoU8(u))); - v128_u16 hi = AdjustedYuvToBlue(AdjustY(UnpackHiU8(y)), AdjustUV(UnpackHiU8(u))); - return vec_pack(lo, hi); - } - - SIMD_INLINE v128_s32 BgrToY(v128_s16 b_r, v128_s16 g_1) - { - return vec_sra(vec_msum(b_r, K16_BY_RY, vec_msum(g_1, K16_GY_RT, (v128_s32)K32_00000000)), K32_BGR_TO_YUV_AVERAGING_SHIFT); - } - - SIMD_INLINE v128_u16 BgrToY(v128_s16 b, v128_s16 g, v128_s16 r) - { - return SaturateI16ToU8(vec_add((v128_s16)K16_Y_ADJUST, vec_pack( - BgrToY((v128_s16)UnpackLoU16((v128_u16)r, (v128_u16)b), (v128_s16)UnpackLoU16(K16_0001, (v128_u16)g)), - BgrToY((v128_s16)UnpackHiU16((v128_u16)r, (v128_u16)b), (v128_s16)UnpackHiU16(K16_0001, (v128_u16)g))))); - } - - SIMD_INLINE v128_u8 BgrToY(v128_u8 b, v128_u8 g, v128_u8 r) - { - return vec_pack( - BgrToY((v128_s16)UnpackLoU8(b), (v128_s16)UnpackLoU8(g), (v128_s16)UnpackLoU8(r)), - BgrToY((v128_s16)UnpackHiU8(b), (v128_s16)UnpackHiU8(g), (v128_s16)UnpackHiU8(r))); - } - - SIMD_INLINE v128_s32 BgrToU(v128_s16 b_r, v128_s16 g_1) - { - return vec_sra(vec_msum(b_r, K16_BU_RU, vec_msum(g_1, K16_GU_RT, (v128_s32)K32_00000000)), K32_BGR_TO_YUV_AVERAGING_SHIFT); - } - - SIMD_INLINE v128_u16 BgrToU(v128_s16 b, v128_s16 g, v128_s16 r) - { - return SaturateI16ToU8(vec_add((v128_s16)K16_UV_ADJUST, vec_pack( - BgrToU((v128_s16)UnpackLoU16((v128_u16)r, (v128_u16)b), (v128_s16)UnpackLoU16(K16_0001, (v128_u16)g)), - BgrToU((v128_s16)UnpackHiU16((v128_u16)r, (v128_u16)b), (v128_s16)UnpackHiU16(K16_0001, (v128_u16)g))))); - } - - SIMD_INLINE v128_u8 BgrToU(v128_u8 b, v128_u8 g, v128_u8 r) - { - return vec_pack( - BgrToU((v128_s16)UnpackLoU8(b), (v128_s16)UnpackLoU8(g), (v128_s16)UnpackLoU8(r)), - BgrToU((v128_s16)UnpackHiU8(b), (v128_s16)UnpackHiU8(g), (v128_s16)UnpackHiU8(r))); - } - - SIMD_INLINE v128_s32 BgrToV(v128_s16 b_r, v128_s16 g_1) - { - return vec_sra(vec_msum(b_r, K16_BV_RV, vec_msum(g_1, K16_GV_RT, (v128_s32)K32_00000000)), K32_BGR_TO_YUV_AVERAGING_SHIFT); - } - - SIMD_INLINE v128_u16 BgrToV(v128_s16 b, v128_s16 g, v128_s16 r) - { - return SaturateI16ToU8(vec_add((v128_s16)K16_UV_ADJUST, vec_pack( - BgrToV((v128_s16)UnpackLoU16((v128_u16)r, (v128_u16)b), (v128_s16)UnpackLoU16(K16_0001, (v128_u16)g)), - BgrToV((v128_s16)UnpackHiU16((v128_u16)r, (v128_u16)b), (v128_s16)UnpackHiU16(K16_0001, (v128_u16)g))))); - } - - SIMD_INLINE v128_u8 BgrToV(v128_u8 b, v128_u8 g, v128_u8 r) - { - return vec_pack( - BgrToV((v128_s16)UnpackLoU8(b), (v128_s16)UnpackLoU8(g), (v128_s16)UnpackLoU8(r)), - BgrToV((v128_s16)UnpackHiU8(b), (v128_s16)UnpackHiU8(g), (v128_s16)UnpackHiU8(r))); - } - - template v128_u8 InterleaveBgr(v128_u8 blue, v128_u8 green, v128_u8 red); - - template<> SIMD_INLINE v128_u8 InterleaveBgr<0>(v128_u8 blue, v128_u8 green, v128_u8 red) - { - return vec_perm(vec_perm(blue, green, K8_PERM_INTERLEAVE_BGR_00), red, K8_PERM_INTERLEAVE_BGR_01); - } - - template<> SIMD_INLINE v128_u8 InterleaveBgr<1>(v128_u8 
blue, v128_u8 green, v128_u8 red) - { - return vec_perm(vec_perm(blue, green, K8_PERM_INTERLEAVE_BGR_10), red, K8_PERM_INTERLEAVE_BGR_11); - } - - template<> SIMD_INLINE v128_u8 InterleaveBgr<2>(v128_u8 blue, v128_u8 green, v128_u8 red) - { - return vec_perm(vec_perm(blue, green, K8_PERM_INTERLEAVE_BGR_20), red, K8_PERM_INTERLEAVE_BGR_21); - } - - SIMD_INLINE v128_u8 BgrToBlue(v128_u8 bgr[3]) - { - return vec_perm(vec_perm(bgr[0], bgr[1], K8_PERM_BGR_TO_BLUE_0), bgr[2], K8_PERM_BGR_TO_BLUE_1); - } - - SIMD_INLINE v128_u8 BgrToGreen(v128_u8 bgr[3]) - { - return vec_perm(vec_perm(bgr[0], bgr[1], K8_PERM_BGR_TO_GREEN_0), bgr[2], K8_PERM_BGR_TO_GREEN_1); - } - - SIMD_INLINE v128_u8 BgrToRed(v128_u8 bgr[3]) - { - return vec_perm(vec_perm(bgr[0], bgr[1], K8_PERM_BGR_TO_RED_0), bgr[2], K8_PERM_BGR_TO_RED_1); - } - } -#endif// SIMD_VMX_ENABLE - -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - template SIMD_INLINE uint32x4_t BgrToGray(const uint16x8_t & blue, const uint16x8_t & green, const uint16x8_t & red) - { - return vshrq_n_u32(vmlal_u16(vmlal_u16(vmlal_u16(K32_BGR_TO_GRAY_ROUND_TERM, Half(blue), K16_BLUE_TO_GRAY_WEIGHT), - Half(green), K16_GREEN_TO_GRAY_WEIGHT), Half(red), K16_RED_TO_GRAY_WEIGHT), Base::BGR_TO_GRAY_AVERAGING_SHIFT); - } - - SIMD_INLINE uint16x8_t BgrToGray(const uint16x8_t & blue, const uint16x8_t & green, const uint16x8_t & red) - { - return PackU32(BgrToGray<0>(blue, green, red), BgrToGray<1>(blue, green, red)); - } - - template SIMD_INLINE int32x4_t BgrToY(uint16x8_t blue, uint16x8_t green, uint16x8_t red) - { - return vshrq_n_s32(vmlal_s16(vmlal_s16(vmlal_s16(K32_BGR_TO_YUV_ROUND_TERM, (int16x4_t)Half(blue), K16_BLUE_TO_Y_WEIGHT), - (int16x4_t)Half(green), K16_GREEN_TO_Y_WEIGHT), (int16x4_t)Half(red), K16_RED_TO_Y_WEIGHT), Base::BGR_TO_YUV_AVERAGING_SHIFT); - } - - SIMD_INLINE int16x8_t BgrToY(uint16x8_t blue, uint16x8_t green, uint16x8_t red) - { - return vaddq_s16(K16_Y_ADJUST, PackI32(BgrToY<0>(blue, green, red), BgrToY<1>(blue, green, red))); - } - - SIMD_INLINE uint8x16_t BgrToY(uint8x16_t blue, uint8x16_t green, uint8x16_t red) - { - return PackSaturatedI16( - BgrToY(UnpackU8<0>(blue), UnpackU8<0>(green), UnpackU8<0>(red)), - BgrToY(UnpackU8<1>(blue), UnpackU8<1>(green), UnpackU8<1>(red))); - } - - template SIMD_INLINE int32x4_t BgrToU(uint16x8_t blue, uint16x8_t green, uint16x8_t red) - { - return vshrq_n_s32(vmlal_s16(vmlal_s16(vmlal_s16(K32_BGR_TO_YUV_ROUND_TERM, (int16x4_t)Half(blue), K16_BLUE_TO_U_WEIGHT), - (int16x4_t)Half(green), K16_GREEN_TO_U_WEIGHT), (int16x4_t)Half(red), K16_RED_TO_U_WEIGHT), Base::BGR_TO_YUV_AVERAGING_SHIFT); - } - - SIMD_INLINE int16x8_t BgrToU(uint16x8_t blue, uint16x8_t green, uint16x8_t red) - { - return vaddq_s16(K16_UV_ADJUST, PackI32(BgrToU<0>(blue, green, red), BgrToU<1>(blue, green, red))); - } - - SIMD_INLINE uint8x16_t BgrToU(uint8x16_t blue, uint8x16_t green, uint8x16_t red) - { - return PackSaturatedI16( - BgrToU(UnpackU8<0>(blue), UnpackU8<0>(green), UnpackU8<0>(red)), - BgrToU(UnpackU8<1>(blue), UnpackU8<1>(green), UnpackU8<1>(red))); - } - - template SIMD_INLINE int32x4_t BgrToV(uint16x8_t blue, uint16x8_t green, uint16x8_t red) - { - return vshrq_n_s32(vmlal_s16(vmlal_s16(vmlal_s16(K32_BGR_TO_YUV_ROUND_TERM, (int16x4_t)Half(blue), K16_BLUE_TO_V_WEIGHT), - (int16x4_t)Half(green), K16_GREEN_TO_V_WEIGHT), (int16x4_t)Half(red), K16_RED_TO_V_WEIGHT), Base::BGR_TO_YUV_AVERAGING_SHIFT); - } - - SIMD_INLINE int16x8_t BgrToV(uint16x8_t blue, uint16x8_t green, uint16x8_t red) - { - return vaddq_s16(K16_UV_ADJUST, 
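// The Neon kernels above replace the x86 madd trick with chained widening
// multiply-accumulates: start from the round-term vector and vmlal each
// weighted channel into 32-bit lanes, then shift right. A guarded sketch of
// that chain with made-up weights (shift = 3, round = 4):

#if defined(__ARM_NEON) || defined(__aarch64__)
#include <arm_neon.h>
#include <cstdio>

int main()
{
    uint16x4_t b = vdup_n_u16(10), g = vdup_n_u16(20), r = vdup_n_u16(30);
    uint32x4_t acc = vdupq_n_u32(4);             // round term = 1 << (3 - 1)
    acc = vmlal_u16(acc, b, vdup_n_u16(3));      // + 3 * b
    acc = vmlal_u16(acc, g, vdup_n_u16(5));      // + 5 * g
    acc = vmlal_u16(acc, r, vdup_n_u16(2));      // + 2 * r
    uint32x4_t res = vshrq_n_u32(acc, 3);        // >> 3
    std::printf("%u\n", vgetq_lane_u32(res, 0)); // (4 + 30 + 100 + 60) >> 3 = 24
}
#else
int main() { return 0; }
#endif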
PackI32(BgrToV<0>(blue, green, red), BgrToV<1>(blue, green, red))); - } - - SIMD_INLINE uint8x16_t BgrToV(uint8x16_t blue, uint8x16_t green, uint8x16_t red) - { - return PackSaturatedI16( - BgrToV(UnpackU8<0>(blue), UnpackU8<0>(green), UnpackU8<0>(red)), - BgrToV(UnpackU8<1>(blue), UnpackU8<1>(green), UnpackU8<1>(red))); - } - - template SIMD_INLINE int16x8_t AdjustY(uint8x16_t y) - { - return vsubq_s16((int16x8_t)UnpackU8(y), K16_Y_ADJUST); - } - - template SIMD_INLINE int16x8_t AdjustUV(uint8x16_t uv) - { - return vsubq_s16((int16x8_t)UnpackU8(uv), K16_UV_ADJUST); - } - - template SIMD_INLINE int32x4_t YuvToRed(int16x8_t y, int16x8_t v) - { - return vshrq_n_s32(vmlal_s16(vmlal_s16(K32_YUV_TO_BGR_ROUND_TERM, Half(y), K16_Y_TO_RGB_WEIGHT), - Half(v), K16_V_TO_RED_WEIGHT), Base::YUV_TO_BGR_AVERAGING_SHIFT); - } - - SIMD_INLINE int16x8_t YuvToRed(int16x8_t y, int16x8_t v) - { - return PackI32(YuvToRed<0>(y, v), YuvToRed<1>(y, v)); - } - - SIMD_INLINE uint8x16_t YuvToRed(uint8x16_t y, uint8x16_t v) - { - return PackSaturatedI16(YuvToRed(AdjustY<0>(y), AdjustUV<0>(v)), YuvToRed(AdjustY<1>(y), AdjustUV<1>(v))); - } - - template SIMD_INLINE int32x4_t YuvToGreen(int16x8_t y, int16x8_t u, int16x8_t v) - { - return vshrq_n_s32(vmlal_s16(vmlal_s16(vmlal_s16(K32_YUV_TO_BGR_ROUND_TERM, Half(y), K16_Y_TO_RGB_WEIGHT), - Half(u), K16_U_TO_GREEN_WEIGHT), Half(v), K16_V_TO_GREEN_WEIGHT), Base::YUV_TO_BGR_AVERAGING_SHIFT); - } - - SIMD_INLINE int16x8_t YuvToGreen(int16x8_t y, int16x8_t u, int16x8_t v) - { - return PackI32(YuvToGreen<0>(y, u, v), YuvToGreen<1>(y, u, v)); - } - - SIMD_INLINE uint8x16_t YuvToGreen(uint8x16_t y, uint8x16_t u, uint8x16_t v) - { - return PackSaturatedI16(YuvToGreen(AdjustY<0>(y), AdjustUV<0>(u), AdjustUV<0>(v)), - YuvToGreen(AdjustY<1>(y), AdjustUV<1>(u), AdjustUV<1>(v))); - } - - template SIMD_INLINE int32x4_t YuvToBlue(int16x8_t y, int16x8_t u) - { - return vshrq_n_s32(vmlal_s16(vmlal_s16(K32_YUV_TO_BGR_ROUND_TERM, Half(y), K16_Y_TO_RGB_WEIGHT), - Half(u), K16_U_TO_BLUE_WEIGHT), Base::YUV_TO_BGR_AVERAGING_SHIFT); - } - - SIMD_INLINE int16x8_t YuvToBlue(int16x8_t y, int16x8_t u) - { - return PackI32(YuvToBlue<0>(y, u), YuvToBlue<1>(y, u)); - } - - SIMD_INLINE uint8x16_t YuvToBlue(uint8x16_t y, uint8x16_t u) - { - return PackSaturatedI16(YuvToBlue(AdjustY<0>(y), AdjustUV<0>(u)), YuvToBlue(AdjustY<1>(y), AdjustUV<1>(u))); - } - - SIMD_INLINE void YuvToBgr(uint8x16_t y, uint8x16_t u, uint8x16_t v, uint8x16x3_t & bgr) - { - int16x8_t yLo = AdjustY<0>(y), uLo = AdjustUV<0>(u), vLo = AdjustUV<0>(v); - int16x8_t yHi = AdjustY<1>(y), uHi = AdjustUV<1>(u), vHi = AdjustUV<1>(v); - bgr.val[0] = PackSaturatedI16(YuvToBlue(yLo, uLo), YuvToBlue(yHi, uHi)); - bgr.val[1] = PackSaturatedI16(YuvToGreen(yLo, uLo, vLo), YuvToGreen(yHi, uHi, vHi)); - bgr.val[2] = PackSaturatedI16(YuvToRed(yLo, vLo), YuvToRed(yHi, vHi)); - } - - SIMD_INLINE void YuvToRgb(uint8x16_t y, uint8x16_t u, uint8x16_t v, uint8x16x3_t& rgb) - { - int16x8_t yLo = AdjustY<0>(y), uLo = AdjustUV<0>(u), vLo = AdjustUV<0>(v); - int16x8_t yHi = AdjustY<1>(y), uHi = AdjustUV<1>(u), vHi = AdjustUV<1>(v); - rgb.val[0] = PackSaturatedI16(YuvToRed(yLo, vLo), YuvToRed(yHi, vHi)); - rgb.val[1] = PackSaturatedI16(YuvToGreen(yLo, uLo, vLo), YuvToGreen(yHi, uHi, vHi)); - rgb.val[2] = PackSaturatedI16(YuvToBlue(yLo, uLo), YuvToBlue(yHi, uHi)); - } - } -#endif// SIMD_NEON_ENABLE -} -#endif//__SimdConversion_h__ diff --git a/src/3rd/Simd/Simd/SimdCpu.h b/src/3rd/Simd/Simd/SimdCpu.h deleted file mode 100644 index 82140923..00000000 --- 
a/src/3rd/Simd/Simd/SimdCpu.h +++ /dev/null @@ -1,136 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2019 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#ifndef __SimdCpu_h__ -#define __SimdCpu_h__ - -#include "Simd/SimdDefs.h" - -namespace Simd -{ - namespace Base - { - size_t CpuSocketNumber(); - - size_t CpuCoreNumber(); - - size_t CpuThreadNumber(); - - size_t CpuCacheSize(size_t level); - } - -#ifdef SIMD_SSE_ENABLE - namespace Sse - { - const unsigned int SCR_FTZ = 1 << 15; - const unsigned int SCR_DAZ = 1 << 6; - - SIMD_INLINE SimdBool GetFastMode() - { - return _mm_getcsr() & (SCR_FTZ | SCR_DAZ) ? SimdTrue : SimdFalse; - } - - SIMD_INLINE void SetFastMode(SimdBool value) - { - if (value) - _mm_setcsr(_mm_getcsr() | (SCR_FTZ | SCR_DAZ)); - else - _mm_setcsr(_mm_getcsr() & ~(SCR_FTZ | SCR_DAZ)); - } - } -#endif - -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - SIMD_INLINE unsigned int GetStatusWord() - { - unsigned int dst; -#if defined(__GNUC__) -#if defined(SIMD_ARM64_ENABLE) - __asm__ volatile("mrs %[dst], FPCR" : [dst] "=r" (dst)); -#else - __asm__ volatile("vmrs %[dst], FPSCR" : [dst] "=r" (dst)); -#endif -#endif - return dst; - } - - SIMD_INLINE void SetStatusWord(unsigned int src) - { -#if defined(__GNUC__) -#if defined(SIMD_ARM64_ENABLE) - __asm__ volatile("msr FPCR, %[src]" : : [src] "r" (src)); -#else - __asm__ volatile("vmsr FPSCR, %[src]" : : [src] "r" (src)); -#endif -#endif - } - - const unsigned int FPSCR_FTZ = 1 << 24; - - SIMD_INLINE SimdBool GetFastMode() - { - return GetStatusWord() & FPSCR_FTZ ? 
SimdTrue : SimdFalse; - } - - SIMD_INLINE void SetFastMode(SimdBool value) - { - if (value) - SetStatusWord(GetStatusWord() | FPSCR_FTZ); - else - SetStatusWord(GetStatusWord() & ~FPSCR_FTZ); - } - } -#endif - - namespace Cpu - { - const size_t SOCKET_NUMBER = Base::CpuSocketNumber(); - const size_t CORE_NUMBER = Base::CpuCoreNumber(); - const size_t THREAD_NUMBER = Base::CpuThreadNumber(); - const size_t L1_CACHE_SIZE = Base::CpuCacheSize(1); - const size_t L2_CACHE_SIZE = Base::CpuCacheSize(2); - const size_t L3_CACHE_SIZE = Base::CpuCacheSize(3); - } - - namespace Base - { - SIMD_INLINE size_t AlgCacheL1() - { - return Cpu::L1_CACHE_SIZE; - } - - SIMD_INLINE size_t AlgCacheL2() - { - return Cpu::L2_CACHE_SIZE; - } - - SIMD_INLINE size_t AlgCacheL3() - { - return Cpu::L3_CACHE_SIZE * Cpu::SOCKET_NUMBER / Cpu::CORE_NUMBER; - } - } -} - -#endif//__SimdCpu_h__ diff --git a/src/3rd/Simd/Simd/SimdDefs.h b/src/3rd/Simd/Simd/SimdDefs.h deleted file mode 100644 index 4b4df52c..00000000 --- a/src/3rd/Simd/Simd/SimdDefs.h +++ /dev/null @@ -1,410 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
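// Sse::SetFastMode above toggles flush-to-zero (MXCSR bit 15) and
// denormals-are-zero (bit 6); the Neon version flips the FZ bit (24) of
// FPCR/FPSCR instead. A guarded x86 sketch showing the effect -- with FTZ
// on, a subnormal product flushes to zero (assumes SSE scalar float math,
// the default on x64):

#if defined(__SSE__) || defined(_M_X64) || defined(_M_IX86)
#include <xmmintrin.h>
#include <cstdio>

int main()
{
    const unsigned int FTZ = 1u << 15, DAZ = 1u << 6; // same bits as SCR_FTZ/SCR_DAZ
    _mm_setcsr(_mm_getcsr() | FTZ | DAZ);             // enter "fast mode"

    volatile float a = 1e-38f, b = 1e-7f;
    volatile float c = a * b;                         // ~1e-45 is subnormal
    std::printf("product = %g (0 when FTZ is on)\n", (double)c);

    _mm_setcsr(_mm_getcsr() & ~(FTZ | DAZ));          // restore the defaults
}
#else
int main() { return 0; }
#endif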
-*/ -#ifndef __SimdDefs_h__ -#define __SimdDefs_h__ - -#include "Simd/SimdConfig.h" -#include "Simd/SimdLib.h" - -#include -#include -#include -#include -#include -#include -#define _USE_MATH_DEFINES -#include -#include -#include - -#if defined(SIMD_SSE_DISABLE) && !defined(SIMD_SSE2_DISABLE) -#define SIMD_SSE2_DISABLE -#endif - -#if defined(SIMD_SSE2_DISABLE) && !defined(SIMD_SSE3_DISABLE) -#define SIMD_SSE3_DISABLE -#endif - -#if defined(SIMD_SSE3_DISABLE) && !defined(SIMD_SSSE3_DISABLE) -#define SIMD_SSSE3_DISABLE -#endif - -#if defined(SIMD_SSSE3_DISABLE) && !defined(SIMD_SSE41_DISABLE) -#define SIMD_SSE41_DISABLE -#endif - -#if defined(SIMD_SSE41_DISABLE) && !defined(SIMD_SSE42_DISABLE) -#define SIMD_SSE42_DISABLE -#endif - -#if defined(SIMD_SSE41_DISABLE) && !defined(SIMD_SSE42_DISABLE) -#define SIMD_SSE42_DISABLE -#endif - -#if defined(SIMD_SSE42_DISABLE) && !defined(SIMD_AVX_DISABLE) -#define SIMD_AVX_DISABLE -#endif - -#if defined(SIMD_AVX_DISABLE) && !defined(SIMD_AVX2_DISABLE) -#define SIMD_AVX2_DISABLE -#endif - -#if defined(SIMD_AVX2_DISABLE) && !defined(SIMD_AVX512F_DISABLE) -#define SIMD_AVX512F_DISABLE -#endif - -#if defined(SIMD_AVX512F_DISABLE) && !defined(SIMD_AVX512BW_DISABLE) -#define SIMD_AVX512BW_DISABLE -#endif - -#if defined(SIMD_AVX512BW_DISABLE) && !defined(SIMD_AVX512VNNI_DISABLE) -#define SIMD_AVX512VNNI_DISABLE -#endif - -#if defined(SIMD_VMX_DISABLE) && !defined(SIMD_VSX_DISABLE) -#define SIMD_VSX_DISABLE -#endif - -#if defined(_MSC_VER) && defined(_MSC_FULL_VER) - -#define SIMD_ALIGNED(x) __declspec(align(x)) - -#define SIMD_NOINLINE __declspec(noinline) - -#ifdef _M_IX86 -#define SIMD_X86_ENABLE -#endif - -#if defined(_M_X64) || defined(_M_AMD64) -#define SIMD_X64_ENABLE -#endif - -#if defined(_M_ARM) -#define SIMD_ARM_ENABLE -#endif - -#if defined(SIMD_X64_ENABLE) || defined(SIMD_X86_ENABLE) - -#if !defined(SIMD_SSE_DISABLE) && _MSC_VER >= 1200 -#define SIMD_SSE_ENABLE -#endif - -#if !defined(SIMD_SSE2_DISABLE) && _MSC_VER >= 1300 -#define SIMD_SSE2_ENABLE -#endif - -#if !defined(SIMD_SSE3_DISABLE) && _MSC_VER >= 1500 -#define SIMD_SSE3_ENABLE -#endif - -#if !defined(SIMD_SSSE3_DISABLE) && _MSC_VER >= 1500 -#define SIMD_SSSE3_ENABLE -#endif - -#if !defined(SIMD_SSE41_DISABLE) && _MSC_VER >= 1500 -#define SIMD_SSE41_ENABLE -#endif - -#if !defined(SIMD_SSE42_DISABLE) && _MSC_VER >= 1500 -#define SIMD_SSE42_ENABLE -#endif - -#if !defined(SIMD_AVX_DISABLE) && _MSC_FULL_VER >= 160040219 -#define SIMD_AVX_ENABLE -#endif - -#if !defined(SIMD_AVX2_DISABLE) && _MSC_VER >= 1700 -#define SIMD_AVX2_ENABLE -#endif - -#if defined(NDEBUG) && _MSC_VER >= 1700 && _MSC_VER < 1900 -#define SIMD_MADDUBS_ERROR // Visual Studio 2012/2013 release mode compiler bug in function _mm256_maddubs_epi16: -#endif - -#if !defined(SIMD_AVX512F_DISABLE) && _MSC_VER >= 1911 -#define SIMD_AVX512F_ENABLE -#endif - -#if !defined(SIMD_AVX512BW_DISABLE) && _MSC_VER >= 1911 -#define SIMD_AVX512BW_ENABLE -#endif - -#if !defined(SIMD_AVX512VNNI_DISABLE) && _MSC_VER >= 1924 -#define SIMD_AVX512VNNI_ENABLE -#endif - -#if defined(NDEBUG) && _MSC_VER == 1914 -#define SIMD_MASKZ_LOAD_ERROR -#endif - -#endif//defined(SIMD_X64_ENABLE) || defined(SIMD_X86_ENABLE) - -#if defined(SIMD_ARM_ENABLE) - -#if !defined(SIMD_NEON_DISABLE) && _MSC_VER >= 1700 -#define SIMD_NEON_ENABLE -#endif - -#endif - -#if _MSC_VER >= 1900 -#define SIMD_CPP_2011_ENABLE -#endif - -#if _MSVC_LANG >= 201402L -#define SIMD_CPP_2014_ENABLE -#endif - -#if _MSVC_LANG >= 201703L -#define SIMD_CPP_2017_ENABLE -#endif - -#define 
SIMD_FUNCTION __FUNCTION__ - -#elif defined(__GNUC__) - -#define SIMD_ALIGNED(x) __attribute__ ((aligned(x))) - -#define SIMD_NOINLINE __attribute__ ((noinline)) - -#ifdef __i386__ -#define SIMD_X86_ENABLE -#endif - -#if defined(__x86_64__) || defined(__amd64__) -#define SIMD_X64_ENABLE -#endif - -#ifdef __BIG_ENDIAN__ -#define SIMD_BIG_ENDIAN -#endif - -#ifdef __powerpc__ -#define SIMD_PPC_ENABLE -#endif - -#ifdef __powerpc64__ -#define SIMD_PPC64_ENABLE -#endif - -#if defined __arm__ -#define SIMD_ARM_ENABLE -#endif - -#if defined __aarch64__ -#define SIMD_ARM64_ENABLE -#endif - -#if defined __mips__ -#define SIMD_MIPS_ENABLE -#endif - -#if defined(SIMD_X86_ENABLE) || defined(SIMD_X64_ENABLE) - -#if !defined(SIMD_SSE_DISABLE) && defined(__SSE__) -#define SIMD_SSE_ENABLE -#endif - -#if !defined(SIMD_SSE2_DISABLE) && defined(__SSE2__) -#define SIMD_SSE2_ENABLE -#endif - -#if !defined(SIMD_SSE3_DISABLE) && defined(__SSE3__) -#define SIMD_SSE3_ENABLE -#endif - -#if !defined(SIMD_SSSE3_DISABLE) && defined(__SSSE3__) -#define SIMD_SSSE3_ENABLE -#endif - -#if !defined(SIMD_SSE41_DISABLE) && defined(__SSE4_1__) -#define SIMD_SSE41_ENABLE -#endif - -#if !defined(SIMD_SSE42_DISABLE) && defined(__SSE4_2__) -#define SIMD_SSE42_ENABLE -#endif - -#if !defined(SIMD_AVX_DISABLE) && defined(__AVX__) -#define SIMD_AVX_ENABLE -#endif - -#if !defined(SIMD_AVX2_DISABLE) && defined(__AVX2__) -#define SIMD_AVX2_ENABLE -#endif - -#if !defined(__clang__) || (defined(__clang__) && __clang_major__ >= 4) -#if !defined(SIMD_AVX512F_DISABLE) && defined(__AVX512F__) -#define SIMD_AVX512F_ENABLE -#endif - -#if !defined(SIMD_AVX512BW_DISABLE) && defined(__AVX512BW__) -#define SIMD_AVX512BW_ENABLE -#endif - -#if !defined(SIMD_AVX512VNNI_DISABLE) && defined(__AVX512VNNI__) -#define SIMD_AVX512VNNI_ENABLE -#endif -#endif - -#endif//defined(SIMD_X86_ENABLE) || defined(SIMD_X64_ENABLE) - -#if defined(SIMD_PPC_ENABLE) || defined(SIMD_PPC64_ENABLE) - -#if !defined(SIMD_VMX_DISABLE) && defined(__ALTIVEC__) -#define SIMD_VMX_ENABLE -#endif - -#if !defined(SIMD_VSX_DISABLE) && defined(__VSX__) -#define SIMD_VSX_ENABLE -#endif - -#endif//defined(SIMD_PPC_ENABLE) || defined(SIMD_PPC64_ENABLE) - -#if defined(SIMD_ARM_ENABLE) || defined(SIMD_ARM64_ENABLE) - -#if !defined(SIMD_NEON_DISABLE) && (defined(__ARM_NEON) || defined(SIMD_ARM64_ENABLE)) -#define SIMD_NEON_ENABLE -#endif - -#if !defined(SIMD_NEON_ASM_DISABLE) && defined(__GNUC__) -#define SIMD_NEON_ASM_ENABLE -#endif - -#if !defined(SIMD_NEON_FP16_DISABLE) && (defined(__ARM_FP16_FORMAT_IEEE) || defined(__ARM_FP16_FORMAT_ALTERNATIVE)) -#define SIMD_NEON_FP16_ENABLE -#endif - -#endif//defined(SIMD_ARM_ENABLE) || defined(SIMD_ARM64_ENABLE) - -#if defined(SIMD_MIPS_ENABLE) - -#if !defined(SIMD_MSA_DISABLE) && defined(__mips_msa) -#define SIMD_MSA_ENABLE -#endif - -#endif //defined(SIMD_MIPS_ENABLE) - -#if __cplusplus >= 201103L -#define SIMD_CPP_2011_ENABLE -#endif - -#if __cplusplus >= 201402L -#define SIMD_CPP_2014_ENABLE -#endif - -#if __cplusplus >= 201703L -#define SIMD_CPP_2017_ENABLE -#endif - -#if defined(__clang__) -#define SIMD_CLANG_AVX2_BGR_TO_BGRA_ERROR -#endif - -#define SIMD_FUNCTION __PRETTY_FUNCTION__ - -#else - -#error This platform is unsupported! 
- -#endif - -#ifdef SIMD_SSE_ENABLE -#include -#endif - -#ifdef SIMD_SSE2_ENABLE -#include -#endif - -#ifdef SIMD_SSE3_ENABLE -# include -#endif - -#ifdef SIMD_SSSE3_ENABLE -#include -#endif - -#ifdef SIMD_SSE41_ENABLE -#include -#endif - -#ifdef SIMD_SSE42_ENABLE -#include -#endif - -#if defined(SIMD_AVX_ENABLE) || defined(SIMD_AVX2_ENABLE) \ - || defined(SIMD_AVX512F_ENABLE) || defined(SIMD_AVX512BW_ENABLE) || defined(SIMD_AVX512VNNI_ENABLE) -#include -#endif - -#if defined(SIMD_VMX_ENABLE) || defined(SIMD_VSX_ENABLE) -#include -#include -#ifdef __cplusplus -#undef vector -#undef pixel -#undef bool -#endif -#endif - -#if defined(SIMD_NEON_ENABLE) -#include -#endif - -#if defined(SIMD_MSA_ENABLE) -#include -#endif - -#if defined(SIMD_AVX512F_ENABLE) || defined(SIMD_AVX512BW_ENABLE) || defined(SIMD_AVX512VNNI_ENABLE) -#define SIMD_ALIGN 64 -#elif defined(SIMD_AVX_ENABLE) || defined(SIMD_AVX2_ENABLE) -#define SIMD_ALIGN 32 -#elif defined(SIMD_SSE_ENABLE) || defined(SIMD_SSE2_ENABLE) || defined(SIMD_SSE3_ENABLE) || defined(SIMD_SSSE3_ENABLE) || defined(SIMD_SSE41_ENABLE) || defined(SIMD_SSE42_ENABLE) \ - || defined(SIMD_VMX_ENABLE) || defined(SIMD_VSX_ENABLE) \ - || defined(SIMD_NEON_ENABLE) \ - || defined(SIMD_MSA_ENABLE) -#define SIMD_ALIGN 16 -#elif defined (SIMD_X64_ENABLE) || defined(SIMD_PPC64_ENABLE) || defined(SIMD_ARM64_ENABLE) -#define SIMD_ALIGN 8 -#else -#define SIMD_ALIGN 4 -#endif - -#if (defined(SIMD_AVX512F_ENABLE) || defined(SIMD_AVX512BW_ENABLE) || defined(SIMD_AVX512VNNI_ENABLE)) -#ifdef SIMD_X64_ENABLE -#if defined(__GNUC__) || (defined(_MSC_VER) && _MSC_VER >= 1915) -#define SIMD_ZMM_COUNT 32 -#else -#define SIMD_ZMM_COUNT 16 -#endif -#else -#define SIMD_ZMM_COUNT 8 -#endif -#endif - -#define SIMD_CAT_DO(a, b) a##b -#define SIMD_CAT(a, b) SIMD_CAT_DO(a, b) - -#endif//__SimdDefs_h__ diff --git a/src/3rd/Simd/Simd/SimdDetection.h b/src/3rd/Simd/Simd/SimdDetection.h deleted file mode 100644 index c64bf3a7..00000000 --- a/src/3rd/Simd/Simd/SimdDetection.h +++ /dev/null @@ -1,352 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2019 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
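[Editor's note] The SimdDefs.h hunk above both gates each instruction set behind paired _DISABLE/_ENABLE macros and derives SIMD_ALIGN from the widest vector type compiled in (64 bytes for AVX-512, 32 for AVX/AVX2, 16 for SSE through NEON/MSA). A minimal sketch of how such a pair is typically consumed; the expansions below assume a GCC/Clang AVX2 build rather than the library's full detection logic:

#include <cstdint>
#include <iostream>

// Assumed expansions for a GCC/Clang AVX2 build; the real values come from
// the compiler and ISA checks in SimdDefs.h above.
#define SIMD_ALIGNED(x) __attribute__((aligned(x)))
#define SIMD_ALIGN 32

SIMD_ALIGNED(SIMD_ALIGN) static uint8_t buffer[1024];

int main()
{
    // Aligned storage lets kernels use aligned vector loads/stores unconditionally.
    std::cout << (reinterpret_cast<std::uintptr_t>(buffer) % SIMD_ALIGN) << std::endl; // 0
    return 0;
}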
-*/ -#ifndef __SimdDetection_h__ -#define __SimdDetection_h__ - -#include "Simd/SimdConst.h" -#include "Simd/SimdMemory.h" -#include "Simd/SimdEnable.h" - -#include "Simd/SimdView.hpp" - -#include - -namespace Simd -{ - namespace Detection - { - typedef Simd::View Image; - typedef Simd::Point Size; - typedef Simd::Rectangle Rect; - - struct Data : public Deletable - { - struct DTreeNode - { - int featureIdx; - float threshold; // for ordered features only - int left; - int right; - }; - - struct DTree - { - int nodeCount; - }; - - struct Stage - { - int first; - int ntrees; - float threshold; - }; - - struct Rect - { - int x, y, width, height; - Rect() : x(0), y(0), width(0), height(0) {} - }; - - struct WeightedRect - { - Rect r; - float weight; - WeightedRect() : weight(0) {} - }; - - struct HaarFeature - { - bool tilted; - enum { RECT_NUM = 3 }; - WeightedRect rect[RECT_NUM]; - }; - - struct LbpFeature - { - Rect rect; - }; - - bool isStumpBased; - bool hasTilted; - bool canInt16; - - int stageType; - SimdDetectionInfoFlags featureType; - int ncategories; - Size origWinSize; - - std::vector stages; - std::vector classifiers; - std::vector nodes; - std::vector leaves; - std::vector subsets; - - std::vector haarFeatures; - std::vector lbpFeatures; - - virtual ~Data() {} - }; - - struct HidBase : public Deletable - { - SimdDetectionInfoFlags featureType; - Size origWinSize; - bool isStumpBased; - bool isThroughColumn; - bool hasTilted; - bool isInt16; - int ncategories; - - virtual ~HidBase() {} - }; - - struct WeightedRect - { - uint32_t *p0, *p1, *p2, *p3; - float weight; - }; - - struct HidHaarFeature - { - WeightedRect rect[Data::HaarFeature::RECT_NUM]; - }; - - struct HidHaarStage - { - int first; - int ntrees; - float threshold; - bool hasThree; - bool canSkip; - }; - - struct HidHaarNode - { - int featureIdx; - int left; - int right; - float threshold; - }; - - struct HidHaarCascade : public HidBase - { - typedef HidHaarNode Node; - typedef std::vector Nodes; - - struct Tree - { - int nodeCount; - }; - typedef std::vector Trees; - - typedef HidHaarFeature Feature; - typedef std::vector Features; - - typedef HidHaarStage Stage; - typedef std::vector Stages; - - typedef float Leave; - typedef std::vector Leaves; - - typedef int ILeave; - typedef std::vector ILeaves; - - Nodes nodes; - Trees trees; - Stages stages; - Leaves leaves; - Features features; - - float windowArea; - float invWinArea; - uint32_t *pq[4]; - uint32_t *p[4]; - - Image sum, sqsum, tilted; - Image isum, itilted; - - virtual ~HidHaarCascade() - { - } - }; - - template struct HidLbpFeature - { - Rect rect; - const TSum * p[16]; - }; - - template struct HidLbpStage - { - int first; - int ntrees; - TWeight threshold; - }; - - template struct HidLbpCascade : public HidBase - { - struct Node - { - int featureIdx; - int left; - int right; - }; - typedef std::vector Nodes; - - struct Tree - { - int nodeCount; - }; - typedef std::vector Trees; - - typedef HidLbpStage Stage; - typedef std::vector Stages; - - typedef TWeight Leave; - typedef std::vector Leaves; - - typedef int Subset; - typedef std::vector Subsets; - - typedef HidLbpFeature Feature; - typedef std::vector Features; - - Nodes nodes; - Trees trees; - Stages stages; - Leaves leaves; - Subsets subsets; - Features features; - - Image sum; - Image isum; - - virtual ~HidLbpCascade() {} - }; - - template struct Buffer - { - Buffer(size_t size) - { - _p = Allocate(2 * size * sizeof(T)); - m = (T*)_p; - d = m + size; - } - - ~Buffer() - { - Free(_p); - } - - T *m, *d; - 
private: - void *_p; - }; - } - - namespace Base - { - using namespace Detection; - - SIMD_INLINE uint32_t Sum32i(uint32_t * const ptr[4], size_t offset) - { - return ptr[0][offset] - ptr[1][offset] - ptr[2][offset] + ptr[3][offset]; - } - - SIMD_INLINE float Norm32f(const HidHaarCascade & hid, size_t offset) - { - float sum = float(Sum32i(hid.p, offset)); - float sqsum = float(Sum32i(hid.pq, offset)); - float q = sqsum*hid.windowArea - sum *sum; - return q < 0.0f ? 1.0f : sqrtf(q); - } - - SIMD_INLINE int Norm16i(const HidHaarCascade & hid, size_t offset) - { - return Simd::Round(Norm32f(hid, offset)*hid.invWinArea); - } - - SIMD_INLINE float WeightedSum32f(const WeightedRect & rect, size_t offset) - { - uint32_t sum = rect.p0[offset] - rect.p1[offset] - rect.p2[offset] + rect.p3[offset]; - return rect.weight*sum; - } - - int Detect32f(const struct HidHaarCascade & hid, size_t offset, int startStage, float norm); - - template< class T> SIMD_INLINE T IntegralSum(const T * p0, const T * p1, const T * p2, const T * p3, ptrdiff_t offset) - { - return p0[offset] - p1[offset] - p2[offset] + p3[offset]; - } - - template< class T> SIMD_INLINE int Calculate(const HidLbpFeature & feature, ptrdiff_t offset) - { - T central = IntegralSum(feature.p[5], feature.p[6], feature.p[9], feature.p[10], offset); - - return - (IntegralSum(feature.p[0], feature.p[1], feature.p[4], feature.p[5], offset) >= central ? 128 : 0) | - (IntegralSum(feature.p[1], feature.p[2], feature.p[5], feature.p[6], offset) >= central ? 64 : 0) | - (IntegralSum(feature.p[2], feature.p[3], feature.p[6], feature.p[7], offset) >= central ? 32 : 0) | - (IntegralSum(feature.p[6], feature.p[7], feature.p[10], feature.p[11], offset) >= central ? 16 : 0) | - (IntegralSum(feature.p[10], feature.p[11], feature.p[14], feature.p[15], offset) >= central ? 8 : 0) | - (IntegralSum(feature.p[9], feature.p[10], feature.p[13], feature.p[14], offset) >= central ? 4 : 0) | - (IntegralSum(feature.p[8], feature.p[9], feature.p[12], feature.p[13], offset) >= central ? 2 : 0) | - (IntegralSum(feature.p[4], feature.p[5], feature.p[8], feature.p[9], offset) >= central ? 1 : 0); - } - - template inline int Detect(const HidLbpCascade & hid, size_t offset, int startStage) - { - typedef HidLbpCascade Hid; - - size_t subsetSize = (hid.ncategories + 31) / 32; - const int * subsets = hid.subsets.data(); - const typename Hid::Leave * leaves = hid.leaves.data(); - const typename Hid::Node * nodes = hid.nodes.data(); - const typename Hid::Stage * stages = hid.stages.data(); - if (startStage >= (int)hid.stages.size()) - return 1; - int nodeOffset = stages[startStage].first; - int leafOffset = 2 * nodeOffset; - for (int i_stage = startStage, n_stages = (int)hid.stages.size(); i_stage < n_stages; i_stage++) - { - const typename Hid::Stage & stage = stages[i_stage]; - TWeight sum = 0; - for (int i_tree = 0, n_trees = stage.ntrees; i_tree < n_trees; i_tree++) - { - const typename Hid::Node & node = nodes[nodeOffset]; - int c = Calculate(hid.features[node.featureIdx], offset); - const int * subset = subsets + nodeOffset*subsetSize; - sum += leaves[subset[c >> 5] & (1 << (c & 31)) ? 
leafOffset : leafOffset + 1]; - nodeOffset++; - leafOffset += 2; - } - if (sum < stage.threshold) - return -i_stage; - } - return 1; - } - } -} - -#endif//__SimdDetection_h__ diff --git a/src/3rd/Simd/Simd/SimdDetection.hpp b/src/3rd/Simd/Simd/SimdDetection.hpp deleted file mode 100644 index ed484f05..00000000 --- a/src/3rd/Simd/Simd/SimdDetection.hpp +++ /dev/null @@ -1,730 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar, -* 2019-2019 Facundo Galan. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#ifndef __SimdDetection_hpp__ -#define __SimdDetection_hpp__ - -#include "Simd/SimdLib.hpp" -#include "Simd/SimdParallel.hpp" - -#include -#include -#include - -#include - -#ifndef SIMD_CHECK_PERFORMANCE -#define SIMD_CHECK_PERFORMANCE() -#endif - -namespace Simd -{ - /*! @ingroup cpp_detection - - \short The Detection structure provides object detection with using of HAAR and LBP cascade classifiers. - - Using example (face detection in the image): - \code - #include "Simd/SimdDetection.hpp" - #include "Simd/SimdDrawing.hpp" - - int main() - { - typedef Simd::Detection Detection; - - Detection::View image; - image.Load("../../data/image/face/lena.pgm"); - - Detection detection; - - detection.Load("../../data/cascade/haar_face_0.xml"); - - detection.Init(image.Size()); - - Detection::Objects objects; - detection.Detect(image, objects); - - for (size_t i = 0; i < objects.size(); ++i) - Simd::DrawRectangle(image, objects[i].rect, uint8_t(255)); - - image.Save("result.pgm"); - - return 0; - } - \endcode - - Using example (face detection in the video captured by OpenCV): - \code - #include - #include - - #include "opencv2/opencv.hpp" - #ifndef SIMD_OPENCV_ENABLE - #define SIMD_OPENCV_ENABLE - #endif - #include "Simd/SimdDetection.hpp" - #include "Simd/SimdDrawing.hpp" - - int main(int argc, char * argv[]) - { - if (argc < 2) - { - std::cout << "You have to set video source! It can be 0 for camera or video file name." << std::endl; - return 1; - } - std::string source = argv[1]; - - cv::VideoCapture capture; - if (source == "0") - capture.open(0); - else - capture.open(source); - if (!capture.isOpened()) - { - std::cout << "Can't capture '" << source << "' !" 
<< std::endl; - return 1; - } - - typedef Simd::Detection Detection; - Detection detection; - detection.Load("../../data/cascade/haar_face_0.xml"); - bool inited = false; - - const char * WINDOW_NAME = "FaceDetection"; - cv::namedWindow(WINDOW_NAME, 1); - for (;;) - { - cv::Mat frame; - capture >> frame; - - Detection::View image = frame; - - if (!inited) - { - detection.Init(image.Size(), 1.2, image.Size() / 20); - inited = true; - } - - Detection::Objects objects; - detection.Detect(image, objects); - - for (size_t i = 0; i < objects.size(); ++i) - Simd::DrawRectangle(image, objects[i].rect, Simd::Pixel::Bgr24(0, 255, 255)); - - cv::imshow(WINDOW_NAME, frame); - if (cvWaitKey(1) == 27)// "press 'Esc' to break video"; - break; - } - return 0; - } - \endcode - - \note This is wrapper around low-level \ref object_detection API. - */ - template class A> - struct Detection - { - typedef A Allocator; /*!< Allocator type definition. */ - typedef Simd::View View; /*!< An image type definition. */ - typedef Simd::Point Size; /*!< An image size type definition. */ - typedef std::vector Sizes; /*!< A vector of image sizes type definition. */ - typedef Simd::Rectangle Rect; /*!< A rectangle type definition. */ - typedef std::vector Rects; /*!< A vector of rectangles type definition. */ - typedef int Tag; /*!< A tag type definition. */ - - static const Tag UNDEFINED_OBJECT_TAG = -1; /*!< The undefined object tag. */ - - /*! - \short The Object structure describes detected object. - - */ - struct Object - { - Rect rect; /*!< \brief A bounding box around of detected object. */ - int weight; /*!< \brief An object weight (number of elementary detections). */ - Tag tag; /*!< \brief An object tag. It's useful if more than one detector works. */ - - /*! - Creates a new Object structure. - - \param [in] r - initial bounding box. - \param [in] w - initial weight. - \param [in] t - initial tag. - */ - Object(const Rect & r = Rect(), int w = 0, Tag t = UNDEFINED_OBJECT_TAG) - : rect(r) - , weight(w) - , tag(t) - { - - } - - /*! - Creates a new Object structure on the base of another object. - - \param [in] o - another object. - */ - Object(const Object & o) - : rect(o.rect) - , weight(o.weight) - , tag(o.tag) - { - } - }; - typedef std::vector Objects; /*!< A vector of objects type defenition. */ - - /*! - Creates a new empty Detection structure. - */ - Detection() - { - } - - /*! - A Detection destructor. - */ - ~Detection() - { - for (size_t i = 0; i < _data.size(); ++i) - ::SimdRelease(_data[i].handle); - } - - /*! - Loads from file classifier cascade. Supports OpenCV HAAR and LBP cascades type. - You can call this function more than once if you want to use several object detectors at the same time. - - \note Tree based cascades and old cascade formats are not supported! - - \param [in] xml - a string containing XML with cascade. - \param [in] tag - an user defined tag. This tag will be inserted in output Object structure. - \return a result of this operation. - */ - bool LoadStringXml(const std::string & xml, Tag tag = UNDEFINED_OBJECT_TAG) - { - // Copy the received string to a non const char pointer. - char * xmlTmp = new char[xml.size() + 1]; - std::copy(xml.begin(), xml.end(), xmlTmp); - xmlTmp[xml.size()] = '\0'; - - Handle handle = ::SimdDetectionLoadStringXml(xmlTmp); - if (handle) - { - Data data; - data.handle = handle; - data.tag = tag; - ::SimdDetectionInfo(handle, (size_t*)&data.size.x, (size_t*)&data.size.y, &data.flags); - _data.push_back(data); - } - return handle != NULL; - } - - /*! 
- Loads a classifier cascade from file. Supports OpenCV HAAR and LBP cascade types. - You can call this function more than once if you want to use several object detectors at the same time. - - \note Tree-based cascades and old cascade formats are not supported! - - \param [in] path - a path to the cascade. - \param [in] tag - a user-defined tag. This tag will be inserted into the output Object structure. - \return a result of this operation. - */ - bool Load(const std::string & path, Tag tag = UNDEFINED_OBJECT_TAG) - { - Handle handle = ::SimdDetectionLoadA(path.c_str()); - if (handle) - { - Data data; - data.handle = handle; - data.tag = tag; - ::SimdDetectionInfo(handle, (size_t*)&data.size.x, (size_t*)&data.size.y, &data.flags); - _data.push_back(data); - } - return handle != NULL; - } - - /*! - Prepares the Detection structure to work with an image of the given size. - - \param [in] imageSize - a size of the input image. - \param [in] scaleFactor - a scale factor. To detect objects of different sizes the algorithm uses many scaled images. - This parameter defines the size difference between neighboring images and strongly affects performance. - \param [in] sizeMin - a minimal size of detected objects. This parameter strongly affects performance. - \param [in] sizeMax - a maximal size of detected objects. - \param [in] roi - an 8-bit image mask which defines the Region Of Interest. The user can restrict the detection region with this mask. - The mask affects the center of a detected object. - \param [in] threadNumber - a number of work threads. It is useful for multi-core CPUs. Use value -1 to choose the thread number automatically. - \return a result of this operation. - */ - bool Init(const Size & imageSize, double scaleFactor = 1.1, const Size & sizeMin = Size(0, 0), - const Size & sizeMax = Size(INT_MAX, INT_MAX), const View & roi = View(), ptrdiff_t threadNumber = -1) - { - if (_data.empty()) - return false; - _imageSize = imageSize; - ptrdiff_t threadNumberMax = std::thread::hardware_concurrency(); - _threadNumber = (threadNumber <= 0 || threadNumber > threadNumberMax) ? threadNumberMax : threadNumber; - return InitLevels(scaleFactor, sizeMin, sizeMax, roi); - } - - /*! - Detects objects in a given image. - - \param [in] src - an input image. - \param [out] objects - detected objects. - \param [in] groupSizeMin - a minimal weight (number of elementary detections) of a detected object. - \param [in] sizeDifferenceMax - a parameter to group elementary detections. - \param [in] motionMask - a flag to use motion detection. Useful to dynamically restrict the detection region in addition to the ROI. - \param [in] motionRegions - a set of rectangles (motion regions) that restrict the detection region in addition to the ROI. - The regions affect the center of a detected object. - \return a result of this operation.
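[Editor's note] Both feature types above are evaluated on integral images: with pointers p0..p3 at the top-left, top-right, bottom-left, and bottom-right corners of a rectangle, its pixel sum is p0 - p1 - p2 + p3 (see IntegralSum() and Sum32i() in the SimdDetection.h hunk). A standalone sketch of that identity, independent of the library:

#include <cstdint>
#include <iostream>
#include <vector>

int main()
{
    // Integral image I with one extra row/column: I[y][x] = sum of src over [0,y) x [0,x).
    const int w = 4, h = 3;
    const uint8_t src[3][4] = { { 1, 2, 3, 4 }, { 5, 6, 7, 8 }, { 9, 10, 11, 12 } };
    std::vector<std::vector<uint32_t>> I(h + 1, std::vector<uint32_t>(w + 1, 0));
    for (int y = 0; y < h; ++y)
        for (int x = 0; x < w; ++x)
            I[y + 1][x + 1] = src[y][x] + I[y][x + 1] + I[y + 1][x] - I[y][x];

    // Sum of the 2x2 rectangle at (1,1): tl - tr - bl + br, four reads per rectangle.
    uint32_t sum = I[1][1] - I[1][3] - I[3][1] + I[3][3];
    std::cout << sum << std::endl; // 6 + 7 + 10 + 11 = 34
    return 0;
}

The same four-corner reads drive Norm32f() above, which turns a window's sum and squared sum into the contrast-normalization factor applied to Haar stage thresholds.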
- */ - bool Detect(const View & src, Objects & objects, int groupSizeMin = 3, double sizeDifferenceMax = 0.2, - bool motionMask = false, const Rects & motionRegions = Rects()) - { - SIMD_CHECK_PERFORMANCE(); - - if (_levels.empty() || src.Size() != _imageSize) - return false; - - FillLevels(src); - - typedef std::map Candidates; - Candidates candidates; - - for (size_t i = 0; i < _levels.size(); ++i) - { - Level & level = *_levels[i]; - View mask = level.roi; - Rect rect = level.rect; - if (motionMask) - { - FillMotionMask(motionRegions, level, rect); - mask = level.mask; - } - if (rect.Empty()) - continue; - for (size_t j = 0; j < level.hids.size(); ++j) - { - Hid & hid = level.hids[j]; - - hid.Detect(mask, rect, level.dst, _threadNumber, level.throughColumn); - - AddObjects(candidates[hid.data->tag], level.dst, rect, hid.data->size, level.scale, - level.throughColumn ? 2 : 1, hid.data->tag); - } - } - - objects.clear(); - for (typename Candidates::iterator it = candidates.begin(); it != candidates.end(); ++it) - GroupObjects(objects, it->second, groupSizeMin, sizeDifferenceMax); - - return true; - } - - private: - - typedef void * Handle; - - struct Data - { - Handle handle; - Tag tag; - Size size; - ::SimdDetectionInfoFlags flags; - - bool Haar() const { return (flags&::SimdDetectionInfoFeatureMask) == ::SimdDetectionInfoFeatureHaar; } - bool Tilted() const { return (flags&::SimdDetectionInfoHasTilted) != 0; } - bool Int16() const { return (flags&::SimdDetectionInfoCanInt16) != 0; } - }; - - typedef void(*DetectPtr)(const void * hid, const uint8_t * mask, size_t maskStride, - ptrdiff_t left, ptrdiff_t top, ptrdiff_t right, ptrdiff_t bottom, uint8_t * dst, size_t dstStride); - - struct Worker; - typedef std::shared_ptr WorkerPtr; - typedef std::vector WorkerPtrs; - - struct Hid - { - Handle handle; - Data * data; - DetectPtr detect; - - void Detect(const View & mask, const Rect & rect, View & dst, size_t threadNumber, bool throughColumn) - { - SIMD_CHECK_PERFORMANCE(); - - Size s = dst.Size() - data->size; - View m = mask.Region(s, View::MiddleCenter); - Rect r = rect.Shifted(-data->size / 2).Intersection(Rect(s)); - Simd::Fill(dst, 0); - ::SimdDetectionPrepare(handle); - - Parallel(r.top, r.bottom, [&](size_t thread, size_t begin, size_t end) - { - detect(handle, m.data, m.stride, r.left, begin, r.right, end, dst.data, dst.stride); - }, rect.Area() >= (data->Haar() ? 10000 : 30000) ? threadNumber : 1, throughColumn ? 
2 : 1); - } - }; - typedef std::vector Hids; - - struct Level - { - Hids hids; - double scale; - - View src; - View roi; - View mask; - - Rect rect; - - View sum; - View sqsum; - View tilted; - - View dst; - - bool throughColumn; - bool needSqsum; - bool needTilted; - - ~Level() - { - for (size_t i = 0; i < hids.size(); ++i) - ::SimdRelease(hids[i].handle); - } - }; - typedef std::unique_ptr LevelPtr; - typedef std::vector LevelPtrs; - - std::vector _data; - Size _imageSize; - bool _needNormalization; - ptrdiff_t _threadNumber; - LevelPtrs _levels; - - bool InitLevels(double scaleFactor, const Size & sizeMin, const Size & sizeMax, const View & roi) - { - _needNormalization = false; - _levels.clear(); - _levels.reserve(100); - double scale = 1.0; - do - { - std::vector inserts(_data.size(), false); - bool exit = true, insert = false; - for (size_t i = 0; i < _data.size(); ++i) - { - Size windowSize = _data[i].size * scale; - if (windowSize.x <= sizeMax.x && windowSize.y <= sizeMax.y && - windowSize.x <= _imageSize.x && windowSize.y <= _imageSize.y) - { - if (windowSize.x >= sizeMin.x && windowSize.y >= sizeMin.y) - insert = inserts[i] = true; - exit = false; - } - } - if (exit) - break; - - if (insert) - { - _levels.push_back(LevelPtr(new Level())); - Level & level = *_levels.back(); - - level.scale = scale; - level.throughColumn = scale <= 2.0; - Size scaledSize(_imageSize / scale); - - level.src.Recreate(scaledSize, View::Gray8); - level.roi.Recreate(scaledSize, View::Gray8); - level.mask.Recreate(scaledSize, View::Gray8); - - level.sum.Recreate(scaledSize + Size(1, 1), View::Int32); - level.sqsum.Recreate(scaledSize + Size(1, 1), View::Int32); - level.tilted.Recreate(scaledSize + Size(1, 1), View::Int32); - - level.dst.Recreate(scaledSize, View::Gray8); - - level.needSqsum = false, level.needTilted = false; - for (size_t i = 0; i < _data.size(); ++i) - { - if (!inserts[i]) - continue; - Handle handle = ::SimdDetectionInit(_data[i].handle, level.sum.data, level.sum.stride, level.sum.width, level.sum.height, - level.sqsum.data, level.sqsum.stride, level.tilted.data, level.tilted.stride, level.throughColumn, _data[i].Int16()); - if (handle) - { - Hid hid; - hid.handle = handle; - hid.data = &_data[i]; - if (_data[i].Haar()) - hid.detect = level.throughColumn ? ::SimdDetectionHaarDetect32fi : ::SimdDetectionHaarDetect32fp; - else - { - if (_data[i].Int16()) - hid.detect = level.throughColumn ? ::SimdDetectionLbpDetect16ii : ::SimdDetectionLbpDetect16ip; - else - hid.detect = level.throughColumn ? 
::SimdDetectionLbpDetect32fi : ::SimdDetectionLbpDetect32fp; - } - level.hids.push_back(hid); - } - else - return false; - level.needSqsum = level.needSqsum | _data[i].Haar(); - level.needTilted = level.needTilted | _data[i].Tilted(); - _needNormalization = _needNormalization | _data[i].Haar(); - } - - level.rect = Rect(level.roi.Size()); - if (roi.format == View::None) - Simd::Fill(level.roi, 255); - else - { - Simd::ResizeBilinear(roi, level.roi); - Simd::Binarization(level.roi, 0, 255, 0, level.roi, SimdCompareGreater); - Simd::SegmentationShrinkRegion(level.roi, 255, level.rect); - } - } - scale *= scaleFactor; - } while (true); - return !_levels.empty(); - } - - void FillLevels(View src) - { - View gray; - if (src.format != View::Gray8) - { - gray.Recreate(src.Size(), View::Gray8); - Convert(src, gray); - src = gray; - } - - Simd::ResizeBilinear(src, _levels[0]->src); - if (_needNormalization) - Simd::NormalizeHistogram(_levels[0]->src, _levels[0]->src); - EstimateIntegral(*_levels[0]); - for (size_t i = 1; i < _levels.size(); ++i) - { - Simd::ResizeBilinear(_levels[0]->src, _levels[i]->src); - EstimateIntegral(*_levels[i]); - } - } - - void EstimateIntegral(Level & level) - { - if (level.needSqsum) - { - if (level.needTilted) - Simd::Integral(level.src, level.sum, level.sqsum, level.tilted); - else - Simd::Integral(level.src, level.sum, level.sqsum); - } - else - Simd::Integral(level.src, level.sum); - } - - void FillMotionMask(const Rects & rects, Level & level, Rect & rect) const - { - Simd::Fill(level.mask, 0); - rect = Rect(); - for (size_t i = 0; i < rects.size(); i++) - { - Rect r = rects[i] / level.scale; - rect |= r; - Simd::Fill(level.mask.Region(r).Ref(), 0xFF); - } - rect &= level.rect; - Simd::OperationBinary8u(level.mask, level.roi, level.mask, SimdOperationBinary8uAnd); - } - - void AddObjects(Objects & objects, const View & dst, const Rect & rect, const Size & size, double scale, size_t step, Tag tag) - { - Size s = dst.Size() - size; - Rect r = rect.Shifted(-size / 2).Intersection(Rect(s)); - for (ptrdiff_t row = r.top; row < r.bottom; row += step) - { - const uint8_t * mask = dst.data + row*dst.stride; - for (ptrdiff_t col = r.left; col < r.right; col += step) - { - if (mask[col] != 0) - objects.push_back(Object(Rect(col, row, col + size.x, row + size.y)*scale, 1, tag)); - } - } - } - - struct Similar - { - Similar(double sizeDifferenceMax) - : _sizeDifferenceMax(sizeDifferenceMax) - {} - - SIMD_INLINE bool operator() (const Object & o1, const Object & o2) const - { - const Rect & r1 = o1.rect; - const Rect & r2 = o2.rect; - double delta = _sizeDifferenceMax*(std::min(r1.Width(), r2.Width()) + std::min(r1.Height(), r2.Height()))*0.5; - return - std::abs(r1.left - r2.left) <= delta && std::abs(r1.top - r2.top) <= delta && - std::abs(r1.right - r2.right) <= delta && std::abs(r1.bottom - r2.bottom) <= delta; - } - - private: - double _sizeDifferenceMax; - }; - - template int Partition(const std::vector & vec, std::vector & labels, double sizeDifferenceMax) - { - Similar similar(sizeDifferenceMax); - int i, j, N = (int)vec.size(); - const int PARENT = 0; - const int RANK = 1; - - std::vector _nodes(N * 2); - int(*nodes)[2] = (int(*)[2])&_nodes[0]; - - for (i = 0; i < N; i++) - { - nodes[i][PARENT] = -1; - nodes[i][RANK] = 0; - } - - for (i = 0; i < N; i++) - { - int root = i; - while (nodes[root][PARENT] >= 0) - root = nodes[root][PARENT]; - - for (j = 0; j < N; j++) - { - if (i == j || !similar(vec[i], vec[j])) - continue; - int root2 = j; - - while 
(nodes[root2][PARENT] >= 0) - root2 = nodes[root2][PARENT]; - - if (root2 != root) - { - int rank = nodes[root][RANK], rank2 = nodes[root2][RANK]; - if (rank > rank2) - nodes[root2][PARENT] = root; - else - { - nodes[root][PARENT] = root2; - nodes[root2][RANK] += rank == rank2; - root = root2; - } - assert(nodes[root][PARENT] < 0); - - int k = j, parent; - while ((parent = nodes[k][PARENT]) >= 0) - { - nodes[k][PARENT] = root; - k = parent; - } - - k = i; - while ((parent = nodes[k][PARENT]) >= 0) - { - nodes[k][PARENT] = root; - k = parent; - } - } - } - } - - labels.resize(N); - int nclasses = 0; - - for (i = 0; i < N; i++) - { - int root = i; - while (nodes[root][PARENT] >= 0) - root = nodes[root][PARENT]; - if (nodes[root][RANK] >= 0) - nodes[root][RANK] = ~nclasses++; - labels[i] = ~nodes[root][RANK]; - } - - return nclasses; - } - - void GroupObjects(Objects & dst, const Objects & src, size_t groupSizeMin, double sizeDifferenceMax) - { - if (groupSizeMin == 0 || src.size() < groupSizeMin) - return; - - std::vector labels; - int nclasses = Partition(src, labels, sizeDifferenceMax); - - Objects buffer; - buffer.resize(nclasses); - for (size_t i = 0; i < labels.size(); ++i) - { - int cls = labels[i]; - buffer[cls].rect += src[i].rect; - buffer[cls].weight++; - buffer[cls].tag = src[i].tag; - } - - for (size_t i = 0; i < buffer.size(); i++) - buffer[i].rect = buffer[i].rect / double(buffer[i].weight); - - for (size_t i = 0; i < buffer.size(); i++) - { - Rect r1 = buffer[i].rect; - int n1 = buffer[i].weight; - if (n1 < (int)groupSizeMin) - continue; - - size_t j; - for (j = 0; j < buffer.size(); j++) - { - int n2 = buffer[j].weight; - - if (j == i || n2 < (int)groupSizeMin) - continue; - - Rect r2 = buffer[j].rect; - - int dx = Simd::Round(r2.Width() * sizeDifferenceMax); - int dy = Simd::Round(r2.Height() * sizeDifferenceMax); - - if (i != j && (n2 > std::max(3, n1) || n1 < 3) && - r1.left >= r2.left - dx && r1.top >= r2.top - dy && - r1.right <= r2.right + dx && r1.bottom <= r2.bottom + dy) - break; - } - - if (j == buffer.size()) - dst.push_back(buffer[i]); - } - } - }; -} - -#endif//__SimdDetection_hpp__ diff --git a/src/3rd/Simd/Simd/SimdDrawing.hpp b/src/3rd/Simd/Simd/SimdDrawing.hpp deleted file mode 100644 index bf9c5409..00000000 --- a/src/3rd/Simd/Simd/SimdDrawing.hpp +++ /dev/null @@ -1,411 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
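[Editor's note] The Partition() routine above is a textbook disjoint-set (union-find) pass with union by rank and path compression: every pair of rectangles accepted by the Similar predicate is merged, and the surviving roots become the class labels GroupObjects() averages over. The same idea in a compact, self-contained form (illustrative, not the library's implementation):

#include <iostream>
#include <numeric>
#include <vector>

// Find with path halving: each step re-points a node at its grandparent.
static int Find(std::vector<int> & parent, int x)
{
    while (parent[x] != x)
        x = parent[x] = parent[parent[x]];
    return x;
}

int main()
{
    // Assume 5 detections; pairs (0,1), (1,2) and (3,4) passed a Similar() test.
    std::vector<int> parent(5);
    std::iota(parent.begin(), parent.end(), 0);
    const int pairs[3][2] = { { 0, 1 }, { 1, 2 }, { 3, 4 } };
    for (int i = 0; i < 3; ++i)
        parent[Find(parent, pairs[i][0])] = Find(parent, pairs[i][1]);
    for (int i = 0; i < 5; ++i)
        std::cout << i << " -> class " << Find(parent, i) << std::endl; // 0,1,2 share one root; 3,4 another
    return 0;
}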
-*/ -#ifndef __SimdDrawing_hpp__ -#define __SimdDrawing_hpp__ - -#include "Simd/SimdView.hpp" - -#include - -namespace Simd -{ - /*! @ingroup cpp_drawing - - \fn void DrawLine(View & canvas, ptrdiff_t x1, ptrdiff_t y1, ptrdiff_t x2, ptrdiff_t y2, const Color & color, size_t width = 1) - - \short Draws a line at the image. - - \param [out] canvas - a canvas (image where we draw line). - \param [in] x1 - X coordinate of the first point of the line. - \param [in] y1 - Y coordinate of the first point of the line. - \param [in] x2 - X coordinate of the second point of the line. - \param [in] y2 - Y coordinate of the second point of the line. - \param [in] color - a color of the line. - \param [in] width - a width of the line. By default it is equal to 1. - */ - template class A, class Color> SIMD_INLINE void DrawLine(View & canvas, ptrdiff_t x1, ptrdiff_t y1, ptrdiff_t x2, ptrdiff_t y2, const Color & color, size_t width = 1) - { - assert(canvas.PixelSize() == sizeof(Color)); - - const ptrdiff_t w = canvas.width - 1; - const ptrdiff_t h = canvas.height - 1; - - if (x1 < 0 || y1 < 0 || x1 > w || y1 > h || x2 < 0 || y2 < 0 || x2 > w || y2 > h) - { - if ((x1 < 0 && x2 < 0) || (y1 < 0 && y2 < 0) || (x1 > w && x2 > w) || (y1 > h && y2 > h)) - return; - - if (y1 == y2) - { - x1 = std::min(std::max(x1, 0), w); - x2 = std::min(std::max(x2, 0), w); - } - else if (x1 == x2) - { - y1 = std::min(std::max(y1, 0), h); - y2 = std::min(std::max(y2, 0), h); - } - else - { - ptrdiff_t x0 = (x1*y2 - y1*x2) / (y2 - y1); - ptrdiff_t y0 = (y1*x2 - x1*y2) / (x2 - x1); - ptrdiff_t xh = (x1*y2 - y1*x2 + h*(x2 - x1)) / (y2 - y1); - ptrdiff_t yw = (y1*x2 - x1*y2 + w*(y2 - y1)) / (x2 - x1); - - if (x1 < 0) - { - x1 = 0; - y1 = y0; - } - if (x2 < 0) - { - x2 = 0; - y2 = y0; - } - if (x1 > w) - { - x1 = w; - y1 = yw; - } - if (x2 > w) - { - x2 = w; - y2 = yw; - } - if ((y1 < 0 && y2 < 0) || (y1 > h && y2 > h)) - return; - - if (y1 < 0) - { - x1 = x0; - y1 = 0; - } - if (y2 < 0) - { - x2 = x0; - y2 = 0; - } - - if (y1 > h) - { - x1 = xh; - y1 = h; - } - if (y2 > h) - { - x2 = xh; - y2 = h; - } - } - } - - const bool inverse = std::abs(y2 - y1) > std::abs(x2 - x1); - if (inverse) - { - std::swap(x1, y1); - std::swap(x2, y2); - } - - if (x1 > x2) - { - std::swap(x1, x2); - std::swap(y1, y2); - } - - const double dx = double(x2 - x1); - const double dy = (double)std::abs(y2 - y1); - - double error = dx / 2.0f; - const ptrdiff_t ystep = (y1 < y2) ? 1 : -1; - ptrdiff_t y0 = y1 - width / 2; - - for (ptrdiff_t x = x1; x <= x2; x++) - { - for (size_t i = 0; i < width; ++i) - { - ptrdiff_t y = y0 + i; - if (y >= 0) - { - if (inverse) - { - if (y < w) - At(canvas, y, x) = color; - } - else - { - if (y < h) - At(canvas, x, y) = color; - } - } - - } - - error -= dy; - if (error < 0) - { - y0 += ystep; - error += dx; - } - } - } - - /*! @ingroup cpp_drawing - - \fn void DrawLine(View & canvas, const Point & p1, const Point & p2, const Color & color, size_t width = 1) - - \short Draws a line at the image. - - \param [out] canvas - a canvas (image where we draw line). - \param [in] p1 - the first point of the line. - \param [in] p2 - the second point of the line. - \param [in] color - a color of the line. - \param [in] width - a width of the line. By default it is equal to 1. - */ - template class A, class Color> SIMD_INLINE void DrawLine(View & canvas, const Point & p1, const Point & p2, const Color & color, size_t width = 1) - { - DrawLine(canvas, p1.x, p1.y, p2.x, p2.y, color, width); - } - - /*! 
@ingroup cpp_drawing - - \fn void DrawRectangle(View & canvas, const Rectangle & rect, const Color & color, size_t width = 1) - - \short Draws a rectangle at the image. - - \param [out] canvas - a canvas (image where we draw rectangle). - \param [in] rect - a rectangle. - \param [in] color - a color of the rectangle frame. - \param [in] width - a width of the rectangle frame. By default it is equal to 1. - */ - template class A, class Color> SIMD_INLINE void DrawRectangle(View & canvas, const Rectangle & rect, const Color & color, size_t width = 1) - { - DrawLine(canvas, rect.left, rect.top, rect.right, rect.top, color, width); - DrawLine(canvas, rect.right, rect.top, rect.right, rect.bottom, color, width); - DrawLine(canvas, rect.right, rect.bottom, rect.left, rect.bottom, color, width); - DrawLine(canvas, rect.left, rect.bottom, rect.left, rect.top, color, width); - } - - /*! @ingroup cpp_drawing - - \fn void DrawRectangle(View & canvas, const Point & topLeft, const Point & bottomRight, const Color & color, size_t width = 1) - - \short Draws a rectangle at the image. - - \param [out] canvas - a canvas (image where we draw rectangle). - \param [in] topLeft - a top-left corner of the rectangle. - \param [in] bottomRight - a bottom-right corner of the rectangle. - \param [in] color - a color of the rectangle frame. - \param [in] width - a width of the rectangle frame. By default it is equal to 1. - */ - template class A, class Color> SIMD_INLINE void DrawRectangle(View & canvas, const Point & topLeft, const Point & bottomRight, const Color & color, size_t width = 1) - { - DrawRectangle(canvas, Rectangle(topLeft, bottomRight), color, width); - } - - /*! @ingroup cpp_drawing - - \fn void DrawRectangle(View & canvas, ptrdiff_t left, ptrdiff_t top, ptrdiff_t right, ptrdiff_t bottom, const Color & color, size_t width = 1) - - \short Draws a rectangle at the image. - - \param [out] canvas - a canvas (image where we draw rectangle). - \param [in] left - the left side of the rectangle. - \param [in] top - the top side of the rectangle. - \param [in] right - the right side of the rectangle. - \param [in] bottom - the bottom side of the rectangle. - \param [in] color - a color of the rectangle frame. - \param [in] width - a width of the rectangle frame. By default it is equal to 1. - */ - template class A, class Color> SIMD_INLINE void DrawRectangle(View & canvas, ptrdiff_t left, ptrdiff_t top, ptrdiff_t right, ptrdiff_t bottom, const Color & color, size_t width = 1) - { - DrawRectangle(canvas, Rectangle(left, top, right, bottom), color, width); - } - - /*! @ingroup cpp_drawing - - \fn void DrawFilledRectangle(View & canvas, Rectangle rect, const Color & color) - - \short Draws a filled rectangle at the image. - - \param [out] canvas - a canvas (image where we draw filled rectangle). - \param [in] rect - a rectangle to fill. - \param [in] color - a color of the filled rectangle. - */ - template class A, class Color> SIMD_INLINE void DrawFilledRectangle(View & canvas, Rectangle rect, const Color & color) - { - assert(canvas.PixelSize() == sizeof(color)); - - if (sizeof(Color) <= 4) - Simd::FillPixel(canvas.Region(rect).Ref(), color); - else - { - rect &= Rectangle(canvas.Size()); - for (ptrdiff_t row = rect.top; row < rect.bottom; ++row) - { - Color * dst = &At(canvas, 0, row); - for (ptrdiff_t col = rect.left; col < rect.right; ++col) - dst[col] = color; - } - } - } - - /*!
@ingroup cpp_drawing - - \fn void DrawPolygon(View & canvas, const std::vector> & polygon, const Color & color, size_t width = 1) - - \short Draws a polygon at the image. - - \param [out] canvas - a canvas (image where we draw polygon). - \param [in] polygon - a polygon. - \param [in] color - a color of the polygon. - \param [in] width - a width of the polygon. By default it is equal to 1. - */ - template class A, class Color> SIMD_INLINE void DrawPolygon(View & canvas, const std::vector> & polygon, const Color & color, size_t width = 1) - { - assert(canvas.PixelSize() == sizeof(color)); - - typedef Simd::Point Point; - - for (size_t i = 0; i < polygon.size(); ++i) - { - const Point & p1 = (i ? polygon[i - 1] : polygon.back()), p2 = polygon[i]; - DrawLine(canvas, p1, p2, color, width); - } - } - - /*! @ingroup cpp_drawing - - \fn void DrawFilledPolygon(View & canvas, const std::vector> & polygon, const Color & color) - - \short Draws a filled polygon at the image. - - \param [out] canvas - a canvas (image where we draw filled polygon). - \param [in] polygon - a polygon. - \param [in] color - a color of the polygon frame. - */ - template class A, class Color> SIMD_INLINE void DrawFilledPolygon(View & canvas, const std::vector> & polygon, const Color & color) - { - assert(canvas.PixelSize() == sizeof(color)); - - typedef Simd::Point Point; - typedef std::vector Vector; - - ptrdiff_t top = canvas.height, bottom = 0; - for (size_t i = 0; i < polygon.size(); ++i) - { - top = std::min(top, polygon[i].y); - bottom = std::max(bottom, polygon[i].y); - } - top = std::max(0, top); - bottom = std::min(bottom, canvas.height); - - for (ptrdiff_t y = top; y < bottom; ++y) - { - Vector intersections; - for (size_t i = 0; i < polygon.size(); ++i) - { - const Point & p0 = (i ? polygon[i - 1] : polygon.back()), p1 = polygon[i]; - if ((y >= p0.y && y < p1.y) || (y >= p1.y && y < p0.y)) - intersections.push_back(p0.x + (y - p0.y)*(p1.x - p0.x) / (p1.y - p0.y)); - } - assert(intersections.size() % 2 == 0); - std::sort(intersections.begin(), intersections.end()); - for (size_t i = 0; i < intersections.size(); i += 2) - { - ptrdiff_t left = std::max(0, intersections[i + 0]); - ptrdiff_t right = std::min(canvas.width, intersections[i + 1]); - Color * dst = &At(canvas, 0, y); - for (ptrdiff_t x = left; x < right; ++x) - dst[x] = color; - } - } - } - - /*! @ingroup cpp_drawing - - \fn void DrawEllipse(View & canvas, const Point & center, const Point & axes, double slope, const Color & color, size_t width = 1) - - \short Draws an ellipse at the image. - - \param [out] canvas - a canvas (image where we draw ellipse). - \param [in] center - a center of the ellipse. - \param [in] axes - axes of the ellipse. - \param [in] slope - a slope of the ellipse. - \param [in] color - a color of the ellipse. - \param [in] width - a width of the ellipse. 
- */ - template class A, class Color> SIMD_INLINE void DrawEllipse(View & canvas, const Point & center, const Point & axes, double slope, const Color & color, size_t width = 1) - { - assert(canvas.PixelSize() == sizeof(color)); - - const size_t n = 8 * std::max((size_t)1, (size_t)::pow(axes.x*axes.x + axes.y*axes.y, 0.25)); - double ss = ::sin(slope); - double sc = ::cos(slope); - double px, py, da = 2 * M_PI / n; - for (size_t i = 0; i <= n; ++i) - { - double a = i*da; - double ax = ::sin(a)*axes.x; - double ay = ::cos(a)*axes.y; - double cx = ax*sc + ay*ss + center.x; - double cy = ay*sc - ax*ss + center.y; - if (i > 0) - DrawLine(canvas, (ptrdiff_t)cx, (ptrdiff_t)cy, (ptrdiff_t)px, (ptrdiff_t)py, color, width); - px = cx; - py = cy; - } - } - - /*! @ingroup cpp_drawing - - \fn void DrawCircle(View & canvas, const Point & center, ptrdiff_t radius, const Color & color, size_t width = 1) - - \short Draws a circle at the image. - - \param [out] canvas - a canvas (image where we draw circle). - \param [in] center - a center of the circle. - \param [in] radius - a radius of the circle. - \param [in] color - a color of the circle. - \param [in] width - a width of the circle. - */ - template class A, class Color> SIMD_INLINE void DrawCircle(View & canvas, const Point & center, ptrdiff_t radius, const Color & color, size_t width = 1) - { - assert(canvas.PixelSize() == sizeof(color)); - - const size_t n = 8 * std::max((size_t)1, (size_t)::pow(radius, 0.5)); - double px, py, da = 2 * M_PI / n; - for (size_t i = 0; i <= n; ++i) - { - double a = i*da; - double cx = radius*::cos(a) + center.x; - double cy = radius*::sin(a) + center.y; - if (i > 0) - DrawLine(canvas, (ptrdiff_t)cx, (ptrdiff_t)cy, (ptrdiff_t)px, (ptrdiff_t)py, color, width); - px = cx; - py = cy; - } - } -} - -#endif//__SimdDrawing_hpp__ diff --git a/src/3rd/Simd/Simd/SimdEnable.h b/src/3rd/Simd/Simd/SimdEnable.h deleted file mode 100644 index 6905d173..00000000 --- a/src/3rd/Simd/Simd/SimdEnable.h +++ /dev/null @@ -1,733 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
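[Editor's note] DrawEllipse() and DrawCircle() above rasterize by sampling the parametric curve at n = 8 * max(1, r^0.5) points and connecting neighbors with DrawLine(), so the segment count grows with radius and the chord error stays small. The same sampling rule rendered to an ASCII grid instead of a Simd::View (a sketch, not library code):

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <cstring>

int main()
{
    const double pi = 3.14159265358979323846;
    const int R = 7, W = 2 * R + 3, H = 2 * R + 3;
    char grid[H][W + 1];
    for (int y = 0; y < H; ++y)
    {
        memset(grid[y], '.', W);
        grid[y][W] = '\0';
    }
    // Same point-count rule as DrawCircle() above.
    const size_t n = 8 * (size_t)std::max(1.0, std::pow((double)R, 0.5));
    const double da = 2 * pi / n;
    for (size_t i = 0; i <= n; ++i)
    {
        const int x = (int)(R * std::cos(i * da)) + R + 1;
        const int y = (int)(R * std::sin(i * da)) + R + 1;
        grid[y][x] = '#'; // the library would DrawLine() between consecutive samples
    }
    for (int y = 0; y < H; ++y)
        puts(grid[y]);
    return 0;
}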
-*/ -#ifndef __SimdEnable_h__ -#define __SimdEnable_h__ - -#include "Simd/SimdDefs.h" - -#if defined(_MSC_VER) - -#ifndef NOMINMAX -#define NOMINMAX -#endif -#include -#include - -#elif defined(__GNUC__) - -#if defined(SIMD_X86_ENABLE) || defined(SIMD_X64_ENABLE) -#include -#endif - -#if defined(SIMD_PPC_ENABLE) || defined(SIMD_PPC64_ENABLE) || defined(SIMD_ARM_ENABLE) || defined(SIMD_ARM64_ENABLE) -#include -#include -#include -#if defined(SIMD_ARM_ENABLE) || defined(SIMD_ARM64_ENABLE) -#include -#endif -#endif - -#else -# error Do not know how to detect CPU info -#endif - -namespace Simd -{ -#if defined(SIMD_X86_ENABLE) || defined(SIMD_X64_ENABLE) - namespace Cpuid - { - // See http://www.sandpile.org/x86/cpuid.htm for additional information. - enum Level - { - Ordinary = 1, - Extended = 7, - }; - - enum Register - { - Eax = 0, - Ebx = 1, - Ecx = 2, - Edx = 3, - }; - - enum Bit - { - // Ordinary: - // Edx: - SSE = 1 << 25, - SSE2 = 1 << 26, - - // Ecx: - SSE3 = 1 << 0, - SSSE3 = 1 << 9, - FMA = 1 << 12, - SSE41 = 1 << 19, - SSE42 = 1 << 20, - OSXSAVE = 1 << 27, - AVX = 1 << 28, - F16C = 1 << 29, - - // Extended: - // Ebx: - AVX2 = 1 << 5, - AVX512F = 1 << 16, - AVX512DQ = 1 << 17, - AVX512CD = 1 << 28, - AVX512BW = 1 << 30, - AVX512VL = 1 << 31, - - // Ecx: - AVX512VBMI = 1 << 1, - AVX512VNNI = 1 << 11, - }; - - SIMD_INLINE bool CheckBit(Level level, Register index, Bit bit) - { - unsigned int registers[4] = { 0, 0, 0, 0 }; -#if defined(_MSC_VER) - __cpuid((int*)registers, level); -#elif (defined __GNUC__) - if (__get_cpuid_max(0, NULL) < level) - return false; - __cpuid_count(level, 0, registers[Eax], registers[Ebx], registers[Ecx], registers[Edx]); -#else -#error Do not know how to detect CPU info! -#endif - return (registers[index] & bit) == bit; - } - } -#endif//defined(SIMD_X86_ENABLE) || defined(SIMD_X64_ENABLE) - -#if defined(__GNUC__) && (defined(SIMD_PPC_ENABLE) || defined(SIMD_PPC64_ENABLE) || defined(SIMD_ARM_ENABLE) || defined(SIMD_ARM64_ENABLE)) - namespace CpuInfo - { - SIMD_INLINE bool CheckBit(int at, int bit) - { - bool result = false; - int file = ::open("/proc/self/auxv", O_RDONLY); - if (file < 0) - return false; - const ssize_t size = 64; - unsigned long buffer[size]; - for (ssize_t count = size; count == size;) - { - count = ::read(file, buffer, sizeof(buffer)) / sizeof(unsigned long); - for (int i = 0; i < count; i += 2) - { - if (buffer[i] == (unsigned)at) - { - result = !!(buffer[i + 1] & bit); - count = 0; - } - if (buffer[i] == AT_NULL) - count = 0; - } - } - ::close(file); - return result; - } - } -#endif//defined(__GNUC__) && (defined(SIMD_PPC_ENABLE) || defined(SIMD_PPC64_ENABLE) || defined(SIMD_ARM_ENABLE) || defined(SIMD_ARM64_ENABLE)) - -#ifdef SIMD_SSE_ENABLE - namespace Sse - { - SIMD_INLINE bool SupportedByCPU() - { - return Cpuid::CheckBit(Cpuid::Ordinary, Cpuid::Edx, Cpuid::SSE); - } - - SIMD_INLINE bool SupportedByOS() - { -#if defined(_MSC_VER) - __try - { - __m128 value = _mm_set1_ps(1.0f);// try to execute of SSE instructions; - return true; - } - __except (EXCEPTION_EXECUTE_HANDLER) - { - return false; - } -#else - return true; -#endif - } - - const bool Enable = SupportedByCPU() && SupportedByOS(); - } -#endif - -#ifdef SIMD_SSE2_ENABLE - namespace Sse2 - { - SIMD_INLINE bool SupportedByCPU() - { - return Cpuid::CheckBit(Cpuid::Ordinary, Cpuid::Edx, Cpuid::SSE2); - } - - SIMD_INLINE bool SupportedByOS() - { -#if defined(_MSC_VER) - __try - { - __m128d value = _mm_set1_pd(1.0);// try to execute of SSE2 instructions; - return true; - } - __except 
(EXCEPTION_EXECUTE_HANDLER) - { - return false; - } -#else - return true; -#endif - } - - const bool Enable = SupportedByCPU() && SupportedByOS(); - } -#endif - -#ifdef SIMD_SSE3_ENABLE - namespace Sse3 - { - SIMD_INLINE bool SupportedByCPU() - { - return Cpuid::CheckBit(Cpuid::Ordinary, Cpuid::Ecx, Cpuid::SSE3); - } - - SIMD_INLINE bool SupportedByOS() - { -#if defined(_MSC_VER) - __try - { - __m128 value = _mm_hadd_ps(_mm_set1_ps(1.0f), _mm_set1_ps(2.0f)); //try to execute of SSE3 instructions; - return true; - } - __except (EXCEPTION_EXECUTE_HANDLER) - { - return false; - } -#else - return true; -#endif - } - - const bool Enable = SupportedByCPU() && SupportedByOS(); - } -#endif - -#ifdef SIMD_SSSE3_ENABLE - namespace Ssse3 - { - SIMD_INLINE bool SupportedByCPU() - { - return Cpuid::CheckBit(Cpuid::Ordinary, Cpuid::Ecx, Cpuid::SSSE3); - } - - SIMD_INLINE bool SupportedByOS() - { -#if defined(_MSC_VER) - __try - { - __m128i value = _mm_abs_epi8(_mm_set1_epi8(-1)); //try to execute of SSSE3 instructions; - return true; - } - __except (EXCEPTION_EXECUTE_HANDLER) - { - return false; - } -#else - return true; -#endif - } - - const bool Enable = SupportedByCPU() && SupportedByOS(); - } -#endif - -#ifdef SIMD_SSE41_ENABLE - namespace Sse41 - { - SIMD_INLINE bool SupportedByCPU() - { - return Cpuid::CheckBit(Cpuid::Ordinary, Cpuid::Ecx, Cpuid::SSE41); - } - - SIMD_INLINE bool SupportedByOS() - { -#if defined(_MSC_VER) - __try - { - int value = _mm_testz_si128(_mm_set1_epi8(0), _mm_set1_epi8(-1)); // try to execute of SSE41 instructions; - return true; - } - __except (EXCEPTION_EXECUTE_HANDLER) - { - return false; - } -#else - return true; -#endif - } - - const bool Enable = SupportedByCPU() && SupportedByOS(); - } -#endif - -#ifdef SIMD_SSE42_ENABLE - namespace Sse42 - { - SIMD_INLINE bool SupportedByCPU() - { - return Cpuid::CheckBit(Cpuid::Ordinary, Cpuid::Ecx, Cpuid::SSE42); - } - - SIMD_INLINE bool SupportedByOS() - { -#if defined(_MSC_VER) - __try - { - uint32_t value = _mm_crc32_u8(0, 1); // try to execute of SSE42 instructions; - return true; - } - __except (EXCEPTION_EXECUTE_HANDLER) - { - return false; - } -#else - return true; -#endif - } - - const bool Enable = SupportedByCPU() && SupportedByOS(); - } -#endif - -#ifdef SIMD_AVX_ENABLE - namespace Avx - { - SIMD_INLINE bool SupportedByCPU() - { - return - Cpuid::CheckBit(Cpuid::Ordinary, Cpuid::Ecx, Cpuid::OSXSAVE) && - Cpuid::CheckBit(Cpuid::Ordinary, Cpuid::Ecx, Cpuid::AVX); - } - - SIMD_INLINE bool SupportedByOS() - { -#if defined(_MSC_VER) - __try - { - __m256d value = _mm256_set1_pd(1.0);// try to execute of AVX instructions; - return true; - } - __except (EXCEPTION_EXECUTE_HANDLER) - { - return false; - } -#else - return true; -#endif - } - - const bool Enable = SupportedByCPU() && SupportedByOS(); - } -#endif - -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - SIMD_INLINE bool SupportedByCPU() - { - return - Cpuid::CheckBit(Cpuid::Ordinary, Cpuid::Ecx, Cpuid::OSXSAVE) && - Cpuid::CheckBit(Cpuid::Extended, Cpuid::Ebx, Cpuid::AVX2) && - Cpuid::CheckBit(Cpuid::Ordinary, Cpuid::Ecx, Cpuid::FMA) && - Cpuid::CheckBit(Cpuid::Ordinary, Cpuid::Ecx, Cpuid::F16C); - } - - SIMD_INLINE bool SupportedByOS() - { -#if defined(_MSC_VER) - __try - { - __m256i value = _mm256_abs_epi8(_mm256_set1_epi8(1));// try to execute of AVX2 instructions; - return true; - } - __except (EXCEPTION_EXECUTE_HANDLER) - { - return false; - } -#else - return true; -#endif - } - - const bool Enable = SupportedByCPU() && SupportedByOS(); - } -#endif - -#ifdef 
SIMD_AVX512F_ENABLE - namespace Avx512f - { - SIMD_INLINE bool SupportedByCPU() - { - return - Cpuid::CheckBit(Cpuid::Extended, Cpuid::Ebx, Cpuid::AVX512F) && - Cpuid::CheckBit(Cpuid::Extended, Cpuid::Ebx, Cpuid::AVX512CD); - } - - SIMD_INLINE bool SupportedByOS() - { -#if defined(_MSC_VER) - __try - { - __m512d value = _mm512_set1_pd(1.0);// try to execute of AVX-512F instructions; - return true; - } - __except (EXCEPTION_EXECUTE_HANDLER) - { - return false; - } -#else - return true; -#endif - } - - const bool Enable = SupportedByCPU() && SupportedByOS(); - } -#endif - -#ifdef SIMD_AVX512BW_ENABLE - namespace Avx512bw - { - SIMD_INLINE bool SupportedByCPU() - { - return - Cpuid::CheckBit(Cpuid::Extended, Cpuid::Ebx, Cpuid::AVX512F) && - Cpuid::CheckBit(Cpuid::Extended, Cpuid::Ebx, Cpuid::AVX512CD) && - Cpuid::CheckBit(Cpuid::Extended, Cpuid::Ebx, Cpuid::AVX512DQ) && - Cpuid::CheckBit(Cpuid::Extended, Cpuid::Ebx, Cpuid::AVX512BW) && - Cpuid::CheckBit(Cpuid::Extended, Cpuid::Ebx, Cpuid::AVX512VL); - } - - SIMD_INLINE bool SupportedByOS() - { -#if defined(_MSC_VER) - __try - { - __m512i value = _mm512_abs_epi8(_mm512_set1_epi8(1));// try to execute of AVX-512BW instructions; - return true; - } - __except (EXCEPTION_EXECUTE_HANDLER) - { - return false; - } -#else - return true; -#endif - } - - const bool Enable = SupportedByCPU() && SupportedByOS(); - } -#endif - -#ifdef SIMD_AVX512VNNI_ENABLE - namespace Avx512vnni - { - SIMD_INLINE bool SupportedByCPU() - { - return - Cpuid::CheckBit(Cpuid::Extended, Cpuid::Ecx, Cpuid::AVX512VNNI); - } - - SIMD_INLINE bool SupportedByOS() - { -#if defined(_MSC_VER) - __try - { - __m512i value = _mm512_dpbusd_epi32(_mm512_setzero_si512(), _mm512_set1_epi8(1), _mm512_set1_epi8(1));// try to execute of AVX-512VNNI instructions; - return true; - } - __except (EXCEPTION_EXECUTE_HANDLER) - { - return false; - } -#else - return true; -#endif - } - - const bool Enable = SupportedByCPU() && SupportedByOS(); - } -#endif - -#ifdef SIMD_VMX_ENABLE - namespace Vmx - { - SIMD_INLINE bool SupportedByCPU() - { - return CpuInfo::CheckBit(AT_HWCAP, PPC_FEATURE_HAS_ALTIVEC); - } - - SIMD_INLINE bool SupportedByOS() - { - return true; - } - - const bool Enable = SupportedByCPU() && SupportedByOS(); - } -#endif - -#ifdef SIMD_VSX_ENABLE - namespace Vsx - { - SIMD_INLINE bool SupportedByCPU() - { - return CpuInfo::CheckBit(AT_HWCAP, PPC_FEATURE_HAS_VSX); - } - - SIMD_INLINE bool SupportedByOS() - { - return true; - } - - const bool Enable = SupportedByCPU() && SupportedByOS(); - } -#endif - -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - SIMD_INLINE bool SupportedByCPU() - { -#if defined(_MSC_VER) - return true; -#elif defined(__GNUC__) -#if defined(SIMD_ARM64_ENABLE) - return true; -#else - return CpuInfo::CheckBit(AT_HWCAP, HWCAP_NEON); -#endif -#else -#error Do not know how to detect NEON support! 
-#endif - } - - SIMD_INLINE bool SupportedByOS() - { - return true; - } - - const bool Enable = SupportedByCPU() && SupportedByOS(); - } -#endif - -#ifdef SIMD_MSA_ENABLE - namespace Msa - { - SIMD_INLINE bool SupportedByCPU() - { - return true; - } - - SIMD_INLINE bool SupportedByOS() - { - return true; - } - - const bool Enable = SupportedByCPU() && SupportedByOS(); - } -#endif - - SIMD_INLINE size_t Alignment() - { -#ifdef SIMD_AVX512VNNI_ENABLE - if (Avx512vnni::Enable) - return sizeof(__m512i); - else -#endif -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - return sizeof(__m512i); - else -#endif -#ifdef SIMD_AVX512F_ENABLE - if (Avx512f::Enable) - return sizeof(__m512); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if (Avx2::Enable) - return sizeof(__m256i); - else -#endif -#ifdef SIMD_AVX_ENABLE - if (Avx::Enable) - return sizeof(__m256); - else -#endif -#ifdef SIMD_SSE41_ENABLE - if (Sse41::Enable) - return sizeof(__m128i); - else -#endif -#ifdef SIMD_SSSE3_ENABLE - if (Ssse3::Enable) - return sizeof(__m128i); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if (Sse2::Enable) - return sizeof(__m128i); - else -#endif -#ifdef SIMD_SSE_ENABLE - if (Sse::Enable) - return sizeof(__m128); - else -#endif -#ifdef SIMD_VSX_ENABLE - if (Vsx::Enable) - return sizeof(__vector uint8_t); - else -#endif -#ifdef SIMD_VMX_ENABLE - if (Vmx::Enable) - return sizeof(__vector uint8_t); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable) - return sizeof(uint8x16_t); - else -#endif -#ifdef SIMD_MSA_ENABLE - if (Msa::Enable) - return sizeof(v16u8); - else -#endif - return sizeof(void *); - } - - const size_t ALIGNMENT = Alignment(); -} - -#define SIMD_BASE_FUNC(func) Simd::Base::func - -#ifdef SIMD_SSE_ENABLE -#define SIMD_SSE_FUNC(func) Simd::Sse::Enable ? Simd::Sse::func : -#else -#define SIMD_SSE_FUNC(func) -#endif - -#ifdef SIMD_SSE2_ENABLE -#define SIMD_SSE2_FUNC(func) Simd::Sse2::Enable ? Simd::Sse2::func : -#else -#define SIMD_SSE2_FUNC(func) -#endif - -#ifdef SIMD_SSE3_ENABLE -#define SIMD_SSE3_FUNC(func) Simd::Sse3::Enable ? Simd::Sse3::func : -#else -#define SIMD_SSE3_FUNC(func) -#endif - -#ifdef SIMD_SSSE3_ENABLE -#define SIMD_SSSE3_FUNC(func) Simd::Ssse3::Enable ? Simd::Ssse3::func : -#else -#define SIMD_SSSE3_FUNC(func) -#endif - -#ifdef SIMD_SSE41_ENABLE -#define SIMD_SSE41_FUNC(func) Simd::Sse41::Enable ? Simd::Sse41::func : -#else -#define SIMD_SSE41_FUNC(func) -#endif - -#ifdef SIMD_SSE42_ENABLE -#define SIMD_SSE42_FUNC(func) Simd::Sse42::Enable ? Simd::Sse42::func : -#else -#define SIMD_SSE42_FUNC(func) -#endif - -#ifdef SIMD_AVX_ENABLE -#define SIMD_AVX_FUNC(func) Simd::Avx::Enable ? Simd::Avx::func : -#else -#define SIMD_AVX_FUNC(func) -#endif - -#ifdef SIMD_AVX2_ENABLE -#define SIMD_AVX2_FUNC(func) Simd::Avx2::Enable ? Simd::Avx2::func : -#else -#define SIMD_AVX2_FUNC(func) -#endif - -#ifdef SIMD_AVX512F_ENABLE -#define SIMD_AVX512F_FUNC(func) Simd::Avx512f::Enable ? Simd::Avx512f::func : -#else -#define SIMD_AVX512F_FUNC(func) -#endif - -#ifdef SIMD_AVX512BW_ENABLE -#define SIMD_AVX512BW_FUNC(func) Simd::Avx512bw::Enable ? Simd::Avx512bw::func : -#else -#define SIMD_AVX512BW_FUNC(func) -#endif - -#ifdef SIMD_AVX512VNNI_ENABLE -#define SIMD_AVX512VNNI_FUNC(func) Simd::Avx512vnni::Enable ? Simd::Avx512vnni::func : -#else -#define SIMD_AVX512VNNI_FUNC(func) -#endif - -#ifdef SIMD_VMX_ENABLE -#define SIMD_VMX_FUNC(func) Simd::Vmx::Enable ? Simd::Vmx::func : -#else -#define SIMD_VMX_FUNC(func) -#endif - -#ifdef SIMD_VSX_ENABLE -#define SIMD_VSX_FUNC(func) Simd::Vsx::Enable ? 
Simd::Vsx::func : -#else -#define SIMD_VSX_FUNC(func) -#endif - -#ifdef SIMD_NEON_ENABLE -#define SIMD_NEON_FUNC(func) Simd::Neon::Enable ? Simd::Neon::func : -#else -#define SIMD_NEON_FUNC(func) -#endif - -#define SIMD_FUNC0(func) SIMD_BASE_FUNC(func) -#define SIMD_FUNC1(func, EXT1) EXT1(func) SIMD_BASE_FUNC(func) -#define SIMD_FUNC2(func, EXT1, EXT2) EXT1(func) EXT2(func) SIMD_BASE_FUNC(func) -#define SIMD_FUNC3(func, EXT1, EXT2, EXT3) EXT1(func) EXT2(func) EXT3(func) SIMD_BASE_FUNC(func) -#define SIMD_FUNC4(func, EXT1, EXT2, EXT3, EXT4) EXT1(func) EXT2(func) EXT3(func) EXT4(func) SIMD_BASE_FUNC(func) -#define SIMD_FUNC5(func, EXT1, EXT2, EXT3, EXT4, EXT5) EXT1(func) EXT2(func) EXT3(func) EXT4(func) EXT5(func) SIMD_BASE_FUNC(func) -#define SIMD_FUNC6(func, EXT1, EXT2, EXT3, EXT4, EXT5, EXT6) EXT1(func) EXT2(func) EXT3(func) EXT4(func) EXT5(func) EXT6(func) SIMD_BASE_FUNC(func) - -#endif//__SimdEnable_h__ diff --git a/src/3rd/Simd/Simd/SimdExp.h b/src/3rd/Simd/Simd/SimdExp.h deleted file mode 100644 index f9b29d8b..00000000 --- a/src/3rd/Simd/Simd/SimdExp.h +++ /dev/null @@ -1,621 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#ifndef __SimdExp_h__ -#define __SimdExp_h__ - -#include "Simd/SimdMath.h" - -namespace Simd -{ - namespace Base - { - SIMD_INLINE float Exp(float value) - { - return ::expf(value); - } - - SIMD_INLINE float Log(float value) - { - return ::logf(value); - } - } - -#ifdef SIMD_SSE2_ENABLE - namespace Sse2 - { - class Exp - { - __m128i _exponent, _mantissa, _127; - __m128 _1_0, _0_5, _min, _max, _exp0, _exp1, _exp2, _exp3, _exp4, _exp5, _k; - - SIMD_INLINE __m128 Poly5(__m128 x) const - { - __m128 p = _exp5; - p = _mm_add_ps(_mm_mul_ps(x, p), _exp4); - p = _mm_add_ps(_mm_mul_ps(x, p), _exp3); - p = _mm_add_ps(_mm_mul_ps(x, p), _exp2); - p = _mm_add_ps(_mm_mul_ps(x, p), _exp1); - p = _mm_add_ps(_mm_mul_ps(x, p), _exp0); - return p; - } - - SIMD_INLINE __m128 Exp2(__m128 x) const - { - x = _mm_max_ps(_mm_min_ps(x, _max), _min); - __m128i ipart = _mm_cvtps_epi32(_mm_sub_ps(x, _0_5)); - __m128 fpart = _mm_sub_ps(x, _mm_cvtepi32_ps(ipart)); - __m128 expipart = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart, _127), 23)); - __m128 expfpart = Poly5(fpart); - return _mm_mul_ps(expipart, expfpart); - } - - public: - - SIMD_INLINE Exp(float k = 1.0f) - { - _exponent = _mm_set1_epi32(0x7F800000); - _mantissa = _mm_set1_epi32(0x007FFFFF); - _127 = _mm_set1_epi32(127); - _1_0 = _mm_set1_ps(1.0f); - _0_5 = _mm_set1_ps(0.5f); - _min = _mm_set1_ps(-126.99999f); - _max = _mm_set1_ps(129.00000f); - _exp0 = _mm_set1_ps(9.9999994e-1f); - _exp1 = _mm_set1_ps(6.9315308e-1f); - _exp2 = _mm_set1_ps(2.4015361e-1f); - _exp3 = _mm_set1_ps(5.5826318e-2f); - _exp4 = _mm_set1_ps(8.9893397e-3f); - _exp5 = _mm_set1_ps(1.8775767e-3f); - _k = _mm_set1_ps(k / 0.69314718056f); - } - - SIMD_INLINE __m128 Exponent(__m128 value) const - { - return Exp2(_mm_mul_ps(_k, value)); - } - - SIMD_INLINE __m128 Sigmoid(__m128 value) const - { - __m128 exp = Exp2(_mm_mul_ps(_k, value)); - return _mm_div_ps(_1_0, _mm_add_ps(_1_0, exp)); - } - - SIMD_INLINE __m128 Tanh(__m128 value) const - { - __m128 exp = Exp2(_mm_mul_ps(_k, value)); - return _mm_div_ps(_mm_sub_ps(_1_0, exp), _mm_add_ps(_1_0, exp)); - } - - SIMD_INLINE __m128 Elu(__m128 value, __m128 alpha) const - { - __m128 exp = Exp2(_mm_mul_ps(_k, value)); - __m128 neg = _mm_mul_ps(alpha, _mm_sub_ps(exp, _1_0)); - __m128 mask = _mm_cmpgt_ps(_mm_setzero_ps(), value); - return Sse::Combine(mask, neg, value); - } - }; - - namespace Detail - { - SIMD_INLINE __m128 Poly5(__m128 x, float a, float b, float c, float d, float e, float f) - { - __m128 p = _mm_set1_ps(f); - p = _mm_add_ps(_mm_mul_ps(x, p), _mm_set1_ps(e)); - p = _mm_add_ps(_mm_mul_ps(x, p), _mm_set1_ps(d)); - p = _mm_add_ps(_mm_mul_ps(x, p), _mm_set1_ps(c)); - p = _mm_add_ps(_mm_mul_ps(x, p), _mm_set1_ps(b)); - p = _mm_add_ps(_mm_mul_ps(x, p), _mm_set1_ps(a)); - return p; - } - - SIMD_INLINE __m128 Exp2(__m128 x) - { - x = _mm_max_ps(_mm_min_ps(x, _mm_set1_ps(129.00000f)), _mm_set1_ps(-126.99999f)); - __m128i ipart = _mm_cvtps_epi32(_mm_sub_ps(x, _mm_set1_ps(0.5f))); - __m128 fpart = _mm_sub_ps(x, _mm_cvtepi32_ps(ipart)); - __m128 expipart = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart, _mm_set1_epi32(127)), 23)); - __m128 expfpart = Poly5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f); - return _mm_mul_ps(expipart, expfpart); - } - - SIMD_INLINE __m128 Log2(__m128 x) - { - __m128 _1 = _mm_set1_ps(1.0f); - __m128i i = _mm_castps_si128(x); - __m128 e = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i, _mm_set1_epi32(0x7F800000)), 23), 
_mm_set1_epi32(127))); - __m128 m = _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i, _mm_set1_epi32(0x007FFFFF))), _1); - __m128 p = Poly5(m, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f); - return _mm_add_ps(_mm_mul_ps(p, _mm_sub_ps(m, _1)), e); - } - } - - SIMD_INLINE __m128 Exponent(__m128 value) - { - return Detail::Exp2(_mm_mul_ps(_mm_set1_ps(1.44269504f), value)); - } - - SIMD_INLINE __m128 Elu(__m128 value, __m128 alpha) - { - __m128 exp = Exponent(value); - __m128 neg = _mm_mul_ps(alpha, _mm_sub_ps(exp, _mm_set1_ps(1.0f))); - __m128 mask = _mm_cmpgt_ps(_mm_setzero_ps(), value); - return Sse::Combine(mask, neg, value); - } - - SIMD_INLINE __m128 Logarithm(__m128 value) - { - return _mm_mul_ps(_mm_set1_ps(0.693147181f), Detail::Log2(value)); - } - - SIMD_INLINE __m128 Softplus(__m128 value, __m128 beta, __m128 threshold) - { - __m128 exp = Exponent(_mm_mul_ps(value, beta)); - __m128 log = Logarithm(_mm_add_ps(_mm_set1_ps(1.0f), exp)); - __m128 mask = _mm_cmpgt_ps(threshold, value); - return Sse::Combine(mask, _mm_div_ps(log, beta), value); - } - - SIMD_INLINE __m128 Tanh(__m128 value) - { - __m128 _1 = _mm_set1_ps(1.0f); - __m128 exp = Detail::Exp2(_mm_mul_ps(_mm_set1_ps(2.88539008f), value)); - return _mm_div_ps(_mm_sub_ps(exp, _1), _mm_add_ps(_1, exp)); - } - } -#endif //SIMD_SSE2_ENABLE - -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - class Exp - { - __m256i _exponent, _mantissa, _127; - __m256 _1_0, _0_5, _min, _max, _exp0, _exp1, _exp2, _exp3, _exp4, _exp5, _k; - - SIMD_INLINE __m256 Poly5(__m256 x) const - { - __m256 p = _exp5; - p = _mm256_fmadd_ps(x, p, _exp4); - p = _mm256_fmadd_ps(x, p, _exp3); - p = _mm256_fmadd_ps(x, p, _exp2); - p = _mm256_fmadd_ps(x, p, _exp1); - p = _mm256_fmadd_ps(x, p, _exp0); - return p; - } - - SIMD_INLINE __m256 Exp2(__m256 x) const - { - x = _mm256_max_ps(_mm256_min_ps(x, _max), _min); - __m256i ipart = _mm256_cvtps_epi32(_mm256_sub_ps(x, _0_5)); - __m256 fpart = _mm256_sub_ps(x, _mm256_cvtepi32_ps(ipart)); - __m256 expipart = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_add_epi32(ipart, _127), 23)); - __m256 expfpart = Poly5(fpart); - return _mm256_mul_ps(expipart, expfpart); - } - - public: - - SIMD_INLINE Exp(float k = 1.0f) - { - _exponent = _mm256_set1_epi32(0x7F800000); - _mantissa = _mm256_set1_epi32(0x007FFFFF); - _127 = _mm256_set1_epi32(127); - _1_0 = _mm256_set1_ps(1.0f); - _0_5 = _mm256_set1_ps(0.5f); - _min = _mm256_set1_ps(-126.99999f); - _max = _mm256_set1_ps(129.00000f); - _exp0 = _mm256_set1_ps(9.9999994e-1f); - _exp1 = _mm256_set1_ps(6.9315308e-1f); - _exp2 = _mm256_set1_ps(2.4015361e-1f); - _exp3 = _mm256_set1_ps(5.5826318e-2f); - _exp4 = _mm256_set1_ps(8.9893397e-3f); - _exp5 = _mm256_set1_ps(1.8775767e-3f); - _k = _mm256_set1_ps(k / 0.69314718056f); - } - - SIMD_INLINE __m256 Exponent(__m256 value) const - { - return Exp2(_mm256_mul_ps(_k, value)); - } - - SIMD_INLINE __m256 Sigmoid(__m256 value) const - { - __m256 exp = Exp2(_mm256_mul_ps(_k, value)); - return _mm256_div_ps(_1_0, _mm256_add_ps(_1_0, exp)); - } - - SIMD_INLINE __m256 Tanh(__m256 value) const - { - __m256 exp = Exp2(_mm256_mul_ps(_k, value)); - return _mm256_div_ps(_mm256_sub_ps(_1_0, exp), _mm256_add_ps(_1_0, exp)); - } - - SIMD_INLINE __m256 Elu(__m256 value, __m256 alpha) const - { - __m256 exp = Exp2(_mm256_mul_ps(_k, value)); - __m256 neg = _mm256_mul_ps(alpha, _mm256_sub_ps(exp, _1_0)); - __m256 mask = _mm256_cmp_ps(_mm256_setzero_ps(), value, _CMP_GT_OS); - return _mm256_blendv_ps(value, neg, mask); - } - }; - - namespace 
Detail - { - SIMD_INLINE __m256 Poly5(__m256 x, float a, float b, float c, float d, float e, float f) - { - __m256 p = _mm256_set1_ps(f); - p = _mm256_add_ps(_mm256_mul_ps(x, p), _mm256_set1_ps(e)); - p = _mm256_add_ps(_mm256_mul_ps(x, p), _mm256_set1_ps(d)); - p = _mm256_add_ps(_mm256_mul_ps(x, p), _mm256_set1_ps(c)); - p = _mm256_add_ps(_mm256_mul_ps(x, p), _mm256_set1_ps(b)); - p = _mm256_add_ps(_mm256_mul_ps(x, p), _mm256_set1_ps(a)); - return p; - } - - SIMD_INLINE __m256 Exp2(__m256 x) - { - x = _mm256_max_ps(_mm256_min_ps(x, _mm256_set1_ps(129.00000f)), _mm256_set1_ps(-126.99999f)); - __m256i ipart = _mm256_cvtps_epi32(_mm256_sub_ps(x, _mm256_set1_ps(0.5f))); - __m256 fpart = _mm256_sub_ps(x, _mm256_cvtepi32_ps(ipart)); - __m256 expipart = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_add_epi32(ipart, _mm256_set1_epi32(127)), 23)); - __m256 expfpart = Poly5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f); - return _mm256_mul_ps(expipart, expfpart); - } - - SIMD_INLINE __m256 Log2(__m256 x) - { - __m256 _1 = _mm256_set1_ps(1.0f); - __m256i i = _mm256_castps_si256(x); - __m256 e = _mm256_cvtepi32_ps(_mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(i, _mm256_set1_epi32(0x7F800000)), 23), _mm256_set1_epi32(127))); - __m256 m = _mm256_or_ps(_mm256_castsi256_ps(_mm256_and_si256(i, _mm256_set1_epi32(0x007FFFFF))), _1); - __m256 p = Poly5(m, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f); - return _mm256_add_ps(_mm256_mul_ps(p, _mm256_sub_ps(m, _1)), e); - } - } - - SIMD_INLINE __m256 Exponent(__m256 value) - { - return Detail::Exp2(_mm256_mul_ps(_mm256_set1_ps(1.44269504f), value)); - } - - SIMD_INLINE __m256 Elu(__m256 value, __m256 alpha) - { - __m256 exp = Exponent(value); - __m256 neg = _mm256_mul_ps(alpha, _mm256_sub_ps(exp, _mm256_set1_ps(1.0f))); - __m256 mask = _mm256_cmp_ps(_mm256_setzero_ps(), value, _CMP_GT_OS); - return _mm256_blendv_ps(value, neg, mask); - } - - SIMD_INLINE __m256 Logarithm(__m256 value) - { - return _mm256_mul_ps(_mm256_set1_ps(0.693147181f), Detail::Log2(value)); - } - - SIMD_INLINE __m256 Softplus(__m256 value, __m256 beta, __m256 threshold) - { - __m256 exp = Exponent(_mm256_mul_ps(value, beta)); - __m256 log = Logarithm(_mm256_add_ps(_mm256_set1_ps(1.0f), exp)); - __m256 mask = _mm256_cmp_ps(threshold, value, _CMP_GT_OS); - return _mm256_blendv_ps(value, _mm256_div_ps(log, beta), mask); - } - - SIMD_INLINE __m256 Tanh(__m256 value) - { - __m256 _1 = _mm256_set1_ps(1.0f); - __m256 exp = Detail::Exp2(_mm256_mul_ps(_mm256_set1_ps(2.88539008f), value)); - return _mm256_div_ps(_mm256_sub_ps(exp, _1), _mm256_add_ps(_1, exp)); - } - } -#endif //SIMD_AVX2_ENABLE - -#ifdef SIMD_AVX512F_ENABLE - namespace Avx512f - { - class Exp - { - __m512i _exponent, _mantissa, _127; - __m512 _1_0, _0_5, _min, _max, _exp0, _exp1, _exp2, _exp3, _exp4, _exp5, _k; - - SIMD_INLINE __m512 Poly5(__m512 x) const - { - __m512 p = _exp5; - p = _mm512_fmadd_ps(x, p, _exp4); - p = _mm512_fmadd_ps(x, p, _exp3); - p = _mm512_fmadd_ps(x, p, _exp2); - p = _mm512_fmadd_ps(x, p, _exp1); - p = _mm512_fmadd_ps(x, p, _exp0); - return p; - } - - SIMD_INLINE __m512 Exp2(__m512 x) const - { - x = _mm512_max_ps(_mm512_min_ps(x, _max), _min); - __m512i ipart = _mm512_cvtps_epi32(_mm512_sub_ps(x, _0_5)); - __m512 fpart = _mm512_sub_ps(x, _mm512_cvtepi32_ps(ipart)); - __m512 expipart = _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_add_epi32(ipart, _127), 23)); - __m512 expfpart = Poly5(fpart); - return 
_mm512_mul_ps(expipart, expfpart); - } - - public: - - SIMD_INLINE Exp(float k = 1.0f) - { - _exponent = _mm512_set1_epi32(0x7F800000); - _mantissa = _mm512_set1_epi32(0x007FFFFF); - _127 = _mm512_set1_epi32(127); - _1_0 = _mm512_set1_ps(1.0f); - _0_5 = _mm512_set1_ps(0.5f); - _min = _mm512_set1_ps(-126.99999f); - _max = _mm512_set1_ps(129.00000f); - _exp0 = _mm512_set1_ps(9.9999994e-1f); - _exp1 = _mm512_set1_ps(6.9315308e-1f); - _exp2 = _mm512_set1_ps(2.4015361e-1f); - _exp3 = _mm512_set1_ps(5.5826318e-2f); - _exp4 = _mm512_set1_ps(8.9893397e-3f); - _exp5 = _mm512_set1_ps(1.8775767e-3f); - _k = _mm512_set1_ps(k / 0.69314718056f); - } - - SIMD_INLINE __m512 Exponent(__m512 value) const - { - return Exp2(_mm512_mul_ps(_k, value)); - } - - SIMD_INLINE __m512 Sigmoid(__m512 value) const - { - __m512 exp = Exp2(_mm512_mul_ps(_k, value)); - return _mm512_div_ps(_1_0, _mm512_add_ps(_1_0, exp)); - } - - SIMD_INLINE __m512 Tanh(__m512 value) const - { - __m512 exp = Exp2(_mm512_mul_ps(_k, value)); - return _mm512_div_ps(_mm512_sub_ps(_1_0, exp), _mm512_add_ps(_1_0, exp)); - } - - SIMD_INLINE __m512 Elu(__m512 value, __m512 alpha) const - { - __m512 exp = Exp2(_mm512_mul_ps(_k, value)); - __m512 neg = _mm512_mul_ps(alpha, _mm512_sub_ps(exp, _1_0)); - __mmask16 mask = _mm512_cmp_ps_mask(_mm512_setzero_ps(), value, _CMP_GT_OS); - return _mm512_mask_blend_ps(mask, value, neg); - } - }; - - namespace Detail - { - SIMD_INLINE __m512 Poly5(__m512 x, float a, float b, float c, float d, float e, float f) - { - __m512 p = _mm512_set1_ps(f); - p = _mm512_add_ps(_mm512_mul_ps(x, p), _mm512_set1_ps(e)); - p = _mm512_add_ps(_mm512_mul_ps(x, p), _mm512_set1_ps(d)); - p = _mm512_add_ps(_mm512_mul_ps(x, p), _mm512_set1_ps(c)); - p = _mm512_add_ps(_mm512_mul_ps(x, p), _mm512_set1_ps(b)); - p = _mm512_add_ps(_mm512_mul_ps(x, p), _mm512_set1_ps(a)); - return p; - } - - SIMD_INLINE __m512 Exp2(__m512 x) - { - x = _mm512_max_ps(_mm512_min_ps(x, _mm512_set1_ps(129.00000f)), _mm512_set1_ps(-126.99999f)); - __m512i ipart = _mm512_cvtps_epi32(_mm512_sub_ps(x, _mm512_set1_ps(0.5f))); - __m512 fpart = _mm512_sub_ps(x, _mm512_cvtepi32_ps(ipart)); - __m512 expipart = _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_add_epi32(ipart, _mm512_set1_epi32(127)), 23)); - __m512 expfpart = Poly5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f); - return _mm512_mul_ps(expipart, expfpart); - } - - SIMD_INLINE __m512 Log2(__m512 x) - { - __m512 _1 = _mm512_set1_ps(1.0f); - __m512i i = _mm512_castps_si512(x); - __m512 e = _mm512_cvtepi32_ps(_mm512_sub_epi32(_mm512_srli_epi32(_mm512_and_si512(i, _mm512_set1_epi32(0x7F800000)), 23), _mm512_set1_epi32(127))); - __m512 m = Or(_mm512_castsi512_ps(_mm512_and_si512(i, _mm512_set1_epi32(0x007FFFFF))), _1); - __m512 p = Poly5(m, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f); - return _mm512_add_ps(_mm512_mul_ps(p, _mm512_sub_ps(m, _1)), e); - } - } - - SIMD_INLINE __m512 Exponent(__m512 value) - { - return Detail::Exp2(_mm512_mul_ps(_mm512_set1_ps(1.44269504f), value)); - } - - SIMD_INLINE __m512 Elu(__m512 value, __m512 alpha) - { - __m512 exp = Exponent(value); - __m512 neg = _mm512_mul_ps(alpha, _mm512_sub_ps(exp, _mm512_set1_ps(1.0f))); - __mmask16 mask = _mm512_cmp_ps_mask(_mm512_setzero_ps(), value, _CMP_GT_OS); - return _mm512_mask_blend_ps(mask, value, neg); - } - - SIMD_INLINE __m512 Logarithm(__m512 value) - { - return _mm512_mul_ps(_mm512_set1_ps(0.693147181f), Detail::Log2(value)); - } - - SIMD_INLINE 
__m512 Softplus(__m512 value, __m512 beta, __m512 threshold) - { - __m512 exp = Exponent(_mm512_mul_ps(value, beta)); - __m512 log = Logarithm(_mm512_add_ps(_mm512_set1_ps(1.0f), exp)); - __mmask16 mask = _mm512_cmp_ps_mask(threshold, value, _CMP_GT_OS); - return _mm512_mask_blend_ps(mask, value, _mm512_div_ps(log, beta)); - } - - SIMD_INLINE __m512 Tanh(__m512 value) - { - __m512 _1 = _mm512_set1_ps(1.0f); - __m512 exp = Detail::Exp2(_mm512_mul_ps(_mm512_set1_ps(2.88539008f), value)); - return _mm512_div_ps(_mm512_sub_ps(exp, _1), _mm512_add_ps(_1, exp)); - } - } -#endif //SIMD_AVX512F_ENABLE - -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - class Exp - { - int32x4_t _exponent, _mantissa, _127; - float32x4_t _1_0, _0_5, _min, _max, _exp0, _exp1, _exp2, _exp3, _exp4, _exp5, _k; - - SIMD_INLINE float32x4_t Poly5(float32x4_t x) const - { - float32x4_t p = _exp5; - p = vmlaq_f32(_exp4, x, p); - p = vmlaq_f32(_exp3, x, p); - p = vmlaq_f32(_exp2, x, p); - p = vmlaq_f32(_exp1, x, p); - p = vmlaq_f32(_exp0, x, p); - return p; - } - - SIMD_INLINE float32x4_t Exp2(float32x4_t x) const - { - x = vmaxq_f32(vminq_f32(x, _max), _min); - int32x4_t ipart = vcvtq_s32_f32(vsubq_f32(x, _0_5)); - float32x4_t fpart = vsubq_f32(x, vcvtq_f32_s32(ipart)); - float32x4_t expipart = vreinterpretq_f32_s32(vshlq_n_s32(vaddq_s32(ipart, _127), 23)); - float32x4_t expfpart = Poly5(fpart); - return vmulq_f32(expipart, expfpart); - } - - public: - - SIMD_INLINE Exp(float k = 1.0f) - { - _exponent = vdupq_n_s32(0x7F800000); - _mantissa = vdupq_n_s32(0x007FFFFF); - _127 = vdupq_n_s32(127); - _1_0 = vdupq_n_f32(1.0f); - _0_5 = vdupq_n_f32(0.5f); - _min = vdupq_n_f32(-126.99999f); - _max = vdupq_n_f32(129.00000f); - _exp0 = vdupq_n_f32(9.9999994e-1f); - _exp1 = vdupq_n_f32(6.9315308e-1f); - _exp2 = vdupq_n_f32(2.4015361e-1f); - _exp3 = vdupq_n_f32(5.5826318e-2f); - _exp4 = vdupq_n_f32(8.9893397e-3f); - _exp5 = vdupq_n_f32(1.8775767e-3f); - _k = vdupq_n_f32(k / 0.69314718056f); - } - - SIMD_INLINE float32x4_t Exponent(float32x4_t value) const - { - return Exp2(vmulq_f32(_k, value)); - } - - template SIMD_INLINE float32x4_t Sigmoid(float32x4_t value) const - { - float32x4_t exp = Exp2(vmulq_f32(_k, value)); - return Reciprocal(vaddq_f32(_1_0, exp)); - } - - template SIMD_INLINE float32x4_t Tanh(float32x4_t value) const - { - float32x4_t exp = Exp2(vmulq_f32(_k, value)); - return Div(vsubq_f32(_1_0, exp), vaddq_f32(_1_0, exp)); - } - - SIMD_INLINE float32x4_t Elu(float32x4_t value, float32x4_t alpha) const - { - float32x4_t exp = Exp2(vmulq_f32(_k, value)); - float32x4_t neg = vmulq_f32(alpha, vsubq_f32(exp, _1_0)); - uint32x4_t mask = vcgtq_f32(vdupq_n_f32(0.0f), value); - return vbslq_f32(mask, neg, value); - } - }; - - namespace Detail - { - SIMD_INLINE float32x4_t Poly5(float32x4_t x, float a, float b, float c, float d, float e, float f) - { - float32x4_t p = vdupq_n_f32(f); - p = vmlaq_f32(vdupq_n_f32(e), x, p); - p = vmlaq_f32(vdupq_n_f32(d), x, p); - p = vmlaq_f32(vdupq_n_f32(c), x, p); - p = vmlaq_f32(vdupq_n_f32(b), x, p); - p = vmlaq_f32(vdupq_n_f32(a), x, p); - return p; - } - - SIMD_INLINE float32x4_t Exp2(float32x4_t x) - { - x = vmaxq_f32(vminq_f32(x, vdupq_n_f32(129.00000f)), vdupq_n_f32(-126.99999f)); - int32x4_t ipart = vcvtq_s32_f32(vsubq_f32(x, vdupq_n_f32(0.5f))); - float32x4_t fpart = vsubq_f32(x, vcvtq_f32_s32(ipart)); - float32x4_t expipart = vreinterpretq_f32_s32(vshlq_n_s32(vaddq_s32(ipart, vdupq_n_s32(127)), 23)); - float32x4_t expfpart = Poly5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 
5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f); - return vmulq_f32(expipart, expfpart); - } - - SIMD_INLINE float32x4_t Log2(float32x4_t x) - { - float32x4_t _1 = vdupq_n_f32(1.0f); - int32x4_t i = vreinterpretq_s32_f32(x); - float32x4_t e = vcvtq_f32_s32(vsubq_s32(vshrq_n_s32(vandq_s32(i, vdupq_n_s32(0x7F800000)), 23), vdupq_n_s32(127))); - float32x4_t m = Or(vreinterpretq_f32_s32(vandq_s32(i, vdupq_n_s32(0x007FFFFF))), _1); - float32x4_t p = Poly5(m, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f); - return vaddq_f32(vmulq_f32(p, vsubq_f32(m, _1)), e); - } - } - - SIMD_INLINE float32x4_t Exponent(float32x4_t value) - { - return Detail::Exp2(vmulq_f32(vdupq_n_f32(1.44269504f), value)); - } - - SIMD_INLINE float32x4_t Elu(float32x4_t value, float32x4_t alpha) - { - float32x4_t exp = Exponent(value); - float32x4_t neg = vmulq_f32(alpha, vsubq_f32(exp, vdupq_n_f32(1.0f))); - uint32x4_t mask = vcgtq_f32(vdupq_n_f32(0.0f), value); - return vbslq_f32(mask, neg, value); - } - - SIMD_INLINE float32x4_t Logarithm(float32x4_t value) - { - return vmulq_f32(vdupq_n_f32(0.693147181f), Detail::Log2(value)); - } - - template SIMD_INLINE float32x4_t Softplus(float32x4_t value, float32x4_t beta, float32x4_t threshold) - { - float32x4_t exp = Exponent(vmulq_f32(value, beta)); - float32x4_t log = Logarithm(vaddq_f32(vdupq_n_f32(1.0f), exp)); - uint32x4_t mask = vcgtq_f32(threshold, value); - return vbslq_f32(mask, Div(log, beta), value); - } - - template SIMD_INLINE float32x4_t Tanh(float32x4_t value) - { - float32x4_t _1 = vdupq_n_f32(1.0f); - float32x4_t exp = Detail::Exp2(vmulq_f32(vdupq_n_f32(2.88539008f), value)); - return Div(vsubq_f32(exp, _1), vaddq_f32(_1, exp)); - } - } -#endif //SIMD_NEON_ENABLE -} - -#endif//__SimdExp_h__ diff --git a/src/3rd/Simd/Simd/SimdExtract.h b/src/3rd/Simd/Simd/SimdExtract.h deleted file mode 100644 index 85e2b8f3..00000000 --- a/src/3rd/Simd/Simd/SimdExtract.h +++ /dev/null @@ -1,324 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#ifndef __SimdExtract_h__ -#define __SimdExtract_h__ - -#include "Simd/SimdConst.h" - -namespace Simd -{ -#ifdef SIMD_SSE_ENABLE - namespace Sse - { - SIMD_INLINE float ExtractValue(__m128 a, int i) - { - float SIMD_ALIGNED(16) _a[4]; - _mm_store_ps(_a, a); - return _a[i]; - } - - SIMD_INLINE float ExtractSum(__m128 a) - { - float SIMD_ALIGNED(16) _a[4]; - _mm_store_ps(_a, a); - return _a[0] + _a[1] + _a[2] + _a[3]; - } - } -#endif//SIMD_SSE_ENABLE - -#ifdef SIMD_SSE2_ENABLE - namespace Sse2 - { - template <int index> SIMD_INLINE int ExtractInt8(__m128i a) - { - return _mm_extract_epi16(_mm_srli_si128(a, index & 0x1), index >> 1) & 0xFF; - } - - template <int index> SIMD_INLINE int ExtractInt16(__m128i a) - { - return _mm_extract_epi16(a, index); - } - - template <int index> SIMD_INLINE int ExtractInt32(__m128i a) - { - return _mm_cvtsi128_si32(_mm_srli_si128(a, 4 * index)); - } - - SIMD_INLINE int ExtractInt32Sum(__m128i a) - { - int SIMD_ALIGNED(16) _a[4]; - _mm_store_si128((__m128i*)_a, a); - return _a[0] + _a[1] + _a[2] + _a[3]; - } - - template <int index> SIMD_INLINE int64_t ExtractInt64(__m128i a) - { -#if defined(SIMD_X64_ENABLE) && (!defined(_MSC_VER) || (defined(_MSC_VER) && _MSC_VER >= 1600)) - return _mm_cvtsi128_si64(_mm_srli_si128(a, 8 * index)); -#else - return (int64_t)ExtractInt32<2 * index + 1>(a) * 0x100000000 + (uint32_t)ExtractInt32<2 * index>(a); -#endif - } - - SIMD_INLINE int64_t ExtractInt64Sum(__m128i a) - { - int64_t SIMD_ALIGNED(16) _a[2]; - _mm_store_si128((__m128i*)_a, a); - return _a[0] + _a[1]; - } - } -#endif// SIMD_SSE2_ENABLE - -#ifdef SIMD_SSE3_ENABLE - namespace Sse3 - { - SIMD_INLINE float ExtractSum(__m128 a) - { - return _mm_cvtss_f32(_mm_hadd_ps(_mm_hadd_ps(a, _mm_setzero_ps()), _mm_setzero_ps())); - } - - SIMD_INLINE __m128 Extract4Sums(const __m128 a[4]) - { - return _mm_hadd_ps(_mm_hadd_ps(a[0], a[1]), _mm_hadd_ps(a[2], a[3])); - } - } -#endif//SIMD_SSE3_ENABLE - -#ifdef SIMD_AVX_ENABLE - namespace Avx - { - SIMD_INLINE float ExtractValue(__m256 a, int i) - { - float SIMD_ALIGNED(32) _a[8]; - _mm256_store_ps(_a, a); - return _a[i]; - } - - SIMD_INLINE float ExtractSum(__m256 a) - { - float SIMD_ALIGNED(32) _a[8]; - _mm256_store_ps(_a, _mm256_hadd_ps(_mm256_hadd_ps(a, _mm256_setzero_ps()), _mm256_setzero_ps())); - return _a[0] + _a[4]; - } - - SIMD_INLINE __m128 Extract4Sums(const __m256 a[4]) - { - __m256 b = _mm256_hadd_ps(_mm256_hadd_ps(a[0], a[1]), _mm256_hadd_ps(a[2], a[3])); - return _mm_add_ps(_mm256_castps256_ps128(b), _mm256_extractf128_ps(b, 1)); - } - - SIMD_INLINE __m128 Extract4Sums(const __m256 & a0, const __m256 & a1, const __m256 & a2, const __m256 & a3) - { - __m256 b = _mm256_hadd_ps(_mm256_hadd_ps(a0, a1), _mm256_hadd_ps(a2, a3)); - return _mm_add_ps(_mm256_castps256_ps128(b), _mm256_extractf128_ps(b, 1)); - } - } -#endif//SIMD_AVX_ENABLE -
-#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - template <class T> SIMD_INLINE T Extract(__m256i a, size_t index) - { - const size_t size = A / sizeof(T); - assert(index < size); - T buffer[size]; - _mm256_storeu_si256((__m256i*)buffer, a); - return buffer[index]; - } - - template <class T> SIMD_INLINE T ExtractSum(__m256i a) - { - const size_t size = A / sizeof(T); - T buffer[size]; - _mm256_storeu_si256((__m256i*)buffer, a); - T sum = 0; - for (size_t i = 0; i < size; ++i) - sum += buffer[i]; - return sum; - } - - template <> SIMD_INLINE uint32_t ExtractSum<uint32_t>(__m256i a) - { - __m128i b = _mm_add_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(a, 1)); - return _mm_extract_epi32(_mm_hadd_epi32(_mm_hadd_epi32(b, _mm_setzero_si128()), _mm_setzero_si128()), 0); - } - -#if defined(SIMD_X64_ENABLE) - template <> SIMD_INLINE uint64_t ExtractSum<uint64_t>(__m256i a) - { - __m128i b = _mm_add_epi64(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(a, 1)); - return _mm_extract_epi64(b, 0) + _mm_extract_epi64(b, 1); - } -#endif - - template <int index> SIMD_INLINE int64_t Extract64i(__m256i value) - { - assert(index >= 0 && index < 4); -#if defined(SIMD_X64_ENABLE) -#if (defined(_MSC_VER) && (_MSC_VER <= 1900)) - return _mm_extract_epi64(_mm256_extractf128_si256(value, index / 2), index % 2); -#else - return _mm256_extract_epi64(value, index); -#endif -#else - SIMD_ALIGNED(32) int64_t buffer[4]; - _mm256_store_si256((__m256i*)buffer, value); - return buffer[index]; -#endif - } - } -#endif// SIMD_AVX2_ENABLE - -#ifdef SIMD_AVX512F_ENABLE - namespace Avx512f - { - SIMD_INLINE float Extract(const __m512 & a, size_t index) - { - float buffer[F]; - _mm512_storeu_ps(buffer, a); - return buffer[index]; - } - - SIMD_INLINE float ExtractSum(const __m512 & a) - { - __m128 lo = _mm_add_ps(_mm512_extractf32x4_ps(a, 0), _mm512_extractf32x4_ps(a, 1)); - __m128 hi = _mm_add_ps(_mm512_extractf32x4_ps(a, 2), _mm512_extractf32x4_ps(a, 3)); - return _mm_cvtss_f32(_mm_hadd_ps(_mm_hadd_ps(_mm_add_ps(lo, hi), _mm_setzero_ps()), _mm_setzero_ps())); - } - - SIMD_INLINE __m128 Extract4Sums(const __m512 a[4]) - { - __m256 b0 = _mm512_castps512_ps256(_mm512_add_ps(a[0], Alignr<8>(a[0], a[0]))); - __m256 b1 = _mm512_castps512_ps256(_mm512_add_ps(a[1], Alignr<8>(a[1], a[1]))); - __m256 b2 = _mm512_castps512_ps256(_mm512_add_ps(a[2], Alignr<8>(a[2], a[2]))); - __m256 b3 = _mm512_castps512_ps256(_mm512_add_ps(a[3], Alignr<8>(a[3], a[3]))); - __m256 c = _mm256_hadd_ps(_mm256_hadd_ps(b0, b1), _mm256_hadd_ps(b2, b3)); - return _mm_add_ps(_mm256_castps256_ps128(c), _mm256_extractf128_ps(c, 1)); - } - - SIMD_INLINE __m128 Extract4Sums(const __m512 & a0, const __m512 & a1, const __m512 & a2, const __m512 & a3) - { - __m256 b0 = _mm512_castps512_ps256(_mm512_add_ps(a0, Alignr<8>(a0, a0))); - __m256 b1 = _mm512_castps512_ps256(_mm512_add_ps(a1, Alignr<8>(a1, a1))); - __m256 b2 = _mm512_castps512_ps256(_mm512_add_ps(a2, Alignr<8>(a2, a2))); - __m256 b3 = _mm512_castps512_ps256(_mm512_add_ps(a3, Alignr<8>(a3, a3))); - __m256 c = _mm256_hadd_ps(_mm256_hadd_ps(b0, b1), _mm256_hadd_ps(b2, b3)); - return _mm_add_ps(_mm256_castps256_ps128(c), _mm256_extractf128_ps(c, 1)); - } - } -#endif//SIMD_AVX512F_ENABLE - -#ifdef SIMD_AVX512BW_ENABLE - namespace Avx512bw - { - template <class T> SIMD_INLINE T ExtractSum(__m512i a) - { - const size_t size = A / sizeof(T); - T buffer[size]; - _mm512_storeu_si512(buffer, a); - T sum = 0; - for (size_t i = 0; i < size; ++i) - sum += buffer[i]; - return sum; - } - - template <> SIMD_INLINE uint32_t ExtractSum<uint32_t>(__m512i a) - { - __m256i b = _mm256_add_epi32(_mm512_extracti64x4_epi64(a, 0), _mm512_extracti64x4_epi64(a, 1)); - __m128i c = _mm_add_epi32(_mm256_extractf128_si256(b, 0), _mm256_extractf128_si256(b, 1)); - return _mm_extract_epi32(_mm_hadd_epi32(_mm_hadd_epi32(c, _mm_setzero_si128()), _mm_setzero_si128()), 0); - } - -#if defined(SIMD_X64_ENABLE) - template <> SIMD_INLINE uint64_t ExtractSum<uint64_t>(__m512i a) - { - __m256i b = _mm256_add_epi64(_mm512_extracti64x4_epi64(a, 0), _mm512_extracti64x4_epi64(a, 1)); - __m128i c = _mm_add_epi64(_mm256_extractf128_si256(b, 0), _mm256_extractf128_si256(b, 1)); - return _mm_extract_epi64(c, 0) + _mm_extract_epi64(c, 1); - } -#endif - } -#endif//SIMD_AVX512BW_ENABLE - -#ifdef SIMD_VMX_ENABLE - namespace Vmx - { - SIMD_INLINE
uint32_t ExtractSum(v128_u32 a) - { - return vec_extract(a, 0) + vec_extract(a, 1) + vec_extract(a, 2) + vec_extract(a, 3); - } - - SIMD_INLINE float ExtractSum(v128_f32 a) - { - return vec_extract(a, 0) + vec_extract(a, 1) + vec_extract(a, 2) + vec_extract(a, 3); - } - } -#endif// SIMD_VMX_ENABLE - -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - SIMD_INLINE uint32_t ExtractSum32u(const uint32x4_t & a) - { - return vgetq_lane_u32(a, 0) + vgetq_lane_u32(a, 1) + vgetq_lane_u32(a, 2) + vgetq_lane_u32(a, 3); - } - - SIMD_INLINE uint64_t ExtractSum64u(const uint64x2_t & a) - { - return vgetq_lane_u64(a, 0) + vgetq_lane_u64(a, 1); - } - - SIMD_INLINE int64_t ExtractSum64i(const int64x2_t & a) - { - return vgetq_lane_s64(a, 0) + vgetq_lane_s64(a, 1); - } - - SIMD_INLINE float ExtractSum32f(const float32x4_t & a) - { - return vgetq_lane_f32(a, 0) + vgetq_lane_f32(a, 1) + vgetq_lane_f32(a, 2) + vgetq_lane_f32(a, 3); - } - - SIMD_INLINE float32x4_t Extract4Sums(const float32x4_t a[4]) - { - float32x4x2_t b0 = vzipq_f32(a[0], a[2]); - float32x4x2_t b1 = vzipq_f32(a[1], a[3]); - float32x4x2_t c0 = vzipq_f32(b0.val[0], b1.val[0]); - float32x4x2_t c1 = vzipq_f32(b0.val[1], b1.val[1]); - return vaddq_f32(vaddq_f32(c0.val[0], c0.val[1]), vaddq_f32(c1.val[0], c1.val[1])); - } - - SIMD_INLINE float32x4_t Extract4Sums(const float32x4_t & a0, const float32x4_t & a1, const float32x4_t & a2, const float32x4_t & a3) - { - float32x4x2_t b0 = vzipq_f32(a0, a2); - float32x4x2_t b1 = vzipq_f32(a1, a3); - float32x4x2_t c0 = vzipq_f32(b0.val[0], b1.val[0]); - float32x4x2_t c1 = vzipq_f32(b0.val[1], b1.val[1]); - return vaddq_f32(vaddq_f32(c0.val[0], c0.val[1]), vaddq_f32(c1.val[0], c1.val[1])); - } - } -#endif// SIMD_NEON_ENABLE -} - -#endif//__SimdExtract_h__ diff --git a/src/3rd/Simd/Simd/SimdFont.hpp b/src/3rd/Simd/Simd/SimdFont.hpp deleted file mode 100644 index 0e0e6bd5..00000000 --- a/src/3rd/Simd/Simd/SimdFont.hpp +++ /dev/null @@ -1,1878 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#ifndef __SimdFont_hpp__ -#define __SimdFont_hpp__ - -#include "Simd/SimdLib.hpp" -#include "Simd/SimdDrawing.hpp" - -#include -#include -#include - -namespace Simd -{ - /*! @ingroup cpp_drawing - - \short The Font class provides text drawing. - */ - class Font - { - public: - typedef std::string String; /*!< String type definition. */ - typedef Simd::Point Point; /*!< Point type definition. 
*/ - typedef Simd::View View; /*!< Image type definition. */ - - /*! - Creates a new Font class with the given height. - - \note The font supports ASCII characters only. It was generated based on the generic monospace font from Gdiplus. - - \param [in] height - initial height value. By default it is equal to 16. - */ - Font(size_t height = 16) - { - LoadDefault(); - Resize(height); - } - - /*! - Sets a new height value for the font. - - \param [in] height - a new height value. - - \return a result of the operation. - */ - bool Resize(size_t height) - { - if (height == _currentSize.y) - return true; - - if (height < 4u || height > (size_t)_originalSize.y*4) - return false; - - _currentSize.y = height; - _currentSize.x = height*_originalSize.x / _originalSize.y; - _currentIndent.x = height*_originalIndent.x / _originalSize.y; - _currentIndent.y = height*_originalIndent.y / _originalSize.y; - - size_t level = 0; - for (; (height << (level + 1)) < (size_t)_originalSize.y; level++); - Point size = _currentSize << level; - - _currentSymbols.resize(_originalSymbols.size()); - for (size_t i = 0; i < _originalSymbols.size(); ++i) - { - _currentSymbols[i].value = _originalSymbols[i].value; - _currentSymbols[i].image.Recreate(_currentSize, View::Gray8); - if (level) - { - Pyramid pyramid(size, level + 1); - Simd::ResizeBilinear(_originalSymbols[i].image, pyramid[0]); - Simd::Build(pyramid, SimdReduce2x2); - Simd::Copy(pyramid[level], _currentSymbols[i].image); - } - else - Simd::ResizeBilinear(_originalSymbols[i].image, _currentSymbols[i].image); - } - - return true; - } - - /*! - Gets the height of the font. - - \return the current height of the font. - */ - size_t Height() const - { - return _currentSize.y; - } - - /*! - Measures the size of the region needed to draw the given text. - - \param [in] text - the text to draw. - - \return the measured size. - */ - Point Measure(const String & text) const - { - Point size, curr; - for (size_t i = 0; i < text.size(); ++i) - { - if (text[i] >= _symbolMin && text[i] <= _symbolMax) - { - curr.x += _currentSize.x; - size.x = std::max(size.x, curr.x); - size.y = std::max(size.y, curr.y + _currentSize.y); - } - else if (text[i] == '\n') - { - curr.x = 0; - curr.y += _currentSize.y; - } - } - return size.x ? size + 2 * _currentIndent : Point(); - } - - /*! - Draws text on the image. - - \param [out] canvas - a canvas (image where we draw text). - \param [in] text - the text to draw. - \param [in] position - a start position to draw text. - \param [in] color - the color of the text. - - \return a result of the operation. - */ - template <class Color> bool Draw(View & canvas, const String & text, const Point & position, const Color & color) const - { - assert(sizeof(color) == canvas.PixelSize()); - - View alpha; - Rect canvasRect, alphaRect; - CreateAlpha(text, Rect(canvas.Size()), position, alpha, canvasRect, alphaRect); - - if(alpha.Area()) - Simd::AlphaFilling(canvas.Region(canvasRect).Ref(), color, alpha.Region(alphaRect)); - - return true; - } - - /*! - Draws text on the image. - - \param [out] canvas - a canvas (image where we draw text). - \param [in] text - the text to draw. - \param [in] position - a position to draw text (see Simd::View::Position). - \param [in] color - the color of the text. - - \return a result of the operation. - */ - template <class Color> bool Draw(View & canvas, const String & text, const View::Position & position, const Color & color) const - { - return Draw(canvas.Region(Measure(text), position).Ref(), text, Point(0, 0), color); - } -
- /*! - Draws text on the image. Fills the text background with the given color. - - \param [out] canvas - a canvas (image where we draw text). - \param [in] text - the text to draw. - \param [in] position - a position to draw text (see Simd::View::Position). - \param [in] color - the color of the text. - \param [in] background - background color. - - \return a result of the operation. - */ - template <class Color> bool Draw(View & canvas, const String & text, const View::Position & position, const Color & color, const Color & background) const - { - View region = canvas.Region(Measure(text), position); - Simd::FillPixel(region, background); - return Draw(region, text, Point(0, 0), color); - } - - private: - typedef Simd::Rectangle Rect; - typedef std::vector<Rect> Rects; - typedef Simd::Pyramid Pyramid; - typedef std::vector<Point> Points; - - struct Symbol - { - char value; - View image; - }; - typedef std::vector<Symbol> Symbols; - - Symbols _originalSymbols, _currentSymbols; - Point _originalSize, _currentSize, _originalIndent, _currentIndent; - char _symbolMin, _symbolMax; - - void CreateAlpha(const String & text, const Rect & canvas, const Point & shift, View & alpha, Rect & canvasRect, Rect & alphaRect) const - { - Rects rects; - rects.reserve(text.size()); - String symbols; - symbols.reserve(text.size()); - Point curr; - for (size_t i = 0; i < text.size(); ++i) - { - char value = text[i]; - if (value >= _symbolMin && value <= _symbolMax) - { - Rect current(curr, curr + _currentSize); - Rect shifted = current.Shifted(shift + _currentIndent); - if (!canvas.Intersection(shifted).Empty()) - { - alphaRect |= current; - canvasRect |= shifted; - rects.push_back(current); - symbols.push_back(value); - } - curr.x += _currentSize.x; - } - else if (value == '\n') - { - curr.x = 0; - curr.y += _currentSize.y; - } - } - alpha.Recreate(alphaRect.Size(), View::Gray8); - Simd::Fill(alpha, 0); - for (size_t i = 0; i < symbols.size(); ++i) - Simd::Copy(_currentSymbols[symbols[i] - _symbolMin].image, alpha.Region(rects[i].Shifted(-alphaRect.TopLeft())).Ref()); - Rect old = canvasRect; - canvasRect &= canvas; - alphaRect.Shift(-alphaRect.TopLeft()); - alphaRect.left += canvasRect.left - old.left; - alphaRect.top += canvasRect.top - old.top; - alphaRect.right += canvasRect.right - old.right; - alphaRect.bottom += canvasRect.bottom - old.bottom; - } - - uint8_t LoadValue(const uint8_t * & data, size_t & size) - { - if (size == 0) - throw; - size--; - return *data++; - } - - bool Load(const uint8_t * data, size_t size) - { - try - { - _symbolMin = LoadValue(data, size); - _symbolMax = LoadValue(data, size); - _originalSize.x = LoadValue(data, size); - _originalSize.y = LoadValue(data, size); - _originalIndent.x = LoadValue(data, size); - _originalIndent.y = LoadValue(data, size); - _originalSymbols.resize(_symbolMax - _symbolMin); - for (char s = _symbolMin; s < _symbolMax; ++s) - { - Symbol & symbol = _originalSymbols[s - _symbolMin]; - symbol.value = LoadValue(data, size); - if(symbol.value != s) - throw; - symbol.image.Recreate(_originalSize, View::Gray8); - Simd::Fill(symbol.image, 0); - size_t top = LoadValue(data, size); - size_t bottom = LoadValue(data, size); - for (size_t r = top; r < bottom; ++r) - { - size_t count = LoadValue(data, size); - for (size_t l = 0; l < count; ++l) - { - size_t left = LoadValue(data, size); - size_t right = LoadValue(data, size); - assert(left < right); - memset(symbol.image.Row(r) + left, 0xFF, right - left); - } - } - } - } - catch (...)
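// A brief usage sketch for the Font API documented above (Measure/Draw),
// assuming the Simd::Pixel::Bgra32 color type from "Simd/SimdPixel.hpp",
// a 3-argument View constructor, and Simd::Fill from "Simd/SimdLib.hpp";
// the canvas size, text, and color are illustrative only.
#include "Simd/SimdFont.hpp"
#include "Simd/SimdPixel.hpp"

int main()
{
    Simd::Font font(24);                                        // 24-pixel-high glyphs
    Simd::Font::View canvas(640, 480, Simd::Font::View::Bgra32);
    Simd::Fill(canvas, 0);                                      // clear canvas to black
    // Measure() reports the region that Draw() will cover for this string.
    Simd::Font::Point size = font.Measure("Hello, Simd!");
    font.Draw(canvas, "Hello, Simd!", Simd::Font::Point(10, 10),
        Simd::Pixel::Bgra32(0, 255, 0));                        // green text
    return (size.x > 0) ? 0 : 1;
}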
- { - _originalSize = Point(); - _originalSymbols.clear(); - return false; - } - return true; - } - - bool LoadDefault() - { - static const uint8_t data[] = { - 32, 127, 127, 231, 33, 12, 32, 0, 0, 33, 26, 160, 1, 57, 66, 1, 54, 69, 1, 52, 70, 1, 51, 72, 1, 50, - 73, 1, 49, 74, 1, 48, 75, 1, 48, 75, 1, 47, 76, 1, 47, 76, 1, 46, 77, 1, 46, 77, 1, 46, 77, 1, 45, 77, - 1, 45, 77, 1, 45, 78, 1, 45, 78, 1, 45, 78, 1, 45, 78, 1, 45, 78, 1, 45, 77, 1, 45, 77, 1, 46, 77, 1, - 46, 77, 1, 46, 77, 1, 46, 77, 1, 46, 77, 1, 46, 77, 1, 46, 77, 1, 46, 77, 1, 46, 77, 1, 46, 76, 1, 47, - 76, 1, 47, 76, 1, 47, 76, 1, 47, 76, 1, 47, 76, 1, 47, 76, 1, 47, 76, 1, 47, 76, 1, 47, 76, 1, 47, 75, - 1, 47, 75, 1, 48, 75, 1, 48, 75, 1, 48, 75, 1, 48, 75, 1, 48, 75, 1, 48, 75, 1, 48, 75, 1, 48, 75, 1, - 48, 75, 1, 48, 74, 1, 48, 74, 1, 49, 74, 1, 49, 74, 1, 49, 74, 1, 49, 74, 1, 49, 74, 1, 49, 74, 1, 49, - 74, 1, 49, 74, 1, 49, 74, 1, 49, 73, 1, 50, 73, 1, 50, 73, 1, 50, 73, 1, 50, 73, 1, 50, 73, 1, 50, 73, - 1, 50, 73, 1, 50, 73, 1, 50, 73, 1, 50, 73, 1, 50, 72, 1, 51, 72, 1, 51, 72, 1, 51, 72, 1, 51, 72, 1, - 51, 72, 1, 51, 72, 1, 51, 72, 1, 51, 72, 1, 51, 72, 1, 51, 72, 1, 52, 71, 1, 52, 71, 1, 52, 71, 1, 52, - 71, 1, 53, 70, 1, 54, 69, 1, 54, 68, 1, 56, 67, 1, 57, 65, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 1, 56, 67, 1, 54, 69, 1, 53, 71, 1, 52, 72, 1, 51, 72, 1, 50, 73, 1, 50, 73, 1, 50, - 74, 1, 49, 74, 1, 49, 74, 1, 49, 74, 1, 50, 74, 1, 50, 74, 1, 50, 73, 1, 51, 73, 1, 51, 72, 1, 52, 71, - 1, 53, 70, 1, 55, 68, 1, 56, 67, 34, 35, 90, 2, 27, 53, 70, 96, 2, 28, 53, 70, 96, 2, 28, 53, 70, 95, - 2, 28, 53, 70, 95, 2, 28, 53, 70, 95, 2, 28, 53, 70, 95, 2, 28, 53, 71, 95, 2, 28, 52, 71, 95, 2, 29, - 52, 71, 95, 2, 29, 52, 71, 94, 2, 29, 52, 71, 94, 2, 29, 52, 71, 94, 2, 29, 52, 71, 94, 2, 29, 52, 72, - 94, 2, 30, 52, 72, 94, 2, 30, 51, 72, 94, 2, 30, 51, 72, 94, 2, 30, 51, 72, 93, 2, 30, 51, 72, 93, 2, - 30, 51, 72, 93, 2, 30, 51, 73, 93, 2, 31, 51, 73, 93, 2, 31, 50, 73, 93, 2, 31, 50, 73, 93, 2, 31, 50, - 73, 93, 2, 31, 50, 73, 92, 2, 31, 50, 74, 92, 2, 31, 50, 74, 92, 2, 32, 50, 74, 92, 2, 32, 50, 74, 92, - 2, 32, 49, 74, 92, 2, 32, 49, 74, 92, 2, 32, 49, 74, 91, 2, 32, 49, 75, 91, 2, 32, 49, 75, 91, 2, 33, - 49, 75, 91, 2, 33, 49, 75, 91, 2, 33, 48, 75, 91, 2, 33, 48, 75, 91, 2, 33, 48, 75, 91, 2, 33, 48, 76, - 90, 2, 33, 48, 76, 90, 2, 34, 48, 76, 90, 2, 34, 48, 76, 90, 2, 34, 48, 76, 90, 2, 34, 47, 76, 90, 2, - 34, 47, 76, 90, 2, 34, 47, 77, 89, 2, 34, 47, 77, 89, 2, 35, 47, 77, 89, 2, 35, 47, 77, 89, 2, 35, 46, - 77, 89, 2, 36, 46, 78, 88, 2, 36, 45, 79, 87, 2, 38, 43, 80, 86, 35, 18, 176, 2, 47, 51, 83, 87, 2, 44, - 54, 80, 89, 2, 43, 55, 78, 91, 2, 42, 56, 77, 92, 2, 41, 57, 76, 93, 2, 40, 58, 76, 93, 2, 40, 58, 75, - 94, 2, 39, 59, 75, 94, 2, 39, 59, 75, 94, 2, 39, 59, 74, 94, 2, 39, 59, 74, 95, 2, 39, 59, 74, 95, 2, - 39, 59, 74, 95, 2, 39, 59, 74, 94, 2, 39, 59, 74, 94, 2, 39, 59, 74, 94, 2, 38, 59, 74, 94, 2, 38, 59, - 74, 94, 2, 38, 59, 74, 94, 2, 38, 59, 74, 94, 2, 38, 59, 74, 94, 2, 38, 58, 74, 94, 2, 38, 58, 73, 94, - 2, 38, 58, 73, 94, 2, 38, 58, 73, 94, 2, 38, 58, 73, 94, 2, 38, 58, 73, 93, 2, 38, 58, 73, 93, 2, 38, - 58, 73, 93, 2, 38, 58, 73, 93, 2, 37, 58, 73, 93, 2, 37, 58, 73, 93, 2, 37, 58, 73, 93, 2, 37, 58, 73, - 93, 2, 37, 58, 73, 93, 2, 37, 58, 73, 93, 2, 37, 57, 73, 93, 2, 37, 57, 72, 93, 2, 37, 57, 72, 93, 2, - 37, 57, 72, 93, 2, 37, 57, 72, 93, 2, 37, 57, 72, 92, 2, 37, 57, 72, 92, 2, 37, 57, 72, 92, 2, 37, 57, - 72, 92, 2, 36, 57, 72, 92, 2, 36, 57, 72, 92, 2, 
36, 57, 72, 92, 1, 26, 102, 1, 23, 105, 1, 21, 107, - 1, 20, 108, 1, 19, 109, 1, 19, 109, 1, 18, 110, 1, 18, 110, 1, 18, 110, 1, 18, 110, 1, 18, 110, 1, 18, - 110, 1, 18, 110, 1, 18, 110, 1, 19, 109, 1, 19, 109, 1, 20, 108, 1, 21, 107, 1, 22, 106, 1, 25, 103, - 2, 35, 55, 70, 91, 2, 35, 55, 70, 90, 2, 35, 55, 70, 90, 2, 35, 55, 70, 90, 2, 35, 55, 70, 90, 2, 34, - 55, 70, 90, 2, 34, 55, 70, 90, 2, 34, 55, 70, 90, 2, 34, 55, 70, 90, 2, 34, 55, 70, 90, 2, 34, 55, 70, - 90, 2, 34, 54, 69, 90, 2, 34, 54, 69, 90, 2, 34, 54, 69, 90, 2, 34, 54, 69, 90, 2, 34, 54, 69, 90, 2, - 34, 54, 69, 89, 2, 34, 54, 69, 89, 2, 34, 54, 69, 89, 2, 34, 54, 69, 89, 2, 33, 54, 69, 89, 2, 33, 54, - 69, 89, 1, 25, 95, 1, 19, 100, 1, 17, 102, 1, 16, 103, 1, 15, 104, 1, 15, 105, 1, 14, 105, 1, 14, 106, - 1, 14, 106, 1, 13, 106, 1, 13, 106, 1, 13, 106, 1, 14, 106, 1, 14, 106, 1, 14, 105, 1, 15, 105, 1, 15, - 104, 1, 16, 103, 1, 17, 102, 1, 19, 100, 1, 25, 95, 2, 32, 52, 67, 88, 2, 32, 52, 67, 87, 2, 32, 52, - 67, 87, 2, 32, 52, 67, 87, 2, 32, 52, 67, 87, 2, 31, 52, 67, 87, 2, 31, 52, 67, 87, 2, 31, 52, 67, 87, - 2, 31, 52, 67, 87, 2, 31, 52, 67, 87, 2, 31, 52, 67, 87, 2, 31, 51, 67, 87, 2, 31, 51, 66, 87, 2, 31, - 51, 66, 87, 2, 31, 51, 66, 87, 2, 31, 51, 66, 86, 2, 31, 51, 66, 86, 2, 31, 51, 66, 86, 2, 31, 51, 66, - 86, 2, 30, 51, 66, 86, 2, 30, 51, 66, 86, 2, 30, 51, 66, 86, 2, 30, 51, 66, 86, 2, 30, 51, 66, 86, 2, - 30, 51, 66, 86, 2, 30, 50, 66, 86, 2, 30, 50, 65, 86, 2, 30, 50, 65, 86, 2, 30, 50, 65, 86, 2, 30, 50, - 65, 86, 2, 30, 50, 65, 85, 2, 30, 50, 65, 85, 2, 30, 50, 65, 85, 2, 30, 50, 65, 85, 2, 29, 50, 65, 85, - 2, 29, 50, 65, 85, 2, 29, 50, 65, 85, 2, 30, 50, 65, 85, 2, 30, 50, 65, 84, 2, 30, 49, 65, 84, 2, 30, - 49, 65, 84, 2, 31, 49, 66, 84, 2, 31, 48, 66, 83, 2, 32, 47, 67, 82, 2, 33, 46, 68, 81, 2, 34, 45, 70, - 80, 2, 36, 43, 71, 78, 36, 16, 183, 1, 61, 63, 1, 58, 67, 1, 56, 68, 1, 55, 69, 1, 54, 70, 1, 53, 71, - 1, 53, 71, 1, 53, 71, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, - 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 73, 1, 48, 77, 1, 44, 81, 2, 42, 84, 88, 96, 1, 40, 98, - 1, 38, 99, 1, 37, 100, 1, 35, 100, 1, 34, 101, 1, 33, 101, 1, 32, 101, 1, 31, 102, 1, 30, 102, 1, 29, - 102, 1, 28, 102, 1, 28, 102, 1, 27, 102, 1, 26, 102, 1, 26, 102, 1, 25, 102, 2, 25, 58, 69, 102, 2, 24, - 53, 73, 102, 2, 24, 50, 76, 102, 2, 24, 49, 78, 102, 2, 23, 47, 80, 102, 2, 23, 46, 81, 102, 2, 23, 45, - 82, 102, 2, 23, 44, 82, 102, 2, 22, 44, 82, 101, 2, 22, 43, 83, 101, 2, 22, 43, 83, 101, 2, 22, 43, 84, - 100, 2, 22, 42, 84, 100, 2, 22, 42, 85, 99, 2, 22, 42, 87, 98, 2, 22, 43, 88, 96, 1, 22, 43, 1, 23, 44, - 1, 23, 44, 1, 23, 45, 1, 23, 47, 1, 23, 48, 1, 24, 51, 1, 24, 54, 1, 24, 58, 1, 25, 62, 1, 25, 67, 1, - 26, 72, 1, 26, 76, 1, 27, 80, 1, 28, 83, 1, 29, 85, 1, 29, 88, 1, 30, 90, 1, 31, 92, 1, 33, 93, 1, 34, - 95, 1, 35, 96, 1, 37, 97, 1, 39, 98, 1, 41, 99, 1, 43, 100, 1, 46, 101, 1, 50, 101, 1, 54, 102, 1, 59, - 102, 1, 64, 103, 1, 69, 104, 1, 73, 104, 1, 76, 104, 1, 79, 105, 1, 81, 105, 1, 82, 105, 1, 83, 105, - 1, 84, 106, 2, 26, 29, 85, 106, 2, 23, 32, 85, 106, 2, 22, 34, 86, 106, 2, 20, 35, 86, 106, 2, 20, 36, - 86, 106, 2, 19, 36, 86, 106, 2, 19, 37, 86, 106, 2, 18, 37, 86, 106, 2, 18, 38, 86, 106, 2, 18, 38, 86, - 106, 2, 18, 38, 85, 106, 2, 18, 39, 85, 106, 2, 18, 39, 84, 106, 2, 18, 40, 84, 106, 2, 18, 41, 83, 105, - 2, 18, 42, 82, 105, 2, 18, 43, 81, 105, 2, 18, 44, 79, 104, 2, 18, 46, 77, 104, 2, 18, 49, 75, 104, 2, - 18, 52, 73, 103, 2, 18, 56, 68, 103, 1, 
18, 102, 1, 18, 101, 1, 18, 101, 1, 18, 100, 1, 18, 99, 1, 18, - 98, 1, 18, 98, 1, 18, 97, 1, 18, 96, 1, 18, 95, 1, 18, 94, 1, 19, 92, 1, 19, 91, 1, 19, 89, 1, 20, 88, - 2, 21, 35, 39, 86, 2, 22, 33, 41, 83, 2, 24, 31, 44, 81, 1, 47, 77, 1, 51, 73, 1, 52, 72, 1, 52, 72, - 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, - 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, - 72, 1, 52, 72, 1, 52, 72, 1, 53, 71, 1, 53, 71, 1, 53, 71, 1, 54, 70, 1, 55, 69, 1, 56, 68, 1, 57, 67, - 1, 61, 63, 37, 30, 160, 1, 44, 54, 1, 40, 58, 1, 37, 61, 1, 36, 62, 1, 34, 64, 1, 32, 66, 1, 31, 67, - 1, 30, 68, 1, 29, 69, 1, 28, 70, 1, 27, 71, 1, 26, 72, 1, 25, 73, 2, 25, 44, 54, 73, 2, 24, 42, 56, 74, - 2, 24, 40, 58, 74, 2, 23, 39, 59, 75, 2, 23, 38, 61, 75, 2, 22, 37, 61, 76, 2, 22, 36, 62, 76, 2, 22, - 35, 63, 76, 2, 21, 35, 63, 77, 2, 21, 34, 64, 77, 2, 21, 34, 64, 77, 2, 21, 34, 65, 77, 2, 21, 33, 65, - 77, 2, 21, 33, 65, 77, 2, 20, 33, 65, 78, 2, 20, 33, 65, 78, 2, 20, 33, 65, 78, 2, 21, 33, 65, 78, 2, - 21, 33, 65, 77, 2, 21, 33, 65, 77, 2, 21, 34, 64, 77, 2, 21, 34, 64, 77, 2, 21, 34, 64, 77, 2, 21, 35, - 63, 77, 2, 22, 36, 63, 76, 2, 22, 36, 62, 76, 2, 22, 37, 61, 76, 2, 23, 38, 60, 75, 2, 23, 39, 59, 75, - 2, 24, 41, 57, 74, 2, 25, 42, 56, 74, 2, 25, 45, 53, 73, 1, 26, 72, 2, 27, 71, 100, 103, 2, 28, 71, 96, - 105, 2, 28, 70, 93, 106, 2, 29, 69, 90, 107, 2, 31, 68, 87, 107, 2, 32, 66, 84, 107, 2, 33, 65, 81, 108, - 2, 35, 63, 78, 107, 2, 37, 62, 75, 107, 2, 39, 59, 72, 107, 2, 42, 56, 68, 106, 1, 65, 105, 1, 62, 103, - 1, 59, 100, 1, 56, 97, 1, 53, 94, 1, 50, 91, 1, 47, 88, 1, 44, 85, 1, 41, 82, 1, 38, 78, 1, 35, 75, 1, - 31, 72, 1, 28, 69, 1, 25, 66, 1, 22, 63, 1, 20, 60, 2, 19, 57, 69, 81, 2, 18, 54, 66, 84, 2, 17, 51, - 63, 87, 2, 17, 47, 61, 88, 2, 17, 44, 59, 90, 2, 17, 41, 58, 92, 2, 17, 38, 57, 93, 2, 18, 35, 55, 94, - 2, 18, 32, 54, 95, 2, 19, 29, 53, 96, 2, 20, 26, 53, 97, 1, 52, 98, 1, 51, 99, 2, 50, 69, 80, 99, 2, - 50, 67, 82, 100, 2, 49, 65, 84, 100, 2, 49, 64, 85, 101, 2, 48, 63, 86, 101, 2, 48, 62, 87, 102, 2, 48, - 61, 88, 102, 2, 47, 61, 89, 102, 2, 47, 60, 89, 102, 2, 47, 60, 90, 103, 2, 46, 59, 90, 103, 2, 46, 59, - 90, 103, 2, 46, 59, 91, 103, 2, 46, 59, 91, 103, 2, 46, 59, 91, 103, 2, 46, 59, 91, 103, 2, 46, 59, 91, - 103, 2, 46, 59, 91, 103, 2, 46, 59, 91, 103, 2, 46, 59, 90, 103, 2, 46, 59, 90, 103, 2, 47, 60, 90, 103, - 2, 47, 60, 89, 103, 2, 47, 61, 89, 102, 2, 47, 61, 88, 102, 2, 48, 62, 87, 102, 2, 48, 63, 86, 101, 2, - 49, 64, 85, 101, 2, 49, 65, 84, 100, 2, 50, 67, 83, 100, 2, 50, 69, 81, 99, 2, 51, 73, 76, 99, 1, 52, - 98, 1, 53, 97, 1, 53, 96, 1, 54, 95, 1, 55, 94, 1, 56, 93, 1, 58, 92, 1, 59, 91, 1, 61, 89, 1, 63, 87, - 1, 65, 84, 1, 68, 81, 38, 44, 160, 1, 65, 72, 1, 59, 76, 2, 56, 79, 85, 89, 1, 54, 92, 1, 52, 94, 1, - 50, 95, 1, 49, 96, 1, 48, 96, 1, 47, 97, 1, 46, 97, 1, 45, 97, 1, 44, 97, 1, 43, 97, 1, 42, 97, 1, 42, - 97, 1, 41, 97, 1, 40, 97, 1, 40, 96, 1, 39, 95, 1, 39, 94, 1, 38, 93, 2, 38, 63, 73, 92, 2, 38, 61, 74, - 90, 2, 37, 60, 76, 88, 2, 37, 59, 77, 86, 2, 37, 58, 78, 84, 2, 37, 58, 80, 82, 1, 37, 57, 1, 37, 57, - 1, 37, 57, 1, 36, 57, 1, 36, 57, 1, 37, 57, 1, 37, 57, 1, 37, 58, 1, 37, 58, 1, 37, 59, 1, 37, 59, 1, - 37, 60, 1, 38, 60, 1, 38, 61, 1, 39, 62, 1, 39, 62, 1, 39, 63, 1, 40, 64, 1, 40, 64, 1, 41, 65, 1, 42, - 66, 1, 42, 66, 1, 43, 67, 1, 43, 68, 1, 42, 68, 1, 41, 69, 1, 39, 70, 1, 38, 71, 1, 37, 71, 2, 36, 72, - 88, 105, 2, 35, 73, 88, 108, 
- [... remainder of the deleted file omitted: several hundred further lines of auto-generated numeric array data (apparently run-length-encoded raster/glyph spans), with no human-readable content ...]
88, - 89, 109, 2, 16, 37, 65, 109, 2, 16, 37, 65, 109, 2, 16, 37, 66, 109, 2, 16, 37, 67, 109, 2, 16, 37, 67, - 109, 2, 16, 37, 68, 109, 2, 16, 37, 69, 109, 2, 16, 37, 69, 109, 2, 16, 37, 70, 109, 2, 16, 37, 70, 109, - 2, 16, 37, 71, 109, 2, 16, 37, 72, 109, 2, 16, 37, 72, 109, 2, 16, 37, 73, 109, 2, 16, 37, 74, 109, 2, - 16, 37, 74, 109, 2, 16, 37, 75, 109, 2, 16, 37, 76, 109, 2, 12, 49, 76, 109, 2, 9, 52, 77, 109, 2, 7, - 54, 78, 109, 2, 6, 55, 78, 109, 2, 5, 56, 79, 109, 2, 5, 56, 79, 109, 2, 4, 57, 80, 109, 2, 4, 57, 81, - 109, 2, 4, 57, 81, 109, 2, 4, 58, 82, 109, 2, 4, 58, 83, 109, 2, 4, 57, 83, 109, 2, 4, 57, 84, 109, 2, - 4, 57, 85, 109, 2, 5, 57, 85, 109, 2, 5, 56, 86, 109, 2, 6, 55, 86, 109, 2, 7, 54, 87, 109, 2, 8, 53, - 88, 109, 2, 11, 50, 88, 109, 79, 34, 160, 1, 57, 68, 1, 51, 74, 1, 47, 77, 1, 44, 80, 1, 42, 83, 1, 40, - 85, 1, 38, 87, 1, 36, 89, 1, 34, 90, 1, 33, 92, 1, 31, 93, 1, 30, 94, 1, 29, 96, 1, 27, 97, 1, 26, 98, - 1, 25, 99, 1, 24, 100, 1, 23, 101, 1, 22, 102, 1, 21, 103, 1, 20, 104, 2, 20, 55, 70, 105, 2, 19, 51, - 73, 106, 2, 18, 49, 75, 106, 2, 17, 47, 77, 107, 2, 16, 45, 79, 108, 2, 16, 44, 81, 108, 2, 15, 42, 82, - 109, 2, 15, 41, 83, 110, 2, 14, 40, 84, 110, 2, 13, 39, 85, 111, 2, 13, 38, 86, 111, 2, 12, 37, 87, 112, - 2, 12, 36, 88, 113, 2, 11, 35, 89, 113, 2, 11, 35, 90, 113, 2, 10, 34, 90, 114, 2, 10, 33, 91, 114, 2, - 10, 32, 92, 115, 2, 9, 32, 93, 115, 2, 9, 31, 93, 115, 2, 9, 31, 94, 116, 2, 8, 30, 94, 116, 2, 8, 30, - 95, 116, 2, 8, 29, 95, 117, 2, 7, 29, 96, 117, 2, 7, 28, 96, 117, 2, 7, 28, 96, 117, 2, 7, 28, 96, 118, - 2, 7, 28, 97, 118, 2, 6, 27, 97, 118, 2, 6, 27, 97, 118, 2, 6, 27, 98, 118, 2, 6, 27, 98, 118, 2, 6, - 26, 98, 119, 2, 6, 26, 98, 119, 2, 6, 26, 98, 119, 2, 6, 26, 98, 119, 2, 6, 26, 98, 119, 2, 5, 26, 98, - 119, 2, 5, 26, 99, 119, 2, 5, 26, 99, 119, 2, 5, 26, 99, 119, 2, 5, 26, 99, 119, 2, 5, 26, 99, 119, 2, - 5, 26, 99, 119, 2, 5, 26, 99, 119, 2, 5, 26, 98, 119, 2, 6, 26, 98, 119, 2, 6, 26, 98, 119, 2, 6, 26, - 98, 119, 2, 6, 26, 98, 119, 2, 6, 26, 98, 118, 2, 6, 27, 98, 118, 2, 6, 27, 97, 118, 2, 6, 27, 97, 118, - 2, 6, 27, 97, 118, 2, 7, 28, 97, 118, 2, 7, 28, 96, 117, 2, 7, 28, 96, 117, 2, 7, 29, 96, 117, 2, 8, - 29, 95, 117, 2, 8, 30, 95, 117, 2, 8, 30, 94, 116, 2, 8, 31, 94, 116, 2, 9, 31, 93, 116, 2, 9, 32, 93, - 115, 2, 9, 32, 92, 115, 2, 10, 33, 92, 115, 2, 10, 34, 91, 114, 2, 11, 34, 90, 114, 2, 11, 35, 89, 113, - 2, 12, 36, 89, 113, 2, 12, 37, 88, 112, 2, 13, 37, 87, 112, 2, 13, 38, 86, 111, 2, 14, 39, 85, 111, 2, - 14, 41, 84, 110, 2, 15, 42, 83, 110, 2, 16, 43, 81, 109, 2, 16, 45, 80, 108, 2, 17, 46, 78, 107, 2, 18, - 48, 76, 107, 2, 18, 50, 74, 106, 2, 19, 53, 71, 105, 2, 20, 57, 67, 104, 1, 21, 103, 1, 22, 103, 1, 23, - 102, 1, 24, 101, 1, 25, 100, 1, 26, 99, 1, 27, 97, 1, 28, 96, 1, 29, 95, 1, 31, 94, 1, 32, 92, 1, 33, - 91, 1, 35, 89, 1, 37, 88, 1, 39, 86, 1, 41, 84, 1, 43, 81, 1, 46, 79, 1, 49, 75, 1, 54, 71, 80, 37, 157, - 1, 23, 76, 1, 19, 85, 1, 17, 90, 1, 16, 93, 1, 15, 95, 1, 15, 97, 1, 14, 99, 1, 14, 101, 1, 14, 102, - 1, 13, 104, 1, 13, 105, 1, 13, 106, 1, 14, 107, 1, 14, 108, 1, 14, 109, 1, 15, 110, 1, 15, 111, 1, 16, - 111, 1, 17, 112, 1, 19, 113, 1, 25, 113, 2, 30, 50, 82, 114, 2, 30, 50, 85, 114, 2, 30, 50, 88, 115, - 2, 30, 50, 90, 115, 2, 30, 50, 91, 116, 2, 30, 50, 92, 116, 2, 30, 50, 93, 116, 2, 30, 50, 94, 117, 2, - 30, 50, 95, 117, 2, 30, 50, 96, 117, 2, 30, 50, 96, 118, 2, 30, 50, 97, 118, 2, 30, 50, 97, 118, 2, 30, - 50, 97, 118, 2, 30, 50, 98, 118, 2, 30, 50, 98, 118, 2, 30, 50, 98, 118, 2, 30, 
50, 98, 118, 2, 30, 50, - 98, 118, 2, 30, 50, 98, 118, 2, 30, 50, 98, 118, 2, 30, 50, 97, 118, 2, 30, 50, 97, 118, 2, 30, 50, 97, - 118, 2, 30, 50, 96, 118, 2, 30, 50, 96, 117, 2, 30, 50, 95, 117, 2, 30, 50, 94, 117, 2, 30, 50, 94, 117, - 2, 30, 50, 93, 116, 2, 30, 50, 92, 116, 2, 30, 50, 91, 116, 2, 30, 50, 89, 115, 2, 30, 50, 87, 115, 2, - 30, 50, 86, 114, 2, 30, 50, 83, 114, 2, 30, 50, 80, 113, 2, 30, 50, 74, 112, 1, 30, 112, 1, 30, 111, - 1, 30, 110, 1, 30, 109, 1, 30, 108, 1, 30, 107, 1, 30, 106, 1, 30, 105, 1, 30, 104, 1, 30, 102, 1, 30, - 101, 1, 30, 100, 1, 30, 98, 1, 30, 97, 1, 30, 95, 1, 30, 93, 1, 30, 91, 1, 30, 88, 1, 30, 85, 1, 30, - 79, 1, 30, 50, 1, 30, 50, 1, 30, 50, 1, 30, 50, 1, 30, 50, 1, 30, 50, 1, 30, 50, 1, 30, 50, 1, 30, 50, - 1, 30, 50, 1, 30, 50, 1, 30, 50, 1, 30, 50, 1, 30, 50, 1, 30, 50, 1, 30, 50, 1, 30, 50, 1, 30, 50, 1, - 30, 50, 1, 30, 50, 1, 30, 50, 1, 22, 76, 1, 18, 79, 1, 17, 81, 1, 16, 82, 1, 15, 83, 1, 14, 83, 1, 14, - 84, 1, 14, 84, 1, 14, 84, 1, 13, 84, 1, 13, 84, 1, 13, 84, 1, 14, 84, 1, 14, 84, 1, 14, 83, 1, 15, 83, - 1, 16, 82, 1, 17, 81, 1, 18, 80, 1, 20, 77, 81, 34, 187, 1, 57, 67, 1, 51, 73, 1, 47, 77, 1, 44, 80, - 1, 42, 82, 1, 40, 85, 1, 38, 87, 1, 36, 88, 1, 34, 90, 1, 33, 91, 1, 31, 93, 1, 30, 94, 1, 29, 95, 1, - 27, 97, 1, 26, 98, 1, 25, 99, 1, 24, 100, 1, 23, 101, 1, 22, 102, 1, 21, 103, 1, 20, 104, 2, 20, 55, - 70, 105, 2, 19, 51, 73, 105, 2, 18, 49, 75, 106, 2, 17, 47, 77, 107, 2, 16, 45, 79, 108, 2, 16, 44, 80, - 108, 2, 15, 42, 82, 109, 2, 15, 41, 83, 110, 2, 14, 40, 84, 110, 2, 13, 39, 85, 111, 2, 13, 38, 86, 111, - 2, 12, 37, 87, 112, 2, 12, 36, 88, 112, 2, 11, 35, 89, 113, 2, 11, 35, 90, 113, 2, 10, 34, 90, 114, 2, - 10, 33, 91, 114, 2, 10, 32, 92, 115, 2, 9, 32, 93, 115, 2, 9, 31, 93, 115, 2, 9, 31, 94, 116, 2, 8, 30, - 94, 116, 2, 8, 30, 94, 116, 2, 8, 29, 95, 117, 2, 7, 29, 95, 117, 2, 7, 28, 96, 117, 2, 7, 28, 96, 117, - 2, 7, 28, 96, 118, 2, 6, 28, 97, 118, 2, 6, 27, 97, 118, 2, 6, 27, 97, 118, 2, 6, 27, 97, 118, 2, 6, - 26, 98, 118, 2, 6, 26, 98, 118, 2, 6, 26, 98, 119, 2, 6, 26, 98, 119, 2, 6, 26, 98, 119, 2, 5, 26, 98, - 119, 2, 5, 26, 98, 119, 2, 5, 26, 98, 119, 2, 5, 26, 98, 119, 2, 5, 26, 99, 119, 2, 5, 26, 99, 119, 2, - 5, 26, 99, 119, 2, 5, 26, 98, 119, 2, 5, 26, 98, 119, 2, 6, 26, 98, 119, 2, 6, 26, 98, 119, 2, 6, 26, - 98, 119, 2, 6, 26, 98, 119, 2, 6, 26, 98, 118, 2, 6, 26, 98, 118, 2, 6, 27, 98, 118, 2, 6, 27, 97, 118, - 2, 6, 27, 97, 118, 2, 7, 27, 97, 118, 2, 7, 28, 97, 118, 2, 7, 28, 96, 117, 2, 7, 28, 96, 117, 2, 7, - 28, 96, 117, 2, 8, 29, 95, 117, 2, 8, 29, 95, 116, 2, 8, 30, 94, 116, 2, 9, 30, 94, 116, 2, 9, 31, 93, - 115, 2, 9, 31, 93, 115, 2, 10, 32, 92, 115, 2, 10, 33, 92, 114, 2, 10, 33, 91, 114, 2, 11, 34, 90, 114, - 2, 11, 35, 89, 113, 2, 12, 36, 88, 113, 2, 12, 36, 88, 112, 2, 13, 37, 87, 112, 2, 13, 38, 86, 111, 2, - 14, 39, 85, 111, 2, 14, 41, 84, 110, 2, 15, 42, 82, 109, 2, 16, 43, 81, 109, 2, 16, 44, 80, 108, 2, 17, - 46, 78, 107, 2, 18, 48, 76, 107, 2, 19, 50, 74, 106, 2, 19, 53, 71, 105, 2, 20, 57, 67, 104, 1, 21, 103, - 1, 22, 103, 1, 23, 102, 1, 24, 101, 1, 25, 100, 1, 26, 99, 1, 27, 97, 1, 28, 96, 1, 29, 95, 1, 31, 94, - 1, 32, 92, 1, 33, 91, 1, 35, 89, 1, 37, 88, 1, 39, 86, 1, 41, 84, 1, 41, 81, 1, 40, 79, 1, 39, 75, 1, - 38, 70, 2, 37, 66, 107, 110, 2, 36, 73, 105, 114, 2, 35, 78, 103, 115, 2, 34, 82, 101, 116, 2, 33, 84, - 99, 117, 2, 32, 87, 97, 118, 2, 31, 89, 95, 118, 1, 30, 119, 1, 29, 119, 1, 28, 119, 1, 27, 119, 1, 26, - 119, 1, 26, 119, 1, 25, 119, 1, 25, 118, 1, 25, 118, 1, 25, 117, 
1, 25, 116, 1, 25, 115, 1, 25, 114, - 1, 25, 112, 1, 26, 110, 2, 27, 56, 73, 109, 2, 27, 51, 76, 107, 2, 29, 46, 79, 105, 2, 30, 42, 82, 102, - 2, 33, 37, 86, 98, 82, 37, 157, 1, 14, 66, 1, 10, 76, 1, 8, 80, 1, 7, 84, 1, 6, 86, 1, 5, 89, 1, 5, 91, - 1, 5, 92, 1, 4, 94, 1, 4, 95, 1, 4, 96, 1, 4, 97, 1, 4, 98, 1, 5, 99, 1, 5, 100, 1, 5, 101, 1, 6, 102, - 1, 7, 103, 1, 8, 103, 1, 10, 104, 1, 15, 104, 2, 21, 41, 73, 105, 2, 21, 41, 77, 106, 2, 21, 41, 79, - 106, 2, 21, 41, 81, 106, 2, 21, 41, 82, 107, 2, 21, 41, 83, 107, 2, 21, 41, 84, 107, 2, 21, 41, 85, 108, - 2, 21, 41, 86, 108, 2, 21, 41, 87, 108, 2, 21, 41, 87, 108, 2, 21, 41, 88, 108, 2, 21, 41, 88, 108, 2, - 21, 41, 88, 109, 2, 21, 41, 88, 109, 2, 21, 41, 88, 109, 2, 21, 41, 88, 109, 2, 21, 41, 88, 109, 2, 21, - 41, 88, 109, 2, 21, 41, 87, 108, 2, 21, 41, 87, 108, 2, 21, 41, 86, 108, 2, 21, 41, 86, 108, 2, 21, 41, - 85, 108, 2, 21, 41, 84, 107, 2, 21, 41, 82, 107, 2, 21, 41, 81, 107, 2, 21, 41, 79, 106, 2, 21, 41, 77, - 106, 2, 21, 41, 75, 105, 2, 21, 41, 73, 105, 2, 21, 41, 69, 104, 2, 21, 41, 65, 104, 1, 21, 103, 1, 21, - 102, 1, 21, 101, 1, 21, 100, 1, 21, 99, 1, 21, 98, 1, 21, 97, 1, 21, 96, 1, 21, 95, 1, 21, 93, 1, 21, - 92, 1, 21, 90, 1, 21, 89, 1, 21, 87, 1, 21, 87, 1, 21, 88, 1, 21, 89, 1, 21, 90, 1, 21, 91, 1, 21, 92, - 2, 21, 41, 61, 93, 2, 21, 41, 63, 94, 2, 21, 41, 65, 95, 2, 21, 41, 66, 96, 2, 21, 41, 68, 97, 2, 21, - 41, 69, 98, 2, 21, 41, 70, 99, 2, 21, 41, 71, 99, 2, 21, 41, 72, 100, 2, 21, 41, 74, 101, 2, 21, 41, - 75, 102, 2, 21, 41, 76, 103, 2, 21, 41, 77, 103, 2, 21, 41, 77, 104, 2, 21, 41, 78, 105, 2, 21, 41, 79, - 106, 2, 21, 41, 80, 106, 2, 21, 41, 81, 107, 2, 21, 41, 82, 108, 2, 21, 41, 83, 108, 2, 21, 41, 84, 109, - 2, 21, 41, 84, 110, 2, 21, 41, 85, 110, 2, 21, 41, 86, 111, 2, 21, 41, 87, 112, 2, 21, 41, 88, 112, 2, - 12, 54, 88, 119, 2, 9, 57, 89, 123, 2, 8, 58, 90, 124, 2, 7, 60, 91, 125, 2, 6, 60, 91, 126, 2, 5, 61, - 92, 126, 2, 5, 61, 93, 127, 2, 4, 62, 93, 127, 2, 4, 62, 94, 127, 2, 4, 62, 95, 127, 2, 4, 62, 96, 127, - 2, 4, 62, 96, 127, 2, 4, 62, 97, 127, 2, 5, 61, 98, 127, 2, 5, 61, 98, 126, 2, 6, 61, 99, 126, 2, 6, - 60, 100, 125, 2, 7, 59, 100, 124, 2, 9, 57, 101, 123, 2, 11, 55, 102, 120, 83, 34, 160, 1, 58, 66, 1, - 50, 74, 1, 46, 78, 2, 43, 81, 96, 97, 2, 41, 83, 92, 101, 2, 39, 85, 90, 102, 2, 36, 87, 89, 103, 1, - 35, 104, 1, 34, 105, 1, 32, 105, 1, 31, 106, 1, 30, 106, 1, 29, 106, 1, 28, 106, 1, 27, 106, 1, 26, 106, - 1, 25, 106, 1, 24, 106, 1, 24, 106, 1, 23, 106, 1, 22, 106, 2, 22, 54, 70, 106, 2, 21, 50, 73, 106, 2, - 21, 48, 76, 106, 2, 20, 46, 78, 106, 2, 20, 45, 79, 106, 2, 19, 44, 81, 106, 2, 19, 42, 82, 106, 2, 19, - 41, 83, 106, 2, 19, 41, 84, 106, 2, 18, 40, 84, 106, 2, 18, 39, 85, 106, 2, 18, 39, 85, 106, 2, 18, 38, - 86, 106, 2, 18, 38, 86, 106, 2, 18, 38, 86, 106, 2, 18, 38, 86, 106, 2, 18, 38, 87, 106, 2, 18, 38, 87, - 106, 2, 18, 39, 87, 105, 2, 18, 39, 88, 105, 2, 18, 40, 88, 104, 2, 18, 40, 89, 104, 2, 19, 41, 90, 103, - 2, 19, 42, 92, 101, 2, 19, 43, 95, 98, 1, 19, 44, 1, 20, 45, 1, 20, 47, 1, 21, 50, 1, 21, 53, 1, 22, - 58, 1, 22, 62, 1, 23, 67, 1, 23, 72, 1, 24, 76, 1, 25, 80, 1, 26, 84, 1, 27, 87, 1, 27, 89, 1, 28, 92, - 1, 30, 94, 1, 31, 96, 1, 32, 97, 1, 34, 98, 1, 35, 100, 1, 37, 101, 1, 40, 102, 1, 42, 103, 1, 45, 103, - 1, 49, 104, 1, 52, 105, 1, 57, 106, 1, 61, 106, 1, 67, 107, 1, 71, 107, 1, 76, 108, 1, 79, 108, 1, 81, - 109, 1, 83, 109, 1, 85, 109, 1, 86, 110, 2, 23, 25, 87, 110, 2, 19, 29, 88, 110, 2, 18, 30, 89, 110, - 2, 17, 31, 89, 110, 2, 16, 32, 90, 110, 2, 15, 
33, 90, 111, 2, 15, 33, 90, 111, 2, 15, 33, 90, 111, 2, - 14, 34, 91, 111, 2, 14, 34, 90, 111, 2, 14, 35, 90, 111, 2, 14, 35, 90, 110, 2, 14, 35, 90, 110, 2, 14, - 36, 89, 110, 2, 14, 36, 88, 110, 2, 14, 38, 87, 110, 2, 14, 39, 86, 110, 2, 14, 40, 85, 109, 2, 14, 42, - 84, 109, 2, 14, 44, 82, 109, 2, 14, 46, 80, 108, 2, 14, 49, 77, 108, 2, 14, 52, 74, 108, 2, 14, 57, 69, - 107, 1, 14, 106, 1, 14, 106, 1, 14, 105, 1, 14, 105, 1, 14, 104, 1, 14, 103, 1, 14, 102, 1, 14, 101, - 1, 14, 100, 1, 14, 98, 1, 15, 97, 1, 15, 96, 2, 16, 32, 33, 94, 2, 16, 31, 35, 93, 2, 17, 30, 38, 90, - 2, 19, 29, 40, 88, 2, 21, 27, 43, 86, 1, 46, 83, 1, 50, 79, 1, 55, 75, 84, 37, 157, 1, 9, 114, 1, 9, - 114, 1, 9, 114, 1, 9, 114, 1, 9, 114, 1, 9, 114, 1, 9, 114, 1, 9, 114, 1, 9, 114, 1, 9, 114, 1, 9, 114, - 1, 9, 114, 1, 9, 114, 1, 9, 114, 1, 9, 114, 1, 9, 114, 1, 9, 114, 1, 9, 114, 1, 9, 114, 1, 9, 114, 1, - 9, 114, 3, 9, 30, 52, 72, 94, 114, 3, 9, 30, 52, 72, 94, 114, 3, 9, 30, 52, 72, 94, 114, 3, 9, 30, 52, - 72, 94, 114, 3, 9, 30, 52, 72, 94, 114, 3, 9, 30, 52, 72, 94, 114, 3, 9, 30, 52, 72, 94, 114, 3, 9, 30, - 52, 72, 94, 114, 3, 9, 30, 52, 72, 94, 114, 3, 9, 30, 52, 72, 94, 114, 3, 9, 30, 52, 72, 94, 114, 3, - 9, 30, 52, 72, 94, 114, 3, 9, 30, 52, 72, 94, 114, 3, 9, 30, 52, 72, 94, 114, 3, 9, 30, 52, 72, 94, 114, - 3, 9, 30, 52, 72, 94, 114, 3, 9, 30, 52, 72, 94, 114, 3, 9, 30, 52, 72, 94, 114, 3, 9, 30, 52, 72, 94, - 114, 3, 9, 30, 52, 72, 94, 114, 3, 9, 30, 52, 72, 94, 114, 3, 10, 30, 52, 72, 94, 114, 3, 10, 29, 52, - 72, 94, 114, 3, 10, 29, 52, 72, 95, 114, 3, 10, 29, 52, 72, 95, 114, 3, 10, 29, 52, 72, 95, 114, 3, 11, - 28, 52, 72, 95, 113, 3, 11, 28, 52, 72, 96, 113, 3, 12, 27, 52, 72, 96, 112, 3, 12, 27, 52, 72, 97, 111, - 3, 14, 26, 52, 72, 98, 110, 3, 15, 24, 52, 72, 100, 109, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, - 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, - 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, - 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, - 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, - 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 33, 91, 1, 30, 94, 1, 28, 96, 1, 27, - 97, 1, 26, 97, 1, 26, 98, 1, 25, 99, 1, 25, 99, 1, 25, 99, 1, 25, 99, 1, 25, 99, 1, 25, 99, 1, 25, 99, - 1, 25, 99, 1, 25, 98, 1, 26, 98, 1, 27, 97, 1, 28, 96, 1, 29, 94, 1, 32, 92, 85, 37, 160, 2, 14, 48, - 77, 111, 2, 9, 51, 73, 115, 2, 7, 53, 71, 117, 2, 6, 54, 70, 118, 2, 6, 55, 69, 119, 2, 5, 56, 68, 120, - 2, 4, 56, 68, 120, 2, 4, 57, 68, 121, 2, 4, 57, 67, 121, 2, 4, 57, 67, 121, 2, 3, 57, 67, 121, 2, 4, - 57, 67, 121, 2, 4, 57, 67, 121, 2, 4, 57, 68, 121, 2, 4, 56, 68, 120, 2, 5, 56, 69, 120, 2, 6, 55, 69, - 119, 2, 6, 54, 70, 118, 2, 8, 53, 71, 117, 2, 9, 51, 73, 115, 2, 14, 46, 79, 110, 2, 16, 36, 88, 108, - 2, 16, 36, 88, 108, 2, 16, 36, 88, 108, 2, 16, 36, 88, 108, 2, 16, 36, 88, 108, 2, 16, 36, 88, 108, 2, - 16, 36, 88, 108, 2, 16, 36, 88, 108, 2, 16, 36, 88, 108, 2, 16, 36, 88, 108, 2, 16, 36, 88, 108, 2, 16, - 36, 88, 108, 2, 16, 36, 88, 108, 2, 16, 36, 88, 108, 2, 16, 36, 88, 108, 2, 16, 36, 88, 108, 2, 16, 36, - 88, 108, 2, 16, 36, 88, 108, 2, 16, 36, 88, 108, 2, 16, 36, 88, 108, 2, 16, 36, 88, 108, 2, 16, 36, 88, - 108, 2, 16, 36, 88, 108, 2, 16, 36, 88, 108, 2, 16, 36, 88, 108, 2, 16, 36, 88, 108, 2, 16, 36, 88, 108, - 2, 16, 36, 88, 108, 2, 16, 36, 
88, 108, 2, 16, 36, 88, 108, 2, 16, 36, 88, 108, 2, 16, 36, 88, 108, 2, - 16, 36, 88, 108, 2, 16, 36, 88, 108, 2, 16, 36, 88, 108, 2, 16, 36, 88, 108, 2, 16, 36, 88, 108, 2, 16, - 36, 88, 108, 2, 16, 36, 88, 108, 2, 16, 36, 88, 108, 2, 16, 36, 88, 108, 2, 16, 36, 88, 108, 2, 16, 36, - 88, 108, 2, 16, 36, 88, 108, 2, 16, 36, 88, 108, 2, 16, 36, 88, 108, 2, 16, 36, 88, 108, 2, 16, 36, 88, - 108, 2, 16, 36, 88, 108, 2, 16, 36, 88, 108, 2, 16, 36, 88, 108, 2, 16, 36, 88, 108, 2, 16, 36, 88, 108, - 2, 16, 36, 88, 108, 2, 16, 36, 88, 108, 2, 16, 36, 88, 108, 2, 16, 36, 88, 108, 2, 16, 36, 88, 108, 2, - 16, 36, 88, 108, 2, 16, 36, 88, 108, 2, 16, 36, 88, 108, 2, 16, 37, 88, 108, 2, 16, 37, 88, 108, 2, 16, - 37, 88, 108, 2, 17, 37, 87, 108, 2, 17, 38, 87, 108, 2, 17, 38, 87, 108, 2, 17, 38, 86, 107, 2, 17, 39, - 86, 107, 2, 17, 39, 85, 107, 2, 18, 40, 84, 107, 2, 18, 41, 84, 106, 2, 18, 42, 83, 106, 2, 19, 43, 82, - 105, 2, 19, 43, 81, 105, 2, 20, 45, 80, 105, 2, 20, 46, 79, 104, 2, 21, 47, 77, 104, 2, 21, 49, 76, 103, - 2, 22, 51, 74, 103, 2, 23, 53, 71, 102, 2, 23, 57, 67, 101, 1, 24, 101, 1, 25, 100, 1, 26, 99, 1, 27, - 98, 1, 28, 97, 1, 28, 96, 1, 29, 95, 1, 30, 94, 1, 32, 93, 1, 33, 92, 1, 34, 91, 1, 35, 90, 1, 37, 88, - 1, 38, 87, 1, 40, 85, 1, 42, 83, 1, 44, 81, 1, 46, 78, 1, 49, 75, 1, 54, 71, 86, 37, 157, 2, 6, 41, 83, - 117, 2, 2, 45, 79, 122, 2, 1, 47, 77, 124, 2, 0, 48, 76, 125, 2, 0, 49, 75, 126, 2, 0, 50, 74, 126, 2, - 0, 50, 74, 127, 2, 0, 51, 74, 127, 2, 0, 51, 73, 127, 2, 0, 51, 73, 127, 2, 0, 51, 73, 128, 2, 0, 51, - 73, 127, 2, 0, 51, 73, 127, 2, 0, 51, 74, 127, 2, 0, 50, 74, 127, 2, 0, 50, 75, 126, 2, 0, 49, 75, 125, - 2, 0, 48, 76, 125, 2, 1, 47, 77, 123, 2, 2, 45, 79, 122, 2, 7, 40, 84, 117, 2, 11, 33, 91, 113, 2, 12, - 34, 91, 113, 2, 12, 34, 90, 112, 2, 12, 34, 90, 112, 2, 13, 35, 89, 111, 2, 13, 35, 89, 111, 2, 14, 36, - 89, 111, 2, 14, 36, 88, 110, 2, 14, 37, 88, 110, 2, 15, 37, 87, 109, 2, 15, 37, 87, 109, 2, 16, 38, 86, - 109, 2, 16, 38, 86, 108, 2, 17, 39, 86, 108, 2, 17, 39, 85, 107, 2, 17, 39, 85, 107, 2, 18, 40, 84, 106, - 2, 18, 40, 84, 106, 2, 19, 41, 84, 106, 2, 19, 41, 83, 105, 2, 19, 42, 83, 105, 2, 20, 42, 82, 104, 2, - 20, 42, 82, 104, 2, 21, 43, 81, 104, 2, 21, 43, 81, 103, 2, 22, 44, 81, 103, 2, 22, 44, 80, 102, 2, 22, - 44, 80, 102, 2, 23, 45, 79, 101, 2, 23, 45, 79, 101, 2, 24, 46, 78, 101, 2, 24, 46, 78, 100, 2, 24, 47, - 78, 100, 2, 25, 47, 77, 99, 2, 25, 47, 77, 99, 2, 26, 48, 76, 98, 2, 26, 48, 76, 98, 2, 27, 49, 76, 98, - 2, 27, 49, 75, 97, 2, 27, 49, 75, 97, 2, 28, 50, 74, 96, 2, 28, 50, 74, 96, 2, 29, 51, 73, 96, 2, 29, - 51, 73, 95, 2, 29, 52, 73, 95, 2, 30, 52, 72, 94, 2, 30, 52, 72, 94, 2, 31, 53, 71, 93, 2, 31, 53, 71, - 93, 2, 32, 54, 71, 93, 2, 32, 54, 70, 92, 2, 32, 54, 70, 92, 2, 33, 55, 69, 91, 2, 33, 55, 69, 91, 2, - 34, 56, 68, 91, 2, 34, 56, 68, 90, 2, 34, 57, 68, 90, 2, 35, 57, 67, 89, 2, 35, 57, 67, 89, 2, 36, 58, - 66, 88, 2, 36, 58, 66, 88, 2, 37, 59, 65, 88, 2, 37, 59, 65, 87, 2, 37, 59, 65, 87, 2, 38, 60, 64, 86, - 2, 38, 60, 64, 86, 2, 39, 61, 63, 86, 2, 39, 61, 63, 85, 2, 39, 62, 63, 85, 1, 40, 84, 1, 40, 84, 1, - 41, 83, 1, 41, 83, 1, 42, 83, 1, 42, 82, 1, 42, 82, 1, 43, 81, 1, 43, 81, 1, 44, 80, 1, 44, 80, 1, 44, - 80, 1, 45, 79, 1, 45, 79, 1, 46, 78, 1, 46, 78, 1, 47, 78, 1, 47, 77, 1, 47, 77, 1, 48, 76, 1, 48, 76, - 1, 49, 75, 1, 49, 75, 1, 49, 75, 1, 50, 74, 1, 50, 74, 1, 51, 73, 1, 51, 73, 1, 52, 73, 1, 52, 72, 87, - 37, 157, 2, 9, 43, 81, 115, 2, 5, 47, 77, 119, 2, 3, 49, 75, 121, 2, 2, 51, 74, 123, 2, 1, 51, 73, 123, - 
2, 0, 52, 72, 124, 2, 0, 52, 72, 125, 2, 0, 53, 72, 125, 2, 0, 53, 71, 125, 2, 0, 53, 71, 125, 2, 0, - 53, 71, 125, 2, 0, 53, 71, 125, 2, 0, 53, 71, 125, 2, 0, 53, 72, 125, 2, 0, 52, 72, 124, 2, 0, 52, 73, - 124, 2, 1, 51, 73, 123, 2, 2, 50, 74, 122, 2, 3, 49, 75, 121, 2, 4, 47, 77, 119, 2, 8, 42, 83, 116, 2, - 9, 29, 95, 116, 2, 9, 29, 95, 115, 2, 9, 29, 95, 115, 2, 9, 30, 95, 115, 2, 9, 30, 95, 115, 2, 9, 30, - 95, 115, 2, 9, 30, 95, 115, 2, 9, 30, 94, 115, 2, 10, 30, 94, 115, 3, 10, 30, 52, 73, 94, 115, 3, 10, - 30, 51, 73, 94, 114, 3, 10, 30, 51, 74, 94, 114, 3, 10, 31, 51, 74, 94, 114, 3, 10, 31, 50, 74, 94, 114, - 3, 10, 31, 50, 75, 94, 114, 3, 10, 31, 50, 75, 93, 114, 3, 11, 31, 49, 75, 93, 114, 3, 11, 31, 49, 76, - 93, 114, 3, 11, 31, 49, 76, 93, 113, 3, 11, 31, 48, 76, 93, 113, 3, 11, 32, 48, 77, 93, 113, 3, 11, 32, - 48, 77, 93, 113, 3, 11, 32, 47, 77, 93, 113, 3, 11, 32, 47, 78, 93, 113, 3, 12, 32, 47, 78, 92, 113, - 3, 12, 32, 46, 78, 92, 113, 3, 12, 32, 46, 79, 92, 113, 3, 12, 32, 46, 79, 92, 112, 3, 12, 33, 45, 79, - 92, 112, 3, 12, 33, 45, 80, 92, 112, 3, 12, 33, 45, 80, 92, 112, 3, 12, 33, 44, 80, 92, 112, 3, 12, 33, - 44, 81, 91, 112, 3, 13, 33, 44, 81, 91, 112, 3, 13, 33, 43, 81, 91, 112, 3, 13, 33, 43, 82, 91, 111, - 3, 13, 33, 43, 82, 91, 111, 3, 13, 34, 42, 82, 91, 111, 3, 13, 34, 42, 83, 91, 111, 3, 13, 34, 41, 83, - 91, 111, 3, 13, 34, 41, 83, 91, 111, 3, 14, 34, 41, 84, 90, 111, 3, 14, 34, 40, 84, 90, 111, 4, 14, 34, - 40, 62, 63, 84, 90, 111, 4, 14, 34, 40, 61, 63, 85, 90, 110, 4, 14, 35, 39, 61, 63, 85, 90, 110, 4, 14, - 35, 39, 61, 64, 85, 90, 110, 4, 14, 35, 39, 60, 64, 86, 90, 110, 4, 14, 35, 38, 60, 64, 86, 90, 110, - 4, 15, 35, 38, 60, 65, 86, 89, 110, 4, 15, 35, 38, 59, 65, 87, 89, 110, 4, 15, 35, 37, 59, 65, 87, 89, - 110, 4, 15, 35, 37, 59, 66, 87, 89, 109, 4, 15, 35, 37, 58, 66, 88, 89, 109, 3, 15, 58, 66, 88, 89, 109, - 3, 15, 58, 67, 88, 89, 109, 2, 15, 57, 67, 109, 2, 16, 57, 67, 109, 2, 16, 57, 68, 109, 2, 16, 56, 68, - 109, 2, 16, 56, 68, 109, 2, 16, 56, 69, 108, 2, 16, 55, 69, 108, 2, 16, 55, 70, 108, 2, 16, 55, 70, 108, - 2, 16, 54, 70, 108, 2, 17, 54, 71, 108, 2, 17, 54, 71, 108, 2, 17, 53, 71, 108, 2, 17, 53, 72, 107, 2, - 17, 53, 72, 107, 2, 17, 52, 72, 107, 2, 17, 52, 73, 107, 2, 17, 52, 73, 107, 2, 18, 51, 73, 107, 2, 18, - 51, 74, 107, 2, 18, 51, 74, 107, 2, 18, 50, 74, 107, 2, 18, 50, 75, 106, 2, 18, 50, 75, 106, 2, 18, 49, - 75, 106, 2, 18, 49, 76, 106, 2, 19, 49, 76, 106, 2, 19, 48, 76, 106, 2, 19, 48, 77, 106, 2, 19, 48, 77, - 106, 2, 19, 47, 77, 105, 2, 19, 47, 78, 105, 2, 19, 47, 78, 105, 2, 19, 46, 78, 105, 2, 20, 46, 79, 105, - 2, 20, 46, 79, 105, 2, 20, 46, 79, 105, 2, 20, 45, 80, 105, 2, 20, 45, 80, 105, 2, 20, 45, 80, 104, 2, - 20, 44, 81, 104, 2, 20, 44, 81, 104, 2, 20, 44, 81, 104, 88, 37, 157, 2, 15, 41, 83, 109, 2, 11, 45, - 79, 113, 2, 9, 47, 77, 115, 2, 8, 48, 76, 116, 2, 7, 49, 75, 117, 2, 7, 50, 74, 118, 2, 6, 50, 74, 118, - 2, 6, 51, 74, 118, 2, 5, 51, 73, 119, 2, 5, 51, 73, 119, 2, 5, 51, 73, 119, 2, 6, 51, 73, 119, 2, 6, - 51, 73, 118, 2, 6, 51, 74, 118, 2, 6, 50, 74, 118, 2, 7, 50, 75, 117, 2, 8, 49, 75, 117, 2, 8, 48, 76, - 116, 2, 9, 47, 77, 114, 2, 11, 45, 79, 113, 2, 16, 44, 81, 108, 2, 18, 44, 80, 106, 2, 19, 45, 79, 106, - 2, 20, 46, 78, 105, 2, 20, 47, 78, 104, 2, 21, 48, 77, 103, 2, 22, 48, 76, 102, 2, 23, 49, 75, 101, 2, - 24, 50, 74, 101, 2, 25, 51, 74, 100, 2, 25, 52, 73, 99, 2, 26, 53, 72, 98, 2, 27, 53, 71, 97, 2, 28, - 54, 70, 96, 2, 29, 55, 69, 96, 2, 30, 56, 69, 95, 2, 30, 57, 68, 94, 2, 31, 58, 67, 93, 
2, 32, 58, 66, - 92, 2, 33, 59, 65, 92, 2, 34, 60, 65, 91, 2, 35, 61, 64, 90, 2, 35, 62, 63, 89, 1, 36, 88, 1, 37, 87, - 1, 38, 87, 1, 39, 86, 1, 40, 85, 1, 40, 84, 1, 41, 83, 1, 42, 82, 1, 43, 82, 1, 44, 81, 1, 45, 80, 1, - 46, 79, 1, 46, 78, 1, 47, 78, 1, 48, 77, 1, 49, 76, 1, 49, 76, 1, 48, 77, 1, 47, 77, 1, 46, 78, 1, 45, - 79, 1, 45, 80, 1, 44, 81, 1, 43, 81, 1, 42, 82, 1, 41, 83, 1, 40, 84, 1, 40, 85, 1, 39, 86, 1, 38, 86, - 1, 37, 87, 1, 36, 88, 2, 36, 62, 63, 89, 2, 35, 61, 64, 90, 2, 34, 60, 65, 91, 2, 33, 59, 65, 91, 2, - 32, 59, 66, 92, 2, 31, 58, 67, 93, 2, 31, 57, 68, 94, 2, 30, 56, 69, 95, 2, 29, 55, 70, 96, 2, 28, 55, - 70, 96, 2, 27, 54, 71, 97, 2, 27, 53, 72, 98, 2, 26, 52, 73, 99, 2, 25, 51, 74, 100, 2, 24, 50, 74, 101, - 2, 23, 50, 75, 101, 2, 22, 49, 76, 102, 2, 22, 48, 77, 103, 2, 21, 47, 78, 104, 2, 20, 46, 79, 105, 2, - 19, 46, 79, 106, 2, 18, 45, 80, 106, 2, 17, 44, 81, 107, 2, 17, 43, 82, 108, 2, 16, 42, 83, 109, 2, 12, - 45, 80, 113, 2, 9, 48, 77, 116, 2, 7, 50, 75, 118, 2, 6, 51, 74, 119, 2, 5, 51, 73, 119, 2, 5, 52, 73, - 120, 2, 4, 53, 72, 121, 2, 4, 53, 72, 121, 2, 4, 53, 72, 121, 2, 3, 53, 72, 121, 2, 3, 53, 71, 121, 2, - 4, 53, 72, 121, 2, 4, 53, 72, 121, 2, 4, 53, 72, 121, 2, 4, 52, 72, 120, 2, 5, 52, 73, 120, 2, 6, 51, - 74, 119, 2, 7, 50, 75, 118, 2, 8, 49, 76, 117, 2, 11, 46, 79, 114, 89, 37, 157, 2, 15, 41, 83, 109, 2, - 11, 45, 79, 113, 2, 9, 47, 77, 115, 2, 8, 48, 76, 116, 2, 7, 49, 75, 117, 2, 7, 50, 74, 118, 2, 6, 50, - 74, 118, 2, 6, 51, 73, 119, 2, 6, 51, 73, 119, 2, 6, 51, 73, 119, 2, 6, 51, 73, 119, 2, 6, 51, 73, 119, - 2, 6, 51, 73, 119, 2, 6, 51, 73, 119, 2, 6, 50, 74, 118, 2, 7, 50, 74, 118, 2, 8, 49, 75, 117, 2, 8, - 48, 76, 116, 2, 10, 47, 77, 115, 2, 12, 45, 79, 113, 2, 16, 43, 81, 108, 2, 19, 44, 80, 105, 2, 20, 44, - 80, 104, 2, 20, 45, 79, 104, 2, 21, 46, 78, 103, 2, 22, 46, 78, 102, 2, 23, 47, 77, 102, 2, 23, 48, 76, - 101, 2, 24, 48, 76, 100, 2, 25, 49, 75, 100, 2, 25, 50, 74, 99, 2, 26, 50, 74, 98, 2, 27, 51, 73, 98, - 2, 27, 52, 72, 97, 2, 28, 52, 72, 96, 2, 29, 53, 71, 96, 2, 29, 54, 70, 95, 2, 30, 55, 70, 94, 2, 31, - 55, 69, 93, 2, 31, 56, 68, 93, 2, 32, 57, 68, 92, 2, 33, 57, 67, 91, 2, 33, 58, 66, 91, 2, 34, 59, 66, - 90, 2, 35, 59, 65, 89, 2, 35, 60, 64, 89, 2, 36, 61, 64, 88, 2, 37, 61, 63, 87, 1, 37, 87, 1, 38, 86, - 1, 39, 85, 1, 40, 85, 1, 40, 84, 1, 41, 83, 1, 42, 83, 1, 42, 82, 1, 43, 81, 1, 44, 81, 1, 44, 80, 1, - 45, 79, 1, 46, 79, 1, 46, 78, 1, 47, 77, 1, 48, 77, 1, 48, 76, 1, 49, 75, 1, 50, 75, 1, 50, 74, 1, 51, - 73, 1, 52, 73, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, - 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, - 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, - 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 33, 91, 1, 30, 94, 1, 29, 96, 1, 27, 97, 1, 27, 98, 1, 26, 98, - 1, 26, 99, 1, 25, 99, 1, 25, 99, 1, 25, 100, 1, 25, 100, 1, 25, 99, 1, 25, 99, 1, 25, 99, 1, 26, 99, - 1, 26, 98, 1, 27, 97, 1, 28, 96, 1, 30, 95, 1, 32, 92, 90, 37, 157, 1, 20, 104, 1, 20, 104, 1, 20, 104, - 1, 20, 104, 1, 20, 104, 1, 20, 104, 1, 20, 104, 1, 20, 104, 1, 20, 104, 1, 20, 104, 1, 20, 104, 1, 20, - 104, 1, 20, 104, 1, 20, 104, 1, 20, 104, 1, 20, 104, 1, 20, 104, 1, 20, 104, 1, 20, 104, 1, 20, 104, - 1, 20, 104, 2, 20, 41, 77, 103, 2, 20, 41, 77, 102, 2, 20, 41, 76, 102, 2, 20, 41, 75, 101, 2, 20, 41, - 74, 100, 2, 20, 41, 73, 99, 2, 20, 41, 73, 99, 2, 20, 41, 72, 98, 2, 20, 41, 71, 
97, 2, 20, 41, 70, 96, - 2, 20, 41, 70, 95, 2, 20, 41, 69, 95, 2, 20, 41, 68, 94, 2, 20, 41, 67, 93, 2, 20, 41, 66, 92, 2, 20, - 41, 66, 92, 2, 21, 41, 65, 91, 2, 21, 40, 64, 90, 2, 21, 40, 63, 89, 2, 21, 40, 63, 88, 2, 21, 40, 62, - 88, 2, 21, 40, 61, 87, 2, 22, 39, 60, 86, 2, 22, 39, 59, 85, 2, 23, 38, 59, 85, 2, 24, 37, 58, 84, 2, - 26, 35, 57, 83, 2, 28, 33, 56, 82, 1, 56, 81, 1, 55, 81, 1, 54, 80, 1, 53, 79, 1, 52, 78, 1, 52, 78, - 1, 51, 77, 1, 50, 76, 1, 49, 75, 1, 49, 74, 1, 48, 74, 1, 47, 73, 1, 46, 72, 1, 45, 71, 1, 45, 71, 1, - 44, 70, 1, 43, 69, 1, 42, 68, 1, 41, 67, 1, 41, 67, 1, 40, 66, 1, 39, 65, 2, 38, 64, 97, 100, 2, 38, - 64, 94, 103, 2, 37, 63, 92, 105, 2, 36, 62, 91, 106, 2, 35, 61, 90, 106, 2, 34, 60, 90, 107, 2, 34, 60, - 89, 107, 2, 33, 59, 89, 108, 2, 32, 58, 89, 108, 2, 31, 57, 89, 108, 2, 31, 56, 88, 108, 2, 30, 56, 88, - 108, 2, 29, 55, 88, 108, 2, 28, 54, 88, 109, 2, 27, 53, 88, 109, 2, 27, 53, 88, 109, 2, 26, 52, 88, 109, - 2, 25, 51, 88, 109, 2, 24, 50, 88, 109, 2, 24, 49, 88, 109, 2, 23, 49, 88, 109, 2, 22, 48, 88, 109, 2, - 21, 47, 88, 109, 2, 20, 46, 88, 109, 2, 20, 46, 88, 109, 2, 19, 45, 88, 109, 2, 18, 44, 88, 109, 2, 17, - 43, 88, 109, 2, 17, 42, 88, 109, 1, 16, 109, 1, 16, 109, 1, 16, 109, 1, 16, 108, 1, 16, 108, 1, 16, 108, - 1, 16, 108, 1, 16, 108, 1, 16, 108, 1, 16, 108, 1, 16, 108, 1, 16, 108, 1, 16, 108, 1, 16, 108, 1, 16, - 108, 1, 16, 108, 1, 16, 108, 1, 16, 108, 1, 16, 108, 1, 16, 108, 91, 29, 189, 1, 52, 88, 1, 52, 92, 1, - 52, 94, 1, 52, 95, 1, 52, 96, 1, 52, 97, 1, 52, 97, 1, 52, 97, 1, 52, 98, 1, 52, 98, 1, 52, 98, 1, 52, - 98, 1, 52, 98, 1, 52, 97, 1, 52, 97, 1, 52, 96, 1, 52, 96, 1, 52, 95, 1, 52, 93, 1, 52, 91, 1, 52, 72, - 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, - 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, - 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, - 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, - 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, - 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, - 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, - 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, - 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, - 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, - 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, - 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, - 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 91, 1, 52, 93, 1, - 52, 95, 1, 52, 96, 1, 52, 96, 1, 52, 97, 1, 52, 97, 1, 52, 98, 1, 52, 98, 1, 52, 98, 1, 52, 98, 1, 52, - 98, 1, 52, 97, 1, 52, 97, 1, 52, 97, 1, 52, 96, 1, 52, 95, 1, 52, 94, 1, 52, 93, 1, 52, 89, 92, 14, 180, - 1, 27, 31, 1, 23, 33, 1, 22, 35, 1, 21, 36, 1, 20, 36, 1, 19, 37, 1, 19, 38, 1, 19, 38, 1, 18, 39, 1, - 18, 39, 1, 18, 40, 1, 18, 40, 1, 19, 40, 1, 19, 41, 1, 19, 41, 1, 20, 42, 1, 20, 42, 1, 20, 43, 1, 21, - 43, 1, 21, 44, 1, 22, 44, 1, 22, 45, 1, 23, 45, 1, 23, 46, 1, 24, 
46, 1, 24, 47, 1, 25, 47, 1, 25, 47, - 1, 26, 48, 1, 26, 48, 1, 27, 49, 1, 27, 49, 1, 27, 50, 1, 28, 50, 1, 28, 51, 1, 29, 51, 1, 29, 52, 1, - 30, 52, 1, 30, 53, 1, 31, 53, 1, 31, 54, 1, 32, 54, 1, 32, 54, 1, 33, 55, 1, 33, 55, 1, 34, 56, 1, 34, - 56, 1, 34, 57, 1, 35, 57, 1, 35, 58, 1, 36, 58, 1, 36, 59, 1, 37, 59, 1, 37, 60, 1, 38, 60, 1, 38, 61, - 1, 39, 61, 1, 39, 61, 1, 40, 62, 1, 40, 62, 1, 40, 63, 1, 41, 63, 1, 41, 64, 1, 42, 64, 1, 42, 65, 1, - 43, 65, 1, 43, 66, 1, 44, 66, 1, 44, 67, 1, 45, 67, 1, 45, 68, 1, 46, 68, 1, 46, 68, 1, 47, 69, 1, 47, - 69, 1, 47, 70, 1, 48, 70, 1, 48, 71, 1, 49, 71, 1, 49, 72, 1, 50, 72, 1, 50, 73, 1, 51, 73, 1, 51, 74, - 1, 52, 74, 1, 52, 75, 1, 53, 75, 1, 53, 75, 1, 54, 76, 1, 54, 76, 1, 54, 77, 1, 55, 77, 1, 55, 78, 1, - 56, 78, 1, 56, 79, 1, 57, 79, 1, 57, 80, 1, 58, 80, 1, 58, 81, 1, 59, 81, 1, 59, 82, 1, 60, 82, 1, 60, - 82, 1, 61, 83, 1, 61, 83, 1, 61, 84, 1, 62, 84, 1, 62, 85, 1, 63, 85, 1, 63, 86, 1, 64, 86, 1, 64, 87, - 1, 65, 87, 1, 65, 88, 1, 66, 88, 1, 66, 89, 1, 67, 89, 1, 67, 89, 1, 67, 90, 1, 68, 90, 1, 68, 91, 1, - 69, 91, 1, 69, 92, 1, 70, 92, 1, 70, 93, 1, 71, 93, 1, 71, 94, 1, 72, 94, 1, 72, 95, 1, 73, 95, 1, 73, - 96, 1, 74, 96, 1, 74, 96, 1, 74, 97, 1, 75, 97, 1, 75, 98, 1, 76, 98, 1, 76, 99, 1, 77, 99, 1, 77, 100, - 1, 78, 100, 1, 78, 101, 1, 79, 101, 1, 79, 102, 1, 80, 102, 1, 80, 103, 1, 81, 103, 1, 81, 103, 1, 81, - 104, 1, 82, 104, 1, 82, 105, 1, 83, 105, 1, 83, 105, 1, 84, 106, 1, 84, 106, 1, 85, 106, 1, 85, 106, - 1, 86, 106, 1, 86, 106, 1, 87, 106, 1, 87, 106, 1, 88, 105, 1, 89, 104, 1, 90, 103, 1, 91, 102, 1, 93, - 99, 93, 29, 189, 1, 36, 72, 1, 32, 72, 1, 30, 72, 1, 29, 72, 1, 28, 72, 1, 28, 72, 1, 27, 72, 1, 27, - 72, 1, 26, 72, 1, 26, 72, 1, 26, 72, 1, 26, 72, 1, 26, 72, 1, 27, 72, 1, 27, 72, 1, 28, 72, 1, 28, 72, - 1, 29, 72, 1, 31, 72, 1, 33, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, - 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, - 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, - 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, - 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, - 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, - 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, - 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, - 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, - 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, - 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, - 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, - 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, - 52, 72, 1, 52, 72, 1, 33, 72, 1, 31, 72, 1, 29, 72, 1, 29, 72, 1, 28, 72, 1, 27, 72, 1, 27, 72, 1, 27, - 72, 1, 27, 72, 1, 26, 72, 1, 27, 72, 1, 27, 72, 1, 27, 72, 1, 27, 72, 1, 28, 72, 1, 28, 72, 1, 29, 72, - 1, 30, 72, 1, 32, 72, 1, 35, 72, 94, 24, 90, 1, 61, 63, 1, 60, 64, 1, 59, 65, 1, 58, 65, 1, 57, 66, 1, - 57, 67, 1, 56, 68, 1, 55, 69, 1, 54, 70, 1, 53, 70, 1, 52, 71, 
1, 52, 72, 1, 51, 73, 1, 50, 74, 1, 49, - 75, 1, 48, 75, 1, 47, 76, 1, 47, 77, 1, 46, 78, 1, 45, 79, 1, 44, 80, 1, 43, 80, 1, 42, 81, 1, 42, 82, - 1, 41, 83, 1, 40, 84, 1, 39, 85, 1, 38, 85, 1, 38, 86, 1, 37, 87, 1, 36, 88, 1, 35, 89, 2, 34, 61, 63, - 90, 2, 33, 60, 64, 90, 2, 33, 59, 65, 91, 2, 32, 58, 65, 92, 2, 31, 57, 66, 93, 2, 30, 57, 67, 94, 2, - 29, 56, 68, 95, 2, 28, 55, 69, 95, 2, 28, 54, 70, 96, 2, 27, 53, 70, 97, 2, 26, 52, 71, 98, 2, 25, 52, - 72, 99, 2, 24, 51, 73, 100, 2, 23, 50, 74, 100, 2, 23, 49, 75, 101, 2, 22, 48, 76, 102, 2, 21, 47, 76, - 103, 2, 20, 47, 77, 104, 2, 20, 46, 78, 104, 2, 19, 45, 79, 105, 2, 19, 44, 80, 106, 2, 18, 43, 81, 106, - 2, 18, 43, 81, 106, 2, 18, 42, 82, 106, 2, 18, 41, 83, 106, 2, 18, 40, 84, 106, 2, 18, 39, 85, 106, 2, - 18, 38, 86, 106, 2, 19, 37, 87, 105, 2, 20, 37, 87, 104, 2, 20, 36, 88, 104, 2, 21, 35, 89, 103, 2, 23, - 33, 91, 101, 2, 24, 32, 92, 100, 95, 198, 218, 1, 1, 124, 1, 0, 127, 1, 0, 128, 1, 0, 129, 1, 0, 130, - 1, 0, 131, 1, 0, 131, 1, 0, 131, 1, 0, 132, 1, 0, 132, 1, 0, 132, 1, 0, 132, 1, 0, 131, 1, 0, 131, 1, - 0, 131, 1, 0, 130, 1, 0, 129, 1, 0, 128, 1, 0, 127, 1, 1, 124, 96, 22, 56, 1, 47, 51, 1, 45, 53, 1, 44, - 55, 1, 44, 56, 1, 43, 57, 1, 43, 59, 1, 43, 60, 1, 43, 61, 1, 43, 62, 1, 44, 64, 1, 44, 65, 1, 45, 66, - 1, 46, 67, 1, 48, 68, 1, 49, 70, 1, 50, 71, 1, 51, 72, 1, 53, 73, 1, 54, 75, 1, 55, 76, 1, 56, 77, 1, - 58, 78, 1, 59, 79, 1, 60, 80, 1, 61, 81, 1, 62, 81, 1, 64, 81, 1, 65, 81, 1, 66, 81, 1, 67, 81, 1, 69, - 80, 1, 70, 80, 1, 72, 78, 1, 74, 76, 97, 65, 161, 1, 50, 71, 1, 44, 76, 1, 39, 80, 1, 35, 82, 1, 31, - 85, 1, 27, 86, 1, 25, 88, 1, 23, 90, 1, 22, 91, 1, 22, 92, 1, 21, 93, 1, 21, 94, 1, 20, 95, 1, 20, 95, - 1, 20, 96, 1, 20, 97, 1, 21, 97, 1, 21, 98, 1, 21, 98, 1, 21, 99, 2, 22, 55, 69, 99, 2, 23, 48, 74, 99, - 2, 24, 43, 76, 100, 2, 25, 38, 77, 100, 2, 28, 34, 78, 100, 1, 79, 100, 1, 79, 100, 1, 80, 100, 1, 80, - 100, 1, 80, 101, 1, 80, 101, 1, 80, 101, 1, 80, 101, 2, 54, 65, 80, 101, 2, 45, 75, 80, 101, 1, 41, 101, - 1, 37, 101, 1, 35, 101, 1, 32, 101, 1, 30, 101, 1, 28, 101, 1, 27, 101, 1, 26, 101, 1, 24, 101, 1, 23, - 101, 1, 22, 101, 1, 20, 101, 1, 19, 101, 1, 19, 101, 1, 18, 101, 1, 17, 101, 1, 16, 101, 1, 15, 101, - 1, 15, 101, 2, 14, 49, 71, 101, 2, 13, 45, 77, 101, 2, 13, 42, 80, 101, 2, 12, 40, 80, 101, 2, 12, 39, - 80, 101, 2, 12, 37, 80, 101, 2, 11, 36, 80, 101, 2, 11, 34, 80, 101, 2, 11, 33, 80, 101, 2, 10, 32, 80, - 101, 2, 10, 32, 80, 101, 2, 10, 31, 79, 101, 2, 10, 31, 77, 101, 2, 10, 31, 75, 101, 2, 10, 31, 74, 101, - 2, 10, 32, 71, 101, 2, 10, 32, 69, 101, 2, 11, 33, 66, 101, 2, 11, 35, 64, 109, 2, 11, 36, 60, 113, 2, - 11, 40, 56, 114, 1, 12, 115, 1, 12, 116, 1, 12, 117, 1, 13, 117, 1, 14, 117, 1, 14, 117, 1, 15, 118, - 1, 16, 118, 1, 17, 118, 1, 18, 117, 1, 19, 117, 1, 20, 117, 1, 21, 116, 2, 23, 78, 80, 115, 2, 24, 76, - 80, 114, 2, 25, 74, 80, 113, 2, 27, 71, 80, 111, 1, 30, 67, 1, 32, 64, 1, 37, 58, 1, 45, 49, 98, 29, - 160, 1, 7, 36, 1, 4, 36, 1, 3, 36, 1, 2, 36, 1, 1, 36, 1, 0, 36, 1, 0, 36, 1, 0, 36, 1, 0, 36, 1, 0, - 36, 1, 0, 36, 1, 0, 36, 1, 0, 36, 1, 0, 36, 1, 0, 36, 1, 1, 36, 1, 2, 36, 1, 2, 36, 1, 4, 36, 1, 6, 36, - 1, 16, 36, 1, 16, 36, 1, 16, 36, 1, 16, 36, 1, 16, 36, 1, 16, 36, 1, 16, 36, 1, 16, 36, 1, 16, 36, 1, - 16, 36, 1, 16, 36, 1, 16, 36, 1, 16, 36, 1, 16, 36, 1, 16, 36, 1, 16, 36, 2, 16, 36, 59, 76, 2, 16, 36, - 54, 81, 2, 16, 36, 51, 84, 2, 16, 36, 48, 87, 2, 16, 36, 46, 89, 2, 16, 36, 43, 91, 2, 16, 36, 42, 93, - 2, 16, 36, 40, 94, 2, 16, 36, 38, 96, 1, 16, 97, 
1, 16, 99, 1, 16, 100, 1, 16, 101, 1, 16, 102, 1, 16, - 103, 1, 16, 104, 1, 16, 105, 1, 16, 106, 1, 16, 107, 1, 16, 107, 2, 16, 61, 72, 108, 2, 16, 57, 76, 109, - 2, 16, 54, 79, 110, 2, 16, 53, 81, 110, 2, 16, 51, 83, 111, 2, 16, 49, 84, 111, 2, 16, 48, 86, 112, 2, - 16, 47, 87, 112, 2, 16, 46, 88, 113, 2, 16, 45, 89, 113, 2, 16, 44, 90, 114, 2, 16, 43, 91, 114, 2, 16, - 42, 92, 115, 2, 16, 41, 92, 115, 2, 16, 41, 93, 115, 2, 16, 40, 93, 116, 2, 16, 40, 94, 116, 2, 16, 39, - 95, 116, 2, 16, 39, 95, 116, 2, 16, 38, 95, 116, 2, 16, 38, 96, 117, 2, 16, 38, 96, 117, 2, 16, 37, 96, - 117, 2, 16, 37, 96, 117, 2, 16, 37, 97, 117, 2, 16, 37, 97, 117, 2, 16, 37, 97, 117, 2, 16, 37, 97, 117, - 2, 16, 37, 97, 117, 2, 16, 37, 97, 117, 2, 16, 37, 97, 117, 2, 16, 37, 97, 117, 2, 16, 37, 97, 117, 2, - 16, 37, 97, 117, 2, 16, 37, 97, 117, 2, 16, 37, 97, 117, 2, 16, 37, 96, 117, 2, 16, 38, 96, 117, 2, 16, - 38, 96, 117, 2, 16, 38, 95, 116, 2, 16, 39, 95, 116, 2, 16, 39, 94, 116, 2, 16, 40, 94, 116, 2, 16, 41, - 93, 115, 2, 16, 41, 92, 115, 2, 16, 42, 92, 115, 2, 16, 43, 91, 114, 2, 16, 44, 90, 114, 2, 16, 45, 89, - 113, 2, 16, 46, 87, 113, 2, 16, 48, 86, 112, 2, 16, 50, 84, 112, 2, 7, 52, 82, 111, 2, 4, 54, 79, 111, - 2, 3, 59, 75, 110, 1, 2, 109, 1, 1, 109, 1, 0, 108, 1, 0, 107, 1, 0, 106, 1, 0, 105, 1, 0, 104, 1, 0, - 103, 1, 0, 102, 1, 0, 101, 1, 0, 100, 1, 0, 98, 2, 1, 36, 37, 97, 2, 1, 36, 39, 95, 2, 2, 36, 41, 93, - 2, 4, 36, 43, 91, 2, 6, 36, 45, 89, 1, 48, 86, 1, 51, 83, 1, 56, 78, 99, 65, 161, 1, 55, 75, 1, 50, 81, - 1, 46, 84, 2, 43, 88, 97, 105, 2, 41, 90, 96, 107, 2, 39, 92, 94, 108, 1, 37, 109, 1, 35, 110, 1, 34, - 110, 1, 32, 110, 1, 31, 111, 1, 30, 111, 1, 29, 111, 1, 27, 111, 1, 26, 111, 1, 26, 111, 1, 25, 111, - 1, 24, 111, 1, 23, 111, 1, 22, 111, 2, 21, 58, 72, 111, 2, 21, 54, 77, 111, 2, 20, 51, 80, 111, 2, 19, - 49, 82, 111, 2, 19, 47, 84, 111, 2, 18, 45, 86, 111, 2, 18, 44, 87, 111, 2, 17, 43, 88, 111, 2, 17, 41, - 89, 111, 2, 16, 41, 89, 111, 2, 16, 40, 90, 111, 2, 16, 39, 90, 111, 2, 15, 38, 91, 111, 2, 15, 37, 91, - 111, 2, 15, 37, 91, 111, 2, 14, 36, 91, 110, 2, 14, 36, 92, 110, 2, 14, 35, 92, 110, 2, 14, 35, 93, 109, - 2, 13, 35, 93, 108, 2, 13, 34, 95, 107, 2, 13, 34, 96, 106, 2, 13, 34, 101, 102, 1, 13, 33, 1, 13, 33, - 1, 13, 33, 1, 13, 33, 1, 13, 33, 1, 13, 33, 1, 13, 33, 1, 13, 33, 1, 13, 33, 1, 13, 33, 1, 13, 33, 1, - 13, 33, 1, 13, 33, 1, 13, 34, 1, 13, 34, 1, 13, 34, 1, 13, 35, 1, 14, 35, 1, 14, 36, 1, 14, 36, 1, 14, - 37, 2, 15, 37, 102, 110, 2, 15, 38, 101, 112, 2, 15, 39, 100, 113, 2, 16, 40, 98, 114, 2, 16, 41, 97, - 115, 2, 17, 43, 96, 115, 2, 17, 44, 94, 116, 2, 18, 46, 92, 116, 2, 18, 49, 90, 116, 2, 19, 52, 87, 116, - 2, 19, 58, 81, 116, 1, 20, 116, 1, 21, 116, 1, 22, 116, 1, 22, 115, 1, 23, 115, 1, 24, 114, 1, 25, 113, - 1, 26, 112, 1, 28, 111, 1, 29, 110, 1, 30, 109, 1, 32, 107, 1, 33, 106, 1, 35, 104, 1, 37, 102, 1, 39, - 100, 1, 41, 97, 1, 44, 94, 1, 48, 90, 1, 53, 84, 1, 65, 74, 100, 29, 160, 1, 79, 109, 1, 76, 109, 1, - 75, 109, 1, 74, 109, 1, 73, 109, 1, 72, 109, 1, 72, 109, 1, 72, 109, 1, 71, 109, 1, 71, 109, 1, 71, 109, - 1, 71, 109, 1, 72, 109, 1, 72, 109, 1, 72, 109, 1, 73, 109, 1, 74, 109, 1, 75, 109, 1, 76, 109, 1, 79, - 109, 1, 88, 109, 1, 88, 109, 1, 88, 109, 1, 88, 109, 1, 88, 109, 1, 88, 109, 1, 88, 109, 1, 88, 109, - 1, 88, 109, 1, 88, 109, 1, 88, 109, 1, 88, 109, 1, 88, 109, 1, 88, 109, 1, 88, 109, 1, 88, 109, 2, 49, - 66, 88, 109, 2, 44, 71, 88, 109, 2, 41, 74, 88, 109, 2, 38, 77, 88, 109, 2, 36, 79, 88, 109, 2, 34, 81, - 88, 109, 2, 32, 83, 88, 
109, 2, 30, 85, 88, 109, 2, 29, 87, 88, 109, 1, 28, 109, 1, 26, 109, 1, 25, 109, - 1, 24, 109, 1, 23, 109, 1, 22, 109, 1, 21, 109, 1, 20, 109, 1, 19, 109, 1, 18, 109, 1, 18, 109, 2, 17, - 53, 63, 109, 2, 16, 49, 67, 109, 2, 15, 46, 70, 109, 2, 15, 44, 72, 109, 2, 14, 42, 74, 109, 2, 13, 40, - 76, 109, 2, 13, 39, 77, 109, 2, 12, 38, 78, 109, 2, 12, 37, 79, 109, 2, 12, 36, 80, 109, 2, 11, 35, 81, - 109, 2, 11, 34, 82, 109, 2, 10, 33, 83, 109, 2, 10, 33, 84, 109, 2, 10, 32, 84, 109, 2, 9, 31, 85, 109, - 2, 9, 31, 85, 109, 2, 9, 30, 86, 109, 2, 9, 30, 86, 109, 2, 8, 30, 87, 109, 2, 8, 29, 87, 109, 2, 8, - 29, 87, 109, 2, 8, 29, 87, 109, 2, 8, 28, 88, 109, 2, 8, 28, 88, 109, 2, 8, 28, 88, 109, 2, 8, 28, 88, - 109, 2, 8, 28, 88, 109, 2, 7, 28, 88, 109, 2, 7, 28, 88, 109, 2, 7, 28, 88, 109, 2, 8, 28, 88, 109, 2, - 8, 28, 88, 109, 2, 8, 28, 88, 109, 2, 8, 28, 88, 109, 2, 8, 28, 88, 109, 2, 8, 29, 87, 109, 2, 8, 29, - 87, 109, 2, 8, 29, 87, 109, 2, 8, 30, 87, 109, 2, 9, 30, 86, 109, 2, 9, 31, 85, 109, 2, 9, 31, 85, 109, - 2, 9, 32, 84, 109, 2, 10, 32, 84, 109, 2, 10, 33, 83, 109, 2, 11, 34, 82, 109, 2, 11, 35, 81, 109, 2, - 11, 36, 80, 109, 2, 12, 38, 78, 109, 2, 12, 39, 77, 109, 2, 13, 41, 75, 109, 2, 13, 43, 73, 117, 2, 14, - 45, 70, 121, 2, 15, 50, 66, 122, 1, 15, 123, 1, 16, 124, 1, 17, 125, 1, 18, 125, 1, 19, 125, 1, 20, 126, - 1, 21, 126, 1, 22, 126, 1, 23, 126, 1, 24, 125, 1, 25, 125, 1, 27, 125, 1, 28, 124, 2, 30, 86, 88, 123, - 2, 32, 84, 88, 122, 2, 33, 82, 88, 121, 2, 36, 79, 88, 119, 1, 39, 77, 1, 42, 74, 1, 46, 69, 101, 65, - 160, 1, 50, 70, 1, 46, 75, 1, 42, 78, 1, 40, 81, 1, 37, 84, 1, 35, 86, 1, 33, 88, 1, 31, 90, 1, 30, 91, - 1, 28, 93, 1, 27, 94, 1, 26, 96, 1, 24, 97, 1, 23, 98, 1, 22, 99, 1, 21, 100, 1, 20, 101, 1, 19, 102, - 1, 18, 102, 1, 17, 103, 2, 17, 54, 67, 104, 2, 16, 49, 71, 104, 2, 15, 46, 75, 105, 2, 14, 44, 77, 106, - 2, 14, 42, 79, 106, 2, 13, 41, 80, 107, 2, 13, 39, 82, 107, 2, 12, 38, 83, 108, 2, 12, 37, 84, 108, 2, - 11, 36, 85, 109, 2, 11, 35, 86, 109, 2, 11, 34, 87, 110, 2, 10, 34, 87, 110, 2, 10, 33, 88, 110, 2, 10, - 32, 89, 111, 2, 9, 32, 89, 111, 2, 9, 31, 90, 111, 1, 9, 112, 1, 9, 112, 1, 9, 112, 1, 8, 112, 1, 8, - 112, 1, 8, 113, 1, 8, 113, 1, 8, 113, 1, 8, 113, 1, 8, 113, 1, 8, 113, 1, 8, 113, 1, 8, 113, 1, 8, 113, - 1, 8, 113, 1, 8, 113, 1, 8, 113, 1, 8, 113, 1, 8, 113, 1, 9, 113, 1, 9, 113, 1, 9, 30, 1, 9, 31, 1, 9, - 31, 1, 10, 32, 1, 10, 32, 1, 10, 33, 1, 11, 34, 1, 11, 35, 1, 11, 36, 1, 12, 37, 2, 12, 38, 102, 104, - 2, 13, 40, 96, 108, 2, 13, 41, 92, 109, 2, 14, 43, 88, 110, 2, 15, 45, 84, 111, 2, 15, 49, 79, 112, 2, - 16, 53, 72, 112, 1, 17, 113, 1, 18, 113, 1, 18, 113, 1, 19, 113, 1, 20, 113, 1, 21, 113, 1, 22, 113, - 1, 23, 112, 1, 24, 111, 1, 26, 111, 1, 27, 110, 1, 28, 108, 1, 30, 106, 1, 32, 104, 1, 34, 101, 1, 36, - 98, 1, 38, 94, 1, 41, 89, 1, 44, 84, 1, 49, 77, 102, 29, 157, 1, 71, 91, 1, 66, 100, 1, 63, 106, 1, 60, - 110, 1, 58, 112, 1, 56, 114, 1, 55, 115, 1, 53, 116, 1, 52, 116, 1, 51, 117, 1, 50, 117, 1, 49, 117, - 1, 48, 117, 1, 47, 117, 1, 47, 117, 1, 46, 117, 1, 45, 117, 1, 45, 116, 1, 44, 116, 1, 44, 115, 2, 43, - 74, 85, 114, 2, 43, 69, 94, 113, 2, 43, 67, 101, 111, 1, 43, 65, 1, 42, 64, 1, 42, 64, 1, 42, 63, 1, - 42, 63, 1, 42, 62, 1, 42, 62, 1, 42, 62, 1, 42, 62, 1, 42, 62, 1, 42, 62, 1, 42, 62, 1, 42, 62, 1, 42, - 62, 1, 42, 62, 1, 30, 95, 1, 25, 101, 1, 23, 103, 1, 21, 104, 1, 20, 105, 1, 20, 105, 1, 19, 106, 1, - 19, 106, 1, 19, 107, 1, 18, 107, 1, 18, 107, 1, 18, 107, 1, 19, 107, 1, 19, 106, 1, 19, 106, 1, 20, 105, - 1, 20, 
105, 1, 21, 104, 1, 22, 103, 1, 24, 101, 1, 29, 97, 1, 42, 62, 1, 42, 62, 1, 42, 62, 1, 42, 62, - 1, 42, 62, 1, 42, 62, 1, 42, 62, 1, 42, 62, 1, 42, 62, 1, 42, 62, 1, 42, 62, 1, 42, 62, 1, 42, 62, 1, - 42, 62, 1, 42, 62, 1, 42, 62, 1, 42, 62, 1, 42, 62, 1, 42, 62, 1, 42, 62, 1, 42, 62, 1, 42, 62, 1, 42, - 62, 1, 42, 62, 1, 42, 62, 1, 42, 62, 1, 42, 62, 1, 42, 62, 1, 42, 62, 1, 42, 62, 1, 42, 62, 1, 42, 62, - 1, 42, 62, 1, 42, 62, 1, 42, 62, 1, 42, 62, 1, 42, 62, 1, 42, 62, 1, 42, 62, 1, 42, 62, 1, 42, 62, 1, - 42, 62, 1, 42, 62, 1, 42, 62, 1, 42, 62, 1, 42, 62, 1, 42, 62, 1, 42, 62, 1, 42, 62, 1, 24, 96, 1, 21, - 99, 1, 20, 101, 1, 18, 102, 1, 18, 103, 1, 17, 103, 1, 17, 104, 1, 16, 104, 1, 16, 104, 1, 16, 104, 1, - 16, 104, 1, 16, 104, 1, 16, 104, 1, 17, 104, 1, 17, 103, 1, 18, 103, 1, 18, 102, 1, 19, 101, 1, 21, 100, - 1, 23, 97, 103, 64, 200, 1, 55, 57, 1, 46, 66, 1, 41, 70, 2, 38, 73, 84, 110, 2, 36, 76, 84, 115, 2, - 34, 78, 84, 117, 2, 32, 80, 84, 119, 2, 30, 81, 84, 120, 2, 29, 83, 84, 120, 1, 27, 121, 1, 26, 121, - 1, 25, 122, 1, 24, 122, 1, 23, 122, 1, 22, 122, 1, 21, 122, 1, 20, 121, 1, 19, 121, 1, 18, 120, 1, 17, - 120, 1, 16, 119, 2, 16, 50, 63, 118, 2, 15, 46, 66, 116, 2, 14, 44, 68, 111, 2, 14, 42, 70, 105, 2, 13, - 40, 72, 105, 2, 13, 39, 73, 105, 2, 12, 38, 74, 105, 2, 12, 37, 76, 105, 2, 11, 36, 77, 105, 2, 11, 35, - 77, 105, 2, 11, 34, 78, 105, 2, 10, 33, 79, 105, 2, 10, 33, 80, 105, 2, 10, 32, 80, 105, 2, 9, 31, 81, - 105, 2, 9, 31, 81, 105, 2, 9, 30, 82, 105, 2, 9, 30, 82, 105, 2, 9, 30, 83, 105, 2, 8, 29, 83, 105, 2, - 8, 29, 83, 105, 2, 8, 29, 84, 105, 2, 8, 29, 84, 105, 2, 8, 28, 84, 105, 2, 8, 28, 84, 105, 2, 8, 28, - 84, 105, 2, 8, 28, 84, 105, 2, 8, 28, 84, 105, 2, 8, 28, 84, 105, 2, 8, 28, 84, 105, 2, 8, 28, 84, 105, - 2, 8, 28, 84, 105, 2, 8, 29, 84, 105, 2, 8, 29, 84, 105, 2, 8, 29, 83, 105, 2, 8, 29, 83, 105, 2, 9, - 30, 83, 105, 2, 9, 30, 82, 105, 2, 9, 31, 82, 105, 2, 9, 31, 81, 105, 2, 9, 32, 81, 105, 2, 10, 32, 80, - 105, 2, 10, 33, 80, 105, 2, 10, 34, 79, 105, 2, 11, 34, 78, 105, 2, 11, 35, 77, 105, 2, 12, 36, 76, 105, - 2, 12, 37, 75, 105, 2, 12, 38, 74, 105, 2, 13, 40, 73, 105, 2, 13, 41, 71, 105, 2, 14, 43, 69, 105, 2, - 15, 45, 68, 105, 2, 15, 48, 65, 105, 2, 16, 51, 61, 105, 1, 17, 105, 1, 17, 105, 1, 18, 105, 1, 19, 105, - 1, 20, 105, 1, 21, 105, 1, 22, 105, 1, 23, 105, 1, 24, 105, 1, 25, 105, 1, 26, 105, 1, 28, 105, 2, 29, - 83, 84, 105, 2, 31, 81, 84, 105, 2, 32, 79, 84, 105, 2, 34, 77, 84, 105, 2, 37, 75, 84, 105, 2, 39, 72, - 84, 105, 2, 42, 69, 84, 105, 2, 47, 65, 84, 105, 1, 84, 105, 1, 84, 105, 1, 84, 105, 1, 84, 105, 1, 84, - 105, 1, 84, 104, 1, 84, 104, 1, 83, 104, 1, 83, 104, 1, 83, 104, 1, 82, 104, 1, 82, 104, 1, 81, 103, - 1, 80, 103, 1, 79, 103, 1, 78, 102, 1, 76, 102, 1, 74, 101, 1, 71, 101, 1, 39, 101, 1, 35, 100, 1, 33, - 99, 1, 32, 99, 1, 31, 98, 1, 30, 97, 1, 30, 96, 1, 29, 95, 1, 29, 95, 1, 29, 93, 1, 29, 92, 1, 29, 91, - 1, 29, 90, 1, 29, 88, 1, 30, 87, 1, 30, 85, 1, 31, 83, 1, 32, 81, 1, 33, 78, 1, 35, 75, 1, 40, 67, 104, - 29, 157, 1, 12, 41, 1, 9, 41, 1, 7, 41, 1, 6, 41, 1, 5, 41, 1, 5, 41, 1, 4, 41, 1, 4, 41, 1, 4, 41, 1, - 4, 41, 1, 4, 41, 1, 4, 41, 1, 4, 41, 1, 4, 41, 1, 5, 41, 1, 5, 41, 1, 6, 41, 1, 7, 41, 1, 9, 41, 1, 11, - 41, 1, 21, 41, 1, 21, 41, 1, 21, 41, 1, 21, 41, 1, 21, 41, 1, 21, 41, 1, 21, 41, 1, 21, 41, 1, 21, 41, - 1, 21, 41, 1, 21, 41, 1, 21, 41, 1, 21, 41, 1, 21, 41, 1, 21, 41, 1, 21, 41, 2, 21, 41, 59, 76, 2, 21, - 41, 54, 81, 2, 21, 41, 51, 84, 2, 21, 41, 49, 86, 2, 21, 41, 46, 88, 2, 21, 41, 44, 90, 2, 
21, 41, 42, - 91, 1, 21, 93, 1, 21, 94, 1, 21, 95, 1, 21, 96, 1, 21, 97, 1, 21, 98, 1, 21, 99, 1, 21, 99, 1, 21, 100, - 1, 21, 101, 1, 21, 101, 1, 21, 102, 1, 21, 102, 2, 21, 62, 72, 102, 2, 21, 58, 76, 103, 2, 21, 56, 78, - 103, 2, 21, 54, 80, 104, 2, 21, 52, 81, 104, 2, 21, 51, 82, 104, 2, 21, 49, 83, 104, 2, 21, 48, 83, 104, - 2, 21, 47, 83, 105, 2, 21, 46, 84, 105, 2, 21, 45, 84, 105, 2, 21, 44, 84, 105, 2, 21, 43, 84, 105, 2, - 21, 42, 84, 105, 2, 21, 41, 84, 105, 2, 21, 41, 84, 105, 2, 21, 41, 84, 105, 2, 21, 41, 84, 105, 2, 21, - 41, 84, 105, 2, 21, 41, 84, 105, 2, 21, 41, 84, 105, 2, 21, 41, 84, 105, 2, 21, 41, 84, 105, 2, 21, 41, - 84, 105, 2, 21, 41, 84, 105, 2, 21, 41, 84, 105, 2, 21, 41, 84, 105, 2, 21, 41, 84, 105, 2, 21, 41, 84, - 105, 2, 21, 41, 84, 105, 2, 21, 41, 84, 105, 2, 21, 41, 84, 105, 2, 21, 41, 84, 105, 2, 21, 41, 84, 105, - 2, 21, 41, 84, 105, 2, 21, 41, 84, 105, 2, 21, 41, 84, 105, 2, 21, 41, 84, 105, 2, 21, 41, 84, 105, 2, - 21, 41, 84, 105, 2, 21, 41, 84, 105, 2, 21, 41, 84, 105, 2, 21, 41, 84, 105, 2, 21, 41, 84, 105, 2, 21, - 41, 84, 105, 2, 21, 41, 84, 105, 2, 21, 41, 84, 105, 2, 21, 41, 84, 105, 2, 21, 41, 84, 105, 2, 21, 41, - 84, 105, 2, 21, 41, 84, 105, 2, 21, 41, 84, 105, 2, 13, 48, 77, 112, 2, 11, 51, 74, 115, 2, 9, 53, 73, - 116, 2, 8, 54, 72, 117, 2, 8, 54, 71, 118, 2, 7, 55, 71, 119, 2, 7, 55, 70, 119, 2, 6, 56, 70, 119, 2, - 6, 56, 70, 119, 2, 6, 56, 70, 120, 2, 6, 56, 70, 120, 2, 6, 56, 70, 120, 2, 6, 56, 70, 119, 2, 6, 56, - 70, 119, 2, 7, 55, 71, 119, 2, 7, 55, 71, 118, 2, 8, 54, 72, 117, 2, 9, 53, 73, 116, 2, 11, 51, 74, 115, - 2, 13, 49, 77, 113, 105, 29, 157, 1, 45, 69, 1, 45, 69, 1, 45, 69, 1, 45, 69, 1, 45, 69, 1, 45, 69, 1, - 45, 69, 1, 45, 69, 1, 45, 69, 1, 45, 69, 1, 45, 69, 1, 45, 69, 1, 45, 69, 1, 45, 69, 1, 45, 69, 1, 45, - 69, 1, 45, 69, 1, 45, 69, 1, 45, 69, 1, 45, 69, 1, 45, 69, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 1, 33, 72, 1, 28, 72, 1, 26, 72, 1, 25, 72, 1, 24, 72, 1, 23, 72, 1, 23, 72, 1, 22, 72, 1, 22, - 72, 1, 22, 72, 1, 22, 72, 1, 22, 72, 1, 22, 72, 1, 22, 72, 1, 23, 72, 1, 23, 72, 1, 24, 72, 1, 25, 72, - 1, 26, 72, 1, 28, 72, 1, 32, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, - 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, - 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, - 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, - 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, - 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 52, 72, 1, 22, 102, 1, 19, 105, 1, 18, 106, 1, 16, - 108, 1, 16, 108, 1, 15, 109, 1, 15, 109, 1, 14, 110, 1, 14, 110, 1, 14, 110, 1, 14, 110, 1, 14, 110, - 1, 14, 110, 1, 15, 109, 1, 15, 109, 1, 15, 109, 1, 16, 108, 1, 17, 107, 1, 19, 105, 1, 21, 103, 106, - 29, 200, 1, 58, 82, 1, 58, 82, 1, 58, 82, 1, 58, 82, 1, 58, 82, 1, 58, 82, 1, 58, 82, 1, 58, 82, 1, 58, - 82, 1, 58, 82, 1, 58, 82, 1, 58, 82, 1, 58, 82, 1, 58, 82, 1, 58, 82, 1, 58, 82, 1, 58, 82, 1, 58, 82, - 1, 58, 82, 1, 58, 82, 1, 58, 82, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 31, 94, 1, 25, - 94, 1, 23, 94, 1, 22, 94, 1, 21, 94, 1, 21, 94, 1, 20, 94, 1, 20, 94, 1, 19, 94, 1, 19, 94, 1, 19, 94, - 1, 19, 94, 1, 19, 94, 1, 20, 94, 1, 20, 94, 1, 21, 94, 1, 21, 94, 1, 22, 94, 1, 23, 94, 1, 25, 94, 1, - 29, 94, 1, 74, 94, 1, 74, 94, 1, 74, 94, 1, 74, 94, 1, 74, 94, 1, 74, 94, 
-            [several thousand small integers elided -- embedded font glyph bitmap data; each
-             glyph appears to be encoded as a header (character code, top row, bottom row)
-             followed, for every row, by a span count and the begin/end column of each
-             horizontal span]
-        };
-
-        return Load(data, sizeof(data));
-    }
-    /// \cond DO_NOT_DOCUMENT
-#ifdef TEST_GENERATE_FONT
-    friend Test::GenerateFont;
-#endif
-    /// \endcond
-    };
-}
-
-#endif//__SimdFont_hpp__
diff --git a/src/3rd/Simd/Simd/SimdFrame.hpp b/src/3rd/Simd/Simd/SimdFrame.hpp
deleted file mode 100644
index c233064e..00000000
--- a/src/3rd/Simd/Simd/SimdFrame.hpp
+++ /dev/null
@@ -1,882 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2020 Yermalayeu Ihar,
-*               2014-2019 Antonenka Mikhail,
-*               2019-2019 Artur Voronkov.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#ifndef __SimdFrame_hpp__
-#define __SimdFrame_hpp__
-
-#include "Simd/SimdLib.hpp"
-
-namespace Simd
-{
-    /*! @ingroup cpp_frame
-
-        \short The Frame structure provides storage and manipulation of frames (multiplanar images).
-
-        \ref cpp_frame_functions.
-    */
-    template <template<class> class A>
-    struct Frame
-    {
-        typedef A<uint8_t> Allocator; /*!< Allocator type definition. */
-
-        /*! Maximal count of pixel planes in a frame. */
-        static const size_t PLANE_COUNT_MAX = 4;
-
-        /*!
-            \enum Format
-            Describes pixel format types of a frame.
-        */
-        enum Format
-        {
-            /*! An undefined pixel format. */
-            None = 0,
-            /*! Two planes (8-bit full size Y plane, 16-bit interleaved half size UV plane) NV12 pixel format. */
-            Nv12,
-            /*! Three planes (8-bit full size Y plane, 8-bit half size U plane, 8-bit half size V plane) YUV420P pixel format. */
-            Yuv420p,
-            /*! One plane 32-bit (4 8-bit channels) BGRA (Blue, Green, Red, Alpha) pixel format. */
-            Bgra32,
-            /*! One plane 24-bit (3 8-bit channels) BGR (Blue, Green, Red) pixel format. */
-            Bgr24,
-            /*! One plane 8-bit gray pixel format. */
-            Gray8,
-            /*! One plane 24-bit (3 8-bit channels) RGB (Red, Green, Blue) pixel format. */
-            Rgb24,
-        };
-
-        const size_t width;  /*!< \brief A width of the frame. */
-        const size_t height; /*!< \brief A height of the frame. */
-        const Format format; /*!< \brief A pixel format type of the frame. */
-        View<A> planes[PLANE_COUNT_MAX]; /*!< \brief Planes of the frame. */
-        bool flipped; /*!< \brief A flag of a vertically flipped image (false - frame point (0, 0) is placed at the top left corner of the frame, true - frame point (0, 0) is placed at the bottom left corner of the frame). */
-        double timestamp; /*!< \brief A timestamp of the frame. */
-
-        /*!
-            Creates a new empty Frame structure.
-        */
-        Frame();
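To make the API of this (now removed) header concrete, here is a minimal usage sketch; it is not part of the original file and assumes the library's default Simd::Allocator and illustrative 1920x1080 NV12 buffers. The external-buffer wrapping constructor it uses is declared a few lines below.

    #include "Simd/SimdFrame.hpp"
    #include <vector>

    int main()
    {
        typedef Simd::Frame<Simd::Allocator> Frame;

        // Owning frame: allocates a full-size 8-bit Y plane and a
        // half-size interleaved UV plane (width and height must be even).
        Frame owned(1920, 1080, Frame::Nv12);

        // Non-owning frame: wraps externally allocated NV12 buffers,
        // e.g. the output of a video decoder. Strides are in bytes; the
        // UV plane has width/2 pixels of 2 bytes each, so its row size
        // equals the luma width here.
        std::vector<uint8_t> y(1920 * 1080), uv(1920 * 1080 / 2);
        Frame wrapped(1920, 1080, Frame::Nv12,
                      y.data(), 1920,   // plane 0: full-size Y
                      uv.data(), 1920,  // plane 1: half-size interleaved UV
                      NULL, 0);         // plane 2: unused for NV12
        return 0;
    }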
-        /*!
-            Creates a new Frame structure on the base of the other frame.
-
-            \note This constructor does not create a new frame! It only creates a reference to the same frame. If you want to create a copy, then you must use the method Simd::Frame::Clone.
-
-            \param [in] frame - an original frame.
-        */
-        Frame(const Frame & frame);
-
-        /*!
-            Creates a new one-plane Frame structure on the base of the image view.
-
-            \note This constructor does not create a new image frame! It only creates a reference to the same image. If you want to create a copy, then you must use the method Simd::Frame::Clone.
-
-            \param [in] view - an original image view.
-            \param [in] flipped_ - a flag of a vertically flipped image of the created frame. It is equal to false by default.
-            \param [in] timestamp_ - a timestamp of the created frame. It is equal to 0 by default.
-        */
-        Frame(const View<A> & view, bool flipped_ = false, double timestamp_ = 0);
-
-        /*!
-            Creates a new Frame structure with the specified width, height and pixel format.
-
-            \param [in] width_ - a width of the created frame.
-            \param [in] height_ - a height of the created frame.
-            \param [in] format_ - a pixel format of the created frame.
-            \param [in] flipped_ - a flag of a vertically flipped image of the created frame. It is equal to false by default.
-            \param [in] timestamp_ - a timestamp of the created frame. It is equal to 0 by default.
-        */
-        Frame(size_t width_, size_t height_, Format format_, bool flipped_ = false, double timestamp_ = 0);
-
-        /*!
-            Creates a new Frame structure with the specified size and pixel format.
-
-            \param [in] size - a size (width and height) of the created frame.
-            \param [in] format_ - a pixel format of the created frame.
-            \param [in] flipped_ - a flag of a vertically flipped image of the created frame. It is equal to false by default.
-            \param [in] timestamp_ - a timestamp of the created frame. It is equal to 0 by default.
-        */
-        Frame(const Point<ptrdiff_t> & size, Format format_, bool flipped_ = false, double timestamp_ = 0);
-
-        /*!
-            Creates a new Frame structure with the specified width, height and pixel format around external buffers.
-
-            \param [in] width_ - a width of the created frame.
-            \param [in] height_ - a height of the created frame.
-            \param [in] format_ - a pixel format of the created frame.
-            \param [in] data0 - a pointer to the pixel data of the first image plane.
-            \param [in] stride0 - a row size of the first image plane.
-            \param [in] data1 - a pointer to the pixel data of the second image plane.
-            \param [in] stride1 - a row size of the second image plane.
-            \param [in] data2 - a pointer to the pixel data of the third image plane.
-            \param [in] stride2 - a row size of the third image plane.
-            \param [in] flipped_ - a flag of a vertically flipped image of the created frame. It is equal to false by default.
-            \param [in] timestamp_ - a timestamp of the created frame. It is equal to 0 by default.
-        */
-        Frame(size_t width_, size_t height_, Format format_, uint8_t * data0, size_t stride0,
-            uint8_t * data1, size_t stride1, uint8_t * data2, size_t stride2, bool flipped_ = false, double timestamp_ = 0);
-
-        /*!
-            A Frame destructor.
-        */
-        ~Frame();
-
-        /*!
-            Gets a copy of the current frame.
-
-            \return a pointer to the new Frame structure. The user must free this pointer after usage.
-        */
-        Frame * Clone() const;
-
-        /*!
-            Gets a copy of the current frame using the given buffer as storage.
-
-            \param [in, out] buffer - an external frame used as a buffer.
-            \return a pointer to the new Frame structure (it does not own the pixel data). The user must free this pointer after usage.
-        */
-        Frame * Clone(Frame & buffer) const;
-
-        /*!
-            Creates a reference to another Frame structure.
-
-            \note This operator does not create a copy of the frame! It only creates a reference to the same frame.
-
-            \param [in] frame - an original frame.
-            \return a reference to itself.
-        */
-        Frame & operator = (const Frame & frame);
-
-        /*!
-            Creates a reference to itself.
- - \return a reference to itself. - */ - Frame & Ref(); - - /*! - Re-creates a Frame structure with specified width, height and pixel format. - - \param [in] width_ - a width of re-created frame. - \param [in] height_ - a height of re-created frame. - \param [in] format_ - a pixel format of re-created frame. - */ - void Recreate(size_t width_, size_t height_, Format format_); - - /*! - Re-creates a Frame structure with specified width, height and pixel format. - - \param [in] size - a size (width and height) of re-created frame. - \param [in] format_ - a pixel format of re-created frame. - */ - void Recreate(const Point & size, Format format_); - - /*! - Creates a new Frame structure which points to the region of current frame bounded by the rectangle with specified coordinates. - - \param [in] left - a left side of the region. - \param [in] top - a top side of the region. - \param [in] right - a right side of the region. - \param [in] bottom - a bottom side of the region. - \return - a new Frame structure which points to the region of frame. - */ - Frame Region(const ptrdiff_t & left, const ptrdiff_t & top, const ptrdiff_t & right, const ptrdiff_t & bottom) const; - - /*! - Creates a new Frame structure which points to the region of current frame bounded by the rectangle with specified coordinates. - - \param [in, out] left - a left side of the required region. Returns the left side of the actual region. - \param [in, out] top - a top side of the required region. Returns the top side of the actual region. - \param [in, out] right - a right side of the required region. Returns the right side of the actual region. - \param [in, out] bottom - a bottom side of the required region. Returns the bottom side of the actual region. - \return - a new Frame structure which points to the region of frame. - */ - Frame Region(ptrdiff_t & left, ptrdiff_t & top, ptrdiff_t & right, ptrdiff_t & bottom) const; - - /*! - Creates a new Frame structure which points to the region of frame bounded by the rectangle with specified coordinates. - - \param [in] topLeft - a top-left corner of the region. - \param [in] bottomRight - a bottom-right corner of the region. - \return - a new Frame structure which points to the region of frame. - */ - Frame Region(const Point & topLeft, const Point & bottomRight) const; - - /*! - Creates a new Frame structure which points to the region of frame bounded by the rectangle with specified coordinates. - - \param [in, out] topLeft - a top-left corner of the required region. Returns the top-left corner of the actual region. - \param [in, out] bottomRight - a bottom-right corner of the required region. Returns the bottom-right corner of the actual region. - \return - a new Frame structure which points to the region of frame. - */ - Frame Region(Point & topLeft, Point & bottomRight) const; - - /*! - Creates a new Frame structure which points to the region of frame bounded by the rectangle with specified coordinates. - - \param [in] rect - a rectangle which bound the region. - \return - a new Frame structure which points to the region of frame. - */ - Frame Region(const Rectangle & rect) const; - - /*! - Creates a new Frame structure which points to the region of frame bounded by the rectangle with specified coordinates. - - \param [in, out] rect - a rectangle which bound the required region. Returns the actual region. - \return - a new Frame structure which points to the region of frame. - */ - Frame Region(Rectangle & rect) const; - - /*! 
-            Creates a new Frame structure which points to the vertically flipped frame.
-
-            \return - a new Frame structure which points to the flipped frame.
-        */
-        Frame Flipped() const;
-
-        /*!
-            Gets the size (width and height) of the frame.
-
-            \return - a new Point structure with the frame width and height.
-        */
-        Point<ptrdiff_t> Size() const;
-
-        /*!
-            Gets the size in bytes required to store the pixel data of the current Frame structure.
-
-            \return - a size of the pixel data in bytes.
-        */
-        size_t DataSize() const;
-
-        /*!
-            Gets the area in pixels of the current Frame structure.
-
-            \return - an area of the current Frame in pixels.
-        */
-        size_t Area() const;
-
-        /*!
-            \fn size_t PlaneCount(Format format);
-
-            Gets the number of planes in the frame for the given pixel format.
-
-            \param [in] format - a pixel format.
-            \return - a number of planes.
-        */
-        static size_t PlaneCount(Format format);
-
-        /*!
-            Gets the number of planes for the current frame.
-
-            \return - a number of planes.
-        */
-        size_t PlaneCount() const;
-    };
-
-    /*! @ingroup cpp_frame_functions
-
-        \fn template <template<class> class A, template<class> class B> bool EqualSize(const Frame<A> & a, const Frame<B> & b);
-
-        Checks whether two frames have the same size.
-
-        \param [in] a - a first frame.
-        \param [in] b - a second frame.
-        \return - a result of checking.
-    */
-    template <template<class> class A, template<class> class B> bool EqualSize(const Frame<A> & a, const Frame<B> & b);
-
-    /*! @ingroup cpp_frame_functions
-
-        \fn template <template<class> class A, template<class> class B> bool Compatible(const Frame<A> & a, const Frame<B> & b);
-
-        Checks whether two frames are compatible (the frames must have the same size and pixel format).
-
-        \param [in] a - a first frame.
-        \param [in] b - a second frame.
-        \return - a result of checking.
-    */
-    template <template<class> class A, template<class> class B> bool Compatible(const Frame<A> & a, const Frame<B> & b);
-
-    /*! @ingroup cpp_frame_functions
-
-        \fn template <template<class> class A, template<class> class B> void Copy(const Frame<A> & src, Frame<B> & dst);
-
-        \short Copies one frame to another frame.
-
-        The frames must have the same width, height and format.
-
-        \param [in] src - an input frame.
-        \param [out] dst - an output frame.
-    */
-    template <template<class> class A, template<class> class B> void Copy(const Frame<A> & src, Frame<B> & dst);
-
-    /*! @ingroup cpp_frame_functions
-
-        \fn template <template<class> class A> void Convert(const Frame<A> & src, Frame<A> & dst);
-
-        \short Converts one frame to another frame.
-
-        The frames must have the same width and height.
-
-        \param [in] src - an input frame.
-        \param [out] dst - an output frame.
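A brief usage sketch for the copy and conversion helpers documented here (again not part of the original header, and again assuming the default Simd::Allocator; the chroma-subsampled formats require even frame dimensions):

    #include "Simd/SimdFrame.hpp"

    int main()
    {
        typedef Simd::Frame<Simd::Allocator> Frame;

        Frame bgra(1280, 720, Frame::Bgra32);    // source
        Frame yuv(bgra.Size(), Frame::Yuv420p);  // destination, same size

        Simd::Convert(bgra, yuv);  // dispatches on the (src.format, dst.format) pair

        Frame copy(yuv.Size(), Frame::Yuv420p);
        Simd::Copy(yuv, copy);     // requires Compatible(): same size, format and flip flag
        return 0;
    }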
- */ - template class A> void Convert(const Frame & src, Frame & dst); - - //------------------------------------------------------------------------- - - // struct Frame implementation: - - template class A> SIMD_INLINE Frame::Frame() - : width(0) - , height(0) - , format(None) - , flipped(false) - , timestamp(0) - { - } - - template class A> SIMD_INLINE Frame::Frame(const Frame & frame) - : width(frame.width) - , height(frame.height) - , format(frame.format) - , flipped(frame.flipped) - , timestamp(frame.timestamp) - { - for (size_t i = 0, n = PlaneCount(); i < n; ++i) - planes[i] = frame.planes[i]; - } - - template class A> SIMD_INLINE Frame::Frame(const View & view, bool flipped_, double timestamp_) - : width(view.width) - , height(view.height) - , format(None) - , flipped(flipped_) - , timestamp(timestamp_) - { - switch (view.format) - { - case View::Gray8: (Format&)format = Gray8; break; - case View::Bgr24: (Format&)format = Bgr24; break; - case View::Bgra32: (Format&)format = Bgra32; break; - case View::Rgb24: (Format&)format = Rgb24; break; - default: - assert(0); - } - planes[0] = view; - } - - template class A> SIMD_INLINE Frame::Frame(size_t width_, size_t height_, Format format_, bool flipped_, double timestamp_) - : width(0) - , height(0) - , format(None) - , flipped(flipped_) - , timestamp(timestamp_) - { - Recreate(width_, height_, format_); - } - - template class A> SIMD_INLINE Frame::Frame(const Point & size, Format format_, bool flipped_, double timestamp_) - : width(0) - , height(0) - , format(None) - , flipped(flipped_) - , timestamp(timestamp_) - { - Recreate(size, format_); - } - - template class A> SIMD_INLINE Frame::Frame(size_t width_, size_t height_, Format format_, uint8_t * data0, size_t stride0, - uint8_t * data1, size_t stride1, uint8_t * data2, size_t stride2, bool flipped_, double timestamp_) - : width(width_) - , height(height_) - , format(format_) - , flipped(flipped_) - , timestamp(timestamp_) - { - switch (format) - { - case None: - break; - case Nv12: - assert((width & 1) == 0 && (height & 1) == 0); - planes[0] = View(width, height, stride0, View::Gray8, data0); - planes[1] = View(width / 2, height / 2, stride1, View::Uv16, data1); - break; - case Yuv420p: - assert((width & 1) == 0 && (height & 1) == 0); - planes[0] = View(width, height, stride0, View::Gray8, data0); - planes[1] = View(width / 2, height / 2, stride1, View::Gray8, data1); - planes[2] = View(width / 2, height / 2, stride2, View::Gray8, data2); - break; - case Bgra32: - planes[0] = View(width, height, stride0, View::Bgra32, data0); - break; - case Bgr24: - planes[0] = View(width, height, stride0, View::Bgr24, data0); - break; - case Gray8: - planes[0] = View(width, height, stride0, View::Gray8, data0); - break; - case Rgb24: - planes[0] = View(width, height, stride0, View::Rgb24, data0); - break; - default: - assert(0); - } - } - - template class A> SIMD_INLINE Frame::~Frame() - { - } - - template class A> SIMD_INLINE Frame * Frame::Clone() const - { - Frame * clone = new Frame(width, height, format, flipped, timestamp); - Copy(*this, *clone); - return clone; - } - - /*! 
\cond */ - template class A> SIMD_INLINE Frame * Frame::Clone(Frame & buffer) const - { - for (size_t i = 0; i < PlaneCount(); ++i) - { - if (buffer.planes[i].width < planes[i].width || buffer.planes[i].height < planes[i].height) - buffer.planes[i].Recreate(planes[i].Size(), planes[i].format); - } - Frame * clone = new Frame(width, height, format, - buffer.planes[0].data, buffer.planes[0].stride, - buffer.planes[1].data, buffer.planes[1].stride, - buffer.planes[2].data, buffer.planes[2].stride, - flipped, timestamp); - Copy(*this, *clone); - return clone; - } - /*! \endcond */ - - template class A> SIMD_INLINE Frame & Frame::operator = (const Frame & frame) - { - if (this != &frame) - { - *(size_t*)&width = frame.width; - *(size_t*)&height = frame.height; - *(Format*)&format = frame.format; - flipped = frame.flipped; - timestamp = frame.timestamp; - for (size_t i = 0, n = PlaneCount(); i < n; ++i) - planes[i] = frame.planes[i]; - } - return *this; - } - - template class A> SIMD_INLINE Frame & Frame::Ref() - { - return *this; - } - - template class A> SIMD_INLINE void Frame::Recreate(size_t width_, size_t height_, Format format_) - { - *(size_t*)&width = width_; - *(size_t*)&height = height_; - *(Format*)&format = format_; - - for (size_t i = 0; i < PLANE_COUNT_MAX; ++i) - planes[i].Recreate(0, 0, View::None); - - switch (format) - { - case None: - break; - case Nv12: - assert((width & 1) == 0 && (height & 1) == 0); - planes[0].Recreate(width, height, View::Gray8); - planes[1].Recreate(width / 2, height / 2, View::Uv16); - break; - case Yuv420p: - assert((width & 1) == 0 && (height & 1) == 0); - planes[0].Recreate(width, height, View::Gray8); - planes[1].Recreate(width / 2, height / 2, View::Gray8); - planes[2].Recreate(width / 2, height / 2, View::Gray8); - break; - case Bgra32: - planes[0].Recreate(width, height, View::Bgra32); - break; - case Bgr24: - planes[0].Recreate(width, height, View::Bgr24); - break; - case Gray8: - planes[0].Recreate(width, height, View::Gray8); - break; - case Rgb24: - planes[0].Recreate(width, height, View::Rgb24); - break; - default: - assert(0); - } - } - - template class A> SIMD_INLINE void Frame::Recreate(const Point & size, Format format_) - { - Recreate(size.x, size.y, format_); - } - - template class A> SIMD_INLINE Frame Frame::Region(const ptrdiff_t & left, const ptrdiff_t & top, const ptrdiff_t & right, const ptrdiff_t & bottom) const - { - Rectangle rect(left, top, right, bottom); - return Region(rect.left, rect.top, rect.right, rect.bottom); - } - - template class A> SIMD_INLINE Frame Frame::Region(ptrdiff_t & left, ptrdiff_t & top, ptrdiff_t & right, ptrdiff_t & bottom) const - { - if (format != None && right >= left && bottom >= top) - { - left = std::min(std::max(left, 0), width); - top = std::min(std::max(top, 0), height); - right = std::min(std::max(right, 0), width); - bottom = std::min(std::max(bottom, 0), height); - - if (format == Nv12 || format == Yuv420p) - { - left = left & ~1; - top = top & ~1; - right = (right + 1) & ~1; - bottom = (bottom + 1) & ~1; - } - - Frame frame; - *(size_t*)&frame.width = right - left; - *(size_t*)&frame.height = bottom - top; - *(Format*)&frame.format = format; - frame.flipped = flipped; - frame.timestamp = timestamp; - - frame.planes[0] = planes[0].Region(left, top, right, bottom); - - if (format == Nv12 || format == Yuv420p) - frame.planes[1] = planes[1].Region(left / 2, top / 2, right / 2, bottom / 2); - - if (format == Yuv420p) - frame.planes[2] = planes[2].Region(left / 2, top / 2, right / 2, bottom / 2); 
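-            /* Note: for the chroma-subsampled formats the clipping above snaps the
-               region to even coordinates (left/top are rounded down, right/bottom are
-               rounded up), so a requested region (3, 5, 10, 11) becomes (2, 4, 10, 12)
-               for the luma plane and (1, 2, 5, 6) for the half-resolution chroma planes. */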
-
-            return frame;
-        }
-        else
-            return Frame();
-    }
-
-    template <template<class> class A> SIMD_INLINE Frame<A> Frame<A>::Region(const Point<ptrdiff_t> & topLeft, const Point<ptrdiff_t> & bottomRight) const
-    {
-        return Region(topLeft.x, topLeft.y, bottomRight.x, bottomRight.y);
-    }
-
-    template <template<class> class A> SIMD_INLINE Frame<A> Frame<A>::Region(Point<ptrdiff_t> & topLeft, Point<ptrdiff_t> & bottomRight) const
-    {
-        return Region(topLeft.x, topLeft.y, bottomRight.x, bottomRight.y);
-    }
-
-    template <template<class> class A> SIMD_INLINE Frame<A> Frame<A>::Region(const Rectangle<ptrdiff_t> & rect) const
-    {
-        return Region(rect.left, rect.top, rect.right, rect.bottom);
-    }
-
-    template <template<class> class A> SIMD_INLINE Frame<A> Frame<A>::Region(Rectangle<ptrdiff_t> & rect) const
-    {
-        return Region(rect.left, rect.top, rect.right, rect.bottom);
-    }
-
-    template <template<class> class A> SIMD_INLINE Frame<A> Frame<A>::Flipped() const
-    {
-        Frame frame;
-        *(size_t*)&frame.width = width;
-        *(size_t*)&frame.height = height;
-        *(Format*)&frame.format = format;
-        frame.timestamp = timestamp;
-        frame.flipped = !flipped;
-        for (size_t i = 0, n = PlaneCount(); i < n; ++i)
-            frame.planes[i] = planes[i].Flipped();
-        return frame;
-    }
-
-    template <template<class> class A> SIMD_INLINE Point<ptrdiff_t> Frame<A>::Size() const
-    {
-        return Point<ptrdiff_t>(width, height);
-    }
-
-    template <template<class> class A> SIMD_INLINE size_t Frame<A>::DataSize() const
-    {
-        size_t size = 0;
-        for (size_t i = 0; i < PLANE_COUNT_MAX; ++i)
-            size += planes[i].DataSize();
-        return size;
-    }
-
-    template <template<class> class A> SIMD_INLINE size_t Frame<A>::Area() const
-    {
-        return width*height;
-    }
-
-    template <template<class> class A> SIMD_INLINE size_t Frame<A>::PlaneCount(Format format)
-    {
-        switch (format)
-        {
-        case None: return 0;
-        case Nv12: return 2;
-        case Yuv420p: return 3;
-        case Bgra32: return 1;
-        case Bgr24: return 1;
-        case Gray8: return 1;
-        case Rgb24: return 1;
-        default: assert(0); return 0;
-        }
-    }
-
-    template <template<class> class A> SIMD_INLINE size_t Frame<A>::PlaneCount() const
-    {
-        return PlaneCount(format);
-    }
-
-    // Frame utilities implementation:
-
-    template <template<class> class A, template<class> class B> SIMD_INLINE bool EqualSize(const Frame<A> & a, const Frame<B> & b)
-    {
-        return
-            (a.width == b.width && a.height == b.height);
-    }
-
-    template <template<class> class A, template<class> class B> SIMD_INLINE bool Compatible(const Frame<A> & a, const Frame<B> & b)
-    {
-        typedef typename Frame<A>::Format Format;
-
-        return
-            (a.width == b.width && a.height == b.height && a.format == (Format)b.format && a.flipped == b.flipped);
-    }
-
-    template <template<class> class A, template<class> class B> SIMD_INLINE void Copy(const Frame<A> & src, Frame<B> & dst)
-    {
-        assert(Compatible(src, dst));
-
-        if (src.format)
-        {
-            for (size_t i = 0, n = src.PlaneCount(); i < n; ++i)
-                Simd::Copy(src.planes[i], dst.planes[i]);
-        }
-    }
-
-    template <template<class> class A> SIMD_INLINE void Convert(const Frame<A> & src, Frame<A> & dst)
-    {
-        assert(EqualSize(src, dst) && src.format && dst.format && src.flipped == dst.flipped);
-
-        if (src.format == dst.format)
-        {
-            Copy(src, dst);
-            return;
-        }
-
-        switch (src.format)
-        {
-        case Frame<A>::Nv12:
-            switch (dst.format)
-            {
-            case Frame<A>::Yuv420p:
-                Copy(src.planes[0], dst.planes[0]);
-                DeinterleaveUv(src.planes[1], dst.planes[1], dst.planes[2]);
-                break;
-            case Frame<A>::Bgra32:
-            {
-                View<A> u(src.Size() / 2, View<A>::Gray8), v(src.Size() / 2, View<A>::Gray8);
-                DeinterleaveUv(src.planes[1], u, v);
-                Yuv420pToBgra(src.planes[0], u, v, dst.planes[0]);
-                break;
-            }
-            case Frame<A>::Bgr24:
-            {
-                View<A> u(src.Size() / 2, View<A>::Gray8), v(src.Size() / 2, View<A>::Gray8);
-                DeinterleaveUv(src.planes[1], u, v);
-                Yuv420pToBgr(src.planes[0], u, v, dst.planes[0]);
-                break;
-            }
-            case Frame<A>::Gray8:
-                Copy(src.planes[0], dst.planes[0]);
-                break;
-            case Frame<A>::Rgb24:
-            {
-                View<A> u(src.Size() / 2, View<A>::Gray8), v(src.Size() / 2, View<A>::Gray8);
-                DeinterleaveUv(src.planes[1], u, v);
-                View<A> bgr(src.Size(), View<A>::Bgr24);
-                Yuv420pToBgr(src.planes[0], u, v, bgr);
-                BgrToRgb(bgr, dst.planes[0]);
-                break;
-            }
-            default:
-                assert(0);
-            }
-            break;
-
-        case Frame<A>::Yuv420p:
-            switch (dst.format)
-            {
-            case Frame<A>::Nv12:
-                Copy(src.planes[0], dst.planes[0]);
-                InterleaveUv(src.planes[1], src.planes[2], dst.planes[1]);
-                break;
-            case Frame<A>::Bgra32:
-                Yuv420pToBgra(src.planes[0], src.planes[1], src.planes[2], dst.planes[0]);
-                break;
-            case Frame<A>::Bgr24:
-                Yuv420pToBgr(src.planes[0], src.planes[1], src.planes[2], dst.planes[0]);
-                break;
-            case Frame<A>::Gray8:
-                Copy(src.planes[0], dst.planes[0]);
-                break;
-            case Frame<A>::Rgb24:
-            {
-                View<A> bgr(src.Size(), View<A>::Bgr24);
-                Yuv420pToBgr(src.planes[0], src.planes[1], src.planes[2], bgr);
-                BgrToRgb(bgr, dst.planes[0]);
-                break;
-            }
-            default:
-                assert(0);
-            }
-            break;
-
-        case Frame<A>::Bgra32:
-            switch (dst.format)
-            {
-            case Frame<A>::Nv12:
-            {
-                View<A> u(src.Size() / 2, View<A>::Gray8), v(src.Size() / 2, View<A>::Gray8);
-                BgraToYuv420p(src.planes[0], dst.planes[0], u, v);
-                InterleaveUv(u, v, dst.planes[1]);
-                break;
-            }
-            case Frame<A>::Yuv420p:
-                BgraToYuv420p(src.planes[0], dst.planes[0], dst.planes[1], dst.planes[2]);
-                break;
-            case Frame<A>::Bgr24:
-                BgraToBgr(src.planes[0], dst.planes[0]);
-                break;
-            case Frame<A>::Gray8:
-                BgraToGray(src.planes[0], dst.planes[0]);
-                break;
-            case Frame<A>::Rgb24:
-                BgraToRgb(src.planes[0], dst.planes[0]);
-                break;
-            default:
-                assert(0);
-            }
-            break;
-
-        case Frame<A>::Bgr24:
-            switch (dst.format)
-            {
-            case Frame<A>::Nv12:
-            {
-                View<A> u(src.Size() / 2, View<A>::Gray8), v(src.Size() / 2, View<A>::Gray8);
-                BgrToYuv420p(src.planes[0], dst.planes[0], u, v);
-                InterleaveUv(u, v, dst.planes[1]);
-                break;
-            }
-            case Frame<A>::Yuv420p:
-                BgrToYuv420p(src.planes[0], dst.planes[0], dst.planes[1], dst.planes[2]);
-                break;
-            case Frame<A>::Bgra32:
-                BgrToBgra(src.planes[0], dst.planes[0]);
-                break;
-            case Frame<A>::Gray8:
-                BgrToGray(src.planes[0], dst.planes[0]);
-                break;
-            case Frame<A>::Rgb24:
-                BgrToRgb(src.planes[0], dst.planes[0]);
-                break;
-            default:
-                assert(0);
-            }
-            break;
-
-        case Frame<A>::Gray8:
-            switch (dst.format)
-            {
-            case Frame<A>::Nv12:
-                Copy(src.planes[0], dst.planes[0]);
-                Fill(dst.planes[1], 128);
-                break;
-            case Frame<A>::Yuv420p:
-                Copy(src.planes[0], dst.planes[0]);
-                Fill(dst.planes[1], 128);
-                Fill(dst.planes[2], 128);
-                break;
-            case Frame<A>::Bgra32:
-                GrayToBgra(src.planes[0], dst.planes[0]);
-                break;
-            case Frame<A>::Bgr24:
-                GrayToBgr(src.planes[0], dst.planes[0]);
-                break;
-            case Frame<A>::Rgb24:
-                GrayToBgr(src.planes[0], dst.planes[0]);
-                break;
-            default:
-                assert(0);
-            }
-            break;
-
-        case Frame<A>::Rgb24:
-            switch (dst.format)
-            {
-            case Frame<A>::Nv12:
-            {
-                View<A> bgr(src.Size(), View<A>::Bgr24);
-                BgrToRgb(src.planes[0], bgr);
-                View<A> u(src.Size() / 2, View<A>::Gray8), v(src.Size() / 2, View<A>::Gray8);
-                BgrToYuv420p(bgr, dst.planes[0], u, v);
-                InterleaveUv(u, v, dst.planes[1]);
-                break;
-            }
-            case Frame<A>::Yuv420p:
-            {
-                View<A> bgr(src.Size(), View<A>::Bgr24);
-                BgrToRgb(src.planes[0], bgr);
-                BgrToYuv420p(bgr, dst.planes[0], dst.planes[1], dst.planes[2]);
-                break;
-            }
-            case Frame<A>::Bgra32:
-                RgbToBgra(src.planes[0], dst.planes[0]);
-                break;
-            case Frame<A>::Gray8:
-                RgbToGray(src.planes[0], dst.planes[0]);
-                break;
-            case Frame<A>::Rgb24:
-                BgrToRgb(src.planes[0], dst.planes[0]);
-                break;
-            default:
-                assert(0);
-            }
-            break;
-
-        default:
-            assert(0);
-        }
-    }
-}
-
-#endif//__SimdFrame_hpp__
diff --git a/src/3rd/Simd/Simd/SimdGemm.h b/src/3rd/Simd/Simd/SimdGemm.h
deleted file mode 100644
index
962bb7cf..00000000 --- a/src/3rd/Simd/Simd/SimdGemm.h +++ /dev/null @@ -1,634 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#ifndef __SimdGemm_h__ -#define __SimdGemm_h__ - -#include "Simd/SimdArray.h" -#include "Simd/SimdBase.h" -#include "Simd/SimdParallel.hpp" -#include "Simd/SimdPerformance.h" - -#ifdef _N -#undef _N -#endif - -namespace Simd -{ - template class GemmNN - { - public: - typedef void(*Main)(size_t K, T alpha, const T * A, size_t lda, const T * B, size_t ldb, size_t sb, T * C, size_t ldc, TM tail); - typedef void(*Tail)(size_t M, size_t K, T alpha, const T * A, size_t lda, const T * B, size_t ldb, size_t sb, T * C, size_t ldc, TM tail); - typedef void(*PackA)(const T * A, size_t lda, size_t M, size_t K, size_t microM, T * pA); - typedef void(*PackB)(const T * B, size_t ldb, size_t K, size_t N, size_t microN, T * pB); - typedef void(*ScaleC)(size_t M, size_t N, T beta, T * C, size_t ldc); - typedef TM(*TailMask)(ptrdiff_t tail); - - GemmNN(size_t M, size_t N, size_t K, size_t microM, size_t microN, size_t L1, size_t L2, size_t L3, size_t F, - Main kernelMM, Main kernelMT, Tail kernelTM, Tail kernelTT, PackA packA, PackB packB, ScaleC scaleC, TailMask tailMask) - : _M(M) - , _N(N) - , _K(K) - , _microM(microM) - , _microN(microN) - , _F(F) - , _threadNumber(Base::GetThreadNumber()) - , _kernelMM(kernelMM) - , _kernelMT(kernelMT) - , _kernelTM(kernelTM) - , _kernelTT(kernelTT) - , _scaleC(scaleC) - , _packB(packB) - , _packA(packA) - { - _macroK = Simd::Min(L1 / sizeof(T) / _microN, _K); - _macroM = Simd::Min(AlignLoAny(L2 / sizeof(T) / _macroK, _microM), AlignHiAny(_M, _microM)); - _macroN = Simd::Min(AlignLoAny(L3 / sizeof(T) / _macroK, _microN), AlignHiAny(_N, _microN)); - if (_N * _M * _K < 256 * 256 * 256 * 2) - _threadNumber = 1; - _pA.resize(_threadNumber); - _pB.resize(_threadNumber); - for (size_t t = 0; t < _threadNumber; ++t) - { - _pA[t].Resize(_macroM * _macroK); - _pB[t].Resize(_macroN * _macroK); - } - size_t NF = AlignLo(_N, _F); - if (tailMask) - { - _main = TM(-1); - _tail = NF == _N ? TM(-1) : tailMask(_N - NF); - } - else - { - _main = TM(_F); - _tail = NF == _N ? 
TM(_F) : TM(_N - NF); - } - } - - void Run(const T * alpha, const T * A, size_t lda, const T * B, size_t ldb, const T * beta, T * C, size_t ldc) - { - Simd::Parallel(0, _N, [&](size_t thread, size_t begin, size_t end) - { - ThreadKernel(end - begin, *alpha, A, lda, B + begin, ldb, *beta, C + begin, ldc, thread); - }, _threadNumber, _microN); - } - - private: - - void ThreadKernel(size_t N, T alpha, const T * A, size_t lda, const T * B, size_t ldb, T beta, T * C, size_t ldc, size_t thread) - { - for (size_t j = 0; j < N; j += _macroN) - { - size_t macroN = Simd::Min(N, j + _macroN) - j; - for (size_t k = 0; k < _K; k += _macroK) - { - size_t macroK = Simd::Min(_K, k + _macroK) - k; - for (size_t i = 0; i < _M; i += _macroM) - { - size_t macroM = Simd::Min(_M, i + _macroM) - i; - if (k == 0) - _scaleC(macroM, macroN, beta, C + i * ldc + j, ldc); - MacroKernel(macroM, macroN, macroK, alpha, A + i * lda + k, lda, B + k * ldb + j, ldb, beta, C + i * ldc + j, ldc, i == 0, thread); - } - } - } - } - - void MacroKernel(size_t M, size_t N, size_t K, T alpha, const T * A, size_t lda, const T * B, size_t ldb, T beta, T * C, size_t ldc, bool packB, size_t thread) - { - size_t klda = lda; - if (_packA) - { - _packA(A, lda, M, K, _microM, _pA[thread].data); - A = _pA[thread].data; - lda = K; - klda = 1; - } - size_t MA = AlignLoAny(M, _microM); - size_t NA = AlignLoAny(N, _microN); - size_t j = 0; - for (; j < NA; j += _microN) - { - T * pB = _pB[thread].data + j * _macroK; - if (packB) - _packB(B + j, ldb, K, _microN, _microN, pB); - size_t i = 0; - for (; i < MA; i += _microM) - _kernelMM(K, alpha, A + i * lda, klda, pB, _F, _microN, C + i * ldc + j, ldc, _main); - if (i < M) - _kernelTM(M - i, K, alpha, A + i * lda, klda, pB, _F, _microN, C + i * ldc + j, ldc, _main); - } - if (j < N) - { - T * pB = _pB[thread].data + j * _macroK; - if (packB) - _packB(B + j, ldb, K, N - j, _microN, pB); - size_t i = 0; - for (; i < MA; i += _microM) - _kernelMT(K, alpha, A + i * lda, klda, pB, _F, _microN, C + i * ldc + j, ldc, _tail); - if (i < M) - _kernelTT(M - i, K, alpha, A + i * lda, klda, pB, _F, _microN, C + i * ldc + j, ldc, _tail); - } - } - - typedef std::vector> Arrays; - - Arrays _pA, _pB; - size_t _M, _N, _K, _microM, _microN, _macroM, _macroN, _macroK, _F, _threadNumber; - TM _main, _tail; - Main _kernelMM, _kernelMT; - Tail _kernelTM, _kernelTT; - ScaleC _scaleC; - PackB _packB; - PackA _packA; - }; - - template class GemmNT - { - public: - typedef void(*Kernel)(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc); - typedef void(*ScaleC)(size_t M, size_t N, T beta, T * C, size_t ldc); - - GemmNT(size_t M, size_t N, size_t K, size_t L1, size_t L2, size_t L3, size_t F, ScaleC scaleC, - Kernel k1x1, Kernel k1x4, Kernel k2x1, Kernel k2x4, Kernel k3x1, Kernel k3x4, Kernel k6x1, Kernel k6x4) - : _M(M) - , _N(N) - , _K(K) - , _F(F) - , _threadNumber(Base::GetThreadNumber()) - , _scaleC(scaleC) - , _k1x1(k1x1) - , _k1x4(k1x4) - , _k2x1(k2x1) - , _k2x4(k2x4) - , _k3x1(k3x1) - , _k3x4(k3x4) - , _k6x1(k6x1) - , _k6x4(k6x4) - { - _microN = 4; - _microM = _k6x4 ? 
6 : 3; - _macroK = AlignLo(L1 / sizeof(T) / _microN, _F); - _macroM = AlignLoAny(L2 / sizeof(T) / _macroK, _microM); - _macroN = AlignLoAny(L3 / sizeof(T) / _macroK, _microN); - if (_N * _M * _K < 256 * 256 * 256 * 2) - _threadNumber = 1; - } - - void Run(const T * alpha, const T * A, size_t lda, const T * B, size_t ldb, const T * beta, T * C, size_t ldc) - { - Simd::Parallel(0, _N, [&](size_t thread, size_t begin, size_t end) - { - ThreadKernel(end - begin, *alpha, A, lda, B + begin*ldb, ldb, *beta, C + begin, ldc, thread); - }, _threadNumber, _microN); - } - - private: - - void ThreadKernel(size_t N, T alpha, const T * A, size_t lda, const T * B, size_t ldb, T beta, T * C, size_t ldc, size_t thread) - { - for (size_t j = 0; j < N; j += _macroN) - { - size_t macroN = Simd::Min(N, j + _macroN) - j; - for (size_t k = 0; k < _K; k += _macroK) - { - size_t macroK = Simd::Min(_K, k + _macroK) - k; - for (size_t i = 0; i < _M; i += _macroM) - { - size_t macroM = Simd::Min(_M, i + _macroM) - i; - if (k == 0) - _scaleC(macroM, macroN, beta, C + i * ldc + j, ldc); - MacroKernel(macroM, macroN, macroK, alpha, A + i * lda + k, lda, B + j * ldb + k, ldb, beta, C + i * ldc + j, ldc); - } - } - } - } - - void MacroKernel(size_t M, size_t N, size_t K, T alpha, const T * A, size_t lda, const T * B, size_t ldb, T beta, T * C, size_t ldc) - { - size_t N4 = Simd::AlignLo(N, 4); - size_t i = 0; - if (_k6x4) - { - size_t M6 = Simd::AlignLoAny(M, 6); - for (; i < M6; i += 6) - { - const float * pA = A + i * lda; - float * pC = C + i * ldc; - size_t j = 0; - for (; j < N4; j += 4) - _k6x4(K, alpha, pA, lda, B + j * ldb, ldb, pC + j, ldc); - for (; j < N; ++j) - _k6x1(K, alpha, pA, lda, B + j * ldb, ldb, pC + j, ldc); - } - } - if (_k3x4) - { - size_t M3 = Simd::AlignLoAny(M, 3); - for (; i < M3; i += 3) - { - const float * pA = A + i * lda; - float * pC = C + i * ldc; - size_t j = 0; - for (; j < N4; j += 4) - _k3x4(K, alpha, pA, lda, B + j * ldb, ldb, pC + j, ldc); - for (; j < N; ++j) - _k3x1(K, alpha, pA, lda, B + j * ldb, ldb, pC + j, ldc); - } - for (; i < M - 1; i += 2) - { - const float * pA = A + i * lda; - float * pC = C + i * ldc; - size_t j = 0; - for (; j < N4; j += 4) - _k2x4(K, alpha, pA, lda, B + j * ldb, ldb, pC + j, ldc); - for (; j < N; ++j) - _k2x1(K, alpha, pA, lda, B + j * ldb, ldb, pC + j, ldc); - } - } - for (; i < M; i++) - { - const float * pA = A + i * lda; - float * pC = C + i * ldc; - size_t j = 0; - for (; j < N4; j += 4) - _k1x4(K, alpha, pA, lda, B + j * ldb, ldb, pC + j, ldc); - for (; j < N; ++j) - _k1x1(K, alpha, pA, lda, B + j * ldb, ldb, pC + j, ldc); - } - } - - size_t _M, _N, _K, _microM, _microN, _macroM, _macroN, _macroK, _F, _threadNumber; - ScaleC _scaleC; - Kernel _k1x1, _k1x4, _k2x1, _k2x4, _k3x1, _k3x4, _k6x1, _k6x4; - }; - - template class GemmNNcb - { - public: - typedef void(*Main)(size_t K, T alpha, const T * A, size_t lda, const T * B, size_t ldb, size_t sb, T * C, size_t ldc, TM tail); - typedef void(*Tail)(size_t M, size_t K, T alpha, const T * A, size_t lda, const T * B, size_t ldb, size_t sb, T * C, size_t ldc, TM tail); - typedef void(*PackA)(const T * A, size_t lda, size_t M, size_t K, size_t microM, T * pA); - typedef void(*PackB)(const T * B, size_t ldb, size_t K, size_t N, size_t microN, T * pB); - typedef void(*ScaleC)(size_t M, size_t N, T beta, T * C, size_t ldc); - typedef TM(*TailMask)(ptrdiff_t tail); - - GemmNNcb(size_t M, size_t N, size_t K, size_t microM, size_t microN, size_t L1, size_t L2, size_t L3, size_t F, - Main kernelMM, Main kernelMT, 
Tail kernelTM, Tail kernelTT, PackA packA, PackB packB, ScaleC scaleC, TailMask tailMask, bool compatible = false) - : _0(0) - , _1(1) - { - L2 = Simd::RestrictRange(size_t(::sqrt(L1 * L3)), L2/4, L2); - _compatible = compatible; - _M = M; - _N = N; - _K = K; - _microM = microM; - _microN = microN; - _F = F; - _kernelMM = kernelMM; - _kernelMT = kernelMT; - _kernelTM = kernelTM; - _kernelTT = kernelTT; - _scaleC = scaleC; - _packB = packB; - _packA = packA; - _macroK = Simd::Min(L1 / sizeof(T) / _microN, _K); - _macroM = Simd::Min(AlignLoAny(L2 / sizeof(T) / _macroK, _microM), AlignHiAny(_M, _microM)); - _macroN = Simd::Min(AlignLoAny(L3 / sizeof(T) / _macroK, _microN), AlignHiAny(_N, _compatible ? _F : _microN)); - if (_packA) - { - - _pA.Resize(_macroM * _macroK); - } - size_t NF = AlignLo(_N, _F); - if (tailMask) - { - _main = TM(-1); - _tail = NF == _N ? TM(-1) : tailMask(_N - NF); - } - else - { - _main = TM(_F); - _tail = NF == _N ? TM(_F) : TM(_N - NF); - } - } - - SIMD_INLINE size_t BufferSize() const - { - return AlignHiAny(_N, _compatible ? _F : _microN)*_K; - } - - void ReorderB(const T * B, size_t ldb, T * pB) - { - if (_compatible) - { - _packB(B, ldb, _K, _N, _F, pB); - } - else - { - for (size_t j = 0; j < _N; j += _macroN) - { - size_t macroN = Simd::Min(_N, j + _macroN) - j; - for (size_t k = 0; k < _K; k += _macroK) - { - size_t macroK = Simd::Min(_K, k + _macroK) - k; - _packB(B + k * ldb + j, ldb, macroK, macroN, _microN, pB); - pB += AlignHiAny(macroN, _microN)*macroK; - } - } - } - } - - SIMD_INLINE void Run(const T * A, size_t lda, const T * pB, T * C, size_t ldc) - { - Run(_M, A, lda, pB, C, ldc); - } - - void Run(size_t M, const T * A, size_t lda, const T * pB, T * C, size_t ldc) - { - assert(M <= _M); - for (size_t j = 0; j < _N; j += _macroN) - { - size_t macroN = Simd::Min(_N, j + _macroN) - j; - for (size_t k = 0; k < _K; k += _macroK) - { - size_t macroK = Simd::Min(_K, k + _macroK) - k; - for (size_t i = 0; i < M; i += _macroM) - { - size_t macroM = Simd::Min(M, i + _macroM) - i; - if (k == 0) - _scaleC(macroM, macroN, _0, C + i * ldc + j, ldc); - if (_compatible) - MacroKernelCompatible(macroM, macroN, macroK, A + i * lda + k, lda, pB + j * _K + k * _F, C + i * ldc + j, ldc); - else - MacroKernelSpecific(macroM, macroN, macroK, A + i * lda + k, lda, pB, C + i * ldc + j, ldc); - } - if(!_compatible) - pB += AlignHiAny(macroN, _microN)*macroK; - } - } - } - - private: - - void MacroKernelSpecific(size_t M, size_t N, size_t K, const T * A, size_t lda, const T * pB, T * C, size_t ldc) - { - size_t klda = lda; - if (_packA) - { - _packA(A, lda, M, K, _microM, _pA.data); - A = _pA.data; - lda = K; - klda = 1; - } - size_t MA = AlignLoAny(M, _microM); - size_t NA = AlignLoAny(N, _microN); - size_t j = 0; - for (; j < NA; j += _microN) - { - size_t i = 0; - for (; i < MA; i += _microM) - _kernelMM(K, _1, A + i * lda, klda, pB, _F, _microN, C + i * ldc + j, ldc, _main); - if (i < M) - _kernelTM(M - i, K, _1, A + i * lda, klda, pB, _F, _microN, C + i * ldc + j, ldc, _main); - pB += _microN * K; - } - if (j < N) - { - size_t i = 0; - for (; i < MA; i += _microM) - _kernelMT(K, _1, A + i * lda, klda, pB, _F, _microN, C + i * ldc + j, ldc, _tail); - if (i < M) - _kernelTT(M - i, K, _1, A + i * lda, klda, pB, _F, _microN, C + i * ldc + j, ldc, _tail); - } - } - - void MacroKernelCompatible(size_t M, size_t N, size_t K, const T * A, size_t lda, const T * pB, T * C, size_t ldc) - { - size_t klda = lda, plda = lda; - T * pA = (T*)A; - if (_packA) - { - //_packA(A, lda, M, K, 
_microM, _pA.data); - pA = _pA.data; - plda = K; - klda = 1; - } - size_t MA = AlignLoAny(M, _microM); - size_t NA = AlignLoAny(N, _microN); - size_t j = 0; - for (; j < NA; j += _microN) - { - size_t i = 0; - for (; i < MA; i += _microM) - { - if (_packA && j == 0) - _packA(A + i * lda, lda, _microM, K, _microM, pA + i * plda); - _kernelMM(K, _1, pA + i * plda, klda, pB, _F * _K, _F, C + i * ldc + j, ldc, _main); - } - if (i < M) - { - if (_packA && j == 0) - _packA(A + i * lda, lda, M - i, K, _microM, pA + i * plda); - _kernelTM(M - i, K, _1, pA + i * plda, klda, pB, _F * _K, _F, C + i * ldc + j, ldc, _main); - } - pB += _microN * _K; - } - if (j < N) - { - size_t i = 0; - for (; i < MA; i += _microM) - { - if (_packA && j == 0) - _packA(A + i * lda, lda, _microM, K, _microM, pA + i * plda); - _kernelMT(K, _1, pA + i * plda, klda, pB, _F * _K, _F, C + i * ldc + j, ldc, _tail); - } - if (i < M) - { - if (_packA && j == 0) - _packA(A + i * lda, lda, M - i, K, _microM, pA + i * plda); - _kernelTT(M - i, K, _1, pA + i * plda, klda, pB, _F * _K, _F, C + i * ldc + j, ldc, _tail); - } - } - } - - typedef Simd::Array Array; - - size_t _M, _N, _K, _microM, _microN, _macroM, _macroN, _macroK, _F; - TM _main, _tail; - Main _kernelMM, _kernelMT; - Tail _kernelTM, _kernelTT; - ScaleC _scaleC; - PackB _packB; - PackA _packA; - Array _pA; - T _0, _1; - bool _compatible; - }; - - enum GemmKernelType - { - GemmKernelAny = 0, - GemmKernelF1, - GemmKernelF2, - GemmKernelF3, - GemmKernelF4, - }; - -#ifdef SIMD_SSE_ENABLE - namespace Sse - { - void GemmKernel4x12nn(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, size_t tail); - void GemmKernel4x8nn(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, size_t tail); - void GemmKernel4x4nn(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, size_t tail); - - void GemmKernel6x8nn(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, size_t tail); - void GemmKernel6x4nn(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, size_t tail); - - void GemmKernelMx12nn(size_t M, size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, size_t tail); - void GemmKernelMx8nn(size_t M, size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, size_t tail); - void GemmKernelMx4nn(size_t M, size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, size_t tail); - - void GemmPackA(const float * A, size_t lda, size_t M, size_t K, size_t microM, float * pA); - void GemmPackB(const float * B, size_t ldb, size_t K, size_t N, size_t microN, float * pB); - void GemmScaleC(size_t M, size_t N, float beta, float * C, size_t ldc); - - size_t Gemm32fNNcbBufferSize(size_t M, size_t N, size_t K, GemmKernelType type, bool compatibility); - void Gemm32fNNcbReorderB(size_t M, size_t N, size_t K, const float * B, float * pB, GemmKernelType type, bool compatibility); - void Gemm32fNNcbRun(size_t M, size_t N, size_t K, const float * A, const float * pB, float * C, GemmKernelType type, bool compatibility); - } -#endif//SIMD_SSE_ENABLE - -#ifdef SIMD_AVX_ENABLE - namespace Avx - { - void GemmKernel4x24nn(size_t K, float 
alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, size_t tail); - void GemmKernel4x16nn(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, size_t tail); - void GemmKernel4x8nn(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, size_t tail); - - void GemmKernel6x16nn(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, size_t tail); - void GemmKernel6x8nn(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, size_t tail); - - void GemmKernelMx24nn(size_t M, size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, size_t tail); - void GemmKernelMx16nn(size_t M, size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, size_t tail); - void GemmKernelMx8nn(size_t M, size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, size_t tail); - - void GemmPackA(const float * A, size_t lda, size_t M, size_t K, size_t microM, float * pA); - void GemmPackB(const float * B, size_t ldb, size_t K, size_t N, size_t microN, float * pB); - void GemmScaleC(size_t M, size_t N, float beta, float * C, size_t ldc); - - size_t Gemm32fNNcbBufferSize(size_t M, size_t N, size_t K, GemmKernelType type, bool compatibility); - void Gemm32fNNcbReorderB(size_t M, size_t N, size_t K, const float * B, float * pB, GemmKernelType type, bool compatibility); - void Gemm32fNNcbRun(size_t M, size_t N, size_t K, const float * A, const float * pB, float * C, GemmKernelType type, bool compatibility); - } -#endif//SIMD_AVX_ENABLE - -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - void GemmKernel4x24nn(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, size_t tail); - void GemmKernel4x16nn(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, size_t tail); - void GemmKernel4x8nn(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, size_t tail); - - void GemmKernel6x16nn(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, size_t tail); - void GemmKernel6x8nn(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, size_t tail); - - void GemmKernelMx24nn(size_t M, size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, size_t tail); - void GemmKernelMx16nn(size_t M,size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, size_t tail); - void GemmKernelMx8nn(size_t M, size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, size_t tail); - - size_t Gemm32fNNcbBufferSize(size_t M, size_t N, size_t K, GemmKernelType type, bool compatibility); - void Gemm32fNNcbReorderB(size_t M, size_t N, size_t K, const float * B, float * pB, GemmKernelType type, bool compatibility); - void Gemm32fNNcbRun(size_t M, size_t N, size_t K, const float * A, const float * pB, float * C, GemmKernelType type, bool 
compatibility); - } -#endif//SIMD_AVX_ENABLE - -#ifdef SIMD_AVX512F_ENABLE - namespace Avx512f - { - void GemmKernel4x48nn(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, __mmask16 mask); - void GemmKernel4x32nn(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, __mmask16 mask); - void GemmKernel4x16nn(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, __mmask16 mask); - - void GemmKernel6x64nn(size_t K, float alpha, const float* A, size_t lda, const float* B, size_t ldb, size_t sb, float* C, size_t ldc, __mmask16 mask); - void GemmKernel6x48nn(size_t K, float alpha, const float* A, size_t lda, const float* B, size_t ldb, size_t sb, float* C, size_t ldc, __mmask16 mask); - void GemmKernel6x32nn(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, __mmask16 mask); - void GemmKernel6x16nn(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, __mmask16 mask); - - void GemmKernel8x48nn(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, __mmask16 mask); - void GemmKernel8x32nn(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, __mmask16 mask); - void GemmKernel8x16nn(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, __mmask16 mask); - - void GemmKernel9x48nn(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, __mmask16 mask); - void GemmKernel9x32nn(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, __mmask16 mask); - void GemmKernel9x16nn(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, __mmask16 mask); - - void GemmKernel12x32nn(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, __mmask16 mask); - void GemmKernel12x16nn(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, __mmask16 mask); - - void GemmKernel14x32nn(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, __mmask16 mask); - void GemmKernel14x16nn(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, __mmask16 mask); - - void GemmKernelMx48nn(size_t M, size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, __mmask16 mask); - void GemmKernelMx32nn(size_t M, size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, __mmask16 mask); - void GemmKernelMx16nn(size_t M, size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, __mmask16 mask); - - void GemmPackA(const float * A, size_t lda, size_t M, size_t K, size_t microM, float * pA); - void GemmPackB(const float * B, size_t ldb, size_t K, size_t N, size_t microN, float * pB); - void GemmScaleC(size_t M, size_t N, float beta, float * C, size_t ldc); - - size_t 
Gemm32fNNcbBufferSize(size_t M, size_t N, size_t K, GemmKernelType type, bool compatibility);
-        void Gemm32fNNcbReorderB(size_t M, size_t N, size_t K, const float * B, float * pB, GemmKernelType type, bool compatibility);
-        void Gemm32fNNcbRun(size_t M, size_t N, size_t K, const float * A, const float * pB, float * C, GemmKernelType type, bool compatibility);
-    }
-#endif//SIMD_AVX512F_ENABLE
-
-#ifdef SIMD_NEON_ENABLE
-    namespace Neon
-    {
-        void GemmKernel4x12nn(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, size_t tail);
-        void GemmKernel4x8nn(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, size_t tail);
-        void GemmKernel4x4nn(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, size_t tail);
-
-        void GemmKernel6x8nn(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, size_t tail);
-        void GemmKernel6x4nn(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, size_t tail);
-
-        void GemmKernelMx12nn(size_t M, size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, size_t tail);
-        void GemmKernelMx8nn(size_t M, size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, size_t tail);
-        void GemmKernelMx4nn(size_t M, size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, size_t tail);
-
-        void GemmPackA(const float * A, size_t lda, size_t M, size_t K, size_t microM, float * pA);
-        void GemmPackB(const float * B, size_t ldb, size_t K, size_t N, size_t microN, float * pB);
-        void GemmScaleC(size_t M, size_t N, float beta, float * C, size_t ldc);
-
-        size_t Gemm32fNNcbBufferSize(size_t M, size_t N, size_t K, GemmKernelType type, bool compatibility);
-        void Gemm32fNNcbReorderB(size_t M, size_t N, size_t K, const float * B, float * pB, GemmKernelType type, bool compatibility);
-        void Gemm32fNNcbRun(size_t M, size_t N, size_t K, const float * A, const float * pB, float * C, GemmKernelType type, bool compatibility);
-    }
-#endif//SIMD_NEON_ENABLE
-}
-
-#endif//__SimdGemm_h__
diff --git a/src/3rd/Simd/Simd/SimdImageMatcher.hpp b/src/3rd/Simd/Simd/SimdImageMatcher.hpp
deleted file mode 100644
index c9bc3dd0..00000000
--- a/src/3rd/Simd/Simd/SimdImageMatcher.hpp
+++ /dev/null
@@ -1,462 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#ifndef __SimdImageMatcher_hpp__
-#define __SimdImageMatcher_hpp__
-
-#include "Simd/SimdLib.hpp"
-
-#include <memory>
-
-namespace Simd
-{
-    /*! @ingroup cpp_image_matcher
-
-        \short The ImageMatcher structure provides a fast algorithm for searching for similar images.
-
-        Using example (the filter removes duplicates from the list):
-        \verbatim
-        #include "Simd/SimdImageMatcher.hpp"
-
-        typedef Simd::ImageMatcher<size_t> ImageMatcher;
-        typedef std::shared_ptr<ImageMatcher::View> ViewPtr;
-        typedef std::vector<ViewPtr> ViewPtrs;
-
-        void FilterDuplicates(const ViewPtrs & src, double threshold, ViewPtrs & dst)
-        {
-            ImageMatcher matcher;
-            matcher.Init(threshold, ImageMatcher::Hash16x16, src.size());
-            for (size_t i = 0; i < src.size(); ++i)
-            {
-                ImageMatcher::HashPtr hash = matcher.Create(*src[i], i);
-                ImageMatcher::Results results;
-                if (!matcher.Find(hash, results))
-                {
-                    matcher.Add(hash);
-                    dst.push_back(src[i]);
-                }
-            }
-        }
-        \endverbatim
-    */
-    template <class Tag, template <class> class Allocator>
-    struct ImageMatcher
-    {
-        typedef Simd::View<Allocator> View; /*!< An image type definition. */
-
-        /*!
-            \short The Hash structure is used for fast image matching.
-
-            To create the structure use method Simd::ImageMatcher::Create().
-        */
-        struct Hash
-        {
-            Tag tag; /*!< An arbitrary tag linked with the image. */
-
-        private:
-            Hash(const Tag & t, size_t mainSize, size_t fastSize)
-                : tag(t)
-                , skip(false)
-            {
-                hash.resize(mainSize + fastSize, 0);
-                main = hash.data();
-                fast = main + mainSize;
-            }
-
-            std::vector<uint8_t, Allocator<uint8_t> > hash;
-            uint8_t * main;
-            uint8_t * fast;
-            mutable bool skip;
-
-            friend struct ImageMatcher;
-        };
-        typedef std::shared_ptr<Hash> HashPtr; /*!< A shared pointer to Hash structure. */
-
-        /*!
-            \short The Result structure describes a match between the current image and an image added earlier to ImageMatcher.
-        */
-        struct Result
-        {
-            const Hash * hash; /*!< A hash of the found similar image. */
-            const double difference; /*!< The mean squared difference between the current and the found similar image. */
-
-            /*!
-                Creates a new Result structure.
-
-                \param [in] h - a pointer to the hash of the found similar image.
-                \param [in] d - a mean squared difference.
-            */
-            Result(const Hash * h, double d)
-                : hash(h)
-                , difference(d)
-            {
-            }
-        };
-        typedef std::vector<Result> Results; /*!< A vector with results. */
-
-        /*!
-            \enum HashType
-
-            Describes the size of the reduced image used in the image Hash.
-        */
-        enum HashType
-        {
-            Hash16x16, /*!< 16x16 reduced image size. */
-            Hash32x32, /*!< 32x32 reduced image size. */
-            Hash64x64, /*!< 64x64 reduced image size. */
-        };
-
-        /*!
-            Signals whether ImageMatcher is still uninitialized.
-
-            \return true if ImageMatcher is not initialized.
-        */
-        bool Empty() const
-        {
-            return !_matcher;
-        }
-
-        /*!
-            Gets the total number of images added to ImageMatcher.
-
-            \return the total number of images added to ImageMatcher.
-        */
-        size_t Size() const
-        {
-            return _matcher ? _matcher->Size() : 0;
-        }
-
-        /*!
-            Initializes ImageMatcher for search.
-
-            \param [in] threshold - a maximal mean squared difference for similar images. By default it is equal to 0.05.
-            \param [in] type - a type of Hash used for matching. By default it is equal to ImageMatcher::Hash16x16.
-            \param [in] number - an estimated total number of images used for matching. By default it is equal to 0.
-            \param [in] normalized - a flag signaling that images have a normalized histogram. By default it is false.
-            \return the result of the operation.
-        */
-        bool Init(double threshold = 0.05, HashType type = Hash16x16, size_t number = 0, bool normalized = false)
-        {
-            static const size_t sizes[] = { 16, 32, 64 };
-            size_t size = sizes[type];
-
-            if (number >= 10000 && threshold < 0.10)
-                _matcher.reset(new Matcher_3D(threshold, size, number, normalized));
-            else if (number > 1000 && !normalized)
-                _matcher.reset(new Matcher_1D(threshold, size, number));
-            else
-                _matcher.reset(new Matcher_0D(threshold, size, number));
-            return (bool)_matcher;
-        }
-
-        /*!
-            Creates a hash for the given image.
-
-            \param [in] view - an input image.
-            \param [in] tag - a tag of arbitrary type.
-            \return a smart pointer to the Hash for image matching.
-        */
-        HashPtr Create(const View & view, const Tag & tag)
-        {
-            const size_t main = _matcher->main;
-            const size_t fast = _matcher->fast;
-
-            HashPtr hash(new Hash(tag, Square(main), Square(fast)));
-
-            View gray;
-            if (view.format == View::Gray8)
-                gray = view;
-            else
-            {
-                gray.Recreate(view.Size(), View::Gray8);
-                Simd::Convert(view, gray);
-            }
-
-            Simd::ResizeBilinear(gray, View(main, main, main, View::Gray8, hash->main).Ref());
-
-            size_t step = main / fast;
-            size_t area = Simd::Square(step);
-
-            for (size_t fast_y = 0; fast_y < fast; ++fast_y)
-            {
-                for (size_t fast_x = 0; fast_x < fast; ++fast_x)
-                {
-                    size_t sum = area / 2;
-                    for (size_t y = fast_y*step, y_end = y + step; y < y_end; ++y)
-                    {
-                        const uint8_t * pm = hash->main + y*main;
-                        for (size_t x = fast_x*step, x_end = x + step; x < x_end; ++x)
-                            sum += pm[x];
-                    }
-                    hash->fast[fast_y*fast + fast_x] = uint8_t(sum / area);
-                }
-            }
-
-            return hash;
-        }
-
-        /*!
-            Finds all similar images previously added to ImageMatcher for the given image.
-
-            \param [in] hash - a smart pointer to the hash of the image.
-            \param [out] results - a list of found similar images.
-            \return true if similar images were found.
-        */
-        bool Find(const HashPtr & hash, Results & results)
-        {
-            results.clear();
-            _matcher->Find(hash, results);
-            return results.size() != 0;
-        }
-
-        /*!
-            Adds the given image to ImageMatcher.
-
-            \param [in] hash - a smart pointer to the hash of the image.
-        */
-        void Add(const HashPtr & hash)
-        {
-            _matcher->Add(hash);
-        }
-
-        /*!
-            Excludes the image from subsequent searches in ImageMatcher.
-
-            \param [in] hash - a smart pointer to the hash of the image.
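-
-            A minimal sketch of combined use (illustrative only; image and tag are
-            hypothetical variables, the other names are the members documented above):
-            \verbatim
-            ImageMatcher::HashPtr hash = matcher.Create(image, tag);
-            matcher.Add(hash);
-            matcher.Skip(hash); // hash is now ignored by subsequent Find() calls
-            \endverbatim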
- */ - void Skip(const HashPtr & hash) - { - hash->skip = true; - } - - private: - struct Matcher - { - const size_t fast; - const size_t main; - - Matcher(double threshold, size_t size) - : fast(4) - , main(size) - , _fastSize(fast*fast) - , _mainSize(size*size) - , _size(0) - , _threshold(threshold) - { - _fastMax = uint64_t(Square(threshold*UINT8_MAX)*_fastSize); - _mainMax = uint64_t(Square(threshold*UINT8_MAX)*_mainSize); - } - - size_t Size() const { return _size; } - - virtual ~Matcher() {} - virtual void Add(const HashPtr & hash) = 0; - virtual void Find(const HashPtr & hash, Results & results) = 0; - - protected: - typedef std::vector Set; - typedef std::vector Sets; - Sets _sets; - size_t _fastSize, _mainSize, _size; - uint64_t _mainMax, _fastMax; - double _threshold; - - void AddIn(size_t index, const HashPtr & hash) - { - _sets[index].push_back(hash); - _size++; - } - - void FindIn(size_t index, const HashPtr & hash, Results & results) - { - const Set & set = _sets[index]; - for (size_t i = 0; i < set.size(); ++i) - { - double difference = 0; - if (Compare(set[i], hash, difference)) - results.push_back(Result(set[i].get(), difference)); - } - } - - bool Compare(const HashPtr & a, const HashPtr & b, double & difference) - { - if (a->skip || b->skip) - return false; - - uint64_t fastSum = 0; - ::SimdSquaredDifferenceSum(a->fast, _fastSize, b->fast, _fastSize, _fastSize, 1, &fastSum); - if (fastSum > _fastMax) - return false; - - uint64_t mainSum = 0; - ::SimdSquaredDifferenceSum(a->main, _mainSize, b->main, _mainSize, _mainSize, 1, &mainSum); - if (mainSum > _mainMax) - return false; - - difference = ::sqrt(double(mainSum) / _mainSize / UINT8_MAX / UINT8_MAX); - - return difference <= _threshold; - } - }; - typedef std::unique_ptr MatcherPtr; - MatcherPtr _matcher; - - struct Matcher_0D : public Matcher - { - Matcher_0D(double threshold, size_t size, size_t number) - : Matcher(threshold, size) - { - this->_sets.resize(1); - this->_sets[0].reserve(number); - } - - virtual void Add(const HashPtr & hash) - { - this->AddIn(0, hash); - } - - virtual void Find(const HashPtr & hash, Results & results) - { - this->FindIn(0, hash, results); - } - }; - - struct Matcher_1D : public Matcher - { - Matcher_1D(double threshold, size_t size, size_t number) - : Matcher(threshold, size) - , _range(256) - { - this->_sets.resize(_range); - _half = (int)ceil(double(_range)*threshold); - } - - virtual void Add(const HashPtr & hash) - { - this->AddIn(Get(hash), hash); - } - - virtual void Find(const HashPtr & hash, Results & results) - { - size_t index = Get(hash); - for (size_t i = std::max(index, _half) - _half, end = std::min(index + _half + 1, _range); i < end; ++i) - this->FindIn(i, hash, results); - } - - private: - size_t _range, _half; - - size_t Get(const HashPtr & hash) - { - size_t sum = 0; - for (size_t i = 0; i < this->_fastSize; ++i) - sum += hash->fast[i]; - return sum >> 4; - } - }; - - struct Matcher_3D : public Matcher - { - Matcher_3D(double threshold, size_t size, size_t number, bool normalized) - : Matcher(threshold, size) - , _normalized(normalized) - { - const int MAX_RANGES[] = { 96, 96, 96, 96, 96, 96, 80, 64, 56, 48, 48 }; - _maxRange = MAX_RANGES[int(threshold / 0.01)]; - - _shift.x = _maxRange >> 2; - _shift.y = _maxRange >> 2; - _shift.z = _normalized ? (_maxRange >> 2) : 0; - - _range.x = _maxRange >> 1; - _range.y = _maxRange >> 1; - _range.z = _normalized ? 
(_maxRange >> 1) : _maxRange;
-
-                _stride.x = 1;
-                _stride.y = _range.x;
-                _stride.z = _range.x*_range.y;
-
-                this->_sets.resize(_range.z*_range.x*_range.y);
-                _half = (int)ceil(double(_maxRange)*threshold);
-            }
-
-            virtual void Add(const HashPtr & hash)
-            {
-                Index i;
-                Get(hash, i);
-                this->AddIn(i.x*_stride.x + i.y*_stride.y + i.z*_stride.z, hash);
-            }
-
-            virtual void Find(const HashPtr & hash, Results & results)
-            {
-                Index i, lo, hi;
-                Get(hash, i);
-
-                lo.x = std::max(0, i.x - _half)*_stride.x;
-                lo.y = std::max(0, i.y - _half)*_stride.y;
-                lo.z = std::max(0, i.z - _half)*_stride.z;
-
-                hi.x = std::min(_range.x, i.x + _half + 1)*_stride.x;
-                hi.y = std::min(_range.y, i.y + _half + 1)*_stride.y;
-                hi.z = std::min(_range.z, i.z + _half + 1)*_stride.z;
-
-                for (int z = lo.z; z < hi.z; z += _stride.z)
-                    for (int y = lo.y; y < hi.y; y += _stride.y)
-                        for (int x = lo.x; x < hi.x; x += _stride.x)
-                            this->FindIn(x + y + z, hash, results);
-            }
-
-        private:
-            int _maxRange, _half;
-            bool _normalized;
-
-            struct Index
-            {
-                int x;
-                int y;
-                int z;
-            };
-            Index _shift, _range, _stride;
-
-            void Get(const HashPtr & hash, Index & index)
-            {
-                const uint8_t * p = hash->fast;
-                int s[2][2];
-                s[0][0] = p[0x0] + p[0x1] + p[0x4] + p[0x5];
-                s[0][1] = p[0x2] + p[0x3] + p[0x6] + p[0x7];
-                s[1][0] = p[0x8] + p[0x9] + p[0xC] + p[0xD];
-                s[1][1] = p[0xA] + p[0xB] + p[0xE] + p[0xF];
-
-                index.x = (s[0][0] - s[0][1] + s[1][0] - s[1][1] + 0x7FF)*_maxRange >> 12;
-                index.y = (s[0][0] + s[0][1] - s[1][0] - s[1][1] + 0x7FF)*_maxRange >> 12;
-                index.z = (s[0][0] + s[1][1] + (_normalized ? (0x7FF - s[1][0] - s[0][1]) : (s[1][0] + s[0][1])))*_maxRange >> 12;
-
-                index.x = std::max(0, std::min(_range.x - 1, index.x - _shift.x));
-                index.y = std::max(0, std::min(_range.y - 1, index.y - _shift.y));
-                index.z = std::max(0, std::min(_range.z - 1, index.z - _shift.z));
-            }
-        };
-    };
-}
-
-#endif//__SimdImageMatcher_hpp__
diff --git a/src/3rd/Simd/Simd/SimdInit.h b/src/3rd/Simd/Simd/SimdInit.h
deleted file mode 100644
index 3463646c..00000000
--- a/src/3rd/Simd/Simd/SimdInit.h
+++ /dev/null
@@ -1,587 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/ -#ifndef __SimdInit_h__ -#define __SimdInit_h__ - -#include "Simd/SimdDefs.h" - -namespace Simd -{ -#if defined(_MSC_VER) && (defined(SIMD_X64_ENABLE) || defined(SIMD_X86_ENABLE)) - - template SIMD_INLINE char GetChar(T value, size_t index) - { - return ((char*)&value)[index]; - } - -#define SIMD_AS_CHAR(a) char(a) - -#define SIMD_AS_2CHARS(a) \ - Simd::GetChar(int16_t(a), 0), Simd::GetChar(int16_t(a), 1) - -#define SIMD_AS_4CHARS(a) \ - Simd::GetChar(int32_t(a), 0), Simd::GetChar(int32_t(a), 1), \ - Simd::GetChar(int32_t(a), 2), Simd::GetChar(int32_t(a), 3) - -#define SIMD_AS_8CHARS(a) \ - Simd::GetChar(int64_t(a), 0), Simd::GetChar(int64_t(a), 1), \ - Simd::GetChar(int64_t(a), 2), Simd::GetChar(int64_t(a), 3), \ - Simd::GetChar(int64_t(a), 4), Simd::GetChar(int64_t(a), 5), \ - Simd::GetChar(int64_t(a), 6), Simd::GetChar(int64_t(a), 7) - -#elif defined(__GNUC__) || (defined(_MSC_VER) && defined(SIMD_NEON_ENABLE)) - -#define SIMD_CHAR_AS_LONGLONG(a) (((long long)a) & 0xFF) - -#define SIMD_SHORT_AS_LONGLONG(a) (((long long)a) & 0xFFFF) - -#define SIMD_INT_AS_LONGLONG(a) (((long long)a) & 0xFFFFFFFF) - -#define SIMD_LL_SET1_EPI8(a) \ - SIMD_CHAR_AS_LONGLONG(a) | (SIMD_CHAR_AS_LONGLONG(a) << 8) | \ - (SIMD_CHAR_AS_LONGLONG(a) << 16) | (SIMD_CHAR_AS_LONGLONG(a) << 24) | \ - (SIMD_CHAR_AS_LONGLONG(a) << 32) | (SIMD_CHAR_AS_LONGLONG(a) << 40) | \ - (SIMD_CHAR_AS_LONGLONG(a) << 48) | (SIMD_CHAR_AS_LONGLONG(a) << 56) - -#define SIMD_LL_SET2_EPI8(a, b) \ - SIMD_CHAR_AS_LONGLONG(a) | (SIMD_CHAR_AS_LONGLONG(b) << 8) | \ - (SIMD_CHAR_AS_LONGLONG(a) << 16) | (SIMD_CHAR_AS_LONGLONG(b) << 24) | \ - (SIMD_CHAR_AS_LONGLONG(a) << 32) | (SIMD_CHAR_AS_LONGLONG(b) << 40) | \ - (SIMD_CHAR_AS_LONGLONG(a) << 48) | (SIMD_CHAR_AS_LONGLONG(b) << 56) - -#define SIMD_LL_SETR_EPI8(a, b, c, d, e, f, g, h) \ - SIMD_CHAR_AS_LONGLONG(a) | (SIMD_CHAR_AS_LONGLONG(b) << 8) | \ - (SIMD_CHAR_AS_LONGLONG(c) << 16) | (SIMD_CHAR_AS_LONGLONG(d) << 24) | \ - (SIMD_CHAR_AS_LONGLONG(e) << 32) | (SIMD_CHAR_AS_LONGLONG(f) << 40) | \ - (SIMD_CHAR_AS_LONGLONG(g) << 48) | (SIMD_CHAR_AS_LONGLONG(h) << 56) - -#define SIMD_LL_SET1_EPI16(a) \ - SIMD_SHORT_AS_LONGLONG(a) | (SIMD_SHORT_AS_LONGLONG(a) << 16) | \ - (SIMD_SHORT_AS_LONGLONG(a) << 32) | (SIMD_SHORT_AS_LONGLONG(a) << 48) - -#define SIMD_LL_SET2_EPI16(a, b) \ - SIMD_SHORT_AS_LONGLONG(a) | (SIMD_SHORT_AS_LONGLONG(b) << 16) | \ - (SIMD_SHORT_AS_LONGLONG(a) << 32) | (SIMD_SHORT_AS_LONGLONG(b) << 48) - -#define SIMD_LL_SETR_EPI16(a, b, c, d) \ - SIMD_SHORT_AS_LONGLONG(a) | (SIMD_SHORT_AS_LONGLONG(b) << 16) | \ - (SIMD_SHORT_AS_LONGLONG(c) << 32) | (SIMD_SHORT_AS_LONGLONG(d) << 48) - -#define SIMD_LL_SET1_EPI32(a) \ - SIMD_INT_AS_LONGLONG(a) | (SIMD_INT_AS_LONGLONG(a) << 32) - -#define SIMD_LL_SET2_EPI32(a, b) \ - SIMD_INT_AS_LONGLONG(a) | (SIMD_INT_AS_LONGLONG(b) << 32) - -#endif//defined(__GNUC__) || (defined(_MSC_VER) && defined(SIMD_NEON_ENABLE)) - -#if defined(SIMD_SSE2_ENABLE) - -#if defined(_MSC_VER) - -#define SIMD_MM_SET1_EPI8(a) \ - {SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), \ - SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), \ - SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), \ - SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a)} - -#define SIMD_MM_SET2_EPI8(a0, a1) \ - {SIMD_AS_CHAR(a0), SIMD_AS_CHAR(a1), SIMD_AS_CHAR(a0), SIMD_AS_CHAR(a1), \ - SIMD_AS_CHAR(a0), SIMD_AS_CHAR(a1), SIMD_AS_CHAR(a0), SIMD_AS_CHAR(a1), \ - SIMD_AS_CHAR(a0), SIMD_AS_CHAR(a1), SIMD_AS_CHAR(a0), SIMD_AS_CHAR(a1), \ - 
SIMD_AS_CHAR(a0), SIMD_AS_CHAR(a1), SIMD_AS_CHAR(a0), SIMD_AS_CHAR(a1)} - -#define SIMD_MM_SETR_EPI8(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, aa, ab, ac, ad, ae, af) \ - {SIMD_AS_CHAR(a0), SIMD_AS_CHAR(a1), SIMD_AS_CHAR(a2), SIMD_AS_CHAR(a3), \ - SIMD_AS_CHAR(a4), SIMD_AS_CHAR(a5), SIMD_AS_CHAR(a6), SIMD_AS_CHAR(a7), \ - SIMD_AS_CHAR(a8), SIMD_AS_CHAR(a9), SIMD_AS_CHAR(aa), SIMD_AS_CHAR(ab), \ - SIMD_AS_CHAR(ac), SIMD_AS_CHAR(ad), SIMD_AS_CHAR(ae), SIMD_AS_CHAR(af)} - -#define SIMD_MM_SET1_EPI16(a) \ - {SIMD_AS_2CHARS(a), SIMD_AS_2CHARS(a), SIMD_AS_2CHARS(a), SIMD_AS_2CHARS(a), \ - SIMD_AS_2CHARS(a), SIMD_AS_2CHARS(a), SIMD_AS_2CHARS(a), SIMD_AS_2CHARS(a)} - -#define SIMD_MM_SET2_EPI16(a0, a1) \ - {SIMD_AS_2CHARS(a0), SIMD_AS_2CHARS(a1), SIMD_AS_2CHARS(a0), SIMD_AS_2CHARS(a1), \ - SIMD_AS_2CHARS(a0), SIMD_AS_2CHARS(a1), SIMD_AS_2CHARS(a0), SIMD_AS_2CHARS(a1)} - -#define SIMD_MM_SETR_EPI16(a0, a1, a2, a3, a4, a5, a6, a7) \ - {SIMD_AS_2CHARS(a0), SIMD_AS_2CHARS(a1), SIMD_AS_2CHARS(a2), SIMD_AS_2CHARS(a3), \ - SIMD_AS_2CHARS(a4), SIMD_AS_2CHARS(a5), SIMD_AS_2CHARS(a6), SIMD_AS_2CHARS(a7)} - -#define SIMD_MM_SET1_EPI32(a) \ - {SIMD_AS_4CHARS(a), SIMD_AS_4CHARS(a), SIMD_AS_4CHARS(a), SIMD_AS_4CHARS(a)} - -#define SIMD_MM_SET2_EPI32(a0, a1) \ - {SIMD_AS_4CHARS(a0), SIMD_AS_4CHARS(a1), SIMD_AS_4CHARS(a0), SIMD_AS_4CHARS(a1)} - -#define SIMD_MM_SETR_EPI32(a0, a1, a2, a3) \ - {SIMD_AS_4CHARS(a0), SIMD_AS_4CHARS(a1), SIMD_AS_4CHARS(a2), SIMD_AS_4CHARS(a3)} - -#define SIMD_MM_SET1_EPI64(a) \ - {SIMD_AS_8CHARS(a), SIMD_AS_8CHARS(a)} - -#define SIMD_MM_SET2_EPI64(a0, a1) \ - {SIMD_AS_8CHARS(a0), SIMD_AS_8CHARS(a1)} - -#define SIMD_MM_SETR_EPI64(a0, a1) \ - {SIMD_AS_8CHARS(a0), SIMD_AS_8CHARS(a1)} - -#elif defined(__GNUC__) - -#define SIMD_MM_SET1_EPI8(a) \ - {SIMD_LL_SET1_EPI8(a), SIMD_LL_SET1_EPI8(a)} - -#define SIMD_MM_SET2_EPI8(a0, a1) \ - {SIMD_LL_SET2_EPI8(a0, a1), SIMD_LL_SET2_EPI8(a0, a1)} - -#define SIMD_MM_SETR_EPI8(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, aa, ab, ac, ad, ae, af) \ - {SIMD_LL_SETR_EPI8(a0, a1, a2, a3, a4, a5, a6, a7), SIMD_LL_SETR_EPI8(a8, a9, aa, ab, ac, ad, ae, af)} - -#define SIMD_MM_SET1_EPI16(a) \ - {SIMD_LL_SET1_EPI16(a), SIMD_LL_SET1_EPI16(a)} - -#define SIMD_MM_SET2_EPI16(a0, a1) \ - {SIMD_LL_SET2_EPI16(a0, a1), SIMD_LL_SET2_EPI16(a0, a1)} - -#define SIMD_MM_SETR_EPI16(a0, a1, a2, a3, a4, a5, a6, a7) \ - {SIMD_LL_SETR_EPI16(a0, a1, a2, a3), SIMD_LL_SETR_EPI16(a4, a5, a6, a7)} - -#define SIMD_MM_SET1_EPI32(a) \ - {SIMD_LL_SET1_EPI32(a), SIMD_LL_SET1_EPI32(a)} - -#define SIMD_MM_SET2_EPI32(a0, a1) \ - {SIMD_LL_SET2_EPI32(a0, a1), SIMD_LL_SET2_EPI32(a0, a1)} - -#define SIMD_MM_SETR_EPI32(a0, a1, a2, a3) \ - {SIMD_LL_SET2_EPI32(a0, a1), SIMD_LL_SET2_EPI32(a2, a3)} - -#define SIMD_MM_SET1_EPI64(a) \ - {a, a} - -#define SIMD_MM_SET2_EPI64(a0, a1) \ - {a0, a1} - -#define SIMD_MM_SETR_EPI64(a0, a1) \ - {a0, a1} - -#endif// defined(_MSC_VER) || defined(__GNUC__) - -#endif// SIMD_SSE2_ENABLE - -#if defined(SIMD_AVX2_ENABLE) - -#if defined(_MSC_VER) - -#define SIMD_MM256_SET1_EPI8(a) \ - {SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), \ - SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), \ - SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), \ - SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), \ - SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), \ - SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), \ - SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), \ 
- SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a)} - -#define SIMD_MM256_SET2_EPI8(a0, a1) \ - {SIMD_AS_CHAR(a0), SIMD_AS_CHAR(a1), SIMD_AS_CHAR(a0), SIMD_AS_CHAR(a1), \ - SIMD_AS_CHAR(a0), SIMD_AS_CHAR(a1), SIMD_AS_CHAR(a0), SIMD_AS_CHAR(a1), \ - SIMD_AS_CHAR(a0), SIMD_AS_CHAR(a1), SIMD_AS_CHAR(a0), SIMD_AS_CHAR(a1), \ - SIMD_AS_CHAR(a0), SIMD_AS_CHAR(a1), SIMD_AS_CHAR(a0), SIMD_AS_CHAR(a1), \ - SIMD_AS_CHAR(a0), SIMD_AS_CHAR(a1), SIMD_AS_CHAR(a0), SIMD_AS_CHAR(a1), \ - SIMD_AS_CHAR(a0), SIMD_AS_CHAR(a1), SIMD_AS_CHAR(a0), SIMD_AS_CHAR(a1), \ - SIMD_AS_CHAR(a0), SIMD_AS_CHAR(a1), SIMD_AS_CHAR(a0), SIMD_AS_CHAR(a1), \ - SIMD_AS_CHAR(a0), SIMD_AS_CHAR(a1), SIMD_AS_CHAR(a0), SIMD_AS_CHAR(a1)} - -#define SIMD_MM256_SETR_EPI8(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, aa, ab, ac, ad, ae, af, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf) \ - {SIMD_AS_CHAR(a0), SIMD_AS_CHAR(a1), SIMD_AS_CHAR(a2), SIMD_AS_CHAR(a3), \ - SIMD_AS_CHAR(a4), SIMD_AS_CHAR(a5), SIMD_AS_CHAR(a6), SIMD_AS_CHAR(a7), \ - SIMD_AS_CHAR(a8), SIMD_AS_CHAR(a9), SIMD_AS_CHAR(aa), SIMD_AS_CHAR(ab), \ - SIMD_AS_CHAR(ac), SIMD_AS_CHAR(ad), SIMD_AS_CHAR(ae), SIMD_AS_CHAR(af), \ - SIMD_AS_CHAR(b0), SIMD_AS_CHAR(b1), SIMD_AS_CHAR(b2), SIMD_AS_CHAR(b3), \ - SIMD_AS_CHAR(b4), SIMD_AS_CHAR(b5), SIMD_AS_CHAR(b6), SIMD_AS_CHAR(b7), \ - SIMD_AS_CHAR(b8), SIMD_AS_CHAR(b9), SIMD_AS_CHAR(ba), SIMD_AS_CHAR(bb), \ - SIMD_AS_CHAR(bc), SIMD_AS_CHAR(bd), SIMD_AS_CHAR(be), SIMD_AS_CHAR(bf)} - -#define SIMD_MM256_SET1_EPI16(a) \ - {SIMD_AS_2CHARS(a), SIMD_AS_2CHARS(a), SIMD_AS_2CHARS(a), SIMD_AS_2CHARS(a), \ - SIMD_AS_2CHARS(a), SIMD_AS_2CHARS(a), SIMD_AS_2CHARS(a), SIMD_AS_2CHARS(a), \ - SIMD_AS_2CHARS(a), SIMD_AS_2CHARS(a), SIMD_AS_2CHARS(a), SIMD_AS_2CHARS(a), \ - SIMD_AS_2CHARS(a), SIMD_AS_2CHARS(a), SIMD_AS_2CHARS(a), SIMD_AS_2CHARS(a)} - -#define SIMD_MM256_SET2_EPI16(a0, a1) \ - {SIMD_AS_2CHARS(a0), SIMD_AS_2CHARS(a1), SIMD_AS_2CHARS(a0), SIMD_AS_2CHARS(a1), \ - SIMD_AS_2CHARS(a0), SIMD_AS_2CHARS(a1), SIMD_AS_2CHARS(a0), SIMD_AS_2CHARS(a1), \ - SIMD_AS_2CHARS(a0), SIMD_AS_2CHARS(a1), SIMD_AS_2CHARS(a0), SIMD_AS_2CHARS(a1), \ - SIMD_AS_2CHARS(a0), SIMD_AS_2CHARS(a1), SIMD_AS_2CHARS(a0), SIMD_AS_2CHARS(a1)} - -#define SIMD_MM256_SETR_EPI16(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, aa, ab, ac, ad, ae, af) \ - {SIMD_AS_2CHARS(a0), SIMD_AS_2CHARS(a1), SIMD_AS_2CHARS(a2), SIMD_AS_2CHARS(a3), \ - SIMD_AS_2CHARS(a4), SIMD_AS_2CHARS(a5), SIMD_AS_2CHARS(a6), SIMD_AS_2CHARS(a7), \ - SIMD_AS_2CHARS(a8), SIMD_AS_2CHARS(a9), SIMD_AS_2CHARS(aa), SIMD_AS_2CHARS(ab), \ - SIMD_AS_2CHARS(ac), SIMD_AS_2CHARS(ad), SIMD_AS_2CHARS(ae), SIMD_AS_2CHARS(af)} - -#define SIMD_MM256_SET1_EPI32(a) \ - {SIMD_AS_4CHARS(a), SIMD_AS_4CHARS(a), SIMD_AS_4CHARS(a), SIMD_AS_4CHARS(a), \ - SIMD_AS_4CHARS(a), SIMD_AS_4CHARS(a), SIMD_AS_4CHARS(a), SIMD_AS_4CHARS(a)} - -#define SIMD_MM256_SET2_EPI32(a0, a1) \ - {SIMD_AS_4CHARS(a0), SIMD_AS_4CHARS(a1), SIMD_AS_4CHARS(a0), SIMD_AS_4CHARS(a1), \ - SIMD_AS_4CHARS(a0), SIMD_AS_4CHARS(a1), SIMD_AS_4CHARS(a0), SIMD_AS_4CHARS(a1)} - -#define SIMD_MM256_SETR_EPI32(a0, a1, a2, a3, a4, a5, a6, a7) \ - {SIMD_AS_4CHARS(a0), SIMD_AS_4CHARS(a1), SIMD_AS_4CHARS(a2), SIMD_AS_4CHARS(a3), \ - SIMD_AS_4CHARS(a4), SIMD_AS_4CHARS(a5), SIMD_AS_4CHARS(a6), SIMD_AS_4CHARS(a7)} - -#define SIMD_MM256_SET1_EPI64(a) \ - {SIMD_AS_8CHARS(a), SIMD_AS_8CHARS(a), SIMD_AS_8CHARS(a), SIMD_AS_8CHARS(a)} - -#define SIMD_MM256_SET2_EPI64(a0, a1) \ - {SIMD_AS_8CHARS(a0), SIMD_AS_8CHARS(a1), SIMD_AS_8CHARS(a0), SIMD_AS_8CHARS(a1)} - -#define 
SIMD_MM256_SETR_EPI64(a0, a1, a2, a3) \ - {SIMD_AS_8CHARS(a0), SIMD_AS_8CHARS(a1), SIMD_AS_8CHARS(a2), SIMD_AS_8CHARS(a3)} - -#elif defined(__GNUC__) - -#define SIMD_MM256_SET1_EPI8(a) \ - {SIMD_LL_SET1_EPI8(a), SIMD_LL_SET1_EPI8(a), \ - SIMD_LL_SET1_EPI8(a), SIMD_LL_SET1_EPI8(a)} - -#define SIMD_MM256_SET2_EPI8(a0, a1) \ - {SIMD_LL_SET2_EPI8(a0, a1), SIMD_LL_SET2_EPI8(a0, a1), \ - SIMD_LL_SET2_EPI8(a0, a1), SIMD_LL_SET2_EPI8(a0, a1)} - -#define SIMD_MM256_SETR_EPI8(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, aa, ab, ac, ad, ae, af, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf) \ - {SIMD_LL_SETR_EPI8(a0, a1, a2, a3, a4, a5, a6, a7), SIMD_LL_SETR_EPI8(a8, a9, aa, ab, ac, ad, ae, af), \ - SIMD_LL_SETR_EPI8(b0, b1, b2, b3, b4, b5, b6, b7), SIMD_LL_SETR_EPI8(b8, b9, ba, bb, bc, bd, be, bf)} - -#define SIMD_MM256_SET1_EPI16(a) \ - {SIMD_LL_SET1_EPI16(a), SIMD_LL_SET1_EPI16(a), \ - SIMD_LL_SET1_EPI16(a), SIMD_LL_SET1_EPI16(a)} - -#define SIMD_MM256_SET2_EPI16(a0, a1) \ - {SIMD_LL_SET2_EPI16(a0, a1), SIMD_LL_SET2_EPI16(a0, a1), \ - SIMD_LL_SET2_EPI16(a0, a1), SIMD_LL_SET2_EPI16(a0, a1)} - -#define SIMD_MM256_SETR_EPI16(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, aa, ab, ac, ad, ae, af) \ - {SIMD_LL_SETR_EPI16(a0, a1, a2, a3), SIMD_LL_SETR_EPI16(a4, a5, a6, a7), \ - SIMD_LL_SETR_EPI16(a8, a9, aa, ab), SIMD_LL_SETR_EPI16(ac, ad, ae, af)} - -#define SIMD_MM256_SET1_EPI32(a) \ - {SIMD_LL_SET1_EPI32(a), SIMD_LL_SET1_EPI32(a), \ - SIMD_LL_SET1_EPI32(a), SIMD_LL_SET1_EPI32(a)} - -#define SIMD_MM256_SET2_EPI32(a0, a1) \ - {SIMD_LL_SET2_EPI32(a0, a1), SIMD_LL_SET2_EPI32(a0, a1), \ - SIMD_LL_SET2_EPI32(a0, a1), SIMD_LL_SET2_EPI32(a0, a1)} - -#define SIMD_MM256_SETR_EPI32(a0, a1, a2, a3, a4, a5, a6, a7) \ - {SIMD_LL_SET2_EPI32(a0, a1), SIMD_LL_SET2_EPI32(a2, a3), \ - SIMD_LL_SET2_EPI32(a4, a5), SIMD_LL_SET2_EPI32(a6, a7)} - -#define SIMD_MM256_SET1_EPI64(a) \ - {a, a, a, a} - -#define SIMD_MM256_SET2_EPI64(a0, a1) \ - {a0, a1, a0, a1} - -#define SIMD_MM256_SETR_EPI64(a0, a1, a2, a3) \ - {a0, a1, a2, a3} - -#endif// defined(_MSC_VER) || defined(__GNUC__) - -#endif// SIMD_AVX2_ENABLE - -#if defined(SIMD_AVX512F_ENABLE) || defined(SIMD_AVX512BW_ENABLE) - -#if defined(_MSC_VER) - -#define SIMD_MM512_SET1_EPI8(a) \ - {SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), \ - SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), \ - SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), \ - SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), \ - SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), \ - SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), \ - SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), \ - SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a)} - -#define SIMD_MM512_SET2_EPI8(a0, a1) \ - {SIMD_AS_CHAR(a0), SIMD_AS_CHAR(a1), SIMD_AS_CHAR(a0), SIMD_AS_CHAR(a1), SIMD_AS_CHAR(a0), SIMD_AS_CHAR(a1), SIMD_AS_CHAR(a0), SIMD_AS_CHAR(a1), \ - 
SIMD_AS_CHAR(a0), SIMD_AS_CHAR(a1), SIMD_AS_CHAR(a0), SIMD_AS_CHAR(a1), SIMD_AS_CHAR(a0), SIMD_AS_CHAR(a1), SIMD_AS_CHAR(a0), SIMD_AS_CHAR(a1), \ - SIMD_AS_CHAR(a0), SIMD_AS_CHAR(a1), SIMD_AS_CHAR(a0), SIMD_AS_CHAR(a1), SIMD_AS_CHAR(a0), SIMD_AS_CHAR(a1), SIMD_AS_CHAR(a0), SIMD_AS_CHAR(a1), \ - SIMD_AS_CHAR(a0), SIMD_AS_CHAR(a1), SIMD_AS_CHAR(a0), SIMD_AS_CHAR(a1), SIMD_AS_CHAR(a0), SIMD_AS_CHAR(a1), SIMD_AS_CHAR(a0), SIMD_AS_CHAR(a1), \ - SIMD_AS_CHAR(a0), SIMD_AS_CHAR(a1), SIMD_AS_CHAR(a0), SIMD_AS_CHAR(a1), SIMD_AS_CHAR(a0), SIMD_AS_CHAR(a1), SIMD_AS_CHAR(a0), SIMD_AS_CHAR(a1), \ - SIMD_AS_CHAR(a0), SIMD_AS_CHAR(a1), SIMD_AS_CHAR(a0), SIMD_AS_CHAR(a1), SIMD_AS_CHAR(a0), SIMD_AS_CHAR(a1), SIMD_AS_CHAR(a0), SIMD_AS_CHAR(a1), \ - SIMD_AS_CHAR(a0), SIMD_AS_CHAR(a1), SIMD_AS_CHAR(a0), SIMD_AS_CHAR(a1), SIMD_AS_CHAR(a0), SIMD_AS_CHAR(a1), SIMD_AS_CHAR(a0), SIMD_AS_CHAR(a1), \ - SIMD_AS_CHAR(a0), SIMD_AS_CHAR(a1), SIMD_AS_CHAR(a0), SIMD_AS_CHAR(a1), SIMD_AS_CHAR(a0), SIMD_AS_CHAR(a1), SIMD_AS_CHAR(a0), SIMD_AS_CHAR(a1)} - -#define SIMD_MM512_SETR_EPI8(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, aa, ab, ac, ad, ae, af, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, ca, cb, cc, cd, ce, cf, d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, da, db, dc, dd, de, df) \ - {SIMD_AS_CHAR(a0), SIMD_AS_CHAR(a1), SIMD_AS_CHAR(a2), SIMD_AS_CHAR(a3), SIMD_AS_CHAR(a4), SIMD_AS_CHAR(a5), SIMD_AS_CHAR(a6), SIMD_AS_CHAR(a7), \ - SIMD_AS_CHAR(a8), SIMD_AS_CHAR(a9), SIMD_AS_CHAR(aa), SIMD_AS_CHAR(ab), SIMD_AS_CHAR(ac), SIMD_AS_CHAR(ad), SIMD_AS_CHAR(ae), SIMD_AS_CHAR(af), \ - SIMD_AS_CHAR(b0), SIMD_AS_CHAR(b1), SIMD_AS_CHAR(b2), SIMD_AS_CHAR(b3), SIMD_AS_CHAR(b4), SIMD_AS_CHAR(b5), SIMD_AS_CHAR(b6), SIMD_AS_CHAR(b7), \ - SIMD_AS_CHAR(b8), SIMD_AS_CHAR(b9), SIMD_AS_CHAR(ba), SIMD_AS_CHAR(bb), SIMD_AS_CHAR(bc), SIMD_AS_CHAR(bd), SIMD_AS_CHAR(be), SIMD_AS_CHAR(bf), \ - SIMD_AS_CHAR(c0), SIMD_AS_CHAR(c1), SIMD_AS_CHAR(c2), SIMD_AS_CHAR(c3), SIMD_AS_CHAR(c4), SIMD_AS_CHAR(c5), SIMD_AS_CHAR(c6), SIMD_AS_CHAR(c7), \ - SIMD_AS_CHAR(c8), SIMD_AS_CHAR(c9), SIMD_AS_CHAR(ca), SIMD_AS_CHAR(cb), SIMD_AS_CHAR(cc), SIMD_AS_CHAR(cd), SIMD_AS_CHAR(ce), SIMD_AS_CHAR(cf), \ - SIMD_AS_CHAR(d0), SIMD_AS_CHAR(d1), SIMD_AS_CHAR(d2), SIMD_AS_CHAR(d3), SIMD_AS_CHAR(d4), SIMD_AS_CHAR(d5), SIMD_AS_CHAR(d6), SIMD_AS_CHAR(d7), \ - SIMD_AS_CHAR(d8), SIMD_AS_CHAR(d9), SIMD_AS_CHAR(da), SIMD_AS_CHAR(db), SIMD_AS_CHAR(dc), SIMD_AS_CHAR(dd), SIMD_AS_CHAR(de), SIMD_AS_CHAR(df)} - -#define SIMD_MM512_SET1_EPI16(a) \ - {SIMD_AS_2CHARS(a), SIMD_AS_2CHARS(a), SIMD_AS_2CHARS(a), SIMD_AS_2CHARS(a), SIMD_AS_2CHARS(a), SIMD_AS_2CHARS(a), SIMD_AS_2CHARS(a), SIMD_AS_2CHARS(a), \ - SIMD_AS_2CHARS(a), SIMD_AS_2CHARS(a), SIMD_AS_2CHARS(a), SIMD_AS_2CHARS(a), SIMD_AS_2CHARS(a), SIMD_AS_2CHARS(a), SIMD_AS_2CHARS(a), SIMD_AS_2CHARS(a), \ - SIMD_AS_2CHARS(a), SIMD_AS_2CHARS(a), SIMD_AS_2CHARS(a), SIMD_AS_2CHARS(a), SIMD_AS_2CHARS(a), SIMD_AS_2CHARS(a), SIMD_AS_2CHARS(a), SIMD_AS_2CHARS(a), \ - SIMD_AS_2CHARS(a), SIMD_AS_2CHARS(a), SIMD_AS_2CHARS(a), SIMD_AS_2CHARS(a), SIMD_AS_2CHARS(a), SIMD_AS_2CHARS(a), SIMD_AS_2CHARS(a), SIMD_AS_2CHARS(a)} - -#define SIMD_MM512_SET2_EPI16(a0, a1) \ - {SIMD_AS_2CHARS(a0), SIMD_AS_2CHARS(a1), SIMD_AS_2CHARS(a0), SIMD_AS_2CHARS(a1), SIMD_AS_2CHARS(a0), SIMD_AS_2CHARS(a1), SIMD_AS_2CHARS(a0), SIMD_AS_2CHARS(a1), \ - SIMD_AS_2CHARS(a0), SIMD_AS_2CHARS(a1), SIMD_AS_2CHARS(a0), SIMD_AS_2CHARS(a1), SIMD_AS_2CHARS(a0), SIMD_AS_2CHARS(a1), SIMD_AS_2CHARS(a0), SIMD_AS_2CHARS(a1), \ - 
SIMD_AS_2CHARS(a0), SIMD_AS_2CHARS(a1), SIMD_AS_2CHARS(a0), SIMD_AS_2CHARS(a1), SIMD_AS_2CHARS(a0), SIMD_AS_2CHARS(a1), SIMD_AS_2CHARS(a0), SIMD_AS_2CHARS(a1), \ - SIMD_AS_2CHARS(a0), SIMD_AS_2CHARS(a1), SIMD_AS_2CHARS(a0), SIMD_AS_2CHARS(a1), SIMD_AS_2CHARS(a0), SIMD_AS_2CHARS(a1), SIMD_AS_2CHARS(a0), SIMD_AS_2CHARS(a1)} - -#define SIMD_MM512_SETR_EPI16(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, aa, ab, ac, ad, ae, af, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf) \ - {SIMD_AS_2CHARS(a0), SIMD_AS_2CHARS(a1), SIMD_AS_2CHARS(a2), SIMD_AS_2CHARS(a3), SIMD_AS_2CHARS(a4), SIMD_AS_2CHARS(a5), SIMD_AS_2CHARS(a6), SIMD_AS_2CHARS(a7), \ - SIMD_AS_2CHARS(a8), SIMD_AS_2CHARS(a9), SIMD_AS_2CHARS(aa), SIMD_AS_2CHARS(ab), SIMD_AS_2CHARS(ac), SIMD_AS_2CHARS(ad), SIMD_AS_2CHARS(ae), SIMD_AS_2CHARS(af), \ - SIMD_AS_2CHARS(b0), SIMD_AS_2CHARS(b1), SIMD_AS_2CHARS(b2), SIMD_AS_2CHARS(b3), SIMD_AS_2CHARS(b4), SIMD_AS_2CHARS(b5), SIMD_AS_2CHARS(b6), SIMD_AS_2CHARS(b7), \ - SIMD_AS_2CHARS(b8), SIMD_AS_2CHARS(b9), SIMD_AS_2CHARS(ba), SIMD_AS_2CHARS(bb), SIMD_AS_2CHARS(bc), SIMD_AS_2CHARS(bd), SIMD_AS_2CHARS(be), SIMD_AS_2CHARS(bf)} - -#define SIMD_MM512_SET1_EPI32(a) \ - {SIMD_AS_4CHARS(a), SIMD_AS_4CHARS(a), SIMD_AS_4CHARS(a), SIMD_AS_4CHARS(a), SIMD_AS_4CHARS(a), SIMD_AS_4CHARS(a), SIMD_AS_4CHARS(a), SIMD_AS_4CHARS(a), \ - SIMD_AS_4CHARS(a), SIMD_AS_4CHARS(a), SIMD_AS_4CHARS(a), SIMD_AS_4CHARS(a), SIMD_AS_4CHARS(a), SIMD_AS_4CHARS(a), SIMD_AS_4CHARS(a), SIMD_AS_4CHARS(a)} - -#define SIMD_MM512_SET2_EPI32(a0, a1) \ - {SIMD_AS_4CHARS(a0), SIMD_AS_4CHARS(a1), SIMD_AS_4CHARS(a0), SIMD_AS_4CHARS(a1), SIMD_AS_4CHARS(a0), SIMD_AS_4CHARS(a1), SIMD_AS_4CHARS(a0), SIMD_AS_4CHARS(a1), \ - SIMD_AS_4CHARS(a0), SIMD_AS_4CHARS(a1), SIMD_AS_4CHARS(a0), SIMD_AS_4CHARS(a1), SIMD_AS_4CHARS(a0), SIMD_AS_4CHARS(a1), SIMD_AS_4CHARS(a0), SIMD_AS_4CHARS(a1)} - -#define SIMD_MM512_SETR_EPI32(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, aa, ab, ac, ad, ae, af) \ - {SIMD_AS_4CHARS(a0), SIMD_AS_4CHARS(a1), SIMD_AS_4CHARS(a2), SIMD_AS_4CHARS(a3), SIMD_AS_4CHARS(a4), SIMD_AS_4CHARS(a5), SIMD_AS_4CHARS(a6), SIMD_AS_4CHARS(a7), \ - SIMD_AS_4CHARS(a8), SIMD_AS_4CHARS(a9), SIMD_AS_4CHARS(aa), SIMD_AS_4CHARS(ab), SIMD_AS_4CHARS(ac), SIMD_AS_4CHARS(ad), SIMD_AS_4CHARS(ae), SIMD_AS_4CHARS(af)} - -#define SIMD_MM512_SET1_EPI64(a) \ - {SIMD_AS_8CHARS(a), SIMD_AS_8CHARS(a), SIMD_AS_8CHARS(a), SIMD_AS_8CHARS(a), SIMD_AS_8CHARS(a), SIMD_AS_8CHARS(a), SIMD_AS_8CHARS(a), SIMD_AS_8CHARS(a)} - -#define SIMD_MM512_SET2_EPI64(a0, a1) \ - {SIMD_AS_8CHARS(a0), SIMD_AS_8CHARS(a1), SIMD_AS_8CHARS(a0), SIMD_AS_8CHARS(a1), SIMD_AS_8CHARS(a0), SIMD_AS_8CHARS(a1), SIMD_AS_8CHARS(a0), SIMD_AS_8CHARS(a1)} - -#define SIMD_MM512_SETR_EPI64(a0, a1, a2, a3, a4, a5, a6, a7) \ - {SIMD_AS_8CHARS(a0), SIMD_AS_8CHARS(a1), SIMD_AS_8CHARS(a2), SIMD_AS_8CHARS(a3), SIMD_AS_8CHARS(a4), SIMD_AS_8CHARS(a5), SIMD_AS_8CHARS(a6), SIMD_AS_8CHARS(a7)} - -#elif defined(__GNUC__) - -#define SIMD_MM512_SET1_EPI8(a) \ - {SIMD_LL_SET1_EPI8(a), SIMD_LL_SET1_EPI8(a), SIMD_LL_SET1_EPI8(a), SIMD_LL_SET1_EPI8(a), \ - SIMD_LL_SET1_EPI8(a), SIMD_LL_SET1_EPI8(a), SIMD_LL_SET1_EPI8(a), SIMD_LL_SET1_EPI8(a)} - -#define SIMD_MM512_SET2_EPI8(a0, a1) \ - {SIMD_LL_SET2_EPI8(a0, a1), SIMD_LL_SET2_EPI8(a0, a1), SIMD_LL_SET2_EPI8(a0, a1), SIMD_LL_SET2_EPI8(a0, a1), \ - SIMD_LL_SET2_EPI8(a0, a1), SIMD_LL_SET2_EPI8(a0, a1), SIMD_LL_SET2_EPI8(a0, a1), SIMD_LL_SET2_EPI8(a0, a1)} - -#define SIMD_MM512_SETR_EPI8(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, aa, ab, ac, ad, ae, af, b0, b1, b2, b3, b4, b5, b6, 
b7, b8, b9, ba, bb, bc, bd, be, bf, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, ca, cb, cc, cd, ce, cf, d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, da, db, dc, dd, de, df) \ - {SIMD_LL_SETR_EPI8(a0, a1, a2, a3, a4, a5, a6, a7), SIMD_LL_SETR_EPI8(a8, a9, aa, ab, ac, ad, ae, af), \ - SIMD_LL_SETR_EPI8(b0, b1, b2, b3, b4, b5, b6, b7), SIMD_LL_SETR_EPI8(b8, b9, ba, bb, bc, bd, be, bf), \ - SIMD_LL_SETR_EPI8(c0, c1, c2, c3, c4, c5, c6, c7), SIMD_LL_SETR_EPI8(c8, c9, ca, cb, cc, cd, ce, cf), \ - SIMD_LL_SETR_EPI8(d0, d1, d2, d3, d4, d5, d6, d7), SIMD_LL_SETR_EPI8(d8, d9, da, db, dc, dd, de, df)} - -#define SIMD_MM512_SET1_EPI16(a) \ - {SIMD_LL_SET1_EPI16(a), SIMD_LL_SET1_EPI16(a), SIMD_LL_SET1_EPI16(a), SIMD_LL_SET1_EPI16(a), \ - SIMD_LL_SET1_EPI16(a), SIMD_LL_SET1_EPI16(a), SIMD_LL_SET1_EPI16(a), SIMD_LL_SET1_EPI16(a)} - -#define SIMD_MM512_SET2_EPI16(a0, a1) \ - {SIMD_LL_SET2_EPI16(a0, a1), SIMD_LL_SET2_EPI16(a0, a1), SIMD_LL_SET2_EPI16(a0, a1), SIMD_LL_SET2_EPI16(a0, a1), \ - SIMD_LL_SET2_EPI16(a0, a1), SIMD_LL_SET2_EPI16(a0, a1), SIMD_LL_SET2_EPI16(a0, a1), SIMD_LL_SET2_EPI16(a0, a1)} - -#define SIMD_MM512_SETR_EPI16(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, aa, ab, ac, ad, ae, af, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf) \ - {SIMD_LL_SETR_EPI16(a0, a1, a2, a3), SIMD_LL_SETR_EPI16(a4, a5, a6, a7), SIMD_LL_SETR_EPI16(a8, a9, aa, ab), SIMD_LL_SETR_EPI16(ac, ad, ae, af), \ - SIMD_LL_SETR_EPI16(b0, b1, b2, b3), SIMD_LL_SETR_EPI16(b4, b5, b6, b7), SIMD_LL_SETR_EPI16(b8, b9, ba, bb), SIMD_LL_SETR_EPI16(bc, bd, be, bf)} - -#define SIMD_MM512_SET1_EPI32(a) \ - {SIMD_LL_SET1_EPI32(a), SIMD_LL_SET1_EPI32(a), SIMD_LL_SET1_EPI32(a), SIMD_LL_SET1_EPI32(a), \ - SIMD_LL_SET1_EPI32(a), SIMD_LL_SET1_EPI32(a), SIMD_LL_SET1_EPI32(a), SIMD_LL_SET1_EPI32(a)} - -#define SIMD_MM512_SET2_EPI32(a0, a1) \ - {SIMD_LL_SET2_EPI32(a0, a1), SIMD_LL_SET2_EPI32(a0, a1), SIMD_LL_SET2_EPI32(a0, a1), SIMD_LL_SET2_EPI32(a0, a1), \ - SIMD_LL_SET2_EPI32(a0, a1), SIMD_LL_SET2_EPI32(a0, a1), SIMD_LL_SET2_EPI32(a0, a1), SIMD_LL_SET2_EPI32(a0, a1)} - -#define SIMD_MM512_SETR_EPI32(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, aa, ab, ac, ad, ae, af) \ - {SIMD_LL_SET2_EPI32(a0, a1), SIMD_LL_SET2_EPI32(a2, a3), SIMD_LL_SET2_EPI32(a4, a5), SIMD_LL_SET2_EPI32(a6, a7), \ - SIMD_LL_SET2_EPI32(a8, a9), SIMD_LL_SET2_EPI32(aa, ab), SIMD_LL_SET2_EPI32(ac, ad), SIMD_LL_SET2_EPI32(ae, af)} - -#define SIMD_MM512_SET1_EPI64(a) \ - {a, a, a, a, a, a, a, a} - -#define SIMD_MM512_SET2_EPI64(a0, a1) \ - {a0, a1, a0, a1, a0, a1, a0, a1} - -#define SIMD_MM512_SETR_EPI64(a0, a1, a2, a3, a4, a5, a6, a7) \ - {a0, a1, a2, a3, a4, a5, a6, a7} - -#endif// defined(_MSC_VER) || defined(__GNUC__) - -#endif//defined(SIMD_AVX512F_ENABLE) || defined(SIMD_AVX512BW_ENABLE) - -#if defined(SIMD_VMX_ENABLE) || (defined(SIMD_NEON_ENABLE) && defined(__GNUC__)) - -#define SIMD_VEC_SET1_EPI8(a) \ - {a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a} - -#define SIMD_VEC_SET2_EPI8(a0, a1) \ - {a0, a1, a0, a1, a0, a1, a0, a1, a0, a1, a0, a1, a0, a1, a0, a1} - -#define SIMD_VEC_SETR_EPI8(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, aa, ab, ac, ad, ae, af) \ - {a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, aa, ab, ac, ad, ae, af} - -#define SIMD_VEC_SET1_EPI16(a) \ - {a, a, a, a, a, a, a, a} - -#define SIMD_VEC_SET2_EPI16(a0, a1) \ - {a0, a1, a0, a1, a0, a1, a0, a1} - -#define SIMD_VEC_SETR_EPI16(a0, a1, a2, a3, a4, a5, a6, a7) \ - {a0, a1, a2, a3, a4, a5, a6, a7} - -#define SIMD_VEC_SET1_EPI32(a) \ - {a, a, a, a} - -#define SIMD_VEC_SET2_EPI32(a0, a1) \ - {a0, a1, a0, a1} - -#define 
SIMD_VEC_SETR_EPI32(a0, a1, a2, a3) \ - {a0, a1, a2, a3} - -#define SIMD_VEC_SET1_EPI64(a) \ - {a, a} - -#define SIMD_VEC_SET2_EPI64(a0, a1) \ - {a0, a1} - -#define SIMD_VEC_SETR_EPI64(a0, a1) \ - {a0, a1} - -#define SIMD_VEC_SET1_PS(a) \ - {a, a, a, a} - -#define SIMD_VEC_SET2_PS(a0, a1) \ - {a0, a1, a0, a1} - -#define SIMD_VEC_SETR_PS(a0, a1, a2, a3) \ - {a0, a1, a2, a3} - -#define SIMD_VEC_SET1_PI8(a) \ - {a, a, a, a, a, a, a, a} - -#define SIMD_VEC_SET2_PI8(a0, a1) \ - {a0, a1, a0, a1, a0, a1, a0, a1} - -#define SIMD_VEC_SETR_PI8(a0, a1, a2, a3, a4, a5, a6, a7) \ - {a0, a1, a2, a3, a4, a5, a6, a7} - -#define SIMD_VEC_SET1_PI16(a) \ - {a, a, a, a} - -#define SIMD_VEC_SET2_PI16(a0, a1) \ - {a0, a1, a0, a1} - -#define SIMD_VEC_SETR_PI16(a0, a1, a2, a3) \ - {a0, a1, a2, a3} - -#define SIMD_VEC_SET1_PI32(a) \ - {a, a} - -#define SIMD_VEC_SETR_PI32(a0, a1) \ - {a0, a1} - -#define SIMD_VEC_SETR_PI64(a) \ - {a} - -#endif//defined(SIMD_VMX_ENABLE) || (defined(SIMD_NEON_ENABLE) && defined(__GNUC__)) - -#if defined(_MSC_VER) && defined(SIMD_NEON_ENABLE) - -#define SIMD_VEC_SET1_EPI8(a) \ - {SIMD_LL_SET1_EPI8(a), SIMD_LL_SET1_EPI8(a)} - -#define SIMD_VEC_SET2_EPI8(a0, a1) \ - {SIMD_LL_SET2_EPI8(a0, a1), SIMD_LL_SET2_EPI8(a0, a1)} - -#define SIMD_VEC_SETR_EPI8(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, aa, ab, ac, ad, ae, af) \ - {SIMD_LL_SETR_EPI8(a0, a1, a2, a3, a4, a5, a6, a7), SIMD_LL_SETR_EPI8(a8, a9, aa, ab, ac, ad, ae, af)} - -#define SIMD_VEC_SET1_EPI16(a) \ - {SIMD_LL_SET1_EPI16(a), SIMD_LL_SET1_EPI16(a)} - -#define SIMD_VEC_SET2_EPI16(a0, a1) \ - {SIMD_LL_SET2_EPI16(a0, a1), SIMD_LL_SET2_EPI16(a0, a1)} - -#define SIMD_VEC_SETR_EPI16(a0, a1, a2, a3, a4, a5, a6, a7) \ - {SIMD_LL_SETR_EPI16(a0, a1, a2, a3), SIMD_LL_SETR_EPI16(a4, a5, a6, a7)} - -#define SIMD_VEC_SET1_EPI32(a) \ - {SIMD_LL_SET1_EPI32(a), SIMD_LL_SET1_EPI32(a)} - -#define SIMD_VEC_SET2_EPI32(a0, a1) \ - {SIMD_LL_SET2_EPI32(a0, a1), SIMD_LL_SET2_EPI32(a0, a1)} - -#define SIMD_VEC_SETR_EPI32(a0, a1, a2, a3) \ - {SIMD_LL_SET2_EPI32(a0, a1), SIMD_LL_SET2_EPI32(a2, a3)} - -#define SIMD_VEC_SET1_EPI64(a) \ - {a, a} - -#define SIMD_VEC_SET2_EPI64(a0, a1) \ - {a0, a1} - -#define SIMD_VEC_SETR_EPI64(a0, a1) \ - {a0, a1} - -#define SIMD_VEC_SET1_PI8(a) \ - {SIMD_LL_SET1_EPI8(a)} - -#define SIMD_VEC_SET2_PI8(a0, a1) \ - {SIMD_LL_SET2_EPI8(a0, a1)} - -#define SIMD_VEC_SETR_PI8(a0, a1, a2, a3, a4, a5, a6, a7) \ - {SIMD_LL_SETR_EPI8(a0, a1, a2, a3, a4, a5, a6, a7)} - -#define SIMD_VEC_SET1_PI16(a) \ - {SIMD_LL_SET1_EPI16(a)} - -#define SIMD_VEC_SET2_PI16(a0, a1) \ - {SIMD_LL_SET2_EPI16(a0, a1)} - -#define SIMD_VEC_SETR_PI16(a0, a1, a2, a3) \ - {SIMD_LL_SETR_EPI16(a0, a1, a2, a3)} - -#define SIMD_VEC_SET1_PI32(a) \ - {SIMD_LL_SET1_EPI32(a)} - -#define SIMD_VEC_SETR_PI32(a0, a1) \ - {SIMD_LL_SET2_EPI32(a0, a1)} - -#define SIMD_VEC_SETR_PI64(a0) \ - {a0} - -#endif//defined(_MSC_VER) && defined(SIMD_NEON_ENABLE) -} - -#endif//__SimdInit_h__ diff --git a/src/3rd/Simd/Simd/SimdIntegral.h b/src/3rd/Simd/Simd/SimdIntegral.h deleted file mode 100644 index b13bcf96..00000000 --- a/src/3rd/Simd/Simd/SimdIntegral.h +++ /dev/null @@ -1,250 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar.
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#ifndef __SimdIntegral_h__ -#define __SimdIntegral_h__ - -#include "Simd/SimdMemory.h" - -namespace Simd -{ - template <class T> struct IntegralBuffer - { - IntegralBuffer(size_t size) - { - _p = Allocate(sizeof(T)*size); - p = (T*)_p; - } - - ~IntegralBuffer() - { - Free(_p); - } - - T * p; - private: - void *_p; - }; - - template <class TSum> void IntegralSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, TSum * sum, size_t sumStride) - { - memset(sum, 0, (width + 1) * sizeof(TSum)); - sum += sumStride + 1; - - for (size_t row = 0; row < height; row++) - { - TSum rowSum = 0; - sum[-1] = 0; - for (size_t col = 0; col < width; col++) - { - rowSum += src[col]; - sum[col] = rowSum + sum[col - sumStride]; - } - src += srcStride; - sum += sumStride; - } - } - - template <class TSum, class TSqsum> void IntegralSumSqsum(const uint8_t * src, size_t srcStride, size_t width, size_t height, - TSum * sum, size_t sumStride, TSqsum * sqsum, size_t sqsumStride) - { - memset(sum, 0, (width + 1) * sizeof(TSum)); - sum += sumStride + 1; - - memset(sqsum, 0, (width + 1) * sizeof(TSqsum)); - sqsum += sqsumStride + 1; - - for (size_t row = 0; row < height; row++) - { - TSum row_sum = 0; - TSqsum row_sqsum = 0; - sum[-1] = 0; - sqsum[-1] = 0; - for (size_t col = 0; col < width; col++) - { - TSum value = src[col]; - row_sum += value; - row_sqsum += value*value; - sum[col] = row_sum + sum[col - sumStride]; - sqsum[col] = row_sqsum + sqsum[col - sqsumStride]; - } - src += srcStride; - sum += sumStride; - sqsum += sqsumStride; - } - } - - template <class TSum, class TSqsum> void IntegralSumSqsumTilted(const uint8_t * src, ptrdiff_t srcStride, size_t width, size_t height, - TSum * sum, ptrdiff_t sumStride, TSqsum * sqsum, ptrdiff_t sqsumStride, TSum * tilted, ptrdiff_t tiltedStride) - { - memset(sum, 0, (width + 1) * sizeof(TSum)); - sum += sumStride + 1; - - memset(sqsum, 0, (width + 1) * sizeof(TSqsum)); - sqsum += sqsumStride + 1; - - memset(tilted, 0, (width + 1) * sizeof(TSum)); - tilted += tiltedStride + 1; - - IntegralBuffer<TSum> _buffer(width + 1); - TSum * buffer = _buffer.p; - TSum s = 0; - TSqsum sq = 0; - - sum[-1] = 0; - tilted[-1] = 0; - sqsum[-1] = 0; - - for (size_t col = 0; col < width; col++) - { - TSum value = src[col]; - buffer[col] = value; - tilted[col] = value; - s += value; - sq += value*value; - sum[col] = s; - sqsum[col] = sq; - } - - if (width == 1) - buffer[1] = 0; - - for (size_t row = 1; row < height; ++row) - { - src += srcStride; - sum += sumStride; - tilted += tiltedStride; - sqsum
+= sqsumStride; - - TSum value = src[0]; - TSum t0 = s = value; - TSqsum tq0 = sq = value*value; - - sum[-1] = 0; - sqsum[-1] = 0; - tilted[-1] = tilted[-tiltedStride]; - - sum[0] = sum[-sumStride] + t0; - sqsum[0] = sqsum[-sqsumStride] + tq0; - tilted[0] = tilted[-tiltedStride] + t0 + buffer[1]; - - size_t col; - for (col = 1; col < width - 1; ++col) - { - TSum t1 = buffer[col]; - buffer[col - 1] = t1 + t0; - t0 = value = src[col]; - tq0 = value*value; - s += t0; - sq += tq0; - sum[col] = sum[col - sumStride] + s; - sqsum[col] = sqsum[col - sqsumStride] + sq; - t1 += buffer[col + 1] + t0 + tilted[col - tiltedStride - 1]; - tilted[col] = t1; - } - - if (width > 1) - { - TSum t1 = buffer[col]; - buffer[col - 1] = t1 + t0; - t0 = value = src[col]; - tq0 = value*value; - s += t0; - sq += tq0; - sum[col] = sum[col - sumStride] + s; - sqsum[col] = sqsum[col - sqsumStride] + sq; - tilted[col] = t0 + t1 + tilted[col - tiltedStride - 1]; - buffer[col] = t0; - } - } - } - - template <class TSum> void IntegralSumTilted(const uint8_t * src, ptrdiff_t srcStride, size_t width, size_t height, - TSum * sum, ptrdiff_t sumStride, TSum * tilted, ptrdiff_t tiltedStride) - { - memset(sum, 0, (width + 1) * sizeof(TSum)); - sum += sumStride + 1; - - memset(tilted, 0, (width + 1) * sizeof(TSum)); - tilted += tiltedStride + 1; - - IntegralBuffer<TSum> _buffer(width + 1); - TSum * buffer = _buffer.p; - TSum s = 0; - - sum[-1] = 0; - tilted[-1] = 0; - - for (size_t col = 0; col < width; col++) - { - TSum value = src[col]; - buffer[col] = value; - tilted[col] = value; - s += value; - sum[col] = s; - } - - if (width == 1) - buffer[1] = 0; - - for (size_t row = 1; row < height; ++row) - { - src += srcStride; - sum += sumStride; - tilted += tiltedStride; - - TSum value = src[0]; - TSum t0 = s = value; - - sum[-1] = 0; - tilted[-1] = tilted[-tiltedStride]; - - sum[0] = sum[-sumStride] + t0; - tilted[0] = tilted[-tiltedStride] + t0 + buffer[1]; - - size_t col; - for (col = 1; col < width - 1; ++col) - { - TSum t1 = buffer[col]; - buffer[col - 1] = t1 + t0; - t0 = value = src[col]; - s += t0; - sum[col] = sum[col - sumStride] + s; - t1 += buffer[col + 1] + t0 + tilted[col - tiltedStride - 1]; - tilted[col] = t1; - } - - if (width > 1) - { - TSum t1 = buffer[col]; - buffer[col - 1] = t1 + t0; - t0 = value = src[col]; - s += t0; - sum[col] = sum[col - sumStride] + s; - tilted[col] = t0 + t1 + tilted[col - tiltedStride - 1]; - buffer[col] = t0; - } - } - } -} -#endif//__SimdIntegral_h__ diff --git a/src/3rd/Simd/Simd/SimdLib.cpp b/src/3rd/Simd/Simd/SimdLib.cpp deleted file mode 100644 index aa922b51..00000000 --- a/src/3rd/Simd/Simd/SimdLib.cpp +++ /dev/null @@ -1,6328 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar, -* 2014-2018 Antonenka Mikhail, -* 2018-2018 Radchenko Andrey, -* 2019-2019 Facundo Galan. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software.
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdConfig.h" - -#ifndef SIMD_LIB_CPP -#define SIMD_LIB_CPP -#endif - -#if defined(WIN32) && !defined(SIMD_STATIC) - -#define SIMD_EXPORTS -#ifndef NOMINMAX -#define NOMINMAX -#endif -#include <windows.h> - -BOOL APIENTRY DllMain(HMODULE hModule, DWORD dwReasonForCall, LPVOID lpReserved) -{ - switch (dwReasonForCall) - { - case DLL_PROCESS_DETACH: - case DLL_PROCESS_ATTACH: - case DLL_THREAD_ATTACH: - case DLL_THREAD_DETACH: - return TRUE; - } - return TRUE; -} -#endif//WIN32 - -#include "Simd/SimdLib.h" - -#include "Simd/SimdMemory.h" -#include "Simd/SimdEnable.h" -#include "Simd/SimdConst.h" -#include "Simd/SimdCpu.h" -#include "Simd/SimdLog.h" -#include "Simd/SimdPerformance.h" - -#include "Simd/SimdResizer.h" -#include "Simd/SimdSynetConvolution8i.h" -#include "Simd/SimdSynetConvolution32f.h" -#include "Simd/SimdSynetDeconvolution32f.h" -#include "Simd/SimdSynetMergedConvolution32f.h" - -#include "Simd/SimdBase.h" -#include "Simd/SimdSse1.h" -#include "Simd/SimdSse2.h" -#include "Simd/SimdSse3.h" -#include "Simd/SimdSsse3.h" -#include "Simd/SimdSse41.h" -#include "Simd/SimdSse42.h" -#include "Simd/SimdAvx1.h" -#include "Simd/SimdAvx2.h" -#include "Simd/SimdAvx512f.h" -#include "Simd/SimdAvx512bw.h" -#include "Simd/SimdAvx512vnni.h" -#include "Simd/SimdVmx.h" -#include "Simd/SimdVsx.h" -#include "Simd/SimdNeon.h" -#include "Simd/SimdMsa.h" - -#if !defined(SIMD_VERSION) -#include "Simd/SimdVersion.h" -#endif - -SIMD_API const char * SimdVersion() -{ - return SIMD_VERSION; -} - -using namespace Simd; - -SIMD_API size_t SimdCpuInfo(SimdCpuInfoType type) -{ - switch (type) - { - case SimdCpuInfoSockets: return Cpu::SOCKET_NUMBER; - case SimdCpuInfoCores: return Cpu::CORE_NUMBER; - case SimdCpuInfoThreads: return Cpu::THREAD_NUMBER; - case SimdCpuInfoCacheL1: return Cpu::L1_CACHE_SIZE; - case SimdCpuInfoCacheL2: return Cpu::L2_CACHE_SIZE; - case SimdCpuInfoCacheL3: return Cpu::L3_CACHE_SIZE; -#ifdef SIMD_SSE_ENABLE - case SimdCpuInfoSse: return Sse::Enable ? 1 : 0; -#endif -#ifdef SIMD_SSE2_ENABLE - case SimdCpuInfoSse2: return Sse2::Enable ? 1 : 0; -#endif -#ifdef SIMD_SSE3_ENABLE - case SimdCpuInfoSse3: return Sse3::Enable ? 1 : 0; -#endif -#ifdef SIMD_SSSE3_ENABLE - case SimdCpuInfoSsse3: return Ssse3::Enable ? 1 : 0; -#endif -#ifdef SIMD_SSE41_ENABLE - case SimdCpuInfoSse41: return Sse41::Enable ? 1 : 0; -#endif -#ifdef SIMD_SSE42_ENABLE - case SimdCpuInfoSse42: return Sse42::Enable ? 1 : 0; -#endif -#ifdef SIMD_AVX_ENABLE - case SimdCpuInfoAvx: return Avx::Enable ? 1 : 0; -#endif -#ifdef SIMD_AVX2_ENABLE - case SimdCpuInfoAvx2: return Avx2::Enable ? 1 : 0; -#endif -#ifdef SIMD_AVX512F_ENABLE - case SimdCpuInfoAvx512f: return Avx512f::Enable ? 1 : 0; -#endif -#ifdef SIMD_AVX512BW_ENABLE - case SimdCpuInfoAvx512bw: return Avx512bw::Enable ? 1 : 0; -#endif -#ifdef SIMD_AVX512VNNI_ENABLE - case SimdCpuInfoAvx512vnni: return Avx512vnni::Enable ? 1 : 0; -#endif -#ifdef SIMD_VMX_ENABLE - case SimdCpuInfoVmx: return Vmx::Enable ? 1 : 0; -#endif -#ifdef SIMD_VSX_ENABLE - case SimdCpuInfoVsx: return Vsx::Enable ?
1 : 0; -#endif -#ifdef SIMD_NEON_ENABLE - case SimdCpuInfoNeon: return Neon::Enable ? 1 : 0; -#endif -#ifdef SIMD_MSA_ENABLE - case SimdCpuInfoMsa: return Msa::Enable ? 1 : 0; -#endif - default: - return 0; - } -} - -SIMD_API const char * SimdPerformanceStatistic() -{ -#if defined(SIMD_PERFORMANCE_STATISTIC) && defined(NDEBUG) - return Base::PerformanceMeasurerStorage::s_storage.PerformanceStatistic(); -#else - return ""; -#endif -} - -SIMD_API void * SimdAllocate(size_t size, size_t align) -{ - return Allocate(size, align); -} - -SIMD_API void SimdFree(void * ptr) -{ - Free(ptr); -} - -SIMD_API size_t SimdAlign(size_t size, size_t align) -{ - return AlignHi(size, align); -} - -SIMD_API size_t SimdAlignment() -{ - return Simd::ALIGNMENT; -} - -SIMD_API void SimdRelease(void * context) -{ - delete (Deletable*)context; -} - -SIMD_API size_t SimdGetThreadNumber() -{ - return Base::GetThreadNumber(); -} - -SIMD_API void SimdSetThreadNumber(size_t threadNumber) -{ - Base::SetThreadNumber(threadNumber); -} - -SIMD_API SimdBool SimdGetFastMode() -{ -#ifdef SIMD_SSE_ENABLE - if (Sse::Enable) - return Sse::GetFastMode(); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable) - return Neon::GetFastMode(); - else -#endif - return SimdFalse; -} - -SIMD_API void SimdSetFastMode(SimdBool value) -{ -#ifdef SIMD_SSE_ENABLE - if (Sse::Enable) - Sse::SetFastMode(value); -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable) - Neon::SetFastMode(value); -#endif -} - -SIMD_API uint32_t SimdCrc32c(const void * src, size_t size) -{ -#ifdef SIMD_SSE42_ENABLE - if(Sse42::Enable) - return Sse42::Crc32c(src, size); - else -#endif - return Base::Crc32c(src, size); -} - -SIMD_API void SimdAbsDifference(const uint8_t *a, size_t aStride, const uint8_t * b, size_t bStride, uint8_t *c, size_t cStride, - size_t width, size_t height) -{ -#ifdef SIMD_AVX2_ENABLE - if (Avx2::Enable && width >= Avx2::A) - Avx2::AbsDifference(a, aStride, b, bStride, c, cStride, width, height); - else -#endif - Base::AbsDifference(a, aStride, b, bStride, c, cStride, width, height); -} - -SIMD_API void SimdAbsDifferenceSum(const uint8_t *a, size_t aStride, const uint8_t * b, size_t bStride, - size_t width, size_t height, uint64_t * sum) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::AbsDifferenceSum(a, aStride, b, bStride, width, height, sum); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width >= Avx2::A) - Avx2::AbsDifferenceSum(a, aStride, b, bStride, width, height, sum); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && width >= Sse2::A) - Sse2::AbsDifferenceSum(a, aStride, b, bStride, width, height, sum); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width >= Vmx::A) - Vmx::AbsDifferenceSum(a, aStride, b, bStride, width, height, sum); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::AbsDifferenceSum(a, aStride, b, bStride, width, height, sum); - else -#endif - Base::AbsDifferenceSum(a, aStride, b, bStride, width, height, sum); -} - -SIMD_API void SimdAbsDifferenceSumMasked(const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride, - const uint8_t *mask, size_t maskStride, uint8_t index, size_t width, size_t height, uint64_t * sum) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::AbsDifferenceSumMasked(a, aStride, b, bStride, mask, maskStride, index, width, height, sum); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width >= Avx2::A) - Avx2::AbsDifferenceSumMasked(a, aStride, b, bStride, mask, 
maskStride, index, width, height, sum); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && width >= Sse2::A) - Sse2::AbsDifferenceSumMasked(a, aStride, b, bStride, mask, maskStride, index, width, height, sum); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width >= Vmx::A) - Vmx::AbsDifferenceSumMasked(a, aStride, b, bStride, mask, maskStride, index, width, height, sum); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::AbsDifferenceSumMasked(a, aStride, b, bStride, mask, maskStride, index, width, height, sum); - else -#endif - Base::AbsDifferenceSumMasked(a, aStride, b, bStride, mask, maskStride, index, width, height, sum); -} - -SIMD_API void SimdAbsDifferenceSums3x3(const uint8_t *current, size_t currentStride, const uint8_t * background, size_t backgroundStride, - size_t width, size_t height, uint64_t * sums) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::AbsDifferenceSums3x3(current, currentStride, background, backgroundStride, width, height, sums); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width >= Avx2::A + 2) - Avx2::AbsDifferenceSums3x3(current, currentStride, background, backgroundStride, width, height, sums); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && width >= Sse2::A + 2) - Sse2::AbsDifferenceSums3x3(current, currentStride, background, backgroundStride, width, height, sums); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width >= Vmx::A + 2) - Vmx::AbsDifferenceSums3x3(current, currentStride, background, backgroundStride, width, height, sums); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A + 2) - Neon::AbsDifferenceSums3x3(current, currentStride, background, backgroundStride, width, height, sums); - else -#endif - Base::AbsDifferenceSums3x3(current, currentStride, background, backgroundStride, width, height, sums); -} - -SIMD_API void SimdAbsDifferenceSums3x3Masked(const uint8_t *current, size_t currentStride, const uint8_t *background, size_t backgroundStride, - const uint8_t *mask, size_t maskStride, uint8_t index, size_t width, size_t height, uint64_t * sums) -{ -#if defined(SIMD_AVX512BW_ENABLE) && !defined(SIMD_MASKZ_LOAD_ERROR) - if (Avx512bw::Enable) - Avx512bw::AbsDifferenceSums3x3Masked(current, currentStride, background, backgroundStride, mask, maskStride, index, width, height, sums); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width >= Avx2::A + 2) - Avx2::AbsDifferenceSums3x3Masked(current, currentStride, background, backgroundStride, mask, maskStride, index, width, height, sums); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && width >= Sse2::A + 2) - Sse2::AbsDifferenceSums3x3Masked(current, currentStride, background, backgroundStride, mask, maskStride, index, width, height, sums); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width >= Vmx::A + 2) - Vmx::AbsDifferenceSums3x3Masked(current, currentStride, background, backgroundStride, mask, maskStride, index, width, height, sums); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A + 2) - Neon::AbsDifferenceSums3x3Masked(current, currentStride, background, backgroundStride, mask, maskStride, index, width, height, sums); - else -#endif - Base::AbsDifferenceSums3x3Masked(current, currentStride, background, backgroundStride, mask, maskStride, index, width, height, sums); -} - -SIMD_API void SimdAbsGradientSaturatedSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, - 
uint8_t * dst, size_t dstStride) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Simd::Avx512bw::AbsGradientSaturatedSum(src, srcStride, width, height, dst, dstStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width >= Avx2::A) - Simd::Avx2::AbsGradientSaturatedSum(src, srcStride, width, height, dst, dstStride); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && width >= Sse2::A) - Sse2::AbsGradientSaturatedSum(src, srcStride, width, height, dst, dstStride); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width >= Vmx::A) - Vmx::AbsGradientSaturatedSum(src, srcStride, width, height, dst, dstStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::AbsGradientSaturatedSum(src, srcStride, width, height, dst, dstStride); - else -#endif - Base::AbsGradientSaturatedSum(src, srcStride, width, height, dst, dstStride); -} - -SIMD_API void SimdAddFeatureDifference(const uint8_t * value, size_t valueStride, size_t width, size_t height, - const uint8_t * lo, size_t loStride, const uint8_t * hi, size_t hiStride, - uint16_t weight, uint8_t * difference, size_t differenceStride) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::AddFeatureDifference(value, valueStride, width, height, lo, loStride, hi, hiStride, weight, difference, differenceStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width >= Avx2::A) - Avx2::AddFeatureDifference(value, valueStride, width, height, lo, loStride, hi, hiStride, weight, difference, differenceStride); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && width >= Sse2::A) - Sse2::AddFeatureDifference(value, valueStride, width, height, lo, loStride, hi, hiStride, weight, difference, differenceStride); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width >= Vmx::A) - Vmx::AddFeatureDifference(value, valueStride, width, height, lo, loStride, hi, hiStride, weight, difference, differenceStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::AddFeatureDifference(value, valueStride, width, height, lo, loStride, hi, hiStride, weight, difference, differenceStride); - else -#endif - Base::AddFeatureDifference(value, valueStride, width, height, lo, loStride, hi, hiStride, weight, difference, differenceStride); -} - -SIMD_API void SimdAlphaBlending(const uint8_t *src, size_t srcStride, size_t width, size_t height, size_t channelCount, - const uint8_t *alpha, size_t alphaStride, uint8_t *dst, size_t dstStride) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::AlphaBlending(src, srcStride, width, height, channelCount, alpha, alphaStride, dst, dstStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width >= Avx2::A) - Avx2::AlphaBlending(src, srcStride, width, height, channelCount, alpha, alphaStride, dst, dstStride); - else -#endif -#ifdef SIMD_SSSE3_ENABLE - if(Ssse3::Enable && width >= Ssse3::A) - Ssse3::AlphaBlending(src, srcStride, width, height, channelCount, alpha, alphaStride, dst, dstStride); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && width >= Sse2::A) - Sse2::AlphaBlending(src, srcStride, width, height, channelCount, alpha, alphaStride, dst, dstStride); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width >= Vmx::A) - Vmx::AlphaBlending(src, srcStride, width, height, channelCount, alpha, alphaStride, dst, dstStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::AlphaBlending(src, 
srcStride, width, height, channelCount, alpha, alphaStride, dst, dstStride); - else -#endif - Base::AlphaBlending(src, srcStride, width, height, channelCount, alpha, alphaStride, dst, dstStride); -} - -SIMD_API void SimdAlphaFilling(uint8_t * dst, size_t dstStride, size_t width, size_t height, const uint8_t * channel, size_t channelCount, const uint8_t * alpha, size_t alphaStride) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::AlphaFilling(dst, dstStride, width, height, channel, channelCount, alpha, alphaStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if (Avx2::Enable && width >= Avx2::A) - Avx2::AlphaFilling(dst, dstStride, width, height, channel, channelCount, alpha, alphaStride); - else -#endif -#ifdef SIMD_SSSE3_ENABLE - if (Ssse3::Enable && width >= Ssse3::A) - Ssse3::AlphaFilling(dst, dstStride, width, height, channel, channelCount, alpha, alphaStride); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if (Sse2::Enable && width >= Sse2::A) - Sse2::AlphaFilling(dst, dstStride, width, height, channel, channelCount, alpha, alphaStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::AlphaFilling(dst, dstStride, width, height, channel, channelCount, alpha, alphaStride); - else -#endif - Base::AlphaFilling(dst, dstStride, width, height, channel, channelCount, alpha, alphaStride); -} - -SIMD_API void SimdBackgroundGrowRangeSlow(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * lo, size_t loStride, uint8_t * hi, size_t hiStride) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::BackgroundGrowRangeSlow(value, valueStride, width, height, lo, loStride, hi, hiStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width >= Avx2::A) - Avx2::BackgroundGrowRangeSlow(value, valueStride, width, height, lo, loStride, hi, hiStride); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && width >= Sse2::A) - Sse2::BackgroundGrowRangeSlow(value, valueStride, width, height, lo, loStride, hi, hiStride); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width >= Vmx::A) - Vmx::BackgroundGrowRangeSlow(value, valueStride, width, height, lo, loStride, hi, hiStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::BackgroundGrowRangeSlow(value, valueStride, width, height, lo, loStride, hi, hiStride); - else -#endif - Base::BackgroundGrowRangeSlow(value, valueStride, width, height, lo, loStride, hi, hiStride); -} - -SIMD_API void SimdBackgroundGrowRangeFast(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * lo, size_t loStride, uint8_t * hi, size_t hiStride) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::BackgroundGrowRangeFast(value, valueStride, width, height, lo, loStride, hi, hiStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width >= Avx2::A) - Avx2::BackgroundGrowRangeFast(value, valueStride, width, height, lo, loStride, hi, hiStride); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && width >= Sse2::A) - Sse2::BackgroundGrowRangeFast(value, valueStride, width, height, lo, loStride, hi, hiStride); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width >= Vmx::A) - Vmx::BackgroundGrowRangeFast(value, valueStride, width, height, lo, loStride, hi, hiStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::BackgroundGrowRangeFast(value, valueStride, width, height, lo, loStride, hi, hiStride); - else 
-#endif - Base::BackgroundGrowRangeFast(value, valueStride, width, height, lo, loStride, hi, hiStride); -} - -SIMD_API void SimdBackgroundIncrementCount(const uint8_t * value, size_t valueStride, size_t width, size_t height, - const uint8_t * loValue, size_t loValueStride, const uint8_t * hiValue, size_t hiValueStride, - uint8_t * loCount, size_t loCountStride, uint8_t * hiCount, size_t hiCountStride) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::BackgroundIncrementCount(value, valueStride, width, height, loValue, loValueStride, hiValue, hiValueStride, loCount, loCountStride, hiCount, hiCountStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width >= Avx2::A) - Avx2::BackgroundIncrementCount(value, valueStride, width, height, loValue, loValueStride, hiValue, hiValueStride, loCount, loCountStride, hiCount, hiCountStride); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && width >= Sse2::A) - Sse2::BackgroundIncrementCount(value, valueStride, width, height, loValue, loValueStride, hiValue, hiValueStride, loCount, loCountStride, hiCount, hiCountStride); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width >= Vmx::A) - Vmx::BackgroundIncrementCount(value, valueStride, width, height, loValue, loValueStride, hiValue, hiValueStride, loCount, loCountStride, hiCount, hiCountStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::BackgroundIncrementCount(value, valueStride, width, height, loValue, loValueStride, hiValue, hiValueStride, loCount, loCountStride, hiCount, hiCountStride); - else -#endif - Base::BackgroundIncrementCount(value, valueStride, width, height, loValue, loValueStride, hiValue, hiValueStride, loCount, loCountStride, hiCount, hiCountStride); -} - -SIMD_API void SimdBackgroundAdjustRange(uint8_t * loCount, size_t loCountStride, size_t width, size_t height, - uint8_t * loValue, size_t loValueStride, uint8_t * hiCount, size_t hiCountStride, - uint8_t * hiValue, size_t hiValueStride, uint8_t threshold) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::BackgroundAdjustRange(loCount, loCountStride, width, height, loValue, loValueStride, - hiCount, hiCountStride, hiValue, hiValueStride, threshold); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width >= Avx2::A) - Avx2::BackgroundAdjustRange(loCount, loCountStride, width, height, loValue, loValueStride, - hiCount, hiCountStride, hiValue, hiValueStride, threshold); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && width >= Sse2::A) - Sse2::BackgroundAdjustRange(loCount, loCountStride, width, height, loValue, loValueStride, - hiCount, hiCountStride, hiValue, hiValueStride, threshold); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width >= Vmx::A) - Vmx::BackgroundAdjustRange(loCount, loCountStride, width, height, loValue, loValueStride, - hiCount, hiCountStride, hiValue, hiValueStride, threshold); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::BackgroundAdjustRange(loCount, loCountStride, width, height, loValue, loValueStride, - hiCount, hiCountStride, hiValue, hiValueStride, threshold); - else -#endif - Base::BackgroundAdjustRange(loCount, loCountStride, width, height, loValue, loValueStride, - hiCount, hiCountStride, hiValue, hiValueStride, threshold); -} - -SIMD_API void SimdBackgroundAdjustRangeMasked(uint8_t * loCount, size_t loCountStride, size_t width, size_t height, - uint8_t * loValue, size_t loValueStride, uint8_t * hiCount, size_t 
hiCountStride, - uint8_t * hiValue, size_t hiValueStride, uint8_t threshold, const uint8_t * mask, size_t maskStride) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::BackgroundAdjustRangeMasked(loCount, loCountStride, width, height, loValue, loValueStride, - hiCount, hiCountStride, hiValue, hiValueStride, threshold, mask, maskStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width >= Avx2::A) - Avx2::BackgroundAdjustRangeMasked(loCount, loCountStride, width, height, loValue, loValueStride, - hiCount, hiCountStride,hiValue, hiValueStride, threshold, mask, maskStride); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && width >= Sse2::A) - Sse2::BackgroundAdjustRangeMasked(loCount, loCountStride, width, height, loValue, loValueStride, - hiCount, hiCountStride,hiValue, hiValueStride, threshold, mask, maskStride); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width >= Vmx::A) - Vmx::BackgroundAdjustRangeMasked(loCount, loCountStride, width, height, loValue, loValueStride, - hiCount, hiCountStride,hiValue, hiValueStride, threshold, mask, maskStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::BackgroundAdjustRangeMasked(loCount, loCountStride, width, height, loValue, loValueStride, - hiCount, hiCountStride, hiValue, hiValueStride, threshold, mask, maskStride); - else -#endif - Base::BackgroundAdjustRangeMasked(loCount, loCountStride, width, height, loValue, loValueStride, - hiCount, hiCountStride, hiValue, hiValueStride, threshold, mask, maskStride); -} - -SIMD_API void SimdBackgroundShiftRange(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * lo, size_t loStride, uint8_t * hi, size_t hiStride) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::BackgroundShiftRange(value, valueStride, width, height, lo, loStride, hi, hiStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width >= Avx2::A) - Avx2::BackgroundShiftRange(value, valueStride, width, height, lo, loStride, hi, hiStride); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && width >= Sse2::A) - Sse2::BackgroundShiftRange(value, valueStride, width, height, lo, loStride, hi, hiStride); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width >= Vmx::A) - Vmx::BackgroundShiftRange(value, valueStride, width, height, lo, loStride, hi, hiStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::BackgroundShiftRange(value, valueStride, width, height, lo, loStride, hi, hiStride); - else -#endif - Base::BackgroundShiftRange(value, valueStride, width, height, lo, loStride, hi, hiStride); -} - -SIMD_API void SimdBackgroundShiftRangeMasked(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * lo, size_t loStride, uint8_t * hi, size_t hiStride, const uint8_t * mask, size_t maskStride) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::BackgroundShiftRangeMasked(value, valueStride, width, height, lo, loStride, hi, hiStride, mask, maskStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width >= Avx2::A) - Avx2::BackgroundShiftRangeMasked(value, valueStride, width, height, lo, loStride, hi, hiStride, mask, maskStride); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && width >= Sse2::A) - Sse2::BackgroundShiftRangeMasked(value, valueStride, width, height, lo, loStride, hi, hiStride, mask, maskStride); - else -#endif -#ifdef SIMD_VMX_ENABLE - 
if(Vmx::Enable && width >= Vmx::A) - Vmx::BackgroundShiftRangeMasked(value, valueStride, width, height, lo, loStride, hi, hiStride, mask, maskStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::BackgroundShiftRangeMasked(value, valueStride, width, height, lo, loStride, hi, hiStride, mask, maskStride); - else -#endif - Base::BackgroundShiftRangeMasked(value, valueStride, width, height, lo, loStride, hi, hiStride, mask, maskStride); -} - -SIMD_API void SimdBackgroundInitMask(const uint8_t * src, size_t srcStride, size_t width, size_t height, - uint8_t index, uint8_t value, uint8_t * dst, size_t dstStride) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::BackgroundInitMask(src, srcStride, width, height, index, value, dst, dstStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width >= Avx2::A) - Avx2::BackgroundInitMask(src, srcStride, width, height, index, value, dst, dstStride); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && width >= Sse2::A) - Sse2::BackgroundInitMask(src, srcStride, width, height, index, value, dst, dstStride); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width >= Vmx::A) - Vmx::BackgroundInitMask(src, srcStride, width, height, index, value, dst, dstStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::BackgroundInitMask(src, srcStride, width, height, index, value, dst, dstStride); - else -#endif - Base::BackgroundInitMask(src, srcStride, width, height, index, value, dst, dstStride); -} - -SIMD_API void SimdBayerToBgr(const uint8_t * bayer, size_t width, size_t height, size_t bayerStride, SimdPixelFormatType bayerFormat, uint8_t * bgr, size_t bgrStride) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable && width >= Avx512bw::A + 2) - Avx512bw::BayerToBgr(bayer, width, height, bayerStride, bayerFormat, bgr, bgrStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if (Avx2::Enable && width >= Avx2::A + 2) - Avx2::BayerToBgr(bayer, width, height, bayerStride, bayerFormat, bgr, bgrStride); - else -#endif -#ifdef SIMD_SSSE3_ENABLE - if (Ssse3::Enable && width >= Ssse3::A + 2) - Ssse3::BayerToBgr(bayer, width, height, bayerStride, bayerFormat, bgr, bgrStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A + 2) - Neon::BayerToBgr(bayer, width, height, bayerStride, bayerFormat, bgr, bgrStride); - else -#endif - Base::BayerToBgr(bayer, width, height, bayerStride, bayerFormat, bgr, bgrStride); -} - -SIMD_API void SimdBayerToBgra(const uint8_t * bayer, size_t width, size_t height, size_t bayerStride, SimdPixelFormatType bayerFormat, uint8_t * bgra, size_t bgraStride, uint8_t alpha) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable && width >= Avx512bw::A + 2) - Avx512bw::BayerToBgra(bayer, width, height, bayerStride, bayerFormat, bgra, bgraStride, alpha); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if (Avx2::Enable && width >= Avx2::A + 2) - Avx2::BayerToBgra(bayer, width, height, bayerStride, bayerFormat, bgra, bgraStride, alpha); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if (Sse2::Enable && width >= Sse2::A + 2) - Sse2::BayerToBgra(bayer, width, height, bayerStride, bayerFormat, bgra, bgraStride, alpha); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A + 2) - Neon::BayerToBgra(bayer, width, height, bayerStride, bayerFormat, bgra, bgraStride, alpha); - else -#endif - Base::BayerToBgra(bayer, width, height, bayerStride, bayerFormat, bgra, bgraStride, alpha); -} - 
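Every SimdLib.cpp entry point removed in this hunk follows the same runtime-dispatch idiom: an #ifdef-guarded if/else cascade that tries the widest instruction set first, takes a branch only when that extension's runtime Enable flag is set and the row is at least one full vector wide (the A constant), and otherwise falls through to the portable Base implementation. The sketch below is a minimal, self-contained restatement of that idiom, not library code: SimdFill, the Fill workers, and the Enable/A constants are hypothetical stand-ins.

    // Minimal sketch of the SimdLib.cpp dispatch idiom above (hypothetical names).
    #include <stdint.h>
    #include <stddef.h>
    #include <string.h>

    #define SIMD_AVX2_ENABLE // stand-ins for the library's per-extension build macros
    #define SIMD_SSE2_ENABLE

    namespace Avx2 { const bool Enable = true; const size_t A = 32; // runtime flag, vector width in bytes
        inline void Fill(uint8_t* d, size_t n, uint8_t v) { memset(d, v, n); } }
    namespace Sse2 { const bool Enable = true; const size_t A = 16;
        inline void Fill(uint8_t* d, size_t n, uint8_t v) { memset(d, v, n); } }
    namespace Base { inline void Fill(uint8_t* d, size_t n, uint8_t v) { memset(d, v, n); } }

    void SimdFill(uint8_t* dst, size_t width, uint8_t value)
    {
    #ifdef SIMD_AVX2_ENABLE
        if (Avx2::Enable && width >= Avx2::A)   // widest instruction set first
            Avx2::Fill(dst, width, value);
        else
    #endif
    #ifdef SIMD_SSE2_ENABLE
        if (Sse2::Enable && width >= Sse2::A)   // then the narrower one
            Sse2::Fill(dst, width, value);
        else
    #endif
            Base::Fill(dst, width, value);      // portable scalar fallback
    }

Each guarded branch ends in a dangling else, so compiling any extension out still leaves a well-formed chain, and at run time exactly one implementation executes.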
-SIMD_API void SimdBgraToBayer(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * bayer, size_t bayerStride, SimdPixelFormatType bayerFormat) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::BgraToBayer(bgra, width, height, bgraStride, bayer, bayerStride, bayerFormat); - else -#endif -#ifdef SIMD_SSSE3_ENABLE - if(Ssse3::Enable && width >= Ssse3::A) - Ssse3::BgraToBayer(bgra, width, height, bgraStride, bayer, bayerStride, bayerFormat); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width >= Vmx::A) - Vmx::BgraToBayer(bgra, width, height, bgraStride, bayer, bayerStride, bayerFormat); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::BgraToBayer(bgra, width, height, bgraStride, bayer, bayerStride, bayerFormat); - else -#endif - Base::BgraToBayer(bgra, width, height, bgraStride, bayer, bayerStride, bayerFormat); -} - -SIMD_API void SimdBgraToBgr(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * bgr, size_t bgrStride) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::BgraToBgr(bgra, width, height, bgraStride, bgr, bgrStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if (Avx2::Enable && width >= Avx2::A) - Avx2::BgraToBgr(bgra, width, height, bgraStride, bgr, bgrStride); - else -#endif -#ifdef SIMD_SSSE3_ENABLE - if(Ssse3::Enable && width >= Ssse3::A) - Ssse3::BgraToBgr(bgra, width, height, bgraStride, bgr, bgrStride); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width >= Vmx::A) - Vmx::BgraToBgr(bgra, width, height, bgraStride, bgr, bgrStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::BgraToBgr(bgra, width, height, bgraStride, bgr, bgrStride); - else -#endif - Base::BgraToBgr(bgra, width, height, bgraStride, bgr, bgrStride); -} - -SIMD_API void SimdBgraToGray(const uint8_t *bgra, size_t width, size_t height, size_t bgraStride, uint8_t *gray, size_t grayStride) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::BgraToGray(bgra, width, height, bgraStride, gray, grayStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width >= Avx2::A) - Avx2::BgraToGray(bgra, width, height, bgraStride, gray, grayStride); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && width >= Sse2::A) - Sse2::BgraToGray(bgra, width, height, bgraStride, gray, grayStride); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width >= Vmx::A) - Vmx::BgraToGray(bgra, width, height, bgraStride, gray, grayStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::HA) - Neon::BgraToGray(bgra, width, height, bgraStride, gray, grayStride); - else -#endif - Base::BgraToGray(bgra, width, height, bgraStride, gray, grayStride); -} - -SIMD_API void SimdBgraToRgb(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgb, size_t rgbStride) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::BgraToRgb(bgra, width, height, bgraStride, rgb, rgbStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if (Avx2::Enable && width >= Avx2::A) - Avx2::BgraToRgb(bgra, width, height, bgraStride, rgb, rgbStride); - else -#endif -#ifdef SIMD_SSSE3_ENABLE - if (Ssse3::Enable && width >= Ssse3::A) - Ssse3::BgraToRgb(bgra, width, height, bgraStride, rgb, rgbStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::BgraToRgb(bgra, width, height, bgraStride, rgb, rgbStride); - else -#endif - 
Base::BgraToRgb(bgra, width, height, bgraStride, rgb, rgbStride); -} - -SIMD_API void SimdBgraToYuv420p(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * y, size_t yStride, uint8_t * u, size_t uStride, uint8_t * v, size_t vStride) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::BgraToYuv420p(bgra, width, height, bgraStride, y, yStride, u, uStride, v, vStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width >= Avx2::DA) - Avx2::BgraToYuv420p(bgra, width, height, bgraStride, y, yStride, u, uStride, v, vStride); - else -#endif -#ifdef SIMD_SSSE3_ENABLE - if(Ssse3::Enable && width >= Ssse3::DA) - Ssse3::BgraToYuv420p(bgra, width, height, bgraStride, y, yStride, u, uStride, v, vStride); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && width >= Sse2::DA) - Sse2::BgraToYuv420p(bgra, width, height, bgraStride, y, yStride, u, uStride, v, vStride); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width >= Vmx::DA) - Vmx::BgraToYuv420p(bgra, width, height, bgraStride, y, yStride, u, uStride, v, vStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::DA) - Neon::BgraToYuv420p(bgra, width, height, bgraStride, y, yStride, u, uStride, v, vStride); - else -#endif - Base::BgraToYuv420p(bgra, width, height, bgraStride, y, yStride, u, uStride, v, vStride); -} - -SIMD_API void SimdBgraToYuv422p(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * y, size_t yStride, uint8_t * u, size_t uStride, uint8_t * v, size_t vStride) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::BgraToYuv422p(bgra, width, height, bgraStride, y, yStride, u, uStride, v, vStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width >= Avx2::DA) - Avx2::BgraToYuv422p(bgra, width, height, bgraStride, y, yStride, u, uStride, v, vStride); - else -#endif -#ifdef SIMD_SSSE3_ENABLE - if(Ssse3::Enable && width >= Ssse3::DA) - Ssse3::BgraToYuv422p(bgra, width, height, bgraStride, y, yStride, u, uStride, v, vStride); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && width >= Sse2::DA) - Sse2::BgraToYuv422p(bgra, width, height, bgraStride, y, yStride, u, uStride, v, vStride); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width >= Vmx::DA) - Vmx::BgraToYuv422p(bgra, width, height, bgraStride, y, yStride, u, uStride, v, vStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::DA) - Neon::BgraToYuv422p(bgra, width, height, bgraStride, y, yStride, u, uStride, v, vStride); - else -#endif - Base::BgraToYuv422p(bgra, width, height, bgraStride, y, yStride, u, uStride, v, vStride); -} - -SIMD_API void SimdBgraToYuv444p(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * y, size_t yStride, uint8_t * u, size_t uStride, uint8_t * v, size_t vStride) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::BgraToYuv444p(bgra, width, height, bgraStride, y, yStride, u, uStride, v, vStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width >= Avx2::A) - Avx2::BgraToYuv444p(bgra, width, height, bgraStride, y, yStride, u, uStride, v, vStride); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && width >= Sse2::A) - Sse2::BgraToYuv444p(bgra, width, height, bgraStride, y, yStride, u, uStride, v, vStride); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width >= Vmx::A) - Vmx::BgraToYuv444p(bgra, width, height, bgraStride, y, yStride, u, uStride, v, vStride); - 
else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::BgraToYuv444p(bgra, width, height, bgraStride, y, yStride, u, uStride, v, vStride); - else -#endif - Base::BgraToYuv444p(bgra, width, height, bgraStride, y, yStride, u, uStride, v, vStride); -} - -SIMD_API void SimdBgraToYuva420p(const uint8_t * bgra, size_t bgraStride, size_t width, size_t height, - uint8_t * y, size_t yStride, uint8_t * u, size_t uStride, uint8_t * v, size_t vStride, uint8_t * a, size_t aStride) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::BgraToYuva420p(bgra, bgraStride, width, height, y, yStride, u, uStride, v, vStride, a, aStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if (Avx2::Enable && width >= Avx2::DA) - Avx2::BgraToYuva420p(bgra, bgraStride, width, height, y, yStride, u, uStride, v, vStride, a, aStride); - else -#endif -#ifdef SIMD_SSSE3_ENABLE - if (Ssse3::Enable && width >= Ssse3::DA) - Ssse3::BgraToYuva420p(bgra, bgraStride, width, height, y, yStride, u, uStride, v, vStride, a, aStride); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if (Sse2::Enable && width >= Sse2::DA) - Sse2::BgraToYuva420p(bgra, bgraStride, width, height, y, yStride, u, uStride, v, vStride, a, aStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::DA) - Neon::BgraToYuva420p(bgra, bgraStride, width, height, y, yStride, u, uStride, v, vStride, a, aStride); - else -#endif - Base::BgraToYuva420p(bgra, bgraStride, width, height, y, yStride, u, uStride, v, vStride, a, aStride); -} - -SIMD_API void SimdBgrToBayer(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bayer, size_t bayerStride, SimdPixelFormatType bayerFormat) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::BgrToBayer(bgr, width, height, bgrStride, bayer, bayerStride, bayerFormat); - else -#endif -#ifdef SIMD_SSSE3_ENABLE - if(Ssse3::Enable && width >= Ssse3::A) - Ssse3::BgrToBayer(bgr, width, height, bgrStride, bayer, bayerStride, bayerFormat); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width >= Vmx::A) - Vmx::BgrToBayer(bgr, width, height, bgrStride, bayer, bayerStride, bayerFormat); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::BgrToBayer(bgr, width, height, bgrStride, bayer, bayerStride, bayerFormat); - else -#endif - Base::BgrToBayer(bgr, width, height, bgrStride, bayer, bayerStride, bayerFormat); -} - -SIMD_API void SimdBgrToBgra(const uint8_t *bgr, size_t width, size_t height, size_t bgrStride, uint8_t *bgra, size_t bgraStride, uint8_t alpha) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::BgrToBgra(bgr, width, height, bgrStride, bgra, bgraStride, alpha); - else -#endif -#if defined(SIMD_AVX2_ENABLE) && !defined(SIMD_CLANG_AVX2_BGR_TO_BGRA_ERROR) - if(Avx2::Enable && width >= Avx2::A) - Avx2::BgrToBgra(bgr, width, height, bgrStride, bgra, bgraStride, alpha); - else -#endif -#ifdef SIMD_SSSE3_ENABLE - if(Ssse3::Enable && width >= Ssse3::A) - Ssse3::BgrToBgra(bgr, width, height, bgrStride, bgra, bgraStride, alpha); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width >= Vmx::A) - Vmx::BgrToBgra(bgr, width, height, bgrStride, bgra, bgraStride, alpha); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::BgrToBgra(bgr, width, height, bgrStride, bgra, bgraStride, alpha); - else -#endif - Base::BgrToBgra(bgr, width, height, bgrStride, bgra, bgraStride, alpha); -} - -SIMD_API void SimdBgr48pToBgra32(const uint8_t * blue, 
size_t blueStride, size_t width, size_t height, - const uint8_t * green, size_t greenStride, const uint8_t * red, size_t redStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::Bgr48pToBgra32(blue, blueStride, width, height, green, greenStride, red, redStride, bgra, bgraStride, alpha); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width >= Avx2::HA) - Avx2::Bgr48pToBgra32(blue, blueStride, width, height, green, greenStride, red, redStride, bgra, bgraStride, alpha); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && width >= Sse2::HA) - Sse2::Bgr48pToBgra32(blue, blueStride, width, height, green, greenStride, red, redStride, bgra, bgraStride, alpha); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width >= Vmx::HA) - Vmx::Bgr48pToBgra32(blue, blueStride, width, height, green, greenStride, red, redStride, bgra, bgraStride, alpha); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::Bgr48pToBgra32(blue, blueStride, width, height, green, greenStride, red, redStride, bgra, bgraStride, alpha); - else -#endif - Base::Bgr48pToBgra32(blue, blueStride, width, height, green, greenStride, red, redStride, bgra, bgraStride, alpha); -} - -SIMD_API void SimdBgrToGray(const uint8_t *bgr, size_t width, size_t height, size_t bgrStride, uint8_t *gray, size_t grayStride) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::BgrToGray(bgr, width, height, bgrStride, gray, grayStride); - else -#endif -#if defined(SIMD_AVX2_ENABLE) && !defined(SIMD_CLANG_AVX2_BGR_TO_BGRA_ERROR) - if(Avx2::Enable && width >= Avx2::A) - Avx2::BgrToGray(bgr, width, height, bgrStride, gray, grayStride); - else -#endif -#ifdef SIMD_SSSE3_ENABLE - if(Ssse3::Enable && width >= Ssse3::A) - Ssse3::BgrToGray(bgr, width, height, bgrStride, gray, grayStride); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && width >= Sse2::A) - Sse2::BgrToGray(bgr, width, height, bgrStride, gray, grayStride); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width >= Vmx::A) - Vmx::BgrToGray(bgr, width, height, bgrStride, gray, grayStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::BgrToGray(bgr, width, height, bgrStride, gray, grayStride); - else -#endif - Base::BgrToGray(bgr, width, height, bgrStride, gray, grayStride); -} - -SIMD_API void SimdBgrToHsl(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * hsl, size_t hslStride) -{ - Base::BgrToHsl(bgr, width, height, bgrStride, hsl, hslStride); -} - -SIMD_API void SimdBgrToHsv(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * hsv, size_t hsvStride) -{ - Base::BgrToHsv(bgr, width, height, bgrStride, hsv, hsvStride); -} - -SIMD_API void SimdBgrToRgb(const uint8_t *bgr, size_t bgrStride, size_t width, size_t height, uint8_t * rgb, size_t rgbStride) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::BgrToRgb(bgr, bgrStride, width, height, rgb, rgbStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if (Avx2::Enable && width >= Avx2::A) - Avx2::BgrToRgb(bgr, bgrStride, width, height, rgb, rgbStride); - else -#endif -#ifdef SIMD_SSSE3_ENABLE - if (Ssse3::Enable && width >= Ssse3::A) - Ssse3::BgrToRgb(bgr, bgrStride, width, height, rgb, rgbStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::BgrToRgb(bgr, bgrStride, width, height, rgb, rgbStride); - else -#endif - Base::BgrToRgb(bgr, 
bgrStride, width, height, rgb, rgbStride); -} - -SIMD_API void SimdBgrToYuv420p(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * y, size_t yStride, uint8_t * u, size_t uStride, uint8_t * v, size_t vStride) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::BgrToYuv420p(bgr, width, height, bgrStride, y, yStride, u, uStride, v, vStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width >= Avx2::DA) - Avx2::BgrToYuv420p(bgr, width, height, bgrStride, y, yStride, u, uStride, v, vStride); - else -#endif -#ifdef SIMD_SSSE3_ENABLE - if(Ssse3::Enable && width >= Ssse3::DA) - Ssse3::BgrToYuv420p(bgr, width, height, bgrStride, y, yStride, u, uStride, v, vStride); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width >= Vmx::DA) - Vmx::BgrToYuv420p(bgr, width, height, bgrStride, y, yStride, u, uStride, v, vStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::DA) - Neon::BgrToYuv420p(bgr, width, height, bgrStride, y, yStride, u, uStride, v, vStride); - else -#endif - Base::BgrToYuv420p(bgr, width, height, bgrStride, y, yStride, u, uStride, v, vStride); -} - -SIMD_API void SimdBgrToYuv422p(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * y, size_t yStride, uint8_t * u, size_t uStride, uint8_t * v, size_t vStride) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::BgrToYuv422p(bgr, width, height, bgrStride, y, yStride, u, uStride, v, vStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width >= Avx2::DA) - Avx2::BgrToYuv422p(bgr, width, height, bgrStride, y, yStride, u, uStride, v, vStride); - else -#endif -#ifdef SIMD_SSSE3_ENABLE - if(Ssse3::Enable && width >= Ssse3::DA) - Ssse3::BgrToYuv422p(bgr, width, height, bgrStride, y, yStride, u, uStride, v, vStride); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width >= Vmx::DA) - Vmx::BgrToYuv422p(bgr, width, height, bgrStride, y, yStride, u, uStride, v, vStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::DA) - Neon::BgrToYuv422p(bgr, width, height, bgrStride, y, yStride, u, uStride, v, vStride); - else -#endif - Base::BgrToYuv422p(bgr, width, height, bgrStride, y, yStride, u, uStride, v, vStride); -} - -SIMD_API void SimdBgrToYuv444p(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * y, size_t yStride, uint8_t * u, size_t uStride, uint8_t * v, size_t vStride) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::BgrToYuv444p(bgr, width, height, bgrStride, y, yStride, u, uStride, v, vStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width >= Avx2::A) - Avx2::BgrToYuv444p(bgr, width, height, bgrStride, y, yStride, u, uStride, v, vStride); - else -#endif -#ifdef SIMD_SSSE3_ENABLE - if(Ssse3::Enable && width >= Ssse3::A) - Ssse3::BgrToYuv444p(bgr, width, height, bgrStride, y, yStride, u, uStride, v, vStride); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width >= Vmx::A) - Vmx::BgrToYuv444p(bgr, width, height, bgrStride, y, yStride, u, uStride, v, vStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::BgrToYuv444p(bgr, width, height, bgrStride, y, yStride, u, uStride, v, vStride); - else -#endif - Base::BgrToYuv444p(bgr, width, height, bgrStride, y, yStride, u, uStride, v, vStride); -} - -SIMD_API void SimdBinarization(const uint8_t * src, size_t srcStride, size_t width, size_t height, - uint8_t value, uint8_t positive, uint8_t 
negative, uint8_t * dst, size_t dstStride, SimdCompareType compareType) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::Binarization(src, srcStride, width, height, value, positive, negative, dst, dstStride, compareType); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width >= Avx2::A) - Avx2::Binarization(src, srcStride, width, height, value, positive, negative, dst, dstStride, compareType); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && width >= Sse2::A) - Sse2::Binarization(src, srcStride, width, height, value, positive, negative, dst, dstStride, compareType); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width >= Vmx::A) - Vmx::Binarization(src, srcStride, width, height, value, positive, negative, dst, dstStride, compareType); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::Binarization(src, srcStride, width, height, value, positive, negative, dst, dstStride, compareType); - else -#endif - Base::Binarization(src, srcStride, width, height, value, positive, negative, dst, dstStride, compareType); -} - -SIMD_API void SimdAveragingBinarization(const uint8_t * src, size_t srcStride, size_t width, size_t height, - uint8_t value, size_t neighborhood, uint8_t threshold, uint8_t positive, uint8_t negative, - uint8_t * dst, size_t dstStride, SimdCompareType compareType) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::AveragingBinarization(src, srcStride, width, height, value, neighborhood, threshold, positive, negative, dst, dstStride, compareType); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width >= Avx2::A) - Avx2::AveragingBinarization(src, srcStride, width, height, value, neighborhood, threshold, positive, negative, dst, dstStride, compareType); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && width >= Sse2::A) - Sse2::AveragingBinarization(src, srcStride, width, height, value, neighborhood, threshold, positive, negative, dst, dstStride, compareType); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width >= Vmx::A) - Vmx::AveragingBinarization(src, srcStride, width, height, value, neighborhood, threshold, positive, negative, dst, dstStride, compareType); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::AveragingBinarization(src, srcStride, width, height, value, neighborhood, threshold, positive, negative, dst, dstStride, compareType); - else -#endif - Base::AveragingBinarization(src, srcStride, width, height, value, neighborhood, threshold, positive, negative, dst, dstStride, compareType); -} - -SIMD_API void SimdConditionalCount8u(const uint8_t * src, size_t stride, size_t width, size_t height, - uint8_t value, SimdCompareType compareType, uint32_t * count) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::ConditionalCount8u(src, stride, width, height, value, compareType, count); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width >= Avx2::A) - Avx2::ConditionalCount8u(src, stride, width, height, value, compareType, count); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && width >= Sse2::A) - Sse2::ConditionalCount8u(src, stride, width, height, value, compareType, count); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width >= Vmx::A) - Vmx::ConditionalCount8u(src, stride, width, height, value, compareType, count); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::ConditionalCount8u(src, stride, width, 
height, value, compareType, count); - else -#endif - Base::ConditionalCount8u(src, stride, width, height, value, compareType, count); -} - -SIMD_API void SimdConditionalCount16i(const uint8_t * src, size_t stride, size_t width, size_t height, - int16_t value, SimdCompareType compareType, uint32_t * count) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::ConditionalCount16i(src, stride, width, height, value, compareType, count); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width >= Avx2::HA) - Avx2::ConditionalCount16i(src, stride, width, height, value, compareType, count); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && width >= Sse2::HA) - Sse2::ConditionalCount16i(src, stride, width, height, value, compareType, count); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width >= Vmx::HA) - Vmx::ConditionalCount16i(src, stride, width, height, value, compareType, count); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::HA) - Neon::ConditionalCount16i(src, stride, width, height, value, compareType, count); - else -#endif - Base::ConditionalCount16i(src, stride, width, height, value, compareType, count); -} - -SIMD_API void SimdConditionalSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * mask, size_t maskStride, uint8_t value, SimdCompareType compareType, uint64_t * sum) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::ConditionalSum(src, srcStride, width, height, mask, maskStride, value, compareType, sum); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width >= Avx2::A) - Avx2::ConditionalSum(src, srcStride, width, height, mask, maskStride, value, compareType, sum); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && width >= Sse2::A) - Sse2::ConditionalSum(src, srcStride, width, height, mask, maskStride, value, compareType, sum); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width >= Vmx::A) - Vmx::ConditionalSum(src, srcStride, width, height, mask, maskStride, value, compareType, sum); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::ConditionalSum(src, srcStride, width, height, mask, maskStride, value, compareType, sum); - else -#endif - Base::ConditionalSum(src, srcStride, width, height, mask, maskStride, value, compareType, sum); -} - -SIMD_API void SimdConditionalSquareSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * mask, size_t maskStride, uint8_t value, SimdCompareType compareType, uint64_t * sum) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::ConditionalSquareSum(src, srcStride, width, height, mask, maskStride, value, compareType, sum); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width >= Avx2::A) - Avx2::ConditionalSquareSum(src, srcStride, width, height, mask, maskStride, value, compareType, sum); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && width >= Sse2::A) - Sse2::ConditionalSquareSum(src, srcStride, width, height, mask, maskStride, value, compareType, sum); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width >= Vmx::A) - Vmx::ConditionalSquareSum(src, srcStride, width, height, mask, maskStride, value, compareType, sum); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::ConditionalSquareSum(src, srcStride, width, height, mask, maskStride, value, compareType, sum); - else -#endif - Base::ConditionalSquareSum(src, 
srcStride, width, height, mask, maskStride, value, compareType, sum); -} - -SIMD_API void SimdConditionalSquareGradientSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * mask, size_t maskStride, uint8_t value, SimdCompareType compareType, uint64_t * sum) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::ConditionalSquareGradientSum(src, srcStride, width, height, mask, maskStride, value, compareType, sum); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width >= Avx2::A + 2) - Avx2::ConditionalSquareGradientSum(src, srcStride, width, height, mask, maskStride, value, compareType, sum); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && width >= Sse2::A + 2) - Sse2::ConditionalSquareGradientSum(src, srcStride, width, height, mask, maskStride, value, compareType, sum); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width >= Vmx::A + 2) - Vmx::ConditionalSquareGradientSum(src, srcStride, width, height, mask, maskStride, value, compareType, sum); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A + 2) - Neon::ConditionalSquareGradientSum(src, srcStride, width, height, mask, maskStride, value, compareType, sum); - else -#endif - Base::ConditionalSquareGradientSum(src, srcStride, width, height, mask, maskStride, value, compareType, sum); -} - -SIMD_API void SimdConditionalFill(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t threshold, SimdCompareType compareType, uint8_t value, uint8_t * dst, size_t dstStride) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::ConditionalFill(src, srcStride, width, height, threshold, compareType, value, dst, dstStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if (Avx2::Enable && width >= Avx2::A) - Avx2::ConditionalFill(src, srcStride, width, height, threshold, compareType, value, dst, dstStride); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if (Sse2::Enable && width >= Sse2::A) - Sse2::ConditionalFill(src, srcStride, width, height, threshold, compareType, value, dst, dstStride); - else -#endif -#ifdef SIMD_VMX_ENABLE - if (Vmx::Enable && width >= Vmx::A) - Vmx::ConditionalFill(src, srcStride, width, height, threshold, compareType, value, dst, dstStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::ConditionalFill(src, srcStride, width, height, threshold, compareType, value, dst, dstStride); - else -#endif - Base::ConditionalFill(src, srcStride, width, height, threshold, compareType, value, dst, dstStride); -} - -SIMD_API void SimdCopy(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t pixelSize, uint8_t * dst, size_t dstStride) -{ - Base::Copy(src, srcStride, width, height, pixelSize, dst, dstStride); -} - -SIMD_API void SimdCopyFrame(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t pixelSize, - size_t frameLeft, size_t frameTop, size_t frameRight, size_t frameBottom, uint8_t * dst, size_t dstStride) -{ - Base::CopyFrame(src, srcStride, width, height, pixelSize, frameLeft, frameTop, frameRight, frameBottom, dst, dstStride); -} - -SIMD_API void SimdDeinterleaveUv(const uint8_t * uv, size_t uvStride, size_t width, size_t height, - uint8_t * u, size_t uStride, uint8_t * v, size_t vStride) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::DeinterleaveUv(uv, uvStride, width, height, u, uStride, v, vStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width >= Avx2::A) - 
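/* [Editor's note] Every wrapper in this file follows the cascade visible here:
   each branch exists only if the matching SIMD_*_ENABLE macro was defined at
   build time, is taken at runtime only if the CPU reports the feature
   (Avx2::Enable etc.) and the row is wide enough to fill at least one vector
   (A = vector width in bytes, HA = A/2, DA = 2*A, F = floats per vector), and
   otherwise control falls through to the portable Base:: implementation. A
   minimal sketch of the same idea; detectAvx2, kernelAvx2, kernelBase and
   WidthAvx2 are hypothetical names, not library symbols:

       #include <cstddef>
       #include <cstdint>

       bool detectAvx2();                        // assumed CPUID probe
       void kernelAvx2(const uint8_t*, size_t);  // assumed wide path
       void kernelBase(const uint8_t*, size_t);  // assumed scalar fallback

       const size_t WidthAvx2 = 32;              // bytes in a 256-bit vector

       void dispatch(const uint8_t* src, size_t width)
       {
       #ifdef SIMD_AVX2_ENABLE
           if (detectAvx2() && width >= WidthAvx2)
               kernelAvx2(src, width);           // feature + size check
           else
       #endif
               kernelBase(src, width);           // always-available fallback
       }

   The dangling `else` left before each #endif is what chains the optional
   branches into one if/else ladder when several instruction sets are compiled
   in at once. */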
Avx2::DeinterleaveUv(uv, uvStride, width, height, u, uStride, v, vStride); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && width >= Sse2::A) - Sse2::DeinterleaveUv(uv, uvStride, width, height, u, uStride, v, vStride); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width >= Vmx::A) - Vmx::DeinterleaveUv(uv, uvStride, width, height, u, uStride, v, vStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::DeinterleaveUv(uv, uvStride, width, height, u, uStride, v, vStride); - else -#endif - Base::DeinterleaveUv(uv, uvStride, width, height, u, uStride, v, vStride); -} - -SIMD_API void SimdDeinterleaveBgr(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, - uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::DeinterleaveBgr(bgr, bgrStride, width, height, b, bStride, g, gStride, r, rStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if (Avx2::Enable && width >= Avx2::A) - Avx2::DeinterleaveBgr(bgr, bgrStride, width, height, b, bStride, g, gStride, r, rStride); - else -#endif -#ifdef SIMD_SSSE3_ENABLE - if (Ssse3::Enable && width >= Ssse3::A) - Ssse3::DeinterleaveBgr(bgr, bgrStride, width, height, b, bStride, g, gStride, r, rStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::DeinterleaveBgr(bgr, bgrStride, width, height, b, bStride, g, gStride, r, rStride); - else -#endif - Base::DeinterleaveBgr(bgr, bgrStride, width, height, b, bStride, g, gStride, r, rStride); -} - -SIMD_API void SimdDeinterleaveBgra(const uint8_t * bgra, size_t bgraStride, size_t width, size_t height, - uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride, uint8_t * a, size_t aStride) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::DeinterleaveBgra(bgra, bgraStride, width, height, b, bStride, g, gStride, r, rStride, a, aStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if (Avx2::Enable && width >= Avx2::A) - Avx2::DeinterleaveBgra(bgra, bgraStride, width, height, b, bStride, g, gStride, r, rStride, a, aStride); - else -#endif -#ifdef SIMD_SSSE3_ENABLE - if (Ssse3::Enable && width >= Ssse3::A) - Ssse3::DeinterleaveBgra(bgra, bgraStride, width, height, b, bStride, g, gStride, r, rStride, a, aStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::DeinterleaveBgra(bgra, bgraStride, width, height, b, bStride, g, gStride, r, rStride, a, aStride); - else -#endif - Base::DeinterleaveBgra(bgra, bgraStride, width, height, b, bStride, g, gStride, r, rStride, a, aStride); -} - -SIMD_API void * SimdDetectionLoadStringXml(char * xml) -{ - return Base::DetectionLoadStringXml(xml); -} - -SIMD_API void * SimdDetectionLoadA(const char * path) -{ - return Base::DetectionLoadA(path); -} - -SIMD_API void SimdDetectionInfo(const void * data, size_t * width, size_t * height, SimdDetectionInfoFlags * flags) -{ - Base::DetectionInfo(data, width, height, flags); -} - -SIMD_API void * SimdDetectionInit(const void * data, uint8_t * sum, size_t sumStride, size_t width, size_t height, - uint8_t * sqsum, size_t sqsumStride, uint8_t * tilted, size_t tiltedStride, int throughColumn, int int16) -{ - return Base::DetectionInit(data, sum, sumStride, width, height, sqsum, sqsumStride, tilted, tiltedStride, throughColumn, int16); -} - -SIMD_API void SimdDetectionPrepare(void * hid) -{ - Base::DetectionPrepare(hid); -} - -SIMD_API void 
SimdDetectionHaarDetect32fp(const void * hid, const uint8_t * mask, size_t maskStride, - ptrdiff_t left, ptrdiff_t top, ptrdiff_t right, ptrdiff_t bottom, uint8_t * dst, size_t dstStride) -{ - size_t width = right - left; -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::DetectionHaarDetect32fp(hid, mask, maskStride, left, top, right, bottom, dst, dstStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if (Avx2::Enable && width >= Avx2::A) - Avx2::DetectionHaarDetect32fp(hid, mask, maskStride, left, top, right, bottom, dst, dstStride); - else -#endif -#ifdef SIMD_SSE41_ENABLE - if (Sse41::Enable && width >= Sse41::A) - Sse41::DetectionHaarDetect32fp(hid, mask, maskStride, left, top, right, bottom, dst, dstStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::DetectionHaarDetect32fp(hid, mask, maskStride, left, top, right, bottom, dst, dstStride); - else -#endif - Base::DetectionHaarDetect32fp(hid, mask, maskStride, left, top, right, bottom, dst, dstStride); -} - -SIMD_API void SimdDetectionHaarDetect32fi(const void * hid, const uint8_t * mask, size_t maskStride, - ptrdiff_t left, ptrdiff_t top, ptrdiff_t right, ptrdiff_t bottom, uint8_t * dst, size_t dstStride) -{ - size_t width = right - left; -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::DetectionHaarDetect32fi(hid, mask, maskStride, left, top, right, bottom, dst, dstStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if (Avx2::Enable && width >= Avx2::A) - Avx2::DetectionHaarDetect32fi(hid, mask, maskStride, left, top, right, bottom, dst, dstStride); - else -#endif -#ifdef SIMD_SSE41_ENABLE - if (Sse41::Enable && width >= Sse41::A) - Sse41::DetectionHaarDetect32fi(hid, mask, maskStride, left, top, right, bottom, dst, dstStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::DetectionHaarDetect32fi(hid, mask, maskStride, left, top, right, bottom, dst, dstStride); - else -#endif - Base::DetectionHaarDetect32fi(hid, mask, maskStride, left, top, right, bottom, dst, dstStride); -} - -SIMD_API void SimdDetectionLbpDetect32fp(const void * hid, const uint8_t * mask, size_t maskStride, - ptrdiff_t left, ptrdiff_t top, ptrdiff_t right, ptrdiff_t bottom, uint8_t * dst, size_t dstStride) -{ - size_t width = right - left; -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::DetectionLbpDetect32fp(hid, mask, maskStride, left, top, right, bottom, dst, dstStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if (Avx2::Enable && width >= Avx2::A) - Avx2::DetectionLbpDetect32fp(hid, mask, maskStride, left, top, right, bottom, dst, dstStride); - else -#endif -#ifdef SIMD_SSE41_ENABLE - if (Sse41::Enable && width >= Sse41::A) - Sse41::DetectionLbpDetect32fp(hid, mask, maskStride, left, top, right, bottom, dst, dstStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::DetectionLbpDetect32fp(hid, mask, maskStride, left, top, right, bottom, dst, dstStride); - else -#endif - Base::DetectionLbpDetect32fp(hid, mask, maskStride, left, top, right, bottom, dst, dstStride); -} - -SIMD_API void SimdDetectionLbpDetect32fi(const void * hid, const uint8_t * mask, size_t maskStride, - ptrdiff_t left, ptrdiff_t top, ptrdiff_t right, ptrdiff_t bottom, uint8_t * dst, size_t dstStride) -{ - size_t width = right - left; -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::DetectionLbpDetect32fi(hid, mask, maskStride, left, top, right, bottom, dst, dstStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - 
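/* [Editor's note] Unlike the image-wide wrappers above, the detection entry
   points receive a rectangle (left, top, right, bottom) and derive
   `size_t width = right - left;` before dispatching, so the SIMD path is
   chosen by the width of the scanned region, not of the whole frame. A hedged
   usage sketch of the surrounding API; the classifier file name and the
   omitted init arguments are placeholders, not values this file prescribes:

       void* data = SimdDetectionLoadA("haar_face.xml");  // hypothetical path
       size_t w = 0, h = 0;
       SimdDetectionInfoFlags flags;
       SimdDetectionInfo(data, &w, &h, &flags);
       // ... SimdDetectionInit(...) to get `hid`, then SimdDetectionPrepare(hid),
       // then per scanned window:
       // SimdDetectionHaarDetect32fp(hid, mask, maskStride,
       //                             left, top, right, bottom, dst, dstStride);
   */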
if (Avx2::Enable && width >= Avx2::A) - Avx2::DetectionLbpDetect32fi(hid, mask, maskStride, left, top, right, bottom, dst, dstStride); - else -#endif -#ifdef SIMD_SSE41_ENABLE - if (Sse41::Enable && width >= Sse41::A) - Sse41::DetectionLbpDetect32fi(hid, mask, maskStride, left, top, right, bottom, dst, dstStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::DetectionLbpDetect32fi(hid, mask, maskStride, left, top, right, bottom, dst, dstStride); - else -#endif - Base::DetectionLbpDetect32fi(hid, mask, maskStride, left, top, right, bottom, dst, dstStride); -} - -SIMD_API void SimdDetectionLbpDetect16ip(const void * hid, const uint8_t * mask, size_t maskStride, - ptrdiff_t left, ptrdiff_t top, ptrdiff_t right, ptrdiff_t bottom, uint8_t * dst, size_t dstStride) -{ - size_t width = right - left; -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::DetectionLbpDetect16ip(hid, mask, maskStride, left, top, right, bottom, dst, dstStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if (Avx2::Enable && width >= Avx2::A) - Avx2::DetectionLbpDetect16ip(hid, mask, maskStride, left, top, right, bottom, dst, dstStride); - else -#endif -#ifdef SIMD_SSE41_ENABLE - if (Sse41::Enable && width >= Sse41::A) - Sse41::DetectionLbpDetect16ip(hid, mask, maskStride, left, top, right, bottom, dst, dstStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::DetectionLbpDetect16ip(hid, mask, maskStride, left, top, right, bottom, dst, dstStride); - else -#endif - Base::DetectionLbpDetect16ip(hid, mask, maskStride, left, top, right, bottom, dst, dstStride); -} - -SIMD_API void SimdDetectionLbpDetect16ii(const void * hid, const uint8_t * mask, size_t maskStride, - ptrdiff_t left, ptrdiff_t top, ptrdiff_t right, ptrdiff_t bottom, uint8_t * dst, size_t dstStride) -{ - size_t width = right - left; -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::DetectionLbpDetect16ii(hid, mask, maskStride, left, top, right, bottom, dst, dstStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if (Avx2::Enable && width >= Avx2::A) - Avx2::DetectionLbpDetect16ii(hid, mask, maskStride, left, top, right, bottom, dst, dstStride); - else -#endif -#ifdef SIMD_SSE41_ENABLE - if (Sse41::Enable && width >= Sse41::A) - Sse41::DetectionLbpDetect16ii(hid, mask, maskStride, left, top, right, bottom, dst, dstStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::DetectionLbpDetect16ii(hid, mask, maskStride, left, top, right, bottom, dst, dstStride); - else -#endif - Base::DetectionLbpDetect16ii(hid, mask, maskStride, left, top, right, bottom, dst, dstStride); -} - -SIMD_API void SimdEdgeBackgroundGrowRangeSlow(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * background, size_t backgroundStride) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::EdgeBackgroundGrowRangeSlow(value, valueStride, width, height, background, backgroundStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width >= Avx2::A) - Avx2::EdgeBackgroundGrowRangeSlow(value, valueStride, width, height, background, backgroundStride); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && width >= Sse2::A) - Sse2::EdgeBackgroundGrowRangeSlow(value, valueStride, width, height, background, backgroundStride); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width >= Vmx::A) - Vmx::EdgeBackgroundGrowRangeSlow(value, valueStride, width, height, background, 
backgroundStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::EdgeBackgroundGrowRangeSlow(value, valueStride, width, height, background, backgroundStride); - else -#endif - Base::EdgeBackgroundGrowRangeSlow(value, valueStride, width, height, background, backgroundStride); -} - -SIMD_API void SimdEdgeBackgroundGrowRangeFast(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * background, size_t backgroundStride) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::EdgeBackgroundGrowRangeFast(value, valueStride, width, height, background, backgroundStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width >= Avx2::A) - Avx2::EdgeBackgroundGrowRangeFast(value, valueStride, width, height, background, backgroundStride); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && width >= Sse2::A) - Sse2::EdgeBackgroundGrowRangeFast(value, valueStride, width, height, background, backgroundStride); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width >= Vmx::A) - Vmx::EdgeBackgroundGrowRangeFast(value, valueStride, width, height, background, backgroundStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::EdgeBackgroundGrowRangeFast(value, valueStride, width, height, background, backgroundStride); - else -#endif - Base::EdgeBackgroundGrowRangeFast(value, valueStride, width, height, background, backgroundStride); -} - -SIMD_API void SimdEdgeBackgroundIncrementCount(const uint8_t * value, size_t valueStride, size_t width, size_t height, - const uint8_t * backgroundValue, size_t backgroundValueStride, uint8_t * backgroundCount, size_t backgroundCountStride) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::EdgeBackgroundIncrementCount(value, valueStride, width, height, backgroundValue, backgroundValueStride, backgroundCount, backgroundCountStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width >= Avx2::A) - Avx2::EdgeBackgroundIncrementCount(value, valueStride, width, height, backgroundValue, backgroundValueStride, backgroundCount, backgroundCountStride); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && width >= Sse2::A) - Sse2::EdgeBackgroundIncrementCount(value, valueStride, width, height, backgroundValue, backgroundValueStride, backgroundCount, backgroundCountStride); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width >= Vmx::A) - Vmx::EdgeBackgroundIncrementCount(value, valueStride, width, height, backgroundValue, backgroundValueStride, backgroundCount, backgroundCountStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::EdgeBackgroundIncrementCount(value, valueStride, width, height, backgroundValue, backgroundValueStride, backgroundCount, backgroundCountStride); - else -#endif - Base::EdgeBackgroundIncrementCount(value, valueStride, width, height, backgroundValue, backgroundValueStride, backgroundCount, backgroundCountStride); -} - -SIMD_API void SimdEdgeBackgroundAdjustRange(uint8_t * backgroundCount, size_t backgroundCountStride, size_t width, size_t height, - uint8_t * backgroundValue, size_t backgroundValueStride, uint8_t threshold) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::EdgeBackgroundAdjustRange(backgroundCount, backgroundCountStride, width, height, backgroundValue, backgroundValueStride, threshold); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width >= Avx2::A) - 
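/* [Editor's note] The EdgeBackground* family maintains a slowly adapting
   edge-background model. A typical per-frame loop, inferred from the function
   set itself rather than prescribed by this file, interleaves fast growth with
   a periodic, count-gated adjustment:

       // every frame: widen the model toward the current edge image
       SimdEdgeBackgroundGrowRangeFast(value, valueStride, width, height,
                                       background, backgroundStride);
       // every frame: count pixels that keep disagreeing with the model
       SimdEdgeBackgroundIncrementCount(value, valueStride, width, height,
                                        backgroundValue, backgroundValueStride,
                                        backgroundCount, backgroundCountStride);
       // every N frames: fold counts above `threshold` back into the model
       SimdEdgeBackgroundAdjustRange(backgroundCount, backgroundCountStride,
                                     width, height, backgroundValue,
                                     backgroundValueStride, threshold);
   */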
Avx2::EdgeBackgroundAdjustRange(backgroundCount, backgroundCountStride, width, height, backgroundValue, backgroundValueStride, threshold); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && width >= Sse2::A) - Sse2::EdgeBackgroundAdjustRange(backgroundCount, backgroundCountStride, width, height, backgroundValue, backgroundValueStride, threshold); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width >= Vmx::A) - Vmx::EdgeBackgroundAdjustRange(backgroundCount, backgroundCountStride, width, height, backgroundValue, backgroundValueStride, threshold); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::EdgeBackgroundAdjustRange(backgroundCount, backgroundCountStride, width, height, backgroundValue, backgroundValueStride, threshold); - else -#endif - Base::EdgeBackgroundAdjustRange(backgroundCount, backgroundCountStride, width, height, backgroundValue, backgroundValueStride, threshold); -} - -SIMD_API void SimdEdgeBackgroundAdjustRangeMasked(uint8_t * backgroundCount, size_t backgroundCountStride, size_t width, size_t height, - uint8_t * backgroundValue, size_t backgroundValueStride, uint8_t threshold, const uint8_t * mask, size_t maskStride) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::EdgeBackgroundAdjustRangeMasked(backgroundCount, backgroundCountStride, width, height, backgroundValue, backgroundValueStride, threshold, mask, maskStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width >= Avx2::A) - Avx2::EdgeBackgroundAdjustRangeMasked(backgroundCount, backgroundCountStride, width, height, backgroundValue, backgroundValueStride, threshold, mask, maskStride); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && width >= Sse2::A) - Sse2::EdgeBackgroundAdjustRangeMasked(backgroundCount, backgroundCountStride, width, height, backgroundValue, backgroundValueStride, threshold, mask, maskStride); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width >= Vmx::A) - Vmx::EdgeBackgroundAdjustRangeMasked(backgroundCount, backgroundCountStride, width, height, backgroundValue, backgroundValueStride, threshold, mask, maskStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::EdgeBackgroundAdjustRangeMasked(backgroundCount, backgroundCountStride, width, height, backgroundValue, backgroundValueStride, threshold, mask, maskStride); - else -#endif - Base::EdgeBackgroundAdjustRangeMasked(backgroundCount, backgroundCountStride, width, height, backgroundValue, backgroundValueStride, threshold, mask, maskStride); -} - -SIMD_API void SimdEdgeBackgroundShiftRange(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * background, size_t backgroundStride) -{ - Base::EdgeBackgroundShiftRange(value, valueStride, width, height, background, backgroundStride); -} - -SIMD_API void SimdEdgeBackgroundShiftRangeMasked(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * background, size_t backgroundStride, const uint8_t * mask, size_t maskStride) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::EdgeBackgroundShiftRangeMasked(value, valueStride, width, height, background, backgroundStride, mask, maskStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width >= Avx2::A) - Avx2::EdgeBackgroundShiftRangeMasked(value, valueStride, width, height, background, backgroundStride, mask, maskStride); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && width >= Sse2::A) - 
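/* [Editor's note] The *Masked variants restrict the update to pixels the
   caller selects; a mask is commonly produced by one of the comparison
   routines earlier in this file. A minimal sketch, assuming a nonzero mask
   byte marks pixels to update and SimdCompareGreater names the comparison in
   SimdCompareType (both assumptions, not guaranteed by this hunk):

       SimdBinarization(src, srcStride, width, height,
                        128, 0xFF, 0x00, mask, maskStride,
                        SimdCompareGreater);
       SimdEdgeBackgroundShiftRangeMasked(value, valueStride, width, height,
                                          background, backgroundStride,
                                          mask, maskStride);
   */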
Sse2::EdgeBackgroundShiftRangeMasked(value, valueStride, width, height, background, backgroundStride, mask, maskStride); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width >= Vmx::A) - Vmx::EdgeBackgroundShiftRangeMasked(value, valueStride, width, height, background, backgroundStride, mask, maskStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::EdgeBackgroundShiftRangeMasked(value, valueStride, width, height, background, backgroundStride, mask, maskStride); - else -#endif - Base::EdgeBackgroundShiftRangeMasked(value, valueStride, width, height, background, backgroundStride, mask, maskStride); -} - -SIMD_API void SimdFill(uint8_t * dst, size_t stride, size_t width, size_t height, size_t pixelSize, uint8_t value) -{ - Base::Fill(dst, stride, width, height, pixelSize, value); -} - -SIMD_API void SimdFillFrame(uint8_t * dst, size_t stride, size_t width, size_t height, size_t pixelSize, - size_t frameLeft, size_t frameTop, size_t frameRight, size_t frameBottom, uint8_t value) -{ - Base::FillFrame(dst, stride, width, height, pixelSize, frameLeft, frameTop, frameRight, frameBottom, value); -} - -SIMD_API void SimdFillBgr(uint8_t * dst, size_t stride, size_t width, size_t height, uint8_t blue, uint8_t green, uint8_t red) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::FillBgr(dst, stride, width, height, blue, green, red); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width >= Avx2::A) - Avx2::FillBgr(dst, stride, width, height, blue, green, red); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && width >= Sse2::A) - Sse2::FillBgr(dst, stride, width, height, blue, green, red); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width >= Vmx::A) - Vmx::FillBgr(dst, stride, width, height, blue, green, red); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::FillBgr(dst, stride, width, height, blue, green, red); - else -#endif - Base::FillBgr(dst, stride, width, height, blue, green, red); -} - -SIMD_API void SimdFillBgra(uint8_t * dst, size_t stride, size_t width, size_t height, uint8_t blue, uint8_t green, uint8_t red, uint8_t alpha) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::FillBgra(dst, stride, width, height, blue, green, red, alpha); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width >= Avx2::F) - Avx2::FillBgra(dst, stride, width, height, blue, green, red, alpha); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && width >= Sse2::F) - Sse2::FillBgra(dst, stride, width, height, blue, green, red, alpha); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width >= Vmx::F) - Vmx::FillBgra(dst, stride, width, height, blue, green, red, alpha); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::F) - Neon::FillBgra(dst, stride, width, height, blue, green, red, alpha); - else -#endif - Base::FillBgra(dst, stride, width, height, blue, green, red, alpha); -} - -SIMD_API void SimdFillPixel(uint8_t * dst, size_t stride, size_t width, size_t height, const uint8_t * pixel, size_t pixelSize) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::FillPixel(dst, stride, width, height, pixel, pixelSize); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if (Avx2::Enable && width >= Avx2::A) - Avx2::FillPixel(dst, stride, width, height, pixel, pixelSize); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if (Sse2::Enable && width >= Sse2::A) - Sse2::FillPixel(dst, stride, width, height, 
pixel, pixelSize); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::FillPixel(dst, stride, width, height, pixel, pixelSize); - else -#endif - Base::FillPixel(dst, stride, width, height, pixel, pixelSize); -} - -SIMD_API void SimdFill32f(float * dst, size_t size, const float * value) -{ - typedef void(*SimdFill32fPtr) (float * dst, size_t size, const float * value); - const static SimdFill32fPtr simdFill32f = SIMD_FUNC4(Fill32f, SIMD_AVX512F_FUNC, SIMD_AVX_FUNC, SIMD_SSE_FUNC, SIMD_NEON_FUNC); - - simdFill32f(dst, size, value); -} - -SIMD_API void SimdFloat32ToFloat16(const float * src, size_t size, uint16_t * dst) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::Float32ToFloat16(src, size, dst); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if (Avx2::Enable && size >= Avx2::F) - Avx2::Float32ToFloat16(src, size, dst); - else -#endif -#if defined(SIMD_NEON_ENABLE) && defined(SIMD_NEON_FP16_ENABLE) - if (Neon::Enable && size >= Neon::F) - Neon::Float32ToFloat16(src, size, dst); - else -#endif - Base::Float32ToFloat16(src, size, dst); -} - -SIMD_API void SimdFloat16ToFloat32(const uint16_t * src, size_t size, float * dst) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::Float16ToFloat32(src, size, dst); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if (Avx2::Enable && size >= Avx2::F) - Avx2::Float16ToFloat32(src, size, dst); - else -#endif -#if defined(SIMD_NEON_ENABLE) && defined(SIMD_NEON_FP16_ENABLE) - if (Neon::Enable && size >= Neon::F) - Neon::Float16ToFloat32(src, size, dst); - else -#endif - Base::Float16ToFloat32(src, size, dst); -} - -SIMD_API void SimdSquaredDifferenceSum16f(const uint16_t * a, const uint16_t * b, size_t size, float * sum) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::SquaredDifferenceSum16f(a, b, size, sum); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if (Avx2::Enable && size >= Avx2::F) - Avx2::SquaredDifferenceSum16f(a, b, size, sum); - else -#endif -#if defined(SIMD_NEON_ENABLE) && defined(SIMD_NEON_FP16_ENABLE) - if (Neon::Enable && size >= Neon::F) - Neon::SquaredDifferenceSum16f(a, b, size, sum); - else -#endif - Base::SquaredDifferenceSum16f(a, b, size, sum); -} - -SIMD_API void SimdCosineDistance16f(const uint16_t * a, const uint16_t * b, size_t size, float * distance) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::CosineDistance16f(a, b, size, distance); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if (Avx2::Enable && size >= Avx2::F) - Avx2::CosineDistance16f(a, b, size, distance); - else -#endif -#if defined(SIMD_NEON_ENABLE) && defined(SIMD_NEON_FP16_ENABLE) - if (Neon::Enable && size >= Neon::F) - Neon::CosineDistance16f(a, b, size, distance); - else -#endif - Base::CosineDistance16f(a, b, size, distance); -} - -SIMD_API void SimdCosineDistancesMxNa16f(size_t M, size_t N, size_t K, const uint16_t * const * A, const uint16_t * const * B, float * distances) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable && K >= Avx512bw::F) - Avx512bw::CosineDistancesMxNa16f(M, N, K, A, B, distances); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if (Avx2::Enable && K >= Avx2::F) - Avx2::CosineDistancesMxNa16f(M, N, K, A, B, distances); - else -#endif -#if defined(SIMD_NEON_ENABLE) && defined(SIMD_NEON_FP16_ENABLE) - if (Neon::Enable && K >= Neon::F) - Neon::CosineDistancesMxNa16f(M, N, K, A, B, distances); - else -#endif - Base::CosineDistancesMxNa16f(M, N, K, A, B, distances); -} - -SIMD_API void SimdFloat32ToUint8(const float * src, size_t size, const float 
* lower, const float * upper, uint8_t * dst) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::Float32ToUint8(src, size, lower, upper, dst); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if (Avx2::Enable && size >= Avx2::A) - Avx2::Float32ToUint8(src, size, lower, upper, dst); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if (Sse2::Enable && size >= Sse2::A) - Sse2::Float32ToUint8(src, size, lower, upper, dst); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && size >= Neon::A) - Neon::Float32ToUint8(src, size, lower, upper, dst); - else -#endif - Base::Float32ToUint8(src, size, lower, upper, dst); -} - -SIMD_API void SimdUint8ToFloat32(const uint8_t * src, size_t size, const float * lower, const float * upper, float * dst) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::Uint8ToFloat32(src, size, lower, upper, dst); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if (Avx2::Enable && size >= Avx2::HA) - Avx2::Uint8ToFloat32(src, size, lower, upper, dst); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if (Sse2::Enable && size >= Sse2::A) - Sse2::Uint8ToFloat32(src, size, lower, upper, dst); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && size >= Neon::A) - Neon::Uint8ToFloat32(src, size, lower, upper, dst); - else -#endif - Base::Uint8ToFloat32(src, size, lower, upper, dst); -} - -SIMD_API void SimdCosineDistance32f(const float * a, const float * b, size_t size, float * distance) -{ - typedef void(*SimdCosineDistance32fPtr) (const float * a, const float * b, size_t size, float * distance); - const static SimdCosineDistance32fPtr simdCosineDistance32f = SIMD_FUNC5(CosineDistance32f, SIMD_AVX512BW_FUNC, SIMD_AVX2_FUNC, SIMD_AVX_FUNC, SIMD_SSE_FUNC, SIMD_NEON_FUNC); - - simdCosineDistance32f(a, b, size, distance); -} - -SIMD_API void SimdGaussianBlur3x3(const uint8_t * src, size_t srcStride, size_t width, size_t height, - size_t channelCount, uint8_t * dst, size_t dstStride) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable && (width - 1)*channelCount >= Avx512bw::A) - Avx512bw::GaussianBlur3x3(src, srcStride, width, height, channelCount, dst, dstStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && (width - 1)*channelCount >= Avx2::A) - Avx2::GaussianBlur3x3(src, srcStride, width, height, channelCount, dst, dstStride); - else -#endif -#ifdef SIMD_SSSE3_ENABLE - if(Ssse3::Enable && (width - 1)*channelCount >= Ssse3::A) - Ssse3::GaussianBlur3x3(src, srcStride, width, height, channelCount, dst, dstStride); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && (width - 1)*channelCount >= Sse2::A) - Sse2::GaussianBlur3x3(src, srcStride, width, height, channelCount, dst, dstStride); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && (width - 1)*channelCount >= Vmx::A) - Vmx::GaussianBlur3x3(src, srcStride, width, height, channelCount, dst, dstStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && (width - 1)*channelCount >= Neon::A) - Neon::GaussianBlur3x3(src, srcStride, width, height, channelCount, dst, dstStride); - else -#endif - Base::GaussianBlur3x3(src, srcStride, width, height, channelCount, dst, dstStride); -} - -typedef void(*SimdGemm32fPtr) (size_t M, size_t N, size_t K, const float * alpha, const float * A, size_t lda, const float * B, size_t ldb, const float * beta, float * C, size_t ldc); - -SIMD_API void SimdGemm32fNN(size_t M, size_t N, size_t K, const float * alpha, const float * A, size_t lda, const float * B, size_t ldb, const float * beta, float * C, size_t ldc) -{ - const static 
SimdGemm32fPtr simdGemm32fNN = SIMD_FUNC5(Gemm32fNN, SIMD_AVX512F_FUNC, SIMD_AVX2_FUNC, SIMD_AVX_FUNC, SIMD_SSE_FUNC, SIMD_NEON_FUNC); - - simdGemm32fNN(M, N, K, alpha, A, lda, B, ldb, beta, C, ldc); -} - -SIMD_API void SimdGemm32fNT(size_t M, size_t N, size_t K, const float * alpha, const float * A, size_t lda, const float * B, size_t ldb, const float * beta, float * C, size_t ldc) -{ - const static SimdGemm32fPtr simdGemm32fNT = SIMD_FUNC5(Gemm32fNT, SIMD_AVX512F_FUNC, SIMD_AVX2_FUNC, SIMD_AVX_FUNC, SIMD_SSE3_FUNC, SIMD_NEON_FUNC); - - simdGemm32fNT(M, N, K, alpha, A, lda, B, ldb, beta, C, ldc); -} - -SIMD_API void SimdGrayToBgr(const uint8_t * gray, size_t width, size_t height, size_t grayStride, uint8_t * bgr, size_t bgrStride) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::GrayToBgr(gray, width, height, grayStride, bgr, bgrStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width >= Avx2::A) - Avx2::GrayToBgr(gray, width, height, grayStride, bgr, bgrStride); - else -#endif -#ifdef SIMD_SSSE3_ENABLE - if(Ssse3::Enable && width >= Ssse3::A) - Ssse3::GrayToBgr(gray, width, height, grayStride, bgr, bgrStride); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width >= Vmx::A) - Vmx::GrayToBgr(gray, width, height, grayStride, bgr, bgrStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::GrayToBgr(gray, width, height, grayStride, bgr, bgrStride); - else -#endif - Base::GrayToBgr(gray, width, height, grayStride, bgr, bgrStride); -} - -SIMD_API void SimdGrayToBgra(const uint8_t * gray, size_t width, size_t height, size_t grayStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::GrayToBgra(gray, width, height, grayStride, bgra, bgraStride, alpha); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width >= Avx2::A) - Avx2::GrayToBgra(gray, width, height, grayStride, bgra, bgraStride, alpha); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && width >= Sse2::A) - Sse2::GrayToBgra(gray, width, height, grayStride, bgra, bgraStride, alpha); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width >= Vmx::A) - Vmx::GrayToBgra(gray, width, height, grayStride, bgra, bgraStride, alpha); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::GrayToBgra(gray, width, height, grayStride, bgra, bgraStride, alpha); - else -#endif - Base::GrayToBgra(gray, width, height, grayStride, bgra, bgraStride, alpha); -} - -SIMD_API void SimdAbsSecondDerivativeHistogram(const uint8_t *src, size_t width, size_t height, size_t stride, size_t step, size_t indent, uint32_t * histogram) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable && width >= Avx512bw::A + 2 * indent) - Avx512bw::AbsSecondDerivativeHistogram(src, width, height, stride, step, indent, histogram); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width >= Avx2::A + 2*indent) - Avx2::AbsSecondDerivativeHistogram(src, width, height, stride, step, indent, histogram); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && width >= Sse2::A + 2*indent) - Sse2::AbsSecondDerivativeHistogram(src, width, height, stride, step, indent, histogram); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width >= Vmx::A + 2*indent) - Vmx::AbsSecondDerivativeHistogram(src, width, height, stride, step, indent, histogram); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A + 2 * indent) - 
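/* [Editor's note] The guard in SimdAbsSecondDerivativeHistogram below is wider
   than the usual `width >= A` because the second-difference of pixels `step`
   apart skips `indent` border columns on each side, so only width - 2*indent
   interior columns are available to the vector path. The condition

       width >= Neon::A + 2 * indent

   is just `width - 2 * indent >= Neon::A` rearranged so the unsigned
   subtraction can never wrap when the image is narrow. */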
Neon::AbsSecondDerivativeHistogram(src, width, height, stride, step, indent, histogram); - else -#endif - Base::AbsSecondDerivativeHistogram(src, width, height, stride, step, indent, histogram); -} - -SIMD_API void SimdHistogram(const uint8_t *src, size_t width, size_t height, size_t stride, uint32_t * histogram) -{ - Base::Histogram(src, width, height, stride, histogram); -} - -SIMD_API void SimdHistogramMasked(const uint8_t *src, size_t srcStride, size_t width, size_t height, - const uint8_t * mask, size_t maskStride, uint8_t index, uint32_t * histogram) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::HistogramMasked(src, srcStride, width, height, mask, maskStride, index, histogram); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width >= Avx2::A) - Avx2::HistogramMasked(src, srcStride, width, height, mask, maskStride, index, histogram); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && width >= Sse2::A) - Sse2::HistogramMasked(src, srcStride, width, height, mask, maskStride, index, histogram); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width >= Vmx::A) - Vmx::HistogramMasked(src, srcStride, width, height, mask, maskStride, index, histogram); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::HistogramMasked(src, srcStride, width, height, mask, maskStride, index, histogram); - else -#endif - Base::HistogramMasked(src, srcStride, width, height, mask, maskStride, index, histogram); -} - -SIMD_API void SimdHistogramConditional(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * mask, size_t maskStride, uint8_t value, SimdCompareType compareType, uint32_t * histogram) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::HistogramConditional(src, srcStride, width, height, mask, maskStride, value, compareType, histogram); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if (Avx2::Enable && width >= Avx2::A) - Avx2::HistogramConditional(src, srcStride, width, height, mask, maskStride, value, compareType, histogram); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if (Sse2::Enable && width >= Sse2::A) - Sse2::HistogramConditional(src, srcStride, width, height, mask, maskStride, value, compareType, histogram); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::HistogramConditional(src, srcStride, width, height, mask, maskStride, value, compareType, histogram); - else -#endif - Base::HistogramConditional(src, srcStride, width, height, mask, maskStride, value, compareType, histogram); -} - -SIMD_API void SimdNormalizedColors(const uint32_t * histogram, uint8_t * colors) -{ - Base::NormalizedColors(histogram, colors); -} - -SIMD_API void SimdChangeColors(const uint8_t * src, size_t srcStride, size_t width, size_t height, const uint8_t * colors, uint8_t * dst, size_t dstStride) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable && width >= Avx512bw::HA) - Avx512bw::ChangeColors(src, srcStride, width, height, colors, dst, dstStride); - else -#endif - Base::ChangeColors(src, srcStride, width, height, colors, dst, dstStride); -} - -SIMD_API void SimdNormalizeHistogram(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable && width >= Avx512bw::HA) - Avx512bw::NormalizeHistogram(src, srcStride, width, height, dst, dstStride); - else -#endif - Base::NormalizeHistogram(src, srcStride, width, height, dst, dstStride); -} - -SIMD_API void 
SimdHogDirectionHistograms(const uint8_t * src, size_t stride, size_t width, size_t height, - size_t cellX, size_t cellY, size_t quantization, float * histograms) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable && width >= Avx512bw::HA + 2) - Avx512bw::HogDirectionHistograms(src, stride, width, height, cellX, cellY, quantization, histograms); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width >= Avx2::A + 2) - Avx2::HogDirectionHistograms(src, stride, width, height, cellX, cellY, quantization, histograms); - else -#endif -#ifdef SIMD_SSE41_ENABLE - if (Sse41::Enable && width >= Sse41::A + 2) - Sse41::HogDirectionHistograms(src, stride, width, height, cellX, cellY, quantization, histograms); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && width >= Sse2::A + 2) - Sse2::HogDirectionHistograms(src, stride, width, height, cellX, cellY, quantization, histograms); - else -#endif -#ifdef SIMD_VSX_ENABLE - if(Vsx::Enable && width >= Vsx::A + 2) - Vsx::HogDirectionHistograms(src, stride, width, height, cellX, cellY, quantization, histograms); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A + 2) - Neon::HogDirectionHistograms(src, stride, width, height, cellX, cellY, quantization, histograms); - else -#endif - Base::HogDirectionHistograms(src, stride, width, height, cellX, cellY, quantization, histograms); -} - -SIMD_API void SimdHogExtractFeatures(const uint8_t * src, size_t stride, size_t width, size_t height, float * features) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable && width >= Avx512bw::HA + 2) - Avx512bw::HogExtractFeatures(src, stride, width, height, features); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if (Avx2::Enable && width >= Avx2::HA + 2) - Avx2::HogExtractFeatures(src, stride, width, height, features); - else -#endif -#ifdef SIMD_SSE41_ENABLE - if (Sse41::Enable && width >= Sse41::A + 2) - Sse41::HogExtractFeatures(src, stride, width, height, features); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A + 2) - Neon::HogExtractFeatures(src, stride, width, height, features); - else -#endif - Base::HogExtractFeatures(src, stride, width, height, features); -} - -SIMD_API void SimdHogDeinterleave(const float * src, size_t srcStride, size_t width, size_t height, size_t count, float ** dst, size_t dstStride) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable && width >= Avx512bw::F && count >= Sse::F) - Avx512bw::HogDeinterleave(src, srcStride, width, height, count, dst, dstStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if (Avx2::Enable && width >= Avx2::F && count >= Sse::F) - Avx2::HogDeinterleave(src, srcStride, width, height, count, dst, dstStride); - else -#endif -#ifdef SIMD_SSE_ENABLE - if (Sse::Enable && width >= Sse::F && count >= Sse::F) - Sse::HogDeinterleave(src, srcStride, width, height, count, dst, dstStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::F && count >= Neon::F) - Neon::HogDeinterleave(src, srcStride, width, height, count, dst, dstStride); - else -#endif - Base::HogDeinterleave(src, srcStride, width, height, count, dst, dstStride); -} - -SIMD_API void SimdHogFilterSeparable(const float * src, size_t srcStride, size_t width, size_t height, - const float * rowFilter, size_t rowSize, const float * colFilter, size_t colSize, float * dst, size_t dstStride, int add) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable && width >= Avx512bw::F + colSize - 1) - Avx512bw::HogFilterSeparable(src, srcStride, width, height, 
rowFilter, rowSize, colFilter, colSize, dst, dstStride, add); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if (Avx2::Enable && width >= Avx2::F + colSize - 1) - Avx2::HogFilterSeparable(src, srcStride, width, height, rowFilter, rowSize, colFilter, colSize, dst, dstStride, add); - else -#endif -#ifdef SIMD_SSE_ENABLE - if (Sse::Enable && width >= Sse::F + colSize - 1) - Sse::HogFilterSeparable(src, srcStride, width, height, rowFilter, rowSize, colFilter, colSize, dst, dstStride, add); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::F + colSize - 1) - Neon::HogFilterSeparable(src, srcStride, width, height, rowFilter, rowSize, colFilter, colSize, dst, dstStride, add); - else -#endif - Base::HogFilterSeparable(src, srcStride, width, height, rowFilter, rowSize, colFilter, colSize, dst, dstStride, add); -} - -SIMD_API void SimdHogLiteExtractFeatures(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t cell, float * features, size_t featuresStride) -{ -#if defined(SIMD_AVX2_ENABLE) || defined(SIMD_SSE41_ENABLE) || defined(SIMD_NEON_ENABLE) - size_t size = (width / cell - 1)*cell; -#endif -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::HogLiteExtractFeatures(src, srcStride, width, height, cell, features, featuresStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if (Avx2::Enable && size >= Avx2::A) - Avx2::HogLiteExtractFeatures(src, srcStride, width, height, cell, features, featuresStride); - else -#endif -#ifdef SIMD_SSE41_ENABLE - if (Sse41::Enable && size >= Sse41::A) - Sse41::HogLiteExtractFeatures(src, srcStride, width, height, cell, features, featuresStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && size >= Neon::A) - Neon::HogLiteExtractFeatures(src, srcStride, width, height, cell, features, featuresStride); - else -#endif - Base::HogLiteExtractFeatures(src, srcStride, width, height, cell, features, featuresStride); -} - -SIMD_API void SimdHogLiteFilterFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::HogLiteFilterFeatures(src, srcStride, srcWidth, srcHeight, featureSize, filter, filterWidth, filterHeight, mask, maskStride, dst, dstStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if (Avx2::Enable) - Avx2::HogLiteFilterFeatures(src, srcStride, srcWidth, srcHeight, featureSize, filter, filterWidth, filterHeight, mask, maskStride, dst, dstStride); - else -#endif -#ifdef SIMD_AVX_ENABLE - if (Avx::Enable) - Avx::HogLiteFilterFeatures(src, srcStride, srcWidth, srcHeight, featureSize, filter, filterWidth, filterHeight, mask, maskStride, dst, dstStride); - else -#endif -#ifdef SIMD_SSE41_ENABLE - if (Sse41::Enable) - Sse41::HogLiteFilterFeatures(src, srcStride, srcWidth, srcHeight, featureSize, filter, filterWidth, filterHeight, mask, maskStride, dst, dstStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable) - Neon::HogLiteFilterFeatures(src, srcStride, srcWidth, srcHeight, featureSize, filter, filterWidth, filterHeight, mask, maskStride, dst, dstStride); - else -#endif - Base::HogLiteFilterFeatures(src, srcStride, srcWidth, srcHeight, featureSize, filter, filterWidth, filterHeight, mask, maskStride, dst, dstStride); -} - -SIMD_API void SimdHogLiteResizeFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t 
featureSize, float * dst, size_t dstStride, size_t dstWidth, size_t dstHeight) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::HogLiteResizeFeatures(src, srcStride, srcWidth, srcHeight, featureSize, dst, dstStride, dstWidth, dstHeight); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if (Avx2::Enable) - Avx2::HogLiteResizeFeatures(src, srcStride, srcWidth, srcHeight, featureSize, dst, dstStride, dstWidth, dstHeight); - else -#endif -#ifdef SIMD_AVX_ENABLE - if (Avx::Enable) - Avx::HogLiteResizeFeatures(src, srcStride, srcWidth, srcHeight, featureSize, dst, dstStride, dstWidth, dstHeight); - else -#endif -#ifdef SIMD_SSE41_ENABLE - if (Sse41::Enable) - Sse41::HogLiteResizeFeatures(src, srcStride, srcWidth, srcHeight, featureSize, dst, dstStride, dstWidth, dstHeight); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable) - Neon::HogLiteResizeFeatures(src, srcStride, srcWidth, srcHeight, featureSize, dst, dstStride, dstWidth, dstHeight); - else -#endif - Base::HogLiteResizeFeatures(src, srcStride, srcWidth, srcHeight, featureSize, dst, dstStride, dstWidth, dstHeight); -} - -SIMD_API void SimdHogLiteCompressFeatures(const float * src, size_t srcStride, size_t width, size_t height, const float * pca, float * dst, size_t dstStride) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::HogLiteCompressFeatures(src, srcStride, width, height, pca, dst, dstStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if (Avx2::Enable) - Avx2::HogLiteCompressFeatures(src, srcStride, width, height, pca, dst, dstStride); - else -#endif -#ifdef SIMD_AVX_ENABLE - if (Avx::Enable) - Avx::HogLiteCompressFeatures(src, srcStride, width, height, pca, dst, dstStride); - else -#endif -#ifdef SIMD_SSE41_ENABLE - if (Sse41::Enable) - Sse41::HogLiteCompressFeatures(src, srcStride, width, height, pca, dst, dstStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable) - Neon::HogLiteCompressFeatures(src, srcStride, width, height, pca, dst, dstStride); - else -#endif - Base::HogLiteCompressFeatures(src, srcStride, width, height, pca, dst, dstStride); -} - -SIMD_API void SimdHogLiteFilterSeparable(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * hFilter, size_t hSize, const float * vFilter, size_t vSize, float * dst, size_t dstStride, int add) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::HogLiteFilterSeparable(src, srcStride, srcWidth, srcHeight, featureSize, hFilter, hSize, vFilter, vSize, dst, dstStride, add); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if (Avx2::Enable && srcWidth >= hSize - 1 + Avx2::F) - Avx2::HogLiteFilterSeparable(src, srcStride, srcWidth, srcHeight, featureSize, hFilter, hSize, vFilter, vSize, dst, dstStride, add); - else -#endif -#ifdef SIMD_AVX_ENABLE - if (Avx::Enable && srcWidth >= hSize - 1 + Avx::F) - Avx::HogLiteFilterSeparable(src, srcStride, srcWidth, srcHeight, featureSize, hFilter, hSize, vFilter, vSize, dst, dstStride, add); - else -#endif -#ifdef SIMD_SSE41_ENABLE - if (Sse41::Enable && srcWidth >= hSize - 1 + Sse41::F) - Sse41::HogLiteFilterSeparable(src, srcStride, srcWidth, srcHeight, featureSize, hFilter, hSize, vFilter, vSize, dst, dstStride, add); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && srcWidth >= hSize - 1 + Neon::F) - Neon::HogLiteFilterSeparable(src, srcStride, srcWidth, srcHeight, featureSize, hFilter, hSize, vFilter, vSize, dst, dstStride, add); - else -#endif - Base::HogLiteFilterSeparable(src, srcStride, srcWidth, srcHeight, featureSize, 
hFilter, hSize, vFilter, vSize, dst, dstStride, add); -} - -SIMD_API void SimdHogLiteFindMax7x7(const float * a, size_t aStride, const float * b, size_t bStride, size_t height, float * value, size_t * col, size_t * row) -{ - typedef void(*SimdHogLiteFindMax7x7Ptr) (const float * a, size_t aStride, const float * b, size_t bStride, size_t height, float * value, size_t * col, size_t * row); - const static SimdHogLiteFindMax7x7Ptr simdHogLiteFindMax7x7 = SIMD_FUNC3(HogLiteFindMax7x7, SIMD_AVX2_FUNC, SIMD_SSE41_FUNC, SIMD_NEON_FUNC); - - simdHogLiteFindMax7x7(a, aStride, b, bStride, height, value, col, row); -} - -SIMD_API void SimdHogLiteCreateMask(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, const float * threshold, size_t scale, size_t size, uint32_t * dst, size_t dstStride) -{ - typedef void(*SimdHogLiteCreateMaskPtr) (const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, const float * threshold, size_t scale, size_t size, uint32_t * dst, size_t dstStride); - const static SimdHogLiteCreateMaskPtr simdHogLiteCreateMask = SIMD_FUNC4(HogLiteCreateMask, SIMD_AVX512BW_FUNC, SIMD_AVX2_FUNC, SIMD_SSE41_FUNC, SIMD_NEON_FUNC); - - simdHogLiteCreateMask(src, srcStride, srcWidth, srcHeight, threshold, scale, size, dst, dstStride); -} - -SIMD_API void SimdInt16ToGray(const uint8_t * src, size_t width, size_t height, size_t srcStride, uint8_t * dst, size_t dstStride) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::Int16ToGray(src, width, height, srcStride, dst, dstStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if (Avx2::Enable && width >= Avx2::A) - Avx2::Int16ToGray(src, width, height, srcStride, dst, dstStride); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if (Sse2::Enable && width >= Sse2::A) - Sse2::Int16ToGray(src, width, height, srcStride, dst, dstStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::HA) - Neon::Int16ToGray(src, width, height, srcStride, dst, dstStride); - else -#endif - Base::Int16ToGray(src, width, height, srcStride, dst, dstStride); -} - -SIMD_API void SimdIntegral(const uint8_t * src, size_t srcStride, size_t width, size_t height, - uint8_t * sum, size_t sumStride, uint8_t * sqsum, size_t sqsumStride, uint8_t * tilted, size_t tiltedStride, - SimdPixelFormatType sumFormat, SimdPixelFormatType sqsumFormat) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::Integral(src, srcStride, width, height, sum, sumStride, sqsum, sqsumStride, tilted, tiltedStride, sumFormat, sqsumFormat); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if (Avx2::Enable) - Avx2::Integral(src, srcStride, width, height, sum, sumStride, sqsum, sqsumStride, tilted, tiltedStride, sumFormat, sqsumFormat); - else -#endif - Base::Integral(src, srcStride, width, height, sum, sumStride, sqsum, sqsumStride, tilted, tiltedStride, sumFormat, sqsumFormat); -} - -SIMD_API void SimdInterferenceIncrement(uint8_t * statistic, size_t stride, size_t width, size_t height, uint8_t increment, int16_t saturation) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::InterferenceIncrement(statistic, stride, width, height, increment, saturation); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width >= Avx2::HA) - Avx2::InterferenceIncrement(statistic, stride, width, height, increment, saturation); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && width >= Sse2::HA) - Sse2::InterferenceIncrement(statistic, stride, width, height, increment, saturation); - else -#endif -#ifdef 
SIMD_VMX_ENABLE - if(Vmx::Enable && width >= Vmx::HA) - Vmx::InterferenceIncrement(statistic, stride, width, height, increment, saturation); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::HA) - Neon::InterferenceIncrement(statistic, stride, width, height, increment, saturation); - else -#endif - Base::InterferenceIncrement(statistic, stride, width, height, increment, saturation); -} - -SIMD_API void SimdInterferenceIncrementMasked(uint8_t * statistic, size_t statisticStride, size_t width, size_t height, - uint8_t increment, int16_t saturation, const uint8_t * mask, size_t maskStride, uint8_t index) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::InterferenceIncrementMasked(statistic, statisticStride, width, height, increment, saturation, mask, maskStride, index); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width >= Avx2::A) - Avx2::InterferenceIncrementMasked(statistic, statisticStride, width, height, increment, saturation, mask, maskStride, index); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && width >= Sse2::A) - Sse2::InterferenceIncrementMasked(statistic, statisticStride, width, height, increment, saturation, mask, maskStride, index); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width >= Vmx::A) - Vmx::InterferenceIncrementMasked(statistic, statisticStride, width, height, increment, saturation, mask, maskStride, index); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::InterferenceIncrementMasked(statistic, statisticStride, width, height, increment, saturation, mask, maskStride, index); - else -#endif - Base::InterferenceIncrementMasked(statistic, statisticStride, width, height, increment, saturation, mask, maskStride, index); -} - -SIMD_API void SimdInterferenceDecrement(uint8_t * statistic, size_t stride, size_t width, size_t height, uint8_t decrement, int16_t saturation) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::InterferenceDecrement(statistic, stride, width, height, decrement, saturation); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width >= Avx2::HA) - Avx2::InterferenceDecrement(statistic, stride, width, height, decrement, saturation); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && width >= Sse2::HA) - Sse2::InterferenceDecrement(statistic, stride, width, height, decrement, saturation); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width >= Vmx::HA) - Vmx::InterferenceDecrement(statistic, stride, width, height, decrement, saturation); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::HA) - Neon::InterferenceDecrement(statistic, stride, width, height, decrement, saturation); - else -#endif - Base::InterferenceDecrement(statistic, stride, width, height, decrement, saturation); -} - -SIMD_API void SimdInterferenceDecrementMasked(uint8_t * statistic, size_t statisticStride, size_t width, size_t height, - uint8_t decrement, int16_t saturation, const uint8_t * mask, size_t maskStride, uint8_t index) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::InterferenceDecrementMasked(statistic, statisticStride, width, height, decrement, saturation, mask, maskStride, index); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width >= Avx2::A) - Avx2::InterferenceDecrementMasked(statistic, statisticStride, width, height, decrement, saturation, mask, maskStride, index); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && width >= 
Sse2::A) - Sse2::InterferenceDecrementMasked(statistic, statisticStride, width, height, decrement, saturation, mask, maskStride, index); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width >= Vmx::A) - Vmx::InterferenceDecrementMasked(statistic, statisticStride, width, height, decrement, saturation, mask, maskStride, index); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::InterferenceDecrementMasked(statistic, statisticStride, width, height, decrement, saturation, mask, maskStride, index); - else -#endif - Base::InterferenceDecrementMasked(statistic, statisticStride, width, height, decrement, saturation, mask, maskStride, index); -} - -SIMD_API void SimdInterleaveUv(const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, size_t width, size_t height, uint8_t * uv, size_t uvStride) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::InterleaveUv(u, uStride, v, vStride, width, height, uv, uvStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if (Avx2::Enable && width >= Avx2::A) - Avx2::InterleaveUv(u, uStride, v, vStride, width, height, uv, uvStride); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if (Sse2::Enable && width >= Sse2::A) - Sse2::InterleaveUv(u, uStride, v, vStride, width, height, uv, uvStride); - else -#endif -#ifdef SIMD_VMX_ENABLE - if (Vmx::Enable && width >= Vmx::A) - Vmx::InterleaveUv(u, uStride, v, vStride, width, height, uv, uvStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::InterleaveUv(u, uStride, v, vStride, width, height, uv, uvStride); - else -#endif - Base::InterleaveUv(u, uStride, v, vStride, width, height, uv, uvStride); -} - -SIMD_API void SimdInterleaveBgr(const uint8_t * b, size_t bStride, const uint8_t * g, size_t gStride, const uint8_t * r, size_t rStride, - size_t width, size_t height, uint8_t * bgr, size_t bgrStride) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::InterleaveBgr(b, bStride, g, gStride, r, rStride, width, height, bgr, bgrStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if (Avx2::Enable && width >= Avx2::A) - Avx2::InterleaveBgr(b, bStride, g, gStride, r, rStride, width, height, bgr, bgrStride); - else -#endif -#ifdef SIMD_SSSE3_ENABLE - if (Ssse3::Enable && width >= Ssse3::A) - Ssse3::InterleaveBgr(b, bStride, g, gStride, r, rStride, width, height, bgr, bgrStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::InterleaveBgr(b, bStride, g, gStride, r, rStride, width, height, bgr, bgrStride); - else -#endif - Base::InterleaveBgr(b, bStride, g, gStride, r, rStride, width, height, bgr, bgrStride); -} - -SIMD_API void SimdInterleaveBgra(const uint8_t * b, size_t bStride, const uint8_t * g, size_t gStride, const uint8_t * r, size_t rStride, const uint8_t * a, size_t aStride, - size_t width, size_t height, uint8_t * bgra, size_t bgraStride) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::InterleaveBgra(b, bStride, g, gStride, r, rStride, a, aStride, width, height, bgra, bgraStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if (Avx2::Enable && width >= Avx2::A) - Avx2::InterleaveBgra(b, bStride, g, gStride, r, rStride, a, aStride, width, height, bgra, bgraStride); - else -#endif -#ifdef SIMD_SSSE3_ENABLE - if (Ssse3::Enable && width >= Ssse3::A) - Ssse3::InterleaveBgra(b, bStride, g, gStride, r, rStride, a, aStride, width, height, bgra, bgraStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - 
Neon::InterleaveBgra(b, bStride, g, gStride, r, rStride, a, aStride, width, height, bgra, bgraStride); - else -#endif - Base::InterleaveBgra(b, bStride, g, gStride, r, rStride, a, aStride, width, height, bgra, bgraStride); -} - -SIMD_API void SimdLaplace(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable && width > Avx512bw::A) - Avx512bw::Laplace(src, srcStride, width, height, dst, dstStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width > Avx2::A) - Avx2::Laplace(src, srcStride, width, height, dst, dstStride); - else -#endif -#ifdef SIMD_SSSE3_ENABLE - if(Ssse3::Enable && width > Ssse3::A) - Ssse3::Laplace(src, srcStride, width, height, dst, dstStride); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && width > Sse2::A) - Sse2::Laplace(src, srcStride, width, height, dst, dstStride); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width > Vmx::A) - Vmx::Laplace(src, srcStride, width, height, dst, dstStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width > Neon::A) - Neon::Laplace(src, srcStride, width, height, dst, dstStride); - else -#endif - Base::Laplace(src, srcStride, width, height, dst, dstStride); -} - -SIMD_API void SimdLaplaceAbs(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable && width > Avx512bw::A) - Avx512bw::LaplaceAbs(src, srcStride, width, height, dst, dstStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width > Avx2::A) - Avx2::LaplaceAbs(src, srcStride, width, height, dst, dstStride); - else -#endif -#ifdef SIMD_SSSE3_ENABLE - if(Ssse3::Enable && width > Ssse3::A) - Ssse3::LaplaceAbs(src, srcStride, width, height, dst, dstStride); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width > Vmx::A) - Vmx::LaplaceAbs(src, srcStride, width, height, dst, dstStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width > Neon::A) - Neon::LaplaceAbs(src, srcStride, width, height, dst, dstStride); - else -#endif - Base::LaplaceAbs(src, srcStride, width, height, dst, dstStride); -} - -SIMD_API void SimdLaplaceAbsSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable && width > Avx512bw::A) - Avx512bw::LaplaceAbsSum(src, stride, width, height, sum); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width > Avx2::A) - Avx2::LaplaceAbsSum(src, stride, width, height, sum); - else -#endif -#ifdef SIMD_SSSE3_ENABLE - if(Ssse3::Enable && width > Ssse3::A) - Ssse3::LaplaceAbsSum(src, stride, width, height, sum); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width > Vmx::A) - Vmx::LaplaceAbsSum(src, stride, width, height, sum); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width > Neon::A) - Neon::LaplaceAbsSum(src, stride, width, height, sum); - else -#endif - Base::LaplaceAbsSum(src, stride, width, height, sum); -} - -SIMD_API void SimdLbpEstimate(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable && width >= Avx512bw::A + 2) - Avx512bw::LbpEstimate(src, srcStride, width, height, dst, dstStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width >= Avx2::A + 2) - Avx2::LbpEstimate(src, srcStride, width, height, dst, dstStride); - 
else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && width >= Sse2::A + 2) - Sse2::LbpEstimate(src, srcStride, width, height, dst, dstStride); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width >= Vmx::A + 2) - Vmx::LbpEstimate(src, srcStride, width, height, dst, dstStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A + 2) - Neon::LbpEstimate(src, srcStride, width, height, dst, dstStride); - else -#endif - Base::LbpEstimate(src, srcStride, width, height, dst, dstStride); -} - -SIMD_API void SimdMeanFilter3x3(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t channelCount, uint8_t * dst, size_t dstStride) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable && (width - 1)*channelCount >= Avx512bw::A) - Avx512bw::MeanFilter3x3(src, srcStride, width, height, channelCount, dst, dstStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if (Avx2::Enable && (width - 1)*channelCount >= Avx2::A) - Avx2::MeanFilter3x3(src, srcStride, width, height, channelCount, dst, dstStride); - else -#endif -#ifdef SIMD_SSSE3_ENABLE - if (Ssse3::Enable && (width - 1)*channelCount >= Ssse3::A) - Ssse3::MeanFilter3x3(src, srcStride, width, height, channelCount, dst, dstStride); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if (Sse2::Enable && (width - 1)*channelCount >= Sse2::A) - Sse2::MeanFilter3x3(src, srcStride, width, height, channelCount, dst, dstStride); - else -#endif -#ifdef SIMD_VMX_ENABLE - if (Vmx::Enable && (width - 1)*channelCount >= Vmx::A) - Vmx::MeanFilter3x3(src, srcStride, width, height, channelCount, dst, dstStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && (width - 1)*channelCount >= Neon::A) - Neon::MeanFilter3x3(src, srcStride, width, height, channelCount, dst, dstStride); - else -#endif - Base::MeanFilter3x3(src, srcStride, width, height, channelCount, dst, dstStride); -} - -SIMD_API void SimdMedianFilterRhomb3x3(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t channelCount, uint8_t * dst, size_t dstStride) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable && (width - 1)*channelCount >= Avx512bw::A) - Avx512bw::MedianFilterRhomb3x3(src, srcStride, width, height, channelCount, dst, dstStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && (width - 1)*channelCount >= Avx2::A) - Avx2::MedianFilterRhomb3x3(src, srcStride, width, height, channelCount, dst, dstStride); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && (width - 1)*channelCount >= Sse2::A) - Sse2::MedianFilterRhomb3x3(src, srcStride, width, height, channelCount, dst, dstStride); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && (width - 1)*channelCount >= Vmx::A) - Vmx::MedianFilterRhomb3x3(src, srcStride, width, height, channelCount, dst, dstStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && (width - 1)*channelCount >= Neon::A) - Neon::MedianFilterRhomb3x3(src, srcStride, width, height, channelCount, dst, dstStride); - else -#endif - Base::MedianFilterRhomb3x3(src, srcStride, width, height, channelCount, dst, dstStride); -} - -SIMD_API void SimdMedianFilterRhomb5x5(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t channelCount, uint8_t * dst, size_t dstStride) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable && (width - 2)*channelCount >= Avx512bw::A) - Avx512bw::MedianFilterRhomb5x5(src, srcStride, width, height, channelCount, dst, dstStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && (width - 
2)*channelCount >= Avx2::A) - Avx2::MedianFilterRhomb5x5(src, srcStride, width, height, channelCount, dst, dstStride); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && (width - 2)*channelCount >= Sse2::A) - Sse2::MedianFilterRhomb5x5(src, srcStride, width, height, channelCount, dst, dstStride); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && (width - 2)*channelCount >= Vmx::A) - Vmx::MedianFilterRhomb5x5(src, srcStride, width, height, channelCount, dst, dstStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && (width - 2)*channelCount >= Neon::A) - Neon::MedianFilterRhomb5x5(src, srcStride, width, height, channelCount, dst, dstStride); - else -#endif - Base::MedianFilterRhomb5x5(src, srcStride, width, height, channelCount, dst, dstStride); -} - -SIMD_API void SimdMedianFilterSquare3x3(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t channelCount, uint8_t * dst, size_t dstStride) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable && (width - 1)*channelCount >= Avx512bw::A) - Avx512bw::MedianFilterSquare3x3(src, srcStride, width, height, channelCount, dst, dstStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && (width - 1)*channelCount >= Avx2::A) - Avx2::MedianFilterSquare3x3(src, srcStride, width, height, channelCount, dst, dstStride); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && (width - 1)*channelCount >= Sse2::A) - Sse2::MedianFilterSquare3x3(src, srcStride, width, height, channelCount, dst, dstStride); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && (width - 1)*channelCount >= Vmx::A) - Vmx::MedianFilterSquare3x3(src, srcStride, width, height, channelCount, dst, dstStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && (width - 1)*channelCount >= Neon::A) - Neon::MedianFilterSquare3x3(src, srcStride, width, height, channelCount, dst, dstStride); - else -#endif - Base::MedianFilterSquare3x3(src, srcStride, width, height, channelCount, dst, dstStride); -} - -SIMD_API void SimdMedianFilterSquare5x5(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t channelCount, uint8_t * dst, size_t dstStride) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable && (width - 2)*channelCount >= Avx512bw::A) - Avx512bw::MedianFilterSquare5x5(src, srcStride, width, height, channelCount, dst, dstStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && (width - 2)*channelCount >= Avx2::A) - Avx2::MedianFilterSquare5x5(src, srcStride, width, height, channelCount, dst, dstStride); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && (width - 2)*channelCount >= Sse2::A) - Sse2::MedianFilterSquare5x5(src, srcStride, width, height, channelCount, dst, dstStride); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && (width - 2)*channelCount >= Vmx::A) - Vmx::MedianFilterSquare5x5(src, srcStride, width, height, channelCount, dst, dstStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && (width - 2)*channelCount >= Neon::A) - Neon::MedianFilterSquare5x5(src, srcStride, width, height, channelCount, dst, dstStride); - else -#endif - Base::MedianFilterSquare5x5(src, srcStride, width, height, channelCount, dst, dstStride); -} - -SIMD_API void SimdNeuralConvert(const uint8_t * src, size_t srcStride, size_t width, size_t height, float * dst, size_t dstStride, int inversion) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable && width >= Avx512f::F) - Avx512bw::NeuralConvert(src, srcStride, width, height, dst, dstStride, 
inversion); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if (Avx2::Enable && width >= Avx::F) - Avx2::NeuralConvert(src, srcStride, width, height, dst, dstStride, inversion); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if (Sse2::Enable && width >= Sse2::A) - Sse2::NeuralConvert(src, srcStride, width, height, dst, dstStride, inversion); - else -#endif -#ifdef SIMD_VSX_ENABLE - if (Vsx::Enable && width >= Vsx::A) - Vsx::NeuralConvert(src, srcStride, width, height, dst, dstStride, inversion); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::NeuralConvert(src, srcStride, width, height, dst, dstStride, inversion); - else -#endif - Base::NeuralConvert(src, srcStride, width, height, dst, dstStride, inversion); -} - -SIMD_API void SimdNeuralProductSum(const float * a, const float * b, size_t size, float * sum) -{ - typedef void(*SimdNeuralProductSumPtr) (const float * a, const float * b, size_t size, float * sum); - const static SimdNeuralProductSumPtr simdNeuralProductSum = SIMD_FUNC6(NeuralProductSum, SIMD_AVX512F_FUNC, SIMD_AVX2_FUNC, SIMD_AVX_FUNC, SIMD_SSE_FUNC, SIMD_VSX_FUNC, SIMD_NEON_FUNC); - - simdNeuralProductSum(a, b, size, sum); -} - -SIMD_API void SimdNeuralAddVectorMultipliedByValue(const float * src, size_t size, const float * value, float * dst) -{ - typedef void(*SimdNeuralAddVectorMultipliedByValuePtr) (const float * src, size_t size, const float * value, float * dst); - const static SimdNeuralAddVectorMultipliedByValuePtr simdNeuralAddVectorMultipliedByValue = SIMD_FUNC5(NeuralAddVectorMultipliedByValue, SIMD_AVX512F_FUNC, SIMD_AVX2_FUNC, SIMD_AVX_FUNC, SIMD_SSE_FUNC, SIMD_NEON_FUNC); - - simdNeuralAddVectorMultipliedByValue(src, size, value, dst); -} - -SIMD_API void SimdNeuralAddVector(const float * src, size_t size, float * dst) -{ - typedef void(*SimdNeuralAddVectorPtr) (const float * src, size_t size, float * dst); - const static SimdNeuralAddVectorPtr simdNeuralAddVector = SIMD_FUNC4(NeuralAddVector, SIMD_AVX512F_FUNC, SIMD_AVX_FUNC, SIMD_SSE_FUNC, SIMD_NEON_FUNC); - - simdNeuralAddVector(src, size, dst); -} - -SIMD_API void SimdNeuralAddValue(const float * value, float * dst, size_t size) -{ - typedef void(*SimdNeuralAddValuePtr) (const float * value, float * dst, size_t size); - const static SimdNeuralAddValuePtr simdNeuralAddValue = SIMD_FUNC4(NeuralAddValue, SIMD_AVX512F_FUNC, SIMD_AVX_FUNC, SIMD_SSE_FUNC, SIMD_NEON_FUNC); - - simdNeuralAddValue(value, dst, size); -} - -SIMD_API void SimdNeuralRoughSigmoid(const float * src, size_t size, const float * slope, float * dst) -{ - typedef void(*SimdNeuralRoughSigmoidPtr) (const float * src, size_t size, const float * slope, float * dst); - const static SimdNeuralRoughSigmoidPtr simdNeuralRoughSigmoid = SIMD_FUNC5(NeuralRoughSigmoid, SIMD_AVX512F_FUNC, SIMD_AVX_FUNC, SIMD_SSE_FUNC, SIMD_VSX_FUNC, SIMD_NEON_FUNC); - - simdNeuralRoughSigmoid(src, size, slope, dst); -} - -SIMD_API void SimdNeuralRoughSigmoid2(const float * src, size_t size, const float * slope, float * dst) -{ - typedef void(*SimdNeuralRoughSigmoid2Ptr) (const float * src, size_t size, const float * slope, float * dst); - const static SimdNeuralRoughSigmoid2Ptr simdNeuralRoughSigmoid2 = SIMD_FUNC5(NeuralRoughSigmoid2, SIMD_AVX512F_FUNC, SIMD_AVX2_FUNC, SIMD_AVX_FUNC, SIMD_SSE_FUNC, SIMD_NEON_FUNC); - - simdNeuralRoughSigmoid2(src, size, slope, dst); -} - -SIMD_API void SimdNeuralDerivativeSigmoid(const float * src, size_t size, const float * slope, float * dst) -{ - typedef void(*SimdNeuralDerivativeSigmoidPtr) (const float * 
src, size_t size, const float * slope, float * dst); - const static SimdNeuralDerivativeSigmoidPtr simdNeuralDerivativeSigmoid = SIMD_FUNC4(NeuralDerivativeSigmoid, SIMD_AVX512F_FUNC, SIMD_AVX_FUNC, SIMD_SSE_FUNC, SIMD_NEON_FUNC); - - simdNeuralDerivativeSigmoid(src, size, slope, dst); -} - -SIMD_API void SimdNeuralRoughTanh(const float * src, size_t size, const float * slope, float * dst) -{ - typedef void(*SimdNeuralRoughTanhPtr) (const float * src, size_t size, const float * slope, float * dst); - const static SimdNeuralRoughTanhPtr simdNeuralRoughTanh = SIMD_FUNC4(NeuralRoughTanh, SIMD_AVX512F_FUNC, SIMD_AVX_FUNC, SIMD_SSE_FUNC, SIMD_NEON_FUNC); - - simdNeuralRoughTanh(src, size, slope, dst); -} - -SIMD_API void SimdNeuralDerivativeTanh(const float * src, size_t size, const float * slope, float * dst) -{ - typedef void(*SimdNeuralDerivativeTanhPtr) (const float * src, size_t size, const float * slope, float * dst); - const static SimdNeuralDerivativeTanhPtr simdNeuralDerivativeTanh = SIMD_FUNC4(NeuralDerivativeTanh, SIMD_AVX512F_FUNC, SIMD_AVX_FUNC, SIMD_SSE_FUNC, SIMD_NEON_FUNC); - - simdNeuralDerivativeTanh(src, size, slope, dst); -} - -SIMD_API void SimdNeuralDerivativeRelu(const float * src, size_t size, const float * slope, float * dst) -{ - typedef void(*SimdNeuralDerivativeReluPtr) (const float * src, size_t size, const float * slope, float * dst); - const static SimdNeuralDerivativeReluPtr simdNeuralDerivativeRelu = SIMD_FUNC4(NeuralDerivativeRelu, SIMD_AVX512F_FUNC, SIMD_AVX_FUNC, SIMD_SSE_FUNC, SIMD_NEON_FUNC); - - simdNeuralDerivativeRelu(src, size, slope, dst); -} - -SIMD_API void SimdNeuralPow(const float * src, size_t size, const float * exponent, float * dst) -{ - typedef void(*SimdNeuralPowPtr) (const float * src, size_t size, const float * exponent, float * dst); - const static SimdNeuralPowPtr simdNeuralPow = SIMD_FUNC4(NeuralPow, SIMD_AVX512F_FUNC, SIMD_AVX2_FUNC, SIMD_SSE2_FUNC, SIMD_NEON_FUNC); - - simdNeuralPow(src, size, exponent, dst); -} - -SIMD_API void SimdNeuralUpdateWeights(const float * x, size_t size, const float * a, const float * b, float * d, float * w) -{ - typedef void(*SimdNeuralUpdateWeightsPtr) (const float * x, size_t size, const float * a, const float * b, float * d, float * w); - const static SimdNeuralUpdateWeightsPtr simdNeuralUpdateWeights = SIMD_FUNC4(NeuralUpdateWeights, SIMD_AVX512F_FUNC, SIMD_AVX_FUNC, SIMD_SSE_FUNC, SIMD_NEON_FUNC); - - simdNeuralUpdateWeights(x, size, a, b, d, w); -} - -SIMD_API void SimdNeuralAdaptiveGradientUpdate(const float * delta, size_t size, size_t batch, const float * alpha, const float * epsilon, float * gradient, float * weight) -{ - typedef void(*SimdNeuralAdaptiveGradientUpdatePtr) (const float * delta, size_t size, size_t batch, const float * alpha, const float * epsilon, float * gradient, float * weight); - const static SimdNeuralAdaptiveGradientUpdatePtr simdNeuralAdaptiveGradientUpdate = SIMD_FUNC4(NeuralAdaptiveGradientUpdate, SIMD_AVX512F_FUNC, SIMD_AVX_FUNC, SIMD_SSE_FUNC, SIMD_NEON_FUNC); - - simdNeuralAdaptiveGradientUpdate(delta, size, batch, alpha, epsilon, gradient, weight); -} - -SIMD_API void SimdNeuralAddConvolution2x2Forward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride) -{ -#ifdef SIMD_AVX512F_ENABLE - if (Avx512f::Enable && width >= Avx512f::F) - Avx512f::NeuralAddConvolution2x2Forward(src, srcStride, width, height, weights, dst, dstStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if (Avx2::Enable && width >= 
Avx2::F) - Avx2::NeuralAddConvolution2x2Forward(src, srcStride, width, height, weights, dst, dstStride); - else -#endif -#ifdef SIMD_AVX_ENABLE - if (Avx::Enable && width >= Avx::F) - Avx::NeuralAddConvolution2x2Forward(src, srcStride, width, height, weights, dst, dstStride); - else -#endif -#ifdef SIMD_SSE_ENABLE - if (Sse::Enable && width >= Sse::F) - Sse::NeuralAddConvolution2x2Forward(src, srcStride, width, height, weights, dst, dstStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::F) - Neon::NeuralAddConvolution2x2Forward(src, srcStride, width, height, weights, dst, dstStride); - else -#endif - Base::NeuralAddConvolution2x2Forward(src, srcStride, width, height, weights, dst, dstStride); -} - -SIMD_API void SimdNeuralAddConvolution3x3Forward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride) -{ -#ifdef SIMD_AVX512F_ENABLE - if (Avx512f::Enable && width >= Avx512f::F) - Avx512f::NeuralAddConvolution3x3Forward(src, srcStride, width, height, weights, dst, dstStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if (Avx2::Enable && width >= Avx2::F) - Avx2::NeuralAddConvolution3x3Forward(src, srcStride, width, height, weights, dst, dstStride); - else -#endif -#ifdef SIMD_AVX_ENABLE - if (Avx::Enable && width >= Avx::F) - Avx::NeuralAddConvolution3x3Forward(src, srcStride, width, height, weights, dst, dstStride); - else -#endif -#ifdef SIMD_SSE_ENABLE - if (Sse::Enable && width >= Sse::F) - Sse::NeuralAddConvolution3x3Forward(src, srcStride, width, height, weights, dst, dstStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::F) - Neon::NeuralAddConvolution3x3Forward(src, srcStride, width, height, weights, dst, dstStride); - else -#endif - Base::NeuralAddConvolution3x3Forward(src, srcStride, width, height, weights, dst, dstStride); -} - -SIMD_API void SimdNeuralAddConvolution4x4Forward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride) -{ -#ifdef SIMD_AVX512F_ENABLE - if (Avx512f::Enable && width >= Avx512f::F) - Avx512f::NeuralAddConvolution4x4Forward(src, srcStride, width, height, weights, dst, dstStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if (Avx2::Enable && width >= Avx2::F) - Avx2::NeuralAddConvolution4x4Forward(src, srcStride, width, height, weights, dst, dstStride); - else -#endif -#ifdef SIMD_AVX_ENABLE - if (Avx::Enable && width >= Avx::F) - Avx::NeuralAddConvolution4x4Forward(src, srcStride, width, height, weights, dst, dstStride); - else -#endif -#ifdef SIMD_SSE_ENABLE - if (Sse::Enable && width >= Sse::F) - Sse::NeuralAddConvolution4x4Forward(src, srcStride, width, height, weights, dst, dstStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::F) - Neon::NeuralAddConvolution4x4Forward(src, srcStride, width, height, weights, dst, dstStride); - else -#endif - Base::NeuralAddConvolution4x4Forward(src, srcStride, width, height, weights, dst, dstStride); -} - -SIMD_API void SimdNeuralAddConvolution5x5Forward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride) -{ -#ifdef SIMD_AVX512F_ENABLE - if (Avx512f::Enable && width >= Avx512f::F) - Avx512f::NeuralAddConvolution5x5Forward(src, srcStride, width, height, weights, dst, dstStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if (Avx2::Enable && width >= Avx2::F) - Avx2::NeuralAddConvolution5x5Forward(src, srcStride, width, height, 
weights, dst, dstStride); - else -#endif -#ifdef SIMD_AVX_ENABLE - if (Avx::Enable && width >= Avx::F) - Avx::NeuralAddConvolution5x5Forward(src, srcStride, width, height, weights, dst, dstStride); - else -#endif -#ifdef SIMD_SSE_ENABLE - if (Sse::Enable && width >= Sse::F) - Sse::NeuralAddConvolution5x5Forward(src, srcStride, width, height, weights, dst, dstStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::F) - Neon::NeuralAddConvolution5x5Forward(src, srcStride, width, height, weights, dst, dstStride); - else -#endif - Base::NeuralAddConvolution5x5Forward(src, srcStride, width, height, weights, dst, dstStride); -} - -SIMD_API void SimdNeuralAddConvolution2x2Backward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride) -{ - typedef void(*SimdNeuralAddConvolution2x2BackwardPtr) (const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride); - const static SimdNeuralAddConvolution2x2BackwardPtr simdNeuralAddConvolution2x2Backward = SIMD_FUNC5(NeuralAddConvolution2x2Backward, SIMD_AVX512F_FUNC, SIMD_AVX2_FUNC, SIMD_AVX_FUNC, SIMD_SSE_FUNC, SIMD_NEON_FUNC); - - simdNeuralAddConvolution2x2Backward(src, srcStride, width, height, weights, dst, dstStride); -} - -SIMD_API void SimdNeuralAddConvolution3x3Backward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride) -{ - typedef void(*SimdNeuralAddConvolution3x3BackwardPtr) (const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride); - const static SimdNeuralAddConvolution3x3BackwardPtr simdNeuralAddConvolution3x3Backward = SIMD_FUNC5(NeuralAddConvolution3x3Backward, SIMD_AVX512F_FUNC, SIMD_AVX2_FUNC, SIMD_AVX_FUNC, SIMD_SSE_FUNC, SIMD_NEON_FUNC); - - simdNeuralAddConvolution3x3Backward(src, srcStride, width, height, weights, dst, dstStride); -} - -SIMD_API void SimdNeuralAddConvolution4x4Backward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride) -{ - typedef void(*SimdNeuralAddConvolution4x4BackwardPtr) (const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride); - const static SimdNeuralAddConvolution4x4BackwardPtr simdNeuralAddConvolution4x4Backward = SIMD_FUNC5(NeuralAddConvolution4x4Backward, SIMD_AVX512F_FUNC, SIMD_AVX2_FUNC, SIMD_AVX_FUNC, SIMD_SSE_FUNC, SIMD_NEON_FUNC); - - simdNeuralAddConvolution4x4Backward(src, srcStride, width, height, weights, dst, dstStride); -} - -SIMD_API void SimdNeuralAddConvolution5x5Backward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride) -{ - typedef void(*SimdNeuralAddConvolution5x5BackwardPtr) (const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride); - const static SimdNeuralAddConvolution5x5BackwardPtr simdNeuralAddConvolution5x5Backward = SIMD_FUNC5(NeuralAddConvolution5x5Backward, SIMD_AVX512F_FUNC, SIMD_AVX2_FUNC, SIMD_AVX_FUNC, SIMD_SSE_FUNC, SIMD_NEON_FUNC); - - simdNeuralAddConvolution5x5Backward(src, srcStride, width, height, weights, dst, dstStride); -} - -SIMD_API void SimdNeuralAddConvolution2x2Sum(const float * src, size_t srcStride, const float * dst, size_t dstStride, size_t width, size_t height, float * sums) -{ -#ifdef SIMD_AVX512F_ENABLE - if 
(Avx512f::Enable && width >= Avx512f::F) - Avx512f::NeuralAddConvolution2x2Sum(src, srcStride, dst, dstStride, width, height, sums); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if (Avx2::Enable && width >= Avx2::F) - Avx2::NeuralAddConvolution2x2Sum(src, srcStride, dst, dstStride, width, height, sums); - else -#endif -#ifdef SIMD_AVX_ENABLE - if (Avx::Enable && width >= Avx::F) - Avx::NeuralAddConvolution2x2Sum(src, srcStride, dst, dstStride, width, height, sums); - else -#endif -#ifdef SIMD_SSE3_ENABLE - if (Sse3::Enable && width >= Sse3::F) - Sse3::NeuralAddConvolution2x2Sum(src, srcStride, dst, dstStride, width, height, sums); - else -#endif -#ifdef SIMD_SSE_ENABLE - if (Sse::Enable && width >= Sse::F) - Sse::NeuralAddConvolution2x2Sum(src, srcStride, dst, dstStride, width, height, sums); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::F) - Neon::NeuralAddConvolution2x2Sum(src, srcStride, dst, dstStride, width, height, sums); - else -#endif - Base::NeuralAddConvolution2x2Sum(src, srcStride, dst, dstStride, width, height, sums); -} - -SIMD_API void SimdNeuralAddConvolution3x3Sum(const float * src, size_t srcStride, const float * dst, size_t dstStride, size_t width, size_t height, float * sums) -{ -#ifdef SIMD_AVX512F_ENABLE - if (Avx512f::Enable && width >= Avx512f::F) - Avx512f::NeuralAddConvolution3x3Sum(src, srcStride, dst, dstStride, width, height, sums); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if (Avx2::Enable && width >= Avx2::F) - Avx2::NeuralAddConvolution3x3Sum(src, srcStride, dst, dstStride, width, height, sums); - else -#endif -#ifdef SIMD_AVX_ENABLE - if (Avx::Enable && width >= Avx::F) - Avx::NeuralAddConvolution3x3Sum(src, srcStride, dst, dstStride, width, height, sums); - else -#endif -#ifdef SIMD_SSE3_ENABLE - if (Sse3::Enable && width >= Sse3::F) - Sse3::NeuralAddConvolution3x3Sum(src, srcStride, dst, dstStride, width, height, sums); - else -#endif -#ifdef SIMD_SSE_ENABLE - if (Sse::Enable && width >= Sse::F) - Sse::NeuralAddConvolution3x3Sum(src, srcStride, dst, dstStride, width, height, sums); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::F) - Neon::NeuralAddConvolution3x3Sum(src, srcStride, dst, dstStride, width, height, sums); - else -#endif - Base::NeuralAddConvolution3x3Sum(src, srcStride, dst, dstStride, width, height, sums); -} - -SIMD_API void SimdNeuralAddConvolution4x4Sum(const float * src, size_t srcStride, const float * dst, size_t dstStride, size_t width, size_t height, float * sums) -{ -#ifdef SIMD_AVX512F_ENABLE - if (Avx512f::Enable && width >= Avx512f::F) - Avx512f::NeuralAddConvolution4x4Sum(src, srcStride, dst, dstStride, width, height, sums); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if (Avx2::Enable && width >= Avx2::F) - Avx2::NeuralAddConvolution4x4Sum(src, srcStride, dst, dstStride, width, height, sums); - else -#endif -#ifdef SIMD_AVX_ENABLE - if (Avx::Enable && width >= Avx::F) - Avx::NeuralAddConvolution4x4Sum(src, srcStride, dst, dstStride, width, height, sums); - else -#endif -#ifdef SIMD_SSE3_ENABLE - if (Sse3::Enable && width >= Sse3::F) - Sse3::NeuralAddConvolution4x4Sum(src, srcStride, dst, dstStride, width, height, sums); - else -#endif -#ifdef SIMD_SSE_ENABLE - if (Sse::Enable && width >= Sse::F) - Sse::NeuralAddConvolution4x4Sum(src, srcStride, dst, dstStride, width, height, sums); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::F) - Neon::NeuralAddConvolution4x4Sum(src, srcStride, dst, dstStride, width, height, sums); - else -#endif - 
Base::NeuralAddConvolution4x4Sum(src, srcStride, dst, dstStride, width, height, sums); -} - -SIMD_API void SimdNeuralAddConvolution5x5Sum(const float * src, size_t srcStride, const float * dst, size_t dstStride, size_t width, size_t height, float * sums) -{ -#ifdef SIMD_AVX512F_ENABLE - if (Avx512f::Enable && width >= Avx512f::F) - Avx512f::NeuralAddConvolution5x5Sum(src, srcStride, dst, dstStride, width, height, sums); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if (Avx2::Enable && width >= Avx2::F) - Avx2::NeuralAddConvolution5x5Sum(src, srcStride, dst, dstStride, width, height, sums); - else -#endif -#ifdef SIMD_AVX_ENABLE - if (Avx::Enable && width >= Avx::F) - Avx::NeuralAddConvolution5x5Sum(src, srcStride, dst, dstStride, width, height, sums); - else -#endif -#ifdef SIMD_SSE3_ENABLE - if (Sse3::Enable && width >= Sse3::F) - Sse3::NeuralAddConvolution5x5Sum(src, srcStride, dst, dstStride, width, height, sums); - else -#endif -#ifdef SIMD_SSE_ENABLE - if (Sse::Enable && width >= Sse::F) - Sse::NeuralAddConvolution5x5Sum(src, srcStride, dst, dstStride, width, height, sums); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::F) - Neon::NeuralAddConvolution5x5Sum(src, srcStride, dst, dstStride, width, height, sums); - else -#endif - Base::NeuralAddConvolution5x5Sum(src, srcStride, dst, dstStride, width, height, sums); -} - -SIMD_API void SimdNeuralPooling1x1Max3x3(const float * src, size_t srcStride, size_t width, size_t height, float * dst, size_t dstStride) -{ -#ifdef SIMD_AVX512F_ENABLE - if (Avx512f::Enable && width > Avx512f::F) - Avx512f::NeuralPooling1x1Max3x3(src, srcStride, width, height, dst, dstStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if (Avx2::Enable && width > Avx2::F) - Avx2::NeuralPooling1x1Max3x3(src, srcStride, width, height, dst, dstStride); - else -#endif -#ifdef SIMD_SSE_ENABLE - if (Sse::Enable && width > Sse::F) - Sse::NeuralPooling1x1Max3x3(src, srcStride, width, height, dst, dstStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width > Neon::F) - Neon::NeuralPooling1x1Max3x3(src, srcStride, width, height, dst, dstStride); - else -#endif - Base::NeuralPooling1x1Max3x3(src, srcStride, width, height, dst, dstStride); -} - -SIMD_API void SimdNeuralPooling2x2Max2x2(const float * src, size_t srcStride, size_t width, size_t height, float * dst, size_t dstStride) -{ -#ifdef SIMD_AVX512F_ENABLE - if (Avx512f::Enable && width >= Avx512f::DF) - Avx512f::NeuralPooling2x2Max2x2(src, srcStride, width, height, dst, dstStride); - else -#endif -#ifdef SIMD_AVX_ENABLE - if (Avx::Enable && width >= Avx::DF) - Avx::NeuralPooling2x2Max2x2(src, srcStride, width, height, dst, dstStride); - else -#endif -#ifdef SIMD_SSE_ENABLE - if (Sse::Enable && width >= Sse::DF) - Sse::NeuralPooling2x2Max2x2(src, srcStride, width, height, dst, dstStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::DF) - Neon::NeuralPooling2x2Max2x2(src, srcStride, width, height, dst, dstStride); - else -#endif - Base::NeuralPooling2x2Max2x2(src, srcStride, width, height, dst, dstStride); -} - -SIMD_API void SimdNeuralPooling2x2Max3x3(const float * src, size_t srcStride, size_t width, size_t height, float * dst, size_t dstStride) -{ -#ifdef SIMD_AVX512F_ENABLE - if (Avx512f::Enable && width > Avx512f::DF) - Avx512f::NeuralPooling2x2Max3x3(src, srcStride, width, height, dst, dstStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if (Avx2::Enable && width > Avx2::DF) - Avx2::NeuralPooling2x2Max3x3(src, srcStride, width, height, dst, 
dstStride); - else -#endif -#ifdef SIMD_SSE_ENABLE - if (Sse::Enable && width > Sse::DF) - Sse::NeuralPooling2x2Max3x3(src, srcStride, width, height, dst, dstStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width > Neon::DF) - Neon::NeuralPooling2x2Max3x3(src, srcStride, width, height, dst, dstStride); - else -#endif - Base::NeuralPooling2x2Max3x3(src, srcStride, width, height, dst, dstStride); -} - -SIMD_API void SimdNeuralConvolutionForward(const float * src, size_t srcWidth, size_t srcHeight, size_t srcDepth, - const float * weight, size_t kernelX, size_t kernelY, size_t padX, size_t padY, size_t strideX, size_t strideY, size_t dilationX, size_t dilationY, - void * buffer, size_t * size, float * dst, size_t dstWidth, size_t dstHeight, size_t dstDepth, int add) -{ - typedef void(*SimdNeuralConvolutionForwardPtr) (const float * src, size_t srcWidth, size_t srcHeight, size_t srcDepth, - const float * weight, size_t kernelX, size_t kernelY, size_t padX, size_t padY, size_t strideX, size_t strideY, size_t dilationX, size_t dilationY, - void * buffer, size_t * size, float * dst, size_t dstWidth, size_t dstHeight, size_t dstDepth, int add); - const static SimdNeuralConvolutionForwardPtr simdNeuralConvolutionForward = SIMD_FUNC5(NeuralConvolutionForward, SIMD_AVX512F_FUNC, SIMD_AVX2_FUNC, SIMD_AVX_FUNC, SIMD_SSE3_FUNC, SIMD_NEON_FUNC); - - simdNeuralConvolutionForward(src, srcWidth, srcHeight, srcDepth, weight, kernelX, kernelY, padX, padY, strideX, strideY, dilationX, dilationY, buffer, size, dst, dstWidth, dstHeight, dstDepth, add); -} - -SIMD_API void SimdOperationBinary8u(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, - size_t width, size_t height, size_t channelCount, uint8_t * dst, size_t dstStride, SimdOperationBinary8uType type) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::OperationBinary8u(a, aStride, b, bStride, width, height, channelCount, dst, dstStride, type); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width*channelCount >= Avx2::A) - Avx2::OperationBinary8u(a, aStride, b, bStride, width, height, channelCount, dst, dstStride, type); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && width*channelCount >= Sse2::A) - Sse2::OperationBinary8u(a, aStride, b, bStride, width, height, channelCount, dst, dstStride, type); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width*channelCount >= Vmx::A) - Vmx::OperationBinary8u(a, aStride, b, bStride, width, height, channelCount, dst, dstStride, type); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width*channelCount >= Neon::A) - Neon::OperationBinary8u(a, aStride, b, bStride, width, height, channelCount, dst, dstStride, type); - else -#endif -#ifdef SIMD_MSA_ENABLE - if (Msa::Enable && width*channelCount >= Msa::A) - Msa::OperationBinary8u(a, aStride, b, bStride, width, height, channelCount, dst, dstStride, type); - else -#endif - Base::OperationBinary8u(a, aStride, b, bStride, width, height, channelCount, dst, dstStride, type); -} - -SIMD_API void SimdOperationBinary16i(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, - size_t width, size_t height, uint8_t * dst, size_t dstStride, SimdOperationBinary16iType type) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::OperationBinary16i(a, aStride, b, bStride, width, height, dst, dstStride, type); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width >= Avx2::HA) - Avx2::OperationBinary16i(a, aStride, b, bStride, width, height, 
dst, dstStride, type); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && width >= Sse2::HA) - Sse2::OperationBinary16i(a, aStride, b, bStride, width, height, dst, dstStride, type); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width >= Vmx::HA) - Vmx::OperationBinary16i(a, aStride, b, bStride, width, height, dst, dstStride, type); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::HA) - Neon::OperationBinary16i(a, aStride, b, bStride, width, height, dst, dstStride, type); - else -#endif -#ifdef SIMD_MSA_ENABLE - if (Msa::Enable && width >= Msa::HA) - Msa::OperationBinary16i(a, aStride, b, bStride, width, height, dst, dstStride, type); - else -#endif - Base::OperationBinary16i(a, aStride, b, bStride, width, height, dst, dstStride, type); -} - -SIMD_API void SimdVectorProduct(const uint8_t * vertical, const uint8_t * horizontal, uint8_t * dst, size_t stride, size_t width, size_t height) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::VectorProduct(vertical, horizontal, dst, stride, width, height); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width >= Avx2::A) - Avx2::VectorProduct(vertical, horizontal, dst, stride, width, height); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && width >= Sse2::A) - Sse2::VectorProduct(vertical, horizontal, dst, stride, width, height); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width >= Vmx::A) - Vmx::VectorProduct(vertical, horizontal, dst, stride, width, height); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::VectorProduct(vertical, horizontal, dst, stride, width, height); - else -#endif - Base::VectorProduct(vertical, horizontal, dst, stride, width, height); -} - -SIMD_API void SimdReduceColor2x2(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::ReduceColor2x2(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, channelCount); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if (Avx2::Enable && srcWidth >= Avx2::DA) - Avx2::ReduceColor2x2(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, channelCount); - else -#endif -#ifdef SIMD_SSSE3_ENABLE - if (Ssse3::Enable && srcWidth >= Ssse3::DA) - Ssse3::ReduceColor2x2(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, channelCount); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if (Sse2::Enable && srcWidth >= Sse2::DA) - Sse2::ReduceColor2x2(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, channelCount); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && srcWidth >= Neon::DA) - Neon::ReduceColor2x2(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, channelCount); - else -#endif - Base::ReduceColor2x2(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, channelCount); -} - -SIMD_API void SimdReduceGray2x2(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::ReduceGray2x2(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && srcWidth >= Avx2::DA) - Avx2::ReduceGray2x2(src, srcWidth, srcHeight, srcStride, dst, 
dstWidth, dstHeight, dstStride); - else -#endif -#ifdef SIMD_SSSE3_ENABLE - if(Ssse3::Enable && srcWidth >= Ssse3::DA) - Ssse3::ReduceGray2x2(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && srcWidth >= Sse2::DA) - Sse2::ReduceGray2x2(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && srcWidth >= Vmx::DA) - Vmx::ReduceGray2x2(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && srcWidth >= Neon::DA) - Neon::ReduceGray2x2(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); - else -#endif - Base::ReduceGray2x2(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); -} - -SIMD_API void SimdReduceGray3x3(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride, int compensation) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable && srcWidth >= Avx512bw::DA) - Avx512bw::ReduceGray3x3(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, compensation); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && srcWidth >= Avx2::DA) - Avx2::ReduceGray3x3(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, compensation); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && srcWidth >= Sse2::A) - Sse2::ReduceGray3x3(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, compensation); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && srcWidth >= Vmx::DA) - Vmx::ReduceGray3x3(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, compensation); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && srcWidth >= Neon::DA) - Neon::ReduceGray3x3(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, compensation); - else -#endif - Base::ReduceGray3x3(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, compensation); -} - -SIMD_API void SimdReduceGray4x4(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable && srcWidth > Avx512bw::DA) - Avx512bw::ReduceGray4x4(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && srcWidth > Avx2::DA) - Avx2::ReduceGray4x4(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); - else -#endif -#ifdef SIMD_SSSE3_ENABLE - if(Ssse3::Enable && srcWidth > Ssse3::A) - Ssse3::ReduceGray4x4(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && srcWidth > Sse2::A) - Sse2::ReduceGray4x4(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && srcWidth > Vmx::DA) - Vmx::ReduceGray4x4(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && srcWidth > Neon::DA) - Neon::ReduceGray4x4(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); - else -#endif - Base::ReduceGray4x4(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); -} - -SIMD_API void 
SimdReduceGray5x5(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride, int compensation) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable && srcWidth >= Avx512bw::DA) - Avx512bw::ReduceGray5x5(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, compensation); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && srcWidth >= Avx2::DA) - Avx2::ReduceGray5x5(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, compensation); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && srcWidth >= Sse2::A) - Sse2::ReduceGray5x5(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, compensation); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && srcWidth >= Vmx::DA) - Vmx::ReduceGray5x5(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, compensation); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && srcWidth >= Neon::DA) - Neon::ReduceGray5x5(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, compensation); - else -#endif - Base::ReduceGray5x5(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, compensation); -} - -SIMD_API void SimdReorder16bit(const uint8_t * src, size_t size, uint8_t * dst) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::Reorder16bit(src, size, dst); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && size >= Avx2::A) - Avx2::Reorder16bit(src, size, dst); - else -#endif -#ifdef SIMD_SSSE3_ENABLE - if(Ssse3::Enable && size >= Ssse3::A) - Ssse3::Reorder16bit(src, size, dst); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && size >= Sse2::A) - Sse2::Reorder16bit(src, size, dst); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && size >= Vmx::A) - Vmx::Reorder16bit(src, size, dst); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && size >= Neon::A) - Neon::Reorder16bit(src, size, dst); - else -#endif - Base::Reorder16bit(src, size, dst); -} - -SIMD_API void SimdReorder32bit(const uint8_t * src, size_t size, uint8_t * dst) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::Reorder32bit(src, size, dst); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && size >= Avx2::A) - Avx2::Reorder32bit(src, size, dst); - else -#endif -#ifdef SIMD_SSSE3_ENABLE - if(Ssse3::Enable && size >= Ssse3::A) - Ssse3::Reorder32bit(src, size, dst); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && size >= Sse2::A) - Sse2::Reorder32bit(src, size, dst); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && size >= Vmx::A) - Vmx::Reorder32bit(src, size, dst); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && size >= Neon::A) - Neon::Reorder32bit(src, size, dst); - else -#endif - Base::Reorder32bit(src, size, dst); -} - -SIMD_API void SimdReorder64bit(const uint8_t * src, size_t size, uint8_t * dst) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::Reorder64bit(src, size, dst); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && size >= Avx2::A) - Avx2::Reorder64bit(src, size, dst); - else -#endif -#ifdef SIMD_SSSE3_ENABLE - if(Ssse3::Enable && size >= Ssse3::A) - Ssse3::Reorder64bit(src, size, dst); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && size >= Sse2::A) - Sse2::Reorder64bit(src, size, dst); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && size >= Vmx::A) - Vmx::Reorder64bit(src, size, 
dst); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && size >= Neon::A) - Neon::Reorder64bit(src, size, dst); - else -#endif - Base::Reorder64bit(src, size, dst); -} - -SIMD_API void SimdResizeBilinear(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable && dstWidth >= Avx512bw::A) - Avx512bw::ResizeBilinear(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, channelCount); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && dstWidth >= Avx2::A) - Avx2::ResizeBilinear(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, channelCount); - else -#endif -#ifdef SIMD_SSSE3_ENABLE - if(Ssse3::Enable && dstWidth >= Ssse3::A) - Ssse3::ResizeBilinear(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, channelCount); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && dstWidth >= Sse2::A) - Sse2::ResizeBilinear(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, channelCount); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && dstWidth >= Vmx::A) - Vmx::ResizeBilinear(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, channelCount); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && dstWidth >= Neon::A) - Neon::ResizeBilinear(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, channelCount); - else -#endif - Base::ResizeBilinear(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, channelCount); -} - -SIMD_API void * SimdResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - return Avx512bw::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method); - else -#endif -#ifdef SIMD_AVX512F_ENABLE - if (Avx512f::Enable) - return Avx512f::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if (Avx2::Enable) - return Avx2::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method); - else -#endif -#ifdef SIMD_AVX_ENABLE - if (Avx::Enable) - return Avx::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method); - else -#endif -#ifdef SIMD_SSE41_ENABLE - if (Sse41::Enable) - return Sse41::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method); - else -#endif -#ifdef SIMD_SSSE3_ENABLE - if (Ssse3::Enable) - return Ssse3::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if (Sse2::Enable) - return Sse2::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method); - else -#endif -#ifdef SIMD_SSE_ENABLE - if (Sse::Enable) - return Sse::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable) - return Neon::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method); - else -#endif - return Base::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method); -} - -SIMD_API void SimdResizerRun(const void * resizer, const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride) -{ - ((Resizer*)resizer)->Run(src, srcStride, dst, dstStride); -} - -SIMD_API void SimdRgbToBgra(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha) -{ -#ifdef SIMD_AVX512BW_ENABLE - if 
(Avx512bw::Enable)
-        Avx512bw::RgbToBgra(rgb, width, height, rgbStride, bgra, bgraStride, alpha);
-    else
-#endif
-#if defined(SIMD_AVX2_ENABLE) && !defined(SIMD_CLANG_AVX2_BGR_TO_BGRA_ERROR)
-    if (Avx2::Enable && width >= Avx2::A)
-        Avx2::RgbToBgra(rgb, width, height, rgbStride, bgra, bgraStride, alpha);
-    else
-#endif
-#ifdef SIMD_SSSE3_ENABLE
-    if (Ssse3::Enable && width >= Ssse3::A)
-        Ssse3::RgbToBgra(rgb, width, height, rgbStride, bgra, bgraStride, alpha);
-    else
-#endif
-#ifdef SIMD_NEON_ENABLE
-    if (Neon::Enable && width >= Neon::A)
-        Neon::RgbToBgra(rgb, width, height, rgbStride, bgra, bgraStride, alpha);
-    else
-#endif
-        Base::RgbToBgra(rgb, width, height, rgbStride, bgra, bgraStride, alpha);
-}
-
-SIMD_API void SimdRgbToGray(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* gray, size_t grayStride)
-{
-#ifdef SIMD_AVX512BW_ENABLE
-    if (Avx512bw::Enable)
-        Avx512bw::RgbToGray(rgb, width, height, rgbStride, gray, grayStride);
-    else
-#endif
-#if defined(SIMD_AVX2_ENABLE) && !defined(SIMD_CLANG_AVX2_BGR_TO_BGRA_ERROR)
-    if (Avx2::Enable && width >= Avx2::A)
-        Avx2::RgbToGray(rgb, width, height, rgbStride, gray, grayStride);
-    else
-#endif
-#ifdef SIMD_SSSE3_ENABLE
-    if (Ssse3::Enable && width >= Ssse3::A)
-        Ssse3::RgbToGray(rgb, width, height, rgbStride, gray, grayStride);
-    else
-#endif
-#ifdef SIMD_NEON_ENABLE
-    if (Neon::Enable && width >= Neon::A)
-        Neon::RgbToGray(rgb, width, height, rgbStride, gray, grayStride);
-    else
-#endif
-        Base::RgbToGray(rgb, width, height, rgbStride, gray, grayStride);
-}
-
-SIMD_API void SimdSegmentationChangeIndex(uint8_t * mask, size_t stride, size_t width, size_t height, uint8_t oldIndex, uint8_t newIndex)
-{
-#ifdef SIMD_AVX512BW_ENABLE
-    if (Avx512bw::Enable)
-        Avx512bw::SegmentationChangeIndex(mask, stride, width, height, oldIndex, newIndex);
-    else
-#endif
-#ifdef SIMD_AVX2_ENABLE
-    if(Avx2::Enable && width >= Avx2::A)
-        Avx2::SegmentationChangeIndex(mask, stride, width, height, oldIndex, newIndex);
-    else
-#endif
-#ifdef SIMD_SSE2_ENABLE
-    if(Sse2::Enable && width >= Sse2::A)
-        Sse2::SegmentationChangeIndex(mask, stride, width, height, oldIndex, newIndex);
-    else
-#endif
-#ifdef SIMD_VMX_ENABLE
-    if(Vmx::Enable && width >= Vmx::A)
-        Vmx::SegmentationChangeIndex(mask, stride, width, height, oldIndex, newIndex);
-    else
-#endif
-#ifdef SIMD_NEON_ENABLE
-    if (Neon::Enable && width >= Neon::A)
-        Neon::SegmentationChangeIndex(mask, stride, width, height, oldIndex, newIndex);
-    else
-#endif
-        Base::SegmentationChangeIndex(mask, stride, width, height, oldIndex, newIndex);
-}
-
-SIMD_API void SimdSegmentationFillSingleHoles(uint8_t * mask, size_t stride, size_t width, size_t height, uint8_t index)
-{
-#ifdef SIMD_AVX512BW_ENABLE
-    if (Avx512bw::Enable)
-        Avx512bw::SegmentationFillSingleHoles(mask, stride, width, height, index);
-    else
-#endif
-#ifdef SIMD_AVX2_ENABLE
-    if(Avx2::Enable && width > Avx2::A + 2)
-        Avx2::SegmentationFillSingleHoles(mask, stride, width, height, index);
-    else
-#endif
-#ifdef SIMD_SSE2_ENABLE
-    if(Sse2::Enable && width > Sse2::A + 2)
-        Sse2::SegmentationFillSingleHoles(mask, stride, width, height, index);
-    else
-#endif
-#ifdef SIMD_VMX_ENABLE
-    if(Vmx::Enable && width > Vmx::A + 2)
-        Vmx::SegmentationFillSingleHoles(mask, stride, width, height, index);
-    else
-#endif
-#ifdef SIMD_NEON_ENABLE
-    if (Neon::Enable && width > Neon::A + 2)
-        Neon::SegmentationFillSingleHoles(mask, stride, width, height, index);
-    else
-#endif
-        Base::SegmentationFillSingleHoles(mask, stride, width, height, index);
-}
-
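Every wrapper removed in this hunk follows the same dispatch pattern: a compile-time #ifdef SIMD_*_ENABLE cascade paired with a runtime Namespace::Enable CPU-feature check, usually plus a minimum-width guard so the vector kernel gets at least one full register of work (A is the register width in bytes, F in floats). Each block ends in a dangling else before its #endif, so the blocks stack and exactly one implementation runs, bottoming out in the scalar Base:: version. The hot entry points instead resolve a function pointer once via the SIMD_FUNC2/3/4... macros and cache it in a local static. Below is a minimal, self-contained sketch of both idioms; Foo, FooPtr, ResolveFoo, and the Avx2 declarations are hypothetical stand-ins, not actual Simd APIs.

    #include <cstddef>

    namespace Base // scalar fallback: always compiled, always correct
    {
        inline void Foo(const float * src, size_t size, float * dst)
        {
            for (size_t i = 0; i < size; ++i)
                dst[i] = src[i] * 2.0f; // stand-in for the real per-element work
        }
    }

    #ifdef SIMD_AVX2_ENABLE
    namespace Avx2
    {
        const size_t F = 8;       // floats per 256-bit register
        extern const bool Enable; // set once at startup from a cpuid probe
        void Foo(const float * src, size_t size, float * dst); // defined in a separate -mavx2 translation unit
    }
    #endif

    void SimdFoo(const float * src, size_t size, float * dst)
    {
    #ifdef SIMD_AVX2_ENABLE
        if (Avx2::Enable && size >= Avx2::F) // compiled in? CPU supports it? enough work?
            Avx2::Foo(src, size, dst);
        else
    #endif
            Base::Foo(src, size, dst); // body of the last dangling else
    }

    // Cached-pointer variant, the idea behind the SIMD_FUNC* macros:
    typedef void (*FooPtr)(const float * src, size_t size, float * dst);

    inline FooPtr ResolveFoo()
    {
    #ifdef SIMD_AVX2_ENABLE
        if (Avx2::Enable)
            return Avx2::Foo;
    #endif
        return Base::Foo;
    }

    void SimdFooCached(const float * src, size_t size, float * dst)
    {
        static const FooPtr foo = ResolveFoo(); // resolved once; thread-safe since C++11
        foo(src, size, dst);
    }

Either way the scalar path stays reachable on any hardware, which is why every cascade in this file ends with a Base:: call and why narrow images are routed past the vector kernels by the width guards.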
-SIMD_API void SimdSegmentationPropagate2x2(const uint8_t * parent, size_t parentStride, size_t width, size_t height, - uint8_t * child, size_t childStride, const uint8_t * difference, size_t differenceStride, - uint8_t currentIndex, uint8_t invalidIndex, uint8_t emptyIndex, uint8_t differenceThreshold) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::SegmentationPropagate2x2(parent, parentStride, width, height, child, childStride, - difference, differenceStride, currentIndex, invalidIndex, emptyIndex, differenceThreshold); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width >= Avx2::A + 1) - Avx2::SegmentationPropagate2x2(parent, parentStride, width, height, child, childStride, - difference, differenceStride, currentIndex, invalidIndex, emptyIndex, differenceThreshold); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && width >= Sse2::A + 1) - Sse2::SegmentationPropagate2x2(parent, parentStride, width, height, child, childStride, - difference, differenceStride, currentIndex, invalidIndex, emptyIndex, differenceThreshold); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width >= Vmx::A + 1) - Vmx::SegmentationPropagate2x2(parent, parentStride, width, height, child, childStride, - difference, differenceStride, currentIndex, invalidIndex, emptyIndex, differenceThreshold); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A + 1) - Neon::SegmentationPropagate2x2(parent, parentStride, width, height, child, childStride, - difference, differenceStride, currentIndex, invalidIndex, emptyIndex, differenceThreshold); - else -#endif - Base::SegmentationPropagate2x2(parent, parentStride, width, height, child, childStride, - difference, differenceStride, currentIndex, invalidIndex, emptyIndex, differenceThreshold); -} - -SIMD_API void SimdSegmentationShrinkRegion(const uint8_t * mask, size_t stride, size_t width, size_t height, uint8_t index, - ptrdiff_t * left, ptrdiff_t * top, ptrdiff_t * right, ptrdiff_t * bottom) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::SegmentationShrinkRegion(mask, stride, width, height, index, left, top, right, bottom); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width >= Avx2::A && *right - *left >= (ptrdiff_t)Avx2::A) - Avx2::SegmentationShrinkRegion(mask, stride, width, height, index, left, top, right, bottom); - else -#endif -#ifdef SIMD_SSE41_ENABLE - if(Sse41::Enable && width >= Sse41::A && *right - *left >= (ptrdiff_t)Sse41::A) - Sse41::SegmentationShrinkRegion(mask, stride, width, height, index, left, top, right, bottom); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width >= Vmx::A && *right - *left >= (ptrdiff_t)Vmx::A) - Vmx::SegmentationShrinkRegion(mask, stride, width, height, index, left, top, right, bottom); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A && *right - *left >= (ptrdiff_t)Neon::A) - Neon::SegmentationShrinkRegion(mask, stride, width, height, index, left, top, right, bottom); - else -#endif - Base::SegmentationShrinkRegion(mask, stride, width, height, index, left, top, right, bottom); -} - -SIMD_API void SimdShiftBilinear(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t channelCount, - const uint8_t * bkg, size_t bkgStride, const double * shiftX, const double * shiftY, - size_t cropLeft, size_t cropTop, size_t cropRight, size_t cropBottom, uint8_t * dst, size_t dstStride) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::ShiftBilinear(src, 
srcStride, width, height, channelCount, bkg, bkgStride, - shiftX, shiftY, cropLeft, cropTop, cropRight, cropBottom, dst, dstStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable) - Avx2::ShiftBilinear(src, srcStride, width, height, channelCount, bkg, bkgStride, - shiftX, shiftY, cropLeft, cropTop, cropRight, cropBottom, dst, dstStride); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable) - Sse2::ShiftBilinear(src, srcStride, width, height, channelCount, bkg, bkgStride, - shiftX, shiftY, cropLeft, cropTop, cropRight, cropBottom, dst, dstStride); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable) - Vmx::ShiftBilinear(src, srcStride, width, height, channelCount, bkg, bkgStride, - shiftX, shiftY, cropLeft, cropTop, cropRight, cropBottom, dst, dstStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable) - Neon::ShiftBilinear(src, srcStride, width, height, channelCount, bkg, bkgStride, - shiftX, shiftY, cropLeft, cropTop, cropRight, cropBottom, dst, dstStride); - else -#endif - Base::ShiftBilinear(src, srcStride, width, height, channelCount, bkg, bkgStride, - shiftX, shiftY, cropLeft, cropTop, cropRight, cropBottom, dst, dstStride); -} - -SIMD_API void SimdSobelDx(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable && width > Avx512bw::A) - Avx512bw::SobelDx(src, srcStride, width, height, dst, dstStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width > Avx2::A) - Avx2::SobelDx(src, srcStride, width, height, dst, dstStride); - else -#endif -#ifdef SIMD_SSSE3_ENABLE - if(Ssse3::Enable && width > Ssse3::A) - Ssse3::SobelDx(src, srcStride, width, height, dst, dstStride); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && width > Sse2::A) - Sse2::SobelDx(src, srcStride, width, height, dst, dstStride); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width > Vmx::A) - Vmx::SobelDx(src, srcStride, width, height, dst, dstStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width > Neon::A) - Neon::SobelDx(src, srcStride, width, height, dst, dstStride); - else -#endif - Base::SobelDx(src, srcStride, width, height, dst, dstStride); -} - -SIMD_API void SimdSobelDxAbs(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable && width > Avx512bw::A) - Avx512bw::SobelDxAbs(src, srcStride, width, height, dst, dstStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width > Avx2::A) - Avx2::SobelDxAbs(src, srcStride, width, height, dst, dstStride); - else -#endif -#ifdef SIMD_SSSE3_ENABLE - if(Ssse3::Enable && width > Ssse3::A) - Ssse3::SobelDxAbs(src, srcStride, width, height, dst, dstStride); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width > Vmx::A) - Vmx::SobelDxAbs(src, srcStride, width, height, dst, dstStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width > Neon::A) - Neon::SobelDxAbs(src, srcStride, width, height, dst, dstStride); - else -#endif - Base::SobelDxAbs(src, srcStride, width, height, dst, dstStride); -} - -SIMD_API void SimdSobelDxAbsSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable && width > Avx512bw::A) - Avx512bw::SobelDxAbsSum(src, stride, width, height, sum); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width > Avx2::A) - 
Avx2::SobelDxAbsSum(src, stride, width, height, sum); - else -#endif -#ifdef SIMD_SSSE3_ENABLE - if(Ssse3::Enable && width > Ssse3::A) - Ssse3::SobelDxAbsSum(src, stride, width, height, sum); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width > Vmx::A) - Vmx::SobelDxAbsSum(src, stride, width, height, sum); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width > Neon::A) - Neon::SobelDxAbsSum(src, stride, width, height, sum); - else -#endif - Base::SobelDxAbsSum(src, stride, width, height, sum); -} - -SIMD_API void SimdSobelDy(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable && width > Avx512bw::A) - Avx512bw::SobelDy(src, srcStride, width, height, dst, dstStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width > Avx2::A) - Avx2::SobelDy(src, srcStride, width, height, dst, dstStride); - else -#endif -#ifdef SIMD_SSSE3_ENABLE - if(Ssse3::Enable && width > Ssse3::A) - Ssse3::SobelDy(src, srcStride, width, height, dst, dstStride); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && width > Sse2::A) - Sse2::SobelDy(src, srcStride, width, height, dst, dstStride); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width > Vmx::A) - Vmx::SobelDy(src, srcStride, width, height, dst, dstStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width > Neon::A) - Neon::SobelDy(src, srcStride, width, height, dst, dstStride); - else -#endif - Base::SobelDy(src, srcStride, width, height, dst, dstStride); -} - -SIMD_API void SimdSobelDyAbs(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable && width > Avx512bw::A) - Avx512bw::SobelDyAbs(src, srcStride, width, height, dst, dstStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width > Avx2::A) - Avx2::SobelDyAbs(src, srcStride, width, height, dst, dstStride); - else -#endif -#ifdef SIMD_SSSE3_ENABLE - if(Ssse3::Enable && width > Ssse3::A) - Ssse3::SobelDyAbs(src, srcStride, width, height, dst, dstStride); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width > Vmx::A) - Vmx::SobelDyAbs(src, srcStride, width, height, dst, dstStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width > Neon::A) - Neon::SobelDyAbs(src, srcStride, width, height, dst, dstStride); - else -#endif - Base::SobelDyAbs(src, srcStride, width, height, dst, dstStride); -} - -SIMD_API void SimdSobelDyAbsSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable && width > Avx512bw::A) - Avx512bw::SobelDyAbsSum(src, stride, width, height, sum); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width > Avx2::A) - Avx2::SobelDyAbsSum(src, stride, width, height, sum); - else -#endif -#ifdef SIMD_SSSE3_ENABLE - if(Ssse3::Enable && width > Ssse3::A) - Ssse3::SobelDyAbsSum(src, stride, width, height, sum); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width > Vmx::A) - Vmx::SobelDyAbsSum(src, stride, width, height, sum); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width > Neon::A) - Neon::SobelDyAbsSum(src, stride, width, height, sum); - else -#endif - Base::SobelDyAbsSum(src, stride, width, height, sum); -} - -SIMD_API void SimdContourMetrics(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride) 
-{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable && width > Avx512bw::A) - Avx512bw::ContourMetrics(src, srcStride, width, height, dst, dstStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width > Avx2::A) - Avx2::ContourMetrics(src, srcStride, width, height, dst, dstStride); - else -#endif -#ifdef SIMD_SSSE3_ENABLE - if(Ssse3::Enable && width > Ssse3::A) - Ssse3::ContourMetrics(src, srcStride, width, height, dst, dstStride); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width > Vmx::A) - Vmx::ContourMetrics(src, srcStride, width, height, dst, dstStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width > Neon::A) - Neon::ContourMetrics(src, srcStride, width, height, dst, dstStride); - else -#endif - Base::ContourMetrics(src, srcStride, width, height, dst, dstStride); -} - -SIMD_API void SimdContourMetricsMasked(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * mask, size_t maskStride, uint8_t indexMin, uint8_t * dst, size_t dstStride) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable && width > Avx512bw::A) - Avx512bw::ContourMetricsMasked(src, srcStride, width, height, mask, maskStride, indexMin, dst, dstStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width > Avx2::A) - Avx2::ContourMetricsMasked(src, srcStride, width, height, mask, maskStride, indexMin, dst, dstStride); - else -#endif -#ifdef SIMD_SSSE3_ENABLE - if(Ssse3::Enable && width > Ssse3::A) - Ssse3::ContourMetricsMasked(src, srcStride, width, height, mask, maskStride, indexMin, dst, dstStride); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width > Vmx::A) - Vmx::ContourMetricsMasked(src, srcStride, width, height, mask, maskStride, indexMin, dst, dstStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width > Neon::A) - Neon::ContourMetricsMasked(src, srcStride, width, height, mask, maskStride, indexMin, dst, dstStride); - else -#endif - Base::ContourMetricsMasked(src, srcStride, width, height, mask, maskStride, indexMin, dst, dstStride); -} - -SIMD_API void SimdContourAnchors(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t step, int16_t threshold, uint8_t * dst, size_t dstStride) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable && width > Avx512bw::A) - Avx512bw::ContourAnchors(src, srcStride, width, height, step, threshold, dst, dstStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width > Avx2::A) - Avx2::ContourAnchors(src, srcStride, width, height, step, threshold, dst, dstStride); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && width > Sse2::A) - Sse2::ContourAnchors(src, srcStride, width, height, step, threshold, dst, dstStride); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width > Vmx::A) - Vmx::ContourAnchors(src, srcStride, width, height, step, threshold, dst, dstStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width > Neon::A) - Neon::ContourAnchors(src, srcStride, width, height, step, threshold, dst, dstStride); - else -#endif - Base::ContourAnchors(src, srcStride, width, height, step, threshold, dst, dstStride); -} - -SIMD_API void SimdSquaredDifferenceSum(const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride, - size_t width, size_t height, uint64_t * sum) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::SquaredDifferenceSum(a, aStride, b, bStride, width, height, sum); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width 
>= Avx2::A) - Avx2::SquaredDifferenceSum(a, aStride, b, bStride, width, height, sum); - else -#endif -#ifdef SIMD_SSSE3_ENABLE - if(Ssse3::Enable && width >= Ssse3::A) - Ssse3::SquaredDifferenceSum(a, aStride, b, bStride, width, height, sum); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && width >= Sse2::A) - Sse2::SquaredDifferenceSum(a, aStride, b, bStride, width, height, sum); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width >= Vmx::A) - Vmx::SquaredDifferenceSum(a, aStride, b, bStride, width, height, sum); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::SquaredDifferenceSum(a, aStride, b, bStride, width, height, sum); - else -#endif - Base::SquaredDifferenceSum(a, aStride, b, bStride, width, height, sum); -} - -SIMD_API void SimdSquaredDifferenceSumMasked(const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride, - const uint8_t *mask, size_t maskStride, uint8_t index, size_t width, size_t height, uint64_t * sum) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::SquaredDifferenceSumMasked(a, aStride, b, bStride, mask, maskStride, index, width, height, sum); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width >= Avx2::A) - Avx2::SquaredDifferenceSumMasked(a, aStride, b, bStride, mask, maskStride, index, width, height, sum); - else -#endif -#ifdef SIMD_SSSE3_ENABLE - if(Ssse3::Enable && width >= Ssse3::A) - Ssse3::SquaredDifferenceSumMasked(a, aStride, b, bStride, mask, maskStride, index, width, height, sum); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && width >= Sse2::A) - Sse2::SquaredDifferenceSumMasked(a, aStride, b, bStride, mask, maskStride, index, width, height, sum); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width >= Vmx::A) - Vmx::SquaredDifferenceSumMasked(a, aStride, b, bStride, mask, maskStride, index, width, height, sum); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::SquaredDifferenceSumMasked(a, aStride, b, bStride, mask, maskStride, index, width, height, sum); - else -#endif - Base::SquaredDifferenceSumMasked(a, aStride, b, bStride, mask, maskStride, index, width, height, sum); -} - -SIMD_API void SimdSquaredDifferenceSum32f(const float * a, const float * b, size_t size, float * sum) -{ - typedef void (* SimdSquaredDifferenceSum32fPtr) (const float * a, const float * b, size_t size, float * sum); - const static SimdSquaredDifferenceSum32fPtr simdSquaredDifferenceSum32f = SIMD_FUNC5(SquaredDifferenceSum32f, SIMD_AVX512F_FUNC, SIMD_AVX_FUNC, SIMD_SSE_FUNC, SIMD_VSX_FUNC, SIMD_NEON_FUNC); - - simdSquaredDifferenceSum32f(a, b, size, sum); -} - -SIMD_API void SimdSquaredDifferenceKahanSum32f(const float * a, const float * b, size_t size, float * sum) -{ - typedef void (* SimdSquaredDifferenceKahanSum32fPtr) (const float * a, const float * b, size_t size, float * sum); - const static SimdSquaredDifferenceKahanSum32fPtr simdSquaredDifferenceKahanSum32f = SIMD_FUNC5(SquaredDifferenceKahanSum32f, SIMD_AVX512F_FUNC, SIMD_AVX_FUNC, SIMD_SSE_FUNC, SIMD_VSX_FUNC, SIMD_NEON_FUNC); - - simdSquaredDifferenceKahanSum32f(a, b, size, sum); -} - -SIMD_API void SimdGetStatistic(const uint8_t * src, size_t stride, size_t width, size_t height, - uint8_t * min, uint8_t * max, uint8_t * average) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::GetStatistic(src, stride, width, height, min, max, average); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width >= Avx2::A) - 
Avx2::GetStatistic(src, stride, width, height, min, max, average); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && width >= Sse2::A) - Sse2::GetStatistic(src, stride, width, height, min, max, average); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width >= Vmx::A) - Vmx::GetStatistic(src, stride, width, height, min, max, average); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::GetStatistic(src, stride, width, height, min, max, average); - else -#endif - Base::GetStatistic(src, stride, width, height, min, max, average); -} - -SIMD_API void SimdGetMoments(const uint8_t * mask, size_t stride, size_t width, size_t height, uint8_t index, - uint64_t * area, uint64_t * x, uint64_t * y, uint64_t * xx, uint64_t * xy, uint64_t * yy) -{ - const bool simd = width < SHRT_MAX && height < SHRT_MAX; -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable && width >= Avx512bw::A && simd) - Avx512bw::GetMoments(mask, stride, width, height, index, area, x, y, xx, xy, yy); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width >= Avx2::A) - Avx2::GetMoments(mask, stride, width, height, index, area, x, y, xx, xy, yy); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && width >= Sse2::A) - Sse2::GetMoments(mask, stride, width, height, index, area, x, y, xx, xy, yy); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width >= Vmx::A && simd) - Vmx::GetMoments(mask, stride, width, height, index, area, x, y, xx, xy, yy); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A && simd) - Neon::GetMoments(mask, stride, width, height, index, area, x, y, xx, xy, yy); - else -#endif - Base::GetMoments(mask, stride, width, height, index, area, x, y, xx, xy, yy); -} - -SIMD_API void SimdGetObjectMoments(const uint8_t* src, size_t srcStride, size_t width, size_t height, const uint8_t* mask, size_t maskStride, uint8_t index, - uint64_t* n, uint64_t* s, uint64_t* sx, uint64_t* sy, uint64_t* sxx, uint64_t* sxy, uint64_t* syy) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::GetObjectMoments(src, srcStride, width, height, mask, maskStride, index, n, s, sx, sy, sxx, sxy, syy); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if (Avx2::Enable && width >= Avx2::A) - Avx2::GetObjectMoments(src, srcStride, width, height, mask, maskStride, index, n, s, sx, sy, sxx, sxy, syy); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if (Sse2::Enable && width >= Sse2::A) - Sse2::GetObjectMoments(src, srcStride, width, height, mask, maskStride, index, n, s, sx, sy, sxx, sxy, syy); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::GetObjectMoments(src, srcStride, width, height, mask, maskStride, index, n, s, sx, sy, sxx, sxy, syy); - else -#endif - Base::GetObjectMoments(src, srcStride, width, height, mask, maskStride, index, n, s, sx, sy, sxx, sxy, syy); -} - -SIMD_API void SimdGetRowSums(const uint8_t * src, size_t stride, size_t width, size_t height, uint32_t * sums) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::GetRowSums(src, stride, width, height, sums); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width >= Avx2::A) - Avx2::GetRowSums(src, stride, width, height, sums); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && width >= Sse2::A) - Sse2::GetRowSums(src, stride, width, height, sums); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width >= Vmx::A) - Vmx::GetRowSums(src, stride, width, height, sums); - else -#endif 
-#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::GetRowSums(src, stride, width, height, sums); - else -#endif - Base::GetRowSums(src, stride, width, height, sums); -} - -SIMD_API void SimdGetColSums(const uint8_t * src, size_t stride, size_t width, size_t height, uint32_t * sums) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::GetColSums(src, stride, width, height, sums); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width >= Avx2::A) - Avx2::GetColSums(src, stride, width, height, sums); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && width >= Sse2::A) - Sse2::GetColSums(src, stride, width, height, sums); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width >= Vmx::A) - Vmx::GetColSums(src, stride, width, height, sums); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::GetColSums(src, stride, width, height, sums); - else -#endif - Base::GetColSums(src, stride, width, height, sums); -} - -SIMD_API void SimdGetAbsDyRowSums(const uint8_t * src, size_t stride, size_t width, size_t height, uint32_t * sums) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::GetAbsDyRowSums(src, stride, width, height, sums); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width >= Avx2::A) - Avx2::GetAbsDyRowSums(src, stride, width, height, sums); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && width >= Sse2::A) - Sse2::GetAbsDyRowSums(src, stride, width, height, sums); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width >= Vmx::A) - Vmx::GetAbsDyRowSums(src, stride, width, height, sums); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::GetAbsDyRowSums(src, stride, width, height, sums); - else -#endif - Base::GetAbsDyRowSums(src, stride, width, height, sums); -} - -SIMD_API void SimdGetAbsDxColSums(const uint8_t * src, size_t stride, size_t width, size_t height, uint32_t * sums) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::GetAbsDxColSums(src, stride, width, height, sums); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width >= Avx2::A) - Avx2::GetAbsDxColSums(src, stride, width, height, sums); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && width >= Sse2::A) - Sse2::GetAbsDxColSums(src, stride, width, height, sums); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width >= Vmx::A) - Vmx::GetAbsDxColSums(src, stride, width, height, sums); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::GetAbsDxColSums(src, stride, width, height, sums); - else -#endif - Base::GetAbsDxColSums(src, stride, width, height, sums); -} - -SIMD_API void SimdValueSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::ValueSum(src, stride, width, height, sum); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width >= Avx2::A) - Avx2::ValueSum(src, stride, width, height, sum); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && width >= Sse2::A) - Sse2::ValueSum(src, stride, width, height, sum); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width >= Vmx::A) - Vmx::ValueSum(src, stride, width, height, sum); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::ValueSum(src, stride, width, height, sum); - else -#endif - Base::ValueSum(src, stride, width, height, sum); 
-} - -SIMD_API void SimdSquareSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::SquareSum(src, stride, width, height, sum); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width >= Avx2::A) - Avx2::SquareSum(src, stride, width, height, sum); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && width >= Sse2::A) - Sse2::SquareSum(src, stride, width, height, sum); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width >= Vmx::A) - Vmx::SquareSum(src, stride, width, height, sum); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::SquareSum(src, stride, width, height, sum); - else -#endif - Base::SquareSum(src, stride, width, height, sum); -} - -SIMD_API void SimdValueSquareSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * valueSum, uint64_t * squareSum) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::ValueSquareSum(src, stride, width, height, valueSum, squareSum); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width >= Avx2::A) - Avx2::ValueSquareSum(src, stride, width, height, valueSum, squareSum); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && width >= Sse2::A) - Sse2::ValueSquareSum(src, stride, width, height, valueSum, squareSum); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::ValueSquareSum(src, stride, width, height, valueSum, squareSum); - else -#endif - Base::ValueSquareSum(src, stride, width, height, valueSum, squareSum); -} - -SIMD_API void SimdCorrelationSum(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, size_t width, size_t height, uint64_t * sum) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::CorrelationSum(a, aStride, b, bStride, width, height, sum); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width >= Avx2::A) - Avx2::CorrelationSum(a, aStride, b, bStride, width, height, sum); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && width >= Sse2::A) - Sse2::CorrelationSum(a, aStride, b, bStride, width, height, sum); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width >= Vmx::A) - Vmx::CorrelationSum(a, aStride, b, bStride, width, height, sum); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::CorrelationSum(a, aStride, b, bStride, width, height, sum); - else -#endif - Base::CorrelationSum(a, aStride, b, bStride, width, height, sum); -} - -SIMD_API void SimdStretchGray2x2(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::StretchGray2x2(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && srcWidth >= Avx2::A) - Avx2::StretchGray2x2(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && srcWidth >= Sse2::A) - Sse2::StretchGray2x2(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && srcWidth >= Vmx::A) - Vmx::StretchGray2x2(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && srcWidth >= Neon::A) - 
Neon::StretchGray2x2(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); - else -#endif - Base::StretchGray2x2(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); -} - -SIMD_API void SimdSvmSumLinear(const float * x, const float * svs, const float * weights, size_t length, size_t count, float * sum) -{ -#ifdef SIMD_AVX512F_ENABLE - if (Avx512f::Enable) - Avx512f::SvmSumLinear(x, svs, weights, length, count, sum); - else -#endif -#ifdef SIMD_AVX_ENABLE - if(Avx::Enable) - Avx::SvmSumLinear(x, svs, weights, length, count, sum); - else -#endif -#ifdef SIMD_SSE_ENABLE - if(Sse::Enable) - Sse::SvmSumLinear(x, svs, weights, length, count, sum); - else -#endif -#ifdef SIMD_VSX_ENABLE - if(Vsx::Enable) - Vsx::SvmSumLinear(x, svs, weights, length, count, sum); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable) - Neon::SvmSumLinear(x, svs, weights, length, count, sum); - else -#endif - Base::SvmSumLinear(x, svs, weights, length, count, sum); -} - -SIMD_API void SimdSynetAddBias(const float * bias, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format) -{ - typedef void(*SimdSynetAddBiasPtr) (const float * bias, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format); - const static SimdSynetAddBiasPtr simdSynetAddBias = SIMD_FUNC4(SynetAddBias, SIMD_AVX512F_FUNC, SIMD_AVX_FUNC, SIMD_SSE_FUNC, SIMD_NEON_FUNC); - - simdSynetAddBias(bias, channels, spatial, dst, format); -} - -SIMD_API void SimdSynetConvert32fTo8u(const float* src, size_t batch, size_t channels, size_t height, size_t width, SimdTensorFormatType format, const float* scale, const float* shift, uint8_t* dst, SimdSynetCompatibilityType compatibility) -{ - typedef void(*SimdSynetConvert32fTo8uPtr) (const float* src, size_t batch, size_t channels, size_t height, size_t width, SimdTensorFormatType format, const float* scale, const float* shift, uint8_t* dst, SimdSynetCompatibilityType compatibility); - const static SimdSynetConvert32fTo8uPtr simdSynetConvert32fTo8u = SIMD_FUNC4(SynetConvert32fTo8u, SIMD_AVX512BW_FUNC, SIMD_AVX2_FUNC, SIMD_SSE2_FUNC, SIMD_NEON_FUNC); - - simdSynetConvert32fTo8u(src, batch, channels, height, width, format, scale, shift, dst, compatibility); -} - -SIMD_API void * SimdSynetConvolution32fInit(size_t batch, const SimdConvolutionParameters * params, SimdGemm32fNNPtr gemm) -{ - typedef void* (*SimdSynetConvolution32fInitPtr) (size_t batch, const SimdConvolutionParameters * params, SimdGemm32fNNPtr gemm); - const static SimdSynetConvolution32fInitPtr simdSynetConvolution32fInit = SIMD_FUNC6(SynetConvolution32fInit, SIMD_AVX512F_FUNC, SIMD_AVX2_FUNC, SIMD_AVX_FUNC, SIMD_SSE3_FUNC, SIMD_SSE2_FUNC, SIMD_NEON_FUNC); - - return simdSynetConvolution32fInit(batch, params, gemm); -} - -SIMD_API size_t SimdSynetConvolution32fExternalBufferSize(const void * context) -{ - return ((SynetConvolution32f*)context)->ExternalBufferSize(); -} - -SIMD_API size_t SimdSynetConvolution32fInternalBufferSize(const void * context) -{ - return ((SynetConvolution32f*)context)->InternalBufferSize(); -} - -SIMD_API void SimdSynetConvolution32fSetParams(void * context, const float * weight, SimdBool * internal, const float * bias, const float * params) -{ - ((SynetConvolution32f*)context)->SetParams(weight, internal, bias, params); -} - -SIMD_API void SimdSynetConvolution32fForward(void * context, const float * src, float * buf, float * dst) -{ - SynetConvolution32f * c = (SynetConvolution32f*)context; - SIMD_PERF_EXT(c); - c->Forward(src, buf, dst); -} - 
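-/* Most of the Synet* wrappers below use a second dispatch style: SIMD_FUNCn(Name, ...) selects one of the listed ISA-specific implementations (presumably via the same Enable feature flags) and the result initializes a function-local const static pointer, so the selection runs only once, on the first call, and later calls go straight through the cached pointer. Context-based entry points such as SimdSynetConvolution32fInit instead dispatch once when the context is created; the matching SetParams/Forward calls simply delegate to that object. */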
-SIMD_API void* SimdSynetConvolution8iInit(size_t batch, const SimdConvolutionParameters* conv, SimdSynetCompatibilityType compatibility) -{ - typedef void* (*SimdSynetConvolution8iInitPtr) (size_t batch, const SimdConvolutionParameters* conv, SimdSynetCompatibilityType compatibility); - const static SimdSynetConvolution8iInitPtr simdSynetConvolution8iInit = SIMD_FUNC5(SynetConvolution8iInit, SIMD_AVX512VNNI_FUNC, SIMD_AVX512BW_FUNC, SIMD_AVX2_FUNC, SIMD_SSE41_FUNC, SIMD_NEON_FUNC); - - return simdSynetConvolution8iInit(batch, conv, compatibility); -} - -SIMD_API size_t SimdSynetConvolution8iExternalBufferSize(const void* context) -{ - return ((SynetConvolution8i*)context)->ExternalBufferSize(); -} - -SIMD_API size_t SimdSynetConvolution8iInternalBufferSize(const void* context) -{ - return ((SynetConvolution8i*)context)->InternalBufferSize(); -} - -SIMD_API void SimdSynetConvolution8iSetParams(void* context, const float* weight, const float* bias, const float* params, const float* const* stats) -{ - ((SynetConvolution8i*)context)->SetParams(weight, bias, params, stats); -} - -SIMD_API void SimdSynetConvolution8iForward(void* context, const uint8_t* src, uint8_t* buf, uint8_t* dst) -{ - SynetConvolution8i* c = (SynetConvolution8i*)context; - SIMD_PERF_EXT(c); - c->Forward(src, buf, dst); -} - -SIMD_API void * SimdSynetDeconvolution32fInit(size_t batch, const SimdConvolutionParameters * params, SimdGemm32fNNPtr gemm) -{ - typedef void* (*SimdSynetDeconvolution32fInitPtr) (size_t batch, const SimdConvolutionParameters * params, SimdGemm32fNNPtr gemm); - const static SimdSynetDeconvolution32fInitPtr simdSynetDeconvolution32fInit = SIMD_FUNC5(SynetDeconvolution32fInit, SIMD_AVX512F_FUNC, SIMD_AVX2_FUNC, SIMD_AVX_FUNC, SIMD_SSE2_FUNC, SIMD_NEON_FUNC); - - return simdSynetDeconvolution32fInit(batch, params, gemm); -} - -SIMD_API size_t SimdSynetDeconvolution32fExternalBufferSize(const void * context) -{ - return ((SynetDeconvolution32f*)context)->ExternalBufferSize(); -} - -SIMD_API size_t SimdSynetDeconvolution32fInternalBufferSize(const void * context) -{ - return ((SynetDeconvolution32f*)context)->InternalBufferSize(); -} - -SIMD_API void SimdSynetDeconvolution32fSetParams(void * context, const float * weight, SimdBool * internal, const float * bias, const float * params) -{ - ((SynetDeconvolution32f*)context)->SetParams(weight, internal, bias, params); -} - -SIMD_API void SimdSynetDeconvolution32fForward(void * context, const float * src, float * buf, float * dst) -{ - SynetDeconvolution32f * d = (SynetDeconvolution32f*)context; - SIMD_PERF_EXT(d); - d->Forward(src, buf, dst); -} - -SIMD_API void SimdSynetEltwiseLayerForward(float const * const * src, const float * weight, size_t count, size_t size, SimdSynetEltwiseOperationType type, float * dst) -{ - typedef void(*SimdSynetEltwiseLayerForwardPtr) (float const * const * src, const float * weight, size_t count, size_t size, SimdSynetEltwiseOperationType type, float * dst); - const static SimdSynetEltwiseLayerForwardPtr simdSynetEltwiseLayerForward = SIMD_FUNC5(SynetEltwiseLayerForward, SIMD_AVX512F_FUNC, SIMD_AVX2_FUNC, SIMD_AVX_FUNC, SIMD_SSE_FUNC, SIMD_NEON_FUNC); - - simdSynetEltwiseLayerForward(src, weight, count, size, type, dst); -} - -SIMD_API void SimdSynetElu32f(const float * src, size_t size, const float * alpha, float * dst) -{ - typedef void(*SimdSynetElu32fPtr) (const float * src, size_t size, const float * alpha, float * dst); - const static SimdSynetElu32fPtr simdSynetElu32f = SIMD_FUNC4(SynetElu32f, SIMD_AVX512F_FUNC, 
SIMD_AVX2_FUNC, SIMD_SSE2_FUNC, SIMD_NEON_FUNC); - - simdSynetElu32f(src, size, alpha, dst); -} - -SIMD_API void SimdSynetFusedLayerForward0(const float * src, const float * bias, const float * scale, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format) -{ - typedef void(*SimdSynetFusedLayerForward0Ptr) (const float * src, const float * bias, const float * scale, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format); - const static SimdSynetFusedLayerForward0Ptr simdSynetFusedLayerForward0 = SIMD_FUNC4(SynetFusedLayerForward0, SIMD_AVX512F_FUNC, SIMD_AVX_FUNC, SIMD_SSE_FUNC, SIMD_NEON_FUNC); - - simdSynetFusedLayerForward0(src, bias, scale, channels, spatial, dst, format); -} - -SIMD_API void SimdSynetFusedLayerForward1(const float * src, const float * bias0, const float * scale1, const float * bias1, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format) -{ - typedef void(*SimdSynetFusedLayerForward1Ptr) (const float * src, const float * bias0, const float * scale1, const float * bias1, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format); - const static SimdSynetFusedLayerForward1Ptr simdSynetFusedLayerForward1 = SIMD_FUNC4(SynetFusedLayerForward1, SIMD_AVX512F_FUNC, SIMD_AVX_FUNC, SIMD_SSE_FUNC, SIMD_NEON_FUNC); - - simdSynetFusedLayerForward1(src, bias0, scale1, bias1, channels, spatial, dst, format); -} - -SIMD_API void SimdSynetFusedLayerForward2(const float * src, const float * scale, const float * bias, size_t channels, size_t spatial, const float * slope, float * dst, SimdTensorFormatType format) -{ - typedef void(*SimdSynetFusedLayerForward2Ptr) (const float * src, const float * scale, const float * bias, size_t channels, size_t spatial, const float * slope, float * dst, SimdTensorFormatType format); - const static SimdSynetFusedLayerForward2Ptr simdSynetFusedLayerForward2 = SIMD_FUNC4(SynetFusedLayerForward2, SIMD_AVX512F_FUNC, SIMD_AVX_FUNC, SIMD_SSE_FUNC, SIMD_NEON_FUNC); - - simdSynetFusedLayerForward2(src, scale, bias, channels, spatial, slope, dst, format); -} - -SIMD_API void SimdSynetFusedLayerForward3(const float * src, const float * bias, const float * scale, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format) -{ - typedef void(*SimdSynetFusedLayerForward3Ptr) (const float * src, const float * bias, const float * scale, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format); - const static SimdSynetFusedLayerForward3Ptr simdSynetFusedLayerForward3 = SIMD_FUNC4(SynetFusedLayerForward3, SIMD_AVX512F_FUNC, SIMD_AVX_FUNC, SIMD_SSE_FUNC, SIMD_NEON_FUNC); - - simdSynetFusedLayerForward3(src, bias, scale, channels, spatial, dst, format); -} - -SIMD_API void SimdSynetFusedLayerForward4(const float * src, const float * bias0, const float * scale1, const float * bias1, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format) -{ - typedef void(*SimdSynetFusedLayerForward4Ptr) (const float * src, const float * bias0, const float * scale1, const float * bias1, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format); - const static SimdSynetFusedLayerForward4Ptr simdSynetFusedLayerForward4 = SIMD_FUNC4(SynetFusedLayerForward4, SIMD_AVX512F_FUNC, SIMD_AVX_FUNC, SIMD_SSE_FUNC, SIMD_NEON_FUNC); - - simdSynetFusedLayerForward4(src, bias0, scale1, bias1, channels, spatial, dst, format); -} - -SIMD_API void SimdSynetFusedLayerForward8(const float * src0, const float * src1, const float * src2, size_t channels, size_t spatial, 
float * dst, SimdTensorFormatType format) -{ - typedef void(*SimdSynetFusedLayerForward8Ptr) (const float * src0, const float * src1, const float * src2, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format); - const static SimdSynetFusedLayerForward8Ptr simdSynetFusedLayerForward8 = SIMD_FUNC4(SynetFusedLayerForward8, SIMD_AVX512F_FUNC, SIMD_AVX_FUNC, SIMD_SSE_FUNC, SIMD_NEON_FUNC); - - simdSynetFusedLayerForward8(src0, src1, src2, channels, spatial, dst, format); -} - -SIMD_API void SimdSynetFusedLayerForward9(const float * src0, const float * src1, const float * scale, const float * bias, size_t channels0, size_t channels1, size_t spatial, float * dst0, float * dst1, SimdTensorFormatType format) -{ - typedef void(*SimdSynetFusedLayerForward9Ptr) (const float * src0, const float * src1, const float * scale, const float * bias, size_t channels0, size_t channels1, size_t spatial, float * dst0, float * dst1, SimdTensorFormatType format); - const static SimdSynetFusedLayerForward9Ptr simdSynetFusedLayerForward9 = SIMD_FUNC4(SynetFusedLayerForward9, SIMD_AVX512F_FUNC, SIMD_AVX_FUNC, SIMD_SSE_FUNC, SIMD_NEON_FUNC); - - simdSynetFusedLayerForward9(src0, src1, scale, bias, channels0, channels1, spatial, dst0, dst1, format); -} - -SIMD_API void SimdSynetHswish32f(const float * src, size_t size, const float * shift, const float * scale, float * dst) -{ - typedef void(*SimdSynetHswish32fPtr) (const float * src, size_t size, const float * shift, const float * scale, float * dst); - const static SimdSynetHswish32fPtr simdSynetHswish32f = SIMD_FUNC4(SynetHswish32f, SIMD_AVX512F_FUNC, SIMD_AVX_FUNC, SIMD_SSE_FUNC, SIMD_NEON_FUNC); - - simdSynetHswish32f(src, size, shift, scale, dst); -} - -SIMD_API void SimdSynetInnerProductLayerForward(const float * src, const float * weight, const float * bias, size_t count, size_t size, float * dst) -{ - typedef void(*SimdSynetInnerProductLayerForwardPtr) (const float * src, const float * weight, const float * bias, size_t count, size_t size, float * dst); - const static SimdSynetInnerProductLayerForwardPtr simdSynetInnerProductLayerForward = SIMD_FUNC5(SynetInnerProductLayerForward, SIMD_AVX512F_FUNC, SIMD_AVX2_FUNC, SIMD_AVX_FUNC, SIMD_SSE_FUNC, SIMD_NEON_FUNC); - - simdSynetInnerProductLayerForward(src, weight, bias, count, size, dst); -} - -SIMD_API void SimdSynetLrnLayerCrossChannels(const float * src, size_t half, size_t channels, size_t spatial, const float * k, float * dst, SimdTensorFormatType format) -{ - typedef void(*SimdSynetLrnLayerCrossChannelsPtr) (const float * src, size_t half, size_t channels, size_t spatial, const float * k, float * dst, SimdTensorFormatType format); - const static SimdSynetLrnLayerCrossChannelsPtr simdSynetLrnLayerCrossChannels = SIMD_FUNC4(SynetLrnLayerCrossChannels, SIMD_AVX512F_FUNC, SIMD_AVX2_FUNC, SIMD_SSE2_FUNC, SIMD_NEON_FUNC); - - simdSynetLrnLayerCrossChannels(src, half, channels, spatial, k, dst, format); -} - -SIMD_API void * SimdSynetMergedConvolution32fInit(size_t batch, const SimdConvolutionParameters * convs, size_t count, SimdBool add) -{ - typedef void* (*SimdSynetMergedConvolution32fInitPtr) (size_t batch, const SimdConvolutionParameters * convs, size_t count, SimdBool add); - const static SimdSynetMergedConvolution32fInitPtr simdSynetMergedConvolution32fInit = SIMD_FUNC5(SynetMergedConvolution32fInit, SIMD_AVX512F_FUNC, SIMD_AVX2_FUNC, SIMD_AVX_FUNC, SIMD_SSE2_FUNC, SIMD_NEON_FUNC); - - return simdSynetMergedConvolution32fInit(batch, convs, count, add); -} - -SIMD_API size_t 
SimdSynetMergedConvolution32fExternalBufferSize(const void * context) -{ - return ((SynetMergedConvolution32f*)context)->ExternalBufferSize(); -} - -SIMD_API size_t SimdSynetMergedConvolution32fInternalBufferSize(const void * context) -{ - return ((SynetMergedConvolution32f*)context)->InternalBufferSize(); -} - -SIMD_API void SimdSynetMergedConvolution32fSetParams(void * context, const float * const * weight, SimdBool * internal, const float * const * bias, const float * const * params) -{ - ((SynetMergedConvolution32f*)context)->SetParams(weight, internal, bias, params); -} - -SIMD_API void SimdSynetMergedConvolution32fForward(void * context, const float * src, float * buf, float * dst) -{ - SynetMergedConvolution32f * c = (SynetMergedConvolution32f*)context; - SIMD_PERF_EXT(c); - c->Forward(src, buf, dst); -} - -SIMD_API void SimdSynetPoolingForwardAverage(const float* src, size_t srcC, size_t srcH, size_t srcW, size_t kernelY, size_t kernelX, - size_t strideY, size_t strideX, size_t padY, size_t padX, float* dst, size_t dstH, size_t dstW, SimdBool excludePad, SimdTensorFormatType format) -{ - typedef void(*SimdSynetPoolingForwardAveragePtr) (const float* src, size_t srcC, size_t srcH, size_t srcW, size_t kernelY, size_t kernelX, - size_t strideY, size_t strideX, size_t padY, size_t padX, float* dst, size_t dstH, size_t dstW, SimdBool excludePad, SimdTensorFormatType format); - const static SimdSynetPoolingForwardAveragePtr simdSynetPoolingForwardAverage = SIMD_FUNC4(SynetPoolingForwardAverage, SIMD_AVX512F_FUNC, SIMD_AVX_FUNC, SIMD_SSE_FUNC, SIMD_NEON_FUNC); - - simdSynetPoolingForwardAverage(src, srcC, srcH, srcW, kernelY, kernelX, strideY, strideX, padY, padX, dst, dstH, dstW, excludePad, format); -} - -SIMD_API void SimdSynetPoolingForwardMax32f(const float * src, size_t srcC, size_t srcH, size_t srcW, size_t kernelY, size_t kernelX, - size_t strideY, size_t strideX, size_t padY, size_t padX, float * dst, size_t dstH, size_t dstW, SimdTensorFormatType format) -{ - typedef void(*SimdSynetPoolingForwardMax32fPtr) (const float * src, size_t srcC, size_t srcH, size_t srcW, size_t kernelY, size_t kernelX, - size_t strideY, size_t strideX, size_t padY, size_t padX, float * dst, size_t dstH, size_t dstW, SimdTensorFormatType format); - const static SimdSynetPoolingForwardMax32fPtr simdSynetPoolingForwardMax32f = SIMD_FUNC5(SynetPoolingForwardMax32f, SIMD_AVX512F_FUNC, SIMD_AVX2_FUNC, SIMD_AVX_FUNC, SIMD_SSE_FUNC, SIMD_NEON_FUNC); - - simdSynetPoolingForwardMax32f(src, srcC, srcH, srcW, kernelY, kernelX, strideY, strideX, padY, padX, dst, dstH, dstW, format); -} - -SIMD_API void SimdSynetPoolingForwardMax8u(const uint8_t* src, size_t srcC, size_t srcH, size_t srcW, size_t kernelY, size_t kernelX, - size_t strideY, size_t strideX, size_t padY, size_t padX, uint8_t* dst, size_t dstH, size_t dstW, SimdTensorFormatType format) -{ - typedef void(*SimdSynetPoolingForwardMax8uPtr) (const uint8_t* src, size_t srcC, size_t srcH, size_t srcW, size_t kernelY, size_t kernelX, - size_t strideY, size_t strideX, size_t padY, size_t padX, uint8_t* dst, size_t dstH, size_t dstW, SimdTensorFormatType format); - const static SimdSynetPoolingForwardMax8uPtr simdSynetPoolingForwardMax8u = SIMD_FUNC4(SynetPoolingForwardMax8u, SIMD_AVX512BW_FUNC, SIMD_AVX2_FUNC, SIMD_SSE41_FUNC, SIMD_NEON_FUNC); - - simdSynetPoolingForwardMax8u(src, srcC, srcH, srcW, kernelY, kernelX, strideY, strideX, padY, padX, dst, dstH, dstW, format); -} - -SIMD_API void SimdSynetPreluLayerForward(const float * src, const float * slope, size_t 
channels, size_t spatial, float * dst, SimdTensorFormatType format) -{ - typedef void(*SimdSynetPreluLayerForwardPtr) (const float * src, const float * slope, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format); - const static SimdSynetPreluLayerForwardPtr simdSynetPreluLayerForward = SIMD_FUNC4(SynetPreluLayerForward, SIMD_AVX512F_FUNC, SIMD_AVX_FUNC, SIMD_SSE_FUNC, SIMD_NEON_FUNC); - - simdSynetPreluLayerForward(src, slope, channels, spatial, dst, format); -} - -SIMD_API void SimdSynetRelu32f(const float* src, size_t size, const float* slope, float* dst) -{ - typedef void(*SimdSynetRelu32fPtr) (const float* src, size_t size, const float* slope, float* dst); - const static SimdSynetRelu32fPtr simdSynetRelu32f = SIMD_FUNC4(SynetRelu32f, SIMD_AVX512F_FUNC, SIMD_AVX_FUNC, SIMD_SSE_FUNC, SIMD_NEON_FUNC); - - simdSynetRelu32f(src, size, slope, dst); -} - -SIMD_API void SimdSynetReorderImage(size_t batch, size_t channels, size_t spatial, const float* src, SimdTensorFormatType srcFormat, float* dst, SimdTensorFormatType dstFormat) -{ - typedef void(*SimdSynetReorderImagePtr) (size_t batch, size_t channels, size_t spatial, const float* src, SimdTensorFormatType srcFormat, float* dst, SimdTensorFormatType dstFormat); - const static SimdSynetReorderImagePtr simdSynetReorderImage = SIMD_FUNC4(SynetReorderImage, SIMD_AVX512F_FUNC, SIMD_AVX_FUNC, SIMD_SSE_FUNC, SIMD_NEON_FUNC); - - simdSynetReorderImage(batch, channels, spatial, src, srcFormat, dst, dstFormat); -} - -SIMD_API void SimdSynetReorderFilter(size_t output, size_t input, size_t kernel, const float* src, SimdTensorFormatType srcFormat, float* dst, SimdTensorFormatType dstFormat) -{ - typedef void(*SimdSynetReorderFilterPtr) (size_t output, size_t input, size_t kernel, const float* src, SimdTensorFormatType srcFormat, float* dst, SimdTensorFormatType dstFormat); - const static SimdSynetReorderFilterPtr simdSynetReorderFilter = SIMD_FUNC4(SynetReorderFilter, SIMD_AVX512F_FUNC, SIMD_AVX_FUNC, SIMD_SSE_FUNC, SIMD_NEON_FUNC); - - simdSynetReorderFilter(output, input, kernel, src, srcFormat, dst, dstFormat); -} - -SIMD_API void SimdSynetRestrictRange32f(const float * src, size_t size, const float * lower, const float * upper, float * dst) -{ - typedef void(*SimdSynetRestrictRange32fPtr) (const float * src, size_t size, const float * lower, const float * upper, float * dst); - const static SimdSynetRestrictRange32fPtr simdSynetRestrictRange32f = SIMD_FUNC4(SynetRestrictRange32f, SIMD_AVX512F_FUNC, SIMD_AVX_FUNC, SIMD_SSE_FUNC, SIMD_NEON_FUNC); - - simdSynetRestrictRange32f(src, size, lower, upper, dst); -} - -SIMD_API void SimdSynetScaleLayerForward(const float* src, const float* scale, const float* bias, size_t channels, size_t height, size_t width, float* dst, SimdTensorFormatType format, SimdSynetCompatibilityType compatibility) -{ - typedef void(*SimdSynetScaleLayerForwardPtr) (const float* src, const float* scale, const float* bias, size_t channels, size_t height, size_t width, float* dst, SimdTensorFormatType format, SimdSynetCompatibilityType compatibility); - const static SimdSynetScaleLayerForwardPtr simdSynetScaleLayerForward = SIMD_FUNC5(SynetScaleLayerForward, SIMD_AVX512F_FUNC, SIMD_AVX2_FUNC, SIMD_AVX_FUNC, SIMD_SSE_FUNC, SIMD_NEON_FUNC); - - simdSynetScaleLayerForward(src, scale, bias, channels, height, width, dst, format, compatibility); -} - -SIMD_API void SimdSynetSetInput(const uint8_t * src, size_t width, size_t height, size_t stride, SimdPixelFormatType srcFormat, - const float * lower, const float * 
upper, float * dst, size_t channels, SimdTensorFormatType dstFormat) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable && width >= Avx512bw::A) - Avx512bw::SynetSetInput(src, width, height, stride, srcFormat, lower, upper, dst, channels, dstFormat); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if (Avx2::Enable && width >= Avx2::A) - Avx2::SynetSetInput(src, width, height, stride, srcFormat, lower, upper, dst, channels, dstFormat); - else -#endif -#ifdef SIMD_SSE41_ENABLE - if (Sse41::Enable && width >= Sse41::A) - Sse41::SynetSetInput(src, width, height, stride, srcFormat, lower, upper, dst, channels, dstFormat); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::SynetSetInput(src, width, height, stride, srcFormat, lower, upper, dst, channels, dstFormat); - else -#endif - Base::SynetSetInput(src, width, height, stride, srcFormat, lower, upper, dst, channels, dstFormat); -} - -SIMD_API void SimdSynetShuffleLayerForward(const float* src0, const float* src1, size_t channels0, size_t channels1, size_t spatial, float* dst0, float* dst1, SimdTensorFormatType format, int type) -{ - typedef void(*SimdSynetShuffleLayerForwardPtr) (const float* src0, const float* src1, size_t channels0, size_t channels1, size_t spatial, float* dst0, float* dst1, SimdTensorFormatType format, int type); - const static SimdSynetShuffleLayerForwardPtr simdSynetShuffleLayerForward = SIMD_FUNC4(SynetShuffleLayerForward, SIMD_AVX512F_FUNC, SIMD_AVX_FUNC, SIMD_SSE_FUNC, SIMD_NEON_FUNC); - - simdSynetShuffleLayerForward(src0, src1, channels0, channels1, spatial, dst0, dst1, format, type); -} - -SIMD_API void SimdSynetSigmoid32f(const float* src, size_t size, const float* slope, float* dst) -{ - typedef void(*SimdSynetSigmoid32fPtr) (const float* src, size_t size, const float* slope, float* dst); - const static SimdSynetSigmoid32fPtr simdSynetSigmoid32f = SIMD_FUNC4(SynetSigmoid32f, SIMD_AVX512F_FUNC, SIMD_AVX2_FUNC, SIMD_SSE2_FUNC, SIMD_NEON_FUNC); - - simdSynetSigmoid32f(src, size, slope, dst); -} - -SIMD_API void SimdSynetSoftmaxLayerForward(const float * src, size_t outer, size_t count, size_t inner, float * dst) -{ - typedef void(*SimdSynetSoftmaxLayerForwardPtr) (const float * src, size_t outer, size_t count, size_t inner, float * dst); - const static SimdSynetSoftmaxLayerForwardPtr simdSynetSoftmaxLayerForward = SIMD_FUNC4(SynetSoftmaxLayerForward, SIMD_AVX512F_FUNC, SIMD_AVX2_FUNC, SIMD_SSE2_FUNC, SIMD_NEON_FUNC); - - simdSynetSoftmaxLayerForward(src, outer, count, inner, dst); -} - -SIMD_API void SimdSynetSoftplus32f(const float* src, size_t size, const float* beta, const float* threshold, float* dst) -{ - typedef void(*SimdSynetSoftplus32fPtr) (const float* src, size_t size, const float* beta, const float* threshold, float* dst); - const static SimdSynetSoftplus32fPtr simdSynetSoftplus32f = SIMD_FUNC4(SynetSoftplus32f, SIMD_AVX512F_FUNC, SIMD_AVX2_FUNC, SIMD_SSE2_FUNC, SIMD_NEON_FUNC); - - simdSynetSoftplus32f(src, size, beta, threshold, dst); -} - -SIMD_API SimdTensorFormatType SimdSynetSpecifyTensorFormat(SimdTensorFormatType format) -{ - return Base::SynetSpecifyTensorFormat(format); -} - -SIMD_API void SimdSynetTanh32f(const float* src, size_t size, const float* slope, float* dst) -{ - typedef void(*SimdSynetTanh32fPtr) (const float* src, size_t size, const float* slope, float* dst); - const static SimdSynetTanh32fPtr simdSynetTanh32f = SIMD_FUNC4(SynetTanh32f, SIMD_AVX512F_FUNC, SIMD_AVX2_FUNC, SIMD_SSE2_FUNC, SIMD_NEON_FUNC); - - simdSynetTanh32f(src, size, slope, dst); 
-} - - -SIMD_API size_t SimdSynetTensorAlignment(SimdTensorFormatType format) -{ - return Base::SynetTensorAlignment(format); -} - -SIMD_API void SimdSynetUnaryOperation32fLayerForward(const float* src, size_t size, SimdSynetUnaryOperation32fType type, float* dst) -{ - typedef void(*SimdSynetUnaryOperation32fLayerForwardPtr) (const float* src, size_t size, SimdSynetUnaryOperation32fType type, float* dst); - const static SimdSynetUnaryOperation32fLayerForwardPtr simdSynetUnaryOperation32fLayerForward = SIMD_FUNC4(SynetUnaryOperation32fLayerForward, SIMD_AVX512F_FUNC, SIMD_AVX2_FUNC, SIMD_SSE2_FUNC, SIMD_NEON_FUNC); - - simdSynetUnaryOperation32fLayerForward(src, size, type, dst); -} - -SIMD_API void SimdTextureBoostedSaturatedGradient(const uint8_t * src, size_t srcStride, size_t width, size_t height, - uint8_t saturation, uint8_t boost, uint8_t * dx, size_t dxStride, uint8_t * dy, size_t dyStride) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::TextureBoostedSaturatedGradient(src, srcStride, width, height, saturation, boost, dx, dxStride, dy, dyStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width >= Avx2::A) - Avx2::TextureBoostedSaturatedGradient(src, srcStride, width, height, saturation, boost, dx, dxStride, dy, dyStride); - else -#endif -#ifdef SIMD_SSSE3_ENABLE - if(Ssse3::Enable && width >= Ssse3::A) - Ssse3::TextureBoostedSaturatedGradient(src, srcStride, width, height, saturation, boost, dx, dxStride, dy, dyStride); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && width >= Sse2::A) - Sse2::TextureBoostedSaturatedGradient(src, srcStride, width, height, saturation, boost, dx, dxStride, dy, dyStride); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width >= Vmx::A) - Vmx::TextureBoostedSaturatedGradient(src, srcStride, width, height, saturation, boost, dx, dxStride, dy, dyStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::TextureBoostedSaturatedGradient(src, srcStride, width, height, saturation, boost, dx, dxStride, dy, dyStride); - else -#endif - Base::TextureBoostedSaturatedGradient(src, srcStride, width, height, saturation, boost, dx, dxStride, dy, dyStride); -} - -SIMD_API void SimdTextureBoostedUv(const uint8_t * src, size_t srcStride, size_t width, size_t height, - uint8_t boost, uint8_t * dst, size_t dstStride) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::TextureBoostedUv(src, srcStride, width, height, boost, dst, dstStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width >= Avx2::A) - Avx2::TextureBoostedUv(src, srcStride, width, height, boost, dst, dstStride); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && width >= Sse2::A) - Sse2::TextureBoostedUv(src, srcStride, width, height, boost, dst, dstStride); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width >= Vmx::A) - Vmx::TextureBoostedUv(src, srcStride, width, height, boost, dst, dstStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::TextureBoostedUv(src, srcStride, width, height, boost, dst, dstStride); - else -#endif - Base::TextureBoostedUv(src, srcStride, width, height, boost, dst, dstStride); -} - -SIMD_API void SimdTextureGetDifferenceSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * lo, size_t loStride, const uint8_t * hi, size_t hiStride, int64_t * sum) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::TextureGetDifferenceSum(src, 
srcStride, width, height, lo, loStride, hi, hiStride, sum); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width >= Avx2::A) - Avx2::TextureGetDifferenceSum(src, srcStride, width, height, lo, loStride, hi, hiStride, sum); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && width >= Sse2::A) - Sse2::TextureGetDifferenceSum(src, srcStride, width, height, lo, loStride, hi, hiStride, sum); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width >= Vmx::A) - Vmx::TextureGetDifferenceSum(src, srcStride, width, height, lo, loStride, hi, hiStride, sum); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::TextureGetDifferenceSum(src, srcStride, width, height, lo, loStride, hi, hiStride, sum); - else -#endif - Base::TextureGetDifferenceSum(src, srcStride, width, height, lo, loStride, hi, hiStride, sum); -} - -SIMD_API void SimdTexturePerformCompensation(const uint8_t * src, size_t srcStride, size_t width, size_t height, - int32_t shift, uint8_t * dst, size_t dstStride) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::TexturePerformCompensation(src, srcStride, width, height, shift, dst, dstStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width >= Avx2::A) - Avx2::TexturePerformCompensation(src, srcStride, width, height, shift, dst, dstStride); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && width >= Sse2::A) - Sse2::TexturePerformCompensation(src, srcStride, width, height, shift, dst, dstStride); - else -#endif -#ifdef SIMD_VMX_ENABLE - if (Vmx::Enable && width >= Vmx::A) - Vmx::TexturePerformCompensation(src, srcStride, width, height, shift, dst, dstStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::TexturePerformCompensation(src, srcStride, width, height, shift, dst, dstStride); - else -#endif - Base::TexturePerformCompensation(src, srcStride, width, height, shift, dst, dstStride); -} - -SIMD_API void SimdTransformImage(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t pixelSize, SimdTransformType transform, uint8_t * dst, size_t dstStride) -{ -#ifdef SIMD_SSSE3_ENABLE - if (Ssse3::Enable && width >= Ssse3::A) - Ssse3::TransformImage(src, srcStride, width, height, pixelSize, transform, dst, dstStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::HA) - Neon::TransformImage(src, srcStride, width, height, pixelSize, transform, dst, dstStride); - else -#endif - Base::TransformImage(src, srcStride, width, height, pixelSize, transform, dst, dstStride); -} - -typedef void(*SimdWinogradSetFilterPtr) (const float * src, size_t size, float * dst, SimdBool trans); -typedef void(*SimdWinogradSetInputPtr) (const float* src, size_t srcChannels, size_t srcHeight, size_t srcWidth, size_t padY, size_t padX, size_t padH, size_t padW, float* dst, size_t dstStride, SimdBool trans); -typedef void(*SimdWinogradSetOutputPtr) (const float * src, size_t srcStride, float * dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans); - -SIMD_API void SimdWinogradKernel1x3Block1x4SetFilter(const float* src, size_t size, float* dst, SimdBool trans) -{ - const static SimdWinogradSetFilterPtr simdWinogradKernel1x3Block1x4SetFilter = SIMD_FUNC4(WinogradKernel1x3Block1x4SetFilter, SIMD_AVX512F_FUNC, SIMD_AVX_FUNC, SIMD_SSE_FUNC, SIMD_NEON_FUNC); - - simdWinogradKernel1x3Block1x4SetFilter(src, size, dst, trans); -} - -SIMD_API void SimdWinogradKernel1x3Block1x4SetInput(const float* src, size_t 
srcChannels, size_t srcHeight, size_t srcWidth, - size_t padY, size_t padX, size_t padH, size_t padW, float* dst, size_t dstStride, SimdBool trans) -{ - const static SimdWinogradSetInputPtr simdWinogradKernel1x3Block1x4SetInput = SIMD_FUNC4(WinogradKernel1x3Block1x4SetInput, SIMD_AVX512F_FUNC, SIMD_AVX_FUNC, SIMD_SSE_FUNC, SIMD_NEON_FUNC); - - simdWinogradKernel1x3Block1x4SetInput(src, srcChannels, srcHeight, srcWidth, padY, padX, padH, padW, dst, dstStride, trans); -} - -SIMD_API void SimdWinogradKernel1x3Block1x4SetOutput(const float* src, size_t srcStride, float* dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans) -{ - const static SimdWinogradSetOutputPtr simdWinogradKernel1x3Block1x4SetOutput = SIMD_FUNC4(WinogradKernel1x3Block1x4SetOutput, SIMD_AVX512F_FUNC, SIMD_AVX_FUNC, SIMD_SSE_FUNC, SIMD_NEON_FUNC); - - simdWinogradKernel1x3Block1x4SetOutput(src, srcStride, dst, dstChannels, dstHeight, dstWidth, trans); -} - -SIMD_API void SimdWinogradKernel1x5Block1x4SetFilter(const float* src, size_t size, float* dst, SimdBool trans) -{ - const static SimdWinogradSetFilterPtr simdWinogradKernel1x5Block1x4SetFilter = SIMD_FUNC4(WinogradKernel1x5Block1x4SetFilter, SIMD_AVX512F_FUNC, SIMD_AVX_FUNC, SIMD_SSE_FUNC, SIMD_NEON_FUNC); - - simdWinogradKernel1x5Block1x4SetFilter(src, size, dst, trans); -} - -SIMD_API void SimdWinogradKernel1x5Block1x4SetInput(const float* src, size_t srcChannels, size_t srcHeight, size_t srcWidth, - size_t padY, size_t padX, size_t padH, size_t padW, float* dst, size_t dstStride, SimdBool trans) -{ - const static SimdWinogradSetInputPtr simdWinogradKernel1x5Block1x4SetInput = SIMD_FUNC4(WinogradKernel1x5Block1x4SetInput, SIMD_AVX512F_FUNC, SIMD_AVX_FUNC, SIMD_SSE_FUNC, SIMD_NEON_FUNC); - - simdWinogradKernel1x5Block1x4SetInput(src, srcChannels, srcHeight, srcWidth, padY, padX, padH, padW, dst, dstStride, trans); -} - -SIMD_API void SimdWinogradKernel1x5Block1x4SetOutput(const float* src, size_t srcStride, float* dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans) -{ - const static SimdWinogradSetOutputPtr simdWinogradKernel1x5Block1x4SetOutput = SIMD_FUNC4(WinogradKernel1x5Block1x4SetOutput, SIMD_AVX512F_FUNC, SIMD_AVX_FUNC, SIMD_SSE_FUNC, SIMD_NEON_FUNC); - - simdWinogradKernel1x5Block1x4SetOutput(src, srcStride, dst, dstChannels, dstHeight, dstWidth, trans); -} - -SIMD_API void SimdWinogradKernel2x2Block2x2SetFilter(const float* src, size_t size, float* dst, SimdBool trans) -{ - const static SimdWinogradSetFilterPtr simdWinogradKernel2x2Block2x2SetFilter = SIMD_FUNC4(WinogradKernel2x2Block2x2SetFilter, SIMD_AVX512F_FUNC, SIMD_AVX_FUNC, SIMD_SSE_FUNC, SIMD_NEON_FUNC); - - simdWinogradKernel2x2Block2x2SetFilter(src, size, dst, trans); -} - -SIMD_API void SimdWinogradKernel2x2Block2x2SetInput(const float* src, size_t srcChannels, size_t srcHeight, size_t srcWidth, - size_t padY, size_t padX, size_t padH, size_t padW, float* dst, size_t dstStride, SimdBool trans) -{ - const static SimdWinogradSetInputPtr simdWinogradKernel2x2Block2x2SetInput = SIMD_FUNC4(WinogradKernel2x2Block2x2SetInput, SIMD_AVX512F_FUNC, SIMD_AVX_FUNC, SIMD_SSE_FUNC, SIMD_NEON_FUNC); - - simdWinogradKernel2x2Block2x2SetInput(src, srcChannels, srcHeight, srcWidth, padY, padX, padH, padW, dst, dstStride, trans); -} - -SIMD_API void SimdWinogradKernel2x2Block2x2SetOutput(const float* src, size_t srcStride, float* dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans) -{ - const static SimdWinogradSetOutputPtr 
simdWinogradKernel2x2Block2x2SetOutput = SIMD_FUNC4(WinogradKernel2x2Block2x2SetOutput, SIMD_AVX512F_FUNC, SIMD_AVX_FUNC, SIMD_SSE_FUNC, SIMD_NEON_FUNC); - - simdWinogradKernel2x2Block2x2SetOutput(src, srcStride, dst, dstChannels, dstHeight, dstWidth, trans); -} - -SIMD_API void SimdWinogradKernel2x2Block4x4SetFilter(const float* src, size_t size, float* dst, SimdBool trans) -{ - const static SimdWinogradSetFilterPtr simdWinogradKernel2x2Block4x4SetFilter = SIMD_FUNC4(WinogradKernel2x2Block4x4SetFilter, SIMD_AVX512F_FUNC, SIMD_AVX_FUNC, SIMD_SSE_FUNC, SIMD_NEON_FUNC); - - simdWinogradKernel2x2Block4x4SetFilter(src, size, dst, trans); -} - -SIMD_API void SimdWinogradKernel2x2Block4x4SetInput(const float* src, size_t srcChannels, size_t srcHeight, size_t srcWidth, - size_t padY, size_t padX, size_t padH, size_t padW, float* dst, size_t dstStride, SimdBool trans) -{ - const static SimdWinogradSetInputPtr simdWinogradKernel2x2Block4x4SetInput = SIMD_FUNC4(WinogradKernel2x2Block4x4SetInput, SIMD_AVX512F_FUNC, SIMD_AVX_FUNC, SIMD_SSE_FUNC, SIMD_NEON_FUNC); - - simdWinogradKernel2x2Block4x4SetInput(src, srcChannels, srcHeight, srcWidth, padY, padX, padH, padW, dst, dstStride, trans); -} - -SIMD_API void SimdWinogradKernel2x2Block4x4SetOutput(const float* src, size_t srcStride, float* dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans) -{ - const static SimdWinogradSetOutputPtr simdWinogradKernel2x2Block4x4SetOutput = SIMD_FUNC4(WinogradKernel2x2Block4x4SetOutput, SIMD_AVX512F_FUNC, SIMD_AVX_FUNC, SIMD_SSE_FUNC, SIMD_NEON_FUNC); - - simdWinogradKernel2x2Block4x4SetOutput(src, srcStride, dst, dstChannels, dstHeight, dstWidth, trans); -} - -SIMD_API void SimdWinogradKernel3x3Block2x2SetFilter(const float * src, size_t size, float * dst, SimdBool trans) -{ - const static SimdWinogradSetFilterPtr simdWinogradKernel3x3Block2x2SetFilter = SIMD_FUNC4(WinogradKernel3x3Block2x2SetFilter, SIMD_AVX512F_FUNC, SIMD_AVX_FUNC, SIMD_SSE_FUNC, SIMD_NEON_FUNC); - - simdWinogradKernel3x3Block2x2SetFilter(src, size, dst, trans); -} - -SIMD_API void SimdWinogradKernel3x3Block2x2SetInput(const float* src, size_t srcChannels, size_t srcHeight, size_t srcWidth, - size_t padY, size_t padX, size_t padH, size_t padW, float* dst, size_t dstStride, SimdBool trans) -{ - const static SimdWinogradSetInputPtr simdWinogradKernel3x3Block2x2SetInput = SIMD_FUNC4(WinogradKernel3x3Block2x2SetInput, SIMD_AVX512F_FUNC, SIMD_AVX_FUNC, SIMD_SSE_FUNC, SIMD_NEON_FUNC); - - simdWinogradKernel3x3Block2x2SetInput(src, srcChannels, srcHeight, srcWidth, padY, padX, padH, padW, dst, dstStride, trans); -} - -SIMD_API void SimdWinogradKernel3x3Block2x2SetOutput(const float * src, size_t srcStride, float * dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans) -{ - const static SimdWinogradSetOutputPtr simdWinogradKernel3x3Block2x2SetOutput = SIMD_FUNC4(WinogradKernel3x3Block2x2SetOutput, SIMD_AVX512F_FUNC, SIMD_AVX_FUNC, SIMD_SSE_FUNC, SIMD_NEON_FUNC); - - simdWinogradKernel3x3Block2x2SetOutput(src, srcStride, dst, dstChannels, dstHeight, dstWidth, trans); -} - -SIMD_API void SimdWinogradKernel3x3Block3x3SetFilter(const float * src, size_t size, float * dst, SimdBool trans) -{ - const static SimdWinogradSetFilterPtr simdWinogradKernel3x3Block3x3SetFilter = SIMD_FUNC4(WinogradKernel3x3Block3x3SetFilter, SIMD_AVX512F_FUNC, SIMD_AVX_FUNC, SIMD_SSE_FUNC, SIMD_NEON_FUNC); - - simdWinogradKernel3x3Block3x3SetFilter(src, size, dst, trans); -} - -SIMD_API void SimdWinogradKernel3x3Block3x3SetInput(const float* 
src, size_t srcChannels, size_t srcHeight, size_t srcWidth, - size_t padY, size_t padX, size_t padH, size_t padW, float* dst, size_t dstStride, SimdBool trans) -{ - const static SimdWinogradSetInputPtr simdWinogradKernel3x3Block3x3SetInput = SIMD_FUNC4(WinogradKernel3x3Block3x3SetInput, SIMD_AVX512F_FUNC, SIMD_AVX_FUNC, SIMD_SSE_FUNC, SIMD_NEON_FUNC); - - simdWinogradKernel3x3Block3x3SetInput(src, srcChannels, srcHeight, srcWidth, padY, padX, padH, padW, dst, dstStride, trans); -} - -SIMD_API void SimdWinogradKernel3x3Block3x3SetOutput(const float * src, size_t srcStride, float * dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans) -{ - const static SimdWinogradSetOutputPtr simdWinogradKernel3x3Block3x3SetOutput = SIMD_FUNC4(WinogradKernel3x3Block3x3SetOutput, SIMD_AVX512F_FUNC, SIMD_AVX_FUNC, SIMD_SSE_FUNC, SIMD_NEON_FUNC); - - simdWinogradKernel3x3Block3x3SetOutput(src, srcStride, dst, dstChannels, dstHeight, dstWidth, trans); -} - -SIMD_API void SimdWinogradKernel3x3Block4x4SetFilter(const float * src, size_t size, float * dst, SimdBool trans) -{ - const static SimdWinogradSetFilterPtr simdWinogradKernel3x3Block4x4SetFilter = SIMD_FUNC4(WinogradKernel3x3Block4x4SetFilter, SIMD_AVX512F_FUNC, SIMD_AVX_FUNC, SIMD_SSE_FUNC, SIMD_NEON_FUNC); - - simdWinogradKernel3x3Block4x4SetFilter(src, size, dst, trans); -} - -SIMD_API void SimdWinogradKernel3x3Block4x4SetInput(const float* src, size_t srcChannels, size_t srcHeight, size_t srcWidth, - size_t padY, size_t padX, size_t padH, size_t padW, float* dst, size_t dstStride, SimdBool trans) -{ - const static SimdWinogradSetInputPtr simdWinogradKernel3x3Block4x4SetInput = SIMD_FUNC4(WinogradKernel3x3Block4x4SetInput, SIMD_AVX512F_FUNC, SIMD_AVX_FUNC, SIMD_SSE_FUNC, SIMD_NEON_FUNC); - - simdWinogradKernel3x3Block4x4SetInput(src, srcChannels, srcHeight, srcWidth, padY, padX, padH, padW, dst, dstStride, trans); -} - -SIMD_API void SimdWinogradKernel3x3Block4x4SetOutput(const float * src, size_t srcStride, float * dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans) -{ - const static SimdWinogradSetOutputPtr simdWinogradKernel3x3Block4x4SetOutput = SIMD_FUNC4(WinogradKernel3x3Block4x4SetOutput, SIMD_AVX512F_FUNC, SIMD_AVX_FUNC, SIMD_SSE_FUNC, SIMD_NEON_FUNC); - - simdWinogradKernel3x3Block4x4SetOutput(src, srcStride, dst, dstChannels, dstHeight, dstWidth, trans); -} - -SIMD_API void SimdYuva420pToBgra(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - const uint8_t * a, size_t aStride, size_t width, size_t height, uint8_t * bgra, size_t bgraStride) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::Yuva420pToBgra(y, yStride, u, uStride, v, vStride, a, aStride, width, height, bgra, bgraStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if (Avx2::Enable && width >= Avx2::DA) - Avx2::Yuva420pToBgra(y, yStride, u, uStride, v, vStride, a, aStride, width, height, bgra, bgraStride); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if (Sse2::Enable && width >= Sse2::DA) - Sse2::Yuva420pToBgra(y, yStride, u, uStride, v, vStride, a, aStride, width, height, bgra, bgraStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::DA) - Neon::Yuva420pToBgra(y, yStride, u, uStride, v, vStride, a, aStride, width, height, bgra, bgraStride); - else -#endif - Base::Yuva420pToBgra(y, yStride, u, uStride, v, vStride, a, aStride, width, height, bgra, bgraStride); -} - -SIMD_API void SimdYuv420pToBgr(const uint8_t * y, size_t yStride, const 
uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * bgr, size_t bgrStride) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::Yuv420pToBgr(y, yStride, u, uStride, v, vStride, width, height, bgr, bgrStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width >= Avx2::DA) - Avx2::Yuv420pToBgr(y, yStride, u, uStride, v, vStride, width, height, bgr, bgrStride); - else -#endif -#ifdef SIMD_SSSE3_ENABLE - if(Ssse3::Enable && width >= Ssse3::DA) - Ssse3::Yuv420pToBgr(y, yStride, u, uStride, v, vStride, width, height, bgr, bgrStride); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width >= Vmx::DA) - Vmx::Yuv420pToBgr(y, yStride, u, uStride, v, vStride, width, height, bgr, bgrStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::DA) - Neon::Yuv420pToBgr(y, yStride, u, uStride, v, vStride, width, height, bgr, bgrStride); - else -#endif - Base::Yuv420pToBgr(y, yStride, u, uStride, v, vStride, width, height, bgr, bgrStride); -} - -SIMD_API void SimdYuv422pToBgr(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * bgr, size_t bgrStride) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::Yuv422pToBgr(y, yStride, u, uStride, v, vStride, width, height, bgr, bgrStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width >= Avx2::DA) - Avx2::Yuv422pToBgr(y, yStride, u, uStride, v, vStride, width, height, bgr, bgrStride); - else -#endif -#ifdef SIMD_SSSE3_ENABLE - if(Ssse3::Enable && width >= Ssse3::DA) - Ssse3::Yuv422pToBgr(y, yStride, u, uStride, v, vStride, width, height, bgr, bgrStride); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width >= Vmx::DA) - Vmx::Yuv422pToBgr(y, yStride, u, uStride, v, vStride, width, height, bgr, bgrStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::DA) - Neon::Yuv422pToBgr(y, yStride, u, uStride, v, vStride, width, height, bgr, bgrStride); - else -#endif - Base::Yuv422pToBgr(y, yStride, u, uStride, v, vStride, width, height, bgr, bgrStride); -} - -SIMD_API void SimdYuv444pToBgr(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * bgr, size_t bgrStride) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::Yuv444pToBgr(y, yStride, u, uStride, v, vStride, width, height, bgr, bgrStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width >= Avx2::A) - Avx2::Yuv444pToBgr(y, yStride, u, uStride, v, vStride, width, height, bgr, bgrStride); - else -#endif -#ifdef SIMD_SSSE3_ENABLE - if(Ssse3::Enable && width >= Ssse3::A) - Ssse3::Yuv444pToBgr(y, yStride, u, uStride, v, vStride, width, height, bgr, bgrStride); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width >= Vmx::A) - Vmx::Yuv444pToBgr(y, yStride, u, uStride, v, vStride, width, height, bgr, bgrStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::Yuv444pToBgr(y, yStride, u, uStride, v, vStride, width, height, bgr, bgrStride); - else -#endif - Base::Yuv444pToBgr(y, yStride, u, uStride, v, vStride, width, height, bgr, bgrStride); -} - -SIMD_API void SimdYuv420pToBgra(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * bgra, size_t bgraStride, uint8_t alpha) -{ 
-#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::Yuv420pToBgra(y, yStride, u, uStride, v, vStride, width, height, bgra, bgraStride, alpha); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width >= Avx2::DA) - Avx2::Yuv420pToBgra(y, yStride, u, uStride, v, vStride, width, height, bgra, bgraStride, alpha); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && width >= Sse2::DA) - Sse2::Yuv420pToBgra(y, yStride, u, uStride, v, vStride, width, height, bgra, bgraStride, alpha); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width >= Vmx::DA) - Vmx::Yuv420pToBgra(y, yStride, u, uStride, v, vStride, width, height, bgra, bgraStride, alpha); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::DA) - Neon::Yuv420pToBgra(y, yStride, u, uStride, v, vStride, width, height, bgra, bgraStride, alpha); - else -#endif - Base::Yuv420pToBgra(y, yStride, u, uStride, v, vStride, width, height, bgra, bgraStride, alpha); -} - -SIMD_API void SimdYuv422pToBgra(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * bgra, size_t bgraStride, uint8_t alpha) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::Yuv422pToBgra(y, yStride, u, uStride, v, vStride, width, height, bgra, bgraStride, alpha); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width >= Avx2::DA) - Avx2::Yuv422pToBgra(y, yStride, u, uStride, v, vStride, width, height, bgra, bgraStride, alpha); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && width >= Sse2::DA) - Sse2::Yuv422pToBgra(y, yStride, u, uStride, v, vStride, width, height, bgra, bgraStride, alpha); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width >= Vmx::DA) - Vmx::Yuv422pToBgra(y, yStride, u, uStride, v, vStride, width, height, bgra, bgraStride, alpha); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::DA) - Neon::Yuv422pToBgra(y, yStride, u, uStride, v, vStride, width, height, bgra, bgraStride, alpha); - else -#endif - Base::Yuv422pToBgra(y, yStride, u, uStride, v, vStride, width, height, bgra, bgraStride, alpha); -} - -SIMD_API void SimdYuv444pToBgra(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * bgra, size_t bgraStride, uint8_t alpha) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::Yuv444pToBgra(y, yStride, u, uStride, v, vStride, width, height, bgra, bgraStride, alpha); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width >= Avx2::A) - Avx2::Yuv444pToBgra(y, yStride, u, uStride, v, vStride, width, height, bgra, bgraStride, alpha); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && width >= Sse2::A) - Sse2::Yuv444pToBgra(y, yStride, u, uStride, v, vStride, width, height, bgra, bgraStride, alpha); - else -#endif -#ifdef SIMD_VMX_ENABLE - if(Vmx::Enable && width >= Vmx::A) - Vmx::Yuv444pToBgra(y, yStride, u, uStride, v, vStride, width, height, bgra, bgraStride, alpha); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::Yuv444pToBgra(y, yStride, u, uStride, v, vStride, width, height, bgra, bgraStride, alpha); - else -#endif - Base::Yuv444pToBgra(y, yStride, u, uStride, v, vStride, width, height, bgra, bgraStride, alpha); -} - -SIMD_API void SimdYuv444pToHsl(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t 
width, size_t height, uint8_t * hsl, size_t hslStride) -{ - Base::Yuv444pToHsl(y, yStride, u, uStride, v, vStride, width, height, hsl, hslStride); -} - -SIMD_API void SimdYuv444pToHsv(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * hsv, size_t hsvStride) -{ - Base::Yuv444pToHsv(y, yStride, u, uStride, v, vStride, width, height, hsv, hsvStride); -} - -SIMD_API void SimdYuv420pToHue(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * hue, size_t hueStride) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::Yuv420pToHue(y, yStride, u, uStride, v, vStride, width, height, hue, hueStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width >= Avx2::DA) - Avx2::Yuv420pToHue(y, yStride, u, uStride, v, vStride, width, height, hue, hueStride); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && width >= Sse2::DA) - Sse2::Yuv420pToHue(y, yStride, u, uStride, v, vStride, width, height, hue, hueStride); - else -#endif -#ifdef SIMD_VSX_ENABLE - if(Vsx::Enable && width >= Vsx::DA) - Vsx::Yuv420pToHue(y, yStride, u, uStride, v, vStride, width, height, hue, hueStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::DA) - Neon::Yuv420pToHue(y, yStride, u, uStride, v, vStride, width, height, hue, hueStride); - else -#endif - Base::Yuv420pToHue(y, yStride, u, uStride, v, vStride, width, height, hue, hueStride); -} - -SIMD_API void SimdYuv444pToHue(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * hue, size_t hueStride) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::Yuv444pToHue(y, yStride, u, uStride, v, vStride, width, height, hue, hueStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width >= Avx2::A) - Avx2::Yuv444pToHue(y, yStride, u, uStride, v, vStride, width, height, hue, hueStride); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && width >= Sse2::A) - Sse2::Yuv444pToHue(y, yStride, u, uStride, v, vStride, width, height, hue, hueStride); - else -#endif -#ifdef SIMD_VSX_ENABLE - if(Vsx::Enable && width >= Vsx::A) - Vsx::Yuv444pToHue(y, yStride, u, uStride, v, vStride, width, height, hue, hueStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::Yuv444pToHue(y, yStride, u, uStride, v, vStride, width, height, hue, hueStride); - else -#endif - Base::Yuv444pToHue(y, yStride, u, uStride, v, vStride, width, height, hue, hueStride); -} - -SIMD_API void SimdYuv420pToRgb(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride, - size_t width, size_t height, uint8_t* rgb, size_t rgbStride) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::Yuv420pToRgb(y, yStride, u, uStride, v, vStride, width, height, rgb, rgbStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if (Avx2::Enable && width >= Avx2::DA) - Avx2::Yuv420pToRgb(y, yStride, u, uStride, v, vStride, width, height, rgb, rgbStride); - else -#endif -#ifdef SIMD_SSSE3_ENABLE - if (Ssse3::Enable && width >= Ssse3::DA) - Ssse3::Yuv420pToRgb(y, yStride, u, uStride, v, vStride, width, height, rgb, rgbStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::DA) - Neon::Yuv420pToRgb(y, yStride, u, uStride, v, vStride, width, height, 
rgb, rgbStride); - else -#endif - Base::Yuv420pToRgb(y, yStride, u, uStride, v, vStride, width, height, rgb, rgbStride); -} - -SIMD_API void SimdYuv422pToRgb(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride, - size_t width, size_t height, uint8_t* rgb, size_t rgbStride) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::Yuv422pToRgb(y, yStride, u, uStride, v, vStride, width, height, rgb, rgbStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if (Avx2::Enable && width >= Avx2::DA) - Avx2::Yuv422pToRgb(y, yStride, u, uStride, v, vStride, width, height, rgb, rgbStride); - else -#endif -#ifdef SIMD_SSSE3_ENABLE - if (Ssse3::Enable && width >= Ssse3::DA) - Ssse3::Yuv422pToRgb(y, yStride, u, uStride, v, vStride, width, height, rgb, rgbStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::DA) - Neon::Yuv422pToRgb(y, yStride, u, uStride, v, vStride, width, height, rgb, rgbStride); - else -#endif - Base::Yuv422pToRgb(y, yStride, u, uStride, v, vStride, width, height, rgb, rgbStride); -} - -SIMD_API void SimdYuv444pToRgb(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride, - size_t width, size_t height, uint8_t* rgb, size_t rgbStride) -{ -#ifdef SIMD_AVX512BW_ENABLE - if (Avx512bw::Enable) - Avx512bw::Yuv444pToRgb(y, yStride, u, uStride, v, vStride, width, height, rgb, rgbStride); - else -#endif -#ifdef SIMD_AVX2_ENABLE - if (Avx2::Enable && width >= Avx2::A) - Avx2::Yuv444pToRgb(y, yStride, u, uStride, v, vStride, width, height, rgb, rgbStride); - else -#endif -#ifdef SIMD_SSSE3_ENABLE - if (Ssse3::Enable && width >= Ssse3::A) - Ssse3::Yuv444pToRgb(y, yStride, u, uStride, v, vStride, width, height, rgb, rgbStride); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::Yuv444pToRgb(y, yStride, u, uStride, v, vStride, width, height, rgb, rgbStride); - else -#endif - Base::Yuv444pToRgb(y, yStride, u, uStride, v, vStride, width, height, rgb, rgbStride); -} - - - diff --git a/src/3rd/Simd/Simd/SimdLib.h b/src/3rd/Simd/Simd/SimdLib.h deleted file mode 100644 index 8d6e269e..00000000 --- a/src/3rd/Simd/Simd/SimdLib.h +++ /dev/null @@ -1,7485 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar, -* 2014-2019 Antonenka Mikhail, -* 2019-2019 Facundo Galan. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ - -#ifndef __SimdLib_h__ -#define __SimdLib_h__ - -#include "Simd/SimdConfig.h" - -#include <stddef.h> - -#if defined(_MSC_VER) || defined(__CODEGEARC__) - -#define SIMD_INLINE __forceinline - -#elif defined(__GNUC__) - -#define SIMD_INLINE inline __attribute__ ((always_inline)) - -#else - -#error This platform is unsupported! - -#endif - -#if defined(__GNUC__) || (defined(_MSC_VER) && (_MSC_VER >= 1600)) || (defined(__CODEGEARC__) && (__CODEGEARC__ >= 1840)) -#include <stdint.h> -#else -# if (_MSC_VER < 1300) -typedef signed char int8_t; -typedef signed short int16_t; -typedef signed int int32_t; -typedef unsigned char uint8_t; -typedef unsigned short uint16_t; -typedef unsigned int uint32_t; -# else -typedef signed __int8 int8_t; -typedef signed __int16 int16_t; -typedef signed __int32 int32_t; -typedef unsigned __int8 uint8_t; -typedef unsigned __int16 uint16_t; -typedef unsigned __int32 uint32_t; -# endif -typedef signed __int64 int64_t; -typedef unsigned __int64 uint64_t; -#endif - -/*! @ingroup c_types - Describes boolean type. -*/ -typedef enum -{ - SimdFalse = 0, /*!< False value. */ - SimdTrue = 1, /*!< True value. */ -} SimdBool; - -/*! @ingroup c_types - Describes types of compare operation. - Operation compare(a, b) is -*/ -typedef enum -{ - /*! equal to: a == b */ - SimdCompareEqual, - /*! equal to: a != b */ - SimdCompareNotEqual, - /*! equal to: a > b */ - SimdCompareGreater, - /*! equal to: a >= b */ - SimdCompareGreaterOrEqual, - /*! equal to: a < b */ - SimdCompareLesser, - /*! equal to: a <= b */ - SimdCompareLesserOrEqual, -} SimdCompareType; - -/*! @ingroup synet - Describes type of activation function. It is used in ::SimdSynetConvolution32fInit, ::SimdSynetConvolution8iInit, ::SimdSynetDeconvolution32fInit and ::SimdSynetMergedConvolution32fInit. -*/ -typedef enum -{ - /*! - Identity (activation function is absent). - */ - SimdConvolutionActivationIdentity = 0, - /*! - ReLU activation function. - \verbatim - dst[i] = Max(0, src[i]); - \endverbatim - */ - SimdConvolutionActivationRelu, - /*! - Leaky ReLU activation function. - It has one parameter: slope (params[0]). - \verbatim - dst[i] = src[i] > 0 ? src[i] : slope*src[i]; - \endverbatim - */ - SimdConvolutionActivationLeakyRelu, - /*! - The activation function restricts range. - It has two parameters: lower (params[0]) and upper (params[1]) bound. - \verbatim - dst[i] = Min(Max(lower, src[i]), upper); - \endverbatim - */ - SimdConvolutionActivationRestrictRange, - /*! - PReLU activation function. - It has m parameters: slopes[m] (m = dstC, n = dstH*dstW). - \verbatim - dst[i*n + j] = src[i*n + j] > 0 ? src[i*n + j] : slopes[i]*src[i*n + j]; - \endverbatim - */ - SimdConvolutionActivationPrelu, - /*! - ELU activation function. - It has one parameter: alpha (params[0]). - \verbatim - dst[i] = src[i] >= 0 ? src[i] : alpha*(Exp(src[i]) - 1); - \endverbatim - */ - SimdConvolutionActivationElu, - /*! - H-Swish (https://arxiv.org/pdf/1905.02244.pdf) activation function. - It has two parameters: shift (params[0]) and scale (params[1]). - \verbatim - dst[i] = Max(Min(src[i], shift) + shift, 0)*scale*src[i]; - \endverbatim - */ - SimdConvolutionActivationHswish, -} SimdConvolutionActivationType;
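The activation formulas above map one-to-one onto scalar code. The following sketch is an editor's illustration of the documented semantics, not library code; the per-channel PReLU case is omitted since it needs a channel index rather than a single parameter:

\verbatim
#include <math.h>

/* Editor's scalar sketch of several activations documented above. */
static float ActivationScalar(SimdConvolutionActivationType type, float x, const float * params)
{
    switch (type)
    {
    case SimdConvolutionActivationIdentity: return x;
    case SimdConvolutionActivationRelu: return x > 0 ? x : 0;
    case SimdConvolutionActivationLeakyRelu: return x > 0 ? x : params[0]*x; /* params[0] = slope */
    case SimdConvolutionActivationRestrictRange: return fminf(fmaxf(params[0], x), params[1]); /* lower, upper */
    case SimdConvolutionActivationElu: return x >= 0 ? x : params[0]*(expf(x) - 1.0f); /* params[0] = alpha */
    case SimdConvolutionActivationHswish: return fmaxf(fminf(x, params[0]) + params[0], 0.0f)*params[1]*x; /* shift, scale */
    default: return x; /* SimdConvolutionActivationPrelu needs a channel index, see above */
    }
}
\endverbatim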
/*! @ingroup c_types - Describes type of information which can be returned by function ::SimdCpuInfo. -*/ -typedef enum -{ - SimdCpuInfoSockets, /*!< A number of sockets. */ - SimdCpuInfoCores, /*!< A number of physical CPU cores. */ - SimdCpuInfoThreads, /*!< A number of logical CPU cores. */ - SimdCpuInfoCacheL1, /*!< A size of level 1 data cache. */ - SimdCpuInfoCacheL2, /*!< A size of level 2 cache. */ - SimdCpuInfoCacheL3, /*!< A size of level 3 cache. */ - SimdCpuInfoSse, /*!< Availability of SSE (x86). */ - SimdCpuInfoSse2, /*!< Availability of SSE2 (x86). */ - SimdCpuInfoSse3, /*!< Availability of SSE3 (x86). */ - SimdCpuInfoSsse3, /*!< Availability of SSSE3 (x86). */ - SimdCpuInfoSse41, /*!< Availability of SSE4.1 (x86). */ - SimdCpuInfoSse42, /*!< Availability of SSE4.2 (x86). */ - SimdCpuInfoAvx, /*!< Availability of AVX (x86). */ - SimdCpuInfoAvx2, /*!< Availability of AVX2 (x86). */ - SimdCpuInfoAvx512f, /*!< Availability of AVX-512F (x86). */ - SimdCpuInfoAvx512bw, /*!< Availability of AVX-512BW (x86). */ - SimdCpuInfoAvx512vnni, /*!< Availability of AVX-512VNNI (x86). */ - SimdCpuInfoVmx, /*!< Availability of VMX or Altivec (PowerPC). */ - SimdCpuInfoVsx, /*!< Availability of VSX (PowerPC). */ - SimdCpuInfoNeon, /*!< Availability of NEON (ARM). */ - SimdCpuInfoMsa, /*!< Availability of MSA (MIPS). */ -} SimdCpuInfoType; - -/*! @ingroup c_types - Describes types and flags to get information about classifier cascade using function ::SimdDetectionInfo. - \note This type is used for implementation of Simd::Detection. -*/ -typedef enum -{ - /*! A HAAR cascade classifier type. */ - SimdDetectionInfoFeatureHaar = 0, - /*! An LBP cascade classifier type. */ - SimdDetectionInfoFeatureLbp, - /*! A mask to select cascade classifier type. */ - SimdDetectionInfoFeatureMask = 3, - /*! A flag which defines existence of tilted features in the HAAR cascade. */ - SimdDetectionInfoHasTilted = 4, - /*! A flag which defines possibility to use 16-bit integers for calculation. */ - SimdDetectionInfoCanInt16 = 8, -} SimdDetectionInfoFlags; - -/*! @ingroup c_types - Describes types of binary operation between two images performed by function ::SimdOperationBinary8u. - Images must have the same format (unsigned 8-bit integer for every channel). -*/ -typedef enum -{ - /*! Computes the average value for every channel of every point of two images. \n Average(a, b) = (a + b + 1)/2. */ - SimdOperationBinary8uAverage, - /*! Computes the bitwise AND between two images. */ - SimdOperationBinary8uAnd, - /*! Computes the bitwise OR between two images. */ - SimdOperationBinary8uOr, - /*! Computes maximal value for every channel of every point of two images. */ - SimdOperationBinary8uMaximum, - /*! Computes minimal value for every channel of every point of two images. */ - SimdOperationBinary8uMinimum, - /*! Subtracts unsigned 8-bit integer b from unsigned 8-bit integer a and saturates (for every channel of every point of the images). */ - SimdOperationBinary8uSaturatedSubtraction, - /*! Adds unsigned 8-bit integer b to unsigned 8-bit integer a and saturates (for every channel of every point of the images). */ - SimdOperationBinary8uSaturatedAddition, -} SimdOperationBinary8uType; - -/*! @ingroup c_types - Describes types of binary operation between two images performed by function ::SimdOperationBinary16i. - Images must have ::SimdPixelFormatInt16 pixel format (signed 16-bit integer for every point). -*/ -typedef enum -{ - /*! Performs addition of two images for every point. */ - SimdOperationBinary16iAddition, - /*! Performs subtraction of two images for every point. */ - SimdOperationBinary16iSubtraction, -} SimdOperationBinary16iType;
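The two saturated operations are the only entries above whose result differs from plain modular arithmetic. A minimal scalar sketch of their documented semantics (editor's illustration, not library code):

\verbatim
#include <stdint.h>

/* Editor's scalar sketch of the saturated operations documented above. */
static uint8_t SaturatedAddition8u(uint8_t a, uint8_t b)
{
    int sum = (int)a + (int)b;
    return (uint8_t)(sum > 255 ? 255 : sum); /* clamp to the 8-bit range */
}

static uint8_t SaturatedSubtraction8u(uint8_t a, uint8_t b)
{
    int dif = (int)a - (int)b;
    return (uint8_t)(dif < 0 ? 0 : dif);
}
\endverbatim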
/*! @ingroup c_types - Describes pixel format types of an image. - In particular this type is used in functions ::SimdBayerToBgr, ::SimdBayerToBgra, ::SimdBgraToBayer and ::SimdBgrToBayer. - \note This type corresponds to C++ type Simd::View::Format. -*/ -typedef enum -{ - /*! An undefined pixel format. */ - SimdPixelFormatNone = 0, - /*! An 8-bit gray pixel format. */ - SimdPixelFormatGray8, - /*! A 16-bit (2 8-bit channels) pixel format (UV plane of NV12 pixel format). */ - SimdPixelFormatUv16, - /*! A 24-bit (3 8-bit channels) BGR (Blue, Green, Red) pixel format. */ - SimdPixelFormatBgr24, - /*! A 32-bit (4 8-bit channels) BGRA (Blue, Green, Red, Alpha) pixel format. */ - SimdPixelFormatBgra32, - /*! A single channel 16-bit integer pixel format. */ - SimdPixelFormatInt16, - /*! A single channel 32-bit integer pixel format. */ - SimdPixelFormatInt32, - /*! A single channel 64-bit integer pixel format. */ - SimdPixelFormatInt64, - /*! A single channel 32-bit floating-point pixel format. */ - SimdPixelFormatFloat, - /*! A single channel 64-bit floating-point pixel format. */ - SimdPixelFormatDouble, - /*! An 8-bit Bayer pixel format (GRBG). */ - SimdPixelFormatBayerGrbg, - /*! An 8-bit Bayer pixel format (GBRG). */ - SimdPixelFormatBayerGbrg, - /*! An 8-bit Bayer pixel format (RGGB). */ - SimdPixelFormatBayerRggb, - /*! An 8-bit Bayer pixel format (BGGR). */ - SimdPixelFormatBayerBggr, - /*! A 24-bit (3 8-bit channels) HSV (Hue, Saturation, Value) pixel format. */ - SimdPixelFormatHsv24, - /*! A 24-bit (3 8-bit channels) HSL (Hue, Saturation, Lightness) pixel format. */ - SimdPixelFormatHsl24, - /*! A 24-bit (3 8-bit channels) RGB (Red, Green, Blue) pixel format. */ - SimdPixelFormatRgb24, -} SimdPixelFormatType; - -/*! @ingroup c_types - Describes type of algorithm used for image reducing (downscale by a factor of 2) (see function Simd::ReduceGray). -*/ -enum SimdReduceType -{ - SimdReduce2x2, /*!< Using of function ::SimdReduceGray2x2 for image reducing. */ - SimdReduce3x3, /*!< Using of function ::SimdReduceGray3x3 for image reducing. */ - SimdReduce4x4, /*!< Using of function ::SimdReduceGray4x4 for image reducing. */ - SimdReduce5x5, /*!< Using of function ::SimdReduceGray5x5 for image reducing. */ -}; - -/*! @ingroup resizing - Describes resized image channel types. -*/ -typedef enum -{ - /*! 8-bit integer channel type. */ - SimdResizeChannelByte, - /*! 32-bit float channel type. */ - SimdResizeChannelFloat, -} SimdResizeChannelType; - -/*! @ingroup resizing - Describes methods used in order to resize image. -*/ -typedef enum -{ - /*! Bilinear method. */ - SimdResizeMethodBilinear, - /*! caffe::interp compatible method. */ - SimdResizeMethodCaffeInterp, - /*! Area method. */ - SimdResizeMethodArea, - /*! InferenceEngine::Extension::Cpu::Interp compatible method. */ - SimdResizeMethodInferenceEngineInterp, -} SimdResizeMethodType; - -/*! @ingroup synet - Describes Synet compatibility flags. This type is used in functions ::SimdSynetScaleLayerForward, ::SimdSynetConvert32fTo8u. -*/ -typedef enum -{ - SimdSynetCompatibilityFast = 0, /*!< Fast (No compatibility for fast code). */ - SimdSynetCompatibilityNoFmaTail = 1, /*!< Do not use FMA instructions at row tail. */ - SimdSynetCompatibilityNoFma = 2, /*!< Do not use FMA instructions. */ - SimdSynetCompatibilityFmaMask = 3, /*!< Bit mask of FMA instruction usage options. */ - SimdSynetCompatibilityOverflow16i = 4, /*!< 16-bit integer overflow. */ -} SimdSynetCompatibilityType; - -/*! @ingroup synet - Describes operation type used in function ::SimdSynetEltwiseLayerForward.
-*/ -typedef enum -{ - SimdSynetEltwiseOperationProduct, /*!< Product. */ - SimdSynetEltwiseOperationSum, /*!< Weighted sum. */ - SimdSynetEltwiseOperationMax, /*!< Maximum. */ - SimdSynetEltwiseOperationMin, /*!< Minimum. */ -} SimdSynetEltwiseOperationType; - -/*! @ingroup synet - Describes operation type used in function ::SimdSynetUnaryOperation32fLayerForward. -*/ -typedef enum -{ - /*! Gets absolute value for every point of input tensor. */ - SimdSynetUnaryOperation32fAbs, - /*! Gets exponent for every point of input tensor. */ - SimdSynetUnaryOperation32fExp, - /*! Gets logarithm for every point of input tensor. */ - SimdSynetUnaryOperation32fLog, - /*! Gets negative for every point of input tensor. */ - SimdSynetUnaryOperation32fNeg, - /*! Gets reciprocal square root for every point of input tensor. */ - SimdSynetUnaryOperation32fRsqrt, - /*! Gets square root for every point of input tensor. */ - SimdSynetUnaryOperation32fSqrt, - /*! Gets hyperbolic tangent for every point of input tensor. */ - SimdSynetUnaryOperation32fTanh, - /*! Gets zero value for every point of input tensor. */ - SimdSynetUnaryOperation32fZero, -} SimdSynetUnaryOperation32fType; - -/*! @ingroup synet - Describes Synet Framework 4D-tensor format type. -*/ -typedef enum -{ - SimdTensorFormatUnknown = -1, /*!< Unknown tensor format. */ - SimdTensorFormatNchw, /*!< NCHW (N - batch, C - channels, H - height, W - width) 4D-tensor format of (input/output) image. */ - SimdTensorFormatNhwc, /*!< NHWC (N - batch, H - height, W - width, C - channels) 4D-tensor format of (input/output) image. */ - SimdTensorFormatNchw4c, /*!< NCHW4c (N - batch, C - (channels + 3) / 4, H - height, W - width, 4c - channels grouped by 4) special 5D-tensor format of (input/output) image optimized for SSE and NEON. */ - SimdTensorFormatNchw8c, /*!< NCHW8c (N - batch, C - (channels + 7) / 8, H - height, W - width, 8c - channels grouped by 8) special 5D-tensor format of (input/output) image optimized for AVX and AVX2. */ - SimdTensorFormatNchw16c, /*!< NCHW16c (N - batch, C - (channels + 15) / 16, H - height, W - width, 16c - channels grouped by 16) special 5D-tensor format of (input/output) image optimized for AVX-512. */ - SimdTensorFormatNchwXc, /*!< Unspecified hardware optimized 5D-tensor format of (input/output) image. Specific format (::SimdTensorFormatNchw4c, ::SimdTensorFormatNchw8c or ::SimdTensorFormatNchw16c) is determined by function ::SimdSynetSpecifyTensorFormat. */ - SimdTensorFormatOiyx, /*!< OIYX (O - output channels, I - input channels, Y - kernel height, X - kernel width) 4D-tensor format of 2D-convolution filter. */ - SimdTensorFormatYxio, /*!< YXIO (Y - kernel height, X - kernel width, I - input channels, O - output channels) 4D-tensor format of 2D-convolution filter. */ - SimdTensorFormatOyxi4o, /*!< OYXI4o (O - (output channels + 3)/4, Y - kernel height, X - kernel width, I - input channels, 4o - output channels grouped by 4) special 5D-tensor format of 2D-convolution filter optimized for SSE and NEON. */ - SimdTensorFormatOyxi8o, /*!< OYXI8o (O - (output channels + 7)/8, Y - kernel height, X - kernel width, I - input channels, 8o - output channels grouped by 8) special 5D-tensor format of 2D-convolution filter optimized for AVX and AVX2. */ - SimdTensorFormatOyxi16o, /*!< OYXI16o (O - (output channels + 15)/16, Y - kernel height, X - kernel width, I - input channels, 16o - output channels grouped by 16) special 5D-tensor format of 2D-convolution filter optimized for AVX-512. */ - SimdTensorFormatOyxiXo, /*!< Unspecified hardware optimized 5D-tensor format of 2D-convolution filter. Specific format (::SimdTensorFormatOyxi4o, ::SimdTensorFormatOyxi8o or ::SimdTensorFormatOyxi16o) is determined by function ::SimdSynetSpecifyTensorFormat. */ -} SimdTensorFormatType;
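The element layouts above differ only in how the offset of element (n, c, h, w) is computed. The helpers below are an editor's sketch derived from the grouping rules stated in the comments; the exact blocked layout used internally should be confirmed against the implementation:

\verbatim
#include <stddef.h>

/* Editor's sketch: offset of element (n, c, h, w) for the formats above. */
static size_t OffsetNchw(size_t n, size_t c, size_t h, size_t w, size_t C, size_t H, size_t W)
{
    return ((n*C + c)*H + h)*W + w;
}

static size_t OffsetNhwc(size_t n, size_t c, size_t h, size_t w, size_t C, size_t H, size_t W)
{
    return ((n*H + h)*W + w)*C + c;
}

/* Blocked NCHW{X}c layout, X = 4, 8 or 16: channels are grouped by X. */
static size_t OffsetNchwXc(size_t n, size_t c, size_t h, size_t w, size_t C, size_t H, size_t W, size_t X)
{
    size_t blocks = (C + X - 1)/X;
    return (((n*blocks + c/X)*H + h)*W + w)*X + c%X;
}
\endverbatim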
/*! @ingroup synet - Describes Synet Framework tensor data type. -*/ -typedef enum -{ - SimdTensorDataUnknown = -1, /*!< Unknown tensor data type. */ - SimdTensorData32f, /*!< 32-bit floating-point. */ - SimdTensorData32i, /*!< 32-bit signed integer. */ - SimdTensorData8i, /*!< 8-bit signed integer. */ - SimdTensorData8u, /*!< 8-bit unsigned integer. */ -} SimdTensorDataType; - -/*! @ingroup transform - Describes transform type used in function ::SimdTransformImage in order to describe result of transformation. -*/ -typedef enum -{ - SimdTransformRotate0 = 0, /*!< An original image. The output image has the same size as input image. */ - SimdTransformRotate90, /*!< Image rotated 90 degrees counterclockwise. The output width and height are equal to the input height and width. */ - SimdTransformRotate180, /*!< Image rotated 180 degrees counterclockwise. The output image has the same size as input image. */ - SimdTransformRotate270, /*!< Image rotated 270 degrees counterclockwise. The output width and height are equal to the input height and width. */ - SimdTransformTransposeRotate0, /*!< Transposed image. The output width and height are equal to the input height and width. */ - SimdTransformTransposeRotate90, /*!< Image transposed and rotated 90 degrees counterclockwise. It is equal to horizontal mirroring of image. The output image has the same size as input image. */ - SimdTransformTransposeRotate180, /*!< Image transposed and rotated 180 degrees counterclockwise. The output width and height are equal to the input height and width. */ - SimdTransformTransposeRotate270, /*!< Image transposed and rotated 270 degrees counterclockwise. It is equal to vertical mirroring of image. The output image has the same size as input image. */ -} SimdTransformType; - -/*! @ingroup synet - \brief Callback function type "SimdGemm32fNNPtr"; - - The function has to perform general matrix multiplication (for 32-bit float numbers). - - \verbatim - C(M, N) = alpha*A(M, K)*B(K, N) + beta*C(M, N); - \endverbatim - - \param [in] M - a height of A and height of C matrices. - \param [in] N - a width of B and width of C matrices. - \param [in] K - a width of A and height of B matrices. - \param [in] alpha - a pointer to multiplier of the first term. - \param [in] A - a pointer to input A matrix. - \param [in] lda - a leading dimension of A matrix. - \param [in] B - a pointer to input B matrix. - \param [in] ldb - a leading dimension of B matrix. - \param [in] beta - a pointer to multiplier of the second term. - \param [out] C - a pointer to output C matrix. - \param [in] ldc - a leading dimension of C matrix. -*/ -typedef void(*SimdGemm32fNNPtr)(size_t M, size_t N, size_t K, const float * alpha, const float * A, size_t lda, const float * B, size_t ldb, const float * beta, float * C, size_t ldc);
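A callback with this contract can be implemented naively as follows. This is an editor's reference sketch of the documented formula, with no blocking or SIMD, intended only to illustrate the expected semantics of a user-supplied GEMM:

\verbatim
/* Editor's naive reference for the SimdGemm32fNNPtr contract:
   C(M, N) = alpha*A(M, K)*B(K, N) + beta*C(M, N). */
static void Gemm32fNNRef(size_t M, size_t N, size_t K, const float * alpha, const float * A, size_t lda,
    const float * B, size_t ldb, const float * beta, float * C, size_t ldc)
{
    for (size_t i = 0; i < M; ++i)
    {
        for (size_t j = 0; j < N; ++j)
        {
            float sum = 0;
            for (size_t k = 0; k < K; ++k)
                sum += A[i*lda + k]*B[k*ldb + j];
            C[i*ldc + j] = alpha[0]*sum + beta[0]*C[i*ldc + j];
        }
    }
}
\endverbatim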
/*! @ingroup synet - Describes convolution (deconvolution) parameters. It is used in ::SimdSynetConvolution32fInit, ::SimdSynetConvolution8iInit, ::SimdSynetDeconvolution32fInit and ::SimdSynetMergedConvolution32fInit. -*/ -typedef struct SimdConvolutionParameters -{ - /*! - A number of input tensor channels. - */ - size_t srcC; - /*! - An input tensor height. - */ - size_t srcH; - /*! - An input tensor width. - */ - size_t srcW; - /*! - An input tensor data type. - */ - SimdTensorDataType srcT; - /*! - An input tensor data format. - */ - SimdTensorFormatType srcF; - /*! - A number of output tensor channels. - */ - size_t dstC; - /*! - An output tensor height. - */ - size_t dstH; - /*! - An output tensor width. - */ - size_t dstW; - /*! - An output tensor data type. - */ - SimdTensorDataType dstT; - /*! - An output tensor data format. - */ - SimdTensorFormatType dstF; - /*! - A convolution (deconvolution) kernel window height. - */ - size_t kernelY; - /*! - A convolution (deconvolution) kernel window width. - */ - size_t kernelX; - /*! - A convolution (deconvolution) dilation along Y-axis. - */ - size_t dilationY; - /*! - A convolution (deconvolution) dilation along X-axis. - */ - size_t dilationX; - /*! - A convolution (deconvolution) stride along Y-axis. - */ - size_t strideY; - /*! - A convolution (deconvolution) stride along X-axis. - */ - size_t strideX; - /*! - An additional zero padding of input image at the beginning of Y-axis. - */ - size_t padY; - /*! - An additional zero padding of input image at the beginning of X-axis. - */ - size_t padX; - /*! - An additional zero padding of input image at the end of Y-axis. - */ - size_t padH; - /*! - An additional zero padding of input image at the end of X-axis. - */ - size_t padW; - /*! - A number of convolution (deconvolution) groups. - */ - size_t group; - /*! - An activation function type used after convolution (deconvolution). - */ - SimdConvolutionActivationType activation; -} SimdConvolutionParameters; - -#if defined(WIN32) && !defined(SIMD_STATIC) -# ifdef SIMD_EXPORTS -# define SIMD_API __declspec(dllexport) -# else//SIMD_EXPORTS -# define SIMD_API __declspec(dllimport) -# endif//SIMD_EXPORTS -#else //WIN32 -# define SIMD_API -#endif//WIN32 - -#ifdef __cplusplus -extern "C" -{ -#endif//__cplusplus - - /*! @ingroup info - - \fn const char * SimdVersion(); - - \short Gets version of %Simd Library. - - \return string with version of %Simd Library (major version number, minor version number, release number, number of SVN's commits). - */ - SIMD_API const char * SimdVersion(); - - /*! @ingroup info - - \fn size_t SimdCpuInfo(SimdCpuInfoType type); - - \short Gets info about CPU and %Simd Library. - - \note See enumeration ::SimdCpuInfoType. - - Using example: - \verbatim - #include "Simd/SimdLib.h" - #include <iostream> - - int main() - { - std::cout << "Sockets : " << SimdCpuInfo(SimdCpuInfoSockets) << std::endl; - std::cout << "Cores : " << SimdCpuInfo(SimdCpuInfoCores) << std::endl; - std::cout << "Threads : " << SimdCpuInfo(SimdCpuInfoThreads) << std::endl; - std::cout << "L1D Cache : " << SimdCpuInfo(SimdCpuInfoCacheL1) / 1024 << " KB" << std::endl; - std::cout << "L2 Cache : " << SimdCpuInfo(SimdCpuInfoCacheL2) / 1024 << " KB" << std::endl; - std::cout << "L3 Cache : " << SimdCpuInfo(SimdCpuInfoCacheL3) / 1024 << " KB" << std::endl; - std::cout << "SSE: " << (SimdCpuInfo(SimdCpuInfoSse) ? "Yes" : "No") << std::endl; - std::cout << "SSE2: " << (SimdCpuInfo(SimdCpuInfoSse2) ? "Yes" : "No") << std::endl; - std::cout << "SSE3: " << (SimdCpuInfo(SimdCpuInfoSse3) ? "Yes" : "No") << std::endl; - std::cout << "SSSE3: " << (SimdCpuInfo(SimdCpuInfoSsse3) ? "Yes" : "No") << std::endl; - std::cout << "SSE4.1: " << (SimdCpuInfo(SimdCpuInfoSse41) ? "Yes" : "No") << std::endl; - std::cout << "SSE4.2: " << (SimdCpuInfo(SimdCpuInfoSse42) ? "Yes" : "No") << std::endl; - std::cout << "AVX: " << (SimdCpuInfo(SimdCpuInfoAvx) ?
"Yes" : "No") << std::endl; - std::cout << "AVX2: " << (SimdCpuInfo(SimdCpuInfoAvx2) ? "Yes" : "No") << std::endl; - std::cout << "AVX-512F: " << (SimdCpuInfo(SimdCpuInfoAvx512f) ? "Yes" : "No") << std::endl; - std::cout << "AVX-512BW: " << (SimdCpuInfo(SimdCpuInfoAvx512bw) ? "Yes" : "No") << std::endl; - std::cout << "AVX-512VNNI: " << (SimdCpuInfo(SimdCpuInfoAvx512vnni) ? "Yes" : "No") << std::endl; - std::cout << "PowerPC-Altivec: " << (SimdCpuInfo(SimdCpuInfoVmx) ? "Yes" : "No") << std::endl; - std::cout << "PowerPC-VSX: " << (SimdCpuInfo(SimdCpuInfoVsx) ? "Yes" : "No") << std::endl; - std::cout << "ARM-NEON: " << (SimdCpuInfo(SimdCpuInfoNeon) ? "Yes" : "No") << std::endl; - std::cout << "MIPS-MSA: " << (SimdCpuInfo(SimdCpuInfoMsa) ? "Yes" : "No") << std::endl; - return 0; - } - \endverbatim - - \param [in] type - a type of required information. - \return a value which contains information about CPU and %Simd Library. - */ - SIMD_API size_t SimdCpuInfo(SimdCpuInfoType type); - - /*! @ingroup info - - \fn const char *SimdPerformanceStatistic(); - - \short Gets internal performance statistics of %Simd Library. - - \note %Simd Library have to be build with defined SIMD_PERFORMANCE_STATISTIC macro. - - \return string with internal performance statistics of %Simd Library. - */ - SIMD_API const char * SimdPerformanceStatistic(); - - /*! @ingroup memory - - \fn void * SimdAllocate(size_t size, size_t align); - - \short Allocates aligned memory block. - - \note The memory allocated by this function is must be deleted by function ::SimdFree. - - \param [in] size - a size of memory block. - \param [in] align - a required alignment of memory block. - - \return a pointer to allocated memory. - */ - SIMD_API void * SimdAllocate(size_t size, size_t align); - - /*! @ingroup memory - - \fn void SimdFree(void * ptr); - - \short Frees aligned memory block. - - \note This function frees a memory allocated by function ::SimdAllocate. - - \param [in] ptr - a pointer to the memory to be deleted. - */ - SIMD_API void SimdFree(void * ptr); - - /*! @ingroup memory - - \fn size_t SimdAlign(size_t size, size_t align); - - \short Gets aligned size. - - \param [in] size - an original size. - \param [in] align - a required alignment. - - \return an aligned size. - */ - SIMD_API size_t SimdAlign(size_t size, size_t align); - - /*! @ingroup memory - - \fn size_t SimdAlignment(); - - \short Gets alignment required for the most productive work of the Simd Library. - - \return a required alignment. - */ - SIMD_API size_t SimdAlignment(); - - /*! @ingroup memory - - \fn void SimdRelease(void * context); - - \short Releases context created with using of Simd Library API. - - \note This function releases a context created by functions ::SimdDetectionLoadA and ::SimdDetectionInit. - - \param [in] context - a context to be released. - */ - SIMD_API void SimdRelease(void * context); - - /*! @ingroup thread - - \fn size_t SimdGetThreadNumber(); - - \short Gets number of threads used by Simd Library to parallelize some algorithms. - - \return current thread number. - */ - SIMD_API size_t SimdGetThreadNumber(); - - /*! @ingroup thread - - \fn void SimdSetThreadNumber(size_t threadNumber); - - \short Sets number of threads used by Simd Library to parallelize some algorithms. - - \param [in] threadNumber - a number of threads. - */ - SIMD_API void SimdSetThreadNumber(size_t threadNumber); - - /*! 
/*! @ingroup cpu_flags - - \fn SimdBool SimdGetFastMode(); - - \short Gets current CPU Flush-To-Zero (FTZ) and Denormals-Are-Zero (DAZ) flags. It is used in order to process subnormal numbers. - - \return current 'fast' mode. - */ - SIMD_API SimdBool SimdGetFastMode(); - - /*! @ingroup cpu_flags - - \fn void SimdSetFastMode(SimdBool value); - - \short Sets current CPU Flush-To-Zero (FTZ) and Denormals-Are-Zero (DAZ) flags. It is used in order to process subnormal numbers. - - \param [in] value - a value of 'fast' mode. - */ - SIMD_API void SimdSetFastMode(SimdBool value); - - /*! @ingroup hash - - \fn uint32_t SimdCrc32c(const void * src, size_t size); - - \short Gets 32-bit cyclic redundancy check (CRC32c) for current data. - - Calculation is performed for polynomial 0x1EDC6F41 (Castagnoli-crc). - - \param [in] src - a pointer to data. - \param [in] size - a size of the data. - \return 32-bit cyclic redundancy check (CRC32c). - */ - SIMD_API uint32_t SimdCrc32c(const void * src, size_t size); - - /*! @ingroup correlation - - \fn void SimdAbsDifference(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, uint8_t * c, size_t cStride, size_t width, size_t height); - - \short Gets absolute difference of two gray 8-bit images, pixel by pixel. - - The three images must have the same width and height. - - \note This function has a C++ wrapper Simd::AbsDifference(const View & a, const View & b, View & c). - - \param [in] a - a pointer to pixels data of first image. - \param [in] aStride - a row size of first image. - \param [in] b - a pointer to pixels data of second image. - \param [in] bStride - a row size of second image. - \param [out] c - a pointer to pixels data of destination image. - \param [in] cStride - a row size of destination image. - \param [in] width - an image width. - \param [in] height - an image height. - */ - SIMD_API void SimdAbsDifference(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, uint8_t * c, size_t cStride, size_t width, size_t height); - - /*! @ingroup correlation - - \fn void SimdAbsDifferenceSum(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, size_t width, size_t height, uint64_t * sum); - - \short Gets sum of absolute difference of two gray 8-bit images. - - Both images must have the same width and height. - - \note This function has a C++ wrapper Simd::AbsDifferenceSum(const View & a, const View & b, uint64_t & sum). - - \param [in] a - a pointer to pixels data of first image. - \param [in] aStride - a row size of first image. - \param [in] b - a pointer to pixels data of second image. - \param [in] bStride - a row size of second image. - \param [in] width - an image width. - \param [in] height - an image height. - \param [out] sum - the result sum of absolute difference of two images. - */ - SIMD_API void SimdAbsDifferenceSum(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, size_t width, size_t height, uint64_t * sum); - - /*! @ingroup correlation - - \fn void SimdAbsDifferenceSumMasked(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, const uint8_t * mask, size_t maskStride, uint8_t index, size_t width, size_t height, uint64_t * sum); - - \short Gets sum of absolute difference of two gray 8-bit images based on gray 8-bit mask. - - Gets the absolute difference sum for all points when mask[i] == index. - Both images and mask must have the same width and height.
- - \note This function has a C++ wrapper Simd::AbsDifferenceSum(const View& a, const View& b, const View& mask, uint8_t index, uint64_t & sum). - - \param [in] a - a pointer to pixels data of first image. - \param [in] aStride - a row size of first image. - \param [in] b - a pointer to pixels data of second image. - \param [in] bStride - a row size of second image. - \param [in] mask - a pointer to pixels data of mask image. - \param [in] maskStride - a row size of mask image. - \param [in] index - a mask index. - \param [in] width - an image width. - \param [in] height - an image height. - \param [out] sum - the result sum of absolute difference of two images. - */ - SIMD_API void SimdAbsDifferenceSumMasked(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, - const uint8_t * mask, size_t maskStride, uint8_t index, size_t width, size_t height, uint64_t * sum); - - /*! @ingroup correlation - - \fn void SimdAbsDifferenceSums3x3(const uint8_t * current, size_t currentStride, const uint8_t * background, size_t backgroundStride, size_t width, size_t height, uint64_t * sums); - - \short Gets 9 sums of absolute difference of two gray 8-bit images with various relative shifts in neighborhood 3x3. - - Both images must have the same width and height. The image height and width must be equal to or greater than 3. - The sums are calculated with central part (indent width = 1) of current image and with part of background image with corresponding shift. - The shifts lie in the range [-1, 1] along the x and y axes. - - \note This function has a C++ wrapper Simd::AbsDifferenceSums3x3(const View& current, const View& background, uint64_t * sums). - - \param [in] current - a pointer to pixels data of current image. - \param [in] currentStride - a row size of the current image. - \param [in] background - a pointer to pixels data of the background image. - \param [in] backgroundStride - a row size of the background image. - \param [in] width - an image width. - \param [in] height - an image height. - \param [out] sums - the pointer to buffer with result sums. Buffer size must be equal to or greater than 9. - */ - SIMD_API void SimdAbsDifferenceSums3x3(const uint8_t * current, size_t currentStride, const uint8_t * background, size_t backgroundStride, - size_t width, size_t height, uint64_t * sums); - - /*! @ingroup correlation - - \fn void SimdAbsDifferenceSums3x3Masked(const uint8_t *current, size_t currentStride, const uint8_t *background, size_t backgroundStride, const uint8_t *mask, size_t maskStride, uint8_t index, size_t width, size_t height, uint64_t * sums); - - \short Gets 9 sums of absolute difference of two gray 8-bit images with various relative shifts in neighborhood 3x3 based on gray 8-bit mask. - - Gets the absolute difference sums for all points when mask[i] == index. - Both images and mask must have the same width and height. The image height and width must be equal to or greater than 3. - The sums are calculated with central part (indent width = 1) of current image and with part of background image with corresponding shift. - The shifts lie in the range [-1, 1] along the x and y axes. - - \note This function has a C++ wrapper Simd::AbsDifferenceSums3x3(const View& current, const View& background, const View& mask, uint8_t index, uint64_t * sums). - - \param [in] current - a pointer to pixels data of current image. - \param [in] currentStride - a row size of the current image. - \param [in] background - a pointer to pixels data of the background image. - \param [in] backgroundStride - a row size of the background image. - \param [in] mask - a pointer to pixels data of mask image. - \param [in] maskStride - a row size of mask image. - \param [in] index - a mask index. - \param [in] width - an image width. - \param [in] height - an image height. - \param [out] sums - the pointer to buffer with result sums. Buffer size must be equal to or greater than 9. - */ - SIMD_API void SimdAbsDifferenceSums3x3Masked(const uint8_t *current, size_t currentStride, const uint8_t *background, size_t backgroundStride, - const uint8_t *mask, size_t maskStride, uint8_t index, size_t width, size_t height, uint64_t * sums);
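A common use of the nine sums is a one-pixel local motion search: pick the shift whose difference sum is smallest. In the editor's sketch below, the mapping of index to shift (row-major over dy, dx in [-1, 1]) is an assumption and should be checked against the implementation:

\verbatim
/* Editor's sketch of a one-pixel motion search built on the function above.
   Assumption: sums[(dy + 1)*3 + (dx + 1)] corresponds to shift (dx, dy). */
static void FindBestShift(const uint8_t * cur, size_t curStride, const uint8_t * bkg, size_t bkgStride,
    size_t width, size_t height, int * bestDx, int * bestDy)
{
    uint64_t sums[9];
    SimdAbsDifferenceSums3x3(cur, curStride, bkg, bkgStride, width, height, sums);
    size_t best = 0;
    for (size_t i = 1; i < 9; ++i)
        if (sums[i] < sums[best])
            best = i;
    *bestDx = (int)(best%3) - 1;
    *bestDy = (int)(best/3) - 1;
}
\endverbatim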
/*! @ingroup other_filter - - \fn void SimdAbsGradientSaturatedSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride); - - \short Puts to the destination 8-bit gray image the saturated sum of absolute gradients for every point of the source 8-bit gray image. - - Both images must have the same width and height. - - For border pixels: - \verbatim - dst[x, y] = 0; - \endverbatim - - For other pixels: - \verbatim - dx = abs(src[x + 1, y] - src[x - 1, y]); - dy = abs(src[x, y + 1] - src[x, y - 1]); - dst[x, y] = min(dx + dy, 255); - \endverbatim - - \note This function has a C++ wrapper Simd::AbsGradientSaturatedSum(const View& src, View& dst). - - \param [in] src - a pointer to pixels data of source 8-bit gray image. - \param [in] srcStride - a row size of source image. - \param [in] width - an image width. - \param [in] height - an image height. - \param [out] dst - a pointer to pixels data of destination 8-bit gray image. - \param [in] dstStride - a row size of destination image. - */ - SIMD_API void SimdAbsGradientSaturatedSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, - uint8_t * dst, size_t dstStride); - - /*! @ingroup difference_estimation - - \fn void SimdAddFeatureDifference(const uint8_t * value, size_t valueStride, size_t width, size_t height, const uint8_t * lo, size_t loStride, const uint8_t * hi, size_t hiStride, uint16_t weight, uint8_t * difference, size_t differenceStride); - - \short Adds feature difference to common difference sum. - - All images must have the same width, height and format (8-bit gray). - - For every point: - \verbatim - excess = max(lo[i] - value[i], 0) + max(value[i] - hi[i], 0); - difference[i] += (weight * excess*excess) >> 16; - \endverbatim - - This function is used for difference estimation in the motion detection algorithm. - - \note This function has a C++ wrapper Simd::AddFeatureDifference(const View& value, const View& lo, const View& hi, uint16_t weight, View& difference). - - \param [in] value - a pointer to pixels data of current feature value. - \param [in] valueStride - a row size of the value image. - \param [in] width - an image width. - \param [in] height - an image height. - \param [in] lo - a pointer to pixels data of feature lower bound of dynamic background. - \param [in] loStride - a row size of the lo image. - \param [in] hi - a pointer to pixels data of feature upper bound of dynamic background. - \param [in] hiStride - a row size of the hi image. - \param [in] weight - a current feature weight (unsigned 16-bit value). - \param [in, out] difference - a pointer to pixels data of image with total difference. - \param [in] differenceStride - a row size of difference image. - */ - SIMD_API void SimdAddFeatureDifference(const uint8_t * value, size_t valueStride, size_t width, size_t height, - const uint8_t * lo, size_t loStride, const uint8_t * hi, size_t hiStride, - uint16_t weight, uint8_t * difference, size_t differenceStride);
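The per-point update above reads directly as scalar code. An editor's sketch follows; the clamping of the running sum to the 8-bit range is an assumption about the implementation rather than documented behavior:

\verbatim
/* Editor's scalar equivalent of the per-point update documented above. */
static void AddFeatureDifferencePoint(uint8_t value, uint8_t lo, uint8_t hi, uint16_t weight, uint8_t * difference)
{
    int excess = (lo > value ? lo - value : 0) + (value > hi ? value - hi : 0);
    int sum = *difference + ((weight*excess*excess) >> 16);
    *difference = (uint8_t)(sum > 255 ? 255 : sum); /* assumption: the sum saturates at 255 */
}
\endverbatim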
- */
- SIMD_API void SimdAddFeatureDifference(const uint8_t * value, size_t valueStride, size_t width, size_t height,
- const uint8_t * lo, size_t loStride, const uint8_t * hi, size_t hiStride,
- uint16_t weight, uint8_t * difference, size_t differenceStride);
-
- /*! @ingroup drawing
-
- \fn void SimdAlphaBlending(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t channelCount, const uint8_t * alpha, size_t alphaStride, uint8_t * dst, size_t dstStride);
-
- \short Performs alpha blending operation.
-
- All images must have the same width and height. Source and destination images must have the same format (8 bits per channel, for example GRAY8, BGR24 or BGRA32). Alpha must be an 8-bit gray image.
-
- For every point:
- \verbatim
- dst[x, y, c] = (src[x, y, c]*alpha[x, y] + dst[x, y, c]*(255 - alpha[x, y]))/255;
- \endverbatim
-
- This function is used for image drawing.
-
- \note This function has a C++ wrapper Simd::AlphaBlending(const View& src, const View& alpha, View& dst).
-
- \param [in] src - a pointer to pixels data of foreground image.
- \param [in] srcStride - a row size of the foreground image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [in] channelCount - a channel count for foreground and background images (1 <= channelCount <= 4).
- \param [in] alpha - a pointer to pixels data of image with alpha channel.
- \param [in] alphaStride - a row size of the alpha image.
- \param [in, out] dst - a pointer to pixels data of background image.
- \param [in] dstStride - a row size of the background image.
- */
- SIMD_API void SimdAlphaBlending(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t channelCount,
- const uint8_t * alpha, size_t alphaStride, uint8_t * dst, size_t dstStride);
-
- /*! @ingroup drawing
-
- \fn void SimdAlphaFilling(uint8_t * dst, size_t dstStride, size_t width, size_t height, const uint8_t * channel, size_t channelCount, const uint8_t * alpha, size_t alphaStride);
-
- \short Performs alpha filling operation.
-
- All images must have the same width and height. Destination images must have 8 bits per channel (for example GRAY8, BGR24 or BGRA32). Alpha must be an 8-bit gray image.
-
- For every point:
- \verbatim
- dst[x, y, c] = (channel[c]*alpha[x, y] + dst[x, y, c]*(255 - alpha[x, y]))/255;
- \endverbatim
-
- This function is used for image drawing.
-
- \note This function has a C++ wrapper Simd::AlphaFilling(View & dst, const Pixel & pixel, const View & alpha).
-
- \param [in, out] dst - a pointer to pixels data of background image.
- \param [in] dstStride - a row size of the background image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [in] channel - a pointer to pixel with foreground color.
- \param [in] channelCount - a channel count for foreground color and background images (1 <= channelCount <= 4).
- \param [in] alpha - a pointer to pixels data of image with alpha channel.
- \param [in] alphaStride - a row size of the alpha image.
- */
- SIMD_API void SimdAlphaFilling(uint8_t * dst, size_t dstStride, size_t width, size_t height, const uint8_t * channel, size_t channelCount, const uint8_t * alpha, size_t alphaStride);
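-
- /* Usage sketch: blends a 24-bit BGR foreground over a background in place.
-    It assumes tightly packed rows (stride == width * channelCount) and
-    <stdlib.h>; the buffer names and sizes below are illustrative only.
- \verbatim
- const size_t W = 640, H = 480, C = 3;
- uint8_t * fg = (uint8_t*)malloc(W * H * C);    // foreground image
- uint8_t * bg = (uint8_t*)malloc(W * H * C);    // background image (blended in place)
- uint8_t * alpha = (uint8_t*)malloc(W * H);     // 8-bit gray alpha image
- // ... fill fg, bg and alpha ...
- SimdAlphaBlending(fg, W * C, W, H, C, alpha, W, bg, W * C);
- \endverbatim
- */
-
- /*! @ingroup background
-
- \fn void SimdBackgroundGrowRangeSlow(const uint8_t * value, size_t valueStride, size_t width, size_t height, uint8_t * lo, size_t loStride, uint8_t * hi, size_t hiStride);
-
- \short Performs background update (initial grow, slow mode).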
-
- All images must have the same width, height and format (8-bit gray).
-
- For every point:
- \verbatim
- lo[i] -= value[i] < lo[i] ? 1 : 0;
- hi[i] += value[i] > hi[i] ? 1 : 0;
- \endverbatim
-
- This function is used for background updating in the motion detection algorithm.
-
- \note This function has a C++ wrapper Simd::BackgroundGrowRangeSlow(const View& value, View& lo, View& hi).
-
- \param [in] value - a pointer to pixels data of current feature value.
- \param [in] valueStride - a row size of the value image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [in, out] lo - a pointer to pixels data of feature lower bound of dynamic background.
- \param [in] loStride - a row size of the lo image.
- \param [in, out] hi - a pointer to pixels data of feature upper bound of dynamic background.
- \param [in] hiStride - a row size of the hi image.
- */
- SIMD_API void SimdBackgroundGrowRangeSlow(const uint8_t * value, size_t valueStride, size_t width, size_t height,
- uint8_t * lo, size_t loStride, uint8_t * hi, size_t hiStride);
-
- /*! @ingroup background
-
- \fn void SimdBackgroundGrowRangeFast(const uint8_t * value, size_t valueStride, size_t width, size_t height, uint8_t * lo, size_t loStride, uint8_t * hi, size_t hiStride);
-
- \short Performs background update (initial grow, fast mode).
-
- All images must have the same width, height and format (8-bit gray).
-
- For every point:
- \verbatim
- lo[i] = value[i] < lo[i] ? value[i] : lo[i];
- hi[i] = value[i] > hi[i] ? value[i] : hi[i];
- \endverbatim
-
- This function is used for background updating in the motion detection algorithm.
-
- \note This function has a C++ wrapper Simd::BackgroundGrowRangeFast(const View& value, View& lo, View& hi).
-
- \param [in] value - a pointer to pixels data of current feature value.
- \param [in] valueStride - a row size of the value image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [in, out] lo - a pointer to pixels data of feature lower bound of dynamic background.
- \param [in] loStride - a row size of the lo image.
- \param [in, out] hi - a pointer to pixels data of feature upper bound of dynamic background.
- \param [in] hiStride - a row size of the hi image.
- */
- SIMD_API void SimdBackgroundGrowRangeFast(const uint8_t * value, size_t valueStride, size_t width, size_t height,
- uint8_t * lo, size_t loStride, uint8_t * hi, size_t hiStride);
-
- /*! @ingroup background
-
- \fn void SimdBackgroundIncrementCount(const uint8_t * value, size_t valueStride, size_t width, size_t height, const uint8_t * loValue, size_t loValueStride, const uint8_t * hiValue, size_t hiValueStride, uint8_t * loCount, size_t loCountStride, uint8_t * hiCount, size_t hiCountStride);
-
- \short Performs collection of background statistics.
-
- All images must have the same width, height and format (8-bit gray).
-
- Updates background statistics counters for every point:
- \verbatim
- loCount[i] += (value[i] < loValue[i] && loCount[i] < 255) ? 1 : 0;
- hiCount[i] += (value[i] > hiValue[i] && hiCount[i] < 255) ? 1 : 0;
- \endverbatim
-
- This function is used for background updating in the motion detection algorithm.
-
- \note This function has a C++ wrapper Simd::BackgroundIncrementCount(const View& value, const View& loValue, const View& hiValue, View& loCount, View& hiCount).
-
- \param [in] value - a pointer to pixels data of current feature value.
- \param [in] valueStride - a row size of the value image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [in] loValue - a pointer to pixels data of value of feature lower bound of dynamic background.
- \param [in] loValueStride - a row size of the loValue image.
- \param [in] hiValue - a pointer to pixels data of value of feature upper bound of dynamic background.
- \param [in] hiValueStride - a row size of the hiValue image.
- \param [in, out] loCount - a pointer to pixels data of count of feature lower bound of dynamic background.
- \param [in] loCountStride - a row size of the loCount image.
- \param [in, out] hiCount - a pointer to pixels data of count of feature upper bound of dynamic background.
- \param [in] hiCountStride - a row size of the hiCount image.
- */
- SIMD_API void SimdBackgroundIncrementCount(const uint8_t * value, size_t valueStride, size_t width, size_t height,
- const uint8_t * loValue, size_t loValueStride, const uint8_t * hiValue, size_t hiValueStride,
- uint8_t * loCount, size_t loCountStride, uint8_t * hiCount, size_t hiCountStride);
-
- /*! @ingroup background
-
- \fn void SimdBackgroundAdjustRange(uint8_t * loCount, size_t loCountStride, size_t width, size_t height, uint8_t * loValue, size_t loValueStride, uint8_t * hiCount, size_t hiCountStride, uint8_t * hiValue, size_t hiValueStride, uint8_t threshold);
-
- \short Performs adjustment of background range.
-
- All images must have the same width, height and format (8-bit gray).
-
- Adjusts the background range for every point:
- \verbatim
- loValue[i] -= (loCount[i] > threshold && loValue[i] > 0) ? 1 : 0;
- loValue[i] += (loCount[i] < threshold && loValue[i] < 255) ? 1 : 0;
- loCount[i] = 0;
- hiValue[i] += (hiCount[i] > threshold && hiValue[i] < 255) ? 1 : 0;
- hiValue[i] -= (hiCount[i] < threshold && hiValue[i] > 0) ? 1 : 0;
- hiCount[i] = 0;
- \endverbatim
-
- This function is used for background updating in the motion detection algorithm.
-
- \note This function has a C++ wrapper Simd::BackgroundAdjustRange(View& loCount, View& loValue, View& hiCount, View& hiValue, uint8_t threshold).
-
- \param [in, out] loCount - a pointer to pixels data of count of feature lower bound of dynamic background.
- \param [in] loCountStride - a row size of the loCount image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [in, out] hiCount - a pointer to pixels data of count of feature upper bound of dynamic background.
- \param [in] hiCountStride - a row size of the hiCount image.
- \param [in, out] loValue - a pointer to pixels data of value of feature lower bound of dynamic background.
- \param [in] loValueStride - a row size of the loValue image.
- \param [in, out] hiValue - a pointer to pixels data of value of feature upper bound of dynamic background.
- \param [in] hiValueStride - a row size of the hiValue image.
- \param [in] threshold - a count threshold.
- */
- SIMD_API void SimdBackgroundAdjustRange(uint8_t * loCount, size_t loCountStride, size_t width, size_t height,
- uint8_t * loValue, size_t loValueStride, uint8_t * hiCount, size_t hiCountStride,
- uint8_t * hiValue, size_t hiValueStride, uint8_t threshold);
-
- /*! @ingroup background
-
- \fn void SimdBackgroundAdjustRangeMasked(uint8_t * loCount, size_t loCountStride, size_t width, size_t height, uint8_t * loValue, size_t loValueStride, uint8_t * hiCount, size_t hiCountStride, uint8_t * hiValue, size_t hiValueStride, uint8_t threshold, const uint8_t * mask, size_t maskStride);
-
- \short Performs adjustment of background range using the adjust range mask.
-
- All images must have the same width, height and format (8-bit gray).
-
- Adjusts the background range for every point:
- \verbatim
- if(mask[i])
- {
-     loValue[i] -= (loCount[i] > threshold && loValue[i] > 0) ? 1 : 0;
-     loValue[i] += (loCount[i] < threshold && loValue[i] < 255) ? 1 : 0;
-     loCount[i] = 0;
-     hiValue[i] += (hiCount[i] > threshold && hiValue[i] < 255) ? 1 : 0;
-     hiValue[i] -= (hiCount[i] < threshold && hiValue[i] > 0) ? 1 : 0;
-     hiCount[i] = 0;
- }
- \endverbatim
-
- This function is used for background updating in the motion detection algorithm.
-
- \note This function has a C++ wrapper Simd::BackgroundAdjustRange(View& loCount, View& loValue, View& hiCount, View& hiValue, uint8_t threshold, const View& mask).
-
- \param [in, out] loCount - a pointer to pixels data of count of feature lower bound of dynamic background.
- \param [in] loCountStride - a row size of the loCount image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [in, out] hiCount - a pointer to pixels data of count of feature upper bound of dynamic background.
- \param [in] hiCountStride - a row size of the hiCount image.
- \param [in, out] loValue - a pointer to pixels data of value of feature lower bound of dynamic background.
- \param [in] loValueStride - a row size of the loValue image.
- \param [in, out] hiValue - a pointer to pixels data of value of feature upper bound of dynamic background.
- \param [in] hiValueStride - a row size of the hiValue image.
- \param [in] threshold - a count threshold.
- \param [in] mask - a pointer to pixels data of adjust range mask.
- \param [in] maskStride - a row size of the mask image.
- */
- SIMD_API void SimdBackgroundAdjustRangeMasked(uint8_t * loCount, size_t loCountStride, size_t width, size_t height,
- uint8_t * loValue, size_t loValueStride, uint8_t * hiCount, size_t hiCountStride,
- uint8_t * hiValue, size_t hiValueStride, uint8_t threshold, const uint8_t * mask, size_t maskStride);
-
- /*! @ingroup background
-
- \fn void SimdBackgroundShiftRange(const uint8_t * value, size_t valueStride, size_t width, size_t height, uint8_t * lo, size_t loStride, uint8_t * hi, size_t hiStride);
-
- \short Shifts background range.
-
- All images must have the same width, height and format (8-bit gray).
-
- For every point (note that hi[i] must be updated before lo[i], so that the old lo[i] is used):
- \verbatim
- if (value[i] > hi[i])
- {
-     lo[i] = min(lo[i] + value[i] - hi[i], 255);
-     hi[i] = value[i];
- }
- if (lo[i] > value[i])
- {
-     hi[i] = max(hi[i] - lo[i] + value[i], 0);
-     lo[i] = value[i];
- }
- \endverbatim
-
- This function is used for fast background updating in the motion detection algorithm.
-
- \note This function has a C++ wrapper Simd::BackgroundShiftRange(const View& value, View& lo, View& hi).
-
- \param [in] value - a pointer to pixels data of current feature value.
- \param [in] valueStride - a row size of the value image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [in, out] lo - a pointer to pixels data of feature lower bound of dynamic background.
- \param [in] loStride - a row size of the lo image.
- \param [in, out] hi - a pointer to pixels data of feature upper bound of dynamic background.
- \param [in] hiStride - a row size of the hi image.
- */
- SIMD_API void SimdBackgroundShiftRange(const uint8_t * value, size_t valueStride, size_t width, size_t height,
- uint8_t * lo, size_t loStride, uint8_t * hi, size_t hiStride);
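-
- /* Usage sketch: a per-frame background maintenance cycle built from the
-    functions above, assuming packed 8-bit gray planes (stride == width);
-    the staging constants and names are illustrative only.
- \verbatim
- void UpdateBackground(const uint8_t * value, uint8_t * lo, uint8_t * hi,
-     uint8_t * loCount, uint8_t * hiCount, size_t w, size_t h, int frame)
- {
-     if (frame < 64) // initial grow stage
-         SimdBackgroundGrowRangeSlow(value, w, w, h, lo, w, hi, w);
-     else
-     {
-         SimdBackgroundIncrementCount(value, w, w, h, lo, w, hi, w, loCount, w, hiCount, w);
-         if (frame % 16 == 0) // periodically adjust the range and reset the counters
-             SimdBackgroundAdjustRange(loCount, w, w, h, lo, w, hiCount, w, hi, w, 8);
-     }
- }
- \endverbatim
- */
-
- /*! @ingroup background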
-
- \fn void SimdBackgroundShiftRangeMasked(const uint8_t * value, size_t valueStride, size_t width, size_t height, uint8_t * lo, size_t loStride, uint8_t * hi, size_t hiStride, const uint8_t * mask, size_t maskStride);
-
- \short Shifts background range using the shift range mask.
-
- All images must have the same width, height and format (8-bit gray).
-
- For every point (note that hi[i] must be updated before lo[i], so that the old lo[i] is used):
- \verbatim
- if(mask[i])
- {
-     if (value[i] > hi[i])
-     {
-         lo[i] = min(lo[i] + value[i] - hi[i], 255);
-         hi[i] = value[i];
-     }
-     if (lo[i] > value[i])
-     {
-         hi[i] = max(hi[i] - lo[i] + value[i], 0);
-         lo[i] = value[i];
-     }
- }
- \endverbatim
-
- This function is used for fast background updating in the motion detection algorithm.
-
- \note This function has a C++ wrapper Simd::BackgroundShiftRange(const View& value, View& lo, View& hi, const View& mask).
-
- \param [in] value - a pointer to pixels data of current feature value.
- \param [in] valueStride - a row size of the value image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [in, out] lo - a pointer to pixels data of feature lower bound of dynamic background.
- \param [in] loStride - a row size of the lo image.
- \param [in, out] hi - a pointer to pixels data of feature upper bound of dynamic background.
- \param [in] hiStride - a row size of the hi image.
- \param [in] mask - a pointer to pixels data of shift range mask.
- \param [in] maskStride - a row size of the mask image.
- */
- SIMD_API void SimdBackgroundShiftRangeMasked(const uint8_t * value, size_t valueStride, size_t width, size_t height,
- uint8_t * lo, size_t loStride, uint8_t * hi, size_t hiStride, const uint8_t * mask, size_t maskStride);
-
- /*! @ingroup background
-
- \fn void SimdBackgroundInitMask(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t index, uint8_t value, uint8_t * dst, size_t dstStride);
-
- \short Creates background update mask.
-
- All images must have the same width, height and format (8-bit gray).
-
- For every point:
- \verbatim
- if(src[i] == index)
-     dst[i] = value;
- \endverbatim
-
- This function is used for background updating in the motion detection algorithm.
-
- \note This function has a C++ wrapper Simd::BackgroundInitMask(const View& src, uint8_t index, uint8_t value, View& dst).
-
- \param [in] src - a pointer to pixels data of input mask image.
- \param [in] srcStride - a row size of input mask image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [in] index - a mask index into input mask.
- \param [in] value - a value to fill the output mask.
- \param [out] dst - a pointer to pixels data of output mask image.
- \param [in] dstStride - a row size of output mask image.
- */
- SIMD_API void SimdBackgroundInitMask(const uint8_t * src, size_t srcStride, size_t width, size_t height,
- uint8_t index, uint8_t value, uint8_t * dst, size_t dstStride);
-
- /*! @ingroup bayer_conversion
-
- \fn void SimdBayerToBgr(const uint8_t * bayer, size_t width, size_t height, size_t bayerStride, SimdPixelFormatType bayerFormat, uint8_t * bgr, size_t bgrStride);
-
- \short Converts 8-bit Bayer image to 24-bit BGR.
-
- All images must have the same width and height. The width and the height must be even.
-
- \note This function has a C++ wrapper Simd::BayerToBgr(const View& bayer, View& bgr).
-
- \param [in] bayer - a pointer to pixels data of input 8-bit Bayer image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [in] bayerStride - a row size of the bayer image. - \param [in] bayerFormat - a format of the input bayer image. It can be ::SimdPixelFormatBayerGrbg, ::SimdPixelFormatBayerGbrg, ::SimdPixelFormatBayerRggb or ::SimdPixelFormatBayerBggr. - \param [out] bgr - a pointer to pixels data of output 24-bit BGR image. - \param [in] bgrStride - a row size of the bgr image. - */ - SIMD_API void SimdBayerToBgr(const uint8_t * bayer, size_t width, size_t height, size_t bayerStride, SimdPixelFormatType bayerFormat, uint8_t * bgr, size_t bgrStride); - - /*! @ingroup bayer_conversion - - \fn void SimdBayerToBgra(const uint8_t * bayer, size_t width, size_t height, size_t bayerStride, SimdPixelFormatType bayerFormat, uint8_t * bgra, size_t bgraStride, uint8_t alpha); - - \short Converts 8-bit Bayer image to 32-bit BGRA. - - All images must have the same width and height. The width and the height must be even. - - \note This function has a C++ wrapper Simd::BayerToBgra(const View& bayer, View& bgra, uint8_t alpha). - - \param [in] bayer - a pointer to pixels data of input 8-bit Bayer image. - \param [in] width - an image width. - \param [in] height - an image height. - \param [in] bayerStride - a row size of the bayer image. - \param [in] bayerFormat - a format of the input bayer image. It can be ::SimdPixelFormatBayerGrbg, ::SimdPixelFormatBayerGbrg, ::SimdPixelFormatBayerRggb or ::SimdPixelFormatBayerBggr. - \param [out] bgra - a pointer to pixels data of output 32-bit BGRA image. - \param [in] bgraStride - a row size of the bgra image. - \param [in] alpha - a value of alpha channel. - */ - SIMD_API void SimdBayerToBgra(const uint8_t * bayer, size_t width, size_t height, size_t bayerStride, SimdPixelFormatType bayerFormat, uint8_t * bgra, size_t bgraStride, uint8_t alpha); - - /*! @ingroup bgra_conversion - - \fn void SimdBgraToBayer(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * bayer, size_t bayerStride, SimdPixelFormatType bayerFormat); - - \short Converts 32-bit BGRA image to 8-bit Bayer image. - - All images must have the same width and height. The width and the height must be even. - - \note This function has a C++ wrapper Simd::BgraToBayer(const View& bgra, View& bayer). - - \param [in] bgra - a pointer to pixels data of input 32-bit BGRA image. - \param [in] width - an image width. - \param [in] height - an image height. - \param [in] bgraStride - a row size of the bgra image. - \param [out] bayer - a pointer to pixels data of output 8-bit Bayer image. - \param [in] bayerStride - a row size of the bayer image. - \param [in] bayerFormat - a format of the output bayer image. It can be ::SimdPixelFormatBayerGrbg, ::SimdPixelFormatBayerGbrg, ::SimdPixelFormatBayerRggb or ::SimdPixelFormatBayerBggr. - */ - SIMD_API void SimdBgraToBayer(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * bayer, size_t bayerStride, SimdPixelFormatType bayerFormat); - - /*! @ingroup bgra_conversion - - \fn void SimdBgraToBgr(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * bgr, size_t bgrStride); - - \short Converts 32-bit BGRA image to 24-bit BGR image. - - All images must have the same width and height. - - \note This function has a C++ wrapper Simd::BgraToBgr(const View& bgra, View& bgr). - - \param [in] bgra - a pointer to pixels data of input 32-bit BGRA image. - \param [in] width - an image width. - \param [in] height - an image height. - \param [in] bgraStride - a row size of the bgra image. 
- \param [out] bgr - a pointer to pixels data of output 24-bit BGR image.
- \param [in] bgrStride - a row size of the bgr image.
- */
- SIMD_API void SimdBgraToBgr(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * bgr, size_t bgrStride);
-
- /*! @ingroup bgra_conversion
-
- \fn void SimdBgraToGray(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * gray, size_t grayStride);
-
- \short Converts 32-bit BGRA image to 8-bit gray image.
-
- All images must have the same width and height.
-
- \note This function has a C++ wrapper Simd::BgraToGray(const View& bgra, View& gray).
-
- \param [in] bgra - a pointer to pixels data of input 32-bit BGRA image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [in] bgraStride - a row size of the bgra image.
- \param [out] gray - a pointer to pixels data of output 8-bit gray image.
- \param [in] grayStride - a row size of the gray image.
- */
- SIMD_API void SimdBgraToGray(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * gray, size_t grayStride);
-
- /*! @ingroup bgra_conversion
-
- \fn void SimdBgraToRgb(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * rgb, size_t rgbStride);
-
- \short Converts 32-bit BGRA image to 24-bit RGB image.
-
- All images must have the same width and height.
-
- \note This function has a C++ wrapper Simd::BgraToRgb(const View& bgra, View& rgb).
-
- \param [in] bgra - a pointer to pixels data of input 32-bit BGRA image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [in] bgraStride - a row size of the bgra image.
- \param [out] rgb - a pointer to pixels data of output 24-bit RGB image.
- \param [in] rgbStride - a row size of the rgb image.
- */
- SIMD_API void SimdBgraToRgb(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgb, size_t rgbStride);
-
- /*! @ingroup bgra_conversion
-
- \fn void SimdBgraToYuv420p(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * y, size_t yStride, uint8_t * u, size_t uStride, uint8_t * v, size_t vStride);
-
- \short Converts 32-bit BGRA image to YUV420P.
-
- The input BGRA and output Y images must have the same width and height.
- The output U and V images must have the same width and height (half the width and height of the Y component).
-
- \note This function has a C++ wrapper Simd::BgraToYuv420p(const View& bgra, View& y, View& u, View& v).
-
- \param [in] bgra - a pointer to pixels data of input 32-bit BGRA image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [in] bgraStride - a row size of the BGRA image.
- \param [out] y - a pointer to pixels data of output 8-bit image with Y color plane.
- \param [in] yStride - a row size of the y image.
- \param [out] u - a pointer to pixels data of output 8-bit image with U color plane.
- \param [in] uStride - a row size of the u image.
- \param [out] v - a pointer to pixels data of output 8-bit image with V color plane.
- \param [in] vStride - a row size of the v image.
- */
- SIMD_API void SimdBgraToYuv420p(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * y, size_t yStride, uint8_t * u, size_t uStride, uint8_t * v, size_t vStride);
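-
- /* Usage sketch: converts a 32-bit BGRA frame to planar YUV420P, assuming even
-    frame dimensions, packed planes and <stdlib.h>; the names and sizes are
-    illustrative only.
- \verbatim
- const size_t W = 1280, H = 720; // must be even for 4:2:0 subsampling
- uint8_t * bgra = (uint8_t*)malloc(W * H * 4);
- uint8_t * y = (uint8_t*)malloc(W * H);
- uint8_t * u = (uint8_t*)malloc(W / 2 * H / 2);
- uint8_t * v = (uint8_t*)malloc(W / 2 * H / 2);
- // ... fill bgra ...
- SimdBgraToYuv420p(bgra, W, H, W * 4, y, W, u, W / 2, v, W / 2);
- \endverbatim
- */
-
- /*! @ingroup bgra_conversion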
-
- \fn void SimdBgraToYuv422p(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * y, size_t yStride, uint8_t * u, size_t uStride, uint8_t * v, size_t vStride);
-
- \short Converts 32-bit BGRA image to YUV422P.
-
- The input BGRA and output Y images must have the same width and height.
- The output U and V images must have the same width and height (their width is half the width of the Y component).
-
- \note This function has a C++ wrapper Simd::BgraToYuv422p(const View& bgra, View& y, View& u, View& v).
-
- \param [in] bgra - a pointer to pixels data of input 32-bit BGRA image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [in] bgraStride - a row size of the BGRA image.
- \param [out] y - a pointer to pixels data of output 8-bit image with Y color plane.
- \param [in] yStride - a row size of the y image.
- \param [out] u - a pointer to pixels data of output 8-bit image with U color plane.
- \param [in] uStride - a row size of the u image.
- \param [out] v - a pointer to pixels data of output 8-bit image with V color plane.
- \param [in] vStride - a row size of the v image.
- */
- SIMD_API void SimdBgraToYuv422p(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * y, size_t yStride, uint8_t * u, size_t uStride, uint8_t * v, size_t vStride);
-
- /*! @ingroup bgra_conversion
-
- \fn void SimdBgraToYuv444p(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * y, size_t yStride, uint8_t * u, size_t uStride, uint8_t * v, size_t vStride);
-
- \short Converts 32-bit BGRA image to YUV444P.
-
- The input BGRA and output Y, U and V images must have the same width and height.
-
- \note This function has a C++ wrapper Simd::BgraToYuv444p(const View& bgra, View& y, View& u, View& v).
-
- \param [in] bgra - a pointer to pixels data of input 32-bit BGRA image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [in] bgraStride - a row size of the BGRA image.
- \param [out] y - a pointer to pixels data of output 8-bit image with Y color plane.
- \param [in] yStride - a row size of the y image.
- \param [out] u - a pointer to pixels data of output 8-bit image with U color plane.
- \param [in] uStride - a row size of the u image.
- \param [out] v - a pointer to pixels data of output 8-bit image with V color plane.
- \param [in] vStride - a row size of the v image.
- */
- SIMD_API void SimdBgraToYuv444p(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * y, size_t yStride, uint8_t * u, size_t uStride, uint8_t * v, size_t vStride);
-
- /*! @ingroup bgra_conversion
-
- \fn void SimdBgraToYuva420p(const uint8_t * bgra, size_t bgraStride, size_t width, size_t height, uint8_t * y, size_t yStride, uint8_t * u, size_t uStride, uint8_t * v, size_t vStride, uint8_t * a, size_t aStride);
-
- \short Converts 32-bit BGRA image to YUVA420P.
-
- The input BGRA and output Y and A images must have the same width and height.
- The output U and V images must have the same width and height (half the width and height of the Y component).
-
- \note This function has a C++ wrapper Simd::BgraToYuva420p(const View & bgra, View & y, View & u, View & v, View & a).
-
- \param [in] bgra - a pointer to pixels data of input 32-bit BGRA image.
- \param [in] bgraStride - a row size of the BGRA image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [out] y - a pointer to pixels data of output 8-bit image with Y color plane. - \param [in] yStride - a row size of the y image. - \param [out] u - a pointer to pixels data of output 8-bit image with U color plane. - \param [in] uStride - a row size of the u image. - \param [out] v - a pointer to pixels data of output 8-bit image with V color plane. - \param [in] vStride - a row size of the v image. - \param [out] a - a pointer to pixels data of output 8-bit image with alpha plane. - \param [in] aStride - a row size of the a image. - */ - SIMD_API void SimdBgraToYuva420p(const uint8_t * bgra, size_t bgraStride, size_t width, size_t height, - uint8_t * y, size_t yStride, uint8_t * u, size_t uStride, uint8_t * v, size_t vStride, uint8_t * a, size_t aStride); - - /*! @ingroup bgr_conversion - - \fn void SimdBgrToBayer(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bayer, size_t bayerStride, SimdPixelFormatType bayerFormat); - - \short Converts 24-bit BGR image to 8-bit Bayer image. - - All images must have the same width and height. The width and the height must be even. - - \note This function has a C++ wrapper Simd::BgrToBayer(const View& bgr, View& bayer). - - \param [in] bgr - a pointer to pixels data of input 24-bit BGR image. - \param [in] width - an image width. - \param [in] height - an image height. - \param [in] bgrStride - a row size of the bgr image. - \param [out] bayer - a pointer to pixels data of output 8-bit Bayer image. - \param [in] bayerStride - a row size of the bayer image. - \param [in] bayerFormat - a format of the output bayer image. It can be ::SimdPixelFormatBayerGrbg, ::SimdPixelFormatBayerGbrg, ::SimdPixelFormatBayerRggb or ::SimdPixelFormatBayerBggr. - */ - SIMD_API void SimdBgrToBayer(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bayer, size_t bayerStride, SimdPixelFormatType bayerFormat); - - /*! @ingroup bgr_conversion - - \fn void SimdBgrToBgra(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha); - - \short Converts 24-bit BGR image to 32-bit BGRA image. - - All images must have the same width and height. - - \note This function has a C++ wrapper Simd::BgrToBgra(const View& bgr, View& bgra, uint8_t alpha). - - \param [in] bgr - a pointer to pixels data of input 24-bit BGR image. - \param [in] width - an image width. - \param [in] height - an image height. - \param [in] bgrStride - a row size of the bgr image. - \param [out] bgra - a pointer to pixels data of output 32-bit BGRA image. - \param [in] bgraStride - a row size of the bgra image. - \param [in] alpha - a value of alpha channel. - */ - SIMD_API void SimdBgrToBgra(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha); - - /*! @ingroup other_conversion - - \fn void SimdBgr48pToBgra32(const uint8_t * blue, size_t blueStride, size_t width, size_t height, const uint8_t * green, size_t greenStride, const uint8_t * red, size_t redStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha); - - \short Converts 48-bit planar BGR image to 32-bit BGRA image. - - All images must have the same width and height. - - \note This function has a C++ wrapper Simd::Bgr48pToBgra32(const View& blue, const View& green, const View& red, View& bgra, uint8_t alpha). - - \param [in] blue - a pointer to pixels data of input 16-bit image with blue color plane. - \param [in] blueStride - a row size of the blue image. 
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [in] green - a pointer to pixels data of input 16-bit image with green color plane.
- \param [in] greenStride - a row size of the green image.
- \param [in] red - a pointer to pixels data of input 16-bit image with red color plane.
- \param [in] redStride - a row size of the red image.
- \param [out] bgra - a pointer to pixels data of output 32-bit BGRA image.
- \param [in] bgraStride - a row size of the bgra image.
- \param [in] alpha - a value of alpha channel.
- */
- SIMD_API void SimdBgr48pToBgra32(const uint8_t * blue, size_t blueStride, size_t width, size_t height,
- const uint8_t * green, size_t greenStride, const uint8_t * red, size_t redStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha);
-
- /*! @ingroup bgr_conversion
-
- \fn void SimdBgrToGray(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * gray, size_t grayStride);
-
- \short Converts 24-bit BGR image to 8-bit gray image.
-
- All images must have the same width and height.
-
- \note This function has a C++ wrapper Simd::BgrToGray(const View& bgr, View& gray).
-
- \param [in] bgr - a pointer to pixels data of input 24-bit BGR image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [in] bgrStride - a row size of the bgr image.
- \param [out] gray - a pointer to pixels data of output 8-bit gray image.
- \param [in] grayStride - a row size of the gray image.
- */
- SIMD_API void SimdBgrToGray(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * gray, size_t grayStride);
-
- /*! @ingroup bgr_conversion
-
- \fn void SimdBgrToHsl(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * hsl, size_t hslStride);
-
- \short Converts 24-bit BGR image to 24-bit HSL (Hue, Saturation, Lightness) image.
-
- All images must have the same width and height.
-
- \note This function has a C++ wrapper Simd::BgrToHsl(const View& bgr, View& hsl).
-
- \param [in] bgr - a pointer to pixels data of input 24-bit BGR image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [in] bgrStride - a row size of the bgr image.
- \param [out] hsl - a pointer to pixels data of output 24-bit HSL image.
- \param [in] hslStride - a row size of the hsl image.
- */
- SIMD_API void SimdBgrToHsl(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * hsl, size_t hslStride);
-
- /*! @ingroup bgr_conversion
-
- \fn void SimdBgrToHsv(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * hsv, size_t hsvStride);
-
- \short Converts 24-bit BGR image to 24-bit HSV (Hue, Saturation, Value) image.
-
- All images must have the same width and height.
-
- \note This function has a C++ wrapper Simd::BgrToHsv(const View& bgr, View& hsv).
-
- \param [in] bgr - a pointer to pixels data of input 24-bit BGR image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [in] bgrStride - a row size of the bgr image.
- \param [out] hsv - a pointer to pixels data of output 24-bit HSV image.
- \param [in] hsvStride - a row size of the hsv image.
- */
- SIMD_API void SimdBgrToHsv(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * hsv, size_t hsvStride);
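-
- /* Usage sketch: converts a 24-bit BGR frame to 8-bit gray, assuming packed
-    rows and <stdlib.h>; the names and sizes are illustrative only.
- \verbatim
- const size_t W = 640, H = 480;
- uint8_t * bgr = (uint8_t*)malloc(W * H * 3);
- uint8_t * gray = (uint8_t*)malloc(W * H);
- // ... fill bgr ...
- SimdBgrToGray(bgr, W, H, W * 3, gray, W);
- \endverbatim
- */
-
- /*! @ingroup bgr_conversion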
-
- \fn void SimdBgrToRgb(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * rgb, size_t rgbStride);
-
- \short Converts 24-bit BGR image to 24-bit RGB image (the same function can be used for the backward conversion).
-
- All images must have the same width and height.
-
- \note This function has a C++ wrapper Simd::BgrToRgb(const View & bgr, View & rgb).
-
- \param [in] bgr - a pointer to pixels data of input 24-bit BGR image.
- \param [in] bgrStride - a row size of the bgr image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [out] rgb - a pointer to pixels data of output 24-bit RGB image.
- \param [in] rgbStride - a row size of the rgb image.
- */
- SIMD_API void SimdBgrToRgb(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * rgb, size_t rgbStride);
-
- /*! @ingroup bgr_conversion
-
- \fn void SimdBgrToYuv420p(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * y, size_t yStride, uint8_t * u, size_t uStride, uint8_t * v, size_t vStride);
-
- \short Converts 24-bit BGR image to YUV420P.
-
- The input BGR and output Y images must have the same width and height.
- The output U and V images must have the same width and height (half the width and height of the Y component).
-
- \note This function has a C++ wrapper Simd::BgrToYuv420p(const View& bgr, View& y, View& u, View& v).
-
- \param [in] bgr - a pointer to pixels data of input 24-bit BGR image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [in] bgrStride - a row size of the BGR image.
- \param [out] y - a pointer to pixels data of output 8-bit image with Y color plane.
- \param [in] yStride - a row size of the y image.
- \param [out] u - a pointer to pixels data of output 8-bit image with U color plane.
- \param [in] uStride - a row size of the u image.
- \param [out] v - a pointer to pixels data of output 8-bit image with V color plane.
- \param [in] vStride - a row size of the v image.
- */
- SIMD_API void SimdBgrToYuv420p(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * y, size_t yStride, uint8_t * u, size_t uStride, uint8_t * v, size_t vStride);
-
- /*! @ingroup bgr_conversion
-
- \fn void SimdBgrToYuv422p(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * y, size_t yStride, uint8_t * u, size_t uStride, uint8_t * v, size_t vStride);
-
- \short Converts 24-bit BGR image to YUV422P.
-
- The input BGR and output Y images must have the same width and height.
- The output U and V images must have the same width and height (their width is half the width of the Y component).
-
- \note This function has a C++ wrapper Simd::BgrToYuv422p(const View& bgr, View& y, View& u, View& v).
-
- \param [in] bgr - a pointer to pixels data of input 24-bit BGR image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [in] bgrStride - a row size of the BGR image.
- \param [out] y - a pointer to pixels data of output 8-bit image with Y color plane.
- \param [in] yStride - a row size of the y image.
- \param [out] u - a pointer to pixels data of output 8-bit image with U color plane.
- \param [in] uStride - a row size of the u image.
- \param [out] v - a pointer to pixels data of output 8-bit image with V color plane.
- \param [in] vStride - a row size of the v image.
- */
- SIMD_API void SimdBgrToYuv422p(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * y, size_t yStride, uint8_t * u, size_t uStride, uint8_t * v, size_t vStride);
-
- /*! @ingroup bgr_conversion
-
- \fn void SimdBgrToYuv444p(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * y, size_t yStride, uint8_t * u, size_t uStride, uint8_t * v, size_t vStride);
-
- \short Converts 24-bit BGR image to YUV444P.
-
- The input BGR and output Y, U and V images must have the same width and height.
-
- \note This function has a C++ wrapper Simd::BgrToYuv444p(const View& bgr, View& y, View& u, View& v).
-
- \param [in] bgr - a pointer to pixels data of input 24-bit BGR image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [in] bgrStride - a row size of the BGR image.
- \param [out] y - a pointer to pixels data of output 8-bit image with Y color plane.
- \param [in] yStride - a row size of the y image.
- \param [out] u - a pointer to pixels data of output 8-bit image with U color plane.
- \param [in] uStride - a row size of the u image.
- \param [out] v - a pointer to pixels data of output 8-bit image with V color plane.
- \param [in] vStride - a row size of the v image.
- */
- SIMD_API void SimdBgrToYuv444p(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * y, size_t yStride, uint8_t * u, size_t uStride, uint8_t * v, size_t vStride);
-
- /*! @ingroup binarization
-
- \fn void SimdBinarization(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t value, uint8_t positive, uint8_t negative, uint8_t * dst, size_t dstStride, SimdCompareType compareType);
-
- \short Performs binarization of 8-bit gray image.
-
- All images must have 8-bit gray format and must have the same width and height.
-
- For every point:
- \verbatim
- dst[i] = compare(src[i], value) ? positive : negative;
- \endverbatim
- where compare(a, b) depends on compareType (see ::SimdCompareType).
-
- \note This function has a C++ wrapper Simd::Binarization(const View& src, uint8_t value, uint8_t positive, uint8_t negative, View& dst, SimdCompareType compareType).
-
- \param [in] src - a pointer to pixels data of input 8-bit gray image (first value for compare operation).
- \param [in] srcStride - a row size of the src image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [in] value - a second value for compare operation.
- \param [in] positive - a destination value if comparison operation has a positive result.
- \param [in] negative - a destination value if comparison operation has a negative result.
- \param [out] dst - a pointer to pixels data of output 8-bit gray binarized image.
- \param [in] dstStride - a row size of the dst image.
- \param [in] compareType - a compare operation type (see ::SimdCompareType).
- */
- SIMD_API void SimdBinarization(const uint8_t * src, size_t srcStride, size_t width, size_t height,
- uint8_t value, uint8_t positive, uint8_t negative, uint8_t * dst, size_t dstStride, SimdCompareType compareType);
-
- /*! @ingroup binarization
-
- \fn void SimdAveragingBinarization(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t value, size_t neighborhood, uint8_t threshold, uint8_t positive, uint8_t negative, uint8_t * dst, size_t dstStride, SimdCompareType compareType);
-
- \short Performs averaging binarization of 8-bit gray image.
-
- All images must have 8-bit gray format and must have the same width and height.
-
- For every point:
- \verbatim
- sum = 0; area = 0;
- for(dy = -neighborhood; dy <= neighborhood; ++dy)
- {
-     for(dx = -neighborhood; dx <= neighborhood; ++dx)
-     {
-         if(x + dx >= 0 && x + dx < width && y + dy >= 0 && y + dy < height)
-         {
-             area++;
-             if(compare(src[x + dx, y + dy], value))
-                 sum++;
-         }
-     }
- }
- dst[x, y] = sum*255 > area*threshold ? positive : negative;
- \endverbatim
- where compare(a, b) depends on compareType (see ::SimdCompareType).
-
- \note This function has a C++ wrapper Simd::AveragingBinarization(const View& src, uint8_t value, size_t neighborhood, uint8_t threshold, uint8_t positive, uint8_t negative, View& dst, SimdCompareType compareType).
-
- \param [in] src - a pointer to pixels data of input 8-bit gray image (first value for compare operation).
- \param [in] srcStride - a row size of the src image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [in] value - a second value for compare operation.
- \param [in] neighborhood - an averaging neighborhood.
- \param [in] threshold - a threshold value for binarization. It can range from 0 to 255.
- \param [in] positive - a destination value used if the number of positive comparisons in the neighborhood of the point is greater than the threshold.
- \param [in] negative - a destination value used if the number of positive comparisons in the neighborhood of the point is less than or equal to the threshold.
- \param [out] dst - a pointer to pixels data of output 8-bit gray binarized image.
- \param [in] dstStride - a row size of the dst image.
- \param [in] compareType - a compare operation type (see ::SimdCompareType).
- */
- SIMD_API void SimdAveragingBinarization(const uint8_t * src, size_t srcStride, size_t width, size_t height,
- uint8_t value, size_t neighborhood, uint8_t threshold, uint8_t positive, uint8_t negative,
- uint8_t * dst, size_t dstStride, SimdCompareType compareType);
-
- /*! @ingroup conditional
-
- \fn void SimdConditionalCount8u(const uint8_t * src, size_t stride, size_t width, size_t height, uint8_t value, SimdCompareType compareType, uint32_t * count);
-
- \short Calculates number of points satisfying a certain condition for 8-bit gray image.
-
- For every point:
- \verbatim
- if(compare(src[i], value))
-     count++;
- \endverbatim
- where compare(a, b) depends on compareType (see ::SimdCompareType).
-
- \note This function has a C++ wrapper Simd::ConditionalCount8u(const View & src, uint8_t value, SimdCompareType compareType, uint32_t & count).
-
- \param [in] src - a pointer to pixels data of input 8-bit gray image (first value for compare operation).
- \param [in] stride - a row size of the src image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [in] value - a second value for compare operation.
- \param [in] compareType - a compare operation type (see ::SimdCompareType).
- \param [out] count - a pointer to result unsigned 32-bit value.
- */
- SIMD_API void SimdConditionalCount8u(const uint8_t * src, size_t stride, size_t width, size_t height,
- uint8_t value, SimdCompareType compareType, uint32_t * count);
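-
- /* Usage sketch: counts the pixels of an 8-bit gray image lying above a
-    threshold, assuming packed rows, <stdlib.h>, and the ::SimdCompareGreater
-    member of ::SimdCompareType; the names and values are illustrative only.
- \verbatim
- const size_t W = 640, H = 480;
- uint8_t * gray = (uint8_t*)malloc(W * H);
- // ... fill gray ...
- uint32_t count = 0;
- SimdConditionalCount8u(gray, W, W, H, 128, SimdCompareGreater, &count);
- \endverbatim
- */
-
- /*! @ingroup conditional
-
- \fn void SimdConditionalCount16i(const uint8_t * src, size_t stride, size_t width, size_t height, int16_t value, SimdCompareType compareType, uint32_t * count);
-
- \short Calculates number of points satisfying a certain condition for 16-bit signed integer image.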
-
- For every point:
- \verbatim
- if(compare(src[i], value))
-     count++;
- \endverbatim
- where compare(a, b) depends on compareType (see ::SimdCompareType).
-
- \note This function has a C++ wrapper Simd::ConditionalCount16i(const View & src, int16_t value, SimdCompareType compareType, uint32_t & count).
-
- \param [in] src - a pointer to pixels data of input 16-bit signed integer image (first value for compare operation).
- \param [in] stride - a row size of the src image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [in] value - a second value for compare operation.
- \param [in] compareType - a compare operation type (see ::SimdCompareType).
- \param [out] count - a pointer to result unsigned 32-bit value.
- */
- SIMD_API void SimdConditionalCount16i(const uint8_t * src, size_t stride, size_t width, size_t height,
- int16_t value, SimdCompareType compareType, uint32_t * count);
-
- /*! @ingroup conditional
-
- \fn void SimdConditionalSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, const uint8_t * mask, size_t maskStride, uint8_t value, SimdCompareType compareType, uint64_t * sum);
-
- \short Calculates the sum of image points at which the mask satisfies a certain condition.
-
- All images must have 8-bit gray format and must have the same width and height.
-
- For every point:
- \verbatim
- if(compare(mask[i], value))
-     sum += src[i];
- \endverbatim
- where compare(a, b) depends on compareType (see ::SimdCompareType).
-
- \note This function has a C++ wrapper Simd::ConditionalSum(const View & src, const View & mask, uint8_t value, SimdCompareType compareType, uint64_t & sum).
-
- \param [in] src - a pointer to pixels data of input 8-bit gray image.
- \param [in] srcStride - a row size of the src image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [in] mask - a pointer to pixels data of 8-bit gray mask (first value for compare operation).
- \param [in] maskStride - a row size of the mask image.
- \param [in] value - a second value for compare operation.
- \param [in] compareType - a compare operation type (see ::SimdCompareType).
- \param [out] sum - a pointer to result unsigned 64-bit value.
- */
- SIMD_API void SimdConditionalSum(const uint8_t * src, size_t srcStride, size_t width, size_t height,
- const uint8_t * mask, size_t maskStride, uint8_t value, SimdCompareType compareType, uint64_t * sum);
-
- /*! @ingroup conditional
-
- \fn void SimdConditionalSquareSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, const uint8_t * mask, size_t maskStride, uint8_t value, SimdCompareType compareType, uint64_t * sum);
-
- \short Calculates the sum of squared image points at which the mask satisfies a certain condition.
-
- All images must have 8-bit gray format and must have the same width and height.
-
- For every point:
- \verbatim
- if(compare(mask[i], value))
-     sum += src[i]*src[i];
- \endverbatim
- where compare(a, b) depends on compareType (see ::SimdCompareType).
-
- \note This function has a C++ wrapper Simd::ConditionalSquareSum(const View & src, const View & mask, uint8_t value, SimdCompareType compareType, uint64_t & sum).
-
- \param [in] src - a pointer to pixels data of input 8-bit gray image.
- \param [in] srcStride - a row size of the src image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [in] mask - a pointer to pixels data of 8-bit gray mask (first value for compare operation).
- \param [in] maskStride - a row size of the mask image.
- \param [in] value - a second value for compare operation.
- \param [in] compareType - a compare operation type (see ::SimdCompareType).
- \param [out] sum - a pointer to result unsigned 64-bit value.
- */
- SIMD_API void SimdConditionalSquareSum(const uint8_t * src, size_t srcStride, size_t width, size_t height,
- const uint8_t * mask, size_t maskStride, uint8_t value, SimdCompareType compareType, uint64_t * sum);
-
- /*! @ingroup conditional
-
- \fn void SimdConditionalSquareGradientSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, const uint8_t * mask, size_t maskStride, uint8_t value, SimdCompareType compareType, uint64_t * sum);
-
- \short Calculates the sum of squared gradients of image points at which the mask satisfies a certain condition.
-
- All images must have 8-bit gray format and must have the same width and height. The image height and width must be equal to or greater than 3.
-
- For every point except border:
- \verbatim
- if(compare(mask[x, y], value))
- {
-     dx = src[x + 1, y] - src[x - 1, y];
-     dy = src[x, y + 1] - src[x, y - 1];
-     sum += dx*dx + dy*dy;
- }
- \endverbatim
- where compare(a, b) depends on compareType (see ::SimdCompareType).
-
- \note This function has a C++ wrapper Simd::ConditionalSquareGradientSum(const View & src, const View & mask, uint8_t value, SimdCompareType compareType, uint64_t & sum).
-
- \param [in] src - a pointer to pixels data of input 8-bit gray image.
- \param [in] srcStride - a row size of the src image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [in] mask - a pointer to pixels data of 8-bit gray mask (first value for compare operation).
- \param [in] maskStride - a row size of the mask image.
- \param [in] value - a second value for compare operation.
- \param [in] compareType - a compare operation type (see ::SimdCompareType).
- \param [out] sum - a pointer to result unsigned 64-bit value.
- */
- SIMD_API void SimdConditionalSquareGradientSum(const uint8_t * src, size_t srcStride, size_t width, size_t height,
- const uint8_t * mask, size_t maskStride, uint8_t value, SimdCompareType compareType, uint64_t * sum);
-
- /*! @ingroup conditional
-
- \fn void SimdConditionalFill(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t threshold, SimdCompareType compareType, uint8_t value, uint8_t * dst, size_t dstStride);
-
- \short Fills pixels of 8-bit gray image by given value if corresponding pixels of input 8-bit gray image satisfy a certain condition.
-
- All images must have the same width and height.
-
- For every point:
- \verbatim
- if(compare(src[i], threshold))
-     dst[i] = value;
- \endverbatim
- where compare(a, b) depends on compareType (see ::SimdCompareType).
-
- \note This function has a C++ wrapper Simd::ConditionalFill(const View & src, uint8_t threshold, SimdCompareType compareType, uint8_t value, View & dst).
-
- \param [in] src - a pointer to pixels data of input 8-bit gray image.
- \param [in] srcStride - a row size of input image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [in] threshold - a second value for compare operation.
- \param [in] compareType - a compare operation type (see ::SimdCompareType).
- \param [in] value - a value for fill operation.
- \param [in, out] dst - a pointer to pixels data of the output 8-bit gray image.
- \param [in] dstStride - a row size of output image.
- */
- SIMD_API void SimdConditionalFill(const uint8_t * src, size_t srcStride, size_t width, size_t height,
- uint8_t threshold, SimdCompareType compareType, uint8_t value, uint8_t * dst, size_t dstStride);
-
- /*! @ingroup copying
-
- \fn void SimdCopy(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t pixelSize, uint8_t * dst, size_t dstStride);
-
- \short Copies pixels data of image from source to destination.
-
- All images must have the same width, height and format.
-
- \note This function has a C++ wrapper Simd::Copy(const View & src, View & dst).
-
- \param [in] src - a pointer to pixels data of source image.
- \param [in] srcStride - a row size of the src image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [in] pixelSize - a size of the image pixel.
- \param [out] dst - a pointer to pixels data of destination image.
- \param [in] dstStride - a row size of the dst image.
- */
- SIMD_API void SimdCopy(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t pixelSize, uint8_t * dst, size_t dstStride);
-
- /*! @ingroup copying
-
- \fn void SimdCopyFrame(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t pixelSize, size_t frameLeft, size_t frameTop, size_t frameRight, size_t frameBottom, uint8_t * dst, size_t dstStride);
-
- \short Copies pixels data of image from source to destination except for the portion bounded by the given frame.
-
- All images must have the same width, height and format.
-
- \note This function has a C++ wrapper Simd::CopyFrame(const View& src, const Rectangle & frame, View& dst).
-
- \param [in] src - a pointer to pixels data of source image.
- \param [in] srcStride - a row size of the src image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [in] pixelSize - a size of the image pixel.
- \param [in] frameLeft - a frame left side.
- \param [in] frameTop - a frame top side.
- \param [in] frameRight - a frame right side.
- \param [in] frameBottom - a frame bottom side.
- \param [out] dst - a pointer to pixels data of destination image.
- \param [in] dstStride - a row size of the dst image.
- */
- SIMD_API void SimdCopyFrame(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t pixelSize,
- size_t frameLeft, size_t frameTop, size_t frameRight, size_t frameBottom, uint8_t * dst, size_t dstStride);
-
- /*! @ingroup other_conversion
-
- \fn void SimdDeinterleaveUv(const uint8_t * uv, size_t uvStride, size_t width, size_t height, uint8_t * u, size_t uStride, uint8_t * v, size_t vStride);
-
- \short Deinterleaves 16-bit UV interleaved image into separated 8-bit U and V planar images.
-
- All images must have the same width and height.
- This function is used for NV12 to YUV420P conversion.
-
- \note This function has a C++ wrapper Simd::DeinterleaveUv(const View& uv, View& u, View& v).
-
- \param [in] uv - a pointer to pixels data of input 16-bit UV interleaved image.
- \param [in] uvStride - a row size of the uv image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [out] u - a pointer to pixels data of 8-bit U planar image.
- \param [in] uStride - a row size of the u image.
- \param [out] v - a pointer to pixels data of 8-bit V planar image.
- \param [in] vStride - a row size of the v image.
- */
- SIMD_API void SimdDeinterleaveUv(const uint8_t * uv, size_t uvStride, size_t width, size_t height,
- uint8_t * u, size_t uStride, uint8_t * v, size_t vStride);
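-
- /* Usage sketch: an NV12 to YUV420P conversion built from SimdCopy and
-    SimdDeinterleaveUv, assuming packed planes, even frame dimensions and
-    <stdlib.h>; the names and sizes are illustrative only.
- \verbatim
- const size_t W = 1280, H = 720;
- uint8_t * nv12 = (uint8_t*)malloc(W * H * 3 / 2); // Y plane followed by interleaved UV
- uint8_t * y = (uint8_t*)malloc(W * H);
- uint8_t * u = (uint8_t*)malloc(W / 2 * H / 2);
- uint8_t * v = (uint8_t*)malloc(W / 2 * H / 2);
- // ... fill nv12 ...
- SimdCopy(nv12, W, W, H, 1, y, W);
- SimdDeinterleaveUv(nv12 + W * H, W, W / 2, H / 2, u, W / 2, v, W / 2);
- \endverbatim
- */
-
- /*! @ingroup other_conversion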
-
- \fn void SimdDeinterleaveBgr(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride);
-
- \short Deinterleaves 24-bit BGR interleaved image into separated 8-bit Blue, Green and Red planar images.
-
- All images must have the same width and height.
-
- \note This function has a C++ wrapper Simd::DeinterleaveBgr(const View& bgr, View& b, View& g, View& r).
-
- \param [in] bgr - a pointer to pixels data of input 24-bit BGR interleaved image.
- \param [in] bgrStride - a row size of the bgr image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [out] b - a pointer to pixels data of 8-bit Blue planar image.
- \param [in] bStride - a row size of the b image.
- \param [out] g - a pointer to pixels data of 8-bit Green planar image.
- \param [in] gStride - a row size of the g image.
- \param [out] r - a pointer to pixels data of 8-bit Red planar image.
- \param [in] rStride - a row size of the r image.
- */
- SIMD_API void SimdDeinterleaveBgr(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height,
- uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride);
-
- /*! @ingroup other_conversion
-
- \fn void SimdDeinterleaveBgra(const uint8_t * bgra, size_t bgraStride, size_t width, size_t height, uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride, uint8_t * a, size_t aStride);
-
- \short Deinterleaves 32-bit BGRA interleaved image into separated 8-bit Blue, Green, Red and Alpha planar images.
-
- All images must have the same width and height.
-
- \note This function has a C++ wrapper Simd::DeinterleaveBgra(const View& bgra, View& b, View& g, View& r, View& a).
-
- \param [in] bgra - a pointer to pixels data of input 32-bit BGRA interleaved image.
- \param [in] bgraStride - a row size of the bgra image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [out] b - a pointer to pixels data of 8-bit Blue planar image.
- \param [in] bStride - a row size of the b image.
- \param [out] g - a pointer to pixels data of 8-bit Green planar image.
- \param [in] gStride - a row size of the g image.
- \param [out] r - a pointer to pixels data of 8-bit Red planar image.
- \param [in] rStride - a row size of the r image.
- \param [out] a - a pointer to pixels data of 8-bit Alpha planar image.
- \param [in] aStride - a row size of the a image.
- */
- SIMD_API void SimdDeinterleaveBgra(const uint8_t * bgra, size_t bgraStride, size_t width, size_t height,
- uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride, uint8_t * a, size_t aStride);
-
- /*! @ingroup object_detection
-
- \fn void * SimdDetectionLoadA(const char * path);
-
- \short Loads a classifier cascade from file.
-
- This function supports OpenCV HAAR and LBP cascade types.
- Tree-based cascades and old cascade formats are not supported.
-
- \note This function is used for implementation of Simd::Detection.
-
- \param [in] path - a path to cascade.
- \return a pointer to loaded cascade. On error it returns NULL.
- This pointer is used in functions ::SimdDetectionInfo and ::SimdDetectionInit, and must be released using the function ::SimdRelease.
- */
- SIMD_API void * SimdDetectionLoadA(const char * path);
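-
- /* Usage sketch: loads a cascade and queries its window size; the file name
-    is illustrative only.
- \verbatim
- void * cascade = SimdDetectionLoadA("haar_face.xml");
- if (cascade)
- {
-     size_t w = 0, h = 0;
-     SimdDetectionInfoFlags flags;
-     SimdDetectionInfo(cascade, &w, &h, &flags);
-     // ... SimdDetectionInit and detection calls (see below) ...
-     SimdRelease(cascade);
- }
- \endverbatim
- */
-
- /*! @ingroup object_detection
-
- \fn void * SimdDetectionLoadStringXml(char * xml);
-
- \short Loads a classifier cascade from a string.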
-
- This function supports OpenCV HAAR and LBP cascade types.
- Tree-based cascades and old cascade formats are not supported.
-
- \note This function is used for implementation of Simd::Detection.
-
- \param [in,out] xml - a string with the XML of a classifier cascade.
- \return a pointer to loaded cascade. On error it returns NULL.
- This pointer is used in functions ::SimdDetectionInfo and ::SimdDetectionInit, and must be released using the function ::SimdRelease.
- */
- SIMD_API void * SimdDetectionLoadStringXml(char * xml);
-
- /*! @ingroup object_detection
-
- \fn void SimdDetectionInfo(const void * data, size_t * width, size_t * height, SimdDetectionInfoFlags * flags);
-
- \short Gets information about the classifier cascade.
-
- \note This function is used for implementation of Simd::Detection.
-
- \param [in] data - a pointer to cascade which was received from the function ::SimdDetectionLoadA.
- \param [out] width - a pointer to returned width of cascade window.
- \param [out] height - a pointer to returned height of cascade window.
- \param [out] flags - a pointer to flags with other information (See ::SimdDetectionInfoFlags).
- */
- SIMD_API void SimdDetectionInfo(const void * data, size_t * width, size_t * height, SimdDetectionInfoFlags * flags);
-
- /*! @ingroup object_detection
-
- \fn void * SimdDetectionInit(const void * data, uint8_t * sum, size_t sumStride, size_t width, size_t height, uint8_t * sqsum, size_t sqsumStride, uint8_t * tilted, size_t tiltedStride, int throughColumn, int int16);
-
- \short Initializes hidden classifier cascade structure to work with given size of input 8-bit gray image.
-
- \note This function is used for implementation of Simd::Detection.
-
- \param [in] data - a pointer to cascade which was received from the function ::SimdDetectionLoadA.
- \param [in] sum - a pointer to pixels data of 32-bit integer image with integral sum of given input 8-bit gray image.
- See function ::SimdIntegral in order to estimate this integral sum.
- \param [in] sumStride - a row size of the sum image.
- \param [in] width - a width of the sum image. It must be one pixel greater than the width of the input 8-bit gray image.
- \param [in] height - a height of the sum image. It must be one pixel greater than the height of the input 8-bit gray image.
- \param [in] sqsum - a pointer to pixels data of 32-bit integer image with squared integral sum of given input 8-bit gray image.
- Its size must be equal to the sum image. See function ::SimdIntegral in order to estimate this squared integral sum.
- \param [in] sqsumStride - a row size of the sqsum image.
- \param [in] tilted - a pointer to pixels data of 32-bit integer image with tilted integral sum of given input 8-bit gray image.
- Its size must be equal to the sum image. See function ::SimdIntegral in order to estimate this tilted integral sum.
- \param [in] tiltedStride - a row size of the tilted image.
- \param [in] throughColumn - a flag to detect objects only in even columns and rows (to increase performance).
- \param [in] int16 - a flag to use the 16-bit integer version of the detection algorithm (See ::SimdDetectionInfo).
- \return a pointer to hidden cascade. On error it returns NULL.
- This pointer is used in functions ::SimdDetectionPrepare, ::SimdDetectionHaarDetect32fp, ::SimdDetectionHaarDetect32fi,
- ::SimdDetectionLbpDetect32fp, ::SimdDetectionLbpDetect32fi, ::SimdDetectionLbpDetect16ip and ::SimdDetectionLbpDetect16ii.
- It must be released using the function ::SimdRelease.
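-
- A minimal sketch of the whole detection pipeline (buffer allocation and error handling
- are elided; the cascade path and the image size W x H are placeholders):
- \verbatim
- void * data = SimdDetectionLoadA("cascade.xml");
- size_t winW, winH; SimdDetectionInfoFlags flags;
- SimdDetectionInfo(data, &winW, &winH, &flags);
- SimdIntegral(src, srcStride, W, H, sum, sumStride, sqsum, sqsumStride,
-     tilted, tiltedStride, SimdPixelFormatInt32, SimdPixelFormatInt32);
- void * hid = SimdDetectionInit(data, sum, sumStride, W + 1, H + 1,
-     sqsum, sqsumStride, tilted, tiltedStride, 0, 0);
- SimdDetectionPrepare(hid);
- SimdDetectionHaarDetect32fp(hid, mask, maskStride, 0, 0, W, H, dst, dstStride);
- SimdRelease(hid);
- SimdRelease(data);
- \endverbatim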
- */
- SIMD_API void * SimdDetectionInit(const void * data, uint8_t * sum, size_t sumStride, size_t width, size_t height,
- uint8_t * sqsum, size_t sqsumStride, uint8_t * tilted, size_t tiltedStride, int throughColumn, int int16);
-
- /*! @ingroup object_detection
-
- \fn void SimdDetectionPrepare(void * hid);
-
- \short Prepares hidden classifier cascade structure to work with given input 8-bit gray image.
-
- You must call this function before calling the functions ::SimdDetectionHaarDetect32fp, ::SimdDetectionHaarDetect32fi,
- ::SimdDetectionLbpDetect32fp, ::SimdDetectionLbpDetect32fi, ::SimdDetectionLbpDetect16ip and ::SimdDetectionLbpDetect16ii.
-
- \note This function is used for implementation of Simd::Detection.
-
- \param [in] hid - a pointer to hidden cascade which was received from the function ::SimdDetectionInit.
- */
- SIMD_API void SimdDetectionPrepare(void * hid);
-
- /*! @ingroup object_detection
-
- \fn void SimdDetectionHaarDetect32fp(const void * hid, const uint8_t * mask, size_t maskStride, ptrdiff_t left, ptrdiff_t top, ptrdiff_t right, ptrdiff_t bottom, uint8_t * dst, size_t dstStride);
-
- \short Performs object detection using a HAAR cascade classifier (uses 32-bit float numbers, processes all points).
-
- You must call the function ::SimdDetectionPrepare before calling this function.
- All restrictions (input mask and bounding box) apply to the left-top corner of the scanning window.
-
- \note This function is used for implementation of Simd::Detection.
-
- \param [in] hid - a pointer to hidden cascade which was received from the function ::SimdDetectionInit.
- \param [in] mask - a pointer to pixels data of 8-bit image with mask. The mask restricts detection region.
- \param [in] maskStride - a row size of the mask image.
- \param [in] left - a left side of bounding box which restricts detection region.
- \param [in] top - a top side of bounding box which restricts detection region.
- \param [in] right - a right side of bounding box which restricts detection region.
- \param [in] bottom - a bottom side of bounding box which restricts detection region.
- \param [out] dst - a pointer to pixels data of 8-bit image with output result. Non-zero points correspond to the left-top corners of detected objects.
- \param [in] dstStride - a row size of the dst image.
- */
- SIMD_API void SimdDetectionHaarDetect32fp(const void * hid, const uint8_t * mask, size_t maskStride,
- ptrdiff_t left, ptrdiff_t top, ptrdiff_t right, ptrdiff_t bottom, uint8_t * dst, size_t dstStride);
-
- /*! @ingroup object_detection
-
- \fn void SimdDetectionHaarDetect32fi(const void * hid, const uint8_t * mask, size_t maskStride, ptrdiff_t left, ptrdiff_t top, ptrdiff_t right, ptrdiff_t bottom, uint8_t * dst, size_t dstStride);
-
- \short Performs object detection using a HAAR cascade classifier (uses 32-bit float numbers, processes only even points).
-
- You must call the function ::SimdDetectionPrepare before calling this function.
- All restrictions (input mask and bounding box) apply to the left-top corner of the scanning window.
-
- \note This function is used for implementation of Simd::Detection.
-
- \param [in] hid - a pointer to hidden cascade which was received from the function ::SimdDetectionInit.
- \param [in] mask - a pointer to pixels data of 8-bit image with mask. The mask restricts detection region.
- \param [in] maskStride - a row size of the mask image.
- \param [in] left - a left side of bounding box which restricts detection region.
- \param [in] top - a top side of bounding box which restricts detection region.
- \param [in] right - a right side of bounding box which restricts detection region.
- \param [in] bottom - a bottom side of bounding box which restricts detection region.
- \param [out] dst - a pointer to pixels data of 8-bit image with output result. Non-zero points correspond to the left-top corners of detected objects.
- \param [in] dstStride - a row size of the dst image.
- */
- SIMD_API void SimdDetectionHaarDetect32fi(const void * hid, const uint8_t * mask, size_t maskStride,
- ptrdiff_t left, ptrdiff_t top, ptrdiff_t right, ptrdiff_t bottom, uint8_t * dst, size_t dstStride);
-
- /*! @ingroup object_detection
-
- \fn void SimdDetectionLbpDetect32fp(const void * hid, const uint8_t * mask, size_t maskStride, ptrdiff_t left, ptrdiff_t top, ptrdiff_t right, ptrdiff_t bottom, uint8_t * dst, size_t dstStride);
-
- \short Performs object detection using an LBP cascade classifier (uses 32-bit float numbers, processes all points).
-
- You must call the function ::SimdDetectionPrepare before calling this function.
- All restrictions (input mask and bounding box) apply to the left-top corner of the scanning window.
-
- \note This function is used for implementation of Simd::Detection.
-
- \param [in] hid - a pointer to hidden cascade which was received from the function ::SimdDetectionInit.
- \param [in] mask - a pointer to pixels data of 8-bit image with mask. The mask restricts detection region.
- \param [in] maskStride - a row size of the mask image.
- \param [in] left - a left side of bounding box which restricts detection region.
- \param [in] top - a top side of bounding box which restricts detection region.
- \param [in] right - a right side of bounding box which restricts detection region.
- \param [in] bottom - a bottom side of bounding box which restricts detection region.
- \param [out] dst - a pointer to pixels data of 8-bit image with output result. Non-zero points correspond to the left-top corners of detected objects.
- \param [in] dstStride - a row size of the dst image.
- */
- SIMD_API void SimdDetectionLbpDetect32fp(const void * hid, const uint8_t * mask, size_t maskStride,
- ptrdiff_t left, ptrdiff_t top, ptrdiff_t right, ptrdiff_t bottom, uint8_t * dst, size_t dstStride);
-
- /*! @ingroup object_detection
-
- \fn void SimdDetectionLbpDetect32fi(const void * hid, const uint8_t * mask, size_t maskStride, ptrdiff_t left, ptrdiff_t top, ptrdiff_t right, ptrdiff_t bottom, uint8_t * dst, size_t dstStride);
-
- \short Performs object detection using an LBP cascade classifier (uses 32-bit float numbers, processes only even points).
-
- You must call the function ::SimdDetectionPrepare before calling this function.
- All restrictions (input mask and bounding box) apply to the left-top corner of the scanning window.
-
- \note This function is used for implementation of Simd::Detection.
-
- \param [in] hid - a pointer to hidden cascade which was received from the function ::SimdDetectionInit.
- \param [in] mask - a pointer to pixels data of 8-bit image with mask. The mask restricts detection region.
- \param [in] maskStride - a row size of the mask image.
- \param [in] left - a left side of bounding box which restricts detection region.
- \param [in] top - a top side of bounding box which restricts detection region.
- \param [in] right - a right side of bounding box which restricts detection region.
- \param [in] bottom - a bottom side of bounding box which restricts detection region.
- \param [out] dst - a pointer to pixels data of 8-bit image with output result. Non-zero points correspond to the left-top corners of detected objects.
- \param [in] dstStride - a row size of the dst image.
- */
- SIMD_API void SimdDetectionLbpDetect32fi(const void * hid, const uint8_t * mask, size_t maskStride,
- ptrdiff_t left, ptrdiff_t top, ptrdiff_t right, ptrdiff_t bottom, uint8_t * dst, size_t dstStride);
-
- /*! @ingroup object_detection
-
- \fn void SimdDetectionLbpDetect16ip(const void * hid, const uint8_t * mask, size_t maskStride, ptrdiff_t left, ptrdiff_t top, ptrdiff_t right, ptrdiff_t bottom, uint8_t * dst, size_t dstStride);
-
- \short Performs object detection using an LBP cascade classifier (uses 16-bit integer numbers, processes all points).
-
- You must call the function ::SimdDetectionPrepare before calling this function.
- All restrictions (input mask and bounding box) apply to the left-top corner of the scanning window.
-
- \note This function is used for implementation of Simd::Detection.
-
- \param [in] hid - a pointer to hidden cascade which was received from the function ::SimdDetectionInit.
- \param [in] mask - a pointer to pixels data of 8-bit image with mask. The mask restricts detection region.
- \param [in] maskStride - a row size of the mask image.
- \param [in] left - a left side of bounding box which restricts detection region.
- \param [in] top - a top side of bounding box which restricts detection region.
- \param [in] right - a right side of bounding box which restricts detection region.
- \param [in] bottom - a bottom side of bounding box which restricts detection region.
- \param [out] dst - a pointer to pixels data of 8-bit image with output result. Non-zero points correspond to the left-top corners of detected objects.
- \param [in] dstStride - a row size of the dst image.
- */
- SIMD_API void SimdDetectionLbpDetect16ip(const void * hid, const uint8_t * mask, size_t maskStride,
- ptrdiff_t left, ptrdiff_t top, ptrdiff_t right, ptrdiff_t bottom, uint8_t * dst, size_t dstStride);
-
- /*! @ingroup object_detection
-
- \fn void SimdDetectionLbpDetect16ii(const void * hid, const uint8_t * mask, size_t maskStride, ptrdiff_t left, ptrdiff_t top, ptrdiff_t right, ptrdiff_t bottom, uint8_t * dst, size_t dstStride);
-
- \short Performs object detection using an LBP cascade classifier (uses 16-bit integer numbers, processes only even points).
-
- You must call the function ::SimdDetectionPrepare before calling this function.
- All restrictions (input mask and bounding box) apply to the left-top corner of the scanning window.
-
- \note This function is used for implementation of Simd::Detection.
-
- \param [in] hid - a pointer to hidden cascade which was received from the function ::SimdDetectionInit.
- \param [in] mask - a pointer to pixels data of 8-bit image with mask. The mask restricts detection region.
- \param [in] maskStride - a row size of the mask image.
- \param [in] left - a left side of bounding box which restricts detection region.
- \param [in] top - a top side of bounding box which restricts detection region.
- \param [in] right - a right side of bounding box which restricts detection region.
- \param [in] bottom - a bottom side of bounding box which restricts detection region.
- \param [out] dst - a pointer to pixels data of 8-bit image with output result. Non-zero points correspond to the left-top corners of detected objects.
- \param [in] dstStride - a row size of the dst image.
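-
- A minimal calling sketch (hid is assumed to come from ::SimdDetectionInit called with
- int16 != 0 and throughColumn != 0; the mask is pre-filled with 255 to scan the whole
- W x H image):
- \verbatim
- SimdFill(mask, maskStride, W, H, 1, 255);
- SimdDetectionPrepare(hid);
- SimdDetectionLbpDetect16ii(hid, mask, maskStride, 0, 0, W, H, dst, dstStride);
- \endverbatim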
- */ - SIMD_API void SimdDetectionLbpDetect16ii(const void * hid, const uint8_t * mask, size_t maskStride, - ptrdiff_t left, ptrdiff_t top, ptrdiff_t right, ptrdiff_t bottom, uint8_t * dst, size_t dstStride); - - /*! @ingroup edge_background - - \fn void SimdEdgeBackgroundGrowRangeSlow(const uint8_t * value, size_t valueStride, size_t width, size_t height, uint8_t * background, size_t backgroundStride); - - \short Performs edge background update (initial grow, slow mode). - - All images must have the same width, height and format (8-bit gray). - - For every point: - \verbatim - background[i] += value[i] > background[i] ? 1 : 0; - \endverbatim - - This function is used for edge background updating in motion detection algorithm. - - \note This function has a C++ wrapper Simd::EdgeBackgroundGrowRangeSlow(const View& value, View& background). - - \param [in] value - a pointer to pixels data of current feature value. - \param [in] valueStride - a row size of the value image. - \param [in] width - an image width. - \param [in] height - an image height. - \param [in, out] background - a pointer to pixels data of feature value of edge dynamic background. - \param [in] backgroundStride - a row size of the background image. - */ - SIMD_API void SimdEdgeBackgroundGrowRangeSlow(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * background, size_t backgroundStride); - - /*! @ingroup edge_background - - \fn void SimdEdgeBackgroundGrowRangeFast(const uint8_t * value, size_t valueStride, size_t width, size_t height, uint8_t * background, size_t backgroundStride); - - \short Performs edge background update (initial grow, fast mode). - - All images must have the same width, height and format (8-bit gray). - - For every point: - \verbatim - background[i] = value[i] > background[i] ? value[i] : background[i]; - \endverbatim - - This function is used for edge background updating in motion detection algorithm. - - \note This function has a C++ wrapper Simd::EdgeBackgroundGrowRangeFast(const View& value, View& background). - - \param [in] value - a pointer to pixels data of current feature value. - \param [in] valueStride - a row size of the value image. - \param [in] width - an image width. - \param [in] height - an image height. - \param [in, out] background - a pointer to pixels data of feature value of edge dynamic background. - \param [in] backgroundStride - a row size of the background image. - */ - SIMD_API void SimdEdgeBackgroundGrowRangeFast(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * background, size_t backgroundStride); - - /*! @ingroup edge_background - - \fn void SimdEdgeBackgroundIncrementCount(const uint8_t * value, size_t valueStride, size_t width, size_t height, const uint8_t * backgroundValue, size_t backgroundValueStride, uint8_t * backgroundCount, size_t backgroundCountStride); - - \short Performs collection of edge background statistic. - - All images must have the same width, height and format (8-bit gray). - - Updates background statistic counters for every point: - \verbatim - backgroundCount[i] += (value[i] > backgroundValue[i] && backgroundCount[i] < 255) ? 1 : 0; - \endverbatim - - This function is used for edge background updating in motion detection algorithm. - - \note This function has a C++ wrapper Simd::EdgeBackgroundIncrementCount(const View& value, const View& backgroundValue, View& backgroundCount). - - \param [in] value - a pointer to pixels data of current feature value. 
- \param [in] valueStride - a row size of the value image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [in] backgroundValue - a pointer to pixels data of value of feature of edge dynamic background.
- \param [in] backgroundValueStride - a row size of the backgroundValue image.
- \param [in, out] backgroundCount - a pointer to pixels data of count of feature of edge dynamic background.
- \param [in] backgroundCountStride - a row size of the backgroundCount image.
- */
- SIMD_API void SimdEdgeBackgroundIncrementCount(const uint8_t * value, size_t valueStride, size_t width, size_t height,
- const uint8_t * backgroundValue, size_t backgroundValueStride, uint8_t * backgroundCount, size_t backgroundCountStride);
-
- /*! @ingroup edge_background
-
- \fn void SimdEdgeBackgroundAdjustRange(uint8_t * backgroundCount, size_t backgroundCountStride, size_t width, size_t height, uint8_t * backgroundValue, size_t backgroundValueStride, uint8_t threshold);
-
- \short Performs adjustment of edge background range.
-
- All images must have the same width, height and format (8-bit gray).
-
- Adjusts edge background range for every point:
- \verbatim
- backgroundValue[i] += (backgroundCount[i] > threshold && backgroundValue[i] < 255) ? 1 : 0;
- backgroundValue[i] -= (backgroundCount[i] < threshold && backgroundValue[i] > 0) ? 1 : 0;
- backgroundCount[i] = 0;
- \endverbatim
-
- This function is used for edge background updating in motion detection algorithm.
-
- \note This function has a C++ wrapper Simd::EdgeBackgroundAdjustRange(View& backgroundCount, View& backgroundValue, uint8_t threshold).
-
- \param [in, out] backgroundCount - a pointer to pixels data of count of feature of edge dynamic background.
- \param [in] backgroundCountStride - a row size of the backgroundCount image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [in, out] backgroundValue - a pointer to pixels data of value of feature of edge dynamic background.
- \param [in] backgroundValueStride - a row size of the backgroundValue image.
- \param [in] threshold - a count threshold.
- */
- SIMD_API void SimdEdgeBackgroundAdjustRange(uint8_t * backgroundCount, size_t backgroundCountStride, size_t width, size_t height,
- uint8_t * backgroundValue, size_t backgroundValueStride, uint8_t threshold);
-
- /*! @ingroup edge_background
-
- \fn void SimdEdgeBackgroundAdjustRangeMasked(uint8_t * backgroundCount, size_t backgroundCountStride, size_t width, size_t height, uint8_t * backgroundValue, size_t backgroundValueStride, uint8_t threshold, const uint8_t * mask, size_t maskStride);
-
- \short Performs adjustment of edge background range using the adjust range mask.
-
- All images must have the same width, height and format (8-bit gray).
-
- Adjusts edge background range for every point:
- \verbatim
- if(mask[i])
- {
- backgroundValue[i] += (backgroundCount[i] > threshold && backgroundValue[i] < 255) ? 1 : 0;
- backgroundValue[i] -= (backgroundCount[i] < threshold && backgroundValue[i] > 0) ? 1 : 0;
- backgroundCount[i] = 0;
- }
- \endverbatim
-
- This function is used for edge background updating in motion detection algorithm.
-
- \note This function has a C++ wrapper Simd::EdgeBackgroundAdjustRange(View& backgroundCount, View& backgroundValue, uint8_t threshold, const View& mask).
-
- \param [in, out] backgroundCount - a pointer to pixels data of count of feature of edge dynamic background.
- \param [in] backgroundCountStride - a row size of the backgroundCount image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [in, out] backgroundValue - a pointer to pixels data of value of feature of edge dynamic background.
- \param [in] backgroundValueStride - a row size of the backgroundValue image.
- \param [in] threshold - a count threshold.
- \param [in] mask - a pointer to pixels data of adjust range mask.
- \param [in] maskStride - a row size of the mask image.
- */
- SIMD_API void SimdEdgeBackgroundAdjustRangeMasked(uint8_t * backgroundCount, size_t backgroundCountStride, size_t width, size_t height,
- uint8_t * backgroundValue, size_t backgroundValueStride, uint8_t threshold, const uint8_t * mask, size_t maskStride);
-
- /*! @ingroup edge_background
-
- \fn void SimdEdgeBackgroundShiftRange(const uint8_t * value, size_t valueStride, size_t width, size_t height, uint8_t * background, size_t backgroundStride);
-
- \short Shifts edge background range.
-
- All images must have the same width, height and format (8-bit gray).
-
- For every point:
- \verbatim
- background[i] = value[i];
- \endverbatim
-
- This function is used for fast edge background updating in motion detection algorithm.
-
- \note This function has a C++ wrapper Simd::EdgeBackgroundShiftRange(const View& value, View& background).
-
- \param [in] value - a pointer to pixels data of current feature value.
- \param [in] valueStride - a row size of the value image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [in, out] background - a pointer to pixels data of feature of edge dynamic background.
- \param [in] backgroundStride - a row size of the background image.
- */
- SIMD_API void SimdEdgeBackgroundShiftRange(const uint8_t * value, size_t valueStride, size_t width, size_t height,
- uint8_t * background, size_t backgroundStride);
-
- /*! @ingroup edge_background
-
- \fn void SimdEdgeBackgroundShiftRangeMasked(const uint8_t * value, size_t valueStride, size_t width, size_t height, uint8_t * background, size_t backgroundStride, const uint8_t * mask, size_t maskStride);
-
- \short Shifts edge background range using the shift range mask.
-
- All images must have the same width, height and format (8-bit gray).
-
- For every point:
- \verbatim
- if(mask[i])
- background[i] = value[i];
- \endverbatim
-
- This function is used for fast edge background updating in motion detection algorithm.
-
- \note This function has a C++ wrapper Simd::EdgeBackgroundShiftRange(const View& value, View& background, const View& mask).
-
- \param [in] value - a pointer to pixels data of current feature value.
- \param [in] valueStride - a row size of the value image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [in, out] background - a pointer to pixels data of feature of edge dynamic background.
- \param [in] backgroundStride - a row size of the background image.
- \param [in] mask - a pointer to pixels data of shift range mask.
- \param [in] maskStride - a row size of the mask image.
- */
- SIMD_API void SimdEdgeBackgroundShiftRangeMasked(const uint8_t * value, size_t valueStride, size_t width, size_t height,
- uint8_t * background, size_t backgroundStride, const uint8_t * mask, size_t maskStride);
-
- /*! @ingroup filling
-
- \fn void SimdFill(uint8_t * dst, size_t stride, size_t width, size_t height, size_t pixelSize, uint8_t value);
-
- \short Fills pixels data of image by given value.
-
- \note This function has a C++ wrapper Simd::Fill(View& dst, uint8_t value).
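-
- A minimal sketch (data and stride describe a caller-allocated 8-bit gray image, so pixelSize is 1):
- \verbatim
- // Set every pixel of a width x height gray image to 128.
- SimdFill(data, stride, width, height, 1, 128);
- \endverbatim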
-
- \param [out] dst - a pointer to pixels data of destination image.
- \param [in] stride - a row size of the dst image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [in] pixelSize - a size of the image pixel.
- \param [in] value - a value to fill image.
- */
- SIMD_API void SimdFill(uint8_t * dst, size_t stride, size_t width, size_t height, size_t pixelSize, uint8_t value);
-
- /*! @ingroup filling
-
- \fn void SimdFillFrame(uint8_t * dst, size_t stride, size_t width, size_t height, size_t pixelSize, size_t frameLeft, size_t frameTop, size_t frameRight, size_t frameBottom, uint8_t value);
-
- \short Fills pixels data of image except for the portion bounded by the frame with given value.
-
- \note This function has a C++ wrapper Simd::FillFrame(View& dst, const Rectangle & frame, uint8_t value).
-
- \param [out] dst - a pointer to pixels data of destination image.
- \param [in] stride - a row size of the dst image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [in] pixelSize - a size of the image pixel.
- \param [in] frameLeft - a frame left side.
- \param [in] frameTop - a frame top side.
- \param [in] frameRight - a frame right side.
- \param [in] frameBottom - a frame bottom side.
- \param [in] value - a value to fill image.
- */
- SIMD_API void SimdFillFrame(uint8_t * dst, size_t stride, size_t width, size_t height, size_t pixelSize,
- size_t frameLeft, size_t frameTop, size_t frameRight, size_t frameBottom, uint8_t value);
-
- /*! @ingroup filling
-
- \fn void SimdFillBgr(uint8_t * dst, size_t stride, size_t width, size_t height, uint8_t blue, uint8_t green, uint8_t red);
-
- \short Fills pixels data of 24-bit BGR image by given color (blue, green, red).
-
- \note This function has a C++ wrapper Simd::FillBgr(View& dst, uint8_t blue, uint8_t green, uint8_t red).
-
- \param [out] dst - a pointer to pixels data of destination image.
- \param [in] stride - a row size of the dst image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [in] blue - a blue channel of BGR to fill image.
- \param [in] green - a green channel of BGR to fill image.
- \param [in] red - a red channel of BGR to fill image.
- */
- SIMD_API void SimdFillBgr(uint8_t * dst, size_t stride, size_t width, size_t height, uint8_t blue, uint8_t green, uint8_t red);
-
- /*! @ingroup filling
-
- \fn void SimdFillBgra(uint8_t * dst, size_t stride, size_t width, size_t height, uint8_t blue, uint8_t green, uint8_t red, uint8_t alpha);
-
- \short Fills pixels data of 32-bit BGRA image by given color (blue, green, red, alpha).
-
- \note This function has a C++ wrapper Simd::FillBgra(View& dst, uint8_t blue, uint8_t green, uint8_t red, uint8_t alpha).
-
- \param [out] dst - a pointer to pixels data of destination image.
- \param [in] stride - a row size of the dst image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [in] blue - a blue channel of BGRA to fill image.
- \param [in] green - a green channel of BGRA to fill image.
- \param [in] red - a red channel of BGRA to fill image.
- \param [in] alpha - an alpha channel of BGRA to fill image.
- */
- SIMD_API void SimdFillBgra(uint8_t * dst, size_t stride, size_t width, size_t height,
- uint8_t blue, uint8_t green, uint8_t red, uint8_t alpha);
-
- /*!
@ingroup filling - - \fn void SimdFillPixel(uint8_t * dst, size_t stride, size_t width, size_t height, const uint8_t * pixel, size_t pixelSize); - - \short Fills image by value of given pixel. - - \note This function has a C++ wrapper Simd::FillPixel(View & dst, const Pixel & pixel). - - \param [out] dst - a pointer to pixels data of destination image. - \param [in] stride - a row size of the dst image. - \param [in] width - an image width. - \param [in] height - an image height. - \param [in] pixel - a pointer to pixel to fill. - \param [in] pixelSize - a size of the image pixel. Parameter is restricted by range [1, 4]. - */ - SIMD_API void SimdFillPixel(uint8_t * dst, size_t stride, size_t width, size_t height, const uint8_t * pixel, size_t pixelSize); - - /*! @ingroup filling - - \fn void SimdFill32f(float * dst, size_t size, const float * value); - - \short Fills 32-bit float array by given value. - - \param [out] dst - a pointer to 32-bit float array. - \param [in] size - a size of the array. - \param [in] value - a pointer to value to fill. Can be NULL (filling value is assumed to be equal to zero). - */ - SIMD_API void SimdFill32f(float * dst, size_t size, const float * value); - - /*! @ingroup float16 - - \fn void SimdFloat32ToFloat16(const float * src, size_t size, uint16_t * dst); - - \short Converts numbers in the array from 32-bit float to 16-bit float format. - - \param [in] src - a pointer to the input array with 32-bit float point numbers. - \param [in] size - a size of input and output array. - \param [out] dst - a pointer to the output array with 16-bit float point numbers. - */ - SIMD_API void SimdFloat32ToFloat16(const float * src, size_t size, uint16_t * dst); - - /*! @ingroup float16 - - \fn void SimdFloat16ToFloat32(const uint16_t* src, size_t size, float * dst); - - \short Converts numbers in the array from 16-bit float to 32-bit float format. - - \param [in] src - a pointer to the input array with 16-bit float point numbers. - \param [in] size - a size of input and output array. - \param [out] dst - a pointer to the output array with 32-bit float point numbers. - */ - SIMD_API void SimdFloat16ToFloat32(const uint16_t * src, size_t size, float * dst); - - /*! @ingroup float16 - - \fn void SimdSquaredDifferenceSum16f(const uint16_t * a, const uint16_t * b, size_t size, float * sum); - - \short Calculates sum of squared differences for two 16-bit float arrays. - - All arrays must have the same size. - - For every element: - \verbatim - sum += (a[i] - b[i])*(a[i] - b[i]); - \endverbatim - - \param [in] a - a pointer to the first 16-bit float array. - \param [in] b - a pointer to the second 16-bit float array. - \param [in] size - a size of arrays. - \param [out] sum - a pointer to 32-bit float point sum of squared differences. - */ - SIMD_API void SimdSquaredDifferenceSum16f(const uint16_t * a, const uint16_t * b, size_t size, float * sum); - - /*! @ingroup float16 - - \fn void SimdCosineDistance16f(const uint16_t * a, const uint16_t * b, size_t size, float * distance); - - \short Calculates cosine distance of two 16-bit float arrays. - - All arrays must have the same size. - - Algorithm description: - \verbatim - distance = 1 - Sum(a[i]*b[i])/Sqrt(Sum(a[i]*a[i])*Sum(b[i]*b[i])); - \endverbatim - - \param [in] a - a pointer to the first 16-bit float array. - \param [in] b - a pointer to the second 16-bit float array. - \param [in] size - a size of arrays. - \param [out] distance - a pointer to 32-bit float with cosine distance. 
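-
- A minimal sketch (a32 and b32 are placeholder 32-bit float arrays of length n filled by the
- caller; a16 and b16 are caller-allocated uint16_t arrays of the same length):
- \verbatim
- SimdFloat32ToFloat16(a32, n, a16);
- SimdFloat32ToFloat16(b32, n, b16);
- float distance;
- SimdCosineDistance16f(a16, b16, n, &distance);
- \endverbatim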
- */
- SIMD_API void SimdCosineDistance16f(const uint16_t * a, const uint16_t * b, size_t size, float * distance);
-
- /*! @ingroup float16
-
- \fn void SimdCosineDistancesMxNa16f(size_t M, size_t N, size_t K, const uint16_t * const * A, const uint16_t * const * B, float * distances);
-
- \short Calculates mutual cosine distance of two arrays of 16-bit float arrays.
-
- Algorithm description:
- \verbatim
- distances[i, j] = 1 - Sum(A[i][k]*B[j][k])/Sqrt(Sum(A[i][k]*A[i][k])*Sum(B[j][k]*B[j][k]));
- \endverbatim
-
- \param [in] M - a number of A arrays.
- \param [in] N - a number of B arrays.
- \param [in] K - a size of A and B arrays.
- \param [in] A - a pointer to the first array with pointers to 16-bit float arrays.
- \param [in] B - a pointer to the second array with pointers to 16-bit float arrays.
- \param [out] distances - a pointer to result 32-bit float array with cosine distances. Its size must be M*N.
- */
- SIMD_API void SimdCosineDistancesMxNa16f(size_t M, size_t N, size_t K, const uint16_t * const * A, const uint16_t * const * B, float * distances);
-
- /*! @ingroup other_conversion
-
- \fn void SimdFloat32ToUint8(const float * src, size_t size, const float * lower, const float * upper, uint8_t * dst);
-
- \short Converts numbers in the array from 32-bit float to 8-bit unsigned integer format.
-
- For every element:
- \verbatim
- dst[i] = (min(max(src[i], lower), upper) - lower)*255/(upper - lower);
- \endverbatim
-
- \param [in] src - a pointer to the input array with 32-bit float point numbers.
- \param [in] size - a size of input and output array.
- \param [in] lower - a pointer to lower saturated bound of the input array.
- \param [in] upper - a pointer to upper saturated bound of the input array.
- \param [out] dst - a pointer to the output array with 8-bit unsigned integer numbers.
- */
- SIMD_API void SimdFloat32ToUint8(const float * src, size_t size, const float * lower, const float * upper, uint8_t * dst);
-
- /*! @ingroup other_conversion
-
- \fn void SimdUint8ToFloat32(const uint8_t* src, size_t size, const float * lower, const float * upper, float * dst);
-
- \short Converts numbers in the array from 8-bit unsigned integer to 32-bit float format.
-
- For every element:
- \verbatim
- dst[i] = src[i]*(upper - lower)/255 + lower;
- \endverbatim
-
- \param [in] src - a pointer to the input array with 8-bit unsigned integer numbers.
- \param [in] size - a size of input and output array.
- \param [in] lower - a pointer to lower bound of the output array.
- \param [in] upper - a pointer to upper bound of the output array.
- \param [out] dst - a pointer to the output array with 32-bit float point numbers.
- */
- SIMD_API void SimdUint8ToFloat32(const uint8_t * src, size_t size, const float * lower, const float * upper, float * dst);
-
- /*! @ingroup correlation
-
- \fn void SimdCosineDistance32f(const float * a, const float * b, size_t size, float * distance);
-
- \short Calculates cosine distance of two 32-bit float arrays.
-
- All arrays must have the same size.
-
- Algorithm description:
- \verbatim
- distance = 1 - Sum(a[i]*b[i])/Sqrt(Sum(a[i]*a[i])*Sum(b[i]*b[i]));
- \endverbatim
-
- \param [in] a - a pointer to the first 32-bit float array.
- \param [in] b - a pointer to the second 32-bit float array.
- \param [in] size - a size of arrays.
- \param [out] distance - a pointer to 32-bit float with cosine distance.
- */
- SIMD_API void SimdCosineDistance32f(const float * a, const float * b, size_t size, float * distance);
-
- /*!
@ingroup other_filter - - \fn void SimdGaussianBlur3x3(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t channelCount, uint8_t * dst, size_t dstStride); - - \short Performs Gaussian blur filtration with window 3x3. - - For every point: - \verbatim - dst[x, y] = (src[x-1, y-1] + 2*src[x, y-1] + src[x+1, y-1] + - 2*(src[x-1, y] + 2*src[x, y] + src[x+1, y]) + - src[x-1, y+1] + 2*src[x, y+1] + src[x+1, y+1] + 8) / 16; - \endverbatim - - All images must have the same width, height and format (8-bit gray, 16-bit UV, 24-bit BGR or 32-bit BGRA). - - \note This function has a C++ wrapper Simd::GaussianBlur3x3(const View& src, View& dst). - - \param [in] src - a pointer to pixels data of source image. - \param [in] srcStride - a row size of the src image. - \param [in] width - an image width. - \param [in] height - an image height. - \param [in] channelCount - a channel count. - \param [out] dst - a pointer to pixels data of destination image. - \param [in] dstStride - a row size of the dst image. - */ - SIMD_API void SimdGaussianBlur3x3(const uint8_t * src, size_t srcStride, size_t width, size_t height, - size_t channelCount, uint8_t * dst, size_t dstStride); - - /*! @ingroup matrix - - \fn void SimdGemm32fNN(size_t M, size_t N, size_t K, const float * alpha, const float * A, size_t lda, const float * B, size_t ldb, const float * beta, float * C, size_t ldc); - - \short Performs general matrix multiplication (for 32-bit float numbers). - - \verbatim - C(M, N) = alpha*A(M, K)*B(K, N) + beta*C(M, N); - \endverbatim - - \note This function supports multithreading (See functions ::SimdGetThreadNumber and ::SimdSetThreadNumber). - - \param [in] M - a height of A and height of C matrices. - \param [in] N - a width of B and width of C matrices. - \param [in] K - a width of A and height of B matrices. - \param [in] alpha - a pointer to multiplier of the first term. - \param [in] A - a pointer to input A matrix. - \param [in] lda - a leading dimension of A matrix. - \param [in] B - a pointer to input B matrix. - \param [in] ldb - a leading dimension of B matrix. - \param [in] beta - a pointer to multiplier of the second term. - \param [out] C - a pointer to output C matrix. - \param [in] ldc - a leading dimension of C matrix. - */ - SIMD_API void SimdGemm32fNN(size_t M, size_t N, size_t K, const float * alpha, const float * A, size_t lda, const float * B, size_t ldb, const float * beta, float * C, size_t ldc); - - /*! @ingroup matrix - - \fn void SimdGemm32fNT(size_t M, size_t N, size_t K, const float * alpha, const float * A, size_t lda, const float * B, size_t ldb, const float * beta, float * C, size_t ldc); - - \short Performs general matrix multiplication (for 32-bit float numbers). - - \verbatim - C(M, N) = alpha*A(M, K)*Trans(B(N, K)) + beta*C(M, N); - \endverbatim - - \note This function supports multithreading (See functions ::SimdGetThreadNumber and ::SimdSetThreadNumber). - - \param [in] M - a height of A and height of C matrices. - \param [in] N - a height of B and width of C matrices. - \param [in] K - a width of A and width of B matrices. - \param [in] alpha - a pointer to multiplier of the first term. - \param [in] A - a pointer to input A matrix. - \param [in] lda - a leading dimension of A matrix. - \param [in] B - a pointer to input B matrix. - \param [in] ldb - a leading dimension of B matrix. - \param [in] beta - a pointer to multiplier of the second term. - \param [out] C - a pointer to output C matrix. - \param [in] ldc - a leading dimension of C matrix. 
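-
- A minimal sketch (M, N, K and the dense row-major arrays A, B, C are placeholders supplied
- by the caller; note that alpha and beta are passed by pointer):
- \verbatim
- float alpha = 1.0f, beta = 0.0f;
- // C(M, N) = A(M, K) * Trans(B(N, K)): lda = K, ldb = K, ldc = N for row-major storage.
- SimdGemm32fNT(M, N, K, &alpha, A, K, B, K, &beta, C, N);
- \endverbatim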
- */
- SIMD_API void SimdGemm32fNT(size_t M, size_t N, size_t K, const float * alpha, const float * A, size_t lda, const float * B, size_t ldb, const float * beta, float * C, size_t ldc);
-
- /*! @ingroup gray_conversion
-
- \fn void SimdGrayToBgr(const uint8_t * gray, size_t width, size_t height, size_t grayStride, uint8_t * bgr, size_t bgrStride);
-
- \short Converts 8-bit gray image to 24-bit BGR image.
-
- All images must have the same width and height.
-
- \note This function has a C++ wrapper Simd::GrayToBgr(const View& gray, View& bgr).
-
- \param [in] gray - a pointer to pixels data of input 8-bit gray image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [in] grayStride - a row size of the gray image.
- \param [out] bgr - a pointer to pixels data of output 24-bit BGR image.
- \param [in] bgrStride - a row size of the bgr image.
- */
- SIMD_API void SimdGrayToBgr(const uint8_t *gray, size_t width, size_t height, size_t grayStride, uint8_t *bgr, size_t bgrStride);
-
- /*! @ingroup gray_conversion
-
- \fn void SimdGrayToBgra(const uint8_t * gray, size_t width, size_t height, size_t grayStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha);
-
- \short Converts 8-bit gray image to 32-bit BGRA image.
-
- All images must have the same width and height.
-
- \note This function has a C++ wrapper Simd::GrayToBgra(const View& gray, View& bgra, uint8_t alpha).
-
- \param [in] gray - a pointer to pixels data of input 8-bit gray image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [in] grayStride - a row size of the gray image.
- \param [out] bgra - a pointer to pixels data of output 32-bit BGRA image.
- \param [in] bgraStride - a row size of the bgra image.
- \param [in] alpha - a value of alpha channel.
- */
- SIMD_API void SimdGrayToBgra(const uint8_t *gray, size_t width, size_t height, size_t grayStride,
- uint8_t *bgra, size_t bgraStride, uint8_t alpha);
-
- /*! @ingroup histogram
-
- \fn void SimdAbsSecondDerivativeHistogram(const uint8_t * src, size_t width, size_t height, size_t stride, size_t step, size_t indent, uint32_t * histogram);
-
- \short Calculates histogram of second derivative for 8-bit gray image.
-
- For all points except the boundary (defined by parameter indent):
- \verbatim
- dx = abs(src[x, y] - average(src[x+step, y], src[x-step, y]));
- dy = abs(src[x, y] - average(src[x, y+step], src[x, y-step]));
- histogram[max(dx, dy)]++;
- \endverbatim
-
- \note This function has a C++ wrapper Simd::AbsSecondDerivativeHistogram(const View& src, size_t step, size_t indent, uint32_t * histogram).
-
- \param [in] src - a pointer to pixels data of input 8-bit gray image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [in] stride - a row size of the image.
- \param [in] step - a step for second derivative calculation.
- \param [in] indent - an indent from image boundary.
- \param [out] histogram - a pointer to histogram (array of 256 unsigned 32-bit values).
- */
- SIMD_API void SimdAbsSecondDerivativeHistogram(const uint8_t * src, size_t width, size_t height, size_t stride,
- size_t step, size_t indent, uint32_t * histogram);
-
- /*! @ingroup histogram
-
- \fn void SimdHistogram(const uint8_t * src, size_t width, size_t height, size_t stride, uint32_t * histogram);
-
- \short Calculates histogram for 8-bit gray image.
-
- For all points:
- \verbatim
- histogram[src[i]]++;
- \endverbatim
-
- \note This function has a C++ wrapper Simd::Histogram(const View& src, uint32_t * histogram).
-
- \param [in] src - a pointer to pixels data of input 8-bit gray image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [in] stride - a row size of the image.
- \param [out] histogram - a pointer to histogram (array of 256 unsigned 32-bit values).
- */
- SIMD_API void SimdHistogram(const uint8_t * src, size_t width, size_t height, size_t stride, uint32_t * histogram);
-
- /*! @ingroup histogram
-
- \fn void SimdHistogramMasked(const uint8_t * src, size_t srcStride, size_t width, size_t height, const uint8_t * mask, size_t maskStride, uint8_t index, uint32_t * histogram);
-
- \short Calculates histogram for 8-bit gray image using a mask.
-
- For every point:
- \verbatim
- if(mask[i] == index)
- histogram[src[i]]++;
- \endverbatim
-
- \note This function has a C++ wrapper Simd::HistogramMasked(const View & src, const View & mask, uint8_t index, uint32_t * histogram).
-
- \param [in] src - a pointer to pixels data of input 8-bit gray image.
- \param [in] srcStride - a row size of the image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [in] mask - a pointer to pixels data of the mask 8-bit image.
- \param [in] maskStride - a row size of the mask image.
- \param [in] index - a mask index.
- \param [out] histogram - a pointer to histogram (array of 256 unsigned 32-bit values).
- */
- SIMD_API void SimdHistogramMasked(const uint8_t * src, size_t srcStride, size_t width, size_t height,
- const uint8_t * mask, size_t maskStride, uint8_t index, uint32_t * histogram);
-
- /*! @ingroup histogram
-
- \fn void SimdHistogramConditional(const uint8_t * src, size_t srcStride, size_t width, size_t height, const uint8_t * mask, size_t maskStride, uint8_t value, SimdCompareType compareType, uint32_t * histogram);
-
- \short Calculates histogram of 8-bit gray image for points whose mask values satisfy a certain condition.
-
- For every point:
- \verbatim
- if(compare(mask[x, y], value))
- histogram[src[x, y]]++;
- \endverbatim
-
- \note This function has a C++ wrapper Simd::HistogramConditional(const View& src, const View& mask, uint8_t value, SimdCompareType compareType, uint32_t * histogram).
-
- \param [in] src - a pointer to pixels data of input 8-bit gray image.
- \param [in] srcStride - a row size of the image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [in] mask - a pointer to pixels data of the mask 8-bit image.
- \param [in] maskStride - a row size of the mask image.
- \param [in] value - a second value for compare operation.
- \param [in] compareType - a compare operation type (see ::SimdCompareType).
- \param [out] histogram - a pointer to histogram (array of 256 unsigned 32-bit values).
- */
- SIMD_API void SimdHistogramConditional(const uint8_t * src, size_t srcStride, size_t width, size_t height,
- const uint8_t * mask, size_t maskStride, uint8_t value, SimdCompareType compareType, uint32_t * histogram);
-
- /*! @ingroup histogram
-
- \fn void SimdNormalizedColors(const uint32_t * histogram, uint8_t * colors);
-
- \short Gets normalized color map for given histogram.
-
- \param [in] histogram - a pointer to histogram (array of 256 unsigned 32-bit values).
- \param [out] colors - a pointer to the color map (array of 256 unsigned 8-bit values).
- */
- SIMD_API void SimdNormalizedColors(const uint32_t * histogram, uint8_t * colors);
-
- /*!
@ingroup histogram
-
- \fn void SimdChangeColors(const uint8_t * src, size_t srcStride, size_t width, size_t height, const uint8_t * colors, uint8_t * dst, size_t dstStride);
-
- \short Changes colors for 8-bit gray image using a color map.
-
- The input and output 8-bit gray images must have the same size.
- Algorithm description:
- \verbatim
- for(y = 0; y < height; ++y)
- for(x = 0; x < width; ++x)
- dst[x, y] = colors[src[x, y]];
- \endverbatim
-
- \note This function has a C++ wrapper Simd::ChangeColors(const View & src, const uint8_t * colors, View & dst).
-
- \param [in] src - a pointer to pixels data of input 8-bit gray image.
- \param [in] srcStride - a row size of the image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [in] colors - a pointer to the color map (array of 256 unsigned 8-bit values).
- \param [out] dst - a pointer to pixels data of output 8-bit gray image.
- \param [in] dstStride - a row size of the output gray image.
- */
- SIMD_API void SimdChangeColors(const uint8_t * src, size_t srcStride, size_t width, size_t height, const uint8_t * colors, uint8_t * dst, size_t dstStride);
-
- /*! @ingroup histogram
-
- \fn void SimdNormalizeHistogram(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride);
-
- \short Normalizes histogram for 8-bit gray image.
-
- The input and output 8-bit gray images must have the same size.
-
- \note This function has a C++ wrapper Simd::NormalizeHistogram(const View & src, View & dst).
-
- \param [in] src - a pointer to pixels data of input 8-bit gray image.
- \param [in] srcStride - a row size of the image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [out] dst - a pointer to pixels data of output 8-bit image with normalized histogram.
- \param [in] dstStride - a row size of the output image.
- */
- SIMD_API void SimdNormalizeHistogram(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride);
-
- /*! @ingroup hog
-
- \fn void SimdHogDirectionHistograms(const uint8_t * src, size_t stride, size_t width, size_t height, size_t cellX, size_t cellY, size_t quantization, float * histograms);
-
- \short Calculates HOG direction histograms for 8-bit gray image.
-
- Calculates HOG direction histogram for every cell of 8-bit gray image. This function is useful for face recognition.
-
- \note This function has a C++ wrapper Simd::HogDirectionHistograms(const View & src, const Point & cell, size_t quantization, float * histograms).
-
- \param [in] src - a pointer to pixels data of input 8-bit gray image.
- \param [in] stride - a row size of the image.
- \param [in] width - an image width. It must be a multiple of cellX.
- \param [in] height - an image height. It must be a multiple of cellY.
- \param [in] cellX - a width of cell.
- \param [in] cellY - a height of cell.
- \param [in] quantization - a direction quantization. Must be even.
- \param [out] histograms - a pointer to buffer with histograms. Array must have size greater than or equal to (width/cellX)*(height/cellY)*quantization.
- */
- SIMD_API void SimdHogDirectionHistograms(const uint8_t * src, size_t stride, size_t width, size_t height,
- size_t cellX, size_t cellY, size_t quantization, float * histograms);
-
- /*! @ingroup hog
-
- \fn void SimdHogExtractFeatures(const uint8_t * src, size_t stride, size_t width, size_t height, float * features);
-
- \short Extracts HOG features for 8-bit gray image.
-
- Extracts HOG features for 8-bit gray image. 31 features are extracted for 8x8 cell size and 2x2 block size. This function is useful for face recognition.
-
- \note This function has a C++ wrapper Simd::HogExtractFeatures(const View & src, float * features).
-
- \param [in] src - a pointer to pixels data of input 8-bit gray image.
- \param [in] stride - a row size of the image.
- \param [in] width - an image width. It must be a multiple of 8. Its minimal value is 16.
- \param [in] height - an image height. It must be a multiple of 8. Its minimal value is 16.
- \param [out] features - a pointer to buffer with features. Array must have size greater than or equal to (width/8)*(height/8)*31.
- */
- SIMD_API void SimdHogExtractFeatures(const uint8_t * src, size_t stride, size_t width, size_t height, float * features);
-
- /*! @ingroup hog
-
- \fn void SimdHogDeinterleave(const float * src, size_t srcStride, size_t width, size_t height, size_t count, float ** dst, size_t dstStride);
-
- \short Separates one interleaved 32-bit float point image into separate planes.
-
- \param [in] src - a pointer to the input interleaved 32-bit float point image.
- \param [in] srcStride - a row size of input image.
- \param [in] width - a width of input and output images.
- \param [in] height - a height of input and output images.
- \param [in] count - the number of output planes.
- \param [out] dst - a pointer to array with pointers to output planes.
- \param [in] dstStride - a row size of output images.
- */
- SIMD_API void SimdHogDeinterleave(const float * src, size_t srcStride, size_t width, size_t height, size_t count, float ** dst, size_t dstStride);
-
- /*! @ingroup hog
-
- \fn void SimdHogFilterSeparable(const float * src, size_t srcStride, size_t width, size_t height, const float * rowFilter, size_t rowSize, const float * colFilter, size_t colSize, float * dst, size_t dstStride, int add);
-
- \short Applies separable filter to given image of 32-bit float point format.
-
- For every point (except border):
- \verbatim
- sum = 0;
- for(dy = 0; dy < colSize; dy++)
- for(dx = 0; dx < rowSize; dx++)
- sum += src[x + dx, y + dy]*colFilter[dy]*rowFilter[dx];
- if(add)
- dst[x, y] += sum;
- else
- dst[x, y] = sum;
- \endverbatim
-
- \note The size of the input image must be not less than the size of the filter: (width >= rowSize and height >= colSize).
-
- \param [in] src - a pointer to input 32-bit float point image.
- \param [in] srcStride - a row size of input image.
- \param [in] width - a width of input image. It must be not less than the size of the row filter.
- \param [in] height - a height of input image. It must be not less than the size of the column filter.
- \param [in] rowFilter - a pointer to 32-bit float point array with row filter.
- \param [in] rowSize - a size of row filter.
- \param [in] colFilter - a pointer to 32-bit float point array with column filter.
- \param [in] colSize - a size of column filter.
- \param [in, out] dst - a pointer to output 32-bit float point image.
- \param [in] dstStride - a row size of output image.
- \param [in] add - a flag which signals that the result has to be added to the existing image.
- */
- SIMD_API void SimdHogFilterSeparable(const float * src, size_t srcStride, size_t width, size_t height, const float * rowFilter, size_t rowSize, const float * colFilter, size_t colSize, float * dst, size_t dstStride, int add);
-
- /*!
@ingroup hog
-
- \fn void SimdHogLiteExtractFeatures(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t cell, float * features, size_t featuresStride);
-
- \short Extracts lite HOG features for 8-bit gray image.
-
- Extracts lite (for 8 directions) HOG features for 8-bit gray image. 16 features are extracted for 8x8 or 4x4 cell size and 2x2 block size.
-
- \note This function has a C++ wrapper Simd::HogLiteExtractFeatures(const View & src, size_t cell, float * features, size_t featuresStride).
-
- \param [in] src - a pointer to pixels data of input 8-bit gray image.
- \param [in] srcStride - a row size of the image.
- \param [in] width - an image width. Its minimal value is cell*3.
- \param [in] height - an image height. Its minimal value is cell*3.
- \param [in] cell - a size of cell. It must be 4 or 8.
- \param [out] features - a pointer to buffer with features. Array must have size greater than or equal to (height/cell - 2)*featuresStride.
- \param [in] featuresStride - a row size of the buffer with features. It must be greater than or equal to (width/cell - 2)*16.
- */
- SIMD_API void SimdHogLiteExtractFeatures(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t cell, float * features, size_t featuresStride);
-
- /*! @ingroup hog
-
- \fn void SimdHogLiteFilterFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride);
-
- \short Applies filter to lite HOG features.
-
- Applies a rectangular filter to lite HOG features.
-
- For every point of output image:
- \verbatim
- if(mask[x, y])
- {
- sum = 0;
- for(dy = 0; dy < filterHeight; dy++)
- for(dx = 0; dx < filterWidth*featureSize; dx++)
- sum += src[x*featureSize + dx, y + dy]*filter[dx, dy];
- dst[x, y] = sum;
- }
- else
- dst[x, y] = -FLT_MAX;
- \endverbatim
-
- \param [in] src - a pointer to the input 32-bit float array with features.
- \param [in] srcStride - a row size of input array with features.
- \param [in] srcWidth - a width of input array with features. Its minimal value is filterWidth.
- \param [in] srcHeight - a height of input array with features. Its minimal value is filterHeight.
- \param [in] featureSize - a size of cell with features. It must be 8 or 16.
- \param [in] filter - a pointer to the 32-bit float array with filter values.
- Array must have size equal to filterWidth*filterHeight*featureSize.
- \param [in] filterWidth - a width of used filter.
- \param [in] filterHeight - a height of used filter.
- \param [in] mask - a pointer to the 32-bit integer array with mask (0 or -1).
- The pointer can be NULL; otherwise the array must have size greater than (srcHeight - filterHeight)*(srcWidth - filterWidth).
- The function ::SimdHogLiteCreateMask is useful in order to create this mask.
- \param [in] maskStride - a row size of mask array.
- \param [out] dst - a pointer to output buffer with result of filtration. Array must have size greater than (srcHeight - filterHeight)*(srcWidth - filterWidth).
- \param [in] dstStride - a row size of the output buffer with result of filtration.
- */
- SIMD_API void SimdHogLiteFilterFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride);
-
- /*!
@ingroup hog
-
- \fn void SimdHogLiteResizeFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, float * dst, size_t dstStride, size_t dstWidth, size_t dstHeight);
-
- \short Resizes 2D-array with lite HOG features.
-
- Resizes 2D-array with lite HOG features. It uses bilinear interpolation.
-
- \param [in] src - a pointer to the input 32-bit float array with features.
- \param [in] srcStride - a row size of input array with features.
- \param [in] srcWidth - a width of input array with features.
- \param [in] srcHeight - a height of input array with features.
- \param [in] featureSize - a size of cell with features. It must be 8 or 16.
- \param [out] dst - a pointer to the output 32-bit float array with features.
- \param [in] dstStride - a row size of output array with features.
- \param [in] dstWidth - a width of output array with features.
- \param [in] dstHeight - a height of output array with features.
- */
- SIMD_API void SimdHogLiteResizeFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, float * dst, size_t dstStride, size_t dstWidth, size_t dstHeight);
-
- /*! @ingroup hog
-
- \fn void SimdHogLiteCompressFeatures(const float * src, size_t srcStride, size_t width, size_t height, const float * pca, float * dst, size_t dstStride);
-
- \short Compresses 16 features to 8 features for 2D-array.
-
- Compresses 16 features to 8 features for 2D-array. The method uses PCA.
-
- \param [in] src - a pointer to the input 32-bit float array with uncompressed features.
- \param [in] srcStride - a row size of input array with uncompressed features.
- \param [in] width - a width of 2D-array with features.
- \param [in] height - a height of 2D-array with features.
- \param [in] pca - a pointer to the PCA matrix with size 16x8.
- \param [out] dst - a pointer to the output 32-bit float array with compressed features.
- \param [in] dstStride - a row size of output array with compressed features.
- */
- SIMD_API void SimdHogLiteCompressFeatures(const float * src, size_t srcStride, size_t width, size_t height, const float * pca, float * dst, size_t dstStride);
-
- /*! @ingroup hog
-
- \fn void SimdHogLiteFilterSeparable(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * hFilter, size_t hSize, const float * vFilter, size_t vSize, float * dst, size_t dstStride, int add);
-
- \short Applies separable filter to lite HOG features.
-
- For every point (except border):
- \verbatim
- sum = 0;
- for(dy = 0; dy < vSize; dy++)
- for(dx = 0; dx < hSize*featureSize; dx++)
- sum += src[x*featureSize + dx, y + dy]*vFilter[dy]*hFilter[dx];
- if(add)
- dst[x, y] += sum;
- else
- dst[x, y] = sum;
- \endverbatim
-
- \note The size of the input image must be not less than the size of the filter: (srcWidth >= hSize and srcHeight >= vSize).
-
- \param [in] src - a pointer to the input 32-bit float array with features.
- \param [in] srcStride - a row size of input array with features.
- \param [in] srcWidth - a width of input array with features. Its minimal value is hSize.
- \param [in] srcHeight - a height of input array with features. Its minimal value is vSize.
- \param [in] featureSize - a size of cell with features. It must be 8 or 16.
- \param [in] hFilter - a pointer to 32-bit float point array with horizontal filter.
- \param [in] hSize - a size of horizontal filter (in featureSize units). Total size of horizontal filter is hSize*featureSize.
- \param [in] vFilter - a pointer to the 32-bit float array with vertical filter.
- \param [in] vSize - a size of vertical filter.
- \param [in, out] dst - a pointer to output 32-bit float image.
- \param [in] dstStride - a row size of output image.
- \param [in] add - a flag indicating that the result must be added to the existing image.
- */
- SIMD_API void SimdHogLiteFilterSeparable(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * hFilter, size_t hSize, const float * vFilter, size_t vSize, float * dst, size_t dstStride, int add);
-
- /*! @ingroup hog
-
- \fn void SimdHogLiteFindMax7x7(const float * a, size_t aStride, const float * b, size_t bStride, size_t height, float * value, size_t * col, size_t * row);
-
- \short Adds two 32-bit float 2D-arrays of size 7x7 and finds the value and position of the maximum in the resulting array.
-
- Algorithm description:
- \verbatim
- value = -FLT_MAX;
- for (y = 0; y < height; ++y)
- {
-     for (x = 0; x < 7; ++x)
-     {
-         v = a[x, y] + b[x, y];
-         if (v > value)
-         {
-             value = v;
-             col = x;
-             row = y;
-         }
-     }
- }
- \endverbatim
-
- \param [in] a - a pointer to the first input 32-bit float array with size 7x7.
- \param [in] aStride - a row size of the first input array.
- \param [in] b - a pointer to the second input 32-bit float array with size 7x7.
- \param [in] bStride - a row size of the second input array.
- \param [in] height - a height of the input arrays. It must be less than or equal to 7.
- \param [out] value - a pointer to the output 32-bit float value with maximum.
- \param [out] col - a pointer to the output integer value with x-position of maximum.
- \param [out] row - a pointer to the output integer value with y-position of maximum.
- */
- SIMD_API void SimdHogLiteFindMax7x7(const float * a, size_t aStride, const float * b, size_t bStride, size_t height, float * value, size_t * col, size_t * row);
-
- /*! @ingroup hog
-
- \fn void SimdHogLiteCreateMask(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, const float * threshold, size_t scale, size_t size, uint32_t * dst, size_t dstStride);
-
- \short Creates mask for function ::SimdHogLiteFilterFeatures.
-
- Zeroes the destination mask, then for every source point:
- \verbatim
- if(src[x, y] > threshold)
-     for (dy = 0; dy < size; ++dy)
-         for (dx = 0; dx < size; ++dx)
-             dst[x*scale + dx, y*scale + dy] = -1;
- \endverbatim
-
- \param [in] src - a pointer to the input 32-bit float 2D array.
- \param [in] srcStride - a row size of the input array.
- \param [in] srcWidth - a width of input array.
- \param [in] srcHeight - a height of input array.
- \param [in] threshold - a pointer to 32-bit float threshold.
- \param [in] scale - a scale coefficient between input and output array.
- \param [in] size - a size of neighborhood.
- \param [out] dst - a pointer to the output 32-bit integer array with mask (0 or -1).
- \param [in] dstStride - a row size of the output array.
- */
- SIMD_API void SimdHogLiteCreateMask(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, const float * threshold, size_t scale, size_t size, uint32_t * dst, size_t dstStride);
-
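- ::SimdHogLiteCreateMask is designed to feed ::SimdHogLiteFilterFeatures: a coarse score map is thresholded into a mask that restricts where the more expensive filtering is evaluated. A minimal sketch (all sizes and the input arrays score, features and filter are assumptions of this example, not library requirements):
- \verbatim
- #include "Simd/SimdLib.h"
- #include <stdlib.h>
-
- void MaskedFilterExample(const float * score, const float * features, const float * filter)
- {
-     size_t w = 80, h = 60, featureSize = 16, fw = 8, fh = 8; /* assumed sizes */
-     float threshold = 0.5f;
-     uint32_t * mask = (uint32_t *)calloc(w * h, sizeof(uint32_t));
-     float * dst = (float *)malloc(w * h * sizeof(float));
-     /* scale = 1 and size = 1 mark exactly the points that exceed the threshold. */
-     SimdHogLiteCreateMask(score, w, w, h, &threshold, 1, 1, mask, w);
-     SimdHogLiteFilterFeatures(features, w * featureSize, w, h, featureSize,
-         filter, fw, fh, mask, w, dst, w);
-     free(mask);
-     free(dst);
- }
- \endverbatim
-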
- /*! @ingroup other_conversion
-
- \fn void SimdInt16ToGray(const uint8_t * src, size_t width, size_t height, size_t srcStride, uint8_t * dst, size_t dstStride);
-
- \short Converts 16-bit signed integer image to 8-bit gray image with saturation.
-
- All images must have the same width and height.
-
- For every point:
- \verbatim
- dst[i] = Max(0, Min(255, src[i]));
- \endverbatim
-
- \note This function has a C++ wrapper Simd::Int16ToGray(const View & src, View & dst).
-
- \param [in] src - a pointer to pixels data of input 16-bit signed integer image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [in] srcStride - a row size of the 16-bit signed integer image.
- \param [out] dst - a pointer to pixels data of output 8-bit gray image.
- \param [in] dstStride - a row size of the gray image.
- */
- SIMD_API void SimdInt16ToGray(const uint8_t * src, size_t width, size_t height, size_t srcStride, uint8_t * dst, size_t dstStride);
-
- /*! @ingroup integral
-
- \fn void SimdIntegral(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * sum, size_t sumStride, uint8_t * sqsum, size_t sqsumStride, uint8_t * tilted, size_t tiltedStride, SimdPixelFormatType sumFormat, SimdPixelFormatType sqsumFormat);
-
- \short Calculates integral images for input 8-bit gray image.
-
- The function can calculate the sum integral image and, optionally, the square sum and tilted sum integral images.
- The integral images must have width and height one unit greater than those of the input image.
-
- \note This function has C++ wrappers:
- \n Simd::Integral(const View& src, View& sum),
- \n Simd::Integral(const View& src, View& sum, View& sqsum),
- \n Simd::Integral(const View& src, View& sum, View& sqsum, View& tilted).
-
- \param [in] src - a pointer to pixels data of input 8-bit gray image.
- \param [in] srcStride - a row size of src image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [out] sum - a pointer to pixels data of 32-bit integer sum image.
- \param [in] sumStride - a row size of sum image (in bytes).
- \param [out] sqsum - a pointer to pixels data of 32-bit integer or 64-bit floating-point square sum image. It can be NULL.
- \param [in] sqsumStride - a row size of sqsum image (in bytes).
- \param [out] tilted - a pointer to pixels data of 32-bit integer tilted sum image. It can be NULL.
- \param [in] tiltedStride - a row size of tilted image (in bytes).
- \param [in] sumFormat - a format of sum image and tilted image. It can be equal to ::SimdPixelFormatInt32.
- \param [in] sqsumFormat - a format of sqsum image. It can be equal to ::SimdPixelFormatInt32 or ::SimdPixelFormatDouble.
- */
- SIMD_API void SimdIntegral(const uint8_t * src, size_t srcStride, size_t width, size_t height,
-     uint8_t * sum, size_t sumStride, uint8_t * sqsum, size_t sqsumStride, uint8_t * tilted, size_t tiltedStride,
-     SimdPixelFormatType sumFormat, SimdPixelFormatType sqsumFormat);
-
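- One common use of the sum image is constant-time box sums: because the sum image has one extra row and column, the sum over the rectangle [x0, x1) x [y0, y1) needs only four lookups. A minimal sketch (the helper below is illustrative, not a library function):
- \verbatim
- #include <stdint.h>
- #include <stddef.h>
-
- /* 'sum' is the 32-bit integer sum image produced by SimdIntegral;
-    sumStride is its row size in bytes, as documented above. */
- uint32_t BoxSum(const uint8_t * sum, size_t sumStride,
-     size_t x0, size_t y0, size_t x1, size_t y1)
- {
-     const uint32_t * row0 = (const uint32_t *)(sum + y0 * sumStride);
-     const uint32_t * row1 = (const uint32_t *)(sum + y1 * sumStride);
-     return row1[x1] - row0[x1] - row1[x0] + row0[x0];
- }
- \endverbatim
-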
- /*! @ingroup interference
-
- \fn void SimdInterferenceIncrement(uint8_t * statistic, size_t stride, size_t width, size_t height, uint8_t increment, int16_t saturation);
-
- \short Increments statistic of interference detector.
-
- For every point:
- \verbatim
- statistic[i] = min(statistic[i] + increment, saturation);
- \endverbatim
-
- This function is used for interference detection in a motion detection algorithm.
-
- \note This function has a C++ wrapper: Simd::InterferenceIncrement(View & dst, uint8_t increment, int16_t saturation).
-
- \param [in, out] statistic - a pointer to pixels data of 16-bit signed integer image with statistic.
- \param [in] stride - a row size of statistic image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [in] increment - an increment of statistic.
- \param [in] saturation - an upper saturation of statistic.
- */
- SIMD_API void SimdInterferenceIncrement(uint8_t * statistic, size_t stride, size_t width, size_t height, uint8_t increment, int16_t saturation);
-
- /*! @ingroup interference
-
- \fn void SimdInterferenceIncrementMasked(uint8_t * statistic, size_t statisticStride, size_t width, size_t height, uint8_t increment, int16_t saturation, const uint8_t * mask, size_t maskStride, uint8_t index);
-
- \short Increments statistic of interference detector using a segmentation mask.
-
- For every point:
- \verbatim
- if(mask[i] == index)
-     statistic[i] = min(statistic[i] + increment, saturation);
- \endverbatim
-
- All images must have the same width and height.
- This function is used for interference detection in a motion detection algorithm.
-
- \note This function has a C++ wrapper: Simd::InterferenceIncrementMasked(View & dst, uint8_t increment, int16_t saturation, const View& mask, uint8_t index).
-
- \param [in, out] statistic - a pointer to pixels data of 16-bit signed integer image with statistic.
- \param [in] statisticStride - a row size of statistic image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [in] increment - an increment of statistic.
- \param [in] saturation - an upper saturation of statistic.
- \param [in] mask - a pointer to pixels data of 8-bit gray image with mask.
- \param [in] maskStride - a row size of mask image.
- \param [in] index - an index of mask.
- */
- SIMD_API void SimdInterferenceIncrementMasked(uint8_t * statistic, size_t statisticStride, size_t width, size_t height,
-     uint8_t increment, int16_t saturation, const uint8_t * mask, size_t maskStride, uint8_t index);
-
- /*! @ingroup interference
-
- \fn void SimdInterferenceDecrement(uint8_t * statistic, size_t stride, size_t width, size_t height, uint8_t decrement, int16_t saturation);
-
- \short Decrements statistic of interference detector.
-
- For every point:
- \verbatim
- statistic[i] = max(statistic[i] - decrement, saturation);
- \endverbatim
-
- This function is used for interference detection in a motion detection algorithm.
-
- \note This function has a C++ wrapper: Simd::InterferenceDecrement(View & dst, uint8_t decrement, int16_t saturation).
-
- \param [in, out] statistic - a pointer to pixels data of 16-bit signed integer image with statistic.
- \param [in] stride - a row size of statistic image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [in] decrement - a decrement of statistic.
- \param [in] saturation - a lower saturation of statistic.
- */
- SIMD_API void SimdInterferenceDecrement(uint8_t * statistic, size_t stride, size_t width, size_t height, uint8_t decrement, int16_t saturation);
-
- /*! @ingroup interference
-
- \fn void SimdInterferenceDecrementMasked(uint8_t * statistic, size_t statisticStride, size_t width, size_t height, uint8_t decrement, int16_t saturation, const uint8_t * mask, size_t maskStride, uint8_t index);
-
- \short Decrements statistic of interference detector using a segmentation mask.
-
- For every point:
- \verbatim
- if(mask[i] == index)
-     statistic[i] = max(statistic[i] - decrement, saturation);
- \endverbatim
-
- All images must have the same width and height.
- This function is used for interference detection in a motion detection algorithm.
-
- \note This function has a C++ wrapper: Simd::InterferenceDecrementMasked(View & dst, uint8_t decrement, int16_t saturation, const View& mask, uint8_t index).
-
- \param [in, out] statistic - a pointer to pixels data of 16-bit signed integer image with statistic.
- \param [in] statisticStride - a row size of statistic image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [in] decrement - a decrement of statistic.
- \param [in] saturation - a lower saturation of statistic.
- \param [in] mask - a pointer to pixels data of 8-bit gray image with mask.
- \param [in] maskStride - a row size of mask image.
- \param [in] index - an index of mask.
- */
- SIMD_API void SimdInterferenceDecrementMasked(uint8_t * statistic, size_t statisticStride, size_t width, size_t height,
-     uint8_t decrement, int16_t saturation, const uint8_t * mask, size_t maskStride, uint8_t index);
-
- /*! @ingroup other_conversion
-
- \fn void SimdInterleaveUv(const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, size_t width, size_t height, uint8_t * uv, size_t uvStride);
-
- \short Interleaves 8-bit U and V planar images into one 16-bit UV interleaved image.
-
- All images must have the same width and height.
- This function is used for YUV420P to NV12 conversion.
-
- \note This function has a C++ wrapper Simd::InterleaveUv(const View& u, const View& v, View& uv).
-
- \param [in] u - a pointer to pixels data of input 8-bit U planar image.
- \param [in] uStride - a row size of the u image.
- \param [in] v - a pointer to pixels data of input 8-bit V planar image.
- \param [in] vStride - a row size of the v image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [out] uv - a pointer to pixels data of output 16-bit UV interleaved image.
- \param [in] uvStride - a row size of the uv image.
- */
- SIMD_API void SimdInterleaveUv(const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, size_t width, size_t height, uint8_t * uv, size_t uvStride);
-
- /*! @ingroup other_conversion
-
- \fn void SimdInterleaveBgr(const uint8_t * b, size_t bStride, const uint8_t * g, size_t gStride, const uint8_t * r, size_t rStride, size_t width, size_t height, uint8_t * bgr, size_t bgrStride);
-
- \short Interleaves 8-bit Blue, Green and Red planar images into one 24-bit BGR interleaved image.
-
- All images must have the same width and height.
-
- \note This function has a C++ wrapper Simd::InterleaveBgr(const View& b, const View& g, const View& r, View& bgr).
-
- \param [in] b - a pointer to pixels data of input 8-bit Blue planar image.
- \param [in] bStride - a row size of the b image.
- \param [in] g - a pointer to pixels data of input 8-bit Green planar image.
- \param [in] gStride - a row size of the g image.
- \param [in] r - a pointer to pixels data of input 8-bit Red planar image.
- \param [in] rStride - a row size of the r image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [out] bgr - a pointer to pixels data of output 24-bit BGR interleaved image.
- \param [in] bgrStride - a row size of the bgr image.
- */
- SIMD_API void SimdInterleaveBgr(const uint8_t * b, size_t bStride, const uint8_t * g, size_t gStride, const uint8_t * r, size_t rStride,
-     size_t width, size_t height, uint8_t * bgr, size_t bgrStride);
-
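- As an illustration, the sketch below (sizes assumed, planes densely packed) reassembles a 24-bit BGR image from three separate planes, e.g. after processing each channel independently:
- \verbatim
- #include "Simd/SimdLib.h"
- #include <stdlib.h>
-
- void InterleaveExample(const uint8_t * b, const uint8_t * g, const uint8_t * r)
- {
-     size_t width = 320, height = 240; /* assumed geometry */
-     uint8_t * bgr = (uint8_t *)malloc(width * height * 3);
-     SimdInterleaveBgr(b, width, g, width, r, width, width, height, bgr, width * 3);
-     /* ... use bgr ... */
-     free(bgr);
- }
- \endverbatim
-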
- /*! @ingroup other_conversion
-
- \fn void SimdInterleaveBgra(const uint8_t * b, size_t bStride, const uint8_t * g, size_t gStride, const uint8_t * r, size_t rStride, const uint8_t * a, size_t aStride, size_t width, size_t height, uint8_t * bgra, size_t bgraStride);
-
- \short Interleaves 8-bit Blue, Green, Red and Alpha planar images into one 32-bit BGRA interleaved image.
-
- All images must have the same width and height.
-
- \note This function has a C++ wrapper Simd::InterleaveBgra(const View& b, const View& g, const View& r, const View& a, View& bgra).
-
- \param [in] b - a pointer to pixels data of input 8-bit Blue planar image.
- \param [in] bStride - a row size of the b image.
- \param [in] g - a pointer to pixels data of input 8-bit Green planar image.
- \param [in] gStride - a row size of the g image.
- \param [in] r - a pointer to pixels data of input 8-bit Red planar image.
- \param [in] rStride - a row size of the r image.
- \param [in] a - a pointer to pixels data of input 8-bit Alpha planar image.
- \param [in] aStride - a row size of the a image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [out] bgra - a pointer to pixels data of output 32-bit BGRA interleaved image.
- \param [in] bgraStride - a row size of the bgra image.
- */
- SIMD_API void SimdInterleaveBgra(const uint8_t * b, size_t bStride, const uint8_t * g, size_t gStride, const uint8_t * r, size_t rStride, const uint8_t * a, size_t aStride,
-     size_t width, size_t height, uint8_t * bgra, size_t bgraStride);
-
- /*! @ingroup laplace_filter
-
- \fn void SimdLaplace(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride);
-
- \short Calculates Laplace's filter.
-
- All images must have the same width and height. The input image must have 8-bit gray format, the output image must have 16-bit integer format.
-
- For every point:
- \verbatim
- dst[x, y] =
- - src[x-1, y-1] - src[x, y-1] - src[x+1, y-1]
- - src[x-1, y] + 8*src[x, y] - src[x+1, y]
- - src[x-1, y+1] - src[x, y+1] - src[x+1, y+1].
- \endverbatim
-
- \note This function has a C++ wrapper: Simd::Laplace(const View& src, View& dst).
-
- \param [in] src - a pointer to pixels data of the input image.
- \param [in] srcStride - a row size of the input image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [out] dst - a pointer to pixels data of the output image.
- \param [in] dstStride - a row size of the output image (in bytes).
- */
- SIMD_API void SimdLaplace(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride);
-
- /*! @ingroup laplace_filter
-
- \fn void SimdLaplaceAbs(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride);
-
- \short Calculates absolute value of Laplace's filter.
-
- All images must have the same width and height. The input image must have 8-bit gray format, the output image must have 16-bit integer format.
-
- For every point:
- \verbatim
- dst[x, y] = abs(
- - src[x-1, y-1] - src[x, y-1] - src[x+1, y-1]
- - src[x-1, y] + 8*src[x, y] - src[x+1, y]
- - src[x-1, y+1] - src[x, y+1] - src[x+1, y+1]).
- \endverbatim
-
- \note This function has a C++ wrapper: Simd::LaplaceAbs(const View& src, View& dst).
-
- \param [in] src - a pointer to pixels data of the input image.
- \param [in] srcStride - a row size of the input image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [out] dst - a pointer to pixels data of the output image.
- \param [in] dstStride - a row size of the output image (in bytes).
- */
- SIMD_API void SimdLaplaceAbs(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride);
-
- /*! @ingroup other_statistic
-
- \fn void SimdLaplaceAbsSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum);
-
- \short Calculates sum of absolute values of Laplace's filter.
-
- The input image must have 8-bit gray format.
-
- For every point:
- \verbatim
- sum += abs(
- - src[x-1, y-1] - src[x, y-1] - src[x+1, y-1]
- - src[x-1, y] + 8*src[x, y] - src[x+1, y]
- - src[x-1, y+1] - src[x, y+1] - src[x+1, y+1]).
- \endverbatim
-
- \note This function has a C++ wrapper: Simd::LaplaceAbsSum(const View& src, uint64_t & sum).
-
- \param [in] src - a pointer to pixels data of the input image.
- \param [in] stride - a row size of the input image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [out] sum - a pointer to result sum.
- */
- SIMD_API void SimdLaplaceAbsSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum);
-
- /*! @ingroup other_filter
-
- \fn void SimdLbpEstimate(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride);
-
- \short Calculates LBP (Local Binary Patterns) for 8-bit gray image.
-
- All images must have the same width and height.
-
- \note This function has a C++ wrapper: Simd::LbpEstimate(const View& src, View& dst).
-
- \param [in] src - a pointer to pixels data of input 8-bit gray image.
- \param [in] srcStride - a row size of src image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [out] dst - a pointer to pixels data of output 8-bit gray image with LBP.
- \param [in] dstStride - a row size of dst image.
- */
- SIMD_API void SimdLbpEstimate(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride);
-
- /*! @ingroup other_filter
-
- \fn void SimdMeanFilter3x3(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t channelCount, uint8_t * dst, size_t dstStride);
-
- \short Performs an averaging with window 3x3.
-
- For every point:
- \verbatim
- dst[x, y] = (src[x-1, y-1] + src[x, y-1] + src[x+1, y-1] +
-              src[x-1, y] + src[x, y] + src[x+1, y] +
-              src[x-1, y+1] + src[x, y+1] + src[x+1, y+1] + 4) / 9;
- \endverbatim
-
- All images must have the same width, height and format (8-bit gray, 16-bit UV, 24-bit BGR or 32-bit BGRA).
-
- \note This function has a C++ wrapper Simd::MeanFilter3x3(const View& src, View& dst).
-
- \param [in] src - a pointer to pixels data of source image.
- \param [in] srcStride - a row size of the src image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [in] channelCount - a channel count.
- \param [out] dst - a pointer to pixels data of destination image.
- \param [in] dstStride - a row size of the dst image.
- */
- SIMD_API void SimdMeanFilter3x3(const uint8_t * src, size_t srcStride, size_t width, size_t height,
-     size_t channelCount, uint8_t * dst, size_t dstStride);
-
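- A typical application of ::SimdLaplaceAbsSum documented above is a cheap focus (sharpness) measure: the average absolute Laplacian response grows with image sharpness. A sketch, normalizing by the pixel count (the helper is illustrative, not a library function):
- \verbatim
- #include "Simd/SimdLib.h"
- #include <stdint.h>
-
- double SharpnessScore(const uint8_t * gray, size_t stride, size_t width, size_t height)
- {
-     uint64_t sum = 0;
-     SimdLaplaceAbsSum(gray, stride, width, height, &sum);
-     return (double)sum / (double)(width * height);
- }
- \endverbatim
-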
- /*! @ingroup median_filter
-
- \fn void SimdMedianFilterRhomb3x3(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t channelCount, uint8_t * dst, size_t dstStride);
-
- \short Performs median filtration of input image (filter window is a rhomb 3x3).
-
- All images must have the same width, height and format (8-bit gray, 16-bit UV, 24-bit BGR or 32-bit BGRA).
-
- \note This function has a C++ wrapper: Simd::MedianFilterRhomb3x3(const View& src, View& dst).
-
- \param [in] src - a pointer to pixels data of original input image.
- \param [in] srcStride - a row size of src image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [in] channelCount - a channel count.
- \param [out] dst - a pointer to pixels data of filtered output image.
- \param [in] dstStride - a row size of dst image.
- */
- SIMD_API void SimdMedianFilterRhomb3x3(const uint8_t * src, size_t srcStride, size_t width, size_t height,
-     size_t channelCount, uint8_t * dst, size_t dstStride);
-
- /*! @ingroup median_filter
-
- \fn void SimdMedianFilterRhomb5x5(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t channelCount, uint8_t * dst, size_t dstStride);
-
- \short Performs median filtration of input image (filter window is a rhomb 5x5).
-
- All images must have the same width, height and format (8-bit gray, 16-bit UV, 24-bit BGR or 32-bit BGRA).
-
- \note This function has a C++ wrapper: Simd::MedianFilterRhomb5x5(const View& src, View& dst).
-
- \param [in] src - a pointer to pixels data of original input image.
- \param [in] srcStride - a row size of src image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [in] channelCount - a channel count.
- \param [out] dst - a pointer to pixels data of filtered output image.
- \param [in] dstStride - a row size of dst image.
- */
- SIMD_API void SimdMedianFilterRhomb5x5(const uint8_t * src, size_t srcStride, size_t width, size_t height,
-     size_t channelCount, uint8_t * dst, size_t dstStride);
-
- /*! @ingroup median_filter
-
- \fn void SimdMedianFilterSquare3x3(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t channelCount, uint8_t * dst, size_t dstStride);
-
- \short Performs median filtration of input image (filter window is a square 3x3).
-
- All images must have the same width, height and format (8-bit gray, 16-bit UV, 24-bit BGR or 32-bit BGRA).
-
- \note This function has a C++ wrapper: Simd::MedianFilterSquare3x3(const View& src, View& dst).
-
- \param [in] src - a pointer to pixels data of original input image.
- \param [in] srcStride - a row size of src image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [in] channelCount - a channel count.
- \param [out] dst - a pointer to pixels data of filtered output image.
- \param [in] dstStride - a row size of dst image.
- */
- SIMD_API void SimdMedianFilterSquare3x3(const uint8_t * src, size_t srcStride, size_t width, size_t height,
-     size_t channelCount, uint8_t * dst, size_t dstStride);
-
- /*! @ingroup median_filter
-
- \fn void SimdMedianFilterSquare5x5(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t channelCount, uint8_t * dst, size_t dstStride);
-
- \short Performs median filtration of input image (filter window is a square 5x5).
-
- All images must have the same width, height and format (8-bit gray, 16-bit UV, 24-bit BGR or 32-bit BGRA).
-
- \note This function has a C++ wrapper: Simd::MedianFilterSquare5x5(const View& src, View& dst).
-
- \param [in] src - a pointer to pixels data of original input image.
- \param [in] srcStride - a row size of src image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [in] channelCount - a channel count.
- \param [out] dst - a pointer to pixels data of filtered output image.
- \param [in] dstStride - a row size of dst image.
- */
- SIMD_API void SimdMedianFilterSquare5x5(const uint8_t * src, size_t srcStride, size_t width, size_t height,
-     size_t channelCount, uint8_t * dst, size_t dstStride);
-
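- For instance, a 3x3 square median is a common way to suppress salt-and-pepper noise before further processing; a sketch for an 8-bit gray image (buffer management is assumed to be handled by the caller):
- \verbatim
- #include "Simd/SimdLib.h"
-
- void DespeckleGray(const uint8_t * src, size_t srcStride,
-     size_t width, size_t height, uint8_t * dst, size_t dstStride)
- {
-     /* channelCount = 1 selects the 8-bit gray layout. */
-     SimdMedianFilterSquare3x3(src, srcStride, width, height, 1, dst, dstStride);
- }
- \endverbatim
-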
- /*! @ingroup neural
-
- \fn void SimdNeuralConvert(const uint8_t * src, size_t srcStride, size_t width, size_t height, float * dst, size_t dstStride, int inversion);
-
- \short Converts an 8-bit gray image to the 32-bit float array.
-
- The length of the output array must be equal to the area of the input image.
-
- For every point:
- \verbatim
- dst[i] = inversion ? (255 - src[i])/255 : src[i]/255;
- \endverbatim
-
- \note This function has a C++ wrapper Simd::NeuralConvert(const View& src, float * dst, bool inversion).
-
- \param [in] src - a pointer to pixels data of input image.
- \param [in] srcStride - a row size (in bytes) of the image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [out] dst - a pointer to output array.
- \param [in] dstStride - a row size of the output array.
- \param [in] inversion - a flag of color inversion.
- */
- SIMD_API void SimdNeuralConvert(const uint8_t * src, size_t srcStride, size_t width, size_t height, float * dst, size_t dstStride, int inversion);
-
- /*! @ingroup neural
-
- \fn void SimdNeuralRoughSigmoid(const float * src, size_t size, const float * slope, float * dst);
-
- \short Calculates rough sigmoid for 32-bit float array.
-
- All arrays must have the same size.
-
- For every element:
- \verbatim
- x = ::abs(src[i]*slope);
- e = 1 + x + x*x*0.5417 + x*x*x*x*0.1460;
- dst[i] = 1 / (1 + (src[i] > 0 ? 1 / e : e));
- \endverbatim
- This is an approximation (maximal absolute error is 0.002294, ~0.23%) of the sigmoid function (::SimdSynetSigmoid32f):
- \verbatim
- dst[i] = 1/(1 + exp(-slope*src[i]));
- \endverbatim
-
- \note This function is used in Simd::Neural::Function.
-
- \param [in] src - a pointer to the input array.
- \param [in] size - a size of arrays.
- \param [in] slope - a pointer to the slope parameter.
- \param [out] dst - a pointer to output array.
- */
- SIMD_API void SimdNeuralRoughSigmoid(const float * src, size_t size, const float * slope, float * dst);
-
- /*! @ingroup neural
-
- \fn void SimdNeuralRoughSigmoid2(const float * src, size_t size, const float * slope, float * dst);
-
- \short Calculates rough sigmoid for 32-bit float array.
-
- All arrays must have the same size.
-
- For every element:
- \verbatim
- x = -src[i]*slope;
- e = max(1 + x/128, 0.5)^128;
- dst[i] = 1 / (1 + e);
- \endverbatim
- This is an approximation (maximal absolute error is 0.001721, ~0.17%) of the sigmoid function (::SimdSynetSigmoid32f):
- \verbatim
- dst[i] = 1/(1 + exp(-slope*src[i]));
- \endverbatim
-
- \note This function is used in Simd::Neural::Function.
-
- \param [in] src - a pointer to the input array.
- \param [in] size - a size of arrays.
- \param [in] slope - a pointer to the slope parameter.
- \param [out] dst - a pointer to output array.
- */
- SIMD_API void SimdNeuralRoughSigmoid2(const float * src, size_t size, const float * slope, float * dst);
-
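- To make the approximation concrete, here is a scalar C transcription of the ::SimdNeuralRoughSigmoid formula above (illustrative only; the library routine itself is vectorized):
- \verbatim
- #include <math.h>
- #include <stddef.h>
-
- void RoughSigmoidScalar(const float * src, size_t size, float slope, float * dst)
- {
-     for (size_t i = 0; i < size; ++i)
-     {
-         float x = fabsf(src[i] * slope);
-         float e = 1.0f + x + x * x * 0.5417f + x * x * x * x * 0.1460f;
-         /* For positive inputs this equals e/(e + 1) ~ sigmoid(x). */
-         dst[i] = 1.0f / (1.0f + (src[i] > 0 ? 1.0f / e : e));
-     }
- }
- \endverbatim
-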
- /*! @ingroup neural
-
- \fn void SimdNeuralDerivativeSigmoid(const float * src, size_t size, const float * slope, float * dst);
-
- \short Multiplies output 32-bit float array by derivative of sigmoid from input 32-bit float array.
-
- All arrays must have the same size.
-
- For every element:
- \verbatim
- dst[i] *= slope*(1 - src[i])*src[i];
- \endverbatim
-
- \note This function is used in Simd::Neural::Function.
-
- \param [in] src - a pointer to the input array.
- \param [in] size - a size of arrays.
- \param [in] slope - a pointer to the slope parameter.
- \param [in, out] dst - a pointer to output array.
- */
- SIMD_API void SimdNeuralDerivativeSigmoid(const float * src, size_t size, const float * slope, float * dst);
-
- /*! @ingroup neural
-
- \fn void SimdNeuralRoughTanh(const float * src, size_t size, const float * slope, float * dst);
-
- \short Calculates rough hyperbolic tangent for 32-bit float array.
-
- All arrays must have the same size.
-
- For every element:
- \verbatim
- x = ::abs(src[i]*slope);
- e = 1 + x + x*x*0.5658 + x*x*x*x*0.1430;
- dst[i] = (src[i] > 0 ? 1 : -1)*(e - 1/e)/(e + 1/e);
- \endverbatim
- This is an approximation (maximal absolute error is 0.001514, ~0.15%) of the hyperbolic tangent function (::SimdSynetTanh32f):
- \verbatim
- x = slope*src[i];
- dst[i] = (exp(x) - exp(-x))/(exp(x) + exp(-x));
- \endverbatim
-
- \note This function is used in Simd::Neural::Function.
-
- \param [in] src - a pointer to the input array.
- \param [in] size - a size of arrays.
- \param [in] slope - a pointer to the slope parameter.
- \param [out] dst - a pointer to output array.
- */
- SIMD_API void SimdNeuralRoughTanh(const float * src, size_t size, const float * slope, float * dst);
-
- /*! @ingroup neural
-
- \fn void SimdNeuralDerivativeTanh(const float * src, size_t size, const float * slope, float * dst);
-
- \short Multiplies output 32-bit float array by derivative of hyperbolic tangent from input 32-bit float array.
-
- All arrays must have the same size.
-
- For every element:
- \verbatim
- dst[i] *= slope*(1 - src[i]*src[i]);
- \endverbatim
-
- \note This function is used in Simd::Neural::Function.
-
- \param [in] src - a pointer to the input array.
- \param [in] size - a size of arrays.
- \param [in] slope - a pointer to the slope parameter.
- \param [in, out] dst - a pointer to output array.
- */
- SIMD_API void SimdNeuralDerivativeTanh(const float * src, size_t size, const float * slope, float * dst);
-
- /*! @ingroup neural
-
- \fn void SimdNeuralDerivativeRelu(const float * src, size_t size, const float * slope, float * dst);
-
- \short Multiplies output 32-bit float array by derivative of ReLU (rectified linear unit) from input 32-bit float array.
-
- All arrays must have the same size.
-
- For every element:
- \verbatim
- dst[i] *= src[i] > 0 ? 1 : slope;
- \endverbatim
-
- \note This function is used in Simd::Neural::Function.
-
- \param [in] src - a pointer to the input array.
- \param [in] size - a size of arrays.
- \param [in] slope - a pointer to the slope parameter.
- \param [in, out] dst - a pointer to output array.
- */
- SIMD_API void SimdNeuralDerivativeRelu(const float * src, size_t size, const float * slope, float * dst);
-
- /*! @ingroup neural
-
- \fn void SimdNeuralPow(const float * src, size_t size, const float * exponent, float * dst);
-
- \short Calculates Pow function for 32-bit float array.
-
- All arrays must have the same size.
-
- For every element:
- \verbatim
- dst[i] = Pow(src[i], exponent[0]);
- \endverbatim
-
- \note This function is used in Simd::Neural::Function.
-
- \param [in] src - a pointer to the input array.
- \param [in] size - a size of arrays.
- \param [in] exponent - a pointer to exponent parameter.
- \param [out] dst - a pointer to output array.
- */
- SIMD_API void SimdNeuralPow(const float * src, size_t size, const float * exponent, float * dst);
-
- /*! @ingroup neural
-
- \fn void SimdNeuralProductSum(const float * a, const float * b, size_t size, float * sum);
-
- \short Calculates sum of products for two 32-bit float arrays.
-
- All arrays must have the same size.
-
- For every element:
- \verbatim
- sum += a[i]*b[i];
- \endverbatim
-
- \note This function is used in Simd::Neural.
-
- \param [in] a - a pointer to the first 32-bit float array.
- \param [in] b - a pointer to the second 32-bit float array.
- \param [in] size - a size of arrays.
- \param [out] sum - a pointer to 32-bit float sum of products.
- */
- SIMD_API void SimdNeuralProductSum(const float * a, const float * b, size_t size, float * sum);
-
- /*! @ingroup neural
-
- \fn void SimdNeuralAddVectorMultipliedByValue(const float * src, size_t size, const float * value, float * dst);
-
- \short Adds the product of a vector and a scalar to the given vector.
-
- All arrays must have the same size.
-
- For every element:
- \verbatim
- dst[i] += src[i]*value[0];
- \endverbatim
-
- \note This function is used in Simd::Neural.
-
- \param [in] src - a pointer to the input 32-bit float array.
- \param [in] size - a size of arrays.
- \param [in] value - a pointer to the scalar 32-bit float value.
- \param [in, out] dst - a pointer to cumulative 32-bit float array.
- */
- SIMD_API void SimdNeuralAddVectorMultipliedByValue(const float * src, size_t size, const float * value, float * dst);
-
- /*! @ingroup neural
-
- \fn void SimdNeuralAddVector(const float * src, size_t size, float * dst);
-
- \short Adds a vector to the given vector.
-
- All arrays must have the same size.
-
- For every element:
- \verbatim
- dst[i] += src[i];
- \endverbatim
-
- \note This function is used in Simd::Neural.
-
- \param [in] src - a pointer to the input 32-bit float array.
- \param [in] size - a size of the arrays.
- \param [in, out] dst - a pointer to cumulative 32-bit float array.
- */
- SIMD_API void SimdNeuralAddVector(const float * src, size_t size, float * dst);
-
- /*! @ingroup neural
-
- \fn void SimdNeuralAddValue(const float * value, float * dst, size_t size);
-
- \short Adds a value to each element of the given vector.
-
- For every element:
- \verbatim
- dst[i] += value;
- \endverbatim
-
- \note This function is used in Simd::Neural.
-
- \param [in] value - a pointer to the scalar 32-bit float value.
- \param [in, out] dst - a pointer to cumulative 32-bit float array.
- \param [in] size - a size of the array.
- */
- SIMD_API void SimdNeuralAddValue(const float * value, float * dst, size_t size);
-
- /*! @ingroup neural
-
- \fn void SimdNeuralUpdateWeights(const float * x, size_t size, const float * a, const float * b, float * d, float * w);
-
- \short Updates ANN weights.
-
- All arrays must have the same size.
-
- The algorithm performs:
- \verbatim
- for (size_t k = 0; k < size; ++k)
- {
-     d[k] = a[0]*d[k] + b[0]*x[k];
-     w[k] += d[k];
- }
- \endverbatim
-
- \param [in] x - a pointer to the X array.
- \param [in] size - a size of arrays.
- \param [in] a - a pointer to the first parameter.
- \param [in] b - a pointer to the second parameter.
- \param [in, out] d - a pointer to the D array.
- \param [in, out] w - a pointer to the W array.
- */
- SIMD_API void SimdNeuralUpdateWeights(const float * x, size_t size, const float * a, const float * b, float * d, float * w);
-
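- The update rule above matches gradient descent with momentum, with a[0] playing the role of the momentum coefficient and b[0] of the (signed) learning rate applied to the gradient x. A scalar transcription for reference (illustrative, not the library implementation):
- \verbatim
- #include <stddef.h>
-
- void UpdateWeightsScalar(const float * x, size_t size, float a, float b, float * d, float * w)
- {
-     for (size_t k = 0; k < size; ++k)
-     {
-         d[k] = a * d[k] + b * x[k]; /* velocity: momentum term plus scaled gradient */
-         w[k] += d[k];               /* apply the accumulated step */
-     }
- }
- \endverbatim
-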
- /*! @ingroup neural
-
- \fn void SimdNeuralAdaptiveGradientUpdate(const float * delta, size_t size, size_t batch, const float * alpha, const float * epsilon, float * gradient, float * weight);
-
- \short Updates neural network weights using the adaptive gradient (AdaGrad) method.
-
- Adaptive gradient method:
- J. Duchi, E. Hazan and Y. Singer,
- "Adaptive subgradient methods for online learning and stochastic optimization",
- The Journal of Machine Learning Research, pages 2121-2159, 2011.
-
- The algorithm performs:
- \verbatim
- for (i = 0; i < size; ++i)
- {
-     d = delta[i]/batch;
-     gradient[i] += d*d;
-     weight[i] -= alpha * d / sqrt(gradient[i] + epsilon);
- }
- \endverbatim
-
- \note All arrays must have the same size. This function is used in Simd::Neural.
-
- \param [in] delta - a pointer to the array with error (delta).
- \param [in] size - a size of arrays.
- \param [in] batch - a batch size.
- \param [in] alpha - a pointer to alpha parameter (update speed).
- \param [in] epsilon - a pointer to epsilon parameter (a small number used to avoid division by zero).
- \param [in, out] gradient - a pointer to the array with gradients.
- \param [in, out] weight - a pointer to the array with weights.
- */
- SIMD_API void SimdNeuralAdaptiveGradientUpdate(const float * delta, size_t size, size_t batch, const float * alpha, const float * epsilon, float * gradient, float * weight);
-
- /*! @ingroup neural
-
- \fn void SimdNeuralAddConvolution2x2Forward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride);
-
- \short Adds 2x2 convolution of 32-bit float image.
-
- \note This function is used in Simd::Neural.
-
- \param [in] src - a pointer to the input 32-bit float image.
- \param [in] srcStride - a row size of the input image (in 32-float values).
- \param [in] width - a width of the output image (input image width must be equal to output image width + 1).
- \param [in] height - a height of the output image (input image height must be equal to output image height + 1).
- \param [in] weights - a pointer to the array with weights (its size must be at least 4).
- \param [in, out] dst - a pointer to the output 32-bit float image.
- \param [in] dstStride - a row size of the output image (in 32-float values).
- */
- SIMD_API void SimdNeuralAddConvolution2x2Forward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride);
-
- /*! @ingroup neural
-
- \fn void SimdNeuralAddConvolution3x3Forward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride);
-
- \short Adds 3x3 convolution of 32-bit float image.
-
- \note This function is used in Simd::Neural.
-
- \param [in] src - a pointer to the input 32-bit float image.
- \param [in] srcStride - a row size of the input image (in 32-float values).
- \param [in] width - a width of the output image (input image width must be equal to output image width + 2).
- \param [in] height - a height of the output image (input image height must be equal to output image height + 2).
- \param [in] weights - a pointer to the array with weights (its size must be at least 9).
- \param [in, out] dst - a pointer to the output 32-bit float image.
- \param [in] dstStride - a row size of the output image (in 32-float values).
- */ - SIMD_API void SimdNeuralAddConvolution3x3Forward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride); - - /*! @ingroup neural - - \fn void SimdNeuralAddConvolution4x4Forward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride); - - \short Adds 4x4 convolution of 32-bit float image. - - \note This function is used in Simd::Neural. - - \param [in] src - a pointer to the input 32-bit float image. - \param [in] srcStride - a row size of the input image (in 32-float values). - \param [in] width - a width of the output image (input image width must be equal to output image width + 3). - \param [in] height - a height of the output image (input image height must be equal to output image height + 3). - \param [in] weights - a pointer to the array with weights (its size must be at least 16). - \param [in, out] dst - a pointer to the output 32-bit float image. - \param [in] dstStride - a row size of the output image (in 32-float values). - */ - SIMD_API void SimdNeuralAddConvolution4x4Forward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride); - - - /*! @ingroup neural - - \fn void SimdNeuralAddConvolution5x5Forward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride); - - \short Adds 5x5 convolution of 32-bit float image (forward propagation). - - \note This function is used in Simd::Neural. - - \param [in] src - a pointer to the input 32-bit float image. - \param [in] srcStride - a row size of the input image (in 32-float values). - \param [in] width - a width of the output image (input image width must be equal to output image width + 4). - \param [in] height - a height of the output image (input image height must be equal to output image height + 4). - \param [in] weights - a pointer to the array with weights (its size must be at least 25). - \param [in, out] dst - a pointer to the output 32-bit float image. - \param [in] dstStride - a row size of the output image (in 32-float values). - */ - SIMD_API void SimdNeuralAddConvolution5x5Forward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride); - - /*! @ingroup neural - - \fn void SimdNeuralAddConvolution2x2Backward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride); - - \short Adds 2x2 convolution of 32-bit float image (backward propagation). - - \note This function is used in Simd::Neural. - - \param [in] src - a pointer to the input 32-bit float image. - \param [in] srcStride - a row size of the input image (in 32-float values). - \param [in] width - a width of the input image (output image width must be equal to input image width + 1). - \param [in] height - a height of the input image (output image height must be equal to input image height + 1). - \param [in] weights - a pointer to the array with weights (its size must be at least 4). - \param [in, out] dst - a pointer to the output 32-bit float image. - \param [in] dstStride - a row size of the output image (in 32-float values). - */ - SIMD_API void SimdNeuralAddConvolution2x2Backward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride); - - /*! 
@ingroup neural - - \fn void SimdNeuralAddConvolution3x3Backward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride); - - \short Adds 3x3 convolution of 32-bit float image (backward propagation). - - \note This function is used in Simd::Neural. - - \param [in] src - a pointer to the input 32-bit float image. - \param [in] srcStride - a row size of the input image (in 32-float values). - \param [in] width - a width of the input image (output image width must be equal to input image width + 2). - \param [in] height - a height of the input image (output image height must be equal to input image height + 2). - \param [in] weights - a pointer to the array with weights (its size must be at least 9). - \param [in, out] dst - a pointer to the output 32-bit float image. - \param [in] dstStride - a row size of the output image (in 32-float values). - */ - SIMD_API void SimdNeuralAddConvolution3x3Backward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride); - - /*! @ingroup neural - - \fn void SimdNeuralAddConvolution4x4Backward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride); - - \short Adds 4x4 convolution of 32-bit float image (backward propagation). - - \note This function is used in Simd::Neural. - - \param [in] src - a pointer to the input 32-bit float image. - \param [in] srcStride - a row size of the input image (in 32-float values). - \param [in] width - a width of the input image (output image width must be equal to input image width + 3). - \param [in] height - a height of the input image (output image height must be equal to input image height + 3). - \param [in] weights - a pointer to the array with weights (its size must be at least 16). - \param [in, out] dst - a pointer to the output 32-bit float image. - \param [in] dstStride - a row size of the output image (in 32-float values). - */ - SIMD_API void SimdNeuralAddConvolution4x4Backward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride); - - /*! @ingroup neural - - \fn void SimdNeuralAddConvolution5x5Backward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride); - - \short Adds 5x5 convolution of 32-bit float image (backward propagation). - - \note This function is used in Simd::Neural. - - \param [in] src - a pointer to the input 32-bit float image. - \param [in] srcStride - a row size of the input image (in 32-float values). - \param [in] width - a width of the input image (output image width must be equal to input image width + 4). - \param [in] height - a height of the input image (output image height must be equal to input image height + 4). - \param [in] weights - a pointer to the array with weights (its size must be at least 25). - \param [in, out] dst - a pointer to the output 32-bit float image. - \param [in] dstStride - a row size of the output image (in 32-float values). - */ - SIMD_API void SimdNeuralAddConvolution5x5Backward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride); - - /*! 
@ingroup neural - - \fn void SimdNeuralAddConvolution2x2Sum(const float * src, size_t srcStride, const float * dst, size_t dstStride, size_t width, size_t height, float * sums); - - \short Accumulates changes of weights for 2x2 convolution of 32-bit float image during backward propagation. - - \note This function is used in Simd::Neural. - - \param [in] src - a pointer to the input 32-bit float image. - \param [in] srcStride - a row size of the input image (in 32-float values). - \param [in] dst - a pointer to the output 32-bit float image. - \param [in] dstStride - a row size of the output image (in 32-float values). - \param [in] width - a width of the output image (input image width must be equal to output image width + 1). - \param [in] height - a height of the output image (input image height must be equal to output image height + 1). - \param [in, out] sums - a pointer to the array with changes of weights (its size must be at least 4). - */ - SIMD_API void SimdNeuralAddConvolution2x2Sum(const float * src, size_t srcStride, const float * dst, size_t dstStride, size_t width, size_t height, float * sums); - - /*! @ingroup neural - - \fn void SimdNeuralAddConvolution3x3Sum(const float * src, size_t srcStride, const float * dst, size_t dstStride, size_t width, size_t height, float * sums); - - \short Accumulates changes of weights for 3x3 convolution of 32-bit float image during backward propagation. - - \note This function is used in Simd::Neural. - - \param [in] src - a pointer to the input 32-bit float image. - \param [in] srcStride - a row size of the input image (in 32-float values). - \param [in] dst - a pointer to the output 32-bit float image. - \param [in] dstStride - a row size of the output image (in 32-float values). - \param [in] width - a width of the output image (input image width must be equal to output image width + 2). - \param [in] height - a height of the output image (input image height must be equal to output image height + 2). - \param [in, out] sums - a pointer to the array with changes of weights (its size must be at least 9). - */ - SIMD_API void SimdNeuralAddConvolution3x3Sum(const float * src, size_t srcStride, const float * dst, size_t dstStride, size_t width, size_t height, float * sums); - - /*! @ingroup neural - - \fn void SimdNeuralAddConvolution4x4Sum(const float * src, size_t srcStride, const float * dst, size_t dstStride, size_t width, size_t height, float * sums); - - \short Accumulates changes of weights for 4x4 convolution of 32-bit float image during backward propagation. - - \note This function is used in Simd::Neural. - - \param [in] src - a pointer to the input 32-bit float image. - \param [in] srcStride - a row size of the input image (in 32-float values). - \param [in] dst - a pointer to the output 32-bit float image. - \param [in] dstStride - a row size of the output image (in 32-float values). - \param [in] width - a width of the output image (input image width must be equal to output image width + 3). - \param [in] height - a height of the output image (input image height must be equal to output image height + 3). - \param [in, out] sums - a pointer to the array with changes of weights (its size must be at least 16). - */ - SIMD_API void SimdNeuralAddConvolution4x4Sum(const float * src, size_t srcStride, const float * dst, size_t dstStride, size_t width, size_t height, float * sums); - - /*! 
@ingroup neural
-
- \fn void SimdNeuralAddConvolution5x5Sum(const float * src, size_t srcStride, const float * dst, size_t dstStride, size_t width, size_t height, float * sums);
-
- \short Accumulates changes of weights for 5x5 convolution of 32-bit float image during backward propagation.
-
- \note This function is used in Simd::Neural.
-
- \param [in] src - a pointer to the input 32-bit float image.
- \param [in] srcStride - a row size of the input image (in 32-float values).
- \param [in] dst - a pointer to the output 32-bit float image.
- \param [in] dstStride - a row size of the output image (in 32-float values).
- \param [in] width - a width of the output image (input image width must be equal to output image width + 4).
- \param [in] height - a height of the output image (input image height must be equal to output image height + 4).
- \param [in, out] sums - a pointer to the array with changes of weights (its size must be at least 25).
- */
- SIMD_API void SimdNeuralAddConvolution5x5Sum(const float * src, size_t srcStride, const float * dst, size_t dstStride, size_t width, size_t height, float * sums);
-
- /*! @ingroup neural
-
- \fn void SimdNeuralPooling1x1Max3x3(const float * src, size_t srcStride, size_t width, size_t height, float * dst, size_t dstStride);
-
- \short Takes maximum value in 3x3 window of input 32-bit float image and copies it to the output image.
-
- \note This function is used in Simd::Neural. The output image must have the same size as the input image.
-
- \param [in] src - a pointer to the input 32-bit float image.
- \param [in] srcStride - a row size of the input image (in 32-float values).
- \param [in] width - a width of the input image.
- \param [in] height - a height of the input image.
- \param [in, out] dst - a pointer to the output 32-bit float image.
- \param [in] dstStride - a row size of the output image (in 32-float values).
- */
- SIMD_API void SimdNeuralPooling1x1Max3x3(const float * src, size_t srcStride, size_t width, size_t height, float * dst, size_t dstStride);
-
- /*! @ingroup neural
-
- \fn void SimdNeuralPooling2x2Max2x2(const float * src, size_t srcStride, size_t width, size_t height, float * dst, size_t dstStride);
-
- \short Reduces the input 32-bit float image twofold (takes maximum value in 2x2 window and copies it to the output image).
-
- \note This function is used in Simd::Neural.
-
- \param [in] src - a pointer to the input 32-bit float image.
- \param [in] srcStride - a row size of the input image (in 32-float values).
- \param [in] width - a width of the input image (output image width must have size (width + 1)/2).
- \param [in] height - a height of the input image (output image height must have size (height + 1)/2).
- \param [in, out] dst - a pointer to the output 32-bit float image.
- \param [in] dstStride - a row size of the output image (in 32-float values).
- */
- SIMD_API void SimdNeuralPooling2x2Max2x2(const float * src, size_t srcStride, size_t width, size_t height, float * dst, size_t dstStride);
-
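- A sketch of a typical down-sampling step (input geometry assumed, feature map densely packed): a map is halved with 2x2 max-pooling, with the output dimensions derived as documented above:
- \verbatim
- #include "Simd/SimdLib.h"
- #include <stdlib.h>
-
- void PoolExample(const float * src, size_t width, size_t height)
- {
-     size_t dstWidth = (width + 1) / 2, dstHeight = (height + 1) / 2;
-     float * dst = (float *)malloc(dstWidth * dstHeight * sizeof(float));
-     SimdNeuralPooling2x2Max2x2(src, width, width, height, dst, dstWidth);
-     /* ... use dst ... */
-     free(dst);
- }
- \endverbatim
-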
- /*! @ingroup neural
-
- \fn void SimdNeuralPooling2x2Max3x3(const float * src, size_t srcStride, size_t width, size_t height, float * dst, size_t dstStride);
-
- \short Reduces the input 32-bit float image twofold (takes maximum value in 3x3 window and copies it to the output image).
-
- \note This function is used in Simd::Neural.
-
- \param [in] src - a pointer to the input 32-bit float image.
- \param [in] srcStride - a row size of the input image (in 32-float values).
- \param [in] width - a width of the input image (output image width must have size width/2).
- \param [in] height - a height of the input image (output image height must have size height/2).
- \param [in, out] dst - a pointer to the output 32-bit float image.
- \param [in] dstStride - a row size of the output image (in 32-float values).
- */
- SIMD_API void SimdNeuralPooling2x2Max3x3(const float * src, size_t srcStride, size_t width, size_t height, float * dst, size_t dstStride);
-
- /*! @ingroup neural
-
- \fn void SimdNeuralConvolutionForward(const float * src, size_t srcWidth, size_t srcHeight, size_t srcDepth, const float * weight, size_t kernelX, size_t kernelY, size_t padX, size_t padY, size_t strideX, size_t strideY, size_t dilationX, size_t dilationY, void * buffer, size_t * size, float * dst, size_t dstWidth, size_t dstHeight, size_t dstDepth, int add);
-
- \short Adds convolution of the input multichannel 32-bit float image to the output multichannel 32-bit float image.
-
- \note The size of the output image is constrained by:
- \verbatim
- dstWidth = (srcWidth + 2 * padX - (dilationX * (kernelX - 1) + 1)) / strideX + 1.
- dstHeight = (srcHeight + 2 * padY - (dilationY * (kernelY - 1) + 1)) / strideY + 1.
- \endverbatim
-
- \param [in] src - a pointer to the input multichannel 32-bit float image. Total size of the input image is equal to srcWidth*srcHeight*srcDepth.
- \param [in] srcWidth - a width of the input image.
- \param [in] srcHeight - a height of the input image.
- \param [in] srcDepth - a number of channels in the input image.
- \param [in] weight - a pointer to the convolution weights. Total size of the weights is equal to `kernelX*kernelY*srcDepth*dstDepth`.
- \param [in] kernelX - a width of the convolution kernel.
- \param [in] kernelY - a height of the convolution kernel.
- \param [in] padX - a padding of the input image along the x-axis.
- \param [in] padY - a padding of the input image along the y-axis.
- \param [in] strideX - an x-stride of the convolution.
- \param [in] strideY - a y-stride of the convolution.
- \param [in] dilationX - an x-dilation of the convolution.
- \param [in] dilationY - a y-dilation of the convolution.
- \param [in, out] buffer - a pointer to the external temporal buffer used by the algorithm. Can be NULL (the algorithm uses an internal buffer).
- \param [in, out] size - a pointer to the size of the external temporal buffer. If the size is too small, it will contain the required value. Required size is approximately equal to `dstWidth*dstHeight*srcDepth*kernelX*kernelY*sizeof(float)`. Can be NULL.
- \param [in, out] dst - a pointer to the output multichannel 32-bit float image. Total size of the output image is equal to `dstWidth*dstHeight*dstDepth`.
- \param [in] dstWidth - a width of the output image.
- \param [in] dstHeight - a height of the output image.
- \param [in] dstDepth - a number of channels in the output image.
- \param [in] add - a flag indicating whether the convolution result is added to the output image or assigned to it.
- */
- SIMD_API void SimdNeuralConvolutionForward(const float * src, size_t srcWidth, size_t srcHeight, size_t srcDepth, const float * weight, size_t kernelX, size_t kernelY, size_t padX, size_t padY, size_t strideX, size_t strideY, size_t dilationX, size_t dilationY, void * buffer, size_t * size, float * dst, size_t dstWidth, size_t dstHeight, size_t dstDepth, int add);
-
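- For example, with the size formula above a 3x3 kernel with padding 1, stride 1 and dilation 1 preserves spatial dimensions. A sketch (depths and sizes are assumptions of this example) letting the library manage its temporary buffer:
- \verbatim
- #include "Simd/SimdLib.h"
- #include <stdlib.h>
-
- void ConvExample(const float * src, const float * weight)
- {
-     size_t w = 32, h = 32, srcDepth = 16, dstDepth = 32; /* assumed geometry */
-     size_t dstW = (w + 2 * 1 - (1 * (3 - 1) + 1)) / 1 + 1; /* = w = 32 */
-     size_t dstH = (h + 2 * 1 - (1 * (3 - 1) + 1)) / 1 + 1; /* = h = 32 */
-     float * dst = (float *)calloc(dstW * dstH * dstDepth, sizeof(float));
-     SimdNeuralConvolutionForward(src, w, h, srcDepth, weight, 3, 3, 1, 1, 1, 1, 1, 1,
-         NULL, NULL, dst, dstW, dstH, dstDepth, 0); /* add = 0: assign result */
-     /* ... use dst ... */
-     free(dst);
- }
- \endverbatim
-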
- /*! @ingroup operation
-
- \fn void SimdOperationBinary8u(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, size_t width, size_t height, size_t channelCount, uint8_t * dst, size_t dstStride, SimdOperationBinary8uType type);
-
- \short Performs given operation between two images.
-
- All images must have the same width, height and format (8-bit gray, 16-bit UV (UV plane of NV12 pixel format), 24-bit BGR or 32-bit BGRA).
-
- \note This function has a C++ wrapper: Simd::OperationBinary8u(const View& a, const View& b, View& dst, SimdOperationBinary8uType type).
-
- \param [in] a - a pointer to pixels data of the first input image.
- \param [in] aStride - a row size of the first image.
- \param [in] b - a pointer to pixels data of the second input image.
- \param [in] bStride - a row size of the second image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [in] channelCount - a channel count.
- \param [out] dst - a pointer to pixels data of output image.
- \param [in] dstStride - a row size of dst image.
- \param [in] type - a type of operation (see ::SimdOperationBinary8uType).
- */
- SIMD_API void SimdOperationBinary8u(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride,
-     size_t width, size_t height, size_t channelCount, uint8_t * dst, size_t dstStride, SimdOperationBinary8uType type);
-
- /*! @ingroup operation
-
- \fn void SimdOperationBinary16i(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, size_t width, size_t height, uint8_t * dst, size_t dstStride, SimdOperationBinary16iType type);
-
- \short Performs given operation between two images.
-
- All images must have the same width, height and ::SimdPixelFormatInt16 pixel format.
-
- \note This function has a C++ wrapper: Simd::OperationBinary16i(const View& a, const View& b, View& dst, SimdOperationBinary16iType type).
-
- \param [in] a - a pointer to pixels data of the first input image.
- \param [in] aStride - a row size of the first image.
- \param [in] b - a pointer to pixels data of the second input image.
- \param [in] bStride - a row size of the second image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [out] dst - a pointer to pixels data of output image.
- \param [in] dstStride - a row size of dst image.
- \param [in] type - a type of operation (see ::SimdOperationBinary16iType).
- */
- SIMD_API void SimdOperationBinary16i(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride,
-     size_t width, size_t height, uint8_t * dst, size_t dstStride, SimdOperationBinary16iType type);
-
- /*! @ingroup operation
-
- \fn void SimdVectorProduct(const uint8_t * vertical, const uint8_t * horizontal, uint8_t * dst, size_t stride, size_t width, size_t height);
-
- \short Calculates result 8-bit gray image as product of two vectors.
-
- For all points:
- \verbatim
- dst[x, y] = horizontal[x]*vertical[y]/255;
- \endverbatim
-
- \note This function has a C++ wrapper: Simd::VectorProduct(const uint8_t * vertical, const uint8_t * horizontal, View& dst).
-
- \param [in] vertical - a pointer to pixels data of vertical vector. Its length is equal to the result image height.
- \param [in] horizontal - a pointer to pixels data of horizontal vector. Its length is equal to the result image width.
- \param [out] dst - a pointer to pixels data of result image.
- \param [in] stride - a row size of dst image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- */
- SIMD_API void SimdVectorProduct(const uint8_t * vertical, const uint8_t * horizontal,
- uint8_t * dst, size_t stride, size_t width, size_t height);
-
- /*! @ingroup resizing
-
- \fn void SimdReduceColor2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount);
-
- \short Performs two-fold reducing with Gaussian blurring of an 8-bit multichannel color image using a 2x2 window.
-
- The input and output images must satisfy: dstWidth = (srcWidth + 1)/2, dstHeight = (srcHeight + 1)/2.
-
- For all points:
- \verbatim
- dst[x, y, c] = (src[2*x, 2*y, c] + src[2*x, 2*y + 1, c] + src[2*x + 1, 2*y, c] + src[2*x + 1, 2*y + 1, c] + 2)/4;
- \endverbatim
-
- \note This function has a C++ wrapper: Simd::Reduce2x2(const View & src, View & dst).
-
- \param [in] src - a pointer to pixels data of the original input image.
- \param [in] srcWidth - a width of the input image.
- \param [in] srcHeight - a height of the input image.
- \param [in] srcStride - a row size of the input image.
- \param [out] dst - a pointer to pixels data of the reduced output image.
- \param [in] dstWidth - a width of the output image.
- \param [in] dstHeight - a height of the output image.
- \param [in] dstStride - a row size of the output image.
- \param [in] channelCount - a number of channels in the input and output images.
- */
- SIMD_API void SimdReduceColor2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride,
- uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount);
-
- /*! @ingroup resizing
-
- \fn void SimdReduceGray2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride);
-
- \short Performs two-fold reducing with Gaussian blurring of an 8-bit gray image using a 2x2 window.
-
- The input and output images must satisfy: dstWidth = (srcWidth + 1)/2, dstHeight = (srcHeight + 1)/2.
-
- For all points:
- \verbatim
- dst[x, y] = (src[2*x, 2*y] + src[2*x, 2*y + 1] + src[2*x + 1, 2*y] + src[2*x + 1, 2*y + 1] + 2)/4;
- \endverbatim
-
- \note This function has a C++ wrapper: Simd::ReduceGray2x2(const View& src, View& dst).
-
- \param [in] src - a pointer to pixels data of the original input image.
- \param [in] srcWidth - a width of the input image.
- \param [in] srcHeight - a height of the input image.
- \param [in] srcStride - a row size of the input image.
- \param [out] dst - a pointer to pixels data of the reduced output image.
- \param [in] dstWidth - a width of the output image.
- \param [in] dstHeight - a height of the output image.
- \param [in] dstStride - a row size of the output image.
- */
- SIMD_API void SimdReduceGray2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride,
- uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride);
-
- /*! @ingroup resizing
-
- \fn void SimdReduceGray3x3(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride, int compensation);
-
- \short Performs two-fold reducing with Gaussian blurring of an 8-bit gray image using a 3x3 window.
-
- The input and output images must satisfy: dstWidth = (srcWidth + 1)/2, dstHeight = (srcHeight + 1)/2.
-
- For every point:
- \verbatim
- dst[x, y] = (src[2*x-1, 2*y-1] + 2*src[2*x, 2*y-1] + src[2*x+1, 2*y-1] +
- 2*(src[2*x-1, 2*y] + 2*src[2*x, 2*y] + src[2*x+1, 2*y]) +
- src[2*x-1, 2*y+1] + 2*src[2*x, 2*y+1] + src[2*x+1, 2*y+1] + (compensation ? 8 : 0)) / 16;
- \endverbatim
-
- \note This function has a C++ wrapper: Simd::ReduceGray3x3(const View& src, View& dst, bool compensation).
-
- \param [in] src - a pointer to pixels data of the original input image.
- \param [in] srcWidth - a width of the input image.
- \param [in] srcHeight - a height of the input image.
- \param [in] srcStride - a row size of the input image.
- \param [out] dst - a pointer to pixels data of the reduced output image.
- \param [in] dstWidth - a width of the output image.
- \param [in] dstHeight - a height of the output image.
- \param [in] dstStride - a row size of the output image.
- \param [in] compensation - a flag of rounding compensation.
- */
- SIMD_API void SimdReduceGray3x3(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride,
- uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride, int compensation);
-
- /*! @ingroup resizing
-
- \fn void SimdReduceGray4x4(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride);
-
- \short Performs two-fold reducing with Gaussian blurring of an 8-bit gray image using a 4x4 window.
-
- The input and output images must satisfy: dstWidth = (srcWidth + 1)/2, dstHeight = (srcHeight + 1)/2.
-
- For every point:
- \verbatim
- dst[x, y] = (src[2*x-1, 2*y-1] + 3*src[2*x, 2*y-1] + 3*src[2*x+1, 2*y-1] + src[2*x+2, 2*y-1] +
- 3*(src[2*x-1, 2*y] + 3*src[2*x, 2*y] + 3*src[2*x+1, 2*y] + src[2*x+2, 2*y]) +
- 3*(src[2*x-1, 2*y+1] + 3*src[2*x, 2*y+1] + 3*src[2*x+1, 2*y+1] + src[2*x+2, 2*y+1]) +
- src[2*x-1, 2*y+2] + 3*src[2*x, 2*y+2] + 3*src[2*x+1, 2*y+2] + src[2*x+2, 2*y+2] + 32) / 64;
- \endverbatim
-
- \note This function has a C++ wrapper: Simd::ReduceGray4x4(const View& src, View& dst).
-
- \param [in] src - a pointer to pixels data of the original input image.
- \param [in] srcWidth - a width of the input image.
- \param [in] srcHeight - a height of the input image.
- \param [in] srcStride - a row size of the input image.
- \param [out] dst - a pointer to pixels data of the reduced output image.
- \param [in] dstWidth - a width of the output image.
- \param [in] dstHeight - a height of the output image.
- \param [in] dstStride - a row size of the output image.
- */
- SIMD_API void SimdReduceGray4x4(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride,
- uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride);
-
- /*! @ingroup resizing
-
- \fn void SimdReduceGray5x5(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride, int compensation);
-
- \short Performs two-fold reducing with Gaussian blurring of an 8-bit gray image using a 5x5 window.
-
- The input and output images must satisfy: dstWidth = (srcWidth + 1)/2, dstHeight = (srcHeight + 1)/2.
-
- For every point:
- \verbatim
- dst[x, y] = (
- src[2*x-2, 2*y-2] + 4*src[2*x-1, 2*y-2] + 6*src[2*x, 2*y-2] + 4*src[2*x+1, 2*y-2] + src[2*x+2, 2*y-2] +
- 4*(src[2*x-2, 2*y-1] + 4*src[2*x-1, 2*y-1] + 6*src[2*x, 2*y-1] + 4*src[2*x+1, 2*y-1] + src[2*x+2, 2*y-1]) +
- 6*(src[2*x-2, 2*y] + 4*src[2*x-1, 2*y] + 6*src[2*x, 2*y] + 4*src[2*x+1, 2*y] + src[2*x+2, 2*y]) +
- 4*(src[2*x-2, 2*y+1] + 4*src[2*x-1, 2*y+1] + 6*src[2*x, 2*y+1] + 4*src[2*x+1, 2*y+1] + src[2*x+2, 2*y+1]) +
- src[2*x-2, 2*y+2] + 4*src[2*x-1, 2*y+2] + 6*src[2*x, 2*y+2] + 4*src[2*x+1, 2*y+2] + src[2*x+2, 2*y+2] +
- (compensation ? 128 : 0)) / 256;
- \endverbatim
-
- \note This function has a C++ wrapper: Simd::ReduceGray5x5(const View& src, View& dst, bool compensation).
-
- \param [in] src - a pointer to pixels data of the original input image.
- \param [in] srcWidth - a width of the input image.
- \param [in] srcHeight - a height of the input image.
- \param [in] srcStride - a row size of the input image.
- \param [out] dst - a pointer to pixels data of the reduced output image.
- \param [in] dstWidth - a width of the output image.
- \param [in] dstHeight - a height of the output image.
- \param [in] dstStride - a row size of the output image.
- \param [in] compensation - a flag of rounding compensation.
- */
- SIMD_API void SimdReduceGray5x5(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride,
- uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride, int compensation);
-
- /*! @ingroup reordering
-
- \fn void SimdReorder16bit(const uint8_t * src, size_t size, uint8_t * dst);
-
- \short Performs byte reordering of the data array.
-
- For every 2 bytes:
- \verbatim
- dst[2*i + 0] = src[2*i + 1];
- dst[2*i + 1] = src[2*i + 0];
- \endverbatim
-
- The data size must be a multiple of 2.
-
- \param [in] src - a pointer to the input data.
- \param [in] size - a size of the input and output data.
- \param [out] dst - a pointer to the output data.
- */
- SIMD_API void SimdReorder16bit(const uint8_t * src, size_t size, uint8_t * dst);
-
- /*! @ingroup reordering
-
- \fn void SimdReorder32bit(const uint8_t * src, size_t size, uint8_t * dst);
-
- \short Performs byte reordering of the data array.
-
- For every 4 bytes:
- \verbatim
- dst[4*i + 0] = src[4*i + 3];
- dst[4*i + 1] = src[4*i + 2];
- dst[4*i + 2] = src[4*i + 1];
- dst[4*i + 3] = src[4*i + 0];
- \endverbatim
-
- The data size must be a multiple of 4.
-
- \param [in] src - a pointer to the input data.
- \param [in] size - a size of the input and output data.
- \param [out] dst - a pointer to the output data.
- */
- SIMD_API void SimdReorder32bit(const uint8_t * src, size_t size, uint8_t * dst);
-
- /*! @ingroup reordering
-
- \fn void SimdReorder64bit(const uint8_t * src, size_t size, uint8_t * dst);
-
- \short Performs byte reordering of the data array.
-
- For every 8 bytes:
- \verbatim
- dst[8*i + 0] = src[8*i + 7];
- dst[8*i + 1] = src[8*i + 6];
- dst[8*i + 2] = src[8*i + 5];
- dst[8*i + 3] = src[8*i + 4];
- dst[8*i + 4] = src[8*i + 3];
- dst[8*i + 5] = src[8*i + 2];
- dst[8*i + 6] = src[8*i + 1];
- dst[8*i + 7] = src[8*i + 0];
- \endverbatim
-
- The data size must be a multiple of 8.
-
- \param [in] src - a pointer to the input data.
- \param [in] size - a size of the input and output data.
- \param [out] dst - a pointer to the output data.
- */
- SIMD_API void SimdReorder64bit(const uint8_t * src, size_t size, uint8_t * dst);
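-
- /* Usage sketch (illustrative, not part of the original header): flipping the
-    endianness of an array of 32-bit values with SimdReorder32bit. The size
-    argument is in bytes and must be a multiple of 4; a separate destination
-    buffer is used, since in-place operation is not documented. */
- static void Reorder32bitExample(void)
- {
-     uint32_t values[2] = { 0x11223344u, 0xAABBCCDDu };
-     uint32_t swapped[2];
-     SimdReorder32bit((const uint8_t*)values, sizeof(values), (uint8_t*)swapped);
-     /* Reversing the bytes of each 4-byte group yields the byte-swapped values:
-        swapped[0] == 0x44332211, swapped[1] == 0xDDCCBBAA (on any host endianness). */
- }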
- /*! @ingroup resizing
-
- \fn void SimdResizeBilinear(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount);
-
- \short Performs resizing of the input image using bilinear interpolation.
-
- All images must have the same format (8-bit gray, 16-bit UV, 24-bit BGR or 32-bit BGRA).
-
- \note This function has a C++ wrapper: Simd::ResizeBilinear(const View& src, View& dst).
-
- \param [in] src - a pointer to pixels data of the original input image.
- \param [in] srcWidth - a width of the input image.
- \param [in] srcHeight - a height of the input image.
- \param [in] srcStride - a row size of the input image.
- \param [out] dst - a pointer to pixels data of the resized output image.
- \param [in] dstWidth - a width of the output image.
- \param [in] dstHeight - a height of the output image.
- \param [in] dstStride - a row size of the output image.
- \param [in] channelCount - a channel count.
- */
- SIMD_API void SimdResizeBilinear(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride,
- uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount);
-
- /*! @ingroup resizing
-
- \fn void * SimdResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method);
-
- \short Creates a resize context.
-
- \param [in] srcX - a width of the input image.
- \param [in] srcY - a height of the input image.
- \param [in] dstX - a width of the output image.
- \param [in] dstY - a height of the output image.
- \param [in] channels - a number of channels of the input and output images.
- \param [in] type - a type of the input and output image channel.
- \param [in] method - a method used to resize the image.
- \return a pointer to the resize context. On error it returns NULL.
- This pointer is used in the function ::SimdResizerRun.
- It must be released using the function ::SimdRelease.
- */
- SIMD_API void * SimdResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method);
-
- /*! @ingroup resizing
-
- \fn void SimdResizerRun(const void * resizer, const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride);
-
- \short Performs image resizing.
-
- \param [in] resizer - a resize context. It must be created by function ::SimdResizerInit and released by function ::SimdRelease.
- \param [in] src - a pointer to pixels data of the original input image.
- \param [in] srcStride - a row size (in bytes) of the input image.
- \param [out] dst - a pointer to pixels data of the resized output image.
- \param [in] dstStride - a row size (in bytes) of the output image.
- */
- SIMD_API void SimdResizerRun(const void * resizer, const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride);
-
- /*! @ingroup rgb_conversion
-
- \fn void SimdRgbToBgra(const uint8_t * rgb, size_t width, size_t height, size_t rgbStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha);
-
- \short Converts 24-bit RGB image to 32-bit BGRA image.
-
- All images must have the same width and height.
-
- \note This function has a C++ wrapper: Simd::RgbToBgra(const View& rgb, View& bgra, uint8_t alpha).
-
- \param [in] rgb - a pointer to pixels data of input 24-bit RGB image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [in] rgbStride - a row size of the rgb image.
- \param [out] bgra - a pointer to pixels data of output 32-bit BGRA image.
- \param [in] bgraStride - a row size of the bgra image.
- \param [in] alpha - a value of the alpha channel.
- */
- SIMD_API void SimdRgbToBgra(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha);
-
- /*! @ingroup rgb_conversion
-
- \fn void SimdRgbToGray(const uint8_t * rgb, size_t width, size_t height, size_t rgbStride, uint8_t * gray, size_t grayStride);
-
- \short Converts 24-bit RGB image to 8-bit gray image.
-
- All images must have the same width and height.
-
- \note This function has a C++ wrapper: Simd::RgbToGray(const View& rgb, View& gray).
-
- \param [in] rgb - a pointer to pixels data of input 24-bit RGB image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [in] rgbStride - a row size of the rgb image.
- \param [out] gray - a pointer to pixels data of output 8-bit gray image.
- \param [in] grayStride - a row size of the gray image.
- */
- SIMD_API void SimdRgbToGray(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* gray, size_t grayStride);
-
- /*! @ingroup segmentation
-
- \fn void SimdSegmentationChangeIndex(uint8_t * mask, size_t stride, size_t width, size_t height, uint8_t oldIndex, uint8_t newIndex);
-
- \short Changes a certain index in the mask.
-
- The mask must have the 8-bit gray pixel format.
-
- For every point:
- \verbatim
- if(mask[i] == oldIndex)
- mask[i] = newIndex;
- \endverbatim
-
- \note This function has a C++ wrapper: Simd::SegmentationChangeIndex(View & mask, uint8_t oldIndex, uint8_t newIndex).
-
- \param [in, out] mask - a pointer to pixels data of 8-bit gray mask image.
- \param [in] stride - a row size of the mask image.
- \param [in] width - a mask width.
- \param [in] height - a mask height.
- \param [in] oldIndex - the old mask index.
- \param [in] newIndex - the new mask index.
- */
- SIMD_API void SimdSegmentationChangeIndex(uint8_t * mask, size_t stride, size_t width, size_t height, uint8_t oldIndex, uint8_t newIndex);
-
- /*! @ingroup segmentation
-
- \fn void SimdSegmentationFillSingleHoles(uint8_t * mask, size_t stride, size_t width, size_t height, uint8_t index);
-
- \short Fills single holes in the mask.
-
- The mask must have the 8-bit gray pixel format.
-
- \note This function has a C++ wrapper: Simd::SegmentationFillSingleHoles(View & mask, uint8_t index).
-
- \param [in, out] mask - a pointer to pixels data of 8-bit gray mask image.
- \param [in] stride - a row size of the mask image.
- \param [in] width - a mask width.
- \param [in] height - a mask height.
- \param [in] index - a mask index.
- */
- SIMD_API void SimdSegmentationFillSingleHoles(uint8_t * mask, size_t stride, size_t width, size_t height, uint8_t index);
-
- /*! @ingroup segmentation
-
- \fn void SimdSegmentationPropagate2x2(const uint8_t * parent, size_t parentStride, size_t width, size_t height, uint8_t * child, size_t childStride, const uint8_t * difference, size_t differenceStride, uint8_t currentIndex, uint8_t invalidIndex, uint8_t emptyIndex, uint8_t differenceThreshold);
-
- \short Propagates the mask index from the parent (upper) to the child (lower) level of the mask pyramid using a 2x2 scan window.
-
- The parent and child images must satisfy: parentWidth = (childWidth + 1)/2, parentHeight = (childHeight + 1)/2.
- All images must have the 8-bit gray pixel format. The size of the difference image is equal to that of the child image.
-
- \note This function has a C++ wrapper: Simd::SegmentationPropagate2x2(const View & parent, View & child, const View & difference, uint8_t currentIndex, uint8_t invalidIndex, uint8_t emptyIndex, uint8_t thresholdDifference).
-
- \param [in] parent - a pointer to pixels data of 8-bit gray parent mask image.
- \param [in] parentStride - a row size of the parent mask image.
- \param [in] width - a parent mask width.
- \param [in] height - a parent mask height.
- \param [in, out] child - a pointer to pixels data of 8-bit gray child mask image.
- \param [in] childStride - a row size of the child mask image.
- \param [in] difference - a pointer to pixels data of 8-bit gray difference image.
- \param [in] differenceStride - a row size of the difference image.
- \param [in] currentIndex - the propagated mask index.
- \param [in] invalidIndex - the invalid mask index.
- \param [in] emptyIndex - the empty mask index.
- \param [in] differenceThreshold - a difference threshold for conditional index propagation.
- */
- SIMD_API void SimdSegmentationPropagate2x2(const uint8_t * parent, size_t parentStride, size_t width, size_t height,
- uint8_t * child, size_t childStride, const uint8_t * difference, size_t differenceStride,
- uint8_t currentIndex, uint8_t invalidIndex, uint8_t emptyIndex, uint8_t differenceThreshold);
-
- /*! @ingroup segmentation
-
- \fn void SimdSegmentationShrinkRegion(const uint8_t * mask, size_t stride, size_t width, size_t height, uint8_t index, ptrdiff_t * left, ptrdiff_t * top, ptrdiff_t * right, ptrdiff_t * bottom);
-
- \short Finds the actual region containing pixels with the given mask index.
-
- The mask must have the 8-bit gray pixel format.
-
- \note This function has a C++ wrapper: Simd::SegmentationShrinkRegion(const View & mask, uint8_t index, Rectangle & rect).
-
- \param [in] mask - a pointer to pixels data of 8-bit gray mask image.
- \param [in] stride - a row size of the mask image.
- \param [in] width - a mask width.
- \param [in] height - a mask height.
- \param [in] index - a mask index.
- \param [in, out] left - a pointer to the left side.
- \param [in, out] top - a pointer to the top side.
- \param [in, out] right - a pointer to the right side.
- \param [in, out] bottom - a pointer to the bottom side.
- */
- SIMD_API void SimdSegmentationShrinkRegion(const uint8_t * mask, size_t stride, size_t width, size_t height, uint8_t index,
- ptrdiff_t * left, ptrdiff_t * top, ptrdiff_t * right, ptrdiff_t * bottom);
-
- /*! @ingroup shifting
-
- \fn void SimdShiftBilinear(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t channelCount, const uint8_t * bkg, size_t bkgStride, const double * shiftX, const double * shiftY, size_t cropLeft, size_t cropTop, size_t cropRight, size_t cropBottom, uint8_t * dst, size_t dstStride);
-
- \short Performs shifting of the input image using bilinear interpolation.
-
- All images must have the same width, height and format (8-bit gray, 16-bit UV, 24-bit BGR or 32-bit BGRA).
-
- \note This function has a C++ wrapper: Simd::ShiftBilinear(const View & src, const View & bkg, const Point & shift, const Rectangle & crop, View & dst).
-
- \param [in] src - a pointer to pixels data of the foreground input image.
- \param [in] srcStride - a row size of the input image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [in] channelCount - a channel count.
- \param [in] bkg - a pointer to pixels data of the background input image.
- \param [in] bkgStride - a row size of the background image.
- \param [in] shiftX - a pointer to the image shift along the x axis.
- \param [in] shiftY - a pointer to the image shift along the y axis.
- \param [in] cropLeft - a crop left side.
- \param [in] cropTop - a crop top side.
- \param [in] cropRight - a crop right side.
- \param [in] cropBottom - a crop bottom side.
- \param [out] dst - a pointer to pixels data of the output image.
- \param [in] dstStride - a row size of the output image.
- */
- SIMD_API void SimdShiftBilinear(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t channelCount,
- const uint8_t * bkg, size_t bkgStride, const double * shiftX, const double * shiftY,
- size_t cropLeft, size_t cropTop, size_t cropRight, size_t cropBottom, uint8_t * dst, size_t dstStride);
-
- /*! @ingroup sobel_filter
-
- \fn void SimdSobelDx(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride);
-
- \short Calculates Sobel's filter along the x axis.
-
- All images must have the same width and height. The input image must have the 8-bit gray format, the output image must have the 16-bit integer format.
-
- For every point:
- \n dst[x, y] = (src[x+1,y-1] + 2*src[x+1, y] + src[x+1, y+1]) - (src[x-1,y-1] + 2*src[x-1, y] + src[x-1, y+1]).
-
- \note This function has a C++ wrapper: Simd::SobelDx(const View& src, View& dst).
-
- \param [in] src - a pointer to pixels data of the input image.
- \param [in] srcStride - a row size of the input image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [out] dst - a pointer to pixels data of the output image.
- \param [in] dstStride - a row size of the output image (in bytes).
- */
- SIMD_API void SimdSobelDx(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride);
-
- /*! @ingroup sobel_filter
-
- \fn void SimdSobelDxAbs(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride);
-
- \short Calculates the absolute value of Sobel's filter along the x axis.
-
- All images must have the same width and height. The input image must have the 8-bit gray format, the output image must have the 16-bit integer format.
-
- For every point:
- \verbatim
- dst[x, y] = abs((src[x+1,y-1] + 2*src[x+1, y] + src[x+1, y+1]) - (src[x-1,y-1] + 2*src[x-1, y] + src[x-1, y+1]));
- \endverbatim
-
- \note This function has a C++ wrapper: Simd::SobelDxAbs(const View& src, View& dst).
-
- \param [in] src - a pointer to pixels data of the input image.
- \param [in] srcStride - a row size of the input image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [out] dst - a pointer to pixels data of the output image.
- \param [in] dstStride - a row size of the output image (in bytes).
- */
- SIMD_API void SimdSobelDxAbs(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride);
-
- /*! @ingroup sobel_statistic
-
- \fn void SimdSobelDxAbsSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum);
-
- \short Calculates the sum of the absolute value of Sobel's filter along the x axis.
-
- The input image must have the 8-bit gray format.
-
- For every point:
- \verbatim
- sum += abs((src[x+1,y-1] + 2*src[x+1, y] + src[x+1, y+1]) - (src[x-1,y-1] + 2*src[x-1, y] + src[x-1, y+1]));
- \endverbatim
-
- \note This function has a C++ wrapper: Simd::SobelDxAbsSum(const View& src, uint64_t & sum).
-
- \param [in] src - a pointer to pixels data of the input image.
- \param [in] stride - a row size of the input image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [out] sum - a pointer to unsigned 64-bit integer value with result sum.
- */
- SIMD_API void SimdSobelDxAbsSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum);
-
- /*! @ingroup sobel_filter
-
- \fn void SimdSobelDy(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride);
-
- \short Calculates Sobel's filter along the y axis.
-
- All images must have the same width and height. The input image must have the 8-bit gray format, the output image must have the 16-bit integer format.
-
- For every point:
- \verbatim
- dst[x, y] = (src[x-1,y+1] + 2*src[x, y+1] + src[x+1, y+1]) - (src[x-1,y-1] + 2*src[x, y-1] + src[x+1, y-1]);
- \endverbatim
-
- \note This function has a C++ wrapper: Simd::SobelDy(const View& src, View& dst).
-
- \param [in] src - a pointer to pixels data of the input image.
- \param [in] srcStride - a row size of the input image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [out] dst - a pointer to pixels data of the output image.
- \param [in] dstStride - a row size of the output image (in bytes).
- */
- SIMD_API void SimdSobelDy(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride);
-
- /*! @ingroup sobel_filter
-
- \fn void SimdSobelDyAbs(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride);
-
- \short Calculates the absolute value of Sobel's filter along the y axis.
-
- All images must have the same width and height. The input image must have the 8-bit gray format, the output image must have the 16-bit integer format.
-
- For every point:
- \verbatim
- dst[x, y] = abs((src[x-1,y+1] + 2*src[x, y+1] + src[x+1, y+1]) - (src[x-1,y-1] + 2*src[x, y-1] + src[x+1, y-1]));
- \endverbatim
-
- \note This function has a C++ wrapper: Simd::SobelDyAbs(const View& src, View& dst).
-
- \param [in] src - a pointer to pixels data of the input image.
- \param [in] srcStride - a row size of the input image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [out] dst - a pointer to pixels data of the output image.
- \param [in] dstStride - a row size of the output image (in bytes).
- */
- SIMD_API void SimdSobelDyAbs(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride);
-
- /*! @ingroup sobel_statistic
-
- \fn void SimdSobelDyAbsSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum);
-
- \short Calculates the sum of the absolute value of Sobel's filter along the y axis.
-
- The input image must have the 8-bit gray format.
-
- For every point:
- \verbatim
- sum += abs((src[x-1,y+1] + 2*src[x, y+1] + src[x+1, y+1]) - (src[x-1,y-1] + 2*src[x, y-1] + src[x+1, y-1]));
- \endverbatim
-
- \note This function has a C++ wrapper: Simd::SobelDyAbsSum(const View& src, uint64_t & sum).
-
- \param [in] src - a pointer to pixels data of the input image.
- \param [in] stride - a row size of the input image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [out] sum - a pointer to unsigned 64-bit integer value with result sum.
- */
- SIMD_API void SimdSobelDyAbsSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum);
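-
- /* Usage sketch (illustrative, not part of the original header): computing the
-    horizontal Sobel derivative of an 8-bit gray image. The output pixels are
-    16-bit integers, so the output row stride in bytes is width*sizeof(int16_t). */
- static void SobelDxExample(void)
- {
-     enum { W = 8, H = 8 };
-     uint8_t gray[W * H] = { 0 };                  /* 8-bit gray input */
-     int16_t grad[W * H];                          /* 16-bit output gradients */
-     SimdSobelDx(gray, W, W, H, (uint8_t*)grad, W * sizeof(int16_t));
-     /* grad[y*W + x] now holds the signed x-derivative at (x, y). */
- }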
- /*! @ingroup contour
-
- \fn void SimdContourMetrics(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride)
-
- \short Calculates contour metrics based on the absolute value and direction of Sobel's filter along the x and y axes.
-
- All images must have the same width and height. The input image must have the 8-bit gray format, the output image must have the 16-bit integer format.
- This function is used for contour extraction.
-
- For every point:
- \verbatim
- dy = abs((src[x-1,y+1] + 2*src[x, y+1] + src[x+1, y+1]) - (src[x-1,y-1] + 2*src[x, y-1] + src[x+1, y-1]));
- dx = abs((src[x+1,y-1] + 2*src[x+1, y] + src[x+1, y+1]) - (src[x-1,y-1] + 2*src[x-1, y] + src[x-1, y+1]));
- dst[x, y] = (dx + dy)*2 + (dx >= dy ? 0 : 1);
- \endverbatim
-
- \note This function has a C++ wrapper: Simd::ContourMetrics(const View& src, View& dst).
-
- \param [in] src - a pointer to pixels data of the gray 8-bit input image.
- \param [in] srcStride - a row size of the input image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [out] dst - a pointer to pixels data of the output 16-bit image.
- \param [in] dstStride - a row size of the output image (in bytes).
- */
- SIMD_API void SimdContourMetrics(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride);
-
- /*! @ingroup contour
-
- \fn void SimdContourMetricsMasked(const uint8_t * src, size_t srcStride, size_t width, size_t height, const uint8_t * mask, size_t maskStride, uint8_t indexMin, uint8_t * dst, size_t dstStride)
-
- \short Calculates contour metrics based on the absolute value and direction of Sobel's filter along the x and y axes, using a mask.
-
- All images must have the same width and height. The input image must have the 8-bit gray format, the output image must have the 16-bit integer format.
- This function is used for contour extraction.
-
- For every point:
- \verbatim
- dy = abs((src[x-1,y+1] + 2*src[x, y+1] + src[x+1, y+1]) - (src[x-1,y-1] + 2*src[x, y-1] + src[x+1, y-1]));
- dx = abs((src[x+1,y-1] + 2*src[x+1, y] + src[x+1, y+1]) - (src[x-1,y-1] + 2*src[x-1, y] + src[x-1, y+1]));
- dst[x, y] = mask[x, y] < indexMin ? 0 : (dx + dy)*2 + (dx >= dy ? 0 : 1);
- \endverbatim
-
- \note This function has a C++ wrapper: Simd::ContourMetrics(const View& src, const View& mask, uint8_t indexMin, View& dst).
-
- \param [in] src - a pointer to pixels data of the gray 8-bit input image.
- \param [in] srcStride - a row size of the input image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [in] mask - a pointer to pixels data of the mask 8-bit image.
- \param [in] maskStride - a row size of the mask image.
- \param [in] indexMin - a minimal permissible mask index.
- \param [out] dst - a pointer to pixels data of the output 16-bit image.
- \param [in] dstStride - a row size of the output image (in bytes).
- */
- SIMD_API void SimdContourMetricsMasked(const uint8_t * src, size_t srcStride, size_t width, size_t height,
- const uint8_t * mask, size_t maskStride, uint8_t indexMin, uint8_t * dst, size_t dstStride);
-
- /*! @ingroup contour
-
- \fn void SimdContourAnchors(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t step, int16_t threshold, uint8_t * dst, size_t dstStride);
-
- \short Extracts contour anchors from contour metrics.
-
- All images must have the same width and height. The input image must have the 16-bit integer format, the output image must have the 8-bit gray format.
- The input image with metrics can be computed using the ::SimdContourMetrics or ::SimdContourMetricsMasked functions.
- This function is used for contour extraction.
-
- For every point (except the border):
- \verbatim
- a[x, y] = src[x, y] >> 1.
- if(src[x, y] & 1)
- dst[x, y] = a[x, y] > 0 && (a[x, y] - a[x + 1, y] >= threshold) && (a[x, y] - a[x - 1, y] >= threshold) ? 255 : 0;
- else
- dst[x, y] = a[x, y] > 0 && (a[x, y] - a[x, y + 1] >= threshold) && (a[x, y] - a[x, y - 1] >= threshold) ? 255 : 0;
- \endverbatim
-
- \note This function has a C++ wrapper: Simd::ContourAnchors(const View& src, size_t step, int16_t threshold, View& dst).
-
- \param [in] src - a pointer to pixels data of the 16-bit input image.
- \param [in] srcStride - a row size of the input image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [in] step - a row step (to skip some rows).
- \param [in] threshold - a threshold of anchor creation.
- \param [out] dst - a pointer to pixels data of the output 8-bit gray image.
- \param [in] dstStride - a row size of the output image (in bytes).
- */
- SIMD_API void SimdContourAnchors(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t step, int16_t threshold, uint8_t * dst, size_t dstStride);
-
- /*! @ingroup correlation
-
- \fn void SimdSquaredDifferenceSum(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, size_t width, size_t height, uint64_t * sum);
-
- \short Calculates the sum of squared differences for two 8-bit gray images.
-
- All images must have the same width and height.
-
- For every point:
- \verbatim
- sum += (a[i] - b[i])*(a[i] - b[i]);
- \endverbatim
-
- \note This function has a C++ wrapper: Simd::SquaredDifferenceSum(const View& a, const View& b, uint64_t & sum).
-
- \param [in] a - a pointer to pixels data of the first image.
- \param [in] aStride - a row size of the first image.
- \param [in] b - a pointer to pixels data of the second image.
- \param [in] bStride - a row size of the second image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [out] sum - a pointer to unsigned 64-bit integer value with result sum.
- */
- SIMD_API void SimdSquaredDifferenceSum(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride,
- size_t width, size_t height, uint64_t * sum);
-
- /*! @ingroup correlation
-
- \fn void SimdSquaredDifferenceSumMasked(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, const uint8_t * mask, size_t maskStride, uint8_t index, size_t width, size_t height, uint64_t * sum);
-
- \short Calculates the sum of squared differences for two images, using a mask.
-
- All images must have the same width, height and format (8-bit gray).
-
- For every point:
- \verbatim
- if(mask[i] == index)
- sum += (a[i] - b[i])*(a[i] - b[i]);
- \endverbatim
-
- \note This function has a C++ wrapper: Simd::SquaredDifferenceSum(const View& a, const View& b, const View& mask, uint8_t index, uint64_t & sum).
-
- \param [in] a - a pointer to pixels data of the first image.
- \param [in] aStride - a row size of the first image.
- \param [in] b - a pointer to pixels data of the second image.
- \param [in] bStride - a row size of the second image.
- \param [in] mask - a pointer to pixels data of the mask image.
- \param [in] maskStride - a row size of the mask image.
- \param [in] index - a mask index.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [out] sum - a pointer to unsigned 64-bit integer value with result sum.
- */
- SIMD_API void SimdSquaredDifferenceSumMasked(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride,
- const uint8_t * mask, size_t maskStride, uint8_t index, size_t width, size_t height, uint64_t * sum);
-
- /*! @ingroup correlation
-
- \fn void SimdSquaredDifferenceSum32f(const float * a, const float * b, size_t size, float * sum);
-
- \short Calculates the sum of squared differences for two 32-bit float arrays.
-
- All arrays must have the same size.
-
- For every element:
- \verbatim
- sum += (a[i] - b[i])*(a[i] - b[i]);
- \endverbatim
-
- \param [in] a - a pointer to the first array.
- \param [in] b - a pointer to the second array.
- \param [in] size - a size of the arrays.
- \param [out] sum - a sum of squared differences.
- */
- SIMD_API void SimdSquaredDifferenceSum32f(const float * a, const float * b, size_t size, float * sum);
-
- /*! @ingroup correlation
-
- \fn void SimdSquaredDifferenceKahanSum32f(const float * a, const float * b, size_t size, float * sum);
-
- \short Calculates the sum of squared differences for two 32-bit float arrays, using the Kahan summation algorithm.
-
- All arrays must have the same size.
-
- Algorithm pseudocode:
- \verbatim
- sum = 0; corr = 0;
- for(i = 0; i < size; ++i)
- {
- diff = (a[i] - b[i])*(a[i] - b[i]) - corr;
- temp = sum + diff;
- corr = (temp - sum) - diff;
- sum = temp;
- }
- \endverbatim
-
- \param [in] a - a pointer to the first array.
- \param [in] b - a pointer to the second array.
- \param [in] size - a size of the arrays.
- \param [out] sum - a sum of squared differences.
- */
- SIMD_API void SimdSquaredDifferenceKahanSum32f(const float * a, const float * b, size_t size, float * sum);
-
- /*! @ingroup other_statistic
-
- \fn void SimdGetStatistic(const uint8_t * src, size_t stride, size_t width, size_t height, uint8_t * min, uint8_t * max, uint8_t * average);
-
- \short Finds the minimal, maximal and average pixel values for the given image.
-
- The image must have the 8-bit gray format.
-
- \note This function has a C++ wrapper: Simd::GetStatistic(const View& src, uint8_t & min, uint8_t & max, uint8_t & average).
-
- \param [in] src - a pointer to pixels data of the input image.
- \param [in] stride - a row size of the image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [out] min - a pointer to unsigned 8-bit integer value with found minimal pixel value.
- \param [out] max - a pointer to unsigned 8-bit integer value with found maximal pixel value.
- \param [out] average - a pointer to unsigned 8-bit integer value with found average pixel value.
- */
- SIMD_API void SimdGetStatistic(const uint8_t * src, size_t stride, size_t width, size_t height,
- uint8_t * min, uint8_t * max, uint8_t * average);
-
- /*! @ingroup other_statistic
-
- \fn void SimdGetMoments(const uint8_t * mask, size_t stride, size_t width, size_t height, uint8_t index, uint64_t * area, uint64_t * x, uint64_t * y, uint64_t * xx, uint64_t * xy, uint64_t * yy);
-
- \short Calculates statistical characteristics (moments) of pixels with the given index.
-
- The image must have the 8-bit gray format.
-
- For every point:
- \verbatim
- if(mask[X, Y] == index)
- {
- area += 1;
- x += X;
- y += Y;
- xx += X*X;
- xy += X*Y;
- yy += Y*Y;
- }
- \endverbatim
-
- \note This function has a C++ wrapper: Simd::GetMoments(const View& mask, uint8_t index, uint64_t & area, uint64_t & x, uint64_t & y, uint64_t & xx, uint64_t & xy, uint64_t & yy).
-
- \param [in] mask - a pointer to pixels data of the mask image.
- \param [in] stride - a row size of the mask image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [in] index - a mask index.
- \param [out] area - a pointer to unsigned 64-bit integer value with found area (number of pixels with given index).
- \param [out] x - a pointer to unsigned 64-bit integer value with found first-order moment x.
- \param [out] y - a pointer to unsigned 64-bit integer value with found first-order moment y.
- \param [out] xx - a pointer to unsigned 64-bit integer value with found second-order moment xx.
- \param [out] xy - a pointer to unsigned 64-bit integer value with found second-order moment xy.
- \param [out] yy - a pointer to unsigned 64-bit integer value with found second-order moment yy.
- */
- SIMD_API void SimdGetMoments(const uint8_t * mask, size_t stride, size_t width, size_t height, uint8_t index,
- uint64_t * area, uint64_t * x, uint64_t * y, uint64_t * xx, uint64_t * xy, uint64_t * yy);
-
- /*! @ingroup other_statistic
-
- \fn void SimdGetObjectMoments(const uint8_t * src, size_t srcStride, size_t width, size_t height, const uint8_t * mask, size_t maskStride, uint8_t index, uint64_t * n, uint64_t * s, uint64_t * sx, uint64_t * sy, uint64_t * sxx, uint64_t * sxy, uint64_t * syy);
-
- \short Calculates statistical characteristics (moments) of the given object.
-
- The images must have the 8-bit gray format and equal size. One of them can be NULL.
-
- For every point:
- \verbatim
- if(mask[X, Y] == index || mask == 0)
- {
- S = src ? src[X, Y] : 1;
- n += 1;
- s += S;
- sx += S*X;
- sy += S*Y;
- sxx += S*X*X;
- sxy += S*X*Y;
- syy += S*Y*Y;
- }
- \endverbatim
-
- \note This function has a C++ wrapper: Simd::GetObjectMoments(const View & src, const View & mask, uint8_t index, uint64_t & n, uint64_t & s, uint64_t & sx, uint64_t & sy, uint64_t & sxx, uint64_t & sxy, uint64_t & syy).
-
- \param [in] src - a pointer to pixels data of the input image. Can be NULL (then its behaviour is equal to that of the function ::SimdGetMoments).
- \param [in] srcStride - a row size of the input image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [in] mask - a pointer to pixels data of the mask image. Can be NULL (then the moments are collected over the whole image).
- \param [in] maskStride - a row size of the mask image.
- \param [in] index - a mask index.
- \param [out] n - a pointer to unsigned 64-bit integer value with found area of the given object.
- \param [out] s - a pointer to unsigned 64-bit integer value with the sum of image values of the given object.
- \param [out] sx - a pointer to unsigned 64-bit integer value with found first-order moment x of the given object.
- \param [out] sy - a pointer to unsigned 64-bit integer value with found first-order moment y of the given object.
- \param [out] sxx - a pointer to unsigned 64-bit integer value with found second-order moment xx of the given object.
- \param [out] sxy - a pointer to unsigned 64-bit integer value with found second-order moment xy of the given object.
- \param [out] syy - a pointer to unsigned 64-bit integer value with found second-order moment yy of the given object.
- */
- SIMD_API void SimdGetObjectMoments(const uint8_t * src, size_t srcStride, size_t width, size_t height, const uint8_t * mask, size_t maskStride, uint8_t index,
- uint64_t * n, uint64_t * s, uint64_t * sx, uint64_t * sy, uint64_t * sxx, uint64_t * sxy, uint64_t * syy);
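-
- /* Usage sketch (illustrative, not part of the original header): computing the
-    centroid of all mask pixels with a given index from the first-order moments
-    returned by SimdGetMoments (centroid = (x/area, y/area) when area > 0). */
- static void CentroidExample(const uint8_t * mask, size_t stride, size_t width, size_t height, uint8_t index)
- {
-     uint64_t area = 0, x = 0, y = 0, xx = 0, xy = 0, yy = 0;
-     SimdGetMoments(mask, stride, width, height, index, &area, &x, &y, &xx, &xy, &yy);
-     if (area > 0)
-     {
-         double cx = (double)x / (double)area;    /* centroid x */
-         double cy = (double)y / (double)area;    /* centroid y */
-         (void)cx; (void)cy;
-     }
- }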
- /*! @ingroup row_statistic
-
- \fn void SimdGetRowSums(const uint8_t * src, size_t stride, size_t width, size_t height, uint32_t * sums);
-
- \short Calculates sums of rows for the given 8-bit gray image.
-
- For all rows:
- \verbatim
- for(x = 0; x < width; ++x)
- sums[y] += src[x, y];
- \endverbatim
-
- \note This function has a C++ wrapper: Simd::GetRowSums(const View& src, uint32_t * sums).
-
- \param [in] src - a pointer to pixels data of the input image.
- \param [in] stride - a row size of the input image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [out] sums - a pointer to the array of unsigned 32-bit integers with the resulting row sums. Its length must be equal to the image height.
- */
- SIMD_API void SimdGetRowSums(const uint8_t * src, size_t stride, size_t width, size_t height, uint32_t * sums);
-
- /*! @ingroup col_statistic
-
- \fn void SimdGetColSums(const uint8_t * src, size_t stride, size_t width, size_t height, uint32_t * sums);
-
- \short Calculates sums of columns for the given 8-bit gray image.
-
- For all columns:
- \verbatim
- for(y = 0; y < height; ++y)
- sums[x] += src[x, y];
- \endverbatim
-
- \note This function has a C++ wrapper: Simd::GetColSums(const View& src, uint32_t * sums).
-
- \param [in] src - a pointer to pixels data of the input image.
- \param [in] stride - a row size of the input image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [out] sums - a pointer to the array of unsigned 32-bit integers with the resulting column sums. Its length must be equal to the image width.
- */
- SIMD_API void SimdGetColSums(const uint8_t * src, size_t stride, size_t width, size_t height, uint32_t * sums);
-
- /*! @ingroup row_statistic
-
- \fn void SimdGetAbsDyRowSums(const uint8_t * src, size_t stride, size_t width, size_t height, uint32_t * sums);
-
- \short Calculates, for each row of the given 8-bit gray image, the sum of the absolute derivative along the y axis.
-
- For all rows except the last:
- \verbatim
- for(x = 0; x < width; ++x)
- sums[y] += abs(src[x, y+1] - src[x, y]);
- \endverbatim
- For the last row:
- \verbatim
- sums[height-1] = 0;
- \endverbatim
-
- \note This function has a C++ wrapper: Simd::GetAbsDyRowSums(const View& src, uint32_t * sums).
-
- \param [in] src - a pointer to pixels data of the input image.
- \param [in] stride - a row size of the input image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [out] sums - a pointer to the array of unsigned 32-bit integers with the resulting sums. Its length must be equal to the image height.
- */
- SIMD_API void SimdGetAbsDyRowSums(const uint8_t * src, size_t stride, size_t width, size_t height, uint32_t * sums);
-
- /*! @ingroup col_statistic
-
- \fn void SimdGetAbsDxColSums(const uint8_t * src, size_t stride, size_t width, size_t height, uint32_t * sums);
-
- \short Calculates, for each column of the given 8-bit gray image, the sum of the absolute derivative along the x axis.
-
- For all columns except the last:
- \verbatim
- for(y = 0; y < height; ++y)
- sums[x] += abs(src[x+1, y] - src[x, y]);
- \endverbatim
- For the last column:
- \verbatim
- sums[width-1] = 0;
- \endverbatim
-
- \note This function has a C++ wrapper: Simd::GetAbsDxColSums(const View& src, uint32_t * sums).
-
- \param [in] src - a pointer to pixels data of the input image.
- \param [in] stride - a row size of the input image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [out] sums - a pointer to the array of unsigned 32-bit integers with the resulting column sums. Its length must be equal to the image width.
- */
- SIMD_API void SimdGetAbsDxColSums(const uint8_t * src, size_t stride, size_t width, size_t height, uint32_t * sums);
-
- /*! @ingroup other_statistic
-
- \fn void SimdValueSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum);
-
- \short Gets the sum of pixel values for an 8-bit gray image.
-
- \note This function has a C++ wrapper: Simd::ValueSum(const View& src, uint64_t & sum).
-
- \param [in] src - a pointer to pixels data of the image.
- \param [in] stride - a row size of the image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [out] sum - the result sum.
- */
- SIMD_API void SimdValueSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum);
-
- /*! @ingroup other_statistic
-
- \fn void SimdSquareSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum);
-
- \short Gets the sum of squared pixel values for an 8-bit gray image.
-
- \note This function has a C++ wrapper: Simd::SquareSum(const View& src, uint64_t & sum).
-
- \param [in] src - a pointer to pixels data of the image.
- \param [in] stride - a row size of the image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [out] sum - the result sum.
- */
- SIMD_API void SimdSquareSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum);
-
- /*! @ingroup other_statistic
-
- \fn void SimdValueSquareSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * valueSum, uint64_t * squareSum);
-
- \short Gets the sum and the squared sum of pixel values for an 8-bit gray image.
-
- \note This function has a C++ wrapper: Simd::ValueSquareSum(const View& src, uint64_t & valueSum, uint64_t & squareSum).
-
- \param [in] src - a pointer to pixels data of the image.
- \param [in] stride - a row size of the image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [out] valueSum - the result value sum.
- \param [out] squareSum - the result square sum.
- */
- SIMD_API void SimdValueSquareSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * valueSum, uint64_t * squareSum);
-
- /*! @ingroup other_statistic
-
- \fn void SimdCorrelationSum(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, size_t width, size_t height, uint64_t * sum);
-
- \short Gets the sum of pixel correlations for two 8-bit gray images.
-
- For all points:
- \verbatim
- sum += a[i]*b[i];
- \endverbatim
-
- All images must have the same width and height and the 8-bit gray pixel format.
-
- \note This function has a C++ wrapper: Simd::CorrelationSum(const View & a, const View & b, uint64_t & sum).
-
- \param [in] a - a pointer to pixels data of the first image.
- \param [in] aStride - a row size of the first image.
- \param [in] b - a pointer to pixels data of the second image.
- \param [in] bStride - a row size of the second image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [out] sum - a pointer to the result sum.
- */
- SIMD_API void SimdCorrelationSum(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, size_t width, size_t height, uint64_t * sum);
-
- /*! @ingroup resizing
-
- \fn void SimdStretchGray2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride);
-
- \short Stretches the input 8-bit gray image two-fold.
-
- \note This function has a C++ wrapper: Simd::StretchGray2x2(const View& src, View& dst).
-
- \param [in] src - a pointer to pixels data of the original input image.
- \param [in] srcWidth - a width of the input image.
- \param [in] srcHeight - a height of the input image.
- \param [in] srcStride - a row size of the input image.
- \param [out] dst - a pointer to pixels data of the stretched output image.
- \param [in] dstWidth - a width of the output image.
- \param [in] dstHeight - a height of the output image.
- \param [in] dstStride - a row size of the output image.
- */
- SIMD_API void SimdStretchGray2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride,
- uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride);
-
- /*! @ingroup svm
-
- \fn void SimdSvmSumLinear(const float * x, const float * svs, const float * weights, size_t length, size_t count, float * sum);
-
- \short It is a part of the linear SVM (Support Vector Machine) prediction algorithm.
-
- Algorithm's details:
- \verbatim
- sum = 0;
- for(i = 0; i < count; ++i)
- for(j = 0; j < length; ++j)
- sum += x[j]*svs[j][i]*weights[i];
- \endverbatim
-
- \note The array with support vectors must have the following structure: svs[length][count].
-
- \param [in] x - a vector of features to predict with the SVM.
- \param [in] svs - an array with support vectors.
- \param [in] weights - a weight coefficient of each support vector.
- \param [in] length - a length of the feature vector and of each support vector.
- \param [in] count - a count of support vectors.
- \param [out] sum - a pointer to the result sum.
- */
- SIMD_API void SimdSvmSumLinear(const float * x, const float * svs, const float * weights, size_t length, size_t count, float * sum);
-
- /*! @ingroup synet
-
- \fn void SimdSynetAddBias(const float * bias, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format);
-
- \short Adds a bias to the given vector.
-
- Algorithm's details (example for the NCHW tensor format):
- \verbatim
- for(c = 0; c < channels; ++c)
- for(s = 0; s < spatial; ++s)
- dst[c*spatial + s] += bias[c];
- \endverbatim
-
- \note This function is used in the Synet Framework.
-
- \param [in] bias - a pointer to the 32-bit float array with bias coefficients. The size of the array is ::SimdAlign (channels, ::SimdSynetTensorAlignment (format)).
- \param [in] channels - a number of channels in the image tensor.
- \param [in] spatial - a spatial size of the image tensor.
- \param [in, out] dst - a pointer to the cumulative 32-bit float image tensor. The size of the array is ::SimdAlign (channels, ::SimdSynetTensorAlignment (format)) * spatial.
- \param [in] format - a format of the image tensor.
- */
- SIMD_API void SimdSynetAddBias(const float * bias, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format);
-
- /*! @ingroup synet_conversion
-
- \fn void SimdSynetConvert32fTo8u(const float * src, size_t batch, size_t channels, size_t height, size_t width, SimdTensorFormatType format, const float* scale, const float * shift, uint8_t * dst, SimdSynetCompatibilityType compatibility);
-
- \short Converts a 32-bit float image to an 8-bit unsigned integer image.
-
- \note This function is used in the Synet Framework.
-
- \param [in] src - a pointer to the 32-bit float array with the input image tensor.
- \param [in] batch - a number of images in the batch of the (input/output) image tensor.
- \param [in] channels - a number of channels in the (input/output) image tensor.
- \param [in] height - a height of the (input/output) image tensor.
- \param [in] width - a width of the (input/output) image tensor.
- \param [in] format - a format of the (input/output) image tensor.
- \param [in] scale - a pointer to the 32-bit float array with scale coefficients.
- \param [in] shift - a pointer to the 32-bit float array with shift coefficients.
- \param [out] dst - a pointer to the 8-bit unsigned integer array with the output image tensor.
- \param [in] compatibility - flags of bitwise compatibility.
- */
- SIMD_API void SimdSynetConvert32fTo8u(const float * src, size_t batch, size_t channels, size_t height, size_t width, SimdTensorFormatType format, const float* scale, const float * shift, uint8_t* dst, SimdSynetCompatibilityType compatibility);
-
- /*! @ingroup synet_convolution
-
- \fn void * SimdSynetConvolution32fInit(size_t batch, const SimdConvolutionParameters * conv, SimdGemm32fNNPtr gemm);
-
- \short Initializes the FP32 convolution algorithm.
-
- \param [in] batch - a batch size.
- \param [in] conv - a pointer to convolution parameters.
- \param [in] gemm - a pointer to an external matrix multiplication function. Can be NULL.
- \return a pointer to the FP32 convolution context. On error it returns NULL. It must be released using the function ::SimdRelease.
- This pointer is used in the functions ::SimdSynetConvolution32fExternalBufferSize, ::SimdSynetConvolution32fInternalBufferSize, ::SimdSynetConvolution32fSetParams and ::SimdSynetConvolution32fForward.
- */
- SIMD_API void * SimdSynetConvolution32fInit(size_t batch, const SimdConvolutionParameters * conv, SimdGemm32fNNPtr gemm);
-
- /*! @ingroup synet_convolution
-
- \fn size_t SimdSynetConvolution32fExternalBufferSize(const void * context);
-
- \short Gets the size of the external temporary buffer required for the FP32 convolution algorithm.
-
- \param [in] context - a pointer to the FP32 convolution context. It must be created by function ::SimdSynetConvolution32fInit and released by function ::SimdRelease.
- \return the size of the external temporary buffer required for the FP32 convolution algorithm.
- */
- SIMD_API size_t SimdSynetConvolution32fExternalBufferSize(const void * context);
-
- /*! @ingroup synet_convolution
-
- \fn size_t SimdSynetConvolution32fInternalBufferSize(const void * context);
-
- \short Gets the size of the internal buffer used inside the FP32 convolution algorithm.
-
- \param [in] context - a pointer to the FP32 convolution context. It must be created by function ::SimdSynetConvolution32fInit and released by function ::SimdRelease.
- \return the size of the internal buffer used inside the FP32 convolution algorithm.
- */
- SIMD_API size_t SimdSynetConvolution32fInternalBufferSize(const void * context);
-
- /*! @ingroup synet_convolution
-
- \fn void SimdSynetConvolution32fSetParams(void * context, const float * weight, SimdBool * internal, const float * bias, const float * params);
-
- \short Sets the weights, biases and activation function parameters required for the FP32 convolution algorithm.
-
- \param [in, out] context - a pointer to the FP32 convolution context. It must be created by function ::SimdSynetConvolution32fInit and released by function ::SimdRelease.
- \param [in] weight - a pointer to convolution weights.
- \param [out] internal - a flag signaling that the weights are stored in the internal buffer. Can be NULL.
- \param [in] bias - a pointer to bias. Can be NULL.
- \param [in] params - a pointer to parameters of activation functions (see ::SimdConvolutionActivationType). Can be NULL.
- */
- SIMD_API void SimdSynetConvolution32fSetParams(void * context, const float * weight, SimdBool * internal, const float * bias, const float * params);
-
- /*! @ingroup synet_convolution
-
- \fn void SimdSynetConvolution32fForward(void * context, const float * src, float * buf, float * dst);
-
- \short Performs forward propagation of the FP32 convolution algorithm.
-
- \param [in] context - a pointer to the FP32 convolution context. It must be created by function ::SimdSynetConvolution32fInit and released by function ::SimdRelease.
- \param [in] src - a pointer to the input tensor.
- \param [out] buf - a pointer to the external temporary buffer. The size of the external temporary buffer is determined by the function ::SimdSynetConvolution32fExternalBufferSize. Can be NULL (then an internal buffer is used).
- \param [out] dst - a pointer to the output tensor.
- */
- SIMD_API void SimdSynetConvolution32fForward(void * context, const float * src, float * buf, float * dst);
-
- /*! @ingroup synet_convolution
-
- \fn void * SimdSynetConvolution8iInit(size_t batch, const SimdConvolutionParameters * conv, SimdSynetCompatibilityType compatibility);
-
- \short Initializes the INT8 convolution algorithm.
-
- \param [in] batch - a batch size.
- \param [in] conv - a pointer to convolution parameters.
- \param [in] compatibility - flags of bitwise compatibility.
- \return a pointer to the INT8 convolution context. On error it returns NULL. It must be released using the function ::SimdRelease.
- This pointer is used in the functions ::SimdSynetConvolution8iExternalBufferSize, ::SimdSynetConvolution8iInternalBufferSize, ::SimdSynetConvolution8iSetParams and ::SimdSynetConvolution8iForward.
- */
- SIMD_API void * SimdSynetConvolution8iInit(size_t batch, const SimdConvolutionParameters * conv, SimdSynetCompatibilityType compatibility);
-
- /*! @ingroup synet_convolution
-
- \fn size_t SimdSynetConvolution8iExternalBufferSize(const void * context);
-
- \short Gets the size in bytes of the external temporary buffer required for the INT8 convolution algorithm.
-
- \param [in] context - a pointer to the INT8 convolution context. It must be created by function ::SimdSynetConvolution8iInit and released by function ::SimdRelease.
- \return the size of the external temporary buffer required for the INT8 convolution algorithm.
- */
- SIMD_API size_t SimdSynetConvolution8iExternalBufferSize(const void * context);
-
- /*! @ingroup synet_convolution
-
- \fn size_t SimdSynetConvolution8iInternalBufferSize(const void * context);
-
- \short Gets the size of the internal buffer used inside the INT8 convolution algorithm.
-
- \param [in] context - a pointer to the INT8 convolution context. It must be created by function ::SimdSynetConvolution8iInit and released by function ::SimdRelease.
- \return the size of the internal buffer used inside the INT8 convolution algorithm.
- */
- SIMD_API size_t SimdSynetConvolution8iInternalBufferSize(const void * context);
-
- /*! @ingroup synet_convolution
-
- \fn void SimdSynetConvolution8iSetParams(void * context, const float * weight, const float * bias, const float * params, const float * const * stats);
-
- \short Sets the weights, biases, activation function parameters and input/output tensor statistics required for the INT8 convolution algorithm.
-
- \param [in, out] context - a pointer to the INT8 convolution context. It must be created by function ::SimdSynetConvolution8iInit and released by function ::SimdRelease.
- \param [in] weight - a pointer to the original (32-bit float) convolution weights.
- \param [in] bias - a pointer to the original (32-bit float) bias. Can be NULL.
- \param [in] params - a pointer to original (32-bit floating-point) parameters of activation functions (see ::SimdConvolutionActivationType). Can be NULL.
- \param [in] stats - a pointer to pointers with statistics of input (min - stats[0], max - stats[1]) and output (min - stats[2], max - stats[3]) tensors.
- */
- SIMD_API void SimdSynetConvolution8iSetParams(void * context, const float * weight, const float * bias, const float * params, const float * const* stats);
-
- /*! @ingroup synet_convolution
-
- \fn void SimdSynetConvolution8iForward(void * context, const uint8_t * src, uint8_t * buf, uint8_t * dst);
-
- \short Performs forward propagation of INT8 convolution algorithm.
-
- \param [in] context - a pointer to INT8 convolution context. It must be created by function ::SimdSynetConvolution8iInit and released by function ::SimdRelease.
- \param [in] src - a pointer to input tensor.
- \param [out] buf - a pointer to external temporary buffer. The size of the external temporary buffer is determined by function ::SimdSynetConvolution8iExternalBufferSize. Can be NULL (in that case an internal buffer is used).
- \param [out] dst - a pointer to output tensor.
- */
- SIMD_API void SimdSynetConvolution8iForward(void * context, const uint8_t * src, uint8_t * buf, uint8_t * dst);
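The INT8 entry points follow the same create / set-params / forward / release life cycle as their FP32 counterparts above. Below is a minimal usage sketch (not part of the original header): the "Simd/SimdLib.h" include path, the zero compatibility value, and the omitted filling of SimdConvolutionParameters are assumptions.

\verbatim
#include "Simd/SimdLib.h"
#include <stdlib.h>

void Int8ConvolutionExample(const SimdConvolutionParameters * conv,
    const float * weight, const float * bias, const float * const * stats,
    const uint8_t * src, uint8_t * dst)
{
    /* Batch of 1; a zero compatibility mask is assumed to mean default behavior. */
    void * context = SimdSynetConvolution8iInit(1, conv, (SimdSynetCompatibilityType)0);
    if (context == NULL)
        return;
    /* Quantization is derived from FP32 weights, bias and input/output statistics. */
    SimdSynetConvolution8iSetParams(context, weight, bias, NULL, stats);
    /* An explicit external buffer; passing NULL would use the internal one. */
    uint8_t * buf = (uint8_t *)malloc(SimdSynetConvolution8iExternalBufferSize(context));
    SimdSynetConvolution8iForward(context, src, buf, dst);
    free(buf);
    SimdRelease(context);
}
\endverbatim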
-
- /*! @ingroup synet
-
- \fn void * SimdSynetDeconvolution32fInit(size_t batch, const SimdConvolutionParameters * conv, SimdGemm32fNNPtr gemm);
-
- \short Initializes FP32 deconvolution algorithm.
-
- \param [in] batch - a batch size.
- \param [in] conv - a pointer to deconvolution parameters.
- \param [in] gemm - a pointer to external function of matrix multiplication. Can be NULL.
- \return a pointer to FP32 deconvolution context. On error it returns NULL. It must be released with the function ::SimdRelease.
- This pointer is used in functions ::SimdSynetDeconvolution32fExternalBufferSize, ::SimdSynetDeconvolution32fInternalBufferSize, ::SimdSynetDeconvolution32fSetParams and ::SimdSynetDeconvolution32fForward.
- */
- SIMD_API void * SimdSynetDeconvolution32fInit(size_t batch, const SimdConvolutionParameters * conv, SimdGemm32fNNPtr gemm);
-
- /*! @ingroup synet
-
- \fn size_t SimdSynetDeconvolution32fExternalBufferSize(const void * context);
-
- \short Gets size of external temporary buffer required for FP32 deconvolution algorithm.
-
- \param [in] context - a pointer to FP32 deconvolution context. It must be created by function ::SimdSynetDeconvolution32fInit and released by function ::SimdRelease.
- \return size of external temporary buffer required for FP32 deconvolution algorithm.
- */
- SIMD_API size_t SimdSynetDeconvolution32fExternalBufferSize(const void * context);
-
- /*! @ingroup synet
-
- \fn size_t SimdSynetDeconvolution32fInternalBufferSize(const void * context);
-
- \short Gets size of internal buffer used inside FP32 deconvolution algorithm.
-
- \param [in] context - a pointer to FP32 deconvolution context. It must be created by function ::SimdSynetDeconvolution32fInit and released by function ::SimdRelease.
- \return size of internal buffer used inside FP32 deconvolution algorithm.
- */
- SIMD_API size_t SimdSynetDeconvolution32fInternalBufferSize(const void * context);
-
- /*! @ingroup synet
-
- \fn void SimdSynetDeconvolution32fSetParams(void * context, const float * weight, SimdBool * internal, const float * bias, const float * params);
-
- \short Sets weights, biases and parameters of activation function required for FP32 deconvolution algorithm.
-
- \param [in, out] context - a pointer to FP32 deconvolution context. It must be created by function ::SimdSynetDeconvolution32fInit and released by function ::SimdRelease.
- \param [in] weight - a pointer to deconvolution weights.
- \param [out] internal - a flag signaling that the weights are stored in the internal buffer. Can be NULL.
- \param [in] bias - a pointer to bias. Can be NULL.
- \param [in] params - a pointer to parameters of activation functions (see ::SimdConvolutionActivationType). Can be NULL.
- */
- SIMD_API void SimdSynetDeconvolution32fSetParams(void * context, const float * weight, SimdBool * internal, const float * bias, const float * params);
-
- /*! @ingroup synet
-
- \fn void SimdSynetDeconvolution32fForward(void * context, const float * src, float * buf, float * dst);
-
- \short Performs forward propagation of FP32 deconvolution algorithm.
-
- \param [in] context - a pointer to FP32 deconvolution context. It must be created by function ::SimdSynetDeconvolution32fInit and released by function ::SimdRelease.
- \param [in] src - a pointer to input tensor.
- \param [out] buf - a pointer to external temporary buffer. The size of the external temporary buffer is determined by function ::SimdSynetDeconvolution32fExternalBufferSize. Can be NULL (in that case an internal buffer is used).
- \param [out] dst - a pointer to output tensor.
- */
- SIMD_API void SimdSynetDeconvolution32fForward(void * context, const float * src, float * buf, float * dst);
-
- /*! @ingroup synet
-
- \fn void SimdSynetEltwiseLayerForward(float const * const * src, const float * weight, size_t count, size_t size, SimdSynetEltwiseOperationType type, float * dst);
-
- \short This function is used for forward propagation of EltwiseLayer.
-
- Algorithm's details for ::SimdSynetEltwiseOperationProduct:
- \verbatim
- for(j = 0; j < size; ++j)
-     dst[j] = 1;
- for(i = 0; i < count; ++i)
-     for(j = 0; j < size; ++j)
-         dst[j] *= src[i][j];
- \endverbatim
-
- Algorithm's details for ::SimdSynetEltwiseOperationSum:
- \verbatim
- for(j = 0; j < size; ++j)
-     dst[j] = 0;
- for(i = 0; i < count; ++i)
-     for(j = 0; j < size; ++j)
-         dst[j] += src[i][j]*weight[i];
- \endverbatim
-
- Algorithm's details for ::SimdSynetEltwiseOperationMax:
- \verbatim
- for(j = 0; j < size; ++j)
-     dst[j] = -FLT_MAX;
- for(i = 0; i < count; ++i)
-     for(j = 0; j < size; ++j)
-         dst[j] = Max(dst[j], src[i][j]);
- \endverbatim
-
- Algorithm's details for ::SimdSynetEltwiseOperationMin:
- \verbatim
- for(j = 0; j < size; ++j)
-     dst[j] = FLT_MAX;
- for(i = 0; i < count; ++i)
-     for(j = 0; j < size; ++j)
-         dst[j] = Min(dst[j], src[i][j]);
- \endverbatim
-
- \note This function is used in Synet Framework.
-
- \param [in] src - a pointer to pointers to the input 32-bit float arrays.
- \param [in] weight - a pointer to the 32-bit float array with sum coefficients. It is needed only for the ::SimdSynetEltwiseOperationSum operation type; otherwise it can be NULL.
- \param [in] count - a count of input arrays. Must be at least 2.
- \param [in] size - a size of the input and output arrays.
- \param [in] type - a type of operation (see ::SimdSynetEltwiseOperationType).
- \param [out] dst - a pointer to the output 32-bit float array.
- */
- SIMD_API void SimdSynetEltwiseLayerForward(float const * const * src, const float * weight, size_t count, size_t size, SimdSynetEltwiseOperationType type, float * dst);
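For example, a two-input weighted sum (the ::SimdSynetEltwiseOperationSum branch of the pseudocode above) can be invoked like this; a minimal sketch, with illustrative sizes and coefficients:

\verbatim
#include "Simd/SimdLib.h"

void EltwiseSumExample(const float * a, const float * b, size_t size, float * dst)
{
    const float * src[2] = { a, b };
    const float weight[2] = { 0.5f, 0.5f }; /* used by the Sum operation only */
    SimdSynetEltwiseLayerForward(src, weight, 2, size, SimdSynetEltwiseOperationSum, dst);
}
\endverbatim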
-
- /*! @ingroup synet_activation
-
- \fn void SimdSynetElu32f(const float * src, size_t size, const float * alpha, float * dst);
-
- \short Calculates ELU activation function for 32-bit float array.
-
- The input and output arrays must have the same size.
-
- Algorithm's details:
- \verbatim
- for(i = 0; i < size; ++i)
-     dst[i] = src[i] >= 0 ? src[i] : alpha*(Exp(src[i]) - 1);
- \endverbatim
-
- \note This function is used in Synet Framework.
-
- \param [in] src - a pointer to the input 32-bit float array.
- \param [in] size - a size of input and output arrays.
- \param [in] alpha - a pointer to alpha parameter.
- \param [out] dst - a pointer to the output 32-bit float array.
- */
- SIMD_API void SimdSynetElu32f(const float * src, size_t size, const float * alpha, float * dst);
-
- /*! @ingroup synet
-
- \fn void SimdSynetFusedLayerForward0(const float * src, const float * bias, const float * scale, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format);
-
- \short This function is used for forward propagation of FusedLayer (type 0).
-
- Algorithm's details (example for NCHW tensor format):
- \verbatim
- for(c = 0; c < channels; ++c)
-     for(s = 0; s < spatial; ++s)
-     {
-         o = c*spatial + s;
-         x = src[o] + bias[c];
-         dst[o] = (x - abs(x))*scale[c] + max(0, x);
-     }
- \endverbatim
-
- \note This function is used in Synet Framework.
-
- \param [in] src - a pointer to the 32-bit float array with input image tensor. The size of the array is ::SimdAlign (channels, ::SimdSynetTensorAlignment (format)) * spatial.
- \param [in] bias - a pointer to the 32-bit float array with bias coefficients. The size of the array is ::SimdAlign (channels, ::SimdSynetTensorAlignment (format)).
- \param [in] scale - a pointer to the 32-bit float array with scale coefficients. The size of the array is ::SimdAlign (channels, ::SimdSynetTensorAlignment (format)).
- \param [in] channels - a number of channels in the (input/output) image tensor.
- \param [in] spatial - a spatial size of (input/output) image tensor.
- \param [out] dst - a pointer to the 32-bit float array with output image tensor. The size of the array is ::SimdAlign (channels, ::SimdSynetTensorAlignment (format)) * spatial.
- \param [in] format - a format of (input/output) image tensor.
- */
- SIMD_API void SimdSynetFusedLayerForward0(const float * src, const float * bias, const float * scale, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format);
-
- /*! @ingroup synet
-
- \fn void SimdSynetFusedLayerForward1(const float * src, const float * bias0, const float * scale1, const float * bias1, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format);
-
- \short This function is used for forward propagation of FusedLayer (type 1).
-
- Algorithm's details (example for NCHW tensor format):
- \verbatim
- for(c = 0; c < channels; ++c)
-     for(s = 0; s < spatial; ++s)
-     {
-         o = c*spatial + s;
-         x = src[o] + bias0[c];
-         dst[o] = max(0, -x)*scale1[c] + bias1[c] + max(0, x);
-     }
- \endverbatim
-
- \note This function is used in Synet Framework.
-
- \param [in] src - a pointer to the 32-bit float array with input image tensor. The size of the array is ::SimdAlign (channels, ::SimdSynetTensorAlignment (format)) * spatial.
- \param [in] bias0 - a pointer to the 32-bit float array with bias0 coefficients. The size of the array is ::SimdAlign (channels, ::SimdSynetTensorAlignment (format)).
- \param [in] scale1 - a pointer to the 32-bit float array with scale1 coefficients. The size of the array is ::SimdAlign (channels, ::SimdSynetTensorAlignment (format)).
- \param [in] bias1 - a pointer to the 32-bit float array with bias1 coefficients. The size of the array is ::SimdAlign (channels, ::SimdSynetTensorAlignment (format)).
- \param [in] channels - a number of channels in the (input/output) image tensor.
- \param [in] spatial - a spatial size of (input/output) image tensor.
- \param [out] dst - a pointer to the 32-bit float array with output image tensor. The size of the array is ::SimdAlign (channels, ::SimdSynetTensorAlignment (format)) * spatial.
- \param [in] format - a format of (input/output) image tensor.
- */
- SIMD_API void SimdSynetFusedLayerForward1(const float * src, const float * bias0, const float * scale1, const float * bias1, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format);
-
- /*! @ingroup synet
-
- \fn void SimdSynetFusedLayerForward2(const float * src, const float * scale, const float * bias, size_t channels, size_t spatial, const float * slope, float * dst, SimdTensorFormatType format);
-
- \short This function is used for forward propagation of FusedLayer (type 2).
-
- Algorithm's details (example for NCHW tensor format):
- \verbatim
- for(c = 0; c < channels; ++c)
-     for(s = 0; s < spatial; ++s)
-     {
-         o = c*spatial + s;
-         x = src[o]*scale[c] + bias[c];
-         dst[o] = max(0, x) + min(0, x)*slope[0];
-     }
- \endverbatim
-
- \note This function is used in Synet Framework.
-
- \param [in] src - a pointer to the 32-bit float array with input image tensor. The size of the array is ::SimdAlign (channels, ::SimdSynetTensorAlignment (format)) * spatial.
- \param [in] scale - a pointer to the 32-bit float array with scale coefficients. The size of the array is ::SimdAlign (channels, ::SimdSynetTensorAlignment (format)).
- \param [in] bias - a pointer to the 32-bit float array with bias coefficients. The size of the array is ::SimdAlign (channels, ::SimdSynetTensorAlignment (format)).
- \param [in] channels - a number of channels in the (input/output) image tensor.
- \param [in] spatial - a spatial size of (input/output) image tensor.
- \param [in] slope - a pointer to the 32-bit float slope coefficient.
- \param [out] dst - a pointer to the 32-bit float array with output image tensor. The size of the array is ::SimdAlign (channels, ::SimdSynetTensorAlignment (format)) * spatial.
- \param [in] format - a format of (input/output) image tensor.
- */
- SIMD_API void SimdSynetFusedLayerForward2(const float * src, const float * scale, const float * bias, size_t channels, size_t spatial, const float * slope, float * dst, SimdTensorFormatType format);
-
- /*! @ingroup synet
-
- \fn void SimdSynetFusedLayerForward3(const float * src, const float * scale, const float * bias, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format);
-
- \short This function is used for forward propagation of FusedLayer (type 3).
-
- Algorithm's details (example for NCHW tensor format):
- \verbatim
- for(c = 0; c < channels; ++c)
-     for(s = 0; s < spatial; ++s)
-     {
-         o = c*spatial + s;
-         x = src[o] + bias[c];
-         dst[o] = max(0, x) + min(0, x)*scale[c];
-     }
- \endverbatim
-
- \note This function is used in Synet Framework.
-
- \param [in] src - a pointer to the 32-bit float array with input image tensor. The size of the array is ::SimdAlign (channels, ::SimdSynetTensorAlignment (format)) * spatial.
- \param [in] bias - a pointer to the 32-bit float array with bias coefficients. The size of the array is ::SimdAlign (channels, ::SimdSynetTensorAlignment (format)).
- \param [in] scale - a pointer to the 32-bit float array with scale coefficients. The size of the array is ::SimdAlign (channels, ::SimdSynetTensorAlignment (format)).
- \param [in] channels - a number of channels in the (input/output) image tensor.
- \param [in] spatial - a spatial size of (input/output) image tensor.
- \param [out] dst - a pointer to the 32-bit float array with output image tensor. The size of the array is ::SimdAlign (channels, ::SimdSynetTensorAlignment (format)) * spatial.
- \param [in] format - a format of (input/output) image tensor.
- */
- SIMD_API void SimdSynetFusedLayerForward3(const float * src, const float * scale, const float * bias, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format);
-
- /*! @ingroup synet
-
- \fn void SimdSynetFusedLayerForward4(const float * src, const float * bias0, const float * scale1, const float * bias1, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format);
-
- \short This function is used for forward propagation of FusedLayer (type 4).
-
- Algorithm's details (example for NCHW tensor format):
- \verbatim
- for(c = 0; c < channels; ++c)
-     for(s = 0; s < spatial; ++s)
-     {
-         x = src[c*spatial + s] + bias0[c];
-         dst[c*spatial + s] = max(0, x);
-         dst[(c + channels)*spatial + s] = max(0, x*scale1[0] + bias1[0]);
-     }
- \endverbatim
-
- \note This function is used in Synet Framework.
-
- \param [in] src - a pointer to the 32-bit float array with input image tensor. The size of the array is ::SimdAlign (channels, ::SimdSynetTensorAlignment (format)) * spatial.
- \param [in] bias0 - a pointer to the 32-bit float array with bias0 coefficients. The size of the array is ::SimdAlign (channels, ::SimdSynetTensorAlignment (format)).
- \param [in] scale1 - a pointer to the 32-bit float array with scale1 coefficients. The size of the array is 1.
- \param [in] bias1 - a pointer to the 32-bit float array with bias1 coefficients. The size of the array is 1.
- \param [in] channels - a number of channels in the input image tensor. Output image tensor has 2 * channels.
- \param [in] spatial - a spatial size of (input/output) image tensor.
- \param [out] dst - a pointer to the 32-bit float array with output image tensor. The size of the array is ::SimdAlign (2 * channels, ::SimdSynetTensorAlignment (format)) * spatial.
- \param [in] format - a format of (input/output) image tensor.
- */
- SIMD_API void SimdSynetFusedLayerForward4(const float * src, const float * bias0, const float * scale1, const float * bias1, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format);
-
- /*! @ingroup synet
-
- \fn void SimdSynetFusedLayerForward8(const float * src0, const float * src1, const float * src2, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format);
-
- \short This function is used for forward propagation of FusedLayer (type 8).
-
- Algorithm's details (example for NCHW tensor format):
- \verbatim
- for(c = 0; c < channels; ++c)
-     for(s = 0; s < spatial; ++s)
-     {
-         o = c*spatial + s;
-         dst[o] = src0[o] + src1[o]*src2[c];
-     }
- \endverbatim
-
- \note This function is used in Synet Framework.
-
- \param [in] src0 - a pointer to the first input 32-bit float array. The size of the array is ::SimdAlign (channels, ::SimdSynetTensorAlignment (format)) * spatial.
- \param [in] src1 - a pointer to the second input 32-bit float array. The size of the array is ::SimdAlign (channels, ::SimdSynetTensorAlignment (format)) * spatial.
- \param [in] src2 - a pointer to the third input 32-bit float array. The size of the array is ::SimdAlign (channels, ::SimdSynetTensorAlignment (format)).
- \param [in] channels - a number of channels in the (input/output) image tensor.
- \param [in] spatial - a spatial size of (input/output) image tensor.
- \param [out] dst - a pointer to the output 32-bit float array. The size of the array is ::SimdAlign (channels, ::SimdSynetTensorAlignment (format)) * spatial.
- \param [in] format - a format of (input/output) image tensor.
- */
- SIMD_API void SimdSynetFusedLayerForward8(const float * src0, const float * src1, const float * src2, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format);
-
- /*! @ingroup synet
-
- \fn void SimdSynetFusedLayerForward9(const float * src0, const float * src1, const float * scale, const float * bias, size_t channels0, size_t channels1, size_t spatial, float * dst0, float * dst1, SimdTensorFormatType format);
-
- \short This function is used for forward propagation of FusedLayer (type 9).
-
- Algorithm's details (example for NCHW tensor format):
- \verbatim
- for(c = 0; c < channels0; ++c)
-     for(s = 0; s < spatial; ++s)
-     {
-         dst0[c*spatial + s] = max(0, src0[c*spatial + s]*scale[c] + bias[c]);
-         if(dst1)
-             dst1[c*spatial + s] = src0[c*spatial + s];
-     }
- for(c = 0; c < channels1; ++c)
-     for(s = 0; s < spatial; ++s)
-     {
-         dst0[(c + channels0)*spatial + s] = max(0, src1[c*spatial + s]*scale[channels0 + c] + bias[channels0 + c]);
-         if(dst1)
-             dst1[(c + channels0)*spatial + s] = src1[c*spatial + s];
-     }
- \endverbatim
-
- \note This function is used in Synet Framework.
-
- \param [in] src0 - a pointer to the first input 32-bit float array. The size of the array is ::SimdAlign (channels0, ::SimdSynetTensorAlignment (format)) * spatial.
- \param [in] src1 - a pointer to the second input 32-bit float array. The size of the array is ::SimdAlign (channels1, ::SimdSynetTensorAlignment (format)) * spatial.
- \param [in] scale - a pointer to the 32-bit float array with scale coefficients. The size of the array is ::SimdAlign (channels0 + channels1, ::SimdSynetTensorAlignment (format)).
- \param [in] bias - a pointer to the 32-bit float array with bias coefficients. The size of the array is ::SimdAlign (channels0 + channels1, ::SimdSynetTensorAlignment (format)).
- \param [in] channels0 - a number of channels in the first input image tensor.
- \param [in] channels1 - a number of channels in the second input image tensor.
- \param [in] spatial - a spatial size of (input/output) image tensor.
- \param [out] dst0 - a pointer to the first output 32-bit float array. The size of the array is ::SimdAlign (channels0 + channels1, ::SimdSynetTensorAlignment (format)) * spatial.
- \param [out] dst1 - a pointer to the second output 32-bit float array. The size of the array is ::SimdAlign (channels0 + channels1, ::SimdSynetTensorAlignment (format)) * spatial. The pointer can be NULL.
- \param [in] format - a format of (input/output) image tensor.
- */
- SIMD_API void SimdSynetFusedLayerForward9(const float * src0, const float * src1, const float * scale, const float * bias, size_t channels0, size_t channels1, size_t spatial, float * dst0, float * dst1, SimdTensorFormatType format);
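As a usage illustration for the whole fused-layer family, the sketch below runs type 0 on an NCHW tensor and also computes a scalar reference that mirrors the pseudocode given above (for ::SimdTensorFormatNchw the aligned array sizes reduce to plain channels and channels*spatial; the header path is an assumption):

\verbatim
#include "Simd/SimdLib.h"
#include <math.h>

void Fused0Example(const float * src, const float * bias, const float * scale,
    size_t channels, size_t spatial, float * dst, float * ref)
{
    SimdSynetFusedLayerForward0(src, bias, scale, channels, spatial, dst, SimdTensorFormatNchw);
    /* Scalar reference of the same transform. */
    for (size_t c = 0; c < channels; ++c)
        for (size_t s = 0; s < spatial; ++s)
        {
            float x = src[c*spatial + s] + bias[c];
            ref[c*spatial + s] = (x - fabsf(x))*scale[c] + (x > 0.0f ? x : 0.0f);
        }
}
\endverbatim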
-
- /*! @ingroup synet_activation
-
- \fn void SimdSynetHswish32f(const float * src, size_t size, const float * shift, const float * scale, float * dst);
-
- \short Calculates H-Swish activation function (https://arxiv.org/pdf/1905.02244.pdf) for 32-bit float array.
-
- Input and output arrays must have the same size.
-
- Algorithm's details:
- \verbatim
- for(i = 0; i < size; ++i)
-     dst[i] = Max(Min(src[i], shift) + shift, 0)*scale*src[i];
- \endverbatim
-
- \note This function is used in Synet Framework.
-
- \param [in] src - a pointer to the input 32-bit float array.
- \param [in] size - a size of input and output arrays.
- \param [in] shift - a pointer to shift parameter. It is equal to 3 in the original paper.
- \param [in] scale - a pointer to scale parameter. It is equal to 1/6 in the original paper.
- \param [out] dst - a pointer to the output 32-bit float array.
- */
- SIMD_API void SimdSynetHswish32f(const float * src, size_t size, const float * shift, const float * scale, float * dst);
-
- /*! @ingroup synet
-
- \fn void SimdSynetInnerProductLayerForward(const float * src, const float * weight, const float * bias, size_t count, size_t size, float * dst);
-
- \short This function is used for forward propagation of InnerProductLayer.
-
- Algorithm's details:
- \verbatim
- for(i = 0; i < count; ++i)
- {
-     dst[i] = (bias ? bias[i] : 0);
-     for(j = 0; j < size; ++j)
-         dst[i] += src[j]*weight[i*size + j];
- }
- \endverbatim
-
- \note This function is used in Synet Framework.
-
- \param [in] src - a pointer to the input 32-bit float array. The size of the array must be equal to size.
- \param [in] weight - a pointer to the 32-bit float array with weight coefficients. The size of the array must be equal to count*size.
- \param [in] bias - a pointer to the 32-bit float array with bias coefficients. The size of the array must be equal to count. Can be NULL.
- \param [in] count - a size of output array.
- \param [in] size - a size of input array.
- \param [out] dst - a pointer to the output 32-bit float array. The size of the array must be equal to count.
- */
- SIMD_API void SimdSynetInnerProductLayerForward(const float * src, const float * weight, const float * bias, size_t count, size_t size, float * dst);
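The inner product is a plain matrix-vector multiply with optional bias, so a tiny self-contained call can double as a sanity check; the values below are illustrative:

\verbatim
#include "Simd/SimdLib.h"

void InnerProductExample(void)
{
    /* 2 outputs, 3 inputs; weight is row-major [count x size]. */
    const float src[3] = { 1.0f, 2.0f, 3.0f };
    const float weight[6] = { 1.0f, 0.0f, 0.0f, 0.0f, 1.0f, 1.0f };
    const float bias[2] = { 0.5f, -0.5f };
    float dst[2]; /* expected result: { 1.5f, 4.5f } */
    SimdSynetInnerProductLayerForward(src, weight, bias, 2, 3, dst);
}
\endverbatim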
-
- /*! @ingroup synet
-
- \fn void SimdSynetLrnLayerCrossChannels(const float * src, size_t half, size_t channels, size_t spatial, const float * k, float * dst, SimdTensorFormatType format);
-
- \short This function is used for forward propagation of LrnLayer (cross channels normalization).
-
- Algorithm's details (example for NCHW tensor format):
- \verbatim
- for(c = 0; c < channels; ++c)
-     for(s = 0; s < spatial; ++s)
-     {
-         lo = Max(0, c - half);
-         hi = Min(channels, c + half + 1);
-         sum = 0;
-         for(i = lo; i < hi; ++i)
-             sum += Square(src[i*spatial + s]);
-         dst[c*spatial + s] = src[c*spatial + s]*Pow(k[0] + sum*k[1], k[2]);
-     }
- \endverbatim
-
- \note This function is used in Synet Framework.
-
- \param [in] src - a pointer to the 32-bit float array with input image tensor. The size of the array is ::SimdAlign (channels, ::SimdSynetTensorAlignment (format)) * spatial.
- \param [in] half - a local normalization half size.
- \param [in] channels - a number of channels in the (input/output) image tensor.
- \param [in] spatial - a spatial size of (input/output) image tensor.
- \param [in] k - a pointer to the 32-bit float array with 3 coefficients (see algorithm details).
- \param [out] dst - a pointer to the 32-bit float array with output image tensor. The size of the array is ::SimdAlign (channels, ::SimdSynetTensorAlignment (format)) * spatial.
- \param [in] format - a format of (input/output) image tensor.
- */
- SIMD_API void SimdSynetLrnLayerCrossChannels(const float * src, size_t half, size_t channels, size_t spatial, const float * k, float * dst, SimdTensorFormatType format);
-
- /*! @ingroup synet
-
- \fn void * SimdSynetMergedConvolution32fInit(size_t batch, const SimdConvolutionParameters * convs, size_t count, SimdBool add);
-
- \short Initializes FP32 merged convolution algorithm.
-
- \param [in] batch - a batch size.
- \param [in] convs - an array with convolution parameters.
- \param [in] count - a number of merged convolutions.
- \param [in] add - a flag that signals whether the output must be added to the source value.
- \return a pointer to FP32 merged convolution context. On error it returns NULL. It must be released with the function ::SimdRelease.
- This pointer is used in functions ::SimdSynetMergedConvolution32fExternalBufferSize, ::SimdSynetMergedConvolution32fInternalBufferSize, ::SimdSynetMergedConvolution32fSetParams and ::SimdSynetMergedConvolution32fForward.
- */
- SIMD_API void * SimdSynetMergedConvolution32fInit(size_t batch, const SimdConvolutionParameters * convs, size_t count, SimdBool add);
-
- /*! @ingroup synet
-
- \fn size_t SimdSynetMergedConvolution32fExternalBufferSize(const void * context);
-
- \short Gets size of external temporary buffer required for FP32 merged convolution algorithm.
-
- \param [in] context - a pointer to FP32 merged convolution context. It must be created by function ::SimdSynetMergedConvolution32fInit and released by function ::SimdRelease.
- \return size of external temporary buffer required for FP32 merged convolution algorithm.
- */
- SIMD_API size_t SimdSynetMergedConvolution32fExternalBufferSize(const void * context);
-
- /*! @ingroup synet
-
- \fn size_t SimdSynetMergedConvolution32fInternalBufferSize(const void * context);
-
- \short Gets size of internal buffer used inside FP32 merged convolution algorithm.
-
- \param [in] context - a pointer to FP32 merged convolution context. It must be created by function ::SimdSynetMergedConvolution32fInit and released by function ::SimdRelease.
- \return size of internal buffer used inside FP32 merged convolution algorithm.
- */
- SIMD_API size_t SimdSynetMergedConvolution32fInternalBufferSize(const void * context);
-
- /*! @ingroup synet
-
- \fn void SimdSynetMergedConvolution32fSetParams(void * context, const float * const * weight, SimdBool * internal, const float * const * bias, const float * const * params);
-
- \short Sets weights, biases and parameters of activation function required for FP32 merged convolution algorithm.
-
- \param [in, out] context - a pointer to FP32 merged convolution context. It must be created by function ::SimdSynetMergedConvolution32fInit and released by function ::SimdRelease.
- \param [in] weight - a pointer to the array with pointers to convolution weights. The array size is determined by the number of merged convolutions.
- \param [out] internal - a pointer to the array of flags signaling that the weights are stored in the internal buffer. The array size is determined by the number of merged convolutions. Can be NULL.
- \param [in] bias - a pointer to the array with pointers to bias. The array size is determined by the number of merged convolutions. Can be NULL.
- \param [in] params - a pointer to the array with pointers to parameters of the activation functions (see ::SimdConvolutionActivationType). The array size is determined by the number of merged convolutions. Can be NULL.
- */
- SIMD_API void SimdSynetMergedConvolution32fSetParams(void * context, const float * const * weight, SimdBool * internal, const float * const * bias, const float * const * params);
-
- /*! @ingroup synet
-
- \fn void SimdSynetMergedConvolution32fForward(void * context, const float * src, float * buf, float * dst);
-
- \short Performs forward propagation of FP32 merged convolution algorithm.
-
- \param [in] context - a pointer to FP32 merged convolution context. It must be created by function ::SimdSynetMergedConvolution32fInit and released by function ::SimdRelease.
- \param [in] src - a pointer to input image.
- \param [out] buf - a pointer to external temporary buffer. The size of the external temporary buffer is determined by function ::SimdSynetMergedConvolution32fExternalBufferSize. Can be NULL (in that case an internal buffer is used).
- \param [out] dst - a pointer to output image.
- */
- SIMD_API void SimdSynetMergedConvolution32fForward(void * context, const float * src, float * buf, float * dst);
-
- /*! @ingroup synet
-
- \fn void SimdSynetPoolingForwardAverage(const float * src, size_t srcC, size_t srcH, size_t srcW, size_t kernelY, size_t kernelX, size_t strideY, size_t strideX, size_t padY, size_t padX, float * dst, size_t dstH, size_t dstW, SimdBool excludePad, SimdTensorFormatType format);
-
- \short This function is used for forward propagation of PoolingLayer (AveragePooling).
-
- \note This function is used in Synet Framework.
-
- \param [in] src - a pointer to the input 32-bit float array. The size of the array must be equal to srcC*srcH*srcW.
- \param [in] srcC - a number of input and output channels.
- \param [in] srcH - an input height.
- \param [in] srcW - an input width.
- \param [in] kernelY - a height of the pooling kernel.
- \param [in] kernelX - a width of the pooling kernel.
- \param [in] strideY - a y-stride of the pooling.
- \param [in] strideX - an x-stride of the pooling.
- \param [in] padY - a pad to the top of the input image.
- \param [in] padX - a pad to the left of the input image.
- \param [out] dst - a pointer to the output 32-bit float array. The size of the array must be equal to srcC*dstH*dstW.
- \param [in] dstH - an output height.
- \param [in] dstW - an output width.
- \param [in] excludePad - a flag to exclude the padding from the average value calculation.
- \param [in] format - a format of (input/output) image tensor.
- */
- SIMD_API void SimdSynetPoolingForwardAverage(const float * src, size_t srcC, size_t srcH, size_t srcW, size_t kernelY, size_t kernelX,
- size_t strideY, size_t strideX, size_t padY, size_t padX, float * dst, size_t dstH, size_t dstW, SimdBool excludePad, SimdTensorFormatType format);
-
- /*! @ingroup synet
-
- \fn void SimdSynetPoolingForwardMax32f(const float * src, size_t srcC, size_t srcH, size_t srcW, size_t kernelY, size_t kernelX, size_t strideY, size_t strideX, size_t padY, size_t padX, float * dst, size_t dstH, size_t dstW, SimdTensorFormatType format);
-
- \short This function is used for forward propagation of PoolingLayer (MaxPooling, 32-bit float).
-
- \note This function is used in Synet Framework.
-
- \param [in] src - a pointer to the input 32-bit float array. The size of the array must be equal to srcC*srcH*srcW.
- \param [in] srcC - a number of input and output channels.
- \param [in] srcH - an input height.
- \param [in] srcW - an input width.
- \param [in] kernelY - a height of the pooling kernel.
- \param [in] kernelX - a width of the pooling kernel.
- \param [in] strideY - a y-stride of the pooling.
- \param [in] strideX - an x-stride of the pooling.
- \param [in] padY - a pad to the top of the input image.
- \param [in] padX - a pad to the left of the input image.
- \param [out] dst - a pointer to the output 32-bit float array. The size of the array must be equal to srcC*dstH*dstW.
- \param [in] dstH - an output height.
- \param [in] dstW - an output width.
- \param [in] format - a format of (input/output) image tensor.
- */
- SIMD_API void SimdSynetPoolingForwardMax32f(const float * src, size_t srcC, size_t srcH, size_t srcW, size_t kernelY, size_t kernelX,
- size_t strideY, size_t strideX, size_t padY, size_t padX, float * dst, size_t dstH, size_t dstW, SimdTensorFormatType format);
-
- /*! @ingroup synet
-
- \fn void SimdSynetPoolingForwardMax8u(const uint8_t * src, size_t srcC, size_t srcH, size_t srcW, size_t kernelY, size_t kernelX, size_t strideY, size_t strideX, size_t padY, size_t padX, uint8_t * dst, size_t dstH, size_t dstW, SimdTensorFormatType format);
-
- \short This function is used for forward propagation of PoolingLayer (MaxPooling, 8-bit unsigned integer).
-
- \note This function is used in Synet Framework.
-
- \param [in] src - a pointer to the input 8-bit unsigned integer array. The size of the array must be equal to srcC*srcH*srcW.
- \param [in] srcC - a number of input and output channels.
- \param [in] srcH - an input height.
- \param [in] srcW - an input width.
- \param [in] kernelY - a height of the pooling kernel.
- \param [in] kernelX - a width of the pooling kernel.
- \param [in] strideY - a y-stride of the pooling.
- \param [in] strideX - an x-stride of the pooling.
- \param [in] padY - a pad to the top of the input image.
- \param [in] padX - a pad to the left of the input image.
- \param [out] dst - a pointer to the output 8-bit unsigned integer array. The size of the array must be equal to srcC*dstH*dstW.
- \param [in] dstH - an output height.
- \param [in] dstW - an output width.
- \param [in] format - a format of (input/output) image tensor.
- */
- SIMD_API void SimdSynetPoolingForwardMax8u(const uint8_t* src, size_t srcC, size_t srcH, size_t srcW, size_t kernelY, size_t kernelX,
- size_t strideY, size_t strideX, size_t padY, size_t padX, uint8_t* dst, size_t dstH, size_t dstW, SimdTensorFormatType format);
-
- /*! @ingroup synet_activation
-
- \fn void SimdSynetPreluLayerForward(const float * src, const float * slope, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format);
-
- \short This function is used for forward propagation of PreluLayer (PReLU).
-
- Algorithm's details (example for NCHW tensor format):
- \verbatim
- for(c = 0; c < channels; ++c)
-     for(s = 0; s < spatial; ++s)
-         dst[c*spatial + s] = src[c*spatial + s] > 0 ? src[c*spatial + s] : slope[c]*src[c*spatial + s];
- \endverbatim
-
- \note This function is used in Synet Framework.
-
- \param [in] src - a pointer to the 32-bit float array with input image tensor. The size of the array is ::SimdAlign (channels, ::SimdSynetTensorAlignment (format)) * spatial.
- \param [in] slope - a pointer to the 32-bit float array with slope coefficients. The size of the array is ::SimdAlign (channels, ::SimdSynetTensorAlignment (format)).
- \param [in] channels - a number of channels in the (input/output) image tensor.
- \param [in] spatial - a spatial size of (input/output) image tensor.
- \param [out] dst - a pointer to the 32-bit float array with output image tensor. The size of the array is ::SimdAlign (channels, ::SimdSynetTensorAlignment (format)) * spatial.
- \param [in] format - a format of (input/output) image tensor.
- */
- SIMD_API void SimdSynetPreluLayerForward(const float * src, const float * slope, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format);
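A minimal PReLU call for an NCHW tensor (for ::SimdTensorFormatNchw the slope array is simply channels elements long):

\verbatim
#include "Simd/SimdLib.h"

void PreluExample(const float * src, const float * slope,
    size_t channels, size_t spatial, float * dst)
{
    /* Positive values pass through, negative ones are scaled by slope[c]. */
    SimdSynetPreluLayerForward(src, slope, channels, spatial, dst, SimdTensorFormatNchw);
}
\endverbatim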
-
- /*! @ingroup synet_activation
-
- \fn void SimdSynetRelu32f(const float* src, size_t size, const float* slope, float* dst);
-
- \short Calculates ReLU (rectified linear unit) function for 32-bit float array.
-
- Algorithm's details:
- \verbatim
- for(i = 0; i < size; ++i)
-     dst[i] = src[i] > 0 ? src[i] : slope*src[i];
- \endverbatim
-
- \note This function is used in Synet Framework.
-
- \param [in] src - a pointer to the input 32-bit float array.
- \param [in] size - a size of input and output arrays.
- \param [in] slope - a pointer to the 'slope' parameter.
- \param [out] dst - a pointer to output 32-bit float array.
- */
- SIMD_API void SimdSynetRelu32f(const float* src, size_t size, const float* slope, float* dst);
-
- /*! @ingroup synet_conversion
-
- \fn void SimdSynetReorderImage(size_t batch, size_t channels, size_t spatial, const float * src, SimdTensorFormatType srcFormat, float * dst, SimdTensorFormatType dstFormat);
-
- \short Converts (input/output) image between different formats of 4D-tensor.
-
- \note This function is used in Synet Framework. Conversion between ::SimdTensorFormatNchw4c, ::SimdTensorFormatNchw8c, ::SimdTensorFormatNchw16c is not supported.
-
- \param [in] batch - a batch (number of images in the batch).
- \param [in] channels - a number of image channels.
- \param [in] spatial - a spatial size (height*width) of image.
- \param [in] src - a pointer to input image data.
- \param [in] srcFormat - a format of input image. It can be ::SimdTensorFormatNchw, ::SimdTensorFormatNhwc, ::SimdTensorFormatNchw4c, ::SimdTensorFormatNchw8c, ::SimdTensorFormatNchw16c.
- \param [out] dst - a pointer to output image data.
- \param [in] dstFormat - a format of output image. It can be ::SimdTensorFormatNchw, ::SimdTensorFormatNhwc, ::SimdTensorFormatNchw4c, ::SimdTensorFormatNchw8c, ::SimdTensorFormatNchw16c.
- */
- SIMD_API void SimdSynetReorderImage(size_t batch, size_t channels, size_t spatial, const float* src, SimdTensorFormatType srcFormat, float* dst, SimdTensorFormatType dstFormat);
-
- /*! @ingroup synet_conversion
-
- \fn void SimdSynetReorderFilter(size_t output, size_t input, size_t kernel, const float * src, SimdTensorFormatType srcFormat, float * dst, SimdTensorFormatType dstFormat);
-
- \short Converts 2d-convolution filter weight between different formats of 4D-tensor.
-
- \note This function is used in Synet Framework. Conversion between ::SimdTensorFormatOyxi4o, ::SimdTensorFormatOyxi8o, ::SimdTensorFormatOyxi16o is not supported.
-
- \param [in] output - a number of output channels in filter.
- \param [in] input - a number of input channels in filter.
- \param [in] kernel - a size (width*height) of filter kernel.
- \param [in] src - a pointer to input filter data.
- \param [in] srcFormat - a format of input filter. It can be ::SimdTensorFormatOiyx, ::SimdTensorFormatYxio, ::SimdTensorFormatOyxi4o, ::SimdTensorFormatOyxi8o, ::SimdTensorFormatOyxi16o.
- \param [out] dst - a pointer to output filter data.
- \param [in] dstFormat - a format of output filter. It can be ::SimdTensorFormatOiyx, ::SimdTensorFormatYxio, ::SimdTensorFormatOyxi4o, ::SimdTensorFormatOyxi8o, ::SimdTensorFormatOyxi16o.
- */
- SIMD_API void SimdSynetReorderFilter(size_t output, size_t input, size_t kernel, const float* src, SimdTensorFormatType srcFormat, float* dst, SimdTensorFormatType dstFormat);
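A typical use of the conversion above is switching a batch of images from NCHW to NHWC before calling kernels that require the other layout; a minimal sketch:

\verbatim
#include "Simd/SimdLib.h"

void NchwToNhwcExample(const float * src, size_t batch, size_t channels,
    size_t height, size_t width, float * dst)
{
    /* dst must hold batch*channels*height*width floats. */
    SimdSynetReorderImage(batch, channels, height*width, src,
        SimdTensorFormatNchw, dst, SimdTensorFormatNhwc);
}
\endverbatim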
-
- /*! @ingroup synet_activation
-
- \fn void SimdSynetRestrictRange32f(const float * src, size_t size, const float * lower, const float * upper, float * dst);
-
- \short Restricts the range of values of a given 32-bit float array.
-
- Algorithm's details:
- \verbatim
- for(i = 0; i < size; ++i)
-     dst[i] = Min(Max(lower, src[i]), upper);
- \endverbatim
-
- \note This function is used in Synet Framework.
-
- \param [in] src - a pointer to the input 32-bit float array.
- \param [in] size - a size of input and output arrays.
- \param [in] lower - a pointer to lower restrict bound.
- \param [in] upper - a pointer to upper restrict bound.
- \param [out] dst - a pointer to the output 32-bit float array.
- */
- SIMD_API void SimdSynetRestrictRange32f(const float * src, size_t size, const float * lower, const float * upper, float * dst);
-
- /*! @ingroup synet
-
- \fn void SimdSynetScaleLayerForward(const float * src, const float * scale, const float * bias, size_t channels, size_t height, size_t width, float * dst, SimdTensorFormatType format, SimdSynetCompatibilityType compatibility);
-
- \short This function is used for forward propagation of ScaleLayer.
-
- Algorithm's details (example for NCHW tensor format):
- \verbatim
- for(c = 0; c < channels; ++c)
-     for(h = 0; h < height; ++h)
-         for(w = 0; w < width; ++w)
-             dst[(c*height + h)*width + w] = src[(c*height + h)*width + w]*scale[c] + (bias ? bias[c] : 0);
- \endverbatim
-
- \note This function is used in Synet Framework.
-
- \param [in] src - a pointer to the 32-bit float array with input image tensor. The size of the array is ::SimdAlign (channels, ::SimdSynetTensorAlignment (format)) * height * width.
- \param [in] scale - a pointer to the 32-bit float array with scale coefficients. The size of the array is ::SimdAlign (channels, ::SimdSynetTensorAlignment (format)).
- \param [in] bias - a pointer to the 32-bit float array with bias coefficients. The size of the array is ::SimdAlign (channels, ::SimdSynetTensorAlignment (format)). Can be NULL.
- \param [in] channels - a number of channels in the (input/output) image tensor.
- \param [in] height - a height of (input/output) image tensor.
- \param [in] width - a width of (input/output) image tensor.
- \param [out] dst - a pointer to the 32-bit float array with output image tensor. The size of the array is ::SimdAlign (channels, ::SimdSynetTensorAlignment (format)) * height * width.
- \param [in] format - a format of (input/output) image tensor.
- \param [in] compatibility - flags of bitwise compatibility.
- */
- SIMD_API void SimdSynetScaleLayerForward(const float * src, const float * scale, const float * bias, size_t channels, size_t height, size_t width, float * dst, SimdTensorFormatType format, SimdSynetCompatibilityType compatibility);
-
- /*! @ingroup synet_conversion
-
- \fn void SimdSynetSetInput(const uint8_t * src, size_t width, size_t height, size_t stride, SimdPixelFormatType srcFormat, const float * lower, const float * upper, float * dst, size_t channels, SimdTensorFormatType dstFormat);
-
- \short Sets an image as the input of a Synet Framework neural network.
-
- Algorithm's details (example for BGRA pixel format and NCHW tensor format):
- \verbatim
- for(c = 0; c < channels; ++c)
-     for(y = 0; y < height; ++y)
-         for(x = 0; x < width; ++x)
-             dst[(c*height + y)*width + x] = src[stride*y + x*4 + c]*(upper[c] - lower[c])/255 + lower[c];
- \endverbatim
-
- \note This function has a C++ wrapper: Simd::SynetSetInput(const View & src, const float * lower, const float * upper, float * dst, size_t channels, SimdTensorFormatType format).
-
- \param [in] src - a pointer to pixels data of input image.
- \param [in] width - a width of input image and output image tensor.
- \param [in] height - a height of input image and output image tensor.
- \param [in] stride - a row size of input image.
- \param [in] srcFormat - a pixel format of input image. The following pixel formats are supported: ::SimdPixelFormatGray8, ::SimdPixelFormatBgr24, ::SimdPixelFormatBgra32, ::SimdPixelFormatRgb24.
- \param [in] lower - a pointer to the array with lower bound of values of the output tensor. The size of the array has to correspond to the number of channels in the output image tensor.
- \param [in] upper - a pointer to the array with upper bound of values of the output tensor. The size of the array has to correspond to the number of channels in the output image tensor.
- \param [out] dst - a pointer to the output 32-bit float image tensor.
- \param [in] channels - a number of channels in the output image tensor. It can be 1 or 3.
- \param [in] dstFormat - a format of output image tensor. The following tensor formats are supported: ::SimdTensorFormatNchw, ::SimdTensorFormatNhwc.
- */
- SIMD_API void SimdSynetSetInput(const uint8_t * src, size_t width, size_t height, size_t stride, SimdPixelFormatType srcFormat,
- const float * lower, const float * upper, float * dst, size_t channels, SimdTensorFormatType dstFormat);
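A minimal sketch that feeds an 8-bit BGRA frame into a 3-channel NCHW input normalized to [0, 1] (lower/upper act per channel, as in the formula above):

\verbatim
#include "Simd/SimdLib.h"

void SetInputExample(const uint8_t * bgra, size_t width, size_t height,
    size_t stride, float * input)
{
    const float lower[3] = { 0.0f, 0.0f, 0.0f };
    const float upper[3] = { 1.0f, 1.0f, 1.0f };
    /* input must hold 3*height*width floats in NCHW order. */
    SimdSynetSetInput(bgra, width, height, stride, SimdPixelFormatBgra32,
        lower, upper, input, 3, SimdTensorFormatNchw);
}
\endverbatim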
-
- /*! @ingroup synet
-
- \fn void SimdSynetShuffleLayerForward(const float * src0, const float * src1, size_t channels0, size_t channels1, size_t spatial, float * dst0, float * dst1, SimdTensorFormatType format, int type);
-
- \short This function is used for forward propagation of ShuffleLayer.
-
- \note This function is used in Synet Framework.
-
- \param [in] src0 - a pointer to the 32-bit float array with the first input image tensor.
- \param [in] src1 - a pointer to the 32-bit float array with the second input image tensor.
- \param [in] channels0 - a number of channels in the first input (type == 0) or output (type == 1) image tensor. It must be an even number.
- \param [in] channels1 - a number of channels in the second input (type == 0) or output (type == 1) image tensor. It must be an even number.
- \param [in] spatial - a spatial size of (input/output) image tensors.
- \param [out] dst0 - a pointer to the 32-bit float array with the first output image tensor.
- \param [out] dst1 - a pointer to the 32-bit float array with the second output image tensor.
- \param [in] format - a format of (input/output) image tensors.
- \param [in] type - a shuffle type (it can be 0 or 1).
- */
- SIMD_API void SimdSynetShuffleLayerForward(const float * src0, const float * src1, size_t channels0, size_t channels1, size_t spatial, float * dst0, float * dst1, SimdTensorFormatType format, int type);
-
- /*! @ingroup synet_activation
-
- \fn void SimdSynetSigmoid32f(const float * src, size_t size, const float * slope, float * dst);
-
- \short This function is used for forward propagation of SigmoidLayer.
-
- Algorithm's details:
- \verbatim
- for(i = 0; i < size; ++i)
-     dst[i] = 1/(1 + exp(-slope*src[i]));
- \endverbatim
-
- \note This function is used in Synet Framework.
-
- \param [in] src - a pointer to the input 32-bit float array.
- \param [in] size - a size of input and output arrays.
- \param [in] slope - a pointer to the 'slope' parameter.
- \param [out] dst - a pointer to output 32-bit float array.
- */
- SIMD_API void SimdSynetSigmoid32f(const float* src, size_t size, const float* slope, float* dst);
-
- /*! @ingroup synet
-
- \fn void SimdSynetSoftmaxLayerForward(const float * src, size_t outer, size_t count, size_t inner, float * dst);
-
- \short This function is used for forward propagation of SoftmaxLayer.
-
- \note This function is used in Synet Framework.
-
- \param [in] src - a pointer to the input 32-bit float array. The size of the array must be equal to outer*count*inner.
- \param [in] outer - an outer size of input and output arrays.
- \param [in] count - a size of softmax dimension.
- \param [in] inner - an inner size of input and output arrays.
- \param [out] dst - a pointer to the output 32-bit float array. The size of the array must be equal to outer*count*inner.
- */
- SIMD_API void SimdSynetSoftmaxLayerForward(const float * src, size_t outer, size_t count, size_t inner, float * dst);
-
- /*! @ingroup synet_activation
-
- \fn void SimdSynetSoftplus32f(const float* src, size_t size, const float * beta, const float * threshold, float * dst);
-
- \short This function is used for forward propagation of SoftplusLayer.
-
- Algorithm's details:
- \verbatim
- for(i = 0; i < size; ++i)
-     dst[i] = src[i] > threshold ? src[i] : log(1 + exp(src[i]*beta))/beta;
- \endverbatim
-
- \note This function is used in Synet Framework.
-
- \param [in] src - a pointer to the input 32-bit float array.
- \param [in] size - a size of input and output arrays.
- \param [in] beta - a pointer to 'beta' parameter.
- \param [in] threshold - a pointer to 'threshold' parameter.
- \param [out] dst - a pointer to the output 32-bit float array.
- */
- SIMD_API void SimdSynetSoftplus32f(const float* src, size_t size, const float * beta, const float * threshold, float * dst);
-
- /*! @ingroup synet
-
- \fn SimdTensorFormatType SimdSynetSpecifyTensorFormat(SimdTensorFormatType format);
-
- \short Specifies hardware optimized tensor format of 5D-tensor for (input/output) image or 2D-convolution filter.
-
- \note This function is used in Synet Framework.
-
- \param [in] format - an unspecified hardware optimized 5D-tensor format of (input/output) image or 2D-convolution filter. It can be ::SimdTensorFormatNchwXc or ::SimdTensorFormatOyxiXo.
- \return specified hardware optimized 5D-tensor format.
- */
- SIMD_API SimdTensorFormatType SimdSynetSpecifyTensorFormat(SimdTensorFormatType format);
-
- /*! @ingroup synet_activation
-
- \fn void SimdSynetTanh32f(const float * src, size_t size, const float * slope, float * dst);
-
- \short Calculates hyperbolic tangent for 32-bit float array.
-
- \note This function is used in Synet Framework.
-
- Algorithm's details:
- \verbatim
- for(i = 0; i < size; ++i)
- {
-     x = slope*src[i];
-     dst[i] = (exp(x) - exp(-x))/(exp(x) + exp(-x));
- }
- \endverbatim
-
- \param [in] src - a pointer to the input 32-bit float array.
- \param [in] size - a size of input and output arrays.
- \param [in] slope - a pointer to the 'slope' parameter.
- \param [out] dst - a pointer to output 32-bit float array.
- */
- SIMD_API void SimdSynetTanh32f(const float* src, size_t size, const float* slope, float* dst);
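All the scalar activations above share the convention of passing their parameters by pointer (so the vectorized kernels can broadcast them); e.g. a plain hyperbolic tangent over an array:

\verbatim
#include "Simd/SimdLib.h"

void TanhExample(const float * src, size_t size, float * dst)
{
    const float slope = 1.0f; /* dst[i] = tanh(slope*src[i]) */
    SimdSynetTanh32f(src, size, &slope, dst);
}
\endverbatim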
-
- /*! @ingroup synet
-
- \fn size_t SimdSynetTensorAlignment(SimdTensorFormatType format);
-
- \short Gets alignment required for current tensor format.
-
- \note This function is used in Synet Framework.
-
- \param [in] format - a tensor format.
- \return alignment required for current tensor format.
- */
- SIMD_API size_t SimdSynetTensorAlignment(SimdTensorFormatType format);
-
- /*! @ingroup synet
-
- \fn void SimdSynetUnaryOperation32fLayerForward(const float * src, size_t size, SimdSynetUnaryOperation32fType type, float* dst);
-
- \short This function is used for forward propagation of UnaryOperationLayer.
-
- \note This function is used in Synet Framework.
-
- \param [in] src - a pointer to the input 32-bit float array.
- \param [in] size - a size of the input and output arrays.
- \param [in] type - a unary operation type (see ::SimdSynetUnaryOperation32fType).
- \param [out] dst - a pointer to the output 32-bit float array.
- */
- SIMD_API void SimdSynetUnaryOperation32fLayerForward(const float * src, size_t size, SimdSynetUnaryOperation32fType type, float * dst);
-
- /*! @ingroup texture_estimation
-
- \fn void SimdTextureBoostedSaturatedGradient(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t saturation, uint8_t boost, uint8_t * dx, size_t dxStride, uint8_t * dy, size_t dyStride);
-
- \short Calculates boosted saturated gradients for a given input image.
-
- All images must have the same width, height and format (8-bit gray).
-
- For border pixels:
- \verbatim
- dx[x, y] = 0;
- dy[x, y] = 0;
- \endverbatim
- For other pixels:
- \verbatim
- dx[x, y] = (saturation + max(-saturation, min(saturation, (src[x + 1, y] - src[x - 1, y]))))*boost;
- dy[x, y] = (saturation + max(-saturation, min(saturation, (src[x, y + 1] - src[x, y - 1]))))*boost;
- \endverbatim
-
- \note This function has a C++ wrapper: Simd::TextureBoostedSaturatedGradient(const View& src, uint8_t saturation, uint8_t boost, View& dx, View& dy).
-
- \param [in] src - a pointer to pixels data of source 8-bit gray image.
- \param [in] srcStride - a row size of source image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [in] saturation - a saturation of gradient.
- \param [in] boost - a boost coefficient.
- \param [out] dx - a pointer to pixels data of image with boosted saturated gradient along x axis.
- \param [in] dxStride - a row size of dx image.
- \param [out] dy - a pointer to pixels data of image with boosted saturated gradient along y axis.
- \param [in] dyStride - a row size of dy image.
- */
- SIMD_API void SimdTextureBoostedSaturatedGradient(const uint8_t * src, size_t srcStride, size_t width, size_t height,
- uint8_t saturation, uint8_t boost, uint8_t * dx, size_t dxStride, uint8_t * dy, size_t dyStride);
-
- /*! @ingroup texture_estimation
-
- \fn void SimdTextureBoostedUv(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t boost, uint8_t * dst, size_t dstStride);
-
- \short Calculates boosted colorized texture feature of input image (actual for U and V components of YUV format).
-
- All images must have the same width, height and format (8-bit gray).
-
- For every pixel:
- \verbatim
- lo = 128 - (128/boost);
- hi = 255 - lo;
- dst[x, y] = max(lo, min(hi, src[i]))*boost;
- \endverbatim
-
- \note This function has a C++ wrapper: Simd::TextureBoostedUv(const View& src, uint8_t boost, View& dst).
-
- \param [in] src - a pointer to pixels data of source 8-bit gray image.
- \param [in] srcStride - a row size of source image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [in] boost - a boost coefficient.
- \param [out] dst - a pointer to pixels data of result image.
- \param [in] dstStride - a row size of destination image.
- */
- SIMD_API void SimdTextureBoostedUv(const uint8_t * src, size_t srcStride, size_t width, size_t height,
- uint8_t boost, uint8_t * dst, size_t dstStride);
-
- /*! @ingroup texture_estimation
-
- \fn void SimdTextureGetDifferenceSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, const uint8_t * lo, size_t loStride, const uint8_t * hi, size_t hiStride, int64_t * sum);
-
- \short Calculates difference between current image and background.
-
- All images must have the same width, height and format (8-bit gray).
-
- For every pixel:
- \verbatim
- sum += src[i] - average(lo[i], hi[i]);
- \endverbatim
-
- \note This function has a C++ wrapper: Simd::TextureGetDifferenceSum(const View& src, const View& lo, const View& hi, int64_t & sum).
-
- \param [in] src - a pointer to pixels data of current image.
- \param [in] srcStride - a row size of current image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [in] lo - a pointer to pixels data of image with lower bound of background feature.
- \param [in] loStride - a row size of lo image.
- \param [in] hi - a pointer to pixels data of image with upper bound of background feature.
- \param [in] hiStride - a row size of hi image.
- \param [out] sum - a pointer to 64-bit integer with result sum.
- */
- SIMD_API void SimdTextureGetDifferenceSum(const uint8_t * src, size_t srcStride, size_t width, size_t height,
- const uint8_t * lo, size_t loStride, const uint8_t * hi, size_t hiStride, int64_t * sum);
-
- /*! @ingroup texture_estimation
-
- \fn void SimdTexturePerformCompensation(const uint8_t * src, size_t srcStride, size_t width, size_t height, int32_t shift, uint8_t * dst, size_t dstStride);
-
- \short Performs brightness compensation of input image.
-
- All images must have the same width, height and format (8-bit gray).
-
- For every pixel:
- \verbatim
- dst[i] = max(0, min(255, src[i] + shift));
- \endverbatim
-
- \note This function has a C++ wrapper: Simd::TexturePerformCompensation(const View& src, int shift, View& dst).
-
- \param [in] src - a pointer to pixels data of input image.
- \param [in] srcStride - a row size of input image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [in] shift - a compensation shift.
- \param [out] dst - a pointer to pixels data of output image.
- \param [in] dstStride - a row size of output image.
- */
- SIMD_API void SimdTexturePerformCompensation(const uint8_t * src, size_t srcStride, size_t width, size_t height,
- int32_t shift, uint8_t * dst, size_t dstStride);
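For instance, undoing a global brightness offset measured elsewhere is a single call; a minimal sketch:

\verbatim
#include "Simd/SimdLib.h"

void CompensateBrightnessExample(const uint8_t * src, size_t stride,
    size_t width, size_t height, int32_t shift, uint8_t * dst)
{
    /* Every output pixel is max(0, min(255, src[i] + shift)), as above. */
    SimdTexturePerformCompensation(src, stride, width, height, shift, dst, stride);
}
\endverbatim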
-
- /*! @ingroup transform
-
- \fn void SimdTransformImage(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t pixelSize, SimdTransformType transform, uint8_t * dst, size_t dstStride);
-
- \short Performs transformation of input image. The type of transformation is defined by ::SimdTransformType enumeration.
-
- \note This function has a C++ wrapper: Simd::TransformImage(const View & src, ::SimdTransformType transform, View & dst).
-
- \param [in] src - a pointer to pixels data of input image.
- \param [in] srcStride - a row size of input image.
- \param [in] width - an input image width.
- \param [in] height - an input image height.
- \param [in] pixelSize - a pixel size in input and output images. It can be 1, 2, 3, 4.
- \param [in] transform - a type of image transformation.
- \param [out] dst - a pointer to pixels data of output image.
- \param [in] dstStride - a row size of output image.
- */
- SIMD_API void SimdTransformImage(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t pixelSize, SimdTransformType transform, uint8_t * dst, size_t dstStride);
-
- /*! @ingroup synet_winograd
-
- \fn void SimdWinogradKernel1x3Block1x4SetFilter(const float * src, size_t size, float * dst, SimdBool trans);
-
- \short This function is used for filter conversion in Winograd F(1x4,1x3) or F(4x1,3x1) convolution algorithm.
-
- \note This function is used in Synet Framework.
-
- \param [in] src - a pointer to the input 32-bit float array with filter weights.
- \param [in] size - (number of input channels)*(number of output channels).
- \param [out] dst - a pointer to the output 32-bit float array with filter weights.
- \param [in] trans - a flag of transposed data.
- */
- SIMD_API void SimdWinogradKernel1x3Block1x4SetFilter(const float* src, size_t size, float* dst, SimdBool trans);
-
- /*! @ingroup synet_winograd
-
- \fn void SimdWinogradKernel1x3Block1x4SetInput(const float * src, size_t srcChannels, size_t srcHeight, size_t srcWidth, size_t padY, size_t padX, size_t padH, size_t padW, float * dst, size_t dstStride, SimdBool trans);
-
- \short This function is used for input image conversion in Winograd F(1x4,1x3) convolution algorithm.
-
- \note This function is used in Synet Framework.
-
- \param [in] src - a pointer to the input image.
- \param [in] srcChannels - a number of input channels.
- \param [in] srcHeight - a height of input image.
- \param [in] srcWidth - a width of input image.
- \param [in] padY - an additional zero padding of input image at the beginning of Y-axis.
- \param [in] padX - an additional zero padding of input image at the beginning of X-axis.
- \param [in] padH - an additional zero padding of input image at the end of Y-axis.
- \param [in] padW - an additional zero padding of input image at the end of X-axis.
- \param [out] dst - a pointer to the output array with converted image.
- \param [in] dstStride - a stride of output image.
- \param [in] trans - a flag of transposed data.
- */
- SIMD_API void SimdWinogradKernel1x3Block1x4SetInput(const float* src, size_t srcChannels, size_t srcHeight, size_t srcWidth,
- size_t padY, size_t padX, size_t padH, size_t padW, float* dst, size_t dstStride, SimdBool trans);
-
- /*! @ingroup synet_winograd
-
- \fn void SimdWinogradKernel1x3Block1x4SetOutput(const float * src, size_t srcStride, float * dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans);
-
- \short This function is used for output image conversion in Winograd F(1x4,1x3) convolution algorithm.
-
- \note This function is used in Synet Framework.
-
- \param [in] src - a pointer to the input image.
- \param [in] srcStride - a stride of input image.
- \param [out] dst - a pointer to the output image.
- \param [in] dstChannels - a number of output channels.
- \param [in] dstHeight - a height of output image.
- \param [in] dstWidth - a width of output image.
- \param [in] trans - a flag of transposed data.
- */
- SIMD_API void SimdWinogradKernel1x3Block1x4SetOutput(const float* src, size_t srcStride, float* dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans);
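The three F(1x4,1x3) entry points are used together: SetFilter once per layer, then SetInput, a matrix multiplication over the transformed tiles, and SetOutput per forward pass. The sketch below only shows the call order; buffer sizes, strides and the external GEMM are deliberately left symbolic, since the tile layout is not specified by this excerpt:

\verbatim
#include "Simd/SimdLib.h"

void WinogradSketch(const float * weights, size_t srcC, size_t dstC,
    const float * src, size_t srcH, size_t srcW, size_t tileStride,
    float * filterBuf, float * inputBuf, float * outputBuf, float * dst)
{
    /* 1. One-time filter transform; size = (input channels)*(output channels). */
    SimdWinogradKernel1x3Block1x4SetFilter(weights, srcC*dstC, filterBuf, SimdFalse);
    /* 2. Input transform with one-pixel zero padding at both ends of the X axis. */
    SimdWinogradKernel1x3Block1x4SetInput(src, srcC, srcH, srcW,
        0, 1, 0, 1, inputBuf, tileStride, SimdFalse);
    /* 3. Multiply transformed tiles by the transformed filter (external GEMM). */
    /* 4. Inverse transform of the accumulated tiles into the output image. */
    SimdWinogradKernel1x3Block1x4SetOutput(outputBuf, tileStride,
        dst, dstC, srcH, srcW, SimdFalse);
}
\endverbatim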
-
- /*! @ingroup synet_winograd
-
- \fn void SimdWinogradKernel1x3Block1x4SetFilter(const float * src, size_t size, float * dst, SimdBool trans);
-
- \short This function is used for filter conversion in the Winograd F(1x4,1x3) or F(4x1,3x1) convolution algorithm.
-
- \note This function is used in the Synet Framework.
-
- \param [in] src - a pointer to the input 32-bit float array with filter weights.
- \param [in] size - (number of input channels)*(number of output channels).
- \param [out] dst - a pointer to the output 32-bit float array with filter weights.
- \param [in] trans - a flag of transposed data.
- */
- SIMD_API void SimdWinogradKernel1x3Block1x4SetFilter(const float* src, size_t size, float* dst, SimdBool trans);
-
- /*! @ingroup synet_winograd
-
- \fn void SimdWinogradKernel1x3Block1x4SetInput(const float * src, size_t srcChannels, size_t srcHeight, size_t srcWidth, size_t padY, size_t padX, size_t padH, size_t padW, float * dst, size_t dstStride, SimdBool trans);
-
- \short This function is used for input image conversion in the Winograd F(1x4,1x3) convolution algorithm.
-
- \note This function is used in the Synet Framework.
-
- \param [in] src - a pointer to the input image.
- \param [in] srcChannels - a number of input channels.
- \param [in] srcHeight - a height of input image.
- \param [in] srcWidth - a width of input image.
- \param [in] padY - an additional zero padding of input image at the beginning of Y-axis.
- \param [in] padX - an additional zero padding of input image at the beginning of X-axis.
- \param [in] padH - an additional zero padding of input image at the end of Y-axis.
- \param [in] padW - an additional zero padding of input image at the end of X-axis.
- \param [out] dst - a pointer to the output array with converted image.
- \param [in] dstStride - a stride of output image.
- \param [in] trans - a flag of transposed data.
- */
- SIMD_API void SimdWinogradKernel1x3Block1x4SetInput(const float* src, size_t srcChannels, size_t srcHeight, size_t srcWidth,
- size_t padY, size_t padX, size_t padH, size_t padW, float* dst, size_t dstStride, SimdBool trans);
-
- /*! @ingroup synet_winograd
-
- \fn void SimdWinogradKernel1x3Block1x4SetOutput(const float * src, size_t srcStride, float * dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans);
-
- \short This function is used for output image conversion in the Winograd F(1x4,1x3) convolution algorithm.
-
- \note This function is used in the Synet Framework.
-
- \param [in] src - a pointer to the input image.
- \param [in] srcStride - a stride of input image.
- \param [out] dst - a pointer to the output image.
- \param [in] dstChannels - a number of output channels.
- \param [in] dstHeight - a height of output image.
- \param [in] dstWidth - a width of output image.
- \param [in] trans - a flag of transposed data.
- */
- SIMD_API void SimdWinogradKernel1x3Block1x4SetOutput(const float* src, size_t srcStride, float* dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans);
-
- /*! @ingroup synet_winograd
-
- \fn void SimdWinogradKernel1x5Block1x4SetFilter(const float * src, size_t size, float * dst, SimdBool trans);
-
- \short This function is used for filter conversion in the Winograd F(1x4,1x5) or F(4x1,5x1) convolution algorithm.
-
- \note This function is used in the Synet Framework.
-
- \param [in] src - a pointer to the input 32-bit float array with filter weights.
- \param [in] size - (number of input channels)*(number of output channels).
- \param [out] dst - a pointer to the output 32-bit float array with filter weights.
- \param [in] trans - a flag of transposed data.
- */
- SIMD_API void SimdWinogradKernel1x5Block1x4SetFilter(const float* src, size_t size, float* dst, SimdBool trans);
-
- /*! @ingroup synet_winograd
-
- \fn void SimdWinogradKernel1x5Block1x4SetInput(const float * src, size_t srcChannels, size_t srcHeight, size_t srcWidth, size_t padY, size_t padX, size_t padH, size_t padW, float * dst, size_t dstStride, SimdBool trans);
-
- \short This function is used for input image conversion in the Winograd F(1x4,1x5) convolution algorithm.
-
- \note This function is used in the Synet Framework.
-
- \param [in] src - a pointer to the input image.
- \param [in] srcChannels - a number of input channels.
- \param [in] srcHeight - a height of input image.
- \param [in] srcWidth - a width of input image.
- \param [in] padY - an additional zero padding of input image at the beginning of Y-axis.
- \param [in] padX - an additional zero padding of input image at the beginning of X-axis.
- \param [in] padH - an additional zero padding of input image at the end of Y-axis.
- \param [in] padW - an additional zero padding of input image at the end of X-axis.
- \param [out] dst - a pointer to the output array with converted image.
- \param [in] dstStride - a stride of output image.
- \param [in] trans - a flag of transposed data.
- */
- SIMD_API void SimdWinogradKernel1x5Block1x4SetInput(const float* src, size_t srcChannels, size_t srcHeight, size_t srcWidth,
- size_t padY, size_t padX, size_t padH, size_t padW, float* dst, size_t dstStride, SimdBool trans);
-
- /*! @ingroup synet_winograd
-
- \fn void SimdWinogradKernel1x5Block1x4SetOutput(const float * src, size_t srcStride, float * dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans);
-
- \short This function is used for output image conversion in the Winograd F(1x4,1x5) convolution algorithm.
-
- \note This function is used in the Synet Framework.
-
- \param [in] src - a pointer to the input image.
- \param [in] srcStride - a stride of input image.
- \param [out] dst - a pointer to the output image.
- \param [in] dstChannels - a number of output channels.
- \param [in] dstHeight - a height of output image.
- \param [in] dstWidth - a width of output image.
- \param [in] trans - a flag of transposed data.
- */
- SIMD_API void SimdWinogradKernel1x5Block1x4SetOutput(const float* src, size_t srcStride, float* dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans);
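-
- /* Editorial note (not part of the original header): the SetFilter/SetInput/SetOutput
- functions of each Winograd family are used together. For F(1x4,1x3) every output tile
- of 4 values is computed from an overlapping input tile of 4 + 3 - 1 = 6 values, so the
- converted filter and image hold 6 coefficients per tile and one matrix multiplication
- is performed per coefficient. A schematic call order (buffer sizing is an assumption):
-
- SimdWinogradKernel1x3Block1x4SetFilter(weights, srcC * dstC, winoFilter, trans);
- SimdWinogradKernel1x3Block1x4SetInput(src, srcC, srcH, srcW, 0, 1, 0, 1, winoSrc, winoStride, trans);
- // ... 6 GEMMs, one per Winograd coefficient: winoDst[k] = winoFilter[k] * winoSrc[k] ...
- SimdWinogradKernel1x3Block1x4SetOutput(winoDst, winoStride, dst, dstC, dstH, dstW, trans);
- */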
-
- /*! @ingroup synet_winograd
-
- \fn void SimdWinogradKernel2x2Block2x2SetFilter(const float * src, size_t size, float * dst, SimdBool trans);
-
- \short This function is used for filter conversion in the Winograd F(2x2,2x2) convolution algorithm.
-
- \note This function is used in the Synet Framework.
-
- \param [in] src - a pointer to the input 32-bit float array with filter weights.
- \param [in] size - (number of input channels)*(number of output channels).
- \param [out] dst - a pointer to the output 32-bit float array with filter weights.
- \param [in] trans - a flag of transposed data.
- */
- SIMD_API void SimdWinogradKernel2x2Block2x2SetFilter(const float* src, size_t size, float* dst, SimdBool trans);
-
- /*! @ingroup synet_winograd
-
- \fn void SimdWinogradKernel2x2Block2x2SetInput(const float * src, size_t srcChannels, size_t srcHeight, size_t srcWidth, size_t padY, size_t padX, size_t padH, size_t padW, float * dst, size_t dstStride, SimdBool trans);
-
- \short This function is used for input image conversion in the Winograd F(2x2,2x2) convolution algorithm.
-
- \note This function is used in the Synet Framework.
-
- \param [in] src - a pointer to the input image.
- \param [in] srcChannels - a number of input channels.
- \param [in] srcHeight - a height of input image.
- \param [in] srcWidth - a width of input image.
- \param [in] padY - an additional zero padding of input image at the beginning of Y-axis.
- \param [in] padX - an additional zero padding of input image at the beginning of X-axis.
- \param [in] padH - an additional zero padding of input image at the end of Y-axis.
- \param [in] padW - an additional zero padding of input image at the end of X-axis.
- \param [out] dst - a pointer to the output array with converted image.
- \param [in] dstStride - a stride of output image.
- \param [in] trans - a flag of transposed data.
- */
- SIMD_API void SimdWinogradKernel2x2Block2x2SetInput(const float* src, size_t srcChannels, size_t srcHeight, size_t srcWidth,
- size_t padY, size_t padX, size_t padH, size_t padW, float* dst, size_t dstStride, SimdBool trans);
-
- /*! @ingroup synet_winograd
-
- \fn void SimdWinogradKernel2x2Block2x2SetOutput(const float * src, size_t srcStride, float * dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans);
-
- \short This function is used for output image conversion in the Winograd F(2x2,2x2) convolution algorithm.
-
- \note This function is used in the Synet Framework.
-
- \param [in] src - a pointer to the input image.
- \param [in] srcStride - a stride of input image.
- \param [out] dst - a pointer to the output image.
- \param [in] dstChannels - a number of output channels.
- \param [in] dstHeight - a height of output image.
- \param [in] dstWidth - a width of output image.
- \param [in] trans - a flag of transposed data.
- */
- SIMD_API void SimdWinogradKernel2x2Block2x2SetOutput(const float* src, size_t srcStride, float* dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans);
-
- /*! @ingroup synet_winograd
-
- \fn void SimdWinogradKernel2x2Block4x4SetFilter(const float * src, size_t size, float * dst, SimdBool trans);
-
- \short This function is used for filter conversion in the Winograd F(4x4,2x2) convolution algorithm.
-
- \note This function is used in the Synet Framework.
-
- \param [in] src - a pointer to the input 32-bit float array with filter weights.
- \param [in] size - (number of input channels)*(number of output channels).
- \param [out] dst - a pointer to the output 32-bit float array with filter weights.
- \param [in] trans - a flag of transposed data.
- */
- SIMD_API void SimdWinogradKernel2x2Block4x4SetFilter(const float* src, size_t size, float* dst, SimdBool trans);
-
- /*! @ingroup synet_winograd
-
- \fn void SimdWinogradKernel2x2Block4x4SetInput(const float * src, size_t srcChannels, size_t srcHeight, size_t srcWidth, size_t padY, size_t padX, size_t padH, size_t padW, float * dst, size_t dstStride, SimdBool trans);
-
- \short This function is used for input image conversion in the Winograd F(4x4,2x2) convolution algorithm.
-
- \note This function is used in the Synet Framework.
-
- \param [in] src - a pointer to the input image.
- \param [in] srcChannels - a number of input channels.
- \param [in] srcHeight - a height of input image.
- \param [in] srcWidth - a width of input image.
- \param [in] padY - an additional zero padding of input image at the beginning of Y-axis.
- \param [in] padX - an additional zero padding of input image at the beginning of X-axis.
- \param [in] padH - an additional zero padding of input image at the end of Y-axis.
- \param [in] padW - an additional zero padding of input image at the end of X-axis.
- \param [out] dst - a pointer to the output array with converted image.
- \param [in] dstStride - a stride of output image.
- \param [in] trans - a flag of transposed data.
- */
- SIMD_API void SimdWinogradKernel2x2Block4x4SetInput(const float* src, size_t srcChannels, size_t srcHeight, size_t srcWidth,
- size_t padY, size_t padX, size_t padH, size_t padW, float* dst, size_t dstStride, SimdBool trans);
-
- /*! @ingroup synet_winograd
-
- \fn void SimdWinogradKernel2x2Block4x4SetOutput(const float * src, size_t srcStride, float * dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans);
-
- \short This function is used for output image conversion in the Winograd F(4x4,2x2) convolution algorithm.
-
- \note This function is used in the Synet Framework.
-
- \param [in] src - a pointer to the input image.
- \param [in] srcStride - a stride of input image.
- \param [out] dst - a pointer to the output image.
- \param [in] dstChannels - a number of output channels.
- \param [in] dstHeight - a height of output image.
- \param [in] dstWidth - a width of output image.
- \param [in] trans - a flag of transposed data.
- */
- SIMD_API void SimdWinogradKernel2x2Block4x4SetOutput(const float* src, size_t srcStride, float* dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans);
-
- /*! @ingroup synet_winograd
-
- \fn void SimdWinogradKernel3x3Block2x2SetFilter(const float * src, size_t size, float * dst, SimdBool trans);
-
- \short This function is used for filter conversion in the Winograd F(2x2,3x3) convolution algorithm.
-
- \note This function is used in the Synet Framework.
-
- \param [in] src - a pointer to the input 32-bit float array with filter weights.
- \param [in] size - (number of input channels)*(number of output channels).
- \param [out] dst - a pointer to the output 32-bit float array with filter weights.
- \param [in] trans - a flag of transposed data.
- */
- SIMD_API void SimdWinogradKernel3x3Block2x2SetFilter(const float * src, size_t size, float * dst, SimdBool trans);
-
- /*! @ingroup synet_winograd
-
- \fn void SimdWinogradKernel3x3Block2x2SetInput(const float * src, size_t srcChannels, size_t srcHeight, size_t srcWidth, size_t padY, size_t padX, size_t padH, size_t padW, float * dst, size_t dstStride, SimdBool trans);
-
- \short This function is used for input image conversion in the Winograd F(2x2,3x3) convolution algorithm.
-
- \note This function is used in the Synet Framework.
-
- \param [in] src - a pointer to the input image.
- \param [in] srcChannels - a number of input channels.
- \param [in] srcHeight - a height of input image.
- \param [in] srcWidth - a width of input image.
- \param [in] padY - an additional zero padding of input image at the beginning of Y-axis.
- \param [in] padX - an additional zero padding of input image at the beginning of X-axis.
- \param [in] padH - an additional zero padding of input image at the end of Y-axis.
- \param [in] padW - an additional zero padding of input image at the end of X-axis.
- \param [out] dst - a pointer to the output array with converted image.
- \param [in] dstStride - a stride of output image.
- \param [in] trans - a flag of transposed data.
- */
- SIMD_API void SimdWinogradKernel3x3Block2x2SetInput(const float * src, size_t srcChannels, size_t srcHeight, size_t srcWidth,
- size_t padY, size_t padX, size_t padH, size_t padW, float * dst, size_t dstStride, SimdBool trans);
-
- /*! @ingroup synet_winograd
-
- \fn void SimdWinogradKernel3x3Block2x2SetOutput(const float * src, size_t srcStride, float * dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans);
-
- \short This function is used for output image conversion in the Winograd F(2x2,3x3) convolution algorithm.
-
- \note This function is used in the Synet Framework.
-
- \param [in] src - a pointer to the input image.
- \param [in] srcStride - a stride of input image.
- \param [out] dst - a pointer to the output image.
- \param [in] dstChannels - a number of output channels.
- \param [in] dstHeight - a height of output image.
- \param [in] dstWidth - a width of output image.
- \param [in] trans - a flag of transposed data.
- */
- SIMD_API void SimdWinogradKernel3x3Block2x2SetOutput(const float * src, size_t srcStride, float * dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans);
-
- /*! @ingroup synet_winograd
-
- \fn void SimdWinogradKernel3x3Block3x3SetFilter(const float * src, size_t size, float * dst, SimdBool trans);
-
- \short This function is used for filter conversion in the Winograd F(3x3,3x3) convolution algorithm.
-
- \note This function is used in the Synet Framework.
-
- \param [in] src - a pointer to the input 32-bit float array with filter weights.
- \param [in] size - (number of input channels)*(number of output channels).
- \param [out] dst - a pointer to the output 32-bit float array with filter weights.
- \param [in] trans - a flag of transposed data.
- */
- SIMD_API void SimdWinogradKernel3x3Block3x3SetFilter(const float * src, size_t size, float * dst, SimdBool trans);
-
- /*! @ingroup synet_winograd
-
- \fn void SimdWinogradKernel3x3Block3x3SetInput(const float * src, size_t srcChannels, size_t srcHeight, size_t srcWidth, size_t padY, size_t padX, size_t padH, size_t padW, float * dst, size_t dstStride, SimdBool trans);
-
- \short This function is used for input image conversion in the Winograd F(3x3,3x3) convolution algorithm.
-
- \note This function is used in the Synet Framework.
-
- \param [in] src - a pointer to the input image.
- \param [in] srcChannels - a number of input channels.
- \param [in] srcHeight - a height of input image.
- \param [in] srcWidth - a width of input image.
- \param [in] padY - an additional zero padding of input image at the beginning of Y-axis.
- \param [in] padX - an additional zero padding of input image at the beginning of X-axis.
- \param [in] padH - an additional zero padding of input image at the end of Y-axis.
- \param [in] padW - an additional zero padding of input image at the end of X-axis.
- \param [out] dst - a pointer to the output array with converted image.
- \param [in] dstStride - a stride of output image.
- \param [in] trans - a flag of transposed data.
- */
- SIMD_API void SimdWinogradKernel3x3Block3x3SetInput(const float* src, size_t srcChannels, size_t srcHeight, size_t srcWidth,
- size_t padY, size_t padX, size_t padH, size_t padW, float* dst, size_t dstStride, SimdBool trans);
-
- /*! @ingroup synet_winograd
-
- \fn void SimdWinogradKernel3x3Block3x3SetOutput(const float * src, size_t srcStride, float * dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans);
-
- \short This function is used for output image conversion in the Winograd F(3x3,3x3) convolution algorithm.
-
- \note This function is used in the Synet Framework.
-
- \param [in] src - a pointer to the input image.
- \param [in] srcStride - a stride of input image.
- \param [out] dst - a pointer to the output image.
- \param [in] dstChannels - a number of output channels.
- \param [in] dstHeight - a height of output image.
- \param [in] dstWidth - a width of output image.
- \param [in] trans - a flag of transposed data.
- */
- SIMD_API void SimdWinogradKernel3x3Block3x3SetOutput(const float * src, size_t srcStride, float * dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans);
-
- /*! @ingroup synet_winograd
-
- \fn void SimdWinogradKernel3x3Block4x4SetFilter(const float * src, size_t size, float * dst, SimdBool trans);
-
- \short This function is used for filter conversion in the Winograd F(4x4,3x3) convolution algorithm.
-
- \note This function is used in the Synet Framework.
-
- \param [in] src - a pointer to the input 32-bit float array with filter weights.
- \param [in] size - (number of input channels)*(number of output channels).
- \param [out] dst - a pointer to the output 32-bit float array with filter weights.
- \param [in] trans - a flag of transposed data.
- */
- SIMD_API void SimdWinogradKernel3x3Block4x4SetFilter(const float * src, size_t size, float * dst, SimdBool trans);
-
- /*! @ingroup synet_winograd
-
- \fn void SimdWinogradKernel3x3Block4x4SetInput(const float * src, size_t srcChannels, size_t srcHeight, size_t srcWidth, size_t padY, size_t padX, size_t padH, size_t padW, float * dst, size_t dstStride, SimdBool trans);
-
- \short This function is used for input image conversion in the Winograd F(4x4,3x3) convolution algorithm.
-
- \note This function is used in the Synet Framework.
-
- \param [in] src - a pointer to the input image.
- \param [in] srcChannels - a number of input channels.
- \param [in] srcHeight - a height of input image.
- \param [in] srcWidth - a width of input image.
- \param [in] padY - an additional zero padding of input image at the beginning of Y-axis.
- \param [in] padX - an additional zero padding of input image at the beginning of X-axis.
- \param [in] padH - an additional zero padding of input image at the end of Y-axis.
- \param [in] padW - an additional zero padding of input image at the end of X-axis.
- \param [out] dst - a pointer to the output array with converted image.
- \param [in] dstStride - a stride of output image.
- \param [in] trans - a flag of transposed data.
- */
- SIMD_API void SimdWinogradKernel3x3Block4x4SetInput(const float* src, size_t srcChannels, size_t srcHeight, size_t srcWidth,
- size_t padY, size_t padX, size_t padH, size_t padW, float* dst, size_t dstStride, SimdBool trans);
-
- /*! @ingroup synet_winograd
-
- \fn void SimdWinogradKernel3x3Block4x4SetOutput(const float * src, size_t srcStride, float * dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans);
-
- \short This function is used for output image conversion in the Winograd F(4x4,3x3) convolution algorithm.
-
- \note This function is used in the Synet Framework.
-
- \param [in] src - a pointer to the input image.
- \param [in] srcStride - a stride of input image.
- \param [out] dst - a pointer to the output image.
- \param [in] dstChannels - a number of output channels.
- \param [in] dstHeight - a height of output image.
- \param [in] dstWidth - a width of output image.
- \param [in] trans - a flag of transposed data.
- */
- SIMD_API void SimdWinogradKernel3x3Block4x4SetOutput(const float * src, size_t srcStride, float * dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans);
-
- /*! @ingroup yuv_conversion
-
- \fn void SimdYuva420pToBgra(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, const uint8_t * a, size_t aStride, size_t width, size_t height, uint8_t * bgra, size_t bgraStride);
-
- \short Converts YUVA420P image to 32-bit BGRA image.
-
- The input Y, A and output BGRA images must have the same width and height.
- The input U and V images must have the same width and height (half the size of the Y plane in each dimension).
-
- \note This function has a C++ wrapper: Simd::Yuva420pToBgra(const View<A>& y, const View<A>& u, const View<A>& v, const View<A>& a, View<A>& bgra).
-
- \param [in] y - a pointer to pixels data of input 8-bit image with Y color plane.
- \param [in] yStride - a row size of the y image.
- \param [in] u - a pointer to pixels data of input 8-bit image with U color plane.
- \param [in] uStride - a row size of the u image.
- \param [in] v - a pointer to pixels data of input 8-bit image with V color plane.
- \param [in] vStride - a row size of the v image.
- \param [in] a - a pointer to pixels data of input 8-bit image with alpha channel.
- \param [in] aStride - a row size of the a image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [out] bgra - a pointer to pixels data of output 32-bit BGRA image.
- \param [in] bgraStride - a row size of the bgra image.
- */
- SIMD_API void SimdYuva420pToBgra(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride,
- const uint8_t * a, size_t aStride, size_t width, size_t height, uint8_t * bgra, size_t bgraStride);
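-
- /* Editorial note (not part of the original header): for YUV420P the chroma planes are
- subsampled by 2 in both directions, so for an even width x height frame the minimal
- (unpadded) strides are as in this hedged sketch; real buffers may pad rows further:
-
- SimdYuva420pToBgra(y, width, u, width / 2, v, width / 2, a, width,
- width, height, bgra, 4 * width);
- */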
-
- /*! @ingroup yuv_conversion
-
- \fn void SimdYuv420pToBgr(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, size_t width, size_t height, uint8_t * bgr, size_t bgrStride);
-
- \short Converts YUV420P image to 24-bit BGR image.
-
- The input Y and output BGR images must have the same width and height.
- The input U and V images must have the same width and height (half the size of the Y plane in each dimension).
-
- \note This function has a C++ wrapper: Simd::Yuv420pToBgr(const View<A>& y, const View<A>& u, const View<A>& v, View<A>& bgr).
-
- \param [in] y - a pointer to pixels data of input 8-bit image with Y color plane.
- \param [in] yStride - a row size of the y image.
- \param [in] u - a pointer to pixels data of input 8-bit image with U color plane.
- \param [in] uStride - a row size of the u image.
- \param [in] v - a pointer to pixels data of input 8-bit image with V color plane.
- \param [in] vStride - a row size of the v image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [out] bgr - a pointer to pixels data of output 24-bit BGR image.
- \param [in] bgrStride - a row size of the bgr image.
- */
- SIMD_API void SimdYuv420pToBgr(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride,
- size_t width, size_t height, uint8_t * bgr, size_t bgrStride);
-
- /*! @ingroup yuv_conversion
-
- \fn void SimdYuv422pToBgr(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, size_t width, size_t height, uint8_t * bgr, size_t bgrStride);
-
- \short Converts YUV422P image to 24-bit BGR image.
-
- The input Y and output BGR images must have the same width and height.
- The input U and V images must have the same height as the Y plane and half its width.
-
- \note This function has a C++ wrapper: Simd::Yuv422pToBgr(const View<A>& y, const View<A>& u, const View<A>& v, View<A>& bgr).
-
- \param [in] y - a pointer to pixels data of input 8-bit image with Y color plane.
- \param [in] yStride - a row size of the y image.
- \param [in] u - a pointer to pixels data of input 8-bit image with U color plane.
- \param [in] uStride - a row size of the u image.
- \param [in] v - a pointer to pixels data of input 8-bit image with V color plane.
- \param [in] vStride - a row size of the v image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [out] bgr - a pointer to pixels data of output 24-bit BGR image.
- \param [in] bgrStride - a row size of the bgr image.
- */
- SIMD_API void SimdYuv422pToBgr(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride,
- size_t width, size_t height, uint8_t * bgr, size_t bgrStride);
-
- /*! @ingroup yuv_conversion
-
- \fn void SimdYuv444pToBgr(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, size_t width, size_t height, uint8_t * bgr, size_t bgrStride);
-
- \short Converts YUV444P image to 24-bit BGR image.
-
- The input Y, U, V and output BGR images must have the same width and height.
-
- \note This function has a C++ wrapper: Simd::Yuv444pToBgr(const View<A>& y, const View<A>& u, const View<A>& v, View<A>& bgr).
-
- \param [in] y - a pointer to pixels data of input 8-bit image with Y color plane.
- \param [in] yStride - a row size of the y image.
- \param [in] u - a pointer to pixels data of input 8-bit image with U color plane.
- \param [in] uStride - a row size of the u image.
- \param [in] v - a pointer to pixels data of input 8-bit image with V color plane.
- \param [in] vStride - a row size of the v image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [out] bgr - a pointer to pixels data of output 24-bit BGR image.
- \param [in] bgrStride - a row size of the bgr image.
- */
- SIMD_API void SimdYuv444pToBgr(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride,
- size_t width, size_t height, uint8_t * bgr, size_t bgrStride);
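-
- /* Editorial note (not part of the original header): the three conversions above differ
- only in chroma-plane geometry; a hedged sizing summary for a width x height frame:
-
- size_t uvWidth420 = width / 2, uvHeight420 = height / 2; // YUV420P (width, height even)
- size_t uvWidth422 = width / 2, uvHeight422 = height;     // YUV422P (width even)
- size_t uvWidth444 = width,     uvHeight444 = height;     // YUV444P
- */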
-
- /*! @ingroup yuv_conversion
-
- \fn void SimdYuv420pToBgra(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, size_t width, size_t height, uint8_t * bgra, size_t bgraStride, uint8_t alpha);
-
- \short Converts YUV420P image to 32-bit BGRA image.
-
- The input Y and output BGRA images must have the same width and height.
- The input U and V images must have the same width and height (half the size of the Y plane in each dimension).
-
- \note This function has a C++ wrapper: Simd::Yuv420pToBgra(const View<A>& y, const View<A>& u, const View<A>& v, View<A>& bgra, uint8_t alpha).
-
- \param [in] y - a pointer to pixels data of input 8-bit image with Y color plane.
- \param [in] yStride - a row size of the y image.
- \param [in] u - a pointer to pixels data of input 8-bit image with U color plane.
- \param [in] uStride - a row size of the u image.
- \param [in] v - a pointer to pixels data of input 8-bit image with V color plane.
- \param [in] vStride - a row size of the v image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [out] bgra - a pointer to pixels data of output 32-bit BGRA image.
- \param [in] bgraStride - a row size of the bgra image.
- \param [in] alpha - a value of alpha channel.
- */
- SIMD_API void SimdYuv420pToBgra(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride,
- size_t width, size_t height, uint8_t * bgra, size_t bgraStride, uint8_t alpha);
-
- /*! @ingroup yuv_conversion
-
- \fn void SimdYuv422pToBgra(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, size_t width, size_t height, uint8_t * bgra, size_t bgraStride, uint8_t alpha);
-
- \short Converts YUV422P image to 32-bit BGRA image.
-
- The input Y and output BGRA images must have the same width and height.
- The input U and V images must have the same height as the Y plane and half its width.
-
- \note This function has a C++ wrapper: Simd::Yuv422pToBgra(const View<A>& y, const View<A>& u, const View<A>& v, View<A>& bgra, uint8_t alpha).
-
- \param [in] y - a pointer to pixels data of input 8-bit image with Y color plane.
- \param [in] yStride - a row size of the y image.
- \param [in] u - a pointer to pixels data of input 8-bit image with U color plane.
- \param [in] uStride - a row size of the u image.
- \param [in] v - a pointer to pixels data of input 8-bit image with V color plane.
- \param [in] vStride - a row size of the v image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [out] bgra - a pointer to pixels data of output 32-bit BGRA image.
- \param [in] bgraStride - a row size of the bgra image.
- \param [in] alpha - a value of alpha channel.
- */
- SIMD_API void SimdYuv422pToBgra(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride,
- size_t width, size_t height, uint8_t * bgra, size_t bgraStride, uint8_t alpha);
-
- /*! @ingroup yuv_conversion
-
- \fn void SimdYuv444pToBgra(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, size_t width, size_t height, uint8_t * bgra, size_t bgraStride, uint8_t alpha);
-
- \short Converts YUV444P image to 32-bit BGRA image.
-
- The input Y, U, V and output BGRA images must have the same width and height.
-
- \note This function has a C++ wrapper: Simd::Yuv444pToBgra(const View<A>& y, const View<A>& u, const View<A>& v, View<A>& bgra, uint8_t alpha).
-
- \param [in] y - a pointer to pixels data of input 8-bit image with Y color plane.
- \param [in] yStride - a row size of the y image.
- \param [in] u - a pointer to pixels data of input 8-bit image with U color plane.
- \param [in] uStride - a row size of the u image.
- \param [in] v - a pointer to pixels data of input 8-bit image with V color plane.
- \param [in] vStride - a row size of the v image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [out] bgra - a pointer to pixels data of output 32-bit BGRA image.
- \param [in] bgraStride - a row size of the bgra image.
- \param [in] alpha - a value of alpha channel.
- */
- SIMD_API void SimdYuv444pToBgra(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride,
- size_t width, size_t height, uint8_t * bgra, size_t bgraStride, uint8_t alpha);
-
- /*! @ingroup yuv_conversion
-
- \fn void SimdYuv444pToHsl(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, size_t width, size_t height, uint8_t * hsl, size_t hslStride);
-
- \short Converts YUV444P image to 24-bit HSL (Hue, Saturation, Lightness) image.
-
- The input Y, U, V and output HSL images must have the same width and height.
-
- \note This function has a C++ wrapper: Simd::Yuv444pToHsl(const View<A>& y, const View<A>& u, const View<A>& v, View<A>& hsl).
-
- \param [in] y - a pointer to pixels data of input 8-bit image with Y color plane.
- \param [in] yStride - a row size of the y image.
- \param [in] u - a pointer to pixels data of input 8-bit image with U color plane.
- \param [in] uStride - a row size of the u image.
- \param [in] v - a pointer to pixels data of input 8-bit image with V color plane.
- \param [in] vStride - a row size of the v image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [out] hsl - a pointer to pixels data of output 24-bit HSL image.
- \param [in] hslStride - a row size of the hsl image.
- */
- SIMD_API void SimdYuv444pToHsl(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride,
- size_t width, size_t height, uint8_t * hsl, size_t hslStride);
-
- /*! @ingroup yuv_conversion
-
- \fn void SimdYuv444pToHsv(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, size_t width, size_t height, uint8_t * hsv, size_t hsvStride);
-
- \short Converts YUV444P image to 24-bit HSV (Hue, Saturation, Value) image.
-
- The input Y, U, V and output HSV images must have the same width and height.
-
- \note This function has a C++ wrapper: Simd::Yuv444pToHsv(const View<A>& y, const View<A>& u, const View<A>& v, View<A>& hsv).
-
- \param [in] y - a pointer to pixels data of input 8-bit image with Y color plane.
- \param [in] yStride - a row size of the y image.
- \param [in] u - a pointer to pixels data of input 8-bit image with U color plane.
- \param [in] uStride - a row size of the u image.
- \param [in] v - a pointer to pixels data of input 8-bit image with V color plane.
- \param [in] vStride - a row size of the v image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [out] hsv - a pointer to pixels data of output 24-bit HSV image.
- \param [in] hsvStride - a row size of the hsv image.
- */
- SIMD_API void SimdYuv444pToHsv(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride,
- size_t width, size_t height, uint8_t * hsv, size_t hsvStride);
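-
- /* Editorial note (not part of the original header): a hedged call sketch. For YUV444P
- all input planes are full resolution and the HSV output is 3 bytes per pixel:
-
- SimdYuv444pToHsv(y, width, u, width, v, width, width, height, hsv, 3 * width);
- */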
-
- /*! @ingroup yuv_conversion
-
- \fn void SimdYuv420pToHue(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, size_t width, size_t height, uint8_t * hue, size_t hueStride);
-
- \short Converts YUV420P image to 8-bit image with Hue component of HSV or HSL color space.
-
- The input Y and output Hue images must have the same width and height.
- The input U and V images must have the same width and height (half the size of the Y plane in each dimension).
-
- \note This function has a C++ wrapper: Simd::Yuv420pToHue(const View<A>& y, const View<A>& u, const View<A>& v, View<A>& hue).
-
- \param [in] y - a pointer to pixels data of input 8-bit image with Y color plane.
- \param [in] yStride - a row size of the y image.
- \param [in] u - a pointer to pixels data of input 8-bit image with U color plane.
- \param [in] uStride - a row size of the u image.
- \param [in] v - a pointer to pixels data of input 8-bit image with V color plane.
- \param [in] vStride - a row size of the v image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [out] hue - a pointer to pixels data of output 8-bit Hue image.
- \param [in] hueStride - a row size of the hue image.
- */
- SIMD_API void SimdYuv420pToHue(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride,
- size_t width, size_t height, uint8_t * hue, size_t hueStride);
-
- /*! @ingroup yuv_conversion
-
- \fn void SimdYuv444pToHue(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, size_t width, size_t height, uint8_t * hue, size_t hueStride);
-
- \short Converts YUV444P image to 8-bit image with Hue component of HSV or HSL color space.
-
- The input Y, U, V and output Hue images must have the same width and height.
-
- \note This function has a C++ wrapper: Simd::Yuv444pToHue(const View<A>& y, const View<A>& u, const View<A>& v, View<A>& hue).
-
- \param [in] y - a pointer to pixels data of input 8-bit image with Y color plane.
- \param [in] yStride - a row size of the y image.
- \param [in] u - a pointer to pixels data of input 8-bit image with U color plane.
- \param [in] uStride - a row size of the u image.
- \param [in] v - a pointer to pixels data of input 8-bit image with V color plane.
- \param [in] vStride - a row size of the v image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [out] hue - a pointer to pixels data of output 8-bit Hue image.
- \param [in] hueStride - a row size of the hue image.
- */
- SIMD_API void SimdYuv444pToHue(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride,
- size_t width, size_t height, uint8_t * hue, size_t hueStride);
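-
- /* Editorial note (not part of the original header): the single-byte hue plane maps the
- 0..360 degree hue circle onto 0..255 (an assumed scaling), which makes it a compact
- input for color segmentation; a hedged call sketch:
-
- SimdYuv420pToHue(y, yStride, u, uStride, v, vStride, width, height, hue, hueStride);
- */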
-
- /*! @ingroup yuv_conversion
-
- \fn void SimdYuv420pToRgb(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, size_t width, size_t height, uint8_t * rgb, size_t rgbStride);
-
- \short Converts YUV420P image to 24-bit RGB image.
-
- The input Y and output RGB images must have the same width and height.
- The input U and V images must have the same width and height (half the size of the Y plane in each dimension).
-
- \note This function has a C++ wrapper: Simd::Yuv420pToRgb(const View<A>& y, const View<A>& u, const View<A>& v, View<A>& rgb).
-
- \param [in] y - a pointer to pixels data of input 8-bit image with Y color plane.
- \param [in] yStride - a row size of the y image.
- \param [in] u - a pointer to pixels data of input 8-bit image with U color plane.
- \param [in] uStride - a row size of the u image.
- \param [in] v - a pointer to pixels data of input 8-bit image with V color plane.
- \param [in] vStride - a row size of the v image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [out] rgb - a pointer to pixels data of output 24-bit RGB image.
- \param [in] rgbStride - a row size of the rgb image.
- */
- SIMD_API void SimdYuv420pToRgb(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride,
- size_t width, size_t height, uint8_t* rgb, size_t rgbStride);
-
- /*! @ingroup yuv_conversion
-
- \fn void SimdYuv422pToRgb(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, size_t width, size_t height, uint8_t * rgb, size_t rgbStride);
-
- \short Converts YUV422P image to 24-bit RGB image.
-
- The input Y and output RGB images must have the same width and height.
- The input U and V images must have the same height as the Y plane and half its width.
-
- \note This function has a C++ wrapper: Simd::Yuv422pToRgb(const View<A>& y, const View<A>& u, const View<A>& v, View<A>& rgb).
-
- \param [in] y - a pointer to pixels data of input 8-bit image with Y color plane.
- \param [in] yStride - a row size of the y image.
- \param [in] u - a pointer to pixels data of input 8-bit image with U color plane.
- \param [in] uStride - a row size of the u image.
- \param [in] v - a pointer to pixels data of input 8-bit image with V color plane.
- \param [in] vStride - a row size of the v image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [out] rgb - a pointer to pixels data of output 24-bit RGB image.
- \param [in] rgbStride - a row size of the rgb image.
- */
- SIMD_API void SimdYuv422pToRgb(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride,
- size_t width, size_t height, uint8_t* rgb, size_t rgbStride);
-
- /*! @ingroup yuv_conversion
-
- \fn void SimdYuv444pToRgb(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, size_t width, size_t height, uint8_t * rgb, size_t rgbStride);
-
- \short Converts YUV444P image to 24-bit RGB image.
-
- The input Y, U, V and output RGB images must have the same width and height.
-
- \note This function has a C++ wrapper: Simd::Yuv444pToRgb(const View<A>& y, const View<A>& u, const View<A>& v, View<A>& rgb).
-
- \param [in] y - a pointer to pixels data of input 8-bit image with Y color plane.
- \param [in] yStride - a row size of the y image.
- \param [in] u - a pointer to pixels data of input 8-bit image with U color plane.
- \param [in] uStride - a row size of the u image.
- \param [in] v - a pointer to pixels data of input 8-bit image with V color plane.
- \param [in] vStride - a row size of the v image.
- \param [in] width - an image width.
- \param [in] height - an image height.
- \param [out] rgb - a pointer to pixels data of output 24-bit RGB image.
- \param [in] rgbStride - a row size of the rgb image.
- */
- SIMD_API void SimdYuv444pToRgb(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride,
- size_t width, size_t height, uint8_t* rgb, size_t rgbStride);
-
-#ifdef __cplusplus
-}
-#endif // __cplusplus
-
-#endif//__SimdLib_h__
diff --git a/src/3rd/Simd/Simd/SimdLib.hpp b/src/3rd/Simd/Simd/SimdLib.hpp
deleted file mode 100644
index 8a7f2e09..00000000
--- a/src/3rd/Simd/Simd/SimdLib.hpp
+++ /dev/null
@@ -1,4348 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2020 Yermalayeu Ihar,
-* 2014-2019 Antonenka Mikhail,
-* 2019-2019 Facundo Galan.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-
-#include "Simd/SimdView.hpp"
-#include "Simd/SimdPixel.hpp"
-#include "Simd/SimdPyramid.hpp"
-
-#ifndef __SimdLib_hpp__
-#define __SimdLib_hpp__
-
-/*! \namespace Simd */
-namespace Simd
-{
- /*! @ingroup info
-
- \fn void PrintInfo(std::ostream & os)
-
- \short Prints information about %Simd Library and CPU properties.
-
- \param [in, out] os - output stream.
- */
- SIMD_INLINE void PrintInfo(std::ostream & os)
- {
- os << "Simd Library: " << SimdVersion();
- os << "; System Sockets: " << SimdCpuInfo(SimdCpuInfoSockets);
- os << ", Cores: " << SimdCpuInfo(SimdCpuInfoCores);
- os << ", Threads: " << SimdCpuInfo(SimdCpuInfoThreads);
- os << "; Cache L1D: " << SimdCpuInfo(SimdCpuInfoCacheL1) / 1024 << " KB";
- os << ", L2: " << SimdCpuInfo(SimdCpuInfoCacheL2) / 1024 << " KB";
- os << ", L3: " << SimdCpuInfo(SimdCpuInfoCacheL3) / 1024 << " KB";
- os << "; Available SIMD:";
- os << (SimdCpuInfo(SimdCpuInfoAvx512vnni) ? " AVX-512VNNI" : "");
- os << (SimdCpuInfo(SimdCpuInfoAvx512bw) ? " AVX-512BW" : "");
- os << (SimdCpuInfo(SimdCpuInfoAvx512f) ? " AVX-512F" : "");
- os << (SimdCpuInfo(SimdCpuInfoAvx2) ? " AVX2 FMA" : "");
- os << (SimdCpuInfo(SimdCpuInfoAvx) ? " AVX" : "");
- os << (SimdCpuInfo(SimdCpuInfoSse41) ? " SSE4.1" : "");
- os << (SimdCpuInfo(SimdCpuInfoSsse3) ? " SSSE3" : "");
- os << (SimdCpuInfo(SimdCpuInfoSse3) ? " SSE3" : "");
- os << (SimdCpuInfo(SimdCpuInfoSse2) ? " SSE2" : "");
- os << (SimdCpuInfo(SimdCpuInfoSse) ? " SSE" : "");
- os << (SimdCpuInfo(SimdCpuInfoVmx) ? " Altivec" : "");
- os << (SimdCpuInfo(SimdCpuInfoVsx) ? " VSX" : "");
- os << (SimdCpuInfo(SimdCpuInfoNeon) ? " NEON" : "");
- os << (SimdCpuInfo(SimdCpuInfoMsa) ? " MSA" : "");
- os << std::endl;
- }
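-
- /* Editorial note (not part of the original header): PrintInfo is a convenient first
- call when checking which SIMD extensions a build can actually use; a minimal sketch:
-
- #include <iostream>
- int main() { Simd::PrintInfo(std::cout); return 0; }
- */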
-
- /*! @ingroup correlation
-
- \fn void AbsDifference(const View<A> & a, const View<A> & b, View<A> & c)
-
- \short Gets absolute difference of two gray 8-bit images, pixel by pixel.
-
- Both images must have the same width and height.
-
- \note This function is a C++ wrapper for function ::SimdAbsDifference.
-
- \param [in] a - a first image.
- \param [in] b - a second image.
- \param [out] c - a destination image.
- */
- template<template<class> class A> SIMD_INLINE void AbsDifference(const View<A> & a, const View<A> & b, View<A> & c)
- {
- assert(Compatible(a, b) && Compatible(b, c) && a.format == View<A>::Gray8);
-
- SimdAbsDifference(a.data, a.stride, b.data, b.stride, c.data, c.stride, a.width, a.height);
- }
-
- /*! @ingroup correlation
-
- \fn void AbsDifferenceSum(const View<A>& a, const View<A>& b, uint64_t & sum)
-
- \short Gets sum of absolute difference of two gray 8-bit images.
-
- Both images must have the same width and height.
-
- \note This function is a C++ wrapper for function ::SimdAbsDifferenceSum.
-
- \param [in] a - a first image.
- \param [in] b - a second image.
- \param [out] sum - the result sum of absolute difference of two images.
- */
- template<template<class> class A> SIMD_INLINE void AbsDifferenceSum(const View<A>& a, const View<A>& b, uint64_t & sum)
- {
- assert(Compatible(a, b) && a.format == View<A>::Gray8);
-
- SimdAbsDifferenceSum(a.data, a.stride, b.data, b.stride, a.width, a.height, &sum);
- }
-
- /*! @ingroup correlation
-
- \fn void AbsDifferenceSum(const View<A>& a, const View<A>& b, const View<A>& mask, uint8_t index, uint64_t & sum)
-
- \short Gets sum of absolute difference of two gray 8-bit images based on gray 8-bit mask.
-
- Gets the absolute difference sum for all points when mask[i] == index.
- Both images and mask must have the same width and height.
-
- \note This function is a C++ wrapper for function ::SimdAbsDifferenceSumMasked.
-
- \param [in] a - a first image.
- \param [in] b - a second image.
- \param [in] mask - a mask image.
- \param [in] index - a mask index.
- \param [out] sum - the result sum of absolute difference of two images.
- */
- template<template<class> class A> SIMD_INLINE void AbsDifferenceSum(const View<A>& a, const View<A>& b, const View<A>& mask, uint8_t index, uint64_t & sum)
- {
- assert(Compatible(a, b, mask) && a.format == View<A>::Gray8);
-
- SimdAbsDifferenceSumMasked(a.data, a.stride, b.data, b.stride, mask.data, mask.stride, index, a.width, a.height, &sum);
- }
-
- /*! @ingroup correlation
-
- \fn void AbsDifferenceSums3x3(const View<A>& current, const View<A>& background, uint64_t * sums)
-
- \short Gets 9 sums of absolute difference of two gray 8-bit images with various relative shifts in a 3x3 neighborhood.
-
- Both images must have the same width and height. The image height and width must be at least 3.
- The sums are calculated over the central part (with a 1-pixel border indent) of the current image and the correspondingly shifted part of the background image.
- The shifts lie in the range [-1, 1] along the x and y axes.
-
- \note This function is a C++ wrapper for function ::SimdAbsDifferenceSums3x3.
-
- \param [in] current - a current image.
- \param [in] background - a background image.
- \param [out] sums - the pointer to buffer with result sums. The buffer size must be at least 9.
- */
- template<template<class> class A> SIMD_INLINE void AbsDifferenceSums3x3(const View<A>& current, const View<A>& background, uint64_t * sums)
- {
- assert(Compatible(current, background) && current.format == View<A>::Gray8 && current.width > 2 && current.height > 2);
-
- SimdAbsDifferenceSums3x3(current.data, current.stride, background.data, background.stride, current.width, current.height, sums);
- }
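-
- /* Editorial note (not part of the original header): a hedged sketch computing the mean
- absolute difference between two frames as a cheap global change metric; the View and
- Allocator typedef follows the library's usual convention:
-
- typedef Simd::View<Simd::Allocator> View;
- View prev(width, height, View::Gray8), curr(width, height, View::Gray8);
- uint64_t sum = 0;
- Simd::AbsDifferenceSum(prev, curr, sum);
- double mad = double(sum) / (width * height);
- */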
-
- /*! @ingroup correlation
-
- \fn void AbsDifferenceSums3x3(const View<A>& current, const View<A>& background, const View<A>& mask, uint8_t index, uint64_t * sums)
-
- \short Gets 9 sums of absolute difference of two gray 8-bit images with various relative shifts in a 3x3 neighborhood based on gray 8-bit mask.
-
- Gets the absolute difference sums for all points when mask[i] == index.
- Both images and mask must have the same width and height. The image height and width must be at least 3.
- The sums are calculated over the central part (with a 1-pixel border indent) of the current image and the correspondingly shifted part of the background image.
- The shifts lie in the range [-1, 1] along the x and y axes.
-
- \note This function is a C++ wrapper for function ::SimdAbsDifferenceSums3x3Masked.
-
- \param [in] current - a current image.
- \param [in] background - a background image.
- \param [in] mask - a mask image.
- \param [in] index - a mask index.
- \param [out] sums - the pointer to buffer with result sums. The buffer size must be at least 9.
- */
- template<template<class> class A> SIMD_INLINE void AbsDifferenceSums3x3(const View<A>& current, const View<A>& background, const View<A>& mask, uint8_t index, uint64_t * sums)
- {
- assert(Compatible(current, background, mask) && current.format == View<A>::Gray8 && current.width > 2 && current.height > 2);
-
- SimdAbsDifferenceSums3x3Masked(current.data, current.stride, background.data, background.stride,
- mask.data, mask.stride, index, current.width, current.height, sums);
- }
-
- /*! @ingroup other_filter
-
- \fn void AbsGradientSaturatedSum(const View<A>& src, View<A>& dst)
-
- \short Writes to the destination 8-bit gray image the saturated sum of absolute gradients for every point of the source 8-bit gray image.
-
- Both images must have the same width and height.
-
- For border pixels:
- \verbatim
- dst[x, y] = 0;
- \endverbatim
-
- For other pixels:
- \verbatim
- dx = abs(src[x + 1, y] - src[x - 1, y]);
- dy = abs(src[x, y + 1] - src[x, y - 1]);
- dst[x, y] = min(dx + dy, 255);
- \endverbatim
-
- \note This function is a C++ wrapper for function ::SimdAbsGradientSaturatedSum.
-
- \param [in] src - a source 8-bit gray image.
- \param [out] dst - a destination 8-bit gray image.
- */
- template<template<class> class A> SIMD_INLINE void AbsGradientSaturatedSum(const View<A>& src, View<A>& dst)
- {
- assert(Compatible(src, dst) && src.format == View<A>::Gray8 && src.height >= 3 && src.width >= 3);
-
- SimdAbsGradientSaturatedSum(src.data, src.stride, src.width, src.height, dst.data, dst.stride);
- }
-
- /*! @ingroup difference_estimation
-
- \fn void AddFeatureDifference(const View<A>& value, const View<A>& lo, const View<A>& hi, uint16_t weight, View<A>& difference)
-
- \short Adds feature difference to common difference sum.
-
- All images must have the same width, height and format (8-bit gray).
-
- For every point:
- \verbatim
- excess = max(lo[i] - value[i], 0) + max(value[i] - hi[i], 0);
- difference[i] += (weight * excess*excess) >> 16;
- \endverbatim
-
- This function is used for difference estimation in algorithm of motion detection.
-
- \note This function is a C++ wrapper for function ::SimdAddFeatureDifference.
-
- \param [in] value - a current feature value.
- \param [in] lo - a feature lower bound of dynamic background.
- \param [in] hi - a feature upper bound of dynamic background.
- \param [in] weight - a current feature weight (unsigned 16-bit value).
- \param [in, out] difference - an image with total difference.
- */
- template<template<class> class A> SIMD_INLINE void AddFeatureDifference(const View<A>& value, const View<A>& lo, const View<A>& hi, uint16_t weight, View<A>& difference)
- {
- assert(Compatible(value, lo, hi, difference) && value.format == View<A>::Gray8);
-
- SimdAddFeatureDifference(value.data, value.stride, value.width, value.height,
- lo.data, lo.stride, hi.data, hi.stride, weight, difference.data, difference.stride);
- }
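-
- /* Editorial note (not part of the original header): a hedged sketch chaining the two
- functions above: the saturated gradient map serves as the texture feature whose drift
- outside the [lo, hi] background bounds is accumulated into the difference image:
-
- Simd::AbsGradientSaturatedSum(gray, gradient);
- Simd::AddFeatureDifference(gradient, lo, hi, weight, difference);
- */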
-
- /*! @ingroup drawing
-
- \fn void AlphaBlending(const View<A>& src, const View<A>& alpha, View<A>& dst)
-
- \short Performs alpha blending operation.
-
- All images must have the same width and height. Source and destination images must have the same format (8 bit per channel, for example GRAY8, BGR24 or BGRA32).
- Alpha must be an 8-bit gray image.
-
- For every point:
- \verbatim
- dst[x, y, c] = (src[x, y, c]*alpha[x, y] + dst[x, y, c]*(255 - alpha[x, y]))/255;
- \endverbatim
-
- This function is used for image drawing.
-
- \note This function is a C++ wrapper for function ::SimdAlphaBlending.
-
- \param [in] src - a foreground image.
- \param [in] alpha - an image with alpha channel.
- \param [in, out] dst - a background image.
- */
- template<template<class> class A> SIMD_INLINE void AlphaBlending(const View<A>& src, const View<A>& alpha, View<A>& dst)
- {
- assert(Compatible(src, dst) && EqualSize(src, alpha) && alpha.format == View<A>::Gray8 && src.ChannelSize() == 1);
-
- SimdAlphaBlending(src.data, src.stride, src.width, src.height, src.ChannelCount(), alpha.data, alpha.stride, dst.data, dst.stride);
- }
-
- /*! @ingroup drawing
-
- \fn void AlphaFilling(View<A> & dst, const Pixel & pixel, const View<A> & alpha)
-
- \short Performs alpha filling operation.
-
- All images must have the same width and height. Destination images must have 8 bit per channel (for example GRAY8, BGR24 or BGRA32). Alpha must be an 8-bit gray image.
-
- For every point:
- \verbatim
- dst[x, y, c] = (pixel[c]*alpha[x, y] + dst[x, y, c]*(255 - alpha[x, y]))/255;
- \endverbatim
-
- This function is used for image drawing.
-
- \note This function is a C++ wrapper for function ::SimdAlphaFilling.
-
- \param [in, out] dst - a background image.
- \param [in] pixel - a foreground color.
- \param [in] alpha - an image with alpha channel.
- */
- template<template<class> class A, class Pixel> SIMD_INLINE void AlphaFilling(View<A> & dst, const Pixel & pixel, const View<A> & alpha)
- {
- assert(EqualSize(dst, alpha) && alpha.format == View<A>::Gray8 && dst.ChannelSize() == 1 && dst.ChannelCount() == sizeof(Pixel));
-
- SimdAlphaFilling(dst.data, dst.stride, dst.width, dst.height, (uint8_t*)&pixel, sizeof(Pixel), alpha.data, alpha.stride);
- }
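-
- /* Editorial note (not part of the original header): a hedged drawing sketch; AlphaFilling
- stamps a solid color through a mask, AlphaBlending composites a sprite through the same
- mask. The Pixel::Bgra32 channel order (blue, green, red, alpha) is assumed:
-
- Simd::Pixel::Bgra32 red(0, 0, 255, 255);
- Simd::AlphaFilling(canvas, red, mask);
- Simd::AlphaBlending(sprite, mask, canvas);
- */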
-
- /*! @ingroup background
-
- \fn void BackgroundGrowRangeSlow(const View<A>& value, View<A>& lo, View<A>& hi)
-
- \short Performs background update (initial grow, slow mode).
-
- All images must have the same width, height and format (8-bit gray).
-
- For every point:
- \verbatim
- lo[i] -= value[i] < lo[i] ? 1 : 0;
- hi[i] += value[i] > hi[i] ? 1 : 0;
- \endverbatim
-
- This function is used for background updating in motion detection algorithm.
-
- \note This function is a C++ wrapper for function ::SimdBackgroundGrowRangeSlow.
-
- \param [in] value - a current feature value.
- \param [in, out] lo - a feature lower bound of dynamic background.
- \param [in, out] hi - a feature upper bound of dynamic background.
- */
- template<template<class> class A> SIMD_INLINE void BackgroundGrowRangeSlow(const View<A>& value, View<A>& lo, View<A>& hi)
- {
- assert(Compatible(value, lo, hi) && value.format == View<A>::Gray8);
-
- SimdBackgroundGrowRangeSlow(value.data, value.stride, value.width, value.height, lo.data, lo.stride, hi.data, hi.stride);
- }
-
- /*! @ingroup background
-
- \fn void BackgroundGrowRangeFast(const View<A>& value, View<A>& lo, View<A>& hi)
-
- \short Performs background update (initial grow, fast mode).
-
- All images must have the same width, height and format (8-bit gray).
-
- For every point:
- \verbatim
- lo[i] = value[i] < lo[i] ? value[i] : lo[i];
- hi[i] = value[i] > hi[i] ? value[i] : hi[i];
- \endverbatim
-
- This function is used for background updating in motion detection algorithm.
-
- \note This function is a C++ wrapper for function ::SimdBackgroundGrowRangeFast.
-
- \param [in] value - a current feature value.
- \param [in, out] lo - a feature lower bound of dynamic background.
- \param [in, out] hi - a feature upper bound of dynamic background.
- */
- template<template<class> class A> SIMD_INLINE void BackgroundGrowRangeFast(const View<A>& value, View<A>& lo, View<A>& hi)
- {
- assert(Compatible(value, lo, hi) && value.format == View<A>::Gray8);
-
- SimdBackgroundGrowRangeFast(value.data, value.stride, value.width, value.height, lo.data, lo.stride, hi.data, hi.stride);
- }
-
- /*! @ingroup background
-
- \fn void BackgroundIncrementCount(const View<A>& value, const View<A>& loValue, const View<A>& hiValue, View<A>& loCount, View<A>& hiCount)
-
- \short Performs collection of background statistics.
-
- All images must have the same width, height and format (8-bit gray).
-
- Updates background statistic counters for every point:
- \verbatim
- loCount[i] += (value[i] < loValue[i] && loCount[i] < 255) ? 1 : 0;
- hiCount[i] += (value[i] > hiValue[i] && hiCount[i] < 255) ? 1 : 0;
- \endverbatim
-
- This function is used for background updating in motion detection algorithm.
-
- \note This function is a C++ wrapper for function ::SimdBackgroundIncrementCount.
-
- \param [in] value - a current feature value.
- \param [in] loValue - a value of feature lower bound of dynamic background.
- \param [in] hiValue - a value of feature upper bound of dynamic background.
- \param [in, out] loCount - a count of feature lower bound of dynamic background.
- \param [in, out] hiCount - a count of feature upper bound of dynamic background.
- */
- template<template<class> class A> SIMD_INLINE void BackgroundIncrementCount(const View<A>& value, const View<A>& loValue, const View<A>& hiValue, View<A>& loCount, View<A>& hiCount)
- {
- assert(Compatible(value, loValue, hiValue, loCount, hiCount) && value.format == View<A>::Gray8);
-
- SimdBackgroundIncrementCount(value.data, value.stride, value.width, value.height,
- loValue.data, loValue.stride, hiValue.data, hiValue.stride,
- loCount.data, loCount.stride, hiCount.data, hiCount.stride);
- }
-
- /*! @ingroup background
-
- \fn void BackgroundAdjustRange(View<A>& loCount, View<A>& loValue, View<A>& hiCount, View<A>& hiValue, uint8_t threshold)
-
- \short Performs adjustment of background range.
-
- All images must have the same width, height and format (8-bit gray).
-
- Adjusts background range for every point:
- \verbatim
- loValue[i] -= (loCount[i] > threshold && loValue[i] > 0) ? 1 : 0;
- loValue[i] += (loCount[i] < threshold && loValue[i] < 255) ? 1 : 0;
- loCount[i] = 0;
- hiValue[i] += (hiCount[i] > threshold && hiValue[i] < 255) ? 1 : 0;
- hiValue[i] -= (hiCount[i] < threshold && hiValue[i] > 0) ? 1 : 0;
- hiCount[i] = 0;
- \endverbatim
-
- This function is used for background updating in motion detection algorithm.
-
- \note This function is a C++ wrapper for function ::SimdBackgroundAdjustRange.
-
- \param [in, out] loCount - a count of feature lower bound of dynamic background.
- \param [in, out] hiCount - a count of feature upper bound of dynamic background.
- \param [in, out] loValue - a value of feature lower bound of dynamic background.
- \param [in, out] hiValue - a value of feature upper bound of dynamic background.
- \param [in] threshold - a count threshold.
- */
- template<template<class> class A> SIMD_INLINE void BackgroundAdjustRange(View<A>& loCount, View<A>& loValue, View<A>& hiCount, View<A>& hiValue, uint8_t threshold)
- {
- assert(Compatible(loValue, hiValue, loCount, hiCount) && loValue.format == View<A>::Gray8);
-
- SimdBackgroundAdjustRange(loCount.data, loCount.stride, loCount.width, loCount.height,
- loValue.data, loValue.stride, hiCount.data, hiCount.stride, hiValue.data, hiValue.stride, threshold);
- }
-
- /*! @ingroup background
-
- \fn void BackgroundAdjustRange(View<A>& loCount, View<A>& loValue, View<A>& hiCount, View<A>& hiValue, uint8_t threshold, const View<A>& mask)
-
- \short Performs adjustment of background range with using adjust range mask.
-
- All images must have the same width, height and format (8-bit gray).
-
- Adjusts background range for every point:
- \verbatim
- if(mask[i])
- {
- loValue[i] -= (loCount[i] > threshold && loValue[i] > 0) ? 1 : 0;
- loValue[i] += (loCount[i] < threshold && loValue[i] < 255) ? 1 : 0;
- loCount[i] = 0;
- hiValue[i] += (hiCount[i] > threshold && hiValue[i] < 255) ? 1 : 0;
- hiValue[i] -= (hiCount[i] < threshold && hiValue[i] > 0) ? 1 : 0;
- hiCount[i] = 0;
- }
- \endverbatim
-
- This function is used for background updating in motion detection algorithm.
-
- \note This function is a C++ wrapper for function ::SimdBackgroundAdjustRangeMasked.
-
- \param [in, out] loCount - a count of feature lower bound of dynamic background.
- \param [in, out] hiCount - a count of feature upper bound of dynamic background.
- \param [in, out] loValue - a value of feature lower bound of dynamic background.
- \param [in, out] hiValue - a value of feature upper bound of dynamic background.
- \param [in] threshold - a count threshold.
- \param [in] mask - an adjust range mask.
- */
- template<template<class> class A> SIMD_INLINE void BackgroundAdjustRange(View<A>& loCount, View<A>& loValue, View<A>& hiCount, View<A>& hiValue, uint8_t threshold, const View<A>& mask)
- {
- assert(Compatible(loValue, hiValue, loCount, hiCount, mask) && loValue.format == View<A>::Gray8);
-
- SimdBackgroundAdjustRangeMasked(loCount.data, loCount.stride, loCount.width, loCount.height,
- loValue.data, loValue.stride, hiCount.data, hiCount.stride, hiValue.data, hiValue.stride,
- threshold, mask.data, mask.stride);
- }
-
- /*! @ingroup background
-
- \fn void BackgroundShiftRange(const View<A>& value, View<A>& lo, View<A>& hi)
-
- \short Shifts background range.
-
- All images must have the same width, height and format (8-bit gray).
-
- For every point:
- \verbatim
- if (value[i] > hi[i])
- {
- lo[i] = min(lo[i] + value[i] - hi[i], 255);
- hi[i] = value[i];
- }
- if (lo[i] > value[i])
- {
- lo[i] = value[i];
- hi[i] = max(hi[i] - lo[i] + value[i], 0);
- }
- \endverbatim
-
- This function is used for fast background updating in motion detection algorithm.
-
- \note This function is a C++ wrapper for function ::SimdBackgroundShiftRange.
-
- \param [in] value - a current feature value.
- \param [in, out] lo - a feature lower bound of dynamic background.
- \param [in, out] hi - a feature upper bound of dynamic background.
- */
- template<template<class> class A> SIMD_INLINE void BackgroundShiftRange(const View<A>& value, View<A>& lo, View<A>& hi)
- {
- assert(Compatible(value, lo, hi) && value.format == View<A>::Gray8);
-
- SimdBackgroundShiftRange(value.data, value.stride, value.width, value.height, lo.data, lo.stride, hi.data, hi.stride);
- }
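-
- /* Editorial note (not part of the original header): a hedged sketch of the per-frame
- background maintenance cycle these functions implement; updating the bounds only
- every adjustPeriod frames is an assumed policy, not a library requirement:
-
- Simd::BackgroundIncrementCount(value, loValue, hiValue, loCount, hiCount);
- if (frame % adjustPeriod == 0)
- Simd::BackgroundAdjustRange(loCount, loValue, hiCount, hiValue, threshold);
- Simd::BackgroundShiftRange(value, loValue, hiValue);
- */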
- - All images must have the same width, height and format (8-bit gray). - - For every point: - \verbatim - if(mask[i]) - { - if (value[i] > hi[i]) - { - lo[i] = min(lo[i] + value[i] - hi[i], 255); - hi[i] = value[i]; - } - if (lo[i] > value[i]) - { - lo[i] = value[i]; - hi[i] = max(hi[i] - lo[i] + value[i], 0); - } - } - \endverbatim - - This function is used for fast background updating in motion detection algorithm. - - \note This function is a C++ wrapper for function ::SimdBackgroundShiftRangeMasked. - - \param [in] value - a current feature value. - \param [in, out] lo - a feature lower bound of dynamic background. - \param [in, out] hi - a feature upper bound of dynamic background. - \param [in] mask - a shift range mask. - */ - template class A> SIMD_INLINE void BackgroundShiftRange(const View& value, View& lo, View& hi, const View& mask) - { - assert(Compatible(value, lo, hi, mask) && value.format == View::Gray8); - - SimdBackgroundShiftRangeMasked(value.data, value.stride, value.width, value.height, - lo.data, lo.stride, hi.data, hi.stride, mask.data, mask.stride); - } - - /*! @ingroup background - - \fn void BackgroundInitMask(const View& src, uint8_t index, uint8_t value, View& dst); - - \short Creates background update mask. - - All images must have the same width, height and format (8-bit gray). - - For every point: - \verbatim - if(mask[i] == index) - dst[i] = value; - \endverbatim - - This function is used for background updating in motion detection algorithm. - - \note This function is a C++ wrapper for function ::SimdBackgroundInitMask. - - \param [in] src - an input mask image. - \param [in] index - a mask index into input mask. - \param [in] value - a value to fill the output mask. - \param [out] dst - an output mask image. - */ - template class A> SIMD_INLINE void BackgroundInitMask(const View& src, uint8_t index, uint8_t value, View& dst) - { - assert(Compatible(src, dst) && src.format == View::Gray8); - - SimdBackgroundInitMask(src.data, src.stride, src.width, src.height, index, value, dst.data, dst.stride); - } - - /*! @ingroup bayer_conversion - - \fn void BayerToBgr(const View& bayer, View& bgr); - - \short Converts 8-bit Bayer image to 24-bit BGR. - - All images must have the same width and height. The width and the height must be even. - - \note This function is a C++ wrapper for function ::SimdBayerToBgr. - - \param [in] bayer - an input 8-bit Bayer image. - \param [out] bgr - an output 24-bit BGR image. - */ - template class A> SIMD_INLINE void BayerToBgr(const View& bayer, View& bgr) - { - assert(EqualSize(bgr, bayer) && bgr.format == View::Bgr24); - assert(bayer.format >= View::BayerGrbg && bayer.format <= View::BayerBggr); - assert((bayer.width % 2 == 0) && (bayer.height % 2 == 0)); - - SimdBayerToBgr(bayer.data, bayer.width, bayer.height, bayer.stride, (SimdPixelFormatType)bayer.format, bgr.data, bgr.stride); - } - - /*! @ingroup bayer_conversion - - \fn void BayerToBgra(const View& bayer, View& bgra, uint8_t alpha = 0xFF); - - \short Converts 8-bit Bayer image to 32-bit BGRA. - - All images must have the same width and height. The width and the height must be even. - - \note This function is a C++ wrapper for function ::SimdBayerToBgra. - - \param [in] bayer - an input 8-bit Bayer image. - \param [out] bgra - an output 32-bit BGRA image. - \param [in] alpha - a value of alpha channel. It is equal to 256 by default. 
- */
- template <template <class> class A> SIMD_INLINE void BayerToBgra(const View<A>& bayer, View<A>& bgra, uint8_t alpha = 0xFF)
- {
- assert(EqualSize(bgra, bayer) && bgra.format == View<A>::Bgra32);
- assert(bayer.format >= View<A>::BayerGrbg && bayer.format <= View<A>::BayerBggr);
- assert((bayer.width % 2 == 0) && (bayer.height % 2 == 0));
-
- SimdBayerToBgra(bayer.data, bayer.width, bayer.height, bayer.stride, (SimdPixelFormatType)bayer.format, bgra.data, bgra.stride, alpha);
- }
-
- /*! @ingroup bgra_conversion
-
- \fn void BgraToBayer(const View<A>& bgra, View<A>& bayer)
-
- \short Converts 32-bit BGRA image to 8-bit Bayer image.
-
- All images must have the same width and height. The width and the height must be even.
-
- \note This function is a C++ wrapper for function ::SimdBgraToBayer.
-
- \param [in] bgra - an input 32-bit BGRA image.
- \param [out] bayer - an output 8-bit Bayer image.
- */
- template <template <class> class A> SIMD_INLINE void BgraToBayer(const View<A>& bgra, View<A>& bayer)
- {
- assert(EqualSize(bgra, bayer) && bgra.format == View<A>::Bgra32);
- assert(bayer.format >= View<A>::BayerGrbg && bayer.format <= View<A>::BayerBggr);
- assert((bayer.width % 2 == 0) && (bayer.height % 2 == 0));
-
- SimdBgraToBayer(bgra.data, bgra.width, bgra.height, bgra.stride, bayer.data, bayer.stride, (SimdPixelFormatType)bayer.format);
- }
-
- /*! @ingroup bgra_conversion
-
- \fn void BgraToBgr(const View<A>& bgra, View<A>& bgr)
-
- \short Converts 32-bit BGRA image to 24-bit BGR image.
-
- All images must have the same width and height.
-
- \note This function is a C++ wrapper for function ::SimdBgraToBgr.
-
- \param [in] bgra - an input 32-bit BGRA image.
- \param [out] bgr - an output 24-bit BGR image.
- */
- template <template <class> class A> SIMD_INLINE void BgraToBgr(const View<A>& bgra, View<A>& bgr)
- {
- assert(EqualSize(bgra, bgr) && bgra.format == View<A>::Bgra32 && bgr.format == View<A>::Bgr24);
-
- SimdBgraToBgr(bgra.data, bgra.width, bgra.height, bgra.stride, bgr.data, bgr.stride);
- }
-
- /*! @ingroup bgra_conversion
-
- \fn void BgraToGray(const View<A>& bgra, View<A>& gray)
-
- \short Converts 32-bit BGRA image to 8-bit gray image.
-
- All images must have the same width and height.
-
- \note This function is a C++ wrapper for function ::SimdBgraToGray.
-
- \param [in] bgra - an input 32-bit BGRA image.
- \param [out] gray - an output 8-bit gray image.
- */
- template <template <class> class A> SIMD_INLINE void BgraToGray(const View<A>& bgra, View<A>& gray)
- {
- assert(EqualSize(bgra, gray) && bgra.format == View<A>::Bgra32 && gray.format == View<A>::Gray8);
-
- SimdBgraToGray(bgra.data, bgra.width, bgra.height, bgra.stride, gray.data, gray.stride);
- }
-
- /*! @ingroup bgra_conversion
-
- \fn void BgraToRgb(const View<A>& bgra, View<A>& rgb)
-
- \short Converts 32-bit BGRA image to 24-bit RGB image.
-
- All images must have the same width and height.
-
- \note This function is a C++ wrapper for function ::SimdBgraToRgb.
-
- \param [in] bgra - an input 32-bit BGRA image.
- \param [out] rgb - an output 24-bit RGB image.
- */
- template <template <class> class A> SIMD_INLINE void BgraToRgb(const View<A>& bgra, View<A>& rgb)
- {
- assert(EqualSize(bgra, rgb) && bgra.format == View<A>::Bgra32 && rgb.format == View<A>::Rgb24);
-
- SimdBgraToRgb(bgra.data, bgra.width, bgra.height, bgra.stride, rgb.data, rgb.stride);
- }
-
- /*! @ingroup bgra_conversion
-
- \fn void BgraToYuv420p(const View<A>& bgra, View<A>& y, View<A>& u, View<A>& v)
-
- \short Converts 32-bit BGRA image to YUV420P.
-
- The input BGRA and output Y images must have the same width and height.
- The input U and V images must have the same width and height (half the size of the Y component).
-
- \note This function is a C++ wrapper for function ::SimdBgraToYuv420p.
-
- \param [in] bgra - an input 32-bit BGRA image.
- \param [out] y - an output 8-bit image with Y color plane.
- \param [out] u - an output 8-bit image with U color plane.
- \param [out] v - an output 8-bit image with V color plane.
- */
- template <template <class> class A> SIMD_INLINE void BgraToYuv420p(const View<A>& bgra, View<A>& y, View<A>& u, View<A>& v)
- {
- assert(y.width == 2 * u.width && y.height == 2 * u.height && y.format == u.format);
- assert(y.width == 2 * v.width && y.height == 2 * v.height && y.format == v.format);
- assert(y.width == bgra.width && y.height == bgra.height);
- assert(y.format == View<A>::Gray8 && bgra.format == View<A>::Bgra32);
-
- SimdBgraToYuv420p(bgra.data, bgra.width, bgra.height, bgra.stride, y.data, y.stride, u.data, u.stride, v.data, v.stride);
- }
-
- /*! @ingroup bgra_conversion
-
- \fn void BgraToYuv422p(const View<A>& bgra, View<A>& y, View<A>& u, View<A>& v)
-
- \short Converts 32-bit BGRA image to YUV422P.
-
- The input BGRA and output Y images must have the same width and height.
- The input U and V images must have the same width and height (their width is equal to half the width of the Y component).
-
- \note This function is a C++ wrapper for function ::SimdBgraToYuv422p.
-
- \param [in] bgra - an input 32-bit BGRA image.
- \param [out] y - an output 8-bit image with Y color plane.
- \param [out] u - an output 8-bit image with U color plane.
- \param [out] v - an output 8-bit image with V color plane.
- */
- template <template <class> class A> SIMD_INLINE void BgraToYuv422p(const View<A>& bgra, View<A>& y, View<A>& u, View<A>& v)
- {
- assert(y.width == 2 * u.width && y.height == u.height && y.format == u.format);
- assert(y.width == 2 * v.width && y.height == v.height && y.format == v.format);
- assert(y.width == bgra.width && y.height == bgra.height);
- assert(y.format == View<A>::Gray8 && bgra.format == View<A>::Bgra32);
-
- SimdBgraToYuv422p(bgra.data, bgra.width, bgra.height, bgra.stride, y.data, y.stride, u.data, u.stride, v.data, v.stride);
- }
-
- /*! @ingroup bgra_conversion
-
- \fn void BgraToYuv444p(const View<A>& bgra, View<A>& y, View<A>& u, View<A>& v)
-
- \short Converts 32-bit BGRA image to YUV444P.
-
- The input BGRA and output Y, U and V images must have the same width and height.
-
- \note This function is a C++ wrapper for function ::SimdBgraToYuv444p.
-
- \param [in] bgra - an input 32-bit BGRA image.
- \param [out] y - an output 8-bit image with Y color plane.
- \param [out] u - an output 8-bit image with U color plane.
- \param [out] v - an output 8-bit image with V color plane.
- */
- template <template <class> class A> SIMD_INLINE void BgraToYuv444p(const View<A>& bgra, View<A>& y, View<A>& u, View<A>& v)
- {
- assert(EqualSize(bgra, y) && Compatible(y, u, v));
- assert(y.format == View<A>::Gray8 && bgra.format == View<A>::Bgra32);
-
- SimdBgraToYuv444p(bgra.data, bgra.width, bgra.height, bgra.stride, y.data, y.stride, u.data, u.stride, v.data, v.stride);
- }
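The half-size constraint on the U and V planes is easy to get wrong, so here is a minimal allocation sketch for the YUV420P conversion above, assuming Simd/SimdLib.hpp and its Simd::View<Simd::Allocator> image class; the 640x480 frame size and helper name are illustrative.

#include "Simd/SimdLib.hpp"

typedef Simd::View<Simd::Allocator> View;

void ConvertToYuv420p(const View& bgra) // bgra is 640x480, View::Bgra32
{
    // Y has the full frame size; U and V are half size in both dimensions (4:2:0).
    View y(640, 480, View::Gray8), u(320, 240, View::Gray8), v(320, 240, View::Gray8);
    Simd::BgraToYuv420p(bgra, y, u, v);
}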
- /*! @ingroup bgra_conversion
-
- \fn void BgraToYuva420p(const View<A> & bgra, View<A> & y, View<A> & u, View<A> & v, View<A> & a)
-
- \short Converts 32-bit BGRA image to YUVA420P.
-
- The input BGRA and output Y and A images must have the same width and height.
- The input U and V images must have the same width and height (half the size of the Y component).
-
- \note This function is a C++ wrapper for function ::SimdBgraToYuva420p.
-
- \param [in] bgra - an input 32-bit BGRA image.
- \param [out] y - an output 8-bit image with Y color plane.
- \param [out] u - an output 8-bit image with U color plane.
- \param [out] v - an output 8-bit image with V color plane.
- \param [out] a - an output 8-bit image with alpha plane.
- */
- template <template <class> class A> SIMD_INLINE void BgraToYuva420p(const View<A> & bgra, View<A> & y, View<A> & u, View<A> & v, View<A> & a)
- {
- assert(y.width == 2 * u.width && y.height == 2 * u.height && y.format == u.format);
- assert(Compatible(y, a) && Compatible(u, v) && EqualSize(y, bgra));
- assert(y.format == View<A>::Gray8 && bgra.format == View<A>::Bgra32);
-
- SimdBgraToYuva420p(bgra.data, bgra.stride, bgra.width, bgra.height, y.data, y.stride, u.data, u.stride, v.data, v.stride, a.data, a.stride);
- }
-
- /*! @ingroup bgr_conversion
-
- \fn void BgrToBayer(const View<A>& bgr, View<A>& bayer)
-
- \short Converts 24-bit BGR image to 8-bit Bayer image.
-
- All images must have the same width and height. The width and the height must be even.
-
- \note This function is a C++ wrapper for function ::SimdBgrToBayer.
-
- \param [in] bgr - an input 24-bit BGR image.
- \param [out] bayer - an output 8-bit Bayer image.
- */
- template <template <class> class A> SIMD_INLINE void BgrToBayer(const View<A>& bgr, View<A>& bayer)
- {
- assert(EqualSize(bgr, bayer) && bgr.format == View<A>::Bgr24);
- assert(bayer.format >= View<A>::BayerGrbg && bayer.format <= View<A>::BayerBggr);
- assert((bayer.width % 2 == 0) && (bayer.height % 2 == 0));
-
- SimdBgrToBayer(bgr.data, bgr.width, bgr.height, bgr.stride, bayer.data, bayer.stride, (SimdPixelFormatType)bayer.format);
- }
-
- /*! @ingroup bgr_conversion
-
- \fn void BgrToBgra(const View<A>& bgr, View<A>& bgra, uint8_t alpha = 0xFF)
-
- \short Converts 24-bit BGR image to 32-bit BGRA image.
-
- All images must have the same width and height.
-
- \note This function is a C++ wrapper for function ::SimdBgrToBgra.
-
- \param [in] bgr - an input 24-bit BGR image.
- \param [out] bgra - an output 32-bit BGRA image.
- \param [in] alpha - a value of alpha channel. It is equal to 255 by default.
- */
- template <template <class> class A> SIMD_INLINE void BgrToBgra(const View<A>& bgr, View<A>& bgra, uint8_t alpha = 0xFF)
- {
- assert(EqualSize(bgr, bgra) && bgra.format == View<A>::Bgra32 && bgr.format == View<A>::Bgr24);
-
- SimdBgrToBgra(bgr.data, bgr.width, bgr.height, bgr.stride, bgra.data, bgra.stride, alpha);
- }
-
- /*! @ingroup other_conversion
-
- \fn void Bgr48pToBgra32(const View<A>& blue, const View<A>& green, const View<A>& red, View<A>& bgra, uint8_t alpha = 0xFF)
-
- \short Converts 48-bit planar BGR image to 32-bit BGRA image.
-
- All images must have the same width and height.
-
- \note This function is a C++ wrapper for function ::SimdBgr48pToBgra32.
-
- \param [in] blue - an input 16-bit image with blue color plane.
- \param [in] green - an input 16-bit image with green color plane.
- \param [in] red - an input 16-bit image with red color plane.
- \param [out] bgra - an output 32-bit BGRA image.
- \param [in] alpha - a value of alpha channel. It is equal to 255 by default.
- */
- template <template <class> class A> SIMD_INLINE void Bgr48pToBgra32(const View<A>& blue, const View<A>& green, const View<A>& red, View<A>& bgra, uint8_t alpha = 0xFF)
- {
- assert(Compatible(blue, green, red) && EqualSize(blue, bgra) && blue.format == View<A>::Int16 && bgra.format == View<A>::Bgra32);
-
- SimdBgr48pToBgra32(blue.data, blue.stride, blue.width, blue.height, green.data, green.stride, red.data, red.stride, bgra.data, bgra.stride, alpha);
- }
-
- /*! @ingroup bgr_conversion
-
- \fn void BgrToGray(const View<A>& bgr, View<A>& gray)
-
- \short Converts 24-bit BGR image to 8-bit gray image.
-
- All images must have the same width and height.
-
- \note This function is a C++ wrapper for function ::SimdBgrToGray.
-
- \param [in] bgr - an input 24-bit BGR image.
- \param [out] gray - an output 8-bit gray image.
- */
- template <template <class> class A> SIMD_INLINE void BgrToGray(const View<A>& bgr, View<A>& gray)
- {
- assert(EqualSize(bgr, gray) && bgr.format == View<A>::Bgr24 && gray.format == View<A>::Gray8);
-
- SimdBgrToGray(bgr.data, bgr.width, bgr.height, bgr.stride, gray.data, gray.stride);
- }
-
- /*! @ingroup bgr_conversion
-
- \fn void BgrToHsl(const View<A> & bgr, View<A> & hsl)
-
- \short Converts 24-bit BGR image to 24-bit HSL (Hue, Saturation, Lightness) image.
-
- All images must have the same width and height.
-
- \note This function is a C++ wrapper for function ::SimdBgrToHsl.
-
- \param [in] bgr - an input 24-bit BGR image.
- \param [out] hsl - an output 24-bit HSL image.
- */
- template <template <class> class A> SIMD_INLINE void BgrToHsl(const View<A> & bgr, View<A> & hsl)
- {
- assert(EqualSize(bgr, hsl) && bgr.format == View<A>::Bgr24 && hsl.format == View<A>::Hsl24);
-
- SimdBgrToHsl(bgr.data, bgr.width, bgr.height, bgr.stride, hsl.data, hsl.stride);
- }
-
- /*! @ingroup bgr_conversion
-
- \fn void BgrToHsv(const View<A> & bgr, View<A> & hsv)
-
- \short Converts 24-bit BGR image to 24-bit HSV (Hue, Saturation, Value) image.
-
- All images must have the same width and height.
-
- \note This function is a C++ wrapper for function ::SimdBgrToHsv.
-
- \param [in] bgr - an input 24-bit BGR image.
- \param [out] hsv - an output 24-bit HSV image.
- */
- template <template <class> class A> SIMD_INLINE void BgrToHsv(const View<A> & bgr, View<A> & hsv)
- {
- assert(EqualSize(bgr, hsv) && bgr.format == View<A>::Bgr24 && hsv.format == View<A>::Hsv24);
-
- SimdBgrToHsv(bgr.data, bgr.width, bgr.height, bgr.stride, hsv.data, hsv.stride);
- }
-
- /*! @ingroup bgr_conversion
-
- \fn void BgrToRgb(const View<A> & bgr, View<A> & rgb)
-
- \short Converts 24-bit BGR image to 24-bit RGB image (it also performs the backward conversion).
-
- All images must have the same width and height.
-
- \note This function is a C++ wrapper for function ::SimdBgrToRgb.
-
- \param [in] bgr - an input 24-bit BGR image.
- \param [out] rgb - an output 24-bit RGB image.
- */
- template <template <class> class A> SIMD_INLINE void BgrToRgb(const View<A> & bgr, View<A> & rgb)
- {
- assert(EqualSize(bgr, rgb) && bgr.PixelSize() == 3 && rgb.PixelSize() == 3);
-
- SimdBgrToRgb(bgr.data, bgr.stride, bgr.width, bgr.height, rgb.data, rgb.stride);
- }
-
- /*! @ingroup bgr_conversion
-
- \fn void BgrToYuv420p(const View<A>& bgr, View<A>& y, View<A>& u, View<A>& v)
-
- \short Converts 24-bit BGR image to YUV420P.
-
- The input BGR and output Y images must have the same width and height.
- The input U and V images must have the same width and height (half the size of the Y component).
-
- \note This function is a C++ wrapper for function ::SimdBgrToYuv420p.
-
- \param [in] bgr - an input 24-bit BGR image.
- \param [out] y - an output 8-bit image with Y color plane.
- \param [out] u - an output 8-bit image with U color plane.
- \param [out] v - an output 8-bit image with V color plane.
- */
- template <template <class> class A> SIMD_INLINE void BgrToYuv420p(const View<A>& bgr, View<A>& y, View<A>& u, View<A>& v)
- {
- assert(y.width == 2 * u.width && y.height == 2 * u.height && y.format == u.format);
- assert(y.width == 2 * v.width && y.height == 2 * v.height && y.format == v.format);
- assert(y.width == bgr.width && y.height == bgr.height);
- assert(y.format == View<A>::Gray8 && bgr.format == View<A>::Bgr24);
-
- SimdBgrToYuv420p(bgr.data, bgr.width, bgr.height, bgr.stride, y.data, y.stride, u.data, u.stride, v.data, v.stride);
- }
-
- /*! @ingroup bgr_conversion
-
- \fn void BgrToYuv422p(const View<A>& bgr, View<A>& y, View<A>& u, View<A>& v)
-
- \short Converts 24-bit BGR image to YUV422P.
-
- The input BGR and output Y images must have the same width and height.
- The input U and V images must have the same width and height (their width is equal to half the width of the Y component).
-
- \note This function is a C++ wrapper for function ::SimdBgrToYuv422p.
-
- \param [in] bgr - an input 24-bit BGR image.
- \param [out] y - an output 8-bit image with Y color plane.
- \param [out] u - an output 8-bit image with U color plane.
- \param [out] v - an output 8-bit image with V color plane.
- */
- template <template <class> class A> SIMD_INLINE void BgrToYuv422p(const View<A>& bgr, View<A>& y, View<A>& u, View<A>& v)
- {
- assert(y.width == 2 * u.width && y.height == u.height && y.format == u.format);
- assert(y.width == 2 * v.width && y.height == v.height && y.format == v.format);
- assert(y.width == bgr.width && y.height == bgr.height);
- assert(y.format == View<A>::Gray8 && bgr.format == View<A>::Bgr24);
-
- SimdBgrToYuv422p(bgr.data, bgr.width, bgr.height, bgr.stride, y.data, y.stride, u.data, u.stride, v.data, v.stride);
- }
-
- /*! @ingroup bgr_conversion
-
- \fn void BgrToYuv444p(const View<A>& bgr, View<A>& y, View<A>& u, View<A>& v)
-
- \short Converts 24-bit BGR image to YUV444P.
-
- The input BGR and output Y, U and V images must have the same width and height.
-
- \note This function is a C++ wrapper for function ::SimdBgrToYuv444p.
-
- \param [in] bgr - an input 24-bit BGR image.
- \param [out] y - an output 8-bit image with Y color plane.
- \param [out] u - an output 8-bit image with U color plane.
- \param [out] v - an output 8-bit image with V color plane.
- */
- template <template <class> class A> SIMD_INLINE void BgrToYuv444p(const View<A>& bgr, View<A>& y, View<A>& u, View<A>& v)
- {
- assert(EqualSize(bgr, y) && Compatible(y, u, v));
- assert(y.format == View<A>::Gray8 && bgr.format == View<A>::Bgr24);
-
- SimdBgrToYuv444p(bgr.data, bgr.width, bgr.height, bgr.stride, y.data, y.stride, u.data, u.stride, v.data, v.stride);
- }
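A typical pipeline converts the same BGR frame once for analysis and once for encoding. A minimal sketch under the same Simd/SimdLib.hpp assumption (frame size and helper name are illustrative):

#include "Simd/SimdLib.hpp"

typedef Simd::View<Simd::Allocator> View;

void SplitPaths(const View& bgr) // bgr is 1280x720, View::Bgr24
{
    View gray(1280, 720, View::Gray8);
    View y(1280, 720, View::Gray8), u(640, 360, View::Gray8), v(640, 360, View::Gray8);
    Simd::BgrToGray(bgr, gray);       // analysis path
    Simd::BgrToYuv420p(bgr, y, u, v); // encoding path (4:2:0 chroma subsampling)
}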
- /*! @ingroup binarization
-
- \fn void Binarization(const View<A>& src, uint8_t value, uint8_t positive, uint8_t negative, View<A>& dst, SimdCompareType compareType)
-
- \short Performs binarization of 8-bit gray image.
-
- All images must have 8-bit gray format and must have the same width and height.
-
- For every point:
- \verbatim
- dst[i] = compare(src[i], value) ? positive : negative;
- \endverbatim
- where compare(a, b) depends on compareType (see ::SimdCompareType).
-
- \note This function is a C++ wrapper for function ::SimdBinarization.
-
- \param [in] src - an input 8-bit gray image (first value for compare operation).
- \param [in] value - a second value for compare operation.
- \param [in] positive - a destination value if comparison operation has a positive result.
- \param [in] negative - a destination value if comparison operation has a negative result.
- \param [out] dst - an output 8-bit gray binarized image.
- \param [in] compareType - a compare operation type (see ::SimdCompareType).
- */
- template <template <class> class A> SIMD_INLINE void Binarization(const View<A>& src, uint8_t value, uint8_t positive, uint8_t negative, View<A>& dst, SimdCompareType compareType)
- {
- assert(Compatible(src, dst) && src.format == View<A>::Gray8);
-
- SimdBinarization(src.data, src.stride, src.width, src.height, value, positive, negative, dst.data, dst.stride, compareType);
- }
-
- /*! @ingroup binarization
-
- \fn void AveragingBinarization(const View<A>& src, uint8_t value, size_t neighborhood, uint8_t threshold, uint8_t positive, uint8_t negative, View<A>& dst, SimdCompareType compareType)
-
- \short Performs averaging binarization of 8-bit gray image.
-
- All images must have 8-bit gray format and must have the same width and height.
-
- For every point:
- \verbatim
- sum = 0; area = 0;
- for(dy = -neighborhood; dy <= neighborhood; ++dy)
- {
- for(dx = -neighborhood; dx <= neighborhood; ++dx)
- {
- if(x + dx >= 0 && x + dx < width && y + dy >= 0 && y + dy < height)
- {
- area++;
- if(compare(src[x + dx, y + dy], value))
- sum++;
- }
- }
- }
- dst[x, y] = sum*255 > area*threshold ? positive : negative;
- \endverbatim
- where compare(a, b) depends on compareType (see ::SimdCompareType).
-
- \note This function is a C++ wrapper for function ::SimdAveragingBinarization.
-
- \param [in] src - an input 8-bit gray image (first value for compare operation).
- \param [in] value - a second value for compare operation.
- \param [in] neighborhood - an averaging neighborhood.
- \param [in] threshold - a threshold value for binarization. It can range from 0 to 255.
- \param [in] positive - a destination value if the number of positive comparisons in the neighborhood of this point is greater than the threshold.
- \param [in] negative - a destination value if the number of positive comparisons in the neighborhood of this point is less than or equal to the threshold.
- \param [out] dst - an output 8-bit gray binarized image.
- \param [in] compareType - a compare operation type (see ::SimdCompareType).
- */
- template <template <class> class A> SIMD_INLINE void AveragingBinarization(const View<A>& src, uint8_t value, size_t neighborhood, uint8_t threshold, uint8_t positive, uint8_t negative, View<A>& dst, SimdCompareType compareType)
- {
- assert(Compatible(src, dst) && src.format == View<A>::Gray8);
-
- SimdAveragingBinarization(src.data, src.stride, src.width, src.height, value,
- neighborhood, threshold, positive, negative, dst.data, dst.stride, compareType);
- }
-
- /*! @ingroup conditional
-
- \fn void ConditionalCount8u(const View<A> & src, uint8_t value, SimdCompareType compareType, uint32_t & count)
-
- \short Calculates number of points satisfying certain condition for 8-bit gray image.
-
- For every point:
- \verbatim
- if(compare(src[i], value))
- count++;
- \endverbatim
- where compare(a, b) depends on compareType (see ::SimdCompareType).
-
- \note This function is a C++ wrapper for function ::SimdConditionalCount8u.
-
- \param [in] src - an input 8-bit gray image (first value for compare operation).
- \param [in] value - a second value for compare operation.
- \param [in] compareType - a compare operation type (see ::SimdCompareType).
- \param [out] count - the result unsigned 32-bit value.
- */
- template <template <class> class A> SIMD_INLINE void ConditionalCount8u(const View<A> & src, uint8_t value, SimdCompareType compareType, uint32_t & count)
- {
- assert(src.format == View<A>::Gray8);
-
- SimdConditionalCount8u(src.data, src.stride, src.width, src.height, value, compareType, &count);
- }
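The plain and averaging binarizations differ only in the neighborhood vote. A minimal sketch, assuming Simd/SimdLib.hpp (all constants and the helper name are illustrative):

#include "Simd/SimdLib.hpp"

typedef Simd::View<Simd::Allocator> View;

void Threshold(const View& gray, View& binary) // both 8-bit gray, same size
{
    // Plain thresholding: pixels greater than 128 become 255, the rest 0.
    Simd::Binarization(gray, 128, 255, 0, binary, SimdCompareGreater);
    // Noise-tolerant variant: a pixel is set only if more than half of its
    // 5x5 neighborhood (neighborhood = 2) passes the same comparison.
    Simd::AveragingBinarization(gray, 128, 2, 128, 255, 0, binary, SimdCompareGreater);
}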
- /*! @ingroup conditional
-
- \fn void ConditionalCount16i(const View<A> & src, int16_t value, SimdCompareType compareType, uint32_t & count)
-
- \short Calculates number of points satisfying certain condition for 16-bit signed integer image.
-
- For every point:
- \verbatim
- if(compare(src[i], value))
- count++;
- \endverbatim
- where compare(a, b) depends on compareType (see ::SimdCompareType).
-
- \note This function is a C++ wrapper for function ::SimdConditionalCount16i.
-
- \param [in] src - an input 16-bit signed integer image (first value for compare operation).
- \param [in] value - a second value for compare operation.
- \param [in] compareType - a compare operation type (see ::SimdCompareType).
- \param [out] count - the result unsigned 32-bit value.
- */
- template <template <class> class A> SIMD_INLINE void ConditionalCount16i(const View<A> & src, int16_t value, SimdCompareType compareType, uint32_t & count)
- {
- assert(src.format == View<A>::Int16);
-
- SimdConditionalCount16i(src.data, src.stride, src.width, src.height, value, compareType, &count);
- }
-
- /*! @ingroup conditional
-
- \fn void ConditionalSum(const View<A> & src, const View<A> & mask, uint8_t value, SimdCompareType compareType, uint64_t & sum)
-
- \short Calculates sum of image points whose mask points satisfy a certain condition.
-
- All images must have 8-bit gray format and must have the same width and height.
-
- For every point:
- \verbatim
- if(compare(mask[i], value))
- sum += src[i];
- \endverbatim
- where compare(a, b) depends on compareType (see ::SimdCompareType).
-
- \note This function is a C++ wrapper for function ::SimdConditionalSum.
-
- \param [in] src - an input 8-bit gray image.
- \param [in] mask - an 8-bit gray mask (first value for compare operation).
- \param [in] value - a second value for compare operation.
- \param [in] compareType - a compare operation type (see ::SimdCompareType).
- \param [out] sum - the result unsigned 64-bit value.
- */
- template <template <class> class A> SIMD_INLINE void ConditionalSum(const View<A> & src, const View<A> & mask, uint8_t value, SimdCompareType compareType, uint64_t & sum)
- {
- assert(Compatible(src, mask) && src.format == View<A>::Gray8);
-
- SimdConditionalSum(src.data, src.stride, src.width, src.height, mask.data, mask.stride, value, compareType, &sum);
- }
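ConditionalCount8u and ConditionalSum combine naturally into masked statistics such as a mean. A minimal sketch, assuming Simd/SimdLib.hpp; the helper name is hypothetical:

#include "Simd/SimdLib.hpp"

typedef Simd::View<Simd::Allocator> View;

// Mean brightness of src over the pixels where mask equals 255.
double MaskedMean(const View& src, const View& mask)
{
    uint32_t count = 0;
    uint64_t sum = 0;
    Simd::ConditionalCount8u(mask, 255, SimdCompareEqual, count);
    Simd::ConditionalSum(src, mask, 255, SimdCompareEqual, sum);
    return count ? double(sum) / double(count) : 0.0;
}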
- /*! @ingroup conditional
-
- \fn void ConditionalSquareSum(const View<A>& src, const View<A>& mask, uint8_t value, SimdCompareType compareType, uint64_t & sum)
-
- \short Calculates sum of squared image points whose mask points satisfy a certain condition.
-
- All images must have 8-bit gray format and must have the same width and height.
-
- For every point:
- \verbatim
- if(compare(mask[i], value))
- sum += src[i]*src[i];
- \endverbatim
- where compare(a, b) depends on compareType (see ::SimdCompareType).
-
- \note This function is a C++ wrapper for function ::SimdConditionalSquareSum.
-
- \param [in] src - an input 8-bit gray image.
- \param [in] mask - an 8-bit gray mask (first value for compare operation).
- \param [in] value - a second value for compare operation.
- \param [in] compareType - a compare operation type (see ::SimdCompareType).
- \param [out] sum - the result unsigned 64-bit value.
- */
- template <template <class> class A> SIMD_INLINE void ConditionalSquareSum(const View<A>& src, const View<A>& mask, uint8_t value, SimdCompareType compareType, uint64_t & sum)
- {
- assert(Compatible(src, mask) && src.format == View<A>::Gray8);
-
- SimdConditionalSquareSum(src.data, src.stride, src.width, src.height, mask.data, mask.stride, value, compareType, &sum);
- }
-
- /*! @ingroup conditional
-
- \fn void ConditionalSquareGradientSum(const View<A>& src, const View<A>& mask, uint8_t value, SimdCompareType compareType, uint64_t & sum)
-
- \short Calculates sum of squared gradient of image points whose mask points satisfy a certain condition.
-
- All images must have 8-bit gray format and must have the same width and height. The image height and width must be greater than or equal to 3.
-
- For every point except border:
- \verbatim
- if(compare(mask[x, y], value))
- {
- dx = src[x + 1, y] - src[x - 1, y];
- dy = src[x, y + 1] - src[x, y - 1];
- sum += dx*dx + dy*dy;
- }
- \endverbatim
- where compare(a, b) depends on compareType (see ::SimdCompareType).
-
- \note This function is a C++ wrapper for function ::SimdConditionalSquareGradientSum.
-
- \param [in] src - an input 8-bit gray image.
- \param [in] mask - an 8-bit gray mask (first value for compare operation).
- \param [in] value - a second value for compare operation.
- \param [in] compareType - a compare operation type (see ::SimdCompareType).
- \param [out] sum - the result unsigned 64-bit value.
- */
- template <template <class> class A> SIMD_INLINE void ConditionalSquareGradientSum(const View<A>& src, const View<A>& mask, uint8_t value, SimdCompareType compareType, uint64_t & sum)
- {
- assert(Compatible(src, mask) && src.format == View<A>::Gray8 && src.width >= 3 && src.height >= 3);
-
- SimdConditionalSquareGradientSum(src.data, src.stride, src.width, src.height, mask.data, mask.stride, value, compareType, &sum);
- }
-
- /*! @ingroup conditional
-
- \fn void ConditionalFill(const View<A> & src, uint8_t threshold, SimdCompareType compareType, uint8_t value, View<A> & dst);
-
- \short Fills pixels of an 8-bit gray image with a given value if the corresponding pixels of the input 8-bit gray image satisfy a certain condition.
-
- All images must have the same width and height.
-
- For every point:
- \verbatim
- if(compare(src[i], threshold))
- dst[i] = value;
- \endverbatim
- where compare(a, b) depends on compareType (see ::SimdCompareType).
-
- \note This function is a C++ wrapper for function ::SimdConditionalFill.
-
- \param [in] src - an input 8-bit gray image.
- \param [in] threshold - a second value for compare operation.
- \param [in] compareType - a compare operation type (see ::SimdCompareType).
- \param [in] value - a value for fill operation.
- \param [in, out] dst - an output 8-bit gray image.
- */
- template <template <class> class A> SIMD_INLINE void ConditionalFill(const View<A> & src, uint8_t threshold, SimdCompareType compareType, uint8_t value, View<A> & dst)
- {
- assert(Compatible(src, dst) && src.format == View<A>::Gray8);
-
- SimdConditionalFill(src.data, src.stride, src.width, src.height, threshold, compareType, value, dst.data, dst.stride);
- }
-
- /*! @ingroup copying
-
- \fn void Copy(const View<A> & src, View<B> & dst)
-
- \short Copies pixel data of an image from source to destination.
-
- All images must have the same width, height and format.
-
- \note This function is a C++ wrapper for function ::SimdCopy.
-
- \param [in] src - a source image.
- \param [out] dst - a destination image.
- */
- template <template <class> class A, template <class> class B> SIMD_INLINE void Copy(const View<A> & src, View<B> & dst)
- {
- assert(Compatible(src, dst));
-
- if (src.format)
- {
- SimdCopy(src.data, src.stride, src.width, src.height, src.PixelSize(), dst.data, dst.stride);
- }
- }
-
- /*! @ingroup copying
-
- \fn void CopyFrame(const View<A>& src, const Rectangle<ptrdiff_t> & frame, View<A>& dst)
-
- \short Copies pixel data of an image from source to destination, except for the portion bounded by the frame.
-
- All images must have the same width, height and format.
-
- \note This function is a C++ wrapper for function ::SimdCopyFrame.
-
- \param [in] src - a source image.
- \param [in] frame - a frame rectangle.
- \param [out] dst - a destination image.
- */
- template <template <class> class A> SIMD_INLINE void CopyFrame(const View<A>& src, const Rectangle<ptrdiff_t> & frame, View<A>& dst)
- {
- assert(Compatible(src, dst) && frame.Width() >= 0 && frame.Height() >= 0);
- assert(frame.left >= 0 && frame.top >= 0 && frame.right <= ptrdiff_t(src.width) && frame.bottom <= ptrdiff_t(src.height));
-
- SimdCopyFrame(src.data, src.stride, src.width, src.height, src.PixelSize(),
- frame.left, frame.top, frame.right, frame.bottom, dst.data, dst.stride);
- }
-
- /*! @ingroup other_conversion
-
- \fn void DeinterleaveUv(const View<A>& uv, View<A>& u, View<A>& v)
-
- \short Deinterleaves 16-bit UV interleaved image into separated 8-bit U and V planar images.
-
- All images must have the same width and height.
- This function is used for NV12 to YUV420P conversion.
-
- \note This function is a C++ wrapper for function ::SimdDeinterleaveUv.
-
- \param [in] uv - an input 16-bit UV interleaved image.
- \param [out] u - an output 8-bit U planar image.
- \param [out] v - an output 8-bit V planar image.
- */
- template <template <class> class A> SIMD_INLINE void DeinterleaveUv(const View<A>& uv, View<A>& u, View<A>& v)
- {
- assert(EqualSize(uv, u, v) && uv.format == View<A>::Uv16 && u.format == View<A>::Gray8 && v.format == View<A>::Gray8);
-
- SimdDeinterleaveUv(uv.data, uv.stride, uv.width, uv.height, u.data, u.stride, v.data, v.stride);
- }
-
- /*! @ingroup other_conversion
-
- \fn void DeinterleaveBgr(const View<A>& bgr, View<A>& b, View<A>& g, View<A>& r)
-
- \short Deinterleaves 24-bit BGR interleaved image into separated 8-bit Blue, Green and Red planar images.
-
- All images must have the same width and height.
-
- \note This function is a C++ wrapper for function ::SimdDeinterleaveBgr.
-
- \param [in] bgr - an input 24-bit BGR interleaved image.
- \param [out] b - an output 8-bit Blue planar image.
- \param [out] g - an output 8-bit Green planar image.
- \param [out] r - an output 8-bit Red planar image.
- */
- template <template <class> class A> SIMD_INLINE void DeinterleaveBgr(const View<A>& bgr, View<A>& b, View<A>& g, View<A>& r)
- {
- assert(EqualSize(bgr, b) && Compatible(b, g, r) && bgr.format == View<A>::Bgr24 && b.format == View<A>::Gray8);
-
- SimdDeinterleaveBgr(bgr.data, bgr.stride, bgr.width, bgr.height, b.data, b.stride, g.data, g.stride, r.data, r.stride);
- }
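A minimal sketch of plane separation with DeinterleaveBgr, e.g. for per-channel statistics, assuming Simd/SimdLib.hpp (frame size and helper name are illustrative):

#include "Simd/SimdLib.hpp"

typedef Simd::View<Simd::Allocator> View;

void SplitChannels(const View& bgr) // bgr is 640x480, View::Bgr24
{
    View b(640, 480, View::Gray8), g(640, 480, View::Gray8), r(640, 480, View::Gray8);
    Simd::DeinterleaveBgr(bgr, b, g, r); // each output holds one color plane
}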
- /*! @ingroup other_conversion
-
- \fn void DeinterleaveBgra(const View<A>& bgra, View<A>& b, View<A>& g, View<A>& r, View<A>& a)
-
- \short Deinterleaves 32-bit BGRA interleaved image into separated 8-bit Blue, Green, Red and Alpha planar images.
-
- All images must have the same width and height.
-
- \note This function is a C++ wrapper for function ::SimdDeinterleaveBgra.
-
- \param [in] bgra - an input 32-bit BGRA interleaved image.
- \param [out] b - an output 8-bit Blue planar image.
- \param [out] g - an output 8-bit Green planar image.
- \param [out] r - an output 8-bit Red planar image.
- \param [out] a - an output 8-bit Alpha planar image.
- */
- template <template <class> class A> SIMD_INLINE void DeinterleaveBgra(const View<A>& bgra, View<A>& b, View<A>& g, View<A>& r, View<A>& a)
- {
- assert(EqualSize(bgra, b) && Compatible(b, g, r, a) && bgra.format == View<A>::Bgra32 && b.format == View<A>::Gray8);
-
- SimdDeinterleaveBgra(bgra.data, bgra.stride, bgra.width, bgra.height, b.data, b.stride, g.data, g.stride, r.data, r.stride, a.data, a.stride);
- }
-
- /*! @ingroup edge_background
-
- \fn void EdgeBackgroundGrowRangeSlow(const View<A>& value, View<A>& background)
-
- \short Performs edge background update (initial grow, slow mode).
-
- All images must have the same width, height and format (8-bit gray).
-
- For every point:
- \verbatim
- background[i] += value[i] > background[i] ? 1 : 0;
- \endverbatim
-
- This function is used for edge background updating in the motion detection algorithm.
-
- \note This function is a C++ wrapper for function ::SimdEdgeBackgroundGrowRangeSlow.
-
- \param [in] value - a current feature value.
- \param [in, out] background - a feature value of edge dynamic background.
- */
- template <template <class> class A> SIMD_INLINE void EdgeBackgroundGrowRangeSlow(const View<A>& value, View<A>& background)
- {
- assert(Compatible(value, background) && value.format == View<A>::Gray8);
-
- SimdEdgeBackgroundGrowRangeSlow(value.data, value.stride, value.width, value.height, background.data, background.stride);
- }
-
- /*! @ingroup edge_background
-
- \fn void EdgeBackgroundGrowRangeFast(const View<A>& value, View<A>& background)
-
- \short Performs edge background update (initial grow, fast mode).
-
- All images must have the same width, height and format (8-bit gray).
-
- For every point:
- \verbatim
- background[i] = value[i] > background[i] ? value[i] : background[i];
- \endverbatim
-
- This function is used for edge background updating in the motion detection algorithm.
-
- \note This function is a C++ wrapper for function ::SimdEdgeBackgroundGrowRangeFast.
-
- \param [in] value - a current feature value.
- \param [in, out] background - a feature value of edge dynamic background.
- */
- template <template <class> class A> SIMD_INLINE void EdgeBackgroundGrowRangeFast(const View<A>& value, View<A>& background)
- {
- assert(Compatible(value, background) && value.format == View<A>::Gray8);
-
- SimdEdgeBackgroundGrowRangeFast(value.data, value.stride, value.width, value.height, background.data, background.stride);
- }
-
- /*! @ingroup edge_background
-
- \fn void EdgeBackgroundIncrementCount(const View<A>& value, const View<A>& backgroundValue, View<A>& backgroundCount)
-
- \short Performs collection of edge background statistics.
-
- All images must have the same width, height and format (8-bit gray).
-
- Updates background statistics counters for every point:
- \verbatim
- backgroundCount[i] += (value[i] > backgroundValue[i] && backgroundCount[i] < 255) ? 1 : 0;
- \endverbatim
-
- This function is used for edge background updating in the motion detection algorithm.
-
- \note This function is a C++ wrapper for function ::SimdEdgeBackgroundIncrementCount.
-
- \param [in] value - a current feature value.
- \param [in] backgroundValue - a value of feature of edge dynamic background.
- \param [in, out] backgroundCount - a count of feature of edge dynamic background.
- */
- template <template <class> class A> SIMD_INLINE void EdgeBackgroundIncrementCount(const View<A>& value, const View<A>& backgroundValue, View<A>& backgroundCount)
- {
- assert(Compatible(value, backgroundValue, backgroundCount) && value.format == View<A>::Gray8);
-
- SimdEdgeBackgroundIncrementCount(value.data, value.stride, value.width, value.height,
- backgroundValue.data, backgroundValue.stride, backgroundCount.data, backgroundCount.stride);
- }
-
- /*! @ingroup edge_background
-
- \fn void EdgeBackgroundAdjustRange(View<A>& backgroundCount, View<A>& backgroundValue, uint8_t threshold)
-
- \short Performs adjustment of edge background range.
-
- All images must have the same width, height and format (8-bit gray).
-
- Adjusts edge background range for every point:
- \verbatim
- backgroundValue[i] += (backgroundCount[i] > threshold && backgroundValue[i] < 255) ? 1 : 0;
- backgroundValue[i] -= (backgroundCount[i] < threshold && backgroundValue[i] > 0) ? 1 : 0;
- backgroundCount[i] = 0;
- \endverbatim
-
- This function is used for edge background updating in the motion detection algorithm.
-
- \note This function is a C++ wrapper for function ::SimdEdgeBackgroundAdjustRange.
-
- \param [in, out] backgroundCount - a count of feature of edge dynamic background.
- \param [in, out] backgroundValue - a value of feature of edge dynamic background.
- \param [in] threshold - a count threshold.
- */
- template <template <class> class A> SIMD_INLINE void EdgeBackgroundAdjustRange(View<A>& backgroundCount, View<A>& backgroundValue, uint8_t threshold)
- {
- assert(Compatible(backgroundCount, backgroundValue) && backgroundCount.format == View<A>::Gray8);
-
- SimdEdgeBackgroundAdjustRange(backgroundCount.data, backgroundCount.stride, backgroundCount.width, backgroundCount.height,
- backgroundValue.data, backgroundValue.stride, threshold);
- }
-
- /*! @ingroup edge_background
-
- \fn void EdgeBackgroundAdjustRange(View<A>& backgroundCount, View<A>& backgroundValue, uint8_t threshold, const View<A>& mask)
-
- \short Performs adjustment of edge background range using an adjust range mask.
-
- All images must have the same width, height and format (8-bit gray).
-
- Adjusts edge background range for every point:
- \verbatim
- if(mask[i])
- {
- backgroundValue[i] += (backgroundCount[i] > threshold && backgroundValue[i] < 255) ? 1 : 0;
- backgroundValue[i] -= (backgroundCount[i] < threshold && backgroundValue[i] > 0) ? 1 : 0;
- backgroundCount[i] = 0;
- }
- \endverbatim
-
- This function is used for edge background updating in the motion detection algorithm.
-
- \note This function is a C++ wrapper for function ::SimdEdgeBackgroundAdjustRangeMasked.
-
- \param [in, out] backgroundCount - a count of feature of edge dynamic background.
- \param [in, out] backgroundValue - a value of feature of edge dynamic background.
- \param [in] threshold - a count threshold.
- \param [in] mask - an adjust range mask.
- */
- template <template <class> class A> SIMD_INLINE void EdgeBackgroundAdjustRange(View<A>& backgroundCount, View<A>& backgroundValue, uint8_t threshold, const View<A>& mask)
- {
- assert(Compatible(backgroundCount, backgroundValue, mask) && backgroundCount.format == View<A>::Gray8);
-
- SimdEdgeBackgroundAdjustRangeMasked(backgroundCount.data, backgroundCount.stride, backgroundCount.width, backgroundCount.height,
- backgroundValue.data, backgroundValue.stride, threshold, mask.data, mask.stride);
- }
-
- /*! @ingroup edge_background
-
- \fn void EdgeBackgroundShiftRange(const View<A>& value, View<A>& background)
-
- \short Shifts edge background range.
-
- All images must have the same width, height and format (8-bit gray).
-
- For every point:
- \verbatim
- background[i] = value[i];
- \endverbatim
-
- This function is used for fast edge background updating in the motion detection algorithm.
-
- \note This function is a C++ wrapper for function ::SimdEdgeBackgroundShiftRange.
-
- \param [in] value - a current feature value.
- \param [in, out] background - a feature of the edge dynamic background.
- */
- template <template <class> class A> SIMD_INLINE void EdgeBackgroundShiftRange(const View<A>& value, View<A>& background)
- {
- assert(Compatible(value, background) && value.format == View<A>::Gray8);
-
- SimdEdgeBackgroundShiftRange(value.data, value.stride, value.width, value.height, background.data, background.stride);
- }
-
- /*! @ingroup edge_background
-
- \fn void EdgeBackgroundShiftRange(const View<A>& value, View<A>& background, const View<A>& mask)
-
- \short Shifts edge background range using a shift range mask.
-
- All images must have the same width, height and format (8-bit gray).
-
- For every point:
- \verbatim
- if(mask[i])
- background[i] = value[i];
- \endverbatim
-
- This function is used for fast edge background updating in the motion detection algorithm.
-
- \note This function is a C++ wrapper for function ::SimdEdgeBackgroundShiftRangeMasked.
-
- \param [in] value - a current feature value.
- \param [in, out] background - a feature of the edge dynamic background.
- \param [in] mask - a shift range mask.
- */
- template <template <class> class A> SIMD_INLINE void EdgeBackgroundShiftRange(const View<A>& value, View<A>& background, const View<A>& mask)
- {
- assert(Compatible(value, background, mask) && value.format == View<A>::Gray8);
-
- SimdEdgeBackgroundShiftRangeMasked(value.data, value.stride, value.width, value.height,
- background.data, background.stride, mask.data, mask.stride);
- }
-
- /*! @ingroup filling
-
- \fn void Fill(View<A>& dst, uint8_t value)
-
- \short Fills pixel data of an image with a given value.
-
- \note This function is a C++ wrapper for function ::SimdFill.
-
- \param [out] dst - a destination image.
- \param [in] value - a value to fill the image.
- */
- template <template <class> class A> SIMD_INLINE void Fill(View<A>& dst, uint8_t value)
- {
- SimdFill(dst.data, dst.stride, dst.width, dst.height, dst.PixelSize(), value);
- }
-
- /*! @ingroup filling
-
- \fn void FillFrame(View<A>& dst, const Rectangle<ptrdiff_t> & frame, uint8_t value)
-
- \short Fills pixel data of an image, except for the portion bounded by the frame, with a given value.
-
- \note This function is a C++ wrapper for function ::SimdFillFrame.
-
- \param [out] dst - a destination image.
- \param [in] frame - a frame rectangle.
- \param [in] value - a value to fill the image.
- */
- template <template <class> class A> SIMD_INLINE void FillFrame(View<A>& dst, const Rectangle<ptrdiff_t> & frame, uint8_t value)
- {
- SimdFillFrame(dst.data, dst.stride, dst.width, dst.height, dst.PixelSize(),
- frame.left, frame.top, frame.right, frame.bottom, value);
- }
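Note that FillFrame fills everything outside the frame rectangle and leaves its interior untouched. A minimal sketch, assuming Simd/SimdLib.hpp and its Simd::Rectangle<ptrdiff_t> type (sizes and helper name are illustrative):

#include "Simd/SimdLib.hpp"

typedef Simd::View<Simd::Allocator> View;

void MarkBorder(View& canvas) // canvas is 640x480, View::Gray8
{
    Simd::Fill(canvas, 0); // clear the whole image
    // Fill only the 20-pixel border; the central 600x440 window keeps the zeros.
    Simd::FillFrame(canvas, Simd::Rectangle<ptrdiff_t>(20, 20, 620, 460), 255);
}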
- /*! @ingroup filling
-
- \fn void FillBgr(View<A>& dst, uint8_t blue, uint8_t green, uint8_t red)
-
- \short Fills pixel data of a 24-bit BGR image with a given color (blue, green, red).
-
- \note This function is a C++ wrapper for function ::SimdFillBgr.
-
- \param [out] dst - a destination image.
- \param [in] blue - a blue channel of BGR to fill image.
- \param [in] green - a green channel of BGR to fill image.
- \param [in] red - a red channel of BGR to fill image.
- */
- template <template <class> class A> SIMD_INLINE void FillBgr(View<A>& dst, uint8_t blue, uint8_t green, uint8_t red)
- {
- assert(dst.format == View<A>::Bgr24);
-
- SimdFillBgr(dst.data, dst.stride, dst.width, dst.height, blue, green, red);
- }
-
- /*! @ingroup filling
-
- \fn void FillBgra(View<A>& dst, uint8_t blue, uint8_t green, uint8_t red, uint8_t alpha = 0xFF)
-
- \short Fills pixel data of a 32-bit BGRA image with a given color (blue, green, red, alpha).
-
- \note This function is a C++ wrapper for function ::SimdFillBgra.
-
- \param [out] dst - a destination image.
- \param [in] blue - a blue channel of BGRA to fill image.
- \param [in] green - a green channel of BGRA to fill image.
- \param [in] red - a red channel of BGRA to fill image.
- \param [in] alpha - an alpha channel of BGRA to fill image. It is equal to 255 by default.
- */
- template <template <class> class A> SIMD_INLINE void FillBgra(View<A>& dst, uint8_t blue, uint8_t green, uint8_t red, uint8_t alpha = 0xFF)
- {
- assert(dst.format == View<A>::Bgra32);
-
- SimdFillBgra(dst.data, dst.stride, dst.width, dst.height, blue, green, red, alpha);
- }
-
- /*! @ingroup filling
-
- \fn void FillPixel(View<A> & dst, const Pixel & pixel)
-
- \short Fills an image with the value of a given pixel.
-
- \note This function is a C++ wrapper for function ::SimdFillPixel.
-
- \param [out] dst - a destination image.
- \param [in] pixel - a pixel whose type corresponds to the image format. The size of the type is restricted to the range [1, 4].
- */
- template <template <class> class A, class Pixel> SIMD_INLINE void FillPixel(View<A> & dst, const Pixel & pixel)
- {
- assert(dst.PixelSize() == sizeof(Pixel));
-
- SimdFillPixel(dst.data, dst.stride, dst.width, dst.height, (uint8_t*)&pixel, sizeof(Pixel));
- }
-
- /*! @ingroup other_filter
-
- \fn void GaussianBlur3x3(const View<A>& src, View<A>& dst)
-
- \short Performs Gaussian blur filtration with window 3x3.
-
- For every point:
- \verbatim
- dst[x, y] = (src[x-1, y-1] + 2*src[x, y-1] + src[x+1, y-1] +
- 2*(src[x-1, y] + 2*src[x, y] + src[x+1, y]) +
- src[x-1, y+1] + 2*src[x, y+1] + src[x+1, y+1] + 8) / 16;
- \endverbatim
- All images must have the same width, height and format (8-bit gray, 16-bit UV, 24-bit BGR or 32-bit BGRA).
-
- \note This function is a C++ wrapper for function ::SimdGaussianBlur3x3.
-
- \param [in] src - a source image.
- \param [out] dst - a destination image.
- */
- template <template <class> class A> SIMD_INLINE void GaussianBlur3x3(const View<A>& src, View<A>& dst)
- {
- assert(Compatible(src, dst) && src.ChannelSize() == 1);
-
- SimdGaussianBlur3x3(src.data, src.stride, src.width, src.height, src.ChannelCount(), dst.data, dst.stride);
- }
-
- /*! @ingroup gray_conversion
-
- \fn void GrayToBgr(const View<A>& gray, View<A>& bgr)
-
- \short Converts 8-bit gray image to 24-bit BGR image.
-
- All images must have the same width and height.
-
- \note This function is a C++ wrapper for function ::SimdGrayToBgr.
-
- \param [in] gray - an input 8-bit gray image.
- \param [out] bgr - an output 24-bit BGR image.
- */
- template <template <class> class A> SIMD_INLINE void GrayToBgr(const View<A>& gray, View<A>& bgr)
- {
- assert(EqualSize(gray, bgr) && bgr.format == View<A>::Bgr24 && gray.format == View<A>::Gray8);
-
- SimdGrayToBgr(gray.data, gray.width, gray.height, gray.stride, bgr.data, bgr.stride);
- }
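A minimal smoothing sketch with GaussianBlur3x3 followed by a gray-to-BGR expansion, assuming Simd/SimdLib.hpp (sizes and helper name are illustrative):

#include "Simd/SimdLib.hpp"

typedef Simd::View<Simd::Allocator> View;

void BlurAndColorize(const View& src) // src is 640x480, View::Gray8
{
    View blurred(640, 480, View::Gray8);
    Simd::GaussianBlur3x3(src, blurred); // kernel [1 2 1] x [1 2 1] / 16
    View bgr(640, 480, View::Bgr24);
    Simd::GrayToBgr(blurred, bgr);       // replicate gray into B, G and R
}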
- /*! @ingroup gray_conversion
-
- \fn void GrayToBgra(const View<A>& gray, View<A>& bgra, uint8_t alpha = 0xFF)
-
- \short Converts 8-bit gray image to 32-bit BGRA image.
-
- All images must have the same width and height.
-
- \note This function is a C++ wrapper for function ::SimdGrayToBgra.
-
- \param [in] gray - an input 8-bit gray image.
- \param [out] bgra - an output 32-bit BGRA image.
- \param [in] alpha - a value of alpha channel. It is equal to 255 by default.
- */
- template <template <class> class A> SIMD_INLINE void GrayToBgra(const View<A>& gray, View<A>& bgra, uint8_t alpha = 0xFF)
- {
- assert(EqualSize(gray, bgra) && bgra.format == View<A>::Bgra32 && gray.format == View<A>::Gray8);
-
- SimdGrayToBgra(gray.data, gray.width, gray.height, gray.stride, bgra.data, bgra.stride, alpha);
- }
-
- /*! @ingroup histogram
-
- \fn void AbsSecondDerivativeHistogram(const View<A>& src, size_t step, size_t indent, uint32_t * histogram)
-
- \short Calculates histogram of second derivative for 8-bit gray image.
-
- For all points except the boundary (defined by parameter indent):
- \verbatim
- dx = abs(src[x, y] - average(src[x+step, y], src[x-step, y]));
- dy = abs(src[x, y] - average(src[x, y+step], src[x, y-step]));
- histogram[max(dx, dy)]++;
- \endverbatim
-
- \note This function is a C++ wrapper for function ::SimdAbsSecondDerivativeHistogram.
-
- \param [in] src - an input 8-bit gray image.
- \param [in] step - a step for second derivative calculation.
- \param [in] indent - an indent from the image boundary.
- \param [out] histogram - a pointer to histogram (array of 256 unsigned 32-bit values).
- */
- template <template <class> class A> SIMD_INLINE void AbsSecondDerivativeHistogram(const View<A>& src, size_t step, size_t indent, uint32_t * histogram)
- {
- assert(src.format == View<A>::Gray8 && indent >= step && src.width > 2 * indent && src.height > 2 * indent);
-
- SimdAbsSecondDerivativeHistogram(src.data, src.width, src.height, src.stride, step, indent, histogram);
- }
-
- /*! @ingroup histogram
-
- \fn void Histogram(const View<A>& src, uint32_t * histogram)
-
- \short Calculates histogram for 8-bit gray image.
-
- For all points:
- \verbatim
- histogram[src[i]]++.
- \endverbatim
-
- \note This function is a C++ wrapper for function ::SimdHistogram.
-
- \param [in] src - an input 8-bit gray image.
- \param [out] histogram - a pointer to histogram (array of 256 unsigned 32-bit values).
- */
- template <template <class> class A> SIMD_INLINE void Histogram(const View<A>& src, uint32_t * histogram)
- {
- assert(src.format == View<A>::Gray8);
-
- SimdHistogram(src.data, src.width, src.height, src.stride, histogram);
- }
-
- /*! @ingroup histogram
-
- \fn void HistogramMasked(const View<A> & src, const View<A> & mask, uint8_t index, uint32_t * histogram)
-
- \short Calculates histogram for 8-bit gray image using a mask.
-
- For every point:
- \verbatim
- if(mask[i] == index)
- histogram[src[i]]++.
- \endverbatim
-
- \note This function is a C++ wrapper for function ::SimdHistogramMasked.
-
- \param [in] src - an input 8-bit gray image.
- \param [in] mask - an 8-bit mask image.
- \param [in] index - a mask index.
- \param [out] histogram - a pointer to histogram (array of 256 unsigned 32-bit values).
- */
- template <template <class> class A> SIMD_INLINE void HistogramMasked(const View<A> & src, const View<A> & mask, uint8_t index, uint32_t * histogram)
- {
- assert(Compatible(src, mask) && src.format == View<A>::Gray8);
-
- SimdHistogramMasked(src.data, src.stride, src.width, src.height, mask.data, mask.stride, index, histogram);
- }
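A minimal sketch of histogram collection and a simple query on it, assuming Simd/SimdLib.hpp; the helper name is hypothetical:

#include <algorithm>
#include "Simd/SimdLib.hpp"

typedef Simd::View<Simd::Allocator> View;

// Returns the most frequent intensity of an 8-bit gray image.
size_t HistogramPeak(const View& gray)
{
    uint32_t histogram[256] = { 0 };
    Simd::Histogram(gray, histogram);
    return std::max_element(histogram, histogram + 256) - histogram;
}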
- /*! @ingroup histogram
-
- \fn void HistogramConditional(const View<A>& src, const View<A>& mask, uint8_t value, SimdCompareType compareType, uint32_t * histogram)
-
- \short Calculates histogram of 8-bit gray image for those points whose mask points satisfy a certain condition.
-
- For every point:
- \verbatim
- if(compare(mask[x, y], value))
- histogram[src[x, y]]++.
- \endverbatim
- where compare(a, b) depends on compareType (see ::SimdCompareType).
-
- \note This function is a C++ wrapper for function ::SimdHistogramConditional.
-
- \param [in] src - an input 8-bit gray image.
- \param [in] mask - an 8-bit mask image.
- \param [in] value - a second value for compare operation.
- \param [in] compareType - a compare operation type (see ::SimdCompareType).
- \param [out] histogram - a pointer to histogram (array of 256 unsigned 32-bit values).
- */
- template <template <class> class A> SIMD_INLINE void HistogramConditional(const View<A>& src, const View<A>& mask, uint8_t value, SimdCompareType compareType, uint32_t * histogram)
- {
- assert(Compatible(src, mask) && src.format == View<A>::Gray8);
-
- SimdHistogramConditional(src.data, src.stride, src.width, src.height, mask.data, mask.stride, value, compareType, histogram);
- }
-
- /*! @ingroup histogram
-
- \fn void ChangeColors(const View<A> & src, const uint8_t * colors, View<A> & dst)
-
- \short Changes colors of an 8-bit gray image using a color map.
-
- The input and output 8-bit gray images must have the same size.
- Algorithm description:
- \verbatim
- for(y = 0; y < height; ++y)
- for(x = 0; x < width; ++x)
- dst[x, y] = colors[src[x, y]];
- \endverbatim
- \note This function is a C++ wrapper for function ::SimdChangeColors.
-
- \param [in] src - an input 8-bit gray image.
- \param [in] colors - a pointer to the color map (array of 256 unsigned 8-bit values).
- \param [out] dst - an output 8-bit gray image.
- */
- template <template <class> class A> SIMD_INLINE void ChangeColors(const View<A> & src, const uint8_t * colors, View<A> & dst)
- {
- assert(Compatible(src, dst) && src.format == View<A>::Gray8);
-
- SimdChangeColors(src.data, src.stride, src.width, src.height, colors, dst.data, dst.stride);
- }
-
- /*! @ingroup histogram
-
- \fn void NormalizeHistogram(const View<A> & src, View<A> & dst)
-
- \short Normalizes histogram for 8-bit gray image.
-
- The input and output 8-bit gray images must have the same size.
-
- \note This function is a C++ wrapper for function ::SimdNormalizeHistogram.
-
- \param [in] src - an input 8-bit gray image.
- \param [out] dst - an output 8-bit image with normalized histogram.
- */
- template <template <class> class A> SIMD_INLINE void NormalizeHistogram(const View<A> & src, View<A> & dst)
- {
- assert(Compatible(src, dst) && src.format == View<A>::Gray8);
-
- SimdNormalizeHistogram(src.data, src.stride, src.width, src.height, dst.data, dst.stride);
- }
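ChangeColors is effectively a 256-entry lookup table. A minimal sketch building an inversion LUT, assuming Simd/SimdLib.hpp; the helper name is hypothetical:

#include "Simd/SimdLib.hpp"

typedef Simd::View<Simd::Allocator> View;

void Invert(const View& src, View& dst) // both 8-bit gray, same size
{
    uint8_t colors[256];
    for (int i = 0; i < 256; ++i)
        colors[i] = uint8_t(255 - i); // dst[x, y] = 255 - src[x, y]
    Simd::ChangeColors(src, colors, dst);
}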
- /*! @ingroup hog
-
- \fn void HogDirectionHistograms(const View<A> & src, const Point<ptrdiff_t> & cell, size_t quantization, float * histograms);
-
- \short Calculates HOG direction histograms for 8-bit gray image.
-
- Calculates HOG direction histogram for every cell of 8-bit gray image. This function is useful for face recognition.
-
- \note This function is a C++ wrapper for function ::SimdHogDirectionHistograms.
-
- \param [in] src - an input 8-bit gray image. Its size must be a multiple of cell size.
- \param [in] cell - a size of cell.
- \param [in] quantization - a direction quantization. Must be even.
- \param [out] histograms - a pointer to buffer with histograms. Array must have a size greater than or equal to (src.width/cell.x)*(src.height/cell.y)*quantization.
- */
- template <template <class> class A> SIMD_INLINE void HogDirectionHistograms(const View<A> & src, const Point<ptrdiff_t> & cell, size_t quantization, float * histograms)
- {
- assert(src.format == View<A>::Gray8 && src.width%cell.x == 0 && src.height%cell.y == 0 && quantization % 2 == 0);
-
- SimdHogDirectionHistograms(src.data, src.stride, src.width, src.height, cell.x, cell.y, quantization, histograms);
- }
-
- /*! @ingroup hog
-
- \fn void HogExtractFeatures(const View<A> & src, float * features)
-
- \short Extracts HOG features for 8-bit gray image.
-
- Extracts HOG features from an 8-bit gray image. 31 features are extracted for 8x8 cell size and 2x2 block size. This function is useful for face recognition.
-
- \note This function is a C++ wrapper for function ::SimdHogExtractFeatures.
-
- \param [in] src - an input 8-bit gray image. Its width and height must be multiples of 8 and greater than or equal to 16.
- \param [out] features - a pointer to buffer with features. Array must have a size greater than or equal to (width/8)*(height/8)*31.
- */
- template <template <class> class A> SIMD_INLINE void HogExtractFeatures(const View<A> & src, float * features)
- {
- assert(src.format == View<A>::Gray8 && src.width % 8 == 0 && src.height % 8 == 0 && src.width >= 16 && src.height >= 16);
-
- SimdHogExtractFeatures(src.data, src.stride, src.width, src.height, features);
- }
-
- /*! @ingroup hog
-
- \fn void HogLiteExtractFeatures(const View<A> & src, size_t cell, float * features, size_t featuresStride)
-
- \short Extracts lite HOG features for 8-bit gray image.
-
- Extracts lite (8-direction) HOG features from an 8-bit gray image. 16 features are extracted for 8x8 or 4x4 cell size and 2x2 block size.
-
- \note This function is a C++ wrapper for function ::SimdHogLiteExtractFeatures.
-
- \param [in] src - an input 8-bit gray image. Its width and height must be multiples of cell and greater than or equal to cell*3.
- \param [in] cell - a size of cell. It must be 4 or 8.
- \param [out] features - a pointer to buffer with features. Array must have a size greater than or equal to (height/cell - 2)*featuresStride.
- \param [in] featuresStride - a row size of the buffer with features. It must be greater than or equal to (width/cell - 2)*16.
- */
- template <template <class> class A> SIMD_INLINE void HogLiteExtractFeatures(const View<A> & src, size_t cell, float * features, size_t featuresStride)
- {
- assert((cell == 4 || cell == 8) && featuresStride >= (src.width / cell - 2) * 16);
- assert(src.format == View<A>::Gray8 && src.width >= cell * 3 && src.height >= cell * 3);
-
- SimdHogLiteExtractFeatures(src.data, src.stride, src.width, src.height, cell, features, featuresStride);
- }
-
- /*! @ingroup other_conversion
-
- \fn void Int16ToGray(const View<A> & src, View<A> & dst)
-
- \short Converts 16-bit signed integer image to 8-bit gray image with saturation.
-
- All images must have the same width and height.
-
- For every point:
- \verbatim
- dst[i] = Max(0, Min(255, src[i]));
- \endverbatim
-
- \note This function is a C++ wrapper for function ::SimdInt16ToGray.
-
- \param [in] src - an input 16-bit signed integer image.
- \param [out] dst - an output 8-bit gray image.
- */
- template <template <class> class A> SIMD_INLINE void Int16ToGray(const View<A> & src, View<A> & dst)
- {
- assert(EqualSize(src, dst) && src.format == View<A>::Int16 && dst.format == View<A>::Gray8);
-
- SimdInt16ToGray(src.data, src.width, src.height, src.stride, dst.data, dst.stride);
- }
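The feature buffer sizes above are easy to get wrong; a minimal sketch for HogExtractFeatures, assuming Simd/SimdLib.hpp (the 64x128 window and the helper name are illustrative):

#include <vector>
#include "Simd/SimdLib.hpp"

typedef Simd::View<Simd::Allocator> View;

void ExtractHog(const View& gray) // gray is 64x128, View::Gray8 (multiples of 8)
{
    // 31 features per 8x8 cell: (64 / 8) * (128 / 8) * 31 floats.
    std::vector<float> features((64 / 8) * (128 / 8) * 31);
    Simd::HogExtractFeatures(gray, features.data());
}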
- /*! @ingroup integral - - \fn void Integral(const View<A>& src, View<A>& sum) - - \short Calculates an integral image for an input 8-bit gray image. - - The function calculates a sum integral image. - The integral image must have width and height one pixel greater than those of the input image. - - \note This function is a C++ wrapper for function ::SimdIntegral. - - \param [in] src - an input 8-bit gray image. - \param [out] sum - a 32-bit integer sum image. - */ - template <template <class> class A> SIMD_INLINE void Integral(const View<A>& src, View<A>& sum) - { - assert(src.width + 1 == sum.width && src.height + 1 == sum.height); - assert(src.format == View<A>::Gray8 && sum.format == View<A>::Int32); - - SimdIntegral(src.data, src.stride, src.width, src.height, sum.data, sum.stride, NULL, 0, NULL, 0, - (SimdPixelFormatType)sum.format, SimdPixelFormatNone); - } - - /*! @ingroup integral - - \fn void Integral(const View<A>& src, View<A>& sum, View<A>& sqsum) - - \short Calculates integral images for an input 8-bit gray image. - - The function calculates a sum integral image and a square sum integral image. - The integral images must have width and height one pixel greater than those of the input image. - - \note This function is a C++ wrapper for function ::SimdIntegral. - - \param [in] src - an input 8-bit gray image. - \param [out] sum - a 32-bit integer sum image. - \param [out] sqsum - a 32-bit integer or 64-bit floating point square sum image. - */ - template <template <class> class A> SIMD_INLINE void Integral(const View<A>& src, View<A>& sum, View<A>& sqsum) - { - assert(src.width + 1 == sum.width && src.height + 1 == sum.height && EqualSize(sum, sqsum)); - assert(src.format == View<A>::Gray8 && sum.format == View<A>::Int32 && (sqsum.format == View<A>::Int32 || sqsum.format == View<A>::Double)); - - SimdIntegral(src.data, src.stride, src.width, src.height, sum.data, sum.stride, sqsum.data, sqsum.stride, NULL, 0, - (SimdPixelFormatType)sum.format, (SimdPixelFormatType)sqsum.format); - } - - /*! @ingroup integral - - \fn void Integral(const View<A>& src, View<A>& sum, View<A>& sqsum, View<A>& tilted) - - \short Calculates integral images for an input 8-bit gray image. - - The function calculates a sum integral image, a square sum integral image and a tilted sum integral image. - The integral images must have width and height one pixel greater than those of the input image. - - \note This function is a C++ wrapper for function ::SimdIntegral. - - \param [in] src - an input 8-bit gray image. - \param [out] sum - a 32-bit integer sum image. - \param [out] sqsum - a 32-bit integer or 64-bit floating point square sum image. - \param [out] tilted - a 32-bit integer tilted sum image. - */ - template <template <class> class A> SIMD_INLINE void Integral(const View<A>& src, View<A>& sum, View<A>& sqsum, View<A>& tilted) - { - assert(src.width + 1 == sum.width && src.height + 1 == sum.height && EqualSize(sum, sqsum) && Compatible(sum, tilted)); - assert(src.format == View<A>::Gray8 && sum.format == View<A>::Int32 && (sqsum.format == View<A>::Int32 || sqsum.format == View<A>::Double)); - - SimdIntegral(src.data, src.stride, src.width, src.height, sum.data, sum.stride, sqsum.data, sqsum.stride, tilted.data, tilted.stride, - (SimdPixelFormatType)sum.format, (SimdPixelFormatType)sqsum.format); - }
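Once the sum image is built, the sum of src over any rectangle [x0, x1) x [y0, y1) costs four lookups. A minimal sketch, assuming the Simd C++ API (the SumAt helper is illustrative, not part of the API; note that View::stride is measured in bytes):
\verbatim
#include "Simd/SimdLib.hpp"

typedef Simd::View<Simd::Allocator> View;

// Value of the 32-bit sum image at (x, y); stride is in bytes.
static uint32_t SumAt(const View & sum, size_t x, size_t y)
{
    return ((const uint32_t*)(sum.data + y * sum.stride))[x];
}

int main()
{
    View src(640, 480, View::Gray8);
    // ... fill src ...
    View sum(641, 481, View::Int32);
    Simd::Integral(src, sum);

    // Sum of src over the rectangle [x0, x1) x [y0, y1):
    size_t x0 = 10, y0 = 20, x1 = 110, y1 = 220;
    uint32_t s = SumAt(sum, x1, y1) - SumAt(sum, x0, y1)
               - SumAt(sum, x1, y0) + SumAt(sum, x0, y0);
    (void)s;
    return 0;
}
\endverbatim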
- /*! @ingroup interference - - \fn void InterferenceIncrement(View<A> & dst, uint8_t increment, int16_t saturation) - - \short Increments the statistic of the interference detector. - - For every point: - \verbatim - statistic[i] = min(statistic[i] + increment, saturation); - \endverbatim - - This function is used for interference detection in a motion detection algorithm. - - \note This function is a C++ wrapper for function ::SimdInterferenceIncrement. - - \param [in, out] dst - a 16-bit signed integer image with the statistic. - \param [in] increment - an increment of the statistic. - \param [in] saturation - an upper saturation of the statistic. - */ - template <template <class> class A> SIMD_INLINE void InterferenceIncrement(View<A> & dst, uint8_t increment, int16_t saturation) - { - assert(dst.format == View<A>::Int16); - - SimdInterferenceIncrement(dst.data, dst.stride, dst.width, dst.height, increment, saturation); - } - - /*! @ingroup interference - - \fn void InterferenceIncrementMasked(View<A> & dst, uint8_t increment, int16_t saturation, const View<A>& mask, uint8_t index) - - \short Increments the statistic of the interference detector using a segmentation mask. - - For every point: - \verbatim - if(mask[i] == index) - statistic[i] = min(statistic[i] + increment, saturation); - \endverbatim - - All images must have the same width and height. - This function is used for interference detection in a motion detection algorithm. - - \note This function is a C++ wrapper for function ::SimdInterferenceIncrementMasked. - - \param [in, out] dst - a 16-bit signed integer image with the statistic. - \param [in] increment - an increment of the statistic. - \param [in] saturation - an upper saturation of the statistic. - \param [in] mask - an 8-bit gray image with the mask. - \param [in] index - an index of the mask. - */ - template <template <class> class A> SIMD_INLINE void InterferenceIncrementMasked(View<A> & dst, uint8_t increment, int16_t saturation, const View<A>& mask, uint8_t index) - { - assert(dst.format == View<A>::Int16 && mask.format == View<A>::Gray8 && EqualSize(dst, mask)); - - SimdInterferenceIncrementMasked(dst.data, dst.stride, dst.width, dst.height, increment, saturation, mask.data, mask.stride, index); - } - - /*! @ingroup interference - - \fn void InterferenceDecrement(View<A> & dst, uint8_t decrement, int16_t saturation) - - \short Decrements the statistic of the interference detector. - - For every point: - \verbatim - statistic[i] = max(statistic[i] - decrement, saturation); - \endverbatim - - This function is used for interference detection in a motion detection algorithm. - - \note This function is a C++ wrapper for function ::SimdInterferenceDecrement. - - \param [in, out] dst - a 16-bit signed integer image with the statistic. - \param [in] decrement - a decrement of the statistic. - \param [in] saturation - a lower saturation of the statistic. - */ - template <template <class> class A> SIMD_INLINE void InterferenceDecrement(View<A> & dst, uint8_t decrement, int16_t saturation) - { - assert(dst.format == View<A>::Int16); - - SimdInterferenceDecrement(dst.data, dst.stride, dst.width, dst.height, decrement, saturation); - }
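One plausible arrangement of these calls in a motion detection loop pairs a masked increment with an unmasked decay per frame, so the statistic grows where the mask fires and falls back elsewhere. A minimal sketch, assuming the Simd C++ API (the index, increment, decrement and saturation constants are illustrative):
\verbatim
#include "Simd/SimdLib.hpp"

int main()
{
    typedef Simd::View<Simd::Allocator> View;

    View statistic(640, 480, View::Int16);
    Simd::Fill(statistic, 0);          // clear the statistic
    View mask(640, 480, View::Gray8);  // per-frame segmentation mask

    const uint8_t INDEX = 1;           // mask value that marks motion
    for (int frame = 0; frame < 100; ++frame)
    {
        // ... update mask for the current frame ...
        Simd::InterferenceIncrementMasked(statistic, 4, 1000, mask, INDEX);
        Simd::InterferenceDecrement(statistic, 1, 0);
    }
    return 0;
}
\endverbatim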
- /*! @ingroup interference - - \fn void InterferenceDecrementMasked(View<A> & dst, uint8_t decrement, int16_t saturation, const View<A>& mask, uint8_t index) - - \short Decrements the statistic of the interference detector using a segmentation mask. - - For every point: - \verbatim - if(mask[i] == index) - statistic[i] = max(statistic[i] - decrement, saturation); - \endverbatim - - All images must have the same width and height. - This function is used for interference detection in a motion detection algorithm. - - \note This function is a C++ wrapper for function ::SimdInterferenceDecrementMasked. - - \param [in, out] dst - a 16-bit signed integer image with the statistic. - \param [in] decrement - a decrement of the statistic. - \param [in] saturation - a lower saturation of the statistic. - \param [in] mask - an 8-bit gray image with the mask. - \param [in] index - an index of the mask. - */ - template <template <class> class A> SIMD_INLINE void InterferenceDecrementMasked(View<A> & dst, uint8_t decrement, int16_t saturation, const View<A>& mask, uint8_t index) - { - assert(dst.format == View<A>::Int16 && mask.format == View<A>::Gray8 && EqualSize(dst, mask)); - - SimdInterferenceDecrementMasked(dst.data, dst.stride, dst.width, dst.height, decrement, saturation, mask.data, mask.stride, index); - } - - /*! @ingroup other_conversion - - \fn void InterleaveUv(const View<A>& u, const View<A>& v, View<A>& uv) - - \short Interleaves 8-bit U and V planar images into one 16-bit UV interleaved image. - - All images must have the same width and height. - This function is used for YUV420P to NV12 conversion. - - \note This function is a C++ wrapper for function ::SimdInterleaveUv. - - \param [in] u - an input 8-bit U planar image. - \param [in] v - an input 8-bit V planar image. - \param [out] uv - an output 16-bit UV interleaved image. - */ - template <template <class> class A> SIMD_INLINE void InterleaveUv(const View<A>& u, const View<A>& v, View<A>& uv) - { - assert(EqualSize(uv, u, v) && uv.format == View<A>::Uv16 && u.format == View<A>::Gray8 && v.format == View<A>::Gray8); - - SimdInterleaveUv(u.data, u.stride, v.data, v.stride, u.width, u.height, uv.data, uv.stride); - } - - /*! @ingroup other_conversion - - \fn void InterleaveBgr(const View<A> & b, const View<A> & g, const View<A> & r, View<A> & bgr) - - \short Interleaves 8-bit Blue, Green and Red planar images into one 24-bit BGR interleaved image. - - All images must have the same width and height. - - \note This function is a C++ wrapper for function ::SimdInterleaveBgr. - - \param [in] b - an input 8-bit Blue planar image. - \param [in] g - an input 8-bit Green planar image. - \param [in] r - an input 8-bit Red planar image. - \param [out] bgr - an output 24-bit BGR interleaved image. - */ - template <template <class> class A> SIMD_INLINE void InterleaveBgr(const View<A> & b, const View<A> & g, const View<A> & r, View<A> & bgr) - { - assert(EqualSize(bgr, b, g, r) && Compatible(b, g, r) && bgr.format == View<A>::Bgr24 && b.format == View<A>::Gray8); - - SimdInterleaveBgr(b.data, b.stride, g.data, g.stride, r.data, r.stride, bgr.width, bgr.height, bgr.data, bgr.stride); - } - - /*! @ingroup other_conversion - - \fn void InterleaveBgra(const View<A>& b, const View<A>& g, const View<A>& r, const View<A>& a, View<A>& bgra) - - \short Interleaves 8-bit Blue, Green, Red and Alpha planar images into one 32-bit BGRA interleaved image. - - All images must have the same width and height. - - \note This function is a C++ wrapper for function ::SimdInterleaveBgra. - - \param [in] b - an input 8-bit Blue planar image. - \param [in] g - an input 8-bit Green planar image. - \param [in] r - an input 8-bit Red planar image. - \param [in] a - an input 8-bit Alpha planar image. - \param [out] bgra - an output 32-bit BGRA interleaved image. - */ - template <template <class> class A> SIMD_INLINE void InterleaveBgra(const View<A>& b, const View<A>& g, const View<A>& r, const View<A>& a, View<A>& bgra) - { - assert(EqualSize(bgra, b) && Compatible(b, g, r, a) && bgra.format == View<A>::Bgra32 && b.format == View<A>::Gray8); - - SimdInterleaveBgra(b.data, b.stride, g.data, g.stride, r.data, r.stride, a.data, a.stride, bgra.width, bgra.height, bgra.data, bgra.stride); - } - - /*! @ingroup laplace_filter - - \fn void Laplace(const View<A>& src, View<A>& dst) - - \short Calculates Laplace's filter. - - All images must have the same width and height. The input image must have the 8-bit gray format and the output image must have the 16-bit integer format.
- - For every point: - \verbatim - dst[x, y] = - - src[x-1, y-1] - src[x, y-1] - src[x+1, y-1] - - src[x-1, y] + 8*src[x, y] - src[x+1, y] - - src[x-1, y+1] - src[x, y+1] - src[x+1, y+1]. - \endverbatim - - \note This function is a C++ wrapper for function ::SimdLaplace. - - \param [in] src - an input image. - \param [out] dst - an output image. - */ - template <template <class> class A> SIMD_INLINE void Laplace(const View<A>& src, View<A>& dst) - { - assert(EqualSize(src, dst) && src.format == View<A>::Gray8 && dst.format == View<A>::Int16); - - SimdLaplace(src.data, src.stride, src.width, src.height, dst.data, dst.stride); - } - - /*! @ingroup laplace_filter - - \fn void LaplaceAbs(const View<A>& src, View<A>& dst) - - \short Calculates the absolute value of Laplace's filter. - - All images must have the same width and height. The input image must have the 8-bit gray format and the output image must have the 16-bit integer format. - - For every point: - \verbatim - dst[x, y] = abs( - - src[x-1, y-1] - src[x, y-1] - src[x+1, y-1] - - src[x-1, y] + 8*src[x, y] - src[x+1, y] - - src[x-1, y+1] - src[x, y+1] - src[x+1, y+1]). - \endverbatim - - \note This function is a C++ wrapper for function ::SimdLaplaceAbs. - - \param [in] src - an input image. - \param [out] dst - an output image. - */ - template <template <class> class A> SIMD_INLINE void LaplaceAbs(const View<A>& src, View<A>& dst) - { - assert(EqualSize(src, dst) && src.format == View<A>::Gray8 && dst.format == View<A>::Int16); - - SimdLaplaceAbs(src.data, src.stride, src.width, src.height, dst.data, dst.stride); - } - - /*! @ingroup other_statistic - - \fn void LaplaceAbsSum(const View<A>& src, uint64_t & sum) - - \short Calculates the sum of the absolute value of Laplace's filter. - - The input image must have the 8-bit gray format. - - For every point: - \verbatim - sum += abs( - - src[x-1, y-1] - src[x, y-1] - src[x+1, y-1] - - src[x-1, y] + 8*src[x, y] - src[x+1, y] - - src[x-1, y+1] - src[x, y+1] - src[x+1, y+1]). - \endverbatim - - \note This function is a C++ wrapper for function ::SimdLaplaceAbsSum. - - \param [in] src - an input image. - \param [out] sum - a result sum. - */ - template <template <class> class A> SIMD_INLINE void LaplaceAbsSum(const View<A> & src, uint64_t & sum) - { - assert(src.format == View<A>::Gray8); - - SimdLaplaceAbsSum(src.data, src.stride, src.width, src.height, &sum); - } - - /*! @ingroup other_filter - - \fn void LbpEstimate(const View<A>& src, View<A>& dst) - - \short Calculates LBP (Local Binary Patterns) for an 8-bit gray image. - - All images must have the same width and height. - - \note This function is a C++ wrapper for function ::SimdLbpEstimate. - - \param [in] src - an input 8-bit gray image. - \param [out] dst - an output 8-bit gray image with LBP. - */ - template <template <class> class A> SIMD_INLINE void LbpEstimate(const View<A>& src, View<A>& dst) - { - assert(Compatible(src, dst) && src.format == View<A>::Gray8); - - SimdLbpEstimate(src.data, src.stride, src.width, src.height, dst.data, dst.stride); - } - - /*! @ingroup memory - - \fn void LitterCpuCache(size_t k = 2) - - \short Creates a large buffer and fills it. - - This function litters the CPU cache. It is useful for testing purposes. - - \param [in] k - a boosting coefficient of the stub buffer size relative to the CPU L3 cache size. Its default value is 2. - */ - SIMD_INLINE void LitterCpuCache(size_t k = 2) - { - size_t size = SimdCpuInfo(SimdCpuInfoCacheL3)*k; - uint8_t * buffer = (uint8_t*)SimdAllocate(size, SimdAlignment()); - SimdFillBgra(buffer, size, size / 4, 1, 0, 1, 2, 3); - SimdFree(buffer); - }
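LitterCpuCache is meant to be called between timed runs so that each measurement starts from a cold cache. A minimal sketch, assuming the Simd C++ API (the function under test and the iteration count are illustrative; MeanFilter3x3 is declared just below):
\verbatim
#include "Simd/SimdLib.hpp"
#include <chrono>
#include <iostream>

int main()
{
    typedef Simd::View<Simd::Allocator> View;

    View src(1920, 1080, View::Gray8), dst(1920, 1080, View::Gray8);
    double total = 0;
    for (int i = 0; i < 10; ++i)
    {
        Simd::LitterCpuCache(); // evict src/dst from the caches before timing
        auto t0 = std::chrono::steady_clock::now();
        Simd::MeanFilter3x3(src, dst); // function under test
        auto t1 = std::chrono::steady_clock::now();
        total += std::chrono::duration<double>(t1 - t0).count();
    }
    std::cout << "mean time: " << total / 10 << " s\n";
    return 0;
}
\endverbatim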
/*! @ingroup other_filter - - \fn void MeanFilter3x3(const View<A>& src, View<A>& dst) - - \short Performs averaging with a 3x3 window. - - For every point: - \verbatim - dst[x, y] = (src[x-1, y-1] + src[x, y-1] + src[x+1, y-1] + - src[x-1, y] + src[x, y] + src[x+1, y] + - src[x-1, y+1] + src[x, y+1] + src[x+1, y+1] + 4) / 9; - \endverbatim - - All images must have the same width, height and format (8-bit gray, 16-bit UV, 24-bit BGR or 32-bit BGRA). - - \note This function is a C++ wrapper for function ::SimdMeanFilter3x3. - - \param [in] src - a source image. - \param [out] dst - a destination image. - */ - template <template <class> class A> SIMD_INLINE void MeanFilter3x3(const View<A>& src, View<A>& dst) - { - assert(Compatible(src, dst) && src.ChannelSize() == 1); - - SimdMeanFilter3x3(src.data, src.stride, src.width, src.height, src.ChannelCount(), dst.data, dst.stride); - } - - /*! @ingroup median_filter - - \fn void MedianFilterRhomb3x3(const View<A>& src, View<A>& dst) - - \short Performs median filtration of the input image (the filter window is a 3x3 rhomb). - - All images must have the same width, height and format (8-bit gray, 16-bit UV, 24-bit BGR or 32-bit BGRA). - - \note This function is a C++ wrapper for function ::SimdMedianFilterRhomb3x3. - - \param [in] src - an original input image. - \param [out] dst - a filtered output image. - */ - template <template <class> class A> SIMD_INLINE void MedianFilterRhomb3x3(const View<A>& src, View<A>& dst) - { - assert(Compatible(src, dst) && src.ChannelSize() == 1); - - SimdMedianFilterRhomb3x3(src.data, src.stride, src.width, src.height, src.ChannelCount(), dst.data, dst.stride); - } - - /*! @ingroup median_filter - - \fn void MedianFilterRhomb5x5(const View<A>& src, View<A>& dst) - - \short Performs median filtration of the input image (the filter window is a 5x5 rhomb). - - All images must have the same width, height and format (8-bit gray, 16-bit UV, 24-bit BGR or 32-bit BGRA). - - \note This function is a C++ wrapper for function ::SimdMedianFilterRhomb5x5. - - \param [in] src - an original input image. - \param [out] dst - a filtered output image. - */ - template <template <class> class A> SIMD_INLINE void MedianFilterRhomb5x5(const View<A>& src, View<A>& dst) - { - assert(Compatible(src, dst) && src.ChannelSize() == 1); - - SimdMedianFilterRhomb5x5(src.data, src.stride, src.width, src.height, src.ChannelCount(), dst.data, dst.stride); - } - - /*! @ingroup median_filter - - \fn void MedianFilterSquare3x3(const View<A>& src, View<A>& dst) - - \short Performs median filtration of the input image (the filter window is a 3x3 square). - - All images must have the same width, height and format (8-bit gray, 16-bit UV, 24-bit BGR or 32-bit BGRA). - - \note This function is a C++ wrapper for function ::SimdMedianFilterSquare3x3. - - \param [in] src - an original input image. - \param [out] dst - a filtered output image. - */ - template <template <class> class A> SIMD_INLINE void MedianFilterSquare3x3(const View<A>& src, View<A>& dst) - { - assert(Compatible(src, dst) && src.ChannelSize() == 1); - - SimdMedianFilterSquare3x3(src.data, src.stride, src.width, src.height, src.ChannelCount(), dst.data, dst.stride); - } - - /*! @ingroup median_filter - - \fn void MedianFilterSquare5x5(const View<A>& src, View<A>& dst) - - \short Performs median filtration of the input image (the filter window is a 5x5 square). - - All images must have the same width, height and format (8-bit gray, 16-bit UV, 24-bit BGR or 32-bit BGRA). - - \note This function is a C++ wrapper for function ::SimdMedianFilterSquare5x5. - - \param [in] src - an original input image. - \param [out] dst - a filtered output image.
- */ - template <template <class> class A> SIMD_INLINE void MedianFilterSquare5x5(const View<A>& src, View<A>& dst) - { - assert(Compatible(src, dst) && src.ChannelSize() == 1); - - SimdMedianFilterSquare5x5(src.data, src.stride, src.width, src.height, src.ChannelCount(), dst.data, dst.stride); - } - - /*! @ingroup neural - - \fn void NeuralConvert(const View<A> & src, float * dst, size_t stride, bool inversion) - - \short Converts an 8-bit gray image to a 32-bit float array. - - The length of the output array must be equal to the area of the input image. - - For every point: - \verbatim - dst[i] = inversion ? (255 - src[i])/255 : src[i]/255; - \endverbatim - - \note This function is a C++ wrapper for function ::SimdNeuralConvert. - - \param [in] src - an input image. - \param [out] dst - a pointer to the output array. - \param [in] stride - a row size of the output array. - \param [in] inversion - a flag of color inversion. - */ - template <template <class> class A> SIMD_INLINE void NeuralConvert(const View<A> & src, float * dst, size_t stride, bool inversion) - { - assert(src.format == View<A>::Gray8); - - SimdNeuralConvert(src.data, src.stride, src.width, src.height, dst, stride, inversion ? 1 : 0); - } - - /*! @ingroup operation - - \fn void OperationBinary8u(const View<A>& a, const View<A>& b, View<A>& dst, SimdOperationBinary8uType type) - - \short Performs a given operation between two images. - - All images must have the same width, height and format (8-bit gray, 16-bit UV (UV plane of the NV12 pixel format), 24-bit BGR or 32-bit BGRA). - - \note This function is a C++ wrapper for function ::SimdOperationBinary8u. - - \param [in] a - a first input image. - \param [in] b - a second input image. - \param [out] dst - an output image. - \param [in] type - a type of operation (see ::SimdOperationBinary8uType). - */ - template <template <class> class A> SIMD_INLINE void OperationBinary8u(const View<A>& a, const View<A>& b, View<A>& dst, SimdOperationBinary8uType type) - { - assert(Compatible(a, b, dst) && a.ChannelSize() == 1); - - SimdOperationBinary8u(a.data, a.stride, b.data, b.stride, a.width, a.height, a.ChannelCount(), dst.data, dst.stride, type); - } - - /*! @ingroup operation - - \fn void OperationBinary16i(const View<A>& a, const View<A>& b, View<A>& dst, SimdOperationBinary16iType type) - - \short Performs a given operation between two images. - - All images must have the same width, height and the Simd::View::Int16 pixel format. - - \note This function is a C++ wrapper for function ::SimdOperationBinary16i. - - \param [in] a - a first input image. - \param [in] b - a second input image. - \param [out] dst - an output image. - \param [in] type - a type of operation (see ::SimdOperationBinary16iType). - */ - template <template <class> class A> SIMD_INLINE void OperationBinary16i(const View<A>& a, const View<A>& b, View<A>& dst, SimdOperationBinary16iType type) - { - assert(Compatible(a, b, dst) && a.format == View<A>::Int16); - - SimdOperationBinary16i(a.data, a.stride, b.data, b.stride, a.width, a.height, dst.data, dst.stride, type); - }
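For instance, averaging two frames of the same format is a single call. A minimal sketch, assuming the Simd C++ API (::SimdOperationBinary8uAverage is one of the operation types enumerated in ::SimdOperationBinary8uType; the image size is illustrative):
\verbatim
#include "Simd/SimdLib.hpp"

int main()
{
    typedef Simd::View<Simd::Allocator> View;

    View a(640, 480, View::Bgr24), b(640, 480, View::Bgr24), avg(640, 480, View::Bgr24);
    // ... fill a and b ...
    Simd::OperationBinary8u(a, b, avg, SimdOperationBinary8uAverage);
    return 0;
}
\endverbatim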
- /*! @ingroup operation - - \fn void VectorProduct(const uint8_t * vertical, const uint8_t * horizontal, View<A>& dst) - - \short Calculates a result 8-bit gray image as the product of two vectors. - - For all points: - \verbatim - dst[x, y] = horizontal[x]*vertical[y]/255; - \endverbatim - - \note This function is a C++ wrapper for function ::SimdVectorProduct. - - \param [in] vertical - a pointer to pixel data of the vertical vector. Its length is equal to the result image height. - \param [in] horizontal - a pointer to pixel data of the horizontal vector. Its length is equal to the result image width. - \param [out] dst - a result image. - */ - template <template <class> class A> SIMD_INLINE void VectorProduct(const uint8_t * vertical, const uint8_t * horizontal, View<A>& dst) - { - assert(dst.format == View<A>::Gray8); - - SimdVectorProduct(vertical, horizontal, dst.data, dst.stride, dst.width, dst.height); - } - - /*! @ingroup resizing - - \fn void ReduceGray2x2(const View<A>& src, View<A>& dst) - - \short Performs 2x reduction and Gaussian blurring of an 8-bit gray image using a 2x2 window. - - The input and output images must satisfy: dst.width = (src.width + 1)/2, dst.height = (src.height + 1)/2. - - For all points: - \verbatim - dst[x, y] = (src[2*x, 2*y] + src[2*x, 2*y + 1] + src[2*x + 1, 2*y] + src[2*x + 1, 2*y + 1] + 2)/4; - \endverbatim - - \note This function is a C++ wrapper for function ::SimdReduceGray2x2. - - \param [in] src - an original input image. - \param [out] dst - a reduced output image. - */ - template <template <class> class A> SIMD_INLINE void ReduceGray2x2(const View<A>& src, View<A>& dst) - { - assert(src.format == View<A>::Gray8 && dst.format == View<A>::Gray8 && Scale(src.Size()) == dst.Size()); - - SimdReduceGray2x2(src.data, src.width, src.height, src.stride, dst.data, dst.width, dst.height, dst.stride); - } - - /*! @ingroup resizing - - \fn void ReduceGray3x3(const View<A>& src, View<A>& dst, bool compensation = true) - - \short Performs 2x reduction and Gaussian blurring of an 8-bit gray image using a 3x3 window. - - The input and output images must satisfy: dst.width = (src.width + 1)/2, dst.height = (src.height + 1)/2. - - For every point: - \verbatim - dst[x, y] = (src[2*x-1, 2*y-1] + 2*src[2*x, 2*y-1] + src[2*x+1, 2*y-1] + - 2*(src[2*x-1, 2*y] + 2*src[2*x, 2*y] + src[2*x+1, 2*y]) + - src[2*x-1, 2*y+1] + 2*src[2*x, 2*y+1] + src[2*x+1, 2*y+1] + (compensation ? 8 : 0)) / 16; - \endverbatim - - \note This function is a C++ wrapper for function ::SimdReduceGray3x3. - - \param [in] src - an original input image. - \param [out] dst - a reduced output image. - \param [in] compensation - a flag of rounding compensation. It is equal to 'true' by default. - */ - template <template <class> class A> SIMD_INLINE void ReduceGray3x3(const View<A>& src, View<A>& dst, bool compensation = true) - { - assert(src.format == View<A>::Gray8 && dst.format == View<A>::Gray8 && Scale(src.Size()) == dst.Size()); - - SimdReduceGray3x3(src.data, src.width, src.height, src.stride, dst.data, dst.width, dst.height, dst.stride, compensation ? 1 : 0); - } - - /*! @ingroup resizing - - \fn void ReduceGray4x4(const View<A>& src, View<A>& dst) - - \short Performs 2x reduction and Gaussian blurring of an 8-bit gray image using a 4x4 window. - - The input and output images must satisfy: dst.width = (src.width + 1)/2, dst.height = (src.height + 1)/2. - - For every point: - \verbatim - dst[x, y] = (src[2*x-1, 2*y-1] + 3*src[2*x, 2*y-1] + 3*src[2*x+1, 2*y-1] + src[2*x+2, 2*y-1] + - 3*(src[2*x-1, 2*y] + 3*src[2*x, 2*y] + 3*src[2*x+1, 2*y] + src[2*x+2, 2*y]) + - 3*(src[2*x-1, 2*y+1] + 3*src[2*x, 2*y+1] + 3*src[2*x+1, 2*y+1] + src[2*x+2, 2*y+1]) + - src[2*x-1, 2*y+2] + 3*src[2*x, 2*y+2] + 3*src[2*x+1, 2*y+2] + src[2*x+2, 2*y+2] + 32) / 64; - \endverbatim - - \note This function is a C++ wrapper for function ::SimdReduceGray4x4. - - \param [in] src - an original input image. - \param [out] dst - a reduced output image.
- */ - template <template <class> class A> SIMD_INLINE void ReduceGray4x4(const View<A>& src, View<A>& dst) - { - assert(src.format == View<A>::Gray8 && dst.format == View<A>::Gray8 && Scale(src.Size()) == dst.Size()); - - SimdReduceGray4x4(src.data, src.width, src.height, src.stride, dst.data, dst.width, dst.height, dst.stride); - } - - /*! @ingroup resizing - - \fn void ReduceGray5x5(const View<A>& src, View<A>& dst, bool compensation = true) - - \short Performs 2x reduction and Gaussian blurring of an 8-bit gray image using a 5x5 window. - - The input and output images must satisfy: dst.width = (src.width + 1)/2, dst.height = (src.height + 1)/2. - - For every point: - \verbatim - dst[x, y] = ( - src[2*x-2, 2*y-2] + 4*src[2*x-1, 2*y-2] + 6*src[2*x, 2*y-2] + 4*src[2*x+1, 2*y-2] + src[2*x+2, 2*y-2] + - 4*(src[2*x-2, 2*y-1] + 4*src[2*x-1, 2*y-1] + 6*src[2*x, 2*y-1] + 4*src[2*x+1, 2*y-1] + src[2*x+2, 2*y-1]) + - 6*(src[2*x-2, 2*y] + 4*src[2*x-1, 2*y] + 6*src[2*x, 2*y] + 4*src[2*x+1, 2*y] + src[2*x+2, 2*y]) + - 4*(src[2*x-2, 2*y+1] + 4*src[2*x-1, 2*y+1] + 6*src[2*x, 2*y+1] + 4*src[2*x+1, 2*y+1] + src[2*x+2, 2*y+1]) + - src[2*x-2, 2*y+2] + 4*src[2*x-1, 2*y+2] + 6*src[2*x, 2*y+2] + 4*src[2*x+1, 2*y+2] + src[2*x+2, 2*y+2] + - (compensation ? 128 : 0)) / 256; - \endverbatim - - \note This function is a C++ wrapper for function ::SimdReduceGray5x5. - - \param [in] src - an original input image. - \param [out] dst - a reduced output image. - \param [in] compensation - a flag of rounding compensation. It is equal to 'true' by default. - */ - template <template <class> class A> SIMD_INLINE void ReduceGray5x5(const View<A>& src, View<A>& dst, bool compensation = true) - { - assert(src.format == View<A>::Gray8 && dst.format == View<A>::Gray8 && Scale(src.Size()) == dst.Size()); - - SimdReduceGray5x5(src.data, src.width, src.height, src.stride, dst.data, dst.width, dst.height, dst.stride, compensation ? 1 : 0); - } - - /*! @ingroup resizing - - \fn void ReduceGray(const View<A> & src, View<A> & dst, ::SimdReduceType reduceType, bool compensation = true) - - \short Performs 2x reduction and Gaussian blurring of an 8-bit gray image. - - The input and output images must satisfy: dst.width = (src.width + 1)/2, dst.height = (src.height + 1)/2. - - \param [in] src - an original input image. - \param [out] dst - a reduced output image. - \param [in] reduceType - a type of function used for image reducing. - \param [in] compensation - a flag of rounding compensation. It is relevant only for ::SimdReduce3x3 and ::SimdReduce5x5. It is equal to 'true' by default. - */ - template <template <class> class A> SIMD_INLINE void ReduceGray(const View<A> & src, View<A> & dst, ::SimdReduceType reduceType, bool compensation = true) - { - assert(src.format == View<A>::Gray8 && dst.format == View<A>::Gray8 && Scale(src.Size()) == dst.Size()); - - switch (reduceType) - { - case SimdReduce2x2: - Simd::ReduceGray2x2(src, dst); - break; - case SimdReduce3x3: - Simd::ReduceGray3x3(src, dst, compensation); - break; - case SimdReduce4x4: - Simd::ReduceGray4x4(src, dst); - break; - case SimdReduce5x5: - Simd::ReduceGray5x5(src, dst, compensation); - break; - default: - assert(0); - } - }
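Repeated application of ReduceGray yields a classic gray-scale pyramid. A minimal sketch, assuming the Simd C++ API (the level count, window choice and image size are illustrative):
\verbatim
#include "Simd/SimdLib.hpp"
#include <vector>

int main()
{
    typedef Simd::View<Simd::Allocator> View;

    View src(640, 480, View::Gray8);
    // ... fill src ...
    std::vector<View> pyramid(4);
    pyramid[0].Recreate(640, 480, View::Gray8);
    Simd::Copy(src, pyramid[0]);
    for (size_t i = 1; i < pyramid.size(); ++i)
    {
        pyramid[i].Recreate((pyramid[i - 1].width + 1) / 2,
            (pyramid[i - 1].height + 1) / 2, View::Gray8);
        Simd::ReduceGray(pyramid[i - 1], pyramid[i], ::SimdReduce3x3);
    }
    return 0;
}
\endverbatim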
- /*! @ingroup resizing - - \fn void Reduce2x2(const View<A> & src, View<A> & dst) - - \short Performs 2x reduction of an image. - - The input and output images must satisfy: dst.width = (src.width + 1)/2, dst.height = (src.height + 1)/2. - - \param [in] src - an original input image. - \param [out] dst - a reduced output image. - */ - template <template <class> class A> SIMD_INLINE void Reduce2x2(const View<A> & src, View<A> & dst) - { - assert(src.format == dst.format && Scale(src.Size()) == dst.Size() && src.ChannelSize() == 1); - - SimdReduceColor2x2(src.data, src.width, src.height, src.stride, dst.data, dst.width, dst.height, dst.stride, src.ChannelCount()); - } - - /*! @ingroup resizing - - \fn void ResizeBilinear(const View<A>& src, View<A>& dst) - - \short Performs resizing of an input image using bilinear interpolation. - - All images must have the same format (8-bit gray, 16-bit UV, 24-bit BGR or 32-bit BGRA). - - \note This function is a C++ wrapper for function ::SimdResizeBilinear. - - \param [in] src - an original input image. - \param [out] dst - a resized output image. - */ - template <template <class> class A> SIMD_INLINE void ResizeBilinear(const View<A> & src, View<A> & dst) - { - assert(src.format == dst.format && src.ChannelSize() == 1); - - if (EqualSize(src, dst)) - { - Copy(src, dst); - } - else - { - SimdResizeBilinear(src.data, src.width, src.height, src.stride, - dst.data, dst.width, dst.height, dst.stride, src.ChannelCount()); - } - } - - /*! @ingroup resizing - - \fn void ResizeAreaGray(const View<A> & src, View<A> & dst) - - \short Performs resizing of an input image using area interpolation. - - All images must have the same format (8-bit gray). - - \param [in] src - an original input image. - \param [out] dst - a resized output image. - */ - template <template <class> class A> SIMD_INLINE void ResizeAreaGray(const View<A> & src, View<A> & dst) - { - assert(src.format == dst.format && src.format == View<A>::Gray8); - - if (EqualSize(src, dst)) - { - Copy(src, dst); - } - else - { - size_t level = 0; - for (; (dst.width << (level + 1)) < (size_t)src.width; level++); - Point<ptrdiff_t> size = src.Size() << level; - if (level) - { - Pyramid<A> pyramid(size, level + 1); - Simd::ResizeBilinear(src, pyramid[0]); - for (size_t i = 0; i < level; ++i) - Simd::ReduceGray(pyramid.At(i), pyramid.At(i + 1), ::SimdReduce2x2); - Simd::Copy(pyramid[level], dst); - } - else - Simd::ResizeBilinear(src, dst); - } - } - - /*! @ingroup resizing - - \fn void ResizeArea(const View<A> & src, View<A> & dst) - - \short Performs resizing of an input image using area interpolation. - - All images must have the same format. - - \param [in] src - an original input image. - \param [out] dst - a resized output image. - */ - template <template <class> class A> SIMD_INLINE void ResizeArea(const View<A> & src, View<A> & dst) - { - assert(src.format == dst.format); - - if (EqualSize(src, dst)) - { - Copy(src, dst); - } - else - { - size_t level = 0; - for (; (dst.width << (level + 1)) < (size_t)src.width; level++); - Point<ptrdiff_t> size = src.Size() << level; - if (level) - { - std::vector<View<A> > pyramid(level); - pyramid[0].Recreate(size, src.format); - Simd::ResizeBilinear(src, pyramid[0]); - for (size_t i = 1; i < level; ++i) - { - size = Simd::Scale(size); - pyramid[i].Recreate(size, src.format); - Simd::Reduce2x2(pyramid[i - 1], pyramid[i]); - } - Simd::Reduce2x2(pyramid[level - 1], dst); - } - else - Simd::ResizeBilinear(src, dst); - } - }
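As the code above shows, ResizeArea trades speed for quality on strong downscales: it first resizes bilinearly to a power-of-two multiple of the target size and then reduces through a 2x pyramid, which suppresses the aliasing a direct bilinear downscale would produce. A minimal usage sketch, assuming the Simd C++ API (the sizes are illustrative):
\verbatim
#include "Simd/SimdLib.hpp"

int main()
{
    typedef Simd::View<Simd::Allocator> View;

    View src(1920, 1080, View::Bgr24), dst(320, 180, View::Bgr24);
    // ... fill src ...
    Simd::ResizeArea(src, dst); // 6x downscale via a 2x pyramid
    return 0;
}
\endverbatim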
- /*! @ingroup resizing - - \fn void Resize(const View<A> & src, View<A> & dst, ::SimdResizeMethodType method = ::SimdResizeMethodBilinear) - - \short Performs resizing of an image. - - All images must have the same format. - - \param [in] src - an original input image. - \param [out] dst - a resized output image. - \param [in] method - a resizing method. By default it is equal to ::SimdResizeMethodBilinear. - */ - template <template <class> class A> SIMD_INLINE void Resize(const View<A> & src, View<A> & dst, ::SimdResizeMethodType method = ::SimdResizeMethodBilinear) - { - assert(src.format == dst.format && (src.format == View<A>::Float || src.ChannelSize() == 1)); - - if (EqualSize(src, dst)) - { - Copy(src, dst); - } - else - { - SimdResizeChannelType type = src.format == View<A>::Float ? SimdResizeChannelFloat : SimdResizeChannelByte; - void * resizer = SimdResizerInit(src.width, src.height, dst.width, dst.height, src.ChannelCount(), type, method); - if (resizer) - { - SimdResizerRun(resizer, src.data, src.stride, dst.data, dst.stride); - SimdRelease(resizer); - } - else - assert(0); - } - } - - /*! @ingroup rgb_conversion - - \fn void RgbToGray(const View<A>& rgb, View<A>& gray) - - \short Converts a 24-bit RGB image to an 8-bit gray image. - - All images must have the same width and height. - - \note This function is a C++ wrapper for function ::SimdRgbToGray. - - \param [in] rgb - an input 24-bit RGB image. - \param [out] gray - an output 8-bit gray image. - */ - template <template <class> class A> SIMD_INLINE void RgbToGray(const View<A>& rgb, View<A>& gray) - { - assert(EqualSize(rgb, gray) && rgb.format == View<A>::Rgb24 && gray.format == View<A>::Gray8); - - SimdRgbToGray(rgb.data, rgb.width, rgb.height, rgb.stride, gray.data, gray.stride); - } - - /*! @ingroup rgb_conversion - - \fn void RgbToBgra(const View<A>& rgb, View<A>& bgra, uint8_t alpha = 0xFF) - - \short Converts a 24-bit RGB image to a 32-bit BGRA image. - - All images must have the same width and height. - - \note This function is a C++ wrapper for function ::SimdRgbToBgra. - - \param [in] rgb - an input 24-bit RGB image. - \param [out] bgra - an output 32-bit BGRA image. - \param [in] alpha - a value of the alpha channel. It is equal to 255 by default. - */ - template <template <class> class A> SIMD_INLINE void RgbToBgra(const View<A>& rgb, View<A>& bgra, uint8_t alpha = 0xFF) - { - assert(EqualSize(rgb, bgra) && bgra.format == View<A>::Bgra32 && rgb.format == View<A>::Rgb24); - - SimdRgbToBgra(rgb.data, rgb.width, rgb.height, rgb.stride, bgra.data, bgra.stride, alpha); - } - - /*! @ingroup segmentation - - \fn void SegmentationChangeIndex(View<A> & mask, uint8_t oldIndex, uint8_t newIndex) - - \short Changes a certain index in a mask. - - The mask must have the 8-bit gray pixel format. - - For every point: - \verbatim - if(mask[i] == oldIndex) - mask[i] = newIndex; - \endverbatim - - \note This function is a C++ wrapper for function ::SimdSegmentationChangeIndex. - - \param [in, out] mask - an 8-bit gray mask image. - \param [in] oldIndex - an old mask index. - \param [in] newIndex - a new mask index. - */ - template <template <class> class A> SIMD_INLINE void SegmentationChangeIndex(View<A> & mask, uint8_t oldIndex, uint8_t newIndex) - { - assert(mask.format == View<A>::Gray8); - - SimdSegmentationChangeIndex(mask.data, mask.stride, mask.width, mask.height, oldIndex, newIndex); - } - - /*! @ingroup segmentation - - \fn void SegmentationFillSingleHoles(View<A> & mask, uint8_t index) - - \short Fills single holes in a mask. - - The mask must have the 8-bit gray pixel format. - - \note This function is a C++ wrapper for function ::SimdSegmentationFillSingleHoles. - - \param [in, out] mask - an 8-bit gray mask image. - \param [in] index - a mask index. - */ - template <template <class> class A> SIMD_INLINE void SegmentationFillSingleHoles(View<A> & mask, uint8_t index) - { - assert(mask.format == View<A>::Gray8 && mask.width > 2 && mask.height > 2); - - SimdSegmentationFillSingleHoles(mask.data, mask.stride, mask.width, mask.height, index); - }
/*! @ingroup segmentation - - \fn void SegmentationPropagate2x2(const View<A> & parent, View<A> & child, const View<A> & difference, uint8_t currentIndex, uint8_t invalidIndex, uint8_t emptyIndex, uint8_t differenceThreshold) - - \short Propagates a mask index from the parent (upper) to the child (lower) level of a mask pyramid using a 2x2 scan window. - - The parent and child images must satisfy: parent.width = (child.width + 1)/2, parent.height = (child.height + 1)/2. - All images must have the 8-bit gray pixel format. The size of the difference image is equal to that of the child image. - - \note This function is a C++ wrapper for function ::SimdSegmentationPropagate2x2. - - \param [in] parent - an 8-bit gray parent mask image. - \param [in, out] child - an 8-bit gray child mask image. - \param [in] difference - an 8-bit gray difference image. - \param [in] currentIndex - a propagated mask index. - \param [in] invalidIndex - an invalid mask index. - \param [in] emptyIndex - an empty mask index. - \param [in] differenceThreshold - a difference threshold for conditional index propagation. - */ - template <template <class> class A> SIMD_INLINE void SegmentationPropagate2x2(const View<A> & parent, View<A> & child, const View<A> & difference, uint8_t currentIndex, uint8_t invalidIndex, uint8_t emptyIndex, uint8_t differenceThreshold) - { - assert(parent.format == View<A>::Gray8 && parent.width >= 2 && parent.height >= 2); - assert((child.width + 1) / 2 == parent.width && (child.height + 1) / 2 == parent.height); - assert(Compatible(child, difference) && child.format == View<A>::Gray8); - - SimdSegmentationPropagate2x2(parent.data, parent.stride, parent.width, parent.height, child.data, child.stride, - difference.data, difference.stride, currentIndex, invalidIndex, emptyIndex, differenceThreshold); - } - - /*! @ingroup segmentation - - \fn void SegmentationShrinkRegion(const View<A> & mask, uint8_t index, Rectangle<ptrdiff_t> & rect) - - \short Finds the actual region where a given mask index is located. - - The mask must have the 8-bit gray pixel format. - - \note This function is a C++ wrapper for function ::SimdSegmentationShrinkRegion. - - \param [in] mask - an 8-bit gray mask image. - \param [in] index - a mask index. - \param [in, out] rect - a bounding box rectangle of the region. - */ - template <template <class> class A> SIMD_INLINE void SegmentationShrinkRegion(const View<A> & mask, uint8_t index, Rectangle<ptrdiff_t> & rect) - { - assert(mask.format == View<A>::Gray8); - assert(rect.Width() > 0 && rect.Height() > 0 && Rectangle<ptrdiff_t>(mask.Size()).Contains(rect)); - - SimdSegmentationShrinkRegion(mask.data, mask.stride, mask.width, mask.height, index, &rect.left, &rect.top, &rect.right, &rect.bottom); - }
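A typical pattern initializes the rectangle to the whole frame and lets SegmentationShrinkRegion shrink it to the tight bounding box of the given index. A minimal sketch, assuming the Simd C++ API (the index value is illustrative):
\verbatim
#include "Simd/SimdLib.hpp"

int main()
{
    typedef Simd::View<Simd::Allocator> View;

    View mask(640, 480, View::Gray8);
    // ... fill mask with object indices ...
    Simd::Rectangle<ptrdiff_t> rect(mask.Size()); // start from the whole frame
    Simd::SegmentationShrinkRegion(mask, 1, rect);
    // rect now bounds all pixels with mask value 1
    return 0;
}
\endverbatim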
- /*! @ingroup shifting - - \fn void ShiftBilinear(const View<A> & src, const View<A> & bkg, const Point<double> & shift, const Rectangle<ptrdiff_t> & crop, View<A> & dst) - - \short Performs shifting of an input image using bilinear interpolation. - - All images must have the same width, height and format (8-bit gray, 16-bit UV, 24-bit BGR or 32-bit BGRA). - - \note This function is a C++ wrapper for function ::SimdShiftBilinear. - - \param [in] src - a foreground input image. - \param [in] bkg - a background input image. - \param [in] shift - an image shift. - \param [in] crop - a crop rectangle. - \param [out] dst - an output image. - */ - template <template <class> class A> SIMD_INLINE void ShiftBilinear(const View<A> & src, const View<A> & bkg, const Point<double> & shift, const Rectangle<ptrdiff_t> & crop, View<A> & dst) - { - assert(Compatible(src, bkg, dst) && src.ChannelSize() == 1); - - SimdShiftBilinear(src.data, src.stride, src.width, src.height, src.ChannelCount(), bkg.data, bkg.stride, - &shift.x, &shift.y, crop.left, crop.top, crop.right, crop.bottom, dst.data, dst.stride); - } - - /*! @ingroup sobel_filter - - \fn void SobelDx(const View<A>& src, View<A>& dst) - - \short Calculates Sobel's filter along the x axis. - - All images must have the same width and height. The input image must have the 8-bit gray format and the output image must have the 16-bit integer format. - - For every point: - \verbatim - dst[x, y] = (src[x+1, y-1] + 2*src[x+1, y] + src[x+1, y+1]) - (src[x-1, y-1] + 2*src[x-1, y] + src[x-1, y+1]); - \endverbatim - - \note This function is a C++ wrapper for function ::SimdSobelDx. - - \param [in] src - an input image. - \param [out] dst - an output image. - */ - template <template <class> class A> SIMD_INLINE void SobelDx(const View<A>& src, View<A>& dst) - { - assert(EqualSize(src, dst) && src.format == View<A>::Gray8 && dst.format == View<A>::Int16); - - SimdSobelDx(src.data, src.stride, src.width, src.height, dst.data, dst.stride); - } - - /*! @ingroup sobel_filter - - \fn void SobelDxAbs(const View<A>& src, View<A>& dst) - - \short Calculates the absolute value of Sobel's filter along the x axis. - - All images must have the same width and height. The input image must have the 8-bit gray format and the output image must have the 16-bit integer format. - - For every point: - \verbatim - dst[x, y] = abs((src[x+1, y-1] + 2*src[x+1, y] + src[x+1, y+1]) - (src[x-1, y-1] + 2*src[x-1, y] + src[x-1, y+1])); - \endverbatim - - \note This function is a C++ wrapper for function ::SimdSobelDxAbs. - - \param [in] src - an input image. - \param [out] dst - an output image. - */ - template <template <class> class A> SIMD_INLINE void SobelDxAbs(const View<A>& src, View<A>& dst) - { - assert(EqualSize(src, dst) && src.format == View<A>::Gray8 && dst.format == View<A>::Int16); - - SimdSobelDxAbs(src.data, src.stride, src.width, src.height, dst.data, dst.stride); - } - - /*! @ingroup sobel_statistic - - \fn void SobelDxAbsSum(const View<A>& src, uint64_t & sum) - - \short Calculates the sum of the absolute value of Sobel's filter along the x axis. - - The input image must have the 8-bit gray format. - - For every point: - \verbatim - sum += abs((src[x+1, y-1] + 2*src[x+1, y] + src[x+1, y+1]) - (src[x-1, y-1] + 2*src[x-1, y] + src[x-1, y+1])); - \endverbatim - - \note This function is a C++ wrapper for function ::SimdSobelDxAbsSum. - - \param [in] src - an input image. - \param [out] sum - an unsigned 64-bit integer value with the result sum. - */ - template <template <class> class A> SIMD_INLINE void SobelDxAbsSum(const View<A>& src, uint64_t & sum) - { - assert(src.format == View<A>::Gray8); - - SimdSobelDxAbsSum(src.data, src.stride, src.width, src.height, &sum); - } - - /*! @ingroup sobel_filter - - \fn void SobelDy(const View<A>& src, View<A>& dst) - - \short Calculates Sobel's filter along the y axis. - - All images must have the same width and height. The input image must have the 8-bit gray format and the output image must have the 16-bit integer format. - - For every point: - \verbatim - dst[x, y] = (src[x-1, y+1] + 2*src[x, y+1] + src[x+1, y+1]) - (src[x-1, y-1] + 2*src[x, y-1] + src[x+1, y-1]); - \endverbatim - - \note This function is a C++ wrapper for function ::SimdSobelDy. - - \param [in] src - an input image. - \param [out] dst - an output image.
- */ - template <template <class> class A> SIMD_INLINE void SobelDy(const View<A>& src, View<A>& dst) - { - assert(EqualSize(src, dst) && src.format == View<A>::Gray8 && dst.format == View<A>::Int16); - - SimdSobelDy(src.data, src.stride, src.width, src.height, dst.data, dst.stride); - } - - /*! @ingroup sobel_filter - - \fn void SobelDyAbs(const View<A>& src, View<A>& dst) - - \short Calculates the absolute value of Sobel's filter along the y axis. - - All images must have the same width and height. The input image must have the 8-bit gray format and the output image must have the 16-bit integer format. - - For every point: - \verbatim - dst[x, y] = abs((src[x-1, y+1] + 2*src[x, y+1] + src[x+1, y+1]) - (src[x-1, y-1] + 2*src[x, y-1] + src[x+1, y-1])); - \endverbatim - - \note This function is a C++ wrapper for function ::SimdSobelDyAbs. - - \param [in] src - an input image. - \param [out] dst - an output image. - */ - template <template <class> class A> SIMD_INLINE void SobelDyAbs(const View<A>& src, View<A>& dst) - { - assert(EqualSize(src, dst) && src.format == View<A>::Gray8 && dst.format == View<A>::Int16); - - SimdSobelDyAbs(src.data, src.stride, src.width, src.height, dst.data, dst.stride); - } - - /*! @ingroup sobel_statistic - - \fn void SobelDyAbsSum(const View<A>& src, uint64_t & sum) - - \short Calculates the sum of the absolute value of Sobel's filter along the y axis. - - The input image must have the 8-bit gray format. - - For every point: - \verbatim - sum += abs((src[x-1, y+1] + 2*src[x, y+1] + src[x+1, y+1]) - (src[x-1, y-1] + 2*src[x, y-1] + src[x+1, y-1])); - \endverbatim - - \note This function is a C++ wrapper for function ::SimdSobelDyAbsSum. - - \param [in] src - an input image. - \param [out] sum - an unsigned 64-bit integer value with the result sum. - */ - template <template <class> class A> SIMD_INLINE void SobelDyAbsSum(const View<A>& src, uint64_t & sum) - { - assert(src.format == View<A>::Gray8); - - SimdSobelDyAbsSum(src.data, src.stride, src.width, src.height, &sum); - } - - /*! @ingroup contour - - \fn void ContourMetrics(const View<A>& src, View<A>& dst) - - \short Calculates contour metrics based on the absolute value and direction of Sobel's filter along the x and y axes. - - All images must have the same width and height. The input image must have the 8-bit gray format and the output image must have the 16-bit integer format. - This function is used for contour extraction. - - For every point: - \verbatim - dy = abs((src[x-1, y+1] + 2*src[x, y+1] + src[x+1, y+1]) - (src[x-1, y-1] + 2*src[x, y-1] + src[x+1, y-1])); - dx = abs((src[x+1, y-1] + 2*src[x+1, y] + src[x+1, y+1]) - (src[x-1, y-1] + 2*src[x-1, y] + src[x-1, y+1])); - dst[x, y] = (dx + dy)*2 + (dx >= dy ? 0 : 1); - \endverbatim - - \note This function is a C++ wrapper for function ::SimdContourMetrics. - - \param [in] src - a gray 8-bit input image. - \param [out] dst - an output 16-bit image. - */ - template <template <class> class A> SIMD_INLINE void ContourMetrics(const View<A>& src, View<A>& dst) - { - assert(EqualSize(src, dst) && src.format == View<A>::Gray8 && dst.format == View<A>::Int16); - - SimdContourMetrics(src.data, src.stride, src.width, src.height, dst.data, dst.stride); - }
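ContourMetrics pairs with ContourAnchors (declared further below) to form a simple contour extraction pipeline: compute the packed metrics first, then thin them to anchor points. A minimal sketch, assuming the Simd C++ API (the row step and threshold values are illustrative):
\verbatim
#include "Simd/SimdLib.hpp"

int main()
{
    typedef Simd::View<Simd::Allocator> View;

    View src(640, 480, View::Gray8);
    // ... fill src ...
    View metrics(640, 480, View::Int16);
    View anchors(640, 480, View::Gray8);

    Simd::ContourMetrics(src, metrics);            // gradient magnitude + direction bit
    Simd::ContourAnchors(metrics, 2, 10, anchors); // row step 2, anchor threshold 10
    return 0;
}
\endverbatim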
- /*! @ingroup contour - - \fn void ContourMetrics(const View<A>& src, const View<A>& mask, uint8_t indexMin, View<A>& dst) - - \short Calculates contour metrics based on the absolute value and direction of Sobel's filter along the x and y axes using a mask. - - All images must have the same width and height. The input image must have the 8-bit gray format and the output image must have the 16-bit integer format. - This function is used for contour extraction. - - For every point: - \verbatim - dy = abs((src[x-1, y+1] + 2*src[x, y+1] + src[x+1, y+1]) - (src[x-1, y-1] + 2*src[x, y-1] + src[x+1, y-1])); - dx = abs((src[x+1, y-1] + 2*src[x+1, y] + src[x+1, y+1]) - (src[x-1, y-1] + 2*src[x-1, y] + src[x-1, y+1])); - dst[x, y] = mask[x, y] < indexMin ? 0 : (dx + dy)*2 + (dx >= dy ? 0 : 1); - \endverbatim - - \note This function is a C++ wrapper for function ::SimdContourMetricsMasked. - - \param [in] src - a gray 8-bit input image. - \param [in] mask - an 8-bit mask image. - \param [in] indexMin - a minimal permissible mask index. - \param [out] dst - an output 16-bit image. - */ - template <template <class> class A> SIMD_INLINE void ContourMetrics(const View<A>& src, const View<A>& mask, uint8_t indexMin, View<A>& dst) - { - assert(Compatible(src, mask) && EqualSize(src, dst) && src.format == View<A>::Gray8 && dst.format == View<A>::Int16); - - SimdContourMetricsMasked(src.data, src.stride, src.width, src.height, mask.data, mask.stride, indexMin, dst.data, dst.stride); - } - - /*! @ingroup contour - - \fn void ContourAnchors(const View<A>& src, size_t step, int16_t threshold, View<A>& dst) - - \short Extracts contour anchors from contour metrics. - - All images must have the same width and height. The input image must have the 16-bit integer format and the output image must have the 8-bit gray format. - The input image with metrics can be computed using the ::SimdContourMetrics or ::SimdContourMetricsMasked functions. - This function is used for contour extraction. - - For every point (except the border): - \verbatim - a[x, y] = src[x, y] >> 1; - if(src[x, y] & 1) - dst[x, y] = a[x, y] > 0 && (a[x, y] - a[x + 1, y] >= threshold) && (a[x, y] - a[x - 1, y] >= threshold) ? 255 : 0; - else - dst[x, y] = a[x, y] > 0 && (a[x, y] - a[x, y + 1] >= threshold) && (a[x, y] - a[x, y - 1] >= threshold) ? 255 : 0; - \endverbatim - - \note This function is a C++ wrapper for function ::SimdContourAnchors. - - \param [in] src - a 16-bit input image. - \param [in] step - a row step (to skip some rows). - \param [in] threshold - a threshold of anchor creation. - \param [out] dst - an output 8-bit gray image. - */ - template <template <class> class A> SIMD_INLINE void ContourAnchors(const View<A>& src, size_t step, int16_t threshold, View<A>& dst) - { - assert(EqualSize(src, dst) && src.format == View<A>::Int16 && dst.format == View<A>::Gray8); - - SimdContourAnchors(src.data, src.stride, src.width, src.height, step, threshold, dst.data, dst.stride); - } - - /*! @ingroup correlation - - \fn void SquaredDifferenceSum(const View<A>& a, const View<A>& b, uint64_t & sum) - - \short Calculates the sum of squared differences for two 8-bit gray images. - - All images must have the same width and height. - - For every point: - \verbatim - sum += (a[i] - b[i])*(a[i] - b[i]); - \endverbatim - - \note This function is a C++ wrapper for function ::SimdSquaredDifferenceSum. - - \param [in] a - a first image. - \param [in] b - a second image. - \param [out] sum - a reference to an unsigned 64-bit integer value with the result sum. - */ - template <template <class> class A> SIMD_INLINE void SquaredDifferenceSum(const View<A>& a, const View<A>& b, uint64_t & sum) - { - assert(Compatible(a, b) && a.format == View<A>::Gray8); - - SimdSquaredDifferenceSum(a.data, a.stride, b.data, b.stride, a.width, a.height, &sum); - }
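SquaredDifferenceSum gives a cheap dissimilarity score, for instance for frame-change detection: normalize by the pixel count and compare against a threshold. A minimal sketch of the arithmetic, assuming the Simd C++ API (the threshold is illustrative):
\verbatim
#include "Simd/SimdLib.hpp"

int main()
{
    typedef Simd::View<Simd::Allocator> View;

    View prev(640, 480, View::Gray8), curr(640, 480, View::Gray8);
    // ... fill prev and curr ...
    uint64_t ssd = 0;
    Simd::SquaredDifferenceSum(prev, curr, ssd);
    double mse = double(ssd) / (prev.width * prev.height);
    bool changed = mse > 25.0; // tune per application
    (void)changed;
    return 0;
}
\endverbatim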
- /*! @ingroup correlation - - \fn void SquaredDifferenceSum(const View<A>& a, const View<A>& b, const View<A>& mask, uint8_t index, uint64_t & sum) - - \short Calculates the sum of squared differences for two images using a mask. - - All images must have the same width, height and format (8-bit gray). - - For every point: - \verbatim - if(mask[i] == index) - sum += (a[i] - b[i])*(a[i] - b[i]); - \endverbatim - - \note This function is a C++ wrapper for function ::SimdSquaredDifferenceSumMasked. - - \param [in] a - a first image. - \param [in] b - a second image. - \param [in] mask - a mask image. - \param [in] index - a mask index. - \param [out] sum - a reference to an unsigned 64-bit integer value with the result sum. - */ - template <template <class> class A> SIMD_INLINE void SquaredDifferenceSum(const View<A>& a, const View<A>& b, const View<A>& mask, uint8_t index, uint64_t & sum) - { - assert(Compatible(a, b, mask) && a.format == View<A>::Gray8); - - SimdSquaredDifferenceSumMasked(a.data, a.stride, b.data, b.stride, mask.data, mask.stride, index, a.width, a.height, &sum); - } - - /*! @ingroup other_statistic - - \fn void GetStatistic(const View<A>& src, uint8_t & min, uint8_t & max, uint8_t & average) - - \short Finds the minimal, maximal and average pixel values for a given image. - - The image must have the 8-bit gray format. - - \note This function is a C++ wrapper for function ::SimdGetStatistic. - - \param [in] src - an input image. - \param [out] min - a reference to an unsigned 8-bit integer value with the found minimal pixel value. - \param [out] max - a reference to an unsigned 8-bit integer value with the found maximal pixel value. - \param [out] average - a reference to an unsigned 8-bit integer value with the found average pixel value. - */ - template <template <class> class A> SIMD_INLINE void GetStatistic(const View<A>& src, uint8_t & min, uint8_t & max, uint8_t & average) - { - assert(src.format == View<A>::Gray8); - - SimdGetStatistic(src.data, src.stride, src.width, src.height, &min, &max, &average); - } - - /*! @ingroup other_statistic - - \fn void GetMoments(const View<A>& mask, uint8_t index, uint64_t & area, uint64_t & x, uint64_t & y, uint64_t & xx, uint64_t & xy, uint64_t & yy) - - \short Calculates statistical characteristics (moments) of pixels with a given index. - - The image must have the 8-bit gray format. - - For every point: - \verbatim - if(mask[X, Y] == index) - { - area += 1; - x += X; - y += Y; - xx += X*X; - xy += X*Y; - yy += Y*Y; - } - \endverbatim - - \note This function is a C++ wrapper for function ::SimdGetMoments. - - \param [in] mask - a mask image. - \param [in] index - a mask index. - \param [out] area - a reference to an unsigned 64-bit integer value with the found area (number of pixels with the given index). - \param [out] x - a reference to an unsigned 64-bit integer value with the found first-order moment x. - \param [out] y - a reference to an unsigned 64-bit integer value with the found first-order moment y. - \param [out] xx - a reference to an unsigned 64-bit integer value with the found second-order moment xx. - \param [out] xy - a reference to an unsigned 64-bit integer value with the found second-order moment xy. - \param [out] yy - a reference to an unsigned 64-bit integer value with the found second-order moment yy. - */ - template <template <class> class A> SIMD_INLINE void GetMoments(const View<A>& mask, uint8_t index, uint64_t & area, uint64_t & x, uint64_t & y, uint64_t & xx, uint64_t & xy, uint64_t & yy) - { - assert(mask.format == View<A>::Gray8); - - SimdGetMoments(mask.data, mask.stride, mask.width, mask.height, index, &area, &x, &y, &xx, &xy, &yy); - }
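The raw moments convert directly into the centroid and the second-order central moments of the object. A minimal sketch of the arithmetic, assuming the Simd C++ API (the index value is illustrative):
\verbatim
#include "Simd/SimdLib.hpp"

int main()
{
    typedef Simd::View<Simd::Allocator> View;

    View mask(640, 480, View::Gray8);
    // ... fill mask ...
    uint64_t area = 0, x = 0, y = 0, xx = 0, xy = 0, yy = 0;
    Simd::GetMoments(mask, 1, area, x, y, xx, xy, yy);
    if (area)
    {
        double cx = double(x) / area, cy = double(y) / area; // centroid
        double mxx = double(xx) / area - cx * cx;            // central moment xx
        double mxy = double(xy) / area - cx * cy;            // central moment xy
        double myy = double(yy) / area - cy * cy;            // central moment yy
        (void)mxx; (void)mxy; (void)myy;
    }
    return 0;
}
\endverbatim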
- /*! @ingroup other_statistic - - \fn void GetObjectMoments(const View<A> & src, const View<A> & mask, uint8_t index, uint64_t & n, uint64_t & s, uint64_t & sx, uint64_t & sy, uint64_t & sxx, uint64_t & sxy, uint64_t & syy) - - \short Calculates statistical characteristics (moments) of a given object. - - The images must have the 8-bit gray format and equal size. One of them can be empty. - - For every point: - \verbatim - if(mask[X, Y] == index || mask == 0) - { - S = src ? src[X, Y] : 1; - n += 1; - s += S; - sx += S*X; - sy += S*Y; - sxx += S*X*X; - sxy += S*X*Y; - syy += S*Y*Y; - } - \endverbatim - - \note This function is a C++ wrapper for function ::SimdGetObjectMoments. - - \param [in] src - an input image. Can be empty. - \param [in] mask - a mask image. Can be empty. - \param [in] index - an object index. - \param [out] n - a reference to an unsigned 64-bit integer value with the found area of the given object. - \param [out] s - a reference to an unsigned 64-bit integer value with the sum of image values of the given object. - \param [out] sx - a reference to an unsigned 64-bit integer value with the found first-order moment x of the given object. - \param [out] sy - a reference to an unsigned 64-bit integer value with the found first-order moment y of the given object. - \param [out] sxx - a reference to an unsigned 64-bit integer value with the found second-order moment xx of the given object. - \param [out] sxy - a reference to an unsigned 64-bit integer value with the found second-order moment xy of the given object. - \param [out] syy - a reference to an unsigned 64-bit integer value with the found second-order moment yy of the given object. - */ - template <template <class> class A> SIMD_INLINE void GetObjectMoments(const View<A> & src, const View<A> & mask, uint8_t index, uint64_t & n, uint64_t & s, uint64_t & sx, uint64_t & sy, uint64_t & sxx, uint64_t & sxy, uint64_t & syy) - { - assert(src.format == View<A>::Empty || src.format == View<A>::Gray8); - assert(mask.format == View<A>::Empty || mask.format == View<A>::Gray8); - assert(src.format == View<A>::Gray8 || mask.format == View<A>::Gray8); - assert(src.format == mask.format ? EqualSize(src, mask) : true); - - if (src.format) - SimdGetObjectMoments(src.data, src.stride, src.width, src.height, mask.data, mask.stride, index, &n, &s, &sx, &sy, &sxx, &sxy, &syy); - else - SimdGetObjectMoments(src.data, src.stride, mask.width, mask.height, mask.data, mask.stride, index, &n, &s, &sx, &sy, &sxx, &sxy, &syy); - } - - /*! @ingroup row_statistic - - \fn void GetRowSums(const View<A>& src, uint32_t * sums) - - \short Calculates sums of rows for a given 8-bit gray image. - - For all rows: - \verbatim - for(x = 0; x < width; ++x) - sums[y] += src[x, y]; - \endverbatim - - \note This function is a C++ wrapper for function ::SimdGetRowSums. - - \param [in] src - an input image. - \param [out] sums - a pointer to an array of unsigned 32-bit integers with the result sums of rows. Its length must be equal to the image height. - */ - template <template <class> class A> SIMD_INLINE void GetRowSums(const View<A>& src, uint32_t * sums) - { - assert(src.format == View<A>::Gray8); - - SimdGetRowSums(src.data, src.stride, src.width, src.height, sums); - } - - /*! @ingroup col_statistic - - \fn void GetColSums(const View<A>& src, uint32_t * sums) - - \short Calculates sums of columns for a given 8-bit gray image. - - For all columns: - \verbatim - for(y = 0; y < height; ++y) - sums[x] += src[x, y]; - \endverbatim - - \note This function is a C++ wrapper for function ::SimdGetColSums. - - \param [in] src - an input image. - \param [out] sums - a pointer to an array of unsigned 32-bit integers with the result sums of columns. Its length must be equal to the image width. - */ - template <template <class> class A> SIMD_INLINE void GetColSums(const View<A>& src, uint32_t * sums) - { - assert(src.format == View<A>::Gray8); - - SimdGetColSums(src.data, src.stride, src.width, src.height, sums); - } - - /*!
@ingroup row_statistic - - \fn void GetAbsDyRowSums(const View<A>& src, uint32_t * sums) - - \short Calculates sums of the absolute derivative along the y axis for rows of a given 8-bit gray image. - - For all rows except the last: - \verbatim - for(x = 0; x < width; ++x) - sums[y] += abs(src[x, y+1] - src[x, y]); - \endverbatim - For the last row: - \verbatim - sums[height-1] = 0; - \endverbatim - - \note This function is a C++ wrapper for function ::SimdGetAbsDyRowSums. - - \param [in] src - an input image. - \param [out] sums - a pointer to an array of unsigned 32-bit integers with the result sums. Its length must be equal to the image height. - */ - template <template <class> class A> SIMD_INLINE void GetAbsDyRowSums(const View<A>& src, uint32_t * sums) - { - assert(src.format == View<A>::Gray8); - - SimdGetAbsDyRowSums(src.data, src.stride, src.width, src.height, sums); - } - - /*! @ingroup col_statistic - - \fn void GetAbsDxColSums(const View<A>& src, uint32_t * sums) - - \short Calculates sums of the absolute derivative along the x axis for columns of a given 8-bit gray image. - - For all columns except the last: - \verbatim - for(y = 0; y < height; ++y) - sums[x] += abs(src[x+1, y] - src[x, y]); - \endverbatim - For the last column: - \verbatim - sums[width-1] = 0; - \endverbatim - - \note This function is a C++ wrapper for function ::SimdGetAbsDxColSums. - - \param [in] src - an input image. - \param [out] sums - a pointer to an array of unsigned 32-bit integers with the result sums of columns. Its length must be equal to the image width. - */ - template <template <class> class A> SIMD_INLINE void GetAbsDxColSums(const View<A>& src, uint32_t * sums) - { - assert(src.format == View<A>::Gray8); - - SimdGetAbsDxColSums(src.data, src.stride, src.width, src.height, sums); - } - - /*! @ingroup other_statistic - - \fn void ValueSum(const View<A>& src, uint64_t & sum) - - \short Gets the sum of pixel values for a gray 8-bit image. - - \note This function is a C++ wrapper for function ::SimdValueSum. - - \param [in] src - an input image. - \param [out] sum - a result sum. - */ - template <template <class> class A> SIMD_INLINE void ValueSum(const View<A>& src, uint64_t & sum) - { - assert(src.format == View<A>::Gray8); - - SimdValueSum(src.data, src.stride, src.width, src.height, &sum); - } - - /*! @ingroup other_statistic - - \fn void SquareSum(const View<A>& src, uint64_t & sum) - - \short Gets the sum of squared pixel values for a gray 8-bit image. - - \note This function is a C++ wrapper for function ::SimdSquareSum. - - \param [in] src - an input image. - \param [out] sum - a result sum. - */ - template <template <class> class A> SIMD_INLINE void SquareSum(const View<A> & src, uint64_t & sum) - { - assert(src.format == View<A>::Gray8); - - SimdSquareSum(src.data, src.stride, src.width, src.height, &sum); - } - - /*! @ingroup other_statistic - - \fn void ValueSquareSum(const View<A>& src, uint64_t & valueSum, uint64_t & squareSum) - - \short Gets the sum and the sum of squared pixel values for a gray 8-bit image. - - \note This function is a C++ wrapper for function ::SimdValueSquareSum. - - \param [in] src - an input image. - \param [out] valueSum - a result value sum. - \param [out] squareSum - a result square sum. - */ - template <template <class> class A> SIMD_INLINE void ValueSquareSum(const View<A>& src, uint64_t & valueSum, uint64_t & squareSum) - { - assert(src.format == View<A>::Gray8); - - SimdValueSquareSum(src.data, src.stride, src.width, src.height, &valueSum, &squareSum); - }
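ValueSquareSum delivers both sums in one pass over the image, which is enough to derive the mean and the variance. A minimal sketch of the arithmetic, assuming the Simd C++ API:
\verbatim
#include "Simd/SimdLib.hpp"

int main()
{
    typedef Simd::View<Simd::Allocator> View;

    View src(640, 480, View::Gray8);
    // ... fill src ...
    uint64_t valueSum = 0, squareSum = 0;
    Simd::ValueSquareSum(src, valueSum, squareSum);
    double n = double(src.width * src.height);
    double mean = valueSum / n;
    double variance = squareSum / n - mean * mean;
    (void)variance;
    return 0;
}
\endverbatim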
-
-    /*! @ingroup other_statistic
-
-        \fn void CorrelationSum(const View<A> & a, const View<A> & b, uint64_t & sum)
-
-        \short Gets sum of pixel correlation for two gray 8-bit images.
-
-        For all points:
-        \verbatim
-        sum += a[i]*b[i];
-        \endverbatim
-
-        All images must have the same width and height and 8-bit gray pixel format.
-
-        \note This function is a C++ wrapper for function ::SimdCorrelationSum.
-
-        \param [in] a - a first image.
-        \param [in] b - a second image.
-        \param [out] sum - a result sum.
-    */
-    template <template<class> class A> SIMD_INLINE void CorrelationSum(const View<A> & a, const View<A> & b, uint64_t & sum)
-    {
-        assert(Compatible(a, b) && a.format == View<A>::Gray8);
-
-        SimdCorrelationSum(a.data, a.stride, b.data, b.stride, a.width, a.height, &sum);
-    }
-
-    /*! @ingroup resizing
-
-        \fn void StretchGray2x2(const View<A>& src, View<A>& dst)
-
-        \short Stretches the input 8-bit gray image by a factor of two in each dimension.
-
-        \note This function is a C++ wrapper for function ::SimdStretchGray2x2.
-
-        \param [in] src - an original input image.
-        \param [out] dst - a stretched output image.
-    */
-    template <template<class> class A> SIMD_INLINE void StretchGray2x2(const View<A> & src, View<A> & dst)
-    {
-        assert(src.format == View<A>::Gray8 && dst.format == View<A>::Gray8);
-        assert(src.width * 2 == dst.width && src.height * 2 == dst.height);
-
-        SimdStretchGray2x2(src.data, src.width, src.height, src.stride, dst.data, dst.width, dst.height, dst.stride);
-    }
-
-    /*! @ingroup synet_conversion
-
-        \fn void SynetSetInput(const View<A> & src, const float * lower, const float * upper, float * dst, size_t channels, SimdTensorFormatType format)
-
-        \short Sets an image as the input of a neural network of the Synet Framework.
-
-        Algorithm's details (example for BGRA pixel format and NCHW tensor format):
-        \verbatim
-        for(c = 0; c < channels; ++c)
-            for(y = 0; y < src.height; ++y)
-                for(x = 0; x < src.width; ++x)
-                    dst[(c*src.height + y)*src.width + x] = src.data[src.stride*y + x*4 + c]*(upper[c] - lower[c])/255 + lower[c];
-        \endverbatim
-
-        \note This function is a C++ wrapper for function ::SimdSynetSetInput.
-
-        \param [in] src - an input image. The following image formats are supported: View::Gray8, View::Bgr24, View::Bgra32, View::Rgb24.
-        \param [in] lower - a pointer to the array with lower bound of values of the output tensor. The size of the array has to correspond to the number of channels in the output image tensor.
-        \param [in] upper - a pointer to the array with upper bound of values of the output tensor. The size of the array has to correspond to the number of channels in the output image tensor.
-        \param [out] dst - a pointer to the output 32-bit float image tensor.
-        \param [in] channels - a number of channels in the output image tensor. It can be 1 or 3.
-        \param [in] format - a format of output image tensor. The following tensor formats are supported: ::SimdTensorFormatNchw, ::SimdTensorFormatNhwc.
-    */
-    template <template<class> class A> SIMD_INLINE void SynetSetInput(const View<A> & src, const float * lower, const float * upper, float * dst, size_t channels, SimdTensorFormatType format)
-    {
-        assert(src.format == View<A>::Gray8 || src.format == View<A>::Bgr24 || src.format == View<A>::Bgra32 || src.format == View<A>::Rgb24);
-        assert(format == SimdTensorFormatNchw || format == SimdTensorFormatNhwc);
-
-        SimdSynetSetInput(src.data, src.width, src.height, src.stride, (SimdPixelFormatType)src.format, lower, upper, dst, channels, format);
-    }
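For instance, feeding a BGRA frame to a network whose input expects channel values in [-1, 1] reduces to a single call. This is a hedged sketch (SetInput and the normalization range are assumptions; dst must hold 3 * frame.width * frame.height floats):

    template <template<class> class A> SIMD_INLINE void SetInput(const View<A> & frame, float * dst)
    {
        const float lower[3] = { -1.0f, -1.0f, -1.0f };
        const float upper[3] = { 1.0f, 1.0f, 1.0f };
        SynetSetInput(frame, lower, upper, dst, 3, SimdTensorFormatNchw); // frame.format selects the conversion
    }

-
-    /*! @ingroup texture_estimation
-
-        \fn void TextureBoostedSaturatedGradient(const View<A>& src, uint8_t saturation, uint8_t boost, View<A>& dx, View<A>& dy)
-
-        \short Calculates boosted saturated gradients for given input image.
-
-        All images must have the same width, height and format (8-bit gray).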
-
-        For border pixels:
-        \verbatim
-        dx[x, y] = 0;
-        dy[x, y] = 0;
-        \endverbatim
-        For other pixels:
-        \verbatim
-        dx[x, y] = (saturation + max(-saturation, min(saturation, (src[x + 1, y] - src[x - 1, y]))))*boost;
-        dy[x, y] = (saturation + max(-saturation, min(saturation, (src[x, y + 1] - src[x, y - 1]))))*boost;
-        \endverbatim
-
-        \note This function is a C++ wrapper for function ::SimdTextureBoostedSaturatedGradient.
-
-        \param [in] src - a source 8-bit gray image.
-        \param [in] saturation - a saturation of gradient.
-        \param [in] boost - a boost coefficient.
-        \param [out] dx - an image with boosted saturated gradient along x axis.
-        \param [out] dy - an image with boosted saturated gradient along y axis.
-    */
-    template <template<class> class A> SIMD_INLINE void TextureBoostedSaturatedGradient(const View<A>& src, uint8_t saturation, uint8_t boost, View<A>& dx, View<A>& dy)
-    {
-        assert(Compatible(src, dx, dy) && src.format == View<A>::Gray8 && src.height >= 3 && src.width >= 3);
-
-        SimdTextureBoostedSaturatedGradient(src.data, src.stride, src.width, src.height, saturation, boost, dx.data, dx.stride, dy.data, dy.stride);
-    }
-
-    /*! @ingroup texture_estimation
-
-        \fn void TextureBoostedUv(const View<A>& src, uint8_t boost, View<A>& dst)
-
-        \short Calculates boosted colorized texture feature of input image (useful for U and V components of YUV format).
-
-        All images must have the same width, height and format (8-bit gray).
-
-        For every pixel:
-        \verbatim
-        lo = 128 - (128/boost);
-        hi = 255 - lo;
-        dst[x, y] = max(lo, min(hi, src[x, y]))*boost;
-        \endverbatim
-
-        \note This function is a C++ wrapper for function ::SimdTextureBoostedUv.
-
-        \param [in] src - a source 8-bit gray image.
-        \param [in] boost - a boost coefficient.
-        \param [out] dst - a result image.
-    */
-    template <template<class> class A> SIMD_INLINE void TextureBoostedUv(const View<A>& src, uint8_t boost, View<A>& dst)
-    {
-        assert(Compatible(src, dst) && src.format == View<A>::Gray8);
-
-        SimdTextureBoostedUv(src.data, src.stride, src.width, src.height, boost, dst.data, dst.stride);
-    }
-
-    /*! @ingroup texture_estimation
-
-        \fn void TextureGetDifferenceSum(const View<A>& src, const View<A>& lo, const View<A>& hi, int64_t & sum)
-
-        \short Calculates difference between current image and background.
-
-        All images must have the same width, height and format (8-bit gray).
-
-        For every pixel:
-        \verbatim
-        sum += src[i] - average(lo[i], hi[i]);
-        \endverbatim
-
-        \note This function is a C++ wrapper for function ::SimdTextureGetDifferenceSum.
-
-        \param [in] src - a current image.
-        \param [in] lo - an image with lower bound of background feature.
-        \param [in] hi - an image with upper bound of background feature.
-        \param [out] sum - a reference to 64-bit integer with result sum.
-    */
-    template <template<class> class A> SIMD_INLINE void TextureGetDifferenceSum(const View<A>& src, const View<A>& lo, const View<A>& hi, int64_t & sum)
-    {
-        assert(Compatible(src, lo, hi) && src.format == View<A>::Gray8);
-
-        SimdTextureGetDifferenceSum(src.data, src.stride, src.width, src.height, lo.data, lo.stride, hi.data, hi.stride, &sum);
-    }
-
-    /*! @ingroup texture_estimation
-
-        \fn void TexturePerformCompensation(const View<A>& src, int shift, View<A>& dst)
-
-        \short Performs brightness compensation of input image.
-
-        All images must have the same width, height and format (8-bit gray).
-
-        For every pixel:
-        \verbatim
-        dst[i] = max(0, min(255, src[i] + shift));
-        \endverbatim
-
-        \note This function is a C++ wrapper for function ::SimdTexturePerformCompensation.
-
-        \param [in] src - an input image.
- \param [in] shift - a compensation shift. - \param [out] dst - an output image. - */ - template class A> SIMD_INLINE void TexturePerformCompensation(const View& src, int shift, View& dst) - { - assert(Compatible(src, dst) && src.format == View::Gray8 && shift > -0xFF && shift < 0xFF); - - SimdTexturePerformCompensation(src.data, src.stride, src.width, src.height, shift, dst.data, dst.stride); - } - - /*! @ingroup transform - - \fn Point TransformSize(const Point & size, ::SimdTransformType transform); - - \short Gets size of transformed image. - - \param [in] size - a size of input image. - \param [in] transform - a type of image transformation. - \return - the size of transformed image. - */ - SIMD_INLINE Point TransformSize(const Point & size, ::SimdTransformType transform) - { - switch (transform) - { - case ::SimdTransformRotate0: - case ::SimdTransformRotate180: - case ::SimdTransformTransposeRotate90: - case ::SimdTransformTransposeRotate270: - return size; - case ::SimdTransformRotate90: - case ::SimdTransformRotate270: - case ::SimdTransformTransposeRotate0: - case ::SimdTransformTransposeRotate180: - return Point(size.y, size.x); - default: - assert(0); - return Point(); - } - } - - /*! @ingroup transform - - \fn void TransformImage(const View & src, ::SimdTransformType transform, View & dst); - - \short Performs transformation of input image. The type of transformation is defined by ::SimdTransformType enumeration. - - \note This function is a C++ wrapper for function ::SimdTransformImage. - - \param [in] src - an input image. - \param [in] transform - a type of image transformation. - \param [out] dst - an output image. - */ - template class A> SIMD_INLINE void TransformImage(const View & src, ::SimdTransformType transform, View & dst) - { - assert(src.format == dst.format && TransformSize(src.Size(), transform) == dst.Size()); - - SimdTransformImage(src.data, src.stride, src.width, src.height, src.PixelSize(), transform, dst.data, dst.stride); - } - - /*! @ingroup yuv_conversion - - \fn void Yuva420pToBgra(const View& y, const View& u, const View& v, const View& a, View& bgra) - - \short Converts YUVA420P image to 32-bit BGRA image. - - The input Y, A and output BGRA images must have the same width and height. - The input U and V images must have the same width and height (half size relative to Y component). - - \note This function is a C++ wrapper for function ::SimdYuva420pToBgra. - - \param [in] y - an input 8-bit image with Y color plane. - \param [in] u - an input 8-bit image with U color plane. - \param [in] v - an input 8-bit image with V color plane. - \param [in] a - an input 8-bit image with alpha channel. - \param [out] bgra - an output 32-bit BGRA image. - */ - template class A> SIMD_INLINE void Yuva420pToBgra(const View& y, const View& u, const View& v, const View& a, View& bgra) - { - assert(y.width == 2 * u.width && y.height == 2 * u.height && y.format == u.format); - assert(y.width == 2 * v.width && y.height == 2 * v.height && y.format == v.format); - assert(Compatible(y, a) && EqualSize(y, bgra)); - assert(y.format == View::Gray8 && bgra.format == View::Bgra32); - - SimdYuva420pToBgra(y.data, y.stride, u.data, u.stride, v.data, v.stride, a.data, a.stride, y.width, y.height, bgra.data, bgra.stride); - } - - /*! @ingroup yuv_conversion - - \fn void Yuv420pToBgr(const View& y, const View& u, const View& v, View& bgr) - - \short Converts YUV420P image to 24-bit BGR image. - - The input Y and output BGR images must have the same width and height. 
- The input U and V images must have the same width and height (half size relative to Y component). - - \note This function is a C++ wrapper for function ::SimdYuv420pToBgr. - - \param [in] y - an input 8-bit image with Y color plane. - \param [in] u - an input 8-bit image with U color plane. - \param [in] v - an input 8-bit image with V color plane. - \param [out] bgr - an output 24-bit BGR image. - */ - template class A> SIMD_INLINE void Yuv420pToBgr(const View& y, const View& u, const View& v, View& bgr) - { - assert(y.width == 2 * u.width && y.height == 2 * u.height && y.format == u.format); - assert(y.width == 2 * v.width && y.height == 2 * v.height && y.format == v.format); - assert(y.width == bgr.width && y.height == bgr.height); - assert(y.format == View::Gray8 && bgr.format == View::Bgr24); - - SimdYuv420pToBgr(y.data, y.stride, u.data, u.stride, v.data, v.stride, y.width, y.height, bgr.data, bgr.stride); - } - - /*! @ingroup yuv_conversion - - \fn void Yuv422pToBgr(const View& y, const View& u, const View& v, View& bgr) - - \short Converts YUV422P image to 24-bit BGR image. - - The input Y and output BGR images must have the same width and height. - The input U and V images must have the same width and height (their width is equal to half width of Y component). - - \note This function is a C++ wrapper for function ::SimdYuv422pToBgr. - - \param [in] y - an input 8-bit image with Y color plane. - \param [in] u - an input 8-bit image with U color plane. - \param [in] v - an input 8-bit image with V color plane. - \param [out] bgr - an output 24-bit BGR image. - */ - template class A> SIMD_INLINE void Yuv422pToBgr(const View& y, const View& u, const View& v, View& bgr) - { - assert(y.width == 2 * u.width && y.height == u.height && y.format == u.format); - assert(y.width == 2 * v.width && y.height == v.height && y.format == v.format); - assert(y.width == bgr.width && y.height == bgr.height); - assert(y.format == View::Gray8 && bgr.format == View::Bgr24); - - SimdYuv422pToBgr(y.data, y.stride, u.data, u.stride, v.data, v.stride, y.width, y.height, bgr.data, bgr.stride); - } - - /*! @ingroup yuv_conversion - - \fn void Yuv444pToBgr(const View& y, const View& u, const View& v, View& bgr) - - \short Converts YUV444P image to 24-bit BGR image. - - The input Y, U, V and output BGR images must have the same width and height. - - \note This function is a C++ wrapper for function ::SimdYuv444pToBgr. - - \param [in] y - an input 8-bit image with Y color plane. - \param [in] u - an input 8-bit image with U color plane. - \param [in] v - an input 8-bit image with V color plane. - \param [out] bgr - an output 24-bit BGR image. - */ - template class A> SIMD_INLINE void Yuv444pToBgr(const View& y, const View& u, const View& v, View& bgr) - { - assert(Compatible(y, u, v) && EqualSize(y, bgr) && y.format == View::Gray8 && bgr.format == View::Bgr24); - - SimdYuv444pToBgr(y.data, y.stride, u.data, u.stride, v.data, v.stride, y.width, y.height, bgr.data, bgr.stride); - } - - /*! @ingroup yuv_conversion - - \fn void Yuv420pToBgra(const View& y, const View& u, const View& v, View& bgra, uint8_t alpha = 0xFF) - - \short Converts YUV420P image to 32-bit BGRA image. - - The input Y and output BGRA images must have the same width and height. - The input U and V images must have the same width and height (half size relative to Y component). - - \note This function is a C++ wrapper for function ::SimdYuv420pToBgra. - - \param [in] y - an input 8-bit image with Y color plane. 
- \param [in] u - an input 8-bit image with U color plane. - \param [in] v - an input 8-bit image with V color plane. - \param [out] bgra - an output 32-bit BGRA image. - \param [in] alpha - a value of alpha channel. It is equal to 255 by default. - */ - template class A> SIMD_INLINE void Yuv420pToBgra(const View& y, const View& u, const View& v, View& bgra, uint8_t alpha = 0xFF) - { - assert(y.width == 2 * u.width && y.height == 2 * u.height && y.format == u.format); - assert(y.width == 2 * v.width && y.height == 2 * v.height && y.format == v.format); - assert(y.width == bgra.width && y.height == bgra.height); - assert(y.format == View::Gray8 && bgra.format == View::Bgra32); - - SimdYuv420pToBgra(y.data, y.stride, u.data, u.stride, v.data, v.stride, y.width, y.height, bgra.data, bgra.stride, alpha); - } - - /*! @ingroup yuv_conversion - - \fn void Yuv422pToBgra(const View& y, const View& u, const View& v, View& bgra, uint8_t alpha = 0xFF) - - \short Converts YUV422P image to 32-bit BGRA image. - - The input Y and output BGRA images must have the same width and height. - The input U and V images must have the same width and height (their width is equal to half width of Y component). - - \note This function is a C++ wrapper for function ::SimdYuv422pToBgra. - - \param [in] y - an input 8-bit image with Y color plane. - \param [in] u - an input 8-bit image with U color plane. - \param [in] v - an input 8-bit image with V color plane. - \param [out] bgra - an output 32-bit BGRA image. - \param [in] alpha - a value of alpha channel. It is equal to 255 by default. - */ - template class A> SIMD_INLINE void Yuv422pToBgra(const View& y, const View& u, const View& v, View& bgra, uint8_t alpha = 0xFF) - { - assert(y.width == 2 * u.width && y.height == u.height && y.format == u.format); - assert(y.width == 2 * v.width && y.height == v.height && y.format == v.format); - assert(y.width == bgra.width && y.height == bgra.height); - assert(y.format == View::Gray8 && bgra.format == View::Bgra32); - - SimdYuv422pToBgra(y.data, y.stride, u.data, u.stride, v.data, v.stride, y.width, y.height, bgra.data, bgra.stride, alpha); - } - - /*! @ingroup yuv_conversion - - \fn void Yuv444pToBgra(const View& y, const View& u, const View& v, View& bgra, uint8_t alpha = 0xFF) - - \short Converts YUV444P image to 32-bit BGRA image. - - The input Y, U, V and output BGRA images must have the same width and height. - - \note This function is a C++ wrapper for function ::SimdYuv444pToBgra. - - \param [in] y - an input 8-bit image with Y color plane. - \param [in] u - an input 8-bit image with U color plane. - \param [in] v - an input 8-bit image with V color plane. - \param [out] bgra - an output 32-bit BGRA image. - \param [in] alpha - a value of alpha channel. It is equal to 255 by default. - */ - template class A> SIMD_INLINE void Yuv444pToBgra(const View& y, const View& u, const View& v, View& bgra, uint8_t alpha = 0xFF) - { - assert(Compatible(y, u, v) && EqualSize(y, bgra) && y.format == View::Gray8 && bgra.format == View::Bgra32); - - SimdYuv444pToBgra(y.data, y.stride, u.data, u.stride, v.data, v.stride, y.width, y.height, bgra.data, bgra.stride, alpha); - } - - /*! @ingroup yuv_conversion - - \fn void Yuv444pToHsl(const View& y, const View& u, const View& v, View& hsl) - - \short Converts YUV444P image to 24-bit HSL(Hue, Saturation, Lightness) image. - - The input Y, U, V and output HSL images must have the same width and height. - - \note This function is a C++ wrapper for function ::SimdYuv444pToHsl. 
- - \param [in] y - an input 8-bit image with Y color plane. - \param [in] u - an input 8-bit image with U color plane. - \param [in] v - an input 8-bit image with V color plane. - \param [out] hsl - an output 24-bit HSL image. - */ - template class A> SIMD_INLINE void Yuv444pToHsl(const View& y, const View& u, const View& v, View& hsl) - { - assert(Compatible(y, u, v) && EqualSize(y, hsl) && y.format == View::Gray8 && hsl.format == View::Hsl24); - - SimdYuv444pToHsl(y.data, y.stride, u.data, u.stride, v.data, v.stride, y.width, y.height, hsl.data, hsl.stride); - } - - /*! @ingroup yuv_conversion - - \fn void Yuv444pToHsv(const View& y, const View& u, const View& v, View& hsv) - - \short Converts YUV444P image to 24-bit HSV(Hue, Saturation, Value) image. - - The input Y, U, V and output HSV images must have the same width and height. - - \note This function is a C++ wrapper for function ::SimdYuv444pToHsv. - - \param [in] y - an input 8-bit image with Y color plane. - \param [in] u - an input 8-bit image with U color plane. - \param [in] v - an input 8-bit image with V color plane. - \param [out] hsv - an output 24-bit HSV image. - */ - template class A> SIMD_INLINE void Yuv444pToHsv(const View& y, const View& u, const View& v, View& hsv) - { - assert(Compatible(y, u, v) && EqualSize(y, hsv) && y.format == View::Gray8 && hsv.format == View::Hsv24); - - SimdYuv444pToHsv(y.data, y.stride, u.data, u.stride, v.data, v.stride, y.width, y.height, hsv.data, hsv.stride); - } - - /*! @ingroup yuv_conversion - - \fn void Yuv420pToHue(const View& y, const View& u, const View& v, View& hue) - - \short Converts YUV420P image to 8-bit image with Hue component of HSV or HSL color space. - - The input Y and output Hue images must have the same width and height. - The input U and V images must have the same width and height (half size relative to Y component). - - \note This function is a C++ wrapper for function ::SimdYuv420pToHue. - - \param [in] y - an input 8-bit image with Y color plane. - \param [in] u - an input 8-bit image with U color plane. - \param [in] v - an input 8-bit image with V color plane. - \param [out] hue - an output 8-bit Hue image. - */ - template class A> SIMD_INLINE void Yuv420pToHue(const View& y, const View& u, const View& v, View& hue) - { - assert(y.width == 2 * u.width && y.height == 2 * u.height && y.format == u.format); - assert(y.width == 2 * v.width && y.height == 2 * v.height && y.format == v.format); - assert(Compatible(y, hue) && y.format == View::Gray8); - - SimdYuv420pToHue(y.data, y.stride, u.data, u.stride, v.data, v.stride, y.width, y.height, hue.data, hue.stride); - } - - /*! @ingroup yuv_conversion - - \fn void Yuv444pToHue(const View & y, const View & u, const View & v, View & hue) - - \short Converts YUV444P image to 8-bit image with Hue component of HSV or HSL color space. - - The input Y, U, V and output Hue images must have the same width and height. - - \note This function is a C++ wrapper for function ::SimdYuv444pToHue. - - \param [in] y - an input 8-bit image with Y color plane. - \param [in] u - an input 8-bit image with U color plane. - \param [in] v - an input 8-bit image with V color plane. - \param [out] hue - an output 8-bit Hue image. - */ - template class A> SIMD_INLINE void Yuv444pToHue(const View & y, const View & u, const View & v, View & hue) - { - assert(Compatible(y, u, v, hue) && y.format == View::Gray8); - - SimdYuv444pToHue(y.data, y.stride, u.data, u.stride, v.data, v.stride, y.width, y.height, hue.data, hue.stride); - } - - /*! 
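        \note An illustrative decode sketch follows; it is added here for clarity and is not part of the original header. It assumes a contiguous I420 buffer (a full-resolution Y plane followed by quarter-resolution U and V planes), even width and height, and a caller-allocated width x height Bgra32 destination.
    */
    template <template<class> class A> SIMD_INLINE void I420ToBgra(const uint8_t * data, size_t width, size_t height, View<A> & bgra)
    {
        // Wrap the three planes without copying; the C-style casts drop constness for the non-owning views.
        View<A> y(width, height, width, View<A>::Gray8, (void*)data);
        View<A> u(width / 2, height / 2, width / 2, View<A>::Gray8, (void*)(data + width * height));
        View<A> v(width / 2, height / 2, width / 2, View<A>::Gray8, (void*)(data + width * height * 5 / 4));
        Yuv420pToBgra(y, u, v, bgra); // alpha defaults to 0xFF
    }

    /*!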
@ingroup yuv_conversion - - \fn void Yuv420pToRgb(const View& y, const View& u, const View& v, View& rgb) - - \short Converts YUV420P image to 24-bit RGB image. - - The input Y and output RGB images must have the same width and height. - The input U and V images must have the same width and height (half size relative to Y component). - - \note This function is a C++ wrapper for function ::SimdYuv420pToRgb. - - \param [in] y - an input 8-bit image with Y color plane. - \param [in] u - an input 8-bit image with U color plane. - \param [in] v - an input 8-bit image with V color plane. - \param [out] rgb - an output 24-bit RGB image. - */ - template class A> SIMD_INLINE void Yuv420pToRgb(const View& y, const View& u, const View& v, View& rgb) - { - assert(y.width == 2 * u.width && y.height == 2 * u.height && y.format == u.format); - assert(y.width == 2 * v.width && y.height == 2 * v.height && y.format == v.format); - assert(y.width == rgb.width && y.height == rgb.height); - assert(y.format == View::Gray8 && rgb.format == View::Rgb24); - - SimdYuv420pToRgb(y.data, y.stride, u.data, u.stride, v.data, v.stride, y.width, y.height, rgb.data, rgb.stride); - } - - /*! @ingroup yuv_conversion - - \fn void Yuv422pToRgb(const View& y, const View& u, const View& v, View& rgb) - - \short Converts YUV422P image to 24-bit RGB image. - - The input Y and output RGB images must have the same width and height. - The input U and V images must have the same width and height (their width is equal to half width of Y component). - - \note This function is a C++ wrapper for function ::SimdYuv422pToRgb. - - \param [in] y - an input 8-bit image with Y color plane. - \param [in] u - an input 8-bit image with U color plane. - \param [in] v - an input 8-bit image with V color plane. - \param [out] rgb - an output 24-bit RGB image. - */ - template class A> SIMD_INLINE void Yuv422pToRgb(const View& y, const View& u, const View& v, View& rgb) - { - assert(y.width == 2 * u.width && y.height == u.height && y.format == u.format); - assert(y.width == 2 * v.width && y.height == v.height && y.format == v.format); - assert(y.width == rgb.width && y.height == rgb.height); - assert(y.format == View::Gray8 && rgb.format == View::Rgb24); - - SimdYuv422pToRgb(y.data, y.stride, u.data, u.stride, v.data, v.stride, y.width, y.height, rgb.data, rgb.stride); - } - - /*! @ingroup yuv_conversion - - \fn void Yuv444pToRgb(const View& y, const View& u, const View& v, View& rgb) - - \short Converts YUV444P image to 24-bit RGB image. - - The input Y, U, V and output RGB images must have the same width and height. - - \note This function is a C++ wrapper for function ::SimdYuv444pToRgb. - - \param [in] y - an input 8-bit image with Y color plane. - \param [in] u - an input 8-bit image with U color plane. - \param [in] v - an input 8-bit image with V color plane. - \param [out] rgb - an output 24-bit RGB image. - */ - template class A> SIMD_INLINE void Yuv444pToRgb(const View& y, const View& u, const View& v, View& rgb) - { - assert(Compatible(y, u, v) && EqualSize(y, rgb) && y.format == View::Gray8 && rgb.format == View::Rgb24); - - SimdYuv444pToRgb(y.data, y.stride, u.data, u.stride, v.data, v.stride, y.width, y.height, rgb.data, rgb.stride); - } - - /*! @ingroup universal_conversion - - \fn void Convert(const View & src, View & dst) - - \short Converts an image of one format to an image of another format. - - The input and output images must have the same width and height. 
-
-        \note This function supports conversion between Gray8, Bgr24, Bgra32 and Rgb24 image formats.
-
-        \param [in] src - an input image.
-        \param [out] dst - an output image.
-    */
-    template <template<class> class A> SIMD_INLINE void Convert(const View<A> & src, View<A> & dst)
-    {
-        assert(EqualSize(src, dst) && src.format && dst.format);
-
-        if (src.format == dst.format)
-        {
-            Copy(src, dst);
-            return;
-        }
-
-        switch (src.format)
-        {
-        case View<A>::Gray8:
-            switch (dst.format)
-            {
-            case View<A>::Bgra32:
-                GrayToBgra(src, dst);
-                break;
-            case View<A>::Bgr24:
-            case View<A>::Rgb24:
-                GrayToBgr(src, dst);
-                break;
-            default:
-                assert(0);
-            }
-            break;
-
-        case View<A>::Bgr24:
-            switch (dst.format)
-            {
-            case View<A>::Bgra32:
-                BgrToBgra(src, dst);
-                break;
-            case View<A>::Gray8:
-                BgrToGray(src, dst);
-                break;
-            case View<A>::Rgb24:
-                BgrToRgb(src, dst);
-                break;
-            default:
-                assert(0);
-            }
-            break;
-
-        case View<A>::Rgb24:
-            switch (dst.format)
-            {
-            case View<A>::Bgra32:
-                RgbToBgra(src, dst);
-                break;
-            case View<A>::Bgr24:
-                BgrToRgb(src, dst);
-                break;
-            case View<A>::Gray8:
-                RgbToGray(src, dst);
-                break;
-            default:
-                assert(0);
-            }
-            break;
-
-        case View<A>::Bgra32:
-            switch (dst.format)
-            {
-            case View<A>::Bgr24:
-                BgraToBgr(src, dst);
-                break;
-            case View<A>::Gray8:
-                BgraToGray(src, dst);
-                break;
-            case View<A>::Rgb24:
-                BgraToRgb(src, dst);
-                break;
-            default:
-                assert(0);
-            }
-            break;
-
-        default:
-            assert(0);
-        }
-    }
-
-    /*! @ingroup cpp_pyramid_functions
-
-        \fn void Fill(Pyramid<A> & pyramid, uint8_t value)
-
-        \short Fills pixel data of images in the pyramid with the given value.
-
-        \param [out] pyramid - a pyramid.
-        \param [in] value - a value to fill the pyramid.
-    */
-    template <template<class> class A> SIMD_INLINE void Fill(Pyramid<A> & pyramid, uint8_t value)
-    {
-        for (size_t level = 0; level < pyramid.Size(); ++level)
-            Simd::Fill(pyramid.At(level), value);
-    }
-
-    /*! @ingroup cpp_pyramid_functions
-
-        \fn void Copy(const Pyramid<A> & src, Pyramid<A> & dst)
-
-        \short Copies one pyramid to another pyramid.
-
-        \note Input and output pyramids must have the same size.
-
-        \param [in] src - an input pyramid.
-        \param [out] dst - an output pyramid.
-    */
-    template <template<class> class A> SIMD_INLINE void Copy(const Pyramid<A> & src, Pyramid<A> & dst)
-    {
-        assert(src.Size() == dst.Size());
-        for (size_t level = 0; level < src.Size(); ++level)
-            Simd::Copy(src.At(level), dst.At(level));
-    }
-
-    /*! @ingroup cpp_pyramid_functions
-
-        \fn void Build(Pyramid<A> & pyramid, ::SimdReduceType reduceType, bool compensation = true)
-
-        \short Builds the pyramid (fills upper levels based on the lowest level).
-
-        \param [out] pyramid - a built pyramid.
-        \param [in] reduceType - a type of function used for image reducing.
-        \param [in] compensation - a flag of rounding compensation. It is relevant only for ::SimdReduce3x3 and ::SimdReduce5x5. It is equal to 'true' by default.
-    */
-    template <template<class> class A> SIMD_INLINE void Build(Pyramid<A> & pyramid, ::SimdReduceType reduceType, bool compensation = true)
-    {
-        for (size_t level = 1; level < pyramid.Size(); ++level)
-            Simd::ReduceGray(pyramid.At(level - 1), pyramid.At(level), reduceType, compensation);
-    }
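In practice, building a pyramid is a two-step affair: fill the lowest level, then reduce upward. A hedged sketch for a Gray8 frame (BuildFramePyramid is a hypothetical helper; Pyramid<A>::Recreate is assumed to allocate the requested number of levels):

    template <template<class> class A> SIMD_INLINE void BuildFramePyramid(const View<A> & frame, Pyramid<A> & pyramid)
    {
        pyramid.Recreate(frame.Size(), 3); // e.g. 640x480 -> 320x240 -> 160x120
        Simd::Copy(frame, pyramid.At(0)); // the lowest level holds the source frame
        Simd::Build(pyramid, ::SimdReduce2x2); // fill the upper levels, reducing level by level
    }

-}
-
-#endif//__SimdLib_hpp__
-
diff --git a/src/3rd/Simd/Simd/SimdLoad.h b/src/3rd/Simd/Simd/SimdLoad.h
deleted file mode 100644
index 5453f002..00000000
--- a/src/3rd/Simd/Simd/SimdLoad.h
+++ /dev/null
@@ -1,1468 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2020 Yermalayeu Ihar.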
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#ifndef __SimdLoad_h__ -#define __SimdLoad_h__ - -#include "Simd/SimdConst.h" - -namespace Simd -{ - enum PadType - { - PadNose1, - PadNone, - PadTail1, - PadTail2, - }; - -#ifdef SIMD_SSE_ENABLE - namespace Sse - { - template SIMD_INLINE __m128 Load(const float * p); - - template <> SIMD_INLINE __m128 Load(const float * p) - { - return _mm_loadu_ps(p); - } - - template <> SIMD_INLINE __m128 Load(const float * p) - { - return _mm_load_ps(p); - } - - SIMD_INLINE __m128 Load(const float * p0, const float * p1) - { - return _mm_loadh_pi(_mm_loadl_pi(_mm_setzero_ps(), (__m64*)p0), (__m64*)p1); - } - - SIMD_INLINE __m128 LoadPadZeroNose1(const float * p) - { - SIMD_ALIGNED(16) const int32_t m[F] = { 0, -1, -1, -1 }; - __m128 a = _mm_loadu_ps(p + 1); - __m128 b = _mm_shuffle_ps(a, a, 0x90); - return _mm_and_ps(b, _mm_load_ps((float*)m)); - } - - SIMD_INLINE __m128 LoadPadZeroTail1(const float * p) - { - SIMD_ALIGNED(16) const int32_t m[F] = { -1, -1, -1, 0 }; - __m128 a = _mm_loadu_ps(p - 1); - __m128 b = _mm_shuffle_ps(a, a, 0xF9); - return _mm_and_ps(b, _mm_load_ps((float*)m)); - } - - SIMD_INLINE __m128 LoadPadZeroTail2(const float * p) - { - SIMD_ALIGNED(16) const int32_t m[F] = { -1, -1, 0, 0 }; - __m128 a = _mm_loadu_ps(p - 2); - __m128 b = _mm_shuffle_ps(a, a, 0xFE); - return _mm_and_ps(b, _mm_load_ps((float*)m)); - } - } -#endif//SIMD_SSE_ENABLE - -#ifdef SIMD_SSE2_ENABLE - namespace Sse2 - { - using namespace Sse; - - template SIMD_INLINE __m128i Load(const __m128i * p); - - template <> SIMD_INLINE __m128i Load(const __m128i * p) - { - return _mm_loadu_si128(p); - } - - template <> SIMD_INLINE __m128i Load(const __m128i * p) - { - return _mm_load_si128(p); - } - - template SIMD_INLINE __m128i LoadMaskI8(const __m128i * p, __m128i index) - { - return _mm_cmpeq_epi8(Load(p), index); - } - - template SIMD_INLINE __m128i LoadBeforeFirst(__m128i first) - { - return _mm_or_si128(_mm_slli_si128(first, count), _mm_and_si128(first, _mm_srli_si128(K_INV_ZERO, A - count))); - } - - template SIMD_INLINE __m128i LoadAfterLast(__m128i last) - { - return _mm_or_si128(_mm_srli_si128(last, count), _mm_and_si128(last, _mm_slli_si128(K_INV_ZERO, A - count))); - } - - template SIMD_INLINE void LoadNose3(const uint8_t * p, __m128i a[3]) - { - a[1] = Load((__m128i*)p); - a[0] = LoadBeforeFirst(a[1]); - a[2] = _mm_loadu_si128((__m128i*)(p + step)); - } - - template SIMD_INLINE void LoadBody3(const uint8_t * p, __m128i a[3]) - { - a[0] = 
_mm_loadu_si128((__m128i*)(p - step)); - a[1] = Load((__m128i*)p); - a[2] = _mm_loadu_si128((__m128i*)(p + step)); - } - - template SIMD_INLINE void LoadTail3(const uint8_t * p, __m128i a[3]) - { - a[0] = _mm_loadu_si128((__m128i*)(p - step)); - a[1] = Load((__m128i*)p); - a[2] = LoadAfterLast(a[1]); - } - - template SIMD_INLINE void LoadNose5(const uint8_t * p, __m128i a[5]) - { - a[2] = Load((__m128i*)p); - a[1] = LoadBeforeFirst(a[2]); - a[0] = LoadBeforeFirst(a[1]); - a[3] = _mm_loadu_si128((__m128i*)(p + step)); - a[4] = _mm_loadu_si128((__m128i*)(p + 2 * step)); - } - - template SIMD_INLINE void LoadBody5(const uint8_t * p, __m128i a[5]) - { - a[0] = _mm_loadu_si128((__m128i*)(p - 2 * step)); - a[1] = _mm_loadu_si128((__m128i*)(p - step)); - a[2] = Load((__m128i*)p); - a[3] = _mm_loadu_si128((__m128i*)(p + step)); - a[4] = _mm_loadu_si128((__m128i*)(p + 2 * step)); - } - - template SIMD_INLINE void LoadTail5(const uint8_t * p, __m128i a[5]) - { - a[0] = _mm_loadu_si128((__m128i*)(p - 2 * step)); - a[1] = _mm_loadu_si128((__m128i*)(p - step)); - a[2] = Load((__m128i*)p); - a[3] = LoadAfterLast(a[2]); - a[4] = LoadAfterLast(a[3]); - } - - SIMD_INLINE void LoadNoseDx(const uint8_t * p, __m128i a[3]) - { - a[0] = LoadBeforeFirst<1>(_mm_loadu_si128((__m128i*)p)); - a[2] = _mm_loadu_si128((__m128i*)(p + 1)); - } - - SIMD_INLINE void LoadBodyDx(const uint8_t * p, __m128i a[3]) - { - a[0] = _mm_loadu_si128((__m128i*)(p - 1)); - a[2] = _mm_loadu_si128((__m128i*)(p + 1)); - } - - SIMD_INLINE void LoadTailDx(const uint8_t * p, __m128i a[3]) - { - a[0] = _mm_loadu_si128((__m128i*)(p - 1)); - a[2] = LoadAfterLast<1>(_mm_loadu_si128((__m128i*)p)); - } - } -#endif//SIMD_SSE2_ENABLE - -#ifdef SIMD_SSE3_ENABLE - namespace Sse3 - { -#if defined(_MSC_VER) && _MSC_VER >= 1700 && _MSC_VER < 1900 // Visual Studio 2012/2013 compiler bug - using Sse::Load; - using Sse2::Load; -#endif - } -#endif - -#ifdef SIMD_SSE41_ENABLE - namespace Sse41 - { -#if defined(_MSC_VER) && _MSC_VER >= 1700 && _MSC_VER < 1900 // Visual Studio 2012/2013 compiler bug - using Sse::Load; - using Sse2::Load; -#endif - } -#endif - -#ifdef SIMD_AVX_ENABLE - namespace Avx - { - template SIMD_INLINE __m256 Load(const float * p); - - template <> SIMD_INLINE __m256 Load(const float * p) - { - return _mm256_loadu_ps(p); - } - - template <> SIMD_INLINE __m256 Load(const float * p) - { - return _mm256_load_ps(p); - } - - template SIMD_INLINE __m256 Load(const float * p0, const float * p1) - { - return _mm256_insertf128_ps(_mm256_castps128_ps256(Sse::Load(p0)), Sse::Load(p1), 1); - } - - SIMD_INLINE __m256 Load(const float * p0, const float * p1, const float * p2, const float * p3) - { - return _mm256_insertf128_ps(_mm256_castps128_ps256(Sse::Load(p0, p1)), Sse::Load(p2, p3), 1); - } - - SIMD_INLINE __m256 Load(const float * ptr, __m256i mask) - { - return _mm256_maskload_ps(ptr, mask); - } - } -#endif//SIMD_AVX_ENABLE - -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - using namespace Avx; - - template SIMD_INLINE __m256i Load(const __m256i * p); - - template <> SIMD_INLINE __m256i Load(const __m256i * p) - { - return _mm256_loadu_si256(p); - } - - template <> SIMD_INLINE __m256i Load(const __m256i * p) - { - return _mm256_load_si256(p); - } - - template SIMD_INLINE __m128i LoadHalf(const __m128i * p); - - template <> SIMD_INLINE __m128i LoadHalf(const __m128i * p) - { - return _mm_loadu_si128(p); - } - - template <> SIMD_INLINE __m128i LoadHalf(const __m128i * p) - { - return _mm_load_si128(p); - } - - template SIMD_INLINE __m128i 
LoadHalfBeforeFirst(__m128i first) - { - return _mm_or_si128(_mm_slli_si128(first, count), _mm_and_si128(first, _mm_srli_si128(Sse2::K_INV_ZERO, HA - count))); - } - - template SIMD_INLINE __m128i LoadHalfAfterLast(__m128i last) - { - return _mm_or_si128(_mm_srli_si128(last, count), _mm_and_si128(last, _mm_slli_si128(Sse2::K_INV_ZERO, HA - count))); - } - - template SIMD_INLINE __m256i LoadPermuted(const __m256i * p) - { - return _mm256_permute4x64_epi64(Load(p), 0xD8); - } - - template SIMD_INLINE __m256i LoadMaskI8(const __m256i * p, __m256i index) - { - return _mm256_cmpeq_epi8(Load(p), index); - } - - SIMD_INLINE __m256i PermutedUnpackLoU8(__m256i a, __m256i b = K_ZERO) - { - return _mm256_permute4x64_epi64(_mm256_unpacklo_epi8(a, b), 0xD8); - } - - SIMD_INLINE __m256i PermutedUnpackHiU8(__m256i a, __m256i b = K_ZERO) - { - return _mm256_permute4x64_epi64(_mm256_unpackhi_epi8(a, b), 0xD8); - } - - SIMD_INLINE __m256i PermutedUnpackLoU16(__m256i a, __m256i b = K_ZERO) - { - return _mm256_permute4x64_epi64(_mm256_unpacklo_epi16(a, b), 0xD8); - } - - SIMD_INLINE __m256i PermutedUnpackHiU16(__m256i a, __m256i b = K_ZERO) - { - return _mm256_permute4x64_epi64(_mm256_unpackhi_epi16(a, b), 0xD8); - } - - template SIMD_INLINE __m256i LoadBeforeFirst(const uint8_t * p) - { - __m128i lo = LoadHalfBeforeFirst(LoadHalf((__m128i*)p)); - __m128i hi = _mm_loadu_si128((__m128i*)(p + HA - step)); - return _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 0x1); - } - - template SIMD_INLINE void LoadBeforeFirst(const uint8_t * p, __m256i & first, __m256i & second) - { - __m128i firstLo = LoadHalfBeforeFirst(LoadHalf((__m128i*)p)); - __m128i firstHi = _mm_loadu_si128((__m128i*)(p + HA - step)); - first = _mm256_inserti128_si256(_mm256_castsi128_si256(firstLo), firstHi, 0x1); - - __m128i secondLo = LoadHalfBeforeFirst(firstLo); - __m128i secondHi = _mm_loadu_si128((__m128i*)(p + HA - 2 * step)); - second = _mm256_inserti128_si256(_mm256_castsi128_si256(secondLo), secondHi, 0x1); - } - - template SIMD_INLINE __m256i LoadAfterLast(const uint8_t * p) - { - __m128i lo = _mm_loadu_si128((__m128i*)(p + step)); - __m128i hi = LoadHalfAfterLast(LoadHalf((__m128i*)(p + HA))); - return _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 0x1); - } - - template SIMD_INLINE void LoadAfterLast(const uint8_t * p, __m256i & first, __m256i & second) - { - __m128i firstLo = _mm_loadu_si128((__m128i*)(p + step)); - __m128i firstHi = LoadHalfAfterLast(LoadHalf((__m128i*)(p + HA))); - first = _mm256_inserti128_si256(_mm256_castsi128_si256(firstLo), firstHi, 0x1); - - __m128i secondLo = _mm_loadu_si128((__m128i*)(p + 2 * step)); - __m128i secondHi = LoadHalfAfterLast(firstHi); - second = _mm256_inserti128_si256(_mm256_castsi128_si256(secondLo), secondHi, 0x1); - } - - template SIMD_INLINE void LoadNose3(const uint8_t * p, __m256i a[3]) - { - a[0] = LoadBeforeFirst(p); - a[1] = Load((__m256i*)p); - a[2] = _mm256_loadu_si256((__m256i*)(p + step)); - } - - template SIMD_INLINE void LoadBody3(const uint8_t * p, __m256i a[3]) - { - a[0] = _mm256_loadu_si256((__m256i*)(p - step)); - a[1] = Load((__m256i*)p); - a[2] = _mm256_loadu_si256((__m256i*)(p + step)); - } - - template SIMD_INLINE void LoadTail3(const uint8_t * p, __m256i a[3]) - { - a[0] = _mm256_loadu_si256((__m256i*)(p - step)); - a[1] = Load((__m256i*)p); - a[2] = LoadAfterLast(p); - } - - template SIMD_INLINE void LoadNose5(const uint8_t * p, __m256i a[5]) - { - LoadBeforeFirst(p, a[1], a[0]); - a[2] = Load((__m256i*)p); - a[3] = _mm256_loadu_si256((__m256i*)(p + 
step)); - a[4] = _mm256_loadu_si256((__m256i*)(p + 2 * step)); - } - - template SIMD_INLINE void LoadBody5(const uint8_t * p, __m256i a[5]) - { - a[0] = _mm256_loadu_si256((__m256i*)(p - 2 * step)); - a[1] = _mm256_loadu_si256((__m256i*)(p - step)); - a[2] = Load((__m256i*)p); - a[3] = _mm256_loadu_si256((__m256i*)(p + step)); - a[4] = _mm256_loadu_si256((__m256i*)(p + 2 * step)); - } - - template SIMD_INLINE void LoadTail5(const uint8_t * p, __m256i a[5]) - { - a[0] = _mm256_loadu_si256((__m256i*)(p - 2 * step)); - a[1] = _mm256_loadu_si256((__m256i*)(p - step)); - a[2] = Load((__m256i*)p); - LoadAfterLast(p, a[3], a[4]); - } - - SIMD_INLINE void LoadNoseDx(const uint8_t * p, __m256i a[3]) - { - a[0] = LoadBeforeFirst(p); - a[2] = _mm256_loadu_si256((__m256i*)(p + 1)); - } - - SIMD_INLINE void LoadBodyDx(const uint8_t * p, __m256i a[3]) - { - a[0] = _mm256_loadu_si256((__m256i*)(p - 1)); - a[2] = _mm256_loadu_si256((__m256i*)(p + 1)); - } - - SIMD_INLINE void LoadTailDx(const uint8_t * p, __m256i a[3]) - { - a[0] = _mm256_loadu_si256((__m256i*)(p - 1)); - a[2] = LoadAfterLast(p); - } - - template SIMD_INLINE __m256 Load(const float * p); - - template <> SIMD_INLINE __m256 Load(const float * p) - { - return _mm256_loadu_ps(p); - } - - template <> SIMD_INLINE __m256 Load(const float * p) - { -#ifdef _MSC_VER - return _mm256_castsi256_ps(_mm256_load_si256((__m256i*)p)); -#else - return _mm256_load_ps(p); -#endif - } - } -#endif//SIMD_AVX2_ENABLE - -#ifdef SIMD_AVX512F_ENABLE - namespace Avx512f - { - template SIMD_INLINE __m512 Load(const float * p); - - template <> SIMD_INLINE __m512 Load(const float * p) - { - return _mm512_loadu_ps(p); - } - - template <> SIMD_INLINE __m512 Load(const float * p) - { -#if defined(__clang__) && (__clang_major__ == 3) && (__clang_minor__ == 8) && (__clang_patchlevel__ == 0) - return _mm512_load_ps((const double *)p); -#else - return _mm512_load_ps(p); -#endif - } - - template SIMD_INLINE __m512 Load(const float * p, __mmask16 m) - { - return Load(p); - } - - template <> SIMD_INLINE __m512 Load(const float * p, __mmask16 m) - { - return _mm512_maskz_loadu_ps(m, p); - } - - template <> SIMD_INLINE __m512 Load(const float * p, __mmask16 m) - { - return _mm512_maskz_load_ps(m, p); - } - - template SIMD_INLINE __m512 Load(const float * p0, const float * p1) - { - return _mm512_castpd_ps(_mm512_insertf64x4(_mm512_castps_pd(_mm512_castps256_ps512(Avx::Load(p0))), _mm256_castps_pd(Avx::Load(p1)), 1)); - } - - template SIMD_INLINE __m512 Load(const float * p0, const float * p1, const float * p2, const float * p3) - { - return _mm512_insertf32x4(_mm512_insertf32x4(_mm512_insertf32x4(_mm512_castps128_ps512(Sse::Load(p0)), Sse::Load(p1), 1), Sse::Load(p2), 2), Sse::Load(p3), 3); - } - - const __m512i K32_GATHER_ANY = SIMD_MM512_SET1_EPI32(1); - const __m512i K32_GATHER_3A = SIMD_MM512_SETR_EPI32(0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 0, 0, 0, 0, 0); - const __m512i K32_GATHER_3B = SIMD_MM512_SETR_EPI32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 4, 7, 10, 13); - - - template SIMD_INLINE __m512 Gather(const float * ptr) - { - return _mm512_i32gather_ps(K32_GATHER_ANY, ptr, sizeof(float)*period); - } - - template<> SIMD_INLINE __m512 Gather<3>(const float * ptr) - { - __m512 s0 = _mm512_loadu_ps(ptr + 0 * F); - __m512 s1 = _mm512_loadu_ps(ptr + 1 * F); - __m512 s2 = _mm512_loadu_ps(ptr + 2 * F); - return _mm512_mask_permutexvar_ps(_mm512_maskz_permutex2var_ps(0xFFFF, s0, K32_GATHER_3A, s1), 0xF800, K32_GATHER_3B, s2); - } - } -#endif//SIMD_AVX512F_ENABLE - -#ifdef 
SIMD_AVX512BW_ENABLE - namespace Avx512bw - { - using namespace Avx512f; - - template SIMD_INLINE __m512i Load(const void * p); - - template <> SIMD_INLINE __m512i Load(const void * p) - { - return _mm512_loadu_si512(p); - } - - template <> SIMD_INLINE __m512i Load(const void * p) - { - return _mm512_load_si512(p); - } - - template SIMD_INLINE __m512i Load(const uint8_t * p, __mmask64 m) - { - return Load(p); - } - - template <> SIMD_INLINE __m512i Load(const uint8_t * p, __mmask64 m) - { -#if defined (SIMD_MASKZ_LOAD_ERROR) - return _mm512_mask_mov_epi8(K_ZERO, m, _mm512_maskz_loadu_epi8(m, p)); -#else - return _mm512_maskz_loadu_epi8(m, p); -#endif - } - - template <> SIMD_INLINE __m512i Load(const uint8_t * p, __mmask64 m) - { -#if defined (SIMD_MASKZ_LOAD_ERROR) - return _mm512_mask_mov_epi8(K_ZERO, m, _mm512_maskz_loadu_epi8(m, p)); -#else - return _mm512_maskz_loadu_epi8(m, p); -#endif - } - - template SIMD_INLINE __m512i Load(const int16_t * p, __mmask32 m) - { - return Load(p); - } - - template <> SIMD_INLINE __m512i Load(const int16_t * p, __mmask32 m) - { - return _mm512_maskz_loadu_epi16(m, p); - } - - template <> SIMD_INLINE __m512i Load(const int16_t * p, __mmask32 m) - { - return _mm512_maskz_loadu_epi16(m, p); - } - - template SIMD_INLINE __m512i Load(const uint16_t * p, __mmask32 m) - { - return Load((int16_t*)p, m); - } - - template SIMD_INLINE __m512i Load(const uint32_t * p, __mmask16 m) - { - return Load(p); - } - - template <> SIMD_INLINE __m512i Load(const uint32_t * p, __mmask16 m) - { - return _mm512_maskz_loadu_epi32(m, p); - } - - template <> SIMD_INLINE __m512i Load(const uint32_t * p, __mmask16 m) - { - return _mm512_maskz_loadu_epi32(m, p); - } - - template SIMD_INLINE __m512i Load(const int32_t * p, __mmask16 m) - { - return Load((uint32_t*)p, m); - } - - template SIMD_INLINE __m512i LoadBeforeFirst(const uint8_t * p) - { - __mmask64 m = __mmask64(-1) << step; - __m512i src = Load(p - step, m); - __m128i so = _mm512_extracti32x4_epi32(src, 0); - __m128i ss = _mm_srli_si128(so, step); - return _mm512_mask_blend_epi8(m, _mm512_inserti32x4(src, ss, 0), src); - } - - template SIMD_INLINE __m512i LoadAfterLast(const uint8_t * p) - { - __mmask64 m = __mmask64(-1) >> step; - __m512i src = Load(p + step, m); - __m128i so = _mm512_extracti32x4_epi32(src, 3); - __m128i ss = _mm_slli_si128(so, step); - return _mm512_mask_blend_epi8(m, _mm512_inserti32x4(src, ss, 3), src); - } - - template SIMD_INLINE __m512i LoadBeforeFirst2(const uint8_t * p) - { - __m512i src = Load(p - 2 * step, __mmask64(-1) << 2 * step); - return _mm512_inserti32x4(src, Sse2::LoadBeforeFirst(Sse2::LoadBeforeFirst(Sse2::Load((__m128i*)p + 0))), 0); - } - - template SIMD_INLINE __m512i LoadAfterLast2(const uint8_t * p) - { - __m512i src = Load(p + 2 * step, __mmask64(-1) >> 2 * step); - return _mm512_inserti32x4(src, Sse2::LoadAfterLast(Sse2::LoadAfterLast(Sse2::Load((__m128i*)p + 3))), 3); - } - - template SIMD_INLINE void LoadNose3(const uint8_t * p, __m512i a[3]) - { - a[0] = LoadBeforeFirst(p); - a[1] = Load(p); - a[2] = Load(p + step); - } - - template SIMD_INLINE void LoadBody3(const uint8_t * p, __m512i a[3]) - { - a[0] = Load(p - step); - a[1] = Load(p); - a[2] = Load(p + step); - } - - template SIMD_INLINE void LoadTail3(const uint8_t * p, __m512i a[3]) - { - a[0] = Load(p - step); - a[1] = Load(p); - a[2] = LoadAfterLast(p); - } - - template SIMD_INLINE void LoadNose5(const uint8_t * p, __m512i a[5]) - { - a[0] = LoadBeforeFirst2(p); - a[1] = LoadBeforeFirst(p); - a[2] = Load(p); - a[3] = 
Load(p + step); - a[4] = Load(p + 2 * step); - } - - template SIMD_INLINE void LoadBody5(const uint8_t * p, __m512i a[5]) - { - a[0] = Load(p - 2 * step); - a[1] = Load(p - step); - a[2] = Load(p); - a[3] = Load(p + step); - a[4] = Load(p + 2 * step); - } - - template SIMD_INLINE void LoadTail5(const uint8_t * p, __m512i a[5]) - { - a[0] = Load(p - 2 * step); - a[1] = Load(p - step); - a[2] = Load(p); - a[3] = LoadAfterLast(p); - a[4] = LoadAfterLast2(p); - } - - SIMD_INLINE void LoadNoseDx(const uint8_t * p, __m512i a[3]) - { - a[0] = LoadBeforeFirst<1>(p); - a[2] = Load(p + 1); - } - - SIMD_INLINE void LoadBodyDx(const uint8_t * p, __m512i a[3]) - { - a[0] = Load(p - 1); - a[2] = Load(p + 1); - } - - SIMD_INLINE void LoadTailDx(const uint8_t * p, __m512i a[3]) - { - a[0] = Load(p - 1); - a[2] = LoadAfterLast<1>(p); - } - - template SIMD_INLINE __m512 Load(const float * p0, const float * p1) - { - return _mm512_insertf32x8(_mm512_castps256_ps512(Avx::Load(p0)), Avx::Load(p1), 1); - } - - template SIMD_INLINE __m512 Load(const float * p0, const float * p1, const float * p2, const float * p3) - { - return _mm512_insertf32x4(_mm512_insertf32x4(_mm512_insertf32x4(_mm512_castps128_ps512(Sse::Load(p0)), Sse::Load(p1), 1), Sse::Load(p2), 2), Sse::Load(p3), 3); - } - } -#endif//SIMD_AVX512BW_ENABLE - -#ifdef SIMD_VMX_ENABLE - namespace Vmx - { - template SIMD_INLINE v128_u8 Load(const uint8_t * p); - - template <> SIMD_INLINE v128_u8 Load(const uint8_t * p) - { - v128_u8 lo = vec_ld(0, p); - v128_u8 hi = vec_ld(A, p); - return vec_perm(lo, hi, vec_lvsl(0, p)); - } - - template <> SIMD_INLINE v128_u8 Load(const uint8_t * p) - { - return vec_ld(0, p); - } - - template SIMD_INLINE v128_u16 Load(const uint16_t * p) - { - return (v128_u16)Load((const uint8_t*)p); - } - - template SIMD_INLINE v128_s16 Load(const int16_t * p) - { - return (v128_s16)Load((const uint8_t*)p); - } - - template SIMD_INLINE v128_u32 Load(const uint32_t * p) - { - return (v128_u32)Load((const uint8_t*)p); - } - - template SIMD_INLINE v128_f32 Load(const float * p) - { - return (v128_f32)Load((const uint8_t*)p); - } - - template SIMD_INLINE v128_u8 LoadMaskU8(const uint8_t * p, v128_u8 index) - { - return (v128_u8)vec_cmpeq(Load(p), index); - } - - template SIMD_INLINE v128_u8 LoadBeforeFirst(v128_u8 first); - - template <> SIMD_INLINE v128_u8 LoadBeforeFirst<1>(v128_u8 first) - { - return vec_perm(first, first, K8_PERM_LOAD_BEFORE_FIRST_1); - } - - template <> SIMD_INLINE v128_u8 LoadBeforeFirst<2>(v128_u8 first) - { - return vec_perm(first, first, K8_PERM_LOAD_BEFORE_FIRST_2); - } - - template <> SIMD_INLINE v128_u8 LoadBeforeFirst<3>(v128_u8 first) - { - return vec_perm(first, first, K8_PERM_LOAD_BEFORE_FIRST_3); - } - - template <> SIMD_INLINE v128_u8 LoadBeforeFirst<4>(v128_u8 first) - { - return vec_perm(first, first, K8_PERM_LOAD_BEFORE_FIRST_4); - } - - template SIMD_INLINE v128_u8 LoadAfterLast(v128_u8 last); - - template <> SIMD_INLINE v128_u8 LoadAfterLast<1>(v128_u8 last) - { - return vec_perm(last, last, K8_PERM_LOAD_AFTER_LAST_1); - } - - template <> SIMD_INLINE v128_u8 LoadAfterLast<2>(v128_u8 last) - { - return vec_perm(last, last, K8_PERM_LOAD_AFTER_LAST_2); - } - - template <> SIMD_INLINE v128_u8 LoadAfterLast<3>(v128_u8 last) - { - return vec_perm(last, last, K8_PERM_LOAD_AFTER_LAST_3); - } - - template <> SIMD_INLINE v128_u8 LoadAfterLast<4>(v128_u8 last) - { - return vec_perm(last, last, K8_PERM_LOAD_AFTER_LAST_4); - } - - template struct Loader; - - template <> struct Loader - { - template Loader(const T * 
ptr) - :_ptr((const uint8_t*)ptr) - { - } - - SIMD_INLINE v128_u8 First() const - { - return vec_ld(0, _ptr); - } - - SIMD_INLINE v128_u8 Next() const - { - _ptr += A; - return vec_ld(0, _ptr); - } - - private: - mutable const uint8_t * _ptr; - }; - - template <> struct Loader - { - template SIMD_INLINE Loader(const T * ptr) - :_ptr((const uint8_t*)ptr) - { - _perm = vec_lvsl(0, _ptr); - } - - SIMD_INLINE v128_u8 First() const - { - return vec_perm(vec_ld(0, _ptr), vec_ld(A, _ptr), _perm); - } - - SIMD_INLINE v128_u8 Next() const - { - _ptr += A; - return vec_perm(vec_ld(0, _ptr), vec_ld(A, _ptr), _perm); - } - - private: - mutable const uint8_t * _ptr; - v128_u8 _perm; - }; - - template v128_u8 Load(const Loader & loader); - - template <> SIMD_INLINE v128_u8 Load(const Loader & loader) - { - return loader.First(); - } - - template <> SIMD_INLINE v128_u8 Load(const Loader & loader) - { - return loader.First(); - } - - template <> SIMD_INLINE v128_u8 Load(const Loader & loader) - { - return loader.Next(); - } - - template <> SIMD_INLINE v128_u8 Load(const Loader & loader) - { - return loader.Next(); - } - - template SIMD_INLINE void LoadNose3(const uint8_t * p, v128_u8 a[3]) - { - a[1] = Load(p); - a[0] = LoadBeforeFirst(a[1]); - a[2] = Load(p + step); - } - - template SIMD_INLINE void LoadBody3(const uint8_t * p, v128_u8 a[3]) - { - a[0] = Load(p - step); - a[1] = Load(p); - a[2] = Load(p + step); - } - - template SIMD_INLINE void LoadTail3(const uint8_t * p, v128_u8 a[3]) - { - a[0] = Load(p - step); - a[1] = Load(p); - a[2] = LoadAfterLast(a[1]); - } - - template SIMD_INLINE void LoadNose5(const uint8_t * p, v128_u8 a[5]) - { - a[2] = Load(p); - a[1] = LoadBeforeFirst(a[2]); - a[0] = LoadBeforeFirst(a[1]); - a[3] = Load(p + step); - a[4] = Load(p + 2 * step); - } - - template SIMD_INLINE void LoadBody5(const uint8_t * p, v128_u8 a[5]) - { - a[0] = Load(p - 2 * step); - a[1] = Load(p - step); - a[2] = Load(p); - a[3] = Load(p + step); - a[4] = Load(p + 2 * step); - } - - template SIMD_INLINE void LoadTail5(const uint8_t * p, v128_u8 a[5]) - { - a[0] = Load(p - 2 * step); - a[1] = Load(p - step); - a[2] = Load(p); - a[3] = LoadAfterLast(a[2]); - a[4] = LoadAfterLast(a[3]); - } - - template v128_u16 UnpackU8(v128_u8 a, v128_u8 b = K8_00); - - template <> SIMD_INLINE v128_u16 UnpackU8<0>(v128_u8 a, v128_u8 b) - { - return (v128_u16)vec_perm(a, b, K8_PERM_UNPACK_LO_U8); - } - - template <> SIMD_INLINE v128_u16 UnpackU8<1>(v128_u8 a, v128_u8 b) - { - return (v128_u16)vec_perm(a, b, K8_PERM_UNPACK_HI_U8); - } - - SIMD_INLINE v128_u16 UnpackLoU8(v128_u8 a, v128_u8 b = K8_00) - { - return (v128_u16)vec_perm(a, b, K8_PERM_UNPACK_LO_U8); - } - - SIMD_INLINE v128_u16 UnpackHiU8(v128_u8 a, v128_u8 b = K8_00) - { - return (v128_u16)vec_perm(a, b, K8_PERM_UNPACK_HI_U8); - } - - SIMD_INLINE v128_u32 UnpackLoU16(v128_u16 a, v128_u16 b = K16_0000) - { - return (v128_u32)vec_perm(a, b, K8_PERM_UNPACK_LO_U16); - } - - SIMD_INLINE v128_u32 UnpackHiU16(v128_u16 a, v128_u16 b = K16_0000) - { - return (v128_u32)vec_perm(a, b, K8_PERM_UNPACK_HI_U16); - } - } -#endif//SIMD_VMX_ENABLE - -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - template SIMD_INLINE uint8x16_t Load(const uint8_t * p); - - template <> SIMD_INLINE uint8x16_t Load(const uint8_t * p) - { -#if defined(__GNUC__) && SIMD_NEON_PREFECH_SIZE - __builtin_prefetch(p + SIMD_NEON_PREFECH_SIZE); -#endif - return vld1q_u8(p); - } - - template <> SIMD_INLINE uint8x16_t Load(const uint8_t * p) - { -#if defined(__GNUC__) -#if SIMD_NEON_PREFECH_SIZE - 
__builtin_prefetch(p + SIMD_NEON_PREFECH_SIZE); -#endif - uint8_t * _p = (uint8_t *)__builtin_assume_aligned(p, 16); - return vld1q_u8(_p); -#elif defined(_MSC_VER) - return vld1q_u8_ex(p, 128); -#else - return vld1q_u8(p); -#endif - } - - template SIMD_INLINE int8x16_t Load(const int8_t* p) - { - return (int8x16_t)Load((const uint8_t*)p); - } - - template SIMD_INLINE int16x8_t Load(const int16_t * p) - { - return (int16x8_t)Load((const uint8_t*)p); - } - - template SIMD_INLINE uint16x8_t Load(const uint16_t * p) - { - return (uint16x8_t)Load((const uint8_t*)p); - } - - template SIMD_INLINE int32x4_t Load(const int32_t * p) - { - return (int32x4_t)Load((const uint8_t*)p); - } - - template SIMD_INLINE uint32x4_t Load(const uint32_t * p) - { - return (uint32x4_t)Load((const uint8_t*)p); - } - - template SIMD_INLINE float32x4_t Load(const float * p); - - template <> SIMD_INLINE float32x4_t Load(const float * p) - { -#if defined(__GNUC__) && SIMD_NEON_PREFECH_SIZE - __builtin_prefetch(p + SIMD_NEON_PREFECH_SIZE); -#endif - return vld1q_f32(p); - } - - template <> SIMD_INLINE float32x4_t Load(const float * p) - { -#if defined(__GNUC__) -#if SIMD_NEON_PREFECH_SIZE - __builtin_prefetch(p + SIMD_NEON_PREFECH_SIZE); -#endif - float * _p = (float *)__builtin_assume_aligned(p, 16); - return vld1q_f32(_p); -#elif defined(_MSC_VER) - return vld1q_f32_ex(p, 128); -#else - return vld1q_f32(p); -#endif - } - - template SIMD_INLINE uint8x16x2_t Load2(const uint8_t * p); - - template <> SIMD_INLINE uint8x16x2_t Load2(const uint8_t * p) - { -#if defined(__GNUC__) && SIMD_NEON_PREFECH_SIZE - __builtin_prefetch(p + SIMD_NEON_PREFECH_SIZE); -#endif - return vld2q_u8(p); - } - - template <> SIMD_INLINE uint8x16x2_t Load2(const uint8_t * p) - { -#if defined(__GNUC__) -#if SIMD_NEON_PREFECH_SIZE - __builtin_prefetch(p + SIMD_NEON_PREFECH_SIZE); -#endif - uint8_t * _p = (uint8_t *)__builtin_assume_aligned(p, 16); - return vld2q_u8(_p); -#elif defined(_MSC_VER) - return vld2q_u8_ex(p, 128); -#else - return vld2q_u8(p); -#endif - } - - template SIMD_INLINE uint16x8x2_t Load2(const uint16_t * p); - - template <> SIMD_INLINE uint16x8x2_t Load2(const uint16_t * p) - { -#if defined(__GNUC__) && SIMD_NEON_PREFECH_SIZE - __builtin_prefetch(p + SIMD_NEON_PREFECH_SIZE); -#endif - return vld2q_u16(p); - } - - template <> SIMD_INLINE uint16x8x2_t Load2(const uint16_t * p) - { -#if defined(__GNUC__) -#if SIMD_NEON_PREFECH_SIZE - __builtin_prefetch(p + SIMD_NEON_PREFECH_SIZE); -#endif - uint16_t * _p = (uint16_t *)__builtin_assume_aligned(p, 16); - return vld2q_u16(_p); -#elif defined(_MSC_VER) - return vld2q_u16_ex(p, 128); -#else - return vld2q_u16(p); -#endif - } - - template SIMD_INLINE uint8x16x3_t Load3(const uint8_t * p); - - template <> SIMD_INLINE uint8x16x3_t Load3(const uint8_t * p) - { -#if defined(__GNUC__) && SIMD_NEON_PREFECH_SIZE - __builtin_prefetch(p + SIMD_NEON_PREFECH_SIZE); -#endif - return vld3q_u8(p); - } - - template <> SIMD_INLINE uint8x16x3_t Load3(const uint8_t * p) - { -#if defined(__GNUC__) -#if SIMD_NEON_PREFECH_SIZE - __builtin_prefetch(p + SIMD_NEON_PREFECH_SIZE); -#endif - uint8_t * _p = (uint8_t *)__builtin_assume_aligned(p, 16); - return vld3q_u8(_p); -#elif defined(_MSC_VER) - return vld3q_u8_ex(p, 128); -#else - return vld3q_u8(p); -#endif - } - - template SIMD_INLINE uint8x16x4_t Load4(const uint8_t * p); - - template <> SIMD_INLINE uint8x16x4_t Load4(const uint8_t * p) - { -#if defined(__GNUC__) && SIMD_NEON_PREFECH_SIZE - __builtin_prefetch(p + SIMD_NEON_PREFECH_SIZE); -#endif - return 
vld4q_u8(p); - } - - template <> SIMD_INLINE uint8x16x4_t Load4(const uint8_t * p) - { -#if defined(__GNUC__) -#if SIMD_NEON_PREFECH_SIZE - __builtin_prefetch(p + SIMD_NEON_PREFECH_SIZE); -#endif - uint8_t * _p = (uint8_t *)__builtin_assume_aligned(p, 16); - return vld4q_u8(_p); -#elif defined(_MSC_VER) - return vld4q_u8_ex(p, 128); -#else - return vld4q_u8(p); -#endif - } - - template SIMD_INLINE float32x4x2_t Load2(const float * p); - - template <> SIMD_INLINE float32x4x2_t Load2(const float * p) - { -#if defined(__GNUC__) && SIMD_NEON_PREFECH_SIZE - __builtin_prefetch(p + SIMD_NEON_PREFECH_SIZE); -#endif - return vld2q_f32(p); - } - - template <> SIMD_INLINE float32x4x2_t Load2(const float * p) - { -#if defined(__GNUC__) -#if SIMD_NEON_PREFECH_SIZE - __builtin_prefetch(p + SIMD_NEON_PREFECH_SIZE); -#endif - float * _p = (float *)__builtin_assume_aligned(p, 16); - return vld2q_f32(_p); -#elif defined(_MSC_VER) - return vld2q_f32_ex(p, 128); -#else - return vld2q_f32(p); -#endif - } - - template SIMD_INLINE float32x4x3_t Load3(const float * p); - - template <> SIMD_INLINE float32x4x3_t Load3(const float * p) - { -#if defined(__GNUC__) && SIMD_NEON_PREFECH_SIZE - __builtin_prefetch(p + SIMD_NEON_PREFECH_SIZE); -#endif - return vld3q_f32(p); - } - - template <> SIMD_INLINE float32x4x3_t Load3(const float * p) - { -#if defined(__GNUC__) -#if SIMD_NEON_PREFECH_SIZE - __builtin_prefetch(p + SIMD_NEON_PREFECH_SIZE); -#endif - float * _p = (float *)__builtin_assume_aligned(p, 16); - return vld3q_f32(_p); -#elif defined(_MSC_VER) - return vld3q_f32_ex(p, 128); -#else - return vld3q_f32(p); -#endif - } - - template SIMD_INLINE float32x4x4_t Load4(const float * p); - - template <> SIMD_INLINE float32x4x4_t Load4(const float * p) - { -#if defined(__GNUC__) && SIMD_NEON_PREFECH_SIZE - __builtin_prefetch(p + SIMD_NEON_PREFECH_SIZE); -#endif - return vld4q_f32(p); - } - - template <> SIMD_INLINE float32x4x4_t Load4(const float * p) - { -#if defined(__GNUC__) -#if SIMD_NEON_PREFECH_SIZE - __builtin_prefetch(p + SIMD_NEON_PREFECH_SIZE); -#endif - float * _p = (float *)__builtin_assume_aligned(p, 16); - return vld4q_f32(_p); -#elif defined(_MSC_VER) - return vld4q_f32_ex(p, 128); -#else - return vld4q_f32(p); -#endif - } - - template SIMD_INLINE uint8x8_t LoadHalf(const uint8_t * p); - - template <> SIMD_INLINE uint8x8_t LoadHalf(const uint8_t * p) - { -#if defined(__GNUC__) && SIMD_NEON_PREFECH_SIZE - __builtin_prefetch(p + SIMD_NEON_PREFECH_SIZE); -#endif - return vld1_u8(p); - } - - template <> SIMD_INLINE uint8x8_t LoadHalf(const uint8_t * p) - { -#if defined(__GNUC__) -#if SIMD_NEON_PREFECH_SIZE - __builtin_prefetch(p + SIMD_NEON_PREFECH_SIZE); -#endif - uint8_t * _p = (uint8_t *)__builtin_assume_aligned(p, 8); - return vld1_u8(_p); -#elif defined(_MSC_VER) - return vld1_u8_ex(p, 64); -#else - return vld1_u8(p); -#endif - } - - template SIMD_INLINE uint16x4_t LoadHalf(const uint16_t * p) - { - return (uint16x4_t)LoadHalf((const uint8_t*)p); - } - - template SIMD_INLINE float32x2_t LoadHalf(const float * p); - - template <> SIMD_INLINE float32x2_t LoadHalf(const float * p) - { -#if defined(__GNUC__) && SIMD_NEON_PREFECH_SIZE - __builtin_prefetch(p + SIMD_NEON_PREFECH_SIZE); -#endif - return vld1_f32(p); - } - - template <> SIMD_INLINE float32x2_t LoadHalf(const float * p) - { -#if defined(__GNUC__) -#if SIMD_NEON_PREFECH_SIZE - __builtin_prefetch(p + SIMD_NEON_PREFECH_SIZE); -#endif - float * _p = (float *)__builtin_assume_aligned(p, 8); - return vld1_f32(_p); -#elif defined(_MSC_VER) - return 
-        template <bool align> SIMD_INLINE uint8x8_t LoadHalf(const uint8_t * p);
-
-        template <> SIMD_INLINE uint8x8_t LoadHalf<false>(const uint8_t * p)
-        {
-#if defined(__GNUC__) && SIMD_NEON_PREFECH_SIZE
-            __builtin_prefetch(p + SIMD_NEON_PREFECH_SIZE);
-#endif
-            return vld1_u8(p);
-        }
-
-        template <> SIMD_INLINE uint8x8_t LoadHalf<true>(const uint8_t * p)
-        {
-#if defined(__GNUC__)
-#if SIMD_NEON_PREFECH_SIZE
-            __builtin_prefetch(p + SIMD_NEON_PREFECH_SIZE);
-#endif
-            uint8_t * _p = (uint8_t *)__builtin_assume_aligned(p, 8);
-            return vld1_u8(_p);
-#elif defined(_MSC_VER)
-            return vld1_u8_ex(p, 64);
-#else
-            return vld1_u8(p);
-#endif
-        }
-
-        template <bool align> SIMD_INLINE uint16x4_t LoadHalf(const uint16_t * p)
-        {
-            return (uint16x4_t)LoadHalf<align>((const uint8_t*)p);
-        }
-
-        template <bool align> SIMD_INLINE float32x2_t LoadHalf(const float * p);
-
-        template <> SIMD_INLINE float32x2_t LoadHalf<false>(const float * p)
-        {
-#if defined(__GNUC__) && SIMD_NEON_PREFECH_SIZE
-            __builtin_prefetch(p + SIMD_NEON_PREFECH_SIZE);
-#endif
-            return vld1_f32(p);
-        }
-
-        template <> SIMD_INLINE float32x2_t LoadHalf<true>(const float * p)
-        {
-#if defined(__GNUC__)
-#if SIMD_NEON_PREFECH_SIZE
-            __builtin_prefetch(p + SIMD_NEON_PREFECH_SIZE);
-#endif
-            float * _p = (float *)__builtin_assume_aligned(p, 8);
-            return vld1_f32(_p);
-#elif defined(_MSC_VER)
-            return vld1_f32_ex(p, 64);
-#else
-            return vld1_f32(p);
-#endif
-        }
-
-        template <bool align> SIMD_INLINE uint8x8x2_t LoadHalf2(const uint8_t * p);
-
-        template <> SIMD_INLINE uint8x8x2_t LoadHalf2<false>(const uint8_t * p)
-        {
-#if defined(__GNUC__) && SIMD_NEON_PREFECH_SIZE
-            __builtin_prefetch(p + SIMD_NEON_PREFECH_SIZE);
-#endif
-            return vld2_u8(p);
-        }
-
-        template <> SIMD_INLINE uint8x8x2_t LoadHalf2<true>(const uint8_t * p)
-        {
-#if defined(__GNUC__)
-#if SIMD_NEON_PREFECH_SIZE
-            __builtin_prefetch(p + SIMD_NEON_PREFECH_SIZE);
-#endif
-            uint8_t * _p = (uint8_t *)__builtin_assume_aligned(p, 8);
-            return vld2_u8(_p);
-#elif defined(_MSC_VER)
-            return vld2_u8_ex(p, 64);
-#else
-            return vld2_u8(p);
-#endif
-        }
-
-        template <bool align> SIMD_INLINE uint8x8x3_t LoadHalf3(const uint8_t * p);
-
-        template <> SIMD_INLINE uint8x8x3_t LoadHalf3<false>(const uint8_t * p)
-        {
-#if defined(__GNUC__) && SIMD_NEON_PREFECH_SIZE
-            __builtin_prefetch(p + SIMD_NEON_PREFECH_SIZE);
-#endif
-            return vld3_u8(p);
-        }
-
-        template <> SIMD_INLINE uint8x8x3_t LoadHalf3<true>(const uint8_t * p)
-        {
-#if defined(__GNUC__)
-#if SIMD_NEON_PREFECH_SIZE
-            __builtin_prefetch(p + SIMD_NEON_PREFECH_SIZE);
-#endif
-            uint8_t * _p = (uint8_t *)__builtin_assume_aligned(p, 8);
-            return vld3_u8(_p);
-#elif defined(_MSC_VER)
-            return vld3_u8_ex(p, 64);
-#else
-            return vld3_u8(p);
-#endif
-        }
-
-        template <bool align> SIMD_INLINE uint8x8x4_t LoadHalf4(const uint8_t * p);
-
-        template <> SIMD_INLINE uint8x8x4_t LoadHalf4<false>(const uint8_t * p)
-        {
-#if defined(__GNUC__) && SIMD_NEON_PREFECH_SIZE
-            __builtin_prefetch(p + SIMD_NEON_PREFECH_SIZE);
-#endif
-            return vld4_u8(p);
-        }
-
-        template <> SIMD_INLINE uint8x8x4_t LoadHalf4<true>(const uint8_t * p)
-        {
-#if defined(__GNUC__)
-#if SIMD_NEON_PREFECH_SIZE
-            __builtin_prefetch(p + SIMD_NEON_PREFECH_SIZE);
-#endif
-            uint8_t * _p = (uint8_t *)__builtin_assume_aligned(p, 8);
-            return vld4_u8(_p);
-#elif defined(_MSC_VER)
-            return vld4_u8_ex(p, 64);
-#else
-            return vld4_u8(p);
-#endif
-        }
-
-        template <bool align> SIMD_INLINE float32x2x4_t LoadHalf4(const float * p);
-
-        template <> SIMD_INLINE float32x2x4_t LoadHalf4<false>(const float * p)
-        {
-#if defined(__GNUC__) && SIMD_NEON_PREFECH_SIZE
-            __builtin_prefetch(p + SIMD_NEON_PREFECH_SIZE);
-#endif
-            return vld4_f32(p);
-        }
-
-        template <> SIMD_INLINE float32x2x4_t LoadHalf4<true>(const float * p)
-        {
-#if defined(__GNUC__)
-#if SIMD_NEON_PREFECH_SIZE
-            __builtin_prefetch(p + SIMD_NEON_PREFECH_SIZE);
-#endif
-            float * _p = (float *)__builtin_assume_aligned(p, 8);
-            return vld4_f32(_p);
-#elif defined(_MSC_VER)
-            return vld4_f32_ex(p, 64);
-#else
-            return vld4_f32(p);
-#endif
-        }
-
-        template <size_t count> SIMD_INLINE uint8x16_t LoadBeforeFirst(uint8x16_t first)
-        {
-            return vextq_u8(vextq_u8(first, first, count), first, 16 - count);
-        }
-
-        template <size_t count> SIMD_INLINE uint8x16_t LoadAfterLast(uint8x16_t last)
-        {
-            return vextq_u8(last, vextq_u8(last, last, 16 - count), count);
-        }
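`LoadBeforeFirst`/`LoadAfterLast` synthesize out-of-bounds neighbors by replicating the first (or last) `count` bytes of an edge vector with two `vextq_u8` steps, instead of reading past the row. A trace for count = 1:

    // first                        = | p0 p1 p2 ... p15 |
    // vextq_u8(first, first, 1)    = | p1 ... p15 p0 |      (rotate left by one byte)
    // vextq_u8(rotated, first, 15) = | p0 p0 p1 ... p14 |   (take last byte, then p0..p14)
    // i.e. the row is virtually extended one pixel to the left with a copy of p0:
    uint8x16_t left = LoadBeforeFirst<1>(Load<false>(row)); // a safe "row[-1..14]"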
-        template <bool align, size_t step> SIMD_INLINE void LoadNose3(const uint8_t * p, uint8x16_t a[3])
-        {
-            a[1] = Load<align>(p);
-            a[0] = LoadBeforeFirst<step>(a[1]);
-            a[2] = vld1q_u8(p + step);
-        }
-
-        template <bool align, size_t step> SIMD_INLINE void LoadBody3(const uint8_t * p, uint8x16_t a[3])
-        {
-#if defined(__GNUC__) && SIMD_NEON_PREFECH_SIZE
-            __builtin_prefetch(p + SIMD_NEON_PREFECH_SIZE);
-#endif
-            a[0] = vld1q_u8(p - step);
-            a[1] = Load<align>(p);
-            a[2] = vld1q_u8(p + step);
-        }
-
-        template <bool align, size_t step> SIMD_INLINE void LoadTail3(const uint8_t * p, uint8x16_t a[3])
-        {
-            a[0] = vld1q_u8(p - step);
-            a[1] = Load<align>(p);
-            a[2] = LoadAfterLast<step>(a[1]);
-        }
-
-        template <bool align, size_t step> SIMD_INLINE void LoadNose5(const uint8_t * p, uint8x16_t a[5])
-        {
-            a[2] = Load<align>(p);
-            a[1] = LoadBeforeFirst<step>(a[2]);
-            a[0] = LoadBeforeFirst<step>(a[1]);
-            a[3] = vld1q_u8(p + step);
-            a[4] = vld1q_u8(p + 2 * step);
-        }
-
-        template <bool align, size_t step> SIMD_INLINE void LoadBody5(const uint8_t * p, uint8x16_t a[5])
-        {
-#if defined(__GNUC__) && SIMD_NEON_PREFECH_SIZE
-            __builtin_prefetch(p + SIMD_NEON_PREFECH_SIZE);
-#endif
-            a[0] = vld1q_u8(p - 2 * step);
-            a[1] = vld1q_u8(p - step);
-            a[2] = Load<align>(p);
-            a[3] = vld1q_u8(p + step);
-            a[4] = vld1q_u8(p + 2 * step);
-        }
-
-        template <bool align, size_t step> SIMD_INLINE void LoadTail5(const uint8_t * p, uint8x16_t a[5])
-        {
-            a[0] = vld1q_u8(p - 2 * step);
-            a[1] = vld1q_u8(p - step);
-            a[2] = Load<align>(p);
-            a[3] = LoadAfterLast<step>(a[2]);
-            a[4] = LoadAfterLast<step>(a[3]);
-        }
-
-        SIMD_INLINE void LoadNoseDx(const uint8_t * p, uint8x16_t a[3])
-        {
-            a[0] = LoadBeforeFirst<1>(vld1q_u8(p));
-            a[2] = vld1q_u8(p + 1);
-        }
-
-        SIMD_INLINE void LoadBodyDx(const uint8_t * p, uint8x16_t a[3])
-        {
-#if defined(__GNUC__) && SIMD_NEON_PREFECH_SIZE
-            __builtin_prefetch(p + SIMD_NEON_PREFECH_SIZE);
-#endif
-            a[0] = vld1q_u8(p - 1);
-            a[2] = vld1q_u8(p + 1);
-        }
-
-        SIMD_INLINE void LoadTailDx(const uint8_t * p, uint8x16_t a[3])
-        {
-            a[0] = vld1q_u8(p - 1);
-            a[2] = LoadAfterLast<1>(vld1q_u8(p));
-        }
-
-        template <size_t count> SIMD_INLINE uint8x8_t LoadBeforeFirst(uint8x8_t first)
-        {
-            return vext_u8(vext_u8(first, first, count), first, 8 - count);
-        }
-
-        template <size_t count> SIMD_INLINE uint8x8_t LoadAfterLast(uint8x8_t last)
-        {
-            return vext_u8(last, vext_u8(last, last, 8 - count), count);
-        }
-
-        SIMD_INLINE float32x4_t Load(const float * p0, const float * p1)
-        {
-            return vcombine_f32(vld1_f32(p0), vld1_f32(p1));
-        }
-
-        SIMD_INLINE float32x4_t LoadPadZeroNose1(const float * p)
-        {
-            return vextq_f32(vdupq_n_f32(0.0f), Load<false>(p + 1), 3);
-        }
-
-        SIMD_INLINE float32x4_t LoadPadZeroTail1(const float * p)
-        {
-            return vextq_f32(Load<false>(p - 1), vdupq_n_f32(0.0f), 1);
-        }
-
-        SIMD_INLINE float32x4_t LoadPadZeroTail2(const float * p)
-        {
-            return vextq_f32(Load<false>(p - 2), vdupq_n_f32(0.0f), 2);
-        }
-    }
-#endif//SIMD_NEON_ENABLE
-
-#ifdef SIMD_MSA_ENABLE
-    namespace Msa
-    {
-        template <bool align> SIMD_INLINE v16u8 Load(const uint8_t * p);
-
-        template <> SIMD_INLINE v16u8 Load<false>(const uint8_t * p)
-        {
-            return (v16u8)__msa_ld_b((v16i8*)p, 0);
-        }
-
-        template <> SIMD_INLINE v16u8 Load<true>(const uint8_t * p)
-        {
-            return (v16u8)__msa_ld_b((v16i8*)p, 0);
-        }
-
-        template <bool align> SIMD_INLINE v8i16 Load(const int16_t * p);
-
-        template <> SIMD_INLINE v8i16 Load<false>(const int16_t * p)
-        {
-            return __msa_ld_h((v8i16*)p, 0);
-        }
-
-        template <> SIMD_INLINE v8i16 Load<true>(const int16_t * p)
-        {
-            return __msa_ld_h((v8i16*)p, 0);
-        }
-    }
-#endif//SIMD_MSA_ENABLE
-}
-#endif//__SimdLoad_h__
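The Nose/Body/Tail triples above encode a common convolution idiom: handle the left border once, run the unchecked middle, handle the right border once, so the hot loop carries no edge branches. A sketch of a 3-tap horizontal sum over one row; the helper is hypothetical, the `<align, step>` argument order follows the reconstruction above, and it assumes `width` is a multiple of 16 with at least two vectors per row:

    inline void BoxSum3x1(const uint8_t * src, size_t width, uint16_t * sum)
    {
        uint8x16_t a[3];
        for (size_t col = 0; col < width; col += 16)
        {
            if (col == 0)
                LoadNose3<false, 1>(src, a);       // left edge replicated
            else if (col + 16 < width)
                LoadBody3<false, 1>(src + col, a); // interior: plain loads
            else
                LoadTail3<false, 1>(src + col, a); // right edge replicated
            uint16x8_t lo = vaddw_u8(vaddl_u8(vget_low_u8(a[0]), vget_low_u8(a[1])), vget_low_u8(a[2]));
            uint16x8_t hi = vaddw_u8(vaddl_u8(vget_high_u8(a[0]), vget_high_u8(a[1])), vget_high_u8(a[2]));
            vst1q_u16(sum + col, lo);
            vst1q_u16(sum + col + 8, hi);
        }
    }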
diff --git a/src/3rd/Simd/Simd/SimdLog.h b/src/3rd/Simd/Simd/SimdLog.h deleted file mode 100644 index 25cce2e8..00000000 --- a/src/3rd/Simd/Simd/SimdLog.h +++ /dev/null @@ -1,269 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#ifndef __SimdLog_h__
-#define __SimdLog_h__
-
-#include "Simd/SimdConst.h"
-#include "Simd/SimdArray.h"
-
-#ifdef SIMD_LOG_ENABLE
-#include <iostream>
-#include <iomanip>
-
-namespace Simd
-{
-    template <class T> SIMD_INLINE void Log(const T * data, size_t size, const std::string & name)
-    {
-        std::cout << name.c_str() << " = { ";
-        for (size_t i = 0; i < size; i++)
-        {
-            std::cout << int(data[i]) << " ";
-        }
-        std::cout << "} " << std::endl << std::flush;
-    }
-
-    template<> SIMD_INLINE void Log<float>(const float * data, size_t size, const std::string & name)
-    {
-        std::cout << name.c_str() << " = { " << std::setprecision(3) << std::fixed;
-        for (size_t i = 0; i < size; i++)
-        {
-            std::cout << data[i] << " ";
-        }
-        std::cout << "} " << std::endl << std::flush;
-    }
-
-    template <class T> SIMD_INLINE void Log(const Array<T> & array, const std::string & name)
-    {
-        Log(array.data, array.size, name);
-    }
-
-#ifdef SIMD_SSE_ENABLE
-    namespace Sse
-    {
-        SIMD_INLINE void Log(const __m128 & value, const std::string & name)
-        {
-            float buffer[F];
-            _mm_storeu_ps(buffer, value);
-            Simd::Log(buffer, F, name);
-        }
-    }
-#endif //SIMD_SSE_ENABLE
-
-#ifdef SIMD_SSE2_ENABLE
-    namespace Sse2
-    {
-        template <class T> SIMD_INLINE void Log(const __m128i & value, const std::string & name)
-        {
-            const size_t n = sizeof(__m128i) / sizeof(T);
-            T buffer[n];
-            _mm_storeu_si128((__m128i*)buffer, value);
-            Simd::Log(buffer, n, name);
-        }
-    }
-#endif //SIMD_SSE2_ENABLE
-
-#ifdef SIMD_SSE41_ENABLE
-    namespace Sse41
-    {
-        using namespace Sse;
-    }
-#endif //SIMD_SSE41_ENABLE
-
-#ifdef SIMD_AVX_ENABLE
-    namespace Avx
-    {
-        SIMD_INLINE void Log(const __m256 & value, const std::string & name)
-        {
-            float buffer[F];
-            _mm256_storeu_ps(buffer, value);
-            Simd::Log(buffer, F, name);
-        }
-    }
-#endif //SIMD_AVX_ENABLE
-
-#ifdef SIMD_AVX2_ENABLE
-    namespace Avx2
-    {
-        using Avx::Log;
-
-        template <class T> SIMD_INLINE void Log(const __m256i & value, const std::string & name)
-        {
-            const size_t n = sizeof(__m256i) / sizeof(T);
-            T buffer[n];
-            _mm256_storeu_si256((__m256i*)buffer, value);
-            Simd::Log(buffer, n, name);
-        }
-    }
-#endif //SIMD_AVX2_ENABLE
-
-#ifdef SIMD_AVX512F_ENABLE
-    namespace Avx512f
-    {
-        SIMD_INLINE void Log(const __m512 & value, const std::string & name)
-        {
-            float buffer[F];
-            _mm512_storeu_ps(buffer, value);
-            Simd::Log(buffer, F, name);
-        }
-    }
-#endif //SIMD_AVX512F_ENABLE
-
-#ifdef SIMD_VMX_ENABLE
-    namespace Vmx
-    {
-        SIMD_INLINE void Log(const v128_u8 & value, const std::string & name)
-        {
-            std::cout << name << " = { ";
-            for (int i = 0; i < 16; i++)
-            {
-                int element = vec_extract(value, i);
-                std::cout << element << " ";
-            }
-            std::cout << "} " << std::endl;
-        }
-
-        SIMD_INLINE void Log(const v128_u16 & value, const std::string & name)
-        {
-            std::cout << name << " = { ";
-            for (int i = 0; i < 8; i++)
-            {
-                int element = vec_extract(value, i);
-                std::cout << element << " ";
-            }
-            std::cout << "} " << std::endl;
-        }
-
-        SIMD_INLINE void Log(const v128_s16 & value, const std::string & name)
-        {
-            std::cout << name << " = { ";
-            for (int i = 0; i < 8; i++)
-            {
-                int element =
vec_extract(value, i); - std::cout << element << " "; - } - std::cout << "} " << std::endl; - } - - SIMD_INLINE void Log(const v128_u32 & value, const std::string & name) - { - std::cout << name << " = { "; - for (int i = 0; i < 4; i++) - { - int element = vec_extract(value, i); - std::cout << element << " "; - } - std::cout << "} " << std::endl; - } - - SIMD_INLINE void Log(const v128_f32 & value, const std::string & name) - { - std::cout << name << " = { "; - for (int i = 0; i < 4; i++) - { - float element = vec_extract(value, i); - std::cout << element << " "; - } - std::cout << "} " << std::endl; - } - } -#endif//SIMD_VMX_ENABLE - -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - SIMD_INLINE void Log(const uint8x16_t & value, const std::string & name) - { - uint8_t buffer[16]; - vst1q_u8(buffer, value); - Simd::Log(buffer, 16, name); - } - - SIMD_INLINE void Log(const uint16x8_t & value, const std::string & name) - { - uint16_t buffer[8]; - vst1q_u16(buffer, value); - Simd::Log(buffer, 8, name); - } - - SIMD_INLINE void Log(const int16x8_t & value, const std::string & name) - { - int16_t buffer[8]; - vst1q_s16(buffer, value); - Simd::Log(buffer, 8, name); - } - - SIMD_INLINE void Log(const uint32x4_t & value, const std::string & name) - { - uint32_t buffer[4]; - vst1q_u32(buffer, value); - Simd::Log(buffer, 4, name); - } - - SIMD_INLINE void Log(const int32x4_t & value, const std::string & name) - { - int32_t buffer[4]; - vst1q_s32(buffer, value); - Simd::Log(buffer, 4, name); - } - - SIMD_INLINE void Log(const float32x4_t & value, const std::string & name) - { - float buffer[4]; - vst1q_f32(buffer, value); - std::cout << name << " = { "; - for (int i = 0; i < 4; i++) - std::cout << buffer[i] << " "; - std::cout << "} " << std::endl; - } - } -#endif//SIMD_NEON_ENABLE -} - -#define SIMD_LOG(value) Log(value, #value) -#define SIMD_LOG1(value) Log(value, #value) -#define SIMD_LOG2(value) Log(value, #value) -#define SIMD_LOG4(value) Log(value, #value) - -#define SIMD_LOG_SS(message) \ -{ \ - std::cout << __FUNCTION__ << " : " << message << std::endl; \ - std::cout.flush(); \ -} - -#define SIMD_LOG_LINE() std::cout << __FUNCTION__ << " : " << __LINE__ << std::endl << std::flush; - -#else//SIMD_LOG_ENABLE - -#define SIMD_LOG(value) -#define SIMD_LOG1(value) -#define SIMD_LOG2(value) -#define SIMD_LOG4(value) - -#define SIMD_LOG_SS(message) - -#define SIMD_LOG_LINE() - -#endif//SIMD_LOG_ENABLE - -#endif//__SimdLog_h__ diff --git a/src/3rd/Simd/Simd/SimdMath.h b/src/3rd/Simd/Simd/SimdMath.h deleted file mode 100644 index 8ea43027..00000000 --- a/src/3rd/Simd/Simd/SimdMath.h +++ /dev/null @@ -1,1720 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar, -* 2018-2019 Radchenko Andrey. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. 
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#ifndef __SimdMath_h__
-#define __SimdMath_h__
-
-#include "Simd/SimdDefs.h"
-#include "Simd/SimdConst.h"
-
-namespace Simd
-{
-    template <class T> SIMD_INLINE void Swap(T & a, T & b)
-    {
-        T t = a;
-        a = b;
-        b = t;
-    }
-
-    template <class T> SIMD_INLINE T Min(T a, T b)
-    {
-        return a < b ? a : b;
-    }
-
-    template <class T> SIMD_INLINE T Max(T a, T b)
-    {
-        return a > b ? a : b;
-    }
-
-    template <class T> SIMD_INLINE T Abs(T a)
-    {
-        return a < 0 ? -a : a;
-    }
-
-    template <class T> SIMD_INLINE T RestrictRange(T value, T min, T max)
-    {
-        return Max(min, Min(max, value));
-    }
-
-    template <class T> SIMD_INLINE T Square(T a)
-    {
-        return a*a;
-    }
-
-#ifndef SIMD_ROUND
-#define SIMD_ROUND
-    SIMD_INLINE int Round(double value)
-    {
-#if defined(SIMD_X64_ENABLE)
-        __m128d _value = _mm_set_sd(value);
-        return _mm_cvtsd_si32(_value);
-#else
-        return (int)(value + (value >= 0.0 ? 0.5 : -0.5));
-#endif
-    }
-
-    SIMD_INLINE int Round(float value)
-    {
-#if defined(SIMD_X64_ENABLE) || (defined(SIMD_X86_ENABLE) && !defined(SIMD_SSE_DISABLE))
-        __m128 _value = _mm_set_ss(value);
-        return _mm_cvtss_si32(_value);
-#else
-        return (int)(value + (value >= 0.0f ? 0.5f : -0.5f));
-#endif
-    }
-#endif
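Worth noting about `Round`: on x86/x64 it goes through `_mm_cvtsd_si32`/`_mm_cvtss_si32`, which honor the current rounding mode (round-to-nearest-even by default), while the portable fallback adds ±0.5 and truncates; the two paths can differ on exact ties. A small check, assuming the default MXCSR mode:

    #include <cassert>
    // Under round-to-nearest-even, Round(2.5) == 2 on the SSE path,
    // whereas the scalar fallback (int)(2.5 + 0.5) yields 3.
    void RoundTieDemo()
    {
        assert(Round(1.5)  == 2); // both paths agree
        assert(Round(2.49) == 2); // both paths agree
        // Round(2.5): 2 via cvtsd (ties to even), 3 via the +0.5 fallback.
    }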
-
-    namespace Base
-    {
-        SIMD_INLINE int Min(int a, int b)
-        {
-            return a < b ? a : b;
-        }
-
-        SIMD_INLINE int Max(int a, int b)
-        {
-            return a > b ? a : b;
-        }
-
-        SIMD_INLINE int RestrictRange(int value, int min = 0, int max = 255)
-        {
-            return Max(min, Min(max, value));
-        }
-
-        SIMD_INLINE int Square(int a)
-        {
-            return a*a;
-        }
-
-        SIMD_INLINE int SquaredDifference(int a, int b)
-        {
-            return Square(a - b);
-        }
-
-        SIMD_INLINE int AbsDifference(int a, int b)
-        {
-            return a > b ? a - b : b - a;
-        }
-
-        SIMD_INLINE int Average(int a, int b)
-        {
-            return (a + b + 1) >> 1;
-        }
-
-        SIMD_INLINE int Average(int a, int b, int c, int d)
-        {
-            return (a + b + c + d + 2) >> 2;
-        }
-
-        SIMD_INLINE void SortU8(int & a, int & b)
-        {
-            int d = a - b;
-            int m = ~(d >> 8);
-            b += d & m;
-            a -= d & m;
-        }
-
-        SIMD_INLINE int AbsDifferenceU8(int a, int b)
-        {
-            int d = a - b;
-            int m = d >> 8;
-            return (d & ~m) | (-d & m);
-        }
-
-        SIMD_INLINE int MaxU8(int a, int b)
-        {
-            int d = a - b;
-            int m = ~(d >> 8);
-            return b + (d & m);
-        }
-
-        SIMD_INLINE int MinU8(int a, int b)
-        {
-            int d = a - b;
-            int m = ~(d >> 8);
-            return a - (d & m);
-        }
-
-        SIMD_INLINE int SaturatedSubtractionU8(int a, int b)
-        {
-            int d = a - b;
-            int m = ~(d >> 8);
-            return (d & m);
-        }
-
-        SIMD_INLINE int DivideBy255(int value)
-        {
-            return (value + 1 + (value >> 8)) >> 8;
-        }
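`DivideBy255` replaces an integer division with two adds and two shifts: `(v + 1 + (v >> 8)) >> 8` equals `v / 255` exactly (floor division) for every 16-bit input, which covers any product of two 8-bit values. A brute-force check of that claim:

    #include <cassert>
    // Verify (v + 1 + (v >> 8)) >> 8 == v / 255 for every 16-bit value.
    void CheckDivideBy255()
    {
        for (int v = 0; v < 65536; ++v)
            assert(((v + 1 + (v >> 8)) >> 8) == v / 255);
    }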
-        template <bool compensation> SIMD_INLINE int DivideBy16(int value);
-
-        template <> SIMD_INLINE int DivideBy16<true>(int value)
-        {
-            return (value + 8) >> 4;
-        }
-
-        template <> SIMD_INLINE int DivideBy16<false>(int value)
-        {
-            return value >> 4;
-        }
-
-        template <bool compensation> SIMD_INLINE int GaussianBlur3x3(const uint8_t *s0, const uint8_t *s1, const uint8_t *s2, size_t x0, size_t x1, size_t x2)
-        {
-            return DivideBy16<compensation>(s0[x0] + 2 * s0[x1] + s0[x2] + (s1[x0] + 2 * s1[x1] + s1[x2]) * 2 + s2[x0] + 2 * s2[x1] + s2[x2]);
-        }
-
-        SIMD_INLINE void Reorder16bit(const uint8_t * src, uint8_t * dst)
-        {
-            uint16_t value = *(uint16_t*)src;
-            *(uint16_t*)dst = value >> 8 | value << 8;
-        }
-
-        SIMD_INLINE void Reorder32bit(const uint8_t * src, uint8_t * dst)
-        {
-            uint32_t value = *(uint32_t*)src;
-            *(uint32_t*)dst =
-                (value & 0x000000FF) << 24 | (value & 0x0000FF00) << 8 |
-                (value & 0x00FF0000) >> 8 | (value & 0xFF000000) >> 24;
-        }
-
-        SIMD_INLINE void Reorder64bit(const uint8_t * src, uint8_t * dst)
-        {
-            uint64_t value = *(uint64_t*)src;
-            *(uint64_t*)dst =
-                (value & 0x00000000000000FF) << 56 | (value & 0x000000000000FF00) << 40 |
-                (value & 0x0000000000FF0000) << 24 | (value & 0x00000000FF000000) << 8 |
-                (value & 0x000000FF00000000) >> 8 | (value & 0x0000FF0000000000) >> 24 |
-                (value & 0x00FF000000000000) >> 40 | (value & 0xFF00000000000000) >> 56;
-        }
-
-        SIMD_INLINE float RoughSigmoid(float value) // maximal absolute error 0.002294
-        {
-            float x = ::fabs(value);
-            float x2 = x*x;
-            float e = 1.0f + x + x2*0.5417f + x2*x2*0.1460f;
-            return 1.0f / (1.0f + (value > 0 ? 1.0f / e : e));
-        }
-
-        SIMD_INLINE float RoughSigmoid2(float value) // maximal absolute error 0.001721
-        {
-            float e1 = Simd::Max(1.0f - value*0.0078125f, 0.5f);
-            float e2 = e1*e1;
-            float e4 = e2*e2;
-            float e8 = e4*e4;
-            float e16 = e8*e8;
-            float e32 = e16*e16;
-            float e64 = e32*e32;
-            return 1.0f / (1.0f + e64*e64);
-        }
-
-        SIMD_INLINE float DerivativeSigmoid(float function)
-        {
-            return (1.0f - function)*function;
-        }
-
-        SIMD_INLINE float RoughTanh(float value) // maximal absolute error 0.001514
-        {
-            float x = ::fabs(value);
-            float x2 = x*x;
-            float pe = 1.0f + x + x2*0.5658f + x2*x2*0.1430f;
-            float ne = 1.0f / pe;
-            return (value > 0 ? 1.0f : -1.0f)*(pe - ne) / (pe + ne);
-        }
-
-        SIMD_INLINE float DerivativeTanh(float function)
-        {
-            return (1.0f - function*function);
-        }
-
-        SIMD_INLINE void UpdateWeights(const float * x, size_t offset, float a, float b, float * d, float * w)
-        {
-            float _d = a*d[offset] + b*x[offset];
-            d[offset] = _d;
-            w[offset] += _d;
-        }
-
-        SIMD_INLINE void AdaptiveGradientUpdate(const float * delta, size_t offset, float norm, float alpha, float epsilon, float * gradient, float * weight)
-        {
-            float d = delta[offset] * norm;
-            gradient[offset] += d*d;
-            weight[offset] -= alpha * d / ::sqrt(gradient[offset] + epsilon);
-        }
-    }
-
-#ifdef SIMD_SSE_ENABLE
-    namespace Sse
-    {
-        SIMD_INLINE __m128 Square(__m128 value)
-        {
-            return _mm_mul_ps(value, value);
-        }
-
-        template <bool fast> __m128 Sqrt(__m128 value);
-
-        template<> SIMD_INLINE __m128 Sqrt<false>(__m128 value)
-        {
-            return _mm_sqrt_ps(value);
-        }
-
-        template<> SIMD_INLINE __m128 Sqrt<true>(__m128 value)
-        {
-            return _mm_mul_ps(_mm_rsqrt_ps(_mm_max_ps(value, _mm_set1_ps(0.00000001f))), value);
-        }
-
-        SIMD_INLINE __m128 Combine(__m128 mask, __m128 positive, __m128 negative)
-        {
-            return _mm_or_ps(_mm_and_ps(mask, positive), _mm_andnot_ps(mask, negative));
-        }
-
-        SIMD_INLINE __m128 RightNotZero32f(ptrdiff_t count)
-        {
-            const int32_t mask[DF] = { 0, 0, 0, 0, -1, -1, -1, -1 };
-            return _mm_loadu_ps((float*)(mask + Simd::RestrictRange<ptrdiff_t>(count, 0, F)));
-        }
-
-        SIMD_INLINE __m128 LeftNotZero32f(ptrdiff_t count)
-        {
-            const int32_t mask[DF] = { -1, -1, -1, -1, 0, 0, 0, 0 };
-            return _mm_loadu_ps((float*)(mask + F - Simd::RestrictRange<ptrdiff_t>(count, 0, F)));
-        }
-
-        template <bool condition> SIMD_INLINE __m128 Masked(const __m128 & value, const __m128 & mask);
-
-        template <> SIMD_INLINE __m128 Masked<false>(const __m128 & value, const __m128 & mask)
-        {
-            return value;
-        }
-
-        template <> SIMD_INLINE __m128 Masked<true>(const __m128 & value, const __m128 & mask)
-        {
-            return _mm_and_ps(value, mask);
-        }
-
-        SIMD_INLINE void Max2x3s(const float * src, size_t stride, float * dst)
-        {
-            __m128 z = _mm_setzero_ps();
-            __m128 s0 = _mm_loadl_pi(z, (__m64*)src);
-            __m128 s1 = _mm_loadl_pi(z, (__m64*)(src + stride));
-            __m128 s2 = _mm_loadl_pi(z, (__m64*)(src + 2 * stride));
-            __m128 m = _mm_max_ps(_mm_max_ps(s0, s1), s2);
-            return _mm_store_ss(dst, _mm_max_ss(m, _mm_shuffle_ps(m, m, 1)));
-        }
-
-        SIMD_INLINE void Max2x2s(const float * src, size_t stride, float * dst)
-        {
-            __m128 z = _mm_setzero_ps();
-            __m128 s0 = _mm_loadl_pi(z, (__m64*)src);
-            __m128 s1 = _mm_loadl_pi(z, (__m64*)(src + stride));
-            __m128 m = _mm_max_ps(s0, s1);
-            return _mm_store_ss(dst, _mm_max_ss(m, _mm_shuffle_ps(m, m, 1)));
-        }
-    }
-#endif//SIMD_SSE_ENABLE
-
-#ifdef SIMD_SSE2_ENABLE
-    namespace Sse2
-    {
-        SIMD_INLINE __m128i SaturateI16ToU8(__m128i value)
-        {
-            return _mm_min_epi16(K16_00FF, _mm_max_epi16(value, K_ZERO));
-        }
-
-        SIMD_INLINE __m128i MaxI16(__m128i a, __m128i b, __m128i c)
-        {
-            return _mm_max_epi16(a, _mm_max_epi16(b, c));
-        }
-
-        SIMD_INLINE __m128i MinI16(__m128i a, __m128i b, __m128i c)
-        {
-            return _mm_min_epi16(a, _mm_min_epi16(b, c));
-        }
-
-        SIMD_INLINE void SortU8(__m128i & a, __m128i & b)
-        {
-            __m128i t = a;
-            a = _mm_min_epu8(t, b);
-            b = _mm_max_epu8(t, b);
-        }
-
-        SIMD_INLINE __m128i ShiftLeft(__m128i a, size_t shift)
-        {
-            __m128i t = a;
-            if (shift & 8)
-                t = _mm_slli_si128(t, 8);
-            if (shift & 4)
-                t = _mm_slli_si128(t, 4);
-            if (shift & 2)
-                t = _mm_slli_si128(t, 2);
-            if (shift & 1)
-                t = _mm_slli_si128(t, 1);
-            return t;
-        }
-
-        SIMD_INLINE __m128i ShiftRight(__m128i a, size_t shift)
-        {
-            __m128i t = a;
if (shift & 8) - t = _mm_srli_si128(t, 8); - if (shift & 4) - t = _mm_srli_si128(t, 4); - if (shift & 2) - t = _mm_srli_si128(t, 2); - if (shift & 1) - t = _mm_srli_si128(t, 1); - return t; - } - - SIMD_INLINE __m128i HorizontalSum32(__m128i a) - { - return _mm_add_epi64(_mm_unpacklo_epi32(a, K_ZERO), _mm_unpackhi_epi32(a, K_ZERO)); - } - - SIMD_INLINE __m128i AbsDifferenceU8(__m128i a, __m128i b) - { - return _mm_sub_epi8(_mm_max_epu8(a, b), _mm_min_epu8(a, b)); - } - - SIMD_INLINE __m128i AbsDifferenceI16(__m128i a, __m128i b) - { - return _mm_sub_epi16(_mm_max_epi16(a, b), _mm_min_epi16(a, b)); - } - - SIMD_INLINE __m128i MulU8(__m128i a, __m128i b) - { - __m128i lo = _mm_mullo_epi16(_mm_unpacklo_epi8(a, K_ZERO), _mm_unpacklo_epi8(b, K_ZERO)); - __m128i hi = _mm_mullo_epi16(_mm_unpackhi_epi8(a, K_ZERO), _mm_unpackhi_epi8(b, K_ZERO)); - return _mm_packus_epi16(lo, hi); - } - - SIMD_INLINE __m128i DivideI16By255(__m128i value) - { - return _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(value, K16_0001), _mm_srli_epi16(value, 8)), 8); - } - - SIMD_INLINE __m128i BinomialSum16(const __m128i & a, const __m128i & b, const __m128i & c) - { - return _mm_add_epi16(_mm_add_epi16(a, c), _mm_add_epi16(b, b)); - } - - SIMD_INLINE __m128i BinomialSum16(const __m128i & a, const __m128i & b, const __m128i & c, const __m128i & d) - { - return _mm_add_epi16(_mm_add_epi16(a, d), _mm_mullo_epi16(_mm_add_epi16(b, c), K16_0003)); - } - - SIMD_INLINE __m128i Combine(__m128i mask, __m128i positive, __m128i negative) - { - return _mm_or_si128(_mm_and_si128(mask, positive), _mm_andnot_si128(mask, negative)); - } - - SIMD_INLINE __m128i AlphaBlendingI16(__m128i src, __m128i dst, __m128i alpha) - { - return DivideI16By255(_mm_add_epi16(_mm_mullo_epi16(src, alpha), _mm_mullo_epi16(dst, _mm_sub_epi16(K16_00FF, alpha)))); - } - - template SIMD_INLINE __m128i UnpackU8(__m128i a, __m128i b = K_ZERO); - - template <> SIMD_INLINE __m128i UnpackU8<0>(__m128i a, __m128i b) - { - return _mm_unpacklo_epi8(a, b); - } - - template <> SIMD_INLINE __m128i UnpackU8<1>(__m128i a, __m128i b) - { - return _mm_unpackhi_epi8(a, b); - } - - template __m128i U8To16(__m128i a); - - template <> SIMD_INLINE __m128i U8To16<0>(__m128i a) - { - return _mm_and_si128(a, K16_00FF); - } - - template <> SIMD_INLINE __m128i U8To16<1>(__m128i a) - { - return _mm_and_si128(_mm_srli_si128(a, 1), K16_00FF); - } - - template SIMD_INLINE __m128i UnpackU16(__m128i a, __m128i b = K_ZERO); - - template <> SIMD_INLINE __m128i UnpackU16<0>(__m128i a, __m128i b) - { - return _mm_unpacklo_epi16(a, b); - } - - template <> SIMD_INLINE __m128i UnpackU16<1>(__m128i a, __m128i b) - { - return _mm_unpackhi_epi16(a, b); - } - - template SIMD_INLINE __m128i UnpackI16(__m128i a); - - template <> SIMD_INLINE __m128i UnpackI16<0>(__m128i a) - { - return _mm_srai_epi32(_mm_unpacklo_epi16(a, a), 16); - } - - template <> SIMD_INLINE __m128i UnpackI16<1>(__m128i a) - { - return _mm_srai_epi32(_mm_unpackhi_epi16(a, a), 16); - } - - SIMD_INLINE __m128i DivideBy16(__m128i value) - { - return _mm_srli_epi16(_mm_add_epi16(value, K16_0008), 4); - } - - template SIMD_INLINE __m128 Broadcast(__m128 a) - { - return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(a), index * 0x55)); - } - - template SIMD_INLINE __m128i Shuffle32i(__m128i lo, __m128i hi) - { - return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(lo), _mm_castsi128_ps(hi), imm)); - } - - template SIMD_INLINE __m128 Shuffle32f(__m128 a) - { - return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(a), imm)); - } - - 
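`_mm_slli_si128`/`_mm_srli_si128` only accept compile-time immediates, so `ShiftLeft`/`ShiftRight` above decompose a runtime byte count into its 8/4/2/1 bit components, applying at most four immediate shifts. A sketch of a scalar cross-check of that decomposition (little-endian byte indexing, `shift` in 0..15):

    // ShiftRight(a, 5) == _mm_srli_si128(_mm_srli_si128(a, 4), 1).
    bool CheckShiftRight(__m128i a, size_t shift)
    {
        alignas(16) uint8_t in[16], out[16];
        _mm_store_si128((__m128i*)in, a);
        _mm_store_si128((__m128i*)out, ShiftRight(a, shift));
        for (size_t i = 0; i < 16; ++i)
            if (out[i] != (i + shift < 16 ? in[i + shift] : 0))
                return false;
        return true;
    }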
SIMD_INLINE __m128i Average16(const __m128i & a, const __m128i & b, const __m128i & c, const __m128i & d) - { - return _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(_mm_add_epi16(a, b), _mm_add_epi16(c, d)), K16_0002), 2); - } - - SIMD_INLINE __m128i Merge16(const __m128i & even, __m128i odd) - { - return _mm_or_si128(_mm_slli_si128(odd, 1), even); - } - } -#endif// SIMD_SSE2_ENABLE - -#ifdef SIMD_SSE3_ENABLE - namespace Sse3 - { -#if defined(_MSC_VER) && _MSC_VER >= 1700 && _MSC_VER < 1900 // Visual Studio 2012/2013 compiler bug - using Sse::RightNotZero32f; -#endif - } -#endif//SIMD_SSE3_ENABLE - -#ifdef SIMD_SSSE3_ENABLE - namespace Ssse3 - { - using namespace Sse2; - - template __m128i ConditionalAbs(__m128i a); - - template <> SIMD_INLINE __m128i ConditionalAbs(__m128i a) - { - return _mm_abs_epi16(a); - } - - template <> SIMD_INLINE __m128i ConditionalAbs(__m128i a) - { - return a; - } - - template SIMD_INLINE __m128i SubUnpackedU8(__m128i a, __m128i b) - { - return _mm_maddubs_epi16(UnpackU8(a, b), K8_01_FF); - } - } -#endif// SIMD_SSSE3_ENABLE - -#ifdef SIMD_SSE41_ENABLE - namespace Sse41 - { -#if defined(_MSC_VER) && _MSC_VER >= 1700 && _MSC_VER < 1900 // Visual Studio 2012/2013 compiler bug - using Sse::RightNotZero32f; -#endif - - template SIMD_INLINE __m128i UnpackI8(__m128i a); - - template <> SIMD_INLINE __m128i UnpackI8<0>(__m128i a) - { - return _mm_cvtepi8_epi16(a); - } - - template <> SIMD_INLINE __m128i UnpackI8<1>(__m128i a) - { - return _mm_cvtepi8_epi16(_mm_srli_si128(a, 8)); - } - - template SIMD_INLINE __m128i UnpackI16(__m128i a); - - template <> SIMD_INLINE __m128i UnpackI16<0>(__m128i a) - { - return _mm_cvtepi16_epi32(a); - } - - template <> SIMD_INLINE __m128i UnpackI16<1>(__m128i a) - { - return _mm_cvtepi16_epi32(_mm_srli_si128(a, 8)); - } - - template SIMD_INLINE __m128 Alignr(const __m128 & s0, const __m128 & s4) - { - return _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(s4), _mm_castps_si128(s0), shift * 4)); - } - - SIMD_INLINE int TestZ(__m128 value) - { - return _mm_testz_si128(_mm_castps_si128(value), K_INV_ZERO); - } - - SIMD_INLINE int TestZ(__m128i value) - { - return _mm_testz_si128(value, K_INV_ZERO); - } - } -#endif// SIMD_SSE41_ENABLE - -#ifdef SIMD_AVX_ENABLE - namespace Avx - { - SIMD_INLINE __m256 Square(__m256 value) - { - return _mm256_mul_ps(value, value); - } - - template __m256 Sqrt(__m256 value); - - template<> SIMD_INLINE __m256 Sqrt(__m256 value) - { - return _mm256_sqrt_ps(value); - } - - template<> SIMD_INLINE __m256 Sqrt(__m256 value) - { - return _mm256_mul_ps(_mm256_rsqrt_ps(_mm256_max_ps(value, _mm256_set1_ps(0.00000001f))), value); - } - - SIMD_INLINE __m256 RightNotZero32f(ptrdiff_t count) - { - const int32_t mask[DF] = { 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1 }; - return _mm256_loadu_ps((float*)(mask + Simd::RestrictRange(count, 0, F))); - } - - SIMD_INLINE __m256 LeftNotZero32f(ptrdiff_t count) - { - const int32_t mask[DF] = { -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0 }; - return _mm256_loadu_ps((float*)(mask + F - Simd::RestrictRange(count, 0, F))); - } - - SIMD_INLINE __m256i RightNotZero32i(ptrdiff_t count) - { - const int32_t mask[DF] = { 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1 }; - return _mm256_loadu_si256((__m256i*)(mask + Simd::RestrictRange(count, 0, F))); - } - - SIMD_INLINE __m256i LeftNotZero32i(ptrdiff_t count) - { - const int32_t mask[DF] = { -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0 }; - return _mm256_loadu_si256((__m256i*)(mask + F - 
Simd::RestrictRange(count, 0, F))); - } - - SIMD_INLINE __m256 PermutedHorizontalAdd(__m256 a, __m256 b) - { - return _mm256_hadd_ps(_mm256_permute2f128_ps(a, b, 0x20), _mm256_permute2f128_ps(a, b, 0x31)); - } - - SIMD_INLINE void Add8ExtractedSums(const __m256 * src, float * dst) - { - __m256 lo = PermutedHorizontalAdd(PermutedHorizontalAdd(src[0], src[1]), PermutedHorizontalAdd(src[2], src[3])); - __m256 hi = PermutedHorizontalAdd(PermutedHorizontalAdd(src[4], src[5]), PermutedHorizontalAdd(src[6], src[7])); - _mm256_storeu_ps(dst, _mm256_add_ps(_mm256_loadu_ps(dst), PermutedHorizontalAdd(lo, hi))); - } - - template SIMD_INLINE __m256 Masked(const __m256 & value, const __m256 & mask); - - template <> SIMD_INLINE __m256 Masked(const __m256 & value, const __m256 & mask) - { - return value; - } - - template <> SIMD_INLINE __m256 Masked(const __m256 & value, const __m256 & mask) - { - return _mm256_and_ps(value, mask); - } - } -#endif//SIMD_AVX_ENABLE - -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { -#if defined(_MSC_VER) && _MSC_VER >= 1700 && _MSC_VER < 1900 // Visual Studio 2012/2013 compiler bug - using Avx::RightNotZero32f; -#endif - - SIMD_INLINE __m256i SaturateI16ToU8(__m256i value) - { - return _mm256_min_epi16(K16_00FF, _mm256_max_epi16(value, K_ZERO)); - } - - SIMD_INLINE __m256i MaxI16(__m256i a, __m256i b, __m256i c) - { - return _mm256_max_epi16(a, _mm256_max_epi16(b, c)); - } - - SIMD_INLINE __m256i MinI16(__m256i a, __m256i b, __m256i c) - { - return _mm256_min_epi16(a, _mm256_min_epi16(b, c)); - } - - SIMD_INLINE void SortU8(__m256i & a, __m256i & b) - { - __m256i t = a; - a = _mm256_min_epu8(t, b); - b = _mm256_max_epu8(t, b); - } - - SIMD_INLINE __m256i HorizontalSum32(__m256i a) - { - return _mm256_add_epi64(_mm256_unpacklo_epi32(a, K_ZERO), _mm256_unpackhi_epi32(a, K_ZERO)); - } - - SIMD_INLINE __m256i AbsDifferenceU8(__m256i a, __m256i b) - { - return _mm256_sub_epi8(_mm256_max_epu8(a, b), _mm256_min_epu8(a, b)); - } - - SIMD_INLINE __m256i AbsDifferenceI16(__m256i a, __m256i b) - { - return _mm256_sub_epi16(_mm256_max_epi16(a, b), _mm256_min_epi16(a, b)); - } - - SIMD_INLINE __m256i MulU8(__m256i a, __m256i b) - { - __m256i lo = _mm256_mullo_epi16(_mm256_unpacklo_epi8(a, K_ZERO), _mm256_unpacklo_epi8(b, K_ZERO)); - __m256i hi = _mm256_mullo_epi16(_mm256_unpackhi_epi8(a, K_ZERO), _mm256_unpackhi_epi8(b, K_ZERO)); - return _mm256_packus_epi16(lo, hi); - } - - SIMD_INLINE __m256i DivideI16By255(__m256i value) - { - return _mm256_srli_epi16(_mm256_add_epi16(_mm256_add_epi16(value, K16_0001), _mm256_srli_epi16(value, 8)), 8); - } - - SIMD_INLINE __m256i BinomialSum16(const __m256i & a, const __m256i & b, const __m256i & c) - { - return _mm256_add_epi16(_mm256_add_epi16(a, c), _mm256_add_epi16(b, b)); - } - - template __m256i ConditionalAbs(__m256i a); - - template <> SIMD_INLINE __m256i ConditionalAbs(__m256i a) - { - return _mm256_abs_epi16(a); - } - - template <> SIMD_INLINE __m256i ConditionalAbs(__m256i a) - { - return a; - } - - template SIMD_INLINE __m256i UnpackU8(__m256i a, __m256i b = K_ZERO); - - template <> SIMD_INLINE __m256i UnpackU8<0>(__m256i a, __m256i b) - { - return _mm256_unpacklo_epi8(a, b); - } - - template <> SIMD_INLINE __m256i UnpackU8<1>(__m256i a, __m256i b) - { - return _mm256_unpackhi_epi8(a, b); - } - - template __m256i U8To16(__m256i a); - - template <> SIMD_INLINE __m256i U8To16<0>(__m256i a) - { - return _mm256_and_si256(a, K16_00FF); - } - - template <> SIMD_INLINE __m256i U8To16<1>(__m256i a) - { - return 
_mm256_and_si256(_mm256_srli_si256(a, 1), K16_00FF); - } - - template SIMD_INLINE __m256i SubUnpackedU8(__m256i a, __m256i b) - { - return _mm256_maddubs_epi16(UnpackU8(a, b), K8_01_FF); - } - - template SIMD_INLINE __m256i UnpackU16(__m256i a, __m256i b = K_ZERO); - - template <> SIMD_INLINE __m256i UnpackU16<0>(__m256i a, __m256i b) - { - return _mm256_unpacklo_epi16(a, b); - } - - template <> SIMD_INLINE __m256i UnpackU16<1>(__m256i a, __m256i b) - { - return _mm256_unpackhi_epi16(a, b); - } - - template SIMD_INLINE __m256 Alignr(const __m256 & s0, const __m256 & s4) - { - return _mm256_castsi256_ps(_mm256_alignr_epi8(_mm256_castps_si256(s4), _mm256_castps_si256(s0), shift * 4)); - } - - template SIMD_INLINE __m256i Shuffle32i(__m256i lo, __m256i hi) - { - return _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(lo), _mm256_castsi256_ps(hi), imm)); - } - - template SIMD_INLINE __m256 Permute4x64(__m256 a) - { - return _mm256_castsi256_ps(_mm256_permute4x64_epi64(_mm256_castps_si256(a), imm)); - } - - template SIMD_INLINE __m256 Shuffle32f(__m256 a) - { - return _mm256_castsi256_ps(_mm256_shuffle_epi32(_mm256_castps_si256(a), imm)); - } - - template SIMD_INLINE __m256 Broadcast(__m256 a) - { - return _mm256_castsi256_ps(_mm256_shuffle_epi32(_mm256_castps_si256(a), index * 0x55)); - } - - SIMD_INLINE __m256i Average16(const __m256i & a, const __m256i & b, const __m256i & c, const __m256i & d) - { - return _mm256_srli_epi16(_mm256_add_epi16(_mm256_add_epi16(_mm256_add_epi16(a, b), _mm256_add_epi16(c, d)), K16_0002), 2); - } - - SIMD_INLINE __m256i Merge16(const __m256i & even, __m256i odd) - { - return _mm256_or_si256(_mm256_slli_si256(odd, 1), even); - } - - SIMD_INLINE const __m256i Shuffle(const __m256i & value, const __m256i & shuffle) - { - return _mm256_or_si256(_mm256_shuffle_epi8(value, _mm256_add_epi8(shuffle, K8_SHUFFLE_0)), - _mm256_shuffle_epi8(_mm256_permute4x64_epi64(value, 0x4E), _mm256_add_epi8(shuffle, K8_SHUFFLE_1))); - } - - template __m256 Fmadd(__m256 a, __m256 b, __m256 c); - - template <> SIMD_INLINE __m256 Fmadd(__m256 a, __m256 b, __m256 c) - { - return _mm256_fmadd_ps(a, b, c); - } - - template <> SIMD_INLINE __m256 Fmadd(__m256 a, __m256 b, __m256 c) - { - return _mm256_add_ps(_mm256_or_ps(_mm256_mul_ps(a, b), _mm256_setzero_ps()), c); - } - - template SIMD_INLINE __m256i Cvt8uTo16i(__m256i a) - { - return _mm256_cvtepu8_epi16(_mm256_extractf128_si256(a, part)); - } - - template SIMD_INLINE __m256i Cvt8iTo16i(__m256i a) - { - return _mm256_cvtepi8_epi16(_mm256_extractf128_si256(a, part)); - } - - SIMD_INLINE __m256i PermutedHadd32i(__m256i a, __m256i b) - { - return _mm256_hadd_epi32(_mm256_permute2f128_si256(a, b, 0x20), _mm256_permute2f128_si256(a, b, 0x31)); - } - } -#endif// SIMD_AVX2_ENABLE - -#ifdef SIMD_AVX512F_ENABLE - namespace Avx512f - { - __mmask16 SIMD_INLINE TailMask16(ptrdiff_t tail) - { - return tail <= 0 ? __mmask16(0) : (tail >= 16 ? 
__mmask16(-1) : __mmask16(-1) >> (16 - tail)); - } - - SIMD_INLINE __m512 Cast(const __m512i & value) - { -#if defined(__clang__) - return (__m512)value; -#else - return _mm512_castsi512_ps(value); -#endif - } - - SIMD_INLINE __m512i Cast(const __m512 & value) - { -#if defined(__clang__) - return (__m512i)value; -#else - return _mm512_castps_si512(value); -#endif - } - - SIMD_INLINE __m512 Or(const __m512 & a, const __m512 & b) - { -#if defined(__clang__) - return (__m512)_mm512_or_epi32((__m512i)a, (__m512i)b); -#else - return _mm512_castsi512_ps(_mm512_or_epi32(_mm512_castps_si512(a), _mm512_castps_si512(b))); -#endif - } - - SIMD_INLINE __m512 And(const __m512 & a, const __m512 & b) - { -#if defined(__clang__) - return (__m512)_mm512_and_epi32((__m512i)a, (__m512i)b); -#else - return _mm512_castsi512_ps(_mm512_and_epi32(_mm512_castps_si512(a), _mm512_castps_si512(b))); -#endif - } - - SIMD_INLINE __m512 AndMaskZ(const __m512 & a, const __m512 & b, __mmask16 m) - { -#if defined(__clang__) - return (__m512)_mm512_maskz_and_epi32(m, (__m512i)a, (__m512i)b); -#else - return _mm512_castsi512_ps(_mm512_maskz_and_epi32(m, _mm512_castps_si512(a), _mm512_castps_si512(b))); -#endif - } - - SIMD_INLINE __m512 AndNot(const __m512 & a, const __m512 & b) - { -#if defined(__clang__) - return (__m512)_mm512_andnot_epi32((__m512i)a, (__m512i)b); -#else - return _mm512_castsi512_ps(_mm512_andnot_epi32(_mm512_castps_si512(a), _mm512_castps_si512(b))); -#endif - } - - SIMD_INLINE __m512 AndNotMaskZ(const __m512 & a, const __m512 & b, __mmask16 m) - { -#if defined(__clang__) - return (__m512)_mm512_maskz_andnot_epi32(m, (__m512i)a, (__m512i)b); -#else - return _mm512_castsi512_ps(_mm512_maskz_andnot_epi32(m, _mm512_castps_si512(a), _mm512_castps_si512(b))); -#endif - } - - SIMD_INLINE __m512 Xor(const __m512 & a, const __m512 & b) - { -#if defined(__clang__) - return (__m512)_mm512_xor_epi32((__m512i)a, (__m512i)b); -#else - return _mm512_castsi512_ps(_mm512_xor_epi32(_mm512_castps_si512(a), _mm512_castps_si512(b))); -#endif - } - - SIMD_INLINE __m512 Rcp14(const __m512 & a) - { -#if defined(_MSC_VER) && _MSC_VER<1922 - return _mm512_maskz_rcp14_ps(_MM_K0_REG, a); -#else - return _mm512_rcp14_ps(a); -#endif - } - - SIMD_INLINE __m512 Rsqrt14(const __m512 & a) - { -#if defined(_MSC_VER) && _MSC_VER<1922 - return _mm512_maskz_rsqrt14_ps(_MM_K0_REG, a); -#else - return _mm512_rsqrt14_ps(a); -#endif - } - - template SIMD_INLINE __m512 Mask(__m512 a, __mmask16 m); - - template<> SIMD_INLINE __m512 Mask(__m512 a, __mmask16 m) - { - return _mm512_maskz_mov_ps(m, a); - } - - template<> SIMD_INLINE __m512 Mask(__m512 a, __mmask16 m) - { - return a; - } - - template SIMD_INLINE __m512 Alignr(const __m512 & lo, const __m512 & hi) - { - return Cast(_mm512_alignr_epi32(Cast(hi), Cast(lo), shift)); - } - - template<> SIMD_INLINE __m512 Alignr<0>(const __m512 & lo, const __m512 & hi) - { - return lo; - } - - template<> SIMD_INLINE __m512 Alignr(const __m512 & lo, const __m512 & hi) - { - return hi; - } - - template SIMD_INLINE __m512 Alignr(const __m512 & lo, const __m512 & hi, __mmask16 m) - { - return Mask(Alignr(lo, hi), m); - } - - template SIMD_INLINE __m512 Interleave(const __m512 & a, const __m512 & b); - - template <> SIMD_INLINE __m512 Interleave<0>(const __m512 & a, const __m512 & b) - { - return _mm512_permutex2var_ps(a, K32_INTERLEAVE_0, b); - } - - template <> SIMD_INLINE __m512 Interleave<1>(const __m512 & a, const __m512 & b) - { - return _mm512_permutex2var_ps(a, K32_INTERLEAVE_1, b); - } - - template 
SIMD_INLINE __m512 Deinterleave(const __m512 & a, const __m512 & b); - - template <> SIMD_INLINE __m512 Deinterleave<0>(const __m512 & a, const __m512 & b) - { - return _mm512_permutex2var_ps(a, K32_DEINTERLEAVE_0, b); - } - - template <> SIMD_INLINE __m512 Deinterleave<1>(const __m512 & a, const __m512 & b) - { - return _mm512_permutex2var_ps(a, K32_DEINTERLEAVE_1, b); - } - - template __m512 Fmadd(__m512 a, __m512 b, __m512 c); - - template <> SIMD_INLINE __m512 Fmadd(__m512 a, __m512 b, __m512 c) - { - return _mm512_fmadd_ps(a, b, c); - } - - template <> SIMD_INLINE __m512 Fmadd(__m512 a, __m512 b, __m512 c) - { -#ifdef _MSC_VER - return _mm512_add_ps(_mm512_fmadd_ps(a, b, _mm512_setzero_ps()), c); -#else - return _mm512_maskz_add_ps(-1, _mm512_mul_ps(a, b), c); -#endif - } - } -#endif //SIMD_AVX512F_ENABLE - -#ifdef SIMD_AVX512BW_ENABLE - namespace Avx512bw - { - SIMD_INLINE __mmask32 TailMask32(ptrdiff_t tail) - { - return tail <= 0 ? __mmask32(0) : (tail >= 32 ? __mmask32(-1) : __mmask32(-1) >> (32 - tail)); - } - - SIMD_INLINE __mmask64 TailMask64(ptrdiff_t tail) - { - return tail <= 0 ? __mmask64(0) : (tail >= 64 ? __mmask64(-1) : __mmask64(-1) >> (64 - tail)); - } - - SIMD_INLINE __mmask64 NoseMask64(ptrdiff_t nose) - { - return nose <= 0 ? __mmask64(0) : (nose >= 64 ? __mmask64(-1) : __mmask64(-1) << (64 - nose)); - } - -#if defined(_MSC_VER) || (defined(__GNUC__) && defined(__LZCNT__)) - SIMD_INLINE size_t FirstNotZero64(__mmask64 mask) - { -#ifdef SIMD_X64_ENABLE - return _tzcnt_u64(mask); -#else - return (__mmask32(mask) ? _tzcnt_u32(__mmask32(mask)) : _tzcnt_u32(__mmask32(mask >> 32)) + 32); -#endif - } - - SIMD_INLINE size_t LastNotZero64(__mmask64 mask) - { -#ifdef SIMD_X64_ENABLE - return 64 - _lzcnt_u64(mask); -#else - return 64 - (__mmask32(mask >> 32) ? 
_lzcnt_u32(__mmask32(mask >> 32)) : _lzcnt_u32(__mmask32(mask)) + 32); -#endif - } -#endif - -#if defined(_MSC_VER) || (defined(__GNUC__) && defined(__POPCNT__)) - SIMD_INLINE size_t Popcnt64(__mmask64 mask) - { -#ifdef SIMD_X64_ENABLE - return _mm_popcnt_u64(mask); -#else - return _mm_popcnt_u32(__mmask32(mask)) + _mm_popcnt_u32(__mmask32(mask >> 32)); -#endif - } -#endif - - SIMD_INLINE void SortU8(__m512i & a, __m512i & b) - { -#if 0 - __m512i t = a; - a = _mm512_min_epu8(t, b); - b = _mm512_max_epu8(t, b); -#else - __m512i d = _mm512_subs_epu8(a, b); - a = _mm512_sub_epi8(a, d); - b = _mm512_add_epi8(b, d); -#endif - } - - SIMD_INLINE __m512i BinomialSum16(const __m512i & a, const __m512i & b, const __m512i & c) - { - return _mm512_add_epi16(_mm512_add_epi16(a, c), _mm512_add_epi16(b, b)); - } - - SIMD_INLINE __m512i DivideI16By255(__m512i value) - { - return _mm512_srli_epi16(_mm512_add_epi16(_mm512_add_epi16(value, K16_0001), _mm512_srli_epi16(value, 8)), 8); - } - - template SIMD_INLINE __m512i UnpackU8(__m512i a, __m512i b = K_ZERO); - - template <> SIMD_INLINE __m512i UnpackU8<0>(__m512i a, __m512i b) - { - return _mm512_unpacklo_epi8(a, b); - } - - template <> SIMD_INLINE __m512i UnpackU8<1>(__m512i a, __m512i b) - { - return _mm512_unpackhi_epi8(a, b); - } - - template __m512i U8To16(__m512i a); - - template <> SIMD_INLINE __m512i U8To16<0>(__m512i a) - { - return _mm512_and_si512(a, K16_00FF); - } - - template <> SIMD_INLINE __m512i U8To16<1>(__m512i a) - { - return _mm512_shuffle_epi8(a, K8_SUFFLE_BGRA_TO_G0A0); - } - - template SIMD_INLINE __m512i UnpackU16(__m512i a, __m512i b = K_ZERO); - - template <> SIMD_INLINE __m512i UnpackU16<0>(__m512i a, __m512i b) - { - return _mm512_unpacklo_epi16(a, b); - } - - template <> SIMD_INLINE __m512i UnpackU16<1>(__m512i a, __m512i b) - { - return _mm512_unpackhi_epi16(a, b); - } - - SIMD_INLINE __m512i UnpackHalfU8(__m256i a, __m256i b = Avx2::K_ZERO) - { - return _mm512_unpacklo_epi8(_mm512_castsi256_si512(a), _mm512_castsi256_si512(b)); - } - - SIMD_INLINE __m512i AbsDifferenceU8(__m512i a, __m512i b) - { - return _mm512_sub_epi8(_mm512_max_epu8(a, b), _mm512_min_epu8(a, b)); - } - - SIMD_INLINE __m512i AbsDifferenceI16(__m512i a, __m512i b) - { - return _mm512_sub_epi16(_mm512_max_epi16(a, b), _mm512_min_epi16(a, b)); - } - - SIMD_INLINE __m512i Saturate16iTo8u(__m512i value) - { - return _mm512_min_epi16(K16_00FF, _mm512_max_epi16(value, K_ZERO)); - } - - SIMD_INLINE __m512i Hadd16(__m512i a, __m512i b) - { - __m512i ab0 = _mm512_permutex2var_epi16(a, K16_PERMUTE_FOR_HADD_0, b); - __m512i ab1 = _mm512_permutex2var_epi16(a, K16_PERMUTE_FOR_HADD_1, b); - return _mm512_add_epi16(ab0, ab1); - } - - SIMD_INLINE __m512i Hadd32(__m512i a, __m512i b) - { - __m512i ab0 = _mm512_permutex2var_epi32(a, K32_DEINTERLEAVE_0, b); - __m512i ab1 = _mm512_permutex2var_epi32(a, K32_DEINTERLEAVE_1, b); - return _mm512_add_epi32(ab0, ab1); - } - - SIMD_INLINE __m512i Permuted2Pack16iTo8u(__m512i lo, __m512i hi) - { - return _mm512_permutexvar_epi32(K32_PERMUTE_FOR_TWO_UNPACK, _mm512_packus_epi16(lo, hi)); - } - - template SIMD_INLINE __m512i SubUnpackedU8(__m512i a, __m512i b) - { - return _mm512_maddubs_epi16(UnpackU8(a, b), K8_01_FF); - } - - template __m512i ConditionalAbs(__m512i a); - - template <> SIMD_INLINE __m512i ConditionalAbs(__m512i a) - { - return _mm512_abs_epi16(a); - } - - template <> SIMD_INLINE __m512i ConditionalAbs(__m512i a) - { - return a; - } - - SIMD_INLINE __m512i HorizontalSum32(__m512i a) - { - return 
_mm512_add_epi64(_mm512_unpacklo_epi32(a, K_ZERO), _mm512_unpackhi_epi32(a, K_ZERO)); - } - - SIMD_INLINE __m512i SaturateI16ToU8(__m512i value) - { - return _mm512_min_epi16(K16_00FF, _mm512_max_epi16(value, K_ZERO)); - } - - SIMD_INLINE __m512i MaxI16(const __m512i a, __m512i b, __m512i c) - { - return _mm512_max_epi16(a, _mm512_max_epi16(b, c)); - } - - SIMD_INLINE __m512i MinI16(__m512i a, __m512i b, __m512i c) - { - return _mm512_min_epi16(a, _mm512_min_epi16(b, c)); - } - - template SIMD_INLINE __m512i Shuffle32i(__m512i lo, __m512i hi) - { - return _mm512_castps_si512(_mm512_shuffle_ps(_mm512_castsi512_ps(lo), _mm512_castsi512_ps(hi), imm)); - } - - template SIMD_INLINE __m512 Broadcast(__m512 a) - { - return _mm512_permute_ps(a, index * 0x55); - } - - template SIMD_INLINE __m512 Shuffle2x(__m512 a) - { - return _mm512_castsi512_ps(_mm512_permutex_epi64(_mm512_castps_si512(a), imm)); - } - - SIMD_INLINE __m512i Average16(const __m512i & a, const __m512i & b) - { - return _mm512_avg_epu16(a, b); - } - - SIMD_INLINE __m512i Average16(const __m512i & a, const __m512i & b, const __m512i & c, const __m512i & d) - { - return _mm512_srli_epi16(_mm512_add_epi16(_mm512_add_epi16(_mm512_add_epi16(a, b), _mm512_add_epi16(c, d)), K16_0002), 2); - } - - SIMD_INLINE __m512i Merge16(const __m512i & even, __m512i odd) - { - return _mm512_or_si512(_mm512_slli_epi16(odd, 8), even); - } - - template SIMD_INLINE __m512i Cvt8uTo16i(__m512i a) - { - return _mm512_cvtepu8_epi16(_mm512_extracti64x4_epi64(a, part)); - } - - template SIMD_INLINE __m512i Cvt8iTo16i(__m512i a) - { - return _mm512_cvtepi8_epi16(_mm512_extracti64x4_epi64(a, part)); - } - } -#endif //SIMD_AVX512BW_ENABLE - -#ifdef SIMD_VMX_ENABLE - namespace Vmx - { - SIMD_INLINE v128_u8 ShiftLeft(v128_u8 value, size_t shift) - { - return vec_perm(K8_00, value, vec_lvsr(shift, (uint8_t*)0)); - } - - SIMD_INLINE v128_u16 ShiftLeft(v128_u16 value, size_t shift) - { - return (v128_u16)ShiftLeft((v128_u8)value, 2 * shift); - } - - SIMD_INLINE v128_u8 ShiftRight(v128_u8 value, size_t shift) - { - return vec_perm(value, K8_00, vec_lvsl(shift, (uint8_t*)0)); - } - - SIMD_INLINE v128_u16 MulHiU16(v128_u16 a, v128_u16 b) - { - return (v128_u16)vec_perm(vec_mule(a, b), vec_mulo(a, b), K8_PERM_MUL_HI_U16); - } - - SIMD_INLINE v128_u8 AbsDifferenceU8(v128_u8 a, v128_u8 b) - { - return vec_sub(vec_max(a, b), vec_min(a, b)); - } - - SIMD_INLINE v128_u16 SaturateI16ToU8(v128_s16 value) - { - return (v128_u16)vec_min((v128_s16)K16_00FF, vec_max(value, (v128_s16)K16_0000)); - } - - SIMD_INLINE void SortU8(v128_u8 & a, v128_u8 & b) - { - v128_u8 t = a; - a = vec_min(t, b); - b = vec_max(t, b); - } - - SIMD_INLINE v128_u16 DivideBy255(v128_u16 value) - { - return vec_sr(vec_add(vec_add(value, K16_0001), vec_sr(value, K16_0008)), K16_0008); - } - - SIMD_INLINE v128_u16 BinomialSum(const v128_u16 & a, const v128_u16 & b, const v128_u16 & c) - { - return vec_add(vec_add(a, c), vec_add(b, b)); - } - - template SIMD_INLINE T Max(const T & a, const T & b, const T & c) - { - return vec_max(a, vec_max(b, c)); - } - - template SIMD_INLINE T Min(const T & a, const T & b, const T & c) - { - return vec_min(a, vec_min(b, c)); - } - - template v128_u16 ConditionalAbs(v128_u16 a); - - template <> SIMD_INLINE v128_u16 ConditionalAbs(v128_u16 a) - { - return (v128_u16)vec_abs((v128_s16)a); - } - - template <> SIMD_INLINE v128_u16 ConditionalAbs(v128_u16 a) - { - return a; - } - } -#endif//SIMD_VMX_ENABLE - -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - SIMD_INLINE uint8x16_t 
ShiftLeft(uint8x16_t value, size_t shift) - { - if (shift & 8) - value = vextq_u8(K8_00, value, 8); - if (shift & 4) - value = vextq_u8(K8_00, value, 12); - if (shift & 2) - value = vextq_u8(K8_00, value, 14); - if (shift & 1) - value = vextq_u8(K8_00, value, 15); - return value; - } - - SIMD_INLINE uint8x16_t ShiftRight(uint8x16_t value, size_t shift) - { - if (shift & 8) - value = vextq_u8(value, K8_00, 8); - if (shift & 4) - value = vextq_u8(value, K8_00, 4); - if (shift & 2) - value = vextq_u8(value, K8_00, 2); - if (shift & 1) - value = vextq_u8(value, K8_00, 1); - return value; - } - - SIMD_INLINE void SortU8(uint8x16_t & a, uint8x16_t & b) - { - uint8x16_t t = a; - a = vminq_u8(t, b); - b = vmaxq_u8(t, b); - } - - SIMD_INLINE uint16x8_t DivideI16By255(uint16x8_t value) - { - return vshrq_n_u16(vaddq_u16(vaddq_u16(value, K16_0001), vshrq_n_u16(value, 8)), 8); - } - - SIMD_INLINE uint16x8_t BinomialSum16(const uint16x8_t & a, const uint16x8_t & b, const uint16x8_t & c) - { - return vaddq_u16(vaddq_u16(a, c), vaddq_u16(b, b)); - } - - SIMD_INLINE int16x8_t BinomialSum(const int16x8_t & a, const int16x8_t & b, const int16x8_t & c) - { - return vaddq_s16(vaddq_s16(a, c), vaddq_s16(b, b)); - } - - SIMD_INLINE uint16x8_t BinomialSum16(const uint16x8_t & a, const uint16x8_t & b, const uint16x8_t & c, const uint16x8_t & d) - { - return vaddq_u16(vaddq_u16(a, d), vmulq_u16(vaddq_u16(b, c), K16_0003)); - } - - SIMD_INLINE uint16x8_t DivideBy16(uint16x8_t value) - { - return vshrq_n_u16(vaddq_u16(value, K16_0008), 4); - } - - template SIMD_INLINE uint8x8_t Half(uint8x16_t a); - - template <> SIMD_INLINE uint8x8_t Half<0>(uint8x16_t a) - { - return vget_low_u8(a); - } - - template <> SIMD_INLINE uint8x8_t Half<1>(uint8x16_t a) - { - return vget_high_u8(a); - } - - template SIMD_INLINE int8x8_t Half(int8x16_t a); - - template <> SIMD_INLINE int8x8_t Half<0>(int8x16_t a) - { - return vget_low_s8(a); - } - - template <> SIMD_INLINE int8x8_t Half<1>(int8x16_t a) - { - return vget_high_s8(a); - } - - template SIMD_INLINE uint16x4_t Half(uint16x8_t a); - - template <> SIMD_INLINE uint16x4_t Half<0>(uint16x8_t a) - { - return vget_low_u16(a); - } - - template <> SIMD_INLINE uint16x4_t Half<1>(uint16x8_t a) - { - return vget_high_u16(a); - } - - template SIMD_INLINE int16x4_t Half(int16x8_t a); - - template <> SIMD_INLINE int16x4_t Half<0>(int16x8_t a) - { - return vget_low_s16(a); - } - - template <> SIMD_INLINE int16x4_t Half<1>(int16x8_t a) - { - return vget_high_s16(a); - } - - template SIMD_INLINE uint32x2_t Half(uint32x4_t a); - - template <> SIMD_INLINE uint32x2_t Half<0>(uint32x4_t a) - { - return vget_low_u32(a); - } - - template <> SIMD_INLINE uint32x2_t Half<1>(uint32x4_t a) - { - return vget_high_u32(a); - } - - template SIMD_INLINE int32x2_t Half(int32x4_t a); - - template <> SIMD_INLINE int32x2_t Half<0>(int32x4_t a) - { - return vget_low_s32(a); - } - - template <> SIMD_INLINE int32x2_t Half<1>(int32x4_t a) - { - return vget_high_s32(a); - } - - template SIMD_INLINE float32x2_t Half(float32x4_t a); - - template <> SIMD_INLINE float32x2_t Half<0>(float32x4_t a) - { - return vget_low_f32(a); - } - - template <> SIMD_INLINE float32x2_t Half<1>(float32x4_t a) - { - return vget_high_f32(a); - } - - template SIMD_INLINE uint16x8_t UnpackU8(uint8x16_t a) - { - return vmovl_u8(Half(a)); - } - - template SIMD_INLINE int16x8_t UnpackU8s(uint8x16_t a) - { - return (int16x8_t)vmovl_u8(Half(a)); - } - - template SIMD_INLINE int16x8_t UnpackI8(int8x16_t a) - { - return vmovl_s8(Half(a)); - } - - 
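The `Half`/`Unpack*`/`Pack*` helpers around this point support the standard NEON widen-process-narrow pattern: promote bytes to 16 bits with `vmovl_u8`, do arithmetic that would overflow in 8 bits, then narrow back with a (saturating) `vqmovn`/`vmovn`. A sketch scaling brightness by 3/2 with saturation, using the helper names as reconstructed here:

    // Multiply 16 pixels by 1.5 with saturation: widen, scale in 16 bits, narrow back.
    inline uint8x16_t Scale3of2(uint8x16_t pix)
    {
        uint16x8_t lo = UnpackU8<0>(pix);       // vmovl_u8(vget_low_u8(pix))
        uint16x8_t hi = UnpackU8<1>(pix);
        lo = vaddq_u16(lo, vshrq_n_u16(lo, 1)); // x + x/2
        hi = vaddq_u16(hi, vshrq_n_u16(hi, 1));
        return PackSaturatedU16(lo, hi);        // vqmovn_u16 per half
    }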
template SIMD_INLINE uint32x4_t UnpackU16(uint16x8_t a) - { - return vmovl_u16(Half(a)); - } - - template SIMD_INLINE int32x4_t UnpackI16(int16x8_t a) - { - return vmovl_s16(Half(a)); - } - - SIMD_INLINE uint8x16_t PackU16(uint16x8_t lo, uint16x8_t hi) - { - return vcombine_u8(vmovn_u16(lo), vmovn_u16(hi)); - } - - SIMD_INLINE uint8x16_t PackSaturatedI16(int16x8_t lo, int16x8_t hi) - { - return vcombine_u8(vqmovun_s16(lo), vqmovun_s16(hi)); - } - - SIMD_INLINE uint8x16_t PackSaturatedU16(uint16x8_t lo, uint16x8_t hi) - { - return vcombine_u8(vqmovn_u16(lo), vqmovn_u16(hi)); - } - - SIMD_INLINE uint16x8_t PackU32(uint32x4_t lo, uint32x4_t hi) - { - return vcombine_u16(vmovn_u32(lo), vmovn_u32(hi)); - } - - SIMD_INLINE int16x8_t PackI32(int32x4_t lo, int32x4_t hi) - { - return vcombine_s16(vmovn_s32(lo), vmovn_s32(hi)); - } - - SIMD_INLINE uint8x8x2_t Deinterleave(uint8x16_t value) - { - uint8_t buffer[A]; - vst1q_u8(buffer, value); - return vld2_u8(buffer); - } - - template SIMD_INLINE uint8x16_t Stretch2(uint8x16_t a) - { - return (uint8x16_t)vmulq_u16(UnpackU8(a), K16_0101); - } - - template int16x8_t ConditionalAbs(int16x8_t a); - - template <> SIMD_INLINE int16x8_t ConditionalAbs(int16x8_t a) - { - return vabdq_s16(a, (int16x8_t)K16_0000); - } - - template <> SIMD_INLINE int16x8_t ConditionalAbs(int16x8_t a) - { - return a; - } - - SIMD_INLINE int16x8_t SaturateByU8(int16x8_t a) - { - return (int16x8_t)vmovl_u8(vqmovun_s16(a)); - } - - template SIMD_INLINE float32x4_t Reciprocal(const float32x4_t & a); - - template <> SIMD_INLINE float32x4_t Reciprocal<-1>(const float32x4_t & a) - { - float _a[4]; - vst1q_f32(_a, a); - float r[4] = { 1.0f / _a[0], 1.0f / _a[1], 1.0f / _a[2], 1.0f / _a[3] }; - return vld1q_f32(r); - }; - - template<> SIMD_INLINE float32x4_t Reciprocal<0>(const float32x4_t & a) - { - return vrecpeq_f32(a); - } - - template<> SIMD_INLINE float32x4_t Reciprocal<1>(const float32x4_t & a) - { - float32x4_t r = vrecpeq_f32(a); - return vmulq_f32(vrecpsq_f32(a, r), r); - } - - template<> SIMD_INLINE float32x4_t Reciprocal<2>(const float32x4_t & a) - { - float32x4_t r = vrecpeq_f32(a); - r = vmulq_f32(vrecpsq_f32(a, r), r); - return vmulq_f32(vrecpsq_f32(a, r), r); - } - - template SIMD_INLINE float32x4_t Div(const float32x4_t & a, const float32x4_t & b) - { - return vmulq_f32(a, Reciprocal(b)); - } - - template <> SIMD_INLINE float32x4_t Div<-1>(const float32x4_t & a, const float32x4_t & b) - { - float _a[4], _b[4]; - vst1q_f32(_a, a); - vst1q_f32(_b, b); - float c[4] = { _a[0] / _b[0], _a[1] / _b[1], _a[2] / _b[2], _a[3] / _b[3] }; - return vld1q_f32(c); - }; - - template SIMD_INLINE float32x4_t ReciprocalSqrt(const float32x4_t & a); - - template <> SIMD_INLINE float32x4_t ReciprocalSqrt<-1>(const float32x4_t & a) - { - float _a[4]; - vst1q_f32(_a, a); - float r[4] = { 1.0f / ::sqrtf(_a[0]), 1.0f / ::sqrtf(_a[1]), 1.0f / ::sqrtf(_a[2]), 1.0f / ::sqrtf(_a[3]) }; - return vld1q_f32(r); - } - - template<> SIMD_INLINE float32x4_t ReciprocalSqrt<0>(const float32x4_t & a) - { - return vrsqrteq_f32(a); - } - - template<> SIMD_INLINE float32x4_t ReciprocalSqrt<1>(const float32x4_t & a) - { - float32x4_t e = vrsqrteq_f32(a); - return vmulq_f32(vrsqrtsq_f32(vmulq_f32(e, e), a), e); - } - - template<> SIMD_INLINE float32x4_t ReciprocalSqrt<2>(const float32x4_t & a) - { - float32x4_t e = vrsqrteq_f32(a); - e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(e, e), a), e); - return vmulq_f32(vrsqrtsq_f32(vmulq_f32(e, e), a), e); - } - - template SIMD_INLINE float32x4_t Sqrt(const float32x4_t & a) - { - 
return vmulq_f32(a, ReciprocalSqrt(a)); - } - - template <> SIMD_INLINE float32x4_t Sqrt<-1>(const float32x4_t & a) - { - float _a[4]; - vst1q_f32(_a, a); - float r[4] = { ::sqrtf(_a[0]), ::sqrtf(_a[1]), ::sqrtf(_a[2]), ::sqrtf(_a[3]) }; - return vld1q_f32(r); - } - - template SIMD_INLINE int16x8_t Sub(uint8x16_t a, uint8x16_t b) - { - return (int16x8_t)vsubl_u8(Half(a), Half(b)); - } - - template SIMD_INLINE float32x4_t ToFloat(int16x8_t a) - { - return vcvtq_f32_s32(UnpackI16(a)); - } - - template SIMD_INLINE float32x4_t ToFloat(uint16x8_t a) - { - return vcvtq_f32_u32(UnpackU16(a)); - } - - SIMD_INLINE float32x4_t RightNotZero32f(size_t count) - { - const int32_t mask[DF] = { 0, 0, 0, 0, -1, -1, -1, -1 }; - return vld1q_f32((float*)(mask + Simd::RestrictRange(count, 0, F))); - } - - SIMD_INLINE float32x4_t LeftNotZero32f(ptrdiff_t count) - { - const int32_t mask[DF] = { -1, -1, -1, -1, 0, 0, 0, 0 }; - return vld1q_f32((float*)(mask + F - Simd::RestrictRange(count, 0, F))); - } - - SIMD_INLINE float32x4_t And(float32x4_t a, float32x4_t b) - { - return (float32x4_t)vandq_u32((uint32x4_t)a, (uint32x4_t)b); - } - - SIMD_INLINE float32x4_t Or(float32x4_t a, float32x4_t b) - { - return (float32x4_t)vorrq_u32((uint32x4_t)a, (uint32x4_t)b); - } - - template SIMD_INLINE float32x4_t Broadcast(float32x4_t a) - { - return vdupq_lane_f32(Half(a), index & 1); - } - - SIMD_INLINE uint16x8_t Hadd(uint16x8_t a, uint16x8_t b) - { - return vcombine_u16(vpadd_u16(Half<0>(a), Half<1>(a)), vpadd_u16(Half<0>(b), Half<1>(b))); - } - - SIMD_INLINE float32x4_t Hadd(float32x4_t a, float32x4_t b) - { - return vcombine_f32(vpadd_f32(Half<0>(a), Half<1>(a)), vpadd_f32(Half<0>(b), Half<1>(b))); - } - - template SIMD_INLINE float32x4_t Masked(const float32x4_t & value, const float32x4_t & mask); - - template <> SIMD_INLINE float32x4_t Masked(const float32x4_t & value, const float32x4_t & mask) - { - return value; - } - - template <> SIMD_INLINE float32x4_t Masked(const float32x4_t & value, const float32x4_t & mask) - { - return And(value, mask); - } - - SIMD_INLINE bool TestZ(uint32x4_t a) - { - return !(vgetq_lane_u32(a, 0) | vgetq_lane_u32(a, 1) | vgetq_lane_u32(a, 2) | vgetq_lane_u32(a, 3)); - } - - SIMD_INLINE int32x4_t Round(float32x4_t value) - { - uint32x4_t sign = vcgtq_f32(value, vdupq_n_f32(0)); - float32x4_t round = vbslq_f32(sign, vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)); - return vcvtq_s32_f32(vaddq_f32(value, round)); - } - - template float32x4_t Fmadd(float32x4_t a, float32x4_t b, float32x4_t c); - - template <> SIMD_INLINE float32x4_t Fmadd(float32x4_t a, float32x4_t b, float32x4_t c) - { - return vmlaq_f32(c, a, b); - } - - template <> SIMD_INLINE float32x4_t Fmadd(float32x4_t a, float32x4_t b, float32x4_t c) - { - return vaddq_f32(vmlaq_f32(vdupq_n_f32(0), a, b), c); - } - } -#endif//SIMD_NEON_ENABLE -} -#endif//__SimdMath_h__ diff --git a/src/3rd/Simd/Simd/SimdMemory.h b/src/3rd/Simd/Simd/SimdMemory.h deleted file mode 100644 index c75d26cb..00000000 --- a/src/3rd/Simd/Simd/SimdMemory.h +++ /dev/null @@ -1,287 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. -* 2016-2016 Sintegrial Technologies. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#ifndef __SimdMemory_h__ -#define __SimdMemory_h__ - -#include "Simd/SimdDefs.h" -#include "Simd/SimdMath.h" - -#if defined(__GNUC__) && defined(SIMD_ALLOCATE_ERROR_MESSAGE) -#include -#endif - -namespace Simd -{ - SIMD_INLINE size_t DivHi(size_t value, size_t divider) - { - return (value + divider - 1) / divider; - } - - SIMD_INLINE size_t AlignHiAny(size_t size, size_t align) - { - return (size + align - 1) / align * align; - } - - SIMD_INLINE size_t AlignLoAny(size_t size, size_t align) - { - return size / align * align; - } - - SIMD_INLINE size_t AlignHi(size_t size, size_t align) - { - return (size + align - 1) & ~(align - 1); - } - - SIMD_INLINE void * AlignHi(const void * ptr, size_t align) - { - return (void *)((((size_t)ptr) + align - 1) & ~(align - 1)); - } - - SIMD_INLINE size_t AlignLo(size_t size, size_t align) - { - return size & ~(align - 1); - } - - SIMD_INLINE void * AlignLo(const void * ptr, size_t align) - { - return (void *)(((size_t)ptr) & ~(align - 1)); - } - - SIMD_INLINE bool Aligned(size_t size, size_t align) - { - return size == AlignLo(size, align); - } - - SIMD_INLINE bool Aligned(const void * ptr, size_t align) - { - return ptr == AlignLo(ptr, align); - } - - SIMD_INLINE void * Allocate(size_t size, size_t align = SIMD_ALIGN) - { -#ifdef SIMD_NO_MANS_LAND - size += 2 * SIMD_NO_MANS_LAND; -#endif - void * ptr = NULL; -#if defined(_MSC_VER) - ptr = _aligned_malloc(size, align); -#elif defined(__MINGW32__) || defined(__MINGW64__) - ptr = __mingw_aligned_malloc(size, align); -#elif defined(__GNUC__) - align = AlignHi(align, sizeof(void *)); - size = AlignHi(size, align); - int result = ::posix_memalign(&ptr, align, size); -#ifdef SIMD_ALLOCATE_ERROR_MESSAGE - if (result != 0) - std::cout << "The function posix_memalign can't allocate " << size << " bytes with align " << align << " !" 
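            /* Worked values for the alignment helpers above (the bit-mask forms
               assume a power-of-two align): AlignHi(100, 16) = (100 + 15) & ~15
               = 112, AlignLo(100, 16) = 96, Aligned(96, 16) = true. The
               AlignHiAny/AlignLoAny variants divide instead, so they also accept
               non-power-of-two aligns, e.g. AlignHiAny(100, 24) = 120. On the
               GNU path Allocate() first rounds align up to a multiple of
               sizeof(void*), a posix_memalign requirement, and pads size up to
               that align. */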
<< std::endl << std::flush; -#endif -#ifdef SIMD_ALLOCATE_ASSERT - assert(result == 0); -#endif -#else - ptr = malloc(size); -#endif - -#ifdef SIMD_NO_MANS_LAND - if (ptr) - ptr = (char*)ptr + SIMD_NO_MANS_LAND; -#endif - return ptr; - } - - template T* Allocate(uint8_t*& buffer, size_t size, size_t align = SIMD_ALIGN) - { - T* ptr = (T*)buffer; - buffer = buffer + AlignHi(size * sizeof(T), align); - return ptr; - } - - SIMD_INLINE void Free(void * ptr) - { -#ifdef SIMD_NO_MANS_LAND - if (ptr) - ptr = (char*)ptr - SIMD_NO_MANS_LAND; -#endif -#if defined(_MSC_VER) - _aligned_free(ptr); -#elif defined(__MINGW32__) || defined(__MINGW64__) - return __mingw_aligned_free(ptr); -#else - free(ptr); -#endif - } - - struct Deletable - { - virtual ~Deletable() {} - }; - -#ifdef SIMD_SSE_ENABLE - namespace Sse - { - SIMD_INLINE bool Aligned(size_t size, size_t align = sizeof(__m128)) - { - return Simd::Aligned(size, align); - } - - SIMD_INLINE bool Aligned(const void * ptr, size_t align = sizeof(__m128)) - { - return Simd::Aligned(ptr, align); - } - } -#endif// SIMD_SSE_ENABLE - -#ifdef SIMD_SSE2_ENABLE - namespace Sse2 - { - using Sse::Aligned; - } -#endif// SIMD_SSE2_ENABLE - -#ifdef SIMD_SSE3_ENABLE - namespace Sse3 - { - using Sse::Aligned; - } -#endif// SIMD_SSE3_ENABLE - -#ifdef SIMD_SSSE3_ENABLE - namespace Ssse3 - { - using Sse::Aligned; - } -#endif// SIMD_SSSE3_ENABLE - -#ifdef SIMD_SSE41_ENABLE - namespace Sse41 - { - using Sse::Aligned; - } -#endif// SIMD_SSE41_ENABLE - -#ifdef SIMD_SSE42_ENABLE - namespace Sse42 - { - } -#endif// SIMD_SSE42_ENABLE - -#ifdef SIMD_AVX_ENABLE - namespace Avx - { - SIMD_INLINE bool Aligned(size_t size, size_t align = sizeof(__m256)) - { - return Simd::Aligned(size, align); - } - - SIMD_INLINE bool Aligned(const void * ptr, size_t align = sizeof(__m256)) - { - return Simd::Aligned(ptr, align); - } - } -#endif// SIMD_AVX_ENABLE - -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - using Avx::Aligned; - } -#endif// SIMD_AVX2_ENABLE - -#ifdef SIMD_AVX512F_ENABLE - namespace Avx512f - { - SIMD_INLINE bool Aligned(size_t size, size_t align = sizeof(__m512)) - { - return Simd::Aligned(size, align); - } - - SIMD_INLINE bool Aligned(const void * ptr, size_t align = sizeof(__m512)) - { - return Simd::Aligned(ptr, align); - } - } -#endif// SIMD_AVX512F_ENABLE - -#ifdef SIMD_AVX512BW_ENABLE - namespace Avx512bw - { - using Avx512f::Aligned; - } -#endif// SIMD_AVX512BW_ENABLE - -#ifdef SIMD_VMX_ENABLE - namespace Vmx - { - SIMD_INLINE bool Aligned(size_t size, size_t align = sizeof(vec_uchar16)) - { - return Simd::Aligned(size, align); - } - - SIMD_INLINE bool Aligned(const void * ptr, size_t align = sizeof(vec_uchar16)) - { - return Simd::Aligned(ptr, align); - } - } -#endif// SIMD_VMX_ENABLE - -#ifdef SIMD_VSX_ENABLE - namespace Vsx - { - using Vmx::Aligned; - } -#endif// SIMD_VSX_ENABLE - -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - SIMD_INLINE bool Aligned(size_t size, size_t align = sizeof(uint8x16_t)) - { - return Simd::Aligned(size, align); - } - - SIMD_INLINE bool Aligned(const void * ptr, size_t align = sizeof(uint8x16_t)) - { - return Simd::Aligned(ptr, align); - } - } -#endif// SIMD_NEON_ENABLE - -#ifdef SIMD_MSA_ENABLE - namespace Msa - { - SIMD_INLINE bool Aligned(size_t size, size_t align = sizeof(v16u8)) - { - return Simd::Aligned(size, align); - } - - SIMD_INLINE bool Aligned(const void * ptr, size_t align = sizeof(v16u8)) - { - return Simd::Aligned(ptr, align); - } - } -#endif// SIMD_MSA_ENABLE -} - -#endif//__SimdMemory_h__ diff --git 
a/src/3rd/Simd/Simd/SimdMotion.hpp b/src/3rd/Simd/Simd/SimdMotion.hpp deleted file mode 100644 index ce6426d1..00000000 --- a/src/3rd/Simd/Simd/SimdMotion.hpp +++ /dev/null @@ -1,1636 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2018 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#ifndef __SimdMotion_hpp__ -#define __SimdMotion_hpp__ - -#include "Simd/SimdPoint.hpp" -#include "Simd/SimdRectangle.hpp" -#include "Simd/SimdFrame.hpp" -#include "Simd/SimdDrawing.hpp" -#include "Simd/SimdFont.hpp" - -#include -#include -#include - -#ifndef SIMD_CHECK_PERFORMANCE -#define SIMD_CHECK_PERFORMANCE() -#endif - -namespace Simd -{ - /*! @ingroup cpp_motion - - \short Contains Framework for motion detection. - - \note This is wrapper around low-level \ref motion_detection API. - - Using example (motion detection in the video captured by OpenCV): - \code - #include - #include - #include - - #include "opencv2/opencv.hpp" - #ifndef SIMD_OPENCV_ENABLE - #define SIMD_OPENCV_ENABLE - #endif - #include "Simd/SimdMotion.hpp" - - using namespace Simd::Motion; - typedef std::list EventList; - typedef Simd::Pixel::Bgr24 Color; - - const Color Red(0, 0, 255), Yellow(0, 255, 255), White(0, 255, 255); - - void Annotate(const Metadata & metadata, const Simd::Font & font, EventList & events, View & image) - { - for (size_t i = 0; i < metadata.objects.size(); i++) - { - const Object & object = metadata.objects[i]; - bool alarmed = false; - for (size_t j = 0; j < metadata.events.size(); ++j) - { - const Event & event = metadata.events[j]; - if (event.objectId == object.id) - { - alarmed = true; - break; - } - } - Color color = alarmed ? Red : Yellow; - int width = alarmed ? 
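    /* Two small caveats in this usage example: in BGR order (0, 255, 255) is
       yellow, so the constant named White above was presumably intended to be
       Color(255, 255, 255); and cvWaitKey() further below is the legacy OpenCV
       C API - in newer OpenCV builds cv::waitKey(1) is the equivalent call. */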
2 : 1; - Simd::DrawRectangle(image, object.rect, color, width); - font.Draw(image, ToString(object.id), Point(object.rect.left, object.rect.top - font.Height()), color); - for (size_t j = 1; j < object.trajectory.size(); ++j) - Simd::DrawLine(image, object.trajectory[j - 1].point, object.trajectory[j].point, color, width); - } - for (size_t i = 0; i < metadata.events.size(); ++i) - { - events.push_front(metadata.events[i]); - if (events.size()*font.Height() > image.height) - events.pop_back(); - } - Point location; - for (EventList::const_iterator it = events.begin(); it != events.end(); ++it) - { - std::stringstream ss; - Color color = White; - switch (it->type) - { - case Event::ObjectIn: - ss << "in " << it->objectId; - break; - case Event::ObjectOut: - ss << "out " << it->objectId; - break; - case Event::SabotageOn: - ss << "SABOTAGE ON"; - color = Red; - break; - case Event::SabotageOff: - ss << "SABOTAGE OFF"; - color = Red; - break; - }; - font.Draw(image, ss.str(), location, color); - location.y += font.Height(); - } - } - - int main(int argc, char * argv[]) - { - if (argc < 2) - { - std::cout << "You have to set video source! It can be 0 for camera or video file name." << std::endl; - return 1; - } - std::string source = argv[1]; - - cv::VideoCapture capture; - if (source == "0") - capture.open(0); - else - capture.open(source); - if (!capture.isOpened()) - { - std::cout << "Can't capture '" << source << "' !" << std::endl; - return 1; - } - - EventList events; - Detector detector; - Simd::Font font; - - const char * WINDOW_NAME = "MotionDetector"; - cv::namedWindow(WINDOW_NAME, 1); - double time = 0; - for (;;) - { - cv::Mat frame; - capture >> frame; - - View image = frame; - Frame input(image, false, time); - Metadata metadata; - - detector.NextFrame(input, metadata); - - font.Resize(image.height / 32); - - Annotate(metadata, font, events, image); - - cv::imshow(WINDOW_NAME, frame); - if (cvWaitKey(1) == 27)// "press 'Esc' to break video"; - break; - time += 0.040; - } - return 0; - } - \endcode - */ - namespace Motion - { - typedef double Time; /*!< \brief Time type. */ - typedef int Id; /*!< \brief ID type. */ - typedef std::string String; /*!< \brief String type. */ - typedef Simd::Point Size; /*!< \brief screen 2D-size (width and height). */ - typedef Simd::Point Point; /*!< \brief screen point (x and y). */ - typedef std::vector Points; /*!< \brief Vector of screen 2D-points. */ - typedef Simd::Rectangle Rect; /*!< \brief Screen rectangle. */ - typedef Simd::Point FSize; /*!< \brief ONVIF 2D-size (width and height). ONVIF size is restricted by range [0, 2]. */ - typedef Simd::Point FPoint; /*!< \brief ONVIF 2D-point (x and y). ONVIF coordinates are restricted by range [-1, 1]. */ - typedef std::vector FPoints; /*!< \brief Vector of ONVIF 2D-points. */ - typedef Simd::View View; /*!< \brief Image type. */ - typedef Simd::Frame Frame; /*!< \brief Frame type. */ - - /*! @ingroup cpp_motion - - \short Converts screen X-coordinate to ONVIF X-coordinate. - - \param [in] x - a screen X-coordinate. - \param [in] screenWidth - a screen width. - \return ONVIF X-coordinate. - */ - SIMD_INLINE double ScreenToOnvifX(ptrdiff_t x, ptrdiff_t screenWidth) - { - return double(2 * x - screenWidth) / screenWidth; - } - - /*! @ingroup cpp_motion - - \short Converts screen Y-coordinate to ONVIF Y-coordinate. - - \param [in] y - a screen Y-coordinate. - \param [in] screenHeight - a screen height. - \return ONVIF Y-coordinate. 
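        The mapping is linear, with the top row at +1: y_onvif = (H - 2*y) / H.
        A small sketch with assumed values:
        \code
        double top = Simd::Motion::ScreenToOnvifY(0, 480);      // +1.0
        double middle = Simd::Motion::ScreenToOnvifY(240, 480); //  0.0
        double bottom = Simd::Motion::ScreenToOnvifY(480, 480); // -1.0 (bottom edge)
        \endcode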
- */ - SIMD_INLINE double ScreenToOnvifY(ptrdiff_t y, ptrdiff_t screenHeight) - { - return double(screenHeight - 2 * y) / screenHeight; - } - - /*! @ingroup cpp_motion - - \short Converts screen 2D-coordinates to ONVIF 2D-coordinates. - - \param [in] point - a screen 2D-coordinates. - \param [in] screenSize - a screen size (width and height). - \return ONVIF 2D-coordinate. - */ - SIMD_INLINE FPoint ScreenToOnvif(const Point & point, const Point & screenSize) - { - return FPoint(ScreenToOnvifX(point.x, screenSize.x), ScreenToOnvifY(point.y, screenSize.y)); - } - - /*! @ingroup cpp_motion - - \short Converts screen 2D-size to ONVIF 2D-size. - - \param [in] size - a screen 2D-size. - \param [in] screenSize - a screen size (width and height). - \return ONVIF 2D-size. - */ - SIMD_INLINE FSize ScreenToOnvifSize(const Size & size, const Point & screenSize) - { - return FSize(double(size.x * 2 / screenSize.x), double(size.y * 2 / screenSize.y)); - } - - /*! @ingroup cpp_motion - - \short Converts ONVIF X-coordinate to screen X-coordinate. - - \param [in] x - a ONVIF X-coordinate. ONVIF coordinates are restricted by range [-1, 1]. - \param [in] screenWidth - a screen width. - \return screen X-coordinate. - */ - SIMD_INLINE ptrdiff_t OnvifToScreenX(double x, ptrdiff_t screenWidth) - { - return std::max(ptrdiff_t(0), std::min(screenWidth - 1, (ptrdiff_t)Simd::Round((1.0 + x)*screenWidth / 2.0))); - } - - /*! @ingroup cpp_motion - - \short Converts ONVIF Y-coordinate to screen Y-coordinate. - - \param [in] y - a ONVIF Y-coordinate. ONVIF coordinates are restricted by range [-1, 1]. - \param [in] screenHeight - a screen height. - \return screen Y-coordinate. - */ - SIMD_INLINE ptrdiff_t OnvifToScreenY(double y, ptrdiff_t screenHeight) - { - return std::max(ptrdiff_t(0), std::min(screenHeight - 1, (ptrdiff_t)Simd::Round((1.0 - y)*screenHeight / 2.0))); - } - - /*! @ingroup cpp_motion - - \short Converts ONVIF 2D-coordinates to screen 2D-coordinates. - - \param [in] point - a ONVIF 2D-coordinates. ONVIF coordinates are restricted by range [-1, 1]. - \param [in] screenSize - a screen size (width and height). - \return screen 2D-coordinate. - */ - SIMD_INLINE Point OnvifToScreen(const FPoint & point, const Point & screenSize) - { - return Point(OnvifToScreenX(point.x, screenSize.x), OnvifToScreenY(point.y, screenSize.y)); - } - - /*! @ingroup cpp_motion - - \short Converts ONVIF 2D-size to screen 2D-size. - - \param [in] size - a ONVIF 2D-size. ONVIF size is restricted by range [0, 2]. - \param [in] screenSize - a screen size (width and height). - \return screen 2D-size. - */ - SIMD_INLINE Size OnvifToScreenSize(const FSize & size, const Point & screenSize) - { - return Size(Round(size.x*screenSize.x / 2.0), Round(size.y*screenSize.y / 2.0)); - } - - /*! @ingroup cpp_motion - - \short Converts ID to string. - - \param [in] id - an ID. - \return string representation of ID. - */ - SIMD_INLINE String ToString(Id id) - { - std::stringstream ss; - ss << id; - return ss.str(); - } - - /*! @ingroup cpp_motion - - \short Position structure. - - Describes position (2D-point and time) of detected object. - */ - struct Position - { - Point point; /*!< \brief Screen 2D-point. */ - Time time; /*!< \brief A timestamp. */ - }; - typedef std::vector Positions; /*!< \brief Vector of object positions. */ - - /*! @ingroup cpp_motion - - \short Object structure. - - Describes object detected at screen by Simd::Motion::Detector. - */ - struct Object - { - Id id; /*!< \brief An object ID. 
*/ - Rect rect; /*!< \brief A bounding box around the object. */ - Positions trajectory; /*!< \brief A trajectory of the object. */ - }; - typedef std::vector Objects; /*!< \brief Vector of objects. */ - - /*! @ingroup cpp_motion - - \short Event structure. - - Describes event generated by Simd::Motion::Detector. - */ - struct Event - { - /*! - \enum Type - - Describes types of event. - */ - enum Type - { - ObjectIn, /*!< \brief An appearing of new object. */ - ObjectOut, /*!< \brief A disappearing of object */ - SabotageOn, /*!< \brief An appearing of too big motion on the screen. */ - SabotageOff, /*!< \brief A disappearing of too big motion on the screen. */ - } type; /*!< \brief A type of event. */ - - String text; /*!< \brief Event text description. */ - Id objectId; /*!< \brief ID of object concerned with this event or -1. */ - - /*! - Constructs Event structure. - - \param [in] type_ - a type of a new event. - \param [in] text_ - a text description of the event. It is equal to empty string by default. - \param [in] objectId_ - an ID of object concerned with this event. It is equal to -1 by default. - */ - Event(Type type_, const String & text_ = String(), Id objectId_ = -1) - : type(type_) - , text(text_) - , objectId(objectId_) - { - } - }; - typedef std::vector Events; /*!< \brief Vector of events. */ - - /*! @ingroup cpp_motion - - \short Metadata structure. - - Contains lists of detected objects and events generated by Simd::Motion::Detector at current frame. - */ - struct Metadata - { - Objects objects; /*!< \brief A list of objects detected by Simd::Motion::Detector at current frame. */ - Events events; /*!< \brief A list of events generated by Simd::Motion::Detector at current frame. */ - }; - - /*! @ingroup cpp_motion - - \short Model structure. - - Describes screen scene. It is used by Simd::Motion::Detector for algorithm calibration. - */ - struct Model - { - FSize size; /*!< \brief A minimal size of object to detect. ONVIF size is restricted by range [0, 2]. */ - FPoints roi; /*!< \brief A ROI (region of interest). ONVIF coordinates is restricted by range [-1, 1]. */ - - /*! - Copy constructor of Model. - - \param [in] model - other model. - */ - Model(const Model & model) - : size(model.size) - , roi(model.roi) - { - } - - /*! - Constructs Event structure. - - \param [in] size_ - a minimal size of detected object. It is default value is (0.1, 0.1) ~ 0.25% of screen area. - \param [in] roi_ - a ROI (region of interest). It is empty by default (all screen). - */ - Model(const FSize & size_ = FSize(0.1, 0.1), const FPoints & roi_ = FPoints()) - : size(size_) - , roi(roi_) - { - if (roi.size() < 3) - { - roi.clear(); - roi.push_back(FPoint(-1.0, 1.0)); - roi.push_back(FPoint(1.0, 1.0)); - roi.push_back(FPoint(1.0, -1.0)); - roi.push_back(FPoint(-1.0, -1.0)); - } - } - }; - - /*! @ingroup cpp_motion - - \short Options structure. - - Describes options used by Simd::Motion::Detector. - */ - struct Options - { - int CalibrationScaleLevelMax; /*!< \brief A maximum scale of input frame. By default it is equal to 3 (maximum scale in 8 times). */ - - int DifferenceGrayFeatureWeight; /*!< \brief A weight of gray feature for difference estimation. By default it is equal to 18. */ - int DifferenceDxFeatureWeight; /*!< \brief A weight of X-gradient feature for difference estimation. By default it is equal to 18. */ - int DifferenceDyFeatureWeight; /*!< \brief A weight of Y-gradient feature for difference estimation. By default it is equal to 18. 
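        (All three difference weights are applied in fixed point:
        Texture::Feature::Create() stores weight * 256 in a uint16_t, and
        EstimateDifference() accumulates the weighted deviation of each feature
        from its learned [lo, hi] background band into the 8-bit difference
        pyramid via Simd::AddFeatureDifference().)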
*/ - bool DifferencePropagateForward; /*!< \brief An additional boosting of estimated difference. By default it is true. */ - bool DifferenceRoiMaskEnable; /*!< \brief A flag to restrict difference estimation by ROI. By default it is true. */ - - double BackgroundGrowTime; /*!< \brief Initial time (in seconds) of updated background in fast mode. By default it is equal to 1 second. */ - double BackgroundIncrementTime; /*!< \brief Background update speed (in seconds) in normal mode. By default it is equal to 1 second. */ - int BackgroundSabotageCountMax; /*!< \brief Maximal count of frame with sabotage without scene reinitialization. By default it is equal to 3. */ - - double SegmentationCreateThreshold; /*!< \brief Threshold of segmentation to create motion region. It is restricted by range [0, 1]. By default it is equal to 0.5. */ - double SegmentationExpandCoefficient; /*!< \brief Segmentation coefficient of area expansion of motion region. It is restricted by range [0, 1]. By default it is equal to 0.75. */ - - double StabilityRegionAreaMax; /*!< \brief Defines maximal total area of motion regions othervise sabotage event is generated. It is restricted by range [0, 1]. By default it is equal to 0.5. */ - - int TrackingTrajectoryMax; /*!< \brief Maximal length of object trajectory. By default it is equal to 1024. */ - double TrackingRemoveTime; /*!< \brief A time (in seconds) to remove absent object. By default it is equal to 1 second. */ - double TrackingAdditionalLinking; /*!< \brief A coefficient to boost trajectory linking. By default it is equal to 0. */ - int TrackingAveragingHalfRange; /*!< \brief A half range parameter used to average object trajectory. By default it is equal to 12. */ - - double ClassificationShiftMin; /*!< \brief A minimal shift (in screen diagonals) of motion region to detect object. By default it is equal to 0.075. */ - double ClassificationTimeMin; /*!< \brief A minimal life time (in seconds) of motion region to detect object. By default it is equal to 1 second. */ - - int DebugDrawLevel; /*!< \brief A pyramid level used for debug annotation. By default it is equal to 1. */ - int DebugDrawBottomRight; /*!< \brief A type of debug annotation in right bottom corner (0 - empty; 1 = difference; 2 - texture.gray.value; 3 - texture.dx.value; 4 - texture.dy.value). By default it is equal to 0. */ - bool DebugAnnotateModel; /*!< \brief Debug annotation of model. By default it is equal to false. */ - bool DebugAnnotateMovingRegions; /*!< \brief Debug annotation of moving region. By default it is equal to false. */ - bool DebugAnnotateTrackingObjects; /*!< \brief Debug annotation of tracked objects. By default it is equal to false. */ - - /*! - Default constructor of Options. 
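        A minimal configuration sketch (the particular values are only
        illustrative):
        \code
        Simd::Motion::Options options;
        options.TrackingRemoveTime = 2.0; // keep lost objects for 2 seconds
        options.DebugAnnotateTrackingObjects = true;
        Simd::Motion::Detector detector;
        detector.SetOptions(options);
        \endcode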
- */ - Options() - { - CalibrationScaleLevelMax = 3; - - DifferenceGrayFeatureWeight = 18; - DifferenceDxFeatureWeight = 18; - DifferenceDyFeatureWeight = 18; - DifferencePropagateForward = true; - DifferenceRoiMaskEnable = true; - - BackgroundGrowTime = 1.0; - BackgroundIncrementTime = 1.0; - BackgroundSabotageCountMax = 3; - - SegmentationCreateThreshold = 0.5; - SegmentationExpandCoefficient = 0.75; - - StabilityRegionAreaMax = 0.5; - - TrackingTrajectoryMax = 1024; - TrackingRemoveTime = 1.0; - TrackingAdditionalLinking = 0.0; - TrackingAveragingHalfRange = 12; - - ClassificationShiftMin = 0.075; - ClassificationTimeMin = 1.0; - - DebugDrawLevel = 1; - DebugDrawBottomRight = 0; - DebugAnnotateModel = false; - DebugAnnotateMovingRegions = false; - DebugAnnotateTrackingObjects = false; - } - }; - - /*! @ingroup cpp_motion - - \short Class Detector. - - Performs motion detection. - */ - class Detector - { - public: - - /*! - Default constructor of Detector. - */ - Detector() - { - } - - /*! - Destructor of Detector. - */ - virtual ~Detector() - { - } - - /*! - Sets options of motion detector. - - \param [in] options - options of motion detector. - \return a result of the operation. - */ - bool SetOptions(const Simd::Motion::Options & options) - { - *(Simd::Motion::Options*)(&_options) = options; - return true; - } - - /*! - Sets model of scene of motion detector. - - \param [in] model - a model of scene. - \return a result of the operation. - */ - bool SetModel(const Model & model) - { - _model = model; - return true; - } - - /*! - Processes next frame. You have to successively process all frame of a movie with using of this function. - - \param [in] input - a current input frame. - \param [out] metadata - a metadata (sets of detected objects and generated events). It is a result of processing of current frame. - \param [out] output - a pointer to output frame with debug annotation. Can be NULL. - \return a result of the operation. 
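        Internally each call runs a fixed pipeline over the frame pyramid:
        calibration (on frame-size change), texture estimation, difference
        estimation, segmentation into moving regions, stability (sabotage)
        verification, object tracking, classification, background update, and
        finally metadata and debug-annotation output.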
- */ - bool NextFrame(const Frame & input, Metadata & metadata, Frame * output = NULL) - { - SIMD_CHECK_PERFORMANCE(); - - if (output && output->Size() != input.Size()) - return false; - - if (!Calibrate(input.Size())) - return false; - - _scene.metadata = &metadata; - _scene.metadata->events.clear(); - - SetFrame(input, output); - - EstimateTextures(); - - EstimateDifference(); - - PerformSegmentation(); - - VerifyStability(); - - TrackObjects(); - - ClassifyObjects(); - - UpdateBackground(); - - SetMetadata(); - - DebugAnnotation(); - - return true; - } - - private: - Simd::Motion::Model _model; - - struct Options : public Simd::Motion::Options - { - int CalibrationLevelCountMin; - int CalibrationTopLevelSizeMin; - int CalibrationObjectAreaMin; - - int TextureGradientSaturation; - int TextureGradientBoost; - - - Options() - : Simd::Motion::Options() - { - CalibrationLevelCountMin = 3; - CalibrationTopLevelSizeMin = 32; - CalibrationObjectAreaMin = 16; - - TextureGradientSaturation = 16; - TextureGradientBoost = 4; - } - } _options; - - typedef std::pair Scanline; - typedef std::vector Scanlines; - typedef std::vector Rects; - typedef Simd::Rectangle FRect; - typedef Simd::Pyramid Pyramid; - - struct SearchRegion - { - Rect rect; // rectangle on corresponding pyramid level (scale) - int scale; // pyramid level - Scanlines scanlines; - - SearchRegion() - : scale(0) - { - } - - SearchRegion(const Rect & rect_, const int & scale_) - : rect(rect_) - , scale(scale_) - { - } - }; - typedef std::vector SearchRegions; - - struct Model - { - Size originalFrameSize; - - Size frameSize; - size_t scale; - size_t scaleLevel; - - size_t levelCount; - int areaRegionMinEstimated; - - Points roi; - Pyramid roiMask; - SearchRegions searchRegions; - }; - - struct Object; - - struct MovingRegion - { - Rects rects; - - uint8_t index; - Rect rect; - int level; - Time time; - Point point; - Detector::Object * object, * nearest; - - MovingRegion(const uint8_t & index_, const Rect & rect_, int level_, const Time & time_) - : index(index_) - , rect(rect_) - , level(level_) - , time(time_) - , object(NULL) - , nearest(NULL) - { - rects.resize(level + 1); - } - }; - typedef std::shared_ptr MovingRegionPtr; - typedef std::vector MovingRegionPtrs; - - struct Texture - { - struct Bound - { - Pyramid value; - Pyramid count; - - void Create(const Size & size, size_t levelCount) - { - value.Recreate(size, levelCount); - count.Recreate(size, levelCount); - } - }; - - struct Feature - { - Pyramid value; - Bound lo; - Bound hi; - uint16_t weight; - - void Create(const Size & size, size_t levelCount, int weight_) - { - value.Recreate(size, levelCount); - lo.Create(size, levelCount); - hi.Create(size, levelCount); - weight = uint16_t(weight_ * 256); - } - }; - - enum FeatureType - { - FeatureGray, - FeatureDx, - FeatureDy, - }; - - Feature gray; - Feature dx; - Feature dy; - - typedef std::vector Features; - Features features; - - void Create(const Size & size, size_t levelCount, const Options & options) - { - gray.Create(size, levelCount, options.DifferenceGrayFeatureWeight); - dx.Create(size, levelCount, options.DifferenceDxFeatureWeight); - dy.Create(size, levelCount, options.DifferenceDyFeatureWeight); - - features.clear(); - features.push_back(&gray); - features.push_back(&dx); - features.push_back(&dy); - } - }; - - struct Background - { - enum State - { - Init, - Grow, - Update - }; - - State state; - int count; - int sabotageCounter; - Time growEndTime; - Time lastFrameTime; - Time incrementCounterTime; - - 
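            // State machine (driven by UpdateBackground() below): Init seeds the
            // per-feature [lo, hi] bands from the current frame and moves to Grow;
            // Grow widens the bands with BackgroundGrowRangeFast until the scene
            // has stayed stable for BackgroundGrowTime; Update counts out-of-band
            // pixels and periodically applies BackgroundAdjustRange, re-entering
            // Init after BackgroundSabotageCountMax consecutive sabotage frames.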
Background() - : state(Init) - { - } - }; - - struct Stability - { - enum State - { - Stable, - Sabotage - } state; - - Stability() - : state(Stable) - { - } - }; - - struct Segmentation - { - enum MaskIndices - { - MaskNotVisited = 0, - MaskSeed = 1, - MaskInvalid = 2, - MaskIndexSize, - }; - - Pyramid mask; - - int differenceCreationMin; - int differenceExpansionMin; - - MovingRegionPtrs movingRegions; - }; - - struct Object - { - Id trackingId, classificationId; - Point center; - Rect rect; - MovingRegionPtrs trajectory; - - enum Type - { - Static, - Moving, - } type; - - Point pointStart; - Time timeStart; - - Object(const Id trackingId_, const MovingRegionPtr & region) - : trackingId(trackingId_) - , classificationId(-1) - , center(region->rect.Center()) - , rect(region->rect) - , type(Static) - , pointStart(region->rect.Center()) - , timeStart(region->time) - { - trajectory.push_back(region); - } - }; - typedef std::shared_ptr ObjectPtr; - typedef std::vector ObjectPtrs; - - struct Tracking - { - ObjectPtrs objects; - ObjectPtrs justDeletedObjects; - Id id; - - Tracking() - : id(0) - { - } - }; - - struct Classification - { - ptrdiff_t squareShiftMin; - Id id; - - Classification() - : id(0) - { - } - }; - - struct Scene - { - Frame input, * output; - Pyramid scaled; - Metadata * metadata; - - Font font; - Pyramid buffer; - Detector::Model model; - - Texture texture; - - Background background; - - Stability stability; - - Pyramid difference; - - Segmentation segmentation; - - Tracking tracking; - - Classification classification; - - void Create(const Options & options) - { - scaled.Recreate(model.originalFrameSize, model.scaleLevel + 1); - font.Resize(model.originalFrameSize.y / 32); - buffer.Recreate(model.frameSize, model.levelCount); - - texture.Create(model.frameSize, model.levelCount, options); - difference.Recreate(model.frameSize, model.levelCount); - - segmentation.mask.Recreate(model.frameSize, model.levelCount); - segmentation.differenceCreationMin = int(255 * options.SegmentationCreateThreshold); - segmentation.differenceExpansionMin = int(255 * options.SegmentationExpandCoefficient*options.SegmentationCreateThreshold); - - classification.squareShiftMin = ptrdiff_t(Simd::SquaredDistance(model.frameSize, Point())* - options.ClassificationShiftMin*options.ClassificationShiftMin); - } - }; - Scene _scene; - - void SetFrame(const Frame & input, Frame * output) - { - SIMD_CHECK_PERFORMANCE(); - - _scene.input = input; - _scene.output = output; - Simd::Convert(input, Frame(_scene.scaled[0]).Ref()); - Simd::Build(_scene.scaled, SimdReduce2x2); - } - - bool Calibrate(const Size & frameSize) - { - Model & model = _scene.model; - - if (model.originalFrameSize == frameSize) - return true; - - SIMD_CHECK_PERFORMANCE(); - - model.originalFrameSize = frameSize; - - EstimateModelParameters(model); - SetScreenRoi(model); - GenerateSearchRegion(model); - GenerateSearchRegionScanlines(model); - - _scene.Create(_options); - - return true; - } - - void EstimateModelParameters(Model & model) - { - Size objectSize = OnvifToScreenSize(_model.size, model.originalFrameSize); - Size size = model.originalFrameSize; - model.areaRegionMinEstimated = int(objectSize.x*objectSize.y); - int levelCount = 1; - while (size.x >= _options.CalibrationTopLevelSizeMin && size.y >= _options.CalibrationTopLevelSizeMin && model.areaRegionMinEstimated > _options.CalibrationObjectAreaMin) - { - size = Simd::Scale(size); - ++levelCount; - model.areaRegionMinEstimated /= 4; - } - model.areaRegionMinEstimated = 
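                /* each extra pyramid level halves the linear size, so the minimal
                   region area shrinks 4x per level; the std::max that follows keeps
                   it above the CalibrationObjectAreaMin-derived floor */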
std::max(model.areaRegionMinEstimated, _options.CalibrationObjectAreaMin / 4 + 1); - model.scaleLevel = std::min(std::max(levelCount - _options.CalibrationLevelCountMin, 0), _options.CalibrationScaleLevelMax); - model.levelCount = levelCount - model.scaleLevel; - model.scale = size_t(1) << model.scaleLevel; - model.frameSize = model.originalFrameSize; - for (size_t level = 0; level < model.scaleLevel; ++level) - model.frameSize = Simd::Scale(model.frameSize); - } - - void SetScreenRoi(Model & model) - { - if (_model.roi.size() > 2) - { - model.roi.resize(_model.roi.size()); - for (size_t i = 0; i < _model.roi.size(); ++i) - model.roi[i] = OnvifToScreen(_model.roi[i], model.frameSize); - } - else - { - model.roi.clear(); - model.roi.push_back(Point(0, 0)); - model.roi.push_back(Point(model.frameSize.x, 0)); - model.roi.push_back(Point(model.frameSize.x, model.frameSize.y)); - model.roi.push_back(Point(0, model.frameSize.y)); - } - } - - void GenerateSearchRegion(Model & model) - { - model.searchRegions.clear(); - - Size size(model.frameSize); - for (size_t level = 1; level < model.levelCount; level++) - size = Simd::Scale(size); - - int level = (int)model.levelCount - 1; - const Rect rect(1, 1, size.x - 1, size.y - 1); - model.searchRegions.push_back(SearchRegion(rect, level)); - } - - void GenerateSearchRegionScanlines(Model & model) - { - static const uint8_t ROI_EMPTY = 0; - static const uint8_t ROI_NON_EMPTY = 255; - - model.roiMask.Recreate(model.frameSize, model.levelCount); - Simd::Fill(model.roiMask, ROI_EMPTY); - DrawFilledPolygon(model.roiMask[0], model.roi, ROI_NON_EMPTY); - Simd::Build(model.roiMask, SimdReduce4x4); - - for (size_t i = 0; i < model.searchRegions.size(); ++i) - { - SearchRegion & region = model.searchRegions[i]; - assert(region.scale < (int)model.roiMask.Size()); - - const View & view = model.roiMask[region.scale]; - const Rect & rect = region.rect; - for (ptrdiff_t row = rect.Top(); row < rect.Bottom(); ++row) - { - ptrdiff_t offset = row * view.stride + rect.Left(); - ptrdiff_t end = offset + rect.Width(); - for (; offset < end;) - { - if (view.data[offset]) - { - Scanline scanline; - scanline.first = offset; - while (++offset < end && view.data[offset]); - scanline.second = offset; - region.scanlines.push_back(scanline); - } - else - ++offset; - } - } - } - } - - void EstimateTextures() - { - SIMD_CHECK_PERFORMANCE(); - - Texture & texture = _scene.texture; - Simd::Copy(_scene.scaled.Top(), texture.gray.value[0]); - Simd::Build(texture.gray.value, SimdReduce4x4); - for (size_t i = 0; i < texture.gray.value.Size(); ++i) - { - Simd::TextureBoostedSaturatedGradient(texture.gray.value[i], - _options.TextureGradientSaturation, _options.TextureGradientBoost, - texture.dx.value[i], texture.dy.value[i]); - } - } - - void EstimateDifference() - { - SIMD_CHECK_PERFORMANCE(); - - const Texture & texture = _scene.texture; - Pyramid & difference = _scene.difference; - Pyramid & buffer = _scene.buffer; - for (size_t i = 0; i < difference.Size(); ++i) - { - Simd::Fill(difference[i], 0); - for (size_t j = 0; j < texture.features.size(); ++j) - { - const Texture::Feature & feature = *texture.features[j]; - Simd::AddFeatureDifference(feature.value[i], feature.lo.value[i], feature.hi.value[i], feature.weight, difference[i]); - } - } - if (_options.DifferencePropagateForward) - { - for (size_t i = 1; i < difference.Size(); ++i) - { - Simd::ReduceGray4x4(difference[i - 1], buffer[i]); - Simd::OperationBinary8u(difference[i], buffer[i], difference[i], SimdOperationBinary8uMaximum); 
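                    // i.e. the finer level's difference map is reduced with a 4x4
                    // kernel into a scratch buffer and max-merged into the coarser
                    // level, so a strong response survives upward through the
                    // pyramid ("forward propagation" of the difference).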
- } - } - if (_options.DifferenceRoiMaskEnable) - { - for (size_t i = 0; i < difference.Size(); ++i) - Simd::OperationBinary8u(difference[i], _scene.model.roiMask[i], difference[i], SimdOperationBinary8uAnd); - } - } - - void PerformSegmentation() - { - SIMD_CHECK_PERFORMANCE(); - - Point neighbours[4]; - neighbours[0] = Point(-1, 0); - neighbours[1] = Point(0, -1); - neighbours[2] = Point(1, 0); - neighbours[3] = Point(0, 1); - - Segmentation & segmentation = _scene.segmentation; - const Model & model = _scene.model; - const Time & time = _scene.input.timestamp; - - segmentation.movingRegions.clear(); - - Simd::Fill(segmentation.mask, Segmentation::MaskNotVisited); - for (size_t i = 0; i < model.searchRegions.size(); ++i) - { - View & mask = segmentation.mask.At(model.searchRegions[i].scale); - Simd::FillFrame(mask, Rect(1, 1, mask.width - 1, mask.height - 1), Segmentation::MaskInvalid); - } - - for (size_t i = 0; i < model.searchRegions.size(); ++i) - { - const SearchRegion & searchRegion = model.searchRegions[i]; - int level = searchRegion.scale; - const View & difference = _scene.difference.At(level); - View & mask = segmentation.mask.At(level); - Rect roi = searchRegion.rect; - - for (size_t i = 0; i < searchRegion.scanlines.size(); ++i) - { - const Scanline & scanline = searchRegion.scanlines[i]; - for (size_t offset = scanline.first; offset < scanline.second; ++offset) - { - if (difference.data[offset] > segmentation.differenceCreationMin && mask.data[offset] == Segmentation::MaskNotVisited) - mask.data[offset] = Segmentation::MaskSeed; - } - } - - ShrinkRoi(mask, roi, Segmentation::MaskSeed); - roi &= searchRegion.rect; - - for (ptrdiff_t y = roi.top; y < roi.bottom; ++y) - { - for (ptrdiff_t x = roi.left; x < roi.right; ++x) - { - if (mask.At(x, y) == Segmentation::MaskSeed) - { - std::stack stack; - stack.push(Point(x, y)); - if (segmentation.movingRegions.size() + Segmentation::MaskIndexSize > UINT8_MAX) - return; - MovingRegionPtr region(new MovingRegion(uint8_t(segmentation.movingRegions.size() + Segmentation::MaskIndexSize), Rect(), level, time)); - while (!stack.empty()) - { - Point current = stack.top(); - stack.pop(); - mask.At(current) = region->index; - region->rect |= current; - for (size_t n = 0; n < 4; ++n) - { - Point neighbour = current + neighbours[n]; - if (difference.At(neighbour) > segmentation.differenceExpansionMin && mask.At(neighbour) <= Segmentation::MaskSeed) - stack.push(neighbour); - } - } - - if (region->rect.Area() <= model.areaRegionMinEstimated) - Simd::SegmentationChangeIndex(segmentation.mask[region->level].Region(region->rect).Ref(), region->index, Segmentation::MaskInvalid); - else - { - ComputeIndex(segmentation, *region); - if (!region->rect.Empty()) - { - region->level = searchRegion.scale; - region->point = region->rect.Center(); - segmentation.movingRegions.push_back(region); - } - } - } - } - } - } - } - - SIMD_INLINE void ShrinkRoi(const View & mask, Rect & roi, uint8_t index) - { - Simd::SegmentationShrinkRegion(mask, index, roi); - if (!roi.Empty()) - roi.AddBorder(1); - } - - SIMD_INLINE void ExpandRoi(const Rect & roiParent, const Rect & rectChild, Rect & roiChild) - { - roiChild.SetTopLeft(roiParent.TopLeft() * 2 - Point(1, 1)); - roiChild.SetBottomRight(roiParent.BottomRight() * 2 + Point(1, 1)); - roiChild.AddBorder(1); - roiChild &= rectChild; - } - - void ComputeIndex(const View & parentMask, View & childMask, const View & difference, MovingRegion & region, int differenceExpansionMin) - { - Rect rect = region.rect; - rect.right++; 
- rect.bottom++; - Simd::SegmentationPropagate2x2(parentMask.Region(rect), childMask.Region(2 * rect).Ref(), difference.Region(2 * rect), - region.index, Segmentation::MaskInvalid, Segmentation::MaskNotVisited, differenceExpansionMin); - - Rect rectChild(childMask.Size()); - rectChild.AddBorder(-1); - - ExpandRoi(region.rect, rectChild, region.rect); - ShrinkRoi(childMask, region.rect, region.index); - region.rect &= rectChild; - } - - void ComputeIndex(Segmentation & segmentation, MovingRegion & region) - { - region.rects[region.level] = region.rect; - - int level = region.level; - std::stack rects; - for (; region.level > 0; --region.level) - { - const int levelChild = region.level - 1; - - rects.push(region.rect); - ComputeIndex(segmentation.mask[region.level], segmentation.mask[levelChild], _scene.difference[levelChild], region, segmentation.differenceExpansionMin); - - region.rects[region.level - 1] = region.rect; - - if (region.rect.Empty()) - { - for (; region.level <= level; region.level++) - { - region.rect = rects.top(); - rects.pop(); - Simd::SegmentationChangeIndex(segmentation.mask[region.level].Region(region.rect).Ref(), region.index, Segmentation::MaskInvalid); - } - region.rect = Rect(); - return; - } - } - } - - void VerifyStability() - { - SIMD_CHECK_PERFORMANCE(); - - if (_scene.background.state == Background::Init) - return; - View mask = _scene.segmentation.mask[0]; - uint32_t count; - Simd::ConditionalCount8u(mask, Segmentation::MaskIndexSize, SimdCompareGreaterOrEqual, count); - bool sabotage = count >= mask.Area()*_options.StabilityRegionAreaMax; - if (sabotage) - { - if (_scene.stability.state != Stability::Sabotage) - _scene.metadata->events.push_back(Event(Event::SabotageOn, "SabotageOn")); - _scene.stability.state = Stability::Sabotage; - } - else - { - if (_scene.stability.state == Stability::Sabotage) - _scene.metadata->events.push_back(Event(Event::SabotageOff, "SabotageOff")); - _scene.stability.state = Stability::Stable; - } - } - - void TrackObjects() - { - SIMD_CHECK_PERFORMANCE(); - - if (_scene.background.state != Background::Update) - { - RemoveAllObjects(); - return; - } - - RefreshObjectsTrajectory(); - - DeleteOldObjects(); - - SetNearestObjects(); - - LinkObjects(); - - AddNewObjects(); - } - - void RemoveAllObjects() - { - _scene.tracking.justDeletedObjects.clear(); - if (_scene.tracking.objects.size()) - { - for (size_t i = 0; i < _scene.tracking.objects.size(); ++i) - { - ObjectPtr & object = _scene.tracking.objects[i]; - if (object->type == Object::Moving) - _scene.metadata->events.push_back(Event(Event::ObjectOut, "ObjectOut", object->classificationId)); - _scene.tracking.justDeletedObjects.push_back(object); - } - _scene.tracking.objects.clear(); - } - } - - void RefreshObjectsTrajectory() - { - ObjectPtrs & objects = _scene.tracking.objects; - for (size_t j = 0; j < objects.size(); ++j) - { - ObjectPtr & object = objects[j]; - if (object->trajectory.size() > (size_t)_options.TrackingTrajectoryMax) - object->trajectory.erase(object->trajectory.begin()); - } - } - - void DeleteOldObjects() - { - Time current = _scene.input.timestamp; - Tracking & tracking = _scene.tracking; - tracking.justDeletedObjects.clear(); - ObjectPtrs buffer; - for (size_t i = 0; i < tracking.objects.size(); ++i) - { - const ObjectPtr & object = tracking.objects[i]; - if (current - object->trajectory.back()->time < _options.TrackingRemoveTime) - buffer.push_back(object); - else - { - tracking.justDeletedObjects.push_back(object); - if (object->type == Object::Moving) - 
_scene.metadata->events.push_back(Event(Event::ObjectOut, "ObjectOut", object->classificationId)); - } - } - tracking.objects.swap(buffer); - } - - void SetNearestObjects() - { - for (size_t i = 0; i < _scene.segmentation.movingRegions.size(); ++i) - { - MovingRegion & region = *_scene.segmentation.movingRegions[i]; - region.nearest = NULL; - ptrdiff_t minDifferenceSquared = std::numeric_limits::max(); - for (size_t j = 0; j < _scene.tracking.objects.size(); ++j) - { - Detector::Object * object = _scene.tracking.objects[j].get(); - const ptrdiff_t differenceSquared = Simd::SquaredDistance(object->center, region.rect.Center()); - if (differenceSquared < minDifferenceSquared) - { - minDifferenceSquared = differenceSquared; - region.nearest = object; - } - } - } - } - - void LinkObjects() - { - for (size_t i = 0; i < _scene.tracking.objects.size(); ++i) - { - ObjectPtr & object = _scene.tracking.objects[i]; - MovingRegionPtr nearest; - ptrdiff_t minDifferenceSquared = std::numeric_limits::max(); - for (size_t j = 0; j < _scene.segmentation.movingRegions.size(); ++j) - { - MovingRegionPtr & region = _scene.segmentation.movingRegions[j]; - if (region->object != NULL) - continue; - if (object.get() != region->nearest) - continue; - Rect regionRect = Enlarged(region->rect); - Rect objectRect = Enlarged(object->rect); - ptrdiff_t differenceSquared = Simd::SquaredDistance(object->center, region->rect.Center()); - if (regionRect.Contains(object->center) || objectRect.Contains(region->rect.Center())) - { - if (differenceSquared < minDifferenceSquared) - { - minDifferenceSquared = differenceSquared; - nearest = region; - } - } - } - if (nearest) - { - nearest->object = object.get(); - object->trajectory.push_back(nearest); - Rect sum; - size_t end = object->trajectory.size(), start = std::max(0, end - _options.TrackingAveragingHalfRange); - for (size_t j = start; j < end; ++j) - sum += object->trajectory[j]->rect; - object->rect = sum / (end - start); - object->rect.Shift(nearest->rect.Center() - object->rect.Center()); - object->rect &= Rect(_scene.model.frameSize); - object->center = nearest->rect.Center(); - } - } - } - - SIMD_INLINE Rect Enlarged(Rect rect) - { - ptrdiff_t size = (rect.Width() + rect.Height()) / 2; - ptrdiff_t border = ptrdiff_t(::ceil(size*_options.TrackingAdditionalLinking)); - rect.AddBorder(border); - return rect; - } - - void AddNewObjects() - { - for (size_t j = 0; j < _scene.segmentation.movingRegions.size(); ++j) - { - const MovingRegionPtr & region = _scene.segmentation.movingRegions[j]; - if (region->object != NULL) - continue; - bool contained = false; - for (size_t i = 0; i < _scene.tracking.objects.size(); ++i) - { - const ObjectPtr & object = _scene.tracking.objects[i]; - if (object->rect.Contains(region->rect.Center())) - { - contained = true; - break; - } - } - if (!contained) - { - ObjectPtr object(new Object(_scene.tracking.id++, region)); - region->object = object.get(); - _scene.tracking.objects.push_back(object); - } - } - } - - void ClassifyObjects() - { - for (size_t i = 0; i < _scene.tracking.objects.size(); ++i) - { - Object & object = *_scene.tracking.objects[i]; - if (object.type == Object::Static) - { - Time time = _scene.input.timestamp - object.timeStart; - ptrdiff_t squareShift = Simd::SquaredDistance(object.trajectory.back()->point, object.pointStart); - if (time >= _options.ClassificationTimeMin && squareShift >= _scene.classification.squareShiftMin) - { - object.type = Object::Moving; - object.classificationId = _scene.classification.id++; - 
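                        // Promotion rule: a Static object turns Moving once it has
                        // existed for at least ClassificationTimeMin seconds AND its
                        // net squared shift reaches squareShiftMin, derived in
                        // Scene::Create() as (frame diagonal * ClassificationShiftMin)^2;
                        // with the default 0.075 an object must drift about 7.5% of
                        // the frame diagonal before an ObjectIn event is emitted.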
_scene.metadata->events.push_back(Event(Event::ObjectIn, "ObjectIn", object.classificationId)); - } - } - } - } - - struct InitUpdater - { - void operator()(View & value, View & loValue, View & loCount, View & hiValue, View & hiCount) const - { - Simd::Copy(value, loValue); - Simd::Copy(value, hiValue); - Simd::Fill(loCount, 0); - Simd::Fill(hiCount, 0); - } - }; - - struct GrowRangeUpdater - { - void operator()(View & value, View & loValue, View & loCount, View & hiValue, View & hiCount) const - { - Simd::BackgroundGrowRangeFast(value, loValue, hiValue); - } - }; - - struct IncrementCountUpdater - { - void operator()(View & value, View & loValue, View & loCount, View & hiValue, View & hiCount) const - { - Simd::BackgroundIncrementCount(value, loValue, hiValue, loCount, hiCount); - } - }; - - struct AdjustRangeUpdater - { - void operator()(View & value, View & loValue, View & loCount, View & hiValue, View & hiCount) const - { - Simd::BackgroundAdjustRange(loCount, loValue, hiCount, hiValue, 1); - } - }; - - template void Apply(Texture::Features & features, const Updater & updater) - { - for (size_t i = 0; i < features.size(); ++i) - { - Texture::Feature & feature = *features[i]; - for (size_t j = 0; j < feature.value.Size(); ++j) - { - updater(feature.value[j], feature.lo.value[j], feature.lo.count[j], feature.hi.value[j], feature.hi.count[j]); - } - } - } - - void UpdateBackground() - { - SIMD_CHECK_PERFORMANCE(); - - Background & background = _scene.background; - const Stability::State & stability = _scene.stability.state; - const Time & time = _scene.input.timestamp; - switch (background.state) - { - case Background::Update: - switch (stability) - { - case Stability::Stable: - Apply(_scene.texture.features, IncrementCountUpdater()); - ++background.count; - background.incrementCounterTime += time - background.lastFrameTime; - if (background.count >= CHAR_MAX || (background.incrementCounterTime > _options.BackgroundIncrementTime && background.count >= 8)) - { - Apply(_scene.texture.features, AdjustRangeUpdater()); - background.incrementCounterTime = 0; - background.count = 0; - } - break; - case Stability::Sabotage: - background.sabotageCounter++; - if (background.sabotageCounter > _options.BackgroundSabotageCountMax) - InitBackground(); - break; - default: - assert(0); - } - if (stability != Stability::Sabotage) - background.sabotageCounter = 0; - break; - case Background::Grow: - if (stability == Stability::Sabotage) - InitBackground(); - else - { - Apply(_scene.texture.features, GrowRangeUpdater()); - if (stability != Stability::Stable) - background.growEndTime = time + _options.BackgroundGrowTime; - if (background.growEndTime < time) - { - background.state = Background::Update; - background.count = 0; - } - } - break; - case Background::Init: - InitBackground(); - break; - default: - assert(0); - } - background.lastFrameTime = time; - } - - void InitBackground() - { - Background & background = _scene.background; - Apply(_scene.texture.features, InitUpdater()); - background.growEndTime = _scene.input.timestamp + _options.BackgroundGrowTime; - background.state = Background::Grow; - background.count = 0; - background.incrementCounterTime = 0; - } - - void SetMetadata() - { - _scene.metadata->objects.clear(); - AddToMetadata(_scene.tracking.objects); - AddToMetadata(_scene.tracking.justDeletedObjects); - } - - void AddToMetadata(const ObjectPtrs & objects) - { - size_t scale = _scene.model.scale; - for (size_t i = 0; i < objects.size(); ++i) - { - Object & srcObject = *objects[i]; - if 
(srcObject.type == Object::Moving) - { - Motion::Object dstObject; - dstObject.id = srcObject.classificationId; - dstObject.rect = srcObject.rect*scale; - for (size_t j = 0; j < srcObject.trajectory.size(); ++j) - { - ptrdiff_t begin = std::max(0, j - _options.TrackingAveragingHalfRange); - ptrdiff_t end = std::min(srcObject.trajectory.size(), j + _options.TrackingAveragingHalfRange); - Point sum; - for (ptrdiff_t l = begin; l < end; ++l) - sum += srcObject.trajectory[l]->point*scale; - Motion::Position position; - position.time = srcObject.trajectory[j]->time; - position.point = sum / (end - begin); - dstObject.trajectory.push_back(position); - } - _scene.metadata->objects.push_back(dstObject); - } - } - } - - void DebugAnnotation() - { - SIMD_CHECK_PERFORMANCE(); - - Frame * output = _scene.output; - size_t scale = _scene.model.scale; - - if (output && output->format == Frame::Bgr24) - { - View & canvas = output->planes[0]; - - if (_options.DebugDrawBottomRight) - { - View src; - switch (_options.DebugDrawBottomRight) - { - case 1: src = _scene.difference[_options.DebugDrawLevel]; break; - case 2: src = _scene.texture.gray.value[_options.DebugDrawLevel]; break; - case 3: src = _scene.texture.dx.value[_options.DebugDrawLevel]; break; - case 4: src = _scene.texture.dy.value[_options.DebugDrawLevel]; break; - } - Simd::GrayToBgr(src, canvas.Region(src.Size(), View::BottomRight).Ref()); - } - - if (_options.DebugAnnotateModel) - { - Simd::Pixel::Bgr24 color(0, 255, 255); - for (size_t i = 0; i < _scene.model.roi.size(); ++i) - { - Point p0 = i ? _scene.model.roi[i - 1] : _scene.model.roi.back(), p1 = _scene.model.roi[i]; - Simd::DrawLine(canvas, p0*scale, p1*scale, color); - } - Rect objectMin(OnvifToScreenSize(_model.size, _scene.model.originalFrameSize)); - objectMin.Shift(Point(_scene.model.originalFrameSize.x - objectMin.right - 2*scale, scale)); - Simd::DrawRectangle(canvas, objectMin, color); - } - - if (_options.DebugAnnotateMovingRegions) - { - Simd::Pixel::Bgr24 color(0, 255, 0); - for (size_t i = 0; i < _scene.segmentation.movingRegions.size(); ++i) - { - const MovingRegion & region = *_scene.segmentation.movingRegions[i]; - Simd::DrawRectangle(canvas, region.rect*scale, color, 1); - } - } - - if (_options.DebugAnnotateTrackingObjects) - { - Simd::Pixel::Bgr24 color(0, 255, 255); - for (size_t i = 0; i < _scene.tracking.objects.size(); ++i) - { - const Object & object = *_scene.tracking.objects[i]; - Simd::DrawRectangle(canvas, object.rect*scale, color, 1); - _scene.font.Draw(canvas, ToString(object.trackingId), Point(object.rect.Center().x*scale, object.rect.top*scale - _scene.font.Height()), color); - const MovingRegionPtrs & regions = object.trajectory; - for (size_t j = 1; j < regions.size(); ++j) - Simd::DrawLine(canvas, regions[j]->point*scale, regions[j - 1]->point*scale, color, 1); - } - } - } - } - }; - } -} - -#endif//__SimdMotion_hpp__ diff --git a/src/3rd/Simd/Simd/SimdMsa.h b/src/3rd/Simd/Simd/SimdMsa.h deleted file mode 100644 index faf03e43..00000000 --- a/src/3rd/Simd/Simd/SimdMsa.h +++ /dev/null @@ -1,42 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#ifndef __SimdMsa_h__ -#define __SimdMsa_h__ - -#include "Simd/SimdDefs.h" - -namespace Simd -{ -#ifdef SIMD_MSA_ENABLE - namespace Msa - { - void OperationBinary8u(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, - size_t width, size_t height, size_t channelCount, uint8_t * dst, size_t dstStride, SimdOperationBinary8uType type); - - void OperationBinary16i(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, - size_t width, size_t height, uint8_t * dst, size_t dstStride, SimdOperationBinary16iType type); - } -#endif// SIMD_MSA_ENABLE -} -#endif//__SimMsa_h__ diff --git a/src/3rd/Simd/Simd/SimdMsaOperation.cpp b/src/3rd/Simd/Simd/SimdMsaOperation.cpp deleted file mode 100644 index af703739..00000000 --- a/src/3rd/Simd/Simd/SimdMsaOperation.cpp +++ /dev/null @@ -1,197 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdCompare.h" - -namespace Simd -{ -#ifdef SIMD_MSA_ENABLE - namespace Msa - { - template SIMD_INLINE v16u8 OperationBinary8u(const v16u8 & a, const v16u8 & b); - - template <> SIMD_INLINE v16u8 OperationBinary8u(const v16u8 & a, const v16u8 & b) - { - return __msa_aver_u_b(a, b); - } - - template <> SIMD_INLINE v16u8 OperationBinary8u(const v16u8 & a, const v16u8 & b) - { - return __msa_and_v(a, b); - } - - template <> SIMD_INLINE v16u8 OperationBinary8u(const v16u8 & a, const v16u8 & b) - { - return __msa_or_v(a, b); - } - - template <> SIMD_INLINE v16u8 OperationBinary8u(const v16u8 & a, const v16u8 & b) - { - return __msa_max_u_b(a, b); - } - - template <> SIMD_INLINE v16u8 OperationBinary8u(const v16u8 & a, const v16u8 & b) - { - return __msa_min_u_b(a, b); - } - - template <> SIMD_INLINE v16u8 OperationBinary8u(const v16u8 & a, const v16u8 & b) - { - return __msa_subs_u_b(a, b); - } - - template <> SIMD_INLINE v16u8 OperationBinary8u(const v16u8 & a, const v16u8 & b) - { - return __msa_adds_u_b(a, b); - } - - template void OperationBinary8u(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, - size_t width, size_t height, size_t channelCount, uint8_t * dst, size_t dstStride) - { - assert(width*channelCount >= A); - if (align) - assert(Aligned(a) && Aligned(aStride) && Aligned(b) && Aligned(bStride) && Aligned(dst) && Aligned(dstStride)); - - size_t size = channelCount*width; - size_t alignedSize = Simd::AlignLo(size, A); - for (size_t row = 0; row < height; ++row) - { - for (size_t offset = 0; offset < alignedSize; offset += A) - { - const v16u8 a_ = Load(a + offset); - const v16u8 b_ = Load(b + offset); - Store(dst + offset, OperationBinary8u(a_, b_)); - } - if (alignedSize != size) - { - const v16u8 a_ = Load(a + size - A); - const v16u8 b_ = Load(b + size - A); - Store(dst + size - A, OperationBinary8u(a_, b_)); - } - a += aStride; - b += bStride; - dst += dstStride; - } - } - - template void OperationBinary8u(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, - size_t width, size_t height, size_t channelCount, uint8_t * dst, size_t dstStride, SimdOperationBinary8uType type) - { - switch (type) - { - case SimdOperationBinary8uAverage: - return OperationBinary8u(a, aStride, b, bStride, width, height, channelCount, dst, dstStride); - case SimdOperationBinary8uAnd: - return OperationBinary8u(a, aStride, b, bStride, width, height, channelCount, dst, dstStride); - case SimdOperationBinary8uOr: - return OperationBinary8u(a, aStride, b, bStride, width, height, channelCount, dst, dstStride); - case SimdOperationBinary8uMaximum: - return OperationBinary8u(a, aStride, b, bStride, width, height, channelCount, dst, dstStride); - case SimdOperationBinary8uMinimum: - return OperationBinary8u(a, aStride, b, bStride, width, height, channelCount, dst, dstStride); - case SimdOperationBinary8uSaturatedSubtraction: - return OperationBinary8u(a, aStride, b, bStride, width, height, channelCount, dst, dstStride); - case SimdOperationBinary8uSaturatedAddition: - return OperationBinary8u(a, aStride, b, bStride, width, height, channelCount, dst, dstStride); - default: - assert(0); - } - } - - void OperationBinary8u(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, - size_t width, size_t height, size_t channelCount, uint8_t * dst, size_t dstStride, SimdOperationBinary8uType type) - { - if (Aligned(a) && Aligned(aStride) && Aligned(b) && Aligned(bStride) && Aligned(dst) && 
Aligned(dstStride)) - OperationBinary8u(a, aStride, b, bStride, width, height, channelCount, dst, dstStride, type); - else - OperationBinary8u(a, aStride, b, bStride, width, height, channelCount, dst, dstStride, type); - } - - template SIMD_INLINE v8i16 OperationBinary16i(const v8i16 & a, const v8i16 & b); - - template <> SIMD_INLINE v8i16 OperationBinary16i(const v8i16 & a, const v8i16 & b) - { - return __msa_addv_h(a, b); - } - - template <> SIMD_INLINE v8i16 OperationBinary16i(const v8i16 & a, const v8i16 & b) - { - return __msa_subv_h(a, b); - } - - template void OperationBinary16i(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, - size_t width, size_t height, uint8_t * dst, size_t dstStride) - { - assert(width * sizeof(int16_t) >= A); - if (align) - assert(Aligned(a) && Aligned(aStride) && Aligned(b) && Aligned(bStride) && Aligned(dst) && Aligned(dstStride)); - - size_t size = width * sizeof(int16_t); - size_t alignedSize = Simd::AlignLo(size, A); - for (size_t row = 0; row < height; ++row) - { - for (size_t offset = 0; offset < alignedSize; offset += A) - { - const v8i16 a_ = (v8i16)Load(a + offset); - const v8i16 b_ = (v8i16)Load(b + offset); - Store(dst + offset, (v16u8)OperationBinary16i(a_, b_)); - } - if (alignedSize != size) - { - const v8i16 a_ = (v8i16)Load(a + size - A); - const v8i16 b_ = (v8i16)Load(b + size - A); - Store(dst + size - A, (v16u8)OperationBinary16i(a_, b_)); - } - a += aStride; - b += bStride; - dst += dstStride; - } - } - - template void OperationBinary16i(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, - size_t width, size_t height, uint8_t * dst, size_t dstStride, SimdOperationBinary16iType type) - { - switch (type) - { - case SimdOperationBinary16iAddition: - return OperationBinary16i(a, aStride, b, bStride, width, height, dst, dstStride); - case SimdOperationBinary16iSubtraction: - return OperationBinary16i(a, aStride, b, bStride, width, height, dst, dstStride); - default: - assert(0); - } - } - - void OperationBinary16i(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, - size_t width, size_t height, uint8_t * dst, size_t dstStride, SimdOperationBinary16iType type) - { - if (Aligned(a) && Aligned(aStride) && Aligned(b) && Aligned(bStride) && Aligned(dst) && Aligned(dstStride)) - OperationBinary16i(a, aStride, b, bStride, width, height, dst, dstStride, type); - else - OperationBinary16i(a, aStride, b, bStride, width, height, dst, dstStride, type); - } - } -#endif// SIMD_MSA_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdNeon.h b/src/3rd/Simd/Simd/SimdNeon.h deleted file mode 100644 index 1dbc7354..00000000 --- a/src/3rd/Simd/Simd/SimdNeon.h +++ /dev/null @@ -1,641 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar, -* 2018-2018 Radchenko Andrey. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. 
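The MSA kernels deleted above all follow one pattern: a template kernel specialised per operation type, a run-time switch that maps the SimdOperationBinary8uType enum onto the right instantiation, and a row loop that processes full 16-byte vectors and then reprocesses the last 16 bytes of each row with an overlapping load instead of a scalar tail. A minimal portable sketch of that overlapping-tail idiom, in plain C++ with byte-wise saturated addition standing in for __msa_adds_u_b (the helper names here are mine, not from the library):

#include <algorithm>
#include <cstddef>
#include <cstdint>

// Illustrative stand-in for one 16-byte vector operation (saturated add).
static inline void OpBlock16(const uint8_t* a, const uint8_t* b, uint8_t* dst)
{
    for (size_t i = 0; i < 16; ++i)
        dst[i] = (uint8_t)std::min<int>(255, a[i] + b[i]);
}

void OperationRow(const uint8_t* a, const uint8_t* b, uint8_t* dst, size_t size)
{
    size_t aligned = size & ~size_t(15);   // Simd::AlignLo(size, A) with A == 16
    for (size_t offset = 0; offset < aligned; offset += 16)
        OpBlock16(a + offset, b + offset, dst + offset);
    if (aligned != size)                   // overlapping tail: redo the last 16 bytes
        OpBlock16(a + size - 16, b + size - 16, dst + size - 16);
}

Re-running the overlap is safe because every supported operation is a pure per-byte function of the two inputs, so the overlapping stores simply write the same values twice; this is also why the deleted code asserts width*channelCount >= A.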
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#ifndef __SimdNeon_h__ -#define __SimdNeon_h__ - -#include "Simd/SimdDefs.h" - -namespace Simd -{ -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - void AbsDifferenceSum(const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride, - size_t width, size_t height, uint64_t * sum); - - void AbsDifferenceSumMasked(const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride, - const uint8_t *mask, size_t maskStride, uint8_t index, size_t width, size_t height, uint64_t * sum); - - void AbsDifferenceSums3x3(const uint8_t *current, size_t currentStride, const uint8_t * background, size_t backgroundStride, - size_t width, size_t height, uint64_t * sums); - - void AbsDifferenceSums3x3Masked(const uint8_t *current, size_t currentStride, const uint8_t *background, size_t backgroundStride, - const uint8_t *mask, size_t maskStride, uint8_t index, size_t width, size_t height, uint64_t * sums); - - void AbsGradientSaturatedSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride); - - void AddFeatureDifference(const uint8_t * value, size_t valueStride, size_t width, size_t height, - const uint8_t * lo, size_t loStride, const uint8_t * hi, size_t hiStride, - uint16_t weight, uint8_t * difference, size_t differenceStride); - - void AlphaBlending(const uint8_t *src, size_t srcStride, size_t width, size_t height, size_t channelCount, - const uint8_t *alpha, size_t alphaStride, uint8_t *dst, size_t dstStride); - - void AlphaFilling(uint8_t * dst, size_t dstStride, size_t width, size_t height, const uint8_t * channel, size_t channelCount, const uint8_t * alpha, size_t alphaStride); - - void BackgroundGrowRangeSlow(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * lo, size_t loStride, uint8_t * hi, size_t hiStride); - - void BackgroundGrowRangeFast(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * lo, size_t loStride, uint8_t * hi, size_t hiStride); - - void BackgroundIncrementCount(const uint8_t * value, size_t valueStride, size_t width, size_t height, - const uint8_t * loValue, size_t loValueStride, const uint8_t * hiValue, size_t hiValueStride, - uint8_t * loCount, size_t loCountStride, uint8_t * hiCount, size_t hiCountStride); - - void BackgroundAdjustRange(uint8_t * loCount, size_t loCountStride, size_t width, size_t height, - uint8_t * loValue, size_t loValueStride, uint8_t * hiCount, size_t hiCountStride, - uint8_t * hiValue, size_t hiValueStride, uint8_t threshold); - - void BackgroundAdjustRangeMasked(uint8_t * loCount, size_t loCountStride, size_t width, size_t height, - uint8_t * loValue, size_t loValueStride, uint8_t * hiCount, size_t hiCountStride, - uint8_t * hiValue, size_t hiValueStride, uint8_t threshold, const uint8_t * mask, size_t maskStride); - - void BackgroundShiftRange(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * lo, size_t loStride, uint8_t * hi, size_t hiStride); - - void BackgroundShiftRangeMasked(const uint8_t * value, size_t valueStride, 
size_t width, size_t height, - uint8_t * lo, size_t loStride, uint8_t * hi, size_t hiStride, const uint8_t * mask, size_t maskStride); - - void BackgroundInitMask(const uint8_t * src, size_t srcStride, size_t width, size_t height, - uint8_t index, uint8_t value, uint8_t * dst, size_t dstStride); - - void BayerToBgr(const uint8_t * bayer, size_t width, size_t height, size_t bayerStride, SimdPixelFormatType bayerFormat, uint8_t * bgr, size_t bgrStride); - - void BayerToBgra(const uint8_t * bayer, size_t width, size_t height, size_t bayerStride, SimdPixelFormatType bayerFormat, uint8_t * bgra, size_t bgraStride, uint8_t alpha); - - void BgraToBayer(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * bayer, size_t bayerStride, SimdPixelFormatType bayerFormat); - - void BgraToBgr(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * bgr, size_t bgrStride); - - void BgraToGray(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * gray, size_t grayStride); - - void BgraToRgb(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgb, size_t rgbStride); - - void BgraToYuv420p(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * y, size_t yStride, uint8_t * u, size_t uStride, uint8_t * v, size_t vStride); - - void BgraToYuv422p(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * y, size_t yStride, uint8_t * u, size_t uStride, uint8_t * v, size_t vStride); - - void BgraToYuv444p(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * y, size_t yStride, uint8_t * u, size_t uStride, uint8_t * v, size_t vStride); - - void BgraToYuva420p(const uint8_t * bgra, size_t bgraStride, size_t width, size_t height, - uint8_t * y, size_t yStride, uint8_t * u, size_t uStride, uint8_t * v, size_t vStride, uint8_t * a, size_t aStride); - - void BgrToBayer(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bayer, size_t bayerStride, SimdPixelFormatType bayerFormat); - - void BgrToBgra(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha); - - void Bgr48pToBgra32(const uint8_t * blue, size_t blueStride, size_t width, size_t height, - const uint8_t * green, size_t greenStride, const uint8_t * red, size_t redStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha); - - void BgrToGray(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * gray, size_t grayStride); - - void BgrToRgb(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * rgb, size_t rgbStride); - - void BgrToYuv420p(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * y, size_t yStride, uint8_t * u, size_t uStride, uint8_t * v, size_t vStride); - - void BgrToYuv422p(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * y, size_t yStride, uint8_t * u, size_t uStride, uint8_t * v, size_t vStride); - - void BgrToYuv444p(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * y, size_t yStride, uint8_t * u, size_t uStride, uint8_t * v, size_t vStride); - - void Binarization(const uint8_t * src, size_t srcStride, size_t width, size_t height, - uint8_t value, uint8_t positive, uint8_t negative, uint8_t * dst, size_t dstStride, SimdCompareType compareType); - - void AveragingBinarization(const uint8_t * src, size_t srcStride, size_t width, size_t height, - uint8_t value, size_t 
neighborhood, uint8_t threshold, uint8_t positive, uint8_t negative, - uint8_t * dst, size_t dstStride, SimdCompareType compareType); - - void ConditionalCount8u(const uint8_t * src, size_t stride, size_t width, size_t height, - uint8_t value, SimdCompareType compareType, uint32_t * count); - - void ConditionalCount16i(const uint8_t * src, size_t stride, size_t width, size_t height, - int16_t value, SimdCompareType compareType, uint32_t * count); - - void ConditionalSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * mask, size_t maskStride, uint8_t value, SimdCompareType compareType, uint64_t * sum); - - void ConditionalSquareSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * mask, size_t maskStride, uint8_t value, SimdCompareType compareType, uint64_t * sum); - - void ConditionalSquareGradientSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * mask, size_t maskStride, uint8_t value, SimdCompareType compareType, uint64_t * sum); - - void ConditionalFill(const uint8_t * src, size_t srcStride, size_t width, size_t height, - uint8_t threshold, SimdCompareType compareType, uint8_t value, uint8_t * dst, size_t dstStride); - - void DeinterleaveUv(const uint8_t * uv, size_t uvStride, size_t width, size_t height, uint8_t * u, size_t uStride, uint8_t * v, size_t vStride); - - void DeinterleaveBgr(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, - uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride); - - void DeinterleaveBgra(const uint8_t * bgra, size_t bgraStride, size_t width, size_t height, - uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride, uint8_t * a, size_t aStride); - - void DetectionHaarDetect32fp(const void * hid, const uint8_t * mask, size_t maskStride, - ptrdiff_t left, ptrdiff_t top, ptrdiff_t right, ptrdiff_t bottom, uint8_t * dst, size_t dstStride); - - void DetectionHaarDetect32fi(const void * hid, const uint8_t * mask, size_t maskStride, - ptrdiff_t left, ptrdiff_t top, ptrdiff_t right, ptrdiff_t bottom, uint8_t * dst, size_t dstStride); - - void DetectionLbpDetect32fp(const void * hid, const uint8_t * mask, size_t maskStride, - ptrdiff_t left, ptrdiff_t top, ptrdiff_t right, ptrdiff_t bottom, uint8_t * dst, size_t dstStride); - - void DetectionLbpDetect32fi(const void * hid, const uint8_t * mask, size_t maskStride, - ptrdiff_t left, ptrdiff_t top, ptrdiff_t right, ptrdiff_t bottom, uint8_t * dst, size_t dstStride); - - void DetectionLbpDetect16ip(const void * hid, const uint8_t * mask, size_t maskStride, - ptrdiff_t left, ptrdiff_t top, ptrdiff_t right, ptrdiff_t bottom, uint8_t * dst, size_t dstStride); - - void DetectionLbpDetect16ii(const void * hid, const uint8_t * mask, size_t maskStride, - ptrdiff_t left, ptrdiff_t top, ptrdiff_t right, ptrdiff_t bottom, uint8_t * dst, size_t dstStride); - - void EdgeBackgroundGrowRangeSlow(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * background, size_t backgroundStride); - - void EdgeBackgroundGrowRangeFast(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * background, size_t backgroundStride); - - void EdgeBackgroundIncrementCount(const uint8_t * value, size_t valueStride, size_t width, size_t height, - const uint8_t * backgroundValue, size_t backgroundValueStride, uint8_t * backgroundCount, size_t backgroundCountStride); - - void EdgeBackgroundAdjustRange(uint8_t * 
backgroundCount, size_t backgroundCountStride, size_t width, size_t height, - uint8_t * backgroundValue, size_t backgroundValueStride, uint8_t threshold); - - void EdgeBackgroundAdjustRangeMasked(uint8_t * backgroundCount, size_t backgroundCountStride, size_t width, size_t height, - uint8_t * backgroundValue, size_t backgroundValueStride, uint8_t threshold, const uint8_t * mask, size_t maskStride); - - void EdgeBackgroundShiftRangeMasked(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * background, size_t backgroundStride, const uint8_t * mask, size_t maskStride); - - void FillBgr(uint8_t * dst, size_t stride, size_t width, size_t height, uint8_t blue, uint8_t green, uint8_t red); - - void FillBgra(uint8_t * dst, size_t stride, size_t width, size_t height, uint8_t blue, uint8_t green, uint8_t red, uint8_t alpha); - - void Fill32f(float * dst, size_t size, const float * value); - - void FillPixel(uint8_t * dst, size_t stride, size_t width, size_t height, const uint8_t * pixel, size_t pixelSize); - -#ifdef SIMD_NEON_FP16_ENABLE - void Float32ToFloat16(const float * src, size_t size, uint16_t * dst); - - void Float16ToFloat32(const uint16_t * src, size_t size, float * dst); - - void SquaredDifferenceSum16f(const uint16_t * a, const uint16_t * b, size_t size, float * sum); - - void CosineDistance16f(const uint16_t * a, const uint16_t * b, size_t size, float * distance); - - void CosineDistancesMxNa16f(size_t M, size_t N, size_t K, const uint16_t * const * A, const uint16_t * const * B, float * distances); -#endif - - void Float32ToUint8(const float * src, size_t size, const float * lower, const float * upper, uint8_t * dst); - - void Uint8ToFloat32(const uint8_t * src, size_t size, const float * lower, const float * upper, float * dst); - - void CosineDistance32f(const float * a, const float * b, size_t size, float * distance); - - void GaussianBlur3x3(const uint8_t * src, size_t srcStride, size_t width, size_t height, - size_t channelCount, uint8_t * dst, size_t dstStride); - - void Gemm32fNN(size_t M, size_t N, size_t K, const float * alpha, const float * A, size_t lda, const float * B, size_t ldb, const float * beta, float * C, size_t ldc); - - void Gemm32fNT(size_t M, size_t N, size_t K, const float * alpha, const float * A, size_t lda, const float * B, size_t ldb, const float * beta, float * C, size_t ldc); - - void GrayToBgr(const uint8_t *gray, size_t width, size_t height, size_t grayStride, uint8_t *bgr, size_t bgrStride); - - void GrayToBgra(const uint8_t *gray, size_t width, size_t height, size_t grayStride, uint8_t *bgra, size_t bgraStride, uint8_t alpha); - - void HistogramMasked(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * mask, size_t maskStride, uint8_t index, uint32_t * histogram); - - void HistogramConditional(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * mask, size_t maskStride, uint8_t value, SimdCompareType compareType, uint32_t * histogram); - - void HogDeinterleave(const float * src, size_t srcStride, size_t width, size_t height, size_t count, float ** dst, size_t dstStride); - - void HogDirectionHistograms(const uint8_t * src, size_t stride, size_t width, size_t height, - size_t cellX, size_t cellY, size_t quantization, float * histograms); - - void HogExtractFeatures(const uint8_t * src, size_t stride, size_t width, size_t height, float * features); - - void HogFilterSeparable(const float * src, size_t srcStride, size_t width, size_t height, const float * 
rowFilter, size_t rowSize, const float * colFilter, size_t colSize, float * dst, size_t dstStride, int add); - - void HogLiteExtractFeatures(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t cell, float * features, size_t featuresStride); - - void HogLiteFilterFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride); - - void HogLiteResizeFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, float * dst, size_t dstStride, size_t dstWidth, size_t dstHeight); - - void HogLiteCompressFeatures(const float * src, size_t srcStride, size_t width, size_t height, const float * pca, float * dst, size_t dstStride); - - void HogLiteFilterSeparable(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * hFilter, size_t hSize, const float * vFilter, size_t vSize, float * dst, size_t dstStride, int add); - - void HogLiteFindMax7x7(const float * a, size_t aStride, const float * b, size_t bStride, size_t height, float * value, size_t * col, size_t * row); - - void HogLiteCreateMask(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, const float * threshold, size_t scale, size_t size, uint32_t * dst, size_t dstStride); - - void Int16ToGray(const uint8_t * src, size_t width, size_t height, size_t srcStride, uint8_t * dst, size_t dstStride); - - void InterferenceIncrement(uint8_t * statistic, size_t stride, size_t width, size_t height, uint8_t increment, int16_t saturation); - - void InterferenceIncrementMasked(uint8_t * statistic, size_t statisticStride, size_t width, size_t height, - uint8_t increment, int16_t saturation, const uint8_t * mask, size_t maskStride, uint8_t index); - - void InterferenceDecrement(uint8_t * statistic, size_t stride, size_t width, size_t height, uint8_t decrement, int16_t saturation); - - void InterferenceDecrementMasked(uint8_t * statistic, size_t statisticStride, size_t width, size_t height, - uint8_t decrement, int16_t saturation, const uint8_t * mask, size_t maskStride, uint8_t index); - - void InterleaveUv(const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, size_t width, size_t height, uint8_t * uv, size_t uvStride); - - void InterleaveBgr(const uint8_t * b, size_t bStride, const uint8_t * g, size_t gStride, const uint8_t * r, size_t rStride, - size_t width, size_t height, uint8_t * bgr, size_t bgrStride); - - void InterleaveBgra(const uint8_t * b, size_t bStride, const uint8_t * g, size_t gStride, const uint8_t * r, size_t rStride, const uint8_t * a, size_t aStride, - size_t width, size_t height, uint8_t * bgra, size_t bgraStride); - - void AbsSecondDerivativeHistogram(const uint8_t *src, size_t width, size_t height, size_t stride, - size_t step, size_t indent, uint32_t * histogram); - - void Laplace(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride); - - void LaplaceAbs(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride); - - void LaplaceAbsSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum); - - void LbpEstimate(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride); - - void MeanFilter3x3(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t 
channelCount, uint8_t * dst, size_t dstStride); - - void MedianFilterRhomb3x3(const uint8_t * src, size_t srcStride, size_t width, size_t height, - size_t channelCount, uint8_t * dst, size_t dstStride); - - void MedianFilterRhomb5x5(const uint8_t * src, size_t srcStride, size_t width, size_t height, - size_t channelCount, uint8_t * dst, size_t dstStride); - - void MedianFilterSquare3x3(const uint8_t * src, size_t srcStride, size_t width, size_t height, - size_t channelCount, uint8_t * dst, size_t dstStride); - - void MedianFilterSquare5x5(const uint8_t * src, size_t srcStride, size_t width, size_t height, - size_t channelCount, uint8_t * dst, size_t dstStride); - - void NeuralConvolutionForward(const float * src, size_t srcWidth, size_t srcHeight, size_t srcDepth, const float * weight, - size_t kernelX, size_t kernelY, size_t padX, size_t padY, size_t strideX, size_t strideY, size_t dilationX, size_t dilationY, - void * buffer, size_t * size, float * dst, size_t dstWidth, size_t dstHeight, size_t dstDepth, int add); - - void NeuralConvert(const uint8_t * src, size_t srcStride, size_t width, size_t height, float * dst, size_t dstStride, int inversion); - - void NeuralProductSum(const float * a, const float * b, size_t size, float * sum); - - void NeuralAddVectorMultipliedByValue(const float * src, size_t size, const float * value, float * dst); - - void NeuralAddVector(const float * src, size_t size, float * dst); - - void NeuralAddValue(const float * value, float * dst, size_t size); - - void NeuralRoughSigmoid(const float * src, size_t size, const float * slope, float * dst); - - void NeuralRoughSigmoid2(const float * src, size_t size, const float * slope, float * dst); - - void NeuralDerivativeSigmoid(const float * src, size_t size, const float * slope, float * dst); - - void NeuralRoughTanh(const float * src, size_t size, const float * slope, float * dst); - - void NeuralDerivativeTanh(const float * src, size_t size, const float * slope, float * dst); - - void NeuralPow(const float * src, size_t size, const float * exponent, float * dst); - - void NeuralDerivativeRelu(const float * src, size_t size, const float * slope, float * dst); - - void NeuralUpdateWeights(const float * x, size_t size, const float * a, const float * b, float * d, float * w); - - void NeuralAdaptiveGradientUpdate(const float * delta, size_t size, size_t batch, const float * alpha, const float * epsilon, float * gradient, float * weight); - - void NeuralAddConvolution2x2Forward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride); - - void NeuralAddConvolution3x3Forward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride); - - void NeuralAddConvolution4x4Forward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride); - - void NeuralAddConvolution5x5Forward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride); - - void NeuralAddConvolution2x2Backward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride); - - void NeuralAddConvolution3x3Backward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride); - - void NeuralAddConvolution4x4Backward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, 
float * dst, size_t dstStride); - - void NeuralAddConvolution5x5Backward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride); - - void NeuralAddConvolution2x2Sum(const float * src, size_t srcStride, const float * dst, size_t dstStride, size_t width, size_t height, float * sums); - - void NeuralAddConvolution3x3Sum(const float * src, size_t srcStride, const float * dst, size_t dstStride, size_t width, size_t height, float * sums); - - void NeuralAddConvolution4x4Sum(const float * src, size_t srcStride, const float * dst, size_t dstStride, size_t width, size_t height, float * sums); - - void NeuralAddConvolution5x5Sum(const float * src, size_t srcStride, const float * dst, size_t dstStride, size_t width, size_t height, float * sums); - - void NeuralPooling1x1Max3x3(const float * src, size_t srcStride, size_t width, size_t height, float * dst, size_t dstStride); - - void NeuralPooling2x2Max2x2(const float * src, size_t srcStride, size_t width, size_t height, float * dst, size_t dstStride); - - void NeuralPooling2x2Max3x3(const float * src, size_t srcStride, size_t width, size_t height, float * dst, size_t dstStride); - - void OperationBinary8u(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, - size_t width, size_t height, size_t channelCount, uint8_t * dst, size_t dstStride, SimdOperationBinary8uType type); - - void OperationBinary16i(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, - size_t width, size_t height, uint8_t * dst, size_t dstStride, SimdOperationBinary16iType type); - - void VectorProduct(const uint8_t * vertical, const uint8_t * horizontal, uint8_t * dst, size_t stride, size_t width, size_t height); - - void ReduceColor2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount); - - void ReduceGray2x2(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride); - - void ReduceGray3x3(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride, int compensation); - - void ReduceGray4x4(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride); - - void ReduceGray5x5(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride, int compensation); - - void Reorder16bit(const uint8_t * src, size_t size, uint8_t * dst); - - void Reorder32bit(const uint8_t * src, size_t size, uint8_t * dst); - - void Reorder64bit(const uint8_t * src, size_t size, uint8_t * dst); - - void ResizeBilinear(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount); - - void RgbToBgra(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha); - - void RgbToGray(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* gray, size_t grayStride); - - void SegmentationChangeIndex(uint8_t * mask, size_t stride, size_t width, size_t height, uint8_t oldIndex, uint8_t newIndex); - - void SegmentationFillSingleHoles(uint8_t * mask, size_t stride, size_t width, size_t height, uint8_t index); - - void 
SegmentationPropagate2x2(const uint8_t * parent, size_t parentStride, size_t width, size_t height, - uint8_t * child, size_t childStride, const uint8_t * difference, size_t differenceStride, - uint8_t currentIndex, uint8_t invalidIndex, uint8_t emptyIndex, uint8_t differenceThreshold); - - void SegmentationShrinkRegion(const uint8_t * mask, size_t stride, size_t width, size_t height, uint8_t index, - ptrdiff_t * left, ptrdiff_t * top, ptrdiff_t * right, ptrdiff_t * bottom); - - void ShiftBilinear(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t channelCount, - const uint8_t * bkg, size_t bkgStride, const double * shiftX, const double * shiftY, - size_t cropLeft, size_t cropTop, size_t cropRight, size_t cropBottom, uint8_t * dst, size_t dstStride); - - void SobelDx(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride); - - void SobelDxAbs(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride); - - void SobelDxAbsSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum); - - void SobelDy(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride); - - void SobelDyAbs(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride); - - void SobelDyAbsSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum); - - void ContourMetrics(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride); - - void ContourMetricsMasked(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * mask, size_t maskStride, uint8_t indexMin, uint8_t * dst, size_t dstStride); - - void ContourAnchors(const uint8_t * src, size_t srcStride, size_t width, size_t height, - size_t step, int16_t threshold, uint8_t * dst, size_t dstStride); - - void SquaredDifferenceSum(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, - size_t width, size_t height, uint64_t * sum); - - void SquaredDifferenceSumMasked(const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride, - const uint8_t *mask, size_t maskStride, uint8_t index, size_t width, size_t height, uint64_t * sum); - - void SquaredDifferenceSum32f(const float * a, const float * b, size_t size, float * sum); - - void SquaredDifferenceKahanSum32f(const float * a, const float * b, size_t size, float * sum); - - void GetStatistic(const uint8_t * src, size_t stride, size_t width, size_t height, - uint8_t * min, uint8_t * max, uint8_t * average); - - void GetMoments(const uint8_t * mask, size_t stride, size_t width, size_t height, uint8_t index, - uint64_t * area, uint64_t * x, uint64_t * y, uint64_t * xx, uint64_t * xy, uint64_t * yy); - - void GetObjectMoments(const uint8_t* src, size_t srcStride, size_t width, size_t height, const uint8_t* mask, size_t maskStride, uint8_t index, - uint64_t* n, uint64_t* s, uint64_t* sx, uint64_t* sy, uint64_t* sxx, uint64_t* sxy, uint64_t* syy); - - void GetRowSums(const uint8_t * src, size_t stride, size_t width, size_t height, uint32_t * sums); - - void GetColSums(const uint8_t * src, size_t stride, size_t width, size_t height, uint32_t * sums); - - void GetAbsDyRowSums(const uint8_t * src, size_t stride, size_t width, size_t height, uint32_t * sums); - - void GetAbsDxColSums(const uint8_t * src, size_t stride, size_t width, size_t height, uint32_t * sums); - - void ValueSum(const uint8_t * src, size_t 
stride, size_t width, size_t height, uint64_t * sum); - - void SquareSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum); - - void ValueSquareSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * valueSum, uint64_t * squareSum); - - void CorrelationSum(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, size_t width, size_t height, uint64_t * sum); - - void StretchGray2x2(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride); - - void SvmSumLinear(const float * x, const float * svs, const float * weights, size_t length, size_t count, float * sum); - - void SynetAddBias(const float * bias, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format); - - void SynetConvert32fTo8u(const float* src, size_t batch, size_t channels, size_t height, size_t width, SimdTensorFormatType format, const float* scale, const float* shift, uint8_t* dst, SimdSynetCompatibilityType compatibility); - - void SynetEltwiseLayerForward(float const * const * src, const float * weight, size_t count, size_t size, SimdSynetEltwiseOperationType type, float * dst); - - void SynetElu32f(const float * src, size_t size, const float * alpha, float * dst); - - void SynetFusedLayerForward0(const float * src, const float * bias, const float * scale, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format); - - void SynetFusedLayerForward1(const float * src, const float * bias0, const float * scale1, const float * bias1, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format); - - void SynetFusedLayerForward2(const float * src, const float * scale, const float * bias, size_t channels, size_t spatial, const float * slope, float * dst, SimdTensorFormatType format); - - void SynetFusedLayerForward3(const float * src, const float * bias, const float * scale, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format); - - void SynetFusedLayerForward4(const float * src, const float * bias0, const float * scale1, const float * bias1, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format); - - void SynetFusedLayerForward8(const float * src0, const float * src1, const float * src2, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format); - - void SynetFusedLayerForward9(const float * src0, const float * src1, const float * scale, const float * bias, size_t channels0, size_t channels1, size_t spatial, float * dst0, float * dst1, SimdTensorFormatType format); - - void SynetHswish32f(const float * src, size_t size, const float * shift, const float * scale, float * dst); - - void SynetInnerProductLayerForward(const float * src, const float * weight, const float * bias, size_t count, size_t size, float * dst); - - void SynetLrnLayerCrossChannels(const float * src, size_t half, size_t channels, size_t spatial, const float * k, float * dst, SimdTensorFormatType format); - - void SynetPoolingForwardAverage(const float* src, size_t srcC, size_t srcH, size_t srcW, size_t kernelY, size_t kernelX, - size_t strideY, size_t strideX, size_t padY, size_t padX, float* dst, size_t dstH, size_t dstW, SimdBool excludePad, SimdTensorFormatType format); - - void SynetPoolingForwardMax32f(const float * src, size_t srcC, size_t srcH, size_t srcW, size_t kernelY, size_t kernelX, - size_t strideY, size_t strideX, size_t padY, size_t padX, float * dst, size_t dstH, size_t dstW, 
SimdTensorFormatType format); - - void SynetPoolingForwardMax8u(const uint8_t* src, size_t srcC, size_t srcH, size_t srcW, size_t kernelY, size_t kernelX, - size_t strideY, size_t strideX, size_t padY, size_t padX, uint8_t* dst, size_t dstH, size_t dstW, SimdTensorFormatType format); - - void SynetPreluLayerForward(const float * src, const float * slope, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format); - - void SynetRelu32f(const float* src, size_t size, const float* slope, float* dst); - - void SynetReorderImage(size_t batch, size_t channels, size_t spatial, const float* src, SimdTensorFormatType srcFormat, float* dst, SimdTensorFormatType dstFormat); - - void SynetReorderFilter(size_t output, size_t input, size_t kernel, const float* src, SimdTensorFormatType srcFormat, float* dst, SimdTensorFormatType dstFormat); - - void SynetRestrictRange32f(const float * src, size_t size, const float * lower, const float * upper, float * dst); - - void SynetScaleLayerForward(const float* src, const float* scale, const float* bias, size_t channels, size_t height, size_t width, float* dst, SimdTensorFormatType format, SimdSynetCompatibilityType compatibility); - - void SynetSetInput(const uint8_t * src, size_t width, size_t height, size_t stride, SimdPixelFormatType srcFormat, - const float * lower, const float * upper, float * dst, size_t channels, SimdTensorFormatType dstFormat); - - void SynetShuffleLayerForward(const float* src0, const float* src1, size_t channels0, size_t channels1, size_t spatial, float* dst0, float* dst1, SimdTensorFormatType format, int type); - - void SynetSigmoid32f(const float* src, size_t size, const float* slope, float* dst); - - void SynetSoftmaxLayerForward(const float * src, size_t outer, size_t size, size_t inner, float * dst); - - void SynetSoftplus32f(const float* src, size_t size, const float* beta, const float* threshold, float* dst); - - void SynetTanh32f(const float* src, size_t size, const float* slope, float* dst); - - void SynetUnaryOperation32fLayerForward(const float* src, size_t size, SimdSynetUnaryOperation32fType type, float* dst); - - void TextureBoostedSaturatedGradient(const uint8_t * src, size_t srcStride, size_t width, size_t height, - uint8_t saturation, uint8_t boost, uint8_t * dx, size_t dxStride, uint8_t * dy, size_t dyStride); - - void TextureBoostedUv(const uint8_t * src, size_t srcStride, size_t width, size_t height, - uint8_t boost, uint8_t * dst, size_t dstStride); - - void TextureGetDifferenceSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * lo, size_t loStride, const uint8_t * hi, size_t hiStride, int64_t * sum); - - void TexturePerformCompensation(const uint8_t * src, size_t srcStride, size_t width, size_t height, - int shift, uint8_t * dst, size_t dstStride); - - void TransformImage(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t pixelSize, SimdTransformType transform, uint8_t * dst, size_t dstStride); - - void WinogradKernel1x3Block1x4SetFilter(const float* src, size_t size, float* dst, SimdBool trans); - - void WinogradKernel1x3Block1x4SetInput(const float* src, size_t srcChannels, size_t srcHeight, size_t srcWidth, - size_t padY, size_t padX, size_t padH, size_t padW, float* dst, size_t dstStride, SimdBool trans); - - void WinogradKernel1x3Block1x4SetOutput(const float* src, size_t srcStride, float* dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans); - - void WinogradKernel1x5Block1x4SetFilter(const float* src, size_t 
size, float* dst, SimdBool trans); - - void WinogradKernel1x5Block1x4SetInput(const float* src, size_t srcChannels, size_t srcHeight, size_t srcWidth, - size_t padY, size_t padX, size_t padH, size_t padW, float* dst, size_t dstStride, SimdBool trans); - - void WinogradKernel1x5Block1x4SetOutput(const float* src, size_t srcStride, float* dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans); - - void WinogradKernel2x2Block2x2SetFilter(const float* src, size_t size, float* dst, SimdBool trans); - - void WinogradKernel2x2Block2x2SetInput(const float* src, size_t srcChannels, size_t srcHeight, size_t srcWidth, - size_t padY, size_t padX, size_t padH, size_t padW, float* dst, size_t dstStride, SimdBool trans); - - void WinogradKernel2x2Block2x2SetOutput(const float* src, size_t srcStride, float* dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans); - - void WinogradKernel2x2Block4x4SetFilter(const float* src, size_t size, float* dst, SimdBool trans); - - void WinogradKernel2x2Block4x4SetInput(const float* src, size_t srcChannels, size_t srcHeight, size_t srcWidth, - size_t padY, size_t padX, size_t padH, size_t padW, float* dst, size_t dstStride, SimdBool trans); - - void WinogradKernel2x2Block4x4SetOutput(const float* src, size_t srcStride, float* dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans); - - void WinogradKernel3x3Block2x2SetFilter(const float * src, size_t size, float * dst, SimdBool trans); - - void WinogradKernel3x3Block2x2SetInput(const float* src, size_t srcChannels, size_t srcHeight, size_t srcWidth, - size_t padY, size_t padX, size_t padH, size_t padW, float* dst, size_t dstStride, SimdBool trans); - - void WinogradKernel3x3Block2x2SetOutput(const float * src, size_t srcStride, float * dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans); - - void WinogradKernel3x3Block3x3SetFilter(const float * src, size_t size, float * dst, SimdBool trans); - - void WinogradKernel3x3Block3x3SetInput(const float* src, size_t srcChannels, size_t srcHeight, size_t srcWidth, - size_t padY, size_t padX, size_t padH, size_t padW, float* dst, size_t dstStride, SimdBool trans); - - void WinogradKernel3x3Block3x3SetOutput(const float * src, size_t srcStride, float * dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans); - - void WinogradKernel3x3Block4x4SetFilter(const float * src, size_t size, float * dst, SimdBool trans); - - void WinogradKernel3x3Block4x4SetInput(const float* src, size_t srcChannels, size_t srcHeight, size_t srcWidth, - size_t padY, size_t padX, size_t padH, size_t padW, float* dst, size_t dstStride, SimdBool trans); - - void WinogradKernel3x3Block4x4SetOutput(const float * src, size_t srcStride, float * dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans); - - void Yuva420pToBgra(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - const uint8_t * a, size_t aStride, size_t width, size_t height, uint8_t * bgra, size_t bgraStride); - - void Yuv420pToBgr(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * bgr, size_t bgrStride); - - void Yuv422pToBgr(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * bgr, size_t bgrStride); - - void Yuv444pToBgr(const uint8_t * y, size_t yStride, const uint8_t * u, size_t 
uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * bgr, size_t bgrStride); - - void Yuv420pToBgra(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * bgra, size_t bgraStride, uint8_t alpha); - - void Yuv422pToBgra(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * bgra, size_t bgraStride, uint8_t alpha); - - void Yuv444pToBgra(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * bgra, size_t bgraStride, uint8_t alpha); - - void Yuv420pToHue(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * hue, size_t hueStride); - - void Yuv444pToHue(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * hue, size_t hueStride); - - void Yuv420pToRgb(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride, - size_t width, size_t height, uint8_t* rgb, size_t rgbStride); - - void Yuv422pToRgb(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride, - size_t width, size_t height, uint8_t* rgb, size_t rgbStride); - - void Yuv444pToRgb(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride, - size_t width, size_t height, uint8_t* rgb, size_t rgbStride); - } -#endif// SIMD_NEON_ENABLE -} -#endif//__SimdNeon_h__ diff --git a/src/3rd/Simd/Simd/SimdNeonAbsDifferenceSum.cpp b/src/3rd/Simd/Simd/SimdNeonAbsDifferenceSum.cpp deleted file mode 100644 index d79b1d69..00000000 --- a/src/3rd/Simd/Simd/SimdNeonAbsDifferenceSum.cpp +++ /dev/null @@ -1,322 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
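The SimdNeon.h header deleted above is the NEON backend's entire dispatch surface: one flat namespace of free functions, one per public library primitive, compiled only when SIMD_NEON_ENABLE is defined. A hedged usage sketch of how a caller might select it, reflecting the state before this deletion (the scalar fallback and its BT.601-style integer weights are illustrative assumptions of mine, not taken from the header):

#include <cstdint>
#include "Simd/SimdNeon.h"

void ToGray(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride,
            uint8_t* gray, size_t grayStride)
{
#ifdef SIMD_NEON_ENABLE
    // NEON path: strides are in bytes, signature mirrors the other backends.
    Simd::Neon::BgraToGray(bgra, width, height, bgraStride, gray, grayStride);
#else
    // Scalar fallback with BT.601-style weights (29 + 150 + 77 == 256).
    for (size_t row = 0; row < height; ++row)
        for (size_t col = 0; col < width; ++col)
        {
            const uint8_t* p = bgra + row * bgraStride + 4 * col; // B, G, R, A
            gray[row * grayStride + col] =
                (uint8_t)((p[0] * 29 + p[1] * 150 + p[2] * 77 + 128) >> 8);
        }
#endif
}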
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdExtract.h" -#include "Simd/SimdLoad.h" - -namespace Simd -{ -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - template void AbsDifferenceSum(const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride, size_t width, size_t height, uint64_t * sum) - { - assert(width >= A); - if (align) - assert(Aligned(a) && Aligned(aStride) && Aligned(b) && Aligned(bStride)); - - size_t alignedWidth = Simd::AlignLo(width, A); - uint8x16_t tailMask = ShiftLeft(K8_FF, A - width + alignedWidth); - size_t blockSize = A << 8; - size_t blockCount = (alignedWidth >> 8) + 1; - - uint64x2_t _sum = K64_0000000000000000; - for (size_t row = 0; row < height; ++row) - { - uint32x4_t rowSum = K32_00000000; - for (size_t block = 0; block < blockCount; ++block) - { - uint16x8_t blockSum = K16_0000; - for (size_t col = block*blockSize, end = Min(col + blockSize, alignedWidth); col < end; col += A) - { - const uint8x16_t ad = vabdq_u8(Load(a + col), Load(b + col)); - blockSum = vaddq_u16(blockSum, vpaddlq_u8(ad)); - } - rowSum = vaddq_u32(rowSum, vpaddlq_u16(blockSum)); - } - if (alignedWidth != width) - { - const uint8x16_t ad = vabdq_u8(Load(a + width - A), Load(b + width - A)); - rowSum = vaddq_u32(rowSum, vpaddlq_u16(vpaddlq_u8(vandq_u8(tailMask, ad)))); - } - _sum = vaddq_u64(_sum, vpaddlq_u32(rowSum)); - a += aStride; - b += bStride; - } - *sum = ExtractSum64u(_sum); - } - - void AbsDifferenceSum(const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride, size_t width, size_t height, uint64_t * sum) - { - if (Aligned(a) && Aligned(aStride) && Aligned(b) && Aligned(bStride)) - AbsDifferenceSum(a, aStride, b, bStride, width, height, sum); - else - AbsDifferenceSum(a, aStride, b, bStride, width, height, sum); - } - - template void AbsDifferenceSumMasked(const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride, - const uint8_t *mask, size_t maskStride, uint8_t index, size_t width, size_t height, uint64_t * sum) - { - assert(width >= A); - if (align) - { - assert(Aligned(a) && Aligned(aStride) && Aligned(b) && Aligned(bStride)); - assert(Aligned(mask) && Aligned(maskStride)); - } - - size_t alignedWidth = Simd::AlignLo(width, A); - uint8x16_t tailMask = ShiftLeft(K8_FF, A - width + alignedWidth); - size_t blockSize = A << 8; - size_t blockCount = (alignedWidth >> 8) + 1; - - uint8x16_t _index = vdupq_n_u8(index); - uint64x2_t _sum = K64_0000000000000000; - - for (size_t row = 0; row < height; ++row) - { - uint32x4_t rowSum = K32_00000000; - for (size_t block = 0; block < blockCount; ++block) - { - uint16x8_t blockSum = K16_0000; - for (size_t col = block*blockSize, end = Min(col + blockSize, alignedWidth); col < end; col += A) - { - const uint8x16_t ad = vabdq_u8(Load(a + col), Load(b + col)); - const uint8x16_t _mask = vceqq_u8(Load(mask + col), _index); - blockSum = vaddq_u16(blockSum, vpaddlq_u8(vandq_u8(_mask, ad))); - } - rowSum = vaddq_u32(rowSum, vpaddlq_u16(blockSum)); - } - if (alignedWidth != width) - { - size_t col = width - A; - const uint8x16_t ad = vabdq_u8(Load(a + col), Load(b + col)); - const uint8x16_t _mask = vandq_u8(vceqq_u8(Load(mask + col), _index), tailMask); - rowSum = vaddq_u32(rowSum, vpaddlq_u16(vpaddlq_u8(vandq_u8(_mask, ad)))); - } - _sum = vaddq_u64(_sum, vpaddlq_u32(rowSum)); - a += aStride; - b += bStride; - mask += maskStride; - } - *sum = ExtractSum64u(_sum); - } - - void AbsDifferenceSumMasked(const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride, - const uint8_t *mask, size_t maskStride, uint8_t 
index, size_t width, size_t height, uint64_t * sum) - { - if (Aligned(a) && Aligned(aStride) && Aligned(b) && Aligned(bStride) && Aligned(mask) && Aligned(maskStride)) - AbsDifferenceSumMasked(a, aStride, b, bStride, mask, maskStride, index, width, height, sum); - else - AbsDifferenceSumMasked(a, aStride, b, bStride, mask, maskStride, index, width, height, sum); - } - - template void AbsDifferenceSums3(uint8x16_t current, const uint8_t * background, uint16x8_t sums[3]) - { - sums[0] = vaddq_u16(sums[0], vpaddlq_u8(vabdq_u8(current, Load(background - 1)))); - sums[1] = vaddq_u16(sums[1], vpaddlq_u8(vabdq_u8(current, Load(background)))); - sums[2] = vaddq_u16(sums[2], vpaddlq_u8(vabdq_u8(current, Load(background + 1)))); - } - - template void AbsDifferenceSums3x3(uint8x16_t current, const uint8_t * background, size_t stride, uint16x8_t sums[9]) - { - AbsDifferenceSums3(current, background - stride, sums + 0); - AbsDifferenceSums3(current, background, sums + 3); - AbsDifferenceSums3(current, background + stride, sums + 6); - } - - template void AbsDifferenceSums3Masked(uint8x16_t current, const uint8_t * background, uint8x16_t mask, uint32x4_t sums[3]) - { - sums[0] = vaddq_u32(sums[0], vpaddlq_u16(vpaddlq_u8(vabdq_u8(current, vandq_u8(mask, Load(background - 1)))))); - sums[1] = vaddq_u32(sums[1], vpaddlq_u16(vpaddlq_u8(vabdq_u8(current, vandq_u8(mask, Load(background)))))); - sums[2] = vaddq_u32(sums[2], vpaddlq_u16(vpaddlq_u8(vabdq_u8(current, vandq_u8(mask, Load(background + 1)))))); - } - - template void AbsDifferenceSums3x3Masked(uint8x16_t current, const uint8_t * background, size_t stride, uint8x16_t mask, uint32x4_t sums[9]) - { - AbsDifferenceSums3Masked(current, background - stride, mask, sums + 0); - AbsDifferenceSums3Masked(current, background, mask, sums + 3); - AbsDifferenceSums3Masked(current, background + stride, mask, sums + 6); - } - - template void AbsDifferenceSums3x3(const uint8_t * current, size_t currentStride, - const uint8_t * background, size_t backgroundStride, size_t width, size_t height, uint64_t * sums) - { - assert(height > 2 && width >= A + 2); - if (align) - assert(Aligned(background) && Aligned(backgroundStride)); - - width -= 2; - height -= 2; - current += 1 + currentStride; - background += 1 + backgroundStride; - - size_t alignedWidth = Simd::AlignLo(width, A); - uint8x16_t tailMask = ShiftLeft(K8_FF, A - width + alignedWidth); - size_t blockSize = A << 8; - size_t blockCount = (alignedWidth >> 8) + 1; - - uint64x2_t _sums[9]; - for (size_t i = 0; i < 9; ++i) - _sums[i] = K64_0000000000000000; - - for (size_t row = 0; row < height; ++row) - { - uint32x4_t rowSums[9]; - for (size_t i = 0; i < 9; ++i) - rowSums[i] = K32_00000000; - - for (size_t block = 0; block < blockCount; ++block) - { - uint16x8_t blockSums[9]; - for (size_t i = 0; i < 9; ++i) - blockSums[i] = K16_0000; - - for (size_t col = block*blockSize, end = Min(col + blockSize, alignedWidth); col < end; col += A) - { - const uint8x16_t _current = Load(current + col); - AbsDifferenceSums3x3(_current, background + col, backgroundStride, blockSums); - } - - for (size_t i = 0; i < 9; ++i) - rowSums[i] = vaddq_u32(rowSums[i], vpaddlq_u16(blockSums[i])); - } - - if (alignedWidth != width) - { - const uint8x16_t _current = vandq_u8(tailMask, Load(current + width - A)); - AbsDifferenceSums3x3Masked(_current, background + width - A, backgroundStride, tailMask, rowSums); - } - - for (size_t i = 0; i < 9; ++i) - _sums[i] = vaddq_u64(_sums[i], vpaddlq_u32(rowSums[i])); - - current += currentStride; - 
background += backgroundStride; - } - - for (size_t i = 0; i < 9; ++i) - sums[i] = ExtractSum64u(_sums[i]); - } - - void AbsDifferenceSums3x3(const uint8_t * current, size_t currentStride, const uint8_t * background, size_t backgroundStride, - size_t width, size_t height, uint64_t * sums) - { - if (Aligned(background) && Aligned(backgroundStride)) - AbsDifferenceSums3x3(current, currentStride, background, backgroundStride, width, height, sums); - else - AbsDifferenceSums3x3(current, currentStride, background, backgroundStride, width, height, sums); - } - - template void AbsDifferenceSums3Masked(uint8x16_t current, const uint8_t * background, uint8x16_t mask, uint16x8_t sums[3]) - { - sums[0] = vaddq_u16(sums[0], vpaddlq_u8(vabdq_u8(current, vandq_u8(mask, Load(background - 1))))); - sums[1] = vaddq_u16(sums[1], vpaddlq_u8(vabdq_u8(current, vandq_u8(mask, Load(background))))); - sums[2] = vaddq_u16(sums[2], vpaddlq_u8(vabdq_u8(current, vandq_u8(mask, Load(background + 1))))); - } - - template void AbsDifferenceSums3x3Masked(uint8x16_t current, const uint8_t * background, size_t stride, uint8x16_t mask, uint16x8_t sums[9]) - { - AbsDifferenceSums3Masked(current, background - stride, mask, sums + 0); - AbsDifferenceSums3Masked(current, background, mask, sums + 3); - AbsDifferenceSums3Masked(current, background + stride, mask, sums + 6); - } - - template void AbsDifferenceSums3x3Masked(const uint8_t *current, size_t currentStride, const uint8_t *background, size_t backgroundStride, - const uint8_t *mask, size_t maskStride, uint8_t index, size_t width, size_t height, uint64_t * sums) - { - assert(height > 2 && width >= A + 2); - if (align) - assert(Aligned(background) && Aligned(backgroundStride)); - - width -= 2; - height -= 2; - current += 1 + currentStride; - background += 1 + backgroundStride; - mask += 1 + maskStride; - - size_t alignedWidth = Simd::AlignLo(width, A); - uint8x16_t tailMask = ShiftLeft(K8_FF, A - width + alignedWidth); - size_t blockSize = A << 8; - size_t blockCount = (alignedWidth >> 8) + 1; - - uint8x16_t _index = vdupq_n_u8(index); - - uint64x2_t _sums[9]; - for (size_t i = 0; i < 9; ++i) - _sums[i] = K64_0000000000000000; - - for (size_t row = 0; row < height; ++row) - { - uint32x4_t rowSums[9]; - for (size_t i = 0; i < 9; ++i) - rowSums[i] = K32_00000000; - - for (size_t block = 0; block < blockCount; ++block) - { - uint16x8_t blockSums[9]; - for (size_t i = 0; i < 9; ++i) - blockSums[i] = K16_0000; - - for (size_t col = block*blockSize, end = Min(col + blockSize, alignedWidth); col < end; col += A) - { - const uint8x16_t _mask = vceqq_u8(Load(mask + col), _index); - const uint8x16_t _current = vandq_u8(Load(current + col), _mask); - AbsDifferenceSums3x3Masked(_current, background + col, backgroundStride, _mask, blockSums); - } - - for (size_t i = 0; i < 9; ++i) - rowSums[i] = vaddq_u32(rowSums[i], vpaddlq_u16(blockSums[i])); - } - - if (alignedWidth != width) - { - size_t col = width - A; - const uint8x16_t _mask = vandq_u8(tailMask, vceqq_u8(Load(mask + col), _index)); - const uint8x16_t _current = vandq_u8(_mask, Load(current + col)); - AbsDifferenceSums3x3Masked(_current, background + col, backgroundStride, _mask, rowSums); - } - - for (size_t i = 0; i < 9; ++i) - _sums[i] = vaddq_u64(_sums[i], vpaddlq_u32(rowSums[i])); - - current += currentStride; - background += backgroundStride; - mask += maskStride; - } - - for (size_t i = 0; i < 9; ++i) - sums[i] = ExtractSum64u(_sums[i]); - } - - void AbsDifferenceSums3x3Masked(const uint8_t *current, size_t currentStride, 
const uint8_t *background, size_t backgroundStride, - const uint8_t *mask, size_t maskStride, uint8_t index, size_t width, size_t height, uint64_t * sums) - { - if (Aligned(background) && Aligned(backgroundStride)) - AbsDifferenceSums3x3Masked(current, currentStride, background, backgroundStride, mask, maskStride, index, width, height, sums); - else - AbsDifferenceSums3x3Masked(current, currentStride, background, backgroundStride, mask, maskStride, index, width, height, sums); - } - } -#endif// SIMD_NEON_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdNeonAbsGradientSaturatedSum.cpp b/src/3rd/Simd/Simd/SimdNeonAbsGradientSaturatedSum.cpp deleted file mode 100644 index 8ccded41..00000000 --- a/src/3rd/Simd/Simd/SimdNeonAbsGradientSaturatedSum.cpp +++ /dev/null @@ -1,70 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
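The accumulation scheme in the SimdNeonAbsDifferenceSum.cpp code above deserves a note: per-vector absolute differences are widened u8 to u16 with vpaddlq_u8 inside a bounded block, then u16 to u32 and u32 to u64, so no lane can wrap; the AbsDifferenceSums3x3 variants run the same chain for all nine one-pixel shifts of the background in a single pass. A condensed sketch of the widening chain, assuming size is a multiple of 16 and capping blocks at 128 vectors so a 16-bit lane never exceeds 128 * 510 (the deleted code instead uses 256-vector blocks plus a masked overlapping tail):

#include <algorithm>
#include <arm_neon.h>
#include <cstddef>
#include <cstdint>

// Sum of |a[i] - b[i]| over one row; size must be a multiple of 16 here.
uint64_t RowAbsDiffSum(const uint8_t* a, const uint8_t* b, size_t size)
{
    uint64x2_t sum = vdupq_n_u64(0);
    size_t offset = 0;
    while (offset < size)
    {
        size_t end = std::min(offset + 128 * 16, size);
        uint16x8_t block = vdupq_n_u16(0);  // <= 128 iterations keeps u16 lanes safe
        for (; offset < end; offset += 16)
        {
            uint8x16_t ad = vabdq_u8(vld1q_u8(a + offset), vld1q_u8(b + offset));
            block = vaddq_u16(block, vpaddlq_u8(ad));          // u8 pairs -> u16
        }
        sum = vaddq_u64(sum, vpaddlq_u32(vpaddlq_u16(block))); // u16 -> u32 -> u64
    }
    return vgetq_lane_u64(sum, 0) + vgetq_lane_u64(sum, 1);
}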
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" - -namespace Simd -{ -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - template SIMD_INLINE uint8x16_t AbsGradientSaturatedSum(const uint8_t * src, size_t stride) - { - const uint8x16_t dx = vabdq_u8(Load(src + 1), Load(src - 1)); - const uint8x16_t dy = vabdq_u8(Load(src + stride), Load(src - stride)); - return vqaddq_u8(dx, dy); - } - - template void AbsGradientSaturatedSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride) - { - size_t alignedWidth = AlignLo(width, A); - memset(dst, 0, width); - src += srcStride; - dst += dstStride; - for (size_t row = 2; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - Store(dst + col, AbsGradientSaturatedSum(src + col, srcStride)); - if (width != alignedWidth) - Store(dst + width - A, AbsGradientSaturatedSum(src + width - A, srcStride)); - - dst[0] = 0; - dst[width - 1] = 0; - - src += srcStride; - dst += dstStride; - } - memset(dst, 0, width); - } - - void AbsGradientSaturatedSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride)) - AbsGradientSaturatedSum(src, srcStride, width, height, dst, dstStride); - else - AbsGradientSaturatedSum(src, srcStride, width, height, dst, dstStride); - } - } -#endif// SIMD_NEON_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdNeonAddFeatureDifference.cpp b/src/3rd/Simd/Simd/SimdNeonAddFeatureDifference.cpp deleted file mode 100644 index 6dffabb8..00000000 --- a/src/3rd/Simd/Simd/SimdNeonAddFeatureDifference.cpp +++ /dev/null @@ -1,109 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
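[Annotation, not part of the diff.] SimdNeonAddFeatureDifference.cpp, whose body follows, accumulates into a difference image how far each pixel falls outside its background range [lo, hi], squared and scaled by a 16.16 fixed-point weight. A scalar sketch of the same arithmetic (my annotation; the NEON code additionally applies a tail mask for the last partial vector):

#include <cstdint>
#include <algorithm>

void AddFeatureDifferenceScalar(const uint8_t* value, const uint8_t* lo,
    const uint8_t* hi, uint16_t weight, uint8_t* difference, size_t size)
{
    for (size_t i = 0; i < size; ++i)
    {
        // FeatureDifference: saturating distance outside [lo, hi].
        int fd = std::max(std::max((int)value[i] - hi[i], 0),
                          std::max((int)lo[i] - value[i], 0));
        // ShiftedWeightedSquare: (fd^2 * weight) in 16.16 fixed point.
        uint32_t inc = ((uint32_t)(fd * fd) * weight) >> 16;
        difference[i] = (uint8_t)std::min<uint32_t>(difference[i] + inc, 255);
    }
}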
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" - -#include "Simd/SimdLog.h" - -namespace Simd -{ -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - SIMD_INLINE uint8x16_t FeatureDifference(uint8x16_t value, uint8x16_t lo, uint8x16_t hi) - { - return vmaxq_u8(vqsubq_u8(value, hi), vqsubq_u8(lo, value)); - } - - SIMD_INLINE uint16x8_t ShiftedWeightedSquare(uint8x8_t difference, uint16x4_t weight) - { - uint16x8_t square = vmull_u8(difference, difference); - uint16x4_t lo = vshrn_n_u32(vmull_u16(Half<0>(square), weight), 16); - uint16x4_t hi = vshrn_n_u32(vmull_u16(Half<1>(square), weight), 16); - return vcombine_u16(lo, hi); - } - - SIMD_INLINE uint8x16_t ShiftedWeightedSquare(uint8x16_t difference, uint16x4_t weight) - { - const uint16x8_t lo = ShiftedWeightedSquare(Half<0>(difference), weight); - const uint16x8_t hi = ShiftedWeightedSquare(Half<1>(difference), weight); - return PackSaturatedU16(lo, hi); - } - - template SIMD_INLINE void AddFeatureDifference(const uint8_t * value, const uint8_t * lo, const uint8_t * hi, - uint8_t * difference, size_t offset, uint16x4_t weight, uint8x16_t mask) - { - const uint8x16_t _value = Load(value + offset); - const uint8x16_t _lo = Load(lo + offset); - const uint8x16_t _hi = Load(hi + offset); - uint8x16_t _difference = Load(difference + offset); - - const uint8x16_t featureDifference = FeatureDifference(_value, _lo, _hi); - const uint8x16_t inc = vandq_u8(mask, ShiftedWeightedSquare(featureDifference, weight)); - Store(difference + offset, vqaddq_u8(_difference, inc)); - } - - template void AddFeatureDifference(const uint8_t * value, size_t valueStride, size_t width, size_t height, - const uint8_t * lo, size_t loStride, const uint8_t * hi, size_t hiStride, - uint16_t weight, uint8_t * difference, size_t differenceStride) - { - assert(width >= A); - if (align) - { - assert(Aligned(value) && Aligned(valueStride)); - assert(Aligned(lo) && Aligned(loStride)); - assert(Aligned(hi) && Aligned(hiStride)); - assert(Aligned(difference) && Aligned(differenceStride)); - } - - size_t alignedWidth = AlignLo(width, A); - uint8x16_t tailMask = ShiftLeft(K8_FF, A - width + alignedWidth); - uint16x4_t _weight = vdup_n_u16(weight); - - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - AddFeatureDifference(value, lo, hi, difference, col, _weight, K8_FF); - if (alignedWidth != width) - AddFeatureDifference(value, lo, hi, difference, width - A, _weight, tailMask); - value += valueStride; - lo += loStride; - hi += hiStride; - difference += differenceStride; - } - } - - void AddFeatureDifference(const uint8_t * value, size_t valueStride, size_t width, size_t height, - const uint8_t * lo, size_t loStride, const uint8_t * hi, size_t hiStride, - uint16_t weight, uint8_t * difference, size_t differenceStride) - { - if (Aligned(value) && Aligned(valueStride) && Aligned(lo) && Aligned(loStride) && - Aligned(hi) && Aligned(hiStride) && Aligned(difference) && Aligned(differenceStride)) - AddFeatureDifference(value, valueStride, width, height, lo, loStride, hi, hiStride, weight, difference, differenceStride); - else - AddFeatureDifference(value, valueStride, width, height, lo, loStride, hi, hiStride, weight, difference, differenceStride); - } - } -#endif// SIMD_NEON_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdNeonAlphaBlending.cpp b/src/3rd/Simd/Simd/SimdNeonAlphaBlending.cpp deleted file mode 100644 index b830cb61..00000000 --- a/src/3rd/Simd/Simd/SimdNeonAlphaBlending.cpp +++ /dev/null @@ -1,295 +0,0 @@ -/* -* Simd 
Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2019 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdStore.h" -#include "Simd/SimdMemory.h" -#include "Simd/SimdBase.h" - -namespace Simd -{ -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - template SIMD_INLINE uint8x8_t AlphaBlending(const uint8x16_t & src, const uint8x16_t & dst, - const uint8x16_t & alpha, const uint8x16_t & ff_alpha) - { - uint16x8_t value = vaddq_u16( - vmull_u8(Half(src), Half(alpha)), - vmull_u8(Half(dst), Half(ff_alpha))); - return vshrn_n_u16(vaddq_u16(vaddq_u16(value, K16_0001), vshrq_n_u16(value, 8)), 8); - } - - template SIMD_INLINE void AlphaBlending(const uint8_t * src, uint8_t * dst, const uint8x16_t & alpha) - { - uint8x16_t _src = Load(src); - uint8x16_t _dst = Load(dst); - uint8x16_t ff_alpha = vsubq_u8(K8_FF, alpha); - uint8x8_t lo = AlphaBlending<0>(_src, _dst, alpha, ff_alpha); - uint8x8_t hi = AlphaBlending<1>(_src, _dst, alpha, ff_alpha); - Store(dst, vcombine_u8(lo, hi)); - } - - template struct AlphaBlender - { - void operator()(const uint8_t * src, uint8_t * dst, uint8x16_t alpha); - }; - - template struct AlphaBlender - { - SIMD_INLINE void operator()(const uint8_t * src, uint8_t * dst, uint8x16_t alpha) - { - AlphaBlending(src, dst, alpha); - } - }; - - template struct AlphaBlender - { - SIMD_INLINE void operator()(const uint8_t * src, uint8_t * dst, uint8x16_t alpha) - { - uint8x16x2_t _alpha = vzipq_u8(alpha, alpha); - AlphaBlending(src + 0, dst + 0, _alpha.val[0]); - AlphaBlending(src + A, dst + A, _alpha.val[1]); - } - }; - - template struct AlphaBlender - { - SIMD_INLINE void operator()(const uint8_t * src, uint8_t * dst, uint8x16_t alpha) - { - uint8x16x3_t _alpha; - _alpha.val[0] = alpha; - _alpha.val[1] = alpha; - _alpha.val[2] = alpha; - Store3((uint8_t*)&_alpha, _alpha); - AlphaBlending(src + 0 * A, dst + 0 * A, _alpha.val[0]); - AlphaBlending(src + 1 * A, dst + 1 * A, _alpha.val[1]); - AlphaBlending(src + 2 * A, dst + 2 * A, _alpha.val[2]); - } - }; - - template struct AlphaBlender - { - SIMD_INLINE void operator()(const uint8_t * src, uint8_t * dst, uint8x16_t alpha) - { - uint8x16x2_t _alpha = vzipq_u8(alpha, alpha); - AlphaBlender()(src + A * 0, dst + A * 0, _alpha.val[0]); - AlphaBlender()(src + A * 2, dst + A * 2, _alpha.val[1]); - } - }; - - template void AlphaBlending(const uint8_t *src, size_t srcStride, size_t width, size_t height, - const uint8_t *alpha, size_t alphaStride, uint8_t *dst, size_t dstStride) - { - 
size_t alignedWidth = AlignLo(width, A); - uint8x16_t tailMask = ShiftLeft(K8_FF, A - width + alignedWidth); - size_t step = channelCount*A; - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0, offset = 0; col < alignedWidth; col += A, offset += step) - { - uint8x16_t _alpha = Load(alpha + col); - AlphaBlender()(src + offset, dst + offset, _alpha); - } - if (alignedWidth != width) - { - uint8x16_t _alpha = vandq_u8(Load(alpha + width - A), tailMask); - AlphaBlender()(src + (width - A)*channelCount, dst + (width - A)*channelCount, _alpha); - } - src += srcStride; - alpha += alphaStride; - dst += dstStride; - } - } - - template void AlphaBlending(const uint8_t *src, size_t srcStride, size_t width, size_t height, size_t channelCount, - const uint8_t *alpha, size_t alphaStride, uint8_t *dst, size_t dstStride) - { - assert(width >= A); - if (align) - { - assert(Aligned(src) && Aligned(srcStride)); - assert(Aligned(alpha) && Aligned(alphaStride)); - assert(Aligned(dst) && Aligned(dstStride)); - } - - switch (channelCount) - { - case 1: AlphaBlending(src, srcStride, width, height, alpha, alphaStride, dst, dstStride); break; - case 2: AlphaBlending(src, srcStride, width, height, alpha, alphaStride, dst, dstStride); break; - case 3: AlphaBlending(src, srcStride, width, height, alpha, alphaStride, dst, dstStride); break; - case 4: AlphaBlending(src, srcStride, width, height, alpha, alphaStride, dst, dstStride); break; - default: - assert(0); - } - } - - void AlphaBlending(const uint8_t *src, size_t srcStride, size_t width, size_t height, size_t channelCount, - const uint8_t *alpha, size_t alphaStride, uint8_t *dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride) && Aligned(alpha) && Aligned(alphaStride) && Aligned(dst) && Aligned(dstStride)) - AlphaBlending(src, srcStride, width, height, channelCount, alpha, alphaStride, dst, dstStride); - else - AlphaBlending(src, srcStride, width, height, channelCount, alpha, alphaStride, dst, dstStride); - } - - template SIMD_INLINE void AlphaFilling(uint8_t * dst, const uint8x16_t & channel, const uint8x16_t & alpha) - { - uint8x16_t _dst = Load(dst); - uint8x16_t ff_alpha = vsubq_u8(K8_FF, alpha); - uint8x8_t lo = AlphaBlending<0>(channel, _dst, alpha, ff_alpha); - uint8x8_t hi = AlphaBlending<1>(channel, _dst, alpha, ff_alpha); - Store(dst, vcombine_u8(lo, hi)); - } - - template struct AlphaFiller - { - void operator() (uint8x16_t * dst, const uint8x16_t * channel, const uint8x16_t & alpha); - }; - - template struct AlphaFiller - { - SIMD_INLINE void operator()(uint8_t * dst, const uint8x16_t * channel, const uint8x16_t & alpha) - { - AlphaFilling(dst, channel[0], alpha); - } - }; - - template struct AlphaFiller - { - SIMD_INLINE void operator()(uint8_t * dst, const uint8x16_t * channel, const uint8x16_t & alpha) - { - uint8x16x2_t _alpha = vzipq_u8(alpha, alpha); - AlphaFilling(dst + 0 * A, channel[0], _alpha.val[0]); - AlphaFilling(dst + 1 * A, channel[1], _alpha.val[1]); - } - }; - - template struct AlphaFiller - { - SIMD_INLINE void operator()(uint8_t * dst, const uint8x16_t * channel, const uint8x16_t & alpha) - { - uint8x16x3_t _alpha; - _alpha.val[0] = alpha; - _alpha.val[1] = alpha; - _alpha.val[2] = alpha; - Store3((uint8_t*)&_alpha, _alpha); - AlphaFilling(dst + 0 * A, channel[0], _alpha.val[0]); - AlphaFilling(dst + 1 * A, channel[1], _alpha.val[1]); - AlphaFilling(dst + 2 * A, channel[2], _alpha.val[2]); - } - }; - - template struct AlphaFiller - { - SIMD_INLINE void operator()(uint8_t * dst, const uint8x16_t * 
channel, const uint8x16_t & alpha) - { - uint8x16x2_t _alpha = vzipq_u8(alpha, alpha); - AlphaFiller()(dst + A * 0, channel + 0, _alpha.val[0]); - AlphaFiller()(dst + A * 2, channel + 2, _alpha.val[1]); - } - }; - - template void AlphaFilling(uint8_t * dst, size_t dstStride, size_t width, size_t height, const uint8x16_t * channel, const uint8_t * alpha, size_t alphaStride) - { - size_t alignedWidth = AlignLo(width, A); - uint8x16_t tailMask = ShiftLeft(K8_FF, A - width + alignedWidth); - size_t step = channelCount * A; - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0, offset = 0; col < alignedWidth; col += A, offset += step) - { - uint8x16_t _alpha = Load(alpha + col); - AlphaFiller()(dst + offset, channel, _alpha); - } - if (alignedWidth != width) - { - uint8x16_t _alpha = vandq_u8(Load(alpha + width - A), tailMask); - AlphaFiller()(dst + (width - A)*channelCount, channel, _alpha); - } - alpha += alphaStride; - dst += dstStride; - } - } - - template void AlphaFilling(uint8_t * dst, size_t dstStride, size_t width, size_t height, const uint8_t * channel, size_t channelCount, const uint8_t * alpha, size_t alphaStride) - { - assert(width >= A); - if (align) - { - assert(Aligned(dst) && Aligned(dstStride)); - assert(Aligned(alpha) && Aligned(alphaStride)); - } - - switch (channelCount) - { - case 1: - { - uint8x16_t _channel = vdupq_n_u8(channel[0]); - AlphaFilling(dst, dstStride, width, height, &_channel, alpha, alphaStride); - break; - } - case 2: - { - uint8x16x2_t _channel; - _channel.val[0] = vdupq_n_u8(channel[0]); - _channel.val[1] = vdupq_n_u8(channel[1]); - Store2((uint8_t*)&_channel, _channel); - AlphaFilling(dst, dstStride, width, height, _channel.val, alpha, alphaStride); - break; - } - case 3: - { - uint8x16x3_t _channel; - _channel.val[0] = vdupq_n_u8(channel[0]); - _channel.val[1] = vdupq_n_u8(channel[1]); - _channel.val[2] = vdupq_n_u8(channel[2]); - Store3((uint8_t*)&_channel, _channel); - AlphaFilling(dst, dstStride, width, height, _channel.val, alpha, alphaStride); - break; - } - case 4: - { - uint8x16x4_t _channel; - _channel.val[0] = vdupq_n_u8(channel[0]); - _channel.val[1] = vdupq_n_u8(channel[1]); - _channel.val[2] = vdupq_n_u8(channel[2]); - _channel.val[3] = vdupq_n_u8(channel[3]); - Store4((uint8_t*)&_channel, _channel); - AlphaFilling(dst, dstStride, width, height, _channel.val, alpha, alphaStride); - break; - } - default: - assert(0); - } - } - - void AlphaFilling(uint8_t * dst, size_t dstStride, size_t width, size_t height, const uint8_t * channel, size_t channelCount, const uint8_t * alpha, size_t alphaStride) - { - if (Aligned(dst) && Aligned(dstStride) && Aligned(alpha) && Aligned(alphaStride)) - AlphaFilling(dst, dstStride, width, height, channel, channelCount, alpha, alphaStride); - else - AlphaFilling(dst, dstStride, width, height, channel, channelCount, alpha, alphaStride); - } - } -#endif// SIMD_NEON_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdNeonBackground.cpp b/src/3rd/Simd/Simd/SimdNeonBackground.cpp deleted file mode 100644 index c29552b8..00000000 --- a/src/3rd/Simd/Simd/SimdNeonBackground.cpp +++ /dev/null @@ -1,432 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. 
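[Annotation, not part of the diff.] A note on the blending arithmetic used throughout the alpha code above: both AlphaBlending and AlphaFilling compute dst = (src*alpha + dst*(255 - alpha)) / 255 per channel, replacing the division by 255 with shifts. A scalar sketch (my annotation):

#include <cstdint>

// (x + 1 + (x >> 8)) >> 8 equals x / 255 exactly for the 16-bit
// products of two bytes that occur here (x <= 65025).
static inline uint8_t DivideBy255(uint16_t x)
{
    return (uint8_t)((x + 1 + (x >> 8)) >> 8);
}

static inline uint8_t AlphaBlendScalar(uint8_t src, uint8_t dst, uint8_t alpha)
{
    return DivideBy255((uint16_t)(src * alpha + dst * (255 - alpha)));
}

// Example: blending white over black at half opacity:
// AlphaBlendScalar(255, 0, 128) == 128.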
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" - -namespace Simd -{ -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - template SIMD_INLINE void BackgroundGrowRangeSlow(const uint8_t * value, uint8_t * lo, uint8_t * hi, uint8x16_t mask) - { - const uint8x16_t _value = Load(value); - const uint8x16_t _lo = Load(lo); - const uint8x16_t _hi = Load(hi); - - const uint8x16_t inc = vandq_u8(mask, vcgtq_u8(_value, _hi)); - const uint8x16_t dec = vandq_u8(mask, vcltq_u8(_value, _lo)); - - Store(lo, vqsubq_u8(_lo, dec)); - Store(hi, vqaddq_u8(_hi, inc)); - } - - template void BackgroundGrowRangeSlow(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * lo, size_t loStride, uint8_t * hi, size_t hiStride) - { - assert(width >= A); - if (align) - { - assert(Aligned(value) && Aligned(valueStride)); - assert(Aligned(lo) && Aligned(loStride)); - assert(Aligned(hi) && Aligned(hiStride)); - } - - size_t alignedWidth = AlignLo(width, A); - uint8x16_t tailMask = ShiftLeft(K8_01, A - width + alignedWidth); - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - BackgroundGrowRangeSlow(value + col, lo + col, hi + col, K8_01); - if (alignedWidth != width) - BackgroundGrowRangeSlow(value + width - A, lo + width - A, hi + width - A, tailMask); - value += valueStride; - lo += loStride; - hi += hiStride; - } - } - - void BackgroundGrowRangeSlow(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * lo, size_t loStride, uint8_t * hi, size_t hiStride) - { - if (Aligned(value) && Aligned(valueStride) && Aligned(lo) && Aligned(loStride) && Aligned(hi) && Aligned(hiStride)) - BackgroundGrowRangeSlow(value, valueStride, width, height, lo, loStride, hi, hiStride); - else - BackgroundGrowRangeSlow(value, valueStride, width, height, lo, loStride, hi, hiStride); - } - - template SIMD_INLINE void BackgroundGrowRangeFast(const uint8_t * value, uint8_t * lo, uint8_t * hi) - { - const uint8x16_t _value = Load(value); - const uint8x16_t _lo = Load(lo); - const uint8x16_t _hi = Load(hi); - - Store(lo, vminq_u8(_lo, _value)); - Store(hi, vmaxq_u8(_hi, _value)); - } - - template void BackgroundGrowRangeFast(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * lo, size_t loStride, uint8_t * hi, size_t hiStride) - { - assert(width >= A); - if (align) - { - assert(Aligned(value) && Aligned(valueStride)); - assert(Aligned(lo) && 
Aligned(loStride)); - assert(Aligned(hi) && Aligned(hiStride)); - } - - size_t alignedWidth = AlignLo(width, A); - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - BackgroundGrowRangeFast(value + col, lo + col, hi + col); - if (alignedWidth != width) - BackgroundGrowRangeFast(value + width - A, lo + width - A, hi + width - A); - value += valueStride; - lo += loStride; - hi += hiStride; - } - } - - void BackgroundGrowRangeFast(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * lo, size_t loStride, uint8_t * hi, size_t hiStride) - { - if (Aligned(value) && Aligned(valueStride) && Aligned(lo) && Aligned(loStride) && Aligned(hi) && Aligned(hiStride)) - BackgroundGrowRangeFast(value, valueStride, width, height, lo, loStride, hi, hiStride); - else - BackgroundGrowRangeFast(value, valueStride, width, height, lo, loStride, hi, hiStride); - } - - template SIMD_INLINE void BackgroundIncrementCount(const uint8_t * value, - const uint8_t * loValue, const uint8_t * hiValue, uint8_t * loCount, uint8_t * hiCount, size_t offset, uint8x16_t mask) - { - const uint8x16_t _value = Load(value + offset); - const uint8x16_t _loValue = Load(loValue + offset); - const uint8x16_t _loCount = Load(loCount + offset); - const uint8x16_t _hiValue = Load(hiValue + offset); - const uint8x16_t _hiCount = Load(hiCount + offset); - - const uint8x16_t incLo = vandq_u8(mask, vcltq_u8(_value, _loValue)); - const uint8x16_t incHi = vandq_u8(mask, vcgtq_u8(_value, _hiValue)); - - Store(loCount + offset, vqaddq_u8(_loCount, incLo)); - Store(hiCount + offset, vqaddq_u8(_hiCount, incHi)); - } - - template void BackgroundIncrementCount(const uint8_t * value, size_t valueStride, size_t width, size_t height, - const uint8_t * loValue, size_t loValueStride, const uint8_t * hiValue, size_t hiValueStride, - uint8_t * loCount, size_t loCountStride, uint8_t * hiCount, size_t hiCountStride) - { - assert(width >= A); - if (align) - { - assert(Aligned(value) && Aligned(valueStride)); - assert(Aligned(loValue) && Aligned(loValueStride) && Aligned(hiValue) && Aligned(hiValueStride)); - assert(Aligned(loCount) && Aligned(loCountStride) && Aligned(hiCount) && Aligned(hiCountStride)); - } - - size_t alignedWidth = AlignLo(width, A); - uint8x16_t tailMask = ShiftLeft(K8_01, A - width + alignedWidth); - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - BackgroundIncrementCount(value, loValue, hiValue, loCount, hiCount, col, K8_01); - if (alignedWidth != width) - BackgroundIncrementCount(value, loValue, hiValue, loCount, hiCount, width - A, tailMask); - value += valueStride; - loValue += loValueStride; - hiValue += hiValueStride; - loCount += loCountStride; - hiCount += hiCountStride; - } - } - - void BackgroundIncrementCount(const uint8_t * value, size_t valueStride, size_t width, size_t height, - const uint8_t * loValue, size_t loValueStride, const uint8_t * hiValue, size_t hiValueStride, - uint8_t * loCount, size_t loCountStride, uint8_t * hiCount, size_t hiCountStride) - { - if (Aligned(value) && Aligned(valueStride) && - Aligned(loValue) && Aligned(loValueStride) && Aligned(hiValue) && Aligned(hiValueStride) && - Aligned(loCount) && Aligned(loCountStride) && Aligned(hiCount) && Aligned(hiCountStride)) - BackgroundIncrementCount(value, valueStride, width, height, - loValue, loValueStride, hiValue, hiValueStride, loCount, loCountStride, hiCount, hiCountStride); - else - BackgroundIncrementCount(value, valueStride, 
width, height, - loValue, loValueStride, hiValue, hiValueStride, loCount, loCountStride, hiCount, hiCountStride); - } - - SIMD_INLINE uint8x16_t AdjustLo(const uint8x16_t & count, const uint8x16_t & value, const uint8x16_t & mask, const uint8x16_t & threshold) - { - const uint8x16_t dec = vandq_u8(mask, vcgtq_u8(count, threshold)); - const uint8x16_t inc = vandq_u8(mask, vcltq_u8(count, threshold)); - return vqsubq_u8(vqaddq_u8(value, inc), dec); - } - - SIMD_INLINE uint8x16_t AdjustHi(const uint8x16_t & count, const uint8x16_t & value, const uint8x16_t & mask, const uint8x16_t & threshold) - { - const uint8x16_t inc = vandq_u8(mask, vcgtq_u8(count, threshold)); - const uint8x16_t dec = vandq_u8(mask, vcltq_u8(count, threshold)); - return vqsubq_u8(vqaddq_u8(value, inc), dec); - } - - template SIMD_INLINE void BackgroundAdjustRange(uint8_t * loCount, uint8_t * loValue, - uint8_t * hiCount, uint8_t * hiValue, size_t offset, const uint8x16_t & threshold, const uint8x16_t & mask) - { - const uint8x16_t _loCount = Load(loCount + offset); - const uint8x16_t _loValue = Load(loValue + offset); - const uint8x16_t _hiCount = Load(hiCount + offset); - const uint8x16_t _hiValue = Load(hiValue + offset); - - Store(loValue + offset, AdjustLo(_loCount, _loValue, mask, threshold)); - Store(hiValue + offset, AdjustHi(_hiCount, _hiValue, mask, threshold)); - Store(loCount + offset, K8_00); - Store(hiCount + offset, K8_00); - } - - template void BackgroundAdjustRange(uint8_t * loCount, size_t loCountStride, size_t width, size_t height, - uint8_t * loValue, size_t loValueStride, uint8_t * hiCount, size_t hiCountStride, - uint8_t * hiValue, size_t hiValueStride, uint8_t threshold) - { - assert(width >= A); - if (align) - { - assert(Aligned(loValue) && Aligned(loValueStride) && Aligned(hiValue) && Aligned(hiValueStride)); - assert(Aligned(loCount) && Aligned(loCountStride) && Aligned(hiCount) && Aligned(hiCountStride)); - } - - const uint8x16_t _threshold = vld1q_dup_u8(&threshold); - size_t alignedWidth = AlignLo(width, A); - uint8x16_t tailMask = ShiftLeft(K8_01, A - width + alignedWidth); - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - BackgroundAdjustRange(loCount, loValue, hiCount, hiValue, col, _threshold, K8_01); - if (alignedWidth != width) - BackgroundAdjustRange(loCount, loValue, hiCount, hiValue, width - A, _threshold, tailMask); - loValue += loValueStride; - hiValue += hiValueStride; - loCount += loCountStride; - hiCount += hiCountStride; - } - } - - void BackgroundAdjustRange(uint8_t * loCount, size_t loCountStride, size_t width, size_t height, - uint8_t * loValue, size_t loValueStride, uint8_t * hiCount, size_t hiCountStride, - uint8_t * hiValue, size_t hiValueStride, uint8_t threshold) - { - if (Aligned(loValue) && Aligned(loValueStride) && Aligned(hiValue) && Aligned(hiValueStride) && - Aligned(loCount) && Aligned(loCountStride) && Aligned(hiCount) && Aligned(hiCountStride)) - BackgroundAdjustRange(loCount, loCountStride, width, height, loValue, loValueStride, - hiCount, hiCountStride, hiValue, hiValueStride, threshold); - else - BackgroundAdjustRange(loCount, loCountStride, width, height, loValue, loValueStride, - hiCount, hiCountStride, hiValue, hiValueStride, threshold); - } - - - template SIMD_INLINE void BackgroundAdjustRangeMasked(uint8_t * loCount, uint8_t * loValue, uint8_t * hiCount, uint8_t * hiValue, - const uint8_t * mask, size_t offset, const uint8x16_t & threshold, const uint8x16_t & tailMask) - { - const uint8x16_t _mask = 
Load(mask + offset); - BackgroundAdjustRange(loCount, loValue, hiCount, hiValue, offset, threshold, vandq_u8(_mask, tailMask)); - } - - template void BackgroundAdjustRangeMasked(uint8_t * loCount, size_t loCountStride, size_t width, size_t height, - uint8_t * loValue, size_t loValueStride, uint8_t * hiCount, size_t hiCountStride, - uint8_t * hiValue, size_t hiValueStride, uint8_t threshold, const uint8_t * mask, size_t maskStride) - { - assert(width >= A); - if (align) - { - assert(Aligned(loValue) && Aligned(loValueStride) && Aligned(hiValue) && Aligned(hiValueStride)); - assert(Aligned(loCount) && Aligned(loCountStride) && Aligned(hiCount) && Aligned(hiCountStride)); - assert(Aligned(mask) && Aligned(maskStride)); - } - - const uint8x16_t _threshold = vld1q_dup_u8(&threshold); - size_t alignedWidth = AlignLo(width, A); - uint8x16_t tailMask = ShiftLeft(K8_01, A - width + alignedWidth); - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - BackgroundAdjustRangeMasked(loCount, loValue, hiCount, hiValue, mask, col, _threshold, K8_01); - if (alignedWidth != width) - BackgroundAdjustRangeMasked(loCount, loValue, hiCount, hiValue, mask, width - A, _threshold, tailMask); - loValue += loValueStride; - hiValue += hiValueStride; - loCount += loCountStride; - hiCount += hiCountStride; - mask += maskStride; - } - } - - void BackgroundAdjustRangeMasked(uint8_t * loCount, size_t loCountStride, size_t width, size_t height, - uint8_t * loValue, size_t loValueStride, uint8_t * hiCount, size_t hiCountStride, - uint8_t * hiValue, size_t hiValueStride, uint8_t threshold, const uint8_t * mask, size_t maskStride) - { - if (Aligned(loValue) && Aligned(loValueStride) && Aligned(hiValue) && Aligned(hiValueStride) && - Aligned(loCount) && Aligned(loCountStride) && Aligned(hiCount) && Aligned(hiCountStride) && - Aligned(mask) && Aligned(maskStride)) - BackgroundAdjustRangeMasked(loCount, loCountStride, width, height, loValue, loValueStride, - hiCount, hiCountStride, hiValue, hiValueStride, threshold, mask, maskStride); - else - BackgroundAdjustRangeMasked(loCount, loCountStride, width, height, loValue, loValueStride, - hiCount, hiCountStride, hiValue, hiValueStride, threshold, mask, maskStride); - } - - template SIMD_INLINE void BackgroundShiftRange(const uint8_t * value, uint8_t * lo, uint8_t * hi, size_t offset, uint8x16_t mask) - { - const uint8x16_t _value = Load(value + offset); - const uint8x16_t _lo = Load(lo + offset); - const uint8x16_t _hi = Load(hi + offset); - - const uint8x16_t add = vandq_u8(mask, vqsubq_u8(_value, _hi)); - const uint8x16_t sub = vandq_u8(mask, vqsubq_u8(_lo, _value)); - - Store(lo + offset, vqsubq_u8(vqaddq_u8(_lo, add), sub)); - Store(hi + offset, vqsubq_u8(vqaddq_u8(_hi, add), sub)); - } - - template void BackgroundShiftRange(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * lo, size_t loStride, uint8_t * hi, size_t hiStride) - { - assert(width >= A); - if (align) - { - assert(Aligned(value) && Aligned(valueStride)); - assert(Aligned(lo) && Aligned(loStride)); - assert(Aligned(hi) && Aligned(hiStride)); - } - - size_t alignedWidth = AlignLo(width, A); - uint8x16_t tailMask = ShiftLeft(K8_FF, A - width + alignedWidth); - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - BackgroundShiftRange(value, lo, hi, col, K8_FF); - if (alignedWidth != width) - BackgroundShiftRange(value, lo, hi, width - A, tailMask); - value += valueStride; - lo += loStride; - 
hi += hiStride; - } - } - - void BackgroundShiftRange(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * lo, size_t loStride, uint8_t * hi, size_t hiStride) - { - if (Aligned(value) && Aligned(valueStride) && Aligned(lo) && Aligned(loStride) && Aligned(hi) && Aligned(hiStride)) - BackgroundShiftRange(value, valueStride, width, height, lo, loStride, hi, hiStride); - else - BackgroundShiftRange(value, valueStride, width, height, lo, loStride, hi, hiStride); - } - - - template SIMD_INLINE void BackgroundShiftRangeMasked(const uint8_t * value, uint8_t * lo, uint8_t * hi, const uint8_t * mask, - size_t offset, uint8x16_t tailMask) - { - const uint8x16_t _mask = Load(mask + offset); - BackgroundShiftRange(value, lo, hi, offset, vandq_u8(_mask, tailMask)); - } - - template void BackgroundShiftRangeMasked(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * lo, size_t loStride, uint8_t * hi, size_t hiStride, const uint8_t * mask, size_t maskStride) - { - assert(width >= A); - if (align) - { - assert(Aligned(value) && Aligned(valueStride)); - assert(Aligned(lo) && Aligned(loStride)); - assert(Aligned(hi) && Aligned(hiStride)); - assert(Aligned(mask) && Aligned(maskStride)); - } - - size_t alignedWidth = AlignLo(width, A); - uint8x16_t tailMask = ShiftLeft(K8_FF, A - width + alignedWidth); - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - BackgroundShiftRangeMasked(value, lo, hi, mask, col, K8_FF); - if (alignedWidth != width) - BackgroundShiftRangeMasked(value, lo, hi, mask, width - A, tailMask); - value += valueStride; - lo += loStride; - hi += hiStride; - mask += maskStride; - } - } - - void BackgroundShiftRangeMasked(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * lo, size_t loStride, uint8_t * hi, size_t hiStride, const uint8_t * mask, size_t maskStride) - { - if (Aligned(value) && Aligned(valueStride) && Aligned(lo) && Aligned(loStride) && - Aligned(hi) && Aligned(hiStride) && Aligned(mask) && Aligned(maskStride)) - BackgroundShiftRangeMasked(value, valueStride, width, height, lo, loStride, hi, hiStride, mask, maskStride); - else - BackgroundShiftRangeMasked(value, valueStride, width, height, lo, loStride, hi, hiStride, mask, maskStride); - } - - template SIMD_INLINE void BackgroundInitMask(const uint8_t * src, uint8_t * dst, const uint8x16_t & index, const uint8x16_t & value) - { - uint8x16_t _mask = vceqq_u8(Load(src), index); - uint8x16_t _old = Load(dst); - Store(dst, vbslq_u8(_mask, value, _old)); - } - - template void BackgroundInitMask(const uint8_t * src, size_t srcStride, size_t width, size_t height, - uint8_t index, uint8_t value, uint8_t * dst, size_t dstStride) - { - assert(width >= A); - if (align) - { - assert(Aligned(src) && Aligned(srcStride)); - assert(Aligned(dst) && Aligned(dstStride)); - } - - size_t alignedWidth = AlignLo(width, A); - uint8x16_t _index = vld1q_dup_u8(&index); - uint8x16_t _value = vld1q_dup_u8(&value); - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - BackgroundInitMask(src + col, dst + col, _index, _value); - if (alignedWidth != width) - BackgroundInitMask(src + width - A, dst + width - A, _index, _value); - src += srcStride; - dst += dstStride; - } - } - - void BackgroundInitMask(const uint8_t * src, size_t srcStride, size_t width, size_t height, - uint8_t index, uint8_t value, uint8_t * dst, size_t dstStride) - { - if (Aligned(src) && 
Aligned(srcStride) && Aligned(dst) && Aligned(dstStride)) - BackgroundInitMask(src, srcStride, width, height, index, value, dst, dstStride); - else - BackgroundInitMask(src, srcStride, width, height, index, value, dst, dstStride); - } - } -#endif// SIMD_NEON_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdNeonBayerToBgr.cpp b/src/3rd/Simd/Simd/SimdNeonBayerToBgr.cpp deleted file mode 100644 index 54702e64..00000000 --- a/src/3rd/Simd/Simd/SimdNeonBayerToBgr.cpp +++ /dev/null @@ -1,108 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2019 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdBayer.h" - -namespace Simd -{ -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - template SIMD_INLINE void SaveBgr(uint8x8x2_t src[3], uint8_t * dst) - { - uint8x16x3_t _bgr; - *(uint8x8x2_t*)(_bgr.val + 0) = vzip_u8(src[0].val[0], src[0].val[1]); - *(uint8x8x2_t*)(_bgr.val + 1) = vzip_u8(src[1].val[0], src[1].val[1]); - *(uint8x8x2_t*)(_bgr.val + 2) = vzip_u8(src[2].val[0], src[2].val[1]); - Store3(dst, _bgr); - } - - template void BayerToBgr(const uint8x8x2_t src[12], uint8_t * bgr, size_t stride) - { - uint8x8x2_t _bgr[6]; - BayerToBgr(src, _bgr); - SaveBgr(_bgr + 0, bgr); - SaveBgr(_bgr + 3, bgr + stride); - } - - template void BayerToBgr(const uint8_t * bayer, size_t width, size_t height, size_t bayerStride, uint8_t * bgr, size_t bgrStride) - { - const uint8_t * src[3]; - uint8x8x2_t _src[12]; - size_t body = AlignHi(width - 2, A) - A; - for (size_t row = 0; row < height; row += 2) - { - src[0] = (row == 0 ? bayer : bayer - 2 * bayerStride); - src[1] = bayer; - src[2] = (row == height - 2 ? 
bayer : bayer + 2 * bayerStride); - - LoadBayerNose(src, 0, bayerStride, _src); - BayerToBgr(_src, bgr, bgrStride); - for (size_t col = A; col < body; col += A) - { - LoadBayerBody(src, col, bayerStride, _src); - BayerToBgr(_src, bgr + 3 * col, bgrStride); - } - LoadBayerTail(src, width - A, bayerStride, _src); - BayerToBgr(_src, bgr + 3 * (width - A), bgrStride); - - bayer += 2 * bayerStride; - bgr += 2 * bgrStride; - } - } - - template void BayerToBgr(const uint8_t * bayer, size_t width, size_t height, size_t bayerStride, SimdPixelFormatType bayerFormat, uint8_t * bgr, size_t bgrStride) - { - switch (bayerFormat) - { - case SimdPixelFormatBayerGrbg: - BayerToBgr(bayer, width, height, bayerStride, bgr, bgrStride); - break; - case SimdPixelFormatBayerGbrg: - BayerToBgr(bayer, width, height, bayerStride, bgr, bgrStride); - break; - case SimdPixelFormatBayerRggb: - BayerToBgr(bayer, width, height, bayerStride, bgr, bgrStride); - break; - case SimdPixelFormatBayerBggr: - BayerToBgr(bayer, width, height, bayerStride, bgr, bgrStride); - break; - default: - assert(0); - } - } - - void BayerToBgr(const uint8_t * bayer, size_t width, size_t height, size_t bayerStride, SimdPixelFormatType bayerFormat, uint8_t * bgr, size_t bgrStride) - { - assert((width % 2 == 0) && (height % 2 == 0)); - - if (Aligned(bayer) && Aligned(bgr) && Aligned(bayerStride) && Aligned(bgrStride)) - BayerToBgr(bayer, width, height, bayerStride, bayerFormat, bgr, bgrStride); - else - BayerToBgr(bayer, width, height, bayerStride, bayerFormat, bgr, bgrStride); - } - } -#endif// SIMD_NEON_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdNeonBayerToBgra.cpp b/src/3rd/Simd/Simd/SimdNeonBayerToBgra.cpp deleted file mode 100644 index 5683f094..00000000 --- a/src/3rd/Simd/Simd/SimdNeonBayerToBgra.cpp +++ /dev/null @@ -1,110 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2019 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
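[Annotation, not part of the diff.] SimdNeonBayerToBgr.cpp above and SimdNeonBayerToBgra.cpp below both dispatch on the four Bayer layouts, which differ only in which color each cell of the repeating 2x2 tile carries (the interpolation itself lives in SimdBayer.h). A small orientation helper, purely illustrative:

#include <cstddef>

enum class BayerCell { R, G, B };

// Color sampled at (row, col) for a GRBG sensor; the other three
// layouts (GBRG, RGGB, BGGR) permute the same 2x2 tile.
inline BayerCell GrbgCellAt(size_t row, size_t col)
{
    if (row % 2 == 0)
        return (col % 2 == 0) ? BayerCell::G : BayerCell::R;
    return (col % 2 == 0) ? BayerCell::B : BayerCell::G;
}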
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdBayer.h" - -namespace Simd -{ -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - template SIMD_INLINE void SaveBgra(const uint8x8x2_t bgr[3], const uint8x16_t & alpha, uint8_t * bgra) - { - uint8x16x4_t _bgra; - *(uint8x8x2_t*)(_bgra.val + 0) = vzip_u8(bgr[0].val[0], bgr[0].val[1]); - *(uint8x8x2_t*)(_bgra.val + 1) = vzip_u8(bgr[1].val[0], bgr[1].val[1]); - *(uint8x8x2_t*)(_bgra.val + 2) = vzip_u8(bgr[2].val[0], bgr[2].val[1]); - _bgra.val[3] = alpha; - Store4(bgra, _bgra); - } - - template void BayerToBgra(const uint8x8x2_t src[12], const uint8x16_t & alpha, uint8_t * bgra, size_t stride) - { - uint8x8x2_t bgr[6]; - BayerToBgr(src, bgr); - SaveBgra(bgr + 0, alpha, bgra); - SaveBgra(bgr + 3, alpha, bgra + stride); - } - - template void BayerToBgra(const uint8_t * bayer, size_t width, size_t height, size_t bayerStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha) - { - const uint8_t * src[3]; - uint8x8x2_t _src[12]; - uint8x16_t _alpha = vdupq_n_u8(alpha); - size_t body = AlignHi(width - 2, A) - A; - for (size_t row = 0; row < height; row += 2) - { - src[0] = (row == 0 ? bayer : bayer - 2 * bayerStride); - src[1] = bayer; - src[2] = (row == height - 2 ? bayer : bayer + 2 * bayerStride); - - LoadBayerNose(src, 0, bayerStride, _src); - BayerToBgra(_src, _alpha, bgra, bgraStride); - for (size_t col = A; col < body; col += A) - { - LoadBayerBody(src, col, bayerStride, _src); - BayerToBgra(_src, _alpha, bgra + 4 * col, bgraStride); - } - LoadBayerTail(src, width - A, bayerStride, _src); - BayerToBgra(_src, _alpha, bgra + 4 * (width - A), bgraStride); - - bayer += 2 * bayerStride; - bgra += 2 * bgraStride; -} - } - - template void BayerToBgra(const uint8_t * bayer, size_t width, size_t height, size_t bayerStride, SimdPixelFormatType bayerFormat, uint8_t * bgra, size_t bgraStride, uint8_t alpha) - { - switch (bayerFormat) - { - case SimdPixelFormatBayerGrbg: - BayerToBgra(bayer, width, height, bayerStride, bgra, bgraStride, alpha); - break; - case SimdPixelFormatBayerGbrg: - BayerToBgra(bayer, width, height, bayerStride, bgra, bgraStride, alpha); - break; - case SimdPixelFormatBayerRggb: - BayerToBgra(bayer, width, height, bayerStride, bgra, bgraStride, alpha); - break; - case SimdPixelFormatBayerBggr: - BayerToBgra(bayer, width, height, bayerStride, bgra, bgraStride, alpha); - break; - default: - assert(0); -} - } - - void BayerToBgra(const uint8_t * bayer, size_t width, size_t height, size_t bayerStride, SimdPixelFormatType bayerFormat, uint8_t * bgra, size_t bgraStride, uint8_t alpha) - { - assert((width % 2 == 0) && (height % 2 == 0)); - - if (Aligned(bayer) && Aligned(bgra) && Aligned(bayerStride) && Aligned(bgraStride)) - BayerToBgra(bayer, width, height, bayerStride, bayerFormat, bgra, bgraStride, alpha); - else - BayerToBgra(bayer, width, height, bayerStride, bayerFormat, bgra, bgraStride, alpha); - } - } -#endif// SIMD_NEON_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdNeonBgrToBayer.cpp b/src/3rd/Simd/Simd/SimdNeonBgrToBayer.cpp deleted file mode 100644 index df382cb2..00000000 --- a/src/3rd/Simd/Simd/SimdNeonBgrToBayer.cpp +++ /dev/null @@ -1,100 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. 
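[Annotation, not part of the diff.] The reverse conversion in SimdNeonBgrToBayer.cpp, which follows, needs no interpolation: each output row simply interleaves two of the B, G, R channels, selected per layout by the template indices (0 = B, 1 = G, 2 = R). A scalar sketch of one row (my annotation):

#include <cstdint>
#include <cstddef>

void BgrToBayerRowScalar(const uint8_t* bgr, size_t width,
    int evenChannel, int oddChannel, uint8_t* bayer)
{
    for (size_t col = 0; col < width; ++col)
        bayer[col] = bgr[3 * col + (col % 2 == 0 ? evenChannel : oddChannel)];
}

// GRBG: even rows use (1 /*G*/, 2 /*R*/), odd rows (0 /*B*/, 1 /*G*/),
// matching the <1, 2, 0, 1> instantiation below.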
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" - -namespace Simd -{ -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - template - SIMD_INLINE void BgrToBayer(const uint8_t * bgr, uint8_t * bayer) - { - uint8x16x3_t _bgr = Load3(bgr); - Store(bayer, vbslq_u8((uint8x16_t)K16_00FF, _bgr.val[c0], _bgr.val[c1])); - } - - template - void BgrToBayer(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bayer, size_t bayerStride) - { - assert(width >= A); - if (align) - assert(Aligned(bgr) && Aligned(bgrStride) && Aligned(bayer) && Aligned(bayerStride)); - - size_t alignedWidth = AlignLo(width, A); - const size_t A3 = A * 3; - - for (size_t row = 0; row < height; row += 2) - { - for (size_t col = 0, offset = 0; col < alignedWidth; col += A, offset += A3) - BgrToBayer(bgr + offset, bayer + col); - if (alignedWidth != width) - BgrToBayer(bgr + 3 * (width - A), bayer + width - A); - bgr += bgrStride; - bayer += bayerStride; - - for (size_t col = 0, offset = 0; col < alignedWidth; col += A, offset += A3) - BgrToBayer(bgr + offset, bayer + col); - if (alignedWidth != width) - BgrToBayer(bgr + 3 * (width - A), bayer + width - A); - bgr += bgrStride; - bayer += bayerStride; - } - } - - template - void BgrToBayer(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bayer, size_t bayerStride, SimdPixelFormatType bayerFormat) - { - assert((width % 2 == 0) && (height % 2 == 0)); - - switch (bayerFormat) - { - case SimdPixelFormatBayerGrbg: - BgrToBayer<1, 2, 0, 1, align>(bgr, width, height, bgrStride, bayer, bayerStride); - break; - case SimdPixelFormatBayerGbrg: - BgrToBayer<1, 0, 2, 1, align>(bgr, width, height, bgrStride, bayer, bayerStride); - break; - case SimdPixelFormatBayerRggb: - BgrToBayer<2, 1, 1, 0, align>(bgr, width, height, bgrStride, bayer, bayerStride); - break; - case SimdPixelFormatBayerBggr: - BgrToBayer<0, 1, 1, 2, align>(bgr, width, height, bgrStride, bayer, bayerStride); - break; - default: - assert(0); - } - } - - void BgrToBayer(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bayer, size_t bayerStride, SimdPixelFormatType bayerFormat) - { - if (Aligned(bgr) && Aligned(bgrStride) && Aligned(bayer) && Aligned(bayerStride)) - BgrToBayer(bgr, width, height, bgrStride, bayer, bayerStride, bayerFormat); - else - BgrToBayer(bgr, width, height, bgrStride, bayer, bayerStride, bayerFormat); - } - } -#endif// SIMD_NEON_ENABLE -} diff --git 
a/src/3rd/Simd/Simd/SimdNeonBgrToBgra.cpp b/src/3rd/Simd/Simd/SimdNeonBgrToBgra.cpp deleted file mode 100644 index 98a360b0..00000000 --- a/src/3rd/Simd/Simd/SimdNeonBgrToBgra.cpp +++ /dev/null @@ -1,176 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdStore.h" -#include "Simd/SimdMemory.h" - -namespace Simd -{ -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - const size_t A3 = A * 3; - const size_t A4 = A * 4; - - union Bgra - { - uint8x16x4_t bgra; - uint8x16x3_t bgr; - }; - - template SIMD_INLINE void BgrToBgra(const uint8_t * bgr, uint8_t * bgra, Bgra & _bgra) - { - _bgra.bgr = Load3(bgr); - Store4(bgra, _bgra.bgra); - } - - template void BgrToBgra(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha) - { - assert(width >= A); - if (align) - assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(bgr) && Aligned(bgrStride)); - - size_t alignedWidth = AlignLo(width, A); - - Bgra _bgra; - _bgra.bgra.val[3] = vdupq_n_u8(alpha); - - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0, colBgra = 0, colBgr = 0; col < alignedWidth; col += A, colBgra += A4, colBgr += A3) - BgrToBgra(bgr + colBgr, bgra + colBgra, _bgra); - if (width != alignedWidth) - BgrToBgra(bgr + 3 * (width - A), bgra + 4 * (width - A), _bgra); - bgr += bgrStride; - bgra += bgraStride; - } - } - - void BgrToBgra(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha) - { - if (Aligned(bgra) && Aligned(bgraStride) && Aligned(bgr) && Aligned(bgrStride)) - BgrToBgra(bgr, width, height, bgrStride, bgra, bgraStride, alpha); - else - BgrToBgra(bgr, width, height, bgrStride, bgra, bgraStride, alpha); - } - - //--------------------------------------------------------------------- - - template SIMD_INLINE void Bgr48pToBgra32(uint8_t * bgra, - const uint8_t * blue, const uint8_t * green, const uint8_t * red, size_t offset, const uint8x16_t & alpha) - { - uint8x16x2_t _blue = Load2(blue + offset); - uint8x16x2_t _green = Load2(green + offset); - uint8x16x2_t _red = Load2(red + offset); - - uint8x16x4_t _bgra; - _bgra.val[0] = _blue.val[0]; - _bgra.val[1] = _green.val[0]; - _bgra.val[2] = _red.val[0]; - _bgra.val[3] = alpha; - - Store4(bgra, _bgra); - } - - template void Bgr48pToBgra32(const uint8_t * blue, size_t blueStride, size_t width, size_t height, - const 
uint8_t * green, size_t greenStride, const uint8_t * red, size_t redStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha) - { - assert(width >= A); - if (align) - { - assert(Aligned(blue) && Aligned(blueStride)); - assert(Aligned(green) && Aligned(greenStride)); - assert(Aligned(red) && Aligned(redStride)); - assert(Aligned(bgra) && Aligned(bgraStride)); - } - - uint8x16_t _alpha; - _alpha = vdupq_n_u8(alpha); - - size_t alignedWidth = AlignLo(width, A) * 2; - for (size_t row = 0; row < height; ++row) - { - for (size_t srcOffset = 0, dstOffset = 0; srcOffset < alignedWidth; srcOffset += DA, dstOffset += QA) - Bgr48pToBgra32(bgra + dstOffset, blue, green, red, srcOffset, _alpha); - if (width != alignedWidth) - Bgr48pToBgra32(bgra + (width - A) * 4, blue, green, red, (width - A) * 2, _alpha); - blue += blueStride; - green += greenStride; - red += redStride; - bgra += bgraStride; - } - } - - void Bgr48pToBgra32(const uint8_t * blue, size_t blueStride, size_t width, size_t height, - const uint8_t * green, size_t greenStride, const uint8_t * red, size_t redStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha) - { - if (Aligned(blue) && Aligned(blueStride) && Aligned(green) && Aligned(greenStride) && - Aligned(red) && Aligned(redStride) && Aligned(bgra) && Aligned(bgraStride)) - Bgr48pToBgra32(blue, blueStride, width, height, green, greenStride, red, redStride, bgra, bgraStride, alpha); - else - Bgr48pToBgra32(blue, blueStride, width, height, green, greenStride, red, redStride, bgra, bgraStride, alpha); - } - - //--------------------------------------------------------------------- - - template SIMD_INLINE void RgbToBgra(const uint8_t* rgb, uint8_t* bgra, uint8x16_t alpha) - { - uint8x16x3_t _rgb = Load3(rgb); - uint8x16x4_t _bgra; - _bgra.val[0] = _rgb.val[2]; - _bgra.val[1] = _rgb.val[1]; - _bgra.val[2] = _rgb.val[0]; - _bgra.val[3] = alpha; - Store4(bgra, _bgra); - } - - template void RgbToBgra(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha) - { - assert(width >= A); - if (align) - assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(rgb) && Aligned(rgbStride)); - - size_t alignedWidth = AlignLo(width, A); - uint8x16_t _alpha = vdupq_n_u8(alpha); - - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0, colBgra = 0, colRgb = 0; col < alignedWidth; col += A, colBgra += A4, colRgb += A3) - RgbToBgra(rgb + colRgb, bgra + colBgra, _alpha); - if (width != alignedWidth) - RgbToBgra(rgb + 3 * (width - A), bgra + 4 * (width - A), _alpha); - rgb += rgbStride; - bgra += bgraStride; - } - } - - void RgbToBgra(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha) - { - if (Aligned(bgra) && Aligned(bgraStride) && Aligned(rgb) && Aligned(rgbStride)) - RgbToBgra(rgb, width, height, rgbStride, bgra, bgraStride, alpha); - else - RgbToBgra(rgb, width, height, rgbStride, bgra, bgraStride, alpha); - } - } -#endif// SIMD_NEON_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdNeonBgrToGray.cpp b/src/3rd/Simd/Simd/SimdNeonBgrToGray.cpp deleted file mode 100644 index 57cf19f1..00000000 --- a/src/3rd/Simd/Simd/SimdNeonBgrToGray.cpp +++ /dev/null @@ -1,114 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. 
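[Annotation, not part of the diff.] SimdNeonBgrToGray.cpp, next, reduces each pixel to luma with a fixed-point weighted sum; the exact constants live in SimdConversion.h. A scalar sketch using the common BT.601 integer weights, which I am assuming here and which may differ from the library's in the last bit:

#include <cstdint>

// gray ~= 0.114*B + 0.587*G + 0.299*R, rounded, in 14-bit fixed point
// (1868 + 9617 + 4899 == 1 << 14).
inline uint8_t BgrToGrayScalar(uint8_t b, uint8_t g, uint8_t r)
{
    return (uint8_t)((b * 1868 + g * 9617 + r * 4899 + (1 << 13)) >> 14);
}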
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdStore.h" -#include "Simd/SimdMemory.h" -#include "Simd/SimdConversion.h" - -namespace Simd -{ -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - SIMD_INLINE uint8x16_t BgrToGray(uint8x16x3_t bgr) - { - uint8x8_t lo = vmovn_u16(BgrToGray(UnpackU8<0>(bgr.val[0]), UnpackU8<0>(bgr.val[1]), UnpackU8<0>(bgr.val[2]))); - uint8x8_t hi = vmovn_u16(BgrToGray(UnpackU8<1>(bgr.val[0]), UnpackU8<1>(bgr.val[1]), UnpackU8<1>(bgr.val[2]))); - return vcombine_u8(lo, hi); - } - - template void BgrToGray(const uint8_t* bgr, size_t width, size_t height, size_t bgrStride, uint8_t* gray, size_t grayStride) - { - assert(width >= A); - if (align) - assert(Aligned(bgr) && Aligned(bgrStride) && Aligned(gray) && Aligned(grayStride)); - - size_t alignedWidth = AlignLo(width, A); - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - { - uint8x16x3_t _bgr = Load3(bgr + 3 * col); - Store(gray + col, BgrToGray(_bgr)); - } - if (alignedWidth != width) - { - uint8x16x3_t _bgr = Load3(bgr + 3 * (width - A)); - Store(gray + width - A, BgrToGray(_bgr)); - } - bgr += bgrStride; - gray += grayStride; - } - } - - void BgrToGray(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * gray, size_t grayStride) - { - if (Aligned(bgr) && Aligned(gray) && Aligned(bgrStride) && Aligned(grayStride)) - BgrToGray(bgr, width, height, bgrStride, gray, grayStride); - else - BgrToGray(bgr, width, height, bgrStride, gray, grayStride); - } - - //--------------------------------------------------------------------- - - SIMD_INLINE uint8x16_t RgbToGray(uint8x16x3_t rgb) - { - uint8x8_t lo = vmovn_u16(BgrToGray(UnpackU8<0>(rgb.val[2]), UnpackU8<0>(rgb.val[1]), UnpackU8<0>(rgb.val[0]))); - uint8x8_t hi = vmovn_u16(BgrToGray(UnpackU8<1>(rgb.val[2]), UnpackU8<1>(rgb.val[1]), UnpackU8<1>(rgb.val[0]))); - return vcombine_u8(lo, hi); - } - - template void RgbToGray(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* gray, size_t grayStride) - { - assert(width >= A); - if (align) - assert(Aligned(rgb) && Aligned(rgbStride) && Aligned(gray) && Aligned(grayStride)); - - size_t alignedWidth = AlignLo(width, A); - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - { - uint8x16x3_t _rgb = Load3(rgb + 3 * col); - Store(gray + col, RgbToGray(_rgb)); - } - if (alignedWidth != width) - { - uint8x16x3_t _rgb = Load3(rgb + 3 * (width - A)); - 
Store(gray + width - A, RgbToGray(_rgb)); - } - rgb += rgbStride; - gray += grayStride; - } - } - - void RgbToGray(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* gray, size_t grayStride) - { - if (Aligned(rgb) && Aligned(gray) && Aligned(rgbStride) && Aligned(grayStride)) - RgbToGray(rgb, width, height, rgbStride, gray, grayStride); - else - RgbToGray(rgb, width, height, rgbStride, gray, grayStride); - } - } -#endif// SIMD_NEON_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdNeonBgrToRgb.cpp b/src/3rd/Simd/Simd/SimdNeonBgrToRgb.cpp deleted file mode 100644 index 53a5deff..00000000 --- a/src/3rd/Simd/Simd/SimdNeonBgrToRgb.cpp +++ /dev/null @@ -1,71 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2019 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" - -namespace Simd -{ -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - template SIMD_INLINE void BgrToRgb(const uint8_t * bgr, uint8_t * rgb) - { - uint8x16x3_t _bgr = Load3(bgr); - uint8x16_t tmp = _bgr.val[0]; - _bgr.val[0] = _bgr.val[2]; - _bgr.val[2] = tmp; - Store3(rgb, _bgr); - } - - template void BgrToRgb(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * rgb, size_t rgbStride) - { - assert(width >= A); - if (align) - assert(Aligned(bgr) && Aligned(bgrStride) && Aligned(rgb) && Aligned(rgbStride)); - - const size_t A3 = A * 3; - size_t size = width * 3; - size_t aligned = AlignLo(width, A)*3; - - for (size_t row = 0; row < height; ++row) - { - for (size_t i = 0; i < aligned; i += A3) - BgrToRgb(bgr + i, rgb + i); - if (aligned < size) - BgrToRgb(bgr + size - A3, rgb + size - A3); - bgr += bgrStride; - rgb += rgbStride; - } - } - - void BgrToRgb(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * rgb, size_t rgbStride) - { - if (Aligned(bgr) && Aligned(bgrStride) && Aligned(rgb) && Aligned(rgbStride)) - BgrToRgb(bgr, bgrStride, width, height, rgb, rgbStride); - else - BgrToRgb(bgr, bgrStride, width, height, rgb, rgbStride); - } - } -#endif//SIMD_NEON_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdNeonBgrToYuv.cpp b/src/3rd/Simd/Simd/SimdNeonBgrToYuv.cpp deleted file mode 100644 index 984581a9..00000000 --- a/src/3rd/Simd/Simd/SimdNeonBgrToYuv.cpp +++ /dev/null @@ -1,218 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. 
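Editorial note between the deleted files: the BgrToGray/RgbToGray kernels above all reduce to one pattern, a three-way structure load that de-interleaves the channels followed by a fixed-point weighted sum. The sketch below is not taken from this patch; it assumes the common integer BT.601 approximation gray = (29*B + 150*G + 77*R + 128) >> 8, while the library's exact weights live in SimdConversion.h, which this diff does not touch.

    // Minimal sketch of the de-interleave + weighted-sum pattern (assumed weights).
    #include <arm_neon.h>
    #include <stddef.h>

    void BgrToGrayRow(const uint8_t * bgr, size_t width16, uint8_t * gray) // width16: multiple of 16
    {
        for (size_t col = 0; col < width16; col += 16)
        {
            uint8x16x3_t v = vld3q_u8(bgr + 3 * col);                 // de-interleave B, G, R
            uint16x8_t lo = vmull_u8(vget_low_u8(v.val[0]), vdup_n_u8(29));
            lo = vmlal_u8(lo, vget_low_u8(v.val[1]), vdup_n_u8(150));
            lo = vmlal_u8(lo, vget_low_u8(v.val[2]), vdup_n_u8(77));
            uint16x8_t hi = vmull_u8(vget_high_u8(v.val[0]), vdup_n_u8(29));
            hi = vmlal_u8(hi, vget_high_u8(v.val[1]), vdup_n_u8(150));
            hi = vmlal_u8(hi, vget_high_u8(v.val[2]), vdup_n_u8(77));
            uint8x8_t glo = vshrn_n_u16(vaddq_u16(lo, vdupq_n_u16(128)), 8); // +128 rounds, >>8, narrow
            uint8x8_t ghi = vshrn_n_u16(vaddq_u16(hi, vdupq_n_u16(128)), 8);
            vst1q_u8(gray + col, vcombine_u8(glo, ghi));
        }
    }

The weighted sum stays within 16 bits (29 + 150 + 77 = 256, so the maximum is 256 * 255 + 128 < 65536), which is why the deleted kernels can also work in uint16x8_t lanes before narrowing.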
diff --git a/src/3rd/Simd/Simd/SimdNeonBgrToYuv.cpp b/src/3rd/Simd/Simd/SimdNeonBgrToYuv.cpp
deleted file mode 100644
index 984581a9..00000000
--- a/src/3rd/Simd/Simd/SimdNeonBgrToYuv.cpp
+++ /dev/null
@@ -1,218 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#include "Simd/SimdMemory.h"
-#include "Simd/SimdStore.h"
-#include "Simd/SimdConversion.h"
-
-#include "Simd/SimdLog.h"
-
-namespace Simd
-{
-#ifdef SIMD_NEON_ENABLE
-    namespace Neon
-    {
-        const size_t A3 = A * 3;
-        const size_t A6 = A * 6;
-
-        SIMD_INLINE uint16x8_t Average(uint8x16_t a, uint8x16_t b)
-        {
-            return vshrq_n_u16(vpadalq_u8(vpadalq_u8(K16_0002, a), b), 2);
-        }
-
-        template <bool align> SIMD_INLINE void BgrToYuv420p(const uint8_t * bgr0, size_t bgrStride, uint8_t * y0, size_t yStride, uint8_t * u, uint8_t * v)
-        {
-            const uint8_t * bgr1 = bgr0 + bgrStride;
-            uint8_t * y1 = y0 + yStride;
-
-            uint8x16x3_t bgr00 = Load3<align>(bgr0);
-            Store<align>(y0 + 0, BgrToY(bgr00.val[0], bgr00.val[1], bgr00.val[2]));
-
-            uint8x16x3_t bgr01 = Load3<align>(bgr0 + A3);
-            Store<align>(y0 + A, BgrToY(bgr01.val[0], bgr01.val[1], bgr01.val[2]));
-
-            uint8x16x3_t bgr10 = Load3<align>(bgr1);
-            Store<align>(y1 + 0, BgrToY(bgr10.val[0], bgr10.val[1], bgr10.val[2]));
-
-            uint8x16x3_t bgr11 = Load3<align>(bgr1 + A3);
-            Store<align>(y1 + A, BgrToY(bgr11.val[0], bgr11.val[1], bgr11.val[2]));
-
-            uint16x8_t b0 = Average(bgr00.val[0], bgr10.val[0]);
-            uint16x8_t g0 = Average(bgr00.val[1], bgr10.val[1]);
-            uint16x8_t r0 = Average(bgr00.val[2], bgr10.val[2]);
-
-            uint16x8_t b1 = Average(bgr01.val[0], bgr11.val[0]);
-            uint16x8_t g1 = Average(bgr01.val[1], bgr11.val[1]);
-            uint16x8_t r1 = Average(bgr01.val[2], bgr11.val[2]);
-
-            Store<align>(u, PackSaturatedI16(BgrToU(b0, g0, r0), BgrToU(b1, g1, r1)));
-            Store<align>(v, PackSaturatedI16(BgrToV(b0, g0, r0), BgrToV(b1, g1, r1)));
-        }
-
-        template <bool align> void BgrToYuv420p(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * y, size_t yStride,
-            uint8_t * u, size_t uStride, uint8_t * v, size_t vStride)
-        {
-            assert((width % 2 == 0) && (height % 2 == 0) && (width >= DA) && (height >= 2));
-            if (align)
-            {
-                assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride));
-                assert(Aligned(v) && Aligned(vStride) && Aligned(bgr) && Aligned(bgrStride));
-            }
-
-            size_t alignedWidth = AlignLo(width, DA);
-            for (size_t row = 0; row < height; row += 2)
-            {
-                for (size_t colUV = 0, colY = 0, colBgr = 0; colY < alignedWidth; colY += DA, colUV += A, colBgr += A6)
-                    BgrToYuv420p<align>(bgr + colBgr, bgrStride, y + colY, yStride, u + colUV, v + colUV);
-                if (width != alignedWidth)
-                {
-                    size_t offset = width - DA;
-                    BgrToYuv420p<false>(bgr + offset * 3, bgrStride, y + offset, yStride, u + offset / 2, v + offset / 2);
-                }
-                y += 2 * yStride;
-                u += uStride;
-                v += vStride;
-                bgr += 2 * bgrStride;
-            }
-        }
-
-        void BgrToYuv420p(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * y, size_t yStride,
-            uint8_t * u, size_t uStride, uint8_t * v, size_t vStride)
-        {
-            if (Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride)
-                && Aligned(v) && Aligned(vStride) && Aligned(bgr) && Aligned(bgrStride))
-                BgrToYuv420p<true>(bgr, width, height, bgrStride, y, yStride, u, uStride, v, vStride);
-            else
-                BgrToYuv420p<false>(bgr, width, height, bgrStride, y, yStride, u, uStride, v, vStride);
-        }
-
-        SIMD_INLINE uint16x8_t Average(uint8x16_t value)
-        {
-            return vshrq_n_u16(vpadalq_u8(K16_0001, value), 1);
-        }
-
-        template <bool align> SIMD_INLINE void BgrToYuv422p(const uint8_t * bgr, uint8_t * y, uint8_t * u, uint8_t * v)
-        {
-            uint8x16x3_t bgr0 = Load3<align>(bgr);
-            Store<align>(y + 0, BgrToY(bgr0.val[0], bgr0.val[1], bgr0.val[2]));
-
-            uint16x8_t b0 = Average(bgr0.val[0]);
-            uint16x8_t g0 = Average(bgr0.val[1]);
-            uint16x8_t r0 = Average(bgr0.val[2]);
-
-            uint8x16x3_t bgr1 = Load3<align>(bgr + A3);
-            Store<align>(y + A, BgrToY(bgr1.val[0], bgr1.val[1], bgr1.val[2]));
-
-            uint16x8_t b1 = Average(bgr1.val[0]);
-            uint16x8_t g1 = Average(bgr1.val[1]);
-            uint16x8_t r1 = Average(bgr1.val[2]);
-
-            Store<align>(u, PackSaturatedI16(BgrToU(b0, g0, r0), BgrToU(b1, g1, r1)));
-            Store<align>(v, PackSaturatedI16(BgrToV(b0, g0, r0), BgrToV(b1, g1, r1)));
-        }
-
-        template <bool align> void BgrToYuv422p(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * y, size_t yStride,
-            uint8_t * u, size_t uStride, uint8_t * v, size_t vStride)
-        {
-            assert((width % 2 == 0) && (width >= DA));
-            if (align)
-            {
-                assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride));
-                assert(Aligned(v) && Aligned(vStride) && Aligned(bgr) && Aligned(bgrStride));
-            }
-
-            size_t alignedWidth = AlignLo(width, DA);
-            for (size_t row = 0; row < height; ++row)
-            {
-                for (size_t colUV = 0, colY = 0, colBgr = 0; colY < alignedWidth; colY += DA, colUV += A, colBgr += A6)
-                    BgrToYuv422p<align>(bgr + colBgr, y + colY, u + colUV, v + colUV);
-                if (width != alignedWidth)
-                {
-                    size_t offset = width - DA;
-                    BgrToYuv422p<false>(bgr + offset * 3, y + offset, u + offset / 2, v + offset / 2);
-                }
-                y += yStride;
-                u += uStride;
-                v += vStride;
-                bgr += bgrStride;
-            }
-        }
-
-        void BgrToYuv422p(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * y, size_t yStride,
-            uint8_t * u, size_t uStride, uint8_t * v, size_t vStride)
-        {
-            if (Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride)
-                && Aligned(v) && Aligned(vStride) && Aligned(bgr) && Aligned(bgrStride))
-                BgrToYuv422p<true>(bgr, width, height, bgrStride, y, yStride, u, uStride, v, vStride);
-            else
-                BgrToYuv422p<false>(bgr, width, height, bgrStride, y, yStride, u, uStride, v, vStride);
-        }
-
-
-        template <bool align> SIMD_INLINE void BgrToYuv444p(const uint8_t * bgr, uint8_t * y, uint8_t * u, uint8_t * v)
-        {
-            uint8x16x3_t _bgr = Load3<align>(bgr);
-            Store<align>(y, BgrToY(_bgr.val[0], _bgr.val[1], _bgr.val[2]));
-            Store<align>(u, BgrToU(_bgr.val[0], _bgr.val[1], _bgr.val[2]));
-            Store<align>(v, BgrToV(_bgr.val[0], _bgr.val[1], _bgr.val[2]));
-        }
-
-        template <bool align> void BgrToYuv444p(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * y, size_t yStride,
-            uint8_t * u, size_t uStride, uint8_t * v, size_t vStride)
-        {
-            assert(width >= A);
-            if (align)
-            {
-                assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride));
-                assert(Aligned(v) && Aligned(vStride) && Aligned(bgr) && Aligned(bgrStride));
-            }
-
-            size_t alignedWidth = AlignLo(width, A);
-            for (size_t row = 0; row < height; ++row)
-            {
-                for (size_t col = 0, colBgr = 0; col < alignedWidth; col += A, colBgr += A3)
-                    BgrToYuv444p<align>(bgr + colBgr, y + col, u + col, v + col);
-                if (width != alignedWidth)
-                {
-                    size_t offset = width - A;
-                    BgrToYuv444p<false>(bgr + offset * 3, y + offset, u + offset, v + offset);
-                }
-                y += yStride;
-                u += uStride;
-                v += vStride;
-                bgr += bgrStride;
-            }
-        }
-
-        void BgrToYuv444p(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * y, size_t yStride,
-            uint8_t * u, size_t uStride, uint8_t * v, size_t vStride)
-        {
-            if (Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride)
-                && Aligned(v) && Aligned(vStride) && Aligned(bgr) && Aligned(bgrStride))
-                BgrToYuv444p<true>(bgr, width, height, bgrStride, y, yStride, u, uStride, v, vStride);
-            else
-                BgrToYuv444p<false>(bgr, width, height, bgrStride, y, yStride, u, uStride, v, vStride);
-        }
-    }
-#endif// SIMD_NEON_ENABLE
-}
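Editorial note: the Average() helpers in SimdNeonBgrToYuv.cpp above deserve a gloss, since the same trick recurs in the BGRA variant below. vpadalq_u8 adds adjacent byte pairs into 16-bit accumulator lanes, so seeding the accumulator with 2 and shifting right by 2 produces a rounded 2x2 box average, which is exactly what 4:2:0 chroma subsampling needs. A standalone restatement of the deleted helper:

    #include <arm_neon.h>

    // (a0 + a1 + b0 + b1 + 2) / 4 per output lane; the constant 2 matches K16_0002.
    static inline uint16x8_t Average2x2(uint8x16_t rowA, uint8x16_t rowB)
    {
        uint16x8_t sum = vdupq_n_u16(2);   // rounding term
        sum = vpadalq_u8(sum, rowA);       // + a[2i] + a[2i+1]
        sum = vpadalq_u8(sum, rowB);       // + b[2i] + b[2i+1]
        return vshrq_n_u16(sum, 2);        // divide by 4
    }

The single-row Average(value) variant above is the same idea seeded with K16_0001 and shifted by 1, giving a rounded horizontal pair average for 4:2:2.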
diff --git a/src/3rd/Simd/Simd/SimdNeonBgraToBayer.cpp b/src/3rd/Simd/Simd/SimdNeonBgraToBayer.cpp
deleted file mode 100644
index 2b5bc555..00000000
--- a/src/3rd/Simd/Simd/SimdNeonBgraToBayer.cpp
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#include "Simd/SimdMemory.h"
-#include "Simd/SimdStore.h"
-
-namespace Simd
-{
-#ifdef SIMD_NEON_ENABLE
-    namespace Neon
-    {
-        template <int c0, int c1, bool align>
-        SIMD_INLINE void BgraToBayer(const uint8_t * bgra, uint8_t * bayer)
-        {
-            uint8x16x4_t _bgra = Load4<align>(bgra);
-            Store<align>(bayer, vbslq_u8((uint8x16_t)K16_00FF, _bgra.val[c0], _bgra.val[c1]));
-        }
-
-        template <int c00, int c01, int c10, int c11, bool align>
-        void BgraToBayer(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * bayer, size_t bayerStride)
-        {
-            assert(width >= A);
-            if (align)
-                assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(bayer) && Aligned(bayerStride));
-
-            size_t alignedWidth = AlignLo(width, A);
-
-            for (size_t row = 0; row < height; row += 2)
-            {
-                for (size_t col = 0, offset = 0; col < alignedWidth; col += A, offset += QA)
-                    BgraToBayer<c00, c01, align>(bgra + offset, bayer + col);
-                if (alignedWidth != width)
-                    BgraToBayer<c00, c01, false>(bgra + 4 * (width - A), bayer + width - A);
-                bgra += bgraStride;
-                bayer += bayerStride;
-
-                for (size_t col = 0, offset = 0; col < alignedWidth; col += A, offset += QA)
-                    BgraToBayer<c10, c11, align>(bgra + offset, bayer + col);
-                if (alignedWidth != width)
-                    BgraToBayer<c10, c11, false>(bgra + 4 * (width - A), bayer + width - A);
-                bgra += bgraStride;
-                bayer += bayerStride;
-            }
-        }
-
-        template <bool align>
-        void BgraToBayer(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * bayer, size_t bayerStride, SimdPixelFormatType bayerFormat)
-        {
-            assert((width % 2 == 0) && (height % 2 == 0));
-
-            switch (bayerFormat)
-            {
-            case SimdPixelFormatBayerGrbg:
-                BgraToBayer<1, 2, 0, 1, align>(bgra, width, height, bgraStride, bayer, bayerStride);
-                break;
-            case SimdPixelFormatBayerGbrg:
-                BgraToBayer<1, 0, 2, 1, align>(bgra, width, height, bgraStride, bayer, bayerStride);
-                break;
-            case SimdPixelFormatBayerRggb:
-                BgraToBayer<2, 1, 1, 0, align>(bgra, width, height, bgraStride, bayer, bayerStride);
-                break;
-            case SimdPixelFormatBayerBggr:
-                BgraToBayer<0, 1, 1, 2, align>(bgra, width, height, bgraStride, bayer, bayerStride);
-                break;
-            default:
-                assert(0);
-            }
-        }
-
-        void BgraToBayer(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * bayer, size_t bayerStride, SimdPixelFormatType bayerFormat)
-        {
-            if (Aligned(bgra) && Aligned(bgraStride) && Aligned(bayer) && Aligned(bayerStride))
-                BgraToBayer<true>(bgra, width, height, bgraStride, bayer, bayerStride, bayerFormat);
-            else
-                BgraToBayer<false>(bgra, width, height, bgraStride, bayer, bayerStride, bayerFormat);
-        }
-    }
-#endif// SIMD_NEON_ENABLE
-}
diff --git a/src/3rd/Simd/Simd/SimdNeonBgraToBgr.cpp b/src/3rd/Simd/Simd/SimdNeonBgraToBgr.cpp
deleted file mode 100644
index 55d7b2eb..00000000
--- a/src/3rd/Simd/Simd/SimdNeonBgraToBgr.cpp
+++ /dev/null
@@ -1,112 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2020 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#include "Simd/SimdStore.h"
-#include "Simd/SimdMemory.h"
-
-namespace Simd
-{
-#ifdef SIMD_NEON_ENABLE
-    namespace Neon
-    {
-        const size_t A3 = A * 3;
-        const size_t A4 = A * 4;
-
-        template <bool align> SIMD_INLINE void BgraToBgr(const uint8_t * bgra, uint8_t * bgr)
-        {
-            uint8x16x4_t _bgra = Load4<align>(bgra);
-            Store3<align>(bgr, *(uint8x16x3_t*)&_bgra);
-        }
-
-        template <bool align> void BgraToBgr(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * bgr, size_t bgrStride)
-        {
-            assert(width >= A);
-            if (align)
-                assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(bgr) && Aligned(bgrStride));
-
-            size_t alignedWidth = AlignLo(width, A);
-            if (width == alignedWidth)
-                alignedWidth -= A;
-
-            for (size_t row = 0; row < height; ++row)
-            {
-                for (size_t col = 0, colBgra = 0, colBgr = 0; col < alignedWidth; col += A, colBgra += A4, colBgr += A3)
-                    BgraToBgr<align>(bgra + colBgra, bgr + colBgr);
-                if (width != alignedWidth)
-                    BgraToBgr<false>(bgra + 4 * (width - A), bgr + 3 * (width - A));
-                bgra += bgraStride;
-                bgr += bgrStride;
-            }
-        }
-
-        void BgraToBgr(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * bgr, size_t bgrStride)
-        {
-            if (Aligned(bgra) && Aligned(bgraStride) && Aligned(bgr) && Aligned(bgrStride))
-                BgraToBgr<true>(bgra, width, height, bgraStride, bgr, bgrStride);
-            else
-                BgraToBgr<false>(bgra, width, height, bgraStride, bgr, bgrStride);
-        }
-
-        //---------------------------------------------------------------------
-
-        template <bool align> SIMD_INLINE void BgraToRgb(const uint8_t* bgra, uint8_t* rgb)
-        {
-            uint8x16x4_t _bgra = Load4<align>(bgra);
-            uint8x16x3_t _rgb;
-            _rgb.val[0] = _bgra.val[2];
-            _rgb.val[1] = _bgra.val[1];
-            _rgb.val[2] = _bgra.val[0];
-            Store3<align>(rgb, _rgb);
-        }
-
-        template <bool align> void BgraToRgb(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgb, size_t rgbStride)
-        {
-            assert(width >= A);
-            if (align)
-                assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(rgb) && Aligned(rgbStride));
-
-            size_t alignedWidth = AlignLo(width, A);
-            if (width == alignedWidth)
-                alignedWidth -= A;
-
-            for (size_t row = 0; row < height; ++row)
-            {
-                for (size_t col = 0, colBgra = 0, colRgb = 0; col < alignedWidth; col += A, colBgra += A4, colRgb += A3)
-                    BgraToRgb<align>(bgra + colBgra, rgb + colRgb);
-                if (width != alignedWidth)
-                    BgraToRgb<false>(bgra + 4 * (width - A), rgb + 3 * (width - A));
-                bgra += bgraStride;
-                rgb += rgbStride;
-            }
-        }
-
-        void BgraToRgb(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgb, size_t rgbStride)
-        {
-            if (Aligned(bgra) && Aligned(bgraStride) && Aligned(rgb) && Aligned(rgbStride))
-                BgraToRgb<true>(bgra, width, height, bgraStride, rgb, rgbStride);
-            else
-                BgraToRgb<false>(bgra, width, height, bgraStride, rgb, rgbStride);
-        }
-    }
-#endif// SIMD_NEON_ENABLE
-}
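Editorial note: in SimdNeonBgraToBayer.cpp above, a single vbslq_u8 with the constant K16_00FF builds a whole Bayer row. Reinterpreted as bytes, 0x00FF repeats as FF,00,FF,00,..., so even pixels take one BGRA channel and odd pixels the other; the c0/c1 template arguments in the deleted code pick the channel pairing per Bayer format. A minimal restatement of the trick:

    #include <arm_neon.h>

    // One Bayer output row from two already-extracted channel vectors.
    static inline uint8x16_t BayerRow(uint8x16_t even_channel, uint8x16_t odd_channel)
    {
        const uint8x16_t mask = vreinterpretq_u8_u16(vdupq_n_u16(0x00FF)); // FF,00,FF,00,...
        return vbslq_u8(mask, even_channel, odd_channel); // set mask bits select the first operand
    }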
diff --git a/src/3rd/Simd/Simd/SimdNeonBgraToGray.cpp b/src/3rd/Simd/Simd/SimdNeonBgraToGray.cpp
deleted file mode 100644
index 24fc2285..00000000
--- a/src/3rd/Simd/Simd/SimdNeonBgraToGray.cpp
+++ /dev/null
@@ -1,71 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#include "Simd/SimdStore.h"
-#include "Simd/SimdMemory.h"
-#include "Simd/SimdConversion.h"
-
-namespace Simd
-{
-#ifdef SIMD_NEON_ENABLE
-    namespace Neon
-    {
-        SIMD_INLINE uint8x8_t BgraToGray(uint8x8x4_t bgra)
-        {
-            return vmovn_u16(BgrToGray(vmovl_u8(bgra.val[0]), vmovl_u8(bgra.val[1]), vmovl_u8(bgra.val[2])));
-        }
-
-        template <bool align> void BgraToGray(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * gray, size_t grayStride)
-        {
-            assert(width >= HA);
-            if (align)
-                assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(gray) && Aligned(grayStride));
-
-            size_t alignedWidth = AlignLo(width, HA);
-            for (size_t row = 0; row < height; ++row)
-            {
-                for (size_t col = 0; col < alignedWidth; col += HA)
-                {
-                    uint8x8x4_t _bgra = LoadHalf4<align>(bgra + 4 * col);
-                    Store<align>(gray + col, BgraToGray(_bgra));
-                }
-                if (alignedWidth != width)
-                {
-                    uint8x8x4_t _bgra = LoadHalf4<false>(bgra + 4 * (width - HA));
-                    Store<false>(gray + width - HA, BgraToGray(_bgra));
-                }
-                bgra += bgraStride;
-                gray += grayStride;
-            }
-        }
-
-        void BgraToGray(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * gray, size_t grayStride)
-        {
-            if (Aligned(bgra) && Aligned(gray) && Aligned(bgraStride) && Aligned(grayStride))
-                BgraToGray<true>(bgra, width, height, bgraStride, gray, grayStride);
-            else
-                BgraToGray<false>(bgra, width, height, bgraStride, gray, grayStride);
-        }
-    }
-#endif// SIMD_NEON_ENABLE
-}
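Editorial note: every row loop in these deleted kernels, including BgraToGray above, handles the ragged right edge the same way: process AlignLo(width, A) pixels with full vectors, then, if width is not a multiple of A, reprocess the last A pixels at an unaligned offset instead of falling back to scalar code. The tail simply overlaps pixels that were already written, which is safe because the computation is pure per-pixel. A self-contained sketch with A = 16 (NegateRow is a hypothetical stand-in kernel, not a Simd function):

    #include <arm_neon.h>
    #include <stddef.h>

    void NegateRow(const uint8_t * src, size_t width, uint8_t * dst) // assumes width >= 16
    {
        const size_t A = 16;
        size_t alignedWidth = width & ~(A - 1);               // AlignLo(width, A)
        for (size_t col = 0; col < alignedWidth; col += A)
            vst1q_u8(dst + col, vmvnq_u8(vld1q_u8(src + col)));
        if (alignedWidth != width)                            // overlapping unaligned tail
            vst1q_u8(dst + width - A, vmvnq_u8(vld1q_u8(src + width - A)));
    }

This is also why the kernels assert width >= A (or >= DA for the subsampling variants): the overlapping tail needs at least one full vector of input.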
diff --git a/src/3rd/Simd/Simd/SimdNeonBgraToYuv.cpp b/src/3rd/Simd/Simd/SimdNeonBgraToYuv.cpp
deleted file mode 100644
index ab20615c..00000000
--- a/src/3rd/Simd/Simd/SimdNeonBgraToYuv.cpp
+++ /dev/null
@@ -1,289 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#include "Simd/SimdMemory.h"
-#include "Simd/SimdStore.h"
-#include "Simd/SimdConversion.h"
-
-#include "Simd/SimdLog.h"
-
-namespace Simd
-{
-#ifdef SIMD_NEON_ENABLE
-    namespace Neon
-    {
-        SIMD_INLINE uint16x8_t Average(uint8x16_t a, uint8x16_t b)
-        {
-            return vshrq_n_u16(vpadalq_u8(vpadalq_u8(K16_0002, a), b), 2);
-        }
-
-        template <bool align> SIMD_INLINE void BgraToYuv420p(const uint8_t * bgra0, size_t bgraStride, uint8_t * y0, size_t yStride, uint8_t * u, uint8_t * v)
-        {
-            const uint8_t * bgra1 = bgra0 + bgraStride;
-            uint8_t * y1 = y0 + yStride;
-
-            uint8x16x4_t bgra00 = Load4<align>(bgra0);
-            Store<align>(y0 + 0, BgrToY(bgra00.val[0], bgra00.val[1], bgra00.val[2]));
-
-            uint8x16x4_t bgra01 = Load4<align>(bgra0 + QA);
-            Store<align>(y0 + A, BgrToY(bgra01.val[0], bgra01.val[1], bgra01.val[2]));
-
-            uint8x16x4_t bgra10 = Load4<align>(bgra1);
-            Store<align>(y1 + 0, BgrToY(bgra10.val[0], bgra10.val[1], bgra10.val[2]));
-
-            uint8x16x4_t bgra11 = Load4<align>(bgra1 + QA);
-            Store<align>(y1 + A, BgrToY(bgra11.val[0], bgra11.val[1], bgra11.val[2]));
-
-            uint16x8_t b0 = Average(bgra00.val[0], bgra10.val[0]);
-            uint16x8_t g0 = Average(bgra00.val[1], bgra10.val[1]);
-            uint16x8_t r0 = Average(bgra00.val[2], bgra10.val[2]);
-
-            uint16x8_t b1 = Average(bgra01.val[0], bgra11.val[0]);
-            uint16x8_t g1 = Average(bgra01.val[1], bgra11.val[1]);
-            uint16x8_t r1 = Average(bgra01.val[2], bgra11.val[2]);
-
-            Store<align>(u, PackSaturatedI16(BgrToU(b0, g0, r0), BgrToU(b1, g1, r1)));
-            Store<align>(v, PackSaturatedI16(BgrToV(b0, g0, r0), BgrToV(b1, g1, r1)));
-        }
-
-        template <bool align> void BgraToYuv420p(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * y, size_t yStride,
-            uint8_t * u, size_t uStride, uint8_t * v, size_t vStride)
-        {
-            assert((width % 2 == 0) && (height % 2 == 0) && (width >= DA) && (height >= 2));
-            if (align)
-            {
-                assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride));
-                assert(Aligned(v) && Aligned(vStride) && Aligned(bgra) && Aligned(bgraStride));
-            }
-
-            size_t alignedWidth = AlignLo(width, DA);
-            const size_t A8 = A * 8;
-            for (size_t row = 0; row < height; row += 2)
-            {
-                for (size_t colUV = 0, colY = 0, colBgra = 0; colY < alignedWidth; colY += DA, colUV += A, colBgra += A8)
-                    BgraToYuv420p<align>(bgra + colBgra, bgraStride, y + colY, yStride, u + colUV, v + colUV);
-                if (width != alignedWidth)
-                {
-                    size_t offset = width - DA;
-                    BgraToYuv420p<false>(bgra + offset * 4, bgraStride, y + offset, yStride, u + offset / 2, v + offset / 2);
-                }
-                y += 2 * yStride;
-                u += uStride;
-                v += vStride;
-                bgra += 2 * bgraStride;
-            }
-        }
-
-        void BgraToYuv420p(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * y, size_t yStride,
-            uint8_t * u, size_t uStride, uint8_t * v, size_t vStride)
-        {
-            if (Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride)
-                && Aligned(v) && Aligned(vStride) && Aligned(bgra) && Aligned(bgraStride))
-                BgraToYuv420p<true>(bgra, width, height, bgraStride, y, yStride, u, uStride, v, vStride);
-            else
-                BgraToYuv420p<false>(bgra, width, height, bgraStride, y, yStride, u, uStride, v, vStride);
-        }
-
-        SIMD_INLINE uint16x8_t Average(uint8x16_t value)
-        {
-            return vshrq_n_u16(vpadalq_u8(K16_0001, value), 1);
-        }
-
-        template <bool align> SIMD_INLINE void BgraToYuv422p(const uint8_t * bgra, uint8_t * y, uint8_t * u, uint8_t * v)
-        {
-            uint8x16x4_t bgra0 = Load4<align>(bgra);
-            Store<align>(y + 0, BgrToY(bgra0.val[0], bgra0.val[1], bgra0.val[2]));
-
-            uint16x8_t b0 = Average(bgra0.val[0]);
-            uint16x8_t g0 = Average(bgra0.val[1]);
-            uint16x8_t r0 = Average(bgra0.val[2]);
-
-            uint8x16x4_t bgra1 = Load4<align>(bgra + QA);
-            Store<align>(y + A, BgrToY(bgra1.val[0], bgra1.val[1], bgra1.val[2]));
-
-            uint16x8_t b1 = Average(bgra1.val[0]);
-            uint16x8_t g1 = Average(bgra1.val[1]);
-            uint16x8_t r1 = Average(bgra1.val[2]);
-
-            Store<align>(u, PackSaturatedI16(BgrToU(b0, g0, r0), BgrToU(b1, g1, r1)));
-            Store<align>(v, PackSaturatedI16(BgrToV(b0, g0, r0), BgrToV(b1, g1, r1)));
-        }
-
-        template <bool align> void BgraToYuv422p(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * y, size_t yStride,
-            uint8_t * u, size_t uStride, uint8_t * v, size_t vStride)
-        {
-            assert((width % 2 == 0) && (width >= DA));
-            if (align)
-            {
-                assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride));
-                assert(Aligned(v) && Aligned(vStride) && Aligned(bgra) && Aligned(bgraStride));
-            }
-
-            size_t alignedWidth = AlignLo(width, DA);
-            const size_t A8 = A * 8;
-            for (size_t row = 0; row < height; ++row)
-            {
-                for (size_t colUV = 0, colY = 0, colBgra = 0; colY < alignedWidth; colY += DA, colUV += A, colBgra += A8)
-                    BgraToYuv422p<align>(bgra + colBgra, y + colY, u + colUV, v + colUV);
-                if (width != alignedWidth)
-                {
-                    size_t offset = width - DA;
-                    BgraToYuv422p<false>(bgra + offset * 4, y + offset, u + offset / 2, v + offset / 2);
-                }
-                y += yStride;
-                u += uStride;
-                v += vStride;
-                bgra += bgraStride;
-            }
-        }
-
-        void BgraToYuv422p(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * y, size_t yStride,
-            uint8_t * u, size_t uStride, uint8_t * v, size_t vStride)
-        {
-            if (Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride)
-                && Aligned(v) && Aligned(vStride) && Aligned(bgra) && Aligned(bgraStride))
-                BgraToYuv422p<true>(bgra, width, height, bgraStride, y, yStride, u, uStride, v, vStride);
-            else
-                BgraToYuv422p<false>(bgra, width, height, bgraStride, y, yStride, u, uStride, v, vStride);
-        }
-
-        template <bool align> SIMD_INLINE void BgraToYuv444p(const uint8_t * bgra, uint8_t * y, uint8_t * u, uint8_t * v)
-        {
-            uint8x16x4_t _bgra = Load4<align>(bgra);
-            Store<align>(y, BgrToY(_bgra.val[0], _bgra.val[1], _bgra.val[2]));
-            Store<align>(u, BgrToU(_bgra.val[0], _bgra.val[1], _bgra.val[2]));
-            Store<align>(v, BgrToV(_bgra.val[0], _bgra.val[1], _bgra.val[2]));
-        }
-
-        template <bool align> void BgraToYuv444p(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * y, size_t yStride,
-            uint8_t * u, size_t uStride, uint8_t * v, size_t vStride)
-        {
-            assert(width >= A);
-            if (align)
-            {
-                assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride));
-                assert(Aligned(v) && Aligned(vStride) && Aligned(bgra) && Aligned(bgraStride));
-            }
-
-            size_t alignedWidth = AlignLo(width, A);
-            for (size_t row = 0; row < height; ++row)
-            {
-                for (size_t col = 0, colBgra = 0; col < alignedWidth; col += A, colBgra += QA)
-                    BgraToYuv444p<align>(bgra + colBgra, y + col, u + col, v + col);
-                if (width != alignedWidth)
-                {
-                    size_t offset = width - A;
-                    BgraToYuv444p<false>(bgra + offset * 4, y + offset, u + offset, v + offset);
-                }
-                y += yStride;
-                u += uStride;
-                v += vStride;
-                bgra += bgraStride;
-            }
-        }
-
-        void BgraToYuv444p(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * y, size_t yStride,
-            uint8_t * u, size_t uStride, uint8_t * v, size_t vStride)
-        {
-            if (Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride)
-                && Aligned(v) && Aligned(vStride) && Aligned(bgra) && Aligned(bgraStride))
-                BgraToYuv444p<true>(bgra, width, height, bgraStride, y, yStride, u, uStride, v, vStride);
-            else
-                BgraToYuv444p<false>(bgra, width, height, bgraStride, y, yStride, u, uStride, v, vStride);
-        }
-
-        template <bool align> SIMD_INLINE void BgraToYuva420p(const uint8_t * bgra0, size_t bgraStride, uint8_t * y0, size_t yStride, uint8_t * u, uint8_t * v, uint8_t * a0, size_t aStride)
-        {
-            const uint8_t * bgra1 = bgra0 + bgraStride;
-            uint8_t * y1 = y0 + yStride;
-            uint8_t * a1 = a0 + aStride;
-
-            uint8x16x4_t bgra00 = Load4<align>(bgra0);
-            Store<align>(y0 + 0, BgrToY(bgra00.val[0], bgra00.val[1], bgra00.val[2]));
-            Store<align>(a0 + 0, bgra00.val[3]);
-
-            uint8x16x4_t bgra01 = Load4<align>(bgra0 + QA);
-            Store<align>(y0 + A, BgrToY(bgra01.val[0], bgra01.val[1], bgra01.val[2]));
-            Store<align>(a0 + A, bgra01.val[3]);
-
-            uint8x16x4_t bgra10 = Load4<align>(bgra1);
-            Store<align>(y1 + 0, BgrToY(bgra10.val[0], bgra10.val[1], bgra10.val[2]));
-            Store<align>(a1 + 0, bgra10.val[3]);
-
-            uint8x16x4_t bgra11 = Load4<align>(bgra1 + QA);
-            Store<align>(y1 + A, BgrToY(bgra11.val[0], bgra11.val[1], bgra11.val[2]));
-            Store<align>(a1 + A, bgra11.val[3]);
-
-            uint16x8_t b0 = Average(bgra00.val[0], bgra10.val[0]);
-            uint16x8_t g0 = Average(bgra00.val[1], bgra10.val[1]);
-            uint16x8_t r0 = Average(bgra00.val[2], bgra10.val[2]);
-
-            uint16x8_t b1 = Average(bgra01.val[0], bgra11.val[0]);
-            uint16x8_t g1 = Average(bgra01.val[1], bgra11.val[1]);
-            uint16x8_t r1 = Average(bgra01.val[2], bgra11.val[2]);
-
-            Store<align>(u, PackSaturatedI16(BgrToU(b0, g0, r0), BgrToU(b1, g1, r1)));
-            Store<align>(v, PackSaturatedI16(BgrToV(b0, g0, r0), BgrToV(b1, g1, r1)));
-        }
-
-        template <bool align> void BgraToYuva420p(const uint8_t * bgra, size_t bgraStride, size_t width, size_t height, uint8_t * y, size_t yStride,
-            uint8_t * u, size_t uStride, uint8_t * v, size_t vStride, uint8_t * a, size_t aStride)
-        {
-            assert((width % 2 == 0) && (height % 2 == 0) && (width >= DA) && (height >= 2));
-            if (align)
-            {
-                assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride));
-                assert(Aligned(v) && Aligned(vStride) && Aligned(a) && Aligned(aStride) && Aligned(bgra) && Aligned(bgraStride));
-            }
-
-            size_t alignedWidth = AlignLo(width, DA);
-            const size_t A8 = A * 8;
-            for (size_t row = 0; row < height; row += 2)
-            {
-                for (size_t colUV = 0, colYA = 0, colBgra = 0; colYA < alignedWidth; colYA += DA, colUV += A, colBgra += A8)
-                    BgraToYuva420p<align>(bgra + colBgra, bgraStride, y + colYA, yStride, u + colUV, v + colUV, a + colYA, aStride);
-                if (width != alignedWidth)
-                {
-                    size_t offset = width - DA;
-                    BgraToYuva420p<false>(bgra + offset * 4, bgraStride, y + offset, yStride, u + offset / 2, v + offset / 2, a + offset, aStride);
-                }
-                y += 2 * yStride;
-                u += uStride;
-                v += vStride;
-                a += 2 * aStride;
-                bgra += 2 * bgraStride;
-            }
-        }
-
-        void BgraToYuva420p(const uint8_t * bgra, size_t bgraStride, size_t width, size_t height, uint8_t * y, size_t yStride,
-            uint8_t * u, size_t uStride, uint8_t * v, size_t vStride, uint8_t * a, size_t aStride)
-        {
-            if (Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride) && Aligned(v) && Aligned(vStride)
-                && Aligned(a) && Aligned(aStride) && Aligned(bgra) && Aligned(bgraStride))
-                BgraToYuva420p<true>(bgra, bgraStride, width, height, y, yStride, u, uStride, v, vStride, a, aStride);
-            else
-                BgraToYuva420p<false>(bgra, bgraStride, width, height, y, yStride, u, uStride, v, vStride, a, aStride);
-        }
-    }
-#endif// SIMD_NEON_ENABLE
-}
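Editorial note: for readers auditing what BgraToYuv420p above computed, a scalar reference of the same loop structure may help: two BGRA rows in, two Y rows plus one half-resolution U and V row out. The helpers toY/toU/toV below are stand-ins for the fixed-point converters in SimdConversion.h (they are not defined in this patch), and the rounded 2x2 averages mirror the NEON Average() helper:

    #include <stdint.h>
    #include <stddef.h>

    void BgraToYuv420pRef(const uint8_t * bgra, size_t stride, size_t width, size_t height,
        uint8_t * y, size_t yStride, uint8_t * u, size_t uStride, uint8_t * v, size_t vStride,
        uint8_t (*toY)(int b, int g, int r),
        uint8_t (*toU)(int b, int g, int r),
        uint8_t (*toV)(int b, int g, int r))
    {
        for (size_t row = 0; row < height; row += 2)
        {
            for (size_t col = 0; col < width; col += 2)
            {
                int b = 0, g = 0, r = 0;
                for (size_t dy = 0; dy < 2; ++dy)
                    for (size_t dx = 0; dx < 2; ++dx)
                    {
                        const uint8_t * p = bgra + (row + dy) * stride + (col + dx) * 4;
                        y[(row + dy) * yStride + col + dx] = toY(p[0], p[1], p[2]);
                        b += p[0]; g += p[1]; r += p[2];
                    }
                // one chroma sample per 2x2 block, from the rounded average
                u[(row / 2) * uStride + col / 2] = toU((b + 2) / 4, (g + 2) / 4, (r + 2) / 4);
                v[(row / 2) * vStride + col / 2] = toV((b + 2) / 4, (g + 2) / 4, (r + 2) / 4);
            }
        }
    }

The BgraToYuva420p variant deleted above is identical except that the fourth BGRA channel is copied straight through to a full-resolution alpha plane.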
diff --git a/src/3rd/Simd/Simd/SimdNeonBinarization.cpp b/src/3rd/Simd/Simd/SimdNeonBinarization.cpp
deleted file mode 100644
index b42bea71..00000000
--- a/src/3rd/Simd/Simd/SimdNeonBinarization.cpp
+++ /dev/null
@@ -1,277 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#include "Simd/SimdMemory.h"
-#include "Simd/SimdStore.h"
-#include "Simd/SimdCompare.h"
-#include "Simd/SimdSet.h"
-
-namespace Simd
-{
-#ifdef SIMD_NEON_ENABLE
-    namespace Neon
-    {
-        template <SimdCompareType compareType, bool align>
-        void Binarization(const uint8_t * src, size_t srcStride, size_t width, size_t height,
-            uint8_t value, uint8_t positive, uint8_t negative, uint8_t * dst, size_t dstStride)
-        {
-            assert(width >= A);
-            if (align)
-                assert(Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride));
-
-            size_t alignedWidth = Simd::AlignLo(width, A);
-
-            uint8x16_t _value = vdupq_n_u8(value);
-            uint8x16_t _positive = vdupq_n_u8(positive);
-            uint8x16_t _negative = vdupq_n_u8(negative);
-
-            for (size_t row = 0; row < height; ++row)
-            {
-                for (size_t col = 0; col < alignedWidth; col += A)
-                {
-                    const uint8x16_t mask = Compare8u<compareType>(Load<align>(src + col), _value);
-                    Store<align>(dst + col, vbslq_u8(mask, _positive, _negative));
-                }
-                if (alignedWidth != width)
-                {
-                    const uint8x16_t mask = Compare8u<compareType>(Load<false>(src + width - A), _value);
-                    Store<false>(dst + width - A, vbslq_u8(mask, _positive, _negative));
-                }
-                src += srcStride;
-                dst += dstStride;
-            }
-        }
-
-        template <SimdCompareType compareType>
-        void Binarization(const uint8_t * src, size_t srcStride, size_t width, size_t height,
-            uint8_t value, uint8_t positive, uint8_t negative, uint8_t * dst, size_t dstStride)
-        {
-            if (Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride))
-                Binarization<compareType, true>(src, srcStride, width, height, value, positive, negative, dst, dstStride);
-            else
-                Binarization<compareType, false>(src, srcStride, width, height, value, positive, negative, dst, dstStride);
-        }
-
-        void Binarization(const uint8_t * src, size_t srcStride, size_t width, size_t height,
-            uint8_t value, uint8_t positive, uint8_t negative, uint8_t * dst, size_t dstStride, SimdCompareType compareType)
-        {
-            switch (compareType)
-            {
-            case SimdCompareEqual:
-                return Binarization<SimdCompareEqual>(src, srcStride, width, height, value, positive, negative, dst, dstStride);
-            case SimdCompareNotEqual:
-                return Binarization<SimdCompareNotEqual>(src, srcStride, width, height, value, positive, negative, dst, dstStride);
-            case SimdCompareGreater:
-                return Binarization<SimdCompareGreater>(src, srcStride, width, height, value, positive, negative, dst, dstStride);
-            case SimdCompareGreaterOrEqual:
-                return Binarization<SimdCompareGreaterOrEqual>(src, srcStride, width, height, value, positive, negative, dst, dstStride);
-            case SimdCompareLesser:
-                return Binarization<SimdCompareLesser>(src, srcStride, width, height, value, positive, negative, dst, dstStride);
-            case SimdCompareLesserOrEqual:
-                return Binarization<SimdCompareLesserOrEqual>(src, srcStride, width, height, value, positive, negative, dst, dstStride);
-            default:
-                assert(0);
-            }
-        }
-
-        namespace
-        {
-            struct Buffer
-            {
-                Buffer(size_t width, size_t edge)
-                {
-                    size_t size = sizeof(uint16_t)*(width + 2 * edge) + sizeof(uint32_t)*(2 * width + 2 * edge);
-                    _p = Allocate(size);
-                    memset(_p, 0, size);
-                    sa = (uint16_t*)_p + edge;
-                    s0a0 = (uint32_t*)(sa + width + edge) + edge;
-                    sum = (uint32_t*)(s0a0 + width + edge);
-                }
-
-                ~Buffer()
-                {
-                    Free(_p);
-                }
-
-                uint16_t * sa;
-                uint32_t * s0a0;
-                uint32_t * sum;
-            private:
-                void *_p;
-            };
-        }
-
-        template <bool align, SimdCompareType compareType>
-        SIMD_INLINE void AddRows(const uint8_t * src, uint16_t * sa, const uint8x16_t & value, const uint8x16_t & mask)
-        {
-            const uint8x16_t inc = vandq_u8(Compare8u<compareType>(Load<align>(src), value), mask);
-            uint8x16x2_t _sa = Load2<true>((uint8_t*)sa);
-            _sa.val[0] = vaddq_u8(_sa.val[0], inc);
-            _sa.val[1] = vaddq_u8(_sa.val[1], mask);
-            Store2<true>((uint8_t*)sa, _sa);
-        }
-
-        template <bool align, SimdCompareType compareType>
-        SIMD_INLINE void SubRows(const uint8_t * src, uint16_t * sa, const uint8x16_t & value, const uint8x16_t & mask)
-        {
-            const uint8x16_t dec = vandq_u8(Compare8u<compareType>(Load<align>(src), value), mask);
-            uint8x16x2_t _sa = Load2<true>((uint8_t*)sa);
-            _sa.val[0] = vsubq_u8(_sa.val[0], dec);
-            _sa.val[1] = vsubq_u8(_sa.val[1], mask);
-            Store2<true>((uint8_t*)sa, _sa);
-        }
-
-        SIMD_INLINE uint32x4_t CompareSum(const uint32x4_t & sum, const uint32x4_t & area, const uint32x4_t & threshold)
-        {
-            return vcgtq_u32(vmulq_u32(sum, K32_000000FF), vmulq_u32(area, threshold));
-        }
-
-        template <bool align>
-        SIMD_INLINE uint16x8_t CompareSum(const uint16_t * sum, const uint32x4_t & threshold)
-        {
-            uint16x8x2_t _sum = Load2<align>(sum);
-            uint32x4_t lo = CompareSum(UnpackU16<0>(_sum.val[0]), UnpackU16<0>(_sum.val[1]), threshold);
-            uint32x4_t hi = CompareSum(UnpackU16<1>(_sum.val[0]), UnpackU16<1>(_sum.val[1]), threshold);
-            return PackU32(lo, hi);
-        }
-
-        template <bool align>
-        SIMD_INLINE uint8x16_t CompareSum(const uint32_t * sum, const uint32x4_t & threshold)
-        {
-            uint16x8_t lo = CompareSum<align>((uint16_t*)sum + 0, threshold);
-            uint16x8_t hi = CompareSum<align>((uint16_t*)sum + A, threshold);
-            return PackU16(lo, hi);
-        }
-
-        template <SimdCompareType compareType, bool align>
-        void AveragingBinarization(const uint8_t * src, size_t srcStride, size_t width, size_t height,
-            uint8_t value, size_t neighborhood, uint8_t threshold, uint8_t positive, uint8_t negative, uint8_t * dst, size_t dstStride)
-        {
-            assert(width > neighborhood && height > neighborhood && neighborhood < 0x7F);
-
-            const size_t alignedWidth = AlignLo(width, A);
-
-            const uint8x16_t tailMask = ShiftLeft(K8_01, A - width + alignedWidth);
-            uint8x16_t _value = vdupq_n_u8(value);
-            uint8x16_t _positive = vdupq_n_u8(positive);
-            uint8x16_t _negative = vdupq_n_u8(negative);
-            uint32x4_t _threshold = vdupq_n_u32(threshold);
-
-            Buffer buffer(AlignHi(width, A), AlignHi(neighborhood + 1, A));
-
-            for (size_t row = 0; row < neighborhood; ++row)
-            {
-                const uint8_t * s = src + row*srcStride;
-                for (size_t col = 0; col < alignedWidth; col += A)
-                    AddRows<align, compareType>(s + col, buffer.sa + col, _value, K8_01);
-                if (alignedWidth != width)
-                    AddRows<false, compareType>(s + width - A, buffer.sa + width - A, _value, tailMask);
-            }
-
-            for (size_t row = 0; row < height; ++row)
-            {
-                if (row < height - neighborhood)
-                {
-                    const uint8_t * s = src + (row + neighborhood)*srcStride;
-                    for (size_t col = 0; col < alignedWidth; col += A)
-                        AddRows<align, compareType>(s + col, buffer.sa + col, _value, K8_01);
-                    if (alignedWidth != width)
-                        AddRows<false, compareType>(s + width - A, buffer.sa + width - A, _value, tailMask);
-                }
-                if (row > neighborhood)
-                {
-                    const uint8_t * s = src + (row - neighborhood - 1)*srcStride;
-                    for (size_t col = 0; col < alignedWidth; col += A)
-                        SubRows<align, compareType>(s + col, buffer.sa + col, _value, K8_01);
-                    if (alignedWidth != width)
-                        SubRows<false, compareType>(s + width - A, buffer.sa + width - A, _value, tailMask);
-                }
-
-                for (size_t col = 0; col < width; col += HA)
-                {
-                    const uint8x16_t sa = Load<true>((uint8_t*)(buffer.sa + col));
-                    Store<true>((uint16_t*)(buffer.s0a0 + col + 0), UnpackU8<0>(sa));
-                    Store<true>((uint16_t*)(buffer.s0a0 + col + 4), UnpackU8<1>(sa));
-                }
-
-                uint32_t sum = 0;
-                for (size_t col = 0; col < neighborhood; ++col)
-                {
-                    sum += buffer.s0a0[col];
-                }
-                for (size_t col = 0; col < width; ++col)
-                {
-                    sum += buffer.s0a0[col + neighborhood];
-                    sum -= buffer.s0a0[col - neighborhood - 1];
-                    buffer.sum[col] = sum;
-                }
-
-                for (size_t col = 0; col < alignedWidth; col += A)
-                {
-                    const uint8x16_t mask = CompareSum<true>(buffer.sum + col, _threshold);
-                    Store<align>(dst + col, vbslq_u8(mask, _positive, _negative));
-                }
-                if (alignedWidth != width)
-                {
-                    const uint8x16_t mask = CompareSum<false>(buffer.sum + width - A, _threshold);
-                    Store<false>(dst + width - A, vbslq_u8(mask, _positive, _negative));
-                }
-                dst += dstStride;
-            }
-        }
-
-        template <SimdCompareType compareType>
-        void AveragingBinarization(const uint8_t * src, size_t srcStride, size_t width, size_t height,
-            uint8_t value, size_t neighborhood, uint8_t threshold, uint8_t positive, uint8_t negative, uint8_t * dst, size_t dstStride)
-        {
-            if (Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride))
-                AveragingBinarization<compareType, true>(src, srcStride, width, height, value, neighborhood, threshold, positive, negative, dst, dstStride);
-            else
-                AveragingBinarization<compareType, false>(src, srcStride, width, height, value, neighborhood, threshold, positive, negative, dst, dstStride);
-        }
-
-        void AveragingBinarization(const uint8_t * src, size_t srcStride, size_t width, size_t height,
-            uint8_t value, size_t neighborhood, uint8_t threshold, uint8_t positive, uint8_t negative,
-            uint8_t * dst, size_t dstStride, SimdCompareType compareType)
-        {
-            switch (compareType)
-            {
-            case SimdCompareEqual:
-                return AveragingBinarization<SimdCompareEqual>(src, srcStride, width, height, value, neighborhood, threshold, positive, negative, dst, dstStride);
-            case SimdCompareNotEqual:
-                return AveragingBinarization<SimdCompareNotEqual>(src, srcStride, width, height, value, neighborhood, threshold, positive, negative, dst, dstStride);
-            case SimdCompareGreater:
-                return AveragingBinarization<SimdCompareGreater>(src, srcStride, width, height, value, neighborhood, threshold, positive, negative, dst, dstStride);
-            case SimdCompareGreaterOrEqual:
-                return AveragingBinarization<SimdCompareGreaterOrEqual>(src, srcStride, width, height, value, neighborhood, threshold, positive, negative, dst, dstStride);
-            case SimdCompareLesser:
-                return AveragingBinarization<SimdCompareLesser>(src, srcStride, width, height, value, neighborhood, threshold, positive, negative, dst, dstStride);
-            case SimdCompareLesserOrEqual:
-                return AveragingBinarization<SimdCompareLesserOrEqual>(src, srcStride, width, height, value, neighborhood, threshold, positive, negative, dst, dstStride);
-            default:
-                assert(0);
-            }
-        }
-    }
-#endif// SIMD_NEON_ENABLE
-}
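Editorial note: the kernel at the heart of SimdNeonBinarization.cpp above is a single compare-and-select. NEON comparisons return all-ones or all-zeros bytes, so the threshold decision and the positive/negative fill combine into two instructions. Shown here for the greater-than case only; the deleted code selects the comparison with a SimdCompareType template argument:

    #include <arm_neon.h>

    static inline uint8x16_t BinarizeGreater(uint8x16_t src, uint8x16_t value,
        uint8x16_t positive, uint8x16_t negative)
    {
        uint8x16_t mask = vcgtq_u8(src, value);     // 0xFF where src > value, else 0x00
        return vbslq_u8(mask, positive, negative);  // per-byte select
    }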
8c3201e1..00000000 --- a/src/3rd/Simd/Simd/SimdNeonConditional.cpp +++ /dev/null @@ -1,475 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdExtract.h" -#include "Simd/SimdCompare.h" - -namespace Simd -{ -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - template - void ConditionalCount8u(const uint8_t * src, size_t stride, size_t width, size_t height, uint8_t value, uint32_t * count) - { - assert(width >= A); - if (align) - assert(Aligned(src) && Aligned(stride)); - - size_t alignedWidth = Simd::AlignLo(width, A); - uint8x16_t tailMask = ShiftLeft(K8_FF, A - width + alignedWidth); - size_t blockSize = A << 8; - size_t blockCount = (alignedWidth >> 8) + 1; - - uint8x16_t _value = vdupq_n_u8(value); - - uint32x4_t _count = K32_00000000; - for (size_t row = 0; row < height; ++row) - { - uint16x8_t rowSum = K16_0000; - for (size_t block = 0; block < blockCount; ++block) - { - uint8x16_t blockSum = K8_00; - for (size_t col = block*blockSize, end = Min(col + blockSize, alignedWidth); col < end; col += A) - { - const uint8x16_t mask = Compare8u(Load(src + col), _value); - blockSum = vaddq_u8(blockSum, vandq_u8(mask, K8_01)); - } - rowSum = vaddq_u16(rowSum, vpaddlq_u8(blockSum)); - } - if (alignedWidth != width) - { - const uint8x16_t mask = vandq_u8(Compare8u(Load(src + width - A), _value), tailMask); - rowSum = vaddq_u16(rowSum, vpaddlq_u8(vandq_u8(mask, K8_01))); - } - _count = vaddq_u32(_count, vpaddlq_u16(rowSum)); - src += stride; - } - *count = ExtractSum32u(_count); - } - - template - void ConditionalCount8u(const uint8_t * src, size_t stride, size_t width, size_t height, uint8_t value, uint32_t * count) - { - if (Aligned(src) && Aligned(stride)) - ConditionalCount8u(src, stride, width, height, value, count); - else - ConditionalCount8u(src, stride, width, height, value, count); - } - - void ConditionalCount8u(const uint8_t * src, size_t stride, size_t width, size_t height, - uint8_t value, SimdCompareType compareType, uint32_t * count) - { - switch (compareType) - { - case SimdCompareEqual: - return ConditionalCount8u(src, stride, width, height, value, count); - case SimdCompareNotEqual: - return ConditionalCount8u(src, stride, width, height, value, count); - case SimdCompareGreater: - return ConditionalCount8u(src, stride, width, height, value, count); - case SimdCompareGreaterOrEqual: - return 
ConditionalCount8u(src, stride, width, height, value, count); - case SimdCompareLesser: - return ConditionalCount8u(src, stride, width, height, value, count); - case SimdCompareLesserOrEqual: - return ConditionalCount8u(src, stride, width, height, value, count); - default: - assert(0); - } - } - - template - void ConditionalCount16i(const uint8_t * src, size_t stride, size_t width, size_t height, int16_t value, uint32_t * count) - { - assert(width >= HA); - if (align) - assert(Aligned(src) && Aligned(stride)); - - size_t alignedWidth = Simd::AlignLo(width, HA); - uint16x8_t tailMask = (uint16x8_t)ShiftLeft(K8_FF, 2 * (HA - width + alignedWidth)); - - int16x8_t _value = vdupq_n_s16(value); - - uint32x4_t _count = K32_00000000; - for (size_t row = 0; row < height; ++row) - { - const int16_t * s = (const int16_t *)src; - uint16x8_t rowSum = K16_0000; - for (size_t col = 0; col < alignedWidth; col += HA) - { - const uint16x8_t mask = Compare16i(Load(s + col), _value); - rowSum = vaddq_u16(rowSum, vandq_u16(mask, K16_0001)); - } - if (alignedWidth != width) - { - const uint16x8_t mask = vandq_u16(Compare16i(Load(s + width - HA), _value), tailMask); - rowSum = vaddq_u16(rowSum, vandq_u16(mask, K16_0001)); - } - _count = vaddq_u32(_count, vpaddlq_u16(rowSum)); - src += stride; - } - *count = ExtractSum32u(_count); - } - - template - void ConditionalCount16i(const uint8_t * src, size_t stride, size_t width, size_t height, int16_t value, uint32_t * count) - { - if (Aligned(src) && Aligned(stride)) - ConditionalCount16i(src, stride, width, height, value, count); - else - ConditionalCount16i(src, stride, width, height, value, count); - } - - void ConditionalCount16i(const uint8_t * src, size_t stride, size_t width, size_t height, - int16_t value, SimdCompareType compareType, uint32_t * count) - { - switch (compareType) - { - case SimdCompareEqual: - return ConditionalCount16i(src, stride, width, height, value, count); - case SimdCompareNotEqual: - return ConditionalCount16i(src, stride, width, height, value, count); - case SimdCompareGreater: - return ConditionalCount16i(src, stride, width, height, value, count); - case SimdCompareGreaterOrEqual: - return ConditionalCount16i(src, stride, width, height, value, count); - case SimdCompareLesser: - return ConditionalCount16i(src, stride, width, height, value, count); - case SimdCompareLesserOrEqual: - return ConditionalCount16i(src, stride, width, height, value, count); - default: - assert(0); - } - } - - template - void ConditionalSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * mask, size_t maskStride, uint8_t value, uint64_t * sum) - { - assert(width >= A); - if (align) - assert(Aligned(src) && Aligned(srcStride) && Aligned(mask) && Aligned(maskStride)); - - size_t alignedWidth = Simd::AlignLo(width, A); - uint8x16_t tailMask = ShiftLeft(K8_FF, A - width + alignedWidth); - size_t blockSize = A << 8; - size_t blockCount = (alignedWidth >> 8) + 1; - - uint8x16_t _value = vdupq_n_u8(value); - - uint64x2_t _sum = K64_0000000000000000; - for (size_t row = 0; row < height; ++row) - { - uint32x4_t rowSum = K32_00000000; - for (size_t block = 0; block < blockCount; ++block) - { - uint16x8_t blockSum = K16_0000; - for (size_t col = block*blockSize, end = Min(col + blockSize, alignedWidth); col < end; col += A) - { - const uint8x16_t _src = Load(src + col); - const uint8x16_t _mask = Compare8u(Load(mask + col), _value); - blockSum = vaddq_u16(blockSum, vpaddlq_u8(vandq_u8(_mask, _src))); - } - rowSum = vaddq_u32(rowSum, 
vpaddlq_u16(blockSum)); - } - if (alignedWidth != width) - { - const uint8x16_t _src = Load<false>(src + width - A); - const uint8x16_t _mask = vandq_u8(Compare8u<compareType>(Load<false>(mask + width - A), _value), tailMask); - rowSum = vaddq_u32(rowSum, vpaddlq_u16(vpaddlq_u8(vandq_u8(_mask, _src)))); - } - _sum = vaddq_u64(_sum, vpaddlq_u32(rowSum)); - src += srcStride; - mask += maskStride; - } - *sum = ExtractSum64u(_sum); - } - - template <SimdCompareType compareType> - void ConditionalSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * mask, size_t maskStride, uint8_t value, uint64_t * sum) - { - if (Aligned(src) && Aligned(srcStride) && Aligned(mask) && Aligned(maskStride)) - ConditionalSum<true, compareType>(src, srcStride, width, height, mask, maskStride, value, sum); - else - ConditionalSum<false, compareType>(src, srcStride, width, height, mask, maskStride, value, sum); - } - - void ConditionalSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * mask, size_t maskStride, uint8_t value, SimdCompareType compareType, uint64_t * sum) - { - switch (compareType) - { - case SimdCompareEqual: - return ConditionalSum<SimdCompareEqual>(src, srcStride, width, height, mask, maskStride, value, sum); - case SimdCompareNotEqual: - return ConditionalSum<SimdCompareNotEqual>(src, srcStride, width, height, mask, maskStride, value, sum); - case SimdCompareGreater: - return ConditionalSum<SimdCompareGreater>(src, srcStride, width, height, mask, maskStride, value, sum); - case SimdCompareGreaterOrEqual: - return ConditionalSum<SimdCompareGreaterOrEqual>(src, srcStride, width, height, mask, maskStride, value, sum); - case SimdCompareLesser: - return ConditionalSum<SimdCompareLesser>(src, srcStride, width, height, mask, maskStride, value, sum); - case SimdCompareLesserOrEqual: - return ConditionalSum<SimdCompareLesserOrEqual>(src, srcStride, width, height, mask, maskStride, value, sum); - default: - assert(0); - } - } - - SIMD_INLINE uint16x8_t Square(uint8x8_t value) - { - return vmull_u8(value, value); - } - - SIMD_INLINE uint32x4_t Square(uint8x16_t value) - { - uint16x8_t lo = Square(vget_low_u8(value)); - uint16x8_t hi = Square(vget_high_u8(value)); - return vaddq_u32(vpaddlq_u16(lo), vpaddlq_u16(hi)); - } - - template <bool align, SimdCompareType compareType> - void ConditionalSquareSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * mask, size_t maskStride, uint8_t value, uint64_t * sum) - { - assert(width >= A); - if (align) - assert(Aligned(src) && Aligned(srcStride) && Aligned(mask) && Aligned(maskStride)); - - size_t alignedWidth = Simd::AlignLo(width, A); - uint8x16_t tailMask = ShiftLeft(K8_FF, A - width + alignedWidth); - - uint8x16_t _value = vdupq_n_u8(value); - - uint64x2_t _sum = K64_0000000000000000; - for (size_t row = 0; row < height; ++row) - { - uint32x4_t rowSum = K32_00000000; - for (size_t col = 0; col < alignedWidth; col += A) - { - const uint8x16_t _mask = Compare8u<compareType>(Load<align>(mask + col), _value); - const uint8x16_t _src = vandq_u8(_mask, Load<align>(src + col)); - rowSum = vaddq_u32(rowSum, Square(_src)); - } - if (alignedWidth != width) - { - const uint8x16_t _mask = vandq_u8(Compare8u<compareType>(Load<false>(mask + width - A), _value), tailMask); - const uint8x16_t _src = vandq_u8(_mask, Load<false>(src + width - A)); - rowSum = vaddq_u32(rowSum, Square(_src)); - } - _sum = vaddq_u64(_sum, vpaddlq_u32(rowSum)); - src += srcStride; - mask += maskStride; - } - *sum = ExtractSum64u(_sum); - } - - template <SimdCompareType compareType> - void ConditionalSquareSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * mask, size_t maskStride, uint8_t value, uint64_t * sum) - { - if (Aligned(src) && Aligned(srcStride) && Aligned(mask) && Aligned(maskStride)) -
ConditionalSquareSum<true, compareType>(src, srcStride, width, height, mask, maskStride, value, sum); - else - ConditionalSquareSum<false, compareType>(src, srcStride, width, height, mask, maskStride, value, sum); - } - - void ConditionalSquareSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * mask, size_t maskStride, uint8_t value, SimdCompareType compareType, uint64_t * sum) - { - switch (compareType) - { - case SimdCompareEqual: - return ConditionalSquareSum<SimdCompareEqual>(src, srcStride, width, height, mask, maskStride, value, sum); - case SimdCompareNotEqual: - return ConditionalSquareSum<SimdCompareNotEqual>(src, srcStride, width, height, mask, maskStride, value, sum); - case SimdCompareGreater: - return ConditionalSquareSum<SimdCompareGreater>(src, srcStride, width, height, mask, maskStride, value, sum); - case SimdCompareGreaterOrEqual: - return ConditionalSquareSum<SimdCompareGreaterOrEqual>(src, srcStride, width, height, mask, maskStride, value, sum); - case SimdCompareLesser: - return ConditionalSquareSum<SimdCompareLesser>(src, srcStride, width, height, mask, maskStride, value, sum); - case SimdCompareLesserOrEqual: - return ConditionalSquareSum<SimdCompareLesserOrEqual>(src, srcStride, width, height, mask, maskStride, value, sum); - default: - assert(0); - } - } - - template <bool align> - SIMD_INLINE uint32x4_t SquaredDifference(const uint8_t * src, ptrdiff_t step, uint8x16_t mask) - { - const uint8x16_t a = vandq_u8(Load<align>(src - step), mask); - const uint8x16_t b = vandq_u8(Load<align>(src + step), mask); - return Square(vabdq_u8(a, b)); - } - - template <bool align, SimdCompareType compareType> - void ConditionalSquareGradientSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * mask, size_t maskStride, uint8_t value, uint64_t * sum) - { - assert(width >= A + 2 && height >= 3); - if (align) - assert(Aligned(src) && Aligned(srcStride) && Aligned(mask) && Aligned(maskStride)); - - src += srcStride; - mask += maskStride; - height -= 2; - - size_t alignedWidth = Simd::AlignLo(width - 1, A); - uint8x16_t noseMask = ShiftRight(K8_FF, 1); - uint8x16_t tailMask = ShiftLeft(K8_FF, A - width + 1 + alignedWidth); - - uint8x16_t _value = vdupq_n_u8(value); - - uint64x2_t _sum = K64_0000000000000000; - for (size_t row = 0; row < height; ++row) - { - uint32x4_t rowSum = K32_00000000; - { - const uint8x16_t _mask = vandq_u8(Compare8u<compareType>(Load<false>(mask + 1), _value), noseMask); - rowSum = vaddq_u32(rowSum, SquaredDifference<false>(src + 1, 1, _mask)); - rowSum = vaddq_u32(rowSum, SquaredDifference<false>(src + 1, srcStride, _mask)); - } - for (size_t col = A; col < alignedWidth; col += A) - { - const uint8x16_t _mask = Compare8u<compareType>(Load<align>(mask + col), _value); - rowSum = vaddq_u32(rowSum, SquaredDifference<align>(src + col, 1, _mask)); - rowSum = vaddq_u32(rowSum, SquaredDifference<align>(src + col, srcStride, _mask)); - } - if (alignedWidth != width - 1) - { - size_t offset = width - A - 1; - const uint8x16_t _mask = vandq_u8(Compare8u<compareType>(Load<false>(mask + offset), _value), tailMask); - rowSum = vaddq_u32(rowSum, SquaredDifference<false>(src + offset, 1, _mask)); - rowSum = vaddq_u32(rowSum, SquaredDifference<false>(src + offset, srcStride, _mask)); - } - _sum = vaddq_u64(_sum, vpaddlq_u32(rowSum)); - src += srcStride; - mask += maskStride; - } - *sum = ExtractSum64u(_sum); - } - - template <SimdCompareType compareType> - void ConditionalSquareGradientSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * mask, size_t maskStride, uint8_t value, uint64_t * sum) - { - if (Aligned(src) && Aligned(srcStride) && Aligned(mask) && Aligned(maskStride)) - ConditionalSquareGradientSum<true, compareType>(src, srcStride, width, height, mask, maskStride, value, sum); - else - ConditionalSquareGradientSum<false, compareType>(src, srcStride, width,
height, mask, maskStride, value, sum); - } - - void ConditionalSquareGradientSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * mask, size_t maskStride, uint8_t value, SimdCompareType compareType, uint64_t * sum) - { - switch (compareType) - { - case SimdCompareEqual: - return ConditionalSquareGradientSum<SimdCompareEqual>(src, srcStride, width, height, mask, maskStride, value, sum); - case SimdCompareNotEqual: - return ConditionalSquareGradientSum<SimdCompareNotEqual>(src, srcStride, width, height, mask, maskStride, value, sum); - case SimdCompareGreater: - return ConditionalSquareGradientSum<SimdCompareGreater>(src, srcStride, width, height, mask, maskStride, value, sum); - case SimdCompareGreaterOrEqual: - return ConditionalSquareGradientSum<SimdCompareGreaterOrEqual>(src, srcStride, width, height, mask, maskStride, value, sum); - case SimdCompareLesser: - return ConditionalSquareGradientSum<SimdCompareLesser>(src, srcStride, width, height, mask, maskStride, value, sum); - case SimdCompareLesserOrEqual: - return ConditionalSquareGradientSum<SimdCompareLesserOrEqual>(src, srcStride, width, height, mask, maskStride, value, sum); - default: - assert(0); - } - } - - template <bool align, SimdCompareType compareType> - SIMD_INLINE void ConditionalFill(const uint8_t * src, size_t offset, const uint8x16_t & threshold, const uint8x16_t & value, uint8_t * dst) - { - const uint8x16_t _src = Load<align>(src + offset); - const uint8x16_t _dst = Load<align>(dst + offset); - Store<align>(dst + offset, vbslq_u8(Compare8u<compareType>(_src, threshold), value, _dst)); - } - - template <bool align, SimdCompareType compareType> - void ConditionalFill(const uint8_t * src, size_t srcStride, size_t width, size_t height, - uint8_t threshold, uint8_t value, uint8_t * dst, size_t dstStride) - { - assert(width >= A); - if (align) - assert(Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride)); - - size_t alignedWidth = Simd::AlignLo(width, A); - - uint8x16_t _value = vdupq_n_u8(value); - uint8x16_t _threshold = vdupq_n_u8(threshold); - - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - ConditionalFill<align, compareType>(src, col, _threshold, _value, dst); - if (alignedWidth != width) - ConditionalFill<false, compareType>(src, width - A, _threshold, _value, dst); - src += srcStride; - dst += dstStride; - } - } - - template <SimdCompareType compareType> - void ConditionalFill(const uint8_t * src, size_t srcStride, size_t width, size_t height, - uint8_t threshold, uint8_t value, uint8_t * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride)) - ConditionalFill<true, compareType>(src, srcStride, width, height, threshold, value, dst, dstStride); - else - ConditionalFill<false, compareType>(src, srcStride, width, height, threshold, value, dst, dstStride); - } - - void ConditionalFill(const uint8_t * src, size_t srcStride, size_t width, size_t height, - uint8_t threshold, SimdCompareType compareType, uint8_t value, uint8_t * dst, size_t dstStride) - { - switch (compareType) - { - case SimdCompareEqual: - return ConditionalFill<SimdCompareEqual>(src, srcStride, width, height, threshold, value, dst, dstStride); - case SimdCompareNotEqual: - return ConditionalFill<SimdCompareNotEqual>(src, srcStride, width, height, threshold, value, dst, dstStride); - case SimdCompareGreater: - return ConditionalFill<SimdCompareGreater>(src, srcStride, width, height, threshold, value, dst, dstStride); - case SimdCompareGreaterOrEqual: - return ConditionalFill<SimdCompareGreaterOrEqual>(src, srcStride, width, height, threshold, value, dst, dstStride); - case SimdCompareLesser: - return ConditionalFill<SimdCompareLesser>(src, srcStride, width, height, threshold, value, dst, dstStride); - case SimdCompareLesserOrEqual: - return ConditionalFill<SimdCompareLesserOrEqual>(src, srcStride, width, height, threshold, value, dst, dstStride); - default: - assert(0); - }
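The deleted SimdNeonConditional.cpp above repeats one pattern: runtime facts (pointer alignment, the SimdCompareType operator) are resolved once into compile-time template arguments, so the per-pixel loop compiles to branch-free NEON. A minimal sketch of that two-level dispatch, assuming simplified names (CompareType and CompareMask are illustrative, not the library's API):

    #include <arm_neon.h>
    #include <cstdint>

    enum CompareType { CompareEqual, CompareGreater };

    // Inner kernel: 'align' and 'type' are compile-time constants, so the
    // ternary folds away and a hot loop calling this stays branch-free.
    template <bool align, CompareType type>
    uint8x16_t CompareMask(const uint8_t* p, uint8x16_t value)
    {
        uint8x16_t v = vld1q_u8(p); // 'align' would select an aligned-load variant
        return type == CompareEqual ? vceqq_u8(v, value) : vcgtq_u8(v, value);
    }

    // Outer dispatch: one runtime branch per image, not per pixel.
    template <CompareType type>
    uint8x16_t CompareMask(const uint8_t* p, uint8x16_t value, bool aligned)
    {
        return aligned ? CompareMask<true, type>(p, value)
                       : CompareMask<false, type>(p, value);
    }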
- } - } -#endif// SIMD_NEON_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdNeonDeinterleave.cpp b/src/3rd/Simd/Simd/SimdNeonDeinterleave.cpp deleted file mode 100644 index 53530a78..00000000 --- a/src/3rd/Simd/Simd/SimdNeonDeinterleave.cpp +++ /dev/null @@ -1,172 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" - -namespace Simd -{ -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - template void DeinterleaveUv(const uint8_t * uv, size_t uvStride, size_t width, size_t height, - uint8_t * u, size_t uStride, uint8_t * v, size_t vStride) - { - assert(width >= A); - if (align) - { - assert(Aligned(uv) && Aligned(uvStride) && Aligned(u) && Aligned(uStride) && Aligned(v) && Aligned(vStride)); - } - - size_t bodyWidth = AlignLo(width, A); - size_t tail = width - bodyWidth; - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0, offset = 0; col < bodyWidth; col += A, offset += DA) - { - uint8x16x2_t _uv = Load2(uv + offset); - Store(u + col, _uv.val[0]); - Store(v + col, _uv.val[1]); - } - if (tail) - { - size_t col = width - A; - size_t offset = 2 * col; - uint8x16x2_t _uv = Load2(uv + offset); - Store(u + col, _uv.val[0]); - Store(v + col, _uv.val[1]); - } - uv += uvStride; - u += uStride; - v += vStride; - } - } - - void DeinterleaveUv(const uint8_t * uv, size_t uvStride, size_t width, size_t height, - uint8_t * u, size_t uStride, uint8_t * v, size_t vStride) - { - if (Aligned(uv) && Aligned(uvStride) && Aligned(u) && Aligned(uStride) && Aligned(v) && Aligned(vStride)) - DeinterleaveUv(uv, uvStride, width, height, u, uStride, v, vStride); - else - DeinterleaveUv(uv, uvStride, width, height, u, uStride, v, vStride); - } - - template void DeinterleaveBgr(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, - uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride) - { - assert(width >= A); - if (align) - { - assert(Aligned(bgr) && Aligned(bgrStride) && Aligned(b) && Aligned(bStride)); - assert(Aligned(g) && Aligned(gStride) && Aligned(r) && Aligned(rStride)); - } - - size_t bodyWidth = AlignLo(width, A); - size_t tail = width - bodyWidth; - size_t A3 = A * 3; - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0, offset = 0; col < bodyWidth; col += A, offset += A3) - { - uint8x16x3_t _bgr = Load3(bgr + offset); - Store(b + col, _bgr.val[0]); - Store(g 
+ col, _bgr.val[1]); - Store(r + col, _bgr.val[2]); - } - if (tail) - { - size_t col = width - A; - size_t offset = 3 * col; - uint8x16x3_t _bgr = Load3(bgr + offset); - Store(b + col, _bgr.val[0]); - Store(g + col, _bgr.val[1]); - Store(r + col, _bgr.val[2]); - } - bgr += bgrStride; - b += bStride; - g += gStride; - r += rStride; - } - } - - void DeinterleaveBgr(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, - uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride) - { - if (Aligned(bgr) && Aligned(bgrStride) && Aligned(b) && Aligned(bStride) && Aligned(g) && Aligned(gStride) && Aligned(r) && Aligned(rStride)) - DeinterleaveBgr(bgr, bgrStride, width, height, b, bStride, g, gStride, r, rStride); - else - DeinterleaveBgr(bgr, bgrStride, width, height, b, bStride, g, gStride, r, rStride); - } - - template void DeinterleaveBgra(const uint8_t * bgra, size_t bgraStride, size_t width, size_t height, - uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride, uint8_t * a, size_t aStride) - { - assert(width >= A); - if (align) - { - assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(b) && Aligned(bStride)); - assert(Aligned(g) && Aligned(gStride) && Aligned(r) && Aligned(rStride) && Aligned(a) && Aligned(aStride)); - } - - size_t bodyWidth = AlignLo(width, A); - size_t tail = width - bodyWidth; - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0, offset = 0; col < bodyWidth; col += A, offset += QA) - { - uint8x16x4_t _bgra = Load4(bgra + offset); - Store(b + col, _bgra.val[0]); - Store(g + col, _bgra.val[1]); - Store(r + col, _bgra.val[2]); - Store(a + col, _bgra.val[3]); - } - if (tail) - { - size_t col = width - A; - size_t offset = 4 * col; - uint8x16x4_t _bgra = Load4(bgra + offset); - Store(b + col, _bgra.val[0]); - Store(g + col, _bgra.val[1]); - Store(r + col, _bgra.val[2]); - Store(a + col, _bgra.val[3]); - } - bgra += bgraStride; - b += bStride; - g += gStride; - r += rStride; - a += aStride; - } - } - - void DeinterleaveBgra(const uint8_t * bgra, size_t bgraStride, size_t width, size_t height, - uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride, uint8_t * a, size_t aStride) - { - if (Aligned(bgra) && Aligned(bgraStride) && Aligned(b) && Aligned(bStride) && - Aligned(g) && Aligned(gStride) && Aligned(r) && Aligned(rStride) && Aligned(a) && Aligned(aStride)) - DeinterleaveBgra(bgra, bgraStride, width, height, b, bStride, g, gStride, r, rStride, a, aStride); - else - DeinterleaveBgra(bgra, bgraStride, width, height, b, bStride, g, gStride, r, rStride, a, aStride); - } - } -#endif// SIMD_NEON_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdNeonDetection.cpp b/src/3rd/Simd/Simd/SimdNeonDetection.cpp deleted file mode 100644 index 9f1afb1e..00000000 --- a/src/3rd/Simd/Simd/SimdNeonDetection.cpp +++ /dev/null @@ -1,777 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. 
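The deleted SimdNeonDeinterleave.cpp above is built on NEON structure loads: vld2q_u8, vld3q_u8 and vld4q_u8 split 2-, 3- and 4-channel interleaved pixels into separate registers in a single instruction. A minimal single-row sketch of the UV case (the function name is illustrative, and the scalar tail stands in for the library's overlapping vector tail):

    #include <arm_neon.h>
    #include <cstddef>
    #include <cstdint>

    void DeinterleaveUvRow(const uint8_t* uv, uint8_t* u, uint8_t* v, size_t width)
    {
        size_t col = 0;
        for (; col + 16 <= width; col += 16)
        {
            // Even bytes land in val[0], odd bytes in val[1].
            uint8x16x2_t p = vld2q_u8(uv + 2 * col);
            vst1q_u8(u + col, p.val[0]);
            vst1q_u8(v + col, p.val[1]);
        }
        for (; col < width; ++col) // scalar tail
        {
            u[col] = uv[2 * col + 0];
            v[col] = uv[2 * col + 1];
        }
    }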
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdDetection.h" - -namespace Simd -{ -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - using namespace Simd::Detection; - - SIMD_INLINE void UnpackMask16i(const uint8_t * src, uint16_t * dst, const uint8x16_t & mask, uint8x16x2_t & buffer) - { - buffer.val[0] = vandq_u8(mask, Load(src)); - Store2((uint8_t*)dst, buffer); - } - - SIMD_INLINE void UnpackMask16i(const uint8_t * src, size_t size, uint16_t * dst, const uint8x16_t & mask) - { - uint8x16x2_t buffer; - buffer.val[1] = K8_00; - size_t alignedSize = Simd::AlignLo(size, A); - for (size_t i = 0; i < alignedSize; i += A) - UnpackMask16i(src + i, dst + i, mask, buffer); - if (size != alignedSize) - UnpackMask16i(src + size - A, dst + size - A, mask, buffer); - } - - SIMD_INLINE void UnpackMask32i(const uint8_t * src, uint32_t * dst, const uint8x16_t & mask, uint8x16x4_t & buffer) - { - buffer.val[0] = vandq_u8(mask, Load(src)); - Store4((uint8_t*)dst, buffer); - } - - SIMD_INLINE void UnpackMask32i(const uint8_t * src, size_t size, uint32_t * dst, const uint8x16_t & mask) - { - uint8x16x4_t buffer; - buffer.val[1] = K8_00; - buffer.val[2] = K8_00; - buffer.val[3] = K8_00; - size_t alignedSize = Simd::AlignLo(size, A); - for (size_t i = 0; i < alignedSize; i += A) - UnpackMask32i(src + i, dst + i, mask, buffer); - if (size != alignedSize) - UnpackMask32i(src + size - A, dst + size - A, mask, buffer); - } - - SIMD_INLINE void PackResult16i(const uint16_t * src, uint8_t * dst) - { - uint8x16x2_t _src = Load2((const uint8_t *)src); - Store(dst, _src.val[0]); - } - - SIMD_INLINE void PackResult16i(const uint16_t * src, size_t size, uint8_t * dst) - { - size_t alignedSize = Simd::AlignLo(size, A); - for (size_t i = 0; i < alignedSize; i += A) - PackResult16i(src + i, dst + i); - if (size != alignedSize) - PackResult16i(src + size - A, dst + size - A); - } - - SIMD_INLINE void PackResult32i(const uint32_t * src, uint8_t * dst) - { - uint8x16x4_t _src = Load4((const uint8_t *)src); - Store(dst, _src.val[0]); - } - - SIMD_INLINE void PackResult32i(const uint32_t * src, size_t size, uint8_t * dst) - { - size_t alignedSize = Simd::AlignLo(size, A); - for (size_t i = 0; i < alignedSize; i += A) - PackResult32i(src + i, dst + i); - if (size != alignedSize) - PackResult32i(src + size - A, dst + size - A); - } - - SIMD_INLINE int ResultCount(const uint32x4_t & result) - { - uint32x4_t a = (uint32x4_t)vpaddlq_u32(result); 
- return vgetq_lane_u32(a, 0) + vgetq_lane_u32(a, 2); - } - - SIMD_INLINE int ResultCount(const uint16x8_t & result) - { - uint32x4_t a = (uint32x4_t)vpaddlq_u32(vpaddlq_u16(result)); - return vgetq_lane_u32(a, 0) + vgetq_lane_u32(a, 2); - } - - SIMD_INLINE float32x4_t ValidSqrt(const float32x4_t & value) - { - uint32x4_t mask = vcgtq_f32(value, vdupq_n_f32(0.0f)); - return Sqrt<1>(vbslq_f32(mask, value, vdupq_n_f32(1.0f))); - } - - SIMD_INLINE uint32x4_t Sum32ip(uint32_t * const ptr[4], size_t offset) - { - uint32x4_t s0 = vld1q_u32(ptr[0] + offset); - uint32x4_t s1 = vld1q_u32(ptr[1] + offset); - uint32x4_t s2 = vld1q_u32(ptr[2] + offset); - uint32x4_t s3 = vld1q_u32(ptr[3] + offset); - return vsubq_u32(vsubq_u32(s0, s1), vsubq_u32(s2, s3)); - } - - SIMD_INLINE uint32x4_t Sum32ii(uint32_t * const ptr[4], size_t offset) - { - uint32x4x2_t s0 = vld2q_u32(ptr[0] + offset); - uint32x4x2_t s1 = vld2q_u32(ptr[1] + offset); - uint32x4x2_t s2 = vld2q_u32(ptr[2] + offset); - uint32x4x2_t s3 = vld2q_u32(ptr[3] + offset); - return vsubq_u32(vsubq_u32(s0.val[0], s1.val[0]), vsubq_u32(s2.val[0], s3.val[0])); - } - - SIMD_INLINE float32x4_t Norm32fp(const HidHaarCascade & hid, size_t offset) - { - float32x4_t area = vdupq_n_f32(hid.windowArea); - float32x4_t sum = vcvtq_f32_u32(Sum32ip(hid.p, offset)); - float32x4_t sqsum = vcvtq_f32_u32(Sum32ip(hid.pq, offset)); - return ValidSqrt(vmlsq_f32(vmulq_f32(sqsum, area), sum, sum)); - } - - SIMD_INLINE float32x4_t Norm32fi(const HidHaarCascade & hid, size_t offset) - { - float32x4_t area = vdupq_n_f32(hid.windowArea); - float32x4_t sum = vcvtq_f32_u32(Sum32ii(hid.p, offset)); - float32x4_t sqsum = vcvtq_f32_u32(Sum32ii(hid.pq, offset)); - return ValidSqrt(vmlsq_f32(vmulq_f32(sqsum, area), sum, sum)); - } - - SIMD_INLINE float32x4_t WeightedSum32f(const WeightedRect & rect, size_t offset) - { - uint32x4_t s0 = vld1q_u32(rect.p0 + offset); - uint32x4_t s1 = vld1q_u32(rect.p1 + offset); - uint32x4_t s2 = vld1q_u32(rect.p2 + offset); - uint32x4_t s3 = vld1q_u32(rect.p3 + offset); - uint32x4_t sum = vsubq_u32(vsubq_u32(s0, s1), vsubq_u32(s2, s3)); - return vmulq_f32(vcvtq_f32_u32(sum), vdupq_n_f32(rect.weight)); - } - - SIMD_INLINE void StageSum32f(const float * leaves, float threshold, const float32x4_t & sum, const float32x4_t & norm, float32x4_t & stageSum) - { - uint32x4_t mask = vcltq_f32(sum, vmulq_f32(vdupq_n_f32(threshold), norm)); - stageSum = vaddq_f32(stageSum, vbslq_f32(mask, vdupq_n_f32(leaves[0]), vdupq_n_f32(leaves[1]))); - } - - void Detect32f(const HidHaarCascade & hid, size_t offset, const float32x4_t & norm, uint32x4_t & result) - { - typedef HidHaarCascade Hid; - const float * leaves = hid.leaves.data(); - const Hid::Node * node = hid.nodes.data(); - const Hid::Stage * stages = hid.stages.data(); - for (int i = 0, n = (int)hid.stages.size(); i < n; ++i) - { - const Hid::Stage & stage = stages[i]; - if (stage.canSkip) - continue; - const Hid::Node * end = node + stage.ntrees; - float32x4_t stageSum = vdupq_n_f32(0.0f); - if (stage.hasThree) - { - for (; node < end; ++node, leaves += 2) - { - const Hid::Feature & feature = hid.features[node->featureIdx]; - float32x4_t sum = vaddq_f32(WeightedSum32f(feature.rect[0], offset), WeightedSum32f(feature.rect[1], offset)); - if (feature.rect[2].p0) - sum = vaddq_f32(sum, WeightedSum32f(feature.rect[2], offset)); - StageSum32f(leaves, node->threshold, sum, norm, stageSum); - } - } - else - { - for (; node < end; ++node, leaves += 2) - { - const Hid::Feature & feature = hid.features[node->featureIdx]; - 
float32x4_t sum = vaddq_f32(WeightedSum32f(feature.rect[0], offset), WeightedSum32f(feature.rect[1], offset)); - StageSum32f(leaves, node->threshold, sum, norm, stageSum); - } - } - result = vandq_u32(vcleq_f32(vdupq_n_f32(stage.threshold), stageSum), result); - int resultCount = ResultCount(result); - if (resultCount == 0) - { - return; - } - else if (resultCount == 1) - { - uint32_t _result[4]; - float _norm[4]; - vst1q_u32(_result, result); - vst1q_f32(_norm, norm); - for (int j = 0; j < 4; ++j) - { - if (_result[j]) - { - _result[j] = Base::Detect32f(hid, offset + j, i + 1, _norm[j]) > 0 ? 1 : 0; - break; - } - } - result = vld1q_u32(_result); - return; - } - } - } - - void DetectionHaarDetect32fp(const HidHaarCascade & hid, const Image & mask, const Rect & rect, Image & dst) - { - size_t width = rect.Width(); - size_t alignedWidth = Simd::AlignLo(width, F); - size_t evenWidth = Simd::AlignLo(width, 2); - - Buffer buffer(width); - for (ptrdiff_t row = rect.top; row < rect.bottom; row += 1) - { - size_t col = 0; - size_t p_offset = row * hid.sum.stride / sizeof(uint32_t) + rect.left; - size_t pq_offset = row * hid.sqsum.stride / sizeof(uint32_t) + rect.left; - - UnpackMask32i(mask.data + row*mask.stride + rect.left, width, buffer.m, K8_01); - memset(buffer.d, 0, width * sizeof(uint32_t)); - for (; col < alignedWidth; col += F) - { - uint32x4_t result = vld1q_u32(buffer.m + col); - if (ResultCount(result) == 0) - continue; - float32x4_t norm = Norm32fp(hid, pq_offset + col); - Detect32f(hid, p_offset + col, norm, result); - vst1q_u32(buffer.d + col, result); - } - if (evenWidth > alignedWidth + 2) - { - col = evenWidth - F; - uint32x4_t result = vld1q_u32(buffer.m + col); - if (ResultCount(result) != 0) - { - float32x4_t norm = Norm32fp(hid, pq_offset + col); - Detect32f(hid, p_offset + col, norm, result); - vst1q_u32(buffer.d + col, result); - } - col += F; - } - for (; col < width; col += 1) - { - if (buffer.m[col] == 0) - continue; - float norm = Base::Norm32f(hid, pq_offset + col); - buffer.d[col] = Base::Detect32f(hid, p_offset + col, 0, norm) > 0 ? 
1 : 0; - } - PackResult32i(buffer.d, width, dst.data + row*dst.stride + rect.left); - } - } - - void DetectionHaarDetect32fp(const void * _hid, const uint8_t * mask, size_t maskStride, - ptrdiff_t left, ptrdiff_t top, ptrdiff_t right, ptrdiff_t bottom, uint8_t * dst, size_t dstStride) - { - const HidHaarCascade & hid = *(HidHaarCascade*)_hid; - return DetectionHaarDetect32fp(hid, - Image(hid.sum.width - 1, hid.sum.height - 1, maskStride, Image::Gray8, (uint8_t*)mask), - Rect(left, top, right, bottom), - Image(hid.sum.width - 1, hid.sum.height - 1, dstStride, Image::Gray8, dst).Ref()); - } - - void DetectionHaarDetect32fi(const HidHaarCascade & hid, const Image & mask, const Rect & rect, Image & dst) - { - const size_t step = 2; - size_t width = rect.Width(); - size_t alignedWidth = Simd::AlignLo(width, HA); - size_t evenWidth = Simd::AlignLo(width, 2); - - Buffer buffer(evenWidth); - for (ptrdiff_t row = rect.top; row < rect.bottom; row += step) - { - size_t col = 0; - size_t p_offset = row * hid.isum.stride / sizeof(uint32_t) + rect.left / 2; - size_t pq_offset = row * hid.sqsum.stride / sizeof(uint32_t) + rect.left; - - UnpackMask16i(mask.data + row*mask.stride + rect.left, evenWidth, buffer.m, (uint8x16_t)K16_0001); - memset(buffer.d, 0, evenWidth * sizeof(uint16_t)); - for (; col < alignedWidth; col += HA) - { - uint32x4_t result = (uint32x4_t)vld1q_u16(buffer.m + col); - if (ResultCount(result) == 0) - continue; - float32x4_t norm = Norm32fi(hid, pq_offset + col); - Detect32f(hid, p_offset + col / 2, norm, result); - vst1q_u16(buffer.d + col, (uint16x8_t)result); - } - if (evenWidth > alignedWidth) - { - col = evenWidth - HA; - uint32x4_t result = (uint32x4_t)vld1q_u16(buffer.m + col); - if (ResultCount(result) != 0) - { - float32x4_t norm = Norm32fi(hid, pq_offset + col); - Detect32f(hid, p_offset + col / 2, norm, result); - vst1q_u16(buffer.d + col, (uint16x8_t)result); - } - col += HA; - } - for (; col < width; col += step) - { - if (mask.At(col + rect.left, row) == 0) - continue; - float norm = Base::Norm32f(hid, pq_offset + col); - if (Base::Detect32f(hid, p_offset + col / 2, 0, norm) > 0) - dst.At(col + rect.left, row) = 1; - } - PackResult16i(buffer.d, evenWidth, dst.data + row*dst.stride + rect.left); - } - } - - void DetectionHaarDetect32fi(const void * _hid, const uint8_t * mask, size_t maskStride, - ptrdiff_t left, ptrdiff_t top, ptrdiff_t right, ptrdiff_t bottom, uint8_t * dst, size_t dstStride) - { - const HidHaarCascade & hid = *(HidHaarCascade*)_hid; - return DetectionHaarDetect32fi(hid, - Image(hid.sum.width - 1, hid.sum.height - 1, maskStride, Image::Gray8, (uint8_t*)mask), - Rect(left, top, right, bottom), - Image(hid.sum.width - 1, hid.sum.height - 1, dstStride, Image::Gray8, dst).Ref()); - } - - const uint8x16_t K8_TBL_BITS = SIMD_VEC_SETR_EPI8(0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); - - SIMD_INLINE uint8x16_t Shuffle(const uint8x16_t & src, const uint8x16_t & shuffle) - { - return vcombine_u8(vtbl2_u8((const uint8x8x2_t &)src, vget_low_u8(shuffle)), vtbl2_u8((const uint8x8x2_t &)src, vget_high_u8(shuffle))); - } - - SIMD_INLINE uint8x16_t Shuffle(const uint8x16x2_t & src, const uint8x16_t & shuffle) - { - return vcombine_u8(vtbl4_u8((const uint8x8x4_t &)src, vget_low_u8(shuffle)), vtbl4_u8((const uint8x8x4_t &)src, vget_high_u8(shuffle))); - } - - SIMD_INLINE uint32x4_t IntegralSum32i(const uint32x4_t & s0, const uint32x4_t & s1, const uint32x4_t & s2, const uint32x4_t & s3) - { - return 
vsubq_u32(vsubq_u32(s0, s1), vsubq_u32(s2, s3)); - } - - template SIMD_INLINE void Load(uint32x4_t a[16], const HidLbpFeature & feature, ptrdiff_t offset) - { - a[i] = vld1q_u32((uint32_t*)feature.p[i] + offset); - } - - SIMD_INLINE void Calculate(const HidLbpFeature & feature, ptrdiff_t offset, uint32x4_t & shuffle, uint32x4_t & mask) - { - uint32x4_t a[16]; - Load<5>(a, feature, offset); - Load<6>(a, feature, offset); - Load<9>(a, feature, offset); - Load<10>(a, feature, offset); - uint32x4_t central = IntegralSum32i(a[5], a[6], a[9], a[10]); - - Load<0>(a, feature, offset); - Load<1>(a, feature, offset); - Load<4>(a, feature, offset); - - shuffle = K32_FFFFFF00; - shuffle = vorrq_u32(shuffle, vandq_u32(vcgeq_u32(IntegralSum32i(a[0], a[1], a[4], a[5]), central), K32_00000010)); - Load<2>(a, feature, offset); - shuffle = vorrq_u32(shuffle, vandq_u32(vcgeq_u32(IntegralSum32i(a[1], a[2], a[5], a[6]), central), K32_00000008)); - Load<3>(a, feature, offset); - Load<7>(a, feature, offset); - shuffle = vorrq_u32(shuffle, vandq_u32(vcgeq_u32(IntegralSum32i(a[2], a[3], a[6], a[7]), central), K32_00000004)); - Load<11>(a, feature, offset); - shuffle = vorrq_u32(shuffle, vandq_u32(vcgeq_u32(IntegralSum32i(a[6], a[7], a[10], a[11]), central), K32_00000002)); - Load<14>(a, feature, offset); - Load<15>(a, feature, offset); - shuffle = vorrq_u32(shuffle, vandq_u32(vcgeq_u32(IntegralSum32i(a[10], a[11], a[14], a[15]), central), K32_00000001)); - - mask = K32_08080800; - Load<13>(a, feature, offset); - mask = vorrq_u32(mask, vandq_u32(vcgeq_u32(IntegralSum32i(a[9], a[10], a[13], a[14]), central), K32_00000004)); - Load<12>(a, feature, offset); - Load<8>(a, feature, offset); - mask = vorrq_u32(mask, vandq_u32(vcgeq_u32(IntegralSum32i(a[8], a[9], a[12], a[13]), central), K32_00000002)); - mask = vorrq_u32(mask, vandq_u32(vcgeq_u32(IntegralSum32i(a[4], a[5], a[8], a[9]), central), K32_00000001)); - mask = (uint32x4_t)Shuffle(K8_TBL_BITS, (uint8x16_t)mask); - } - - SIMD_INLINE uint32x4_t LeafMask(const HidLbpFeature & feature, ptrdiff_t offset, const int * subset) - { - uint32x4_t shuffle, mask; - Calculate(feature, offset, shuffle, mask); - - uint8x16x2_t _subset; - _subset.val[0] = vld1q_u8((uint8_t*)subset + 0); - _subset.val[1] = vld1q_u8((uint8_t*)subset + A); - uint32x4_t value = vandq_u32((uint32x4_t)Shuffle(_subset, (uint8x16_t)shuffle), mask); - - return vmvnq_u32(vceqq_u32(value, K32_00000000)); - } - - void Detect(const HidLbpCascade & hid, size_t offset, uint32x4_t & result) - { - typedef HidLbpCascade Hid; - - size_t subsetSize = (hid.ncategories + 31) / 32; - const int * subsets = hid.subsets.data(); - const Hid::Leave * leaves = hid.leaves.data(); - const Hid::Node * nodes = hid.nodes.data(); - const Hid::Stage * stages = hid.stages.data(); - int nodeOffset = 0, leafOffset = 0; - for (int i_stage = 0, n_stages = (int)hid.stages.size(); i_stage < n_stages; i_stage++) - { - const Hid::Stage & stage = stages[i_stage]; - float32x4_t sum = vdupq_n_f32(0.0f); - for (int i_tree = 0, n_trees = stage.ntrees; i_tree < n_trees; i_tree++) - { - const Hid::Feature & feature = hid.features[nodes[nodeOffset].featureIdx]; - const int * subset = subsets + nodeOffset*subsetSize; - uint32x4_t mask = LeafMask(feature, offset, subset); - sum = vaddq_f32(sum, vbslq_f32(mask, vdupq_n_f32(leaves[leafOffset + 0]), vdupq_n_f32(leaves[leafOffset + 1]))); - nodeOffset++; - leafOffset += 2; - } - result = vandq_u32(vcleq_f32(vdupq_n_f32(stage.threshold), sum), result); - int resultCount = ResultCount(result); - if 
(resultCount == 0) - return; - else if (resultCount == 1) - { - uint32_t _result[4]; - vst1q_u32(_result, result); - for (int i = 0; i < 4; ++i) - { - if (_result[i]) - { - _result[i] = Base::Detect(hid, offset + i, i_stage + 1) > 0 ? 1 : 0; - break; - } - } - result = vld1q_u32(_result); - return; - } - } - } - - void DetectionLbpDetect32fp(const HidLbpCascade & hid, const Image & mask, const Rect & rect, Image & dst) - { - size_t width = rect.Width(); - size_t alignedWidth = Simd::AlignLo(width, 4); - size_t evenWidth = Simd::AlignLo(width, 2); - - Buffer buffer(width); - for (ptrdiff_t row = rect.top; row < rect.bottom; row += 1) - { - size_t col = 0; - size_t offset = row * hid.sum.stride / sizeof(uint32_t) + rect.left; - - UnpackMask32i(mask.data + row*mask.stride + rect.left, width, buffer.m, K8_01); - memset(buffer.d, 0, width * sizeof(uint32_t)); - for (; col < alignedWidth; col += 4) - { - uint32x4_t result = vld1q_u32(buffer.m + col); - if (ResultCount(result) == 0) - continue; - Detect(hid, offset + col, result); - vst1q_u32(buffer.d + col, result); - } - if (evenWidth > alignedWidth + 2) - { - col = evenWidth - 4; - uint32x4_t result = vld1q_u32(buffer.m + col); - if (ResultCount(result) != 0) - { - Detect(hid, offset + col, result); - vst1q_u32(buffer.d + col, result); - } - col += 4; - } - for (; col < width; col += 1) - { - if (buffer.m[col] == 0) - continue; - buffer.d[col] = Base::Detect(hid, offset + col, 0) > 0 ? 1 : 0; - } - PackResult32i(buffer.d, width, dst.data + row*dst.stride + rect.left); - } - } - - void DetectionLbpDetect32fp(const void * _hid, const uint8_t * mask, size_t maskStride, - ptrdiff_t left, ptrdiff_t top, ptrdiff_t right, ptrdiff_t bottom, uint8_t * dst, size_t dstStride) - { - const HidLbpCascade & hid = *(HidLbpCascade*)_hid; - return DetectionLbpDetect32fp(hid, - Image(hid.sum.width - 1, hid.sum.height - 1, maskStride, Image::Gray8, (uint8_t*)mask), - Rect(left, top, right, bottom), - Image(hid.sum.width - 1, hid.sum.height - 1, dstStride, Image::Gray8, dst).Ref()); - } - - void DetectionLbpDetect32fi(const HidLbpCascade & hid, const Image & mask, const Rect & rect, Image & dst) - { - const size_t step = 2; - size_t width = rect.Width(); - size_t alignedWidth = Simd::AlignLo(width, HA); - size_t evenWidth = Simd::AlignLo(width, 2); - - Buffer buffer(evenWidth); - for (ptrdiff_t row = rect.top; row < rect.bottom; row += step) - { - size_t col = 0; - size_t offset = row * hid.isum.stride / sizeof(uint32_t) + rect.left / 2; - - UnpackMask16i(mask.data + row*mask.stride + rect.left, evenWidth, buffer.m, (uint8x16_t)K16_0001); - memset(buffer.d, 0, evenWidth * sizeof(uint16_t)); - for (; col < alignedWidth; col += HA) - { - uint32x4_t result = (uint32x4_t)vld1q_u16(buffer.m + col); - if (ResultCount(result) == 0) - continue; - Detect(hid, offset + col / 2, result); - vst1q_u16(buffer.d + col, (uint16x8_t)result); - } - if (evenWidth > alignedWidth) - { - col = evenWidth - HA; - uint32x4_t result = (uint32x4_t)vld1q_u16(buffer.m + col); - if (ResultCount(result) != 0) - { - Detect(hid, offset + col / 2, result); - vst1q_u16(buffer.d + col, (uint16x8_t)result); - } - col += HA; - } - for (; col < width; col += step) - { - if (mask.At(col + rect.left, row) == 0) - continue; - if (Base::Detect(hid, offset + col / 2, 0) > 0) - dst.At(col + rect.left, row) = 1; - } - PackResult16i(buffer.d, evenWidth, dst.data + row*dst.stride + rect.left); - } - } - - void DetectionLbpDetect32fi(const void * _hid, const uint8_t * mask, size_t maskStride, - ptrdiff_t left, 
ptrdiff_t top, ptrdiff_t right, ptrdiff_t bottom, uint8_t * dst, size_t dstStride) - { - const HidLbpCascade & hid = *(HidLbpCascade*)_hid; - return DetectionLbpDetect32fi(hid, - Image(hid.sum.width - 1, hid.sum.height - 1, maskStride, Image::Gray8, (uint8_t*)mask), - Rect(left, top, right, bottom), - Image(hid.sum.width - 1, hid.sum.height - 1, dstStride, Image::Gray8, dst).Ref()); - } - - SIMD_INLINE uint16x8_t IntegralSum16i(const uint16x8_t & s0, const uint16x8_t & s1, const uint16x8_t & s2, const uint16x8_t & s3) - { - return vsubq_u16(vsubq_u16(s0, s1), vsubq_u16(s2, s3)); - } - - template SIMD_INLINE void Load(uint16x8_t a[16], const HidLbpFeature & feature, ptrdiff_t offset) - { - a[i] = vld1q_u16((uint16_t*)feature.p[i] + offset); - } - - SIMD_INLINE void Calculate(const HidLbpFeature & feature, ptrdiff_t offset, uint16x8_t & shuffle, uint16x8_t & mask) - { - uint16x8_t a[16]; - Load<5>(a, feature, offset); - Load<6>(a, feature, offset); - Load<9>(a, feature, offset); - Load<10>(a, feature, offset); - uint16x8_t central = IntegralSum16i(a[5], a[6], a[9], a[10]); - - Load<0>(a, feature, offset); - Load<1>(a, feature, offset); - Load<4>(a, feature, offset); - - shuffle = K16_FF00; - shuffle = vorrq_u16(shuffle, vandq_u16(vcgeq_u16(IntegralSum16i(a[0], a[1], a[4], a[5]), central), K16_0010)); - Load<2>(a, feature, offset); - shuffle = vorrq_u16(shuffle, vandq_u16(vcgeq_u16(IntegralSum16i(a[1], a[2], a[5], a[6]), central), K16_0008)); - Load<3>(a, feature, offset); - Load<7>(a, feature, offset); - shuffle = vorrq_u16(shuffle, vandq_u16(vcgeq_u16(IntegralSum16i(a[2], a[3], a[6], a[7]), central), K16_0004)); - Load<11>(a, feature, offset); - shuffle = vorrq_u16(shuffle, vandq_u16(vcgeq_u16(IntegralSum16i(a[6], a[7], a[10], a[11]), central), K16_0002)); - Load<14>(a, feature, offset); - Load<15>(a, feature, offset); - shuffle = vorrq_u16(shuffle, vandq_u16(vcgeq_u16(IntegralSum16i(a[10], a[11], a[14], a[15]), central), K16_0001)); - - mask = K16_0800; - Load<13>(a, feature, offset); - mask = vorrq_u16(mask, vandq_u16(vcgeq_u16(IntegralSum16i(a[9], a[10], a[13], a[14]), central), K16_0004)); - Load<12>(a, feature, offset); - Load<8>(a, feature, offset); - mask = vorrq_u16(mask, vandq_u16(vcgeq_u16(IntegralSum16i(a[8], a[9], a[12], a[13]), central), K16_0002)); - mask = vorrq_u16(mask, vandq_u16(vcgeq_u16(IntegralSum16i(a[4], a[5], a[8], a[9]), central), K16_0001)); - mask = (uint16x8_t)Shuffle(K8_TBL_BITS, (uint8x16_t)mask); - } - - SIMD_INLINE uint16x8_t LeafMask(const HidLbpFeature & feature, ptrdiff_t offset, const int * subset) - { - uint16x8_t shuffle, mask; - Calculate(feature, offset, shuffle, mask); - - uint8x16x2_t _subset; - _subset.val[0] = vld1q_u8((uint8_t*)subset + 0); - _subset.val[1] = vld1q_u8((uint8_t*)subset + A); - uint16x8_t value = vandq_u16((uint16x8_t)Shuffle(_subset, (uint8x16_t)shuffle), mask); - - return vmvnq_u16(vceqq_u16(value, K16_0000)); - } - - void Detect(const HidLbpCascade & hid, size_t offset, uint16x8_t & result) - { - typedef HidLbpCascade Hid; - - size_t subsetSize = (hid.ncategories + 31) / 32; - const int * subsets = hid.subsets.data(); - const Hid::Leave * leaves = hid.leaves.data(); - const Hid::Node * nodes = hid.nodes.data(); - const Hid::Stage * stages = hid.stages.data(); - int nodeOffset = 0, leafOffset = 0; - for (int i_stage = 0, n_stages = (int)hid.stages.size(); i_stage < n_stages; i_stage++) - { - const Hid::Stage & stage = stages[i_stage]; - int16x8_t sum = vdupq_n_s16(0); - for (int i_tree = 0, n_trees = stage.ntrees; i_tree < 
n_trees; i_tree++) - { - const Hid::Feature & feature = hid.features[nodes[nodeOffset].featureIdx]; - const int * subset = subsets + nodeOffset*subsetSize; - uint16x8_t mask = LeafMask(feature, offset, subset); - sum = vaddq_s16(sum, vbslq_s16(mask, vdupq_n_s16(leaves[leafOffset + 0]), vdupq_n_s16(leaves[leafOffset + 1]))); - nodeOffset++; - leafOffset += 2; - } - result = vandq_u16(vcleq_s16(vdupq_n_s16(stage.threshold), sum), result); - int resultCount = ResultCount(result); - if (resultCount == 0) - return; - else if (resultCount == 1) - { - uint16_t _result[HA]; - vst1q_u16(_result, result); - for (int i = 0; i < HA; ++i) - { - if (_result[i]) - { - _result[i] = Base::Detect(hid, offset + i, i_stage + 1) > 0 ? 1 : 0; - break; - } - } - result = vld1q_u16(_result); - return; - } - } - } - - void DetectionLbpDetect16ip(const HidLbpCascade & hid, const Image & mask, const Rect & rect, Image & dst) - { - size_t width = rect.Width(); - size_t alignedWidth = Simd::AlignLo(width, HA); - size_t evenWidth = Simd::AlignLo(width, 2); - Buffer buffer(width); - for (ptrdiff_t row = rect.top; row < rect.bottom; row += 1) - { - size_t col = 0; - size_t offset = row * hid.isum.stride / sizeof(uint16_t) + rect.left; - UnpackMask16i(mask.data + row*mask.stride + rect.left, width, buffer.m, K8_01); - memset(buffer.d, 0, width * sizeof(uint16_t)); - for (; col < alignedWidth; col += HA) - { - uint16x8_t result = vld1q_u16(buffer.m + col); - if (ResultCount(result) == 0) - continue; - Detect(hid, offset + col, result); - vst1q_u16(buffer.d + col, result); - } - if (evenWidth > alignedWidth + 2) - { - col = evenWidth - HA; - uint16x8_t result = vld1q_u16(buffer.m + col); - if (ResultCount(result) != 0) - { - Detect(hid, offset + col, result); - vst1q_u16(buffer.d + col, result); - } - col += HA; - } - for (; col < width; ++col) - { - if (buffer.m[col] == 0) - continue; - buffer.d[col] = Base::Detect(hid, offset + col, 0) > 0 ? 
1 : 0; - } - PackResult16i(buffer.d, width, dst.data + row*dst.stride + rect.left); - } - } - - void DetectionLbpDetect16ip(const void * _hid, const uint8_t * mask, size_t maskStride, - ptrdiff_t left, ptrdiff_t top, ptrdiff_t right, ptrdiff_t bottom, uint8_t * dst, size_t dstStride) - { - const HidLbpCascade & hid = *(HidLbpCascade*)_hid; - return DetectionLbpDetect16ip(hid, - Image(hid.sum.width - 1, hid.sum.height - 1, maskStride, Image::Gray8, (uint8_t*)mask), - Rect(left, top, right, bottom), - Image(hid.sum.width - 1, hid.sum.height - 1, dstStride, Image::Gray8, dst).Ref()); - } - - - void DetectionLbpDetect16ii(const HidLbpCascade & hid, const Image & mask, const Rect & rect, Image & dst) - { - const size_t step = 2; - size_t width = rect.Width(); - size_t alignedWidth = Simd::AlignLo(width, A); - size_t evenWidth = Simd::AlignLo(width, 2); - - for (ptrdiff_t row = rect.top; row < rect.bottom; row += step) - { - size_t col = 0; - size_t offset = row * hid.isum.stride / sizeof(uint16_t) + rect.left / 2; - const uint8_t * m = mask.data + row*mask.stride + rect.left; - uint8_t * d = dst.data + row*dst.stride + rect.left; - for (; col < alignedWidth; col += A) - { - uint16x8_t result = vandq_u16((uint16x8_t)vld1q_u8(m + col), K16_0001); - if (ResultCount(result) == 0) - continue; - Detect(hid, offset + col / 2, result); - vst1q_u8(d + col, (uint8x16_t)result); - } - if (evenWidth > alignedWidth + 2) - { - col = evenWidth - A; - uint16x8_t result = vandq_u16((uint16x8_t)vld1q_u8(m + col), K16_0001); - if (ResultCount(result) != 0) - { - Detect(hid, offset + col / 2, result); - vst1q_u8(d + col, (uint8x16_t)result); - } - col += A; - } - for (; col < width; col += step) - { - if (mask.At(col + rect.left, row) == 0) - continue; - if (Base::Detect(hid, offset + col / 2, 0) > 0) - dst.At(col + rect.left, row) = 1; - } - } - } - - void DetectionLbpDetect16ii(const void * _hid, const uint8_t * mask, size_t maskStride, - ptrdiff_t left, ptrdiff_t top, ptrdiff_t right, ptrdiff_t bottom, uint8_t * dst, size_t dstStride) - { - const HidLbpCascade & hid = *(HidLbpCascade*)_hid; - return DetectionLbpDetect16ii(hid, - Image(hid.sum.width - 1, hid.sum.height - 1, maskStride, Image::Gray8, (uint8_t*)mask), - Rect(left, top, right, bottom), - Image(hid.sum.width - 1, hid.sum.height - 1, dstStride, Image::Gray8, dst).Ref()); - } - } -#endif// SIMD_NEON_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdNeonEdgeBackground.cpp b/src/3rd/Simd/Simd/SimdNeonEdgeBackground.cpp deleted file mode 100644 index 53d2f828..00000000 --- a/src/3rd/Simd/Simd/SimdNeonEdgeBackground.cpp +++ /dev/null @@ -1,294 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
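A note on the cascade kernels of the deleted SimdNeonDetection.cpp above: each stage ANDs its pass/fail comparison into result and then calls ResultCount, so a vector of candidate windows is abandoned as soon as every lane has been rejected, and a single surviving lane is finished on the scalar Base::Detect path. The deleted code keeps its lanes as 0 or 1 and can pairwise-sum them directly; an equivalent count for all-ones masks looks like this (NonZeroLanes is a hypothetical helper, not the library's API):

    #include <arm_neon.h>

    // Count lanes that are still alive; assumes each lane is 0 or 0xFFFFFFFF.
    int NonZeroLanes(uint32x4_t mask)
    {
        uint32x4_t ones = vshrq_n_u32(mask, 31); // all-ones lane -> 1, zero lane -> 0
        uint64x2_t pairs = vpaddlq_u32(ones);    // widening pairwise add
        return (int)(vgetq_lane_u64(pairs, 0) + vgetq_lane_u64(pairs, 1));
    }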
IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdBase.h" - -namespace Simd -{ -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - template SIMD_INLINE void EdgeBackgroundGrowRangeSlow(const uint8_t * value, uint8_t * background, uint8x16_t mask) - { - const uint8x16_t _value = Load(value); - const uint8x16_t _background = Load(background); - const uint8x16_t inc = vandq_u8(mask, vcgtq_u8(_value, _background)); - Store(background, vqaddq_u8(_background, inc)); - } - - template void EdgeBackgroundGrowRangeSlow(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * background, size_t backgroundStride) - { - assert(width >= A); - if (align) - { - assert(Aligned(value) && Aligned(valueStride)); - assert(Aligned(background) && Aligned(backgroundStride)); - } - - size_t alignedWidth = AlignLo(width, A); - uint8x16_t tailMask = ShiftLeft(K8_01, A - width + alignedWidth); - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - EdgeBackgroundGrowRangeSlow(value + col, background + col, K8_01); - if (alignedWidth != width) - EdgeBackgroundGrowRangeSlow(value + width - A, background + width - A, tailMask); - value += valueStride; - background += backgroundStride; - } - } - - void EdgeBackgroundGrowRangeSlow(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * background, size_t backgroundStride) - { - if (Aligned(value) && Aligned(valueStride) && Aligned(background) && Aligned(backgroundStride)) - EdgeBackgroundGrowRangeSlow(value, valueStride, width, height, background, backgroundStride); - else - EdgeBackgroundGrowRangeSlow(value, valueStride, width, height, background, backgroundStride); - } - - template SIMD_INLINE void EdgeBackgroundGrowRangeFast(const uint8_t * value, uint8_t * background) - { - const uint8x16_t _value = Load(value); - const uint8x16_t _background = Load(background); - Store(background, vmaxq_u8(_background, _value)); - } - - template void EdgeBackgroundGrowRangeFast(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * background, size_t backgroundStride) - { - assert(width >= A); - if (align) - { - assert(Aligned(value) && Aligned(valueStride)); - assert(Aligned(background) && Aligned(backgroundStride)); - } - - size_t alignedWidth = AlignLo(width, A); - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - EdgeBackgroundGrowRangeFast(value + col, background + col); - if (alignedWidth != width) - EdgeBackgroundGrowRangeFast(value + width - A, background + width - A); - value += valueStride; - background += backgroundStride; - } - } - - void EdgeBackgroundGrowRangeFast(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * background, size_t backgroundStride) - { - if (Aligned(value) && Aligned(valueStride) && Aligned(background) && Aligned(backgroundStride)) - EdgeBackgroundGrowRangeFast(value, valueStride, width, height, background, backgroundStride); - else - EdgeBackgroundGrowRangeFast(value, valueStride, width, height, background, backgroundStride); - } - - - template SIMD_INLINE void EdgeBackgroundIncrementCount(const uint8_t * value, - const uint8_t * 
backgroundValue, uint8_t * backgroundCount, size_t offset, uint8x16_t mask) - { - const uint8x16_t _value = Load(value + offset); - const uint8x16_t _backgroundValue = Load(backgroundValue + offset); - const uint8x16_t _backgroundCount = Load(backgroundCount + offset); - - const uint8x16_t inc = vandq_u8(mask, vcgtq_u8(_value, _backgroundValue)); - - Store(backgroundCount + offset, vqaddq_u8(_backgroundCount, inc)); - } - - template void EdgeBackgroundIncrementCount(const uint8_t * value, size_t valueStride, size_t width, size_t height, - const uint8_t * backgroundValue, size_t backgroundValueStride, uint8_t * backgroundCount, size_t backgroundCountStride) - { - assert(width >= A); - if (align) - { - assert(Aligned(value) && Aligned(valueStride)); - assert(Aligned(backgroundValue) && Aligned(backgroundValueStride) && Aligned(backgroundCount) && Aligned(backgroundCountStride)); - } - - size_t alignedWidth = AlignLo(width, A); - uint8x16_t tailMask = ShiftLeft(K8_01, A - width + alignedWidth); - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - EdgeBackgroundIncrementCount(value, backgroundValue, backgroundCount, col, K8_01); - if (alignedWidth != width) - EdgeBackgroundIncrementCount(value, backgroundValue, backgroundCount, width - A, tailMask); - value += valueStride; - backgroundValue += backgroundValueStride; - backgroundCount += backgroundCountStride; - } - } - - void EdgeBackgroundIncrementCount(const uint8_t * value, size_t valueStride, size_t width, size_t height, - const uint8_t * backgroundValue, size_t backgroundValueStride, uint8_t * backgroundCount, size_t backgroundCountStride) - { - if (Aligned(value) && Aligned(valueStride) && Aligned(backgroundValue) && Aligned(backgroundValueStride) && Aligned(backgroundCount) && Aligned(backgroundCountStride)) - EdgeBackgroundIncrementCount(value, valueStride, width, height, backgroundValue, backgroundValueStride, backgroundCount, backgroundCountStride); - else - EdgeBackgroundIncrementCount(value, valueStride, width, height, backgroundValue, backgroundValueStride, backgroundCount, backgroundCountStride); - } - - SIMD_INLINE uint8x16_t AdjustEdge(const uint8x16_t & count, const uint8x16_t & value, const uint8x16_t & mask, const uint8x16_t & threshold) - { - const uint8x16_t inc = vandq_u8(mask, vcgtq_u8(count, threshold)); - const uint8x16_t dec = vandq_u8(mask, vcltq_u8(count, threshold)); - return vqsubq_u8(vqaddq_u8(value, inc), dec); - } - - template SIMD_INLINE void EdgeBackgroundAdjustRange(uint8_t * backgroundCount, uint8_t * backgroundValue, - size_t offset, const uint8x16_t & threshold, const uint8x16_t & mask) - { - const uint8x16_t _backgroundCount = Load(backgroundCount + offset); - const uint8x16_t _backgroundValue = Load(backgroundValue + offset); - - Store(backgroundValue + offset, AdjustEdge(_backgroundCount, _backgroundValue, mask, threshold)); - Store(backgroundCount + offset, K8_00); - } - - template void EdgeBackgroundAdjustRange(uint8_t * backgroundCount, size_t backgroundCountStride, size_t width, size_t height, - uint8_t * backgroundValue, size_t backgroundValueStride, uint8_t threshold) - { - assert(width >= A); - if (align) - { - assert(Aligned(backgroundValue) && Aligned(backgroundValueStride) && Aligned(backgroundCount) && Aligned(backgroundCountStride)); - } - - const uint8x16_t _threshold = vld1q_dup_u8(&threshold); - size_t alignedWidth = AlignLo(width, A); - uint8x16_t tailMask = ShiftLeft(K8_01, A - width + alignedWidth); - for (size_t row = 0; row < height; 
++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - EdgeBackgroundAdjustRange(backgroundCount, backgroundValue, col, _threshold, K8_01); - if (alignedWidth != width) - EdgeBackgroundAdjustRange(backgroundCount, backgroundValue, width - A, _threshold, tailMask); - backgroundValue += backgroundValueStride; - backgroundCount += backgroundCountStride; - } - } - - void EdgeBackgroundAdjustRange(uint8_t * backgroundCount, size_t backgroundCountStride, size_t width, size_t height, - uint8_t * backgroundValue, size_t backgroundValueStride, uint8_t threshold) - { - if (Aligned(backgroundValue) && Aligned(backgroundValueStride) && - Aligned(backgroundCount) && Aligned(backgroundCountStride)) - EdgeBackgroundAdjustRange(backgroundCount, backgroundCountStride, width, height, backgroundValue, backgroundValueStride, threshold); - else - EdgeBackgroundAdjustRange(backgroundCount, backgroundCountStride, width, height, backgroundValue, backgroundValueStride, threshold); - } - - template SIMD_INLINE void EdgeBackgroundAdjustRangeMasked(uint8_t * backgroundCount, uint8_t * backgroundValue, - const uint8_t * mask, size_t offset, const uint8x16_t & threshold, const uint8x16_t & tailMask) - { - const uint8x16_t _mask = Load(mask + offset); - EdgeBackgroundAdjustRange(backgroundCount, backgroundValue, offset, threshold, vandq_u8(_mask, tailMask)); - } - - template void EdgeBackgroundAdjustRangeMasked(uint8_t * backgroundCount, size_t backgroundCountStride, size_t width, size_t height, - uint8_t * backgroundValue, size_t backgroundValueStride, uint8_t threshold, const uint8_t * mask, size_t maskStride) - { - assert(width >= A); - if (align) - { - assert(Aligned(backgroundValue) && Aligned(backgroundValueStride)); - assert(Aligned(backgroundCount) && Aligned(backgroundCountStride)); - assert(Aligned(mask) && Aligned(maskStride)); - } - - const uint8x16_t _threshold = vld1q_dup_u8(&threshold); - size_t alignedWidth = AlignLo(width, A); - uint8x16_t tailMask = ShiftLeft(K8_01, A - width + alignedWidth); - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - EdgeBackgroundAdjustRangeMasked(backgroundCount, backgroundValue, mask, col, _threshold, K8_01); - if (alignedWidth != width) - EdgeBackgroundAdjustRangeMasked(backgroundCount, backgroundValue, mask, width - A, _threshold, tailMask); - backgroundValue += backgroundValueStride; - backgroundCount += backgroundCountStride; - mask += maskStride; - } - } - - void EdgeBackgroundAdjustRangeMasked(uint8_t * backgroundCount, size_t backgroundCountStride, size_t width, size_t height, - uint8_t * backgroundValue, size_t backgroundValueStride, uint8_t threshold, const uint8_t * mask, size_t maskStride) - { - if (Aligned(backgroundValue) && Aligned(backgroundValueStride) && Aligned(backgroundCount) && Aligned(backgroundCountStride) && - Aligned(mask) && Aligned(maskStride)) - EdgeBackgroundAdjustRangeMasked(backgroundCount, backgroundCountStride, width, height, backgroundValue, backgroundValueStride, - threshold, mask, maskStride); - else - EdgeBackgroundAdjustRangeMasked(backgroundCount, backgroundCountStride, width, height, backgroundValue, backgroundValueStride, - threshold, mask, maskStride); - } - - template SIMD_INLINE void EdgeBackgroundShiftRangeMasked(const uint8_t * value, uint8_t * background, const uint8_t * mask, size_t offset) - { - const uint8x16_t _value = Load(value + offset); - const uint8x16_t _background = Load(background + offset); - const uint8x16_t _mask = Load(mask + offset); - 
Store(background + offset, vbslq_u8(_mask, _value, _background)); - } - - template void EdgeBackgroundShiftRangeMasked(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * background, size_t backgroundStride, const uint8_t * mask, size_t maskStride) - { - assert(width >= A); - if (align) - { - assert(Aligned(value) && Aligned(valueStride)); - assert(Aligned(background) && Aligned(backgroundStride)); - assert(Aligned(mask) && Aligned(maskStride)); - } - - size_t alignedWidth = AlignLo(width, A); - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - EdgeBackgroundShiftRangeMasked(value, background, mask, col); - if (alignedWidth != width) - EdgeBackgroundShiftRangeMasked(value, background, mask, width - A); - value += valueStride; - background += backgroundStride; - mask += maskStride; - } - } - - void EdgeBackgroundShiftRangeMasked(const uint8_t * value, size_t valueStride, size_t width, size_t height, - uint8_t * background, size_t backgroundStride, const uint8_t * mask, size_t maskStride) - { - if (Aligned(value) && Aligned(valueStride) && Aligned(background) && Aligned(backgroundStride) && Aligned(mask) && Aligned(maskStride)) - EdgeBackgroundShiftRangeMasked(value, valueStride, width, height, background, backgroundStride, mask, maskStride); - else - EdgeBackgroundShiftRangeMasked(value, valueStride, width, height, background, backgroundStride, mask, maskStride); - } - } -#endif// SIMD_NEON_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdNeonFill.cpp b/src/3rd/Simd/Simd/SimdNeonFill.cpp deleted file mode 100644 index e11f843e..00000000 --- a/src/3rd/Simd/Simd/SimdNeonFill.cpp +++ /dev/null @@ -1,188 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2019 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
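The grow and increment kernels in the deleted SimdNeonEdgeBackground.cpp above all reduce to the same masked, saturating update: a byte compare yields 0x00 or 0xFF per lane, the mask is ANDed with a constant 1, and vqaddq_u8 adds with saturation so the 8-bit counters stick at 255 instead of wrapping. A minimal sketch (the function name is illustrative):

    #include <arm_neon.h>

    uint8x16_t GrowBackground(uint8x16_t value, uint8x16_t background)
    {
        uint8x16_t gt  = vcgtq_u8(value, background); // 0xFF where value > background
        uint8x16_t inc = vandq_u8(gt, vdupq_n_u8(1)); // 1 where growing, else 0
        return vqaddq_u8(background, inc);            // saturating add, caps at 255
    }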
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdBase.h" - -namespace Simd -{ -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - template <bool align> void FillBgr(uint8_t * dst, size_t stride, size_t width, size_t height, uint8_t blue, uint8_t green, uint8_t red) - { - size_t size = width * 3; - size_t step = A * 3; - size_t alignedSize = AlignLo(width, A) * 3; - - uint8x16x3_t bgr; - bgr.val[0] = vdupq_n_u8(blue); - bgr.val[1] = vdupq_n_u8(green); - bgr.val[2] = vdupq_n_u8(red); - - for (size_t row = 0; row < height; ++row) - { - size_t offset = 0; - for (; offset < alignedSize; offset += step) - Store3<align>(dst + offset, bgr); - if (offset < size) - Store3<false>(dst + size - step, bgr); - dst += stride; - } - } - - void FillBgr(uint8_t * dst, size_t stride, size_t width, size_t height, uint8_t blue, uint8_t green, uint8_t red) - { - if (Aligned(dst) && Aligned(stride)) - FillBgr<true>(dst, stride, width, height, blue, green, red); - else - FillBgr<false>(dst, stride, width, height, blue, green, red); - } - - template <bool align> void FillBgra(uint8_t * dst, size_t stride, size_t width, size_t height, uint8_t blue, uint8_t green, uint8_t red, uint8_t alpha) - { - size_t size = width * 4; - size_t alignedSize = AlignLo(width, A) * 4; - - uint8x16x4_t bgra; - bgra.val[0] = vdupq_n_u8(blue); - bgra.val[1] = vdupq_n_u8(green); - bgra.val[2] = vdupq_n_u8(red); - bgra.val[3] = vdupq_n_u8(alpha); - - for (size_t row = 0; row < height; ++row) - { - size_t offset = 0; - for (; offset < alignedSize; offset += QA) - Store4<align>(dst + offset, bgra); - if (offset < size) - Store4<false>(dst + size - QA, bgra); - dst += stride; - } - } - - void FillBgra(uint8_t * dst, size_t stride, size_t width, size_t height, uint8_t blue, uint8_t green, uint8_t red, uint8_t alpha) - { - if (Aligned(dst) && Aligned(stride)) - FillBgra<true>(dst, stride, width, height, blue, green, red, alpha); - else - FillBgra<false>(dst, stride, width, height, blue, green, red, alpha); - } - - template <bool align> void Fill32f(float * dst, size_t size, const float * value) - { - if (value == 0 || value[0] == 0) - memset(dst, 0, size * sizeof(float)); - else - { - float v = value[0]; - const float * nose = (float*)AlignHi(dst, F * sizeof(float)); - for (; dst < nose && size; --size) - *dst++ = v; - const float * end = dst + size; - const float * endF = dst + AlignLo(size, F); - const float * endQF = dst + AlignLo(size, QF); - float32x4_t _v = vdupq_n_f32(v); - for (; dst < endQF; dst += QF) - { - Store<align>(dst + 0 * F, _v); - Store<align>(dst + 1 * F, _v); - Store<align>(dst + 2 * F, _v); - Store<align>(dst + 3 * F, _v); - } - for (; dst < endF; dst += F) - Store<align>(dst, _v); - for (; dst < end;) - *dst++ = v; - } - } - - void Fill32f(float * dst, size_t size, const float * value) - { - if (Aligned(dst)) - Fill32f<true>(dst, size, value); - else - Fill32f<false>(dst, size, value); - } - - template <bool align> void FillPixel(uint8_t * dst, size_t stride, size_t width, size_t height, const uint8x16_t & pixel) - { - assert(width >= A); - if (align) - assert(Aligned(dst) && Aligned(stride)); - - size_t fullAlignedWidth = AlignLo(width, QA); - size_t alignedWidth = AlignLo(width, A); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < fullAlignedWidth; col += QA) - { - Store<align>((dst + col) + 0 * A, pixel); - Store<align>((dst + col) + 1 * A, pixel); - Store<align>((dst + col) + 2 * A, pixel); - Store<align>((dst + col) + 3 * A, pixel); - } - for (; col < alignedWidth; col += A) - Store<align>((dst + col), pixel); - if (col < width) - Store<false>((dst + width - A), pixel); - dst += stride; - } - } - - template <bool align> void FillPixel(uint8_t * dst, size_t
stride, size_t width, size_t height, const uint8_t * pixel, size_t pixelSize) - { - if (pixelSize == 3) - FillBgr<align>(dst, stride, width, height, pixel[0], pixel[1], pixel[2]); - else if (pixelSize == 1) - Base::Fill(dst, stride, width, height, 1, pixel[0]); - else - { - uint8x16_t _pixel; - switch (pixelSize) - { - case 2: - _pixel = (uint8x16_t)vdupq_n_u16(*(uint16_t*)pixel); - break; - case 4: - _pixel = (uint8x16_t)vdupq_n_u32(*(uint32_t*)pixel); - break; - default: - assert(0); - } - FillPixel<align>(dst, stride, width*pixelSize, height, _pixel); - } - } - - void FillPixel(uint8_t * dst, size_t stride, size_t width, size_t height, const uint8_t * pixel, size_t pixelSize) - { - if (Aligned(dst) && Aligned(stride)) - FillPixel<true>(dst, stride, width, height, pixel, pixelSize); - else - FillPixel<false>(dst, stride, width, height, pixel, pixelSize); - } - } -#endif// SIMD_NEON_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdNeonFloat16.cpp b/src/3rd/Simd/Simd/SimdNeonFloat16.cpp deleted file mode 100644 index 6a11b29d..00000000 --- a/src/3rd/Simd/Simd/SimdNeonFloat16.cpp +++ /dev/null @@ -1,459 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE.
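The deleted SimdNeonFloat16.cpp that begins above (compiled only when SIMD_NEON_FP16_ENABLE is defined) is mostly plumbing around the hardware converters vcvt_f16_f32 and vcvt_f32_f16. A minimal sketch of the down-conversion, including the overlapping-tail trick these files use everywhere (the final partial vector is handled by re-converting the last full one); the function name is illustrative, and building it assumes fp16 support such as -mfpu=neon-fp16 on 32-bit ARM:

    #include <arm_neon.h>
    #include <cassert>
    #include <cstddef>
    #include <cstdint>

    void Float32ToFloat16Block(const float* src, uint16_t* dst, size_t size)
    {
        assert(size >= 4); // same precondition as the deleted code (size >= F)
        size_t aligned = size & ~size_t(3);
        for (size_t i = 0; i < aligned; i += 4)
            vst1_u16(dst + i, vreinterpret_u16_f16(vcvt_f16_f32(vld1q_f32(src + i))));
        if (aligned != size) // overlapping tail: re-convert the last full vector
            vst1_u16(dst + size - 4, vreinterpret_u16_f16(vcvt_f16_f32(vld1q_f32(src + size - 4))));
    }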
-*/ -#include "Simd/SimdStore.h" -#include "Simd/SimdMemory.h" -#include "Simd/SimdExtract.h" -#include "Simd/SimdCpu.h" - -namespace Simd -{ -#if defined(SIMD_NEON_ENABLE) && defined(SIMD_NEON_FP16_ENABLE) - namespace Neon - { - template SIMD_INLINE void Float32ToFloat16(const float * src, uint16_t * dst) - { - Store(dst, (uint16x4_t)vcvt_f16_f32(Load(src))); - } - - template void Float32ToFloat16(const float * src, size_t size, uint16_t * dst) - { - assert(size >= F); - if (align) - assert(Aligned(src) && Aligned(dst)); - - size_t fullAlignedSize = Simd::AlignLo(size, QF); - size_t partialAlignedSize = Simd::AlignLo(size, F); - - size_t i = 0; - for (; i < fullAlignedSize; i += QF) - { - Float32ToFloat16(src + i + F * 0, dst + i + F * 0); - Float32ToFloat16(src + i + F * 1, dst + i + F * 1); - Float32ToFloat16(src + i + F * 2, dst + i + F * 2); - Float32ToFloat16(src + i + F * 3, dst + i + F * 3); - } - for (; i < partialAlignedSize; i += F) - Float32ToFloat16(src + i, dst + i); - if (partialAlignedSize != size) - Float32ToFloat16(src + size - F, dst + size - F); - } - - void Float32ToFloat16(const float * src, size_t size, uint16_t * dst) - { - if (Aligned(src) && Aligned(dst)) - Float32ToFloat16(src, size, dst); - else - Float32ToFloat16(src, size, dst); - } - - template SIMD_INLINE void Float16ToFloat32(const uint16_t * src, float * dst) - { - Store(dst, vcvt_f32_f16((float16x4_t)LoadHalf(src))); - } - - template void Float16ToFloat32(const uint16_t * src, size_t size, float * dst) - { - assert(size >= F); - if (align) - assert(Aligned(src) && Aligned(dst)); - - size_t fullAlignedSize = Simd::AlignLo(size, QF); - size_t partialAlignedSize = Simd::AlignLo(size, F); - - size_t i = 0; - for (; i < fullAlignedSize; i += QF) - { - Float16ToFloat32(src + i + F * 0, dst + i + F * 0); - Float16ToFloat32(src + i + F * 1, dst + i + F * 1); - Float16ToFloat32(src + i + F * 2, dst + i + F * 2); - Float16ToFloat32(src + i + F * 3, dst + i + F * 3); - } - for (; i < partialAlignedSize; i += F) - Float16ToFloat32(src + i, dst + i); - if (partialAlignedSize != size) - Float16ToFloat32(src + size - F, dst + size - F); - } - - void Float16ToFloat32(const uint16_t * src, size_t size, float * dst) - { - if (Aligned(src) && Aligned(dst)) - Float16ToFloat32(src, size, dst); - else - Float16ToFloat32(src, size, dst); - } - - template SIMD_INLINE void SquaredDifferenceSum16f(const uint16_t * a, const uint16_t * b, size_t offset, float32x4_t & sum) - { - float32x4_t _a = vcvt_f32_f16((float16x4_t)LoadHalf(a + offset)); - float32x4_t _b = vcvt_f32_f16((float16x4_t)LoadHalf(b + offset)); - float32x4_t _d = vsubq_f32(_a, _b); - sum = vmlaq_f32(sum, _d, _d); - } - - template SIMD_INLINE void SquaredDifferenceSum16f(const uint16_t * a, const uint16_t * b, size_t size, float * sum) - { - assert(size >= F); - if (align) - assert(Aligned(a) && Aligned(b)); - - size_t partialAlignedSize = AlignLo(size, F); - size_t fullAlignedSize = AlignLo(size, DF); - size_t i = 0; - float32x4_t sums[2] = { vdupq_n_f32(0), vdupq_n_f32(0) }; - if (fullAlignedSize) - { - for (; i < fullAlignedSize; i += DF) - { - SquaredDifferenceSum16f(a, b, i + F * 0, sums[0]); - SquaredDifferenceSum16f(a, b, i + F * 1, sums[1]); - } - sums[0] = vaddq_f32(sums[0], sums[1]); - } - for (; i < partialAlignedSize; i += F) - SquaredDifferenceSum16f(a, b, i, sums[0]); - if (partialAlignedSize != size) - { - float32x4_t tailMask = RightNotZero32f(size - partialAlignedSize); - float32x4_t _a = vcvt_f32_f16((float16x4_t)LoadHalf(a + size - F)); - float32x4_t 
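// (On Float32ToFloat16/Float16ToFloat32 above: both are pure element-wise
// vcvt conversions over blocks of F = 4 lanes, unrolled 4x to QF, and they
// reuse the same overlapping-tail trick -- when size is not a multiple of F
// the last full vector is simply converted again at src + size - F, which is
// harmless for an element-wise map; assert(size >= F) guards that overlap.)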
_b = vcvt_f32_f16((float16x4_t)LoadHalf(b + size - F)); - float32x4_t _d = And(vsubq_f32(_a, _b), tailMask); - sums[0] = vaddq_f32(sums[0], vmulq_f32(_d, _d)); - } - *sum = ExtractSum32f(sums[0]); - } - - void SquaredDifferenceSum16f(const uint16_t * a, const uint16_t * b, size_t size, float * sum) - { - if (Aligned(a) && Aligned(b)) - SquaredDifferenceSum16f(a, b, size, sum); - else - SquaredDifferenceSum16f(a, b, size, sum); - } - - template void CosineDistance16f(const uint16_t * a, const uint16_t * b, size_t size, float * distance) - { - if (align) - assert(Aligned(a) && Aligned(b)); - - size_t partialAlignedSize = AlignLo(size, F); - size_t fullAlignedSize = AlignLo(size, DF); - size_t i = 0; - float32x4_t _aa[2] = { vdupq_n_f32(0), vdupq_n_f32(0) }; - float32x4_t _ab[2] = { vdupq_n_f32(0), vdupq_n_f32(0) }; - float32x4_t _bb[2] = { vdupq_n_f32(0), vdupq_n_f32(0) }; - if (fullAlignedSize) - { - for (; i < fullAlignedSize; i += DF) - { - float32x4_t a0 = vcvt_f32_f16((float16x4_t)LoadHalf(a + i + 0)); - float32x4_t b0 = vcvt_f32_f16((float16x4_t)LoadHalf(b + i + 0)); - _aa[0] = vmlaq_f32(_aa[0], a0, a0); - _ab[0] = vmlaq_f32(_ab[0], a0, b0); - _bb[0] = vmlaq_f32(_bb[0], b0, b0); - float32x4_t a1 = vcvt_f32_f16((float16x4_t)LoadHalf(a + i + F)); - float32x4_t b1 = vcvt_f32_f16((float16x4_t)LoadHalf(b + i + F)); - _aa[1] = vmlaq_f32(_aa[1], a1, a1); - _ab[1] = vmlaq_f32(_ab[1], a1, b1); - _bb[1] = vmlaq_f32(_bb[1], b1, b1); - } - _aa[0] = vaddq_f32(_aa[0], _aa[1]); - _ab[0] = vaddq_f32(_ab[0], _ab[1]); - _bb[0] = vaddq_f32(_bb[0], _bb[1]); - } - for (; i < partialAlignedSize; i += F) - { - float32x4_t a0 = vcvt_f32_f16((float16x4_t)LoadHalf(a + i + 0)); - float32x4_t b0 = vcvt_f32_f16((float16x4_t)LoadHalf(b + i + 0)); - _aa[0] = vmlaq_f32(_aa[0], a0, a0); - _ab[0] = vmlaq_f32(_ab[0], a0, b0); - _bb[0] = vmlaq_f32(_bb[0], b0, b0); - } - if (partialAlignedSize != size) - { - float32x4_t tailMask = RightNotZero32f(size - partialAlignedSize); - float32x4_t a0 = And(vcvt_f32_f16((float16x4_t)LoadHalf(a + i + 0)), tailMask); - float32x4_t b0 = And(vcvt_f32_f16((float16x4_t)LoadHalf(b + i + 0)), tailMask); - _aa[0] = vmlaq_f32(_aa[0], a0, a0); - _ab[0] = vmlaq_f32(_ab[0], a0, b0); - _bb[0] = vmlaq_f32(_bb[0], b0, b0); - } - float aa = ExtractSum32f(_aa[0]), ab = ExtractSum32f(_ab[0]), bb = ExtractSum32f(_bb[0]); - *distance = 1.0f - ab / ::sqrt(aa*bb); - } - - void CosineDistance16f(const uint16_t * a, const uint16_t * b, size_t size, float * distance) - { - if (Aligned(a) && Aligned(b)) - CosineDistance16f(a, b, size, distance); - else - CosineDistance16f(a, b, size, distance); - } - - SIMD_INLINE float32x4_t Tail(size_t tail) - { - const int32_t mask[DF] = { 0, 0, 0, 0, -1, -1, -1, -1 }; - return Load((float*)(mask + tail)); - } - - static void Squares(size_t M, size_t K, const uint16_t * const * A, float * squares) - { - size_t M4 = AlignLo(M, 4); - size_t KF = AlignLo(K, F); - float32x4_t mask = Tail(K - KF); - size_t i = 0; - for (; i < M4; i += 4) - { - float32x4_t sums[4] = { vdupq_n_f32(0.0f), vdupq_n_f32(0.0f), vdupq_n_f32(0.0f), vdupq_n_f32(0.0f) }; - for (size_t k = 0; k < KF; k += F) - { - float32x4_t a0 = vcvt_f32_f16((float16x4_t)LoadHalf((A[i + 0] + k))); - float32x4_t a1 = vcvt_f32_f16((float16x4_t)LoadHalf((A[i + 1] + k))); - float32x4_t a2 = vcvt_f32_f16((float16x4_t)LoadHalf((A[i + 2] + k))); - float32x4_t a3 = vcvt_f32_f16((float16x4_t)LoadHalf((A[i + 3] + k))); - sums[0] = vmlaq_f32(sums[0], a0, a0); - sums[1] = vmlaq_f32(sums[1], a1, a1); - sums[2] = vmlaq_f32(sums[2], 
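// The half-precision similarity kernels above reduce to three dot products
// accumulated in fp32 -- aa = a.a, ab = a.b, bb = b.b -- kept in two
// independent accumulator pairs so consecutive vmlaq_f32 chains can overlap
// instead of serializing on one register; the result is the cosine distance
// 1 - ab / sqrt(aa*bb). A plain scalar reference (hypothetical helper, fp32
// inputs) that the vector path must reproduce up to rounding:
//
//   static float CosineDistanceRef(const float* a, const float* b, size_t n)
//   {
//       float aa = 0, ab = 0, bb = 0;
//       for (size_t i = 0; i < n; ++i)
//       {
//           aa += a[i] * a[i];   // squared norm of a
//           ab += a[i] * b[i];   // dot product
//           bb += b[i] * b[i];   // squared norm of b
//       }
//       return 1.0f - ab / std::sqrt(aa * bb);   // needs <cmath>
//   }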
a2, a2); - sums[3] = vmlaq_f32(sums[3], a3, a3); - } - if (KF < K) - { - size_t k = K - F; - float32x4_t a0 = And(mask, vcvt_f32_f16((float16x4_t)LoadHalf((A[i + 0] + k)))); - float32x4_t a1 = And(mask, vcvt_f32_f16((float16x4_t)LoadHalf((A[i + 1] + k)))); - float32x4_t a2 = And(mask, vcvt_f32_f16((float16x4_t)LoadHalf((A[i + 2] + k)))); - float32x4_t a3 = And(mask, vcvt_f32_f16((float16x4_t)LoadHalf((A[i + 3] + k)))); - sums[0] = vmlaq_f32(sums[0], a0, a0); - sums[1] = vmlaq_f32(sums[1], a1, a1); - sums[2] = vmlaq_f32(sums[2], a2, a2); - sums[3] = vmlaq_f32(sums[3], a3, a3); - } - Store(squares + i, Extract4Sums(sums)); - } - for (; i < M; i += 1) - { - float32x4_t sum = vdupq_n_f32(0.0f); - for (size_t k = 0; k < KF; k += F) - { - float32x4_t a = vcvt_f32_f16((float16x4_t)LoadHalf((A[i] + k))); - sum = vmlaq_f32(sum, a, a); - } - if (KF < K) - { - size_t k = K - F; - float32x4_t a = And(mask, vcvt_f32_f16((float16x4_t)LoadHalf((A[i] + k)))); - sum = vmlaq_f32(sum, a, a); - } - squares[i] = ExtractSum32f(sum); - } - } - - static void MicroCosineDistances3x4(size_t K, const uint16_t * const * A, const uint16_t * const * B, const float * aa, const float * bb, float * distances, size_t stride) - { - size_t K4 = K & (~3); - float32x4_t c00 = vdupq_n_f32(0.0f); - float32x4_t c01 = vdupq_n_f32(0.0f); - float32x4_t c02 = vdupq_n_f32(0.0f); - float32x4_t c03 = vdupq_n_f32(0.0f); - float32x4_t c10 = vdupq_n_f32(0.0f); - float32x4_t c11 = vdupq_n_f32(0.0f); - float32x4_t c12 = vdupq_n_f32(0.0f); - float32x4_t c13 = vdupq_n_f32(0.0f); - float32x4_t c20 = vdupq_n_f32(0.0f); - float32x4_t c21 = vdupq_n_f32(0.0f); - float32x4_t c22 = vdupq_n_f32(0.0f); - float32x4_t c23 = vdupq_n_f32(0.0f); - float32x4_t a0, a1, a2, b0; - for (size_t k = 0; k < K4; k += 4) - { - a0 = vcvt_f32_f16((float16x4_t)LoadHalf((A[0] + k))); - a1 = vcvt_f32_f16((float16x4_t)LoadHalf((A[1] + k))); - a2 = vcvt_f32_f16((float16x4_t)LoadHalf((A[2] + k))); - b0 = vcvt_f32_f16((float16x4_t)LoadHalf((B[0] + k))); - c00 = vmlaq_f32(c00, a0, b0); - c10 = vmlaq_f32(c10, a1, b0); - c20 = vmlaq_f32(c20, a2, b0); - b0 = vcvt_f32_f16((float16x4_t)LoadHalf((B[1] + k))); - c01 = vmlaq_f32(c01, a0, b0); - c11 = vmlaq_f32(c11, a1, b0); - c21 = vmlaq_f32(c21, a2, b0); - b0 = vcvt_f32_f16((float16x4_t)LoadHalf((B[2] + k))); - c02 = vmlaq_f32(c02, a0, b0); - c12 = vmlaq_f32(c12, a1, b0); - c22 = vmlaq_f32(c22, a2, b0); - b0 = vcvt_f32_f16((float16x4_t)LoadHalf((B[3] + k))); - c03 = vmlaq_f32(c03, a0, b0); - c13 = vmlaq_f32(c13, a1, b0); - c23 = vmlaq_f32(c23, a2, b0); - } - if (K4 < K) - { - size_t k = K - 4; - float32x4_t tail = Tail(K - K4); - a0 = And(tail, vcvt_f32_f16((float16x4_t)LoadHalf((A[0] + k)))); - a1 = And(tail, vcvt_f32_f16((float16x4_t)LoadHalf((A[1] + k)))); - a2 = And(tail, vcvt_f32_f16((float16x4_t)LoadHalf((A[2] + k)))); - b0 = vcvt_f32_f16((float16x4_t)LoadHalf((B[0] + k))); - c00 = vmlaq_f32(c00, a0, b0); - c10 = vmlaq_f32(c10, a1, b0); - c20 = vmlaq_f32(c20, a2, b0); - b0 = vcvt_f32_f16((float16x4_t)LoadHalf((B[1] + k))); - c01 = vmlaq_f32(c01, a0, b0); - c11 = vmlaq_f32(c11, a1, b0); - c21 = vmlaq_f32(c21, a2, b0); - b0 = vcvt_f32_f16((float16x4_t)LoadHalf((B[2] + k))); - c02 = vmlaq_f32(c02, a0, b0); - c12 = vmlaq_f32(c12, a1, b0); - c22 = vmlaq_f32(c22, a2, b0); - b0 = vcvt_f32_f16((float16x4_t)LoadHalf((B[3] + k))); - c03 = vmlaq_f32(c03, a0, b0); - c13 = vmlaq_f32(c13, a1, b0); - c23 = vmlaq_f32(c23, a2, b0); - } - float32x4_t _bb = Load(bb); - float32x4_t _1 = vdupq_n_f32(1.0f); - Store(distances + 0 * stride, 
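// The three stores below finish the 3x4 tile: Extract4Sums reduces four
// accumulators to their horizontal sums in one float32x4_t of dot products,
// ReciprocalSqrt<1> approximates 1/sqrt(aa*bb) (a vrsqrteq_f32 estimate plus
// one refinement step), and vmlsq_f32 forms 1.0f - rsqrt * ab, so four
// distances per row are produced without a single division or square root.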
vmlsq_f32(_1, ReciprocalSqrt<1>(vmulq_f32(_bb, vdupq_n_f32(aa[0]))), Extract4Sums(c00, c01, c02, c03))); - Store(distances + 1 * stride, vmlsq_f32(_1, ReciprocalSqrt<1>(vmulq_f32(_bb, vdupq_n_f32(aa[1]))), Extract4Sums(c10, c11, c12, c13))); - Store(distances + 2 * stride, vmlsq_f32(_1, ReciprocalSqrt<1>(vmulq_f32(_bb, vdupq_n_f32(aa[2]))), Extract4Sums(c20, c21, c22, c23))); - } - - static void MicroCosineDistances3x1(size_t K, const uint16_t * const * A, const uint16_t * const * B, const float * aa, const float * bb, float * distances, size_t stride) - { - size_t K4 = K & (~3); - float32x4_t c00 = vdupq_n_f32(0.0f); - float32x4_t c10 = vdupq_n_f32(0.0f); - float32x4_t c20 = vdupq_n_f32(0.0f); - float32x4_t a0, b0; - for (size_t k = 0; k < K4; k += 4) - { - b0 = vcvt_f32_f16((float16x4_t)LoadHalf((B[0] + k))); - a0 = vcvt_f32_f16((float16x4_t)LoadHalf((A[0] + k))); - c00 = vmlaq_f32(c00, a0, b0); - a0 = vcvt_f32_f16((float16x4_t)LoadHalf((A[1] + k))); - c10 = vmlaq_f32(c10, a0, b0); - a0 = vcvt_f32_f16((float16x4_t)LoadHalf((A[2] + k))); - c20 = vmlaq_f32(c20, a0, b0); - } - if (K4 < K) - { - size_t k = K - 4; - float32x4_t tail = Tail(K - K4); - b0 = And(tail, vcvt_f32_f16((float16x4_t)LoadHalf((B[0] + k)))); - a0 = vcvt_f32_f16((float16x4_t)LoadHalf((A[0] + k))); - c00 = vmlaq_f32(c00, a0, b0); - a0 = vcvt_f32_f16((float16x4_t)LoadHalf((A[1] + k))); - c10 = vmlaq_f32(c10, a0, b0); - a0 = vcvt_f32_f16((float16x4_t)LoadHalf((A[2] + k))); - c20 = vmlaq_f32(c20, a0, b0); - } - distances[0 * stride] = 1.0f - ExtractSum32f(c00) / sqrt(bb[0] * aa[0]); - distances[1 * stride] = 1.0f - ExtractSum32f(c10) / sqrt(bb[0] * aa[1]); - distances[2 * stride] = 1.0f - ExtractSum32f(c20) / sqrt(bb[0] * aa[2]); - } - - static void MicroCosineDistances1x4(size_t K, const uint16_t * const * A, const uint16_t * const * B, const float * aa, const float * bb, float * distances, size_t stride) - { - size_t K4 = K & (~3); - float32x4_t c00 = vdupq_n_f32(0.0f); - float32x4_t c01 = vdupq_n_f32(0.0f); - float32x4_t c02 = vdupq_n_f32(0.0f); - float32x4_t c03 = vdupq_n_f32(0.0f); - float32x4_t a0, b0; - for (size_t k = 0; k < K4; k += 4) - { - a0 = vcvt_f32_f16((float16x4_t)LoadHalf((A[0] + k))); - b0 = vcvt_f32_f16((float16x4_t)LoadHalf((B[0] + k))); - c00 = vmlaq_f32(c00, a0, b0); - b0 = vcvt_f32_f16((float16x4_t)LoadHalf((B[1] + k))); - c01 = vmlaq_f32(c01, a0, b0); - b0 = vcvt_f32_f16((float16x4_t)LoadHalf((B[2] + k))); - c02 = vmlaq_f32(c02, a0, b0); - b0 = vcvt_f32_f16((float16x4_t)LoadHalf((B[3] + k))); - c03 = vmlaq_f32(c03, a0, b0); - } - if (K4 < K) - { - size_t k = K - 4; - float32x4_t tail = Tail(K - K4); - a0 = And(tail, vcvt_f32_f16((float16x4_t)LoadHalf((A[0] + k)))); - b0 = vcvt_f32_f16((float16x4_t)LoadHalf((B[0] + k))); - c00 = vmlaq_f32(c00, a0, b0); - b0 = vcvt_f32_f16((float16x4_t)LoadHalf((B[1] + k))); - c01 = vmlaq_f32(c01, a0, b0); - b0 = vcvt_f32_f16((float16x4_t)LoadHalf((B[2] + k))); - c02 = vmlaq_f32(c02, a0, b0); - b0 = vcvt_f32_f16((float16x4_t)LoadHalf((B[3] + k))); - c03 = vmlaq_f32(c03, a0, b0); - } - float32x4_t _bb = Load(bb); - float32x4_t _1 = vdupq_n_f32(1.0f); - Store(distances + 0 * stride, vmlsq_f32(_1, ReciprocalSqrt<1>(vmulq_f32(_bb, vdupq_n_f32(aa[0]))), Extract4Sums(c00, c01, c02, c03))); - } - - static void MacroCosineDistances(size_t M, size_t N, size_t K, const uint16_t * const * A, const uint16_t * const * B, const float * aa, const float * bb, float * distances, size_t stride) - { - size_t M3 = AlignLoAny(M, 3); - size_t N4 = AlignLo(N, 4); - size_t i = 0; - for (; i < 
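// MacroCosineDistances walks the distance matrix in 3-row by 4-column
// register tiles: the 3x4 micro-kernel above reuses each loaded A vector
// across four B columns and each B vector across three A rows, roughly
// halving fp16 loads versus a 1x1 loop; leftover columns fall back to the
// 3x1 kernel, and leftover rows to the 1x4 kernel or plain CosineDistance16f.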
M3; i += 3) - { - size_t j = 0; - for (; j < N4; j += 4) - MicroCosineDistances3x4(K, A + i, B + j, aa + i, bb + j, distances + j, stride); - for (; j < N; j += 1) - MicroCosineDistances3x1(K, A + i, B + j, aa + i, bb + j, distances + j, stride); - distances += 3 * stride; - } - for (; i < M; i++) - { - size_t j = 0; - for (; j < N4; j += 4) - MicroCosineDistances1x4(K, A + i, B + j, aa + i, bb + j, distances + j, stride); - for (; j < N; j += 1) - CosineDistance16f(A[i], B[j], K, distances + j); - distances += 1 * stride; - } - } - - void CosineDistancesMxNa16f(size_t M, size_t N, size_t K, const uint16_t * const * A, const uint16_t * const * B, float * distances) - { - const size_t L2 = Base::AlgCacheL2(); - size_t mN = AlignLoAny(L2 / 2 / K, 4); - size_t mM = AlignLoAny(L2 / 2 / K, 3); - Array32f aa(M), bb(N); - for (size_t i = 0; i < M; i += mM) - { - size_t dM = Simd::Min(M, i + mM) - i; - Squares(dM, K, A + i, aa.data + i); - for (size_t j = 0; j < N; j += mN) - { - size_t dN = Simd::Min(N, j + mN) - j; - if (i == 0) - Squares(dN, K, B + j, bb.data + j); - MacroCosineDistances(dM, dN, K, A + i, B + j, aa.data + i, bb.data + j, distances + i * N + j, N); - } - } - } - } -#endif // defined(SIMD_NEON_ENABLE) && defined(SIMD_NEON_FP16_ENABLE) -} diff --git a/src/3rd/Simd/Simd/SimdNeonFloat32.cpp b/src/3rd/Simd/Simd/SimdNeonFloat32.cpp deleted file mode 100644 index 8de966ee..00000000 --- a/src/3rd/Simd/Simd/SimdNeonFloat32.cpp +++ /dev/null @@ -1,171 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2019 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
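// (On CosineDistancesMxNa16f above: the M x N matrix is computed in panels
// of mM rows of A and mN columns of B, sized from Base::AlgCacheL2() so that
// a panel's fp16 vectors stay cache-resident, with the squared norms aa/bb
// precomputed once per panel by Squares. The driver loop, in outline:
//
//   for (size_t i = 0; i < M; i += mM)        // A panel reused across all j
//       for (size_t j = 0; j < N; j += mN)    // B panel; bb filled when i == 0
//           MacroCosineDistances(dM, dN, K, A + i, B + j, aa + i, bb + j, ...);
//
// so each vector is streamed from cache rather than re-read from memory.)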
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdExtract.h" - -namespace Simd -{ -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - template SIMD_INLINE uint32x4_t Float32ToUint32(const float * src, const float32x4_t & lower, const float32x4_t & upper, const float32x4_t & boost) - { - return vcvtq_u32_f32(vmulq_f32(vsubq_f32(vminq_f32(vmaxq_f32(Load(src), lower), upper), lower), boost)); - } - - template SIMD_INLINE void Float32ToUint8(const float * src, const float32x4_t & lower, const float32x4_t & upper, const float32x4_t & boost, uint8_t * dst) - { - uint32x4_t d0 = Float32ToUint32(src + F * 0, lower, upper, boost); - uint32x4_t d1 = Float32ToUint32(src + F * 1, lower, upper, boost); - uint32x4_t d2 = Float32ToUint32(src + F * 2, lower, upper, boost); - uint32x4_t d3 = Float32ToUint32(src + F * 3, lower, upper, boost); - Store(dst, PackU16(PackU32(d0, d1), PackU32(d2, d3))); - } - - template void Float32ToUint8(const float * src, size_t size, const float * lower, const float * upper, uint8_t * dst) - { - assert(size >= A); - if (align) - assert(Aligned(src) && Aligned(dst)); - - float32x4_t _lower = vdupq_n_f32(lower[0]); - float32x4_t _upper = vdupq_n_f32(upper[0]); - float32x4_t boost = vdupq_n_f32(255.0f / (upper[0] - lower[0])); - - size_t alignedSize = AlignLo(size, A); - for (size_t i = 0; i < alignedSize; i += A) - Float32ToUint8(src + i, _lower, _upper, boost, dst + i); - if (alignedSize != size) - Float32ToUint8(src + size - A, _lower, _upper, boost, dst + size - A); - } - - void Float32ToUint8(const float * src, size_t size, const float * lower, const float * upper, uint8_t * dst) - { - if (Aligned(src) && Aligned(dst)) - Float32ToUint8(src, size, lower, upper, dst); - else - Float32ToUint8(src, size, lower, upper, dst); - } - - template SIMD_INLINE float32x4_t Uint16ToFloat32(const uint16x8_t & value, const float32x4_t & lower, const float32x4_t & boost) - { - return vaddq_f32(vmulq_f32(vcvtq_f32_u32(UnpackU16(value)), boost), lower); - } - - template SIMD_INLINE void Uint8ToFloat32(const uint8_t * src, const float32x4_t & lower, const float32x4_t & boost, float * dst) - { - uint8x16_t _src = Load(src); - uint16x8_t lo = UnpackU8<0>(_src); - Store(dst + F * 0, Uint16ToFloat32<0>(lo, lower, boost)); - Store(dst + F * 1, Uint16ToFloat32<1>(lo, lower, boost)); - uint16x8_t hi = UnpackU8<1>(_src); - Store(dst + F * 2, Uint16ToFloat32<0>(hi, lower, boost)); - Store(dst + F * 3, Uint16ToFloat32<1>(hi, lower, boost)); - } - - template void Uint8ToFloat32(const uint8_t * src, size_t size, const float * lower, const float * upper, float * dst) - { - assert(size >= A); - if (align) - assert(Aligned(src) && Aligned(dst)); - - float32x4_t _lower = vdupq_n_f32(lower[0]); - float32x4_t boost = vdupq_n_f32((upper[0] - lower[0]) / 255.0f); - - size_t alignedSize = AlignLo(size, A); - for (size_t i = 0; i < alignedSize; i += A) - Uint8ToFloat32(src + i, _lower, boost, dst + i); - if (alignedSize != size) - Uint8ToFloat32(src + size - A, _lower, boost, dst + size - A); - } - - void Uint8ToFloat32(const uint8_t * src, size_t size, const float * lower, const float * upper, float * dst) - { - if (Aligned(src) && Aligned(dst)) - Uint8ToFloat32(src, size, lower, upper, dst); - else - Uint8ToFloat32(src, size, lower, upper, dst); - } - - template void CosineDistance32f(const float * a, const float * b, size_t size, float * distance) - { - if (align) - assert(Aligned(a) && Aligned(b)); - - size_t partialAlignedSize = AlignLo(size, F); - size_t fullAlignedSize = 
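// Float32ToUint8/Uint8ToFloat32 above implement the affine 8-bit quantization
// dst = (clamp(src, lower, upper) - lower) * 255 / (upper - lower) and its
// inverse src = dst * (upper - lower) / 255 + lower. A scalar reference of
// the forward mapping (hypothetical helper; the truncation matches the
// round-toward-zero behavior of vcvtq_u32_f32):
//
//   static uint8_t QuantizeRef(float src, float lower, float upper)
//   {
//       float boost = 255.0f / (upper - lower);
//       float clamped = std::min(std::max(src, lower), upper);  // <algorithm>
//       return (uint8_t)((clamped - lower) * boost);
//   }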
AlignLo(size, DF); - size_t i = 0; - float32x4_t _aa[2] = { vdupq_n_f32(0), vdupq_n_f32(0) }; - float32x4_t _ab[2] = { vdupq_n_f32(0), vdupq_n_f32(0) }; - float32x4_t _bb[2] = { vdupq_n_f32(0), vdupq_n_f32(0) }; - if (fullAlignedSize) - { - for (; i < fullAlignedSize; i += DF) - { - float32x4_t a0 = Load(a + i + 0); - float32x4_t b0 = Load(b + i + 0); - _aa[0] = vmlaq_f32(_aa[0], a0, a0); - _ab[0] = vmlaq_f32(_ab[0], a0, b0); - _bb[0] = vmlaq_f32(_bb[0], b0, b0); - float32x4_t a1 = Load(a + i + F); - float32x4_t b1 = Load(b + i + F); - _aa[1] = vmlaq_f32(_aa[1], a1, a1); - _ab[1] = vmlaq_f32(_ab[1], a1, b1); - _bb[1] = vmlaq_f32(_bb[1], b1, b1); - } - _aa[0] = vaddq_f32(_aa[0], _aa[1]); - _ab[0] = vaddq_f32(_ab[0], _ab[1]); - _bb[0] = vaddq_f32(_bb[0], _bb[1]); - } - for (; i < partialAlignedSize; i += F) - { - float32x4_t a0 = Load(a + i + 0); - float32x4_t b0 = Load(b + i + 0); - _aa[0] = vmlaq_f32(_aa[0], a0, a0); - _ab[0] = vmlaq_f32(_ab[0], a0, b0); - _bb[0] = vmlaq_f32(_bb[0], b0, b0); - } - float aa = ExtractSum32f(_aa[0]), ab = ExtractSum32f(_ab[0]), bb = ExtractSum32f(_bb[0]); - for (; i < size; ++i) - { - float _a = a[i]; - float _b = b[i]; - aa += _a * _a; - ab += _a * _b; - bb += _b * _b; - } - *distance = 1.0f - ab / ::sqrt(aa*bb); - } - - void CosineDistance32f(const float * a, const float * b, size_t size, float * distance) - { - if (Aligned(a) && Aligned(b)) - CosineDistance32f(a, b, size, distance); - else - CosineDistance32f(a, b, size, distance); - } - } -#endif// SIMD_NEON_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdNeonGaussianBlur3x3.cpp b/src/3rd/Simd/Simd/SimdNeonGaussianBlur3x3.cpp deleted file mode 100644 index f80a047b..00000000 --- a/src/3rd/Simd/Simd/SimdNeonGaussianBlur3x3.cpp +++ /dev/null @@ -1,153 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
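// (GaussianBlur3x3 below approximates a Gaussian with the separable binomial
// kernel (1 2 1)/4 per axis, i.e. the 3x3 weights (1 2 1; 2 4 2; 1 2 1)/16.
// Columns are pre-summed into 16-bit row buffers by BinomialSum16, the row
// pass combines three buffered rows, and DivideBy16 applies the /16
// normalization; edge rows and columns clamp to the image border. Scalar
// reference for one interior pixel, s() being the source fetch and assuming
// DivideBy16 rounds to nearest:
//
//   dst = ( 1*s(x-1,y-1) + 2*s(x,y-1) + 1*s(x+1,y-1)
//         + 2*s(x-1,y  ) + 4*s(x,y  ) + 2*s(x+1,y  )
//         + 1*s(x-1,y+1) + 2*s(x,y+1) + 1*s(x+1,y+1) + 8 ) >> 4; )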
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" - -#include "Simd/SimdLog.h" - -namespace Simd -{ -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - namespace - { - struct Buffer - { - Buffer(size_t width) - { - _p = Allocate(sizeof(uint16_t) * 3 * width); - src0 = (uint16_t*)_p; - src1 = src0 + width; - src2 = src1 + width; - } - - ~Buffer() - { - Free(_p); - } - - uint16_t * src0; - uint16_t * src1; - uint16_t * src2; - private: - void * _p; - }; - } - - template SIMD_INLINE void BlurCol(uint8x16_t a[3], uint16_t * b) - { - Store(b + 0, BinomialSum16(UnpackU8<0>(a[0]), UnpackU8<0>(a[1]), UnpackU8<0>(a[2]))); - Store(b + HA, BinomialSum16(UnpackU8<1>(a[0]), UnpackU8<1>(a[1]), UnpackU8<1>(a[2]))); - } - - template SIMD_INLINE uint16x8_t BlurRow16(const Buffer & buffer, size_t offset) - { - return DivideBy16(BinomialSum16( - Load(buffer.src0 + offset), - Load(buffer.src1 + offset), - Load(buffer.src2 + offset))); - } - - template SIMD_INLINE uint8x16_t BlurRow(const Buffer & buffer, size_t offset) - { - return PackU16(BlurRow16(buffer, offset), BlurRow16(buffer, offset + HA)); - } - - template void GaussianBlur3x3( - const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride) - { - assert(step*(width - 1) >= A); - if (align) - assert(Aligned(src) && Aligned(srcStride) && Aligned(step*width) && Aligned(dst) && Aligned(dstStride)); - - uint8x16_t a[3]; - - size_t size = step*width; - size_t bodySize = Simd::AlignHi(size, A) - A; - - Buffer buffer(Simd::AlignHi(size, A)); - - LoadNose3(src + 0, a); - BlurCol(a, buffer.src0 + 0); - for (size_t col = A; col < bodySize; col += A) - { - LoadBody3(src + col, a); - BlurCol(a, buffer.src0 + col); - } - LoadTail3(src + size - A, a); - BlurCol(a, buffer.src0 + size - A); - - memcpy(buffer.src1, buffer.src0, sizeof(uint16_t)*size); - - for (size_t row = 0; row < height; ++row, dst += dstStride) - { - const uint8_t *src2 = src + srcStride*(row + 1); - if (row >= height - 2) - src2 = src + srcStride*(height - 1); - - LoadNose3(src2 + 0, a); - BlurCol(a, buffer.src2 + 0); - for (size_t col = A; col < bodySize; col += A) - { - LoadBody3(src2 + col, a); - BlurCol(a, buffer.src2 + col); - } - LoadTail3(src2 + size - A, a); - BlurCol(a, buffer.src2 + size - A); - - for (size_t col = 0; col < bodySize; col += A) - Store(dst + col, BlurRow(buffer, col)); - Store(dst + size - A, BlurRow(buffer, size - A)); - - Swap(buffer.src0, buffer.src2); - Swap(buffer.src0, buffer.src1); - } - } - - template void GaussianBlur3x3(const uint8_t * src, size_t srcStride, size_t width, size_t height, - size_t channelCount, uint8_t * dst, size_t dstStride) - { - assert(channelCount > 0 && channelCount <= 4); - - switch (channelCount) - { - case 1: GaussianBlur3x3(src, srcStride, width, height, dst, dstStride); break; - case 2: GaussianBlur3x3(src, srcStride, width, height, dst, dstStride); break; - case 3: GaussianBlur3x3(src, srcStride, width, height, dst, dstStride); break; - case 4: GaussianBlur3x3(src, srcStride, width, height, dst, dstStride); break; - } - } - - void GaussianBlur3x3(const uint8_t * src, size_t srcStride, size_t width, size_t height, - size_t channelCount, uint8_t * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride) && Aligned(channelCount*width) && Aligned(dst) && Aligned(dstStride)) - GaussianBlur3x3(src, srcStride, width, height, channelCount, dst, dstStride); - else - GaussianBlur3x3(src, srcStride, width, height, channelCount, dst, dstStride); - } - } -#endif// SIMD_NEON_ENABLE -} diff 
--git a/src/3rd/Simd/Simd/SimdNeonGemm32f.cpp b/src/3rd/Simd/Simd/SimdNeonGemm32f.cpp deleted file mode 100644 index 7d9b5c1f..00000000 --- a/src/3rd/Simd/Simd/SimdNeonGemm32f.cpp +++ /dev/null @@ -1,2919 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdStore.h" -#include "Simd/SimdExtract.h" -#include "Simd/SimdGemm.h" -#include "Simd/SimdCpu.h" - -namespace Simd -{ -#ifdef SIMD_NEON_ENABLE -#if defined(SIMD_ARM64_ENABLE) -//#define SIMD_ARM64_KERNEL_9X -//#define SIMD_ARM64_KERNEL_14X -#endif - namespace Neon - { - SIMD_INLINE void AddProduct(float * ptr, float32x4_t value, float32x4_t alpha) - { - Store(ptr, vmlaq_f32(Load(ptr), value, alpha)); - } - - SIMD_INLINE void AddProduct(float * ptr, float32x4_t value, float32x4_t alpha, size_t tail) - { - if (tail == F) - AddProduct(ptr, value, alpha); - else - { - float tmp[F]; - Store(tmp, vmlaq_f32(Load(ptr), value, alpha)); - for (size_t i = 0; i < tail; ++i) - ptr[i] = tmp[i]; - } - } - - void GemmKernel4x12nn(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, size_t tail) - { - float32x4_t c00 = vdupq_n_f32(0); - float32x4_t c10 = vdupq_n_f32(0); - float32x4_t c20 = vdupq_n_f32(0); - float32x4_t c30 = vdupq_n_f32(0); - float32x4_t c01 = vdupq_n_f32(0); - float32x4_t c11 = vdupq_n_f32(0); - float32x4_t c21 = vdupq_n_f32(0); - float32x4_t c31 = vdupq_n_f32(0); - float32x4_t c02 = vdupq_n_f32(0); - float32x4_t c12 = vdupq_n_f32(0); - float32x4_t c22 = vdupq_n_f32(0); - float32x4_t c32 = vdupq_n_f32(0); - float32x4_t b0, b1, b2, a0; - const size_t oa0 = lda * 0; - const size_t oa1 = lda * 1; - const size_t oa2 = lda * 2; - const size_t oa3 = lda * 3; - const size_t sa = lda == 1 ? 
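// (On the kernel preambles here: sa selects the A walk -- when the A panel is
// packed, lda == 1 and each k step advances A by the kernel height (4, 6, 8
// or 9 floats); otherwise A is row-major and advances one float per k, with
// the oaX = lda * row offsets picking the rows. AddProduct(ptr, value, alpha,
// tail) above handles the ragged N edge: the full vmlaq_f32 result goes to a
// stack temporary and only the first `tail` floats are copied out, so C is
// never written past the end of a row.)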
4 : 1; - const size_t ob0 = ldb * 0; - const size_t ob1 = ldb * 1; - const size_t ob2 = ldb * 2; - for (size_t k = 0; k < K; k++) - { - b0 = Load(B + ob0); - b1 = Load(B + ob1); - b2 = Load(B + ob2); - a0 = vld1q_dup_f32(A + oa0); - c00 = vmlaq_f32(c00, a0, b0); - c01 = vmlaq_f32(c01, a0, b1); - c02 = vmlaq_f32(c02, a0, b2); - a0 = vld1q_dup_f32(A + oa1); - c10 = vmlaq_f32(c10, a0, b0); - c11 = vmlaq_f32(c11, a0, b1); - c12 = vmlaq_f32(c12, a0, b2); - a0 = vld1q_dup_f32(A + oa2); - c20 = vmlaq_f32(c20, a0, b0); - c21 = vmlaq_f32(c21, a0, b1); - c22 = vmlaq_f32(c22, a0, b2); - a0 = vld1q_dup_f32(A + oa3); - c30 = vmlaq_f32(c30, a0, b0); - c31 = vmlaq_f32(c31, a0, b1); - c32 = vmlaq_f32(c32, a0, b2); - B += sb; - A += sa; - } - float32x4_t _alpha = vdupq_n_f32(alpha); - AddProduct(C + 0 * F, _alpha, c00); - AddProduct(C + 1 * F, _alpha, c01); - AddProduct(C + 2 * F, _alpha, c02, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c10); - AddProduct(C + 1 * F, _alpha, c11); - AddProduct(C + 2 * F, _alpha, c12, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c20); - AddProduct(C + 1 * F, _alpha, c21); - AddProduct(C + 2 * F, _alpha, c22, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c30); - AddProduct(C + 1 * F, _alpha, c31); - AddProduct(C + 2 * F, _alpha, c32, tail); - } - - void GemmKernel4x8nn(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, size_t tail) - { - float32x4_t c00 = vdupq_n_f32(0); - float32x4_t c10 = vdupq_n_f32(0); - float32x4_t c20 = vdupq_n_f32(0); - float32x4_t c30 = vdupq_n_f32(0); - float32x4_t c01 = vdupq_n_f32(0); - float32x4_t c11 = vdupq_n_f32(0); - float32x4_t c21 = vdupq_n_f32(0); - float32x4_t c31 = vdupq_n_f32(0); - const size_t oa0 = lda * 0; - const size_t oa1 = lda * 1; - const size_t oa2 = lda * 2; - const size_t oa3 = lda * 3; - const size_t sa = lda == 1 ? 4 : 1; - const size_t ob0 = ldb * 0; - const size_t ob1 = ldb * 1; - float32x4_t b0, b1, a0; - for (size_t k = 0; k < K; k++) - { - b0 = Load(B + ob0); - b1 = Load(B + ob1); - a0 = vld1q_dup_f32(A + oa0); - c00 = vmlaq_f32(c00, a0, b0); - c01 = vmlaq_f32(c01, a0, b1); - a0 = vld1q_dup_f32(A + oa1); - c10 = vmlaq_f32(c10, a0, b0); - c11 = vmlaq_f32(c11, a0, b1); - a0 = vld1q_dup_f32(A + oa2); - c20 = vmlaq_f32(c20, a0, b0); - c21 = vmlaq_f32(c21, a0, b1); - a0 = vld1q_dup_f32(A + oa3); - c30 = vmlaq_f32(c30, a0, b0); - c31 = vmlaq_f32(c31, a0, b1); - B += sb; - A += sa; - } - float32x4_t _alpha = vdupq_n_f32(alpha); - AddProduct(C + 0 * F, _alpha, c00); - AddProduct(C + 1 * F, _alpha, c01, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c10); - AddProduct(C + 1 * F, _alpha, c11, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c20); - AddProduct(C + 1 * F, _alpha, c21, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c30); - AddProduct(C + 1 * F, _alpha, c31, tail); - } - - void GemmKernel4x4nn(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, size_t tail) - { - float32x4_t c0 = vdupq_n_f32(0); - float32x4_t c1 = vdupq_n_f32(0); - float32x4_t c2 = vdupq_n_f32(0); - float32x4_t c3 = vdupq_n_f32(0); - const size_t oa0 = lda * 0; - const size_t oa1 = lda * 1; - const size_t oa2 = lda * 2; - const size_t oa3 = lda * 3; - const size_t sa = lda == 1 ? 
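// Every GemmKernelMxNnn above and below follows the same per-k pattern: load
// the N-column strip of B as N/4 vectors, broadcast one A element per row
// with vld1q_dup_f32, and accumulate the MxN tile of C in registers with
// fused multiply-add. A scalar model of the unpacked 4x4 case (hypothetical
// names; c[][] zero-initialized, then scaled by alpha into C):
//
//   for (size_t k = 0; k < K; ++k)
//       for (size_t i = 0; i < 4; ++i)        // broadcast A[i*lda + k]
//           for (size_t j = 0; j < 4; ++j)    // one vector lane per column
//               c[i][j] += A[i * lda + k] * B[k * sb + j];
//
// Keeping the whole tile in registers means C traffic happens once, in the
// AddProduct epilogue, rather than K times inside the loop.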
4 : 1; - const size_t ob0 = ldb * 0; - float32x4_t b0; - for (size_t k = 0; k < K; k++) - { - b0 = Load(B + ob0); - c0 = vmlaq_f32(c0, b0, vld1q_dup_f32(A + oa0)); - c1 = vmlaq_f32(c1, b0, vld1q_dup_f32(A + oa1)); - c2 = vmlaq_f32(c2, b0, vld1q_dup_f32(A + oa2)); - c3 = vmlaq_f32(c3, b0, vld1q_dup_f32(A + oa3)); - B += sb; - A += sa; - } - float32x4_t _alpha = vdupq_n_f32(alpha); - AddProduct(C + 0 * ldc, _alpha, c0, tail); - AddProduct(C + 1 * ldc, _alpha, c1, tail); - AddProduct(C + 2 * ldc, _alpha, c2, tail); - AddProduct(C + 3 * ldc, _alpha, c3, tail); - } - -#if defined(SIMD_ARM64_ENABLE) - void GemmKernel6x16nn(size_t K, float alpha, const float* A, size_t lda, const float* B, size_t ldb, size_t sb, float* C, size_t ldc, size_t tail) - { - float32x4_t c00 = vdupq_n_f32(0); - float32x4_t c10 = vdupq_n_f32(0); - float32x4_t c20 = vdupq_n_f32(0); - float32x4_t c30 = vdupq_n_f32(0); - float32x4_t c40 = vdupq_n_f32(0); - float32x4_t c50 = vdupq_n_f32(0); - float32x4_t c01 = vdupq_n_f32(0); - float32x4_t c11 = vdupq_n_f32(0); - float32x4_t c21 = vdupq_n_f32(0); - float32x4_t c31 = vdupq_n_f32(0); - float32x4_t c41 = vdupq_n_f32(0); - float32x4_t c51 = vdupq_n_f32(0); - float32x4_t c02 = vdupq_n_f32(0); - float32x4_t c12 = vdupq_n_f32(0); - float32x4_t c22 = vdupq_n_f32(0); - float32x4_t c32 = vdupq_n_f32(0); - float32x4_t c42 = vdupq_n_f32(0); - float32x4_t c52 = vdupq_n_f32(0); - float32x4_t c03 = vdupq_n_f32(0); - float32x4_t c13 = vdupq_n_f32(0); - float32x4_t c23 = vdupq_n_f32(0); - float32x4_t c33 = vdupq_n_f32(0); - float32x4_t c43 = vdupq_n_f32(0); - float32x4_t c53 = vdupq_n_f32(0); - const size_t oa0 = lda * 0; - const size_t oa1 = lda * 1; - const size_t oa2 = lda * 2; - const size_t oa3 = lda * 3; - const size_t oa4 = lda * 4; - const size_t oa5 = lda * 5; - const size_t sa = lda == 1 ? 
6 : 1; - const size_t ob0 = ldb * 0; - const size_t ob1 = ldb * 1; - const size_t ob2 = ldb * 2; - const size_t ob3 = ldb * 3; - float32x4_t b0, b1, b2, b3, a0; - for (size_t k = 0; k < K; k++) - { - b0 = Load(B + ob0); - b1 = Load(B + ob1); - b2 = Load(B + ob2); - b3 = Load(B + ob3); - a0 = vld1q_dup_f32(A + oa0); - c00 = vmlaq_f32(c00, a0, b0); - c01 = vmlaq_f32(c01, a0, b1); - c02 = vmlaq_f32(c02, a0, b2); - c03 = vmlaq_f32(c03, a0, b3); - a0 = vld1q_dup_f32(A + oa1); - c10 = vmlaq_f32(c10, a0, b0); - c11 = vmlaq_f32(c11, a0, b1); - c12 = vmlaq_f32(c12, a0, b2); - c13 = vmlaq_f32(c13, a0, b3); - a0 = vld1q_dup_f32(A + oa2); - c20 = vmlaq_f32(c20, a0, b0); - c21 = vmlaq_f32(c21, a0, b1); - c22 = vmlaq_f32(c22, a0, b2); - c23 = vmlaq_f32(c23, a0, b3); - a0 = vld1q_dup_f32(A + oa3); - c30 = vmlaq_f32(c30, a0, b0); - c31 = vmlaq_f32(c31, a0, b1); - c32 = vmlaq_f32(c32, a0, b2); - c33 = vmlaq_f32(c33, a0, b3); - a0 = vld1q_dup_f32(A + oa4); - c40 = vmlaq_f32(c40, a0, b0); - c41 = vmlaq_f32(c41, a0, b1); - c42 = vmlaq_f32(c42, a0, b2); - c43 = vmlaq_f32(c43, a0, b3); - a0 = vld1q_dup_f32(A + oa5); - c50 = vmlaq_f32(c50, a0, b0); - c51 = vmlaq_f32(c51, a0, b1); - c52 = vmlaq_f32(c52, a0, b2); - c53 = vmlaq_f32(c53, a0, b3); - B += sb; - A += sa; - } - float32x4_t _alpha = vdupq_n_f32(alpha); - AddProduct(C + 0 * F, _alpha, c00); - AddProduct(C + 1 * F, _alpha, c01); - AddProduct(C + 2 * F, _alpha, c02); - AddProduct(C + 3 * F, _alpha, c03, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c10); - AddProduct(C + 1 * F, _alpha, c11); - AddProduct(C + 2 * F, _alpha, c12); - AddProduct(C + 3 * F, _alpha, c13, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c20); - AddProduct(C + 1 * F, _alpha, c21); - AddProduct(C + 2 * F, _alpha, c22); - AddProduct(C + 3 * F, _alpha, c23, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c30); - AddProduct(C + 1 * F, _alpha, c31); - AddProduct(C + 2 * F, _alpha, c32); - AddProduct(C + 3 * F, _alpha, c33, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c40); - AddProduct(C + 1 * F, _alpha, c41); - AddProduct(C + 2 * F, _alpha, c42); - AddProduct(C + 3 * F, _alpha, c43, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c50); - AddProduct(C + 1 * F, _alpha, c51); - AddProduct(C + 2 * F, _alpha, c52); - AddProduct(C + 3 * F, _alpha, c53, tail); - } - - void GemmKernel6x12nn(size_t K, float alpha, const float* A, size_t lda, const float* B, size_t ldb, size_t sb, float* C, size_t ldc, size_t tail) - { - float32x4_t c00 = vdupq_n_f32(0); - float32x4_t c10 = vdupq_n_f32(0); - float32x4_t c20 = vdupq_n_f32(0); - float32x4_t c30 = vdupq_n_f32(0); - float32x4_t c40 = vdupq_n_f32(0); - float32x4_t c50 = vdupq_n_f32(0); - float32x4_t c01 = vdupq_n_f32(0); - float32x4_t c11 = vdupq_n_f32(0); - float32x4_t c21 = vdupq_n_f32(0); - float32x4_t c31 = vdupq_n_f32(0); - float32x4_t c41 = vdupq_n_f32(0); - float32x4_t c51 = vdupq_n_f32(0); - float32x4_t c02 = vdupq_n_f32(0); - float32x4_t c12 = vdupq_n_f32(0); - float32x4_t c22 = vdupq_n_f32(0); - float32x4_t c32 = vdupq_n_f32(0); - float32x4_t c42 = vdupq_n_f32(0); - float32x4_t c52 = vdupq_n_f32(0); - const size_t oa0 = lda * 0; - const size_t oa1 = lda * 1; - const size_t oa2 = lda * 2; - const size_t oa3 = lda * 3; - const size_t oa4 = lda * 4; - const size_t oa5 = lda * 5; - const size_t sa = lda == 1 ? 
6 : 1; - const size_t ob0 = ldb * 0; - const size_t ob1 = ldb * 1; - const size_t ob2 = ldb * 2; - float32x4_t b0, b1, b2, a0; - for (size_t k = 0; k < K; k++) - { - b0 = Load(B + ob0); - b1 = Load(B + ob1); - b2 = Load(B + ob2); - a0 = vld1q_dup_f32(A + oa0); - c00 = vmlaq_f32(c00, a0, b0); - c01 = vmlaq_f32(c01, a0, b1); - c02 = vmlaq_f32(c02, a0, b2); - a0 = vld1q_dup_f32(A + oa1); - c10 = vmlaq_f32(c10, a0, b0); - c11 = vmlaq_f32(c11, a0, b1); - c12 = vmlaq_f32(c12, a0, b2); - a0 = vld1q_dup_f32(A + oa2); - c20 = vmlaq_f32(c20, a0, b0); - c21 = vmlaq_f32(c21, a0, b1); - c22 = vmlaq_f32(c22, a0, b2); - a0 = vld1q_dup_f32(A + oa3); - c30 = vmlaq_f32(c30, a0, b0); - c31 = vmlaq_f32(c31, a0, b1); - c32 = vmlaq_f32(c32, a0, b2); - a0 = vld1q_dup_f32(A + oa4); - c40 = vmlaq_f32(c40, a0, b0); - c41 = vmlaq_f32(c41, a0, b1); - c42 = vmlaq_f32(c42, a0, b2); - a0 = vld1q_dup_f32(A + oa5); - c50 = vmlaq_f32(c50, a0, b0); - c51 = vmlaq_f32(c51, a0, b1); - c52 = vmlaq_f32(c52, a0, b2); - B += sb; - A += sa; - } - float32x4_t _alpha = vdupq_n_f32(alpha); - AddProduct(C + 0 * F, _alpha, c00); - AddProduct(C + 1 * F, _alpha, c01); - AddProduct(C + 2 * F, _alpha, c02, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c10); - AddProduct(C + 1 * F, _alpha, c11); - AddProduct(C + 2 * F, _alpha, c12, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c20); - AddProduct(C + 1 * F, _alpha, c21); - AddProduct(C + 2 * F, _alpha, c22, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c30); - AddProduct(C + 1 * F, _alpha, c31); - AddProduct(C + 2 * F, _alpha, c32, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c40); - AddProduct(C + 1 * F, _alpha, c41); - AddProduct(C + 2 * F, _alpha, c42, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c50); - AddProduct(C + 1 * F, _alpha, c51); - AddProduct(C + 2 * F, _alpha, c52, tail); - } -#endif - - void GemmKernel6x8nn(size_t K, float alpha, const float* A, size_t lda, const float* B, size_t ldb, size_t sb, float* C, size_t ldc, size_t tail) - { - float32x4_t c00 = vdupq_n_f32(0); - float32x4_t c10 = vdupq_n_f32(0); - float32x4_t c20 = vdupq_n_f32(0); - float32x4_t c30 = vdupq_n_f32(0); - float32x4_t c40 = vdupq_n_f32(0); - float32x4_t c50 = vdupq_n_f32(0); - float32x4_t c01 = vdupq_n_f32(0); - float32x4_t c11 = vdupq_n_f32(0); - float32x4_t c21 = vdupq_n_f32(0); - float32x4_t c31 = vdupq_n_f32(0); - float32x4_t c41 = vdupq_n_f32(0); - float32x4_t c51 = vdupq_n_f32(0); - const size_t oa0 = lda * 0; - const size_t oa1 = lda * 1; - const size_t oa2 = lda * 2; - const size_t oa3 = lda * 3; - const size_t oa4 = lda * 4; - const size_t oa5 = lda * 5; - const size_t sa = lda == 1 ? 
6 : 1; - const size_t ob0 = ldb * 0; - const size_t ob1 = ldb * 1; - float32x4_t b0, b1, a0; - for (size_t k = 0; k < K; k++) - { - b0 = Load(B + ob0); - b1 = Load(B + ob1); - a0 = vld1q_dup_f32(A + oa0); - c00 = vmlaq_f32(c00, a0, b0); - c01 = vmlaq_f32(c01, a0, b1); - a0 = vld1q_dup_f32(A + oa1); - c10 = vmlaq_f32(c10, a0, b0); - c11 = vmlaq_f32(c11, a0, b1); - a0 = vld1q_dup_f32(A + oa2); - c20 = vmlaq_f32(c20, a0, b0); - c21 = vmlaq_f32(c21, a0, b1); - a0 = vld1q_dup_f32(A + oa3); - c30 = vmlaq_f32(c30, a0, b0); - c31 = vmlaq_f32(c31, a0, b1); - a0 = vld1q_dup_f32(A + oa4); - c40 = vmlaq_f32(c40, a0, b0); - c41 = vmlaq_f32(c41, a0, b1); - a0 = vld1q_dup_f32(A + oa5); - c50 = vmlaq_f32(c50, a0, b0); - c51 = vmlaq_f32(c51, a0, b1); - B += sb; - A += sa; - } - float32x4_t _alpha = vdupq_n_f32(alpha); - AddProduct(C + 0 * F, _alpha, c00); - AddProduct(C + 1 * F, _alpha, c01, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c10); - AddProduct(C + 1 * F, _alpha, c11, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c20); - AddProduct(C + 1 * F, _alpha, c21, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c30); - AddProduct(C + 1 * F, _alpha, c31, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c40); - AddProduct(C + 1 * F, _alpha, c41, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c50); - AddProduct(C + 1 * F, _alpha, c51, tail); - } - - void GemmKernel6x4nn(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, size_t tail) - { - float32x4_t c0 = vdupq_n_f32(0); - float32x4_t c1 = vdupq_n_f32(0); - float32x4_t c2 = vdupq_n_f32(0); - float32x4_t c3 = vdupq_n_f32(0); - float32x4_t c4 = vdupq_n_f32(0); - float32x4_t c5 = vdupq_n_f32(0); - const size_t oa0 = lda * 0; - const size_t oa1 = lda * 1; - const size_t oa2 = lda * 2; - const size_t oa3 = lda * 3; - const size_t oa4 = lda * 4; - const size_t oa5 = lda * 5; - const size_t sa = lda == 1 ? 
6 : 1; - const size_t ob0 = ldb * 0; - float32x4_t b0; - for (size_t k = 0; k < K; k++) - { - b0 = Load(B + ob0); - c0 = vmlaq_f32(c0, b0, vld1q_dup_f32(A + oa0)); - c1 = vmlaq_f32(c1, b0, vld1q_dup_f32(A + oa1)); - c2 = vmlaq_f32(c2, b0, vld1q_dup_f32(A + oa2)); - c3 = vmlaq_f32(c3, b0, vld1q_dup_f32(A + oa3)); - c4 = vmlaq_f32(c4, b0, vld1q_dup_f32(A + oa4)); - c5 = vmlaq_f32(c5, b0, vld1q_dup_f32(A + oa5)); - B += sb; - A += sa; - } - float32x4_t _alpha = vdupq_n_f32(alpha); - AddProduct(C + 0 * ldc, _alpha, c0, tail); - AddProduct(C + 1 * ldc, _alpha, c1, tail); - AddProduct(C + 2 * ldc, _alpha, c2, tail); - AddProduct(C + 3 * ldc, _alpha, c3, tail); - AddProduct(C + 4 * ldc, _alpha, c4, tail); - AddProduct(C + 5 * ldc, _alpha, c5, tail); - } - -#if defined(SIMD_ARM64_ENABLE) - void GemmKernel8x12nn(size_t K, float alpha, const float* A, size_t lda, const float* B, size_t ldb, size_t sb, float* C, size_t ldc, size_t tail) - { - float32x4_t c00 = vdupq_n_f32(0); - float32x4_t c01 = vdupq_n_f32(0); - float32x4_t c02 = vdupq_n_f32(0); - float32x4_t c10 = vdupq_n_f32(0); - float32x4_t c11 = vdupq_n_f32(0); - float32x4_t c12 = vdupq_n_f32(0); - float32x4_t c20 = vdupq_n_f32(0); - float32x4_t c21 = vdupq_n_f32(0); - float32x4_t c22 = vdupq_n_f32(0); - float32x4_t c30 = vdupq_n_f32(0); - float32x4_t c31 = vdupq_n_f32(0); - float32x4_t c32 = vdupq_n_f32(0); - float32x4_t c40 = vdupq_n_f32(0); - float32x4_t c41 = vdupq_n_f32(0); - float32x4_t c42 = vdupq_n_f32(0); - float32x4_t c50 = vdupq_n_f32(0); - float32x4_t c51 = vdupq_n_f32(0); - float32x4_t c52 = vdupq_n_f32(0); - float32x4_t c60 = vdupq_n_f32(0); - float32x4_t c61 = vdupq_n_f32(0); - float32x4_t c62 = vdupq_n_f32(0); - float32x4_t c70 = vdupq_n_f32(0); - float32x4_t c71 = vdupq_n_f32(0); - float32x4_t c72 = vdupq_n_f32(0); - float32x4_t b0, b1, b2, a0; - const size_t oa0 = lda * 0; - const size_t oa1 = lda * 1; - const size_t oa2 = lda * 2; - const size_t oa3 = lda * 3; - const size_t oa4 = lda * 4; - const size_t oa5 = lda * 5; - const size_t oa6 = lda * 6; - const size_t oa7 = lda * 7; - const size_t sa = lda == 1 ? 
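// (The 8x12 tile below keeps 24 quad-word accumulators plus b0..b2/a0 live,
// which only fits the 32 NEON vector registers of AArch64 -- 32-bit ARM has
// 16 -- hence the 8-row kernels sit behind SIMD_ARM64_ENABLE, and the even
// taller experimental 9x/14x variants additionally behind the commented-out
// SIMD_ARM64_KERNEL_9X / SIMD_ARM64_KERNEL_14X defines at the top of the
// file.)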
8 : 1; - const size_t ob0 = ldb * 0; - const size_t ob1 = ldb * 1; - const size_t ob2 = ldb * 2; - for (size_t k = 0; k < K; k++) - { - b0 = Load(B + ob0); - b1 = Load(B + ob1); - b2 = Load(B + ob2); - a0 = vld1q_dup_f32(A + oa0); - c00 = vmlaq_f32(c00, a0, b0); - c01 = vmlaq_f32(c01, a0, b1); - c02 = vmlaq_f32(c02, a0, b2); - a0 = vld1q_dup_f32(A + oa1); - c10 = vmlaq_f32(c10, a0, b0); - c11 = vmlaq_f32(c11, a0, b1); - c12 = vmlaq_f32(c12, a0, b2); - a0 = vld1q_dup_f32(A + oa2); - c20 = vmlaq_f32(c20, a0, b0); - c21 = vmlaq_f32(c21, a0, b1); - c22 = vmlaq_f32(c22, a0, b2); - a0 = vld1q_dup_f32(A + oa3); - c30 = vmlaq_f32(c30, a0, b0); - c31 = vmlaq_f32(c31, a0, b1); - c32 = vmlaq_f32(c32, a0, b2); - a0 = vld1q_dup_f32(A + oa4); - c40 = vmlaq_f32(c40, a0, b0); - c41 = vmlaq_f32(c41, a0, b1); - c42 = vmlaq_f32(c42, a0, b2); - a0 = vld1q_dup_f32(A + oa5); - c50 = vmlaq_f32(c50, a0, b0); - c51 = vmlaq_f32(c51, a0, b1); - c52 = vmlaq_f32(c52, a0, b2); - a0 = vld1q_dup_f32(A + oa6); - c60 = vmlaq_f32(c60, a0, b0); - c61 = vmlaq_f32(c61, a0, b1); - c62 = vmlaq_f32(c62, a0, b2); - a0 = vld1q_dup_f32(A + oa7); - c70 = vmlaq_f32(c70, a0, b0); - c71 = vmlaq_f32(c71, a0, b1); - c72 = vmlaq_f32(c72, a0, b2); - B += sb; - A += sa; - } - float32x4_t _alpha = vdupq_n_f32(alpha); - AddProduct(C + 0 * F, _alpha, c00); - AddProduct(C + 1 * F, _alpha, c01); - AddProduct(C + 2 * F, _alpha, c02, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c10); - AddProduct(C + 1 * F, _alpha, c11); - AddProduct(C + 2 * F, _alpha, c12, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c20); - AddProduct(C + 1 * F, _alpha, c21); - AddProduct(C + 2 * F, _alpha, c22, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c30); - AddProduct(C + 1 * F, _alpha, c31); - AddProduct(C + 2 * F, _alpha, c32, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c40); - AddProduct(C + 1 * F, _alpha, c41); - AddProduct(C + 2 * F, _alpha, c42, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c50); - AddProduct(C + 1 * F, _alpha, c51); - AddProduct(C + 2 * F, _alpha, c52, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c60); - AddProduct(C + 1 * F, _alpha, c61); - AddProduct(C + 2 * F, _alpha, c62, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c70); - AddProduct(C + 1 * F, _alpha, c71); - AddProduct(C + 2 * F, _alpha, c72, tail); - } - - void GemmKernel8x8nn(size_t K, float alpha, const float* A, size_t lda, const float* B, size_t ldb, size_t sb, float* C, size_t ldc, size_t tail) - { - float32x4_t c00 = vdupq_n_f32(0); - float32x4_t c01 = vdupq_n_f32(0); - float32x4_t c10 = vdupq_n_f32(0); - float32x4_t c11 = vdupq_n_f32(0); - float32x4_t c20 = vdupq_n_f32(0); - float32x4_t c21 = vdupq_n_f32(0); - float32x4_t c30 = vdupq_n_f32(0); - float32x4_t c31 = vdupq_n_f32(0); - float32x4_t c40 = vdupq_n_f32(0); - float32x4_t c41 = vdupq_n_f32(0); - float32x4_t c50 = vdupq_n_f32(0); - float32x4_t c51 = vdupq_n_f32(0); - float32x4_t c60 = vdupq_n_f32(0); - float32x4_t c61 = vdupq_n_f32(0); - float32x4_t c70 = vdupq_n_f32(0); - float32x4_t c71 = vdupq_n_f32(0); - float32x4_t b0, b1, a0; - const size_t oa0 = lda * 0; - const size_t oa1 = lda * 1; - const size_t oa2 = lda * 2; - const size_t oa3 = lda * 3; - const size_t oa4 = lda * 4; - const size_t oa5 = lda * 5; - const size_t oa6 = lda * 6; - const size_t oa7 = lda * 7; - const size_t sa = lda == 1 ? 
8 : 1; - const size_t ob0 = ldb * 0; - const size_t ob1 = ldb * 1; - for (size_t k = 0; k < K; k++) - { - b0 = Load(B + ob0); - b1 = Load(B + ob1); - a0 = vld1q_dup_f32(A + oa0); - c00 = vmlaq_f32(c00, a0, b0); - c01 = vmlaq_f32(c01, a0, b1); - a0 = vld1q_dup_f32(A + oa1); - c10 = vmlaq_f32(c10, a0, b0); - c11 = vmlaq_f32(c11, a0, b1); - a0 = vld1q_dup_f32(A + oa2); - c20 = vmlaq_f32(c20, a0, b0); - c21 = vmlaq_f32(c21, a0, b1); - a0 = vld1q_dup_f32(A + oa3); - c30 = vmlaq_f32(c30, a0, b0); - c31 = vmlaq_f32(c31, a0, b1); - a0 = vld1q_dup_f32(A + oa4); - c40 = vmlaq_f32(c40, a0, b0); - c41 = vmlaq_f32(c41, a0, b1); - a0 = vld1q_dup_f32(A + oa5); - c50 = vmlaq_f32(c50, a0, b0); - c51 = vmlaq_f32(c51, a0, b1); - a0 = vld1q_dup_f32(A + oa6); - c60 = vmlaq_f32(c60, a0, b0); - c61 = vmlaq_f32(c61, a0, b1); - a0 = vld1q_dup_f32(A + oa7); - c70 = vmlaq_f32(c70, a0, b0); - c71 = vmlaq_f32(c71, a0, b1); - B += sb; - A += sa; - } - float32x4_t _alpha = vdupq_n_f32(alpha); - AddProduct(C + 0 * F, _alpha, c00); - AddProduct(C + 1 * F, _alpha, c01, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c10); - AddProduct(C + 1 * F, _alpha, c11, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c20); - AddProduct(C + 1 * F, _alpha, c21, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c30); - AddProduct(C + 1 * F, _alpha, c31, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c40); - AddProduct(C + 1 * F, _alpha, c41, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c50); - AddProduct(C + 1 * F, _alpha, c51, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c60); - AddProduct(C + 1 * F, _alpha, c61, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c70); - AddProduct(C + 1 * F, _alpha, c71, tail); - } - - void GemmKernel8x4nn(size_t K, float alpha, const float* A, size_t lda, const float* B, size_t ldb, size_t sb, float* C, size_t ldc, size_t tail) - { - float32x4_t c00 = vdupq_n_f32(0); - float32x4_t c10 = vdupq_n_f32(0); - float32x4_t c20 = vdupq_n_f32(0); - float32x4_t c30 = vdupq_n_f32(0); - float32x4_t c40 = vdupq_n_f32(0); - float32x4_t c50 = vdupq_n_f32(0); - float32x4_t c60 = vdupq_n_f32(0); - float32x4_t c70 = vdupq_n_f32(0); - float32x4_t b0, a0; - const size_t oa0 = lda * 0; - const size_t oa1 = lda * 1; - const size_t oa2 = lda * 2; - const size_t oa3 = lda * 3; - const size_t oa4 = lda * 4; - const size_t oa5 = lda * 5; - const size_t oa6 = lda * 6; - const size_t oa7 = lda * 7; - const size_t sa = lda == 1 ? 
8 : 1; - const size_t ob0 = ldb * 0; - for (size_t k = 0; k < K; k++) - { - b0 = Load(B + ob0); - a0 = vld1q_dup_f32(A + oa0); - c00 = vmlaq_f32(c00, a0, b0); - a0 = vld1q_dup_f32(A + oa1); - c10 = vmlaq_f32(c10, a0, b0); - a0 = vld1q_dup_f32(A + oa2); - c20 = vmlaq_f32(c20, a0, b0); - a0 = vld1q_dup_f32(A + oa3); - c30 = vmlaq_f32(c30, a0, b0); - a0 = vld1q_dup_f32(A + oa4); - c40 = vmlaq_f32(c40, a0, b0); - a0 = vld1q_dup_f32(A + oa5); - c50 = vmlaq_f32(c50, a0, b0); - a0 = vld1q_dup_f32(A + oa6); - c60 = vmlaq_f32(c60, a0, b0); - a0 = vld1q_dup_f32(A + oa7); - c70 = vmlaq_f32(c70, a0, b0); - B += sb; - A += sa; - } - float32x4_t _alpha = vdupq_n_f32(alpha); - AddProduct(C + 0 * F, _alpha, c00, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c10, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c20, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c30, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c40, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c50, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c60, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c70, tail); - } -#endif - -#if defined(SIMD_ARM64_ENABLE) && defined(SIMD_ARM64_KERNEL_9X) - void GemmKernel9x12nn(size_t K, float alpha, const float* A, size_t lda, const float* B, size_t ldb, size_t sb, float* C, size_t ldc, size_t tail) - { - float32x4_t c00 = vdupq_n_f32(0); - float32x4_t c01 = vdupq_n_f32(0); - float32x4_t c02 = vdupq_n_f32(0); - float32x4_t c10 = vdupq_n_f32(0); - float32x4_t c11 = vdupq_n_f32(0); - float32x4_t c12 = vdupq_n_f32(0); - float32x4_t c20 = vdupq_n_f32(0); - float32x4_t c21 = vdupq_n_f32(0); - float32x4_t c22 = vdupq_n_f32(0); - float32x4_t c30 = vdupq_n_f32(0); - float32x4_t c31 = vdupq_n_f32(0); - float32x4_t c32 = vdupq_n_f32(0); - float32x4_t c40 = vdupq_n_f32(0); - float32x4_t c41 = vdupq_n_f32(0); - float32x4_t c42 = vdupq_n_f32(0); - float32x4_t c50 = vdupq_n_f32(0); - float32x4_t c51 = vdupq_n_f32(0); - float32x4_t c52 = vdupq_n_f32(0); - float32x4_t c60 = vdupq_n_f32(0); - float32x4_t c61 = vdupq_n_f32(0); - float32x4_t c62 = vdupq_n_f32(0); - float32x4_t c70 = vdupq_n_f32(0); - float32x4_t c71 = vdupq_n_f32(0); - float32x4_t c72 = vdupq_n_f32(0); - float32x4_t c80 = vdupq_n_f32(0); - float32x4_t c81 = vdupq_n_f32(0); - float32x4_t c82 = vdupq_n_f32(0); - float32x4_t b0, b1, b2, a0; - const size_t oa0 = lda * 0; - const size_t oa1 = lda * 1; - const size_t oa2 = lda * 2; - const size_t oa3 = lda * 3; - const size_t oa4 = lda * 4; - const size_t oa5 = lda * 5; - const size_t oa6 = lda * 6; - const size_t oa7 = lda * 7; - const size_t oa8 = lda * 8; - const size_t sa = lda == 1 ? 
9 : 1; - const size_t ob0 = ldb * 0; - const size_t ob1 = ldb * 1; - const size_t ob2 = ldb * 2; - for (size_t k = 0; k < K; k++) - { - b0 = Load(B + ob0); - b1 = Load(B + ob1); - b2 = Load(B + ob2); - a0 = vld1q_dup_f32(A + oa0); - c00 = vmlaq_f32(c00, a0, b0); - c01 = vmlaq_f32(c01, a0, b1); - c02 = vmlaq_f32(c02, a0, b2); - a0 = vld1q_dup_f32(A + oa1); - c10 = vmlaq_f32(c10, a0, b0); - c11 = vmlaq_f32(c11, a0, b1); - c12 = vmlaq_f32(c12, a0, b2); - a0 = vld1q_dup_f32(A + oa2); - c20 = vmlaq_f32(c20, a0, b0); - c21 = vmlaq_f32(c21, a0, b1); - c22 = vmlaq_f32(c22, a0, b2); - a0 = vld1q_dup_f32(A + oa3); - c30 = vmlaq_f32(c30, a0, b0); - c31 = vmlaq_f32(c31, a0, b1); - c32 = vmlaq_f32(c32, a0, b2); - a0 = vld1q_dup_f32(A + oa4); - c40 = vmlaq_f32(c40, a0, b0); - c41 = vmlaq_f32(c41, a0, b1); - c42 = vmlaq_f32(c42, a0, b2); - a0 = vld1q_dup_f32(A + oa5); - c50 = vmlaq_f32(c50, a0, b0); - c51 = vmlaq_f32(c51, a0, b1); - c52 = vmlaq_f32(c52, a0, b2); - a0 = vld1q_dup_f32(A + oa6); - c60 = vmlaq_f32(c60, a0, b0); - c61 = vmlaq_f32(c61, a0, b1); - c62 = vmlaq_f32(c62, a0, b2); - a0 = vld1q_dup_f32(A + oa7); - c70 = vmlaq_f32(c70, a0, b0); - c71 = vmlaq_f32(c71, a0, b1); - c72 = vmlaq_f32(c72, a0, b2); - a0 = vld1q_dup_f32(A + oa8); - c80 = vmlaq_f32(c80, a0, b0); - c81 = vmlaq_f32(c81, a0, b1); - c82 = vmlaq_f32(c82, a0, b2); - B += sb; - A += sa; - } - float32x4_t _alpha = vdupq_n_f32(alpha); - AddProduct(C + 0 * F, _alpha, c00); - AddProduct(C + 1 * F, _alpha, c01); - AddProduct(C + 2 * F, _alpha, c02, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c10); - AddProduct(C + 1 * F, _alpha, c11); - AddProduct(C + 2 * F, _alpha, c12, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c20); - AddProduct(C + 1 * F, _alpha, c21); - AddProduct(C + 2 * F, _alpha, c22, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c30); - AddProduct(C + 1 * F, _alpha, c31); - AddProduct(C + 2 * F, _alpha, c32, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c40); - AddProduct(C + 1 * F, _alpha, c41); - AddProduct(C + 2 * F, _alpha, c42, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c50); - AddProduct(C + 1 * F, _alpha, c51); - AddProduct(C + 2 * F, _alpha, c52, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c60); - AddProduct(C + 1 * F, _alpha, c61); - AddProduct(C + 2 * F, _alpha, c62, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c70); - AddProduct(C + 1 * F, _alpha, c71); - AddProduct(C + 2 * F, _alpha, c72, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c80); - AddProduct(C + 1 * F, _alpha, c81); - AddProduct(C + 2 * F, _alpha, c82, tail); - } - - void GemmKernel9x8nn(size_t K, float alpha, const float* A, size_t lda, const float* B, size_t ldb, size_t sb, float* C, size_t ldc, size_t tail) - { - float32x4_t c00 = vdupq_n_f32(0); - float32x4_t c01 = vdupq_n_f32(0); - float32x4_t c10 = vdupq_n_f32(0); - float32x4_t c11 = vdupq_n_f32(0); - float32x4_t c20 = vdupq_n_f32(0); - float32x4_t c21 = vdupq_n_f32(0); - float32x4_t c30 = vdupq_n_f32(0); - float32x4_t c31 = vdupq_n_f32(0); - float32x4_t c40 = vdupq_n_f32(0); - float32x4_t c41 = vdupq_n_f32(0); - float32x4_t c50 = vdupq_n_f32(0); - float32x4_t c51 = vdupq_n_f32(0); - float32x4_t c60 = vdupq_n_f32(0); - float32x4_t c61 = vdupq_n_f32(0); - float32x4_t c70 = vdupq_n_f32(0); - float32x4_t c71 = vdupq_n_f32(0); - float32x4_t c80 = vdupq_n_f32(0); - float32x4_t c81 = vdupq_n_f32(0); - float32x4_t b0, b1, a0; - const size_t oa0 = lda * 0; - const size_t oa1 = lda * 1; - const size_t oa2 = lda * 2; - const size_t oa3 = lda * 3; - const size_t 
oa4 = lda * 4; - const size_t oa5 = lda * 5; - const size_t oa6 = lda * 6; - const size_t oa7 = lda * 7; - const size_t oa8 = lda * 8; - const size_t sa = lda == 1 ? 9 : 1; - const size_t ob0 = ldb * 0; - const size_t ob1 = ldb * 1; - for (size_t k = 0; k < K; k++) - { - b0 = Load(B + ob0); - b1 = Load(B + ob1); - a0 = vld1q_dup_f32(A + oa0); - c00 = vmlaq_f32(c00, a0, b0); - c01 = vmlaq_f32(c01, a0, b1); - a0 = vld1q_dup_f32(A + oa1); - c10 = vmlaq_f32(c10, a0, b0); - c11 = vmlaq_f32(c11, a0, b1); - a0 = vld1q_dup_f32(A + oa2); - c20 = vmlaq_f32(c20, a0, b0); - c21 = vmlaq_f32(c21, a0, b1); - a0 = vld1q_dup_f32(A + oa3); - c30 = vmlaq_f32(c30, a0, b0); - c31 = vmlaq_f32(c31, a0, b1); - a0 = vld1q_dup_f32(A + oa4); - c40 = vmlaq_f32(c40, a0, b0); - c41 = vmlaq_f32(c41, a0, b1); - a0 = vld1q_dup_f32(A + oa5); - c50 = vmlaq_f32(c50, a0, b0); - c51 = vmlaq_f32(c51, a0, b1); - a0 = vld1q_dup_f32(A + oa6); - c60 = vmlaq_f32(c60, a0, b0); - c61 = vmlaq_f32(c61, a0, b1); - a0 = vld1q_dup_f32(A + oa7); - c70 = vmlaq_f32(c70, a0, b0); - c71 = vmlaq_f32(c71, a0, b1); - a0 = vld1q_dup_f32(A + oa8); - c80 = vmlaq_f32(c80, a0, b0); - c81 = vmlaq_f32(c81, a0, b1); - B += sb; - A += sa; - } - float32x4_t _alpha = vdupq_n_f32(alpha); - AddProduct(C + 0 * F, _alpha, c00); - AddProduct(C + 1 * F, _alpha, c01, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c10); - AddProduct(C + 1 * F, _alpha, c11, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c20); - AddProduct(C + 1 * F, _alpha, c21, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c30); - AddProduct(C + 1 * F, _alpha, c31, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c40); - AddProduct(C + 1 * F, _alpha, c41, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c50); - AddProduct(C + 1 * F, _alpha, c51, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c60); - AddProduct(C + 1 * F, _alpha, c61, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c70); - AddProduct(C + 1 * F, _alpha, c71, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c80); - AddProduct(C + 1 * F, _alpha, c81, tail); - } - - void GemmKernel9x4nn(size_t K, float alpha, const float* A, size_t lda, const float* B, size_t ldb, size_t sb, float* C, size_t ldc, size_t tail) - { - float32x4_t c00 = vdupq_n_f32(0); - float32x4_t c10 = vdupq_n_f32(0); - float32x4_t c20 = vdupq_n_f32(0); - float32x4_t c30 = vdupq_n_f32(0); - float32x4_t c40 = vdupq_n_f32(0); - float32x4_t c50 = vdupq_n_f32(0); - float32x4_t c60 = vdupq_n_f32(0); - float32x4_t c70 = vdupq_n_f32(0); - float32x4_t c80 = vdupq_n_f32(0); - float32x4_t b0, a0; - const size_t oa0 = lda * 0; - const size_t oa1 = lda * 1; - const size_t oa2 = lda * 2; - const size_t oa3 = lda * 3; - const size_t oa4 = lda * 4; - const size_t oa5 = lda * 5; - const size_t oa6 = lda * 6; - const size_t oa7 = lda * 7; - const size_t oa8 = lda * 8; - const size_t sa = lda == 1 ? 
9 : 1; - const size_t ob0 = ldb * 0; - for (size_t k = 0; k < K; k++) - { - b0 = Load(B + ob0); - a0 = vld1q_dup_f32(A + oa0); - c00 = vmlaq_f32(c00, a0, b0); - a0 = vld1q_dup_f32(A + oa1); - c10 = vmlaq_f32(c10, a0, b0); - a0 = vld1q_dup_f32(A + oa2); - c20 = vmlaq_f32(c20, a0, b0); - a0 = vld1q_dup_f32(A + oa3); - c30 = vmlaq_f32(c30, a0, b0); - a0 = vld1q_dup_f32(A + oa4); - c40 = vmlaq_f32(c40, a0, b0); - a0 = vld1q_dup_f32(A + oa5); - c50 = vmlaq_f32(c50, a0, b0); - a0 = vld1q_dup_f32(A + oa6); - c60 = vmlaq_f32(c60, a0, b0); - a0 = vld1q_dup_f32(A + oa7); - c70 = vmlaq_f32(c70, a0, b0); - a0 = vld1q_dup_f32(A + oa8); - c80 = vmlaq_f32(c80, a0, b0); - B += sb; - A += sa; - } - float32x4_t _alpha = vdupq_n_f32(alpha); - AddProduct(C + 0 * F, _alpha, c00, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c10, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c20, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c30, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c40, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c50, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c60, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c70, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c80, tail); - } -#endif - -#if defined(SIMD_ARM64_ENABLE) - void GemmKernel12x8nn(size_t K, float alpha, const float* A, size_t lda, const float* B, size_t ldb, size_t sb, float* C, size_t ldc, size_t tail) - { - float32x4_t c00 = vdupq_n_f32(0); - float32x4_t c01 = vdupq_n_f32(0); - float32x4_t c10 = vdupq_n_f32(0); - float32x4_t c11 = vdupq_n_f32(0); - float32x4_t c20 = vdupq_n_f32(0); - float32x4_t c21 = vdupq_n_f32(0); - float32x4_t c30 = vdupq_n_f32(0); - float32x4_t c31 = vdupq_n_f32(0); - float32x4_t c40 = vdupq_n_f32(0); - float32x4_t c41 = vdupq_n_f32(0); - float32x4_t c50 = vdupq_n_f32(0); - float32x4_t c51 = vdupq_n_f32(0); - float32x4_t c60 = vdupq_n_f32(0); - float32x4_t c61 = vdupq_n_f32(0); - float32x4_t c70 = vdupq_n_f32(0); - float32x4_t c71 = vdupq_n_f32(0); - float32x4_t c80 = vdupq_n_f32(0); - float32x4_t c81 = vdupq_n_f32(0); - float32x4_t c90 = vdupq_n_f32(0); - float32x4_t c91 = vdupq_n_f32(0); - float32x4_t cA0 = vdupq_n_f32(0); - float32x4_t cA1 = vdupq_n_f32(0); - float32x4_t cB0 = vdupq_n_f32(0); - float32x4_t cB1 = vdupq_n_f32(0); - const float* A0 = A, * A6 = A + 6 * lda; - const size_t oa0 = lda * 0; - const size_t oa1 = lda * 1; - const size_t oa2 = lda * 2; - const size_t oa3 = lda * 3; - const size_t oa4 = lda * 4; - const size_t oa5 = lda * 5; - const size_t sa = lda == 1 ? 
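/* lda == 1 signals that A has been packed by GemmPackA: the micro-panel is stored column after column, so each k step advances A by the panel height (12 rows here), while unpacked row-major A advances by a single element per k. The same convention holds for every kernel in this file. */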
12 : 1; - const size_t ob0 = ldb * 0; - const size_t ob1 = ldb * 1; - float32x4_t b0, b1, a0; - for (size_t k = 0; k < K; k++) - { - b0 = Load(B + ob0); - b1 = Load(B + ob1); - a0 = vld1q_dup_f32(A0 + oa0); - c00 = vmlaq_f32(c00, a0, b0); - c01 = vmlaq_f32(c01, a0, b1); - a0 = vld1q_dup_f32(A0 + oa1); - c10 = vmlaq_f32(c10, a0, b0); - c11 = vmlaq_f32(c11, a0, b1); - a0 = vld1q_dup_f32(A0 + oa2); - c20 = vmlaq_f32(c20, a0, b0); - c21 = vmlaq_f32(c21, a0, b1); - a0 = vld1q_dup_f32(A0 + oa3); - c30 = vmlaq_f32(c30, a0, b0); - c31 = vmlaq_f32(c31, a0, b1); - a0 = vld1q_dup_f32(A0 + oa4); - c40 = vmlaq_f32(c40, a0, b0); - c41 = vmlaq_f32(c41, a0, b1); - a0 = vld1q_dup_f32(A0 + oa5); - c50 = vmlaq_f32(c50, a0, b0); - c51 = vmlaq_f32(c51, a0, b1); - a0 = vld1q_dup_f32(A6 + oa0); - c60 = vmlaq_f32(c60, a0, b0); - c61 = vmlaq_f32(c61, a0, b1); - a0 = vld1q_dup_f32(A6 + oa1); - c70 = vmlaq_f32(c70, a0, b0); - c71 = vmlaq_f32(c71, a0, b1); - a0 = vld1q_dup_f32(A6 + oa2); - c80 = vmlaq_f32(c80, a0, b0); - c81 = vmlaq_f32(c81, a0, b1); - a0 = vld1q_dup_f32(A6 + oa3); - c90 = vmlaq_f32(c90, a0, b0); - c91 = vmlaq_f32(c91, a0, b1); - a0 = vld1q_dup_f32(A6 + oa4); - cA0 = vmlaq_f32(cA0, a0, b0); - cA1 = vmlaq_f32(cA1, a0, b1); - a0 = vld1q_dup_f32(A6 + oa5); - cB0 = vmlaq_f32(cB0, a0, b0); - cB1 = vmlaq_f32(cB1, a0, b1); - B += sb; - A0 += sa; - A6 += sa; - } - float32x4_t _alpha = vdupq_n_f32(alpha); - AddProduct(C + 0 * F, _alpha, c00); - AddProduct(C + 1 * F, _alpha, c01, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c10); - AddProduct(C + 1 * F, _alpha, c11, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c20); - AddProduct(C + 1 * F, _alpha, c21, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c30); - AddProduct(C + 1 * F, _alpha, c31, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c40); - AddProduct(C + 1 * F, _alpha, c41, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c50); - AddProduct(C + 1 * F, _alpha, c51, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c60); - AddProduct(C + 1 * F, _alpha, c61, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c70); - AddProduct(C + 1 * F, _alpha, c71, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c80); - AddProduct(C + 1 * F, _alpha, c81, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c90); - AddProduct(C + 1 * F, _alpha, c91, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, cA0); - AddProduct(C + 1 * F, _alpha, cA1, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, cB0); - AddProduct(C + 1 * F, _alpha, cB1, tail); - } - - void GemmKernel12x4nn(size_t K, float alpha, const float* A, size_t lda, const float* B, size_t ldb, size_t sb, float* C, size_t ldc, size_t tail) - { - float32x4_t c00 = vdupq_n_f32(0); - float32x4_t c10 = vdupq_n_f32(0); - float32x4_t c20 = vdupq_n_f32(0); - float32x4_t c30 = vdupq_n_f32(0); - float32x4_t c40 = vdupq_n_f32(0); - float32x4_t c50 = vdupq_n_f32(0); - float32x4_t c60 = vdupq_n_f32(0); - float32x4_t c70 = vdupq_n_f32(0); - float32x4_t c80 = vdupq_n_f32(0); - float32x4_t c90 = vdupq_n_f32(0); - float32x4_t cA0 = vdupq_n_f32(0); - float32x4_t cB0 = vdupq_n_f32(0); - const float* A0 = A, * A6 = A + 6 * lda; - const size_t oa0 = lda * 0; - const size_t oa1 = lda * 1; - const size_t oa2 = lda * 2; - const size_t oa3 = lda * 3; - const size_t oa4 = lda * 4; - const size_t oa5 = lda * 5; - const size_t sa = lda == 1 ? 
12 : 1; - const size_t ob0 = ldb * 0; - float32x4_t b0, a0; - for (size_t k = 0; k < K; k++) - { - b0 = Load(B + ob0); - a0 = vld1q_dup_f32(A0 + oa0); - c00 = vmlaq_f32(c00, a0, b0); - a0 = vld1q_dup_f32(A0 + oa1); - c10 = vmlaq_f32(c10, a0, b0); - a0 = vld1q_dup_f32(A0 + oa2); - c20 = vmlaq_f32(c20, a0, b0); - a0 = vld1q_dup_f32(A0 + oa3); - c30 = vmlaq_f32(c30, a0, b0); - a0 = vld1q_dup_f32(A0 + oa4); - c40 = vmlaq_f32(c40, a0, b0); - a0 = vld1q_dup_f32(A0 + oa5); - c50 = vmlaq_f32(c50, a0, b0); - a0 = vld1q_dup_f32(A6 + oa0); - c60 = vmlaq_f32(c60, a0, b0); - a0 = vld1q_dup_f32(A6 + oa1); - c70 = vmlaq_f32(c70, a0, b0); - a0 = vld1q_dup_f32(A6 + oa2); - c80 = vmlaq_f32(c80, a0, b0); - a0 = vld1q_dup_f32(A6 + oa3); - c90 = vmlaq_f32(c90, a0, b0); - a0 = vld1q_dup_f32(A6 + oa4); - cA0 = vmlaq_f32(cA0, a0, b0); - a0 = vld1q_dup_f32(A6 + oa5); - cB0 = vmlaq_f32(cB0, a0, b0); - B += sb; - A0 += sa; - A6 += sa; - } - float32x4_t _alpha = vdupq_n_f32(alpha); - AddProduct(C + 0 * F, _alpha, c00, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c10, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c20, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c30, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c40, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c50, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c60, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c70, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c80, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c90, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, cA0, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, cB0, tail); - } -#endif - -#if defined(SIMD_ARM64_ENABLE) && defined(SIMD_ARM64_KERNEL_14X) - void GemmKernel14x8nn(size_t K, float alpha, const float* A, size_t lda, const float* B, size_t ldb, size_t sb, float* C, size_t ldc, size_t tail) - { - float32x4_t c00 = vdupq_n_f32(0); - float32x4_t c01 = vdupq_n_f32(0); - float32x4_t c10 = vdupq_n_f32(0); - float32x4_t c11 = vdupq_n_f32(0); - float32x4_t c20 = vdupq_n_f32(0); - float32x4_t c21 = vdupq_n_f32(0); - float32x4_t c30 = vdupq_n_f32(0); - float32x4_t c31 = vdupq_n_f32(0); - float32x4_t c40 = vdupq_n_f32(0); - float32x4_t c41 = vdupq_n_f32(0); - float32x4_t c50 = vdupq_n_f32(0); - float32x4_t c51 = vdupq_n_f32(0); - float32x4_t c60 = vdupq_n_f32(0); - float32x4_t c61 = vdupq_n_f32(0); - float32x4_t c70 = vdupq_n_f32(0); - float32x4_t c71 = vdupq_n_f32(0); - float32x4_t c80 = vdupq_n_f32(0); - float32x4_t c81 = vdupq_n_f32(0); - float32x4_t c90 = vdupq_n_f32(0); - float32x4_t c91 = vdupq_n_f32(0); - float32x4_t cA0 = vdupq_n_f32(0); - float32x4_t cA1 = vdupq_n_f32(0); - float32x4_t cB0 = vdupq_n_f32(0); - float32x4_t cB1 = vdupq_n_f32(0); - float32x4_t cC0 = vdupq_n_f32(0); - float32x4_t cC1 = vdupq_n_f32(0); - float32x4_t cD0 = vdupq_n_f32(0); - float32x4_t cD1 = vdupq_n_f32(0); - const float* A0 = A, * A7 = A + 7 * lda; - const size_t oa0 = lda * 0; - const size_t oa1 = lda * 1; - const size_t oa2 = lda * 2; - const size_t oa3 = lda * 3; - const size_t oa4 = lda * 4; - const size_t oa5 = lda * 5; - const size_t oa6 = lda * 6; - const size_t sa = lda == 1 ? 
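/* The 14-row kernels are compiled only under SIMD_ARM64_KERNEL_14X: they keep 14 x 2 accumulators plus b0, b1 and a0 live - 31 of the 32 128-bit vector registers that AArch64 provides - whereas 32-bit ARM NEON has only 16 quad registers, so this tile could not stay register-resident there. */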
14 : 1; - const size_t ob0 = ldb * 0; - const size_t ob1 = ldb * 1; - float32x4_t b0, b1, a0; - for (size_t k = 0; k < K; k++) - { - b0 = Load(B + ob0); - b1 = Load(B + ob1); - a0 = vld1q_dup_f32(A0 + oa0); - c00 = vmlaq_f32(c00, a0, b0); - c01 = vmlaq_f32(c01, a0, b1); - a0 = vld1q_dup_f32(A0 + oa1); - c10 = vmlaq_f32(c10, a0, b0); - c11 = vmlaq_f32(c11, a0, b1); - a0 = vld1q_dup_f32(A0 + oa2); - c20 = vmlaq_f32(c20, a0, b0); - c21 = vmlaq_f32(c21, a0, b1); - a0 = vld1q_dup_f32(A0 + oa3); - c30 = vmlaq_f32(c30, a0, b0); - c31 = vmlaq_f32(c31, a0, b1); - a0 = vld1q_dup_f32(A0 + oa4); - c40 = vmlaq_f32(c40, a0, b0); - c41 = vmlaq_f32(c41, a0, b1); - a0 = vld1q_dup_f32(A0 + oa5); - c50 = vmlaq_f32(c50, a0, b0); - c51 = vmlaq_f32(c51, a0, b1); - a0 = vld1q_dup_f32(A0 + oa6); - c60 = vmlaq_f32(c60, a0, b0); - c61 = vmlaq_f32(c61, a0, b1); - a0 = vld1q_dup_f32(A7 + oa0); - c70 = vmlaq_f32(c70, a0, b0); - c71 = vmlaq_f32(c71, a0, b1); - a0 = vld1q_dup_f32(A7 + oa1); - c80 = vmlaq_f32(c80, a0, b0); - c81 = vmlaq_f32(c81, a0, b1); - a0 = vld1q_dup_f32(A7 + oa2); - c90 = vmlaq_f32(c90, a0, b0); - c91 = vmlaq_f32(c91, a0, b1); - a0 = vld1q_dup_f32(A7 + oa3); - cA0 = vmlaq_f32(cA0, a0, b0); - cA1 = vmlaq_f32(cA1, a0, b1); - a0 = vld1q_dup_f32(A7 + oa4); - cB0 = vmlaq_f32(cB0, a0, b0); - cB1 = vmlaq_f32(cB1, a0, b1); - a0 = vld1q_dup_f32(A7 + oa5); - cC0 = vmlaq_f32(cC0, a0, b0); - cC1 = vmlaq_f32(cC1, a0, b1); - a0 = vld1q_dup_f32(A7 + oa6); - cD0 = vmlaq_f32(cD0, a0, b0); - cD1 = vmlaq_f32(cD1, a0, b1); - B += sb; - A0 += sa; - A7 += sa; - } - float32x4_t _alpha = vdupq_n_f32(alpha); - AddProduct(C + 0 * F, _alpha, c00); - AddProduct(C + 1 * F, _alpha, c01, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c10); - AddProduct(C + 1 * F, _alpha, c11, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c20); - AddProduct(C + 1 * F, _alpha, c21, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c30); - AddProduct(C + 1 * F, _alpha, c31, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c40); - AddProduct(C + 1 * F, _alpha, c41, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c50); - AddProduct(C + 1 * F, _alpha, c51, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c60); - AddProduct(C + 1 * F, _alpha, c61, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c70); - AddProduct(C + 1 * F, _alpha, c71, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c80); - AddProduct(C + 1 * F, _alpha, c81, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c90); - AddProduct(C + 1 * F, _alpha, c91, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, cA0); - AddProduct(C + 1 * F, _alpha, cA1, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, cB0); - AddProduct(C + 1 * F, _alpha, cB1, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, cC0); - AddProduct(C + 1 * F, _alpha, cC1, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, cD0); - AddProduct(C + 1 * F, _alpha, cD1, tail); - } - - void GemmKernel14x4nn(size_t K, float alpha, const float* A, size_t lda, const float* B, size_t ldb, size_t sb, float* C, size_t ldc, size_t tail) - { - float32x4_t c00 = vdupq_n_f32(0); - float32x4_t c10 = vdupq_n_f32(0); - float32x4_t c20 = vdupq_n_f32(0); - float32x4_t c30 = vdupq_n_f32(0); - float32x4_t c40 = vdupq_n_f32(0); - float32x4_t c50 = vdupq_n_f32(0); - float32x4_t c60 = vdupq_n_f32(0); - float32x4_t c70 = vdupq_n_f32(0); - float32x4_t c80 = vdupq_n_f32(0); - float32x4_t c90 = vdupq_n_f32(0); - float32x4_t cA0 = vdupq_n_f32(0); - float32x4_t cB0 = vdupq_n_f32(0); - float32x4_t cC0 = vdupq_n_f32(0); - float32x4_t cD0 = 
vdupq_n_f32(0); - const float* A0 = A, * A7 = A + 7 * lda; - const size_t oa0 = lda * 0; - const size_t oa1 = lda * 1; - const size_t oa2 = lda * 2; - const size_t oa3 = lda * 3; - const size_t oa4 = lda * 4; - const size_t oa5 = lda * 5; - const size_t oa6 = lda * 6; - const size_t sa = lda == 1 ? 14 : 1; - const size_t ob0 = ldb * 0; - float32x4_t b0, a0; - for (size_t k = 0; k < K; k++) - { - b0 = Load(B + ob0); - a0 = vld1q_dup_f32(A0 + oa0); - c00 = vmlaq_f32(c00, a0, b0); - a0 = vld1q_dup_f32(A0 + oa1); - c10 = vmlaq_f32(c10, a0, b0); - a0 = vld1q_dup_f32(A0 + oa2); - c20 = vmlaq_f32(c20, a0, b0); - a0 = vld1q_dup_f32(A0 + oa3); - c30 = vmlaq_f32(c30, a0, b0); - a0 = vld1q_dup_f32(A0 + oa4); - c40 = vmlaq_f32(c40, a0, b0); - a0 = vld1q_dup_f32(A0 + oa5); - c50 = vmlaq_f32(c50, a0, b0); - a0 = vld1q_dup_f32(A0 + oa6); - c60 = vmlaq_f32(c60, a0, b0); - a0 = vld1q_dup_f32(A7 + oa0); - c70 = vmlaq_f32(c70, a0, b0); - a0 = vld1q_dup_f32(A7 + oa1); - c80 = vmlaq_f32(c80, a0, b0); - a0 = vld1q_dup_f32(A7 + oa2); - c90 = vmlaq_f32(c90, a0, b0); - a0 = vld1q_dup_f32(A7 + oa3); - cA0 = vmlaq_f32(cA0, a0, b0); - a0 = vld1q_dup_f32(A7 + oa4); - cB0 = vmlaq_f32(cB0, a0, b0); - a0 = vld1q_dup_f32(A7 + oa5); - cC0 = vmlaq_f32(cC0, a0, b0); - a0 = vld1q_dup_f32(A7 + oa6); - cD0 = vmlaq_f32(cD0, a0, b0); - B += sb; - A0 += sa; - A7 += sa; - } - float32x4_t _alpha = vdupq_n_f32(alpha); - AddProduct(C + 0 * F, _alpha, c00, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c10, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c20, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c30, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c40, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c50, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c60, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c70, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c80, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, c90, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, cA0, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, cB0, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, cC0, tail); - C += ldc; - AddProduct(C + 0 * F, _alpha, cD0, tail); - } -#endif - - void GemmKernelMx16nn(size_t M, size_t K, float alpha, const float* A, size_t lda, const float* B, size_t ldb, size_t sb, float* C, size_t ldc, size_t tail) - { -#ifdef SIMD_ARM64_ENABLE - float32x4_t c[6][4]; - size_t oa[6]; -#else - float32x4_t c[3][4]; - size_t oa[3]; -#endif - const size_t sa = lda == 1 ? 
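/* The GemmKernelMx* family finishes the leftover M % microM rows with a runtime row count. Every kernel in this file performs the same update, C += alpha * A * B, on one micro-panel; a minimal scalar sketch of this Mx16 case (assuming F == 4, unpacked row-major A, and B already repacked by GemmPackB into rows of 16 floats): for (size_t i = 0; i < M; ++i) for (size_t j = 0; j < 16; ++j) { float s = 0; for (size_t k = 0; k < K; ++k) s += A[i * lda + k] * B[k * 16 + j]; C[i * ldc + j] += alpha * s; } The tail argument only masks the store into the last, possibly partial, vector of the strip. */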
M : 1; - const size_t ob0 = ldb * 0; - const size_t ob1 = ldb * 1; - const size_t ob2 = ldb * 2; - const size_t ob3 = ldb * 3; - for (size_t i = 0; i < M; ++i) - { - c[i][0] = vdupq_n_f32(0); - c[i][1] = vdupq_n_f32(0); - c[i][2] = vdupq_n_f32(0); - c[i][3] = vdupq_n_f32(0); - oa[i] = lda * i; - } - float32x4_t b0, b1, b2, b3, a0; - for (size_t k = 0; k < K; k++) - { - b0 = Load(B + ob0); - b1 = Load(B + ob1); - b2 = Load(B + ob2); - b3 = Load(B + ob3); - for (size_t i = 0; i < M; ++i) - { - a0 = vld1q_dup_f32(A + oa[i]); - c[i][0] = vmlaq_f32(c[i][0], b0, a0); - c[i][1] = vmlaq_f32(c[i][1], b1, a0); - c[i][2] = vmlaq_f32(c[i][2], b2, a0); - c[i][3] = vmlaq_f32(c[i][3], b3, a0); - } - B += sb; - A += sa; - } - float32x4_t _alpha = vdupq_n_f32(alpha); - for (size_t i = 0; i < M; ++i) - { - AddProduct(C + 0 * F, _alpha, c[i][0]); - AddProduct(C + 1 * F, _alpha, c[i][1]); - AddProduct(C + 2 * F, _alpha, c[i][2]); - AddProduct(C + 3 * F, _alpha, c[i][3], tail); - C += ldc; - } - } - - void GemmKernelMx12nn(size_t M, size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, size_t tail) - { -#ifdef SIMD_ARM64_ENABLE - float32x4_t c[8][3]; - size_t oa[8]; -#else - float32x4_t c[4][3]; - size_t oa[4]; -#endif - const size_t sa = lda == 1 ? M : 1; - const size_t ob0 = ldb * 0; - const size_t ob1 = ldb * 1; - const size_t ob2 = ldb * 2; - for (size_t i = 0; i < M; ++i) - { - c[i][0] = vdupq_n_f32(0); - c[i][1] = vdupq_n_f32(0); - c[i][2] = vdupq_n_f32(0); - oa[i] = lda * i; - } - float32x4_t b0, b1, b2, a0; - for (size_t k = 0; k < K; k++) - { - b0 = Load(B + ob0); - b1 = Load(B + ob1); - b2 = Load(B + ob2); - for (size_t i = 0; i < M; ++i) - { - a0 = vld1q_dup_f32(A + oa[i]); - c[i][0] = vmlaq_f32(c[i][0], b0, a0); - c[i][1] = vmlaq_f32(c[i][1], b1, a0); - c[i][2] = vmlaq_f32(c[i][2], b2, a0); - } - B += sb; - A += sa; - } - float32x4_t _alpha = vdupq_n_f32(alpha); - for (size_t i = 0; i < M; ++i) - { - AddProduct(C + 0 * F, _alpha, c[i][0]); - AddProduct(C + 1 * F, _alpha, c[i][1]); - AddProduct(C + 2 * F, _alpha, c[i][2], tail); - C += ldc; - } - } - - void GemmKernelMx8nn(size_t M, size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, size_t tail) - { -#ifdef SIMD_ARM64_ENABLE - float32x4_t c[12][2]; - size_t oa[12]; -#else - float32x4_t c[6][2]; - size_t oa[6]; -#endif - const size_t sa = lda == 1 ? M : 1; - const size_t ob0 = ldb * 0; - const size_t ob1 = ldb * 1; - for (size_t i = 0; i < M; ++i) - { - c[i][0] = vdupq_n_f32(0); - c[i][1] = vdupq_n_f32(0); - oa[i] = lda * i; - } - float32x4_t b0, b1, a0; - for (size_t k = 0; k < K; k++) - { - b0 = Load(B + ob0); - b1 = Load(B + ob1); - for (size_t i = 0; i < M; ++i) - { - a0 = vld1q_dup_f32(A + oa[i]); - c[i][0] = vmlaq_f32(c[i][0], b0, a0); - c[i][1] = vmlaq_f32(c[i][1], b1, a0); - } - B += sb; - A += sa; - } - float32x4_t _alpha = vdupq_n_f32(alpha); - for (size_t i = 0; i < M; ++i) - { - AddProduct(C + 0 * F, _alpha, c[i][0]); - AddProduct(C + 1 * F, _alpha, c[i][1], tail); - C += ldc; - } - } - - void GemmKernelMx4nn(size_t M, size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, size_t tail) - { -#ifdef SIMD_ARM64_ENABLE - float32x4_t c[12]; - size_t oa[12]; -#else - float32x4_t c[6]; - size_t oa[6]; -#endif - const size_t sa = lda == 1 ? 
M : 1; - const size_t ob0 = ldb * 0; - for (size_t i = 0; i < M; ++i) - { - c[i] = vdupq_n_f32(0); - oa[i] = lda * i; - } - float32x4_t b0, a0; - for (size_t k = 0; k < K; k++) - { - b0 = Load(B + ob0); - for (size_t i = 0; i < M; ++i) - { - a0 = vld1q_dup_f32(A + oa[i]); - c[i] = vmlaq_f32(c[i], b0, a0); - } - B += sb; - A += sa; - } - float32x4_t _alpha = vdupq_n_f32(alpha); - for (size_t i = 0; i < M; ++i) - AddProduct(C + i * ldc, _alpha, c[i], tail); - } - - template<int M> void GemmKernelMx16nnT(size_t, size_t K, float alpha, const float* A, size_t lda, const float* B, size_t ldb, size_t sb, float* C, size_t ldc, size_t tail) - { - float32x4_t c00, c01, c02, c03, c04, c05, c10, c11, c12, c13, c14, c15, c20, c21, c22, c23, c24, c25, c30, c31, c32, c33, c34, c35, b0, b1, b2, b3, a0; - if (M > 0) c00 = vdupq_n_f32(0), c10 = vdupq_n_f32(0), c20 = vdupq_n_f32(0), c30 = vdupq_n_f32(0); - if (M > 1) c01 = vdupq_n_f32(0), c11 = vdupq_n_f32(0), c21 = vdupq_n_f32(0), c31 = vdupq_n_f32(0); - if (M > 2) c02 = vdupq_n_f32(0), c12 = vdupq_n_f32(0), c22 = vdupq_n_f32(0), c32 = vdupq_n_f32(0); - if (M > 3) c03 = vdupq_n_f32(0), c13 = vdupq_n_f32(0), c23 = vdupq_n_f32(0), c33 = vdupq_n_f32(0); - if (M > 4) c04 = vdupq_n_f32(0), c14 = vdupq_n_f32(0), c24 = vdupq_n_f32(0), c34 = vdupq_n_f32(0); - if (M > 5) c05 = vdupq_n_f32(0), c15 = vdupq_n_f32(0), c25 = vdupq_n_f32(0), c35 = vdupq_n_f32(0); - size_t oa0, oa1, oa2, oa3, oa4, oa5; - if (M > 0) oa0 = lda * 0; - if (M > 1) oa1 = lda * 1; - if (M > 2) oa2 = lda * 2; - if (M > 3) oa3 = lda * 3; - if (M > 4) oa4 = lda * 4; - if (M > 5) oa5 = lda * 5; - const size_t sa = lda == 1 ? M : 1; - const size_t ob0 = ldb * 0; - const size_t ob1 = ldb * 1; - const size_t ob2 = ldb * 2; - const size_t ob3 = ldb * 3; - for (size_t k = 0; k < K; k++) - { - b0 = Load(B + ob0); - b1 = Load(B + ob1); - b2 = Load(B + ob2); - b3 = Load(B + ob3); - if (M > 0) a0 = vld1q_dup_f32(A + oa0), c00 = vmlaq_f32(c00, b0, a0), c10 = vmlaq_f32(c10, b1, a0), c20 = vmlaq_f32(c20, b2, a0), c30 = vmlaq_f32(c30, b3, a0); - if (M > 1) a0 = vld1q_dup_f32(A + oa1), c01 = vmlaq_f32(c01, b0, a0), c11 = vmlaq_f32(c11, b1, a0), c21 = vmlaq_f32(c21, b2, a0), c31 = vmlaq_f32(c31, b3, a0); - if (M > 2) a0 = vld1q_dup_f32(A + oa2), c02 = vmlaq_f32(c02, b0, a0), c12 = vmlaq_f32(c12, b1, a0), c22 = vmlaq_f32(c22, b2, a0), c32 = vmlaq_f32(c32, b3, a0); - if (M > 3) a0 = vld1q_dup_f32(A + oa3), c03 = vmlaq_f32(c03, b0, a0), c13 = vmlaq_f32(c13, b1, a0), c23 = vmlaq_f32(c23, b2, a0), c33 = vmlaq_f32(c33, b3, a0); - if (M > 4) a0 = vld1q_dup_f32(A + oa4), c04 = vmlaq_f32(c04, b0, a0), c14 = vmlaq_f32(c14, b1, a0), c24 = vmlaq_f32(c24, b2, a0), c34 = vmlaq_f32(c34, b3, a0); - if (M > 5) a0 = vld1q_dup_f32(A + oa5), c05 = vmlaq_f32(c05, b0, a0), c15 = vmlaq_f32(c15, b1, a0), c25 = vmlaq_f32(c25, b2, a0), c35 = vmlaq_f32(c35, b3, a0); - B += sb; - A += sa; - } - float32x4_t _alpha = vdupq_n_f32(alpha); - if (M > 0) AddProduct(C + 0 * F, _alpha, c00), AddProduct(C + 1 * F, _alpha, c10), AddProduct(C + 2 * F, _alpha, c20), AddProduct(C + 3 * F, _alpha, c30, tail), C += ldc; - if (M > 1) AddProduct(C + 0 * F, _alpha, c01), AddProduct(C + 1 * F, _alpha, c11), AddProduct(C + 2 * F, _alpha, c21), AddProduct(C + 3 * F, _alpha, c31, tail), C += ldc; - if (M > 2) AddProduct(C + 0 * F, _alpha, c02), AddProduct(C + 1 * F, _alpha, c12), AddProduct(C + 2 * F, _alpha, c22), AddProduct(C + 3 * F, _alpha, c32, tail), C += ldc; - if (M > 3) AddProduct(C + 0 * F, _alpha, c03), AddProduct(C + 1 * F, _alpha, c13), AddProduct(C + 2 * F, 
_alpha, c23), AddProduct(C + 3 * F, _alpha, c33, tail), C += ldc; - if (M > 4) AddProduct(C + 0 * F, _alpha, c04), AddProduct(C + 1 * F, _alpha, c14), AddProduct(C + 2 * F, _alpha, c24), AddProduct(C + 3 * F, _alpha, c34, tail), C += ldc; - if (M > 5) AddProduct(C + 0 * F, _alpha, c05), AddProduct(C + 1 * F, _alpha, c15), AddProduct(C + 2 * F, _alpha, c25), AddProduct(C + 3 * F, _alpha, c35, tail), C += ldc; - } - - template<int M> void GemmKernelMx12nnT(size_t, size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, size_t tail) - { - float32x4_t c00, c01, c02, c03, c04, c05, c06, c07, c10, c11, c12, c13, c14, c15, c16, c17, c20, c21, c22, c23, c24, c25, c26, c27, b0, b1, b2, a0; - if (M > 0) c00 = vdupq_n_f32(0), c10 = vdupq_n_f32(0), c20 = vdupq_n_f32(0); - if (M > 1) c01 = vdupq_n_f32(0), c11 = vdupq_n_f32(0), c21 = vdupq_n_f32(0); - if (M > 2) c02 = vdupq_n_f32(0), c12 = vdupq_n_f32(0), c22 = vdupq_n_f32(0); - if (M > 3) c03 = vdupq_n_f32(0), c13 = vdupq_n_f32(0), c23 = vdupq_n_f32(0); - if (M > 4) c04 = vdupq_n_f32(0), c14 = vdupq_n_f32(0), c24 = vdupq_n_f32(0); - if (M > 5) c05 = vdupq_n_f32(0), c15 = vdupq_n_f32(0), c25 = vdupq_n_f32(0); - if (M > 6) c06 = vdupq_n_f32(0), c16 = vdupq_n_f32(0), c26 = vdupq_n_f32(0); - if (M > 7) c07 = vdupq_n_f32(0), c17 = vdupq_n_f32(0), c27 = vdupq_n_f32(0); - const float* A0 = A, * A4 = A + 4 * lda; - size_t oa0, oa1, oa2, oa3; - if (M > 0) oa0 = lda * 0; - if (M > 1) oa1 = lda * 1; - if (M > 2) oa2 = lda * 2; - if (M > 3) oa3 = lda * 3; - const size_t sa = lda == 1 ? M : 1; - const size_t ob0 = ldb * 0; - const size_t ob1 = ldb * 1; - const size_t ob2 = ldb * 2; - for (size_t k = 0; k < K; k++) - { - b0 = Load(B + ob0); - b1 = Load(B + ob1); - b2 = Load(B + ob2); - if (M > 0) a0 = vld1q_dup_f32(A0 + oa0), c00 = vmlaq_f32(c00, b0, a0), c10 = vmlaq_f32(c10, b1, a0), c20 = vmlaq_f32(c20, b2, a0); - if (M > 1) a0 = vld1q_dup_f32(A0 + oa1), c01 = vmlaq_f32(c01, b0, a0), c11 = vmlaq_f32(c11, b1, a0), c21 = vmlaq_f32(c21, b2, a0); - if (M > 2) a0 = vld1q_dup_f32(A0 + oa2), c02 = vmlaq_f32(c02, b0, a0), c12 = vmlaq_f32(c12, b1, a0), c22 = vmlaq_f32(c22, b2, a0); - if (M > 3) a0 = vld1q_dup_f32(A0 + oa3), c03 = vmlaq_f32(c03, b0, a0), c13 = vmlaq_f32(c13, b1, a0), c23 = vmlaq_f32(c23, b2, a0); - if (M > 4) a0 = vld1q_dup_f32(A4 + oa0), c04 = vmlaq_f32(c04, b0, a0), c14 = vmlaq_f32(c14, b1, a0), c24 = vmlaq_f32(c24, b2, a0); - if (M > 5) a0 = vld1q_dup_f32(A4 + oa1), c05 = vmlaq_f32(c05, b0, a0), c15 = vmlaq_f32(c15, b1, a0), c25 = vmlaq_f32(c25, b2, a0); - if (M > 6) a0 = vld1q_dup_f32(A4 + oa2), c06 = vmlaq_f32(c06, b0, a0), c16 = vmlaq_f32(c16, b1, a0), c26 = vmlaq_f32(c26, b2, a0); - if (M > 7) a0 = vld1q_dup_f32(A4 + oa3), c07 = vmlaq_f32(c07, b0, a0), c17 = vmlaq_f32(c17, b1, a0), c27 = vmlaq_f32(c27, b2, a0); - B += sb; - A0 += sa; - A4 += sa; - } - float32x4_t _alpha = vdupq_n_f32(alpha); - if (M > 0) AddProduct(C + 0 * F, _alpha, c00), AddProduct(C + 1 * F, _alpha, c10), AddProduct(C + 2 * F, _alpha, c20, tail), C += ldc; - if (M > 1) AddProduct(C + 0 * F, _alpha, c01), AddProduct(C + 1 * F, _alpha, c11), AddProduct(C + 2 * F, _alpha, c21, tail), C += ldc; - if (M > 2) AddProduct(C + 0 * F, _alpha, c02), AddProduct(C + 1 * F, _alpha, c12), AddProduct(C + 2 * F, _alpha, c22, tail), C += ldc; - if (M > 3) AddProduct(C + 0 * F, _alpha, c03), AddProduct(C + 1 * F, _alpha, c13), AddProduct(C + 2 * F, _alpha, c23, tail), C += ldc; - if (M > 4) AddProduct(C + 0 * F, _alpha, c04), AddProduct(C + 1 * F, 
_alpha, c14), AddProduct(C + 2 * F, _alpha, c24, tail), C += ldc; - if (M > 5) AddProduct(C + 0 * F, _alpha, c05), AddProduct(C + 1 * F, _alpha, c15), AddProduct(C + 2 * F, _alpha, c25, tail), C += ldc; - if (M > 6) AddProduct(C + 0 * F, _alpha, c06), AddProduct(C + 1 * F, _alpha, c16), AddProduct(C + 2 * F, _alpha, c26, tail), C += ldc; - if (M > 7) AddProduct(C + 0 * F, _alpha, c07), AddProduct(C + 1 * F, _alpha, c17), AddProduct(C + 2 * F, _alpha, c27, tail), C += ldc; - } - - template<int M> void GemmKernelMx8nnT(size_t, size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, size_t tail) - { - float32x4_t c00, c01, c02, c03, c04, c05, c06, c07, c08, c09, c0A, c0B, c10, c11, c12, c13, c14, c15, c16, c17, c18, c19, c1A, c1B, b0, b1, a0; - if (M > 0x0) c00 = vdupq_n_f32(0), c10 = vdupq_n_f32(0); - if (M > 0x1) c01 = vdupq_n_f32(0), c11 = vdupq_n_f32(0); - if (M > 0x2) c02 = vdupq_n_f32(0), c12 = vdupq_n_f32(0); - if (M > 0x3) c03 = vdupq_n_f32(0), c13 = vdupq_n_f32(0); - if (M > 0x4) c04 = vdupq_n_f32(0), c14 = vdupq_n_f32(0); - if (M > 0x5) c05 = vdupq_n_f32(0), c15 = vdupq_n_f32(0); - if (M > 0x6) c06 = vdupq_n_f32(0), c16 = vdupq_n_f32(0); - if (M > 0x7) c07 = vdupq_n_f32(0), c17 = vdupq_n_f32(0); - if (M > 0x8) c08 = vdupq_n_f32(0), c18 = vdupq_n_f32(0); - if (M > 0x9) c09 = vdupq_n_f32(0), c19 = vdupq_n_f32(0); - if (M > 0xA) c0A = vdupq_n_f32(0), c1A = vdupq_n_f32(0); - if (M > 0xB) c0B = vdupq_n_f32(0), c1B = vdupq_n_f32(0); - const float* A0 = A, * A6 = A + 6 * lda; - size_t oa0, oa1, oa2, oa3, oa4, oa5, oa6; - if (M > 0) oa0 = lda * 0; - if (M > 1) oa1 = lda * 1; - if (M > 2) oa2 = lda * 2; - if (M > 3) oa3 = lda * 3; - if (M > 4) oa4 = lda * 4; - if (M > 5) oa5 = lda * 5; - const size_t sa = lda == 1 ? 
M : 1; - const size_t ob0 = ldb * 0; - const size_t ob1 = ldb * 1; - for (size_t k = 0; k < K; k++) - { - b0 = Load(B + ob0); - b1 = Load(B + ob1); - if (M > 0x0) a0 = vld1q_dup_f32(A0 + oa0), c00 = vmlaq_f32(c00, b0, a0), c10 = vmlaq_f32(c10, b1, a0); - if (M > 0x1) a0 = vld1q_dup_f32(A0 + oa1), c01 = vmlaq_f32(c01, b0, a0), c11 = vmlaq_f32(c11, b1, a0); - if (M > 0x2) a0 = vld1q_dup_f32(A0 + oa2), c02 = vmlaq_f32(c02, b0, a0), c12 = vmlaq_f32(c12, b1, a0); - if (M > 0x3) a0 = vld1q_dup_f32(A0 + oa3), c03 = vmlaq_f32(c03, b0, a0), c13 = vmlaq_f32(c13, b1, a0); - if (M > 0x4) a0 = vld1q_dup_f32(A0 + oa4), c04 = vmlaq_f32(c04, b0, a0), c14 = vmlaq_f32(c14, b1, a0); - if (M > 0x5) a0 = vld1q_dup_f32(A0 + oa5), c05 = vmlaq_f32(c05, b0, a0), c15 = vmlaq_f32(c15, b1, a0); - if (M > 0x6) a0 = vld1q_dup_f32(A6 + oa0), c06 = vmlaq_f32(c06, b0, a0), c16 = vmlaq_f32(c16, b1, a0); - if (M > 0x7) a0 = vld1q_dup_f32(A6 + oa1), c07 = vmlaq_f32(c07, b0, a0), c17 = vmlaq_f32(c17, b1, a0); - if (M > 0x8) a0 = vld1q_dup_f32(A6 + oa2), c08 = vmlaq_f32(c08, b0, a0), c18 = vmlaq_f32(c18, b1, a0); - if (M > 0x9) a0 = vld1q_dup_f32(A6 + oa3), c09 = vmlaq_f32(c09, b0, a0), c19 = vmlaq_f32(c19, b1, a0); - if (M > 0xA) a0 = vld1q_dup_f32(A6 + oa4), c0A = vmlaq_f32(c0A, b0, a0), c1A = vmlaq_f32(c1A, b1, a0); - if (M > 0xB) a0 = vld1q_dup_f32(A6 + oa5), c0B = vmlaq_f32(c0B, b0, a0), c1B = vmlaq_f32(c1B, b1, a0); - B += sb; - A0 += sa; - A6 += sa; - } - float32x4_t _alpha = vdupq_n_f32(alpha); - if (M > 0x0) AddProduct(C + 0 * F, _alpha, c00), AddProduct(C + 1 * F, _alpha, c10, tail), C += ldc; - if (M > 0x1) AddProduct(C + 0 * F, _alpha, c01), AddProduct(C + 1 * F, _alpha, c11, tail), C += ldc; - if (M > 0x2) AddProduct(C + 0 * F, _alpha, c02), AddProduct(C + 1 * F, _alpha, c12, tail), C += ldc; - if (M > 0x3) AddProduct(C + 0 * F, _alpha, c03), AddProduct(C + 1 * F, _alpha, c13, tail), C += ldc; - if (M > 0x4) AddProduct(C + 0 * F, _alpha, c04), AddProduct(C + 1 * F, _alpha, c14, tail), C += ldc; - if (M > 0x5) AddProduct(C + 0 * F, _alpha, c05), AddProduct(C + 1 * F, _alpha, c15, tail), C += ldc; - if (M > 0x6) AddProduct(C + 0 * F, _alpha, c06), AddProduct(C + 1 * F, _alpha, c16, tail), C += ldc; - if (M > 0x7) AddProduct(C + 0 * F, _alpha, c07), AddProduct(C + 1 * F, _alpha, c17, tail), C += ldc; - if (M > 0x8) AddProduct(C + 0 * F, _alpha, c08), AddProduct(C + 1 * F, _alpha, c18, tail), C += ldc; - if (M > 0x9) AddProduct(C + 0 * F, _alpha, c09), AddProduct(C + 1 * F, _alpha, c19, tail), C += ldc; - if (M > 0xA) AddProduct(C + 0 * F, _alpha, c0A), AddProduct(C + 1 * F, _alpha, c1A, tail), C += ldc; - if (M > 0xB) AddProduct(C + 0 * F, _alpha, c0B), AddProduct(C + 1 * F, _alpha, c1B, tail), C += ldc; - } - - template<int M> void GemmKernelMx4nnT(size_t, size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, size_t sb, float * C, size_t ldc, size_t tail) - { - float32x4_t c00, c01, c02, c03, c04, c05, c06, c07, c08, c09, c0A, c0B, b0, a0; - if (M > 0x0) c00 = vdupq_n_f32(0); - if (M > 0x1) c01 = vdupq_n_f32(0); - if (M > 0x2) c02 = vdupq_n_f32(0); - if (M > 0x3) c03 = vdupq_n_f32(0); - if (M > 0x4) c04 = vdupq_n_f32(0); - if (M > 0x5) c05 = vdupq_n_f32(0); - if (M > 0x6) c06 = vdupq_n_f32(0); - if (M > 0x7) c07 = vdupq_n_f32(0); - if (M > 0x8) c08 = vdupq_n_f32(0); - if (M > 0x9) c09 = vdupq_n_f32(0); - if (M > 0xA) c0A = vdupq_n_f32(0); - if (M > 0xB) c0B = vdupq_n_f32(0); - const float* A0 = A, * A6 = A + 6 * lda; - size_t oa0, oa1, oa2, oa3, oa4, oa5; - if (M > 0) oa0 = lda * 0; - if (M > 1) 
oa1 = lda * 1; - if (M > 2) oa2 = lda * 2; - if (M > 3) oa3 = lda * 3; - if (M > 4) oa4 = lda * 4; - if (M > 5) oa5 = lda * 5; - const size_t sa = lda == 1 ? M : 1; - const size_t ob0 = ldb * 0; - for (size_t k = 0; k < K; k++) - { - b0 = Load(B + ob0); - if (M > 0x0) c00 = vmlaq_f32(c00, b0, vld1q_dup_f32(A0 + oa0)); - if (M > 0x1) c01 = vmlaq_f32(c01, b0, vld1q_dup_f32(A0 + oa1)); - if (M > 0x2) c02 = vmlaq_f32(c02, b0, vld1q_dup_f32(A0 + oa2)); - if (M > 0x3) c03 = vmlaq_f32(c03, b0, vld1q_dup_f32(A0 + oa3)); - if (M > 0x4) c04 = vmlaq_f32(c04, b0, vld1q_dup_f32(A0 + oa4)); - if (M > 0x5) c05 = vmlaq_f32(c05, b0, vld1q_dup_f32(A0 + oa5)); - if (M > 0x6) c06 = vmlaq_f32(c06, b0, vld1q_dup_f32(A6 + oa0)); - if (M > 0x7) c07 = vmlaq_f32(c07, b0, vld1q_dup_f32(A6 + oa1)); - if (M > 0x8) c08 = vmlaq_f32(c08, b0, vld1q_dup_f32(A6 + oa2)); - if (M > 0x9) c09 = vmlaq_f32(c09, b0, vld1q_dup_f32(A6 + oa3)); - if (M > 0xA) c0A = vmlaq_f32(c0A, b0, vld1q_dup_f32(A6 + oa4)); - if (M > 0xB) c0B = vmlaq_f32(c0B, b0, vld1q_dup_f32(A6 + oa5)); - B += sb; - A0 += sa; - A6 += sa; - } - float32x4_t _alpha = vdupq_n_f32(alpha); - if (M > 0x0) AddProduct(C, _alpha, c00, tail), C += ldc; - if (M > 0x1) AddProduct(C, _alpha, c01, tail), C += ldc; - if (M > 0x2) AddProduct(C, _alpha, c02, tail), C += ldc; - if (M > 0x3) AddProduct(C, _alpha, c03, tail), C += ldc; - if (M > 0x4) AddProduct(C, _alpha, c04, tail), C += ldc; - if (M > 0x5) AddProduct(C, _alpha, c05, tail), C += ldc; - if (M > 0x6) AddProduct(C, _alpha, c06, tail), C += ldc; - if (M > 0x7) AddProduct(C, _alpha, c07, tail), C += ldc; - if (M > 0x8) AddProduct(C, _alpha, c08, tail), C += ldc; - if (M > 0x9) AddProduct(C, _alpha, c09, tail), C += ldc; - if (M > 0xA) AddProduct(C, _alpha, c0A, tail), C += ldc; - if (M > 0xB) AddProduct(C, _alpha, c0B, tail), C += ldc; - } - - SIMD_INLINE Simd::GemmNN::Tail GetGemmTail(size_t M, size_t N) - { - if (N <= 4) - { - switch (M) - { - case 0: return GemmKernelMx4nnT<0>; - case 1: return GemmKernelMx4nnT<1>; - case 2: return GemmKernelMx4nnT<2>; - case 3: return GemmKernelMx4nnT<3>; - case 4: return GemmKernelMx4nnT<4>; - case 5: return GemmKernelMx4nnT<5>; - case 6: return GemmKernelMx4nnT<6>; - case 7: return GemmKernelMx4nnT<7>; - case 8: return GemmKernelMx4nnT<8>; - case 9: return GemmKernelMx4nnT<9>; - case 10: return GemmKernelMx4nnT<10>; - case 11: return GemmKernelMx4nnT<11>; - case 12: return GemmKernelMx4nnT<12>; - } - } - else if (N <= 8) - { - switch (M) - { - case 0: return GemmKernelMx8nnT<0>; - case 1: return GemmKernelMx8nnT<1>; - case 2: return GemmKernelMx8nnT<2>; - case 3: return GemmKernelMx8nnT<3>; - case 4: return GemmKernelMx8nnT<4>; - case 5: return GemmKernelMx8nnT<5>; - case 6: return GemmKernelMx8nnT<6>; - case 7: return GemmKernelMx8nnT<7>; - case 8: return GemmKernelMx8nnT<8>; - case 9: return GemmKernelMx8nnT<9>; - case 10: return GemmKernelMx8nnT<10>; - case 11: return GemmKernelMx8nnT<11>; - case 12: return GemmKernelMx8nnT<12>; - } - } - else if (N <= 12) - { - switch (M) - { - case 0: return GemmKernelMx12nnT<0>; - case 1: return GemmKernelMx12nnT<1>; - case 2: return GemmKernelMx12nnT<2>; - case 3: return GemmKernelMx12nnT<3>; - case 4: return GemmKernelMx12nnT<4>; - case 5: return GemmKernelMx12nnT<5>; - case 6: return GemmKernelMx12nnT<6>; - case 7: return GemmKernelMx12nnT<7>; - case 8: return GemmKernelMx12nnT<8>; - } - } - else if (N <= 16) - { - switch (M) - { - case 0: return GemmKernelMx16nnT<0>; - case 1: return GemmKernelMx16nnT<1>; - case 2: return 
GemmKernelMx16nnT<2>; - case 3: return GemmKernelMx16nnT<3>; - case 4: return GemmKernelMx16nnT<4>; - case 5: return GemmKernelMx16nnT<5>; - case 6: return GemmKernelMx16nnT<6>; - } - } - assert(0); - return NULL; - } - - SIMD_INLINE void GemmPackA_4x4(const float* src, size_t stride, float* dst) - { - float32x4x4_t dst0; - dst0.val[0] = Load(src + 0 * stride); - dst0.val[1] = Load(src + 1 * stride); - dst0.val[2] = Load(src + 2 * stride); - dst0.val[3] = Load(src + 3 * stride); - Store4(dst, dst0); - } - - SIMD_INLINE void GemmPackA_6x4(const float* src, size_t stride, float* dst) - { - float32x4_t src0 = Load(src + 0 * stride); - float32x4_t src1 = Load(src + 1 * stride); - float32x4_t src2 = Load(src + 2 * stride); - float32x4_t src3 = Load(src + 3 * stride); - float32x4_t src4 = Load(src + 4 * stride); - float32x4_t src5 = Load(src + 5 * stride); - float32x4x2_t src03 = vzipq_f32(src0, src3); - float32x4x2_t src14 = vzipq_f32(src1, src4); - float32x4x2_t src25 = vzipq_f32(src2, src5); - float32x4x3_t dst0; - dst0.val[0] = src03.val[0]; - dst0.val[1] = src14.val[0]; - dst0.val[2] = src25.val[0]; - Store3(dst, dst0); - float32x4x3_t dst1; - dst1.val[0] = src03.val[1]; - dst1.val[1] = src14.val[1]; - dst1.val[2] = src25.val[1]; - Store3(dst + 12, dst1); - } - - SIMD_INLINE void GemmPackA_8x4(const float* src, size_t stride, float* dst) - { - float32x4x2_t src04 = vzipq_f32(Load(src + 0 * stride), Load(src + 4 * stride)); - float32x4x2_t src15 = vzipq_f32(Load(src + 1 * stride), Load(src + 5 * stride)); - float32x4x2_t src26 = vzipq_f32(Load(src + 2 * stride), Load(src + 6 * stride)); - float32x4x2_t src37 = vzipq_f32(Load(src + 3 * stride), Load(src + 7 * stride)); - float32x4x4_t dst0; - dst0.val[0] = src04.val[0]; - dst0.val[1] = src15.val[0]; - dst0.val[2] = src26.val[0]; - dst0.val[3] = src37.val[0]; - Store4(dst, dst0); - float32x4x4_t dst1; - dst1.val[0] = src04.val[1]; - dst1.val[1] = src15.val[1]; - dst1.val[2] = src26.val[1]; - dst1.val[3] = src37.val[1]; - Store4(dst + 16, dst1); - } - - SIMD_INLINE void GemmPackA_12x4(const float* src, size_t stride, float* dst) - { - float32x4x2_t b[6]; - b[0] = vzipq_f32(Load(src + 0 * stride), Load(src + 6 * stride)); - b[1] = vzipq_f32(Load(src + 1 * stride), Load(src + 7 * stride)); - b[2] = vzipq_f32(Load(src + 2 * stride), Load(src + 8 * stride)); - b[3] = vzipq_f32(Load(src + 3 * stride), Load(src + 9 * stride)); - b[4] = vzipq_f32(Load(src + 4 * stride), Load(src + 10 * stride)); - b[5] = vzipq_f32(Load(src + 5 * stride), Load(src + 11 * stride)); - - float32x4x2_t c[3]; - c[0] = vzipq_f32(b[0].val[0], b[3].val[0]); - c[1] = vzipq_f32(b[1].val[0], b[4].val[0]); - c[2] = vzipq_f32(b[2].val[0], b[5].val[0]); - - float32x4x3_t d; - d.val[0] = c[0].val[0]; - d.val[1] = c[1].val[0]; - d.val[2] = c[2].val[0]; - Store3(dst + 0, d); - d.val[0] = c[0].val[1]; - d.val[1] = c[1].val[1]; - d.val[2] = c[2].val[1]; - Store3(dst + 12, d); - - c[0] = vzipq_f32(b[0].val[1], b[3].val[1]); - c[1] = vzipq_f32(b[1].val[1], b[4].val[1]); - c[2] = vzipq_f32(b[2].val[1], b[5].val[1]); - - d.val[0] = c[0].val[0]; - d.val[1] = c[1].val[0]; - d.val[2] = c[2].val[0]; - Store3(dst + 24, d); - d.val[0] = c[0].val[1]; - d.val[1] = c[1].val[1]; - d.val[2] = c[2].val[1]; - Store3(dst + 36, d); - } - - void GemmPackA(const float * src, size_t stride, size_t M, size_t K, size_t cell, float * dst) - { - size_t K4 = AlignLo(K, 4); - for (size_t i = 0; i < M; i += cell) - { - size_t m = Simd::Min(cell, M - i), k = 0; - if (cell == 4 && m == 4) - { - for (; k < K4; k += 4, 
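/* GemmPackA transposes each cell x K panel of A into column-major order, i.e. dst[k * cell + c] = src[c * stride + k], so the kernels above can read a whole panel column as cell consecutive floats; the GemmPackA_*x4 helpers above handle four columns per step with NEON zips and interleaved stores, and the scalar loop finishes the K % 4 remainder. */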
dst += 16) - GemmPackA_4x4(src + k, stride, dst); - } - else if (cell == 6 && m == 6) - { - for (; k < K4; k += 4, dst += 24) - GemmPackA_6x4(src + k, stride, dst); - } - else if (cell == 8 && m == 8) - { - for (; k < K4; k += 4, dst += 32) - GemmPackA_8x4(src + k, stride, dst); - } - else if (cell == 12 && m == 12) - { - for (; k < K4; k += 4, dst += 48) - GemmPackA_12x4(src + k, stride, dst); - } - for (; k < K; ++k) - { - for (size_t c = 0; c < m; ++c) - *(dst++) = src[c*stride + k]; - } - src += cell * stride; - } - } - - void GemmPackB(const float * B, size_t ldb, size_t K, size_t N, size_t microN, float * pB) - { - for (size_t j = 0; j < N; j += microN) - { - size_t n = Simd::Min(microN, N - j); - size_t k = 0; - if (microN == 1 * F) - { - if (n == microN) - { - for (; k < K; ++k) - { - const float * b = B + k * ldb; - Store(pB + 0 * F, Load(b + 0 * F)); - pB += microN; - } - } - else - { - float32x4_t mask0 = LeftNotZero32f(n - 0 * F); - for (; k < K - 1; ++k) - { - const float * b = B + k * ldb; - Store(pB + 0 * F, And(mask0, Load(b + 0 * F))); - pB += microN; - } - } - } - else if (microN == 2 * F) - { - if (n == microN) - { - for (; k < K; ++k) - { - const float * b = B + k * ldb; - Store(pB + 0 * F, Load(b + 0 * F)); - Store(pB + 1 * F, Load(b + 1 * F)); - pB += microN; - } - } - else - { - float32x4_t mask0 = LeftNotZero32f(n - 0 * F); - float32x4_t mask1 = LeftNotZero32f(n - 1 * F); - for (; k < K - 1; ++k) - { - const float * b = B + k * ldb; - Store(pB + 0 * F, And(mask0, Load(b + 0 * F))); - Store(pB + 1 * F, And(mask1, Load(b + 1 * F))); - pB += microN; - } - } - } - else if (microN == 3 * F) - { - if (n == microN) - { - for (; k < K; ++k) - { - const float * b = B + k * ldb; - Store(pB + 0 * F, Load(b + 0 * F)); - Store(pB + 1 * F, Load(b + 1 * F)); - Store(pB + 2 * F, Load(b + 2 * F)); - pB += microN; - } - } - else - { - float32x4_t mask0 = LeftNotZero32f(n - 0 * F); - float32x4_t mask1 = LeftNotZero32f(n - 1 * F); - float32x4_t mask2 = LeftNotZero32f(n - 2 * F); - for (; k < K - 1; ++k) - { - const float * b = B + k * ldb; - Store(pB + 0 * F, And(mask0, Load(b + 0 * F))); - Store(pB + 1 * F, And(mask1, Load(b + 1 * F))); - Store(pB + 2 * F, And(mask2, Load(b + 2 * F))); - pB += microN; - } - } - } - else if (microN == 4 * F) - { - if (n == microN) - { - for (; k < K; ++k) - { - const float* b = B + k * ldb; - Store(pB + 0 * F, Load(b + 0 * F)); - Store(pB + 1 * F, Load(b + 1 * F)); - Store(pB + 2 * F, Load(b + 2 * F)); - Store(pB + 3 * F, Load(b + 3 * F)); - pB += microN; - } - } - else - { - float32x4_t mask0 = LeftNotZero32f(n - 0 * F); - float32x4_t mask1 = LeftNotZero32f(n - 1 * F); - float32x4_t mask2 = LeftNotZero32f(n - 2 * F); - float32x4_t mask3 = LeftNotZero32f(n - 3 * F); - for (; k < K - 1; ++k) - { - const float* b = B + k * ldb; - Store(pB + 0 * F, And(mask0, Load(b + 0 * F))); - Store(pB + 1 * F, And(mask1, Load(b + 1 * F))); - Store(pB + 2 * F, And(mask2, Load(b + 2 * F))); - Store(pB + 3 * F, And(mask3, Load(b + 3 * F))); - pB += microN; - } - } - } - for (; k < K; ++k) - { - const float * b = B + k * ldb; - size_t c = 0; - for (; c < n; ++c) - *(pB++) = *(b++); - for (; c < microN; ++c) - *(pB++) = 0; - } - B += microN; - } - } - - SIMD_INLINE void ScaleC(float * C, float32x4_t beta) - { - Store(C, vmulq_f32(Load(C), beta)); - } - - void GemmScaleC(size_t M, size_t N, float beta, float * C, size_t ldc) - { - if (beta == 1.0f) - return; - else if (beta == 0.0f) - { - for (size_t i = 0; i < M; ++i) - memset(C + i * ldc, 0, N * sizeof(float)); - } - else 
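/* General beta: each row is scaled in place, QF (4 * F = 16) floats per iteration, then F at a time, then scalar for the remainder; the beta == 1.0f case returns immediately and beta == 0.0f uses memset, so C is never read in either. */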
- { - size_t NQF = AlignLo(N, QF); - size_t NF = AlignLo(N, F); - float32x4_t _beta = vdupq_n_f32(beta); - for (size_t i = 0; i < M; ++i) - { - size_t j = 0; - for (; j < NQF; j += QF) - { - ScaleC(C + j + F * 0, _beta); - ScaleC(C + j + F * 1, _beta); - ScaleC(C + j + F * 2, _beta); - ScaleC(C + j + F * 3, _beta); - } - for (; j < NF; j += F) - ScaleC(C + j, _beta); - for (; j < N; ++j) - C[j] *= beta; - C += ldc; - } - } - } - - void Gemm32fNN(size_t M, size_t N, size_t K, const float * alpha, const float * A, size_t lda, const float * B, size_t ldb, const float * beta, float * C, size_t ldc) - { - typedef Simd::GemmNN GemmNN; - GemmNN::Main kernelMM, kernelMT; - GemmNN::Tail kernelTM, kernelTT; - size_t microM, microN, L1, L2; -#if defined(SIMD_ARM64_ENABLE) - if (N == 8 || M == 12 || M * 8 < N) - { - microM = 12; - microN = 8; - size_t tail = N - AlignLoAny(N, microN); - kernelMM = GemmKernel12x8nn; - kernelMT = tail > F ? GemmKernel12x8nn : GemmKernel12x4nn; - kernelTM = GemmKernelMx8nn; - kernelTT = tail > F ? GemmKernelMx8nn : GemmKernelMx4nn; - } - else if(N == 12 || N == 24 || M == 8 || M == 16) - { - microM = 8; - microN = 12; - size_t tail = N - AlignLoAny(N, microN); - kernelMM = GemmKernel8x12nn; - kernelMT = tail > DF ? GemmKernel8x12nn : (tail > F ? GemmKernel8x8nn : GemmKernel8x4nn); - kernelTM = GemmKernelMx12nn; - kernelTT = tail > DF ? GemmKernelMx12nn : (tail > F ? GemmKernelMx8nn : GemmKernelMx4nn); - } - else - { - microM = 6; - microN = 16; - size_t tail = N - AlignLoAny(N, microN); - kernelMM = GemmKernel6x16nn; - kernelMT = tail > 3 * F ? GemmKernel6x16nn : (tail > 2 * F ? GemmKernel6x12nn : (tail > F ? GemmKernel6x8nn : GemmKernel6x4nn)); - kernelTM = GemmKernelMx16nn; - kernelTT = tail > 3 * F ? GemmKernelMx16nn : (tail > 2 * F ? GemmKernelMx12nn : (tail > F ? GemmKernelMx8nn : GemmKernelMx4nn)); - } -#else - if (N != 12 && M != 4 && M != 8) - { - microM = 6; - microN = 8; - size_t tail = N - AlignLoAny(N, microN); - kernelMM = GemmKernel6x8nn; - kernelMT = tail > F ? GemmKernel6x8nn : GemmKernel6x4nn; - kernelTM = GemmKernelMx8nn; - kernelTT = tail > F ? GemmKernelMx8nn : GemmKernelMx4nn; - } - else - { - microM = 4; - microN = 12; - size_t tail = N - AlignLoAny(N, microN); - kernelMM = GemmKernel4x12nn; - kernelMT = tail > DF ? GemmKernel4x12nn : (tail > F ? GemmKernel4x8nn : GemmKernel4x4nn); - kernelTM = GemmKernelMx12nn; - kernelTT = tail > DF ? GemmKernelMx12nn : (tail > F ? GemmKernelMx8nn : GemmKernelMx4nn); - } -#endif - GemmNN::PackA packA = GemmPackA; - L1 = N > 4096 ? Base::AlgCacheL2() : Base::AlgCacheL1(); - L2 = N > 4096 ? Base::AlgCacheL3() : Base::AlgCacheL2(); - GemmNN gemmNN(M, N, K, microM, microN, L1, L2, Base::AlgCacheL3(), F, - kernelMM, kernelMT, kernelTM, kernelTT, packA, GemmPackB, GemmScaleC, NULL); - gemmNN.Run(alpha, A, lda, B, ldb, beta, C, ldc); - } - - //--------------------------------------------------------------------- - - typedef Simd::GemmNNcb Gemm32fNNcb; - - SIMD_INLINE Gemm32fNNcb CreateGemm32fNNcb(size_t M, size_t N, size_t K, GemmKernelType type, bool compatibility) - { - Gemm32fNNcb::Main kernelMM, kernelMT; - Gemm32fNNcb::Tail kernelTM, kernelTT; - size_t microM, microN; -#if defined(SIMD_ARM64_ENABLE) - if (type == GemmKernelF4 || (type == GemmKernelAny && (M != 4 && M != 8 && M != 16) && N > 12)) - { - microN = 16; - size_t tail = N - AlignLoAny(N, microN); - microM = 6; - kernelMM = Neon::GemmKernel6x16nn; - kernelMT = tail > 3 * F ? Neon::GemmKernel6x16nn : (tail > DF ? Neon::GemmKernel6x12nn : (tail > F ? 
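/* tail = N - AlignLoAny(N, microN) decides how the rightmost block of columns is finished: a kernel one, two or three vectors (F, DF, 3 * F) narrower takes over, and its own tail mask covers the final partial vector. */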
Neon::GemmKernel6x8nn : Neon::GemmKernel6x4nn)); - kernelTM = Neon::GetGemmTail(M % microM, microN); - kernelTT = Neon::GetGemmTail(M % microM, tail); - type = GemmKernelF4; - } - if (type == GemmKernelF3 || (type == GemmKernelAny && (M == 4 || M == 8 || M == 16) && N > 8)) - { - microN = 12; - size_t tail = N - AlignLoAny(N, microN); - if (M == 4) - { - microM = 4; - kernelMM = Neon::GemmKernel4x12nn; - kernelMT = tail > DF ? Neon::GemmKernel4x12nn : (tail > F ? Neon::GemmKernel4x8nn : Neon::GemmKernel4x4nn); - } - else - { - microM = 8; - kernelMM = Neon::GemmKernel8x12nn; - kernelMT = tail > DF ? Neon::GemmKernel8x12nn : (tail > F ? Neon::GemmKernel8x8nn : Neon::GemmKernel8x4nn); - } - kernelTM = Neon::GetGemmTail(M % microM, microN); - kernelTT = Neon::GetGemmTail(M % microM, tail); - type = GemmKernelF3; - } - if (type == GemmKernelF2 || (type == GemmKernelF3 && N <= 8) || (type == GemmKernelAny && N > 4)) - { - microN = 8; - size_t tail = N - AlignLoAny(N, microN); - if (M == 6) - { - microM = 6; - kernelMM = Neon::GemmKernel6x8nn; - kernelMT = tail > F ? Neon::GemmKernel6x8nn : Neon::GemmKernel6x4nn; - } - else - { - microM = 12; - kernelMM = Neon::GemmKernel12x8nn; - kernelMT = tail > F ? Neon::GemmKernel12x8nn : Neon::GemmKernel12x4nn; - } - kernelTM = Neon::GetGemmTail(M % microM, microN); - kernelTT = Neon::GetGemmTail(M % microM, tail); - type = GemmKernelF2; - } - if (type == GemmKernelF1 || (type == GemmKernelF2 && N <= 4) || type == GemmKernelAny) - { - microM = 12; - microN = 4; - kernelMM = Neon::GemmKernel12x4nn; - kernelMT = Neon::GemmKernel12x4nn; - kernelTM = Neon::GetGemmTail(M % microM, microN); - kernelTT = Neon::GetGemmTail(M % microM, microN); - type = GemmKernelF1; - } -#else - if (type == GemmKernelF3 || (type == GemmKernelAny && (M == 4 || M == 8 || M == 16) && N > 8)) - { - microM = 4; - microN = 12; - size_t tail = N - AlignLoAny(N, microN); - kernelMM = Neon::GemmKernel4x12nn; - kernelMT = tail > DF ? Neon::GemmKernel4x12nn : (tail > F ? Neon::GemmKernel4x8nn : Neon::GemmKernel4x4nn); - kernelTM = Neon::GetGemmTail(M%microM, microN); - kernelTT = Neon::GetGemmTail(M%microM, tail); - type = GemmKernelF3; - } - if (type == GemmKernelF2 || (type == GemmKernelF3 && N <= 8) || (type == GemmKernelAny && N > 4)) - { - microM = 6; - microN = 8; - size_t tail = N - AlignLoAny(N, microN); - kernelMM = Neon::GemmKernel6x8nn; - kernelMT = tail > F ? Neon::GemmKernel6x8nn : Neon::GemmKernel6x4nn; - kernelTM = Neon::GetGemmTail(M%microM, microN); - kernelTT = Neon::GetGemmTail(M%microM, tail); - type = GemmKernelF2; - } - if (type == GemmKernelF1 || (type == GemmKernelF2 && N <= 4) || type == GemmKernelAny) - { - microM = 6; - microN = 4; - kernelMM = Neon::GemmKernel6x4nn; - kernelMT = Neon::GemmKernel6x4nn; - kernelTM = Neon::GetGemmTail(M%microM, microN); - kernelTT = Neon::GetGemmTail(M%microM, microN); - type = GemmKernelF1; - } -#endif - Gemm32fNNcb::PackA packA = ((M * 3 < N && N >= 512 && K >= 128 && M > 16) || (K >= 256 && M > 256)) ? 
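/* Packing A is enabled only where its one-off transpose cost is amortized: either N is much wider than M (each packed panel is reused across many column blocks of B) with K and M large enough, or K and M are both large; otherwise the kernels stream A directly in row-major order. */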
Neon::GemmPackA : NULL; - return Gemm32fNNcb(M, N, K, microM, microN, Base::AlgCacheL1(), Base::AlgCacheL2(), Base::AlgCacheL3(), F, - kernelMM, kernelMT, kernelTM, kernelTT, packA, Neon::GemmPackB, Neon::GemmScaleC, NULL, compatibility); - } - - size_t Gemm32fNNcbBufferSize(size_t M, size_t N, size_t K, GemmKernelType type, bool compatibility) - { - Gemm32fNNcb gemm = CreateGemm32fNNcb(M, N, K, type, compatibility); - return gemm.BufferSize(); - } - - void Gemm32fNNcbReorderB(size_t M, size_t N, size_t K, const float * B, float * pB, GemmKernelType type, bool compatibility) - { - Gemm32fNNcb gemm = CreateGemm32fNNcb(M, N, K, type, compatibility); - gemm.ReorderB(B, N, pB); - } - - void Gemm32fNNcbRun(size_t M, size_t N, size_t K, const float * A, const float * pB, float * C, GemmKernelType type, bool compatibility) - { - Gemm32fNNcb gemm = CreateGemm32fNNcb(M, N, K, type, compatibility); - gemm.Run(A, K, pB, C, N); - } - - //--------------------------------------------------------------------- - - SIMD_INLINE float32x4_t Tail(size_t tail) - { - const int32_t mask[DF] = { 0, 0, 0, 0, -1, -1, -1, -1 }; - return Load((float*)(mask + tail)); - } - - SIMD_INLINE void Add4ExtractedSums(const float32x4_t & sum0, const float32x4_t & sum1, const float32x4_t & sum2, const float32x4_t & sum3, const float32x4_t & alpha, float * dst) - { - float32x4x2_t a02 = vzipq_f32(sum0, sum2); - float32x4x2_t a13 = vzipq_f32(sum1, sum3); - float32x4x2_t b0 = vzipq_f32(a02.val[0], a13.val[0]); - float32x4x2_t b1 = vzipq_f32(a02.val[1], a13.val[1]); - Store(dst, vmlaq_f32(Load(dst), alpha, vaddq_f32(vaddq_f32(b0.val[0], b0.val[1]), vaddq_f32(b1.val[0], b1.val[1])))); - } - - static void Kernel1x1x4nt(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc) - { - size_t K4 = K & (~3); - const float * A0 = A + 0 * lda; - const float * B0 = B + 0 * ldb; - float32x4_t c00 = vdupq_n_f32(0); - float32x4_t a0, b0; - for (size_t k = 0; k < K4; k += 4) - { - a0 = Load(A0 + k); - b0 = Load(B0 + k); - c00 = vmlaq_f32(c00, a0, b0); - } - if (K4 < K) - { - size_t k = K - 4; - float32x4_t tail = Tail(K - K4); - a0 = And(tail, Load(A0 + k)); - b0 = Load(B0 + k); - c00 = vmlaq_f32(c00, a0, b0); - } - C[0] += alpha * ExtractSum32f(c00); - } - - static void Kernel1x4x4nt(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc) - { - size_t K4 = K & (~3); - const float * A0 = A + 0 * lda; - const float * B0 = B + 0 * ldb; - const float * B1 = B + 1 * ldb; - const float * B2 = B + 2 * ldb; - const float * B3 = B + 3 * ldb; - float32x4_t c00 = vdupq_n_f32(0); - float32x4_t c01 = vdupq_n_f32(0); - float32x4_t c02 = vdupq_n_f32(0); - float32x4_t c03 = vdupq_n_f32(0); - float32x4_t a0, b0; - for (size_t k = 0; k < K4; k += 4) - { - a0 = Load(A0 + k); - b0 = Load(B0 + k); - c00 = vmlaq_f32(c00, a0, b0); - b0 = Load(B1 + k); - c01 = vmlaq_f32(c01, a0, b0); - b0 = Load(B2 + k); - c02 = vmlaq_f32(c02, a0, b0); - b0 = Load(B3 + k); - c03 = vmlaq_f32(c03, a0, b0); - } - if (K4 < K) - { - size_t k = K - 4; - float32x4_t tail = Tail(K - K4); - a0 = And(tail, Load(A0 + k)); - b0 = Load(B0 + k); - c00 = vmlaq_f32(c00, a0, b0); - b0 = Load(B1 + k); - c01 = vmlaq_f32(c01, a0, b0); - b0 = Load(B2 + k); - c02 = vmlaq_f32(c02, a0, b0); - b0 = Load(B3 + k); - c03 = vmlaq_f32(c03, a0, b0); - } - float32x4_t _alpha = vdupq_n_f32(alpha); - Add4ExtractedSums(c00, c01, c02, c03, _alpha, C + 0 * ldc); - } - - static void Kernel2x1x4nt(size_t K, float alpha, const 
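/* The Kernel*x*x*nt functions accumulate dot products of rows of A with rows of B (B transposed). Tail(t) builds a mask whose last t lanes are set: when K is not a multiple of 4 the loop reloads the final four elements, and the mask zeroes the 4 - t lanes the main loop has already accumulated (e.g. K = 10: K4 = 8, Tail(2) = { 0, 0, -1, -1 }). Add4ExtractedSums transposes four per-output partial sums so that lane j holds the complete sum for output j, then updates four entries of a row of C with one vmlaq_f32. */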
float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc) - { - size_t K4 = K & (~3); - const float * A0 = A + 0 * lda; - const float * A1 = A + 1 * lda; - const float * B0 = B + 0 * ldb; - float32x4_t c00 = vdupq_n_f32(0); - float32x4_t c10 = vdupq_n_f32(0); - float32x4_t a0, a1, b0; - for (size_t k = 0; k < K4; k += 4) - { - a0 = Load(A0 + k); - a1 = Load(A1 + k); - b0 = Load(B0 + k); - c00 = vmlaq_f32(c00, a0, b0); - c10 = vmlaq_f32(c10, a1, b0); - } - if (K4 < K) - { - size_t k = K - 4; - float32x4_t tail = Tail(K - K4); - a0 = And(tail, Load(A0 + k)); - a1 = And(tail, Load(A1 + k)); - b0 = Load(B0 + k); - c00 = vmlaq_f32(c00, a0, b0); - c10 = vmlaq_f32(c10, a1, b0); - } - C[0 * ldc] += alpha * ExtractSum32f(c00); - C[1 * ldc] += alpha * ExtractSum32f(c10); - } - - static void Kernel2x4x4nt(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc) - { - size_t K4 = K & (~3); - const float * A0 = A + 0 * lda; - const float * A1 = A + 1 * lda; - const float * B0 = B + 0 * ldb; - const float * B1 = B + 1 * ldb; - const float * B2 = B + 2 * ldb; - const float * B3 = B + 3 * ldb; - float32x4_t c00 = vdupq_n_f32(0); - float32x4_t c01 = vdupq_n_f32(0); - float32x4_t c02 = vdupq_n_f32(0); - float32x4_t c03 = vdupq_n_f32(0); - float32x4_t c10 = vdupq_n_f32(0); - float32x4_t c11 = vdupq_n_f32(0); - float32x4_t c12 = vdupq_n_f32(0); - float32x4_t c13 = vdupq_n_f32(0); - float32x4_t a0, a1, b0; - for (size_t k = 0; k < K4; k += 4) - { - a0 = Load(A0 + k); - a1 = Load(A1 + k); - b0 = Load(B0 + k); - c00 = vmlaq_f32(c00, a0, b0); - c10 = vmlaq_f32(c10, a1, b0); - b0 = Load(B1 + k); - c01 = vmlaq_f32(c01, a0, b0); - c11 = vmlaq_f32(c11, a1, b0); - b0 = Load(B2 + k); - c02 = vmlaq_f32(c02, a0, b0); - c12 = vmlaq_f32(c12, a1, b0); - b0 = Load(B3 + k); - c03 = vmlaq_f32(c03, a0, b0); - c13 = vmlaq_f32(c13, a1, b0); - } - if (K4 < K) - { - size_t k = K - 4; - float32x4_t tail = Tail(K - K4); - a0 = And(tail, Load(A0 + k)); - a1 = And(tail, Load(A1 + k)); - b0 = Load(B0 + k); - c00 = vmlaq_f32(c00, a0, b0); - c10 = vmlaq_f32(c10, a1, b0); - b0 = Load(B1 + k); - c01 = vmlaq_f32(c01, a0, b0); - c11 = vmlaq_f32(c11, a1, b0); - b0 = Load(B2 + k); - c02 = vmlaq_f32(c02, a0, b0); - c12 = vmlaq_f32(c12, a1, b0); - b0 = Load(B3 + k); - c03 = vmlaq_f32(c03, a0, b0); - c13 = vmlaq_f32(c13, a1, b0); - } - float32x4_t _alpha = vdupq_n_f32(alpha); - Add4ExtractedSums(c00, c01, c02, c03, _alpha, C + 0 * ldc); - Add4ExtractedSums(c10, c11, c12, c13, _alpha, C + 1 * ldc); - } - - static void Kernel3x1x4nt(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc) - { - size_t K4 = K & (~3); - const float * A0 = A + 0 * lda; - const float * A1 = A + 1 * lda; - const float * A2 = A + 2 * lda; - const float * B0 = B + 0 * ldb; - float32x4_t c00 = vdupq_n_f32(0); - float32x4_t c10 = vdupq_n_f32(0); - float32x4_t c20 = vdupq_n_f32(0); - float32x4_t a0, a1, a2, b0; - for (size_t k = 0; k < K4; k += 4) - { - a0 = Load(A0 + k); - a1 = Load(A1 + k); - a2 = Load(A2 + k); - b0 = Load(B0 + k); - c00 = vmlaq_f32(c00, a0, b0); - c10 = vmlaq_f32(c10, a1, b0); - c20 = vmlaq_f32(c20, a2, b0); - } - if (K4 < K) - { - size_t k = K - 4; - float32x4_t tail = Tail(K - K4); - a0 = And(tail, Load(A0 + k)); - a1 = And(tail, Load(A1 + k)); - a2 = And(tail, Load(A2 + k)); - b0 = Load(B0 + k); - c00 = vmlaq_f32(c00, a0, b0); - c10 = vmlaq_f32(c10, a1, b0); - c20 = vmlaq_f32(c20, a2, b0); - } - C[0 * ldc] += alpha * ExtractSum32f(c00); - C[1 * 
ldc] += alpha * ExtractSum32f(c10); - C[2 * ldc] += alpha * ExtractSum32f(c20); - } - - static void Kernel3x4x4nt(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc) - { - size_t K4 = K & (~3); - const float * A0 = A + 0 * lda; - const float * A1 = A + 1 * lda; - const float * A2 = A + 2 * lda; - const float * B0 = B + 0 * ldb; - const float * B1 = B + 1 * ldb; - const float * B2 = B + 2 * ldb; - const float * B3 = B + 3 * ldb; - float32x4_t c00 = vdupq_n_f32(0); - float32x4_t c01 = vdupq_n_f32(0); - float32x4_t c02 = vdupq_n_f32(0); - float32x4_t c03 = vdupq_n_f32(0); - float32x4_t c10 = vdupq_n_f32(0); - float32x4_t c11 = vdupq_n_f32(0); - float32x4_t c12 = vdupq_n_f32(0); - float32x4_t c13 = vdupq_n_f32(0); - float32x4_t c20 = vdupq_n_f32(0); - float32x4_t c21 = vdupq_n_f32(0); - float32x4_t c22 = vdupq_n_f32(0); - float32x4_t c23 = vdupq_n_f32(0); - float32x4_t a0, a1, a2, b0; - for (size_t k = 0; k < K4; k += 4) - { - a0 = Load(A0 + k); - a1 = Load(A1 + k); - a2 = Load(A2 + k); - b0 = Load(B0 + k); - c00 = vmlaq_f32(c00, a0, b0); - c10 = vmlaq_f32(c10, a1, b0); - c20 = vmlaq_f32(c20, a2, b0); - b0 = Load(B1 + k); - c01 = vmlaq_f32(c01, a0, b0); - c11 = vmlaq_f32(c11, a1, b0); - c21 = vmlaq_f32(c21, a2, b0); - b0 = Load(B2 + k); - c02 = vmlaq_f32(c02, a0, b0); - c12 = vmlaq_f32(c12, a1, b0); - c22 = vmlaq_f32(c22, a2, b0); - b0 = Load(B3 + k); - c03 = vmlaq_f32(c03, a0, b0); - c13 = vmlaq_f32(c13, a1, b0); - c23 = vmlaq_f32(c23, a2, b0); - } - if (K4 < K) - { - size_t k = K - 4; - float32x4_t tail = Tail(K - K4); - a0 = And(tail, Load(A0 + k)); - a1 = And(tail, Load(A1 + k)); - a2 = And(tail, Load(A2 + k)); - b0 = Load(B0 + k); - c00 = vmlaq_f32(c00, a0, b0); - c10 = vmlaq_f32(c10, a1, b0); - c20 = vmlaq_f32(c20, a2, b0); - b0 = Load(B1 + k); - c01 = vmlaq_f32(c01, a0, b0); - c11 = vmlaq_f32(c11, a1, b0); - c21 = vmlaq_f32(c21, a2, b0); - b0 = Load(B2 + k); - c02 = vmlaq_f32(c02, a0, b0); - c12 = vmlaq_f32(c12, a1, b0); - c22 = vmlaq_f32(c22, a2, b0); - b0 = Load(B3 + k); - c03 = vmlaq_f32(c03, a0, b0); - c13 = vmlaq_f32(c13, a1, b0); - c23 = vmlaq_f32(c23, a2, b0); - } - float32x4_t _alpha = vdupq_n_f32(alpha); - Add4ExtractedSums(c00, c01, c02, c03, _alpha, C + 0 * ldc); - Add4ExtractedSums(c10, c11, c12, c13, _alpha, C + 1 * ldc); - Add4ExtractedSums(c20, c21, c22, c23, _alpha, C + 2 * ldc); - } - - static void Kernel6x1x4nt(size_t K, float alpha, const float* A, size_t lda, const float* B, size_t ldb, float* C, size_t ldc) - { - size_t K4 = K & (~3); - const float* A0 = A + 0 * lda; - const float* A1 = A + 1 * lda; - const float* A2 = A + 2 * lda; - const float* A3 = A + 3 * lda; - const float* A4 = A + 4 * lda; - const float* A5 = A + 5 * lda; - const float* B0 = B + 0 * ldb; - float32x4_t c00 = vdupq_n_f32(0); - float32x4_t c10 = vdupq_n_f32(0); - float32x4_t c20 = vdupq_n_f32(0); - float32x4_t c30 = vdupq_n_f32(0); - float32x4_t c40 = vdupq_n_f32(0); - float32x4_t c50 = vdupq_n_f32(0); - float32x4_t a0, a1, a2, a3, a4, a5, b0; - for (size_t k = 0; k < K4; k += 4) - { - a0 = Load(A0 + k); - a1 = Load(A1 + k); - a2 = Load(A2 + k); - a3 = Load(A3 + k); - a4 = Load(A4 + k); - a5 = Load(A5 + k); - b0 = Load(B0 + k); - c00 = vmlaq_f32(c00, a0, b0); - c10 = vmlaq_f32(c10, a1, b0); - c20 = vmlaq_f32(c20, a2, b0); - c30 = vmlaq_f32(c30, a3, b0); - c40 = vmlaq_f32(c40, a4, b0); - c50 = vmlaq_f32(c50, a5, b0); - } - if (K4 < K) - { - size_t k = K - 4; - float32x4_t tail = Tail(K - K4); - a0 = And(tail, Load(A0 + k)); - a1 = And(tail, 
Load(A1 + k)); - a2 = And(tail, Load(A2 + k)); - a3 = And(tail, Load(A3 + k)); - a4 = And(tail, Load(A4 + k)); - a5 = And(tail, Load(A5 + k)); - b0 = Load(B0 + k); - c00 = vmlaq_f32(c00, a0, b0); - c10 = vmlaq_f32(c10, a1, b0); - c20 = vmlaq_f32(c20, a2, b0); - c30 = vmlaq_f32(c30, a3, b0); - c40 = vmlaq_f32(c40, a4, b0); - c50 = vmlaq_f32(c50, a5, b0); - } - C[0 * ldc] += alpha * ExtractSum32f(c00); - C[1 * ldc] += alpha * ExtractSum32f(c10); - C[2 * ldc] += alpha * ExtractSum32f(c20); - C[3 * ldc] += alpha * ExtractSum32f(c30); - C[4 * ldc] += alpha * ExtractSum32f(c40); - C[5 * ldc] += alpha * ExtractSum32f(c50); - } - - static void Kernel6x4x4nt(size_t K, float alpha, const float* A, size_t lda, const float* B, size_t ldb, float* C, size_t ldc) - { - size_t K4 = K & (~3); - const float* A0 = A + 0 * lda; - const float* A1 = A + 1 * lda; - const float* A2 = A + 2 * lda; - const float* A3 = A + 3 * lda; - const float* A4 = A + 4 * lda; - const float* A5 = A + 5 * lda; - const float* B0 = B + 0 * ldb; - const float* B1 = B + 1 * ldb; - const float* B2 = B + 2 * ldb; - const float* B3 = B + 3 * ldb; - float32x4_t c00 = vdupq_n_f32(0); - float32x4_t c01 = vdupq_n_f32(0); - float32x4_t c02 = vdupq_n_f32(0); - float32x4_t c03 = vdupq_n_f32(0); - float32x4_t c10 = vdupq_n_f32(0); - float32x4_t c11 = vdupq_n_f32(0); - float32x4_t c12 = vdupq_n_f32(0); - float32x4_t c13 = vdupq_n_f32(0); - float32x4_t c20 = vdupq_n_f32(0); - float32x4_t c21 = vdupq_n_f32(0); - float32x4_t c22 = vdupq_n_f32(0); - float32x4_t c23 = vdupq_n_f32(0); - float32x4_t c30 = vdupq_n_f32(0); - float32x4_t c31 = vdupq_n_f32(0); - float32x4_t c32 = vdupq_n_f32(0); - float32x4_t c33 = vdupq_n_f32(0); - float32x4_t c40 = vdupq_n_f32(0); - float32x4_t c41 = vdupq_n_f32(0); - float32x4_t c42 = vdupq_n_f32(0); - float32x4_t c43 = vdupq_n_f32(0); - float32x4_t c50 = vdupq_n_f32(0); - float32x4_t c51 = vdupq_n_f32(0); - float32x4_t c52 = vdupq_n_f32(0); - float32x4_t c53 = vdupq_n_f32(0); - float32x4_t a0, a1, a2, a3, a4, a5, b0; - for (size_t k = 0; k < K4; k += 4) - { - a0 = Load(A0 + k); - a1 = Load(A1 + k); - a2 = Load(A2 + k); - a3 = Load(A3 + k); - a4 = Load(A4 + k); - a5 = Load(A5 + k); - b0 = Load(B0 + k); - c00 = vmlaq_f32(c00, a0, b0); - c10 = vmlaq_f32(c10, a1, b0); - c20 = vmlaq_f32(c20, a2, b0); - c30 = vmlaq_f32(c30, a3, b0); - c40 = vmlaq_f32(c40, a4, b0); - c50 = vmlaq_f32(c50, a5, b0); - b0 = Load(B1 + k); - c01 = vmlaq_f32(c01, a0, b0); - c11 = vmlaq_f32(c11, a1, b0); - c21 = vmlaq_f32(c21, a2, b0); - c31 = vmlaq_f32(c31, a3, b0); - c41 = vmlaq_f32(c41, a4, b0); - c51 = vmlaq_f32(c51, a5, b0); - b0 = Load(B2 + k); - c02 = vmlaq_f32(c02, a0, b0); - c12 = vmlaq_f32(c12, a1, b0); - c22 = vmlaq_f32(c22, a2, b0); - c32 = vmlaq_f32(c32, a3, b0); - c42 = vmlaq_f32(c42, a4, b0); - c52 = vmlaq_f32(c52, a5, b0); - b0 = Load(B3 + k); - c03 = vmlaq_f32(c03, a0, b0); - c13 = vmlaq_f32(c13, a1, b0); - c23 = vmlaq_f32(c23, a2, b0); - c33 = vmlaq_f32(c33, a3, b0); - c43 = vmlaq_f32(c43, a4, b0); - c53 = vmlaq_f32(c53, a5, b0); - } - if (K4 < K) - { - size_t k = K - 4; - float32x4_t tail = Tail(K - K4); - a0 = And(tail, Load(A0 + k)); - a1 = And(tail, Load(A1 + k)); - a2 = And(tail, Load(A2 + k)); - a3 = And(tail, Load(A3 + k)); - a4 = And(tail, Load(A4 + k)); - a5 = And(tail, Load(A5 + k)); - b0 = Load(B0 + k); - c00 = vmlaq_f32(c00, a0, b0); - c10 = vmlaq_f32(c10, a1, b0); - c20 = vmlaq_f32(c20, a2, b0); - c30 = vmlaq_f32(c30, a3, b0); - c40 = vmlaq_f32(c40, a4, b0); - c50 = vmlaq_f32(c50, a5, b0); - b0 = Load(B1 + k); - c01 = 
vmlaq_f32(c01, a0, b0); - c11 = vmlaq_f32(c11, a1, b0); - c21 = vmlaq_f32(c21, a2, b0); - c31 = vmlaq_f32(c31, a3, b0); - c41 = vmlaq_f32(c41, a4, b0); - c51 = vmlaq_f32(c51, a5, b0); - b0 = Load(B2 + k); - c02 = vmlaq_f32(c02, a0, b0); - c12 = vmlaq_f32(c12, a1, b0); - c22 = vmlaq_f32(c22, a2, b0); - c32 = vmlaq_f32(c32, a3, b0); - c42 = vmlaq_f32(c42, a4, b0); - c52 = vmlaq_f32(c52, a5, b0); - b0 = Load(B3 + k); - c03 = vmlaq_f32(c03, a0, b0); - c13 = vmlaq_f32(c13, a1, b0); - c23 = vmlaq_f32(c23, a2, b0); - c33 = vmlaq_f32(c33, a3, b0); - c43 = vmlaq_f32(c43, a4, b0); - c53 = vmlaq_f32(c53, a5, b0); - } - float32x4_t _alpha = vdupq_n_f32(alpha); - Add4ExtractedSums(c00, c01, c02, c03, _alpha, C + 0 * ldc); - Add4ExtractedSums(c10, c11, c12, c13, _alpha, C + 1 * ldc); - Add4ExtractedSums(c20, c21, c22, c23, _alpha, C + 2 * ldc); - Add4ExtractedSums(c30, c31, c32, c33, _alpha, C + 3 * ldc); - Add4ExtractedSums(c40, c41, c42, c43, _alpha, C + 4 * ldc); - Add4ExtractedSums(c50, c51, c52, c53, _alpha, C + 5 * ldc); - } - - void Gemm32fNT(size_t M, size_t N, size_t K, const float * alpha, const float * A, size_t lda, const float * B, size_t ldb, const float * beta, float * C, size_t ldc) - { - typedef Simd::GemmNT GemmNT; -#if defined(SIMD_ARM64_ENABLE) - GemmNT gemmNT(M, N, K, Base::AlgCacheL1(), Base::AlgCacheL2(), Base::AlgCacheL3(), F, GemmScaleC, - Kernel1x1x4nt, Kernel1x4x4nt, Kernel2x1x4nt, Kernel2x4x4nt, Kernel3x1x4nt, Kernel3x4x4nt, Kernel6x1x4nt, Kernel6x4x4nt); -#else - GemmNT gemmNT(M, N, K, Base::AlgCacheL1(), Base::AlgCacheL2(), Base::AlgCacheL3(), F, GemmScaleC, - Kernel1x1x4nt, Kernel1x4x4nt, Kernel2x1x4nt, Kernel2x4x4nt, Kernel3x1x4nt, Kernel3x4x4nt, NULL, NULL); -#endif - gemmNT.Run(alpha, A, lda, B, ldb, beta, C, ldc); - } - } -#endif// SIMD_NEON_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdNeonGrayToBgr.cpp b/src/3rd/Simd/Simd/SimdNeonGrayToBgr.cpp deleted file mode 100644 index 3154e04c..00000000 --- a/src/3rd/Simd/Simd/SimdNeonGrayToBgr.cpp +++ /dev/null @@ -1,71 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdStore.h" -#include "Simd/SimdMemory.h" - -namespace Simd -{ -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - const size_t A3 = A * 3; - - template SIMD_INLINE void GrayToBgr(const uint8_t * gray, uint8_t * bgr) - { - uint8x16x3_t _bgr; - _bgr.val[0] = Load(gray); - _bgr.val[1] = _bgr.val[0]; - _bgr.val[2] = _bgr.val[0]; - Store3(bgr, _bgr); - } - - template void GrayToBgr(const uint8_t * gray, size_t width, size_t height, size_t grayStride, uint8_t * bgr, size_t bgrStride) - { - assert(width >= A); - if (align) - assert(Aligned(bgr) && Aligned(bgrStride) && Aligned(gray) && Aligned(grayStride)); - - size_t alignedWidth = AlignLo(width, A); - - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0, colBgr = 0; col < alignedWidth; col += A, colBgr += A3) - GrayToBgr(gray + col, bgr + colBgr); - if (width != alignedWidth) - GrayToBgr(gray + width - A, bgr + 3 * (width - A)); - gray += grayStride; - bgr += bgrStride; - } - } - - void GrayToBgr(const uint8_t * gray, size_t width, size_t height, size_t grayStride, uint8_t * bgr, size_t bgrStride) - { - if (Aligned(bgr) && Aligned(gray) && Aligned(bgrStride) && Aligned(grayStride)) - GrayToBgr(gray, width, height, grayStride, bgr, bgrStride); - else - GrayToBgr(gray, width, height, grayStride, bgr, bgrStride); - } - } -#endif// SIMD_NEON_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdNeonGrayToBgra.cpp b/src/3rd/Simd/Simd/SimdNeonGrayToBgra.cpp deleted file mode 100644 index d4bcae91..00000000 --- a/src/3rd/Simd/Simd/SimdNeonGrayToBgra.cpp +++ /dev/null @@ -1,73 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdStore.h" -#include "Simd/SimdMemory.h" - -namespace Simd -{ -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - const size_t A4 = A * 4; - - template SIMD_INLINE void GrayToBgra(const uint8_t * gray, uint8_t * bgra, uint8x16x4_t & _bgra) - { - _bgra.val[0] = Load(gray); - _bgra.val[1] = _bgra.val[0]; - _bgra.val[2] = _bgra.val[0]; - Store4(bgra, _bgra); - } - - template void GrayToBgra(const uint8_t * gray, size_t width, size_t height, size_t grayStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha) - { - assert(width >= A); - if (align) - assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(gray) && Aligned(grayStride)); - - size_t alignedWidth = AlignLo(width, A); - - uint8x16x4_t _bgra; - _bgra.val[3] = vdupq_n_u8(alpha); - - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0, colBgra = 0; col < alignedWidth; col += A, colBgra += A4) - GrayToBgra(gray + col, bgra + colBgra, _bgra); - if (width != alignedWidth) - GrayToBgra(gray + width - A, bgra + 4 * (width - A), _bgra); - gray += grayStride; - bgra += bgraStride; - } - } - - void GrayToBgra(const uint8_t * gray, size_t width, size_t height, size_t grayStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha) - { - if (Aligned(bgra) && Aligned(gray) && Aligned(bgraStride) && Aligned(grayStride)) - GrayToBgra(gray, width, height, grayStride, bgra, bgraStride, alpha); - else - GrayToBgra(gray, width, height, grayStride, bgra, bgraStride, alpha); - } - } -#endif// SIMD_NEON_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdNeonHistogram.cpp b/src/3rd/Simd/Simd/SimdNeonHistogram.cpp deleted file mode 100644 index 8d347011..00000000 --- a/src/3rd/Simd/Simd/SimdNeonHistogram.cpp +++ /dev/null @@ -1,276 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdCompare.h" - -namespace Simd -{ -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - namespace - { - template struct Buffer - { - Buffer(size_t rowSize, size_t histogramSize) - { - _p = Allocate(sizeof(T)*rowSize + 4 * sizeof(uint32_t)*histogramSize); - v = (T*)_p; - h[0] = (uint32_t *)(v + rowSize); - h[1] = h[0] + histogramSize; - h[2] = h[1] + histogramSize; - h[3] = h[2] + histogramSize; - memset(h[0], 0, 4 * sizeof(uint32_t)*histogramSize); - } - - ~Buffer() - { - Free(_p); - } - - T * v; - uint32_t * h[4]; - private: - void *_p; - }; - } - - template - SIMD_INLINE uint8x16_t AbsSecondDerivative(const uint8_t * src, ptrdiff_t step) - { - const uint8x16_t s0 = Load(src - step); - const uint8x16_t s1 = Load(src); - const uint8x16_t s2 = Load(src + step); - return vabdq_u8(vrhaddq_u8(s0, s2), s1); - } - - template - SIMD_INLINE void AbsSecondDerivative(const uint8_t * src, ptrdiff_t colStep, ptrdiff_t rowStep, uint8_t * dst) - { - const uint8x16_t sdX = AbsSecondDerivative(src, colStep); - const uint8x16_t sdY = AbsSecondDerivative(src, rowStep); - Store(dst, vmaxq_u8(sdY, sdX)); - } - - SIMD_INLINE void SumHistograms(uint32_t * src, size_t start, uint32_t * dst) - { - uint32_t * src0 = src + start; - uint32_t * src1 = src0 + start + HISTOGRAM_SIZE; - uint32_t * src2 = src1 + start + HISTOGRAM_SIZE; - uint32_t * src3 = src2 + start + HISTOGRAM_SIZE; - for (size_t i = 0; i < HISTOGRAM_SIZE; i += 4) - Store(dst + i, vaddq_u32( - vaddq_u32(Load(src0 + i), Load(src1 + i)), - vaddq_u32(Load(src2 + i), Load(src3 + i)))); - } - - template void AbsSecondDerivativeHistogram(const uint8_t *src, size_t width, size_t height, size_t stride, - size_t step, size_t indent, uint32_t * histogram) - { - Buffer buffer(AlignHi(width, A), HISTOGRAM_SIZE); - buffer.v += indent; - src += indent*(stride + 1); - height -= 2 * indent; - width -= 2 * indent; - - ptrdiff_t bodyStart = (uint8_t*)AlignHi(buffer.v, A) - buffer.v; - ptrdiff_t bodyEnd = bodyStart + AlignLo(width - bodyStart, A); - size_t rowStep = step*stride; - size_t alignedWidth = Simd::AlignLo(width, 4); - for (size_t row = 0; row < height; ++row) - { - if (bodyStart) - AbsSecondDerivative(src, step, rowStep, buffer.v); - for (ptrdiff_t col = bodyStart; col < bodyEnd; col += A) - AbsSecondDerivative(src + col, step, rowStep, buffer.v + col); - if (width != (size_t)bodyEnd) - AbsSecondDerivative(src + width - A, step, rowStep, buffer.v + width - A); - - size_t col = 0; - for (; col < alignedWidth; col += 4) - { - ++buffer.h[0][buffer.v[col + 0]]; - ++buffer.h[1][buffer.v[col + 1]]; - ++buffer.h[2][buffer.v[col + 2]]; - ++buffer.h[3][buffer.v[col + 3]]; - } - for (; col < width; ++col) - ++buffer.h[0][buffer.v[col + 0]]; - src += stride; - } - - SumHistograms(buffer.h[0], 0, histogram); - } - - void AbsSecondDerivativeHistogram(const uint8_t *src, size_t width, size_t height, size_t stride, - size_t step, size_t indent, uint32_t * histogram) - { - assert(width > 2 * indent && height > 2 * indent && indent >= step && width >= A + 2 * indent); - - if (Aligned(src) && Aligned(stride)) - AbsSecondDerivativeHistogram(src, width, height, stride, step, indent, histogram); - else - AbsSecondDerivativeHistogram(src, width, height, stride, step, indent, histogram); - } - - template - SIMD_INLINE void MaskSrc(const uint8_t * src, const uint8_t * mask, const uint8x16_t & index, ptrdiff_t offset, uint16_t * dst) - { - const uint8x16_t _src = Load(src + offset); - const uint8x16_t _mask = 
vandq_u8(vceqq_u8(Load(mask + offset), index), K8_01); - Store(dst + offset + 0, vmulq_u16(vaddw_u8(K16_0004, Half<0>(_src)), UnpackU8<0>(_mask))); - Store(dst + offset + HA, vmulq_u16(vaddw_u8(K16_0004, Half<1>(_src)), UnpackU8<1>(_mask))); - } - - template void HistogramMasked(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * mask, size_t maskStride, uint8_t index, uint32_t * histogram) - { - Buffer buffer(AlignHi(width, A), HISTOGRAM_SIZE + 4); - size_t widthAligned4 = Simd::AlignLo(width, 4); - size_t widthAlignedA = Simd::AlignLo(width, A); - size_t widthAlignedDA = Simd::AlignLo(width, DA); - uint8x16_t _index = vdupq_n_u8(index); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < widthAlignedDA; col += DA) - { - MaskSrc(src, mask, _index, col, buffer.v); - MaskSrc(src, mask, _index, col + A, buffer.v); - } - for (; col < widthAlignedA; col += A) - MaskSrc(src, mask, _index, col, buffer.v); - if (width != widthAlignedA) - MaskSrc(src, mask, _index, width - A, buffer.v); - - for (col = 0; col < widthAligned4; col += 4) - { - ++buffer.h[0][buffer.v[col + 0]]; - ++buffer.h[1][buffer.v[col + 1]]; - ++buffer.h[2][buffer.v[col + 2]]; - ++buffer.h[3][buffer.v[col + 3]]; - } - for (; col < width; ++col) - ++buffer.h[0][buffer.v[col]]; - - src += srcStride; - mask += maskStride; - } - - SumHistograms(buffer.h[0], 4, histogram); - } - - void HistogramMasked(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * mask, size_t maskStride, uint8_t index, uint32_t * histogram) - { - assert(width >= A); - - if (Aligned(src) && Aligned(srcStride) && Aligned(mask) && Aligned(maskStride)) - HistogramMasked(src, srcStride, width, height, mask, maskStride, index, histogram); - else - HistogramMasked(src, srcStride, width, height, mask, maskStride, index, histogram); - } - - template - SIMD_INLINE void ConditionalSrc(const uint8_t * src, const uint8_t * mask, const uint8x16_t & value, ptrdiff_t offset, uint16_t * dst) - { - const uint8x16_t _src = Load(src + offset); - const uint8x16_t _mask = vandq_u8(Compare8u(Load(mask + offset), value), K8_01); - Store(dst + offset + 0, vmulq_u16(vaddw_u8(K16_0004, Half<0>(_src)), UnpackU8<0>(_mask))); - Store(dst + offset + HA, vmulq_u16(vaddw_u8(K16_0004, Half<1>(_src)), UnpackU8<1>(_mask))); - } - - template void HistogramConditional(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * mask, size_t maskStride, uint8_t value, uint32_t * histogram) - { - Buffer buffer(AlignHi(width, A), HISTOGRAM_SIZE + 4); - size_t widthAligned4 = Simd::AlignLo(width, 4); - size_t widthAlignedA = Simd::AlignLo(width, A); - size_t widthAlignedDA = Simd::AlignLo(width, DA); - uint8x16_t _value = vdupq_n_u8(value); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < widthAlignedDA; col += DA) - { - ConditionalSrc(src, mask, _value, col, buffer.v); - ConditionalSrc(src, mask, _value, col + A, buffer.v); - } - for (; col < widthAlignedA; col += A) - ConditionalSrc(src, mask, _value, col, buffer.v); - if (width != widthAlignedA) - ConditionalSrc(src, mask, _value, width - A, buffer.v); - - for (col = 0; col < widthAligned4; col += 4) - { - ++buffer.h[0][buffer.v[col + 0]]; - ++buffer.h[1][buffer.v[col + 1]]; - ++buffer.h[2][buffer.v[col + 2]]; - ++buffer.h[3][buffer.v[col + 3]]; - } - for (; col < width; ++col) - ++buffer.h[0][buffer.v[col]]; - - src += srcStride; - mask += maskStride; - } - - SumHistograms(buffer.h[0], 4, 
histogram); - } - - template - void HistogramConditional(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * mask, size_t maskStride, uint8_t value, uint32_t * histogram) - { - if (Aligned(src) && Aligned(srcStride) && Aligned(mask) && Aligned(maskStride)) - return HistogramConditional(src, srcStride, width, height, mask, maskStride, value, histogram); - else - return HistogramConditional(src, srcStride, width, height, mask, maskStride, value, histogram); - } - - void HistogramConditional(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * mask, size_t maskStride, uint8_t value, SimdCompareType compareType, uint32_t * histogram) - { - switch (compareType) - { - case SimdCompareEqual: - return HistogramConditional(src, srcStride, width, height, mask, maskStride, value, histogram); - case SimdCompareNotEqual: - return HistogramConditional(src, srcStride, width, height, mask, maskStride, value, histogram); - case SimdCompareGreater: - return HistogramConditional(src, srcStride, width, height, mask, maskStride, value, histogram); - case SimdCompareGreaterOrEqual: - return HistogramConditional(src, srcStride, width, height, mask, maskStride, value, histogram); - case SimdCompareLesser: - return HistogramConditional(src, srcStride, width, height, mask, maskStride, value, histogram); - case SimdCompareLesserOrEqual: - return HistogramConditional(src, srcStride, width, height, mask, maskStride, value, histogram); - default: - assert(0); - } - } - } -#endif// SIMD_NEON_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdNeonHog.cpp b/src/3rd/Simd/Simd/SimdNeonHog.cpp deleted file mode 100644 index 09a3c115..00000000 --- a/src/3rd/Simd/Simd/SimdNeonHog.cpp +++ /dev/null @@ -1,577 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdArray.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdBase.h" -#include "Simd/SimdSet.h" -#include "Simd/SimdExtract.h" - -namespace Simd -{ -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - SIMD_INLINE void HogDeinterleave(const float * src, size_t count, float ** dst, size_t offset, size_t i) - { - src += i; - float32x4x2_t a01 = vzipq_f32(Load(src + 0 * count), Load(src + 2 * count)); - float32x4x2_t a23 = vzipq_f32(Load(src + 1 * count), Load(src + 3 * count)); - float32x4x2_t b01 = vzipq_f32(a01.val[0], a23.val[0]); - float32x4x2_t b23 = vzipq_f32(a01.val[1], a23.val[1]); - Store(dst[i + 0] + offset, b01.val[0]); - Store(dst[i + 1] + offset, b01.val[1]); - Store(dst[i + 2] + offset, b23.val[0]); - Store(dst[i + 3] + offset, b23.val[1]); - } - - void HogDeinterleave(const float * src, size_t srcStride, size_t width, size_t height, size_t count, float ** dst, size_t dstStride) - { - assert(width >= F && count >= F); - - size_t alignedCount = AlignLo(count, F); - size_t alignedWidth = AlignLo(width, F); - - for (size_t row = 0; row < height; ++row) - { - size_t rowOffset = row*dstStride; - for (size_t col = 0; col < alignedWidth; col += F) - { - const float * s = src + count*col; - size_t offset = rowOffset + col; - for (size_t i = 0; i < alignedCount; i += F) - HogDeinterleave(s, count, dst, offset, i); - if (alignedCount != count) - HogDeinterleave(s, count, dst, offset, count - F); - } - if (alignedWidth != width) - { - size_t col = width - F; - const float * s = src + count*col; - size_t offset = rowOffset + col; - for (size_t i = 0; i < alignedCount; i += F) - HogDeinterleave(s, count, dst, offset, i); - if (alignedCount != count) - HogDeinterleave(s, count, dst, offset, count - F); - } - src += srcStride; - } - } - - namespace - { - struct Buffer - { - const int size; - float32x4_t * cos, *sin; - int32x4_t * pos, *neg; - int * index; - float * value; - - Buffer(size_t width, size_t quantization) - : size((int)quantization / 2) - { - width = AlignHi(width, A / sizeof(float)); - _p = Allocate(width*(sizeof(int) + sizeof(float)) + (sizeof(int32x4_t) + sizeof(float32x4_t)) * 2 * size); - index = (int*)_p - 1; - value = (float*)index + width; - cos = (float32x4_t*)(value + width + 1); - sin = cos + size; - pos = (int32x4_t*)(sin + size); - neg = pos + size; - for (int i = 0; i < size; ++i) - { - cos[i] = vdupq_n_f32((float)::cos(i*M_PI / size)); - sin[i] = vdupq_n_f32((float)::sin(i*M_PI / size)); - pos[i] = vdupq_n_s32(i); - neg[i] = vdupq_n_s32(size + i); - } - } - - ~Buffer() - { - Free(_p); - } - - private: - void *_p; - }; - } - - template SIMD_INLINE void HogDirectionHistograms(const float32x4_t & dx, const float32x4_t & dy, Buffer & buffer, size_t col) - { - float32x4_t bestDot = vdupq_n_f32(0); - int32x4_t bestIndex = vdupq_n_s32(0); - for (int i = 0; i < buffer.size; ++i) - { - float32x4_t dot = vaddq_f32(vmulq_f32(dx, buffer.cos[i]), vmulq_f32(dy, buffer.sin[i])); - uint32x4_t mask = vcgtq_f32(dot, bestDot); - bestDot = vmaxq_f32(dot, bestDot); - bestIndex = vbslq_s32(mask, buffer.pos[i], bestIndex); - - dot = vnegq_f32(dot); - mask = vcgtq_f32(dot, bestDot); - bestDot = vmaxq_f32(dot, bestDot); - bestIndex = vbslq_s32(mask, buffer.neg[i], bestIndex); - } - Store(buffer.index + col, bestIndex); - Store(buffer.value + col, Sqrt(vaddq_f32(vmulq_f32(dx, dx), vmulq_f32(dy, dy)))); - } - - template SIMD_INLINE void HogDirectionHistograms(const int16x8_t & dx, const int16x8_t & dy, Buffer & buffer, size_t col) - { - HogDirectionHistograms(ToFloat<0>(dx), 
ToFloat<0>(dy), buffer, col + 0); - HogDirectionHistograms(ToFloat<1>(dx), ToFloat<1>(dy), buffer, col + 4); - } - - template SIMD_INLINE void HogDirectionHistograms(const uint8_t * src, size_t stride, Buffer & buffer, size_t col) - { - const uint8_t * s = src + col; - uint8x16_t t = Load(s - stride); - uint8x16_t l = Load(s - 1); - uint8x16_t r = Load(s + 1); - uint8x16_t b = Load(s + stride); - HogDirectionHistograms(Sub<0>(r, l), Sub<0>(b, t), buffer, col + 0); - HogDirectionHistograms(Sub<1>(r, l), Sub<1>(b, t), buffer, col + 8); - } - - void HogDirectionHistograms(const uint8_t * src, size_t stride, size_t width, size_t height, - size_t cellX, size_t cellY, size_t quantization, float * histograms) - { - assert(width%cellX == 0 && height%cellY == 0 && quantization % 2 == 0); - - Buffer buffer(width, quantization); - - memset(histograms, 0, quantization*(width / cellX)*(height / cellY) * sizeof(float)); - - size_t alignedWidth = AlignLo(width - 2, A) + 1; - - for (size_t row = 1; row < height - 1; ++row) - { - const uint8_t * s = src + stride*row; - for (size_t col = 1; col < alignedWidth; col += A) - HogDirectionHistograms(s, stride, buffer, col); - HogDirectionHistograms(s, stride, buffer, width - 1 - A); - Base::AddRowToHistograms(buffer.index, buffer.value, row, width, height, cellX, cellY, quantization, histograms); - } - } - - class HogFeatureExtractor - { - static const size_t C = 8; - static const size_t Q = 9; - static const size_t Q2 = 18; - - typedef Array Array32i; - typedef Array Array32f; - - size_t _sx, _sy, _hs; - - int32x4_t _pos[5]; - float32x4_t _cos[5], _sin[5]; - float32x4_t _kx[8], _ky[8]; - int32x4_t _Q, _Q2; - - Array32i _index; - Array32f _value; - Array32f _buffer; - Array32f _histogram; - Array32f _norm; - - void Init(size_t w, size_t h) - { - _sx = w / C; - _hs = _sx + 2; - _sy = h / C; - for (int i = 0; i < 5; ++i) - { - _cos[i] = vdupq_n_f32((float)::cos(i*M_PI / Q)); - _sin[i] = vdupq_n_f32((float)::sin(i*M_PI / Q)); - _pos[i] = vdupq_n_s32(i); - } - for (int i = 0; i < C; ++i) - { - float k0 = float((15 - i * 2) / 16.0f); - float k1 = 1.0f - k0; - _kx[i] = SetF32(k0, k1, k0, k1); - _ky[i] = SetF32(k0, k0, k1, k1); - } - _Q = vdupq_n_s32(Q); - _Q2 = vdupq_n_s32(Q2); - - _index.Resize(w); - _value.Resize(w); - _buffer.Resize((_sx + 1) * 4 * Q2); - _histogram.Resize((_sx + 2)*(_sy + 2)*Q2); - _norm.Resize((_sx + 2)*(_sy + 2)); - } - - template SIMD_INLINE void GetHistogram(const float32x4_t & dx, const float32x4_t & dy, size_t col) - { - float32x4_t _0 = vdupq_n_f32(0); - float32x4_t bestDot = _0; - int32x4_t bestIndex = vdupq_n_s32(0); - float32x4_t adx = vabsq_f32(dx); - float32x4_t ady = vabsq_f32(dy); - for (int i = 0; i < 5; ++i) - { - float32x4_t dot = vmlaq_f32(vmulq_f32(adx, _cos[i]), ady, _sin[i]); - uint32x4_t mask = vcgtq_f32(dot, bestDot); - bestDot = vmaxq_f32(dot, bestDot); - bestIndex = vbslq_s32(mask, _pos[i], bestIndex); - } - uint32x4_t maskDx = vcltq_f32(dx, _0); - bestIndex = vbslq_s32(maskDx, vsubq_s32(_Q, bestIndex), bestIndex); - - uint32x4_t maskDy = vcltq_f32(dy, _0); - uint32x4_t corr = vandq_u32(vceqq_f32(adx, _0), K32_00000001); - bestIndex = vbslq_s32(maskDy, vsubq_s32(_Q2, vaddq_s32(bestIndex, (int32x4_t)corr)), bestIndex); - - bestIndex = vbslq_s32(vceqq_s32(bestIndex, _Q2), (int32x4_t)K32_00000000, bestIndex); - - Store(_index.data + col, bestIndex); // fixed program crash. 
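For reference, the bin selection just stored above is a standard signed-orientation quantization: each gradient (dx, dy) is matched against the 5 tabulated half-quadrant directions by maximizing dx*cos(a) + dy*sin(a) on absolute values, and the signs of dx and dy then fold the winner into one of Q2 = 18 bins. A minimal scalar sketch of the same rule (QuantizeDirection18 is a hypothetical name, not part of the deleted file):

    #include <cmath>

    // Scalar model of the bin selection in HogFeatureExtractor::GetHistogram
    // (Q = 9 half-quadrant bins, Q2 = 18 signed bins). Hypothetical reference.
    static int QuantizeDirection18(float dx, float dy)
    {
        const int Q = 9, Q2 = 18;
        const float PI = 3.14159265f;
        float adx = std::fabs(dx), ady = std::fabs(dy);
        float bestDot = 0.0f;
        int bestIndex = 0;
        for (int i = 0; i < 5; ++i) // candidate angles 0, 20, 40, 60, 80 degrees
        {
            float dot = adx * std::cos(i * PI / Q) + ady * std::sin(i * PI / Q);
            if (dot > bestDot)
            {
                bestDot = dot;
                bestIndex = i;
            }
        }
        if (dx < 0.0f) // mirror into the left half-plane
            bestIndex = Q - bestIndex;
        if (dy < 0.0f) // mirror into the lower half-plane; adx == 0 needs a +1 correction
            bestIndex = Q2 - (bestIndex + (adx == 0.0f ? 1 : 0));
        return bestIndex == Q2 ? 0 : bestIndex; // wrap the full turn back to bin 0
    }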
- Store(_value.data + col, Sqrt(vmlaq_f32(vmulq_f32(adx, adx), ady, ady))); - } - - template SIMD_INLINE void GetHistogram(const int16x8_t & dx, const int16x8_t & dy, size_t col) - { - GetHistogram(ToFloat<0>(dx), ToFloat<0>(dy), col + 0); - GetHistogram(ToFloat<1>(dx), ToFloat<1>(dy), col + 4); - } - - template SIMD_INLINE void GetHistogram(const uint8_t * src, size_t stride, size_t col) - { - const uint8_t * s = src + col; - uint8x16_t t = Load(s - stride); - uint8x16_t l = Load(s - 1); - uint8x16_t r = Load(s + 1); - uint8x16_t b = Load(s + stride); - GetHistogram(Sub<0>(r, l), Sub<0>(b, t), col + 0); - GetHistogram(Sub<1>(r, l), Sub<1>(b, t), col + 8); - } - - void AddRowToBuffer(const uint8_t * src, size_t stride, size_t row, size_t width, size_t aligned) - { - const uint8_t * s = src + stride*row; - GetHistogram(s, stride, 1); - for (size_t col = A; col < aligned; col += A) - GetHistogram(s, stride, col); - GetHistogram(s, stride, width - 1 - A); - - float32x4_t * buffer = (float32x4_t*)_buffer.data; - float32x4_t ky = _ky[(row + 4) & 7]; - for (size_t col = 1, n = C, i = 5; col < width - 1; i = 0, n = Simd::Min(C, width - col - 1)) - { - for (; i < n; ++i, ++col) - { - int index = _index[col]; - float32x4_t value = vdupq_n_f32(_value[col]); - buffer[index] = vmlaq_f32(buffer[index], value, vmulq_f32(ky, _kx[i])); - } - buffer += Q2; - } - } - - void AddToHistogram(size_t row, size_t width, size_t height) - { - typedef float f18_t[18]; - const float * src = _buffer.data; - f18_t * h0 = (f18_t*)_histogram.data + row*_hs; - f18_t * h1 = h0 + _hs; - for (size_t cell = 0; cell <= width; ++cell) - { - for (size_t i = 0; i < 16; i += 4) - { - float32x4x4_t s = Load4(src + 4 * i); - Store(h0[0] + i, vaddq_f32(Load(h0[0] + i), s.val[0])); - Store(h0[1] + i, vaddq_f32(Load(h0[1] + i), s.val[1])); - Store(h1[0] + i, vaddq_f32(Load(h1[0] + i), s.val[2])); - Store(h1[1] + i, vaddq_f32(Load(h1[1] + i), s.val[3])); - } - float32x2x4_t s = LoadHalf4(src + 64); - Store(h0[0] + 16, vadd_f32(LoadHalf(h0[0] + 16), s.val[0])); - Store(h0[1] + 16, vadd_f32(LoadHalf(h0[1] + 16), s.val[1])); - Store(h1[0] + 16, vadd_f32(LoadHalf(h1[0] + 16), s.val[2])); - Store(h1[1] + 16, vadd_f32(LoadHalf(h1[1] + 16), s.val[3])); - h0++; - h1++; - src += 4 * Q2; - } - _buffer.Clear(); - } - - void EstimateHistogram(const uint8_t * src, size_t stride, size_t width, size_t height) - { - _histogram.Clear(); - - size_t aligned = AlignHi(width - 1, A) - A; - - _buffer.Clear(); - for (size_t row = 1; row < 4; ++row) - AddRowToBuffer(src, stride, row, width, aligned); - AddToHistogram(0, _sx, _sy); - for (size_t row = 4, cell = 1; row < height - 4; ++row) - { - AddRowToBuffer(src, stride, row, width, aligned); - if ((row & 7) == 3) - AddToHistogram(cell++, _sx, _sy); - } - for (size_t row = height - 4; row < height - 1; ++row) - AddRowToBuffer(src, stride, row, width, aligned); - AddToHistogram(_sy, _sx, _sy); - } - - SIMD_INLINE float GetNorm(const float * src) - { - float32x4_t norm = vdupq_n_f32(0); - for (size_t i = 0; i < 8; i += 4) - { - float32x4_t sum = vaddq_f32(Load(src + i + 0), Load(src + i + Q)); - norm = vmlaq_f32(norm, sum, sum); - } - return ExtractSum32f(norm) + Simd::Square(src[Q - 1] + src[Q2 - 1]); - } - - void EstimateNorm() - { - _norm.Clear(); - for (size_t y = 0, i = 0; y < _sy; y++) - { - const float * h = _histogram.data + ((y + 1)*_hs + 1)*Q2; - float * n = _norm.data + (y + 1)*_hs + 1; - for (size_t x = 0; x < _sx; x++, i++) - n[x] = GetNorm(h + x*Q2); - } - } - - void ExtractFeatures(float * 
features) - { - float32x4_t _02 = vdupq_n_f32(0.2f); - float32x4_t _05 = vdupq_n_f32(0.5f); - float32x4_t _02357 = vdupq_n_f32(0.2357f); - float32x4_t eps = vdupq_n_f32(0.0001f); - for (size_t y = 0; y < _sy; y++) - { - float * ph = _histogram.data + ((y + 1)*_hs + 1)*Q2; - for (size_t x = 0; x < _sx; x++) - { - float * dst = features + (y*_sx + x) * 31; - - float * p0 = _norm.data + y*_hs + x; - float * p1 = p0 + _hs; - float * p2 = p1 + _hs; - - float32x4_t n = SetF32( - p1[1] + p1[2] + p2[1] + p2[2], - p0[1] + p0[2] + p1[1] + p1[2], - p1[0] + p1[1] + p2[0] + p2[1], - p0[0] + p0[1] + p1[0] + p1[1]); - - n = ReciprocalSqrt(vaddq_f32(n, eps)); - - float32x4_t t = vdupq_n_f32(0); - - float * src = ph + x*Q2; - for (int o = 0; o < 16; o += 4) - { - float32x4_t s = Load(src); - float32x4_t h0 = vminq_f32(vmulq_f32(Broadcast<0>(s), n), _02); - float32x4_t h1 = vminq_f32(vmulq_f32(Broadcast<1>(s), n), _02); - float32x4_t h2 = vminq_f32(vmulq_f32(Broadcast<2>(s), n), _02); - float32x4_t h3 = vminq_f32(vmulq_f32(Broadcast<3>(s), n), _02); - t = vaddq_f32(t, vaddq_f32(vaddq_f32(h0, h1), vaddq_f32(h2, h3))); - Store(dst, vmulq_f32(_05, Hadd(Hadd(h0, h1), Hadd(h2, h3)))); - dst += 4; - src += 4; - } - { - float32x4_t h0 = vminq_f32(vmulq_f32(vdupq_n_f32(*src++), n), _02); - float32x4_t h1 = vminq_f32(vmulq_f32(vdupq_n_f32(*src++), n), _02); - t = vaddq_f32(t, vaddq_f32(h0, h1)); - float32x4_t h = Hadd(h0, h1); - Store(dst, vmulq_f32(_05, Hadd(h, h))); - dst += 2; - } - - src = ph + x*Q2; - for (int o = 0; o < 8; o += 4) - { - float32x4_t s = vaddq_f32(Load(src), Load(src + Q)); - float32x4_t h0 = vminq_f32(vmulq_f32(Broadcast<0>(s), n), _02); - float32x4_t h1 = vminq_f32(vmulq_f32(Broadcast<1>(s), n), _02); - float32x4_t h2 = vminq_f32(vmulq_f32(Broadcast<2>(s), n), _02); - float32x4_t h3 = vminq_f32(vmulq_f32(Broadcast<3>(s), n), _02); - Store(dst, vmulq_f32(_05, Hadd(Hadd(h0, h1), Hadd(h2, h3)))); - dst += 4; - src += 4; - } - { - float32x4_t s = vdupq_n_f32(src[0] + src[Q]); - float32x4_t h = vminq_f32(vmulq_f32(s, n), _02); - h = vmulq_f32(_05, h); - *dst++ = ExtractSum32f(h); - } - Store(dst, vmulq_f32(t, _02357)); - } - } - } - - public: - - void Run(const uint8_t * src, size_t stride, size_t width, size_t height, float * features) - { - Init(width, height); - - EstimateHistogram(src, stride, width, height); - - EstimateNorm(); - - ExtractFeatures(features); - } - }; - - void HogExtractFeatures(const uint8_t * src, size_t stride, size_t width, size_t height, float * features) - { - assert(width % 8 == 0 && height % 8 == 0 && width >= 16 && height >= 16); - - HogFeatureExtractor extractor; - extractor.Run(src, stride, width, height, features); - } - - namespace HogSeparableFilter_Detail - { - template SIMD_INLINE void Set(float * dst, const float32x4_t & value, const float32x4_t & mask) - { - Store(dst, value); - } - - template <> SIMD_INLINE void Set<1, false>(float * dst, const float32x4_t & value, const float32x4_t & mask) - { - Store(dst, vaddq_f32(Load(dst), value)); - } - - template <> SIMD_INLINE void Set<1, true>(float * dst, const float32x4_t & value, const float32x4_t & mask) - { - Store(dst, vaddq_f32(Load(dst), And(value, mask))); - } - } - - class HogSeparableFilter - { - typedef Array Array32f; - typedef Array Array128f; - - size_t _w, _h, _s; - Array32f _buffer; - Array128f _filter; - - void Init(size_t w, size_t h, size_t rs, size_t cs) - { - _w = w - rs + 1; - _s = AlignHi(_w, F); - _h = h - cs + 1; - _buffer.Resize(_s*h); - } - - template void FilterRows(const float * src, 
const float32x4_t * filter, size_t size, float * dst) - { - float32x4_t sum = vdupq_n_f32(0); - for (size_t i = 0; i < size; ++i) - sum = vmlaq_f32(sum, Load(src + i), filter[i]); - Store(dst, sum); - } - - void FilterRows(const float * src, size_t srcStride, size_t width, size_t height, const float * filter, size_t size, float * dst, size_t dstStride) - { - _filter.Resize(size); - for (size_t i = 0; i < size; ++i) - _filter[i] = vdupq_n_f32(filter[i]); - - size_t alignedWidth = AlignLo(width, F); - - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += F) - FilterRows(src + col, _filter.data, size, dst + col); - if (alignedWidth != width) - FilterRows(src + width - F, _filter.data, size, dst + width - F); - src += srcStride; - dst += dstStride; - } - } - - template void FilterCols(const float * src, size_t stride, const float32x4_t * filter, size_t size, float * dst, const float32x4_t & mask) - { - float32x4_t sum = vdupq_n_f32(0); - for (size_t i = 0; i < size; ++i, src += stride) - sum = vmlaq_f32(sum, Load(src), filter[i]); - HogSeparableFilter_Detail::Set(dst, sum, mask); - } - - template void FilterCols(const float * src, size_t srcStride, size_t width, size_t height, const float * filter, size_t size, float * dst, size_t dstStride) - { - _filter.Resize(size); - for (size_t i = 0; i < size; ++i) - _filter[i] = vdupq_n_f32(filter[i]); - - size_t alignedWidth = AlignLo(width, F); - float32x4_t tailMask = RightNotZero32f(width - alignedWidth); - - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += F) - FilterCols(src + col, srcStride, _filter.data, size, dst + col, tailMask); - if (alignedWidth != width) - FilterCols(src + width - F, srcStride, _filter.data, size, dst + width - F, tailMask); - src += srcStride; - dst += dstStride; - } - } - - public: - - void Run(const float * src, size_t srcStride, size_t width, size_t height, - const float * rowFilter, size_t rowSize, const float * colFilter, size_t colSize, float * dst, size_t dstStride, int add) - { - Init(width, height, rowSize, colSize); - - FilterRows(src, srcStride, _w, height, rowFilter, rowSize, _buffer.data, _s); - - if (add) - FilterCols<1>(_buffer.data, _s, _w, _h, colFilter, colSize, dst, dstStride); - else - FilterCols<0>(_buffer.data, _s, _w, _h, colFilter, colSize, dst, dstStride); - } - }; - - void HogFilterSeparable(const float * src, size_t srcStride, size_t width, size_t height, - const float * rowFilter, size_t rowSize, const float * colFilter, size_t colSize, float * dst, size_t dstStride, int add) - { - assert(width >= F + rowSize - 1 && height >= colSize - 1); - - HogSeparableFilter filter; - filter.Run(src, srcStride, width, height, rowFilter, rowSize, colFilter, colSize, dst, dstStride, add); - } - } -#endif// SIMD_NEON_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdNeonHogLite.cpp b/src/3rd/Simd/Simd/SimdNeonHogLite.cpp deleted file mode 100644 index 930be7a3..00000000 --- a/src/3rd/Simd/Simd/SimdNeonHogLite.cpp +++ /dev/null @@ -1,945 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdBase.h" -#include "Simd/SimdArray.h" -#include "Simd/SimdExtract.h" -#include "Simd/SimdUpdate.h" -#include "Simd/SimdCompare.h" - -namespace Simd -{ -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - const uint8x16_t K8_KX4 = SIMD_VEC_SETR_EPI8(1, 3, 5, 7, 7, 5, 3, 1, 1, 3, 5, 7, 7, 5, 3, 1); - const uint8x16_t K8_KX8 = SIMD_VEC_SETR_EPI8(1, 3, 5, 7, 9, 11, 13, 15, 15, 13, 11, 9, 7, 5, 3, 1); - - SIMD_INLINE uint16x8_t Hadd16u(uint16x8_t a, uint16x8_t b) - { - return vcombine_u16(vpadd_u16(Half<0>(a), Half<1>(a)), vpadd_u16(Half<0>(b), Half<1>(b))); - } - - SIMD_INLINE uint32x4_t Hadd32u(uint32x4_t a, uint32x4_t b) - { - return vcombine_u32(vpadd_u32(Half<0>(a), Half<1>(a)), vpadd_u32(Half<0>(b), Half<1>(b))); - } - - SIMD_INLINE uint16x8_t Madd8u(uint8x16_t a, uint8x16_t b) - { - return Hadd16u(vmull_u8(Half<0>(a), Half<0>(b)), vmull_u8(Half<1>(a), Half<1>(b))); - } - - SIMD_INLINE int32x4_t Madd16u(uint16x8_t a, uint16x8_t b) - { - return (int32x4_t)Hadd32u(vmull_u16(Half<0>(a), Half<0>(b)), vmull_u16(Half<1>(a), Half<1>(b))); - } - - const uint8x8_t K8_I40 = SIMD_VEC_SETR_PI8(16, 17, 18, 19, 0, 1, 2, 3); - const uint8x8_t K8_I51 = SIMD_VEC_SETR_PI8(20, 21, 22, 23, 4, 5, 6, 7); - const uint8x8_t K8_I62 = SIMD_VEC_SETR_PI8(24, 25, 26, 27, 8, 9, 10, 11); - const uint8x8_t K8_I73 = SIMD_VEC_SETR_PI8(28, 29, 30, 31, 12, 13, 14, 15); - - SIMD_INLINE float32x2_t Permute(float32x4x2_t v, uint8x8_t i) - { - return vreinterpret_f32_u8(vtbl4_u8(*(uint8x8x4_t*)&v, i)); - } - - SIMD_INLINE void UzpAs32(const uint16x8_t * src, uint16x8_t * dst) - { - *(uint32x4x2_t*)dst = vuzpq_u32(vreinterpretq_u32_u16(src[0]), vreinterpretq_u32_u16(src[1])); - } - - template class HogLiteFeatureExtractor - { - static const size_t FQ = 8; - static const size_t HQ = FQ / 2; - static const size_t DQ = FQ * 2; - - typedef Array Bytes; - typedef Array Ints; - typedef Array Floats; - - size_t _hx, _fx, _w, _aw; - Bytes _value, _index; - Ints _hi[2]; - Floats _hf[2], _nf[4], _nb; - int _k0[cell], _k1[cell]; - //__m128i _kx4, _kx8; - float32x4_t _k, _02, _05, _02357, _eps; - - SIMD_INLINE void Init(size_t width) - { - _w = (width / cell - 1)*cell; - _aw = AlignLo(_w, A); - _hx = width / cell; - _fx = _hx - 2; - _value.Resize(_aw + 3 * A, true); - _index.Resize(_aw + 3 * A, true); - for (size_t i = 0; i < cell; ++i) - { - _k0[i] = int(cell - i - 1) * 2 + 1; - _k1[i] = int(i) * 2 + 1; - } - 
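As an aside, the _k0/_k1 tables filled just above are fixed-point bilinear weights: for cell = 4 they come out to {7, 5, 3, 1} and {1, 3, 5, 7}, the same digits baked into the K8_KX4 constant, and k0 + k1 == 2 * cell at every offset, so a full 2-D weight ky * kx sums to (2 * cell)^2, which the normalization constant _k = 1 / Square(cell * 2) cancels later in UpdateFloatHistogram. A standalone, print-only sketch (hypothetical, not part of the deleted file):

    #include <cstdio>

    // Model of the _k0/_k1 setup in HogLiteFeatureExtractor::Init.
    int main()
    {
        const int cell = 4; // the cell == 8 case follows the same pattern with K8_KX8
        for (int i = 0; i < cell; ++i)
        {
            int k0 = (cell - i - 1) * 2 + 1; // 7, 5, 3, 1
            int k1 = i * 2 + 1;              // 1, 3, 5, 7
            std::printf("i=%d k0=%d k1=%d k0+k1=%d\n", i, k0, k1, k0 + k1);
        }
        return 0;
    }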
for (size_t i = 0; i < 2; ++i) - { - _hi[i].Resize((_hx + 4)*FQ, true); - _hf[i].Resize(_hx*FQ); - } - for (size_t i = 0; i < 4; ++i) - _nf[i].Resize(_hx + DF); - _nb.Resize(_hx * 4); - _k = vdupq_n_f32(1.0f / Simd::Square(cell * 2)); - _02 = vdupq_n_f32(0.2f); - _05 = vdupq_n_f32(0.5f); - _02357 = vdupq_n_f32(0.2357f); - _eps = vdupq_n_f32(0.0001f); - } - - template static SIMD_INLINE void SetIndexAndValue(const uint8_t * src, size_t stride, uint8_t * value, uint8_t * index) - { - uint8x16_t y0 = Load(src - stride); - uint8x16_t y1 = Load(src + stride); - uint8x16_t x0 = Load(src - 1); - uint8x16_t x1 = Load(src + 1); - - uint8x16_t ady = vabdq_u8(y0, y1); - uint8x16_t adx = vabdq_u8(x0, x1); - - uint8x16_t max = vmaxq_u8(ady, adx); - uint8x16_t min = vminq_u8(ady, adx); - uint8x16_t val = vqaddq_u8(max, vrhaddq_u8(min, K8_00)); - Store(value, val); - - uint8x16_t idx = vbslq_u8(Compare8u(adx, ady), K8_00, K8_01); - idx = vbslq_u8(Compare8u(x1, x0), idx, vsubq_u8(K8_03, idx)); - idx = vbslq_u8(Compare8u(y1, y0), idx, vsubq_u8(K8_07, idx)); - Store(index, idx); - } - - SIMD_INLINE void SetIndexAndValue(const uint8_t * src, size_t stride) - { - uint8_t * value = _value.data + A; - uint8_t * index = _index.data + A; - for (size_t col = 0; col < _aw; col += A) - SetIndexAndValue(src + col, stride, value + col, index + col); - if (_aw < _w) - { - size_t col = _w - A; - SetIndexAndValue(src + col, stride, value + col, index + col); - } - } - - static SIMD_INLINE void UpdateIntegerHistogram4x4(uint8_t * value, uint8_t * index, const uint16x8_t & ky0, const uint16x8_t & ky1, int * h0, int * h1) - { - uint8x16_t val = Load(value); - uint8x16_t idx = Load(index); - uint8x16_t cur0 = K8_00; - uint8x16_t cur1 = K8_01; - uint16x8_t dirs[4]; - for (size_t i = 0; i < 4; ++i) - { - uint16x8_t dir0 = Madd8u(vandq_u8(vceqq_u8(idx, cur0), val), K8_KX4); - uint16x8_t dir1 = Madd8u(vandq_u8(vceqq_u8(idx, cur1), val), K8_KX4); - dirs[i] = Hadd16u(dir0, dir1); - cur0 = vqaddq_u8(cur0, K8_02); - cur1 = vqaddq_u8(cur1, K8_02); - } - UzpAs32(dirs + 0, dirs + 0); - UzpAs32(dirs + 2, dirs + 2); - Store(h0 + 0 * F, vaddq_s32(Load(h0 + 0 * F), Madd16u(dirs[0], ky0))); - Store(h0 + 1 * F, vaddq_s32(Load(h0 + 1 * F), Madd16u(dirs[2], ky0))); - Store(h0 + 4 * F, vaddq_s32(Load(h0 + 4 * F), Madd16u(dirs[1], ky0))); - Store(h0 + 5 * F, vaddq_s32(Load(h0 + 5 * F), Madd16u(dirs[3], ky0))); - Store(h1 + 0 * F, vaddq_s32(Load(h1 + 0 * F), Madd16u(dirs[0], ky1))); - Store(h1 + 1 * F, vaddq_s32(Load(h1 + 1 * F), Madd16u(dirs[2], ky1))); - Store(h1 + 4 * F, vaddq_s32(Load(h1 + 4 * F), Madd16u(dirs[1], ky1))); - Store(h1 + 5 * F, vaddq_s32(Load(h1 + 5 * F), Madd16u(dirs[3], ky1))); - } - - SIMD_INLINE void UpdateIntegerHistogram4x4(size_t rowI, size_t rowF) - { - int * h0 = _hi[(rowI + 0) & 1].data; - int * h1 = _hi[(rowI + 1) & 1].data; - uint8_t * value = _value.data + A - cell; - uint8_t * index = _index.data + A - cell; - uint16x8_t ky0 = vdupq_n_u16(_k0[rowF]); - uint16x8_t ky1 = vdupq_n_u16(_k1[rowF]); - for (size_t col = 0; col <= _w;) - { - UpdateIntegerHistogram4x4(value + col, index + col, ky0, ky1, h0, h1); - col += cell; - h0 += FQ; - h1 += FQ; - UpdateIntegerHistogram4x4(value + col, index + col, ky0, ky1, h0, h1); - col += 3 * cell; - h0 += 3 * FQ; - h1 += 3 * FQ; - } - } - - SIMD_INLINE void UpdateIntegerHistogram8x8(size_t rowI, size_t rowF) - { - int * h0 = _hi[(rowI + 0) & 1].data; - int * h1 = _hi[(rowI + 1) & 1].data; - uint8_t * value = _value.data + A - cell; - uint8_t * index = _index.data + A - cell; - 
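The integer histogram updates in this class lean on the Madd8u/Madd16u helpers defined near the top of the file: in scalar terms, Madd8u multiplies two 16-lane u8 vectors elementwise and sums adjacent pairs into 8 u16 lanes (roughly the NEON counterpart of SSE's maddubs), which is how one step applies the per-pixel kernel weights and reduces neighboring pixels at the same time. A scalar model (Madd8uScalar is a hypothetical name):

    #include <cstdint>
    #include <cstddef>

    // Scalar model of Madd8u (vmull_u8 followed by pairwise add):
    // out[i] = a[2i] * b[2i] + a[2i+1] * b[2i+1], 16 u8 inputs -> 8 u16 lanes.
    // In this file one operand is a kernel weight of at most 15, so with the
    // other operand at most 255 the u16 lanes cannot overflow (15*255*2 = 7650).
    static void Madd8uScalar(const uint8_t a[16], const uint8_t b[16], uint16_t out[8])
    {
        for (std::size_t i = 0; i < 8; ++i)
            out[i] = (uint16_t)(a[2 * i] * b[2 * i] + a[2 * i + 1] * b[2 * i + 1]);
    }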
uint16x8_t ky0 = vdupq_n_u16(_k0[rowF]); - uint16x8_t ky1 = vdupq_n_u16(_k1[rowF]); - for (size_t col = 0; col <= _w; col += cell) - { - uint8x16_t val = Load(value + col); - uint8x16_t idx = Load(index + col); - uint8x16_t cur0 = K8_00; - uint8x16_t cur1 = K8_01; - uint16x8_t dirs[4]; - for (size_t i = 0; i < 4; ++i) - { - uint16x8_t dir0 = Madd8u(vandq_u8(vceqq_u8(idx, cur0), val), K8_KX8); - uint16x8_t dir1 = Madd8u(vandq_u8(vceqq_u8(idx, cur1), val), K8_KX8); - dirs[i] = Hadd16u(dir0, dir1); - cur0 = vqaddq_u8(cur0, K8_02); - cur1 = vqaddq_u8(cur1, K8_02); - } - dirs[0] = Hadd16u(dirs[0], dirs[1]); - dirs[1] = Hadd16u(dirs[2], dirs[3]); - Store(h0 + 0, vaddq_s32(Load(h0 + 0), Madd16u(dirs[0], ky0))); - Store(h0 + F, vaddq_s32(Load(h0 + F), Madd16u(dirs[1], ky0))); - Store(h1 + 0, vaddq_s32(Load(h1 + 0), Madd16u(dirs[0], ky1))); - Store(h1 + F, vaddq_s32(Load(h1 + F), Madd16u(dirs[1], ky1))); - h0 += FQ; - h1 += FQ; - } - } - - SIMD_INLINE void UpdateFloatHistogram(size_t rowI) - { - Ints & hi = _hi[rowI & 1]; - Floats & hf = _hf[rowI & 1]; - Floats & nf = _nf[rowI & 3]; - - for (size_t i = 0; i < hf.size; i += DF) - { - Store(hf.data + i + 0, vmulq_f32(_k, vcvtq_f32_s32(Load(hi.data + i + 0)))); - Store(hf.data + i + F, vmulq_f32(_k, vcvtq_f32_s32(Load(hi.data + i + F)))); - } - hi.Clear(); - - const float * h = hf.data; - for (size_t x = 0; x < _hx; ++x, h += FQ) - { - float32x4_t h0 = Load(h + 00); - float32x4_t h1 = Load(h + HQ); - float32x4_t s1 = vaddq_f32(h0, h1); - float32x4_t s2 = vmulq_f32(s1, s1); - nf.data[x] = ExtractSum32f(s2); - } - } - - SIMD_INLINE void BlockNorm(size_t rowI) - { - const float * src0 = _nf[(rowI - 2) & 3].data; - const float * src1 = _nf[(rowI - 1) & 3].data; - const float * src2 = _nf[(rowI - 0) & 3].data; - float * dst = _nb.data; - for (size_t x = 0; x < _fx; x += 3, src0 += 3, src1 += 3, src2 += 3, dst += 3 * F) - { - float32x4_t s00 = Load(src0 + 0); - float32x4_t s01 = Load(src0 + 1); - float32x4_t s10 = Load(src1 + 0); - float32x4_t s11 = Load(src1 + 1); - float32x4_t s20 = Load(src2 + 0); - float32x4_t s21 = Load(src2 + 1); - float32x4_t v00 = vaddq_f32(s00, s10); - float32x4_t v01 = vaddq_f32(s01, s11); - float32x4_t v10 = vaddq_f32(s10, s20); - float32x4_t v11 = vaddq_f32(s11, s21); - float32x4x2_t h; - h.val[0] = Hadd(v00, v01); - h.val[1] = Hadd(v10, v11); - float32x2_t p40 = Permute(h, K8_I40); - float32x2_t p51 = Permute(h, K8_I51); - float32x2_t p62 = Permute(h, K8_I62); - float32x2_t p73 = Permute(h, K8_I73); - Store(dst + 0 * HF, p62); - Store(dst + 1 * HF, p40); - Store(dst + 2 * HF, p51); - Store(dst + 3 * HF, p62); - Store(dst + 4 * HF, p73); - Store(dst + 5 * HF, p51); - } - } - - SIMD_INLINE void SetFeatures(size_t rowI, float * dst) - { - const float * hf = _hf[(rowI - 1) & 1].data + FQ; - const float * nb = _nb.data; - for (size_t x = 0; x < _fx; ++x, nb += 4) - { - float32x4_t n = ReciprocalSqrt<1>(vaddq_f32(Load(nb), _eps)); - float32x4_t t = vdupq_n_f32(0.0f); - const float * src = hf + x * FQ; - for (int o = 0; o < FQ; o += 4) - { - float32x4_t s = Load(src); - float32x4_t h0 = vminq_f32(vmulq_f32(Broadcast<0>(s), n), _02); - float32x4_t h1 = vminq_f32(vmulq_f32(Broadcast<1>(s), n), _02); - float32x4_t h2 = vminq_f32(vmulq_f32(Broadcast<2>(s), n), _02); - float32x4_t h3 = vminq_f32(vmulq_f32(Broadcast<3>(s), n), _02); - t = vaddq_f32(t, vaddq_f32(vaddq_f32(h0, h1), vaddq_f32(h2, h3))); - Store(dst, vmulq_f32(_05, Hadd(Hadd(h0, h1), Hadd(h2, h3)))); - dst += F; - src += F; - } - src = hf + x * FQ; - float32x4_t s = 
vaddq_f32(Load(src), Load(src + HQ)); - float32x4_t h0 = vminq_f32(vmulq_f32(Broadcast<0>(s), n), _02); - float32x4_t h1 = vminq_f32(vmulq_f32(Broadcast<1>(s), n), _02); - float32x4_t h2 = vminq_f32(vmulq_f32(Broadcast<2>(s), n), _02); - float32x4_t h3 = vminq_f32(vmulq_f32(Broadcast<3>(s), n), _02); - Store(dst, vmulq_f32(_05, Hadd(Hadd(h0, h1), Hadd(h2, h3)))); - dst += 4; - Store(dst, vmulq_f32(t, _02357)); - dst += 4; - } - } - - public: - - void Run(const uint8_t * src, size_t srcStride, size_t width, size_t height, float * features, size_t featuresStride) - { - assert(cell == 8 || cell == 4); - assert(width >= cell * 3 && height >= cell * 3); - - Init(width); - - src += (srcStride + 1)*cell / 2; - height = (height / cell - 1)*cell; - - for (size_t row = 0; row < height; ++row) - { - SetIndexAndValue(src, srcStride); - size_t rowI = row / cell; - size_t rowF = row & (cell - 1); - if (cell == 4) - UpdateIntegerHistogram4x4(rowI, rowF); - else - UpdateIntegerHistogram8x8(rowI, rowF); - if (rowF == cell - 1) - { - UpdateFloatHistogram(rowI); - if (rowI >= 2) - { - BlockNorm(rowI); - SetFeatures(rowI, features); - features += featuresStride; - } - } - src += srcStride; - } - size_t rowI = height / cell; - UpdateFloatHistogram(rowI); - BlockNorm(rowI); - SetFeatures(rowI, features); - } - }; - - void HogLiteExtractFeatures(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t cell, float * features, size_t featuresStride) - { - if (cell == 4) - { - HogLiteFeatureExtractor<4> extractor; - extractor.Run(src, srcStride, width, height, features, featuresStride); - } - else - { - HogLiteFeatureExtractor<8> extractor; - extractor.Run(src, srcStride, width, height, features, featuresStride); - } - } - - class HogLiteFeatureFilter - { - template SIMD_INLINE void ProductSum1x1(const float * src, const float * filter, float32x4_t & sum) - { - float32x4_t _src = Load(src); - float32x4_t _filter = Load(filter); - sum = vmlaq_f32(sum, _src, _filter); - } - - template SIMD_INLINE void ProductSum1x4(const float * src, const float * filter, float32x4_t * sums) - { - float32x4_t _filter = Load(filter); - sums[0] = vmlaq_f32(sums[0], Load(src + 0 * step), _filter); - sums[1] = vmlaq_f32(sums[1], Load(src + 1 * step), _filter); - sums[2] = vmlaq_f32(sums[2], Load(src + 2 * step), _filter); - sums[3] = vmlaq_f32(sums[3], Load(src + 3 * step), _filter); - } - - template void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, const float * filter, size_t filterWidth, size_t filterHeight, float * dst, size_t dstStride) - { - size_t filterStride = featureSize * filterWidth; - size_t alignedDstWidth = AlignLo(dstWidth, 4); - for (size_t dstRow = 0; dstRow < dstHeight; ++dstRow) - { - size_t dstCol = 0; - for (; dstCol < alignedDstWidth; dstCol += 4) - { - float32x4_t sums[4] = { vdupq_n_f32(0.0f), vdupq_n_f32(0.0f), vdupq_n_f32(0.0f), vdupq_n_f32(0.0f) }; - const float * pSrc = src + dstRow * srcStride + dstCol * featureSize; - const float * pFilter = filter; - for (size_t filterRow = 0; filterRow < filterHeight; ++filterRow) - { - size_t filterCol = 0; - for (; filterCol < filterStride; filterCol += F) - ProductSum1x4(pSrc + filterCol, pFilter + filterCol, sums); - pSrc += srcStride; - pFilter += filterStride; - } - Store(dst + dstCol, Extract4Sums(sums)); - } - for (; dstCol < dstWidth; ++dstCol) - { - float32x4_t sum = vdupq_n_f32(0.0f); - const float * pSrc = src + dstRow * srcStride + dstCol * featureSize; - const float * pFilter = filter; - for (size_t 
filterRow = 0; filterRow < filterHeight; ++filterRow) - { - for (size_t filterCol = 0; filterCol < filterStride; filterCol += F) - ProductSum1x1(pSrc + filterCol, pFilter + filterCol, sum); - pSrc += srcStride; - pFilter += filterStride; - } - dst[dstCol] = ExtractSum32f(sum); - } - dst += dstStride; - } - } - - template void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) - { - size_t filterStride = featureSize * filterWidth; - size_t alignedDstWidth = AlignLo(dstWidth, 4); - float32x4_t _min = vdupq_n_f32(-FLT_MAX); - for (size_t dstRow = 0; dstRow < dstHeight; ++dstRow) - { - size_t dstCol = 0; - for (; dstCol < alignedDstWidth; dstCol += 4) - { - uint32x4_t _mask = Load(mask + dstCol); - if (TestZ(_mask)) - Store(dst + dstCol, _min); - else - { - float32x4_t sums[4] = { vdupq_n_f32(0.0f), vdupq_n_f32(0.0f), vdupq_n_f32(0.0f), vdupq_n_f32(0.0f) }; - const float * pSrc = src + dstRow * srcStride + dstCol * featureSize; - const float * pFilter = filter; - for (size_t filterRow = 0; filterRow < filterHeight; ++filterRow) - { - size_t filterCol = 0; - for (; filterCol < filterStride; filterCol += F) - ProductSum1x4(pSrc + filterCol, pFilter + filterCol, sums); - pSrc += srcStride; - pFilter += filterStride; - } - Store(dst + dstCol, vbslq_f32(_mask, Extract4Sums(sums), _min)); - } - } - for (; dstCol < dstWidth; ++dstCol) - { - if (mask[dstCol]) - { - float32x4_t sum = vdupq_n_f32(0.0f); - const float * pSrc = src + dstRow * srcStride + dstCol * featureSize; - const float * pFilter = filter; - for (size_t filterRow = 0; filterRow < filterHeight; ++filterRow) - { - for (size_t filterCol = 0; filterCol < filterStride; filterCol += F) - ProductSum1x1(pSrc + filterCol, pFilter + filterCol, sum); - pSrc += srcStride; - pFilter += filterStride; - } - dst[dstCol] = ExtractSum32f(sum); - } - else - dst[dstCol] = -FLT_MAX; - } - dst += dstStride; - mask += maskStride; - } - } - - template void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, float * dst, size_t dstStride) - { - if (featureSize == 16) - Filter(src, srcStride, dstWidth, dstHeight, filter, filterWidth, filterHeight, dst, dstStride); - else - Filter(src, srcStride, dstWidth, dstHeight, filter, filterWidth, filterHeight, dst, dstStride); - } - - template void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) - { - if (featureSize == 16) - Filter(src, srcStride, dstWidth, dstHeight, filter, filterWidth, filterHeight, mask, maskStride, dst, dstStride); - else - Filter(src, srcStride, dstWidth, dstHeight, filter, filterWidth, filterHeight, mask, maskStride, dst, dstStride); - } - - public: - - void Run(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) - { - assert(featureSize == 8 || featureSize == 16); - assert(srcWidth >= filterWidth && srcHeight >= filterHeight); - - size_t dstWidth = srcWidth - filterWidth + 1; - size_t dstHeight = srcHeight - filterHeight + 1; - - if (mask) - { - if (Aligned(src) && 
Aligned(srcStride) && Aligned(filter)) - Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterWidth, filterHeight, mask, maskStride, dst, dstStride); - else - Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterWidth, filterHeight, mask, maskStride, dst, dstStride); - } - else - { - if (Aligned(src) && Aligned(srcStride) && Aligned(filter)) - Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterWidth, filterHeight, dst, dstStride); - else - Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterWidth, filterHeight, dst, dstStride); - } - } - }; - - void HogLiteFilterFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) - { - HogLiteFeatureFilter featureFilter; - featureFilter.Run(src, srcStride, srcWidth, srcHeight, featureSize, filter, filterWidth, filterHeight, mask, maskStride, dst, dstStride); - } - - namespace HogLiteFeatureResizerDetail - { - template struct Feature - { - template static SIMD_INLINE void Interpolate(const float * src0, const float * src1, const float32x4_t k[2][2], float * dst); - }; - - template <> struct Feature<8> - { - template static SIMD_INLINE void Interpolate(const float * src0, const float * src1, const float32x4_t k[2][2], float * dst) - { - Store(dst + 0 * F, vaddq_f32( - vmlaq_f32(vmulq_f32(Load(src0 + 0 * F), k[0][0]), Load(src0 + 2 * F), k[0][1]), - vmlaq_f32(vmulq_f32(Load(src1 + 0 * F), k[1][0]), Load(src1 + 2 * F), k[1][1]))); - Store(dst + 1 * F, vaddq_f32( - vmlaq_f32(vmulq_f32(Load(src0 + 1 * F), k[0][0]), Load(src0 + 3 * F), k[0][1]), - vmlaq_f32(vmulq_f32(Load(src1 + 1 * F), k[1][0]), Load(src1 + 3 * F), k[1][1]))); - } - }; - - template <> struct Feature<16> - { - template static SIMD_INLINE void Interpolate(const float * src0, const float * src1, const float32x4_t k[2][2], float * dst) - { - Store(dst + 0 * F, vaddq_f32( - vmlaq_f32(vmulq_f32(Load(src0 + 0 * F), k[0][0]), Load(src0 + 4 * F), k[0][1]), - vmlaq_f32(vmulq_f32(Load(src1 + 0 * F), k[1][0]), Load(src1 + 4 * F), k[1][1]))); - Store(dst + 1 * F, vaddq_f32( - vmlaq_f32(vmulq_f32(Load(src0 + 1 * F), k[0][0]), Load(src0 + 5 * F), k[0][1]), - vmlaq_f32(vmulq_f32(Load(src1 + 1 * F), k[1][0]), Load(src1 + 5 * F), k[1][1]))); - Store(dst + 2 * F, vaddq_f32( - vmlaq_f32(vmulq_f32(Load(src0 + 2 * F), k[0][0]), Load(src0 + 6 * F), k[0][1]), - vmlaq_f32(vmulq_f32(Load(src1 + 2 * F), k[1][0]), Load(src1 + 6 * F), k[1][1]))); - Store(dst + 3 * F, vaddq_f32( - vmlaq_f32(vmulq_f32(Load(src0 + 3 * F), k[0][0]), Load(src0 + 7 * F), k[0][1]), - vmlaq_f32(vmulq_f32(Load(src1 + 3 * F), k[1][0]), Load(src1 + 7 * F), k[1][1]))); - } - }; - } - - class HogLiteFeatureResizer - { - typedef Array Ints; - typedef Array Floats; - - Ints _iy, _ix; - Floats _ky, _kx; - - void InitIndexWeight(size_t srcSize, size_t dstSize, size_t dstStep, Ints & indexes, Floats & weights) - { - indexes.Resize(dstSize); - weights.Resize(dstSize); - - float scale = float(srcSize) / float(dstSize); - for (size_t i = 0; i < dstSize; ++i) - { - float weight = (float)((i + 0.5f)*scale - 0.5f); - int index = (int)::floor(weight); - weight -= index; - if (index < 0) - { - index = 0; - weight = 0.0f; - } - if (index > (int)srcSize - 2) - { - index = (int)srcSize - 2; - weight = 1.0f; - } - indexes[i] = int(index*dstStep); - weights[i] = weight; - } - } - - template void Resize(const float * 
src, size_t srcStride, float * dst, size_t dstStride, size_t dstWidth, size_t dstHeight) - { - float32x4_t _1 = vdupq_n_f32(1.0f); - for (size_t rowDst = 0; rowDst < dstHeight; ++rowDst) - { - float32x4_t ky1 = vdupq_n_f32(_ky[rowDst]); - float32x4_t ky0 = vsubq_f32(_1, ky1); - const float * pSrc = src + _iy[rowDst]; - float * pDst = dst + rowDst * dstStride; - for (size_t colDst = 0; colDst < dstWidth; ++colDst, pDst += featureSize) - { - float32x4_t kx1 = vdupq_n_f32(_kx[colDst]); - float32x4_t kx0 = vsubq_f32(_1, kx1); - float32x4_t k[2][2]; - k[0][0] = vmulq_f32(ky0, kx0); - k[0][1] = vmulq_f32(ky0, kx1); - k[1][0] = vmulq_f32(ky1, kx0); - k[1][1] = vmulq_f32(ky1, kx1); - const float * pSrc0 = pSrc + _ix[colDst]; - const float * pSrc1 = pSrc0 + srcStride; - HogLiteFeatureResizerDetail::Feature:: template Interpolate(pSrc0, pSrc1, k, pDst); - } - } - } - - template void Resize(const float * src, size_t srcStride, size_t featureSize, float * dst, size_t dstStride, size_t dstWidth, size_t dstHeight) - { - if (featureSize == 8) - Resize(src, srcStride, dst, dstStride, dstWidth, dstHeight); - else - Resize(src, srcStride, dst, dstStride, dstWidth, dstHeight); - } - - public: - void Run(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, float * dst, size_t dstStride, size_t dstWidth, size_t dstHeight) - { - assert(featureSize == 8 || featureSize == 16); - - if (srcWidth == dstWidth && srcHeight == dstHeight) - { - size_t size = sizeof(float)*srcWidth*featureSize; - for (size_t row = 0; row < dstHeight; ++row) - memcpy(dst + row * dstStride, src + row * srcStride, size); - return; - } - - InitIndexWeight(srcWidth, dstWidth, featureSize, _ix, _kx); - InitIndexWeight(srcHeight, dstHeight, srcStride, _iy, _ky); - - if (Aligned(src) && Aligned(dst)) - Resize(src, srcStride, featureSize, dst, dstStride, dstWidth, dstHeight); - else - Resize(src, srcStride, featureSize, dst, dstStride, dstWidth, dstHeight); - } - }; - - void HogLiteResizeFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, float * dst, size_t dstStride, size_t dstWidth, size_t dstHeight) - { - HogLiteFeatureResizer featureResizer; - featureResizer.Run(src, srcStride, srcWidth, srcHeight, featureSize, dst, dstStride, dstWidth, dstHeight); - } - - template void HogLiteCompressFeatures(const float * src, size_t srcStride, size_t width, size_t height, const float * pca, float * dst, size_t dstStride) - { - for (size_t row = 0; row < height; ++row) - { - const float * s = src; - float * d = dst; - for (size_t col = 0; col < width; ++col) - { - const float * p = pca; - for (size_t i = 0; i < 8; i += 4, p += 64) - { - float32x4_t sums[4] = { vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0) }; - for (size_t j = 0; j < 16; j += F) - { - float32x4_t _s = Load(s + j); - sums[0] = vmlaq_f32(sums[0], _s, Load(p + j + 00)); - sums[1] = vmlaq_f32(sums[1], _s, Load(p + j + 16)); - sums[2] = vmlaq_f32(sums[2], _s, Load(p + j + 32)); - sums[3] = vmlaq_f32(sums[3], _s, Load(p + j + 48)); - } - Store(d + i, Extract4Sums(sums)); - } - s += 16; - d += 8; - } - src += srcStride; - dst += dstStride; - } - - } - - void HogLiteCompressFeatures(const float * src, size_t srcStride, size_t width, size_t height, const float * pca, float * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(pca) && Aligned(dst)) - HogLiteCompressFeatures(src, srcStride, width, height, pca, dst, dstStride); - else - HogLiteCompressFeatures(src, srcStride, width, height, 
pca, dst, dstStride); - } - - class HogLiteSeparableFilter - { - size_t _dstWidth, _dstHeight, _dstStride; - Array32f _buffer; - Array128f _filter; - - void Init(size_t srcWidth, size_t srcHeight, size_t hSize, size_t vSize) - { - _dstWidth = srcWidth - hSize + 1; - _dstStride = AlignHi(_dstWidth, F); - _dstHeight = srcHeight - vSize + 1; - _buffer.Resize(_dstStride*srcHeight); - } - - template static SIMD_INLINE void FilterHx1(const float * src, const float * filter, float32x4_t & sum) - { - float32x4_t _src = Load(src); - float32x4_t _filter = Load(filter); - sum = vmlaq_f32(sum, _src, _filter); - } - - template static SIMD_INLINE void FilterHx4(const float * src, const float * filter, float32x4_t * sums) - { - float32x4_t _filter = Load(filter); - sums[0] = vmlaq_f32(sums[0], Load(src + 0 * step), _filter); - sums[1] = vmlaq_f32(sums[1], Load(src + 1 * step), _filter); - sums[2] = vmlaq_f32(sums[2], Load(src + 2 * step), _filter); - sums[3] = vmlaq_f32(sums[3], Load(src + 3 * step), _filter); - } - - template void FilterH(const float * src, size_t srcStride, size_t width, size_t height, const float * filter, size_t size, float * dst, size_t dstStride) - { - size_t alignedWidth = AlignLo(width, 4); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < alignedWidth; col += 4) - { - float32x4_t sums[4] = { vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0) }; - const float * s = src + col * step; - for (size_t i = 0; i < size; i += F) - FilterHx4(s + i, filter + i, sums); - Store(dst + col, Extract4Sums(sums)); - } - for (; col < width; ++col) - { - float32x4_t sum = vdupq_n_f32(0); - const float * s = src + col * step; - for (size_t i = 0; i < size; i += F) - FilterHx1(s + i, filter + i, sum); - dst[col] = ExtractSum32f(sum); - } - src += srcStride; - dst += dstStride; - } - } - - template void FilterH(const float * src, size_t srcStride, size_t width, size_t height, size_t step, const float * filter, size_t size, float * dst, size_t dstStride) - { - if (step == 16) - FilterH(src, srcStride, width, height, filter, size, dst, dstStride); - else - FilterH(src, srcStride, width, height, filter, size, dst, dstStride); - } - - template static SIMD_INLINE void FilterV(const float * src, size_t stride, const float32x4_t * filter, size_t size, float * dst, const float32x4_t & mask) - { - float32x4_t sum = vdupq_n_f32(0); - for (size_t i = 0; i < size; ++i, src += stride) - sum = vmlaq_f32(sum, Load(src), filter[i]); - Update(dst, Masked(sum, mask)); - } - - template void FilterV(const float * src, size_t srcStride, size_t width, size_t height, const float * filter, size_t size, float * dst, size_t dstStride) - { - _filter.Resize(size); - for (size_t i = 0; i < size; ++i) - _filter[i] = vdupq_n_f32(filter[i]); - - size_t alignedWidth = AlignLo(width, F); - float32x4_t tailMask = RightNotZero32f(width - alignedWidth); - - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += F) - FilterV(src + col, srcStride, _filter.data, size, dst + col, tailMask); - if (alignedWidth != width) - FilterV(src + width - F, srcStride, _filter.data, size, dst + width - F, tailMask); - src += srcStride; - dst += dstStride; - } - } - - template void FilterV(const float * src, size_t srcStride, size_t width, size_t height, const float * filter, size_t size, float * dst, size_t dstStride) - { - if (Aligned(dst) && Aligned(dstStride)) - FilterV(src, srcStride, width, height, filter, size, dst, dstStride); - else - FilterV(src, srcStride, 
width, height, filter, size, dst, dstStride); - } - - public: - - void Run(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * hFilter, size_t hSize, const float * vFilter, size_t vSize, float * dst, size_t dstStride, int add) - { - assert(featureSize == 8 || featureSize == 16); - assert(srcWidth >= hSize && srcHeight >= vSize); - - Init(srcWidth, srcHeight, hSize, vSize); - - if (Aligned(src) && Aligned(srcStride) && Aligned(hFilter)) - FilterH(src, srcStride, _dstWidth, srcHeight, featureSize, hFilter, hSize*featureSize, _buffer.data, _dstStride); - else - FilterH(src, srcStride, _dstWidth, srcHeight, featureSize, hFilter, hSize*featureSize, _buffer.data, _dstStride); - - if (add) - FilterV(_buffer.data, _dstStride, _dstWidth, _dstHeight, vFilter, vSize, dst, dstStride); - else - FilterV(_buffer.data, _dstStride, _dstWidth, _dstHeight, vFilter, vSize, dst, dstStride); - } - }; - - void HogLiteFilterSeparable(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * hFilter, size_t hSize, const float * vFilter, size_t vSize, float * dst, size_t dstStride, int add) - { - HogLiteSeparableFilter filter; - filter.Run(src, srcStride, srcWidth, srcHeight, featureSize, hFilter, hSize, vFilter, vSize, dst, dstStride, add); - } - - void HogLiteFindMax7x7(const float * a, size_t aStride, const float * b, size_t bStride, size_t height, float * pValue, size_t * pCol, size_t * pRow) - { - float32x4_t max = vdupq_n_f32(-FLT_MAX), val; - uint32x4_t idx = vdupq_n_u32(0); - uint32x4_t cur = K32_0123; - for (size_t row = 0; row < height; ++row) - { - val = vaddq_f32(Load(a + 0), Load(b + 0)); - idx = vbslq_u32(vcgtq_f32(val, max), cur, idx); - max = vmaxq_f32(max, val); - cur = vaddq_u32(cur, K32_00000003); - val = vaddq_f32(Load(a + 3), Load(b + 3)); - idx = vbslq_u32(vcgtq_f32(val, max), cur, idx); - max = vmaxq_f32(max, val); - cur = vaddq_u32(cur, K32_00000005); - a += aStride; - b += bStride; - } - - uint32_t _idx[F]; - float _max[F]; - Store(_max, max); - Store(_idx, idx); - *pValue = -FLT_MAX; - for (size_t i = 0; i < F; ++i) - { - if (_max[i] > *pValue) - { - *pValue = _max[i]; - *pCol = _idx[i] & 7; - *pRow = _idx[i] / 8; - } - else if (_max[i] == *pValue && *pRow > _idx[i] / 8) - { - *pCol = _idx[i] & 7; - *pRow = _idx[i] / 8; - } - } - } - - SIMD_INLINE void Fill7x7(uint32_t * dst, size_t stride) - { - for (size_t row = 0; row < 7; ++row) - { - Store(dst + 0, K32_FFFFFFFF); - Store(dst + 3, K32_FFFFFFFF); - dst += stride; - } - } - - template void HogLiteCreateMask7x7(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, const float * threshold, uint32_t * dst, size_t dstStride) - { - size_t dstStartEnd = 7 - scale; - size_t dstRowSize = (srcWidth*scale + 7 - scale) * sizeof(uint32_t); - for (size_t dstRow = 0; dstRow < dstStartEnd; ++dstRow) - memset(dst + dstRow * dstStride, 0, dstRowSize); - - size_t alignedSrcWidth = AlignLo(srcWidth, F); - float32x4_t _threshold = vdupq_n_f32(*threshold); - for (size_t srcRow = 0; srcRow < srcHeight; ++srcRow) - { - for (size_t dstRow = 0; dstRow < scale; ++dstRow) - memset(dst + (dstStartEnd + dstRow)*dstStride, 0, dstRowSize); - - size_t srcCol = 0; - for (; srcCol < alignedSrcWidth; srcCol += F) - { - uint32x4_t mask = vcgtq_f32(Load(src + srcCol), _threshold); - uint32_t * pDst = dst + srcCol * scale; - if (vgetq_lane_u32(mask, 0)) - Fill7x7(pDst + 0 * scale, dstStride); - if (vgetq_lane_u32(mask, 1)) - Fill7x7(pDst + 1 * scale, 
dstStride); - if (vgetq_lane_u32(mask, 2)) - Fill7x7(pDst + 2 * scale, dstStride); - if (vgetq_lane_u32(mask, 3)) - Fill7x7(pDst + 3 * scale, dstStride); - } - for (; srcCol < srcWidth; ++srcCol) - { - if (src[srcCol] > *threshold) - Fill7x7(dst + srcCol * scale, dstStride); - } - src += srcStride; - dst += dstStride * scale; - } - } - - void HogLiteCreateMask(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, const float * threshold, size_t scale, size_t size, uint32_t * dst, size_t dstStride) - { - if (scale == 1 && size == 7) - HogLiteCreateMask7x7<1>(src, srcStride, srcWidth, srcHeight, threshold, dst, dstStride); - else if (scale == 2 && size == 7) - HogLiteCreateMask7x7<2>(src, srcStride, srcWidth, srcHeight, threshold, dst, dstStride); - else - Base::HogLiteCreateMask(src, srcStride, srcWidth, srcHeight, threshold, scale, size, dst, dstStride); - } - } -#endif// SIMD_NEON_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdNeonInt16ToGray.cpp b/src/3rd/Simd/Simd/SimdNeonInt16ToGray.cpp deleted file mode 100644 index a67e39e8..00000000 --- a/src/3rd/Simd/Simd/SimdNeonInt16ToGray.cpp +++ /dev/null @@ -1,65 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
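The next deletion, SimdNeonInt16ToGray.cpp, converts a signed 16-bit image to 8-bit gray with vqmovun_s16, which narrows each lane to a byte with saturation rather than truncation. A minimal scalar sketch of that per-pixel conversion (the sketch is mine; only the clamping behaviour is taken from the deleted kernel):

    #include <cstdint>

    // Scalar analogue of vqmovun_s16: signed 16-bit -> unsigned 8-bit with saturation.
    inline uint8_t Int16ToGrayPixel(int16_t v)
    {
        return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }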
-*/
-#include "Simd/SimdStore.h"
-#include "Simd/SimdMemory.h"
-
-namespace Simd
-{
-#ifdef SIMD_NEON_ENABLE
-    namespace Neon
-    {
-        template <bool align> SIMD_INLINE void Int16ToGray(const int16_t * src, uint8_t * dst)
-        {
-            int16x8_t _src = Load<align>(src);
-            Store<align>(dst, vqmovun_s16(_src));
-        }
-
-        template <bool align> void Int16ToGray(const int16_t * src, size_t width, size_t height, size_t srcStride, uint8_t * dst, size_t dstStride)
-        {
-            assert(width >= HA);
-            if (align)
-                assert(Aligned(src) && Aligned(srcStride, HA) && Aligned(dst) && Aligned(dstStride));
-
-            size_t alignedWidth = AlignLo(width, HA);
-            for (size_t row = 0; row < height; ++row)
-            {
-                for (size_t col = 0; col < alignedWidth; col += HA)
-                    Int16ToGray<align>(src + col, dst + col);
-                if (alignedWidth != width)
-                    Int16ToGray<false>(src + width - HA, dst + width - HA);
-                src += srcStride;
-                dst += dstStride;
-            }
-        }
-
-        void Int16ToGray(const uint8_t * src, size_t width, size_t height, size_t srcStride, uint8_t * dst, size_t dstStride)
-        {
-            if (Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride))
-                Int16ToGray<true>((const int16_t *)src, width, height, srcStride / sizeof(int16_t), dst, dstStride);
-            else
-                Int16ToGray<false>((const int16_t *)src, width, height, srcStride / sizeof(int16_t), dst, dstStride);
-        }
-    }
-#endif// SIMD_NEON_ENABLE
-}
diff --git a/src/3rd/Simd/Simd/SimdNeonInterference.cpp b/src/3rd/Simd/Simd/SimdNeonInterference.cpp
deleted file mode 100644
index 8a0caea4..00000000
--- a/src/3rd/Simd/Simd/SimdNeonInterference.cpp
+++ /dev/null
@@ -1,145 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
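SimdNeonInterference.cpp, removed next, maintains int16 interference counters: an increment adds a value and clamps the result from above at a saturation bound, a decrement subtracts and clamps from below, and the masked variants apply the update only where a mask byte equals a given index. A scalar sketch of the two unmasked updates (the sketch is mine; the clamp semantics mirror the deleted vminq_s16/vmaxq_s16 kernels):

    #include <algorithm>
    #include <cstdint>

    // Scalar analogue of the deleted increasing InterferenceChange kernel.
    inline int16_t InterferenceIncrement(int16_t stat, int16_t value, int16_t saturation)
    {
        return std::min<int16_t>(stat + value, saturation);   // clamp from above
    }

    // Scalar analogue of the decreasing variant.
    inline int16_t InterferenceDecrement(int16_t stat, int16_t value, int16_t saturation)
    {
        return std::max<int16_t>(stat - value, saturation);   // clamp from below
    }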
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" - -namespace Simd -{ -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - template int16x8_t InterferenceChange(const int16x8_t & statistic, const int16x8_t & value, const int16x8_t & saturation); - - template<> SIMD_INLINE int16x8_t InterferenceChange(const int16x8_t & statistic, const int16x8_t & value, const int16x8_t & saturation) - { - return vminq_s16(vaddq_s16(statistic, value), saturation); - } - - template<> SIMD_INLINE int16x8_t InterferenceChange(const int16x8_t & statistic, const int16x8_t & value, const int16x8_t & saturation) - { - return vmaxq_s16(vsubq_s16(statistic, value), saturation); - } - - template SIMD_INLINE void InterferenceChange(int16_t * statistic, const int16x8_t & value, const int16x8_t & saturation) - { - Store(statistic, InterferenceChange(Load(statistic), value, saturation)); - } - - template void InterferenceChange(int16_t * statistic, size_t stride, size_t width, size_t height, uint8_t value, int16_t saturation) - { - assert(width >= HA); - if (align) - assert(Aligned(statistic) && Aligned(stride, HA)); - - size_t alignedWidth = Simd::AlignLo(width, HA); - int16x8_t tailMask = (int16x8_t)ShiftLeft(K8_FF, 2 * (HA - width + alignedWidth)); - - int16x8_t _value = vdupq_n_s16(value); - int16x8_t _saturation = vdupq_n_s16(saturation); - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += HA) - InterferenceChange(statistic + col, _value, _saturation); - if (alignedWidth != width) - InterferenceChange(statistic + width - HA, vandq_s16(_value, tailMask), _saturation); - statistic += stride; - } - } - - void InterferenceIncrement(uint8_t * statistic, size_t stride, size_t width, size_t height, uint8_t increment, int16_t saturation) - { - assert(Aligned(stride, 2)); - - if (Aligned(statistic) && Aligned(stride)) - InterferenceChange((int16_t*)statistic, stride / 2, width, height, increment, saturation); - else - InterferenceChange((int16_t*)statistic, stride / 2, width, height, increment, saturation); - } - - void InterferenceDecrement(uint8_t * statistic, size_t stride, size_t width, size_t height, uint8_t decrement, int16_t saturation) - { - assert(Aligned(stride, 2)); - - if (Aligned(statistic) && Aligned(stride)) - InterferenceChange((int16_t*)statistic, stride / 2, width, height, decrement, saturation); - else - InterferenceChange((int16_t*)statistic, stride / 2, width, height, decrement, saturation); - } - - template void InterferenceChangeMasked(int16_t * statistic, size_t statisticStride, size_t width, size_t height, - uint8_t value, int16_t saturation, const uint8_t * mask, size_t maskStride, uint8_t index) - { - assert(width >= A); - if (align) - assert(Aligned(statistic) && Aligned(statisticStride, HA) && Aligned(mask) && Aligned(maskStride)); - - size_t alignedWidth = Simd::AlignLo(width, A); - uint8x16_t tailMask = ShiftLeft(K8_FF, A - width + alignedWidth); - - int16x8_t _value = vdupq_n_s16(value); - int16x8_t _saturation = vdupq_n_s16(saturation); - uint8x16_t _index = vdupq_n_u8(index); - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - { - uint8x16_t _mask = vceqq_u8(Load(mask + col), _index); - InterferenceChange(statistic + col, vandq_s16(_value, (int16x8_t)Stretch2<0>(_mask)), _saturation); - InterferenceChange(statistic + col + HA, vandq_s16(_value, (int16x8_t)Stretch2<1>(_mask)), _saturation); - } - if (alignedWidth != width) - { - uint8x16_t _mask = vandq_u8(vceqq_u8(Load(mask + width - A), 
_index), tailMask); - InterferenceChange(statistic + width - A, vandq_s16(_value, (int16x8_t)Stretch2<0>(_mask)), _saturation); - InterferenceChange(statistic + width - HA, vandq_s16(_value, (int16x8_t)Stretch2<1>(_mask)), _saturation); - } - statistic += statisticStride; - mask += maskStride; - } - } - - void InterferenceIncrementMasked(uint8_t * statistic, size_t statisticStride, size_t width, size_t height, - uint8_t increment, int16_t saturation, const uint8_t * mask, size_t maskStride, uint8_t index) - { - assert(Aligned(statisticStride, 2)); - - if (Aligned(statistic) && Aligned(statisticStride) && Aligned(mask) && Aligned(maskStride)) - InterferenceChangeMasked((int16_t*)statistic, statisticStride / 2, width, height, increment, saturation, mask, maskStride, index); - else - InterferenceChangeMasked((int16_t*)statistic, statisticStride / 2, width, height, increment, saturation, mask, maskStride, index); - } - - void InterferenceDecrementMasked(uint8_t * statistic, size_t statisticStride, size_t width, size_t height, - uint8_t decrement, int16_t saturation, const uint8_t * mask, size_t maskStride, uint8_t index) - { - assert(Aligned(statisticStride, 2)); - - if (Aligned(statistic) && Aligned(statisticStride) && Aligned(mask) && Aligned(maskStride)) - InterferenceChangeMasked((int16_t*)statistic, statisticStride / 2, width, height, decrement, saturation, mask, maskStride, index); - else - InterferenceChangeMasked((int16_t*)statistic, statisticStride / 2, width, height, decrement, saturation, mask, maskStride, index); - } - } -#endif// SIMD_NEON_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdNeonInterleave.cpp b/src/3rd/Simd/Simd/SimdNeonInterleave.cpp deleted file mode 100644 index 58b1b0dc..00000000 --- a/src/3rd/Simd/Simd/SimdNeonInterleave.cpp +++ /dev/null @@ -1,175 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
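SimdNeonInterleave.cpp, removed next, zips separate planes into interleaved pixels: UV for NV12-style layouts, BGR, and BGRA, sixteen columns per step (the Store2/Store3/Store4 helpers wrap the multi-register NEON interleaving stores), with an unaligned re-run over the last vector for tails. The UV case reduces to a simple zip; a scalar sketch (mine):

    #include <cstdint>
    #include <cstddef>

    // Scalar analogue of the deleted InterleaveUv: uv = {u0, v0, u1, v1, ...} per row.
    void InterleaveUvScalar(const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride,
                            size_t width, size_t height, uint8_t* uv, size_t uvStride)
    {
        for (size_t row = 0; row < height; ++row, u += uStride, v += vStride, uv += uvStride)
            for (size_t col = 0; col < width; ++col)
            {
                uv[2 * col + 0] = u[col];
                uv[2 * col + 1] = v[col];
            }
    }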
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" - -namespace Simd -{ -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - template void InterleaveUv(const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, size_t width, size_t height, uint8_t * uv, size_t uvStride) - { - assert(width >= A); - if (align) - { - assert(Aligned(uv) && Aligned(uvStride) && Aligned(u) && Aligned(uStride) && Aligned(v) && Aligned(vStride)); - } - - size_t bodyWidth = AlignLo(width, A); - size_t tail = width - bodyWidth; - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0, offset = 0; col < bodyWidth; col += A, offset += DA) - { - uint8x16x2_t _uv; - _uv.val[0] = Load(u + col); - _uv.val[1] = Load(v + col); - Store2(uv + offset, _uv); - } - if (tail) - { - size_t col = width - A; - size_t offset = 2 * col; - uint8x16x2_t _uv; - _uv.val[0] = Load(u + col); - _uv.val[1] = Load(v + col); - Store2(uv + offset, _uv); - } - u += uStride; - v += vStride; - uv += uvStride; - } - } - - void InterleaveUv(const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, size_t width, size_t height, uint8_t * uv, size_t uvStride) - { - if (Aligned(uv) && Aligned(uvStride) && Aligned(u) && Aligned(uStride) && Aligned(v) && Aligned(vStride)) - InterleaveUv(u, uStride, v, vStride, width, height, uv, uvStride); - else - InterleaveUv(u, uStride, v, vStride, width, height, uv, uvStride); - } - - template void InterleaveBgr(const uint8_t * b, size_t bStride, const uint8_t * g, size_t gStride, const uint8_t * r, size_t rStride, - size_t width, size_t height, uint8_t * bgr, size_t bgrStride) - { - assert(width >= A); - if (align) - { - assert(Aligned(bgr) && Aligned(bgrStride) && Aligned(b) && Aligned(bStride)); - assert(Aligned(g) && Aligned(gStride) && Aligned(r) && Aligned(rStride)); - } - - size_t A3 = A * 3; - size_t bodyWidth = AlignLo(width, A); - size_t tail = width - bodyWidth; - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0, offset = 0; col < bodyWidth; col += A, offset += A3) - { - uint8x16x3_t _bgr; - _bgr.val[0] = Load(b + col); - _bgr.val[1] = Load(g + col); - _bgr.val[2] = Load(r + col); - Store3(bgr + offset, _bgr); - } - if (tail) - { - size_t col = width - A; - size_t offset = 3 * col; - uint8x16x3_t _bgr; - _bgr.val[0] = Load(b + col); - _bgr.val[1] = Load(g + col); - _bgr.val[2] = Load(r + col); - Store3(bgr + offset, _bgr); - } - b += bStride; - g += gStride; - r += rStride; - bgr += bgrStride; - } - } - - void InterleaveBgr(const uint8_t * b, size_t bStride, const uint8_t * g, size_t gStride, const uint8_t * r, size_t rStride, - size_t width, size_t height, uint8_t * bgr, size_t bgrStride) - { - if (Aligned(bgr) && Aligned(bgrStride) && Aligned(b) && Aligned(bStride) && Aligned(g) && Aligned(gStride) && Aligned(r) && Aligned(rStride)) - InterleaveBgr(b, bStride, g, gStride, r, rStride, width, height, bgr, bgrStride); - else - InterleaveBgr(b, bStride, g, gStride, r, rStride, width, height, bgr, bgrStride); - } - - template void InterleaveBgra(const uint8_t * b, size_t bStride, const uint8_t * g, size_t gStride, const uint8_t * r, size_t rStride, const uint8_t * a, size_t aStride, - size_t width, size_t height, uint8_t * bgra, size_t bgraStride) - { - assert(width >= A); - if (align) - { - assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(b) && Aligned(bStride)); - assert(Aligned(g) && Aligned(gStride) && Aligned(r) && Aligned(rStride) && Aligned(a) && Aligned(aStride)); - } - - size_t bodyWidth = AlignLo(width, A); - size_t tail = width - 
bodyWidth; - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0, offset = 0; col < bodyWidth; col += A, offset += QA) - { - uint8x16x4_t _bgra; - _bgra.val[0] = Load(b + col); - _bgra.val[1] = Load(g + col); - _bgra.val[2] = Load(r + col); - _bgra.val[3] = Load(a + col); - Store4(bgra + offset, _bgra); - } - if (tail) - { - size_t col = width - A; - size_t offset = 4 * col; - uint8x16x4_t _bgra; - _bgra.val[0] = Load(b + col); - _bgra.val[1] = Load(g + col); - _bgra.val[2] = Load(r + col); - _bgra.val[3] = Load(a + col); - Store4(bgra + offset, _bgra); - } - b += bStride; - g += gStride; - r += rStride; - a += aStride; - bgra += bgraStride; - } - } - - void InterleaveBgra(const uint8_t * b, size_t bStride, const uint8_t * g, size_t gStride, const uint8_t * r, size_t rStride, const uint8_t * a, size_t aStride, - size_t width, size_t height, uint8_t * bgra, size_t bgraStride) - { - if (Aligned(bgra) && Aligned(bgraStride) && Aligned(b) && Aligned(bStride) && Aligned(g) && Aligned(gStride) && Aligned(r) && Aligned(rStride) && Aligned(a) && Aligned(aStride)) - InterleaveBgra(b, bStride, g, gStride, r, rStride, a, aStride, width, height, bgra, bgraStride); - else - InterleaveBgra(b, bStride, g, gStride, r, rStride, a, aStride, width, height, bgra, bgraStride); - } - } -#endif// SIMD_NEON_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdNeonLaplace.cpp b/src/3rd/Simd/Simd/SimdNeonLaplace.cpp deleted file mode 100644 index b32063d7..00000000 --- a/src/3rd/Simd/Simd/SimdNeonLaplace.cpp +++ /dev/null @@ -1,184 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
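SimdNeonLaplace.cpp, removed next, evaluates the 3x3 Laplace operator, 8 times the centre pixel minus the sum of its eight neighbours, optionally as an absolute value, and clamps the neighbour row pointers at the top and bottom image edges. As a worked scalar reference for one interior pixel (the kernel weights come from the deleted code; the scalar form is mine):

    #include <cstdint>
    #include <cstddef>
    #include <cstdlib>

    // Scalar analogue of the deleted Laplace kernel: 8*c - sum of 8 neighbours.
    inline int16_t Laplace3x3(const uint8_t* src, std::ptrdiff_t stride, bool absolute)
    {
        int sum = 8 * src[0]
            - src[-stride - 1] - src[-stride] - src[-stride + 1]
            - src[-1] - src[1]
            - src[stride - 1] - src[stride] - src[stride + 1];
        return (int16_t)(absolute ? std::abs(sum) : sum);
    }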
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdExtract.h" - -namespace Simd -{ -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - const uint8x8_t K8X8_08 = SIMD_VEC_SET1_EPI16(0x0008); - - template SIMD_INLINE int16x8_t Laplace(uint8x16_t a[3][3]) - { - return vsubq_s16((int16x8_t)vmull_u8(K8X8_08, Half(a[1][1])), (int16x8_t)vaddq_u16( - vaddq_u16(vaddl_u8(Half(a[0][0]), Half(a[0][1])), vaddl_u8(Half(a[0][2]), Half(a[1][0]))), - vaddq_u16(vaddl_u8(Half(a[1][2]), Half(a[2][0])), vaddl_u8(Half(a[2][1]), Half(a[2][2]))))); - } - - template SIMD_INLINE void Laplace(uint8x16_t a[3][3], int16_t * dst) - { - Store(dst + 0, ConditionalAbs(Laplace<0>(a))); - Store(dst + 8, ConditionalAbs(Laplace<1>(a))); - } - - template void Laplace(const uint8_t * src, size_t srcStride, size_t width, size_t height, int16_t * dst, size_t dstStride) - { - assert(width > A); - if (align) - assert(Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride, HA)); - - size_t bodyWidth = Simd::AlignHi(width, A) - A; - const uint8_t *src0, *src1, *src2; - uint8x16_t a[3][3]; - - for (size_t row = 0; row < height; ++row) - { - src0 = src + srcStride*(row - 1); - src1 = src0 + srcStride; - src2 = src1 + srcStride; - if (row == 0) - src0 = src1; - if (row == height - 1) - src2 = src1; - - LoadNose3(src0 + 0, a[0]); - LoadNose3(src1 + 0, a[1]); - LoadNose3(src2 + 0, a[2]); - Laplace(a, dst + 0); - for (size_t col = A; col < bodyWidth; col += A) - { - LoadBody3(src0 + col, a[0]); - LoadBody3(src1 + col, a[1]); - LoadBody3(src2 + col, a[2]); - Laplace(a, dst + col); - } - LoadTail3(src0 + width - A, a[0]); - LoadTail3(src1 + width - A, a[1]); - LoadTail3(src2 + width - A, a[2]); - Laplace(a, dst + width - A); - - dst += dstStride; - } - } - - void Laplace(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride) - { - assert(dstStride % sizeof(int16_t) == 0); - - if (Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride)) - Laplace(src, srcStride, width, height, (int16_t *)dst, dstStride / sizeof(int16_t)); - else - Laplace(src, srcStride, width, height, (int16_t *)dst, dstStride / sizeof(int16_t)); - } - - void LaplaceAbs(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride) - { - assert(dstStride % sizeof(int16_t) == 0); - - if (Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride)) - Laplace(src, srcStride, width, height, (int16_t *)dst, dstStride / sizeof(int16_t)); - else - Laplace(src, srcStride, width, height, (int16_t *)dst, dstStride / sizeof(int16_t)); - } - - SIMD_INLINE void LaplaceAbsSum(uint8x16_t a[3][3], uint32x4_t & sum) - { - sum = vaddq_u32(sum, vpaddlq_u16((uint16x8_t)ConditionalAbs(Laplace<0>(a)))); - sum = vaddq_u32(sum, vpaddlq_u16((uint16x8_t)ConditionalAbs(Laplace<1>(a)))); - } - - SIMD_INLINE void SetMask3(uint8x16_t a[3], uint8x16_t mask) - { - a[0] = vandq_u8(a[0], mask); - a[1] = vandq_u8(a[1], mask); - a[2] = vandq_u8(a[2], mask); - } - - SIMD_INLINE void SetMask3x3(uint8x16_t a[3][3], uint8x16_t mask) - { - SetMask3(a[0], mask); - SetMask3(a[1], mask); - SetMask3(a[2], mask); - } - - template void LaplaceAbsSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum) - { - assert(width > A); - if (align) - assert(Aligned(src) && Aligned(stride)); - - size_t bodyWidth = Simd::AlignHi(width, A) - A; - const uint8_t *src0, *src1, *src2; - - uint8x16_t a[3][3]; - uint8x16_t tailMask = ShiftLeft(K8_FF, A - 
width + bodyWidth); - - uint64x2_t fullSum = K64_0000000000000000; - for (size_t row = 0; row < height; ++row) - { - src0 = src + stride*(row - 1); - src1 = src0 + stride; - src2 = src1 + stride; - if (row == 0) - src0 = src1; - if (row == height - 1) - src2 = src1; - - uint32x4_t rowSum = K32_00000000; - - LoadNose3(src0 + 0, a[0]); - LoadNose3(src1 + 0, a[1]); - LoadNose3(src2 + 0, a[2]); - LaplaceAbsSum(a, rowSum); - for (size_t col = A; col < bodyWidth; col += A) - { - LoadBody3(src0 + col, a[0]); - LoadBody3(src1 + col, a[1]); - LoadBody3(src2 + col, a[2]); - LaplaceAbsSum(a, rowSum); - } - LoadTail3(src0 + width - A, a[0]); - LoadTail3(src1 + width - A, a[1]); - LoadTail3(src2 + width - A, a[2]); - SetMask3x3(a, tailMask); - LaplaceAbsSum(a, rowSum); - - fullSum = vaddq_u64(fullSum, vpaddlq_u32(rowSum)); - } - *sum = ExtractSum64u(fullSum); - } - - void LaplaceAbsSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint64_t * sum) - { - if (Aligned(src) && Aligned(srcStride)) - LaplaceAbsSum(src, srcStride, width, height, sum); - else - LaplaceAbsSum(src, srcStride, width, height, sum); - } - } -#endif// SIMD_NEON_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdNeonLbp.cpp b/src/3rd/Simd/Simd/SimdNeonLbp.cpp deleted file mode 100644 index ebf8b8b0..00000000 --- a/src/3rd/Simd/Simd/SimdNeonLbp.cpp +++ /dev/null @@ -1,84 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
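SimdNeonLbp.cpp, removed next, builds the classic 8-bit local binary pattern: each of the eight neighbours that is greater than or equal to the centre pixel sets one bit, walked clockwise from the top-left (the K8_01 through K8_80 constants are the per-neighbour bit masks). A scalar sketch of one pixel, with the same bit assignment as the deleted kernel:

    #include <cstdint>
    #include <cstddef>

    // Scalar analogue of the deleted LbpEstimate kernel for one pixel.
    inline uint8_t Lbp(const uint8_t* src, std::ptrdiff_t stride)
    {
        uint8_t c = src[0], lbp = 0;
        lbp |= (src[-1 - stride] >= c) ? 0x01 : 0;  // top-left
        lbp |= (src[-stride]     >= c) ? 0x02 : 0;  // top
        lbp |= (src[1 - stride]  >= c) ? 0x04 : 0;  // top-right
        lbp |= (src[1]           >= c) ? 0x08 : 0;  // right
        lbp |= (src[1 + stride]  >= c) ? 0x10 : 0;  // bottom-right
        lbp |= (src[stride]      >= c) ? 0x20 : 0;  // bottom
        lbp |= (src[-1 + stride] >= c) ? 0x40 : 0;  // bottom-left
        lbp |= (src[-1]          >= c) ? 0x80 : 0;  // left
        return lbp;
    }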
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdCompare.h" -#include "Simd/SimdStore.h" - -namespace Simd -{ -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - template void LbpEstimate(const uint8_t * src, ptrdiff_t stride, uint8_t * dst) - { - uint8x16_t threshold = Load(src); - uint8x16_t lbp = K8_00; - lbp = vorrq_u8(lbp, vandq_u8(vcgeq_u8(Load(src - 1 - stride), threshold), K8_01)); - lbp = vorrq_u8(lbp, vandq_u8(vcgeq_u8(Load(src - stride), threshold), K8_02)); - lbp = vorrq_u8(lbp, vandq_u8(vcgeq_u8(Load(src + 1 - stride), threshold), K8_04)); - lbp = vorrq_u8(lbp, vandq_u8(vcgeq_u8(Load(src + 1), threshold), K8_08)); - lbp = vorrq_u8(lbp, vandq_u8(vcgeq_u8(Load(src + 1 + stride), threshold), K8_10)); - lbp = vorrq_u8(lbp, vandq_u8(vcgeq_u8(Load(src + stride), threshold), K8_20)); - lbp = vorrq_u8(lbp, vandq_u8(vcgeq_u8(Load(src - 1 + stride), threshold), K8_40)); - lbp = vorrq_u8(lbp, vandq_u8(vcgeq_u8(Load(src - 1), threshold), K8_80)); - Store(dst, lbp); - } - - template void LbpEstimate( - const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride) - { - assert(width >= A + 2); - if (align) - assert(Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride)); - - size_t alignedWidth = AlignLo(width - 2, A) + 1; - - memset(dst, 0, width); - src += srcStride; - dst += dstStride; - for (size_t row = 2; row < height; ++row) - { - dst[0] = 0; - for (size_t col = 1; col < alignedWidth; col += A) - LbpEstimate(src + col, srcStride, dst + col); - if (alignedWidth != width - 1) - LbpEstimate(src + width - 1 - A, srcStride, dst + width - 1 - A); - dst[width - 1] = 0; - - src += srcStride; - dst += dstStride; - } - memset(dst, 0, width); - } - - void LbpEstimate(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride)) - LbpEstimate(src, srcStride, width, height, dst, dstStride); - else - LbpEstimate(src, srcStride, width, height, dst, dstStride); - } - } -#endif// SIMD_NEON_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdNeonMeanFilter3x3.cpp b/src/3rd/Simd/Simd/SimdNeonMeanFilter3x3.cpp deleted file mode 100644 index 9848fdd9..00000000 --- a/src/3rd/Simd/Simd/SimdNeonMeanFilter3x3.cpp +++ /dev/null @@ -1,163 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
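SimdNeonMeanFilter3x3.cpp, removed next, keeps three rows of 16-bit column sums and averages them with DivBy9, a fixed-point reciprocal multiply; the deleted code takes the factor and shift from Simd's Base namespace and adds a bias of 5 (about half of 9) before dividing, so the result rounds to nearest. One common choice of constants, shown here for illustration only and not necessarily the library's, is factor = ceil(2^16 / 9):

    #include <cstdint>

    // Illustrative fixed-point division by 9 for a 3x3 sum (at most 9*255 plus bias).
    inline uint8_t MeanOf9(uint16_t sum)
    {
        const uint32_t factor = 7282;               // ceil(2^16 / 9), exact over this range
        return (uint8_t)(((sum + 5u) * factor) >> 16);
    }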
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" - -namespace Simd -{ -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - namespace - { - struct Buffer - { - Buffer(size_t width) - { - _p = Allocate(sizeof(uint16_t) * 3 * width); - src0 = (uint16_t*)_p; - src1 = src0 + width; - src2 = src1 + width; - } - - ~Buffer() - { - Free(_p); - } - - uint16_t * src0; - uint16_t * src1; - uint16_t * src2; - private: - void * _p; - }; - } - - template SIMD_INLINE uint16x8_t SumCol(uint8x16_t a[3]) - { - return vaddw_u8(vaddl_u8(Half(a[0]), Half(a[1])), Half(a[2])); - } - - template SIMD_INLINE void SumCol(uint8x16_t a[3], uint16_t * b) - { - Store(b + 0, SumCol<0>(a)); - Store(b + HA, SumCol<1>(a)); - } - - const uint16x4_t K16_DIVISION_BY_9_FACTOR = SIMD_VEC_SET1_EPI32(Base::DIVISION_BY_9_FACTOR); - - SIMD_INLINE uint16x8_t DivBy9(uint16x8_t value) - { - uint32x4_t lo = vshrq_n_u32(vmull_u16(Half<0>(value), K16_DIVISION_BY_9_FACTOR), Base::DIVISION_BY_9_SHIFT); - uint32x4_t hi = vshrq_n_u32(vmull_u16(Half<1>(value), K16_DIVISION_BY_9_FACTOR), Base::DIVISION_BY_9_SHIFT); - return PackU32(lo, hi); - } - - template SIMD_INLINE uint16x8_t AverageRow16(const Buffer & buffer, size_t offset) - { - return DivBy9(vaddq_u16(vaddq_u16(K16_0005, Load(buffer.src0 + offset)), - vaddq_u16(Load(buffer.src1 + offset), Load(buffer.src2 + offset)))); - } - - template SIMD_INLINE uint8x16_t AverageRow(const Buffer & buffer, size_t offset) - { - return PackU16(AverageRow16(buffer, offset), AverageRow16(buffer, offset + HA)); - } - - template void MeanFilter3x3( - const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride) - { - assert(step*(width - 1) >= A); - if (align) - assert(Aligned(src) && Aligned(srcStride) && Aligned(step*width) && Aligned(dst) && Aligned(dstStride)); - - uint8x16_t a[3]; - - size_t size = step*width; - size_t bodySize = Simd::AlignHi(size, A) - A; - - Buffer buffer(Simd::AlignHi(size, A)); - - LoadNose3(src + 0, a); - SumCol(a, buffer.src0 + 0); - for (size_t col = A; col < bodySize; col += A) - { - LoadBody3(src + col, a); - SumCol(a, buffer.src0 + col); - } - LoadTail3(src + size - A, a); - SumCol(a, buffer.src0 + size - A); - - memcpy(buffer.src1, buffer.src0, sizeof(uint16_t)*size); - - for (size_t row = 0; row < height; ++row, dst += dstStride) - { - const uint8_t * src2 = src + srcStride*(row + 1); - if (row >= height - 2) - src2 = src + srcStride*(height - 1); - - LoadNose3(src2 + 0, a); - SumCol(a, buffer.src2 + 0); - for (size_t col = A; col < bodySize; col += A) - { - LoadBody3(src2 + col, a); - SumCol(a, buffer.src2 + col); - } - LoadTail3(src2 + size - A, a); - SumCol(a, buffer.src2 + size - A); - - for (size_t col = 0; col < bodySize; col += A) - Store(dst + col, AverageRow(buffer, col)); - Store(dst + size - A, AverageRow(buffer, size - A)); - - Swap(buffer.src0, buffer.src2); - Swap(buffer.src0, buffer.src1); - } - } - - template void MeanFilter3x3(const uint8_t * src, size_t srcStride, size_t width, size_t height, - size_t channelCount, uint8_t * dst, size_t dstStride) - { - assert(channelCount > 0 && channelCount <= 4); - - switch (channelCount) - { - case 1: MeanFilter3x3(src, srcStride, width, height, dst, dstStride); break; - case 2: MeanFilter3x3(src, srcStride, width, height, dst, dstStride); break; - case 3: MeanFilter3x3(src, srcStride, width, height, dst, dstStride); break; - case 4: MeanFilter3x3(src, srcStride, width, height, dst, dstStride); break; - } - } - - void MeanFilter3x3(const uint8_t * src, size_t srcStride, 
size_t width, size_t height, - size_t channelCount, uint8_t * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride) && Aligned(channelCount*width) && Aligned(dst) && Aligned(dstStride)) - MeanFilter3x3(src, srcStride, width, height, channelCount, dst, dstStride); - else - MeanFilter3x3(src, srcStride, width, height, channelCount, dst, dstStride); - } - } -#endif// SIMD_NEON_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdNeonMedianFilter.cpp b/src/3rd/Simd/Simd/SimdNeonMedianFilter.cpp deleted file mode 100644 index 5d60d8c6..00000000 --- a/src/3rd/Simd/Simd/SimdNeonMedianFilter.cpp +++ /dev/null @@ -1,511 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
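SimdNeonMedianFilter.cpp, removed next, computes 3x3 and 5x5 median filters with branch-free partial sorting networks: SortU8 is a min/max compare-exchange on whole vectors, and each PartialSortN network leaves only the median lane in a known register instead of fully sorting. The 5-element network transcribed to scalar form (the exchange sequence is copied from the deleted PartialSort5; the scalar wrapper is mine):

    #include <algorithm>
    #include <cstdint>

    // Scalar compare-exchange, the analogue of the deleted SortU8(a, b).
    inline void SortU8(uint8_t& a, uint8_t& b) { if (a > b) std::swap(a, b); }

    // Median of 5 via the deleted PartialSort5 network; only a2 is valid afterwards,
    // which is all the rhomb 3x3 median filter needs.
    inline uint8_t MedianOf5(uint8_t a0, uint8_t a1, uint8_t a2, uint8_t a3, uint8_t a4)
    {
        SortU8(a2, a3);
        SortU8(a1, a2);
        SortU8(a2, a3);
        a4 = std::max(a1, a4);
        a0 = std::min(a0, a3);
        SortU8(a2, a0);
        a2 = std::max(a4, a2);
        a2 = std::min(a2, a0);
        return a2;
    }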
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" - -namespace Simd -{ -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - template SIMD_INLINE void LoadNoseRhomb3x3(const uint8_t* y[3], size_t offset, uint8x16_t a[5]) - { - a[0] = Load(y[0] + offset); - LoadNose3(y[1] + offset, a + 1); - a[4] = Load(y[2] + offset); - } - - template SIMD_INLINE void LoadBodyRhomb3x3(const uint8_t* y[3], size_t offset, uint8x16_t a[5]) - { - a[0] = Load(y[0] + offset); - LoadBody3(y[1] + offset, a + 1); - a[4] = Load(y[2] + offset); - } - - template SIMD_INLINE void LoadTailRhomb3x3(const uint8_t* y[3], size_t offset, uint8x16_t a[5]) - { - a[0] = Load(y[0] + offset); - LoadTail3(y[1] + offset, a + 1); - a[4] = Load(y[2] + offset); - } - - SIMD_INLINE void PartialSort5(uint8x16_t a[5]) - { - SortU8(a[2], a[3]); - SortU8(a[1], a[2]); - SortU8(a[2], a[3]); - a[4] = vmaxq_u8(a[1], a[4]); - a[0] = vminq_u8(a[0], a[3]); - SortU8(a[2], a[0]); - a[2] = vmaxq_u8(a[4], a[2]); - a[2] = vminq_u8(a[2], a[0]); - } - - template void MedianFilterRhomb3x3( - const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride) - { - assert(step*(width - 1) >= A); - - const uint8_t * y[3]; - uint8x16_t a[5]; - - size_t size = step*width; - size_t bodySize = Simd::AlignHi(size, A) - A; - - for (size_t row = 0; row < height; ++row, dst += dstStride) - { - y[0] = src + srcStride*(row - 1); - y[1] = y[0] + srcStride; - y[2] = y[1] + srcStride; - if (row < 1) - y[0] = y[1]; - if (row >= height - 1) - y[2] = y[1]; - - LoadNoseRhomb3x3(y, 0, a); - PartialSort5(a); - Store(dst, a[2]); - - for (size_t col = A; col < bodySize; col += A) - { - LoadBodyRhomb3x3(y, col, a); - PartialSort5(a); - Store(dst + col, a[2]); - } - - size_t col = size - A; - LoadTailRhomb3x3(y, col, a); - PartialSort5(a); - Store(dst + col, a[2]); - } - } - - template void MedianFilterRhomb3x3(const uint8_t * src, size_t srcStride, size_t width, size_t height, - size_t channelCount, uint8_t * dst, size_t dstStride) - { - assert(channelCount > 0 && channelCount <= 4); - - switch (channelCount) - { - case 1: MedianFilterRhomb3x3(src, srcStride, width, height, dst, dstStride); break; - case 2: MedianFilterRhomb3x3(src, srcStride, width, height, dst, dstStride); break; - case 3: MedianFilterRhomb3x3(src, srcStride, width, height, dst, dstStride); break; - case 4: MedianFilterRhomb3x3(src, srcStride, width, height, dst, dstStride); break; - } - } - - void MedianFilterRhomb3x3(const uint8_t * src, size_t srcStride, size_t width, size_t height, - size_t channelCount, uint8_t * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride) && Aligned(width) && Aligned(dst) && Aligned(dstStride)) - MedianFilterRhomb3x3(src, srcStride, width, height, channelCount, dst, dstStride); - else - MedianFilterRhomb3x3(src, srcStride, width, height, channelCount, dst, dstStride); - } - - template SIMD_INLINE void LoadNoseSquare3x3(const uint8_t* y[3], size_t offset, uint8x16_t a[9]) - { - LoadNose3(y[0] + offset, a + 0); - LoadNose3(y[1] + offset, a + 3); - LoadNose3(y[2] + offset, a + 6); - } - - template SIMD_INLINE void LoadBodySquare3x3(const uint8_t* y[3], size_t offset, uint8x16_t a[9]) - { - LoadBody3(y[0] + offset, a + 0); - LoadBody3(y[1] + offset, a + 3); - LoadBody3(y[2] + offset, a + 6); - } - - template SIMD_INLINE void LoadTailSquare3x3(const uint8_t* y[3], size_t offset, uint8x16_t a[9]) - { - LoadTail3(y[0] + offset, a + 0); - LoadTail3(y[1] + offset, a + 3); - LoadTail3(y[2] + offset, a + 6); - } - - SIMD_INLINE void 
PartialSort9(uint8x16_t a[9]) - { - SortU8(a[1], a[2]); SortU8(a[4], a[5]); SortU8(a[7], a[8]); - SortU8(a[0], a[1]); SortU8(a[3], a[4]); SortU8(a[6], a[7]); - SortU8(a[1], a[2]); SortU8(a[4], a[5]); SortU8(a[7], a[8]); - a[3] = vmaxq_u8(a[0], a[3]); - a[5] = vminq_u8(a[5], a[8]); - SortU8(a[4], a[7]); - a[6] = vmaxq_u8(a[3], a[6]); - a[4] = vmaxq_u8(a[1], a[4]); - a[2] = vminq_u8(a[2], a[5]); - a[4] = vminq_u8(a[4], a[7]); - SortU8(a[4], a[2]); - a[4] = vmaxq_u8(a[6], a[4]); - a[4] = vminq_u8(a[4], a[2]); - } - - template void MedianFilterSquare3x3( - const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride) - { - assert(step*(width - 1) >= A); - - const uint8_t * y[3]; - uint8x16_t a[9]; - - size_t size = step*width; - size_t bodySize = Simd::AlignHi(size, A) - A; - - for (size_t row = 0; row < height; ++row, dst += dstStride) - { - y[0] = src + srcStride*(row - 1); - y[1] = y[0] + srcStride; - y[2] = y[1] + srcStride; - if (row < 1) - y[0] = y[1]; - if (row >= height - 1) - y[2] = y[1]; - - LoadNoseSquare3x3(y, 0, a); - PartialSort9(a); - Store(dst, a[4]); - - for (size_t col = A; col < bodySize; col += A) - { - LoadBodySquare3x3(y, col, a); - PartialSort9(a); - Store(dst + col, a[4]); - } - - size_t col = size - A; - LoadTailSquare3x3(y, col, a); - PartialSort9(a); - Store(dst + col, a[4]); - } - } - - template void MedianFilterSquare3x3(const uint8_t * src, size_t srcStride, size_t width, size_t height, - size_t channelCount, uint8_t * dst, size_t dstStride) - { - assert(channelCount > 0 && channelCount <= 4); - - switch (channelCount) - { - case 1: MedianFilterSquare3x3(src, srcStride, width, height, dst, dstStride); break; - case 2: MedianFilterSquare3x3(src, srcStride, width, height, dst, dstStride); break; - case 3: MedianFilterSquare3x3(src, srcStride, width, height, dst, dstStride); break; - case 4: MedianFilterSquare3x3(src, srcStride, width, height, dst, dstStride); break; - } - } - - void MedianFilterSquare3x3(const uint8_t * src, size_t srcStride, size_t width, size_t height, - size_t channelCount, uint8_t * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride) && Aligned(width) && Aligned(dst) && Aligned(dstStride)) - MedianFilterSquare3x3(src, srcStride, width, height, channelCount, dst, dstStride); - else - MedianFilterSquare3x3(src, srcStride, width, height, channelCount, dst, dstStride); - } - - template SIMD_INLINE void LoadNoseRhomb5x5(const uint8_t* y[5], size_t offset, uint8x16_t a[13]) - { - a[0] = Load(y[0] + offset); - LoadNose3(y[1] + offset, a + 1); - LoadNose5(y[2] + offset, a + 4); - LoadNose3(y[3] + offset, a + 9); - a[12] = Load(y[4] + offset); - } - - template SIMD_INLINE void LoadBodyRhomb5x5(const uint8_t* y[5], size_t offset, uint8x16_t a[13]) - { - a[0] = Load(y[0] + offset); - LoadBody3(y[1] + offset, a + 1); - LoadBody5(y[2] + offset, a + 4); - LoadBody3(y[3] + offset, a + 9); - a[12] = Load(y[4] + offset); - } - - template SIMD_INLINE void LoadTailRhomb5x5(const uint8_t* y[5], size_t offset, uint8x16_t a[13]) - { - a[0] = Load(y[0] + offset); - LoadTail3(y[1] + offset, a + 1); - LoadTail5(y[2] + offset, a + 4); - LoadTail3(y[3] + offset, a + 9); - a[12] = Load(y[4] + offset); - } - - SIMD_INLINE void PartialSort13(uint8x16_t a[13]) - { - SortU8(a[0], a[1]); SortU8(a[3], a[4]); SortU8(a[2], a[4]); - SortU8(a[2], a[3]); SortU8(a[6], a[7]); SortU8(a[5], a[7]); - SortU8(a[5], a[6]); SortU8(a[9], a[10]); SortU8(a[8], a[10]); - SortU8(a[8], a[9]); SortU8(a[11], a[12]); SortU8(a[5], a[8]); - 
SortU8(a[2], a[8]); SortU8(a[2], a[5]); SortU8(a[6], a[9]); - SortU8(a[3], a[9]); SortU8(a[3], a[6]); SortU8(a[7], a[10]); - SortU8(a[4], a[10]); SortU8(a[4], a[7]); SortU8(a[3], a[12]); - SortU8(a[0], a[9]); - a[1] = vminq_u8(a[1], a[10]); - a[1] = vminq_u8(a[1], a[7]); - a[1] = vminq_u8(a[1], a[9]); - a[11] = vmaxq_u8(a[5], a[11]); - a[11] = vmaxq_u8(a[3], a[11]); - a[11] = vmaxq_u8(a[2], a[11]); - SortU8(a[0], a[6]); SortU8(a[1], a[8]); SortU8(a[6], a[8]); - a[4] = vminq_u8(a[4], a[8]); - SortU8(a[0], a[1]); SortU8(a[4], a[6]); SortU8(a[0], a[4]); - a[11] = vmaxq_u8(a[0], a[11]); - SortU8(a[6], a[11]); - a[1] = vminq_u8(a[1], a[11]); - SortU8(a[1], a[4]); SortU8(a[6], a[12]); - a[6] = vmaxq_u8(a[1], a[6]); - a[4] = vminq_u8(a[4], a[12]); - a[6] = vmaxq_u8(a[4], a[6]); - } - - template void MedianFilterRhomb5x5( - const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride) - { - assert(step*(width - 2) >= A); - - const uint8_t * y[5]; - uint8x16_t a[13]; - - size_t size = step*width; - size_t bodySize = Simd::AlignHi(size, A) - A; - - for (size_t row = 0; row < height; ++row, dst += dstStride) - { - y[0] = src + srcStride*(row - 2); - y[1] = y[0] + srcStride; - y[2] = y[1] + srcStride; - y[3] = y[2] + srcStride; - y[4] = y[3] + srcStride; - if (row < 2) - { - if (row < 1) - y[1] = y[2]; - y[0] = y[1]; - } - if (row >= height - 2) - { - if (row >= height - 1) - y[3] = y[2]; - y[4] = y[3]; - } - - LoadNoseRhomb5x5(y, 0, a); - PartialSort13(a); - Store(dst, a[6]); - - for (size_t col = A; col < bodySize; col += A) - { - LoadBodyRhomb5x5(y, col, a); - PartialSort13(a); - Store(dst + col, a[6]); - } - - size_t col = size - A; - LoadTailRhomb5x5(y, col, a); - PartialSort13(a); - Store(dst + col, a[6]); - } - } - - template void MedianFilterRhomb5x5(const uint8_t * src, size_t srcStride, size_t width, size_t height, - size_t channelCount, uint8_t * dst, size_t dstStride) - { - assert(channelCount > 0 && channelCount <= 4); - - switch (channelCount) - { - case 1: MedianFilterRhomb5x5(src, srcStride, width, height, dst, dstStride); break; - case 2: MedianFilterRhomb5x5(src, srcStride, width, height, dst, dstStride); break; - case 3: MedianFilterRhomb5x5(src, srcStride, width, height, dst, dstStride); break; - case 4: MedianFilterRhomb5x5(src, srcStride, width, height, dst, dstStride); break; - } - } - - void MedianFilterRhomb5x5(const uint8_t * src, size_t srcStride, size_t width, size_t height, - size_t channelCount, uint8_t * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride) && Aligned(width) && Aligned(dst) && Aligned(dstStride)) - MedianFilterRhomb5x5(src, srcStride, width, height, channelCount, dst, dstStride); - else - MedianFilterRhomb5x5(src, srcStride, width, height, channelCount, dst, dstStride); - } - - template SIMD_INLINE void LoadNoseSquare5x5(const uint8_t* y[5], size_t offset, uint8x16_t a[25]) - { - LoadNose5(y[0] + offset, a + 0); - LoadNose5(y[1] + offset, a + 5); - LoadNose5(y[2] + offset, a + 10); - LoadNose5(y[3] + offset, a + 15); - LoadNose5(y[4] + offset, a + 20); - } - - template SIMD_INLINE void LoadBodySquare5x5(const uint8_t* y[5], size_t offset, uint8x16_t a[25]) - { - LoadBody5(y[0] + offset, a + 0); - LoadBody5(y[1] + offset, a + 5); - LoadBody5(y[2] + offset, a + 10); - LoadBody5(y[3] + offset, a + 15); - LoadBody5(y[4] + offset, a + 20); - } - - template SIMD_INLINE void LoadTailSquare5x5(const uint8_t* y[5], size_t offset, uint8x16_t a[25]) - { - LoadTail5(y[0] + offset, a + 0); - LoadTail5(y[1] + 
offset, a + 5); - LoadTail5(y[2] + offset, a + 10); - LoadTail5(y[3] + offset, a + 15); - LoadTail5(y[4] + offset, a + 20); - } - - SIMD_INLINE void PartialSort25(uint8x16_t a[25]) - { - SortU8(a[0], a[1]); SortU8(a[3], a[4]); SortU8(a[2], a[4]); - SortU8(a[2], a[3]); SortU8(a[6], a[7]); SortU8(a[5], a[7]); - SortU8(a[5], a[6]); SortU8(a[9], a[10]); SortU8(a[8], a[10]); - SortU8(a[8], a[9]); SortU8(a[12], a[13]); SortU8(a[11], a[13]); - SortU8(a[11], a[12]); SortU8(a[15], a[16]); SortU8(a[14], a[16]); - SortU8(a[14], a[15]); SortU8(a[18], a[19]); SortU8(a[17], a[19]); - SortU8(a[17], a[18]); SortU8(a[21], a[22]); SortU8(a[20], a[22]); - SortU8(a[20], a[21]); SortU8(a[23], a[24]); SortU8(a[2], a[5]); - SortU8(a[3], a[6]); SortU8(a[0], a[6]); SortU8(a[0], a[3]); - SortU8(a[4], a[7]); SortU8(a[1], a[7]); SortU8(a[1], a[4]); - SortU8(a[11], a[14]); SortU8(a[8], a[14]); SortU8(a[8], a[11]); - SortU8(a[12], a[15]); SortU8(a[9], a[15]); SortU8(a[9], a[12]); - SortU8(a[13], a[16]); SortU8(a[10], a[16]); SortU8(a[10], a[13]); - SortU8(a[20], a[23]); SortU8(a[17], a[23]); SortU8(a[17], a[20]); - SortU8(a[21], a[24]); SortU8(a[18], a[24]); SortU8(a[18], a[21]); - SortU8(a[19], a[22]); SortU8(a[9], a[18]); SortU8(a[0], a[18]); - a[17] = vmaxq_u8(a[8], a[17]); - a[9] = vmaxq_u8(a[0], a[9]); - SortU8(a[10], a[19]); SortU8(a[1], a[19]); SortU8(a[1], a[10]); - SortU8(a[11], a[20]); SortU8(a[2], a[20]); SortU8(a[12], a[21]); - a[11] = vmaxq_u8(a[2], a[11]); - SortU8(a[3], a[21]); SortU8(a[3], a[12]); SortU8(a[13], a[22]); - a[4] = vminq_u8(a[4], a[22]); - SortU8(a[4], a[13]); SortU8(a[14], a[23]); - SortU8(a[5], a[23]); SortU8(a[5], a[14]); SortU8(a[15], a[24]); - a[6] = vminq_u8(a[6], a[24]); - SortU8(a[6], a[15]); - a[7] = vminq_u8(a[7], a[16]); - a[7] = vminq_u8(a[7], a[19]); - a[13] = vminq_u8(a[13], a[21]); - a[15] = vminq_u8(a[15], a[23]); - a[7] = vminq_u8(a[7], a[13]); - a[7] = vminq_u8(a[7], a[15]); - a[9] = vmaxq_u8(a[1], a[9]); - a[11] = vmaxq_u8(a[3], a[11]); - a[17] = vmaxq_u8(a[5], a[17]); - a[17] = vmaxq_u8(a[11], a[17]); - a[17] = vmaxq_u8(a[9], a[17]); - SortU8(a[4], a[10]); - SortU8(a[6], a[12]); SortU8(a[7], a[14]); SortU8(a[4], a[6]); - a[7] = vmaxq_u8(a[4], a[7]); - SortU8(a[12], a[14]); - a[10] = vminq_u8(a[10], a[14]); - SortU8(a[6], a[7]); SortU8(a[10], a[12]); SortU8(a[6], a[10]); - a[17] = vmaxq_u8(a[6], a[17]); - SortU8(a[12], a[17]); - a[7] = vminq_u8(a[7], a[17]); - SortU8(a[7], a[10]); SortU8(a[12], a[18]); - a[12] = vmaxq_u8(a[7], a[12]); - a[10] = vminq_u8(a[10], a[18]); - SortU8(a[12], a[20]); - a[10] = vminq_u8(a[10], a[20]); - a[12] = vmaxq_u8(a[10], a[12]); - } - - template void MedianFilterSquare5x5( - const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride) - { - assert(step*(width - 2) >= A); - - const uint8_t * y[5]; - uint8x16_t a[25]; - - size_t size = step*width; - size_t bodySize = Simd::AlignHi(size, A) - A; - - for (size_t row = 0; row < height; ++row, dst += dstStride) - { - y[0] = src + srcStride*(row - 2); - y[1] = y[0] + srcStride; - y[2] = y[1] + srcStride; - y[3] = y[2] + srcStride; - y[4] = y[3] + srcStride; - if (row < 2) - { - if (row < 1) - y[1] = y[2]; - y[0] = y[1]; - } - if (row >= height - 2) - { - if (row >= height - 1) - y[3] = y[2]; - y[4] = y[3]; - } - - LoadNoseSquare5x5(y, 0, a); - PartialSort25(a); - Store(dst, a[12]); - - for (size_t col = A; col < bodySize; col += A) - { - LoadBodySquare5x5(y, col, a); - PartialSort25(a); - Store(dst + col, a[12]); - } - - size_t col = size - A; - 
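/* [Editor's note] Tail handling: the body loop above deliberately stops one vector
   early (bodySize = Simd::AlignHi(size, A) - A), and this final iteration re-processes
   the last full 16-byte vector at offset size - A. The store may overlap pixels already
   written by the body loop (they receive identical values), but no load or store ever
   touches memory past the end of the row. Every filter in this file uses the same
   Nose/Body/Tail pattern for the left edge, the interior and the right edge. */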
LoadTailSquare5x5(y, col, a); - PartialSort25(a); - Store(dst + col, a[12]); - } - } - - template void MedianFilterSquare5x5(const uint8_t * src, size_t srcStride, size_t width, size_t height, - size_t channelCount, uint8_t * dst, size_t dstStride) - { - assert(channelCount > 0 && channelCount <= 4); - - switch (channelCount) - { - case 1: MedianFilterSquare5x5(src, srcStride, width, height, dst, dstStride); break; - case 2: MedianFilterSquare5x5(src, srcStride, width, height, dst, dstStride); break; - case 3: MedianFilterSquare5x5(src, srcStride, width, height, dst, dstStride); break; - case 4: MedianFilterSquare5x5(src, srcStride, width, height, dst, dstStride); break; - } - } - - void MedianFilterSquare5x5(const uint8_t * src, size_t srcStride, size_t width, size_t height, - size_t channelCount, uint8_t * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride) && Aligned(width) && Aligned(dst) && Aligned(dstStride)) - MedianFilterSquare5x5(src, srcStride, width, height, channelCount, dst, dstStride); - else - MedianFilterSquare5x5(src, srcStride, width, height, channelCount, dst, dstStride); - } - } -#endif// SIMD_NEON_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdNeonNeural.cpp b/src/3rd/Simd/Simd/SimdNeonNeural.cpp deleted file mode 100644 index b0ba50f6..00000000 --- a/src/3rd/Simd/Simd/SimdNeonNeural.cpp +++ /dev/null @@ -1,2173 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar, -* 2018-2018 Radchenko Andrey. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdExtract.h" -#include "Simd/SimdExp.h" -#include "Simd/SimdPow.h" - -namespace Simd -{ -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - SIMD_INLINE void Add4ExtractedSums(const float32x4_t * src, float * dst) - { - float32x2_t sm0 = vadd_f32(vget_high_f32(src[0]), vget_low_f32(src[0])); - float32x2_t sm1 = vadd_f32(vget_high_f32(src[1]), vget_low_f32(src[1])); - float32x2_t sm2 = vadd_f32(vget_high_f32(src[2]), vget_low_f32(src[2])); - float32x2_t sm3 = vadd_f32(vget_high_f32(src[3]), vget_low_f32(src[3])); - float32x2_t sm01 = vpadd_f32(sm0, sm1); - float32x2_t sm23 = vpadd_f32(sm2, sm3); - float32x4_t sm0123 = vcombine_f32(sm01 , sm23); - vst1q_f32(dst, vaddq_f32(vld1q_f32(dst), sm0123)); - } - - template uint8x16_t Invert(const uint8x16_t & value); - - template <> uint8x16_t Invert(const uint8x16_t & value) - { - return vsubq_u8(K8_FF, value); - } - - template <> uint8x16_t Invert(const uint8x16_t & value) - { - return value; - } - - template void Convert(const uint16x8_t & src, const float32x4_t &_1_255, float * dst) - { - Store(dst + 0, vmulq_f32(ToFloat<0>(src), _1_255)); - Store(dst + F, vmulq_f32(ToFloat<1>(src), _1_255)); - } - - template void Convert(const uint8_t * src, const float32x4_t &_1_255, float * dst) - { - uint8x16_t _src = Invert(Load(src)); - Convert(UnpackU8<0>(_src), _1_255, dst + 0); - Convert(UnpackU8<1>(_src), _1_255, dst + DF); - } - - template void NeuralConvert(const uint8_t * src, size_t srcStride, size_t width, size_t height, float * dst, size_t dstStride) - { - assert(width >= A); - if (align) - assert(Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride)); - - size_t alignedWidth = AlignLo(width, A); - float32x4_t _1_255 = vdupq_n_f32(1.0f / 255.0f); - - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - Convert(src + col, _1_255, dst + col); - if (width != alignedWidth) - Convert(src + width - A, _1_255, dst + width - A); - src += srcStride; - dst += dstStride; - } - } - - template void NeuralConvert(const uint8_t * src, size_t srcStride, size_t width, size_t height, float * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride)) - NeuralConvert(src, srcStride, width, height, dst, dstStride); - else - NeuralConvert(src, srcStride, width, height, dst, dstStride); - } - - void NeuralConvert(const uint8_t * src, size_t srcStride, size_t width, size_t height, float * dst, size_t dstStride, int inversion) - { - if (inversion) - NeuralConvert(src, srcStride, width, height, dst, dstStride); - else - NeuralConvert(src, srcStride, width, height, dst, dstStride); - } - - template SIMD_INLINE void NeuralProductSum(const float * a, const float * b, size_t offset, float32x4_t & sum) - { - float32x4_t _a = Load(a + offset); - float32x4_t _b = Load(b + offset); - sum = vmlaq_f32(sum, _a, _b); - } - - template SIMD_INLINE void NeuralProductSum(const float * a, const float * b, size_t size, float * sum) - { - if (align) - assert(Aligned(a) && Aligned(b)); - - *sum = 0; - size_t partialAlignedSize = AlignLo(size, F); - size_t fullAlignedSize = AlignLo(size, DF); - size_t i = 0; - if (partialAlignedSize) - { - float32x4_t sums[2] = { vdupq_n_f32(0), vdupq_n_f32(0) }; - if (fullAlignedSize) - { - for (; i < fullAlignedSize; i += DF) - { - NeuralProductSum(a, b, i + 0, sums[0]); - NeuralProductSum(a, b, i + F, sums[1]); - } - sums[0] = vaddq_f32(sums[0], sums[1]); - } - for (; i 
< partialAlignedSize; i += F) - NeuralProductSum(a, b, i, sums[0]); - *sum += ExtractSum32f(sums[0]); - } - for (; i < size; ++i) - *sum += a[i] * b[i]; - } - - void NeuralProductSum(const float * a, const float * b, size_t size, float * sum) - { - if (Aligned(a) && Aligned(b)) - NeuralProductSum(a, b, size, sum); - else - NeuralProductSum(a, b, size, sum); - } - - template SIMD_INLINE void AddMultiplied(const float * src, const float32x4_t & value, float * dst) - { - Store(dst, vmlaq_f32(Load(dst), value, Load(src))); - } - - template SIMD_INLINE void AddMultiplied(const float * src, size_t aligned, size_t partial, size_t full, float value, float * dst) - { - if (align) - assert(Aligned(src) && Aligned(dst) && Aligned(aligned, QF) && Aligned(partial, F)); - size_t i = 0; - if (partial) - { - float32x4_t _value = vdupq_n_f32(value); - for (; i < aligned; i += QF) - { - AddMultiplied(src + i + F * 0, _value, dst + i + F * 0); - AddMultiplied(src + i + F * 1, _value, dst + i + F * 1); - AddMultiplied(src + i + F * 2, _value, dst + i + F * 2); - AddMultiplied(src + i + F * 3, _value, dst + i + F * 3); - } - for (; i < partial; i += F) - AddMultiplied(src + i, _value, dst + i); - } - for (; i < full; ++i) - dst[i] += src[i] * value; - } - - void NeuralAddVectorMultipliedByValue(const float * src, size_t size, const float * value, float * dst) - { - size_t aligned = AlignLo(size, QF); - size_t partial = AlignLo(size, F); - if (Aligned(src) && Aligned(dst)) - AddMultiplied(src, aligned, partial, size, *value, dst); - else - AddMultiplied(src, aligned, partial, size, *value, dst); - } - - template SIMD_INLINE void AddVector(const float * src, float * dst) - { - Store(dst, vaddq_f32(Load(dst), Load(src))); - } - - template SIMD_INLINE void AddVector(const float * src, size_t aligned, size_t partial, size_t full, float * dst) - { - size_t i = 0; - for (; i < aligned; i += QF) - { - AddVector(src + i + F * 0, dst + i + F * 0); - AddVector(src + i + F * 1, dst + i + F * 1); - AddVector(src + i + F * 2, dst + i + F * 2); - AddVector(src + i + F * 3, dst + i + F * 3); - } - for (; i < partial; i += F) - AddVector(src + i, dst + i); - for (; i < full; ++i) - dst[i] += src[i]; - } - - void NeuralAddVector(const float * src, size_t size, float * dst) - { - size_t aligned = AlignLo(size, QF); - size_t partial = AlignLo(size, F); - if (Aligned(src) && Aligned(dst)) - AddVector(src, aligned, partial, size, dst); - else - AddVector(src, aligned, partial, size, dst); - } - - template SIMD_INLINE void AddValue(const float32x4_t & value, float * dst) - { - Store(dst, vaddq_f32(Load(dst), value)); - } - - template SIMD_INLINE void AddValue(const float * value, float * dst, size_t aligned, size_t partial, size_t full) - { - size_t i = 0; - if (partial) - { - float32x4_t _value = vdupq_n_f32(value[0]); - for (; i < aligned; i += QF) - { - AddValue(_value, dst + i + F * 0); - AddValue(_value, dst + i + F * 1); - AddValue(_value, dst + i + F * 2); - AddValue(_value, dst + i + F * 3); - } - for (; i < partial; i += F) - AddValue(_value, dst + i); - } - for (; i < full; ++i) - dst[i] += value[0]; - } - - void NeuralAddValue(const float * value, float * dst, size_t size) - { - size_t aligned = AlignLo(size, QF); - size_t partial = AlignLo(size, F); - if (Aligned(dst)) - AddValue(value, dst, aligned, partial, size); - else - AddValue(value, dst, aligned, partial, size); - } - - template SIMD_INLINE void NeuralRoughSigmoid(const float * src, size_t size, const float * slope, float * dst) - { - if (align) - 
assert(Aligned(src) && Aligned(dst)); - size_t alignedSize = Simd::AlignLo(size, F); - float32x4_t _slope = vdupq_n_f32(*slope); - float32x4_t _0 = vdupq_n_f32(-0.0f); - float32x4_t _1 = vdupq_n_f32(1.0f); - float32x4_t _a = vdupq_n_f32(0.5417f); - float32x4_t _b = vdupq_n_f32(0.1460f); - size_t i = 0; - for (; i < alignedSize; i += F) - { - float32x4_t _src = Load(src + i); - float32x4_t x = vabsq_f32(vmulq_f32(_src, _slope)); - float32x4_t x2 = vmulq_f32(x, x); - float32x4_t x4 = vmulq_f32(x2, x2); - float32x4_t series = vaddq_f32(vmlaq_f32(_1, x2, _a), vmlaq_f32(x, x4, _b)); - uint32x4_t mask = vcgtq_f32(_src, _0); - float32x4_t exp = vbslq_f32(mask, Reciprocal<1>(series), series); - float32x4_t sigmoid = Reciprocal<1>(vaddq_f32(_1, exp)); - Store(dst + i, sigmoid); - } - for (; i < size; ++i) - dst[i] = Base::RoughSigmoid(src[i] * slope[0]); - } - - void NeuralRoughSigmoid(const float * src, size_t size, const float * slope, float * dst) - { - if (Aligned(src) && Aligned(dst)) - NeuralRoughSigmoid(src, size, slope, dst); - else - NeuralRoughSigmoid(src, size, slope, dst); - } - - template SIMD_INLINE void NeuralRoughSigmoid2(const float * src, const float32x4_t & k, const float32x4_t & o, const float32x4_t & m, float * dst) - { - float32x4_t _src = Load(src); - float32x4_t e1 = vmaxq_f32(m, vmlsq_f32(o, _src, k)); - float32x4_t e2 = vmulq_f32(e1, e1); - float32x4_t e4 = vmulq_f32(e2, e2); - float32x4_t e8 = vmulq_f32(e4, e4); - float32x4_t e16 = vmulq_f32(e8, e8); - float32x4_t e32 = vmulq_f32(e16, e16); - float32x4_t e64 = vmulq_f32(e32, e32); - float32x4_t sigmoid = Reciprocal<1>(vmlaq_f32(o, e64, e64)); - Store(dst, sigmoid); - } - - template SIMD_INLINE void NeuralRoughSigmoid2(const float * src, size_t size, const float * slope, float * dst) - { - if (align) - assert(Aligned(src) && Aligned(dst)); - size_t partialAlignedSize = Simd::AlignLo(size, F); - size_t fullAlignedSize = Simd::AlignLo(size, QF); - float32x4_t _k = vdupq_n_f32((*slope)*0.0078125f); - float32x4_t _1 = vdupq_n_f32(1.0f); - float32x4_t _05 = vdupq_n_f32(0.5f); - size_t i = 0; - for (; i < fullAlignedSize; i += QF) - { - NeuralRoughSigmoid2(src + i + 0 * F, _k, _1, _05, dst + i + 0 * F); - NeuralRoughSigmoid2(src + i + 1 * F, _k, _1, _05, dst + i + 1 * F); - NeuralRoughSigmoid2(src + i + 2 * F, _k, _1, _05, dst + i + 2 * F); - NeuralRoughSigmoid2(src + i + 3 * F, _k, _1, _05, dst + i + 3 * F); - } - for (; i < partialAlignedSize; i += F) - NeuralRoughSigmoid2(src + i, _k, _1, _05, dst + i); - for (; i < size; ++i) - dst[i] = Base::RoughSigmoid2(src[i] * slope[0]); - } - - void NeuralRoughSigmoid2(const float * src, size_t size, const float * slope, float * dst) - { - if (Aligned(src) && Aligned(dst)) - NeuralRoughSigmoid2(src, size, slope, dst); - else - NeuralRoughSigmoid2(src, size, slope, dst); - } - - template SIMD_INLINE void NeuralDerivativeSigmoid(const float * src, size_t size, const float * slope, float * dst) - { - if (align) - assert(Aligned(src) && Aligned(dst)); - size_t alignedSize = Simd::AlignLo(size, F); - float32x4_t _slope = vdupq_n_f32(*slope); - float32x4_t _1 = vdupq_n_f32(1.0f); - size_t i = 0; - for (; i < alignedSize; i += F) - { - float32x4_t _src = Load(src + i); - float32x4_t _dst = Load(dst + i); - Store(dst + i, vmulq_f32(vmulq_f32(_dst, _slope), vmulq_f32(vsubq_f32(_1, _src), _src))); - } - for (; i < size; ++i) - dst[i] *= slope[0] * Base::DerivativeSigmoid(src[i]); - } - - void NeuralDerivativeSigmoid(const float * src, size_t size, const float * slope, float * dst) - { - if 
(Aligned(src) && Aligned(dst)) - NeuralDerivativeSigmoid(src, size, slope, dst); - else - NeuralDerivativeSigmoid(src, size, slope, dst); - } - - template SIMD_INLINE void NeuralRoughTanh(const float * src, size_t size, const float * slope, float * dst) - { - if (align) - assert(Aligned(src) && Aligned(dst)); - size_t alignedSize = Simd::AlignLo(size, F); - float32x4_t _slope = vdupq_n_f32(*slope); - float32x4_t _0 = vdupq_n_f32(-0.0f); - float32x4_t _1 = vdupq_n_f32(1.0f); - float32x4_t _a = vdupq_n_f32(0.5658f); - float32x4_t _b = vdupq_n_f32(0.1430f); - size_t i = 0; - for (; i < alignedSize; i += F) - { - float32x4_t _src = Load(src + i); - float32x4_t x = vabsq_f32(vmulq_f32(_src, _slope)); - float32x4_t x2 = vmulq_f32(x, x); - float32x4_t x4 = vmulq_f32(x2, x2); - float32x4_t pe = vaddq_f32(vmlaq_f32(_1, x2, _a), vmlaq_f32(x, x4, _b)); - float32x4_t ne = Reciprocal<1>(pe); - float32x4_t absTanh = vmulq_f32(vsubq_f32(pe, ne), Reciprocal<1>(vaddq_f32(pe, ne))); - float32x4_t tanh = (float32x4_t)veorq_u32((uint32x4_t)absTanh, vandq_u32((uint32x4_t)_0, vcgtq_f32(_0, _src))); - Store(dst + i, tanh); - } - for (; i < size; ++i) - dst[i] = Base::RoughTanh(src[i] * slope[0]); - } - - void NeuralRoughTanh(const float * src, size_t size, const float * slope, float * dst) - { - if (Aligned(src) && Aligned(dst)) - NeuralRoughTanh(src, size, slope, dst); - else - NeuralRoughTanh(src, size, slope, dst); - } - - template SIMD_INLINE void NeuralDerivativeTanh(const float * src, size_t size, const float * slope, float * dst) - { - if (align) - assert(Aligned(src) && Aligned(dst)); - size_t alignedSize = Simd::AlignLo(size, F); - float32x4_t _slope = vdupq_n_f32(*slope); - float32x4_t _1 = vdupq_n_f32(1.0f); - size_t i = 0; - for (; i < alignedSize; i += F) - { - float32x4_t _src = Load(src + i); - float32x4_t _dst = Load(dst + i); - Store(dst + i, vmulq_f32(vmulq_f32(_dst, _slope), vsubq_f32(_1, vmulq_f32(_src, _src)))); - } - for (; i < size; ++i) - dst[i] *= slope[0] * Base::DerivativeTanh(src[i]); - } - - void NeuralDerivativeTanh(const float * src, size_t size, const float * slope, float * dst) - { - if (Aligned(src) && Aligned(dst)) - NeuralDerivativeTanh(src, size, slope, dst); - else - NeuralDerivativeTanh(src, size, slope, dst); - } - - template void NeuralPow(const float * src, size_t size, const float * exponent, float * dst) - { - if (align) - assert(Aligned(src) && Aligned(dst)); - - float e = exponent[0]; - size_t alignedSize = AlignLo(size, F); - float32x4_t _e = vdupq_n_f32(e); - Pow pow; - size_t i = 0; - for (; i < alignedSize; i += F) - Store(dst + i, pow(Load(src + i), _e)); - for (; i < size; ++i) - dst[i] = Base::Pow(src[i], e); - } - - void NeuralPow(const float * src, size_t size, const float * exponent, float * dst) - { - if (Aligned(src) && Aligned(dst)) - NeuralPow(src, size, exponent, dst); - else - NeuralPow(src, size, exponent, dst); - } - - template void NeuralDerivativeRelu(const float * src, size_t size, const float * slope, float * dst) - { - if (align) - assert(Aligned(src) && Aligned(dst)); - float s = slope[0]; - float32x4_t _0 = vdupq_n_f32(0.0f); - float32x4_t _1 = vdupq_n_f32(1.0f); - float32x4_t _s = vdupq_n_f32(s); - size_t alignedSize = Simd::AlignLo(size, F); - size_t i = 0; - for (; i < alignedSize; i += F) - { - uint32x4_t mask = vcgtq_f32(Load(src + i), _0); - float32x4_t _dst = Load(dst + i); - Store(dst + i, vmulq_f32(vbslq_f32(mask, _1, _s), _dst)); - } - for (; i < size; ++i) - dst[i] *= src[i] > 0 ? 
1.0f : s; - } - - void NeuralDerivativeRelu(const float * src, size_t size, const float * slope, float * dst) - { - if (Aligned(src) && Aligned(dst)) - NeuralDerivativeRelu(src, size, slope, dst); - else - NeuralDerivativeRelu(src, size, slope, dst); - } - - template SIMD_INLINE void UpdateWeights(const float * x, const float32x4_t & a, const float32x4_t & b, float * d, float * w) - { - float32x4_t _d = vaddq_f32(vmulq_f32(a, Load(d)), vmulq_f32(b, Load(x))); - Store(d, _d); - Store(w, vaddq_f32(Load(w), _d)); - } - - template SIMD_INLINE void UpdateWeights(const float * x, size_t offset, const float32x4_t & a, const float32x4_t & b, float * d, float * w) - { - UpdateWeights(x + offset, a, b, d + offset, w + offset); - } - - template void NeuralUpdateWeights(const float * x, size_t size, const float & a, const float & b, float * d, float * w) - { - if (align) - assert(Aligned(x) && Aligned(d) && Aligned(w)); - - size_t partialAlignedSize = AlignLo(size, F); - size_t fullAlignedSize = AlignLo(size, QF); - float32x4_t _a = vdupq_n_f32(a); - float32x4_t _b = vdupq_n_f32(b); - size_t i = 0; - if (partialAlignedSize) - { - if (fullAlignedSize) - { - for (; i < fullAlignedSize; i += QF) - { - UpdateWeights(x, i + F * 0, _a, _b, d, w); - UpdateWeights(x, i + F * 1, _a, _b, d, w); - UpdateWeights(x, i + F * 2, _a, _b, d, w); - UpdateWeights(x, i + F * 3, _a, _b, d, w); - } - } - for (; i < partialAlignedSize; i += F) - UpdateWeights(x, i, _a, _b, d, w); - } - for (; i < size; ++i) - Base::UpdateWeights(x, i, a, b, d, w); - } - - void NeuralUpdateWeights(const float * x, size_t size, const float * a, const float * b, float * d, float * w) - { - if (Aligned(x) && Aligned(d) && Aligned(w)) - NeuralUpdateWeights(x, size, *a, *b, d, w); - else - NeuralUpdateWeights(x, size, *a, *b, d, w); - } - - - template SIMD_INLINE void AdaptiveGradientUpdate(const float * delta, const float32x4_t & norm, const float32x4_t & alpha, const float32x4_t & epsilon, float * gradient, float * weight) - { - float32x4_t d = vmulq_f32(Load(delta), norm); - float32x4_t _gradient = vaddq_f32(Load(gradient), vmulq_f32(d, d)); - Store(gradient, _gradient); - Store(weight, vsubq_f32(Load(weight), vmulq_f32(vmulq_f32(alpha, d), ReciprocalSqrt<1>(vaddq_f32(_gradient, epsilon))))); - } - - template SIMD_INLINE void AdaptiveGradientUpdate(const float * delta, size_t offset, const float32x4_t & norm, const float32x4_t & alpha, const float32x4_t & epsilon, float * gradient, float * weight) - { - AdaptiveGradientUpdate(delta + offset, norm, alpha, epsilon, gradient + offset, weight + offset); - } - - template void NeuralAdaptiveGradientUpdate(const float * delta, size_t size, size_t batch, const float * alpha, const float * epsilon, float * gradient, float * weight) - { - if (align) - assert(Aligned(delta) && Aligned(gradient) && Aligned(weight)); - - size_t partialAlignedSize = AlignLo(size, F); - size_t fullAlignedSize = AlignLo(size, QF); - const float norm = (float)(1.0 / batch); - float32x4_t _norm = vdupq_n_f32(norm); - float32x4_t _alpha = vdupq_n_f32(*alpha); - float32x4_t _epsilon = vdupq_n_f32(*epsilon); - size_t i = 0; - if (partialAlignedSize) - { - if (fullAlignedSize) - { - for (; i < fullAlignedSize; i += QF) - { - AdaptiveGradientUpdate(delta, i + F * 0, _norm, _alpha, _epsilon, gradient, weight); - AdaptiveGradientUpdate(delta, i + F * 1, _norm, _alpha, _epsilon, gradient, weight); - AdaptiveGradientUpdate(delta, i + F * 2, _norm, _alpha, _epsilon, gradient, weight); - AdaptiveGradientUpdate(delta, i + F * 3, _norm, 
_alpha, _epsilon, gradient, weight); - } - } - for (; i < partialAlignedSize; i += F) - AdaptiveGradientUpdate(delta, i, _norm, _alpha, _epsilon, gradient, weight); - } - for (; i < size; ++i) - Base::AdaptiveGradientUpdate(delta, i, norm, *alpha, *epsilon, gradient, weight); - } - - void NeuralAdaptiveGradientUpdate(const float * delta, size_t size, size_t batch, const float * alpha, const float * epsilon, float * gradient, float * weight) - { - if (Aligned(delta) && Aligned(gradient) && Aligned(weight)) - NeuralAdaptiveGradientUpdate(delta, size, batch, alpha, epsilon, gradient, weight); - else - NeuralAdaptiveGradientUpdate(delta, size, batch, alpha, epsilon, gradient, weight); - } - - template SIMD_INLINE void LoadWeightsForward(const float * src, float32x4_t * dst) - { - for (size_t i = 0; i < size; ++i) - dst[i] = vdupq_n_f32(src[i]); - } - - template SIMD_INLINE void LoadWeightsBackward(const float * src, float32x4_t * dst) - { - for (size_t i = 0; i < size; ++i) - dst[i] = vdupq_n_f32(src[size - i - 1]); - } - - namespace - { - template struct Buffer - { - Buffer(size_t width) - { - _size = width * sizeof(float); - size_t stride = AlignHi(width + 2 * (count - 1), F); - size_t full = count*stride * sizeof(float); - _ptr = Allocate(full); - memset(_ptr, 0, full); - rows[0] = (float*)_ptr; - for (size_t i = 1; i < count; ++i) - rows[i] = rows[i - 1] + stride; - } - - void Update(const float * src) - { - float * tmp = rows[0]; - if (src == NULL) - memset(tmp + count - 1, 0, _size); - else - memcpy(tmp + count - 1, src, _size); - for (size_t i = 0; i < count - 1; ++i) - rows[i] = rows[i + 1]; - rows[count - 1] = tmp; - } - - ~Buffer() - { - Free(_ptr); - } - - float * rows[count]; - private: - size_t _size; - void * _ptr; - }; - } - - template struct Convolution - { - template static SIMD_INLINE float32x4_t Forward(const float * src, size_t stride, const float32x4_t * weights); - - template static SIMD_INLINE float32x4_t Backward(const Buffer & buffer, size_t offset, const float32x4_t * weights); - - template static SIMD_INLINE void Sum(const float * src, const float32x4_t & dst, float32x4_t * sums); - }; - - template<> struct Convolution<2, 2> - { - template static SIMD_INLINE float32x4_t Convolution2(const float * src, const float32x4_t * weights) - { - float32x4_t _src[2]; - _src[0] = Load(src + 0); - _src[1] = vld1q_f32(src + 1); - return vmlaq_f32(vmulq_f32(_src[0], weights[0]), _src[1], weights[1]); - } - - template static SIMD_INLINE float32x4_t Forward(const float * src, size_t stride, const float32x4_t * weights) - { - return vaddq_f32(Convolution2(src, weights), - Convolution2(src + stride, weights + 2)); - } - - template static SIMD_INLINE float32x4_t Backward(const Buffer<2> & buffer, size_t offset, const float32x4_t * weights) - { - return vaddq_f32(Convolution2(buffer.rows[0] + offset, weights), - Convolution2(buffer.rows[1] + offset, weights + 2)); - } - - template static SIMD_INLINE void Sum(const float * src, const float32x4_t & dst, float32x4_t * sums) - { - float32x4_t _src[2]; - _src[0] = Load(src); - _src[1] = vld1q_f32(src + 1); - sums[0] = vmlaq_f32(sums[0], dst, _src[0]); - sums[1] = vmlaq_f32(sums[1], dst, _src[1]); - } - - template static SIMD_INLINE void Sum(const float * src, size_t stride, const float32x4_t & dst, float32x4_t * sums) - { - Sum(src + stride * 0, dst, sums + 0); - Sum(src + stride * 1, dst, sums + 2); - } - }; - - template<> struct Convolution<3, 3> - { - template static SIMD_INLINE float32x4_t Convolution3(const float * src, const float32x4_t 
* weights) - { - float32x4_t _src[3]; - _src[0] = Load(src + 0); - _src[1] = vld1q_f32(src + 1); - _src[2] = vld1q_f32(src + 2); - return vmlaq_f32(vmlaq_f32(vmulq_f32(_src[0], weights[0]), _src[1], weights[1]), _src[2], weights[2]); - } - - template static SIMD_INLINE float32x4_t Forward(const float * src, size_t stride, const float32x4_t * weights) - { - return vaddq_f32(Convolution3(src, weights), - vaddq_f32(Convolution3(src + stride, weights + 3), - Convolution3(src + 2 * stride, weights + 6))); - } - - template static SIMD_INLINE float32x4_t Backward(const Buffer<3> & buffer, size_t offset, const float32x4_t * weights) - { - return vaddq_f32(Convolution3(buffer.rows[0] + offset, weights), - vaddq_f32(Convolution3(buffer.rows[1] + offset, weights + 3), - Convolution3(buffer.rows[2] + offset, weights + 6))); - } - - template static SIMD_INLINE void Sum(const float * src, const float32x4_t & dst, float32x4_t * sums) - { - float32x4_t _src[3]; - _src[0] = Load(src); - _src[1] = vld1q_f32(src + 1); - _src[2] = vld1q_f32(src + 2); - sums[0] = vmlaq_f32(sums[0], dst, _src[0]); - sums[1] = vmlaq_f32(sums[1], dst, _src[1]); - sums[2] = vmlaq_f32(sums[2], dst, _src[2]); - } - - template static SIMD_INLINE void Sum(const float * src, size_t stride, const float32x4_t & dst, float32x4_t * sums) - { - Sum(src + stride * 0, dst, sums + 0); - Sum(src + stride * 1, dst, sums + 3); - Sum(src + stride * 2, dst, sums + 6); - } - }; - - template<> struct Convolution<4, 4> - { - template static SIMD_INLINE float32x4_t Convolution4(const float * src, const float32x4_t * weights) - { - float32x4_t _src[4]; - _src[0] = Load(src + 0); - _src[1] = vld1q_f32(src + 1); - _src[2] = vld1q_f32(src + 2); - _src[3] = vld1q_f32(src + 3); - return vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(_src[0], weights[0]), _src[1], weights[1]), _src[2], weights[2]), _src[3], weights[3]); - } - - template static SIMD_INLINE float32x4_t Forward(const float * src, size_t stride, const float32x4_t * weights) - { - return vaddq_f32(vaddq_f32(Convolution4(src, weights), - Convolution4(src + stride, weights + 4)), - vaddq_f32(Convolution4(src + 2 * stride, weights + 8), - Convolution4(src + 3 * stride, weights + 12))); - } - - template static SIMD_INLINE float32x4_t Backward(const Buffer<4> & buffer, size_t offset, const float32x4_t * weights) - { - return vaddq_f32(vaddq_f32(Convolution4(buffer.rows[0] + offset, weights), - Convolution4(buffer.rows[1] + offset, weights + 4)), - vaddq_f32(Convolution4(buffer.rows[2] + offset, weights + 8), - Convolution4(buffer.rows[3] + offset, weights + 12))); - } - - template static SIMD_INLINE void Sum(const float * src, const float32x4_t & dst, float32x4_t * sums) - { - float32x4_t _src[5]; - _src[0] = Load(src); - _src[1] = vld1q_f32(src + 1); - _src[2] = vld1q_f32(src + 2); - _src[3] = vld1q_f32(src + 3); - sums[0] = vmlaq_f32(sums[0], dst, _src[0]); - sums[1] = vmlaq_f32(sums[1], dst, _src[1]); - sums[2] = vmlaq_f32(sums[2], dst, _src[2]); - sums[3] = vmlaq_f32(sums[3], dst, _src[3]); - } - - template static SIMD_INLINE void Sum(const float * src, size_t stride, const float32x4_t & dst, float32x4_t * sums) - { - Sum(src + stride * 0, dst, sums + 0); - Sum(src + stride * 1, dst, sums + 4); - Sum(src + stride * 2, dst, sums + 8); - Sum(src + stride * 3, dst, sums + 12); - } - }; - - template<> struct Convolution<5, 5> - { - template static SIMD_INLINE float32x4_t Convolution5(const float * src, const float32x4_t * weights) - { - float32x4_t _src[5]; - _src[0] = Load(src + 0); - _src[1] = vld1q_f32(src 
+ 1); - _src[2] = vld1q_f32(src + 2); - _src[3] = vld1q_f32(src + 3); - _src[4] = Load(src + 4); - return vmlaq_f32(vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(_src[0], weights[0]), _src[1], weights[1]), _src[2], weights[2]), _src[3], weights[3]), _src[4], weights[4]); - } - - template static SIMD_INLINE float32x4_t Forward(const float * src, size_t stride, const float32x4_t * weights) - { - return vaddq_f32(Convolution5(src, weights), - vaddq_f32(vaddq_f32(Convolution5(src + stride, weights + 5), - Convolution5(src + 2 * stride, weights + 10)), - vaddq_f32(Convolution5(src + 3 * stride, weights + 15), - Convolution5(src + 4 * stride, weights + 20)))); - } - - template static SIMD_INLINE float32x4_t Backward(const Buffer<5> & buffer, size_t offset, const float32x4_t * weights) - { - return vaddq_f32(vaddq_f32(Convolution5(buffer.rows[0] + offset, weights), - vaddq_f32(Convolution5(buffer.rows[1] + offset, weights + 5), - Convolution5(buffer.rows[2] + offset, weights + 10))), - vaddq_f32(Convolution5(buffer.rows[3] + offset, weights + 15), - Convolution5(buffer.rows[4] + offset, weights + 20))); - } - - template static SIMD_INLINE void Sum(const float * src, const float32x4_t & dst, float32x4_t * sums) - { - float32x4_t _src[5]; - _src[0] = Load(src); - _src[1] = vld1q_f32(src + 1); - _src[2] = vld1q_f32(src + 2); - _src[3] = vld1q_f32(src + 3); - _src[4] = Load(src + 4); - sums[0] = vmlaq_f32(sums[0], dst, _src[0]); - sums[1] = vmlaq_f32(sums[1], dst, _src[1]); - sums[2] = vmlaq_f32(sums[2], dst, _src[2]); - sums[3] = vmlaq_f32(sums[3], dst, _src[3]); - sums[4] = vmlaq_f32(sums[4], dst, _src[4]); - } - - template static SIMD_INLINE void Sum(const float * src, size_t stride, const float32x4_t & dst, float32x4_t * sums) - { - Sum(src + stride * 0, dst, sums + 0); - Sum(src + stride * 1, dst, sums + 5); - Sum(src + stride * 2, dst, sums + 10); - Sum(src + stride * 3, dst, sums + 15); - Sum(src + stride * 4, dst, sums + 20); - } - }; - - template void NeuralAddConvolutionForward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride) - { - size_t alignedWidth = AlignLo(width, F); - float32x4_t tailMask = RightNotZero32f(width - alignedWidth); - float32x4_t _weights[coreX*coreY]; - LoadWeightsForward(weights, _weights); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < alignedWidth; col += F) - { - float32x4_t _dst = Load(dst + col); - _dst = vaddq_f32(_dst, Convolution::template Forward(src + col, srcStride, _weights)); - Store(dst + col, _dst); - } - if (width - alignedWidth) - { - size_t col = width - F; - float32x4_t _dst = Load(dst + col); - _dst = vaddq_f32(_dst, And(tailMask, Convolution::template Forward(src + col, srcStride, _weights))); - Store(dst + col, _dst); - } - src += srcStride; - dst += dstStride; - } - } - - namespace Ncf - { - namespace Ver0 - { - void PrepareB(const float * src, size_t srcWidth, size_t srcHeight, size_t srcDepth, size_t kernelX, size_t kernelY, - size_t padX, size_t padY, size_t strideX, size_t strideY, size_t dilationX, size_t dilationY, size_t dstWidth, size_t dstHeight, float * dst) - { - const size_t K = kernelX*kernelY*srcDepth, N = dstHeight*dstWidth; - if (dilationX*dilationY*strideX*strideY != 1) - { - for (size_t dstRow = 0; dstRow < dstHeight; ++dstRow) - { - size_t srcRow0 = dstRow*strideY - padY; - for (size_t dstCol = 0; dstCol < dstWidth; ++dstCol) - { - size_t srcCol0 = dstCol*strideX - padX; - for (size_t channel = 0; channel < srcDepth; ++channel) - { 
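/* [Editor's note] Ver0::PrepareB is an im2col repack: for every output pixel it writes
   the kernelX*kernelY*srcDepth input taps that feed it out as one contiguous K-long
   slice, zero-filling taps that fall outside the image. srcRow and srcCol are unsigned,
   so the single test srcRow < srcHeight also rejects the "negative" (wrapped-around)
   padding case. After the repack the whole convolution reduces to one matrix product
   C[M x N] += A[M x K] * B[K x N] with M = dstDepth, N = dstWidth*dstHeight and
   K = kernelX*kernelY*srcDepth, which Execute below computes. */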
- for (size_t kernelRow = 0; kernelRow < kernelY; ++kernelRow) - { - size_t srcRow = srcRow0 + kernelRow*dilationY; - if (srcRow < srcHeight) - { - const float * psrc = src + (channel*srcHeight + srcRow)*srcWidth; - for (size_t kernelCol = 0; kernelCol < kernelX; ++kernelCol) - { - size_t srcCol = srcCol0 + kernelCol*dilationX; - if (srcCol < srcWidth) - *(dst++) = psrc[srcCol]; - else - *(dst++) = 0; - } - } - else - { - for (size_t kernelCol = 0; kernelCol < kernelX; ++kernelCol) - *(dst++) = 0; - } - } - } - } - } - } - else if (kernelX*kernelY != 1) - { - for (size_t dstRow = 0; dstRow < dstHeight; ++dstRow) - { - size_t srcRow0 = dstRow - padY; - for (size_t dstCol = 0; dstCol < dstWidth; ++dstCol) - { - size_t srcCol0 = dstCol - padX; - for (size_t channel = 0; channel < srcDepth; ++channel) - { - for (size_t kernelRow = 0; kernelRow < kernelY; ++kernelRow) - { - size_t srcRow = srcRow0 + kernelRow; - if (srcRow < srcHeight) - { - const float * psrc = src + (channel*srcHeight + srcRow)*srcWidth; - for (size_t kernelCol = 0; kernelCol < kernelX; ++kernelCol) - { - size_t srcCol = srcCol0 + kernelCol; - if (srcCol < srcWidth) - *(dst++) = psrc[srcCol]; - else - *(dst++) = 0; - } - } - else - { - for (size_t kernelCol = 0; kernelCol < kernelX; ++kernelCol) - *(dst++) = 0; - } - } - } - } - } - } - else - { - for (size_t i = 0; i < N; ++i) - { - for (size_t k = 0; k < K; ++k) - *(dst++) = src[k*N + i]; - } - } - } - - template static SIMD_INLINE void Kernel1x4x4(const float32x4_t & a, size_t K, const float * b, float32x4_t * sums) - { - sums[0] = vaddq_f32(sums[0], vmulq_f32(a, Load(b + 0 * K))); - sums[1] = vaddq_f32(sums[1], vmulq_f32(a, Load(b + 1 * K))); - sums[2] = vaddq_f32(sums[2], vmulq_f32(a, Load(b + 2 * K))); - sums[3] = vaddq_f32(sums[3], vmulq_f32(a, Load(b + 3 * K))); - } - - template static SIMD_INLINE void Kernel1x1x4(const float32x4_t & a, const float * b, float32x4_t & sum) - { - sum = vaddq_f32(sum, vmulq_f32(a, Load(b))); - } - - template static SIMD_INLINE void Kernel3x4x4(const float32x4_t * a, size_t K, const float * b, float32x4_t * sums) - { - float32x4_t _b; - _b = Load(b + 0 * K); - sums[0x0] = vaddq_f32(sums[0x0], vmulq_f32(a[0], _b)); - sums[0x4] = vaddq_f32(sums[0x4], vmulq_f32(a[1], _b)); - sums[0x8] = vaddq_f32(sums[0x8], vmulq_f32(a[2], _b)); - _b = Load(b + 1 * K); - sums[0x1] = vaddq_f32(sums[0x1], vmulq_f32(a[0], _b)); - sums[0x5] = vaddq_f32(sums[0x5], vmulq_f32(a[1], _b)); - sums[0x9] = vaddq_f32(sums[0x9], vmulq_f32(a[2], _b)); - _b = Load(b + 2 * K); - sums[0x2] = vaddq_f32(sums[0x2], vmulq_f32(a[0], _b)); - sums[0x6] = vaddq_f32(sums[0x6], vmulq_f32(a[1], _b)); - sums[0xA] = vaddq_f32(sums[0xA], vmulq_f32(a[2], _b)); - _b = Load(b + 3 * K); - sums[0x3] = vaddq_f32(sums[0x3], vmulq_f32(a[0], _b)); - sums[0x7] = vaddq_f32(sums[0x7], vmulq_f32(a[1], _b)); - sums[0xB] = vaddq_f32(sums[0xB], vmulq_f32(a[2], _b)); - } - - template static SIMD_INLINE void Kernel3x1x4(const float32x4_t * a, const float * b, float32x4_t * sums) - { - float32x4_t _b = Load(b); - sums[0x0] = vaddq_f32(sums[0x0], vmulq_f32(a[0], _b)); - sums[0x1] = vaddq_f32(sums[0x1], vmulq_f32(a[1], _b)); - sums[0x2] = vaddq_f32(sums[0x2], vmulq_f32(a[2], _b)); - } - - template void Execute(size_t M, size_t N, size_t K, const float * a, const float * b, float * c) - { - size_t M3 = M / 3 * 3; - size_t N4 = Simd::AlignLo(N, 4); - size_t K4 = Simd::AlignLo(K, 4); - float32x4_t tailMask = RightNotZero32f(K - K4); - size_t i = 0; - for (; i < M3; i += 3) - { - const float * pa = a + i * K; - 
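/* [Editor's note] Ver0::Execute is a register-blocked GEMM (both A rows and B rows are
   laid out contiguously along K): each pass of this loop nest computes a 3x4 tile of C,
   keeping twelve float32x4_t accumulators live in NEON registers (Kernel3x4x4) so every
   vector loaded from A is reused four times and every vector loaded from B three times.
   When K is not a multiple of 4, the last four floats are re-read and And-ed with
   tailMask (RightNotZero32f) so lanes that were already accumulated contribute zero.
   Add4ExtractedSums then reduces four accumulators horizontally into one float32x4_t
   and adds it to C. */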
float * pc = c + i * N; - size_t j = 0; - for (; j < N4; j += 4) - { - const float * pb = b + j * K; - float32x4_t sums[12] = { - vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), - vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), - vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0) }; - float32x4_t _a[3]; - for (size_t k = 0; k < K4; k += 4) - { - _a[0] = Load(pa + k + 0 * K); - _a[1] = Load(pa + k + 1 * K); - _a[2] = Load(pa + k + 2 * K); - Kernel3x4x4(_a, K, pb + k, sums); - } - if (K4 < K) - { - size_t k = K - 4; - _a[0] = And(tailMask, Load(pa + k + 0 * K)); - _a[1] = And(tailMask, Load(pa + k + 1 * K)); - _a[2] = And(tailMask, Load(pa + k + 2 * K)); - Kernel3x4x4(_a, K, pb + k, sums); - } - Add4ExtractedSums(sums + 0, pc + j + 0 * N); - Add4ExtractedSums(sums + 4, pc + j + 1 * N); - Add4ExtractedSums(sums + 8, pc + j + 2 * N); - } - for (; j < N; ++j) - { - const float * pb = b + j * K; - float32x4_t sums[3] = { vdupq_n_f32(0), vdupq_n_f32(0) , vdupq_n_f32(0) }; - float32x4_t _a[3]; - for (size_t k = 0; k < K4; k += 4) - { - _a[0] = Load(pa + k + 0 * K); - _a[1] = Load(pa + k + 1 * K); - _a[2] = Load(pa + k + 2 * K); - Kernel3x1x4(_a, pb + k, sums); - } - if (K4 < K) - { - size_t k = K - 4; - _a[0] = And(tailMask, Load(pa + k + 0 * K)); - _a[1] = And(tailMask, Load(pa + k + 1 * K)); - _a[2] = And(tailMask, Load(pa + k + 2 * K)); - Kernel3x1x4(_a, pb + k, sums); - } - pc[j + 0 * N] += ExtractSum32f(sums[0]); - pc[j + 1 * N] += ExtractSum32f(sums[1]); - pc[j + 2 * N] += ExtractSum32f(sums[2]); - } - } - for (; i < M; ++i) - { - const float * pa = a + i*K; - float * pc = c + i*N; - size_t j = 0; - for (; j < N4; j += 4) - { - const float * pb = b + j*K; - float32x4_t sums[4] = { vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0) }; - for (size_t k = 0; k < K4; k += 4) - { - float32x4_t _a = Load(pa + k); - Kernel1x4x4(_a, K, pb + k, sums); - } - if (K4 < K) - { - size_t k = K - 4; - float32x4_t _a = And(tailMask, Load(pa + k)); - Kernel1x4x4(_a, K, pb + k, sums); - } - Add4ExtractedSums(sums + 0, pc + j); - } - for (; j < N; ++j) - { - const float * pb = b + j*K; - float32x4_t sum = vdupq_n_f32(0); - for (size_t k = 0; k < K4; k += 4) - { - float32x4_t _a = Load(pa + k); - Kernel1x1x4(_a, pb + k, sum); - } - if (K4 < K) - { - size_t k = K - 4; - float32x4_t _a = And(tailMask, Load(pa + k)); - Kernel1x1x4(_a, pb + k, sum); - } - pc[j] += ExtractSum32f(sum); - } - } - } - - void Execute(size_t M, size_t N, size_t K, const float * a, const float * b, float * c) - { - if (Aligned(K, F)) - Execute(M, N, K, a, b, c); - else - Execute(M, N, K, a, b, c); - } - } - - namespace Ver1 - { - void PrepareA(const float * src, size_t M, size_t K, size_t cell, float * dst) - { - size_t K4 = AlignLo(K, 4); - for (size_t i = 0; i < M; i += cell) - { - size_t n = Simd::Min(cell, M - i), k = 0; - if (cell == 4 && n == 4) - { - for (; k < K4; k += 4) - { - const float * ps = src + k; - float32x4_t s0 = vld1q_f32(ps + 0 * K); - float32x4_t s1 = vld1q_f32(ps + 1 * K); - float32x4_t s2 = vld1q_f32(ps + 2 * K); - float32x4_t s3 = vld1q_f32(ps + 3 * K); - - float32x4x2_t s00_10 = vzipq_f32(s0, s2); - float32x4x2_t s01_11 = vzipq_f32(s1, s3); - - float32x4x2_t ss0 = vzipq_f32(s00_10.val[0], s01_11.val[0]); - float32x4x2_t ss1 = vzipq_f32(s00_10.val[1], s01_11.val[1]); - - vst1q_f32(dst + 0, ss0.val[0]); - vst1q_f32(dst + 4, ss0.val[1]); - vst1q_f32(dst + 8, ss1.val[0]); - vst1q_f32(dst + 12, ss1.val[1]); - - dst += 16; - } - } - for (; k < K; ++k) - { - for 
(size_t c = 0; c < n; ++c) - *(dst++) = src[c*K + k]; - } - src += cell*K; - } - } - - void PrepareB(const float * src, size_t srcWidth, size_t srcHeight, size_t srcDepth, size_t kernelX, size_t kernelY, size_t padX, size_t padY, - size_t strideX, size_t strideY, size_t dilationX, size_t dilationY, size_t dstWidth, size_t dstHeight, size_t cell, float * tmp, float * dst) - { - const size_t K = kernelX*kernelY*srcDepth, N = dstHeight*dstWidth; - if (kernelX*kernelY != 1) - { - float * dst = tmp; - size_t channelSize = srcHeight * srcWidth; - if (dilationX*dilationY*strideX*strideY != 1) - { - for (size_t channel = 0, k = 0; channel < srcDepth; ++channel, src += channelSize) - { - for (size_t kernelRow = 0; kernelRow < kernelY; ++kernelRow) - { - for (size_t kernelCol = 0; kernelCol < kernelX; ++kernelCol, ++k) - { - size_t srcRow = kernelRow*dilationY - padY; - for (size_t dstRow = 0; dstRow < dstHeight; ++dstRow) - { - if (srcRow < srcHeight) - { - size_t srcCol = kernelCol*dilationX - padX; - for (size_t dstCol = 0; dstCol < dstWidth; ++dstCol) - { - if (srcCol < srcWidth) - *(dst++) = src[srcRow*srcWidth + srcCol]; - else - *(dst++) = 0; - srcCol += strideX; - } - } - else - { - for (size_t dstCol = 0; dstCol < dstWidth; ++dstCol) - *(dst++) = 0; - } - srcRow += strideY; - } - } - } - } - } - else - { - const size_t bodySize = dstWidth - padX * 2; - for (size_t channel = 0, k = 0; channel < srcDepth; ++channel, src += channelSize) - { - for (size_t kernelRow = 0; kernelRow < kernelY; ++kernelRow) - { - for (size_t kernelCol = 0; kernelCol < kernelX; ++kernelCol, ++k) - { - size_t srcRow = kernelRow - padY; - for (size_t dstRow = 0; dstRow < dstHeight; ++dstRow, ++srcRow) - { - if (srcRow < srcHeight) - { - size_t srcCol = kernelCol - padX, dstCol = 0; - const float * psrc = src + srcRow*srcWidth; - for (; dstCol < padX; ++dstCol, ++srcCol) - { - if (srcCol < srcWidth) - *(dst++) = psrc[srcCol]; - else - *(dst++) = 0; - } - memcpy(dst, psrc + srcCol, bodySize * 4); - dst += bodySize; - dstCol += bodySize; - srcCol += bodySize; - for (; dstCol < dstWidth; ++dstCol, ++srcCol) - { - if (srcCol < srcWidth) - *(dst++) = psrc[srcCol]; - else - *(dst++) = 0; - } - } - else - { - memset(dst, 0, dstWidth * 4); - dst += dstWidth; - } - } - } - } - } - } - src = tmp; - } - if (cell == 8) - { - for (size_t j = 0; j < N; j += cell) - { - size_t n = Simd::Min(cell, N - j); - if (n == cell) - { - for (size_t k = 0; k < K; ++k) - { - const float * psrc = src + k*N; - Store(dst + 0, Load(psrc + 0)); - Store(dst + 4, Load(psrc + 4)); - dst += 8; - } - } - else - { - for (size_t k = 0; k < K; ++k) - { - const float * psrc = src + k*N; - size_t c = 0; - for (; c < n; ++c) - *(dst++) = *(psrc++); - for (; c < cell; ++c) - *(dst++) = 0; - } - } - src += cell; - } - } - else - { - for (size_t j = 0; j < N; j += cell) - { - size_t n = Simd::Min(cell, N - j); - for (size_t k = 0; k < K; ++k) - { - const float * psrc = src + k*N; - size_t c = 0; - for (; c < n; ++c) - *(dst++) = *(psrc++); - for (; c < cell; ++c) - *(dst++) = 0; - } - src += cell; - } - } - } - - SIMD_INLINE void AddSum(const float32x4_t & sum, float * dst) - { - Store(dst, vaddq_f32(Load(dst), sum)); - } - - SIMD_INLINE void AddSums4(const float32x4_t * sums, size_t size, const float * mask, float * dst, size_t stride) - { - if (mask) - { - float32x4_t _mask = vld1q_f32(mask); - for (size_t i = 0; i < size; ++i, dst += stride) - AddSum(And(_mask, sums[i]), dst); - } - else - { - for (size_t i = 0; i < size; ++i, dst += stride) - AddSum(sums[i], 
dst); - } - } - - template SIMD_INLINE void KernelMx4(size_t N, size_t K, const float * a, const float * b, float * c, const float * mask, size_t m) - { - float32x4_t sums[4] = { vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0) }; - for (size_t k = 0; k < K; ++k) - { - float32x4_t b0 = Load(b); - for (size_t s = 0; s < m; ++s) - sums[s] = vaddq_f32(sums[s], vmulq_f32( vdupq_n_f32(a[s]), b0)); - b += 4; - a += m; - } - AddSums4(sums, m, mask, c, N); - } - - template SIMD_INLINE void Kernel4x4(size_t N, size_t K, const float * a, const float * b, float * c, const float * mask) - { - float32x4_t sums[4] = { vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0) }; - for (size_t k = 0; k < K; ++k) - { - float32x4_t b0 = Load(b); - sums[0] = vaddq_f32(sums[0], vmulq_f32(vdupq_n_f32(a[0]), b0)); - sums[1] = vaddq_f32(sums[1], vmulq_f32(vdupq_n_f32(a[1]), b0)); - sums[2] = vaddq_f32(sums[2], vmulq_f32(vdupq_n_f32(a[2]), b0)); - sums[3] = vaddq_f32(sums[3], vmulq_f32(vdupq_n_f32(a[3]), b0)); - b += 4; - a += 4; - } - AddSums4(sums, 4, mask, c, N); - } - - template void Execute4x4(size_t M, size_t N, size_t K, const float * a, const float * b, float * c) - { - size_t M4 = Simd::AlignLo(M, 4); - size_t N4 = Simd::AlignLo(N, 4); - const int32_t mask[8] = { -1, -1, -1, -1, 0, 0, 0, 0 }; - const float * tail = (float*)mask + 4 - N + N4; - size_t i = 0; - for (; i < M4; i += 4) - { - size_t j = 0; - for (; j < N4; j += 4) - Kernel4x4(N, K, a + i*K, b + j*K, c + i*N + j, NULL); - if (N4 < N) - Kernel4x4(N, K, a + i*K, b + j*K, c + i*N + j, tail); - } - if (M4 < M) - { - size_t j = 0; - for (; j < N4; j += 4) - KernelMx4(N, K, a + i*K, b + j*K, c + i*N + j, NULL, M - M4); - if (N4 < N) - KernelMx4(N, K, a + i*K, b + j*K, c + i*N + j, tail, M - M4); - } - } - - SIMD_INLINE void AddSums8(const float32x4_t * sums, size_t size, const float * mask, float * dst, size_t stride) - { - if (mask) - { - float32x4_t mask0 = vld1q_f32(mask + 0); - float32x4_t mask1 = vld1q_f32(mask + 4); - for (size_t i = 0; i < size; ++i, dst += stride) - { - AddSum(And(mask0, sums[i + 0]), dst + 0); - AddSum(And(mask1, sums[i + 4]), dst + 4); - } - } - else - { - for (size_t i = 0; i < size; ++i, dst += stride) - { - AddSum(sums[i + 0], dst + 0); - AddSum(sums[i + 4], dst + 4); - } - } - } - - template SIMD_INLINE void KernelMx8(size_t N, size_t K, const float * a, const float * b, float * c, const float * mask, size_t m) - { - float32x4_t sums[8] = { vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0) }; - for (size_t k = 0; k < K; ++k) - { - float32x4_t b0 = Load(b + 0); - float32x4_t b1 = Load(b + 4); - for (size_t s = 0; s < m; ++s) - { - float32x4_t a0 = vdupq_n_f32(a[s]); - sums[s + 0] = vaddq_f32(sums[s + 0], vmulq_f32(b0, a0)); - sums[s + 4] = vaddq_f32(sums[s + 4], vmulq_f32(b1, a0)); - } - b += 8; - a += m; - } - AddSums8(sums, m, mask, c, N); - } - - template SIMD_INLINE void Kernel4x8(size_t N, size_t K, const float * a, const float * b, float * c, const float * mask) - { - float32x4_t sums[8] = { vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0) }; - for (size_t k = 0; k < K; ++k) - { - float32x4_t b0 = Load(b + 0); - float32x4_t b1 = Load(b + 4); - float32x4_t a0 = vdupq_n_f32(a[0]); - sums[0] = vaddq_f32(sums[0], vmulq_f32(b0, a0)); - sums[4] = vaddq_f32(sums[4], vmulq_f32(b1, a0)); - float32x4_t a1 = vdupq_n_f32(a[1]); - sums[1] = 
vaddq_f32(sums[1], vmulq_f32(b0, a1)); - sums[5] = vaddq_f32(sums[5], vmulq_f32(b1, a1)); - float32x4_t a2 = vdupq_n_f32(a[2]); - sums[2] = vaddq_f32(sums[2], vmulq_f32(b0, a2)); - sums[6] = vaddq_f32(sums[6], vmulq_f32(b1, a2)); - float32x4_t a3 = vdupq_n_f32(a[3]); - sums[3] = vaddq_f32(sums[3], vmulq_f32(b0, a3)); - sums[7] = vaddq_f32(sums[7], vmulq_f32(b1, a3)); - b += 8; - a += 4; - } - AddSums8(sums, 4, mask, c, N); - } - - template void Execute4x8(size_t M, size_t N, size_t K, const float * a, const float * b, float * c) - { - size_t M4 = Simd::AlignLo(M, 4); - size_t N8 = Simd::AlignLo(N, 8); - const int32_t mask[16] = { -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0 }; - const float * tail = (float*)mask + 8 - N + N8; - size_t i = 0; - for (; i < M4; i += 4) - { - size_t j = 0; - for (; j < N8; j += 8) - Kernel4x8(N, K, a + i*K, b + j*K, c + i*N + j, NULL); - if (N8 < N) - Kernel4x8(N, K, a + i*K, b + j*K, c + i*N + j, tail); - } - if (M4 < M) - { - size_t j = 0; - for (; j < N8; j += 8) - KernelMx8(N, K, a + i*K, b + j*K, c + i*N + j, NULL, M - M4); - if (N8 < N) - KernelMx8(N, K, a + i*K, b + j*K, c + i*N + j, tail, M - M4); - } - } - - void Execute(size_t M, size_t N, size_t K, const float * a, const float * b, float * c, size_t cellA, size_t cellB) - { - if (cellA == 4) - { - if (cellB == 4) - Execute4x4(M, N, K, a, b, c); - if (cellB == 8) - Execute4x8(M, N, K, a, b, c); - } - } - } - - - namespace Ver2 - { - void PrepareB(const float * src, size_t srcWidth, size_t srcHeight, size_t srcDepth, size_t padX, size_t padY, float * dst, size_t dstWidth, size_t dstHeight) - { - for (size_t channel = 0; channel < srcDepth; ++channel) - { - const float * s = src; - float * d = dst; - memset(d, 0, padY*dstWidth * 4); - d += padY*dstWidth; - for (size_t row = padY; row < dstHeight - padY; ++row) - { - memset(d, 0, padX * 4); - memcpy(d + padX, s, srcWidth * 4); - memset(d + padX + srcWidth, 0, padX * 4); - d += dstWidth; - s += srcWidth; - } - memset(d, 0, padY*dstWidth * 4); - src += srcWidth*srcHeight; - dst += dstWidth*dstHeight; - } - } - - template void AddConvolution(const float * src, size_t srcWidth, size_t srcHeight, size_t srcDepth, - const float * weight, float * dst, size_t dstWidth, size_t dstHeight, size_t dstDepth) - { - size_t alignedWidth = AlignLo(dstWidth, F); - float32x4_t tailMask = RightNotZero32f(dstWidth - alignedWidth); - float32x4_t _weight[kernelX*kernelY]; - for (size_t srcChannel = 0; srcChannel < srcDepth; ++srcChannel) - { - for (size_t dstChannel = 0; dstChannel < dstDepth; ++dstChannel) - { - const float * psrc = src + srcWidth*srcHeight*srcChannel; - const float * pweight = weight + (dstChannel*srcDepth + srcChannel)*kernelX*kernelY; - float * pdst = dst + dstWidth*dstHeight*dstChannel; - LoadWeightsForward(pweight, _weight); - for (size_t row = 0; row < dstHeight; ++row) - { - size_t col = 0; - for (; col < alignedWidth; col += F) - { - float32x4_t _dst = Load(pdst + col); - _dst = vaddq_f32(_dst, Convolution::template Forward(psrc + col, srcWidth, _weight)); - Store(pdst + col, _dst); - } - if (dstWidth - alignedWidth) - { - size_t col = dstWidth - F; - float32x4_t _dst = Load(pdst + col); - _dst = vaddq_f32(_dst, And(tailMask, Convolution::template Forward(psrc + col, srcWidth, _weight))); - Store(pdst + col, _dst); - } - psrc += srcWidth; - pdst += dstWidth; - } - } - } - } - - void Execute(const float * src, size_t srcWidth, size_t srcHeight, size_t srcDepth, - const float * weight, size_t kernelX, size_t kernelY, float * dst, size_t 
dstWidth, size_t dstHeight, size_t dstDepth) - { - assert(kernelX == kernelY); - if (kernelX == 2) - AddConvolution(src, srcWidth, srcHeight, srcDepth, weight, dst, dstWidth, dstHeight, dstDepth); - else if (kernelX == 3) - AddConvolution(src, srcWidth, srcHeight, srcDepth, weight, dst, dstWidth, dstHeight, dstDepth); - else if (kernelX == 4) - AddConvolution(src, srcWidth, srcHeight, srcDepth, weight, dst, dstWidth, dstHeight, dstDepth); - else if (kernelX == 5) - AddConvolution(src, srcWidth, srcHeight, srcDepth, weight, dst, dstWidth, dstHeight, dstDepth); - else - assert(0); - } - - bool Preferable(size_t srcDepth, size_t kernelX, size_t kernelY, size_t strideX, size_t strideY, size_t dilationX, size_t dilationY, size_t dstWidth, size_t dstHeight, size_t dstDepth) - { - if (kernelX == kernelY && kernelX >= 2 && kernelX <= 5 && strideX*strideY*dilationX*dilationY == 1) - { - if (dstWidth*dstHeight*kernelX*kernelY >= 8 * 8 * 5 * 5) - return true; - } - return false; - } - } - - struct Opt - { - enum Alg - { - None, - Ver0, - Ver1, - Ver2, - } alg; - - size_t sizeA; - size_t sizeB; - size_t sizeT; - - size_t cellA; - size_t cellB; - - size_t M, N, K; - size_t strideB; - size_t paddedW; - size_t paddedH; - - Opt(size_t srcWidth, size_t srcHeight, size_t srcDepth, size_t kernelX, size_t kernelY, size_t padX, size_t padY, size_t strideX, size_t strideY, size_t dilationX, size_t dilationY, size_t dstWidth, size_t dstHeight, size_t dstDepth) - { - alg = None; - sizeA = 0; - sizeB = 0; - sizeT = 0; - cellA = 1; - cellB = 1; - - M = dstDepth; - N = dstHeight*dstWidth; - K = kernelX*kernelY*srcDepth; - - if (dstWidth*dstHeight / kernelX <= 2000) - alg = Ver0; - else - alg = Ver1; - if (Ver2::Preferable(srcDepth, kernelX, kernelY, strideX, strideY, dilationX, dilationY, dstWidth, dstHeight, dstDepth)) - alg = Ver2; - - switch (alg) - { - case Ver0: - sizeB = N*K; - break; - case Ver1: - cellA = 4; - cellB = 8; - sizeA = M*K; - strideB = Simd::AlignHi(N, cellB); - sizeB = strideB*K; - if (kernelX*kernelY > 1) - sizeT = sizeB; - break; - case Ver2: - if (padX > 0 || padY > 0) - { - paddedW = Simd::AlignHi(srcWidth + 2 * padX, F); - paddedH = srcHeight + 2 * padY; - sizeB = paddedW*paddedH*srcDepth; - } - else - { - paddedW = srcWidth; - paddedH = srcHeight; - } - break; - default: - assert(0); - break; - } - } - }; - - struct Data - { - float * a; - float * b; - float * t; - - Data(size_t sizeA, size_t sizeB, size_t sizeT, void * externalData, size_t * externalSize) - : a(0) - , b(0) - , _data(0) - { - sizeA = AlignHi(sizeA, F); - sizeB = AlignHi(sizeB, F); - sizeT = AlignHi(sizeT, F); - size_t size = (sizeA + sizeB + sizeT) * sizeof(float); - if (size == 0) - return; - if (externalData != AlignHi(externalData, SIMD_ALIGN)) - size += SIMD_ALIGN; - float * data = NULL; - if (externalData == NULL || externalSize == NULL || *externalSize < size) - { - _data = Simd::Allocate(size); - if (externalSize) - *externalSize = size; - data = (float*)_data; - } - else - data = (float*)AlignHi(externalData, SIMD_ALIGN); - if (sizeA) - a = data; - if (sizeB) - b = data + sizeA; - if (sizeT) - t = data + sizeA + sizeB; - } - - ~Data() - { - if (_data) - Simd::Free(_data); - } - - private: - void * _data; - }; - } - - void NeuralAddConvolution2x2Forward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride, F) && Aligned(dst) && Aligned(dstStride, F)) - NeuralAddConvolutionForward(src, srcStride, width, 
height, weights, dst, dstStride); - else - NeuralAddConvolutionForward(src, srcStride, width, height, weights, dst, dstStride); - } - - void NeuralAddConvolution3x3Forward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride, F) && Aligned(dst) && Aligned(dstStride, F)) - NeuralAddConvolutionForward(src, srcStride, width, height, weights, dst, dstStride); - else - NeuralAddConvolutionForward(src, srcStride, width, height, weights, dst, dstStride); - } - - void NeuralAddConvolution4x4Forward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride, F) && Aligned(dst) && Aligned(dstStride, F)) - NeuralAddConvolutionForward(src, srcStride, width, height, weights, dst, dstStride); - else - NeuralAddConvolutionForward(src, srcStride, width, height, weights, dst, dstStride); - } - - void NeuralAddConvolution5x5Forward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride, F) && Aligned(dst) && Aligned(dstStride, F)) - NeuralAddConvolutionForward(src, srcStride, width, height, weights, dst, dstStride); - else - NeuralAddConvolutionForward(src, srcStride, width, height, weights, dst, dstStride); - } - - template struct If - { - template static SIMD_INLINE void AddMultiplied(const float * src, size_t aligned, size_t partial, size_t full, float value, float * dst) - { - Neon::AddMultiplied(src, aligned, partial, full, value, dst); - } - }; - - template<> struct If - { - template static SIMD_INLINE void AddMultiplied(const float * src, size_t aligned, size_t partial, size_t full, float value, float * dst) - { - } - }; - - template void NeuralAddConvolutionBackwardSmall(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride) - { - size_t aligned = AlignLo(width, QF); - size_t partial = AlignLo(width, F); - for (size_t row = 0; row < height; ++row) - { - for (size_t dy = 0; dy < coreY; ++dy) - { - const float * w = weights + dy * coreX; - float * d = dst + dy*dstStride; - If < 0 < coreX > ::template AddMultiplied(src, aligned, partial, width, w[0], d + 0); - If < 1 < coreX > ::template AddMultiplied(src, aligned, partial, width, w[1], d + 1); - If < 2 < coreX > ::template AddMultiplied(src, aligned, partial, width, w[2], d + 2); - If < 3 < coreX > ::template AddMultiplied(src, aligned, partial, width, w[3], d + 3); - If < 4 < coreX > ::template AddMultiplied(src, aligned, partial, width, w[4], d + 4); - } - src += srcStride; - dst += dstStride; - } - } - - template void NeuralAddConvolutionBackwardLarge(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride) - { - Buffer buffer(width); - height += coreY - 1; - width += coreX - 1; - size_t alignedWidth = AlignLo(width, F); - float32x4_t tailMask = RightNotZero32f(width - alignedWidth); - float32x4_t _weights[coreX*coreY]; - LoadWeightsBackward(weights, _weights); - - for (size_t row = 0; row < height; ++row) - { - buffer.Update(row <= height - coreY ? 
src : NULL); - for (size_t col = 0; col < alignedWidth; col += F) - { - float32x4_t _dst = Load(dst + col); - _dst = vaddq_f32(_dst, Convolution::template Backward(buffer, col, _weights)); - Store(dst + col, _dst); - } - if (width - alignedWidth) - { - size_t col = width - F; - float32x4_t _dst = Load(dst + col); - _dst = vaddq_f32(_dst, And(tailMask, Convolution::template Backward(buffer, col, _weights))); - Store(dst + col, _dst); - } - src += srcStride; - dst += dstStride; - } - } - - template void NeuralAddConvolutionBackward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride) - { - if (width*height < 1024) - NeuralAddConvolutionBackwardSmall(src, srcStride, width, height, weights, dst, dstStride); - else - NeuralAddConvolutionBackwardLarge(src, srcStride, width, height, weights, dst, dstStride); - } - - void NeuralAddConvolution2x2Backward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride, F) && Aligned(dst) && Aligned(dstStride, F)) - NeuralAddConvolutionBackward(src, srcStride, width, height, weights, dst, dstStride); - else - NeuralAddConvolutionBackward(src, srcStride, width, height, weights, dst, dstStride); - } - - void NeuralAddConvolution3x3Backward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride, F) && Aligned(dst) && Aligned(dstStride, F)) - NeuralAddConvolutionBackward(src, srcStride, width, height, weights, dst, dstStride); - else - NeuralAddConvolutionBackward(src, srcStride, width, height, weights, dst, dstStride); - } - - void NeuralAddConvolution4x4Backward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride, F) && Aligned(dst) && Aligned(dstStride, F)) - NeuralAddConvolutionBackward(src, srcStride, width, height, weights, dst, dstStride); - else - NeuralAddConvolutionBackward(src, srcStride, width, height, weights, dst, dstStride); - } - - void NeuralAddConvolution5x5Backward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride, F) && Aligned(dst) && Aligned(dstStride, F)) - NeuralAddConvolutionBackward(src, srcStride, width, height, weights, dst, dstStride); - else - NeuralAddConvolutionBackward(src, srcStride, width, height, weights, dst, dstStride); - } - - template SIMD_INLINE void NeuralAddConvolutionSum(const float * src, size_t srcStride, const float * dst, size_t dstStride, size_t width, size_t height, float * sums) - { - size_t alignedWidth = Simd::AlignLo(width, F); - float32x4_t tailMask = RightNotZero32f(width - alignedWidth); - float32x4_t _sums[coreX*coreY]; - memset(_sums, 0, sizeof(_sums)); - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += F) - { - float32x4_t _dst = Load(dst + col); - Convolution::template Sum(src + col, srcStride, _dst, _sums); - } - if (alignedWidth < width) - { - size_t col = width - F; - float32x4_t _dst = And(tailMask, Load(dst + col)); - Convolution::template Sum(src + col, srcStride, _dst, _sums); - } - src += srcStride; - dst += dstStride; - } - for (size_t i = 0; i < coreX*coreY; ++i) - sums[i] += ExtractSum32f(_sums[i]); - } - - void NeuralAddConvolution2x2Sum(const 
float * src, size_t srcStride, const float * dst, size_t dstStride, size_t width, size_t height, float * sums) - { - if (Aligned(src) && Aligned(srcStride, F) && Aligned(dst) && Aligned(dstStride, F)) - NeuralAddConvolutionSum(src, srcStride, dst, dstStride, width, height, sums); - else - NeuralAddConvolutionSum(src, srcStride, dst, dstStride, width, height, sums); - } - - void NeuralAddConvolution3x3Sum(const float * src, size_t srcStride, const float * dst, size_t dstStride, size_t width, size_t height, float * sums) - { - if (Aligned(src) && Aligned(srcStride, F) && Aligned(dst) && Aligned(dstStride, F)) - NeuralAddConvolutionSum(src, srcStride, dst, dstStride, width, height, sums); - else - NeuralAddConvolutionSum(src, srcStride, dst, dstStride, width, height, sums); - } - - void NeuralAddConvolution4x4Sum(const float * src, size_t srcStride, const float * dst, size_t dstStride, size_t width, size_t height, float * sums) - { - if (Aligned(src) && Aligned(srcStride, F) && Aligned(dst) && Aligned(dstStride, F)) - NeuralAddConvolutionSum(src, srcStride, dst, dstStride, width, height, sums); - else - NeuralAddConvolutionSum(src, srcStride, dst, dstStride, width, height, sums); - } - - void NeuralAddConvolution5x5Sum(const float * src, size_t srcStride, const float * dst, size_t dstStride, size_t width, size_t height, float * sums) - { - if (Aligned(src) && Aligned(srcStride, F) && Aligned(dst) && Aligned(dstStride, F)) - NeuralAddConvolutionSum(src, srcStride, dst, dstStride, width, height, sums); - else - NeuralAddConvolutionSum(src, srcStride, dst, dstStride, width, height, sums); - } - - template SIMD_INLINE float32x4_t Pooling1x1Max3x1Body(const float * src) - { - return vmaxq_f32(vmaxq_f32(Load(src - 1), Load(src)), Load(src + 1)); - } - - template SIMD_INLINE void Pooling1x1Max3x3Body(const float * src, size_t stride, float * dst) - { - float32x4_t src0 = Pooling1x1Max3x1Body(src - stride); - float32x4_t src1 = Pooling1x1Max3x1Body(src); - float32x4_t src2 = Pooling1x1Max3x1Body(src + stride); - Store(dst, vmaxq_f32(vmaxq_f32(src0, src1), src2)); - } - - template SIMD_INLINE void Pooling1x1Max3x2Body(const float * src, size_t stride, float * dst) - { - float32x4_t src0 = Pooling1x1Max3x1Body(src); - float32x4_t src1 = Pooling1x1Max3x1Body(src + stride); - Store(dst, vmaxq_f32(src0, src1)); - } - - template SIMD_INLINE float32x4_t Pooling1x1Max3x1Nose(const float * src) - { - float32x4_t src1 = Load(src); - float32x4_t src0 = vextq_f32(vextq_f32(src1, src1, 1), src1, 3); - float32x4_t src2 = Load(src + 1); - return vmaxq_f32(vmaxq_f32(src0, src1), src2); - } - - template SIMD_INLINE void Pooling1x1Max3x3Nose(const float * src, size_t stride, float * dst) - { - float32x4_t src0 = Pooling1x1Max3x1Nose(src - stride); - float32x4_t src1 = Pooling1x1Max3x1Nose(src); - float32x4_t src2 = Pooling1x1Max3x1Nose(src + stride); - Store(dst, vmaxq_f32(vmaxq_f32(src0, src1), src2)); - } - template SIMD_INLINE void Pooling1x1Max3x2Nose(const float * src, size_t stride, float * dst) - { - float32x4_t src0 = Pooling1x1Max3x1Nose(src); - float32x4_t src1 = Pooling1x1Max3x1Nose(src + stride); - Store(dst, vmaxq_f32(src0, src1)); - } - - template SIMD_INLINE float32x4_t Pooling1x1Max3x1Tail(const float * src) - { - float32x4_t src0 = Load(src - 1); - float32x4_t src1 = Load(src); - float32x4_t src2 = vextq_f32(src1, vextq_f32(src1, src1, 3), 1); - return vmaxq_f32(vmaxq_f32(src0, src1), src2); - } - - template SIMD_INLINE void Pooling1x1Max3x3Tail(const float * src, size_t stride, float * dst) - { - 
float32x4_t src0 = Pooling1x1Max3x1Tail(src - stride); - float32x4_t src1 = Pooling1x1Max3x1Tail(src); - float32x4_t src2 = Pooling1x1Max3x1Tail(src + stride); - Store(dst, vmaxq_f32(vmaxq_f32(src0, src1), src2)); - } - template SIMD_INLINE void Pooling1x1Max3x2Tail(const float * src, size_t stride, float * dst) - { - float32x4_t src0 = Pooling1x1Max3x1Tail(src); - float32x4_t src1 = Pooling1x1Max3x1Tail(src + stride); - Store(dst, vmaxq_f32(src0, src1)); - } - - template void NeuralPooling1x1Max3x3(const float * src, size_t srcStride, size_t width, size_t height, float * dst, size_t dstStride) - { - assert(width > F && height > 1); - - size_t alignedWidth = AlignHi(width, F) - F; - height -= 1; - - Pooling1x1Max3x2Nose(src, srcStride, dst); - for (size_t col = F; col < alignedWidth; col += F) - Pooling1x1Max3x2Body(src + col, srcStride, dst + col); - Pooling1x1Max3x2Tail(src + width - F, srcStride, dst + width - F); - - for (size_t row = 1; row < height; ++row) - { - src += srcStride; - dst += dstStride; - Pooling1x1Max3x3Nose(src, srcStride, dst); - for (size_t col = F; col < alignedWidth; col += F) - Pooling1x1Max3x3Body(src + col, srcStride, dst + col); - Pooling1x1Max3x3Tail(src + width - F, srcStride, dst + width - F); - } - - dst += dstStride; - Pooling1x1Max3x2Nose(src, srcStride, dst); - for (size_t col = F; col < alignedWidth; col += F) - Pooling1x1Max3x2Body(src + col, srcStride, dst + col); - Pooling1x1Max3x2Tail(src + width - F, srcStride, dst + width - F); - } - - void NeuralPooling1x1Max3x3(const float * src, size_t srcStride, size_t width, size_t height, float * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride, F) && Aligned(dst) && Aligned(dstStride, F)) - NeuralPooling1x1Max3x3(src, srcStride, width, height, dst, dstStride); - else - NeuralPooling1x1Max3x3(src, srcStride, width, height, dst, dstStride); - } - - template SIMD_INLINE float32x4_t Pooling2x2Max2x2(const float * src, size_t stride) - { - float32x4_t s0 = vmaxq_f32(Load(src + 0), Load(src + stride + 0)); - float32x4_t s1 = vmaxq_f32(Load(src + F), Load(src + stride + F)); - return vcombine_f32(vpmax_f32(vget_low_f32(s0), vget_high_f32(s0)), vpmax_f32(vget_low_f32(s1), vget_high_f32(s1))); - } - - template SIMD_INLINE float32x4_t Pooling2x2Max2(const float * src) - { - float32x4_t s0 = Load(src + 0); - float32x4_t s1 = Load(src + F); - return vcombine_f32(vpmax_f32(vget_low_f32(s0), vget_high_f32(s0)), vpmax_f32(vget_low_f32(s1), vget_high_f32(s1))); - } - - template void NeuralPooling2x2Max2x2(const float * src, size_t srcStride, size_t width, size_t height, float * dst, size_t dstStride) - { - if (align) - assert(Aligned(src) && Aligned(srcStride, F) && Aligned(dst) && Aligned(dstStride, F)); - - size_t heightEven = Simd::AlignLo(height, 2); - size_t widthEven = Simd::AlignLo(width, 2); - size_t alignedWidth = AlignLo(width, DF); - for (size_t row = 0; row < heightEven; row += 2) - { - for (size_t col = 0; col < alignedWidth; col += DF) - Store(dst + (col >> 1), Pooling2x2Max2x2(src + col, srcStride)); - if (widthEven - alignedWidth) - { - size_t col = widthEven - DF; - Store(dst + (col >> 1), Pooling2x2Max2x2(src + col, srcStride)); - } - if (width - widthEven) - dst[widthEven >> 1] = Simd::Max(src[widthEven], src[widthEven + srcStride]); - src += 2 * srcStride; - dst += dstStride; - } - if (height - heightEven) - { - for (size_t col = 0; col < alignedWidth; col += DF) - Store(dst + (col >> 1), Pooling2x2Max2(src + col)); - if (widthEven - alignedWidth) - { - size_t col = widthEven - DF; - 
Store(dst + (col >> 1), Pooling2x2Max2(src + col)); - } - if (width - widthEven) - dst[widthEven >> 1] = src[widthEven]; - } - } - - void NeuralPooling2x2Max2x2(const float * src, size_t srcStride, size_t width, size_t height, float * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride, F) && Aligned(dst) && Aligned(dstStride, F)) - NeuralPooling2x2Max2x2(src, srcStride, width, height, dst, dstStride); - else - NeuralPooling2x2Max2x2(src, srcStride, width, height, dst, dstStride); - } - - SIMD_INLINE float Max2(const float * src) - { - return Simd::Max(src[0], src[1]); - } - - SIMD_INLINE float Max2x2(const float * src, size_t stride) - { - return Simd::Max(Max2(src), Max2(src + stride)); - } - - SIMD_INLINE float Max2x3(const float * src, size_t stride) - { - return Simd::Max(Max2(src), Simd::Max(Max2(src + stride), Max2(src + 2 * stride))); - } - - template SIMD_INLINE float32x4_t Pooling2x2Max1x3(const float * src, size_t stride) - { - return vmaxq_f32(vmaxq_f32(Load(src), Load(src + stride)), Load(src + 2 * stride)); - } - - const uint8x8_t K8_TBL_BITS_LO = SIMD_VEC_SETR_EPI16(0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B); - const uint8x8_t K8_TBL_BITS_HI = SIMD_VEC_SETR_EPI16(0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D, 0x0E, 0x0F); - - SIMD_INLINE float32x4_t CombineFor2x2(const float32x4_t & lo, const float32x4_t & hi) - { - return vcombine_f32((float32x2_t)vtbl2_u8((const uint8x8x2_t &)lo, K8_TBL_BITS_LO), (float32x2_t)vtbl2_u8((const uint8x8x2_t &)hi, K8_TBL_BITS_HI)); - } - - template SIMD_INLINE float32x4_t Pooling2x2Max3x3(const float * src, size_t stride) - { - float32x4_t _0123 = Pooling2x2Max1x3(src, stride); - float32x4_t _4567 = Pooling2x2Max1x3(src + F, stride); - float32x4_t _5678 = Pooling2x2Max1x3(src + F + 1, stride); - float32x4x2_t _02461357 = vuzpq_f32(_0123, _4567); - float32x4_t _2468 = CombineFor2x2(_02461357.val[0], _5678); - return vmaxq_f32(vmaxq_f32(_02461357.val[0], _02461357.val[1]), _2468); - } - - template SIMD_INLINE float32x4_t Pooling2x2Max1x2(const float * src, size_t stride) - { - return vmaxq_f32(Load(src), Load(src + stride)); - } - - template SIMD_INLINE float32x4_t Pooling2x2Max3x2(const float * src, size_t stride) - { - float32x4_t _0123 = Pooling2x2Max1x2(src, stride); - float32x4_t _4567 = Pooling2x2Max1x2(src + F, stride); - float32x4_t _5678 = Pooling2x2Max1x2(src + F + 1, stride); - float32x4x2_t _02461357 = vuzpq_f32(_0123, _4567); - float32x4_t _2468 = CombineFor2x2(_02461357.val[0], _5678); - return vmaxq_f32(vmaxq_f32(_02461357.val[0], _02461357.val[1]), _2468); - } - - template void NeuralPooling2x2Max3x3(const float * src, size_t srcStride, size_t width, size_t height, float * dst, size_t dstStride) - { - height -= 1; - width -= 1; - size_t heightEven = Simd::AlignLo(height, 2); - size_t widthEven = Simd::AlignLo(width, 2); - size_t alignedWidth = AlignLo(width, DF); - for (size_t row = 0; row < heightEven; row += 2) - { - for (size_t col = 0; col < alignedWidth; col += DF) - Store(dst + (col >> 1), Pooling2x2Max3x3(src + col, srcStride)); - if (widthEven - alignedWidth) - { - size_t col = widthEven - DF; - Store(dst + (col >> 1), Pooling2x2Max3x3(src + col, srcStride)); - } - if (width - widthEven) - dst[widthEven >> 1] = Max2x3(src + widthEven, srcStride); - src += 2 * srcStride; - dst += dstStride; - } - if (height - heightEven) - { - for (size_t col = 0; col < alignedWidth; col += DF) - Store(dst + (col >> 1), Pooling2x2Max3x2(src + col, srcStride)); - if (widthEven - alignedWidth) - { - size_t col = widthEven - DF; - 
Store(dst + (col >> 1), Pooling2x2Max3x2(src + col, srcStride)); - } - if (width - widthEven) - dst[widthEven >> 1] = Max2x2(src + widthEven, srcStride); - } - } - - void NeuralPooling2x2Max3x3(const float * src, size_t srcStride, size_t width, size_t height, float * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride, F) && Aligned(dst) && Aligned(dstStride, F)) - NeuralPooling2x2Max3x3(src, srcStride, width, height, dst, dstStride); - else - NeuralPooling2x2Max3x3(src, srcStride, width, height, dst, dstStride); - } - - void NeuralConvolutionForward(const float * src, size_t srcWidth, size_t srcHeight, size_t srcDepth, - const float * weight, size_t kernelX, size_t kernelY, size_t padX, size_t padY, size_t strideX, size_t strideY, size_t dilationX, size_t dilationY, - void * buffer, size_t * size, float * dst, size_t dstWidth, size_t dstHeight, size_t dstDepth, int add) - { - using namespace Ncf; - - assert(dstWidth == (srcWidth + 2 * padX - (dilationX * (kernelX - 1) + 1)) / strideX + 1); - assert(dstHeight == (srcHeight + 2 * padY - (dilationY * (kernelY - 1) + 1)) / strideY + 1); - - if (!add) - memset(dst, 0, dstWidth*dstHeight*dstDepth * sizeof(float)); - - Opt opt(srcWidth, srcHeight, srcDepth, kernelX, kernelY, padX, padY, strideX, strideY, dilationX, dilationY, dstWidth, dstHeight, dstDepth); - - Data data(opt.sizeA, opt.sizeB, opt.sizeT, buffer, size); - - if (opt.sizeA) - { - switch (opt.alg) - { - case Opt::Ver1: Ver1::PrepareA(weight, opt.M, opt.K, opt.cellA, data.a); - default: - break; - } - } - else - data.a = (float*)weight; - - if (opt.sizeB) - { - switch (opt.alg) - { - case Opt::Ver0: Ver0::PrepareB(src, srcWidth, srcHeight, srcDepth, kernelX, kernelY, padX, padY, strideX, strideY, dilationX, dilationY, dstWidth, dstHeight, data.b); break; - case Opt::Ver1: Ver1::PrepareB(src, srcWidth, srcHeight, srcDepth, kernelX, kernelY, padX, padY, strideX, strideY, dilationX, dilationY, dstWidth, dstHeight, opt.cellB, data.t, data.b); break; - case Opt::Ver2: Ver2::PrepareB(src, srcWidth, srcHeight, srcDepth, padX, padY, data.b, opt.paddedW, opt.paddedH); break; - default: break; - } - } - else - data.b = (float*)src; - - switch (opt.alg) - { - case Opt::Ver0: Ver0::Execute(opt.M, opt.N, opt.K, data.a, data.b, dst); break; - case Opt::Ver1: Ver1::Execute(opt.M, opt.N, opt.K, data.a, data.b, dst, opt.cellA, opt.cellB); break; - case Opt::Ver2: Ver2::Execute(data.b, opt.paddedW, opt.paddedH, srcDepth, weight, kernelX, kernelY, dst, dstWidth, dstHeight, dstDepth); break; - default: break; - } - } - - - } -#endif// SIMD_NEON_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdNeonOperation.cpp b/src/3rd/Simd/Simd/SimdNeonOperation.cpp deleted file mode 100644 index fc86dfe0..00000000 --- a/src/3rd/Simd/Simd/SimdNeonOperation.cpp +++ /dev/null @@ -1,231 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. 
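[Annotation] The NeuralConvolutionForward dispatcher above chooses between three strategies: Ver0 repacks the input (im2col) and runs a plain GEMM, Ver1 additionally packs both matrices into 4x8 cells for the blocked NEON GEMM kernels, and Ver2 convolves directly on a zero-padded copy of the source. A scalar mirror of that selection logic, as a sketch only (thresholds copied from the Opt constructor and Ver2::Preferable above; stride and dilation are given here as the common value of both axes):

#include <cstddef>

enum class ConvAlg { Ver0, Ver1, Ver2 };

ConvAlg SelectConvAlgRef(size_t kernelX, size_t kernelY, size_t stride, size_t dilation,
                         size_t dstWidth, size_t dstHeight)
{
    // Small outputs relative to the kernel go to im2col + GEMM; larger ones
    // to the packed GEMM path.
    ConvAlg alg = (dstWidth * dstHeight / kernelX <= 2000) ? ConvAlg::Ver0 : ConvAlg::Ver1;
    // Direct convolution pays off for square 2..5 kernels with unit stride
    // and dilation on reasonably large outputs (>= 8*8 pixels at 5x5 cost).
    if (kernelX == kernelY && kernelX >= 2 && kernelX <= 5 && stride == 1 && dilation == 1
        && dstWidth * dstHeight * kernelX * kernelY >= 8 * 8 * 5 * 5)
        alg = ConvAlg::Ver2;
    return alg;
}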
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdCompare.h" - -namespace Simd -{ -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - template SIMD_INLINE uint8x16_t OperationBinary8u(const uint8x16_t & a, const uint8x16_t & b); - - template <> SIMD_INLINE uint8x16_t OperationBinary8u(const uint8x16_t & a, const uint8x16_t & b) - { - return vrhaddq_u8(a, b); - } - - template <> SIMD_INLINE uint8x16_t OperationBinary8u(const uint8x16_t & a, const uint8x16_t & b) - { - return vandq_u8(a, b); - } - - template <> SIMD_INLINE uint8x16_t OperationBinary8u(const uint8x16_t & a, const uint8x16_t & b) - { - return vorrq_u8(a, b); - } - - template <> SIMD_INLINE uint8x16_t OperationBinary8u(const uint8x16_t & a, const uint8x16_t & b) - { - return vmaxq_u8(a, b); - } - - template <> SIMD_INLINE uint8x16_t OperationBinary8u(const uint8x16_t & a, const uint8x16_t & b) - { - return vminq_u8(a, b); - } - - template <> SIMD_INLINE uint8x16_t OperationBinary8u(const uint8x16_t & a, const uint8x16_t & b) - { - return vqsubq_u8(a, b); - } - - template <> SIMD_INLINE uint8x16_t OperationBinary8u(const uint8x16_t & a, const uint8x16_t & b) - { - return vqaddq_u8(a, b); - } - - template void OperationBinary8u(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, - size_t width, size_t height, size_t channelCount, uint8_t * dst, size_t dstStride) - { - assert(width*channelCount >= A); - if (align) - assert(Aligned(a) && Aligned(aStride) && Aligned(b) && Aligned(bStride) && Aligned(dst) && Aligned(dstStride)); - - size_t size = channelCount*width; - size_t alignedSize = Simd::AlignLo(size, A); - for (size_t row = 0; row < height; ++row) - { - for (size_t offset = 0; offset < alignedSize; offset += A) - { - const uint8x16_t a_ = Load(a + offset); - const uint8x16_t b_ = Load(b + offset); - Store(dst + offset, OperationBinary8u(a_, b_)); - } - if (alignedSize != size) - { - const uint8x16_t a_ = Load(a + size - A); - const uint8x16_t b_ = Load(b + size - A); - Store(dst + size - A, OperationBinary8u(a_, b_)); - } - a += aStride; - b += bStride; - dst += dstStride; - } - } - - template void OperationBinary8u(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, - size_t width, size_t height, size_t channelCount, uint8_t * dst, size_t dstStride, SimdOperationBinary8uType type) - { - switch (type) - { - case SimdOperationBinary8uAverage: - return OperationBinary8u(a, aStride, b, bStride, width, height, channelCount, dst, dstStride); - case SimdOperationBinary8uAnd: - return OperationBinary8u(a, aStride, b, bStride, width, height, channelCount, dst, dstStride); - case SimdOperationBinary8uOr: - return OperationBinary8u(a, aStride, b, bStride, width, height, channelCount, dst, dstStride); - case SimdOperationBinary8uMaximum: - return OperationBinary8u(a, aStride, b, bStride, width, height, channelCount, dst, dstStride); - case SimdOperationBinary8uMinimum: - return OperationBinary8u(a, aStride, b, bStride, width, height, channelCount, dst, dstStride); - case 
SimdOperationBinary8uSaturatedSubtraction: - return OperationBinary8u(a, aStride, b, bStride, width, height, channelCount, dst, dstStride); - case SimdOperationBinary8uSaturatedAddition: - return OperationBinary8u(a, aStride, b, bStride, width, height, channelCount, dst, dstStride); - default: - assert(0); - } - } - - void OperationBinary8u(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, - size_t width, size_t height, size_t channelCount, uint8_t * dst, size_t dstStride, SimdOperationBinary8uType type) - { - if (Aligned(a) && Aligned(aStride) && Aligned(b) && Aligned(bStride) && Aligned(dst) && Aligned(dstStride)) - OperationBinary8u(a, aStride, b, bStride, width, height, channelCount, dst, dstStride, type); - else - OperationBinary8u(a, aStride, b, bStride, width, height, channelCount, dst, dstStride, type); - } - - template SIMD_INLINE int16x8_t OperationBinary16i(const int16x8_t & a, const int16x8_t & b); - - template <> SIMD_INLINE int16x8_t OperationBinary16i(const int16x8_t & a, const int16x8_t & b) - { - return vaddq_s16(a, b); - } - - template <> SIMD_INLINE int16x8_t OperationBinary16i(const int16x8_t & a, const int16x8_t & b) - { - return vsubq_s16(a, b); - } - - template void OperationBinary16i(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, - size_t width, size_t height, uint8_t * dst, size_t dstStride) - { - assert(width * sizeof(uint16_t) >= A); - if (align) - assert(Aligned(a) && Aligned(aStride) && Aligned(b) && Aligned(bStride) && Aligned(dst) && Aligned(dstStride)); - - size_t size = width * sizeof(int16_t); - size_t alignedSize = Simd::AlignLo(size, A); - for (size_t row = 0; row < height; ++row) - { - for (size_t offset = 0; offset < alignedSize; offset += A) - { - const int16x8_t a_ = (int16x8_t)Load(a + offset); - const int16x8_t b_ = (int16x8_t)Load(b + offset); - Store(dst + offset, (uint8x16_t)OperationBinary16i(a_, b_)); - } - if (alignedSize != size) - { - const int16x8_t a_ = (int16x8_t)Load(a + size - A); - const int16x8_t b_ = (int16x8_t)Load(b + size - A); - Store(dst + size - A, (uint8x16_t)OperationBinary16i(a_, b_)); - } - a += aStride; - b += bStride; - dst += dstStride; - } - } - - template void OperationBinary16i(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, - size_t width, size_t height, uint8_t * dst, size_t dstStride, SimdOperationBinary16iType type) - { - switch (type) - { - case SimdOperationBinary16iAddition: - return OperationBinary16i(a, aStride, b, bStride, width, height, dst, dstStride); - case SimdOperationBinary16iSubtraction: - return OperationBinary16i(a, aStride, b, bStride, width, height, dst, dstStride); - default: - assert(0); - } - } - - void OperationBinary16i(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, - size_t width, size_t height, uint8_t * dst, size_t dstStride, SimdOperationBinary16iType type) - { - if (Aligned(a) && Aligned(aStride) && Aligned(b) && Aligned(bStride) && Aligned(dst) && Aligned(dstStride)) - OperationBinary16i(a, aStride, b, bStride, width, height, dst, dstStride, type); - else - OperationBinary16i(a, aStride, b, bStride, width, height, dst, dstStride, type); - } - - template SIMD_INLINE void VectorProduct(const uint16x8_t & vertical, const uint8_t * horizontal, uint8_t * dst) - { - uint8x16x2_t _horizontal = vzipq_u8(Load(horizontal), K8_00); - _horizontal.val[0] = (uint8x16_t)DivideI16By255(vmulq_u16(vertical, (uint16x8_t)_horizontal.val[0])); - _horizontal.val[1] = (uint8x16_t)DivideI16By255(vmulq_u16(vertical, 
(uint16x8_t)_horizontal.val[1])); - Store(dst, vuzpq_u8(_horizontal.val[0], _horizontal.val[1]).val[0]); - } - - template void VectorProduct(const uint8_t * vertical, const uint8_t * horizontal, uint8_t * dst, size_t stride, size_t width, size_t height) - { - assert(width >= A); - if (align) - assert(Aligned(horizontal) && Aligned(dst) && Aligned(stride)); - - size_t alignedWidth = Simd::AlignLo(width, A); - for (size_t row = 0; row < height; ++row) - { - uint16x8_t _vertical = vmovq_n_u16(vertical[row]); - for (size_t col = 0; col < alignedWidth; col += A) - VectorProduct(_vertical, horizontal + col, dst + col); - if (alignedWidth != width) - VectorProduct(_vertical, horizontal + width - A, dst + width - A); - dst += stride; - } - } - - void VectorProduct(const uint8_t * vertical, const uint8_t * horizontal, uint8_t * dst, size_t stride, size_t width, size_t height) - { - if (Aligned(horizontal) && Aligned(dst) && Aligned(stride)) - VectorProduct(vertical, horizontal, dst, stride, width, height); - else - VectorProduct(vertical, horizontal, dst, stride, width, height); - } - } -#endif// SIMD_NEON_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdNeonReduce.cpp b/src/3rd/Simd/Simd/SimdNeonReduce.cpp deleted file mode 100644 index 0d1af502..00000000 --- a/src/3rd/Simd/Simd/SimdNeonReduce.cpp +++ /dev/null @@ -1,177 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2019 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
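[Annotation] In the color 2x2 reducer that follows, Average() folds each 2x2 block into one pixel with rounding: vpaddlq_u8 sums horizontal byte pairs into 16 bits, the two row sums are added, a bias of 2 is added, and vshrn_n_u16 narrows with a shift by 2; the multi-channel variants deinterleave with Load2/Load3/Load4 so the same averaging runs per channel. A scalar equivalent for one channel value (illustrative sketch, hypothetical helper name):

#include <cstdint>

inline uint8_t Reduce2x2Ref(uint8_t s00, uint8_t s01, uint8_t s10, uint8_t s11)
{
    // Rounded mean of a 2x2 block: (sum + 2) / 4.
    return (uint8_t)((s00 + s01 + s10 + s11 + 2) >> 2);
}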
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" - -namespace Simd -{ -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - SIMD_INLINE uint8x8_t Average(const uint8x16_t & s0, const uint8x16_t & s1) - { - return vshrn_n_u16(vaddq_u16(vaddq_u16(vpaddlq_u8(s0), vpaddlq_u8(s1)), vdupq_n_u16(2)), 2); - } - - SIMD_INLINE uint8x16_t Average(const uint8x16_t & s00, const uint8x16_t & s01, const uint8x16_t & s10, const uint8x16_t & s11) - { - return vcombine_u8(Average(s00, s10), Average(s01, s11)); - } - - template struct Color2x2 - { - template static SIMD_INLINE void Reduce(const uint8_t * src0, const uint8_t * src1, uint8_t * dst) - { - assert(0); - } - }; - - template<> struct Color2x2<1> - { - template static SIMD_INLINE void Reduce(const uint8_t * src0, const uint8_t * src1, uint8_t * dst) - { - Store(dst, Average(Load(src0 + 0), Load(src0 + A), Load(src1 + 0), Load(src1 + A))); - } - }; - - template<> struct Color2x2<2> - { - template static SIMD_INLINE void Reduce(const uint8_t * src0, const uint8_t * src1, uint8_t * dst) - { - uint8x16x2_t s00 = Load2(src0 + 0 * A); - uint8x16x2_t s01 = Load2(src0 + 2 * A); - uint8x16x2_t s10 = Load2(src1 + 0 * A); - uint8x16x2_t s11 = Load2(src1 + 2 * A); - uint8x16x2_t d; - d.val[0] = Average(s00.val[0], s01.val[0], s10.val[0], s11.val[0]); - d.val[1] = Average(s00.val[1], s01.val[1], s10.val[1], s11.val[1]); - Store2(dst, d); - } - }; - - template<> struct Color2x2<3> - { - template static SIMD_INLINE void Reduce(const uint8_t * src0, const uint8_t * src1, uint8_t * dst) - { - uint8x16x3_t s0, s1; - uint8x8x3_t d; - s0 = Load3(src0); - s1 = Load3(src1); - d.val[0] = Average(s0.val[0], s1.val[0]); - d.val[1] = Average(s0.val[1], s1.val[1]); - d.val[2] = Average(s0.val[2], s1.val[2]); - Store3(dst, d); - s0 = Load3(src0 + 3*A); - s1 = Load3(src1 + 3*A); - d.val[0] = Average(s0.val[0], s1.val[0]); - d.val[1] = Average(s0.val[1], s1.val[1]); - d.val[2] = Average(s0.val[2], s1.val[2]); - Store3(dst + 3*HA, d); - } - }; - - template<> struct Color2x2<4> - { - template static SIMD_INLINE void Reduce(const uint8_t * src0, const uint8_t * src1, uint8_t * dst) - { - uint8x16x4_t s0, s1; - uint8x8x4_t d; - s0 = Load4(src0); - s1 = Load4(src1); - d.val[0] = Average(s0.val[0], s1.val[0]); - d.val[1] = Average(s0.val[1], s1.val[1]); - d.val[2] = Average(s0.val[2], s1.val[2]); - d.val[3] = Average(s0.val[3], s1.val[3]); - Store4(dst, d); - s0 = Load4(src0 + 4 * A); - s1 = Load4(src1 + 4 * A); - d.val[0] = Average(s0.val[0], s1.val[0]); - d.val[1] = Average(s0.val[1], s1.val[1]); - d.val[2] = Average(s0.val[2], s1.val[2]); - d.val[3] = Average(s0.val[3], s1.val[3]); - Store4(dst + 4 * HA, d); - } - }; - - template void ReduceColor2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, uint8_t * dst, size_t dstStride) - { - size_t evenWidth = AlignLo(srcWidth, 2); - size_t alignedWidth = AlignLo(srcWidth, A); - size_t evenSize = evenWidth * channelCount; - size_t alignedSize = alignedWidth * channelCount; - size_t srcStep = DA * channelCount, dstStep = A * channelCount; - for (size_t srcRow = 0; srcRow < srcHeight; srcRow += 2) - { - const uint8_t *src0 = src; - const uint8_t *src1 = (srcRow == srcHeight - 1 ? 
src : src + srcStride); - size_t srcOffset = 0, dstOffset = 0; - for (; srcOffset < alignedSize; srcOffset += srcStep, dstOffset += dstStep) - Color2x2::template Reduce(src0 + srcOffset, src1 + srcOffset, dst + dstOffset); - if (alignedSize != evenSize) - { - srcOffset = evenSize - srcStep; - dstOffset = srcOffset / 2; - Color2x2::template Reduce(src0 + srcOffset, src1 + srcOffset, dst + dstOffset); - } - if (evenWidth != srcWidth) - { - for (size_t c = 0; c < channelCount; ++c) - dst[evenSize / 2 + c] = Base::Average(src0[evenSize + c], src1[evenSize + c]); - } - src += 2 * srcStride; - dst += dstStride; - } - } - - template void ReduceColor2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount) - { - assert((srcWidth + 1) / 2 == dstWidth && (srcHeight + 1) / 2 == dstHeight && srcWidth >= DA); - if (align) - { - assert(Aligned(src) && Aligned(srcStride)); - assert(Aligned(dst) && Aligned(dstStride)); - } - - switch (channelCount) - { - case 1: ReduceColor2x2<1, align>(src, srcWidth, srcHeight, srcStride, dst, dstStride); break; - case 2: ReduceColor2x2<2, align>(src, srcWidth, srcHeight, srcStride, dst, dstStride); break; - case 3: ReduceColor2x2<3, align>(src, srcWidth, srcHeight, srcStride, dst, dstStride); break; - case 4: ReduceColor2x2<4, align>(src, srcWidth, srcHeight, srcStride, dst, dstStride); break; - default: assert(0); - } - } - - void ReduceColor2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount) - { - if (Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride)) - ReduceColor2x2(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, channelCount); - else - ReduceColor2x2(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, channelCount); - } - } -#endif// SIMD_NEON_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdNeonReduceGray2x2.cpp b/src/3rd/Simd/Simd/SimdNeonReduceGray2x2.cpp deleted file mode 100644 index 045088bf..00000000 --- a/src/3rd/Simd/Simd/SimdNeonReduceGray2x2.cpp +++ /dev/null @@ -1,126 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2019 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
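[Annotation] Like most kernels in this library, the gray 2x2 reducer below is compiled twice from a single template<bool align> body; the public entry point picks the instantiation at run time from pointer and stride alignment, so the aligned path may use 16-byte aligned vector accesses. A self-contained sketch of that dispatch idiom (simplified 1D signature, hypothetical names):

#include <cstdint>
#include <cstddef>

const size_t kA = 16; // NEON vector width in bytes (A elsewhere in these files)

inline bool IsAlignedRef(const void * p) { return ((uintptr_t)p & (kA - 1)) == 0; }

template <bool align> void HalveRowRef(const uint8_t * src, size_t size, uint8_t * dst)
{
    // Scalar stand-in: average each pair with rounding; the real kernel
    // vectorizes this and uses aligned loads/stores only when align == true.
    for (size_t i = 0; i < size; i += 2)
        dst[i / 2] = (uint8_t)((src[i] + src[i + 1] + 1) >> 1);
}

void HalveRowDispatchRef(const uint8_t * src, size_t size, uint8_t * dst)
{
    if (IsAlignedRef(src) && IsAlignedRef(dst))
        HalveRowRef<true>(src, size, dst);
    else
        HalveRowRef<false>(src, size, dst);
}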
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" - -namespace Simd -{ -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - SIMD_INLINE uint8x8_t Average(const uint8x16_t & s0, const uint8x16_t & s1) - { - return vshrn_n_u16(vaddq_u16(vaddq_u16(vpaddlq_u8(s0), vpaddlq_u8(s1)), vdupq_n_u16(2)), 2); - } - - template SIMD_INLINE void ReduceGray2x2(const uint8_t * src0, const uint8_t * src1, uint8_t * dst) - { - uint8x8x2_t _dst; - _dst.val[0] = Average(Load(src0 + 0), Load(src1 + 0)); - _dst.val[1] = Average(Load(src0 + A), Load(src1 + A)); - Store(dst, *(uint8x16_t*)&_dst); - } - - template SIMD_INLINE void ReduceGray2x2(const uint8_t * src0, const uint8_t * src1, size_t size, uint8_t * dst) - { - for (size_t i = 0; i < size; i += DA, src0 += DA, src1 += DA, dst += A) - ReduceGray2x2(src0, src1, dst); - } - -#if defined(SIMD_NEON_ASM_ENABLE) && 0 - template <> void ReduceGray2x2(const uint8_t * src0, const uint8_t * src1, size_t size, uint8_t * dst) - { - asm( - "mov r4, #2 \n" - "vdup.u16 q4, r4 \n" - "mov r5, %0 \n" - "mov r6, %1 \n" - "mov r4, %2 \n" - "mov r7, %3 \n" - ".loop: \n" - "vld1.8 {q0}, [r5:128]! \n" - "vld1.8 {q2}, [r6:128]! \n" - "vpaddl.u8 q0, q0 \n" - "vpadal.u8 q0, q2 \n" - "vadd.u16 q0, q0, q4 \n" - "vshrn.u16 d10, q0, #2 \n" - "vld1.8 {q1}, [r5:128]! \n" - "vld1.8 {q3}, [r6:128]! \n" - "vpaddl.u8 q1, q1 \n" - "vpadal.u8 q1, q3 \n" - "vadd.u16 q1, q1, q4 \n" - "vshrn.u16 d11, q1, #2 \n" - "vst1.8 {q5}, [r7:128]! \n" - "subs r4, r4, #32 \n" - "bne .loop \n" - - : - : "r"(src0), "r"(src1), "r" (size), "r"(dst) - : "q0", "q1", "q2", "q3", "q4", "q5", "r4", "r5", "r6", "r7", "memory" - ); - } -#endif - - template void ReduceGray2x2( - const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride) - { - assert((srcWidth + 1) / 2 == dstWidth && (srcHeight + 1) / 2 == dstHeight && srcWidth >= DA); - if (align) - { - assert(Aligned(src) && Aligned(srcStride)); - assert(Aligned(dst) && Aligned(dstStride)); - } - - size_t alignedWidth = AlignLo(srcWidth, DA); - size_t evenWidth = AlignLo(srcWidth, 2); - for (size_t srcRow = 0; srcRow < srcHeight; srcRow += 2) - { - const uint8_t * src0 = src; - const uint8_t * src1 = (srcRow == srcHeight - 1 ? src : src + srcStride); - ReduceGray2x2(src0, src1, alignedWidth, dst); - if (alignedWidth != srcWidth) - { - size_t dstOffset = dstWidth - A - (evenWidth != srcWidth ? 1 : 0); - size_t srcOffset = evenWidth - DA; - ReduceGray2x2(src0 + srcOffset, src1 + srcOffset, dst + dstOffset); - if (evenWidth != srcWidth) - dst[dstWidth - 1] = Base::Average(src0[evenWidth], src1[evenWidth]); - } - src += 2 * srcStride; - dst += dstStride; - } - } - - void ReduceGray2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride)) - ReduceGray2x2(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); - else - ReduceGray2x2(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); - } - } -#endif// SIMD_NEON_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdNeonReduceGray3x3.cpp b/src/3rd/Simd/Simd/SimdNeonReduceGray3x3.cpp deleted file mode 100644 index 2c12e37c..00000000 --- a/src/3rd/Simd/Simd/SimdNeonReduceGray3x3.cpp +++ /dev/null @@ -1,118 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. 
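[Annotation] The file that follows, SimdNeonReduceGray3x3.cpp, smooths with the separable binomial kernel [1 2 1] x [1 2 1] (weights summing to 16) before keeping every second pixel; its compensation flag selects rounded (+8) versus truncating division by 16, matching the two DivideBy16 specializations below. A scalar reference for one output pixel (sketch, hypothetical helper name):

#include <cstdint>

inline uint8_t Reduce3x3Ref(const uint8_t * r0, const uint8_t * r1, const uint8_t * r2,
                            bool compensation)
{
    // r0..r2 point at the center column of three consecutive source rows.
    unsigned row0 = r0[-1] + 2 * r0[0] + r0[1];
    unsigned row1 = r1[-1] + 2 * r1[0] + r1[1];
    unsigned row2 = r2[-1] + 2 * r2[0] + r2[1];
    unsigned sum = row0 + 2 * row1 + row2;              // total weight 16
    return (uint8_t)((sum + (compensation ? 8 : 0)) >> 4);
}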
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" - -namespace Simd -{ -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - template SIMD_INLINE uint16x8_t DivideBy16(uint16x8_t value); - - template <> SIMD_INLINE uint16x8_t DivideBy16(uint16x8_t value) - { - return vshrq_n_u16(vaddq_u16(value, K16_0008), 4); - } - - template <> SIMD_INLINE uint16x8_t DivideBy16(uint16x8_t value) - { - return vshrq_n_u16(value, 4); - } - - template SIMD_INLINE uint16x8_t ReduceColNose(const uint8_t * src) - { - uint8x16_t t12 = Load(src); - uint8x16_t t01 = LoadBeforeFirst<1>(t12); - return vaddq_u16(vpaddlq_u8(t01), vpaddlq_u8(t12)); - } - - template SIMD_INLINE uint16x8_t ReduceColBody(const uint8_t * src) - { - uint8x16_t t01 = Load(src - 1); - uint8x16_t t12 = Load(src); - return vaddq_u16(vpaddlq_u8(t01), vpaddlq_u8(t12)); - } - - template SIMD_INLINE uint8x8_t ReduceRow(const uint16x8_t & r0, const uint16x8_t & r1, const uint16x8_t & r2) - { - return vmovn_u16(DivideBy16(BinomialSum16(r0, r1, r2))); - } - - template void ReduceGray3x3( - const uint8_t* src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t* dst, size_t dstWidth, size_t dstHeight, size_t dstStride) - { - assert(srcWidth >= A && (srcWidth + 1) / 2 == dstWidth && (srcHeight + 1) / 2 == dstHeight); - if (align) - assert(Aligned(src) && Aligned(srcStride)); - - size_t lastOddCol = srcWidth - AlignLo(srcWidth, 2); - size_t bodyWidth = AlignLo(srcWidth, A); - for (size_t row = 0; row < srcHeight; row += 2, dst += dstStride, src += 2 * srcStride) - { - const uint8_t * s1 = src; - const uint8_t * s0 = s1 - (row ? srcStride : 0); - const uint8_t * s2 = s1 + (row != srcHeight - 1 ? 
srcStride : 0); - - vst1_u8(dst, ReduceRow(ReduceColNose(s0), - ReduceColNose(s1), ReduceColNose(s2))); - - for (size_t srcCol = A, dstCol = HA; srcCol < bodyWidth; srcCol += A, dstCol += HA) - vst1_u8(dst + dstCol, ReduceRow(ReduceColBody(s0 + srcCol), - ReduceColBody(s1 + srcCol), ReduceColBody(s2 + srcCol))); - - if (bodyWidth != srcWidth) - { - size_t srcCol = srcWidth - A - lastOddCol; - size_t dstCol = dstWidth - HA - lastOddCol; - vst1_u8(dst + dstCol, ReduceRow(ReduceColBody(s0 + srcCol), - ReduceColBody(s1 + srcCol), ReduceColBody(s2 + srcCol))); - if (lastOddCol) - dst[dstWidth - 1] = Base::GaussianBlur3x3(s0 + srcWidth, s1 + srcWidth, s2 + srcWidth, -2, -1, -1); - } - } - } - - template void ReduceGray3x3( - const uint8_t* src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride, int compensation) - { - if (compensation) - ReduceGray3x3(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); - else - ReduceGray3x3(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); - } - - void ReduceGray3x3(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride, int compensation) - { - if (Aligned(src) && Aligned(srcStride)) - ReduceGray3x3(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, compensation); - else - ReduceGray3x3(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, compensation); - } - } -#endif// SIMD_NEON_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdNeonReduceGray4x4.cpp b/src/3rd/Simd/Simd/SimdNeonReduceGray4x4.cpp deleted file mode 100644 index 3a24fbb3..00000000 --- a/src/3rd/Simd/Simd/SimdNeonReduceGray4x4.cpp +++ /dev/null @@ -1,167 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
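[Annotation] The 4x4 reducer that follows uses the binomial kernel [1 3 3 1] on both axes (total weight 64, rounding bias 32, as in DivideBy64 below); column sums are cached in four uint16_t row buffers that are swapped between row pairs instead of recomputed. Scalar reference for one output pixel (illustrative sketch):

#include <cstdint>

inline unsigned Binomial4Ref(unsigned a, unsigned b, unsigned c, unsigned d)
{
    return a + 3 * b + 3 * c + d; // [1 3 3 1], weight 8
}

inline uint8_t Reduce4x4Ref(const uint8_t * rows[4]) // rows[r] points at x-1 of 4 source rows
{
    unsigned c0 = Binomial4Ref(rows[0][0], rows[0][1], rows[0][2], rows[0][3]);
    unsigned c1 = Binomial4Ref(rows[1][0], rows[1][1], rows[1][2], rows[1][3]);
    unsigned c2 = Binomial4Ref(rows[2][0], rows[2][1], rows[2][2], rows[2][3]);
    unsigned c3 = Binomial4Ref(rows[3][0], rows[3][1], rows[3][2], rows[3][3]);
    return (uint8_t)((Binomial4Ref(c0, c1, c2, c3) + 32) >> 6); // weight 64, rounded
}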
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" - -namespace Simd -{ -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - namespace - { - struct Buffer - { - Buffer(size_t width) - { - _p = Allocate(sizeof(uint16_t) * 4 * width); - src0 = (uint16_t*)_p; - src1 = src0 + width; - src2 = src1 + width; - src3 = src2 + width; - } - - ~Buffer() - { - Free(_p); - } - - uint16_t * src0; - uint16_t * src1; - uint16_t * src2; - uint16_t * src3; - private: - void * _p; - }; - } - - SIMD_INLINE uint16x8_t DivideBy64(uint16x8_t value) - { - return vshrq_n_u16(vaddq_u16(value, K16_0020), 6); - } - - SIMD_INLINE uint16x8_t ReduceColNose(const uint8_t * src) - { - const uint8x8x2_t t01 = Deinterleave(LoadBeforeFirst<1>(vld1q_u8(src))); - const uint8x8x2_t t23 = vld2_u8(src + 1); - return vaddq_u16(vaddl_u8(t01.val[0], t23.val[1]), vmulq_u16(vaddl_u8(t01.val[1], t23.val[0]), K16_0003)); - } - - SIMD_INLINE uint16x8_t ReduceColBody(const uint8_t *src) - { - const uint8x8x2_t t01 = vld2_u8(src - 1); - const uint8x8x2_t t23 = vld2_u8(src + 1); - return vaddq_u16(vaddl_u8(t01.val[0], t23.val[1]), vmulq_u16(vaddl_u8(t01.val[1], t23.val[0]), K16_0003)); - } - - template SIMD_INLINE uint16x8_t ReduceColTail(const uint8_t *src); - - template <> SIMD_INLINE uint16x8_t ReduceColTail(const uint8_t *src) - { - const uint8x8x2_t t01 = vld2_u8(src - 1); - const uint8x8x2_t t23 = Deinterleave(LoadAfterLast<1>(vld1q_u8(src))); - return vaddq_u16(vaddl_u8(t01.val[0], t23.val[1]), vmulq_u16(vaddl_u8(t01.val[1], t23.val[0]), K16_0003)); - } - - template <> SIMD_INLINE uint16x8_t ReduceColTail(const uint8_t *src) - { - const uint8x8x2_t t01 = vld2_u8(src - 1); - const uint8x8x2_t t23 = Deinterleave(LoadAfterLast<1>(LoadAfterLast<1>(vld1q_u8(src - 1)))); - return vaddq_u16(vaddl_u8(t01.val[0], t23.val[1]), vmulq_u16(vaddl_u8(t01.val[1], t23.val[0]), K16_0003)); - } - - template SIMD_INLINE uint8x8_t ReduceRow(const Buffer & buffer, size_t offset) - { - return vmovn_u16(DivideBy64(BinomialSum16( - Load(buffer.src0 + offset), Load(buffer.src1 + offset), - Load(buffer.src2 + offset), Load(buffer.src3 + offset)))); - } - - template void ReduceGray4x4(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride) - { - assert((srcWidth + 1) / 2 == dstWidth && (srcHeight + 1) / 2 == dstHeight && srcWidth > A); - - size_t alignedDstWidth = Simd::AlignLo(dstWidth, HA); - size_t srcTail = Simd::AlignHi(srcWidth - A, 2); - - Buffer buffer(Simd::AlignHi(dstWidth, A)); - - uint16x8_t tmp = ReduceColNose(src); - Store(buffer.src0, tmp); - Store(buffer.src1, tmp); - size_t srcCol = A, dstCol = HA; - for (; srcCol < srcWidth - A; srcCol += A, dstCol += HA) - { - tmp = ReduceColBody(src + srcCol); - Store(buffer.src0 + dstCol, tmp); - Store(buffer.src1 + dstCol, tmp); - } - tmp = ReduceColTail(src + srcTail); - Store(buffer.src0 + dstWidth - HA, tmp); - Store(buffer.src1 + dstWidth - HA, tmp); - - for (size_t row = 0; row < srcHeight; row += 2, dst += dstStride) - { - const uint8_t *src2 = src + srcStride*(row + 1); - const uint8_t *src3 = src2 + srcStride; - if (row >= srcHeight - 2) - { - src2 = src + srcStride*(srcHeight - 1); - src3 = src2; - } - - Store(buffer.src2, ReduceColNose(src2)); - Store(buffer.src3, ReduceColNose(src3)); - size_t srcCol = A, dstCol = HA; - for (; srcCol < srcWidth - A; srcCol += A, dstCol += HA) - { - Store(buffer.src2 + dstCol, ReduceColBody(src2 + srcCol)); - Store(buffer.src3 + dstCol, ReduceColBody(src3 + srcCol)); - } - 
Store(buffer.src2 + dstWidth - HA, ReduceColTail(src2 + srcTail)); - Store(buffer.src3 + dstWidth - HA, ReduceColTail(src3 + srcTail)); - - for (size_t col = 0; col < alignedDstWidth; col += HA) - vst1_u8(dst + col, ReduceRow(buffer, col)); - - if (alignedDstWidth != dstWidth) - vst1_u8(dst + dstWidth - HA, ReduceRow(buffer, dstWidth - HA)); - - Swap(buffer.src0, buffer.src2); - Swap(buffer.src1, buffer.src3); - } - } - - void ReduceGray4x4(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride) - { - if (Aligned(srcWidth, 2)) - ReduceGray4x4(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); - else - ReduceGray4x4(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); - } - } -#endif// SIMD_NEON_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdNeonReduceGray5x5.cpp b/src/3rd/Simd/Simd/SimdNeonReduceGray5x5.cpp deleted file mode 100644 index 1737c14f..00000000 --- a/src/3rd/Simd/Simd/SimdNeonReduceGray5x5.cpp +++ /dev/null @@ -1,178 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
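[Annotation] The 5x5 reducer that follows applies the Gaussian binomial kernel [1 4 6 4 1] separably (total weight 256, rounding bias 128 when compensation is on, as in DivideBy256 below), streaming vertical partial sums through the in/out row buffers it allocates. Scalar reference for one output pixel (sketch, hypothetical helper names):

#include <cstdint>

inline unsigned Binomial5Ref(const uint8_t * p) // p points at x-2
{
    return p[0] + 4 * p[1] + 6 * p[2] + 4 * p[3] + p[4]; // weight 16
}

inline uint8_t Reduce5x5Ref(const uint8_t * rows[5], bool compensation) // rows at y-2, x-2
{
    unsigned sum = Binomial5Ref(rows[0]) + 4 * Binomial5Ref(rows[1]) + 6 * Binomial5Ref(rows[2])
                 + 4 * Binomial5Ref(rows[3]) + Binomial5Ref(rows[4]); // weight 256
    return (uint8_t)((sum + (compensation ? 128 : 0)) >> 8);
}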
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" - -namespace Simd -{ -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - namespace - { - struct Buffer - { - Buffer(size_t width) - { - _p = Allocate(sizeof(uint16_t)*(5 * width + A)); - in0 = (uint16_t*)_p; - in1 = in0 + width; - out0 = in1 + width; - out1 = out0 + width; - dst = out1 + width + HA; - } - - ~Buffer() - { - Free(_p); - } - - uint16_t * in0; - uint16_t * in1; - uint16_t * out0; - uint16_t * out1; - uint16_t * dst; - private: - void *_p; - }; - } - - template SIMD_INLINE uint16x8_t DivideBy256(uint16x8_t value); - - template <> SIMD_INLINE uint16x8_t DivideBy256(uint16x8_t value) - { - return vshrq_n_u16(vaddq_u16(value, K16_0080), 8); - } - - template <> SIMD_INLINE uint16x8_t DivideBy256(uint16x8_t value) - { - return vshrq_n_u16(value, 8); - } - - SIMD_INLINE uint16x8_t LoadUnpacked(const uint8_t * src) - { - return vmovl_u8(vld1_u8(src)); - } - - template SIMD_INLINE void FirstRow5x5(uint16x8_t src, Buffer & buffer, size_t offset) - { - Store(buffer.in0 + offset, src); - Store(buffer.in1 + offset, vmulq_u16(src, K16_0005)); - } - - template SIMD_INLINE void FirstRow5x5(const uint8_t * src, Buffer & buffer, size_t offset) - { - FirstRow5x5(LoadUnpacked(src + offset), buffer, offset); - offset += HA; - FirstRow5x5(LoadUnpacked(src + offset), buffer, offset); - } - - template SIMD_INLINE void MainRowY5x5(uint16x8_t odd, uint16x8_t even, Buffer & buffer, size_t offset) - { - uint16x8_t cp = vmulq_u16(odd, K16_0004); - uint16x8_t c0 = Load(buffer.in0 + offset); - uint16x8_t c1 = Load(buffer.in1 + offset); - Store(buffer.dst + offset, vaddq_u16(even, vaddq_u16(c1, vaddq_u16(cp, vmulq_u16(c0, K16_0006))))); - Store(buffer.out1 + offset, vaddq_u16(c0, cp)); - Store(buffer.out0 + offset, even); - } - - template SIMD_INLINE void MainRowY5x5(const uint8_t *odd, const uint8_t *even, Buffer & buffer, size_t offset) - { - MainRowY5x5(LoadUnpacked(odd + offset), LoadUnpacked(even + offset), buffer, offset); - offset += HA; - MainRowY5x5(LoadUnpacked(odd + offset), LoadUnpacked(even + offset), buffer, offset); - } - - template SIMD_INLINE uint16x8_t MainRowX5x5(uint16_t * dst) - { - uint16x8_t t0 = vld1q_u16(dst - 2); - uint16x8_t t1 = vld1q_u16(dst - 1); - uint16x8_t t2 = Load(dst); - uint16x8_t t3 = vld1q_u16(dst + 1); - uint16x8_t t4 = vld1q_u16(dst + 2); - t2 = vaddq_u16(vaddq_u16(vmulq_u16(t2, K16_0006), vmulq_u16(vaddq_u16(t1, t3), K16_0004)), vaddq_u16(t0, t4)); - return DivideBy256(t2); - } - - template SIMD_INLINE void MainRowX5x5(Buffer & buffer, size_t offset, uint8_t *dst) - { - uint16x8_t lo = MainRowX5x5(buffer.dst + offset); - uint16x8_t hi = MainRowX5x5(buffer.dst + offset + HA); - vst1_u8(dst, Deinterleave(PackU16(lo, hi)).val[0]); - } - - template void ReduceGray5x5( - const uint8_t* src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t* dst, size_t dstWidth, size_t dstHeight, size_t dstStride) - { - assert((srcWidth + 1) / 2 == dstWidth && (srcHeight + 1) / 2 == dstHeight && srcWidth >= A); - - size_t alignedWidth = Simd::AlignLo(srcWidth, A); - size_t bufferDstTail = Simd::AlignHi(srcWidth - A, 2); - - Buffer buffer(Simd::AlignHi(srcWidth, A)); - - for (size_t col = 0; col < alignedWidth; col += A) - FirstRow5x5(src, buffer, col); - if (alignedWidth != srcWidth) - FirstRow5x5(src, buffer, srcWidth - A); - src += srcStride; - - for (size_t row = 1; row <= srcHeight; row += 2, dst += dstStride, src += 2 * srcStride) - { - const uint8_t *odd = src - (row < srcHeight ? 
0 : srcStride); - const uint8_t *even = odd + (row < srcHeight - 1 ? srcStride : 0); - - for (size_t col = 0; col < alignedWidth; col += A) - MainRowY5x5(odd, even, buffer, col); - if (alignedWidth != srcWidth) - MainRowY5x5(odd, even, buffer, srcWidth - A); - - Swap(buffer.in0, buffer.out0); - Swap(buffer.in1, buffer.out1); - - buffer.dst[-2] = buffer.dst[0]; - buffer.dst[-1] = buffer.dst[0]; - buffer.dst[srcWidth] = buffer.dst[srcWidth - 1]; - buffer.dst[srcWidth + 1] = buffer.dst[srcWidth - 1]; - - for (size_t srcCol = 0, dstCol = 0; srcCol < alignedWidth; srcCol += A, dstCol += HA) - MainRowX5x5(buffer, srcCol, dst + dstCol); - if (alignedWidth != srcWidth) - MainRowX5x5(buffer, bufferDstTail, dst + dstWidth - HA); - } - } - - void ReduceGray5x5(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride, int compensation) - { - if (compensation) - ReduceGray5x5(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); - else - ReduceGray5x5(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); - } - } -#endif// SIMD_NEON_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdNeonReorder.cpp b/src/3rd/Simd/Simd/SimdNeonReorder.cpp deleted file mode 100644 index 72e71fb7..00000000 --- a/src/3rd/Simd/Simd/SimdNeonReorder.cpp +++ /dev/null @@ -1,108 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
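[Annotation] Reorder16bit/32bit/64bit in the file below byte-swap every 16-, 32- or 64-bit word of a buffer with vrev16q_u8 / vrev32q_u8 / vrev64q_u8, handing the unaligned tail to a scalar loop; size must be a multiple of the word width, as asserted there. Scalar reference for the 32-bit case (illustrative sketch):

#include <cstdint>
#include <cstddef>

inline void Reorder32bitRef(const uint8_t * src, size_t size, uint8_t * dst)
{
    // size is assumed to be a multiple of 4, matching the vector version's assert.
    for (size_t i = 0; i < size; i += 4)
    {
        dst[i + 0] = src[i + 3];
        dst[i + 1] = src[i + 2];
        dst[i + 2] = src[i + 1];
        dst[i + 3] = src[i + 0];
    }
}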
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" - -namespace Simd -{ -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - template SIMD_INLINE void Reorder16bit(const uint8_t * src, uint8_t * dst) - { - uint8x16_t _src = Load(src); - Store(dst, vrev16q_u8(_src)); - } - - template void Reorder16bit(const uint8_t * src, size_t size, uint8_t * dst) - { - assert(size >= A && size % 2 == 0); - - size_t alignedSize = AlignLo(size, A); - for (size_t i = 0; i < alignedSize; i += A) - Reorder16bit(src + i, dst + i); - for (size_t i = alignedSize; i < size; i += 2) - Base::Reorder16bit(src + i, dst + i); - } - - void Reorder16bit(const uint8_t * src, size_t size, uint8_t * dst) - { - if (Aligned(src) && Aligned(dst)) - Reorder16bit(src, size, dst); - else - Reorder16bit(src, size, dst); - } - - template SIMD_INLINE void Reorder32bit(const uint8_t * src, uint8_t * dst) - { - uint8x16_t _src = Load(src); - Store(dst, vrev32q_u8(_src)); - } - - template void Reorder32bit(const uint8_t * src, size_t size, uint8_t * dst) - { - assert(size >= A && size % 4 == 0); - - size_t alignedSize = AlignLo(size, A); - for (size_t i = 0; i < alignedSize; i += A) - Reorder32bit(src + i, dst + i); - for (size_t i = alignedSize; i < size; i += 4) - Base::Reorder32bit(src + i, dst + i); - } - - void Reorder32bit(const uint8_t * src, size_t size, uint8_t * dst) - { - if (Aligned(src) && Aligned(dst)) - Reorder32bit(src, size, dst); - else - Reorder32bit(src, size, dst); - } - - template SIMD_INLINE void Reorder64bit(const uint8_t * src, uint8_t * dst) - { - uint8x16_t _src = Load(src); - Store(dst, vrev64q_u8(_src)); - } - - template void Reorder64bit(const uint8_t * src, size_t size, uint8_t * dst) - { - assert(size >= A && size % 8 == 0); - - size_t alignedSize = AlignLo(size, A); - for (size_t i = 0; i < alignedSize; i += A) - Reorder64bit(src + i, dst + i); - for (size_t i = alignedSize; i < size; i += 8) - Base::Reorder64bit(src + i, dst + i); - } - - void Reorder64bit(const uint8_t * src, size_t size, uint8_t * dst) - { - if (Aligned(src) && Aligned(dst)) - Reorder64bit(src, size, dst); - else - Reorder64bit(src, size, dst); - } - } -#endif// SIMD_NEON_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdNeonResizeBilinear.cpp b/src/3rd/Simd/Simd/SimdNeonResizeBilinear.cpp deleted file mode 100644 index 7b67a4b1..00000000 --- a/src/3rd/Simd/Simd/SimdNeonResizeBilinear.cpp +++ /dev/null @@ -1,414 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2018 Yermalayeu Ihar, -* 2018-2018 Radchenko Andrey. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdBase.h" -#include "Simd/SimdStore.h" - -namespace Simd -{ -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - namespace - { - struct Buffer - { - Buffer(size_t size, size_t width, size_t height) - { - _p = Allocate(3 * size + sizeof(int)*(2 * height + width)); - bx[0] = (uint8_t*)_p; - bx[1] = bx[0] + size; - ax = bx[1] + size; - ix = (int*)(ax + size); - iy = ix + width; - ay = iy + height; - } - - ~Buffer() - { - Free(_p); - } - - uint8_t * bx[2]; - uint8_t * ax; - int * ix; - int * ay; - int * iy; - private: - void *_p; - }; - - struct Index - { - int src, dst; - uint8_t shuffle[Simd::Neon::A]; - }; - - struct BufferG - { - BufferG(size_t width, size_t blocks, size_t height) - { - _p = Simd::Allocate(3 * width + sizeof(int) * 2 * height + blocks * sizeof(Index) + 2 * A); - bx[0] = (uint8_t*)_p; - bx[1] = bx[0] + width + A; - ax = bx[1] + width + A; - ix = (Index*)(ax + width); - iy = (int*)(ix + blocks); - ay = iy + height; - } - - ~BufferG() - { - Free(_p); - } - - uint8_t * bx[2]; - uint8_t * ax; - Index * ix; - int * ay; - int * iy; - private: - void *_p; - }; - - } - - template void EstimateAlphaIndexX(size_t srcSize, size_t dstSize, int * indexes, uint8_t * alphas) - { - float scale = (float)srcSize / dstSize; - - for (size_t i = 0; i < dstSize; ++i) - { - float alpha = (float)((i + 0.5)*scale - 0.5); - ptrdiff_t index = (ptrdiff_t)::floor(alpha); - alpha -= index; - - if (index < 0) - { - index = 0; - alpha = 0; - } - - if (index > (ptrdiff_t)srcSize - 2) - { - index = srcSize - 2; - alpha = 1; - } - - indexes[i] = (int)index; - alphas[1] = (uint8_t)(alpha * Base::FRACTION_RANGE + 0.5); - alphas[0] = (uint8_t)(Base::FRACTION_RANGE - alphas[1]); - for (size_t channel = 1; channel < channelCount; channel++) - ((uint16_t*)alphas)[channel] = *(uint16_t*)alphas; - alphas += 2 * channelCount; - } - } - - void EstimateAlphaIndexX(int srcSize, int dstSize, Index * indexes, uint8_t * alphas, size_t & blockCount) - { - float scale = (float)srcSize / dstSize; - int block = 0; - indexes[0].src = 0; - indexes[0].dst = 0; - for (int dstIndex = 0; dstIndex < dstSize; ++dstIndex) - { - float alpha = (float)((dstIndex + 0.5)*scale - 0.5); - int srcIndex = (int)::floor(alpha); - alpha -= srcIndex; - - if (srcIndex < 0) - { - srcIndex = 0; - alpha = 0; - } - - if (srcIndex > srcSize - 2) - { - srcIndex = srcSize - 2; - alpha = 1; - } - - int dst = 2 * dstIndex - indexes[block].dst; - int src = srcIndex - indexes[block].src; - if (src >= A - 1 || dst >= A) - { - block++; - indexes[block].src = Simd::Min(srcIndex, srcSize - (int)A); - indexes[block].dst = 2 * dstIndex; - dst = 0; - src = srcIndex - indexes[block].src; - } - indexes[block].shuffle[dst] = src; - indexes[block].shuffle[dst + 1] = src + 1; - - alphas[1] = (uint8_t)(alpha * Base::FRACTION_RANGE + 0.5); - alphas[0] = (uint8_t)(Base::FRACTION_RANGE - alphas[1]); - alphas += 2; - } - blockCount = block + 1; - } - - SIMD_INLINE size_t BlockCountMax(size_t src, size_t dst) - { - return (size_t)Simd::Max(::ceil(float(src) / (A - 1)), ::ceil(float(dst) / HA)); - } - - template void InterpolateX(const uint8_t * alpha, uint8_t * buffer); - - template <> SIMD_INLINE void InterpolateX<1>(const uint8_t * alpha, uint8_t * buffer) - { - 
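/* Sketch, not from the original file: on a little-endian target the statements
   below behave like this scalar loop over one 16-byte chunk,

       uint16_t * out = (uint16_t *)buffer;
       for (size_t i = 0; i < 8; ++i)
           out[i] = (uint16_t)(alpha[2 * i] * buffer[2 * i]
                             + alpha[2 * i + 1] * buffer[2 * i + 1]);

   i.e. each destination pixel keeps the 16-bit fixed-point sum of its two
   source neighbours weighted by alpha, where a pixel's two byte weights sum to
   Base::FRACTION_RANGE. vld2_u8 splits even/odd bytes and vmull_u8 widens the
   products to 16 bits before they are summed and stored back in place. */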
uint8x8x2_t a = vld2_u8(alpha); - uint8x8x2_t b = vld2_u8(buffer); - Store(buffer, (uint8x16_t)vaddq_u16(vmull_u8(a.val[0], b.val[0]), vmull_u8(a.val[1], b.val[1]))); - } - - SIMD_INLINE void InterpolateX2(const uint8_t * alpha, uint8_t * buffer) - { - uint8x8x2_t a = vld2_u8(alpha); - uint16x4x2_t b = vld2_u16((uint16_t*)buffer); - Store(buffer, (uint8x16_t)vaddq_u16(vmull_u8(a.val[0], (uint8x8_t)b.val[0]), vmull_u8(a.val[1], (uint8x8_t)b.val[1]))); - } - - template <> SIMD_INLINE void InterpolateX<2>(const uint8_t * alpha, uint8_t * buffer) - { - InterpolateX2(alpha + 0, buffer + 0); - InterpolateX2(alpha + A, buffer + A); - } - - SIMD_INLINE void InterpolateX3(const uint8_t * alpha, const uint8_t * src, uint8_t * dst) - { - uint8x8x2_t a = vld2_u8(alpha); - uint8x8x2_t b = vld2_u8(src); - Store(dst, (uint8x16_t)vaddq_u16(vmull_u8(a.val[0], b.val[0]), vmull_u8(a.val[1], b.val[1]))); - } - - template <> SIMD_INLINE void InterpolateX<3>(const uint8_t * alpha, uint8_t * buffer) - { - uint8_t b[3 * A]; - uint8x16x3_t _b = vld3q_u8(buffer); - vst3q_u16((uint16_t*)b, *(uint16x8x3_t*)&_b); - InterpolateX3(alpha + 0 * A, b + 0 * A, buffer + 0 * A); - InterpolateX3(alpha + 1 * A, b + 1 * A, buffer + 1 * A); - InterpolateX3(alpha + 2 * A, b + 2 * A, buffer + 2 * A); - } - - SIMD_INLINE void InterpolateX4(const uint8_t * alpha, uint8_t * buffer) - { - uint8x8x2_t a = vld2_u8(alpha); - uint32x2x2_t b = vld2_u32((uint32_t*)buffer); - Store(buffer, (uint8x16_t)vaddq_u16(vmull_u8(a.val[0], (uint8x8_t)b.val[0]), vmull_u8(a.val[1], (uint8x8_t)b.val[1]))); - } - - template <> SIMD_INLINE void InterpolateX<4>(const uint8_t * alpha, uint8_t * buffer) - { - InterpolateX4(alpha + 0 * A, buffer + 0 * A); - InterpolateX4(alpha + 1 * A, buffer + 1 * A); - InterpolateX4(alpha + 2 * A, buffer + 2 * A); - InterpolateX4(alpha + 3 * A, buffer + 3 * A); - } - - const uint16x8_t K16_FRACTION_ROUND_TERM = SIMD_VEC_SET1_EPI16(Base::BILINEAR_ROUND_TERM); - - template SIMD_INLINE uint16x8_t InterpolateY(const uint16_t * pbx0, const uint16_t * pbx1, uint16x8_t alpha[2]) - { - uint16x8_t sum = vaddq_u16(vmulq_u16(Load(pbx0), alpha[0]), vmulq_u16(Load(pbx1), alpha[1])); - return vshrq_n_u16(vaddq_u16(sum, K16_FRACTION_ROUND_TERM), Base::BILINEAR_SHIFT); - } - - template SIMD_INLINE void InterpolateY(const uint8_t * bx0, const uint8_t * bx1, uint16x8_t alpha[2], uint8_t * dst) - { - uint16x8_t lo = InterpolateY((uint16_t*)(bx0 + 0), (uint16_t*)(bx1 + 0), alpha); - uint16x8_t hi = InterpolateY((uint16_t*)(bx0 + A), (uint16_t*)(bx1 + A), alpha); - Store(dst, PackU16(lo, hi)); - } - - template void ResizeBilinear( - const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride) - { - assert(dstWidth >= A); - - struct One { uint8_t channels[channelCount]; }; - struct Two { uint8_t channels[channelCount * 2]; }; - - size_t size = 2 * dstWidth*channelCount; - size_t bufferSize = AlignHi(dstWidth, A)*channelCount * 2; - size_t alignedSize = AlignHi(size, DA) - DA; - const size_t step = A * channelCount; - - Buffer buffer(bufferSize, dstWidth, dstHeight); - - Base::EstimateAlphaIndex(srcHeight, dstHeight, buffer.iy, buffer.ay, 1); - - EstimateAlphaIndexX(srcWidth, dstWidth, buffer.ix, buffer.ax); - - ptrdiff_t previous = -2; - - uint16x8_t a[2]; - - for (size_t yDst = 0; yDst < dstHeight; yDst++, dst += dstStride) - { - a[0] = vdupq_n_u16(Base::FRACTION_RANGE - buffer.ay[yDst]); - a[1] = vdupq_n_u16(buffer.ay[yDst]); - - ptrdiff_t sy = buffer.iy[yDst]; - 
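/* Note, not from the original file: the k bookkeeping below recycles the two
   horizontally interpolated row buffers between destination rows. If sy did
   not change, both buffers are still valid (k = 2, nothing recomputed); if it
   advanced by exactly one, the old bottom row becomes the new top row after
   the Swap (k = 1, one row recomputed); otherwise both rows are rebuilt. */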
int k = 0; - - if (sy == previous) - k = 2; - else if (sy == previous + 1) - { - Swap(buffer.bx[0], buffer.bx[1]); - k = 1; - } - - previous = sy; - - for (; k < 2; k++) - { - Two * pb = (Two *)buffer.bx[k]; - const One * psrc = (const One *)(src + (sy + k)*srcStride); - for (size_t x = 0; x < dstWidth; x++) - pb[x] = *(Two *)(psrc + buffer.ix[x]); - - uint8_t * pbx = buffer.bx[k]; - for (size_t i = 0; i < bufferSize; i += step) - InterpolateX(buffer.ax + i, pbx + i); - } - - for (size_t ib = 0, id = 0; ib < alignedSize; ib += DA, id += A) - InterpolateY(buffer.bx[0] + ib, buffer.bx[1] + ib, a, dst + id); - size_t i = size - DA; - InterpolateY(buffer.bx[0] + i, buffer.bx[1] + i, a, dst + i / 2); - } - } - - SIMD_INLINE void LoadGray(const uint8_t * src, const Index & index, uint8_t * dst) - { - - uint8x16_t _src = vld1q_u8(src + index.src); - uint8x16_t _shuffle = vld1q_u8(index.shuffle); - - uint8x8x2_t src1; - src1.val[0] = vget_low_u8(_src); - src1.val[1] = vget_high_u8(_src); - - uint8x8_t dstLow = vtbl2_u8(src1, vget_low_u8(_shuffle)); - uint8x8_t dstHigh = vtbl2_u8(src1, vget_high_u8(_shuffle)); - - uint8x16_t _dst = vcombine_u8(dstLow, dstHigh); - - vst1q_u8(dst + index.dst, _dst); - - } - - void ResizeBilinearGray( - const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride) - { - assert(dstWidth >= A); - - size_t bufferWidth = AlignHi(dstWidth, A) * 2; - size_t blockCount = BlockCountMax(srcWidth, dstWidth); - size_t size = 2 * dstWidth; - size_t alignedSize = AlignHi(size, DA) - DA; - const size_t step = A; - - BufferG buffer(bufferWidth, blockCount, dstHeight); - - Base::EstimateAlphaIndex(srcHeight, dstHeight, buffer.iy, buffer.ay, 1); - - EstimateAlphaIndexX((int)srcWidth, (int)dstWidth, buffer.ix, buffer.ax, blockCount); - - ptrdiff_t previous = -2; - - uint16x8_t a[2]; - - for (size_t yDst = 0; yDst < dstHeight; yDst++, dst += dstStride) - { - a[0] = vdupq_n_u16(Base::FRACTION_RANGE - buffer.ay[yDst]); - a[1] = vdupq_n_u16(buffer.ay[yDst]); - - ptrdiff_t sy = buffer.iy[yDst]; - int k = 0; - - if (sy == previous) - k = 2; - else if (sy == previous + 1) - { - Swap(buffer.bx[0], buffer.bx[1]); - k = 1; - } - - previous = sy; - - for (; k < 2; k++) - { - const uint8_t * psrc = src + (sy + k)*srcStride; - uint8_t * pdst = buffer.bx[k]; - for (size_t i = 0; i < blockCount; ++i) - LoadGray(psrc, buffer.ix[i], pdst); - - uint8_t * pbx = buffer.bx[k]; - for (size_t i = 0; i < bufferWidth; i += step) - InterpolateX<1>(buffer.ax + i, pbx + i); - } - - for (size_t ib = 0, id = 0; ib < alignedSize; ib += DA, id += A) - InterpolateY(buffer.bx[0] + ib, buffer.bx[1] + ib, a, dst + id); - size_t i = size - DA; - InterpolateY(buffer.bx[0] + i, buffer.bx[1] + i, a, dst + i / 2); - } - } - - void ResizeBilinear( - const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount) - { - switch (channelCount) - { - case 1: - if (srcWidth >= A && srcWidth < 4 * dstWidth) - ResizeBilinearGray(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); - else - ResizeBilinear<1>(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); - break; - case 2: - ResizeBilinear<2>(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); - break; - case 3: - ResizeBilinear<3>(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); - break; - case 4: - 
ResizeBilinear<4>(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); - break; - default: - Base::ResizeBilinear(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, channelCount); - } - } - } -#endif -} - diff --git a/src/3rd/Simd/Simd/SimdNeonResizer.cpp b/src/3rd/Simd/Simd/SimdNeonResizer.cpp deleted file mode 100644 index 86b481f5..00000000 --- a/src/3rd/Simd/Simd/SimdNeonResizer.cpp +++ /dev/null @@ -1,592 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdResizer.h" -#include "Simd/SimdUpdate.h" - -namespace Simd -{ -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - ResizerByteBilinear::ResizerByteBilinear(const ResParam & param) - : Base::ResizerByteBilinear(param) - , _blocks(0) - { - } - - size_t ResizerByteBilinear::BlockCountMax(size_t align) - { - return (size_t)Simd::Max(::ceil(float(_param.srcW) / (align - 1)), ::ceil(float(_param.dstW) * 2.0f / align)); - } - - void ResizerByteBilinear::EstimateParams() - { - if (_ax.data) - return; - if (_param.channels == 1 && _param.srcW < 4 * _param.dstW) - _blocks = BlockCountMax(A); - float scale = (float)_param.srcW / _param.dstW; - _ax.Resize(_param.dstW * _param.channels * 2, false, _param.align); - uint8_t * alphas = _ax.data; - if (_blocks) - { - _ixg.Resize(_blocks); - int block = 0; - _ixg[0].src = 0; - _ixg[0].dst = 0; - for (int dstIndex = 0; dstIndex < _param.dstW; ++dstIndex) - { - float alpha = (float)((dstIndex + 0.5)*scale - 0.5); - int srcIndex = (int)::floor(alpha); - alpha -= srcIndex; - - if (srcIndex < 0) - { - srcIndex = 0; - alpha = 0; - } - - if (srcIndex > _param.srcW - 2) - { - srcIndex = (int)_param.srcW - 2; - alpha = 1; - } - - int dst = 2 * dstIndex - _ixg[block].dst; - int src = srcIndex - _ixg[block].src; - if (src >= A - 1 || dst >= A) - { - block++; - _ixg[block].src = Simd::Min(srcIndex, int(_param.srcW - A)); - _ixg[block].dst = 2 * dstIndex; - dst = 0; - src = srcIndex - _ixg[block].src; - } - _ixg[block].shuffle[dst] = src; - _ixg[block].shuffle[dst + 1] = src + 1; - - alphas[1] = (uint8_t)(alpha * Base::FRACTION_RANGE + 0.5); - alphas[0] = (uint8_t)(Base::FRACTION_RANGE - alphas[1]); - alphas += 2; - } - _blocks = block + 1; - } - else - { - _ix.Resize(_param.dstW); - for (size_t i = 0; i < _param.dstW; ++i) - { - float alpha = (float)((i + 0.5)*scale - 0.5); - 
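/* Note, not from the original file: this is the standard half-pixel-center
   mapping. Destination sample i is read from source coordinate
   (i + 0.5) * srcW / dstW - 0.5; the clamping below keeps index and index + 1
   inside the source row, pinning alpha to 0 or 1 at the two edges. */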
ptrdiff_t index = (ptrdiff_t)::floor(alpha); - alpha -= index; - - if (index < 0) - { - index = 0; - alpha = 0; - } - - if (index >(ptrdiff_t)_param.srcW - 2) - { - index = _param.srcW - 2; - alpha = 1; - } - - _ix[i] = (int)index; - alphas[1] = (uint8_t)(alpha * Base::FRACTION_RANGE + 0.5); - alphas[0] = (uint8_t)(Base::FRACTION_RANGE - alphas[1]); - for (size_t channel = 1; channel < _param.channels; channel++) - ((uint16_t*)alphas)[channel] = *(uint16_t*)alphas; - alphas += 2 * _param.channels; - } - } - size_t size = AlignHi(_param.dstW, _param.align)*_param.channels * 2; - _bx[0].Resize(size, false, _param.align); - _bx[1].Resize(size, false, _param.align); -} - - template void ResizerByteBilinearInterpolateX(const uint8_t * alpha, uint8_t * buffer); - - template <> SIMD_INLINE void ResizerByteBilinearInterpolateX<1>(const uint8_t * alpha, uint8_t * buffer) - { - uint8x8x2_t a = vld2_u8(alpha); - uint8x8x2_t b = vld2_u8(buffer); - Store(buffer, (uint8x16_t)vaddq_u16(vmull_u8(a.val[0], b.val[0]), vmull_u8(a.val[1], b.val[1]))); - } - - SIMD_INLINE void ResizerByteBilinearInterpolateX2(const uint8_t * alpha, uint8_t * buffer) - { - uint8x8x2_t a = vld2_u8(alpha); - uint16x4x2_t b = vld2_u16((uint16_t*)buffer); - Store(buffer, (uint8x16_t)vaddq_u16(vmull_u8(a.val[0], (uint8x8_t)b.val[0]), vmull_u8(a.val[1], (uint8x8_t)b.val[1]))); - } - - template <> SIMD_INLINE void ResizerByteBilinearInterpolateX<2>(const uint8_t * alpha, uint8_t * buffer) - { - ResizerByteBilinearInterpolateX2(alpha + 0, buffer + 0); - ResizerByteBilinearInterpolateX2(alpha + A, buffer + A); - } - - SIMD_INLINE void ResizerByteBilinearInterpolateX3(const uint8_t * alpha, const uint8_t * src, uint8_t * dst) - { - uint8x8x2_t a = vld2_u8(alpha); - uint8x8x2_t b = vld2_u8(src); - Store(dst, (uint8x16_t)vaddq_u16(vmull_u8(a.val[0], b.val[0]), vmull_u8(a.val[1], b.val[1]))); - } - - template <> SIMD_INLINE void ResizerByteBilinearInterpolateX<3>(const uint8_t * alpha, uint8_t * buffer) - { - uint8_t b[3 * A]; - uint8x16x3_t _b = vld3q_u8(buffer); - vst3q_u16((uint16_t*)b, *(uint16x8x3_t*)&_b); - ResizerByteBilinearInterpolateX3(alpha + 0 * A, b + 0 * A, buffer + 0 * A); - ResizerByteBilinearInterpolateX3(alpha + 1 * A, b + 1 * A, buffer + 1 * A); - ResizerByteBilinearInterpolateX3(alpha + 2 * A, b + 2 * A, buffer + 2 * A); - } - - SIMD_INLINE void ResizerByteBilinearInterpolateX4(const uint8_t * alpha, uint8_t * buffer) - { - uint8x8x2_t a = vld2_u8(alpha); - uint32x2x2_t b = vld2_u32((uint32_t*)buffer); - Store(buffer, (uint8x16_t)vaddq_u16(vmull_u8(a.val[0], (uint8x8_t)b.val[0]), vmull_u8(a.val[1], (uint8x8_t)b.val[1]))); - } - - template <> SIMD_INLINE void ResizerByteBilinearInterpolateX<4>(const uint8_t * alpha, uint8_t * buffer) - { - ResizerByteBilinearInterpolateX4(alpha + 0 * A, buffer + 0 * A); - ResizerByteBilinearInterpolateX4(alpha + 1 * A, buffer + 1 * A); - ResizerByteBilinearInterpolateX4(alpha + 2 * A, buffer + 2 * A); - ResizerByteBilinearInterpolateX4(alpha + 3 * A, buffer + 3 * A); - } - - const uint16x8_t K16_FRACTION_ROUND_TERM = SIMD_VEC_SET1_EPI16(Base::BILINEAR_ROUND_TERM); - - template SIMD_INLINE uint16x8_t ResizerByteBilinearInterpolateY(const uint16_t * pbx0, const uint16_t * pbx1, uint16x8_t alpha[2]) - { - uint16x8_t sum = vaddq_u16(vmulq_u16(Load(pbx0), alpha[0]), vmulq_u16(Load(pbx1), alpha[1])); - return vshrq_n_u16(vaddq_u16(sum, K16_FRACTION_ROUND_TERM), Base::BILINEAR_SHIFT); - } - - template SIMD_INLINE void ResizerByteBilinearInterpolateY(const uint8_t * bx0, const uint8_t * bx1, 
uint16x8_t alpha[2], uint8_t * dst) - { - uint16x8_t lo = ResizerByteBilinearInterpolateY((uint16_t*)(bx0 + 0), (uint16_t*)(bx1 + 0), alpha); - uint16x8_t hi = ResizerByteBilinearInterpolateY((uint16_t*)(bx0 + A), (uint16_t*)(bx1 + A), alpha); - Store(dst, PackU16(lo, hi)); - } - - template void ResizerByteBilinear::Run(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride) - { - struct One { uint8_t val[N * 1]; }; - struct Two { uint8_t val[N * 2]; }; - - size_t size = 2 * _param.dstW*N; - size_t aligned = AlignHi(size, DA) - DA; - const size_t step = A * N; - ptrdiff_t previous = -2; - uint16x8_t a[2]; - uint8_t * bx[2] = { _bx[0].data, _bx[1].data }; - const uint8_t * ax = _ax.data; - const int32_t * ix = _ix.data; - size_t dstW = _param.dstW; - - for (size_t yDst = 0; yDst < _param.dstH; yDst++, dst += dstStride) - { - a[0] = vdupq_n_u16(int16_t(Base::FRACTION_RANGE - _ay[yDst])); - a[1] = vdupq_n_u16(int16_t(_ay[yDst])); - - ptrdiff_t sy = _iy[yDst]; - int k = 0; - - if (sy == previous) - k = 2; - else if (sy == previous + 1) - { - Swap(bx[0], bx[1]); - k = 1; - } - - previous = sy; - - for (; k < 2; k++) - { - Two * pb = (Two *)bx[k]; - const One * psrc = (const One *)(src + (sy + k)*srcStride); - for (size_t x = 0; x < dstW; x++) - pb[x] = *(Two *)(psrc + ix[x]); - - uint8_t * pbx = bx[k]; - for (size_t i = 0; i < size; i += step) - ResizerByteBilinearInterpolateX(ax + i, pbx + i); - } - - for (size_t ib = 0, id = 0; ib < aligned; ib += DA, id += A) - ResizerByteBilinearInterpolateY(bx[0] + ib, bx[1] + ib, a, dst + id); - size_t i = size - DA; - ResizerByteBilinearInterpolateY(bx[0] + i, bx[1] + i, a, dst + i / 2); - } - } - - union ResizerByteBilinearLoadGrayInterpolatedHelper - { - uint8x16_t full; - uint8x8x2_t half; - }; - - template SIMD_INLINE void ResizerByteBilinearLoadGrayInterpolated(const uint8_t * src, const Idx & index, const uint8_t * alpha, uint8_t * dst) - { - ResizerByteBilinearLoadGrayInterpolatedHelper _src, _shuffle, _alpha, unpacked; - _src.full = vld1q_u8(src + index.src); - _shuffle.full = vld1q_u8(index.shuffle); - unpacked.half.val[0] = vtbl2_u8(_src.half, _shuffle.half.val[0]); - unpacked.half.val[1] = vtbl2_u8(_src.half, _shuffle.half.val[1]); - _alpha.full = vld1q_u8(alpha + index.dst); - uint16x8_t lo = vmull_u8(unpacked.half.val[0], _alpha.half.val[0]); - uint16x8_t hi = vmull_u8(unpacked.half.val[1], _alpha.half.val[1]); - vst1q_u8(dst + index.dst, vreinterpretq_u8_u16(Hadd(lo, hi))); - } - - template SIMD_INLINE void ResizerByteBilinearLoadGray(const uint8_t * src, const Idx & index, uint8_t * dst) - { - ResizerByteBilinearLoadGrayInterpolatedHelper _src, _shuffle, _alpha, unpacked; - _src.full = vld1q_u8(src + index.src); - _shuffle.full = vld1q_u8(index.shuffle); - unpacked.half.val[0] = vtbl2_u8(_src.half, _shuffle.half.val[0]); - unpacked.half.val[1] = vtbl2_u8(_src.half, _shuffle.half.val[1]); - vst1q_u8(dst + index.dst, unpacked.full); - } - -//#define MERGE_LOADING_AND_INTERPOLATION - - void ResizerByteBilinear::RunG(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride) - { - size_t bufW = AlignHi(_param.dstW, A) * 2; - size_t size = 2 * _param.dstW; - size_t aligned = AlignHi(size, DA) - DA; - size_t blocks = _blocks; - ptrdiff_t previous = -2; - uint16x8_t a[2]; - uint8_t * bx[2] = { _bx[0].data, _bx[1].data }; - const uint8_t * ax = _ax.data; - const Idx * ixg = _ixg.data; - - for (size_t yDst = 0; yDst < _param.dstH; yDst++, dst += dstStride) - { - a[0] = vdupq_n_u16(int16_t(Base::FRACTION_RANGE - 
_ay[yDst])); - a[1] = vdupq_n_u16(int16_t(_ay[yDst])); - - ptrdiff_t sy = _iy[yDst]; - int k = 0; - - if (sy == previous) - k = 2; - else if (sy == previous + 1) - { - Swap(bx[0], bx[1]); - k = 1; - } - - previous = sy; - - for (; k < 2; k++) - { -#ifdef MERGE_LOADING_AND_INTERPOLATION - const uint8_t * psrc = src + (sy + k)*srcStride; - uint8_t * pdst = bx[k]; - for (size_t i = 0; i < blocks; ++i) - ResizerByteBilinearLoadGrayInterpolated(psrc, ixg[i], ax, pdst); -#else - const uint8_t * psrc = src + (sy + k)*srcStride; - uint8_t * pdst = bx[k]; - for (size_t i = 0; i < blocks; ++i) - ResizerByteBilinearLoadGray(psrc, ixg[i], pdst); - - uint8_t * pbx = bx[k]; - for (size_t i = 0; i < size; i += A) - ResizerByteBilinearInterpolateX<1>(ax + i, pbx + i); -#endif - } - - for (size_t ib = 0, id = 0; ib < aligned; ib += DA, id += A) - ResizerByteBilinearInterpolateY(bx[0] + ib, bx[1] + ib, a, dst + id); - size_t i = size - DA; - ResizerByteBilinearInterpolateY(bx[0] + i, bx[1] + i, a, dst + i / 2); - } - } - - void ResizerByteBilinear::Run(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride) - { - assert(_param.dstW >= A); - - EstimateParams(); - switch (_param.channels) - { - case 1: - if (_blocks) - RunG(src, srcStride, dst, dstStride); - else - Run<1>(src, srcStride, dst, dstStride); - break; - case 2: Run<2>(src, srcStride, dst, dstStride); break; - case 3: Run<3>(src, srcStride, dst, dstStride); break; - case 4: Run<4>(src, srcStride, dst, dstStride); break; - default: - assert(0); - } - } - //--------------------------------------------------------------------- - - ResizerByteArea::ResizerByteArea(const ResParam & param) - : Base::ResizerByteArea(param) - { - _by.Resize(AlignHi(_param.srcW*_param.channels, _param.align), false, _param.align); - } - - template SIMD_INLINE void ResizerByteAreaRowUpdate(const uint8_t * src0, size_t size, int32_t a0, int32_t * dst) - { - int16x4_t _a0 = vdup_n_s16(a0); - for (size_t i = 0; i < size; i += A, dst += A, src0 += A) - { - uint8x16_t s0 = Load(src0); - int16x8_t u00 = UnpackU8s<0>(s0); - int16x8_t u01 = UnpackU8s<1>(s0); - Update(dst + 0 * F, vmull_s16(_a0, Half<0>(u00))); - Update(dst + 1 * F, vmull_s16(_a0, Half<1>(u00))); - Update(dst + 2 * F, vmull_s16(_a0, Half<0>(u01))); - Update(dst + 3 * F, vmull_s16(_a0, Half<1>(u01))); - } - } - - template SIMD_INLINE void ResizerByteAreaRowUpdate(const uint8_t * src0, size_t stride, size_t size, int32_t a0, int32_t a1, int32_t * dst) - { - int16x4_t _a0 = vdup_n_s16(a0); - int16x4_t _a1 = vdup_n_s16(a1); - const uint8_t * src1 = src0 + stride; - for (size_t i = 0; i < size; i += A, dst += A) - { - uint8x16_t s0 = Load(src0 + i); - uint8x16_t s1 = Load(src1 + i); - int16x8_t u00 = UnpackU8s<0>(s0); - int16x8_t u01 = UnpackU8s<1>(s0); - int16x8_t u10 = UnpackU8s<0>(s1); - int16x8_t u11 = UnpackU8s<1>(s1); - Update(dst + 0 * F, vmlal_s16(vmull_s16(_a0, Half<0>(u00)), _a1, Half<0>(u10))); - Update(dst + 1 * F, vmlal_s16(vmull_s16(_a0, Half<1>(u00)), _a1, Half<1>(u10))); - Update(dst + 2 * F, vmlal_s16(vmull_s16(_a0, Half<0>(u01)), _a1, Half<0>(u11))); - Update(dst + 3 * F, vmlal_s16(vmull_s16(_a0, Half<1>(u01)), _a1, Half<1>(u11))); - } - } - - SIMD_INLINE void ResizerByteAreaRowSum(const uint8_t * src, size_t stride, size_t count, size_t size, int32_t curr, int32_t zero, int32_t next, int32_t * dst) - { - if (count) - { - size_t i = 0; - ResizerByteAreaRowUpdate(src, stride, size, curr, count == 1 ? 
zero - next : zero, dst), src += 2 * stride, i += 2; - for (; i < count; i += 2, src += 2 * stride) - ResizerByteAreaRowUpdate(src, stride, size, zero, i == count - 1 ? zero - next : zero, dst); - if (i == count) - ResizerByteAreaRowUpdate(src, size, zero - next, dst); - } - else - ResizerByteAreaRowUpdate(src, size, curr - next, dst); - } - - template SIMD_INLINE void ResizerByteAreaSet(const int32_t * src, int32_t value, int32_t * dst) - { - for (size_t c = 0; c < N; ++c) - dst[c] = src[c] * value; - } - - template SIMD_INLINE void ResizerByteAreaAdd(const int32_t * src, int32_t value, int32_t * dst) - { - for (size_t c = 0; c < N; ++c) - dst[c] += src[c] * value; - } - - template SIMD_INLINE void ResizerByteAreaRes(const int32_t * src, uint8_t * dst) - { - for (size_t c = 0; c < N; ++c) - dst[c] = uint8_t((src[c] + Base::AREA_ROUND) >> Base::AREA_SHIFT); - } - - template SIMD_INLINE void ResizerByteAreaResult(const int32_t * src, size_t count, int32_t curr, int32_t zero, int32_t next, uint8_t * dst) - { - int32_t sum[N]; - ResizerByteAreaSet(src, curr, sum); - for (size_t i = 0; i < count; ++i) - src += N, ResizerByteAreaAdd(src, zero, sum); - ResizerByteAreaAdd(src, -next, sum); - ResizerByteAreaRes(sum, dst); - } - - template SIMD_INLINE void ResizerByteAreaResult34(const int32_t * src, size_t count, int32_t curr, int32_t zero, int32_t next, uint8_t * dst) - { - int32x4_t sum = vmulq_s32(Load(src), vdupq_n_s32(curr)); - for (size_t i = 0; i < count; ++i) - src += N, sum = vmlaq_s32(sum, Load(src), vdupq_n_s32(zero)); - sum = vmlaq_s32(sum, Load(src), vdupq_n_s32(-next)); - int32x4_t res = vshrq_n_s32(vaddq_s32(sum, vdupq_n_s32(Base::AREA_ROUND)), Base::AREA_SHIFT); - *(uint32_t*)dst = vget_lane_u32((uint32x2_t)vqmovn_u16(vcombine_u16(vqmovun_s32(res), vdup_n_u16(0))), 0); - } - - template<> SIMD_INLINE void ResizerByteAreaResult<4>(const int32_t * src, size_t count, int32_t curr, int32_t zero, int32_t next, uint8_t * dst) - { - ResizerByteAreaResult34<4>(src, count, curr, zero, next, dst); - } - - template<> SIMD_INLINE void ResizerByteAreaResult<3>(const int32_t * src, size_t count, int32_t curr, int32_t zero, int32_t next, uint8_t * dst) - { - ResizerByteAreaResult34<3>(src, count, curr, zero, next, dst); - } - - template void ResizerByteArea::Run(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride) - { - size_t dstW = _param.dstW, rowSize = _param.srcW*N, rowRest = dstStride - dstW * N; - const int32_t * iy = _iy.data, *ix = _ix.data, *ay = _ay.data, *ax = _ax.data; - int32_t ay0 = ay[0], ax0 = ax[0]; - for (size_t dy = 0; dy < _param.dstH; dy++, dst += rowRest) - { - int32_t * buf = _by.data; - size_t yn = iy[dy + 1] - iy[dy]; - ResizerByteAreaRowSum(src, srcStride, yn, rowSize, ay[dy], ay0, ay[dy + 1], buf), src += yn * srcStride; - for (size_t dx = 0; dx < dstW; dx++, dst += N) - { - size_t xn = ix[dx + 1] - ix[dx]; - ResizerByteAreaResult(buf, xn, ax[dx], ax0, ax[dx + 1], dst), buf += xn * N; - } - } - } - - void ResizerByteArea::Run(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride) - { - switch (_param.channels) - { - case 1: Run<1>(src, srcStride, dst, dstStride); return; - case 2: Run<2>(src, srcStride, dst, dstStride); return; - case 3: Run<3>(src, srcStride, dst, dstStride); return; - case 4: Run<4>(src, srcStride, dst, dstStride); return; - default: - assert(0); - } - } - - //--------------------------------------------------------------------- - - ResizerFloatBilinear::ResizerFloatBilinear(const ResParam & param) - : 
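/* Note, not from the original file: unlike the byte resizers above, which
   accumulate in integer fixed point scaled by Base::FRACTION_RANGE and need a
   rounding term plus a final shift, the float resizer below keeps its x/y
   weights as plain floats and blends rows directly with vmlaq_f32. */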
Base::ResizerFloatBilinear(param) - { - } - - void ResizerFloatBilinear::Run(const float * src, size_t srcStride, float * dst, size_t dstStride) - { - size_t cn = _param.channels; - size_t rs = _param.dstW * cn; - float * pbx[2] = { _bx[0].data, _bx[1].data }; - int32_t prev = -2; - size_t rsa = AlignLo(rs, F); - for (size_t dy = 0; dy < _param.dstH; dy++, dst += dstStride) - { - float fy1 = _ay[dy]; - float fy0 = 1.0f - fy1; - int32_t sy = _iy[dy]; - int32_t k = 0; - - if (sy == prev) - k = 2; - else if (sy == prev + 1) - { - Swap(pbx[0], pbx[1]); - k = 1; - } - - prev = sy; - - for (; k < 2; k++) - { - float * pb = pbx[k]; - const float * ps = src + (sy + k)*srcStride; - size_t dx = 0; - if (cn == 1) - { - float32x4_t _1 = vdupq_n_f32(1.0f); - for (; dx < rsa; dx += F) - { - float32x4_t s01 = Load(ps + _ix[dx + 0], ps + _ix[dx + 1]); - float32x4_t s23 = Load(ps + _ix[dx + 2], ps + _ix[dx + 3]); - float32x4_t fx1 = Load(_ax.data + dx); - float32x4_t fx0 = vsubq_f32(_1, fx1); - float32x4x2_t us = vuzpq_f32(s01, s23); - Store(pb + dx, vmlaq_f32(vmulq_f32(us.val[0], fx0), us.val[1], fx1)); - } - } - if (cn == 3 && rs > 3) - { - float32x4_t _1 = vdupq_n_f32(1.0f); - size_t rs3 = rs - 3; - for (; dx < rs3; dx += 3) - { - float32x4_t s0 = Load(ps + _ix[dx] + 0); - float32x4_t s1 = Load(ps + _ix[dx] + 3); - float32x4_t fx1 = vdupq_n_f32(_ax.data[dx]); - float32x4_t fx0 = vsubq_f32(_1, fx1); - Store(pb + dx, vmlaq_f32(vmulq_f32(fx0, s0), fx1, s1)); - } - } - for (; dx < rs; dx++) - { - int32_t sx = _ix[dx]; - float fx = _ax[dx]; - pb[dx] = ps[sx] * (1.0f - fx) + ps[sx + cn] * fx; - } - } - - size_t dx = 0; - float32x4_t _fy0 = vdupq_n_f32(fy0); - float32x4_t _fy1 = vdupq_n_f32(fy1); - for (; dx < rsa; dx += F) - Store(dst + dx, vmlaq_f32(vmulq_f32(Load(pbx[0] + dx), _fy0), Load(pbx[1] + dx), _fy1)); - for (; dx < rs; dx++) - dst[dx] = pbx[0][dx] * fy0 + pbx[1][dx] * fy1; - } - } - - //--------------------------------------------------------------------- - - void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method) - { - ResParam param(srcX, srcY, dstX, dstY, channels, type, method, sizeof(float32x4_t)); - if (param.IsByteBilinear() && dstX >= A) - return new ResizerByteBilinear(param); - else if (param.IsByteArea()) - return new ResizerByteArea(param); - else if (param.IsFloatBilinear()) - return new ResizerFloatBilinear(param); - else - return Base::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method); - } - } -#endif// SIMD_NEON_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdNeonSegmentation.cpp b/src/3rd/Simd/Simd/SimdNeonSegmentation.cpp deleted file mode 100644 index 24d93624..00000000 --- a/src/3rd/Simd/Simd/SimdNeonSegmentation.cpp +++ /dev/null @@ -1,293 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. 
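An aside on the resizer factory deleted above (hedged: SimdResizerInit, SimdResizerRun and SimdRelease are taken from the library's public C header, which this diff does not show):

    #include <stddef.h>
    #include <stdint.h>
    #include "Simd/SimdLib.h"

    // Halve a packed grayscale image (stride == width) via the bilinear path.
    void HalveGray(const uint8_t * src, size_t srcW, size_t srcH, uint8_t * dst)
    {
        void * resizer = SimdResizerInit(srcW, srcH, srcW / 2, srcH / 2, 1,
            SimdResizeChannelByte, SimdResizeMethodBilinear);
        SimdResizerRun(resizer, src, srcW, dst, srcW / 2);
        SimdRelease(resizer);
    }

With byte bilinear parameters and dstX >= A, the ResizerInit above would route such a call to the NEON ResizerByteBilinear; otherwise it falls back to the Base implementation.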
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdCompare.h" - -namespace Simd -{ -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - template SIMD_INLINE void ChangeIndex(uint8_t * mask, const uint8x16_t & oldIndex, const uint8x16_t & newIndex) - { - uint8x16_t _mask = Load(mask); - Store(mask, vbslq_u8(vceqq_u8(_mask, oldIndex), newIndex, _mask)); - } - - template void SegmentationChangeIndex(uint8_t * mask, size_t stride, size_t width, size_t height, uint8_t oldIndex, uint8_t newIndex) - { - if (align) - assert(Aligned(mask) && Aligned(stride)); - - uint8x16_t _oldIndex = vdupq_n_u8(oldIndex); - uint8x16_t _newIndex = vdupq_n_u8(newIndex); - size_t alignedWidth = Simd::AlignLo(width, A); - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - ChangeIndex(mask + col, _oldIndex, _newIndex); - if (alignedWidth != width) - ChangeIndex(mask + width - A, _oldIndex, _newIndex); - mask += stride; - } - } - - void SegmentationChangeIndex(uint8_t * mask, size_t stride, size_t width, size_t height, uint8_t oldIndex, uint8_t newIndex) - { - if (Aligned(mask) && Aligned(stride)) - SegmentationChangeIndex(mask, stride, width, height, oldIndex, newIndex); - else - SegmentationChangeIndex(mask, stride, width, height, oldIndex, newIndex); - } - - template SIMD_INLINE void FillSingleHoles(uint8_t * mask, ptrdiff_t stride, const uint8x16_t & index) - { - uint8x16_t up = vceqq_u8(Load(mask - stride), index); - uint8x16_t left = vceqq_u8(Load(mask - 1), index); - uint8x16_t right = vceqq_u8(Load(mask + 1), index); - uint8x16_t down = vceqq_u8(Load(mask + stride), index); - Store(mask, vbslq_u8(vandq_u8(vandq_u8(up, left), vandq_u8(right, down)), index, Load(mask))); - } - - template void SegmentationFillSingleHoles(uint8_t * mask, size_t stride, size_t width, size_t height, uint8_t index) - { - assert(width > A + 2 && height > 2); - if (align) - assert(Aligned(mask) && Aligned(stride)); - - height -= 1; - width -= 1; - uint8x16_t _index = vdupq_n_u8(index); - size_t alignedWidth = Simd::AlignLo(width, A); - for (size_t row = 1; row < height; ++row) - { - mask += stride; - - FillSingleHoles(mask + 1, stride, _index); - - for (size_t col = A; col < alignedWidth; col += A) - FillSingleHoles(mask + col, stride, _index); - - if (alignedWidth != width) - FillSingleHoles(mask + width - A, stride, _index); - } - } - - void SegmentationFillSingleHoles(uint8_t * mask, size_t stride, size_t width, size_t height, uint8_t index) - { - if (Aligned(mask) && Aligned(stride)) - SegmentationFillSingleHoles(mask, stride, width, height, index); - else - SegmentationFillSingleHoles(mask, stride, width, height, index); - } - - SIMD_INLINE void SegmentationPropagate2x2(const uint8x16_t & parentOne, const uint8x16_t & parentAll, - const uint8_t * difference0, const uint8_t * difference1, uint8_t * child0, uint8_t * child1, size_t childCol, - const uint8x16_t & index, const uint8x16_t & invalid, const uint8x16_t & empty, const uint8x16_t & threshold) - { - const uint8x16_t 
_difference0 = Load(difference0 + childCol); - const uint8x16_t _difference1 = Load(difference1 + childCol); - const uint8x16_t _child0 = Load(child0 + childCol); - const uint8x16_t _child1 = Load(child1 + childCol); - const uint8x16_t condition0 = vorrq_u8(parentAll, vandq_u8(parentOne, vcgtq_u8(_difference0, threshold))); - const uint8x16_t condition1 = vorrq_u8(parentAll, vandq_u8(parentOne, vcgtq_u8(_difference1, threshold))); - Store(child0 + childCol, vbslq_u8(vcltq_u8(_child0, invalid), vbslq_u8(condition0, index, empty), _child0)); - Store(child1 + childCol, vbslq_u8(vcltq_u8(_child1, invalid), vbslq_u8(condition1, index, empty), _child1)); - } - - template SIMD_INLINE void SegmentationPropagate2x2(const uint8_t * parent0, const uint8_t * parent1, size_t parentCol, - const uint8_t * difference0, const uint8_t * difference1, uint8_t * child0, uint8_t * child1, size_t childCol, - const uint8x16_t & index, const uint8x16_t & invalid, const uint8x16_t & empty, const uint8x16_t & threshold) - { - const uint8x16_t parent00 = vceqq_u8(Load(parent0 + parentCol), index); - const uint8x16_t parent01 = vceqq_u8(Load(parent0 + parentCol + 1), index); - const uint8x16_t parent10 = vceqq_u8(Load(parent1 + parentCol), index); - const uint8x16_t parent11 = vceqq_u8(Load(parent1 + parentCol + 1), index); - const uint8x16_t parentOne = vorrq_u8(vorrq_u8(parent00, parent01), vorrq_u8(parent10, parent11)); - const uint8x16_t parentAll = vandq_u8(vandq_u8(parent00, parent01), vandq_u8(parent10, parent11)); - - SegmentationPropagate2x2(Stretch2<0>(parentOne), Stretch2<0>(parentAll), - difference0, difference1, child0, child1, childCol, index, invalid, empty, threshold); - - SegmentationPropagate2x2(Stretch2<1>(parentOne), Stretch2<1>(parentAll), - difference0, difference1, child0, child1, childCol + A, index, invalid, empty, threshold); - } - - template void SegmentationPropagate2x2(const uint8_t * parent, size_t parentStride, size_t width, size_t height, - uint8_t * child, size_t childStride, const uint8_t * difference, size_t differenceStride, - uint8_t currentIndex, uint8_t invalidIndex, uint8_t emptyIndex, uint8_t differenceThreshold) - { - assert(width >= A + 1 && height >= 2); - height--; - width--; - - size_t alignedWidth = Simd::AlignLo(width, A); - uint8x16_t index = vdupq_n_u8(currentIndex); - uint8x16_t invalid = vdupq_n_u8(invalidIndex); - uint8x16_t empty = vdupq_n_u8(emptyIndex); - uint8x16_t threshold = vdupq_n_u8(differenceThreshold); - - for (size_t parentRow = 0, childRow = 1; parentRow < height; ++parentRow, childRow += 2) - { - const uint8_t * parent0 = parent + parentRow*parentStride; - const uint8_t * parent1 = parent0 + parentStride; - const uint8_t * difference0 = difference + childRow*differenceStride; - const uint8_t * difference1 = difference0 + differenceStride; - uint8_t * child0 = child + childRow*childStride; - uint8_t * child1 = child0 + childStride; - - for (size_t parentCol = 0, childCol = 1; parentCol < alignedWidth; parentCol += A, childCol += DA) - SegmentationPropagate2x2(parent0, parent1, parentCol, difference0, difference1, - child0, child1, childCol, index, invalid, empty, threshold); - if (alignedWidth != width) - SegmentationPropagate2x2(parent0, parent1, width - A, difference0, difference1, - child0, child1, (width - A) * 2 + 1, index, invalid, empty, threshold); - } - } - - void SegmentationPropagate2x2(const uint8_t * parent, size_t parentStride, size_t width, size_t height, - uint8_t * child, size_t childStride, const uint8_t * difference, size_t 
differenceStride, - uint8_t currentIndex, uint8_t invalidIndex, uint8_t emptyIndex, uint8_t differenceThreshold) - { - if (Aligned(parent) && Aligned(parentStride)) - SegmentationPropagate2x2(parent, parentStride, width, height, child, childStride, - difference, differenceStride, currentIndex, invalidIndex, emptyIndex, differenceThreshold); - else - SegmentationPropagate2x2(parent, parentStride, width, height, child, childStride, - difference, differenceStride, currentIndex, invalidIndex, emptyIndex, differenceThreshold); - } - - SIMD_INLINE bool IsNotZero(uint8x16_t value) - { - uint32x2_t tmp = (uint32x2_t)vorr_u8(Half<0>(value), Half<1>(value)); - return vget_lane_u32(vpmax_u32(tmp, tmp), 0); - } - - SIMD_INLINE bool RowHasIndex(const uint8_t * mask, size_t alignedSize, size_t fullSize, uint8x16_t index) - { - for (size_t col = 0; col < alignedSize; col += A) - { - if (IsNotZero(vceqq_u8(Load(mask + col), index))) - return true; - } - if (alignedSize != fullSize) - { - if (IsNotZero(vceqq_u8(Load(mask + fullSize - A), index))) - return true; - } - return false; - } - - SIMD_INLINE bool ColsHasIndex(const uint8_t * mask, size_t stride, size_t size, uint8x16_t index, uint8_t * cols) - { - uint8x16_t _cols = K8_00; - for (size_t row = 0; row < size; ++row) - { - _cols = vorrq_u8(_cols, vceqq_u8(Load(mask), index)); - mask += stride; - } - Store(cols, _cols); - return IsNotZero(_cols); - } - - void SegmentationShrinkRegion(const uint8_t * mask, size_t stride, size_t width, size_t height, uint8_t index, - ptrdiff_t * left, ptrdiff_t * top, ptrdiff_t * right, ptrdiff_t * bottom) - { - assert(*right - *left >= (ptrdiff_t)A && *bottom > *top); - assert(*left >= 0 && *right <= (ptrdiff_t)width && *top >= 0 && *bottom <= (ptrdiff_t)height); - - size_t fullWidth = *right - *left; - ptrdiff_t alignedWidth = Simd::AlignLo(fullWidth, A); - uint8x16_t _index = vdupq_n_u8(index); - bool search = true; - for (ptrdiff_t row = *top; search && row < *bottom; ++row) - { - if (RowHasIndex(mask + row*stride + *left, alignedWidth, fullWidth, _index)) - { - search = false; - *top = row; - } - } - - if (search) - { - *left = 0; - *top = 0; - *right = 0; - *bottom = 0; - return; - } - - search = true; - for (ptrdiff_t row = *bottom - 1; search && row >= *top; --row) - { - if (RowHasIndex(mask + row*stride + *left, alignedWidth, fullWidth, _index)) - { - search = false; - *bottom = row + 1; - } - } - - search = true; - for (ptrdiff_t col = *left; search && col < *left + alignedWidth; col += A) - { - uint8_t cols[A]; - if (ColsHasIndex(mask + (*top)*stride + col, stride, *bottom - *top, _index, cols)) - { - for (size_t i = 0; i < A; i++) - { - if (cols[i]) - { - *left = col + i; - break; - } - } - search = false; - break; - } - } - - search = true; - for (ptrdiff_t col = *right; search && col > *left; col -= A) - { - uint8_t cols[A]; - if (ColsHasIndex(mask + (*top)*stride + col - A, stride, *bottom - *top, _index, cols)) - { - for (ptrdiff_t i = A - 1; i >= 0; i--) - { - if (cols[i]) - { - *right = col - A + i + 1; - break; - } - } - search = false; - break; - } - } - } - } -#endif//SIMD_NEON_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdNeonShiftBilinear.cpp b/src/3rd/Simd/Simd/SimdNeonShiftBilinear.cpp deleted file mode 100644 index a0815142..00000000 --- a/src/3rd/Simd/Simd/SimdNeonShiftBilinear.cpp +++ /dev/null @@ -1,179 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. 
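An aside on SimdNeonSegmentation.cpp above (not part of the diff): the ChangeIndex kernel is a vectorised masked select that implements, per pixel,

    if (mask[i] == oldIndex)
        mask[i] = newIndex;

via the vceqq_u8/vbslq_u8 pair. FillSingleHoles reuses the same select with a neighbourhood test, overwriting a pixel with index only when all four up/down/left/right neighbours already equal index, and ShrinkRegion scans rows and A-wide column strips for the index to tighten the bounding box.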
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdBase.h" - -namespace Simd -{ -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - const uint16x8_t K16_LINEAR_ROUND_TERM = SIMD_VEC_SET1_EPI16(Base::LINEAR_ROUND_TERM); - const uint16x8_t K16_BILINEAR_ROUND_TERM = SIMD_VEC_SET1_EPI16(Base::BILINEAR_ROUND_TERM); - - template SIMD_INLINE uint16x8_t Interpolate(uint8x16_t s[2][2], uint8x8_t k[2][2]) - { - return vshrq_n_u16(vmlal_u8(vmlal_u8(vmlal_u8(vmlal_u8(K16_BILINEAR_ROUND_TERM, Half(s[0][0]), k[0][0]), - Half(s[0][1]), k[0][1]), Half(s[1][0]), k[1][0]), Half(s[1][1]), k[1][1]), Base::BILINEAR_SHIFT); - } - - SIMD_INLINE uint8x16_t Interpolate(uint8x16_t s[2][2], uint8x8_t k[2][2]) - { - return PackU16(Interpolate<0>(s, k), Interpolate<1>(s, k)); - } - - template SIMD_INLINE uint16x8_t Interpolate(uint8x16_t s[2], uint8x8_t k[2]) - { - return vshrq_n_u16(vmlal_u8(vmlal_u8(K16_LINEAR_ROUND_TERM, Half(s[0]), k[0]), Half(s[1]), k[1]), Base::LINEAR_SHIFT); - } - - SIMD_INLINE uint8x16_t Interpolate(uint8x16_t s[2], uint8x8_t k[2]) - { - return PackU16(Interpolate<0>(s, k), Interpolate<1>(s, k)); - } - - SIMD_INLINE void LoadBlock(const uint8_t * src, size_t dx, size_t dy, uint8x16_t s[2][2]) - { - s[0][0] = Load(src); - s[0][1] = Load(src + dx); - s[1][0] = Load(src + dy); - s[1][1] = Load(src + dy + dx); - } - - SIMD_INLINE void LoadBlock(const uint8_t * src, size_t dr, uint8x16_t s[2]) - { - s[0] = Load(src); - s[1] = Load(src + dr); - } - - void ShiftBilinear(const uint8_t *src, size_t srcStride, size_t width, size_t height, size_t channelCount, - int fDx, int fDy, uint8_t *dst, size_t dstStride) - { - size_t size = width*channelCount; - size_t alignedSize = AlignLo(size, A); - - if (fDy) - { - if (fDx) - { - uint8x8_t k[2][2]; - uint8x16_t s[2][2]; - k[0][0] = vdup_n_u8((Base::FRACTION_RANGE - fDx)*(Base::FRACTION_RANGE - fDy)); - k[0][1] = vdup_n_u8(fDx*(Base::FRACTION_RANGE - fDy)); - k[1][0] = vdup_n_u8((Base::FRACTION_RANGE - fDx)*fDy); - k[1][1] = vdup_n_u8(fDx*fDy); - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedSize; col += A) - { - LoadBlock(src + col, channelCount, srcStride, s); - Store(dst + col, Interpolate(s, k)); - } - if (size != alignedSize) - { - LoadBlock(src + size - A, channelCount, srcStride, s); - Store(dst + size - A, Interpolate(s, k)); - } - src += srcStride; - dst += dstStride; - } - } - else - { - uint8x8_t k[2]; - uint8x16_t s[2]; - k[0] = 
vdup_n_u8(Base::FRACTION_RANGE - fDy); - k[1] = vdup_n_u8(fDy); - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedSize; col += A) - { - LoadBlock(src + col, srcStride, s); - Store(dst + col, Interpolate(s, k)); - } - if (size != alignedSize) - { - LoadBlock(src + size - A, srcStride, s); - Store(dst + size - A, Interpolate(s, k)); - } - src += srcStride; - dst += dstStride; - } - } - } - else - { - if (fDx) - { - uint8x8_t k[2]; - uint8x16_t s[2]; - k[0] = vdup_n_u8(Base::FRACTION_RANGE - fDx); - k[1] = vdup_n_u8(fDx); - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedSize; col += A) - { - LoadBlock(src + col, channelCount, s); - Store(dst + col, Interpolate(s, k)); - } - if (size != alignedSize) - { - LoadBlock(src + size - A, channelCount, s); - Store(dst + size - A, Interpolate(s, k)); - } - src += srcStride; - dst += dstStride; - } - } - else - { - for (size_t row = 0; row < height; ++row) - { - memcpy(dst, src, size); - src += srcStride; - dst += dstStride; - } - } - } - } - - void ShiftBilinear( - const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t channelCount, - const uint8_t * bkg, size_t bkgStride, const double * shiftX, const double * shiftY, - size_t cropLeft, size_t cropTop, size_t cropRight, size_t cropBottom, uint8_t * dst, size_t dstStride) - { - int fDx, fDy; - Base::CommonShiftAction(src, srcStride, width, height, channelCount, bkg, bkgStride, shiftX, shiftY, - cropLeft, cropTop, cropRight, cropBottom, dst, dstStride, fDx, fDy); - - if (*shiftX + A < cropRight - cropLeft) - Neon::ShiftBilinear(src, srcStride, width, height, channelCount, fDx, fDy, dst, dstStride); - else - Base::ShiftBilinear(src, srcStride, width, height, channelCount, fDx, fDy, dst, dstStride); - } - } -#endif//SIMD_NEON_ENABLE -} - diff --git a/src/3rd/Simd/Simd/SimdNeonSobel.cpp b/src/3rd/Simd/Simd/SimdNeonSobel.cpp deleted file mode 100644 index 67d45206..00000000 --- a/src/3rd/Simd/Simd/SimdNeonSobel.cpp +++ /dev/null @@ -1,478 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
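An aside on the ShiftBilinear kernel deleted above (not part of the diff): for a fractional shift (fDx, fDy) the four taps are weighted

    k[0][0] = (R - fDx) * (R - fDy);   k[0][1] = fDx * (R - fDy);
    k[1][0] = (R - fDx) * fDy;         k[1][1] = fDx * fDy;

with R = Base::FRACTION_RANGE, so the weights sum to R * R and the widened 16-bit accumulation is rounded by K16_BILINEAR_ROUND_TERM and shifted right by Base::BILINEAR_SHIFT. The pure-horizontal and pure-vertical branches use the two-tap linear variant, and a zero shift degenerates to a row-by-row memcpy.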
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdExtract.h" - -namespace Simd -{ -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - template SIMD_INLINE int16x8_t SobelDx(uint8x16_t a[3][3]) - { - return ConditionalAbs(BinomialSum(Sub(a[0][2], a[0][0]), Sub(a[1][2], a[1][0]), Sub(a[2][2], a[2][0]))); - } - - template SIMD_INLINE void SobelDx(uint8x16_t a[3][3], int16_t * dst) - { - Store(dst, SobelDx(a)); - Store(dst + HA, SobelDx(a)); - } - - template void SobelDx(const uint8_t * src, size_t srcStride, size_t width, size_t height, int16_t * dst, size_t dstStride) - { - assert(width > A); - if (align) - assert(Aligned(dst) && Aligned(dstStride, HA)); - - size_t bodyWidth = Simd::AlignHi(width, A) - A; - const uint8_t *src0, *src1, *src2; - uint8x16_t a[3][3]; - - for (size_t row = 0; row < height; ++row) - { - src0 = src + srcStride*(row - 1); - src1 = src0 + srcStride; - src2 = src1 + srcStride; - if (row == 0) - src0 = src1; - if (row == height - 1) - src2 = src1; - - LoadNoseDx(src0 + 0, a[0]); - LoadNoseDx(src1 + 0, a[1]); - LoadNoseDx(src2 + 0, a[2]); - SobelDx(a, dst + 0); - for (size_t col = A; col < bodyWidth; col += A) - { - LoadBodyDx(src0 + col, a[0]); - LoadBodyDx(src1 + col, a[1]); - LoadBodyDx(src2 + col, a[2]); - SobelDx(a, dst + col); - } - LoadTailDx(src0 + width - A, a[0]); - LoadTailDx(src1 + width - A, a[1]); - LoadTailDx(src2 + width - A, a[2]); - SobelDx(a, dst + width - A); - - dst += dstStride; - } - } - - void SobelDx(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride) - { - assert(dstStride % sizeof(int16_t) == 0); - - if (Aligned(dst) && Aligned(dstStride)) - SobelDx(src, srcStride, width, height, (int16_t *)dst, dstStride / sizeof(int16_t)); - else - SobelDx(src, srcStride, width, height, (int16_t *)dst, dstStride / sizeof(int16_t)); - } - - void SobelDxAbs(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride) - { - assert(dstStride % sizeof(int16_t) == 0); - - if (Aligned(dst) && Aligned(dstStride)) - SobelDx(src, srcStride, width, height, (int16_t *)dst, dstStride / sizeof(int16_t)); - else - SobelDx(src, srcStride, width, height, (int16_t *)dst, dstStride / sizeof(int16_t)); - } - - SIMD_INLINE void SobelDxAbsSum(uint8x16_t a[3][3], uint32x4_t & sum) - { - sum = vaddq_u32(sum, vpaddlq_u16((uint16x8_t)vaddq_s16(SobelDx(a), SobelDx(a)))); - } - - SIMD_INLINE void SetMask3(uint8x16_t a[3], uint8x16_t mask) - { - a[0] = vandq_u8(a[0], mask); - a[1] = vandq_u8(a[1], mask); - a[2] = vandq_u8(a[2], mask); - } - - SIMD_INLINE void SetMask3x3(uint8x16_t a[3][3], uint8x16_t mask) - { - SetMask3(a[0], mask); - SetMask3(a[1], mask); - SetMask3(a[2], mask); - } - - void SobelDxAbsSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum) - { - assert(width > A); - - size_t bodyWidth = Simd::AlignHi(width, A) - A; - const uint8_t *src0, *src1, *src2; - - uint8x16_t a[3][3]; - uint64x2_t fullSum = K64_0000000000000000; - uint8x16_t tailMask = ShiftLeft(K8_FF, A - width + bodyWidth); - - for (size_t row = 0; row < height; ++row) - { - src0 = src + stride*(row - 1); - src1 = src0 + stride; - src2 = src1 + stride; - if (row == 0) - src0 = src1; - if (row == height - 1) - src2 = src1; - - uint32x4_t rowSum = K32_00000000; - - LoadNoseDx(src0 + 0, a[0]); - LoadNoseDx(src1 + 0, a[1]); - LoadNoseDx(src2 + 0, a[2]); - SobelDxAbsSum(a, rowSum); - for (size_t col = A; col < bodyWidth; col += A) - { - LoadBodyDx(src0 + col, a[0]); - 
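/* Note, not from the original file: this load and the two that follow refresh
   the 3x3 neighbourhood a[][]. SobelDx then evaluates the classic horizontal
   kernel {-1 0 +1; -2 0 +2; -1 0 +1}: per row it takes (right - left) and
   BinomialSum weights the three row differences 1-2-1, with ConditionalAbs
   folding in the absolute value for the Abs/AbsSum variants. */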
LoadBodyDx(src1 + col, a[1]); - LoadBodyDx(src2 + col, a[2]); - SobelDxAbsSum(a, rowSum); - } - LoadTailDx(src0 + width - A, a[0]); - LoadTailDx(src1 + width - A, a[1]); - LoadTailDx(src2 + width - A, a[2]); - SetMask3x3(a, tailMask); - SobelDxAbsSum(a, rowSum); - - fullSum = vaddq_u64(fullSum, vpaddlq_u32(rowSum)); - } - *sum = ExtractSum64u(fullSum); - } - - template SIMD_INLINE int16x8_t SobelDy(uint8x16_t a[3][3]) - { - return ConditionalAbs(BinomialSum(Sub(a[2][0], a[0][0]), Sub(a[2][1], a[0][1]), Sub(a[2][2], a[0][2]))); - } - - template SIMD_INLINE void SobelDy(uint8x16_t a[3][3], int16_t * dst) - { - Store(dst, SobelDy(a)); - Store(dst + HA, SobelDy(a)); - } - - template void SobelDy(const uint8_t * src, size_t srcStride, size_t width, size_t height, int16_t * dst, size_t dstStride) - { - assert(width > A); - if (align) - assert(Aligned(dst) && Aligned(dstStride, HA)); - - size_t bodyWidth = Simd::AlignHi(width, A) - A; - const uint8_t *src0, *src1, *src2; - uint8x16_t a[3][3]; - - for (size_t row = 0; row < height; ++row) - { - src0 = src + srcStride*(row - 1); - src1 = src0 + srcStride; - src2 = src1 + srcStride; - if (row == 0) - src0 = src1; - if (row == height - 1) - src2 = src1; - - LoadNose3(src0 + 0, a[0]); - LoadNose3(src2 + 0, a[2]); - SobelDy(a, dst + 0); - for (size_t col = A; col < bodyWidth; col += A) - { - LoadBody3(src0 + col, a[0]); - LoadBody3(src2 + col, a[2]); - SobelDy(a, dst + col); - } - LoadTail3(src0 + width - A, a[0]); - LoadTail3(src2 + width - A, a[2]); - SobelDy(a, dst + width - A); - - dst += dstStride; - } - } - - void SobelDy(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride) - { - assert(dstStride % sizeof(int16_t) == 0); - - if (Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride)) - SobelDy(src, srcStride, width, height, (int16_t *)dst, dstStride / sizeof(int16_t)); - else - SobelDy(src, srcStride, width, height, (int16_t *)dst, dstStride / sizeof(int16_t)); - } - - void SobelDyAbs(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride) - { - assert(dstStride % sizeof(int16_t) == 0); - - if (Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride)) - SobelDy(src, srcStride, width, height, (int16_t *)dst, dstStride / sizeof(int16_t)); - else - SobelDy(src, srcStride, width, height, (int16_t *)dst, dstStride / sizeof(int16_t)); - } - - SIMD_INLINE void SobelDyAbsSum(uint8x16_t a[3][3], uint32x4_t & sum) - { - sum = vaddq_u32(sum, vpaddlq_u16((uint16x8_t)vaddq_s16(SobelDy(a), SobelDy(a)))); - } - - template void SobelDyAbsSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum) - { - assert(width > A); - - size_t bodyWidth = Simd::AlignHi(width, A) - A; - const uint8_t *src0, *src1, *src2; - - uint8x16_t a[3][3]; - uint8x16_t tailMask = ShiftLeft(K8_FF, A - width + bodyWidth); - uint64x2_t fullSum = K64_0000000000000000; - - for (size_t row = 0; row < height; ++row) - { - src0 = src + stride*(row - 1); - src1 = src0 + stride; - src2 = src1 + stride; - if (row == 0) - src0 = src1; - if (row == height - 1) - src2 = src1; - - uint32x4_t rowSum = K32_00000000; - - LoadNose3(src0 + 0, a[0]); - LoadNose3(src2 + 0, a[2]); - SobelDyAbsSum(a, rowSum); - for (size_t col = A; col < bodyWidth; col += A) - { - LoadBody3(src0 + col, a[0]); - LoadBody3(src2 + col, a[2]); - SobelDyAbsSum(a, rowSum); - } - LoadTail3(src0 + width - A, a[0]); - LoadTail3(src2 + width - A, a[2]); - SetMask3x3(a, tailMask); - 
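/* Note, not from the original file: the tail vector starts at width - A and
   overlaps columns already accumulated by the aligned body loop; the mask
   applied just above zeroes those first AlignHi(width, A) - width input
   lanes, so their gradients contribute nothing extra to the row sum. */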
SobelDyAbsSum(a, rowSum); - - fullSum = vaddq_u64(fullSum, vpaddlq_u32(rowSum)); - } - *sum = ExtractSum64u(fullSum); - } - - void SobelDyAbsSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum) - { - if (Aligned(src) && Aligned(stride)) - SobelDyAbsSum(src, stride, width, height, sum); - else - SobelDyAbsSum(src, stride, width, height, sum); - } - - template SIMD_INLINE int16x8_t ContourMetrics(uint8x16_t a[3][3]) - { - int16x8_t dx = SobelDx(a); - int16x8_t dy = SobelDy(a); - return vaddq_s16(vshlq_n_s16(vaddq_s16(dx, dy), 1), (int16x8_t)vandq_u16(vcltq_s16(dx, dy), K16_0001)); - } - - template SIMD_INLINE void ContourMetrics(uint8x16_t a[3][3], int16_t * dst) - { - Store(dst, ContourMetrics<0>(a)); - Store(dst + HA, ContourMetrics<1>(a)); - } - - template void ContourMetrics(const uint8_t * src, size_t srcStride, size_t width, size_t height, int16_t * dst, size_t dstStride) - { - assert(width > A); - if (align) - assert(Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride, HA)); - - size_t bodyWidth = Simd::AlignHi(width, A) - A; - const uint8_t *src0, *src1, *src2; - uint8x16_t a[3][3]; - - for (size_t row = 0; row < height; ++row) - { - src0 = src + srcStride*(row - 1); - src1 = src0 + srcStride; - src2 = src1 + srcStride; - if (row == 0) - src0 = src1; - if (row == height - 1) - src2 = src1; - - LoadNose3(src0 + 0, a[0]); - LoadNose3(src1 + 0, a[1]); - LoadNose3(src2 + 0, a[2]); - ContourMetrics(a, dst + 0); - for (size_t col = A; col < bodyWidth; col += A) - { - LoadBody3(src0 + col, a[0]); - LoadBody3(src1 + col, a[1]); - LoadBody3(src2 + col, a[2]); - ContourMetrics(a, dst + col); - } - LoadTail3(src0 + width - A, a[0]); - LoadTail3(src1 + width - A, a[1]); - LoadTail3(src2 + width - A, a[2]); - ContourMetrics(a, dst + width - A); - - dst += dstStride; - } - } - - void ContourMetrics(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride) - { - assert(dstStride % sizeof(int16_t) == 0); - - if (Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride)) - ContourMetrics(src, srcStride, width, height, (int16_t *)dst, dstStride / sizeof(int16_t)); - else - ContourMetrics(src, srcStride, width, height, (int16_t *)dst, dstStride / sizeof(int16_t)); - } - - template SIMD_INLINE void ContourMetricsMasked(uint8x16_t a[3][3], const uint8_t * mask, const uint8x16_t & indexMin, int16_t * dst) - { - uint8x16_t _mask = vcgeq_u8(Load(mask), indexMin); - Store(dst, vandq_s16(ContourMetrics<0>(a), (int16x8_t)Stretch2<0>(_mask))); - Store(dst + HA, vandq_s16(ContourMetrics<1>(a), (int16x8_t)Stretch2<1>(_mask))); - } - - template void ContourMetricsMasked(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * mask, size_t maskStride, uint8_t indexMin, int16_t * dst, size_t dstStride) - { - assert(width > A); - if (align) - assert(Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride, HA) && Aligned(mask) && Aligned(maskStride)); - - size_t bodyWidth = Simd::AlignHi(width, A) - A; - const uint8_t *src0, *src1, *src2; - uint8x16_t _indexMin = vdupq_n_u8(indexMin); - uint8x16_t a[3][3]; - - for (size_t row = 0; row < height; ++row) - { - src0 = src + srcStride*(row - 1); - src1 = src0 + srcStride; - src2 = src1 + srcStride; - if (row == 0) - src0 = src1; - if (row == height - 1) - src2 = src1; - - LoadNose3(src0 + 0, a[0]); - LoadNose3(src1 + 0, a[1]); - LoadNose3(src2 + 0, a[2]); - ContourMetricsMasked(a, mask + 0, _indexMin, dst + 0); - for 
(size_t col = A; col < bodyWidth; col += A) - { - LoadBody3(src0 + col, a[0]); - LoadBody3(src1 + col, a[1]); - LoadBody3(src2 + col, a[2]); - ContourMetricsMasked(a, mask + col, _indexMin, dst + col); - } - LoadTail3(src0 + width - A, a[0]); - LoadTail3(src1 + width - A, a[1]); - LoadTail3(src2 + width - A, a[2]); - ContourMetricsMasked(a, mask + width - A, _indexMin, dst + width - A); - - dst += dstStride; - mask += maskStride; - } - } - - void ContourMetricsMasked(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * mask, size_t maskStride, uint8_t indexMin, uint8_t * dst, size_t dstStride) - { - assert(dstStride % sizeof(int16_t) == 0); - - if (Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride) && Aligned(mask) && Aligned(maskStride)) - ContourMetricsMasked(src, srcStride, width, height, mask, maskStride, indexMin, (int16_t *)dst, dstStride / sizeof(int16_t)); - else - ContourMetricsMasked(src, srcStride, width, height, mask, maskStride, indexMin, (int16_t *)dst, dstStride / sizeof(int16_t)); - } - - template SIMD_INLINE uint16x8_t AnchorComponent(const int16_t * src, size_t step, const int16x8_t & value, const uint16x8_t & mask) - { - int16x8_t last = (int16x8_t)vshrq_n_u16((uint16x8_t)Load(src - step), 1); - int16x8_t next = (int16x8_t)vshrq_n_u16((uint16x8_t)Load(src + step), 1); - return vandq_u16(vandq_u16(vcgeq_s16(value, last), vcgeq_s16(value, next)), mask); - } - - template SIMD_INLINE uint16x8_t Anchor(const int16_t * src, size_t stride, const int16x8_t & threshold) - { - int16x8_t _src = Load(src); - uint16x8_t direction = vandq_u16((uint16x8_t)_src, K16_0001); - int16x8_t magnitude = (int16x8_t)vshrq_n_u16((uint16x8_t)_src, 1); - int16x8_t value = vsubq_s16(magnitude, threshold); - uint16x8_t vertical = AnchorComponent(src, 1, value, vceqq_u16(direction, K16_0001)); - uint16x8_t horizontal = AnchorComponent(src, stride, value, vceqq_u16(direction, K16_0000)); - return vandq_u16(vcgtq_u16((uint16x8_t)magnitude, K16_0000), vandq_u16(vorrq_u16(vertical, horizontal), K16_00FF)); - } - - template SIMD_INLINE void Anchor(const int16_t * src, size_t stride, const int16x8_t & threshold, uint8_t * dst) - { - uint16x8_t lo = Anchor(src, stride, threshold); - uint16x8_t hi = Anchor(src + HA, stride, threshold); - Store(dst, PackU16(lo, hi)); - } - - template void ContourAnchors(const int16_t * src, size_t srcStride, size_t width, size_t height, - size_t step, int16_t threshold, uint8_t * dst, size_t dstStride) - { - assert(width > A); - if (align) - assert(Aligned(src) && Aligned(srcStride, HA) && Aligned(dst) && Aligned(dstStride)); - - size_t bodyWidth = Simd::AlignHi(width, A) - A; - int16x8_t _threshold = vdupq_n_s16(threshold); - memset(dst, 0, width); - memset(dst + dstStride*(height - 1), 0, width); - src += srcStride; - dst += dstStride; - for (size_t row = 1; row < height - 1; row += step) - { - dst[0] = 0; - Anchor(src + 1, srcStride, _threshold, dst + 1); - for (size_t col = A; col < bodyWidth; col += A) - Anchor(src + col, srcStride, _threshold, dst + col); - Anchor(src + width - A - 1, srcStride, _threshold, dst + width - A - 1); - dst[width - 1] = 0; - src += step*srcStride; - dst += step*dstStride; - } - } - - void ContourAnchors(const uint8_t * src, size_t srcStride, size_t width, size_t height, - size_t step, int16_t threshold, uint8_t * dst, size_t dstStride) - { - assert(srcStride % sizeof(int16_t) == 0); - - if (Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride)) - ContourAnchors((const 
int16_t *)src, srcStride / sizeof(int16_t), width, height, step, threshold, dst, dstStride); - else - ContourAnchors((const int16_t *)src, srcStride / sizeof(int16_t), width, height, step, threshold, dst, dstStride); - } - } -#endif// SIMD_NEON_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdNeonSquaredDifferenceSum.cpp b/src/3rd/Simd/Simd/SimdNeonSquaredDifferenceSum.cpp deleted file mode 100644 index e4a829ff..00000000 --- a/src/3rd/Simd/Simd/SimdNeonSquaredDifferenceSum.cpp +++ /dev/null @@ -1,222 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdExtract.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdBase.h" - -namespace Simd -{ -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - SIMD_INLINE uint16x8_t Square(const uint8x8_t & value) - { - return vmull_u8(value, value); - } - - SIMD_INLINE uint32x4_t SquaredDifferenceSum(const uint8x16_t & a, const uint8x16_t & b) - { - uint8x16_t ad = vabdq_u8(a, b); - uint16x8_t lo = Square(vget_low_u8(ad)); - uint16x8_t hi = Square(vget_high_u8(ad)); - return vaddq_u32(vpaddlq_u16(lo), vpaddlq_u16(hi)); - } - - SIMD_INLINE uint32x4_t SquaredDifferenceSumMasked(const uint8x16_t & a, const uint8x16_t & b, const uint8x16_t & mask) - { - uint8x16_t ad = vandq_u8(vabdq_u8(a, b), mask); - uint16x8_t lo = Square(vget_low_u8(ad)); - uint16x8_t hi = Square(vget_high_u8(ad)); - return vaddq_u32(vpaddlq_u16(lo), vpaddlq_u16(hi)); - } - - template void SquaredDifferenceSum( - const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, - size_t width, size_t height, uint64_t * sum) - { - assert(width < 0x10000); - if (align) - assert(Aligned(a) && Aligned(aStride) && Aligned(b) && Aligned(bStride)); - - size_t alignedWidth = Simd::AlignLo(width, A); - uint8x16_t tailMask = ShiftLeft(K8_FF, A - width + alignedWidth); - - uint64x2_t _sum = K64_0000000000000000; - for (size_t row = 0; row < height; ++row) - { - uint32x4_t rowSum = K32_00000000; - for (size_t col = 0; col < alignedWidth; col += A) - { - uint8x16_t _a = Load(a + col); - uint8x16_t _b = Load(b + col); - rowSum = vaddq_u32(rowSum, SquaredDifferenceSum(_a, _b)); - } - if (width - alignedWidth) - { - uint8x16_t _a = Load(a + width - A); - uint8x16_t _b = Load(b + width - A); - rowSum = vaddq_u32(rowSum, SquaredDifferenceSumMasked(_a, _b, tailMask)); - } - _sum = vaddq_u64(_sum, vpaddlq_u32(rowSum)); - a += aStride; - b += bStride; 
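SquaredDifferenceSum above widens as it accumulates: vabdq_u8 produces byte differences, vmull_u8 squares them into 16-bit lanes, vpaddlq_u16 folds pairs into the 32-bit rowSum, and each finished row is flushed into the 64-bit _sum. Because the kernel asserts width < 0x10000 and 65535 * 255 * 255 < 2^32, the per-row 32-bit total cannot wrap. A scalar model of the same overflow reasoning (names are illustrative):

#include <cstddef>
#include <cstdint>

// Why the staging is safe: |a - b| <= 255, so |a - b|^2 <= 65025 fits in
// 16 bits; a 32-bit row accumulator holds any row shorter than 0x10000
// pixels, and the 64-bit total is only touched once per row, exactly as
// rowSum flows into _sum above.
uint64_t SquaredDifferenceSumScalar(const uint8_t* a, size_t aStride,
                                    const uint8_t* b, size_t bStride,
                                    size_t width, size_t height)
{
    uint64_t sum = 0;
    for (size_t row = 0; row < height; ++row)
    {
        uint32_t rowSum = 0;                      // mirrors the uint32x4_t accumulator
        for (size_t col = 0; col < width; ++col)
        {
            uint32_t d = a[col] > b[col] ? a[col] - b[col] : b[col] - a[col];
            rowSum += d * d;                      // <= 65025 per pixel
        }
        sum += rowSum;                            // widen to 64 bits once per row
        a += aStride;
        b += bStride;
    }
    return sum;
}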
- } - *sum = ExtractSum64u(_sum); - } - - void SquaredDifferenceSum(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, size_t width, size_t height, uint64_t * sum) - { - if (Aligned(a) && Aligned(aStride) && Aligned(b) && Aligned(bStride)) - SquaredDifferenceSum(a, aStride, b, bStride, width, height, sum); - else - SquaredDifferenceSum(a, aStride, b, bStride, width, height, sum); - } - - template void SquaredDifferenceSumMasked( - const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride, - const uint8_t *mask, size_t maskStride, uint8_t index, size_t width, size_t height, uint64_t * sum) - { - assert(width < 0x10000); - if (align) - { - assert(Aligned(a) && Aligned(aStride) && Aligned(b) && Aligned(bStride)); - assert(Aligned(mask) && Aligned(maskStride)); - } - - size_t alignedWidth = Simd::AlignLo(width, A); - uint8x16_t tailMask = ShiftLeft(K8_FF, A - width + alignedWidth); - uint8x16_t _index = vdupq_n_u8(index); - uint64x2_t _sum = K64_0000000000000000; - - for (size_t row = 0; row < height; ++row) - { - uint32x4_t rowSum = K32_00000000; - for (size_t col = 0; col < alignedWidth; col += A) - { - uint8x16_t _mask = vceqq_u8(Load(mask + col), _index); - uint8x16_t _a = Load(a + col); - uint8x16_t _b = Load(b + col); - rowSum = vaddq_u32(rowSum, SquaredDifferenceSumMasked(_a, _b, _mask)); - } - if (width - alignedWidth) - { - uint8x16_t _mask = vandq_u8(tailMask, vceqq_u8(Load(mask + width - A), _index)); - uint8x16_t _a = Load(a + width - A); - uint8x16_t _b = Load(b + width - A); - rowSum = vaddq_u32(rowSum, SquaredDifferenceSumMasked(_a, _b, _mask)); - } - _sum = vaddq_u64(_sum, vpaddlq_u32(rowSum)); - a += aStride; - b += bStride; - mask += maskStride; - } - *sum = ExtractSum64u(_sum); - } - - void SquaredDifferenceSumMasked(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, - const uint8_t *mask, size_t maskStride, uint8_t index, size_t width, size_t height, uint64_t * sum) - { - if (Aligned(a) && Aligned(aStride) && Aligned(b) && Aligned(bStride) && Aligned(mask) && Aligned(maskStride)) - SquaredDifferenceSumMasked(a, aStride, b, bStride, mask, maskStride, index, width, height, sum); - else - SquaredDifferenceSumMasked(a, aStride, b, bStride, mask, maskStride, index, width, height, sum); - } - - template SIMD_INLINE void SquaredDifferenceSum32f(const float * a, const float * b, size_t offset, float32x4_t & sum) - { - float32x4_t _a = Load(a + offset); - float32x4_t _b = Load(b + offset); - float32x4_t _d = vsubq_f32(_a, _b); - sum = vmlaq_f32(sum, _d, _d); - } - - template SIMD_INLINE void SquaredDifferenceSum32f(const float * a, const float * b, size_t size, float * sum) - { - if (align) - assert(Aligned(a) && Aligned(b)); - - *sum = 0; - size_t alignedSize = AlignLo(size, 4); - size_t i = 0; - if (alignedSize) - { - float32x4_t sums = vdupq_n_f32(0); - for (; i < alignedSize; i += 4) - SquaredDifferenceSum32f(a, b, i, sums); - *sum += ExtractSum32f(sums); - } - for (; i < size; ++i) - *sum += Simd::Square(a[i] - b[i]); - } - - void SquaredDifferenceSum32f(const float * a, const float * b, size_t size, float * sum) - { - if (Aligned(a) && Aligned(b)) - SquaredDifferenceSum32f(a, b, size, sum); - else - SquaredDifferenceSum32f(a, b, size, sum); - } - - template SIMD_INLINE void SquaredDifferenceKahanSum32f(const float * a, const float * b, size_t offset, float32x4_t & sum, float32x4_t & correction) - { - float32x4_t _a = Load(a + offset); - float32x4_t _b = Load(b + offset); - float32x4_t _d = vsubq_f32(_a, _b); - float32x4_t term = 
vmlaq_f32(correction, _d, _d); - float32x4_t temp = vaddq_f32(sum, term); - correction = vsubq_f32(vmulq_f32(temp, sum), term); - sum = temp; - } - - template SIMD_INLINE void SquaredDifferenceKahanSum32f(const float * a, const float * b, size_t size, float * sum) - { - if (align) - assert(Aligned(a) && Aligned(b)); - - *sum = 0; - size_t alignedSize = AlignLo(size, 4); - size_t i = 0; - if (alignedSize) - { - float32x4_t sums = vdupq_n_f32(0); - float32x4_t corrections = vdupq_n_f32(0); - for (; i < alignedSize; i += 4) - SquaredDifferenceKahanSum32f(a, b, i, sums, corrections); - *sum += ExtractSum32f(sums); - } - for (; i < size; ++i) - *sum += Simd::Square(a[i] - b[i]); - } - - void SquaredDifferenceKahanSum32f(const float * a, const float * b, size_t size, float * sum) - { - if (Aligned(a) && Aligned(b)) - SquaredDifferenceKahanSum32f(a, b, size, sum); - else - SquaredDifferenceKahanSum32f(a, b, size, sum); - } - } -#endif// SIMD_NEON_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdNeonStatistic.cpp b/src/3rd/Simd/Simd/SimdNeonStatistic.cpp deleted file mode 100644 index c8a2b26d..00000000 --- a/src/3rd/Simd/Simd/SimdNeonStatistic.cpp +++ /dev/null @@ -1,485 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2019 Yermalayeu Ihar, -* 2018-2018 Radchenko Andrey. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
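SquaredDifferenceKahanSum32f above carries a per-lane correction term so that long accumulations of small squares lose less precision. Note that the diff text shows vmulq_f32(temp, sum) in the compensation step, where textbook Kahan computes the difference (temp - sum); the multiply reads like transcription damage, so treat the scalar sketch below, which follows the classic recurrence, as the reference form rather than the diff text.

#include <cstddef>

// Classic Kahan (compensated) summation: 'c' captures the rounding error of
// each addition and re-injects it into the next term, so the running sum
// behaves almost as if it were kept at twice the working precision. The NEON
// version applies the same recurrence independently in each float32x4_t lane.
float KahanSum(const float* values, size_t n)
{
    float s = 0.0f, c = 0.0f;
    for (size_t i = 0; i < n; ++i)
    {
        float y = values[i] - c;   // compensated input
        float t = s + y;           // low-order bits of y may be lost here...
        c = (t - s) - y;           // ...but (t - s) - y recovers them
        s = t;
    }
    return s;
}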
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdExtract.h" - -namespace Simd -{ -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - template void GetStatistic(const uint8_t * src, size_t stride, size_t width, size_t height, - uint8_t * min, uint8_t * max, uint8_t * average) - { - assert(width*height && width >= A); - if (align) - assert(Aligned(src) && Aligned(stride)); - - size_t alignedWidth = AlignLo(width, A); - uint8x16_t tailMask = ShiftLeft(K8_FF, A - width + alignedWidth); - size_t blockSize = A << 8; - size_t blockCount = (alignedWidth >> 8) + 1; - uint64x2_t fullSum = K64_0000000000000000; - uint8x16_t _min = K8_FF; - uint8x16_t _max = K8_00; - for (size_t row = 0; row < height; ++row) - { - uint32x4_t rowSum = K32_00000000; - for (size_t block = 0; block < blockCount; ++block) - { - uint16x8_t blockSum = K16_0000; - for (size_t col = block*blockSize, end = Min(col + blockSize, alignedWidth); col < end; col += A) - { - const uint8x16_t _src = Load(src + col); - _min = vminq_u8(_min, _src); - _max = vmaxq_u8(_max, _src); - blockSum = vaddq_u16(blockSum, vpaddlq_u8(_src)); - } - rowSum = vaddq_u32(rowSum, vpaddlq_u16(blockSum)); - } - if (width - alignedWidth) - { - const uint8x16_t _src = Load(src + width - A); - _min = vminq_u8(_min, _src); - _max = vmaxq_u8(_max, _src); - rowSum = vaddq_u32(rowSum, vpaddlq_u16(vpaddlq_u8(vandq_u8(_src, tailMask)))); - } - fullSum = vaddq_u64(fullSum, vpaddlq_u32(rowSum)); - src += stride; - } - - uint8_t min_buffer[A], max_buffer[A]; - Store(min_buffer, _min); - Store(max_buffer, _max); - *min = UCHAR_MAX; - *max = 0; - for (size_t i = 0; i < A; ++i) - { - *min = Base::MinU8(min_buffer[i], *min); - *max = Base::MaxU8(max_buffer[i], *max); - } - *average = (uint8_t)((ExtractSum64u(fullSum) + width*height / 2) / (width*height)); - } - - void GetStatistic(const uint8_t * src, size_t stride, size_t width, size_t height, - uint8_t * min, uint8_t * max, uint8_t * average) - { - if (Aligned(src) && Aligned(stride)) - GetStatistic(src, stride, width, height, min, max, average); - else - GetStatistic(src, stride, width, height, min, max, average); - } - - template void GetRowSums(const uint8_t * src, size_t stride, size_t width, size_t height, uint32_t * sums) - { - size_t alignedWidth = AlignLo(width, A); - const uint8x16_t tailMask = ShiftLeft(K8_FF, A - width + alignedWidth); - size_t blockSize = A << 8; - size_t blockCount = (alignedWidth >> 8) + 1; - - memset(sums, 0, sizeof(uint32_t)*height); - for (size_t row = 0; row < height; ++row) - { - uint32x4_t rowSum = K32_00000000; - for (size_t block = 0; block < blockCount; ++block) - { - uint16x8_t blockSum = K16_0000; - for (size_t col = block*blockSize, end = Min(col + blockSize, alignedWidth); col < end; col += A) - { - const uint8x16_t _src = Load(src + col); - blockSum = vaddq_u16(blockSum, vpaddlq_u8(_src)); - } - rowSum = vaddq_u32(rowSum, vpaddlq_u16(blockSum)); - } - if (alignedWidth != width) - { - const uint8x16_t _src = vandq_u8(Load(src + width - A), tailMask); - rowSum = vaddq_u32(rowSum, vpaddlq_u16(vpaddlq_u8(_src))); - } - sums[row] = ExtractSum32u(rowSum); - src += stride; - } - } - - void GetRowSums(const uint8_t * src, size_t stride, size_t width, size_t height, uint32_t * sums) - { - if (Aligned(src) && Aligned(stride)) - GetRowSums(src, stride, width, height, sums); - else - GetRowSums(src, stride, width, height, sums); - } - - namespace - { - struct Buffer - { - Buffer(size_t width) - { - _p = Allocate(sizeof(uint16_t)*width + 
sizeof(uint32_t)*width); - sums16 = (uint16_t*)_p; - sums32 = (uint32_t*)(sums16 + width); - } - - ~Buffer() - { - Free(_p); - } - - uint16_t * sums16; - uint32_t * sums32; - private: - void *_p; - }; - } - - template SIMD_INLINE void Sum16(const uint8x16_t & src, uint16_t * dst) - { - Store(dst + 0, vaddq_u16(Load(dst + 0), UnpackU8<0>(src))); - Store(dst + 8, vaddq_u16(Load(dst + 8), UnpackU8<1>(src))); - } - - template SIMD_INLINE void Sum32(const uint16x8_t & src, uint32_t * dst) - { - Store(dst + 0, vaddq_u32(Load(dst + 0), UnpackU16<0>(src))); - Store(dst + 4, vaddq_u32(Load(dst + 4), UnpackU16<1>(src))); - } - - template void GetColSums(const uint8_t * src, size_t stride, size_t width, size_t height, uint32_t * sums) - { - size_t alignedLoWidth = AlignLo(width, A); - size_t alignedHiWidth = AlignHi(width, A); - const uint8x16_t tailMask = ShiftLeft(K8_FF, A - width + alignedLoWidth); - size_t stepSize = SCHAR_MAX + 1; - size_t stepCount = (height + SCHAR_MAX) / stepSize; - - Buffer buffer(alignedHiWidth); - memset(buffer.sums32, 0, sizeof(uint32_t)*alignedHiWidth); - for (size_t step = 0; step < stepCount; ++step) - { - size_t rowStart = step*stepSize; - size_t rowEnd = Min(rowStart + stepSize, height); - - memset(buffer.sums16, 0, sizeof(uint16_t)*width); - for (size_t row = rowStart; row < rowEnd; ++row) - { - for (size_t col = 0; col < alignedLoWidth; col += A) - { - const uint8x16_t _src = Load(src + col); - Sum16(_src, buffer.sums16 + col); - } - if (alignedLoWidth != width) - { - const uint8x16_t _src = vandq_u8(Load(src + width - A), tailMask); - Sum16(_src, buffer.sums16 + width - A); - } - src += stride; - } - - for (size_t col = 0; col < alignedHiWidth; col += HA) - Sum32(Load(buffer.sums16 + col), buffer.sums32 + col); - } - memcpy(sums, buffer.sums32, sizeof(uint32_t)*width); - } - - void GetColSums(const uint8_t * src, size_t stride, size_t width, size_t height, uint32_t * sums) - { - if (Aligned(src) && Aligned(stride)) - GetColSums(src, stride, width, height, sums); - else - GetColSums(src, stride, width, height, sums); - } - - template void GetAbsDyRowSums(const uint8_t * src, size_t stride, size_t width, size_t height, uint32_t * sums) - { - size_t alignedWidth = AlignLo(width, A); - const uint8x16_t tailMask = ShiftLeft(K8_FF, A - width + alignedWidth); - size_t blockSize = A << 8; - size_t blockCount = (alignedWidth >> 8) + 1; - - memset(sums, 0, sizeof(uint32_t)*height); - const uint8_t * src0 = src; - const uint8_t * src1 = src + stride; - height--; - for (size_t row = 0; row < height; ++row) - { - uint32x4_t rowSum = K32_00000000; - for (size_t block = 0; block < blockCount; ++block) - { - uint16x8_t blockSum = K16_0000; - for (size_t col = block*blockSize, end = Min(col + blockSize, alignedWidth); col < end; col += A) - { - const uint8x16_t _src0 = Load(src0 + col); - const uint8x16_t _src1 = Load(src1 + col); - blockSum = vaddq_u16(blockSum, vpaddlq_u8(vabdq_u8(_src0, _src1))); - } - rowSum = vaddq_u32(rowSum, vpaddlq_u16(blockSum)); - } - if (alignedWidth != width) - { - const uint8x16_t _src0 = Load(src0 + width - A); - const uint8x16_t _src1 = Load(src1 + width - A); - rowSum = vaddq_u32(rowSum, vpaddlq_u16(vpaddlq_u8(vandq_u8(vabdq_u8(_src0, _src1), tailMask)))); - } - sums[row] = ExtractSum32u(rowSum); - src0 += stride; - src1 += stride; - } - } - - void GetAbsDyRowSums(const uint8_t * src, size_t stride, size_t width, size_t height, uint32_t * sums) - { - if (Aligned(src) && Aligned(stride)) - GetAbsDyRowSums(src, stride, width, height, sums); - else - 
GetAbsDyRowSums(src, stride, width, height, sums); - } - - template void GetAbsDxColSums(const uint8_t * src, size_t stride, size_t width, size_t height, uint32_t * sums) - { - width--; - size_t alignedLoWidth = AlignLo(width, A); - size_t alignedHiWidth = AlignHi(width, A); - const uint8x16_t tailMask = ShiftLeft(K8_FF, A - width + alignedLoWidth); - size_t stepSize = SCHAR_MAX + 1; - size_t stepCount = (height + SCHAR_MAX) / stepSize; - - Buffer buffer(alignedHiWidth); - memset(buffer.sums32, 0, sizeof(uint32_t)*alignedHiWidth); - for (size_t step = 0; step < stepCount; ++step) - { - size_t rowStart = step*stepSize; - size_t rowEnd = Min(rowStart + stepSize, height); - - memset(buffer.sums16, 0, sizeof(uint16_t)*width); - for (size_t row = rowStart; row < rowEnd; ++row) - { - for (size_t col = 0; col < alignedLoWidth; col += A) - { - const uint8x16_t _src0 = Load(src + col + 0); - const uint8x16_t _src1 = Load(src + col + 1); - Sum16(vabdq_u8(_src0, _src1), buffer.sums16 + col); - } - if (alignedLoWidth != width) - { - const uint8x16_t _src0 = Load(src + width - A + 0); - const uint8x16_t _src1 = Load(src + width - A + 1); - Sum16(vandq_u8(vabdq_u8(_src0, _src1), tailMask), buffer.sums16 + width - A); - } - src += stride; - } - - for (size_t col = 0; col < alignedHiWidth; col += HA) - Sum32(Load(buffer.sums16 + col), buffer.sums32 + col); - } - memcpy(sums, buffer.sums32, sizeof(uint32_t)*width); - sums[width] = 0; - } - - void GetAbsDxColSums(const uint8_t * src, size_t stride, size_t width, size_t height, uint32_t * sums) - { - if (Aligned(src) && Aligned(stride)) - GetAbsDxColSums(src, stride, width, height, sums); - else - GetAbsDxColSums(src, stride, width, height, sums); - } - - template void ValueSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum) - { - assert(width >= A); - if (align) - assert(Aligned(src) && Aligned(stride)); - - size_t alignedWidth = AlignLo(width, A); - uint8x16_t tailMask = ShiftLeft(K8_FF, A - width + alignedWidth); - size_t blockSize = A << 8; - size_t blockCount = (alignedWidth >> 8) + 1; - uint64x2_t fullSum = K64_0000000000000000; - for (size_t row = 0; row < height; ++row) - { - uint32x4_t rowSum = K32_00000000; - for (size_t block = 0; block < blockCount; ++block) - { - uint16x8_t blockSum = K16_0000; - for (size_t col = block*blockSize, end = Min(col + blockSize, alignedWidth); col < end; col += A) - { - const uint8x16_t _src = Load(src + col); - blockSum = vaddq_u16(blockSum, vpaddlq_u8(_src)); - } - rowSum = vaddq_u32(rowSum, vpaddlq_u16(blockSum)); - } - if (width - alignedWidth) - { - const uint8x16_t _src = vandq_u8(Load(src + width - A), tailMask); - rowSum = vaddq_u32(rowSum, vpaddlq_u16(vpaddlq_u8(_src))); - } - fullSum = vaddq_u64(fullSum, vpaddlq_u32(rowSum)); - src += stride; - } - *sum = ExtractSum64u(fullSum); - } - - void ValueSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum) - { - if (Aligned(src) && Aligned(stride)) - ValueSum(src, stride, width, height, sum); - else - ValueSum(src, stride, width, height, sum); - } - - SIMD_INLINE uint16x8_t Square(uint8x8_t value) - { - return vmull_u8(value, value); - } - - SIMD_INLINE uint32x4_t Square(uint8x16_t value) - { - uint16x8_t lo = Square(vget_low_u8(value)); - uint16x8_t hi = Square(vget_high_u8(value)); - return vaddq_u32(vpaddlq_u16(lo), vpaddlq_u16(hi)); - } - - template void SquareSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum) - { - assert(width >= A); - if (align) - 
assert(Aligned(src) && Aligned(stride)); - - size_t alignedWidth = Simd::AlignLo(width, A); - uint8x16_t tailMask = ShiftLeft(K8_FF, A - width + alignedWidth); - - uint64x2_t fullSum = K64_0000000000000000; - for (size_t row = 0; row < height; ++row) - { - uint32x4_t rowSum = K32_00000000; - for (size_t col = 0; col < alignedWidth; col += A) - rowSum = vaddq_u32(rowSum, Square(Load(src + col))); - if (alignedWidth != width) - rowSum = vaddq_u32(rowSum, Square(vandq_u8(Load(src + width - A), tailMask))); - fullSum = vaddq_u64(fullSum, vpaddlq_u32(rowSum)); - src += stride; - } - *sum = ExtractSum64u(fullSum); - } - - void SquareSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum) - { - if (Aligned(src) && Aligned(stride)) - SquareSum(src, stride, width, height, sum); - else - SquareSum(src, stride, width, height, sum); - } - - template void ValueSquareSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * valueSum, uint64_t * squareSum) - { - assert(width >= A); - if (align) - assert(Aligned(src) && Aligned(stride)); - - size_t alignedWidth = Simd::AlignLo(width, A); - uint8x16_t tailMask = ShiftLeft(K8_FF, A - width + alignedWidth); - uint64x2_t fullValueSum = K64_0000000000000000; - uint64x2_t fullSquareSum = K64_0000000000000000; - for (size_t row = 0; row < height; ++row) - { - uint32x4_t rowValueSum = K32_00000000; - uint32x4_t rowSquareSum = K32_00000000; - for (size_t col = 0; col < alignedWidth; col += A) - { - uint8x16_t _src = Load(src + col); - rowValueSum = vpadalq_u16(rowValueSum, vpaddlq_u8(_src)); - rowSquareSum = vaddq_u32(rowSquareSum, Square(_src)); - } - if (alignedWidth != width) - { - uint8x16_t _src = vandq_u8(Load(src + width - A), tailMask); - rowValueSum = vpadalq_u16(rowValueSum, vpaddlq_u8(_src)); - rowSquareSum = vaddq_u32(rowSquareSum, Square(_src)); - } - fullValueSum = vaddq_u64(fullValueSum, vpaddlq_u32(rowValueSum)); - fullSquareSum = vaddq_u64(fullSquareSum, vpaddlq_u32(rowSquareSum)); - src += stride; - } - *valueSum = ExtractSum64u(fullValueSum); - *squareSum = ExtractSum64u(fullSquareSum); - } - - void ValueSquareSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * valueSum, uint64_t * squareSum) - { - if (Aligned(src) && Aligned(stride)) - ValueSquareSum(src, stride, width, height, valueSum, squareSum); - else - ValueSquareSum(src, stride, width, height, valueSum, squareSum); - } - - SIMD_INLINE uint32x4_t Correlation(const uint8x16_t & a, const uint8x16_t & b) - { - uint16x8_t lo = vmull_u8(Half<0>(a), Half<0>(b)); - uint16x8_t hi = vmull_u8(Half<1>(a), Half<1>(b)); - return vaddq_u32(vpaddlq_u16(lo), vpaddlq_u16(hi)); - } - - template void CorrelationSum(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, size_t width, size_t height, uint64_t * sum) - { - assert(width >= A); - if (align) - assert(Aligned(a) && Aligned(aStride) && Aligned(b) && Aligned(bStride)); - - size_t alignedWidth = Simd::AlignLo(width, A); - uint8x16_t tailMask = ShiftLeft(K8_FF, A - width + alignedWidth); - - uint64x2_t fullSum = K64_0000000000000000; - for (size_t row = 0; row < height; ++row) - { - uint32x4_t rowSum = K32_00000000; - for (size_t col = 0; col < alignedWidth; col += A) - { - uint8x16_t _a = Load(a + col); - uint8x16_t _b = Load(b + col); - rowSum = vaddq_u32(rowSum, Correlation(_a, _b)); - } - if (alignedWidth != width) - { - uint8x16_t _a = vandq_u8(Load(a + width - A), tailMask); - uint8x16_t _b = vandq_u8(Load(b + width - A), tailMask); - rowSum = 
vaddq_u32(rowSum, Correlation(_a, _b)); - } - fullSum = vaddq_u64(fullSum, vpaddlq_u32(rowSum)); - a += aStride; - b += bStride; - } - *sum = ExtractSum64u(fullSum); - } - - void CorrelationSum(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, size_t width, size_t height, uint64_t * sum) - { - if (Aligned(a) && Aligned(aStride) && Aligned(b) && Aligned(bStride)) - CorrelationSum(a, aStride, b, bStride, width, height, sum); - else - CorrelationSum(a, aStride, b, bStride, width, height, sum); - } - } -#endif// SIMD_NEON_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdNeonStatisticMoments.cpp b/src/3rd/Simd/Simd/SimdNeonStatisticMoments.cpp deleted file mode 100644 index 28db3087..00000000 --- a/src/3rd/Simd/Simd/SimdNeonStatisticMoments.cpp +++ /dev/null @@ -1,202 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2019 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
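The byte-statistics kernels above (GetStatistic, GetRowSums, ValueSum and friends) share a three-level accumulation: a 16-bit blockSum absorbs vpaddlq_u8 results for one block of blockSize = A << 8 bytes, is flushed into the 32-bit rowSum after each block, and rows are folded into the 64-bit total. The block length is a tuning constant that has to keep the 16-bit lanes from wrapping (each vpaddlq_u8 adds at most 510 per lane). A scalar sketch of the layering, with the narrow accumulator widened for simplicity:

#include <algorithm>
#include <cstddef>
#include <cstdint>

// Three-level accumulation as in ValueSum above: sum a bounded block of
// bytes into a narrow accumulator, flush it into a wider per-row one, and
// only touch the 64-bit total once per row. Here blockSum is a uint32_t
// stand-in for the uint16x8_t lanes; a genuine 16-bit accumulator would
// bound blockSize further.
uint64_t ValueSumScalar(const uint8_t* src, size_t stride, size_t width, size_t height)
{
    const size_t blockSize = 4096;                 // mirrors A << 8 in the NEON code
    uint64_t fullSum = 0;
    for (size_t row = 0; row < height; ++row)
    {
        uint32_t rowSum = 0;
        for (size_t start = 0; start < width; start += blockSize)
        {
            size_t end = std::min(start + blockSize, width);
            uint32_t blockSum = 0;
            for (size_t col = start; col < end; ++col)
                blockSum += src[col];
            rowSum += blockSum;                    // flush block into the row total
        }
        fullSum += rowSum;                         // flush row into the 64-bit total
        src += stride;
    }
    return fullSum;
}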
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdExtract.h" -#include "Simd/SimdBase.h" - -namespace Simd -{ -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - SIMD_INLINE void GetObjectMoments(uint16x4_t src, uint16x4_t col, uint32x4_t & sx, uint32x4_t & sxx) - { - sx = vmlal_u16(sx, src, col); - sxx = vmlal_u16(sxx, src, vmul_u16(col, col)); - } - - SIMD_INLINE void GetObjectMoments(uint16x8_t src, uint16x8_t col, uint16x8_t & s, uint32x4_t& sx, uint32x4_t& sxx) - { - s = vaddq_u16(s, src); - GetObjectMoments(Half<0>(src), Half<0>(col), sx, sxx); - GetObjectMoments(Half<1>(src), Half<1>(col), sx, sxx); - } - - SIMD_INLINE void GetObjectMoments(uint8x16_t src, uint8x16_t mask, uint16x8_t & col, uint8x16_t & n, uint16x8_t & s, uint32x4_t & sx, uint32x4_t & sxx) - { - src = vandq_u8(src, mask); - n = vaddq_u8(n, vandq_u8(K8_01, mask)); - GetObjectMoments(UnpackU8<0>(src), col, s, sx, sxx); - col = vaddq_u16(col, K16_0008); - GetObjectMoments(UnpackU8<1>(src), col, s, sx, sxx); - col = vaddq_u16(col, K16_0008); - } - - template void GetObjectMoments(const uint8_t* src, size_t srcStride, size_t width, size_t height, const uint8_t * mask, size_t maskStride, uint8_t index, - uint64x2_t & n, uint64x2_t & s, uint64x2_t & sx, uint64x2_t & sy, uint64x2_t & sxx, uint64x2_t& sxy, uint64x2_t& syy) - { - size_t widthA = AlignLo(width, A); - const size_t B = AlignLo(181, A); - size_t widthB = AlignLoAny(width, B); - uint8x16_t tailMask = ShiftLeft(K8_FF, A - width + widthA); - - const uint16x8_t K16_I = SIMD_VEC_SETR_EPI16(0, 1, 2, 3, 4, 5, 6, 7); - const uint8x16_t _index = vdupq_n_u8(index); - const uint16x8_t tailCol = vaddq_u16(K16_I, vdupq_n_u16((uint16_t)(width - A - widthB))); - - for (size_t row = 0; row < height; ++row) - { - for (size_t colB = 0; colB < width;) - { - size_t colE = Simd::Min(colB + B, widthA); - uint16x8_t _col = K16_I; - uint8x16_t _n8 = K8_00; - uint16x8_t _s16 = K16_0000; - uint32x4_t _sx32 = K32_00000000; - uint32x4_t _sxx32 = K32_00000000; - if (mask == NULL) - { - for (size_t col = colB; col < colE; col += A) - { - uint8x16_t _src = Load(src + col); - GetObjectMoments(_src, K8_FF, _col, _n8, _s16, _sx32, _sxx32); - } - if (colB == widthB && widthA < width) - { - uint8x16_t _src = Load(src + width - A); - _col = tailCol; - GetObjectMoments(_src, tailMask, _col, _n8, _s16, _sx32, _sxx32); - colE = width; - } - } - else if (src == NULL) - { - for (size_t col = colB; col < colE; col += A) - { - uint8x16_t _mask = vceqq_u8(Load(mask + col), _index); - GetObjectMoments(K8_01, _mask, _col, _n8, _s16, _sx32, _sxx32); - } - if (colB == widthB && widthA < width) - { - uint8x16_t _mask = vandq_u8(vceqq_u8(Load(mask + width - A), _index), tailMask); - _col = tailCol; - GetObjectMoments(K8_01, _mask, _col, _n8, _s16, _sx32, _sxx32); - colE = width; - } - } - else - { - for (size_t col = colB; col < colE; col += A) - { - uint8x16_t _src = Load(src + col); - uint8x16_t _mask = vceqq_u8(Load(mask + col), _index); - GetObjectMoments(_src, _mask, _col, _n8, _s16, _sx32, _sxx32); - } - if (colB == widthB && widthA < width) - { - uint8x16_t _mask = vandq_u8(vceqq_u8(Load(mask + width - A), _index), tailMask); - uint8x16_t _src = Load(src + width - A); - _col = tailCol; - GetObjectMoments(_src, _mask, _col, _n8, _s16, _sx32, _sxx32); - colE = width; - } - } - uint32x2_t _s = vmovn_u64(vpaddlq_u32(vpaddlq_u16(_s16))); - uint32x2_t _sx = vpadd_u32(Half<0>(_sx32), Half<1>(_sx32)); - uint32x2_t _sxx = vpadd_u32(Half<0>(_sxx32), Half<1>(_sxx32)); - uint32x2_t _y = 
vdup_n_u32((uint32_t)row); - uint32x2_t _x = vdup_n_u32((uint32_t)colB); - - n = vaddq_u64(n, vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(_n8)))); - - s = vaddq_u64(s, vpaddlq_u32(vpaddlq_u16(_s16))); - - sx = vaddw_u32(sx, _sx); - sx = vmlal_u32(sx, _s, _x); - - sy = vmlal_u32(sy, _s, _y); - - sxx = vaddw_u32(sxx, _sxx); - sxx = vmlal_u32(sxx, _sx, vadd_u32(_x, _x)); - sxx = vmlal_u32(sxx, _s, vmul_u32(_x, _x)); - - sxy = vmlal_u32(sxy, _sx, _y); - sxy = vmlal_u32(sxy, _s, vmul_u32(_x, _y)); - - syy = vmlal_u32(syy, _s, vmul_u32(_y, _y)); - - colB = colE; - } - if(src) - src += srcStride; - if(mask) - mask += maskStride; - } - } - - template void GetObjectMoments(const uint8_t* src, size_t srcStride, size_t width, size_t height, const uint8_t* mask, size_t maskStride, uint8_t index, - uint64_t* n, uint64_t* s, uint64_t* sx, uint64_t* sy, uint64_t* sxx, uint64_t* sxy, uint64_t* syy) - { - assert(width >= A && (src || mask)); - if (align) - assert((src == NULL || (Aligned(src) && Aligned(srcStride))) && (mask == NULL || (Aligned(mask) && Aligned(maskStride)))); - - uint64x2_t _n = vdupq_n_u64(0); - uint64x2_t _s = vdupq_n_u64(0); - uint64x2_t _sx = vdupq_n_u64(0); - uint64x2_t _sy = vdupq_n_u64(0); - uint64x2_t _sxx = vdupq_n_u64(0); - uint64x2_t _sxy = vdupq_n_u64(0); - uint64x2_t _syy = vdupq_n_u64(0); - - GetObjectMoments(src, srcStride, width, height, mask, maskStride, index, _n, _s, _sx, _sy, _sxx, _sxy, _syy); - - *n = ExtractSum64u(_n); - *s = ExtractSum64u(_s); - *sx = ExtractSum64u(_sx); - *sy = ExtractSum64u(_sy); - *sxx = ExtractSum64u(_sxx); - *sxy = ExtractSum64u(_sxy); - *syy = ExtractSum64u(_syy); - } - - void GetObjectMoments(const uint8_t* src, size_t srcStride, size_t width, size_t height, const uint8_t* mask, size_t maskStride, uint8_t index, - uint64_t* n, uint64_t* s, uint64_t* sx, uint64_t* sy, uint64_t* sxx, uint64_t* sxy, uint64_t* syy) - { - if ((src == NULL || (Aligned(src) && Aligned(srcStride))) && (mask == NULL || (Aligned(mask) && Aligned(maskStride)))) - GetObjectMoments(src, srcStride, width, height, mask, maskStride, index, n, s, sx, sy, sxx, sxy, syy); - else - GetObjectMoments(src, srcStride, width, height, mask, maskStride, index, n, s, sx, sy, sxx, sxy, syy); - } - - void GetMoments(const uint8_t* mask, size_t stride, size_t width, size_t height, uint8_t index, - uint64_t* area, uint64_t* x, uint64_t* y, uint64_t* xx, uint64_t* xy, uint64_t* yy) - { - uint64_t stub; - GetObjectMoments(NULL, 0, width, height, mask, stride, index, &stub, area, x, y, xx, xy, yy); - } - } -#endif// SIMD_NEON_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdNeonStretchGray2x2.cpp b/src/3rd/Simd/Simd/SimdNeonStretchGray2x2.cpp deleted file mode 100644 index e58f565f..00000000 --- a/src/3rd/Simd/Simd/SimdNeonStretchGray2x2.cpp +++ /dev/null @@ -1,74 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. 
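GetObjectMoments above accumulates column indices block-locally and then rebases them to global coordinates: with block sums s = sum(v), sx = sum(x*v), sxx = sum(x*x*v) and block origin x0 (colB in the code), sum((x0+x)*v) = sx + x0*s and sum((x0+x)^2*v) = sxx + 2*x0*sx + x0^2*s, which is exactly the vmlal_u32 chain executed after each block. A tiny self-check of that identity:

#include <cassert>
#include <cstdint>

// Verifies the rebasing identity used by GetObjectMoments: moments taken
// with block-local x can be shifted to global coordinates using only the
// block sums s, sx and sxx.
int main()
{
    const uint64_t v[4] = { 3, 0, 7, 2 };     // arbitrary sample values
    const uint64_t x0 = 100;                  // block origin

    uint64_t s = 0, sx = 0, sxx = 0;          // block-local sums
    uint64_t gx = 0, gxx = 0;                 // directly computed global sums
    for (uint64_t x = 0; x < 4; ++x)
    {
        s += v[x];
        sx += x * v[x];
        sxx += x * x * v[x];
        gx += (x0 + x) * v[x];
        gxx += (x0 + x) * (x0 + x) * v[x];
    }
    assert(gx == sx + x0 * s);
    assert(gxx == sxx + 2 * x0 * sx + x0 * x0 * s);
    return 0;
}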
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" - -namespace Simd -{ -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - template SIMD_INLINE void StretchGray2x2(const uint8_t * src, uint8_t * dst, size_t stride) - { - uint8x16x2_t _src; - _src.val[0] = Load(src); - _src.val[1] = _src.val[0]; - Store2(dst, _src); - Store2(dst + stride, _src); - } - - template void StretchGray2x2( - const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride) - { - assert(srcWidth * 2 == dstWidth && srcHeight * 2 == dstHeight && srcWidth >= A); - if (align) - { - assert(Aligned(src) && Aligned(srcStride)); - assert(Aligned(dst) && Aligned(dstStride)); - } - - size_t alignedWidth = AlignLo(srcWidth, A); - for (size_t row = 0; row < srcHeight; ++row) - { - for (size_t srcCol = 0, dstCol = 0; srcCol < alignedWidth; srcCol += A, dstCol += DA) - StretchGray2x2(src + srcCol, dst + dstCol, dstStride); - if (alignedWidth != srcWidth) - StretchGray2x2(src + srcWidth - A, dst + dstWidth - DA, dstStride); - src += srcStride; - dst += 2 * dstStride; - } - } - - void StretchGray2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride)) - StretchGray2x2(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); - else - StretchGray2x2(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); - } - } -#endif// SIMD_NEON_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdNeonSvm.cpp b/src/3rd/Simd/Simd/SimdNeonSvm.cpp deleted file mode 100644 index d514b70a..00000000 --- a/src/3rd/Simd/Simd/SimdNeonSvm.cpp +++ /dev/null @@ -1,91 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
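StretchGray2x2 above doubles an image in both directions by loading one vector, pairing it with itself in a uint8x16x2_t, and storing the pair interleaved (Store2, i.e. vst2q_u8) so that every byte is duplicated horizontally; writing the same pair to two consecutive rows handles the vertical duplication. A scalar sketch of the same nearest-neighbor 2x2 upscale:

#include <cstddef>
#include <cstdint>

// Nearest-neighbor 2x2 upscale: every source pixel becomes a 2x2 block in
// the destination. The NEON version does the horizontal duplication with an
// interleaved store and the vertical one by storing the pair to two rows.
void StretchGray2x2Scalar(const uint8_t* src, size_t srcWidth, size_t srcHeight,
                          size_t srcStride, uint8_t* dst, size_t dstStride)
{
    for (size_t row = 0; row < srcHeight; ++row)
    {
        uint8_t* d0 = dst + 2 * row * dstStride;
        uint8_t* d1 = d0 + dstStride;
        for (size_t col = 0; col < srcWidth; ++col)
        {
            uint8_t p = src[row * srcStride + col];
            d0[2 * col + 0] = p;
            d0[2 * col + 1] = p;
            d1[2 * col + 0] = p;
            d1[2 * col + 1] = p;
        }
    }
}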
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdExtract.h" - -namespace Simd -{ -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - namespace - { - struct Buffer - { - Buffer(size_t count) - { - size_t size = sizeof(float)*count; - _p = Allocate(size); - memset(_p, 0, size); - sums = (float*)_p; - } - - ~Buffer() - { - Free(_p); - } - - float * sums; - private: - void *_p; - }; - } - - void SvmSumLinear(const float * x, const float * svs, const float * weights, size_t length, size_t count, float * sum) - { - Buffer buffer(count); - size_t alignedCount = AlignLo(count, 4); - - for (size_t j = 0; j < length; ++j) - { - size_t i = 0; - float v = x[j]; - float32x4_t _v = vdupq_n_f32(v); - for (; i < alignedCount; i += 4) - { - float32x4_t sums = Load(buffer.sums + i); - float32x4_t _svs = Load(svs + i); - Store(buffer.sums + i, vaddq_f32(sums, vmulq_f32(_v, _svs))); - } - for (; i < count; ++i) - buffer.sums[i] += v*svs[i]; - svs += count; - } - - size_t i = 0; - float32x4_t _sum = vdupq_n_f32(0); - for (; i < alignedCount; i += 4) - { - float32x4_t sums = Load(buffer.sums + i); - float32x4_t _weights = Load(weights + i); - _sum = vaddq_f32(_sum, vmulq_f32(sums, _weights)); - } - *sum = ExtractSum32f(_sum); - for (; i < count; ++i) - *sum += buffer.sums[i] * weights[i]; - } - } -#endif// SIMD_NEON_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdNeonSynet.cpp b/src/3rd/Simd/Simd/SimdNeonSynet.cpp deleted file mode 100644 index b2e7e7f0..00000000 --- a/src/3rd/Simd/Simd/SimdNeonSynet.cpp +++ /dev/null @@ -1,1129 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
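SvmSumLinear above computes the linear SVM decision sum in a cache-friendly order: rather than one strided dot product per support vector, it walks each feature index j once, broadcasts x[j], and accumulates x[j] * svs[j][i] into a per-vector sums buffer with unit-stride writes, finishing with a single dot product against the weights. Equivalent scalar form:

#include <cstddef>
#include <vector>

// Computes sum = sum_i weights[i] * sum_j x[j] * svs[j*count + i], where the
// support vectors are stored feature-major (row j holds the j-th feature of
// every vector), so the inner loop is a unit-stride AXPY — the layout the
// NEON kernel exploits with one broadcast of x[j] per row.
float SvmSumLinearScalar(const float* x, const float* svs, const float* weights,
                         size_t length, size_t count)
{
    std::vector<float> sums(count, 0.0f);
    for (size_t j = 0; j < length; ++j, svs += count)
        for (size_t i = 0; i < count; ++i)
            sums[i] += x[j] * svs[i];          // one pass over x, sequential writes
    float sum = 0.0f;
    for (size_t i = 0; i < count; ++i)
        sum += sums[i] * weights[i];
    return sum;
}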
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdSynet.h" -#include "Simd/SimdNeon.h" -#include "Simd/SimdBase.h" -#include "Simd/SimdExtract.h" -#include "Simd/SimdPow.h" -#include "Simd/SimdExp.h" - -namespace Simd -{ -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - template SIMD_INLINE void SynetAddBias(const float * bias, float * dst) - { - Store(dst, vaddq_f32(Load(dst), Load(bias))); - } - - template SIMD_INLINE void SynetAddBias(float32x4_t bias, float * dst) - { - Store(dst, vaddq_f32(Load(dst), bias)); - } - - template void SynetAddBiasNchw(const float * bias, size_t channels, size_t spatial, float * dst) - { - if (align) - assert(Aligned(spatial, F) && Aligned(dst)); - - size_t aligned = AlignLo(spatial, QF); - size_t partial = AlignLo(spatial, F); - for (size_t c = 0; c < channels; ++c) - { - size_t s = 0; - if (partial) - { - float32x4_t _bias = vdupq_n_f32(bias[c]); - for (; s < aligned; s += QF) - { - SynetAddBias(_bias, dst + s + F * 0); - SynetAddBias(_bias, dst + s + F * 1); - SynetAddBias(_bias, dst + s + F * 2); - SynetAddBias(_bias, dst + s + F * 3); - } - for (; s < partial; s += F) - SynetAddBias(_bias, dst + s); - } - for (; s < spatial; ++s) - dst[s] += bias[c]; - dst += spatial; - } - } - - SIMD_INLINE void SynetAddBiasNchw(const float * bias, size_t channels, size_t spatial, float * dst) - { - if (Aligned(spatial, F) && Aligned(dst)) - SynetAddBiasNchw(bias, channels, spatial, dst); - else - SynetAddBiasNchw(bias, channels, spatial, dst); - } - - template void SynetAddBiasNhwc(const float * bias, size_t channels, size_t spatial, float * dst) - { - if (align) - assert(Aligned(channels, F) && Aligned(bias) && Aligned(dst)); - - size_t aligned = AlignLo(channels, QF); - size_t partial = AlignLo(channels, F); - for (size_t s = 0; s < spatial; ++s) - { - size_t c = 0; - if (partial) - { - for (; c < aligned; c += QF) - { - SynetAddBias(bias + c + F * 0, dst + c + F * 0); - SynetAddBias(bias + c + F * 1, dst + c + F * 1); - SynetAddBias(bias + c + F * 2, dst + c + F * 2); - SynetAddBias(bias + c + F * 3, dst + c + F * 3); - } - for (; c < partial; c += F) - SynetAddBias(bias + c, dst + c); - } - for (; c < channels; ++c) - dst[c] += bias[c]; - dst += channels; - } - } - - SIMD_INLINE void SynetAddBiasNhwc(const float * bias, size_t channels, size_t spatial, float * dst) - { - if (Aligned(bias) && Aligned(channels, F) && Aligned(dst)) - SynetAddBiasNhwc(bias, channels, spatial, dst); - else - SynetAddBiasNhwc(bias, channels, spatial, dst); - } - - template void SynetAddBiasNchw4c(const float * bias, size_t channels, size_t spatial, float * dst) - { - if (align) - assert(Aligned(dst)); - - size_t spatial4 = AlignLo(spatial, 4); - for (size_t c = 0; c < channels; c += F) - { - float32x4_t _bias = Load(bias + c); - size_t s = 0; - for (; s < spatial4; s += 4, dst += 4 * F) - { - SynetAddBias(_bias, dst + 0 * F); - SynetAddBias(_bias, dst + 1 * F); - SynetAddBias(_bias, dst + 2 * F); - SynetAddBias(_bias, dst + 3 * F); - } - for (; s < spatial; ++s, dst += F) - SynetAddBias(_bias, dst); - } - } - - SIMD_INLINE void SynetAddBiasNchw4c(const float * bias, size_t channels, size_t spatial, float * dst) - { - if (Aligned(dst)) - SynetAddBiasNchw4c(bias, channels, spatial, dst); - else - SynetAddBiasNchw4c(bias, channels, spatial, dst); - } - - void SynetAddBias(const float * bias, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format) - { - if (Base::NchwCompatible(channels, spatial, format)) - SynetAddBiasNchw(bias, channels, 
spatial, dst); - else if (Base::NhwcCompatible(channels, spatial, format)) - SynetAddBiasNhwc(bias, channels, spatial, dst); - else if (format == SimdTensorFormatNchw4c) - SynetAddBiasNchw4c(bias, channels, spatial, dst); - else - Base::SynetAddBias(bias, channels, spatial, dst, format); - } - - //--------------------------------------------------------------------- - - template float32x4_t SynetEltwiseLayerForward(float32x4_t src0, float32x4_t src1); - - template <> SIMD_INLINE float32x4_t SynetEltwiseLayerForward(float32x4_t src0, float32x4_t src1) - { - return vmulq_f32(src0, src1); - } - - template <> SIMD_INLINE float32x4_t SynetEltwiseLayerForward(float32x4_t src0, float32x4_t src1) - { - return vmaxq_f32(src0, src1); - } - - template <> SIMD_INLINE float32x4_t SynetEltwiseLayerForward(float32x4_t src0, float32x4_t src1) - { - return vminq_f32(src0, src1); - } - - template SIMD_INLINE void SynetEltwiseLayerForward(const float * src0, const float * src1, float * dst, size_t offset) - { - Store(dst + offset, SynetEltwiseLayerForward(Load(src0 + offset), Load(src1 + offset))); - } - - template void SynetEltwiseLayerForward(float const * const * src, size_t count, size_t size, float * dst) - { - size_t aligned = AlignLo(size, QF); - size_t partial = AlignLo(size, F); - const float * src0 = src[0]; - const float * src1 = src[1]; - size_t j = 0; - if (partial) - { - for (; j < aligned; j += QF) - { - SynetEltwiseLayerForward(src0, src1, dst, j + F * 0); - SynetEltwiseLayerForward(src0, src1, dst, j + F * 1); - SynetEltwiseLayerForward(src0, src1, dst, j + F * 2); - SynetEltwiseLayerForward(src0, src1, dst, j + F * 3); - } - for (; j < partial; j += F) - SynetEltwiseLayerForward(src0, src1, dst, j); - } - for (; j < size; ++j) - dst[j] = Base::SynetEltwiseLayerForward(src0[j], src1[j]); - for (size_t i = 2; i < count; ++i) - { - const float * srci = src[i]; - size_t j = 0; - if (partial) - { - for (; j < aligned; j += QF) - { - SynetEltwiseLayerForward(dst, srci, dst, j + F * 0); - SynetEltwiseLayerForward(dst, srci, dst, j + F * 1); - SynetEltwiseLayerForward(dst, srci, dst, j + F * 2); - SynetEltwiseLayerForward(dst, srci, dst, j + F * 3); - } - for (; j < partial; j += F) - SynetEltwiseLayerForward(dst, srci, dst, j); - } - for (; j < size; ++j) - dst[j] = Base::SynetEltwiseLayerForward(dst[j], srci[j]); - } - } - - template SIMD_INLINE void SynetEltwiseLayerForwardSum(const float * src0, const float32x4_t & weight0, const float * src1, const float32x4_t & weight1, float * dst, size_t offset) - { - Store(dst + offset, vmlaq_f32(vmulq_f32(Load(src0 + offset), weight0), Load(src1 + offset), weight1)); - } - - template SIMD_INLINE void SynetEltwiseLayerForwardSum(const float * src, const float32x4_t & weight, float * dst, size_t offset) - { - Store(dst + offset, vmlaq_f32(Load(dst + offset), Load(src + offset), weight)); - } - - template void SynetEltwiseLayerForwardSum(float const * const * src, const float * weight, size_t count, size_t size, float * dst) - { - size_t aligned = AlignLo(size, QF); - size_t partial = AlignLo(size, F); - const float * src0 = src[0]; - const float * src1 = src[1]; - float32x4_t weight0 = vdupq_n_f32(weight[0]); - float32x4_t weight1 = vdupq_n_f32(weight[1]); - size_t j = 0; - if (partial) - { - for (; j < aligned; j += QF) - { - SynetEltwiseLayerForwardSum(src0, weight0, src1, weight1, dst, j + F * 0); - SynetEltwiseLayerForwardSum(src0, weight0, src1, weight1, dst, j + F * 1); - SynetEltwiseLayerForwardSum(src0, weight0, src1, weight1, dst, j + F * 2); - 
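SynetEltwiseLayerForwardSum seeds the output with the weighted pair w0*src0 + w1*src1 (a multiply feeding vmlaq_f32) and then folds each remaining input in place with one fused multiply-add per element, so an n-input weighted sum costs one pass per input and never materializes intermediate buffers. Scalar equivalent:

#include <cstddef>

// n-input eltwise weighted sum, dst = sum_i weight[i] * src[i], computed as
// in SynetEltwiseLayerForwardSum: initialize from the first two inputs (the
// dispatcher asserts count >= 2), then accumulate the rest in place with one
// multiply-add per element.
void EltwiseSumScalar(const float* const* src, const float* weight,
                      size_t count, size_t size, float* dst)
{
    for (size_t j = 0; j < size; ++j)
        dst[j] = src[0][j] * weight[0] + src[1][j] * weight[1];
    for (size_t i = 2; i < count; ++i)
        for (size_t j = 0; j < size; ++j)
            dst[j] += src[i][j] * weight[i];
}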
SynetEltwiseLayerForwardSum(src0, weight0, src1, weight1, dst, j + F * 3); - } - for (; j < partial; j += F) - SynetEltwiseLayerForwardSum(src0, weight0, src1, weight1, dst, j); - } - for (; j < size; ++j) - dst[j] = src0[j] * weight[0] + src1[j] * weight[1]; - for (size_t i = 2; i < count; ++i) - { - const float * srci = src[i]; - float32x4_t weighti = vdupq_n_f32(weight[i]); - size_t j = 0; - if (partial) - { - for (; j < aligned; j += QF) - { - SynetEltwiseLayerForwardSum(srci, weighti, dst, j + F * 0); - SynetEltwiseLayerForwardSum(srci, weighti, dst, j + F * 1); - SynetEltwiseLayerForwardSum(srci, weighti, dst, j + F * 2); - SynetEltwiseLayerForwardSum(srci, weighti, dst, j + F * 3); - } - for (; j < partial; j += F) - SynetEltwiseLayerForwardSum(srci, weighti, dst, j); - } - for (; j < size; ++j) - dst[j] += srci[j] * weight[i]; - } - } - - template void SynetEltwiseLayerForward(float const * const * src, const float * weight, size_t count, size_t size, SimdSynetEltwiseOperationType type, float * dst) - { - switch (type) - { - case SimdSynetEltwiseOperationProduct: - SynetEltwiseLayerForward(src, count, size, dst); - break; - case SimdSynetEltwiseOperationSum: - SynetEltwiseLayerForwardSum(src, weight, count, size, dst); - break; - case SimdSynetEltwiseOperationMax: - SynetEltwiseLayerForward(src, count, size, dst); - break; - case SimdSynetEltwiseOperationMin: - SynetEltwiseLayerForward(src, count, size, dst); - break; - default: - assert(0); - } - } - - void SynetEltwiseLayerForward(float const * const * src, const float * weight, size_t count, size_t size, SimdSynetEltwiseOperationType type, float * dst) - { - assert(count >= 2); - bool aligned = Aligned(dst) && Aligned(src[0]) && Aligned(src[1]); - for (size_t i = 2; i < count; ++i) - aligned = aligned && Aligned(src[i]); - if (aligned) - SynetEltwiseLayerForward(src, weight, count, size, type, dst); - else - SynetEltwiseLayerForward(src, weight, count, size, type, dst); - } - - //--------------------------------------------------------------------- - - template SIMD_INLINE void SynetInnerProductLayerForward(const float * src, const float * weight, size_t offset, float32x4_t & sum) - { - float32x4_t s = Load(src + offset); - float32x4_t w = Load(weight + offset); - sum = vmlaq_f32(sum, s, w); - } - - template SIMD_INLINE void SynetInnerProductLayerForward(const float * src, const float * weight0, const float * weight1, size_t offset, float32x4_t * sum) - { - float32x4_t s = Load(src + offset); - float32x4_t w0 = Load(weight0 + offset); - float32x4_t w1 = Load(weight1 + offset); - sum[0] = vmlaq_f32(sum[0], s, w0); - sum[1] = vmlaq_f32(sum[1], s, w1); - } - - template void SynetInnerProductLayerForward(const float * src, const float * weight, const float * bias, size_t count, size_t size, float * dst) - { - if (align) - assert(Aligned(src) && Aligned(weight) && Aligned(size) && Aligned(dst)); - size_t count2 = AlignLo(count, 2); - size_t sizeF = AlignLo(size, F); - size_t sizeDF = AlignLo(size, DF); - size_t sizeQF = AlignLo(size, QF); - size_t i = 0; - for (; i < count2; i += 2) - { - size_t j = 0; - float sum0 = 0, sum1 = 0; - const float * weight0 = weight + 0 * size; - const float * weight1 = weight + 1 * size; - if (sizeF) - { - float32x4_t sums[4] = { vdupq_n_f32(0.0f), vdupq_n_f32(0.0f), vdupq_n_f32(0.0f), vdupq_n_f32(0.0f) }; - if (sizeDF) - { - for (; j < sizeDF; j += DF) - { - SynetInnerProductLayerForward(src, weight0, weight1, j + 0 * F, sums + 0); - SynetInnerProductLayerForward(src, weight0, weight1, j + 1 * F, sums 
+ 2); - } - sums[0] = vaddq_f32(sums[0], sums[2]); - sums[1] = vaddq_f32(sums[1], sums[3]); - } - for (; j < sizeF; j += F) - SynetInnerProductLayerForward(src, weight0, weight1, j, sums); - sum0 = ExtractSum32f(sums[0]); - sum1 = ExtractSum32f(sums[1]); - } - for (; j < size; ++j) - { - sum0 += src[j] * weight0[j]; - sum1 += src[j] * weight1[j]; - } - dst[i + 0] = sum0 + (bias ? bias[i + 0] : 0); - dst[i + 1] = sum1 + (bias ? bias[i + 1] : 0); - weight += 2*size; - } - for (; i < count; ++i) - { - size_t j = 0; - float sum = 0; - if (sizeF) - { - float32x4_t sums[4] = { vdupq_n_f32(0.0f), vdupq_n_f32(0.0f), vdupq_n_f32(0.0f), vdupq_n_f32(0.0f) }; - if (sizeQF) - { - for (; j < sizeQF; j += QF) - { - SynetInnerProductLayerForward(src, weight, j + 0 * F, sums[0]); - SynetInnerProductLayerForward(src, weight, j + 1 * F, sums[1]); - SynetInnerProductLayerForward(src, weight, j + 2 * F, sums[2]); - SynetInnerProductLayerForward(src, weight, j + 3 * F, sums[3]); - } - sums[0] = vaddq_f32(vaddq_f32(sums[0], sums[1]), vaddq_f32(sums[2], sums[3])); - } - for (; j < sizeF; j += F) - SynetInnerProductLayerForward(src, weight, j, sums[0]); - sum = ExtractSum32f(sums[0]); - } - for (; j < size; ++j) - sum += src[j] * weight[j]; - dst[i] = sum + (bias ? bias[i] : 0); - weight += size; - } - } - - void SynetInnerProductLayerForward(const float * src, const float * weight, const float * bias, size_t count, size_t size, float * dst) - { - if (Aligned(src) && Aligned(weight) && Aligned(size) && Aligned(dst)) - SynetInnerProductLayerForward(src, weight, bias, count, size, dst); - else - SynetInnerProductLayerForward(src, weight, bias, count, size, dst); - } - - //--------------------------------------------------------------------- - - template SIMD_INLINE float32x4_t LoadAtEdge(const float * src) - { - static const int32_t mask[3 * F] = { 0, 0, 0, 0, -1, -1, -1, -1, 0, 0, 0, 0 }; - return And(Load(src + shift), Load((float*)mask + F + shift)); - } - - SIMD_INLINE float32x4_t NoseSquareSum(const float * src) - { - float32x4_t s0 = LoadAtEdge<-2>(src); - float32x4_t s1 = LoadAtEdge<-1>(src); - float32x4_t s2 = Load(src); - float32x4_t s3 = Load(src + 1); - float32x4_t s4 = Load(src + 2); - return vaddq_f32(vmlaq_f32(vmulq_f32(s0, s0), s1, s1), vmlaq_f32(vmlaq_f32(vmulq_f32(s2, s2), s3, s3), s4, s4)); - } - - SIMD_INLINE float32x4_t BodySquareSum(const float * src) - { - float32x4_t s0 = Load(src - 2); - float32x4_t s1 = Load(src - 1); - float32x4_t s2 = Load(src); - float32x4_t s3 = Load(src + 1); - float32x4_t s4 = Load(src + 2); - return vaddq_f32(vmlaq_f32(vmulq_f32(s0, s0), s1, s1), vmlaq_f32(vmlaq_f32(vmulq_f32(s2, s2), s3, s3), s4, s4)); - } - - SIMD_INLINE float32x4_t TailSquareSum(const float * src) - { - float32x4_t s0 = Load(src - 2); - float32x4_t s1 = Load(src - 1); - float32x4_t s2 = Load(src); - float32x4_t s3 = LoadAtEdge<1>(src); - float32x4_t s4 = LoadAtEdge<2>(src); - return vaddq_f32(vmlaq_f32(vmulq_f32(s0, s0), s1, s1), vmlaq_f32(vmlaq_f32(vmulq_f32(s2, s2), s3, s3), s4, s4)); - } - - template void SynetLrnLayerCrossChannelsNchw(const float * src, size_t half, size_t channels, size_t spatial, const float * k, float * dst) - { - float32x4_t k0 = vdupq_n_f32(k[0]); - float32x4_t k1 = vdupq_n_f32(k[1]); - float32x4_t k2 = vdupq_n_f32(k[2]); - Neon::Pow pow; - Array32f sum(spatial, true), zero(spatial, true); - size_t aligned = AlignLo(spatial, F); - for (size_t c = 0; c < half; ++c) - { - const float * pos = src + c * spatial; - size_t s = 0; - for (; s < aligned; s += F) - { - float32x4_t 
_pos = Neon::Load(pos + s); - Neon::Store(sum.data + s, vmlaq_f32(Neon::Load(sum.data + s), _pos, _pos)); - } - for (; s < spatial; ++s) - sum[s] += Simd::Square(pos[s]); - } - for (size_t c = 0; c < channels; ++c) - { - const float * pos = (c < channels - half) ? src + half * spatial : zero.data; - const float * neg = (c > half) ? src - (half + 1) * spatial : zero.data; - size_t s = 0; - for (; s < aligned; s += F) - { - float32x4_t _pos = Neon::Load(pos + s); - float32x4_t _neg = Neon::Load(neg + s); - float32x4_t _sum = Neon::Load(sum.data + s); - _sum = vmlsq_f32(vmlaq_f32(_sum, _pos, _pos), _neg, _neg); - float32x4_t _src = Neon::Load(src + s); - Neon::Store(sum.data + s, _sum); - Neon::Store(dst + s, vmulq_f32(_src, pow(vmlaq_f32(k0, k1, _sum), k2))); - } - for (; s < spatial; ++s) - { - sum[s] += Simd::Square(pos[s]); - sum[s] -= Simd::Square(neg[s]); - dst[s] = src[s] * Base::Pow(k[0] + k[1] * sum[s], k[2]); - } - src += spatial; - dst += spatial; - } - } - - SIMD_INLINE void SynetLrnLayerCrossChannelsNchw(const float * src, size_t half, size_t channels, size_t spatial, const float * k, float * dst) - { - if (Aligned(src) && Aligned(dst) && Aligned(spatial, F)) - SynetLrnLayerCrossChannelsNchw(src, half, channels, spatial, k, dst); - else - SynetLrnLayerCrossChannelsNchw(src, half, channels, spatial, k, dst); - } - - template void SynetLrnLayerCrossChannelsNhwc2h(const float * src, size_t half, size_t channels, size_t spatial, const float * k, float * dst) - { - float32x4_t k0 = vdupq_n_f32(k[0]); - float32x4_t k1 = vdupq_n_f32(k[1]); - float32x4_t k2 = vdupq_n_f32(k[2]); - Neon::Pow pow; - size_t aligned = AlignLo(channels - half, F); - for (size_t s = 0; s < spatial; ++s) - { - Neon::Store(dst + 0, vmulq_f32(Neon::Load(src + 0), pow(vmlaq_f32(k0, k1, NoseSquareSum(src + 0)), k2))); - for (size_t c = F; c < aligned; c += F) - Neon::Store(dst + c, vmulq_f32(Neon::Load(src + c), pow(vmlaq_f32(k0, k1, BodySquareSum(src + c)), k2))); - if (aligned != channels - half) - { - size_t c = channels - half - F; - Neon::Store(dst + c, vmulq_f32(Neon::Load(src + c), pow(vmlaq_f32(k0, k1, BodySquareSum(src + c)), k2))); - } - size_t c = channels - F; - Neon::Store(dst + c, vmulq_f32(Neon::Load(src + c), pow(vmlaq_f32(k0, k1, TailSquareSum(src + c)), k2))); - src += channels; - dst += channels; - } - } - - SIMD_INLINE void SynetLrnLayerCrossChannelsNhwc(const float * src, size_t half, size_t channels, size_t spatial, const float * k, float * dst) - { - if (half == 2 && channels >= F + half) - { - if (Aligned(src) && Aligned(dst) && Aligned(channels, F)) - SynetLrnLayerCrossChannelsNhwc2h(src, half, channels, spatial, k, dst); - else - SynetLrnLayerCrossChannelsNhwc2h(src, half, channels, spatial, k, dst); - } - else - Base::SynetLrnLayerCrossChannels(src, half, channels, spatial, k, dst, SimdTensorFormatNhwc); - } - - void SynetLrnLayerCrossChannels(const float * src, size_t half, size_t channels, size_t spatial, const float * k, float * dst, SimdTensorFormatType format) - { - if (format == SimdTensorFormatNchw) - SynetLrnLayerCrossChannelsNchw(src, half, channels, spatial, k, dst); - else if (format == SimdTensorFormatNhwc) - SynetLrnLayerCrossChannelsNhwc(src, half, channels, spatial, k, dst); - else - Base::SynetLrnLayerCrossChannels(src, half, channels, spatial, k, dst, format); - } - - //--------------------------------------------------------------------- - - template SIMD_INLINE void SynetScaleLayerForward(const float * src, const float * scale, const float * bias, float * dst, size_t 
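/* The NCHW LRN path above avoids recomputing the (2 * half + 1)-channel window of squares for every channel: it keeps a running sum per pixel, adds the square of the channel entering the window, emits src * (k0 + k1 * sum)^k2, and subtracts the square of the channel leaving. A scalar reference of that recurrence for one pixel (hypothetical helper; stride is the channel stride, i.e. spatial for NCHW), assuming <math.h> and <stddef.h>:

   static void LrnAcrossChannels(const float* src, size_t channels, size_t half,
                                 size_t stride, const float k[3], float* dst)
   {
       float sum = 0.0f;
       for (size_t c = 0; c < half && c < channels; ++c)  // prime the window
           sum += src[c * stride] * src[c * stride];
       for (size_t c = 0; c < channels; ++c)
       {
           if (c + half < channels)                       // channel entering
               sum += src[(c + half) * stride] * src[(c + half) * stride];
           dst[c * stride] = src[c * stride] * powf(k[0] + k[1] * sum, k[2]);
           if (c >= half)                                 // channel leaving
               sum -= src[(c - half) * stride] * src[(c - half) * stride];
       }
   }
*/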
offset) - { - Store(dst + offset, Fmadd(Load(src + offset), Load(scale + offset), Load(bias + offset))); - } - - template SIMD_INLINE void SynetScaleLayerForward(const float * src, const float * scale, float * dst, size_t offset) - { - Store(dst + offset, vmulq_f32(Load(src + offset), Load(scale + offset))); - } - - template SIMD_INLINE void SynetScaleLayerForward(const float * src, const float32x4_t & scale, const float32x4_t & bias, float * dst, size_t offset) - { - Store(dst + offset, Fmadd(Load(src + offset), scale, bias)); - } - - template SIMD_INLINE void SynetScaleLayerForward(const float * src, const float32x4_t & scale, float * dst, size_t offset) - { - Store(dst + offset, vmulq_f32(Load(src + offset), scale)); - } - - template void SynetScaleLayerForwardNchw(const float * src, const float * scale, const float * bias, size_t channels, size_t spatial, float * dst) - { - if (align) - assert(Aligned(src) && Aligned(spatial, F) && Aligned(dst)); - - size_t aligned = AlignLo(spatial, QF); - size_t partial = AlignLo(spatial, F); - if (bias) - { - for (size_t c = 0; c < channels; ++c) - { - size_t s = 0; - if (partial) - { - float32x4_t _scale = vdupq_n_f32(scale[c]); - float32x4_t _bias = vdupq_n_f32(bias[c]); - for (; s < aligned; s += QF) - { - SynetScaleLayerForward(src, _scale, _bias, dst, s + F * 0); - SynetScaleLayerForward(src, _scale, _bias, dst, s + F * 1); - SynetScaleLayerForward(src, _scale, _bias, dst, s + F * 2); - SynetScaleLayerForward(src, _scale, _bias, dst, s + F * 3); - } - for (; s < partial; s += F) - SynetScaleLayerForward(src, _scale, _bias, dst, s); - } - for (; s < spatial; ++s) - dst[s] = src[s] * scale[c] + bias[c]; - src += spatial; - dst += spatial; - } - } - else - { - for (size_t c = 0; c < channels; ++c) - { - size_t s = 0; - if (partial) - { - float32x4_t _scale = vdupq_n_f32(scale[c]); - for (; s < aligned; s += QF) - { - SynetScaleLayerForward(src, _scale, dst, s + F * 0); - SynetScaleLayerForward(src, _scale, dst, s + F * 1); - SynetScaleLayerForward(src, _scale, dst, s + F * 2); - SynetScaleLayerForward(src, _scale, dst, s + F * 3); - } - for (; s < partial; s += F) - SynetScaleLayerForward(src, _scale, dst, s); - } - for (; s < spatial; ++s) - dst[s] = src[s] * scale[c]; - src += spatial; - dst += spatial; - } - } - } - - template void SynetScaleLayerForwardNchw(const float * src, const float * scale, const float * bias, size_t channels, size_t spatial, float * dst) - { - if (Aligned(src) && Aligned(spatial, F) && Aligned(dst)) - SynetScaleLayerForwardNchw(src, scale, bias, channels, spatial, dst); - else - SynetScaleLayerForwardNchw(src, scale, bias, channels, spatial, dst); - } - - template void SynetScaleLayerForwardNhwc(const float * src, const float * scale, const float * bias, size_t channels, size_t spatial, float * dst) - { - if (align) - assert(Aligned(src) && Aligned(scale) && Aligned(bias) && Aligned(channels, F) && Aligned(dst)); - - size_t aligned = AlignLo(channels, QF); - size_t partial = AlignLo(channels, F); - if (bias) - { - for (size_t s = 0; s < spatial; ++s) - { - size_t c = 0; - if (partial) - { - for (; c < aligned; c += QF) - { - SynetScaleLayerForward(src, scale, bias, dst, c + F * 0); - SynetScaleLayerForward(src, scale, bias, dst, c + F * 1); - SynetScaleLayerForward(src, scale, bias, dst, c + F * 2); - SynetScaleLayerForward(src, scale, bias, dst, c + F * 3); - } - for (; c < partial; c += F) - SynetScaleLayerForward(src, scale, bias, dst, c); - } - for (; c < channels; ++c) - dst[c] = src[c] * scale[c] + bias[c]; - src 
+= channels; - dst += channels; - } - } - else - { - for (size_t s = 0; s < spatial; ++s) - { - size_t c = 0; - if (partial) - { - for (; c < aligned; c += QF) - { - SynetScaleLayerForward(src, scale, dst, c + F * 0); - SynetScaleLayerForward(src, scale, dst, c + F * 1); - SynetScaleLayerForward(src, scale, dst, c + F * 2); - SynetScaleLayerForward(src, scale, dst, c + F * 3); - } - for (; c < partial; c += F) - SynetScaleLayerForward(src, scale, dst, c); - } - for (; c < channels; ++c) - dst[c] = src[c] * scale[c]; - src += channels; - dst += channels; - } - } - } - - template void SynetScaleLayerForwardNhwc3(const float * src, const float * scale, const float * bias, size_t spatial, float * dst) - { - if (align) - assert(Aligned(src) && Aligned(dst)); - - size_t spatial3 = spatial * 3; - size_t spatialF3 = AlignLo(spatial, F) * 3; - if (bias) - { - size_t s = 0; - if (spatialF3) - { - float _scale[F * 3], _bias[F * 3]; - for (size_t i = 0; i < F; ++i) - for (size_t c = 0; c < 3; ++c) - _scale[i * 3 + c] = scale[c], _bias[i * 3 + c] = bias[c]; - float32x4_t _scale0 = Load(_scale + 0 * F); - float32x4_t _scale1 = Load(_scale + 1 * F); - float32x4_t _scale2 = Load(_scale + 2 * F); - float32x4_t _bias0 = Load(_bias + 0 * F); - float32x4_t _bias1 = Load(_bias + 1 * F); - float32x4_t _bias2 = Load(_bias + 2 * F); - for (; s < spatialF3; s += F * 3) - { - SynetScaleLayerForward(src, _scale0, _bias0, dst, s + F * 0); - SynetScaleLayerForward(src, _scale1, _bias1, dst, s + F * 1); - SynetScaleLayerForward(src, _scale2, _bias2, dst, s + F * 2); - } - } - for (; s < spatial3; s += 3) - { - dst[s + 0] = src[s + 0] * scale[0] + bias[0]; - dst[s + 1] = src[s + 1] * scale[1] + bias[1]; - dst[s + 2] = src[s + 2] * scale[2] + bias[2]; - } - } - else - { - size_t s = 0; - if (spatialF3) - { - float _scale[F * 3]; - for (size_t i = 0; i < F; ++i) - for (size_t c = 0; c < 3; ++c) - _scale[i * 3 + c] = scale[c]; - float32x4_t _scale0 = Load(_scale + 0 * F); - float32x4_t _scale1 = Load(_scale + 1 * F); - float32x4_t _scale2 = Load(_scale + 2 * F); - for (; s < spatialF3; s += F * 3) - { - SynetScaleLayerForward(src, _scale0, dst, s + F * 0); - SynetScaleLayerForward(src, _scale1, dst, s + F * 1); - SynetScaleLayerForward(src, _scale2, dst, s + F * 2); - } - } - for (; s < spatial3; s += 3) - { - dst[s + 0] = src[s + 0] * scale[0]; - dst[s + 1] = src[s + 1] * scale[1]; - dst[s + 2] = src[s + 2] * scale[2]; - } - } - } - - template void SynetScaleLayerForwardNhwc(const float * src, const float * scale, const float * bias, size_t channels, size_t spatial, float * dst) - { - if (channels == 3) - { - if (Aligned(src) && Aligned(dst)) - SynetScaleLayerForwardNhwc3(src, scale, bias, spatial, dst); - else - SynetScaleLayerForwardNhwc3(src, scale, bias, spatial, dst); - } - else - { - if (Aligned(src) && Aligned(scale) && Aligned(bias) && Aligned(channels, F) && Aligned(dst)) - SynetScaleLayerForwardNhwc(src, scale, bias, channels, spatial, dst); - else - SynetScaleLayerForwardNhwc(src, scale, bias, channels, spatial, dst); - } - } - - template void SynetScaleLayerForwardNchw4c(const float * src, const float * scale, const float * bias, size_t channels, size_t spatial, float * dst) - { - if (align) - assert(Aligned(src) && Aligned(dst)); - - size_t spatialF = spatial * F; - size_t spatial4F = AlignLo(spatial, 4)*F; - if (bias) - { - for (size_t c = 0; c < channels; c += F) - { - float32x4_t _scale = Load(scale + c); - float32x4_t _bias = Load(bias + c); - size_t s = 0; - for (; s < spatial4F; s += 4 * F) - { - 
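/* With three NHWC channels, twelve consecutive floats cover exactly four pixels, so the per-channel scale repeats with period three across three 4-lane registers. The loop above materializes that as an F * 3 staging buffer; the resulting lane layout is the channel pattern rotated by one per register. A sketch of the constant construction (hypothetical helper, assuming <arm_neon.h>):

   // Lane pattern: [c0 c1 c2 c0] [c1 c2 c0 c1] [c2 c0 c1 c2].
   static void BuildStride3Constants(const float s[3], float32x4_t v[3])
   {
       const float r[12] = { s[0], s[1], s[2], s[0],
                             s[1], s[2], s[0], s[1],
                             s[2], s[0], s[1], s[2] };
       v[0] = vld1q_f32(r + 0);
       v[1] = vld1q_f32(r + 4);
       v[2] = vld1q_f32(r + 8);
   }
*/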
SynetScaleLayerForward(src, _scale, _bias, dst, s + F * 0); - SynetScaleLayerForward(src, _scale, _bias, dst, s + F * 1); - SynetScaleLayerForward(src, _scale, _bias, dst, s + F * 2); - SynetScaleLayerForward(src, _scale, _bias, dst, s + F * 3); - } - for (; s < spatialF; s += F) - SynetScaleLayerForward(src, _scale, _bias, dst, s); - src += spatialF; - dst += spatialF; - } - } - else - { - for (size_t c = 0; c < channels; c += F) - { - float32x4_t _scale = Load(scale + c); - size_t s = 0; - for (; s < spatial4F; s += 4 * F) - { - SynetScaleLayerForward(src, _scale, dst, s + F * 0); - SynetScaleLayerForward(src, _scale, dst, s + F * 1); - SynetScaleLayerForward(src, _scale, dst, s + F * 2); - SynetScaleLayerForward(src, _scale, dst, s + F * 3); - } - for (; s < spatialF; s += F) - SynetScaleLayerForward(src, _scale, dst, s); - src += spatialF; - dst += spatialF; - } - } - } - - template void SynetScaleLayerForwardNchw4c(const float * src, const float * scale, const float * bias, size_t channels, size_t spatial, float * dst) - { - if (Aligned(src) && Aligned(dst)) - SynetScaleLayerForwardNchw4c(src, scale, bias, channels, spatial, dst); - else - SynetScaleLayerForwardNchw4c(src, scale, bias, channels, spatial, dst); - } - - void SynetScaleLayerForward(const float* src, const float* scale, const float* bias, size_t channels, size_t height, size_t width, float* dst, SimdTensorFormatType format, SimdSynetCompatibilityType compatibility) - { - size_t spatial = height * width; - bool nofma = compatibility & SimdSynetCompatibilityNoFma; - if (Base::NchwCompatible(channels, spatial, format)) - { - if(nofma) - SynetScaleLayerForwardNchw(src, scale, bias, channels, spatial, dst); - else - SynetScaleLayerForwardNchw(src, scale, bias, channels, spatial, dst); - } - else if (Base::NhwcCompatible(channels, spatial, format)) - { - if (nofma) - SynetScaleLayerForwardNhwc(src, scale, bias, channels, spatial, dst); - else - SynetScaleLayerForwardNhwc(src, scale, bias, channels, spatial, dst); - } - else if (format == SimdTensorFormatNchw4c) - { - if (nofma) - SynetScaleLayerForwardNchw4c(src, scale, bias, channels, spatial, dst); - else - SynetScaleLayerForwardNchw4c(src, scale, bias, channels, spatial, dst); - } - else - Base::SynetScaleLayerForward(src, scale, bias, channels, height, width, dst, format, compatibility); - } - - //--------------------------------------------------------------------- - - void SynetShuffleLayerForward(const float* src0, const float* src1, size_t channels0, size_t channels1, size_t spatial, float* dst0, float* dst1, SimdTensorFormatType format, int type) - { - if (format == SimdTensorFormatNchw) - Base::SynetShuffleLayerForward(src0, src1, channels0, channels1, spatial, dst0, dst1, format, type); - else if (format == SimdTensorFormatNhwc) - { - size_t channels = (channels0 + channels1) / 2; - size_t channels0DF = AlignLo(channels0, DF); - size_t channels1DF = AlignLo(channels1, DF); - if (type == 0) - { - for (size_t s = 0; s < spatial; ++s) - { - size_t cd = 0, cs0 = 0, cs1 = 0; - for (; cs0 < channels0DF; cs0 += DF, cd += F) - { - float32x4x2_t _src0 = Load2(src0 + cs0); - Store(dst0 + cd, _src0.val[0]); - Store(dst1 + cd, _src0.val[1]); - } - for (; cs0 < channels0; cs0 += 2, cd += 1) - { - dst0[cd] = src0[cs0 + 0]; - dst1[cd] = src0[cs0 + 1]; - } - for (; cs1 < channels1DF; cs1 += DF, cd += F) - { - float32x4x2_t _src1 = Load2(src1 + cs1); - Store(dst0 + cd, _src1.val[0]); - Store(dst1 + cd, _src1.val[1]); - } - for (; cs1 < channels1; cs1 += 2, cd += 1) - { - dst0[cd] = 
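/* The NHWC shuffle below de-interleaves pairs of channels with Load2/Store2, which map onto NEON's structured load/store vld2q_f32/vst2q_f32: one instruction splits eight interleaved floats into an even-lane and an odd-lane register. A self-contained sketch of the type == 0 inner step (hypothetical helper, assuming <arm_neon.h>, <stddef.h>, n even):

   static void Deinterleave2(const float* src, size_t n, float* dst0, float* dst1)
   {
       size_t i = 0, o = 0;
       for (; i + 8 <= n; i += 8, o += 4)
       {
           float32x4x2_t v = vld2q_f32(src + i); // val[0] = even, val[1] = odd
           vst1q_f32(dst0 + o, v.val[0]);
           vst1q_f32(dst1 + o, v.val[1]);
       }
       for (; i < n; i += 2, o += 1)             // scalar tail
       {
           dst0[o] = src[i + 0];
           dst1[o] = src[i + 1];
       }
   }
*/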
src1[cs1 + 0]; - dst1[cd] = src1[cs1 + 1]; - } - src0 += channels0; - src1 += channels1; - dst0 += channels; - dst1 += channels; - } - } - else if (type == 1) - { - for (size_t s = 0; s < spatial; ++s) - { - size_t cs = 0, cd0 = 0, cd1 = 0; - for (; cd0 < channels0DF; cd0 += DF, cs += F) - { - float32x4x2_t s; - s.val[0] = Load(src0 + cs); - s.val[1] = Load(src1 + cs); - Store2(dst0 + cd0, s); - } - for (; cd0 < channels0; cd0 += 2, cs += 1) - { - dst0[cd0 + 0] = src0[cs]; - dst0[cd0 + 1] = src1[cs]; - } - for (; cd1 < channels1DF; cd1 += DF, cs += F) - { - float32x4x2_t s; - s.val[0] = Load(src0 + cs); - s.val[1] = Load(src1 + cs); - Store2(dst1 + cd1, s); - } - for (; cd1 < channels1; cd1 += 2, cs += 1) - { - dst1[cd1 + 0] = src0[cs]; - dst1[cd1 + 1] = src1[cs]; - } - src0 += channels; - src1 += channels; - dst0 += channels0; - dst1 += channels1; - } - } - else - assert(0); - } - else - assert(0); - } - - //--------------------------------------------------------------------- - - void SynetSoftmaxLayerForward(const float * src, size_t outer, size_t count, size_t inner, float * dst) - { - Exp exp; - if (inner == 1 && count == 2) - { - size_t aligned = Simd::AlignLo(outer, F); - size_t o = 0; - for (; o < aligned; o += F) - { - float32x4x2_t s = Load2(src); - float32x4_t max = vmaxq_f32(s.val[0], s.val[1]); - float32x4_t exp0 = exp.Exponent(vsubq_f32(s.val[0], max)); - float32x4_t exp1 = exp.Exponent(vsubq_f32(s.val[1], max)); - float32x4_t sum = vaddq_f32(exp0, exp1); - float32x4x2_t d; - d.val[0] = Div<1>(exp0, sum); - d.val[1] = Div<1>(exp1, sum); - Store2(dst, d); - src += DF; - dst += DF; - } - for (; o < outer; ++o) - { - float max = Simd::Max(src[0], src[1]); - float exp0 = ::exp(src[0] - max); - float exp1 = ::exp(src[1] - max); - float sum = exp0 + exp1; - dst[0] = exp0 / sum; - dst[1] = exp1 / sum; - src += 2; - dst += 2; - } - } - else - { - size_t aligned = Simd::AlignLo(inner, F); - Array32f tmp(inner * 2); - const float * s; - float * max = tmp.data, *sum = tmp.data + inner, *d; - for (size_t o = 0; o < outer; ++o) - { - memcpy(max, src, inner * sizeof(float)); - s = src + inner; - for (size_t c = 1; c < count; ++c) - { - size_t i = 0; - for (; i < aligned; i += F) - Store(max + i, vmaxq_f32(Load(s + i), Load(max + i))); - for (; i < inner; ++i) - max[i] = Simd::Max(max[i], s[i]); - s += inner; - } - - s = src; - d = dst; - memset(sum, 0, inner * sizeof(float)); - for (size_t c = 0; c < count; ++c) - { - size_t i = 0; - for (; i < aligned; i += F) - { - float32x4_t _d = exp.Exponent(vsubq_f32(Load(s + i), Load(max + i))); - Store(d + i, _d); - Store(sum + i, vaddq_f32(_d, Load(sum + i))); - } - for (; i < inner; ++i) - { - d[i] = ::exp(s[i] - max[i]); - sum[i] += d[i]; - } - s += inner; - d += inner; - } - - d = dst; - for (size_t c = 0; c < count; ++c) - { - size_t i = 0; - for (; i < aligned; i += F) - Store(d + i, Div<1>(Load(d + i), Load(sum + i))); - for (; i < inner; ++i) - d[i] /= sum[i]; - d += inner; - } - src += count * inner; - dst += count * inner; - } - } - } - - //--------------------------------------------------------------------- - - template float32x4_t SynetUnaryOperation32f(float32x4_t value); - - template<> SIMD_INLINE float32x4_t SynetUnaryOperation32f(float32x4_t value) - { - return vabsq_f32(value); - } - - template<> SIMD_INLINE float32x4_t SynetUnaryOperation32f(float32x4_t value) - { - return Exponent(value); - } - - template<> SIMD_INLINE float32x4_t SynetUnaryOperation32f(float32x4_t value) - { - return Logarithm(value); - } - - template<> 
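/* Two techniques in the softmax above are worth spelling out. First, the count == 2 fast path is a numerically stable two-class softmax: subtracting the per-pair maximum before exponentiation cannot change the result (the factor e^(-max) cancels in the ratio) but prevents overflow. The scalar tail in miniature, assuming <math.h>:

   static void Softmax2(const float s[2], float d[2])
   {
       float m   = s[0] > s[1] ? s[0] : s[1];
       float e0  = expf(s[0] - m);
       float e1  = expf(s[1] - m);
       float inv = 1.0f / (e0 + e1);
       d[0] = e0 * inv;
       d[1] = e1 * inv;
   }

   Second, Div<1> exists because 32-bit NEON has no vector float divide: it refines the rough reciprocal estimate vrecpeq_f32 with a Newton-Raphson step built from vrecpsq_f32(b, r), which computes 2 - b * r. A sketch:

   static inline float32x4_t Div1(float32x4_t a, float32x4_t b)
   {
       float32x4_t r = vrecpeq_f32(b);       // rough 1/b
       r = vmulq_f32(r, vrecpsq_f32(b, r));  // r *= (2 - b*r), one NR step
       return vmulq_f32(a, r);
   }
*/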
SIMD_INLINE float32x4_t SynetUnaryOperation32f(float32x4_t value) - { - return vnegq_f32(value); - } - - template<> SIMD_INLINE float32x4_t SynetUnaryOperation32f(float32x4_t value) - { - return ReciprocalSqrt<1>(value); - } - - template<> SIMD_INLINE float32x4_t SynetUnaryOperation32f(float32x4_t value) - { - return Sqrt<1>(value); - } - - template<> SIMD_INLINE float32x4_t SynetUnaryOperation32f(float32x4_t value) - { - return Tanh<1>(value); - } - - template<> SIMD_INLINE float32x4_t SynetUnaryOperation32f(float32x4_t value) - { - return vdupq_n_f32(0.0f); - } - - template void SynetUnaryOperation32fLayerForward(const float* src, size_t size, float* dst) - { - size_t sizeF = AlignLo(size, F); - size_t sizeQF = AlignLo(size, QF); - size_t i = 0; - for (; i < sizeQF; i += QF) - { - Neon::Store(dst + i + 0 * F, SynetUnaryOperation32f(Neon::Load(src + i + 0 * F))); - Neon::Store(dst + i + 1 * F, SynetUnaryOperation32f(Neon::Load(src + i + 1 * F))); - Neon::Store(dst + i + 2 * F, SynetUnaryOperation32f(Neon::Load(src + i + 2 * F))); - Neon::Store(dst + i + 3 * F, SynetUnaryOperation32f(Neon::Load(src + i + 3 * F))); - } - for (; i < sizeF; i += F) - Neon::Store(dst + i, SynetUnaryOperation32f(Neon::Load(src + i))); - for (; i < size; ++i) - dst[i] = Base::SynetUnaryOperation32f(src[i]); - } - - template void SynetUnaryOperation32fLayerForward(const float* src, size_t size, SimdSynetUnaryOperation32fType type, float* dst) - { - switch (type) - { - case SimdSynetUnaryOperation32fAbs: SynetUnaryOperation32fLayerForward(src, size, dst); break; - case SimdSynetUnaryOperation32fExp: SynetUnaryOperation32fLayerForward(src, size, dst); break; - case SimdSynetUnaryOperation32fLog: SynetUnaryOperation32fLayerForward(src, size, dst); break; - case SimdSynetUnaryOperation32fNeg: SynetUnaryOperation32fLayerForward(src, size, dst); break; - case SimdSynetUnaryOperation32fRsqrt: SynetUnaryOperation32fLayerForward(src, size, dst); break; - case SimdSynetUnaryOperation32fSqrt: SynetUnaryOperation32fLayerForward(src, size, dst); break; - case SimdSynetUnaryOperation32fTanh: SynetUnaryOperation32fLayerForward(src, size, dst); break; - case SimdSynetUnaryOperation32fZero: SynetUnaryOperation32fLayerForward(src, size, dst); break; - default: - assert(0); - } - } - - void SynetUnaryOperation32fLayerForward(const float* src, size_t size, SimdSynetUnaryOperation32fType type, float* dst) - { - if (Aligned(src) && Aligned(dst)) - SynetUnaryOperation32fLayerForward(src, size, type, dst); - else - SynetUnaryOperation32fLayerForward(src, size, type, dst); - } - } -#endif// SIMD_NEON_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdNeonSynetActivation.cpp b/src/3rd/Simd/Simd/SimdNeonSynetActivation.cpp deleted file mode 100644 index 0e9dde75..00000000 --- a/src/3rd/Simd/Simd/SimdNeonSynetActivation.cpp +++ /dev/null @@ -1,424 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2019 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdArray.h" -#include "Simd/SimdPow.h" -#include "Simd/SimdExp.h" -#include "Simd/SimdBase.h" -#include "Simd/SimdSynet.h" - -namespace Simd -{ -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - template <bool align> SIMD_INLINE void SynetElu32f(const float * src, const Neon::Exp & exp, float32x4_t alpha, float * dst, size_t offset) - { - Store<align>(dst + offset, exp.Elu(Load<align>(src + offset), alpha)); - } - - template <bool align> void SynetElu32f(const float * src, size_t size, const float * alpha, float * dst) - { - float32x4_t _alpha = vdupq_n_f32(alpha[0]); - Neon::Exp exp; - size_t sizeF = AlignLo(size, F); - size_t sizeQF = AlignLo(size, QF); - size_t i = 0; - for (; i < sizeQF; i += QF) - { - SynetElu32f<align>(src, exp, _alpha, dst, i + 0 * F); - SynetElu32f<align>(src, exp, _alpha, dst, i + 1 * F); - SynetElu32f<align>(src, exp, _alpha, dst, i + 2 * F); - SynetElu32f<align>(src, exp, _alpha, dst, i + 3 * F); - } - for (; i < sizeF; i += F) - SynetElu32f<align>(src, exp, _alpha, dst, i); - for (; i < size; ++i) - dst[i] = Base::SynetElu32f(src[i], alpha[0]); - } - - void SynetElu32f(const float * src, size_t size, const float * alpha, float * dst) - { - if (Aligned(src) && Aligned(dst)) - SynetElu32f<true>(src, size, alpha, dst); - else - SynetElu32f<false>(src, size, alpha, dst); - } - - //------------------------------------------------------------------------- - - template <bool align> SIMD_INLINE void SynetHswish32f(const float * src, float32x4_t shift, float32x4_t scale, float * dst, size_t offset) - { - float32x4_t _src = Load<align>(src + offset); - float32x4_t _dst = SynetHswish32f(_src, shift, scale); - Store<align>(dst + offset, _dst); - } - - template <bool align> void SynetHswish32f(const float * src, size_t size, const float * shift, const float * scale, float * dst) - { - float32x4_t _shift = vdupq_n_f32(shift[0]); - float32x4_t _scale = vdupq_n_f32(scale[0]); - size_t sizeF = AlignLo(size, F); - size_t sizeQF = AlignLo(size, QF); - size_t i = 0; - for (; i < sizeQF; i += QF) - { - SynetHswish32f<align>(src, _shift, _scale, dst, i + 0 * F); - SynetHswish32f<align>(src, _shift, _scale, dst, i + 1 * F); - SynetHswish32f<align>(src, _shift, _scale, dst, i + 2 * F); - SynetHswish32f<align>(src, _shift, _scale, dst, i + 3 * F); - } - for (; i < sizeF; i += F) - SynetHswish32f<align>(src, _shift, _scale, dst, i); - for (; i < size; ++i) - dst[i] = Base::SynetHswish32f(src[i], shift[0], scale[0]); - } - - void SynetHswish32f(const float * src, size_t size, const float * shift, const float * scale, float * dst) - { - if (Aligned(src) && Aligned(dst)) - SynetHswish32f<true>(src, size, shift, scale, dst); - else - SynetHswish32f<false>(src, size, shift, scale, dst); - } - - //--------------------------------------------------------------------- - - template <bool align> SIMD_INLINE void SynetPreluLayerForward(const float* src, const float* slope, float32x4_t _0, float* dst, size_t offset) - { - Store<align>(dst + offset, SynetRelu32f(Load<align>(src + offset), Load<align>(slope + offset), _0)); - } - - template <bool align> SIMD_INLINE void SynetPreluLayerForward(const float* src, float32x4_t slope, float32x4_t _0, float* dst, size_t
offset) - { - Store(dst + offset, SynetRelu32f(Load(src + offset), slope, _0)); - } - - template void SynetPreluLayerForwardNchw(const float* src, const float* slope, size_t channels, size_t spatial, float* dst) - { - if (align) - assert(Aligned(src) && Aligned(spatial, F) && Aligned(dst)); - - size_t aligned = AlignLo(spatial, QF); - size_t partial = AlignLo(spatial, F); - float32x4_t _0 = vdupq_n_f32(0.0f); - for (size_t c = 0; c < channels; ++c) - { - size_t s = 0; - if (partial) - { - float32x4_t _slope = vdupq_n_f32(slope[c]); - for (; s < aligned; s += QF) - { - SynetPreluLayerForward(src, _slope, _0, dst, s + F * 0); - SynetPreluLayerForward(src, _slope, _0, dst, s + F * 1); - SynetPreluLayerForward(src, _slope, _0, dst, s + F * 2); - SynetPreluLayerForward(src, _slope, _0, dst, s + F * 3); - } - for (; s < partial; s += F) - SynetPreluLayerForward(src, _slope, _0, dst, s); - } - for (; s < spatial; ++s) - dst[s] = Base::SynetRelu32f(src[s], slope[c]); - src += spatial; - dst += spatial; - } - } - - SIMD_INLINE void SynetPreluLayerForwardNchw(const float* src, const float* slope, size_t channels, size_t spatial, float* dst) - { - if (Aligned(src) && Aligned(spatial, F) && Aligned(dst)) - SynetPreluLayerForwardNchw(src, slope, channels, spatial, dst); - else - SynetPreluLayerForwardNchw(src, slope, channels, spatial, dst); - } - - template void SynetPreluLayerForwardNhwc(const float* src, const float* slope, size_t channels, size_t spatial, float* dst) - { - if (align) - assert(Aligned(src) && Aligned(slope) && Aligned(channels, F) && Aligned(dst)); - - size_t aligned = AlignLo(channels, QF); - size_t partial = AlignLo(channels, F); - float32x4_t _0 = vdupq_n_f32(0.0f); - for (size_t s = 0; s < spatial; ++s) - { - size_t c = 0; - if (partial) - { - for (; c < aligned; c += QF) - { - SynetPreluLayerForward(src, slope, _0, dst, c + F * 0); - SynetPreluLayerForward(src, slope, _0, dst, c + F * 1); - SynetPreluLayerForward(src, slope, _0, dst, c + F * 2); - SynetPreluLayerForward(src, slope, _0, dst, c + F * 3); - } - for (; c < partial; c += F) - SynetPreluLayerForward(src, slope, _0, dst, c); - } - for (; c < channels; ++c) - dst[c] = Base::SynetRelu32f(src[c], slope[c]); - src += channels; - dst += channels; - } - } - - SIMD_INLINE void SynetPreluLayerForwardNhwc(const float* src, const float* slope, size_t channels, size_t spatial, float* dst) - { - if (Aligned(src) && Aligned(slope) && Aligned(channels, F) && Aligned(dst)) - SynetPreluLayerForwardNhwc(src, slope, channels, spatial, dst); - else - SynetPreluLayerForwardNhwc(src, slope, channels, spatial, dst); - } - - template void SynetPreluLayerForwardNchw4c(const float* src, const float* slope, size_t channels, size_t spatial, float* dst) - { - if (align) - assert(Aligned(src) && Aligned(dst)); - - size_t spatialF = spatial * F; - size_t spatial4F = AlignLo(spatial, 4) * F; - float32x4_t _0 = vdupq_n_f32(0.0f); - for (size_t c = 0; c < channels; c += F) - { - float32x4_t _slope = Load(slope + c); - size_t s = 0; - for (; s < spatial4F; s += 4 * F) - { - SynetPreluLayerForward(src, _slope, _0, dst, s + F * 0); - SynetPreluLayerForward(src, _slope, _0, dst, s + F * 1); - SynetPreluLayerForward(src, _slope, _0, dst, s + F * 2); - SynetPreluLayerForward(src, _slope, _0, dst, s + F * 3); - } - for (; s < spatialF; s += F) - SynetPreluLayerForward(src, _slope, _0, dst, s); - src += spatialF; - dst += spatialF; - } - } - - SIMD_INLINE void SynetPreluLayerForwardNchw4c(const float* src, const float* slope, size_t channels, size_t spatial, 
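/* SynetRelu32f(x, slope, 0) used by the PReLU kernels above is the parametric ReLU. One branch-free formulation, consistent with the scalar Base fallback: prelu(x) = max(x, 0) + slope * min(x, 0), which equals x when x > 0 and slope * x otherwise, and costs a single fused multiply-accumulate per vector. A sketch (assuming <arm_neon.h>):

   static inline float32x4_t Prelu(float32x4_t x, float32x4_t slope)
   {
       float32x4_t zero = vdupq_n_f32(0.0f);
       return vmlaq_f32(vmaxq_f32(x, zero), slope, vminq_f32(x, zero));
   }
*/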
float* dst) - { - if (Aligned(src) && Aligned(dst)) - SynetPreluLayerForwardNchw4c(src, slope, channels, spatial, dst); - else - SynetPreluLayerForwardNchw4c(src, slope, channels, spatial, dst); - } - - void SynetPreluLayerForward(const float* src, const float* slope, size_t channels, size_t spatial, float* dst, SimdTensorFormatType format) - { - if (Base::NchwCompatible(channels, spatial, format)) - SynetPreluLayerForwardNchw(src, slope, channels, spatial, dst); - else if (Base::NhwcCompatible(channels, spatial, format)) - SynetPreluLayerForwardNhwc(src, slope, channels, spatial, dst); - else if (format == SimdTensorFormatNchw4c) - SynetPreluLayerForwardNchw4c(src, slope, channels, spatial, dst); - else - Base::SynetPreluLayerForward(src, slope, channels, spatial, dst, format); - } - - //------------------------------------------------------------------------- - - template SIMD_INLINE void SynetRelu32f(const float* src, float32x4_t slope, float32x4_t zero, float* dst, size_t offset) - { - float32x4_t _src = Load(src + offset); - float32x4_t _dst = SynetRelu32f(_src, slope, zero); - Store(dst + offset, _dst); - } - - template void SynetRelu32f(const float* src, size_t size, const float* slope, float* dst) - { - float32x4_t _slope = vdupq_n_f32(slope[0]); - float32x4_t _0 = vdupq_n_f32(0.0f); - size_t sizeF = AlignLo(size, F); - size_t sizeQF = AlignLo(size, QF); - size_t i = 0; - for (; i < sizeQF; i += QF) - { - SynetRelu32f(src, _slope, _0, dst, i + 0 * F); - SynetRelu32f(src, _slope, _0, dst, i + 1 * F); - SynetRelu32f(src, _slope, _0, dst, i + 2 * F); - SynetRelu32f(src, _slope, _0, dst, i + 3 * F); - } - for (; i < sizeF; i += F) - SynetRelu32f(src, _slope, _0, dst, i); - for (; i < size; ++i) - dst[i] = Base::SynetRelu32f(src[i], slope[0]); - } - - void SynetRelu32f(const float* src, size_t size, const float* slope, float* dst) - { - if (Aligned(src) && Aligned(dst)) - SynetRelu32f(src, size, slope, dst); - else - SynetRelu32f(src, size, slope, dst); - } - - //--------------------------------------------------------------------- - - template void SynetRestrictRange32f(const float * src, size_t size, const float * lower, const float * upper, float * dst) - { - assert(lower[0] <= upper[0]); - if (align) - assert(Aligned(src) && Aligned(dst)); - float min = *lower; - float max = *upper; - float32x4_t _min = vdupq_n_f32(min); - float32x4_t _max = vdupq_n_f32(max); - size_t sizeF = Simd::AlignLo(size, F); - size_t sizeQF = Simd::AlignLo(size, QF); - size_t i = 0; - for (; i < sizeQF; i += QF) - { - Store(dst + i + 0 * F, vminq_f32(vmaxq_f32(_min, Load(src + i + 0 * F)), _max)); - Store(dst + i + 1 * F, vminq_f32(vmaxq_f32(_min, Load(src + i + 1 * F)), _max)); - Store(dst + i + 2 * F, vminq_f32(vmaxq_f32(_min, Load(src + i + 2 * F)), _max)); - Store(dst + i + 3 * F, vminq_f32(vmaxq_f32(_min, Load(src + i + 3 * F)), _max)); - } - for (; i < sizeF; i += F) - Store(dst + i, vminq_f32(vmaxq_f32(_min, Load(src + i)), _max)); - for (; i < size; ++i) - dst[i] = Simd::RestrictRange(src[i], min, max); - } - - void SynetRestrictRange32f(const float * src, size_t size, const float * lower, const float * upper, float * dst) - { - if (Aligned(src) && Aligned(dst)) - SynetRestrictRange32f(src, size, lower, upper, dst); - else - SynetRestrictRange32f(src, size, lower, upper, dst); - } - - //--------------------------------------------------------------------- - - template SIMD_INLINE void SynetSigmoid32f(const float* src, const Neon::Exp& exp, float* dst, size_t offset) - { - Store(dst + offset, 
exp.Sigmoid<1>(Load(src + offset))); - } - - template void SynetSigmoid32f(const float* src, size_t size, const float* slope, float* dst) - { - if (align) - assert(Aligned(src) && Aligned(dst)); - - Exp exp(-slope[0]); - size_t sizeF = AlignLo(size, F); - size_t sizeQF = AlignLo(size, QF); - size_t i = 0; - for (; i < sizeQF; i += QF) - { - SynetSigmoid32f(src, exp, dst, i + 0 * F); - SynetSigmoid32f(src, exp, dst, i + 1 * F); - SynetSigmoid32f(src, exp, dst, i + 2 * F); - SynetSigmoid32f(src, exp, dst, i + 3 * F); - } - for (; i < sizeF; i += F) - SynetSigmoid32f(src, exp, dst, i); - for (; i < size; ++i) - dst[i] = Base::SynetSigmoid32f(src[i], slope[0]); - } - - void SynetSigmoid32f(const float* src, size_t size, const float* slope, float* dst) - { - if (Aligned(src) && Aligned(dst)) - SynetSigmoid32f(src, size, slope, dst); - else - SynetSigmoid32f(src, size, slope, dst); - } - - //--------------------------------------------------------------------- - - template SIMD_INLINE void SynetSoftplus32f(const float* src, float32x4_t beta, float32x4_t threshold, float* dst, size_t offset) - { - Store(dst + offset, Softplus<1>(Load(src + offset), beta, threshold)); - } - - template void SynetSoftplus32f(const float* src, size_t size, const float* beta, const float* threshold, float* dst) - { - float32x4_t _beta = vdupq_n_f32(beta[0]); - float32x4_t _threshold = vdupq_n_f32(threshold[0]); - size_t sizeF = AlignLo(size, F); - size_t sizeQF = AlignLo(size, QF); - size_t i = 0; - for (; i < sizeQF; i += QF) - { - SynetSoftplus32f(src, _beta, _threshold, dst, i + 0 * F); - SynetSoftplus32f(src, _beta, _threshold, dst, i + 1 * F); - SynetSoftplus32f(src, _beta, _threshold, dst, i + 2 * F); - SynetSoftplus32f(src, _beta, _threshold, dst, i + 3 * F); - } - for (; i < sizeF; i += F) - SynetSoftplus32f(src, _beta, _threshold, dst, i); - for (; i < size; ++i) - dst[i] = Base::SynetSoftplus32f(src[i], beta[0], threshold[0]); - } - - void SynetSoftplus32f(const float* src, size_t size, const float* beta, const float* threshold, float* dst) - { - if (Aligned(src) && Aligned(dst)) - SynetSoftplus32f(src, size, beta, threshold, dst); - else - SynetSoftplus32f(src, size, beta, threshold, dst); - } - - //--------------------------------------------------------------------- - - template SIMD_INLINE void SynetTanh32f(const float* src, const Neon::Exp& exp, float* dst, size_t offset) - { - Store(dst + offset, exp.Tanh<1>(Load(src + offset))); - } - - template void SynetTanh32f(const float* src, size_t size, const float* slope, float* dst) - { - if (align) - assert(Aligned(src) && Aligned(dst)); - - Exp exp(-2.0f*slope[0]); - size_t sizeF = AlignLo(size, F); - size_t sizeQF = AlignLo(size, QF); - size_t i = 0; - for (; i < sizeQF; i += QF) - { - SynetTanh32f(src, exp, dst, i + 0 * F); - SynetTanh32f(src, exp, dst, i + 1 * F); - SynetTanh32f(src, exp, dst, i + 2 * F); - SynetTanh32f(src, exp, dst, i + 3 * F); - } - for (; i < sizeF; i += F) - SynetTanh32f(src, exp, dst, i); - for (; i < size; ++i) - dst[i] = Base::SynetTanh32f(src[i], slope[0]); - } - - void SynetTanh32f(const float* src, size_t size, const float* slope, float* dst) - { - if (Aligned(src) && Aligned(dst)) - SynetTanh32f(src, size, slope, dst); - else - SynetTanh32f(src, size, slope, dst); - } - } -#endif// SIMD_NEON_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdNeonSynetConversion.cpp b/src/3rd/Simd/Simd/SimdNeonSynetConversion.cpp deleted file mode 100644 index 6a24495a..00000000 --- a/src/3rd/Simd/Simd/SimdNeonSynetConversion.cpp +++ /dev/null @@ 
-1,989 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdTranspose.h" -#include "Simd/SimdConversion.h" -#include "Simd/SimdSet.h" -#include "Simd/SimdBase.h" -#include "Simd/SimdSynet.h" - -namespace Simd -{ -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - template SIMD_INLINE void SynetConvert32fTo8u(const float* src, float32x4_t scale, float32x4_t shift, uint8_t* dst) - { - int32x4_t i32 = Round(Fmadd(Load(src), scale, shift)); - *((int32_t*)dst) = vget_lane_s32(vreinterpret_s32_u8(vqmovun_s16(vcombine_s16(vmovn_s32(i32), vcreate_s16(0)))), 0); - } - - template void SynetConvert32fTo8uNchw(const float* src, size_t batch, size_t channels, size_t spatial, const float* scale, const float* shift, uint8_t* dst) - { - if (align) - assert(Aligned(src) && Aligned(dst) && Aligned(spatial, F)); - - size_t spatialF = AlignLo(spatial, F); - for (size_t b = 0; b < batch; ++b) - { - for (size_t c = 0; c < channels; ++c) - { - float32x4_t _scale = vdupq_n_f32(scale[c]); - float32x4_t _shift = vdupq_n_f32(shift[c]); - size_t s = 0; - for (; s < spatialF; s += F) - SynetConvert32fTo8u(src + s, _scale, _shift, dst + s); - for (; s < spatial; s += 1) - dst[s] = Base::SynetConvert32fTo8u(src[s], scale[c], shift[c]); - src += spatial; - dst += spatial; - } - } - } - - template void SynetConvert32fTo8uNhwc(const float* src, size_t batch, size_t channels, size_t spatial, const float* scale, const float* shift, uint8_t* dst) - { - if (align) - assert(Aligned(src) && Aligned(dst) && Aligned(channels, F) && Aligned(scale) && Aligned(shift)); - - size_t channelsF = AlignLo(channels, F); - - for (size_t b = 0; b < batch; ++b) - { - for (size_t s = 0; s < spatial; ++s) - { - size_t c = 0; - for (; c < channelsF; c += F) - SynetConvert32fTo8u(src + c, Load(scale + c), Load(shift + c), dst + c); - for (; c < channels; ++c) - dst[c] = Base::SynetConvert32fTo8u(src[c], scale[c], shift[c]); - src += channels; - dst += channels; - } - } - } - - template void SynetConvert32fTo8uNhwc3(const float* src, size_t batch, size_t spatial, const float* scale, const float* shift, uint8_t* dst) - { - if (align) - assert(Aligned(src) && Aligned(dst) && Aligned(spatial, A)); - - size_t spatial3 = spatial * 3; - size_t spatial3F = AlignLo(spatial, F) * 3; - - float _scale[F * 3], _shift[F * 3]; - for (size_t i = 0; i < F; ++i) - for (size_t c = 0; c < 3; ++c) - _scale[i 
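/* The float-to-uint8 helper above quantizes four lanes at once: y = sat_u8(round(x * scale + shift)), clamping out-of-range results to [0, 255] instead of letting them wrap. A sketch under stated assumptions (<arm_neon.h>, <stdint.h>; rounding shown only for the AArch64 case, whereas the library's Round helper also covers 32-bit NEON, and the deleted code relies on the s16 -> u8 step alone for saturation where this sketch saturates both narrows):

   static void QuantizeF32ToU8x4(const float* src, float scale, float shift,
                                 uint8_t* dst)
   {
       float32x4_t f = vmlaq_f32(vdupq_n_f32(shift), vld1q_f32(src),
                                 vdupq_n_f32(scale));                // x*scale + shift
       int32x4_t  i32 = vcvtnq_s32_f32(f);              // round to nearest (AArch64)
       int16x4_t  i16 = vqmovn_s32(i32);                // saturating narrow to s16
       uint8x8_t  u8  = vqmovun_s16(vcombine_s16(i16, vdup_n_s16(0))); // to u8
       vst1_lane_u32((uint32_t*)dst, vreinterpret_u32_u8(u8), 0);      // 4 bytes
   }
*/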
* 3 + c] = scale[c], _shift[i * 3 + c] = shift[c]; - - float32x4_t _scale0 = Load(_scale + 0 * F); - float32x4_t _scale1 = Load(_scale + 1 * F); - float32x4_t _scale2 = Load(_scale + 2 * F); - float32x4_t _shift0 = Load(_shift + 0 * F); - float32x4_t _shift1 = Load(_shift + 1 * F); - float32x4_t _shift2 = Load(_shift + 2 * F); - - for (size_t b = 0; b < batch; ++b) - { - size_t s = 0; - for (; s < spatial3F; s += 3 * F) - { - SynetConvert32fTo8u(src + 0 * F, _scale0, _shift0, dst + 0 * F); - SynetConvert32fTo8u(src + 1 * F, _scale1, _shift1, dst + 1 * F); - SynetConvert32fTo8u(src + 2 * F, _scale2, _shift2, dst + 2 * F); - src += 3 * F; - dst += 3 * F; - } - for (; s < spatial3; s += 3) - { - dst[0] = Base::SynetConvert32fTo8u(src[0], scale[0], shift[0]); - dst[1] = Base::SynetConvert32fTo8u(src[1], scale[1], shift[1]); - dst[2] = Base::SynetConvert32fTo8u(src[2], scale[2], shift[2]); - src += 3; - dst += 3; - } - } - } - - template void SynetConvert32fTo8u(const float* src, size_t batch, size_t channels, size_t height, size_t width, SimdTensorFormatType format, const float* scale, const float* shift, uint8_t* dst) - { - size_t spatial = height * width; - if (Base::NchwCompatible(channels, spatial, format)) - { - if (Aligned(src) && Aligned(dst) && Aligned(spatial, A)) - SynetConvert32fTo8uNchw(src, batch, channels, spatial, scale, shift, dst); - else - SynetConvert32fTo8uNchw(src, batch, channels, spatial, scale, shift, dst); - } - else if (Base::NhwcCompatible(channels, spatial, format)) - { - if (channels == 3) - { - if (Aligned(src) && Aligned(dst) && Aligned(spatial, A)) - SynetConvert32fTo8uNhwc3(src, batch, spatial, scale, shift, dst); - else - SynetConvert32fTo8uNhwc3(src, batch, spatial, scale, shift, dst); - } - else - { - if (Aligned(src) && Aligned(dst) && Aligned(channels, A) && Aligned(scale) && Aligned(shift)) - SynetConvert32fTo8uNhwc(src, batch, channels, spatial, scale, shift, dst); - else - SynetConvert32fTo8uNhwc(src, batch, channels, spatial, scale, shift, dst); - } - } - else - assert(0); - } - - void SynetConvert32fTo8u(const float* src, size_t batch, size_t channels, size_t height, size_t width, SimdTensorFormatType format, const float* scale, const float* shift, uint8_t* dst, SimdSynetCompatibilityType compatibility) - { - if (compatibility & SimdSynetCompatibilityNoFma) - SynetConvert32fTo8u(src, batch, channels, height, width, format, scale, shift, dst); - else - SynetConvert32fTo8u(src, batch, channels, height, width, format, scale, shift, dst); - } - - //--------------------------------------------------------------------- - - template SIMD_INLINE void StoreScaled(float * ptr, uint32x4_t value32, float32x4_t scale, float32x4_t shift) - { - Store(ptr, vmlaq_f32(shift, vcvtq_f32_u32(value32), scale)); - } - - template SIMD_INLINE void SynetSetInput1(const uint8_t * src, float32x4_t scale, float32x4_t shift, float * dst); - - SIMD_INLINE void SynetSetInput1Gray16(uint16x8_t gray, float32x4_t scale, float32x4_t shift, float * dst) - { - StoreScaled(dst + 0 * F, UnpackU16<0>(gray), scale, shift); - StoreScaled(dst + 1 * F, UnpackU16<1>(gray), scale, shift); - } - - SIMD_INLINE void SynetSetInput1Gray8(uint8x16_t gray, float32x4_t scale, float32x4_t shift, float * dst) - { - SynetSetInput1Gray16(UnpackU8<0>(gray), scale, shift, dst + 0 * F); - SynetSetInput1Gray16(UnpackU8<1>(gray), scale, shift, dst + 2 * F); - } - - template<> SIMD_INLINE void SynetSetInput1(const uint8_t * src, float32x4_t scale, float32x4_t shift, float * dst) - { - SynetSetInput1Gray8(Load(src), 
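/* StoreScaled above is the whole u8-to-float conversion in one idiom: widen bytes to 32-bit lanes, convert to float, then apply y = x * scale + shift with a single vmlaq_f32, so eight gray pixels become two float vectors. A sketch (hypothetical helper, assuming <arm_neon.h> and <stdint.h>):

   static void GrayToF32x8(const uint8_t* src, float scale, float shift, float* dst)
   {
       uint16x8_t u16 = vmovl_u8(vld1_u8(src));          // u8  -> u16
       uint32x4_t lo  = vmovl_u16(vget_low_u16(u16));    // u16 -> u32, low half
       uint32x4_t hi  = vmovl_u16(vget_high_u16(u16));
       float32x4_t s = vdupq_n_f32(scale), b = vdupq_n_f32(shift);
       vst1q_f32(dst + 0, vmlaq_f32(b, vcvtq_f32_u32(lo), s));
       vst1q_f32(dst + 4, vmlaq_f32(b, vcvtq_f32_u32(hi), s));
   }
*/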
scale, shift, dst); - } - - SIMD_INLINE void SynetSetInput1Bgr16(uint16x8_t blue, uint16x8_t green, uint16x8_t red, float32x4_t scale, float32x4_t shift, float * dst) - { - StoreScaled(dst + 0 * F, BgrToGray<0>(blue, green, red), scale, shift); - StoreScaled(dst + 1 * F, BgrToGray<1>(blue, green, red), scale, shift); - } - - template<> SIMD_INLINE void SynetSetInput1(const uint8_t * src, float32x4_t scale, float32x4_t shift, float * dst) - { - uint8x16x3_t bgr = Load3(src); - SynetSetInput1Bgr16(UnpackU8<0>(bgr.val[0]), UnpackU8<0>(bgr.val[1]), UnpackU8<0>(bgr.val[2]), scale, shift, dst + 0 * F); - SynetSetInput1Bgr16(UnpackU8<1>(bgr.val[0]), UnpackU8<1>(bgr.val[1]), UnpackU8<1>(bgr.val[2]), scale, shift, dst + 2 * F); - } - - template<> SIMD_INLINE void SynetSetInput1(const uint8_t * src, float32x4_t scale, float32x4_t shift, float * dst) - { - uint8x16x4_t bgra = Load4(src); - SynetSetInput1Bgr16(UnpackU8<0>(bgra.val[0]), UnpackU8<0>(bgra.val[1]), UnpackU8<0>(bgra.val[2]), scale, shift, dst + 0 * F); - SynetSetInput1Bgr16(UnpackU8<1>(bgra.val[0]), UnpackU8<1>(bgra.val[1]), UnpackU8<1>(bgra.val[2]), scale, shift, dst + 2 * F); - } - - template<> SIMD_INLINE void SynetSetInput1(const uint8_t * src, float32x4_t scale, float32x4_t shift, float * dst) - { - uint8x16x3_t rgb = Load3(src); - SynetSetInput1Bgr16(UnpackU8<0>(rgb.val[2]), UnpackU8<0>(rgb.val[1]), UnpackU8<0>(rgb.val[0]), scale, shift, dst + 0 * F); - SynetSetInput1Bgr16(UnpackU8<1>(rgb.val[2]), UnpackU8<1>(rgb.val[1]), UnpackU8<1>(rgb.val[0]), scale, shift, dst + 2 * F); - } - - template void SynetSetInput1(const uint8_t * src, size_t width, size_t height, size_t stride, const float * scale, const float * shift, float * dst) - { - float32x4_t _scale = vdupq_n_f32(scale[0]); - float32x4_t _shift = vdupq_n_f32(shift[0]); - size_t aligned = AlignLo(width, A); - for (size_t y = 0; y < height; ++y) - { - for (size_t x = 0; x < aligned; x += A) - SynetSetInput1(src + step * x, _scale, _shift, dst + x); - if (aligned < width) - SynetSetInput1(src + step * (width - A), _scale, _shift, dst + width - A); - src += stride; - dst += width; - } - } - - template SIMD_INLINE void SynetSetInputNchw3(const uint8_t * src, const float32x4_t * scale, const float32x4_t * shift, float * dst, size_t channel); - - template<> SIMD_INLINE void SynetSetInputNchw3(const uint8_t * src, const float32x4_t * scale, const float32x4_t * shift, float * dst, size_t channel) - { - uint8x16_t _src = Load(src); - uint16x8_t src0 = UnpackU8<0>(_src); - uint16x8_t src1 = UnpackU8<1>(_src); - uint32x4_t gray0 = UnpackU16<0>(src0); - uint32x4_t gray1 = UnpackU16<1>(src0); - uint32x4_t gray2 = UnpackU16<0>(src1); - uint32x4_t gray3 = UnpackU16<1>(src1); - StoreScaled(dst + 0 * F, gray0, scale[0], shift[0]); - StoreScaled(dst + 1 * F, gray1, scale[0], shift[0]); - StoreScaled(dst + 2 * F, gray2, scale[0], shift[0]); - StoreScaled(dst + 3 * F, gray3, scale[0], shift[0]); - dst += channel; - StoreScaled(dst + 0 * F, gray0, scale[1], shift[1]); - StoreScaled(dst + 1 * F, gray1, scale[1], shift[1]); - StoreScaled(dst + 2 * F, gray2, scale[1], shift[1]); - StoreScaled(dst + 3 * F, gray3, scale[1], shift[1]); - dst += channel; - StoreScaled(dst + 0 * F, gray0, scale[2], shift[2]); - StoreScaled(dst + 1 * F, gray1, scale[2], shift[2]); - StoreScaled(dst + 2 * F, gray2, scale[2], shift[2]); - StoreScaled(dst + 3 * F, gray3, scale[2], shift[2]); - } - - template<> SIMD_INLINE void SynetSetInputNchw3(const uint8_t * src, const float32x4_t * scale, const float32x4_t * shift, float * 
dst, size_t channel) - { - uint8x16x3_t bgr = Load3(src); - SynetSetInput1Gray8(bgr.val[0], scale[0], shift[0], dst + 0 * channel); - SynetSetInput1Gray8(bgr.val[1], scale[1], shift[1], dst + 1 * channel); - SynetSetInput1Gray8(bgr.val[2], scale[2], shift[2], dst + 2 * channel); - } - - template<> SIMD_INLINE void SynetSetInputNchw3(const uint8_t * src, const float32x4_t * scale, const float32x4_t * shift, float * dst, size_t channel) - { - uint8x16x4_t bgra = Load4(src); - SynetSetInput1Gray8(bgra.val[0], scale[0], shift[0], dst + 0 * channel); - SynetSetInput1Gray8(bgra.val[1], scale[1], shift[1], dst + 1 * channel); - SynetSetInput1Gray8(bgra.val[2], scale[2], shift[2], dst + 2 * channel); - } - - template<> SIMD_INLINE void SynetSetInputNchw3(const uint8_t * src, const float32x4_t * scale, const float32x4_t * shift, float * dst, size_t channel) - { - uint8x16x3_t rgb = Load3(src); - SynetSetInput1Gray8(rgb.val[2], scale[0], shift[0], dst + 0 * channel); - SynetSetInput1Gray8(rgb.val[1], scale[1], shift[1], dst + 1 * channel); - SynetSetInput1Gray8(rgb.val[0], scale[2], shift[2], dst + 2 * channel); - } - - template void SynetSetInputNchw3(const uint8_t * src, size_t width, size_t height, size_t stride, const float * scale, const float * shift, float * dst) - { - size_t aligned = AlignLo(width, A), channel = width * height; - float32x4_t _scale[3], _shift[3]; - for (size_t i = 0; i < 3; ++i) - { - _scale[i] = vdupq_n_f32(scale[i]); - _shift[i] = vdupq_n_f32(shift[i]); - } - for (size_t y = 0; y < height; ++y) - { - for (size_t x = 0; x < aligned; x += A) - SynetSetInputNchw3(src + step * x, _scale, _shift, dst + x, channel); - if (aligned < width) - SynetSetInputNchw3(src + step * (width - A), _scale, _shift, dst + width - A, channel); - src += stride; - dst += width; - } - } - - template SIMD_INLINE void SynetSetInputNhwc3(const uint8_t * src, const float32x4_t * scale, const float32x4_t * shift, float * dst); - - SIMD_INLINE uint8x8_t Shuffle(const uint8x16_t & src, const uint8x8_t & idx) - { - return vtbl2_u8((const uint8x8x2_t &)src, idx); - } - - const uint8x8_t K8_TBL_GRAY_TO_BGR_0 = SIMD_VEC_SETR_EPI16(0x0, 0x0, 0x0, 0x1, 0x1, 0x1, 0x2, 0x2); - const uint8x8_t K8_TBL_GRAY_TO_BGR_1 = SIMD_VEC_SETR_EPI16(0x2, 0x3, 0x3, 0x3, 0x4, 0x4, 0x4, 0x5); - const uint8x8_t K8_TBL_GRAY_TO_BGR_2 = SIMD_VEC_SETR_EPI16(0x5, 0x5, 0x6, 0x6, 0x6, 0x7, 0x7, 0x7); - - template<> SIMD_INLINE void SynetSetInputNhwc3(const uint8_t * src, const float32x4_t * scale, const float32x4_t * shift, float * dst) - { - uint8x8_t gray0 = LoadHalf(src + 0); - uint16x8_t bgr0 = vmovl_u8(vtbl1_u8(gray0, K8_TBL_GRAY_TO_BGR_0)); - StoreScaled(dst + 0x0 * F, UnpackU16<0>(bgr0), scale[0], shift[0]); - StoreScaled(dst + 0x1 * F, UnpackU16<1>(bgr0), scale[1], shift[1]); - uint16x8_t bgr1 = vmovl_u8(vtbl1_u8(gray0, K8_TBL_GRAY_TO_BGR_1)); - StoreScaled(dst + 0x2 * F, UnpackU16<0>(bgr1), scale[2], shift[2]); - StoreScaled(dst + 0x3 * F, UnpackU16<1>(bgr1), scale[0], shift[0]); - uint16x8_t bgr2 = vmovl_u8(vtbl1_u8(gray0, K8_TBL_GRAY_TO_BGR_2)); - StoreScaled(dst + 0x4 * F, UnpackU16<0>(bgr2), scale[1], shift[1]); - StoreScaled(dst + 0x5 * F, UnpackU16<1>(bgr2), scale[2], shift[2]); - uint8x8_t gray1 = LoadHalf(src + 8); - uint16x8_t bgr3 = vmovl_u8(vtbl1_u8(gray1, K8_TBL_GRAY_TO_BGR_0)); - StoreScaled(dst + 0x6 * F, UnpackU16<0>(bgr3), scale[0], shift[0]); - StoreScaled(dst + 0x7 * F, UnpackU16<1>(bgr3), scale[1], shift[1]); - uint16x8_t bgr4 = vmovl_u8(vtbl1_u8(gray1, K8_TBL_GRAY_TO_BGR_1)); - StoreScaled(dst + 0x8 * F, 
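/* The K8_TBL_GRAY_TO_BGR_* constants above drive vtbl1_u8, NEON's arbitrary byte permutation: each output byte is table[index]. Replicating every gray byte three times synthesizes an interleaved BGR stream with no shifts or masks. A sketch of the first table's effect (assuming <arm_neon.h> and <stdint.h>):

   // g0 g1 g2 ... -> g0 g0 g0 g1 g1 g1 g2 g2 (indices 0,0,0,1,1,1,2,2).
   static uint8x8_t GrayToBgrHead(uint8x8_t gray)
   {
       const uint8_t idx[8] = { 0, 0, 0, 1, 1, 1, 2, 2 };
       return vtbl1_u8(gray, vld1_u8(idx));
   }
*/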
UnpackU16<0>(bgr4), scale[2], shift[2]); - StoreScaled(dst + 0x9 * F, UnpackU16<1>(bgr4), scale[0], shift[0]); - uint16x8_t bgr5 = vmovl_u8(vtbl1_u8(gray1, K8_TBL_GRAY_TO_BGR_2)); - StoreScaled(dst + 0xA * F, UnpackU16<0>(bgr5), scale[1], shift[1]); - StoreScaled(dst + 0xB * F, UnpackU16<1>(bgr5), scale[2], shift[2]); - } - - template<> SIMD_INLINE void SynetSetInputNhwc3(const uint8_t * src, const float32x4_t * scale, const float32x4_t * shift, float * dst) - { - uint8x16_t bgr0 = Load(src + 0 * A); - uint16x8_t bgr00 = UnpackU8<0>(bgr0); - StoreScaled(dst + 0x0 * F, UnpackU16<0>(bgr00), scale[0], shift[0]); - StoreScaled(dst + 0x1 * F, UnpackU16<1>(bgr00), scale[1], shift[1]); - uint16x8_t bgr01 = UnpackU8<1>(bgr0); - StoreScaled(dst + 0x2 * F, UnpackU16<0>(bgr01), scale[2], shift[2]); - StoreScaled(dst + 0x3 * F, UnpackU16<1>(bgr01), scale[0], shift[0]); - uint8x16_t bgr1 = Load(src + 1 * A); - uint16x8_t bgr10 = UnpackU8<0>(bgr1); - StoreScaled(dst + 0x4 * F, UnpackU16<0>(bgr10), scale[1], shift[1]); - StoreScaled(dst + 0x5 * F, UnpackU16<1>(bgr10), scale[2], shift[2]); - uint16x8_t bgr11 = UnpackU8<1>(bgr1); - StoreScaled(dst + 0x6 * F, UnpackU16<0>(bgr11), scale[0], shift[0]); - StoreScaled(dst + 0x7 * F, UnpackU16<1>(bgr11), scale[1], shift[1]); - uint8x16_t bgr2 = Load(src + 2 * A); - uint16x8_t bgr20 = UnpackU8<0>(bgr2); - StoreScaled(dst + 0x8 * F, UnpackU16<0>(bgr20), scale[2], shift[2]); - StoreScaled(dst + 0x9 * F, UnpackU16<1>(bgr20), scale[0], shift[0]); - uint16x8_t bgr21 = UnpackU8<1>(bgr2); - StoreScaled(dst + 0xA * F, UnpackU16<0>(bgr21), scale[1], shift[1]); - StoreScaled(dst + 0xB * F, UnpackU16<1>(bgr21), scale[2], shift[2]); - } - - const uint8x8_t K8_TBL_BGRA_TO_BGR_0 = SIMD_VEC_SETR_EPI16(0x0, 0x1, 0x2, 0x4, 0x5, 0x6, 0x8, 0x9); - const uint8x8_t K8_TBL_BGRA_TO_BGR_1 = SIMD_VEC_SETR_EPI16(0x0, 0x2, 0x3, 0x4, 0x6, 0x7, 0x8, 0xA); - const uint8x8_t K8_TBL_BGRA_TO_BGR_2 = SIMD_VEC_SETR_EPI16(0x5, 0x6, 0x8, 0x9, 0xA, 0xC, 0xD, 0xE); - - template<> SIMD_INLINE void SynetSetInputNhwc3(const uint8_t * src, const float32x4_t * scale, const float32x4_t * shift, float * dst) - { - uint16x8_t bgr0 = vmovl_u8(Shuffle(Load(src + 0), K8_TBL_BGRA_TO_BGR_0)); - StoreScaled(dst + 0x0 * F, UnpackU16<0>(bgr0), scale[0], shift[0]); - StoreScaled(dst + 0x1 * F, UnpackU16<1>(bgr0), scale[1], shift[1]); - uint16x8_t bgr1 = vmovl_u8(Shuffle(Load(src + 10), K8_TBL_BGRA_TO_BGR_1)); - StoreScaled(dst + 0x2 * F, UnpackU16<0>(bgr1), scale[2], shift[2]); - StoreScaled(dst + 0x3 * F, UnpackU16<1>(bgr1), scale[0], shift[0]); - uint16x8_t bgr2 = vmovl_u8(Shuffle(Load(src + 16), K8_TBL_BGRA_TO_BGR_2)); - StoreScaled(dst + 0x4 * F, UnpackU16<0>(bgr2), scale[1], shift[1]); - StoreScaled(dst + 0x5 * F, UnpackU16<1>(bgr2), scale[2], shift[2]); - uint16x8_t bgr3 = vmovl_u8(Shuffle(Load(src + 32), K8_TBL_BGRA_TO_BGR_0)); - StoreScaled(dst + 0x6 * F, UnpackU16<0>(bgr3), scale[0], shift[0]); - StoreScaled(dst + 0x7 * F, UnpackU16<1>(bgr3), scale[1], shift[1]); - uint16x8_t bgr4 = vmovl_u8(Shuffle(Load(src + 42), K8_TBL_BGRA_TO_BGR_1)); - StoreScaled(dst + 0x8 * F, UnpackU16<0>(bgr4), scale[2], shift[2]); - StoreScaled(dst + 0x9 * F, UnpackU16<1>(bgr4), scale[0], shift[0]); - uint16x8_t bgr5 = vmovl_u8(Shuffle(Load(src + 48), K8_TBL_BGRA_TO_BGR_2)); - StoreScaled(dst + 0xA * F, UnpackU16<0>(bgr5), scale[1], shift[1]); - StoreScaled(dst + 0xB * F, UnpackU16<1>(bgr5), scale[2], shift[2]); - } - - const uint8x8_t K8_TBL_RGB_TO_BGR_0 = SIMD_VEC_SETR_EPI16(0x2, 0x1, 0x0, 0x5, 0x4, 0x3, 0x8, 0x7); - const 
uint8x8_t K8_TBL_RGB_TO_BGR_1 = SIMD_VEC_SETR_EPI16(0x0, 0x5, 0x4, 0x3, 0x8, 0x7, 0x6, 0xB); - const uint8x8_t K8_TBL_RGB_TO_BGR_2 = SIMD_VEC_SETR_EPI16(0x8, 0x7, 0xC, 0xB, 0xA, 0xF, 0xE, 0xD); - - template<> SIMD_INLINE void SynetSetInputNhwc3(const uint8_t * src, const float32x4_t * scale, const float32x4_t * shift, float * dst) - { - uint16x8_t bgr0 = vmovl_u8(Shuffle(Load(src + 0), K8_TBL_RGB_TO_BGR_0)); - StoreScaled(dst + 0x0 * F, UnpackU16<0>(bgr0), scale[0], shift[0]); - StoreScaled(dst + 0x1 * F, UnpackU16<1>(bgr0), scale[1], shift[1]); - uint16x8_t bgr1 = vmovl_u8(Shuffle(Load(src + 6), K8_TBL_RGB_TO_BGR_1)); - StoreScaled(dst + 0x2 * F, UnpackU16<0>(bgr1), scale[2], shift[2]); - StoreScaled(dst + 0x3 * F, UnpackU16<1>(bgr1), scale[0], shift[0]); - uint16x8_t bgr2 = vmovl_u8(Shuffle(Load(src + 8), K8_TBL_RGB_TO_BGR_2)); - StoreScaled(dst + 0x4 * F, UnpackU16<0>(bgr2), scale[1], shift[1]); - StoreScaled(dst + 0x5 * F, UnpackU16<1>(bgr2), scale[2], shift[2]); - uint16x8_t bgr3 = vmovl_u8(Shuffle(Load(src + 24), K8_TBL_RGB_TO_BGR_0)); - StoreScaled(dst + 0x6 * F, UnpackU16<0>(bgr3), scale[0], shift[0]); - StoreScaled(dst + 0x7 * F, UnpackU16<1>(bgr3), scale[1], shift[1]); - uint16x8_t bgr4 = vmovl_u8(Shuffle(Load(src + 30), K8_TBL_RGB_TO_BGR_1)); - StoreScaled(dst + 0x8 * F, UnpackU16<0>(bgr4), scale[2], shift[2]); - StoreScaled(dst + 0x9 * F, UnpackU16<1>(bgr4), scale[0], shift[0]); - uint16x8_t bgr5 = vmovl_u8(Shuffle(Load(src + 32), K8_TBL_RGB_TO_BGR_2)); - StoreScaled(dst + 0xA * F, UnpackU16<0>(bgr5), scale[1], shift[1]); - StoreScaled(dst + 0xB * F, UnpackU16<1>(bgr5), scale[2], shift[2]); - } - - template void SynetSetInputNhwc3(const uint8_t * src, size_t width, size_t height, size_t stride, const float * scale, const float * shift, float * dst) - { - size_t aligned = AlignLo(width, A); - float32x4_t _scale[3], _shift[3]; - _scale[0] = SetF32(scale[0], scale[1], scale[2], scale[0]); - _scale[1] = SetF32(scale[1], scale[2], scale[0], scale[1]); - _scale[2] = SetF32(scale[2], scale[0], scale[1], scale[2]); - _shift[0] = SetF32(shift[0], shift[1], shift[2], shift[0]); - _shift[1] = SetF32(shift[1], shift[2], shift[0], shift[1]); - _shift[2] = SetF32(shift[2], shift[0], shift[1], shift[2]); - for (size_t y = 0; y < height; ++y) - { - for (size_t x = 0; x < aligned; x += A) - SynetSetInputNhwc3(src + step * x, _scale, _shift, dst + 3 * x); - if (aligned < width) - SynetSetInputNhwc3(src + step * (width - A), _scale, _shift, dst + 3 * (width - A)); - src += stride; - dst += 3 * width; - } - } - - void SynetSetInput(const uint8_t * src, size_t width, size_t height, size_t stride, SimdPixelFormatType srcFormat, - const float * lower, const float * upper, float * dst, size_t channels, SimdTensorFormatType dstFormat) - { - assert(width >= A); - - float scale[3]; - for (size_t i = 0; i < channels; ++i) - scale[i] = (upper[i] - lower[i]) / 255.0f; - switch (channels) - { - case 1: - switch (srcFormat) - { - case SimdPixelFormatGray8: SynetSetInput1(src, width, height, stride, scale, lower, dst); return; - case SimdPixelFormatBgr24: SynetSetInput1(src, width, height, stride, scale, lower, dst); return; - case SimdPixelFormatBgra32: SynetSetInput1(src, width, height, stride, scale, lower, dst); return; - case SimdPixelFormatRgb24: SynetSetInput1(src, width, height, stride, scale, lower, dst); return; - default: assert(0); - } - break; - case 3: - switch (dstFormat) - { - case SimdTensorFormatNchw: - switch (srcFormat) - { - case SimdPixelFormatGray8: SynetSetInputNchw3(src, width, height, 
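/* The SynetSetInput dispatcher below derives its per-channel scale as (upper - lower) / 255 and passes lower straight through as the shift, so the whole normalization is the affine map sketched here: byte 0 lands on lower and byte 255 on upper.

   static inline float NormalizeInput(uint8_t x, float lower, float upper)
   {
       return x * ((upper - lower) / 255.0f) + lower;
   }
*/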
stride, scale, lower, dst); return; - case SimdPixelFormatBgr24: SynetSetInputNchw3(src, width, height, stride, scale, lower, dst); return; - case SimdPixelFormatBgra32: SynetSetInputNchw3(src, width, height, stride, scale, lower, dst); return; - case SimdPixelFormatRgb24: SynetSetInputNchw3(src, width, height, stride, scale, lower, dst); return; - default: assert(0); - } - break; - case SimdTensorFormatNhwc: - switch (srcFormat) - { - case SimdPixelFormatGray8: SynetSetInputNhwc3(src, width, height, stride, scale, lower, dst); return; - case SimdPixelFormatBgr24: SynetSetInputNhwc3(src, width, height, stride, scale, lower, dst); return; - case SimdPixelFormatBgra32: SynetSetInputNhwc3(src, width, height, stride, scale, lower, dst); return; - case SimdPixelFormatRgb24: SynetSetInputNhwc3(src, width, height, stride, scale, lower, dst); return; - default: return Base::SynetSetInput(src, width, height, stride, srcFormat, lower, upper, dst, channels, dstFormat); assert(0); - } - break; - default: assert(0); - } - default: assert(0); - } - } - - //--------------------------------------------------------------------- - - template void SynetReorderImage_Chw_Hwc(size_t channels, size_t spatial, const float* src, float* dst) - { - size_t channels4 = AlignLo(channels, 4); - size_t spatial4 = AlignLo(spatial, 4); - size_t s = 0; - for (; s < spatial4; s += 4, src += 4, dst += 4 * channels) - { - size_t c = 0; - const float* ps = src; - float* pd = dst; - for (; c < channels4; c += 4, ps += 4 * spatial, pd += 4) - Transpose4x4(ps, spatial, pd, channels); - for (; c < channels; ++c, ps += spatial, pd += 1) - { - pd[0 * channels] = ps[0]; - pd[1 * channels] = ps[1]; - pd[2 * channels] = ps[2]; - pd[3 * channels] = ps[3]; - } - } - for (; s < spatial; ++s, src += 1, dst += channels) - for (size_t c = 0; c < channels; ++c) - dst[c] = src[c * spatial]; - } - - template void SynetReorderImage_Chw_Chw4c(size_t channels, size_t spatial, const float* src, float* dst) - { - size_t channels4 = AlignLo(channels, 4); - size_t spatial4 = AlignLo(spatial, 4); - size_t tail = channels - channels4; - size_t c = 0; - for (; c < channels4; c += 4, src += 4 * spatial) - { - size_t s = 0; - const float* ps = src; - for (; s < spatial4; s += 4, dst += 4 * F, ps += 4) - Transpose4x4(ps, spatial, dst, 4); - for (; s < spatial; ++s, dst += F, ps += 1) - { - dst[0] = ps[0 * spatial]; - dst[1] = ps[1 * spatial]; - dst[2] = ps[2 * spatial]; - dst[3] = ps[3 * spatial]; - } - } - if (tail) - { - const float* ps = src; - for (size_t s = 0; s < spatial; ++s, dst += F, ps += 1) - { - size_t i = 0; - for (; i < tail; ++i) - dst[i] = ps[i * spatial]; - for (; i < F; ++i) - dst[i] = 0; - } - } - } - - template void SynetReorderImage_Hwc_Chw(size_t channels, size_t spatial, const float* src, float* dst) - { - SynetReorderImage_Chw_Hwc(spatial, channels, src, dst); - } - - template void SynetReorderImage_Hwc_Chw4c(size_t channels, size_t spatial, const float* src, float* dst) - { - size_t channelsF = AlignLo(channels, F); - size_t channelsF4 = AlignLo(channels, 4 * F); - size_t tail = channels - channelsF; - size_t spatial4 = AlignLo(spatial, 4); - size_t stride = spatial * F; - size_t c = 0; - for (; c < channelsF4; c += 4 * F, src += 4 * F) - { - const float* ps = src; - float* pd = dst; - size_t i = 0; - for (; i < spatial4; i += 4, pd += 4 * F, ps += 4 * channels) - Transpose4x4xF(ps, channels, pd, stride); - for (; i < spatial; ++i, pd += F, ps += channels) - { - Copy(ps + 0 * F, pd + 0 * stride); - Copy(ps + 1 * F, pd + 1 * stride); 
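/* The reorder kernels above rotate the layout 4x4 floats at a time with Transpose4x4. A standard in-register NEON formulation is two vtrnq_f32 lane transpositions plus recombination of the 64-bit halves; the library's Transpose4x4 variants read and write memory with strides, so the exact instruction mix there may differ. A sketch (assuming <arm_neon.h>):

   static void TransposeInRegs4x4(float32x4_t r[4])
   {
       float32x4x2_t t01 = vtrnq_f32(r[0], r[1]); // interleave rows 0 and 1
       float32x4x2_t t23 = vtrnq_f32(r[2], r[3]); // interleave rows 2 and 3
       r[0] = vcombine_f32(vget_low_f32(t01.val[0]),  vget_low_f32(t23.val[0]));
       r[1] = vcombine_f32(vget_low_f32(t01.val[1]),  vget_low_f32(t23.val[1]));
       r[2] = vcombine_f32(vget_high_f32(t01.val[0]), vget_high_f32(t23.val[0]));
       r[3] = vcombine_f32(vget_high_f32(t01.val[1]), vget_high_f32(t23.val[1]));
   }
*/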
- Copy(ps + 2 * F, pd + 2 * stride); - Copy(ps + 3 * F, pd + 3 * stride); - } - dst += 4 * stride; - } - for (; c < channelsF; c += F, src += F) - { - const float* ps = src; - for (size_t s = 0; s < spatial; ++s, ps += channels, dst += F) - Copy(ps, dst); - } - if (tail) - { - const float* psrc = src; - for (size_t s = 0; s < spatial; ++s, psrc += channels, dst += F) - { - size_t i = 0; - for (; i < tail; ++i) - dst[i] = psrc[i]; - for (; i < F; ++i) - dst[i] = 0; - } - } - } - - template void SynetReorderImage_Chw4c_Chw(size_t channels, size_t spatial, const float* src, float* dst) - { - size_t channels4 = AlignLo(channels, 4); - size_t spatial4 = AlignLo(spatial, 4); - size_t tail = channels - channels4; - size_t c = 0; - for (; c < channels4; c += 4, dst += 4 * spatial, src += 4 * spatial) - { - const float* ps = src; - size_t s = 0; - for (; s < spatial4; s += 4, ps += 4 * F) - Transpose4x4(ps, 4, dst + s, spatial); - for (; s < spatial; ++s, ps += 4) - { - dst[s + 0 * spatial] = ps[0]; - dst[s + 1 * spatial] = ps[1]; - dst[s + 2 * spatial] = ps[2]; - dst[s + 3 * spatial] = ps[3]; - } - } - if (tail) - { - const float* ps = src; - for (size_t i = 0; i < tail; ++i, ps += 1, dst += spatial) - { - for (size_t s = 0; s < spatial; ++s) - dst[s] = ps[s * F]; - } - } - } - - template void SynetReorderImage_Chw4c_Hwc(size_t channels, size_t spatial, const float* src, float* dst) - { - size_t stride = F * spatial; - size_t channelsF = AlignLo(channels, F); - size_t channelsF4 = AlignLo(channels, 4 * F); - size_t tail = channels - channelsF; - size_t spatial4 = AlignLo(spatial, 4); - size_t s = 0; - for (; s < spatial4; s += 4, src += 4 * F, dst += 4 * channels) - { - const float* ps = src; - float* pd = dst; - size_t c = 0; - for (; c < channelsF4; c += 4 * F, ps += 4 * stride, pd += 4 * F) - Transpose4x4xF(ps, stride, pd, channels); - for (; c < channelsF; c += F, ps += stride, pd += F) - { - Copy(ps + 0 * F, pd + 0 * channels); - Copy(ps + 1 * F, pd + 1 * channels); - Copy(ps + 2 * F, pd + 2 * channels); - Copy(ps + 3 * F, pd + 3 * channels); - } - if (tail) - { - for (size_t i = 0; i < tail; ++i) - { - pd[i + 0 * channels] = ps[i + 0 * F]; - pd[i + 1 * channels] = ps[i + 1 * F]; - pd[i + 2 * channels] = ps[i + 2 * F]; - pd[i + 3 * channels] = ps[i + 3 * F]; - } - } - } - for (; s < spatial; ++s, src += F) - { - const float* ps = src; - for (size_t c = 0; c < channelsF; c += F, ps += stride, dst += F) - Copy(ps, dst); - if (tail) - { - for (size_t i = 0; i < tail; ++i) - *(dst++) = ps[i]; - } - } - } - - typedef void(*SynetImageReorderPtr)(size_t channels, size_t spatial, const float* src, float* dst); - SynetImageReorderPtr GetImageReorder(SimdTensorFormatType src, SimdTensorFormatType dst) - { - if (src == SimdTensorFormatNchw) - { - if (dst == SimdTensorFormatNhwc) - return SynetReorderImage_Chw_Hwc; - if (dst == SimdTensorFormatNchw4c) - return SynetReorderImage_Chw_Chw4c; - } - if (src == SimdTensorFormatNhwc) - { - if (dst == SimdTensorFormatNchw) - return SynetReorderImage_Hwc_Chw; - if (dst == SimdTensorFormatNchw4c) - return SynetReorderImage_Hwc_Chw4c; - } - if (src == SimdTensorFormatNchw4c) - { - if (dst == SimdTensorFormatNchw) - return SynetReorderImage_Chw4c_Chw; - if (dst == SimdTensorFormatNhwc) - return SynetReorderImage_Chw4c_Hwc; - } - return NULL; - } - - void SynetReorderImage(size_t batch, size_t channels, size_t spatial, const float* src, SimdTensorFormatType srcFormat, float* dst, SimdTensorFormatType dstFormat) - { - SynetImageReorderPtr imageReorder = 
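/* Scalar reference for the NCHW -> NCHW4c packing performed by
   SynetReorderImage_Chw_Chw4c above: channel c of pixel s goes to block c / 4,
   lane c % 4, and lanes past the last channel are zero-filled. A sketch under
   the assumption F == 4 (the NEON vector width in floats); the name is ours. */
static void PackChwToChw4cRef(size_t channels, size_t spatial, const float* src, float* dst)
{
    const size_t F = 4;
    size_t blocks = (channels + F - 1) / F; // number of 4-channel blocks, last one padded
    for (size_t b = 0; b < blocks; ++b)
        for (size_t s = 0; s < spatial; ++s)
            for (size_t i = 0; i < F; ++i)
            {
                size_t c = b * F + i;
                dst[(b * spatial + s) * F + i] = c < channels ? src[c * spatial + s] : 0.0f;
            }
}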
GetImageReorder(srcFormat, dstFormat); - if (imageReorder) - { - size_t srcStride = AlignHi(channels, Base::SynetTensorAlignment(srcFormat)) * spatial; - size_t dstStride = AlignHi(channels, Base::SynetTensorAlignment(dstFormat)) * spatial; - for (size_t n = 0; n < batch; ++n) - { - imageReorder(channels, spatial, src, dst); - src += srcStride; - dst += dstStride; - } - } - else - return Base::SynetReorderImage(batch, channels, spatial, src, srcFormat, dst, dstFormat); - } - - template void SynetReorderFilter_Oiyx_Yxio(size_t output, size_t input, size_t kernel, const float* src, float* dst) - { - if (kernel == 1) - { - SynetReorderImage_Chw_Hwc(output, input, src, dst); - return; - } - size_t output4 = AlignLo(output, 4); - size_t kernel4 = AlignLo(kernel, 4); - size_t ik = input * kernel, oi = output * input; - for (size_t i = 0; i < input; ++i, src += kernel, dst += output) - { - const float* ps = src; - float* pd = dst; - size_t k = 0; - for (; k < kernel4; k += 4, ps += 4, pd += 4 * oi) - { - size_t o = 0; - for (; o < output4; o += 4) - Transpose4x4(ps + o * ik, ik, pd + o, oi); - for (; o < output; ++o) - { - pd[0 * oi + o] = ps[o * ik + 0]; - pd[1 * oi + o] = ps[o * ik + 1]; - pd[2 * oi + o] = ps[o * ik + 2]; - pd[3 * oi + o] = ps[o * ik + 3]; - } - } - for (; k < kernel; ++k, ps += 1, pd += oi) - for (size_t o = 0; o < output; ++o) - pd[o] = ps[o * ik]; - } - } - - template void SynetReorderFilter_Oiyx_Oyxi4o(size_t output, size_t input, size_t kernel, const float* src, float* dst) - { - if (kernel == 1) - { - SynetReorderImage_Chw_Chw4c(output, input, src, dst); - return; - } - size_t outputF = AlignLo(output, F); - size_t kernelF = AlignLo(kernel, F); - size_t tail = output - outputF; - size_t ik = input * kernel; - size_t stride = input * F; - for (size_t o = 0; o < outputF; o += F) - { - for (size_t i = 0; i < input; ++i) - { - const float* ps = src + o * ik + i * kernel; - float* pd = dst + o * ik + i * F; - size_t k = 0; - for (; k < kernelF; k += F, ps += F, pd += F * stride) - Transpose4x4(ps, ik, pd, stride); - for (; k < kernel; ++k, ps += 1, pd += stride) - for (size_t j = 0; j < F; ++j) - pd[j] = ps[j * ik]; - } - } - if (tail) - { - for (size_t i = 0; i < input; ++i) - { - const float* ps = src + outputF * ik + i * kernel; - float* pd = dst + outputF * ik + i * F; - for (size_t k = 0; k < kernel; ++k, ps += 1, pd += stride) - { - size_t j = 0; - for (; j < tail; ++j) - pd[j] = ps[j * ik]; - for (; j < F; ++j) - pd[j] = 0; - } - } - } - } - - template void SynetReorderFilter_Yxio_Oiyx(size_t output, size_t input, size_t kernel, const float* src, float* dst) - { - if (kernel == 1) - { - SynetReorderImage_Chw_Hwc(input, output, src, dst); - return; - } - SynetReorderFilter_Oiyx_Yxio(kernel, input, output, src, dst); - } - - template void SynetReorderFilter_Yxio_Oyxi4o(size_t output, size_t input, size_t kernel, const float* src, float* dst) - { - size_t outputF = AlignLo(output, F); - size_t outputF4 = AlignLo(output, F * 4); - size_t ki = kernel * input; - size_t stride = ki * F; - size_t ki4 = AlignLo(ki, 4); - size_t o = 0; - for (; o < outputF4; o += 4 * F, src += 4 * F) - { - const float* ps = src; - float* pd = dst; - size_t i = 0; - for (; i < ki4; i += 4, pd += 4 * F, ps += 4 * output) - Transpose4x4xF(ps, output, pd, stride); - for (; i < ki; ++i, pd += F, ps += output) - { - Copy(ps + 0 * F, pd + 0 * stride); - Copy(ps + 1 * F, pd + 1 * stride); - Copy(ps + 2 * F, pd + 2 * stride); - Copy(ps + 3 * F, pd + 3 * stride); - } - dst += 4 * stride; - } - for (; o < 
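/* Scalar reference for SynetReorderFilter_Oiyx_Yxio above, with kernel =
   kernelY * kernelX flattened: weight (o, i, k) moves from
   src[(o * input + i) * kernel + k] to dst[(k * input + i) * output + o].
   The helper name and loop order are ours. */
static void ReorderOiyxToYxioRef(size_t output, size_t input, size_t kernel, const float* src, float* dst)
{
    for (size_t o = 0; o < output; ++o)
        for (size_t i = 0; i < input; ++i)
            for (size_t k = 0; k < kernel; ++k)
                dst[(k * input + i) * output + o] = src[(o * input + i) * kernel + k];
}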
outputF; o += F, src += F) - { - const float* ps = src; - float* pd = dst; - size_t i = 0; - for (; i < ki; ++i, pd += F, ps += output) - Copy(ps, pd); - dst += stride; - } - if (outputF < output) - { - size_t tail = output - outputF; - for (size_t k = 0; k < kernel; ++k) - { - for (size_t i = 0; i < input; ++i, src += output) - { - size_t j = 0; - for (; j < tail; ++j) - *(dst++) = src[j]; - for (; j < F; ++j) - *(dst++) = 0; - } - } - } - } - - template void SynetReorderFilter_Oyxi4o_Oiyx(size_t output, size_t input, size_t kernel, const float* src, float* dst) - { - if (kernel == 1) - { - SynetReorderImage_Chw4c_Chw(output, input, src, dst); - return; - } - size_t outputF = AlignLo(output, F); - size_t tail = output - outputF; - size_t kernelF = AlignLo(kernel, F); - size_t ik = input * kernel; - size_t stride = F * input; - size_t o = 0; - for (; o < outputF; o += F, src += F * ik) - { - const float* ps = src; - float* pd = dst; - for (size_t i = 0; i < input; ++i, ps += F) - { - size_t k = 0; - for (; k < kernelF; k += F, pd += F) - Transpose4x4(ps + k * stride, stride, pd, ik); - for (; k < kernel; ++k, pd++) - { - pd[0 * ik] = ps[k * stride + 0]; - pd[1 * ik] = ps[k * stride + 1]; - pd[2 * ik] = ps[k * stride + 2]; - pd[3 * ik] = ps[k * stride + 3]; - } - } - dst += F * ik; - } - if (tail) - { - for (size_t j = 0; j < tail; ++j) - { - const float* ps = src + j; - for (size_t i = 0; i < input; ++i, ps += F) - for (size_t k = 0; k < kernel; ++k) - *(dst++) = ps[k * stride]; - } - } - } - - template void SynetReorderFilter_Oyxi4o_Yxio(size_t output, size_t input, size_t kernel, const float* src, float* dst) - { - size_t outputF = AlignLo(output, F); - size_t outputF4 = AlignLo(output, 4 * F); - size_t tail = output - outputF; - size_t ki = kernel * input; - size_t ki4 = AlignLo(ki, 4); - size_t stride = ki * F; - size_t i = 0; - for (; i < ki4; i += 4, src += 4 * F) - { - const float* ps = src; - float* pd = dst; - size_t o = 0; - for (; o < outputF4; o += 4 * F, ps += 4 * stride, pd += 4 * F) - Transpose4x4xF(ps, stride, pd, output); - for (; o < outputF; o += F, ps += stride, pd += F) - { - Copy(ps + 0 * F, pd + 0 * output); - Copy(ps + 1 * F, pd + 1 * output); - Copy(ps + 2 * F, pd + 2 * output); - Copy(ps + 3 * F, pd + 3 * output); - } - if (tail) - { - for (size_t j = 0; j < tail; ++j) - { - pd[j + 0 * output] = ps[j + 0 * F]; - pd[j + 1 * output] = ps[j + 1 * F]; - pd[j + 2 * output] = ps[j + 2 * F]; - pd[j + 3 * output] = ps[j + 3 * F]; - } - } - dst += 4 * output; - } - for (; i < ki; ++i, src += F) - { - const float* ps = src; - for (size_t o = 0; o < outputF; o += F, ps += stride, dst += F) - Copy(ps, dst); - if (tail) - { - for (size_t j = 0; j < tail; ++j) - *(dst++) = ps[j]; - } - } - } - - typedef void(*SynetFilterReorderPtr)(size_t output, size_t input, size_t kernel, const float* src, float* dst); - SynetFilterReorderPtr GetFilterReorder(SimdTensorFormatType src, SimdTensorFormatType dst) - { - if (src == SimdTensorFormatOiyx) - { - if (dst == SimdTensorFormatYxio) - return SynetReorderFilter_Oiyx_Yxio; - if (dst == SimdTensorFormatOyxi4o) - return SynetReorderFilter_Oiyx_Oyxi4o; - } - if (src == SimdTensorFormatYxio) - { - if (dst == SimdTensorFormatOiyx) - return SynetReorderFilter_Yxio_Oiyx; - if (dst == SimdTensorFormatOyxi4o) - return SynetReorderFilter_Yxio_Oyxi4o; - } - if (src == SimdTensorFormatOyxi4o) - { - if (dst == SimdTensorFormatOiyx) - return SynetReorderFilter_Oyxi4o_Oiyx; - if (dst == SimdTensorFormatYxio) - return SynetReorderFilter_Oyxi4o_Yxio; - } - 
return NULL; - } - - void SynetReorderFilter(size_t output, size_t input, size_t kernel, const float* src, SimdTensorFormatType srcFormat, float* dst, SimdTensorFormatType dstFormat) - { - SynetFilterReorderPtr filterReorder = GetFilterReorder(srcFormat, dstFormat); - if (filterReorder) - filterReorder(output, input, kernel, src, dst); - else - Base::SynetReorderFilter(output, input, kernel, src, srcFormat, dst, dstFormat); - } - } -#endif// SIMD_NEON_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdNeonSynetConvolution32f.cpp b/src/3rd/Simd/Simd/SimdNeonSynetConvolution32f.cpp deleted file mode 100644 index e3b11b81..00000000 --- a/src/3rd/Simd/Simd/SimdNeonSynetConvolution32f.cpp +++ /dev/null @@ -1,1902 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdSynetConvolution32f.h" -#include "Simd/SimdExtract.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdSynet.h" -#include "Simd/SimdNeon.h" -#include "Simd/SimdGemm.h" -#include "Simd/SimdExp.h" - -namespace Simd -{ -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - void ConvolutionBiasAndActivation(const float * bias, size_t count, size_t size, ::SimdConvolutionActivationType activation, const float * params, ::SimdBool trans, float * dst) - { - size_t aligned = trans ? 
AlignLo(count, F) : AlignLo(size, F); - if (activation == ::SimdConvolutionActivationIdentity) - { - if (bias) - Neon::SynetAddBias(bias, count, size, dst, (SimdTensorFormatType)trans); - } - else if (activation == ::SimdConvolutionActivationRelu) - { - if (bias) - { - float32x4_t _0 = vdupq_n_f32(0.0f); - if (trans) - { - for (size_t j = 0; j < size; ++j) - { - size_t i = 0; - for (; i < aligned; i += F) - { - float32x4_t _dst = Load(dst + i); - float32x4_t _bias = Load(bias + i); - Store(dst + i, vmaxq_f32(_0, vaddq_f32(_dst, _bias))); - } - for (; i < count; ++i) - dst[i] = Simd::Max(0.0f, dst[i] + bias[i]); - dst += count; - } - } - else - { - for (size_t i = 0; i < count; ++i) - { - float32x4_t _bias = vdupq_n_f32(bias[i]); - size_t j = 0; - for (; j < aligned; j += F) - { - float32x4_t _dst = Load(dst + j); - Store(dst + j, vmaxq_f32(_0, vaddq_f32(_dst, _bias))); - } - for (; j < size; ++j) - dst[j] = Simd::Max(0.0f, dst[j] + bias[i]); - dst += size; - } - } - } - else - { - float slope = 0; - Neon::SynetRelu32f(dst, size*count, &slope, dst); - } - } - else if (activation == ::SimdConvolutionActivationLeakyRelu) - { - float slope = params[0]; - if (bias) - { - float32x4_t _0 = vdupq_n_f32(0.0f); - float32x4_t _slope = vdupq_n_f32(slope); - if (trans) - { - for (size_t j = 0; j < size; ++j) - { - size_t i = 0; - for (; i < aligned; i += F) - { - float32x4_t value = vaddq_f32(Load(dst + i), Load(bias + i)); - Store(dst + i, SynetRelu32f(value, _slope, _0)); - } - for (; i < count; ++i) - dst[i] = Base::SynetRelu32f(dst[i] + bias[i], slope); - dst += count; - } - } - else - { - for (size_t i = 0; i < count; ++i) - { - float32x4_t _bias = vdupq_n_f32(bias[i]); - size_t j = 0; - for (; j < aligned; j += F) - { - float32x4_t value = vaddq_f32(Load(dst + j), _bias); - Store(dst + j, SynetRelu32f(value, _slope, _0)); - } - for (; j < size; ++j) - dst[j] = Base::SynetRelu32f(dst[j] + bias[i], slope); - dst += size; - } - } - } - else - Neon::SynetRelu32f(dst, size*count, &slope, dst); - } - else if (activation == ::SimdConvolutionActivationRestrictRange) - { - float lower = params[0]; - float upper = params[1]; - if (bias) - { - float32x4_t _lower = vdupq_n_f32(lower); - float32x4_t _upper = vdupq_n_f32(upper); - if (trans) - { - for (size_t j = 0; j < size; ++j) - { - size_t i = 0; - for (; i < aligned; i += F) - { - float32x4_t value = vaddq_f32(Load(dst + i), Load(bias + i)); - Store(dst + i, vminq_f32(vmaxq_f32(_lower, value), _upper)); - } - for (; i < count; ++i) - dst[i] = Simd::RestrictRange(dst[i] + bias[i], lower, upper); - dst += count; - } - } - else - { - for (size_t i = 0; i < count; ++i) - { - float32x4_t _bias = vdupq_n_f32(bias[i]); - size_t j = 0; - for (; j < aligned; j += F) - { - float32x4_t value = vaddq_f32(Load(dst + j), _bias); - Store(dst + j, vminq_f32(vmaxq_f32(_lower, value), _upper)); - } - for (; j < size; ++j) - dst[j] = Simd::RestrictRange(dst[j] + bias[i], lower, upper); - dst += size; - } - } - } - else - Neon::SynetRestrictRange32f(dst, size*count, &lower, &upper, dst); - } - else if (activation == ::SimdConvolutionActivationPrelu) - { - if (bias) - { - float32x4_t _0 = vdupq_n_f32(0.0f); - if (trans) - { - for (size_t j = 0; j < size; ++j) - { - size_t i = 0; - for (; i < aligned; i += F) - { - float32x4_t value = vaddq_f32(Load(dst + i), Load(bias + i)); - Store(dst + i, SynetRelu32f(value, Load(params + i), _0)); - } - for (; i < count; ++i) - dst[i] = Base::SynetRelu32f(dst[i] + bias[i], params[i]); - dst += count; - } - } - else - { - for (size_t i = 
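/* The NEON branches above all compute the same fused epilogue,
   dst = activation(dst + bias), with one bias value per output channel.
   Scalar form for the RestrictRange case in NCHW layout (trans == 0);
   a sketch with our own naming: */
static void BiasRestrictRangeRef(const float* bias, size_t count, size_t size,
                                 float lower, float upper, float* dst)
{
    for (size_t i = 0; i < count; ++i)      // one bias value per channel
        for (size_t j = 0; j < size; ++j)   // all pixels of that channel
        {
            float v = dst[i * size + j] + (bias ? bias[i] : 0.0f);
            dst[i * size + j] = v < lower ? lower : (v > upper ? upper : v);
        }
}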
0; i < count; ++i) - { - float32x4_t _bias = vdupq_n_f32(bias[i]); - float32x4_t _slope = vdupq_n_f32(params[i]); - size_t j = 0; - for (; j < aligned; j += F) - { - float32x4_t value = vaddq_f32(Load(dst + j), _bias); - Store(dst + j, SynetRelu32f(value, _slope, _0)); - } - for (; j < size; ++j) - dst[j] = Base::SynetRelu32f(dst[j] + bias[i], params[i]); - dst += size; - } - } - } - else - Neon::SynetPreluLayerForward(dst, params, count, size, dst, (SimdTensorFormatType)trans); - } - else if (activation == ::SimdConvolutionActivationElu) - { - float alpha = params[0]; - if (bias) - { - float32x4_t _0 = vdupq_n_f32(0.0f); - float32x4_t _alpha = vdupq_n_f32(alpha); - if (trans) - { - for (size_t j = 0; j < size; ++j) - { - size_t i = 0; - for (; i < aligned; i += F) - { - float32x4_t value = vaddq_f32(Load(dst + i), Load(bias + i)); - Store(dst + i, Neon::Elu(value, _alpha)); - } - for (; i < count; ++i) - dst[i] = Base::SynetElu32f(dst[i] + bias[i], alpha); - dst += count; - } - } - else - { - for (size_t i = 0; i < count; ++i) - { - float32x4_t _bias = vdupq_n_f32(bias[i]); - size_t j = 0; - for (; j < aligned; j += F) - { - float32x4_t value = vaddq_f32(Load(dst + j), _bias); - Store(dst + j, Neon::Elu(value, _alpha)); - } - for (; j < size; ++j) - dst[j] = Base::SynetElu32f(dst[j] + bias[i], alpha); - dst += size; - } - } - } - else - Neon::SynetElu32f(dst, size*count, &alpha, dst); - } - else if (activation == ::SimdConvolutionActivationHswish) - { - float shift = params[0]; - float scale = params[1]; - if (bias) - { - float32x4_t _shift = vdupq_n_f32(shift); - float32x4_t _scale = vdupq_n_f32(scale); - if (trans) - { - for (size_t j = 0; j < size; ++j) - { - size_t i = 0; - for (; i < aligned; i += F) - { - float32x4_t value = vaddq_f32(Load(dst + i), Load(bias + i)); - Store(dst + i, Neon::SynetHswish32f(value, _shift, _scale)); - } - for (; i < count; ++i) - dst[i] = Base::SynetHswish32f(dst[i] + bias[i], shift, scale); - dst += count; - } - } - else - { - for (size_t i = 0; i < count; ++i) - { - float32x4_t _bias = vdupq_n_f32(bias[i]); - size_t j = 0; - for (; j < aligned; j += F) - { - float32x4_t value = vaddq_f32(Load(dst + j), _bias); - Store(dst + j, Neon::SynetHswish32f(value, _shift, _scale)); - } - for (; j < size; ++j) - dst[j] = Base::SynetHswish32f(dst[j] + bias[i], shift, scale); - dst += size; - } - } - } - else - Neon::SynetHswish32f(dst, size*count, &shift, &scale, dst); - } - else - assert(0); - } - - //--------------------------------------------------------------------- - - SynetConvolution32fGemmNN::SynetConvolution32fGemmNN(const ConvParam32f & p) - : Base::SynetConvolution32fGemmNN(p) - { - _gemm.Init(InitGemmFuncs(Neon::Gemm32fNN, "Neon", p.gemm, "Ext")); - if (_param.trans && _param.group == 1) - { - if (NHWC_GEMM_RUNTIME) - { -#if defined(SIMD_ARM64_ENABLE) - _gemmCb.Init(InitGemmCbFuncs(Neon::Gemm32fNNcbBufferSize, Neon::Gemm32fNNcbReorderB, Neon::Gemm32fNNcbRun, "Neon", GemmKernelF2, GemmKernelF4)); -#else - _gemmCb.Init(InitGemmCbFuncs(Neon::Gemm32fNNcbBufferSize, Neon::Gemm32fNNcbReorderB, Neon::Gemm32fNNcbRun, "Neon", GemmKernelF2, GemmKernelF3)); -#endif - _nhwcWeight.Resize(_gemmCb.At(0).BufferSize(_M*_merge, _N, _K)); - } - else - _nhwcWeight.Resize(Neon::Gemm32fNNcbBufferSize(_M*_merge, _N, _K, GemmKernelAny, NHWC_GEMM_COMPATIBLE)); - _nhwcRun = Neon::Gemm32fNNcbRun; - _nhwcReorderB = Neon::Gemm32fNNcbReorderB; - } - _biasAndActivation = Neon::ConvolutionBiasAndActivation; - } - - 
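/* Background for the GEMM path set up above: with NHWC layout (trans) and
   group == 1 the convolution becomes one matrix product C[M x N] = A[M x K] *
   B[K x N], where M = dstH * dstW rows of im2row-expanded input, N = dstC and
   K = srcC * kernelY * kernelX. This mapping is the conventional one, inferred
   from the _M/_N/_K usage rather than stated in the source. Inner product per
   output element (our naming): */
static float ConvGemmDotRef(const float* a, const float* b, size_t K, size_t ldb)
{
    float sum = 0.0f;
    for (size_t k = 0; k < K; ++k)
        sum += a[k] * b[k * ldb];  // one row of A against one column of B
    return sum;
}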
//--------------------------------------------------------------------- - - SynetConvolution32fGemmNT::SynetConvolution32fGemmNT(const ConvParam32f & p) - : Base::SynetConvolution32fGemmNT(p) - { - _gemm.Init(InitGemmFuncs(Neon::Gemm32fNT, "Neon")); - _biasAndActivation = Neon::ConvolutionBiasAndActivation; - } - - bool SynetConvolution32fGemmNT::Preferable(const ConvParam32f & p) - { - if (p.group != 1) - return false; - if (p.trans) - return p.Is1x1() && p.dstC == 1; - else - return p.srcH < 4 && p.srcW < 4; - } - - //--------------------------------------------------------------------- - - SynetConvolution32fWinograd::SynetConvolution32fWinograd(const ConvParam32f & p) - : Base::SynetConvolution32fWinograd(p) - { - if (p.kernelY == 1 && p.kernelX == 3) - { - { - SetBlock(1, 4); - _setFilter = Neon::WinogradKernel1x3Block1x4SetFilter; - _setInput = Neon::WinogradKernel1x3Block1x4SetInput; - _setOutput = Neon::WinogradKernel1x3Block1x4SetOutput; - } - } - else if (p.kernelY == 1 && p.kernelX == 5) - { - { - SetBlock(1, 4); - _setFilter = Neon::WinogradKernel1x5Block1x4SetFilter; - _setInput = Neon::WinogradKernel1x5Block1x4SetInput; - _setOutput = Neon::WinogradKernel1x5Block1x4SetOutput; - } - } - else if (p.kernelY == 2 && p.kernelX == 2) - { - if (p.trans && p.srcH >= 8 && p.srcW >= 8 && p.srcH * p.srcW * p.batch >= 144) - { - SetBlock(4, 4); - _setFilter = Neon::WinogradKernel2x2Block4x4SetFilter; - _setInput = Neon::WinogradKernel2x2Block4x4SetInput; - _setOutput = Neon::WinogradKernel2x2Block4x4SetOutput; - } - else - { - SetBlock(2, 2); - _setFilter = Neon::WinogradKernel2x2Block2x2SetFilter; - _setInput = Neon::WinogradKernel2x2Block2x2SetInput; - _setOutput = Neon::WinogradKernel2x2Block2x2SetOutput; - } - } - else if (p.kernelY == 3 && p.kernelX == 3) - { - if (p.trans && p.srcH >= 8 && p.srcW >= 8 && p.srcH * p.srcW * p.batch >= 144) - { - SetBlock(4, 4); - _setFilter = Neon::WinogradKernel3x3Block4x4SetFilter; - _setInput = Neon::WinogradKernel3x3Block4x4SetInput; - _setOutput = Neon::WinogradKernel3x3Block4x4SetOutput; - } - else if (p.trans && p.srcH >= 6 && p.srcW >= 6 && p.srcH * p.srcW * p.batch >= 81 && p.dstH % 3 == 0 && p.dstW % 3 == 0) - { - SetBlock(3, 3); - _setFilter = Neon::WinogradKernel3x3Block3x3SetFilter; - _setInput = Neon::WinogradKernel3x3Block3x3SetInput; - _setOutput = Neon::WinogradKernel3x3Block3x3SetOutput; - } - else - { - SetBlock(2, 2); - _setFilter = Neon::WinogradKernel3x3Block2x2SetFilter; - _setInput = Neon::WinogradKernel3x3Block2x2SetInput; - _setOutput = Neon::WinogradKernel3x3Block2x2SetOutput; - } - } - else - assert(0); - _gemm.Init(InitGemmFuncs(Neon::Gemm32fNN, "Neon", p.gemm, "Ext")); - if (_param.trans) - { - if (NHWC_GEMM_RUNTIME) - { -#if defined(SIMD_ARM64_ENABLE) - _gemmCb.Init(InitGemmCbFuncs(Neon::Gemm32fNNcbBufferSize, Neon::Gemm32fNNcbReorderB, Neon::Gemm32fNNcbRun, "Neon", GemmKernelF2, GemmKernelF4)); -#else - _gemmCb.Init(InitGemmCbFuncs(Neon::Gemm32fNNcbBufferSize, Neon::Gemm32fNNcbReorderB, Neon::Gemm32fNNcbRun, "Neon", GemmKernelF2, GemmKernelF3)); -#endif - _nhwcStrideW = _gemmCb.At(0).BufferSize(_M*_merge, _N, _K); - } - else - _nhwcStrideW = Neon::Gemm32fNNcbBufferSize(_M*_merge, _N, _K, GemmKernelAny, NHWC_GEMM_COMPATIBLE); - _nhwcWeight.Resize(_nhwcStrideW*_count); - _nhwcRun = Neon::Gemm32fNNcbRun; - _nhwcReorderB = Neon::Gemm32fNNcbReorderB; - } - _biasAndActivation = Neon::ConvolutionBiasAndActivation; - } - - bool SynetConvolution32fWinograd::Preferable(const ConvParam32f & p) - { - if (!p.IsDilation(1) || 
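/* Why SetBlock(4, 4) is preferred above for 3x3 kernels on large inputs:
   Winograd F(4x4, 3x3) derives 16 outputs from a 6x6 input tile using 36
   element-wise multiplies per channel, versus 16 * 9 = 144 for direct
   convolution, a 4x reduction in multiplies; the smaller F(2x2, 3x3) block
   reaches only 36 / 16 = 2.25x but needs less image area to be profitable,
   hence the srcH/srcW/batch thresholds in the branches above. */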
!p.IsStride(1) || p.group != 1 || p.srcC < 10) - return false; - if (p.IsKernel(1, 3)) - { - if (!(p.IsPad(0) || (p.padX == 1 && p.padW == 1))) - return false; - if (p.srcC <= 32) - return false; - return p.trans && p.srcW >= 8 && p.srcH * p.srcW * p.batch >= 36; - } - else if (p.IsKernel(1, 5)) - { - if (!(p.IsPad(0) || (p.padX == 2 && p.padW == 2))) - return false; - return p.trans && p.srcW >= 8 && p.srcH * p.srcW * p.batch >= 36; - } - else if (p.IsKernel(2)) - { - if (!(p.IsPad(0) || (p.padY + p.padH == 1 && p.padX + p.padW == 1))) - return false; - return p.trans && p.srcH >= 4 && p.srcW >= 4 && p.srcH * p.srcW * p.batch >= 36; - } - else if (p.IsKernel(3)) - { - if (!(p.IsPad(0) || p.IsPad(1))) - return false; - if (p.trans) - return p.srcH >= 4 && p.srcW >= 4 && p.srcH * p.srcW * p.batch >= 36; - else - return p.srcH >= 6 && p.srcW >= 6; - } - return false; - } - - //--------------------------------------------------------------------- - - SynetConvolution32fDirectNchw::SynetConvolution32fDirectNchw(const ConvParam32f & p) - : Base::SynetConvolution32fDirectNchw(p) - { - _convolutionBiasActivation = SetConvolutionBiasActivation(); - } - - template SIMD_INLINE void LoadWeight(const float * src, float32x4_t * dst) - { - for (size_t i = 0; i < size; ++i) - dst[i] = vdupq_n_f32(src[i]); - } - - template struct Kernel - { - static float32x4_t SynetConvolution32f(const float * src, size_t step, const float32x4_t * weight); - }; - - template<> struct Kernel<1, 1> - { - static SIMD_INLINE float32x4_t SynetConvolution32f(const float * src, size_t step, const float32x4_t * weight) - { - return vmulq_f32(Load(src), weight[0]); - } - }; - - template<> struct Kernel<2, 1> - { - static SIMD_INLINE float32x4_t RowConv(const float * src, const float32x4_t * weight) - { - return vmlaq_f32(vmulq_f32(Load(src + 0), weight[0]), Load(src + 1), weight[1]); - } - - static SIMD_INLINE float32x4_t SynetConvolution32f(const float * src, size_t step, const float32x4_t * weight) - { - return vaddq_f32(RowConv(src, weight), RowConv(src + step, weight + 2)); - } - }; - - template<> struct Kernel<2, 2> - { - static SIMD_INLINE float32x4_t RowConv(const float * src, const float32x4_t * weight) - { - float32x4x2_t s = Load2(src); - return vmlaq_f32(vmulq_f32(s.val[0], weight[0]), s.val[1], weight[1]); - } - - static SIMD_INLINE float32x4_t SynetConvolution32f(const float * src, size_t step, const float32x4_t * weight) - { - return vaddq_f32(RowConv(src, weight), RowConv(src + step, weight + 2)); - } - }; - - template<> struct Kernel<3, 1> - { - static SIMD_INLINE float32x4_t RowConv(const float * src, const float32x4_t * weight) - { - return vmlaq_f32(vmlaq_f32(vmulq_f32(Load(src), weight[0]), - Load(src + 1), weight[1]), Load(src + 2), weight[2]); - } - - static SIMD_INLINE float32x4_t SynetConvolution32f(const float * src, size_t step, const float32x4_t * weight) - { - return vaddq_f32(RowConv(src, weight), - vaddq_f32(RowConv(src + step, weight + 3), - RowConv(src + 2 * step, weight + 6))); - } - }; - - template<> struct Kernel<3, 2> - { - static SIMD_INLINE float32x4_t RowConv(const float * src, const float32x4_t * weight) - { - float32x4x2_t s0 = Load2(src + 0); - float32x4x2_t s2 = Load2(src + 2); - return vmlaq_f32(vmlaq_f32(vmulq_f32(s0.val[0], weight[0]), - s0.val[1], weight[1]), s2.val[0], weight[2]); - } - - static SIMD_INLINE float32x4_t SynetConvolution32f(const float * src, size_t step, const float32x4_t * weight) - { - return vaddq_f32(RowConv(src, weight), - vaddq_f32(RowConv(src + step, weight + 
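/* What Kernel<3, 1>::RowConv above computes, per vector lane x: a three-tap
   row filter src[x] * w0 + src[x + 1] * w1 + src[x + 2] * w2; the full 3x3
   kernel is the sum of three such rows, one per source line. Scalar form: */
static float RowConv3Ref(const float* src, const float* w)
{
    return src[0] * w[0] + src[1] * w[1] + src[2] * w[2];
}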
3), - RowConv(src + 2 * step, weight + 6))); - } - }; - - template<> struct Kernel<3, 3> - { - static SIMD_INLINE float32x4_t RowConv(const float * src, const float32x4_t * weight) - { - float32x4x3_t s = Load3(src); - return vmlaq_f32(vmlaq_f32(vmulq_f32(s.val[0], weight[0]), - s.val[1], weight[1]), s.val[2], weight[2]); - } - - static SIMD_INLINE float32x4_t SynetConvolution32f(const float * src, size_t step, const float32x4_t * weight) - { - return vaddq_f32(RowConv(src, weight), - vaddq_f32(RowConv(src + step, weight + 3), - RowConv(src + 2 * step, weight + 6))); - } - }; - - template<::SimdConvolutionActivationType type> SIMD_INLINE float32x4_t Activate(float32x4_t value, const float32x4_t * params); - - template<> SIMD_INLINE float32x4_t Activate<::SimdConvolutionActivationIdentity>(float32x4_t value, const float32x4_t * params) - { - return value; - } - - template<> SIMD_INLINE float32x4_t Activate<::SimdConvolutionActivationRelu>(float32x4_t value, const float32x4_t * params) - { - return vmaxq_f32(vdupq_n_f32(0.0f), value); - } - - template<> SIMD_INLINE float32x4_t Activate<::SimdConvolutionActivationLeakyRelu>(float32x4_t value, const float32x4_t * params) - { - return vmlaq_f32(vmaxq_f32(vdupq_n_f32(0.0f), value), params[0], vminq_f32(vdupq_n_f32(0.0f), value)); - } - - template<> SIMD_INLINE float32x4_t Activate<::SimdConvolutionActivationRestrictRange>(float32x4_t value, const float32x4_t * params) - { - return vminq_f32(vmaxq_f32(params[0], value), params[1]); - } - - template<> SIMD_INLINE float32x4_t Activate<::SimdConvolutionActivationPrelu>(float32x4_t value, const float32x4_t * params) - { - return vmlaq_f32(vmaxq_f32(vdupq_n_f32(0.0f), value), params[0], vminq_f32(vdupq_n_f32(0.0f), value)); - } - - template<> SIMD_INLINE float32x4_t Activate<::SimdConvolutionActivationElu>(float32x4_t value, const float32x4_t * params) - { - return Neon::Elu(value, params[0]); - } - - template<> SIMD_INLINE float32x4_t Activate<::SimdConvolutionActivationHswish>(float32x4_t value, const float32x4_t * params) - { - return Neon::SynetHswish32f(value, params[0], params[1]); - } - - template - void ConvolutionBiasActivation(const float * src, size_t srcC, size_t srcH, size_t srcW, const float * weight, - const float * bias, const float * params, float * dst, size_t dstC, size_t dstH, size_t dstW) - { - float32x4_t _weight[kernel*kernel]; - float32x4_t _params[2]; - _params[0] = vdupq_n_f32(params[0]); - if (type == ::SimdConvolutionActivationRestrictRange || type == ::SimdConvolutionActivationHswish) - _params[1] = vdupq_n_f32(params[1]); - size_t dstWF = Simd::AlignLo(dstW, F); - float32x4_t tail = RightNotZero32f(dstW - dstWF); - for (size_t dc = 0; dc < dstC; ++dc) - { - if (type == ::SimdConvolutionActivationPrelu) - _params[0] = vdupq_n_f32(params[dc]); - if (srcC == 1) - { - const float * ps = src; - float * pd = dst; - LoadWeight(weight, _weight); - float32x4_t _bias = bias ? 
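/* Scalar form of the h-swish used by the Hswish Activate specialization above.
   Assuming SynetHswish32f(x, shift, scale) = max(min(x, shift) + shift, 0) *
   scale * x, the defaults shift = 3 and scale = 1/6 give the standard
   x * clamp(x + 3, 0, 6) / 6: */
static float HswishRef(float x, float shift, float scale)
{
    float v = (x < shift ? x : shift) + shift; // min(x + shift, 2 * shift)
    if (v < 0.0f)
        v = 0.0f;                              // clamp below at 0
    return v * scale * x;
}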
vdupq_n_f32(bias[dc]) : vdupq_n_f32(0.0f); - for (size_t y = 0; y < dstH; ++y) - { - for (size_t x = 0; x < dstWF; x += F) - { - float32x4_t conv = Kernel::SynetConvolution32f(ps + x * stride, srcW, _weight); - Store(pd + x, Activate(vaddq_f32(_bias, conv), _params)); - } - if (dstWF < dstW) - { - size_t x = dstW - F; - float32x4_t _dst = Load(pd + x); - float32x4_t conv = Kernel::SynetConvolution32f(ps + x * stride, srcW, _weight); - Store(pd + x, vbslq_f32(vreinterpretq_u32_f32(tail), Activate(vaddq_f32(_bias, conv), _params), _dst)); - } - ps += srcW * stride; - pd += dstW; - } - weight += kernel * kernel; - } - else - { - size_t sc = 0; - for (; sc < 1; ++sc) - { - const float * ps = src; - float * pd = dst; - LoadWeight(weight, _weight); - float32x4_t _bias = bias ? vdupq_n_f32(bias[dc]) : vdupq_n_f32(0.0f); - for (size_t y = 0; y < dstH; ++y) - { - for (size_t x = 0; x < dstWF; x += F) - { - float32x4_t conv = Kernel::SynetConvolution32f(ps + x * stride, srcW, _weight); - Store(pd + x, vaddq_f32(_bias, conv)); - } - if (dstWF < dstW) - { - size_t x = dstW - F; - float32x4_t _dst = Load(pd + x); - float32x4_t conv = Kernel::SynetConvolution32f(ps + x * stride, srcW, _weight); - Store(pd + x, vbslq_f32(vreinterpretq_u32_f32(tail), vaddq_f32(_bias, conv), _dst)); - } - ps += srcW * stride; - pd += dstW; - } - weight += kernel * kernel; - } - for (; sc < srcC - 1; ++sc) - { - const float * ps = src + sc * srcW * srcH; - float * pd = dst; - LoadWeight(weight, _weight); - for (size_t y = 0; y < dstH; ++y) - { - for (size_t x = 0; x < dstWF; x += F) - { - float32x4_t _dst = Load(pd + x); - float32x4_t conv = Kernel::SynetConvolution32f(ps + x * stride, srcW, _weight); - Store(pd + x, vaddq_f32(_dst, conv)); - } - if (dstWF < dstW) - { - size_t x = dstW - F; - float32x4_t _dst = Load(pd + x); - float32x4_t conv = Kernel::SynetConvolution32f(ps + x * stride, srcW, _weight); - Store(pd + x, vaddq_f32(_dst, And(conv, tail))); - } - ps += srcW * stride; - pd += dstW; - } - weight += kernel * kernel; - } - for (; sc < srcC; ++sc) - { - const float * ps = src + sc * srcW * srcH; - float * pd = dst; - LoadWeight(weight, _weight); - for (size_t y = 0; y < dstH; ++y) - { - for (size_t x = 0; x < dstWF; x += F) - { - float32x4_t _dst = Load(pd + x); - float32x4_t conv = Kernel::SynetConvolution32f(ps + x * stride, srcW, _weight); - Store(pd + x, Activate(vaddq_f32(_dst, conv), _params)); - } - if (dstWF < dstW) - { - size_t x = dstW - F; - float32x4_t _dst = Load(pd + x); - float32x4_t conv = Kernel::SynetConvolution32f(ps + x * stride, srcW, _weight); - Store(pd + x, vbslq_f32(vreinterpretq_u32_f32(tail), Activate(vaddq_f32(_dst, conv), _params), _dst)); - } - ps += srcW * stride; - pd += dstW; - } - weight += kernel * kernel; - } - } - dst += dstH * dstW; - } - } - - bool SynetConvolution32fDirectNchw::Preferable(const ConvParam32f & p) - { - if (!p.IsDilation(1)) - return false; - if (!(p.IsStride(1) || p.IsStride(2) || p.IsStride(3))) - return false; - double k = double(p.srcC) / p.group * p.strideX * p.strideX * p.strideY / p.kernelX / p.kernelY; - return k < 2.0 && ((p.IsStride(1) && p.IsKernel(1)) || p.IsKernel(2) || p.IsKernel(3)) && p.trans == 0; - } - - template SynetConvolution32fDirectNchw::ConvolutionBiasActivationPtr SetConvolutionBiasActivation(::SimdConvolutionActivationType type) - { - switch (type) - { - case ::SimdConvolutionActivationIdentity: return ConvolutionBiasActivation; - case ::SimdConvolutionActivationRelu: return ConvolutionBiasActivation; - case 
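/* Tail handling in the loops above: the last partial vector is processed at
   x = dstW - F, overlapping already-written outputs, and vbslq_f32 with the
   RightNotZero32f(dstW - dstWF) mask keeps the old values in the overlapping
   left lanes. Scalar equivalent, assuming the mask selects the rightmost
   dstW - dstWF lanes (as the name suggests): */
static void BlendTailRef(float* dst, const float* conv, size_t dstW, size_t vecLen)
{
    size_t dstWF = (dstW / vecLen) * vecLen; // aligned part already handled
    size_t x = dstW - vecLen;                // start of overlapping tail vector
    for (size_t i = 0; i < vecLen; ++i)
        if (x + i >= dstWF)                  // only genuinely new lanes are written
            dst[x + i] = conv[i];
}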
::SimdConvolutionActivationLeakyRelu: return ConvolutionBiasActivation; - case ::SimdConvolutionActivationRestrictRange: return ConvolutionBiasActivation; - case ::SimdConvolutionActivationPrelu: return ConvolutionBiasActivation; - case ::SimdConvolutionActivationElu: return ConvolutionBiasActivation; - case ::SimdConvolutionActivationHswish: return ConvolutionBiasActivation; - default: - assert(0); - return NULL; - } - } - - SynetConvolution32fDirectNchw::ConvolutionBiasActivationPtr SynetConvolution32fDirectNchw::SetConvolutionBiasActivation() - { - const ConvParam32f & p = _param; - if (p.dstW < F) - return Base::SynetConvolution32fDirectNchw::SetConvolutionBiasActivation(); - switch (p.strideX) - { - case 1: - if (p.kernelX == 1) - return Neon::SetConvolutionBiasActivation<1, 1>(p.activation); - if (p.kernelX == 2) - return Neon::SetConvolutionBiasActivation<2, 1>(p.activation); - if (p.kernelX == 3) - return Neon::SetConvolutionBiasActivation<3, 1>(p.activation); - break; - case 2: - if (p.kernelX == 2) - return Neon::SetConvolutionBiasActivation<2, 2>(p.activation); - if (p.kernelX == 3) - return Neon::SetConvolutionBiasActivation<3, 2>(p.activation); - break; - case 3: - if (p.kernelX == 3) - return Neon::SetConvolutionBiasActivation<3, 3>(p.activation); - break; - default: - return Base::SynetConvolution32fDirectNchw::SetConvolutionBiasActivation(); - } - assert(0); - return NULL; - } - - //--------------------------------------------------------------------- - - SynetConvolution32fDirectNhwc::SynetConvolution32fDirectNhwc(const ConvParam32f & p) - : Base::SynetConvolution32fDirectNhwc(p) - { - _convolutionBiasActivation = SetConvolutionBiasActivation(); - } - - bool SynetConvolution32fDirectNhwc::Preferable(const ConvParam32f & p) - { - if (!p.IsDilation(1) || p.trans == 0) - return false; - if (p.group == 1) - { - if (p.kernelY > p.srcH || p.kernelX > p.srcW) - return false; - double k = double(p.srcC) / p.kernelX / p.kernelY; - return k < 2.0; - } - else if (p.IsDepthwise()) - { - return true; - } - return false; - } - - template<::SimdConvolutionActivationType type> SIMD_INLINE float32x4_t Activate(float32x4_t value, const float * params, size_t offset); - - template<> SIMD_INLINE float32x4_t Activate<::SimdConvolutionActivationIdentity>(float32x4_t value, const float * params, size_t offset) - { - return value; - } - - template<> SIMD_INLINE float32x4_t Activate<::SimdConvolutionActivationRelu>(float32x4_t value, const float * params, size_t offset) - { - return vmaxq_f32(vdupq_n_f32(0.0f), value); - } - - template<> SIMD_INLINE float32x4_t Activate<::SimdConvolutionActivationLeakyRelu>(float32x4_t value, const float * params, size_t offset) - { - return vmlaq_f32(vmaxq_f32(vdupq_n_f32(0.0f), value), vld1q_dup_f32(params + 0), vminq_f32(vdupq_n_f32(0.0f), value)); - } - - template<> SIMD_INLINE float32x4_t Activate<::SimdConvolutionActivationRestrictRange>(float32x4_t value, const float * params, size_t offset) - { - return vminq_f32(vmaxq_f32(vld1q_dup_f32(params + 0), value), vld1q_dup_f32(params + 1)); - } - - template<> SIMD_INLINE float32x4_t Activate<::SimdConvolutionActivationPrelu>(float32x4_t value, const float * params, size_t offset) - { - return vmlaq_f32(vmaxq_f32(vdupq_n_f32(0.0f), value), Load(params + offset), vminq_f32(vdupq_n_f32(0.0f), value)); - } - - template<> SIMD_INLINE float32x4_t Activate<::SimdConvolutionActivationElu>(float32x4_t value, const float * params, size_t offset) - { - return Neon::Elu(value, vld1q_dup_f32(params + 0)); - } - - template<> 
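/* In the NHWC variants above, Activate receives the channel offset so that
   PReLU can apply a distinct slope per output channel; scalar form of that
   specialization: */
static float PreluRef(float x, const float* params, size_t channel)
{
    float slope = params[channel];
    return x > 0.0f ? x : slope * x;  // max(0, x) + slope * min(0, x)
}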
SIMD_INLINE float32x4_t Activate<::SimdConvolutionActivationHswish>(float32x4_t value, const float * params, size_t offset) - { - return Neon::SynetHswish32f(value, vld1q_dup_f32(params + 0), vld1q_dup_f32(params + 1)); - } - - SIMD_INLINE void KernelHwcDefaultEdge(const float * src, const ConvParam32f & p, size_t kH, size_t kW, const float * weight, float32x4_t & sum) - { - size_t size = kW * p.srcC, tail = (p.kernelX - kW)*p.srcC*p.dstC, dstC = p.dstC, stride = p.srcW * p.srcC; - for (size_t ky = 0; ky < kH; ++ky) - { - for (size_t i = 0; i < size; ++i, weight += dstC) - sum = vmlaq_f32(sum, vld1q_dup_f32(src + i), Load(weight)); - weight += tail; - src += stride; - } - } - - template<::SimdConvolutionActivationType type> - SIMD_INLINE void KernelHwcDefaultEdge(const float * src, const ConvParam32f & p, size_t kH, size_t kW, const float * weight, const float * bias, const float * params, float * dst) - { - size_t dstC = p.dstC; - size_t dstCF = AlignLo(dstC, F); - size_t dc = 0; - for (; dc < dstCF; dc += F) - { - float32x4_t conv = bias ? Load(bias + dc) : vdupq_n_f32(0.0f); - KernelHwcDefaultEdge(src, p, kH, kW, weight + dc, conv); - Store(dst + dc, Activate(conv, params, dc)); - } - if (dc < dstC) - { - dc = dstC - F; - float32x4_t conv = bias ? Load(bias + dc) : vdupq_n_f32(0.0f); - KernelHwcDefaultEdge(src, p, kH, kW, weight + dc, conv); - Store(dst + dc, Activate(conv, params, dc)); - } - } - - SIMD_INLINE void KernelHwcDefaultBody2x2(const float * src, const ConvParam32f & p, const float * weight, float32x4_t sums[2][2]) - { - size_t size = p.kernelX * p.srcC, dstC = p.dstC, stride = p.srcW * p.srcC, step = p.srcC * p.strideX; - const float * src0 = src + 0 * step; - const float * src1 = src + 1 * step; - float32x4_t w0, w1, s0; - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t offset = ky * stride; - for (size_t end = offset + size; offset < end; ++offset) - { - w0 = Load(weight + 0 * F); - w1 = Load(weight + 1 * F); - s0 = vld1q_dup_f32(src0 + offset); - sums[0][0] = vmlaq_f32(sums[0][0], s0, w0); - sums[0][1] = vmlaq_f32(sums[0][1], s0, w1); - s0 = vld1q_dup_f32(src1 + offset); - sums[1][0] = vmlaq_f32(sums[1][0], s0, w0); - sums[1][1] = vmlaq_f32(sums[1][1], s0, w1); - weight += dstC; - } - } - } - - SIMD_INLINE void KernelHwcDefaultBody2x1(const float * src, const ConvParam32f & p, const float * weight, float32x4_t sums[2][1]) - { - size_t size = p.kernelX * p.srcC, dstC = p.dstC, stride = p.srcW * p.srcC, step = p.srcC * p.strideX; - const float * src0 = src + 0 * step; - const float * src1 = src + 1 * step; - float32x4_t w0, s0; - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t offset = ky * stride; - for (size_t end = offset + size; offset < end; ++offset) - { - w0 = Load(weight + 0 * F); - s0 = vld1q_dup_f32(src0 + offset); - sums[0][0] = vmlaq_f32(sums[0][0], s0, w0); - s0 = vld1q_dup_f32(src1 + offset); - sums[1][0] = vmlaq_f32(sums[1][0], s0, w0); - weight += dstC; - } - } - } - - template<::SimdConvolutionActivationType type> - SIMD_INLINE void KernelHwcDefaultBody2(const float * src, const ConvParam32f & p, const float * weight, const float * bias, const float * params, float * dst) - { - size_t dstC = p.dstC; - size_t dstCF1 = AlignLo(dstC, 1 * F); - size_t dstCF2 = AlignLo(dstC, 2 * F); - size_t dc = 0; - for (; dc < dstCF2; dc += 2 * F) - { - float32x4_t sums[2][2]; - float32x4_t bias0 = bias ? Load(bias + dc + 0 * F) : vdupq_n_f32(0.0f); - float32x4_t bias1 = bias ? 
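/* KernelHwcDefaultBody2x2 above register-blocks the NHWC inner loop: two
   output pixels times two 4-wide channel groups share every broadcast input
   scalar, so one vld1q_dup_f32 feeds four fused multiply-accumulates. One
   accumulation step, written out in scalar form (our naming): */
static void Body2x2StepRef(float s0, float s1, const float w[8], float sums[2][2][4])
{
    for (int lane = 0; lane < 4; ++lane)
    {
        sums[0][0][lane] += s0 * w[lane];      // pixel 0, channels dc..dc+3
        sums[0][1][lane] += s0 * w[4 + lane];  // pixel 0, channels dc+4..dc+7
        sums[1][0][lane] += s1 * w[lane];      // pixel 1, channels dc..dc+3
        sums[1][1][lane] += s1 * w[4 + lane];  // pixel 1, channels dc+4..dc+7
    }
}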
Load(bias + dc + 1 * F) : vdupq_n_f32(0.0f); - sums[0][0] = bias0; - sums[0][1] = bias1; - sums[1][0] = bias0; - sums[1][1] = bias1; - KernelHwcDefaultBody2x2(src, p, weight + dc, sums); - Store(dst + dc + 0 * dstC + 0 * F, Activate(sums[0][0], params, dc + 0 * F)); - Store(dst + dc + 0 * dstC + 1 * F, Activate(sums[0][1], params, dc + 1 * F)); - Store(dst + dc + 1 * dstC + 0 * F, Activate(sums[1][0], params, dc + 0 * F)); - Store(dst + dc + 1 * dstC + 1 * F, Activate(sums[1][1], params, dc + 1 * F)); - } - for (; dc < dstCF1; dc += 1 * F) - { - float32x4_t sums[2][1]; - float32x4_t bias0 = bias ? Load(bias + dc) : vdupq_n_f32(0.0f); - sums[0][0] = bias0; - sums[1][0] = bias0; - KernelHwcDefaultBody2x1(src, p, weight + dc, sums); - Store(dst + dc + 0 * dstC, Activate(sums[0][0], params, dc)); - Store(dst + dc + 1 * dstC, Activate(sums[1][0], params, dc)); - } - if (dc < dstC) - { - dc = dstC - F; - float32x4_t sums[2][1]; - float32x4_t bias0 = bias ? Load(bias + dc) : vdupq_n_f32(0.0f); - sums[0][0] = bias0; - sums[1][0] = bias0; - KernelHwcDefaultBody2x1(src, p, weight + dc, sums); - Store(dst + dc + 0 * dstC, Activate(sums[0][0], params, dc)); - Store(dst + dc + 1 * dstC, Activate(sums[1][0], params, dc)); - } - } - - SIMD_INLINE void KernelHwcDefaultBody6x2(const float * src, const ConvParam32f & p, const float * weight, float32x4_t sums[6][2]) - { - size_t size = p.kernelX * p.srcC, dstC = p.dstC, stride = p.srcW * p.srcC, step = p.srcC * p.strideX; - const float * src0 = src + 0 * step; - const float * src1 = src + 1 * step; - const float * src2 = src + 2 * step; - const float * src3 = src + 3 * step; - const float * src4 = src + 4 * step; - const float * src5 = src + 5 * step; - float32x4_t w0, w1, s0; - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t offset = ky * stride; - for (size_t end = offset + size; offset < end; ++offset) - { - w0 = Load(weight + 0 * F); - w1 = Load(weight + 1 * F); - s0 = vld1q_dup_f32(src0 + offset); - sums[0][0] = vmlaq_f32(sums[0][0], s0, w0); - sums[0][1] = vmlaq_f32(sums[0][1], s0, w1); - s0 = vld1q_dup_f32(src1 + offset); - sums[1][0] = vmlaq_f32(sums[1][0], s0, w0); - sums[1][1] = vmlaq_f32(sums[1][1], s0, w1); - s0 = vld1q_dup_f32(src2 + offset); - sums[2][0] = vmlaq_f32(sums[2][0], s0, w0); - sums[2][1] = vmlaq_f32(sums[2][1], s0, w1); - s0 = vld1q_dup_f32(src3 + offset); - sums[3][0] = vmlaq_f32(sums[3][0], s0, w0); - sums[3][1] = vmlaq_f32(sums[3][1], s0, w1); - s0 = vld1q_dup_f32(src4 + offset); - sums[4][0] = vmlaq_f32(sums[4][0], s0, w0); - sums[4][1] = vmlaq_f32(sums[4][1], s0, w1); - s0 = vld1q_dup_f32(src5 + offset); - sums[5][0] = vmlaq_f32(sums[5][0], s0, w0); - sums[5][1] = vmlaq_f32(sums[5][1], s0, w1); - weight += dstC; - } - } - } - - SIMD_INLINE void KernelHwcDefaultBody6x1(const float * src, const ConvParam32f & p, const float * weight, float32x4_t sums[6][1]) - { - size_t size = p.kernelX * p.srcC, dstC = p.dstC, stride = p.srcW * p.srcC, step = p.srcC * p.strideX; - const float * src0 = src + 0 * step; - const float * src1 = src + 1 * step; - const float * src2 = src + 2 * step; - const float * src3 = src + 3 * step; - const float * src4 = src + 4 * step; - const float * src5 = src + 5 * step; - float32x4_t w0, s0; - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t offset = ky * stride; - for (size_t end = offset + size; offset < end; ++offset) - { - w0 = Load(weight + 0 * F); - s0 = vld1q_dup_f32(src0 + offset); - sums[0][0] = vmlaq_f32(sums[0][0], s0, w0); - s0 = vld1q_dup_f32(src1 + offset); - sums[1][0] = 
vmlaq_f32(sums[1][0], s0, w0); - s0 = vld1q_dup_f32(src2 + offset); - sums[2][0] = vmlaq_f32(sums[2][0], s0, w0); - s0 = vld1q_dup_f32(src3 + offset); - sums[3][0] = vmlaq_f32(sums[3][0], s0, w0); - s0 = vld1q_dup_f32(src4 + offset); - sums[4][0] = vmlaq_f32(sums[4][0], s0, w0); - s0 = vld1q_dup_f32(src5 + offset); - sums[5][0] = vmlaq_f32(sums[5][0], s0, w0); - weight += dstC; - } - } - } - - template<::SimdConvolutionActivationType type> - SIMD_INLINE void KernelHwcDefaultBody6(const float * src, const ConvParam32f & p, const float * weight, const float * bias, const float * params, float * dst) - { - size_t dstC = p.dstC; - size_t dstCF1 = AlignLo(dstC, 1 * F); - size_t dstCF2 = AlignLo(dstC, 2 * F); - size_t dc = 0; - for (; dc < dstCF2; dc += 2 * F) - { - float32x4_t sums[6][2]; - float32x4_t bias0 = bias ? Load(bias + dc + 0 * F) : vdupq_n_f32(0.0f); - float32x4_t bias1 = bias ? Load(bias + dc + 1 * F) : vdupq_n_f32(0.0f); - sums[0][0] = bias0; - sums[0][1] = bias1; - sums[1][0] = bias0; - sums[1][1] = bias1; - sums[2][0] = bias0; - sums[2][1] = bias1; - sums[3][0] = bias0; - sums[3][1] = bias1; - sums[4][0] = bias0; - sums[4][1] = bias1; - sums[5][0] = bias0; - sums[5][1] = bias1; - KernelHwcDefaultBody6x2(src, p, weight + dc, sums); - Store(dst + dc + 0 * dstC + 0 * F, Activate(sums[0][0], params, dc + 0 * F)); - Store(dst + dc + 0 * dstC + 1 * F, Activate(sums[0][1], params, dc + 1 * F)); - Store(dst + dc + 1 * dstC + 0 * F, Activate(sums[1][0], params, dc + 0 * F)); - Store(dst + dc + 1 * dstC + 1 * F, Activate(sums[1][1], params, dc + 1 * F)); - Store(dst + dc + 2 * dstC + 0 * F, Activate(sums[2][0], params, dc + 0 * F)); - Store(dst + dc + 2 * dstC + 1 * F, Activate(sums[2][1], params, dc + 1 * F)); - Store(dst + dc + 3 * dstC + 0 * F, Activate(sums[3][0], params, dc + 0 * F)); - Store(dst + dc + 3 * dstC + 1 * F, Activate(sums[3][1], params, dc + 1 * F)); - Store(dst + dc + 4 * dstC + 0 * F, Activate(sums[4][0], params, dc + 0 * F)); - Store(dst + dc + 4 * dstC + 1 * F, Activate(sums[4][1], params, dc + 1 * F)); - Store(dst + dc + 5 * dstC + 0 * F, Activate(sums[5][0], params, dc + 0 * F)); - Store(dst + dc + 5 * dstC + 1 * F, Activate(sums[5][1], params, dc + 1 * F)); - } - for (; dc < dstCF1; dc += 1 * F) - { - float32x4_t sums[6][1]; - float32x4_t bias0 = bias ? Load(bias + dc) : vdupq_n_f32(0.0f); - sums[0][0] = bias0; - sums[1][0] = bias0; - sums[2][0] = bias0; - sums[3][0] = bias0; - sums[4][0] = bias0; - sums[5][0] = bias0; - KernelHwcDefaultBody6x1(src, p, weight + dc, sums); - Store(dst + dc + 0 * dstC, Activate(sums[0][0], params, dc)); - Store(dst + dc + 1 * dstC, Activate(sums[1][0], params, dc)); - Store(dst + dc + 2 * dstC, Activate(sums[2][0], params, dc)); - Store(dst + dc + 3 * dstC, Activate(sums[3][0], params, dc)); - Store(dst + dc + 4 * dstC, Activate(sums[4][0], params, dc)); - Store(dst + dc + 5 * dstC, Activate(sums[5][0], params, dc)); - } - if (dc < dstC) - { - dc = dstC - F; - float32x4_t sums[6][1]; - float32x4_t bias0 = bias ? 
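/* The 6x2 shape of KernelHwcDefaultBody6x2 above looks sized to the NEON
   register file: 12 accumulators + 2 weight vectors + 1 broadcast source come
   to 15 of the 16 q-registers on 32-bit ARM, so the hot loop can stay
   spill-free. This reading is our inference from the register counts, not
   something stated in the source. */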
Load(bias + dc) : vdupq_n_f32(0.0f); - sums[0][0] = bias0; - sums[1][0] = bias0; - sums[2][0] = bias0; - sums[3][0] = bias0; - sums[4][0] = bias0; - sums[5][0] = bias0; - KernelHwcDefaultBody6x1(src, p, weight + dc, sums); - Store(dst + dc + 0 * dstC, Activate(sums[0][0], params, dc)); - Store(dst + dc + 1 * dstC, Activate(sums[1][0], params, dc)); - Store(dst + dc + 2 * dstC, Activate(sums[2][0], params, dc)); - Store(dst + dc + 3 * dstC, Activate(sums[3][0], params, dc)); - Store(dst + dc + 4 * dstC, Activate(sums[4][0], params, dc)); - Store(dst + dc + 5 * dstC, Activate(sums[5][0], params, dc)); - } - } - - template<::SimdConvolutionActivationType type> void ConvolutionDirectNhwcConvolutionBiasActivationDefault(const float * src, const ConvParam32f & p, const float * weight, const float * bias, const float * params, float * dst) - { - size_t noseH = p.padY, noseW = p.padX; - size_t bodyH = p.srcH - p.kernelY + 1 + noseH, bodyW = p.srcW - p.kernelX + 1 + noseW; - size_t tailH = bodyH + p.padH, tailW = bodyW + p.padW; - size_t bodyW2 = AlignLoAny(bodyW - noseW, 2 * p.strideX) + noseW; - size_t bodyW6 = AlignLoAny(bodyW - noseW, 6 * p.strideX) + noseW; - size_t wS = p.srcC*p.dstC; - size_t kY = p.kernelY - noseH, kX = p.kernelX - noseW, kH = bodyH + p.kernelY - 1, kW = bodyW + p.kernelX - 1; - size_t sy = 0; - for (; sy < noseH; sy += p.strideY) - { - size_t sx = 0; - const float * w = weight + (noseH - sy) * p.kernelY * wS; - for (; sx < noseW; sx += p.strideX, dst += p.dstC) - KernelHwcDefaultEdge(src, p, kY + sy, kX + sx, w + (noseW - sx)*wS, bias, params, dst); - for (; sx < bodyW; sx += p.strideX, dst += p.dstC) - KernelHwcDefaultEdge(src + (sx - noseW) * p.srcC, p, kY + sy, p.kernelX, w, bias, params, dst); - for (; sx < tailW; sx += p.strideX, dst += p.dstC) - KernelHwcDefaultEdge(src + (sx - noseW) * p.srcC, p, kY + sy, kW - sx, w, bias, params, dst); - } - src += (sy - noseH)*p.srcW*p.srcC; - for (; sy < bodyH; sy += p.strideY) - { - size_t sx = 0; - for (; sx < noseW; sx += p.strideX, dst += p.dstC) - KernelHwcDefaultEdge(src, p, p.kernelY, kX + sx, weight + (noseW - sx)*wS, bias, params, dst); - for (; sx < bodyW6; sx += 6 * p.strideX, dst += 6 * p.dstC) - KernelHwcDefaultBody6(src + (sx - noseW) * p.srcC, p, weight, bias, params, dst); - for (; sx < bodyW2; sx += 2 * p.strideX, dst += 2 * p.dstC) - KernelHwcDefaultBody2(src + (sx - noseW) * p.srcC, p, weight, bias, params, dst); - for (; sx < bodyW; sx += p.strideX, dst += p.dstC) - KernelHwcDefaultEdge(src + (sx - noseW) * p.srcC, p, p.kernelY, p.kernelX, weight, bias, params, dst); - for (; sx < tailW; sx += p.strideX, dst += p.dstC) - KernelHwcDefaultEdge(src + (sx - noseW) * p.srcC, p, p.kernelY, kW - sx, weight, bias, params, dst); - src += p.strideY*p.srcW*p.srcC; - } - for (; sy < tailH; sy += p.strideY) - { - size_t sx = 0; - for (; sx < noseW; sx += p.strideX, dst += p.dstC) - KernelHwcDefaultEdge(src, p, kH - sy, kX + sx, weight + (noseW - sx)*wS, bias, params, dst); - for (; sx < bodyW; sx += p.strideX, dst += p.dstC) - KernelHwcDefaultEdge(src + (sx - noseW) * p.srcC, p, kH - sy, p.kernelX, weight, bias, params, dst); - for (; sx < tailW; sx += p.strideX, dst += p.dstC) - KernelHwcDefaultEdge(src + (sx - noseW) * p.srcC, p, kH - sy, kW - sx, weight, bias, params, dst); - src += p.strideY*p.srcW*p.srcC; - } - } - - template<::SimdConvolutionActivationType type> void ConvolutionDirectNhwcConvolutionBiasActivationDepthwise(const float * src, const ConvParam32f & p, const float * weight, const float * bias, const float 
* params, float * dst) - { - size_t size = p.group; - size_t sizeF = AlignLo(size, F); - size_t size2F = AlignLo(size, 2 * F); - size_t size4F = AlignLo(size, 4 * F); - size_t size8F = AlignLo(size, 8 * F); - for (size_t dy = 0; dy < p.dstH; ++dy) - { - for (size_t dx = 0; dx < p.dstW; ++dx) - { - size_t i = 0; - for (; i < size8F; i += 8 * F) - { - float32x4_t sums[8]; - if (bias) - { - sums[0] = Load(bias + i + 0 * F); - sums[1] = Load(bias + i + 1 * F); - sums[2] = Load(bias + i + 2 * F); - sums[3] = Load(bias + i + 3 * F); - sums[4] = Load(bias + i + 4 * F); - sums[5] = Load(bias + i + 5 * F); - sums[6] = Load(bias + i + 6 * F); - sums[7] = Load(bias + i + 7 * F); - } - else - { - sums[0] = vdupq_n_f32(0.0f); - sums[1] = vdupq_n_f32(0.0f); - sums[2] = vdupq_n_f32(0.0f); - sums[3] = vdupq_n_f32(0.0f); - sums[4] = vdupq_n_f32(0.0f); - sums[5] = vdupq_n_f32(0.0f); - sums[6] = vdupq_n_f32(0.0f); - sums[7] = vdupq_n_f32(0.0f); - } - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t sy = dy * p.strideY + ky * p.dilationY - p.padY; - if (sy < p.srcH) - { - for (size_t kx = 0; kx < p.kernelX; ++kx) - { - size_t sx = dx * p.strideX + kx * p.dilationX - p.padX; - if (sx < p.srcW) - { - const float * pw = weight + (ky*p.kernelX + kx)*size + i; - const float * ps = src + (sy*p.srcW + sx)*size + i; - sums[0] = vmlaq_f32(sums[0], Load(ps + 0 * F), Load(pw + 0 * F)); - sums[1] = vmlaq_f32(sums[1], Load(ps + 1 * F), Load(pw + 1 * F)); - sums[2] = vmlaq_f32(sums[2], Load(ps + 2 * F), Load(pw + 2 * F)); - sums[3] = vmlaq_f32(sums[3], Load(ps + 3 * F), Load(pw + 3 * F)); - sums[4] = vmlaq_f32(sums[4], Load(ps + 4 * F), Load(pw + 4 * F)); - sums[5] = vmlaq_f32(sums[5], Load(ps + 5 * F), Load(pw + 5 * F)); - sums[6] = vmlaq_f32(sums[6], Load(ps + 6 * F), Load(pw + 6 * F)); - sums[7] = vmlaq_f32(sums[7], Load(ps + 7 * F), Load(pw + 7 * F)); - } - } - } - } - Store(dst + i + 0 * F, Activate(sums[0], params, i + 0 * F)); - Store(dst + i + 1 * F, Activate(sums[1], params, i + 1 * F)); - Store(dst + i + 2 * F, Activate(sums[2], params, i + 2 * F)); - Store(dst + i + 3 * F, Activate(sums[3], params, i + 3 * F)); - Store(dst + i + 4 * F, Activate(sums[4], params, i + 4 * F)); - Store(dst + i + 5 * F, Activate(sums[5], params, i + 5 * F)); - Store(dst + i + 6 * F, Activate(sums[6], params, i + 6 * F)); - Store(dst + i + 7 * F, Activate(sums[7], params, i + 7 * F)); - } - for (; i < size4F; i += 4 * F) - { - float32x4_t sums[4]; - if (bias) - { - sums[0] = Load(bias + i + 0 * F); - sums[1] = Load(bias + i + 1 * F); - sums[2] = Load(bias + i + 2 * F); - sums[3] = Load(bias + i + 3 * F); - } - else - { - sums[0] = vdupq_n_f32(0.0f); - sums[1] = vdupq_n_f32(0.0f); - sums[2] = vdupq_n_f32(0.0f); - sums[3] = vdupq_n_f32(0.0f); - } - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t sy = dy * p.strideY + ky * p.dilationY - p.padY; - if (sy < p.srcH) - { - for (size_t kx = 0; kx < p.kernelX; ++kx) - { - size_t sx = dx * p.strideX + kx * p.dilationX - p.padX; - if (sx < p.srcW) - { - const float * pw = weight + (ky*p.kernelX + kx)*size + i; - const float * ps = src + (sy*p.srcW + sx)*size + i; - sums[0] = vmlaq_f32(sums[0], Load(ps + 0 * F), Load(pw + 0 * F)); - sums[1] = vmlaq_f32(sums[1], Load(ps + 1 * F), Load(pw + 1 * F)); - sums[2] = vmlaq_f32(sums[2], Load(ps + 2 * F), Load(pw + 2 * F)); - sums[3] = vmlaq_f32(sums[3], Load(ps + 3 * F), Load(pw + 3 * F)); - } - } - } - } - Store(dst + i + 0 * F, Activate(sums[0], params, i + 0 * F)); - Store(dst + i + 1 * F, Activate(sums[1], params, i + 1 * F)); - Store(dst 
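/* Scalar reference for the depthwise convolution (group == channel count)
   that the unrolled 8F/4F/2F blocks above and below vectorize, simplified to
   dilation == 1. Note the unsigned wrap: sy and sx underflow to huge values
   when a tap falls in the top/left padding, so a single `< srcH` / `< srcW`
   test skips padding taps, exactly as in the vector code. Names are ours: */
static void DepthwiseRef(const float* src, size_t srcH, size_t srcW, size_t C,
                         size_t kernelY, size_t kernelX, size_t strideY, size_t strideX,
                         size_t padY, size_t padX, const float* weight,
                         size_t dstH, size_t dstW, float* dst)
{
    for (size_t dy = 0; dy < dstH; ++dy)
        for (size_t dx = 0; dx < dstW; ++dx)
            for (size_t c = 0; c < C; ++c)
            {
                float sum = 0.0f;
                for (size_t ky = 0; ky < kernelY; ++ky)
                {
                    size_t sy = dy * strideY + ky - padY; // wraps to huge value in padding
                    if (sy >= srcH) continue;
                    for (size_t kx = 0; kx < kernelX; ++kx)
                    {
                        size_t sx = dx * strideX + kx - padX;
                        if (sx >= srcW) continue;
                        sum += src[(sy * srcW + sx) * C + c] * weight[(ky * kernelX + kx) * C + c];
                    }
                }
                dst[(dy * dstW + dx) * C + c] = sum;
            }
}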
+ i + 2 * F, Activate(sums[2], params, i + 2 * F)); - Store(dst + i + 3 * F, Activate(sums[3], params, i + 3 * F)); - } - for (; i < size2F; i += 2 * F) - { - float32x4_t sums[2]; - if (bias) - { - sums[0] = Load(bias + i + 0 * F); - sums[1] = Load(bias + i + 1 * F); - } - else - { - sums[0] = vdupq_n_f32(0.0f); - sums[1] = vdupq_n_f32(0.0f); - } - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t sy = dy * p.strideY + ky * p.dilationY - p.padY; - if (sy < p.srcH) - { - for (size_t kx = 0; kx < p.kernelX; ++kx) - { - size_t sx = dx * p.strideX + kx * p.dilationX - p.padX; - if (sx < p.srcW) - { - const float * pw = weight + (ky*p.kernelX + kx)*size + i; - const float * ps = src + (sy*p.srcW + sx)*size + i; - sums[0] = vmlaq_f32(sums[0], Load(ps + 0 * F), Load(pw + 0 * F)); - sums[1] = vmlaq_f32(sums[1], Load(ps + 1 * F), Load(pw + 1 * F)); - } - } - } - } - Store(dst + i + 0 * F, Activate(sums[0], params, i + 0 * F)); - Store(dst + i + 1 * F, Activate(sums[1], params, i + 1 * F)); - } - for (; i < size; i += F) - { - size_t ci = i >= sizeF ? size - F : i; - float32x4_t sum = bias ? Load(bias + ci) : vdupq_n_f32(0.0f); - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t sy = dy * p.strideY + ky * p.dilationY - p.padY; - if (sy < p.srcH) - { - for (size_t kx = 0; kx < p.kernelX; ++kx) - { - size_t sx = dx * p.strideX + kx * p.dilationX - p.padX; - if (sx < p.srcW) - { - const float * pw = weight + (ky*p.kernelX + kx)*size + ci; - const float * ps = src + (sy*p.srcW + sx)*size + ci; - sum = vmlaq_f32(sum, Load(ps + 0 * F), Load(pw + 0 * F)); - } - } - } - } - Store(dst + ci, Activate(sum, params, ci)); - } - dst += p.dstC; - } - } - } - - template<::SimdConvolutionActivationType type> - SIMD_INLINE void ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge(const float * src, const ConvParam32f & p, size_t dy, size_t dx, const float * weight, const float * bias, const float * params, float * dst) - { - size_t srcC = p.srcC; - size_t srcCF = AlignLo(srcC, F); - size_t c = 0; - for (; c < srcCF; c += F) - { - float32x4_t sum = bias ? Load(bias + c) : vdupq_n_f32(0.0f); - for (size_t ky = 0; ky < 3; ++ky) - { - size_t sy = dy * p.strideY + ky - p.padY; - if (sy < p.srcH) - { - for (size_t kx = 0; kx < 3; ++kx) - { - size_t sx = dx * p.strideX + kx - p.padX; - if (sx < p.srcW) - { - const float * pw = weight + (ky * 3 + kx) * srcC; - const float * ps = src + (sy*p.srcW + sx) * srcC; - sum = vmlaq_f32(sum, Load(ps), Load(pw)); - } - } - } - } - Store(dst + c, Activate(sum, params, c)); - src += F; - weight += F; - } - if (c < srcC) - { - c = p.srcC - F; - src -= srcCF - c; - weight -= srcCF - c; - float32x4_t sum = bias ? Load(bias + c) : vdupq_n_f32(0.0f); - for (size_t ky = 0; ky < 3; ++ky) - { - size_t sy = dy * p.strideY + ky - p.padY; - if (sy < p.srcH) - { - for (size_t kx = 0; kx < 3; ++kx) - { - size_t sx = dx * p.strideX + kx - p.padX; - if (sx < p.srcW) - { - const float * pw = weight + (ky * 3 + kx) * srcC; - const float * ps = src + (sy*p.srcW + sx) * srcC; - sum = vmlaq_f32(sum, Load(ps), Load(pw)); - } - } - } - } - Store(dst + c, Activate(sum, params, c)); - } - } - - template<::SimdConvolutionActivationType type> - SIMD_INLINE void ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main1(const float * src, size_t srcS, size_t srcC, const float * weight, const float * bias, const float * params, float * dst) - { - size_t srcCF = AlignLo(srcC, F); - size_t c = 0; - for (; c < srcCF; c += F) - { - float32x4_t sum = bias ? 
Load(bias + c) : vdupq_n_f32(0.0f); - for (size_t ky = 0; ky < 3; ++ky) - { - const float * ps = src + ky * srcS; - const float * pw = weight + ky * 3 * srcC; - sum = vmlaq_f32(sum, Load(ps + 0 * srcC), Load(pw + 0 * srcC)); - sum = vmlaq_f32(sum, Load(ps + 1 * srcC), Load(pw + 1 * srcC)); - sum = vmlaq_f32(sum, Load(ps + 2 * srcC), Load(pw + 2 * srcC)); - } - Store(dst + c, Activate(sum, params, c)); - src += F; - weight += F; - } - if (c < srcC) - { - c = srcC - F; - src -= srcCF - c; - weight -= srcCF - c; - float32x4_t sum = bias ? Load(bias + c) : vdupq_n_f32(0.0f); - for (size_t ky = 0; ky < 3; ++ky) - { - const float * ps = src + ky * srcS; - const float * pw = weight + ky * 3 * srcC; - sum = vmlaq_f32(sum, Load(ps + 0 * srcC), Load(pw + 0 * srcC)); - sum = vmlaq_f32(sum, Load(ps + 1 * srcC), Load(pw + 1 * srcC)); - sum = vmlaq_f32(sum, Load(ps + 2 * srcC), Load(pw + 2 * srcC)); - } - Store(dst + c, Activate(sum, params, c)); - } - } - - template<::SimdConvolutionActivationType type> - SIMD_INLINE void ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main2(const float * src, size_t srcS, size_t srcX, size_t srcC, const float * weight, const float * bias, const float * params, float * dst) - { - size_t srcCF = AlignLo(srcC, F); - size_t c = 0; - float32x4_t sum0, sum1, w0; - for (; c < srcCF; c += F) - { - sum0 = bias ? Load(bias + c) : vdupq_n_f32(0.0f); - sum1 = sum0; - const float * pw = weight + c; - for (size_t ky = 0; ky < 3; ++ky) - { - const float * ps0 = src + ky * srcS; - const float * ps1 = ps0 + srcX; - w0 = Load(pw); - sum0 = vmlaq_f32(sum0, Load(ps0 + 0 * srcC), w0); - sum1 = vmlaq_f32(sum1, Load(ps1 + 0 * srcC), w0); - pw += srcC; - w0 = Load(pw); - sum0 = vmlaq_f32(sum0, Load(ps0 + 1 * srcC), w0); - sum1 = vmlaq_f32(sum1, Load(ps1 + 1 * srcC), w0); - pw += srcC; - w0 = Load(pw); - sum0 = vmlaq_f32(sum0, Load(ps0 + 2 * srcC), w0); - sum1 = vmlaq_f32(sum1, Load(ps1 + 2 * srcC), w0); - pw += srcC; - } - Store(dst + c, Activate(sum0, params, c)); - Store(dst + c + srcC, Activate(sum1, params, c)); - src += F; - } - if (c < srcC) - { - c = srcC - F; - src -= srcCF - c; - sum0 = bias ? Load(bias + c) : vdupq_n_f32(0.0f); - sum1 = sum0; - const float * pw = weight + c; - for (size_t ky = 0; ky < 3; ++ky) - { - const float * ps0 = src + ky * srcS; - const float * ps1 = ps0 + srcX; - w0 = Load(pw); - sum0 = vmlaq_f32(sum0, Load(ps0 + 0 * srcC), w0); - sum1 = vmlaq_f32(sum1, Load(ps1 + 0 * srcC), w0); - pw += srcC; - w0 = Load(pw); - sum0 = vmlaq_f32(sum0, Load(ps0 + 1 * srcC), w0); - sum1 = vmlaq_f32(sum1, Load(ps1 + 1 * srcC), w0); - pw += srcC; - w0 = Load(pw); - sum0 = vmlaq_f32(sum0, Load(ps0 + 2 * srcC), w0); - sum1 = vmlaq_f32(sum1, Load(ps1 + 2 * srcC), w0); - pw += srcC; - } - Store(dst + c, Activate(sum0, params, c)); - Store(dst + c + srcC, Activate(sum1, params, c)); - } - } - - template<::SimdConvolutionActivationType type> - SIMD_INLINE void ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4(const float * src, size_t srcS, size_t srcX, size_t srcC, const float * weight, const float * bias, const float * params, float * dst) - { - size_t srcCF = AlignLo(srcC, F); - size_t c = 0; - for (; c < srcCF; c += F) - { - float32x4_t sum0, sum1, sum2, sum3, w0; - sum0 = bias ? 
Load(bias + c) : vdupq_n_f32(0.0f); - sum1 = sum0; - sum2 = sum0; - sum3 = sum0; - const float * pw = weight + c; - const float * ps0 = src + 0 * srcX; - const float * ps1 = src + 1 * srcX; - const float * ps2 = src + 2 * srcX; - const float * ps3 = src + 3 * srcX; - for (size_t ky = 0; ky < 3; ++ky) - { - size_t offset = ky * srcS; - w0 = Load(pw); - sum0 = vmlaq_f32(sum0, Load(ps0 + offset), w0); - sum1 = vmlaq_f32(sum1, Load(ps1 + offset), w0); - sum2 = vmlaq_f32(sum2, Load(ps2 + offset), w0); - sum3 = vmlaq_f32(sum3, Load(ps3 + offset), w0); - pw += srcC, offset += srcC; - w0 = Load(pw); - sum0 = vmlaq_f32(sum0, Load(ps0 + offset), w0); - sum1 = vmlaq_f32(sum1, Load(ps1 + offset), w0); - sum2 = vmlaq_f32(sum2, Load(ps2 + offset), w0); - sum3 = vmlaq_f32(sum3, Load(ps3 + offset), w0); - pw += srcC, offset += srcC; - w0 = Load(pw); - sum0 = vmlaq_f32(sum0, Load(ps0 + offset), w0); - sum1 = vmlaq_f32(sum1, Load(ps1 + offset), w0); - sum2 = vmlaq_f32(sum2, Load(ps2 + offset), w0); - sum3 = vmlaq_f32(sum3, Load(ps3 + offset), w0); - pw += srcC, offset += srcC; - } - Store(dst + 0 * srcC, Activate(sum0, params, c)); - Store(dst + 1 * srcC, Activate(sum1, params, c)); - Store(dst + 2 * srcC, Activate(sum2, params, c)); - Store(dst + 3 * srcC, Activate(sum3, params, c)); - src += F; - dst += F; - } - if (c < srcC) - { - c = srcC - F; - src -= srcCF - c; - dst -= srcCF - c; - float32x4_t sum0, sum1, sum2, sum3, w0; - sum0 = bias ? Load(bias + c) : vdupq_n_f32(0.0f); - sum1 = sum0; - sum2 = sum0; - sum3 = sum0; - const float * pw = weight + c; - const float * ps0 = src + 0 * srcX; - const float * ps1 = src + 1 * srcX; - const float * ps2 = src + 2 * srcX; - const float * ps3 = src + 3 * srcX; - for (size_t ky = 0; ky < 3; ++ky) - { - size_t offset = ky * srcS; - w0 = Load(pw); - sum0 = vmlaq_f32(sum0, Load(ps0 + offset), w0); - sum1 = vmlaq_f32(sum1, Load(ps1 + offset), w0); - sum2 = vmlaq_f32(sum2, Load(ps2 + offset), w0); - sum3 = vmlaq_f32(sum3, Load(ps3 + offset), w0); - pw += srcC, offset += srcC; - w0 = Load(pw); - sum0 = vmlaq_f32(sum0, Load(ps0 + offset), w0); - sum1 = vmlaq_f32(sum1, Load(ps1 + offset), w0); - sum2 = vmlaq_f32(sum2, Load(ps2 + offset), w0); - sum3 = vmlaq_f32(sum3, Load(ps3 + offset), w0); - pw += srcC, offset += srcC; - w0 = Load(pw); - sum0 = vmlaq_f32(sum0, Load(ps0 + offset), w0); - sum1 = vmlaq_f32(sum1, Load(ps1 + offset), w0); - sum2 = vmlaq_f32(sum2, Load(ps2 + offset), w0); - sum3 = vmlaq_f32(sum3, Load(ps3 + offset), w0); - pw += srcC, offset += srcC; - } - Store(dst + 0 * srcC, Activate(sum0, params, c)); - Store(dst + 1 * srcC, Activate(sum1, params, c)); - Store(dst + 2 * srcC, Activate(sum2, params, c)); - Store(dst + 3 * srcC, Activate(sum3, params, c)); - } - } - - template<::SimdConvolutionActivationType type> - SIMD_INLINE void ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge4(const float * src, const ConvParam32f & p, size_t dy, size_t dx, const float32x4_t * weight, float32x4_t bias, const float * params, float * dst) - { - float32x4_t sum = bias; - for (size_t ky = 0; ky < 3; ++ky) - { - size_t sy = dy * p.strideY + ky - p.padY; - if (sy < p.srcH) - { - for (size_t kx = 0; kx < 3; ++kx) - { - size_t sx = dx * p.strideX + kx - p.padX; - if (sx < p.srcW) - { - const float * ps = src + (sy*p.srcW + sx) * F; - sum = vmlaq_f32(sum, Load(ps), weight[ky * 3 + kx]); - } - } - } - } - Store(dst, Activate(sum, params, 0)); - } - - template<::SimdConvolutionActivationType type> - SIMD_INLINE void 
ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4x1(const float * src, size_t srcS, const float32x4_t * weight, float32x4_t bias, const float * params, float * dst)
-        {
-            float32x4_t sum = bias;
-            sum = vmlaq_f32(sum, Load(src + 0 * F), weight[0]);
-            sum = vmlaq_f32(sum, Load(src + 1 * F), weight[1]);
-            sum = vmlaq_f32(sum, Load(src + 2 * F), weight[2]);
-            src += srcS;
-            sum = vmlaq_f32(sum, Load(src + 0 * F), weight[3]);
-            sum = vmlaq_f32(sum, Load(src + 1 * F), weight[4]);
-            sum = vmlaq_f32(sum, Load(src + 2 * F), weight[5]);
-            src += srcS;
-            sum = vmlaq_f32(sum, Load(src + 0 * F), weight[6]);
-            sum = vmlaq_f32(sum, Load(src + 1 * F), weight[7]);
-            sum = vmlaq_f32(sum, Load(src + 2 * F), weight[8]);
-            Store(dst, Activate(sum, params, 0));
-        }
-
-        template<::SimdConvolutionActivationType type>
-        SIMD_INLINE void ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4x2(const float * src, size_t srcS, const float32x4_t * weight, float32x4_t bias, const float * params, float * dst)
-        {
-            float32x4_t sum0 = bias;
-            float32x4_t sum1 = bias;
-            for (size_t ky = 0; ky < 3; ++ky)
-            {
-                float32x4_t s0 = Load(src + 0 * F);
-                float32x4_t s1 = Load(src + 1 * F);
-                float32x4_t s2 = Load(src + 2 * F);
-                float32x4_t s3 = Load(src + 3 * F);
-                sum0 = vmlaq_f32(sum0, s0, weight[0]);
-                sum1 = vmlaq_f32(sum1, s1, weight[0]);
-                sum0 = vmlaq_f32(sum0, s1, weight[1]);
-                sum1 = vmlaq_f32(sum1, s2, weight[1]);
-                sum0 = vmlaq_f32(sum0, s2, weight[2]);
-                sum1 = vmlaq_f32(sum1, s3, weight[2]);
-                src += srcS;
-                weight += 3;
-            }
-            Store(dst + 0, Activate(sum0, params, 0));
-            Store(dst + F, Activate(sum1, params, 0));
-        }
-
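// [Editorial sketch -- not part of the deleted file above.] Main4x2 computes two
// horizontally adjacent depthwise outputs per call. Because it is only selected
// when strideX == 1, the middle pixel vectors (s1, s2) are shared between the
// two accumulators, so each kernel row costs four vector loads instead of six.
// A minimal standalone illustration of that load sharing for one row of a 3-tap
// per-channel filter (assumes F == 4 floats per vector; the function name is
// made up for this note):
#include <arm_neon.h>
// src: 4 consecutive NHWC pixels of a 4-channel image (16 floats);
// w[0..2]: the three per-channel taps; out0/out1: two adjacent outputs.
static inline void Filter3TapTwoOutputs(const float* src, const float32x4_t w[3],
    float32x4_t& out0, float32x4_t& out1)
{
    float32x4_t s0 = vld1q_f32(src + 0);   // pixel x + 0
    float32x4_t s1 = vld1q_f32(src + 4);   // pixel x + 1, used by both outputs
    float32x4_t s2 = vld1q_f32(src + 8);   // pixel x + 2, used by both outputs
    float32x4_t s3 = vld1q_f32(src + 12);  // pixel x + 3
    out0 = vmlaq_f32(vmlaq_f32(vmulq_f32(s0, w[0]), s1, w[1]), s2, w[2]);
    out1 = vmlaq_f32(vmlaq_f32(vmulq_f32(s1, w[0]), s2, w[1]), s3, w[2]);
}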
-        template<::SimdConvolutionActivationType type> void ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3(const float * src, const ConvParam32f & p, const float * weight, const float * bias, const float * params, float * dst)
-        {
-            size_t srcS = p.srcC*p.srcW;
-            size_t srcX = p.srcC*p.strideX;
-            size_t dstH = p.dstH - p.padH;
-            size_t dstW = p.dstW - p.padW;
-            size_t dstW2 = AlignLo(dstW - p.padX, 2) + p.padX;
-            size_t dstW4 = AlignLo(dstW - p.padX, 4) + p.padX;
-            if (p.dstC == F && p.strideX == 1)
-            {
-                float32x4_t _weight[9];
-                for (size_t i = 0; i < 9; ++i)
-                    _weight[i] = Load(weight + i * F);
-                float32x4_t _bias = bias ? Load(bias) : vdupq_n_f32(0.0f);
-                size_t dy = 0;
-                for (; dy < p.padY; ++dy)
-                    for (size_t dx = 0; dx < p.dstW; ++dx)
-                        ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge4(src, p, dy, dx, _weight, _bias, params, dst), dst += F;
-                for (; dy < dstH; ++dy)
-                {
-                    size_t dx = 0;
-                    for (; dx < p.padX; ++dx)
-                        ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge4(src, p, dy, dx, _weight, _bias, params, dst), dst += F;
-                    size_t offset = ((dy * p.strideY - p.padY)*p.srcW + dx * p.strideX - p.padX)*p.srcC;
-                    for (; dx < dstW2; dx += 2)
-                        ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4x2(src + offset, srcS, _weight, _bias, params, dst), offset += 2 * F, dst += 2 * F;
-                    for (; dx < dstW; ++dx)
-                        ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4x1(src + offset, srcS, _weight, _bias, params, dst), offset += F, dst += F;
-                    for (; dx < p.dstW; ++dx)
-                        ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge4(src, p, dy, dx, _weight, _bias, params, dst), dst += F;
-                }
-                for (; dy < p.dstH; ++dy)
-                    for (size_t dx = 0; dx < p.dstW; ++dx)
-                        ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge4(src, p, dy, dx, _weight, _bias, params, dst), dst += F;
-            }
-            else
-            {
-                size_t dy = 0;
-                for (; dy < p.padY; ++dy)
-                    for (size_t dx = 0; dx < p.dstW; ++dx)
-                        ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge(src, p, dy, dx, weight, bias, params, dst), dst += p.dstC;
-                for (; dy < dstH; ++dy)
-                {
-                    size_t dx = 0;
-                    for (; dx < p.padX; ++dx)
-                        ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge(src, p, dy, dx, weight, bias, params, dst), dst += p.dstC;
-                    size_t offset = ((dy * p.strideY - p.padY)*p.srcW + dx * p.strideX - p.padX)*p.srcC;
-                    for (; dx < dstW4; dx += 4)
-                        ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4(src + offset, srcS, srcX, p.srcC, weight, bias, params, dst), dst += 4 * p.dstC, offset += 4 * srcX;
-                    for (; dx < dstW2; dx += 2)
-                        ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main2(src + offset, srcS, srcX, p.srcC, weight, bias, params, dst), dst += 2 * p.dstC, offset += 2 * srcX;
-                    for (; dx < dstW; ++dx)
-                        ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main1(src + offset, srcS, p.srcC, weight, bias, params, dst), dst += p.dstC, offset += srcX;
-                    for (; dx < p.dstW; ++dx)
-                        ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge(src, p, dy, dx, weight, bias, params, dst), dst += p.dstC;
-                }
-                for (; dy < p.dstH; ++dy)
-                    for (size_t dx = 0; dx < p.dstW; ++dx)
-                        ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge(src, p, dy, dx, weight, bias, params, dst), dst += p.dstC;
-            }
-        }
-
-        template <::SimdConvolutionActivationType type> SynetConvolution32fDirectNhwc::ConvolutionBiasActivationPtr GetConvolutionBiasActivation(const ConvParam32f & p)
-        {
-            if (p.group == 1)
-                return ConvolutionDirectNhwcConvolutionBiasActivationDefault;
-            else if (p.IsDepthwise())
-            {
-                if (p.IsKernel(3) && p.IsDilation(1))
-                    return ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3;
-                else
-                    return ConvolutionDirectNhwcConvolutionBiasActivationDepthwise;
-            }
-            return NULL;
-        }
-
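// [Editorial sketch -- not part of the deleted file above.] The dispatcher above
// splits the output plane into pad-affected border cells, handled by the Edge
// kernels with per-tap bounds checks, and an interior body, handled by the Main
// kernels with no checks at all, 4/2/1 columns at a time. The row/column split
// reduces to the following schematic loop (EdgeCell/BodyCell are hypothetical
// callbacks standing in for the kernels; bodyH/bodyW mark the unchecked region):
#include <cstddef>
template<class EdgeCell, class BodyCell>
void ForEachOutput(std::size_t dstH, std::size_t dstW, std::size_t padY, std::size_t padX,
    std::size_t bodyH, std::size_t bodyW, EdgeCell edge, BodyCell body)
{
    for (std::size_t dy = 0; dy < dstH; ++dy)
        for (std::size_t dx = 0; dx < dstW; ++dx)
            if (dy < padY || dy >= bodyH || dx < padX || dx >= bodyW)
                edge(dy, dx);  // some 3x3 taps may fall outside the source: check bounds
            else
                body(dy, dx);  // all taps are in range: the fast path needs no checks
}
// Hoisting the boundary tests out of the per-tap inner loops is what lets the
// Main kernels above run branch-free over the bulk of the image.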
-        SynetConvolution32fDirectNhwc::ConvolutionBiasActivationPtr SynetConvolution32fDirectNhwc::SetConvolutionBiasActivation()
-        {
-            const ConvParam32f & p = _param;
-            SynetConvolution32fDirectNhwc::ConvolutionBiasActivationPtr func = NULL;
-            if (p.dstC >= F && p.dstH >= p.padY + p.padH && p.dstW >= p.padX + p.padW)
-            {
-                switch (p.activation)
-                {
-                case ::SimdConvolutionActivationIdentity: func = GetConvolutionBiasActivation<::SimdConvolutionActivationIdentity>(p); break;
-                case ::SimdConvolutionActivationRelu: func = GetConvolutionBiasActivation<::SimdConvolutionActivationRelu>(p); break;
-                case ::SimdConvolutionActivationLeakyRelu: func = GetConvolutionBiasActivation<::SimdConvolutionActivationLeakyRelu>(p); break;
-                case ::SimdConvolutionActivationRestrictRange: func = GetConvolutionBiasActivation<::SimdConvolutionActivationRestrictRange>(p); break;
-                case ::SimdConvolutionActivationPrelu: func = GetConvolutionBiasActivation<::SimdConvolutionActivationPrelu>(p); break;
-                case ::SimdConvolutionActivationElu: func = GetConvolutionBiasActivation<::SimdConvolutionActivationElu>(p); break;
-                case ::SimdConvolutionActivationHswish: func = GetConvolutionBiasActivation<::SimdConvolutionActivationHswish>(p); break;
-                }
-            }
-            return func ? func : Base::SynetConvolution32fDirectNhwc::SetConvolutionBiasActivation();
-        };
-
-        //---------------------------------------------------------------------
-
-        SynetConvolution32fDepthwiseDotProduct::SynetConvolution32fDepthwiseDotProduct(const ConvParam32f & p)
-            : Base::SynetConvolution32fDepthwiseDotProduct(p)
-        {
-        }
-
-        SIMD_INLINE void DotProduct(const float * a, const float * b, size_t offset, float32x4_t & sum)
-        {
-            float32x4_t _a = Load(a + offset);
-            float32x4_t _b = Load(b + offset);
-            sum = vmlaq_f32(sum, _a, _b);
-        }
-
-        SIMD_INLINE float DotProduct(const float * a, const float * b, size_t size)
-        {
-            float sum = 0;
-            size_t partialAlignedSize = AlignLo(size, F);
-            size_t fullAlignedSize = AlignLo(size, QF);
-            size_t i = 0;
-            if (partialAlignedSize)
-            {
-                float32x4_t sums[4] = { vdupq_n_f32(0.0f), vdupq_n_f32(0.0f), vdupq_n_f32(0.0f), vdupq_n_f32(0.0f) };
-                if (fullAlignedSize)
-                {
-                    for (; i < fullAlignedSize; i += QF)
-                    {
-                        DotProduct(a, b, i + F * 0, sums[0]);
-                        DotProduct(a, b, i + F * 1, sums[1]);
-                        DotProduct(a, b, i + F * 2, sums[2]);
-                        DotProduct(a, b, i + F * 3, sums[3]);
-                    }
-                    sums[0] = vaddq_f32(vaddq_f32(sums[0], sums[1]), vaddq_f32(sums[2], sums[3]));
-                }
-                for (; i < partialAlignedSize; i += F)
-                    DotProduct(a, b, i, sums[0]);
-                sum += ExtractSum32f(sums[0]);
-            }
-            for (; i < size; ++i)
-                sum += a[i] * b[i];
-            return sum;
-        }
-
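// [Editorial sketch -- not part of the deleted file above.] DotProduct above keeps
// four independent accumulators across a QF = 4 * F unrolled loop so consecutive
// vmlaq_f32 operations do not stall on each other, folds the accumulators
// pairwise, and finishes the remainder in scalar code. A self-contained
// equivalent with the library's ExtractSum32f replaced by explicit NEON
// horizontal adds (assumes F == 4, QF == 16):
#include <arm_neon.h>
#include <stddef.h>
static float DotProductSketch(const float* a, const float* b, size_t size)
{
    float32x4_t sums[4] = { vdupq_n_f32(0.0f), vdupq_n_f32(0.0f),
                            vdupq_n_f32(0.0f), vdupq_n_f32(0.0f) };
    size_t i = 0;
    for (; i + 16 <= size; i += 16)  // four independent multiply-accumulate chains
    {
        sums[0] = vmlaq_f32(sums[0], vld1q_f32(a + i + 0), vld1q_f32(b + i + 0));
        sums[1] = vmlaq_f32(sums[1], vld1q_f32(a + i + 4), vld1q_f32(b + i + 4));
        sums[2] = vmlaq_f32(sums[2], vld1q_f32(a + i + 8), vld1q_f32(b + i + 8));
        sums[3] = vmlaq_f32(sums[3], vld1q_f32(a + i + 12), vld1q_f32(b + i + 12));
    }
    float32x4_t s = vaddq_f32(vaddq_f32(sums[0], sums[1]), vaddq_f32(sums[2], sums[3]));
    float32x2_t s2 = vadd_f32(vget_low_f32(s), vget_high_f32(s));
    float sum = vget_lane_f32(vpadd_f32(s2, s2), 0);  // horizontal reduction to a scalar
    for (; i < size; ++i)
        sum += a[i] * b[i];  // scalar tail for the unaligned remainder
    return sum;
}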
-        void SynetConvolution32fDepthwiseDotProduct::Forward(const float * src, float * buf, float * dst)
-        {
-            for (size_t b = 0; b < _batch; ++b)
-            {
-                if (_bias)
-                {
-                    for (size_t i = 0; i < _count; ++i)
-                        dst[i] = DotProduct(src + i * _size, _weight + i * _size, _size) + _bias[i];
-                }
-                else
-                {
-                    for (size_t i = 0; i < _count; ++i)
-                        dst[i] = DotProduct(src + i * _size, _weight + i * _size, _size);
-                }
-                if (_param.activation)
-                    ConvolutionBiasAndActivation(NULL, _count, 1, _param.activation, _params, ::SimdFalse, dst);
-                src += _sizeS;
-                dst += _sizeD;
-            }
-        }
-
-        //---------------------------------------------------------------------
-
-        SynetConvolution32fNhwcDirect::SynetConvolution32fNhwcDirect(const ConvParam32f& p)
-            : Base::SynetConvolution32fNhwcDirect(p)
-        {
-#ifdef SIMD_SYNET_CONVOLUTION_NHWC_DIRECT_OLD
-            //_old.enable = true;
-            if (_old.enable)
-            {
-                if (Set2f(p, _old.convolution))
-                    OldSetAlgParam(F);
-            }
-            else
-#endif
-            {
-                RunFuncs funcs;
-                for (size_t n = 2; n <= 4; ++n)
-                {
-                    funcs.push_back(RunFunc(Ext() + "-" + ToStr(n)));
-                    SetAlgParam(F, n, funcs.back().alg);
-                    if (!SetRt(p, funcs.back().alg))
-                        return;
-                }
-                _run.Init(funcs);
-            }
-        }
-
-        bool SynetConvolution32fNhwcDirect::SetRt(const ConvParam32f& p, AlgParam& a)
-        {
-            switch (a.microD)
-            {
-            case 2 * F: return Set2r(p, a);
-            case 3 * F: return Set3r(p, a);
-            case 4 * F: return Set4r(p, a);
-            default:
-                return false;
-            }
-        }
-
-        bool SynetConvolution32fNhwcDirect::Preferable(const ConvParam32f& p)
-        {
-            if (p.trans != SimdTrue || p.group != 1 || !p.IsDilation(1))
-                return false;
-            if (!p.Is1x1() && p.dstW < 6 + p.padX + p.padY)
-                return false;
-            if (p.Is1x1() && (p.srcC >= 2 * p.dstC || (p.activation == SimdConvolutionActivationIdentity && p.srcC > 128) || p.srcC > 256))
-                return false;
-            if (p.kernelY > p.srcH || p.kernelX > p.srcW)
-                return false;
-            return true;
-        }
-
-        //---------------------------------------------------------------------
-
-        void * SynetConvolution32fInit(size_t batch, const SimdConvolutionParameters * conv, SimdGemm32fNNPtr gemm)
-        {
-            ConvParam32f param(batch, conv, gemm);
-            if (!param.Valid())
-                return NULL;
-            else if (SynetConvolution32fDepthwiseDotProduct::Preferable(param))
-                return new SynetConvolution32fDepthwiseDotProduct(param);
-            else if (SynetConvolution32fWinograd::Preferable(param))
-                return new SynetConvolution32fWinograd(param);
-            else if (SynetConvolution32fDirectNchw::Preferable(param))
-                return new SynetConvolution32fDirectNchw(param);
-            else if (SynetConvolution32fGemmNT::Preferable(param))
-                return new SynetConvolution32fGemmNT(param);
-            else if (SynetConvolution32fNhwcDirect::Preferable(param))
-                return new SynetConvolution32fNhwcDirect(param);
-            else if (SynetConvolution32fDirectNhwc::Preferable(param))
-                return new SynetConvolution32fDirectNhwc(param);
-            else
-                return new SynetConvolution32fGemmNN(param);
-        }
-    }
-#endif// SIMD_NEON_ENABLE
-}
diff --git a/src/3rd/Simd/Simd/SimdNeonSynetConvolution32fNhwcDirect2f.cpp b/src/3rd/Simd/Simd/SimdNeonSynetConvolution32fNhwcDirect2f.cpp
deleted file mode 100644
index 65ff1c4e..00000000
--- a/src/3rd/Simd/Simd/SimdNeonSynetConvolution32fNhwcDirect2f.cpp
+++ /dev/null
@@ -1,797 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2020 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/ -#include "Simd/SimdSynetConvolution32f.h" -#include "Simd/SimdSynetConvolution32fCommon.h" -#include "Simd/SimdCpu.h" - -namespace Simd -{ -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - using AlgParam = SynetConvolution32fNhwcDirect::AlgParam; - - template void ConvolutionNhwcDirect_2x6(const float* src0, const ConvParam32f& p, - size_t kernelH, size_t kernelW, size_t srcC, size_t dstC, const float* weight, const float32x4_t* bias, const float32x4_t* params, float* dst) - { - float32x4_t d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, s0, w0, w1; - size_t dS = p.srcC * p.strideX, dW = DF * (p.kernelX - kernelW) * srcC, dY = p.srcW * p.srcC, dX = p.srcC, dD = p.dstC; - const float* src1 = src0 + 1 * dS; - const float* src2 = src0 + 2 * dS; - const float* src3 = src0 + 3 * dS; - const float* src4 = src0 + 4 * dS; - const float* src5 = src0 + 5 * dS; - if (dstC > F) - { - d00 = vdupq_n_f32(0.0f); d01 = vdupq_n_f32(0.0f); - d10 = vdupq_n_f32(0.0f); d11 = vdupq_n_f32(0.0f); - d20 = vdupq_n_f32(0.0f); d21 = vdupq_n_f32(0.0f); - d30 = vdupq_n_f32(0.0f); d31 = vdupq_n_f32(0.0f); - d40 = vdupq_n_f32(0.0f); d41 = vdupq_n_f32(0.0f); - d50 = vdupq_n_f32(0.0f); d51 = vdupq_n_f32(0.0f); - for (size_t ky = 0; ky < kernelH; ++ky) - { - for (size_t kx = 0; kx < kernelW; ++kx) - { - for (size_t offset = ky * dY + kx * dX, end = offset + srcC; offset < end; ++offset) - { - w0 = Load(weight + 0); - w1 = Load(weight + F); - s0 = vdupq_n_f32(src0[offset]); - d00 = vmlaq_f32(d00, s0, w0); - d01 = vmlaq_f32(d01, s0, w1); - s0 = vdupq_n_f32(src1[offset]); - d10 = vmlaq_f32(d10, s0, w0); - d11 = vmlaq_f32(d11, s0, w1); - s0 = vdupq_n_f32(src2[offset]); - d20 = vmlaq_f32(d20, s0, w0); - d21 = vmlaq_f32(d21, s0, w1); - s0 = vdupq_n_f32(src3[offset]); - d30 = vmlaq_f32(d30, s0, w0); - d31 = vmlaq_f32(d31, s0, w1); - s0 = vdupq_n_f32(src4[offset]); - d40 = vmlaq_f32(d40, s0, w0); - d41 = vmlaq_f32(d41, s0, w1); - s0 = vdupq_n_f32(src5[offset]); - d50 = vmlaq_f32(d50, s0, w0); - d51 = vmlaq_f32(d51, s0, w1); - weight += DF; - } - } - weight += dW; - } - if (dstC == DF) - { - Term::template Save(dst + 0, d00, bias, params); - Term::template Save(dst + F, d01, bias, params); - dst += dD; - Term::template Save(dst + 0, d10, bias, params); - Term::template Save(dst + F, d11, bias, params); - dst += dD; - Term::template Save(dst + 0, d20, bias, params); - Term::template Save(dst + F, d21, bias, params); - dst += dD; - Term::template Save(dst + 0, d30, bias, params); - Term::template Save(dst + F, d31, bias, params); - dst += dD; - Term::template Save(dst + 0, d40, bias, params); - Term::template Save(dst + F, d41, bias, params); - dst += dD; - Term::template Save(dst + 0, d50, bias, params); - Term::template Save(dst + F, d51, bias, params); - } - else - { - dstC -= F; - Term::template Save(dst + 0, d00, bias, params); - Term::template Save(dst + F, d01, bias, params, dstC); - dst += dD; - Term::template Save(dst + 0, d10, bias, params); - Term::template Save(dst + F, d11, bias, params, dstC); - dst += dD; - Term::template Save(dst + 0, d20, bias, params); - Term::template Save(dst + F, d21, bias, params, dstC); - dst += dD; - Term::template Save(dst + 0, d30, bias, params); - Term::template Save(dst + F, d31, bias, params, dstC); - dst += dD; - Term::template Save(dst + 0, d40, bias, params); - Term::template Save(dst + F, d41, bias, params, dstC); - dst += dD; - Term::template Save(dst + 0, d50, bias, params); - Term::template Save(dst + F, d51, bias, params, dstC); - } - } - else - { - d00 = 
vdupq_n_f32(0.0f); - d10 = vdupq_n_f32(0.0f); - d20 = vdupq_n_f32(0.0f); - d30 = vdupq_n_f32(0.0f); - d40 = vdupq_n_f32(0.0f); - d50 = vdupq_n_f32(0.0f); - for (size_t ky = 0; ky < kernelH; ++ky) - { - for (size_t kx = 0; kx < kernelW; ++kx) - { - for (size_t offset = ky * dY + kx * dX, end = offset + srcC; offset < end; ++offset) - { - w0 = Load(weight + 0); - s0 = vdupq_n_f32(src0[offset]); - d00 = vmlaq_f32(d00, s0, w0); - s0 = vdupq_n_f32(src1[offset]); - d10 = vmlaq_f32(d10, s0, w0); - s0 = vdupq_n_f32(src2[offset]); - d20 = vmlaq_f32(d20, s0, w0); - s0 = vdupq_n_f32(src3[offset]); - d30 = vmlaq_f32(d30, s0, w0); - s0 = vdupq_n_f32(src4[offset]); - d40 = vmlaq_f32(d40, s0, w0); - s0 = vdupq_n_f32(src5[offset]); - d50 = vmlaq_f32(d50, s0, w0); - weight += DF; - } - } - weight += dW; - } - if (dstC == F) - { - Term::template Save(dst + 0, d00, bias, params); - dst += dD; - Term::template Save(dst + 0, d10, bias, params); - dst += dD; - Term::template Save(dst + 0, d20, bias, params); - dst += dD; - Term::template Save(dst + 0, d30, bias, params); - dst += dD; - Term::template Save(dst + 0, d40, bias, params); - dst += dD; - Term::template Save(dst + 0, d50, bias, params); - } - else - { - Term::template Save(dst + 0, d00, bias, params, dstC); - dst += dD; - Term::template Save(dst + 0, d10, bias, params, dstC); - dst += dD; - Term::template Save(dst + 0, d20, bias, params, dstC); - dst += dD; - Term::template Save(dst + 0, d30, bias, params, dstC); - dst += dD; - Term::template Save(dst + 0, d40, bias, params, dstC); - dst += dD; - Term::template Save(dst + 0, d50, bias, params, dstC); - } - } - } - - template void ConvolutionNhwcDirect_2x3(const float* src0, const ConvParam32f& p, - size_t kernelH, size_t kernelW, size_t srcC, size_t dstC, const float* weight, const float32x4_t* bias, const float32x4_t* params, float* dst) - { - float32x4_t d00, d01, d10, d11, d20, d21, s0, w0, w1; - size_t dS = p.srcC * p.strideX, dW = DF * (p.kernelX - kernelW) * srcC, dY = p.srcW * p.srcC, dX = p.srcC, dD = p.dstC; - const float* src1 = src0 + 1 * dS; - const float* src2 = src0 + 2 * dS; - if (dstC > F) - { - d00 = vdupq_n_f32(0.0f); d01 = vdupq_n_f32(0.0f); - d10 = vdupq_n_f32(0.0f); d11 = vdupq_n_f32(0.0f); - d20 = vdupq_n_f32(0.0f); d21 = vdupq_n_f32(0.0f); - for (size_t ky = 0; ky < kernelH; ++ky) - { - for (size_t kx = 0; kx < kernelW; ++kx) - { - for (size_t offset = ky * dY + kx * dX, end = offset + srcC; offset < end; ++offset) - { - w0 = Load(weight + 0); - w1 = Load(weight + F); - s0 = vdupq_n_f32(src0[offset]); - d00 = vmlaq_f32(d00, s0, w0); - d01 = vmlaq_f32(d01, s0, w1); - s0 = vdupq_n_f32(src1[offset]); - d10 = vmlaq_f32(d10, s0, w0); - d11 = vmlaq_f32(d11, s0, w1); - s0 = vdupq_n_f32(src2[offset]); - d20 = vmlaq_f32(d20, s0, w0); - d21 = vmlaq_f32(d21, s0, w1); - weight += DF; - } - } - weight += dW; - } - if (dstC == DF) - { - Term::template Save(dst + 0, d00, bias, params); - Term::template Save(dst + F, d01, bias, params); - dst += dD; - Term::template Save(dst + 0, d10, bias, params); - Term::template Save(dst + F, d11, bias, params); - dst += dD; - Term::template Save(dst + 0, d20, bias, params); - Term::template Save(dst + F, d21, bias, params); - } - else - { - dstC -= F; - Term::template Save(dst + 0, d00, bias, params); - Term::template Save(dst + F, d01, bias, params, dstC); - dst += dD; - Term::template Save(dst + 0, d10, bias, params); - Term::template Save(dst + F, d11, bias, params, dstC); - dst += dD; - Term::template Save(dst + 0, d20, bias, params); - Term::template 
Save(dst + F, d21, bias, params, dstC); - } - } - else - { - d00 = vdupq_n_f32(0.0f); - d10 = vdupq_n_f32(0.0f); - d20 = vdupq_n_f32(0.0f); - for (size_t ky = 0; ky < kernelH; ++ky) - { - for (size_t kx = 0; kx < kernelW; ++kx) - { - for (size_t offset = ky * dY + kx * dX, end = offset + srcC; offset < end; ++offset) - { - w0 = Load(weight + 0); - s0 = vdupq_n_f32(src0[offset]); - d00 = vmlaq_f32(d00, s0, w0); - s0 = vdupq_n_f32(src1[offset]); - d10 = vmlaq_f32(d10, s0, w0); - s0 = vdupq_n_f32(src2[offset]); - d20 = vmlaq_f32(d20, s0, w0); - weight += DF; - } - } - weight += dW; - } - if (dstC == F) - { - Term::template Save(dst + 0, d00, bias, params); - dst += dD; - Term::template Save(dst + 0, d10, bias, params); - dst += dD; - Term::template Save(dst + 0, d20, bias, params); - } - else - { - Term::template Save(dst + 0, d00, bias, params, dstC); - dst += dD; - Term::template Save(dst + 0, d10, bias, params, dstC); - dst += dD; - Term::template Save(dst + 0, d20, bias, params, dstC); - } - } - } - - template void ConvolutionNhwcDirect_2x1(const float* src0, const ConvParam32f& p, - size_t kernelH, size_t kernelW, size_t srcC, size_t dstC, const float* weight, const float32x4_t* bias, const float32x4_t* params, float* dst) - { - float32x4_t d00, d01, s0, w0, w1; - size_t dW = DF * (p.kernelX - kernelW) * srcC, dY = p.srcW * p.srcC, dX = p.srcC; - if (dstC > F) - { - d00 = vdupq_n_f32(0.0f); - d01 = vdupq_n_f32(0.0f); - for (size_t ky = 0; ky < kernelH; ++ky) - { - for (size_t kx = 0; kx < kernelW; ++kx) - { - for (size_t offset = ky * dY + kx * dX, end = offset + srcC; offset < end; ++offset) - { - w0 = Load(weight + 0); - w1 = Load(weight + F); - s0 = vdupq_n_f32(src0[offset]); - d00 = vmlaq_f32(d00, s0, w0); - d01 = vmlaq_f32(d01, s0, w1); - weight += DF; - } - } - weight += dW; - } - if (dstC == DF) - { - Term::template Save(dst + 0, d00, bias, params); - Term::template Save(dst + F, d01, bias, params); - } - else - { - Term::template Save(dst + 0, d00, bias, params); - Term::template Save(dst + F, d01, bias, params, dstC - F); - } - } - else - { - d00 = vdupq_n_f32(0.0f); - for (size_t ky = 0; ky < kernelH; ++ky) - { - for (size_t kx = 0; kx < kernelW; ++kx) - { - for (size_t offset = ky * dY + kx * dX, end = offset + srcC; offset < end; ++offset) - { - w0 = Load(weight + 0); - s0 = vdupq_n_f32(src0[offset]); - d00 = vmlaq_f32(d00, s0, w0); - weight += DF; - } - } - weight += dW; - } - if (dstC == F) - Term::template Save(dst + 0, d00, bias, params); - else - Term::template Save(dst + 0, d00, bias, params, dstC); - } - } - - template void ConvolutionNhwcDirect_2(const float* src, const ConvParam32f& p, - size_t dstC, size_t yBeg, size_t yEnd, size_t srcC, const float* weight, const float* bias, const float* params, float* dst) - { - size_t noseH = p.padY, noseW = p.padX; - size_t bodyH = p.srcH - p.kernelY + 1 + noseH, bodyW = p.srcW - p.kernelX + 1 + noseW; - size_t bodyW3 = AlignLoAny(bodyW - noseW, 3 * p.strideX) + noseW; - size_t bodyW6 = AlignLoAny(bodyW - noseW, 6 * p.strideX) + noseW; - size_t tailH = bodyH + p.padH, tailW = bodyW + p.padW; - size_t kY = p.kernelY - noseH, kX = p.kernelX - noseW, kH = bodyH + p.kernelY - 1, kW = bodyW + p.kernelX - 1; - - float32x4_t _params[2], _bias[2]; - _params[0] = vdupq_n_f32(params[0]); - if (type == ::SimdConvolutionActivationRestrictRange || type == ::SimdConvolutionActivationHswish) - _params[1] = vdupq_n_f32(params[1]); - - for (size_t dc = 0; dc < dstC; dc += DF) - { - size_t dC = Simd::Min(DF, dstC - dc); - _bias[0] = Load(bias + 
dc + 0); - _bias[1] = Load(bias + dc + F); - if (type == ::SimdConvolutionActivationPrelu) - { - _params[0] = Load(params + dc + 0); - _params[1] = Load(params + dc + F); - } - float* d = dst + dc + yBeg * p.dstW * p.dstC; - size_t dy = yBeg, sy = dy * p.strideY; - for (; sy < noseH && dy < yEnd; sy += p.strideY, dy++) - { - size_t sx = 0; - const float* s = src; - const float* w = weight + (noseH - sy) * p.kernelX * srcC * DF; - for (; sx < noseW; sx += p.strideX, d += p.dstC) - ConvolutionNhwcDirect_2x1(s, p, kY + sy, kX + sx, srcC, dC, w + (noseW - sx) * srcC * DF, _bias, _params, d); - for (; sx < bodyW6; sx += 6 * p.strideX, d += 6 * p.dstC) - ConvolutionNhwcDirect_2x6(s + (sx - noseW) * p.srcC, p, kY + sy, p.kernelX, srcC, dC, w, _bias, _params, d); - for (; sx < bodyW3; sx += 3 * p.strideX, d += 3 * p.dstC) - ConvolutionNhwcDirect_2x3(s + (sx - noseW) * p.srcC, p, kY + sy, p.kernelX, srcC, dC, w, _bias, _params, d); - for (; sx < bodyW; sx += p.strideX, d += p.dstC) - ConvolutionNhwcDirect_2x1(s + (sx - noseW) * p.srcC, p, kY + sy, p.kernelX, srcC, dC, w, _bias, _params, d); - for (; sx < tailW; sx += p.strideX, d += p.dstC) - ConvolutionNhwcDirect_2x1(s + (sx - noseW) * p.srcC, p, kY + sy, kW - sx, srcC, dC, w, _bias, _params, d); - } - for (; sy < bodyH && dy < yEnd; sy += p.strideY, dy++) - { - size_t sx = 0; - const float* s = src + (sy - noseH) * p.srcW * p.srcC; - const float* w = weight; - for (; sx < noseW; sx += p.strideX, d += p.dstC) - ConvolutionNhwcDirect_2x1(s, p, p.kernelY, kX + sx, srcC, dC, w + (noseW - sx) * srcC * DF, _bias, _params, d); - for (; sx < bodyW6; sx += 6 * p.strideX, d += 6 * p.dstC) - ConvolutionNhwcDirect_2x6(s + (sx - noseW) * p.srcC, p, p.kernelY, p.kernelX, srcC, dC, w, _bias, _params, d); - for (; sx < bodyW3; sx += 3 * p.strideX, d += 3 * p.dstC) - ConvolutionNhwcDirect_2x3(s + (sx - noseW) * p.srcC, p, p.kernelY, p.kernelX, srcC, dC, w, _bias, _params, d); - for (; sx < bodyW; sx += p.strideX, d += p.dstC) - ConvolutionNhwcDirect_2x1(s + (sx - noseW) * p.srcC, p, p.kernelY, p.kernelX, srcC, dC, w, _bias, _params, d); - for (; sx < tailW; sx += p.strideX, d += p.dstC) - ConvolutionNhwcDirect_2x1(s + (sx - noseW) * p.srcC, p, p.kernelY, kW - sx, srcC, dC, w, _bias, _params, d); - } - for (; sy < tailH && dy < yEnd; sy += p.strideY, dy++) - { - size_t sx = 0; - const float* s = src + (sy - noseH) * p.srcW * p.srcC; - const float* w = weight; - for (; sx < noseW; sx += p.strideX, d += p.dstC) - ConvolutionNhwcDirect_2x1(s, p, kH - sy, kX + sx, srcC, dC, w + (noseW - sx) * srcC * DF, _bias, _params, d); - for (; sx < bodyW6; sx += 6 * p.strideX, d += 6 * p.dstC) - ConvolutionNhwcDirect_2x6(s + (sx - noseW) * p.srcC, p, kH - sy, p.kernelX, srcC, dC, w, _bias, _params, d); - for (; sx < bodyW3; sx += 3 * p.strideX, d += 3 * p.dstC) - ConvolutionNhwcDirect_2x3(s + (sx - noseW) * p.srcC, p, kH - sy, p.kernelX, srcC, dC, w, _bias, _params, d); - for (; sx < bodyW; sx += p.strideX, d += p.dstC) - ConvolutionNhwcDirect_2x1(s + (sx - noseW) * p.srcC, p, kH - sy, p.kernelX, srcC, dC, w, _bias, _params, d); - for (; sx < tailW; sx += p.strideX, d += p.dstC) - ConvolutionNhwcDirect_2x1(s + (sx - noseW) * p.srcC, p, kH - sy, kW - sx, srcC, dC, w, _bias, _params, d); - } - weight += p.kernelY * p.kernelX * srcC * DF; - } - } - - template void ConvolutionNhwcDirect_2(const float* src, const ConvParam32f& p, - const SynetConvolution32fNhwcDirect::AlgParam& a, const float* weight, const float* bias, const float* params, float* dst) - { - for (size_t dc = 0; dc < 
p.dstC; dc += a.macroD) - { - size_t macroD = Simd::Min(p.dstC, dc + a.macroD) - dc; - for (size_t sc = 0; sc < p.srcC; sc += a.macroC) - { - size_t macroC = Simd::Min(p.srcC, sc + a.macroC) - sc; - size_t macroK = p.kernelY * p.kernelX * macroC; - for (size_t yBeg = 0; yBeg < p.dstH;) - { - size_t yEnd = Simd::Min(yBeg + a.macroH, p.dstH); - if (a.macroC == p.srcC) - ConvolutionNhwcDirect_2(src + sc, p, macroD, yBeg, yEnd, macroC, weight, bias + dc, params, dst + dc); - else if (sc == 0) - ConvolutionNhwcDirect_2(src + sc, p, macroD, yBeg, yEnd, macroC, weight, bias + dc, params, dst + dc); - else if (sc + macroC == p.srcC) - ConvolutionNhwcDirect_2(src + sc, p, macroD, yBeg, yEnd, macroC, weight, bias + dc, params, dst + dc); - else - ConvolutionNhwcDirect_2(src + sc, p, macroD, yBeg, yEnd, macroC, weight, bias + dc, params, dst + dc); - yBeg = yEnd; - } - weight += AlignHiAny(macroD, a.microD) * macroK; - } - if (type == ::SimdConvolutionActivationPrelu) - params += macroD; - } - } - - //--------------------------------------------------------------------- - - template void ConvolutionNhwcDirect1x1_2x6(const float* src0, const ConvParam32f& p, - size_t srcC, size_t dstC, const float* weight, const float32x4_t* bias, const float32x4_t* params, float* dst) - { - float32x4_t d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, s0, w0, w1; - size_t dS = p.srcC, dD = p.dstC; - const float* src1 = src0 + 1 * dS; - const float* src2 = src0 + 2 * dS; - const float* src3 = src0 + 3 * dS; - const float* src4 = src0 + 4 * dS; - const float* src5 = src0 + 5 * dS; - if (dstC > F) - { - d00 = vdupq_n_f32(0.0f); d01 = vdupq_n_f32(0.0f); - d10 = vdupq_n_f32(0.0f); d11 = vdupq_n_f32(0.0f); - d20 = vdupq_n_f32(0.0f); d21 = vdupq_n_f32(0.0f); - d30 = vdupq_n_f32(0.0f); d31 = vdupq_n_f32(0.0f); - d40 = vdupq_n_f32(0.0f); d41 = vdupq_n_f32(0.0f); - d50 = vdupq_n_f32(0.0f); d51 = vdupq_n_f32(0.0f); - for (size_t offset = 0; offset < srcC; ++offset) - { - w0 = Load(weight + 0); - w1 = Load(weight + F); - s0 = vdupq_n_f32(src0[offset]); - d00 = vmlaq_f32(d00, s0, w0); - d01 = vmlaq_f32(d01, s0, w1); - s0 = vdupq_n_f32(src1[offset]); - d10 = vmlaq_f32(d10, s0, w0); - d11 = vmlaq_f32(d11, s0, w1); - s0 = vdupq_n_f32(src2[offset]); - d20 = vmlaq_f32(d20, s0, w0); - d21 = vmlaq_f32(d21, s0, w1); - s0 = vdupq_n_f32(src3[offset]); - d30 = vmlaq_f32(d30, s0, w0); - d31 = vmlaq_f32(d31, s0, w1); - s0 = vdupq_n_f32(src4[offset]); - d40 = vmlaq_f32(d40, s0, w0); - d41 = vmlaq_f32(d41, s0, w1); - s0 = vdupq_n_f32(src5[offset]); - d50 = vmlaq_f32(d50, s0, w0); - d51 = vmlaq_f32(d51, s0, w1); - weight += DF; - } - if (dstC == DF) - { - Term::template Save(dst + 0, d00, bias, params); - Term::template Save(dst + F, d01, bias, params); - dst += dD; - Term::template Save(dst + 0, d10, bias, params); - Term::template Save(dst + F, d11, bias, params); - dst += dD; - Term::template Save(dst + 0, d20, bias, params); - Term::template Save(dst + F, d21, bias, params); - dst += dD; - Term::template Save(dst + 0, d30, bias, params); - Term::template Save(dst + F, d31, bias, params); - dst += dD; - Term::template Save(dst + 0, d40, bias, params); - Term::template Save(dst + F, d41, bias, params); - dst += dD; - Term::template Save(dst + 0, d50, bias, params); - Term::template Save(dst + F, d51, bias, params); - } - else - { - dstC -= F; - Term::template Save(dst + 0, d00, bias, params); - Term::template Save(dst + F, d01, bias, params, dstC); - dst += dD; - Term::template Save(dst + 0, d10, bias, params); - Term::template 
Save(dst + F, d11, bias, params, dstC);
-                    dst += dD;
-                    Term::template Save(dst + 0, d20, bias, params);
-                    Term::template Save(dst + F, d21, bias, params, dstC);
-                    dst += dD;
-                    Term::template Save(dst + 0, d30, bias, params);
-                    Term::template Save(dst + F, d31, bias, params, dstC);
-                    dst += dD;
-                    Term::template Save(dst + 0, d40, bias, params);
-                    Term::template Save(dst + F, d41, bias, params, dstC);
-                    dst += dD;
-                    Term::template Save(dst + 0, d50, bias, params);
-                    Term::template Save(dst + F, d51, bias, params, dstC);
-                }
-            }
-            else
-            {
-                d00 = vdupq_n_f32(0.0f);
-                d10 = vdupq_n_f32(0.0f);
-                d20 = vdupq_n_f32(0.0f);
-                d30 = vdupq_n_f32(0.0f);
-                d40 = vdupq_n_f32(0.0f);
-                d50 = vdupq_n_f32(0.0f);
-                for (size_t offset = 0; offset < srcC; ++offset)
-                {
-                    w0 = Load(weight + 0);
-                    s0 = vdupq_n_f32(src0[offset]);
-                    d00 = vmlaq_f32(d00, s0, w0);
-                    s0 = vdupq_n_f32(src1[offset]);
-                    d10 = vmlaq_f32(d10, s0, w0);
-                    s0 = vdupq_n_f32(src2[offset]);
-                    d20 = vmlaq_f32(d20, s0, w0);
-                    s0 = vdupq_n_f32(src3[offset]);
-                    d30 = vmlaq_f32(d30, s0, w0);
-                    s0 = vdupq_n_f32(src4[offset]);
-                    d40 = vmlaq_f32(d40, s0, w0);
-                    s0 = vdupq_n_f32(src5[offset]);
-                    d50 = vmlaq_f32(d50, s0, w0);
-                    weight += DF;
-                }
-                if (dstC == F)
-                {
-                    Term::template Save(dst + 0, d00, bias, params);
-                    dst += dD;
-                    Term::template Save(dst + 0, d10, bias, params);
-                    dst += dD;
-                    Term::template Save(dst + 0, d20, bias, params);
-                    dst += dD;
-                    Term::template Save(dst + 0, d30, bias, params);
-                    dst += dD;
-                    Term::template Save(dst + 0, d40, bias, params);
-                    dst += dD;
-                    Term::template Save(dst + 0, d50, bias, params);
-                }
-                else
-                {
-                    Term::template Save(dst + 0, d00, bias, params, dstC);
-                    dst += dD;
-                    Term::template Save(dst + 0, d10, bias, params, dstC);
-                    dst += dD;
-                    Term::template Save(dst + 0, d20, bias, params, dstC);
-                    dst += dD;
-                    Term::template Save(dst + 0, d30, bias, params, dstC);
-                    dst += dD;
-                    Term::template Save(dst + 0, d40, bias, params, dstC);
-                    dst += dD;
-                    Term::template Save(dst + 0, d50, bias, params, dstC);
-                }
-            }
-        }
-
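// [Editorial sketch -- not part of the deleted file above.] The 2x6 kernel holds
// a 6x2 block of float32x4_t accumulators (12 registers) plus two weight vectors
// and one broadcast register, roughly 15 of the 16 AArch32 NEON q-registers, so
// the whole output tile stays in registers across the srcC loop. Each step is a
// rank-1 update: broadcast one input scalar, then fuse-multiply it into the pair
// of weight vectors. The same pattern on a reduced 2x2 tile (names are made up):
#include <arm_neon.h>
#include <stddef.h>
static void Rank1Tile2x2(const float* s0, const float* s1,  // two pixels, srcC floats each
    const float* w,                                         // srcC x 8 interleaved weights
    float32x4_t d[2][2], size_t srcC)
{
    for (size_t c = 0; c < srcC; ++c, w += 8)
    {
        float32x4_t w0 = vld1q_f32(w + 0), w1 = vld1q_f32(w + 4);
        float32x4_t a0 = vdupq_n_f32(s0[c]);  // broadcast channel c of pixel 0
        d[0][0] = vmlaq_f32(d[0][0], a0, w0);
        d[0][1] = vmlaq_f32(d[0][1], a0, w1);
        float32x4_t a1 = vdupq_n_f32(s1[c]);  // broadcast channel c of pixel 1
        d[1][0] = vmlaq_f32(d[1][0], a1, w0);
        d[1][1] = vmlaq_f32(d[1][1], a1, w1);
    }
}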
-        template<TermType term, SimdConvolutionActivationType type, int M> void ConvolutionNhwcDirect1x1_2xM(const float* src0, const ConvParam32f& p,
-            size_t srcC, size_t dstC, const float* weight, const float32x4_t* bias, const float32x4_t* params, float* dst)
-        {
-            float32x4_t d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, s0, w0, w1;
-            size_t dS = p.srcC, dD = p.dstC;
-            const float* src1 = src0 + 1 * dS;
-            const float* src2 = src0 + 2 * dS;
-            const float* src3 = src0 + 3 * dS;
-            const float* src4 = src0 + 4 * dS;
-            const float* src5 = src0 + 5 * dS;
-            if (dstC > F)
-            {
-                if (M > 0) d00 = vdupq_n_f32(0.0f), d01 = vdupq_n_f32(0.0f);
-                if (M > 1) d10 = vdupq_n_f32(0.0f), d11 = vdupq_n_f32(0.0f);
-                if (M > 2) d20 = vdupq_n_f32(0.0f), d21 = vdupq_n_f32(0.0f);
-                if (M > 3) d30 = vdupq_n_f32(0.0f), d31 = vdupq_n_f32(0.0f);
-                if (M > 4) d40 = vdupq_n_f32(0.0f), d41 = vdupq_n_f32(0.0f);
-                if (M > 5) d50 = vdupq_n_f32(0.0f), d51 = vdupq_n_f32(0.0f);
-                for (size_t offset = 0; offset < srcC; ++offset)
-                {
-                    w0 = Load(weight + 0);
-                    w1 = Load(weight + F);
-                    if (M > 0) s0 = vdupq_n_f32(src0[offset]), d00 = vmlaq_f32(d00, s0, w0), d01 = vmlaq_f32(d01, s0, w1);
-                    if (M > 1) s0 = vdupq_n_f32(src1[offset]), d10 = vmlaq_f32(d10, s0, w0), d11 = vmlaq_f32(d11, s0, w1);
-                    if (M > 2) s0 = vdupq_n_f32(src2[offset]), d20 = vmlaq_f32(d20, s0, w0), d21 = vmlaq_f32(d21, s0, w1);
-                    if (M > 3) s0 = vdupq_n_f32(src3[offset]), d30 = vmlaq_f32(d30, s0, w0), d31 = vmlaq_f32(d31, s0, w1);
-                    if (M > 4) s0 = vdupq_n_f32(src4[offset]), d40 = vmlaq_f32(d40, s0, w0), d41 = vmlaq_f32(d41, s0, w1);
-                    if (M > 5) s0 = vdupq_n_f32(src5[offset]), d50 = vmlaq_f32(d50, s0, w0), d51 = vmlaq_f32(d51, s0, w1);
-                    weight += DF;
-                }
-                if (dstC == DF)
-                {
-                    if (M > 0) Term::template Save(dst + 0, d00, bias, params), Term::template Save(dst + F, d01, bias, params), dst += dD;
-                    if (M > 1) Term::template Save(dst + 0, d10, bias, params), Term::template Save(dst + F, d11, bias, params), dst += dD;
-                    if (M > 2) Term::template Save(dst + 0, d20, bias, params), Term::template Save(dst + F, d21, bias, params), dst += dD;
-                    if (M > 3) Term::template Save(dst + 0, d30, bias, params), Term::template Save(dst + F, d31, bias, params), dst += dD;
-                    if (M > 4) Term::template Save(dst + 0, d40, bias, params), Term::template Save(dst + F, d41, bias, params), dst += dD;
-                    if (M > 5) Term::template Save(dst + 0, d50, bias, params), Term::template Save(dst + F, d51, bias, params), dst += dD;
-                }
-                else
-                {
-                    dstC -= F;
-                    if (M > 0) Term::template Save(dst + 0, d00, bias, params), Term::template Save(dst + F, d01, bias, params, dstC), dst += dD;
-                    if (M > 1) Term::template Save(dst + 0, d10, bias, params), Term::template Save(dst + F, d11, bias, params, dstC), dst += dD;
-                    if (M > 2) Term::template Save(dst + 0, d20, bias, params), Term::template Save(dst + F, d21, bias, params, dstC), dst += dD;
-                    if (M > 3) Term::template Save(dst + 0, d30, bias, params), Term::template Save(dst + F, d31, bias, params, dstC), dst += dD;
-                    if (M > 4) Term::template Save(dst + 0, d40, bias, params), Term::template Save(dst + F, d41, bias, params, dstC), dst += dD;
-                    if (M > 5) Term::template Save(dst + 0, d50, bias, params), Term::template Save(dst + F, d51, bias, params, dstC), dst += dD;
-                }
-            }
-            else
-            {
-                if (M > 0) d00 = vdupq_n_f32(0.0f);
-                if (M > 1) d10 = vdupq_n_f32(0.0f);
-                if (M > 2) d20 = vdupq_n_f32(0.0f);
-                if (M > 3) d30 = vdupq_n_f32(0.0f);
-                if (M > 4) d40 = vdupq_n_f32(0.0f);
-                if (M > 5) d50 = vdupq_n_f32(0.0f);
-                for (size_t offset = 0; offset < srcC; ++offset)
-                {
-                    w0 = Load(weight + 0);
-                    if (M > 0) s0 = vdupq_n_f32(src0[offset]), d00 = vmlaq_f32(d00, s0, w0);
-                    if (M > 1) s0 = vdupq_n_f32(src1[offset]), d10 = vmlaq_f32(d10, s0, w0);
-                    if (M > 2) s0 = vdupq_n_f32(src2[offset]), d20 = vmlaq_f32(d20, s0, w0);
-                    if (M > 3) s0 = vdupq_n_f32(src3[offset]), d30 = vmlaq_f32(d30, s0, w0);
-                    if (M > 4) s0 = vdupq_n_f32(src4[offset]), d40 = vmlaq_f32(d40, s0, w0);
-                    if (M > 5) s0 = vdupq_n_f32(src5[offset]), d50 = vmlaq_f32(d50, s0, w0);
-                    weight += DF;
-                }
-                if (dstC == F)
-                {
-                    if (M > 0) Term::template Save(dst + 0, d00, bias, params), dst += dD;
-                    if (M > 1) Term::template Save(dst + 0, d10, bias, params), dst += dD;
-                    if (M > 2) Term::template Save(dst + 0, d20, bias, params), dst += dD;
-                    if (M > 3) Term::template Save(dst + 0, d30, bias, params), dst += dD;
-                    if (M > 4) Term::template Save(dst + 0, d40, bias, params), dst += dD;
-                    if (M > 5) Term::template Save(dst + 0, d50, bias, params), dst += dD;
-                }
-                else
-                {
-                    if (M > 0) Term::template Save(dst + 0, d00, bias, params, dstC), dst += dD;
-                    if (M > 1) Term::template Save(dst + 0, d10, bias, params, dstC), dst += dD;
-                    if (M > 2) Term::template Save(dst + 0, d20, bias, params, dstC), dst += dD;
-                    if (M > 3) Term::template Save(dst + 0, d30, bias, params, dstC), dst += dD;
-                    if (M > 4) Term::template Save(dst + 0, d40, bias, params, dstC), dst += dD;
-                    if (M > 5) Term::template Save(dst + 0, d50, bias, params, dstC), dst += dD;
-                }
-            }
-        }
-
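// [Editorial sketch -- not part of the deleted file above.] In the 2xM kernel M is
// a template parameter, so every "if (M > k)" tests a compile-time constant and
// the instantiation for each tail length contains no per-row branches at all.
// The same technique in miniature (with optimizations enabled, AddRows<2>
// compiles to exactly two load/add pairs):
#include <arm_neon.h>
#include <stddef.h>
template<int M> void AddRows(const float* src, float32x4_t* acc)
{
    if (M > 0) acc[0] = vaddq_f32(acc[0], vld1q_f32(src + 0));  // kept when M >= 1
    if (M > 1) acc[1] = vaddq_f32(acc[1], vld1q_f32(src + 4));  // dropped when M < 2
    if (M > 2) acc[2] = vaddq_f32(acc[2], vld1q_f32(src + 8));  // dropped when M < 3
}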
-        typedef void(*ConvolutionNhwcDirect1x1_2xM_Ptr)(const float* src0, const ConvParam32f& p,
-            size_t srcC, size_t dstC, const float* weight, const float32x4_t* bias, const float32x4_t* params, float* dst);
-
-        template<TermType term, SimdConvolutionActivationType type> ConvolutionNhwcDirect1x1_2xM_Ptr GetConvolutionNhwcDirect1x1_2xM(size_t M)
-        {
-            switch (M)
-            {
-            case 0: return ConvolutionNhwcDirect1x1_2xM<term, type, 0>;
-            case 1: return ConvolutionNhwcDirect1x1_2xM<term, type, 1>;
-            case 2: return ConvolutionNhwcDirect1x1_2xM<term, type, 2>;
-            case 3: return ConvolutionNhwcDirect1x1_2xM<term, type, 3>;
-            case 4: return ConvolutionNhwcDirect1x1_2xM<term, type, 4>;
-            case 5: return ConvolutionNhwcDirect1x1_2xM<term, type, 5>;
-            }
-            assert(0);
-            return NULL;
-        }
-
-        template<TermType term, SimdConvolutionActivationType type> void ConvolutionNhwcDirect1x1_2(const float* src, const ConvParam32f& p,
-            size_t dstC, size_t yBeg, size_t yEnd, size_t srcC, const float* weight, const float* bias, const float* params, float* dst)
-        {
-            size_t n1 = (yEnd - yBeg) * p.dstW;
-            size_t n6 = AlignLoAny(n1, 6);
-            size_t nTail = n1 - n6;
-            ConvolutionNhwcDirect1x1_2xM_Ptr tailN = GetConvolutionNhwcDirect1x1_2xM<term, type>(nTail);
-
-            float32x4_t _params[2], _bias[2];
-            _params[0] = vdupq_n_f32(params[0]);
-            if (type == ::SimdConvolutionActivationRestrictRange || type == ::SimdConvolutionActivationHswish)
-                _params[1] = vdupq_n_f32(params[1]);
-
-            for (size_t dc = 0; dc < dstC; dc += DF)
-            {
-                size_t dC = Simd::Min(DF, dstC - dc);
-                _bias[0] = Load(bias + dc + 0);
-                _bias[1] = Load(bias + dc + F);
-                if (type == ::SimdConvolutionActivationPrelu)
-                {
-                    _params[0] = Load(params + dc + 0);
-                    _params[1] = Load(params + dc + F);
-                }
-                const float* ps = src + yBeg * p.srcW * p.srcC;
-                float* pd = dst + dc + yBeg * p.dstW * p.dstC;
-                size_t i = 0;
-                for (; i < n6; i += 6, ps += 6 * p.srcC, pd += 6 * p.dstC)
-                    ConvolutionNhwcDirect1x1_2x6<term, type>(ps, p, srcC, dC, weight, _bias, _params, pd);
-                if (nTail)
-                    tailN(ps, p, srcC, dC, weight, _bias, _params, pd), ps += nTail * p.srcC, pd += nTail * p.dstC;
-                weight += srcC * DF;
-            }
-        }
-
-        template<SimdConvolutionActivationType type> void ConvolutionNhwcDirect1x1_2(const float* src, const ConvParam32f& p,
-            const SynetConvolution32fNhwcDirect::AlgParam& a, const float* weight, const float* bias, const float* params, float* dst)
-        {
-            for (size_t dc = 0; dc < p.dstC; dc += a.macroD)
-            {
-                size_t macroD = Simd::Min(p.dstC, dc + a.macroD) - dc;
-                for (size_t sc = 0; sc < p.srcC; sc += a.macroC)
-                {
-                    size_t macroC = Simd::Min(p.srcC, sc + a.macroC) - sc;
-                    for (size_t yBeg = 0; yBeg < p.dstH;)
-                    {
-                        size_t yEnd = Simd::Min(yBeg + a.macroH, p.dstH);
-                        if (a.macroC == p.srcC)
-                            ConvolutionNhwcDirect1x1_2(src + sc, p, macroD, yBeg, yEnd, macroC, weight, bias + dc, params, dst + dc);
-                        else if (sc == 0)
-                            ConvolutionNhwcDirect1x1_2(src + sc, p, macroD, yBeg, yEnd, macroC, weight, bias + dc, params, dst + dc);
-                        else if (sc + macroC == p.srcC)
-                            ConvolutionNhwcDirect1x1_2(src + sc, p, macroD, yBeg, yEnd, macroC, weight, bias + dc, params, dst + dc);
-                        else
-                            ConvolutionNhwcDirect1x1_2(src + sc, p, macroD, yBeg, yEnd, macroC, weight, bias + dc, params, dst + dc);
-                        yBeg = yEnd;
-                    }
-                    weight += AlignHiAny(macroD, a.microD) * macroC;
-                }
-                if (type == ::SimdConvolutionActivationPrelu)
-                    params += macroD;
-            }
-        }
-
-        //---------------------------------------------------------------------
-
-        template<SimdConvolutionActivationType type> SIMD_INLINE void Set(const ConvParam32f& p, SynetConvolution32fNhwcDirect::OldConvolutionPtr & convolution)
-        {
-            if (p.Is1x1())
-                convolution = ConvolutionNhwcDirect1x1_2<type>;
-            else
-                convolution = ConvolutionNhwcDirect_2<type>;
-        }
-
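// [Editorial sketch -- not part of the deleted file above.] Set2f below maps a
// runtime activation enum onto one template instantiation per value, so the
// inner convolution loops are compiled with the activation inlined instead of
// dispatching on it per pixel. The same value-to-type dispatch in miniature
// (Activation/Kernel/SelectKernel are made-up names):
#include <cassert>
#include <cstddef>
enum Activation { ActIdentity, ActRelu };
template<Activation A> void Kernel(float* dst, std::size_t n)
{
    for (std::size_t i = 0; i < n; ++i)
        if (A == ActRelu && dst[i] < 0.0f)  // constant condition: folds away per instantiation
            dst[i] = 0.0f;
}
using KernelPtr = void (*)(float*, std::size_t);
inline KernelPtr SelectKernel(Activation a)
{
    switch (a)
    {
    case ActIdentity: return Kernel<ActIdentity>;  // one specialized body per enum value
    case ActRelu: return Kernel<ActRelu>;
    }
    assert(0);
    return nullptr;
}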
-        bool SynetConvolution32fNhwcDirect::Set2f(const ConvParam32f& p, OldConvolutionPtr& convolution)
-        {
-            switch (p.activation)
-            {
-            case SimdConvolutionActivationIdentity: Set<SimdConvolutionActivationIdentity>(p, convolution); break;
-            case SimdConvolutionActivationRelu: Set<SimdConvolutionActivationRelu>(p, convolution); break;
-            case SimdConvolutionActivationLeakyRelu: Set<SimdConvolutionActivationLeakyRelu>(p, convolution); break;
-            case SimdConvolutionActivationRestrictRange: Set<SimdConvolutionActivationRestrictRange>(p, convolution); break;
-            case SimdConvolutionActivationPrelu: Set<SimdConvolutionActivationPrelu>(p, convolution); break;
-            case SimdConvolutionActivationElu: Set<SimdConvolutionActivationElu>(p, convolution); break;
-            case SimdConvolutionActivationHswish: Set<SimdConvolutionActivationHswish>(p, convolution); break;
-            default: assert(0);
-            }
-            return true;
-        }
-    }
-#endif//SIMD_NEON_ENABLE
-}
diff --git a/src/3rd/Simd/Simd/SimdNeonSynetConvolution32fNhwcDirect2r.cpp b/src/3rd/Simd/Simd/SimdNeonSynetConvolution32fNhwcDirect2r.cpp
deleted file mode 100644
index d1076888..00000000
--- a/src/3rd/Simd/Simd/SimdNeonSynetConvolution32fNhwcDirect2r.cpp
+++ /dev/null
@@ -1,1358 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2020 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/ -#include "Simd/SimdSynetConvolution32f.h" -#include "Simd/SimdSynetConvolution32fCommon.h" -#include "Simd/SimdCpu.h" - -namespace Simd -{ -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - using AlgParam = SynetConvolution32fNhwcDirect::AlgParam; - - typedef void(*ConvolutionNhwcDirect_NxM_Ptr)(const float* src0, const ConvParam32f& p, const AlgParam& a, size_t dy, size_t dx, size_t srcC, size_t dstC, const float* weight0, const float32x4_t* bias, const float32x4_t* params, float* dst); - typedef void(*ConvolutionNhwcDirect1x1_NxM_Ptr)(const float* src0, const ConvParam32f& p, const AlgParam& a, size_t srcC, size_t dstC, const float* weight0, const float32x4_t* bias, const float32x4_t* params, float* dst); - - template void ConvolutionNhwcDirect_2x1(const float* src0, const ConvParam32f& p, - const AlgParam& a, size_t dy, size_t dx, size_t srcC, size_t dstC, const float* weight0, const float32x4_t* bias, const float32x4_t* params, float* dst) - { - float32x4_t d00, d01, s0, w0, w1; - size_t srcH = p.srcH, srcW = p.srcW, dilY = p.dilationY, dilX = p.dilationX; - size_t dY = p.srcW * p.srcC, dX = p.srcC, dS = p.srcC * p.strideX, dW = p.srcC * F; - size_t sy = dy * p.strideY - p.padY, sx = dx * p.strideX - p.padX; - size_t kY = p.kernelY * p.dilationY, kX = p.kernelX * p.dilationX; - const float* weight1 = weight0 + a.stepW; - if (dstC > F) - { - d00 = vdupq_n_f32(0.0f), d01 = vdupq_n_f32(0.0f); - for (size_t ky = 0; ky < kY; ky += dilY) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - if (sy + ky < srcH && sx + kx < srcW) - { - size_t offs = beg + kx * dX, end = offs + srcC, offw = 0; - for (; offs < end; ++offs, offw += F) - { - w0 = Load(weight0 + offw); - w1 = Load(weight1 + offw); - s0 = vdupq_n_f32(src0[offs]), d00 = vmlaq_f32(d00, s0, w0), d01 = vmlaq_f32(d01, s0, w1); - } - } - weight0 += dW, weight1 += dW; - } - } - if (dstC == DF) - Save2(dst, d00, d01, bias, params); - else - Save2(dst, d00, d01, bias, params, dstC - F); - } - else - { - d00 = vdupq_n_f32(0.0f); - for (size_t ky = 0; ky < kY; ky += dilY) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - if (sy + ky < srcH && sx + kx < srcW) - { - size_t offs = beg + kx * dX, end = offs + srcC, offw = 0; - for (; offs < end; ++offs, offw += F) - { - w0 = Load(weight0 + offw); - s0 = vdupq_n_f32(src0[offs]), d00 = vmlaq_f32(d00, s0, w0); - } - } - weight0 += dW; - } - } - if (dstC == F) - Save1(dst, d00, bias, params); - else - Save1(dst, d00, bias, params, dstC); - } - } - -#if defined(SIMD_ARM64_ENABLE) - template void ConvolutionNhwcDirect_2x12(const float* src0, const ConvParam32f& p, - const AlgParam& a, size_t dy, size_t dx, size_t srcC, size_t dstC, const float* weight0, const float32x4_t* bias, const float32x4_t* params, float* dst) - { - float32x4_t d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, d60, d61, d70, d71, d80, d81, d90, d91, da0, da1, db0, db1, s0, w0, w1; - size_t srcH = p.srcH, srcW = p.srcW, dilY = p.dilationY, dilX = p.dilationX; - size_t dY = p.srcW * p.srcC, dX = p.srcC, dS = p.srcC * p.strideX, dW = p.srcC * F, dWz = p.kernelX * p.srcC * F, dD = p.dstC; - size_t sy = dy * p.strideY - p.padY, sx = dx * p.strideX - p.padX; - size_t kY = p.kernelY * p.dilationY, kX = p.kernelX * p.dilationX; - const float* weight1 = weight0 + a.stepW; - const float* src1 = src0 + 1 * dS; - const float* src2 = src0 + 2 * dS; - const float* src3 = src0 + 3 * dS; - const float* src4 = src0 + 4 * dS; - const float* src5 = src0 + 
5 * dS; - if (dstC > F) - { - d00 = vdupq_n_f32(0.0f), d01 = vdupq_n_f32(0.0f); - d10 = vdupq_n_f32(0.0f), d11 = vdupq_n_f32(0.0f); - d20 = vdupq_n_f32(0.0f), d21 = vdupq_n_f32(0.0f); - d30 = vdupq_n_f32(0.0f), d31 = vdupq_n_f32(0.0f); - d40 = vdupq_n_f32(0.0f), d41 = vdupq_n_f32(0.0f); - d50 = vdupq_n_f32(0.0f), d51 = vdupq_n_f32(0.0f); - d60 = vdupq_n_f32(0.0f), d61 = vdupq_n_f32(0.0f); - d70 = vdupq_n_f32(0.0f), d71 = vdupq_n_f32(0.0f); - d80 = vdupq_n_f32(0.0f), d81 = vdupq_n_f32(0.0f); - d90 = vdupq_n_f32(0.0f), d91 = vdupq_n_f32(0.0f); - da0 = vdupq_n_f32(0.0f), da1 = vdupq_n_f32(0.0f); - db0 = vdupq_n_f32(0.0f), db1 = vdupq_n_f32(0.0f); - for (size_t ky = 0; ky < kY; ky += dilY) - { - if (sy + ky < srcH) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - assert(sx + kx < srcW && sx + kx + 12 <= srcW); - size_t off0 = beg + kx * dX, end = off0 + srcC, off6 = off0 + 6 * dS, offw = 0; - for (; off0 < end; ++off0, ++off6, offw += F) - { - w0 = Load(weight0 + offw); - w1 = Load(weight1 + offw); - s0 = vdupq_n_f32(src0[off0]), d00 = vmlaq_f32(d00, s0, w0), d01 = vmlaq_f32(d01, s0, w1); - s0 = vdupq_n_f32(src1[off0]), d10 = vmlaq_f32(d10, s0, w0), d11 = vmlaq_f32(d11, s0, w1); - s0 = vdupq_n_f32(src2[off0]), d20 = vmlaq_f32(d20, s0, w0), d21 = vmlaq_f32(d21, s0, w1); - s0 = vdupq_n_f32(src3[off0]), d30 = vmlaq_f32(d30, s0, w0), d31 = vmlaq_f32(d31, s0, w1); - s0 = vdupq_n_f32(src4[off0]), d40 = vmlaq_f32(d40, s0, w0), d41 = vmlaq_f32(d41, s0, w1); - s0 = vdupq_n_f32(src5[off0]), d50 = vmlaq_f32(d50, s0, w0), d51 = vmlaq_f32(d51, s0, w1); - s0 = vdupq_n_f32(src0[off6]), d60 = vmlaq_f32(d60, s0, w0), d61 = vmlaq_f32(d61, s0, w1); - s0 = vdupq_n_f32(src1[off6]), d70 = vmlaq_f32(d70, s0, w0), d71 = vmlaq_f32(d71, s0, w1); - s0 = vdupq_n_f32(src2[off6]), d80 = vmlaq_f32(d80, s0, w0), d81 = vmlaq_f32(d81, s0, w1); - s0 = vdupq_n_f32(src3[off6]), d90 = vmlaq_f32(d90, s0, w0), d91 = vmlaq_f32(d91, s0, w1); - s0 = vdupq_n_f32(src4[off6]), da0 = vmlaq_f32(da0, s0, w0), da1 = vmlaq_f32(da1, s0, w1); - s0 = vdupq_n_f32(src5[off6]), db0 = vmlaq_f32(db0, s0, w0), db1 = vmlaq_f32(db1, s0, w1); - } - weight0 += dW, weight1 += dW; - } - } - else - weight0 += dWz, weight1 += dWz; - } - if (dstC == DF) - { - Save2(dst, d00, d01, bias, params), dst += dD; - Save2(dst, d10, d11, bias, params), dst += dD; - Save2(dst, d20, d21, bias, params), dst += dD; - Save2(dst, d30, d31, bias, params), dst += dD; - Save2(dst, d40, d41, bias, params), dst += dD; - Save2(dst, d50, d51, bias, params), dst += dD; - Save2(dst, d60, d61, bias, params), dst += dD; - Save2(dst, d70, d71, bias, params), dst += dD; - Save2(dst, d80, d81, bias, params), dst += dD; - Save2(dst, d90, d91, bias, params), dst += dD; - Save2(dst, da0, da1, bias, params), dst += dD; - Save2(dst, db0, db1, bias, params), dst += dD; - } - else - { - dstC -= F; - Save2(dst, d00, d01, bias, params, dstC), dst += dD; - Save2(dst, d10, d11, bias, params, dstC), dst += dD; - Save2(dst, d20, d21, bias, params, dstC), dst += dD; - Save2(dst, d30, d31, bias, params, dstC), dst += dD; - Save2(dst, d40, d41, bias, params, dstC), dst += dD; - Save2(dst, d50, d51, bias, params, dstC), dst += dD; - Save2(dst, d60, d61, bias, params, dstC), dst += dD; - Save2(dst, d70, d71, bias, params, dstC), dst += dD; - Save2(dst, d80, d81, bias, params, dstC), dst += dD; - Save2(dst, d90, d91, bias, params, dstC), dst += dD; - Save2(dst, da0, da1, bias, params, dstC), dst += dD; - Save2(dst, db0, db1, bias, params, dstC), dst += dD; - } - } 
- else - { - d00 = vdupq_n_f32(0.0f); - d10 = vdupq_n_f32(0.0f); - d20 = vdupq_n_f32(0.0f); - d30 = vdupq_n_f32(0.0f); - d40 = vdupq_n_f32(0.0f); - d50 = vdupq_n_f32(0.0f); - d60 = vdupq_n_f32(0.0f); - d70 = vdupq_n_f32(0.0f); - d80 = vdupq_n_f32(0.0f); - d90 = vdupq_n_f32(0.0f); - da0 = vdupq_n_f32(0.0f); - db0 = vdupq_n_f32(0.0f); - for (size_t ky = 0; ky < kY; ky += dilY) - { - if (sy + ky < srcH) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - assert(sx + kx < srcW && sx + kx + 12 <= srcW); - size_t off0 = beg + kx * dX, end = off0 + srcC, off6 = off0 + 6 * dS, offw = 0; - for (; off0 < end; ++off0, ++off6, offw += F) - { - w0 = Load(weight0 + offw); - s0 = vdupq_n_f32(src0[off0]), d00 = vmlaq_f32(d00, s0, w0); - s0 = vdupq_n_f32(src1[off0]), d10 = vmlaq_f32(d10, s0, w0); - s0 = vdupq_n_f32(src2[off0]), d20 = vmlaq_f32(d20, s0, w0); - s0 = vdupq_n_f32(src3[off0]), d30 = vmlaq_f32(d30, s0, w0); - s0 = vdupq_n_f32(src4[off0]), d40 = vmlaq_f32(d40, s0, w0); - s0 = vdupq_n_f32(src5[off0]), d50 = vmlaq_f32(d50, s0, w0); - s0 = vdupq_n_f32(src0[off6]), d60 = vmlaq_f32(d60, s0, w0); - s0 = vdupq_n_f32(src1[off6]), d70 = vmlaq_f32(d70, s0, w0); - s0 = vdupq_n_f32(src2[off6]), d80 = vmlaq_f32(d80, s0, w0); - s0 = vdupq_n_f32(src3[off6]), d90 = vmlaq_f32(d90, s0, w0); - s0 = vdupq_n_f32(src4[off6]), da0 = vmlaq_f32(da0, s0, w0); - s0 = vdupq_n_f32(src5[off6]), db0 = vmlaq_f32(db0, s0, w0); - } - weight0 += dW; - } - } - else - weight0 += dWz; - } - if (dstC == F) - { - Save1(dst, d00, bias, params), dst += dD; - Save1(dst, d10, bias, params), dst += dD; - Save1(dst, d20, bias, params), dst += dD; - Save1(dst, d30, bias, params), dst += dD; - Save1(dst, d40, bias, params), dst += dD; - Save1(dst, d50, bias, params), dst += dD; - Save1(dst, d60, bias, params), dst += dD; - Save1(dst, d70, bias, params), dst += dD; - Save1(dst, d80, bias, params), dst += dD; - Save1(dst, d90, bias, params), dst += dD; - Save1(dst, da0, bias, params), dst += dD; - Save1(dst, db0, bias, params), dst += dD; - } - else - { - Save1(dst, d00, bias, params, dstC), dst += dD; - Save1(dst, d10, bias, params, dstC), dst += dD; - Save1(dst, d20, bias, params, dstC), dst += dD; - Save1(dst, d30, bias, params, dstC), dst += dD; - Save1(dst, d40, bias, params, dstC), dst += dD; - Save1(dst, d50, bias, params, dstC), dst += dD; - Save1(dst, d60, bias, params, dstC), dst += dD; - Save1(dst, d70, bias, params, dstC), dst += dD; - Save1(dst, d80, bias, params, dstC), dst += dD; - Save1(dst, d90, bias, params, dstC), dst += dD; - Save1(dst, da0, bias, params, dstC), dst += dD; - Save1(dst, db0, bias, params, dstC), dst += dD; - } - } - } - - template void ConvolutionNhwcDirect_2xM(const float* src0, const ConvParam32f& p, - const AlgParam& a, size_t dy, size_t dx, size_t srcC, size_t dstC, const float* weight0, const float32x4_t* bias, const float32x4_t* params, float* dst) - { - float32x4_t d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, d60, d61, d70, d71, d80, d81, d90, d91, da0, da1, db0, db1, s0, w0, w1; - size_t srcH = p.srcH, srcW = p.srcW, dilY = p.dilationY, dilX = p.dilationX; - size_t dY = p.srcW * p.srcC, dX = p.srcC, dS = p.srcC * p.strideX, dW = p.srcC * F, dWz = p.kernelX * p.srcC * F, dD = p.dstC; - size_t sy = dy * p.strideY - p.padY, sx = dx * p.strideX - p.padX; - size_t kY = p.kernelY * p.dilationY, kX = p.kernelX * p.dilationX; - const float* weight1 = weight0 + a.stepW; - const float* src1 = src0 + 1 * dS; - const float* src2 = src0 + 2 * dS; - const 
float* src3 = src0 + 3 * dS; - const float* src4 = src0 + 4 * dS; - const float* src5 = src0 + 5 * dS; - if (dstC > F) - { - if (M > 0x0) d00 = vdupq_n_f32(0.0f), d01 = vdupq_n_f32(0.0f); - if (M > 0x1) d10 = vdupq_n_f32(0.0f), d11 = vdupq_n_f32(0.0f); - if (M > 0x2) d20 = vdupq_n_f32(0.0f), d21 = vdupq_n_f32(0.0f); - if (M > 0x3) d30 = vdupq_n_f32(0.0f), d31 = vdupq_n_f32(0.0f); - if (M > 0x4) d40 = vdupq_n_f32(0.0f), d41 = vdupq_n_f32(0.0f); - if (M > 0x5) d50 = vdupq_n_f32(0.0f), d51 = vdupq_n_f32(0.0f); - if (M > 0x6) d60 = vdupq_n_f32(0.0f), d61 = vdupq_n_f32(0.0f); - if (M > 0x7) d70 = vdupq_n_f32(0.0f), d71 = vdupq_n_f32(0.0f); - if (M > 0x8) d80 = vdupq_n_f32(0.0f), d81 = vdupq_n_f32(0.0f); - if (M > 0x9) d90 = vdupq_n_f32(0.0f), d91 = vdupq_n_f32(0.0f); - if (M > 0xa) da0 = vdupq_n_f32(0.0f), da1 = vdupq_n_f32(0.0f); - if (M > 0xb) db0 = vdupq_n_f32(0.0f), db1 = vdupq_n_f32(0.0f); - for (size_t ky = 0; ky < kY; ky += dilY) - { - if (sy + ky < srcH) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - assert(sx + kx < srcW && sx + kx + M <= srcW); - size_t off0 = beg + kx * dX, end = off0 + srcC, off6 = off0 + 6 * dS, offw = 0; - for (; off0 < end; ++off0, ++off6, offw += F) - { - w0 = Load(weight0 + offw); - w1 = Load(weight1 + offw); - if (M > 0x0) s0 = vdupq_n_f32(src0[off0]), d00 = vmlaq_f32(d00, s0, w0), d01 = vmlaq_f32(d01, s0, w1); - if (M > 0x1) s0 = vdupq_n_f32(src1[off0]), d10 = vmlaq_f32(d10, s0, w0), d11 = vmlaq_f32(d11, s0, w1); - if (M > 0x2) s0 = vdupq_n_f32(src2[off0]), d20 = vmlaq_f32(d20, s0, w0), d21 = vmlaq_f32(d21, s0, w1); - if (M > 0x3) s0 = vdupq_n_f32(src3[off0]), d30 = vmlaq_f32(d30, s0, w0), d31 = vmlaq_f32(d31, s0, w1); - if (M > 0x4) s0 = vdupq_n_f32(src4[off0]), d40 = vmlaq_f32(d40, s0, w0), d41 = vmlaq_f32(d41, s0, w1); - if (M > 0x5) s0 = vdupq_n_f32(src5[off0]), d50 = vmlaq_f32(d50, s0, w0), d51 = vmlaq_f32(d51, s0, w1); - if (M > 0x6) s0 = vdupq_n_f32(src0[off6]), d60 = vmlaq_f32(d60, s0, w0), d61 = vmlaq_f32(d61, s0, w1); - if (M > 0x7) s0 = vdupq_n_f32(src1[off6]), d70 = vmlaq_f32(d70, s0, w0), d71 = vmlaq_f32(d71, s0, w1); - if (M > 0x8) s0 = vdupq_n_f32(src2[off6]), d80 = vmlaq_f32(d80, s0, w0), d81 = vmlaq_f32(d81, s0, w1); - if (M > 0x9) s0 = vdupq_n_f32(src3[off6]), d90 = vmlaq_f32(d90, s0, w0), d91 = vmlaq_f32(d91, s0, w1); - if (M > 0xa) s0 = vdupq_n_f32(src4[off6]), da0 = vmlaq_f32(da0, s0, w0), da1 = vmlaq_f32(da1, s0, w1); - if (M > 0xb) s0 = vdupq_n_f32(src5[off6]), db0 = vmlaq_f32(db0, s0, w0), db1 = vmlaq_f32(db1, s0, w1); - } - weight0 += dW, weight1 += dW; - } - } - else - weight0 += dWz, weight1 += dWz; - } - if (dstC == DF) - { - if (M > 0x0) Save2(dst, d00, d01, bias, params), dst += dD; - if (M > 0x1) Save2(dst, d10, d11, bias, params), dst += dD; - if (M > 0x2) Save2(dst, d20, d21, bias, params), dst += dD; - if (M > 0x3) Save2(dst, d30, d31, bias, params), dst += dD; - if (M > 0x4) Save2(dst, d40, d41, bias, params), dst += dD; - if (M > 0x5) Save2(dst, d50, d51, bias, params), dst += dD; - if (M > 0x6) Save2(dst, d60, d61, bias, params), dst += dD; - if (M > 0x7) Save2(dst, d70, d71, bias, params), dst += dD; - if (M > 0x8) Save2(dst, d80, d81, bias, params), dst += dD; - if (M > 0x9) Save2(dst, d90, d91, bias, params), dst += dD; - if (M > 0xa) Save2(dst, da0, da1, bias, params), dst += dD; - if (M > 0xb) Save2(dst, db0, db1, bias, params), dst += dD; - } - else - { - dstC -= F; - if (M > 0x0) Save2(dst, d00, d01, bias, params, dstC), dst += dD; - if (M > 0x1) Save2(dst, d10, d11, 
bias, params, dstC), dst += dD; - if (M > 0x2) Save2(dst, d20, d21, bias, params, dstC), dst += dD; - if (M > 0x3) Save2(dst, d30, d31, bias, params, dstC), dst += dD; - if (M > 0x4) Save2(dst, d40, d41, bias, params, dstC), dst += dD; - if (M > 0x5) Save2(dst, d50, d51, bias, params, dstC), dst += dD; - if (M > 0x6) Save2(dst, d60, d61, bias, params, dstC), dst += dD; - if (M > 0x7) Save2(dst, d70, d71, bias, params, dstC), dst += dD; - if (M > 0x8) Save2(dst, d80, d81, bias, params, dstC), dst += dD; - if (M > 0x9) Save2(dst, d90, d91, bias, params, dstC), dst += dD; - if (M > 0xa) Save2(dst, da0, da1, bias, params, dstC), dst += dD; - if (M > 0xb) Save2(dst, db0, db1, bias, params, dstC), dst += dD; - } - } - else - { - if (M > 0x0) d00 = vdupq_n_f32(0.0f); - if (M > 0x1) d10 = vdupq_n_f32(0.0f); - if (M > 0x2) d20 = vdupq_n_f32(0.0f); - if (M > 0x3) d30 = vdupq_n_f32(0.0f); - if (M > 0x4) d40 = vdupq_n_f32(0.0f); - if (M > 0x5) d50 = vdupq_n_f32(0.0f); - if (M > 0x6) d60 = vdupq_n_f32(0.0f); - if (M > 0x7) d70 = vdupq_n_f32(0.0f); - if (M > 0x8) d80 = vdupq_n_f32(0.0f); - if (M > 0x9) d90 = vdupq_n_f32(0.0f); - if (M > 0xa) da0 = vdupq_n_f32(0.0f); - if (M > 0xb) db0 = vdupq_n_f32(0.0f); - for (size_t ky = 0; ky < kY; ky += dilY) - { - if (sy + ky < srcH) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - assert(sx + kx < srcW && sx + kx + M <= srcW); - size_t off0 = beg + kx * dX, end = off0 + srcC, off6 = off0 + 6 * dS, offw = 0; - for (; off0 < end; ++off0, ++off6, offw += F) - { - w0 = Load(weight0 + offw); - if (M > 0x0) s0 = vdupq_n_f32(src0[off0]), d00 = vmlaq_f32(d00, s0, w0); - if (M > 0x1) s0 = vdupq_n_f32(src1[off0]), d10 = vmlaq_f32(d10, s0, w0); - if (M > 0x2) s0 = vdupq_n_f32(src2[off0]), d20 = vmlaq_f32(d20, s0, w0); - if (M > 0x3) s0 = vdupq_n_f32(src3[off0]), d30 = vmlaq_f32(d30, s0, w0); - if (M > 0x4) s0 = vdupq_n_f32(src4[off0]), d40 = vmlaq_f32(d40, s0, w0); - if (M > 0x5) s0 = vdupq_n_f32(src5[off0]), d50 = vmlaq_f32(d50, s0, w0); - if (M > 0x6) s0 = vdupq_n_f32(src0[off6]), d60 = vmlaq_f32(d60, s0, w0); - if (M > 0x7) s0 = vdupq_n_f32(src1[off6]), d70 = vmlaq_f32(d70, s0, w0); - if (M > 0x8) s0 = vdupq_n_f32(src2[off6]), d80 = vmlaq_f32(d80, s0, w0); - if (M > 0x9) s0 = vdupq_n_f32(src3[off6]), d90 = vmlaq_f32(d90, s0, w0); - if (M > 0xa) s0 = vdupq_n_f32(src4[off6]), da0 = vmlaq_f32(da0, s0, w0); - if (M > 0xb) s0 = vdupq_n_f32(src5[off6]), db0 = vmlaq_f32(db0, s0, w0); - } - weight0 += dW; - } - } - else - weight0 += dWz; - } - if (dstC == F) - { - if (M > 0x0) Save1(dst, d00, bias, params), dst += dD; - if (M > 0x1) Save1(dst, d10, bias, params), dst += dD; - if (M > 0x2) Save1(dst, d20, bias, params), dst += dD; - if (M > 0x3) Save1(dst, d30, bias, params), dst += dD; - if (M > 0x4) Save1(dst, d40, bias, params), dst += dD; - if (M > 0x5) Save1(dst, d50, bias, params), dst += dD; - if (M > 0x6) Save1(dst, d60, bias, params), dst += dD; - if (M > 0x7) Save1(dst, d70, bias, params), dst += dD; - if (M > 0x8) Save1(dst, d80, bias, params), dst += dD; - if (M > 0x9) Save1(dst, d90, bias, params), dst += dD; - if (M > 0xa) Save1(dst, da0, bias, params), dst += dD; - if (M > 0xb) Save1(dst, db0, bias, params), dst += dD; - } - else - { - if (M > 0x0) Save1(dst, d00, bias, params, dstC), dst += dD; - if (M > 0x1) Save1(dst, d10, bias, params, dstC), dst += dD; - if (M > 0x2) Save1(dst, d20, bias, params, dstC), dst += dD; - if (M > 0x3) Save1(dst, d30, bias, params, dstC), dst += dD; - if (M > 0x4) Save1(dst, d40, bias, 
params, dstC), dst += dD; - if (M > 0x5) Save1(dst, d50, bias, params, dstC), dst += dD; - if (M > 0x6) Save1(dst, d60, bias, params, dstC), dst += dD; - if (M > 0x7) Save1(dst, d70, bias, params, dstC), dst += dD; - if (M > 0x8) Save1(dst, d80, bias, params, dstC), dst += dD; - if (M > 0x9) Save1(dst, d90, bias, params, dstC), dst += dD; - if (M > 0xa) Save1(dst, da0, bias, params, dstC), dst += dD; - if (M > 0xb) Save1(dst, db0, bias, params, dstC), dst += dD; - } - } - } - - template ConvolutionNhwcDirect_NxM_Ptr GetConvolutionNhwcDirect_2xM(size_t M) - { - switch (M) - { - case 0x0: return NULL; - case 0x1: return ConvolutionNhwcDirect_2xM; - case 0x2: return ConvolutionNhwcDirect_2xM; - case 0x3: return ConvolutionNhwcDirect_2xM; - case 0x4: return ConvolutionNhwcDirect_2xM; - case 0x5: return ConvolutionNhwcDirect_2xM; - case 0x6: return ConvolutionNhwcDirect_2xM; - case 0x7: return ConvolutionNhwcDirect_2xM; - case 0x8: return ConvolutionNhwcDirect_2xM; - case 0x9: return ConvolutionNhwcDirect_2xM; - case 0xa: return ConvolutionNhwcDirect_2xM; - case 0xb: return ConvolutionNhwcDirect_2xM; - } - assert(0); - return NULL; - } -#else - template void ConvolutionNhwcDirect_2x6(const float* src0, const ConvParam32f& p, - const AlgParam& a, size_t dy, size_t dx, size_t srcC, size_t dstC, const float* weight0, const float32x4_t* bias, const float32x4_t* params, float* dst) - { - float32x4_t d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, s0, w0, w1; - size_t srcH = p.srcH, srcW = p.srcW, dilY = p.dilationY, dilX = p.dilationX; - size_t dY = p.srcW * p.srcC, dX = p.srcC, dS = p.srcC * p.strideX, dW = p.srcC * F, dWz = p.kernelX * p.srcC * F, dD = p.dstC; - size_t sy = dy * p.strideY - p.padY, sx = dx * p.strideX - p.padX; - size_t kY = p.kernelY * p.dilationY, kX = p.kernelX * p.dilationX; - const float* weight1 = weight0 + a.stepW; - const float* src1 = src0 + 1 * dS; - const float* src2 = src0 + 2 * dS; - const float* src3 = src0 + 3 * dS; - const float* src4 = src0 + 4 * dS; - const float* src5 = src0 + 5 * dS; - if (dstC > F) - { - d00 = vdupq_n_f32(0.0f), d01 = vdupq_n_f32(0.0f); - d10 = vdupq_n_f32(0.0f), d11 = vdupq_n_f32(0.0f); - d20 = vdupq_n_f32(0.0f), d21 = vdupq_n_f32(0.0f); - d30 = vdupq_n_f32(0.0f), d31 = vdupq_n_f32(0.0f); - d40 = vdupq_n_f32(0.0f), d41 = vdupq_n_f32(0.0f); - d50 = vdupq_n_f32(0.0f), d51 = vdupq_n_f32(0.0f); - for (size_t ky = 0; ky < kY; ky += dilY) - { - if (sy + ky < srcH) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - assert(sx + kx < srcW && sx + kx + 6 <= srcW); - size_t offs = beg + kx * dX, end = offs + srcC, offw = 0; - for (; offs < end; ++offs, offw += F) - { - w0 = Load(weight0 + offw); - w1 = Load(weight1 + offw); - s0 = vdupq_n_f32(src0[offs]), d00 = vmlaq_f32(d00, s0, w0), d01 = vmlaq_f32(d01, s0, w1); - s0 = vdupq_n_f32(src1[offs]), d10 = vmlaq_f32(d10, s0, w0), d11 = vmlaq_f32(d11, s0, w1); - s0 = vdupq_n_f32(src2[offs]), d20 = vmlaq_f32(d20, s0, w0), d21 = vmlaq_f32(d21, s0, w1); - s0 = vdupq_n_f32(src3[offs]), d30 = vmlaq_f32(d30, s0, w0), d31 = vmlaq_f32(d31, s0, w1); - s0 = vdupq_n_f32(src4[offs]), d40 = vmlaq_f32(d40, s0, w0), d41 = vmlaq_f32(d41, s0, w1); - s0 = vdupq_n_f32(src5[offs]), d50 = vmlaq_f32(d50, s0, w0), d51 = vmlaq_f32(d51, s0, w1); - } - weight0 += dW, weight1 += dW; - } - } - else - weight0 += dWz, weight1 += dWz; - } - if (dstC == DF) - { - Save2(dst, d00, d01, bias, params), dst += dD; - Save2(dst, d10, d11, bias, params), dst += dD; - Save2(dst, d20, d21, bias, 
params), dst += dD; - Save2(dst, d30, d31, bias, params), dst += dD; - Save2(dst, d40, d41, bias, params), dst += dD; - Save2(dst, d50, d51, bias, params), dst += dD; - } - else - { - dstC -= F; - Save2(dst, d00, d01, bias, params, dstC), dst += dD; - Save2(dst, d10, d11, bias, params, dstC), dst += dD; - Save2(dst, d20, d21, bias, params, dstC), dst += dD; - Save2(dst, d30, d31, bias, params, dstC), dst += dD; - Save2(dst, d40, d41, bias, params, dstC), dst += dD; - Save2(dst, d50, d51, bias, params, dstC), dst += dD; - } - } - else - { - d00 = vdupq_n_f32(0.0f); - d10 = vdupq_n_f32(0.0f); - d20 = vdupq_n_f32(0.0f); - d30 = vdupq_n_f32(0.0f); - d40 = vdupq_n_f32(0.0f); - d50 = vdupq_n_f32(0.0f); - for (size_t ky = 0; ky < kY; ky += dilY) - { - if (sy + ky < srcH) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - assert(sx + kx < srcW && sx + kx + 6 <= srcW); - size_t offs = beg + kx * dX, end = offs + srcC, offw = 0; - for (; offs < end; ++offs, offw += F) - { - w0 = Load(weight0 + offw); - s0 = vdupq_n_f32(src0[offs]), d00 = vmlaq_f32(d00, s0, w0); - s0 = vdupq_n_f32(src1[offs]), d10 = vmlaq_f32(d10, s0, w0); - s0 = vdupq_n_f32(src2[offs]), d20 = vmlaq_f32(d20, s0, w0); - s0 = vdupq_n_f32(src3[offs]), d30 = vmlaq_f32(d30, s0, w0); - s0 = vdupq_n_f32(src4[offs]), d40 = vmlaq_f32(d40, s0, w0); - s0 = vdupq_n_f32(src5[offs]), d50 = vmlaq_f32(d50, s0, w0); - } - weight0 += dW; - } - } - else - weight0 += dWz; - } - if (dstC == F) - { - Save1(dst, d00, bias, params), dst += dD; - Save1(dst, d10, bias, params), dst += dD; - Save1(dst, d20, bias, params), dst += dD; - Save1(dst, d30, bias, params), dst += dD; - Save1(dst, d40, bias, params), dst += dD; - Save1(dst, d50, bias, params), dst += dD; - } - else - { - Save1(dst, d00, bias, params, dstC), dst += dD; - Save1(dst, d10, bias, params, dstC), dst += dD; - Save1(dst, d20, bias, params, dstC), dst += dD; - Save1(dst, d30, bias, params, dstC), dst += dD; - Save1(dst, d40, bias, params, dstC), dst += dD; - Save1(dst, d50, bias, params, dstC), dst += dD; - } - } - } - - template void ConvolutionNhwcDirect_2xM(const float* src0, const ConvParam32f& p, - const AlgParam& a, size_t dy, size_t dx, size_t srcC, size_t dstC, const float* weight0, const float32x4_t* bias, const float32x4_t* params, float* dst) - { - float32x4_t d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, s0, w0, w1; - size_t srcH = p.srcH, srcW = p.srcW, dilY = p.dilationY, dilX = p.dilationX; - size_t dY = p.srcW * p.srcC, dX = p.srcC, dS = p.srcC * p.strideX, dW = p.srcC * F, dWz = p.kernelX * p.srcC * F, dD = p.dstC; - size_t sy = dy * p.strideY - p.padY, sx = dx * p.strideX - p.padX; - size_t kY = p.kernelY * p.dilationY, kX = p.kernelX * p.dilationX; - const float* weight1 = weight0 + a.stepW; - const float* src1 = src0 + 1 * dS; - const float* src2 = src0 + 2 * dS; - const float* src3 = src0 + 3 * dS; - const float* src4 = src0 + 4 * dS; - const float* src5 = src0 + 5 * dS; - if (dstC > F) - { - if (M > 0) d00 = vdupq_n_f32(0.0f), d01 = vdupq_n_f32(0.0f); - if (M > 1) d10 = vdupq_n_f32(0.0f), d11 = vdupq_n_f32(0.0f); - if (M > 2) d20 = vdupq_n_f32(0.0f), d21 = vdupq_n_f32(0.0f); - if (M > 3) d30 = vdupq_n_f32(0.0f), d31 = vdupq_n_f32(0.0f); - if (M > 4) d40 = vdupq_n_f32(0.0f), d41 = vdupq_n_f32(0.0f); - if (M > 5) d50 = vdupq_n_f32(0.0f), d51 = vdupq_n_f32(0.0f); - for (size_t ky = 0; ky < kY; ky += dilY) - { - if (sy + ky < srcH) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - 
{ - assert(sx + kx < srcW && sx + kx + M <= srcW); - size_t offs = beg + kx * dX, end = offs + srcC, offw = 0; - for (; offs < end; ++offs, offw += F) - { - w0 = Load(weight0 + offw); - w1 = Load(weight1 + offw); - if (M > 0) s0 = vdupq_n_f32(src0[offs]), d00 = vmlaq_f32(d00, s0, w0), d01 = vmlaq_f32(d01, s0, w1); - if (M > 1) s0 = vdupq_n_f32(src1[offs]), d10 = vmlaq_f32(d10, s0, w0), d11 = vmlaq_f32(d11, s0, w1); - if (M > 2) s0 = vdupq_n_f32(src2[offs]), d20 = vmlaq_f32(d20, s0, w0), d21 = vmlaq_f32(d21, s0, w1); - if (M > 3) s0 = vdupq_n_f32(src3[offs]), d30 = vmlaq_f32(d30, s0, w0), d31 = vmlaq_f32(d31, s0, w1); - if (M > 4) s0 = vdupq_n_f32(src4[offs]), d40 = vmlaq_f32(d40, s0, w0), d41 = vmlaq_f32(d41, s0, w1); - if (M > 5) s0 = vdupq_n_f32(src5[offs]), d50 = vmlaq_f32(d50, s0, w0), d51 = vmlaq_f32(d51, s0, w1); - } - weight0 += dW, weight1 += dW; - } - } - else - weight0 += dWz, weight1 += dWz; - } - if (dstC == DF) - { - if (M > 0) Save2(dst, d00, d01, bias, params), dst += dD; - if (M > 1) Save2(dst, d10, d11, bias, params), dst += dD; - if (M > 2) Save2(dst, d20, d21, bias, params), dst += dD; - if (M > 3) Save2(dst, d30, d31, bias, params), dst += dD; - if (M > 4) Save2(dst, d40, d41, bias, params), dst += dD; - if (M > 5) Save2(dst, d50, d51, bias, params), dst += dD; - } - else - { - dstC -= F; - if (M > 0) Save2(dst, d00, d01, bias, params, dstC), dst += dD; - if (M > 1) Save2(dst, d10, d11, bias, params, dstC), dst += dD; - if (M > 2) Save2(dst, d20, d21, bias, params, dstC), dst += dD; - if (M > 3) Save2(dst, d30, d31, bias, params, dstC), dst += dD; - if (M > 4) Save2(dst, d40, d41, bias, params, dstC), dst += dD; - if (M > 5) Save2(dst, d50, d51, bias, params, dstC), dst += dD; - } - } - else - { - if (M > 0) d00 = vdupq_n_f32(0.0f); - if (M > 1) d10 = vdupq_n_f32(0.0f); - if (M > 2) d20 = vdupq_n_f32(0.0f); - if (M > 3) d30 = vdupq_n_f32(0.0f); - if (M > 4) d40 = vdupq_n_f32(0.0f); - if (M > 5) d50 = vdupq_n_f32(0.0f); - for (size_t ky = 0; ky < kY; ky += dilY) - { - if (sy + ky < srcH) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - assert(sx + kx < srcW && sx + kx + M <= srcW); - size_t offs = beg + kx * dX, end = offs + srcC, offw = 0; - for (; offs < end; ++offs, offw += F) - { - w0 = Load(weight0 + offw); - if (M > 0) s0 = vdupq_n_f32(src0[offs]), d00 = vmlaq_f32(d00, s0, w0); - if (M > 1) s0 = vdupq_n_f32(src1[offs]), d10 = vmlaq_f32(d10, s0, w0); - if (M > 2) s0 = vdupq_n_f32(src2[offs]), d20 = vmlaq_f32(d20, s0, w0); - if (M > 3) s0 = vdupq_n_f32(src3[offs]), d30 = vmlaq_f32(d30, s0, w0); - if (M > 4) s0 = vdupq_n_f32(src4[offs]), d40 = vmlaq_f32(d40, s0, w0); - if (M > 5) s0 = vdupq_n_f32(src5[offs]), d50 = vmlaq_f32(d50, s0, w0); - } - weight0 += dW; - } - } - else - weight0 += dWz; - } - if (dstC == F) - { - if (M > 0) Save1(dst, d00, bias, params), dst += dD; - if (M > 1) Save1(dst, d10, bias, params), dst += dD; - if (M > 2) Save1(dst, d20, bias, params), dst += dD; - if (M > 3) Save1(dst, d30, bias, params), dst += dD; - if (M > 4) Save1(dst, d40, bias, params), dst += dD; - if (M > 5) Save1(dst, d50, bias, params), dst += dD; - } - else - { - if (M > 0) Save1(dst, d00, bias, params, dstC), dst += dD; - if (M > 1) Save1(dst, d10, bias, params, dstC), dst += dD; - if (M > 2) Save1(dst, d20, bias, params, dstC), dst += dD; - if (M > 3) Save1(dst, d30, bias, params, dstC), dst += dD; - if (M > 4) Save1(dst, d40, bias, params, dstC), dst += dD; - if (M > 5) Save1(dst, d50, bias, params, dstC), dst += dD; - } - } - 
}
-
-        template<TermType term, SimdConvolutionActivationType type> ConvolutionNhwcDirect_NxM_Ptr GetConvolutionNhwcDirect_2xM(size_t M)
-        {
-            switch (M)
-            {
-            case 0: return NULL;
-            case 1: return ConvolutionNhwcDirect_2xM<term, type, 1>;
-            case 2: return ConvolutionNhwcDirect_2xM<term, type, 2>;
-            case 3: return ConvolutionNhwcDirect_2xM<term, type, 3>;
-            case 4: return ConvolutionNhwcDirect_2xM<term, type, 4>;
-            case 5: return ConvolutionNhwcDirect_2xM<term, type, 5>;
-            }
-            assert(0);
-            return NULL;
-        }
-#endif
-
-        template<TermType term, SimdConvolutionActivationType type> void ConvolutionNhwcDirect_2(const float* src, const ConvParam32f& p, const AlgParam& a,
-            size_t dstC, size_t yBeg, size_t yEnd, size_t srcC, const float* weight, const float* bias, const float* params, float* dst)
-        {
-            size_t noseH = p.NoseH(), noseW = p.NoseW(), bodyH = p.BodyH(), bodyW = p.BodyW();
-            ConvolutionNhwcDirect_NxM_Ptr convolutionNhwcDirect_2x1 = ConvolutionNhwcDirect_2x1<term, type>;
-#if defined(SIMD_ARM64_ENABLE)
-            size_t n = 12, bodyWn = AlignLoAny(bodyW - noseW, n) + noseW, m = bodyW - bodyWn;
-            ConvolutionNhwcDirect_NxM_Ptr convolutionNhwcDirect_2xN = ConvolutionNhwcDirect_2x12<term, type>;
-#else
-            size_t n = 6, bodyWn = AlignLoAny(bodyW - noseW, n) + noseW, m = bodyW - bodyWn;
-            ConvolutionNhwcDirect_NxM_Ptr convolutionNhwcDirect_2xN = ConvolutionNhwcDirect_2x6<term, type>;
-#endif
-            ConvolutionNhwcDirect_NxM_Ptr convolutionNhwcDirect_2xM = GetConvolutionNhwcDirect_2xM<term, type>(m);
-            size_t tailH = p.dstH, tailW = p.dstW;
-            size_t kY = p.kernelY - noseH, kX = p.kernelX - noseW, kH = bodyH + p.kernelY - 1, kW = bodyW + p.kernelX - 1;
-
-            float32x4_t _params[2], _bias[2];
-            _params[0] = vdupq_n_f32(params[0]);
-            if (type == ::SimdConvolutionActivationRestrictRange || type == ::SimdConvolutionActivationHswish)
-                _params[1] = vdupq_n_f32(params[1]);
-
-            for (size_t dc = 0; dc < dstC; dc += a.microD)
-            {
-                size_t dC = Simd::Min(a.microD, dstC - dc);
-                if (dC > 0 * F) _bias[0] = Load<false>(bias + dc + 0 * F);
-                if (dC > 1 * F) _bias[1] = Load<false>(bias + dc + 1 * F);
-                if (type == ::SimdConvolutionActivationPrelu)
-                {
-                    if (dC > 0 * F) _params[0] = Load<false>(params + dc + 0 * F);
-                    if (dC > 1 * F) _params[1] = Load<false>(params + dc + 1 * F);
-                }
-                float* d = dst + dc + yBeg * p.dstW * p.dstC;
-                size_t dy = yBeg;
-                for (; dy < noseH && dy < yEnd; dy++)
-                {
-                    size_t dx = 0;
-                    for (; dx < noseW; dx++, d += p.dstC)
-                        convolutionNhwcDirect_2x1(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d);
-                    for (; dx < bodyWn; dx += n, d += p.dstC * n)
-                        convolutionNhwcDirect_2xN(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d);
-                    for (; dx < bodyW; dx += m, d += p.dstC * m)
-                        convolutionNhwcDirect_2xM(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d);
-                    for (; dx < tailW; dx++, d += p.dstC)
-                        convolutionNhwcDirect_2x1(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d);
-                }
-                for (; dy < bodyH && dy < yEnd; dy++)
-                {
-                    size_t dx = 0;
-                    for (; dx < noseW; dx++, d += p.dstC)
-                        convolutionNhwcDirect_2x1(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d);
-                    for (; dx < bodyWn; dx += n, d += p.dstC * n)
-                        convolutionNhwcDirect_2xN(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d);
-                    for (; dx < bodyW; dx += m, d += p.dstC * m)
-                        convolutionNhwcDirect_2xM(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d);
-                    for (; dx < tailW; dx++, d += p.dstC)
-                        convolutionNhwcDirect_2x1(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d);
-                }
-                for (; dy < tailH && dy < yEnd; dy++)
-                {
-                    size_t dx = 0;
-                    for (; dx < noseW; dx++, d += p.dstC)
-                        convolutionNhwcDirect_2x1(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d);
-                    for (; dx < bodyWn; dx += n, d += p.dstC * n)
-                        convolutionNhwcDirect_2xN(src, p, a, dy, dx, srcC, dC, 
weight, _bias, _params, d); - for (; dx < bodyW; dx += m, d += p.dstC * m) - convolutionNhwcDirect_2xM(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d); - for (; dx < tailW; dx++, d += p.dstC) - convolutionNhwcDirect_2x1(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d); - } - weight += p.kernelY * p.kernelX * p.srcC * a.microD; - } - } - - //--------------------------------------------------------------------- - -#if defined(SIMD_ARM64_ENABLE) - template void ConvolutionNhwcDirect1x1_2x12(const float* src0, const ConvParam32f& p, - const AlgParam& a, size_t srcC, size_t dstC, const float* weight0, const float32x4_t* bias, const float32x4_t* params, float* dst) - { - float32x4_t d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, d60, d61, d70, d71, d80, d81, d90, d91, da0, da1, db0, db1, s0, w0, w1; - size_t dS = p.srcC, dD = p.dstC; - const float* weight1 = weight0 + a.stepW; - const float* src1 = src0 + 1 * dS; - const float* src2 = src0 + 2 * dS; - const float* src3 = src0 + 3 * dS; - const float* src4 = src0 + 4 * dS; - const float* src5 = src0 + 5 * dS; - if (dstC > F) - { - d00 = vdupq_n_f32(0.0f), d01 = vdupq_n_f32(0.0f); - d10 = vdupq_n_f32(0.0f), d11 = vdupq_n_f32(0.0f); - d20 = vdupq_n_f32(0.0f), d21 = vdupq_n_f32(0.0f); - d30 = vdupq_n_f32(0.0f), d31 = vdupq_n_f32(0.0f); - d40 = vdupq_n_f32(0.0f), d41 = vdupq_n_f32(0.0f); - d50 = vdupq_n_f32(0.0f), d51 = vdupq_n_f32(0.0f); - d60 = vdupq_n_f32(0.0f), d61 = vdupq_n_f32(0.0f); - d70 = vdupq_n_f32(0.0f), d71 = vdupq_n_f32(0.0f); - d80 = vdupq_n_f32(0.0f), d81 = vdupq_n_f32(0.0f); - d90 = vdupq_n_f32(0.0f), d91 = vdupq_n_f32(0.0f); - da0 = vdupq_n_f32(0.0f), da1 = vdupq_n_f32(0.0f); - db0 = vdupq_n_f32(0.0f), db1 = vdupq_n_f32(0.0f); - for (size_t off0 = 0, off6 = 6 * dS, offw = 0; off0 < srcC; ++off0, ++off6, offw += F) - { - w0 = Load(weight0 + offw); - w1 = Load(weight1 + offw); - s0 = vdupq_n_f32(src0[off0]), d00 = vmlaq_f32(d00, s0, w0), d01 = vmlaq_f32(d01, s0, w1); - s0 = vdupq_n_f32(src1[off0]), d10 = vmlaq_f32(d10, s0, w0), d11 = vmlaq_f32(d11, s0, w1); - s0 = vdupq_n_f32(src2[off0]), d20 = vmlaq_f32(d20, s0, w0), d21 = vmlaq_f32(d21, s0, w1); - s0 = vdupq_n_f32(src3[off0]), d30 = vmlaq_f32(d30, s0, w0), d31 = vmlaq_f32(d31, s0, w1); - s0 = vdupq_n_f32(src4[off0]), d40 = vmlaq_f32(d40, s0, w0), d41 = vmlaq_f32(d41, s0, w1); - s0 = vdupq_n_f32(src5[off0]), d50 = vmlaq_f32(d50, s0, w0), d51 = vmlaq_f32(d51, s0, w1); - s0 = vdupq_n_f32(src0[off6]), d60 = vmlaq_f32(d60, s0, w0), d61 = vmlaq_f32(d61, s0, w1); - s0 = vdupq_n_f32(src1[off6]), d70 = vmlaq_f32(d70, s0, w0), d71 = vmlaq_f32(d71, s0, w1); - s0 = vdupq_n_f32(src2[off6]), d80 = vmlaq_f32(d80, s0, w0), d81 = vmlaq_f32(d81, s0, w1); - s0 = vdupq_n_f32(src3[off6]), d90 = vmlaq_f32(d90, s0, w0), d91 = vmlaq_f32(d91, s0, w1); - s0 = vdupq_n_f32(src4[off6]), da0 = vmlaq_f32(da0, s0, w0), da1 = vmlaq_f32(da1, s0, w1); - s0 = vdupq_n_f32(src5[off6]), db0 = vmlaq_f32(db0, s0, w0), db1 = vmlaq_f32(db1, s0, w1); - } - if (dstC == DF) - { - Save2(dst, d00, d01, bias, params), dst += dD; - Save2(dst, d10, d11, bias, params), dst += dD; - Save2(dst, d20, d21, bias, params), dst += dD; - Save2(dst, d30, d31, bias, params), dst += dD; - Save2(dst, d40, d41, bias, params), dst += dD; - Save2(dst, d50, d51, bias, params), dst += dD; - Save2(dst, d60, d61, bias, params), dst += dD; - Save2(dst, d70, d71, bias, params), dst += dD; - Save2(dst, d80, d81, bias, params), dst += dD; - Save2(dst, d90, d91, bias, params), dst += dD; - Save2(dst, da0, da1, bias, params), dst 
+= dD; - Save2(dst, db0, db1, bias, params), dst += dD; - } - else - { - dstC -= F; - Save2(dst, d00, d01, bias, params, dstC), dst += dD; - Save2(dst, d10, d11, bias, params, dstC), dst += dD; - Save2(dst, d20, d21, bias, params, dstC), dst += dD; - Save2(dst, d30, d31, bias, params, dstC), dst += dD; - Save2(dst, d40, d41, bias, params, dstC), dst += dD; - Save2(dst, d50, d51, bias, params, dstC), dst += dD; - Save2(dst, d60, d61, bias, params, dstC), dst += dD; - Save2(dst, d70, d71, bias, params, dstC), dst += dD; - Save2(dst, d80, d81, bias, params, dstC), dst += dD; - Save2(dst, d90, d91, bias, params, dstC), dst += dD; - Save2(dst, da0, da1, bias, params, dstC), dst += dD; - Save2(dst, db0, db1, bias, params, dstC), dst += dD; - } - } - else - { - d00 = vdupq_n_f32(0.0f); - d10 = vdupq_n_f32(0.0f); - d20 = vdupq_n_f32(0.0f); - d30 = vdupq_n_f32(0.0f); - d40 = vdupq_n_f32(0.0f); - d50 = vdupq_n_f32(0.0f); - d60 = vdupq_n_f32(0.0f); - d70 = vdupq_n_f32(0.0f); - d80 = vdupq_n_f32(0.0f); - d90 = vdupq_n_f32(0.0f); - da0 = vdupq_n_f32(0.0f); - db0 = vdupq_n_f32(0.0f); - for (size_t off0 = 0, off6 = 6 * dS, offw = 0; off0 < srcC; ++off0, ++off6, offw += F) - { - w0 = Load(weight0 + offw); - s0 = vdupq_n_f32(src0[off0]), d00 = vmlaq_f32(d00, s0, w0); - s0 = vdupq_n_f32(src1[off0]), d10 = vmlaq_f32(d10, s0, w0); - s0 = vdupq_n_f32(src2[off0]), d20 = vmlaq_f32(d20, s0, w0); - s0 = vdupq_n_f32(src3[off0]), d30 = vmlaq_f32(d30, s0, w0); - s0 = vdupq_n_f32(src4[off0]), d40 = vmlaq_f32(d40, s0, w0); - s0 = vdupq_n_f32(src5[off0]), d50 = vmlaq_f32(d50, s0, w0); - s0 = vdupq_n_f32(src0[off6]), d60 = vmlaq_f32(d60, s0, w0); - s0 = vdupq_n_f32(src1[off6]), d70 = vmlaq_f32(d70, s0, w0); - s0 = vdupq_n_f32(src2[off6]), d80 = vmlaq_f32(d80, s0, w0); - s0 = vdupq_n_f32(src3[off6]), d90 = vmlaq_f32(d90, s0, w0); - s0 = vdupq_n_f32(src4[off6]), da0 = vmlaq_f32(da0, s0, w0); - s0 = vdupq_n_f32(src5[off6]), db0 = vmlaq_f32(db0, s0, w0); - } - if (dstC == F) - { - Save1(dst, d00, bias, params), dst += dD; - Save1(dst, d10, bias, params), dst += dD; - Save1(dst, d20, bias, params), dst += dD; - Save1(dst, d30, bias, params), dst += dD; - Save1(dst, d40, bias, params), dst += dD; - Save1(dst, d50, bias, params), dst += dD; - Save1(dst, d60, bias, params), dst += dD; - Save1(dst, d70, bias, params), dst += dD; - Save1(dst, d80, bias, params), dst += dD; - Save1(dst, d90, bias, params), dst += dD; - Save1(dst, da0, bias, params), dst += dD; - Save1(dst, db0, bias, params), dst += dD; - } - else - { - Save1(dst, d00, bias, params, dstC), dst += dD; - Save1(dst, d10, bias, params, dstC), dst += dD; - Save1(dst, d20, bias, params, dstC), dst += dD; - Save1(dst, d30, bias, params, dstC), dst += dD; - Save1(dst, d40, bias, params, dstC), dst += dD; - Save1(dst, d50, bias, params, dstC), dst += dD; - Save1(dst, d60, bias, params, dstC), dst += dD; - Save1(dst, d70, bias, params, dstC), dst += dD; - Save1(dst, d80, bias, params, dstC), dst += dD; - Save1(dst, d90, bias, params, dstC), dst += dD; - Save1(dst, da0, bias, params, dstC), dst += dD; - Save1(dst, db0, bias, params, dstC), dst += dD; - } - } - } - - template void ConvolutionNhwcDirect1x1_2xM(const float* src0, const ConvParam32f& p, - const AlgParam& a, size_t srcC, size_t dstC, const float* weight0, const float32x4_t* bias, const float32x4_t* params, float* dst) - { - float32x4_t d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, d60, d61, d70, d71, d80, d81, d90, d91, da0, da1, db0, db1, s0, w0, w1; - size_t dS = p.srcC, dD = p.dstC; - const float* 
weight1 = weight0 + a.stepW; - const float* src1 = src0 + 1 * dS; - const float* src2 = src0 + 2 * dS; - const float* src3 = src0 + 3 * dS; - const float* src4 = src0 + 4 * dS; - const float* src5 = src0 + 5 * dS; - if (dstC > F) - { - if (M > 0x0) d00 = vdupq_n_f32(0.0f), d01 = vdupq_n_f32(0.0f); - if (M > 0x1) d10 = vdupq_n_f32(0.0f), d11 = vdupq_n_f32(0.0f); - if (M > 0x2) d20 = vdupq_n_f32(0.0f), d21 = vdupq_n_f32(0.0f); - if (M > 0x3) d30 = vdupq_n_f32(0.0f), d31 = vdupq_n_f32(0.0f); - if (M > 0x4) d40 = vdupq_n_f32(0.0f), d41 = vdupq_n_f32(0.0f); - if (M > 0x5) d50 = vdupq_n_f32(0.0f), d51 = vdupq_n_f32(0.0f); - if (M > 0x6) d60 = vdupq_n_f32(0.0f), d61 = vdupq_n_f32(0.0f); - if (M > 0x7) d70 = vdupq_n_f32(0.0f), d71 = vdupq_n_f32(0.0f); - if (M > 0x8) d80 = vdupq_n_f32(0.0f), d81 = vdupq_n_f32(0.0f); - if (M > 0x9) d90 = vdupq_n_f32(0.0f), d91 = vdupq_n_f32(0.0f); - if (M > 0xa) da0 = vdupq_n_f32(0.0f), da1 = vdupq_n_f32(0.0f); - if (M > 0xb) db0 = vdupq_n_f32(0.0f), db1 = vdupq_n_f32(0.0f); - for (size_t off0 = 0, off6 = 6 * dS, offw = 0; off0 < srcC; ++off0, ++off6, offw += F) - { - w0 = Load(weight0 + offw); - w1 = Load(weight1 + offw); - if (M > 0x0) s0 = vdupq_n_f32(src0[off0]), d00 = vmlaq_f32(d00, s0, w0), d01 = vmlaq_f32(d01, s0, w1); - if (M > 0x1) s0 = vdupq_n_f32(src1[off0]), d10 = vmlaq_f32(d10, s0, w0), d11 = vmlaq_f32(d11, s0, w1); - if (M > 0x2) s0 = vdupq_n_f32(src2[off0]), d20 = vmlaq_f32(d20, s0, w0), d21 = vmlaq_f32(d21, s0, w1); - if (M > 0x3) s0 = vdupq_n_f32(src3[off0]), d30 = vmlaq_f32(d30, s0, w0), d31 = vmlaq_f32(d31, s0, w1); - if (M > 0x4) s0 = vdupq_n_f32(src4[off0]), d40 = vmlaq_f32(d40, s0, w0), d41 = vmlaq_f32(d41, s0, w1); - if (M > 0x5) s0 = vdupq_n_f32(src5[off0]), d50 = vmlaq_f32(d50, s0, w0), d51 = vmlaq_f32(d51, s0, w1); - if (M > 0x6) s0 = vdupq_n_f32(src0[off6]), d60 = vmlaq_f32(d60, s0, w0), d61 = vmlaq_f32(d61, s0, w1); - if (M > 0x7) s0 = vdupq_n_f32(src1[off6]), d70 = vmlaq_f32(d70, s0, w0), d71 = vmlaq_f32(d71, s0, w1); - if (M > 0x8) s0 = vdupq_n_f32(src2[off6]), d80 = vmlaq_f32(d80, s0, w0), d81 = vmlaq_f32(d81, s0, w1); - if (M > 0x9) s0 = vdupq_n_f32(src3[off6]), d90 = vmlaq_f32(d90, s0, w0), d91 = vmlaq_f32(d91, s0, w1); - if (M > 0xa) s0 = vdupq_n_f32(src4[off6]), da0 = vmlaq_f32(da0, s0, w0), da1 = vmlaq_f32(da1, s0, w1); - if (M > 0xb) s0 = vdupq_n_f32(src5[off6]), db0 = vmlaq_f32(db0, s0, w0), db1 = vmlaq_f32(db1, s0, w1); - } - if (dstC == DF) - { - if (M > 0x0) Save2(dst, d00, d01, bias, params), dst += dD; - if (M > 0x1) Save2(dst, d10, d11, bias, params), dst += dD; - if (M > 0x2) Save2(dst, d20, d21, bias, params), dst += dD; - if (M > 0x3) Save2(dst, d30, d31, bias, params), dst += dD; - if (M > 0x4) Save2(dst, d40, d41, bias, params), dst += dD; - if (M > 0x5) Save2(dst, d50, d51, bias, params), dst += dD; - if (M > 0x6) Save2(dst, d60, d61, bias, params), dst += dD; - if (M > 0x7) Save2(dst, d70, d71, bias, params), dst += dD; - if (M > 0x8) Save2(dst, d80, d81, bias, params), dst += dD; - if (M > 0x9) Save2(dst, d90, d91, bias, params), dst += dD; - if (M > 0xa) Save2(dst, da0, da1, bias, params), dst += dD; - if (M > 0xb) Save2(dst, db0, db1, bias, params), dst += dD; - } - else - { - dstC -= F; - if (M > 0x0) Save2(dst, d00, d01, bias, params, dstC), dst += dD; - if (M > 0x1) Save2(dst, d10, d11, bias, params, dstC), dst += dD; - if (M > 0x2) Save2(dst, d20, d21, bias, params, dstC), dst += dD; - if (M > 0x3) Save2(dst, d30, d31, bias, params, dstC), dst += dD; - if (M > 0x4) Save2(dst, d40, d41, bias, params, dstC), 
dst += dD; - if (M > 0x5) Save2(dst, d50, d51, bias, params, dstC), dst += dD; - if (M > 0x6) Save2(dst, d60, d61, bias, params, dstC), dst += dD; - if (M > 0x7) Save2(dst, d70, d71, bias, params, dstC), dst += dD; - if (M > 0x8) Save2(dst, d80, d81, bias, params, dstC), dst += dD; - if (M > 0x9) Save2(dst, d90, d91, bias, params, dstC), dst += dD; - if (M > 0xa) Save2(dst, da0, da1, bias, params, dstC), dst += dD; - if (M > 0xb) Save2(dst, db0, db1, bias, params, dstC), dst += dD; - } - } - else - { - if (M > 0x0) d00 = vdupq_n_f32(0.0f); - if (M > 0x1) d10 = vdupq_n_f32(0.0f); - if (M > 0x2) d20 = vdupq_n_f32(0.0f); - if (M > 0x3) d30 = vdupq_n_f32(0.0f); - if (M > 0x4) d40 = vdupq_n_f32(0.0f); - if (M > 0x5) d50 = vdupq_n_f32(0.0f); - if (M > 0x6) d60 = vdupq_n_f32(0.0f); - if (M > 0x7) d70 = vdupq_n_f32(0.0f); - if (M > 0x8) d80 = vdupq_n_f32(0.0f); - if (M > 0x9) d90 = vdupq_n_f32(0.0f); - if (M > 0xa) da0 = vdupq_n_f32(0.0f); - if (M > 0xb) db0 = vdupq_n_f32(0.0f); - for (size_t off0 = 0, off6 = 6 * dS, offw = 0; off0 < srcC; ++off0, ++off6, offw += F) - { - w0 = Load(weight0 + offw); - if (M > 0x0) s0 = vdupq_n_f32(src0[off0]), d00 = vmlaq_f32(d00, s0, w0); - if (M > 0x1) s0 = vdupq_n_f32(src1[off0]), d10 = vmlaq_f32(d10, s0, w0); - if (M > 0x2) s0 = vdupq_n_f32(src2[off0]), d20 = vmlaq_f32(d20, s0, w0); - if (M > 0x3) s0 = vdupq_n_f32(src3[off0]), d30 = vmlaq_f32(d30, s0, w0); - if (M > 0x4) s0 = vdupq_n_f32(src4[off0]), d40 = vmlaq_f32(d40, s0, w0); - if (M > 0x5) s0 = vdupq_n_f32(src5[off0]), d50 = vmlaq_f32(d50, s0, w0); - if (M > 0x6) s0 = vdupq_n_f32(src0[off6]), d60 = vmlaq_f32(d60, s0, w0); - if (M > 0x7) s0 = vdupq_n_f32(src1[off6]), d70 = vmlaq_f32(d70, s0, w0); - if (M > 0x8) s0 = vdupq_n_f32(src2[off6]), d80 = vmlaq_f32(d80, s0, w0); - if (M > 0x9) s0 = vdupq_n_f32(src3[off6]), d90 = vmlaq_f32(d90, s0, w0); - if (M > 0xa) s0 = vdupq_n_f32(src4[off6]), da0 = vmlaq_f32(da0, s0, w0); - if (M > 0xb) s0 = vdupq_n_f32(src5[off6]), db0 = vmlaq_f32(db0, s0, w0); - } - if (dstC == F) - { - if (M > 0x0) Save1(dst, d00, bias, params), dst += dD; - if (M > 0x1) Save1(dst, d10, bias, params), dst += dD; - if (M > 0x2) Save1(dst, d20, bias, params), dst += dD; - if (M > 0x3) Save1(dst, d30, bias, params), dst += dD; - if (M > 0x4) Save1(dst, d40, bias, params), dst += dD; - if (M > 0x5) Save1(dst, d50, bias, params), dst += dD; - if (M > 0x6) Save1(dst, d60, bias, params), dst += dD; - if (M > 0x7) Save1(dst, d70, bias, params), dst += dD; - if (M > 0x8) Save1(dst, d80, bias, params), dst += dD; - if (M > 0x9) Save1(dst, d90, bias, params), dst += dD; - if (M > 0xa) Save1(dst, da0, bias, params), dst += dD; - if (M > 0xb) Save1(dst, db0, bias, params), dst += dD; - } - else - { - if (M > 0x0) Save1(dst, d00, bias, params, dstC), dst += dD; - if (M > 0x1) Save1(dst, d10, bias, params, dstC), dst += dD; - if (M > 0x2) Save1(dst, d20, bias, params, dstC), dst += dD; - if (M > 0x3) Save1(dst, d30, bias, params, dstC), dst += dD; - if (M > 0x4) Save1(dst, d40, bias, params, dstC), dst += dD; - if (M > 0x5) Save1(dst, d50, bias, params, dstC), dst += dD; - if (M > 0x6) Save1(dst, d60, bias, params, dstC), dst += dD; - if (M > 0x7) Save1(dst, d70, bias, params, dstC), dst += dD; - if (M > 0x8) Save1(dst, d80, bias, params, dstC), dst += dD; - if (M > 0x9) Save1(dst, d90, bias, params, dstC), dst += dD; - if (M > 0xa) Save1(dst, da0, bias, params, dstC), dst += dD; - if (M > 0xb) Save1(dst, db0, bias, params, dstC), dst += dD; - } - } - } - - template ConvolutionNhwcDirect1x1_NxM_Ptr 
GetConvolutionNhwcDirect1x1_2xM(size_t M) - { - switch (M) - { - case 0: return NULL; - case 0x1: return ConvolutionNhwcDirect1x1_2xM; - case 0x2: return ConvolutionNhwcDirect1x1_2xM; - case 0x3: return ConvolutionNhwcDirect1x1_2xM; - case 0x4: return ConvolutionNhwcDirect1x1_2xM; - case 0x5: return ConvolutionNhwcDirect1x1_2xM; - case 0x6: return ConvolutionNhwcDirect1x1_2xM; - case 0x7: return ConvolutionNhwcDirect1x1_2xM; - case 0x8: return ConvolutionNhwcDirect1x1_2xM; - case 0x9: return ConvolutionNhwcDirect1x1_2xM; - case 0xa: return ConvolutionNhwcDirect1x1_2xM; - case 0xb: return ConvolutionNhwcDirect1x1_2xM; - } - assert(0); - return NULL; - } -#else - template void ConvolutionNhwcDirect1x1_2x6(const float* src0, const ConvParam32f& p, - const AlgParam& a, size_t srcC, size_t dstC, const float* weight0, const float32x4_t* bias, const float32x4_t* params, float* dst) - { - float32x4_t d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, s0, w0, w1; - size_t dS = p.srcC, dD = p.dstC; - const float* weight1 = weight0 + a.stepW; - const float* src1 = src0 + 1 * dS; - const float* src2 = src0 + 2 * dS; - const float* src3 = src0 + 3 * dS; - const float* src4 = src0 + 4 * dS; - const float* src5 = src0 + 5 * dS; - if (dstC > F) - { - d00 = vdupq_n_f32(0.0f), d01 = vdupq_n_f32(0.0f); - d10 = vdupq_n_f32(0.0f), d11 = vdupq_n_f32(0.0f); - d20 = vdupq_n_f32(0.0f), d21 = vdupq_n_f32(0.0f); - d30 = vdupq_n_f32(0.0f), d31 = vdupq_n_f32(0.0f); - d40 = vdupq_n_f32(0.0f), d41 = vdupq_n_f32(0.0f); - d50 = vdupq_n_f32(0.0f), d51 = vdupq_n_f32(0.0f); - for (size_t offs = 0, offw = 0; offs < srcC; ++offs, offw += F) - { - w0 = Load(weight0 + offw); - w1 = Load(weight1 + offw); - s0 = vdupq_n_f32(src0[offs]), d00 = vmlaq_f32(d00, s0, w0), d01 = vmlaq_f32(d01, s0, w1); - s0 = vdupq_n_f32(src1[offs]), d10 = vmlaq_f32(d10, s0, w0), d11 = vmlaq_f32(d11, s0, w1); - s0 = vdupq_n_f32(src2[offs]), d20 = vmlaq_f32(d20, s0, w0), d21 = vmlaq_f32(d21, s0, w1); - s0 = vdupq_n_f32(src3[offs]), d30 = vmlaq_f32(d30, s0, w0), d31 = vmlaq_f32(d31, s0, w1); - s0 = vdupq_n_f32(src4[offs]), d40 = vmlaq_f32(d40, s0, w0), d41 = vmlaq_f32(d41, s0, w1); - s0 = vdupq_n_f32(src5[offs]), d50 = vmlaq_f32(d50, s0, w0), d51 = vmlaq_f32(d51, s0, w1); - } - if (dstC == DF) - { - Save2(dst, d00, d01, bias, params), dst += dD; - Save2(dst, d10, d11, bias, params), dst += dD; - Save2(dst, d20, d21, bias, params), dst += dD; - Save2(dst, d30, d31, bias, params), dst += dD; - Save2(dst, d40, d41, bias, params), dst += dD; - Save2(dst, d50, d51, bias, params), dst += dD; - } - else - { - dstC -= F; - Save2(dst, d00, d01, bias, params, dstC), dst += dD; - Save2(dst, d10, d11, bias, params, dstC), dst += dD; - Save2(dst, d20, d21, bias, params, dstC), dst += dD; - Save2(dst, d30, d31, bias, params, dstC), dst += dD; - Save2(dst, d40, d41, bias, params, dstC), dst += dD; - Save2(dst, d50, d51, bias, params, dstC), dst += dD; - } - } - else - { - d00 = vdupq_n_f32(0.0f); - d10 = vdupq_n_f32(0.0f); - d20 = vdupq_n_f32(0.0f); - d30 = vdupq_n_f32(0.0f); - d40 = vdupq_n_f32(0.0f); - d50 = vdupq_n_f32(0.0f); - for (size_t offs = 0, offw = 0; offs < srcC; ++offs, offw += F) - { - w0 = Load(weight0 + offw); - s0 = vdupq_n_f32(src0[offs]), d00 = vmlaq_f32(d00, s0, w0); - s0 = vdupq_n_f32(src1[offs]), d10 = vmlaq_f32(d10, s0, w0); - s0 = vdupq_n_f32(src2[offs]), d20 = vmlaq_f32(d20, s0, w0); - s0 = vdupq_n_f32(src3[offs]), d30 = vmlaq_f32(d30, s0, w0); - s0 = vdupq_n_f32(src4[offs]), d40 = vmlaq_f32(d40, s0, w0); - s0 = vdupq_n_f32(src5[offs]), 
d50 = vmlaq_f32(d50, s0, w0); - } - if (dstC == F) - { - Save1(dst, d00, bias, params), dst += dD; - Save1(dst, d10, bias, params), dst += dD; - Save1(dst, d20, bias, params), dst += dD; - Save1(dst, d30, bias, params), dst += dD; - Save1(dst, d40, bias, params), dst += dD; - Save1(dst, d50, bias, params), dst += dD; - } - else - { - Save1(dst, d00, bias, params, dstC), dst += dD; - Save1(dst, d10, bias, params, dstC), dst += dD; - Save1(dst, d20, bias, params, dstC), dst += dD; - Save1(dst, d30, bias, params, dstC), dst += dD; - Save1(dst, d40, bias, params, dstC), dst += dD; - Save1(dst, d50, bias, params, dstC), dst += dD; - } - } - } - - template void ConvolutionNhwcDirect1x1_2xM(const float* src0, const ConvParam32f& p, - const AlgParam& a, size_t srcC, size_t dstC, const float* weight0, const float32x4_t* bias, const float32x4_t* params, float* dst) - { - float32x4_t d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, s0, w0, w1; - size_t dS = p.srcC, dD = p.dstC; - const float* weight1 = weight0 + a.stepW; - const float* src1 = src0 + 1 * dS; - const float* src2 = src0 + 2 * dS; - const float* src3 = src0 + 3 * dS; - const float* src4 = src0 + 4 * dS; - const float* src5 = src0 + 5 * dS; - if (dstC > F) - { - if (M > 0) d00 = vdupq_n_f32(0.0f), d01 = vdupq_n_f32(0.0f); - if (M > 1) d10 = vdupq_n_f32(0.0f), d11 = vdupq_n_f32(0.0f); - if (M > 2) d20 = vdupq_n_f32(0.0f), d21 = vdupq_n_f32(0.0f); - if (M > 3) d30 = vdupq_n_f32(0.0f), d31 = vdupq_n_f32(0.0f); - if (M > 4) d40 = vdupq_n_f32(0.0f), d41 = vdupq_n_f32(0.0f); - if (M > 5) d50 = vdupq_n_f32(0.0f), d51 = vdupq_n_f32(0.0f); - for (size_t offs = 0, offw = 0; offs < srcC; ++offs, offw += F) - { - w0 = Load(weight0 + offw); - w1 = Load(weight1 + offw); - if (M > 0) s0 = vdupq_n_f32(src0[offs]), d00 = vmlaq_f32(d00, s0, w0), d01 = vmlaq_f32(d01, s0, w1); - if (M > 1) s0 = vdupq_n_f32(src1[offs]), d10 = vmlaq_f32(d10, s0, w0), d11 = vmlaq_f32(d11, s0, w1); - if (M > 2) s0 = vdupq_n_f32(src2[offs]), d20 = vmlaq_f32(d20, s0, w0), d21 = vmlaq_f32(d21, s0, w1); - if (M > 3) s0 = vdupq_n_f32(src3[offs]), d30 = vmlaq_f32(d30, s0, w0), d31 = vmlaq_f32(d31, s0, w1); - if (M > 4) s0 = vdupq_n_f32(src4[offs]), d40 = vmlaq_f32(d40, s0, w0), d41 = vmlaq_f32(d41, s0, w1); - if (M > 5) s0 = vdupq_n_f32(src5[offs]), d50 = vmlaq_f32(d50, s0, w0), d51 = vmlaq_f32(d51, s0, w1); - } - if (dstC == DF) - { - if (M > 0) Save2(dst, d00, d01, bias, params), dst += dD; - if (M > 1) Save2(dst, d10, d11, bias, params), dst += dD; - if (M > 2) Save2(dst, d20, d21, bias, params), dst += dD; - if (M > 3) Save2(dst, d30, d31, bias, params), dst += dD; - if (M > 4) Save2(dst, d40, d41, bias, params), dst += dD; - if (M > 5) Save2(dst, d50, d51, bias, params), dst += dD; - } - else - { - dstC -= F; - if (M > 0) Save2(dst, d00, d01, bias, params, dstC), dst += dD; - if (M > 1) Save2(dst, d10, d11, bias, params, dstC), dst += dD; - if (M > 2) Save2(dst, d20, d21, bias, params, dstC), dst += dD; - if (M > 3) Save2(dst, d30, d31, bias, params, dstC), dst += dD; - if (M > 4) Save2(dst, d40, d41, bias, params, dstC), dst += dD; - if (M > 5) Save2(dst, d50, d51, bias, params, dstC), dst += dD; - } - } - else - { - if (M > 0) d00 = vdupq_n_f32(0.0f); - if (M > 1) d10 = vdupq_n_f32(0.0f); - if (M > 2) d20 = vdupq_n_f32(0.0f); - if (M > 3) d30 = vdupq_n_f32(0.0f); - if (M > 4) d40 = vdupq_n_f32(0.0f); - if (M > 5) d50 = vdupq_n_f32(0.0f); - for (size_t offs = 0, offw = 0; offs < srcC; ++offs, offw += F) - { - w0 = Load(weight0 + offw); - if (M > 0) s0 = 
vdupq_n_f32(src0[offs]), d00 = vmlaq_f32(d00, s0, w0); - if (M > 1) s0 = vdupq_n_f32(src1[offs]), d10 = vmlaq_f32(d10, s0, w0); - if (M > 2) s0 = vdupq_n_f32(src2[offs]), d20 = vmlaq_f32(d20, s0, w0); - if (M > 3) s0 = vdupq_n_f32(src3[offs]), d30 = vmlaq_f32(d30, s0, w0); - if (M > 4) s0 = vdupq_n_f32(src4[offs]), d40 = vmlaq_f32(d40, s0, w0); - if (M > 5) s0 = vdupq_n_f32(src5[offs]), d50 = vmlaq_f32(d50, s0, w0); - } - if (dstC == F) - { - if (M > 0) Save1(dst, d00, bias, params), dst += dD; - if (M > 1) Save1(dst, d10, bias, params), dst += dD; - if (M > 2) Save1(dst, d20, bias, params), dst += dD; - if (M > 3) Save1(dst, d30, bias, params), dst += dD; - if (M > 4) Save1(dst, d40, bias, params), dst += dD; - if (M > 5) Save1(dst, d50, bias, params), dst += dD; - } - else - { - if (M > 0) Save1(dst, d00, bias, params, dstC), dst += dD; - if (M > 1) Save1(dst, d10, bias, params, dstC), dst += dD; - if (M > 2) Save1(dst, d20, bias, params, dstC), dst += dD; - if (M > 3) Save1(dst, d30, bias, params, dstC), dst += dD; - if (M > 4) Save1(dst, d40, bias, params, dstC), dst += dD; - if (M > 5) Save1(dst, d50, bias, params, dstC), dst += dD; - } - } - } - - template ConvolutionNhwcDirect1x1_NxM_Ptr GetConvolutionNhwcDirect1x1_2xM(size_t M) - { - switch (M) - { - case 0: return NULL; - case 1: return ConvolutionNhwcDirect1x1_2xM; - case 2: return ConvolutionNhwcDirect1x1_2xM; - case 3: return ConvolutionNhwcDirect1x1_2xM; - case 4: return ConvolutionNhwcDirect1x1_2xM; - case 5: return ConvolutionNhwcDirect1x1_2xM; - } - assert(0); - return NULL; - } -#endif - - template void ConvolutionNhwcDirect1x1_2(const float* src, const ConvParam32f& p, const AlgParam& a, - size_t dstC, size_t yBeg, size_t yEnd, size_t srcC, const float* weight, const float* bias, const float* params, float* dst) - { -#if defined(SIMD_ARM64_ENABLE) - size_t n = 12, n1 = (yEnd - yBeg) * p.dstW, nn = AlignLoAny(n1, n), m = n1 - nn; - ConvolutionNhwcDirect1x1_NxM_Ptr convolutionNhwcDirect1x1_2xN = ConvolutionNhwcDirect1x1_2x12; -#else - size_t n = 6, n1 = (yEnd - yBeg) * p.dstW, nn = AlignLoAny(n1, n), m = n1 - nn; - ConvolutionNhwcDirect1x1_NxM_Ptr convolutionNhwcDirect1x1_2xN = ConvolutionNhwcDirect1x1_2x6; -#endif - ConvolutionNhwcDirect1x1_NxM_Ptr convolutionNhwcDirect1x1_2xM = GetConvolutionNhwcDirect1x1_2xM(m); - - float32x4_t _params[2], _bias[2]; - _params[0] = vdupq_n_f32(params[0]); - if (type == ::SimdConvolutionActivationRestrictRange || type == ::SimdConvolutionActivationHswish) - _params[1] = vdupq_n_f32(params[1]); - - for (size_t dc = 0; dc < dstC; dc += a.microD) - { - size_t dC = Simd::Min(a.microD, dstC - dc); - if (dC > 0 * F) _bias[0] = Load(bias + dc + 0 * F); - if (dC > 1 * F) _bias[1] = Load(bias + dc + 1 * F); - if (type == ::SimdConvolutionActivationPrelu) - { - if (dC > 0 * F) _params[0] = Load(params + dc + 0 * F); - if (dC > 1 * F) _params[1] = Load(params + dc + 1 * F); - } - const float* ps = src + yBeg * p.srcW * p.srcC; - float* pd = dst + dc + yBeg * p.dstW * p.dstC; - size_t i = 0; - for (; i < nn; i += n, ps += n * p.srcC, pd += n * p.dstC) - convolutionNhwcDirect1x1_2xN(ps, p, a, srcC, dC, weight, _bias, _params, pd); - for (; i < n1; i += m, ps += m * p.srcC, pd += m * p.dstC) - convolutionNhwcDirect1x1_2xM(ps, p, a, srcC, dC, weight, _bias, _params, pd); - weight += p.srcC * a.microD; - } - } - - //--------------------------------------------------------------------- - - template static SIMD_INLINE void Set(const ConvParam32f& p, AlgParam& a) - { - a.convolutions[term] = p.Is1x1() ? 
ConvolutionNhwcDirect1x1_2<term, type> : ConvolutionNhwcDirect_2<term, type>;
-        }
-
-        template<SimdConvolutionActivationType type> static SIMD_INLINE void Set(const ConvParam32f& p, AlgParam& a)
-        {
-            Set(p, a);
-            Set(p, a);
-            Set(p, a);
-            Set(p, a);
-        }
-
-        bool SynetConvolution32fNhwcDirect::Set2r(const ConvParam32f& p, AlgParam& a)
-        {
-            assert(a.microD == 2 * F);
-            switch (p.activation)
-            {
-            case SimdConvolutionActivationIdentity: Set<SimdConvolutionActivationIdentity>(p, a); break;
-            case SimdConvolutionActivationRelu: Set<SimdConvolutionActivationRelu>(p, a); break;
-            case SimdConvolutionActivationLeakyRelu: Set<SimdConvolutionActivationLeakyRelu>(p, a); break;
-            case SimdConvolutionActivationRestrictRange: Set<SimdConvolutionActivationRestrictRange>(p, a); break;
-            case SimdConvolutionActivationPrelu: Set<SimdConvolutionActivationPrelu>(p, a); break;
-            case SimdConvolutionActivationElu: Set<SimdConvolutionActivationElu>(p, a); break;
-            case SimdConvolutionActivationHswish: Set<SimdConvolutionActivationHswish>(p, a); break;
-            default: assert(0);
-            }
-            return true;
-        }
-    }
-#endif//SIMD_NEON_ENABLE
-}
diff --git a/src/3rd/Simd/Simd/SimdNeonSynetConvolution32fNhwcDirect3r.cpp b/src/3rd/Simd/Simd/SimdNeonSynetConvolution32fNhwcDirect3r.cpp
deleted file mode 100644
index 8c2b5233..00000000
--- a/src/3rd/Simd/Simd/SimdNeonSynetConvolution32fNhwcDirect3r.cpp
+++ /dev/null
@@ -1,1562 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2020 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/ -#include "Simd/SimdSynetConvolution32f.h" -#include "Simd/SimdSynetConvolution32fCommon.h" -#include "Simd/SimdCpu.h" - -namespace Simd -{ -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - using AlgParam = SynetConvolution32fNhwcDirect::AlgParam; - - typedef void(*ConvolutionNhwcDirect_NxM_Ptr)(const float* src0, const ConvParam32f& p, const AlgParam& a, size_t dy, size_t dx, size_t srcC, size_t dstC, const float* weight0, const float32x4_t* bias, const float32x4_t* params, float* dst); - typedef void(*ConvolutionNhwcDirect1x1_NxM_Ptr)(const float* src0, const ConvParam32f& p, const AlgParam& a, size_t srcC, size_t dstC, const float* weight0, const float32x4_t* bias, const float32x4_t* params, float* dst); - - //--------------------------------------------------------------------- - - template void ConvolutionNhwcDirect_3x1(const float* src0, const ConvParam32f& p, - const AlgParam& a, size_t dy, size_t dx, size_t srcC, size_t dstC, const float* weight0, const float32x4_t* bias, const float32x4_t* params, float* dst) - { - float32x4_t d00, d01, d02, s0, w0, w1, w2; - size_t srcH = p.srcH, srcW = p.srcW, dilY = p.dilationY, dilX = p.dilationX; - size_t dY = p.srcW * p.srcC, dX = p.srcC, dS = p.srcC * p.strideX, dW = p.srcC * F; - size_t sy = dy * p.strideY - p.padY, sx = dx * p.strideX - p.padX; - size_t kY = p.kernelY * p.dilationY, kX = p.kernelX * p.dilationX; - const float* weight1 = weight0 + a.stepW; - const float* weight2 = weight1 + a.stepW; - if (dstC > 2 * F) - { - d00 = vdupq_n_f32(0.0f), d01 = vdupq_n_f32(0.0f), d02 = vdupq_n_f32(0.0f); - for (size_t ky = 0; ky < kY; ky += dilY) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - if (sy + ky < srcH && sx + kx < srcW) - { - size_t offs = beg + kx * dX, end = offs + srcC, offw = 0; - for (; offs < end; ++offs, offw += F) - { - w0 = Load(weight0 + offw); - w1 = Load(weight1 + offw); - w2 = Load(weight2 + offw); - s0 = vdupq_n_f32(src0[offs]), d00 = vmlaq_f32(d00, s0, w0), d01 = vmlaq_f32(d01, s0, w1), d02 = vmlaq_f32(d02, s0, w2); - } - } - weight0 += dW, weight1 += dW, weight2 += dW; - } - } - if (dstC == 3 * F) - Save3(dst, d00, d01, d02, bias, params); - else - Save3(dst, d00, d01, d02, bias, params, dstC - 2 * F); - } - else if (dstC > F) - { - d00 = vdupq_n_f32(0.0f), d01 = vdupq_n_f32(0.0f); - for (size_t ky = 0; ky < kY; ky += dilY) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - if (sy + ky < srcH && sx + kx < srcW) - { - size_t offs = beg + kx * dX, end = offs + srcC, offw = 0; - for (; offs < end; ++offs, offw += F) - { - w0 = Load(weight0 + offw); - w1 = Load(weight1 + offw); - s0 = vdupq_n_f32(src0[offs]), d00 = vmlaq_f32(d00, s0, w0), d01 = vmlaq_f32(d01, s0, w1); - } - } - weight0 += dW, weight1 += dW; - } - } - if (dstC == 2 * F) - Save2(dst, d00, d01, bias, params); - else - Save2(dst, d00, d01, bias, params, dstC - F); - } - else - { - d00 = vdupq_n_f32(0.0f); - for (size_t ky = 0; ky < kY; ky += dilY) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - if (sy + ky < srcH && sx + kx < srcW) - { - size_t offs = beg + kx * dX, end = offs + srcC, offw = 0; - for (; offs < end; ++offs, offw += F) - { - w0 = Load(weight0 + offw); - s0 = vdupq_n_f32(src0[offs]), d00 = vmlaq_f32(d00, s0, w0); - } - } - weight0 += dW; - } - } - if (dstC == F) - Save1(dst, d00, bias, params); - else - Save1(dst, d00, bias, params, dstC); - } - } - -#if defined(SIMD_ARM64_ENABLE) - template void 
ConvolutionNhwcDirect_3x8(const float* src0, const ConvParam32f& p, - const AlgParam& a, size_t dy, size_t dx, size_t srcC, size_t dstC, const float* weight0, const float32x4_t* bias, const float32x4_t* params, float* dst) - { - float32x4_t d00, d01, d02, d10, d11, d12, d20, d21, d22, d30, d31, d32, d40, d41, d42, d50, d51, d52, d60, d61, d62, d70, d71, d72, s0, w0, w1, w2; - size_t srcH = p.srcH, srcW = p.srcW, dilY = p.dilationY, dilX = p.dilationX; - size_t dY = p.srcW * p.srcC, dX = p.srcC, dS = p.srcC * p.strideX, dW = p.srcC * F, dWz = p.kernelX * p.srcC * F, dD = p.dstC; - size_t sy = dy * p.strideY - p.padY, sx = dx * p.strideX - p.padX; - size_t kY = p.kernelY * p.dilationY, kX = p.kernelX * p.dilationX; - const float* weight1 = weight0 + a.stepW; - const float* weight2 = weight1 + a.stepW; - const float* src1 = src0 + 1 * dS; - const float* src2 = src0 + 2 * dS; - const float* src3 = src0 + 3 * dS; - if (dstC > 2 * F) - { - d00 = vdupq_n_f32(0.0f), d01 = vdupq_n_f32(0.0f), d02 = vdupq_n_f32(0.0f); - d10 = vdupq_n_f32(0.0f), d11 = vdupq_n_f32(0.0f), d12 = vdupq_n_f32(0.0f); - d20 = vdupq_n_f32(0.0f), d21 = vdupq_n_f32(0.0f), d22 = vdupq_n_f32(0.0f); - d30 = vdupq_n_f32(0.0f), d31 = vdupq_n_f32(0.0f), d32 = vdupq_n_f32(0.0f); - d40 = vdupq_n_f32(0.0f), d41 = vdupq_n_f32(0.0f), d42 = vdupq_n_f32(0.0f); - d50 = vdupq_n_f32(0.0f), d51 = vdupq_n_f32(0.0f), d52 = vdupq_n_f32(0.0f); - d60 = vdupq_n_f32(0.0f), d61 = vdupq_n_f32(0.0f), d62 = vdupq_n_f32(0.0f); - d70 = vdupq_n_f32(0.0f), d71 = vdupq_n_f32(0.0f), d72 = vdupq_n_f32(0.0f); - for (size_t ky = 0; ky < kY; ky += dilY) - { - if (sy + ky < srcH) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - assert(sx + kx < srcW && sx + kx + 8 <= srcW); - size_t off0 = beg + kx * dX, end = off0 + srcC, off4 = off0 + 4 * dS, offw = 0; - for (; off0 < end; ++off0, ++off4, offw += F) - { - w0 = Load(weight0 + offw); - w1 = Load(weight1 + offw); - w2 = Load(weight2 + offw); - s0 = vdupq_n_f32(src0[off0]), d00 = vmlaq_f32(d00, s0, w0), d01 = vmlaq_f32(d01, s0, w1), d02 = vmlaq_f32(d02, s0, w2); - s0 = vdupq_n_f32(src1[off0]), d10 = vmlaq_f32(d10, s0, w0), d11 = vmlaq_f32(d11, s0, w1), d12 = vmlaq_f32(d12, s0, w2); - s0 = vdupq_n_f32(src2[off0]), d20 = vmlaq_f32(d20, s0, w0), d21 = vmlaq_f32(d21, s0, w1), d22 = vmlaq_f32(d22, s0, w2); - s0 = vdupq_n_f32(src3[off0]), d30 = vmlaq_f32(d30, s0, w0), d31 = vmlaq_f32(d31, s0, w1), d32 = vmlaq_f32(d32, s0, w2); - s0 = vdupq_n_f32(src0[off4]), d40 = vmlaq_f32(d40, s0, w0), d41 = vmlaq_f32(d41, s0, w1), d42 = vmlaq_f32(d42, s0, w2); - s0 = vdupq_n_f32(src1[off4]), d50 = vmlaq_f32(d50, s0, w0), d51 = vmlaq_f32(d51, s0, w1), d52 = vmlaq_f32(d52, s0, w2); - s0 = vdupq_n_f32(src2[off4]), d60 = vmlaq_f32(d60, s0, w0), d61 = vmlaq_f32(d61, s0, w1), d62 = vmlaq_f32(d62, s0, w2); - s0 = vdupq_n_f32(src3[off4]), d70 = vmlaq_f32(d70, s0, w0), d71 = vmlaq_f32(d71, s0, w1), d72 = vmlaq_f32(d72, s0, w2); - } - weight0 += dW, weight1 += dW, weight2 += dW; - } - } - else - weight0 += dWz, weight1 += dWz, weight2 += dWz; - } - if (dstC == 3 * F) - { - Save3(dst, d00, d01, d02, bias, params), dst += dD; - Save3(dst, d10, d11, d12, bias, params), dst += dD; - Save3(dst, d20, d21, d22, bias, params), dst += dD; - Save3(dst, d30, d31, d32, bias, params), dst += dD; - Save3(dst, d40, d41, d42, bias, params), dst += dD; - Save3(dst, d50, d51, d52, bias, params), dst += dD; - Save3(dst, d60, d61, d62, bias, params), dst += dD; - Save3(dst, d70, d71, d72, bias, params), dst += dD; - } - 
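Padding never touches memory in these kernels: a kernel row that falls outside the image (the unsigned test sy + ky < srcH also rejects the wrapped-around negative padding offsets) is skipped entirely, and the weight pointers are advanced by dWz so they stay in phase, which is arithmetically identical to convolving over zero padding. A scalar sketch of that scheme, with illustrative names:

    #include <stddef.h>

    // Direct convolution of one output pixel over a single channel; sy/sx may
    // be negative because of padding, and the cast to size_t makes the bounds
    // test reject negative indices the same way the kernels above do.
    static float ConvPixel(const float* src, size_t srcH, size_t srcW,
                           const float* weight, size_t kY, size_t kX, ptrdiff_t sy, ptrdiff_t sx)
    {
        float d = 0.0f;
        for (size_t ky = 0; ky < kY; ++ky, weight += kX)      // weight steps past skipped rows, like dWz
        {
            if ((size_t)(sy + (ptrdiff_t)ky) >= srcH)
                continue;                                     // out-of-bounds row: zero padding contributes nothing
            for (size_t kx = 0; kx < kX; ++kx)
                if ((size_t)(sx + (ptrdiff_t)kx) < srcW)
                    d += src[(size_t)(sy + (ptrdiff_t)ky) * srcW + (size_t)(sx + (ptrdiff_t)kx)] * weight[kx];
        }
        return d;
    }

In the vector code the per-column test is hoisted out entirely (note the assert on sx + kx): the driver routes edge pixels to the one-pixel _3x1 variant, so the wide kernels only ever see fully in-bounds columns.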
else - { - dstC -= 2 * F; - Save3(dst, d00, d01, d02, bias, params, dstC), dst += dD; - Save3(dst, d10, d11, d12, bias, params, dstC), dst += dD; - Save3(dst, d20, d21, d22, bias, params, dstC), dst += dD; - Save3(dst, d30, d31, d32, bias, params, dstC), dst += dD; - Save3(dst, d40, d41, d42, bias, params, dstC), dst += dD; - Save3(dst, d50, d51, d52, bias, params, dstC), dst += dD; - Save3(dst, d60, d61, d62, bias, params, dstC), dst += dD; - Save3(dst, d70, d71, d72, bias, params, dstC), dst += dD; - } - } - else if (dstC > F) - { - d00 = vdupq_n_f32(0.0f), d01 = vdupq_n_f32(0.0f); - d10 = vdupq_n_f32(0.0f), d11 = vdupq_n_f32(0.0f); - d20 = vdupq_n_f32(0.0f), d21 = vdupq_n_f32(0.0f); - d30 = vdupq_n_f32(0.0f), d31 = vdupq_n_f32(0.0f); - d40 = vdupq_n_f32(0.0f), d41 = vdupq_n_f32(0.0f); - d50 = vdupq_n_f32(0.0f), d51 = vdupq_n_f32(0.0f); - d60 = vdupq_n_f32(0.0f), d61 = vdupq_n_f32(0.0f); - d70 = vdupq_n_f32(0.0f), d71 = vdupq_n_f32(0.0f); - for (size_t ky = 0; ky < kY; ky += dilY) - { - if (sy + ky < srcH) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - assert(sx + kx < srcW && sx + kx + 8 <= srcW); - size_t off0 = beg + kx * dX, end = off0 + srcC, off4 = off0 + 4 * dS, offw = 0; - for (; off0 < end; ++off0, ++off4, offw += F) - { - w0 = Load(weight0 + offw); - w1 = Load(weight1 + offw); - w2 = Load(weight2 + offw); - s0 = vdupq_n_f32(src0[off0]), d00 = vmlaq_f32(d00, s0, w0), d01 = vmlaq_f32(d01, s0, w1); - s0 = vdupq_n_f32(src1[off0]), d10 = vmlaq_f32(d10, s0, w0), d11 = vmlaq_f32(d11, s0, w1); - s0 = vdupq_n_f32(src2[off0]), d20 = vmlaq_f32(d20, s0, w0), d21 = vmlaq_f32(d21, s0, w1); - s0 = vdupq_n_f32(src3[off0]), d30 = vmlaq_f32(d30, s0, w0), d31 = vmlaq_f32(d31, s0, w1); - s0 = vdupq_n_f32(src0[off4]), d40 = vmlaq_f32(d40, s0, w0), d41 = vmlaq_f32(d41, s0, w1); - s0 = vdupq_n_f32(src1[off4]), d50 = vmlaq_f32(d50, s0, w0), d51 = vmlaq_f32(d51, s0, w1); - s0 = vdupq_n_f32(src2[off4]), d60 = vmlaq_f32(d60, s0, w0), d61 = vmlaq_f32(d61, s0, w1); - s0 = vdupq_n_f32(src3[off4]), d70 = vmlaq_f32(d70, s0, w0), d71 = vmlaq_f32(d71, s0, w1); - } - weight0 += dW, weight1 += dW; - } - } - else - weight0 += dWz, weight1 += dWz; - } - if (dstC == DF) - { - Save2(dst, d00, d01, bias, params), dst += dD; - Save2(dst, d10, d11, bias, params), dst += dD; - Save2(dst, d20, d21, bias, params), dst += dD; - Save2(dst, d30, d31, bias, params), dst += dD; - Save2(dst, d40, d41, bias, params), dst += dD; - Save2(dst, d50, d51, bias, params), dst += dD; - Save2(dst, d60, d61, bias, params), dst += dD; - Save2(dst, d70, d71, bias, params), dst += dD; - } - else - { - dstC -= F; - Save2(dst, d00, d01, bias, params, dstC), dst += dD; - Save2(dst, d10, d11, bias, params, dstC), dst += dD; - Save2(dst, d20, d21, bias, params, dstC), dst += dD; - Save2(dst, d30, d31, bias, params, dstC), dst += dD; - Save2(dst, d40, d41, bias, params, dstC), dst += dD; - Save2(dst, d50, d51, bias, params, dstC), dst += dD; - Save2(dst, d60, d61, bias, params, dstC), dst += dD; - Save2(dst, d70, d71, bias, params, dstC), dst += dD; - } - } - else - { - d00 = vdupq_n_f32(0.0f); - d10 = vdupq_n_f32(0.0f); - d20 = vdupq_n_f32(0.0f); - d30 = vdupq_n_f32(0.0f); - d40 = vdupq_n_f32(0.0f); - d50 = vdupq_n_f32(0.0f); - d60 = vdupq_n_f32(0.0f); - d70 = vdupq_n_f32(0.0f); - for (size_t ky = 0; ky < kY; ky += dilY) - { - if (sy + ky < srcH) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - assert(sx + kx < srcW && sx + kx + 8 <= srcW); - size_t off0 = 
beg + kx * dX, end = off0 + srcC, off4 = off0 + 4 * dS, offw = 0; - for (; off0 < end; ++off0, ++off4, offw += F) - { - w0 = Load(weight0 + offw); - s0 = vdupq_n_f32(src0[off0]), d00 = vmlaq_f32(d00, s0, w0); - s0 = vdupq_n_f32(src1[off0]), d10 = vmlaq_f32(d10, s0, w0); - s0 = vdupq_n_f32(src2[off0]), d20 = vmlaq_f32(d20, s0, w0); - s0 = vdupq_n_f32(src3[off0]), d30 = vmlaq_f32(d30, s0, w0); - s0 = vdupq_n_f32(src0[off4]), d40 = vmlaq_f32(d40, s0, w0); - s0 = vdupq_n_f32(src1[off4]), d50 = vmlaq_f32(d50, s0, w0); - s0 = vdupq_n_f32(src2[off4]), d60 = vmlaq_f32(d60, s0, w0); - s0 = vdupq_n_f32(src3[off4]), d70 = vmlaq_f32(d70, s0, w0); - } - weight0 += dW; - } - } - else - weight0 += dWz; - } - if (dstC == F) - { - Save1(dst, d00, bias, params), dst += dD; - Save1(dst, d10, bias, params), dst += dD; - Save1(dst, d20, bias, params), dst += dD; - Save1(dst, d30, bias, params), dst += dD; - Save1(dst, d40, bias, params), dst += dD; - Save1(dst, d50, bias, params), dst += dD; - Save1(dst, d60, bias, params), dst += dD; - Save1(dst, d70, bias, params), dst += dD; - } - else - { - Save1(dst, d00, bias, params, dstC), dst += dD; - Save1(dst, d10, bias, params, dstC), dst += dD; - Save1(dst, d20, bias, params, dstC), dst += dD; - Save1(dst, d30, bias, params, dstC), dst += dD; - Save1(dst, d40, bias, params, dstC), dst += dD; - Save1(dst, d50, bias, params, dstC), dst += dD; - Save1(dst, d60, bias, params, dstC), dst += dD; - Save1(dst, d70, bias, params, dstC), dst += dD; - } - } - } - - template void ConvolutionNhwcDirect_3xM(const float* src0, const ConvParam32f& p, - const AlgParam& a, size_t dy, size_t dx, size_t srcC, size_t dstC, const float* weight0, const float32x4_t* bias, const float32x4_t* params, float* dst) - { - float32x4_t d00, d01, d02, d10, d11, d12, d20, d21, d22, d30, d31, d32, d40, d41, d42, d50, d51, d52, d60, d61, d62, d70, d71, d72, s0, w0, w1, w2; - size_t srcH = p.srcH, srcW = p.srcW, dilY = p.dilationY, dilX = p.dilationX; - size_t dY = p.srcW * p.srcC, dX = p.srcC, dS = p.srcC * p.strideX, dW = p.srcC * F, dWz = p.kernelX * p.srcC * F, dD = p.dstC; - size_t sy = dy * p.strideY - p.padY, sx = dx * p.strideX - p.padX; - size_t kY = p.kernelY * p.dilationY, kX = p.kernelX * p.dilationX; - const float* weight1 = weight0 + a.stepW; - const float* weight2 = weight1 + a.stepW; - const float* src1 = src0 + 1 * dS; - const float* src2 = src0 + 2 * dS; - const float* src3 = src0 + 3 * dS; - if (dstC > 2 * F) - { - if (M > 0) d00 = vdupq_n_f32(0.0f), d01 = vdupq_n_f32(0.0f), d02 = vdupq_n_f32(0.0f); - if (M > 1) d10 = vdupq_n_f32(0.0f), d11 = vdupq_n_f32(0.0f), d12 = vdupq_n_f32(0.0f); - if (M > 2) d20 = vdupq_n_f32(0.0f), d21 = vdupq_n_f32(0.0f), d22 = vdupq_n_f32(0.0f); - if (M > 3) d30 = vdupq_n_f32(0.0f), d31 = vdupq_n_f32(0.0f), d32 = vdupq_n_f32(0.0f); - if (M > 4) d40 = vdupq_n_f32(0.0f), d41 = vdupq_n_f32(0.0f), d42 = vdupq_n_f32(0.0f); - if (M > 5) d50 = vdupq_n_f32(0.0f), d51 = vdupq_n_f32(0.0f), d52 = vdupq_n_f32(0.0f); - if (M > 6) d60 = vdupq_n_f32(0.0f), d61 = vdupq_n_f32(0.0f), d62 = vdupq_n_f32(0.0f); - if (M > 7) d70 = vdupq_n_f32(0.0f), d71 = vdupq_n_f32(0.0f), d72 = vdupq_n_f32(0.0f); - for (size_t ky = 0; ky < kY; ky += dilY) - { - if (sy + ky < srcH) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - assert(sx + kx < srcW && sx + kx + M <= srcW); - size_t off0 = beg + kx * dX, end = off0 + srcC, off4 = off0 + 4 * dS, offw = 0; - for (; off0 < end; ++off0, ++off4, offw += F) - { - w0 = Load(weight0 + offw); - w1 = 
Load(weight1 + offw); - w2 = Load(weight2 + offw); - if (M > 0) s0 = vdupq_n_f32(src0[off0]), d00 = vmlaq_f32(d00, s0, w0), d01 = vmlaq_f32(d01, s0, w1), d02 = vmlaq_f32(d02, s0, w2); - if (M > 1) s0 = vdupq_n_f32(src1[off0]), d10 = vmlaq_f32(d10, s0, w0), d11 = vmlaq_f32(d11, s0, w1), d12 = vmlaq_f32(d12, s0, w2); - if (M > 2) s0 = vdupq_n_f32(src2[off0]), d20 = vmlaq_f32(d20, s0, w0), d21 = vmlaq_f32(d21, s0, w1), d22 = vmlaq_f32(d22, s0, w2); - if (M > 3) s0 = vdupq_n_f32(src3[off0]), d30 = vmlaq_f32(d30, s0, w0), d31 = vmlaq_f32(d31, s0, w1), d32 = vmlaq_f32(d32, s0, w2); - if (M > 4) s0 = vdupq_n_f32(src0[off4]), d40 = vmlaq_f32(d40, s0, w0), d41 = vmlaq_f32(d41, s0, w1), d42 = vmlaq_f32(d42, s0, w2); - if (M > 5) s0 = vdupq_n_f32(src1[off4]), d50 = vmlaq_f32(d50, s0, w0), d51 = vmlaq_f32(d51, s0, w1), d52 = vmlaq_f32(d52, s0, w2); - if (M > 6) s0 = vdupq_n_f32(src2[off4]), d60 = vmlaq_f32(d60, s0, w0), d61 = vmlaq_f32(d61, s0, w1), d62 = vmlaq_f32(d62, s0, w2); - if (M > 7) s0 = vdupq_n_f32(src3[off4]), d70 = vmlaq_f32(d70, s0, w0), d71 = vmlaq_f32(d71, s0, w1), d72 = vmlaq_f32(d72, s0, w2); - } - weight0 += dW, weight1 += dW, weight2 += dW; - } - } - else - weight0 += dWz, weight1 += dWz, weight2 += dWz; - } - if (dstC == 3 * F) - { - if (M > 0) Save3(dst, d00, d01, d02, bias, params), dst += dD; - if (M > 1) Save3(dst, d10, d11, d12, bias, params), dst += dD; - if (M > 2) Save3(dst, d20, d21, d22, bias, params), dst += dD; - if (M > 3) Save3(dst, d30, d31, d32, bias, params), dst += dD; - if (M > 4) Save3(dst, d40, d41, d42, bias, params), dst += dD; - if (M > 5) Save3(dst, d50, d51, d52, bias, params), dst += dD; - if (M > 6) Save3(dst, d60, d61, d62, bias, params), dst += dD; - if (M > 7) Save3(dst, d70, d71, d72, bias, params), dst += dD; - } - else - { - dstC -= 2 * F; - if (M > 0) Save3(dst, d00, d01, d02, bias, params, dstC), dst += dD; - if (M > 1) Save3(dst, d10, d11, d12, bias, params, dstC), dst += dD; - if (M > 2) Save3(dst, d20, d21, d22, bias, params, dstC), dst += dD; - if (M > 3) Save3(dst, d30, d31, d32, bias, params, dstC), dst += dD; - if (M > 4) Save3(dst, d40, d41, d42, bias, params, dstC), dst += dD; - if (M > 5) Save3(dst, d50, d51, d52, bias, params, dstC), dst += dD; - if (M > 6) Save3(dst, d60, d61, d62, bias, params, dstC), dst += dD; - if (M > 7) Save3(dst, d70, d71, d72, bias, params, dstC), dst += dD; - } - } - else if (dstC > F) - { - if (M > 0) d00 = vdupq_n_f32(0.0f), d01 = vdupq_n_f32(0.0f); - if (M > 1) d10 = vdupq_n_f32(0.0f), d11 = vdupq_n_f32(0.0f); - if (M > 2) d20 = vdupq_n_f32(0.0f), d21 = vdupq_n_f32(0.0f); - if (M > 3) d30 = vdupq_n_f32(0.0f), d31 = vdupq_n_f32(0.0f); - if (M > 4) d40 = vdupq_n_f32(0.0f), d41 = vdupq_n_f32(0.0f); - if (M > 5) d50 = vdupq_n_f32(0.0f), d51 = vdupq_n_f32(0.0f); - if (M > 6) d60 = vdupq_n_f32(0.0f), d61 = vdupq_n_f32(0.0f); - if (M > 7) d70 = vdupq_n_f32(0.0f), d71 = vdupq_n_f32(0.0f); - for (size_t ky = 0; ky < kY; ky += dilY) - { - if (sy + ky < srcH) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - assert(sx + kx < srcW && sx + kx + M <= srcW); - size_t off0 = beg + kx * dX, end = off0 + srcC, off4 = off0 + 4 * dS, offw = 0; - for (; off0 < end; ++off0, ++off4, offw += F) - { - w0 = Load(weight0 + offw); - w1 = Load(weight1 + offw); - w2 = Load(weight2 + offw); - if (M > 0) s0 = vdupq_n_f32(src0[off0]), d00 = vmlaq_f32(d00, s0, w0), d01 = vmlaq_f32(d01, s0, w1); - if (M > 1) s0 = vdupq_n_f32(src1[off0]), d10 = vmlaq_f32(d10, s0, w0), d11 = vmlaq_f32(d11, s0, w1); 
- if (M > 2) s0 = vdupq_n_f32(src2[off0]), d20 = vmlaq_f32(d20, s0, w0), d21 = vmlaq_f32(d21, s0, w1); - if (M > 3) s0 = vdupq_n_f32(src3[off0]), d30 = vmlaq_f32(d30, s0, w0), d31 = vmlaq_f32(d31, s0, w1); - if (M > 4) s0 = vdupq_n_f32(src0[off4]), d40 = vmlaq_f32(d40, s0, w0), d41 = vmlaq_f32(d41, s0, w1); - if (M > 5) s0 = vdupq_n_f32(src1[off4]), d50 = vmlaq_f32(d50, s0, w0), d51 = vmlaq_f32(d51, s0, w1); - if (M > 6) s0 = vdupq_n_f32(src2[off4]), d60 = vmlaq_f32(d60, s0, w0), d61 = vmlaq_f32(d61, s0, w1); - if (M > 7) s0 = vdupq_n_f32(src3[off4]), d70 = vmlaq_f32(d70, s0, w0), d71 = vmlaq_f32(d71, s0, w1); - } - weight0 += dW, weight1 += dW; - } - } - else - weight0 += dWz, weight1 += dWz; - } - if (dstC == DF) - { - if (M > 0) Save2(dst, d00, d01, bias, params), dst += dD; - if (M > 1) Save2(dst, d10, d11, bias, params), dst += dD; - if (M > 2) Save2(dst, d20, d21, bias, params), dst += dD; - if (M > 3) Save2(dst, d30, d31, bias, params), dst += dD; - if (M > 4) Save2(dst, d40, d41, bias, params), dst += dD; - if (M > 5) Save2(dst, d50, d51, bias, params), dst += dD; - if (M > 6) Save2(dst, d60, d61, bias, params), dst += dD; - if (M > 7) Save2(dst, d70, d71, bias, params), dst += dD; - } - else - { - dstC -= F; - if (M > 0) Save2(dst, d00, d01, bias, params, dstC), dst += dD; - if (M > 1) Save2(dst, d10, d11, bias, params, dstC), dst += dD; - if (M > 2) Save2(dst, d20, d21, bias, params, dstC), dst += dD; - if (M > 3) Save2(dst, d30, d31, bias, params, dstC), dst += dD; - if (M > 4) Save2(dst, d40, d41, bias, params, dstC), dst += dD; - if (M > 5) Save2(dst, d50, d51, bias, params, dstC), dst += dD; - if (M > 6) Save2(dst, d60, d61, bias, params, dstC), dst += dD; - if (M > 7) Save2(dst, d70, d71, bias, params, dstC), dst += dD; - } - } - else - { - if (M > 0) d00 = vdupq_n_f32(0.0f); - if (M > 1) d10 = vdupq_n_f32(0.0f); - if (M > 2) d20 = vdupq_n_f32(0.0f); - if (M > 3) d30 = vdupq_n_f32(0.0f); - if (M > 4) d40 = vdupq_n_f32(0.0f); - if (M > 5) d50 = vdupq_n_f32(0.0f); - if (M > 6) d60 = vdupq_n_f32(0.0f); - if (M > 7) d70 = vdupq_n_f32(0.0f); - for (size_t ky = 0; ky < kY; ky += dilY) - { - if (sy + ky < srcH) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - assert(sx + kx < srcW && sx + kx + M <= srcW); - size_t off0 = beg + kx * dX, end = off0 + srcC, off4 = off0 + 4 * dS, offw = 0; - for (; off0 < end; ++off0, ++off4, offw += F) - { - w0 = Load(weight0 + offw); - if (M > 0) s0 = vdupq_n_f32(src0[off0]), d00 = vmlaq_f32(d00, s0, w0); - if (M > 1) s0 = vdupq_n_f32(src1[off0]), d10 = vmlaq_f32(d10, s0, w0); - if (M > 2) s0 = vdupq_n_f32(src2[off0]), d20 = vmlaq_f32(d20, s0, w0); - if (M > 3) s0 = vdupq_n_f32(src3[off0]), d30 = vmlaq_f32(d30, s0, w0); - if (M > 4) s0 = vdupq_n_f32(src0[off4]), d40 = vmlaq_f32(d40, s0, w0); - if (M > 5) s0 = vdupq_n_f32(src1[off4]), d50 = vmlaq_f32(d50, s0, w0); - if (M > 6) s0 = vdupq_n_f32(src2[off4]), d60 = vmlaq_f32(d60, s0, w0); - if (M > 7) s0 = vdupq_n_f32(src3[off4]), d70 = vmlaq_f32(d70, s0, w0); - } - weight0 += dW; - } - } - else - weight0 += dWz; - } - if (dstC == F) - { - if (M > 0) Save1(dst, d00, bias, params), dst += dD; - if (M > 1) Save1(dst, d10, bias, params), dst += dD; - if (M > 2) Save1(dst, d20, bias, params), dst += dD; - if (M > 3) Save1(dst, d30, bias, params), dst += dD; - if (M > 4) Save1(dst, d40, bias, params), dst += dD; - if (M > 5) Save1(dst, d50, bias, params), dst += dD; - if (M > 6) Save1(dst, d60, bias, params), dst += dD; - if (M > 7) Save1(dst, d70, bias, params), 
dst += dD; - } - else - { - if (M > 0) Save1(dst, d00, bias, params, dstC), dst += dD; - if (M > 1) Save1(dst, d10, bias, params, dstC), dst += dD; - if (M > 2) Save1(dst, d20, bias, params, dstC), dst += dD; - if (M > 3) Save1(dst, d30, bias, params, dstC), dst += dD; - if (M > 4) Save1(dst, d40, bias, params, dstC), dst += dD; - if (M > 5) Save1(dst, d50, bias, params, dstC), dst += dD; - if (M > 6) Save1(dst, d60, bias, params, dstC), dst += dD; - if (M > 7) Save1(dst, d70, bias, params, dstC), dst += dD; - } - } - } - - template<TermType term, SimdConvolutionActivationType type> ConvolutionNhwcDirect_NxM_Ptr GetConvolutionNhwcDirect_3xM(size_t M) - { - switch (M) - { - case 0x0: return NULL; - case 0x1: return ConvolutionNhwcDirect_3xM<term, type, 0x1>; - case 0x2: return ConvolutionNhwcDirect_3xM<term, type, 0x2>; - case 0x3: return ConvolutionNhwcDirect_3xM<term, type, 0x3>; - case 0x4: return ConvolutionNhwcDirect_3xM<term, type, 0x4>; - case 0x5: return ConvolutionNhwcDirect_3xM<term, type, 0x5>; - case 0x6: return ConvolutionNhwcDirect_3xM<term, type, 0x6>; - case 0x7: return ConvolutionNhwcDirect_3xM<term, type, 0x7>; - } - assert(0); - return NULL; - } -#else - template<TermType term, SimdConvolutionActivationType type> void ConvolutionNhwcDirect_3x4(const float* src0, const ConvParam32f& p, - const AlgParam& a, size_t dy, size_t dx, size_t srcC, size_t dstC, const float* weight0, const float32x4_t* bias, const float32x4_t* params, float* dst) - { - float32x4_t d00, d01, d02, d10, d11, d12, d20, d21, d22, d30, d31, d32, s0, w0, w1, w2; - size_t srcH = p.srcH, srcW = p.srcW, dilY = p.dilationY, dilX = p.dilationX; - size_t dY = p.srcW * p.srcC, dX = p.srcC, dS = p.srcC * p.strideX, dW = p.srcC * F, dWz = p.kernelX * p.srcC * F, dD = p.dstC; - size_t sy = dy * p.strideY - p.padY, sx = dx * p.strideX - p.padX; - size_t kY = p.kernelY * p.dilationY, kX = p.kernelX * p.dilationX; - const float* weight1 = weight0 + a.stepW; - const float* weight2 = weight1 + a.stepW; - const float* src1 = src0 + 1 * dS; - const float* src2 = src0 + 2 * dS; - const float* src3 = src0 + 3 * dS; - if (dstC > 2 * F) - { - d00 = vdupq_n_f32(0.0f), d01 = vdupq_n_f32(0.0f), d02 = vdupq_n_f32(0.0f); - d10 = vdupq_n_f32(0.0f), d11 = vdupq_n_f32(0.0f), d12 = vdupq_n_f32(0.0f); - d20 = vdupq_n_f32(0.0f), d21 = vdupq_n_f32(0.0f), d22 = vdupq_n_f32(0.0f); - d30 = vdupq_n_f32(0.0f), d31 = vdupq_n_f32(0.0f), d32 = vdupq_n_f32(0.0f); - for (size_t ky = 0; ky < kY; ky += dilY) - { - if (sy + ky < srcH) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - assert(sx + kx < srcW && sx + kx + 4 <= srcW); - size_t offs = beg + kx * dX, end = offs + srcC, offw = 0; - for (; offs < end; ++offs, offw += F) - { - w0 = Load(weight0 + offw); - w1 = Load(weight1 + offw); - w2 = Load(weight2 + offw); - s0 = vdupq_n_f32(src0[offs]), d00 = vmlaq_f32(d00, s0, w0), d01 = vmlaq_f32(d01, s0, w1), d02 = vmlaq_f32(d02, s0, w2); - s0 = vdupq_n_f32(src1[offs]), d10 = vmlaq_f32(d10, s0, w0), d11 = vmlaq_f32(d11, s0, w1), d12 = vmlaq_f32(d12, s0, w2); - s0 = vdupq_n_f32(src2[offs]), d20 = vmlaq_f32(d20, s0, w0), d21 = vmlaq_f32(d21, s0, w1), d22 = vmlaq_f32(d22, s0, w2); - s0 = vdupq_n_f32(src3[offs]), d30 = vmlaq_f32(d30, s0, w0), d31 = vmlaq_f32(d31, s0, w1), d32 = vmlaq_f32(d32, s0, w2); - } - weight0 += dW, weight1 += dW, weight2 += dW; - } - } - else - weight0 += dWz, weight1 += dWz, weight2 += dWz; - } - if (dstC == 3 * F) - { - Save3(dst, d00, d01, d02, bias, params), dst += dD; - Save3(dst, d10, d11, d12, bias, params), dst += dD; - Save3(dst, d20, d21, d22, bias, params), dst += dD; - Save3(dst, d30, d31, d32, bias, params), dst += dD; - } - else - { - dstC -= 2 * F; - Save3(dst, d00, d01, d02, bias, params,
dstC), dst += dD; - Save3(dst, d10, d11, d12, bias, params, dstC), dst += dD; - Save3(dst, d20, d21, d22, bias, params, dstC), dst += dD; - Save3(dst, d30, d31, d32, bias, params, dstC), dst += dD; - } - } - else if (dstC > F) - { - d00 = vdupq_n_f32(0.0f), d01 = vdupq_n_f32(0.0f); - d10 = vdupq_n_f32(0.0f), d11 = vdupq_n_f32(0.0f); - d20 = vdupq_n_f32(0.0f), d21 = vdupq_n_f32(0.0f); - d30 = vdupq_n_f32(0.0f), d31 = vdupq_n_f32(0.0f); - for (size_t ky = 0; ky < kY; ky += dilY) - { - if (sy + ky < srcH) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - assert(sx + kx < srcW && sx + kx + 4 <= srcW); - size_t offs = beg + kx * dX, end = offs + srcC, offw = 0; - for (; offs < end; ++offs, offw += F) - { - w0 = Load(weight0 + offw); - w1 = Load(weight1 + offw); - s0 = vdupq_n_f32(src0[offs]), d00 = vmlaq_f32(d00, s0, w0), d01 = vmlaq_f32(d01, s0, w1); - s0 = vdupq_n_f32(src1[offs]), d10 = vmlaq_f32(d10, s0, w0), d11 = vmlaq_f32(d11, s0, w1); - s0 = vdupq_n_f32(src2[offs]), d20 = vmlaq_f32(d20, s0, w0), d21 = vmlaq_f32(d21, s0, w1); - s0 = vdupq_n_f32(src3[offs]), d30 = vmlaq_f32(d30, s0, w0), d31 = vmlaq_f32(d31, s0, w1); - } - weight0 += dW, weight1 += dW; - } - } - else - weight0 += dWz, weight1 += dWz; - } - if (dstC == 2 * F) - { - Save2(dst, d00, d01, bias, params), dst += dD; - Save2(dst, d10, d11, bias, params), dst += dD; - Save2(dst, d20, d21, bias, params), dst += dD; - Save2(dst, d30, d31, bias, params), dst += dD; - } - else - { - dstC -= 1 * F; - Save2(dst, d00, d01, bias, params, dstC), dst += dD; - Save2(dst, d10, d11, bias, params, dstC), dst += dD; - Save2(dst, d20, d21, bias, params, dstC), dst += dD; - Save2(dst, d30, d31, bias, params, dstC), dst += dD; - } - } - else - { - d00 = vdupq_n_f32(0.0f); - d10 = vdupq_n_f32(0.0f); - d20 = vdupq_n_f32(0.0f); - d30 = vdupq_n_f32(0.0f); - for (size_t ky = 0; ky < kY; ky += dilY) - { - if (sy + ky < srcH) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - assert(sx + kx < srcW && sx + kx + 4 <= srcW); - size_t offs = beg + kx * dX, end = offs + srcC, offw = 0; - for (; offs < end; ++offs, offw += F) - { - w0 = Load(weight0 + offw); - s0 = vdupq_n_f32(src0[offs]), d00 = vmlaq_f32(d00, s0, w0); - s0 = vdupq_n_f32(src1[offs]), d10 = vmlaq_f32(d10, s0, w0); - s0 = vdupq_n_f32(src2[offs]), d20 = vmlaq_f32(d20, s0, w0); - s0 = vdupq_n_f32(src3[offs]), d30 = vmlaq_f32(d30, s0, w0); - } - weight0 += dW; - } - } - else - weight0 += dWz; - } - if (dstC == F) - { - Save1(dst, d00, bias, params), dst += dD; - Save1(dst, d10, bias, params), dst += dD; - Save1(dst, d20, bias, params), dst += dD; - Save1(dst, d30, bias, params), dst += dD; - } - else - { - Save1(dst, d00, bias, params, dstC), dst += dD; - Save1(dst, d10, bias, params, dstC), dst += dD; - Save1(dst, d20, bias, params, dstC), dst += dD; - Save1(dst, d30, bias, params, dstC), dst += dD; - } - } - } - - template void ConvolutionNhwcDirect_3xM(const float* src0, const ConvParam32f& p, - const AlgParam& a, size_t dy, size_t dx, size_t srcC, size_t dstC, const float* weight0, const float32x4_t* bias, const float32x4_t* params, float* dst) - { - float32x4_t d00, d01, d02, d10, d11, d12, d20, d21, d22, d30, d31, d32, s0, w0, w1, w2; - size_t srcH = p.srcH, srcW = p.srcW, dilY = p.dilationY, dilX = p.dilationX; - size_t dY = p.srcW * p.srcC, dX = p.srcC, dS = p.srcC * p.strideX, dW = p.srcC * F, dWz = p.kernelX * p.srcC * F, dD = p.dstC; - size_t sy = dy * p.strideY - p.padY, sx = dx * p.strideX - p.padX; - 
size_t kY = p.kernelY * p.dilationY, kX = p.kernelX * p.dilationX; - const float* weight1 = weight0 + a.stepW; - const float* weight2 = weight1 + a.stepW; - const float* src1 = src0 + 1 * dS; - const float* src2 = src0 + 2 * dS; - const float* src3 = src0 + 3 * dS; - if (dstC > 2 * F) - { - if (M > 0) d00 = vdupq_n_f32(0.0f), d01 = vdupq_n_f32(0.0f), d02 = vdupq_n_f32(0.0f); - if (M > 1) d10 = vdupq_n_f32(0.0f), d11 = vdupq_n_f32(0.0f), d12 = vdupq_n_f32(0.0f); - if (M > 2) d20 = vdupq_n_f32(0.0f), d21 = vdupq_n_f32(0.0f), d22 = vdupq_n_f32(0.0f); - if (M > 3) d30 = vdupq_n_f32(0.0f), d31 = vdupq_n_f32(0.0f), d32 = vdupq_n_f32(0.0f); - for (size_t ky = 0; ky < kY; ky += dilY) - { - if (sy + ky < srcH) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - assert(sx + kx < srcW && sx + kx + M <= srcW); - size_t offs = beg + kx * dX, end = offs + srcC, offw = 0; - for (; offs < end; ++offs, offw += F) - { - w0 = Load(weight0 + offw); - w1 = Load(weight1 + offw); - w2 = Load(weight2 + offw); - if (M > 0) s0 = vdupq_n_f32(src0[offs]), d00 = vmlaq_f32(d00, s0, w0), d01 = vmlaq_f32(d01, s0, w1), d02 = vmlaq_f32(d02, s0, w2); - if (M > 1) s0 = vdupq_n_f32(src1[offs]), d10 = vmlaq_f32(d10, s0, w0), d11 = vmlaq_f32(d11, s0, w1), d12 = vmlaq_f32(d12, s0, w2); - if (M > 2) s0 = vdupq_n_f32(src2[offs]), d20 = vmlaq_f32(d20, s0, w0), d21 = vmlaq_f32(d21, s0, w1), d22 = vmlaq_f32(d22, s0, w2); - if (M > 3) s0 = vdupq_n_f32(src3[offs]), d30 = vmlaq_f32(d30, s0, w0), d31 = vmlaq_f32(d31, s0, w1), d32 = vmlaq_f32(d32, s0, w2); - } - weight0 += dW, weight1 += dW, weight2 += dW; - } - } - else - weight0 += dWz, weight1 += dWz, weight2 += dWz; - } - if (dstC == 3 * F) - { - if (M > 0) Save3(dst, d00, d01, d02, bias, params), dst += dD; - if (M > 1) Save3(dst, d10, d11, d12, bias, params), dst += dD; - if (M > 2) Save3(dst, d20, d21, d22, bias, params), dst += dD; - if (M > 3) Save3(dst, d30, d31, d32, bias, params), dst += dD; - } - else - { - dstC -= 2 * F; - if (M > 0) Save3(dst, d00, d01, d02, bias, params, dstC), dst += dD; - if (M > 1) Save3(dst, d10, d11, d12, bias, params, dstC), dst += dD; - if (M > 2) Save3(dst, d20, d21, d22, bias, params, dstC), dst += dD; - if (M > 3) Save3(dst, d30, d31, d32, bias, params, dstC), dst += dD; - } - } - else if (dstC > F) - { - if (M > 0) d00 = vdupq_n_f32(0.0f), d01 = vdupq_n_f32(0.0f); - if (M > 1) d10 = vdupq_n_f32(0.0f), d11 = vdupq_n_f32(0.0f); - if (M > 2) d20 = vdupq_n_f32(0.0f), d21 = vdupq_n_f32(0.0f); - if (M > 3) d30 = vdupq_n_f32(0.0f), d31 = vdupq_n_f32(0.0f); - for (size_t ky = 0; ky < kY; ky += dilY) - { - if (sy + ky < srcH) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - assert(sx + kx < srcW && sx + kx + M <= srcW); - size_t offs = beg + kx * dX, end = offs + srcC, offw = 0; - for (; offs < end; ++offs, offw += F) - { - w0 = Load(weight0 + offw); - w1 = Load(weight1 + offw); - if (M > 0) s0 = vdupq_n_f32(src0[offs]), d00 = vmlaq_f32(d00, s0, w0), d01 = vmlaq_f32(d01, s0, w1); - if (M > 1) s0 = vdupq_n_f32(src1[offs]), d10 = vmlaq_f32(d10, s0, w0), d11 = vmlaq_f32(d11, s0, w1); - if (M > 2) s0 = vdupq_n_f32(src2[offs]), d20 = vmlaq_f32(d20, s0, w0), d21 = vmlaq_f32(d21, s0, w1); - if (M > 3) s0 = vdupq_n_f32(src3[offs]), d30 = vmlaq_f32(d30, s0, w0), d31 = vmlaq_f32(d31, s0, w1); - } - weight0 += dW, weight1 += dW; - } - } - else - weight0 += dWz, weight1 += dWz; - } - if (dstC == DF) - { - if (M > 0) Save2(dst, d00, d01, bias, params), dst += dD; - if (M > 1) 
Save2(dst, d10, d11, bias, params), dst += dD; - if (M > 2) Save2(dst, d20, d21, bias, params), dst += dD; - if (M > 3) Save2(dst, d30, d31, bias, params), dst += dD; - } - else - { - dstC -= F; - if (M > 0) Save2(dst, d00, d01, bias, params, dstC), dst += dD; - if (M > 1) Save2(dst, d10, d11, bias, params, dstC), dst += dD; - if (M > 2) Save2(dst, d20, d21, bias, params, dstC), dst += dD; - if (M > 3) Save2(dst, d30, d31, bias, params, dstC), dst += dD; - } - } - else - { - if (M > 0) d00 = vdupq_n_f32(0.0f); - if (M > 1) d10 = vdupq_n_f32(0.0f); - if (M > 2) d20 = vdupq_n_f32(0.0f); - if (M > 3) d30 = vdupq_n_f32(0.0f); - for (size_t ky = 0; ky < kY; ky += dilY) - { - if (sy + ky < srcH) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - assert(sx + kx < srcW && sx + kx + M <= srcW); - size_t offs = beg + kx * dX, end = offs + srcC, offw = 0; - for (; offs < end; ++offs, offw += F) - { - w0 = Load(weight0 + offw); - if (M > 0) s0 = vdupq_n_f32(src0[offs]), d00 = vmlaq_f32(d00, s0, w0); - if (M > 1) s0 = vdupq_n_f32(src1[offs]), d10 = vmlaq_f32(d10, s0, w0); - if (M > 2) s0 = vdupq_n_f32(src2[offs]), d20 = vmlaq_f32(d20, s0, w0); - if (M > 3) s0 = vdupq_n_f32(src3[offs]), d30 = vmlaq_f32(d30, s0, w0); - } - weight0 += dW; - } - } - else - weight0 += dWz; - } - if (dstC == F) - { - if (M > 0) Save1(dst, d00, bias, params), dst += dD; - if (M > 1) Save1(dst, d10, bias, params), dst += dD; - if (M > 2) Save1(dst, d20, bias, params), dst += dD; - if (M > 3) Save1(dst, d30, bias, params), dst += dD; - } - else - { - if (M > 0) Save1(dst, d00, bias, params, dstC), dst += dD; - if (M > 1) Save1(dst, d10, bias, params, dstC), dst += dD; - if (M > 2) Save1(dst, d20, bias, params, dstC), dst += dD; - if (M > 3) Save1(dst, d30, bias, params, dstC), dst += dD; - } - } - } - - template<TermType term, SimdConvolutionActivationType type> ConvolutionNhwcDirect_NxM_Ptr GetConvolutionNhwcDirect_3xM(size_t M) - { - switch (M) - { - case 0: return NULL; - case 1: return ConvolutionNhwcDirect_3xM<term, type, 1>; - case 2: return ConvolutionNhwcDirect_3xM<term, type, 2>; - case 3: return ConvolutionNhwcDirect_3xM<term, type, 3>; - } - assert(0); - return NULL; - } -#endif - - template<TermType term, SimdConvolutionActivationType type> void ConvolutionNhwcDirect_3(const float* src, const ConvParam32f& p, const AlgParam& a, - size_t dstC, size_t yBeg, size_t yEnd, size_t srcC, const float* weight, const float* bias, const float* params, float* dst) - { - size_t noseH = p.NoseH(), noseW = p.NoseW(), bodyH = p.BodyH(), bodyW = p.BodyW(); - ConvolutionNhwcDirect_NxM_Ptr convolutionNhwcDirect_3x1 = ConvolutionNhwcDirect_3x1<term, type>; -#if defined(SIMD_ARM64_ENABLE) - size_t n = 8, bodyWn = AlignLoAny(bodyW - noseW, n) + noseW, m = bodyW - bodyWn; - ConvolutionNhwcDirect_NxM_Ptr convolutionNhwcDirect_3xN = ConvolutionNhwcDirect_3x8<term, type>; -#else - size_t n = 4, bodyWn = AlignLoAny(bodyW - noseW, n) + noseW, m = bodyW - bodyWn; - ConvolutionNhwcDirect_NxM_Ptr convolutionNhwcDirect_3xN = ConvolutionNhwcDirect_3x4<term, type>; -#endif - ConvolutionNhwcDirect_NxM_Ptr convolutionNhwcDirect_3xM = GetConvolutionNhwcDirect_3xM<term, type>(m); - size_t tailH = p.dstH, tailW = p.dstW; - size_t kY = p.kernelY - noseH, kX = p.kernelX - noseW, kH = bodyH + p.kernelY - 1, kW = bodyW + p.kernelX - 1; - - float32x4_t _params[3], _bias[3]; - _params[0] = vdupq_n_f32(params[0]); - if (type == ::SimdConvolutionActivationRestrictRange || type == ::SimdConvolutionActivationHswish) - _params[1] = vdupq_n_f32(params[1]); - - for (size_t dc = 0; dc < dstC; dc += a.microD) - { - size_t dC = Simd::Min(a.microD, dstC - dc); - if (dC > 0 * F) _bias[0] = Load(bias + dc + 0 * F); - if
(dC > 1 * F) _bias[1] = Load(bias + dc + 1 * F); - if (dC > 2 * F) _bias[2] = Load(bias + dc + 2 * F); - if (type == ::SimdConvolutionActivationPrelu) - { - if (dC > 0 * F) _params[0] = Load(params + dc + 0 * F); - if (dC > 1 * F) _params[1] = Load(params + dc + 1 * F); - if (dC > 2 * F) _params[2] = Load(params + dc + 2 * F); - } - float* d = dst + dc + yBeg * p.dstW * p.dstC; - size_t dy = yBeg; - for (; dy < noseH && dy < yEnd; dy++) - { - size_t dx = 0; - for (; dx < noseW; dx++, d += p.dstC) - convolutionNhwcDirect_3x1(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d); - for (; dx < bodyWn; dx += n, d += p.dstC * n) - convolutionNhwcDirect_3xN(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d); - for (; dx < bodyW; dx += m, d += p.dstC * m) - convolutionNhwcDirect_3xM(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d); - for (; dx < tailW; dx++, d += p.dstC) - convolutionNhwcDirect_3x1(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d); - } - for (; dy < bodyH && dy < yEnd; dy++) - { - size_t dx = 0; - for (; dx < noseW; dx++, d += p.dstC) - convolutionNhwcDirect_3x1(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d); - for (; dx < bodyWn; dx += n, d += p.dstC * n) - convolutionNhwcDirect_3xN(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d); - for (; dx < bodyW; dx += m, d += p.dstC * m) - convolutionNhwcDirect_3xM(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d); - for (; dx < tailW; dx++, d += p.dstC) - convolutionNhwcDirect_3x1(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d); - } - for (; dy < tailH && dy < yEnd; dy++) - { - size_t dx = 0; - for (; dx < noseW; dx++, d += p.dstC) - convolutionNhwcDirect_3x1(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d); - for (; dx < bodyWn; dx += n, d += p.dstC * n) - convolutionNhwcDirect_3xN(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d); - for (; dx < bodyW; dx += m, d += p.dstC * m) - convolutionNhwcDirect_3xM(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d); - for (; dx < tailW; dx++, d += p.dstC) - convolutionNhwcDirect_3x1(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d); - } - weight += p.kernelY * p.kernelX * p.srcC * a.microD; - } - } - - //--------------------------------------------------------------------- - -#if defined(SIMD_ARM64_ENABLE) - template void ConvolutionNhwcDirect1x1_3x8(const float* src0, const ConvParam32f& p, - const AlgParam& a, size_t srcC, size_t dstC, const float* weight0, const float32x4_t* bias, const float32x4_t* params, float* dst) - { - float32x4_t d00, d01, d02, d10, d11, d12, d20, d21, d22, d30, d31, d32, d40, d41, d42, d50, d51, d52, d60, d61, d62, d70, d71, d72, s0, w0, w1, w2; - size_t dS = p.srcC, dD = p.dstC; - const float* weight1 = weight0 + a.stepW; - const float* weight2 = weight1 + a.stepW; - const float* src1 = src0 + 1 * dS; - const float* src2 = src0 + 2 * dS; - const float* src3 = src0 + 3 * dS; - if (dstC > 2 * F) - { - d00 = vdupq_n_f32(0.0f), d01 = vdupq_n_f32(0.0f), d02 = vdupq_n_f32(0.0f); - d10 = vdupq_n_f32(0.0f), d11 = vdupq_n_f32(0.0f), d12 = vdupq_n_f32(0.0f); - d20 = vdupq_n_f32(0.0f), d21 = vdupq_n_f32(0.0f), d22 = vdupq_n_f32(0.0f); - d30 = vdupq_n_f32(0.0f), d31 = vdupq_n_f32(0.0f), d32 = vdupq_n_f32(0.0f); - d40 = vdupq_n_f32(0.0f), d41 = vdupq_n_f32(0.0f), d42 = vdupq_n_f32(0.0f); - d50 = vdupq_n_f32(0.0f), d51 = vdupq_n_f32(0.0f), d52 = vdupq_n_f32(0.0f); - d60 = vdupq_n_f32(0.0f), d61 = vdupq_n_f32(0.0f), d62 = vdupq_n_f32(0.0f); - d70 = vdupq_n_f32(0.0f), d71 = vdupq_n_f32(0.0f), d72 = 
vdupq_n_f32(0.0f); - for (size_t off0 = 0, off4 = 4 * dS, offw = 0; off0 < srcC; ++off0, ++off4, offw += F) - { - w0 = Load(weight0 + offw); - w1 = Load(weight1 + offw); - w2 = Load(weight2 + offw); - s0 = vdupq_n_f32(src0[off0]), d00 = vmlaq_f32(d00, s0, w0), d01 = vmlaq_f32(d01, s0, w1), d02 = vmlaq_f32(d02, s0, w2); - s0 = vdupq_n_f32(src1[off0]), d10 = vmlaq_f32(d10, s0, w0), d11 = vmlaq_f32(d11, s0, w1), d12 = vmlaq_f32(d12, s0, w2); - s0 = vdupq_n_f32(src2[off0]), d20 = vmlaq_f32(d20, s0, w0), d21 = vmlaq_f32(d21, s0, w1), d22 = vmlaq_f32(d22, s0, w2); - s0 = vdupq_n_f32(src3[off0]), d30 = vmlaq_f32(d30, s0, w0), d31 = vmlaq_f32(d31, s0, w1), d32 = vmlaq_f32(d32, s0, w2); - s0 = vdupq_n_f32(src0[off4]), d40 = vmlaq_f32(d40, s0, w0), d41 = vmlaq_f32(d41, s0, w1), d42 = vmlaq_f32(d42, s0, w2); - s0 = vdupq_n_f32(src1[off4]), d50 = vmlaq_f32(d50, s0, w0), d51 = vmlaq_f32(d51, s0, w1), d52 = vmlaq_f32(d52, s0, w2); - s0 = vdupq_n_f32(src2[off4]), d60 = vmlaq_f32(d60, s0, w0), d61 = vmlaq_f32(d61, s0, w1), d62 = vmlaq_f32(d62, s0, w2); - s0 = vdupq_n_f32(src3[off4]), d70 = vmlaq_f32(d70, s0, w0), d71 = vmlaq_f32(d71, s0, w1), d72 = vmlaq_f32(d72, s0, w2); - } - if (dstC == 3 * F) - { - Save3(dst, d00, d01, d02, bias, params), dst += dD; - Save3(dst, d10, d11, d12, bias, params), dst += dD; - Save3(dst, d20, d21, d22, bias, params), dst += dD; - Save3(dst, d30, d31, d32, bias, params), dst += dD; - Save3(dst, d40, d41, d42, bias, params), dst += dD; - Save3(dst, d50, d51, d52, bias, params), dst += dD; - Save3(dst, d60, d61, d62, bias, params), dst += dD; - Save3(dst, d70, d71, d72, bias, params), dst += dD; - } - else - { - dstC -= 2 * F; - Save3(dst, d00, d01, d02, bias, params, dstC), dst += dD; - Save3(dst, d10, d11, d12, bias, params, dstC), dst += dD; - Save3(dst, d20, d21, d22, bias, params, dstC), dst += dD; - Save3(dst, d30, d31, d32, bias, params, dstC), dst += dD; - Save3(dst, d40, d41, d42, bias, params, dstC), dst += dD; - Save3(dst, d50, d51, d52, bias, params, dstC), dst += dD; - Save3(dst, d60, d61, d62, bias, params, dstC), dst += dD; - Save3(dst, d70, d71, d72, bias, params, dstC), dst += dD; - } - } - else if (dstC > F) - { - d00 = vdupq_n_f32(0.0f), d01 = vdupq_n_f32(0.0f); - d10 = vdupq_n_f32(0.0f), d11 = vdupq_n_f32(0.0f); - d20 = vdupq_n_f32(0.0f), d21 = vdupq_n_f32(0.0f); - d30 = vdupq_n_f32(0.0f), d31 = vdupq_n_f32(0.0f); - d40 = vdupq_n_f32(0.0f), d41 = vdupq_n_f32(0.0f); - d50 = vdupq_n_f32(0.0f), d51 = vdupq_n_f32(0.0f); - d60 = vdupq_n_f32(0.0f), d61 = vdupq_n_f32(0.0f); - d70 = vdupq_n_f32(0.0f), d71 = vdupq_n_f32(0.0f); - for (size_t off0 = 0, off4 = 4 * dS, offw = 0; off0 < srcC; ++off0, ++off4, offw += F) - { - w0 = Load(weight0 + offw); - w1 = Load(weight1 + offw); - s0 = vdupq_n_f32(src0[off0]), d00 = vmlaq_f32(d00, s0, w0), d01 = vmlaq_f32(d01, s0, w1); - s0 = vdupq_n_f32(src1[off0]), d10 = vmlaq_f32(d10, s0, w0), d11 = vmlaq_f32(d11, s0, w1); - s0 = vdupq_n_f32(src2[off0]), d20 = vmlaq_f32(d20, s0, w0), d21 = vmlaq_f32(d21, s0, w1); - s0 = vdupq_n_f32(src3[off0]), d30 = vmlaq_f32(d30, s0, w0), d31 = vmlaq_f32(d31, s0, w1); - s0 = vdupq_n_f32(src0[off4]), d40 = vmlaq_f32(d40, s0, w0), d41 = vmlaq_f32(d41, s0, w1); - s0 = vdupq_n_f32(src1[off4]), d50 = vmlaq_f32(d50, s0, w0), d51 = vmlaq_f32(d51, s0, w1); - s0 = vdupq_n_f32(src2[off4]), d60 = vmlaq_f32(d60, s0, w0), d61 = vmlaq_f32(d61, s0, w1); - s0 = vdupq_n_f32(src3[off4]), d70 = vmlaq_f32(d70, s0, w0), d71 = vmlaq_f32(d71, s0, w1); - } - if (dstC == DF) - { - Save2(dst, d00, d01, bias, params), dst += 
dD; - Save2(dst, d10, d11, bias, params), dst += dD; - Save2(dst, d20, d21, bias, params), dst += dD; - Save2(dst, d30, d31, bias, params), dst += dD; - Save2(dst, d40, d41, bias, params), dst += dD; - Save2(dst, d50, d51, bias, params), dst += dD; - Save2(dst, d60, d61, bias, params), dst += dD; - Save2(dst, d70, d71, bias, params), dst += dD; - } - else - { - dstC -= F; - Save2(dst, d00, d01, bias, params, dstC), dst += dD; - Save2(dst, d10, d11, bias, params, dstC), dst += dD; - Save2(dst, d20, d21, bias, params, dstC), dst += dD; - Save2(dst, d30, d31, bias, params, dstC), dst += dD; - Save2(dst, d40, d41, bias, params, dstC), dst += dD; - Save2(dst, d50, d51, bias, params, dstC), dst += dD; - Save2(dst, d60, d61, bias, params, dstC), dst += dD; - Save2(dst, d70, d71, bias, params, dstC), dst += dD; - } - } - else - { - d00 = vdupq_n_f32(0.0f); - d10 = vdupq_n_f32(0.0f); - d20 = vdupq_n_f32(0.0f); - d30 = vdupq_n_f32(0.0f); - d40 = vdupq_n_f32(0.0f); - d50 = vdupq_n_f32(0.0f); - d60 = vdupq_n_f32(0.0f); - d70 = vdupq_n_f32(0.0f); - for (size_t off0 = 0, off4 = 4 * dS, offw = 0; off0 < srcC; ++off0, ++off4, offw += F) - { - w0 = Load(weight0 + offw); - s0 = vdupq_n_f32(src0[off0]), d00 = vmlaq_f32(d00, s0, w0); - s0 = vdupq_n_f32(src1[off0]), d10 = vmlaq_f32(d10, s0, w0); - s0 = vdupq_n_f32(src2[off0]), d20 = vmlaq_f32(d20, s0, w0); - s0 = vdupq_n_f32(src3[off0]), d30 = vmlaq_f32(d30, s0, w0); - s0 = vdupq_n_f32(src0[off4]), d40 = vmlaq_f32(d40, s0, w0); - s0 = vdupq_n_f32(src1[off4]), d50 = vmlaq_f32(d50, s0, w0); - s0 = vdupq_n_f32(src2[off4]), d60 = vmlaq_f32(d60, s0, w0); - s0 = vdupq_n_f32(src3[off4]), d70 = vmlaq_f32(d70, s0, w0); - } - if (dstC == F) - { - Save1(dst, d00, bias, params), dst += dD; - Save1(dst, d10, bias, params), dst += dD; - Save1(dst, d20, bias, params), dst += dD; - Save1(dst, d30, bias, params), dst += dD; - Save1(dst, d40, bias, params), dst += dD; - Save1(dst, d50, bias, params), dst += dD; - Save1(dst, d60, bias, params), dst += dD; - Save1(dst, d70, bias, params), dst += dD; - } - else - { - Save1(dst, d00, bias, params, dstC), dst += dD; - Save1(dst, d10, bias, params, dstC), dst += dD; - Save1(dst, d20, bias, params, dstC), dst += dD; - Save1(dst, d30, bias, params, dstC), dst += dD; - Save1(dst, d40, bias, params, dstC), dst += dD; - Save1(dst, d50, bias, params, dstC), dst += dD; - Save1(dst, d60, bias, params, dstC), dst += dD; - Save1(dst, d70, bias, params, dstC), dst += dD; - } - } - } - - template void ConvolutionNhwcDirect1x1_3xM(const float* src0, const ConvParam32f& p, - const AlgParam& a, size_t srcC, size_t dstC, const float* weight0, const float32x4_t* bias, const float32x4_t* params, float* dst) - { - float32x4_t d00, d01, d02, d10, d11, d12, d20, d21, d22, d30, d31, d32, d40, d41, d42, d50, d51, d52, d60, d61, d62, d70, d71, d72, s0, w0, w1, w2; - size_t dS = p.srcC, dD = p.dstC; - const float* weight1 = weight0 + a.stepW; - const float* weight2 = weight1 + a.stepW; - const float* src1 = src0 + 1 * dS; - const float* src2 = src0 + 2 * dS; - const float* src3 = src0 + 3 * dS; - if (dstC > 2 * F) - { - if (M > 0) d00 = vdupq_n_f32(0.0f), d01 = vdupq_n_f32(0.0f), d02 = vdupq_n_f32(0.0f); - if (M > 1) d10 = vdupq_n_f32(0.0f), d11 = vdupq_n_f32(0.0f), d12 = vdupq_n_f32(0.0f); - if (M > 2) d20 = vdupq_n_f32(0.0f), d21 = vdupq_n_f32(0.0f), d22 = vdupq_n_f32(0.0f); - if (M > 3) d30 = vdupq_n_f32(0.0f), d31 = vdupq_n_f32(0.0f), d32 = vdupq_n_f32(0.0f); - if (M > 4) d40 = vdupq_n_f32(0.0f), d41 = vdupq_n_f32(0.0f), d42 = vdupq_n_f32(0.0f); - if (M 
> 5) d50 = vdupq_n_f32(0.0f), d51 = vdupq_n_f32(0.0f), d52 = vdupq_n_f32(0.0f); - if (M > 6) d60 = vdupq_n_f32(0.0f), d61 = vdupq_n_f32(0.0f), d62 = vdupq_n_f32(0.0f); - if (M > 7) d70 = vdupq_n_f32(0.0f), d71 = vdupq_n_f32(0.0f), d72 = vdupq_n_f32(0.0f); - for (size_t off0 = 0, off4 = 4 * dS, offw = 0; off0 < srcC; ++off0, ++off4, offw += F) - { - w0 = Load(weight0 + offw); - w1 = Load(weight1 + offw); - w2 = Load(weight2 + offw); - if (M > 0) s0 = vdupq_n_f32(src0[off0]), d00 = vmlaq_f32(d00, s0, w0), d01 = vmlaq_f32(d01, s0, w1), d02 = vmlaq_f32(d02, s0, w2); - if (M > 1) s0 = vdupq_n_f32(src1[off0]), d10 = vmlaq_f32(d10, s0, w0), d11 = vmlaq_f32(d11, s0, w1), d12 = vmlaq_f32(d12, s0, w2); - if (M > 2) s0 = vdupq_n_f32(src2[off0]), d20 = vmlaq_f32(d20, s0, w0), d21 = vmlaq_f32(d21, s0, w1), d22 = vmlaq_f32(d22, s0, w2); - if (M > 3) s0 = vdupq_n_f32(src3[off0]), d30 = vmlaq_f32(d30, s0, w0), d31 = vmlaq_f32(d31, s0, w1), d32 = vmlaq_f32(d32, s0, w2); - if (M > 4) s0 = vdupq_n_f32(src0[off4]), d40 = vmlaq_f32(d40, s0, w0), d41 = vmlaq_f32(d41, s0, w1), d42 = vmlaq_f32(d42, s0, w2); - if (M > 5) s0 = vdupq_n_f32(src1[off4]), d50 = vmlaq_f32(d50, s0, w0), d51 = vmlaq_f32(d51, s0, w1), d52 = vmlaq_f32(d52, s0, w2); - if (M > 6) s0 = vdupq_n_f32(src2[off4]), d60 = vmlaq_f32(d60, s0, w0), d61 = vmlaq_f32(d61, s0, w1), d62 = vmlaq_f32(d62, s0, w2); - if (M > 7) s0 = vdupq_n_f32(src3[off4]), d70 = vmlaq_f32(d70, s0, w0), d71 = vmlaq_f32(d71, s0, w1), d72 = vmlaq_f32(d72, s0, w2); - } - if (dstC == 3 * F) - { - if (M > 0) Save3(dst, d00, d01, d02, bias, params), dst += dD; - if (M > 1) Save3(dst, d10, d11, d12, bias, params), dst += dD; - if (M > 2) Save3(dst, d20, d21, d22, bias, params), dst += dD; - if (M > 3) Save3(dst, d30, d31, d32, bias, params), dst += dD; - if (M > 4) Save3(dst, d40, d41, d42, bias, params), dst += dD; - if (M > 5) Save3(dst, d50, d51, d52, bias, params), dst += dD; - if (M > 6) Save3(dst, d60, d61, d62, bias, params), dst += dD; - if (M > 7) Save3(dst, d70, d71, d72, bias, params), dst += dD; - } - else - { - dstC -= 2 * F; - if (M > 0) Save3(dst, d00, d01, d02, bias, params, dstC), dst += dD; - if (M > 1) Save3(dst, d10, d11, d12, bias, params, dstC), dst += dD; - if (M > 2) Save3(dst, d20, d21, d22, bias, params, dstC), dst += dD; - if (M > 3) Save3(dst, d30, d31, d32, bias, params, dstC), dst += dD; - if (M > 4) Save3(dst, d40, d41, d42, bias, params, dstC), dst += dD; - if (M > 5) Save3(dst, d50, d51, d52, bias, params, dstC), dst += dD; - if (M > 6) Save3(dst, d60, d61, d62, bias, params, dstC), dst += dD; - if (M > 7) Save3(dst, d70, d71, d72, bias, params, dstC), dst += dD; - } - } - else if (dstC > F) - { - if (M > 0) d00 = vdupq_n_f32(0.0f), d01 = vdupq_n_f32(0.0f); - if (M > 1) d10 = vdupq_n_f32(0.0f), d11 = vdupq_n_f32(0.0f); - if (M > 2) d20 = vdupq_n_f32(0.0f), d21 = vdupq_n_f32(0.0f); - if (M > 3) d30 = vdupq_n_f32(0.0f), d31 = vdupq_n_f32(0.0f); - if (M > 4) d40 = vdupq_n_f32(0.0f), d41 = vdupq_n_f32(0.0f); - if (M > 5) d50 = vdupq_n_f32(0.0f), d51 = vdupq_n_f32(0.0f); - if (M > 6) d60 = vdupq_n_f32(0.0f), d61 = vdupq_n_f32(0.0f); - if (M > 7) d70 = vdupq_n_f32(0.0f), d71 = vdupq_n_f32(0.0f); - for (size_t off0 = 0, off4 = 4 * dS, offw = 0; off0 < srcC; ++off0, ++off4, offw += F) - { - w0 = Load(weight0 + offw); - w1 = Load(weight1 + offw); - if (M > 0) s0 = vdupq_n_f32(src0[off0]), d00 = vmlaq_f32(d00, s0, w0), d01 = vmlaq_f32(d01, s0, w1); - if (M > 1) s0 = vdupq_n_f32(src1[off0]), d10 = vmlaq_f32(d10, s0, w0), d11 = vmlaq_f32(d11, s0, w1); - if (M > 
2) s0 = vdupq_n_f32(src2[off0]), d20 = vmlaq_f32(d20, s0, w0), d21 = vmlaq_f32(d21, s0, w1); - if (M > 3) s0 = vdupq_n_f32(src3[off0]), d30 = vmlaq_f32(d30, s0, w0), d31 = vmlaq_f32(d31, s0, w1); - if (M > 4) s0 = vdupq_n_f32(src0[off4]), d40 = vmlaq_f32(d40, s0, w0), d41 = vmlaq_f32(d41, s0, w1); - if (M > 5) s0 = vdupq_n_f32(src1[off4]), d50 = vmlaq_f32(d50, s0, w0), d51 = vmlaq_f32(d51, s0, w1); - if (M > 6) s0 = vdupq_n_f32(src2[off4]), d60 = vmlaq_f32(d60, s0, w0), d61 = vmlaq_f32(d61, s0, w1); - if (M > 7) s0 = vdupq_n_f32(src3[off4]), d70 = vmlaq_f32(d70, s0, w0), d71 = vmlaq_f32(d71, s0, w1); - } - if (dstC == DF) - { - if (M > 0) Save2(dst, d00, d01, bias, params), dst += dD; - if (M > 1) Save2(dst, d10, d11, bias, params), dst += dD; - if (M > 2) Save2(dst, d20, d21, bias, params), dst += dD; - if (M > 3) Save2(dst, d30, d31, bias, params), dst += dD; - if (M > 4) Save2(dst, d40, d41, bias, params), dst += dD; - if (M > 5) Save2(dst, d50, d51, bias, params), dst += dD; - if (M > 6) Save2(dst, d60, d61, bias, params), dst += dD; - if (M > 7) Save2(dst, d70, d71, bias, params), dst += dD; - } - else - { - dstC -= F; - if (M > 0) Save2(dst, d00, d01, bias, params, dstC), dst += dD; - if (M > 1) Save2(dst, d10, d11, bias, params, dstC), dst += dD; - if (M > 2) Save2(dst, d20, d21, bias, params, dstC), dst += dD; - if (M > 3) Save2(dst, d30, d31, bias, params, dstC), dst += dD; - if (M > 4) Save2(dst, d40, d41, bias, params, dstC), dst += dD; - if (M > 5) Save2(dst, d50, d51, bias, params, dstC), dst += dD; - if (M > 6) Save2(dst, d60, d61, bias, params, dstC), dst += dD; - if (M > 7) Save2(dst, d70, d71, bias, params, dstC), dst += dD; - } - } - else - { - if (M > 0) d00 = vdupq_n_f32(0.0f); - if (M > 1) d10 = vdupq_n_f32(0.0f); - if (M > 2) d20 = vdupq_n_f32(0.0f); - if (M > 3) d30 = vdupq_n_f32(0.0f); - if (M > 4) d40 = vdupq_n_f32(0.0f); - if (M > 5) d50 = vdupq_n_f32(0.0f); - if (M > 6) d60 = vdupq_n_f32(0.0f); - if (M > 7) d70 = vdupq_n_f32(0.0f); - for (size_t off0 = 0, off4 = 4 * dS, offw = 0; off0 < srcC; ++off0, ++off4, offw += F) - { - w0 = Load(weight0 + offw); - if (M > 0) s0 = vdupq_n_f32(src0[off0]), d00 = vmlaq_f32(d00, s0, w0); - if (M > 1) s0 = vdupq_n_f32(src1[off0]), d10 = vmlaq_f32(d10, s0, w0); - if (M > 2) s0 = vdupq_n_f32(src2[off0]), d20 = vmlaq_f32(d20, s0, w0); - if (M > 3) s0 = vdupq_n_f32(src3[off0]), d30 = vmlaq_f32(d30, s0, w0); - if (M > 4) s0 = vdupq_n_f32(src0[off4]), d40 = vmlaq_f32(d40, s0, w0); - if (M > 5) s0 = vdupq_n_f32(src1[off4]), d50 = vmlaq_f32(d50, s0, w0); - if (M > 6) s0 = vdupq_n_f32(src2[off4]), d60 = vmlaq_f32(d60, s0, w0); - if (M > 7) s0 = vdupq_n_f32(src3[off4]), d70 = vmlaq_f32(d70, s0, w0); - } - if (dstC == F) - { - if (M > 0) Save1(dst, d00, bias, params), dst += dD; - if (M > 1) Save1(dst, d10, bias, params), dst += dD; - if (M > 2) Save1(dst, d20, bias, params), dst += dD; - if (M > 3) Save1(dst, d30, bias, params), dst += dD; - if (M > 4) Save1(dst, d40, bias, params), dst += dD; - if (M > 5) Save1(dst, d50, bias, params), dst += dD; - if (M > 6) Save1(dst, d60, bias, params), dst += dD; - if (M > 7) Save1(dst, d70, bias, params), dst += dD; - } - else - { - if (M > 0) Save1(dst, d00, bias, params, dstC), dst += dD; - if (M > 1) Save1(dst, d10, bias, params, dstC), dst += dD; - if (M > 2) Save1(dst, d20, bias, params, dstC), dst += dD; - if (M > 3) Save1(dst, d30, bias, params, dstC), dst += dD; - if (M > 4) Save1(dst, d40, bias, params, dstC), dst += dD; - if (M > 5) Save1(dst, d50, bias, params, dstC), dst += dD; - if (M > 
6) Save1(dst, d60, bias, params, dstC), dst += dD; - if (M > 7) Save1(dst, d70, bias, params, dstC), dst += dD; - } - } - } - - template<TermType term, SimdConvolutionActivationType type> ConvolutionNhwcDirect1x1_NxM_Ptr GetConvolutionNhwcDirect1x1_3xM(size_t M) - { - switch (M) - { - case 0: return NULL; - case 1: return ConvolutionNhwcDirect1x1_3xM<term, type, 1>; - case 2: return ConvolutionNhwcDirect1x1_3xM<term, type, 2>; - case 3: return ConvolutionNhwcDirect1x1_3xM<term, type, 3>; - case 4: return ConvolutionNhwcDirect1x1_3xM<term, type, 4>; - case 5: return ConvolutionNhwcDirect1x1_3xM<term, type, 5>; - case 6: return ConvolutionNhwcDirect1x1_3xM<term, type, 6>; - case 7: return ConvolutionNhwcDirect1x1_3xM<term, type, 7>; - } - assert(0); - return NULL; - } -#else - template<TermType term, SimdConvolutionActivationType type> void ConvolutionNhwcDirect1x1_3x4(const float* src0, const ConvParam32f& p, - const AlgParam& a, size_t srcC, size_t dstC, const float* weight0, const float32x4_t* bias, const float32x4_t* params, float* dst) - { - float32x4_t d00, d01, d02, d10, d11, d12, d20, d21, d22, d30, d31, d32, s0, w0, w1, w2; - size_t dS = p.srcC, dD = p.dstC; - const float* weight1 = weight0 + a.stepW; - const float* weight2 = weight1 + a.stepW; - const float* src1 = src0 + 1 * dS; - const float* src2 = src0 + 2 * dS; - const float* src3 = src0 + 3 * dS; - if (dstC > 2 * F) - { - d00 = vdupq_n_f32(0.0f), d01 = vdupq_n_f32(0.0f), d02 = vdupq_n_f32(0.0f); - d10 = vdupq_n_f32(0.0f), d11 = vdupq_n_f32(0.0f), d12 = vdupq_n_f32(0.0f); - d20 = vdupq_n_f32(0.0f), d21 = vdupq_n_f32(0.0f), d22 = vdupq_n_f32(0.0f); - d30 = vdupq_n_f32(0.0f), d31 = vdupq_n_f32(0.0f), d32 = vdupq_n_f32(0.0f); - for (size_t offs = 0, offw = 0; offs < srcC; ++offs, offw += F) - { - w0 = Load(weight0 + offw); - w1 = Load(weight1 + offw); - w2 = Load(weight2 + offw); - s0 = vdupq_n_f32(src0[offs]), d00 = vmlaq_f32(d00, s0, w0), d01 = vmlaq_f32(d01, s0, w1), d02 = vmlaq_f32(d02, s0, w2); - s0 = vdupq_n_f32(src1[offs]), d10 = vmlaq_f32(d10, s0, w0), d11 = vmlaq_f32(d11, s0, w1), d12 = vmlaq_f32(d12, s0, w2); - s0 = vdupq_n_f32(src2[offs]), d20 = vmlaq_f32(d20, s0, w0), d21 = vmlaq_f32(d21, s0, w1), d22 = vmlaq_f32(d22, s0, w2); - s0 = vdupq_n_f32(src3[offs]), d30 = vmlaq_f32(d30, s0, w0), d31 = vmlaq_f32(d31, s0, w1), d32 = vmlaq_f32(d32, s0, w2); - } - if (dstC == 3 * F) - { - Save3(dst, d00, d01, d02, bias, params), dst += dD; - Save3(dst, d10, d11, d12, bias, params), dst += dD; - Save3(dst, d20, d21, d22, bias, params), dst += dD; - Save3(dst, d30, d31, d32, bias, params), dst += dD; - } - else - { - dstC -= 2 * F; - Save3(dst, d00, d01, d02, bias, params, dstC), dst += dD; - Save3(dst, d10, d11, d12, bias, params, dstC), dst += dD; - Save3(dst, d20, d21, d22, bias, params, dstC), dst += dD; - Save3(dst, d30, d31, d32, bias, params, dstC), dst += dD; - } - } - else if (dstC > F) - { - d00 = vdupq_n_f32(0.0f), d01 = vdupq_n_f32(0.0f); - d10 = vdupq_n_f32(0.0f), d11 = vdupq_n_f32(0.0f); - d20 = vdupq_n_f32(0.0f), d21 = vdupq_n_f32(0.0f); - d30 = vdupq_n_f32(0.0f), d31 = vdupq_n_f32(0.0f); - for (size_t offs = 0, offw = 0; offs < srcC; ++offs, offw += F) - { - w0 = Load(weight0 + offw); - w1 = Load(weight1 + offw); - s0 = vdupq_n_f32(src0[offs]), d00 = vmlaq_f32(d00, s0, w0), d01 = vmlaq_f32(d01, s0, w1); - s0 = vdupq_n_f32(src1[offs]), d10 = vmlaq_f32(d10, s0, w0), d11 = vmlaq_f32(d11, s0, w1); - s0 = vdupq_n_f32(src2[offs]), d20 = vmlaq_f32(d20, s0, w0), d21 = vmlaq_f32(d21, s0, w1); - s0 = vdupq_n_f32(src3[offs]), d30 = vmlaq_f32(d30, s0, w0), d31 = vmlaq_f32(d31, s0, w1); - } - if (dstC == 2 * F) - { - Save2(dst, d00, d01, bias, params), dst += dD; - Save2(dst, d10, d11, bias, params), dst += dD; - Save2(dst, d20,
d21, bias, params), dst += dD; - Save2(dst, d30, d31, bias, params), dst += dD; - } - else - { - dstC -= 1 * F; - Save2(dst, d00, d01, bias, params, dstC), dst += dD; - Save2(dst, d10, d11, bias, params, dstC), dst += dD; - Save2(dst, d20, d21, bias, params, dstC), dst += dD; - Save2(dst, d30, d31, bias, params, dstC), dst += dD; - } - } - else - { - d00 = vdupq_n_f32(0.0f); - d10 = vdupq_n_f32(0.0f); - d20 = vdupq_n_f32(0.0f); - d30 = vdupq_n_f32(0.0f); - for (size_t offs = 0, offw = 0; offs < srcC; ++offs, offw += F) - { - w0 = Load(weight0 + offw); - s0 = vdupq_n_f32(src0[offs]), d00 = vmlaq_f32(d00, s0, w0); - s0 = vdupq_n_f32(src1[offs]), d10 = vmlaq_f32(d10, s0, w0); - s0 = vdupq_n_f32(src2[offs]), d20 = vmlaq_f32(d20, s0, w0); - s0 = vdupq_n_f32(src3[offs]), d30 = vmlaq_f32(d30, s0, w0); - } - if (dstC == F) - { - Save1(dst, d00, bias, params), dst += dD; - Save1(dst, d10, bias, params), dst += dD; - Save1(dst, d20, bias, params), dst += dD; - Save1(dst, d30, bias, params), dst += dD; - } - else - { - Save1(dst, d00, bias, params, dstC), dst += dD; - Save1(dst, d10, bias, params, dstC), dst += dD; - Save1(dst, d20, bias, params, dstC), dst += dD; - Save1(dst, d30, bias, params, dstC), dst += dD; - } - } - } - - template void ConvolutionNhwcDirect1x1_3xM(const float* src0, const ConvParam32f& p, - const AlgParam& a, size_t srcC, size_t dstC, const float* weight0, const float32x4_t* bias, const float32x4_t* params, float* dst) - { - float32x4_t d00, d01, d02, d10, d11, d12, d20, d21, d22, d30, d31, d32, s0, w0, w1, w2; - size_t dS = p.srcC, dD = p.dstC; - const float* weight1 = weight0 + a.stepW; - const float* weight2 = weight1 + a.stepW; - const float* src1 = src0 + 1 * dS; - const float* src2 = src0 + 2 * dS; - const float* src3 = src0 + 3 * dS; - if (dstC > 2 * F) - { - if (M > 0) d00 = vdupq_n_f32(0.0f), d01 = vdupq_n_f32(0.0f), d02 = vdupq_n_f32(0.0f); - if (M > 1) d10 = vdupq_n_f32(0.0f), d11 = vdupq_n_f32(0.0f), d12 = vdupq_n_f32(0.0f); - if (M > 2) d20 = vdupq_n_f32(0.0f), d21 = vdupq_n_f32(0.0f), d22 = vdupq_n_f32(0.0f); - if (M > 3) d30 = vdupq_n_f32(0.0f), d31 = vdupq_n_f32(0.0f), d32 = vdupq_n_f32(0.0f); - for (size_t offs = 0, offw = 0; offs < srcC; ++offs, offw += F) - { - w0 = Load(weight0 + offw); - w1 = Load(weight1 + offw); - w2 = Load(weight2 + offw); - if (M > 0) s0 = vdupq_n_f32(src0[offs]), d00 = vmlaq_f32(d00, s0, w0), d01 = vmlaq_f32(d01, s0, w1), d02 = vmlaq_f32(d02, s0, w2); - if (M > 1) s0 = vdupq_n_f32(src1[offs]), d10 = vmlaq_f32(d10, s0, w0), d11 = vmlaq_f32(d11, s0, w1), d12 = vmlaq_f32(d12, s0, w2); - if (M > 2) s0 = vdupq_n_f32(src2[offs]), d20 = vmlaq_f32(d20, s0, w0), d21 = vmlaq_f32(d21, s0, w1), d22 = vmlaq_f32(d22, s0, w2); - if (M > 3) s0 = vdupq_n_f32(src3[offs]), d30 = vmlaq_f32(d30, s0, w0), d31 = vmlaq_f32(d31, s0, w1), d32 = vmlaq_f32(d32, s0, w2); - } - if (dstC == 3 * F) - { - if (M > 0) Save3(dst, d00, d01, d02, bias, params), dst += dD; - if (M > 1) Save3(dst, d10, d11, d12, bias, params), dst += dD; - if (M > 2) Save3(dst, d20, d21, d22, bias, params), dst += dD; - if (M > 3) Save3(dst, d30, d31, d32, bias, params), dst += dD; - } - else - { - dstC -= 2 * F; - if (M > 0) Save3(dst, d00, d01, d02, bias, params, dstC), dst += dD; - if (M > 1) Save3(dst, d10, d11, d12, bias, params, dstC), dst += dD; - if (M > 2) Save3(dst, d20, d21, d22, bias, params, dstC), dst += dD; - if (M > 3) Save3(dst, d30, d31, d32, bias, params, dstC), dst += dD; - } - } - else if (dstC > F) - { - if (M > 0) d00 = vdupq_n_f32(0.0f), d01 = vdupq_n_f32(0.0f); - if 
(M > 1) d10 = vdupq_n_f32(0.0f), d11 = vdupq_n_f32(0.0f); - if (M > 2) d20 = vdupq_n_f32(0.0f), d21 = vdupq_n_f32(0.0f); - if (M > 3) d30 = vdupq_n_f32(0.0f), d31 = vdupq_n_f32(0.0f); - for (size_t offs = 0, offw = 0; offs < srcC; ++offs, offw += F) - { - w0 = Load(weight0 + offw); - w1 = Load(weight1 + offw); - if (M > 0) s0 = vdupq_n_f32(src0[offs]), d00 = vmlaq_f32(d00, s0, w0), d01 = vmlaq_f32(d01, s0, w1); - if (M > 1) s0 = vdupq_n_f32(src1[offs]), d10 = vmlaq_f32(d10, s0, w0), d11 = vmlaq_f32(d11, s0, w1); - if (M > 2) s0 = vdupq_n_f32(src2[offs]), d20 = vmlaq_f32(d20, s0, w0), d21 = vmlaq_f32(d21, s0, w1); - if (M > 3) s0 = vdupq_n_f32(src3[offs]), d30 = vmlaq_f32(d30, s0, w0), d31 = vmlaq_f32(d31, s0, w1); - } - if (dstC == DF) - { - if (M > 0) Save2(dst, d00, d01, bias, params), dst += dD; - if (M > 1) Save2(dst, d10, d11, bias, params), dst += dD; - if (M > 2) Save2(dst, d20, d21, bias, params), dst += dD; - if (M > 3) Save2(dst, d30, d31, bias, params), dst += dD; - } - else - { - dstC -= F; - if (M > 0) Save2(dst, d00, d01, bias, params, dstC), dst += dD; - if (M > 1) Save2(dst, d10, d11, bias, params, dstC), dst += dD; - if (M > 2) Save2(dst, d20, d21, bias, params, dstC), dst += dD; - if (M > 3) Save2(dst, d30, d31, bias, params, dstC), dst += dD; - } - } - else - { - if (M > 0) d00 = vdupq_n_f32(0.0f); - if (M > 1) d10 = vdupq_n_f32(0.0f); - if (M > 2) d20 = vdupq_n_f32(0.0f); - if (M > 3) d30 = vdupq_n_f32(0.0f); - for (size_t offs = 0, offw = 0; offs < srcC; ++offs, offw += F) - { - w0 = Load(weight0 + offw); - if (M > 0) s0 = vdupq_n_f32(src0[offs]), d00 = vmlaq_f32(d00, s0, w0); - if (M > 1) s0 = vdupq_n_f32(src1[offs]), d10 = vmlaq_f32(d10, s0, w0); - if (M > 2) s0 = vdupq_n_f32(src2[offs]), d20 = vmlaq_f32(d20, s0, w0); - if (M > 3) s0 = vdupq_n_f32(src3[offs]), d30 = vmlaq_f32(d30, s0, w0); - } - if (dstC == F) - { - if (M > 0) Save1(dst, d00, bias, params), dst += dD; - if (M > 1) Save1(dst, d10, bias, params), dst += dD; - if (M > 2) Save1(dst, d20, bias, params), dst += dD; - if (M > 3) Save1(dst, d30, bias, params), dst += dD; - } - else - { - if (M > 0) Save1(dst, d00, bias, params, dstC), dst += dD; - if (M > 1) Save1(dst, d10, bias, params, dstC), dst += dD; - if (M > 2) Save1(dst, d20, bias, params, dstC), dst += dD; - if (M > 3) Save1(dst, d30, bias, params, dstC), dst += dD; - } - } - } - - template<TermType term, SimdConvolutionActivationType type> ConvolutionNhwcDirect1x1_NxM_Ptr GetConvolutionNhwcDirect1x1_3xM(size_t M) - { - switch (M) - { - case 0: return NULL; - case 1: return ConvolutionNhwcDirect1x1_3xM<term, type, 1>; - case 2: return ConvolutionNhwcDirect1x1_3xM<term, type, 2>; - case 3: return ConvolutionNhwcDirect1x1_3xM<term, type, 3>; - } - assert(0); - return NULL; - } -#endif - - template<TermType term, SimdConvolutionActivationType type> void ConvolutionNhwcDirect1x1_3(const float* src, const ConvParam32f& p, const AlgParam& a, - size_t dstC, size_t yBeg, size_t yEnd, size_t srcC, const float* weight, const float* bias, const float* params, float* dst) - { -#if defined(SIMD_ARM64_ENABLE) - size_t n = 8, n1 = (yEnd - yBeg) * p.dstW, nn = AlignLoAny(n1, n), m = n1 - nn; - ConvolutionNhwcDirect1x1_NxM_Ptr convolutionNhwcDirect1x1_3xN = ConvolutionNhwcDirect1x1_3x8<term, type>; -#else - size_t n = 4, n1 = (yEnd - yBeg) * p.dstW, nn = AlignLoAny(n1, n), m = n1 - nn; - ConvolutionNhwcDirect1x1_NxM_Ptr convolutionNhwcDirect1x1_3xN = ConvolutionNhwcDirect1x1_3x4<term, type>; -#endif - ConvolutionNhwcDirect1x1_NxM_Ptr convolutionNhwcDirect1x1_3xM = GetConvolutionNhwcDirect1x1_3xM<term, type>(m); - - float32x4_t _params[3], _bias[3]; - _params[0] = vdupq_n_f32(params[0]); - if (type == ::SimdConvolutionActivationRestrictRange ||
type == ::SimdConvolutionActivationHswish) - _params[1] = vdupq_n_f32(params[1]); - - for (size_t dc = 0; dc < dstC; dc += a.microD) - { - size_t dC = Simd::Min(a.microD, dstC - dc); - if (dC > 0 * F) _bias[0] = Load(bias + dc + 0 * F); - if (dC > 1 * F) _bias[1] = Load(bias + dc + 1 * F); - if (dC > 2 * F) _bias[2] = Load(bias + dc + 2 * F); - if (type == ::SimdConvolutionActivationPrelu) - { - if (dC > 0 * F) _params[0] = Load(params + dc + 0 * F); - if (dC > 1 * F) _params[1] = Load(params + dc + 1 * F); - if (dC > 2 * F) _params[2] = Load(params + dc + 2 * F); - } - const float* ps = src + yBeg * p.srcW * p.srcC; - float* pd = dst + dc + yBeg * p.dstW * p.dstC; - size_t i = 0; - for (; i < nn; i += n, ps += n * p.srcC, pd += n * p.dstC) - convolutionNhwcDirect1x1_3xN(ps, p, a, srcC, dC, weight, _bias, _params, pd); - for (; i < n1; i += m, ps += m * p.srcC, pd += m * p.dstC) - convolutionNhwcDirect1x1_3xM(ps, p, a, srcC, dC, weight, _bias, _params, pd); - weight += p.srcC * a.microD; - } - } - - //--------------------------------------------------------------------- - - template<TermType term, SimdConvolutionActivationType type> static SIMD_INLINE void Set(const ConvParam32f& p, AlgParam& a) - { - a.convolutions[term] = p.Is1x1() ? ConvolutionNhwcDirect1x1_3<term, type> : ConvolutionNhwcDirect_3<term, type>; - } - - template<SimdConvolutionActivationType type> static SIMD_INLINE void Set(const ConvParam32f& p, AlgParam& a) - { - Set(p, a); - Set(p, a); - Set(p, a); - Set(p, a); - } - - bool SynetConvolution32fNhwcDirect::Set3r(const ConvParam32f& p, AlgParam& a) - { - assert(a.microD == 3 * F); - switch (p.activation) - { - case SimdConvolutionActivationIdentity: Set<SimdConvolutionActivationIdentity>(p, a); break; - case SimdConvolutionActivationRelu: Set<SimdConvolutionActivationRelu>(p, a); break; - case SimdConvolutionActivationLeakyRelu: Set<SimdConvolutionActivationLeakyRelu>(p, a); break; - case SimdConvolutionActivationRestrictRange: Set<SimdConvolutionActivationRestrictRange>(p, a); break; - case SimdConvolutionActivationPrelu: Set<SimdConvolutionActivationPrelu>(p, a); break; - case SimdConvolutionActivationElu: Set<SimdConvolutionActivationElu>(p, a); break; - case SimdConvolutionActivationHswish: Set<SimdConvolutionActivationHswish>(p, a); break; - default: assert(0); - } - return true; - } - } -#endif//SIMD_NEON_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdNeonSynetConvolution32fNhwcDirect4r.cpp b/src/3rd/Simd/Simd/SimdNeonSynetConvolution32fNhwcDirect4r.cpp deleted file mode 100644 index e565b57a..00000000 --- a/src/3rd/Simd/Simd/SimdNeonSynetConvolution32fNhwcDirect4r.cpp +++ /dev/null @@ -1,1725 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE.
-*/ -#include "Simd/SimdSynetConvolution32f.h" -#include "Simd/SimdSynetConvolution32fCommon.h" -#include "Simd/SimdCpu.h" - -namespace Simd -{ -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - using AlgParam = SynetConvolution32fNhwcDirect::AlgParam; - - typedef void(*ConvolutionNhwcDirect_NxM_Ptr)(const float* src0, const ConvParam32f& p, const AlgParam& a, size_t dy, size_t dx, size_t srcC, size_t dstC, const float* weight0, const float32x4_t* bias, const float32x4_t* params, float* dst); - typedef void(*ConvolutionNhwcDirect1x1_NxM_Ptr)(const float* src0, const ConvParam32f& p, const AlgParam& a, size_t srcC, size_t dstC, const float* weight0, const float32x4_t* bias, const float32x4_t* params, float* dst); - - //--------------------------------------------------------------------- - - template void ConvolutionNhwcDirect_4x1(const float* src0, const ConvParam32f& p, - const AlgParam& a, size_t dy, size_t dx, size_t srcC, size_t dstC, const float* weight0, const float32x4_t* bias, const float32x4_t* params, float* dst) - { - float32x4_t d00, d01, d02, d03, s0, w0, w1, w2, w3; - size_t srcH = p.srcH, srcW = p.srcW, dilY = p.dilationY, dilX = p.dilationX; - size_t dY = p.srcW * p.srcC, dX = p.srcC, dS = p.srcC * p.strideX, dW = p.srcC * F; - size_t sy = dy * p.strideY - p.padY, sx = dx * p.strideX - p.padX; - size_t kY = p.kernelY * p.dilationY, kX = p.kernelX * p.dilationX; - const float* weight1 = weight0 + a.stepW; - const float* weight2 = weight1 + a.stepW; - const float* weight3 = weight2 + a.stepW; - if (dstC > 3 * F) - { - d00 = vdupq_n_f32(0.0f), d01 = vdupq_n_f32(0.0f), d02 = vdupq_n_f32(0.0f), d03 = vdupq_n_f32(0.0f); - for (size_t ky = 0; ky < kY; ky += dilY) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - if (sy + ky < srcH && sx + kx < srcW) - { - size_t offs = beg + kx * dX, end = offs + srcC, offw = 0; - for (; offs < end; ++offs, offw += F) - { - w0 = Load(weight0 + offw); - w1 = Load(weight1 + offw); - w2 = Load(weight2 + offw); - w3 = Load(weight3 + offw); - s0 = vdupq_n_f32(src0[offs]), d00 = vmlaq_f32(d00, s0, w0), d01 = vmlaq_f32(d01, s0, w1), d02 = vmlaq_f32(d02, s0, w2), d03 = vmlaq_f32(d03, s0, w3); - } - } - weight0 += dW, weight1 += dW, weight2 += dW, weight3 += dW; - } - } - if (dstC == 4 * F) - Save4(dst, d00, d01, d02, d03, bias, params); - else - Save4(dst, d00, d01, d02, d03, bias, params, dstC - 3 * F); - } - else if (dstC > 2 * F) - { - d00 = vdupq_n_f32(0.0f), d01 = vdupq_n_f32(0.0f), d02 = vdupq_n_f32(0.0f); - for (size_t ky = 0; ky < kY; ky += dilY) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - if (sy + ky < srcH && sx + kx < srcW) - { - size_t offs = beg + kx * dX, end = offs + srcC, offw = 0; - for (; offs < end; ++offs, offw += F) - { - w0 = Load(weight0 + offw); - w1 = Load(weight1 + offw); - w2 = Load(weight2 + offw); - s0 = vdupq_n_f32(src0[offs]), d00 = vmlaq_f32(d00, s0, w0), d01 = vmlaq_f32(d01, s0, w1), d02 = vmlaq_f32(d02, s0, w2); - } - } - weight0 += dW, weight1 += dW, weight2 += dW; - } - } - if (dstC == 3 * F) - Save3(dst, d00, d01, d02, bias, params); - else - Save3(dst, d00, d01, d02, bias, params, dstC - 2 * F); - } - else if (dstC > F) - { - d00 = vdupq_n_f32(0.0f), d01 = vdupq_n_f32(0.0f); - for (size_t ky = 0; ky < kY; ky += dilY) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - if (sy + ky < srcH && sx + kx < srcW) - { - size_t offs = beg + kx * dX, end = offs + srcC, offw = 0; - for (; offs < 
end; ++offs, offw += F) - { - w0 = Load(weight0 + offw); - w1 = Load(weight1 + offw); - s0 = vdupq_n_f32(src0[offs]), d00 = vmlaq_f32(d00, s0, w0), d01 = vmlaq_f32(d01, s0, w1); - } - } - weight0 += dW, weight1 += dW; - } - } - if (dstC == 2 * F) - Save2(dst, d00, d01, bias, params); - else - Save2(dst, d00, d01, bias, params, dstC - F); - } - else - { - d00 = vdupq_n_f32(0.0f); - for (size_t ky = 0; ky < kY; ky += dilY) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - if (sy + ky < srcH && sx + kx < srcW) - { - size_t offs = beg + kx * dX, end = offs + srcC, offw = 0; - for (; offs < end; ++offs, offw += F) - { - w0 = Load(weight0 + offw); - s0 = vdupq_n_f32(src0[offs]), d00 = vmlaq_f32(d00, s0, w0); - } - } - weight0 += dW; - } - } - if (dstC == F) - Save1(dst, d00, bias, params); - else - Save1(dst, d00, bias, params, dstC); - } - } - -#if defined(SIMD_ARM64_ENABLE) - template void ConvolutionNhwcDirect_4x6(const float* src0, const ConvParam32f& p, - const AlgParam& a, size_t dy, size_t dx, size_t srcC, size_t dstC, const float* weight0, const float32x4_t* bias, const float32x4_t* params, float* dst) - { - float32x4_t d00, d01, d02, d03, d10, d11, d12, d13, d20, d21, d22, d23, d30, d31, d32, d33, d40, d41, d42, d43, d50, d51, d52, d53, s0, w0, w1, w2, w3; - size_t srcH = p.srcH, srcW = p.srcW, dilY = p.dilationY, dilX = p.dilationX; - size_t dY = p.srcW * p.srcC, dX = p.srcC, dS = p.srcC * p.strideX, dW = p.srcC * F, dWz = p.kernelX * p.srcC * F, dD = p.dstC; - size_t sy = dy * p.strideY - p.padY, sx = dx * p.strideX - p.padX; - size_t kY = p.kernelY * p.dilationY, kX = p.kernelX * p.dilationX; - const float* weight1 = weight0 + a.stepW; - const float* weight2 = weight1 + a.stepW; - const float* weight3 = weight2 + a.stepW; - const float* src1 = src0 + 1 * dS; - const float* src2 = src0 + 2 * dS; - const float* src3 = src0 + 3 * dS; - const float* src4 = src0 + 4 * dS; - const float* src5 = src0 + 5 * dS; - if (dstC > 3 * F) - { - d00 = vdupq_n_f32(0.0f), d01 = vdupq_n_f32(0.0f), d02 = vdupq_n_f32(0.0f), d03 = vdupq_n_f32(0.0f); - d10 = vdupq_n_f32(0.0f), d11 = vdupq_n_f32(0.0f), d12 = vdupq_n_f32(0.0f), d13 = vdupq_n_f32(0.0f); - d20 = vdupq_n_f32(0.0f), d21 = vdupq_n_f32(0.0f), d22 = vdupq_n_f32(0.0f), d23 = vdupq_n_f32(0.0f); - d30 = vdupq_n_f32(0.0f), d31 = vdupq_n_f32(0.0f), d32 = vdupq_n_f32(0.0f), d33 = vdupq_n_f32(0.0f); - d40 = vdupq_n_f32(0.0f), d41 = vdupq_n_f32(0.0f), d42 = vdupq_n_f32(0.0f), d43 = vdupq_n_f32(0.0f); - d50 = vdupq_n_f32(0.0f), d51 = vdupq_n_f32(0.0f), d52 = vdupq_n_f32(0.0f), d53 = vdupq_n_f32(0.0f); - for (size_t ky = 0; ky < kY; ky += dilY) - { - if (sy + ky < srcH) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - assert(sx + kx < srcW && sx + kx + 6 <= srcW); - size_t offs = beg + kx * dX, end = offs + srcC, offw = 0; - for (; offs < end; ++offs, offw += F) - { - w0 = Load(weight0 + offw); - w1 = Load(weight1 + offw); - w2 = Load(weight2 + offw); - w3 = Load(weight3 + offw); - s0 = vdupq_n_f32(src0[offs]), d00 = vmlaq_f32(d00, s0, w0), d01 = vmlaq_f32(d01, s0, w1), d02 = vmlaq_f32(d02, s0, w2), d03 = vmlaq_f32(d03, s0, w3); - s0 = vdupq_n_f32(src1[offs]), d10 = vmlaq_f32(d10, s0, w0), d11 = vmlaq_f32(d11, s0, w1), d12 = vmlaq_f32(d12, s0, w2), d13 = vmlaq_f32(d13, s0, w3); - s0 = vdupq_n_f32(src2[offs]), d20 = vmlaq_f32(d20, s0, w0), d21 = vmlaq_f32(d21, s0, w1), d22 = vmlaq_f32(d22, s0, w2), d23 = vmlaq_f32(d23, s0, w3); - s0 = vdupq_n_f32(src3[offs]), d30 = 
vmlaq_f32(d30, s0, w0), d31 = vmlaq_f32(d31, s0, w1), d32 = vmlaq_f32(d32, s0, w2), d33 = vmlaq_f32(d33, s0, w3); - s0 = vdupq_n_f32(src4[offs]), d40 = vmlaq_f32(d40, s0, w0), d41 = vmlaq_f32(d41, s0, w1), d42 = vmlaq_f32(d42, s0, w2), d43 = vmlaq_f32(d43, s0, w3); - s0 = vdupq_n_f32(src5[offs]), d50 = vmlaq_f32(d50, s0, w0), d51 = vmlaq_f32(d51, s0, w1), d52 = vmlaq_f32(d52, s0, w2), d53 = vmlaq_f32(d53, s0, w3); - } - weight0 += dW, weight1 += dW, weight2 += dW, weight3 += dW; - } - } - else - weight0 += dWz, weight1 += dWz, weight2 += dWz, weight3 += dWz; - } - if (dstC == 4 * F) - { - Save4(dst, d00, d01, d02, d03, bias, params), dst += dD; - Save4(dst, d10, d11, d12, d13, bias, params), dst += dD; - Save4(dst, d20, d21, d22, d23, bias, params), dst += dD; - Save4(dst, d30, d31, d32, d33, bias, params), dst += dD; - Save4(dst, d40, d41, d42, d43, bias, params), dst += dD; - Save4(dst, d50, d51, d52, d53, bias, params), dst += dD; - } - else - { - dstC -= 3 * F; - Save4(dst, d00, d01, d02, d03, bias, params, dstC), dst += dD; - Save4(dst, d10, d11, d12, d13, bias, params, dstC), dst += dD; - Save4(dst, d20, d21, d22, d23, bias, params, dstC), dst += dD; - Save4(dst, d30, d31, d32, d33, bias, params, dstC), dst += dD; - Save4(dst, d40, d41, d42, d43, bias, params, dstC), dst += dD; - Save4(dst, d50, d51, d52, d53, bias, params, dstC), dst += dD; - } - } - else if (dstC > 2 * F) - { - d00 = vdupq_n_f32(0.0f), d01 = vdupq_n_f32(0.0f), d02 = vdupq_n_f32(0.0f); - d10 = vdupq_n_f32(0.0f), d11 = vdupq_n_f32(0.0f), d12 = vdupq_n_f32(0.0f); - d20 = vdupq_n_f32(0.0f), d21 = vdupq_n_f32(0.0f), d22 = vdupq_n_f32(0.0f); - d30 = vdupq_n_f32(0.0f), d31 = vdupq_n_f32(0.0f), d32 = vdupq_n_f32(0.0f); - d40 = vdupq_n_f32(0.0f), d41 = vdupq_n_f32(0.0f), d42 = vdupq_n_f32(0.0f); - d50 = vdupq_n_f32(0.0f), d51 = vdupq_n_f32(0.0f), d52 = vdupq_n_f32(0.0f); - for (size_t ky = 0; ky < kY; ky += dilY) - { - if (sy + ky < srcH) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - assert(sx + kx < srcW && sx + kx + 6 <= srcW); - size_t offs = beg + kx * dX, end = offs + srcC, offw = 0; - for (; offs < end; ++offs, offw += F) - { - w0 = Load(weight0 + offw); - w1 = Load(weight1 + offw); - w2 = Load(weight2 + offw); - s0 = vdupq_n_f32(src0[offs]), d00 = vmlaq_f32(d00, s0, w0), d01 = vmlaq_f32(d01, s0, w1), d02 = vmlaq_f32(d02, s0, w2); - s0 = vdupq_n_f32(src1[offs]), d10 = vmlaq_f32(d10, s0, w0), d11 = vmlaq_f32(d11, s0, w1), d12 = vmlaq_f32(d12, s0, w2); - s0 = vdupq_n_f32(src2[offs]), d20 = vmlaq_f32(d20, s0, w0), d21 = vmlaq_f32(d21, s0, w1), d22 = vmlaq_f32(d22, s0, w2); - s0 = vdupq_n_f32(src3[offs]), d30 = vmlaq_f32(d30, s0, w0), d31 = vmlaq_f32(d31, s0, w1), d32 = vmlaq_f32(d32, s0, w2); - s0 = vdupq_n_f32(src4[offs]), d40 = vmlaq_f32(d40, s0, w0), d41 = vmlaq_f32(d41, s0, w1), d42 = vmlaq_f32(d42, s0, w2); - s0 = vdupq_n_f32(src5[offs]), d50 = vmlaq_f32(d50, s0, w0), d51 = vmlaq_f32(d51, s0, w1), d52 = vmlaq_f32(d52, s0, w2); - } - weight0 += dW, weight1 += dW, weight2 += dW; - } - } - else - weight0 += dWz, weight1 += dWz, weight2 += dWz; - } - if (dstC == 3 * F) - { - Save3(dst, d00, d01, d02, bias, params), dst += dD; - Save3(dst, d10, d11, d12, bias, params), dst += dD; - Save3(dst, d20, d21, d22, bias, params), dst += dD; - Save3(dst, d30, d31, d32, bias, params), dst += dD; - Save3(dst, d40, d41, d42, bias, params), dst += dD; - Save3(dst, d50, d51, d52, bias, params), dst += dD; - } - else - { - dstC -= 2 * F; - Save3(dst, d00, d01, d02, bias, params, dstC), 
dst += dD; - Save3(dst, d10, d11, d12, bias, params, dstC), dst += dD; - Save3(dst, d20, d21, d22, bias, params, dstC), dst += dD; - Save3(dst, d30, d31, d32, bias, params, dstC), dst += dD; - Save3(dst, d40, d41, d42, bias, params, dstC), dst += dD; - Save3(dst, d50, d51, d52, bias, params, dstC), dst += dD; - } - } - else if (dstC > F) - { - d00 = vdupq_n_f32(0.0f), d01 = vdupq_n_f32(0.0f); - d10 = vdupq_n_f32(0.0f), d11 = vdupq_n_f32(0.0f); - d20 = vdupq_n_f32(0.0f), d21 = vdupq_n_f32(0.0f); - d30 = vdupq_n_f32(0.0f), d31 = vdupq_n_f32(0.0f); - d40 = vdupq_n_f32(0.0f), d41 = vdupq_n_f32(0.0f); - d50 = vdupq_n_f32(0.0f), d51 = vdupq_n_f32(0.0f); - for (size_t ky = 0; ky < kY; ky += dilY) - { - if (sy + ky < srcH) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - assert(sx + kx < srcW && sx + kx + 6 <= srcW); - size_t offs = beg + kx * dX, end = offs + srcC, offw = 0; - for (; offs < end; ++offs, offw += F) - { - w0 = Load(weight0 + offw); - w1 = Load(weight1 + offw); - s0 = vdupq_n_f32(src0[offs]), d00 = vmlaq_f32(d00, s0, w0), d01 = vmlaq_f32(d01, s0, w1); - s0 = vdupq_n_f32(src1[offs]), d10 = vmlaq_f32(d10, s0, w0), d11 = vmlaq_f32(d11, s0, w1); - s0 = vdupq_n_f32(src2[offs]), d20 = vmlaq_f32(d20, s0, w0), d21 = vmlaq_f32(d21, s0, w1); - s0 = vdupq_n_f32(src3[offs]), d30 = vmlaq_f32(d30, s0, w0), d31 = vmlaq_f32(d31, s0, w1); - s0 = vdupq_n_f32(src4[offs]), d40 = vmlaq_f32(d40, s0, w0), d41 = vmlaq_f32(d41, s0, w1); - s0 = vdupq_n_f32(src5[offs]), d50 = vmlaq_f32(d50, s0, w0), d51 = vmlaq_f32(d51, s0, w1); - } - weight0 += dW, weight1 += dW; - } - } - else - weight0 += dWz, weight1 += dWz; - } - if (dstC == 2 * F) - { - Save2(dst, d00, d01, bias, params), dst += dD; - Save2(dst, d10, d11, bias, params), dst += dD; - Save2(dst, d20, d21, bias, params), dst += dD; - Save2(dst, d30, d31, bias, params), dst += dD; - Save2(dst, d40, d41, bias, params), dst += dD; - Save2(dst, d50, d51, bias, params), dst += dD; - } - else - { - dstC -= 1 * F; - Save2(dst, d00, d01, bias, params, dstC), dst += dD; - Save2(dst, d10, d11, bias, params, dstC), dst += dD; - Save2(dst, d20, d21, bias, params, dstC), dst += dD; - Save2(dst, d30, d31, bias, params, dstC), dst += dD; - Save2(dst, d40, d41, bias, params, dstC), dst += dD; - Save2(dst, d50, d51, bias, params, dstC), dst += dD; - } - } - else - { - d00 = vdupq_n_f32(0.0f); - d10 = vdupq_n_f32(0.0f); - d20 = vdupq_n_f32(0.0f); - d30 = vdupq_n_f32(0.0f); - d40 = vdupq_n_f32(0.0f); - d50 = vdupq_n_f32(0.0f); - for (size_t ky = 0; ky < kY; ky += dilY) - { - if (sy + ky < srcH) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - assert(sx + kx < srcW && sx + kx + 6 <= srcW); - size_t offs = beg + kx * dX, end = offs + srcC, offw = 0; - for (; offs < end; ++offs, offw += F) - { - w0 = Load(weight0 + offw); - s0 = vdupq_n_f32(src0[offs]), d00 = vmlaq_f32(d00, s0, w0); - s0 = vdupq_n_f32(src1[offs]), d10 = vmlaq_f32(d10, s0, w0); - s0 = vdupq_n_f32(src2[offs]), d20 = vmlaq_f32(d20, s0, w0); - s0 = vdupq_n_f32(src3[offs]), d30 = vmlaq_f32(d30, s0, w0); - s0 = vdupq_n_f32(src4[offs]), d40 = vmlaq_f32(d40, s0, w0); - s0 = vdupq_n_f32(src5[offs]), d50 = vmlaq_f32(d50, s0, w0); - } - weight0 += dW; - } - } - else - weight0 += dWz; - } - if (dstC == F) - { - Save1(dst, d00, bias, params), dst += dD; - Save1(dst, d10, bias, params), dst += dD; - Save1(dst, d20, bias, params), dst += dD; - Save1(dst, d30, bias, params), dst += dD; - Save1(dst, d40, bias, params), dst += dD; - 
Save1(dst, d50, bias, params), dst += dD; - } - else - { - Save1(dst, d00, bias, params, dstC), dst += dD; - Save1(dst, d10, bias, params, dstC), dst += dD; - Save1(dst, d20, bias, params, dstC), dst += dD; - Save1(dst, d30, bias, params, dstC), dst += dD; - Save1(dst, d40, bias, params, dstC), dst += dD; - Save1(dst, d50, bias, params, dstC), dst += dD; - } - } - } - - template void ConvolutionNhwcDirect_4xM(const float* src0, const ConvParam32f& p, - const AlgParam& a, size_t dy, size_t dx, size_t srcC, size_t dstC, const float* weight0, const float32x4_t* bias, const float32x4_t* params, float* dst) - { - float32x4_t d00, d01, d02, d03, d10, d11, d12, d13, d20, d21, d22, d23, d30, d31, d32, d33, d40, d41, d42, d43, d50, d51, d52, d53, s0, w0, w1, w2, w3; - size_t srcH = p.srcH, srcW = p.srcW, dilY = p.dilationY, dilX = p.dilationX; - size_t dY = p.srcW * p.srcC, dX = p.srcC, dS = p.srcC * p.strideX, dW = p.srcC * F, dWz = p.kernelX * p.srcC * F, dD = p.dstC; - size_t sy = dy * p.strideY - p.padY, sx = dx * p.strideX - p.padX; - size_t kY = p.kernelY * p.dilationY, kX = p.kernelX * p.dilationX; - const float* weight1 = weight0 + a.stepW; - const float* weight2 = weight1 + a.stepW; - const float* weight3 = weight2 + a.stepW; - const float* src1 = src0 + 1 * dS; - const float* src2 = src0 + 2 * dS; - const float* src3 = src0 + 3 * dS; - const float* src4 = src0 + 4 * dS; - const float* src5 = src0 + 5 * dS; - if (dstC > 3 * F) - { - if (M > 0) d00 = vdupq_n_f32(0.0f), d01 = vdupq_n_f32(0.0f), d02 = vdupq_n_f32(0.0f), d03 = vdupq_n_f32(0.0f); - if (M > 1) d10 = vdupq_n_f32(0.0f), d11 = vdupq_n_f32(0.0f), d12 = vdupq_n_f32(0.0f), d13 = vdupq_n_f32(0.0f); - if (M > 2) d20 = vdupq_n_f32(0.0f), d21 = vdupq_n_f32(0.0f), d22 = vdupq_n_f32(0.0f), d23 = vdupq_n_f32(0.0f); - if (M > 3) d30 = vdupq_n_f32(0.0f), d31 = vdupq_n_f32(0.0f), d32 = vdupq_n_f32(0.0f), d33 = vdupq_n_f32(0.0f); - if (M > 4) d40 = vdupq_n_f32(0.0f), d41 = vdupq_n_f32(0.0f), d42 = vdupq_n_f32(0.0f), d43 = vdupq_n_f32(0.0f); - if (M > 5) d50 = vdupq_n_f32(0.0f), d51 = vdupq_n_f32(0.0f), d52 = vdupq_n_f32(0.0f), d53 = vdupq_n_f32(0.0f); - for (size_t ky = 0; ky < kY; ky += dilY) - { - if (sy + ky < srcH) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - assert(sx + kx < srcW && sx + kx + M <= srcW); - size_t offs = beg + kx * dX, end = offs + srcC, offw = 0; - for (; offs < end; ++offs, offw += F) - { - w0 = Load(weight0 + offw); - w1 = Load(weight1 + offw); - w2 = Load(weight2 + offw); - w3 = Load(weight3 + offw); - if (M > 0) s0 = vdupq_n_f32(src0[offs]), d00 = vmlaq_f32(d00, s0, w0), d01 = vmlaq_f32(d01, s0, w1), d02 = vmlaq_f32(d02, s0, w2), d03 = vmlaq_f32(d03, s0, w3); - if (M > 1) s0 = vdupq_n_f32(src1[offs]), d10 = vmlaq_f32(d10, s0, w0), d11 = vmlaq_f32(d11, s0, w1), d12 = vmlaq_f32(d12, s0, w2), d13 = vmlaq_f32(d13, s0, w3); - if (M > 2) s0 = vdupq_n_f32(src2[offs]), d20 = vmlaq_f32(d20, s0, w0), d21 = vmlaq_f32(d21, s0, w1), d22 = vmlaq_f32(d22, s0, w2), d23 = vmlaq_f32(d23, s0, w3); - if (M > 3) s0 = vdupq_n_f32(src3[offs]), d30 = vmlaq_f32(d30, s0, w0), d31 = vmlaq_f32(d31, s0, w1), d32 = vmlaq_f32(d32, s0, w2), d33 = vmlaq_f32(d33, s0, w3); - if (M > 4) s0 = vdupq_n_f32(src4[offs]), d40 = vmlaq_f32(d40, s0, w0), d41 = vmlaq_f32(d41, s0, w1), d42 = vmlaq_f32(d42, s0, w2), d43 = vmlaq_f32(d43, s0, w3); - if (M > 5) s0 = vdupq_n_f32(src5[offs]), d50 = vmlaq_f32(d50, s0, w0), d51 = vmlaq_f32(d51, s0, w1), d52 = vmlaq_f32(d52, s0, w2), d53 = vmlaq_f32(d53, s0, w3); - } - weight0 
+= dW, weight1 += dW, weight2 += dW, weight3 += dW; - } - } - else - weight0 += dWz, weight1 += dWz, weight2 += dWz, weight3 += dWz; - } - if (dstC == 4 * F) - { - if (M > 0) Save4(dst, d00, d01, d02, d03, bias, params), dst += dD; - if (M > 1) Save4(dst, d10, d11, d12, d13, bias, params), dst += dD; - if (M > 2) Save4(dst, d20, d21, d22, d23, bias, params), dst += dD; - if (M > 3) Save4(dst, d30, d31, d32, d33, bias, params), dst += dD; - if (M > 4) Save4(dst, d40, d41, d42, d43, bias, params), dst += dD; - if (M > 5) Save4(dst, d50, d51, d52, d53, bias, params), dst += dD; - } - else - { - dstC -= 3 * F; - if (M > 0) Save4(dst, d00, d01, d02, d03, bias, params, dstC), dst += dD; - if (M > 1) Save4(dst, d10, d11, d12, d13, bias, params, dstC), dst += dD; - if (M > 2) Save4(dst, d20, d21, d22, d23, bias, params, dstC), dst += dD; - if (M > 3) Save4(dst, d30, d31, d32, d33, bias, params, dstC), dst += dD; - if (M > 4) Save4(dst, d40, d41, d42, d43, bias, params, dstC), dst += dD; - if (M > 5) Save4(dst, d50, d51, d52, d53, bias, params, dstC), dst += dD; - } - } - else if (dstC > 2 * F) - { - if (M > 0) d00 = vdupq_n_f32(0.0f), d01 = vdupq_n_f32(0.0f), d02 = vdupq_n_f32(0.0f); - if (M > 1) d10 = vdupq_n_f32(0.0f), d11 = vdupq_n_f32(0.0f), d12 = vdupq_n_f32(0.0f); - if (M > 2) d20 = vdupq_n_f32(0.0f), d21 = vdupq_n_f32(0.0f), d22 = vdupq_n_f32(0.0f); - if (M > 3) d30 = vdupq_n_f32(0.0f), d31 = vdupq_n_f32(0.0f), d32 = vdupq_n_f32(0.0f); - if (M > 4) d40 = vdupq_n_f32(0.0f), d41 = vdupq_n_f32(0.0f), d42 = vdupq_n_f32(0.0f); - if (M > 5) d50 = vdupq_n_f32(0.0f), d51 = vdupq_n_f32(0.0f), d52 = vdupq_n_f32(0.0f); - for (size_t ky = 0; ky < kY; ky += dilY) - { - if (sy + ky < srcH) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - assert(sx + kx < srcW && sx + kx + M <= srcW); - size_t offs = beg + kx * dX, end = offs + srcC, offw = 0; - for (; offs < end; ++offs, offw += F) - { - w0 = Load(weight0 + offw); - w1 = Load(weight1 + offw); - w2 = Load(weight2 + offw); - if (M > 0) s0 = vdupq_n_f32(src0[offs]), d00 = vmlaq_f32(d00, s0, w0), d01 = vmlaq_f32(d01, s0, w1), d02 = vmlaq_f32(d02, s0, w2); - if (M > 1) s0 = vdupq_n_f32(src1[offs]), d10 = vmlaq_f32(d10, s0, w0), d11 = vmlaq_f32(d11, s0, w1), d12 = vmlaq_f32(d12, s0, w2); - if (M > 2) s0 = vdupq_n_f32(src2[offs]), d20 = vmlaq_f32(d20, s0, w0), d21 = vmlaq_f32(d21, s0, w1), d22 = vmlaq_f32(d22, s0, w2); - if (M > 3) s0 = vdupq_n_f32(src3[offs]), d30 = vmlaq_f32(d30, s0, w0), d31 = vmlaq_f32(d31, s0, w1), d32 = vmlaq_f32(d32, s0, w2); - if (M > 4) s0 = vdupq_n_f32(src4[offs]), d40 = vmlaq_f32(d40, s0, w0), d41 = vmlaq_f32(d41, s0, w1), d42 = vmlaq_f32(d42, s0, w2); - if (M > 5) s0 = vdupq_n_f32(src5[offs]), d50 = vmlaq_f32(d50, s0, w0), d51 = vmlaq_f32(d51, s0, w1), d52 = vmlaq_f32(d52, s0, w2); - } - weight0 += dW, weight1 += dW, weight2 += dW; - } - } - else - weight0 += dWz, weight1 += dWz, weight2 += dWz; - } - if (dstC == 3 * F) - { - if (M > 0) Save3(dst, d00, d01, d02, bias, params), dst += dD; - if (M > 1) Save3(dst, d10, d11, d12, bias, params), dst += dD; - if (M > 2) Save3(dst, d20, d21, d22, bias, params), dst += dD; - if (M > 3) Save3(dst, d30, d31, d32, bias, params), dst += dD; - if (M > 4) Save3(dst, d40, d41, d42, bias, params), dst += dD; - if (M > 5) Save3(dst, d50, d51, d52, bias, params), dst += dD; - } - else - { - dstC -= 2 * F; - if (M > 0) Save3(dst, d00, d01, d02, bias, params, dstC), dst += dD; - if (M > 1) Save3(dst, d10, d11, d12, bias, params, dstC), dst += dD; - if (M > 
2) Save3(dst, d20, d21, d22, bias, params, dstC), dst += dD; - if (M > 3) Save3(dst, d30, d31, d32, bias, params, dstC), dst += dD; - if (M > 4) Save3(dst, d40, d41, d42, bias, params, dstC), dst += dD; - if (M > 5) Save3(dst, d50, d51, d52, bias, params, dstC), dst += dD; - } - } - else if (dstC > F) - { - if (M > 0) d00 = vdupq_n_f32(0.0f), d01 = vdupq_n_f32(0.0f); - if (M > 1) d10 = vdupq_n_f32(0.0f), d11 = vdupq_n_f32(0.0f); - if (M > 2) d20 = vdupq_n_f32(0.0f), d21 = vdupq_n_f32(0.0f); - if (M > 3) d30 = vdupq_n_f32(0.0f), d31 = vdupq_n_f32(0.0f); - if (M > 4) d40 = vdupq_n_f32(0.0f), d41 = vdupq_n_f32(0.0f); - if (M > 5) d50 = vdupq_n_f32(0.0f), d51 = vdupq_n_f32(0.0f); - for (size_t ky = 0; ky < kY; ky += dilY) - { - if (sy + ky < srcH) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - assert(sx + kx < srcW && sx + kx + M <= srcW); - size_t offs = beg + kx * dX, end = offs + srcC, offw = 0; - for (; offs < end; ++offs, offw += F) - { - w0 = Load(weight0 + offw); - w1 = Load(weight1 + offw); - if (M > 0) s0 = vdupq_n_f32(src0[offs]), d00 = vmlaq_f32(d00, s0, w0), d01 = vmlaq_f32(d01, s0, w1); - if (M > 1) s0 = vdupq_n_f32(src1[offs]), d10 = vmlaq_f32(d10, s0, w0), d11 = vmlaq_f32(d11, s0, w1); - if (M > 2) s0 = vdupq_n_f32(src2[offs]), d20 = vmlaq_f32(d20, s0, w0), d21 = vmlaq_f32(d21, s0, w1); - if (M > 3) s0 = vdupq_n_f32(src3[offs]), d30 = vmlaq_f32(d30, s0, w0), d31 = vmlaq_f32(d31, s0, w1); - if (M > 4) s0 = vdupq_n_f32(src4[offs]), d40 = vmlaq_f32(d40, s0, w0), d41 = vmlaq_f32(d41, s0, w1); - if (M > 5) s0 = vdupq_n_f32(src5[offs]), d50 = vmlaq_f32(d50, s0, w0), d51 = vmlaq_f32(d51, s0, w1); - } - weight0 += dW, weight1 += dW; - } - } - else - weight0 += dWz, weight1 += dWz; - } - if (dstC == DF) - { - if (M > 0) Save2(dst, d00, d01, bias, params), dst += dD; - if (M > 1) Save2(dst, d10, d11, bias, params), dst += dD; - if (M > 2) Save2(dst, d20, d21, bias, params), dst += dD; - if (M > 3) Save2(dst, d30, d31, bias, params), dst += dD; - if (M > 4) Save2(dst, d40, d41, bias, params), dst += dD; - if (M > 5) Save2(dst, d50, d51, bias, params), dst += dD; - } - else - { - dstC -= F; - if (M > 0) Save2(dst, d00, d01, bias, params, dstC), dst += dD; - if (M > 1) Save2(dst, d10, d11, bias, params, dstC), dst += dD; - if (M > 2) Save2(dst, d20, d21, bias, params, dstC), dst += dD; - if (M > 3) Save2(dst, d30, d31, bias, params, dstC), dst += dD; - if (M > 4) Save2(dst, d40, d41, bias, params, dstC), dst += dD; - if (M > 5) Save2(dst, d50, d51, bias, params, dstC), dst += dD; - } - } - else - { - if (M > 0) d00 = vdupq_n_f32(0.0f); - if (M > 1) d10 = vdupq_n_f32(0.0f); - if (M > 2) d20 = vdupq_n_f32(0.0f); - if (M > 3) d30 = vdupq_n_f32(0.0f); - if (M > 4) d40 = vdupq_n_f32(0.0f); - if (M > 5) d50 = vdupq_n_f32(0.0f); - for (size_t ky = 0; ky < kY; ky += dilY) - { - if (sy + ky < srcH) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - assert(sx + kx < srcW && sx + kx + M <= srcW); - size_t offs = beg + kx * dX, end = offs + srcC, offw = 0; - for (; offs < end; ++offs, offw += F) - { - w0 = Load(weight0 + offw); - if (M > 0) s0 = vdupq_n_f32(src0[offs]), d00 = vmlaq_f32(d00, s0, w0); - if (M > 1) s0 = vdupq_n_f32(src1[offs]), d10 = vmlaq_f32(d10, s0, w0); - if (M > 2) s0 = vdupq_n_f32(src2[offs]), d20 = vmlaq_f32(d20, s0, w0); - if (M > 3) s0 = vdupq_n_f32(src3[offs]), d30 = vmlaq_f32(d30, s0, w0); - if (M > 4) s0 = vdupq_n_f32(src4[offs]), d40 = vmlaq_f32(d40, s0, w0); - if (M > 5) s0 = 
vdupq_n_f32(src5[offs]), d50 = vmlaq_f32(d50, s0, w0); - } - weight0 += dW; - } - } - else - weight0 += dWz; - } - if (dstC == F) - { - if (M > 0) Save1(dst, d00, bias, params), dst += dD; - if (M > 1) Save1(dst, d10, bias, params), dst += dD; - if (M > 2) Save1(dst, d20, bias, params), dst += dD; - if (M > 3) Save1(dst, d30, bias, params), dst += dD; - if (M > 4) Save1(dst, d40, bias, params), dst += dD; - if (M > 5) Save1(dst, d50, bias, params), dst += dD; - } - else - { - if (M > 0) Save1(dst, d00, bias, params, dstC), dst += dD; - if (M > 1) Save1(dst, d10, bias, params, dstC), dst += dD; - if (M > 2) Save1(dst, d20, bias, params, dstC), dst += dD; - if (M > 3) Save1(dst, d30, bias, params, dstC), dst += dD; - if (M > 4) Save1(dst, d40, bias, params, dstC), dst += dD; - if (M > 5) Save1(dst, d50, bias, params, dstC), dst += dD; - } - } - } - - template ConvolutionNhwcDirect_NxM_Ptr GetConvolutionNhwcDirect_4xM(size_t M) - { - switch (M) - { - case 0: return NULL; - case 1: return ConvolutionNhwcDirect_4xM; - case 2: return ConvolutionNhwcDirect_4xM; - case 3: return ConvolutionNhwcDirect_4xM; - case 4: return ConvolutionNhwcDirect_4xM; - case 5: return ConvolutionNhwcDirect_4xM; - } - assert(0); - return NULL; - } -#else - template void ConvolutionNhwcDirect_4x2(const float* src0, const ConvParam32f& p, - const AlgParam& a, size_t dy, size_t dx, size_t srcC, size_t dstC, const float* weight0, const float32x4_t* bias, const float32x4_t* params, float* dst) - { - float32x4_t d00, d01, d02, d03, d10, d11, d12, d13, s0, w0, w1, w2, w3; - size_t srcH = p.srcH, srcW = p.srcW, dilY = p.dilationY, dilX = p.dilationX; - size_t dY = p.srcW * p.srcC, dX = p.srcC, dS = p.srcC * p.strideX, dW = p.srcC * F, dWz = p.kernelX * p.srcC * F, dD = p.dstC; - size_t sy = dy * p.strideY - p.padY, sx = dx * p.strideX - p.padX; - size_t kY = p.kernelY * p.dilationY, kX = p.kernelX * p.dilationX; - const float* weight1 = weight0 + a.stepW; - const float* weight2 = weight1 + a.stepW; - const float* weight3 = weight2 + a.stepW; - const float* src1 = src0 + 1 * dS; - if (dstC > 3 * F) - { - d00 = vdupq_n_f32(0.0f), d01 = vdupq_n_f32(0.0f), d02 = vdupq_n_f32(0.0f), d03 = vdupq_n_f32(0.0f); - d10 = vdupq_n_f32(0.0f), d11 = vdupq_n_f32(0.0f), d12 = vdupq_n_f32(0.0f), d13 = vdupq_n_f32(0.0f); - for (size_t ky = 0; ky < kY; ky += dilY) - { - if (sy + ky < srcH) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - assert(sx + kx < srcW && sx + kx + 2 <= srcW); - size_t offs = beg + kx * dX, end = offs + srcC, offw = 0; - for (; offs < end; ++offs, offw += F) - { - w0 = Load(weight0 + offw); - w1 = Load(weight1 + offw); - w2 = Load(weight2 + offw); - w3 = Load(weight3 + offw); - s0 = vdupq_n_f32(src0[offs]), d00 = vmlaq_f32(d00, s0, w0), d01 = vmlaq_f32(d01, s0, w1), d02 = vmlaq_f32(d02, s0, w2), d03 = vmlaq_f32(d03, s0, w3); - s0 = vdupq_n_f32(src1[offs]), d10 = vmlaq_f32(d10, s0, w0), d11 = vmlaq_f32(d11, s0, w1), d12 = vmlaq_f32(d12, s0, w2), d13 = vmlaq_f32(d13, s0, w3); - } - weight0 += dW, weight1 += dW, weight2 += dW, weight3 += dW; - } - } - else - weight0 += dWz, weight1 += dWz, weight2 += dWz, weight3 += dWz; - } - if (dstC == 4 * F) - { - Save4(dst, d00, d01, d02, d03, bias, params), dst += dD; - Save4(dst, d10, d11, d12, d13, bias, params), dst += dD; - } - else - { - dstC -= 3 * F; - Save4(dst, d00, d01, d02, d03, bias, params, dstC), dst += dD; - Save4(dst, d10, d11, d12, d13, bias, params, dstC), dst += dD; - } - } - else if (dstC > 2 * F) - { - d00 = 
vdupq_n_f32(0.0f), d01 = vdupq_n_f32(0.0f), d02 = vdupq_n_f32(0.0f); - d10 = vdupq_n_f32(0.0f), d11 = vdupq_n_f32(0.0f), d12 = vdupq_n_f32(0.0f); - for (size_t ky = 0; ky < kY; ky += dilY) - { - if (sy + ky < srcH) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - assert(sx + kx < srcW && sx + kx + 2 <= srcW); - size_t offs = beg + kx * dX, end = offs + srcC, offw = 0; - for (; offs < end; ++offs, offw += F) - { - w0 = Load(weight0 + offw); - w1 = Load(weight1 + offw); - w2 = Load(weight2 + offw); - s0 = vdupq_n_f32(src0[offs]), d00 = vmlaq_f32(d00, s0, w0), d01 = vmlaq_f32(d01, s0, w1), d02 = vmlaq_f32(d02, s0, w2); - s0 = vdupq_n_f32(src1[offs]), d10 = vmlaq_f32(d10, s0, w0), d11 = vmlaq_f32(d11, s0, w1), d12 = vmlaq_f32(d12, s0, w2); - } - weight0 += dW, weight1 += dW, weight2 += dW; - } - } - else - weight0 += dWz, weight1 += dWz, weight2 += dWz; - } - if (dstC == 3 * F) - { - Save3(dst, d00, d01, d02, bias, params), dst += dD; - Save3(dst, d10, d11, d12, bias, params), dst += dD; - } - else - { - dstC -= 2 * F; - Save3(dst, d00, d01, d02, bias, params, dstC), dst += dD; - Save3(dst, d10, d11, d12, bias, params, dstC), dst += dD; - } - } - else if (dstC > F) - { - d00 = vdupq_n_f32(0.0f), d01 = vdupq_n_f32(0.0f); - d10 = vdupq_n_f32(0.0f), d11 = vdupq_n_f32(0.0f); - for (size_t ky = 0; ky < kY; ky += dilY) - { - if (sy + ky < srcH) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - assert(sx + kx < srcW && sx + kx + 2 <= srcW); - size_t offs = beg + kx * dX, end = offs + srcC, offw = 0; - for (; offs < end; ++offs, offw += F) - { - w0 = Load(weight0 + offw); - w1 = Load(weight1 + offw); - s0 = vdupq_n_f32(src0[offs]), d00 = vmlaq_f32(d00, s0, w0), d01 = vmlaq_f32(d01, s0, w1); - s0 = vdupq_n_f32(src1[offs]), d10 = vmlaq_f32(d10, s0, w0), d11 = vmlaq_f32(d11, s0, w1); - } - weight0 += dW, weight1 += dW; - } - } - else - weight0 += dWz, weight1 += dWz; - } - if (dstC == 2 * F) - { - Save2(dst, d00, d01, bias, params), dst += dD; - Save2(dst, d10, d11, bias, params), dst += dD; - } - else - { - dstC -= 1 * F; - Save2(dst, d00, d01, bias, params, dstC), dst += dD; - Save2(dst, d10, d11, bias, params, dstC), dst += dD; - } - } - else - { - d00 = vdupq_n_f32(0.0f); - d10 = vdupq_n_f32(0.0f); - for (size_t ky = 0; ky < kY; ky += dilY) - { - if (sy + ky < srcH) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - assert(sx + kx < srcW && sx + kx + 2 <= srcW); - size_t offs = beg + kx * dX, end = offs + srcC, offw = 0; - for (; offs < end; ++offs, offw += F) - { - w0 = Load(weight0 + offw); - s0 = vdupq_n_f32(src0[offs]), d00 = vmlaq_f32(d00, s0, w0); - s0 = vdupq_n_f32(src1[offs]), d10 = vmlaq_f32(d10, s0, w0); - } - weight0 += dW; - } - } - else - weight0 += dWz; - } - if (dstC == F) - { - Save1(dst, d00, bias, params), dst += dD; - Save1(dst, d10, bias, params), dst += dD; - } - else - { - Save1(dst, d00, bias, params, dstC), dst += dD; - Save1(dst, d10, bias, params, dstC), dst += dD; - } - } - } - - template void ConvolutionNhwcDirect_4xM(const float* src0, const ConvParam32f& p, - const AlgParam& a, size_t dy, size_t dx, size_t srcC, size_t dstC, const float* weight0, const float32x4_t* bias, const float32x4_t* params, float* dst) - { - float32x4_t d00, d01, d02, d03, d10, d11, d12, d13, s0, w0, w1, w2, w3; - size_t srcH = p.srcH, srcW = p.srcW, dilY = p.dilationY, dilX = p.dilationX; - size_t dY = p.srcW * p.srcC, dX = p.srcC, dS = p.srcC * p.strideX, dW = 
p.srcC * F, dWz = p.kernelX * p.srcC * F, dD = p.dstC; - size_t sy = dy * p.strideY - p.padY, sx = dx * p.strideX - p.padX; - size_t kY = p.kernelY * p.dilationY, kX = p.kernelX * p.dilationX; - const float* weight1 = weight0 + a.stepW; - const float* weight2 = weight1 + a.stepW; - const float* weight3 = weight2 + a.stepW; - const float* src1 = src0 + 1 * dS; - if (dstC > 3 * F) - { - if (M > 0) d00 = vdupq_n_f32(0.0f), d01 = vdupq_n_f32(0.0f), d02 = vdupq_n_f32(0.0f), d03 = vdupq_n_f32(0.0f); - if (M > 1) d10 = vdupq_n_f32(0.0f), d11 = vdupq_n_f32(0.0f), d12 = vdupq_n_f32(0.0f), d13 = vdupq_n_f32(0.0f); - for (size_t ky = 0; ky < kY; ky += dilY) - { - if (sy + ky < srcH) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - assert(sx + kx < srcW && sx + kx + M <= srcW); - size_t offs = beg + kx * dX, end = offs + srcC, offw = 0; - for (; offs < end; ++offs, offw += F) - { - w0 = Load(weight0 + offw); - w1 = Load(weight1 + offw); - w2 = Load(weight2 + offw); - w3 = Load(weight3 + offw); - if (M > 0) s0 = vdupq_n_f32(src0[offs]), d00 = vmlaq_f32(d00, s0, w0), d01 = vmlaq_f32(d01, s0, w1), d02 = vmlaq_f32(d02, s0, w2), d03 = vmlaq_f32(d03, s0, w3); - if (M > 1) s0 = vdupq_n_f32(src1[offs]), d10 = vmlaq_f32(d10, s0, w0), d11 = vmlaq_f32(d11, s0, w1), d12 = vmlaq_f32(d12, s0, w2), d13 = vmlaq_f32(d13, s0, w3); - } - weight0 += dW, weight1 += dW, weight2 += dW, weight3 += dW; - } - } - else - weight0 += dWz, weight1 += dWz, weight2 += dWz, weight3 += dWz; - } - if (dstC == 4 * F) - { - if (M > 0) Save4(dst, d00, d01, d02, d03, bias, params), dst += dD; - if (M > 1) Save4(dst, d10, d11, d12, d13, bias, params), dst += dD; - } - else - { - dstC -= 3 * F; - if (M > 0) Save4(dst, d00, d01, d02, d03, bias, params, dstC), dst += dD; - if (M > 1) Save4(dst, d10, d11, d12, d13, bias, params, dstC), dst += dD; - } - } - else if (dstC > 2 * F) - { - if (M > 0) d00 = vdupq_n_f32(0.0f), d01 = vdupq_n_f32(0.0f), d02 = vdupq_n_f32(0.0f); - if (M > 1) d10 = vdupq_n_f32(0.0f), d11 = vdupq_n_f32(0.0f), d12 = vdupq_n_f32(0.0f); - for (size_t ky = 0; ky < kY; ky += dilY) - { - if (sy + ky < srcH) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - assert(sx + kx < srcW && sx + kx + M <= srcW); - size_t offs = beg + kx * dX, end = offs + srcC, offw = 0; - for (; offs < end; ++offs, offw += F) - { - w0 = Load(weight0 + offw); - w1 = Load(weight1 + offw); - w2 = Load(weight2 + offw); - if (M > 0) s0 = vdupq_n_f32(src0[offs]), d00 = vmlaq_f32(d00, s0, w0), d01 = vmlaq_f32(d01, s0, w1), d02 = vmlaq_f32(d02, s0, w2); - if (M > 1) s0 = vdupq_n_f32(src1[offs]), d10 = vmlaq_f32(d10, s0, w0), d11 = vmlaq_f32(d11, s0, w1), d12 = vmlaq_f32(d12, s0, w2); - } - weight0 += dW, weight1 += dW, weight2 += dW; - } - } - else - weight0 += dWz, weight1 += dWz, weight2 += dWz; - } - if (dstC == 3 * F) - { - if (M > 0) Save3(dst, d00, d01, d02, bias, params), dst += dD; - if (M > 1) Save3(dst, d10, d11, d12, bias, params), dst += dD; - } - else - { - dstC -= 2 * F; - if (M > 0) Save3(dst, d00, d01, d02, bias, params, dstC), dst += dD; - if (M > 1) Save3(dst, d10, d11, d12, bias, params, dstC), dst += dD; - } - } - else if (dstC > F) - { - if (M > 0) d00 = vdupq_n_f32(0.0f), d01 = vdupq_n_f32(0.0f); - if (M > 1) d10 = vdupq_n_f32(0.0f), d11 = vdupq_n_f32(0.0f); - for (size_t ky = 0; ky < kY; ky += dilY) - { - if (sy + ky < srcH) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - assert(sx + kx < srcW && sx + kx + 
M <= srcW); - size_t offs = beg + kx * dX, end = offs + srcC, offw = 0; - for (; offs < end; ++offs, offw += F) - { - w0 = Load(weight0 + offw); - w1 = Load(weight1 + offw); - if (M > 0) s0 = vdupq_n_f32(src0[offs]), d00 = vmlaq_f32(d00, s0, w0), d01 = vmlaq_f32(d01, s0, w1); - if (M > 1) s0 = vdupq_n_f32(src1[offs]), d10 = vmlaq_f32(d10, s0, w0), d11 = vmlaq_f32(d11, s0, w1); - } - weight0 += dW, weight1 += dW; - } - } - else - weight0 += dWz, weight1 += dWz; - } - if (dstC == DF) - { - if (M > 0) Save2(dst, d00, d01, bias, params), dst += dD; - if (M > 1) Save2(dst, d10, d11, bias, params), dst += dD; - } - else - { - dstC -= F; - if (M > 0) Save2(dst, d00, d01, bias, params, dstC), dst += dD; - if (M > 1) Save2(dst, d10, d11, bias, params, dstC), dst += dD; - } - } - else - { - if (M > 0) d00 = vdupq_n_f32(0.0f); - if (M > 1) d10 = vdupq_n_f32(0.0f); - for (size_t ky = 0; ky < kY; ky += dilY) - { - if (sy + ky < srcH) - { - size_t beg = (sy + ky) * dY + sx * dX; - for (size_t kx = 0; kx < kX; kx += dilX) - { - assert(sx + kx < srcW && sx + kx + M <= srcW); - size_t offs = beg + kx * dX, end = offs + srcC, offw = 0; - for (; offs < end; ++offs, offw += F) - { - w0 = Load(weight0 + offw); - if (M > 0) s0 = vdupq_n_f32(src0[offs]), d00 = vmlaq_f32(d00, s0, w0); - if (M > 1) s0 = vdupq_n_f32(src1[offs]), d10 = vmlaq_f32(d10, s0, w0); - } - weight0 += dW; - } - } - else - weight0 += dWz; - } - if (dstC == F) - { - if (M > 0) Save1(dst, d00, bias, params), dst += dD; - if (M > 1) Save1(dst, d10, bias, params), dst += dD; - } - else - { - if (M > 0) Save1(dst, d00, bias, params, dstC), dst += dD; - if (M > 1) Save1(dst, d10, bias, params, dstC), dst += dD; - } - } - } - - template ConvolutionNhwcDirect_NxM_Ptr GetConvolutionNhwcDirect_4xM(size_t M) - { - switch (M) - { - case 0: return NULL; - case 1: return ConvolutionNhwcDirect_4xM; - } - assert(0); - return NULL; - } -#endif - - template void ConvolutionNhwcDirect_4(const float* src, const ConvParam32f& p, const AlgParam& a, - size_t dstC, size_t yBeg, size_t yEnd, size_t srcC, const float* weight, const float* bias, const float* params, float* dst) - { - size_t noseH = p.NoseH(), noseW = p.NoseW(), bodyH = p.BodyH(), bodyW = p.BodyW(); - ConvolutionNhwcDirect_NxM_Ptr convolutionNhwcDirect_4x1 = ConvolutionNhwcDirect_4x1; -#if defined(SIMD_ARM64_ENABLE) - size_t n = 6, bodyWn = AlignLoAny(bodyW - noseW, n) + noseW, m = bodyW - bodyWn; - ConvolutionNhwcDirect_NxM_Ptr convolutionNhwcDirect_4xN = ConvolutionNhwcDirect_4x6; -#else - size_t n = 2, bodyWn = AlignLoAny(bodyW - noseW, n) + noseW, m = bodyW - bodyWn; - ConvolutionNhwcDirect_NxM_Ptr convolutionNhwcDirect_4xN = ConvolutionNhwcDirect_4x2; -#endif - ConvolutionNhwcDirect_NxM_Ptr convolutionNhwcDirect_4xM = GetConvolutionNhwcDirect_4xM(m); - size_t tailH = p.dstH, tailW = p.dstW; - size_t kY = p.kernelY - noseH, kX = p.kernelX - noseW, kH = bodyH + p.kernelY - 1, kW = bodyW + p.kernelX - 1; - - float32x4_t _params[4], _bias[4]; - _params[0] = vdupq_n_f32(params[0]); - if (type == ::SimdConvolutionActivationRestrictRange || type == ::SimdConvolutionActivationHswish) - _params[1] = vdupq_n_f32(params[1]); - - for (size_t dc = 0; dc < dstC; dc += a.microD) - { - size_t dC = Simd::Min(a.microD, dstC - dc); - if (dC > 0 * F) _bias[0] = Load(bias + dc + 0 * F); - if (dC > 1 * F) _bias[1] = Load(bias + dc + 1 * F); - if (dC > 2 * F) _bias[2] = Load(bias + dc + 2 * F); - if (dC > 3 * F) _bias[3] = Load(bias + dc + 3 * F); - if (type == ::SimdConvolutionActivationPrelu) - { - if (dC > 0 * F) 
_params[0] = Load(params + dc + 0 * F); - if (dC > 1 * F) _params[1] = Load(params + dc + 1 * F); - if (dC > 2 * F) _params[2] = Load(params + dc + 2 * F); - if (dC > 3 * F) _params[3] = Load(params + dc + 3 * F); - } - float* d = dst + dc + yBeg * p.dstW * p.dstC; - size_t dy = yBeg; - for (; dy < noseH && dy < yEnd; dy++) - { - size_t dx = 0; - for (; dx < noseW; dx++, d += p.dstC) - convolutionNhwcDirect_4x1(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d); - for (; dx < bodyWn; dx += n, d += p.dstC * n) - convolutionNhwcDirect_4xN(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d); - for (; dx < bodyW; dx += m, d += p.dstC * m) - convolutionNhwcDirect_4xM(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d); - for (; dx < tailW; dx++, d += p.dstC) - convolutionNhwcDirect_4x1(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d); - } - for (; dy < bodyH && dy < yEnd; dy++) - { - size_t dx = 0; - for (; dx < noseW; dx++, d += p.dstC) - convolutionNhwcDirect_4x1(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d); - for (; dx < bodyWn; dx += n, d += p.dstC * n) - convolutionNhwcDirect_4xN(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d); - for (; dx < bodyW; dx += m, d += p.dstC * m) - convolutionNhwcDirect_4xM(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d); - for (; dx < tailW; dx++, d += p.dstC) - convolutionNhwcDirect_4x1(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d); - } - for (; dy < tailH && dy < yEnd; dy++) - { - size_t dx = 0; - for (; dx < noseW; dx++, d += p.dstC) - convolutionNhwcDirect_4x1(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d); - for (; dx < bodyWn; dx += n, d += p.dstC * n) - convolutionNhwcDirect_4xN(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d); - for (; dx < bodyW; dx += m, d += p.dstC * m) - convolutionNhwcDirect_4xM(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d); - for (; dx < tailW; dx++, d += p.dstC) - convolutionNhwcDirect_4x1(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, d); - } - weight += p.kernelY * p.kernelX * p.srcC * a.microD; - } - } - - //--------------------------------------------------------------------- - -#if defined(SIMD_ARM64_ENABLE) - template void ConvolutionNhwcDirect1x1_4x6(const float* src0, const ConvParam32f& p, - const AlgParam& a, size_t srcC, size_t dstC, const float* weight0, const float32x4_t* bias, const float32x4_t* params, float* dst) - { - float32x4_t d00, d01, d02, d03, d10, d11, d12, d13, d20, d21, d22, d23, d30, d31, d32, d33, d40, d41, d42, d43, d50, d51, d52, d53, s0, w0, w1, w2, w3; - size_t dS = p.srcC, dD = p.dstC; - const float* weight1 = weight0 + a.stepW; - const float* weight2 = weight1 + a.stepW; - const float* weight3 = weight2 + a.stepW; - const float* src1 = src0 + 1 * dS; - const float* src2 = src0 + 2 * dS; - const float* src3 = src0 + 3 * dS; - const float* src4 = src0 + 4 * dS; - const float* src5 = src0 + 5 * dS; - if (dstC > 3 * F) - { - d00 = vdupq_n_f32(0.0f), d01 = vdupq_n_f32(0.0f), d02 = vdupq_n_f32(0.0f), d03 = vdupq_n_f32(0.0f); - d10 = vdupq_n_f32(0.0f), d11 = vdupq_n_f32(0.0f), d12 = vdupq_n_f32(0.0f), d13 = vdupq_n_f32(0.0f); - d20 = vdupq_n_f32(0.0f), d21 = vdupq_n_f32(0.0f), d22 = vdupq_n_f32(0.0f), d23 = vdupq_n_f32(0.0f); - d30 = vdupq_n_f32(0.0f), d31 = vdupq_n_f32(0.0f), d32 = vdupq_n_f32(0.0f), d33 = vdupq_n_f32(0.0f); - d40 = vdupq_n_f32(0.0f), d41 = vdupq_n_f32(0.0f), d42 = vdupq_n_f32(0.0f), d43 = vdupq_n_f32(0.0f); - d50 = vdupq_n_f32(0.0f), d51 = vdupq_n_f32(0.0f), d52 = vdupq_n_f32(0.0f), d53 = 
vdupq_n_f32(0.0f); - for (size_t offs = 0, offw = 0; offs < srcC; ++offs, offw += F) - { - w0 = Load(weight0 + offw); - w1 = Load(weight1 + offw); - w2 = Load(weight2 + offw); - w3 = Load(weight3 + offw); - s0 = vdupq_n_f32(src0[offs]), d00 = vmlaq_f32(d00, s0, w0), d01 = vmlaq_f32(d01, s0, w1), d02 = vmlaq_f32(d02, s0, w2), d03 = vmlaq_f32(d03, s0, w3); - s0 = vdupq_n_f32(src1[offs]), d10 = vmlaq_f32(d10, s0, w0), d11 = vmlaq_f32(d11, s0, w1), d12 = vmlaq_f32(d12, s0, w2), d13 = vmlaq_f32(d13, s0, w3); - s0 = vdupq_n_f32(src2[offs]), d20 = vmlaq_f32(d20, s0, w0), d21 = vmlaq_f32(d21, s0, w1), d22 = vmlaq_f32(d22, s0, w2), d23 = vmlaq_f32(d23, s0, w3); - s0 = vdupq_n_f32(src3[offs]), d30 = vmlaq_f32(d30, s0, w0), d31 = vmlaq_f32(d31, s0, w1), d32 = vmlaq_f32(d32, s0, w2), d33 = vmlaq_f32(d33, s0, w3); - s0 = vdupq_n_f32(src4[offs]), d40 = vmlaq_f32(d40, s0, w0), d41 = vmlaq_f32(d41, s0, w1), d42 = vmlaq_f32(d42, s0, w2), d43 = vmlaq_f32(d43, s0, w3); - s0 = vdupq_n_f32(src5[offs]), d50 = vmlaq_f32(d50, s0, w0), d51 = vmlaq_f32(d51, s0, w1), d52 = vmlaq_f32(d52, s0, w2), d53 = vmlaq_f32(d53, s0, w3); - } - if (dstC == 4 * F) - { - Save4(dst, d00, d01, d02, d03, bias, params), dst += dD; - Save4(dst, d10, d11, d12, d13, bias, params), dst += dD; - Save4(dst, d20, d21, d22, d23, bias, params), dst += dD; - Save4(dst, d30, d31, d32, d33, bias, params), dst += dD; - Save4(dst, d40, d41, d42, d43, bias, params), dst += dD; - Save4(dst, d50, d51, d52, d53, bias, params), dst += dD; - } - else - { - dstC -= 3 * F; - Save4(dst, d00, d01, d02, d03, bias, params, dstC), dst += dD; - Save4(dst, d10, d11, d12, d13, bias, params, dstC), dst += dD; - Save4(dst, d20, d21, d22, d23, bias, params, dstC), dst += dD; - Save4(dst, d30, d31, d32, d33, bias, params, dstC), dst += dD; - Save4(dst, d40, d41, d42, d43, bias, params, dstC), dst += dD; - Save4(dst, d50, d51, d52, d53, bias, params, dstC), dst += dD; - } - } - else if (dstC > 2 * F) - { - d00 = vdupq_n_f32(0.0f), d01 = vdupq_n_f32(0.0f), d02 = vdupq_n_f32(0.0f); - d10 = vdupq_n_f32(0.0f), d11 = vdupq_n_f32(0.0f), d12 = vdupq_n_f32(0.0f); - d20 = vdupq_n_f32(0.0f), d21 = vdupq_n_f32(0.0f), d22 = vdupq_n_f32(0.0f); - d30 = vdupq_n_f32(0.0f), d31 = vdupq_n_f32(0.0f), d32 = vdupq_n_f32(0.0f); - d40 = vdupq_n_f32(0.0f), d41 = vdupq_n_f32(0.0f), d42 = vdupq_n_f32(0.0f); - d50 = vdupq_n_f32(0.0f), d51 = vdupq_n_f32(0.0f), d52 = vdupq_n_f32(0.0f); - for (size_t offs = 0, offw = 0; offs < srcC; ++offs, offw += F) - { - w0 = Load(weight0 + offw); - w1 = Load(weight1 + offw); - w2 = Load(weight2 + offw); - s0 = vdupq_n_f32(src0[offs]), d00 = vmlaq_f32(d00, s0, w0), d01 = vmlaq_f32(d01, s0, w1), d02 = vmlaq_f32(d02, s0, w2); - s0 = vdupq_n_f32(src1[offs]), d10 = vmlaq_f32(d10, s0, w0), d11 = vmlaq_f32(d11, s0, w1), d12 = vmlaq_f32(d12, s0, w2); - s0 = vdupq_n_f32(src2[offs]), d20 = vmlaq_f32(d20, s0, w0), d21 = vmlaq_f32(d21, s0, w1), d22 = vmlaq_f32(d22, s0, w2); - s0 = vdupq_n_f32(src3[offs]), d30 = vmlaq_f32(d30, s0, w0), d31 = vmlaq_f32(d31, s0, w1), d32 = vmlaq_f32(d32, s0, w2); - s0 = vdupq_n_f32(src4[offs]), d40 = vmlaq_f32(d40, s0, w0), d41 = vmlaq_f32(d41, s0, w1), d42 = vmlaq_f32(d42, s0, w2); - s0 = vdupq_n_f32(src5[offs]), d50 = vmlaq_f32(d50, s0, w0), d51 = vmlaq_f32(d51, s0, w1), d52 = vmlaq_f32(d52, s0, w2); - } - if (dstC == 3 * F) - { - Save3(dst, d00, d01, d02, bias, params), dst += dD; - Save3(dst, d10, d11, d12, bias, params), dst += dD; - Save3(dst, d20, d21, d22, bias, params), dst += dD; - Save3(dst, d30, d31, d32, bias, params), dst += dD; 
- Save3(dst, d40, d41, d42, bias, params), dst += dD; - Save3(dst, d50, d51, d52, bias, params), dst += dD; - } - else - { - dstC -= 2 * F; - Save3(dst, d00, d01, d02, bias, params, dstC), dst += dD; - Save3(dst, d10, d11, d12, bias, params, dstC), dst += dD; - Save3(dst, d20, d21, d22, bias, params, dstC), dst += dD; - Save3(dst, d30, d31, d32, bias, params, dstC), dst += dD; - Save3(dst, d40, d41, d42, bias, params, dstC), dst += dD; - Save3(dst, d50, d51, d52, bias, params, dstC), dst += dD; - } - } - else if (dstC > F) - { - d00 = vdupq_n_f32(0.0f), d01 = vdupq_n_f32(0.0f); - d10 = vdupq_n_f32(0.0f), d11 = vdupq_n_f32(0.0f); - d20 = vdupq_n_f32(0.0f), d21 = vdupq_n_f32(0.0f); - d30 = vdupq_n_f32(0.0f), d31 = vdupq_n_f32(0.0f); - d40 = vdupq_n_f32(0.0f), d41 = vdupq_n_f32(0.0f); - d50 = vdupq_n_f32(0.0f), d51 = vdupq_n_f32(0.0f); - for (size_t offs = 0, offw = 0; offs < srcC; ++offs, offw += F) - { - w0 = Load(weight0 + offw); - w1 = Load(weight1 + offw); - s0 = vdupq_n_f32(src0[offs]), d00 = vmlaq_f32(d00, s0, w0), d01 = vmlaq_f32(d01, s0, w1); - s0 = vdupq_n_f32(src1[offs]), d10 = vmlaq_f32(d10, s0, w0), d11 = vmlaq_f32(d11, s0, w1); - s0 = vdupq_n_f32(src2[offs]), d20 = vmlaq_f32(d20, s0, w0), d21 = vmlaq_f32(d21, s0, w1); - s0 = vdupq_n_f32(src3[offs]), d30 = vmlaq_f32(d30, s0, w0), d31 = vmlaq_f32(d31, s0, w1); - s0 = vdupq_n_f32(src4[offs]), d40 = vmlaq_f32(d40, s0, w0), d41 = vmlaq_f32(d41, s0, w1); - s0 = vdupq_n_f32(src5[offs]), d50 = vmlaq_f32(d50, s0, w0), d51 = vmlaq_f32(d51, s0, w1); - } - if (dstC == 2 * F) - { - Save2(dst, d00, d01, bias, params), dst += dD; - Save2(dst, d10, d11, bias, params), dst += dD; - Save2(dst, d20, d21, bias, params), dst += dD; - Save2(dst, d30, d31, bias, params), dst += dD; - Save2(dst, d40, d41, bias, params), dst += dD; - Save2(dst, d50, d51, bias, params), dst += dD; - } - else - { - dstC -= 1 * F; - Save2(dst, d00, d01, bias, params, dstC), dst += dD; - Save2(dst, d10, d11, bias, params, dstC), dst += dD; - Save2(dst, d20, d21, bias, params, dstC), dst += dD; - Save2(dst, d30, d31, bias, params, dstC), dst += dD; - Save2(dst, d40, d41, bias, params, dstC), dst += dD; - Save2(dst, d50, d51, bias, params, dstC), dst += dD; - } - } - else - { - d00 = vdupq_n_f32(0.0f); - d10 = vdupq_n_f32(0.0f); - d20 = vdupq_n_f32(0.0f); - d30 = vdupq_n_f32(0.0f); - d40 = vdupq_n_f32(0.0f); - d50 = vdupq_n_f32(0.0f); - for (size_t offs = 0, offw = 0; offs < srcC; ++offs, offw += F) - { - w0 = Load(weight0 + offw); - s0 = vdupq_n_f32(src0[offs]), d00 = vmlaq_f32(d00, s0, w0); - s0 = vdupq_n_f32(src1[offs]), d10 = vmlaq_f32(d10, s0, w0); - s0 = vdupq_n_f32(src2[offs]), d20 = vmlaq_f32(d20, s0, w0); - s0 = vdupq_n_f32(src3[offs]), d30 = vmlaq_f32(d30, s0, w0); - s0 = vdupq_n_f32(src4[offs]), d40 = vmlaq_f32(d40, s0, w0); - s0 = vdupq_n_f32(src5[offs]), d50 = vmlaq_f32(d50, s0, w0); - } - if (dstC == F) - { - Save1(dst, d00, bias, params), dst += dD; - Save1(dst, d10, bias, params), dst += dD; - Save1(dst, d20, bias, params), dst += dD; - Save1(dst, d30, bias, params), dst += dD; - Save1(dst, d40, bias, params), dst += dD; - Save1(dst, d50, bias, params), dst += dD; - } - else - { - Save1(dst, d00, bias, params, dstC), dst += dD; - Save1(dst, d10, bias, params, dstC), dst += dD; - Save1(dst, d20, bias, params, dstC), dst += dD; - Save1(dst, d30, bias, params, dstC), dst += dD; - Save1(dst, d40, bias, params, dstC), dst += dD; - Save1(dst, d50, bias, params, dstC), dst += dD; - } - } - } - - template void ConvolutionNhwcDirect1x1_4xM(const float* src0, const 
ConvParam32f& p, - const AlgParam& a, size_t srcC, size_t dstC, const float* weight0, const float32x4_t* bias, const float32x4_t* params, float* dst) - { - float32x4_t d00, d01, d02, d03, d10, d11, d12, d13, d20, d21, d22, d23, d30, d31, d32, d33, d40, d41, d42, d43, d50, d51, d52, d53, s0, w0, w1, w2, w3; - size_t dS = p.srcC, dD = p.dstC; - const float* weight1 = weight0 + a.stepW; - const float* weight2 = weight1 + a.stepW; - const float* weight3 = weight2 + a.stepW; - const float* src1 = src0 + 1 * dS; - const float* src2 = src0 + 2 * dS; - const float* src3 = src0 + 3 * dS; - const float* src4 = src0 + 4 * dS; - const float* src5 = src0 + 5 * dS; - if (dstC > 3 * F) - { - if (M > 0) d00 = vdupq_n_f32(0.0f), d01 = vdupq_n_f32(0.0f), d02 = vdupq_n_f32(0.0f), d03 = vdupq_n_f32(0.0f); - if (M > 1) d10 = vdupq_n_f32(0.0f), d11 = vdupq_n_f32(0.0f), d12 = vdupq_n_f32(0.0f), d13 = vdupq_n_f32(0.0f); - if (M > 2) d20 = vdupq_n_f32(0.0f), d21 = vdupq_n_f32(0.0f), d22 = vdupq_n_f32(0.0f), d23 = vdupq_n_f32(0.0f); - if (M > 3) d30 = vdupq_n_f32(0.0f), d31 = vdupq_n_f32(0.0f), d32 = vdupq_n_f32(0.0f), d33 = vdupq_n_f32(0.0f); - if (M > 4) d40 = vdupq_n_f32(0.0f), d41 = vdupq_n_f32(0.0f), d42 = vdupq_n_f32(0.0f), d43 = vdupq_n_f32(0.0f); - if (M > 5) d50 = vdupq_n_f32(0.0f), d51 = vdupq_n_f32(0.0f), d52 = vdupq_n_f32(0.0f), d53 = vdupq_n_f32(0.0f); - for (size_t offs = 0, offw = 0; offs < srcC; ++offs, offw += F) - { - w0 = Load(weight0 + offw); - w1 = Load(weight1 + offw); - w2 = Load(weight2 + offw); - w3 = Load(weight3 + offw); - if (M > 0) s0 = vdupq_n_f32(src0[offs]), d00 = vmlaq_f32(d00, s0, w0), d01 = vmlaq_f32(d01, s0, w1), d02 = vmlaq_f32(d02, s0, w2), d03 = vmlaq_f32(d03, s0, w3); - if (M > 1) s0 = vdupq_n_f32(src1[offs]), d10 = vmlaq_f32(d10, s0, w0), d11 = vmlaq_f32(d11, s0, w1), d12 = vmlaq_f32(d12, s0, w2), d13 = vmlaq_f32(d13, s0, w3); - if (M > 2) s0 = vdupq_n_f32(src2[offs]), d20 = vmlaq_f32(d20, s0, w0), d21 = vmlaq_f32(d21, s0, w1), d22 = vmlaq_f32(d22, s0, w2), d23 = vmlaq_f32(d23, s0, w3); - if (M > 3) s0 = vdupq_n_f32(src3[offs]), d30 = vmlaq_f32(d30, s0, w0), d31 = vmlaq_f32(d31, s0, w1), d32 = vmlaq_f32(d32, s0, w2), d33 = vmlaq_f32(d33, s0, w3); - if (M > 4) s0 = vdupq_n_f32(src4[offs]), d40 = vmlaq_f32(d40, s0, w0), d41 = vmlaq_f32(d41, s0, w1), d42 = vmlaq_f32(d42, s0, w2), d43 = vmlaq_f32(d43, s0, w3); - if (M > 5) s0 = vdupq_n_f32(src5[offs]), d50 = vmlaq_f32(d50, s0, w0), d51 = vmlaq_f32(d51, s0, w1), d52 = vmlaq_f32(d52, s0, w2), d53 = vmlaq_f32(d53, s0, w3); - } - if (dstC == 4 * F) - { - if (M > 0) Save4(dst, d00, d01, d02, d03, bias, params), dst += dD; - if (M > 1) Save4(dst, d10, d11, d12, d13, bias, params), dst += dD; - if (M > 2) Save4(dst, d20, d21, d22, d23, bias, params), dst += dD; - if (M > 3) Save4(dst, d30, d31, d32, d33, bias, params), dst += dD; - if (M > 4) Save4(dst, d40, d41, d42, d43, bias, params), dst += dD; - if (M > 5) Save4(dst, d50, d51, d52, d53, bias, params), dst += dD; - } - else - { - dstC -= 3 * F; - if (M > 0) Save4(dst, d00, d01, d02, d03, bias, params, dstC), dst += dD; - if (M > 1) Save4(dst, d10, d11, d12, d13, bias, params, dstC), dst += dD; - if (M > 2) Save4(dst, d20, d21, d22, d23, bias, params, dstC), dst += dD; - if (M > 3) Save4(dst, d30, d31, d32, d33, bias, params, dstC), dst += dD; - if (M > 4) Save4(dst, d40, d41, d42, d43, bias, params, dstC), dst += dD; - if (M > 5) Save4(dst, d50, d51, d52, d53, bias, params, dstC), dst += dD; - } - } - else if (dstC > 2 * F) - { - if (M > 0) d00 = vdupq_n_f32(0.0f), d01 = 
vdupq_n_f32(0.0f), d02 = vdupq_n_f32(0.0f); - if (M > 1) d10 = vdupq_n_f32(0.0f), d11 = vdupq_n_f32(0.0f), d12 = vdupq_n_f32(0.0f); - if (M > 2) d20 = vdupq_n_f32(0.0f), d21 = vdupq_n_f32(0.0f), d22 = vdupq_n_f32(0.0f); - if (M > 3) d30 = vdupq_n_f32(0.0f), d31 = vdupq_n_f32(0.0f), d32 = vdupq_n_f32(0.0f); - if (M > 4) d40 = vdupq_n_f32(0.0f), d41 = vdupq_n_f32(0.0f), d42 = vdupq_n_f32(0.0f); - if (M > 5) d50 = vdupq_n_f32(0.0f), d51 = vdupq_n_f32(0.0f), d52 = vdupq_n_f32(0.0f); - for (size_t offs = 0, offw = 0; offs < srcC; ++offs, offw += F) - { - w0 = Load(weight0 + offw); - w1 = Load(weight1 + offw); - w2 = Load(weight2 + offw); - if (M > 0) s0 = vdupq_n_f32(src0[offs]), d00 = vmlaq_f32(d00, s0, w0), d01 = vmlaq_f32(d01, s0, w1), d02 = vmlaq_f32(d02, s0, w2); - if (M > 1) s0 = vdupq_n_f32(src1[offs]), d10 = vmlaq_f32(d10, s0, w0), d11 = vmlaq_f32(d11, s0, w1), d12 = vmlaq_f32(d12, s0, w2); - if (M > 2) s0 = vdupq_n_f32(src2[offs]), d20 = vmlaq_f32(d20, s0, w0), d21 = vmlaq_f32(d21, s0, w1), d22 = vmlaq_f32(d22, s0, w2); - if (M > 3) s0 = vdupq_n_f32(src3[offs]), d30 = vmlaq_f32(d30, s0, w0), d31 = vmlaq_f32(d31, s0, w1), d32 = vmlaq_f32(d32, s0, w2); - if (M > 4) s0 = vdupq_n_f32(src4[offs]), d40 = vmlaq_f32(d40, s0, w0), d41 = vmlaq_f32(d41, s0, w1), d42 = vmlaq_f32(d42, s0, w2); - if (M > 5) s0 = vdupq_n_f32(src5[offs]), d50 = vmlaq_f32(d50, s0, w0), d51 = vmlaq_f32(d51, s0, w1), d52 = vmlaq_f32(d52, s0, w2); - } - if (dstC == 3 * F) - { - if (M > 0) Save3(dst, d00, d01, d02, bias, params), dst += dD; - if (M > 1) Save3(dst, d10, d11, d12, bias, params), dst += dD; - if (M > 2) Save3(dst, d20, d21, d22, bias, params), dst += dD; - if (M > 3) Save3(dst, d30, d31, d32, bias, params), dst += dD; - if (M > 4) Save3(dst, d40, d41, d42, bias, params), dst += dD; - if (M > 5) Save3(dst, d50, d51, d52, bias, params), dst += dD; - } - else - { - dstC -= 2 * F; - if (M > 0) Save3(dst, d00, d01, d02, bias, params, dstC), dst += dD; - if (M > 1) Save3(dst, d10, d11, d12, bias, params, dstC), dst += dD; - if (M > 2) Save3(dst, d20, d21, d22, bias, params, dstC), dst += dD; - if (M > 3) Save3(dst, d30, d31, d32, bias, params, dstC), dst += dD; - if (M > 4) Save3(dst, d40, d41, d42, bias, params, dstC), dst += dD; - if (M > 5) Save3(dst, d50, d51, d52, bias, params, dstC), dst += dD; - } - } - else if (dstC > F) - { - if (M > 0) d00 = vdupq_n_f32(0.0f), d01 = vdupq_n_f32(0.0f); - if (M > 1) d10 = vdupq_n_f32(0.0f), d11 = vdupq_n_f32(0.0f); - if (M > 2) d20 = vdupq_n_f32(0.0f), d21 = vdupq_n_f32(0.0f); - if (M > 3) d30 = vdupq_n_f32(0.0f), d31 = vdupq_n_f32(0.0f); - if (M > 4) d40 = vdupq_n_f32(0.0f), d41 = vdupq_n_f32(0.0f); - if (M > 5) d50 = vdupq_n_f32(0.0f), d51 = vdupq_n_f32(0.0f); - for (size_t offs = 0, offw = 0; offs < srcC; ++offs, offw += F) - { - w0 = Load(weight0 + offw); - w1 = Load(weight1 + offw); - if (M > 0) s0 = vdupq_n_f32(src0[offs]), d00 = vmlaq_f32(d00, s0, w0), d01 = vmlaq_f32(d01, s0, w1); - if (M > 1) s0 = vdupq_n_f32(src1[offs]), d10 = vmlaq_f32(d10, s0, w0), d11 = vmlaq_f32(d11, s0, w1); - if (M > 2) s0 = vdupq_n_f32(src2[offs]), d20 = vmlaq_f32(d20, s0, w0), d21 = vmlaq_f32(d21, s0, w1); - if (M > 3) s0 = vdupq_n_f32(src3[offs]), d30 = vmlaq_f32(d30, s0, w0), d31 = vmlaq_f32(d31, s0, w1); - if (M > 4) s0 = vdupq_n_f32(src4[offs]), d40 = vmlaq_f32(d40, s0, w0), d41 = vmlaq_f32(d41, s0, w1); - if (M > 5) s0 = vdupq_n_f32(src5[offs]), d50 = vmlaq_f32(d50, s0, w0), d51 = vmlaq_f32(d51, s0, w1); - } - if (dstC == DF) - { - if (M > 0) Save2(dst, d00, d01, bias, params), dst += 
dD; - if (M > 1) Save2(dst, d10, d11, bias, params), dst += dD; - if (M > 2) Save2(dst, d20, d21, bias, params), dst += dD; - if (M > 3) Save2(dst, d30, d31, bias, params), dst += dD; - if (M > 4) Save2(dst, d40, d41, bias, params), dst += dD; - if (M > 5) Save2(dst, d50, d51, bias, params), dst += dD; - } - else - { - dstC -= F; - if (M > 0) Save2(dst, d00, d01, bias, params, dstC), dst += dD; - if (M > 1) Save2(dst, d10, d11, bias, params, dstC), dst += dD; - if (M > 2) Save2(dst, d20, d21, bias, params, dstC), dst += dD; - if (M > 3) Save2(dst, d30, d31, bias, params, dstC), dst += dD; - if (M > 4) Save2(dst, d40, d41, bias, params, dstC), dst += dD; - if (M > 5) Save2(dst, d50, d51, bias, params, dstC), dst += dD; - } - } - else - { - if (M > 0) d00 = vdupq_n_f32(0.0f); - if (M > 1) d10 = vdupq_n_f32(0.0f); - if (M > 2) d20 = vdupq_n_f32(0.0f); - if (M > 3) d30 = vdupq_n_f32(0.0f); - if (M > 4) d40 = vdupq_n_f32(0.0f); - if (M > 5) d50 = vdupq_n_f32(0.0f); - for (size_t offs = 0, offw = 0; offs < srcC; ++offs, offw += F) - { - w0 = Load(weight0 + offw); - if (M > 0) s0 = vdupq_n_f32(src0[offs]), d00 = vmlaq_f32(d00, s0, w0); - if (M > 1) s0 = vdupq_n_f32(src1[offs]), d10 = vmlaq_f32(d10, s0, w0); - if (M > 2) s0 = vdupq_n_f32(src2[offs]), d20 = vmlaq_f32(d20, s0, w0); - if (M > 3) s0 = vdupq_n_f32(src3[offs]), d30 = vmlaq_f32(d30, s0, w0); - if (M > 4) s0 = vdupq_n_f32(src4[offs]), d40 = vmlaq_f32(d40, s0, w0); - if (M > 5) s0 = vdupq_n_f32(src5[offs]), d50 = vmlaq_f32(d50, s0, w0); - } - if (dstC == F) - { - if (M > 0) Save1(dst, d00, bias, params), dst += dD; - if (M > 1) Save1(dst, d10, bias, params), dst += dD; - if (M > 2) Save1(dst, d20, bias, params), dst += dD; - if (M > 3) Save1(dst, d30, bias, params), dst += dD; - if (M > 4) Save1(dst, d40, bias, params), dst += dD; - if (M > 5) Save1(dst, d50, bias, params), dst += dD; - } - else - { - if (M > 0) Save1(dst, d00, bias, params, dstC), dst += dD; - if (M > 1) Save1(dst, d10, bias, params, dstC), dst += dD; - if (M > 2) Save1(dst, d20, bias, params, dstC), dst += dD; - if (M > 3) Save1(dst, d30, bias, params, dstC), dst += dD; - if (M > 4) Save1(dst, d40, bias, params, dstC), dst += dD; - if (M > 5) Save1(dst, d50, bias, params, dstC), dst += dD; - } - } - } - - template ConvolutionNhwcDirect1x1_NxM_Ptr GetConvolutionNhwcDirect1x1_4xM(size_t M) - { - switch (M) - { - case 0: return NULL; - case 1: return ConvolutionNhwcDirect1x1_4xM; - case 2: return ConvolutionNhwcDirect1x1_4xM; - case 3: return ConvolutionNhwcDirect1x1_4xM; - case 4: return ConvolutionNhwcDirect1x1_4xM; - case 5: return ConvolutionNhwcDirect1x1_4xM; - } - assert(0); - return NULL; - } -#else - template void ConvolutionNhwcDirect1x1_4x2(const float* src0, const ConvParam32f& p, - const AlgParam& a, size_t srcC, size_t dstC, const float* weight0, const float32x4_t* bias, const float32x4_t* params, float* dst) - { - float32x4_t d00, d01, d02, d03, d10, d11, d12, d13, s0, w0, w1, w2, w3; - size_t dS = p.srcC, dD = p.dstC; - const float* weight1 = weight0 + a.stepW; - const float* weight2 = weight1 + a.stepW; - const float* weight3 = weight2 + a.stepW; - const float* src1 = src0 + 1 * dS; - if (dstC > 3 * F) - { - d00 = vdupq_n_f32(0.0f), d01 = vdupq_n_f32(0.0f), d02 = vdupq_n_f32(0.0f), d03 = vdupq_n_f32(0.0f); - d10 = vdupq_n_f32(0.0f), d11 = vdupq_n_f32(0.0f), d12 = vdupq_n_f32(0.0f), d13 = vdupq_n_f32(0.0f); - for (size_t offs = 0, offw = 0; offs < srcC; ++offs, offw += F) - { - w0 = Load(weight0 + offw); - w1 = Load(weight1 + offw); - w2 = Load(weight2 + 
offw); - w3 = Load(weight3 + offw); - s0 = vdupq_n_f32(src0[offs]), d00 = vmlaq_f32(d00, s0, w0), d01 = vmlaq_f32(d01, s0, w1), d02 = vmlaq_f32(d02, s0, w2), d03 = vmlaq_f32(d03, s0, w3); - s0 = vdupq_n_f32(src1[offs]), d10 = vmlaq_f32(d10, s0, w0), d11 = vmlaq_f32(d11, s0, w1), d12 = vmlaq_f32(d12, s0, w2), d13 = vmlaq_f32(d13, s0, w3); - } - if (dstC == 4 * F) - { - Save4(dst, d00, d01, d02, d03, bias, params), dst += dD; - Save4(dst, d10, d11, d12, d13, bias, params), dst += dD; - } - else - { - dstC -= 3 * F; - Save4(dst, d00, d01, d02, d03, bias, params, dstC), dst += dD; - Save4(dst, d10, d11, d12, d13, bias, params, dstC), dst += dD; - } - } - else if (dstC > 2 * F) - { - d00 = vdupq_n_f32(0.0f), d01 = vdupq_n_f32(0.0f), d02 = vdupq_n_f32(0.0f); - d10 = vdupq_n_f32(0.0f), d11 = vdupq_n_f32(0.0f), d12 = vdupq_n_f32(0.0f); - for (size_t offs = 0, offw = 0; offs < srcC; ++offs, offw += F) - { - w0 = Load(weight0 + offw); - w1 = Load(weight1 + offw); - w2 = Load(weight2 + offw); - s0 = vdupq_n_f32(src0[offs]), d00 = vmlaq_f32(d00, s0, w0), d01 = vmlaq_f32(d01, s0, w1), d02 = vmlaq_f32(d02, s0, w2); - s0 = vdupq_n_f32(src1[offs]), d10 = vmlaq_f32(d10, s0, w0), d11 = vmlaq_f32(d11, s0, w1), d12 = vmlaq_f32(d12, s0, w2); - } - if (dstC == 3 * F) - { - Save3(dst, d00, d01, d02, bias, params), dst += dD; - Save3(dst, d10, d11, d12, bias, params), dst += dD; - } - else - { - dstC -= 2 * F; - Save3(dst, d00, d01, d02, bias, params, dstC), dst += dD; - Save3(dst, d10, d11, d12, bias, params, dstC), dst += dD; - } - } - else if (dstC > F) - { - d00 = vdupq_n_f32(0.0f), d01 = vdupq_n_f32(0.0f); - d10 = vdupq_n_f32(0.0f), d11 = vdupq_n_f32(0.0f); - for (size_t offs = 0, offw = 0; offs < srcC; ++offs, offw += F) - { - w0 = Load(weight0 + offw); - w1 = Load(weight1 + offw); - s0 = vdupq_n_f32(src0[offs]), d00 = vmlaq_f32(d00, s0, w0), d01 = vmlaq_f32(d01, s0, w1); - s0 = vdupq_n_f32(src1[offs]), d10 = vmlaq_f32(d10, s0, w0), d11 = vmlaq_f32(d11, s0, w1); - } - if (dstC == 2 * F) - { - Save2(dst, d00, d01, bias, params), dst += dD; - Save2(dst, d10, d11, bias, params), dst += dD; - } - else - { - dstC -= 1 * F; - Save2(dst, d00, d01, bias, params, dstC), dst += dD; - Save2(dst, d10, d11, bias, params, dstC), dst += dD; - } - } - else - { - d00 = vdupq_n_f32(0.0f); - d10 = vdupq_n_f32(0.0f); - for (size_t offs = 0, offw = 0; offs < srcC; ++offs, offw += F) - { - w0 = Load(weight0 + offw); - s0 = vdupq_n_f32(src0[offs]), d00 = vmlaq_f32(d00, s0, w0); - s0 = vdupq_n_f32(src1[offs]), d10 = vmlaq_f32(d10, s0, w0); - } - if (dstC == F) - { - Save1(dst, d00, bias, params), dst += dD; - Save1(dst, d10, bias, params), dst += dD; - } - else - { - Save1(dst, d00, bias, params, dstC), dst += dD; - Save1(dst, d10, bias, params, dstC), dst += dD; - } - } - } - - template void ConvolutionNhwcDirect1x1_4xM(const float* src0, const ConvParam32f& p, - const AlgParam& a, size_t srcC, size_t dstC, const float* weight0, const float32x4_t* bias, const float32x4_t* params, float* dst) - { - float32x4_t d00, d01, d02, d03, d10, d11, d12, d13, s0, w0, w1, w2, w3; - size_t dS = p.srcC, dD = p.dstC; - const float* weight1 = weight0 + a.stepW; - const float* weight2 = weight1 + a.stepW; - const float* weight3 = weight2 + a.stepW; - const float* src1 = src0 + 1 * dS; - if (dstC > 3 * F) - { - if (M > 0) d00 = vdupq_n_f32(0.0f), d01 = vdupq_n_f32(0.0f), d02 = vdupq_n_f32(0.0f), d03 = vdupq_n_f32(0.0f); - if (M > 1) d10 = vdupq_n_f32(0.0f), d11 = vdupq_n_f32(0.0f), d12 = vdupq_n_f32(0.0f), d13 = vdupq_n_f32(0.0f); - for (size_t 
offs = 0, offw = 0; offs < srcC; ++offs, offw += F) - { - w0 = Load(weight0 + offw); - w1 = Load(weight1 + offw); - w2 = Load(weight2 + offw); - w3 = Load(weight3 + offw); - if (M > 0) s0 = vdupq_n_f32(src0[offs]), d00 = vmlaq_f32(d00, s0, w0), d01 = vmlaq_f32(d01, s0, w1), d02 = vmlaq_f32(d02, s0, w2), d03 = vmlaq_f32(d03, s0, w3); - if (M > 1) s0 = vdupq_n_f32(src1[offs]), d10 = vmlaq_f32(d10, s0, w0), d11 = vmlaq_f32(d11, s0, w1), d12 = vmlaq_f32(d12, s0, w2), d13 = vmlaq_f32(d13, s0, w3); - } - if (dstC == 4 * F) - { - if (M > 0) Save4(dst, d00, d01, d02, d03, bias, params), dst += dD; - if (M > 1) Save4(dst, d10, d11, d12, d13, bias, params), dst += dD; - } - else - { - dstC -= 3 * F; - if (M > 0) Save4(dst, d00, d01, d02, d03, bias, params, dstC), dst += dD; - if (M > 1) Save4(dst, d10, d11, d12, d13, bias, params, dstC), dst += dD; - } - } - else if (dstC > 2 * F) - { - if (M > 0) d00 = vdupq_n_f32(0.0f), d01 = vdupq_n_f32(0.0f), d02 = vdupq_n_f32(0.0f); - if (M > 1) d10 = vdupq_n_f32(0.0f), d11 = vdupq_n_f32(0.0f), d12 = vdupq_n_f32(0.0f); - for (size_t offs = 0, offw = 0; offs < srcC; ++offs, offw += F) - { - w0 = Load(weight0 + offw); - w1 = Load(weight1 + offw); - w2 = Load(weight2 + offw); - if (M > 0) s0 = vdupq_n_f32(src0[offs]), d00 = vmlaq_f32(d00, s0, w0), d01 = vmlaq_f32(d01, s0, w1), d02 = vmlaq_f32(d02, s0, w2); - if (M > 1) s0 = vdupq_n_f32(src1[offs]), d10 = vmlaq_f32(d10, s0, w0), d11 = vmlaq_f32(d11, s0, w1), d12 = vmlaq_f32(d12, s0, w2); - } - if (dstC == 3 * F) - { - if (M > 0) Save3(dst, d00, d01, d02, bias, params), dst += dD; - if (M > 1) Save3(dst, d10, d11, d12, bias, params), dst += dD; - } - else - { - dstC -= 2 * F; - if (M > 0) Save3(dst, d00, d01, d02, bias, params, dstC), dst += dD; - if (M > 1) Save3(dst, d10, d11, d12, bias, params, dstC), dst += dD; - } - } - else if (dstC > F) - { - if (M > 0) d00 = vdupq_n_f32(0.0f), d01 = vdupq_n_f32(0.0f); - if (M > 1) d10 = vdupq_n_f32(0.0f), d11 = vdupq_n_f32(0.0f); - for (size_t offs = 0, offw = 0; offs < srcC; ++offs, offw += F) - { - w0 = Load(weight0 + offw); - w1 = Load(weight1 + offw); - if (M > 0) s0 = vdupq_n_f32(src0[offs]), d00 = vmlaq_f32(d00, s0, w0), d01 = vmlaq_f32(d01, s0, w1); - if (M > 1) s0 = vdupq_n_f32(src1[offs]), d10 = vmlaq_f32(d10, s0, w0), d11 = vmlaq_f32(d11, s0, w1); - } - if (dstC == DF) - { - if (M > 0) Save2(dst, d00, d01, bias, params), dst += dD; - if (M > 1) Save2(dst, d10, d11, bias, params), dst += dD; - } - else - { - dstC -= F; - if (M > 0) Save2(dst, d00, d01, bias, params, dstC), dst += dD; - if (M > 1) Save2(dst, d10, d11, bias, params, dstC), dst += dD; - } - } - else - { - if (M > 0) d00 = vdupq_n_f32(0.0f); - if (M > 1) d10 = vdupq_n_f32(0.0f); - for (size_t offs = 0, offw = 0; offs < srcC; ++offs, offw += F) - { - w0 = Load(weight0 + offw); - if (M > 0) s0 = vdupq_n_f32(src0[offs]), d00 = vmlaq_f32(d00, s0, w0); - if (M > 1) s0 = vdupq_n_f32(src1[offs]), d10 = vmlaq_f32(d10, s0, w0); - } - if (dstC == F) - { - if (M > 0) Save1(dst, d00, bias, params), dst += dD; - if (M > 1) Save1(dst, d10, bias, params), dst += dD; - } - else - { - if (M > 0) Save1(dst, d00, bias, params, dstC), dst += dD; - if (M > 1) Save1(dst, d10, bias, params, dstC), dst += dD; - } - } - } - - template ConvolutionNhwcDirect1x1_NxM_Ptr GetConvolutionNhwcDirect1x1_4xM(size_t M) - { - switch (M) - { - case 0: return NULL; - case 1: return ConvolutionNhwcDirect1x1_4xM; - } - assert(0); - return NULL; - } -#endif - - template void ConvolutionNhwcDirect1x1_4(const float* src, const ConvParam32f& p, 
const AlgParam& a, - size_t dstC, size_t yBeg, size_t yEnd, size_t srcC, const float* weight, const float* bias, const float* params, float* dst) - { -#if defined(SIMD_ARM64_ENABLE) - size_t n = 6, n1 = (yEnd - yBeg) * p.dstW, nn = AlignLoAny(n1, n), m = n1 - nn; - ConvolutionNhwcDirect1x1_NxM_Ptr convolutionNhwcDirect1x1_4xN = ConvolutionNhwcDirect1x1_4x6; -#else - size_t n = 2, n1 = (yEnd - yBeg) * p.dstW, nn = AlignLoAny(n1, n), m = n1 - nn; - ConvolutionNhwcDirect1x1_NxM_Ptr convolutionNhwcDirect1x1_4xN = ConvolutionNhwcDirect1x1_4x2; -#endif - ConvolutionNhwcDirect1x1_NxM_Ptr convolutionNhwcDirect1x1_4xM = GetConvolutionNhwcDirect1x1_4xM(m); - - float32x4_t _params[4], _bias[4]; - _params[0] = vdupq_n_f32(params[0]); - if (type == ::SimdConvolutionActivationRestrictRange || type == ::SimdConvolutionActivationHswish) - _params[1] = vdupq_n_f32(params[1]); - - for (size_t dc = 0; dc < dstC; dc += a.microD) - { - size_t dC = Simd::Min(a.microD, dstC - dc); - if (dC > 0 * F) _bias[0] = Load(bias + dc + 0 * F); - if (dC > 1 * F) _bias[1] = Load(bias + dc + 1 * F); - if (dC > 2 * F) _bias[2] = Load(bias + dc + 2 * F); - if (dC > 3 * F) _bias[3] = Load(bias + dc + 3 * F); - if (type == ::SimdConvolutionActivationPrelu) - { - if (dC > 0 * F) _params[0] = Load(params + dc + 0 * F); - if (dC > 1 * F) _params[1] = Load(params + dc + 1 * F); - if (dC > 2 * F) _params[2] = Load(params + dc + 2 * F); - if (dC > 3 * F) _params[3] = Load(params + dc + 3 * F); - } - const float* ps = src + yBeg * p.srcW * p.srcC; - float* pd = dst + dc + yBeg * p.dstW * p.dstC; - size_t i = 0; - for (; i < nn; i += n, ps += n * p.srcC, pd += n * p.dstC) - convolutionNhwcDirect1x1_4xN(ps, p, a, srcC, dC, weight, _bias, _params, pd); - for (; i < n1; i += m, ps += m * p.srcC, pd += m * p.dstC) - convolutionNhwcDirect1x1_4xM(ps, p, a, srcC, dC, weight, _bias, _params, pd); - weight += p.srcC * a.microD; - } - } - - //--------------------------------------------------------------------- - - template static SIMD_INLINE void Set(const ConvParam32f& p, AlgParam& a) - { - a.convolutions[term] = p.Is1x1() ? ConvolutionNhwcDirect1x1_4 : ConvolutionNhwcDirect_4; - } - - template static SIMD_INLINE void Set(const ConvParam32f& p, AlgParam& a) - { - Set(p, a); - Set(p, a); - Set(p, a); - Set(p, a); - } - - bool SynetConvolution32fNhwcDirect::Set4r(const ConvParam32f& p, AlgParam& a) - { - assert(a.microD == 4 * F); - switch (p.activation) - { - case SimdConvolutionActivationIdentity: Set(p, a); break; - case SimdConvolutionActivationRelu: Set(p, a); break; - case SimdConvolutionActivationLeakyRelu: Set(p, a); break; - case SimdConvolutionActivationRestrictRange: Set(p, a); break; - case SimdConvolutionActivationPrelu: Set(p, a); break; - case SimdConvolutionActivationElu: Set(p, a); break; - case SimdConvolutionActivationHswish: Set(p, a); break; - default: assert(0); - } - return true; - } - } -#endif//SIMD_NEON_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdNeonSynetConvolution8i.cpp b/src/3rd/Simd/Simd/SimdNeonSynetConvolution8i.cpp deleted file mode 100644 index 270013e2..00000000 --- a/src/3rd/Simd/Simd/SimdNeonSynetConvolution8i.cpp +++ /dev/null @@ -1,938 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. 
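For reference, the int8 kernels in the deleted SimdNeonSynetConvolution8i.cpp below accumulate four u8*i8 products per 32-bit lane through Madd4. The overflow variant clamps each adjacent-pair sum to the int16 range before the final pairwise add, reproducing the saturation of the x86 pmaddubsw-style path when SimdSynetCompatibilityOverflow16i is requested (see the Set() dispatch on that flag further below); the non-overflow variant keeps full precision. A scalar sketch of the saturating sum (hypothetical helper, not part of the library):

    #include <algorithm>
    #include <climits>
    #include <cstdint>

    // clamp(u0*w0 + u1*w1) + clamp(u2*w2 + u3*w3), each pair sum saturated
    // to [SHRT_MIN, SHRT_MAX], mirroring the overflow == true Madd4 below.
    inline int32_t Madd4Saturated(const uint8_t u[4], const int8_t w[4])
    {
        int32_t lo = std::min(std::max(int32_t(u[0]) * w[0] + int32_t(u[1]) * w[1], int32_t(SHRT_MIN)), int32_t(SHRT_MAX));
        int32_t hi = std::min(std::max(int32_t(u[2]) * w[2] + int32_t(u[3]) * w[3], int32_t(SHRT_MIN)), int32_t(SHRT_MAX));
        return lo + hi;
    }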
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdSynetConvolution8i.h" -#include "Simd/SimdSynetConvolution8iCommon.h" -#include "Simd/SimdSynet.h" -#include "Simd/SimdMath.h" -#include "Simd/SimdBase.h" -#include "Simd/SimdCpu.h" -#include "Simd/SimdLog.h" -#include "Simd/SimdNeon.h" - -namespace Simd -{ -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - using AlgParam = SynetConvolution8iNhwcDirect::AlgParam; - using ConvolutionPtr = SynetConvolution8iNhwcDirect::ConvolutionPtr; - using Term8iType = Base::SynetConvolution8iNhwcDirect::Term8iType; - - SIMD_INLINE uint8x16_t Set4(const uint8_t* src) - { - return (uint8x16_t)vdupq_n_s32(*(int32_t*)src); - } - - template void Madd4(int32x4_t & i32, uint8x16_t u8, int8x16_t i8); - - template<> SIMD_INLINE void Madd4(int32x4_t& i32, uint8x16_t u8, int8x16_t i8) - { - int32x4_t lo = vmaxq_s32(vminq_s32(vpaddlq_s16(vmulq_s16(UnpackU8s<0>(u8), UnpackI8<0>(i8))), vdupq_n_s32(SHRT_MAX)), vdupq_n_s32(SHRT_MIN)); - int32x4_t hi = vmaxq_s32(vminq_s32(vpaddlq_s16(vmulq_s16(UnpackU8s<1>(u8), UnpackI8<1>(i8))), vdupq_n_s32(SHRT_MAX)), vdupq_n_s32(SHRT_MIN)); -#if defined(__aarch64__) - int32x4_t sum = vpaddq_s32(lo, hi); -#else - int32x4_t sum = vcombine_s32( - vpadd_s32(Half<0>(lo), Half<1>(lo)), - vpadd_s32(Half<0>(hi), Half<1>(hi))); -#endif - i32 = vaddq_s32(i32, sum); - } - - template<> SIMD_INLINE void Madd4(int32x4_t& i32, uint8x16_t u8, int8x16_t i8) - { - int32x4_t lo = vpaddlq_s16(vmulq_s16(UnpackU8s<0>(u8), UnpackI8<0>(i8))); - int32x4_t hi = vpaddlq_s16(vmulq_s16(UnpackU8s<1>(u8), UnpackI8<1>(i8))); -#if defined(__aarch64__) - int32x4_t sum = vpaddq_s32(lo, hi); -#else - int32x4_t sum = vcombine_s32( - vpadd_s32(Half<0>(lo), Half<1>(lo)), - vpadd_s32(Half<0>(hi), Half<1>(hi))); -#endif - i32 = vaddq_s32(i32, sum); - } - - inline void pdpbusd(int32x4_t& sum, uint8x16_t input, int8x16_t weight) - { - for (size_t i = 0; i < 4; ++i) - for (size_t j = 0; j < 4; ++j) - sum[i] += int32_t(input[i * 4 + j]) * int32_t(weight[i * 4 + j]); - } - - template void ConvolutionNhwcDirect_2x1(const uint8_t* src0, - const ConvParam8i& p, const AlgParam& a, size_t dy, size_t dx, size_t srcC, size_t dstC, const int8_t* weight0, - const int32x4_t* bias, const int32x4_t * params, const float32x4_t* scale, const float32x4_t* shift, int32_t* buf, uint8_t* dst) - { - int32x4_t d00, d01; - uint8x16_t s0; - int8x16_t w0, w1; - size_t dW = (DivHi(p.srcC, 4) - DivHi(srcC, 4)) * A, dY = p.srcW * p.srcC, dX = p.srcC, dS = p.srcC * p.strideX, dWz = 
DivHi(srcC, 4) * A; - const int8_t* weight1 = weight0 + p.kernelY * p.kernelX * DivHi(p.srcC, 4) * A; - int32x4_t norm = vdupq_n_s32(a.norm); - size_t sy = dy * p.strideY - p.padY; - size_t sx = dx * p.strideX - p.padX; - size_t kY = p.kernelY * p.dilationY; - size_t kX = p.kernelX * p.dilationX; - if (dstC > F) - { - d00 = vdupq_n_s32(0), d01 = vdupq_n_s32(0); - for (size_t ky = 0; ky < kY; ky += p.dilationY) - { - for (size_t kx = 0; kx < kX; kx += p.dilationX) - { - if (sy + ky < p.srcH && sx + kx < p.srcW) - { - size_t offs = (sy + ky) * dY + (sx + kx) * dX, end = offs + srcC; - for (; offs < end; offs += 4) - { - w0 = Load(weight0); - w1 = Load(weight1); - s0 = Set4(src0 + offs); - Madd4(d00, s0, w0); - Madd4(d01, s0, w1); - weight0 += A, weight1 += A; - } - } - else - { - if (a.zero) - { - s0 = (uint8x16_t)vdupq_n_s32(a.zero); - for (size_t offs = 0, end = srcC; offs < end; offs += 4) - { - w0 = Load(weight0); - w1 = Load(weight1); - Madd4(d00, s0, w0); - Madd4(d01, s0, w1); - weight0 += A, weight1 += A; - } - } - else - weight0 += dWz, weight1 += dWz; - } - weight0 += dW, weight1 += dW; - } - } - if (dstC == DF) - Save2(dst, buf, d00, d01, norm, bias, params, scale, shift); - else - Save2(dst, buf, d00, d01, norm, bias, params, scale, shift, dstC - F); - } - else - { - d00 = vdupq_n_s32(0); - for (size_t ky = 0; ky < kY; ky += p.dilationY) - { - for (size_t kx = 0; kx < kX; kx += p.dilationX) - { - if (sy + ky < p.srcH && sx + kx < p.srcW) - { - size_t offs = (sy + ky) * dY + (sx + kx) * dX, end = offs + srcC; - for (; offs < end; offs += 4) - { - w0 = Load(weight0); - s0 = Set4(src0 + offs); - Madd4(d00, s0, w0); - weight0 += A; - } - } - else - { - if (a.zero) - { - s0 = (uint8x16_t)vdupq_n_s32(a.zero); - for (size_t offs = 0, end = srcC; offs < end; offs += 4) - { - w0 = Load(weight0); - Madd4(d00, s0, w0); - weight0 += A; - } - } - else - weight0 += dWz; - } - weight0 += dW; - } - } - if (dstC == F) - Save1(dst, buf, d00, norm, bias, params, scale, shift); - else - Save1(dst, buf, d00, norm, bias, params, scale, shift, dstC); - } - } - - template void ConvolutionNhwcDirect_2x5(const uint8_t* src0, - const ConvParam8i& p, const AlgParam& a, size_t dy, size_t dx, size_t srcC, size_t dstC, const int8_t* weight0, - const int32x4_t* bias, const int32x4_t* params, const float32x4_t* scale, const float32x4_t* shift, int32_t* buf, uint8_t* dst) - { - int32x4_t d00, d01, d10, d11, d20, d21, d30, d31, d40, d41; - uint8x16_t s0; - int8x16_t w0, w1; - size_t dW = (DivHi(p.srcC, 4) - DivHi(srcC, 4)) * A, dY = p.srcW * p.srcC, dX = p.srcC, dS = p.srcC * p.strideX, dD = p.dstC * a.size, dB = p.dstC, dWz = (DivHi(srcC, 4) * A + dW) * p.kernelX; - const int8_t* weight1 = weight0 + p.kernelY * p.kernelX * DivHi(p.srcC, 4) * A; - const uint8_t* src1 = src0 + 1 * dS; - const uint8_t* src2 = src0 + 2 * dS; - const uint8_t* src3 = src0 + 3 * dS; - const uint8_t* src4 = src0 + 4 * dS; - int32x4_t norm = vdupq_n_s32(a.norm); - size_t sy = dy * p.strideY - p.padY; - size_t sx = dx * p.strideX - p.padX; - size_t kY = p.kernelY * p.dilationY; - size_t kX = p.kernelX * p.dilationX; - if (dstC > F) - { - d00 = vdupq_n_s32(0), d01 = vdupq_n_s32(0); - d10 = vdupq_n_s32(0), d11 = vdupq_n_s32(0); - d20 = vdupq_n_s32(0), d21 = vdupq_n_s32(0); - d30 = vdupq_n_s32(0), d31 = vdupq_n_s32(0); - d40 = vdupq_n_s32(0), d41 = vdupq_n_s32(0); - for (size_t ky = 0; ky < kY; ky += p.dilationY) - { - if (sy + ky < p.srcH) - { - for (size_t kx = 0; kx < kX; kx += p.dilationX) - { - assert(sx + kx < p.srcW && sx + kx + 5 <= 
p.srcW); - size_t offs = (sy + ky) * dY + (sx + kx) * dX, end = offs + srcC; - for (; offs < end; offs += 4) - { - w0 = Load(weight0); - w1 = Load(weight1); - s0 = Set4(src0 + offs); - Madd4(d00, s0, w0); - Madd4(d01, s0, w1); - s0 = Set4(src1 + offs); - Madd4(d10, s0, w0); - Madd4(d11, s0, w1); - s0 = Set4(src2 + offs); - Madd4(d20, s0, w0); - Madd4(d21, s0, w1); - s0 = Set4(src3 + offs); - Madd4(d30, s0, w0); - Madd4(d31, s0, w1); - s0 = Set4(src4 + offs); - Madd4(d40, s0, w0); - Madd4(d41, s0, w1); - weight0 += A, weight1 += A; - } - weight0 += dW, weight1 += dW; - } - } - else if (a.zero) - { - s0 = (uint8x16_t)vdupq_n_s32(a.zero); - for (size_t kx = 0; kx < kX; kx += p.dilationX) - { - for (size_t offs = 0, end = srcC; offs < end; offs += 4) - { - w0 = Load(weight0); - w1 = Load(weight1); - Madd4(d00, s0, w0); - Madd4(d01, s0, w1); - Madd4(d10, s0, w0); - Madd4(d11, s0, w1); - Madd4(d20, s0, w0); - Madd4(d21, s0, w1); - Madd4(d30, s0, w0); - Madd4(d31, s0, w1); - Madd4(d40, s0, w0); - Madd4(d41, s0, w1); - weight0 += A, weight1 += A; - } - weight0 += dW, weight1 += dW; - } - } - else - weight0 += dWz, weight1 += dWz; - } - if (dstC == DF) - { - Save2(dst, buf, d00, d01, norm, bias, params, scale, shift); - dst += dD, buf += dB; - Save2(dst, buf, d10, d11, norm, bias, params, scale, shift); - dst += dD, buf += dB; - Save2(dst, buf, d20, d21, norm, bias, params, scale, shift); - dst += dD, buf += dB; - Save2(dst, buf, d30, d31, norm, bias, params, scale, shift); - dst += dD, buf += dB; - Save2(dst, buf, d40, d41, norm, bias, params, scale, shift); - dst += dD, buf += dB; - } - else - { - Save2(dst, buf, d00, d01, norm, bias, params, scale, shift, dstC - F); - dst += dD, buf += dB; - Save2(dst, buf, d10, d11, norm, bias, params, scale, shift, dstC - F); - dst += dD, buf += dB; - Save2(dst, buf, d20, d21, norm, bias, params, scale, shift, dstC - F); - dst += dD, buf += dB; - Save2(dst, buf, d30, d31, norm, bias, params, scale, shift, dstC - F); - dst += dD, buf += dB; - Save2(dst, buf, d40, d41, norm, bias, params, scale, shift, dstC - F); - dst += dD, buf += dB; - } - } - else - { - d00 = vdupq_n_s32(0); - d10 = vdupq_n_s32(0); - d20 = vdupq_n_s32(0); - d30 = vdupq_n_s32(0); - d40 = vdupq_n_s32(0); - for (size_t ky = 0; ky < kY; ky += p.dilationY) - { - if (sy + ky < p.srcH) - { - for (size_t kx = 0; kx < kX; kx += p.dilationX) - { - assert(sx + kx < p.srcW && sx + kx + 5 <= p.srcW); - size_t offs = (sy + ky) * dY + (sx + kx) * dX, end = offs + srcC; - for (; offs < end; offs += 4) - { - w0 = Load(weight0); - s0 = Set4(src0 + offs); - Madd4(d00, s0, w0); - s0 = Set4(src1 + offs); - Madd4(d10, s0, w0); - s0 = Set4(src2 + offs); - Madd4(d20, s0, w0); - s0 = Set4(src3 + offs); - Madd4(d30, s0, w0); - s0 = Set4(src4 + offs); - Madd4(d40, s0, w0); - weight0 += A; - } - weight0 += dW; - } - } - else if (a.zero) - { - s0 = (uint8x16_t)vdupq_n_s32(a.zero); - for (size_t kx = 0; kx < kX; kx += p.dilationX) - { - for (size_t offs = 0, end = srcC; offs < end; offs += 4) - { - w0 = Load(weight0); - Madd4(d00, s0, w0); - Madd4(d10, s0, w0); - Madd4(d20, s0, w0); - Madd4(d30, s0, w0); - Madd4(d40, s0, w0); - weight0 += A; - } - weight0 += dW; - } - } - else - weight0 += dWz; - } - if (dstC == F) - { - Save1(dst, buf, d00, norm, bias, params, scale, shift); - dst += dD, buf += dB; - Save1(dst, buf, d10, norm, bias, params, scale, shift); - dst += dD, buf += dB; - Save1(dst, buf, d20, norm, bias, params, scale, shift); - dst += dD, buf += dB; - Save1(dst, buf, d30, norm, bias, params, scale, shift); - 
dst += dD, buf += dB; - Save1(dst, buf, d40, norm, bias, params, scale, shift); - dst += dD, buf += dB; - } - else - { - Save1(dst, buf, d00, norm, bias, params, scale, shift, dstC); - dst += dD, buf += dB; - Save1(dst, buf, d10, norm, bias, params, scale, shift, dstC); - dst += dD, buf += dB; - Save1(dst, buf, d20, norm, bias, params, scale, shift, dstC); - dst += dD, buf += dB; - Save1(dst, buf, d30, norm, bias, params, scale, shift, dstC); - dst += dD, buf += dB; - Save1(dst, buf, d40, norm, bias, params, scale, shift, dstC); - dst += dD, buf += dB; - } - } - } - - template void ConvolutionNhwcDirect_2xM(const uint8_t* src0, - const ConvParam8i& p, const AlgParam& a, size_t dy, size_t dx, size_t srcC, size_t dstC, const int8_t* weight0, - const int32x4_t* bias, const int32x4_t* params, const float32x4_t* scale, const float32x4_t* shift, int32_t* buf, uint8_t* dst) - { - int32x4_t d00, d01, d10, d11, d20, d21, d30, d31, d40, d41; - uint8x16_t s0; - int8x16_t w0, w1; - size_t dW = (DivHi(p.srcC, 4) - DivHi(srcC, 4)) * A, dY = p.srcW * p.srcC, dX = p.srcC, dS = p.srcC * p.strideX, dD = p.dstC * a.size, dB = p.dstC, dWz = (DivHi(srcC, 4) * A + dW) * p.kernelX; - const int8_t* weight1 = weight0 + p.kernelY * p.kernelX * DivHi(p.srcC, 4) * A; - const uint8_t* src1 = src0 + 1 * dS; - const uint8_t* src2 = src0 + 2 * dS; - const uint8_t* src3 = src0 + 3 * dS; - const uint8_t* src4 = src0 + 4 * dS; - int32x4_t norm = vdupq_n_s32(a.norm); - size_t sy = dy * p.strideY - p.padY; - size_t sx = dx * p.strideX - p.padX; - size_t kY = p.kernelY * p.dilationY; - size_t kX = p.kernelX * p.dilationX; - if (dstC > F) - { - if (M > 0) d00 = vdupq_n_s32(0), d01 = vdupq_n_s32(0); - if (M > 1) d10 = vdupq_n_s32(0), d11 = vdupq_n_s32(0); - if (M > 2) d20 = vdupq_n_s32(0), d21 = vdupq_n_s32(0); - if (M > 3) d30 = vdupq_n_s32(0), d31 = vdupq_n_s32(0); - if (M > 4) d40 = vdupq_n_s32(0), d41 = vdupq_n_s32(0); - for (size_t ky = 0; ky < kY; ky += p.dilationY) - { - if (sy + ky < p.srcH) - { - for (size_t kx = 0; kx < kX; kx += p.dilationX) - { - assert(sx + kx < p.srcW && sx + kx + M <= p.srcW); - size_t offs = (sy + ky) * dY + (sx + kx) * dX, end = offs + srcC; - for (; offs < end; offs += 4) - { - w0 = Load(weight0); - w1 = Load(weight1); - if (M > 0) s0 = Set4(src0 + offs), Madd4(d00, s0, w0), Madd4(d01, s0, w1); - if (M > 1) s0 = Set4(src1 + offs), Madd4(d10, s0, w0), Madd4(d11, s0, w1); - if (M > 2) s0 = Set4(src2 + offs), Madd4(d20, s0, w0), Madd4(d21, s0, w1); - if (M > 3) s0 = Set4(src3 + offs), Madd4(d30, s0, w0), Madd4(d31, s0, w1); - if (M > 4) s0 = Set4(src4 + offs), Madd4(d40, s0, w0), Madd4(d41, s0, w1); - weight0 += A, weight1 += A; - } - weight0 += dW, weight1 += dW; - } - } - else if (a.zero) - { - s0 = (uint8x16_t)vdupq_n_s32(a.zero); - for (size_t kx = 0; kx < kX; kx += p.dilationX) - { - for (size_t offs = 0, end = srcC; offs < end; offs += 4) - { - w0 = Load(weight0); - w1 = Load(weight1); - if (M > 0) Madd4(d00, s0, w0), Madd4(d01, s0, w1); - if (M > 1) Madd4(d10, s0, w0), Madd4(d11, s0, w1); - if (M > 2) Madd4(d20, s0, w0), Madd4(d21, s0, w1); - if (M > 3) Madd4(d30, s0, w0), Madd4(d31, s0, w1); - if (M > 4) Madd4(d40, s0, w0), Madd4(d41, s0, w1); - weight0 += A, weight1 += A; - } - weight0 += dW, weight1 += dW; - } - } - else - weight0 += dWz, weight1 += dWz; - } - if (dstC == DF) - { - if (M > 0) Save2(dst, buf, d00, d01, norm, bias, params, scale, shift), dst += dD, buf += dB; - if (M > 1) Save2(dst, buf, d10, d11, norm, bias, params, scale, shift), dst += dD, buf += dB; - if (M > 2) 
Save2(dst, buf, d20, d21, norm, bias, params, scale, shift), dst += dD, buf += dB; - if (M > 3) Save2(dst, buf, d30, d31, norm, bias, params, scale, shift), dst += dD, buf += dB; - if (M > 4) Save2(dst, buf, d40, d41, norm, bias, params, scale, shift), dst += dD, buf += dB; - } - else - { - if (M > 0) Save2(dst, buf, d00, d01, norm, bias, params, scale, shift, dstC - F), dst += dD, buf += dB; - if (M > 1) Save2(dst, buf, d10, d11, norm, bias, params, scale, shift, dstC - F), dst += dD, buf += dB; - if (M > 2) Save2(dst, buf, d20, d21, norm, bias, params, scale, shift, dstC - F), dst += dD, buf += dB; - if (M > 3) Save2(dst, buf, d30, d31, norm, bias, params, scale, shift, dstC - F), dst += dD, buf += dB; - if (M > 4) Save2(dst, buf, d40, d41, norm, bias, params, scale, shift, dstC - F), dst += dD, buf += dB; - } - } - else - { - if (M > 0) d00 = vdupq_n_s32(0); - if (M > 1) d10 = vdupq_n_s32(0); - if (M > 2) d20 = vdupq_n_s32(0); - if (M > 3) d30 = vdupq_n_s32(0); - if (M > 4) d40 = vdupq_n_s32(0); - for (size_t ky = 0; ky < kY; ky += p.dilationY) - { - if (sy + ky < p.srcH) - { - for (size_t kx = 0; kx < kX; kx += p.dilationX) - { - assert(sx + kx < p.srcW && sx + kx + M <= p.srcW); - size_t offs = (sy + ky) * dY + (sx + kx) * dX, end = offs + srcC; - for (; offs < end; offs += 4) - { - w0 = Load(weight0); - if (M > 0) s0 = Set4(src0 + offs), Madd4(d00, s0, w0); - if (M > 1) s0 = Set4(src1 + offs), Madd4(d10, s0, w0); - if (M > 2) s0 = Set4(src2 + offs), Madd4(d20, s0, w0); - if (M > 3) s0 = Set4(src3 + offs), Madd4(d30, s0, w0); - if (M > 4) s0 = Set4(src4 + offs), Madd4(d40, s0, w0); - weight0 += A; - } - weight0 += dW; - } - } - else if (a.zero) - { - s0 = (uint8x16_t)vdupq_n_s32(a.zero); - for (size_t kx = 0; kx < kX; kx += p.dilationX) - { - for (size_t offs = 0, end = srcC; offs < end; offs += 4) - { - w0 = Load(weight0); - if (M > 0) Madd4(d00, s0, w0); - if (M > 1) Madd4(d10, s0, w0); - if (M > 2) Madd4(d20, s0, w0); - if (M > 3) Madd4(d30, s0, w0); - if (M > 4) Madd4(d40, s0, w0); - weight0 += A; - } - weight0 += dW; - } - } - else - weight0 += dWz; - } - if (dstC == F) - { - if (M > 0) Save1(dst, buf, d00, norm, bias, params, scale, shift), dst += dD, buf += dB; - if (M > 1) Save1(dst, buf, d10, norm, bias, params, scale, shift), dst += dD, buf += dB; - if (M > 2) Save1(dst, buf, d20, norm, bias, params, scale, shift), dst += dD, buf += dB; - if (M > 3) Save1(dst, buf, d30, norm, bias, params, scale, shift), dst += dD, buf += dB; - if (M > 4) Save1(dst, buf, d40, norm, bias, params, scale, shift), dst += dD, buf += dB; - } - else - { - if (M > 0) Save1(dst, buf, d00, norm, bias, params, scale, shift, dstC), dst += dD, buf += dB; - if (M > 1) Save1(dst, buf, d10, norm, bias, params, scale, shift, dstC), dst += dD, buf += dB; - if (M > 2) Save1(dst, buf, d20, norm, bias, params, scale, shift, dstC), dst += dD, buf += dB; - if (M > 3) Save1(dst, buf, d30, norm, bias, params, scale, shift, dstC), dst += dD, buf += dB; - if (M > 4) Save1(dst, buf, d40, norm, bias, params, scale, shift, dstC), dst += dD, buf += dB; - } - } - } - - typedef void(*ConvolutionNhwcDirect_2xM_Ptr)(const uint8_t* src0, const ConvParam8i& p, const AlgParam& a, size_t dy, size_t dx, size_t srcC, size_t dstC, - const int8_t* weight0, const int32x4_t* bias, const int32x4_t* params, const float32x4_t* scale, const float32x4_t* shift, int32_t* buf, uint8_t* dst); - - template ConvolutionNhwcDirect_2xM_Ptr GetConvolutionNhwcDirect_2xM(size_t M) - { - switch (M) - { - case 0: return NULL; - case 1: return 
ConvolutionNhwcDirect_2xM; - case 2: return ConvolutionNhwcDirect_2xM; - case 3: return ConvolutionNhwcDirect_2xM; - case 4: return ConvolutionNhwcDirect_2xM; - } - assert(0); - return NULL; - } - - template void ConvolutionNhwcDirect_2(const uint8_t* src, - const ConvParam8i& p, const AlgParam& a, size_t dstC, size_t yBeg, size_t yEnd, size_t srcC, const int8_t* weight, - const int32_t* bias, const int32_t* params, const float* scale, const float* shift, int32_t* buf, uint8_t* dst) - { - size_t noseH = p.NoseH(), noseW = p.NoseW(), bodyH = p.BodyH(), bodyW = p.BodyW(); - size_t n = 5, bodyWn = AlignLoAny(bodyW - noseW, n) + noseW, m = bodyW - bodyWn; - ConvolutionNhwcDirect_2xM_Ptr convolutionNhwcDirect_2xM = GetConvolutionNhwcDirect_2xM(m); - size_t tailH = p.dstH, tailW = p.dstW; - size_t kY = p.kernelY - noseH, kX = p.kernelX - noseW, kH = bodyH + p.kernelY - 1, kW = bodyW + p.kernelX - 1; - int32x4_t _params[2], _bias[2]; - _params[0] = vdupq_n_s32(0); - if (type == ::SimdConvolutionActivationRestrictRange) - _params[1] = vdupq_n_s32(a.high); - float32x4_t _scale[2], _shift[2]; - - for (size_t dc = 0; dc < dstC; dc += DF) - { - size_t dC = Simd::Min(DF, dstC - dc); - _bias[0] = Load(bias + dc + 0); - _bias[1] = Load(bias + dc + F); - _scale[0] = Load(scale + dc + 0); - _scale[1] = Load(scale + dc + F); - _shift[0] = Load(shift + dc + 0); - _shift[1] = Load(shift + dc + F); - - uint8_t* d = dst + (dc + yBeg * p.dstW * p.dstC) * a.size; - int32_t* b = buf + dc + yBeg * p.dstW * p.dstC; - size_t dy = yBeg; - for (; dy < noseH && dy < yEnd; dy++) - { - size_t dx = 0; - for (; dx < noseW; dx++, b += p.dstC, d += p.dstC * a.size) - ConvolutionNhwcDirect_2x1(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, _scale, _shift, b, d); - for (; dx < bodyWn; dx += n, b += p.dstC * n, d += p.dstC * a.size * n) - ConvolutionNhwcDirect_2x5(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, _scale, _shift, b, d); - for (; dx < bodyW; dx += m, b += p.dstC * m, d += p.dstC * a.size * m) - convolutionNhwcDirect_2xM(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, _scale, _shift, b, d); - for (; dx < tailW; dx++, b += p.dstC, d += p.dstC * a.size) - ConvolutionNhwcDirect_2x1(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, _scale, _shift, b, d); - } - for (; dy < bodyH && dy < yEnd; dy++) - { - size_t dx = 0; - for (; dx < noseW; dx++, b += p.dstC, d += p.dstC * a.size) - ConvolutionNhwcDirect_2x1(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, _scale, _shift, b, d); - for (; dx < bodyWn; dx += n, b += p.dstC * n, d += p.dstC * a.size * n) - ConvolutionNhwcDirect_2x5(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, _scale, _shift, b, d); - for (; dx < bodyW; dx += m, b += p.dstC * m, d += p.dstC * a.size * m) - convolutionNhwcDirect_2xM(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, _scale, _shift, b, d); - for (; dx < tailW; dx++, b += p.dstC, d += p.dstC * a.size) - ConvolutionNhwcDirect_2x1(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, _scale, _shift, b, d); - } - for (; dy < tailH && dy < yEnd; dy++) - { - size_t dx = 0; - for (; dx < noseW; dx++, b += p.dstC, d += p.dstC * a.size) - ConvolutionNhwcDirect_2x1(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, _scale, _shift, b, d); - for (; dx < bodyWn; dx += n, b += p.dstC * n, d += p.dstC * a.size * n) - ConvolutionNhwcDirect_2x5(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, _scale, _shift, b, d); - for (; dx < bodyW; dx += m, b += p.dstC * m, d += p.dstC * a.size * m) - convolutionNhwcDirect_2xM(src, p, a, 
dy, dx, srcC, dC, weight, _bias, _params, _scale, _shift, b, d); - for (; dx < tailW; dx++, b += p.dstC, d += p.dstC * a.size) - ConvolutionNhwcDirect_2x1(src, p, a, dy, dx, srcC, dC, weight, _bias, _params, _scale, _shift, b, d); - } - weight += p.kernelY * p.kernelX * DivHi(p.srcC, 4) * DA; - } - } - - //--------------------------------------------------------------------- - - template void ConvolutionNhwcDirect1x1_2x5( - const uint8_t* src0, const ConvParam8i& p, const AlgParam& a, size_t srcC, size_t dstC, const int8_t* weight0, - const int32x4_t* bias, const int32x4_t* params, const float32x4_t* scale, const float32x4_t* shift, int32_t* buf, uint8_t* dst) - { - int32x4_t d00, d01, d10, d11, d20, d21, d30, d31, d40, d41; - uint8x16_t s0; - int8x16_t w0, w1; - size_t dS = p.srcC * p.strideX, dD = p.dstC * a.size, dB = p.dstC; - const int8_t* weight1 = weight0 + DivHi(p.srcC, 4) * A; - const uint8_t* src1 = src0 + 1 * dS; - const uint8_t* src2 = src0 + 2 * dS; - const uint8_t* src3 = src0 + 3 * dS; - const uint8_t* src4 = src0 + 4 * dS; - int32x4_t norm = vdupq_n_s32(a.norm); - if (dstC > F) - { - d00 = vdupq_n_s32(0), d01 = vdupq_n_s32(0); - d10 = vdupq_n_s32(0), d11 = vdupq_n_s32(0); - d20 = vdupq_n_s32(0), d21 = vdupq_n_s32(0); - d30 = vdupq_n_s32(0), d31 = vdupq_n_s32(0); - d40 = vdupq_n_s32(0), d41 = vdupq_n_s32(0); - for (size_t offs = 0; offs < srcC; offs += 4) - { - w0 = Load(weight0); - w1 = Load(weight1); - s0 = Set4(src0 + offs); - Madd4(d00, s0, w0); - Madd4(d01, s0, w1); - s0 = Set4(src1 + offs); - Madd4(d10, s0, w0); - Madd4(d11, s0, w1); - s0 = Set4(src2 + offs); - Madd4(d20, s0, w0); - Madd4(d21, s0, w1); - s0 = Set4(src3 + offs); - Madd4(d30, s0, w0); - Madd4(d31, s0, w1); - s0 = Set4(src4 + offs); - Madd4(d40, s0, w0); - Madd4(d41, s0, w1); - weight0 += A, weight1 += A; - } - if (dstC == DF) - { - Save2(dst, buf, d00, d01, norm, bias, params, scale, shift), dst += dD, buf += dB; - Save2(dst, buf, d10, d11, norm, bias, params, scale, shift), dst += dD, buf += dB; - Save2(dst, buf, d20, d21, norm, bias, params, scale, shift), dst += dD, buf += dB; - Save2(dst, buf, d30, d31, norm, bias, params, scale, shift), dst += dD, buf += dB; - Save2(dst, buf, d40, d41, norm, bias, params, scale, shift), dst += dD, buf += dB; - } - else - { - Save2(dst, buf, d00, d01, norm, bias, params, scale, shift, dstC - F), dst += dD, buf += dB; - Save2(dst, buf, d10, d11, norm, bias, params, scale, shift, dstC - F), dst += dD, buf += dB; - Save2(dst, buf, d20, d21, norm, bias, params, scale, shift, dstC - F), dst += dD, buf += dB; - Save2(dst, buf, d30, d31, norm, bias, params, scale, shift, dstC - F), dst += dD, buf += dB; - Save2(dst, buf, d40, d41, norm, bias, params, scale, shift, dstC - F), dst += dD, buf += dB; - } - } - else - { - d00 = vdupq_n_s32(0); - d10 = vdupq_n_s32(0); - d20 = vdupq_n_s32(0); - d30 = vdupq_n_s32(0); - d40 = vdupq_n_s32(0); - for (size_t offs = 0; offs < srcC; offs += 4) - { - w0 = Load(weight0); - s0 = Set4(src0 + offs); - Madd4(d00, s0, w0); - s0 = Set4(src1 + offs); - Madd4(d10, s0, w0); - s0 = Set4(src2 + offs); - Madd4(d20, s0, w0); - s0 = Set4(src3 + offs); - Madd4(d30, s0, w0); - s0 = Set4(src4 + offs); - Madd4(d40, s0, w0); - weight0 += A; - } - if (dstC == F) - { - Save1(dst, buf, d00, norm, bias, params, scale, shift), dst += dD, buf += dB; - Save1(dst, buf, d10, norm, bias, params, scale, shift), dst += dD, buf += dB; - Save1(dst, buf, d20, norm, bias, params, scale, shift), dst += dD, buf += dB; - Save1(dst, buf, d30, norm, bias, params, scale, shift), 
dst += dD, buf += dB; - Save1(dst, buf, d40, norm, bias, params, scale, shift), dst += dD, buf += dB; - } - else - { - Save1(dst, buf, d00, norm, bias, params, scale, shift, dstC), dst += dD, buf += dB; - Save1(dst, buf, d10, norm, bias, params, scale, shift, dstC), dst += dD, buf += dB; - Save1(dst, buf, d20, norm, bias, params, scale, shift, dstC), dst += dD, buf += dB; - Save1(dst, buf, d30, norm, bias, params, scale, shift, dstC), dst += dD, buf += dB; - Save1(dst, buf, d40, norm, bias, params, scale, shift, dstC), dst += dD, buf += dB; - } - } - } - - template void ConvolutionNhwcDirect1x1_2xM( - const uint8_t* src0, const ConvParam8i& p, const AlgParam& a, size_t srcC, size_t dstC, const int8_t* weight0, - const int32x4_t* bias, const int32x4_t* params, const float32x4_t* scale, const float32x4_t* shift, int32_t* buf, uint8_t* dst) - { - int32x4_t d00, d01, d10, d11, d20, d21, d30, d31, d40, d41; - uint8x16_t s0; - int8x16_t w0, w1; - size_t dS = p.srcC * p.strideX, dD = p.dstC * a.size, dB = p.dstC; - const int8_t* weight1 = weight0 + DivHi(p.srcC, 4) * A; - const uint8_t* src1 = src0 + 1 * dS; - const uint8_t* src2 = src0 + 2 * dS; - const uint8_t* src3 = src0 + 3 * dS; - const uint8_t* src4 = src0 + 4 * dS; - int32x4_t norm = vdupq_n_s32(a.norm); - if (dstC > F) - { - if (M > 0) d00 = vdupq_n_s32(0), d01 = vdupq_n_s32(0); - if (M > 1) d10 = vdupq_n_s32(0), d11 = vdupq_n_s32(0); - if (M > 2) d20 = vdupq_n_s32(0), d21 = vdupq_n_s32(0); - if (M > 3) d30 = vdupq_n_s32(0), d31 = vdupq_n_s32(0); - if (M > 4) d40 = vdupq_n_s32(0), d41 = vdupq_n_s32(0); - for (size_t offs = 0; offs < srcC; offs += 4) - { - w0 = Load(weight0); - w1 = Load(weight1); - if (M > 0) s0 = Set4(src0 + offs), Madd4(d00, s0, w0), Madd4(d01, s0, w1); - if (M > 1) s0 = Set4(src1 + offs), Madd4(d10, s0, w0), Madd4(d11, s0, w1); - if (M > 2) s0 = Set4(src2 + offs), Madd4(d20, s0, w0), Madd4(d21, s0, w1); - if (M > 3) s0 = Set4(src3 + offs), Madd4(d30, s0, w0), Madd4(d31, s0, w1); - if (M > 4) s0 = Set4(src4 + offs), Madd4(d40, s0, w0), Madd4(d41, s0, w1); - weight0 += A, weight1 += A; - } - if (dstC == DF) - { - if (M > 0) Save2(dst, buf, d00, d01, norm, bias, params, scale, shift), dst += dD, buf += dB; - if (M > 1) Save2(dst, buf, d10, d11, norm, bias, params, scale, shift), dst += dD, buf += dB; - if (M > 2) Save2(dst, buf, d20, d21, norm, bias, params, scale, shift), dst += dD, buf += dB; - if (M > 3) Save2(dst, buf, d30, d31, norm, bias, params, scale, shift), dst += dD, buf += dB; - if (M > 4) Save2(dst, buf, d40, d41, norm, bias, params, scale, shift), dst += dD, buf += dB; - } - else - { - if (M > 0) Save2(dst, buf, d00, d01, norm, bias, params, scale, shift, dstC - F), dst += dD, buf += dB; - if (M > 1) Save2(dst, buf, d10, d11, norm, bias, params, scale, shift, dstC - F), dst += dD, buf += dB; - if (M > 2) Save2(dst, buf, d20, d21, norm, bias, params, scale, shift, dstC - F), dst += dD, buf += dB; - if (M > 3) Save2(dst, buf, d30, d31, norm, bias, params, scale, shift, dstC - F), dst += dD, buf += dB; - if (M > 4) Save2(dst, buf, d40, d41, norm, bias, params, scale, shift, dstC - F), dst += dD, buf += dB; - } - } - else - { - if (M > 0) d00 = vdupq_n_s32(0); - if (M > 1) d10 = vdupq_n_s32(0); - if (M > 2) d20 = vdupq_n_s32(0); - if (M > 3) d30 = vdupq_n_s32(0); - if (M > 4) d40 = vdupq_n_s32(0); - for (size_t offs = 0; offs < srcC; offs += 4) - { - w0 = Load(weight0); - if (M > 0) s0 = Set4(src0 + offs), Madd4(d00, s0, w0); - if (M > 1) s0 = Set4(src1 + offs), Madd4(d10, s0, w0); - if (M > 2) s0 = Set4(src2 + 
offs), Madd4(d20, s0, w0); - if (M > 3) s0 = Set4(src3 + offs), Madd4(d30, s0, w0); - if (M > 4) s0 = Set4(src4 + offs), Madd4(d40, s0, w0); - weight0 += A; - } - if (dstC == F) - { - if (M > 0) Save1(dst, buf, d00, norm, bias, params, scale, shift), dst += dD, buf += dB; - if (M > 1) Save1(dst, buf, d10, norm, bias, params, scale, shift), dst += dD, buf += dB; - if (M > 2) Save1(dst, buf, d20, norm, bias, params, scale, shift), dst += dD, buf += dB; - if (M > 3) Save1(dst, buf, d30, norm, bias, params, scale, shift), dst += dD, buf += dB; - if (M > 4) Save1(dst, buf, d40, norm, bias, params, scale, shift), dst += dD, buf += dB; - } - else - { - if (M > 0) Save1(dst, buf, d00, norm, bias, params, scale, shift, dstC), dst += dD, buf += dB; - if (M > 1) Save1(dst, buf, d10, norm, bias, params, scale, shift, dstC), dst += dD, buf += dB; - if (M > 2) Save1(dst, buf, d20, norm, bias, params, scale, shift, dstC), dst += dD, buf += dB; - if (M > 3) Save1(dst, buf, d30, norm, bias, params, scale, shift, dstC), dst += dD, buf += dB; - if (M > 4) Save1(dst, buf, d40, norm, bias, params, scale, shift, dstC), dst += dD, buf += dB; - } - } - } - - typedef void(*ConvolutionNhwcDirect1x1_2xM_Ptr)(const uint8_t* src0, const ConvParam8i& p, const AlgParam& a, size_t srcC, size_t dstC, - const int8_t* weight0, const int32x4_t* bias, const int32x4_t* params, const float32x4_t* scale, const float32x4_t* shift, int32_t* buf, uint8_t* dst); - - template ConvolutionNhwcDirect1x1_2xM_Ptr GetConvolutionNhwcDirect1x1_2xM(size_t M) - { - switch (M) - { - case 0: return NULL; - case 1: return ConvolutionNhwcDirect1x1_2xM; - case 2: return ConvolutionNhwcDirect1x1_2xM; - case 3: return ConvolutionNhwcDirect1x1_2xM; - case 4: return ConvolutionNhwcDirect1x1_2xM; - } - assert(0); - return NULL; - } - - template void ConvolutionNhwcDirect1x1_2(const uint8_t* src, - const ConvParam8i& p, const AlgParam& a, size_t dstC, size_t yBeg, size_t yEnd, size_t srcC, const int8_t* weight, - const int32_t* bias, const int32_t* params, const float* scale, const float* shift, int32_t* buf, uint8_t* dst) - { - size_t n1 = (yEnd - yBeg) * p.dstW, n5 = AlignLoAny(n1, 5), m = n1 - n5; - ConvolutionNhwcDirect1x1_2xM_Ptr convolutionNhwcDirect1x1_2xM = GetConvolutionNhwcDirect1x1_2xM(m); - int32x4_t _params[2], _bias[2]; - _params[0] = vdupq_n_s32(0); - if (type == ::SimdConvolutionActivationRestrictRange) - _params[1] = vdupq_n_s32(a.high); - float32x4_t _scale[2], _shift[2]; - - for (size_t dc = 0; dc < dstC; dc += DF) - { - size_t dC = Simd::Min(DF, dstC - dc); - _bias[0] = Load(bias + dc + 0); - _bias[1] = Load(bias + dc + F); - _scale[0] = Load(scale + dc + 0); - _scale[1] = Load(scale + dc + F); - _shift[0] = Load(shift + dc + 0); - _shift[1] = Load(shift + dc + F); - const uint8_t* s = src + yBeg * p.srcW * p.srcC; - uint8_t* d = dst + (dc + yBeg * p.dstW * p.dstC) * a.size; - int32_t* b = buf + dc + yBeg * p.dstW * p.dstC; - size_t i = 0; - for (; i < n5; i += 5, s += p.srcC * 5, b += p.dstC * 5, d += p.dstC * a.size * 5) - ConvolutionNhwcDirect1x1_2x5(s, p, a, srcC, dC, weight, _bias, _params, _scale, _shift, b, d); - for (; i < n1; i += m, s += p.srcC * m, b += p.dstC * m, d += p.dstC * a.size * m) - convolutionNhwcDirect1x1_2xM(s, p, a, srcC, dC, weight, _bias, _params, _scale, _shift, b, d); - weight += DivHi(p.srcC, 4) * DA; - } - } - - //--------------------------------------------------------------------- - - template void Set(const ConvParam8i& p, const AlgParam& a, ConvolutionPtr* d) - { - if (p.Is1x1()) - { - switch 
(a.microD) - { - case 2 * F: d[term] = ConvolutionNhwcDirect1x1_2; break; - default: - assert(0); - } - } - else - { - switch (a.microD) - { - case 2 * F: d[term] = ConvolutionNhwcDirect_2; break; - default: - assert(0); - } - } - } - - template void Set(const ConvParam8i& p, const AlgParam& a, ConvolutionPtr* d) - { - if (p.compatibility & SimdSynetCompatibilityOverflow16i) - Set(p, a, d); - else - Set(p, a, d); - } - - template void Set(const ConvParam8i& p, const AlgParam& a, ConvolutionPtr* d) - { - Set(p, a, d); - Set(p, a, d); - Set(p, a, d); - Set(p, a, d); - Set(p, a, d); - Set(p, a, d); - } - - static void Set(const ConvParam8i& p, const AlgParam& a, ConvolutionPtr* d) - { - switch (p.activation) - { - case SimdConvolutionActivationIdentity: Set(p, a, d); break; - case SimdConvolutionActivationRelu: Set(p, a, d); break; - case SimdConvolutionActivationRestrictRange: Set(p, a, d); break; - default: assert(0); - } - } - - SynetConvolution8iNhwcDirect::SynetConvolution8iNhwcDirect(const ConvParam8i& p) - : Base::SynetConvolution8iNhwcDirect(p) - { - SetAlgParam(F, 2 * F, Base::AlgCacheL1(), Base::AlgCacheL2(), Base::AlgCacheL3()); - Set(p, _alg, _convolutions); - _convertSrc = Neon::SynetConvert32fTo8u; - } - - bool SynetConvolution8iNhwcDirect::Preferable(const ConvParam8i& p) - { - if (p.trans != SimdTrue || p.group != 1) - return false; - return true; - } - - //--------------------------------------------------------------------- - - void* SynetConvolution8iInit(size_t batch, const SimdConvolutionParameters* conv, SimdSynetCompatibilityType compatibility) - { - ConvParam8i param(batch, conv, compatibility); - if (!param.Valid()) - return NULL; - else if (SynetConvolution8iNhwcDirect::Preferable(param)) - return new SynetConvolution8iNhwcDirect(param); - else - return new Base::SynetConvolution8iGemmNN(param); - } - } -#endif -} diff --git a/src/3rd/Simd/Simd/SimdNeonSynetDeconvolution32f.cpp b/src/3rd/Simd/Simd/SimdNeonSynetDeconvolution32f.cpp deleted file mode 100644 index 806711a0..00000000 --- a/src/3rd/Simd/Simd/SimdNeonSynetDeconvolution32f.cpp +++ /dev/null @@ -1,310 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2019 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
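The Set() overloads just above show the dispatch scheme used throughout these deleted sources: the term and activation types are template parameters, so each kernel instantiation folds its activation into the inner loop at compile time, and a switch picks the function pointer once per layer rather than branching per pixel. A minimal self-contained sketch of the same idea (hypothetical names):

    #include <algorithm>
    #include <cstddef>

    enum Activation { Identity, Relu };

    // The activation is a template parameter, so the branch below is
    // resolved at compile time in every instantiation.
    template<Activation type> void Kernel(const float* src, float* dst, size_t n)
    {
        for (size_t i = 0; i < n; ++i)
            dst[i] = (type == Relu) ? std::max(src[i], 0.0f) : src[i];
    }

    typedef void (*KernelPtr)(const float*, float*, size_t);

    // Chosen once at layer setup, as in the Set() overloads above.
    KernelPtr Select(Activation type)
    {
        return type == Relu ? Kernel<Relu> : Kernel<Identity>;
    }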
-*/ -#include "Simd/SimdSynetDeconvolution32f.h" -#include "Simd/SimdSynetConvolution32f.h" -#include "Simd/SimdSynetConvolution32fCommon.h" -#include "Simd/SimdExtract.h" -#include "Simd/SimdSynet.h" -#include "Simd/SimdNeon.h" -#include "Simd/SimdGemm.h" -#include "Simd/SimdExp.h" -#include "Simd/SimdCpu.h" - -namespace Simd -{ -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - SynetDeconvolution32fGemmNN::SynetDeconvolution32fGemmNN(const DeconvParam32f & p) - : Base::SynetDeconvolution32fGemmNN(p) - { - _gemm.Init(InitGemmFuncs(Neon::Gemm32fNN, "Neon", p.gemm, "Ext")); - if (_param.trans && _param.group == 1) - { - if (NHWC_GEMM_RUNTIME) - { -#if defined(SIMD_ARM64_ENABLE) - _gemmCb.Init(InitGemmCbFuncs(Neon::Gemm32fNNcbBufferSize, Neon::Gemm32fNNcbReorderB, Neon::Gemm32fNNcbRun, "Neon", GemmKernelF2, GemmKernelF4)); -#else - _gemmCb.Init(InitGemmCbFuncs(Neon::Gemm32fNNcbBufferSize, Neon::Gemm32fNNcbReorderB, Neon::Gemm32fNNcbRun, "Neon", GemmKernelF2, GemmKernelF3)); -#endif - _nhwcWeight.Resize(_gemmCb.At(0).BufferSize(_M*_merge, _N, _K)); - } - else - _nhwcWeight.Resize(Neon::Gemm32fNNcbBufferSize(_M*_merge, _N, _K, GemmKernelAny, NHWC_GEMM_COMPATIBLE)); - _nhwcRun = Neon::Gemm32fNNcbRun; - _nhwcReorderB = Neon::Gemm32fNNcbReorderB; - } - _biasAndActivation = Neon::ConvolutionBiasAndActivation; - } - - //--------------------------------------------------------------------- - - typedef void(*DeconvolutionNhwcDirect2x2_Ptr) (const float * src0, const DeconvParam32f & p, size_t srcC, size_t dstC, const float * weight, const float32x4_t * bias, const float32x4_t * params, float * ds); - - template void DeconvolutionNhwcDirect2x2_6(const float * src0, - const DeconvParam32f & p, size_t srcC, size_t dstC, const float * weight0, const float32x4_t * bias, const float32x4_t * params, float * dst) - { - size_t dS = p.srcC, dD = p.dstC; - const float * weight1 = weight0 + srcC * F; - const float * src1 = src0 + 1 * dS; - const float * src2 = src0 + 2 * dS; - const float * src3 = src0 + 3 * dS; - const float * src4 = src0 + 4 * dS; - const float * src5 = src0 + 5 * dS; - float32x4_t d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, s0, w0, w1; - d00 = vdupq_n_f32(0.0f); d01 = vdupq_n_f32(0.0f); - d10 = vdupq_n_f32(0.0f); d11 = vdupq_n_f32(0.0f); - d20 = vdupq_n_f32(0.0f); d21 = vdupq_n_f32(0.0f); - d30 = vdupq_n_f32(0.0f); d31 = vdupq_n_f32(0.0f); - d40 = vdupq_n_f32(0.0f); d41 = vdupq_n_f32(0.0f); - d50 = vdupq_n_f32(0.0f); d51 = vdupq_n_f32(0.0f); - for (size_t sc = 0; sc < srcC; ++sc) - { - w0 = Load(weight0); - w1 = Load(weight1); - s0 = vld1q_dup_f32(src0 + sc); - d00 = vmlaq_f32(d00, s0, w0); - d01 = vmlaq_f32(d01, s0, w1); - s0 = vld1q_dup_f32(src1 + sc); - d10 = vmlaq_f32(d10, s0, w0); - d11 = vmlaq_f32(d11, s0, w1); - s0 = vld1q_dup_f32(src2 + sc); - d20 = vmlaq_f32(d20, s0, w0); - d21 = vmlaq_f32(d21, s0, w1); - s0 = vld1q_dup_f32(src3 + sc); - d30 = vmlaq_f32(d30, s0, w0); - d31 = vmlaq_f32(d31, s0, w1); - s0 = vld1q_dup_f32(src4 + sc); - d40 = vmlaq_f32(d40, s0, w0); - d41 = vmlaq_f32(d41, s0, w1); - s0 = vld1q_dup_f32(src5 + sc); - d50 = vmlaq_f32(d50, s0, w0); - d51 = vmlaq_f32(d51, s0, w1); - weight0 += F; - weight1 += F; - } - if (dstC == F) - { - Term::template Save(dst + 0x0 * dD, d00, bias, params); - Term::template Save(dst + 0x1 * dD, d01, bias, params); - Term::template Save(dst + 0x2 * dD, d10, bias, params); - Term::template Save(dst + 0x3 * dD, d11, bias, params); - Term::template Save(dst + 0x4 * dD, d20, bias, params); - Term::template Save(dst + 0x5 * dD, d21, 
bias, params); - Term::template Save(dst + 0x6 * dD, d30, bias, params); - Term::template Save(dst + 0x7 * dD, d31, bias, params); - Term::template Save(dst + 0x8 * dD, d40, bias, params); - Term::template Save(dst + 0x9 * dD, d41, bias, params); - Term::template Save(dst + 0xA * dD, d50, bias, params); - Term::template Save(dst + 0xB * dD, d51, bias, params); - } - else - { - Term::template Save(dst + 0x0 * dD, d00, bias, params, dstC); - Term::template Save(dst + 0x1 * dD, d01, bias, params, dstC); - Term::template Save(dst + 0x2 * dD, d10, bias, params, dstC); - Term::template Save(dst + 0x3 * dD, d11, bias, params, dstC); - Term::template Save(dst + 0x4 * dD, d20, bias, params, dstC); - Term::template Save(dst + 0x5 * dD, d21, bias, params, dstC); - Term::template Save(dst + 0x6 * dD, d30, bias, params, dstC); - Term::template Save(dst + 0x7 * dD, d31, bias, params, dstC); - Term::template Save(dst + 0x8 * dD, d40, bias, params, dstC); - Term::template Save(dst + 0x9 * dD, d41, bias, params, dstC); - Term::template Save(dst + 0xA * dD, d50, bias, params, dstC); - Term::template Save(dst + 0xB * dD, d51, bias, params, dstC); - } -} - - template void DeconvolutionNhwcDirect2x2_M(const float * src0, - const DeconvParam32f & p, size_t srcC, size_t dstC, const float * weight0, const float32x4_t * bias, const float32x4_t * params, float * dst) - { - size_t dS = p.srcC, dD = p.dstC; - const float * weight1 = weight0 + srcC * F, *src1, *src2, *src3, *src4, *src5; - if (tail > 1) src1 = src0 + 1 * dS; - if (tail > 2) src2 = src0 + 2 * dS; - if (tail > 3) src3 = src0 + 3 * dS; - if (tail > 4) src4 = src0 + 4 * dS; - if (tail > 5) src5 = src0 + 5 * dS; - float32x4_t d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, s0, w0, w1; - if (tail > 0) d00 = vdupq_n_f32(0.0f), d01 = vdupq_n_f32(0.0f); - if (tail > 1) d10 = vdupq_n_f32(0.0f), d11 = vdupq_n_f32(0.0f); - if (tail > 2) d20 = vdupq_n_f32(0.0f), d21 = vdupq_n_f32(0.0f); - if (tail > 3) d30 = vdupq_n_f32(0.0f), d31 = vdupq_n_f32(0.0f); - if (tail > 4) d40 = vdupq_n_f32(0.0f), d41 = vdupq_n_f32(0.0f); - if (tail > 5) d50 = vdupq_n_f32(0.0f), d51 = vdupq_n_f32(0.0f); - for (size_t sc = 0; sc < srcC; ++sc) - { - w0 = Load(weight0); - w1 = Load(weight1); - if (tail > 0) s0 = vld1q_dup_f32(src0 + sc), d00 = vmlaq_f32(d00, s0, w0), d01 = vmlaq_f32(d01, s0, w1); - if (tail > 1) s0 = vld1q_dup_f32(src1 + sc), d10 = vmlaq_f32(d10, s0, w0), d11 = vmlaq_f32(d11, s0, w1); - if (tail > 2) s0 = vld1q_dup_f32(src2 + sc), d20 = vmlaq_f32(d20, s0, w0), d21 = vmlaq_f32(d21, s0, w1); - if (tail > 3) s0 = vld1q_dup_f32(src3 + sc), d30 = vmlaq_f32(d30, s0, w0), d31 = vmlaq_f32(d31, s0, w1); - if (tail > 4) s0 = vld1q_dup_f32(src4 + sc), d40 = vmlaq_f32(d40, s0, w0), d41 = vmlaq_f32(d41, s0, w1); - if (tail > 5) s0 = vld1q_dup_f32(src5 + sc), d50 = vmlaq_f32(d50, s0, w0), d51 = vmlaq_f32(d51, s0, w1); - weight0 += F; - weight1 += F; - } - if (dstC == F) - { - if (tail > 0) Term::template Save(dst + 0x0 * dD, d00, bias, params), Term::template Save(dst + 0x1 * dD, d01, bias, params); - if (tail > 1) Term::template Save(dst + 0x2 * dD, d10, bias, params), Term::template Save(dst + 0x3 * dD, d11, bias, params); - if (tail > 2) Term::template Save(dst + 0x4 * dD, d20, bias, params), Term::template Save(dst + 0x5 * dD, d21, bias, params); - if (tail > 3) Term::template Save(dst + 0x6 * dD, d30, bias, params), Term::template Save(dst + 0x7 * dD, d31, bias, params); - if (tail > 4) Term::template Save(dst + 0x8 * dD, d40, bias, params), Term::template Save(dst + 
0x9 * dD, d41, bias, params); - if (tail > 5) Term::template Save(dst + 0xA * dD, d50, bias, params), Term::template Save(dst + 0xB * dD, d51, bias, params); - } - else - { - if (tail > 0) Term::template Save(dst + 0x0 * dD, d00, bias, params, dstC), Term::template Save(dst + 0x1 * dD, d01, bias, params, dstC); - if (tail > 1) Term::template Save(dst + 0x2 * dD, d10, bias, params, dstC), Term::template Save(dst + 0x3 * dD, d11, bias, params, dstC); - if (tail > 2) Term::template Save(dst + 0x4 * dD, d20, bias, params, dstC), Term::template Save(dst + 0x5 * dD, d21, bias, params, dstC); - if (tail > 3) Term::template Save(dst + 0x6 * dD, d30, bias, params, dstC), Term::template Save(dst + 0x7 * dD, d31, bias, params, dstC); - if (tail > 4) Term::template Save(dst + 0x8 * dD, d40, bias, params, dstC), Term::template Save(dst + 0x9 * dD, d41, bias, params, dstC); - if (tail > 5) Term::template Save(dst + 0xA * dD, d50, bias, params, dstC), Term::template Save(dst + 0xB * dD, d51, bias, params, dstC); - } - } - - template SIMD_INLINE DeconvolutionNhwcDirect2x2_Ptr GetTailKernel(size_t tail) - { - switch (tail) - { - case 0: return DeconvolutionNhwcDirect2x2_M; - case 1: return DeconvolutionNhwcDirect2x2_M; - case 2: return DeconvolutionNhwcDirect2x2_M; - case 3: return DeconvolutionNhwcDirect2x2_M; - case 4: return DeconvolutionNhwcDirect2x2_M; - case 5: return DeconvolutionNhwcDirect2x2_M; - default: - assert(0); - return NULL; - } - } - - template void DeconvolutionNhwcDirect2x2(const float * src, const DeconvParam32f & p, - size_t dstC, size_t yBeg, size_t yEnd, size_t srcC, const float * weight, const float * bias, const float * params, float * dst) - { - size_t srcW6 = AlignLoAny(p.srcW, 6), tail = p.srcW - srcW6; - DeconvolutionNhwcDirect2x2_Ptr bodyKernel = DeconvolutionNhwcDirect2x2_6; - DeconvolutionNhwcDirect2x2_Ptr tailKernel = GetTailKernel(tail); - - float32x4_t _params[2], _bias[1]; - _params[0] = vdupq_n_f32(params[0]); - if (type == ::SimdConvolutionActivationRestrictRange || type == ::SimdConvolutionActivationHswish) - _params[1] = vdupq_n_f32(params[1]); - - for (size_t dc = 0; dc < dstC; dc += F) - { - size_t dC = Simd::Min(F, dstC - dc); - _bias[0] = Load(bias + dc); - if (type == ::SimdConvolutionActivationPrelu) - _params[0] = Load(params + dc); - const float * s = src + yBeg * p.srcW * p.srcC; - float * d = dst + yBeg * p.strideY * p.dstW * p.dstC; - const float * w0 = weight + 0 * p.kernelX * p.srcC * F; - const float * w1 = weight + 1 * p.kernelX * p.srcC * F; - for (size_t sy = yBeg; sy < yEnd; sy += 1, s += p.srcW * p.srcC) - { - for (size_t sx = 0; sx < srcW6; sx += 6) - bodyKernel(s + sx * p.srcC, p, srcC, dC, w0, _bias, _params, d), d += 6 * p.strideX * p.dstC; - if (tail) - tailKernel(s + srcW6 * p.srcC, p, srcC, dC, w0, _bias, _params, d), d += tail * p.strideX * p.dstC; - for (size_t sx = 0; sx < srcW6; sx += 6) - bodyKernel(s + sx * p.srcC, p, srcC, dC, w1, _bias, _params, d), d += 6 * p.strideX * p.dstC; - if (tail) - tailKernel(s + srcW6 * p.srcC, p, srcC, dC, w1, _bias, _params, d), d += tail * p.strideX * p.dstC; - } - weight += p.kernelY * p.kernelX*srcC*F; - dst += F; - } - } - - template void DeconvolutionNhwcDirect2x2(const float * src, const DeconvParam32f & p, - const SynetDeconvolution32fNhwcDirect2x2::AlgParam & a, const float * weight, const float * bias, const float * params, float * dst) - { - for (size_t dc = 0; dc < p.dstC; dc += a.macroD) - { - size_t macroD = Simd::Min(p.dstC, dc + a.macroD) - dc; - for (size_t sc = 0; sc < p.srcC; sc += 
a.macroC) - { - size_t macroC = Simd::Min(p.srcC, sc + a.macroC) - sc; - size_t macroK = p.kernelY * p.kernelX * macroC; - for (size_t yBeg = 0; yBeg < p.srcH;) - { - size_t yEnd = Simd::Min(yBeg + a.macroH, p.srcH); - if (a.macroC == p.srcC) - DeconvolutionNhwcDirect2x2(src + sc, p, macroD, yBeg, yEnd, macroC, weight, bias + dc, params, dst + dc); - else if (sc == 0) - DeconvolutionNhwcDirect2x2(src + sc, p, macroD, yBeg, yEnd, macroC, weight, bias + dc, params, dst + dc); - else if (sc + macroC == p.srcC) - DeconvolutionNhwcDirect2x2(src + sc, p, macroD, yBeg, yEnd, macroC, weight, bias + dc, params, dst + dc); - else - DeconvolutionNhwcDirect2x2(src + sc, p, macroD, yBeg, yEnd, macroC, weight, bias + dc, params, dst + dc); - yBeg = yEnd; - } - weight += AlignHiAny(macroD, a.microD)*macroK; - } - if (type == ::SimdConvolutionActivationPrelu) - params += macroD; - } - } - - SynetDeconvolution32fNhwcDirect2x2::SynetDeconvolution32fNhwcDirect2x2(const DeconvParam32f & p) - : Base::SynetDeconvolution32fNhwcDirect2x2(p) - { - switch (p.activation) - { - case SimdConvolutionActivationIdentity: _deconvolution = DeconvolutionNhwcDirect2x2; break; - case SimdConvolutionActivationRelu: _deconvolution = DeconvolutionNhwcDirect2x2; break; - case SimdConvolutionActivationLeakyRelu: _deconvolution = DeconvolutionNhwcDirect2x2; break; - case SimdConvolutionActivationRestrictRange: _deconvolution = DeconvolutionNhwcDirect2x2; break; - case SimdConvolutionActivationPrelu: _deconvolution = DeconvolutionNhwcDirect2x2; break; - case SimdConvolutionActivationElu: _deconvolution = DeconvolutionNhwcDirect2x2; break; - case SimdConvolutionActivationHswish: _deconvolution = DeconvolutionNhwcDirect2x2; break; - default: assert(0); - } - SetAlgParam(F, Base::AlgCacheL1(), Base::AlgCacheL2(), Base::AlgCacheL3()); - } - - bool SynetDeconvolution32fNhwcDirect2x2::Preferable(const DeconvParam32f & p) - { - return p.IsPad(0) && p.IsDilation(1) && p.IsKernel(2) && p.IsStride(2) && p.group == 1 && p.trans; - } - - //--------------------------------------------------------------------- - - void * SynetDeconvolution32fInit(size_t batch, const SimdConvolutionParameters * conv, SimdGemm32fNNPtr gemm) - { - DeconvParam32f param(batch, conv, gemm); - if (!param.Valid()) - return NULL; - if (SynetDeconvolution32fNhwcDirect2x2::Preferable(param)) - return new SynetDeconvolution32fNhwcDirect2x2(param); - else - return new SynetDeconvolution32fGemmNN(param); - } - } -#endif//SIMD_NEON_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdNeonSynetFused.cpp b/src/3rd/Simd/Simd/SimdNeonSynetFused.cpp deleted file mode 100644 index a965e434..00000000 --- a/src/3rd/Simd/Simd/SimdNeonSynetFused.cpp +++ /dev/null @@ -1,1237 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2019 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. 
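The SynetFusedLayerForward0 kernels in the deleted file below vectorize the per-element transform dst = max(x, 0) + (x - |x|) * scale with x = src + bias; since x - |x| equals 2 * min(x, 0), this fuses the bias add with a leaky-style negative branch whose slope is 2 * scale. A scalar sketch (hypothetical helper):

    #include <algorithm>
    #include <cmath>

    // x >= 0 yields x; x < 0 yields 2 * x * scale, matching the NEON
    // vmlaq_f32(vmaxq_f32(0, x), x - |x|, scale) expression below.
    inline float FusedLayerForward0(float x, float scale)
    {
        return std::max(x, 0.0f) + (x - std::fabs(x)) * scale;
    }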
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdSynet.h" -#include "Simd/SimdNeon.h" -#include "Simd/SimdBase.h" -#include "Simd/SimdExtract.h" -#include "Simd/SimdPow.h" -#include "Simd/SimdExp.h" - -namespace Simd -{ -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - template SIMD_INLINE void SynetFusedLayerForward0(const float * src, const float * bias, const float * scale, float * dst, size_t offset) - { - float32x4_t _bias = Load(bias + offset); - float32x4_t x = vaddq_f32(Load(src + offset), _bias); - float32x4_t _scale = Load(scale + offset); - Store(dst + offset, vmlaq_f32(vmaxq_f32(vdupq_n_f32(0.0f), x), vsubq_f32(x, vabsq_f32(x)), _scale)); - } - - template SIMD_INLINE void SynetFusedLayerForward0(const float * src, float32x4_t bias, float32x4_t scale, float * dst, size_t offset) - { - float32x4_t x = vaddq_f32(Load(src + offset), bias); - Store(dst + offset, vmlaq_f32(vmaxq_f32(vdupq_n_f32(0.0f), x), vsubq_f32(x, vabsq_f32(x)), scale)); - } - - - template void SynetFusedLayerForward0Nchw(const float * src, const float * bias, const float * scale, size_t channels, size_t spatial, float * dst) - { - if (align) - assert(Aligned(src) && Aligned(spatial, F) && Aligned(dst)); - - size_t aligned = AlignLo(spatial, QF); - size_t partial = AlignLo(spatial, F); - for (size_t c = 0; c < channels; ++c) - { - size_t s = 0; - if (partial) - { - float32x4_t _bias = vdupq_n_f32(bias[c]); - float32x4_t _scale = vdupq_n_f32(scale[c]); - for (; s < aligned; s += QF) - { - SynetFusedLayerForward0(src, _bias, _scale, dst, s + F * 0); - SynetFusedLayerForward0(src, _bias, _scale, dst, s + F * 1); - SynetFusedLayerForward0(src, _bias, _scale, dst, s + F * 2); - SynetFusedLayerForward0(src, _bias, _scale, dst, s + F * 3); - } - for (; s < partial; s += F) - SynetFusedLayerForward0(src, _bias, _scale, dst, s); - } - for (; s < spatial; ++s) - dst[s] = Base::SynetFusedLayerForward0(src[s] + bias[c], scale[c]); - src += spatial; - dst += spatial; - } - } - - SIMD_INLINE void SynetFusedLayerForward0Nchw(const float * src, const float * bias, const float * scale, size_t channels, size_t spatial, float * dst) - { - if (Aligned(src) && Aligned(spatial, F) && Aligned(dst)) - SynetFusedLayerForward0Nchw(src, bias, scale, channels, spatial, dst); - else - SynetFusedLayerForward0Nchw(src, bias, scale, channels, spatial, dst); - } - - template void SynetFusedLayerForward0Nhwc(const float * src, const float * bias, const float * scale, size_t channels, size_t spatial, float * dst) - { - if (align) - assert(Aligned(src) && Aligned(bias) && Aligned(scale) && Aligned(channels, F) && Aligned(dst)); - - size_t aligned = AlignLo(channels, QF); - size_t partial = AlignLo(channels, F); - for (size_t s = 0; s < spatial; ++s) - { - size_t c = 0; - if (partial) - { - for (; c < aligned; c += QF) - { - SynetFusedLayerForward0(src, bias, scale, dst, c + F * 0); - SynetFusedLayerForward0(src, bias, scale, dst, c + F * 1); - SynetFusedLayerForward0(src, bias, scale, dst, c + F * 2); - SynetFusedLayerForward0(src, bias, 
scale, dst, c + F * 3); - } - for (; c < partial; c += F) - SynetFusedLayerForward0(src, bias, scale, dst, c); - } - for (; c < channels; ++c) - dst[c] = Base::SynetFusedLayerForward0(src[c] + bias[c], scale[c]); - src += channels; - dst += channels; - } - } - - SIMD_INLINE void SynetFusedLayerForward0Nhwc(const float * src, const float * bias, const float * scale, size_t channels, size_t spatial, float * dst) - { - if (Aligned(src) && Aligned(bias) && Aligned(scale) && Aligned(channels, F) && Aligned(dst)) - SynetFusedLayerForward0Nhwc(src, bias, scale, channels, spatial, dst); - else - SynetFusedLayerForward0Nhwc(src, bias, scale, channels, spatial, dst); - } - - template void SynetFusedLayerForward0Nchw4c(const float * src, const float * bias, const float * scale, size_t channels, size_t spatial, float * dst) - { - if (align) - assert(Aligned(src) && Aligned(dst)); - - size_t spatialF = spatial * F; - size_t spatial4F = AlignLo(spatial, 4)*F; - for (size_t c = 0; c < channels; c += F) - { - float32x4_t _bias = Load(bias + c); - float32x4_t _scale = Load(scale + c); - size_t s = 0; - for (; s < spatial4F; s += 4 * F) - { - SynetFusedLayerForward0(src, _bias, _scale, dst, s + F * 0); - SynetFusedLayerForward0(src, _bias, _scale, dst, s + F * 1); - SynetFusedLayerForward0(src, _bias, _scale, dst, s + F * 2); - SynetFusedLayerForward0(src, _bias, _scale, dst, s + F * 3); - } - for (; s < spatialF; s += F) - SynetFusedLayerForward0(src, _bias, _scale, dst, s); - src += spatialF; - dst += spatialF; - } - } - - SIMD_INLINE void SynetFusedLayerForward0Nchw4c(const float * src, const float * bias, const float * scale, size_t channels, size_t spatial, float * dst) - { - if (Aligned(src) && Aligned(dst)) - SynetFusedLayerForward0Nchw4c(src, bias, scale, channels, spatial, dst); - else - SynetFusedLayerForward0Nchw4c(src, bias, scale, channels, spatial, dst); - } - - void SynetFusedLayerForward0(const float * src, const float * bias, const float * scale, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format) - { - if (Base::NchwCompatible(channels, spatial, format)) - SynetFusedLayerForward0Nchw(src, bias, scale, channels, spatial, dst); - else if (Base::NhwcCompatible(channels, spatial, format)) - SynetFusedLayerForward0Nhwc(src, bias, scale, channels, spatial, dst); - else if (format == SimdTensorFormatNchw4c) - SynetFusedLayerForward0Nchw4c(src, bias, scale, channels, spatial, dst); - else - Base::SynetFusedLayerForward0(src, bias, scale, channels, spatial, dst, format); - } - - //--------------------------------------------------------------------- - - template SIMD_INLINE void SynetFusedLayerForward1(const float * src, const float * bias0, const float * scale1, const float * bias1, float32x4_t _0, float * dst, size_t offset) - { - float32x4_t _bias0 = Load(bias0 + offset); - float32x4_t x = vaddq_f32(Load(src + offset), _bias0); - float32x4_t _scale1 = Load(scale1 + offset); - float32x4_t _bias1 = Load(bias1 + offset); - Store(dst + offset, vaddq_f32(vmlaq_f32(_bias1, vmaxq_f32(_0, vnegq_f32(x)), _scale1), vmaxq_f32(_0, x))); - } - - template SIMD_INLINE void SynetFusedLayerForward1(const float * src, float32x4_t bias0, float32x4_t scale1, float32x4_t bias1, float32x4_t _0, float * dst, size_t offset) - { - float32x4_t x = vaddq_f32(Load(src + offset), bias0); - Store(dst + offset, vaddq_f32(vmlaq_f32(bias1, vmaxq_f32(_0, vnegq_f32(x)), scale1), vmaxq_f32(_0, x))); - } - - template void SynetFusedLayerForward1Nchw(const float * src, const float * bias0, const float * 
scale1, const float * bias1, size_t channels, size_t spatial, float * dst) - { - if (align) - assert(Aligned(src) && Aligned(spatial, F) && Aligned(dst)); - - size_t aligned = AlignLo(spatial, QF); - size_t partial = AlignLo(spatial, F); - float32x4_t _0 = vdupq_n_f32(0.0f); - for (size_t c = 0; c < channels; ++c) - { - size_t s = 0; - if (partial) - { - float32x4_t _bias0 = vdupq_n_f32(bias0[c]); - float32x4_t _scale1 = vdupq_n_f32(scale1[c]); - float32x4_t _bias1 = vdupq_n_f32(bias1[c]); - for (; s < aligned; s += QF) - { - SynetFusedLayerForward1(src, _bias0, _scale1, _bias1, _0, dst, s + F * 0); - SynetFusedLayerForward1(src, _bias0, _scale1, _bias1, _0, dst, s + F * 1); - SynetFusedLayerForward1(src, _bias0, _scale1, _bias1, _0, dst, s + F * 2); - SynetFusedLayerForward1(src, _bias0, _scale1, _bias1, _0, dst, s + F * 3); - } - for (; s < partial; s += F) - SynetFusedLayerForward1(src, _bias0, _scale1, _bias1, _0, dst, s); - } - for (; s < spatial; ++s) - dst[s] = Base::SynetFusedLayerForward1(src[s] + bias0[c], scale1[c], bias1[c]); - src += spatial; - dst += spatial; - } - } - - SIMD_INLINE void SynetFusedLayerForward1Nchw(const float * src, const float * bias0, const float * scale1, const float * bias1, size_t channels, size_t spatial, float * dst) - { - if (Aligned(src) && Aligned(spatial, F) && Aligned(dst)) - SynetFusedLayerForward1Nchw(src, bias0, scale1, bias1, channels, spatial, dst); - else - SynetFusedLayerForward1Nchw(src, bias0, scale1, bias1, channels, spatial, dst); - } - - template void SynetFusedLayerForward1Nhwc(const float * src, const float * bias0, const float * scale1, const float * bias1, size_t channels, size_t spatial, float * dst) - { - if (align) - assert(Aligned(src) && Aligned(bias0) && Aligned(scale1) && Aligned(bias1) && Aligned(channels, F) && Aligned(dst)); - - size_t aligned = AlignLo(channels, QF); - size_t partial = AlignLo(channels, F); - float32x4_t _0 = vdupq_n_f32(0.0f); - for (size_t s = 0; s < spatial; ++s) - { - size_t c = 0; - if (partial) - { - for (; c < aligned; c += QF) - { - SynetFusedLayerForward1(src, bias0, scale1, bias1, _0, dst, c + F * 0); - SynetFusedLayerForward1(src, bias0, scale1, bias1, _0, dst, c + F * 1); - SynetFusedLayerForward1(src, bias0, scale1, bias1, _0, dst, c + F * 2); - SynetFusedLayerForward1(src, bias0, scale1, bias1, _0, dst, c + F * 3); - } - for (; c < partial; c += F) - SynetFusedLayerForward1(src, bias0, scale1, bias1, _0, dst, c); - } - for (; c < channels; ++c) - dst[c] = Base::SynetFusedLayerForward1(src[c] + bias0[c], scale1[c], bias1[c]); - src += channels; - dst += channels; - } - } - - SIMD_INLINE void SynetFusedLayerForward1Nhwc(const float * src, const float * bias0, const float * scale1, const float * bias1, size_t channels, size_t spatial, float * dst) - { - if (Aligned(src) && Aligned(bias0) && Aligned(scale1) && Aligned(bias1) && Aligned(channels, F) && Aligned(dst)) - SynetFusedLayerForward1Nhwc(src, bias0, scale1, bias1, channels, spatial, dst); - else - SynetFusedLayerForward1Nhwc(src, bias0, scale1, bias1, channels, spatial, dst); - } - - template void SynetFusedLayerForward1Nchw4c(const float * src, const float * bias0, const float * scale1, const float * bias1, size_t channels, size_t spatial, float * dst) - { - if (align) - assert(Aligned(src) && Aligned(dst)); - - size_t spatialF = spatial * F; - size_t spatial4F = AlignLo(spatial, 4)*F; - float32x4_t _0 = vdupq_n_f32(0.0f); - for (size_t c = 0; c < channels; c += F) - { - float32x4_t _bias0 = Load(bias0 + c); - float32x4_t _scale1 = 
Load(scale1 + c); - float32x4_t _bias1 = Load(bias1 + c); - size_t s = 0; - for (; s < spatial4F; s += 4 * F) - { - SynetFusedLayerForward1(src, _bias0, _scale1, _bias1, _0, dst, s + F * 0); - SynetFusedLayerForward1(src, _bias0, _scale1, _bias1, _0, dst, s + F * 1); - SynetFusedLayerForward1(src, _bias0, _scale1, _bias1, _0, dst, s + F * 2); - SynetFusedLayerForward1(src, _bias0, _scale1, _bias1, _0, dst, s + F * 3); - } - for (; s < spatialF; s += F) - SynetFusedLayerForward1(src, _bias0, _scale1, _bias1, _0, dst, s); - src += spatialF; - dst += spatialF; - } - } - - SIMD_INLINE void SynetFusedLayerForward1Nchw4c(const float * src, const float * bias0, const float * scale1, const float * bias1, size_t channels, size_t spatial, float * dst) - { - if (Aligned(src) && Aligned(dst)) - SynetFusedLayerForward1Nchw4c(src, bias0, scale1, bias1, channels, spatial, dst); - else - SynetFusedLayerForward1Nchw4c(src, bias0, scale1, bias1, channels, spatial, dst); - } - - void SynetFusedLayerForward1(const float * src, const float * bias0, const float * scale1, const float * bias1, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format) - { - if (Base::NchwCompatible(channels, spatial, format)) - SynetFusedLayerForward1Nchw(src, bias0, scale1, bias1, channels, spatial, dst); - else if (Base::NhwcCompatible(channels, spatial, format)) - SynetFusedLayerForward1Nhwc(src, bias0, scale1, bias1, channels, spatial, dst); - else if (format == SimdTensorFormatNchw4c) - SynetFusedLayerForward1Nchw4c(src, bias0, scale1, bias1, channels, spatial, dst); - else - Base::SynetFusedLayerForward1(src, bias0, scale1, bias1, channels, spatial, dst, format); - } - - //--------------------------------------------------------------------- - - template SIMD_INLINE void SynetFusedLayerForward2(const float * src, const float * scale, const float * bias, float32x4_t slope, float32x4_t _0, float * dst, size_t offset) - { - float32x4_t _src = Load(src + offset); - float32x4_t _scale = Load(scale + offset); - float32x4_t _bias = Load(bias + offset); - float32x4_t x = vmlaq_f32(_bias, _src, _scale); - Store(dst + offset, vmlaq_f32(vmaxq_f32(_0, x), vminq_f32(_0, x), slope)); - } - - template SIMD_INLINE void SynetFusedLayerForward2(const float * src, float32x4_t scale, float32x4_t bias, float32x4_t slope, float32x4_t _0, float * dst, size_t offset) - { - float32x4_t _src = Load(src + offset); - float32x4_t x = vmlaq_f32(bias, _src, scale); - Store(dst + offset, vmlaq_f32(vmaxq_f32(_0, x), vminq_f32(_0, x), slope)); - } - - template void SynetFusedLayerForward2Nchw(const float * src, const float * scale, const float * bias, size_t channels, size_t spatial, const float * slope, float * dst) - { - if (align) - assert(Aligned(src) && Aligned(spatial, F) && Aligned(dst)); - - float32x4_t _slope = vdupq_n_f32(slope[0]); - float32x4_t _0 = vdupq_n_f32(0.0f); - size_t aligned = AlignLo(spatial, QF); - size_t partial = AlignLo(spatial, F); - for (size_t c = 0; c < channels; ++c) - { - size_t s = 0; - if (partial) - { - float32x4_t _scale = vdupq_n_f32(scale[c]); - float32x4_t _bias = vdupq_n_f32(bias[c]); - for (; s < aligned; s += QF) - { - SynetFusedLayerForward2(src, _scale, _bias, _slope, _0, dst, s + F * 0); - SynetFusedLayerForward2(src, _scale, _bias, _slope, _0, dst, s + F * 1); - SynetFusedLayerForward2(src, _scale, _bias, _slope, _0, dst, s + F * 2); - SynetFusedLayerForward2(src, _scale, _bias, _slope, _0, dst, s + F * 3); - } - for (; s < partial; s += F) - SynetFusedLayerForward2(src, _scale, _bias, _slope, 
_0, dst, s); - } - for (; s < spatial; ++s) - dst[s] = Base::SynetFusedLayerForward2(src[s], scale[c], bias[c], slope[0]); - src += spatial; - dst += spatial; - } - } - - SIMD_INLINE void SynetFusedLayerForward2Nchw(const float * src, const float * scale, const float * bias, size_t channels, size_t spatial, const float * slope, float * dst) - { - if (Aligned(src) && Aligned(spatial, F) && Aligned(dst)) - SynetFusedLayerForward2Nchw(src, scale, bias, channels, spatial, slope, dst); - else - SynetFusedLayerForward2Nchw(src, scale, bias, channels, spatial, slope, dst); - } - - template void SynetFusedLayerForward2Nhwc(const float * src, const float * scale, const float * bias, size_t channels, size_t spatial, const float * slope, float * dst) - { - if (align) - assert(Aligned(src) && Aligned(scale) && Aligned(bias) && Aligned(channels, F) && Aligned(dst)); - - float32x4_t _slope = vdupq_n_f32(slope[0]); - float32x4_t _0 = vdupq_n_f32(0.0f); - size_t aligned = AlignLo(channels, QF); - size_t partial = AlignLo(channels, F); - for (size_t s = 0; s < spatial; ++s) - { - size_t c = 0; - if (partial) - { - for (; c < aligned; c += QF) - { - SynetFusedLayerForward2(src, scale, bias, _slope, _0, dst, c + F * 0); - SynetFusedLayerForward2(src, scale, bias, _slope, _0, dst, c + F * 1); - SynetFusedLayerForward2(src, scale, bias, _slope, _0, dst, c + F * 2); - SynetFusedLayerForward2(src, scale, bias, _slope, _0, dst, c + F * 3); - } - for (; c < partial; c += F) - SynetFusedLayerForward2(src, scale, bias, _slope, _0, dst, c); - } - for (; c < channels; ++c) - dst[c] = Base::SynetFusedLayerForward2(src[c], scale[c], bias[c], slope[0]); - src += channels; - dst += channels; - } - } - - SIMD_INLINE void SynetFusedLayerForward2Nhwc(const float * src, const float * scale, const float * bias, size_t channels, size_t spatial, const float * slope, float * dst) - { - if (Aligned(src) && Aligned(scale) && Aligned(bias) && Aligned(channels, F) && Aligned(dst)) - SynetFusedLayerForward2Nhwc(src, scale, bias, channels, spatial, slope, dst); - else - SynetFusedLayerForward2Nhwc(src, scale, bias, channels, spatial, slope, dst); - } - - template void SynetFusedLayerForward2Nchw4c(const float * src, const float * scale, const float * bias, size_t channels, size_t spatial, const float * slope, float * dst) - { - if (align) - assert(Aligned(src) && Aligned(dst)); - - float32x4_t _slope = vdupq_n_f32(slope[0]); - float32x4_t _0 = vdupq_n_f32(0.0f); - size_t spatialF = spatial * F; - size_t spatial4F = AlignLo(spatial, 4)*F; - for (size_t c = 0; c < channels; c += F) - { - float32x4_t _scale = Load(scale + c); - float32x4_t _bias = Load(bias + c); - size_t s = 0; - for (; s < spatial4F; s += 4 * F) - { - SynetFusedLayerForward2(src, _scale, _bias, _slope, _0, dst, s + F * 0); - SynetFusedLayerForward2(src, _scale, _bias, _slope, _0, dst, s + F * 1); - SynetFusedLayerForward2(src, _scale, _bias, _slope, _0, dst, s + F * 2); - SynetFusedLayerForward2(src, _scale, _bias, _slope, _0, dst, s + F * 3); - } - for (; s < spatialF; s += F) - SynetFusedLayerForward2(src, _scale, _bias, _slope, _0, dst, s); - src += spatialF; - dst += spatialF; - } - } - - SIMD_INLINE void SynetFusedLayerForward2Nchw4c(const float * src, const float * scale, const float * bias, size_t channels, size_t spatial, const float * slope, float * dst) - { - if (Aligned(src) && Aligned(dst)) - SynetFusedLayerForward2Nchw4c(src, scale, bias, channels, spatial, slope, dst); - else - SynetFusedLayerForward2Nchw4c(src, scale, bias, channels, spatial, slope, dst); 
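/* A minimal scalar sketch of what the SynetFusedLayerForward2 kernels above
   compute per element (my reading of the visible NEON intrinsics, not code from
   the Simd sources): x = src*scale + bias followed by PReLU with one shared
   slope, i.e. dst = max(0, x) + slope*min(0, x). */
static inline float FusedForward2Ref(float src, float scale, float bias, float slope)
{
    float x = src * scale + bias;          // vmlaq_f32(_bias, _src, _scale)
    float pos = x > 0.0f ? x : 0.0f;       // vmaxq_f32(_0, x)
    float neg = x < 0.0f ? x : 0.0f;       // vminq_f32(_0, x)
    return pos + slope * neg;              // vmlaq_f32(pos, neg, slope)
}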
- } - - void SynetFusedLayerForward2(const float * src, const float * scale, const float * bias, size_t channels, size_t spatial, const float * slope, float * dst, SimdTensorFormatType format) - { - if (Base::NchwCompatible(channels, spatial, format)) - SynetFusedLayerForward2Nchw(src, scale, bias, channels, spatial, slope, dst); - else if (Base::NhwcCompatible(channels, spatial, format)) - SynetFusedLayerForward2Nhwc(src, scale, bias, channels, spatial, slope, dst); - else if (format == SimdTensorFormatNchw4c) - SynetFusedLayerForward2Nchw4c(src, scale, bias, channels, spatial, slope, dst); - else - Base::SynetFusedLayerForward2(src, scale, bias, channels, spatial, slope, dst, format); - } - - //--------------------------------------------------------------------- - - template SIMD_INLINE void SynetFusedLayerForward3(const float * src, const float * bias, const float * scale, float32x4_t _0, float * dst, size_t offset) - { - float32x4_t _bias = Load(bias + offset); - float32x4_t x = vaddq_f32(Load(src + offset), _bias); - float32x4_t _scale = Load(scale + offset); - float32x4_t pos = vmaxq_f32(_0, x); - float32x4_t neg = vminq_f32(_0, x); - Store(dst + offset, vmlaq_f32(pos, _scale, neg)); - } - - template SIMD_INLINE void SynetFusedLayerForward3(const float * src, float32x4_t bias, float32x4_t scale, float32x4_t _0, float * dst, size_t offset) - { - float32x4_t x = vaddq_f32(Load(src + offset), bias); - float32x4_t pos = vmaxq_f32(_0, x); - float32x4_t neg = vminq_f32(_0, x); - Store(dst + offset, vmlaq_f32(pos, scale, neg)); - } - - template void SynetFusedLayerForward3Nchw(const float * src, const float * bias, const float * scale, size_t channels, size_t spatial, float * dst) - { - if (align) - assert(Aligned(src) && Aligned(spatial, F) && Aligned(dst)); - - size_t aligned = AlignLo(spatial, QF); - size_t partial = AlignLo(spatial, F); - float32x4_t _0 = vdupq_n_f32(0.0f); - for (size_t c = 0; c < channels; ++c) - { - size_t s = 0; - if (partial) - { - float32x4_t _bias = vdupq_n_f32(bias[c]); - float32x4_t _scale = vdupq_n_f32(scale[c]); - for (; s < aligned; s += QF) - { - SynetFusedLayerForward3(src, _bias, _scale, _0, dst, s + F * 0); - SynetFusedLayerForward3(src, _bias, _scale, _0, dst, s + F * 1); - SynetFusedLayerForward3(src, _bias, _scale, _0, dst, s + F * 2); - SynetFusedLayerForward3(src, _bias, _scale, _0, dst, s + F * 3); - } - for (; s < partial; s += F) - SynetFusedLayerForward3(src, _bias, _scale, _0, dst, s); - } - for (; s < spatial; ++s) - dst[s] = Base::SynetFusedLayerForward3(src[s] + bias[c], scale[c]); - src += spatial; - dst += spatial; - } - } - - SIMD_INLINE void SynetFusedLayerForward3Nchw(const float * src, const float * bias, const float * scale, size_t channels, size_t spatial, float * dst) - { - if (Aligned(src) && Aligned(spatial, F) && Aligned(dst)) - SynetFusedLayerForward3Nchw(src, bias, scale, channels, spatial, dst); - else - SynetFusedLayerForward3Nchw(src, bias, scale, channels, spatial, dst); - } - - template void SynetFusedLayerForward3Nhwc(const float * src, const float * bias, const float * scale, size_t channels, size_t spatial, float * dst) - { - if (align) - assert(Aligned(src) && Aligned(bias) && Aligned(scale) && Aligned(channels, F) && Aligned(dst)); - - size_t aligned = AlignLo(channels, QF); - size_t partial = AlignLo(channels, F); - float32x4_t _0 = vdupq_n_f32(0.0f); - for (size_t s = 0; s < spatial; ++s) - { - size_t c = 0; - if (partial) - { - for (; c < aligned; c += QF) - { - SynetFusedLayerForward3(src, bias, scale, _0, dst, c 
+ F * 0); - SynetFusedLayerForward3(src, bias, scale, _0, dst, c + F * 1); - SynetFusedLayerForward3(src, bias, scale, _0, dst, c + F * 2); - SynetFusedLayerForward3(src, bias, scale, _0, dst, c + F * 3); - } - for (; c < partial; c += F) - SynetFusedLayerForward3(src, bias, scale, _0, dst, c); - } - for (; c < channels; ++c) - dst[c] = Base::SynetFusedLayerForward3(src[c] + bias[c], scale[c]); - src += channels; - dst += channels; - } - } - - SIMD_INLINE void SynetFusedLayerForward3Nhwc(const float * src, const float * bias, const float * scale, size_t channels, size_t spatial, float * dst) - { - if (Aligned(src) && Aligned(bias) && Aligned(scale) && Aligned(channels, F) && Aligned(dst)) - SynetFusedLayerForward3Nhwc(src, bias, scale, channels, spatial, dst); - else - SynetFusedLayerForward3Nhwc(src, bias, scale, channels, spatial, dst); - } - - template void SynetFusedLayerForward3Nchw4c(const float * src, const float * bias, const float * scale, size_t channels, size_t spatial, float * dst) - { - if (align) - assert(Aligned(src) && Aligned(dst)); - - size_t spatialF = spatial * F; - size_t spatial4F = AlignLo(spatial, 4)*F; - float32x4_t _0 = vdupq_n_f32(0.0f); - for (size_t c = 0; c < channels; c += F) - { - float32x4_t _bias = Load(bias + c); - float32x4_t _scale = Load(scale + c); - size_t s = 0; - for (; s < spatial4F; s += 4 * F) - { - SynetFusedLayerForward3(src, _bias, _scale, _0, dst, s + F * 0); - SynetFusedLayerForward3(src, _bias, _scale, _0, dst, s + F * 1); - SynetFusedLayerForward3(src, _bias, _scale, _0, dst, s + F * 2); - SynetFusedLayerForward3(src, _bias, _scale, _0, dst, s + F * 3); - } - for (; s < spatialF; s += F) - SynetFusedLayerForward3(src, _bias, _scale, _0, dst, s); - src += spatialF; - dst += spatialF; - } - } - - SIMD_INLINE void SynetFusedLayerForward3Nchw4c(const float * src, const float * bias, const float * scale, size_t channels, size_t spatial, float * dst) - { - if (Aligned(src) && Aligned(dst)) - SynetFusedLayerForward3Nchw4c(src, bias, scale, channels, spatial, dst); - else - SynetFusedLayerForward3Nchw4c(src, bias, scale, channels, spatial, dst); - } - - void SynetFusedLayerForward3(const float * src, const float * bias, const float * scale, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format) - { - if (Base::NchwCompatible(channels, spatial, format)) - SynetFusedLayerForward3Nchw(src, bias, scale, channels, spatial, dst); - else if (Base::NhwcCompatible(channels, spatial, format)) - SynetFusedLayerForward3Nhwc(src, bias, scale, channels, spatial, dst); - else if (format == SimdTensorFormatNchw4c) - SynetFusedLayerForward3Nchw4c(src, bias, scale, channels, spatial, dst); - else - Base::SynetFusedLayerForward3(src, bias, scale, channels, spatial, dst, format); - } - - //--------------------------------------------------------------------- - - template SIMD_INLINE void SynetFusedLayerForward4(const float * src, const float * bias0, float32x4_t scale1, float32x4_t bias1, float32x4_t _0, float * dst0, float * dst1, size_t offset) - { - float32x4_t x = vaddq_f32(Load(src + offset), Load(bias0 + offset)); - Store(dst0 + offset, vmaxq_f32(_0, x)); - Store(dst1 + offset, vmaxq_f32(_0, vmlaq_f32(bias1, scale1, x))); - } - - template SIMD_INLINE void SynetFusedLayerForward4(const float * src, float32x4_t bias0, float32x4_t scale1, float32x4_t bias1, float32x4_t _0, float * dst0, float * dst1, size_t offset) - { - float32x4_t x = vaddq_f32(Load(src + offset), bias0); - Store(dst0 + offset, vmaxq_f32(_0, x)); - Store(dst1 + offset, 
vmaxq_f32(_0, vmlaq_f32(bias1, scale1, x))); - } - - template void SynetFusedLayerForward4Nchw(const float * src, const float * bias0, const float * scale1, const float * bias1, size_t channels, size_t spatial, float * dst0) - { - if (align) - assert(Aligned(src) && Aligned(spatial, F) && Aligned(dst0)); - - float32x4_t _bias1 = vdupq_n_f32(bias1[0]); - float32x4_t _scale1 = vdupq_n_f32(scale1[0]); - float32x4_t _0 = vdupq_n_f32(0.0f); - size_t aligned = AlignLo(spatial, QF); - size_t partial = AlignLo(spatial, F); - float * dst1 = dst0 + channels * spatial; - for (size_t c = 0; c < channels; ++c) - { - size_t s = 0; - if (partial) - { - float32x4_t _bias0 = vdupq_n_f32(bias0[c]); - for (; s < aligned; s += QF) - { - SynetFusedLayerForward4(src, _bias0, _scale1, _bias1, _0, dst0, dst1, s + F * 0); - SynetFusedLayerForward4(src, _bias0, _scale1, _bias1, _0, dst0, dst1, s + F * 1); - SynetFusedLayerForward4(src, _bias0, _scale1, _bias1, _0, dst0, dst1, s + F * 2); - SynetFusedLayerForward4(src, _bias0, _scale1, _bias1, _0, dst0, dst1, s + F * 3); - } - for (; s < partial; s += F) - SynetFusedLayerForward4(src, _bias0, _scale1, _bias1, _0, dst0, dst1, s); - } - for (; s < spatial; ++s) - Base::SynetFusedLayerForward4(src[s], bias0[c], scale1[0], bias1[0], dst0 + s, dst1 + s); - src += spatial; - dst0 += spatial; - dst1 += spatial; - } - } - - SIMD_INLINE void SynetFusedLayerForward4Nchw(const float * src, const float * bias0, const float * scale1, const float * bias1, size_t channels, size_t spatial, float * dst) - { - if (Aligned(src) && Aligned(spatial, F) && Aligned(dst)) - SynetFusedLayerForward4Nchw(src, bias0, scale1, bias1, channels, spatial, dst); - else - SynetFusedLayerForward4Nchw(src, bias0, scale1, bias1, channels, spatial, dst); - } - - template void SynetFusedLayerForward4Nhwc(const float * src, const float * bias0, const float * scale1, const float * bias1, size_t channels, size_t spatial, float * dst0) - { - if (align) - assert(Aligned(src) && Aligned(bias0) && Aligned(channels, F) && Aligned(dst0)); - - float32x4_t _bias1 = vdupq_n_f32(bias1[0]); - float32x4_t _scale1 = vdupq_n_f32(scale1[0]); - float32x4_t _0 = vdupq_n_f32(0.0f); - size_t aligned = AlignLo(channels, QF); - size_t partial = AlignLo(channels, F); - float * dst1 = dst0 + channels; - for (size_t s = 0; s < spatial; ++s) - { - size_t c = 0; - if (partial) - { - for (; c < aligned; c += QF) - { - SynetFusedLayerForward4(src, bias0, _scale1, _bias1, _0, dst0, dst1, c + F * 0); - SynetFusedLayerForward4(src, bias0, _scale1, _bias1, _0, dst0, dst1, c + F * 1); - SynetFusedLayerForward4(src, bias0, _scale1, _bias1, _0, dst0, dst1, c + F * 2); - SynetFusedLayerForward4(src, bias0, _scale1, _bias1, _0, dst0, dst1, c + F * 3); - } - for (; c < partial; c += F) - SynetFusedLayerForward4(src, bias0, _scale1, _bias1, _0, dst0, dst1, c); - } - for (; c < channels; ++c) - Base::SynetFusedLayerForward4(src[c], bias0[c], scale1[0], bias1[0], dst0 + c, dst1 + c); - src += channels; - dst0 += 2 * channels; - dst1 += 2 * channels; - } - } - - SIMD_INLINE void SynetFusedLayerForward4Nhwc(const float * src, const float * bias0, const float * scale1, const float * bias1, size_t channels, size_t spatial, float * dst) - { - if (Aligned(src) && Aligned(bias0) && Aligned(channels, F) && Aligned(dst)) - SynetFusedLayerForward4Nhwc(src, bias0, scale1, bias1, channels, spatial, dst); - else - SynetFusedLayerForward4Nhwc(src, bias0, scale1, bias1, channels, spatial, dst); - } - - template void SynetFusedLayerForward4Nchw4cA(const float * 
src, const float * bias0, const float * scale1, const float * bias1, size_t channels, size_t spatial, float * dst0) - { - if (align) - assert(Aligned(src) && Aligned(dst0)); - - float32x4_t _bias1 = vdupq_n_f32(bias1[0]); - float32x4_t _scale1 = vdupq_n_f32(scale1[0]); - float32x4_t _0 = vdupq_n_f32(0.0f); - size_t spatialF = spatial * F; - size_t spatial4F = AlignLo(spatial, 4) * F; - float * dst1 = dst0 + channels * spatial; - for (size_t c = 0; c < channels; c += F) - { - float32x4_t _bias0 = Load(bias0 + c); - size_t s = 0; - for (; s < spatial4F; s += 4 * F) - { - SynetFusedLayerForward4(src, _bias0, _scale1, _bias1, _0, dst0, dst1, s + F * 0); - SynetFusedLayerForward4(src, _bias0, _scale1, _bias1, _0, dst0, dst1, s + F * 1); - SynetFusedLayerForward4(src, _bias0, _scale1, _bias1, _0, dst0, dst1, s + F * 2); - SynetFusedLayerForward4(src, _bias0, _scale1, _bias1, _0, dst0, dst1, s + F * 3); - } - for (; s < spatialF; s += F) - SynetFusedLayerForward4(src, _bias0, _scale1, _bias1, _0, dst0, dst1, s); - src += spatialF; - dst0 += spatialF; - dst1 += spatialF; - } - } - - SIMD_INLINE void SynetFusedLayerForward4Nchw4cA(const float * src, const float * bias0, const float * scale1, const float * bias1, size_t channels, size_t spatial, float * dst) - { - assert(Aligned(channels, F)); - if (Aligned(src) && Aligned(dst)) - SynetFusedLayerForward4Nchw4cA(src, bias0, scale1, bias1, channels, spatial, dst); - else - SynetFusedLayerForward4Nchw4cA(src, bias0, scale1, bias1, channels, spatial, dst); - } - - void SynetFusedLayerForward4(const float * src, const float * bias0, const float * scale1, const float * bias1, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format) - { - if (Base::NchwCompatible(channels, spatial, format)) - SynetFusedLayerForward4Nchw(src, bias0, scale1, bias1, channels, spatial, dst); - else if (Base::NhwcCompatible(channels, spatial, format)) - SynetFusedLayerForward4Nhwc(src, bias0, scale1, bias1, channels, spatial, dst); - else if (format == SimdTensorFormatNchw4c && Aligned(channels, F)) - SynetFusedLayerForward4Nchw4cA(src, bias0, scale1, bias1, channels, spatial, dst); - else - Base::SynetFusedLayerForward4(src, bias0, scale1, bias1, channels, spatial, dst, format); - } - - //--------------------------------------------------------------------- - - template SIMD_INLINE void SynetFusedLayerForward8(const float * src0, const float * src1, const float * src2, float * dst, size_t offset) - { - Store(dst + offset, vmlaq_f32(Load(src0 + offset), Load(src1 + offset), Load(src2 + offset))); - } - - template SIMD_INLINE void SynetFusedLayerForward8(const float * src0, const float * src1, const float32x4_t & src2, float * dst, size_t offset) - { - Store(dst + offset, vmlaq_f32(Load(src0 + offset), Load(src1 + offset), src2)); - } - - template void SynetFusedLayerForward8Nchw(const float * src0, const float * src1, const float * src2, size_t channels, size_t spatial, float * dst) - { - if (align) - assert(Aligned(src0) && Aligned(src1) && Aligned(spatial, F) && Aligned(dst)); - - size_t aligned = AlignLo(spatial, QF); - size_t partial = AlignLo(spatial, F); - for (size_t c = 0; c < channels; ++c) - { - size_t s = 0; - if (partial) - { - float32x4_t _src2 = vdupq_n_f32(src2[c]); - for (; s < aligned; s += QF) - { - SynetFusedLayerForward8(src0, src1, _src2, dst, s + F * 0); - SynetFusedLayerForward8(src0, src1, _src2, dst, s + F * 1); - SynetFusedLayerForward8(src0, src1, _src2, dst, s + F * 2); - SynetFusedLayerForward8(src0, src1, _src2, dst, s + F * 3); - 
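/* The loops above follow this file's standard three-tier tiling: a 4x-unrolled
   vector body over QF = 4*F elements, a one-vector loop up to the F-aligned
   bound, then a scalar tail. A self-contained sketch of the same Forward8 body,
   dst[i] = src0[i] + src1[i]*s2, under that assumption (illustrative names,
   not the Simd API): */
#include <arm_neon.h>
#include <stddef.h>
static void MulAddRow(const float* src0, const float* src1, float s2, size_t n, float* dst)
{
    const size_t F = 4, QF = 4 * F;
    float32x4_t v2 = vdupq_n_f32(s2);      // broadcast the per-channel scalar
    size_t i = 0;
    for (; i + QF <= n; i += QF)           // 4x-unrolled vector body
    {
        vst1q_f32(dst + i + 0 * F, vmlaq_f32(vld1q_f32(src0 + i + 0 * F), vld1q_f32(src1 + i + 0 * F), v2));
        vst1q_f32(dst + i + 1 * F, vmlaq_f32(vld1q_f32(src0 + i + 1 * F), vld1q_f32(src1 + i + 1 * F), v2));
        vst1q_f32(dst + i + 2 * F, vmlaq_f32(vld1q_f32(src0 + i + 2 * F), vld1q_f32(src1 + i + 2 * F), v2));
        vst1q_f32(dst + i + 3 * F, vmlaq_f32(vld1q_f32(src0 + i + 3 * F), vld1q_f32(src1 + i + 3 * F), v2));
    }
    for (; i + F <= n; i += F)             // single-vector remainder
        vst1q_f32(dst + i, vmlaq_f32(vld1q_f32(src0 + i), vld1q_f32(src1 + i), v2));
    for (; i < n; ++i)                     // scalar tail
        dst[i] = src0[i] + src1[i] * s2;
}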
} - for (; s < partial; s += F) - SynetFusedLayerForward8(src0, src1, _src2, dst, s); - } - for (; s < spatial; ++s) - dst[s] = Base::SynetFusedLayerForward8(src0[s], src1[s], src2[c]); - src0 += spatial; - src1 += spatial; - dst += spatial; - } - } - - SIMD_INLINE void SynetFusedLayerForward8Nchw(const float * src0, const float * src1, const float * src2, size_t channels, size_t spatial, float * dst) - { - if (Aligned(src0) && Aligned(src1) && Aligned(spatial, F) && Aligned(dst)) - SynetFusedLayerForward8Nchw(src0, src1, src2, channels, spatial, dst); - else - SynetFusedLayerForward8Nchw(src0, src1, src2, channels, spatial, dst); - } - - template void SynetFusedLayerForward8Nhwc(const float * src0, const float * src1, const float * src2, size_t channels, size_t spatial, float * dst) - { - if (align) - assert(Aligned(src0) && Aligned(src1) && Aligned(src2) && Aligned(channels, F) && Aligned(dst)); - - size_t aligned = AlignLo(channels, QF); - size_t partial = AlignLo(channels, F); - for (size_t s = 0; s < spatial; ++s) - { - size_t c = 0; - if (partial) - { - for (; c < aligned; c += QF) - { - SynetFusedLayerForward8(src0, src1, src2, dst, c + F * 0); - SynetFusedLayerForward8(src0, src1, src2, dst, c + F * 1); - SynetFusedLayerForward8(src0, src1, src2, dst, c + F * 2); - SynetFusedLayerForward8(src0, src1, src2, dst, c + F * 3); - } - for (; c < partial; c += F) - SynetFusedLayerForward8(src0, src1, src2, dst, c); - } - for (; c < channels; ++c) - dst[c] = Base::SynetFusedLayerForward8(src0[c], src1[c], src2[c]); - src0 += channels; - src1 += channels; - dst += channels; - } - } - - SIMD_INLINE void SynetFusedLayerForward8Nhwc(const float * src0, const float * src1, const float * src2, size_t channels, size_t spatial, float * dst) - { - if (Aligned(src0) && Aligned(src1) && Aligned(src2) && Aligned(channels, F) && Aligned(dst)) - SynetFusedLayerForward8Nhwc(src0, src1, src2, channels, spatial, dst); - else - SynetFusedLayerForward8Nhwc(src0, src1, src2, channels, spatial, dst); - } - - template void SynetFusedLayerForward8Nchw4c(const float * src0, const float * src1, const float * src2, size_t channels, size_t spatial, float * dst) - { - if (align) - assert(Aligned(src0) && Aligned(src1) && Aligned(dst)); - - size_t spatialF = spatial * F; - size_t spatial4F = AlignLo(spatial, 4)*F; - for (size_t c = 0; c < channels; c += F) - { - float32x4_t _src2 = Load(src2 + c); - size_t s = 0; - for (; s < spatial4F; s += 4 * F) - { - SynetFusedLayerForward8(src0, src1, _src2, dst, s + F * 0); - SynetFusedLayerForward8(src0, src1, _src2, dst, s + F * 1); - SynetFusedLayerForward8(src0, src1, _src2, dst, s + F * 2); - SynetFusedLayerForward8(src0, src1, _src2, dst, s + F * 3); - } - for (; s < spatialF; s += F) - SynetFusedLayerForward8(src0, src1, _src2, dst, s); - src0 += spatialF; - src1 += spatialF; - dst += spatialF; - } - } - - SIMD_INLINE void SynetFusedLayerForward8Nchw4c(const float * src0, const float * src1, const float * src2, size_t channels, size_t spatial, float * dst) - { - if (Aligned(src0) && Aligned(src1) && Aligned(dst)) - SynetFusedLayerForward8Nchw4c(src0, src1, src2, channels, spatial, dst); - else - SynetFusedLayerForward8Nchw4c(src0, src1, src2, channels, spatial, dst); - } - - void SynetFusedLayerForward8(const float * src0, const float * src1, const float * src2, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format) - { - if (Base::NchwCompatible(channels, spatial, format)) - SynetFusedLayerForward8Nchw(src0, src1, src2, channels, spatial, dst); - else 
if (Base::NhwcCompatible(channels, spatial, format)) - SynetFusedLayerForward8Nhwc(src0, src1, src2, channels, spatial, dst); - else if (format == SimdTensorFormatNchw4c) - SynetFusedLayerForward8Nchw4c(src0, src1, src2, channels, spatial, dst); - else - Base::SynetFusedLayerForward8(src0, src1, src2, channels, spatial, dst, format); - } - - //--------------------------------------------------------------------- - - template SIMD_INLINE void SynetFusedLayerForward9(const float * src, const float * scale, const float * bias, float * dst0, float * dst1, size_t offset) - { - float32x4_t _src = Load(src + offset); - Store(dst0 + offset, vmaxq_f32(vdupq_n_f32(0.0f), vmlaq_f32(Load(bias + offset), _src, Load(scale + offset)))); - Store(dst1 + offset, _src); - } - - template SIMD_INLINE void SynetFusedLayerForward9(const float * src, const float * scale, const float * bias, float * dst0, size_t offset) - { - float32x4_t _src = Load(src + offset); - Store(dst0 + offset, vmaxq_f32(vdupq_n_f32(0.0f), vmlaq_f32(Load(bias + offset), _src, Load(scale + offset)))); - } - - template SIMD_INLINE void SynetFusedLayerForward9(const float * src, const float32x4_t & scale, const float32x4_t & bias, float * dst0, float * dst1, size_t offset) - { - float32x4_t _src = Load(src + offset); - Store(dst0 + offset, vmaxq_f32(vdupq_n_f32(0.0f), vmlaq_f32(bias, _src, scale))); - Store(dst1 + offset, _src); - } - - template SIMD_INLINE void SynetFusedLayerForward9(const float * src, const float32x4_t & scale, const float32x4_t & bias, float * dst0, size_t offset) - { - float32x4_t _src = Load(src + offset); - Store(dst0 + offset, vmaxq_f32(vdupq_n_f32(0.0f), vmlaq_f32(bias, _src, scale))); - } - - template void SynetFusedLayerForward9Nchw(const float * src0, const float * src1, const float * scale0, const float * bias0, size_t channels0, size_t channels1, size_t spatial, float * dst0, float * dst1) - { - if (align) - assert(Aligned(src0) && Aligned(src1) && Aligned(spatial, F) && Aligned(dst0) && Aligned(dst1)); - const float * scale1 = scale0 + channels0; - const float * bias1 = bias0 + channels0; - size_t aligned = AlignLo(spatial, QF); - size_t partial = AlignLo(spatial, F); - if (dst1) - { - for (size_t c = 0; c < channels0; ++c) - { - size_t s = 0; - if (partial) - { - float32x4_t _scale0 = vdupq_n_f32(scale0[c]); - float32x4_t _bias0 = vdupq_n_f32(bias0[c]); - for (; s < aligned; s += QF) - { - SynetFusedLayerForward9(src0, _scale0, _bias0, dst0, dst1, s + 0 * F); - SynetFusedLayerForward9(src0, _scale0, _bias0, dst0, dst1, s + 1 * F); - SynetFusedLayerForward9(src0, _scale0, _bias0, dst0, dst1, s + 2 * F); - SynetFusedLayerForward9(src0, _scale0, _bias0, dst0, dst1, s + 3 * F); - } - for (; s < partial; s += F) - SynetFusedLayerForward9(src0, _scale0, _bias0, dst0, dst1, s); - } - for (; s < spatial; ++s) - dst0[s] = Base::SynetFusedLayerForward9(src0[s], scale0[c], bias0[c]), dst1[s] = src0[s]; - src0 += spatial; - dst0 += spatial; - dst1 += spatial; - } - for (size_t c = 0; c < channels1; ++c) - { - size_t s = 0; - if (partial) - { - float32x4_t _scale1 = vdupq_n_f32(scale1[c]); - float32x4_t _bias1 = vdupq_n_f32(bias1[c]); - for (; s < aligned; s += QF) - { - SynetFusedLayerForward9(src1, _scale1, _bias1, dst0, dst1, s + 0 * F); - SynetFusedLayerForward9(src1, _scale1, _bias1, dst0, dst1, s + 1 * F); - SynetFusedLayerForward9(src1, _scale1, _bias1, dst0, dst1, s + 2 * F); - SynetFusedLayerForward9(src1, _scale1, _bias1, dst0, dst1, s + 3 * F); - } - for (; s < partial; s += F) - SynetFusedLayerForward9(src1, 
_scale1, _bias1, dst0, dst1, s); - } - for (; s < spatial; ++s) - dst0[s] = Base::SynetFusedLayerForward9(src1[s], scale1[c], bias1[c]), dst1[s] = src1[s]; - src1 += spatial; - dst0 += spatial; - dst1 += spatial; - } - } - else - { - for (size_t c = 0; c < channels0; ++c) - { - size_t s = 0; - if (partial) - { - float32x4_t _scale0 = vdupq_n_f32(scale0[c]); - float32x4_t _bias0 = vdupq_n_f32(bias0[c]); - for (; s < aligned; s += QF) - { - SynetFusedLayerForward9(src0, _scale0, _bias0, dst0, s + 0 * F); - SynetFusedLayerForward9(src0, _scale0, _bias0, dst0, s + 1 * F); - SynetFusedLayerForward9(src0, _scale0, _bias0, dst0, s + 2 * F); - SynetFusedLayerForward9(src0, _scale0, _bias0, dst0, s + 3 * F); - } - for (; s < partial; s += F) - SynetFusedLayerForward9(src0, _scale0, _bias0, dst0, s); - } - for (; s < spatial; ++s) - dst0[s] = Base::SynetFusedLayerForward9(src0[s], scale0[c], bias0[c]); - src0 += spatial; - dst0 += spatial; - } - for (size_t c = 0; c < channels1; ++c) - { - size_t s = 0; - if (partial) - { - float32x4_t _scale1 = vdupq_n_f32(scale1[c]); - float32x4_t _bias1 = vdupq_n_f32(bias1[c]); - for (; s < aligned; s += QF) - { - SynetFusedLayerForward9(src1, _scale1, _bias1, dst0, s + 0 * F); - SynetFusedLayerForward9(src1, _scale1, _bias1, dst0, s + 1 * F); - SynetFusedLayerForward9(src1, _scale1, _bias1, dst0, s + 2 * F); - SynetFusedLayerForward9(src1, _scale1, _bias1, dst0, s + 3 * F); - } - for (; s < partial; s += F) - SynetFusedLayerForward9(src1, _scale1, _bias1, dst0, s); - } - for (; s < spatial; ++s) - dst0[s] = Base::SynetFusedLayerForward9(src1[s], scale1[c], bias1[c]); - src1 += spatial; - dst0 += spatial; - } - } - } - - SIMD_INLINE void SynetFusedLayerForward9Nchw(const float * src0, const float * src1, const float * scale, const float * bias, size_t channels0, size_t channels1, size_t spatial, float * dst0, float * dst1) - { - if (Aligned(src0) && Aligned(src1) && Aligned(spatial, F) && Aligned(dst0) && Aligned(dst1)) - SynetFusedLayerForward9Nchw(src0, src1, scale, bias, channels0, channels1, spatial, dst0, dst1); - else - SynetFusedLayerForward9Nchw(src0, src1, scale, bias, channels0, channels1, spatial, dst0, dst1); - } - - template void SynetFusedLayerForward9Nhwc(const float * src0, const float * src1, const float * scale0, const float * bias0, size_t channels0, size_t channels1, size_t spatial, float * dst0, float * dst1) - { - if (align) - assert(Aligned(src0) && Aligned(src1) && Aligned(scale0) && Aligned(bias0) && Aligned(channels0, F) && Aligned(channels1, F) && Aligned(dst0) && Aligned(dst1)); - const float * scale1 = scale0 + channels0; - const float * bias1 = bias0 + channels0; - size_t aligned0 = AlignLo(channels0, QF); - size_t partial0 = AlignLo(channels0, F); - size_t aligned1 = AlignLo(channels1, QF); - size_t partial1 = AlignLo(channels1, F); - if (dst1) - { - for (size_t s = 0; s < spatial; ++s) - { - size_t c = 0; - for (; c < aligned0; c += QF) - { - SynetFusedLayerForward9(src0, scale0, bias0, dst0, dst1, c + 0 * F); - SynetFusedLayerForward9(src0, scale0, bias0, dst0, dst1, c + 1 * F); - SynetFusedLayerForward9(src0, scale0, bias0, dst0, dst1, c + 2 * F); - SynetFusedLayerForward9(src0, scale0, bias0, dst0, dst1, c + 3 * F); - } - for (; c < partial0; c += F) - SynetFusedLayerForward9(src0, scale0, bias0, dst0, dst1, c); - for (; c < channels0; ++c) - dst0[c] = Base::SynetFusedLayerForward9(src0[c], scale0[c], bias0[c]), dst1[c] = src0[c]; - src0 += channels0; - dst0 += channels0; - dst1 += channels0; - c = 0; - for (; c < aligned1; c += 
QF) - { - SynetFusedLayerForward9(src1, scale1, bias1, dst0, dst1, c + 0 * F); - SynetFusedLayerForward9(src1, scale1, bias1, dst0, dst1, c + 1 * F); - SynetFusedLayerForward9(src1, scale1, bias1, dst0, dst1, c + 2 * F); - SynetFusedLayerForward9(src1, scale1, bias1, dst0, dst1, c + 3 * F); - } - for (; c < partial1; c += F) - SynetFusedLayerForward9(src1, scale1, bias1, dst0, dst1, c); - for (; c < channels1; ++c) - dst0[c] = Base::SynetFusedLayerForward9(src1[c], scale1[c], bias1[c]), dst1[c] = src1[c]; - src1 += channels1; - dst0 += channels1; - dst1 += channels1; - } - } - else - { - for (size_t s = 0; s < spatial; ++s) - { - size_t c = 0; - for (; c < aligned0; c += QF) - { - SynetFusedLayerForward9(src0, scale0, bias0, dst0, c + 0 * F); - SynetFusedLayerForward9(src0, scale0, bias0, dst0, c + 1 * F); - SynetFusedLayerForward9(src0, scale0, bias0, dst0, c + 2 * F); - SynetFusedLayerForward9(src0, scale0, bias0, dst0, c + 3 * F); - } - for (; c < partial0; c += F) - SynetFusedLayerForward9(src0, scale0, bias0, dst0, c); - for (; c < channels0; ++c) - dst0[c] = Base::SynetFusedLayerForward9(src0[c], scale0[c], bias0[c]); - src0 += channels0; - dst0 += channels0; - c = 0; - for (; c < aligned1; c += QF) - { - SynetFusedLayerForward9(src1, scale1, bias1, dst0, c + 0 * F); - SynetFusedLayerForward9(src1, scale1, bias1, dst0, c + 1 * F); - SynetFusedLayerForward9(src1, scale1, bias1, dst0, c + 2 * F); - SynetFusedLayerForward9(src1, scale1, bias1, dst0, c + 3 * F); - } - for (; c < partial1; c += F) - SynetFusedLayerForward9(src1, scale1, bias1, dst0, c); - for (; c < channels1; ++c) - dst0[c] = Base::SynetFusedLayerForward9(src1[c], scale1[c], bias1[c]); - src1 += channels1; - dst0 += channels1; - } - } - } - - SIMD_INLINE void SynetFusedLayerForward9Nhwc(const float * src0, const float * src1, const float * scale, const float * bias, size_t channels0, size_t channels1, size_t spatial, float * dst0, float * dst1) - { - if (Aligned(src0) && Aligned(src1) && Aligned(scale) && Aligned(bias) && Aligned(channels0, F) && Aligned(channels1, F) && Aligned(dst0) && Aligned(dst1)) - SynetFusedLayerForward9Nhwc(src0, src1, scale, bias, channels0, channels1, spatial, dst0, dst1); - else - SynetFusedLayerForward9Nhwc(src0, src1, scale, bias, channels0, channels1, spatial, dst0, dst1); - } - - template void SynetFusedLayerForward9Nchw4cA(const float * src0, const float * src1, const float * scale0, const float * bias0, size_t channels0, size_t channels1, size_t spatial, float * dst0, float * dst1) - { - if (align) - assert(Aligned(src0) && Aligned(src1) && Aligned(dst0) && Aligned(dst1)); - const float * scale1 = scale0 + channels0; - const float * bias1 = bias0 + channels0; - size_t spatialF = spatial * F; - size_t spatial4F = AlignLo(spatial, 4)*F; - if (dst1) - { - for (size_t c = 0; c < channels0; c += F) - { - float32x4_t _scale0 = Load(scale0 + c); - float32x4_t _bias0 = Load(bias0 + c); - size_t s = 0; - for (; s < spatial4F; s += 4 * F) - { - SynetFusedLayerForward9(src0, _scale0, _bias0, dst0, dst1, s + F * 0); - SynetFusedLayerForward9(src0, _scale0, _bias0, dst0, dst1, s + F * 1); - SynetFusedLayerForward9(src0, _scale0, _bias0, dst0, dst1, s + F * 2); - SynetFusedLayerForward9(src0, _scale0, _bias0, dst0, dst1, s + F * 3); - } - for (; s < spatialF; s += F) - SynetFusedLayerForward9(src0, _scale0, _bias0, dst0, dst1, s); - src0 += spatialF; - dst0 += spatialF; - dst1 += spatialF; - } - for (size_t c = 0; c < channels1; c += F) - { - float32x4_t _scale1 = Load(scale1 + c); - float32x4_t 
_bias1 = Load(bias1 + c); - size_t s = 0; - for (; s < spatial4F; s += 4 * F) - { - SynetFusedLayerForward9(src1, _scale1, _bias1, dst0, dst1, s + F * 0); - SynetFusedLayerForward9(src1, _scale1, _bias1, dst0, dst1, s + F * 1); - SynetFusedLayerForward9(src1, _scale1, _bias1, dst0, dst1, s + F * 2); - SynetFusedLayerForward9(src1, _scale1, _bias1, dst0, dst1, s + F * 3); - } - for (; s < spatialF; s += F) - SynetFusedLayerForward9(src1, _scale1, _bias1, dst0, dst1, s); - src1 += spatialF; - dst0 += spatialF; - dst1 += spatialF; - } - } - else - { - for (size_t c = 0; c < channels0; c += F) - { - float32x4_t _scale0 = Load(scale0 + c); - float32x4_t _bias0 = Load(bias0 + c); - size_t s = 0; - for (; s < spatial4F; s += 4 * F) - { - SynetFusedLayerForward9(src0, _scale0, _bias0, dst0, s + F * 0); - SynetFusedLayerForward9(src0, _scale0, _bias0, dst0, s + F * 1); - SynetFusedLayerForward9(src0, _scale0, _bias0, dst0, s + F * 2); - SynetFusedLayerForward9(src0, _scale0, _bias0, dst0, s + F * 3); - } - for (; s < spatialF; s += F) - SynetFusedLayerForward9(src0, _scale0, _bias0, dst0, s); - src0 += spatialF; - dst0 += spatialF; - } - for (size_t c = 0; c < channels1; c += F) - { - float32x4_t _scale1 = Load(scale1 + c); - float32x4_t _bias1 = Load(bias1 + c); - size_t s = 0; - for (; s < spatial4F; s += 4 * F) - { - SynetFusedLayerForward9(src1, _scale1, _bias1, dst0, s + F * 0); - SynetFusedLayerForward9(src1, _scale1, _bias1, dst0, s + F * 1); - SynetFusedLayerForward9(src1, _scale1, _bias1, dst0, s + F * 2); - SynetFusedLayerForward9(src1, _scale1, _bias1, dst0, s + F * 3); - } - for (; s < spatialF; s += F) - SynetFusedLayerForward9(src1, _scale1, _bias1, dst0, s); - src1 += spatialF; - dst0 += spatialF; - } - } - } - - SIMD_INLINE void SynetFusedLayerForward9Nchw4cA(const float * src0, const float * src1, const float * scale, const float * bias, size_t channels0, size_t channels1, size_t spatial, float * dst0, float * dst1) - { - assert(Aligned(channels0, F)); - if (Aligned(src0) && Aligned(src1) && Aligned(dst0) && Aligned(dst1)) - SynetFusedLayerForward9Nchw4cA(src0, src1, scale, bias, channels0, channels1, spatial, dst0, dst1); - else - SynetFusedLayerForward9Nchw4cA(src0, src1, scale, bias, channels0, channels1, spatial, dst0, dst1); - } - - void SynetFusedLayerForward9(const float * src0, const float * src1, const float * scale, const float * bias, size_t channels0, size_t channels1, size_t spatial, float * dst0, float * dst1, SimdTensorFormatType format) - { - if (Base::NchwCompatible(channels0 + channels1, spatial, format)) - SynetFusedLayerForward9Nchw(src0, src1, scale, bias, channels0, channels1, spatial, dst0, dst1); - else if (Base::NhwcCompatible(channels0 + channels1, spatial, format)) - SynetFusedLayerForward9Nhwc(src0, src1, scale, bias, channels0, channels1, spatial, dst0, dst1); - else if (format == SimdTensorFormatNchw4c && Aligned(channels0, F)) - SynetFusedLayerForward9Nchw4cA(src0, src1, scale, bias, channels0, channels1, spatial, dst0, dst1); - else - Base::SynetFusedLayerForward9(src0, src1, scale, bias, channels0, channels1, spatial, dst0, dst1, format); - } - } -#endif// SIMD_NEON_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdNeonSynetMergedConvolution32f.cpp b/src/3rd/Simd/Simd/SimdNeonSynetMergedConvolution32f.cpp deleted file mode 100644 index 3bf9b534..00000000 --- a/src/3rd/Simd/Simd/SimdNeonSynetMergedConvolution32f.cpp +++ /dev/null @@ -1,1428 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2019 Yermalayeu Ihar. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdSynetMergedConvolution32f.h" -#include "Simd/SimdSynetConvolution32fCommon.h" -#include "Simd/SimdUpdate.h" -#include "Simd/SimdCpu.h" - -namespace Simd -{ -#if defined(SIMD_NEON_ENABLE) - namespace Neon - { - template SIMD_INLINE void InputConvolution1x1_2x6(const float * src0, size_t srcC, - const float * weight, const float32x4_t * bias, const float32x4_t * params, float * dst0, float * dst1) - { - float32x4_t d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, s0, w0, w1; - d00 = bias[0], d01 = bias[1]; - d10 = bias[0], d11 = bias[1]; - d20 = bias[0], d21 = bias[1]; - d30 = bias[0], d31 = bias[1]; - d40 = bias[0], d41 = bias[1]; - d50 = bias[0], d51 = bias[1]; - const float * src1 = src0 + 1 * srcC; - const float * src2 = src0 + 2 * srcC; - const float * src3 = src0 + 3 * srcC; - const float * src4 = src0 + 4 * srcC; - const float * src5 = src0 + 5 * srcC; - for (size_t sc = 0; sc < srcC; ++sc) - { - w0 = Load(weight + 0); - w1 = Load(weight + F); - s0 = vld1q_dup_f32(src0 + sc); - d00 = vmlaq_f32(d00, s0, w0); - d01 = vmlaq_f32(d01, s0, w1); - s0 = vld1q_dup_f32(src1 + sc); - d10 = vmlaq_f32(d10, s0, w0); - d11 = vmlaq_f32(d11, s0, w1); - s0 = vld1q_dup_f32(src2 + sc); - d20 = vmlaq_f32(d20, s0, w0); - d21 = vmlaq_f32(d21, s0, w1); - s0 = vld1q_dup_f32(src3 + sc); - d30 = vmlaq_f32(d30, s0, w0); - d31 = vmlaq_f32(d31, s0, w1); - s0 = vld1q_dup_f32(src4 + sc); - d40 = vmlaq_f32(d40, s0, w0); - d41 = vmlaq_f32(d41, s0, w1); - s0 = vld1q_dup_f32(src5 + sc); - d50 = vmlaq_f32(d50, s0, w0); - d51 = vmlaq_f32(d51, s0, w1); - weight += DF; - } - Store(dst0 + 0 * F, Activate(d00, params, 0)); - Store(dst1 + 0 * F, Activate(d01, params, 1)); - Store(dst0 + 1 * F, Activate(d10, params, 0)); - Store(dst1 + 1 * F, Activate(d11, params, 1)); - Store(dst0 + 2 * F, Activate(d20, params, 0)); - Store(dst1 + 2 * F, Activate(d21, params, 1)); - Store(dst0 + 3 * F, Activate(d30, params, 0)); - Store(dst1 + 3 * F, Activate(d31, params, 1)); - Store(dst0 + 4 * F, Activate(d40, params, 0)); - Store(dst1 + 4 * F, Activate(d41, params, 1)); - Store(dst0 + 5 * F, Activate(d50, params, 0)); - Store(dst1 + 5 * F, Activate(d51, params, 1)); - } - - template SIMD_INLINE void InputConvolution1x1_2xM(const float * src0, size_t srcC, - const float * weight, const float32x4_t * bias, const float32x4_t * params, float * dst0, float * dst1) - { - float32x4_t d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, s0, w0, w1; - if 
(M > 0) d00 = bias[0], d01 = bias[1]; - if (M > 1) d10 = bias[0], d11 = bias[1]; - if (M > 2) d20 = bias[0], d21 = bias[1]; - if (M > 3) d30 = bias[0], d31 = bias[1]; - if (M > 4) d40 = bias[0], d41 = bias[1]; - if (M > 5) d50 = bias[0], d51 = bias[1]; - const float * src1 = src0 + 1 * srcC; - const float * src2 = src0 + 2 * srcC; - const float * src3 = src0 + 3 * srcC; - const float * src4 = src0 + 4 * srcC; - const float * src5 = src0 + 5 * srcC; - for (size_t sc = 0; sc < srcC; ++sc) - { - w0 = Load(weight + 0); - w1 = Load(weight + F); - if (M > 0) s0 = vld1q_dup_f32(src0 + sc), d00 = vmlaq_f32(d00, s0, w0), d01 = vmlaq_f32(d01, s0, w1); - if (M > 1) s0 = vld1q_dup_f32(src1 + sc), d10 = vmlaq_f32(d10, s0, w0), d11 = vmlaq_f32(d11, s0, w1); - if (M > 2) s0 = vld1q_dup_f32(src2 + sc), d20 = vmlaq_f32(d20, s0, w0), d21 = vmlaq_f32(d21, s0, w1); - if (M > 3) s0 = vld1q_dup_f32(src3 + sc), d30 = vmlaq_f32(d30, s0, w0), d31 = vmlaq_f32(d31, s0, w1); - if (M > 4) s0 = vld1q_dup_f32(src4 + sc), d40 = vmlaq_f32(d40, s0, w0), d41 = vmlaq_f32(d41, s0, w1); - if (M > 5) s0 = vld1q_dup_f32(src5 + sc), d50 = vmlaq_f32(d50, s0, w0), d51 = vmlaq_f32(d51, s0, w1); - weight += DF; - } - if (M > 0) Store(dst0 + 0 * F, Activate(d00, params, 0)), Store(dst1 + 0 * F, Activate(d01, params, 1)); - if (M > 1) Store(dst0 + 1 * F, Activate(d10, params, 0)), Store(dst1 + 1 * F, Activate(d11, params, 1)); - if (M > 2) Store(dst0 + 2 * F, Activate(d20, params, 0)), Store(dst1 + 2 * F, Activate(d21, params, 1)); - if (M > 3) Store(dst0 + 3 * F, Activate(d30, params, 0)), Store(dst1 + 3 * F, Activate(d31, params, 1)); - if (M > 4) Store(dst0 + 4 * F, Activate(d40, params, 0)), Store(dst1 + 4 * F, Activate(d41, params, 1)); - if (M > 5) Store(dst0 + 5 * F, Activate(d50, params, 0)), Store(dst1 + 5 * F, Activate(d51, params, 1)); - } - - typedef void(*InputConvolution1x1_2xM_Ptr)(const float * src0, size_t srcC, const float * weight, const float32x4_t * bias, const float32x4_t * params, float * dst0, float * dst1); - - template InputConvolution1x1_2xM_Ptr GetInputConvolution1x1_2xM(size_t M) - { - switch (M) - { - case 0: return InputConvolution1x1_2xM; - case 1: return InputConvolution1x1_2xM; - case 2: return InputConvolution1x1_2xM; - case 3: return InputConvolution1x1_2xM; - case 4: return InputConvolution1x1_2xM; - case 5: return InputConvolution1x1_2xM; - } - assert(0); - return NULL; - } - - template SIMD_INLINE void InputConvolution1x1_1x6(const float * src0, size_t srcC, - const float * weight, const float32x4_t * bias, const float32x4_t * params, float * dst0) - { - float32x4_t d00, d10, d20, d30, d40, d50, s0, w0; - d00 = bias[0]; - d10 = bias[0]; - d20 = bias[0]; - d30 = bias[0]; - d40 = bias[0]; - d50 = bias[0]; - const float * src1 = src0 + 1 * srcC; - const float * src2 = src0 + 2 * srcC; - const float * src3 = src0 + 3 * srcC; - const float * src4 = src0 + 4 * srcC; - const float * src5 = src0 + 5 * srcC; - for (size_t sc = 0; sc < srcC; ++sc) - { - w0 = Load(weight + 0); - s0 = vld1q_dup_f32(src0 + sc); - d00 = vmlaq_f32(d00, s0, w0); - s0 = vld1q_dup_f32(src1 + sc); - d10 = vmlaq_f32(d10, s0, w0); - s0 = vld1q_dup_f32(src2 + sc); - d20 = vmlaq_f32(d20, s0, w0); - s0 = vld1q_dup_f32(src3 + sc); - d30 = vmlaq_f32(d30, s0, w0); - s0 = vld1q_dup_f32(src4 + sc); - d40 = vmlaq_f32(d40, s0, w0); - s0 = vld1q_dup_f32(src5 + sc); - d50 = vmlaq_f32(d50, s0, w0); - weight += DF; - } - Store(dst0 + 0 * F, Activate(d00, params, 0)); - Store(dst0 + 1 * F, Activate(d10, params, 0)); - Store(dst0 + 2 * F, 
Activate(d20, params, 0)); - Store(dst0 + 3 * F, Activate(d30, params, 0)); - Store(dst0 + 4 * F, Activate(d40, params, 0)); - Store(dst0 + 5 * F, Activate(d50, params, 0)); - } - - template SIMD_INLINE void InputConvolution1x1_1xM(const float * src0, size_t srcC, - const float * weight, const float32x4_t * bias, const float32x4_t * params, float * dst0) - { - float32x4_t d00, d10,d20, d30, d40, d50, s0, w0; - if (M > 0) d00 = bias[0]; - if (M > 1) d10 = bias[0]; - if (M > 2) d20 = bias[0]; - if (M > 3) d30 = bias[0]; - if (M > 4) d40 = bias[0]; - if (M > 5) d50 = bias[0]; - const float * src1 = src0 + 1 * srcC; - const float * src2 = src0 + 2 * srcC; - const float * src3 = src0 + 3 * srcC; - const float * src4 = src0 + 4 * srcC; - const float * src5 = src0 + 5 * srcC; - for (size_t sc = 0; sc < srcC; ++sc) - { - w0 = Load(weight + 0); - if (M > 0) s0 = vld1q_dup_f32(src0 + sc), d00 = vmlaq_f32(d00, s0, w0); - if (M > 1) s0 = vld1q_dup_f32(src1 + sc), d10 = vmlaq_f32(d10, s0, w0); - if (M > 2) s0 = vld1q_dup_f32(src2 + sc), d20 = vmlaq_f32(d20, s0, w0); - if (M > 3) s0 = vld1q_dup_f32(src3 + sc), d30 = vmlaq_f32(d30, s0, w0); - if (M > 4) s0 = vld1q_dup_f32(src4 + sc), d40 = vmlaq_f32(d40, s0, w0); - if (M > 5) s0 = vld1q_dup_f32(src5 + sc), d50 = vmlaq_f32(d50, s0, w0); - weight += DF; - } - if (M > 0) Store(dst0 + 0 * F, Activate(d00, params, 0)); - if (M > 1) Store(dst0 + 1 * F, Activate(d10, params, 0)); - if (M > 2) Store(dst0 + 2 * F, Activate(d20, params, 0)); - if (M > 3) Store(dst0 + 3 * F, Activate(d30, params, 0)); - if (M > 4) Store(dst0 + 4 * F, Activate(d40, params, 0)); - if (M > 5) Store(dst0 + 5 * F, Activate(d50, params, 0)); - } - - typedef void(*InputConvolution1x1_1xM_Ptr)(const float * src0, size_t srcC, const float * weight, const float32x4_t * bias, const float32x4_t * params, float * dst0); - - template InputConvolution1x1_1xM_Ptr GetInputConvolution1x1_1xM(size_t M) - { - switch (M) - { - case 0: return InputConvolution1x1_1xM; - case 1: return InputConvolution1x1_1xM; - case 2: return InputConvolution1x1_1xM; - case 3: return InputConvolution1x1_1xM; - case 4: return InputConvolution1x1_1xM; - case 5: return InputConvolution1x1_1xM; - } - assert(0); - return NULL; - } - - template void InputConvolution1x1(const float * src, const SimdConvolutionParameters & p, - size_t dstC, size_t yBeg, size_t yEnd, const size_t bufH[2], const float * weight, const float * bias, const float * params, float * dst) - { - size_t srcH = p.srcH, srcW = p.srcW, srcC = p.srcC, dstW = p.dstW; - size_t dstM = (bufH[0] - 1), dstS = bufH[0] * dstW *F; - size_t dstCDF = AlignLo(dstC, DF); - float32x4_t _params[2], _bias[2]; - _params[0] = vdupq_n_f32(params[0]); - if (type == ::SimdConvolutionActivationRestrictRange || type == ::SimdConvolutionActivationHswish) - _params[1] = vdupq_n_f32(params[1]); -#ifdef SIMD_MERGECONV_MERGE_INPUT_ROWS_1X1 - size_t yInt = Simd::Max(yBeg, yEnd&(~dstM)), nBeg = yBeg * dstW, nInt = yInt * dstW, nEnd = yEnd * dstW; - size_t nInt6 = AlignLoAny(nInt - nBeg, 6) + nBeg, nEnd6 = AlignLoAny(nEnd - nInt, 6) + nInt, nIntTail = nInt - nInt6, nEndTail = nEnd - nEnd6; - InputConvolution1x1_2xM_Ptr tailInt_2 = GetInputConvolution1x1_2xM(nIntTail); - InputConvolution1x1_2xM_Ptr tailEnd_2 = GetInputConvolution1x1_2xM(nEndTail); -#else - size_t dstW6 = AlignLoAny(dstW, 6), wTail = dstW - dstW6; - InputConvolution1x1_2xM_Ptr tailW_2 = GetInputConvolution1x1_2xM(wTail); - InputConvolution1x1_1xM_Ptr tailW_1 = GetInputConvolution1x1_1xM(wTail); -#endif - - size_t dc = 0; - 
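/* InputConvolution1x1_2x6 above is a register-blocked 1x1-convolution
   micro-kernel: a 6 (pixels) x 2 (four-channel blocks) tile of accumulators
   stays in NEON registers for the whole source-channel loop, each step
   broadcasting one input scalar (vld1q_dup_f32) against the loaded weight
   vectors. A reduced 2-pixel x 1-block sketch of the same idea, assuming a
   srcC x 4 weight layout (illustrative, not the Simd API): */
#include <arm_neon.h>
#include <stddef.h>
static void Conv1x1_2x1(const float* pix0, const float* pix1, size_t srcC,
                        const float* weight, float* out0, float* out1)
{
    float32x4_t d0 = vdupq_n_f32(0.0f), d1 = vdupq_n_f32(0.0f);
    for (size_t sc = 0; sc < srcC; ++sc, weight += 4)
    {
        float32x4_t w = vld1q_f32(weight);                 // 4 output channels
        d0 = vmlaq_f32(d0, vld1q_dup_f32(pix0 + sc), w);   // broadcast-FMA, pixel 0
        d1 = vmlaq_f32(d1, vld1q_dup_f32(pix1 + sc), w);   // broadcast-FMA, pixel 1
    }
    vst1q_f32(out0, d0);                                   // write 4-channel results
    vst1q_f32(out1, d1);
}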
for (; dc < dstC; dc += DF) - { - _bias[0] = bias ? Load(bias + dc + 0) : vdupq_n_f32(0.0f); - _bias[1] = bias ? Load(bias + dc + F) : vdupq_n_f32(0.0f); - if (type == ::SimdConvolutionActivationPrelu) - { - _params[0] = Load(params + dc + 0); - _params[1] = Load(params + dc + F); - } - const float * pS = src + yBeg * srcW*srcC; - const float * pW = weight + dc * srcC; - float * pD = dst + (dc / F)*dstS; -#ifdef SIMD_MERGECONV_MERGE_INPUT_ROWS_1X1 - float * dst0 = pD + (yBeg&dstM)*dstW*F; - float * dst1 = pD + (yInt&dstM)*dstW*F; - size_t dn = nBeg; - if (dstC - dc > F) - { - for (; dn < nInt6; dn += 6, pS += 6 * srcC, dst0 += 6 * F) - InputConvolution1x1_2x6(pS, srcC, pW, _bias, _params, dst0, dst0 + dstS); - if (nIntTail) - tailInt_2(pS, srcC, pW, _bias, _params, dst0, dst0 + dstS), pS += nIntTail * srcC, dn += nIntTail; - for (; dn < nEnd6; dn += 6, pS += 6 * srcC, dst1 += 6 * F) - InputConvolution1x1_2x6(pS, srcC, pW, _bias, _params, dst1, dst1 + dstS); - if (nEndTail) - tailEnd_2(pS, srcC, pW, _bias, _params, dst1, dst1 + dstS), pS += nEndTail * srcC, dn += nEndTail; - } - else - { - InputConvolution1x1_1xM_Ptr tailInt_1 = GetInputConvolution1x1_1xM(nIntTail); - InputConvolution1x1_1xM_Ptr tailEnd_1 = GetInputConvolution1x1_1xM(nEndTail); - for (; dn < nInt6; dn += 6, pS += 6 * srcC, dst0 += 6 * F) - InputConvolution1x1_1x6(pS, srcC, pW, _bias, _params, dst0); - if (nIntTail) - tailInt_1(pS, srcC, pW, _bias, _params, dst0), pS += nIntTail * srcC, dn += nIntTail; - for (; dn < nEnd6; dn += 6, pS += 6 * srcC, dst1 += 6 * F) - InputConvolution1x1_1x6(pS, srcC, pW, _bias, _params, dst1); - if (nEndTail) - tailEnd_1(pS, srcC, pW, _bias, _params, dst1), pS += nEndTail * srcC, dn += nEndTail; - } -#else - for (size_t dy = yBeg; dy < yEnd; ++dy) - { - float * dst0 = pD + (dy&dstM)*dstW*F; - size_t dx = 0; - if (dstC - dc > F) - { - for (; dx < dstW6; dx += 6, pS += 6 * srcC, dst0 += 6 * F) - InputConvolution1x1_2x6(pS, srcC, pW, _bias, _params, dst0, dst0 + dstS); - if (wTail) - tailW_2(pS, srcC, pW, _bias, _params, dst0, dst0 + dstS), pS += wTail * srcC, dx += wTail; - } - else - { - for (; dx < dstW6; dx += 6, pS += 6 * srcC, dst0 += 6 * F) - InputConvolution1x1_1x6(pS, srcC, pW, _bias, _params, dst0); - if (wTail) - tailW_1(pS, srcC, pW, _bias, _params, dst0), pS += wTail * srcC, dx += wTail; - } - } -#endif - } - } - - template SIMD_INLINE void InputConvolution_2x1(const float * src0, const SimdConvolutionParameters & p, - size_t kH, size_t kW, const float * weight, const float32x4_t * bias, const float32x4_t * params, float * dst0, float * dst1) - { - float32x4_t d00, d01, s0, w0, w1; - d00 = bias[0]; - d01 = bias[1]; - size_t size = kW * p.srcC, tail = DF * (p.kernelX - kW)*p.srcC, stride = p.srcW * p.srcC; - for (size_t ky = 0; ky < kH; ++ky) - { - for (size_t i = 0; i < size; ++i) - { - w0 = Load(weight + 0); - w1 = Load(weight + F); - s0 = vld1q_dup_f32(src0 + i); - d00 = vmlaq_f32(d00, s0, w0); - d01 = vmlaq_f32(d01, s0, w1); - weight += DF; - } - weight += tail; - src0 += stride; - } - Store(dst0, Activate(d00, params, 0)); - Store(dst1, Activate(d01, params, 1)); - } - - template SIMD_INLINE void InputConvolution_1x1(const float * src0, const SimdConvolutionParameters & p, - size_t kH, size_t kW, const float * weight, const float32x4_t * bias, const float32x4_t * params, float * dst0) - { - float32x4_t d00, s0, w0; - d00 = bias[0]; - size_t size = kW * p.srcC, tail = DF * (p.kernelX - kW)*p.srcC, stride = p.srcW * p.srcC; - for (size_t ky = 0; ky < kH; ++ky) - { - for (size_t i = 
0; i < size; ++i) - { - w0 = Load(weight + 0); - s0 = vld1q_dup_f32(src0 + i); - d00 = vmlaq_f32(d00, s0, w0); - weight += DF; - } - weight += tail; - src0 += stride; - } - Store(dst0, Activate(d00, params, 0)); - } - - template SIMD_INLINE void InputConvolution_2x6(const float * src0, const SimdConvolutionParameters & p, - size_t kH, size_t kW, const float * weight, const float32x4_t * bias, const float32x4_t * params, float * dst0, float * dst1) - { - float32x4_t d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, s0, w0, w1; - d00 = bias[0], d01 = bias[1]; - d10 = bias[0], d11 = bias[1]; - d20 = bias[0], d21 = bias[1]; - d30 = bias[0], d31 = bias[1]; - d40 = bias[0], d41 = bias[1]; - d50 = bias[0], d51 = bias[1]; - size_t size = kW * p.srcC, tail = DF * (p.kernelX - kW)*p.srcC, stride = p.srcW * p.srcC, step = p.srcC*p.strideX; - const float * src1 = src0 + 1 * step; - const float * src2 = src0 + 2 * step; - const float * src3 = src0 + 3 * step; - const float * src4 = src0 + 4 * step; - const float * src5 = src0 + 5 * step; - for (size_t ky = 0; ky < kH; ++ky) - { - size_t offset = ky * stride; - for (size_t end = offset + size; offset < end; ++offset) - { - w0 = Load(weight + 0); - w1 = Load(weight + F); - s0 = vld1q_dup_f32(src0 + offset); - d00 = vmlaq_f32(d00, s0, w0); - d01 = vmlaq_f32(d01, s0, w1); - s0 = vld1q_dup_f32(src1 + offset); - d10 = vmlaq_f32(d10, s0, w0); - d11 = vmlaq_f32(d11, s0, w1); - s0 = vld1q_dup_f32(src2 + offset); - d20 = vmlaq_f32(d20, s0, w0); - d21 = vmlaq_f32(d21, s0, w1); - s0 = vld1q_dup_f32(src3 + offset); - d30 = vmlaq_f32(d30, s0, w0); - d31 = vmlaq_f32(d31, s0, w1); - s0 = vld1q_dup_f32(src4 + offset); - d40 = vmlaq_f32(d40, s0, w0); - d41 = vmlaq_f32(d41, s0, w1); - s0 = vld1q_dup_f32(src5 + offset); - d50 = vmlaq_f32(d50, s0, w0); - d51 = vmlaq_f32(d51, s0, w1); - weight += DF; - } - weight += tail; - } - Store(dst0 + 0 * F, Activate(d00, params, 0)); - Store(dst1 + 0 * F, Activate(d01, params, 1)); - Store(dst0 + 1 * F, Activate(d10, params, 0)); - Store(dst1 + 1 * F, Activate(d11, params, 1)); - Store(dst0 + 2 * F, Activate(d20, params, 0)); - Store(dst1 + 2 * F, Activate(d21, params, 1)); - Store(dst0 + 3 * F, Activate(d30, params, 0)); - Store(dst1 + 3 * F, Activate(d31, params, 1)); - Store(dst0 + 4 * F, Activate(d40, params, 0)); - Store(dst1 + 4 * F, Activate(d41, params, 1)); - Store(dst0 + 5 * F, Activate(d50, params, 0)); - Store(dst1 + 5 * F, Activate(d51, params, 1)); - } - - template SIMD_INLINE void InputConvolution_1x6(const float * src0, const SimdConvolutionParameters & p, - size_t kH, size_t kW, const float * weight, const float32x4_t * bias, const float32x4_t * params, float * dst0) - { - float32x4_t d00, d10, d20, d30, d40, d50, s0, w0; - d00 = bias[0]; - d10 = bias[0]; - d20 = bias[0]; - d30 = bias[0]; - d40 = bias[0]; - d50 = bias[0]; - size_t size = kW * p.srcC, tail = DF * (p.kernelX - kW)*p.srcC, stride = p.srcW * p.srcC, step = p.srcC*p.strideX; - const float * src1 = src0 + 1 * step; - const float * src2 = src0 + 2 * step; - const float * src3 = src0 + 3 * step; - const float * src4 = src0 + 4 * step; - const float * src5 = src0 + 5 * step; - for (size_t ky = 0; ky < kH; ++ky) - { - size_t offset = ky * stride; - for (size_t end = offset + size; offset < end; ++offset) - { - w0 = Load(weight + 0); - s0 = vld1q_dup_f32(src0 + offset); - d00 = vmlaq_f32(d00, s0, w0); - s0 = vld1q_dup_f32(src1 + offset); - d10 = vmlaq_f32(d10, s0, w0); - s0 = vld1q_dup_f32(src2 + offset); - d20 = vmlaq_f32(d20, s0, w0); - s0 = 
vld1q_dup_f32(src3 + offset); - d30 = vmlaq_f32(d30, s0, w0); - s0 = vld1q_dup_f32(src4 + offset); - d40 = vmlaq_f32(d40, s0, w0); - s0 = vld1q_dup_f32(src5 + offset); - d50 = vmlaq_f32(d50, s0, w0); - weight += DF; - } - weight += tail; - } - Store(dst0 + 0 * F, Activate(d00, params, 0)); - Store(dst0 + 1 * F, Activate(d10, params, 0)); - Store(dst0 + 2 * F, Activate(d20, params, 0)); - Store(dst0 + 3 * F, Activate(d30, params, 0)); - Store(dst0 + 4 * F, Activate(d40, params, 0)); - Store(dst0 + 5 * F, Activate(d50, params, 0)); - } - - template void InputConvolution(const float * src, const SimdConvolutionParameters & p, - size_t dstC, size_t yBeg, size_t yEnd, const size_t bufH[2], const float * weight, const float * bias, const float * params, float * dst) - { - size_t srcH = p.srcH, srcW = p.srcW, srcC = p.srcC, dstW = p.dstW; - size_t kernelY = p.kernelY, kernelX = p.kernelX, strideY = p.strideY, strideX = p.strideX; - size_t dstM = (bufH[0] - 1), dstS = bufH[0] * dstW * F; - size_t dstCDF = AlignLo(dstC, DF); - if (dstC - F > dstCDF) - dstCDF += DF; - - size_t noseH = p.padY, noseW = p.padX; - size_t bodyH = p.srcH - p.kernelY + 1 + noseH, bodyW = p.srcW - p.kernelX + 1 + noseW; - size_t bodyW6 = AlignLoAny(bodyW - noseW, 6 * p.strideX) + noseW; - size_t tailH = bodyH + p.padH, tailW = bodyW + p.padW; - size_t wS = p.srcC*p.dstC; - size_t kY = p.kernelY - noseH, kX = p.kernelX - noseW, kH = bodyH + p.kernelY - 1, kW = bodyW + p.kernelX - 1; - - float32x4_t _params[2], _bias[2]; - _params[0] = vdupq_n_f32(params[0]); - if (type == ::SimdConvolutionActivationRestrictRange || type == ::SimdConvolutionActivationHswish) - _params[1] = vdupq_n_f32(params[1]); - - size_t dc = 0; - for (; dc < dstCDF; dc += DF) - { - _bias[0] = bias ? Load(bias + dc + 0) : vdupq_n_f32(0.0f); - _bias[1] = bias ? 
Load(bias + dc + F) : vdupq_n_f32(0.0f); - if (type == ::SimdConvolutionActivationPrelu) - { - _params[0] = Load(params + dc + 0); - _params[1] = Load(params + dc + F); - } - size_t dy = yBeg, sy = dy * strideY; - for (; sy < noseH && dy < yEnd; sy += strideY, dy++) - { - float * dst0 = dst + (dy&dstM)*dstW*F + (dc / F)*dstS, *dst1 = dst0 + dstS; - size_t sx = 0; - const float * s = src; - const float * w = weight + (noseH - sy) * kernelX * DF * srcC; - for (; sx < noseW; sx += strideX, dst0 += F, dst1 += F) - InputConvolution_2x1(s, p, kY + sy, kX + sx, w + (noseW - sx)*srcC*DF, _bias, _params, dst0, dst1); - for (; sx < bodyW6; sx += 6 * strideX, dst0 += 6 * F, dst1 += 6 * F) - InputConvolution_2x6(s + (sx - noseW) * srcC, p, kY + sy, kernelX, w, _bias, _params, dst0, dst1); - for (; sx < bodyW; sx += strideX, dst0 += F, dst1 += F) - InputConvolution_2x1(s + (sx - noseW) * srcC, p, kY + sy, kernelX, w, _bias, _params, dst0, dst1); - for (; sx < tailW; sx += strideX, dst0 += F, dst1 += F) - InputConvolution_2x1(s + (sx - noseW) * srcC, p, kY + sy, kW - sx, w, _bias, _params, dst0, dst1); - } - for (; sy < bodyH && dy < yEnd; sy += strideY, dy++) - { - float * dst0 = dst + (dy&dstM)*dstW*F + (dc / F)*dstS, *dst1 = dst0 + dstS; - size_t sx = 0; - const float * s = src + (sy - noseH)*srcW*srcC; - const float * w = weight; - for (; sx < noseW; sx += strideX, dst0 += F, dst1 += F) - InputConvolution_2x1(s, p, kernelY, kX + sx, w + (noseW - sx)*srcC*DF, _bias, _params, dst0, dst1); - for (; sx < bodyW6; sx += 6 * strideX, dst0 += 6 * F, dst1 += 6 * F) - InputConvolution_2x6(s + (sx - noseW) * srcC, p, kernelY, kernelX, w, _bias, _params, dst0, dst1); - for (; sx < bodyW; sx += strideX, dst0 += F, dst1 += F) - InputConvolution_2x1(s + (sx - noseW) * srcC, p, kernelY, kernelX, w, _bias, _params, dst0, dst1); - for (; sx < tailW; sx += strideX, dst0 += F, dst1 += F) - InputConvolution_2x1(s + (sx - noseW) * srcC, p, kernelY, kW - sx, w, _bias, _params, dst0, dst1); - } - for (; sy < tailH && dy < yEnd; sy += strideY, dy++) - { - float * dst0 = dst + (dy&dstM)*dstW*F + (dc / F)*dstS, *dst1 = dst0 + dstS; - size_t sx = 0; - const float * s = src + (sy - noseH)*srcW*srcC; - const float * w = weight; - for (; sx < noseW; sx += strideX, dst0 += F, dst1 += F) - InputConvolution_2x1(s, p, kH - sy, kX + sx, w + (noseW - sx)*srcC*DF, _bias, _params, dst0, dst1); - for (; sx < bodyW6; sx += 6 * strideX, dst0 += 6 * F, dst1 += 6 * F) - InputConvolution_2x6(s + (sx - noseW) * srcC, p, kH - sy, kernelX, w, _bias, _params, dst0, dst1); - for (; sx < bodyW; sx += strideX, dst0 += F, dst1 += F) - InputConvolution_2x1(s + (sx - noseW) * srcC, p, kH - sy, kernelX, w, _bias, _params, dst0, dst1); - for (; sx < tailW; sx += strideX, dst0 += F, dst1 += F) - InputConvolution_2x1(s + (sx - noseW) * srcC, p, kH - sy, kW - sx, w, _bias, _params, dst0, dst1); - } - weight += kernelY * kernelX*srcC*DF; - } - if (dc < dstC) - { - _bias[0] = bias ? 
Load(bias + dc) : vdupq_n_f32(0.0f); - if (type == ::SimdConvolutionActivationPrelu) - _params[0] = Load(params + dc); - size_t dy = yBeg, sy = dy * strideY; - for (; sy < noseH && dy < yEnd; sy += strideY, dy++) - { - float * dst0 = dst + (dy&dstM)*dstW*F + (dc / F)*dstS; - size_t sx = 0; - const float * s = src; - const float * w = weight + (noseH - sy) * kernelX * DF * srcC; - for (; sx < noseW; sx += strideX, dst0 += F) - InputConvolution_1x1(s, p, kY + sy, kX + sx, w + (noseW - sx)*srcC*DF, _bias, _params, dst0); - for (; sx < bodyW6; sx += 6 * strideX, dst0 += 6 * F) - InputConvolution_1x6(s + (sx - noseW) * srcC, p, kY + sy, kernelX, w, _bias, _params, dst0); - for (; sx < bodyW; sx += strideX, dst0 += F) - InputConvolution_1x1(s + (sx - noseW) * srcC, p, kY + sy, kernelX, w, _bias, _params, dst0); - for (; sx < tailW; sx += strideX, dst0 += F) - InputConvolution_1x1(s + (sx - noseW) * srcC, p, kY + sy, kW - sx, w, _bias, _params, dst0); - } - for (; sy < bodyH && dy < yEnd; sy += strideY, dy++) - { - float * dst0 = dst + (dy&dstM)*dstW*F + (dc / F)*dstS; - size_t sx = 0; - const float * s = src + (sy - noseH)*srcW*srcC; - const float * w = weight; - for (; sx < noseW; sx += strideX, dst0 += F) - InputConvolution_1x1(s, p, kernelY, kX + sx, w + (noseW - sx)*srcC*DF, _bias, _params, dst0); - for (; sx < bodyW6; sx += 6 * strideX, dst0 += 6 * F) - InputConvolution_1x6(s + (sx - noseW) * srcC, p, kernelY, kernelX, w, _bias, _params, dst0); - for (; sx < bodyW; sx += strideX, dst0 += F) - InputConvolution_1x1(s + (sx - noseW) * srcC, p, kernelY, kernelX, w, _bias, _params, dst0); - for (; sx < tailW; sx += strideX, dst0 += F) - InputConvolution_1x1(s + (sx - noseW) * srcC, p, kernelY, kW - sx, w, _bias, _params, dst0); - } - for (; sy < tailH && dy < yEnd; sy += strideY, dy++) - { - float * dst0 = dst + (dy&dstM)*dstW*F + (dc / F)*dstS; - size_t sx = 0; - const float * s = src + (sy - noseH)*srcW*srcC; - const float * w = weight; - for (; sx < noseW; sx += strideX, dst0 += F) - InputConvolution_1x1(s, p, kH - sy, kX + sx, w + (noseW - sx)*srcC*DF, _bias, _params, dst0); - for (; sx < bodyW6; sx += 6 * strideX, dst0 += 6 * F) - InputConvolution_1x6(s + (sx - noseW) * srcC, p, kH - sy, kernelX, w, _bias, _params, dst0); - for (; sx < bodyW; sx += strideX, dst0 += F) - InputConvolution_1x1(s + (sx - noseW) * srcC, p, kH - sy, kernelX, w, _bias, _params, dst0); - for (; sx < tailW; sx += strideX, dst0 += F) - InputConvolution_1x1(s + (sx - noseW) * srcC, p, kH - sy, kW - sx, w, _bias, _params, dst0); - } - } - } - - template void DepthwiseConvolution(const float* src, const SimdConvolutionParameters& p, - size_t srcC, size_t yBeg, size_t yEnd, const size_t bufH[2], const float* weight, const float* bias, const float* params, float* dst) - { - size_t strideY = p.strideY, strideX = p.strideX, padY = p.padY, padX = p.padX, padH = p.padH, padW = p.padW; - size_t srcW = p.srcW * F, dstW = p.dstW * F, weightS = p.kernelY * p.kernelX * F, strideXF = strideX * F; - size_t srcM = (bufH[0] - 1), dstM = (bufH[1] - 1), srcS = bufH[0] * srcW, dstS = bufH[1] * dstW; - size_t noseY = (p.padY + p.strideY - 1) / p.strideY; - size_t bodyY = (p.srcH + p.padY + p.strideY - p.kernelY) / p.strideY; - size_t noseX = (p.padX + p.strideX - 1) / p.strideX; - size_t bodyX = (p.srcW + p.padX + p.strideX - p.kernelX) / p.strideX; - size_t bodyX2 = AlignLo(bodyX - noseX, 2) + noseX; - size_t bodyX4 = AlignLo(bodyX - noseX, 4) + noseX; - size_t bodyX8 = AlignLo(bodyX - noseX, 8) + noseX; - - float32x4_t _params[2]; - 
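Both InputConvolution above and DepthwiseConvolution here split the output rows and columns into a nose (overlapping the leading padding), a body (full kernel coverage), and a tail (overlapping the trailing padding). A minimal re-derivation of the row split, using the same arithmetic as the noseY/bodyY expressions above (a sketch with illustrative names, not library code):

    #include <cstddef>

    // For output row dy: rows in [0, noseY) still overlap the top padding,
    // rows in [noseY, bodyY) see all kernelY taps, and rows in
    // [bodyY, dstH) run off the bottom edge.
    struct RowSplit { size_t noseY, bodyY; };

    inline RowSplit SplitRows(size_t srcH, size_t kernelY, size_t strideY, size_t padY)
    {
        RowSplit s;
        s.noseY = (padY + strideY - 1) / strideY;               // first dy with dy*strideY >= padY
        s.bodyY = (srcH + padY + strideY - kernelY) / strideY;  // first dy whose window leaves srcH
        return s;
    }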
_params[0] = vdupq_n_f32(params[0]); - if (type == ::SimdConvolutionActivationRestrictRange || type == ::SimdConvolutionActivationHswish) - _params[1] = vdupq_n_f32(params[1]); - for (size_t c = 0; c < srcC; c += F) - { - float32x4_t _bias = bias ? Load(bias + c) : vdupq_n_f32(0.0f); - if (type == ::SimdConvolutionActivationPrelu) - _params[0] = Load(params + c); - - for (size_t dy = yBeg; dy < yEnd; ++dy) - { - float* pd = dst + (dy & dstM) * dstW; - if (dy >= noseY && dy < bodyY) - { - size_t dx = 0; - for (; dx < noseX; ++dx, pd += F) - { - float32x4_t sum = _bias; - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t sy = dy * p.strideY + ky - padY; - for (size_t kx = 0; kx < p.kernelX; ++kx) - { - size_t sx = dx * p.strideX + kx - padX; - if (sx < p.srcW) - { - const float* pw = weight + (ky * p.kernelX + kx) * F; - const float* ps = src + ((sy & srcM) * p.srcW + sx) * F; - sum = vmlaq_f32(sum, Load(ps), Load(pw)); - } - } - } - Store(pd, Activate(sum, _params, 0)); - } - for (; dx < bodyX8; dx += 8, pd += 8 * F) - { - float32x4_t sum0 = _bias; - float32x4_t sum1 = _bias; - float32x4_t sum2 = _bias; - float32x4_t sum3 = _bias; - float32x4_t sum4 = _bias; - float32x4_t sum5 = _bias; - float32x4_t sum6 = _bias; - float32x4_t sum7 = _bias; - const float* pw = weight; - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t sy = dy * strideY + ky - padY; - const float* ps = src + ((sy & srcM) * p.srcW + dx * strideX - padX) * F; - for (size_t kx = 0; kx < p.kernelX; ++kx, ps += F, pw += F) - { - float32x4_t w0 = Load(pw); - sum0 = vmlaq_f32(sum0, Load(ps + 0 * strideXF), w0); - sum1 = vmlaq_f32(sum1, Load(ps + 1 * strideXF), w0); - sum2 = vmlaq_f32(sum2, Load(ps + 2 * strideXF), w0); - sum3 = vmlaq_f32(sum3, Load(ps + 3 * strideXF), w0); - sum4 = vmlaq_f32(sum4, Load(ps + 4 * strideXF), w0); - sum5 = vmlaq_f32(sum5, Load(ps + 5 * strideXF), w0); - sum6 = vmlaq_f32(sum6, Load(ps + 6 * strideXF), w0); - sum7 = vmlaq_f32(sum7, Load(ps + 7 * strideXF), w0); - } - } - Store(pd + 0 * F, Activate(sum0, _params, 0)); - Store(pd + 1 * F, Activate(sum1, _params, 0)); - Store(pd + 2 * F, Activate(sum2, _params, 0)); - Store(pd + 3 * F, Activate(sum3, _params, 0)); - Store(pd + 4 * F, Activate(sum4, _params, 0)); - Store(pd + 5 * F, Activate(sum5, _params, 0)); - Store(pd + 6 * F, Activate(sum6, _params, 0)); - Store(pd + 7 * F, Activate(sum7, _params, 0)); - } - for (; dx < bodyX4; dx += 4, pd += 4 * F) - { - float32x4_t sum0 = _bias; - float32x4_t sum1 = _bias; - float32x4_t sum2 = _bias; - float32x4_t sum3 = _bias; - const float* pw = weight; - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t sy = dy * strideY + ky - padY; - const float* ps = src + ((sy & srcM) * p.srcW + dx * strideX - padX) * F; - for (size_t kx = 0; kx < p.kernelX; ++kx, ps += F, pw += F) - { - float32x4_t w0 = Load(pw); - sum0 = vmlaq_f32(sum0, Load(ps + 0 * strideXF), w0); - sum1 = vmlaq_f32(sum1, Load(ps + 1 * strideXF), w0); - sum2 = vmlaq_f32(sum2, Load(ps + 2 * strideXF), w0); - sum3 = vmlaq_f32(sum3, Load(ps + 3 * strideXF), w0); - } - } - Store(pd + 0 * F, Activate(sum0, _params, 0)); - Store(pd + 1 * F, Activate(sum1, _params, 0)); - Store(pd + 2 * F, Activate(sum2, _params, 0)); - Store(pd + 3 * F, Activate(sum3, _params, 0)); - } - for (; dx < bodyX2; dx += 2, pd += 2 * F) - { - float32x4_t sum0 = _bias; - float32x4_t sum1 = _bias; - const float* pw = weight; - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t sy = dy * strideY + ky - padY; - const float* ps = src + ((sy & srcM) * p.srcW + dx * strideX 
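/* (sy & srcM) indexes a circular row buffer: with bufH[0] a power of two,
   masking by srcM = bufH[0] - 1 is equivalent to sy % bufH[0]. */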
- padX) * F; - for (size_t kx = 0; kx < p.kernelX; ++kx, ps += F, pw += F) - { - float32x4_t w0 = Load(pw); - sum0 = vmlaq_f32(sum0, Load(ps + 0 * strideXF), w0); - sum1 = vmlaq_f32(sum1, Load(ps + 1 * strideXF), w0); - } - } - Store(pd + 0 * F, Activate(sum0, _params, 0)); - Store(pd + 1 * F, Activate(sum1, _params, 0)); - } - for (; dx < bodyX; ++dx, pd += F) - { - float32x4_t sum = _bias; - const float* pw = weight; - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t sy = dy * strideY + ky - padY; - const float* ps = src + ((sy & srcM) * p.srcW + dx * strideX - padX) * F; - for (size_t kx = 0; kx < p.kernelX; ++kx, ps += F, pw += F) - { - float32x4_t w0 = Load(pw); - sum = vmlaq_f32(sum, Load(ps), w0); - } - } - Store(pd, Activate(sum, _params, 0)); - } - for (; dx < p.dstW; ++dx, pd += F) - { - float32x4_t sum = _bias; - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t sy = dy * strideY + ky - padY; - for (size_t kx = 0; kx < p.kernelX; ++kx) - { - size_t sx = dx * strideX + kx - padX; - if (sx < p.srcW) - { - const float* pw = weight + (ky * p.kernelX + kx) * F; - const float* ps = src + ((sy & srcM) * p.srcW + sx) * F; - sum = vmlaq_f32(sum, Load(ps), Load(pw)); - } - } - } - Store(pd, Activate(sum, _params, 0)); - } - } - else - { - for (size_t dx = 0; dx < p.dstW; ++dx, pd += F) - { - float32x4_t sum = _bias; - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t sy = dy * strideY + ky - padY; - if (sy < p.srcH) - { - for (size_t kx = 0; kx < p.kernelX; ++kx) - { - size_t sx = dx * strideX + kx - padX; - if (sx < p.srcW) - { - const float* pw = weight + (ky * p.kernelX + kx) * F; - const float* ps = src + ((sy & srcM) * p.srcW + sx) * F; - sum = vmlaq_f32(sum, Load(ps), Load(pw)); - } - } - } - } - Store(pd, Activate(sum, _params, 0)); - } - } - } - src += srcS; - dst += dstS; - weight += weightS; - } - } - - template SIMD_INLINE void ConvolutionDepthwise3x3Edge2x2( - const float * src0, const float * src1, const float32x4_t * weight, const float32x4_t & bias, const float32x4_t * params, float * dst) - { - float32x4_t sum0 = bias, sum1 = vdupq_n_f32(0.0f); - sum0 = vmlaq_f32(sum0, Load(src0 + 0 * F), weight[0]); - sum1 = vmlaq_f32(sum1, Load(src0 + 1 * F), weight[1]); - sum0 = vmlaq_f32(sum0, Load(src1 + 0 * F), weight[3]); - sum1 = vmlaq_f32(sum1, Load(src1 + 1 * F), weight[4]); - Store(dst, Activate(vaddq_f32(sum0, sum1), params, 0)); - } - - template SIMD_INLINE void ConvolutionDepthwise3x3Edge2x3( - const float * src0, const float * src1, const float32x4_t * weight, const float32x4_t & bias, const float32x4_t * params, float * dst) - { - float32x4_t sum0 = bias, sum1 = vdupq_n_f32(0.0f), sum2 = vdupq_n_f32(0.0f); - sum0 = vmlaq_f32(sum0, Load(src0 + 0 * F), weight[0]); - sum1 = vmlaq_f32(sum1, Load(src0 + 1 * F), weight[1]); - sum2 = vmlaq_f32(sum2, Load(src0 + 2 * F), weight[2]); - sum0 = vmlaq_f32(sum0, Load(src1 + 0 * F), weight[3]); - sum1 = vmlaq_f32(sum1, Load(src1 + 1 * F), weight[4]); - sum2 = vmlaq_f32(sum2, Load(src1 + 2 * F), weight[5]); - Store(dst, Activate(vaddq_f32(vaddq_f32(sum0, sum1), sum2), params, 0)); - } - - template SIMD_INLINE void ConvolutionDepthwise3x3Edge3x2( - const float * src0, const float * src1, const float * src2, const float32x4_t * weight, const float32x4_t & bias, const float32x4_t * params, float * dst) - { - float32x4_t sum0 = bias, sum1 = vdupq_n_f32(0.0f); - sum0 = vmlaq_f32(sum0, Load(src0 + 0 * F), weight[0]); - sum1 = vmlaq_f32(sum1, Load(src0 + 1 * F), weight[1]); - sum0 = vmlaq_f32(sum0, Load(src1 + 0 * F), weight[3]); 
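For reference, the full-kernel step that ConvolutionDepthwise3x3Main1x1 (below) applies per pixel, as a stand-alone sketch: all nine weight vectors stay in NEON registers and each row contributes three consecutive F-wide pixels (assuming F = 4 lanes and identity activation; names are illustrative):

    #include <arm_neon.h>
    #include <cstddef>

    // One output pixel of a 3x3 depthwise convolution over 4 interleaved
    // channels; row0/row1/row2 point at three consecutive source rows in
    // the channel-blocked buffer, weight[0..8] are the nine preloaded taps.
    inline void Depthwise3x3Point(const float* row0, const float* row1,
        const float* row2, const float32x4_t weight[9], float32x4_t bias,
        float* dst)
    {
        const size_t F = 4;
        float32x4_t sum = bias;
        sum = vmlaq_f32(sum, vld1q_f32(row0 + 0 * F), weight[0]);
        sum = vmlaq_f32(sum, vld1q_f32(row0 + 1 * F), weight[1]);
        sum = vmlaq_f32(sum, vld1q_f32(row0 + 2 * F), weight[2]);
        sum = vmlaq_f32(sum, vld1q_f32(row1 + 0 * F), weight[3]);
        sum = vmlaq_f32(sum, vld1q_f32(row1 + 1 * F), weight[4]);
        sum = vmlaq_f32(sum, vld1q_f32(row1 + 2 * F), weight[5]);
        sum = vmlaq_f32(sum, vld1q_f32(row2 + 0 * F), weight[6]);
        sum = vmlaq_f32(sum, vld1q_f32(row2 + 1 * F), weight[7]);
        sum = vmlaq_f32(sum, vld1q_f32(row2 + 2 * F), weight[8]);
        vst1q_f32(dst, sum);  // identity activation: store the raw sum
    }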
- sum1 = vmlaq_f32(sum1, Load(src1 + 1 * F), weight[4]); - sum0 = vmlaq_f32(sum0, Load(src2 + 0 * F), weight[6]); - sum1 = vmlaq_f32(sum1, Load(src2 + 1 * F), weight[7]); - Store(dst, Activate(vaddq_f32(sum0, sum1), params, 0)); - } - - template SIMD_INLINE void ConvolutionDepthwise3x3Main1x1( - const float * src0, const float * src1, const float * src2, const float32x4_t * weight, const float32x4_t & bias, const float32x4_t * params, float * dst) - { - float32x4_t sum0 = bias, sum1 = vdupq_n_f32(0.0f), sum2 = vdupq_n_f32(0.0f); - sum0 = vmlaq_f32(sum0, Load(src0 + 0 * F), weight[0]); - sum1 = vmlaq_f32(sum1, Load(src0 + 1 * F), weight[1]); - sum2 = vmlaq_f32(sum2, Load(src0 + 2 * F), weight[2]); - sum0 = vmlaq_f32(sum0, Load(src1 + 0 * F), weight[3]); - sum1 = vmlaq_f32(sum1, Load(src1 + 1 * F), weight[4]); - sum2 = vmlaq_f32(sum2, Load(src1 + 2 * F), weight[5]); - sum0 = vmlaq_f32(sum0, Load(src2 + 0 * F), weight[6]); - sum1 = vmlaq_f32(sum1, Load(src2 + 1 * F), weight[7]); - sum2 = vmlaq_f32(sum2, Load(src2 + 2 * F), weight[8]); - Store(dst, Activate(vaddq_f32(vaddq_f32(sum0, sum1), sum2), params, 0)); - } - - template void DepthwiseConvolution3x3(const float * src, const SimdConvolutionParameters & p, - size_t srcC, size_t yBeg, size_t yEnd, const size_t bufH[2], const float * weight, const float * bias, const float * params, float * dst) - { - size_t strideY = p.strideY, padY = p.padY, padX = p.padX, padH = p.padH, padW = p.padW; - size_t srcW = p.srcW * F, dstW = p.dstW * F, weightS = p.kernelY * p.kernelX * F; - size_t srcM = (bufH[0] - 1), dstM = (bufH[1] - 1), srcS = bufH[0] * srcW, dstS = bufH[1] * dstW; - size_t xStep = F * p.strideX, xStep0 = (p.strideX - p.padX)*F; - size_t xMainEnd = p.dstW - p.padW, yMainEnd = yEnd == p.dstH && p.padH ? yEnd - 1 : yEnd; - - float32x4_t _params[2]; - _params[0] = vdupq_n_f32(params[0]); - if (type == ::SimdConvolutionActivationRestrictRange || type == ::SimdConvolutionActivationHswish) - _params[1] = vdupq_n_f32(params[1]); - for (size_t c = 0; c < srcC; c += F) - { - float32x4_t _weight[9]; - for (size_t i = 0; i < 9; ++i) - _weight[i] = Load(weight + i * F); - float32x4_t _bias = bias ? 
Load(bias + c) : vdupq_n_f32(0.0f); - if (type == ::SimdConvolutionActivationPrelu) - _params[0] = Load(params + c); - - size_t dy = yBeg; - if (yBeg == 0 && padY) - { - size_t sy = 0, dx = 0; - const float * src0 = src + ((sy + 0)&srcM)*srcW; - const float * src1 = src + ((sy + 1)&srcM)*srcW; - float * pDst = dst + (dy&dstM)*dstW; - if (padX) - ConvolutionDepthwise3x3Edge2x2(src0, src1, _weight + 4, _bias, _params, pDst), pDst += F, dx++, src0 += xStep0, src1 += xStep0; - for (; dx < xMainEnd; dx++, pDst += F, src0 += xStep, src1 += xStep) - ConvolutionDepthwise3x3Edge2x3(src0, src1, _weight + 3, _bias, _params, pDst); - if (padW) - ConvolutionDepthwise3x3Edge2x2(src0, src1, _weight + 3, _bias, _params, pDst); - dy++; - } - for (; dy < yMainEnd; ++dy) - { - size_t sy = dy * strideY - padY, dx = 0; - const float * src0 = src + ((sy + 0)&srcM)*srcW; - const float * src1 = src + ((sy + 1)&srcM)*srcW; - const float * src2 = src + ((sy + 2)&srcM)*srcW; - float * pDst = dst + (dy&dstM)*dstW; - if (padX) - ConvolutionDepthwise3x3Edge3x2(src0, src1, src2, _weight + 1, _bias, _params, pDst), pDst += F, dx++, src0 += xStep0, src1 += xStep0, src2 += xStep0; - for (; dx < xMainEnd; dx++, pDst += F, src0 += xStep, src1 += xStep, src2 += xStep) - ConvolutionDepthwise3x3Main1x1(src0, src1, src2, _weight + 0, _bias, _params, pDst); - if (padW) - ConvolutionDepthwise3x3Edge3x2(src0, src1, src2, _weight + 0, _bias, _params, pDst); - } - if (dy < yEnd) - { - size_t sy = dy * strideY - padY, dx = 0; - const float * src0 = src + ((sy + 0)&srcM)*srcW; - const float * src1 = src + ((sy + 1)&srcM)*srcW; - float * pDst = dst + (dy&dstM)*dstW; - if (padX) - ConvolutionDepthwise3x3Edge2x2(src0, src1, _weight + 1, _bias, _params, pDst), pDst += F, dx++, src0 += xStep0, src1 += xStep0; - for (; dx < xMainEnd; dx++, pDst += F, src0 += xStep, src1 += xStep) - ConvolutionDepthwise3x3Edge2x3(src0, src1, _weight + 0, _bias, _params, pDst); - if (padW) - ConvolutionDepthwise3x3Edge2x2(src0, src1, _weight + 0, _bias, _params, pDst); - } - src += srcS; - dst += dstS; - weight += weightS; - } - } - - template void OutputConvolution_2x6(const float * src, size_t srcC, size_t srcS, - const float * weight, const float32x4_t * bias, const float32x4_t * params, float * dst, size_t dstC, size_t tail) - { - float32x4_t d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, s0, w0, w1; - if (tail > F) - { - d00 = vdupq_n_f32(0.0f), d01 = vdupq_n_f32(0.0f); - d10 = vdupq_n_f32(0.0f), d11 = vdupq_n_f32(0.0f); - d20 = vdupq_n_f32(0.0f), d21 = vdupq_n_f32(0.0f); - d30 = vdupq_n_f32(0.0f), d31 = vdupq_n_f32(0.0f); - d40 = vdupq_n_f32(0.0f), d41 = vdupq_n_f32(0.0f); - d50 = vdupq_n_f32(0.0f), d51 = vdupq_n_f32(0.0f); - for (size_t c = 0; c < srcC; c += F) - { - size_t n = Simd::Min(F, srcC - c); - for (size_t i = 0; i < n; ++i, weight += DF) - { - w0 = Load(weight + 0); - w1 = Load(weight + F); - s0 = vld1q_dup_f32(src + i + 0 * F); - d00 = vmlaq_f32(d00, s0, w0); - d01 = vmlaq_f32(d01, s0, w1); - s0 = vld1q_dup_f32(src + i + 1 * F); - d10 = vmlaq_f32(d10, s0, w0); - d11 = vmlaq_f32(d11, s0, w1); - s0 = vld1q_dup_f32(src + i + 2 * F); - d20 = vmlaq_f32(d20, s0, w0); - d21 = vmlaq_f32(d21, s0, w1); - s0 = vld1q_dup_f32(src + i + 3 * F); - d30 = vmlaq_f32(d30, s0, w0); - d31 = vmlaq_f32(d31, s0, w1); - s0 = vld1q_dup_f32(src + i + 4 * F); - d40 = vmlaq_f32(d40, s0, w0); - d41 = vmlaq_f32(d41, s0, w1); - s0 = vld1q_dup_f32(src + i + 5 * F); - d50 = vmlaq_f32(d50, s0, w0); - d51 = vmlaq_f32(d51, s0, w1); - } - src += srcS; - } - if (tail 
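/* In this branch tail is in (F, DF]: the first F output channels are always
   stored in full; when tail < DF the second half is stored with a masked
   remainder of tail - F channels. */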
== DF) - { - Term::template Save(dst + 0, d00, bias, params); - Term::template Save(dst + F, d01, bias, params); - dst += dstC; - Term::template Save(dst + 0, d10, bias, params); - Term::template Save(dst + F, d11, bias, params); - dst += dstC; - Term::template Save(dst + 0, d20, bias, params); - Term::template Save(dst + F, d21, bias, params); - dst += dstC; - Term::template Save(dst + 0, d30, bias, params); - Term::template Save(dst + F, d31, bias, params); - dst += dstC; - Term::template Save(dst + 0, d40, bias, params); - Term::template Save(dst + F, d41, bias, params); - dst += dstC; - Term::template Save(dst + 0, d50, bias, params); - Term::template Save(dst + F, d51, bias, params); - } - else - { - tail -= F; - Term::template Save(dst + 0, d00, bias, params); - Term::template Save(dst + F, d01, bias, params, tail); - dst += dstC; - Term::template Save(dst + 0, d10, bias, params); - Term::template Save(dst + F, d11, bias, params, tail); - dst += dstC; - Term::template Save(dst + 0, d20, bias, params); - Term::template Save(dst + F, d21, bias, params, tail); - dst += dstC; - Term::template Save(dst + 0, d30, bias, params); - Term::template Save(dst + F, d31, bias, params, tail); - dst += dstC; - Term::template Save(dst + 0, d40, bias, params); - Term::template Save(dst + F, d41, bias, params, tail); - dst += dstC; - Term::template Save(dst + 0, d50, bias, params); - Term::template Save(dst + F, d51, bias, params, tail); - } - } - else - { - d00 = vdupq_n_f32(0.0f); - d10 = vdupq_n_f32(0.0f); - d20 = vdupq_n_f32(0.0f); - d30 = vdupq_n_f32(0.0f); - d40 = vdupq_n_f32(0.0f); - d50 = vdupq_n_f32(0.0f); - for (size_t c = 0; c < srcC; c += F) - { - size_t n = Simd::Min(F, srcC - c); - for (size_t i = 0; i < n; ++i, weight += DF) - { - w0 = Load(weight + 0); - s0 = vld1q_dup_f32(src + i + 0 * F); - d00 = vmlaq_f32(d00, s0, w0); - s0 = vld1q_dup_f32(src + i + 1 * F); - d10 = vmlaq_f32(d10, s0, w0); - s0 = vld1q_dup_f32(src + i + 2 * F); - d20 = vmlaq_f32(d20, s0, w0); - s0 = vld1q_dup_f32(src + i + 3 * F); - d30 = vmlaq_f32(d30, s0, w0); - s0 = vld1q_dup_f32(src + i + 4 * F); - d40 = vmlaq_f32(d40, s0, w0); - s0 = vld1q_dup_f32(src + i + 5 * F); - d50 = vmlaq_f32(d50, s0, w0); - } - src += srcS; - } - if (tail == F) - { - Term::template Save(dst + 0, d00, bias, params); - dst += dstC; - Term::template Save(dst + 0, d10, bias, params); - dst += dstC; - Term::template Save(dst + 0, d20, bias, params); - dst += dstC; - Term::template Save(dst + 0, d30, bias, params); - dst += dstC; - Term::template Save(dst + 0, d40, bias, params); - dst += dstC; - Term::template Save(dst + 0, d50, bias, params); - } - else - { - Term::template Save(dst + 0, d00, bias, params, tail); - dst += dstC; - Term::template Save(dst + 0, d10, bias, params, tail); - dst += dstC; - Term::template Save(dst + 0, d20, bias, params, tail); - dst += dstC; - Term::template Save(dst + 0, d30, bias, params, tail); - dst += dstC; - Term::template Save(dst + 0, d40, bias, params, tail); - dst += dstC; - Term::template Save(dst + 0, d50, bias, params, tail); - } - } - } - - template void OutputConvolution_2x4(const float * src, size_t srcC, size_t srcS, - const float * weight, const float32x4_t * bias, const float32x4_t * params, float * dst, size_t dstC, size_t tail) - { - float32x4_t d00, d01, d10, d11, d20, d21, d30, d31, s0, w0, w1; - if (tail > F) - { - d00 = vdupq_n_f32(0.0f), d01 = vdupq_n_f32(0.0f); - d10 = vdupq_n_f32(0.0f), d11 = vdupq_n_f32(0.0f); - d20 = vdupq_n_f32(0.0f), d21 = vdupq_n_f32(0.0f); - d30 = 
vdupq_n_f32(0.0f), d31 = vdupq_n_f32(0.0f); - for (size_t c = 0; c < srcC; c += F) - { - size_t n = Simd::Min(F, srcC - c); - for (size_t i = 0; i < n; ++i, weight += DF) - { - w0 = Load(weight + 0); - w1 = Load(weight + F); - s0 = vld1q_dup_f32(src + i + 0 * F); - d00 = vmlaq_f32(d00, s0, w0); - d01 = vmlaq_f32(d01, s0, w1); - s0 = vld1q_dup_f32(src + i + 1 * F); - d10 = vmlaq_f32(d10, s0, w0); - d11 = vmlaq_f32(d11, s0, w1); - s0 = vld1q_dup_f32(src + i + 2 * F); - d20 = vmlaq_f32(d20, s0, w0); - d21 = vmlaq_f32(d21, s0, w1); - s0 = vld1q_dup_f32(src + i + 3 * F); - d30 = vmlaq_f32(d30, s0, w0); - d31 = vmlaq_f32(d31, s0, w1); - } - src += srcS; - } - if (tail == DF) - { - Term::template Save(dst + 0, d00, bias, params); - Term::template Save(dst + F, d01, bias, params); - dst += dstC; - Term::template Save(dst + 0, d10, bias, params); - Term::template Save(dst + F, d11, bias, params); - dst += dstC; - Term::template Save(dst + 0, d20, bias, params); - Term::template Save(dst + F, d21, bias, params); - dst += dstC; - Term::template Save(dst + 0, d30, bias, params); - Term::template Save(dst + F, d31, bias, params); - } - else - { - tail -= F; - Term::template Save(dst + 0, d00, bias, params); - Term::template Save(dst + F, d01, bias, params, tail); - dst += dstC; - Term::template Save(dst + 0, d10, bias, params); - Term::template Save(dst + F, d11, bias, params, tail); - dst += dstC; - Term::template Save(dst + 0, d20, bias, params); - Term::template Save(dst + F, d21, bias, params, tail); - dst += dstC; - Term::template Save(dst + 0, d30, bias, params); - Term::template Save(dst + F, d31, bias, params, tail); - } - } - else - { - d00 = vdupq_n_f32(0.0f); - d10 = vdupq_n_f32(0.0f); - d20 = vdupq_n_f32(0.0f); - d30 = vdupq_n_f32(0.0f); - for (size_t c = 0; c < srcC; c += F) - { - size_t n = Simd::Min(F, srcC - c); - for (size_t i = 0; i < n; ++i, weight += DF) - { - w0 = Load(weight + 0); - s0 = vld1q_dup_f32(src + i + 0 * F); - d00 = vmlaq_f32(d00, s0, w0); - s0 = vld1q_dup_f32(src + i + 1 * F); - d10 = vmlaq_f32(d10, s0, w0); - s0 = vld1q_dup_f32(src + i + 2 * F); - d20 = vmlaq_f32(d20, s0, w0); - s0 = vld1q_dup_f32(src + i + 3 * F); - d30 = vmlaq_f32(d30, s0, w0); - } - src += srcS; - } - if (tail == F) - { - Term::template Save(dst + 0, d00, bias, params); - dst += dstC; - Term::template Save(dst + 0, d10, bias, params); - dst += dstC; - Term::template Save(dst + 0, d20, bias, params); - dst += dstC; - Term::template Save(dst + 0, d30, bias, params); - } - else - { - Term::template Save(dst + 0, d00, bias, params, tail); - dst += dstC; - Term::template Save(dst + 0, d10, bias, params, tail); - dst += dstC; - Term::template Save(dst + 0, d20, bias, params, tail); - dst += dstC; - Term::template Save(dst + 0, d30, bias, params, tail); - } - } - } - - template void OutputConvolution_2x3(const float * src, size_t srcC, size_t srcS, - const float * weight, const float32x4_t * bias, const float32x4_t * params, float * dst, size_t dstC, size_t tail) - { - float32x4_t d00, d01, d10, d11, d20, d21, s0, w0, w1; - if (tail > F) - { - d00 = vdupq_n_f32(0.0f), d01 = vdupq_n_f32(0.0f); - d10 = vdupq_n_f32(0.0f), d11 = vdupq_n_f32(0.0f); - d20 = vdupq_n_f32(0.0f), d21 = vdupq_n_f32(0.0f); - for (size_t c = 0; c < srcC; c += F) - { - size_t n = Simd::Min(F, srcC - c); - for (size_t i = 0; i < n; ++i, weight += DF) - { - w0 = Load(weight + 0); - w1 = Load(weight + F); - s0 = vld1q_dup_f32(src + i + 0 * F); - d00 = vmlaq_f32(d00, s0, w0); - d01 = vmlaq_f32(d01, s0, w1); - s0 = vld1q_dup_f32(src + i + 1 
* F); - d10 = vmlaq_f32(d10, s0, w0); - d11 = vmlaq_f32(d11, s0, w1); - s0 = vld1q_dup_f32(src + i + 2 * F); - d20 = vmlaq_f32(d20, s0, w0); - d21 = vmlaq_f32(d21, s0, w1); - } - src += srcS; - } - if (tail == DF) - { - Term::template Save(dst + 0, d00, bias, params); - Term::template Save(dst + F, d01, bias, params); - dst += dstC; - Term::template Save(dst + 0, d10, bias, params); - Term::template Save(dst + F, d11, bias, params); - dst += dstC; - Term::template Save(dst + 0, d20, bias, params); - Term::template Save(dst + F, d21, bias, params); - } - else - { - tail -= F; - Term::template Save(dst + 0, d00, bias, params); - Term::template Save(dst + F, d01, bias, params, tail); - dst += dstC; - Term::template Save(dst + 0, d10, bias, params); - Term::template Save(dst + F, d11, bias, params, tail); - dst += dstC; - Term::template Save(dst + 0, d20, bias, params); - Term::template Save(dst + F, d21, bias, params, tail); - } - } - else - { - d00 = vdupq_n_f32(0.0f); - d10 = vdupq_n_f32(0.0f); - d20 = vdupq_n_f32(0.0f); - for (size_t c = 0; c < srcC; c += F) - { - size_t n = Simd::Min(F, srcC - c); - for (size_t i = 0; i < n; ++i, weight += DF) - { - w0 = Load(weight + 0); - s0 = vld1q_dup_f32(src + i + 0 * F); - d00 = vmlaq_f32(d00, s0, w0); - s0 = vld1q_dup_f32(src + i + 1 * F); - d10 = vmlaq_f32(d10, s0, w0); - s0 = vld1q_dup_f32(src + i + 2 * F); - d20 = vmlaq_f32(d20, s0, w0); - } - src += srcS; - } - if (tail == F) - { - Term::template Save(dst + 0, d00, bias, params); - dst += dstC; - Term::template Save(dst + 0, d10, bias, params); - dst += dstC; - Term::template Save(dst + 0, d20, bias, params); - } - else - { - Term::template Save(dst + 0, d00, bias, params, tail); - dst += dstC; - Term::template Save(dst + 0, d10, bias, params, tail); - dst += dstC; - Term::template Save(dst + 0, d20, bias, params, tail); - } - } - } - - template void OutputConvolution_2x1(const float * src, size_t srcC, size_t srcS, - const float * weight, const float32x4_t * bias, const float32x4_t * params, float * dst, size_t dstC, size_t tail) - { - float32x4_t d00, d01, s0, w0, w1; - if (tail > F) - { - d00 = vdupq_n_f32(0.0f), d01 = vdupq_n_f32(0.0f); - for (size_t c = 0; c < srcC; c += F) - { - size_t n = Simd::Min(F, srcC - c); - for (size_t i = 0; i < n; ++i, weight += DF) - { - w0 = Load(weight + 0); - w1 = Load(weight + F); - s0 = vld1q_dup_f32(src + i + 0 * F); - d00 = vmlaq_f32(d00, s0, w0); - d01 = vmlaq_f32(d01, s0, w1); - } - src += srcS; - } - if (tail == DF) - { - Term::template Save(dst + 0, d00, bias, params); - Term::template Save(dst + F, d01, bias, params); - } - else - { - Term::template Save(dst + 0, d00, bias, params); - Term::template Save(dst + F, d01, bias, params, tail - F); - } - } - else - { - d00 = vdupq_n_f32(0.0f); - for (size_t c = 0; c < srcC; c += F) - { - size_t n = Simd::Min(F, srcC - c); - for (size_t i = 0; i < n; ++i, weight += DF) - { - w0 = Load(weight + 0); - s0 = vld1q_dup_f32(src + i + 0 * F); - d00 = vmlaq_f32(d00, s0, w0); - } - src += srcS; - } - if (tail == F) - Term::template Save(dst + 0, d00, bias, params); - else - Term::template Save(dst + 0, d00, bias, params, tail); - } - } - - template void OutputConvolution(const float * src, const SimdConvolutionParameters & p, - size_t srcC, size_t yBeg, size_t yEnd, const size_t bufH[2], const float * weight, const float * bias, const float * params, float * dst) - { - assert(p.group == 1 && p.kernelY == 1 && p.strideY == 1); - size_t srcH = p.srcH, srcW = p.srcW, dstW = p.dstW, dstC = p.dstC; - size_t srcM = 
(bufH[1] - 1), srcS = bufH[1] * srcW*F; - size_t dstW3 = AlignLoAny(dstW, 3), dstW6 = AlignLoAny(dstW, 6); - float32x4_t _params[2], _bias[2]; - _params[0] = vdupq_n_f32(params[0]); - if (type == ::SimdConvolutionActivationRestrictRange || type == ::SimdConvolutionActivationHswish) - _params[1] = vdupq_n_f32(params[1]); - - dst += yBeg * p.dstW * p.dstC; - size_t dc = 0; - for (; dc < dstC; dc += DF) - { - size_t tail = Simd::Min(DF, dstC - dc); - _bias[0] = Load(bias + dc + 0); - _bias[1] = Load(bias + dc + F); - if (type == ::SimdConvolutionActivationPrelu) - { - _params[0] = Load(params + dc + 0); - _params[1] = Load(params + dc + F); - } - float * pDst = dst + dc; - for (size_t y = yBeg; y < yEnd; ++y) - { - const float * pSrc = src + (y&srcM)*srcW*F; - size_t x = 0; - for (; x < dstW6; x += 6, pDst += 6 * dstC, pSrc += 6 * F) - OutputConvolution_2x6(pSrc, srcC, srcS, weight, _bias, _params, pDst, dstC, tail); - if (dstW - dstW6 == 4) - OutputConvolution_2x4(pSrc, srcC, srcS, weight, _bias, _params, pDst, dstC, tail), pDst += 4 * dstC; - else - { - for (; x < dstW3; x += 3, pDst += 3 * dstC, pSrc += 3 * F) - OutputConvolution_2x3(pSrc, srcC, srcS, weight, _bias, _params, pDst, dstC, tail); - for (; x < dstW; ++x, pDst += dstC, pSrc += F) - OutputConvolution_2x1(pSrc, srcC, srcS, weight, _bias, _params, pDst, dstC, tail); - } - } - weight += srcC * DF; - } - } - - template void SetConvolutionPtr(const MergConvParam32f & p, size_t index, SynetMergedConvolution32f::ConvolutionPtr convolution[3]) - { - switch (index) - { - case 0: - if (p.conv[0].kernelY == 1 && p.conv[0].strideY == 1) - convolution[0] = InputConvolution1x1; - else - convolution[0] = InputConvolution; - break; - case 1: - if (p.conv[1].kernelY == 3) - convolution[1] = DepthwiseConvolution3x3; - else - convolution[1] = DepthwiseConvolution; - break; - case 2: - if (p.add) - { - convolution[2] = OutputConvolution; - convolution[3] = OutputConvolution; - convolution[4] = OutputConvolution; - convolution[5] = OutputConvolution; - } - else - { - convolution[2] = OutputConvolution; - convolution[3] = OutputConvolution; - convolution[4] = OutputConvolution; - convolution[5] = OutputConvolution; - } - break; - default: - assert(0); - } - } - - SynetMergedConvolution32f::SynetMergedConvolution32f(const MergConvParam32f & p) - : Base::SynetMergedConvolution32f(p) - { - SetSize(Base::AlgCacheL1(), Base::AlgCacheL2(), Base::AlgCacheL3(), Neon::F); - for (size_t i = 0; i < _param.count; ++i) - { - switch (p.conv[i].activation) - { - case SimdConvolutionActivationIdentity: SetConvolutionPtr(_param, i, _convolution); break; - case SimdConvolutionActivationRelu: SetConvolutionPtr(_param, i, _convolution); break; - case SimdConvolutionActivationLeakyRelu: SetConvolutionPtr(_param, i, _convolution); break; - case SimdConvolutionActivationRestrictRange: SetConvolutionPtr(_param, i, _convolution); break; - case SimdConvolutionActivationPrelu: SetConvolutionPtr(_param, i, _convolution); break; - case SimdConvolutionActivationElu: SetConvolutionPtr(_param, i, _convolution); break; - case SimdConvolutionActivationHswish: SetConvolutionPtr(_param, i, _convolution); break; - default: assert(0); - } - } - } - - //--------------------------------------------------------------------- - - void * SynetMergedConvolution32fInit(size_t batch, const SimdConvolutionParameters * convs, size_t count, SimdBool add) - { - MergConvParam32f param(batch, convs, count, add); - if (!param.Valid()) - return NULL; - return new Neon::SynetMergedConvolution32f(param); - 
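The constructor's switch above instantiates SetConvolutionPtr once per activation: the classic enum-to-template dispatch that bakes the activation into the kernels at compile time, so the inner loops pay no per-element branch. A self-contained sketch of the pattern (simplified names; not the library's code):

    #include <cassert>
    #include <cstdio>

    // A run-time activation value selects a compile-time instantiation.
    enum Activation { ActIdentity, ActRelu };

    template<Activation act> float Activate(float x)
    {
        return (act == ActRelu && x < 0.0f) ? 0.0f : x;
    }

    using KernelPtr = float (*)(float);

    template<Activation act> void SetKernel(KernelPtr& kernel)
    {
        kernel = Activate<act>;  // activation folded in at compile time
    }

    int main()
    {
        KernelPtr kernel = nullptr;
        Activation activation = ActRelu;  // known only at run time
        switch (activation)               // one case per enumerator, as above
        {
        case ActIdentity: SetKernel<ActIdentity>(kernel); break;
        case ActRelu:     SetKernel<ActRelu>(kernel);     break;
        default: assert(0);
        }
        printf("%.1f\n", kernel(-2.5f));  // prints 0.0
        return 0;
    }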
} - } - #endif//SIMD_NEON_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdNeonSynetPooling.cpp b/src/3rd/Simd/Simd/SimdNeonSynetPooling.cpp deleted file mode 100644 index 7ed81661..00000000 --- a/src/3rd/Simd/Simd/SimdNeonSynetPooling.cpp +++ /dev/null @@ -1,515 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdStore.h" -#include "Simd/SimdNeon.h" -#include "Simd/SimdBase.h" - -namespace Simd -{ -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - SIMD_INLINE void PoolingAverageNhwc1(const float* src, size_t srcS, size_t srcC, size_t kH, size_t kW, const float32x4_t& norm, float* dst) - { - float32x4_t sum0 = vdupq_n_f32(0); - for (size_t h = 0; h < kH; ++h) - { - for (size_t w = 0; w < kW; ++w) - { - sum0 = vaddq_f32(sum0, Load(src + w * srcC + 0 * F)); - } - src += srcS; - } - Store(dst + 0 * F, vmulq_f32(sum0, norm)); - } - - SIMD_INLINE void PoolingAverageNhwc2(const float* src, size_t srcS, size_t srcC, size_t kH, size_t kW, const float32x4_t& norm, float* dst) - { - float32x4_t sum0 = vdupq_n_f32(0); - float32x4_t sum1 = vdupq_n_f32(0); - for (size_t h = 0; h < kH; ++h) - { - for (size_t w = 0; w < kW; ++w) - { - sum0 = vaddq_f32(sum0, Load(src + w * srcC + 0 * F)); - sum1 = vaddq_f32(sum1, Load(src + w * srcC + 1 * F)); - } - src += srcS; - } - Store(dst + 0 * F, vmulq_f32(sum0, norm)); - Store(dst + 1 * F, vmulq_f32(sum1, norm)); - } - - SIMD_INLINE void PoolingAverageNhwc4(const float* src, size_t srcS, size_t srcC, size_t kH, size_t kW, const float32x4_t& norm, float* dst) - { - float32x4_t sum0 = vdupq_n_f32(0); - float32x4_t sum1 = vdupq_n_f32(0); - float32x4_t sum2 = vdupq_n_f32(0); - float32x4_t sum3 = vdupq_n_f32(0); - for (size_t h = 0; h < kH; ++h) - { - for (size_t w = 0; w < kW; ++w) - { - sum0 = vaddq_f32(sum0, Load(src + w * srcC + 0 * F)); - sum1 = vaddq_f32(sum1, Load(src + w * srcC + 1 * F)); - sum2 = vaddq_f32(sum2, Load(src + w * srcC + 2 * F)); - sum3 = vaddq_f32(sum3, Load(src + w * srcC + 3 * F)); - } - src += srcS; - } - Store(dst + 0 * F, vmulq_f32(sum0, norm)); - Store(dst + 1 * F, vmulq_f32(sum1, norm)); - Store(dst + 2 * F, vmulq_f32(sum2, norm)); - Store(dst + 3 * F, vmulq_f32(sum3, norm)); - } - - SIMD_INLINE void PoolingAverageNhwc8(const float* src, size_t srcS, size_t srcC, size_t kH, size_t kW, const float32x4_t& norm, float* dst) - { - float32x4_t sum0 = vdupq_n_f32(0); - float32x4_t sum1 = vdupq_n_f32(0); - float32x4_t 
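/* Eight independent accumulators (8 * F = 32 channels per call) keep the
   NEON pipeline busy: each vaddq_f32 depends only on its own sum, hiding
   the add latency behind the loads. */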
sum2 = vdupq_n_f32(0); - float32x4_t sum3 = vdupq_n_f32(0); - float32x4_t sum4 = vdupq_n_f32(0); - float32x4_t sum5 = vdupq_n_f32(0); - float32x4_t sum6 = vdupq_n_f32(0); - float32x4_t sum7 = vdupq_n_f32(0); - for (size_t h = 0; h < kH; ++h) - { - for (size_t w = 0; w < kW; ++w) - { - sum0 = vaddq_f32(sum0, Load(src + w * srcC + 0 * F)); - sum1 = vaddq_f32(sum1, Load(src + w * srcC + 1 * F)); - sum2 = vaddq_f32(sum2, Load(src + w * srcC + 2 * F)); - sum3 = vaddq_f32(sum3, Load(src + w * srcC + 3 * F)); - sum4 = vaddq_f32(sum4, Load(src + w * srcC + 4 * F)); - sum5 = vaddq_f32(sum5, Load(src + w * srcC + 5 * F)); - sum6 = vaddq_f32(sum6, Load(src + w * srcC + 6 * F)); - sum7 = vaddq_f32(sum7, Load(src + w * srcC + 7 * F)); - } - src += srcS; - } - Store(dst + 0 * F, vmulq_f32(sum0, norm)); - Store(dst + 1 * F, vmulq_f32(sum1, norm)); - Store(dst + 2 * F, vmulq_f32(sum2, norm)); - Store(dst + 3 * F, vmulq_f32(sum3, norm)); - Store(dst + 4 * F, vmulq_f32(sum4, norm)); - Store(dst + 5 * F, vmulq_f32(sum5, norm)); - Store(dst + 6 * F, vmulq_f32(sum6, norm)); - Store(dst + 7 * F, vmulq_f32(sum7, norm)); - } - - SIMD_INLINE void PoolingAverageNhwc(const float* src, size_t srcS, size_t srcC, size_t srcCF1, - size_t srcCF2, size_t srcCF4, size_t srcCF8, size_t kernelY, size_t kernelX, const float32x4_t& norm, float* dst) - { - size_t c = 0; - for (; c < srcCF8; c += 8 * F) - PoolingAverageNhwc8(src + c, srcS, srcC, kernelY, kernelX, norm, dst + c); - for (; c < srcCF4; c += 4 * F) - PoolingAverageNhwc4(src + c, srcS, srcC, kernelY, kernelX, norm, dst + c); - for (; c < srcCF2; c += 2 * F) - PoolingAverageNhwc2(src + c, srcS, srcC, kernelY, kernelX, norm, dst + c); - for (; c < srcCF1; c += 1 * F) - PoolingAverageNhwc1(src + c, srcS, srcC, kernelY, kernelX, norm, dst + c); - if (c < srcC) - PoolingAverageNhwc1(src + srcC - F, srcS, srcC, kernelY, kernelX, norm, dst + srcC - F); - } - - void SynetPoolingForwardAverage(const float* src, size_t srcC, size_t srcH, size_t srcW, size_t kernelY, size_t kernelX, - size_t strideY, size_t strideX, size_t padY, size_t padX, float* dst, size_t dstH, size_t dstW, SimdBool excludePad, SimdTensorFormatType format) - { - if (format == SimdTensorFormatNhwc) - { - if (srcC >= F) - { - size_t srcS = srcW * srcC; - size_t srcCF1 = AlignLo(srcC, 1 * F); - size_t srcCF2 = AlignLo(srcC, 2 * F); - size_t srcCF4 = AlignLo(srcC, 4 * F); - size_t srcCF8 = AlignLo(srcC, 8 * F); - if (padX == 0 && padY == 0 && (dstW - 1) * strideX + kernelX == srcW && (dstH - 1) * strideY + kernelY == srcH) - { - size_t stepY = srcW * srcC * strideY, stepX = strideX * srcC; - float32x4_t norm = vdupq_n_f32(1.0f / (kernelY * kernelX)); - for (size_t ph = 0; ph < dstH; ++ph) - { - const float* ps = src + ph * stepY; - for (size_t pw = 0; pw < dstW; ++pw, ps += stepX, dst += srcC) - PoolingAverageNhwc(ps, srcS, srcC, srcCF1, srcCF2, srcCF4, srcCF8, kernelY, kernelX, norm, dst); - } - } - else if (excludePad) - { - for (size_t ph = 0; ph < dstH; ++ph) - { - size_t hStart = ph * strideY - padY; - size_t hEnd = Simd::Min(hStart + kernelY, srcH); - hStart = Simd::Max(0, hStart); - size_t kH = hEnd - hStart; - for (size_t pw = 0; pw < dstW; ++pw) - { - size_t wStart = pw * strideX - padX; - size_t wEnd = Simd::Min(wStart + kernelX, srcW); - wStart = Simd::Max(0, wStart); - size_t kW = wEnd - wStart; - const float* ps = src + hStart * srcS + wStart * srcC; - float32x4_t norm = vdupq_n_f32(1.0f / (kH * kW)); - PoolingAverageNhwc(ps, srcS, srcC, srcCF1, srcCF2, srcCF4, srcCF8, kH, kW, norm, dst); - dst += 
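/* NHWC: every output pixel owns srcC consecutive floats, so dst advances by
   one full channel row per (ph, pw); the norm vector is rebuilt per pixel
   in this branch because excludePad shrinks the window at the borders. */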
srcC; - } - } - } - else - { - float32x4_t norm = vdupq_n_f32(1.0f / (kernelY * kernelX)); - for (size_t ph = 0; ph < dstH; ++ph) - { - size_t hStart = ph * strideY - padY; - size_t hEnd = Simd::Min(hStart + kernelY, srcH); - hStart = Simd::Max(0, hStart); - size_t kH = hEnd - hStart; - for (size_t pw = 0; pw < dstW; ++pw) - { - size_t wStart = pw * strideX - padX; - size_t wEnd = Simd::Min(wStart + kernelX, srcW); - wStart = Simd::Max(0, wStart); - size_t kW = wEnd - wStart; - const float* ps = src + hStart * srcS + wStart * srcC; - PoolingAverageNhwc(ps, srcS, srcC, srcCF1, srcCF2, srcCF4, srcCF8, kH, kW, norm, dst); - dst += srcC; - } -} - } - return; - } - } - else if (format == SimdTensorFormatNchw) - { - } - Base::SynetPoolingForwardAverage(src, srcC, srcH, srcW, kernelY, kernelX, strideY, strideX, padY, padX, dst, dstH, dstW, excludePad, format); - } - - //--------------------------------------------------------------------- - - SIMD_INLINE void PoolingMaxHwc1(const float * src, size_t srcS, size_t srcC, size_t kH, size_t kW, const float32x4_t & min, float * dst) - { - float32x4_t max0 = min; - for (size_t h = 0; h < kH; ++h) - { - for (size_t w = 0; w < kW; ++w) - { - max0 = vmaxq_f32(max0, Load(src + w * srcC + 0 * F)); - } - src += srcS; - } - Store(dst + 0 * F, max0); - } - - SIMD_INLINE void PoolingMaxHwc2(const float * src, size_t srcS, size_t srcC, size_t kH, size_t kW, const float32x4_t & min, float * dst) - { - float32x4_t max0 = min; - float32x4_t max1 = min; - for (size_t h = 0; h < kH; ++h) - { - for (size_t w = 0; w < kW; ++w) - { - max0 = vmaxq_f32(max0, Load(src + w * srcC + 0 * F)); - max1 = vmaxq_f32(max1, Load(src + w * srcC + 1 * F)); - } - src += srcS; - } - Store(dst + 0 * F, max0); - Store(dst + 1 * F, max1); - } - - SIMD_INLINE void PoolingMaxHwc4(const float * src, size_t srcS, size_t srcC, size_t kH, size_t kW, const float32x4_t & min, float * dst) - { - float32x4_t max0 = min; - float32x4_t max1 = min; - float32x4_t max2 = min; - float32x4_t max3 = min; - for (size_t h = 0; h < kH; ++h) - { - for (size_t w = 0; w < kW; ++w) - { - max0 = vmaxq_f32(max0, Load(src + w * srcC + 0 * F)); - max1 = vmaxq_f32(max1, Load(src + w * srcC + 1 * F)); - max2 = vmaxq_f32(max2, Load(src + w * srcC + 2 * F)); - max3 = vmaxq_f32(max3, Load(src + w * srcC + 3 * F)); - } - src += srcS; - } - Store(dst + 0 * F, max0); - Store(dst + 1 * F, max1); - Store(dst + 2 * F, max2); - Store(dst + 3 * F, max3); - } - - SIMD_INLINE void PoolingMaxHwc8(const float * src, size_t srcS, size_t srcC, size_t kH, size_t kW, const float32x4_t & min, float * dst) - { - float32x4_t max0 = min; - float32x4_t max1 = min; - float32x4_t max2 = min; - float32x4_t max3 = min; - float32x4_t max4 = min; - float32x4_t max5 = min; - float32x4_t max6 = min; - float32x4_t max7 = min; - for (size_t h = 0; h < kH; ++h) - { - for (size_t w = 0; w < kW; ++w) - { - max0 = vmaxq_f32(max0, Load(src + w * srcC + 0 * F)); - max1 = vmaxq_f32(max1, Load(src + w * srcC + 1 * F)); - max2 = vmaxq_f32(max2, Load(src + w * srcC + 2 * F)); - max3 = vmaxq_f32(max3, Load(src + w * srcC + 3 * F)); - max4 = vmaxq_f32(max4, Load(src + w * srcC + 4 * F)); - max5 = vmaxq_f32(max5, Load(src + w * srcC + 5 * F)); - max6 = vmaxq_f32(max6, Load(src + w * srcC + 6 * F)); - max7 = vmaxq_f32(max7, Load(src + w * srcC + 7 * F)); - } - src += srcS; - } - Store(dst + 0 * F, max0); - Store(dst + 1 * F, max1); - Store(dst + 2 * F, max2); - Store(dst + 3 * F, max3); - Store(dst + 4 * F, max4); - Store(dst + 5 * F, max5); - Store(dst + 6 * F, 
max6); - Store(dst + 7 * F, max7); - } - - void SynetPoolingForwardMax32f(const float * src, size_t srcC, size_t srcH, size_t srcW, size_t kernelY, size_t kernelX, - size_t strideY, size_t strideX, size_t padY, size_t padX, float * dst, size_t dstH, size_t dstW, SimdTensorFormatType format) - { - if (format == SimdTensorFormatNhwc) - { - if (srcC >= F) - { - size_t srcS = srcW * srcC; - size_t srcCF1 = AlignLo(srcC, 1 * F); - size_t srcCF2 = AlignLo(srcC, 2 * F); - size_t srcCF4 = AlignLo(srcC, 4 * F); - size_t srcCF8 = AlignLo(srcC, 8 * F); - float32x4_t min = vdupq_n_f32(-FLT_MAX); - for (size_t ph = 0; ph < dstH; ++ph) - { - size_t hStart = ph * strideY - padY; - size_t hEnd = Simd::Min(hStart + kernelY, srcH); - hStart = Simd::Max(0, hStart); - for (size_t pw = 0; pw < dstW; ++pw) - { - size_t wStart = pw * strideX - padX; - size_t wEnd = Simd::Min(wStart + kernelX, srcW); - wStart = Simd::Max(0, wStart); - const float* ps = src + hStart * srcS + wStart * srcC; - size_t c = 0; - for (; c < srcCF8; c += 8 * F) - PoolingMaxHwc8(ps + c, srcS, srcC, hEnd - hStart, wEnd - wStart, min, dst + c); - for (; c < srcCF4; c += 4 * F) - PoolingMaxHwc4(ps + c, srcS, srcC, hEnd - hStart, wEnd - wStart, min, dst + c); - for (; c < srcCF2; c += 2 * F) - PoolingMaxHwc2(ps + c, srcS, srcC, hEnd - hStart, wEnd - wStart, min, dst + c); - for (; c < srcCF1; c += 1 * F) - PoolingMaxHwc1(ps + c, srcS, srcC, hEnd - hStart, wEnd - wStart, min, dst + c); - if (c < srcC) - PoolingMaxHwc1(ps + srcC - F, srcS, srcC, hEnd - hStart, wEnd - wStart, min, dst + srcC - F); - dst += srcC; - } - } - } - } - else if (format == SimdTensorFormatNchw) - { - if (strideY == 1 && strideX == 1 && kernelY == 3 && kernelX == 3 && srcH == dstH && srcW == dstW && dstW > F) - { - for (size_t c = 0; c < srcC; ++c, src += srcH * srcW, dst += dstH * dstW) - Neon::NeuralPooling1x1Max3x3(src, srcW, srcW, srcH, dst, dstW); - return; - } - if (strideY == 2 && strideX == 2 && kernelY == 2 && kernelX == 2 && padY == 0 && padX == 0 && dstW >= F) - { - for (size_t c = 0; c < srcC; ++c, src += srcH * srcW, dst += dstH * dstW) - Neon::NeuralPooling2x2Max2x2(src, srcW, srcW, srcH, dst, dstW); - return; - } - if (strideY == 2 && strideX == 2 && kernelY == 3 && kernelX == 3 && padY == 0 && padX == 0 && dstW > F) - { - for (size_t c = 0; c < srcC; ++c, src += srcH * srcW, dst += dstH * dstW) - Neon::NeuralPooling2x2Max3x3(src, srcW, srcW, srcH, dst, dstW); - return; - } - Base::SynetPoolingForwardMax32f(src, srcC, srcH, srcW, kernelY, kernelX, strideY, strideX, padY, padX, dst, dstH, dstW, format); - } - else - assert(0); - } - - //--------------------------------------------------------------------- - - SIMD_INLINE void PoolingMaxNhwc1(const uint8_t* src, size_t srcS, size_t srcC, size_t kH, size_t kW, const uint8x16_t& min, uint8_t* dst) - { - uint8x16_t max0 = min; - for (size_t h = 0; h < kH; ++h) - { - for (size_t w = 0; w < kW; ++w) - { - const uint8_t* ps = src + w * srcC; - max0 = vmaxq_u8(max0, Load(ps + 0 * A)); - } - src += srcS; - } - Store(dst + 0 * A, max0); - } - - SIMD_INLINE void PoolingMaxNhwc2(const uint8_t* src, size_t srcS, size_t srcC, size_t kH, size_t kW, const uint8x16_t& min, uint8_t* dst) - { - uint8x16_t max0 = min; - uint8x16_t max1 = min; - for (size_t h = 0; h < kH; ++h) - { - for (size_t w = 0; w < kW; ++w) - { - const uint8_t* ps = src + w * srcC; - max0 = vmaxq_u8(max0, Load(ps + 0 * A)); - max1 = vmaxq_u8(max1, Load(ps + 1 * A)); - } - src += srcS; - } - Store(dst + 0 * A, max0); - Store(dst + 1 * A, max1); - } - - 
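The uint8 NHWC max-pooling kernels in this file all follow one template: N running maxima over a kH x kW window, widened from 1 to 8 vector registers. A self-contained single-register version (a sketch assuming A = 16 byte lanes; strides are in elements, i.e. bytes, since the data is uint8):

    #include <arm_neon.h>
    #include <cstddef>
    #include <cstdint>

    // Reduce a kH x kW pooling window to the element-wise maximum of 16
    // uint8 channels; srcS is the row stride, srcC the per-pixel channel
    // count, both counted in bytes.
    inline void PoolMax16(const uint8_t* src, size_t srcS, size_t srcC,
        size_t kH, size_t kW, uint8_t* dst)
    {
        uint8x16_t max0 = vdupq_n_u8(0);  // 0 is the identity for unsigned max
        for (size_t h = 0; h < kH; ++h)
        {
            for (size_t w = 0; w < kW; ++w)
                max0 = vmaxq_u8(max0, vld1q_u8(src + w * srcC));
            src += srcS;
        }
        vst1q_u8(dst, max0);
    }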
SIMD_INLINE void PoolingMaxNhwc4(const uint8_t* src, size_t srcS, size_t srcC, size_t kH, size_t kW, const uint8x16_t& min, uint8_t* dst) - { - uint8x16_t max0 = min; - uint8x16_t max1 = min; - uint8x16_t max2 = min; - uint8x16_t max3 = min; - for (size_t h = 0; h < kH; ++h) - { - for (size_t w = 0; w < kW; ++w) - { - const uint8_t* ps = src + w * srcC; - max0 = vmaxq_u8(max0, Load(ps + 0 * A)); - max1 = vmaxq_u8(max1, Load(ps + 1 * A)); - max2 = vmaxq_u8(max2, Load(ps + 2 * A)); - max3 = vmaxq_u8(max3, Load(ps + 3 * A)); - } - src += srcS; - } - Store(dst + 0 * A, max0); - Store(dst + 1 * A, max1); - Store(dst + 2 * A, max2); - Store(dst + 3 * A, max3); - } - - SIMD_INLINE void PoolingMaxNhwc8(const uint8_t* src, size_t srcS, size_t srcC, size_t kH, size_t kW, const uint8x16_t& min, uint8_t* dst) - { - uint8x16_t max0 = min; - uint8x16_t max1 = min; - uint8x16_t max2 = min; - uint8x16_t max3 = min; - uint8x16_t max4 = min; - uint8x16_t max5 = min; - uint8x16_t max6 = min; - uint8x16_t max7 = min; - for (size_t h = 0; h < kH; ++h) - { - for (size_t w = 0; w < kW; ++w) - { - const uint8_t* ps = src + w * srcC; - max0 = vmaxq_u8(max0, Load(ps + 0 * A)); - max1 = vmaxq_u8(max1, Load(ps + 1 * A)); - max2 = vmaxq_u8(max2, Load(ps + 2 * A)); - max3 = vmaxq_u8(max3, Load(ps + 3 * A)); - max4 = vmaxq_u8(max4, Load(ps + 4 * A)); - max5 = vmaxq_u8(max5, Load(ps + 5 * A)); - max6 = vmaxq_u8(max6, Load(ps + 6 * A)); - max7 = vmaxq_u8(max7, Load(ps + 7 * A)); - } - src += srcS; - } - Store(dst + 0 * A, max0); - Store(dst + 1 * A, max1); - Store(dst + 2 * A, max2); - Store(dst + 3 * A, max3); - Store(dst + 4 * A, max4); - Store(dst + 5 * A, max5); - Store(dst + 6 * A, max6); - Store(dst + 7 * A, max7); - } - - void SynetPoolingForwardMax8u(const uint8_t* src, size_t srcC, size_t srcH, size_t srcW, size_t kernelY, size_t kernelX, - size_t strideY, size_t strideX, size_t padY, size_t padX, uint8_t* dst, size_t dstH, size_t dstW, SimdTensorFormatType format) - { - if (format == SimdTensorFormatNhwc) - { - if (srcC >= A) - { - size_t srcS = srcW * srcC; - size_t srcCA1 = AlignLo(srcC, 1 * A); - size_t srcCA2 = AlignLo(srcC, 2 * A); - size_t srcCA4 = AlignLo(srcC, 4 * A); - size_t srcCA8 = AlignLo(srcC, 8 * A); - uint8x16_t min = vdupq_n_u8(0); - for (size_t ph = 0; ph < dstH; ++ph) - { - size_t hStart = ph * strideY - padY; - size_t hEnd = Simd::Min(hStart + kernelY, srcH); - hStart = Simd::Max(0, hStart); - for (size_t pw = 0; pw < dstW; ++pw) - { - size_t wStart = pw * strideX - padX; - size_t wEnd = Simd::Min(wStart + kernelX, srcW); - wStart = Simd::Max(0, wStart); - const uint8_t* ps = src + hStart * srcS + wStart * srcC; - size_t c = 0; - for (; c < srcCA8; c += 8 * A) - PoolingMaxNhwc8(ps + c, srcS, srcC, hEnd - hStart, wEnd - wStart, min, dst + c); - for (; c < srcCA4; c += 4 * A) - PoolingMaxNhwc4(ps + c, srcS, srcC, hEnd - hStart, wEnd - wStart, min, dst + c); - for (; c < srcCA2; c += 2 * A) - PoolingMaxNhwc2(ps + c, srcS, srcC, hEnd - hStart, wEnd - wStart, min, dst + c); - for (; c < srcCA1; c += 1 * A) - PoolingMaxNhwc1(ps + c, srcS, srcC, hEnd - hStart, wEnd - wStart, min, dst + c); - if (c < srcC) - PoolingMaxNhwc1(ps + srcC - A, srcS, srcC, hEnd - hStart, wEnd - wStart, min, dst + srcC - A); - dst += srcC; - } - } - } - else - Base::SynetPoolingForwardMax8u(src, srcC, srcH, srcW, kernelY, kernelX, strideY, strideX, padY, padX, dst, dstH, dstW, format); - } - else if (format == SimdTensorFormatNchw) - { - Base::SynetPoolingForwardMax8u(src, srcC, srcH, srcW, kernelY, kernelX, strideY, 
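/* no NEON path for NCHW uint8: the call is forwarded wholesale to the
   scalar Base implementation */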
strideX, padY, padX, dst, dstH, dstW, format); - } - else - assert(0); - } - } -#endif// SIMD_NEON_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdNeonTexture.cpp b/src/3rd/Simd/Simd/SimdNeonTexture.cpp deleted file mode 100644 index b407e0d2..00000000 --- a/src/3rd/Simd/Simd/SimdNeonTexture.cpp +++ /dev/null @@ -1,258 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdExtract.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdBase.h" - -namespace Simd -{ -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - SIMD_INLINE uint8x16_t TextureBoostedSaturatedGradient(const uint8x16_t & a, const uint8x16_t & b, const uint8x16_t & saturation, const uint8x16_t & boost) - { - uint8x16_t p = vminq_u8(vqsubq_u8(b, a), saturation); - uint8x16_t n = vminq_u8(vqsubq_u8(a, b), saturation); - return vmulq_u8(vsubq_u8(vaddq_u8(saturation, p), n), boost); - } - - template <bool align> SIMD_INLINE void TextureBoostedSaturatedGradient(const uint8_t * src, size_t stride, uint8_t * dx, uint8_t * dy, - const uint8x16_t & saturation, const uint8x16_t & boost) - { - Store<align>(dx, TextureBoostedSaturatedGradient(Load<false>(src - 1), Load<false>(src + 1), saturation, boost)); - Store<align>(dy, TextureBoostedSaturatedGradient(Load<align>(src - stride), Load<align>(src + stride), saturation, boost)); - } - - template <bool align> void TextureBoostedSaturatedGradient(const uint8_t * src, size_t srcStride, size_t width, size_t height, - uint8_t saturation, uint8_t boost, uint8_t * dx, size_t dxStride, uint8_t * dy, size_t dyStride) - { - assert(width >= A && int(2)*saturation*boost <= 0xFF); - if (align) - { - assert(Aligned(src) && Aligned(srcStride) && Aligned(dx) && Aligned(dxStride) && Aligned(dy) && Aligned(dyStride)); - } - - size_t alignedWidth = AlignLo(width, A); - uint8x16_t _saturation = vdupq_n_u8(saturation); - uint8x16_t _boost = vdupq_n_u8(boost); - - memset(dx, 0, width); - memset(dy, 0, width); - src += srcStride; - dx += dxStride; - dy += dyStride; - for (size_t row = 2; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - TextureBoostedSaturatedGradient<align>(src + col, srcStride, dx + col, dy + col, _saturation, _boost); - if (width != alignedWidth) - TextureBoostedSaturatedGradient<false>(src + width - A, srcStride, dx + width - A, dy + width - A, _saturation, _boost); - - dx[0] = 0; - dy[0] = 0; - dx[width - 1] = 0; - dy[width - 1] = 0; - - src += srcStride; - dx += dxStride; - dy += dyStride; - } - memset(dx, 0, width); - memset(dy, 0, width); - } - - void TextureBoostedSaturatedGradient(const uint8_t * src, size_t srcStride, size_t width, size_t height, - uint8_t saturation, uint8_t boost, uint8_t * dx, size_t dxStride, uint8_t * dy, size_t dyStride) - { - if (Aligned(src) && Aligned(srcStride) && Aligned(dx) && Aligned(dxStride) && Aligned(dy) && Aligned(dyStride)) - TextureBoostedSaturatedGradient<true>(src, srcStride, width, height, saturation, boost, dx, dxStride, dy, dyStride); - else - TextureBoostedSaturatedGradient<false>(src, srcStride, width, height, saturation, boost, dx, dxStride, dy, dyStride); - } - - template <bool align> SIMD_INLINE void TextureBoostedUv(const uint8_t * src, uint8_t * dst, uint8x16_t min, uint8x16_t max, uint8x16_t boost) - { - Store<align>(dst, vmulq_u8(vsubq_u8(vmaxq_u8(min, vminq_u8(max, Load<align>(src))), min), boost)); - } - - template <bool align> void TextureBoostedUv(const uint8_t * src, size_t srcStride, size_t width, size_t height, - uint8_t boost, uint8_t * dst, size_t dstStride) - { - assert(width >= A && boost < 0x80); - if (align) - assert(Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride)); - - size_t alignedWidth = AlignLo(width, A); - int min = 128 - (128 / boost); - int max = 255 - min; - - uint8x16_t _min = vdupq_n_u8(min); - uint8x16_t _max = vdupq_n_u8(max); - uint8x16_t _boost = vdupq_n_u8(boost); - - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - TextureBoostedUv<align>(src + col, dst + col, _min, _max, _boost); - if (width != alignedWidth) - TextureBoostedUv<false>(src + width - A, dst + width - A, _min, _max, _boost); - src += srcStride; - dst += dstStride; - } - } - - void TextureBoostedUv(const uint8_t * src, size_t srcStride, size_t width, size_t height, - uint8_t boost, uint8_t * dst, size_t dstStride) - { - if (Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride)) - TextureBoostedUv<true>(src, srcStride, width, height, boost, dst, dstStride); - else - TextureBoostedUv<false>(src, srcStride, width, height, boost, dst, dstStride); - } - - template <bool align> SIMD_INLINE int16x8_t TextureGetDifferenceSum(const uint8_t * src, const uint8_t * lo, const uint8_t * hi, size_t offset) - { - uint8x16_t _src = Load<align>(src + offset); - uint8x16_t avg = vrhaddq_u8(Load<align>(lo + offset), Load<align>(hi + offset)); - return (int16x8_t)vsubq_u16(vpaddlq_u8(_src), vpaddlq_u8(avg)); - } - - template <bool align> SIMD_INLINE int16x8_t TextureGetDifferenceSum(const uint8_t * src, const uint8_t * lo, const uint8_t * hi, size_t offset, const uint8x16_t & mask) - { - uint8x16_t _src = vandq_u8(Load<align>(src + offset), mask); - uint8x16_t avg = vandq_u8(vrhaddq_u8(Load<align>(lo + offset), Load<align>(hi + offset)), mask); - return (int16x8_t)vsubq_u16(vpaddlq_u8(_src), vpaddlq_u8(avg)); - } - - template <bool align> void TextureGetDifferenceSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * lo, size_t loStride, const uint8_t * hi, size_t hiStride, int64_t * sum) - { - assert(width >= A && sum != NULL); - if (align) - { - assert(Aligned(src) && Aligned(srcStride) && Aligned(lo) && Aligned(loStride) && Aligned(hi) && Aligned(hiStride)); - } - - size_t alignedWidth = AlignLo(width, A); - uint8x16_t tailMask = ShiftLeft(K8_FF, A - width + alignedWidth); - size_t blockSize = A << 6; - size_t blockCount = (alignedWidth >> 6) + 1; - - int64x2_t _sum = (int64x2_t)K64_0000000000000000; - for (size_t row = 0; row < height; ++row) - { - int32x4_t rowSum = (int32x4_t)K32_00000000; - for (size_t block = 0; block < blockCount; ++block) - { - int16x8_t 
blockSum = (int16x8_t)K16_0000; - for (size_t col = block*blockSize, end = Min(col + blockSize, alignedWidth); col < end; col += A) - blockSum = vaddq_s16(blockSum, TextureGetDifferenceSum<align>(src, lo, hi, col)); - rowSum = vaddq_s32(rowSum, vpaddlq_s16(blockSum)); - } - if (alignedWidth != width) - rowSum = vaddq_s32(rowSum, vpaddlq_s16(TextureGetDifferenceSum<false>(src, lo, hi, width - A, tailMask))); - _sum = vaddq_s64(_sum, vpaddlq_s32(rowSum)); - src += srcStride; - lo += loStride; - hi += hiStride; - } - *sum = ExtractSum64i(_sum); - } - - void TextureGetDifferenceSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, - const uint8_t * lo, size_t loStride, const uint8_t * hi, size_t hiStride, int64_t * sum) - { - if (Aligned(src) && Aligned(srcStride) && Aligned(lo) && Aligned(loStride) && Aligned(hi) && Aligned(hiStride)) - TextureGetDifferenceSum<true>(src, srcStride, width, height, lo, loStride, hi, hiStride, sum); - else - TextureGetDifferenceSum<false>(src, srcStride, width, height, lo, loStride, hi, hiStride, sum); - } - - template <bool align> void TexturePerformCompensation(const uint8_t * src, size_t srcStride, size_t width, size_t height, - int shift, uint8_t * dst, size_t dstStride) - { - assert(width >= A && shift > -0xFF && shift < 0xFF && shift != 0); - if (align) - assert(Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride)); - - size_t alignedWidth = AlignLo(width, A); - uint8x16_t tailMask = src == dst ? ShiftLeft(K8_FF, A - width + alignedWidth) : K8_FF; - if (shift > 0) - { - uint8x16_t _shift = vdupq_n_u8(shift); - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - { - uint8x16_t _src = Load<align>(src + col); - Store<align>(dst + col, vqaddq_u8(_src, _shift)); - } - if (width != alignedWidth) - { - uint8x16_t _src = Load<false>(src + width - A); - Store<false>(dst + width - A, vqaddq_u8(_src, vandq_u8(_shift, tailMask))); - } - src += srcStride; - dst += dstStride; - } - } - if (shift < 0) - { - uint8x16_t _shift = vdupq_n_u8(-shift); - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - { - uint8x16_t _src = Load<align>(src + col); - Store<align>(dst + col, vqsubq_u8(_src, _shift)); - } - if (width != alignedWidth) - { - uint8x16_t _src = Load<false>(src + width - A); - Store<false>(dst + width - A, vqsubq_u8(_src, vandq_u8(_shift, tailMask))); - } - src += srcStride; - dst += dstStride; - } - } - } - - void TexturePerformCompensation(const uint8_t * src, size_t srcStride, size_t width, size_t height, - int shift, uint8_t * dst, size_t dstStride) - { - if (shift == 0) - { - if (src != dst) - Base::Copy(src, srcStride, width, height, 1, dst, dstStride); - return; - } - if (Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride)) - TexturePerformCompensation<true>(src, srcStride, width, height, shift, dst, dstStride); - else - TexturePerformCompensation<false>(src, srcStride, width, height, shift, dst, dstStride); - } - } -#endif// SIMD_NEON_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdNeonTransform.cpp b/src/3rd/Simd/Simd/SimdNeonTransform.cpp deleted file mode 100644 index 6b39774a..00000000 --- a/src/3rd/Simd/Simd/SimdNeonTransform.cpp +++ /dev/null @@ -1,345 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2019 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" - -namespace Simd -{ -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - template<size_t N> SIMD_INLINE void CopyPixel(const uint8_t * src, uint8_t * dst) - { - for (size_t i = 0; i < N; ++i) - dst[i] = src[i]; - } - - template<> SIMD_INLINE void CopyPixel<1>(const uint8_t * src, uint8_t * dst) - { - dst[0] = src[0]; - } - - template<> SIMD_INLINE void CopyPixel<2>(const uint8_t * src, uint8_t * dst) - { - ((uint16_t*)dst)[0] = ((uint16_t*)src)[0]; - } - - template<> SIMD_INLINE void CopyPixel<3>(const uint8_t * src, uint8_t * dst) - { - ((uint16_t*)dst)[0] = ((uint16_t*)src)[0]; - dst[2] = src[2]; - } - - template<> SIMD_INLINE void CopyPixel<4>(const uint8_t * src, uint8_t * dst) - { - ((uint32_t*)dst)[0] = ((uint32_t*)src)[0]; - } - - template<size_t N> void TransformImageRotate0(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride) - { - size_t rowSize = width * N; - for (size_t row = 0; row < height; ++row) - { - memcpy(dst, src, rowSize); - src += srcStride; - dst += dstStride; - } - } - - template<size_t N> void TransformImageRotate90(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride) - { - dst += (width - 1)*dstStride; - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < width; ++col) - CopyPixel<N>(src + col * N, dst - col * dstStride); - src += srcStride; - dst += N; - } - } - - template<size_t N> SIMD_INLINE void TransformImageRotate180HA(const uint8_t * src, uint8_t * dst) - { - dst += (HA - 1)*N; - for (size_t i = 0; i < HA; ++i) - CopyPixel<N>(src + i * N, dst - i * N); - } - - uint8x8_t K8_TURN = SIMD_VEC_SETR_PI8(7, 6, 5, 4, 3, 2, 1, 0); - - template<> SIMD_INLINE void TransformImageRotate180HA<1>(const uint8_t * src, uint8_t * dst) - { - uint8x8_t v = LoadHalf<false>(src); - v = vtbl1_u8(v, K8_TURN); - Store<false>(dst, v); - } - - template<> SIMD_INLINE void TransformImageRotate180HA<2>(const uint8_t * src, uint8_t * dst) - { - uint8x8x2_t v = LoadHalf2<false>(src); - v.val[0] = vtbl1_u8(v.val[0], K8_TURN); - v.val[1] = vtbl1_u8(v.val[1], K8_TURN); - Store2<false>(dst, v); - } - - template<> SIMD_INLINE void TransformImageRotate180HA<3>(const uint8_t * src, uint8_t * dst) - { - uint8x8x3_t v = LoadHalf3<false>(src); - v.val[0] = vtbl1_u8(v.val[0], K8_TURN); - v.val[1] = vtbl1_u8(v.val[1], K8_TURN); - v.val[2] = vtbl1_u8(v.val[2], K8_TURN); - Store3<false>(dst, v); - } - - template<> SIMD_INLINE void 
TransformImageRotate180HA<4>(const uint8_t * src, uint8_t * dst) - { - uint8x8x4_t v = LoadHalf4<false>(src); - v.val[0] = vtbl1_u8(v.val[0], K8_TURN); - v.val[1] = vtbl1_u8(v.val[1], K8_TURN); - v.val[2] = vtbl1_u8(v.val[2], K8_TURN); - v.val[3] = vtbl1_u8(v.val[3], K8_TURN); - Store4<false>(dst, v); - } - - template<size_t N> SIMD_INLINE void TransformImageRotate180DA(const uint8_t * src, uint8_t * dst) - { - TransformImageRotate180HA<N>(src + 0 * N * HA, dst - 0 * N * HA); - TransformImageRotate180HA<N>(src + 1 * N * HA, dst - 1 * N * HA); - TransformImageRotate180HA<N>(src + 2 * N * HA, dst - 2 * N * HA); - TransformImageRotate180HA<N>(src + 3 * N * HA, dst - 3 * N * HA); - } - - template<size_t N> void TransformImageRotate180(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride) - { - dst += (height - 1)*dstStride + (width - HA)*N; - size_t widthHA = AlignLo(width, HA); - size_t widthDA = AlignLo(width, DA); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < widthDA; col += DA) - TransformImageRotate180DA<N>(src + col * N, dst - col * N); - for (; col < widthHA; col += HA) - TransformImageRotate180HA<N>(src + col * N, dst - col * N); - if(col < width) - TransformImageRotate180HA<N>(src + (width - HA) * N, dst - (width - HA) * N); - src += srcStride; - dst -= dstStride; - } - } - - template<size_t N> void TransformImageRotate270(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride) - { - dst += (height - 1)*N; - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < width; ++col) - CopyPixel<N>(src + col * N, dst + col * dstStride); - src += srcStride; - dst -= N; - } - } - - template<size_t N> void TransformImageTransposeRotate0(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride) - { - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < width; ++col) - CopyPixel<N>(src + col * N, dst + col * dstStride); - src += srcStride; - dst += N; - } - } - - union Type3x4x4 - { - uint8x8x4_t d4; - uint8x16x2_t q2; - }; - - uint8x8_t K8_ROT0_000 = SIMD_VEC_SETR_PI8(0, 1, 2, 16, 17, 18, 32, 32); - uint8x8_t K8_ROT0_001 = SIMD_VEC_SETR_PI8(32, 32, 32, 32, 32, 32, 0, 1); - uint8x8_t K8_ROT0_011 = SIMD_VEC_SETR_PI8(2, 16, 17, 18, 32, 32, 32, 32); - uint8x8_t K8_ROT0_020 = SIMD_VEC_SETR_PI8(3, 4, 5, 19, 20, 21, 32, 32); - uint8x8_t K8_ROT0_021 = SIMD_VEC_SETR_PI8(32, 32, 32, 32, 32, 32, 3, 4); - uint8x8_t K8_ROT0_031 = SIMD_VEC_SETR_PI8(5, 19, 20, 21, 32, 32, 32, 32); - uint8x8_t K8_ROT0_100 = SIMD_VEC_SETR_PI8(6, 7, 8, 22, 23, 24, 32, 32); - uint8x8_t K8_ROT0_101 = SIMD_VEC_SETR_PI8(32, 32, 32, 32, 32, 32, 6, 7); - uint8x8_t K8_ROT0_111 = SIMD_VEC_SETR_PI8(8, 22, 23, 24, 32, 32, 32, 32); - uint8x8_t K8_ROT0_120 = SIMD_VEC_SETR_PI8(9, 10, 11, 25, 26, 27, 32, 32); - uint8x8_t K8_ROT0_121 = SIMD_VEC_SETR_PI8(32, 32, 32, 32, 32, 32, 9, 10); - uint8x8_t K8_ROT0_131 = SIMD_VEC_SETR_PI8(11, 25, 26, 27, 32, 32, 32, 32); - - SIMD_INLINE void TransformImageTransposeRotate0_3x4x4(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride) - { - Type3x4x4 a0, a1, b0, b1; - a0.q2.val[0] = Load<false>(src + 0 * srcStride); - a0.q2.val[1] = Load<false>(src + 1 * srcStride); - a1.q2.val[0] = Load<false>(src + 2 * srcStride); - a1.q2.val[1] = Load<false>(src + 3 * srcStride); - b0.d4.val[0] = vtbx4_u8(vtbl4_u8(a0.d4, K8_ROT0_000), a1.d4, K8_ROT0_001); - b0.d4.val[1] = vtbl4_u8(a1.d4, K8_ROT0_011); - b0.d4.val[2] = vtbx4_u8(vtbl4_u8(a0.d4, K8_ROT0_020), a1.d4, K8_ROT0_021); - b0.d4.val[3] = vtbl4_u8(a1.d4, K8_ROT0_031); - b1.d4.val[0] = vtbx4_u8(vtbl4_u8(a0.d4, K8_ROT0_100), a1.d4, K8_ROT0_101); - b1.d4.val[1] = vtbl4_u8(a1.d4, K8_ROT0_111); - b1.d4.val[2] = vtbx4_u8(vtbl4_u8(a0.d4, K8_ROT0_120), a1.d4, K8_ROT0_121); - b1.d4.val[3] = vtbl4_u8(a1.d4, K8_ROT0_131); - Store<false>(dst + 0 * dstStride, b0.q2.val[0]); - Store<false>(dst + 1 * dstStride, b0.q2.val[1]); - Store<false>(dst + 2 * dstStride, b1.q2.val[0]); - Store<false>(dst + 3 * dstStride, b1.q2.val[1]); - } - - template<> void TransformImageTransposeRotate0<3>(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride) - { - size_t width4 = AlignLo(width - 4, 4); - size_t height4 = AlignLo(height, 4); - size_t row = 0; - for (; row < height4; row += 4) - { - size_t col = 0; - for (; col < width4; col += 4) - TransformImageTransposeRotate0_3x4x4(src + col * 3, srcStride, dst + col * dstStride, dstStride); - for (; col < width; ++col) - for (size_t i = 0; i < 4; ++i) - CopyPixel<3>(src + col * 3 + i * srcStride, dst + col * dstStride + i * 3); - src += 4 * srcStride; - dst += 12; - } - for (; row < height; ++row) - { - for (size_t col = 0; col < width; ++col) - CopyPixel<3>(src + col * 3, dst + col * dstStride); - src += srcStride; - dst += 3; - } - } - - SIMD_INLINE void TransformImageTransposeRotate0_4x4x4(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride) - { - uint32x4_t a0 = (uint32x4_t)Load<false>(src + 0 * srcStride); - uint32x4_t a1 = (uint32x4_t)Load<false>(src + 1 * srcStride); - uint32x4_t a2 = (uint32x4_t)Load<false>(src + 2 * srcStride); - uint32x4_t a3 = (uint32x4_t)Load<false>(src + 3 * srcStride); - uint32x4x2_t b0 = vzipq_u32(a0, a2); - uint32x4x2_t b1 = vzipq_u32(a1, a3); - uint32x4x2_t c0 = vzipq_u32(b0.val[0], b1.val[0]); - uint32x4x2_t c1 = vzipq_u32(b0.val[1], b1.val[1]); - Store<false>(dst + 0 * dstStride, (uint8x16_t)c0.val[0]); - Store<false>(dst + 1 * dstStride, (uint8x16_t)c0.val[1]); - Store<false>(dst + 2 * dstStride, (uint8x16_t)c1.val[0]); - Store<false>(dst + 3 * dstStride, (uint8x16_t)c1.val[1]); - } - - template<> void TransformImageTransposeRotate0<4>(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride) - { - size_t width4 = AlignLo(width, 4); - size_t height4 = AlignLo(height, 4); - size_t row = 0; - for (; row < height4; row += 4) - { - size_t col = 0; - for (; col < width4; col += 4) - TransformImageTransposeRotate0_4x4x4(src + col * 4, srcStride, dst + col * dstStride, dstStride); - for (; col < width; ++col) - for (size_t i = 0; i < 4; ++i) - CopyPixel<4>(src + col * 4 + i * srcStride, dst + col * dstStride + i * 4); - src += 4 * srcStride; - dst += 16; - } - for (; row < height; ++row) - { - for (size_t col = 0; col < width; ++col) - CopyPixel<4>(src + col * 4, dst + col * dstStride); - src += srcStride; - dst += 4; - } - } - - template<size_t N> void TransformImageTransposeRotate90(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride) - { - dst += (width - HA)*N; - size_t widthHA = AlignLo(width, HA); - size_t widthQA = AlignLo(width, QA); - for (size_t row = 0; row < height; ++row) - { - size_t col = 0; - for (; col < widthQA; col += DA) - TransformImageRotate180DA<N>(src + col * N, dst - col * N); - for (; col < widthHA; col += HA) - TransformImageRotate180HA<N>(src + col * N, dst - col * N); - if (col < width) - TransformImageRotate180HA<N>(src + (width - HA) * N, dst - (width - HA) * N); - src += srcStride; - dst += dstStride; - } - } - - template<size_t N> void TransformImageTransposeRotate180(const uint8_t * src, size_t 
srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride) - { - dst += (width - 1)*dstStride + (height - 1)*N; - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < width; ++col) - CopyPixel<N>(src + col * N, dst - col * dstStride); - src += srcStride; - dst -= N; - } - } - - template<size_t N> void TransformImageTransposeRotate270(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride) - { - size_t rowSize = width * N; - dst += (height - 1)*dstStride; - for (size_t row = 0; row < height; ++row) - { - memcpy(dst, src, rowSize); - src += srcStride; - dst -= dstStride; - } - } - - template<size_t N> void TransformImage(const uint8_t * src, size_t srcStride, size_t width, size_t height, SimdTransformType transform, uint8_t * dst, size_t dstStride) - { - typedef void(*TransformImagePtr)(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride); - static const TransformImagePtr transformImage[8] = { TransformImageRotate0<N>, TransformImageRotate90<N>, TransformImageRotate180<N>, TransformImageRotate270<N>, - TransformImageTransposeRotate0<N>, TransformImageTransposeRotate90<N>, TransformImageTransposeRotate180<N>, TransformImageTransposeRotate270<N> }; - transformImage[(int)transform](src, srcStride, width, height, dst, dstStride); - }; - - void TransformImage(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t pixelSize, SimdTransformType transform, uint8_t * dst, size_t dstStride) - { - switch (pixelSize) - { - case 1: TransformImage<1>(src, srcStride, width, height, transform, dst, dstStride); break; - case 2: TransformImage<2>(src, srcStride, width, height, transform, dst, dstStride); break; - case 3: TransformImage<3>(src, srcStride, width, height, transform, dst, dstStride); break; - case 4: TransformImage<4>(src, srcStride, width, height, transform, dst, dstStride); break; - default: assert(0); - } - } - } -#endif// SIMD_NEON_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdNeonWinograd.cpp b/src/3rd/Simd/Simd/SimdNeonWinograd.cpp deleted file mode 100644 index 3693ebe3..00000000 --- a/src/3rd/Simd/Simd/SimdNeonWinograd.cpp +++ /dev/null @@ -1,2783 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2020 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE.
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdWinograd.h" -#include "Simd/SimdBase.h" -#include "Simd/SimdSet.h" - -namespace Simd -{ -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - SIMD_INLINE void Load4(const float * src, size_t step, float32x4_t * dst) - { - float32x4_t a0 = Load(src + 0 * step); - float32x4_t a1 = Load(src + 1 * step); - float32x4_t a2 = Load(src + 2 * step); - float32x4_t a3 = Load(src + 3 * step); - float32x4x2_t b0 = vzipq_f32(a0, a2); - float32x4x2_t b1 = vzipq_f32(a1, a3); - *(float32x4x2_t*)(dst + 0) = vzipq_f32(b0.val[0], b1.val[0]); - *(float32x4x2_t*)(dst + 2) = vzipq_f32(b0.val[1], b1.val[1]); - } - - //----------------------------------------------------------------------- - - SIMD_INLINE void WinogradKernel1x3Block1x4SetFilter(const float32x4_t* t, float* dst, size_t stride) - { - const float32x4_t r4 = vdupq_n_f32(1.0f / 4.0f); - const float32x4_t r6 = vdupq_n_f32(1.0f / 6.0f); - const float32x4_t mr6 = vdupq_n_f32(-1.0f / 6.0f); - const float32x4_t r12 = vdupq_n_f32(1.0f / 12.0f); - const float32x4_t r24 = vdupq_n_f32(1.0f / 24.0f); - Store(dst + 0 * stride, vmulq_f32(r4, t[0])); - float32x4_t t0 = vaddq_f32(t[0], t[2]); - Store(dst + 1 * stride, vmulq_f32(mr6, vaddq_f32(t0, t[1]))); - Store(dst + 2 * stride, vmulq_f32(mr6, vsubq_f32(t0, t[1]))); - float32x4_t t1 = vaddq_f32(vmulq_f32(r24, t[0]), vmulq_f32(r6, t[2])); - float32x4_t t2 = vmulq_f32(r12, t[1]); - Store(dst + 3 * stride, vaddq_f32(t1, t2)); - Store(dst + 4 * stride, vsubq_f32(t1, t2)); - Store(dst + 5 * stride, t[2]); - } - - SIMD_INLINE void WinogradKernel1x3Block1x4SetFilter4n(const float* src, float* dst, size_t stride) - { - float32x4_t s[3]; - s[0] = SetF32(src[0], src[3], src[6], src[9]); - s[1] = SetF32(src[1], src[4], src[7], src[10]); - s[2] = SetF32(src[2], src[5], src[8], src[11]); - WinogradKernel1x3Block1x4SetFilter(s, dst + 0 * stride, stride); - } - - SIMD_INLINE void WinogradKernel1x3Block1x4SetFilter4t(const float* src, float* dst, size_t stride) - { - float32x4_t s[3]; - s[0] = Load(src + 0 * stride); - s[1] = Load(src + 1 * stride); - s[2] = Load(src + 2 * stride); - WinogradKernel1x3Block1x4SetFilter(s, dst + 0 * stride, stride); - } - - void WinogradKernel1x3Block1x4SetFilter(const float* src, size_t size, float* dst, SimdBool trans) - { - size_t size4 = AlignLo(size, 4), i = 0; - if (trans) - { - for (; i < size4; i += 4) - WinogradKernel1x3Block1x4SetFilter4t(src + i, dst + i, size); - for (; i < size; i += 1) - Base::WinogradKernel1x3Block1x4SetFilter1t(src + i, dst + i, size); - } - else - { - for (; i < size4; i += 4, src += 12, dst += 4) - WinogradKernel1x3Block1x4SetFilter4n(src, dst, size); - for (; i < size; i += 1, src += 3, dst += 1) - Base::WinogradKernel1x3Block1x4SetFilter1n(src, dst, size); - } - } - - //----------------------------------------------------------------------- - - SIMD_INLINE void WinogradKernel1x3Block1x4SetInput4Store(const float32x4_t src[6], float* dst, size_t stride) - { - float32x4_t _2 = vdupq_n_f32(2.0f); - float32x4_t _4 = vdupq_n_f32(4.0f); - float32x4_t _5 = vdupq_n_f32(5.0f); - Store(dst + 0 * stride, vaddq_f32(vsubq_f32(vmulq_f32(_4, src[0]), vmulq_f32(_5, src[2])), src[4])); - Store(dst + 1 * stride, vsubq_f32(vaddq_f32(src[3], src[4]), vmulq_f32(_4, vaddq_f32(src[1], src[2])))); - Store(dst + 2 * stride, vaddq_f32(vmulq_f32(_4, vsubq_f32(src[1], src[2])), vsubq_f32(src[4], src[3]))); - Store(dst + 3 * stride, vaddq_f32(vmulq_f32(_2, vsubq_f32(src[3], src[1])), vsubq_f32(src[4], src[2]))); - 
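/* [Editor's note, not part of the deleted source: the constants in WinogradKernel1x3Block1x4SetFilter and in this input transform are the standard Winograd F(4,3) matrices with interpolation points {0, ±1, ±2}. Written out in the usual G/B^T/A^T naming convention (these matrix names are an assumption of this note, not identifiers from the library), each Store(dst + k * stride, ...) computes row k of the corresponding matrix-vector product:

   filter transform G (6x3):        input transform B^T (6x6):
   [  1/4     0     0 ]             [ 4  0 -5  0  1  0 ]
   [ -1/6  -1/6  -1/6 ]             [ 0 -4 -4  1  1  0 ]
   [ -1/6   1/6  -1/6 ]             [ 0  4 -4 -1  1  0 ]
   [ 1/24  1/12   1/6 ]             [ 0 -2 -1  2  1  0 ]
   [ 1/24 -1/12   1/6 ]             [ 0  2 -1 -2  1  0 ]
   [    0     0     1 ]             [ 0  4  0 -5  0  1 ]

   output transform A^T (4x6), applied further below in WinogradKernel1x3Block1x4SetOutputLoad6:
   [ 1  1  1  1  1  0 ]
   [ 0  1 -1  2 -2  0 ]
   [ 0  1  1  4  4  0 ]
   [ 0  1 -1  8 -8  1 ] ] */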
Store(dst + 4 * stride, vaddq_f32(vmulq_f32(_2, vsubq_f32(src[1], src[3])), vsubq_f32(src[4], src[2]))); - Store(dst + 5 * stride, vaddq_f32(vsubq_f32(vmulq_f32(_4, src[1]), vmulq_f32(_5, src[3])), src[5])); - } - - SIMD_INLINE void WinogradKernel1x3Block1x4SetInput4t(const float* src, size_t srcC, float32x4_t dst[6]) - { - dst[0] = Load(src + 0 * srcC); - dst[1] = Load(src + 1 * srcC); - dst[2] = Load(src + 2 * srcC); - dst[3] = Load(src + 3 * srcC); - dst[4] = Load(src + 4 * srcC); - dst[5] = Load(src + 5 * srcC); - } - - SIMD_INLINE void WinogradKernel1x3Block1x4SetInput4t(const float* src, size_t srcC, float* dst, size_t dstStride) - { - size_t srcCF = AlignLo(srcC, F); - for (size_t c = 0; c < srcCF; c += F) - { - float32x4_t tmp[6]; - WinogradKernel1x3Block1x4SetInput4t(src + c, srcC, tmp); - WinogradKernel1x3Block1x4SetInput4Store(tmp, dst + c, dstStride); - } - if (srcCF < srcC) - { - float32x4_t tmp[6]; - WinogradKernel1x3Block1x4SetInput4t(src + srcC - F, srcC, tmp); - WinogradKernel1x3Block1x4SetInput4Store(tmp, dst + srcC - F, dstStride); - } - } - - SIMD_INLINE void WinogradKernel1x3Block1x4SetInput4t(const float* src, size_t srcC, size_t colB, size_t colE, float32x4_t dst[6]) - { - for (size_t col = 0; col < colB; ++col) - dst[col] = vdupq_n_f32(0.0f); - for (size_t col = colB; col < colE; ++col) - dst[col] = Load(src + col * srcC); - for (size_t col = colE; col < 6; ++col) - dst[col] = vdupq_n_f32(0.0f); - } - - SIMD_INLINE void WinogradKernel1x3Block1x4SetInput4t(const float* src, size_t srcC, size_t colB, size_t colE, float* dst, size_t dstStride) - { - size_t srcCF = AlignLo(srcC, F); - for (size_t c = 0; c < srcCF; c += F) - { - float32x4_t tmp[6]; - WinogradKernel1x3Block1x4SetInput4t(src + c, srcC, colB, colE, tmp); - WinogradKernel1x3Block1x4SetInput4Store(tmp, dst + c, dstStride); - } - if (srcCF < srcC) - { - float32x4_t tmp[6]; - WinogradKernel1x3Block1x4SetInput4t(src + srcC - F, srcC, colB, colE, tmp); - WinogradKernel1x3Block1x4SetInput4Store(tmp, dst + srcC - F, dstStride); - } - } - - void WinogradKernel1x3Block1x4SetInput(const float* src, size_t srcChannels, size_t srcHeight, size_t srcWidth, - size_t padY, size_t padX, size_t padH, size_t padW, float* dst, size_t dstStride, SimdBool trans) - { - assert(padX == padW && padY == 0 && padH == 0 && (padX == 0 || padX == 1)); - if (trans ? (srcChannels < 4) : (srcWidth < 12)) - { - Base::WinogradKernel1x3Block1x4SetInput(src, srcChannels, srcHeight, srcWidth, padY, padX, padH, padW, dst, dstStride, trans); - return; - } - size_t dstH = srcHeight; - size_t dstW = padX ? srcWidth : srcWidth - 2; - size_t tileW = (dstW + 3) / 4; - size_t dstW4 = AlignLo(dstW, 4); - if (trans) - { - size_t noseW = Simd::Min(6, dstW + 1); - size_t startX = padX ? 4 : 0; - if (padX) - { - if (dstW == dstW4) - dstW4 -= 4; - src -= srcChannels; - } - size_t tailW = dstW - dstW4 + (padX ? 
1 : 2); - for (size_t row = 0; row < dstH; row += 1) - { - size_t col = 0; - if (padX) - WinogradKernel1x3Block1x4SetInput4t(src, srcChannels, 1, noseW, dst, dstStride), dst += srcChannels; - for (col = startX; col < dstW4; col += 4) - WinogradKernel1x3Block1x4SetInput4t(src + col * srcChannels, srcChannels, dst, dstStride), dst += srcChannels; - if (col < dstW) - WinogradKernel1x3Block1x4SetInput4t(src + col * srcChannels, srcChannels, 0, tailW, dst, dstStride), dst += srcChannels; - src += srcWidth * srcChannels; - } - } - else - { - Base::WinogradKernel1x3Block1x4SetInput(src, srcChannels, srcHeight, srcWidth, padY, padX, padH, padW, dst, dstStride, trans); - } - } - //----------------------------------------------------------------------- - - SIMD_INLINE void WinogradKernel1x3Block1x4SetOutputLoad6(const float* src, size_t stride, float32x4_t dst[4]) - { - float32x4_t s[6]; - s[0] = Load(src + 0 * stride); - s[1] = Load(src + 1 * stride); - s[2] = Load(src + 2 * stride); - s[3] = Load(src + 3 * stride); - s[4] = Load(src + 4 * stride); - s[5] = Load(src + 5 * stride); - float32x4_t _2 = vdupq_n_f32(2.0f); - float32x4_t _4 = vdupq_n_f32(4.0f); - float32x4_t _8 = vdupq_n_f32(8.0f); - dst[0] = vaddq_f32(vaddq_f32(vaddq_f32(s[0], s[1]), vaddq_f32(s[2], s[3])), s[4]); - dst[1] = vaddq_f32(vsubq_f32(s[1], s[2]), vmulq_f32(_2, vsubq_f32(s[3], s[4]))); - dst[2] = vaddq_f32(vaddq_f32(s[1], s[2]), vmulq_f32(_4, vaddq_f32(s[3], s[4]))); - dst[3] = vaddq_f32(vaddq_f32(vsubq_f32(s[1], s[2]), vmulq_f32(_8, vsubq_f32(s[3], s[4]))), s[5]); - } - - SIMD_INLINE void WinogradKernel1x3Block1x4SetOutputStore4(const float32x4_t src[4], float* dst, size_t dstC) - { - Store(dst + 0 * dstC, src[0]); - Store(dst + 1 * dstC, src[1]); - Store(dst + 2 * dstC, src[2]); - Store(dst + 3 * dstC, src[3]); - } - - SIMD_INLINE void WinogradKernel1x3Block1x4SetOutput4t(const float* src, size_t srcStride, float* dst, size_t dstC) - { - size_t dstCF = AlignLo(dstC, F); - for (size_t d = 0; d < dstCF; d += F) - { - float32x4_t tmp[4]; - WinogradKernel1x3Block1x4SetOutputLoad6(src + d, srcStride, tmp); - WinogradKernel1x3Block1x4SetOutputStore4(tmp, dst + d, dstC); - } - if (dstCF < dstC) - { - float32x4_t tmp[4]; - WinogradKernel1x3Block1x4SetOutputLoad6(src + dstC - F, srcStride, tmp); - WinogradKernel1x3Block1x4SetOutputStore4(tmp, dst + dstC - F, dstC); - } - } - - SIMD_INLINE void WinogradKernel1x3Block1x4SetOutputStore4(const float32x4_t src[4], float* dst, size_t dstC, size_t colE) - { - for (size_t col = 0; col < colE; ++col) - Store(dst + col * dstC, src[col]); - } - - SIMD_INLINE void WinogradKernel1x3Block1x4SetOutput4t(const float* src, size_t srcStride, float* dst, size_t dstC, size_t colE) - { - size_t dstCF = AlignLo(dstC, F); - for (size_t d = 0; d < dstCF; d += F) - { - float32x4_t tmp[4]; - WinogradKernel1x3Block1x4SetOutputLoad6(src + d, srcStride, tmp); - WinogradKernel1x3Block1x4SetOutputStore4(tmp, dst + d, dstC, colE); - } - if (dstCF < dstC) - { - float32x4_t tmp[4]; - WinogradKernel1x3Block1x4SetOutputLoad6(src + dstC - F, srcStride, tmp); - WinogradKernel1x3Block1x4SetOutputStore4(tmp, dst + dstC - F, dstC, colE); - } - } - - void WinogradKernel1x3Block1x4SetOutput(const float* src, size_t srcStride, float* dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans) - { - if (trans ? 
(dstChannels < 4) : (dstWidth < 16)) - { - Base::WinogradKernel1x3Block1x4SetOutput(src, srcStride, dst, dstChannels, dstHeight, dstWidth, trans); - return; - } - size_t tileW = (dstWidth + 3) / 4; - size_t dstW4 = AlignLo(dstWidth, 4); - if (trans) - { - for (size_t row = 0; row < dstHeight; row += 1) - { - size_t col; - for (col = 0; col < dstW4; col += 4) - WinogradKernel1x3Block1x4SetOutput4t(src, srcStride, dst + col * dstChannels, dstChannels), src += dstChannels; - if (col < dstWidth) - WinogradKernel1x3Block1x4SetOutput4t(src, srcStride, dst + col * dstChannels, dstChannels, dstWidth - col), src += dstChannels; - dst += dstWidth * dstChannels; - } - } - else - { - Base::WinogradKernel1x3Block1x4SetOutput(src, srcStride, dst, dstChannels, dstHeight, dstWidth, trans); - } - } - - //----------------------------------------------------------------------- - - SIMD_INLINE void WinogradKernel1x5Block1x4SetFilter(const float32x4_t* t, float* dst, size_t stride) - { - const float32x4_t r36 = vdupq_n_f32(1.0f / 36.0f); - const float32x4_t r48 = vdupq_n_f32(1.0f / 48.0f); - const float32x4_t mr120 = vdupq_n_f32(-1.0f / 120.0f); - const float32x4_t r720 = vdupq_n_f32(1.0f / 720.0f); - const float32x4_t _2 = vdupq_n_f32(2.0f); - const float32x4_t _3 = vdupq_n_f32(3.0f); - const float32x4_t _4 = vdupq_n_f32(4.0f); - const float32x4_t _9 = vdupq_n_f32(9.0f); - Store(dst + 0 * stride, vmulq_f32(r36, t[0])); - float32x4_t a[2]; - a[0] = vaddq_f32(vaddq_f32(t[0], t[2]), t[4]); - a[1] = vaddq_f32(t[1], t[3]); - Store(dst + 1 * stride, vmulq_f32(r48, vaddq_f32(a[0], a[1]))); - Store(dst + 2 * stride, vmulq_f32(r48, vsubq_f32(a[0], a[1]))); - a[0] = vaddq_f32(t[0], vmulq_f32(_4, vaddq_f32(t[2], vmulq_f32(_4, t[4])))); - a[1] = vmulq_f32(_2, vaddq_f32(t[1], vmulq_f32(_4, t[3]))); - Store(dst + 3 * stride, vmulq_f32(mr120, vaddq_f32(a[0], a[1]))); - Store(dst + 4 * stride, vmulq_f32(mr120, vsubq_f32(a[0], a[1]))); - a[0] = vaddq_f32(t[0], vmulq_f32(_9, vaddq_f32(t[2], vmulq_f32(_9, t[4])))); - a[1] = vmulq_f32(_3, vaddq_f32(t[1], vmulq_f32(_9, t[3]))); - Store(dst + 5 * stride, vmulq_f32(r720, vaddq_f32(a[0], a[1]))); - Store(dst + 6 * stride, vmulq_f32(r720, vsubq_f32(a[0], a[1]))); - Store(dst + 7 * stride, t[4]); - } - - SIMD_INLINE void WinogradKernel1x5Block1x4SetFilter4n(const float* src, float* dst, size_t stride) - { - float32x4_t s[5]; - Load4(src + 0, 5, s + 0); - s[4] = SetF32(src[4], src[9], src[14], src[19]); - WinogradKernel1x5Block1x4SetFilter(s, dst, stride); - } - - SIMD_INLINE void WinogradKernel1x5Block1x4SetFilter4t(const float* src, float* dst, size_t stride) - { - float32x4_t s[5]; - s[0] = Load(src + 0 * stride); - s[1] = Load(src + 1 * stride); - s[2] = Load(src + 2 * stride); - s[3] = Load(src + 3 * stride); - s[4] = Load(src + 4 * stride); - WinogradKernel1x5Block1x4SetFilter(s, dst, stride); - } - - void WinogradKernel1x5Block1x4SetFilter(const float* src, size_t size, float* dst, SimdBool trans) - { - size_t size4 = AlignLo(size, 4), i = 0; - if (trans) - { - for (; i < size4; i += 4) - WinogradKernel1x5Block1x4SetFilter4t(src + i, dst + i, size); - for (; i < size; i += 1) - Base::WinogradKernel1x5Block1x4SetFilter1t(src + i, dst + i, size); - } - else - { - for (; i < size4; i += 4, src += 20, dst += 4) - WinogradKernel1x5Block1x4SetFilter4n(src, dst, size); - for (; i < size; i += 1, src += 5, dst += 1) - Base::WinogradKernel1x5Block1x4SetFilter1n(src, dst, size); - } - } - - //----------------------------------------------------------------------- - - SIMD_INLINE void 
WinogradKernel1x5Block1x4SetInput4Store(const float32x4_t src[8], float* dst, size_t stride) - { - float32x4_t _2 = vdupq_n_f32(2.0f); - float32x4_t _3 = vdupq_n_f32(3.0f); - float32x4_t _4 = vdupq_n_f32(4.0f); - float32x4_t _5 = vdupq_n_f32(5.0f); - float32x4_t _9 = vdupq_n_f32(9.0f); - float32x4_t _10 = vdupq_n_f32(10.0f); - float32x4_t _13 = vdupq_n_f32(13.0f); - float32x4_t _14 = vdupq_n_f32(14.0f); - float32x4_t _36 = vdupq_n_f32(36.0f); - float32x4_t _49 = vdupq_n_f32(49.0f); - Store(dst + 0 * stride, vaddq_f32(vsubq_f32(vmulq_f32(_36, src[0]), vmulq_f32(_49, src[2])), vsubq_f32(vmulq_f32(_14, src[4]), src[6]))); - float32x4_t a[2]; - a[0] = vaddq_f32(vsubq_f32(vmulq_f32(_36, src[2]), vmulq_f32(_13, src[4])), src[6]); - a[1] = vaddq_f32(vsubq_f32(vmulq_f32(_36, src[1]), vmulq_f32(_13, src[3])), src[5]); - Store(dst + 1 * stride, vaddq_f32(a[0], a[1])); - Store(dst + 2 * stride, vsubq_f32(a[0], a[1])); - a[0] = vaddq_f32(vsubq_f32(vmulq_f32(_9, src[2]), vmulq_f32(_10, src[4])), src[6]); - a[1] = vmulq_f32(_2, vaddq_f32(vsubq_f32(vmulq_f32(_9, src[1]), vmulq_f32(_10, src[3])), src[5])); - Store(dst + 3 * stride, vaddq_f32(a[0], a[1])); - Store(dst + 4 * stride, vsubq_f32(a[0], a[1])); - a[0] = vaddq_f32(vsubq_f32(vmulq_f32(_4, src[2]), vmulq_f32(_5, src[4])), src[6]); - a[1] = vmulq_f32(_3, vaddq_f32(vsubq_f32(vmulq_f32(_4, src[1]), vmulq_f32(_5, src[3])), src[5])); - Store(dst + 5 * stride, vaddq_f32(a[0], a[1])); - Store(dst + 6 * stride, vsubq_f32(a[0], a[1])); - Store(dst + 7 * stride, vaddq_f32(vsubq_f32(vmulq_f32(_49, src[3]), vmulq_f32(_36, src[1])), vsubq_f32(src[7], vmulq_f32(_14, src[5])))); - } - - SIMD_INLINE void WinogradKernel1x5Block1x4SetInput4t(const float* src, size_t srcC, float32x4_t dst[8]) - { - dst[0] = Load(src + 0 * srcC); - dst[1] = Load(src + 1 * srcC); - dst[2] = Load(src + 2 * srcC); - dst[3] = Load(src + 3 * srcC); - dst[4] = Load(src + 4 * srcC); - dst[5] = Load(src + 5 * srcC); - dst[6] = Load(src + 6 * srcC); - dst[7] = Load(src + 7 * srcC); - } - - SIMD_INLINE void WinogradKernel1x5Block1x4SetInput4t(const float* src, size_t srcC, float* dst, size_t dstStride) - { - size_t srcCF = AlignLo(srcC, F); - for (size_t c = 0; c < srcCF; c += F) - { - float32x4_t tmp[8]; - WinogradKernel1x5Block1x4SetInput4t(src + c, srcC, tmp); - WinogradKernel1x5Block1x4SetInput4Store(tmp, dst + c, dstStride); - } - if (srcCF < srcC) - { - float32x4_t tmp[8]; - WinogradKernel1x5Block1x4SetInput4t(src + srcC - F, srcC, tmp); - WinogradKernel1x5Block1x4SetInput4Store(tmp, dst + srcC - F, dstStride); - } - } - - SIMD_INLINE void WinogradKernel1x5Block1x4SetInput4t(const float* src, size_t srcC, size_t colB, size_t colE, float32x4_t dst[8]) - { - for (size_t col = 0; col < colB; ++col) - dst[col] = vdupq_n_f32(0.0f); - for (size_t col = colB; col < colE; ++col) - dst[col] = Load(src + col * srcC); - for (size_t col = colE; col < 8; ++col) - dst[col] = vdupq_n_f32(0.0f); - } - - SIMD_INLINE void WinogradKernel1x5Block1x4SetInput4t(const float* src, size_t srcC, size_t colB, size_t colE, float* dst, size_t dstStride) - { - size_t srcCF = AlignLo(srcC, F); - for (size_t c = 0; c < srcCF; c += F) - { - float32x4_t tmp[8]; - WinogradKernel1x5Block1x4SetInput4t(src + c, srcC, colB, colE, tmp); - WinogradKernel1x5Block1x4SetInput4Store(tmp, dst + c, dstStride); - } - if (srcCF < srcC) - { - float32x4_t tmp[8]; - WinogradKernel1x5Block1x4SetInput4t(src + srcC - F, srcC, colB, colE, tmp); - WinogradKernel1x5Block1x4SetInput4Store(tmp, dst + srcC - F, dstStride); - } - } - - void 
WinogradKernel1x5Block1x4SetInput(const float* src, size_t srcChannels, size_t srcHeight, size_t srcWidth, - size_t padY, size_t padX, size_t padH, size_t padW, float* dst, size_t dstStride, SimdBool trans) - { - assert(padX == padW && padY == 0 && padH == 0 && (padX == 0 || padX == 2)); - if (trans ? (srcChannels < F) : true) - { - Base::WinogradKernel1x5Block1x4SetInput(src, srcChannels, srcHeight, srcWidth, padY, padX, padH, padW, dst, dstStride, trans); - return; - } - size_t dstH = srcHeight; - size_t dstW = padX ? srcWidth : srcWidth - 4; - size_t tileW = (dstW + 3) / 4; - size_t dstW4 = AlignLo(dstW, 4); - size_t noseW = Simd::Min(8, dstW + 2); - size_t startX = padX ? 4 : 0; - if (padX) - { - if (dstW == dstW4 || dstW == dstW4 + 1) - dstW4 -= 4; - src -= 2 * srcChannels; - } - size_t tailW = dstW - dstW4 + (padX ? 2 : 4); - for (size_t row = 0; row < dstH; row += 1) - { - size_t col = 0; - if (padX) - WinogradKernel1x5Block1x4SetInput4t(src, srcChannels, 2, noseW, dst, dstStride), dst += srcChannels; - for (col = startX; col < dstW4; col += 4) - WinogradKernel1x5Block1x4SetInput4t(src + col * srcChannels, srcChannels, dst, dstStride), dst += srcChannels; - for (size_t tail = tailW; col < dstW; col += 4, tail -= 4) - WinogradKernel1x5Block1x4SetInput4t(src + col * srcChannels, srcChannels, 0, tail, dst, dstStride), dst += srcChannels; - src += srcWidth * srcChannels; - } - } - - //----------------------------------------------------------------------- - - SIMD_INLINE void WinogradKernel1x5Block1x4SetOutputLoad8(const float* src, size_t stride, float32x4_t dst[4]) - { - const float32x4_t _2 = vdupq_n_f32(2.0f); - const float32x4_t _3 = vdupq_n_f32(3.0f); - const float32x4_t _4 = vdupq_n_f32(4.0f); - const float32x4_t _9 = vdupq_n_f32(9.0f); - float32x4_t s[8]; - s[0] = Load(src + 1 * stride); - s[7] = Load(src + 2 * stride); - s[1] = vaddq_f32(s[0], s[7]); - s[2] = vsubq_f32(s[0], s[7]); - s[0] = Load(src + 3 * stride); - s[7] = Load(src + 4 * stride); - s[3] = vaddq_f32(s[0], s[7]); - s[4] = vmulq_f32(_2, vsubq_f32(s[0], s[7])); - s[0] = Load(src + 5 * stride); - s[7] = Load(src + 6 * stride); - s[5] = vaddq_f32(s[0], s[7]); - s[6] = vmulq_f32(_3, vsubq_f32(s[0], s[7])); - dst[0] = vaddq_f32(Load(src + 0 * stride), vaddq_f32(vaddq_f32(s[1], s[3]), s[5])); - dst[1] = vaddq_f32(s[2], vaddq_f32(s[4], s[6])); - dst[2] = vaddq_f32(s[1], vaddq_f32(vmulq_f32(_4, s[3]), vmulq_f32(_9, s[5]))); - dst[3] = vaddq_f32(Load(src + 7 * stride), vaddq_f32(vaddq_f32(s[2], vmulq_f32(_4, s[4])), vmulq_f32(_9, s[6]))); - } - - SIMD_INLINE void WinogradKernel1x5Block1x4SetOutputStore4(const float32x4_t src[4], float* dst, size_t dstC) - { - Store(dst + 0 * dstC, src[0]); - Store(dst + 1 * dstC, src[1]); - Store(dst + 2 * dstC, src[2]); - Store(dst + 3 * dstC, src[3]); - } - - SIMD_INLINE void WinogradKernel1x5Block1x4SetOutput4t(const float* src, size_t srcStride, float* dst, size_t dstC) - { - size_t dstCF = AlignLo(dstC, F); - for (size_t d = 0; d < dstCF; d += F) - { - float32x4_t tmp[4]; - WinogradKernel1x5Block1x4SetOutputLoad8(src + d, srcStride, tmp); - WinogradKernel1x5Block1x4SetOutputStore4(tmp, dst + d, dstC); - } - if (dstCF < dstC) - { - float32x4_t tmp[4]; - WinogradKernel1x5Block1x4SetOutputLoad8(src + dstC - F, srcStride, tmp); - WinogradKernel1x5Block1x4SetOutputStore4(tmp, dst + dstC - F, dstC); - } - } - - SIMD_INLINE void WinogradKernel1x5Block1x4SetOutputStore4(const float32x4_t src[4], float* dst, size_t dstC, size_t colE) - { - for (size_t col = 0; col < colE; ++col) - Store(dst + 
col * dstC, src[col]); - } - - SIMD_INLINE void WinogradKernel1x5Block1x4SetOutput4t(const float* src, size_t srcStride, float* dst, size_t dstC, size_t colE) - { - size_t dstCF = AlignLo(dstC, F); - for (size_t d = 0; d < dstCF; d += F) - { - float32x4_t tmp[4]; - WinogradKernel1x5Block1x4SetOutputLoad8(src + d, srcStride, tmp); - WinogradKernel1x5Block1x4SetOutputStore4(tmp, dst + d, dstC, colE); - } - if (dstCF < dstC) - { - float32x4_t tmp[4]; - WinogradKernel1x5Block1x4SetOutputLoad8(src + dstC - F, srcStride, tmp); - WinogradKernel1x5Block1x4SetOutputStore4(tmp, dst + dstC - F, dstC, colE); - } - } - - void WinogradKernel1x5Block1x4SetOutput(const float* src, size_t srcStride, float* dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans) - { - if (trans ? (dstChannels < F) : true) - { - Base::WinogradKernel1x5Block1x4SetOutput(src, srcStride, dst, dstChannels, dstHeight, dstWidth, trans); - return; - } - size_t tileW = (dstWidth + 3) / 4; - size_t dstW4 = AlignLo(dstWidth, 4); - for (size_t row = 0; row < dstHeight; row += 1) - { - size_t col; - for (col = 0; col < dstW4; col += 4) - WinogradKernel1x5Block1x4SetOutput4t(src, srcStride, dst + col * dstChannels, dstChannels), src += dstChannels; - if (col < dstWidth) - WinogradKernel1x5Block1x4SetOutput4t(src, srcStride, dst + col * dstChannels, dstChannels, dstWidth - col), src += dstChannels; - dst += dstWidth * dstChannels; - } - } - - //----------------------------------------------------------------------- - - SIMD_INLINE void WinogradKernel2x2Block2x2SetFilter(const float32x4_t src[4], float* dst, size_t stride) - { - Store(dst + 0 * stride, src[0]); - Store(dst + 1 * stride, vaddq_f32(src[0], src[1])); - Store(dst + 2 * stride, src[1]); - - Store(dst + 3 * stride, vaddq_f32(src[0], src[2])); - Store(dst + 4 * stride, vaddq_f32(vaddq_f32(src[0], src[1]), vaddq_f32(src[2], src[3]))); - Store(dst + 5 * stride, vaddq_f32(src[1], src[3])); - - Store(dst + 6 * stride, src[2]); - Store(dst + 7 * stride, vaddq_f32(src[2], src[3])); - Store(dst + 8 * stride, src[3]); - } - - SIMD_INLINE void WinogradKernel2x2Block2x2SetFilter4n(const float* src, float* dst, size_t stride) - { - float32x4_t _src[4]; - Load4(src + 0, 4, _src + 0); - WinogradKernel2x2Block2x2SetFilter(_src, dst, stride); - } - - SIMD_INLINE void WinogradKernel2x2Block2x2SetFilter4t(const float* src, float* dst, size_t stride) - { - float32x4_t _src[4]; - _src[0] = Load(src + 0 * stride); - _src[1] = Load(src + 1 * stride); - _src[2] = Load(src + 2 * stride); - _src[3] = Load(src + 3 * stride); - WinogradKernel2x2Block2x2SetFilter(_src, dst, stride); - } - - void WinogradKernel2x2Block2x2SetFilter(const float* src, size_t size, float* dst, SimdBool trans) - { - size_t size4 = AlignLo(size, 4), i = 0; - if (trans) - { - for (; i < size4; i += 4) - WinogradKernel2x2Block2x2SetFilter4t(src + i, dst + i, size); - for (; i < size; i += 1) - Base::WinogradKernel2x2Block2x2SetFilter1t(src + i, dst + i, size); - } - else - { - for (; i < size4; i += 4, src += 16, dst += 4) - WinogradKernel2x2Block2x2SetFilter4n(src, dst, size); - for (; i < size; i += 1, src += 4, dst += 1) - Base::WinogradKernel2x2Block2x2SetFilter1n(src, dst, size); - } - } - - //----------------------------------------------------------------------- - - SIMD_INLINE void WinogradKernel2x2Block2x2SetInput4Store(const float32x4_t* src, float* dst, size_t stride) - { - Store(dst + 0 * stride, vaddq_f32(vsubq_f32(src[0], src[1]), vsubq_f32(src[4], src[3]))); - Store(dst + 1 * stride, 
vsubq_f32(src[1], src[4])); - Store(dst + 2 * stride, vaddq_f32(vsubq_f32(src[2], src[1]), vsubq_f32(src[4], src[5]))); - Store(dst + 3 * stride, vsubq_f32(src[3], src[4])); - Store(dst + 4 * stride, src[4]); - Store(dst + 5 * stride, vsubq_f32(src[5], src[4])); - Store(dst + 6 * stride, vaddq_f32(vsubq_f32(src[4], src[3]), vsubq_f32(src[6], src[7]))); - Store(dst + 7 * stride, vsubq_f32(src[7], src[4])); - Store(dst + 8 * stride, vaddq_f32(vsubq_f32(src[4], src[5]), vsubq_f32(src[8], src[7]))); - } - - SIMD_INLINE void WinogradKernel2x2Block2x2SetInput4t(const float* src, size_t srcS, size_t srcC, float32x4_t dst[9]) - { - dst[0] = Load(src + 0 * srcS + 0 * srcC); - dst[1] = Load(src + 0 * srcS + 1 * srcC); - dst[2] = Load(src + 0 * srcS + 2 * srcC); - dst[3] = Load(src + 1 * srcS + 0 * srcC); - dst[4] = Load(src + 1 * srcS + 1 * srcC); - dst[5] = Load(src + 1 * srcS + 2 * srcC); - dst[6] = Load(src + 2 * srcS + 0 * srcC); - dst[7] = Load(src + 2 * srcS + 1 * srcC); - dst[8] = Load(src + 2 * srcS + 2 * srcC); - } - - SIMD_INLINE void WinogradKernel2x2Block2x2SetInput4t(const float* src, size_t srcW, size_t srcC, float* dst, size_t dstStride) - { - size_t srcS = srcW * srcC; - size_t srcCF = AlignLo(srcC, F); - for (size_t c = 0; c < srcCF; c += F) - { - float32x4_t tmp[9]; - WinogradKernel2x2Block2x2SetInput4t(src + c, srcS, srcC, tmp); - WinogradKernel2x2Block2x2SetInput4Store(tmp, dst + c, dstStride); - } - if (srcCF < srcC) - { - float32x4_t tmp[9]; - WinogradKernel2x2Block2x2SetInput4t(src + srcC - F, srcS, srcC, tmp); - WinogradKernel2x2Block2x2SetInput4Store(tmp, dst + srcC - F, dstStride); - } - } - - SIMD_INLINE void WinogradKernel2x2Block2x2SetInput4t(const float* src, size_t srcS, size_t srcC, size_t rowB, size_t rowE, size_t colB, size_t colE, float32x4_t dst[9]) - { - for (size_t i = 0; i < 9; ++i) - dst[i] = vdupq_n_f32(0.0f); - for (size_t row = rowB; row < rowE; ++row) - for (size_t col = colB; col < colE; ++col) - dst[row * 3 + col] = Load(src + row * srcS + col * srcC); - } - - SIMD_INLINE void WinogradKernel2x2Block2x2SetInput4t(const float* src, size_t srcW, size_t srcC, size_t rowB, size_t rowE, size_t colB, size_t colE, float* dst, size_t dstStride) - { - size_t srcS = srcW * srcC; - size_t srcCF = AlignLo(srcC, F); - for (size_t c = 0; c < srcCF; c += F) - { - float32x4_t tmp[9]; - WinogradKernel2x2Block2x2SetInput4t(src + c, srcS, srcC, rowB, rowE, colB, colE, tmp); - WinogradKernel2x2Block2x2SetInput4Store(tmp, dst + c, dstStride); - } - if (srcCF < srcC) - { - float32x4_t tmp[9]; - WinogradKernel2x2Block2x2SetInput4t(src + srcC - F, srcS, srcC, rowB, rowE, colB, colE, tmp); - WinogradKernel2x2Block2x2SetInput4Store(tmp, dst + srcC - F, dstStride); - } - } - - void WinogradKernel2x2Block2x2SetInput(const float* src, size_t srcChannels, size_t srcHeight, size_t srcWidth, - size_t padY, size_t padX, size_t padH, size_t padW, float* dst, size_t dstStride, SimdBool trans) - { - assert(padY == padX && padW == padH && (padY + padH == 0 || padY + padH == 1)); - if (trans ? (srcChannels < F) : true) - { - Base::WinogradKernel2x2Block2x2SetInput(src, srcChannels, srcHeight, srcWidth, padY, padX, padH, padW, dst, dstStride, trans); - return; - } - size_t dstH = srcHeight - 1 + padY + padH; - size_t dstW = srcWidth - 1 + padX + padW; - size_t dstH2 = AlignLo(dstH, 2); - size_t dstW2 = AlignLo(dstW, 2); - size_t noseW = Simd::Min(3, dstW + 1); - size_t noseH = Simd::Min(3, dstH + 1); - size_t startY = padY ? 2 : 0; - size_t startX = padX ? 
2 : 0; - if (padY || padH) - { - if (dstH == dstH2) - dstH2 -= 2; - if (dstW == dstW2) - dstW2 -= 2; - if (padY) - src -= (srcWidth + 1) * (trans ? srcChannels : 1); - } - size_t tailW = dstW - dstW2 + (padW ? 0 : 1); - size_t tailH = dstH - dstH2 + (padH ? 0 : 1); - size_t row = 0, col = 0; - if (padY) - { - if (padX) - WinogradKernel2x2Block2x2SetInput4t(src, srcWidth, srcChannels, 1, noseH, 1, noseW, dst, dstStride), dst += srcChannels; - for (col = startX; col < dstW2; col += 2) - WinogradKernel2x2Block2x2SetInput4t(src + col * srcChannels, srcWidth, srcChannels, 1, noseH, 0, 3, dst, dstStride), dst += srcChannels; - if (col < dstW) - WinogradKernel2x2Block2x2SetInput4t(src + col * srcChannels, srcWidth, srcChannels, 1, noseH, 0, tailW, dst, dstStride), dst += srcChannels; - } - for (row = startY; row < dstH2; row += 2) - { - if (padX) - WinogradKernel2x2Block2x2SetInput4t(src + row * srcWidth * srcChannels, srcWidth, srcChannels, 0, 3, 1, noseW, dst, dstStride), dst += srcChannels; - for (col = startX; col < dstW2; col += 2) - WinogradKernel2x2Block2x2SetInput4t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, dst, dstStride), dst += srcChannels; - if (col < dstW) - WinogradKernel2x2Block2x2SetInput4t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, 0, 3, 0, tailW, dst, dstStride), dst += srcChannels; - } - if (row < dstH) - { - if (padX) - WinogradKernel2x2Block2x2SetInput4t(src + row * srcWidth * srcChannels, srcWidth, srcChannels, 0, tailH, 1, noseW, dst, dstStride), dst += srcChannels; - for (col = startX; col < dstW2; col += 2) - WinogradKernel2x2Block2x2SetInput4t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, 0, tailH, 0, 3, dst, dstStride), dst += srcChannels; - if (col < dstW) - WinogradKernel2x2Block2x2SetInput4t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, 0, tailH, 0, tailW, dst, dstStride), dst += srcChannels; - } - } - - //----------------------------------------------------------------------- - - SIMD_INLINE void WinogradKernel2x2Block2x2SetOutputLoad9(const float* src, size_t stride, float32x4_t* dst) - { - float32x4_t s[9]; - s[0] = Load(src + 0 * stride); - s[1] = Load(src + 1 * stride); - s[2] = Load(src + 2 * stride); - s[3] = Load(src + 3 * stride); - s[4] = Load(src + 4 * stride); - s[5] = Load(src + 5 * stride); - s[6] = Load(src + 6 * stride); - s[7] = Load(src + 7 * stride); - s[8] = Load(src + 8 * stride); - dst[0] = vaddq_f32(vaddq_f32(s[0], s[1]), vaddq_f32(s[3], s[4])); - dst[1] = vaddq_f32(vaddq_f32(s[1], s[2]), vaddq_f32(s[4], s[5])); - dst[2] = vaddq_f32(vaddq_f32(s[3], s[4]), vaddq_f32(s[6], s[7])); - dst[3] = vaddq_f32(vaddq_f32(s[4], s[5]), vaddq_f32(s[7], s[8])); - } - - SIMD_INLINE void WinogradKernel2x2Block2x2SetOutputStore4(const float32x4_t src[4], float* dst, size_t dstS, size_t dstC) - { - Store(dst + 0 * dstS + 0 * dstC, src[0]); - Store(dst + 0 * dstS + 1 * dstC, src[1]); - Store(dst + 1 * dstS + 0 * dstC, src[2]); - Store(dst + 1 * dstS + 1 * dstC, src[3]); - } - - SIMD_INLINE void WinogradKernel2x2Block2x2SetOutput4t(const float* src, size_t srcStride, float* dst, size_t dstW, size_t dstC) - { - size_t dstS = dstW * dstC, dstCF = AlignLo(dstC, F); - for (size_t d = 0; d < dstCF; d += F) - { - float32x4_t tmp[4]; - WinogradKernel2x2Block2x2SetOutputLoad9(src + d, srcStride, tmp); - WinogradKernel2x2Block2x2SetOutputStore4(tmp, dst + d, dstS, dstC); - } - if (dstCF < dstC) - { - float32x4_t tmp[4]; - WinogradKernel2x2Block2x2SetOutputLoad9(src + dstC - F, 
srcStride, tmp); - WinogradKernel2x2Block2x2SetOutputStore4(tmp, dst + dstC - F, dstS, dstC); - } - } - - SIMD_INLINE void WinogradKernel2x2Block2x2SetOutputStore4(const float32x4_t src[4], float* dst, size_t dstS, size_t dstC, size_t rowE, size_t colE) - { - for (size_t row = 0; row < rowE; ++row) - for (size_t col = 0; col < colE; ++col) - Store(dst + row * dstS + col * dstC, src[row * 2 + col]); - } - - SIMD_INLINE void WinogradKernel2x2Block2x2SetOutput4t(const float* src, size_t srcStride, float* dst, size_t dstW, size_t dstC, size_t rowE, size_t colE) - { - size_t dstS = dstW * dstC, dstCF = AlignLo(dstC, F); - for (size_t d = 0; d < dstCF; d += F) - { - float32x4_t tmp[4]; - WinogradKernel2x2Block2x2SetOutputLoad9(src + d, srcStride, tmp); - WinogradKernel2x2Block2x2SetOutputStore4(tmp, dst + d, dstS, dstC, rowE, colE); - } - if (dstCF < dstC) - { - float32x4_t tmp[4]; - WinogradKernel2x2Block2x2SetOutputLoad9(src + dstC - F, srcStride, tmp); - WinogradKernel2x2Block2x2SetOutputStore4(tmp, dst + dstC - F, dstS, dstC, rowE, colE); - } - } - - void WinogradKernel2x2Block2x2SetOutput(const float* src, size_t srcStride, float* dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans) - { - if (trans ? (dstChannels < F) : true) - { - Base::WinogradKernel2x2Block2x2SetOutput(src, srcStride, dst, dstChannels, dstHeight, dstWidth, trans); - return; - } - size_t tileH = (dstHeight + 1) / 2; - size_t tileW = (dstWidth + 1) / 2; - size_t dstH2 = AlignLo(dstHeight, 2); - size_t dstW2 = AlignLo(dstWidth, 2); - size_t row, col; - for (row = 0; row < dstH2; row += 2) - { - for (col = 0; col < dstW2; col += 2) - WinogradKernel2x2Block2x2SetOutput4t(src, srcStride, dst + (row * dstWidth + col) * dstChannels, dstWidth, dstChannels), src += dstChannels; - if (col < dstWidth) - WinogradKernel2x2Block2x2SetOutput4t(src, srcStride, dst + (row * dstWidth + col) * dstChannels, dstWidth, dstChannels, 2, dstWidth - col), src += dstChannels; - } - if (row < dstHeight) - { - for (col = 0; col < dstW2; col += 2) - WinogradKernel2x2Block2x2SetOutput4t(src, srcStride, dst + (row * dstWidth + col) * dstChannels, dstWidth, dstChannels, dstHeight - row, 2), src += dstChannels; - if (col < dstWidth) - WinogradKernel2x2Block2x2SetOutput4t(src, srcStride, dst + (row * dstWidth + col) * dstChannels, dstWidth, dstChannels, dstHeight - row, dstWidth - col), src += dstChannels; - } - } - - //----------------------------------------------------------------------- - - SIMD_INLINE void WinogradKernel2x2Block4x4SetFilterRow(const float32x4_t* t, float* dst, size_t stride) - { - const float32x4_t r2 = vdupq_n_f32(1.0f / 2.0f); - const float32x4_t r3 = vdupq_n_f32(1.0f / 3.0f); - const float32x4_t r6 = vdupq_n_f32(1.0f / 6.0f); - const float32x4_t mr2 = vdupq_n_f32(-1.0f / 2.0f); - - Store(dst + 0 * stride, vmulq_f32(r2, t[0])); - Store(dst + 1 * stride, vmulq_f32(mr2, vaddq_f32(t[0], t[1]))); - Store(dst + 2 * stride, vmulq_f32(r6, vsubq_f32(t[1], t[0]))); - Store(dst + 3 * stride, vaddq_f32(vmulq_f32(r6, t[0]), vmulq_f32(r3, t[1]))); - Store(dst + 4 * stride, t[1]); - } - - SIMD_INLINE void WinogradKernel2x2Block4x4SetFilter(const float32x4_t src[4], float* dst, size_t stride) - { - const float32x4_t r2 = vdupq_n_f32(1.0f / 2.0f); - const float32x4_t r3 = vdupq_n_f32(1.0f / 3.0f); - const float32x4_t r6 = vdupq_n_f32(1.0f / 6.0f); - const float32x4_t mr2 = vdupq_n_f32(-1.0f / 2.0f); - - float32x4_t t[2]; - t[0] = vmulq_f32(r2, src[0]); - t[1] = vmulq_f32(r2, src[1]); - WinogradKernel2x2Block4x4SetFilterRow(t, 
dst + 0 * stride, stride); - - t[0] = vmulq_f32(mr2, vaddq_f32(src[0], src[2])); - t[1] = vmulq_f32(mr2, vaddq_f32(src[1], src[3])); - WinogradKernel2x2Block4x4SetFilterRow(t, dst + 5 * stride, stride); - - t[0] = vmulq_f32(r6, vsubq_f32(src[2], src[0])); - t[1] = vmulq_f32(r6, vsubq_f32(src[3], src[1])); - WinogradKernel2x2Block4x4SetFilterRow(t, dst + 10 * stride, stride); - - t[0] = vaddq_f32(vmulq_f32(r6, src[0]), vmulq_f32(r3, src[2])); - t[1] = vaddq_f32(vmulq_f32(r6, src[1]), vmulq_f32(r3, src[3])); - WinogradKernel2x2Block4x4SetFilterRow(t, dst + 15 * stride, stride); - - t[0] = src[2]; - t[1] = src[3]; - WinogradKernel2x2Block4x4SetFilterRow(t, dst + 20 * stride, stride); - } - - SIMD_INLINE void WinogradKernel2x2Block4x4SetFilter4n(const float* src, float* dst, size_t stride) - { - float32x4_t _src[4]; - Load4(src + 0, 4, _src + 0); - WinogradKernel2x2Block4x4SetFilter(_src, dst, stride); - } - - SIMD_INLINE void WinogradKernel2x2Block4x4SetFilter4t(const float* src, float* dst, size_t stride) - { - float32x4_t _src[4]; - _src[0] = Load(src + 0 * stride); - _src[1] = Load(src + 1 * stride); - _src[2] = Load(src + 2 * stride); - _src[3] = Load(src + 3 * stride); - WinogradKernel2x2Block4x4SetFilter(_src, dst, stride); - } - - void WinogradKernel2x2Block4x4SetFilter(const float* src, size_t size, float* dst, SimdBool trans) - { - size_t size4 = AlignLo(size, 4), i = 0; - if (trans) - { - for (; i < size4; i += 4) - WinogradKernel2x2Block4x4SetFilter4t(src + i, dst + i, size); - for (; i < size; i += 1) - Base::WinogradKernel2x2Block4x4SetFilter1t(src + i, dst + i, size); - } - else - { - for (; i < size4; i += 4, src += 16, dst += 4) - WinogradKernel2x2Block4x4SetFilter4n(src, dst, size); - for (; i < size; i += 1, src += 4, dst += 1) - Base::WinogradKernel2x2Block4x4SetFilter1n(src, dst, size); - } - } - - //----------------------------------------------------------------------- - - SIMD_INLINE void WinogradKernel2x2Block4x4SetInputStoreRow(const float32x4_t tmp[5], float* dst, size_t stride) - { - const float32x4_t _2 = vdupq_n_f32(2.0f); - const float32x4_t _3 = vdupq_n_f32(3.0f); - Store(dst + 0 * stride, vaddq_f32(vsubq_f32(vmulq_f32(_2, tmp[0]), tmp[1]), vsubq_f32(tmp[3], vmulq_f32(_2, tmp[2])))); - Store(dst + 1 * stride, vsubq_f32(tmp[3], vaddq_f32(vmulq_f32(_2, tmp[1]), tmp[2]))); - Store(dst + 2 * stride, vaddq_f32(vsubq_f32(vmulq_f32(_2, tmp[1]), vmulq_f32(_3, tmp[2])), tmp[3])); - Store(dst + 3 * stride, vsubq_f32(tmp[3], tmp[1])); - Store(dst + 4 * stride, vaddq_f32(vsubq_f32(vmulq_f32(_2, tmp[1]), tmp[2]), vsubq_f32(tmp[4], vmulq_f32(_2, tmp[3])))); - } - - SIMD_INLINE void WinogradKernel2x2Block4x4SetInputStore(const float32x4_t* src, float* dst, size_t stride) - { - const float32x4_t _2 = vdupq_n_f32(2.0f); - const float32x4_t _3 = vdupq_n_f32(3.0f); - float32x4_t tmp[5]; - tmp[0] = vaddq_f32(vsubq_f32(vmulq_f32(_2, src[0]), src[5]), vsubq_f32(src[15], vmulq_f32(_2, src[10]))); - tmp[1] = vaddq_f32(vsubq_f32(vmulq_f32(_2, src[1]), src[6]), vsubq_f32(src[16], vmulq_f32(_2, src[11]))); - tmp[2] = vaddq_f32(vsubq_f32(vmulq_f32(_2, src[2]), src[7]), vsubq_f32(src[17], vmulq_f32(_2, src[12]))); - tmp[3] = vaddq_f32(vsubq_f32(vmulq_f32(_2, src[3]), src[8]), vsubq_f32(src[18], vmulq_f32(_2, src[13]))); - tmp[4] = vaddq_f32(vsubq_f32(vmulq_f32(_2, src[4]), src[9]), vsubq_f32(src[19], vmulq_f32(_2, src[14]))); - WinogradKernel2x2Block4x4SetInputStoreRow(tmp, dst + 0 * stride, stride); - - tmp[0] = vsubq_f32(src[15], vaddq_f32(vmulq_f32(_2, src[5]), src[10])); - tmp[1] = 
vsubq_f32(src[16], vaddq_f32(vmulq_f32(_2, src[6]), src[11])); - tmp[2] = vsubq_f32(src[17], vaddq_f32(vmulq_f32(_2, src[7]), src[12])); - tmp[3] = vsubq_f32(src[18], vaddq_f32(vmulq_f32(_2, src[8]), src[13])); - tmp[4] = vsubq_f32(src[19], vaddq_f32(vmulq_f32(_2, src[9]), src[14])); - WinogradKernel2x2Block4x4SetInputStoreRow(tmp, dst + 5 * stride, stride); - - tmp[0] = vaddq_f32(vsubq_f32(vmulq_f32(_2, src[5]), vmulq_f32(_3, src[10])), src[15]); - tmp[1] = vaddq_f32(vsubq_f32(vmulq_f32(_2, src[6]), vmulq_f32(_3, src[11])), src[16]); - tmp[2] = vaddq_f32(vsubq_f32(vmulq_f32(_2, src[7]), vmulq_f32(_3, src[12])), src[17]); - tmp[3] = vaddq_f32(vsubq_f32(vmulq_f32(_2, src[8]), vmulq_f32(_3, src[13])), src[18]); - tmp[4] = vaddq_f32(vsubq_f32(vmulq_f32(_2, src[9]), vmulq_f32(_3, src[14])), src[19]); - WinogradKernel2x2Block4x4SetInputStoreRow(tmp, dst + 10 * stride, stride); - - tmp[0] = vsubq_f32(src[15], src[5]); - tmp[1] = vsubq_f32(src[16], src[6]); - tmp[2] = vsubq_f32(src[17], src[7]); - tmp[3] = vsubq_f32(src[18], src[8]); - tmp[4] = vsubq_f32(src[19], src[9]); - WinogradKernel2x2Block4x4SetInputStoreRow(tmp, dst + 15 * stride, stride); - - tmp[0] = vaddq_f32(vsubq_f32(vmulq_f32(_2, src[5]), src[10]), vsubq_f32(src[20], vmulq_f32(_2, src[15]))); - tmp[1] = vaddq_f32(vsubq_f32(vmulq_f32(_2, src[6]), src[11]), vsubq_f32(src[21], vmulq_f32(_2, src[16]))); - tmp[2] = vaddq_f32(vsubq_f32(vmulq_f32(_2, src[7]), src[12]), vsubq_f32(src[22], vmulq_f32(_2, src[17]))); - tmp[3] = vaddq_f32(vsubq_f32(vmulq_f32(_2, src[8]), src[13]), vsubq_f32(src[23], vmulq_f32(_2, src[18]))); - tmp[4] = vaddq_f32(vsubq_f32(vmulq_f32(_2, src[9]), src[14]), vsubq_f32(src[24], vmulq_f32(_2, src[19]))); - WinogradKernel2x2Block4x4SetInputStoreRow(tmp, dst + 20 * stride, stride); - } - - SIMD_INLINE void WinogradKernel2x2Block4x4SetInput4t(const float* src, size_t srcS, size_t srcC, float32x4_t dst[25]) - { - dst[0] = Load(src + 0 * srcS + 0 * srcC); - dst[1] = Load(src + 0 * srcS + 1 * srcC); - dst[2] = Load(src + 0 * srcS + 2 * srcC); - dst[3] = Load(src + 0 * srcS + 3 * srcC); - dst[4] = Load(src + 0 * srcS + 4 * srcC); - dst[5] = Load(src + 1 * srcS + 0 * srcC); - dst[6] = Load(src + 1 * srcS + 1 * srcC); - dst[7] = Load(src + 1 * srcS + 2 * srcC); - dst[8] = Load(src + 1 * srcS + 3 * srcC); - dst[9] = Load(src + 1 * srcS + 4 * srcC); - dst[10] = Load(src + 2 * srcS + 0 * srcC); - dst[11] = Load(src + 2 * srcS + 1 * srcC); - dst[12] = Load(src + 2 * srcS + 2 * srcC); - dst[13] = Load(src + 2 * srcS + 3 * srcC); - dst[14] = Load(src + 2 * srcS + 4 * srcC); - dst[15] = Load(src + 3 * srcS + 0 * srcC); - dst[16] = Load(src + 3 * srcS + 1 * srcC); - dst[17] = Load(src + 3 * srcS + 2 * srcC); - dst[18] = Load(src + 3 * srcS + 3 * srcC); - dst[19] = Load(src + 3 * srcS + 4 * srcC); - dst[20] = Load(src + 4 * srcS + 0 * srcC); - dst[21] = Load(src + 4 * srcS + 1 * srcC); - dst[22] = Load(src + 4 * srcS + 2 * srcC); - dst[23] = Load(src + 4 * srcS + 3 * srcC); - dst[24] = Load(src + 4 * srcS + 4 * srcC); - } - - SIMD_INLINE void WinogradKernel2x2Block4x4SetInput4t(const float* src, size_t srcW, size_t srcC, float* dst, size_t dstStride) - { - size_t srcS = srcW * srcC; - size_t srcCF = AlignLo(srcC, F); - for (size_t c = 0; c < srcCF; c += F) - { - float32x4_t tmp[25]; - WinogradKernel2x2Block4x4SetInput4t(src + c, srcS, srcC, tmp); - WinogradKernel2x2Block4x4SetInputStore(tmp, dst + c, dstStride); - } - if (srcCF < srcC) - { - float32x4_t tmp[25]; - WinogradKernel2x2Block4x4SetInput4t(src + srcC - F, srcS, srcC, tmp); - 
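/* Channel tail for NHWC: when srcC is not a multiple of F (four floats per NEON vector), the last F channels are re-read from src + srcC - F, so this group overlaps the previous one; the overlapped lanes are transformed and stored twice, which avoids out-of-bounds access without masked loads. */ -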
WinogradKernel2x2Block4x4SetInputStore(tmp, dst + srcC - F, dstStride); - } - } - - SIMD_INLINE void WinogradKernel2x2Block4x4SetInput4t(const float* src, size_t srcS, size_t srcC, size_t rowB, size_t rowE, size_t colB, size_t colE, float32x4_t dst[25]) - { - for (size_t i = 0; i < 25; ++i) - dst[i] = vdupq_n_f32(0.0f); - for (size_t row = rowB; row < rowE; ++row) - for (size_t col = colB; col < colE; ++col) - dst[row * 5 + col] = Load(src + row * srcS + col * srcC); - } - - SIMD_INLINE void WinogradKernel2x2Block4x4SetInput4t(const float* src, size_t srcW, size_t srcC, size_t rowB, size_t rowE, size_t colB, size_t colE, float* dst, size_t dstStride) - { - size_t srcS = srcW * srcC; - size_t srcCF = AlignLo(srcC, F); - for (size_t c = 0; c < srcCF; c += F) - { - float32x4_t tmp[25]; - WinogradKernel2x2Block4x4SetInput4t(src + c, srcS, srcC, rowB, rowE, colB, colE, tmp); - WinogradKernel2x2Block4x4SetInputStore(tmp, dst + c, dstStride); - } - if (srcCF < srcC) - { - float32x4_t tmp[25]; - WinogradKernel2x2Block4x4SetInput4t(src + srcC - F, srcS, srcC, rowB, rowE, colB, colE, tmp); - WinogradKernel2x2Block4x4SetInputStore(tmp, dst + srcC - F, dstStride); - } - } - - void WinogradKernel2x2Block4x4SetInput(const float* src, size_t srcChannels, size_t srcHeight, size_t srcWidth, - size_t padY, size_t padX, size_t padH, size_t padW, float* dst, size_t dstStride, SimdBool trans) - { - assert(padY == padX && padW == padH && (padY + padH == 0 || padY + padH == 1)); - if (trans ? (srcChannels < F) : true) - { - Base::WinogradKernel2x2Block4x4SetInput(src, srcChannels, srcHeight, srcWidth, padY, padX, padH, padW, dst, dstStride, trans); - return; - } - size_t dstH = srcHeight - 1 + padY + padH; - size_t dstW = srcWidth - 1 + padX + padW; - size_t dstH4 = AlignLo(dstH, 4); - size_t dstW4 = AlignLo(dstW, 4); - size_t noseW = Simd::Min<size_t>(5, dstW + 1); - size_t noseH = Simd::Min<size_t>(5, dstH + 1); - size_t startY = padY ? 4 : 0; - size_t startX = padX ? 4 : 0; - if (padY || padH) - { - if (dstH == dstH4) - dstH4 -= 4; - if (dstW == dstW4) - dstW4 -= 4; - if (padY) - src -= (srcWidth + 1) * (trans ? srcChannels : 1); - } - size_t tailW = dstW - dstW4 + (padW ? 0 : 1); - size_t tailH = dstH - dstH4 + (padH ? 
0 : 1); - size_t row = 0, col = 0; - if (padY) - { - if (padX) - WinogradKernel2x2Block4x4SetInput4t(src, srcWidth, srcChannels, 1, noseH, 1, noseW, dst, dstStride), dst += srcChannels; - for (col = startX; col < dstW4; col += 4) - WinogradKernel2x2Block4x4SetInput4t(src + col * srcChannels, srcWidth, srcChannels, 1, noseH, 0, 5, dst, dstStride), dst += srcChannels; - if (col < dstW) - WinogradKernel2x2Block4x4SetInput4t(src + col * srcChannels, srcWidth, srcChannels, 1, noseH, 0, tailW, dst, dstStride), dst += srcChannels; - } - for (row = startY; row < dstH4; row += 4) - { - if (padX) - WinogradKernel2x2Block4x4SetInput4t(src + row * srcWidth * srcChannels, srcWidth, srcChannels, 0, 5, 1, noseW, dst, dstStride), dst += srcChannels; - for (col = startX; col < dstW4; col += 4) - WinogradKernel2x2Block4x4SetInput4t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, dst, dstStride), dst += srcChannels; - if (col < dstW) - WinogradKernel2x2Block4x4SetInput4t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, 0, 5, 0, tailW, dst, dstStride), dst += srcChannels; - } - if (row < dstH) - { - if (padX) - WinogradKernel2x2Block4x4SetInput4t(src + row * srcWidth * srcChannels, srcWidth, srcChannels, 0, tailH, 1, noseW, dst, dstStride), dst += srcChannels; - for (col = startX; col < dstW4; col += 4) - WinogradKernel2x2Block4x4SetInput4t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, 0, tailH, 0, 5, dst, dstStride), dst += srcChannels; - if (col < dstW) - WinogradKernel2x2Block4x4SetInput4t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, 0, tailH, 0, tailW, dst, dstStride), dst += srcChannels; - } - } - - //----------------------------------------------------------------------- - - SIMD_INLINE void WinogradKernel2x2Block4x4SetOutputGetRow(const float32x4_t* s, float32x4_t* d) - { - const float32x4_t _2 = vdupq_n_f32(2.0f); - const float32x4_t _4 = vdupq_n_f32(4.0f); - const float32x4_t _8 = vdupq_n_f32(8.0f); - d[0] = vaddq_f32(vaddq_f32(s[0], s[1]), vaddq_f32(s[2], s[3])); - d[1] = vaddq_f32(vsubq_f32(s[1], s[2]), vmulq_f32(_2, s[3])); - d[2] = vaddq_f32(vaddq_f32(s[1], s[2]), vmulq_f32(_4, s[3])); - d[3] = vaddq_f32(vsubq_f32(s[1], s[2]), vaddq_f32(vmulq_f32(_8, s[3]), s[4])); - } - - SIMD_INLINE void WinogradKernel2x2Block4x4SetOutputLoad25(const float* src, size_t stride, float32x4_t* dst) - { - float32x4_t s[25]; - s[0] = Load(src + 0 * stride); - s[1] = Load(src + 1 * stride); - s[2] = Load(src + 2 * stride); - s[3] = Load(src + 3 * stride); - s[4] = Load(src + 4 * stride); - s[5] = Load(src + 5 * stride); - s[6] = Load(src + 6 * stride); - s[7] = Load(src + 7 * stride); - s[8] = Load(src + 8 * stride); - s[9] = Load(src + 9 * stride); - s[10] = Load(src + 10 * stride); - s[11] = Load(src + 11 * stride); - s[12] = Load(src + 12 * stride); - s[13] = Load(src + 13 * stride); - s[14] = Load(src + 14 * stride); - s[15] = Load(src + 15 * stride); - s[16] = Load(src + 16 * stride); - s[17] = Load(src + 17 * stride); - s[18] = Load(src + 18 * stride); - s[19] = Load(src + 19 * stride); - s[20] = Load(src + 20 * stride); - s[21] = Load(src + 21 * stride); - s[22] = Load(src + 22 * stride); - s[23] = Load(src + 23 * stride); - s[24] = Load(src + 24 * stride); - - const float32x4_t _2 = vdupq_n_f32(2.0f); - const float32x4_t _4 = vdupq_n_f32(4.0f); - const float32x4_t _8 = vdupq_n_f32(8.0f); - float32x4_t t[5]; - t[0] = vaddq_f32(vaddq_f32(s[0], s[5]), vaddq_f32(s[10], s[15])); - t[1] = vaddq_f32(vaddq_f32(s[1], s[6]), vaddq_f32(s[11], 
s[16])); - t[2] = vaddq_f32(vaddq_f32(s[2], s[7]), vaddq_f32(s[12], s[17])); - t[3] = vaddq_f32(vaddq_f32(s[3], s[8]), vaddq_f32(s[13], s[18])); - t[4] = vaddq_f32(vaddq_f32(s[4], s[9]), vaddq_f32(s[14], s[19])); - WinogradKernel2x2Block4x4SetOutputGetRow(t, dst + 0); - - t[0] = vaddq_f32(vsubq_f32(s[5], s[10]), vmulq_f32(_2, s[15])); - t[1] = vaddq_f32(vsubq_f32(s[6], s[11]), vmulq_f32(_2, s[16])); - t[2] = vaddq_f32(vsubq_f32(s[7], s[12]), vmulq_f32(_2, s[17])); - t[3] = vaddq_f32(vsubq_f32(s[8], s[13]), vmulq_f32(_2, s[18])); - t[4] = vaddq_f32(vsubq_f32(s[9], s[14]), vmulq_f32(_2, s[19])); - WinogradKernel2x2Block4x4SetOutputGetRow(t, dst + 4); - - t[0] = vaddq_f32(vaddq_f32(s[5], s[10]), vmulq_f32(_4, s[15])); - t[1] = vaddq_f32(vaddq_f32(s[6], s[11]), vmulq_f32(_4, s[16])); - t[2] = vaddq_f32(vaddq_f32(s[7], s[12]), vmulq_f32(_4, s[17])); - t[3] = vaddq_f32(vaddq_f32(s[8], s[13]), vmulq_f32(_4, s[18])); - t[4] = vaddq_f32(vaddq_f32(s[9], s[14]), vmulq_f32(_4, s[19])); - WinogradKernel2x2Block4x4SetOutputGetRow(t, dst + 8); - - t[0] = vaddq_f32(vsubq_f32(s[5], s[10]), vaddq_f32(vmulq_f32(_8, s[15]), s[20])); - t[1] = vaddq_f32(vsubq_f32(s[6], s[11]), vaddq_f32(vmulq_f32(_8, s[16]), s[21])); - t[2] = vaddq_f32(vsubq_f32(s[7], s[12]), vaddq_f32(vmulq_f32(_8, s[17]), s[22])); - t[3] = vaddq_f32(vsubq_f32(s[8], s[13]), vaddq_f32(vmulq_f32(_8, s[18]), s[23])); - t[4] = vaddq_f32(vsubq_f32(s[9], s[14]), vaddq_f32(vmulq_f32(_8, s[19]), s[24])); - WinogradKernel2x2Block4x4SetOutputGetRow(t, dst + 12); - } - - SIMD_INLINE void WinogradKernel2x2Block4x4SetOutputStore16(const float32x4_t src[16], float* dst, size_t dstS, size_t dstC) - { - Store(dst + 0 * dstS + 0 * dstC, src[0]); - Store(dst + 0 * dstS + 1 * dstC, src[1]); - Store(dst + 0 * dstS + 2 * dstC, src[2]); - Store(dst + 0 * dstS + 3 * dstC, src[3]); - Store(dst + 1 * dstS + 0 * dstC, src[4]); - Store(dst + 1 * dstS + 1 * dstC, src[5]); - Store(dst + 1 * dstS + 2 * dstC, src[6]); - Store(dst + 1 * dstS + 3 * dstC, src[7]); - Store(dst + 2 * dstS + 0 * dstC, src[8]); - Store(dst + 2 * dstS + 1 * dstC, src[9]); - Store(dst + 2 * dstS + 2 * dstC, src[10]); - Store(dst + 2 * dstS + 3 * dstC, src[11]); - Store(dst + 3 * dstS + 0 * dstC, src[12]); - Store(dst + 3 * dstS + 1 * dstC, src[13]); - Store(dst + 3 * dstS + 2 * dstC, src[14]); - Store(dst + 3 * dstS + 3 * dstC, src[15]); - } - - SIMD_INLINE void WinogradKernel2x2Block4x4SetOutput4t(const float* src, size_t srcStride, float* dst, size_t dstW, size_t dstC) - { - size_t dstS = dstW * dstC, dstCF = AlignLo(dstC, F); - for (size_t d = 0; d < dstCF; d += F) - { - float32x4_t tmp[16]; - WinogradKernel2x2Block4x4SetOutputLoad25(src + d, srcStride, tmp); - WinogradKernel2x2Block4x4SetOutputStore16(tmp, dst + d, dstS, dstC); - } - if (dstCF < dstC) - { - float32x4_t tmp[16]; - WinogradKernel2x2Block4x4SetOutputLoad25(src + dstC - F, srcStride, tmp); - WinogradKernel2x2Block4x4SetOutputStore16(tmp, dst + dstC - F, dstS, dstC); - } - } - - SIMD_INLINE void WinogradKernel2x2Block4x4SetOutputStore16(const float32x4_t src[16], float* dst, size_t dstS, size_t dstC, size_t rowE, size_t colE) - { - for (size_t row = 0; row < rowE; ++row) - for (size_t col = 0; col < colE; ++col) - Store(dst + row * dstS + col * dstC, src[row * 4 + col]); - } - - SIMD_INLINE void WinogradKernel2x2Block4x4SetOutput4t(const float* src, size_t srcStride, float* dst, size_t dstW, size_t dstC, size_t rowE, size_t colE) - { - size_t dstS = dstW * dstC, dstCF = AlignLo(dstC, F); - for (size_t d = 0; d < dstCF; d += F) - { - 
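/* Border variant: rebuild the 4x4 spatial block from the 25 transformed coefficients, then store only the rowE x colE pixels that lie inside the destination image. */ -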
float32x4_t tmp[16]; - WinogradKernel2x2Block4x4SetOutputLoad25(src + d, srcStride, tmp); - WinogradKernel2x2Block4x4SetOutputStore16(tmp, dst + d, dstS, dstC, rowE, colE); - } - if (dstCF < dstC) - { - float32x4_t tmp[16]; - WinogradKernel2x2Block4x4SetOutputLoad25(src + dstC - F, srcStride, tmp); - WinogradKernel2x2Block4x4SetOutputStore16(tmp, dst + dstC - F, dstS, dstC, rowE, colE); - } - } - - void WinogradKernel2x2Block4x4SetOutput(const float* src, size_t srcStride, float* dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans) - { - if (trans ? (dstChannels < F) : true) - { - Base::WinogradKernel2x2Block4x4SetOutput(src, srcStride, dst, dstChannels, dstHeight, dstWidth, trans); - return; - } - size_t tileH = (dstHeight + 3) / 4; - size_t tileW = (dstWidth + 3) / 4; - size_t dstH4 = AlignLo(dstHeight, 4); - size_t dstW4 = AlignLo(dstWidth, 4); - size_t row, col; - for (row = 0; row < dstH4; row += 4) - { - for (col = 0; col < dstW4; col += 4) - WinogradKernel2x2Block4x4SetOutput4t(src, srcStride, dst + (row * dstWidth + col) * dstChannels, dstWidth, dstChannels), src += dstChannels; - if (col < dstWidth) - WinogradKernel2x2Block4x4SetOutput4t(src, srcStride, dst + (row * dstWidth + col) * dstChannels, dstWidth, dstChannels, 4, dstWidth - col), src += dstChannels; - } - if (row < dstHeight) - { - for (col = 0; col < dstW4; col += 4) - WinogradKernel2x2Block4x4SetOutput4t(src, srcStride, dst + (row * dstWidth + col) * dstChannels, dstWidth, dstChannels, dstHeight - row, 4), src += dstChannels; - if (col < dstWidth) - WinogradKernel2x2Block4x4SetOutput4t(src, srcStride, dst + (row * dstWidth + col) * dstChannels, dstWidth, dstChannels, dstHeight - row, dstWidth - col), src += dstChannels; - } - } - - //----------------------------------------------------------------------- - - SIMD_INLINE void WinogradKernel3x3Block2x2SetFilter(const float32x4_t src[9], float* dst, size_t stride) - { - const float32x4_t r2 = vdupq_n_f32(1.0f / 2.0f); - const float32x4_t r4 = vdupq_n_f32(1.0f / 4.0f); - - Store(dst + 0 * stride, src[0]); - float32x4_t _0a2 = vaddq_f32(src[0], src[2]); - Store(dst + 1 * stride, vmulq_f32(vaddq_f32(_0a2, src[1]), r2)); - Store(dst + 2 * stride, vmulq_f32(vsubq_f32(_0a2, src[1]), r2)); - Store(dst + 3 * stride, src[2]); - - float32x4_t _0a6a3 = vaddq_f32(vaddq_f32(src[0], src[6]), src[3]); - Store(dst + 4 * stride, vmulq_f32(_0a6a3, r2)); - float32x4_t _2a8a5 = vaddq_f32(vaddq_f32(src[2], src[8]), src[5]); - float32x4_t _1a7a4 = vaddq_f32(vaddq_f32(src[1], src[7]), src[4]); - Store(dst + 5 * stride, vmulq_f32(vaddq_f32(vaddq_f32(_0a6a3, _2a8a5), _1a7a4), r4)); - Store(dst + 6 * stride, vmulq_f32(vsubq_f32(vaddq_f32(_0a6a3, _2a8a5), _1a7a4), r4)); - Store(dst + 7 * stride, vmulq_f32(_2a8a5, r2)); - - float32x4_t _0a6s3 = vsubq_f32(vaddq_f32(src[0], src[6]), src[3]); - Store(dst + 8 * stride, vmulq_f32(_0a6s3, r2)); - float32x4_t _2a8s5 = vsubq_f32(vaddq_f32(src[2], src[8]), src[5]); - float32x4_t _1a7s4 = vsubq_f32(vaddq_f32(src[1], src[7]), src[4]); - Store(dst + 9 * stride, vmulq_f32(vaddq_f32(vaddq_f32(_0a6s3, _2a8s5), _1a7s4), r4)); - Store(dst + 10 * stride, vmulq_f32(vsubq_f32(vaddq_f32(_0a6s3, _2a8s5), _1a7s4), r4)); - Store(dst + 11 * stride, vmulq_f32(_2a8s5, r2)); - - Store(dst + 12 * stride, src[6]); - float32x4_t _6a8 = vaddq_f32(src[6], src[8]); - Store(dst + 13 * stride, vmulq_f32(vaddq_f32(_6a8, src[7]), r2)); - Store(dst + 14 * stride, vmulq_f32(vsubq_f32(_6a8, src[7]), r2)); - Store(dst + 15 * stride, src[8]); - } - - SIMD_INLINE void 
WinogradKernel3x3Block2x2SetFilter4n(const float * src, float * dst, size_t stride) - { - float32x4_t _src[9]; - Load4(src + 0, 9, _src + 0); - Load4(src + 4, 9, _src + 4); - _src[8] = SetF32(src[8], src[17], src[26], src[35]); - WinogradKernel3x3Block2x2SetFilter(_src, dst, stride); - } - - SIMD_INLINE void WinogradKernel3x3Block2x2SetFilter4t(const float * src, float * dst, size_t stride) - { - float32x4_t _src[9]; - _src[0] = Load(src + 0 * stride); - _src[1] = Load(src + 1 * stride); - _src[2] = Load(src + 2 * stride); - _src[3] = Load(src + 3 * stride); - _src[4] = Load(src + 4 * stride); - _src[5] = Load(src + 5 * stride); - _src[6] = Load(src + 6 * stride); - _src[7] = Load(src + 7 * stride); - _src[8] = Load(src + 8 * stride); - WinogradKernel3x3Block2x2SetFilter(_src, dst, stride); - } - - void WinogradKernel3x3Block2x2SetFilter(const float * src, size_t size, float * dst, SimdBool trans) - { - size_t size4 = AlignLo(size, 4), i = 0; - if (trans) - { - for (; i < size4; i += 4) - WinogradKernel3x3Block2x2SetFilter4t(src + i, dst + i, size); - for (; i < size; i += 1) - Base::WinogradKernel3x3Block2x2SetFilter1t(src + i, dst + i, size); - } - else - { - for (; i < size4; i += 4, src += 36, dst += 4) - WinogradKernel3x3Block2x2SetFilter4n(src, dst, size); - for (; i < size; i += 1, src += 9, dst += 1) - Base::WinogradKernel3x3Block2x2SetFilter1n(src, dst, size); - } - } - - //----------------------------------------------------------------------- - - SIMD_INLINE void WinogradKernel3x3Block2x2SetInputLoad4n(const float * src, float32x4_t * dst) - { - *(float32x4x2_t*)(dst + 0) = Load2(src + 0); - *(float32x4x2_t*)(dst + 2) = Load2(src + 2); - } - - SIMD_INLINE void WinogradKernel3x3Block2x2SetInputLoad4n(const float * src, float32x4_t * dst, PadType pad) - { - float32x4_t a0 = (pad == PadNose1 ? LoadPadZeroNose1(src + 0) : Load(src + 0)); - float32x4_t a1 = Load(src + 2); - float32x4_t a2 = Load(src + 4); - float32x4_t a3 = (pad == PadTail2 ? LoadPadZeroTail2(src + 6) : (pad == PadTail1 ? 
LoadPadZeroTail1(src + 6) : Load(src + 6))); - *(float32x4x2_t*)(dst + 0) = vuzpq_f32(a0, a2); - *(float32x4x2_t*)(dst + 2) = vuzpq_f32(a1, a3); - } - - SIMD_INLINE void WinogradKernel3x3Block2x2SetInputLoad4z(float32x4_t * dst) - { - dst[0] = vdupq_n_f32(0.0f); - dst[1] = vdupq_n_f32(0.0f); - dst[2] = vdupq_n_f32(0.0f); - dst[3] = vdupq_n_f32(0.0f); - } - - SIMD_INLINE void WinogradKernel3x3Block2x2SetInput4Store(const float32x4_t * src, float * dst, size_t stride) - { - Store(dst + 0 * stride, vsubq_f32(vsubq_f32(src[0], src[8]), vsubq_f32(src[2], src[10]))); - Store(dst + 1 * stride, vaddq_f32(vsubq_f32(src[1], src[9]), vsubq_f32(src[2], src[10]))); - Store(dst + 2 * stride, vsubq_f32(vsubq_f32(src[2], src[10]), vsubq_f32(src[1], src[9]))); - Store(dst + 3 * stride, vsubq_f32(vsubq_f32(src[1], src[9]), vsubq_f32(src[3], src[11]))); - Store(dst + 4 * stride, vsubq_f32(vaddq_f32(src[4], src[8]), vaddq_f32(src[6], src[10]))); - Store(dst + 5 * stride, vaddq_f32(vaddq_f32(src[5], src[9]), vaddq_f32(src[6], src[10]))); - Store(dst + 6 * stride, vsubq_f32(vaddq_f32(src[6], src[10]), vaddq_f32(src[5], src[9]))); - Store(dst + 7 * stride, vsubq_f32(vaddq_f32(src[5], src[9]), vaddq_f32(src[7], src[11]))); - Store(dst + 8 * stride, vsubq_f32(vsubq_f32(src[8], src[4]), vsubq_f32(src[10], src[6]))); - Store(dst + 9 * stride, vaddq_f32(vsubq_f32(src[9], src[5]), vsubq_f32(src[10], src[6]))); - Store(dst + 10 * stride, vsubq_f32(vsubq_f32(src[10], src[6]), vsubq_f32(src[9], src[5]))); - Store(dst + 11 * stride, vsubq_f32(vsubq_f32(src[9], src[5]), vsubq_f32(src[11], src[7]))); - Store(dst + 12 * stride, vsubq_f32(vsubq_f32(src[4], src[12]), vsubq_f32(src[6], src[14]))); - Store(dst + 13 * stride, vaddq_f32(vsubq_f32(src[5], src[13]), vsubq_f32(src[6], src[14]))); - Store(dst + 14 * stride, vsubq_f32(vsubq_f32(src[6], src[14]), vsubq_f32(src[5], src[13]))); - Store(dst + 15 * stride, vsubq_f32(vsubq_f32(src[5], src[13]), vsubq_f32(src[7], src[15]))); - } - - SIMD_INLINE void WinogradKernel3x3Block2x2SetInput4n(const float * src, size_t srcStride, float * dst, size_t dstStride) - { - float32x4_t t[16]; - WinogradKernel3x3Block2x2SetInputLoad4n(src + 0 * srcStride, t + 0); - WinogradKernel3x3Block2x2SetInputLoad4n(src + 1 * srcStride, t + 4); - WinogradKernel3x3Block2x2SetInputLoad4n(src + 2 * srcStride, t + 8); - WinogradKernel3x3Block2x2SetInputLoad4n(src + 3 * srcStride, t + 12); - WinogradKernel3x3Block2x2SetInput4Store(t, dst, dstStride); - } - - SIMD_INLINE void WinogradKernel3x3Block2x2SetInput4n(const float * src, size_t srcStride, PadType rowPad, PadType colPad, float * dst, size_t dstStride) - { - float32x4_t t[16]; - if (rowPad == PadNose1) - WinogradKernel3x3Block2x2SetInputLoad4z(t + 0); - else - WinogradKernel3x3Block2x2SetInputLoad4n(src + 0 * srcStride, t + 0, colPad); - WinogradKernel3x3Block2x2SetInputLoad4n(src + 1 * srcStride, t + 4, colPad); - if (rowPad == PadTail2) - WinogradKernel3x3Block2x2SetInputLoad4z(t + 8); - else - WinogradKernel3x3Block2x2SetInputLoad4n(src + 2 * srcStride, t + 8, colPad); - if (rowPad >= PadTail1) - WinogradKernel3x3Block2x2SetInputLoad4z(t + 12); - else - WinogradKernel3x3Block2x2SetInputLoad4n(src + 3 * srcStride, t + 12, colPad); - WinogradKernel3x3Block2x2SetInput4Store(t, dst, dstStride); - } - - SIMD_INLINE void WinogradKernel3x3Block2x2SetInput4t(const float * src, size_t srcS, size_t srcC, float32x4_t dst[16]) - { - dst[0] = Load(src + 0 * srcS + 0 * srcC); - dst[1] = Load(src + 0 * srcS + 1 * srcC); - dst[2] = Load(src + 0 * srcS + 2 * srcC); - 
dst[3] = Load(src + 0 * srcS + 3 * srcC); - dst[4] = Load(src + 1 * srcS + 0 * srcC); - dst[5] = Load(src + 1 * srcS + 1 * srcC); - dst[6] = Load(src + 1 * srcS + 2 * srcC); - dst[7] = Load(src + 1 * srcS + 3 * srcC); - dst[8] = Load(src + 2 * srcS + 0 * srcC); - dst[9] = Load(src + 2 * srcS + 1 * srcC); - dst[10] = Load(src + 2 * srcS + 2 * srcC); - dst[11] = Load(src + 2 * srcS + 3 * srcC); - dst[12] = Load(src + 3 * srcS + 0 * srcC); - dst[13] = Load(src + 3 * srcS + 1 * srcC); - dst[14] = Load(src + 3 * srcS + 2 * srcC); - dst[15] = Load(src + 3 * srcS + 3 * srcC); - } - - SIMD_INLINE void WinogradKernel3x3Block2x2SetInput4t(const float * src, size_t srcW, size_t srcC, float * dst, size_t dstStride) - { - size_t srcS = srcW * srcC; - size_t srcCF = AlignLo(srcC, F); - for (size_t c = 0; c < srcCF; c += F) - { - float32x4_t tmp[16]; - WinogradKernel3x3Block2x2SetInput4t(src + c, srcS, srcC, tmp); - WinogradKernel3x3Block2x2SetInput4Store(tmp, dst + c, dstStride); - } - if (srcCF < srcC) - { - float32x4_t tmp[16]; - WinogradKernel3x3Block2x2SetInput4t(src + srcC - F, srcS, srcC, tmp); - WinogradKernel3x3Block2x2SetInput4Store(tmp, dst + srcC - F, dstStride); - } - } - - SIMD_INLINE void WinogradKernel3x3Block2x2SetInput4t(const float * src, size_t srcS, size_t srcC, size_t rowB, size_t rowE, size_t colB, size_t colE, float32x4_t dst[16]) - { - for (size_t i = 0; i < 16; ++i) - dst[i] = vdupq_n_f32(0.0f); - for (size_t row = rowB; row < rowE; ++row) - for (size_t col = colB; col < colE; ++col) - dst[row * 4 + col] = Load(src + row * srcS + col * srcC); - } - - SIMD_INLINE void WinogradKernel3x3Block2x2SetInput4t(const float * src, size_t srcW, size_t srcC, size_t rowB, size_t rowE, size_t colB, size_t colE, float * dst, size_t dstStride) - { - size_t srcS = srcW * srcC; - size_t srcCF = AlignLo(srcC, F); - for (size_t c = 0; c < srcCF; c += F) - { - float32x4_t tmp[16]; - WinogradKernel3x3Block2x2SetInput4t(src + c, srcS, srcC, rowB, rowE, colB, colE, tmp); - WinogradKernel3x3Block2x2SetInput4Store(tmp, dst + c, dstStride); - } - if (srcCF < srcC) - { - float32x4_t tmp[16]; - WinogradKernel3x3Block2x2SetInput4t(src + srcC - F, srcS, srcC, rowB, rowE, colB, colE, tmp); - WinogradKernel3x3Block2x2SetInput4Store(tmp, dst + srcC - F, dstStride); - } - } - - void WinogradKernel3x3Block2x2SetInput(const float* src, size_t srcChannels, size_t srcHeight, size_t srcWidth, - size_t padY, size_t padX, size_t padH, size_t padW, float* dst, size_t dstStride, SimdBool trans) - { - assert(padY == padX && padY == padH && padY == padW && (padY == 0 || padY == 1)); - SimdBool pad = padY > 0 ? SimdTrue : SimdFalse; - if (trans ? (srcChannels < 4) : (srcHeight < 4 || srcWidth < 10)) - { - Base::WinogradKernel3x3Block2x2SetInput(src, srcChannels, srcHeight, srcWidth, padY, padX, padH, padW, dst, dstStride, trans); - return; - } - size_t dstH = pad ? srcHeight : srcHeight - 2; - size_t dstW = pad ? srcWidth : srcWidth - 2; - size_t tileH = (dstH + 1) / 2; - size_t tileW = (dstW + 1) / 2; - size_t dstH2 = AlignLo(dstH, 2); - size_t dstW2 = AlignLo(dstW, 2); - if (trans) - { - size_t noseW = Simd::Min<size_t>(4, dstW + 1); - size_t noseH = Simd::Min<size_t>(4, dstH + 1); - size_t start = pad ? 2 : 0; - if (pad) - { - if (dstH == dstH2) - dstH2 -= 2; - if (dstW == dstW2) - dstW2 -= 2; - src -= (srcWidth + 1)*srcChannels; - } - size_t tailW = dstW - dstW2 + (pad ? 1 : 2); - size_t tailH = dstH - dstH2 + (pad ? 
1 : 2); - size_t row = 0, col = 0; - if (pad) - { - if (pad) - WinogradKernel3x3Block2x2SetInput4t(src, srcWidth, srcChannels, 1, noseH, 1, noseW, dst, dstStride), dst += srcChannels; - for (col = start; col < dstW2; col += 2) - WinogradKernel3x3Block2x2SetInput4t(src + col * srcChannels, srcWidth, srcChannels, 1, noseH, 0, 4, dst, dstStride), dst += srcChannels; - if (col < dstW) - WinogradKernel3x3Block2x2SetInput4t(src + col * srcChannels, srcWidth, srcChannels, 1, noseH, 0, tailW, dst, dstStride), dst += srcChannels; - } - for (row = start; row < dstH2; row += 2) - { - if (pad) - WinogradKernel3x3Block2x2SetInput4t(src + row * srcWidth * srcChannels, srcWidth, srcChannels, 0, 4, 1, noseW, dst, dstStride), dst += srcChannels; - for (col = start; col < dstW2; col += 2) - WinogradKernel3x3Block2x2SetInput4t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, dst, dstStride), dst += srcChannels; - if (col < dstW) - WinogradKernel3x3Block2x2SetInput4t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, 0, 4, 0, tailW, dst, dstStride), dst += srcChannels; - } - if (row < dstH) - { - if (pad) - WinogradKernel3x3Block2x2SetInput4t(src + row * srcWidth* srcChannels, srcWidth, srcChannels, 0, tailH, 1, noseW, dst, dstStride), dst += srcChannels; - for (col = start; col < dstW2; col += 2) - WinogradKernel3x3Block2x2SetInput4t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, 0, tailH, 0, 4, dst, dstStride), dst += srcChannels; - if (col < dstW) - WinogradKernel3x3Block2x2SetInput4t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, 0, tailH, 0, tailW, dst, dstStride), dst += srcChannels; - } - } - else - { - size_t dstW8 = AlignLo(dstW, 8); - if (pad && dstW8 == dstW) - dstW8 -= 8; - PadType rowPad = dstH2 < dstH ? PadTail1 : PadNone; - PadType colPad = dstW2 < dstW ? PadTail1 : PadNone; - size_t tailCol = dstW2 < dstW ? dstW - 7 : dstW - 8; - size_t tailRow = dstH2 < dstH ? dstH - 1 : dstH - 2; - bool specialColTail = dstW8 < dstW || pad; - bool specialRowTail = dstH2 < dstH || pad; - if (pad) - { - src -= srcWidth + 1; - rowPad = dstH2 < dstH ? PadTail2 : PadTail1; - colPad = dstW2 < dstW ? 
PadTail2 : PadTail1; - if (dstH2 == dstH) - dstH2 -= 2; - } - for (size_t c = 0; c < srcChannels; ++c) - { - size_t row = 0, tileY = 0; - if (pad) - { - size_t col = 0, tileX = 0; - const float * s = src + row * srcWidth; - float * d = dst + tileY * tileW; - if (pad) - WinogradKernel3x3Block2x2SetInput4n(s + col, srcWidth, PadNose1, PadNose1, d + tileX, dstStride), col += 8, tileX += 4; - for (; col < dstW8; col += 8, tileX += 4) - WinogradKernel3x3Block2x2SetInput4n(s + col, srcWidth, PadNose1, PadNone, d + tileX, dstStride); - if (specialColTail) - WinogradKernel3x3Block2x2SetInput4n(s + tailCol, srcWidth, PadNose1, colPad, d + tileW - 4, dstStride); - row += 2, tileY += 1; - } - for (; row < dstH2; row += 2, tileY += 1) - { - size_t col = 0, tileX = 0; - const float * s = src + row * srcWidth; - float * d = dst + tileY * tileW; - if (pad) - WinogradKernel3x3Block2x2SetInput4n(s + col, srcWidth, PadNone, PadNose1, d + tileX, dstStride), col += 8, tileX += 4; - for (; col < dstW8; col += 8, tileX += 4) - WinogradKernel3x3Block2x2SetInput4n(s + col, srcWidth, d + tileX, dstStride); - if (specialColTail) - WinogradKernel3x3Block2x2SetInput4n(s + tailCol, srcWidth, PadNone, colPad, d + tileW - 4, dstStride); - } - if (specialRowTail) - { - size_t col = 0, tileX = 0; - const float * s = src + tailRow * srcWidth; - float * d = dst + (tileH - 1) * tileW; - if (pad) - WinogradKernel3x3Block2x2SetInput4n(s + col, srcWidth, rowPad, PadNose1, d + tileX, dstStride), col += 8, tileX += 4; - for (; col < dstW8; col += 8, tileX += 4) - WinogradKernel3x3Block2x2SetInput4n(s + col, srcWidth, rowPad, PadNone, d + tileX, dstStride); - if (specialColTail) - WinogradKernel3x3Block2x2SetInput4n(s + tailCol, srcWidth, rowPad, colPad, d + tileW - 4, dstStride); - } - src += srcWidth * srcHeight; - dst += tileW * tileH; - } - } - } - - //----------------------------------------------------------------------- - - SIMD_INLINE void WinogradKernel3x3Block2x2SetOutputLoad4(const float * src, size_t stride, float32x4_t * dst) - { - float32x4_t s0 = Load(src + 0 * stride); - float32x4_t s1 = Load(src + 1 * stride); - float32x4_t s2 = Load(src + 2 * stride); - float32x4_t s3 = Load(src + 3 * stride); - dst[0] = vaddq_f32(vaddq_f32(s0, s1), s2); - dst[1] = vsubq_f32(vsubq_f32(s1, s2), s3); - } - - SIMD_INLINE void WinogradKernel3x3Block2x2SetOutputLoad16(const float * src, size_t stride, float32x4_t * dst) - { - float32x4_t tmp[8]; - WinogradKernel3x3Block2x2SetOutputLoad4(src + 0 * stride, stride, tmp + 0); - WinogradKernel3x3Block2x2SetOutputLoad4(src + 4 * stride, stride, tmp + 2); - WinogradKernel3x3Block2x2SetOutputLoad4(src + 8 * stride, stride, tmp + 4); - WinogradKernel3x3Block2x2SetOutputLoad4(src + 12 * stride, stride, tmp + 6); - dst[0] = vaddq_f32(vaddq_f32(tmp[0], tmp[2]), tmp[4]); - dst[1] = vaddq_f32(vaddq_f32(tmp[1], tmp[3]), tmp[5]); - dst[2] = vsubq_f32(vsubq_f32(tmp[2], tmp[4]), tmp[6]); - dst[3] = vsubq_f32(vsubq_f32(tmp[3], tmp[5]), tmp[7]); - } - - SIMD_INLINE void WinogradKernel3x3Block2x2SetOutput4n(const float * src, size_t srcStride, float * dst, size_t dstStride) - { - float32x4x2_t tmp[2]; - WinogradKernel3x3Block2x2SetOutputLoad16(src, srcStride, (float32x4_t*)tmp); - Store2(dst + 0 * dstStride, tmp[0]); - Store2(dst + 1 * dstStride, tmp[1]); - } - - SIMD_INLINE void WinogradKernel3x3Block2x2SetOutput4n(const float * src, size_t srcStride, float * dst, size_t dstStride, bool lastRow, bool lastCol, const uint32x4_t & mask) - { - float32x4_t tmp[4]; - 
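/* NCHW border case: the inverse transform yields a 2x2 output per tile; vzipq_f32 below re-interleaves four neighbouring tiles into consecutive image pixels, while lastRow, lastCol and mask clip the second output row and the trailing columns at the image border. */ -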
WinogradKernel3x3Block2x2SetOutputLoad16(src, srcStride, tmp); - float32x4x2_t zip0 = vzipq_f32(tmp[0], tmp[1]); - Store(dst + 0, zip0.val[0]); - if (lastCol) - Store(dst + 4, zip0.val[1]); - else - StoreMasked(dst + 4, zip0.val[1], mask); - if (lastRow) - { - float32x4x2_t zip1 = vzipq_f32(tmp[2], tmp[3]); - dst += dstStride; - Store(dst + 0, zip1.val[0]); - if (lastCol) - Store(dst + 4, zip1.val[1]); - else - StoreMasked(dst + 4, zip1.val[1], mask); - } - } - - SIMD_INLINE void WinogradKernel3x3Block2x2SetOutputStore4(const float32x4_t src[4], float * dst, size_t dstS, size_t dstC) - { - Store(dst + 0 * dstS + 0 * dstC, src[0]); - Store(dst + 0 * dstS + 1 * dstC, src[1]); - Store(dst + 1 * dstS + 0 * dstC, src[2]); - Store(dst + 1 * dstS + 1 * dstC, src[3]); - } - - SIMD_INLINE void WinogradKernel3x3Block2x2SetOutput4t(const float * src, size_t srcStride, float * dst, size_t dstW, size_t dstC) - { - size_t dstS = dstW * dstC, dstCF = AlignLo(dstC, F); - for (size_t d = 0; d < dstCF; d += F) - { - float32x4_t tmp[4]; - WinogradKernel3x3Block2x2SetOutputLoad16(src + d, srcStride, tmp); - WinogradKernel3x3Block2x2SetOutputStore4(tmp, dst + d, dstS, dstC); - } - if (dstCF < dstC) - { - float32x4_t tmp[4]; - WinogradKernel3x3Block2x2SetOutputLoad16(src + dstC - F, srcStride, tmp); - WinogradKernel3x3Block2x2SetOutputStore4(tmp, dst + dstC - F, dstS, dstC); - } - } - - SIMD_INLINE void WinogradKernel3x3Block2x2SetOutputStore4(const float32x4_t src[4], float * dst, size_t dstS, size_t dstC, size_t rowE, size_t colE) - { - for (size_t row = 0; row < rowE; ++row) - for (size_t col = 0; col < colE; ++col) - Store(dst + row * dstS + col * dstC, src[row * 2 + col]); - } - - SIMD_INLINE void WinogradKernel3x3Block2x2SetOutput4t(const float * src, size_t srcStride, float * dst, size_t dstW, size_t dstC, size_t rowE, size_t colE) - { - size_t dstS = dstW * dstC, dstCF = AlignLo(dstC, F); - for (size_t d = 0; d < dstCF; d += F) - { - float32x4_t tmp[4]; - WinogradKernel3x3Block2x2SetOutputLoad16(src + d, srcStride, tmp); - WinogradKernel3x3Block2x2SetOutputStore4(tmp, dst + d, dstS, dstC, rowE, colE); - } - if (dstCF < dstC) - { - float32x4_t tmp[4]; - WinogradKernel3x3Block2x2SetOutputLoad16(src + dstC - F, srcStride, tmp); - WinogradKernel3x3Block2x2SetOutputStore4(tmp, dst + dstC - F, dstS, dstC, rowE, colE); - } - } - - void WinogradKernel3x3Block2x2SetOutput(const float * src, size_t srcStride, float * dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans) - { - if (trans ? 
(dstChannels < 4) : (dstHeight < 2 || dstWidth < 8)) - { - Base::WinogradKernel3x3Block2x2SetOutput(src, srcStride, dst, dstChannels, dstHeight, dstWidth, trans); - return; - } - size_t tileH = (dstHeight + 1) / 2; - size_t tileW = (dstWidth + 1) / 2; - size_t dstH2 = AlignLo(dstHeight, 2); - size_t dstW2 = AlignLo(dstWidth, 2); - if (trans) - { - size_t row, col; - for (row = 0; row < dstH2; row += 2) - { - for (col = 0; col < dstW2; col += 2) - WinogradKernel3x3Block2x2SetOutput4t(src, srcStride, dst + (row * dstWidth + col)*dstChannels, dstWidth, dstChannels), src += dstChannels; - if (col < dstWidth) - WinogradKernel3x3Block2x2SetOutput4t(src, srcStride, dst + (row * dstWidth + col)*dstChannels, dstWidth, dstChannels, 2, dstWidth - col), src += dstChannels; - } - if (row < dstHeight) - { - for (col = 0; col < dstW2; col += 2) - WinogradKernel3x3Block2x2SetOutput4t(src, srcStride, dst + (row * dstWidth + col)*dstChannels, dstWidth, dstChannels, dstHeight - row, 2), src += dstChannels; - if (col < dstWidth) - WinogradKernel3x3Block2x2SetOutput4t(src, srcStride, dst + (row * dstWidth + col)*dstChannels, dstWidth, dstChannels, dstHeight - row, dstWidth - col), src += dstChannels; - } - } - else - { - size_t dstW8 = AlignLo(dstWidth, 8); - uint32x4_t tailMask = vreinterpretq_u32_f32(LeftNotZero32f(4 + dstW2 - dstWidth)); - size_t tailCol = dstW2 < dstWidth ? dstWidth - 7 : dstWidth - 8; - for (size_t c = 0; c < dstChannels; ++c) - { - size_t row = 0, tileY = 0; - for (; row < dstH2; row += 2, tileY += 1) - { - size_t col = 0, tileX = 0; - const float * s = src + tileY * tileW; - float * d = dst + row * dstWidth; - for (; col < dstW8; col += 8, tileX += 4) - WinogradKernel3x3Block2x2SetOutput4n(s + tileX, srcStride, d + col, dstWidth); - if (col < dstWidth) - WinogradKernel3x3Block2x2SetOutput4n(s + tileW - 4, srcStride, d + tailCol, dstWidth, true, false, tailMask); - } - if (row < dstHeight) - { - size_t col = 0, tileX = 0; - const float * s = src + (tileH - 1) * tileW; - float * d = dst + (dstHeight - 1) * dstWidth; - for (; col < dstW8; col += 8, tileX += 4) - WinogradKernel3x3Block2x2SetOutput4n(s + tileX, srcStride, d + col, dstWidth, false, true, tailMask); - if (col < dstWidth) - WinogradKernel3x3Block2x2SetOutput4n(s + tileW - 4, srcStride, d + tailCol, dstWidth, false, false, tailMask); - } - src += tileW * tileH; - dst += dstHeight * dstWidth; - } - } - } - - //----------------------------------------------------------------------- - - SIMD_INLINE void WinogradKernel3x3Block3x3SetFilter4Row(const float32x4_t * t, float * dst, size_t stride) - { - const float32x4_t r6 = vdupq_n_f32(1.0f / 6.0f); - const float32x4_t r3 = vdupq_n_f32(1.0f / 3.0f); - const float32x4_t r2 = vdupq_n_f32(1.0f / 2.0f); - const float32x4_t f2_3 = vdupq_n_f32(2.0f / 3.0f); - const float32x4_t mr2 = vdupq_n_f32(-1.0f / 2.0f); - - Store(dst + 0 * stride, vmulq_f32(r2, t[0])); - float32x4_t t0 = vaddq_f32(t[0], t[2]); - Store(dst + 1 * stride, vmulq_f32(mr2, vaddq_f32(t0, t[1]))); - Store(dst + 2 * stride, vmulq_f32(r6, vsubq_f32(t[1], t0))); - Store(dst + 3 * stride, vmlaq_f32(vmlaq_f32(vmulq_f32(r3, t[1]), f2_3, t[2]), r6, t[0])); - Store(dst + 4 * stride, t[2]); - } - - SIMD_INLINE void WinogradKernel3x3Block3x3SetFilter4All(const float32x4_t * s, float * dst, size_t stride) - { - const float32x4_t r6 = vdupq_n_f32(1.0f / 6.0f); - const float32x4_t r3 = vdupq_n_f32(1.0f / 3.0f); - const float32x4_t r2 = vdupq_n_f32(1.0f / 2.0f); - const float32x4_t f2_3 = vdupq_n_f32(2.0f / 3.0f); - const float32x4_t mr2 = 
vdupq_n_f32(-1.0f / 2.0f); - - float32x4_t t[3]; - t[0] = vmulq_f32(r2, s[0]); - t[1] = vmulq_f32(r2, s[1]); - t[2] = vmulq_f32(r2, s[2]); - WinogradKernel3x3Block3x3SetFilter4Row(t, dst + 0 * stride, stride); - - t[0] = vmulq_f32(mr2, vaddq_f32(vaddq_f32(s[0], s[6]), s[3])); - t[1] = vmulq_f32(mr2, vaddq_f32(vaddq_f32(s[1], s[7]), s[4])); - t[2] = vmulq_f32(mr2, vaddq_f32(vaddq_f32(s[2], s[8]), s[5])); - WinogradKernel3x3Block3x3SetFilter4Row(t, dst + 5 * stride, stride); - - t[0] = vmulq_f32(r6, vsubq_f32(s[3], vaddq_f32(s[0], s[6]))); - t[1] = vmulq_f32(r6, vsubq_f32(s[4], vaddq_f32(s[1], s[7]))); - t[2] = vmulq_f32(r6, vsubq_f32(s[5], vaddq_f32(s[2], s[8]))); - WinogradKernel3x3Block3x3SetFilter4Row(t, dst + 10 * stride, stride); - - t[0] = vmlaq_f32(vmlaq_f32(vmulq_f32(r3, s[3]), f2_3, s[6]), r6, s[0]); - t[1] = vmlaq_f32(vmlaq_f32(vmulq_f32(r3, s[4]), f2_3, s[7]), r6, s[1]); - t[2] = vmlaq_f32(vmlaq_f32(vmulq_f32(r3, s[5]), f2_3, s[8]), r6, s[2]); - WinogradKernel3x3Block3x3SetFilter4Row(t, dst + 15 * stride, stride); - - WinogradKernel3x3Block3x3SetFilter4Row(s + 6, dst + 20 * stride, stride); - } - - SIMD_INLINE void WinogradKernel3x3Block3x3SetFilter4n(const float * src, float * dst, size_t stride) - { - float32x4_t s[9]; - Load4(src + 0, 9, s + 0); - Load4(src + 4, 9, s + 4); - s[8] = SetF32(src[8], src[17], src[26], src[35]); - WinogradKernel3x3Block3x3SetFilter4All(s, dst + 0 * stride, stride); - } - - SIMD_INLINE void WinogradKernel3x3Block3x3SetFilter4t(const float * src, float * dst, size_t stride) - { - float32x4_t s[9]; - s[0] = Load(src + 0 * stride); - s[1] = Load(src + 1 * stride); - s[2] = Load(src + 2 * stride); - s[3] = Load(src + 3 * stride); - s[4] = Load(src + 4 * stride); - s[5] = Load(src + 5 * stride); - s[6] = Load(src + 6 * stride); - s[7] = Load(src + 7 * stride); - s[8] = Load(src + 8 * stride); - WinogradKernel3x3Block3x3SetFilter4All(s, dst + 0 * stride, stride); - } - - void WinogradKernel3x3Block3x3SetFilter(const float * src, size_t size, float * dst, SimdBool trans) - { - size_t size4 = AlignLo(size, 4), i = 0; - if (trans) - { - for (; i < size4; i += 4) - WinogradKernel3x3Block3x3SetFilter4t(src + i, dst + i, size); - for (; i < size; i += 1) - Base::WinogradKernel3x3Block3x3SetFilter1t(src + i, dst + i, size); - } - else - { - for (; i < size4; i += 4, src += 36, dst += 4) - WinogradKernel3x3Block3x3SetFilter4n(src, dst, size); - for (; i < size; i += 1, src += 9, dst += 1) - Base::WinogradKernel3x3Block3x3SetFilter1n(src, dst, size); - } - } - - //----------------------------------------------------------------------- - - SIMD_INLINE void WinogradKernel3x3Block3x3SetInput4Store(const float32x4_t src[25], float * dst, size_t stride) - { - float32x4_t _2 = vdupq_n_f32(2.0f); - float32x4_t _3 = vdupq_n_f32(3.0f); - float32x4_t tmp[5]; - - tmp[0] = vaddq_f32(vmulq_f32(_2, vsubq_f32(src[0], src[10])), vsubq_f32(src[15], src[5])); - tmp[1] = vaddq_f32(vmulq_f32(_2, vsubq_f32(src[1], src[11])), vsubq_f32(src[16], src[6])); - tmp[2] = vaddq_f32(vmulq_f32(_2, vsubq_f32(src[2], src[12])), vsubq_f32(src[17], src[7])); - tmp[3] = vaddq_f32(vmulq_f32(_2, vsubq_f32(src[3], src[13])), vsubq_f32(src[18], src[8])); - tmp[4] = vaddq_f32(vmulq_f32(_2, vsubq_f32(src[4], src[14])), vsubq_f32(src[19], src[9])); - Store(dst + 0 * stride, vaddq_f32(vmulq_f32(_2, vsubq_f32(tmp[0], tmp[2])), vsubq_f32(tmp[3], tmp[1]))); - Store(dst + 1 * stride, vsubq_f32(vsubq_f32(tmp[3], tmp[2]), vmulq_f32(_2, tmp[1]))); - Store(dst + 2 * stride, vaddq_f32(vmulq_f32(_2, tmp[1]), 
vsubq_f32(tmp[3], vmulq_f32(_3, tmp[2])))); - Store(dst + 3 * stride, vsubq_f32(tmp[3], tmp[1])); - Store(dst + 4 * stride, vaddq_f32(vmulq_f32(_2, vsubq_f32(tmp[1], tmp[3])), vsubq_f32(tmp[4], tmp[2]))); - - tmp[0] = vsubq_f32(vsubq_f32(src[15], src[10]), vmulq_f32(_2, src[5])); - tmp[1] = vsubq_f32(vsubq_f32(src[16], src[11]), vmulq_f32(_2, src[6])); - tmp[2] = vsubq_f32(vsubq_f32(src[17], src[12]), vmulq_f32(_2, src[7])); - tmp[3] = vsubq_f32(vsubq_f32(src[18], src[13]), vmulq_f32(_2, src[8])); - tmp[4] = vsubq_f32(vsubq_f32(src[19], src[14]), vmulq_f32(_2, src[9])); - Store(dst + 5 * stride, vaddq_f32(vmulq_f32(_2, vsubq_f32(tmp[0], tmp[2])), vsubq_f32(tmp[3], tmp[1]))); - Store(dst + 6 * stride, vsubq_f32(vsubq_f32(tmp[3], tmp[2]), vmulq_f32(_2, tmp[1]))); - Store(dst + 7 * stride, vaddq_f32(vmulq_f32(_2, tmp[1]), vsubq_f32(tmp[3], vmulq_f32(_3, tmp[2])))); - Store(dst + 8 * stride, vsubq_f32(tmp[3], tmp[1])); - Store(dst + 9 * stride, vaddq_f32(vmulq_f32(_2, vsubq_f32(tmp[1], tmp[3])), vsubq_f32(tmp[4], tmp[2]))); - - tmp[0] = vaddq_f32(vmulq_f32(_2, src[5]), vsubq_f32(src[15], vmulq_f32(_3, src[10]))); - tmp[1] = vaddq_f32(vmulq_f32(_2, src[6]), vsubq_f32(src[16], vmulq_f32(_3, src[11]))); - tmp[2] = vaddq_f32(vmulq_f32(_2, src[7]), vsubq_f32(src[17], vmulq_f32(_3, src[12]))); - tmp[3] = vaddq_f32(vmulq_f32(_2, src[8]), vsubq_f32(src[18], vmulq_f32(_3, src[13]))); - tmp[4] = vaddq_f32(vmulq_f32(_2, src[9]), vsubq_f32(src[19], vmulq_f32(_3, src[14]))); - Store(dst + 10 * stride, vaddq_f32(vmulq_f32(_2, vsubq_f32(tmp[0], tmp[2])), vsubq_f32(tmp[3], tmp[1]))); - Store(dst + 11 * stride, vsubq_f32(vsubq_f32(tmp[3], tmp[2]), vmulq_f32(_2, tmp[1]))); - Store(dst + 12 * stride, vaddq_f32(vmulq_f32(_2, tmp[1]), vsubq_f32(tmp[3], vmulq_f32(_3, tmp[2])))); - Store(dst + 13 * stride, vsubq_f32(tmp[3], tmp[1])); - Store(dst + 14 * stride, vaddq_f32(vmulq_f32(_2, vsubq_f32(tmp[1], tmp[3])), vsubq_f32(tmp[4], tmp[2]))); - - tmp[0] = vsubq_f32(src[15], src[5]); - tmp[1] = vsubq_f32(src[16], src[6]); - tmp[2] = vsubq_f32(src[17], src[7]); - tmp[3] = vsubq_f32(src[18], src[8]); - tmp[4] = vsubq_f32(src[19], src[9]); - Store(dst + 15 * stride, vaddq_f32(vmulq_f32(_2, vsubq_f32(tmp[0], tmp[2])), vsubq_f32(tmp[3], tmp[1]))); - Store(dst + 16 * stride, vsubq_f32(vsubq_f32(tmp[3], tmp[2]), vmulq_f32(_2, tmp[1]))); - Store(dst + 17 * stride, vaddq_f32(vmulq_f32(_2, tmp[1]), vsubq_f32(tmp[3], vmulq_f32(_3, tmp[2])))); - Store(dst + 18 * stride, vsubq_f32(tmp[3], tmp[1])); - Store(dst + 19 * stride, vaddq_f32(vmulq_f32(_2, vsubq_f32(tmp[1], tmp[3])), vsubq_f32(tmp[4], tmp[2]))); - - tmp[0] = vaddq_f32(vmulq_f32(_2, vsubq_f32(src[5], src[15])), vsubq_f32(src[20], src[10])); - tmp[1] = vaddq_f32(vmulq_f32(_2, vsubq_f32(src[6], src[16])), vsubq_f32(src[21], src[11])); - tmp[2] = vaddq_f32(vmulq_f32(_2, vsubq_f32(src[7], src[17])), vsubq_f32(src[22], src[12])); - tmp[3] = vaddq_f32(vmulq_f32(_2, vsubq_f32(src[8], src[18])), vsubq_f32(src[23], src[13])); - tmp[4] = vaddq_f32(vmulq_f32(_2, vsubq_f32(src[9], src[19])), vsubq_f32(src[24], src[14])); - Store(dst + 20 * stride, vaddq_f32(vmulq_f32(_2, vsubq_f32(tmp[0], tmp[2])), vsubq_f32(tmp[3], tmp[1]))); - Store(dst + 21 * stride, vsubq_f32(vsubq_f32(tmp[3], tmp[2]), vmulq_f32(_2, tmp[1]))); - Store(dst + 22 * stride, vaddq_f32(vmulq_f32(_2, tmp[1]), vsubq_f32(tmp[3], vmulq_f32(_3, tmp[2])))); - Store(dst + 23 * stride, vsubq_f32(tmp[3], tmp[1])); - Store(dst + 24 * stride, vaddq_f32(vmulq_f32(_2, vsubq_f32(tmp[1], tmp[3])), vsubq_f32(tmp[4], tmp[2]))); - } - - 
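/* The function above evaluates B^T * d * B for Winograd F(3x3, 3x3) on a 5x5 input tile d, with B^T = { { 2, -1, -2, 1, 0 }, { 0, -2, -1, 1, 0 }, { 0, 2, -3, 1, 0 }, { 0, -1, 0, 1, 0 }, { 0, 2, -1, -2, 1 } }; the tmp[] stage applies the transform along columns, the stores apply it along rows. */ -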
SIMD_INLINE void WinogradKernel3x3Block3x3SetInput4t(const float * src, size_t srcS, size_t srcC, float32x4_t dst[25]) - { - dst[0] = Load(src + 0 * srcS + 0 * srcC); - dst[1] = Load(src + 0 * srcS + 1 * srcC); - dst[2] = Load(src + 0 * srcS + 2 * srcC); - dst[3] = Load(src + 0 * srcS + 3 * srcC); - dst[4] = Load(src + 0 * srcS + 4 * srcC); - dst[5] = Load(src + 1 * srcS + 0 * srcC); - dst[6] = Load(src + 1 * srcS + 1 * srcC); - dst[7] = Load(src + 1 * srcS + 2 * srcC); - dst[8] = Load(src + 1 * srcS + 3 * srcC); - dst[9] = Load(src + 1 * srcS + 4 * srcC); - dst[10] = Load(src + 2 * srcS + 0 * srcC); - dst[11] = Load(src + 2 * srcS + 1 * srcC); - dst[12] = Load(src + 2 * srcS + 2 * srcC); - dst[13] = Load(src + 2 * srcS + 3 * srcC); - dst[14] = Load(src + 2 * srcS + 4 * srcC); - dst[15] = Load(src + 3 * srcS + 0 * srcC); - dst[16] = Load(src + 3 * srcS + 1 * srcC); - dst[17] = Load(src + 3 * srcS + 2 * srcC); - dst[18] = Load(src + 3 * srcS + 3 * srcC); - dst[19] = Load(src + 3 * srcS + 4 * srcC); - dst[20] = Load(src + 4 * srcS + 0 * srcC); - dst[21] = Load(src + 4 * srcS + 1 * srcC); - dst[22] = Load(src + 4 * srcS + 2 * srcC); - dst[23] = Load(src + 4 * srcS + 3 * srcC); - dst[24] = Load(src + 4 * srcS + 4 * srcC); - } - - SIMD_INLINE void WinogradKernel3x3Block3x3SetInput4t(const float * src, size_t srcW, size_t srcC, float * dst, size_t dstStride) - { - size_t srcS = srcW * srcC; - size_t srcCF = AlignLo(srcC, F); - for (size_t c = 0; c < srcCF; c += F) - { - float32x4_t tmp[25]; - WinogradKernel3x3Block3x3SetInput4t(src + c, srcS, srcC, tmp); - WinogradKernel3x3Block3x3SetInput4Store(tmp, dst + c, dstStride); - } - if (srcCF < srcC) - { - float32x4_t tmp[25]; - WinogradKernel3x3Block3x3SetInput4t(src + srcC - F, srcS, srcC, tmp); - WinogradKernel3x3Block3x3SetInput4Store(tmp, dst + srcC - F, dstStride); - } - } - - SIMD_INLINE void WinogradKernel3x3Block3x3SetInput4t(const float * src, size_t srcS, size_t srcC, size_t rowB, size_t rowE, size_t colB, size_t colE, float32x4_t dst[25]) - { - for (size_t i = 0; i < 25; ++i) - dst[i] = vdupq_n_f32(0.0f); - for (size_t row = rowB; row < rowE; ++row) - for (size_t col = colB; col < colE; ++col) - dst[row * 5 + col] = Load(src + row * srcS + col * srcC); - } - - SIMD_INLINE void WinogradKernel3x3Block3x3SetInput4t(const float * src, size_t srcW, size_t srcC, size_t rowB, size_t rowE, size_t colB, size_t colE, float * dst, size_t dstStride) - { - size_t srcS = srcW * srcC; - size_t srcCF = AlignLo(srcC, F); - for (size_t c = 0; c < srcCF; c += F) - { - float32x4_t tmp[25]; - WinogradKernel3x3Block3x3SetInput4t(src + c, srcS, srcC, rowB, rowE, colB, colE, tmp); - WinogradKernel3x3Block3x3SetInput4Store(tmp, dst + c, dstStride); - } - if (srcCF < srcC) - { - float32x4_t tmp[25]; - WinogradKernel3x3Block3x3SetInput4t(src + srcC - F, srcS, srcC, rowB, rowE, colB, colE, tmp); - WinogradKernel3x3Block3x3SetInput4Store(tmp, dst + srcC - F, dstStride); - } - } - - void WinogradKernel3x3Block3x3SetInput(const float* src, size_t srcChannels, size_t srcHeight, size_t srcWidth, - size_t padY, size_t padX, size_t padH, size_t padW, float* dst, size_t dstStride, SimdBool trans) - { - assert(padY == padX && padY == padH && padY == padW && (padY == 0 || padY == 1)); - SimdBool pad = padY > 0 ? SimdTrue : SimdFalse; - if (trans ? (srcChannels < 4) : (srcHeight < 5 || srcWidth < 11)) - { - Base::WinogradKernel3x3Block3x3SetInput(src, srcChannels, srcHeight, srcWidth, padY, padX, padH, padW, dst, dstStride, trans); - return; - } - size_t dstH = pad ? 
srcHeight : srcHeight - 2; - size_t dstW = pad ? srcWidth : srcWidth - 2; - size_t tileH = (dstH + 2) / 3; - size_t tileW = (dstW + 2) / 3; - size_t dstH3 = AlignLoAny(dstH, 3); - size_t dstW3 = AlignLoAny(dstW, 3); - if (trans) - { - size_t noseW = Simd::Min<size_t>(5, dstW + 1); - size_t noseH = Simd::Min<size_t>(5, dstH + 1); - size_t start = pad ? 3 : 0; - if (pad) - { - if (dstH == dstH3) - dstH3 -= 3; - if (dstW == dstW3) - dstW3 -= 3; - src -= (srcWidth + 1)*srcChannels; - } - size_t tailW = dstW - dstW3 + (pad ? 1 : 2); - size_t tailH = dstH - dstH3 + (pad ? 1 : 2); - size_t row = 0, col = 0; - if (pad) - { - if (pad) - WinogradKernel3x3Block3x3SetInput4t(src, srcWidth, srcChannels, 1, noseH, 1, noseW, dst, dstStride), dst += srcChannels; - for (col = start; col < dstW3; col += 3) - WinogradKernel3x3Block3x3SetInput4t(src + col * srcChannels, srcWidth, srcChannels, 1, noseH, 0, 5, dst, dstStride), dst += srcChannels; - if (col < dstW) - WinogradKernel3x3Block3x3SetInput4t(src + col * srcChannels, srcWidth, srcChannels, 1, noseH, 0, tailW, dst, dstStride), dst += srcChannels; - } - for (row = start; row < dstH3; row += 3) - { - if (pad) - WinogradKernel3x3Block3x3SetInput4t(src + row * srcWidth * srcChannels, srcWidth, srcChannels, 0, 5, 1, noseW, dst, dstStride), dst += srcChannels; - for (col = start; col < dstW3; col += 3) - WinogradKernel3x3Block3x3SetInput4t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, dst, dstStride), dst += srcChannels; - if (col < dstW) - WinogradKernel3x3Block3x3SetInput4t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, 0, 5, 0, tailW, dst, dstStride), dst += srcChannels; - } - if (row < dstH) - { - if (pad) - WinogradKernel3x3Block3x3SetInput4t(src + row * srcWidth* srcChannels, srcWidth, srcChannels, 0, tailH, 1, noseW, dst, dstStride), dst += srcChannels; - for (col = start; col < dstW3; col += 3) - WinogradKernel3x3Block3x3SetInput4t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, 0, tailH, 0, 5, dst, dstStride), dst += srcChannels; - if (col < dstW) - WinogradKernel3x3Block3x3SetInput4t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, 0, tailH, 0, tailW, dst, dstStride), dst += srcChannels; - } - } - else - { - Base::WinogradKernel3x3Block3x3SetInput(src, srcChannels, srcHeight, srcWidth, padY, padX, padH, padW, dst, dstStride, trans); - } - } - - //----------------------------------------------------------------------- - - SIMD_INLINE void WinogradKernel3x3Block3x3SetOutputLoad25(const float * src, size_t stride, float32x4_t dst[9]) - { - float32x4_t s[25]; - s[0] = Load(src + 0 * stride); - s[1] = Load(src + 1 * stride); - s[2] = Load(src + 2 * stride); - s[3] = Load(src + 3 * stride); - s[4] = Load(src + 4 * stride); - s[5] = Load(src + 5 * stride); - s[6] = Load(src + 6 * stride); - s[7] = Load(src + 7 * stride); - s[8] = Load(src + 8 * stride); - s[9] = Load(src + 9 * stride); - s[10] = Load(src + 10 * stride); - s[11] = Load(src + 11 * stride); - s[12] = Load(src + 12 * stride); - s[13] = Load(src + 13 * stride); - s[14] = Load(src + 14 * stride); - s[15] = Load(src + 15 * stride); - s[16] = Load(src + 16 * stride); - s[17] = Load(src + 17 * stride); - s[18] = Load(src + 18 * stride); - s[19] = Load(src + 19 * stride); - s[20] = Load(src + 20 * stride); - s[21] = Load(src + 21 * stride); - s[22] = Load(src + 22 * stride); - s[23] = Load(src + 23 * stride); - s[24] = Load(src + 24 * stride); - - float32x4_t _2 = vdupq_n_f32(2.0f); - float32x4_t _4 = vdupq_n_f32(4.0f); - float32x4_t 
t[5]; - t[0] = vaddq_f32(vaddq_f32(s[0], s[5]), vaddq_f32(s[10], s[15])); - t[1] = vaddq_f32(vaddq_f32(s[1], s[6]), vaddq_f32(s[11], s[16])); - t[2] = vaddq_f32(vaddq_f32(s[2], s[7]), vaddq_f32(s[12], s[17])); - t[3] = vaddq_f32(vaddq_f32(s[3], s[8]), vaddq_f32(s[13], s[18])); - t[4] = vaddq_f32(vaddq_f32(s[4], s[9]), vaddq_f32(s[14], s[19])); - dst[0] = vaddq_f32(vaddq_f32(t[0], t[1]), vaddq_f32(t[2], t[3])); - dst[1] = vaddq_f32(vsubq_f32(t[1], t[2]), vmulq_f32(_2, t[3])); - dst[2] = vaddq_f32(vaddq_f32(t[1], t[2]), vaddq_f32(vmulq_f32(_4, t[3]), t[4])); - - t[0] = vaddq_f32(vsubq_f32(s[5], s[10]), vmulq_f32(_2, s[15])); - t[1] = vaddq_f32(vsubq_f32(s[6], s[11]), vmulq_f32(_2, s[16])); - t[2] = vaddq_f32(vsubq_f32(s[7], s[12]), vmulq_f32(_2, s[17])); - t[3] = vaddq_f32(vsubq_f32(s[8], s[13]), vmulq_f32(_2, s[18])); - t[4] = vaddq_f32(vsubq_f32(s[9], s[14]), vmulq_f32(_2, s[19])); - dst[3] = vaddq_f32(vaddq_f32(t[0], t[1]), vaddq_f32(t[2], t[3])); - dst[4] = vaddq_f32(vsubq_f32(t[1], t[2]), vmulq_f32(_2, t[3])); - dst[5] = vaddq_f32(vaddq_f32(t[1], t[2]), vaddq_f32(vmulq_f32(_4, t[3]), t[4])); - - t[0] = vaddq_f32(vaddq_f32(s[5], s[10]), vaddq_f32(vmulq_f32(_4, s[15]), s[20])); - t[1] = vaddq_f32(vaddq_f32(s[6], s[11]), vaddq_f32(vmulq_f32(_4, s[16]), s[21])); - t[2] = vaddq_f32(vaddq_f32(s[7], s[12]), vaddq_f32(vmulq_f32(_4, s[17]), s[22])); - t[3] = vaddq_f32(vaddq_f32(s[8], s[13]), vaddq_f32(vmulq_f32(_4, s[18]), s[23])); - t[4] = vaddq_f32(vaddq_f32(s[9], s[14]), vaddq_f32(vmulq_f32(_4, s[19]), s[24])); - dst[6] = vaddq_f32(vaddq_f32(t[0], t[1]), vaddq_f32(t[2], t[3])); - dst[7] = vaddq_f32(vsubq_f32(t[1], t[2]), vmulq_f32(_2, t[3])); - dst[8] = vaddq_f32(vaddq_f32(t[1], t[2]), vaddq_f32(vmulq_f32(_4, t[3]), t[4])); - } - - SIMD_INLINE void WinogradKernel3x3Block3x3SetOutputStore9(const float32x4_t src[9], float * dst, size_t dstS, size_t dstC) - { - Store(dst + 0 * dstS + 0 * dstC, src[0]); - Store(dst + 0 * dstS + 1 * dstC, src[1]); - Store(dst + 0 * dstS + 2 * dstC, src[2]); - Store(dst + 1 * dstS + 0 * dstC, src[3]); - Store(dst + 1 * dstS + 1 * dstC, src[4]); - Store(dst + 1 * dstS + 2 * dstC, src[5]); - Store(dst + 2 * dstS + 0 * dstC, src[6]); - Store(dst + 2 * dstS + 1 * dstC, src[7]); - Store(dst + 2 * dstS + 2 * dstC, src[8]); - } - - SIMD_INLINE void WinogradKernel3x3Block3x3SetOutput4t(const float * src, size_t srcStride, float * dst, size_t dstW, size_t dstC) - { - size_t dstS = dstW * dstC, dstCF = AlignLo(dstC, F); - for (size_t d = 0; d < dstCF; d += F) - { - float32x4_t tmp[9]; - WinogradKernel3x3Block3x3SetOutputLoad25(src + d, srcStride, tmp); - WinogradKernel3x3Block3x3SetOutputStore9(tmp, dst + d, dstS, dstC); - } - if (dstCF < dstC) - { - float32x4_t tmp[9]; - WinogradKernel3x3Block3x3SetOutputLoad25(src + dstC - F, srcStride, tmp); - WinogradKernel3x3Block3x3SetOutputStore9(tmp, dst + dstC - F, dstS, dstC); - } - } - - SIMD_INLINE void WinogradKernel3x3Block3x3SetOutputStore9(const float32x4_t src[16], float * dst, size_t dstS, size_t dstC, size_t rowE, size_t colE) - { - for (size_t row = 0; row < rowE; ++row) - for (size_t col = 0; col < colE; ++col) - Store(dst + row * dstS + col * dstC, src[row * 3 + col]); - } - - SIMD_INLINE void WinogradKernel3x3Block3x3SetOutput4t(const float * src, size_t srcStride, float * dst, size_t dstW, size_t dstC, size_t rowE, size_t colE) - { - size_t dstS = dstW * dstC, dstCF = AlignLo(dstC, F); - for (size_t d = 0; d < dstCF; d += F) - { - float32x4_t tmp[9]; - WinogradKernel3x3Block3x3SetOutputLoad25(src + d, srcStride, tmp); 
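- /* border variant: keep only the rowE x colE pixels of the reconstructed 3x3 block */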
- WinogradKernel3x3Block3x3SetOutputStore9(tmp, dst + d, dstS, dstC, rowE, colE); - } - if (dstCF < dstC) - { - float32x4_t tmp[9]; - WinogradKernel3x3Block3x3SetOutputLoad25(src + dstC - F, srcStride, tmp); - WinogradKernel3x3Block3x3SetOutputStore9(tmp, dst + dstC - F, dstS, dstC, rowE, colE); - } - } - - void WinogradKernel3x3Block3x3SetOutput(const float * src, size_t srcStride, float * dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans) - { - if (trans ? (dstChannels < 4) : (dstHeight < 3 || dstWidth < 12)) - { - Base::WinogradKernel3x3Block3x3SetOutput(src, srcStride, dst, dstChannels, dstHeight, dstWidth, trans); - return; - } - size_t tileH = (dstHeight + 2) / 3; - size_t tileW = (dstWidth + 2) / 3; - size_t dstH3 = AlignLoAny(dstHeight, 3); - size_t dstW3 = AlignLoAny(dstWidth, 3); - if (trans) - { - size_t row, col; - for (row = 0; row < dstH3; row += 3) - { - for (col = 0; col < dstW3; col += 3) - WinogradKernel3x3Block3x3SetOutput4t(src, srcStride, dst + (row * dstWidth + col)*dstChannels, dstWidth, dstChannels), src += dstChannels; - if (col < dstWidth) - WinogradKernel3x3Block3x3SetOutput4t(src, srcStride, dst + (row * dstWidth + col)*dstChannels, dstWidth, dstChannels, 3, dstWidth - col), src += dstChannels; - } - if (row < dstHeight) - { - for (col = 0; col < dstW3; col += 3) - WinogradKernel3x3Block3x3SetOutput4t(src, srcStride, dst + (row * dstWidth + col)*dstChannels, dstWidth, dstChannels, dstHeight - row, 3), src += dstChannels; - if (col < dstWidth) - WinogradKernel3x3Block3x3SetOutput4t(src, srcStride, dst + (row * dstWidth + col)*dstChannels, dstWidth, dstChannels, dstHeight - row, dstWidth - col), src += dstChannels; - } - } - else - { - Base::WinogradKernel3x3Block3x3SetOutput(src, srcStride, dst, dstChannels, dstHeight, dstWidth, trans); - } - } - - //----------------------------------------------------------------------- - - SIMD_INLINE void WinogradKernel3x3Block4x4SetFilter4Row(const float32x4_t * t, float * dst, size_t stride) - { - const float32x4_t r4 = vdupq_n_f32(1.0f / 4.0f); - const float32x4_t r6 = vdupq_n_f32(1.0f / 6.0f); - const float32x4_t mr6 = vdupq_n_f32(-1.0f / 6.0f); - const float32x4_t r12 = vdupq_n_f32(1.0f / 12.0f); - const float32x4_t r24 = vdupq_n_f32(1.0f / 24.0f); - Store(dst + 0 * stride, vmulq_f32(r4, t[0])); - float32x4_t t0 = vaddq_f32(t[0], t[2]); - Store(dst + 1 * stride, vmulq_f32(mr6, vaddq_f32(t0, t[1]))); - Store(dst + 2 * stride, vmulq_f32(mr6, vsubq_f32(t0, t[1]))); - float32x4_t t1 = vaddq_f32(vmulq_f32(r24, t[0]), vmulq_f32(r6, t[2])); - float32x4_t t2 = vmulq_f32(r12, t[1]); - Store(dst + 3 * stride, vaddq_f32(t1, t2)); - Store(dst + 4 * stride, vsubq_f32(t1, t2)); - Store(dst + 5 * stride, t[2]); - } - - SIMD_INLINE void WinogradKernel3x3Block4x4SetFilter4All(const float32x4_t * s, float * dst, size_t stride) - { - const float32x4_t r4 = vdupq_n_f32(1.0f / 4.0f); - const float32x4_t r6 = vdupq_n_f32(1.0f / 6.0f); - const float32x4_t mr6 = vdupq_n_f32(-1.0f / 6.0f); - const float32x4_t r12 = vdupq_n_f32(1.0f / 12.0f); - const float32x4_t r24 = vdupq_n_f32(1.0f / 24.0f); - - float32x4_t t[3]; - t[0] = vmulq_f32(r4, s[0]); - t[1] = vmulq_f32(r4, s[1]); - t[2] = vmulq_f32(r4, s[2]); - WinogradKernel3x3Block4x4SetFilter4Row(t, dst + 0 * stride, stride); - - t[0] = vmulq_f32(mr6, vaddq_f32(vaddq_f32(s[0], s[3]), s[6])); - t[1] = vmulq_f32(mr6, vaddq_f32(vaddq_f32(s[1], s[4]), s[7])); - t[2] = vmulq_f32(mr6, vaddq_f32(vaddq_f32(s[2], s[5]), s[8])); - WinogradKernel3x3Block4x4SetFilter4Row(t, dst + 6 * 
stride, stride); - - t[0] = vmulq_f32(mr6, vaddq_f32(vsubq_f32(s[0], s[3]), s[6])); - t[1] = vmulq_f32(mr6, vaddq_f32(vsubq_f32(s[1], s[4]), s[7])); - t[2] = vmulq_f32(mr6, vaddq_f32(vsubq_f32(s[2], s[5]), s[8])); - WinogradKernel3x3Block4x4SetFilter4Row(t, dst + 12 * stride, stride); - - t[0] = vaddq_f32(vaddq_f32(vmulq_f32(r24, s[0]), vmulq_f32(r12, s[3])), vmulq_f32(r6, s[6])); - t[1] = vaddq_f32(vaddq_f32(vmulq_f32(r24, s[1]), vmulq_f32(r12, s[4])), vmulq_f32(r6, s[7])); - t[2] = vaddq_f32(vaddq_f32(vmulq_f32(r24, s[2]), vmulq_f32(r12, s[5])), vmulq_f32(r6, s[8])); - WinogradKernel3x3Block4x4SetFilter4Row(t, dst + 18 * stride, stride); - - t[0] = vaddq_f32(vsubq_f32(vmulq_f32(r24, s[0]), vmulq_f32(r12, s[3])), vmulq_f32(r6, s[6])); - t[1] = vaddq_f32(vsubq_f32(vmulq_f32(r24, s[1]), vmulq_f32(r12, s[4])), vmulq_f32(r6, s[7])); - t[2] = vaddq_f32(vsubq_f32(vmulq_f32(r24, s[2]), vmulq_f32(r12, s[5])), vmulq_f32(r6, s[8])); - WinogradKernel3x3Block4x4SetFilter4Row(t, dst + 24 * stride, stride); - - WinogradKernel3x3Block4x4SetFilter4Row(s + 6, dst + 30 * stride, stride); - } - - - SIMD_INLINE void WinogradKernel3x3Block4x4SetFilter4n(const float * src, float * dst, size_t stride) - { - float32x4_t s[9]; - Load4(src + 0, 9, s + 0); - Load4(src + 4, 9, s + 4); - s[8] = SetF32(src[8], src[17], src[26], src[35]); - WinogradKernel3x3Block4x4SetFilter4All(s, dst + 0 * stride, stride); - } - - SIMD_INLINE void WinogradKernel3x3Block4x4SetFilter4t(const float * src, float * dst, size_t stride) - { - float32x4_t s[9]; - s[0] = Load(src + 0 * stride); - s[1] = Load(src + 1 * stride); - s[2] = Load(src + 2 * stride); - s[3] = Load(src + 3 * stride); - s[4] = Load(src + 4 * stride); - s[5] = Load(src + 5 * stride); - s[6] = Load(src + 6 * stride); - s[7] = Load(src + 7 * stride); - s[8] = Load(src + 8 * stride); - WinogradKernel3x3Block4x4SetFilter4All(s, dst + 0 * stride, stride); - } - - void WinogradKernel3x3Block4x4SetFilter(const float * src, size_t size, float * dst, SimdBool trans) - { - size_t size4 = AlignLo(size, 4), i = 0; - if (trans) - { - for (; i < size4; i += 4) - WinogradKernel3x3Block4x4SetFilter4t(src + i, dst + i, size); - for (; i < size; i += 1) - Base::WinogradKernel3x3Block4x4SetFilter1t(src + i, dst + i, size); - } - else - { - for (; i < size4; i += 4, src += 36, dst += 4) - WinogradKernel3x3Block4x4SetFilter4n(src, dst, size); - for (; i < size; i += 1, src += 9, dst += 1) - Base::WinogradKernel3x3Block4x4SetFilter1n(src, dst, size); - } - } - - //----------------------------------------------------------------------- - - SIMD_INLINE void WinogradKernel3x3Block4x4SetInput4Store(const float32x4_t src[36], float * dst, size_t stride) - { - float32x4_t _2 = vdupq_n_f32(2.0f); - float32x4_t _4 = vdupq_n_f32(4.0f); - float32x4_t _5 = vdupq_n_f32(5.0f); - float32x4_t tmp[36]; - tmp[0] = vmlaq_f32(vmlsq_f32(src[24], _5, src[12]), _4, src[0]); - tmp[1] = vmlaq_f32(vmlsq_f32(src[25], _5, src[13]), _4, src[1]); - tmp[2] = vmlaq_f32(vmlsq_f32(src[26], _5, src[14]), _4, src[2]); - tmp[3] = vmlaq_f32(vmlsq_f32(src[27], _5, src[15]), _4, src[3]); - tmp[4] = vmlaq_f32(vmlsq_f32(src[28], _5, src[16]), _4, src[4]); - tmp[5] = vmlaq_f32(vmlsq_f32(src[29], _5, src[17]), _4, src[5]); - tmp[6] = vmlsq_f32(vaddq_f32(src[18], src[24]), _4, vaddq_f32(src[6], src[12])); - tmp[7] = vmlsq_f32(vaddq_f32(src[19], src[25]), _4, vaddq_f32(src[7], src[13])); - tmp[8] = vmlsq_f32(vaddq_f32(src[20], src[26]), _4, vaddq_f32(src[8], src[14])); - tmp[9] = vmlsq_f32(vaddq_f32(src[21], src[27]), _4, 
vaddq_f32(src[9], src[15])); - tmp[10] = vmlsq_f32(vaddq_f32(src[22], src[28]), _4, vaddq_f32(src[10], src[16])); - tmp[11] = vmlsq_f32(vaddq_f32(src[23], src[29]), _4, vaddq_f32(src[11], src[17])); - tmp[12] = vmlaq_f32(vsubq_f32(src[24], src[18]), _4, vsubq_f32(src[6], src[12])); - tmp[13] = vmlaq_f32(vsubq_f32(src[25], src[19]), _4, vsubq_f32(src[7], src[13])); - tmp[14] = vmlaq_f32(vsubq_f32(src[26], src[20]), _4, vsubq_f32(src[8], src[14])); - tmp[15] = vmlaq_f32(vsubq_f32(src[27], src[21]), _4, vsubq_f32(src[9], src[15])); - tmp[16] = vmlaq_f32(vsubq_f32(src[28], src[22]), _4, vsubq_f32(src[10], src[16])); - tmp[17] = vmlaq_f32(vsubq_f32(src[29], src[23]), _4, vsubq_f32(src[11], src[17])); - tmp[18] = vmlaq_f32(vsubq_f32(src[24], src[12]), _2, vsubq_f32(src[18], src[6])); - tmp[19] = vmlaq_f32(vsubq_f32(src[25], src[13]), _2, vsubq_f32(src[19], src[7])); - tmp[20] = vmlaq_f32(vsubq_f32(src[26], src[14]), _2, vsubq_f32(src[20], src[8])); - tmp[21] = vmlaq_f32(vsubq_f32(src[27], src[15]), _2, vsubq_f32(src[21], src[9])); - tmp[22] = vmlaq_f32(vsubq_f32(src[28], src[16]), _2, vsubq_f32(src[22], src[10])); - tmp[23] = vmlaq_f32(vsubq_f32(src[29], src[17]), _2, vsubq_f32(src[23], src[11])); - tmp[24] = vmlaq_f32(vsubq_f32(src[24], src[12]), _2, vsubq_f32(src[6], src[18])); - tmp[25] = vmlaq_f32(vsubq_f32(src[25], src[13]), _2, vsubq_f32(src[7], src[19])); - tmp[26] = vmlaq_f32(vsubq_f32(src[26], src[14]), _2, vsubq_f32(src[8], src[20])); - tmp[27] = vmlaq_f32(vsubq_f32(src[27], src[15]), _2, vsubq_f32(src[9], src[21])); - tmp[28] = vmlaq_f32(vsubq_f32(src[28], src[16]), _2, vsubq_f32(src[10], src[22])); - tmp[29] = vmlaq_f32(vsubq_f32(src[29], src[17]), _2, vsubq_f32(src[11], src[23])); - tmp[30] = vmlaq_f32(vmlsq_f32(src[30], _5, src[18]), _4, src[6]); - tmp[31] = vmlaq_f32(vmlsq_f32(src[31], _5, src[19]), _4, src[7]); - tmp[32] = vmlaq_f32(vmlsq_f32(src[32], _5, src[20]), _4, src[8]); - tmp[33] = vmlaq_f32(vmlsq_f32(src[33], _5, src[21]), _4, src[9]); - tmp[34] = vmlaq_f32(vmlsq_f32(src[34], _5, src[22]), _4, src[10]); - tmp[35] = vmlaq_f32(vmlsq_f32(src[35], _5, src[23]), _4, src[11]); - - Store(dst + 0 * stride, vmlaq_f32(vmlsq_f32(tmp[4], _5, tmp[2]), _4, tmp[0])); - Store(dst + 1 * stride, vmlsq_f32(vaddq_f32(tmp[3], tmp[4]), _4, vaddq_f32(tmp[1], tmp[2]))); - Store(dst + 2 * stride, vmlaq_f32(vsubq_f32(tmp[4], tmp[3]), _4, vsubq_f32(tmp[1], tmp[2]))); - Store(dst + 3 * stride, vmlaq_f32(vsubq_f32(tmp[4], tmp[2]), _2, vsubq_f32(tmp[3], tmp[1]))); - Store(dst + 4 * stride, vmlaq_f32(vsubq_f32(tmp[4], tmp[2]), _2, vsubq_f32(tmp[1], tmp[3]))); - Store(dst + 5 * stride, vmlaq_f32(vmlsq_f32(tmp[5], _5, tmp[3]), _4, tmp[1])); - Store(dst + 6 * stride, vmlaq_f32(vmlsq_f32(tmp[10], _5, tmp[8]), _4, tmp[6])); - Store(dst + 7 * stride, vmlsq_f32(vaddq_f32(tmp[9], tmp[10]), _4, vaddq_f32(tmp[7], tmp[8]))); - Store(dst + 8 * stride, vmlaq_f32(vsubq_f32(tmp[10], tmp[9]), _4, vsubq_f32(tmp[7], tmp[8]))); - Store(dst + 9 * stride, vmlaq_f32(vsubq_f32(tmp[10], tmp[8]), _2, vsubq_f32(tmp[9], tmp[7]))); - Store(dst + 10 * stride, vmlaq_f32(vsubq_f32(tmp[10], tmp[8]), _2, vsubq_f32(tmp[7], tmp[9]))); - Store(dst + 11 * stride, vmlaq_f32(vmlsq_f32(tmp[11], _5, tmp[9]), _4, tmp[7])); - Store(dst + 12 * stride, vmlaq_f32(vmlsq_f32(tmp[16], _5, tmp[14]), _4, tmp[12])); - Store(dst + 13 * stride, vmlsq_f32(vaddq_f32(tmp[15], tmp[16]), _4, vaddq_f32(tmp[13], tmp[14]))); - Store(dst + 14 * stride, vmlaq_f32(vsubq_f32(tmp[16], tmp[15]), _4, vsubq_f32(tmp[13], tmp[14]))); - Store(dst + 15 * stride, 
vmlaq_f32(vsubq_f32(tmp[16], tmp[14]), _2, vsubq_f32(tmp[15], tmp[13]))); - Store(dst + 16 * stride, vmlaq_f32(vsubq_f32(tmp[16], tmp[14]), _2, vsubq_f32(tmp[13], tmp[15]))); - Store(dst + 17 * stride, vmlaq_f32(vmlsq_f32(tmp[17], _5, tmp[15]), _4, tmp[13])); - Store(dst + 18 * stride, vmlaq_f32(vmlsq_f32(tmp[22], _5, tmp[20]), _4, tmp[18])); - Store(dst + 19 * stride, vmlsq_f32(vaddq_f32(tmp[21], tmp[22]), _4, vaddq_f32(tmp[19], tmp[20]))); - Store(dst + 20 * stride, vmlaq_f32(vsubq_f32(tmp[22], tmp[21]), _4, vsubq_f32(tmp[19], tmp[20]))); - Store(dst + 21 * stride, vmlaq_f32(vsubq_f32(tmp[22], tmp[20]), _2, vsubq_f32(tmp[21], tmp[19]))); - Store(dst + 22 * stride, vmlaq_f32(vsubq_f32(tmp[22], tmp[20]), _2, vsubq_f32(tmp[19], tmp[21]))); - Store(dst + 23 * stride, vmlaq_f32(vmlsq_f32(tmp[23], _5, tmp[21]), _4, tmp[19])); - Store(dst + 24 * stride, vmlaq_f32(vmlsq_f32(tmp[28], _5, tmp[26]), _4, tmp[24])); - Store(dst + 25 * stride, vmlsq_f32(vaddq_f32(tmp[27], tmp[28]), _4, vaddq_f32(tmp[25], tmp[26]))); - Store(dst + 26 * stride, vmlaq_f32(vsubq_f32(tmp[28], tmp[27]), _4, vsubq_f32(tmp[25], tmp[26]))); - Store(dst + 27 * stride, vmlaq_f32(vsubq_f32(tmp[28], tmp[26]), _2, vsubq_f32(tmp[27], tmp[25]))); - Store(dst + 28 * stride, vmlaq_f32(vsubq_f32(tmp[28], tmp[26]), _2, vsubq_f32(tmp[25], tmp[27]))); - Store(dst + 29 * stride, vmlaq_f32(vmlsq_f32(tmp[29], _5, tmp[27]), _4, tmp[25])); - Store(dst + 30 * stride, vmlaq_f32(vmlsq_f32(tmp[34], _5, tmp[32]), _4, tmp[30])); - Store(dst + 31 * stride, vmlsq_f32(vaddq_f32(tmp[33], tmp[34]), _4, vaddq_f32(tmp[31], tmp[32]))); - Store(dst + 32 * stride, vmlaq_f32(vsubq_f32(tmp[34], tmp[33]), _4, vsubq_f32(tmp[31], tmp[32]))); - Store(dst + 33 * stride, vmlaq_f32(vsubq_f32(tmp[34], tmp[32]), _2, vsubq_f32(tmp[33], tmp[31]))); - Store(dst + 34 * stride, vmlaq_f32(vsubq_f32(tmp[34], tmp[32]), _2, vsubq_f32(tmp[31], tmp[33]))); - Store(dst + 35 * stride, vmlaq_f32(vmlsq_f32(tmp[35], _5, tmp[33]), _4, tmp[31])); - } - - SIMD_INLINE void WinogradKernel3x3Block4x4SetInput4t(const float * src, size_t srcS, size_t srcC, float32x4_t dst[36]) - { - dst[0] = Load(src + 0 * srcS + 0 * srcC); - dst[1] = Load(src + 0 * srcS + 1 * srcC); - dst[2] = Load(src + 0 * srcS + 2 * srcC); - dst[3] = Load(src + 0 * srcS + 3 * srcC); - dst[4] = Load(src + 0 * srcS + 4 * srcC); - dst[5] = Load(src + 0 * srcS + 5 * srcC); - dst[6] = Load(src + 1 * srcS + 0 * srcC); - dst[7] = Load(src + 1 * srcS + 1 * srcC); - dst[8] = Load(src + 1 * srcS + 2 * srcC); - dst[9] = Load(src + 1 * srcS + 3 * srcC); - dst[10] = Load(src + 1 * srcS + 4 * srcC); - dst[11] = Load(src + 1 * srcS + 5 * srcC); - dst[12] = Load(src + 2 * srcS + 0 * srcC); - dst[13] = Load(src + 2 * srcS + 1 * srcC); - dst[14] = Load(src + 2 * srcS + 2 * srcC); - dst[15] = Load(src + 2 * srcS + 3 * srcC); - dst[16] = Load(src + 2 * srcS + 4 * srcC); - dst[17] = Load(src + 2 * srcS + 5 * srcC); - dst[18] = Load(src + 3 * srcS + 0 * srcC); - dst[19] = Load(src + 3 * srcS + 1 * srcC); - dst[20] = Load(src + 3 * srcS + 2 * srcC); - dst[21] = Load(src + 3 * srcS + 3 * srcC); - dst[22] = Load(src + 3 * srcS + 4 * srcC); - dst[23] = Load(src + 3 * srcS + 5 * srcC); - dst[24] = Load(src + 4 * srcS + 0 * srcC); - dst[25] = Load(src + 4 * srcS + 1 * srcC); - dst[26] = Load(src + 4 * srcS + 2 * srcC); - dst[27] = Load(src + 4 * srcS + 3 * srcC); - dst[28] = Load(src + 4 * srcS + 4 * srcC); - dst[29] = Load(src + 4 * srcS + 5 * srcC); - dst[30] = Load(src + 5 * srcS + 0 * srcC); - dst[31] = Load(src + 5 * srcS + 1 * srcC); - dst[32] = 
Load(src + 5 * srcS + 2 * srcC); - dst[33] = Load(src + 5 * srcS + 3 * srcC); - dst[34] = Load(src + 5 * srcS + 4 * srcC); - dst[35] = Load(src + 5 * srcS + 5 * srcC); - } - - SIMD_INLINE void WinogradKernel3x3Block4x4SetInput4t(const float * src, size_t srcW, size_t srcC, float * dst, size_t dstStride) - { - size_t srcS = srcW * srcC; - size_t srcCF = AlignLo(srcC, F); - for (size_t c = 0; c < srcCF; c += F) - { - float32x4_t tmp[36]; - WinogradKernel3x3Block4x4SetInput4t(src + c, srcS, srcC, tmp); - WinogradKernel3x3Block4x4SetInput4Store(tmp, dst + c, dstStride); - } - if (srcCF < srcC) - { - float32x4_t tmp[36]; - WinogradKernel3x3Block4x4SetInput4t(src + srcC - F, srcS, srcC, tmp); - WinogradKernel3x3Block4x4SetInput4Store(tmp, dst + srcC - F, dstStride); - } - } - - SIMD_INLINE void WinogradKernel3x3Block4x4SetInput4t(const float * src, size_t srcS, size_t srcC, size_t rowB, size_t rowE, size_t colB, size_t colE, float32x4_t dst[36]) - { - for (size_t i = 0; i < 36; ++i) - dst[i] = vdupq_n_f32(0.0f); - for (size_t row = rowB; row < rowE; ++row) - for (size_t col = colB; col < colE; ++col) - dst[row * 6 + col] = Load(src + row * srcS + col * srcC); - } - - SIMD_INLINE void WinogradKernel3x3Block4x4SetInput4t(const float * src, size_t srcW, size_t srcC, size_t rowB, size_t rowE, size_t colB, size_t colE, float * dst, size_t dstStride) - { - size_t srcS = srcW * srcC; - size_t srcCF = AlignLo(srcC, F); - for (size_t c = 0; c < srcCF; c += F) - { - float32x4_t tmp[36]; - WinogradKernel3x3Block4x4SetInput4t(src + c, srcS, srcC, rowB, rowE, colB, colE, tmp); - WinogradKernel3x3Block4x4SetInput4Store(tmp, dst + c, dstStride); - } - if (srcCF < srcC) - { - float32x4_t tmp[36]; - WinogradKernel3x3Block4x4SetInput4t(src + srcC - F, srcS, srcC, rowB, rowE, colB, colE, tmp); - WinogradKernel3x3Block4x4SetInput4Store(tmp, dst + srcC - F, dstStride); - } - } - - void WinogradKernel3x3Block4x4SetInput(const float* src, size_t srcChannels, size_t srcHeight, size_t srcWidth, - size_t padY, size_t padX, size_t padH, size_t padW, float* dst, size_t dstStride, SimdBool trans) - { - if (trans ? (srcChannels < 4) : (srcHeight < 6 || srcWidth < 12)) - { - Base::WinogradKernel3x3Block4x4SetInput(src, srcChannels, srcHeight, srcWidth, padY, padX, padH, padW, dst, dstStride, trans); - return; - } - if (trans) - { - assert(padY + padH <= 2 && padX + padW <= 2); - size_t dstH = srcHeight - 2 + padY + padH; - size_t dstW = srcWidth - 2 + padX + padW; - size_t dstH4 = dstH / 4 * 4; - size_t dstW4 = dstW / 4 * 4; - size_t noseW = Simd::Min(6, srcWidth + padX); - size_t noseH = Simd::Min(6, srcHeight + padY); - size_t startY = padY ? 4 : 0; - size_t startX = padX ? 4 : 0; - if (padH && dstH == dstH4) - dstH4 -= 4; - if (padY) - src -= srcWidth * srcChannels; - if (padW && dstW == dstW4) - dstW4 -= 4; - if (padX) - src -= srcChannels; - size_t tailW = dstW - dstW4 + (padW ? 1 : 2); - size_t tailH = dstH - dstH4 + (padH ? 
1 : 2); - size_t row = 0, col = 0; - if (padY) - { - if (padX) - WinogradKernel3x3Block4x4SetInput4t(src, srcWidth, srcChannels, 1, noseH, 1, noseW, dst, dstStride), dst += srcChannels; - for (col = startX; col < dstW4; col += 4) - WinogradKernel3x3Block4x4SetInput4t(src + col * srcChannels, srcWidth, srcChannels, 1, noseH, 0, 6, dst, dstStride), dst += srcChannels; - if (col < dstW) - WinogradKernel3x3Block4x4SetInput4t(src + col * srcChannels, srcWidth, srcChannels, 1, noseH, 0, tailW, dst, dstStride), dst += srcChannels; - } - for (row = startY; row < dstH4; row += 4) - { - if (padX) - WinogradKernel3x3Block4x4SetInput4t(src + row * srcWidth * srcChannels, srcWidth, srcChannels, 0, 6, 1, noseW, dst, dstStride), dst += srcChannels; - for (col = startX; col < dstW4; col += 4) - WinogradKernel3x3Block4x4SetInput4t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, dst, dstStride), dst += srcChannels; - if (col < dstW) - WinogradKernel3x3Block4x4SetInput4t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, 0, 6, 0, tailW, dst, dstStride), dst += srcChannels; - } - if (row < dstH) - { - if (padX) - WinogradKernel3x3Block4x4SetInput4t(src + row * srcWidth* srcChannels, srcWidth, srcChannels, 0, tailH, 1, noseW, dst, dstStride), dst += srcChannels; - for (col = startX; col < dstW4; col += 4) - WinogradKernel3x3Block4x4SetInput4t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, 0, tailH, 0, 6, dst, dstStride), dst += srcChannels; - if (col < dstW) - WinogradKernel3x3Block4x4SetInput4t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, 0, tailH, 0, tailW, dst, dstStride), dst += srcChannels; - } - } - else - { - Base::WinogradKernel3x3Block4x4SetInput(src, srcChannels, srcHeight, srcWidth, padY, padX, padH, padW, dst, dstStride, trans); - } - } - - //----------------------------------------------------------------------- - - SIMD_INLINE void WinogradKernel3x3Block4x4SetOutputLoad36(const float * src, size_t stride, float32x4_t dst[16]) - { - float32x4_t s[36]; - s[0] = Load(src + 0 * stride); - s[1] = Load(src + 1 * stride); - s[2] = Load(src + 2 * stride); - s[3] = Load(src + 3 * stride); - s[4] = Load(src + 4 * stride); - s[5] = Load(src + 5 * stride); - s[6] = Load(src + 6 * stride); - s[7] = Load(src + 7 * stride); - s[8] = Load(src + 8 * stride); - s[9] = Load(src + 9 * stride); - s[10] = Load(src + 10 * stride); - s[11] = Load(src + 11 * stride); - s[12] = Load(src + 12 * stride); - s[13] = Load(src + 13 * stride); - s[14] = Load(src + 14 * stride); - s[15] = Load(src + 15 * stride); - s[16] = Load(src + 16 * stride); - s[17] = Load(src + 17 * stride); - s[18] = Load(src + 18 * stride); - s[19] = Load(src + 19 * stride); - s[20] = Load(src + 20 * stride); - s[21] = Load(src + 21 * stride); - s[22] = Load(src + 22 * stride); - s[23] = Load(src + 23 * stride); - s[24] = Load(src + 24 * stride); - s[25] = Load(src + 25 * stride); - s[26] = Load(src + 26 * stride); - s[27] = Load(src + 27 * stride); - s[28] = Load(src + 28 * stride); - s[29] = Load(src + 29 * stride); - s[30] = Load(src + 30 * stride); - s[31] = Load(src + 31 * stride); - s[32] = Load(src + 32 * stride); - s[33] = Load(src + 33 * stride); - s[34] = Load(src + 34 * stride); - s[35] = Load(src + 35 * stride); - - float32x4_t _2 = vdupq_n_f32(2.0f); - float32x4_t _4 = vdupq_n_f32(4.0f); - float32x4_t _8 = vdupq_n_f32(8.0f); - float32x4_t t[24]; - t[0] = vaddq_f32(vaddq_f32(vaddq_f32(s[0], s[6]), vaddq_f32(s[12], s[18])), s[24]); - t[1] = 
vaddq_f32(vaddq_f32(vaddq_f32(s[1], s[7]), vaddq_f32(s[13], s[19])), s[25]); - t[2] = vaddq_f32(vaddq_f32(vaddq_f32(s[2], s[8]), vaddq_f32(s[14], s[20])), s[26]); - t[3] = vaddq_f32(vaddq_f32(vaddq_f32(s[3], s[9]), vaddq_f32(s[15], s[21])), s[27]); - t[4] = vaddq_f32(vaddq_f32(vaddq_f32(s[4], s[10]), vaddq_f32(s[16], s[22])), s[28]); - t[5] = vaddq_f32(vaddq_f32(vaddq_f32(s[5], s[11]), vaddq_f32(s[17], s[23])), s[29]); - t[6] = vmlaq_f32(vsubq_f32(s[6], s[12]), _2, vsubq_f32(s[18], s[24])); - t[7] = vmlaq_f32(vsubq_f32(s[7], s[13]), _2, vsubq_f32(s[19], s[25])); - t[8] = vmlaq_f32(vsubq_f32(s[8], s[14]), _2, vsubq_f32(s[20], s[26])); - t[9] = vmlaq_f32(vsubq_f32(s[9], s[15]), _2, vsubq_f32(s[21], s[27])); - t[10] = vmlaq_f32(vsubq_f32(s[10], s[16]), _2, vsubq_f32(s[22], s[28])); - t[11] = vmlaq_f32(vsubq_f32(s[11], s[17]), _2, vsubq_f32(s[23], s[29])); - t[12] = vmlaq_f32(vaddq_f32(s[6], s[12]), _4, vaddq_f32(s[18], s[24])); - t[13] = vmlaq_f32(vaddq_f32(s[7], s[13]), _4, vaddq_f32(s[19], s[25])); - t[14] = vmlaq_f32(vaddq_f32(s[8], s[14]), _4, vaddq_f32(s[20], s[26])); - t[15] = vmlaq_f32(vaddq_f32(s[9], s[15]), _4, vaddq_f32(s[21], s[27])); - t[16] = vmlaq_f32(vaddq_f32(s[10], s[16]), _4, vaddq_f32(s[22], s[28])); - t[17] = vmlaq_f32(vaddq_f32(s[11], s[17]), _4, vaddq_f32(s[23], s[29])); - t[18] = vaddq_f32(vmlaq_f32(vsubq_f32(s[6], s[12]), _8, vsubq_f32(s[18], s[24])), s[30]); - t[19] = vaddq_f32(vmlaq_f32(vsubq_f32(s[7], s[13]), _8, vsubq_f32(s[19], s[25])), s[31]); - t[20] = vaddq_f32(vmlaq_f32(vsubq_f32(s[8], s[14]), _8, vsubq_f32(s[20], s[26])), s[32]); - t[21] = vaddq_f32(vmlaq_f32(vsubq_f32(s[9], s[15]), _8, vsubq_f32(s[21], s[27])), s[33]); - t[22] = vaddq_f32(vmlaq_f32(vsubq_f32(s[10], s[16]), _8, vsubq_f32(s[22], s[28])), s[34]); - t[23] = vaddq_f32(vmlaq_f32(vsubq_f32(s[11], s[17]), _8, vsubq_f32(s[23], s[29])), s[35]); - - dst[0] = vaddq_f32(vaddq_f32(vaddq_f32(t[0], t[1]), vaddq_f32(t[2], t[3])), t[4]); - dst[1] = vmlaq_f32(vsubq_f32(t[1], t[2]), _2, vsubq_f32(t[3], t[4])); - dst[2] = vmlaq_f32(vaddq_f32(t[1], t[2]), _4, vaddq_f32(t[3], t[4])); - dst[3] = vaddq_f32(vmlaq_f32(vsubq_f32(t[1], t[2]), _8, vsubq_f32(t[3], t[4])), t[5]); - dst[4] = vaddq_f32(vaddq_f32(vaddq_f32(t[6], t[7]), vaddq_f32(t[8], t[9])), t[10]); - dst[5] = vmlaq_f32(vsubq_f32(t[7], t[8]), _2, vsubq_f32(t[9], t[10])); - dst[6] = vmlaq_f32(vaddq_f32(t[7], t[8]), _4, vaddq_f32(t[9], t[10])); - dst[7] = vaddq_f32(vmlaq_f32(vsubq_f32(t[7], t[8]), _8, vsubq_f32(t[9], t[10])), t[11]); - dst[8] = vaddq_f32(vaddq_f32(vaddq_f32(t[12], t[13]), vaddq_f32(t[14], t[15])), t[16]); - dst[9] = vmlaq_f32(vsubq_f32(t[13], t[14]), _2, vsubq_f32(t[15], t[16])); - dst[10] = vmlaq_f32(vaddq_f32(t[13], t[14]), _4, vaddq_f32(t[15], t[16])); - dst[11] = vaddq_f32(vmlaq_f32(vsubq_f32(t[13], t[14]), _8, vsubq_f32(t[15], t[16])), t[17]); - dst[12] = vaddq_f32(vaddq_f32(vaddq_f32(t[18], t[19]), vaddq_f32(t[20], t[21])), t[22]); - dst[13] = vmlaq_f32(vsubq_f32(t[19], t[20]), _2, vsubq_f32(t[21], t[22])); - dst[14] = vmlaq_f32(vaddq_f32(t[19], t[20]), _4, vaddq_f32(t[21], t[22])); - dst[15] = vaddq_f32(vmlaq_f32(vsubq_f32(t[19], t[20]), _8, vsubq_f32(t[21], t[22])), t[23]); - } - - SIMD_INLINE void WinogradKernel3x3Block4x4SetOutputStore16(const float32x4_t src[16], float * dst, size_t dstS, size_t dstC) - { - Store(dst + 0 * dstS + 0 * dstC, src[0]); - Store(dst + 0 * dstS + 1 * dstC, src[1]); - Store(dst + 0 * dstS + 2 * dstC, src[2]); - Store(dst + 0 * dstS + 3 * dstC, src[3]); - Store(dst + 1 * dstS + 0 * dstC, src[4]); - Store(dst 
+ 1 * dstS + 1 * dstC, src[5]); - Store(dst + 1 * dstS + 2 * dstC, src[6]); - Store(dst + 1 * dstS + 3 * dstC, src[7]); - Store(dst + 2 * dstS + 0 * dstC, src[8]); - Store(dst + 2 * dstS + 1 * dstC, src[9]); - Store(dst + 2 * dstS + 2 * dstC, src[10]); - Store(dst + 2 * dstS + 3 * dstC, src[11]); - Store(dst + 3 * dstS + 0 * dstC, src[12]); - Store(dst + 3 * dstS + 1 * dstC, src[13]); - Store(dst + 3 * dstS + 2 * dstC, src[14]); - Store(dst + 3 * dstS + 3 * dstC, src[15]); - } - - SIMD_INLINE void WinogradKernel3x3Block4x4SetOutput4t(const float * src, size_t srcStride, float * dst, size_t dstW, size_t dstC) - { - size_t dstS = dstW * dstC, dstCF = AlignLo(dstC, F); - for (size_t d = 0; d < dstCF; d += F) - { - float32x4_t tmp[16]; - WinogradKernel3x3Block4x4SetOutputLoad36(src + d, srcStride, tmp); - WinogradKernel3x3Block4x4SetOutputStore16(tmp, dst + d, dstS, dstC); - } - if (dstCF < dstC) - { - float32x4_t tmp[16]; - WinogradKernel3x3Block4x4SetOutputLoad36(src + dstC - F, srcStride, tmp); - WinogradKernel3x3Block4x4SetOutputStore16(tmp, dst + dstC - F, dstS, dstC); - } - } - - SIMD_INLINE void WinogradKernel3x3Block4x4SetOutputStore16(const float32x4_t src[16], float * dst, size_t dstS, size_t dstC, size_t rowE, size_t colE) - { - for (size_t row = 0; row < rowE; ++row) - for (size_t col = 0; col < colE; ++col) - Store(dst + row * dstS + col * dstC, src[row * 4 + col]); - } - - SIMD_INLINE void WinogradKernel3x3Block4x4SetOutput4t(const float * src, size_t srcStride, float * dst, size_t dstW, size_t dstC, size_t rowE, size_t colE) - { - size_t dstS = dstW * dstC, dstCF = AlignLo(dstC, F); - for (size_t d = 0; d < dstCF; d += F) - { - float32x4_t tmp[16]; - WinogradKernel3x3Block4x4SetOutputLoad36(src + d, srcStride, tmp); - WinogradKernel3x3Block4x4SetOutputStore16(tmp, dst + d, dstS, dstC, rowE, colE); - } - if (dstCF < dstC) - { - float32x4_t tmp[16]; - WinogradKernel3x3Block4x4SetOutputLoad36(src + dstC - F, srcStride, tmp); - WinogradKernel3x3Block4x4SetOutputStore16(tmp, dst + dstC - F, dstS, dstC, rowE, colE); - } - } - - void WinogradKernel3x3Block4x4SetOutput(const float * src, size_t srcStride, float * dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans) - { - if (trans ? 
(dstChannels < 4) : (dstHeight < 4 || dstWidth < 16)) - { - Base::WinogradKernel3x3Block4x4SetOutput(src, srcStride, dst, dstChannels, dstHeight, dstWidth, trans); - return; - } - size_t tileH = (dstHeight + 3) / 4; - size_t tileW = (dstWidth + 3) / 4; - size_t dstH4 = AlignLo(dstHeight, 4); - size_t dstW4 = AlignLo(dstWidth, 4); - if (trans) - { - size_t row, col; - for (row = 0; row < dstH4; row += 4) - { - for (col = 0; col < dstW4; col += 4) - WinogradKernel3x3Block4x4SetOutput4t(src, srcStride, dst + (row * dstWidth + col)*dstChannels, dstWidth, dstChannels), src += dstChannels; - if (col < dstWidth) - WinogradKernel3x3Block4x4SetOutput4t(src, srcStride, dst + (row * dstWidth + col)*dstChannels, dstWidth, dstChannels, 4, dstWidth - col), src += dstChannels; - } - if (row < dstHeight) - { - for (col = 0; col < dstW4; col += 4) - WinogradKernel3x3Block4x4SetOutput4t(src, srcStride, dst + (row * dstWidth + col)*dstChannels, dstWidth, dstChannels, dstHeight - row, 4), src += dstChannels; - if (col < dstWidth) - WinogradKernel3x3Block4x4SetOutput4t(src, srcStride, dst + (row * dstWidth + col)*dstChannels, dstWidth, dstChannels, dstHeight - row, dstWidth - col), src += dstChannels; - } - } - else - { - Base::WinogradKernel3x3Block4x4SetOutput(src, srcStride, dst, dstChannels, dstHeight, dstWidth, trans); - } - } - } -#endif// SIMD_NEON_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdNeonYuvToBgr.cpp b/src/3rd/Simd/Simd/SimdNeonYuvToBgr.cpp deleted file mode 100644 index f1beef50..00000000 --- a/src/3rd/Simd/Simd/SimdNeonYuvToBgr.cpp +++ /dev/null @@ -1,357 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
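The file being removed below vectorizes planar YUV to BGR conversion. For reference, this is a minimal scalar sketch of the per-pixel transform each NEON lane performs, assuming full-range BT.601-style coefficients; the library's exact fixed-point constants live in SimdConversion.h, which is outside this hunk, and YuvToBgrScalar is a hypothetical name:

    #include <algorithm>
    #include <cstdint>

    // Scalar reference for one pixel; the real code uses fixed-point NEON math.
    static void YuvToBgrScalar(uint8_t y, uint8_t u, uint8_t v, uint8_t bgr[3])
    {
        const float Y = (float)y, U = (float)u - 128.0f, V = (float)v - 128.0f;
        auto clamp = [](float x) { return (uint8_t)std::min(std::max(x, 0.0f), 255.0f); };
        bgr[0] = clamp(Y + 1.772f * U);              // blue
        bgr[1] = clamp(Y - 0.344f * U - 0.714f * V); // green
        bgr[2] = clamp(Y + 1.402f * V);              // red
    }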
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdConversion.h" - -namespace Simd -{ -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - const size_t A3 = A * 3; - const size_t A6 = A * 6; - - template SIMD_INLINE void YuvToBgr(const uint8x16_t & y, const uint8x16_t & u, const uint8x16_t & v, uint8_t * bgr) - { - uint8x16x3_t _bgr; - YuvToBgr(y, u, v, _bgr); - Store3(bgr, _bgr); - } - - template SIMD_INLINE void Yuv422pToBgr(const uint8_t * y, const uint8x16x2_t & u, const uint8x16x2_t & v, uint8_t * bgr) - { - YuvToBgr(Load(y + 0), u.val[0], v.val[0], bgr + 0); - YuvToBgr(Load(y + A), u.val[1], v.val[1], bgr + A3); - } - - template void Yuv420pToBgr(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * bgr, size_t bgrStride) - { - assert((width % 2 == 0) && (height % 2 == 0) && (width >= DA) && (height >= 2)); - if (align) - { - assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride)); - assert(Aligned(v) && Aligned(vStride) && Aligned(bgr) && Aligned(bgrStride)); - } - - size_t bodyWidth = AlignLo(width, DA); - size_t tail = width - bodyWidth; - uint8x16x2_t _u, _v; - for (size_t row = 0; row < height; row += 2) - { - for (size_t colUV = 0, colY = 0, colBgr = 0; colY < bodyWidth; colY += DA, colUV += A, colBgr += A6) - { - _u.val[1] = _u.val[0] = Load(u + colUV); - _u = vzipq_u8(_u.val[0], _u.val[1]); - _v.val[1] = _v.val[0] = Load(v + colUV); - _v = vzipq_u8(_v.val[0], _v.val[1]); - Yuv422pToBgr(y + colY, _u, _v, bgr + colBgr); - Yuv422pToBgr(y + colY + yStride, _u, _v, bgr + colBgr + bgrStride); - } - if (tail) - { - size_t offset = width - DA; - _u.val[1] = _u.val[0] = Load(u + offset / 2); - _u = vzipq_u8(_u.val[0], _u.val[1]); - _v.val[1] = _v.val[0] = Load(v + offset / 2); - _v = vzipq_u8(_v.val[0], _v.val[1]); - Yuv422pToBgr(y + offset, _u, _v, bgr + 3 * offset); - Yuv422pToBgr(y + offset + yStride, _u, _v, bgr + 3 * offset + bgrStride); - } - y += 2 * yStride; - u += uStride; - v += vStride; - bgr += 2 * bgrStride; - } - } - - void Yuv420pToBgr(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * bgr, size_t bgrStride) - { - if (Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride) - && Aligned(v) && Aligned(vStride) && Aligned(bgr) && Aligned(bgrStride)) - Yuv420pToBgr(y, yStride, u, uStride, v, vStride, width, height, bgr, bgrStride); - else - Yuv420pToBgr(y, yStride, u, uStride, v, vStride, width, height, bgr, bgrStride); - } - - template void Yuv422pToBgr(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * bgr, size_t bgrStride) - { - assert((width % 2 == 0) && (width >= DA)); - if (align) - { - assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride)); - assert(Aligned(v) && Aligned(vStride) && Aligned(bgr) && Aligned(bgrStride)); - } - - size_t bodyWidth = AlignLo(width, DA); - size_t tail = width - bodyWidth; - uint8x16x2_t _u, _v; - for (size_t row = 0; row < height; ++row) - { - for (size_t colUV = 0, colY = 0, colBgr = 0; colY < bodyWidth; colY += DA, colUV += A, colBgr += A6) - { - _u.val[1] = _u.val[0] = Load(u + colUV); - _u = vzipq_u8(_u.val[0], _u.val[1]); - _v.val[1] = _v.val[0] = Load(v + colUV); - _v = vzipq_u8(_v.val[0], _v.val[1]); - Yuv422pToBgr(y + colY, _u, _v, bgr + colBgr); - } - if (tail) - { - size_t offset = 
width - DA; - _u.val[1] = _u.val[0] = Load(u + offset / 2); - _u = vzipq_u8(_u.val[0], _u.val[1]); - _v.val[1] = _v.val[0] = Load(v + offset / 2); - _v = vzipq_u8(_v.val[0], _v.val[1]); - Yuv422pToBgr(y + offset, _u, _v, bgr + 3 * offset); - } - y += yStride; - u += uStride; - v += vStride; - bgr += bgrStride; - } - } - - void Yuv422pToBgr(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * bgr, size_t bgrStride) - { - if (Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride) - && Aligned(v) && Aligned(vStride) && Aligned(bgr) && Aligned(bgrStride)) - Yuv422pToBgr(y, yStride, u, uStride, v, vStride, width, height, bgr, bgrStride); - else - Yuv422pToBgr(y, yStride, u, uStride, v, vStride, width, height, bgr, bgrStride); - } - - template void Yuv444pToBgr(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * bgr, size_t bgrStride) - { - assert(width >= A); - if (align) - { - assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride)); - assert(Aligned(v) && Aligned(vStride) && Aligned(bgr) && Aligned(bgrStride)); - } - - size_t bodyWidth = AlignLo(width, A); - size_t tail = width - bodyWidth; - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0, colBgr = 0; col < bodyWidth; col += A, colBgr += A3) - { - uint8x16_t _y = Load(y + col); - uint8x16_t _u = Load(u + col); - uint8x16_t _v = Load(v + col); - YuvToBgr(_y, _u, _v, bgr + colBgr); - } - if (tail) - { - size_t col = width - A; - uint8x16_t _y = Load(y + col); - uint8x16_t _u = Load(u + col); - uint8x16_t _v = Load(v + col); - YuvToBgr(_y, _u, _v, bgr + 3 * col); - } - y += yStride; - u += uStride; - v += vStride; - bgr += bgrStride; - } - } - - void Yuv444pToBgr(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * bgr, size_t bgrStride) - { - if (Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride) - && Aligned(v) && Aligned(vStride) && Aligned(bgr) && Aligned(bgrStride)) - Yuv444pToBgr(y, yStride, u, uStride, v, vStride, width, height, bgr, bgrStride); - else - Yuv444pToBgr(y, yStride, u, uStride, v, vStride, width, height, bgr, bgrStride); - } - - //--------------------------------------------------------------------- - - template SIMD_INLINE void YuvToRgb(const uint8x16_t& y, const uint8x16_t& u, const uint8x16_t& v, uint8_t* rgb) - { - uint8x16x3_t _rgb; - YuvToRgb(y, u, v, _rgb); - Store3(rgb, _rgb); - } - - template SIMD_INLINE void Yuv422pToRgb(const uint8_t* y, const uint8x16x2_t& u, const uint8x16x2_t& v, uint8_t* rgb) - { - YuvToRgb(Load(y + 0), u.val[0], v.val[0], rgb + 0); - YuvToRgb(Load(y + A), u.val[1], v.val[1], rgb + A3); - } - - template void Yuv420pToRgb(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride, - size_t width, size_t height, uint8_t* rgb, size_t rgbStride) - { - assert((width % 2 == 0) && (height % 2 == 0) && (width >= DA) && (height >= 2)); - if (align) - { - assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride)); - assert(Aligned(v) && Aligned(vStride) && Aligned(rgb) && Aligned(rgbStride)); - } - - size_t bodyWidth = AlignLo(width, DA); - size_t tail = width - bodyWidth; - uint8x16x2_t _u, _v; - for (size_t row = 0; row < height; row += 2) - { - for (size_t colUV = 0, colY = 0, colRgb = 0; colY < bodyWidth; colY 
+= DA, colUV += A, colRgb += A6) - { - _u.val[1] = _u.val[0] = Load(u + colUV); - _u = vzipq_u8(_u.val[0], _u.val[1]); - _v.val[1] = _v.val[0] = Load(v + colUV); - _v = vzipq_u8(_v.val[0], _v.val[1]); - Yuv422pToRgb(y + colY, _u, _v, rgb + colRgb); - Yuv422pToRgb(y + colY + yStride, _u, _v, rgb + colRgb + rgbStride); - } - if (tail) - { - size_t offset = width - DA; - _u.val[1] = _u.val[0] = Load(u + offset / 2); - _u = vzipq_u8(_u.val[0], _u.val[1]); - _v.val[1] = _v.val[0] = Load(v + offset / 2); - _v = vzipq_u8(_v.val[0], _v.val[1]); - Yuv422pToRgb(y + offset, _u, _v, rgb + 3 * offset); - Yuv422pToRgb(y + offset + yStride, _u, _v, rgb + 3 * offset + rgbStride); - } - y += 2 * yStride; - u += uStride; - v += vStride; - rgb += 2 * rgbStride; - } - } - - void Yuv420pToRgb(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride, - size_t width, size_t height, uint8_t* rgb, size_t rgbStride) - { - if (Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride) - && Aligned(v) && Aligned(vStride) && Aligned(rgb) && Aligned(rgbStride)) - Yuv420pToRgb(y, yStride, u, uStride, v, vStride, width, height, rgb, rgbStride); - else - Yuv420pToRgb(y, yStride, u, uStride, v, vStride, width, height, rgb, rgbStride); - } - - template void Yuv422pToRgb(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride, - size_t width, size_t height, uint8_t* rgb, size_t rgbStride) - { - assert((width % 2 == 0) && (width >= DA)); - if (align) - { - assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride)); - assert(Aligned(v) && Aligned(vStride) && Aligned(rgb) && Aligned(rgbStride)); - } - - size_t bodyWidth = AlignLo(width, DA); - size_t tail = width - bodyWidth; - uint8x16x2_t _u, _v; - for (size_t row = 0; row < height; ++row) - { - for (size_t colUV = 0, colY = 0, colRgb = 0; colY < bodyWidth; colY += DA, colUV += A, colRgb += A6) - { - _u.val[1] = _u.val[0] = Load(u + colUV); - _u = vzipq_u8(_u.val[0], _u.val[1]); - _v.val[1] = _v.val[0] = Load(v + colUV); - _v = vzipq_u8(_v.val[0], _v.val[1]); - Yuv422pToRgb(y + colY, _u, _v, rgb + colRgb); - } - if (tail) - { - size_t offset = width - DA; - _u.val[1] = _u.val[0] = Load(u + offset / 2); - _u = vzipq_u8(_u.val[0], _u.val[1]); - _v.val[1] = _v.val[0] = Load(v + offset / 2); - _v = vzipq_u8(_v.val[0], _v.val[1]); - Yuv422pToRgb(y + offset, _u, _v, rgb + 3 * offset); - } - y += yStride; - u += uStride; - v += vStride; - rgb += rgbStride; - } - } - - void Yuv422pToRgb(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride, - size_t width, size_t height, uint8_t* rgb, size_t rgbStride) - { - if (Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride) - && Aligned(v) && Aligned(vStride) && Aligned(rgb) && Aligned(rgbStride)) - Yuv422pToRgb(y, yStride, u, uStride, v, vStride, width, height, rgb, rgbStride); - else - Yuv422pToRgb(y, yStride, u, uStride, v, vStride, width, height, rgb, rgbStride); - } - - template void Yuv444pToRgb(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride, - size_t width, size_t height, uint8_t* rgb, size_t rgbStride) - { - assert(width >= A); - if (align) - { - assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride)); - assert(Aligned(v) && Aligned(vStride) && Aligned(rgb) && Aligned(rgbStride)); - } - - size_t bodyWidth = AlignLo(width, A); - size_t tail = width - bodyWidth; - for (size_t row = 0; row < height; 
++row) - { - for (size_t col = 0, colRgb = 0; col < bodyWidth; col += A, colRgb += A3) - { - uint8x16_t _y = Load(y + col); - uint8x16_t _u = Load(u + col); - uint8x16_t _v = Load(v + col); - YuvToRgb(_y, _u, _v, rgb + colRgb); - } - if (tail) - { - size_t col = width - A; - uint8x16_t _y = Load(y + col); - uint8x16_t _u = Load(u + col); - uint8x16_t _v = Load(v + col); - YuvToRgb(_y, _u, _v, rgb + 3 * col); - } - y += yStride; - u += uStride; - v += vStride; - rgb += rgbStride; - } - } - - void Yuv444pToRgb(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride, - size_t width, size_t height, uint8_t* rgb, size_t rgbStride) - { - if (Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride) - && Aligned(v) && Aligned(vStride) && Aligned(rgb) && Aligned(rgbStride)) - Yuv444pToRgb(y, yStride, u, uStride, v, vStride, width, height, rgb, rgbStride); - else - Yuv444pToRgb(y, yStride, u, uStride, v, vStride, width, height, rgb, rgbStride); - } - } -#endif// SIMD_NEON_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdNeonYuvToBgra.cpp b/src/3rd/Simd/Simd/SimdNeonYuvToBgra.cpp deleted file mode 100644 index 4af2981c..00000000 --- a/src/3rd/Simd/Simd/SimdNeonYuvToBgra.cpp +++ /dev/null @@ -1,255 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2019 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
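A recurring idiom in the 4:2:0 kernels of the file below (and of the one above): one U/V sample covers two horizontally adjacent Y samples, so each loaded chroma vector is zipped with itself to duplicate every byte, and the same duplicated row is reused for two consecutive Y rows. A minimal sketch of just that step; DuplicateChroma is a hypothetical helper:

    #include <arm_neon.h>

    // Expand 16 chroma bytes to 32: u0 u0 u1 u1 ... u15 u15.
    static inline uint8x16x2_t DuplicateChroma(const uint8_t * u)
    {
        uint8x16_t _u = vld1q_u8(u);  // 16 chroma samples
        return vzipq_u8(_u, _u);      // val[0]/val[1] = low/high 16 duplicated bytes
    }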
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdConversion.h" - -namespace Simd -{ -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - template SIMD_INLINE void YuvToBgra(const uint8x16_t & y, const uint8x16_t & u, const uint8x16_t & v, const uint8x16_t & a, uint8_t * bgra) - { - uint8x16x4_t _bgra; - YuvToBgr(y, u, v, *(uint8x16x3_t*)&_bgra); - _bgra.val[3] = a; - Store4(bgra, _bgra); - } - - template SIMD_INLINE void Yuva422pToBgra(const uint8_t * y, const uint8x16x2_t & u, const uint8x16x2_t & v, const uint8_t * a, uint8_t * bgra) - { - YuvToBgra(Load(y + 0), u.val[0], v.val[0], Load(a + 0), bgra + 0); - YuvToBgra(Load(y + A), u.val[1], v.val[1], Load(a + A), bgra + QA); - } - - template void Yuva420pToBgra(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - const uint8_t * a, size_t aStride, size_t width, size_t height, uint8_t * bgra, size_t bgraStride) - { - assert((width % 2 == 0) && (height % 2 == 0) && (width >= DA) && (height >= 2)); - if (align) - { - assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride) && Aligned(v) && Aligned(vStride)); - assert(Aligned(a) && Aligned(aStride) && Aligned(bgra) && Aligned(bgraStride)); - } - - size_t bodyWidth = AlignLo(width, DA); - size_t tail = width - bodyWidth; - uint8x16x2_t _u, _v; - for (size_t row = 0; row < height; row += 2) - { - for (size_t colUV = 0, colY = 0, colBgra = 0; colY < bodyWidth; colY += DA, colUV += A, colBgra += OA) - { - _u.val[1] = _u.val[0] = Load(u + colUV); - _u = vzipq_u8(_u.val[0], _u.val[1]); - _v.val[1] = _v.val[0] = Load(v + colUV); - _v = vzipq_u8(_v.val[0], _v.val[1]); - Yuva422pToBgra(y + colY, _u, _v, a + colY, bgra + colBgra); - Yuva422pToBgra(y + colY + yStride, _u, _v, a + colY + aStride, bgra + colBgra + bgraStride); - } - if (tail) - { - size_t offset = width - DA; - _u.val[1] = _u.val[0] = Load(u + offset / 2); - _u = vzipq_u8(_u.val[0], _u.val[1]); - _v.val[1] = _v.val[0] = Load(v + offset / 2); - _v = vzipq_u8(_v.val[0], _v.val[1]); - Yuva422pToBgra(y + offset, _u, _v, a + offset, bgra + 4 * offset); - Yuva422pToBgra(y + offset + yStride, _u, _v, a + offset + aStride, bgra + 4 * offset + bgraStride); - } - y += 2 * yStride; - u += uStride; - v += vStride; - a += 2 * aStride; - bgra += 2 * bgraStride; - } - } - - void Yuva420pToBgra(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - const uint8_t * a, size_t aStride, size_t width, size_t height, uint8_t * bgra, size_t bgraStride) - { - if (Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride) && Aligned(v) && Aligned(vStride) - && Aligned(a) && Aligned(aStride) && Aligned(bgra) && Aligned(bgraStride)) - Yuva420pToBgra(y, yStride, u, uStride, v, vStride, a, aStride, width, height, bgra, bgraStride); - else - Yuva420pToBgra(y, yStride, u, uStride, v, vStride, a, aStride, width, height, bgra, bgraStride); - } - - template SIMD_INLINE void Yuv422pToBgra(const uint8_t * y, const uint8x16x2_t & u, const uint8x16x2_t & v, const uint8x16_t & alpha, uint8_t * bgra) - { - YuvToBgra(Load(y + 0), u.val[0], v.val[0], alpha, bgra + 0); - YuvToBgra(Load(y + A), u.val[1], v.val[1], alpha, bgra + QA); - } - - template void Yuv420pToBgra(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * bgra, size_t bgraStride, uint8_t alpha) - { - assert((width % 2 == 0) && (height % 2 == 0) && (width >= DA) && (height 
>= 2)); - if (align) - { - assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride)); - assert(Aligned(v) && Aligned(vStride) && Aligned(bgra) && Aligned(bgraStride)); - } - - uint8x16_t _alpha = vdupq_n_u8(alpha); - size_t bodyWidth = AlignLo(width, DA); - size_t tail = width - bodyWidth; - uint8x16x2_t _u, _v; - for (size_t row = 0; row < height; row += 2) - { - for (size_t colUV = 0, colY = 0, colBgra = 0; colY < bodyWidth; colY += DA, colUV += A, colBgra += OA) - { - _u.val[1] = _u.val[0] = Load(u + colUV); - _u = vzipq_u8(_u.val[0], _u.val[1]); - _v.val[1] = _v.val[0] = Load(v + colUV); - _v = vzipq_u8(_v.val[0], _v.val[1]); - Yuv422pToBgra(y + colY, _u, _v, _alpha, bgra + colBgra); - Yuv422pToBgra(y + colY + yStride, _u, _v, _alpha, bgra + colBgra + bgraStride); - } - if (tail) - { - size_t offset = width - DA; - _u.val[1] = _u.val[0] = Load(u + offset / 2); - _u = vzipq_u8(_u.val[0], _u.val[1]); - _v.val[1] = _v.val[0] = Load(v + offset / 2); - _v = vzipq_u8(_v.val[0], _v.val[1]); - Yuv422pToBgra(y + offset, _u, _v, _alpha, bgra + 4 * offset); - Yuv422pToBgra(y + offset + yStride, _u, _v, _alpha, bgra + 4 * offset + bgraStride); - } - y += 2 * yStride; - u += uStride; - v += vStride; - bgra += 2 * bgraStride; - } - } - - void Yuv420pToBgra(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * bgra, size_t bgraStride, uint8_t alpha) - { - if (Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride) - && Aligned(v) && Aligned(vStride) && Aligned(bgra) && Aligned(bgraStride)) - Yuv420pToBgra(y, yStride, u, uStride, v, vStride, width, height, bgra, bgraStride, alpha); - else - Yuv420pToBgra(y, yStride, u, uStride, v, vStride, width, height, bgra, bgraStride, alpha); - } - - template void Yuv422pToBgra(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * bgra, size_t bgraStride, uint8_t alpha) - { - assert((width % 2 == 0) && (width >= DA)); - if (align) - { - assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride)); - assert(Aligned(v) && Aligned(vStride) && Aligned(bgra) && Aligned(bgraStride)); - } - - uint8x16_t _alpha = vdupq_n_u8(alpha); - size_t bodyWidth = AlignLo(width, DA); - size_t tail = width - bodyWidth; - uint8x16x2_t _u, _v; - for (size_t row = 0; row < height; ++row) - { - for (size_t colUV = 0, colY = 0, colBgra = 0; colY < bodyWidth; colY += DA, colUV += A, colBgra += OA) - { - _u.val[1] = _u.val[0] = Load(u + colUV); - _u = vzipq_u8(_u.val[0], _u.val[1]); - _v.val[1] = _v.val[0] = Load(v + colUV); - _v = vzipq_u8(_v.val[0], _v.val[1]); - Yuv422pToBgra(y + colY, _u, _v, _alpha, bgra + colBgra); - } - if (tail) - { - size_t offset = width - DA; - _u.val[1] = _u.val[0] = Load(u + offset / 2); - _u = vzipq_u8(_u.val[0], _u.val[1]); - _v.val[1] = _v.val[0] = Load(v + offset / 2); - _v = vzipq_u8(_v.val[0], _v.val[1]); - Yuv422pToBgra(y + offset, _u, _v, _alpha, bgra + 4 * offset); - } - y += yStride; - u += uStride; - v += vStride; - bgra += bgraStride; - } - } - - void Yuv422pToBgra(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * bgra, size_t bgraStride, uint8_t alpha) - { - if (Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride) - && Aligned(v) && Aligned(vStride) && Aligned(bgra) && Aligned(bgraStride)) - Yuv422pToBgra(y, yStride, u, 
uStride, v, vStride, width, height, bgra, bgraStride, alpha); - else - Yuv422pToBgra(y, yStride, u, uStride, v, vStride, width, height, bgra, bgraStride, alpha); - } - - template void Yuv444pToBgra(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * bgra, size_t bgraStride, uint8_t alpha) - { - assert(width >= A); - if (align) - { - assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride)); - assert(Aligned(v) && Aligned(vStride) && Aligned(bgra) && Aligned(bgraStride)); - } - - uint8x16_t _alpha = vdupq_n_u8(alpha); - size_t bodyWidth = AlignLo(width, A); - size_t tail = width - bodyWidth; - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0, colBgra = 0; col < bodyWidth; col += A, colBgra += QA) - { - uint8x16_t _y = Load(y + col); - uint8x16_t _u = Load(u + col); - uint8x16_t _v = Load(v + col); - YuvToBgra(_y, _u, _v, _alpha, bgra + colBgra); - } - if (tail) - { - size_t col = width - A; - uint8x16_t _y = Load(y + col); - uint8x16_t _u = Load(u + col); - uint8x16_t _v = Load(v + col); - YuvToBgra(_y, _u, _v, _alpha, bgra + 4 * col); - } - y += yStride; - u += uStride; - v += vStride; - bgra += bgraStride; - } - } - - void Yuv444pToBgra(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, - size_t width, size_t height, uint8_t * bgra, size_t bgraStride, uint8_t alpha) - { - if (Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride) - && Aligned(v) && Aligned(vStride) && Aligned(bgra) && Aligned(bgraStride)) - Yuv444pToBgra(y, yStride, u, uStride, v, vStride, width, height, bgra, bgraStride, alpha); - else - Yuv444pToBgra(y, yStride, u, uStride, v, vStride, width, height, bgra, bgraStride, alpha); - } - } -#endif// SIMD_NEON_ENABLE -} diff --git a/src/3rd/Simd/Simd/SimdNeonYuvToHue.cpp b/src/3rd/Simd/Simd/SimdNeonYuvToHue.cpp deleted file mode 100644 index 332fa7b9..00000000 --- a/src/3rd/Simd/Simd/SimdNeonYuvToHue.cpp +++ /dev/null @@ -1,172 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
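The hue kernels in the file below reconstruct R, G, B from YUV and map them to a hue on a 0..255 scale: with range = max - min, the dividend is (g-b), (b-r) or (r-g) plus 6, 2 or 4 times range (the 6 only keeps the red case positive before the byte wrap); the result is scaled by 255/6, divided by range, and forced to zero for grey pixels. A scalar counterpart with the same semantics; RgbToHueScalar is a hypothetical name:

    #include <algorithm>
    #include <cstdint>

    // Scalar counterpart of the vector hue computation, wrapped to a byte.
    static uint8_t RgbToHueScalar(int r, int g, int b)
    {
        int max = std::max(r, std::max(g, b));
        int min = std::min(r, std::min(g, b));
        int range = max - min;
        if (range == 0)
            return 0; // grey: the vector code masks the result to zero
        float dividend;
        if (r == max)
            dividend = (float)(g - b) + 6.0f * range;
        else if (g == max)
            dividend = (float)(b - r) + 2.0f * range;
        else
            dividend = (float)(r - g) + 4.0f * range;
        return (uint8_t)((int)(dividend * (255.0f / 6.0f) / range) & 0xFF);
    }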
-*/
-#include "Simd/SimdMemory.h"
-#include "Simd/SimdStore.h"
-#include "Simd/SimdConversion.h"
-
-#include "Simd/SimdLog.h"
-
-namespace Simd
-{
-#ifdef SIMD_NEON_ENABLE
-    namespace Neon
-    {
-        SIMD_INLINE int32x4_t MulDiv(const int32x4_t & dividend, const int32x4_t & divisor, const float32x4_t & KF_255_DIV_6)
-        {
-            return vcvtq_s32_f32(Div(vmulq_f32(KF_255_DIV_6, vcvtq_f32_s32(dividend)), vcvtq_f32_s32(divisor)));
-        }
-
-        SIMD_INLINE int16x8_t MulDiv(const int16x8_t & dividend, const int16x8_t & divisor, const float32x4_t & KF_255_DIV_6)
-        {
-            int32x4_t lo = MulDiv(UnpackI16<0>(dividend), UnpackI16<0>(divisor), KF_255_DIV_6);
-            int32x4_t hi = MulDiv(UnpackI16<1>(dividend), UnpackI16<1>(divisor), KF_255_DIV_6);
-            return PackI32(lo, hi);
-        }
-
-        SIMD_INLINE int16x8_t YuvToHue(const int16x8_t & y, const int16x8_t & u, const int16x8_t & v, const float32x4_t & KF_255_DIV_6)
-        {
-            int16x8_t red = SaturateByU8(YuvToRed(y, v));
-            int16x8_t blue = SaturateByU8(YuvToBlue(y, u));
-            int16x8_t green = SaturateByU8(YuvToGreen(y, u, v));
-            int16x8_t max = vmaxq_s16(blue, vmaxq_s16(green, red));
-            int16x8_t min = vminq_s16(blue, vminq_s16(green, red));
-            int16x8_t range = vsubq_s16(max, min);
-
-            int16x8_t redMaxMask = (int16x8_t)vceqq_s16(red, max);
-            int16x8_t greenMaxMask = vandq_s16(vmvnq_s16(redMaxMask), (int16x8_t)vceqq_s16(green, max));
-            int16x8_t blueMaxMask = vandq_s16(vmvnq_s16(redMaxMask), vmvnq_s16(greenMaxMask));
-
-            int16x8_t redMaxCase = vandq_s16(redMaxMask, vaddq_s16(vsubq_s16(green, blue), vmulq_s16(range, (int16x8_t)K16_0006)));
-            int16x8_t greenMaxCase = vandq_s16(greenMaxMask, vaddq_s16(vsubq_s16(blue, red), vmulq_s16(range, (int16x8_t)K16_0002)));
-            int16x8_t blueMaxCase = vandq_s16(blueMaxMask, vaddq_s16(vsubq_s16(red, green), vmulq_s16(range, (int16x8_t)K16_0004)));
-
-            int16x8_t dividend = vorrq_s16(vorrq_s16(redMaxCase, greenMaxCase), blueMaxCase);
-
-            return vandq_s16(vmvnq_s16((int16x8_t)vceqq_s16(range, (int16x8_t)K16_0000)), vandq_s16(MulDiv(dividend, range, KF_255_DIV_6), (int16x8_t)K16_00FF));
-        }
-
-        SIMD_INLINE uint8x16_t YuvToHue(const uint8x16_t & y, const uint8x16_t & u, const uint8x16_t & v, const float32x4_t & KF_255_DIV_6)
-        {
-            uint16x8_t lo = (uint16x8_t)YuvToHue(AdjustY<0>(y), AdjustUV<0>(u), AdjustUV<0>(v), KF_255_DIV_6);
-            uint16x8_t hi = (uint16x8_t)YuvToHue(AdjustY<1>(y), AdjustUV<1>(u), AdjustUV<1>(v), KF_255_DIV_6);
-            return PackU16(lo, hi);
-        }
-
-        template <bool align> SIMD_INLINE void Yuv420pToHue(const uint8_t * y, const uint8x16x2_t & u, const uint8x16x2_t & v, uint8_t * hue, const float32x4_t & KF_255_DIV_6)
-        {
-            Store<align>(hue + 0, YuvToHue(Load<align>(y + 0), u.val[0], v.val[0], KF_255_DIV_6));
-            Store<align>(hue + A, YuvToHue(Load<align>(y + A), u.val[1], v.val[1], KF_255_DIV_6));
-        }
-
-        template <bool align> void Yuv420pToHue(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride,
-            size_t width, size_t height, uint8_t * hue, size_t hueStride)
-        {
-            assert((width % 2 == 0) && (height % 2 == 0) && (width >= DA) && (height >= 2));
-            if (align)
-            {
-                assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride));
-                assert(Aligned(v) && Aligned(vStride) && Aligned(hue) && Aligned(hueStride));
-            }
-
-            const float32x4_t KF_255_DIV_6 = vdupq_n_f32(Base::KF_255_DIV_6);
-            size_t bodyWidth = AlignLo(width, DA);
-            size_t tail = width - bodyWidth;
-            uint8x16x2_t _u, _v;
-            for (size_t row = 0; row < height; row += 2)
-            {
-                for (size_t colUV = 0, col = 0; col < bodyWidth; col += DA, colUV += A)
-                {
-                    _u.val[1] = _u.val[0] = Load<align>(u + colUV);
-                    _u = vzipq_u8(_u.val[0], _u.val[1]);
-                    _v.val[1] = _v.val[0] = Load<align>(v + colUV);
-                    _v = vzipq_u8(_v.val[0], _v.val[1]);
-                    Yuv420pToHue<align>(y + col, _u, _v, hue + col, KF_255_DIV_6);
-                    Yuv420pToHue<align>(y + yStride + col, _u, _v, hue + hueStride + col, KF_255_DIV_6);
-                }
-                if (tail)
-                {
-                    size_t col = width - DA;
-                    _u.val[1] = _u.val[0] = Load<false>(u + col / 2);
-                    _u = vzipq_u8(_u.val[0], _u.val[1]);
-                    _v.val[1] = _v.val[0] = Load<false>(v + col / 2);
-                    _v = vzipq_u8(_v.val[0], _v.val[1]);
-                    Yuv420pToHue<false>(y + col, _u, _v, hue + col, KF_255_DIV_6);
-                    Yuv420pToHue<false>(y + yStride + col, _u, _v, hue + hueStride + col, KF_255_DIV_6);
-                }
-                y += 2 * yStride;
-                u += uStride;
-                v += vStride;
-                hue += 2 * hueStride;
-            }
-        }
-
-        void Yuv420pToHue(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride,
-            size_t width, size_t height, uint8_t * hue, size_t hueStride)
-        {
-            if (Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride) && Aligned(v) && Aligned(vStride) && Aligned(hue) && Aligned(hueStride))
-                Yuv420pToHue<true>(y, yStride, u, uStride, v, vStride, width, height, hue, hueStride);
-            else
-                Yuv420pToHue<false>(y, yStride, u, uStride, v, vStride, width, height, hue, hueStride);
-        }
-
-        template <bool align> void Yuv444pToHue(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride,
-            size_t width, size_t height, uint8_t * hue, size_t hueStride)
-        {
-            assert(width >= A);
-            if (align)
-            {
-                assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride));
-                assert(Aligned(v) && Aligned(vStride) && Aligned(hue) && Aligned(hueStride));
-            }
-
-            const float32x4_t KF_255_DIV_6 = vdupq_n_f32(Base::KF_255_DIV_6);
-            size_t bodyWidth = AlignLo(width, A);
-            size_t tail = width - bodyWidth;
-            for (size_t row = 0; row < height; ++row)
-            {
-                for (size_t col = 0; col < bodyWidth; col += A)
-                    Store<align>(hue + col, YuvToHue(Load<align>(y + col), Load<align>(u + col), Load<align>(v + col), KF_255_DIV_6));
-                if (tail)
-                {
-                    size_t col = width - A;
-                    Store<false>(hue + col, YuvToHue(Load<false>(y + col), Load<false>(u + col), Load<false>(v + col), KF_255_DIV_6));
-                }
-                y += yStride;
-                u += uStride;
-                v += vStride;
-                hue += hueStride;
-            }
-        }
-
-        void Yuv444pToHue(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride,
-            size_t width, size_t height, uint8_t * hue, size_t hueStride)
-        {
-            if (Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride) && Aligned(v) && Aligned(vStride) && Aligned(hue) && Aligned(hueStride))
-                Yuv444pToHue<true>(y, yStride, u, uStride, v, vStride, width, height, hue, hueStride);
-            else
-                Yuv444pToHue<false>(y, yStride, u, uStride, v, vStride, width, height, hue, hueStride);
-        }
-    }
-#endif// SIMD_NEON_ENABLE
-}
diff --git a/src/3rd/Simd/Simd/SimdNeural.h b/src/3rd/Simd/Simd/SimdNeural.h
deleted file mode 100644
index 70fa75e3..00000000
--- a/src/3rd/Simd/Simd/SimdNeural.h
+++ /dev/null
@@ -1,261 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#ifndef __SimdNeural_h__ -#define __SimdNeural_h__ - -#include "Simd/SimdLoad.h" - -namespace Simd -{ - template struct ConvolutionBackwardBuffer - { - ConvolutionBackwardBuffer(size_t width, size_t align) - { - _size = width * sizeof(float); - size_t stride = AlignHi(width + 2 * (count - 1), align); - size_t full = count*stride * sizeof(float); - _ptr = Allocate(full); - memset(_ptr, 0, full); - rows[0] = (float*)_ptr; - for (size_t i = 1; i < count; ++i) - rows[i] = rows[i - 1] + stride; - } - - void Update(const float * src) - { - float * tmp = rows[0]; - if (src == NULL) - memset(tmp + count - 1, 0, _size); - else - memcpy(tmp + count - 1, src, _size); - for (size_t i = 0; i < count - 1; ++i) - rows[i] = rows[i + 1]; - rows[count - 1] = tmp; - } - - ~ConvolutionBackwardBuffer() - { - Free(_ptr); - } - - float * rows[count]; - private: - size_t _size; - void * _ptr; - }; - -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - template SIMD_INLINE void LoadWeightsForward(const float * src, __m256 * dst) - { - for (size_t i = 0; i < size; ++i) - dst[i] = _mm256_set1_ps(src[i]); - } - - template SIMD_INLINE void LoadWeightsBackward(const float * src, __m256 * dst) - { - for (size_t i = 0; i < size; ++i) - dst[i] = _mm256_set1_ps(src[size - i - 1]); - } - - template struct Convolution - { - template static SIMD_INLINE __m256 Forward(const float * src, size_t stride, const __m256 * weights); - - template static SIMD_INLINE __m256 Backward(const ConvolutionBackwardBuffer & buffer, size_t offset, const __m256 * weights); - - template static SIMD_INLINE void Sum(const float * src, size_t stride, const __m256 & dst, __m256 * sums); - }; - - template<> struct Convolution<2, 2> - { - template static SIMD_INLINE __m256 RowConvolution(const float * src, const __m256 * weights) - { - return _mm256_fmadd_ps(Avx::Load(src), weights[0], - _mm256_mul_ps(Avx::Load(src + 1), weights[1])); - } - - template static SIMD_INLINE __m256 Forward(const float * src, size_t stride, const __m256 * weights) - { - return _mm256_add_ps(RowConvolution(src, weights), - RowConvolution(src + stride, weights + 2)); - } - - template static SIMD_INLINE __m256 Backward(const ConvolutionBackwardBuffer<2> & buffer, size_t offset, const __m256 * weights) - { - return _mm256_add_ps(RowConvolution(buffer.rows[0] + offset, weights), - RowConvolution(buffer.rows[1] + offset, weights + 2)); - } - - template static SIMD_INLINE void Sum(const float * src, const __m256 & dst, __m256 * 
-
-#ifdef SIMD_AVX2_ENABLE
-    namespace Avx2
-    {
-        template <size_t size> SIMD_INLINE void LoadWeightsForward(const float * src, __m256 * dst)
-        {
-            for (size_t i = 0; i < size; ++i)
-                dst[i] = _mm256_set1_ps(src[i]);
-        }
-
-        template <size_t size> SIMD_INLINE void LoadWeightsBackward(const float * src, __m256 * dst)
-        {
-            for (size_t i = 0; i < size; ++i)
-                dst[i] = _mm256_set1_ps(src[size - i - 1]);
-        }
-
-        template <int coreX, int coreY> struct Convolution
-        {
-            template <bool align> static SIMD_INLINE __m256 Forward(const float * src, size_t stride, const __m256 * weights);
-
-            template <bool align> static SIMD_INLINE __m256 Backward(const ConvolutionBackwardBuffer<coreX> & buffer, size_t offset, const __m256 * weights);
-
-            template <bool align> static SIMD_INLINE void Sum(const float * src, size_t stride, const __m256 & dst, __m256 * sums);
-        };
-
-        template<> struct Convolution<2, 2>
-        {
-            template <bool align> static SIMD_INLINE __m256 RowConvolution(const float * src, const __m256 * weights)
-            {
-                return _mm256_fmadd_ps(Avx::Load<align>(src), weights[0],
-                    _mm256_mul_ps(Avx::Load<false>(src + 1), weights[1]));
-            }
-
-            template <bool align> static SIMD_INLINE __m256 Forward(const float * src, size_t stride, const __m256 * weights)
-            {
-                return _mm256_add_ps(RowConvolution<align>(src, weights),
-                    RowConvolution<align>(src + stride, weights + 2));
-            }
-
-            template <bool align> static SIMD_INLINE __m256 Backward(const ConvolutionBackwardBuffer<2> & buffer, size_t offset, const __m256 * weights)
-            {
-                return _mm256_add_ps(RowConvolution<align>(buffer.rows[0] + offset, weights),
-                    RowConvolution<align>(buffer.rows[1] + offset, weights + 2));
-            }
-
-            template <bool align> static SIMD_INLINE void Sum(const float * src, const __m256 & dst, __m256 * sums)
-            {
-                sums[0] = _mm256_fmadd_ps(dst, Load<align>(src + 0), sums[0]);
-                sums[1] = _mm256_fmadd_ps(dst, Load<false>(src + 1), sums[1]);
-            }
-
-            template <bool align> static SIMD_INLINE void Sum(const float * src, size_t stride, const __m256 & dst, __m256 * sums)
-            {
-                Sum<align>(src + stride * 0, dst, sums + 0);
-                Sum<align>(src + stride * 1, dst, sums + 2);
-            }
-        };
-
-        template<> struct Convolution<3, 3>
-        {
-            template <bool align> static SIMD_INLINE __m256 RowConvolution(const float * src, const __m256 * weights)
-            {
-                return _mm256_fmadd_ps(Avx::Load<align>(src), weights[0],
-                    _mm256_fmadd_ps(Avx::Load<false>(src + 1), weights[1],
-                        _mm256_mul_ps(Avx::Load<false>(src + 2), weights[2])));
-            }
-
-            template <bool align> static SIMD_INLINE __m256 Forward(const float * src, size_t stride, const __m256 * weights)
-            {
-                return _mm256_add_ps(RowConvolution<align>(src, weights),
-                    _mm256_add_ps(RowConvolution<align>(src + stride, weights + 3),
-                        RowConvolution<align>(src + 2 * stride, weights + 6)));
-            }
-
-            template <bool align> static SIMD_INLINE __m256 Backward(const ConvolutionBackwardBuffer<3> & buffer, size_t offset, const __m256 * weights)
-            {
-                return _mm256_add_ps(RowConvolution<align>(buffer.rows[0] + offset, weights),
-                    _mm256_add_ps(RowConvolution<align>(buffer.rows[1] + offset, weights + 3),
-                        RowConvolution<align>(buffer.rows[2] + offset, weights + 6)));
-            }
-
-            template <bool align> static SIMD_INLINE void Sum(const float * src, const __m256 & dst, __m256 * sums)
-            {
-                __m256 s0 = Load<align>(src + 0);
-                __m256 s4 = Load<false>(src + 4);
-                sums[0] = _mm256_fmadd_ps(dst, s0, sums[0]);
-                sums[1] = _mm256_fmadd_ps(dst, Alignr<1>(s0, s4), sums[1]);
-                sums[2] = _mm256_fmadd_ps(dst, Alignr<2>(s0, s4), sums[2]);
-            }
-
-            template <bool align> static SIMD_INLINE void Sum(const float * src, size_t stride, const __m256 & dst, __m256 * sums)
-            {
-                Sum<align>(src + stride * 0, dst, sums + 0);
-                Sum<align>(src + stride * 1, dst, sums + 3);
-                Sum<align>(src + stride * 2, dst, sums + 6);
-            }
-        };
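/* Aside: the Sum() overloads above turn one aligned load of src+0 and one unaligned
   load of src+4 into all of the shifted vectors src+1..src+3 via Alignr, instead of
   issuing a separate unaligned load per shift. A self-contained sketch of that trick,
   with the hypothetical helper name ShiftedLoad (build with -mavx2): */

#include <immintrin.h>

/* Returns the 8 floats starting at src+shift (0 < shift < 4), given s0 = src[0..7]
   and the overlapping s4 = src[4..11]. _mm256_alignr_epi8 concatenates per 128-bit
   lane, so pairing s0 with the overlapping s4 yields a contiguous shifted window in
   both lanes: ShiftedLoad<2>(s0, s4) equals an unaligned load of src+2. */
template <int shift> inline __m256 ShiftedLoad(__m256 s0, __m256 s4)
{
    return _mm256_castsi256_ps(_mm256_alignr_epi8(
        _mm256_castps_si256(s4), _mm256_castps_si256(s0), shift * 4));
}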
-
-        template<> struct Convolution<4, 4>
-        {
-            template <bool align> static SIMD_INLINE __m256 RowConvolution(const float * src, const __m256 * weights)
-            {
-                return _mm256_add_ps(
-                    _mm256_fmadd_ps(Load<align>(src + 0), weights[0], _mm256_mul_ps(Load<false>(src + 1), weights[1])),
-                    _mm256_fmadd_ps(Load<false>(src + 2), weights[2], _mm256_mul_ps(Load<false>(src + 3), weights[3])));
-            }
-
-            template <bool align> static SIMD_INLINE __m256 Forward(const float * src, size_t stride, const __m256 * weights)
-            {
-                return _mm256_add_ps(_mm256_add_ps(RowConvolution<align>(src, weights),
-                    RowConvolution<align>(src + stride, weights + 4)),
-                    _mm256_add_ps(RowConvolution<align>(src + 2 * stride, weights + 8),
-                        RowConvolution<align>(src + 3 * stride, weights + 12)));
-            }
-
-            template <bool align> static SIMD_INLINE __m256 Backward(const ConvolutionBackwardBuffer<4> & buffer, size_t offset, const __m256 * weights)
-            {
-                return _mm256_add_ps(_mm256_add_ps(RowConvolution<align>(buffer.rows[0] + offset, weights),
-                    RowConvolution<align>(buffer.rows[1] + offset, weights + 4)),
-                    _mm256_add_ps(RowConvolution<align>(buffer.rows[2] + offset, weights + 8),
-                        RowConvolution<align>(buffer.rows[3] + offset, weights + 12)));
-            }
-
-            template <bool align> static SIMD_INLINE void Sum(const float * src, const __m256 & dst, __m256 * sums)
-            {
-                __m256 s0 = Load<align>(src + 0);
-                __m256 s4 = Load<false>(src + 4);
-                sums[0] = _mm256_fmadd_ps(dst, s0, sums[0]);
-                sums[1] = _mm256_fmadd_ps(dst, Alignr<1>(s0, s4), sums[1]);
-                sums[2] = _mm256_fmadd_ps(dst, Alignr<2>(s0, s4), sums[2]);
-                sums[3] = _mm256_fmadd_ps(dst, Alignr<3>(s0, s4), sums[3]);
-            }
-
-            template <bool align> static SIMD_INLINE void Sum(const float * src, size_t stride, const __m256 & dst, __m256 * sums)
-            {
-                Sum<align>(src + stride * 0, dst, sums + 0);
-                Sum<align>(src + stride * 1, dst, sums + 4);
-                Sum<align>(src + stride * 2, dst, sums + 8);
-                Sum<align>(src + stride * 3, dst, sums + 12);
-            }
-        };
-
-        template<> struct Convolution<5, 5>
-        {
-            template <bool align> static SIMD_INLINE __m256 RowConvolution(const float * src, const __m256 * weights)
-            {
-                __m256 s0 = Load<align>(src + 0);
-                __m256 s4 = Load<false>(src + 4);
-                return _mm256_fmadd_ps(s0, weights[0], _mm256_add_ps(
-                    _mm256_fmadd_ps(Alignr<1>(s0, s4), weights[1], _mm256_mul_ps(Alignr<2>(s0, s4), weights[2])),
-                    _mm256_fmadd_ps(s4, weights[4], _mm256_mul_ps(Alignr<3>(s0, s4), weights[3]))));
-            }
-
-            template <bool align> static SIMD_INLINE __m256 Forward(const float * src, size_t stride, const __m256 * weights)
-            {
-                return _mm256_add_ps(RowConvolution<align>(src, weights),
-                    _mm256_add_ps(_mm256_add_ps(RowConvolution<align>(src + stride, weights + 5),
-                        RowConvolution<align>(src + 2 * stride, weights + 10)),
-                        _mm256_add_ps(RowConvolution<align>(src + 3 * stride, weights + 15),
-                            RowConvolution<align>(src + 4 * stride, weights + 20))));
-            }
-
-            template <bool align> static SIMD_INLINE __m256 Backward(const ConvolutionBackwardBuffer<5> & buffer, size_t offset, const __m256 * weights)
-            {
-                return _mm256_add_ps(_mm256_add_ps(RowConvolution<align>(buffer.rows[0] + offset, weights),
-                    _mm256_add_ps(RowConvolution<align>(buffer.rows[1] + offset, weights + 5),
-                        RowConvolution<align>(buffer.rows[2] + offset, weights + 10))),
-                    _mm256_add_ps(RowConvolution<align>(buffer.rows[3] + offset, weights + 15),
-                        RowConvolution<align>(buffer.rows[4] + offset, weights + 20)));
-            }
-
-            template <bool align> static SIMD_INLINE void Sum(const float * src, const __m256 & dst, __m256 * sums)
-            {
-                __m256 s0 = Load<align>(src + 0);
-                __m256 s4 = Load<false>(src + 4);
-                sums[0] = _mm256_fmadd_ps(dst, s0, sums[0]);
-                sums[1] = _mm256_fmadd_ps(dst, Alignr<1>(s0, s4), sums[1]);
-                sums[2] = _mm256_fmadd_ps(dst, Alignr<2>(s0, s4), sums[2]);
-                sums[3] = _mm256_fmadd_ps(dst, Alignr<3>(s0, s4), sums[3]);
-                sums[4] = _mm256_fmadd_ps(dst, s4, sums[4]);
-            }
-
-            template <bool align> static SIMD_INLINE void Sum(const float * src, size_t stride, const __m256 & dst, __m256 * sums)
-            {
-                Sum<align>(src + stride * 0, dst, sums + 0);
-                Sum<align>(src + stride * 1, dst, sums + 5);
-                Sum<align>(src + stride * 2, dst, sums + 10);
-                Sum<align>(src + stride * 3, dst, sums + 15);
-                Sum<align>(src + stride * 4, dst, sums + 20);
-            }
-        };
-    }
-#endif//SIMD_AVX2_ENABLE
-}
-#endif//__SimdNeural_h__
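/* Aside: a sketch of how the Convolution<coreX, coreY> kernels in the deleted header
   are typically driven: broadcast the scalar weights once with LoadWeightsForward,
   then produce 8 output values per iteration. Illustrative only, under assumptions:
   hypothetical function name Convolve3x3Row, no tail or border handling, unaligned
   data, SIMD_AVX2_ENABLE defined, and accumulation into dst as a neural-net forward
   pass would sum contributions across input planes: */

#include <immintrin.h>

void Convolve3x3Row(const float * src, size_t srcStride, const float * weights9,
                    float * dst, size_t width)
{
    __m256 w[9];
    Simd::Avx2::LoadWeightsForward<9>(weights9, w);   // broadcast each of the 9 weights
    for (size_t x = 0; x + 8 <= width; x += 8)        // 8 outputs per AVX2 iteration
    {
        __m256 sum = Simd::Avx2::Convolution<3, 3>::Forward<false>(src + x, srcStride, w);
        // accumulate into the destination row
        _mm256_storeu_ps(dst + x, _mm256_add_ps(_mm256_loadu_ps(dst + x), sum));
    }
}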
diff --git a/src/3rd/Simd/Simd/SimdNeural.hpp b/src/3rd/Simd/Simd/SimdNeural.hpp
deleted file mode 100644
index ae90b286..00000000
--- a/src/3rd/Simd/Simd/SimdNeural.hpp
+++ /dev/null
@@ -1,2050 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#ifndef __SimdNeural_hpp__
-#define __SimdNeural_hpp__
-
-#include "Simd/SimdLib.hpp"
-#include "Simd/SimdParallel.hpp"
-
-#include
-
-#include
-#include
-#include
-
-#ifndef SIMD_CHECK_PERFORMANCE
-#define SIMD_CHECK_PERFORMANCE()
-#endif
-
-//#define SIMD_CHECK_OVERFLOW
-
-#if defined(SIMD_CHECK_OVERFLOW) && !defined(NDEBUG)
-#define SIMD_CHECK_OVERFLOW_1(vector) Simd::Neural::Detail::CheckOverflow(vector.data(), vector.size());
-#define SIMD_CHECK_OVERFLOW_2(data, size) Simd::Neural::Detail::CheckOverflow(data, size);
-#else
-#define SIMD_CHECK_OVERFLOW_1(vector)
-#define SIMD_CHECK_OVERFLOW_2(data, size)
-#endif
-
-namespace Simd
-{
-    /*! @ingroup cpp_neural
-
-        \short Contains a framework for learning of Convolutional Neural Networks.
-    */
-    namespace Neural
-    {
-        typedef Point<ptrdiff_t> Size; /*!< \brief 2D-size (width and height). */
-        typedef std::vector<uint8_t, Allocator<uint8_t>> Buffer; /*!< \brief Vector with 8-bit unsigned integer values. */
-        typedef std::vector<float, Allocator<float>> Vector; /*!< \brief Vector with 32-bit floating-point values. */
-        typedef std::vector<int, Allocator<int>> VectorI; /*!< \brief Vector with integer values. */
-        typedef std::vector<Vector> Vectors; /*!< \brief Vector of vectors with 32-bit floating-point values. */
-        typedef size_t Label; /*!< \brief Integer name (label) of object class. */
-        typedef std::vector